diff options
Diffstat (limited to 'kernel')
114 files changed, 7422 insertions, 4767 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index e898c5b9d02c..2d9de86b7e76 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -2,16 +2,15 @@ | |||
2 | # Makefile for the linux kernel. | 2 | # Makefile for the linux kernel. |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | 5 | obj-y = fork.o exec_domain.o panic.o printk.o \ |
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
13 | async.o range.o | 13 | async.o range.o groups.o |
14 | obj-y += groups.o | ||
15 | 14 | ||
16 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
17 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
@@ -20,10 +19,12 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg | |||
20 | CFLAGS_REMOVE_mutex-debug.o = -pg | 19 | CFLAGS_REMOVE_mutex-debug.o = -pg |
21 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | 20 | CFLAGS_REMOVE_rtmutex-debug.o = -pg |
22 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 21 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
23 | CFLAGS_REMOVE_sched_clock.o = -pg | ||
24 | CFLAGS_REMOVE_irq_work.o = -pg | 22 | CFLAGS_REMOVE_irq_work.o = -pg |
25 | endif | 23 | endif |
26 | 24 | ||
25 | obj-y += sched/ | ||
26 | obj-y += power/ | ||
27 | |||
27 | obj-$(CONFIG_FREEZER) += freezer.o | 28 | obj-$(CONFIG_FREEZER) += freezer.o |
28 | obj-$(CONFIG_PROFILING) += profile.o | 29 | obj-$(CONFIG_PROFILING) += profile.o |
29 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o | 30 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o |
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | |||
52 | obj-$(CONFIG_UID16) += uid16.o | 53 | obj-$(CONFIG_UID16) += uid16.o |
53 | obj-$(CONFIG_MODULES) += module.o | 54 | obj-$(CONFIG_MODULES) += module.o |
54 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 55 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
55 | obj-$(CONFIG_PM) += power/ | ||
56 | obj-$(CONFIG_FREEZER) += power/ | ||
57 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 56 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
58 | obj-$(CONFIG_KEXEC) += kexec.o | 57 | obj-$(CONFIG_KEXEC) += kexec.o |
59 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o | 58 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o |
@@ -99,7 +98,6 @@ obj-$(CONFIG_TRACING) += trace/ | |||
99 | obj-$(CONFIG_X86_DS) += trace/ | 98 | obj-$(CONFIG_X86_DS) += trace/ |
100 | obj-$(CONFIG_RING_BUFFER) += trace/ | 99 | obj-$(CONFIG_RING_BUFFER) += trace/ |
101 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 100 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
102 | obj-$(CONFIG_SMP) += sched_cpupri.o | ||
103 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 101 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
104 | obj-$(CONFIG_CPU_PM) += cpu_pm.o | 102 | obj-$(CONFIG_CPU_PM) += cpu_pm.o |
105 | 103 | ||
@@ -110,15 +108,6 @@ obj-$(CONFIG_PADATA) += padata.o | |||
110 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 108 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
111 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 109 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
112 | 110 | ||
113 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
114 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
115 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
116 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
117 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
118 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
119 | CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer | ||
120 | endif | ||
121 | |||
122 | $(obj)/configs.o: $(obj)/config_data.h | 111 | $(obj)/configs.o: $(obj)/config_data.h |
123 | 112 | ||
124 | # config_data.h contains the same information as ikconfig.h but gzipped. | 113 | # config_data.h contains the same information as ikconfig.h but gzipped. |
diff --git a/kernel/acct.c b/kernel/acct.c index fa7eb3de2ddc..02e6167a53b0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
84 | * the cache line to have the data after getting the lock. | 84 | * the cache line to have the data after getting the lock. |
85 | */ | 85 | */ |
86 | struct bsd_acct_struct { | 86 | struct bsd_acct_struct { |
87 | volatile int active; | 87 | int active; |
88 | volatile int needcheck; | 88 | unsigned long needcheck; |
89 | struct file *file; | 89 | struct file *file; |
90 | struct pid_namespace *ns; | 90 | struct pid_namespace *ns; |
91 | struct timer_list timer; | ||
92 | struct list_head list; | 91 | struct list_head list; |
93 | }; | 92 | }; |
94 | 93 | ||
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock); | |||
96 | static LIST_HEAD(acct_list); | 95 | static LIST_HEAD(acct_list); |
97 | 96 | ||
98 | /* | 97 | /* |
99 | * Called whenever the timer says to check the free space. | ||
100 | */ | ||
101 | static void acct_timeout(unsigned long x) | ||
102 | { | ||
103 | struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; | ||
104 | acct->needcheck = 1; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Check the amount of free space and suspend/resume accordingly. | 98 | * Check the amount of free space and suspend/resume accordingly. |
109 | */ | 99 | */ |
110 | static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | 100 | static int check_free_space(struct bsd_acct_struct *acct, struct file *file) |
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
112 | struct kstatfs sbuf; | 102 | struct kstatfs sbuf; |
113 | int res; | 103 | int res; |
114 | int act; | 104 | int act; |
115 | sector_t resume; | 105 | u64 resume; |
116 | sector_t suspend; | 106 | u64 suspend; |
117 | 107 | ||
118 | spin_lock(&acct_lock); | 108 | spin_lock(&acct_lock); |
119 | res = acct->active; | 109 | res = acct->active; |
120 | if (!file || !acct->needcheck) | 110 | if (!file || time_is_before_jiffies(acct->needcheck)) |
121 | goto out; | 111 | goto out; |
122 | spin_unlock(&acct_lock); | 112 | spin_unlock(&acct_lock); |
123 | 113 | ||
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
127 | suspend = sbuf.f_blocks * SUSPEND; | 117 | suspend = sbuf.f_blocks * SUSPEND; |
128 | resume = sbuf.f_blocks * RESUME; | 118 | resume = sbuf.f_blocks * RESUME; |
129 | 119 | ||
130 | sector_div(suspend, 100); | 120 | do_div(suspend, 100); |
131 | sector_div(resume, 100); | 121 | do_div(resume, 100); |
132 | 122 | ||
133 | if (sbuf.f_bavail <= suspend) | 123 | if (sbuf.f_bavail <= suspend) |
134 | act = -1; | 124 | act = -1; |
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
160 | } | 150 | } |
161 | } | 151 | } |
162 | 152 | ||
163 | del_timer(&acct->timer); | 153 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; |
164 | acct->needcheck = 0; | ||
165 | acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; | ||
166 | add_timer(&acct->timer); | ||
167 | res = acct->active; | 154 | res = acct->active; |
168 | out: | 155 | out: |
169 | spin_unlock(&acct_lock); | 156 | spin_unlock(&acct_lock); |
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, | |||
185 | if (acct->file) { | 172 | if (acct->file) { |
186 | old_acct = acct->file; | 173 | old_acct = acct->file; |
187 | old_ns = acct->ns; | 174 | old_ns = acct->ns; |
188 | del_timer(&acct->timer); | ||
189 | acct->active = 0; | 175 | acct->active = 0; |
190 | acct->needcheck = 0; | ||
191 | acct->file = NULL; | 176 | acct->file = NULL; |
192 | acct->ns = NULL; | 177 | acct->ns = NULL; |
193 | list_del(&acct->list); | 178 | list_del(&acct->list); |
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, | |||
195 | if (file) { | 180 | if (file) { |
196 | acct->file = file; | 181 | acct->file = file; |
197 | acct->ns = ns; | 182 | acct->ns = ns; |
198 | acct->needcheck = 0; | 183 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; |
199 | acct->active = 1; | 184 | acct->active = 1; |
200 | list_add(&acct->list, &acct_list); | 185 | list_add(&acct->list, &acct_list); |
201 | /* It's been deleted if it was used before so this is safe */ | ||
202 | setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); | ||
203 | acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; | ||
204 | add_timer(&acct->timer); | ||
205 | } | 186 | } |
206 | if (old_acct) { | 187 | if (old_acct) { |
207 | mnt_unpin(old_acct->f_path.mnt); | 188 | mnt_unpin(old_acct->f_path.mnt); |
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb) | |||
334 | spin_lock(&acct_lock); | 315 | spin_lock(&acct_lock); |
335 | restart: | 316 | restart: |
336 | list_for_each_entry(acct, &acct_list, list) | 317 | list_for_each_entry(acct, &acct_list, list) |
337 | if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { | 318 | if (acct->file && acct->file->f_path.dentry->d_sb == sb) { |
338 | acct_file_reopen(acct, NULL, NULL); | 319 | acct_file_reopen(acct, NULL, NULL); |
339 | goto restart; | 320 | goto restart; |
340 | } | 321 | } |
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns) | |||
348 | if (acct == NULL) | 329 | if (acct == NULL) |
349 | return; | 330 | return; |
350 | 331 | ||
351 | del_timer_sync(&acct->timer); | ||
352 | spin_lock(&acct_lock); | 332 | spin_lock(&acct_lock); |
353 | if (acct->file != NULL) | 333 | if (acct->file != NULL) |
354 | acct_file_reopen(acct, NULL, NULL); | 334 | acct_file_reopen(acct, NULL, NULL); |
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
498 | * Fill the accounting struct with the needed info as recorded | 478 | * Fill the accounting struct with the needed info as recorded |
499 | * by the different kernel functions. | 479 | * by the different kernel functions. |
500 | */ | 480 | */ |
501 | memset((caddr_t)&ac, 0, sizeof(acct_t)); | 481 | memset(&ac, 0, sizeof(acct_t)); |
502 | 482 | ||
503 | ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; | 483 | ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; |
504 | strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); | 484 | strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); |
@@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead) | |||
613 | pacct->ac_flag |= ACORE; | 593 | pacct->ac_flag |= ACORE; |
614 | if (current->flags & PF_SIGNALED) | 594 | if (current->flags & PF_SIGNALED) |
615 | pacct->ac_flag |= AXSIG; | 595 | pacct->ac_flag |= AXSIG; |
616 | pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); | 596 | pacct->ac_utime += current->utime; |
617 | pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); | 597 | pacct->ac_stime += current->stime; |
618 | pacct->ac_minflt += current->min_flt; | 598 | pacct->ac_minflt += current->min_flt; |
619 | pacct->ac_majflt += current->maj_flt; | 599 | pacct->ac_majflt += current->maj_flt; |
620 | spin_unlock_irq(&current->sighand->siglock); | 600 | spin_unlock_irq(&current->sighand->siglock); |
diff --git a/kernel/async.c b/kernel/async.c index 80b74b88fefe..bd0c168a3bbe 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); | |||
78 | 78 | ||
79 | static atomic_t entry_count; | 79 | static atomic_t entry_count; |
80 | 80 | ||
81 | extern int initcall_debug; | ||
82 | |||
83 | 81 | ||
84 | /* | 82 | /* |
85 | * MUST be called with the lock held! | 83 | * MUST be called with the lock held! |
diff --git a/kernel/audit.c b/kernel/audit.c index 09fae2677a45..bb0eb5bb9a0a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
601 | case AUDIT_TTY_SET: | 601 | case AUDIT_TTY_SET: |
602 | case AUDIT_TRIM: | 602 | case AUDIT_TRIM: |
603 | case AUDIT_MAKE_EQUIV: | 603 | case AUDIT_MAKE_EQUIV: |
604 | if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) | 604 | if (!capable(CAP_AUDIT_CONTROL)) |
605 | err = -EPERM; | 605 | err = -EPERM; |
606 | break; | 606 | break; |
607 | case AUDIT_USER: | 607 | case AUDIT_USER: |
608 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: | 608 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: |
609 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: | 609 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: |
610 | if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) | 610 | if (!capable(CAP_AUDIT_WRITE)) |
611 | err = -EPERM; | 611 | err = -EPERM; |
612 | break; | 612 | break; |
613 | default: /* bad msg */ | 613 | default: /* bad msg */ |
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
631 | } | 631 | } |
632 | 632 | ||
633 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 633 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
634 | audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", | 634 | audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", |
635 | pid, uid, auid, ses); | 635 | pid, uid, auid, ses); |
636 | if (sid) { | 636 | if (sid) { |
637 | rc = security_secid_to_secctx(sid, &ctx, &len); | 637 | rc = security_secid_to_secctx(sid, &ctx, &len); |
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | |||
1260 | avail = audit_expand(ab, | 1260 | avail = audit_expand(ab, |
1261 | max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); | 1261 | max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); |
1262 | if (!avail) | 1262 | if (!avail) |
1263 | goto out; | 1263 | goto out_va_end; |
1264 | len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); | 1264 | len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); |
1265 | } | 1265 | } |
1266 | va_end(args2); | ||
1267 | if (len > 0) | 1266 | if (len > 0) |
1268 | skb_put(skb, len); | 1267 | skb_put(skb, len); |
1268 | out_va_end: | ||
1269 | va_end(args2); | ||
1269 | out: | 1270 | out: |
1270 | return; | 1271 | return; |
1271 | } | 1272 | } |
@@ -1422,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, | |||
1422 | char *p, *pathname; | 1423 | char *p, *pathname; |
1423 | 1424 | ||
1424 | if (prefix) | 1425 | if (prefix) |
1425 | audit_log_format(ab, " %s", prefix); | 1426 | audit_log_format(ab, "%s", prefix); |
1426 | 1427 | ||
1427 | /* We will allow 11 spaces for ' (deleted)' to be appended */ | 1428 | /* We will allow 11 spaces for ' (deleted)' to be appended */ |
1428 | pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); | 1429 | pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); |
diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2c..816766803371 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
@@ -36,12 +36,8 @@ enum audit_state { | |||
36 | AUDIT_DISABLED, /* Do not create per-task audit_context. | 36 | AUDIT_DISABLED, /* Do not create per-task audit_context. |
37 | * No syscall-specific audit records can | 37 | * No syscall-specific audit records can |
38 | * be generated. */ | 38 | * be generated. */ |
39 | AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, | ||
40 | * but don't necessarily fill it in at | ||
41 | * syscall entry time (i.e., filter | ||
42 | * instead). */ | ||
43 | AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, | 39 | AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, |
44 | * and always fill it in at syscall | 40 | * and fill it in at syscall |
45 | * entry time. This makes a full | 41 | * entry time. This makes a full |
46 | * syscall record available if some | 42 | * syscall record available if some |
47 | * other part of the kernel decides it | 43 | * other part of the kernel decides it |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d678..a6c3f1abd206 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | |||
235 | switch(listnr) { | 235 | switch(listnr) { |
236 | default: | 236 | default: |
237 | goto exit_err; | 237 | goto exit_err; |
238 | case AUDIT_FILTER_USER: | ||
239 | case AUDIT_FILTER_TYPE: | ||
240 | #ifdef CONFIG_AUDITSYSCALL | 238 | #ifdef CONFIG_AUDITSYSCALL |
241 | case AUDIT_FILTER_ENTRY: | 239 | case AUDIT_FILTER_ENTRY: |
240 | if (rule->action == AUDIT_ALWAYS) | ||
241 | goto exit_err; | ||
242 | case AUDIT_FILTER_EXIT: | 242 | case AUDIT_FILTER_EXIT: |
243 | case AUDIT_FILTER_TASK: | 243 | case AUDIT_FILTER_TASK: |
244 | #endif | 244 | #endif |
245 | case AUDIT_FILTER_USER: | ||
246 | case AUDIT_FILTER_TYPE: | ||
245 | ; | 247 | ; |
246 | } | 248 | } |
247 | if (unlikely(rule->action == AUDIT_POSSIBLE)) { | 249 | if (unlikely(rule->action == AUDIT_POSSIBLE)) { |
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
385 | goto exit_free; | 387 | goto exit_free; |
386 | break; | 388 | break; |
387 | case AUDIT_FILETYPE: | 389 | case AUDIT_FILETYPE: |
388 | if ((f->val & ~S_IFMT) > S_IFMT) | 390 | if (f->val & ~S_IFMT) |
389 | goto exit_free; | 391 | goto exit_free; |
390 | break; | 392 | break; |
391 | case AUDIT_INODE: | 393 | case AUDIT_INODE: |
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
459 | case AUDIT_ARG1: | 461 | case AUDIT_ARG1: |
460 | case AUDIT_ARG2: | 462 | case AUDIT_ARG2: |
461 | case AUDIT_ARG3: | 463 | case AUDIT_ARG3: |
464 | case AUDIT_OBJ_UID: | ||
465 | case AUDIT_OBJ_GID: | ||
462 | break; | 466 | break; |
463 | case AUDIT_ARCH: | 467 | case AUDIT_ARCH: |
464 | entry->rule.arch_f = f; | 468 | entry->rule.arch_f = f; |
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
522 | goto exit_free; | 526 | goto exit_free; |
523 | break; | 527 | break; |
524 | case AUDIT_FILTERKEY: | 528 | case AUDIT_FILTERKEY: |
525 | err = -EINVAL; | ||
526 | if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) | 529 | if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) |
527 | goto exit_free; | 530 | goto exit_free; |
528 | str = audit_unpack_string(&bufp, &remain, f->val); | 531 | str = audit_unpack_string(&bufp, &remain, f->val); |
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
536 | goto exit_free; | 539 | goto exit_free; |
537 | break; | 540 | break; |
538 | case AUDIT_FILETYPE: | 541 | case AUDIT_FILETYPE: |
539 | if ((f->val & ~S_IFMT) > S_IFMT) | 542 | if (f->val & ~S_IFMT) |
543 | goto exit_free; | ||
544 | break; | ||
545 | case AUDIT_FIELD_COMPARE: | ||
546 | if (f->val > AUDIT_MAX_FIELD_COMPARE) | ||
540 | goto exit_free; | 547 | goto exit_free; |
541 | break; | 548 | break; |
542 | default: | 549 | default: |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 47b7fc1ea893..af1de0f34eae 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -70,9 +70,15 @@ | |||
70 | 70 | ||
71 | #include "audit.h" | 71 | #include "audit.h" |
72 | 72 | ||
73 | /* flags stating the success for a syscall */ | ||
74 | #define AUDITSC_INVALID 0 | ||
75 | #define AUDITSC_SUCCESS 1 | ||
76 | #define AUDITSC_FAILURE 2 | ||
77 | |||
73 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | 78 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context |
74 | * for saving names from getname(). */ | 79 | * for saving names from getname(). If we get more names we will allocate |
75 | #define AUDIT_NAMES 20 | 80 | * a name dynamically and also add those to the list anchored by names_list. */ |
81 | #define AUDIT_NAMES 5 | ||
76 | 82 | ||
77 | /* Indicates that audit should log the full pathname. */ | 83 | /* Indicates that audit should log the full pathname. */ |
78 | #define AUDIT_NAME_FULL -1 | 84 | #define AUDIT_NAME_FULL -1 |
@@ -101,9 +107,8 @@ struct audit_cap_data { | |||
101 | * | 107 | * |
102 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ | 108 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ |
103 | struct audit_names { | 109 | struct audit_names { |
110 | struct list_head list; /* audit_context->names_list */ | ||
104 | const char *name; | 111 | const char *name; |
105 | int name_len; /* number of name's characters to log */ | ||
106 | unsigned name_put; /* call __putname() for this name */ | ||
107 | unsigned long ino; | 112 | unsigned long ino; |
108 | dev_t dev; | 113 | dev_t dev; |
109 | umode_t mode; | 114 | umode_t mode; |
@@ -113,6 +118,14 @@ struct audit_names { | |||
113 | u32 osid; | 118 | u32 osid; |
114 | struct audit_cap_data fcap; | 119 | struct audit_cap_data fcap; |
115 | unsigned int fcap_ver; | 120 | unsigned int fcap_ver; |
121 | int name_len; /* number of name's characters to log */ | ||
122 | bool name_put; /* call __putname() for this name */ | ||
123 | /* | ||
124 | * This was an allocated audit_names and not from the array of | ||
125 | * names allocated in the task audit context. Thus this name | ||
126 | * should be freed on syscall exit | ||
127 | */ | ||
128 | bool should_free; | ||
116 | }; | 129 | }; |
117 | 130 | ||
118 | struct audit_aux_data { | 131 | struct audit_aux_data { |
@@ -174,8 +187,17 @@ struct audit_context { | |||
174 | long return_code;/* syscall return code */ | 187 | long return_code;/* syscall return code */ |
175 | u64 prio; | 188 | u64 prio; |
176 | int return_valid; /* return code is valid */ | 189 | int return_valid; /* return code is valid */ |
177 | int name_count; | 190 | /* |
178 | struct audit_names names[AUDIT_NAMES]; | 191 | * The names_list is the list of all audit_names collected during this |
192 | * syscall. The first AUDIT_NAMES entries in the names_list will | ||
193 | * actually be from the preallocated_names array for performance | ||
194 | * reasons. Except during allocation they should never be referenced | ||
195 | * through the preallocated_names array and should only be found/used | ||
196 | * by running the names_list. | ||
197 | */ | ||
198 | struct audit_names preallocated_names[AUDIT_NAMES]; | ||
199 | int name_count; /* total records in names_list */ | ||
200 | struct list_head names_list; /* anchor for struct audit_names->list */ | ||
179 | char * filterkey; /* key for rule that triggered record */ | 201 | char * filterkey; /* key for rule that triggered record */ |
180 | struct path pwd; | 202 | struct path pwd; |
181 | struct audit_context *previous; /* For nested syscalls */ | 203 | struct audit_context *previous; /* For nested syscalls */ |
@@ -210,12 +232,12 @@ struct audit_context { | |||
210 | struct { | 232 | struct { |
211 | uid_t uid; | 233 | uid_t uid; |
212 | gid_t gid; | 234 | gid_t gid; |
213 | mode_t mode; | 235 | umode_t mode; |
214 | u32 osid; | 236 | u32 osid; |
215 | int has_perm; | 237 | int has_perm; |
216 | uid_t perm_uid; | 238 | uid_t perm_uid; |
217 | gid_t perm_gid; | 239 | gid_t perm_gid; |
218 | mode_t perm_mode; | 240 | umode_t perm_mode; |
219 | unsigned long qbytes; | 241 | unsigned long qbytes; |
220 | } ipc; | 242 | } ipc; |
221 | struct { | 243 | struct { |
@@ -234,7 +256,7 @@ struct audit_context { | |||
234 | } mq_sendrecv; | 256 | } mq_sendrecv; |
235 | struct { | 257 | struct { |
236 | int oflag; | 258 | int oflag; |
237 | mode_t mode; | 259 | umode_t mode; |
238 | struct mq_attr attr; | 260 | struct mq_attr attr; |
239 | } mq_open; | 261 | } mq_open; |
240 | struct { | 262 | struct { |
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask) | |||
305 | } | 327 | } |
306 | } | 328 | } |
307 | 329 | ||
308 | static int audit_match_filetype(struct audit_context *ctx, int which) | 330 | static int audit_match_filetype(struct audit_context *ctx, int val) |
309 | { | 331 | { |
310 | unsigned index = which & ~S_IFMT; | 332 | struct audit_names *n; |
311 | mode_t mode = which & S_IFMT; | 333 | umode_t mode = (umode_t)val; |
312 | 334 | ||
313 | if (unlikely(!ctx)) | 335 | if (unlikely(!ctx)) |
314 | return 0; | 336 | return 0; |
315 | 337 | ||
316 | if (index >= ctx->name_count) | 338 | list_for_each_entry(n, &ctx->names_list, list) { |
317 | return 0; | 339 | if ((n->ino != -1) && |
318 | if (ctx->names[index].ino == -1) | 340 | ((n->mode & S_IFMT) == mode)) |
319 | return 0; | 341 | return 1; |
320 | if ((ctx->names[index].mode ^ mode) & S_IFMT) | 342 | } |
321 | return 0; | 343 | |
322 | return 1; | 344 | return 0; |
323 | } | 345 | } |
324 | 346 | ||
325 | /* | 347 | /* |
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) | |||
441 | return 0; | 463 | return 0; |
442 | } | 464 | } |
443 | 465 | ||
466 | static int audit_compare_id(uid_t uid1, | ||
467 | struct audit_names *name, | ||
468 | unsigned long name_offset, | ||
469 | struct audit_field *f, | ||
470 | struct audit_context *ctx) | ||
471 | { | ||
472 | struct audit_names *n; | ||
473 | unsigned long addr; | ||
474 | uid_t uid2; | ||
475 | int rc; | ||
476 | |||
477 | BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); | ||
478 | |||
479 | if (name) { | ||
480 | addr = (unsigned long)name; | ||
481 | addr += name_offset; | ||
482 | |||
483 | uid2 = *(uid_t *)addr; | ||
484 | rc = audit_comparator(uid1, f->op, uid2); | ||
485 | if (rc) | ||
486 | return rc; | ||
487 | } | ||
488 | |||
489 | if (ctx) { | ||
490 | list_for_each_entry(n, &ctx->names_list, list) { | ||
491 | addr = (unsigned long)n; | ||
492 | addr += name_offset; | ||
493 | |||
494 | uid2 = *(uid_t *)addr; | ||
495 | |||
496 | rc = audit_comparator(uid1, f->op, uid2); | ||
497 | if (rc) | ||
498 | return rc; | ||
499 | } | ||
500 | } | ||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | static int audit_field_compare(struct task_struct *tsk, | ||
505 | const struct cred *cred, | ||
506 | struct audit_field *f, | ||
507 | struct audit_context *ctx, | ||
508 | struct audit_names *name) | ||
509 | { | ||
510 | switch (f->val) { | ||
511 | /* process to file object comparisons */ | ||
512 | case AUDIT_COMPARE_UID_TO_OBJ_UID: | ||
513 | return audit_compare_id(cred->uid, | ||
514 | name, offsetof(struct audit_names, uid), | ||
515 | f, ctx); | ||
516 | case AUDIT_COMPARE_GID_TO_OBJ_GID: | ||
517 | return audit_compare_id(cred->gid, | ||
518 | name, offsetof(struct audit_names, gid), | ||
519 | f, ctx); | ||
520 | case AUDIT_COMPARE_EUID_TO_OBJ_UID: | ||
521 | return audit_compare_id(cred->euid, | ||
522 | name, offsetof(struct audit_names, uid), | ||
523 | f, ctx); | ||
524 | case AUDIT_COMPARE_EGID_TO_OBJ_GID: | ||
525 | return audit_compare_id(cred->egid, | ||
526 | name, offsetof(struct audit_names, gid), | ||
527 | f, ctx); | ||
528 | case AUDIT_COMPARE_AUID_TO_OBJ_UID: | ||
529 | return audit_compare_id(tsk->loginuid, | ||
530 | name, offsetof(struct audit_names, uid), | ||
531 | f, ctx); | ||
532 | case AUDIT_COMPARE_SUID_TO_OBJ_UID: | ||
533 | return audit_compare_id(cred->suid, | ||
534 | name, offsetof(struct audit_names, uid), | ||
535 | f, ctx); | ||
536 | case AUDIT_COMPARE_SGID_TO_OBJ_GID: | ||
537 | return audit_compare_id(cred->sgid, | ||
538 | name, offsetof(struct audit_names, gid), | ||
539 | f, ctx); | ||
540 | case AUDIT_COMPARE_FSUID_TO_OBJ_UID: | ||
541 | return audit_compare_id(cred->fsuid, | ||
542 | name, offsetof(struct audit_names, uid), | ||
543 | f, ctx); | ||
544 | case AUDIT_COMPARE_FSGID_TO_OBJ_GID: | ||
545 | return audit_compare_id(cred->fsgid, | ||
546 | name, offsetof(struct audit_names, gid), | ||
547 | f, ctx); | ||
548 | /* uid comparisons */ | ||
549 | case AUDIT_COMPARE_UID_TO_AUID: | ||
550 | return audit_comparator(cred->uid, f->op, tsk->loginuid); | ||
551 | case AUDIT_COMPARE_UID_TO_EUID: | ||
552 | return audit_comparator(cred->uid, f->op, cred->euid); | ||
553 | case AUDIT_COMPARE_UID_TO_SUID: | ||
554 | return audit_comparator(cred->uid, f->op, cred->suid); | ||
555 | case AUDIT_COMPARE_UID_TO_FSUID: | ||
556 | return audit_comparator(cred->uid, f->op, cred->fsuid); | ||
557 | /* auid comparisons */ | ||
558 | case AUDIT_COMPARE_AUID_TO_EUID: | ||
559 | return audit_comparator(tsk->loginuid, f->op, cred->euid); | ||
560 | case AUDIT_COMPARE_AUID_TO_SUID: | ||
561 | return audit_comparator(tsk->loginuid, f->op, cred->suid); | ||
562 | case AUDIT_COMPARE_AUID_TO_FSUID: | ||
563 | return audit_comparator(tsk->loginuid, f->op, cred->fsuid); | ||
564 | /* euid comparisons */ | ||
565 | case AUDIT_COMPARE_EUID_TO_SUID: | ||
566 | return audit_comparator(cred->euid, f->op, cred->suid); | ||
567 | case AUDIT_COMPARE_EUID_TO_FSUID: | ||
568 | return audit_comparator(cred->euid, f->op, cred->fsuid); | ||
569 | /* suid comparisons */ | ||
570 | case AUDIT_COMPARE_SUID_TO_FSUID: | ||
571 | return audit_comparator(cred->suid, f->op, cred->fsuid); | ||
572 | /* gid comparisons */ | ||
573 | case AUDIT_COMPARE_GID_TO_EGID: | ||
574 | return audit_comparator(cred->gid, f->op, cred->egid); | ||
575 | case AUDIT_COMPARE_GID_TO_SGID: | ||
576 | return audit_comparator(cred->gid, f->op, cred->sgid); | ||
577 | case AUDIT_COMPARE_GID_TO_FSGID: | ||
578 | return audit_comparator(cred->gid, f->op, cred->fsgid); | ||
579 | /* egid comparisons */ | ||
580 | case AUDIT_COMPARE_EGID_TO_SGID: | ||
581 | return audit_comparator(cred->egid, f->op, cred->sgid); | ||
582 | case AUDIT_COMPARE_EGID_TO_FSGID: | ||
583 | return audit_comparator(cred->egid, f->op, cred->fsgid); | ||
584 | /* sgid comparison */ | ||
585 | case AUDIT_COMPARE_SGID_TO_FSGID: | ||
586 | return audit_comparator(cred->sgid, f->op, cred->fsgid); | ||
587 | default: | ||
588 | WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); | ||
589 | return 0; | ||
590 | } | ||
591 | return 0; | ||
592 | } | ||
593 | |||
444 | /* Determine if any context name data matches a rule's watch data */ | 594 | /* Determine if any context name data matches a rule's watch data */ |
445 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 595 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
446 | * otherwise. | 596 | * otherwise. |
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
457 | bool task_creation) | 607 | bool task_creation) |
458 | { | 608 | { |
459 | const struct cred *cred; | 609 | const struct cred *cred; |
460 | int i, j, need_sid = 1; | 610 | int i, need_sid = 1; |
461 | u32 sid; | 611 | u32 sid; |
462 | 612 | ||
463 | cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); | 613 | cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); |
464 | 614 | ||
465 | for (i = 0; i < rule->field_count; i++) { | 615 | for (i = 0; i < rule->field_count; i++) { |
466 | struct audit_field *f = &rule->fields[i]; | 616 | struct audit_field *f = &rule->fields[i]; |
617 | struct audit_names *n; | ||
467 | int result = 0; | 618 | int result = 0; |
468 | 619 | ||
469 | switch (f->type) { | 620 | switch (f->type) { |
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
522 | } | 673 | } |
523 | break; | 674 | break; |
524 | case AUDIT_DEVMAJOR: | 675 | case AUDIT_DEVMAJOR: |
525 | if (name) | 676 | if (name) { |
526 | result = audit_comparator(MAJOR(name->dev), | 677 | if (audit_comparator(MAJOR(name->dev), f->op, f->val) || |
527 | f->op, f->val); | 678 | audit_comparator(MAJOR(name->rdev), f->op, f->val)) |
528 | else if (ctx) { | 679 | ++result; |
529 | for (j = 0; j < ctx->name_count; j++) { | 680 | } else if (ctx) { |
530 | if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { | 681 | list_for_each_entry(n, &ctx->names_list, list) { |
682 | if (audit_comparator(MAJOR(n->dev), f->op, f->val) || | ||
683 | audit_comparator(MAJOR(n->rdev), f->op, f->val)) { | ||
531 | ++result; | 684 | ++result; |
532 | break; | 685 | break; |
533 | } | 686 | } |
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
535 | } | 688 | } |
536 | break; | 689 | break; |
537 | case AUDIT_DEVMINOR: | 690 | case AUDIT_DEVMINOR: |
538 | if (name) | 691 | if (name) { |
539 | result = audit_comparator(MINOR(name->dev), | 692 | if (audit_comparator(MINOR(name->dev), f->op, f->val) || |
540 | f->op, f->val); | 693 | audit_comparator(MINOR(name->rdev), f->op, f->val)) |
541 | else if (ctx) { | 694 | ++result; |
542 | for (j = 0; j < ctx->name_count; j++) { | 695 | } else if (ctx) { |
543 | if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { | 696 | list_for_each_entry(n, &ctx->names_list, list) { |
697 | if (audit_comparator(MINOR(n->dev), f->op, f->val) || | ||
698 | audit_comparator(MINOR(n->rdev), f->op, f->val)) { | ||
544 | ++result; | 699 | ++result; |
545 | break; | 700 | break; |
546 | } | 701 | } |
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
551 | if (name) | 706 | if (name) |
552 | result = (name->ino == f->val); | 707 | result = (name->ino == f->val); |
553 | else if (ctx) { | 708 | else if (ctx) { |
554 | for (j = 0; j < ctx->name_count; j++) { | 709 | list_for_each_entry(n, &ctx->names_list, list) { |
555 | if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { | 710 | if (audit_comparator(n->ino, f->op, f->val)) { |
711 | ++result; | ||
712 | break; | ||
713 | } | ||
714 | } | ||
715 | } | ||
716 | break; | ||
717 | case AUDIT_OBJ_UID: | ||
718 | if (name) { | ||
719 | result = audit_comparator(name->uid, f->op, f->val); | ||
720 | } else if (ctx) { | ||
721 | list_for_each_entry(n, &ctx->names_list, list) { | ||
722 | if (audit_comparator(n->uid, f->op, f->val)) { | ||
723 | ++result; | ||
724 | break; | ||
725 | } | ||
726 | } | ||
727 | } | ||
728 | break; | ||
729 | case AUDIT_OBJ_GID: | ||
730 | if (name) { | ||
731 | result = audit_comparator(name->gid, f->op, f->val); | ||
732 | } else if (ctx) { | ||
733 | list_for_each_entry(n, &ctx->names_list, list) { | ||
734 | if (audit_comparator(n->gid, f->op, f->val)) { | ||
556 | ++result; | 735 | ++result; |
557 | break; | 736 | break; |
558 | } | 737 | } |
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
607 | name->osid, f->type, f->op, | 786 | name->osid, f->type, f->op, |
608 | f->lsm_rule, ctx); | 787 | f->lsm_rule, ctx); |
609 | } else if (ctx) { | 788 | } else if (ctx) { |
610 | for (j = 0; j < ctx->name_count; j++) { | 789 | list_for_each_entry(n, &ctx->names_list, list) { |
611 | if (security_audit_rule_match( | 790 | if (security_audit_rule_match(n->osid, f->type, |
612 | ctx->names[j].osid, | 791 | f->op, f->lsm_rule, |
613 | f->type, f->op, | 792 | ctx)) { |
614 | f->lsm_rule, ctx)) { | ||
615 | ++result; | 793 | ++result; |
616 | break; | 794 | break; |
617 | } | 795 | } |
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
643 | case AUDIT_FILETYPE: | 821 | case AUDIT_FILETYPE: |
644 | result = audit_match_filetype(ctx, f->val); | 822 | result = audit_match_filetype(ctx, f->val); |
645 | break; | 823 | break; |
824 | case AUDIT_FIELD_COMPARE: | ||
825 | result = audit_field_compare(tsk, cred, f, ctx, name); | ||
826 | break; | ||
646 | } | 827 | } |
647 | |||
648 | if (!result) | 828 | if (!result) |
649 | return 0; | 829 | return 0; |
650 | } | 830 | } |
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
722 | return AUDIT_BUILD_CONTEXT; | 902 | return AUDIT_BUILD_CONTEXT; |
723 | } | 903 | } |
724 | 904 | ||
725 | /* At syscall exit time, this filter is called if any audit_names[] have been | 905 | /* |
906 | * Given an audit_name check the inode hash table to see if they match. | ||
907 | * Called holding the rcu read lock to protect the use of audit_inode_hash | ||
908 | */ | ||
909 | static int audit_filter_inode_name(struct task_struct *tsk, | ||
910 | struct audit_names *n, | ||
911 | struct audit_context *ctx) { | ||
912 | int word, bit; | ||
913 | int h = audit_hash_ino((u32)n->ino); | ||
914 | struct list_head *list = &audit_inode_hash[h]; | ||
915 | struct audit_entry *e; | ||
916 | enum audit_state state; | ||
917 | |||
918 | word = AUDIT_WORD(ctx->major); | ||
919 | bit = AUDIT_BIT(ctx->major); | ||
920 | |||
921 | if (list_empty(list)) | ||
922 | return 0; | ||
923 | |||
924 | list_for_each_entry_rcu(e, list, list) { | ||
925 | if ((e->rule.mask[word] & bit) == bit && | ||
926 | audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { | ||
927 | ctx->current_state = state; | ||
928 | return 1; | ||
929 | } | ||
930 | } | ||
931 | |||
932 | return 0; | ||
933 | } | ||
934 | |||
935 | /* At syscall exit time, this filter is called if any audit_names have been | ||
726 | * collected during syscall processing. We only check rules in sublists at hash | 936 | * collected during syscall processing. We only check rules in sublists at hash |
727 | * buckets applicable to the inode numbers in audit_names[]. | 937 | * buckets applicable to the inode numbers in audit_names. |
728 | * Regarding audit_state, same rules apply as for audit_filter_syscall(). | 938 | * Regarding audit_state, same rules apply as for audit_filter_syscall(). |
729 | */ | 939 | */ |
730 | void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) | 940 | void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) |
731 | { | 941 | { |
732 | int i; | 942 | struct audit_names *n; |
733 | struct audit_entry *e; | ||
734 | enum audit_state state; | ||
735 | 943 | ||
736 | if (audit_pid && tsk->tgid == audit_pid) | 944 | if (audit_pid && tsk->tgid == audit_pid) |
737 | return; | 945 | return; |
738 | 946 | ||
739 | rcu_read_lock(); | 947 | rcu_read_lock(); |
740 | for (i = 0; i < ctx->name_count; i++) { | ||
741 | int word = AUDIT_WORD(ctx->major); | ||
742 | int bit = AUDIT_BIT(ctx->major); | ||
743 | struct audit_names *n = &ctx->names[i]; | ||
744 | int h = audit_hash_ino((u32)n->ino); | ||
745 | struct list_head *list = &audit_inode_hash[h]; | ||
746 | |||
747 | if (list_empty(list)) | ||
748 | continue; | ||
749 | 948 | ||
750 | list_for_each_entry_rcu(e, list, list) { | 949 | list_for_each_entry(n, &ctx->names_list, list) { |
751 | if ((e->rule.mask[word] & bit) == bit && | 950 | if (audit_filter_inode_name(tsk, n, ctx)) |
752 | audit_filter_rules(tsk, &e->rule, ctx, n, | 951 | break; |
753 | &state, false)) { | ||
754 | rcu_read_unlock(); | ||
755 | ctx->current_state = state; | ||
756 | return; | ||
757 | } | ||
758 | } | ||
759 | } | 952 | } |
760 | rcu_read_unlock(); | 953 | rcu_read_unlock(); |
761 | } | 954 | } |
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
766 | { | 959 | { |
767 | struct audit_context *context = tsk->audit_context; | 960 | struct audit_context *context = tsk->audit_context; |
768 | 961 | ||
769 | if (likely(!context)) | 962 | if (!context) |
770 | return NULL; | 963 | return NULL; |
771 | context->return_valid = return_valid; | 964 | context->return_valid = return_valid; |
772 | 965 | ||
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
799 | 992 | ||
800 | static inline void audit_free_names(struct audit_context *context) | 993 | static inline void audit_free_names(struct audit_context *context) |
801 | { | 994 | { |
802 | int i; | 995 | struct audit_names *n, *next; |
803 | 996 | ||
804 | #if AUDIT_DEBUG == 2 | 997 | #if AUDIT_DEBUG == 2 |
805 | if (context->put_count + context->ino_count != context->name_count) { | 998 | if (context->put_count + context->ino_count != context->name_count) { |
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context) | |||
810 | context->serial, context->major, context->in_syscall, | 1003 | context->serial, context->major, context->in_syscall, |
811 | context->name_count, context->put_count, | 1004 | context->name_count, context->put_count, |
812 | context->ino_count); | 1005 | context->ino_count); |
813 | for (i = 0; i < context->name_count; i++) { | 1006 | list_for_each_entry(n, &context->names_list, list) { |
814 | printk(KERN_ERR "names[%d] = %p = %s\n", i, | 1007 | printk(KERN_ERR "names[%d] = %p = %s\n", i, |
815 | context->names[i].name, | 1008 | n->name, n->name ?: "(null)"); |
816 | context->names[i].name ?: "(null)"); | ||
817 | } | 1009 | } |
818 | dump_stack(); | 1010 | dump_stack(); |
819 | return; | 1011 | return; |
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context) | |||
824 | context->ino_count = 0; | 1016 | context->ino_count = 0; |
825 | #endif | 1017 | #endif |
826 | 1018 | ||
827 | for (i = 0; i < context->name_count; i++) { | 1019 | list_for_each_entry_safe(n, next, &context->names_list, list) { |
828 | if (context->names[i].name && context->names[i].name_put) | 1020 | list_del(&n->list); |
829 | __putname(context->names[i].name); | 1021 | if (n->name && n->name_put) |
1022 | __putname(n->name); | ||
1023 | if (n->should_free) | ||
1024 | kfree(n); | ||
830 | } | 1025 | } |
831 | context->name_count = 0; | 1026 | context->name_count = 0; |
832 | path_put(&context->pwd); | 1027 | path_put(&context->pwd); |
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) | |||
864 | return NULL; | 1059 | return NULL; |
865 | audit_zero_context(context, state); | 1060 | audit_zero_context(context, state); |
866 | INIT_LIST_HEAD(&context->killed_trees); | 1061 | INIT_LIST_HEAD(&context->killed_trees); |
1062 | INIT_LIST_HEAD(&context->names_list); | ||
867 | return context; | 1063 | return context; |
868 | } | 1064 | } |
869 | 1065 | ||
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk) | |||
886 | return 0; /* Return if not auditing. */ | 1082 | return 0; /* Return if not auditing. */ |
887 | 1083 | ||
888 | state = audit_filter_task(tsk, &key); | 1084 | state = audit_filter_task(tsk, &key); |
889 | if (likely(state == AUDIT_DISABLED)) | 1085 | if (state == AUDIT_DISABLED) |
890 | return 0; | 1086 | return 0; |
891 | 1087 | ||
892 | if (!(context = audit_alloc_context(state))) { | 1088 | if (!(context = audit_alloc_context(state))) { |
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk | |||
975 | while (vma) { | 1171 | while (vma) { |
976 | if ((vma->vm_flags & VM_EXECUTABLE) && | 1172 | if ((vma->vm_flags & VM_EXECUTABLE) && |
977 | vma->vm_file) { | 1173 | vma->vm_file) { |
978 | audit_log_d_path(ab, "exe=", | 1174 | audit_log_d_path(ab, " exe=", |
979 | &vma->vm_file->f_path); | 1175 | &vma->vm_file->f_path); |
980 | break; | 1176 | break; |
981 | } | 1177 | } |
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, | |||
1166 | struct audit_buffer **ab, | 1362 | struct audit_buffer **ab, |
1167 | struct audit_aux_data_execve *axi) | 1363 | struct audit_aux_data_execve *axi) |
1168 | { | 1364 | { |
1169 | int i; | 1365 | int i, len; |
1170 | size_t len, len_sent = 0; | 1366 | size_t len_sent = 0; |
1171 | const char __user *p; | 1367 | const char __user *p; |
1172 | char *buf; | 1368 | char *buf; |
1173 | 1369 | ||
@@ -1249,7 +1445,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1249 | case AUDIT_IPC: { | 1445 | case AUDIT_IPC: { |
1250 | u32 osid = context->ipc.osid; | 1446 | u32 osid = context->ipc.osid; |
1251 | 1447 | ||
1252 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", | 1448 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", |
1253 | context->ipc.uid, context->ipc.gid, context->ipc.mode); | 1449 | context->ipc.uid, context->ipc.gid, context->ipc.mode); |
1254 | if (osid) { | 1450 | if (osid) { |
1255 | char *ctx = NULL; | 1451 | char *ctx = NULL; |
@@ -1267,7 +1463,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1267 | ab = audit_log_start(context, GFP_KERNEL, | 1463 | ab = audit_log_start(context, GFP_KERNEL, |
1268 | AUDIT_IPC_SET_PERM); | 1464 | AUDIT_IPC_SET_PERM); |
1269 | audit_log_format(ab, | 1465 | audit_log_format(ab, |
1270 | "qbytes=%lx ouid=%u ogid=%u mode=%#o", | 1466 | "qbytes=%lx ouid=%u ogid=%u mode=%#ho", |
1271 | context->ipc.qbytes, | 1467 | context->ipc.qbytes, |
1272 | context->ipc.perm_uid, | 1468 | context->ipc.perm_uid, |
1273 | context->ipc.perm_gid, | 1469 | context->ipc.perm_gid, |
@@ -1278,7 +1474,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1278 | break; } | 1474 | break; } |
1279 | case AUDIT_MQ_OPEN: { | 1475 | case AUDIT_MQ_OPEN: { |
1280 | audit_log_format(ab, | 1476 | audit_log_format(ab, |
1281 | "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " | 1477 | "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " |
1282 | "mq_msgsize=%ld mq_curmsgs=%ld", | 1478 | "mq_msgsize=%ld mq_curmsgs=%ld", |
1283 | context->mq_open.oflag, context->mq_open.mode, | 1479 | context->mq_open.oflag, context->mq_open.mode, |
1284 | context->mq_open.attr.mq_flags, | 1480 | context->mq_open.attr.mq_flags, |
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1324 | audit_log_end(ab); | 1520 | audit_log_end(ab); |
1325 | } | 1521 | } |
1326 | 1522 | ||
1523 | static void audit_log_name(struct audit_context *context, struct audit_names *n, | ||
1524 | int record_num, int *call_panic) | ||
1525 | { | ||
1526 | struct audit_buffer *ab; | ||
1527 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | ||
1528 | if (!ab) | ||
1529 | return; /* audit_panic has been called */ | ||
1530 | |||
1531 | audit_log_format(ab, "item=%d", record_num); | ||
1532 | |||
1533 | if (n->name) { | ||
1534 | switch (n->name_len) { | ||
1535 | case AUDIT_NAME_FULL: | ||
1536 | /* log the full path */ | ||
1537 | audit_log_format(ab, " name="); | ||
1538 | audit_log_untrustedstring(ab, n->name); | ||
1539 | break; | ||
1540 | case 0: | ||
1541 | /* name was specified as a relative path and the | ||
1542 | * directory component is the cwd */ | ||
1543 | audit_log_d_path(ab, " name=", &context->pwd); | ||
1544 | break; | ||
1545 | default: | ||
1546 | /* log the name's directory component */ | ||
1547 | audit_log_format(ab, " name="); | ||
1548 | audit_log_n_untrustedstring(ab, n->name, | ||
1549 | n->name_len); | ||
1550 | } | ||
1551 | } else | ||
1552 | audit_log_format(ab, " name=(null)"); | ||
1553 | |||
1554 | if (n->ino != (unsigned long)-1) { | ||
1555 | audit_log_format(ab, " inode=%lu" | ||
1556 | " dev=%02x:%02x mode=%#ho" | ||
1557 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
1558 | n->ino, | ||
1559 | MAJOR(n->dev), | ||
1560 | MINOR(n->dev), | ||
1561 | n->mode, | ||
1562 | n->uid, | ||
1563 | n->gid, | ||
1564 | MAJOR(n->rdev), | ||
1565 | MINOR(n->rdev)); | ||
1566 | } | ||
1567 | if (n->osid != 0) { | ||
1568 | char *ctx = NULL; | ||
1569 | u32 len; | ||
1570 | if (security_secid_to_secctx( | ||
1571 | n->osid, &ctx, &len)) { | ||
1572 | audit_log_format(ab, " osid=%u", n->osid); | ||
1573 | *call_panic = 2; | ||
1574 | } else { | ||
1575 | audit_log_format(ab, " obj=%s", ctx); | ||
1576 | security_release_secctx(ctx, len); | ||
1577 | } | ||
1578 | } | ||
1579 | |||
1580 | audit_log_fcaps(ab, n); | ||
1581 | |||
1582 | audit_log_end(ab); | ||
1583 | } | ||
1584 | |||
1327 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 1585 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
1328 | { | 1586 | { |
1329 | const struct cred *cred; | 1587 | const struct cred *cred; |
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1331 | struct audit_buffer *ab; | 1589 | struct audit_buffer *ab; |
1332 | struct audit_aux_data *aux; | 1590 | struct audit_aux_data *aux; |
1333 | const char *tty; | 1591 | const char *tty; |
1592 | struct audit_names *n; | ||
1334 | 1593 | ||
1335 | /* tsk == current */ | 1594 | /* tsk == current */ |
1336 | context->pid = tsk->pid; | 1595 | context->pid = tsk->pid; |
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1466 | if (context->pwd.dentry && context->pwd.mnt) { | 1725 | if (context->pwd.dentry && context->pwd.mnt) { |
1467 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); | 1726 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); |
1468 | if (ab) { | 1727 | if (ab) { |
1469 | audit_log_d_path(ab, "cwd=", &context->pwd); | 1728 | audit_log_d_path(ab, " cwd=", &context->pwd); |
1470 | audit_log_end(ab); | 1729 | audit_log_end(ab); |
1471 | } | 1730 | } |
1472 | } | 1731 | } |
1473 | for (i = 0; i < context->name_count; i++) { | ||
1474 | struct audit_names *n = &context->names[i]; | ||
1475 | 1732 | ||
1476 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | 1733 | i = 0; |
1477 | if (!ab) | 1734 | list_for_each_entry(n, &context->names_list, list) |
1478 | continue; /* audit_panic has been called */ | 1735 | audit_log_name(context, n, i++, &call_panic); |
1479 | |||
1480 | audit_log_format(ab, "item=%d", i); | ||
1481 | |||
1482 | if (n->name) { | ||
1483 | switch(n->name_len) { | ||
1484 | case AUDIT_NAME_FULL: | ||
1485 | /* log the full path */ | ||
1486 | audit_log_format(ab, " name="); | ||
1487 | audit_log_untrustedstring(ab, n->name); | ||
1488 | break; | ||
1489 | case 0: | ||
1490 | /* name was specified as a relative path and the | ||
1491 | * directory component is the cwd */ | ||
1492 | audit_log_d_path(ab, "name=", &context->pwd); | ||
1493 | break; | ||
1494 | default: | ||
1495 | /* log the name's directory component */ | ||
1496 | audit_log_format(ab, " name="); | ||
1497 | audit_log_n_untrustedstring(ab, n->name, | ||
1498 | n->name_len); | ||
1499 | } | ||
1500 | } else | ||
1501 | audit_log_format(ab, " name=(null)"); | ||
1502 | |||
1503 | if (n->ino != (unsigned long)-1) { | ||
1504 | audit_log_format(ab, " inode=%lu" | ||
1505 | " dev=%02x:%02x mode=%#o" | ||
1506 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
1507 | n->ino, | ||
1508 | MAJOR(n->dev), | ||
1509 | MINOR(n->dev), | ||
1510 | n->mode, | ||
1511 | n->uid, | ||
1512 | n->gid, | ||
1513 | MAJOR(n->rdev), | ||
1514 | MINOR(n->rdev)); | ||
1515 | } | ||
1516 | if (n->osid != 0) { | ||
1517 | char *ctx = NULL; | ||
1518 | u32 len; | ||
1519 | if (security_secid_to_secctx( | ||
1520 | n->osid, &ctx, &len)) { | ||
1521 | audit_log_format(ab, " osid=%u", n->osid); | ||
1522 | call_panic = 2; | ||
1523 | } else { | ||
1524 | audit_log_format(ab, " obj=%s", ctx); | ||
1525 | security_release_secctx(ctx, len); | ||
1526 | } | ||
1527 | } | ||
1528 | |||
1529 | audit_log_fcaps(ab, n); | ||
1530 | |||
1531 | audit_log_end(ab); | ||
1532 | } | ||
1533 | 1736 | ||
1534 | /* Send end of event record to help user space know we are finished */ | 1737 | /* Send end of event record to help user space know we are finished */ |
1535 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); | 1738 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); |
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1545 | * | 1748 | * |
1546 | * Called from copy_process and do_exit | 1749 | * Called from copy_process and do_exit |
1547 | */ | 1750 | */ |
1548 | void audit_free(struct task_struct *tsk) | 1751 | void __audit_free(struct task_struct *tsk) |
1549 | { | 1752 | { |
1550 | struct audit_context *context; | 1753 | struct audit_context *context; |
1551 | 1754 | ||
1552 | context = audit_get_context(tsk, 0, 0); | 1755 | context = audit_get_context(tsk, 0, 0); |
1553 | if (likely(!context)) | 1756 | if (!context) |
1554 | return; | 1757 | return; |
1555 | 1758 | ||
1556 | /* Check for system calls that do not go through the exit | 1759 | /* Check for system calls that do not go through the exit |
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk) | |||
1583 | * will only be written if another part of the kernel requests that it | 1786 | * will only be written if another part of the kernel requests that it |
1584 | * be written). | 1787 | * be written). |
1585 | */ | 1788 | */ |
1586 | void audit_syscall_entry(int arch, int major, | 1789 | void __audit_syscall_entry(int arch, int major, |
1587 | unsigned long a1, unsigned long a2, | 1790 | unsigned long a1, unsigned long a2, |
1588 | unsigned long a3, unsigned long a4) | 1791 | unsigned long a3, unsigned long a4) |
1589 | { | 1792 | { |
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major, | |||
1591 | struct audit_context *context = tsk->audit_context; | 1794 | struct audit_context *context = tsk->audit_context; |
1592 | enum audit_state state; | 1795 | enum audit_state state; |
1593 | 1796 | ||
1594 | if (unlikely(!context)) | 1797 | if (!context) |
1595 | return; | 1798 | return; |
1596 | 1799 | ||
1597 | /* | 1800 | /* |
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major, | |||
1648 | context->prio = 0; | 1851 | context->prio = 0; |
1649 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); | 1852 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); |
1650 | } | 1853 | } |
1651 | if (likely(state == AUDIT_DISABLED)) | 1854 | if (state == AUDIT_DISABLED) |
1652 | return; | 1855 | return; |
1653 | 1856 | ||
1654 | context->serial = 0; | 1857 | context->serial = 0; |
@@ -1658,45 +1861,29 @@ void audit_syscall_entry(int arch, int major, | |||
1658 | context->ppid = 0; | 1861 | context->ppid = 0; |
1659 | } | 1862 | } |
1660 | 1863 | ||
1661 | void audit_finish_fork(struct task_struct *child) | ||
1662 | { | ||
1663 | struct audit_context *ctx = current->audit_context; | ||
1664 | struct audit_context *p = child->audit_context; | ||
1665 | if (!p || !ctx) | ||
1666 | return; | ||
1667 | if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) | ||
1668 | return; | ||
1669 | p->arch = ctx->arch; | ||
1670 | p->major = ctx->major; | ||
1671 | memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); | ||
1672 | p->ctime = ctx->ctime; | ||
1673 | p->dummy = ctx->dummy; | ||
1674 | p->in_syscall = ctx->in_syscall; | ||
1675 | p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); | ||
1676 | p->ppid = current->pid; | ||
1677 | p->prio = ctx->prio; | ||
1678 | p->current_state = ctx->current_state; | ||
1679 | } | ||
1680 | |||
1681 | /** | 1864 | /** |
1682 | * audit_syscall_exit - deallocate audit context after a system call | 1865 | * audit_syscall_exit - deallocate audit context after a system call |
1683 | * @valid: success/failure flag | 1866 | * @success: success value of the syscall |
1684 | * @return_code: syscall return value | 1867 | * @return_code: return value of the syscall |
1685 | * | 1868 | * |
1686 | * Tear down after system call. If the audit context has been marked as | 1869 | * Tear down after system call. If the audit context has been marked as |
1687 | * auditable (either because of the AUDIT_RECORD_CONTEXT state from | 1870 | * auditable (either because of the AUDIT_RECORD_CONTEXT state from |
1688 | * filtering, or because some other part of the kernel write an audit | 1871 | * filtering, or because some other part of the kernel wrote an audit |
1689 | * message), then write out the syscall information. In call cases, | 1872 | * message), then write out the syscall information. In call cases, |
1690 | * free the names stored from getname(). | 1873 | * free the names stored from getname(). |
1691 | */ | 1874 | */ |
1692 | void audit_syscall_exit(int valid, long return_code) | 1875 | void __audit_syscall_exit(int success, long return_code) |
1693 | { | 1876 | { |
1694 | struct task_struct *tsk = current; | 1877 | struct task_struct *tsk = current; |
1695 | struct audit_context *context; | 1878 | struct audit_context *context; |
1696 | 1879 | ||
1697 | context = audit_get_context(tsk, valid, return_code); | 1880 | if (success) |
1881 | success = AUDITSC_SUCCESS; | ||
1882 | else | ||
1883 | success = AUDITSC_FAILURE; | ||
1698 | 1884 | ||
1699 | if (likely(!context)) | 1885 | context = audit_get_context(tsk, success, return_code); |
1886 | if (!context) | ||
1700 | return; | 1887 | return; |
1701 | 1888 | ||
1702 | if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) | 1889 | if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) |
@@ -1821,6 +2008,30 @@ retry: | |||
1821 | #endif | 2008 | #endif |
1822 | } | 2009 | } |
1823 | 2010 | ||
2011 | static struct audit_names *audit_alloc_name(struct audit_context *context) | ||
2012 | { | ||
2013 | struct audit_names *aname; | ||
2014 | |||
2015 | if (context->name_count < AUDIT_NAMES) { | ||
2016 | aname = &context->preallocated_names[context->name_count]; | ||
2017 | memset(aname, 0, sizeof(*aname)); | ||
2018 | } else { | ||
2019 | aname = kzalloc(sizeof(*aname), GFP_NOFS); | ||
2020 | if (!aname) | ||
2021 | return NULL; | ||
2022 | aname->should_free = true; | ||
2023 | } | ||
2024 | |||
2025 | aname->ino = (unsigned long)-1; | ||
2026 | list_add_tail(&aname->list, &context->names_list); | ||
2027 | |||
2028 | context->name_count++; | ||
2029 | #if AUDIT_DEBUG | ||
2030 | context->ino_count++; | ||
2031 | #endif | ||
2032 | return aname; | ||
2033 | } | ||
2034 | |||
1824 | /** | 2035 | /** |
1825 | * audit_getname - add a name to the list | 2036 | * audit_getname - add a name to the list |
1826 | * @name: name to add | 2037 | * @name: name to add |
@@ -1831,9 +2042,7 @@ retry: | |||
1831 | void __audit_getname(const char *name) | 2042 | void __audit_getname(const char *name) |
1832 | { | 2043 | { |
1833 | struct audit_context *context = current->audit_context; | 2044 | struct audit_context *context = current->audit_context; |
1834 | 2045 | struct audit_names *n; | |
1835 | if (IS_ERR(name) || !name) | ||
1836 | return; | ||
1837 | 2046 | ||
1838 | if (!context->in_syscall) { | 2047 | if (!context->in_syscall) { |
1839 | #if AUDIT_DEBUG == 2 | 2048 | #if AUDIT_DEBUG == 2 |
@@ -1843,13 +2052,15 @@ void __audit_getname(const char *name) | |||
1843 | #endif | 2052 | #endif |
1844 | return; | 2053 | return; |
1845 | } | 2054 | } |
1846 | BUG_ON(context->name_count >= AUDIT_NAMES); | 2055 | |
1847 | context->names[context->name_count].name = name; | 2056 | n = audit_alloc_name(context); |
1848 | context->names[context->name_count].name_len = AUDIT_NAME_FULL; | 2057 | if (!n) |
1849 | context->names[context->name_count].name_put = 1; | 2058 | return; |
1850 | context->names[context->name_count].ino = (unsigned long)-1; | 2059 | |
1851 | context->names[context->name_count].osid = 0; | 2060 | n->name = name; |
1852 | ++context->name_count; | 2061 | n->name_len = AUDIT_NAME_FULL; |
2062 | n->name_put = true; | ||
2063 | |||
1853 | if (!context->pwd.dentry) | 2064 | if (!context->pwd.dentry) |
1854 | get_fs_pwd(current->fs, &context->pwd); | 2065 | get_fs_pwd(current->fs, &context->pwd); |
1855 | } | 2066 | } |
@@ -1871,12 +2082,13 @@ void audit_putname(const char *name) | |||
1871 | printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", | 2082 | printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", |
1872 | __FILE__, __LINE__, context->serial, name); | 2083 | __FILE__, __LINE__, context->serial, name); |
1873 | if (context->name_count) { | 2084 | if (context->name_count) { |
2085 | struct audit_names *n; | ||
1874 | int i; | 2086 | int i; |
1875 | for (i = 0; i < context->name_count; i++) | 2087 | |
2088 | list_for_each_entry(n, &context->names_list, list) | ||
1876 | printk(KERN_ERR "name[%d] = %p = %s\n", i, | 2089 | printk(KERN_ERR "name[%d] = %p = %s\n", i, |
1877 | context->names[i].name, | 2090 | n->name, n->name ?: "(null)"); |
1878 | context->names[i].name ?: "(null)"); | 2091 | } |
1879 | } | ||
1880 | #endif | 2092 | #endif |
1881 | __putname(name); | 2093 | __putname(name); |
1882 | } | 2094 | } |
@@ -1897,39 +2109,11 @@ void audit_putname(const char *name) | |||
1897 | #endif | 2109 | #endif |
1898 | } | 2110 | } |
1899 | 2111 | ||
1900 | static int audit_inc_name_count(struct audit_context *context, | ||
1901 | const struct inode *inode) | ||
1902 | { | ||
1903 | if (context->name_count >= AUDIT_NAMES) { | ||
1904 | if (inode) | ||
1905 | printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " | ||
1906 | "dev=%02x:%02x, inode=%lu\n", | ||
1907 | MAJOR(inode->i_sb->s_dev), | ||
1908 | MINOR(inode->i_sb->s_dev), | ||
1909 | inode->i_ino); | ||
1910 | |||
1911 | else | ||
1912 | printk(KERN_DEBUG "name_count maxed, losing inode data\n"); | ||
1913 | return 1; | ||
1914 | } | ||
1915 | context->name_count++; | ||
1916 | #if AUDIT_DEBUG | ||
1917 | context->ino_count++; | ||
1918 | #endif | ||
1919 | return 0; | ||
1920 | } | ||
1921 | |||
1922 | |||
1923 | static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) | 2112 | static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) |
1924 | { | 2113 | { |
1925 | struct cpu_vfs_cap_data caps; | 2114 | struct cpu_vfs_cap_data caps; |
1926 | int rc; | 2115 | int rc; |
1927 | 2116 | ||
1928 | memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); | ||
1929 | memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); | ||
1930 | name->fcap.fE = 0; | ||
1931 | name->fcap_ver = 0; | ||
1932 | |||
1933 | if (!dentry) | 2117 | if (!dentry) |
1934 | return 0; | 2118 | return 0; |
1935 | 2119 | ||
@@ -1969,30 +2153,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent | |||
1969 | */ | 2153 | */ |
1970 | void __audit_inode(const char *name, const struct dentry *dentry) | 2154 | void __audit_inode(const char *name, const struct dentry *dentry) |
1971 | { | 2155 | { |
1972 | int idx; | ||
1973 | struct audit_context *context = current->audit_context; | 2156 | struct audit_context *context = current->audit_context; |
1974 | const struct inode *inode = dentry->d_inode; | 2157 | const struct inode *inode = dentry->d_inode; |
2158 | struct audit_names *n; | ||
1975 | 2159 | ||
1976 | if (!context->in_syscall) | 2160 | if (!context->in_syscall) |
1977 | return; | 2161 | return; |
1978 | if (context->name_count | 2162 | |
1979 | && context->names[context->name_count-1].name | 2163 | list_for_each_entry_reverse(n, &context->names_list, list) { |
1980 | && context->names[context->name_count-1].name == name) | 2164 | if (n->name && (n->name == name)) |
1981 | idx = context->name_count - 1; | 2165 | goto out; |
1982 | else if (context->name_count > 1 | ||
1983 | && context->names[context->name_count-2].name | ||
1984 | && context->names[context->name_count-2].name == name) | ||
1985 | idx = context->name_count - 2; | ||
1986 | else { | ||
1987 | /* FIXME: how much do we care about inodes that have no | ||
1988 | * associated name? */ | ||
1989 | if (audit_inc_name_count(context, inode)) | ||
1990 | return; | ||
1991 | idx = context->name_count - 1; | ||
1992 | context->names[idx].name = NULL; | ||
1993 | } | 2166 | } |
2167 | |||
2168 | /* unable to find the name from a previous getname() */ | ||
2169 | n = audit_alloc_name(context); | ||
2170 | if (!n) | ||
2171 | return; | ||
2172 | out: | ||
1994 | handle_path(dentry); | 2173 | handle_path(dentry); |
1995 | audit_copy_inode(&context->names[idx], dentry, inode); | 2174 | audit_copy_inode(n, dentry, inode); |
1996 | } | 2175 | } |
1997 | 2176 | ||
1998 | /** | 2177 | /** |
@@ -2011,11 +2190,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) | |||
2011 | void __audit_inode_child(const struct dentry *dentry, | 2190 | void __audit_inode_child(const struct dentry *dentry, |
2012 | const struct inode *parent) | 2191 | const struct inode *parent) |
2013 | { | 2192 | { |
2014 | int idx; | ||
2015 | struct audit_context *context = current->audit_context; | 2193 | struct audit_context *context = current->audit_context; |
2016 | const char *found_parent = NULL, *found_child = NULL; | 2194 | const char *found_parent = NULL, *found_child = NULL; |
2017 | const struct inode *inode = dentry->d_inode; | 2195 | const struct inode *inode = dentry->d_inode; |
2018 | const char *dname = dentry->d_name.name; | 2196 | const char *dname = dentry->d_name.name; |
2197 | struct audit_names *n; | ||
2019 | int dirlen = 0; | 2198 | int dirlen = 0; |
2020 | 2199 | ||
2021 | if (!context->in_syscall) | 2200 | if (!context->in_syscall) |
@@ -2025,9 +2204,7 @@ void __audit_inode_child(const struct dentry *dentry, | |||
2025 | handle_one(inode); | 2204 | handle_one(inode); |
2026 | 2205 | ||
2027 | /* parent is more likely, look for it first */ | 2206 | /* parent is more likely, look for it first */ |
2028 | for (idx = 0; idx < context->name_count; idx++) { | 2207 | list_for_each_entry(n, &context->names_list, list) { |
2029 | struct audit_names *n = &context->names[idx]; | ||
2030 | |||
2031 | if (!n->name) | 2208 | if (!n->name) |
2032 | continue; | 2209 | continue; |
2033 | 2210 | ||
@@ -2040,9 +2217,7 @@ void __audit_inode_child(const struct dentry *dentry, | |||
2040 | } | 2217 | } |
2041 | 2218 | ||
2042 | /* no matching parent, look for matching child */ | 2219 | /* no matching parent, look for matching child */ |
2043 | for (idx = 0; idx < context->name_count; idx++) { | 2220 | list_for_each_entry(n, &context->names_list, list) { |
2044 | struct audit_names *n = &context->names[idx]; | ||
2045 | |||
2046 | if (!n->name) | 2221 | if (!n->name) |
2047 | continue; | 2222 | continue; |
2048 | 2223 | ||
@@ -2060,34 +2235,29 @@ void __audit_inode_child(const struct dentry *dentry, | |||
2060 | 2235 | ||
2061 | add_names: | 2236 | add_names: |
2062 | if (!found_parent) { | 2237 | if (!found_parent) { |
2063 | if (audit_inc_name_count(context, parent)) | 2238 | n = audit_alloc_name(context); |
2239 | if (!n) | ||
2064 | return; | 2240 | return; |
2065 | idx = context->name_count - 1; | 2241 | audit_copy_inode(n, NULL, parent); |
2066 | context->names[idx].name = NULL; | ||
2067 | audit_copy_inode(&context->names[idx], NULL, parent); | ||
2068 | } | 2242 | } |
2069 | 2243 | ||
2070 | if (!found_child) { | 2244 | if (!found_child) { |
2071 | if (audit_inc_name_count(context, inode)) | 2245 | n = audit_alloc_name(context); |
2246 | if (!n) | ||
2072 | return; | 2247 | return; |
2073 | idx = context->name_count - 1; | ||
2074 | 2248 | ||
2075 | /* Re-use the name belonging to the slot for a matching parent | 2249 | /* Re-use the name belonging to the slot for a matching parent |
2076 | * directory. All names for this context are relinquished in | 2250 | * directory. All names for this context are relinquished in |
2077 | * audit_free_names() */ | 2251 | * audit_free_names() */ |
2078 | if (found_parent) { | 2252 | if (found_parent) { |
2079 | context->names[idx].name = found_parent; | 2253 | n->name = found_parent; |
2080 | context->names[idx].name_len = AUDIT_NAME_FULL; | 2254 | n->name_len = AUDIT_NAME_FULL; |
2081 | /* don't call __putname() */ | 2255 | /* don't call __putname() */ |
2082 | context->names[idx].name_put = 0; | 2256 | n->name_put = false; |
2083 | } else { | ||
2084 | context->names[idx].name = NULL; | ||
2085 | } | 2257 | } |
2086 | 2258 | ||
2087 | if (inode) | 2259 | if (inode) |
2088 | audit_copy_inode(&context->names[idx], NULL, inode); | 2260 | audit_copy_inode(n, NULL, inode); |
2089 | else | ||
2090 | context->names[idx].ino = (unsigned long)-1; | ||
2091 | } | 2261 | } |
2092 | } | 2262 | } |
2093 | EXPORT_SYMBOL_GPL(__audit_inode_child); | 2263 | EXPORT_SYMBOL_GPL(__audit_inode_child); |
@@ -2121,19 +2291,28 @@ int auditsc_get_stamp(struct audit_context *ctx, | |||
2121 | static atomic_t session_id = ATOMIC_INIT(0); | 2291 | static atomic_t session_id = ATOMIC_INIT(0); |
2122 | 2292 | ||
2123 | /** | 2293 | /** |
2124 | * audit_set_loginuid - set a task's audit_context loginuid | 2294 | * audit_set_loginuid - set current task's audit_context loginuid |
2125 | * @task: task whose audit context is being modified | ||
2126 | * @loginuid: loginuid value | 2295 | * @loginuid: loginuid value |
2127 | * | 2296 | * |
2128 | * Returns 0. | 2297 | * Returns 0. |
2129 | * | 2298 | * |
2130 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). | 2299 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). |
2131 | */ | 2300 | */ |
2132 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | 2301 | int audit_set_loginuid(uid_t loginuid) |
2133 | { | 2302 | { |
2134 | unsigned int sessionid = atomic_inc_return(&session_id); | 2303 | struct task_struct *task = current; |
2135 | struct audit_context *context = task->audit_context; | 2304 | struct audit_context *context = task->audit_context; |
2305 | unsigned int sessionid; | ||
2306 | |||
2307 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE | ||
2308 | if (task->loginuid != -1) | ||
2309 | return -EPERM; | ||
2310 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ | ||
2311 | if (!capable(CAP_AUDIT_CONTROL)) | ||
2312 | return -EPERM; | ||
2313 | #endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ | ||
2136 | 2314 | ||
2315 | sessionid = atomic_inc_return(&session_id); | ||
2137 | if (context && context->in_syscall) { | 2316 | if (context && context->in_syscall) { |
2138 | struct audit_buffer *ab; | 2317 | struct audit_buffer *ab; |
2139 | 2318 | ||
@@ -2160,7 +2339,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | |||
2160 | * @attr: queue attributes | 2339 | * @attr: queue attributes |
2161 | * | 2340 | * |
2162 | */ | 2341 | */ |
2163 | void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) | 2342 | void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) |
2164 | { | 2343 | { |
2165 | struct audit_context *context = current->audit_context; | 2344 | struct audit_context *context = current->audit_context; |
2166 | 2345 | ||
@@ -2260,7 +2439,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
2260 | * | 2439 | * |
2261 | * Called only after audit_ipc_obj(). | 2440 | * Called only after audit_ipc_obj(). |
2262 | */ | 2441 | */ |
2263 | void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) | 2442 | void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) |
2264 | { | 2443 | { |
2265 | struct audit_context *context = current->audit_context; | 2444 | struct audit_context *context = current->audit_context; |
2266 | 2445 | ||
@@ -2271,14 +2450,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod | |||
2271 | context->ipc.has_perm = 1; | 2450 | context->ipc.has_perm = 1; |
2272 | } | 2451 | } |
2273 | 2452 | ||
2274 | int audit_bprm(struct linux_binprm *bprm) | 2453 | int __audit_bprm(struct linux_binprm *bprm) |
2275 | { | 2454 | { |
2276 | struct audit_aux_data_execve *ax; | 2455 | struct audit_aux_data_execve *ax; |
2277 | struct audit_context *context = current->audit_context; | 2456 | struct audit_context *context = current->audit_context; |
2278 | 2457 | ||
2279 | if (likely(!audit_enabled || !context || context->dummy)) | ||
2280 | return 0; | ||
2281 | |||
2282 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | 2458 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); |
2283 | if (!ax) | 2459 | if (!ax) |
2284 | return -ENOMEM; | 2460 | return -ENOMEM; |
@@ -2299,13 +2475,10 @@ int audit_bprm(struct linux_binprm *bprm) | |||
2299 | * @args: args array | 2475 | * @args: args array |
2300 | * | 2476 | * |
2301 | */ | 2477 | */ |
2302 | void audit_socketcall(int nargs, unsigned long *args) | 2478 | void __audit_socketcall(int nargs, unsigned long *args) |
2303 | { | 2479 | { |
2304 | struct audit_context *context = current->audit_context; | 2480 | struct audit_context *context = current->audit_context; |
2305 | 2481 | ||
2306 | if (likely(!context || context->dummy)) | ||
2307 | return; | ||
2308 | |||
2309 | context->type = AUDIT_SOCKETCALL; | 2482 | context->type = AUDIT_SOCKETCALL; |
2310 | context->socketcall.nargs = nargs; | 2483 | context->socketcall.nargs = nargs; |
2311 | memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); | 2484 | memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); |
@@ -2331,13 +2504,10 @@ void __audit_fd_pair(int fd1, int fd2) | |||
2331 | * | 2504 | * |
2332 | * Returns 0 for success or NULL context or < 0 on error. | 2505 | * Returns 0 for success or NULL context or < 0 on error. |
2333 | */ | 2506 | */ |
2334 | int audit_sockaddr(int len, void *a) | 2507 | int __audit_sockaddr(int len, void *a) |
2335 | { | 2508 | { |
2336 | struct audit_context *context = current->audit_context; | 2509 | struct audit_context *context = current->audit_context; |
2337 | 2510 | ||
2338 | if (likely(!context || context->dummy)) | ||
2339 | return 0; | ||
2340 | |||
2341 | if (!context->sockaddr) { | 2511 | if (!context->sockaddr) { |
2342 | void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); | 2512 | void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); |
2343 | if (!p) | 2513 | if (!p) |
@@ -2499,6 +2669,25 @@ void __audit_mmap_fd(int fd, int flags) | |||
2499 | context->type = AUDIT_MMAP; | 2669 | context->type = AUDIT_MMAP; |
2500 | } | 2670 | } |
2501 | 2671 | ||
2672 | static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) | ||
2673 | { | ||
2674 | uid_t auid, uid; | ||
2675 | gid_t gid; | ||
2676 | unsigned int sessionid; | ||
2677 | |||
2678 | auid = audit_get_loginuid(current); | ||
2679 | sessionid = audit_get_sessionid(current); | ||
2680 | current_uid_gid(&uid, &gid); | ||
2681 | |||
2682 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | ||
2683 | auid, uid, gid, sessionid); | ||
2684 | audit_log_task_context(ab); | ||
2685 | audit_log_format(ab, " pid=%d comm=", current->pid); | ||
2686 | audit_log_untrustedstring(ab, current->comm); | ||
2687 | audit_log_format(ab, " reason="); | ||
2688 | audit_log_string(ab, reason); | ||
2689 | audit_log_format(ab, " sig=%ld", signr); | ||
2690 | } | ||
2502 | /** | 2691 | /** |
2503 | * audit_core_dumps - record information about processes that end abnormally | 2692 | * audit_core_dumps - record information about processes that end abnormally |
2504 | * @signr: signal value | 2693 | * @signr: signal value |
@@ -2509,10 +2698,6 @@ void __audit_mmap_fd(int fd, int flags) | |||
2509 | void audit_core_dumps(long signr) | 2698 | void audit_core_dumps(long signr) |
2510 | { | 2699 | { |
2511 | struct audit_buffer *ab; | 2700 | struct audit_buffer *ab; |
2512 | u32 sid; | ||
2513 | uid_t auid = audit_get_loginuid(current), uid; | ||
2514 | gid_t gid; | ||
2515 | unsigned int sessionid = audit_get_sessionid(current); | ||
2516 | 2701 | ||
2517 | if (!audit_enabled) | 2702 | if (!audit_enabled) |
2518 | return; | 2703 | return; |
@@ -2521,24 +2706,17 @@ void audit_core_dumps(long signr) | |||
2521 | return; | 2706 | return; |
2522 | 2707 | ||
2523 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2708 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2524 | current_uid_gid(&uid, &gid); | 2709 | audit_log_abend(ab, "memory violation", signr); |
2525 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | 2710 | audit_log_end(ab); |
2526 | auid, uid, gid, sessionid); | 2711 | } |
2527 | security_task_getsecid(current, &sid); | ||
2528 | if (sid) { | ||
2529 | char *ctx = NULL; | ||
2530 | u32 len; | ||
2531 | 2712 | ||
2532 | if (security_secid_to_secctx(sid, &ctx, &len)) | 2713 | void __audit_seccomp(unsigned long syscall) |
2533 | audit_log_format(ab, " ssid=%u", sid); | 2714 | { |
2534 | else { | 2715 | struct audit_buffer *ab; |
2535 | audit_log_format(ab, " subj=%s", ctx); | 2716 | |
2536 | security_release_secctx(ctx, len); | 2717 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2537 | } | 2718 | audit_log_abend(ab, "seccomp", SIGKILL); |
2538 | } | 2719 | audit_log_format(ab, " syscall=%ld", syscall); |
2539 | audit_log_format(ab, " pid=%d comm=", current->pid); | ||
2540 | audit_log_untrustedstring(ab, current->comm); | ||
2541 | audit_log_format(ab, " sig=%ld", signr); | ||
2542 | audit_log_end(ab); | 2720 | audit_log_end(ab); |
2543 | } | 2721 | } |
2544 | 2722 | ||
diff --git a/kernel/capability.c b/kernel/capability.c index b463871a4e69..3f1adb6c6470 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -287,74 +287,84 @@ error: | |||
287 | } | 287 | } |
288 | 288 | ||
289 | /** | 289 | /** |
290 | * has_capability - Does a task have a capability in init_user_ns | 290 | * has_ns_capability - Does a task have a capability in a specific user ns |
291 | * @t: The task in question | 291 | * @t: The task in question |
292 | * @ns: target user namespace | ||
292 | * @cap: The capability to be tested for | 293 | * @cap: The capability to be tested for |
293 | * | 294 | * |
294 | * Return true if the specified task has the given superior capability | 295 | * Return true if the specified task has the given superior capability |
295 | * currently in effect to the initial user namespace, false if not. | 296 | * currently in effect to the specified user namespace, false if not. |
296 | * | 297 | * |
297 | * Note that this does not set PF_SUPERPRIV on the task. | 298 | * Note that this does not set PF_SUPERPRIV on the task. |
298 | */ | 299 | */ |
299 | bool has_capability(struct task_struct *t, int cap) | 300 | bool has_ns_capability(struct task_struct *t, |
301 | struct user_namespace *ns, int cap) | ||
300 | { | 302 | { |
301 | int ret = security_real_capable(t, &init_user_ns, cap); | 303 | int ret; |
304 | |||
305 | rcu_read_lock(); | ||
306 | ret = security_capable(__task_cred(t), ns, cap); | ||
307 | rcu_read_unlock(); | ||
302 | 308 | ||
303 | return (ret == 0); | 309 | return (ret == 0); |
304 | } | 310 | } |
305 | 311 | ||
306 | /** | 312 | /** |
307 | * has_capability - Does a task have a capability in a specific user ns | 313 | * has_capability - Does a task have a capability in init_user_ns |
308 | * @t: The task in question | 314 | * @t: The task in question |
309 | * @ns: target user namespace | ||
310 | * @cap: The capability to be tested for | 315 | * @cap: The capability to be tested for |
311 | * | 316 | * |
312 | * Return true if the specified task has the given superior capability | 317 | * Return true if the specified task has the given superior capability |
313 | * currently in effect to the specified user namespace, false if not. | 318 | * currently in effect to the initial user namespace, false if not. |
314 | * | 319 | * |
315 | * Note that this does not set PF_SUPERPRIV on the task. | 320 | * Note that this does not set PF_SUPERPRIV on the task. |
316 | */ | 321 | */ |
317 | bool has_ns_capability(struct task_struct *t, | 322 | bool has_capability(struct task_struct *t, int cap) |
318 | struct user_namespace *ns, int cap) | ||
319 | { | 323 | { |
320 | int ret = security_real_capable(t, ns, cap); | 324 | return has_ns_capability(t, &init_user_ns, cap); |
321 | |||
322 | return (ret == 0); | ||
323 | } | 325 | } |
324 | 326 | ||
325 | /** | 327 | /** |
326 | * has_capability_noaudit - Does a task have a capability (unaudited) | 328 | * has_ns_capability_noaudit - Does a task have a capability (unaudited) |
329 | * in a specific user ns. | ||
327 | * @t: The task in question | 330 | * @t: The task in question |
331 | * @ns: target user namespace | ||
328 | * @cap: The capability to be tested for | 332 | * @cap: The capability to be tested for |
329 | * | 333 | * |
330 | * Return true if the specified task has the given superior capability | 334 | * Return true if the specified task has the given superior capability |
331 | * currently in effect to init_user_ns, false if not. Don't write an | 335 | * currently in effect to the specified user namespace, false if not. |
332 | * audit message for the check. | 336 | * Do not write an audit message for the check. |
333 | * | 337 | * |
334 | * Note that this does not set PF_SUPERPRIV on the task. | 338 | * Note that this does not set PF_SUPERPRIV on the task. |
335 | */ | 339 | */ |
336 | bool has_capability_noaudit(struct task_struct *t, int cap) | 340 | bool has_ns_capability_noaudit(struct task_struct *t, |
341 | struct user_namespace *ns, int cap) | ||
337 | { | 342 | { |
338 | int ret = security_real_capable_noaudit(t, &init_user_ns, cap); | 343 | int ret; |
344 | |||
345 | rcu_read_lock(); | ||
346 | ret = security_capable_noaudit(__task_cred(t), ns, cap); | ||
347 | rcu_read_unlock(); | ||
339 | 348 | ||
340 | return (ret == 0); | 349 | return (ret == 0); |
341 | } | 350 | } |
342 | 351 | ||
343 | /** | 352 | /** |
344 | * capable - Determine if the current task has a superior capability in effect | 353 | * has_capability_noaudit - Does a task have a capability (unaudited) in the |
354 | * initial user ns | ||
355 | * @t: The task in question | ||
345 | * @cap: The capability to be tested for | 356 | * @cap: The capability to be tested for |
346 | * | 357 | * |
347 | * Return true if the current task has the given superior capability currently | 358 | * Return true if the specified task has the given superior capability |
348 | * available for use, false if not. | 359 | * currently in effect to init_user_ns, false if not. Don't write an |
360 | * audit message for the check. | ||
349 | * | 361 | * |
350 | * This sets PF_SUPERPRIV on the task if the capability is available on the | 362 | * Note that this does not set PF_SUPERPRIV on the task. |
351 | * assumption that it's about to be used. | ||
352 | */ | 363 | */ |
353 | bool capable(int cap) | 364 | bool has_capability_noaudit(struct task_struct *t, int cap) |
354 | { | 365 | { |
355 | return ns_capable(&init_user_ns, cap); | 366 | return has_ns_capability_noaudit(t, &init_user_ns, cap); |
356 | } | 367 | } |
357 | EXPORT_SYMBOL(capable); | ||
358 | 368 | ||
359 | /** | 369 | /** |
360 | * ns_capable - Determine if the current task has a superior capability in effect | 370 | * ns_capable - Determine if the current task has a superior capability in effect |
@@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) | |||
374 | BUG(); | 384 | BUG(); |
375 | } | 385 | } |
376 | 386 | ||
377 | if (security_capable(ns, current_cred(), cap) == 0) { | 387 | if (security_capable(current_cred(), ns, cap) == 0) { |
378 | current->flags |= PF_SUPERPRIV; | 388 | current->flags |= PF_SUPERPRIV; |
379 | return true; | 389 | return true; |
380 | } | 390 | } |
@@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap) | |||
383 | EXPORT_SYMBOL(ns_capable); | 393 | EXPORT_SYMBOL(ns_capable); |
384 | 394 | ||
385 | /** | 395 | /** |
386 | * task_ns_capable - Determine whether current task has a superior | 396 | * capable - Determine if the current task has a superior capability in effect |
387 | * capability targeted at a specific task's user namespace. | 397 | * @cap: The capability to be tested for |
388 | * @t: The task whose user namespace is targeted. | 398 | * |
389 | * @cap: The capability in question. | 399 | * Return true if the current task has the given superior capability currently |
400 | * available for use, false if not. | ||
390 | * | 401 | * |
391 | * Return true if it does, false otherwise. | 402 | * This sets PF_SUPERPRIV on the task if the capability is available on the |
403 | * assumption that it's about to be used. | ||
392 | */ | 404 | */ |
393 | bool task_ns_capable(struct task_struct *t, int cap) | 405 | bool capable(int cap) |
394 | { | 406 | { |
395 | return ns_capable(task_cred_xxx(t, user)->user_ns, cap); | 407 | return ns_capable(&init_user_ns, cap); |
396 | } | 408 | } |
397 | EXPORT_SYMBOL(task_ns_capable); | 409 | EXPORT_SYMBOL(capable); |
398 | 410 | ||
399 | /** | 411 | /** |
400 | * nsown_capable - Check superior capability to one's own user_ns | 412 | * nsown_capable - Check superior capability to one's own user_ns |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d9d5648f3cdc..a5d3b5325f77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -63,7 +63,24 @@ | |||
63 | 63 | ||
64 | #include <linux/atomic.h> | 64 | #include <linux/atomic.h> |
65 | 65 | ||
66 | /* | ||
67 | * cgroup_mutex is the master lock. Any modification to cgroup or its | ||
68 | * hierarchy must be performed while holding it. | ||
69 | * | ||
70 | * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify | ||
71 | * cgroupfs_root of any cgroup hierarchy - subsys list, flags, | ||
72 | * release_agent_path and so on. Modifying requires both cgroup_mutex and | ||
73 | * cgroup_root_mutex. Readers can acquire either of the two. This is to | ||
74 | * break the following locking order cycle. | ||
75 | * | ||
76 | * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem | ||
77 | * B. namespace_sem -> cgroup_mutex | ||
78 | * | ||
79 | * B happens only through cgroup_show_options() and using cgroup_root_mutex | ||
80 | * breaks it. | ||
81 | */ | ||
66 | static DEFINE_MUTEX(cgroup_mutex); | 82 | static DEFINE_MUTEX(cgroup_mutex); |
83 | static DEFINE_MUTEX(cgroup_root_mutex); | ||
67 | 84 | ||
68 | /* | 85 | /* |
69 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 86 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
760 | * -> cgroup_mkdir. | 777 | * -> cgroup_mkdir. |
761 | */ | 778 | */ |
762 | 779 | ||
763 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); | 780 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
764 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); | 781 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); |
765 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 782 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
766 | static int cgroup_populate_dir(struct cgroup *cgrp); | 783 | static int cgroup_populate_dir(struct cgroup *cgrp); |
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { | |||
775 | static int alloc_css_id(struct cgroup_subsys *ss, | 792 | static int alloc_css_id(struct cgroup_subsys *ss, |
776 | struct cgroup *parent, struct cgroup *child); | 793 | struct cgroup *parent, struct cgroup *child); |
777 | 794 | ||
778 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | 795 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) |
779 | { | 796 | { |
780 | struct inode *inode = new_inode(sb); | 797 | struct inode *inode = new_inode(sb); |
781 | 798 | ||
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
921 | * | 938 | * |
922 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | 939 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; |
923 | */ | 940 | */ |
924 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | 941 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); |
925 | 942 | ||
926 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | 943 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) |
927 | { | 944 | { |
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
953 | int i; | 970 | int i; |
954 | 971 | ||
955 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 972 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
973 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | ||
956 | 974 | ||
957 | removed_bits = root->actual_subsys_bits & ~final_bits; | 975 | removed_bits = root->actual_subsys_bits & ~final_bits; |
958 | added_bits = final_bits & ~root->actual_subsys_bits; | 976 | added_bits = final_bits & ~root->actual_subsys_bits; |
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1038 | return 0; | 1056 | return 0; |
1039 | } | 1057 | } |
1040 | 1058 | ||
1041 | static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | 1059 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) |
1042 | { | 1060 | { |
1043 | struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; | 1061 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; |
1044 | struct cgroup_subsys *ss; | 1062 | struct cgroup_subsys *ss; |
1045 | 1063 | ||
1046 | mutex_lock(&cgroup_mutex); | 1064 | mutex_lock(&cgroup_root_mutex); |
1047 | for_each_subsys(root, ss) | 1065 | for_each_subsys(root, ss) |
1048 | seq_printf(seq, ",%s", ss->name); | 1066 | seq_printf(seq, ",%s", ss->name); |
1049 | if (test_bit(ROOT_NOPREFIX, &root->flags)) | 1067 | if (test_bit(ROOT_NOPREFIX, &root->flags)) |
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1054 | seq_puts(seq, ",clone_children"); | 1072 | seq_puts(seq, ",clone_children"); |
1055 | if (strlen(root->name)) | 1073 | if (strlen(root->name)) |
1056 | seq_printf(seq, ",name=%s", root->name); | 1074 | seq_printf(seq, ",name=%s", root->name); |
1057 | mutex_unlock(&cgroup_mutex); | 1075 | mutex_unlock(&cgroup_root_mutex); |
1058 | return 0; | 1076 | return 0; |
1059 | } | 1077 | } |
1060 | 1078 | ||
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1175 | 1193 | ||
1176 | /* | 1194 | /* |
1177 | * If the 'all' option was specified select all the subsystems, | 1195 | * If the 'all' option was specified select all the subsystems, |
1178 | * otherwise 'all, 'none' and a subsystem name options were not | 1196 | * otherwise if 'none', 'name=' and a subsystem name options |
1179 | * specified, let's default to 'all' | 1197 | * were not specified, let's default to 'all' |
1180 | */ | 1198 | */ |
1181 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { | 1199 | if (all_ss || (!one_ss && !opts->none && !opts->name)) { |
1182 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1200 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1183 | struct cgroup_subsys *ss = subsys[i]; | 1201 | struct cgroup_subsys *ss = subsys[i]; |
1184 | if (ss == NULL) | 1202 | if (ss == NULL) |
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1269 | 1287 | ||
1270 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1288 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1271 | mutex_lock(&cgroup_mutex); | 1289 | mutex_lock(&cgroup_mutex); |
1290 | mutex_lock(&cgroup_root_mutex); | ||
1272 | 1291 | ||
1273 | /* See what subsystems are wanted */ | 1292 | /* See what subsystems are wanted */ |
1274 | ret = parse_cgroupfs_options(data, &opts); | 1293 | ret = parse_cgroupfs_options(data, &opts); |
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1297 | out_unlock: | 1316 | out_unlock: |
1298 | kfree(opts.release_agent); | 1317 | kfree(opts.release_agent); |
1299 | kfree(opts.name); | 1318 | kfree(opts.name); |
1319 | mutex_unlock(&cgroup_root_mutex); | ||
1300 | mutex_unlock(&cgroup_mutex); | 1320 | mutex_unlock(&cgroup_mutex); |
1301 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1321 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1302 | return ret; | 1322 | return ret; |
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1481 | int ret = 0; | 1501 | int ret = 0; |
1482 | struct super_block *sb; | 1502 | struct super_block *sb; |
1483 | struct cgroupfs_root *new_root; | 1503 | struct cgroupfs_root *new_root; |
1504 | struct inode *inode; | ||
1484 | 1505 | ||
1485 | /* First find the desired set of subsystems */ | 1506 | /* First find the desired set of subsystems */ |
1486 | mutex_lock(&cgroup_mutex); | 1507 | mutex_lock(&cgroup_mutex); |
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1514 | /* We used the new root structure, so this is a new hierarchy */ | 1535 | /* We used the new root structure, so this is a new hierarchy */ |
1515 | struct list_head tmp_cg_links; | 1536 | struct list_head tmp_cg_links; |
1516 | struct cgroup *root_cgrp = &root->top_cgroup; | 1537 | struct cgroup *root_cgrp = &root->top_cgroup; |
1517 | struct inode *inode; | ||
1518 | struct cgroupfs_root *existing_root; | 1538 | struct cgroupfs_root *existing_root; |
1519 | const struct cred *cred; | 1539 | const struct cred *cred; |
1520 | int i; | 1540 | int i; |
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1528 | 1548 | ||
1529 | mutex_lock(&inode->i_mutex); | 1549 | mutex_lock(&inode->i_mutex); |
1530 | mutex_lock(&cgroup_mutex); | 1550 | mutex_lock(&cgroup_mutex); |
1551 | mutex_lock(&cgroup_root_mutex); | ||
1531 | 1552 | ||
1532 | if (strlen(root->name)) { | 1553 | /* Check for name clashes with existing mounts */ |
1533 | /* Check for name clashes with existing mounts */ | 1554 | ret = -EBUSY; |
1534 | for_each_active_root(existing_root) { | 1555 | if (strlen(root->name)) |
1535 | if (!strcmp(existing_root->name, root->name)) { | 1556 | for_each_active_root(existing_root) |
1536 | ret = -EBUSY; | 1557 | if (!strcmp(existing_root->name, root->name)) |
1537 | mutex_unlock(&cgroup_mutex); | 1558 | goto unlock_drop; |
1538 | mutex_unlock(&inode->i_mutex); | ||
1539 | goto drop_new_super; | ||
1540 | } | ||
1541 | } | ||
1542 | } | ||
1543 | 1559 | ||
1544 | /* | 1560 | /* |
1545 | * We're accessing css_set_count without locking | 1561 | * We're accessing css_set_count without locking |
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1549 | * have some link structures left over | 1565 | * have some link structures left over |
1550 | */ | 1566 | */ |
1551 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); | 1567 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); |
1552 | if (ret) { | 1568 | if (ret) |
1553 | mutex_unlock(&cgroup_mutex); | 1569 | goto unlock_drop; |
1554 | mutex_unlock(&inode->i_mutex); | ||
1555 | goto drop_new_super; | ||
1556 | } | ||
1557 | 1570 | ||
1558 | ret = rebind_subsystems(root, root->subsys_bits); | 1571 | ret = rebind_subsystems(root, root->subsys_bits); |
1559 | if (ret == -EBUSY) { | 1572 | if (ret == -EBUSY) { |
1560 | mutex_unlock(&cgroup_mutex); | ||
1561 | mutex_unlock(&inode->i_mutex); | ||
1562 | free_cg_links(&tmp_cg_links); | 1573 | free_cg_links(&tmp_cg_links); |
1563 | goto drop_new_super; | 1574 | goto unlock_drop; |
1564 | } | 1575 | } |
1565 | /* | 1576 | /* |
1566 | * There must be no failure case after here, since rebinding | 1577 | * There must be no failure case after here, since rebinding |
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1599 | cred = override_creds(&init_cred); | 1610 | cred = override_creds(&init_cred); |
1600 | cgroup_populate_dir(root_cgrp); | 1611 | cgroup_populate_dir(root_cgrp); |
1601 | revert_creds(cred); | 1612 | revert_creds(cred); |
1613 | mutex_unlock(&cgroup_root_mutex); | ||
1602 | mutex_unlock(&cgroup_mutex); | 1614 | mutex_unlock(&cgroup_mutex); |
1603 | mutex_unlock(&inode->i_mutex); | 1615 | mutex_unlock(&inode->i_mutex); |
1604 | } else { | 1616 | } else { |
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1615 | kfree(opts.name); | 1627 | kfree(opts.name); |
1616 | return dget(sb->s_root); | 1628 | return dget(sb->s_root); |
1617 | 1629 | ||
1630 | unlock_drop: | ||
1631 | mutex_unlock(&cgroup_root_mutex); | ||
1632 | mutex_unlock(&cgroup_mutex); | ||
1633 | mutex_unlock(&inode->i_mutex); | ||
1618 | drop_new_super: | 1634 | drop_new_super: |
1619 | deactivate_locked_super(sb); | 1635 | deactivate_locked_super(sb); |
1620 | drop_modules: | 1636 | drop_modules: |
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1639 | BUG_ON(!list_empty(&cgrp->sibling)); | 1655 | BUG_ON(!list_empty(&cgrp->sibling)); |
1640 | 1656 | ||
1641 | mutex_lock(&cgroup_mutex); | 1657 | mutex_lock(&cgroup_mutex); |
1658 | mutex_lock(&cgroup_root_mutex); | ||
1642 | 1659 | ||
1643 | /* Rebind all subsystems back to the default hierarchy */ | 1660 | /* Rebind all subsystems back to the default hierarchy */ |
1644 | ret = rebind_subsystems(root, 0); | 1661 | ret = rebind_subsystems(root, 0); |
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1664 | root_count--; | 1681 | root_count--; |
1665 | } | 1682 | } |
1666 | 1683 | ||
1684 | mutex_unlock(&cgroup_root_mutex); | ||
1667 | mutex_unlock(&cgroup_mutex); | 1685 | mutex_unlock(&cgroup_mutex); |
1668 | 1686 | ||
1669 | kill_litter_super(sb); | 1687 | kill_litter_super(sb); |
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1740 | EXPORT_SYMBOL_GPL(cgroup_path); | 1758 | EXPORT_SYMBOL_GPL(cgroup_path); |
1741 | 1759 | ||
1742 | /* | 1760 | /* |
1761 | * Control Group taskset | ||
1762 | */ | ||
1763 | struct task_and_cgroup { | ||
1764 | struct task_struct *task; | ||
1765 | struct cgroup *cgrp; | ||
1766 | }; | ||
1767 | |||
1768 | struct cgroup_taskset { | ||
1769 | struct task_and_cgroup single; | ||
1770 | struct flex_array *tc_array; | ||
1771 | int tc_array_len; | ||
1772 | int idx; | ||
1773 | struct cgroup *cur_cgrp; | ||
1774 | }; | ||
1775 | |||
1776 | /** | ||
1777 | * cgroup_taskset_first - reset taskset and return the first task | ||
1778 | * @tset: taskset of interest | ||
1779 | * | ||
1780 | * @tset iteration is initialized and the first task is returned. | ||
1781 | */ | ||
1782 | struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) | ||
1783 | { | ||
1784 | if (tset->tc_array) { | ||
1785 | tset->idx = 0; | ||
1786 | return cgroup_taskset_next(tset); | ||
1787 | } else { | ||
1788 | tset->cur_cgrp = tset->single.cgrp; | ||
1789 | return tset->single.task; | ||
1790 | } | ||
1791 | } | ||
1792 | EXPORT_SYMBOL_GPL(cgroup_taskset_first); | ||
1793 | |||
1794 | /** | ||
1795 | * cgroup_taskset_next - iterate to the next task in taskset | ||
1796 | * @tset: taskset of interest | ||
1797 | * | ||
1798 | * Return the next task in @tset. Iteration must have been initialized | ||
1799 | * with cgroup_taskset_first(). | ||
1800 | */ | ||
1801 | struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | ||
1802 | { | ||
1803 | struct task_and_cgroup *tc; | ||
1804 | |||
1805 | if (!tset->tc_array || tset->idx >= tset->tc_array_len) | ||
1806 | return NULL; | ||
1807 | |||
1808 | tc = flex_array_get(tset->tc_array, tset->idx++); | ||
1809 | tset->cur_cgrp = tc->cgrp; | ||
1810 | return tc->task; | ||
1811 | } | ||
1812 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); | ||
1813 | |||
1814 | /** | ||
1815 | * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task | ||
1816 | * @tset: taskset of interest | ||
1817 | * | ||
1818 | * Return the cgroup for the current (last returned) task of @tset. This | ||
1819 | * function must be preceded by either cgroup_taskset_first() or | ||
1820 | * cgroup_taskset_next(). | ||
1821 | */ | ||
1822 | struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) | ||
1823 | { | ||
1824 | return tset->cur_cgrp; | ||
1825 | } | ||
1826 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); | ||
1827 | |||
1828 | /** | ||
1829 | * cgroup_taskset_size - return the number of tasks in taskset | ||
1830 | * @tset: taskset of interest | ||
1831 | */ | ||
1832 | int cgroup_taskset_size(struct cgroup_taskset *tset) | ||
1833 | { | ||
1834 | return tset->tc_array ? tset->tc_array_len : 1; | ||
1835 | } | ||
1836 | EXPORT_SYMBOL_GPL(cgroup_taskset_size); | ||
1837 | |||
1838 | |||
1839 | /* | ||
1743 | * cgroup_task_migrate - move a task from one cgroup to another. | 1840 | * cgroup_task_migrate - move a task from one cgroup to another. |
1744 | * | 1841 | * |
1745 | * 'guarantee' is set if the caller promises that a new css_set for the task | 1842 | * 'guarantee' is set if the caller promises that a new css_set for the task |
1746 | * will already exist. If not set, this function might sleep, and can fail with | 1843 | * will already exist. If not set, this function might sleep, and can fail with |
1747 | * -ENOMEM. Otherwise, it can only fail with -ESRCH. | 1844 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. |
1748 | */ | 1845 | */ |
1749 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1846 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1750 | struct task_struct *tsk, bool guarantee) | 1847 | struct task_struct *tsk, bool guarantee) |
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1753 | struct css_set *newcg; | 1850 | struct css_set *newcg; |
1754 | 1851 | ||
1755 | /* | 1852 | /* |
1756 | * get old css_set. we need to take task_lock and refcount it, because | 1853 | * We are synchronized through threadgroup_lock() against PF_EXITING |
1757 | * an exiting task can change its css_set to init_css_set and drop its | 1854 | * setting such that we can't race against cgroup_exit() changing the |
1758 | * old one without taking cgroup_mutex. | 1855 | * css_set to init_css_set and dropping the old one. |
1759 | */ | 1856 | */ |
1760 | task_lock(tsk); | 1857 | WARN_ON_ONCE(tsk->flags & PF_EXITING); |
1761 | oldcg = tsk->cgroups; | 1858 | oldcg = tsk->cgroups; |
1762 | get_css_set(oldcg); | ||
1763 | task_unlock(tsk); | ||
1764 | 1859 | ||
1765 | /* locate or allocate a new css_set for this task. */ | 1860 | /* locate or allocate a new css_set for this task. */ |
1766 | if (guarantee) { | 1861 | if (guarantee) { |
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1775 | might_sleep(); | 1870 | might_sleep(); |
1776 | /* find_css_set will give us newcg already referenced. */ | 1871 | /* find_css_set will give us newcg already referenced. */ |
1777 | newcg = find_css_set(oldcg, cgrp); | 1872 | newcg = find_css_set(oldcg, cgrp); |
1778 | if (!newcg) { | 1873 | if (!newcg) |
1779 | put_css_set(oldcg); | ||
1780 | return -ENOMEM; | 1874 | return -ENOMEM; |
1781 | } | ||
1782 | } | 1875 | } |
1783 | put_css_set(oldcg); | ||
1784 | 1876 | ||
1785 | /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ | ||
1786 | task_lock(tsk); | 1877 | task_lock(tsk); |
1787 | if (tsk->flags & PF_EXITING) { | ||
1788 | task_unlock(tsk); | ||
1789 | put_css_set(newcg); | ||
1790 | return -ESRCH; | ||
1791 | } | ||
1792 | rcu_assign_pointer(tsk->cgroups, newcg); | 1878 | rcu_assign_pointer(tsk->cgroups, newcg); |
1793 | task_unlock(tsk); | 1879 | task_unlock(tsk); |
1794 | 1880 | ||
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1814 | * @cgrp: the cgroup the task is attaching to | 1900 | * @cgrp: the cgroup the task is attaching to |
1815 | * @tsk: the task to be attached | 1901 | * @tsk: the task to be attached |
1816 | * | 1902 | * |
1817 | * Call holding cgroup_mutex. May take task_lock of | 1903 | * Call with cgroup_mutex and threadgroup locked. May take task_lock of |
1818 | * the task 'tsk' during call. | 1904 | * @tsk during call. |
1819 | */ | 1905 | */ |
1820 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1906 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1821 | { | 1907 | { |
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1823 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1909 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1824 | struct cgroup *oldcgrp; | 1910 | struct cgroup *oldcgrp; |
1825 | struct cgroupfs_root *root = cgrp->root; | 1911 | struct cgroupfs_root *root = cgrp->root; |
1912 | struct cgroup_taskset tset = { }; | ||
1913 | |||
1914 | /* @tsk either already exited or can't exit until the end */ | ||
1915 | if (tsk->flags & PF_EXITING) | ||
1916 | return -ESRCH; | ||
1826 | 1917 | ||
1827 | /* Nothing to do if the task is already in that cgroup */ | 1918 | /* Nothing to do if the task is already in that cgroup */ |
1828 | oldcgrp = task_cgroup_from_root(tsk, root); | 1919 | oldcgrp = task_cgroup_from_root(tsk, root); |
1829 | if (cgrp == oldcgrp) | 1920 | if (cgrp == oldcgrp) |
1830 | return 0; | 1921 | return 0; |
1831 | 1922 | ||
1923 | tset.single.task = tsk; | ||
1924 | tset.single.cgrp = oldcgrp; | ||
1925 | |||
1832 | for_each_subsys(root, ss) { | 1926 | for_each_subsys(root, ss) { |
1833 | if (ss->can_attach) { | 1927 | if (ss->can_attach) { |
1834 | retval = ss->can_attach(ss, cgrp, tsk); | 1928 | retval = ss->can_attach(ss, cgrp, &tset); |
1835 | if (retval) { | 1929 | if (retval) { |
1836 | /* | 1930 | /* |
1837 | * Remember on which subsystem the can_attach() | 1931 | * Remember on which subsystem the can_attach() |
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1843 | goto out; | 1937 | goto out; |
1844 | } | 1938 | } |
1845 | } | 1939 | } |
1846 | if (ss->can_attach_task) { | ||
1847 | retval = ss->can_attach_task(cgrp, tsk); | ||
1848 | if (retval) { | ||
1849 | failed_ss = ss; | ||
1850 | goto out; | ||
1851 | } | ||
1852 | } | ||
1853 | } | 1940 | } |
1854 | 1941 | ||
1855 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); | 1942 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); |
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1857 | goto out; | 1944 | goto out; |
1858 | 1945 | ||
1859 | for_each_subsys(root, ss) { | 1946 | for_each_subsys(root, ss) { |
1860 | if (ss->pre_attach) | ||
1861 | ss->pre_attach(cgrp); | ||
1862 | if (ss->attach_task) | ||
1863 | ss->attach_task(cgrp, tsk); | ||
1864 | if (ss->attach) | 1947 | if (ss->attach) |
1865 | ss->attach(ss, cgrp, oldcgrp, tsk); | 1948 | ss->attach(ss, cgrp, &tset); |
1866 | } | 1949 | } |
1867 | 1950 | ||
1868 | synchronize_rcu(); | 1951 | synchronize_rcu(); |
@@ -1884,7 +1967,7 @@ out: | |||
1884 | */ | 1967 | */ |
1885 | break; | 1968 | break; |
1886 | if (ss->cancel_attach) | 1969 | if (ss->cancel_attach) |
1887 | ss->cancel_attach(ss, cgrp, tsk); | 1970 | ss->cancel_attach(ss, cgrp, &tset); |
1888 | } | 1971 | } |
1889 | } | 1972 | } |
1890 | return retval; | 1973 | return retval; |
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp, | |||
1935 | 2018 | ||
1936 | read_lock(&css_set_lock); | 2019 | read_lock(&css_set_lock); |
1937 | newcg = find_existing_css_set(cg, cgrp, template); | 2020 | newcg = find_existing_css_set(cg, cgrp, template); |
1938 | if (newcg) | ||
1939 | get_css_set(newcg); | ||
1940 | read_unlock(&css_set_lock); | 2021 | read_unlock(&css_set_lock); |
1941 | 2022 | ||
1942 | /* doesn't exist at all? */ | 2023 | /* doesn't exist at all? */ |
1943 | if (!newcg) | 2024 | if (!newcg) |
1944 | return false; | 2025 | return false; |
1945 | /* see if it's already in the list */ | 2026 | /* see if it's already in the list */ |
1946 | list_for_each_entry(cg_entry, newcg_list, links) { | 2027 | list_for_each_entry(cg_entry, newcg_list, links) |
1947 | if (cg_entry->cg == newcg) { | 2028 | if (cg_entry->cg == newcg) |
1948 | put_css_set(newcg); | ||
1949 | return true; | 2029 | return true; |
1950 | } | ||
1951 | } | ||
1952 | 2030 | ||
1953 | /* not found */ | 2031 | /* not found */ |
1954 | put_css_set(newcg); | ||
1955 | return false; | 2032 | return false; |
1956 | } | 2033 | } |
1957 | 2034 | ||
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, | |||
1985 | * @cgrp: the cgroup to attach to | 2062 | * @cgrp: the cgroup to attach to |
1986 | * @leader: the threadgroup leader task_struct of the group to be attached | 2063 | * @leader: the threadgroup leader task_struct of the group to be attached |
1987 | * | 2064 | * |
1988 | * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will | 2065 | * Call holding cgroup_mutex and the group_rwsem of the leader. Will take |
1989 | * take task_lock of each thread in leader's threadgroup individually in turn. | 2066 | * task_lock of each thread in leader's threadgroup individually in turn. |
1990 | */ | 2067 | */ |
1991 | int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | 2068 | static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) |
1992 | { | 2069 | { |
1993 | int retval, i, group_size; | 2070 | int retval, i, group_size; |
1994 | struct cgroup_subsys *ss, *failed_ss = NULL; | 2071 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1995 | bool cancel_failed_ss = false; | ||
1996 | /* guaranteed to be initialized later, but the compiler needs this */ | 2072 | /* guaranteed to be initialized later, but the compiler needs this */ |
1997 | struct cgroup *oldcgrp = NULL; | ||
1998 | struct css_set *oldcg; | 2073 | struct css_set *oldcg; |
1999 | struct cgroupfs_root *root = cgrp->root; | 2074 | struct cgroupfs_root *root = cgrp->root; |
2000 | /* threadgroup list cursor and array */ | 2075 | /* threadgroup list cursor and array */ |
2001 | struct task_struct *tsk; | 2076 | struct task_struct *tsk; |
2077 | struct task_and_cgroup *tc; | ||
2002 | struct flex_array *group; | 2078 | struct flex_array *group; |
2079 | struct cgroup_taskset tset = { }; | ||
2003 | /* | 2080 | /* |
2004 | * we need to make sure we have css_sets for all the tasks we're | 2081 | * we need to make sure we have css_sets for all the tasks we're |
2005 | * going to move -before- we actually start moving them, so that in | 2082 | * going to move -before- we actually start moving them, so that in |
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2012 | * step 0: in order to do expensive, possibly blocking operations for | 2089 | * step 0: in order to do expensive, possibly blocking operations for |
2013 | * every thread, we cannot iterate the thread group list, since it needs | 2090 | * every thread, we cannot iterate the thread group list, since it needs |
2014 | * rcu or tasklist locked. instead, build an array of all threads in the | 2091 | * rcu or tasklist locked. instead, build an array of all threads in the |
2015 | * group - threadgroup_fork_lock prevents new threads from appearing, | 2092 | * group - group_rwsem prevents new threads from appearing, and if |
2016 | * and if threads exit, this will just be an over-estimate. | 2093 | * threads exit, this will just be an over-estimate. |
2017 | */ | 2094 | */ |
2018 | group_size = get_nr_threads(leader); | 2095 | group_size = get_nr_threads(leader); |
2019 | /* flex_array supports very large thread-groups better than kmalloc. */ | 2096 | /* flex_array supports very large thread-groups better than kmalloc. */ |
2020 | group = flex_array_alloc(sizeof(struct task_struct *), group_size, | 2097 | group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); |
2021 | GFP_KERNEL); | ||
2022 | if (!group) | 2098 | if (!group) |
2023 | return -ENOMEM; | 2099 | return -ENOMEM; |
2024 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ | 2100 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ |
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2040 | retval = -EAGAIN; | 2116 | retval = -EAGAIN; |
2041 | goto out_free_group_list; | 2117 | goto out_free_group_list; |
2042 | } | 2118 | } |
2043 | /* take a reference on each task in the group to go in the array. */ | 2119 | |
2044 | tsk = leader; | 2120 | tsk = leader; |
2045 | i = 0; | 2121 | i = 0; |
2046 | do { | 2122 | do { |
2123 | struct task_and_cgroup ent; | ||
2124 | |||
2125 | /* @tsk either already exited or can't exit until the end */ | ||
2126 | if (tsk->flags & PF_EXITING) | ||
2127 | continue; | ||
2128 | |||
2047 | /* as per above, nr_threads may decrease, but not increase. */ | 2129 | /* as per above, nr_threads may decrease, but not increase. */ |
2048 | BUG_ON(i >= group_size); | 2130 | BUG_ON(i >= group_size); |
2049 | get_task_struct(tsk); | ||
2050 | /* | 2131 | /* |
2051 | * saying GFP_ATOMIC has no effect here because we did prealloc | 2132 | * saying GFP_ATOMIC has no effect here because we did prealloc |
2052 | * earlier, but it's good form to communicate our expectations. | 2133 | * earlier, but it's good form to communicate our expectations. |
2053 | */ | 2134 | */ |
2054 | retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); | 2135 | ent.task = tsk; |
2136 | ent.cgrp = task_cgroup_from_root(tsk, root); | ||
2137 | /* nothing to do if this task is already in the cgroup */ | ||
2138 | if (ent.cgrp == cgrp) | ||
2139 | continue; | ||
2140 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); | ||
2055 | BUG_ON(retval != 0); | 2141 | BUG_ON(retval != 0); |
2056 | i++; | 2142 | i++; |
2057 | } while_each_thread(leader, tsk); | 2143 | } while_each_thread(leader, tsk); |
2058 | /* remember the number of threads in the array for later. */ | 2144 | /* remember the number of threads in the array for later. */ |
2059 | group_size = i; | 2145 | group_size = i; |
2146 | tset.tc_array = group; | ||
2147 | tset.tc_array_len = group_size; | ||
2060 | read_unlock(&tasklist_lock); | 2148 | read_unlock(&tasklist_lock); |
2061 | 2149 | ||
2150 | /* methods shouldn't be called if no task is actually migrating */ | ||
2151 | retval = 0; | ||
2152 | if (!group_size) | ||
2153 | goto out_free_group_list; | ||
2154 | |||
2062 | /* | 2155 | /* |
2063 | * step 1: check that we can legitimately attach to the cgroup. | 2156 | * step 1: check that we can legitimately attach to the cgroup. |
2064 | */ | 2157 | */ |
2065 | for_each_subsys(root, ss) { | 2158 | for_each_subsys(root, ss) { |
2066 | if (ss->can_attach) { | 2159 | if (ss->can_attach) { |
2067 | retval = ss->can_attach(ss, cgrp, leader); | 2160 | retval = ss->can_attach(ss, cgrp, &tset); |
2068 | if (retval) { | 2161 | if (retval) { |
2069 | failed_ss = ss; | 2162 | failed_ss = ss; |
2070 | goto out_cancel_attach; | 2163 | goto out_cancel_attach; |
2071 | } | 2164 | } |
2072 | } | 2165 | } |
2073 | /* a callback to be run on every thread in the threadgroup. */ | ||
2074 | if (ss->can_attach_task) { | ||
2075 | /* run on each task in the threadgroup. */ | ||
2076 | for (i = 0; i < group_size; i++) { | ||
2077 | tsk = flex_array_get_ptr(group, i); | ||
2078 | retval = ss->can_attach_task(cgrp, tsk); | ||
2079 | if (retval) { | ||
2080 | failed_ss = ss; | ||
2081 | cancel_failed_ss = true; | ||
2082 | goto out_cancel_attach; | ||
2083 | } | ||
2084 | } | ||
2085 | } | ||
2086 | } | 2166 | } |
2087 | 2167 | ||
2088 | /* | 2168 | /* |
@@ -2091,72 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2091 | */ | 2171 | */ |
2092 | INIT_LIST_HEAD(&newcg_list); | 2172 | INIT_LIST_HEAD(&newcg_list); |
2093 | for (i = 0; i < group_size; i++) { | 2173 | for (i = 0; i < group_size; i++) { |
2094 | tsk = flex_array_get_ptr(group, i); | 2174 | tc = flex_array_get(group, i); |
2095 | /* nothing to do if this task is already in the cgroup */ | 2175 | oldcg = tc->task->cgroups; |
2096 | oldcgrp = task_cgroup_from_root(tsk, root); | 2176 | |
2097 | if (cgrp == oldcgrp) | 2177 | /* if we don't already have it in the list get a new one */ |
2098 | continue; | 2178 | if (!css_set_check_fetched(cgrp, tc->task, oldcg, |
2099 | /* get old css_set pointer */ | 2179 | &newcg_list)) { |
2100 | task_lock(tsk); | ||
2101 | if (tsk->flags & PF_EXITING) { | ||
2102 | /* ignore this task if it's going away */ | ||
2103 | task_unlock(tsk); | ||
2104 | continue; | ||
2105 | } | ||
2106 | oldcg = tsk->cgroups; | ||
2107 | get_css_set(oldcg); | ||
2108 | task_unlock(tsk); | ||
2109 | /* see if the new one for us is already in the list? */ | ||
2110 | if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { | ||
2111 | /* was already there, nothing to do. */ | ||
2112 | put_css_set(oldcg); | ||
2113 | } else { | ||
2114 | /* we don't already have it. get new one. */ | ||
2115 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); | 2180 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); |
2116 | put_css_set(oldcg); | ||
2117 | if (retval) | 2181 | if (retval) |
2118 | goto out_list_teardown; | 2182 | goto out_list_teardown; |
2119 | } | 2183 | } |
2120 | } | 2184 | } |
2121 | 2185 | ||
2122 | /* | 2186 | /* |
2123 | * step 3: now that we're guaranteed success wrt the css_sets, proceed | 2187 | * step 3: now that we're guaranteed success wrt the css_sets, |
2124 | * to move all tasks to the new cgroup, calling ss->attach_task for each | 2188 | * proceed to move all tasks to the new cgroup. There are no |
2125 | * one along the way. there are no failure cases after here, so this is | 2189 | * failure cases after here, so this is the commit point. |
2126 | * the commit point. | ||
2127 | */ | 2190 | */ |
2128 | for_each_subsys(root, ss) { | ||
2129 | if (ss->pre_attach) | ||
2130 | ss->pre_attach(cgrp); | ||
2131 | } | ||
2132 | for (i = 0; i < group_size; i++) { | 2191 | for (i = 0; i < group_size; i++) { |
2133 | tsk = flex_array_get_ptr(group, i); | 2192 | tc = flex_array_get(group, i); |
2134 | /* leave current thread as it is if it's already there */ | 2193 | retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); |
2135 | oldcgrp = task_cgroup_from_root(tsk, root); | 2194 | BUG_ON(retval); |
2136 | if (cgrp == oldcgrp) | ||
2137 | continue; | ||
2138 | /* if the thread is PF_EXITING, it can just get skipped. */ | ||
2139 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); | ||
2140 | if (retval == 0) { | ||
2141 | /* attach each task to each subsystem */ | ||
2142 | for_each_subsys(root, ss) { | ||
2143 | if (ss->attach_task) | ||
2144 | ss->attach_task(cgrp, tsk); | ||
2145 | } | ||
2146 | } else { | ||
2147 | BUG_ON(retval != -ESRCH); | ||
2148 | } | ||
2149 | } | 2195 | } |
2150 | /* nothing is sensitive to fork() after this point. */ | 2196 | /* nothing is sensitive to fork() after this point. */ |
2151 | 2197 | ||
2152 | /* | 2198 | /* |
2153 | * step 4: do expensive, non-thread-specific subsystem callbacks. | 2199 | * step 4: do subsystem attach callbacks. |
2154 | * TODO: if ever a subsystem needs to know the oldcgrp for each task | ||
2155 | * being moved, this call will need to be reworked to communicate that. | ||
2156 | */ | 2200 | */ |
2157 | for_each_subsys(root, ss) { | 2201 | for_each_subsys(root, ss) { |
2158 | if (ss->attach) | 2202 | if (ss->attach) |
2159 | ss->attach(ss, cgrp, oldcgrp, leader); | 2203 | ss->attach(ss, cgrp, &tset); |
2160 | } | 2204 | } |
2161 | 2205 | ||
2162 | /* | 2206 | /* |
@@ -2176,20 +2220,12 @@ out_cancel_attach: | |||
2176 | /* same deal as in cgroup_attach_task */ | 2220 | /* same deal as in cgroup_attach_task */ |
2177 | if (retval) { | 2221 | if (retval) { |
2178 | for_each_subsys(root, ss) { | 2222 | for_each_subsys(root, ss) { |
2179 | if (ss == failed_ss) { | 2223 | if (ss == failed_ss) |
2180 | if (cancel_failed_ss && ss->cancel_attach) | ||
2181 | ss->cancel_attach(ss, cgrp, leader); | ||
2182 | break; | 2224 | break; |
2183 | } | ||
2184 | if (ss->cancel_attach) | 2225 | if (ss->cancel_attach) |
2185 | ss->cancel_attach(ss, cgrp, leader); | 2226 | ss->cancel_attach(ss, cgrp, &tset); |
2186 | } | 2227 | } |
2187 | } | 2228 | } |
2188 | /* clean up the array of referenced threads in the group. */ | ||
2189 | for (i = 0; i < group_size; i++) { | ||
2190 | tsk = flex_array_get_ptr(group, i); | ||
2191 | put_task_struct(tsk); | ||
2192 | } | ||
2193 | out_free_group_list: | 2229 | out_free_group_list: |
2194 | flex_array_free(group); | 2230 | flex_array_free(group); |
2195 | return retval; | 2231 | return retval; |
@@ -2197,8 +2233,8 @@ out_free_group_list: | |||
2197 | 2233 | ||
2198 | /* | 2234 | /* |
2199 | * Find the task_struct of the task to attach by vpid and pass it along to the | 2235 | * Find the task_struct of the task to attach by vpid and pass it along to the |
2200 | * function to attach either it or all tasks in its threadgroup. Will take | 2236 | * function to attach either it or all tasks in its threadgroup. Will lock |
2201 | * cgroup_mutex; may take task_lock of task. | 2237 | * cgroup_mutex and threadgroup; may take task_lock of task. |
2202 | */ | 2238 | */ |
2203 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | 2239 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) |
2204 | { | 2240 | { |
@@ -2225,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | |||
2225 | * detect it later. | 2261 | * detect it later. |
2226 | */ | 2262 | */ |
2227 | tsk = tsk->group_leader; | 2263 | tsk = tsk->group_leader; |
2228 | } else if (tsk->flags & PF_EXITING) { | ||
2229 | /* optimization for the single-task-only case */ | ||
2230 | rcu_read_unlock(); | ||
2231 | cgroup_unlock(); | ||
2232 | return -ESRCH; | ||
2233 | } | 2264 | } |
2234 | |||
2235 | /* | 2265 | /* |
2236 | * even if we're attaching all tasks in the thread group, we | 2266 | * even if we're attaching all tasks in the thread group, we |
2237 | * only need to check permissions on one of them. | 2267 | * only need to check permissions on one of them. |
@@ -2254,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | |||
2254 | get_task_struct(tsk); | 2284 | get_task_struct(tsk); |
2255 | } | 2285 | } |
2256 | 2286 | ||
2257 | if (threadgroup) { | 2287 | threadgroup_lock(tsk); |
2258 | threadgroup_fork_write_lock(tsk); | 2288 | |
2289 | if (threadgroup) | ||
2259 | ret = cgroup_attach_proc(cgrp, tsk); | 2290 | ret = cgroup_attach_proc(cgrp, tsk); |
2260 | threadgroup_fork_write_unlock(tsk); | 2291 | else |
2261 | } else { | ||
2262 | ret = cgroup_attach_task(cgrp, tsk); | 2292 | ret = cgroup_attach_task(cgrp, tsk); |
2263 | } | 2293 | |
2294 | threadgroup_unlock(tsk); | ||
2295 | |||
2264 | put_task_struct(tsk); | 2296 | put_task_struct(tsk); |
2265 | cgroup_unlock(); | 2297 | cgroup_unlock(); |
2266 | return ret; | 2298 | return ret; |
@@ -2311,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | |||
2311 | return -EINVAL; | 2343 | return -EINVAL; |
2312 | if (!cgroup_lock_live_group(cgrp)) | 2344 | if (!cgroup_lock_live_group(cgrp)) |
2313 | return -ENODEV; | 2345 | return -ENODEV; |
2346 | mutex_lock(&cgroup_root_mutex); | ||
2314 | strcpy(cgrp->root->release_agent_path, buffer); | 2347 | strcpy(cgrp->root->release_agent_path, buffer); |
2348 | mutex_unlock(&cgroup_root_mutex); | ||
2315 | cgroup_unlock(); | 2349 | cgroup_unlock(); |
2316 | return 0; | 2350 | return 0; |
2317 | } | 2351 | } |
@@ -2590,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file) | |||
2590 | return __d_cft(file->f_dentry); | 2624 | return __d_cft(file->f_dentry); |
2591 | } | 2625 | } |
2592 | 2626 | ||
2593 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2627 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, |
2594 | struct super_block *sb) | 2628 | struct super_block *sb) |
2595 | { | 2629 | { |
2596 | struct inode *inode; | 2630 | struct inode *inode; |
@@ -2631,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, | |||
2631 | * @mode: mode to set on new directory. | 2665 | * @mode: mode to set on new directory. |
2632 | */ | 2666 | */ |
2633 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | 2667 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, |
2634 | mode_t mode) | 2668 | umode_t mode) |
2635 | { | 2669 | { |
2636 | struct dentry *parent; | 2670 | struct dentry *parent; |
2637 | int error = 0; | 2671 | int error = 0; |
@@ -2658,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | |||
2658 | * returns S_IRUGO if it has only a read handler | 2692 | * returns S_IRUGO if it has only a read handler |
2659 | * returns S_IWUSR if it has only a write handler | 2693 | * returns S_IWUSR if it has only a write handler |
2660 | */ | 2694 | */ |
2661 | static mode_t cgroup_file_mode(const struct cftype *cft) | 2695 | static umode_t cgroup_file_mode(const struct cftype *cft) |
2662 | { | 2696 | { |
2663 | mode_t mode = 0; | 2697 | umode_t mode = 0; |
2664 | 2698 | ||
2665 | if (cft->mode) | 2699 | if (cft->mode) |
2666 | return cft->mode; | 2700 | return cft->mode; |
@@ -2683,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp, | |||
2683 | struct dentry *dir = cgrp->dentry; | 2717 | struct dentry *dir = cgrp->dentry; |
2684 | struct dentry *dentry; | 2718 | struct dentry *dentry; |
2685 | int error; | 2719 | int error; |
2686 | mode_t mode; | 2720 | umode_t mode; |
2687 | 2721 | ||
2688 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2722 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2689 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2723 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
@@ -2794,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
2794 | } | 2828 | } |
2795 | 2829 | ||
2796 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 2830 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
2831 | __acquires(css_set_lock) | ||
2797 | { | 2832 | { |
2798 | /* | 2833 | /* |
2799 | * The first time anyone tries to iterate across a cgroup, | 2834 | * The first time anyone tries to iterate across a cgroup, |
@@ -2833,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
2833 | } | 2868 | } |
2834 | 2869 | ||
2835 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | 2870 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) |
2871 | __releases(css_set_lock) | ||
2836 | { | 2872 | { |
2837 | read_unlock(&css_set_lock); | 2873 | read_unlock(&css_set_lock); |
2838 | } | 2874 | } |
@@ -3757,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) | |||
3757 | * Must be called with the mutex on the parent inode held | 3793 | * Must be called with the mutex on the parent inode held |
3758 | */ | 3794 | */ |
3759 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 3795 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
3760 | mode_t mode) | 3796 | umode_t mode) |
3761 | { | 3797 | { |
3762 | struct cgroup *cgrp; | 3798 | struct cgroup *cgrp; |
3763 | struct cgroupfs_root *root = parent->root; | 3799 | struct cgroupfs_root *root = parent->root; |
@@ -3851,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3851 | return err; | 3887 | return err; |
3852 | } | 3888 | } |
3853 | 3889 | ||
3854 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) | 3890 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
3855 | { | 3891 | { |
3856 | struct cgroup *c_parent = dentry->d_parent->d_fsdata; | 3892 | struct cgroup *c_parent = dentry->d_parent->d_fsdata; |
3857 | 3893 | ||
@@ -4496,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
4496 | * | 4532 | * |
4497 | * A pointer to the shared css_set was automatically copied in | 4533 | * A pointer to the shared css_set was automatically copied in |
4498 | * fork.c by dup_task_struct(). However, we ignore that copy, since | 4534 | * fork.c by dup_task_struct(). However, we ignore that copy, since |
4499 | * it was not made under the protection of RCU or cgroup_mutex, so | 4535 | * it was not made under the protection of RCU, cgroup_mutex or |
4500 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might | 4536 | * threadgroup_change_begin(), so it might no longer be a valid |
4501 | * have already changed current->cgroups, allowing the previously | 4537 | * cgroup pointer. cgroup_attach_task() might have already changed |
4502 | * referenced cgroup group to be removed and freed. | 4538 | * current->cgroups, allowing the previously referenced cgroup |
4539 | * group to be removed and freed. | ||
4540 | * | ||
4541 | * Outside the pointer validity we also need to process the css_set | ||
4542 | * inheritance between threadgroup_change_begin() and | |
4543 | * threadgroup_change_end(), this way there is no leak in any process | |
4544 | * wide migration performed by cgroup_attach_proc() that could otherwise | ||
4545 | * miss a thread because it is too early or too late in the fork stage. | ||
4503 | * | 4546 | * |
4504 | * At the point that cgroup_fork() is called, 'current' is the parent | 4547 | * At the point that cgroup_fork() is called, 'current' is the parent |
4505 | * task, and the passed argument 'child' points to the child task. | 4548 | * task, and the passed argument 'child' points to the child task. |
4506 | */ | 4549 | */ |
4507 | void cgroup_fork(struct task_struct *child) | 4550 | void cgroup_fork(struct task_struct *child) |
4508 | { | 4551 | { |
4509 | task_lock(current); | 4552 | /* |
4553 | * We don't need to task_lock() current because current->cgroups | ||
4554 | * can't be changed concurrently here. The parent obviously hasn't | ||
4555 | * exited and called cgroup_exit(), and we are synchronized against | ||
4556 | * cgroup migration through threadgroup_change_begin(). | ||
4557 | */ | ||
4510 | child->cgroups = current->cgroups; | 4558 | child->cgroups = current->cgroups; |
4511 | get_css_set(child->cgroups); | 4559 | get_css_set(child->cgroups); |
4512 | task_unlock(current); | ||
4513 | INIT_LIST_HEAD(&child->cg_list); | 4560 | INIT_LIST_HEAD(&child->cg_list); |
4514 | } | 4561 | } |
4515 | 4562 | ||
@@ -4551,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child) | |||
4551 | { | 4598 | { |
4552 | if (use_task_css_set_links) { | 4599 | if (use_task_css_set_links) { |
4553 | write_lock(&css_set_lock); | 4600 | write_lock(&css_set_lock); |
4554 | task_lock(child); | 4601 | if (list_empty(&child->cg_list)) { |
4555 | if (list_empty(&child->cg_list)) | 4602 | /* |
4603 | * It's safe to use child->cgroups without task_lock() | ||
4604 | * here because we are protected through | ||
4605 | * threadgroup_change_begin() against concurrent | ||
4606 | * css_set change in cgroup_task_migrate(). Also | ||
4607 | * the task can't exit at that point until | ||
4608 | * wake_up_new_task() is called, so we are protected | ||
4609 | * against cgroup_exit() setting child->cgroups to | |
4610 | * init_css_set. | ||
4611 | */ | ||
4556 | list_add(&child->cg_list, &child->cgroups->tasks); | 4612 | list_add(&child->cg_list, &child->cgroups->tasks); |
4557 | task_unlock(child); | 4613 | } |
4558 | write_unlock(&css_set_lock); | 4614 | write_unlock(&css_set_lock); |
4559 | } | 4615 | } |
4560 | } | 4616 | } |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5e828a2ca8e6..fc0646b78a64 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 48 | struct freezer, css); |
49 | } | 49 | } |
50 | 50 | ||
51 | static inline int __cgroup_freezing_or_frozen(struct task_struct *task) | 51 | bool cgroup_freezing(struct task_struct *task) |
52 | { | 52 | { |
53 | enum freezer_state state = task_freezer(task)->state; | 53 | enum freezer_state state; |
54 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); | 54 | bool ret; |
55 | } | ||
56 | 55 | ||
57 | int cgroup_freezing_or_frozen(struct task_struct *task) | 56 | rcu_read_lock(); |
58 | { | 57 | state = task_freezer(task)->state; |
59 | int result; | 58 | ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; |
60 | task_lock(task); | 59 | rcu_read_unlock(); |
61 | result = __cgroup_freezing_or_frozen(task); | 60 | |
62 | task_unlock(task); | 61 | return ret; |
63 | return result; | ||
64 | } | 62 | } |
65 | 63 | ||
66 | /* | 64 | /* |
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys; | |||
102 | * freezer_can_attach(): | 100 | * freezer_can_attach(): |
103 | * cgroup_mutex (held by caller of can_attach) | 101 | * cgroup_mutex (held by caller of can_attach) |
104 | * | 102 | * |
105 | * cgroup_freezing_or_frozen(): | ||
106 | * task->alloc_lock (to get task's cgroup) | ||
107 | * | ||
108 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): | 103 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): |
109 | * freezer->lock | 104 | * freezer->lock |
110 | * sighand->siglock (if the cgroup is freezing) | 105 | * sighand->siglock (if the cgroup is freezing) |
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys; | |||
130 | * write_lock css_set_lock (cgroup iterator start) | 125 | * write_lock css_set_lock (cgroup iterator start) |
131 | * task->alloc_lock | 126 | * task->alloc_lock |
132 | * read_lock css_set_lock (cgroup iterator start) | 127 | * read_lock css_set_lock (cgroup iterator start) |
133 | * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) | 128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) |
134 | * sighand->siglock | 129 | * sighand->siglock |
135 | */ | 130 | */ |
136 | static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, | 131 | static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, |
@@ -150,7 +145,18 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, | |||
150 | static void freezer_destroy(struct cgroup_subsys *ss, | 145 | static void freezer_destroy(struct cgroup_subsys *ss, |
151 | struct cgroup *cgroup) | 146 | struct cgroup *cgroup) |
152 | { | 147 | { |
153 | kfree(cgroup_freezer(cgroup)); | 148 | struct freezer *freezer = cgroup_freezer(cgroup); |
149 | |||
150 | if (freezer->state != CGROUP_THAWED) | ||
151 | atomic_dec(&system_freezing_cnt); | ||
152 | kfree(freezer); | ||
153 | } | ||
154 | |||
155 | /* task is frozen or will freeze immediately when next it gets woken */ | ||
156 | static bool is_task_frozen_enough(struct task_struct *task) | ||
157 | { | ||
158 | return frozen(task) || | ||
159 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
154 | } | 160 | } |
155 | 161 | ||
156 | /* | 162 | /* |
@@ -160,13 +166,17 @@ static void freezer_destroy(struct cgroup_subsys *ss, | |||
160 | */ | 166 | */ |
161 | static int freezer_can_attach(struct cgroup_subsys *ss, | 167 | static int freezer_can_attach(struct cgroup_subsys *ss, |
162 | struct cgroup *new_cgroup, | 168 | struct cgroup *new_cgroup, |
163 | struct task_struct *task) | 169 | struct cgroup_taskset *tset) |
164 | { | 170 | { |
165 | struct freezer *freezer; | 171 | struct freezer *freezer; |
172 | struct task_struct *task; | ||
166 | 173 | ||
167 | /* | 174 | /* |
168 | * Anything frozen can't move or be moved to/from. | 175 | * Anything frozen can't move or be moved to/from. |
169 | */ | 176 | */ |
177 | cgroup_taskset_for_each(task, new_cgroup, tset) | ||
178 | if (cgroup_freezing(task)) | ||
179 | return -EBUSY; | ||
170 | 180 | ||
171 | freezer = cgroup_freezer(new_cgroup); | 181 | freezer = cgroup_freezer(new_cgroup); |
172 | if (freezer->state != CGROUP_THAWED) | 182 | if (freezer->state != CGROUP_THAWED) |
@@ -175,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss, | |||
175 | return 0; | 185 | return 0; |
176 | } | 186 | } |
177 | 187 | ||
178 | static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | ||
179 | { | ||
180 | rcu_read_lock(); | ||
181 | if (__cgroup_freezing_or_frozen(tsk)) { | ||
182 | rcu_read_unlock(); | ||
183 | return -EBUSY; | ||
184 | } | ||
185 | rcu_read_unlock(); | ||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | 188 | static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) |
190 | { | 189 | { |
191 | struct freezer *freezer; | 190 | struct freezer *freezer; |
@@ -213,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | |||
213 | 212 | ||
214 | /* Locking avoids race with FREEZING -> THAWED transitions. */ | 213 | /* Locking avoids race with FREEZING -> THAWED transitions. */ |
215 | if (freezer->state == CGROUP_FREEZING) | 214 | if (freezer->state == CGROUP_FREEZING) |
216 | freeze_task(task, true); | 215 | freeze_task(task); |
217 | spin_unlock_irq(&freezer->lock); | 216 | spin_unlock_irq(&freezer->lock); |
218 | } | 217 | } |
219 | 218 | ||
@@ -231,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup, | |||
231 | cgroup_iter_start(cgroup, &it); | 230 | cgroup_iter_start(cgroup, &it); |
232 | while ((task = cgroup_iter_next(cgroup, &it))) { | 231 | while ((task = cgroup_iter_next(cgroup, &it))) { |
233 | ntotal++; | 232 | ntotal++; |
234 | if (frozen(task)) | 233 | if (freezing(task) && is_task_frozen_enough(task)) |
235 | nfrozen++; | 234 | nfrozen++; |
236 | } | 235 | } |
237 | 236 | ||
@@ -279,12 +278,11 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
279 | struct task_struct *task; | 278 | struct task_struct *task; |
280 | unsigned int num_cant_freeze_now = 0; | 279 | unsigned int num_cant_freeze_now = 0; |
281 | 280 | ||
282 | freezer->state = CGROUP_FREEZING; | ||
283 | cgroup_iter_start(cgroup, &it); | 281 | cgroup_iter_start(cgroup, &it); |
284 | while ((task = cgroup_iter_next(cgroup, &it))) { | 282 | while ((task = cgroup_iter_next(cgroup, &it))) { |
285 | if (!freeze_task(task, true)) | 283 | if (!freeze_task(task)) |
286 | continue; | 284 | continue; |
287 | if (frozen(task)) | 285 | if (is_task_frozen_enough(task)) |
288 | continue; | 286 | continue; |
289 | if (!freezing(task) && !freezer_should_skip(task)) | 287 | if (!freezing(task) && !freezer_should_skip(task)) |
290 | num_cant_freeze_now++; | 288 | num_cant_freeze_now++; |
@@ -300,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
300 | struct task_struct *task; | 298 | struct task_struct *task; |
301 | 299 | ||
302 | cgroup_iter_start(cgroup, &it); | 300 | cgroup_iter_start(cgroup, &it); |
303 | while ((task = cgroup_iter_next(cgroup, &it))) { | 301 | while ((task = cgroup_iter_next(cgroup, &it))) |
304 | thaw_process(task); | 302 | __thaw_task(task); |
305 | } | ||
306 | cgroup_iter_end(cgroup, &it); | 303 | cgroup_iter_end(cgroup, &it); |
307 | |||
308 | freezer->state = CGROUP_THAWED; | ||
309 | } | 304 | } |
310 | 305 | ||
311 | static int freezer_change_state(struct cgroup *cgroup, | 306 | static int freezer_change_state(struct cgroup *cgroup, |
@@ -319,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup, | |||
319 | spin_lock_irq(&freezer->lock); | 314 | spin_lock_irq(&freezer->lock); |
320 | 315 | ||
321 | update_if_frozen(cgroup, freezer); | 316 | update_if_frozen(cgroup, freezer); |
322 | if (goal_state == freezer->state) | ||
323 | goto out; | ||
324 | 317 | ||
325 | switch (goal_state) { | 318 | switch (goal_state) { |
326 | case CGROUP_THAWED: | 319 | case CGROUP_THAWED: |
320 | if (freezer->state != CGROUP_THAWED) | ||
321 | atomic_dec(&system_freezing_cnt); | ||
322 | freezer->state = CGROUP_THAWED; | ||
327 | unfreeze_cgroup(cgroup, freezer); | 323 | unfreeze_cgroup(cgroup, freezer); |
328 | break; | 324 | break; |
329 | case CGROUP_FROZEN: | 325 | case CGROUP_FROZEN: |
326 | if (freezer->state == CGROUP_THAWED) | ||
327 | atomic_inc(&system_freezing_cnt); | ||
328 | freezer->state = CGROUP_FREEZING; | ||
330 | retval = try_to_freeze_cgroup(cgroup, freezer); | 329 | retval = try_to_freeze_cgroup(cgroup, freezer); |
331 | break; | 330 | break; |
332 | default: | 331 | default: |
333 | BUG(); | 332 | BUG(); |
334 | } | 333 | } |
335 | out: | 334 | |
336 | spin_unlock_irq(&freezer->lock); | 335 | spin_unlock_irq(&freezer->lock); |
337 | 336 | ||
338 | return retval; | 337 | return retval; |
@@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = { | |||
381 | .populate = freezer_populate, | 380 | .populate = freezer_populate, |
382 | .subsys_id = freezer_subsys_id, | 381 | .subsys_id = freezer_subsys_id, |
383 | .can_attach = freezer_can_attach, | 382 | .can_attach = freezer_can_attach, |
384 | .can_attach_task = freezer_can_attach_task, | ||
385 | .pre_attach = NULL, | ||
386 | .attach_task = NULL, | ||
387 | .attach = NULL, | ||
388 | .fork = freezer_fork, | 383 | .fork = freezer_fork, |
389 | .exit = NULL, | ||
390 | }; | 384 | }; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 563f13609470..2060c6e57027 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu) | |||
178 | write_lock_irq(&tasklist_lock); | 178 | write_lock_irq(&tasklist_lock); |
179 | for_each_process(p) { | 179 | for_each_process(p) { |
180 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 180 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
181 | (!cputime_eq(p->utime, cputime_zero) || | 181 | (p->utime || p->stime)) |
182 | !cputime_eq(p->stime, cputime_zero))) | ||
183 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | 182 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " |
184 | "(state = %ld, flags = %x)\n", | 183 | "(state = %ld, flags = %x)\n", |
185 | p->comm, task_pid_nr(p), cpu, | 184 | p->comm, task_pid_nr(p), cpu, |
@@ -380,6 +379,7 @@ out: | |||
380 | cpu_maps_update_done(); | 379 | cpu_maps_update_done(); |
381 | return err; | 380 | return err; |
382 | } | 381 | } |
382 | EXPORT_SYMBOL_GPL(cpu_up); | ||
383 | 383 | ||
384 | #ifdef CONFIG_PM_SLEEP_SMP | 384 | #ifdef CONFIG_PM_SLEEP_SMP |
385 | static cpumask_var_t frozen_cpus; | 385 | static cpumask_var_t frozen_cpus; |
@@ -470,7 +470,7 @@ out: | |||
470 | cpu_maps_update_done(); | 470 | cpu_maps_update_done(); |
471 | } | 471 | } |
472 | 472 | ||
473 | static int alloc_frozen_cpus(void) | 473 | static int __init alloc_frozen_cpus(void) |
474 | { | 474 | { |
475 | if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) | 475 | if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) |
476 | return -ENOMEM; | 476 | return -ENOMEM; |
@@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, | |||
543 | } | 543 | } |
544 | 544 | ||
545 | 545 | ||
546 | int cpu_hotplug_pm_sync_init(void) | 546 | static int __init cpu_hotplug_pm_sync_init(void) |
547 | { | 547 | { |
548 | pm_notifier(cpu_hotplug_pm_callback, 0); | 548 | pm_notifier(cpu_hotplug_pm_callback, 0); |
549 | return 0; | 549 | return 0; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fe58c46a426..a09ac2b9a661 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
123 | struct cpuset, css); | 123 | struct cpuset, css); |
124 | } | 124 | } |
125 | 125 | ||
126 | #ifdef CONFIG_NUMA | ||
127 | static inline bool task_has_mempolicy(struct task_struct *task) | ||
128 | { | ||
129 | return task->mempolicy; | ||
130 | } | ||
131 | #else | ||
132 | static inline bool task_has_mempolicy(struct task_struct *task) | ||
133 | { | ||
134 | return false; | ||
135 | } | ||
136 | #endif | ||
137 | |||
138 | |||
126 | /* bits in struct cpuset flags field */ | 139 | /* bits in struct cpuset flags field */ |
127 | typedef enum { | 140 | typedef enum { |
128 | CS_CPU_EXCLUSIVE, | 141 | CS_CPU_EXCLUSIVE, |
@@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
949 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | 962 | static void cpuset_change_task_nodemask(struct task_struct *tsk, |
950 | nodemask_t *newmems) | 963 | nodemask_t *newmems) |
951 | { | 964 | { |
952 | bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); | 965 | bool need_loop; |
953 | 966 | ||
954 | repeat: | 967 | repeat: |
955 | /* | 968 | /* |
@@ -962,6 +975,14 @@ repeat: | |||
962 | return; | 975 | return; |
963 | 976 | ||
964 | task_lock(tsk); | 977 | task_lock(tsk); |
978 | /* | ||
979 | * Determine if a loop is necessary if another thread is doing | ||
980 | * get_mems_allowed(). If at least one node remains unchanged and | ||
981 | * tsk does not have a mempolicy, then an empty nodemask will not be | ||
982 | * possible when mems_allowed is larger than a word. | ||
983 | */ | ||
984 | need_loop = task_has_mempolicy(tsk) || | ||
985 | !nodes_intersects(*newmems, tsk->mems_allowed); | ||
965 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | 986 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
966 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); | 987 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); |
967 | 988 | ||
@@ -981,11 +1002,9 @@ repeat: | |||
981 | 1002 | ||
982 | /* | 1003 | /* |
983 | * Allocation of memory is very fast, we needn't sleep when waiting | 1004 | * Allocation of memory is very fast, we needn't sleep when waiting |
984 | * for the read-side. No wait is necessary, however, if at least one | 1005 | * for the read-side. |
985 | * node remains unchanged. | ||
986 | */ | 1006 | */ |
987 | while (masks_disjoint && | 1007 | while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { |
988 | ACCESS_ONCE(tsk->mems_allowed_change_disable)) { | ||
989 | task_unlock(tsk); | 1008 | task_unlock(tsk); |
990 | if (!task_curr(tsk)) | 1009 | if (!task_curr(tsk)) |
991 | yield(); | 1010 | yield(); |
@@ -1370,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1370 | return val; | 1389 | return val; |
1371 | } | 1390 | } |
1372 | 1391 | ||
1373 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | ||
1374 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
1375 | struct task_struct *tsk) | ||
1376 | { | ||
1377 | struct cpuset *cs = cgroup_cs(cont); | ||
1378 | |||
1379 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
1380 | return -ENOSPC; | ||
1381 | |||
1382 | /* | ||
1383 | * Kthreads bound to specific cpus cannot be moved to a new cpuset; we | ||
1384 | * cannot change their cpu affinity and isolating such threads by their | ||
1385 | * set of allowed nodes is unnecessary. Thus, cpusets are not | ||
1386 | * applicable for such threads. This prevents checking for success of | ||
1387 | * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may | ||
1388 | * be changed. | ||
1389 | */ | ||
1390 | if (tsk->flags & PF_THREAD_BOUND) | ||
1391 | return -EINVAL; | ||
1392 | |||
1393 | return 0; | ||
1394 | } | ||
1395 | |||
1396 | static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) | ||
1397 | { | ||
1398 | return security_task_setscheduler(task); | ||
1399 | } | ||
1400 | |||
1401 | /* | 1392 | /* |
1402 | * Protected by cgroup_lock. The nodemasks must be stored globally because | 1393 | * Protected by cgroup_lock. The nodemasks must be stored globally because |
1403 | * dynamically allocating them is not allowed in pre_attach, and they must | 1394 | * dynamically allocating them is not allowed in can_attach, and they must |
1404 | * persist among pre_attach, attach_task, and attach. | 1395 | * persist until attach. |
1405 | */ | 1396 | */ |
1406 | static cpumask_var_t cpus_attach; | 1397 | static cpumask_var_t cpus_attach; |
1407 | static nodemask_t cpuset_attach_nodemask_from; | 1398 | static nodemask_t cpuset_attach_nodemask_from; |
1408 | static nodemask_t cpuset_attach_nodemask_to; | 1399 | static nodemask_t cpuset_attach_nodemask_to; |
1409 | 1400 | ||
1410 | /* Set-up work for before attaching each task. */ | 1401 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ |
1411 | static void cpuset_pre_attach(struct cgroup *cont) | 1402 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
1403 | struct cgroup_taskset *tset) | ||
1412 | { | 1404 | { |
1413 | struct cpuset *cs = cgroup_cs(cont); | 1405 | struct cpuset *cs = cgroup_cs(cgrp); |
1406 | struct task_struct *task; | ||
1407 | int ret; | ||
1414 | 1408 | ||
1409 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
1410 | return -ENOSPC; | ||
1411 | |||
1412 | cgroup_taskset_for_each(task, cgrp, tset) { | ||
1413 | /* | ||
1414 | * Kthreads bound to specific cpus cannot be moved to a new | ||
1415 | * cpuset; we cannot change their cpu affinity and | ||
1416 | * isolating such threads by their set of allowed nodes is | ||
1417 | * unnecessary. Thus, cpusets are not applicable for such | ||
1418 | * threads. This prevents checking for success of | ||
1419 | * set_cpus_allowed_ptr() on all attached tasks before | ||
1420 | * cpus_allowed may be changed. | ||
1421 | */ | ||
1422 | if (task->flags & PF_THREAD_BOUND) | ||
1423 | return -EINVAL; | ||
1424 | if ((ret = security_task_setscheduler(task))) | ||
1425 | return ret; | ||
1426 | } | ||
1427 | |||
1428 | /* prepare for attach */ | ||
1415 | if (cs == &top_cpuset) | 1429 | if (cs == &top_cpuset) |
1416 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1430 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1417 | else | 1431 | else |
1418 | guarantee_online_cpus(cs, cpus_attach); | 1432 | guarantee_online_cpus(cs, cpus_attach); |
1419 | 1433 | ||
1420 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 1434 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); |
1421 | } | ||
1422 | |||
1423 | /* Per-thread attachment work. */ | ||
1424 | static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) | ||
1425 | { | ||
1426 | int err; | ||
1427 | struct cpuset *cs = cgroup_cs(cont); | ||
1428 | 1435 | ||
1429 | /* | 1436 | return 0; |
1430 | * can_attach beforehand should guarantee that this doesn't fail. | ||
1431 | * TODO: have a better way to handle failure here | ||
1432 | */ | ||
1433 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | ||
1434 | WARN_ON_ONCE(err); | ||
1435 | |||
1436 | cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); | ||
1437 | cpuset_update_task_spread_flag(cs, tsk); | ||
1438 | } | 1437 | } |
1439 | 1438 | ||
1440 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | 1439 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
1441 | struct cgroup *oldcont, struct task_struct *tsk) | 1440 | struct cgroup_taskset *tset) |
1442 | { | 1441 | { |
1443 | struct mm_struct *mm; | 1442 | struct mm_struct *mm; |
1444 | struct cpuset *cs = cgroup_cs(cont); | 1443 | struct task_struct *task; |
1445 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1444 | struct task_struct *leader = cgroup_taskset_first(tset); |
1445 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | ||
1446 | struct cpuset *cs = cgroup_cs(cgrp); | ||
1447 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | ||
1448 | |||
1449 | cgroup_taskset_for_each(task, cgrp, tset) { | ||
1450 | /* | ||
1451 | * can_attach beforehand should guarantee that this doesn't | ||
1452 | * fail. TODO: have a better way to handle failure here | ||
1453 | */ | ||
1454 | WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); | ||
1455 | |||
1456 | cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); | ||
1457 | cpuset_update_task_spread_flag(cs, task); | ||
1458 | } | ||
1446 | 1459 | ||
1447 | /* | 1460 | /* |
1448 | * Change mm, possibly for multiple threads in a threadgroup. This is | 1461 | * Change mm, possibly for multiple threads in a threadgroup. This is |
@@ -1450,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1450 | */ | 1463 | */ |
1451 | cpuset_attach_nodemask_from = oldcs->mems_allowed; | 1464 | cpuset_attach_nodemask_from = oldcs->mems_allowed; |
1452 | cpuset_attach_nodemask_to = cs->mems_allowed; | 1465 | cpuset_attach_nodemask_to = cs->mems_allowed; |
1453 | mm = get_task_mm(tsk); | 1466 | mm = get_task_mm(leader); |
1454 | if (mm) { | 1467 | if (mm) { |
1455 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); | 1468 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
1456 | if (is_memory_migrate(cs)) | 1469 | if (is_memory_migrate(cs)) |
@@ -1906,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = { | |||
1906 | .create = cpuset_create, | 1919 | .create = cpuset_create, |
1907 | .destroy = cpuset_destroy, | 1920 | .destroy = cpuset_destroy, |
1908 | .can_attach = cpuset_can_attach, | 1921 | .can_attach = cpuset_can_attach, |
1909 | .can_attach_task = cpuset_can_attach_task, | ||
1910 | .pre_attach = cpuset_pre_attach, | ||
1911 | .attach_task = cpuset_attach_task, | ||
1912 | .attach = cpuset_attach, | 1922 | .attach = cpuset_attach, |
1913 | .populate = cpuset_populate, | 1923 | .populate = cpuset_populate, |
1914 | .post_clone = cpuset_post_clone, | 1924 | .post_clone = cpuset_post_clone, |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 63786e71a3cd..e2ae7349437f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv) | |||
1982 | kdb_printf("%-20s%8u 0x%p ", mod->name, | 1982 | kdb_printf("%-20s%8u 0x%p ", mod->name, |
1983 | mod->core_size, (void *)mod); | 1983 | mod->core_size, (void *)mod); |
1984 | #ifdef CONFIG_MODULE_UNLOAD | 1984 | #ifdef CONFIG_MODULE_UNLOAD |
1985 | kdb_printf("%4d ", module_refcount(mod)); | 1985 | kdb_printf("%4ld ", module_refcount(mod)); |
1986 | #endif | 1986 | #endif |
1987 | if (mod->state == MODULE_STATE_GOING) | 1987 | if (mod->state == MODULE_STATE_GOING) |
1988 | kdb_printf(" (Unloading)"); | 1988 | kdb_printf(" (Unloading)"); |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 5532dd37aa86..7d6fb40d2188 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p) | |||
636 | (p->exit_state & EXIT_ZOMBIE) ? 'Z' : | 636 | (p->exit_state & EXIT_ZOMBIE) ? 'Z' : |
637 | (p->exit_state & EXIT_DEAD) ? 'E' : | 637 | (p->exit_state & EXIT_DEAD) ? 'E' : |
638 | (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; | 638 | (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; |
639 | if (p->pid == 0) { | 639 | if (is_idle_task(p)) { |
640 | /* Idle task. Is it really idle, apart from the kdb | 640 | /* Idle task. Is it really idle, apart from the kdb |
641 | * interrupt? */ | 641 | * interrupt? */ |
642 | if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { | 642 | if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 89e5e8aa4c36..22d901f9caf4 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile | |||
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER | |||
2 | CFLAGS_REMOVE_core.o = -pg | 2 | CFLAGS_REMOVE_core.o = -pg |
3 | endif | 3 | endif |
4 | 4 | ||
5 | obj-y := core.o ring_buffer.o | 5 | obj-y := core.o ring_buffer.o callchain.o |
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c new file mode 100644 index 000000000000..6581a040f399 --- /dev/null +++ b/kernel/events/callchain.c | |||
@@ -0,0 +1,189 @@ | |||
1 | /* | ||
2 | * Performance events callchain code, extracted from core.c: | ||
3 | * | ||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
8 | * | ||
9 | * For licensing details see kernel-base/COPYING | ||
10 | */ | ||
11 | |||
12 | #include <linux/perf_event.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include "internal.h" | ||
15 | |||
16 | struct callchain_cpus_entries { | ||
17 | struct rcu_head rcu_head; | ||
18 | struct perf_callchain_entry *cpu_entries[0]; | ||
19 | }; | ||
20 | |||
21 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
22 | static atomic_t nr_callchain_events; | ||
23 | static DEFINE_MUTEX(callchain_mutex); | ||
24 | static struct callchain_cpus_entries *callchain_cpus_entries; | ||
25 | |||
26 | |||
27 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
28 | struct pt_regs *regs) | ||
29 | { | ||
30 | } | ||
31 | |||
32 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
33 | struct pt_regs *regs) | ||
34 | { | ||
35 | } | ||
36 | |||
37 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
38 | { | ||
39 | struct callchain_cpus_entries *entries; | ||
40 | int cpu; | ||
41 | |||
42 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
43 | |||
44 | for_each_possible_cpu(cpu) | ||
45 | kfree(entries->cpu_entries[cpu]); | ||
46 | |||
47 | kfree(entries); | ||
48 | } | ||
49 | |||
50 | static void release_callchain_buffers(void) | ||
51 | { | ||
52 | struct callchain_cpus_entries *entries; | ||
53 | |||
54 | entries = callchain_cpus_entries; | ||
55 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
56 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
57 | } | ||
58 | |||
59 | static int alloc_callchain_buffers(void) | ||
60 | { | ||
61 | int cpu; | ||
62 | int size; | ||
63 | struct callchain_cpus_entries *entries; | ||
64 | |||
65 | /* | ||
66 | * We can't use the percpu allocation API for data that can be | ||
67 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
68 | * until that gets sorted out. | ||
69 | */ | ||
70 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); | ||
71 | |||
72 | entries = kzalloc(size, GFP_KERNEL); | ||
73 | if (!entries) | ||
74 | return -ENOMEM; | ||
75 | |||
76 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
77 | |||
78 | for_each_possible_cpu(cpu) { | ||
79 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
80 | cpu_to_node(cpu)); | ||
81 | if (!entries->cpu_entries[cpu]) | ||
82 | goto fail; | ||
83 | } | ||
84 | |||
85 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
86 | |||
87 | return 0; | ||
88 | |||
89 | fail: | ||
90 | for_each_possible_cpu(cpu) | ||
91 | kfree(entries->cpu_entries[cpu]); | ||
92 | kfree(entries); | ||
93 | |||
94 | return -ENOMEM; | ||
95 | } | ||
96 | |||
97 | int get_callchain_buffers(void) | ||
98 | { | ||
99 | int err = 0; | ||
100 | int count; | ||
101 | |||
102 | mutex_lock(&callchain_mutex); | ||
103 | |||
104 | count = atomic_inc_return(&nr_callchain_events); | ||
105 | if (WARN_ON_ONCE(count < 1)) { | ||
106 | err = -EINVAL; | ||
107 | goto exit; | ||
108 | } | ||
109 | |||
110 | if (count > 1) { | ||
111 | /* If the allocation failed, give up */ | ||
112 | if (!callchain_cpus_entries) | ||
113 | err = -ENOMEM; | ||
114 | goto exit; | ||
115 | } | ||
116 | |||
117 | err = alloc_callchain_buffers(); | ||
118 | exit: | ||
119 | mutex_unlock(&callchain_mutex); | ||
120 | |||
121 | return err; | ||
122 | } | ||
123 | |||
124 | void put_callchain_buffers(void) | ||
125 | { | ||
126 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
127 | release_callchain_buffers(); | ||
128 | mutex_unlock(&callchain_mutex); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
133 | { | ||
134 | int cpu; | ||
135 | struct callchain_cpus_entries *entries; | ||
136 | |||
137 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
138 | if (*rctx == -1) | ||
139 | return NULL; | ||
140 | |||
141 | entries = rcu_dereference(callchain_cpus_entries); | ||
142 | if (!entries) | ||
143 | return NULL; | ||
144 | |||
145 | cpu = smp_processor_id(); | ||
146 | |||
147 | return &entries->cpu_entries[cpu][*rctx]; | ||
148 | } | ||
149 | |||
150 | static void | ||
151 | put_callchain_entry(int rctx) | ||
152 | { | ||
153 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
154 | } | ||
155 | |||
156 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
157 | { | ||
158 | int rctx; | ||
159 | struct perf_callchain_entry *entry; | ||
160 | |||
161 | |||
162 | entry = get_callchain_entry(&rctx); | ||
163 | if (rctx == -1) | ||
164 | return NULL; | ||
165 | |||
166 | if (!entry) | ||
167 | goto exit_put; | ||
168 | |||
169 | entry->nr = 0; | ||
170 | |||
171 | if (!user_mode(regs)) { | ||
172 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
173 | perf_callchain_kernel(entry, regs); | ||
174 | if (current->mm) | ||
175 | regs = task_pt_regs(current); | ||
176 | else | ||
177 | regs = NULL; | ||
178 | } | ||
179 | |||
180 | if (regs) { | ||
181 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
182 | perf_callchain_user(entry, regs); | ||
183 | } | ||
184 | |||
185 | exit_put: | ||
186 | put_callchain_entry(rctx); | ||
187 | |||
188 | return entry; | ||
189 | } | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e8457da6f95..1b5c081d8b9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * | 8 | * |
9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
10 | */ | 10 | */ |
@@ -128,7 +128,7 @@ enum event_type_t { | |||
128 | * perf_sched_events : >0 events exist | 128 | * perf_sched_events : >0 events exist |
129 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | 129 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu |
130 | */ | 130 | */ |
131 | struct jump_label_key perf_sched_events __read_mostly; | 131 | struct jump_label_key_deferred perf_sched_events __read_mostly; |
132 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 132 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
133 | 133 | ||
134 | static atomic_t nr_mmap_events __read_mostly; | 134 | static atomic_t nr_mmap_events __read_mostly; |
@@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
185 | static void update_context_time(struct perf_event_context *ctx); | 185 | static void update_context_time(struct perf_event_context *ctx); |
186 | static u64 perf_event_time(struct perf_event *event); | 186 | static u64 perf_event_time(struct perf_event *event); |
187 | 187 | ||
188 | static void ring_buffer_attach(struct perf_event *event, | ||
189 | struct ring_buffer *rb); | ||
190 | |||
188 | void __weak perf_event_print_debug(void) { } | 191 | void __weak perf_event_print_debug(void) { } |
189 | 192 | ||
190 | extern __weak const char *perf_pmu_name(void) | 193 | extern __weak const char *perf_pmu_name(void) |
@@ -812,7 +815,7 @@ static void update_event_times(struct perf_event *event) | |||
812 | * here. | 815 | * here. |
813 | */ | 816 | */ |
814 | if (is_cgroup_event(event)) | 817 | if (is_cgroup_event(event)) |
815 | run_end = perf_event_time(event); | 818 | run_end = perf_cgroup_event_time(event); |
816 | else if (ctx->is_active) | 819 | else if (ctx->is_active) |
817 | run_end = ctx->time; | 820 | run_end = ctx->time; |
818 | else | 821 | else |
@@ -1127,6 +1130,8 @@ event_sched_out(struct perf_event *event, | |||
1127 | if (!is_software_event(event)) | 1130 | if (!is_software_event(event)) |
1128 | cpuctx->active_oncpu--; | 1131 | cpuctx->active_oncpu--; |
1129 | ctx->nr_active--; | 1132 | ctx->nr_active--; |
1133 | if (event->attr.freq && event->attr.sample_freq) | ||
1134 | ctx->nr_freq--; | ||
1130 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1135 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
1131 | cpuctx->exclusive = 0; | 1136 | cpuctx->exclusive = 0; |
1132 | } | 1137 | } |
@@ -1322,6 +1327,7 @@ retry: | |||
1322 | } | 1327 | } |
1323 | raw_spin_unlock_irq(&ctx->lock); | 1328 | raw_spin_unlock_irq(&ctx->lock); |
1324 | } | 1329 | } |
1330 | EXPORT_SYMBOL_GPL(perf_event_disable); | ||
1325 | 1331 | ||
1326 | static void perf_set_shadow_time(struct perf_event *event, | 1332 | static void perf_set_shadow_time(struct perf_event *event, |
1327 | struct perf_event_context *ctx, | 1333 | struct perf_event_context *ctx, |
@@ -1403,6 +1409,8 @@ event_sched_in(struct perf_event *event, | |||
1403 | if (!is_software_event(event)) | 1409 | if (!is_software_event(event)) |
1404 | cpuctx->active_oncpu++; | 1410 | cpuctx->active_oncpu++; |
1405 | ctx->nr_active++; | 1411 | ctx->nr_active++; |
1412 | if (event->attr.freq && event->attr.sample_freq) | ||
1413 | ctx->nr_freq++; | ||
1406 | 1414 | ||
1407 | if (event->attr.exclusive) | 1415 | if (event->attr.exclusive) |
1408 | cpuctx->exclusive = 1; | 1416 | cpuctx->exclusive = 1; |
@@ -1659,8 +1667,7 @@ retry: | |||
1659 | * Note: this works for group members as well as group leaders | 1667 | * Note: this works for group members as well as group leaders |
1660 | * since the non-leader members' sibling_lists will be empty. | 1668 | * since the non-leader members' sibling_lists will be empty. |
1661 | */ | 1669 | */ |
1662 | static void __perf_event_mark_enabled(struct perf_event *event, | 1670 | static void __perf_event_mark_enabled(struct perf_event *event) |
1663 | struct perf_event_context *ctx) | ||
1664 | { | 1671 | { |
1665 | struct perf_event *sub; | 1672 | struct perf_event *sub; |
1666 | u64 tstamp = perf_event_time(event); | 1673 | u64 tstamp = perf_event_time(event); |
@@ -1698,7 +1705,7 @@ static int __perf_event_enable(void *info) | |||
1698 | */ | 1705 | */ |
1699 | perf_cgroup_set_timestamp(current, ctx); | 1706 | perf_cgroup_set_timestamp(current, ctx); |
1700 | 1707 | ||
1701 | __perf_event_mark_enabled(event, ctx); | 1708 | __perf_event_mark_enabled(event); |
1702 | 1709 | ||
1703 | if (!event_filter_match(event)) { | 1710 | if (!event_filter_match(event)) { |
1704 | if (is_cgroup_event(event)) | 1711 | if (is_cgroup_event(event)) |
@@ -1779,7 +1786,7 @@ void perf_event_enable(struct perf_event *event) | |||
1779 | 1786 | ||
1780 | retry: | 1787 | retry: |
1781 | if (!ctx->is_active) { | 1788 | if (!ctx->is_active) { |
1782 | __perf_event_mark_enabled(event, ctx); | 1789 | __perf_event_mark_enabled(event); |
1783 | goto out; | 1790 | goto out; |
1784 | } | 1791 | } |
1785 | 1792 | ||
@@ -1806,6 +1813,7 @@ retry: | |||
1806 | out: | 1813 | out: |
1807 | raw_spin_unlock_irq(&ctx->lock); | 1814 | raw_spin_unlock_irq(&ctx->lock); |
1808 | } | 1815 | } |
1816 | EXPORT_SYMBOL_GPL(perf_event_enable); | ||
1809 | 1817 | ||
1810 | int perf_event_refresh(struct perf_event *event, int refresh) | 1818 | int perf_event_refresh(struct perf_event *event, int refresh) |
1811 | { | 1819 | { |
@@ -2171,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2171 | */ | 2179 | */ |
2172 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2180 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2173 | 2181 | ||
2174 | perf_event_sched_in(cpuctx, ctx, task); | 2182 | if (ctx->nr_events) |
2183 | cpuctx->task_ctx = ctx; | ||
2175 | 2184 | ||
2176 | cpuctx->task_ctx = ctx; | 2185 | perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); |
2177 | 2186 | ||
2178 | perf_pmu_enable(ctx->pmu); | 2187 | perf_pmu_enable(ctx->pmu); |
2179 | perf_ctx_unlock(cpuctx, ctx); | 2188 | perf_ctx_unlock(cpuctx, ctx); |
@@ -2291,7 +2300,10 @@ do { \ | |||
2291 | return div64_u64(dividend, divisor); | 2300 | return div64_u64(dividend, divisor); |
2292 | } | 2301 | } |
2293 | 2302 | ||
2294 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 2303 | static DEFINE_PER_CPU(int, perf_throttled_count); |
2304 | static DEFINE_PER_CPU(u64, perf_throttled_seq); | ||
2305 | |||
2306 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) | ||
2295 | { | 2307 | { |
2296 | struct hw_perf_event *hwc = &event->hw; | 2308 | struct hw_perf_event *hwc = &event->hw; |
2297 | s64 period, sample_period; | 2309 | s64 period, sample_period; |
@@ -2310,19 +2322,40 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
2310 | hwc->sample_period = sample_period; | 2322 | hwc->sample_period = sample_period; |
2311 | 2323 | ||
2312 | if (local64_read(&hwc->period_left) > 8*sample_period) { | 2324 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
2313 | event->pmu->stop(event, PERF_EF_UPDATE); | 2325 | if (disable) |
2326 | event->pmu->stop(event, PERF_EF_UPDATE); | ||
2327 | |||
2314 | local64_set(&hwc->period_left, 0); | 2328 | local64_set(&hwc->period_left, 0); |
2315 | event->pmu->start(event, PERF_EF_RELOAD); | 2329 | |
2330 | if (disable) | ||
2331 | event->pmu->start(event, PERF_EF_RELOAD); | ||
2316 | } | 2332 | } |
2317 | } | 2333 | } |
2318 | 2334 | ||
2319 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | 2335 | /* |
2336 | * combine freq adjustment with unthrottling to avoid two passes over the | ||
2337 | * events. At the same time, make sure, having freq events does not change | ||
2338 | * the rate of unthrottling as that would introduce bias. | ||
2339 | */ | ||
2340 | static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | ||
2341 | int needs_unthr) | ||
2320 | { | 2342 | { |
2321 | struct perf_event *event; | 2343 | struct perf_event *event; |
2322 | struct hw_perf_event *hwc; | 2344 | struct hw_perf_event *hwc; |
2323 | u64 interrupts, now; | 2345 | u64 now, period = TICK_NSEC; |
2324 | s64 delta; | 2346 | s64 delta; |
2325 | 2347 | ||
2348 | /* | ||
2349 | * only need to iterate over all events iff: | ||
2350 | * - context have events in frequency mode (needs freq adjust) | ||
2351 | * - there are events to unthrottle on this cpu | ||
2352 | */ | ||
2353 | if (!(ctx->nr_freq || needs_unthr)) | ||
2354 | return; | ||
2355 | |||
2356 | raw_spin_lock(&ctx->lock); | ||
2357 | perf_pmu_disable(ctx->pmu); | ||
2358 | |||
2326 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 2359 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
2327 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2360 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2328 | continue; | 2361 | continue; |
@@ -2332,13 +2365,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
2332 | 2365 | ||
2333 | hwc = &event->hw; | 2366 | hwc = &event->hw; |
2334 | 2367 | ||
2335 | interrupts = hwc->interrupts; | 2368 | if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { |
2336 | hwc->interrupts = 0; | 2369 | hwc->interrupts = 0; |
2337 | |||
2338 | /* | ||
2339 | * unthrottle events on the tick | ||
2340 | */ | ||
2341 | if (interrupts == MAX_INTERRUPTS) { | ||
2342 | perf_log_throttle(event, 1); | 2370 | perf_log_throttle(event, 1); |
2343 | event->pmu->start(event, 0); | 2371 | event->pmu->start(event, 0); |
2344 | } | 2372 | } |
@@ -2346,14 +2374,30 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
2346 | if (!event->attr.freq || !event->attr.sample_freq) | 2374 | if (!event->attr.freq || !event->attr.sample_freq) |
2347 | continue; | 2375 | continue; |
2348 | 2376 | ||
2349 | event->pmu->read(event); | 2377 | /* |
2378 | * stop the event and update event->count | ||
2379 | */ | ||
2380 | event->pmu->stop(event, PERF_EF_UPDATE); | ||
2381 | |||
2350 | now = local64_read(&event->count); | 2382 | now = local64_read(&event->count); |
2351 | delta = now - hwc->freq_count_stamp; | 2383 | delta = now - hwc->freq_count_stamp; |
2352 | hwc->freq_count_stamp = now; | 2384 | hwc->freq_count_stamp = now; |
2353 | 2385 | ||
2386 | /* | ||
2387 | * restart the event | ||
2388 | * reload only if value has changed | ||
2389 | * we have stopped the event so tell that | ||
2390 | * to perf_adjust_period() to avoid stopping it | ||
2391 | * twice. | ||
2392 | */ | ||
2354 | if (delta > 0) | 2393 | if (delta > 0) |
2355 | perf_adjust_period(event, period, delta); | 2394 | perf_adjust_period(event, period, delta, false); |
2395 | |||
2396 | event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); | ||
2356 | } | 2397 | } |
2398 | |||
2399 | perf_pmu_enable(ctx->pmu); | ||
2400 | raw_spin_unlock(&ctx->lock); | ||
2357 | } | 2401 | } |
2358 | 2402 | ||
2359 | /* | 2403 | /* |
@@ -2376,7 +2420,6 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
2376 | */ | 2420 | */ |
2377 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | 2421 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) |
2378 | { | 2422 | { |
2379 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; | ||
2380 | struct perf_event_context *ctx = NULL; | 2423 | struct perf_event_context *ctx = NULL; |
2381 | int rotate = 0, remove = 1; | 2424 | int rotate = 0, remove = 1; |
2382 | 2425 | ||
@@ -2393,15 +2436,12 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2393 | rotate = 1; | 2436 | rotate = 1; |
2394 | } | 2437 | } |
2395 | 2438 | ||
2396 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
2397 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
2398 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
2399 | if (ctx) | ||
2400 | perf_ctx_adjust_freq(ctx, interval); | ||
2401 | |||
2402 | if (!rotate) | 2439 | if (!rotate) |
2403 | goto done; | 2440 | goto done; |
2404 | 2441 | ||
2442 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
2443 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
2444 | |||
2405 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2445 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2406 | if (ctx) | 2446 | if (ctx) |
2407 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | 2447 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); |
@@ -2412,22 +2452,33 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2412 | 2452 | ||
2413 | perf_event_sched_in(cpuctx, ctx, current); | 2453 | perf_event_sched_in(cpuctx, ctx, current); |
2414 | 2454 | ||
2455 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
2456 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2415 | done: | 2457 | done: |
2416 | if (remove) | 2458 | if (remove) |
2417 | list_del_init(&cpuctx->rotation_list); | 2459 | list_del_init(&cpuctx->rotation_list); |
2418 | |||
2419 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
2420 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2421 | } | 2460 | } |
2422 | 2461 | ||
2423 | void perf_event_task_tick(void) | 2462 | void perf_event_task_tick(void) |
2424 | { | 2463 | { |
2425 | struct list_head *head = &__get_cpu_var(rotation_list); | 2464 | struct list_head *head = &__get_cpu_var(rotation_list); |
2426 | struct perf_cpu_context *cpuctx, *tmp; | 2465 | struct perf_cpu_context *cpuctx, *tmp; |
2466 | struct perf_event_context *ctx; | ||
2467 | int throttled; | ||
2427 | 2468 | ||
2428 | WARN_ON(!irqs_disabled()); | 2469 | WARN_ON(!irqs_disabled()); |
2429 | 2470 | ||
2471 | __this_cpu_inc(perf_throttled_seq); | ||
2472 | throttled = __this_cpu_xchg(perf_throttled_count, 0); | ||
2473 | |||
2430 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | 2474 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { |
2475 | ctx = &cpuctx->ctx; | ||
2476 | perf_adjust_freq_unthr_context(ctx, throttled); | ||
2477 | |||
2478 | ctx = cpuctx->task_ctx; | ||
2479 | if (ctx) | ||
2480 | perf_adjust_freq_unthr_context(ctx, throttled); | ||
2481 | |||
2431 | if (cpuctx->jiffies_interval == 1 || | 2482 | if (cpuctx->jiffies_interval == 1 || |
2432 | !(jiffies % cpuctx->jiffies_interval)) | 2483 | !(jiffies % cpuctx->jiffies_interval)) |
2433 | perf_rotate_context(cpuctx); | 2484 | perf_rotate_context(cpuctx); |
@@ -2444,7 +2495,7 @@ static int event_enable_on_exec(struct perf_event *event, | |||
2444 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 2495 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
2445 | return 0; | 2496 | return 0; |
2446 | 2497 | ||
2447 | __perf_event_mark_enabled(event, ctx); | 2498 | __perf_event_mark_enabled(event); |
2448 | 2499 | ||
2449 | return 1; | 2500 | return 1; |
2450 | } | 2501 | } |
@@ -2476,13 +2527,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
2476 | raw_spin_lock(&ctx->lock); | 2527 | raw_spin_lock(&ctx->lock); |
2477 | task_ctx_sched_out(ctx); | 2528 | task_ctx_sched_out(ctx); |
2478 | 2529 | ||
2479 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 2530 | list_for_each_entry(event, &ctx->event_list, event_entry) { |
2480 | ret = event_enable_on_exec(event, ctx); | ||
2481 | if (ret) | ||
2482 | enabled = 1; | ||
2483 | } | ||
2484 | |||
2485 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { | ||
2486 | ret = event_enable_on_exec(event, ctx); | 2531 | ret = event_enable_on_exec(event, ctx); |
2487 | if (ret) | 2532 | if (ret) |
2488 | enabled = 1; | 2533 | enabled = 1; |
@@ -2570,215 +2615,6 @@ static u64 perf_event_read(struct perf_event *event) | |||
2570 | } | 2615 | } |
2571 | 2616 | ||
2572 | /* | 2617 | /* |
2573 | * Callchain support | ||
2574 | */ | ||
2575 | |||
2576 | struct callchain_cpus_entries { | ||
2577 | struct rcu_head rcu_head; | ||
2578 | struct perf_callchain_entry *cpu_entries[0]; | ||
2579 | }; | ||
2580 | |||
2581 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
2582 | static atomic_t nr_callchain_events; | ||
2583 | static DEFINE_MUTEX(callchain_mutex); | ||
2584 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
2585 | |||
2586 | |||
2587 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
2588 | struct pt_regs *regs) | ||
2589 | { | ||
2590 | } | ||
2591 | |||
2592 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
2593 | struct pt_regs *regs) | ||
2594 | { | ||
2595 | } | ||
2596 | |||
2597 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
2598 | { | ||
2599 | struct callchain_cpus_entries *entries; | ||
2600 | int cpu; | ||
2601 | |||
2602 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
2603 | |||
2604 | for_each_possible_cpu(cpu) | ||
2605 | kfree(entries->cpu_entries[cpu]); | ||
2606 | |||
2607 | kfree(entries); | ||
2608 | } | ||
2609 | |||
2610 | static void release_callchain_buffers(void) | ||
2611 | { | ||
2612 | struct callchain_cpus_entries *entries; | ||
2613 | |||
2614 | entries = callchain_cpus_entries; | ||
2615 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
2616 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
2617 | } | ||
2618 | |||
2619 | static int alloc_callchain_buffers(void) | ||
2620 | { | ||
2621 | int cpu; | ||
2622 | int size; | ||
2623 | struct callchain_cpus_entries *entries; | ||
2624 | |||
2625 | /* | ||
2626 | * We can't use the percpu allocation API for data that can be | ||
2627 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
2628 | * until that gets sorted out. | ||
2629 | */ | ||
2630 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); | ||
2631 | |||
2632 | entries = kzalloc(size, GFP_KERNEL); | ||
2633 | if (!entries) | ||
2634 | return -ENOMEM; | ||
2635 | |||
2636 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
2637 | |||
2638 | for_each_possible_cpu(cpu) { | ||
2639 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
2640 | cpu_to_node(cpu)); | ||
2641 | if (!entries->cpu_entries[cpu]) | ||
2642 | goto fail; | ||
2643 | } | ||
2644 | |||
2645 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
2646 | |||
2647 | return 0; | ||
2648 | |||
2649 | fail: | ||
2650 | for_each_possible_cpu(cpu) | ||
2651 | kfree(entries->cpu_entries[cpu]); | ||
2652 | kfree(entries); | ||
2653 | |||
2654 | return -ENOMEM; | ||
2655 | } | ||
2656 | |||
2657 | static int get_callchain_buffers(void) | ||
2658 | { | ||
2659 | int err = 0; | ||
2660 | int count; | ||
2661 | |||
2662 | mutex_lock(&callchain_mutex); | ||
2663 | |||
2664 | count = atomic_inc_return(&nr_callchain_events); | ||
2665 | if (WARN_ON_ONCE(count < 1)) { | ||
2666 | err = -EINVAL; | ||
2667 | goto exit; | ||
2668 | } | ||
2669 | |||
2670 | if (count > 1) { | ||
2671 | /* If the allocation failed, give up */ | ||
2672 | if (!callchain_cpus_entries) | ||
2673 | err = -ENOMEM; | ||
2674 | goto exit; | ||
2675 | } | ||
2676 | |||
2677 | err = alloc_callchain_buffers(); | ||
2678 | if (err) | ||
2679 | release_callchain_buffers(); | ||
2680 | exit: | ||
2681 | mutex_unlock(&callchain_mutex); | ||
2682 | |||
2683 | return err; | ||
2684 | } | ||
2685 | |||
2686 | static void put_callchain_buffers(void) | ||
2687 | { | ||
2688 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
2689 | release_callchain_buffers(); | ||
2690 | mutex_unlock(&callchain_mutex); | ||
2691 | } | ||
2692 | } | ||
2693 | |||
2694 | static int get_recursion_context(int *recursion) | ||
2695 | { | ||
2696 | int rctx; | ||
2697 | |||
2698 | if (in_nmi()) | ||
2699 | rctx = 3; | ||
2700 | else if (in_irq()) | ||
2701 | rctx = 2; | ||
2702 | else if (in_softirq()) | ||
2703 | rctx = 1; | ||
2704 | else | ||
2705 | rctx = 0; | ||
2706 | |||
2707 | if (recursion[rctx]) | ||
2708 | return -1; | ||
2709 | |||
2710 | recursion[rctx]++; | ||
2711 | barrier(); | ||
2712 | |||
2713 | return rctx; | ||
2714 | } | ||
2715 | |||
2716 | static inline void put_recursion_context(int *recursion, int rctx) | ||
2717 | { | ||
2718 | barrier(); | ||
2719 | recursion[rctx]--; | ||
2720 | } | ||
2721 | |||
2722 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
2723 | { | ||
2724 | int cpu; | ||
2725 | struct callchain_cpus_entries *entries; | ||
2726 | |||
2727 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
2728 | if (*rctx == -1) | ||
2729 | return NULL; | ||
2730 | |||
2731 | entries = rcu_dereference(callchain_cpus_entries); | ||
2732 | if (!entries) | ||
2733 | return NULL; | ||
2734 | |||
2735 | cpu = smp_processor_id(); | ||
2736 | |||
2737 | return &entries->cpu_entries[cpu][*rctx]; | ||
2738 | } | ||
2739 | |||
2740 | static void | ||
2741 | put_callchain_entry(int rctx) | ||
2742 | { | ||
2743 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
2744 | } | ||
2745 | |||
2746 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2747 | { | ||
2748 | int rctx; | ||
2749 | struct perf_callchain_entry *entry; | ||
2750 | |||
2751 | |||
2752 | entry = get_callchain_entry(&rctx); | ||
2753 | if (rctx == -1) | ||
2754 | return NULL; | ||
2755 | |||
2756 | if (!entry) | ||
2757 | goto exit_put; | ||
2758 | |||
2759 | entry->nr = 0; | ||
2760 | |||
2761 | if (!user_mode(regs)) { | ||
2762 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2763 | perf_callchain_kernel(entry, regs); | ||
2764 | if (current->mm) | ||
2765 | regs = task_pt_regs(current); | ||
2766 | else | ||
2767 | regs = NULL; | ||
2768 | } | ||
2769 | |||
2770 | if (regs) { | ||
2771 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2772 | perf_callchain_user(entry, regs); | ||
2773 | } | ||
2774 | |||
2775 | exit_put: | ||
2776 | put_callchain_entry(rctx); | ||
2777 | |||
2778 | return entry; | ||
2779 | } | ||
2780 | |||
2781 | /* | ||
2782 | * Initialize the perf_event context in a task_struct: | 2618 | * Initialize the perf_event context in a task_struct: |
2783 | */ | 2619 | */ |
2784 | static void __perf_event_init_context(struct perf_event_context *ctx) | 2620 | static void __perf_event_init_context(struct perf_event_context *ctx) |
@@ -2942,7 +2778,7 @@ static void free_event(struct perf_event *event) | |||
2942 | 2778 | ||
2943 | if (!event->parent) { | 2779 | if (!event->parent) { |
2944 | if (event->attach_state & PERF_ATTACH_TASK) | 2780 | if (event->attach_state & PERF_ATTACH_TASK) |
2945 | jump_label_dec(&perf_sched_events); | 2781 | jump_label_dec_deferred(&perf_sched_events); |
2946 | if (event->attr.mmap || event->attr.mmap_data) | 2782 | if (event->attr.mmap || event->attr.mmap_data) |
2947 | atomic_dec(&nr_mmap_events); | 2783 | atomic_dec(&nr_mmap_events); |
2948 | if (event->attr.comm) | 2784 | if (event->attr.comm) |
@@ -2953,7 +2789,7 @@ static void free_event(struct perf_event *event) | |||
2953 | put_callchain_buffers(); | 2789 | put_callchain_buffers(); |
2954 | if (is_cgroup_event(event)) { | 2790 | if (is_cgroup_event(event)) { |
2955 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | 2791 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); |
2956 | jump_label_dec(&perf_sched_events); | 2792 | jump_label_dec_deferred(&perf_sched_events); |
2957 | } | 2793 | } |
2958 | } | 2794 | } |
2959 | 2795 | ||
@@ -3190,12 +3026,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
3190 | struct ring_buffer *rb; | 3026 | struct ring_buffer *rb; |
3191 | unsigned int events = POLL_HUP; | 3027 | unsigned int events = POLL_HUP; |
3192 | 3028 | ||
3029 | /* | ||
3030 | * Race between perf_event_set_output() and perf_poll(): perf_poll() | ||
3031 | * grabs the rb reference but perf_event_set_output() overrides it. | ||
3032 | * Here is the timeline for two threads T1, T2: | ||
3033 | * t0: T1, rb = rcu_dereference(event->rb) | ||
3034 | * t1: T2, old_rb = event->rb | ||
3035 | * t2: T2, event->rb = new rb | ||
3036 | * t3: T2, ring_buffer_detach(old_rb) | ||
3037 | * t4: T1, ring_buffer_attach(rb1) | ||
3038 | * t5: T1, poll_wait(event->waitq) | ||
3039 | * | ||
3040 | * To avoid this problem, we grab mmap_mutex in perf_poll() | ||
3041 | * thereby ensuring that the assignment of the new ring buffer | ||
3042 | * and the detachment of the old buffer appear atomic to perf_poll() | ||
3043 | */ | ||
3044 | mutex_lock(&event->mmap_mutex); | ||
3045 | |||
3193 | rcu_read_lock(); | 3046 | rcu_read_lock(); |
3194 | rb = rcu_dereference(event->rb); | 3047 | rb = rcu_dereference(event->rb); |
3195 | if (rb) | 3048 | if (rb) { |
3049 | ring_buffer_attach(event, rb); | ||
3196 | events = atomic_xchg(&rb->poll, 0); | 3050 | events = atomic_xchg(&rb->poll, 0); |
3051 | } | ||
3197 | rcu_read_unlock(); | 3052 | rcu_read_unlock(); |
3198 | 3053 | ||
3054 | mutex_unlock(&event->mmap_mutex); | ||
3055 | |||
3199 | poll_wait(file, &event->waitq, wait); | 3056 | poll_wait(file, &event->waitq, wait); |
3200 | 3057 | ||
3201 | return events; | 3058 | return events; |
@@ -3496,6 +3353,53 @@ unlock: | |||
3496 | return ret; | 3353 | return ret; |
3497 | } | 3354 | } |
3498 | 3355 | ||
3356 | static void ring_buffer_attach(struct perf_event *event, | ||
3357 | struct ring_buffer *rb) | ||
3358 | { | ||
3359 | unsigned long flags; | ||
3360 | |||
3361 | if (!list_empty(&event->rb_entry)) | ||
3362 | return; | ||
3363 | |||
3364 | spin_lock_irqsave(&rb->event_lock, flags); | ||
3365 | if (!list_empty(&event->rb_entry)) | ||
3366 | goto unlock; | ||
3367 | |||
3368 | list_add(&event->rb_entry, &rb->event_list); | ||
3369 | unlock: | ||
3370 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3371 | } | ||
3372 | |||
3373 | static void ring_buffer_detach(struct perf_event *event, | ||
3374 | struct ring_buffer *rb) | ||
3375 | { | ||
3376 | unsigned long flags; | ||
3377 | |||
3378 | if (list_empty(&event->rb_entry)) | ||
3379 | return; | ||
3380 | |||
3381 | spin_lock_irqsave(&rb->event_lock, flags); | ||
3382 | list_del_init(&event->rb_entry); | ||
3383 | wake_up_all(&event->waitq); | ||
3384 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3385 | } | ||
3386 | |||
3387 | static void ring_buffer_wakeup(struct perf_event *event) | ||
3388 | { | ||
3389 | struct ring_buffer *rb; | ||
3390 | |||
3391 | rcu_read_lock(); | ||
3392 | rb = rcu_dereference(event->rb); | ||
3393 | if (!rb) | ||
3394 | goto unlock; | ||
3395 | |||
3396 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) | ||
3397 | wake_up_all(&event->waitq); | ||
3398 | |||
3399 | unlock: | ||
3400 | rcu_read_unlock(); | ||
3401 | } | ||
3402 | |||
3499 | static void rb_free_rcu(struct rcu_head *rcu_head) | 3403 | static void rb_free_rcu(struct rcu_head *rcu_head) |
3500 | { | 3404 | { |
3501 | struct ring_buffer *rb; | 3405 | struct ring_buffer *rb; |
@@ -3521,9 +3425,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
3521 | 3425 | ||
3522 | static void ring_buffer_put(struct ring_buffer *rb) | 3426 | static void ring_buffer_put(struct ring_buffer *rb) |
3523 | { | 3427 | { |
3428 | struct perf_event *event, *n; | ||
3429 | unsigned long flags; | ||
3430 | |||
3524 | if (!atomic_dec_and_test(&rb->refcount)) | 3431 | if (!atomic_dec_and_test(&rb->refcount)) |
3525 | return; | 3432 | return; |
3526 | 3433 | ||
3434 | spin_lock_irqsave(&rb->event_lock, flags); | ||
3435 | list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { | ||
3436 | list_del_init(&event->rb_entry); | ||
3437 | wake_up_all(&event->waitq); | ||
3438 | } | ||
3439 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3440 | |||
3527 | call_rcu(&rb->rcu_head, rb_free_rcu); | 3441 | call_rcu(&rb->rcu_head, rb_free_rcu); |
3528 | } | 3442 | } |
3529 | 3443 | ||
@@ -3546,6 +3460,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
3546 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 3460 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
3547 | vma->vm_mm->pinned_vm -= event->mmap_locked; | 3461 | vma->vm_mm->pinned_vm -= event->mmap_locked; |
3548 | rcu_assign_pointer(event->rb, NULL); | 3462 | rcu_assign_pointer(event->rb, NULL); |
3463 | ring_buffer_detach(event, rb); | ||
3549 | mutex_unlock(&event->mmap_mutex); | 3464 | mutex_unlock(&event->mmap_mutex); |
3550 | 3465 | ||
3551 | ring_buffer_put(rb); | 3466 | ring_buffer_put(rb); |
@@ -3700,7 +3615,7 @@ static const struct file_operations perf_fops = { | |||
3700 | 3615 | ||
3701 | void perf_event_wakeup(struct perf_event *event) | 3616 | void perf_event_wakeup(struct perf_event *event) |
3702 | { | 3617 | { |
3703 | wake_up_all(&event->waitq); | 3618 | ring_buffer_wakeup(event); |
3704 | 3619 | ||
3705 | if (event->pending_kill) { | 3620 | if (event->pending_kill) { |
3706 | kill_fasync(&event->fasync, SIGIO, event->pending_kill); | 3621 | kill_fasync(&event->fasync, SIGIO, event->pending_kill); |
@@ -4624,6 +4539,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
4624 | { | 4539 | { |
4625 | int events = atomic_read(&event->event_limit); | 4540 | int events = atomic_read(&event->event_limit); |
4626 | struct hw_perf_event *hwc = &event->hw; | 4541 | struct hw_perf_event *hwc = &event->hw; |
4542 | u64 seq; | ||
4627 | int ret = 0; | 4543 | int ret = 0; |
4628 | 4544 | ||
4629 | /* | 4545 | /* |
@@ -4633,14 +4549,20 @@ static int __perf_event_overflow(struct perf_event *event, | |||
4633 | if (unlikely(!is_sampling_event(event))) | 4549 | if (unlikely(!is_sampling_event(event))) |
4634 | return 0; | 4550 | return 0; |
4635 | 4551 | ||
4636 | if (unlikely(hwc->interrupts >= max_samples_per_tick)) { | 4552 | seq = __this_cpu_read(perf_throttled_seq); |
4637 | if (throttle) { | 4553 | if (seq != hwc->interrupts_seq) { |
4554 | hwc->interrupts_seq = seq; | ||
4555 | hwc->interrupts = 1; | ||
4556 | } else { | ||
4557 | hwc->interrupts++; | ||
4558 | if (unlikely(throttle | ||
4559 | && hwc->interrupts >= max_samples_per_tick)) { | ||
4560 | __this_cpu_inc(perf_throttled_count); | ||
4638 | hwc->interrupts = MAX_INTERRUPTS; | 4561 | hwc->interrupts = MAX_INTERRUPTS; |
4639 | perf_log_throttle(event, 0); | 4562 | perf_log_throttle(event, 0); |
4640 | ret = 1; | 4563 | ret = 1; |
4641 | } | 4564 | } |
4642 | } else | 4565 | } |
4643 | hwc->interrupts++; | ||
4644 | 4566 | ||
4645 | if (event->attr.freq) { | 4567 | if (event->attr.freq) { |
4646 | u64 now = perf_clock(); | 4568 | u64 now = perf_clock(); |
@@ -4649,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
4649 | hwc->freq_time_stamp = now; | 4571 | hwc->freq_time_stamp = now; |
4650 | 4572 | ||
4651 | if (delta > 0 && delta < 2*TICK_NSEC) | 4573 | if (delta > 0 && delta < 2*TICK_NSEC) |
4652 | perf_adjust_period(event, delta, hwc->last_period); | 4574 | perf_adjust_period(event, delta, hwc->last_period, true); |
4653 | } | 4575 | } |
4654 | 4576 | ||
4655 | /* | 4577 | /* |
@@ -4737,7 +4659,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4737 | struct hw_perf_event *hwc = &event->hw; | 4659 | struct hw_perf_event *hwc = &event->hw; |
4738 | int throttle = 0; | 4660 | int throttle = 0; |
4739 | 4661 | ||
4740 | data->period = event->hw.last_period; | ||
4741 | if (!overflow) | 4662 | if (!overflow) |
4742 | overflow = perf_swevent_set_period(event); | 4663 | overflow = perf_swevent_set_period(event); |
4743 | 4664 | ||
@@ -4771,6 +4692,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
4771 | if (!is_sampling_event(event)) | 4692 | if (!is_sampling_event(event)) |
4772 | return; | 4693 | return; |
4773 | 4694 | ||
4695 | if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { | ||
4696 | data->period = nr; | ||
4697 | return perf_swevent_overflow(event, 1, data, regs); | ||
4698 | } else | ||
4699 | data->period = event->hw.last_period; | ||
4700 | |||
4774 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4701 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
4775 | return perf_swevent_overflow(event, 1, data, regs); | 4702 | return perf_swevent_overflow(event, 1, data, regs); |
4776 | 4703 | ||
@@ -5283,7 +5210,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5283 | regs = get_irq_regs(); | 5210 | regs = get_irq_regs(); |
5284 | 5211 | ||
5285 | if (regs && !perf_exclude_event(event, regs)) { | 5212 | if (regs && !perf_exclude_event(event, regs)) { |
5286 | if (!(event->attr.exclude_idle && current->pid == 0)) | 5213 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5287 | if (perf_event_overflow(event, &data, regs)) | 5214 | if (perf_event_overflow(event, &data, regs)) |
5288 | ret = HRTIMER_NORESTART; | 5215 | ret = HRTIMER_NORESTART; |
5289 | } | 5216 | } |
@@ -5822,6 +5749,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5822 | INIT_LIST_HEAD(&event->group_entry); | 5749 | INIT_LIST_HEAD(&event->group_entry); |
5823 | INIT_LIST_HEAD(&event->event_entry); | 5750 | INIT_LIST_HEAD(&event->event_entry); |
5824 | INIT_LIST_HEAD(&event->sibling_list); | 5751 | INIT_LIST_HEAD(&event->sibling_list); |
5752 | INIT_LIST_HEAD(&event->rb_entry); | ||
5753 | |||
5825 | init_waitqueue_head(&event->waitq); | 5754 | init_waitqueue_head(&event->waitq); |
5826 | init_irq_work(&event->pending, perf_pending_event); | 5755 | init_irq_work(&event->pending, perf_pending_event); |
5827 | 5756 | ||
@@ -5896,7 +5825,7 @@ done: | |||
5896 | 5825 | ||
5897 | if (!event->parent) { | 5826 | if (!event->parent) { |
5898 | if (event->attach_state & PERF_ATTACH_TASK) | 5827 | if (event->attach_state & PERF_ATTACH_TASK) |
5899 | jump_label_inc(&perf_sched_events); | 5828 | jump_label_inc(&perf_sched_events.key); |
5900 | if (event->attr.mmap || event->attr.mmap_data) | 5829 | if (event->attr.mmap || event->attr.mmap_data) |
5901 | atomic_inc(&nr_mmap_events); | 5830 | atomic_inc(&nr_mmap_events); |
5902 | if (event->attr.comm) | 5831 | if (event->attr.comm) |
@@ -6028,6 +5957,8 @@ set: | |||
6028 | 5957 | ||
6029 | old_rb = event->rb; | 5958 | old_rb = event->rb; |
6030 | rcu_assign_pointer(event->rb, rb); | 5959 | rcu_assign_pointer(event->rb, rb); |
5960 | if (old_rb) | ||
5961 | ring_buffer_detach(event, old_rb); | ||
6031 | ret = 0; | 5962 | ret = 0; |
6032 | unlock: | 5963 | unlock: |
6033 | mutex_unlock(&event->mmap_mutex); | 5964 | mutex_unlock(&event->mmap_mutex); |
@@ -6132,7 +6063,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6132 | * - that may need work on context switch | 6063 | * - that may need work on context switch |
6133 | */ | 6064 | */ |
6134 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | 6065 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); |
6135 | jump_label_inc(&perf_sched_events); | 6066 | jump_label_inc(&perf_sched_events.key); |
6136 | } | 6067 | } |
6137 | 6068 | ||
6138 | /* | 6069 | /* |
@@ -6978,6 +6909,9 @@ void __init perf_event_init(void) | |||
6978 | 6909 | ||
6979 | ret = init_hw_breakpoint(); | 6910 | ret = init_hw_breakpoint(); |
6980 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | 6911 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
6912 | |||
6913 | /* do not patch jump label more than once per second */ | ||
6914 | jump_label_rate_limit(&perf_sched_events, HZ); | ||
6981 | } | 6915 | } |
6982 | 6916 | ||
6983 | static int __init perf_event_sysfs_init(void) | 6917 | static int __init perf_event_sysfs_init(void) |
@@ -7044,10 +6978,13 @@ static int __perf_cgroup_move(void *info) | |||
7044 | return 0; | 6978 | return 0; |
7045 | } | 6979 | } |
7046 | 6980 | ||
7047 | static void | 6981 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
7048 | perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) | 6982 | struct cgroup_taskset *tset) |
7049 | { | 6983 | { |
7050 | task_function_call(task, __perf_cgroup_move, task); | 6984 | struct task_struct *task; |
6985 | |||
6986 | cgroup_taskset_for_each(task, cgrp, tset) | ||
6987 | task_function_call(task, __perf_cgroup_move, task); | ||
7051 | } | 6988 | } |
7052 | 6989 | ||
7053 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | 6990 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
@@ -7061,7 +6998,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7061 | if (!(task->flags & PF_EXITING)) | 6998 | if (!(task->flags & PF_EXITING)) |
7062 | return; | 6999 | return; |
7063 | 7000 | ||
7064 | perf_cgroup_attach_task(cgrp, task); | 7001 | task_function_call(task, __perf_cgroup_move, task); |
7065 | } | 7002 | } |
7066 | 7003 | ||
7067 | struct cgroup_subsys perf_subsys = { | 7004 | struct cgroup_subsys perf_subsys = { |
@@ -7070,6 +7007,6 @@ struct cgroup_subsys perf_subsys = { | |||
7070 | .create = perf_cgroup_create, | 7007 | .create = perf_cgroup_create, |
7071 | .destroy = perf_cgroup_destroy, | 7008 | .destroy = perf_cgroup_destroy, |
7072 | .exit = perf_cgroup_exit, | 7009 | .exit = perf_cgroup_exit, |
7073 | .attach_task = perf_cgroup_attach_task, | 7010 | .attach = perf_cgroup_attach, |
7074 | }; | 7011 | }; |
7075 | #endif /* CONFIG_CGROUP_PERF */ | 7012 | #endif /* CONFIG_CGROUP_PERF */ |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..ee706ce44aa0 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -651,10 +651,10 @@ int __init init_hw_breakpoint(void) | |||
651 | 651 | ||
652 | err_alloc: | 652 | err_alloc: |
653 | for_each_possible_cpu(err_cpu) { | 653 | for_each_possible_cpu(err_cpu) { |
654 | if (err_cpu == cpu) | ||
655 | break; | ||
656 | for (i = 0; i < TYPE_MAX; i++) | 654 | for (i = 0; i < TYPE_MAX; i++) |
657 | kfree(per_cpu(nr_task_bp_pinned[i], cpu)); | 655 | kfree(per_cpu(nr_task_bp_pinned[i], cpu)); |
656 | if (err_cpu == cpu) | ||
657 | break; | ||
658 | } | 658 | } |
659 | 659 | ||
660 | return -ENOMEM; | 660 | return -ENOMEM; |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116c..b0b107f90afc 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -1,6 +1,10 @@ | |||
1 | #ifndef _KERNEL_EVENTS_INTERNAL_H | 1 | #ifndef _KERNEL_EVENTS_INTERNAL_H |
2 | #define _KERNEL_EVENTS_INTERNAL_H | 2 | #define _KERNEL_EVENTS_INTERNAL_H |
3 | 3 | ||
4 | #include <linux/hardirq.h> | ||
5 | |||
6 | /* Buffer handling */ | ||
7 | |||
4 | #define RING_BUFFER_WRITABLE 0x01 | 8 | #define RING_BUFFER_WRITABLE 0x01 |
5 | 9 | ||
6 | struct ring_buffer { | 10 | struct ring_buffer { |
@@ -22,6 +26,9 @@ struct ring_buffer { | |||
22 | local_t lost; /* nr records lost */ | 26 | local_t lost; /* nr records lost */ |
23 | 27 | ||
24 | long watermark; /* wakeup watermark */ | 28 | long watermark; /* wakeup watermark */ |
29 | /* poll crap */ | ||
30 | spinlock_t event_lock; | ||
31 | struct list_head event_list; | ||
25 | 32 | ||
26 | struct perf_event_mmap_page *user_page; | 33 | struct perf_event_mmap_page *user_page; |
27 | void *data_pages[0]; | 34 | void *data_pages[0]; |
@@ -64,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb) | |||
64 | } | 71 | } |
65 | #endif | 72 | #endif |
66 | 73 | ||
67 | static unsigned long perf_data_size(struct ring_buffer *rb) | 74 | static inline unsigned long perf_data_size(struct ring_buffer *rb) |
68 | { | 75 | { |
69 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 76 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); |
70 | } | 77 | } |
@@ -93,4 +100,37 @@ __output_copy(struct perf_output_handle *handle, | |||
93 | } while (len); | 100 | } while (len); |
94 | } | 101 | } |
95 | 102 | ||
103 | /* Callchain handling */ | ||
104 | extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); | ||
105 | extern int get_callchain_buffers(void); | ||
106 | extern void put_callchain_buffers(void); | ||
107 | |||
108 | static inline int get_recursion_context(int *recursion) | ||
109 | { | ||
110 | int rctx; | ||
111 | |||
112 | if (in_nmi()) | ||
113 | rctx = 3; | ||
114 | else if (in_irq()) | ||
115 | rctx = 2; | ||
116 | else if (in_softirq()) | ||
117 | rctx = 1; | ||
118 | else | ||
119 | rctx = 0; | ||
120 | |||
121 | if (recursion[rctx]) | ||
122 | return -1; | ||
123 | |||
124 | recursion[rctx]++; | ||
125 | barrier(); | ||
126 | |||
127 | return rctx; | ||
128 | } | ||
129 | |||
130 | static inline void put_recursion_context(int *recursion, int rctx) | ||
131 | { | ||
132 | barrier(); | ||
133 | recursion[rctx]--; | ||
134 | } | ||
135 | |||
96 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ | 136 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a29205cc0f..6ddaba43fb7a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * | 8 | * |
9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
10 | */ | 10 | */ |
@@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | |||
209 | rb->writable = 1; | 209 | rb->writable = 1; |
210 | 210 | ||
211 | atomic_set(&rb->refcount, 1); | 211 | atomic_set(&rb->refcount, 1); |
212 | |||
213 | INIT_LIST_HEAD(&rb->event_list); | ||
214 | spin_lock_init(&rb->event_lock); | ||
212 | } | 215 | } |
213 | 216 | ||
214 | #ifndef CONFIG_PERF_USE_VMALLOC | 217 | #ifndef CONFIG_PERF_USE_VMALLOC |
diff --git a/kernel/exit.c b/kernel/exit.c index d0b7d988f873..4b4042f9bc6a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <trace/events/sched.h> | 51 | #include <trace/events/sched.h> |
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | 53 | #include <linux/oom.h> |
54 | #include <linux/writeback.h> | ||
54 | 55 | ||
55 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
56 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
@@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
121 | * We won't ever get here for the group leader, since it | 122 | * We won't ever get here for the group leader, since it |
122 | * will have been the last reference on the signal_struct. | 123 | * will have been the last reference on the signal_struct. |
123 | */ | 124 | */ |
124 | sig->utime = cputime_add(sig->utime, tsk->utime); | 125 | sig->utime += tsk->utime; |
125 | sig->stime = cputime_add(sig->stime, tsk->stime); | 126 | sig->stime += tsk->stime; |
126 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | 127 | sig->gtime += tsk->gtime; |
127 | sig->min_flt += tsk->min_flt; | 128 | sig->min_flt += tsk->min_flt; |
128 | sig->maj_flt += tsk->maj_flt; | 129 | sig->maj_flt += tsk->maj_flt; |
129 | sig->nvcsw += tsk->nvcsw; | 130 | sig->nvcsw += tsk->nvcsw; |
@@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk) | |||
679 | tsk->mm = NULL; | 680 | tsk->mm = NULL; |
680 | up_read(&mm->mmap_sem); | 681 | up_read(&mm->mmap_sem); |
681 | enter_lazy_tlb(mm, current); | 682 | enter_lazy_tlb(mm, current); |
682 | /* We don't want this task to be frozen prematurely */ | ||
683 | clear_freeze_flag(tsk); | ||
684 | task_unlock(tsk); | 683 | task_unlock(tsk); |
685 | mm_update_next_owner(mm); | 684 | mm_update_next_owner(mm); |
686 | mmput(mm); | 685 | mmput(mm); |
@@ -888,7 +887,7 @@ static void check_stack_usage(void) | |||
888 | static inline void check_stack_usage(void) {} | 887 | static inline void check_stack_usage(void) {} |
889 | #endif | 888 | #endif |
890 | 889 | ||
891 | NORET_TYPE void do_exit(long code) | 890 | void do_exit(long code) |
892 | { | 891 | { |
893 | struct task_struct *tsk = current; | 892 | struct task_struct *tsk = current; |
894 | int group_dead; | 893 | int group_dead; |
@@ -965,8 +964,7 @@ NORET_TYPE void do_exit(long code) | |||
965 | acct_collect(code, group_dead); | 964 | acct_collect(code, group_dead); |
966 | if (group_dead) | 965 | if (group_dead) |
967 | tty_audit_exit(); | 966 | tty_audit_exit(); |
968 | if (unlikely(tsk->audit_context)) | 967 | audit_free(tsk); |
969 | audit_free(tsk); | ||
970 | 968 | ||
971 | tsk->exit_code = code; | 969 | tsk->exit_code = code; |
972 | taskstats_exit(tsk, group_dead); | 970 | taskstats_exit(tsk, group_dead); |
@@ -1037,9 +1035,28 @@ NORET_TYPE void do_exit(long code) | |||
1037 | validate_creds_for_do_exit(tsk); | 1035 | validate_creds_for_do_exit(tsk); |
1038 | 1036 | ||
1039 | preempt_disable(); | 1037 | preempt_disable(); |
1038 | if (tsk->nr_dirtied) | ||
1039 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); | ||
1040 | exit_rcu(); | 1040 | exit_rcu(); |
1041 | |||
1042 | /* | ||
1043 | * The setting of TASK_RUNNING by try_to_wake_up() may be delayed | ||
1044 | * when the following two conditions become true. | ||
1045 | * - There is race condition of mmap_sem (It is acquired by | ||
1046 | * exit_mm()), and | ||
1047 | * - SMI occurs before setting TASK_RUNNING. | ||
1048 | * (or hypervisor of virtual machine switches to other guest) | ||
1049 | * As a result, we may become TASK_RUNNING after becoming TASK_DEAD | ||
1050 | * | ||
1051 | * To avoid it, we have to wait for releasing tsk->pi_lock which | ||
1052 | * is held by try_to_wake_up() | ||
1053 | */ | ||
1054 | smp_mb(); | ||
1055 | raw_spin_unlock_wait(&tsk->pi_lock); | ||
1056 | |||
1041 | /* causes final put_task_struct in finish_task_switch(). */ | 1057 | /* causes final put_task_struct in finish_task_switch(). */ |
1042 | tsk->state = TASK_DEAD; | 1058 | tsk->state = TASK_DEAD; |
1059 | tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
1043 | schedule(); | 1060 | schedule(); |
1044 | BUG(); | 1061 | BUG(); |
1045 | /* Avoid "noreturn function does return". */ | 1062 | /* Avoid "noreturn function does return". */ |
@@ -1049,7 +1066,7 @@ NORET_TYPE void do_exit(long code) | |||
1049 | 1066 | ||
1050 | EXPORT_SYMBOL_GPL(do_exit); | 1067 | EXPORT_SYMBOL_GPL(do_exit); |
1051 | 1068 | ||
1052 | NORET_TYPE void complete_and_exit(struct completion *comp, long code) | 1069 | void complete_and_exit(struct completion *comp, long code) |
1053 | { | 1070 | { |
1054 | if (comp) | 1071 | if (comp) |
1055 | complete(comp); | 1072 | complete(comp); |
@@ -1068,7 +1085,7 @@ SYSCALL_DEFINE1(exit, int, error_code) | |||
1068 | * Take down every thread in the group. This is called by fatal signals | 1085 | * Take down every thread in the group. This is called by fatal signals |
1069 | * as well as by sys_exit_group (below). | 1086 | * as well as by sys_exit_group (below). |
1070 | */ | 1087 | */ |
1071 | NORET_TYPE void | 1088 | void |
1072 | do_group_exit(int exit_code) | 1089 | do_group_exit(int exit_code) |
1073 | { | 1090 | { |
1074 | struct signal_struct *sig = current->signal; | 1091 | struct signal_struct *sig = current->signal; |
@@ -1255,19 +1272,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1255 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1272 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1256 | psig = p->real_parent->signal; | 1273 | psig = p->real_parent->signal; |
1257 | sig = p->signal; | 1274 | sig = p->signal; |
1258 | psig->cutime = | 1275 | psig->cutime += tgutime + sig->cutime; |
1259 | cputime_add(psig->cutime, | 1276 | psig->cstime += tgstime + sig->cstime; |
1260 | cputime_add(tgutime, | 1277 | psig->cgtime += p->gtime + sig->gtime + sig->cgtime; |
1261 | sig->cutime)); | ||
1262 | psig->cstime = | ||
1263 | cputime_add(psig->cstime, | ||
1264 | cputime_add(tgstime, | ||
1265 | sig->cstime)); | ||
1266 | psig->cgtime = | ||
1267 | cputime_add(psig->cgtime, | ||
1268 | cputime_add(p->gtime, | ||
1269 | cputime_add(sig->gtime, | ||
1270 | sig->cgtime))); | ||
1271 | psig->cmin_flt += | 1278 | psig->cmin_flt += |
1272 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1279 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1273 | psig->cmaj_flt += | 1280 | psig->cmaj_flt += |
@@ -1540,8 +1547,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
1540 | } | 1547 | } |
1541 | 1548 | ||
1542 | /* dead body doesn't have much to contribute */ | 1549 | /* dead body doesn't have much to contribute */ |
1543 | if (p->exit_state == EXIT_DEAD) | 1550 | if (unlikely(p->exit_state == EXIT_DEAD)) { |
1551 | /* | ||
1552 | * But do not ignore this task until the tracer does | ||
1553 | * wait_task_zombie()->do_notify_parent(). | ||
1554 | */ | ||
1555 | if (likely(!ptrace) && unlikely(ptrace_reparented(p))) | ||
1556 | wo->notask_error = 0; | ||
1544 | return 0; | 1557 | return 0; |
1558 | } | ||
1545 | 1559 | ||
1546 | /* slay zombie? */ | 1560 | /* slay zombie? */ |
1547 | if (p->exit_state == EXIT_ZOMBIE) { | 1561 | if (p->exit_state == EXIT_ZOMBIE) { |
diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d088..e2cd3e2a5ae8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -66,6 +66,7 @@ | |||
66 | #include <linux/user-return-notifier.h> | 66 | #include <linux/user-return-notifier.h> |
67 | #include <linux/oom.h> | 67 | #include <linux/oom.h> |
68 | #include <linux/khugepaged.h> | 68 | #include <linux/khugepaged.h> |
69 | #include <linux/signalfd.h> | ||
69 | 70 | ||
70 | #include <asm/pgtable.h> | 71 | #include <asm/pgtable.h> |
71 | #include <asm/pgalloc.h> | 72 | #include <asm/pgalloc.h> |
@@ -76,6 +77,9 @@ | |||
76 | 77 | ||
77 | #include <trace/events/sched.h> | 78 | #include <trace/events/sched.h> |
78 | 79 | ||
80 | #define CREATE_TRACE_POINTS | ||
81 | #include <trace/events/task.h> | ||
82 | |||
79 | /* | 83 | /* |
80 | * Protected counters by write_lock_irq(&tasklist_lock) | 84 | * Protected counters by write_lock_irq(&tasklist_lock) |
81 | */ | 85 | */ |
@@ -644,6 +648,26 @@ struct mm_struct *get_task_mm(struct task_struct *task) | |||
644 | } | 648 | } |
645 | EXPORT_SYMBOL_GPL(get_task_mm); | 649 | EXPORT_SYMBOL_GPL(get_task_mm); |
646 | 650 | ||
651 | struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) | ||
652 | { | ||
653 | struct mm_struct *mm; | ||
654 | int err; | ||
655 | |||
656 | err = mutex_lock_killable(&task->signal->cred_guard_mutex); | ||
657 | if (err) | ||
658 | return ERR_PTR(err); | ||
659 | |||
660 | mm = get_task_mm(task); | ||
661 | if (mm && mm != current->mm && | ||
662 | !ptrace_may_access(task, mode)) { | ||
663 | mmput(mm); | ||
664 | mm = ERR_PTR(-EACCES); | ||
665 | } | ||
666 | mutex_unlock(&task->signal->cred_guard_mutex); | ||
667 | |||
668 | return mm; | ||
669 | } | ||
670 | |||
647 | /* Please note the differences between mmput and mm_release. | 671 | /* Please note the differences between mmput and mm_release. |
648 | * mmput is called whenever we stop holding onto a mm_struct, | 672 | * mmput is called whenever we stop holding onto a mm_struct, |
649 | * error success whatever. | 673 | * error success whatever. |
@@ -870,6 +894,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
870 | { | 894 | { |
871 | #ifdef CONFIG_BLOCK | 895 | #ifdef CONFIG_BLOCK |
872 | struct io_context *ioc = current->io_context; | 896 | struct io_context *ioc = current->io_context; |
897 | struct io_context *new_ioc; | ||
873 | 898 | ||
874 | if (!ioc) | 899 | if (!ioc) |
875 | return 0; | 900 | return 0; |
@@ -881,11 +906,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
881 | if (unlikely(!tsk->io_context)) | 906 | if (unlikely(!tsk->io_context)) |
882 | return -ENOMEM; | 907 | return -ENOMEM; |
883 | } else if (ioprio_valid(ioc->ioprio)) { | 908 | } else if (ioprio_valid(ioc->ioprio)) { |
884 | tsk->io_context = alloc_io_context(GFP_KERNEL, -1); | 909 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); |
885 | if (unlikely(!tsk->io_context)) | 910 | if (unlikely(!new_ioc)) |
886 | return -ENOMEM; | 911 | return -ENOMEM; |
887 | 912 | ||
888 | tsk->io_context->ioprio = ioc->ioprio; | 913 | new_ioc->ioprio = ioc->ioprio; |
914 | put_io_context(new_ioc); | ||
889 | } | 915 | } |
890 | #endif | 916 | #endif |
891 | return 0; | 917 | return 0; |
@@ -910,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | |||
910 | 936 | ||
911 | void __cleanup_sighand(struct sighand_struct *sighand) | 937 | void __cleanup_sighand(struct sighand_struct *sighand) |
912 | { | 938 | { |
913 | if (atomic_dec_and_test(&sighand->count)) | 939 | if (atomic_dec_and_test(&sighand->count)) { |
940 | signalfd_cleanup(sighand); | ||
914 | kmem_cache_free(sighand_cachep, sighand); | 941 | kmem_cache_free(sighand_cachep, sighand); |
942 | } | ||
915 | } | 943 | } |
916 | 944 | ||
917 | 945 | ||
@@ -972,7 +1000,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
972 | sched_autogroup_fork(sig); | 1000 | sched_autogroup_fork(sig); |
973 | 1001 | ||
974 | #ifdef CONFIG_CGROUPS | 1002 | #ifdef CONFIG_CGROUPS |
975 | init_rwsem(&sig->threadgroup_fork_lock); | 1003 | init_rwsem(&sig->group_rwsem); |
976 | #endif | 1004 | #endif |
977 | 1005 | ||
978 | sig->oom_adj = current->signal->oom_adj; | 1006 | sig->oom_adj = current->signal->oom_adj; |
@@ -992,7 +1020,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
992 | new_flags |= PF_FORKNOEXEC; | 1020 | new_flags |= PF_FORKNOEXEC; |
993 | new_flags |= PF_STARTING; | 1021 | new_flags |= PF_STARTING; |
994 | p->flags = new_flags; | 1022 | p->flags = new_flags; |
995 | clear_freeze_flag(p); | ||
996 | } | 1023 | } |
997 | 1024 | ||
998 | SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) | 1025 | SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) |
@@ -1023,8 +1050,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | |||
1023 | */ | 1050 | */ |
1024 | static void posix_cpu_timers_init(struct task_struct *tsk) | 1051 | static void posix_cpu_timers_init(struct task_struct *tsk) |
1025 | { | 1052 | { |
1026 | tsk->cputime_expires.prof_exp = cputime_zero; | 1053 | tsk->cputime_expires.prof_exp = 0; |
1027 | tsk->cputime_expires.virt_exp = cputime_zero; | 1054 | tsk->cputime_expires.virt_exp = 0; |
1028 | tsk->cputime_expires.sched_exp = 0; | 1055 | tsk->cputime_expires.sched_exp = 0; |
1029 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); | 1056 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
1030 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); | 1057 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
@@ -1132,14 +1159,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1132 | 1159 | ||
1133 | init_sigpending(&p->pending); | 1160 | init_sigpending(&p->pending); |
1134 | 1161 | ||
1135 | p->utime = cputime_zero; | 1162 | p->utime = p->stime = p->gtime = 0; |
1136 | p->stime = cputime_zero; | 1163 | p->utimescaled = p->stimescaled = 0; |
1137 | p->gtime = cputime_zero; | ||
1138 | p->utimescaled = cputime_zero; | ||
1139 | p->stimescaled = cputime_zero; | ||
1140 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1164 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
1141 | p->prev_utime = cputime_zero; | 1165 | p->prev_utime = p->prev_stime = 0; |
1142 | p->prev_stime = cputime_zero; | ||
1143 | #endif | 1166 | #endif |
1144 | #if defined(SPLIT_RSS_COUNTING) | 1167 | #if defined(SPLIT_RSS_COUNTING) |
1145 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | 1168 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
@@ -1158,7 +1181,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1158 | p->io_context = NULL; | 1181 | p->io_context = NULL; |
1159 | p->audit_context = NULL; | 1182 | p->audit_context = NULL; |
1160 | if (clone_flags & CLONE_THREAD) | 1183 | if (clone_flags & CLONE_THREAD) |
1161 | threadgroup_fork_read_lock(current); | 1184 | threadgroup_change_begin(current); |
1162 | cgroup_fork(p); | 1185 | cgroup_fork(p); |
1163 | #ifdef CONFIG_NUMA | 1186 | #ifdef CONFIG_NUMA |
1164 | p->mempolicy = mpol_dup(p->mempolicy); | 1187 | p->mempolicy = mpol_dup(p->mempolicy); |
@@ -1296,6 +1319,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1296 | 1319 | ||
1297 | p->nr_dirtied = 0; | 1320 | p->nr_dirtied = 0; |
1298 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); | 1321 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); |
1322 | p->dirty_paused_when = 0; | ||
1299 | 1323 | ||
1300 | /* | 1324 | /* |
1301 | * Ok, make it visible to the rest of the system. | 1325 | * Ok, make it visible to the rest of the system. |
@@ -1373,8 +1397,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1373 | proc_fork_connector(p); | 1397 | proc_fork_connector(p); |
1374 | cgroup_post_fork(p); | 1398 | cgroup_post_fork(p); |
1375 | if (clone_flags & CLONE_THREAD) | 1399 | if (clone_flags & CLONE_THREAD) |
1376 | threadgroup_fork_read_unlock(current); | 1400 | threadgroup_change_end(current); |
1377 | perf_event_fork(p); | 1401 | perf_event_fork(p); |
1402 | |||
1403 | trace_task_newtask(p, clone_flags); | ||
1404 | |||
1378 | return p; | 1405 | return p; |
1379 | 1406 | ||
1380 | bad_fork_free_pid: | 1407 | bad_fork_free_pid: |
@@ -1408,7 +1435,7 @@ bad_fork_cleanup_policy: | |||
1408 | bad_fork_cleanup_cgroup: | 1435 | bad_fork_cleanup_cgroup: |
1409 | #endif | 1436 | #endif |
1410 | if (clone_flags & CLONE_THREAD) | 1437 | if (clone_flags & CLONE_THREAD) |
1411 | threadgroup_fork_read_unlock(current); | 1438 | threadgroup_change_end(current); |
1412 | cgroup_exit(p, cgroup_callbacks_done); | 1439 | cgroup_exit(p, cgroup_callbacks_done); |
1413 | delayacct_tsk_free(p); | 1440 | delayacct_tsk_free(p); |
1414 | module_put(task_thread_info(p)->exec_domain->module); | 1441 | module_put(task_thread_info(p)->exec_domain->module); |
@@ -1523,8 +1550,6 @@ long do_fork(unsigned long clone_flags, | |||
1523 | init_completion(&vfork); | 1550 | init_completion(&vfork); |
1524 | } | 1551 | } |
1525 | 1552 | ||
1526 | audit_finish_fork(p); | ||
1527 | |||
1528 | /* | 1553 | /* |
1529 | * We set PF_STARTING at creation in case tracing wants to | 1554 | * We set PF_STARTING at creation in case tracing wants to |
1530 | * use this to distinguish a fully live task from one that | 1555 | * use this to distinguish a fully live task from one that |
diff --git a/kernel/freezer.c b/kernel/freezer.c index 7be56c534397..9815b8d1eed5 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -9,101 +9,114 @@ | |||
9 | #include <linux/export.h> | 9 | #include <linux/export.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/freezer.h> | 11 | #include <linux/freezer.h> |
12 | #include <linux/kthread.h> | ||
12 | 13 | ||
13 | /* | 14 | /* total number of freezing conditions in effect */ |
14 | * freezing is complete, mark current process as frozen | 15 | atomic_t system_freezing_cnt = ATOMIC_INIT(0); |
16 | EXPORT_SYMBOL(system_freezing_cnt); | ||
17 | |||
18 | /* indicate whether PM freezing is in effect, protected by pm_mutex */ | ||
19 | bool pm_freezing; | ||
20 | bool pm_nosig_freezing; | ||
21 | |||
22 | /* protects freezing and frozen transitions */ | ||
23 | static DEFINE_SPINLOCK(freezer_lock); | ||
24 | |||
25 | /** | ||
26 | * freezing_slow_path - slow path for testing whether a task needs to be frozen | ||
27 | * @p: task to be tested | ||
28 | * | ||
29 | * This function is called by freezing() if system_freezing_cnt isn't zero | ||
30 | * and tests whether @p needs to enter and stay in frozen state. Can be | ||
31 | * called under any context. The freezers are responsible for ensuring the | ||
32 | * target tasks see the updated state. | ||
15 | */ | 33 | */ |
16 | static inline void frozen_process(void) | 34 | bool freezing_slow_path(struct task_struct *p) |
17 | { | 35 | { |
18 | if (!unlikely(current->flags & PF_NOFREEZE)) { | 36 | if (p->flags & PF_NOFREEZE) |
19 | current->flags |= PF_FROZEN; | 37 | return false; |
20 | smp_wmb(); | 38 | |
21 | } | 39 | if (pm_nosig_freezing || cgroup_freezing(p)) |
22 | clear_freeze_flag(current); | 40 | return true; |
41 | |||
42 | if (pm_freezing && !(p->flags & PF_KTHREAD)) | ||
43 | return true; | ||
44 | |||
45 | return false; | ||
23 | } | 46 | } |
47 | EXPORT_SYMBOL(freezing_slow_path); | ||
24 | 48 | ||
25 | /* Refrigerator is place where frozen processes are stored :-). */ | 49 | /* Refrigerator is place where frozen processes are stored :-). */ |
26 | void refrigerator(void) | 50 | bool __refrigerator(bool check_kthr_stop) |
27 | { | 51 | { |
28 | /* Hmm, should we be allowed to suspend when there are realtime | 52 | /* Hmm, should we be allowed to suspend when there are realtime |
29 | processes around? */ | 53 | processes around? */ |
30 | long save; | 54 | bool was_frozen = false; |
55 | long save = current->state; | ||
31 | 56 | ||
32 | task_lock(current); | ||
33 | if (freezing(current)) { | ||
34 | frozen_process(); | ||
35 | task_unlock(current); | ||
36 | } else { | ||
37 | task_unlock(current); | ||
38 | return; | ||
39 | } | ||
40 | save = current->state; | ||
41 | pr_debug("%s entered refrigerator\n", current->comm); | 57 | pr_debug("%s entered refrigerator\n", current->comm); |
42 | 58 | ||
43 | spin_lock_irq(&current->sighand->siglock); | ||
44 | recalc_sigpending(); /* We sent fake signal, clean it up */ | ||
45 | spin_unlock_irq(&current->sighand->siglock); | ||
46 | |||
47 | /* prevent accounting of that task to load */ | ||
48 | current->flags |= PF_FREEZING; | ||
49 | |||
50 | for (;;) { | 59 | for (;;) { |
51 | set_current_state(TASK_UNINTERRUPTIBLE); | 60 | set_current_state(TASK_UNINTERRUPTIBLE); |
52 | if (!frozen(current)) | 61 | |
62 | spin_lock_irq(&freezer_lock); | ||
63 | current->flags |= PF_FROZEN; | ||
64 | if (!freezing(current) || | ||
65 | (check_kthr_stop && kthread_should_stop())) | ||
66 | current->flags &= ~PF_FROZEN; | ||
67 | spin_unlock_irq(&freezer_lock); | ||
68 | |||
69 | if (!(current->flags & PF_FROZEN)) | ||
53 | break; | 70 | break; |
71 | was_frozen = true; | ||
54 | schedule(); | 72 | schedule(); |
55 | } | 73 | } |
56 | 74 | ||
57 | /* Remove the accounting blocker */ | ||
58 | current->flags &= ~PF_FREEZING; | ||
59 | |||
60 | pr_debug("%s left refrigerator\n", current->comm); | 75 | pr_debug("%s left refrigerator\n", current->comm); |
61 | __set_current_state(save); | 76 | |
77 | /* | ||
78 | * Restore saved task state before returning. The mb'd version | ||
79 | * needs to be used; otherwise, it might silently break | ||
80 | * synchronization which depends on ordered task state change. | ||
81 | */ | ||
82 | set_current_state(save); | ||
83 | |||
84 | return was_frozen; | ||
62 | } | 85 | } |
63 | EXPORT_SYMBOL(refrigerator); | 86 | EXPORT_SYMBOL(__refrigerator); |
64 | 87 | ||
65 | static void fake_signal_wake_up(struct task_struct *p) | 88 | static void fake_signal_wake_up(struct task_struct *p) |
66 | { | 89 | { |
67 | unsigned long flags; | 90 | unsigned long flags; |
68 | 91 | ||
69 | spin_lock_irqsave(&p->sighand->siglock, flags); | 92 | if (lock_task_sighand(p, &flags)) { |
70 | signal_wake_up(p, 0); | 93 | signal_wake_up(p, 0); |
71 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 94 | unlock_task_sighand(p, &flags); |
95 | } | ||
72 | } | 96 | } |
73 | 97 | ||
74 | /** | 98 | /** |
75 | * freeze_task - send a freeze request to given task | 99 | * freeze_task - send a freeze request to given task |
76 | * @p: task to send the request to | 100 | * @p: task to send the request to |
77 | * @sig_only: if set, the request will only be sent if the task has the | 101 | * |
78 | * PF_FREEZER_NOSIG flag unset | 102 | * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE |
79 | * Return value: 'false', if @sig_only is set and the task has | 103 | * flag and either sending a fake signal to it or waking it up, depending |
80 | * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise | 104 | * on whether it has %PF_FREEZER_NOSIG set. |
81 | * | 105 | * |
82 | * The freeze request is sent by setting the tasks's TIF_FREEZE flag and | 106 | * RETURNS: |
83 | * either sending a fake signal to it or waking it up, depending on whether | 107 | * %false, if @p is not freezing or already frozen; %true, otherwise |
84 | * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task | ||
85 | * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its | ||
86 | * TIF_FREEZE flag will not be set. | ||
87 | */ | 108 | */ |
88 | bool freeze_task(struct task_struct *p, bool sig_only) | 109 | bool freeze_task(struct task_struct *p) |
89 | { | 110 | { |
90 | /* | 111 | unsigned long flags; |
91 | * We first check if the task is freezing and next if it has already | 112 | |
92 | * been frozen to avoid the race with frozen_process() which first marks | 113 | spin_lock_irqsave(&freezer_lock, flags); |
93 | * the task as frozen and next clears its TIF_FREEZE. | 114 | if (!freezing(p) || frozen(p)) { |
94 | */ | 115 | spin_unlock_irqrestore(&freezer_lock, flags); |
95 | if (!freezing(p)) { | 116 | return false; |
96 | smp_rmb(); | ||
97 | if (frozen(p)) | ||
98 | return false; | ||
99 | |||
100 | if (!sig_only || should_send_signal(p)) | ||
101 | set_freeze_flag(p); | ||
102 | else | ||
103 | return false; | ||
104 | } | 117 | } |
105 | 118 | ||
106 | if (should_send_signal(p)) { | 119 | if (!(p->flags & PF_KTHREAD)) { |
107 | fake_signal_wake_up(p); | 120 | fake_signal_wake_up(p); |
108 | /* | 121 | /* |
109 | * fake_signal_wake_up() goes through p's scheduler | 122 | * fake_signal_wake_up() goes through p's scheduler |
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
111 | * TASK_RUNNING transition can't race with task state | 124 | * TASK_RUNNING transition can't race with task state |
112 | * testing in try_to_freeze_tasks(). | 125 | * testing in try_to_freeze_tasks(). |
113 | */ | 126 | */ |
114 | } else if (sig_only) { | ||
115 | return false; | ||
116 | } else { | 127 | } else { |
117 | wake_up_state(p, TASK_INTERRUPTIBLE); | 128 | wake_up_state(p, TASK_INTERRUPTIBLE); |
118 | } | 129 | } |
119 | 130 | ||
131 | spin_unlock_irqrestore(&freezer_lock, flags); | ||
120 | return true; | 132 | return true; |
121 | } | 133 | } |
122 | 134 | ||
123 | void cancel_freezing(struct task_struct *p) | 135 | void __thaw_task(struct task_struct *p) |
124 | { | 136 | { |
125 | unsigned long flags; | 137 | unsigned long flags; |
126 | 138 | ||
127 | if (freezing(p)) { | 139 | /* |
128 | pr_debug(" clean up: %s\n", p->comm); | 140 | * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to |
129 | clear_freeze_flag(p); | 141 | * be visible to @p as waking up implies wmb. Waking up inside |
130 | spin_lock_irqsave(&p->sighand->siglock, flags); | 142 | * freezer_lock also prevents wakeups from leaking outside |
131 | recalc_sigpending_and_wake(p); | 143 | * refrigerator. |
132 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 144 | */ |
133 | } | 145 | spin_lock_irqsave(&freezer_lock, flags); |
134 | } | 146 | if (frozen(p)) |
135 | 147 | wake_up_process(p); | |
136 | static int __thaw_process(struct task_struct *p) | 148 | spin_unlock_irqrestore(&freezer_lock, flags); |
137 | { | ||
138 | if (frozen(p)) { | ||
139 | p->flags &= ~PF_FROZEN; | ||
140 | return 1; | ||
141 | } | ||
142 | clear_freeze_flag(p); | ||
143 | return 0; | ||
144 | } | 149 | } |
145 | 150 | ||
146 | /* | 151 | /** |
147 | * Wake up a frozen process | 152 | * set_freezable - make %current freezable |
148 | * | 153 | * |
149 | * task_lock() is needed to prevent the race with refrigerator() which may | 154 | * Mark %current freezable and enter refrigerator if necessary. |
150 | * occur if the freezing of tasks fails. Namely, without the lock, if the | ||
151 | * freezing of tasks failed, thaw_tasks() might have run before a task in | ||
152 | * refrigerator() could call frozen_process(), in which case the task would be | ||
153 | * frozen and no one would thaw it. | ||
154 | */ | 155 | */ |
155 | int thaw_process(struct task_struct *p) | 156 | bool set_freezable(void) |
156 | { | 157 | { |
157 | task_lock(p); | 158 | might_sleep(); |
158 | if (__thaw_process(p) == 1) { | 159 | |
159 | task_unlock(p); | 160 | /* |
160 | wake_up_process(p); | 161 | * Modify flags while holding freezer_lock. This ensures the |
161 | return 1; | 162 | * freezer notices that we aren't frozen yet or the freezing |
162 | } | 163 | * condition is visible to try_to_freeze() below. |
163 | task_unlock(p); | 164 | */ |
164 | return 0; | 165 | spin_lock_irq(&freezer_lock); |
166 | current->flags &= ~PF_NOFREEZE; | ||
167 | spin_unlock_irq(&freezer_lock); | ||
168 | |||
169 | return try_to_freeze(); | ||
165 | } | 170 | } |
166 | EXPORT_SYMBOL(thaw_process); | 171 | EXPORT_SYMBOL(set_freezable); |
diff --git a/kernel/futex.c b/kernel/futex.c index ea87f4d2f455..1614be20173d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -314,17 +314,29 @@ again: | |||
314 | #endif | 314 | #endif |
315 | 315 | ||
316 | lock_page(page_head); | 316 | lock_page(page_head); |
317 | |||
318 | /* | ||
319 | * If page_head->mapping is NULL, then it cannot be a PageAnon | ||
320 | * page; but it might be the ZERO_PAGE or in the gate area or | ||
321 | * in a special mapping (all cases which we are happy to fail); | ||
322 | * or it may have been a good file page when get_user_pages_fast | ||
323 | * found it, but truncated or holepunched or subjected to | ||
324 | * invalidate_complete_page2 before we got the page lock (also | ||
325 | * cases which we are happy to fail). And we hold a reference, | ||
326 | * so refcount care in invalidate_complete_page's remove_mapping | ||
327 | * prevents drop_caches from setting mapping to NULL beneath us. | ||
328 | * | ||
329 | * The case we do have to guard against is when memory pressure made | ||
330 | * shmem_writepage move it from filecache to swapcache beneath us: | ||
331 | * an unlikely race, but we do need to retry for page_head->mapping. | ||
332 | */ | ||
317 | if (!page_head->mapping) { | 333 | if (!page_head->mapping) { |
334 | int shmem_swizzled = PageSwapCache(page_head); | ||
318 | unlock_page(page_head); | 335 | unlock_page(page_head); |
319 | put_page(page_head); | 336 | put_page(page_head); |
320 | /* | 337 | if (shmem_swizzled) |
321 | * ZERO_PAGE pages don't have a mapping. Avoid a busy loop | 338 | goto again; |
322 | * trying to find one. RW mapping would have COW'd (and thus | 339 | return -EFAULT; |
323 | * have a mapping) so this page is RO and won't ever change. | ||
324 | */ | ||
325 | if ((page_head == ZERO_PAGE(address))) | ||
326 | return -EFAULT; | ||
327 | goto again; | ||
328 | } | 340 | } |
329 | 341 | ||
330 | /* | 342 | /* |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 422e567eecf6..ae34bf51682b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
885 | struct hrtimer_clock_base *base, | 885 | struct hrtimer_clock_base *base, |
886 | unsigned long newstate, int reprogram) | 886 | unsigned long newstate, int reprogram) |
887 | { | 887 | { |
888 | struct timerqueue_node *next_timer; | ||
888 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 889 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
889 | goto out; | 890 | goto out; |
890 | 891 | ||
891 | if (&timer->node == timerqueue_getnext(&base->active)) { | 892 | next_timer = timerqueue_getnext(&base->active); |
893 | timerqueue_del(&base->active, &timer->node); | ||
894 | if (&timer->node == next_timer) { | ||
892 | #ifdef CONFIG_HIGH_RES_TIMERS | 895 | #ifdef CONFIG_HIGH_RES_TIMERS |
893 | /* Reprogram the clock event device. if enabled */ | 896 | /* Reprogram the clock event device. if enabled */ |
894 | if (reprogram && hrtimer_hres_active()) { | 897 | if (reprogram && hrtimer_hres_active()) { |
@@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
901 | } | 904 | } |
902 | #endif | 905 | #endif |
903 | } | 906 | } |
904 | timerqueue_del(&base->active, &timer->node); | ||
905 | if (!timerqueue_getnext(&base->active)) | 907 | if (!timerqueue_getnext(&base->active)) |
906 | base->cpu_base->active_bases &= ~(1 << base->index); | 908 | base->cpu_base->active_bases &= ~(1 << base->index); |
907 | out: | 909 | out: |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8b1748d0172c..2e48ec0c2e91 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
74 | 74 | ||
75 | /* | 75 | /* |
76 | * Ensure the task is not frozen. | 76 | * Ensure the task is not frozen. |
77 | * Also, when a freshly created task is scheduled once, changes | 77 | * Also, skip vfork and any other user process that freezer should skip. |
78 | * its state to TASK_UNINTERRUPTIBLE without having ever been | ||
79 | * switched out once, it musn't be checked. | ||
80 | */ | 78 | */ |
81 | if (unlikely(t->flags & PF_FROZEN || !switch_count)) | 79 | if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) |
80 | return; | ||
81 | |||
82 | /* | ||
83 | * When a freshly created task is scheduled once, changes its state to | ||
84 | * TASK_UNINTERRUPTIBLE without having ever been switched out once, it | ||
85 | * musn't be checked. | ||
86 | */ | ||
87 | if (unlikely(!switch_count)) | ||
82 | return; | 88 | return; |
83 | 89 | ||
84 | if (switch_count != t->last_switch_count) { | 90 | if (switch_count != t->last_switch_count) { |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 342d8f44e401..0119b9d467ae 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) | |||
53 | if (desc->irq_data.chip->irq_set_type) | 53 | if (desc->irq_data.chip->irq_set_type) |
54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, | 54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
55 | IRQ_TYPE_PROBE); | 55 | IRQ_TYPE_PROBE); |
56 | irq_startup(desc); | 56 | irq_startup(desc, false); |
57 | } | 57 | } |
58 | raw_spin_unlock_irq(&desc->lock); | 58 | raw_spin_unlock_irq(&desc->lock); |
59 | } | 59 | } |
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) | |||
70 | raw_spin_lock_irq(&desc->lock); | 70 | raw_spin_lock_irq(&desc->lock); |
71 | if (!desc->action && irq_settings_can_probe(desc)) { | 71 | if (!desc->action && irq_settings_can_probe(desc)) { |
72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; | 72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; |
73 | if (irq_startup(desc)) | 73 | if (irq_startup(desc, false)) |
74 | desc->istate |= IRQS_PENDING; | 74 | desc->istate |= IRQS_PENDING; |
75 | } | 75 | } |
76 | raw_spin_unlock_irq(&desc->lock); | 76 | raw_spin_unlock_irq(&desc->lock); |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..fb7db75ee0c8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -157,19 +157,22 @@ static void irq_state_set_masked(struct irq_desc *desc) | |||
157 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | 157 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); |
158 | } | 158 | } |
159 | 159 | ||
160 | int irq_startup(struct irq_desc *desc) | 160 | int irq_startup(struct irq_desc *desc, bool resend) |
161 | { | 161 | { |
162 | int ret = 0; | ||
163 | |||
162 | irq_state_clr_disabled(desc); | 164 | irq_state_clr_disabled(desc); |
163 | desc->depth = 0; | 165 | desc->depth = 0; |
164 | 166 | ||
165 | if (desc->irq_data.chip->irq_startup) { | 167 | if (desc->irq_data.chip->irq_startup) { |
166 | int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); | 168 | ret = desc->irq_data.chip->irq_startup(&desc->irq_data); |
167 | irq_state_clr_masked(desc); | 169 | irq_state_clr_masked(desc); |
168 | return ret; | 170 | } else { |
171 | irq_enable(desc); | ||
169 | } | 172 | } |
170 | 173 | if (resend) | |
171 | irq_enable(desc); | 174 | check_irq_resend(desc, desc->irq_data.irq); |
172 | return 0; | 175 | return ret; |
173 | } | 176 | } |
174 | 177 | ||
175 | void irq_shutdown(struct irq_desc *desc) | 178 | void irq_shutdown(struct irq_desc *desc) |
@@ -330,6 +333,24 @@ out_unlock: | |||
330 | } | 333 | } |
331 | EXPORT_SYMBOL_GPL(handle_simple_irq); | 334 | EXPORT_SYMBOL_GPL(handle_simple_irq); |
332 | 335 | ||
336 | /* | ||
337 | * Called unconditionally from handle_level_irq() and only for oneshot | ||
338 | * interrupts from handle_fasteoi_irq() | ||
339 | */ | ||
340 | static void cond_unmask_irq(struct irq_desc *desc) | ||
341 | { | ||
342 | /* | ||
343 | * We need to unmask in the following cases: | ||
344 | * - Standard level irq (IRQF_ONESHOT is not set) | ||
345 | * - Oneshot irq which did not wake the thread (caused by a | ||
346 | * spurious interrupt or a primary handler handling it | ||
347 | * completely). | ||
348 | */ | ||
349 | if (!irqd_irq_disabled(&desc->irq_data) && | ||
350 | irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) | ||
351 | unmask_irq(desc); | ||
352 | } | ||
353 | |||
333 | /** | 354 | /** |
334 | * handle_level_irq - Level type irq handler | 355 | * handle_level_irq - Level type irq handler |
335 | * @irq: the interrupt number | 356 | * @irq: the interrupt number |
@@ -362,8 +383,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
362 | 383 | ||
363 | handle_irq_event(desc); | 384 | handle_irq_event(desc); |
364 | 385 | ||
365 | if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) | 386 | cond_unmask_irq(desc); |
366 | unmask_irq(desc); | 387 | |
367 | out_unlock: | 388 | out_unlock: |
368 | raw_spin_unlock(&desc->lock); | 389 | raw_spin_unlock(&desc->lock); |
369 | } | 390 | } |
@@ -417,6 +438,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
417 | preflow_handler(desc); | 438 | preflow_handler(desc); |
418 | handle_irq_event(desc); | 439 | handle_irq_event(desc); |
419 | 440 | ||
441 | if (desc->istate & IRQS_ONESHOT) | ||
442 | cond_unmask_irq(desc); | ||
443 | |||
420 | out_eoi: | 444 | out_eoi: |
421 | desc->irq_data.chip->irq_eoi(&desc->irq_data); | 445 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
422 | out_unlock: | 446 | out_unlock: |
@@ -625,7 +649,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
625 | irq_settings_set_noprobe(desc); | 649 | irq_settings_set_noprobe(desc); |
626 | irq_settings_set_norequest(desc); | 650 | irq_settings_set_norequest(desc); |
627 | irq_settings_set_nothread(desc); | 651 | irq_settings_set_nothread(desc); |
628 | irq_startup(desc); | 652 | irq_startup(desc, true); |
629 | } | 653 | } |
630 | out: | 654 | out: |
631 | irq_put_desc_busunlock(desc, flags); | 655 | irq_put_desc_busunlock(desc, flags); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a73dd6c7372d..40378ff877e7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -15,7 +15,7 @@ | |||
15 | 15 | ||
16 | #define istate core_internal_state__do_not_mess_with_it | 16 | #define istate core_internal_state__do_not_mess_with_it |
17 | 17 | ||
18 | extern int noirqdebug; | 18 | extern bool noirqdebug; |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * Bits used by threaded handlers: | 21 | * Bits used by threaded handlers: |
@@ -67,7 +67,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
67 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 67 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); |
68 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 68 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
69 | 69 | ||
70 | extern int irq_startup(struct irq_desc *desc); | 70 | extern int irq_startup(struct irq_desc *desc, bool resend); |
71 | extern void irq_shutdown(struct irq_desc *desc); | 71 | extern void irq_shutdown(struct irq_desc *desc); |
72 | extern void irq_enable(struct irq_desc *desc); | 72 | extern void irq_enable(struct irq_desc *desc); |
73 | extern void irq_disable(struct irq_desc *desc); | 73 | extern void irq_disable(struct irq_desc *desc); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 200ce832c585..1f9e26526b69 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, | |||
135 | return -EINVAL; | 135 | return -EINVAL; |
136 | if (intsize < 1) | 136 | if (intsize < 1) |
137 | return -EINVAL; | 137 | return -EINVAL; |
138 | if (d->nr_irq && ((intspec[0] < d->hwirq_base) || | ||
139 | (intspec[0] >= d->hwirq_base + d->nr_irq))) | ||
140 | return -EINVAL; | ||
138 | 141 | ||
139 | *out_hwirq = intspec[0]; | 142 | *out_hwirq = intspec[0]; |
140 | *out_type = IRQ_TYPE_NONE; | 143 | *out_type = IRQ_TYPE_NONE; |
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, | |||
143 | return 0; | 146 | return 0; |
144 | } | 147 | } |
145 | 148 | ||
146 | struct irq_domain_ops irq_domain_simple_ops = { | ||
147 | .dt_translate = irq_domain_simple_dt_translate, | ||
148 | }; | ||
149 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
150 | |||
151 | /** | 149 | /** |
152 | * irq_domain_create_simple() - Set up a 'simple' translation range | 150 | * irq_domain_create_simple() - Set up a 'simple' translation range |
153 | */ | 151 | */ |
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match, | |||
182 | } | 180 | } |
183 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); | 181 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); |
184 | #endif /* CONFIG_OF_IRQ */ | 182 | #endif /* CONFIG_OF_IRQ */ |
183 | |||
184 | struct irq_domain_ops irq_domain_simple_ops = { | ||
185 | #ifdef CONFIG_OF_IRQ | ||
186 | .dt_translate = irq_domain_simple_dt_translate, | ||
187 | #endif /* CONFIG_OF_IRQ */ | ||
188 | }; | ||
189 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 67ce837ae52c..32313c084442 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -623,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) | |||
623 | 623 | ||
624 | static int irq_wait_for_interrupt(struct irqaction *action) | 624 | static int irq_wait_for_interrupt(struct irqaction *action) |
625 | { | 625 | { |
626 | set_current_state(TASK_INTERRUPTIBLE); | ||
627 | |||
626 | while (!kthread_should_stop()) { | 628 | while (!kthread_should_stop()) { |
627 | set_current_state(TASK_INTERRUPTIBLE); | ||
628 | 629 | ||
629 | if (test_and_clear_bit(IRQTF_RUNTHREAD, | 630 | if (test_and_clear_bit(IRQTF_RUNTHREAD, |
630 | &action->thread_flags)) { | 631 | &action->thread_flags)) { |
@@ -632,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
632 | return 0; | 633 | return 0; |
633 | } | 634 | } |
634 | schedule(); | 635 | schedule(); |
636 | set_current_state(TASK_INTERRUPTIBLE); | ||
635 | } | 637 | } |
638 | __set_current_state(TASK_RUNNING); | ||
636 | return -1; | 639 | return -1; |
637 | } | 640 | } |
638 | 641 | ||
@@ -1024,7 +1027,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1024 | desc->istate |= IRQS_ONESHOT; | 1027 | desc->istate |= IRQS_ONESHOT; |
1025 | 1028 | ||
1026 | if (irq_settings_can_autoenable(desc)) | 1029 | if (irq_settings_can_autoenable(desc)) |
1027 | irq_startup(desc); | 1030 | irq_startup(desc, true); |
1028 | else | 1031 | else |
1029 | /* Undo nested disables: */ | 1032 | /* Undo nested disables: */ |
1030 | desc->depth = 1; | 1033 | desc->depth = 1; |
@@ -1289,7 +1292,7 @@ EXPORT_SYMBOL(free_irq); | |||
1289 | * and to set up the interrupt handler in the right order. | 1292 | * and to set up the interrupt handler in the right order. |
1290 | * | 1293 | * |
1291 | * If you want to set up a threaded irq handler for your device | 1294 | * If you want to set up a threaded irq handler for your device |
1292 | * then you need to supply @handler and @thread_fn. @handler ist | 1295 | * then you need to supply @handler and @thread_fn. @handler is |
1293 | * still called in hard interrupt context and has to check | 1296 | * still called in hard interrupt context and has to check |
1294 | * whether the interrupt originates from the device. If yes it | 1297 | * whether the interrupt originates from the device. If yes it |
1295 | * needs to disable the interrupt on the device and return | 1298 | * needs to disable the interrupt on the device and return |
@@ -1596,7 +1599,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
1596 | return -ENOMEM; | 1599 | return -ENOMEM; |
1597 | 1600 | ||
1598 | action->handler = handler; | 1601 | action->handler = handler; |
1599 | action->flags = IRQF_PERCPU; | 1602 | action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; |
1600 | action->name = devname; | 1603 | action->name = devname; |
1601 | action->percpu_dev_id = dev_id; | 1604 | action->percpu_dev_id = dev_id; |
1602 | 1605 | ||
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index b5f4742693c0..611cd6003c45 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) | |||
84 | */ | 84 | */ |
85 | action = desc->action; | 85 | action = desc->action; |
86 | if (!action || !(action->flags & IRQF_SHARED) || | 86 | if (!action || !(action->flags & IRQF_SHARED) || |
87 | (action->flags & __IRQF_TIMER) || !action->next) | 87 | (action->flags & __IRQF_TIMER) || |
88 | (action->handler(irq, action->dev_id) == IRQ_HANDLED) || | ||
89 | !action->next) | ||
88 | goto out; | 90 | goto out; |
89 | 91 | ||
90 | /* Already running on another processor */ | 92 | /* Already running on another processor */ |
@@ -323,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
323 | desc->irqs_unhandled = 0; | 325 | desc->irqs_unhandled = 0; |
324 | } | 326 | } |
325 | 327 | ||
326 | int noirqdebug __read_mostly; | 328 | bool noirqdebug __read_mostly; |
327 | 329 | ||
328 | int noirqdebug_setup(char *str) | 330 | int noirqdebug_setup(char *str) |
329 | { | 331 | { |
diff --git a/kernel/itimer.c b/kernel/itimer.c index d802883153da..22000c3db0dd 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
52 | 52 | ||
53 | cval = it->expires; | 53 | cval = it->expires; |
54 | cinterval = it->incr; | 54 | cinterval = it->incr; |
55 | if (!cputime_eq(cval, cputime_zero)) { | 55 | if (cval) { |
56 | struct task_cputime cputime; | 56 | struct task_cputime cputime; |
57 | cputime_t t; | 57 | cputime_t t; |
58 | 58 | ||
59 | thread_group_cputimer(tsk, &cputime); | 59 | thread_group_cputimer(tsk, &cputime); |
60 | if (clock_id == CPUCLOCK_PROF) | 60 | if (clock_id == CPUCLOCK_PROF) |
61 | t = cputime_add(cputime.utime, cputime.stime); | 61 | t = cputime.utime + cputime.stime; |
62 | else | 62 | else |
63 | /* CPUCLOCK_VIRT */ | 63 | /* CPUCLOCK_VIRT */ |
64 | t = cputime.utime; | 64 | t = cputime.utime; |
65 | 65 | ||
66 | if (cputime_le(cval, t)) | 66 | if (cval < t) |
67 | /* about to fire */ | 67 | /* about to fire */ |
68 | cval = cputime_one_jiffy; | 68 | cval = cputime_one_jiffy; |
69 | else | 69 | else |
70 | cval = cputime_sub(cval, t); | 70 | cval = cval - t; |
71 | } | 71 | } |
72 | 72 | ||
73 | spin_unlock_irq(&tsk->sighand->siglock); | 73 | spin_unlock_irq(&tsk->sighand->siglock); |
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
161 | 161 | ||
162 | cval = it->expires; | 162 | cval = it->expires; |
163 | cinterval = it->incr; | 163 | cinterval = it->incr; |
164 | if (!cputime_eq(cval, cputime_zero) || | 164 | if (cval || nval) { |
165 | !cputime_eq(nval, cputime_zero)) { | 165 | if (nval > 0) |
166 | if (cputime_gt(nval, cputime_zero)) | 166 | nval += cputime_one_jiffy; |
167 | nval = cputime_add(nval, cputime_one_jiffy); | ||
168 | set_process_cpu_timer(tsk, clock_id, &nval, &cval); | 167 | set_process_cpu_timer(tsk, clock_id, &nval, &cval); |
169 | } | 168 | } |
170 | it->expires = nval; | 169 | it->expires = nval; |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bbdfe2a462a0..01d3b70fc98a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -66,19 +66,53 @@ void jump_label_inc(struct jump_label_key *key) | |||
66 | return; | 66 | return; |
67 | 67 | ||
68 | jump_label_lock(); | 68 | jump_label_lock(); |
69 | if (atomic_add_return(1, &key->enabled) == 1) | 69 | if (atomic_read(&key->enabled) == 0) |
70 | jump_label_update(key, JUMP_LABEL_ENABLE); | 70 | jump_label_update(key, JUMP_LABEL_ENABLE); |
71 | atomic_inc(&key->enabled); | ||
71 | jump_label_unlock(); | 72 | jump_label_unlock(); |
72 | } | 73 | } |
74 | EXPORT_SYMBOL_GPL(jump_label_inc); | ||
73 | 75 | ||
74 | void jump_label_dec(struct jump_label_key *key) | 76 | static void __jump_label_dec(struct jump_label_key *key, |
77 | unsigned long rate_limit, struct delayed_work *work) | ||
75 | { | 78 | { |
76 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) | 79 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) |
77 | return; | 80 | return; |
78 | 81 | ||
79 | jump_label_update(key, JUMP_LABEL_DISABLE); | 82 | if (rate_limit) { |
83 | atomic_inc(&key->enabled); | ||
84 | schedule_delayed_work(work, rate_limit); | ||
85 | } else | ||
86 | jump_label_update(key, JUMP_LABEL_DISABLE); | ||
87 | |||
80 | jump_label_unlock(); | 88 | jump_label_unlock(); |
81 | } | 89 | } |
90 | EXPORT_SYMBOL_GPL(jump_label_dec); | ||
91 | |||
92 | static void jump_label_update_timeout(struct work_struct *work) | ||
93 | { | ||
94 | struct jump_label_key_deferred *key = | ||
95 | container_of(work, struct jump_label_key_deferred, work.work); | ||
96 | __jump_label_dec(&key->key, 0, NULL); | ||
97 | } | ||
98 | |||
99 | void jump_label_dec(struct jump_label_key *key) | ||
100 | { | ||
101 | __jump_label_dec(key, 0, NULL); | ||
102 | } | ||
103 | |||
104 | void jump_label_dec_deferred(struct jump_label_key_deferred *key) | ||
105 | { | ||
106 | __jump_label_dec(&key->key, key->timeout, &key->work); | ||
107 | } | ||
108 | |||
109 | |||
110 | void jump_label_rate_limit(struct jump_label_key_deferred *key, | ||
111 | unsigned long rl) | ||
112 | { | ||
113 | key->timeout = rl; | ||
114 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); | ||
115 | } | ||
82 | 116 | ||
83 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | 117 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) |
84 | { | 118 | { |
@@ -110,7 +144,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, | |||
110 | * running code can override this to make the non-live update case | 144 | * running code can override this to make the non-live update case |
111 | * cheaper. | 145 | * cheaper. |
112 | */ | 146 | */ |
113 | void __weak arch_jump_label_transform_static(struct jump_entry *entry, | 147 | void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, |
114 | enum jump_label_type type) | 148 | enum jump_label_type type) |
115 | { | 149 | { |
116 | arch_jump_label_transform(entry, type); | 150 | arch_jump_label_transform(entry, type); |
@@ -216,8 +250,13 @@ void jump_label_apply_nops(struct module *mod) | |||
216 | if (iter_start == iter_stop) | 250 | if (iter_start == iter_stop) |
217 | return; | 251 | return; |
218 | 252 | ||
219 | for (iter = iter_start; iter < iter_stop; iter++) | 253 | for (iter = iter_start; iter < iter_stop; iter++) { |
220 | arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); | 254 | struct jump_label_key *iterk; |
255 | |||
256 | iterk = (struct jump_label_key *)(unsigned long)iter->key; | ||
257 | arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? | ||
258 | JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); | ||
259 | } | ||
221 | } | 260 | } |
222 | 261 | ||
223 | static int jump_label_add_module(struct module *mod) | 262 | static int jump_label_add_module(struct module *mod) |
@@ -257,8 +296,7 @@ static int jump_label_add_module(struct module *mod) | |||
257 | key->next = jlm; | 296 | key->next = jlm; |
258 | 297 | ||
259 | if (jump_label_enabled(key)) | 298 | if (jump_label_enabled(key)) |
260 | __jump_label_update(key, iter, iter_stop, | 299 | __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); |
261 | JUMP_LABEL_ENABLE); | ||
262 | } | 300 | } |
263 | 301 | ||
264 | return 0; | 302 | return 0; |
diff --git a/kernel/kexec.c b/kernel/kexec.c index dc7bc0829286..7b0886786701 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/console.h> | 32 | #include <linux/console.h> |
33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/kmsg_dump.h> | ||
36 | #include <linux/syscore_ops.h> | 35 | #include <linux/syscore_ops.h> |
37 | 36 | ||
38 | #include <asm/page.h> | 37 | #include <asm/page.h> |
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) | |||
1094 | if (kexec_crash_image) { | 1093 | if (kexec_crash_image) { |
1095 | struct pt_regs fixed_regs; | 1094 | struct pt_regs fixed_regs; |
1096 | 1095 | ||
1097 | kmsg_dump(KMSG_DUMP_KEXEC); | ||
1098 | |||
1099 | crash_setup_regs(&fixed_regs, regs); | 1096 | crash_setup_regs(&fixed_regs, regs); |
1100 | crash_save_vmcoreinfo(); | 1097 | crash_save_vmcoreinfo(); |
1101 | machine_crash_shutdown(&fixed_regs); | 1098 | machine_crash_shutdown(&fixed_regs); |
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size) | |||
1132 | { | 1129 | { |
1133 | int ret = 0; | 1130 | int ret = 0; |
1134 | unsigned long start, end; | 1131 | unsigned long start, end; |
1132 | unsigned long old_size; | ||
1133 | struct resource *ram_res; | ||
1135 | 1134 | ||
1136 | mutex_lock(&kexec_mutex); | 1135 | mutex_lock(&kexec_mutex); |
1137 | 1136 | ||
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size) | |||
1141 | } | 1140 | } |
1142 | start = crashk_res.start; | 1141 | start = crashk_res.start; |
1143 | end = crashk_res.end; | 1142 | end = crashk_res.end; |
1143 | old_size = (end == 0) ? 0 : end - start + 1; | ||
1144 | if (new_size >= old_size) { | ||
1145 | ret = (new_size == old_size) ? 0 : -EINVAL; | ||
1146 | goto unlock; | ||
1147 | } | ||
1144 | 1148 | ||
1145 | if (new_size >= end - start + 1) { | 1149 | ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); |
1146 | ret = -EINVAL; | 1150 | if (!ram_res) { |
1147 | if (new_size == end - start + 1) | 1151 | ret = -ENOMEM; |
1148 | ret = 0; | ||
1149 | goto unlock; | 1152 | goto unlock; |
1150 | } | 1153 | } |
1151 | 1154 | ||
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size) | |||
1157 | 1160 | ||
1158 | if ((start == end) && (crashk_res.parent != NULL)) | 1161 | if ((start == end) && (crashk_res.parent != NULL)) |
1159 | release_resource(&crashk_res); | 1162 | release_resource(&crashk_res); |
1163 | |||
1164 | ram_res->start = end; | ||
1165 | ram_res->end = crashk_res.end; | ||
1166 | ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; | ||
1167 | ram_res->name = "System RAM"; | ||
1168 | |||
1160 | crashk_res.end = end - 1; | 1169 | crashk_res.end = end - 1; |
1170 | |||
1171 | insert_resource(&iomem_resource, ram_res); | ||
1161 | crash_unmap_reserved_pages(); | 1172 | crash_unmap_reserved_pages(); |
1162 | 1173 | ||
1163 | unlock: | 1174 | unlock: |
@@ -1523,7 +1534,7 @@ int kernel_kexec(void) | |||
1523 | 1534 | ||
1524 | #ifdef CONFIG_KEXEC_JUMP | 1535 | #ifdef CONFIG_KEXEC_JUMP |
1525 | if (kexec_image->preserve_context) { | 1536 | if (kexec_image->preserve_context) { |
1526 | mutex_lock(&pm_mutex); | 1537 | lock_system_sleep(); |
1527 | pm_prepare_console(); | 1538 | pm_prepare_console(); |
1528 | error = freeze_processes(); | 1539 | error = freeze_processes(); |
1529 | if (error) { | 1540 | if (error) { |
@@ -1576,7 +1587,7 @@ int kernel_kexec(void) | |||
1576 | thaw_processes(); | 1587 | thaw_processes(); |
1577 | Restore_console: | 1588 | Restore_console: |
1578 | pm_restore_console(); | 1589 | pm_restore_console(); |
1579 | mutex_unlock(&pm_mutex); | 1590 | unlock_system_sleep(); |
1580 | } | 1591 | } |
1581 | #endif | 1592 | #endif |
1582 | 1593 | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index a4bea97c75b6..a0a88543934e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/resource.h> | 36 | #include <linux/resource.h> |
37 | #include <linux/notifier.h> | 37 | #include <linux/notifier.h> |
38 | #include <linux/suspend.h> | 38 | #include <linux/suspend.h> |
39 | #include <linux/rwsem.h> | ||
39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
40 | 41 | ||
41 | #include <trace/events/module.h> | 42 | #include <trace/events/module.h> |
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq; | |||
50 | static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; | 51 | static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; |
51 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; | 52 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; |
52 | static DEFINE_SPINLOCK(umh_sysctl_lock); | 53 | static DEFINE_SPINLOCK(umh_sysctl_lock); |
54 | static DECLARE_RWSEM(umhelper_sem); | ||
53 | 55 | ||
54 | #ifdef CONFIG_MODULES | 56 | #ifdef CONFIG_MODULES |
55 | 57 | ||
@@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work) | |||
275 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | 277 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY |
276 | * (used for preventing user land processes from being created after the user | 278 | * (used for preventing user land processes from being created after the user |
277 | * land has been frozen during a system-wide hibernation or suspend operation). | 279 | * land has been frozen during a system-wide hibernation or suspend operation). |
280 | * Should always be manipulated under umhelper_sem acquired for write. | ||
278 | */ | 281 | */ |
279 | static int usermodehelper_disabled = 1; | 282 | static int usermodehelper_disabled = 1; |
280 | 283 | ||
@@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1; | |||
282 | static atomic_t running_helpers = ATOMIC_INIT(0); | 285 | static atomic_t running_helpers = ATOMIC_INIT(0); |
283 | 286 | ||
284 | /* | 287 | /* |
285 | * Wait queue head used by usermodehelper_pm_callback() to wait for all running | 288 | * Wait queue head used by usermodehelper_disable() to wait for all running |
286 | * helpers to finish. | 289 | * helpers to finish. |
287 | */ | 290 | */ |
288 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); | 291 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); |
289 | 292 | ||
290 | /* | 293 | /* |
291 | * Time to wait for running_helpers to become zero before the setting of | 294 | * Time to wait for running_helpers to become zero before the setting of |
292 | * usermodehelper_disabled in usermodehelper_pm_callback() fails | 295 | * usermodehelper_disabled in usermodehelper_disable() fails |
293 | */ | 296 | */ |
294 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) | 297 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) |
295 | 298 | ||
299 | void read_lock_usermodehelper(void) | ||
300 | { | ||
301 | down_read(&umhelper_sem); | ||
302 | } | ||
303 | EXPORT_SYMBOL_GPL(read_lock_usermodehelper); | ||
304 | |||
305 | void read_unlock_usermodehelper(void) | ||
306 | { | ||
307 | up_read(&umhelper_sem); | ||
308 | } | ||
309 | EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); | ||
310 | |||
296 | /** | 311 | /** |
297 | * usermodehelper_disable - prevent new helpers from being started | 312 | * usermodehelper_disable - prevent new helpers from being started |
298 | */ | 313 | */ |
@@ -300,8 +315,10 @@ int usermodehelper_disable(void) | |||
300 | { | 315 | { |
301 | long retval; | 316 | long retval; |
302 | 317 | ||
318 | down_write(&umhelper_sem); | ||
303 | usermodehelper_disabled = 1; | 319 | usermodehelper_disabled = 1; |
304 | smp_mb(); | 320 | up_write(&umhelper_sem); |
321 | |||
305 | /* | 322 | /* |
306 | * From now on call_usermodehelper_exec() won't start any new | 323 | * From now on call_usermodehelper_exec() won't start any new |
307 | * helpers, so it is sufficient if running_helpers turns out to | 324 | * helpers, so it is sufficient if running_helpers turns out to |
@@ -314,7 +331,9 @@ int usermodehelper_disable(void) | |||
314 | if (retval) | 331 | if (retval) |
315 | return 0; | 332 | return 0; |
316 | 333 | ||
334 | down_write(&umhelper_sem); | ||
317 | usermodehelper_disabled = 0; | 335 | usermodehelper_disabled = 0; |
336 | up_write(&umhelper_sem); | ||
318 | return -EAGAIN; | 337 | return -EAGAIN; |
319 | } | 338 | } |
320 | 339 | ||
@@ -323,7 +342,9 @@ int usermodehelper_disable(void) | |||
323 | */ | 342 | */ |
324 | void usermodehelper_enable(void) | 343 | void usermodehelper_enable(void) |
325 | { | 344 | { |
345 | down_write(&umhelper_sem); | ||
326 | usermodehelper_disabled = 0; | 346 | usermodehelper_disabled = 0; |
347 | up_write(&umhelper_sem); | ||
327 | } | 348 | } |
328 | 349 | ||
329 | /** | 350 | /** |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823b..9788c0ec6f43 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
1077 | /* Early boot. kretprobe_table_locks not yet initialized. */ | 1077 | /* Early boot. kretprobe_table_locks not yet initialized. */ |
1078 | return; | 1078 | return; |
1079 | 1079 | ||
1080 | INIT_HLIST_HEAD(&empty_rp); | ||
1080 | hash = hash_ptr(tk, KPROBE_HASH_BITS); | 1081 | hash = hash_ptr(tk, KPROBE_HASH_BITS); |
1081 | head = &kretprobe_inst_table[hash]; | 1082 | head = &kretprobe_inst_table[hash]; |
1082 | kretprobe_table_lock(hash, &flags); | 1083 | kretprobe_table_lock(hash, &flags); |
@@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
1085 | recycle_rp_inst(ri, &empty_rp); | 1086 | recycle_rp_inst(ri, &empty_rp); |
1086 | } | 1087 | } |
1087 | kretprobe_table_unlock(hash, &flags); | 1088 | kretprobe_table_unlock(hash, &flags); |
1088 | INIT_HLIST_HEAD(&empty_rp); | ||
1089 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { | 1089 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { |
1090 | hlist_del(&ri->hlist); | 1090 | hlist_del(&ri->hlist); |
1091 | kfree(ri); | 1091 | kfree(ri); |
@@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
1673 | ri->rp = rp; | 1673 | ri->rp = rp; |
1674 | ri->task = current; | 1674 | ri->task = current; |
1675 | 1675 | ||
1676 | if (rp->entry_handler && rp->entry_handler(ri, regs)) | 1676 | if (rp->entry_handler && rp->entry_handler(ri, regs)) { |
1677 | raw_spin_lock_irqsave(&rp->lock, flags); | ||
1678 | hlist_add_head(&ri->hlist, &rp->free_instances); | ||
1679 | raw_spin_unlock_irqrestore(&rp->lock, flags); | ||
1677 | return 0; | 1680 | return 0; |
1681 | } | ||
1678 | 1682 | ||
1679 | arch_prepare_kretprobe(ri, regs); | 1683 | arch_prepare_kretprobe(ri, regs); |
1680 | 1684 | ||
@@ -2198,7 +2202,7 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
2198 | const char __user *user_buf, size_t count, loff_t *ppos) | 2202 | const char __user *user_buf, size_t count, loff_t *ppos) |
2199 | { | 2203 | { |
2200 | char buf[32]; | 2204 | char buf[32]; |
2201 | int buf_size; | 2205 | size_t buf_size; |
2202 | 2206 | ||
2203 | buf_size = min(count, (sizeof(buf)-1)); | 2207 | buf_size = min(count, (sizeof(buf)-1)); |
2204 | if (copy_from_user(buf, user_buf, buf_size)) | 2208 | if (copy_from_user(buf, user_buf, buf_size)) |
diff --git a/kernel/kthread.c b/kernel/kthread.c index b6d216a92639..3d3de633702e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -59,6 +59,31 @@ int kthread_should_stop(void) | |||
59 | EXPORT_SYMBOL(kthread_should_stop); | 59 | EXPORT_SYMBOL(kthread_should_stop); |
60 | 60 | ||
61 | /** | 61 | /** |
62 | * kthread_freezable_should_stop - should this freezable kthread return now? | ||
63 | * @was_frozen: optional out parameter, indicates whether %current was frozen | ||
64 | * | ||
65 | * kthread_should_stop() for freezable kthreads, which will enter | ||
66 | * refrigerator if necessary. This function is safe from kthread_stop() / | ||
67 | * freezer deadlock and freezable kthreads should use this function instead | ||
68 | * of calling try_to_freeze() directly. | ||
69 | */ | ||
70 | bool kthread_freezable_should_stop(bool *was_frozen) | ||
71 | { | ||
72 | bool frozen = false; | ||
73 | |||
74 | might_sleep(); | ||
75 | |||
76 | if (unlikely(freezing(current))) | ||
77 | frozen = __refrigerator(true); | ||
78 | |||
79 | if (was_frozen) | ||
80 | *was_frozen = frozen; | ||
81 | |||
82 | return kthread_should_stop(); | ||
83 | } | ||
84 | EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); | ||
85 | |||
86 | /** | ||
62 | * kthread_data - return data value specified on kthread creation | 87 | * kthread_data - return data value specified on kthread creation |
63 | * @task: kthread task in question | 88 | * @task: kthread task in question |
64 | * | 89 | * |
@@ -257,7 +282,7 @@ int kthreadd(void *unused) | |||
257 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 282 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
258 | set_mems_allowed(node_states[N_HIGH_MEMORY]); | 283 | set_mems_allowed(node_states[N_HIGH_MEMORY]); |
259 | 284 | ||
260 | current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; | 285 | current->flags |= PF_NOFREEZE; |
261 | 286 | ||
262 | for (;;) { | 287 | for (;;) { |
263 | set_current_state(TASK_INTERRUPTIBLE); | 288 | set_current_state(TASK_INTERRUPTIBLE); |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e69434b070da..8889f7dd7c46 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/stringify.h> | 44 | #include <linux/stringify.h> |
45 | #include <linux/bitops.h> | 45 | #include <linux/bitops.h> |
46 | #include <linux/gfp.h> | 46 | #include <linux/gfp.h> |
47 | #include <linux/kmemcheck.h> | ||
47 | 48 | ||
48 | #include <asm/sections.h> | 49 | #include <asm/sections.h> |
49 | 50 | ||
@@ -430,6 +431,7 @@ unsigned int max_lockdep_depth; | |||
430 | * about it later on, in lockdep_info(). | 431 | * about it later on, in lockdep_info(). |
431 | */ | 432 | */ |
432 | static int lockdep_init_error; | 433 | static int lockdep_init_error; |
434 | static const char *lock_init_error; | ||
433 | static unsigned long lockdep_init_trace_data[20]; | 435 | static unsigned long lockdep_init_trace_data[20]; |
434 | static struct stack_trace lockdep_init_trace = { | 436 | static struct stack_trace lockdep_init_trace = { |
435 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), | 437 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), |
@@ -498,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) | |||
498 | usage[i] = '\0'; | 500 | usage[i] = '\0'; |
499 | } | 501 | } |
500 | 502 | ||
501 | static int __print_lock_name(struct lock_class *class) | 503 | static void __print_lock_name(struct lock_class *class) |
502 | { | 504 | { |
503 | char str[KSYM_NAME_LEN]; | 505 | char str[KSYM_NAME_LEN]; |
504 | const char *name; | 506 | const char *name; |
505 | 507 | ||
506 | name = class->name; | 508 | name = class->name; |
507 | if (!name) | ||
508 | name = __get_key_name(class->key, str); | ||
509 | |||
510 | return printk("%s", name); | ||
511 | } | ||
512 | |||
513 | static void print_lock_name(struct lock_class *class) | ||
514 | { | ||
515 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; | ||
516 | const char *name; | ||
517 | |||
518 | get_usage_chars(class, usage); | ||
519 | |||
520 | name = class->name; | ||
521 | if (!name) { | 509 | if (!name) { |
522 | name = __get_key_name(class->key, str); | 510 | name = __get_key_name(class->key, str); |
523 | printk(" (%s", name); | 511 | printk("%s", name); |
524 | } else { | 512 | } else { |
525 | printk(" (%s", name); | 513 | printk("%s", name); |
526 | if (class->name_version > 1) | 514 | if (class->name_version > 1) |
527 | printk("#%d", class->name_version); | 515 | printk("#%d", class->name_version); |
528 | if (class->subclass) | 516 | if (class->subclass) |
529 | printk("/%d", class->subclass); | 517 | printk("/%d", class->subclass); |
530 | } | 518 | } |
519 | } | ||
520 | |||
521 | static void print_lock_name(struct lock_class *class) | ||
522 | { | ||
523 | char usage[LOCK_USAGE_CHARS]; | ||
524 | |||
525 | get_usage_chars(class, usage); | ||
526 | |||
527 | printk(" ("); | ||
528 | __print_lock_name(class); | ||
531 | printk("){%s}", usage); | 529 | printk("){%s}", usage); |
532 | } | 530 | } |
533 | 531 | ||
@@ -567,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr) | |||
567 | } | 565 | } |
568 | } | 566 | } |
569 | 567 | ||
570 | static void print_kernel_version(void) | 568 | static void print_kernel_ident(void) |
571 | { | 569 | { |
572 | printk("%s %.*s\n", init_utsname()->release, | 570 | printk("%s %.*s %s\n", init_utsname()->release, |
573 | (int)strcspn(init_utsname()->version, " "), | 571 | (int)strcspn(init_utsname()->version, " "), |
574 | init_utsname()->version); | 572 | init_utsname()->version, |
573 | print_tainted()); | ||
575 | } | 574 | } |
576 | 575 | ||
577 | static int very_verbose(struct lock_class *class) | 576 | static int very_verbose(struct lock_class *class) |
@@ -655,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
655 | if (unlikely(!lockdep_initialized)) { | 654 | if (unlikely(!lockdep_initialized)) { |
656 | lockdep_init(); | 655 | lockdep_init(); |
657 | lockdep_init_error = 1; | 656 | lockdep_init_error = 1; |
657 | lock_init_error = lock->name; | ||
658 | save_stack_trace(&lockdep_init_trace); | 658 | save_stack_trace(&lockdep_init_trace); |
659 | } | 659 | } |
660 | #endif | 660 | #endif |
@@ -722,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
722 | 722 | ||
723 | class = look_up_lock_class(lock, subclass); | 723 | class = look_up_lock_class(lock, subclass); |
724 | if (likely(class)) | 724 | if (likely(class)) |
725 | return class; | 725 | goto out_set_class_cache; |
726 | 726 | ||
727 | /* | 727 | /* |
728 | * Debug-check: all keys must be persistent! | 728 | * Debug-check: all keys must be persistent! |
@@ -807,6 +807,7 @@ out_unlock_set: | |||
807 | graph_unlock(); | 807 | graph_unlock(); |
808 | raw_local_irq_restore(flags); | 808 | raw_local_irq_restore(flags); |
809 | 809 | ||
810 | out_set_class_cache: | ||
810 | if (!subclass || force) | 811 | if (!subclass || force) |
811 | lock->class_cache[0] = class; | 812 | lock->class_cache[0] = class; |
812 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | 813 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
@@ -1148,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1148 | printk("\n"); | 1149 | printk("\n"); |
1149 | printk("======================================================\n"); | 1150 | printk("======================================================\n"); |
1150 | printk("[ INFO: possible circular locking dependency detected ]\n"); | 1151 | printk("[ INFO: possible circular locking dependency detected ]\n"); |
1151 | print_kernel_version(); | 1152 | print_kernel_ident(); |
1152 | printk("-------------------------------------------------------\n"); | 1153 | printk("-------------------------------------------------------\n"); |
1153 | printk("%s/%d is trying to acquire lock:\n", | 1154 | printk("%s/%d is trying to acquire lock:\n", |
1154 | curr->comm, task_pid_nr(curr)); | 1155 | curr->comm, task_pid_nr(curr)); |
@@ -1487,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1487 | printk("======================================================\n"); | 1488 | printk("======================================================\n"); |
1488 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | 1489 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", |
1489 | irqclass, irqclass); | 1490 | irqclass, irqclass); |
1490 | print_kernel_version(); | 1491 | print_kernel_ident(); |
1491 | printk("------------------------------------------------------\n"); | 1492 | printk("------------------------------------------------------\n"); |
1492 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1493 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
1493 | curr->comm, task_pid_nr(curr), | 1494 | curr->comm, task_pid_nr(curr), |
@@ -1716,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1716 | printk("\n"); | 1717 | printk("\n"); |
1717 | printk("=============================================\n"); | 1718 | printk("=============================================\n"); |
1718 | printk("[ INFO: possible recursive locking detected ]\n"); | 1719 | printk("[ INFO: possible recursive locking detected ]\n"); |
1719 | print_kernel_version(); | 1720 | print_kernel_ident(); |
1720 | printk("---------------------------------------------\n"); | 1721 | printk("---------------------------------------------\n"); |
1721 | printk("%s/%d is trying to acquire lock:\n", | 1722 | printk("%s/%d is trying to acquire lock:\n", |
1722 | curr->comm, task_pid_nr(curr)); | 1723 | curr->comm, task_pid_nr(curr)); |
@@ -2223,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2223 | printk("\n"); | 2224 | printk("\n"); |
2224 | printk("=================================\n"); | 2225 | printk("=================================\n"); |
2225 | printk("[ INFO: inconsistent lock state ]\n"); | 2226 | printk("[ INFO: inconsistent lock state ]\n"); |
2226 | print_kernel_version(); | 2227 | print_kernel_ident(); |
2227 | printk("---------------------------------\n"); | 2228 | printk("---------------------------------\n"); |
2228 | 2229 | ||
2229 | printk("inconsistent {%s} -> {%s} usage.\n", | 2230 | printk("inconsistent {%s} -> {%s} usage.\n", |
@@ -2288,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2288 | printk("\n"); | 2289 | printk("\n"); |
2289 | printk("=========================================================\n"); | 2290 | printk("=========================================================\n"); |
2290 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); | 2291 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); |
2291 | print_kernel_version(); | 2292 | print_kernel_ident(); |
2292 | printk("---------------------------------------------------------\n"); | 2293 | printk("---------------------------------------------------------\n"); |
2293 | printk("%s/%d just changed the state of lock:\n", | 2294 | printk("%s/%d just changed the state of lock:\n", |
2294 | curr->comm, task_pid_nr(curr)); | 2295 | curr->comm, task_pid_nr(curr)); |
@@ -2948,7 +2949,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2948 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2949 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
2949 | struct lock_class_key *key, int subclass) | 2950 | struct lock_class_key *key, int subclass) |
2950 | { | 2951 | { |
2951 | memset(lock, 0, sizeof(*lock)); | 2952 | int i; |
2953 | |||
2954 | kmemcheck_mark_initialized(lock, sizeof(*lock)); | ||
2955 | |||
2956 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | ||
2957 | lock->class_cache[i] = NULL; | ||
2952 | 2958 | ||
2953 | #ifdef CONFIG_LOCK_STAT | 2959 | #ifdef CONFIG_LOCK_STAT |
2954 | lock->cpu = raw_smp_processor_id(); | 2960 | lock->cpu = raw_smp_processor_id(); |
@@ -3169,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3169 | printk("\n"); | 3175 | printk("\n"); |
3170 | printk("=====================================\n"); | 3176 | printk("=====================================\n"); |
3171 | printk("[ BUG: bad unlock balance detected! ]\n"); | 3177 | printk("[ BUG: bad unlock balance detected! ]\n"); |
3178 | print_kernel_ident(); | ||
3172 | printk("-------------------------------------\n"); | 3179 | printk("-------------------------------------\n"); |
3173 | printk("%s/%d is trying to release lock (", | 3180 | printk("%s/%d is trying to release lock (", |
3174 | curr->comm, task_pid_nr(curr)); | 3181 | curr->comm, task_pid_nr(curr)); |
@@ -3613,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3613 | printk("\n"); | 3620 | printk("\n"); |
3614 | printk("=================================\n"); | 3621 | printk("=================================\n"); |
3615 | printk("[ BUG: bad contention detected! ]\n"); | 3622 | printk("[ BUG: bad contention detected! ]\n"); |
3623 | print_kernel_ident(); | ||
3616 | printk("---------------------------------\n"); | 3624 | printk("---------------------------------\n"); |
3617 | printk("%s/%d is trying to contend lock (", | 3625 | printk("%s/%d is trying to contend lock (", |
3618 | curr->comm, task_pid_nr(curr)); | 3626 | curr->comm, task_pid_nr(curr)); |
@@ -3968,7 +3976,8 @@ void __init lockdep_info(void) | |||
3968 | 3976 | ||
3969 | #ifdef CONFIG_DEBUG_LOCKDEP | 3977 | #ifdef CONFIG_DEBUG_LOCKDEP |
3970 | if (lockdep_init_error) { | 3978 | if (lockdep_init_error) { |
3971 | printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); | 3979 | printk("WARNING: lockdep init error! lock-%s was acquired" |
3980 | "before lockdep_init\n", lock_init_error); | ||
3972 | printk("Call stack leading to lockdep invocation was:\n"); | 3981 | printk("Call stack leading to lockdep invocation was:\n"); |
3973 | print_stack_trace(&lockdep_init_trace, 0); | 3982 | print_stack_trace(&lockdep_init_trace, 0); |
3974 | } | 3983 | } |
@@ -3987,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
3987 | printk("\n"); | 3996 | printk("\n"); |
3988 | printk("=========================\n"); | 3997 | printk("=========================\n"); |
3989 | printk("[ BUG: held lock freed! ]\n"); | 3998 | printk("[ BUG: held lock freed! ]\n"); |
3999 | print_kernel_ident(); | ||
3990 | printk("-------------------------\n"); | 4000 | printk("-------------------------\n"); |
3991 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4001 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
3992 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4002 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
@@ -4044,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr) | |||
4044 | printk("\n"); | 4054 | printk("\n"); |
4045 | printk("=====================================\n"); | 4055 | printk("=====================================\n"); |
4046 | printk("[ BUG: lock held at task exit time! ]\n"); | 4056 | printk("[ BUG: lock held at task exit time! ]\n"); |
4057 | print_kernel_ident(); | ||
4047 | printk("-------------------------------------\n"); | 4058 | printk("-------------------------------------\n"); |
4048 | printk("%s/%d is exiting with locks still held!\n", | 4059 | printk("%s/%d is exiting with locks still held!\n", |
4049 | curr->comm, task_pid_nr(curr)); | 4060 | curr->comm, task_pid_nr(curr)); |
@@ -4141,6 +4152,7 @@ void lockdep_sys_exit(void) | |||
4141 | printk("\n"); | 4152 | printk("\n"); |
4142 | printk("================================================\n"); | 4153 | printk("================================================\n"); |
4143 | printk("[ BUG: lock held when returning to user space! ]\n"); | 4154 | printk("[ BUG: lock held when returning to user space! ]\n"); |
4155 | print_kernel_ident(); | ||
4144 | printk("------------------------------------------------\n"); | 4156 | printk("------------------------------------------------\n"); |
4145 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4157 | printk("%s/%d is leaving the kernel with locks still held!\n", |
4146 | curr->comm, curr->pid); | 4158 | curr->comm, curr->pid); |
@@ -4160,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4160 | printk("\n"); | 4172 | printk("\n"); |
4161 | printk("===============================\n"); | 4173 | printk("===============================\n"); |
4162 | printk("[ INFO: suspicious RCU usage. ]\n"); | 4174 | printk("[ INFO: suspicious RCU usage. ]\n"); |
4175 | print_kernel_ident(); | ||
4163 | printk("-------------------------------\n"); | 4176 | printk("-------------------------------\n"); |
4164 | printk("%s:%d %s!\n", file, line, s); | 4177 | printk("%s:%d %s!\n", file, line, s); |
4165 | printk("\nother info that might help us debug this:\n\n"); | 4178 | printk("\nother info that might help us debug this:\n\n"); |
4166 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); | 4179 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); |
4180 | |||
4181 | /* | ||
4182 | * If a CPU is in the RCU-free window in idle (ie: in the section | ||
4183 | * between rcu_idle_enter() and rcu_idle_exit(), then RCU | ||
4184 | * considers that CPU to be in an "extended quiescent state", | ||
4185 | * which means that RCU will be completely ignoring that CPU. | ||
4186 | * Therefore, rcu_read_lock() and friends have absolutely no | ||
4187 | * effect on a CPU running in that state. In other words, even if | ||
4188 | * such an RCU-idle CPU has called rcu_read_lock(), RCU might well | ||
4189 | * delete data structures out from under it. RCU really has no | ||
4190 | * choice here: we need to keep an RCU-free window in idle where | ||
4191 | * the CPU may possibly enter into low power mode. This way we can | ||
4192 | * notice an extended quiescent state to other CPUs that started a grace | ||
4193 | * period. Otherwise we would delay any grace period as long as we run | ||
4194 | * in the idle task. | ||
4195 | * | ||
4196 | * So complain bitterly if someone does call rcu_read_lock(), | ||
4197 | * rcu_read_lock_bh() and so on from extended quiescent states. | ||
4198 | */ | ||
4199 | if (rcu_is_cpu_idle()) | ||
4200 | printk("RCU used illegally from extended quiescent state!\n"); | ||
4201 | |||
4167 | lockdep_print_held_locks(curr); | 4202 | lockdep_print_held_locks(curr); |
4168 | printk("\nstack backtrace:\n"); | 4203 | printk("\nstack backtrace:\n"); |
4169 | dump_stack(); | 4204 | dump_stack(); |
diff --git a/kernel/module.c b/kernel/module.c index 178333c48d1e..2c932760fd33 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -62,12 +62,6 @@ | |||
62 | #define CREATE_TRACE_POINTS | 62 | #define CREATE_TRACE_POINTS |
63 | #include <trace/events/module.h> | 63 | #include <trace/events/module.h> |
64 | 64 | ||
65 | #if 0 | ||
66 | #define DEBUGP printk | ||
67 | #else | ||
68 | #define DEBUGP(fmt , a...) | ||
69 | #endif | ||
70 | |||
71 | #ifndef ARCH_SHF_SMALL | 65 | #ifndef ARCH_SHF_SMALL |
72 | #define ARCH_SHF_SMALL 0 | 66 | #define ARCH_SHF_SMALL 0 |
73 | #endif | 67 | #endif |
@@ -138,7 +132,6 @@ struct load_info { | |||
138 | unsigned long len; | 132 | unsigned long len; |
139 | Elf_Shdr *sechdrs; | 133 | Elf_Shdr *sechdrs; |
140 | char *secstrings, *strtab; | 134 | char *secstrings, *strtab; |
141 | unsigned long *strmap; | ||
142 | unsigned long symoffs, stroffs; | 135 | unsigned long symoffs, stroffs; |
143 | struct _ddebug *debug; | 136 | struct _ddebug *debug; |
144 | unsigned int num_debug; | 137 | unsigned int num_debug; |
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
410 | return fsa.sym; | 403 | return fsa.sym; |
411 | } | 404 | } |
412 | 405 | ||
413 | DEBUGP("Failed to find symbol %s\n", name); | 406 | pr_debug("Failed to find symbol %s\n", name); |
414 | return NULL; | 407 | return NULL; |
415 | } | 408 | } |
416 | EXPORT_SYMBOL_GPL(find_symbol); | 409 | EXPORT_SYMBOL_GPL(find_symbol); |
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b) | |||
600 | 593 | ||
601 | list_for_each_entry(use, &b->source_list, source_list) { | 594 | list_for_each_entry(use, &b->source_list, source_list) { |
602 | if (use->source == a) { | 595 | if (use->source == a) { |
603 | DEBUGP("%s uses %s!\n", a->name, b->name); | 596 | pr_debug("%s uses %s!\n", a->name, b->name); |
604 | return 1; | 597 | return 1; |
605 | } | 598 | } |
606 | } | 599 | } |
607 | DEBUGP("%s does not use %s!\n", a->name, b->name); | 600 | pr_debug("%s does not use %s!\n", a->name, b->name); |
608 | return 0; | 601 | return 0; |
609 | } | 602 | } |
610 | 603 | ||
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b) | |||
619 | { | 612 | { |
620 | struct module_use *use; | 613 | struct module_use *use; |
621 | 614 | ||
622 | DEBUGP("Allocating new usage for %s.\n", a->name); | 615 | pr_debug("Allocating new usage for %s.\n", a->name); |
623 | use = kmalloc(sizeof(*use), GFP_ATOMIC); | 616 | use = kmalloc(sizeof(*use), GFP_ATOMIC); |
624 | if (!use) { | 617 | if (!use) { |
625 | printk(KERN_WARNING "%s: out of memory loading\n", a->name); | 618 | printk(KERN_WARNING "%s: out of memory loading\n", a->name); |
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod) | |||
663 | mutex_lock(&module_mutex); | 656 | mutex_lock(&module_mutex); |
664 | list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { | 657 | list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { |
665 | struct module *i = use->target; | 658 | struct module *i = use->target; |
666 | DEBUGP("%s unusing %s\n", mod->name, i->name); | 659 | pr_debug("%s unusing %s\n", mod->name, i->name); |
667 | module_put(i); | 660 | module_put(i); |
668 | list_del(&use->source_list); | 661 | list_del(&use->source_list); |
669 | list_del(&use->target_list); | 662 | list_del(&use->target_list); |
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced) | |||
726 | } | 719 | } |
727 | } | 720 | } |
728 | 721 | ||
729 | unsigned int module_refcount(struct module *mod) | 722 | unsigned long module_refcount(struct module *mod) |
730 | { | 723 | { |
731 | unsigned int incs = 0, decs = 0; | 724 | unsigned long incs = 0, decs = 0; |
732 | int cpu; | 725 | int cpu; |
733 | 726 | ||
734 | for_each_possible_cpu(cpu) | 727 | for_each_possible_cpu(cpu) |
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod) | |||
761 | /* Since we might sleep for some time, release the mutex first */ | 754 | /* Since we might sleep for some time, release the mutex first */ |
762 | mutex_unlock(&module_mutex); | 755 | mutex_unlock(&module_mutex); |
763 | for (;;) { | 756 | for (;;) { |
764 | DEBUGP("Looking at refcount...\n"); | 757 | pr_debug("Looking at refcount...\n"); |
765 | set_current_state(TASK_UNINTERRUPTIBLE); | 758 | set_current_state(TASK_UNINTERRUPTIBLE); |
766 | if (module_refcount(mod) == 0) | 759 | if (module_refcount(mod) == 0) |
767 | break; | 760 | break; |
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
804 | if (mod->state != MODULE_STATE_LIVE) { | 797 | if (mod->state != MODULE_STATE_LIVE) { |
805 | /* FIXME: if (force), slam module count and wake up | 798 | /* FIXME: if (force), slam module count and wake up |
806 | waiter --RR */ | 799 | waiter --RR */ |
807 | DEBUGP("%s already dying\n", mod->name); | 800 | pr_debug("%s already dying\n", mod->name); |
808 | ret = -EBUSY; | 801 | ret = -EBUSY; |
809 | goto out; | 802 | goto out; |
810 | } | 803 | } |
@@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) | |||
854 | struct module_use *use; | 847 | struct module_use *use; |
855 | int printed_something = 0; | 848 | int printed_something = 0; |
856 | 849 | ||
857 | seq_printf(m, " %u ", module_refcount(mod)); | 850 | seq_printf(m, " %lu ", module_refcount(mod)); |
858 | 851 | ||
859 | /* Always include a trailing , so userspace can differentiate | 852 | /* Always include a trailing , so userspace can differentiate |
860 | between this and the old multi-field proc format. */ | 853 | between this and the old multi-field proc format. */ |
@@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); | |||
904 | static ssize_t show_refcnt(struct module_attribute *mattr, | 897 | static ssize_t show_refcnt(struct module_attribute *mattr, |
905 | struct module_kobject *mk, char *buffer) | 898 | struct module_kobject *mk, char *buffer) |
906 | { | 899 | { |
907 | return sprintf(buffer, "%u\n", module_refcount(mk->mod)); | 900 | return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); |
908 | } | 901 | } |
909 | 902 | ||
910 | static struct module_attribute refcnt = { | 903 | static struct module_attribute modinfo_refcnt = |
911 | .attr = { .name = "refcnt", .mode = 0444 }, | 904 | __ATTR(refcnt, 0444, show_refcnt, NULL); |
912 | .show = show_refcnt, | ||
913 | }; | ||
914 | 905 | ||
915 | void module_put(struct module *module) | 906 | void module_put(struct module *module) |
916 | { | 907 | { |
@@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod) | |||
951 | } | 942 | } |
952 | #endif /* CONFIG_MODULE_UNLOAD */ | 943 | #endif /* CONFIG_MODULE_UNLOAD */ |
953 | 944 | ||
945 | static size_t module_flags_taint(struct module *mod, char *buf) | ||
946 | { | ||
947 | size_t l = 0; | ||
948 | |||
949 | if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) | ||
950 | buf[l++] = 'P'; | ||
951 | if (mod->taints & (1 << TAINT_OOT_MODULE)) | ||
952 | buf[l++] = 'O'; | ||
953 | if (mod->taints & (1 << TAINT_FORCED_MODULE)) | ||
954 | buf[l++] = 'F'; | ||
955 | if (mod->taints & (1 << TAINT_CRAP)) | ||
956 | buf[l++] = 'C'; | ||
957 | /* | ||
958 | * TAINT_FORCED_RMMOD: could be added. | ||
959 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | ||
960 | * apply to modules. | ||
961 | */ | ||
962 | return l; | ||
963 | } | ||
964 | |||
954 | static ssize_t show_initstate(struct module_attribute *mattr, | 965 | static ssize_t show_initstate(struct module_attribute *mattr, |
955 | struct module_kobject *mk, char *buffer) | 966 | struct module_kobject *mk, char *buffer) |
956 | { | 967 | { |
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, | |||
970 | return sprintf(buffer, "%s\n", state); | 981 | return sprintf(buffer, "%s\n", state); |
971 | } | 982 | } |
972 | 983 | ||
973 | static struct module_attribute initstate = { | 984 | static struct module_attribute modinfo_initstate = |
974 | .attr = { .name = "initstate", .mode = 0444 }, | 985 | __ATTR(initstate, 0444, show_initstate, NULL); |
975 | .show = show_initstate, | ||
976 | }; | ||
977 | 986 | ||
978 | static ssize_t store_uevent(struct module_attribute *mattr, | 987 | static ssize_t store_uevent(struct module_attribute *mattr, |
979 | struct module_kobject *mk, | 988 | struct module_kobject *mk, |
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr, | |||
986 | return count; | 995 | return count; |
987 | } | 996 | } |
988 | 997 | ||
989 | struct module_attribute module_uevent = { | 998 | struct module_attribute module_uevent = |
990 | .attr = { .name = "uevent", .mode = 0200 }, | 999 | __ATTR(uevent, 0200, NULL, store_uevent); |
991 | .store = store_uevent, | 1000 | |
992 | }; | 1001 | static ssize_t show_coresize(struct module_attribute *mattr, |
1002 | struct module_kobject *mk, char *buffer) | ||
1003 | { | ||
1004 | return sprintf(buffer, "%u\n", mk->mod->core_size); | ||
1005 | } | ||
1006 | |||
1007 | static struct module_attribute modinfo_coresize = | ||
1008 | __ATTR(coresize, 0444, show_coresize, NULL); | ||
1009 | |||
1010 | static ssize_t show_initsize(struct module_attribute *mattr, | ||
1011 | struct module_kobject *mk, char *buffer) | ||
1012 | { | ||
1013 | return sprintf(buffer, "%u\n", mk->mod->init_size); | ||
1014 | } | ||
1015 | |||
1016 | static struct module_attribute modinfo_initsize = | ||
1017 | __ATTR(initsize, 0444, show_initsize, NULL); | ||
1018 | |||
1019 | static ssize_t show_taint(struct module_attribute *mattr, | ||
1020 | struct module_kobject *mk, char *buffer) | ||
1021 | { | ||
1022 | size_t l; | ||
1023 | |||
1024 | l = module_flags_taint(mk->mod, buffer); | ||
1025 | buffer[l++] = '\n'; | ||
1026 | return l; | ||
1027 | } | ||
1028 | |||
1029 | static struct module_attribute modinfo_taint = | ||
1030 | __ATTR(taint, 0444, show_taint, NULL); | ||
993 | 1031 | ||
994 | static struct module_attribute *modinfo_attrs[] = { | 1032 | static struct module_attribute *modinfo_attrs[] = { |
1033 | &module_uevent, | ||
995 | &modinfo_version, | 1034 | &modinfo_version, |
996 | &modinfo_srcversion, | 1035 | &modinfo_srcversion, |
997 | &initstate, | 1036 | &modinfo_initstate, |
998 | &module_uevent, | 1037 | &modinfo_coresize, |
1038 | &modinfo_initsize, | ||
1039 | &modinfo_taint, | ||
999 | #ifdef CONFIG_MODULE_UNLOAD | 1040 | #ifdef CONFIG_MODULE_UNLOAD |
1000 | &refcnt, | 1041 | &modinfo_refcnt, |
1001 | #endif | 1042 | #endif |
1002 | NULL, | 1043 | NULL, |
1003 | }; | 1044 | }; |
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs, | |||
1057 | 1098 | ||
1058 | if (versions[i].crc == maybe_relocated(*crc, crc_owner)) | 1099 | if (versions[i].crc == maybe_relocated(*crc, crc_owner)) |
1059 | return 1; | 1100 | return 1; |
1060 | DEBUGP("Found checksum %lX vs module %lX\n", | 1101 | pr_debug("Found checksum %lX vs module %lX\n", |
1061 | maybe_relocated(*crc, crc_owner), versions[i].crc); | 1102 | maybe_relocated(*crc, crc_owner), versions[i].crc); |
1062 | goto bad_version; | 1103 | goto bad_version; |
1063 | } | 1104 | } |
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
1834 | case SHN_COMMON: | 1875 | case SHN_COMMON: |
1835 | /* We compiled with -fno-common. These are not | 1876 | /* We compiled with -fno-common. These are not |
1836 | supposed to happen. */ | 1877 | supposed to happen. */ |
1837 | DEBUGP("Common symbol: %s\n", name); | 1878 | pr_debug("Common symbol: %s\n", name); |
1838 | printk("%s: please compile with -fno-common\n", | 1879 | printk("%s: please compile with -fno-common\n", |
1839 | mod->name); | 1880 | mod->name); |
1840 | ret = -ENOEXEC; | 1881 | ret = -ENOEXEC; |
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
1842 | 1883 | ||
1843 | case SHN_ABS: | 1884 | case SHN_ABS: |
1844 | /* Don't need to do anything */ | 1885 | /* Don't need to do anything */ |
1845 | DEBUGP("Absolute symbol: 0x%08lx\n", | 1886 | pr_debug("Absolute symbol: 0x%08lx\n", |
1846 | (long)sym[i].st_value); | 1887 | (long)sym[i].st_value); |
1847 | break; | 1888 | break; |
1848 | 1889 | ||
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1966 | for (i = 0; i < info->hdr->e_shnum; i++) | 2007 | for (i = 0; i < info->hdr->e_shnum; i++) |
1967 | info->sechdrs[i].sh_entsize = ~0UL; | 2008 | info->sechdrs[i].sh_entsize = ~0UL; |
1968 | 2009 | ||
1969 | DEBUGP("Core section allocation order:\n"); | 2010 | pr_debug("Core section allocation order:\n"); |
1970 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 2011 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
1971 | for (i = 0; i < info->hdr->e_shnum; ++i) { | 2012 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
1972 | Elf_Shdr *s = &info->sechdrs[i]; | 2013 | Elf_Shdr *s = &info->sechdrs[i]; |
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1978 | || strstarts(sname, ".init")) | 2019 | || strstarts(sname, ".init")) |
1979 | continue; | 2020 | continue; |
1980 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 2021 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1981 | DEBUGP("\t%s\n", name); | 2022 | pr_debug("\t%s\n", sname); |
1982 | } | 2023 | } |
1983 | switch (m) { | 2024 | switch (m) { |
1984 | case 0: /* executable */ | 2025 | case 0: /* executable */ |
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1995 | } | 2036 | } |
1996 | } | 2037 | } |
1997 | 2038 | ||
1998 | DEBUGP("Init section allocation order:\n"); | 2039 | pr_debug("Init section allocation order:\n"); |
1999 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 2040 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
2000 | for (i = 0; i < info->hdr->e_shnum; ++i) { | 2041 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
2001 | Elf_Shdr *s = &info->sechdrs[i]; | 2042 | Elf_Shdr *s = &info->sechdrs[i]; |
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
2008 | continue; | 2049 | continue; |
2009 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | 2050 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) |
2010 | | INIT_OFFSET_MASK); | 2051 | | INIT_OFFSET_MASK); |
2011 | DEBUGP("\t%s\n", sname); | 2052 | pr_debug("\t%s\n", sname); |
2012 | } | 2053 | } |
2013 | switch (m) { | 2054 | switch (m) { |
2014 | case 0: /* executable */ | 2055 | case 0: /* executable */ |
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, | |||
2178 | return true; | 2219 | return true; |
2179 | } | 2220 | } |
2180 | 2221 | ||
2222 | /* | ||
2223 | * We only allocate and copy the strings needed by the parts of symtab | ||
2224 | * we keep. This is simple, but has the effect of making multiple | ||
2225 | * copies of duplicates. We could be more sophisticated, see | ||
2226 | * linux-kernel thread starting with | ||
2227 | * <73defb5e4bca04a6431392cc341112b1@localhost>. | ||
2228 | */ | ||
2181 | static void layout_symtab(struct module *mod, struct load_info *info) | 2229 | static void layout_symtab(struct module *mod, struct load_info *info) |
2182 | { | 2230 | { |
2183 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; | 2231 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; |
2184 | Elf_Shdr *strsect = info->sechdrs + info->index.str; | 2232 | Elf_Shdr *strsect = info->sechdrs + info->index.str; |
2185 | const Elf_Sym *src; | 2233 | const Elf_Sym *src; |
2186 | unsigned int i, nsrc, ndst; | 2234 | unsigned int i, nsrc, ndst, strtab_size; |
2187 | 2235 | ||
2188 | /* Put symbol section at end of init part of module. */ | 2236 | /* Put symbol section at end of init part of module. */ |
2189 | symsect->sh_flags |= SHF_ALLOC; | 2237 | symsect->sh_flags |= SHF_ALLOC; |
2190 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, | 2238 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, |
2191 | info->index.sym) | INIT_OFFSET_MASK; | 2239 | info->index.sym) | INIT_OFFSET_MASK; |
2192 | DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); | 2240 | pr_debug("\t%s\n", info->secstrings + symsect->sh_name); |
2193 | 2241 | ||
2194 | src = (void *)info->hdr + symsect->sh_offset; | 2242 | src = (void *)info->hdr + symsect->sh_offset; |
2195 | nsrc = symsect->sh_size / sizeof(*src); | 2243 | nsrc = symsect->sh_size / sizeof(*src); |
2196 | for (ndst = i = 1; i < nsrc; ++i, ++src) | ||
2197 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { | ||
2198 | unsigned int j = src->st_name; | ||
2199 | 2244 | ||
2200 | while (!__test_and_set_bit(j, info->strmap) | 2245 | /* Compute total space required for the core symbols' strtab. */ |
2201 | && info->strtab[j]) | 2246 | for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) |
2202 | ++j; | 2247 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { |
2203 | ++ndst; | 2248 | strtab_size += strlen(&info->strtab[src->st_name]) + 1; |
2249 | ndst++; | ||
2204 | } | 2250 | } |
2205 | 2251 | ||
2206 | /* Append room for core symbols at end of core part. */ | 2252 | /* Append room for core symbols at end of core part. */ |
2207 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); | 2253 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); |
2208 | mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); | 2254 | info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); |
2255 | mod->core_size += strtab_size; | ||
2209 | 2256 | ||
2210 | /* Put string table section at end of init part of module. */ | 2257 | /* Put string table section at end of init part of module. */ |
2211 | strsect->sh_flags |= SHF_ALLOC; | 2258 | strsect->sh_flags |= SHF_ALLOC; |
2212 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, | 2259 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, |
2213 | info->index.str) | INIT_OFFSET_MASK; | 2260 | info->index.str) | INIT_OFFSET_MASK; |
2214 | DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); | 2261 | pr_debug("\t%s\n", info->secstrings + strsect->sh_name); |
2215 | |||
2216 | /* Append room for core symbols' strings at end of core part. */ | ||
2217 | info->stroffs = mod->core_size; | ||
2218 | __set_bit(0, info->strmap); | ||
2219 | mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); | ||
2220 | } | 2262 | } |
2221 | 2263 | ||
2222 | static void add_kallsyms(struct module *mod, const struct load_info *info) | 2264 | static void add_kallsyms(struct module *mod, const struct load_info *info) |
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) | |||
2237 | mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); | 2279 | mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); |
2238 | 2280 | ||
2239 | mod->core_symtab = dst = mod->module_core + info->symoffs; | 2281 | mod->core_symtab = dst = mod->module_core + info->symoffs; |
2282 | mod->core_strtab = s = mod->module_core + info->stroffs; | ||
2240 | src = mod->symtab; | 2283 | src = mod->symtab; |
2241 | *dst = *src; | 2284 | *dst = *src; |
2285 | *s++ = 0; | ||
2242 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { | 2286 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { |
2243 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) | 2287 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) |
2244 | continue; | 2288 | continue; |
2289 | |||
2245 | dst[ndst] = *src; | 2290 | dst[ndst] = *src; |
2246 | dst[ndst].st_name = bitmap_weight(info->strmap, | 2291 | dst[ndst++].st_name = s - mod->core_strtab; |
2247 | dst[ndst].st_name); | 2292 | s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; |
2248 | ++ndst; | ||
2249 | } | 2293 | } |
2250 | mod->core_num_syms = ndst; | 2294 | mod->core_num_syms = ndst; |
2251 | |||
2252 | mod->core_strtab = s = mod->module_core + info->stroffs; | ||
2253 | for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) | ||
2254 | if (test_bit(i, info->strmap)) | ||
2255 | *++s = mod->strtab[i]; | ||
2256 | } | 2295 | } |
2257 | #else | 2296 | #else |
2258 | static inline void layout_symtab(struct module *mod, struct load_info *info) | 2297 | static inline void layout_symtab(struct module *mod, struct load_info *info) |
@@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info) | |||
2621 | mod->module_init = ptr; | 2660 | mod->module_init = ptr; |
2622 | 2661 | ||
2623 | /* Transfer each section which specifies SHF_ALLOC */ | 2662 | /* Transfer each section which specifies SHF_ALLOC */ |
2624 | DEBUGP("final section addresses:\n"); | 2663 | pr_debug("final section addresses:\n"); |
2625 | for (i = 0; i < info->hdr->e_shnum; i++) { | 2664 | for (i = 0; i < info->hdr->e_shnum; i++) { |
2626 | void *dest; | 2665 | void *dest; |
2627 | Elf_Shdr *shdr = &info->sechdrs[i]; | 2666 | Elf_Shdr *shdr = &info->sechdrs[i]; |
@@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info) | |||
2639 | memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); | 2678 | memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); |
2640 | /* Update sh_addr to point to copy in image. */ | 2679 | /* Update sh_addr to point to copy in image. */ |
2641 | shdr->sh_addr = (unsigned long)dest; | 2680 | shdr->sh_addr = (unsigned long)dest; |
2642 | DEBUGP("\t0x%lx %s\n", | 2681 | pr_debug("\t0x%lx %s\n", |
2643 | shdr->sh_addr, info->secstrings + shdr->sh_name); | 2682 | (long)shdr->sh_addr, info->secstrings + shdr->sh_name); |
2644 | } | 2683 | } |
2645 | 2684 | ||
2646 | return 0; | 2685 | return 0; |
@@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info) | |||
2742 | this is done generically; there doesn't appear to be any | 2781 | this is done generically; there doesn't appear to be any |
2743 | special cases for the architectures. */ | 2782 | special cases for the architectures. */ |
2744 | layout_sections(mod, info); | 2783 | layout_sections(mod, info); |
2745 | |||
2746 | info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) | ||
2747 | * sizeof(long), GFP_KERNEL); | ||
2748 | if (!info->strmap) { | ||
2749 | err = -ENOMEM; | ||
2750 | goto free_percpu; | ||
2751 | } | ||
2752 | layout_symtab(mod, info); | 2784 | layout_symtab(mod, info); |
2753 | 2785 | ||
2754 | /* Allocate and move to the final place */ | 2786 | /* Allocate and move to the final place */ |
2755 | err = move_module(mod, info); | 2787 | err = move_module(mod, info); |
2756 | if (err) | 2788 | if (err) |
2757 | goto free_strmap; | 2789 | goto free_percpu; |
2758 | 2790 | ||
2759 | /* Module has been copied to its final place now: return it. */ | 2791 | /* Module has been copied to its final place now: return it. */ |
2760 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | 2792 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
2761 | kmemleak_load_module(mod, info); | 2793 | kmemleak_load_module(mod, info); |
2762 | return mod; | 2794 | return mod; |
2763 | 2795 | ||
2764 | free_strmap: | ||
2765 | kfree(info->strmap); | ||
2766 | free_percpu: | 2796 | free_percpu: |
2767 | percpu_modfree(mod); | 2797 | percpu_modfree(mod); |
2768 | out: | 2798 | out: |
@@ -2772,7 +2802,6 @@ out: | |||
2772 | /* mod is no longer valid after this! */ | 2802 | /* mod is no longer valid after this! */ |
2773 | static void module_deallocate(struct module *mod, struct load_info *info) | 2803 | static void module_deallocate(struct module *mod, struct load_info *info) |
2774 | { | 2804 | { |
2775 | kfree(info->strmap); | ||
2776 | percpu_modfree(mod); | 2805 | percpu_modfree(mod); |
2777 | module_free(mod, mod->module_init); | 2806 | module_free(mod, mod->module_init); |
2778 | module_free(mod, mod->module_core); | 2807 | module_free(mod, mod->module_core); |
@@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod, | |||
2811 | struct module *mod; | 2840 | struct module *mod; |
2812 | long err; | 2841 | long err; |
2813 | 2842 | ||
2814 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | 2843 | pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", |
2815 | umod, len, uargs); | 2844 | umod, len, uargs); |
2816 | 2845 | ||
2817 | /* Copy in the blobs from userspace, check they are vaguely sane. */ | 2846 | /* Copy in the blobs from userspace, check they are vaguely sane. */ |
@@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod, | |||
2902 | if (err < 0) | 2931 | if (err < 0) |
2903 | goto unlink; | 2932 | goto unlink; |
2904 | 2933 | ||
2905 | /* Get rid of temporary copy and strmap. */ | 2934 | /* Get rid of temporary copy. */ |
2906 | kfree(info.strmap); | ||
2907 | free_copy(&info); | 2935 | free_copy(&info); |
2908 | 2936 | ||
2909 | /* Done! */ | 2937 | /* Done! */ |
@@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf) | |||
3256 | mod->state == MODULE_STATE_GOING || | 3284 | mod->state == MODULE_STATE_GOING || |
3257 | mod->state == MODULE_STATE_COMING) { | 3285 | mod->state == MODULE_STATE_COMING) { |
3258 | buf[bx++] = '('; | 3286 | buf[bx++] = '('; |
3259 | if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) | 3287 | bx += module_flags_taint(mod, buf + bx); |
3260 | buf[bx++] = 'P'; | ||
3261 | else if (mod->taints & (1 << TAINT_OOT_MODULE)) | ||
3262 | buf[bx++] = 'O'; | ||
3263 | if (mod->taints & (1 << TAINT_FORCED_MODULE)) | ||
3264 | buf[bx++] = 'F'; | ||
3265 | if (mod->taints & (1 << TAINT_CRAP)) | ||
3266 | buf[bx++] = 'C'; | ||
3267 | /* | ||
3268 | * TAINT_FORCED_RMMOD: could be added. | ||
3269 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | ||
3270 | * apply to modules. | ||
3271 | */ | ||
3272 | |||
3273 | /* Show a - for module-is-being-unloaded */ | 3288 | /* Show a - for module-is-being-unloaded */ |
3274 | if (mod->state == MODULE_STATE_GOING) | 3289 | if (mod->state == MODULE_STATE_GOING) |
3275 | buf[bx++] = '-'; | 3290 | buf[bx++] = '-'; |
diff --git a/kernel/panic.c b/kernel/panic.c index b26593604214..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -49,6 +49,15 @@ static long no_blink(int state) | |||
49 | long (*panic_blink)(int state); | 49 | long (*panic_blink)(int state); |
50 | EXPORT_SYMBOL(panic_blink); | 50 | EXPORT_SYMBOL(panic_blink); |
51 | 51 | ||
52 | /* | ||
53 | * Stop ourself in panic -- architecture code may override this | ||
54 | */ | ||
55 | void __weak panic_smp_self_stop(void) | ||
56 | { | ||
57 | while (1) | ||
58 | cpu_relax(); | ||
59 | } | ||
60 | |||
52 | /** | 61 | /** |
53 | * panic - halt the system | 62 | * panic - halt the system |
54 | * @fmt: The text string to print | 63 | * @fmt: The text string to print |
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink); | |||
57 | * | 66 | * |
58 | * This function never returns. | 67 | * This function never returns. |
59 | */ | 68 | */ |
60 | NORET_TYPE void panic(const char * fmt, ...) | 69 | void panic(const char *fmt, ...) |
61 | { | 70 | { |
71 | static DEFINE_SPINLOCK(panic_lock); | ||
62 | static char buf[1024]; | 72 | static char buf[1024]; |
63 | va_list args; | 73 | va_list args; |
64 | long i, i_next = 0; | 74 | long i, i_next = 0; |
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
68 | * It's possible to come here directly from a panic-assertion and | 78 | * It's possible to come here directly from a panic-assertion and |
69 | * not have preempt disabled. Some functions called from here want | 79 | * not have preempt disabled. Some functions called from here want |
70 | * preempt to be disabled. No point enabling it later though... | 80 | * preempt to be disabled. No point enabling it later though... |
81 | * | ||
82 | * Only one CPU is allowed to execute the panic code from here. For | ||
83 | * multiple parallel invocations of panic, all other CPUs either | ||
84 | * stop themself or will wait until they are stopped by the 1st CPU | ||
85 | * with smp_send_stop(). | ||
71 | */ | 86 | */ |
72 | preempt_disable(); | 87 | if (!spin_trylock(&panic_lock)) |
88 | panic_smp_self_stop(); | ||
73 | 89 | ||
74 | console_verbose(); | 90 | console_verbose(); |
75 | bust_spinlocks(1); | 91 | bust_spinlocks(1); |
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
78 | va_end(args); | 94 | va_end(args); |
79 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); | 95 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); |
80 | #ifdef CONFIG_DEBUG_BUGVERBOSE | 96 | #ifdef CONFIG_DEBUG_BUGVERBOSE |
81 | dump_stack(); | 97 | /* |
98 | * Avoid nested stack-dumping if a panic occurs during oops processing | ||
99 | */ | ||
100 | if (!oops_in_progress) | ||
101 | dump_stack(); | ||
82 | #endif | 102 | #endif |
83 | 103 | ||
84 | /* | 104 | /* |
@@ -237,11 +257,20 @@ void add_taint(unsigned flag) | |||
237 | * Can't trust the integrity of the kernel anymore. | 257 | * Can't trust the integrity of the kernel anymore. |
238 | * We don't call directly debug_locks_off() because the issue | 258 | * We don't call directly debug_locks_off() because the issue |
239 | * is not necessarily serious enough to set oops_in_progress to 1 | 259 | * is not necessarily serious enough to set oops_in_progress to 1 |
240 | * Also we want to keep up lockdep for staging development and | 260 | * Also we want to keep up lockdep for staging/out-of-tree |
241 | * post-warning case. | 261 | * development and post-warning case. |
242 | */ | 262 | */ |
243 | if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) | 263 | switch (flag) { |
244 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | 264 | case TAINT_CRAP: |
265 | case TAINT_OOT_MODULE: | ||
266 | case TAINT_WARN: | ||
267 | case TAINT_FIRMWARE_WORKAROUND: | ||
268 | break; | ||
269 | |||
270 | default: | ||
271 | if (__debug_locks_off()) | ||
272 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | ||
273 | } | ||
245 | 274 | ||
246 | set_bit(flag, &tainted_mask); | 275 | set_bit(flag, &tainted_mask); |
247 | } | 276 | } |
diff --git a/kernel/params.c b/kernel/params.c index 65aae11eb93f..4bc965d8a1fe 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -25,12 +25,6 @@ | |||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
27 | 27 | ||
28 | #if 0 | ||
29 | #define DEBUGP printk | ||
30 | #else | ||
31 | #define DEBUGP(fmt, a...) | ||
32 | #endif | ||
33 | |||
34 | /* Protects all parameters, and incidentally kmalloced_param list. */ | 28 | /* Protects all parameters, and incidentally kmalloced_param list. */ |
35 | static DEFINE_MUTEX(param_lock); | 29 | static DEFINE_MUTEX(param_lock); |
36 | 30 | ||
@@ -103,9 +97,10 @@ static int parse_one(char *param, | |||
103 | for (i = 0; i < num_params; i++) { | 97 | for (i = 0; i < num_params; i++) { |
104 | if (parameq(param, params[i].name)) { | 98 | if (parameq(param, params[i].name)) { |
105 | /* No one handled NULL, so do it here. */ | 99 | /* No one handled NULL, so do it here. */ |
106 | if (!val && params[i].ops->set != param_set_bool) | 100 | if (!val && params[i].ops->set != param_set_bool |
101 | && params[i].ops->set != param_set_bint) | ||
107 | return -EINVAL; | 102 | return -EINVAL; |
108 | DEBUGP("They are equal! Calling %p\n", | 103 | pr_debug("They are equal! Calling %p\n", |
109 | params[i].ops->set); | 104 | params[i].ops->set); |
110 | mutex_lock(&param_lock); | 105 | mutex_lock(&param_lock); |
111 | err = params[i].ops->set(val, &params[i]); | 106 | err = params[i].ops->set(val, &params[i]); |
@@ -115,11 +110,11 @@ static int parse_one(char *param, | |||
115 | } | 110 | } |
116 | 111 | ||
117 | if (handle_unknown) { | 112 | if (handle_unknown) { |
118 | DEBUGP("Unknown argument: calling %p\n", handle_unknown); | 113 | pr_debug("Unknown argument: calling %p\n", handle_unknown); |
119 | return handle_unknown(param, val); | 114 | return handle_unknown(param, val); |
120 | } | 115 | } |
121 | 116 | ||
122 | DEBUGP("Unknown argument `%s'\n", param); | 117 | pr_debug("Unknown argument `%s'\n", param); |
123 | return -ENOENT; | 118 | return -ENOENT; |
124 | } | 119 | } |
125 | 120 | ||
@@ -184,7 +179,7 @@ int parse_args(const char *name, | |||
184 | { | 179 | { |
185 | char *param, *val; | 180 | char *param, *val; |
186 | 181 | ||
187 | DEBUGP("Parsing ARGS: %s\n", args); | 182 | pr_debug("Parsing ARGS: %s\n", args); |
188 | 183 | ||
189 | /* Chew leading spaces */ | 184 | /* Chew leading spaces */ |
190 | args = skip_spaces(args); | 185 | args = skip_spaces(args); |
@@ -369,6 +364,30 @@ struct kernel_param_ops param_ops_invbool = { | |||
369 | }; | 364 | }; |
370 | EXPORT_SYMBOL(param_ops_invbool); | 365 | EXPORT_SYMBOL(param_ops_invbool); |
371 | 366 | ||
367 | int param_set_bint(const char *val, const struct kernel_param *kp) | ||
368 | { | ||
369 | struct kernel_param boolkp; | ||
370 | bool v; | ||
371 | int ret; | ||
372 | |||
373 | /* Match bool exactly, by re-using it. */ | ||
374 | boolkp = *kp; | ||
375 | boolkp.arg = &v; | ||
376 | boolkp.flags |= KPARAM_ISBOOL; | ||
377 | |||
378 | ret = param_set_bool(val, &boolkp); | ||
379 | if (ret == 0) | ||
380 | *(int *)kp->arg = v; | ||
381 | return ret; | ||
382 | } | ||
383 | EXPORT_SYMBOL(param_set_bint); | ||
384 | |||
385 | struct kernel_param_ops param_ops_bint = { | ||
386 | .set = param_set_bint, | ||
387 | .get = param_get_int, | ||
388 | }; | ||
389 | EXPORT_SYMBOL(param_ops_bint); | ||
390 | |||
372 | /* We break the rule and mangle the string. */ | 391 | /* We break the rule and mangle the string. */ |
373 | static int param_array(const char *name, | 392 | static int param_array(const char *name, |
374 | const char *val, | 393 | const char *val, |
diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5f..9f08dfabaf13 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) | |||
137 | } | 137 | } |
138 | 138 | ||
139 | /* | 139 | /* |
140 | * We might be racing with someone else trying to set pid_ns->last_pid. | 140 | * We might be racing with someone else trying to set pid_ns->last_pid |
141 | * at the pid allocation time (there's also a sysctl for this, but racing | ||
142 | * with this one is OK, see comment in kernel/pid_namespace.c about it). | ||
141 | * We want the winner to have the "later" value, because if the | 143 | * We want the winner to have the "later" value, because if the |
142 | * "earlier" value prevails, then a pid may get reused immediately. | 144 | * "earlier" value prevails, then a pid may get reused immediately. |
143 | * | 145 | * |
@@ -541,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) | |||
541 | */ | 543 | */ |
542 | void __init pidhash_init(void) | 544 | void __init pidhash_init(void) |
543 | { | 545 | { |
544 | int i, pidhash_size; | 546 | unsigned int i, pidhash_size; |
545 | 547 | ||
546 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
547 | HASH_EARLY | HASH_SMALL, | 549 | HASH_EARLY | HASH_SMALL, |
548 | &pidhash_shift, NULL, 4096); | 550 | &pidhash_shift, NULL, 4096); |
549 | pidhash_size = 1 << pidhash_shift; | 551 | pidhash_size = 1U << pidhash_shift; |
550 | 552 | ||
551 | for (i = 0; i < pidhash_size; i++) | 553 | for (i = 0; i < pidhash_size; i++) |
552 | INIT_HLIST_HEAD(&pid_hash[i]); | 554 | INIT_HLIST_HEAD(&pid_hash[i]); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
191 | return; | 191 | return; |
192 | } | 192 | } |
193 | 193 | ||
194 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | ||
195 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
196 | { | ||
197 | struct ctl_table tmp = *table; | ||
198 | |||
199 | if (write && !capable(CAP_SYS_ADMIN)) | ||
200 | return -EPERM; | ||
201 | |||
202 | /* | ||
203 | * Writing directly to ns' last_pid field is OK, since this field | ||
204 | * is volatile in a living namespace anyway and a code writing to | ||
205 | * it should synchronize its usage with external means. | ||
206 | */ | ||
207 | |||
208 | tmp.data = &current->nsproxy->pid_ns->last_pid; | ||
209 | return proc_dointvec(&tmp, write, buffer, lenp, ppos); | ||
210 | } | ||
211 | |||
212 | static struct ctl_table pid_ns_ctl_table[] = { | ||
213 | { | ||
214 | .procname = "ns_last_pid", | ||
215 | .maxlen = sizeof(int), | ||
216 | .mode = 0666, /* permissions are checked in the handler */ | ||
217 | .proc_handler = pid_ns_ctl_handler, | ||
218 | }, | ||
219 | { } | ||
220 | }; | ||
221 | |||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | ||
223 | |||
194 | static __init int pid_namespaces_init(void) | 224 | static __init int pid_namespaces_init(void) |
195 | { | 225 | { |
196 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 226 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
227 | register_sysctl_paths(kern_path, pid_ns_ctl_table); | ||
197 | return 0; | 228 | return 0; |
198 | } | 229 | } |
199 | 230 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e7cb76dc18f5..125cb67daa21 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock, | |||
78 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 78 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
79 | return now.sched < then.sched; | 79 | return now.sched < then.sched; |
80 | } else { | 80 | } else { |
81 | return cputime_lt(now.cpu, then.cpu); | 81 | return now.cpu < then.cpu; |
82 | } | 82 | } |
83 | } | 83 | } |
84 | static inline void cpu_time_add(const clockid_t which_clock, | 84 | static inline void cpu_time_add(const clockid_t which_clock, |
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock, | |||
88 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 88 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
89 | acc->sched += val.sched; | 89 | acc->sched += val.sched; |
90 | } else { | 90 | } else { |
91 | acc->cpu = cputime_add(acc->cpu, val.cpu); | 91 | acc->cpu += val.cpu; |
92 | } | 92 | } |
93 | } | 93 | } |
94 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | 94 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, |
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | |||
98 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 98 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
99 | a.sched -= b.sched; | 99 | a.sched -= b.sched; |
100 | } else { | 100 | } else { |
101 | a.cpu = cputime_sub(a.cpu, b.cpu); | 101 | a.cpu -= b.cpu; |
102 | } | 102 | } |
103 | return a; | 103 | return a; |
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
107 | * Divide and limit the result to res >= 1 | ||
108 | * | ||
109 | * This is necessary to prevent signal delivery starvation, when the result of | ||
110 | * the division would be rounded down to 0. | ||
111 | */ | ||
112 | static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) | ||
113 | { | ||
114 | cputime_t res = cputime_div(time, div); | ||
115 | |||
116 | return max_t(cputime_t, res, 1); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Update expiry time from increment, and increase overrun count, | 107 | * Update expiry time from increment, and increase overrun count, |
121 | * given the current clock sample. | 108 | * given the current clock sample. |
122 | */ | 109 | */ |
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer, | |||
148 | } else { | 135 | } else { |
149 | cputime_t delta, incr; | 136 | cputime_t delta, incr; |
150 | 137 | ||
151 | if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) | 138 | if (now.cpu < timer->it.cpu.expires.cpu) |
152 | return; | 139 | return; |
153 | incr = timer->it.cpu.incr.cpu; | 140 | incr = timer->it.cpu.incr.cpu; |
154 | delta = cputime_sub(cputime_add(now.cpu, incr), | 141 | delta = now.cpu + incr - timer->it.cpu.expires.cpu; |
155 | timer->it.cpu.expires.cpu); | ||
156 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | 142 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ |
157 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) | 143 | for (i = 0; incr < delta - incr; i++) |
158 | incr = cputime_add(incr, incr); | 144 | incr += incr; |
159 | for (; i >= 0; incr = cputime_halve(incr), i--) { | 145 | for (; i >= 0; incr = incr >> 1, i--) { |
160 | if (cputime_lt(delta, incr)) | 146 | if (delta < incr) |
161 | continue; | 147 | continue; |
162 | timer->it.cpu.expires.cpu = | 148 | timer->it.cpu.expires.cpu += incr; |
163 | cputime_add(timer->it.cpu.expires.cpu, incr); | ||
164 | timer->it_overrun += 1 << i; | 149 | timer->it_overrun += 1 << i; |
165 | delta = cputime_sub(delta, incr); | 150 | delta -= incr; |
166 | } | 151 | } |
167 | } | 152 | } |
168 | } | 153 | } |
169 | 154 | ||
170 | static inline cputime_t prof_ticks(struct task_struct *p) | 155 | static inline cputime_t prof_ticks(struct task_struct *p) |
171 | { | 156 | { |
172 | return cputime_add(p->utime, p->stime); | 157 | return p->utime + p->stime; |
173 | } | 158 | } |
174 | static inline cputime_t virt_ticks(struct task_struct *p) | 159 | static inline cputime_t virt_ticks(struct task_struct *p) |
175 | { | 160 | { |
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
248 | 233 | ||
249 | t = tsk; | 234 | t = tsk; |
250 | do { | 235 | do { |
251 | times->utime = cputime_add(times->utime, t->utime); | 236 | times->utime += t->utime; |
252 | times->stime = cputime_add(times->stime, t->stime); | 237 | times->stime += t->stime; |
253 | times->sum_exec_runtime += task_sched_runtime(t); | 238 | times->sum_exec_runtime += task_sched_runtime(t); |
254 | } while_each_thread(tsk, t); | 239 | } while_each_thread(tsk, t); |
255 | out: | 240 | out: |
@@ -258,10 +243,10 @@ out: | |||
258 | 243 | ||
259 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) | 244 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) |
260 | { | 245 | { |
261 | if (cputime_gt(b->utime, a->utime)) | 246 | if (b->utime > a->utime) |
262 | a->utime = b->utime; | 247 | a->utime = b->utime; |
263 | 248 | ||
264 | if (cputime_gt(b->stime, a->stime)) | 249 | if (b->stime > a->stime) |
265 | a->stime = b->stime; | 250 | a->stime = b->stime; |
266 | 251 | ||
267 | if (b->sum_exec_runtime > a->sum_exec_runtime) | 252 | if (b->sum_exec_runtime > a->sum_exec_runtime) |
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
306 | return -EINVAL; | 291 | return -EINVAL; |
307 | case CPUCLOCK_PROF: | 292 | case CPUCLOCK_PROF: |
308 | thread_group_cputime(p, &cputime); | 293 | thread_group_cputime(p, &cputime); |
309 | cpu->cpu = cputime_add(cputime.utime, cputime.stime); | 294 | cpu->cpu = cputime.utime + cputime.stime; |
310 | break; | 295 | break; |
311 | case CPUCLOCK_VIRT: | 296 | case CPUCLOCK_VIRT: |
312 | thread_group_cputime(p, &cputime); | 297 | thread_group_cputime(p, &cputime); |
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head, | |||
470 | unsigned long long sum_exec_runtime) | 455 | unsigned long long sum_exec_runtime) |
471 | { | 456 | { |
472 | struct cpu_timer_list *timer, *next; | 457 | struct cpu_timer_list *timer, *next; |
473 | cputime_t ptime = cputime_add(utime, stime); | 458 | cputime_t ptime = utime + stime; |
474 | 459 | ||
475 | list_for_each_entry_safe(timer, next, head, entry) { | 460 | list_for_each_entry_safe(timer, next, head, entry) { |
476 | list_del_init(&timer->entry); | 461 | list_del_init(&timer->entry); |
477 | if (cputime_lt(timer->expires.cpu, ptime)) { | 462 | if (timer->expires.cpu < ptime) { |
478 | timer->expires.cpu = cputime_zero; | 463 | timer->expires.cpu = 0; |
479 | } else { | 464 | } else { |
480 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | 465 | timer->expires.cpu -= ptime; |
481 | ptime); | ||
482 | } | 466 | } |
483 | } | 467 | } |
484 | 468 | ||
485 | ++head; | 469 | ++head; |
486 | list_for_each_entry_safe(timer, next, head, entry) { | 470 | list_for_each_entry_safe(timer, next, head, entry) { |
487 | list_del_init(&timer->entry); | 471 | list_del_init(&timer->entry); |
488 | if (cputime_lt(timer->expires.cpu, utime)) { | 472 | if (timer->expires.cpu < utime) { |
489 | timer->expires.cpu = cputime_zero; | 473 | timer->expires.cpu = 0; |
490 | } else { | 474 | } else { |
491 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | 475 | timer->expires.cpu -= utime; |
492 | utime); | ||
493 | } | 476 | } |
494 | } | 477 | } |
495 | 478 | ||
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) | |||
520 | struct signal_struct *const sig = tsk->signal; | 503 | struct signal_struct *const sig = tsk->signal; |
521 | 504 | ||
522 | cleanup_timers(tsk->signal->cpu_timers, | 505 | cleanup_timers(tsk->signal->cpu_timers, |
523 | cputime_add(tsk->utime, sig->utime), | 506 | tsk->utime + sig->utime, tsk->stime + sig->stime, |
524 | cputime_add(tsk->stime, sig->stime), | ||
525 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); | 507 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); |
526 | } | 508 | } |
527 | 509 | ||
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) | |||
540 | 522 | ||
541 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) | 523 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) |
542 | { | 524 | { |
543 | return cputime_eq(expires, cputime_zero) || | 525 | return expires == 0 || expires > new_exp; |
544 | cputime_gt(expires, new_exp); | ||
545 | } | 526 | } |
546 | 527 | ||
547 | /* | 528 | /* |
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
651 | default: | 632 | default: |
652 | return -EINVAL; | 633 | return -EINVAL; |
653 | case CPUCLOCK_PROF: | 634 | case CPUCLOCK_PROF: |
654 | cpu->cpu = cputime_add(cputime.utime, cputime.stime); | 635 | cpu->cpu = cputime.utime + cputime.stime; |
655 | break; | 636 | break; |
656 | case CPUCLOCK_VIRT: | 637 | case CPUCLOCK_VIRT: |
657 | cpu->cpu = cputime.utime; | 638 | cpu->cpu = cputime.utime; |
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk, | |||
918 | unsigned long soft; | 899 | unsigned long soft; |
919 | 900 | ||
920 | maxfire = 20; | 901 | maxfire = 20; |
921 | tsk->cputime_expires.prof_exp = cputime_zero; | 902 | tsk->cputime_expires.prof_exp = 0; |
922 | while (!list_empty(timers)) { | 903 | while (!list_empty(timers)) { |
923 | struct cpu_timer_list *t = list_first_entry(timers, | 904 | struct cpu_timer_list *t = list_first_entry(timers, |
924 | struct cpu_timer_list, | 905 | struct cpu_timer_list, |
925 | entry); | 906 | entry); |
926 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | 907 | if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { |
927 | tsk->cputime_expires.prof_exp = t->expires.cpu; | 908 | tsk->cputime_expires.prof_exp = t->expires.cpu; |
928 | break; | 909 | break; |
929 | } | 910 | } |
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk, | |||
933 | 914 | ||
934 | ++timers; | 915 | ++timers; |
935 | maxfire = 20; | 916 | maxfire = 20; |
936 | tsk->cputime_expires.virt_exp = cputime_zero; | 917 | tsk->cputime_expires.virt_exp = 0; |
937 | while (!list_empty(timers)) { | 918 | while (!list_empty(timers)) { |
938 | struct cpu_timer_list *t = list_first_entry(timers, | 919 | struct cpu_timer_list *t = list_first_entry(timers, |
939 | struct cpu_timer_list, | 920 | struct cpu_timer_list, |
940 | entry); | 921 | entry); |
941 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | 922 | if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { |
942 | tsk->cputime_expires.virt_exp = t->expires.cpu; | 923 | tsk->cputime_expires.virt_exp = t->expires.cpu; |
943 | break; | 924 | break; |
944 | } | 925 | } |
@@ -1009,20 +990,19 @@ static u32 onecputick; | |||
1009 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | 990 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, |
1010 | cputime_t *expires, cputime_t cur_time, int signo) | 991 | cputime_t *expires, cputime_t cur_time, int signo) |
1011 | { | 992 | { |
1012 | if (cputime_eq(it->expires, cputime_zero)) | 993 | if (!it->expires) |
1013 | return; | 994 | return; |
1014 | 995 | ||
1015 | if (cputime_ge(cur_time, it->expires)) { | 996 | if (cur_time >= it->expires) { |
1016 | if (!cputime_eq(it->incr, cputime_zero)) { | 997 | if (it->incr) { |
1017 | it->expires = cputime_add(it->expires, it->incr); | 998 | it->expires += it->incr; |
1018 | it->error += it->incr_error; | 999 | it->error += it->incr_error; |
1019 | if (it->error >= onecputick) { | 1000 | if (it->error >= onecputick) { |
1020 | it->expires = cputime_sub(it->expires, | 1001 | it->expires -= cputime_one_jiffy; |
1021 | cputime_one_jiffy); | ||
1022 | it->error -= onecputick; | 1002 | it->error -= onecputick; |
1023 | } | 1003 | } |
1024 | } else { | 1004 | } else { |
1025 | it->expires = cputime_zero; | 1005 | it->expires = 0; |
1026 | } | 1006 | } |
1027 | 1007 | ||
1028 | trace_itimer_expire(signo == SIGPROF ? | 1008 | trace_itimer_expire(signo == SIGPROF ? |
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1031 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); | 1011 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); |
1032 | } | 1012 | } |
1033 | 1013 | ||
1034 | if (!cputime_eq(it->expires, cputime_zero) && | 1014 | if (it->expires && (!*expires || it->expires < *expires)) { |
1035 | (cputime_eq(*expires, cputime_zero) || | ||
1036 | cputime_lt(it->expires, *expires))) { | ||
1037 | *expires = it->expires; | 1015 | *expires = it->expires; |
1038 | } | 1016 | } |
1039 | } | 1017 | } |
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1048 | */ | 1026 | */ |
1049 | static inline int task_cputime_zero(const struct task_cputime *cputime) | 1027 | static inline int task_cputime_zero(const struct task_cputime *cputime) |
1050 | { | 1028 | { |
1051 | if (cputime_eq(cputime->utime, cputime_zero) && | 1029 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) |
1052 | cputime_eq(cputime->stime, cputime_zero) && | ||
1053 | cputime->sum_exec_runtime == 0) | ||
1054 | return 1; | 1030 | return 1; |
1055 | return 0; | 1031 | return 0; |
1056 | } | 1032 | } |
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk, | |||
1076 | */ | 1052 | */ |
1077 | thread_group_cputimer(tsk, &cputime); | 1053 | thread_group_cputimer(tsk, &cputime); |
1078 | utime = cputime.utime; | 1054 | utime = cputime.utime; |
1079 | ptime = cputime_add(utime, cputime.stime); | 1055 | ptime = utime + cputime.stime; |
1080 | sum_sched_runtime = cputime.sum_exec_runtime; | 1056 | sum_sched_runtime = cputime.sum_exec_runtime; |
1081 | maxfire = 20; | 1057 | maxfire = 20; |
1082 | prof_expires = cputime_zero; | 1058 | prof_expires = 0; |
1083 | while (!list_empty(timers)) { | 1059 | while (!list_empty(timers)) { |
1084 | struct cpu_timer_list *tl = list_first_entry(timers, | 1060 | struct cpu_timer_list *tl = list_first_entry(timers, |
1085 | struct cpu_timer_list, | 1061 | struct cpu_timer_list, |
1086 | entry); | 1062 | entry); |
1087 | if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { | 1063 | if (!--maxfire || ptime < tl->expires.cpu) { |
1088 | prof_expires = tl->expires.cpu; | 1064 | prof_expires = tl->expires.cpu; |
1089 | break; | 1065 | break; |
1090 | } | 1066 | } |
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk, | |||
1094 | 1070 | ||
1095 | ++timers; | 1071 | ++timers; |
1096 | maxfire = 20; | 1072 | maxfire = 20; |
1097 | virt_expires = cputime_zero; | 1073 | virt_expires = 0; |
1098 | while (!list_empty(timers)) { | 1074 | while (!list_empty(timers)) { |
1099 | struct cpu_timer_list *tl = list_first_entry(timers, | 1075 | struct cpu_timer_list *tl = list_first_entry(timers, |
1100 | struct cpu_timer_list, | 1076 | struct cpu_timer_list, |
1101 | entry); | 1077 | entry); |
1102 | if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { | 1078 | if (!--maxfire || utime < tl->expires.cpu) { |
1103 | virt_expires = tl->expires.cpu; | 1079 | virt_expires = tl->expires.cpu; |
1104 | break; | 1080 | break; |
1105 | } | 1081 | } |
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1154 | } | 1130 | } |
1155 | } | 1131 | } |
1156 | x = secs_to_cputime(soft); | 1132 | x = secs_to_cputime(soft); |
1157 | if (cputime_eq(prof_expires, cputime_zero) || | 1133 | if (!prof_expires || x < prof_expires) { |
1158 | cputime_lt(x, prof_expires)) { | ||
1159 | prof_expires = x; | 1134 | prof_expires = x; |
1160 | } | 1135 | } |
1161 | } | 1136 | } |
@@ -1249,12 +1224,9 @@ out: | |||
1249 | static inline int task_cputime_expired(const struct task_cputime *sample, | 1224 | static inline int task_cputime_expired(const struct task_cputime *sample, |
1250 | const struct task_cputime *expires) | 1225 | const struct task_cputime *expires) |
1251 | { | 1226 | { |
1252 | if (!cputime_eq(expires->utime, cputime_zero) && | 1227 | if (expires->utime && sample->utime >= expires->utime) |
1253 | cputime_ge(sample->utime, expires->utime)) | ||
1254 | return 1; | 1228 | return 1; |
1255 | if (!cputime_eq(expires->stime, cputime_zero) && | 1229 | if (expires->stime && sample->utime + sample->stime >= expires->stime) |
1256 | cputime_ge(cputime_add(sample->utime, sample->stime), | ||
1257 | expires->stime)) | ||
1258 | return 1; | 1230 | return 1; |
1259 | if (expires->sum_exec_runtime != 0 && | 1231 | if (expires->sum_exec_runtime != 0 && |
1260 | sample->sum_exec_runtime >= expires->sum_exec_runtime) | 1232 | sample->sum_exec_runtime >= expires->sum_exec_runtime) |
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1389 | * it to be relative, *newval argument is relative and we update | 1361 | * it to be relative, *newval argument is relative and we update |
1390 | * it to be absolute. | 1362 | * it to be absolute. |
1391 | */ | 1363 | */ |
1392 | if (!cputime_eq(*oldval, cputime_zero)) { | 1364 | if (*oldval) { |
1393 | if (cputime_le(*oldval, now.cpu)) { | 1365 | if (*oldval <= now.cpu) { |
1394 | /* Just about to fire. */ | 1366 | /* Just about to fire. */ |
1395 | *oldval = cputime_one_jiffy; | 1367 | *oldval = cputime_one_jiffy; |
1396 | } else { | 1368 | } else { |
1397 | *oldval = cputime_sub(*oldval, now.cpu); | 1369 | *oldval -= now.cpu; |
1398 | } | 1370 | } |
1399 | } | 1371 | } |
1400 | 1372 | ||
1401 | if (cputime_eq(*newval, cputime_zero)) | 1373 | if (!*newval) |
1402 | return; | 1374 | return; |
1403 | *newval = cputime_add(*newval, now.cpu); | 1375 | *newval += now.cpu; |
1404 | } | 1376 | } |
1405 | 1377 | ||
1406 | /* | 1378 | /* |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 196c01268ebd..6d6d28870335 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -43,8 +43,6 @@ int in_suspend __nosavedata; | |||
43 | enum { | 43 | enum { |
44 | HIBERNATION_INVALID, | 44 | HIBERNATION_INVALID, |
45 | HIBERNATION_PLATFORM, | 45 | HIBERNATION_PLATFORM, |
46 | HIBERNATION_TEST, | ||
47 | HIBERNATION_TESTPROC, | ||
48 | HIBERNATION_SHUTDOWN, | 46 | HIBERNATION_SHUTDOWN, |
49 | HIBERNATION_REBOOT, | 47 | HIBERNATION_REBOOT, |
50 | /* keep last */ | 48 | /* keep last */ |
@@ -55,7 +53,7 @@ enum { | |||
55 | 53 | ||
56 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 54 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
57 | 55 | ||
58 | static bool freezer_test_done; | 56 | bool freezer_test_done; |
59 | 57 | ||
60 | static const struct platform_hibernation_ops *hibernation_ops; | 58 | static const struct platform_hibernation_ops *hibernation_ops; |
61 | 59 | ||
@@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) | |||
71 | WARN_ON(1); | 69 | WARN_ON(1); |
72 | return; | 70 | return; |
73 | } | 71 | } |
74 | mutex_lock(&pm_mutex); | 72 | lock_system_sleep(); |
75 | hibernation_ops = ops; | 73 | hibernation_ops = ops; |
76 | if (ops) | 74 | if (ops) |
77 | hibernation_mode = HIBERNATION_PLATFORM; | 75 | hibernation_mode = HIBERNATION_PLATFORM; |
78 | else if (hibernation_mode == HIBERNATION_PLATFORM) | 76 | else if (hibernation_mode == HIBERNATION_PLATFORM) |
79 | hibernation_mode = HIBERNATION_SHUTDOWN; | 77 | hibernation_mode = HIBERNATION_SHUTDOWN; |
80 | 78 | ||
81 | mutex_unlock(&pm_mutex); | 79 | unlock_system_sleep(); |
82 | } | 80 | } |
83 | 81 | ||
84 | static bool entering_platform_hibernation; | 82 | static bool entering_platform_hibernation; |
@@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void) | |||
96 | mdelay(5000); | 94 | mdelay(5000); |
97 | } | 95 | } |
98 | 96 | ||
99 | static int hibernation_testmode(int mode) | ||
100 | { | ||
101 | if (hibernation_mode == mode) { | ||
102 | hibernation_debug_sleep(); | ||
103 | return 1; | ||
104 | } | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | static int hibernation_test(int level) | 97 | static int hibernation_test(int level) |
109 | { | 98 | { |
110 | if (pm_test_level == level) { | 99 | if (pm_test_level == level) { |
@@ -114,7 +103,6 @@ static int hibernation_test(int level) | |||
114 | return 0; | 103 | return 0; |
115 | } | 104 | } |
116 | #else /* !CONFIG_PM_DEBUG */ | 105 | #else /* !CONFIG_PM_DEBUG */ |
117 | static int hibernation_testmode(int mode) { return 0; } | ||
118 | static int hibernation_test(int level) { return 0; } | 106 | static int hibernation_test(int level) { return 0; } |
119 | #endif /* !CONFIG_PM_DEBUG */ | 107 | #endif /* !CONFIG_PM_DEBUG */ |
120 | 108 | ||
@@ -278,8 +266,7 @@ static int create_image(int platform_mode) | |||
278 | goto Platform_finish; | 266 | goto Platform_finish; |
279 | 267 | ||
280 | error = disable_nonboot_cpus(); | 268 | error = disable_nonboot_cpus(); |
281 | if (error || hibernation_test(TEST_CPUS) | 269 | if (error || hibernation_test(TEST_CPUS)) |
282 | || hibernation_testmode(HIBERNATION_TEST)) | ||
283 | goto Enable_cpus; | 270 | goto Enable_cpus; |
284 | 271 | ||
285 | local_irq_disable(); | 272 | local_irq_disable(); |
@@ -333,7 +320,7 @@ static int create_image(int platform_mode) | |||
333 | */ | 320 | */ |
334 | int hibernation_snapshot(int platform_mode) | 321 | int hibernation_snapshot(int platform_mode) |
335 | { | 322 | { |
336 | pm_message_t msg = PMSG_RECOVER; | 323 | pm_message_t msg; |
337 | int error; | 324 | int error; |
338 | 325 | ||
339 | error = platform_begin(platform_mode); | 326 | error = platform_begin(platform_mode); |
@@ -347,39 +334,40 @@ int hibernation_snapshot(int platform_mode) | |||
347 | 334 | ||
348 | error = freeze_kernel_threads(); | 335 | error = freeze_kernel_threads(); |
349 | if (error) | 336 | if (error) |
350 | goto Close; | 337 | goto Cleanup; |
351 | 338 | ||
352 | if (hibernation_test(TEST_FREEZER) || | 339 | if (hibernation_test(TEST_FREEZER)) { |
353 | hibernation_testmode(HIBERNATION_TESTPROC)) { | ||
354 | 340 | ||
355 | /* | 341 | /* |
356 | * Indicate to the caller that we are returning due to a | 342 | * Indicate to the caller that we are returning due to a |
357 | * successful freezer test. | 343 | * successful freezer test. |
358 | */ | 344 | */ |
359 | freezer_test_done = true; | 345 | freezer_test_done = true; |
360 | goto Close; | 346 | goto Cleanup; |
361 | } | 347 | } |
362 | 348 | ||
363 | error = dpm_prepare(PMSG_FREEZE); | 349 | error = dpm_prepare(PMSG_FREEZE); |
364 | if (error) | 350 | if (error) { |
365 | goto Complete_devices; | 351 | dpm_complete(PMSG_RECOVER); |
352 | goto Cleanup; | ||
353 | } | ||
366 | 354 | ||
367 | suspend_console(); | 355 | suspend_console(); |
368 | pm_restrict_gfp_mask(); | 356 | pm_restrict_gfp_mask(); |
357 | |||
369 | error = dpm_suspend(PMSG_FREEZE); | 358 | error = dpm_suspend(PMSG_FREEZE); |
370 | if (error) | ||
371 | goto Recover_platform; | ||
372 | 359 | ||
373 | if (hibernation_test(TEST_DEVICES)) | 360 | if (error || hibernation_test(TEST_DEVICES)) |
374 | goto Recover_platform; | 361 | platform_recover(platform_mode); |
362 | else | ||
363 | error = create_image(platform_mode); | ||
375 | 364 | ||
376 | error = create_image(platform_mode); | ||
377 | /* | 365 | /* |
378 | * Control returns here (1) after the image has been created or the | 366 | * In the case that we call create_image() above, the control |
367 | * returns here (1) after the image has been created or the | ||
379 | * image creation has failed and (2) after a successful restore. | 368 | * image creation has failed and (2) after a successful restore. |
380 | */ | 369 | */ |
381 | 370 | ||
382 | Resume_devices: | ||
383 | /* We may need to release the preallocated image pages here. */ | 371 | /* We may need to release the preallocated image pages here. */ |
384 | if (error || !in_suspend) | 372 | if (error || !in_suspend) |
385 | swsusp_free(); | 373 | swsusp_free(); |
@@ -391,17 +379,15 @@ int hibernation_snapshot(int platform_mode) | |||
391 | pm_restore_gfp_mask(); | 379 | pm_restore_gfp_mask(); |
392 | 380 | ||
393 | resume_console(); | 381 | resume_console(); |
394 | |||
395 | Complete_devices: | ||
396 | dpm_complete(msg); | 382 | dpm_complete(msg); |
397 | 383 | ||
398 | Close: | 384 | Close: |
399 | platform_end(platform_mode); | 385 | platform_end(platform_mode); |
400 | return error; | 386 | return error; |
401 | 387 | ||
402 | Recover_platform: | 388 | Cleanup: |
403 | platform_recover(platform_mode); | 389 | swsusp_free(); |
404 | goto Resume_devices; | 390 | goto Close; |
405 | } | 391 | } |
406 | 392 | ||
407 | /** | 393 | /** |
@@ -586,9 +572,6 @@ int hibernation_platform_enter(void) | |||
586 | static void power_down(void) | 572 | static void power_down(void) |
587 | { | 573 | { |
588 | switch (hibernation_mode) { | 574 | switch (hibernation_mode) { |
589 | case HIBERNATION_TEST: | ||
590 | case HIBERNATION_TESTPROC: | ||
591 | break; | ||
592 | case HIBERNATION_REBOOT: | 575 | case HIBERNATION_REBOOT: |
593 | kernel_restart(NULL); | 576 | kernel_restart(NULL); |
594 | break; | 577 | break; |
@@ -607,17 +590,6 @@ static void power_down(void) | |||
607 | while(1); | 590 | while(1); |
608 | } | 591 | } |
609 | 592 | ||
610 | static int prepare_processes(void) | ||
611 | { | ||
612 | int error = 0; | ||
613 | |||
614 | if (freeze_processes()) { | ||
615 | error = -EBUSY; | ||
616 | thaw_processes(); | ||
617 | } | ||
618 | return error; | ||
619 | } | ||
620 | |||
621 | /** | 593 | /** |
622 | * hibernate - Carry out system hibernation, including saving the image. | 594 | * hibernate - Carry out system hibernation, including saving the image. |
623 | */ | 595 | */ |
@@ -625,7 +597,7 @@ int hibernate(void) | |||
625 | { | 597 | { |
626 | int error; | 598 | int error; |
627 | 599 | ||
628 | mutex_lock(&pm_mutex); | 600 | lock_system_sleep(); |
629 | /* The snapshot device should not be opened while we're running */ | 601 | /* The snapshot device should not be opened while we're running */ |
630 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { | 602 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
631 | error = -EBUSY; | 603 | error = -EBUSY; |
@@ -650,7 +622,7 @@ int hibernate(void) | |||
650 | sys_sync(); | 622 | sys_sync(); |
651 | printk("done.\n"); | 623 | printk("done.\n"); |
652 | 624 | ||
653 | error = prepare_processes(); | 625 | error = freeze_processes(); |
654 | if (error) | 626 | if (error) |
655 | goto Finish; | 627 | goto Finish; |
656 | 628 | ||
@@ -693,7 +665,7 @@ int hibernate(void) | |||
693 | pm_restore_console(); | 665 | pm_restore_console(); |
694 | atomic_inc(&snapshot_device_available); | 666 | atomic_inc(&snapshot_device_available); |
695 | Unlock: | 667 | Unlock: |
696 | mutex_unlock(&pm_mutex); | 668 | unlock_system_sleep(); |
697 | return error; | 669 | return error; |
698 | } | 670 | } |
699 | 671 | ||
@@ -807,11 +779,13 @@ static int software_resume(void) | |||
807 | goto close_finish; | 779 | goto close_finish; |
808 | 780 | ||
809 | error = create_basic_memory_bitmaps(); | 781 | error = create_basic_memory_bitmaps(); |
810 | if (error) | 782 | if (error) { |
783 | usermodehelper_enable(); | ||
811 | goto close_finish; | 784 | goto close_finish; |
785 | } | ||
812 | 786 | ||
813 | pr_debug("PM: Preparing processes for restore.\n"); | 787 | pr_debug("PM: Preparing processes for restore.\n"); |
814 | error = prepare_processes(); | 788 | error = freeze_processes(); |
815 | if (error) { | 789 | if (error) { |
816 | swsusp_close(FMODE_READ); | 790 | swsusp_close(FMODE_READ); |
817 | goto Done; | 791 | goto Done; |
@@ -851,8 +825,6 @@ static const char * const hibernation_modes[] = { | |||
851 | [HIBERNATION_PLATFORM] = "platform", | 825 | [HIBERNATION_PLATFORM] = "platform", |
852 | [HIBERNATION_SHUTDOWN] = "shutdown", | 826 | [HIBERNATION_SHUTDOWN] = "shutdown", |
853 | [HIBERNATION_REBOOT] = "reboot", | 827 | [HIBERNATION_REBOOT] = "reboot", |
854 | [HIBERNATION_TEST] = "test", | ||
855 | [HIBERNATION_TESTPROC] = "testproc", | ||
856 | }; | 828 | }; |
857 | 829 | ||
858 | /* | 830 | /* |
@@ -861,17 +833,15 @@ static const char * const hibernation_modes[] = { | |||
861 | * Hibernation can be handled in several ways. There are a few different ways | 833 | * Hibernation can be handled in several ways. There are a few different ways |
862 | * to put the system into the sleep state: using the platform driver (e.g. ACPI | 834 | * to put the system into the sleep state: using the platform driver (e.g. ACPI |
863 | * or other hibernation_ops), powering it off or rebooting it (for testing | 835 | * or other hibernation_ops), powering it off or rebooting it (for testing |
864 | * mostly), or using one of the two available test modes. | 836 | * mostly). |
865 | * | 837 | * |
866 | * The sysfs file /sys/power/disk provides an interface for selecting the | 838 | * The sysfs file /sys/power/disk provides an interface for selecting the |
867 | * hibernation mode to use. Reading from this file causes the available modes | 839 | * hibernation mode to use. Reading from this file causes the available modes |
868 | * to be printed. There are 5 modes that can be supported: | 840 | * to be printed. There are 3 modes that can be supported: |
869 | * | 841 | * |
870 | * 'platform' | 842 | * 'platform' |
871 | * 'shutdown' | 843 | * 'shutdown' |
872 | * 'reboot' | 844 | * 'reboot' |
873 | * 'test' | ||
874 | * 'testproc' | ||
875 | * | 845 | * |
876 | * If a platform hibernation driver is in use, 'platform' will be supported | 846 | * If a platform hibernation driver is in use, 'platform' will be supported |
877 | * and will be used by default. Otherwise, 'shutdown' will be used by default. | 847 | * and will be used by default. Otherwise, 'shutdown' will be used by default. |
@@ -895,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
895 | switch (i) { | 865 | switch (i) { |
896 | case HIBERNATION_SHUTDOWN: | 866 | case HIBERNATION_SHUTDOWN: |
897 | case HIBERNATION_REBOOT: | 867 | case HIBERNATION_REBOOT: |
898 | case HIBERNATION_TEST: | ||
899 | case HIBERNATION_TESTPROC: | ||
900 | break; | 868 | break; |
901 | case HIBERNATION_PLATFORM: | 869 | case HIBERNATION_PLATFORM: |
902 | if (hibernation_ops) | 870 | if (hibernation_ops) |
@@ -925,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
925 | p = memchr(buf, '\n', n); | 893 | p = memchr(buf, '\n', n); |
926 | len = p ? p - buf : n; | 894 | len = p ? p - buf : n; |
927 | 895 | ||
928 | mutex_lock(&pm_mutex); | 896 | lock_system_sleep(); |
929 | for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { | 897 | for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { |
930 | if (len == strlen(hibernation_modes[i]) | 898 | if (len == strlen(hibernation_modes[i]) |
931 | && !strncmp(buf, hibernation_modes[i], len)) { | 899 | && !strncmp(buf, hibernation_modes[i], len)) { |
@@ -937,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
937 | switch (mode) { | 905 | switch (mode) { |
938 | case HIBERNATION_SHUTDOWN: | 906 | case HIBERNATION_SHUTDOWN: |
939 | case HIBERNATION_REBOOT: | 907 | case HIBERNATION_REBOOT: |
940 | case HIBERNATION_TEST: | ||
941 | case HIBERNATION_TESTPROC: | ||
942 | hibernation_mode = mode; | 908 | hibernation_mode = mode; |
943 | break; | 909 | break; |
944 | case HIBERNATION_PLATFORM: | 910 | case HIBERNATION_PLATFORM: |
@@ -953,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
953 | if (!error) | 919 | if (!error) |
954 | pr_debug("PM: Hibernation mode set to '%s'\n", | 920 | pr_debug("PM: Hibernation mode set to '%s'\n", |
955 | hibernation_modes[mode]); | 921 | hibernation_modes[mode]); |
956 | mutex_unlock(&pm_mutex); | 922 | unlock_system_sleep(); |
957 | return error ? error : n; | 923 | return error ? error : n; |
958 | } | 924 | } |
959 | 925 | ||
@@ -980,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
980 | if (maj != MAJOR(res) || min != MINOR(res)) | 946 | if (maj != MAJOR(res) || min != MINOR(res)) |
981 | goto out; | 947 | goto out; |
982 | 948 | ||
983 | mutex_lock(&pm_mutex); | 949 | lock_system_sleep(); |
984 | swsusp_resume_device = res; | 950 | swsusp_resume_device = res; |
985 | mutex_unlock(&pm_mutex); | 951 | unlock_system_sleep(); |
986 | printk(KERN_INFO "PM: Starting manual resume from disk\n"); | 952 | printk(KERN_INFO "PM: Starting manual resume from disk\n"); |
987 | noresume = 0; | 953 | noresume = 0; |
988 | software_resume(); | 954 | software_resume(); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 36e0f0903c32..9824b41e5a18 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (c) 2003 Patrick Mochel | 4 | * Copyright (c) 2003 Patrick Mochel |
5 | * Copyright (c) 2003 Open Source Development Lab | 5 | * Copyright (c) 2003 Open Source Development Lab |
6 | * | 6 | * |
7 | * This file is released under the GPLv2 | 7 | * This file is released under the GPLv2 |
8 | * | 8 | * |
9 | */ | 9 | */ |
@@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
116 | p = memchr(buf, '\n', n); | 116 | p = memchr(buf, '\n', n); |
117 | len = p ? p - buf : n; | 117 | len = p ? p - buf : n; |
118 | 118 | ||
119 | mutex_lock(&pm_mutex); | 119 | lock_system_sleep(); |
120 | 120 | ||
121 | level = TEST_FIRST; | 121 | level = TEST_FIRST; |
122 | for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) | 122 | for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) |
@@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
126 | break; | 126 | break; |
127 | } | 127 | } |
128 | 128 | ||
129 | mutex_unlock(&pm_mutex); | 129 | unlock_system_sleep(); |
130 | 130 | ||
131 | return error ? error : n; | 131 | return error ? error : n; |
132 | } | 132 | } |
@@ -240,7 +240,7 @@ struct kobject *power_kobj; | |||
240 | * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and | 240 | * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and |
241 | * 'disk' (Suspend-to-Disk). | 241 | * 'disk' (Suspend-to-Disk). |
242 | * | 242 | * |
243 | * store() accepts one of those strings, translates it into the | 243 | * store() accepts one of those strings, translates it into the |
244 | * proper enumerated value, and initiates a suspend transition. | 244 | * proper enumerated value, and initiates a suspend transition. |
245 | */ | 245 | */ |
246 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | 246 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, |
@@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
282 | /* First, check if we are requested to hibernate */ | 282 | /* First, check if we are requested to hibernate */ |
283 | if (len == 4 && !strncmp(buf, "disk", len)) { | 283 | if (len == 4 && !strncmp(buf, "disk", len)) { |
284 | error = hibernate(); | 284 | error = hibernate(); |
285 | goto Exit; | 285 | goto Exit; |
286 | } | 286 | } |
287 | 287 | ||
288 | #ifdef CONFIG_SUSPEND | 288 | #ifdef CONFIG_SUSPEND |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 23a2db1ec442..21724eee5206 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) | |||
50 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) | 50 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) |
51 | 51 | ||
52 | /* kernel/power/hibernate.c */ | 52 | /* kernel/power/hibernate.c */ |
53 | extern bool freezer_test_done; | ||
54 | |||
53 | extern int hibernation_snapshot(int platform_mode); | 55 | extern int hibernation_snapshot(int platform_mode); |
54 | extern int hibernation_restore(int platform_mode); | 56 | extern int hibernation_restore(int platform_mode); |
55 | extern int hibernation_platform_enter(void); | 57 | extern int hibernation_platform_enter(void); |
@@ -229,8 +231,28 @@ extern int pm_test_level; | |||
229 | #ifdef CONFIG_SUSPEND_FREEZER | 231 | #ifdef CONFIG_SUSPEND_FREEZER |
230 | static inline int suspend_freeze_processes(void) | 232 | static inline int suspend_freeze_processes(void) |
231 | { | 233 | { |
232 | int error = freeze_processes(); | 234 | int error; |
233 | return error ? : freeze_kernel_threads(); | 235 | |
236 | error = freeze_processes(); | ||
237 | |||
238 | /* | ||
239 | * freeze_processes() automatically thaws every task if freezing | ||
240 | * fails. So we need not do anything extra upon error. | ||
241 | */ | ||
242 | if (error) | ||
243 | goto Finish; | ||
244 | |||
245 | error = freeze_kernel_threads(); | ||
246 | |||
247 | /* | ||
248 | * freeze_kernel_threads() thaws only kernel threads upon freezing | ||
249 | * failure. So we have to thaw the userspace tasks ourselves. | ||
250 | */ | ||
251 | if (error) | ||
252 | thaw_processes(); | ||
253 | |||
254 | Finish: | ||
255 | return error; | ||
234 | } | 256 | } |
235 | 257 | ||
236 | static inline void suspend_thaw_processes(void) | 258 | static inline void suspend_thaw_processes(void) |
diff --git a/kernel/power/process.c b/kernel/power/process.c index addbbe5531bc..7e426459e60a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -22,16 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | #define TIMEOUT (20 * HZ) | 23 | #define TIMEOUT (20 * HZ) |
24 | 24 | ||
25 | static inline int freezable(struct task_struct * p) | 25 | static int try_to_freeze_tasks(bool user_only) |
26 | { | ||
27 | if ((p == current) || | ||
28 | (p->flags & PF_NOFREEZE) || | ||
29 | (p->exit_state != 0)) | ||
30 | return 0; | ||
31 | return 1; | ||
32 | } | ||
33 | |||
34 | static int try_to_freeze_tasks(bool sig_only) | ||
35 | { | 26 | { |
36 | struct task_struct *g, *p; | 27 | struct task_struct *g, *p; |
37 | unsigned long end_time; | 28 | unsigned long end_time; |
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only) | |||
46 | 37 | ||
47 | end_time = jiffies + TIMEOUT; | 38 | end_time = jiffies + TIMEOUT; |
48 | 39 | ||
49 | if (!sig_only) | 40 | if (!user_only) |
50 | freeze_workqueues_begin(); | 41 | freeze_workqueues_begin(); |
51 | 42 | ||
52 | while (true) { | 43 | while (true) { |
53 | todo = 0; | 44 | todo = 0; |
54 | read_lock(&tasklist_lock); | 45 | read_lock(&tasklist_lock); |
55 | do_each_thread(g, p) { | 46 | do_each_thread(g, p) { |
56 | if (frozen(p) || !freezable(p)) | 47 | if (p == current || !freeze_task(p)) |
57 | continue; | ||
58 | |||
59 | if (!freeze_task(p, sig_only)) | ||
60 | continue; | 48 | continue; |
61 | 49 | ||
62 | /* | 50 | /* |
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
77 | } while_each_thread(g, p); | 65 | } while_each_thread(g, p); |
78 | read_unlock(&tasklist_lock); | 66 | read_unlock(&tasklist_lock); |
79 | 67 | ||
80 | if (!sig_only) { | 68 | if (!user_only) { |
81 | wq_busy = freeze_workqueues_busy(); | 69 | wq_busy = freeze_workqueues_busy(); |
82 | todo += wq_busy; | 70 | todo += wq_busy; |
83 | } | 71 | } |
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only) | |||
103 | elapsed_csecs = elapsed_csecs64; | 91 | elapsed_csecs = elapsed_csecs64; |
104 | 92 | ||
105 | if (todo) { | 93 | if (todo) { |
106 | /* This does not unfreeze processes that are already frozen | ||
107 | * (we have slightly ugly calling convention in that respect, | ||
108 | * and caller must call thaw_processes() if something fails), | ||
109 | * but it cleans up leftover PF_FREEZE requests. | ||
110 | */ | ||
111 | printk("\n"); | 94 | printk("\n"); |
112 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " | 95 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " |
113 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 96 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only) | |||
115 | elapsed_csecs / 100, elapsed_csecs % 100, | 98 | elapsed_csecs / 100, elapsed_csecs % 100, |
116 | todo - wq_busy, wq_busy); | 99 | todo - wq_busy, wq_busy); |
117 | 100 | ||
118 | thaw_workqueues(); | ||
119 | |||
120 | read_lock(&tasklist_lock); | 101 | read_lock(&tasklist_lock); |
121 | do_each_thread(g, p) { | 102 | do_each_thread(g, p) { |
122 | task_lock(p); | 103 | if (!wakeup && !freezer_should_skip(p) && |
123 | if (!wakeup && freezing(p) && !freezer_should_skip(p)) | 104 | p != current && freezing(p) && !frozen(p)) |
124 | sched_show_task(p); | 105 | sched_show_task(p); |
125 | cancel_freezing(p); | ||
126 | task_unlock(p); | ||
127 | } while_each_thread(g, p); | 106 | } while_each_thread(g, p); |
128 | read_unlock(&tasklist_lock); | 107 | read_unlock(&tasklist_lock); |
129 | } else { | 108 | } else { |
@@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only) | |||
136 | 115 | ||
137 | /** | 116 | /** |
138 | * freeze_processes - Signal user space processes to enter the refrigerator. | 117 | * freeze_processes - Signal user space processes to enter the refrigerator. |
118 | * | ||
119 | * On success, returns 0. On failure, -errno and system is fully thawed. | ||
139 | */ | 120 | */ |
140 | int freeze_processes(void) | 121 | int freeze_processes(void) |
141 | { | 122 | { |
142 | int error; | 123 | int error; |
143 | 124 | ||
125 | if (!pm_freezing) | ||
126 | atomic_inc(&system_freezing_cnt); | ||
127 | |||
144 | printk("Freezing user space processes ... "); | 128 | printk("Freezing user space processes ... "); |
129 | pm_freezing = true; | ||
145 | error = try_to_freeze_tasks(true); | 130 | error = try_to_freeze_tasks(true); |
146 | if (!error) { | 131 | if (!error) { |
147 | printk("done."); | 132 | printk("done."); |
@@ -150,17 +135,25 @@ int freeze_processes(void) | |||
150 | printk("\n"); | 135 | printk("\n"); |
151 | BUG_ON(in_atomic()); | 136 | BUG_ON(in_atomic()); |
152 | 137 | ||
138 | if (error) | ||
139 | thaw_processes(); | ||
153 | return error; | 140 | return error; |
154 | } | 141 | } |
155 | 142 | ||
156 | /** | 143 | /** |
157 | * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. | 144 | * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. |
145 | * | ||
146 | * On success, returns 0. On failure, -errno and only the kernel threads are | ||
147 | * thawed, so as to give a chance to the caller to do additional cleanups | ||
148 | * (if any) before thawing the userspace tasks. So, it is the responsibility | ||
149 | * of the caller to thaw the userspace tasks, when the time is right. | ||
158 | */ | 150 | */ |
159 | int freeze_kernel_threads(void) | 151 | int freeze_kernel_threads(void) |
160 | { | 152 | { |
161 | int error; | 153 | int error; |
162 | 154 | ||
163 | printk("Freezing remaining freezable tasks ... "); | 155 | printk("Freezing remaining freezable tasks ... "); |
156 | pm_nosig_freezing = true; | ||
164 | error = try_to_freeze_tasks(false); | 157 | error = try_to_freeze_tasks(false); |
165 | if (!error) | 158 | if (!error) |
166 | printk("done."); | 159 | printk("done."); |
@@ -168,38 +161,52 @@ int freeze_kernel_threads(void) | |||
168 | printk("\n"); | 161 | printk("\n"); |
169 | BUG_ON(in_atomic()); | 162 | BUG_ON(in_atomic()); |
170 | 163 | ||
164 | if (error) | ||
165 | thaw_kernel_threads(); | ||
171 | return error; | 166 | return error; |
172 | } | 167 | } |
173 | 168 | ||
174 | static void thaw_tasks(bool nosig_only) | 169 | void thaw_processes(void) |
175 | { | 170 | { |
176 | struct task_struct *g, *p; | 171 | struct task_struct *g, *p; |
177 | 172 | ||
178 | read_lock(&tasklist_lock); | 173 | if (pm_freezing) |
179 | do_each_thread(g, p) { | 174 | atomic_dec(&system_freezing_cnt); |
180 | if (!freezable(p)) | 175 | pm_freezing = false; |
181 | continue; | 176 | pm_nosig_freezing = false; |
182 | 177 | ||
183 | if (nosig_only && should_send_signal(p)) | 178 | oom_killer_enable(); |
184 | continue; | 179 | |
180 | printk("Restarting tasks ... "); | ||
185 | 181 | ||
186 | if (cgroup_freezing_or_frozen(p)) | 182 | thaw_workqueues(); |
187 | continue; | ||
188 | 183 | ||
189 | thaw_process(p); | 184 | read_lock(&tasklist_lock); |
185 | do_each_thread(g, p) { | ||
186 | __thaw_task(p); | ||
190 | } while_each_thread(g, p); | 187 | } while_each_thread(g, p); |
191 | read_unlock(&tasklist_lock); | 188 | read_unlock(&tasklist_lock); |
189 | |||
190 | schedule(); | ||
191 | printk("done.\n"); | ||
192 | } | 192 | } |
193 | 193 | ||
194 | void thaw_processes(void) | 194 | void thaw_kernel_threads(void) |
195 | { | 195 | { |
196 | oom_killer_enable(); | 196 | struct task_struct *g, *p; |
197 | |||
198 | pm_nosig_freezing = false; | ||
199 | printk("Restarting kernel threads ... "); | ||
197 | 200 | ||
198 | printk("Restarting tasks ... "); | ||
199 | thaw_workqueues(); | 201 | thaw_workqueues(); |
200 | thaw_tasks(true); | 202 | |
201 | thaw_tasks(false); | 203 | read_lock(&tasklist_lock); |
204 | do_each_thread(g, p) { | ||
205 | if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) | ||
206 | __thaw_task(p); | ||
207 | } while_each_thread(g, p); | ||
208 | read_unlock(&tasklist_lock); | ||
209 | |||
202 | schedule(); | 210 | schedule(); |
203 | printk("done.\n"); | 211 | printk("done.\n"); |
204 | } | 212 | } |
205 | |||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c1441392..6a768e537001 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone) | |||
812 | unsigned int res; | 812 | unsigned int res; |
813 | 813 | ||
814 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); | 814 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); |
815 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); | 815 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), |
816 | LINKED_PAGE_DATA_SIZE); | ||
816 | return 2 * res; | 817 | return 2 * res; |
817 | } | 818 | } |
818 | 819 | ||
@@ -858,6 +859,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) | |||
858 | PageReserved(page)) | 859 | PageReserved(page)) |
859 | return NULL; | 860 | return NULL; |
860 | 861 | ||
862 | if (page_is_guard(page)) | ||
863 | return NULL; | ||
864 | |||
861 | return page; | 865 | return page; |
862 | } | 866 | } |
863 | 867 | ||
@@ -920,6 +924,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) | |||
920 | && (!kernel_page_present(page) || pfn_is_nosave(pfn))) | 924 | && (!kernel_page_present(page) || pfn_is_nosave(pfn))) |
921 | return NULL; | 925 | return NULL; |
922 | 926 | ||
927 | if (page_is_guard(page)) | ||
928 | return NULL; | ||
929 | |||
923 | return page; | 930 | return page; |
924 | } | 931 | } |
925 | 932 | ||
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4953dc054c53..4fd51beed879 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops; | |||
42 | */ | 42 | */ |
43 | void suspend_set_ops(const struct platform_suspend_ops *ops) | 43 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
44 | { | 44 | { |
45 | mutex_lock(&pm_mutex); | 45 | lock_system_sleep(); |
46 | suspend_ops = ops; | 46 | suspend_ops = ops; |
47 | mutex_unlock(&pm_mutex); | 47 | unlock_system_sleep(); |
48 | } | 48 | } |
49 | EXPORT_SYMBOL_GPL(suspend_set_ops); | 49 | EXPORT_SYMBOL_GPL(suspend_set_ops); |
50 | 50 | ||
@@ -106,13 +106,11 @@ static int suspend_prepare(void) | |||
106 | goto Finish; | 106 | goto Finish; |
107 | 107 | ||
108 | error = suspend_freeze_processes(); | 108 | error = suspend_freeze_processes(); |
109 | if (error) { | 109 | if (!error) |
110 | suspend_stats.failed_freeze++; | ||
111 | dpm_save_failed_step(SUSPEND_FREEZE); | ||
112 | } else | ||
113 | return 0; | 110 | return 0; |
114 | 111 | ||
115 | suspend_thaw_processes(); | 112 | suspend_stats.failed_freeze++; |
113 | dpm_save_failed_step(SUSPEND_FREEZE); | ||
116 | usermodehelper_enable(); | 114 | usermodehelper_enable(); |
117 | Finish: | 115 | Finish: |
118 | pm_notifier_call_chain(PM_POST_SUSPEND); | 116 | pm_notifier_call_chain(PM_POST_SUSPEND); |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11a594c4ba25..8742fd013a94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/bitops.h> | 18 | #include <linux/bitops.h> |
19 | #include <linux/genhd.h> | 19 | #include <linux/genhd.h> |
20 | #include <linux/device.h> | 20 | #include <linux/device.h> |
21 | #include <linux/buffer_head.h> | ||
22 | #include <linux/bio.h> | 21 | #include <linux/bio.h> |
23 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
24 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
@@ -774,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags) | |||
774 | 773 | ||
775 | pr_debug("PM: Free swap pages: %u\n", free_swap); | 774 | pr_debug("PM: Free swap pages: %u\n", free_swap); |
776 | 775 | ||
777 | required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? | 776 | required = PAGES_FOR_IO + nr_pages; |
778 | nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); | ||
779 | return free_swap > required; | 777 | return free_swap > required; |
780 | } | 778 | } |
781 | 779 | ||
@@ -803,10 +801,12 @@ int swsusp_write(unsigned int flags) | |||
803 | printk(KERN_ERR "PM: Cannot get swap writer\n"); | 801 | printk(KERN_ERR "PM: Cannot get swap writer\n"); |
804 | return error; | 802 | return error; |
805 | } | 803 | } |
806 | if (!enough_swap(pages, flags)) { | 804 | if (flags & SF_NOCOMPRESS_MODE) { |
807 | printk(KERN_ERR "PM: Not enough free swap\n"); | 805 | if (!enough_swap(pages, flags)) { |
808 | error = -ENOSPC; | 806 | printk(KERN_ERR "PM: Not enough free swap\n"); |
809 | goto out_finish; | 807 | error = -ENOSPC; |
808 | goto out_finish; | ||
809 | } | ||
810 | } | 810 | } |
811 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | 811 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); |
812 | error = snapshot_read_next(&snapshot); | 812 | error = snapshot_read_next(&snapshot); |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 6d8f535c2b88..3e100075b13c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/swapops.h> | 21 | #include <linux/swapops.h> |
22 | #include <linux/pm.h> | 22 | #include <linux/pm.h> |
23 | #include <linux/fs.h> | 23 | #include <linux/fs.h> |
24 | #include <linux/compat.h> | ||
24 | #include <linux/console.h> | 25 | #include <linux/console.h> |
25 | #include <linux/cpu.h> | 26 | #include <linux/cpu.h> |
26 | #include <linux/freezer.h> | 27 | #include <linux/freezer.h> |
@@ -30,28 +31,6 @@ | |||
30 | 31 | ||
31 | #include "power.h" | 32 | #include "power.h" |
32 | 33 | ||
33 | /* | ||
34 | * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and | ||
35 | * will be removed in the future. They are only preserved here for | ||
36 | * compatibility with existing userland utilities. | ||
37 | */ | ||
38 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | ||
39 | #define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) | ||
40 | |||
41 | #define PMOPS_PREPARE 1 | ||
42 | #define PMOPS_ENTER 2 | ||
43 | #define PMOPS_FINISH 3 | ||
44 | |||
45 | /* | ||
46 | * NOTE: The following ioctl definitions are wrong and have been replaced with | ||
47 | * correct ones. They are only preserved here for compatibility with existing | ||
48 | * userland utilities and will be removed in the future. | ||
49 | */ | ||
50 | #define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) | ||
51 | #define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) | ||
52 | #define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) | ||
53 | #define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) | ||
54 | |||
55 | 34 | ||
56 | #define SNAPSHOT_MINOR 231 | 35 | #define SNAPSHOT_MINOR 231 |
57 | 36 | ||
@@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
71 | struct snapshot_data *data; | 50 | struct snapshot_data *data; |
72 | int error; | 51 | int error; |
73 | 52 | ||
74 | mutex_lock(&pm_mutex); | 53 | lock_system_sleep(); |
75 | 54 | ||
76 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { | 55 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
77 | error = -EBUSY; | 56 | error = -EBUSY; |
@@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
123 | data->platform_support = 0; | 102 | data->platform_support = 0; |
124 | 103 | ||
125 | Unlock: | 104 | Unlock: |
126 | mutex_unlock(&pm_mutex); | 105 | unlock_system_sleep(); |
127 | 106 | ||
128 | return error; | 107 | return error; |
129 | } | 108 | } |
@@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
132 | { | 111 | { |
133 | struct snapshot_data *data; | 112 | struct snapshot_data *data; |
134 | 113 | ||
135 | mutex_lock(&pm_mutex); | 114 | lock_system_sleep(); |
136 | 115 | ||
137 | swsusp_free(); | 116 | swsusp_free(); |
138 | free_basic_memory_bitmaps(); | 117 | free_basic_memory_bitmaps(); |
@@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
146 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 125 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
147 | atomic_inc(&snapshot_device_available); | 126 | atomic_inc(&snapshot_device_available); |
148 | 127 | ||
149 | mutex_unlock(&pm_mutex); | 128 | unlock_system_sleep(); |
150 | 129 | ||
151 | return 0; | 130 | return 0; |
152 | } | 131 | } |
@@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, | |||
158 | ssize_t res; | 137 | ssize_t res; |
159 | loff_t pg_offp = *offp & ~PAGE_MASK; | 138 | loff_t pg_offp = *offp & ~PAGE_MASK; |
160 | 139 | ||
161 | mutex_lock(&pm_mutex); | 140 | lock_system_sleep(); |
162 | 141 | ||
163 | data = filp->private_data; | 142 | data = filp->private_data; |
164 | if (!data->ready) { | 143 | if (!data->ready) { |
@@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, | |||
179 | *offp += res; | 158 | *offp += res; |
180 | 159 | ||
181 | Unlock: | 160 | Unlock: |
182 | mutex_unlock(&pm_mutex); | 161 | unlock_system_sleep(); |
183 | 162 | ||
184 | return res; | 163 | return res; |
185 | } | 164 | } |
@@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
191 | ssize_t res; | 170 | ssize_t res; |
192 | loff_t pg_offp = *offp & ~PAGE_MASK; | 171 | loff_t pg_offp = *offp & ~PAGE_MASK; |
193 | 172 | ||
194 | mutex_lock(&pm_mutex); | 173 | lock_system_sleep(); |
195 | 174 | ||
196 | data = filp->private_data; | 175 | data = filp->private_data; |
197 | 176 | ||
@@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
208 | if (res > 0) | 187 | if (res > 0) |
209 | *offp += res; | 188 | *offp += res; |
210 | unlock: | 189 | unlock: |
211 | mutex_unlock(&pm_mutex); | 190 | unlock_system_sleep(); |
212 | 191 | ||
213 | return res; | 192 | return res; |
214 | } | 193 | } |
215 | 194 | ||
216 | static void snapshot_deprecated_ioctl(unsigned int cmd) | ||
217 | { | ||
218 | if (printk_ratelimit()) | ||
219 | printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " | ||
220 | "be removed soon, update your suspend-to-disk " | ||
221 | "utilities\n", | ||
222 | __builtin_return_address(0), cmd); | ||
223 | } | ||
224 | |||
225 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, | 195 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, |
226 | unsigned long arg) | 196 | unsigned long arg) |
227 | { | 197 | { |
@@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
257 | break; | 227 | break; |
258 | 228 | ||
259 | error = freeze_processes(); | 229 | error = freeze_processes(); |
260 | if (error) { | 230 | if (error) |
261 | thaw_processes(); | ||
262 | usermodehelper_enable(); | 231 | usermodehelper_enable(); |
263 | } | 232 | else |
264 | if (!error) | ||
265 | data->frozen = 1; | 233 | data->frozen = 1; |
266 | break; | 234 | break; |
267 | 235 | ||
@@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
274 | data->frozen = 0; | 242 | data->frozen = 0; |
275 | break; | 243 | break; |
276 | 244 | ||
277 | case SNAPSHOT_ATOMIC_SNAPSHOT: | ||
278 | snapshot_deprecated_ioctl(cmd); | ||
279 | case SNAPSHOT_CREATE_IMAGE: | 245 | case SNAPSHOT_CREATE_IMAGE: |
280 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | 246 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { |
281 | error = -EPERM; | 247 | error = -EPERM; |
@@ -283,10 +249,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
283 | } | 249 | } |
284 | pm_restore_gfp_mask(); | 250 | pm_restore_gfp_mask(); |
285 | error = hibernation_snapshot(data->platform_support); | 251 | error = hibernation_snapshot(data->platform_support); |
286 | if (!error) | 252 | if (error) { |
253 | thaw_kernel_threads(); | ||
254 | } else { | ||
287 | error = put_user(in_suspend, (int __user *)arg); | 255 | error = put_user(in_suspend, (int __user *)arg); |
288 | if (!error) | 256 | if (!error && !freezer_test_done) |
289 | data->ready = 1; | 257 | data->ready = 1; |
258 | if (freezer_test_done) { | ||
259 | freezer_test_done = false; | ||
260 | thaw_kernel_threads(); | ||
261 | } | ||
262 | } | ||
290 | break; | 263 | break; |
291 | 264 | ||
292 | case SNAPSHOT_ATOMIC_RESTORE: | 265 | case SNAPSHOT_ATOMIC_RESTORE: |
@@ -303,10 +276,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
303 | swsusp_free(); | 276 | swsusp_free(); |
304 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | 277 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); |
305 | data->ready = 0; | 278 | data->ready = 0; |
279 | /* | ||
280 | * It is necessary to thaw kernel threads here, because | ||
281 | * SNAPSHOT_CREATE_IMAGE may be invoked directly after | ||
282 | * SNAPSHOT_FREE. In that case, if kernel threads were not | ||
283 | * thawed, the preallocation of memory carried out by | ||
284 | * hibernation_snapshot() might run into problems (i.e. it | ||
285 | * might fail or even deadlock). | ||
286 | */ | ||
287 | thaw_kernel_threads(); | ||
306 | break; | 288 | break; |
307 | 289 | ||
308 | case SNAPSHOT_SET_IMAGE_SIZE: | ||
309 | snapshot_deprecated_ioctl(cmd); | ||
310 | case SNAPSHOT_PREF_IMAGE_SIZE: | 290 | case SNAPSHOT_PREF_IMAGE_SIZE: |
311 | image_size = arg; | 291 | image_size = arg; |
312 | break; | 292 | break; |
@@ -321,16 +301,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
321 | error = put_user(size, (loff_t __user *)arg); | 301 | error = put_user(size, (loff_t __user *)arg); |
322 | break; | 302 | break; |
323 | 303 | ||
324 | case SNAPSHOT_AVAIL_SWAP: | ||
325 | snapshot_deprecated_ioctl(cmd); | ||
326 | case SNAPSHOT_AVAIL_SWAP_SIZE: | 304 | case SNAPSHOT_AVAIL_SWAP_SIZE: |
327 | size = count_swap_pages(data->swap, 1); | 305 | size = count_swap_pages(data->swap, 1); |
328 | size <<= PAGE_SHIFT; | 306 | size <<= PAGE_SHIFT; |
329 | error = put_user(size, (loff_t __user *)arg); | 307 | error = put_user(size, (loff_t __user *)arg); |
330 | break; | 308 | break; |
331 | 309 | ||
332 | case SNAPSHOT_GET_SWAP_PAGE: | ||
333 | snapshot_deprecated_ioctl(cmd); | ||
334 | case SNAPSHOT_ALLOC_SWAP_PAGE: | 310 | case SNAPSHOT_ALLOC_SWAP_PAGE: |
335 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | 311 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { |
336 | error = -ENODEV; | 312 | error = -ENODEV; |
@@ -353,27 +329,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
353 | free_all_swap_pages(data->swap); | 329 | free_all_swap_pages(data->swap); |
354 | break; | 330 | break; |
355 | 331 | ||
356 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ | ||
357 | snapshot_deprecated_ioctl(cmd); | ||
358 | if (!swsusp_swap_in_use()) { | ||
359 | /* | ||
360 | * User space encodes device types as two-byte values, | ||
361 | * so we need to recode them | ||
362 | */ | ||
363 | if (old_decode_dev(arg)) { | ||
364 | data->swap = swap_type_of(old_decode_dev(arg), | ||
365 | 0, NULL); | ||
366 | if (data->swap < 0) | ||
367 | error = -ENODEV; | ||
368 | } else { | ||
369 | data->swap = -1; | ||
370 | error = -EINVAL; | ||
371 | } | ||
372 | } else { | ||
373 | error = -EPERM; | ||
374 | } | ||
375 | break; | ||
376 | |||
377 | case SNAPSHOT_S2RAM: | 332 | case SNAPSHOT_S2RAM: |
378 | if (!data->frozen) { | 333 | if (!data->frozen) { |
379 | error = -EPERM; | 334 | error = -EPERM; |
@@ -396,33 +351,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
396 | error = hibernation_platform_enter(); | 351 | error = hibernation_platform_enter(); |
397 | break; | 352 | break; |
398 | 353 | ||
399 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ | ||
400 | snapshot_deprecated_ioctl(cmd); | ||
401 | error = -EINVAL; | ||
402 | |||
403 | switch (arg) { | ||
404 | |||
405 | case PMOPS_PREPARE: | ||
406 | data->platform_support = 1; | ||
407 | error = 0; | ||
408 | break; | ||
409 | |||
410 | case PMOPS_ENTER: | ||
411 | if (data->platform_support) | ||
412 | error = hibernation_platform_enter(); | ||
413 | break; | ||
414 | |||
415 | case PMOPS_FINISH: | ||
416 | if (data->platform_support) | ||
417 | error = 0; | ||
418 | break; | ||
419 | |||
420 | default: | ||
421 | printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); | ||
422 | |||
423 | } | ||
424 | break; | ||
425 | |||
426 | case SNAPSHOT_SET_SWAP_AREA: | 354 | case SNAPSHOT_SET_SWAP_AREA: |
427 | if (swsusp_swap_in_use()) { | 355 | if (swsusp_swap_in_use()) { |
428 | error = -EPERM; | 356 | error = -EPERM; |
@@ -464,6 +392,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
464 | return error; | 392 | return error; |
465 | } | 393 | } |
466 | 394 | ||
395 | #ifdef CONFIG_COMPAT | ||
396 | |||
397 | struct compat_resume_swap_area { | ||
398 | compat_loff_t offset; | ||
399 | u32 dev; | ||
400 | } __packed; | ||
401 | |||
402 | static long | ||
403 | snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
404 | { | ||
405 | BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); | ||
406 | |||
407 | switch (cmd) { | ||
408 | case SNAPSHOT_GET_IMAGE_SIZE: | ||
409 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
410 | case SNAPSHOT_ALLOC_SWAP_PAGE: { | ||
411 | compat_loff_t __user *uoffset = compat_ptr(arg); | ||
412 | loff_t offset; | ||
413 | mm_segment_t old_fs; | ||
414 | int err; | ||
415 | |||
416 | old_fs = get_fs(); | ||
417 | set_fs(KERNEL_DS); | ||
418 | err = snapshot_ioctl(file, cmd, (unsigned long) &offset); | ||
419 | set_fs(old_fs); | ||
420 | if (!err && put_user(offset, uoffset)) | ||
421 | err = -EFAULT; | ||
422 | return err; | ||
423 | } | ||
424 | |||
425 | case SNAPSHOT_CREATE_IMAGE: | ||
426 | return snapshot_ioctl(file, cmd, | ||
427 | (unsigned long) compat_ptr(arg)); | ||
428 | |||
429 | case SNAPSHOT_SET_SWAP_AREA: { | ||
430 | struct compat_resume_swap_area __user *u_swap_area = | ||
431 | compat_ptr(arg); | ||
432 | struct resume_swap_area swap_area; | ||
433 | mm_segment_t old_fs; | ||
434 | int err; | ||
435 | |||
436 | err = get_user(swap_area.offset, &u_swap_area->offset); | ||
437 | err |= get_user(swap_area.dev, &u_swap_area->dev); | ||
438 | if (err) | ||
439 | return -EFAULT; | ||
440 | old_fs = get_fs(); | ||
441 | set_fs(KERNEL_DS); | ||
442 | err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, | ||
443 | (unsigned long) &swap_area); | ||
444 | set_fs(old_fs); | ||
445 | return err; | ||
446 | } | ||
447 | |||
448 | default: | ||
449 | return snapshot_ioctl(file, cmd, arg); | ||
450 | } | ||
451 | } | ||
452 | |||
453 | #endif /* CONFIG_COMPAT */ | ||
454 | |||
467 | static const struct file_operations snapshot_fops = { | 455 | static const struct file_operations snapshot_fops = { |
468 | .open = snapshot_open, | 456 | .open = snapshot_open, |
469 | .release = snapshot_release, | 457 | .release = snapshot_release, |
@@ -471,6 +459,9 @@ static const struct file_operations snapshot_fops = { | |||
471 | .write = snapshot_write, | 459 | .write = snapshot_write, |
472 | .llseek = no_llseek, | 460 | .llseek = no_llseek, |
473 | .unlocked_ioctl = snapshot_ioctl, | 461 | .unlocked_ioctl = snapshot_ioctl, |
462 | #ifdef CONFIG_COMPAT | ||
463 | .compat_ioctl = snapshot_compat_ioctl, | ||
464 | #endif | ||
474 | }; | 465 | }; |
475 | 466 | ||
476 | static struct miscdevice snapshot_device = { | 467 | static struct miscdevice snapshot_device = { |
diff --git a/kernel/printk.c b/kernel/printk.c index 1455a0d4eedd..13c0a1143f49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early) | |||
199 | unsigned long mem; | 199 | unsigned long mem; |
200 | 200 | ||
201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); | 201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); |
202 | if (mem == MEMBLOCK_ERROR) | 202 | if (!mem) |
203 | return; | 203 | return; |
204 | new_log_buf = __va(mem); | 204 | new_log_buf = __va(mem); |
205 | } else { | 205 | } else { |
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end) | |||
521 | } | 521 | } |
522 | } | 522 | } |
523 | 523 | ||
524 | static int __read_mostly ignore_loglevel; | 524 | static bool __read_mostly ignore_loglevel; |
525 | 525 | ||
526 | static int __init ignore_loglevel_setup(char *str) | 526 | static int __init ignore_loglevel_setup(char *str) |
527 | { | 527 | { |
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str) | |||
532 | } | 532 | } |
533 | 533 | ||
534 | early_param("ignore_loglevel", ignore_loglevel_setup); | 534 | early_param("ignore_loglevel", ignore_loglevel_setup); |
535 | module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); | 535 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); |
536 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | 536 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" |
537 | "print all kernel messages to the console."); | 537 | "print all kernel messages to the console."); |
538 | 538 | ||
@@ -688,6 +688,7 @@ static void zap_locks(void) | |||
688 | 688 | ||
689 | oops_timestamp = jiffies; | 689 | oops_timestamp = jiffies; |
690 | 690 | ||
691 | debug_locks_off(); | ||
691 | /* If a crash is occurring, make sure we can't deadlock */ | 692 | /* If a crash is occurring, make sure we can't deadlock */ |
692 | raw_spin_lock_init(&logbuf_lock); | 693 | raw_spin_lock_init(&logbuf_lock); |
693 | /* And make sure that we print immediately */ | 694 | /* And make sure that we print immediately */ |
@@ -695,9 +696,9 @@ static void zap_locks(void) | |||
695 | } | 696 | } |
696 | 697 | ||
697 | #if defined(CONFIG_PRINTK_TIME) | 698 | #if defined(CONFIG_PRINTK_TIME) |
698 | static int printk_time = 1; | 699 | static bool printk_time = 1; |
699 | #else | 700 | #else |
700 | static int printk_time = 0; | 701 | static bool printk_time = 0; |
701 | #endif | 702 | #endif |
702 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 703 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
703 | 704 | ||
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
840 | boot_delay_msec(); | 841 | boot_delay_msec(); |
841 | printk_delay(); | 842 | printk_delay(); |
842 | 843 | ||
843 | preempt_disable(); | ||
844 | /* This stops the holder of console_sem just where we want him */ | 844 | /* This stops the holder of console_sem just where we want him */ |
845 | raw_local_irq_save(flags); | 845 | local_irq_save(flags); |
846 | this_cpu = smp_processor_id(); | 846 | this_cpu = smp_processor_id(); |
847 | 847 | ||
848 | /* | 848 | /* |
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | * recursion and return - but flag the recursion so that | 856 | * recursion and return - but flag the recursion so that |
857 | * it can be printed at the next appropriate moment: | 857 | * it can be printed at the next appropriate moment: |
858 | */ | 858 | */ |
859 | if (!oops_in_progress) { | 859 | if (!oops_in_progress && !lockdep_recursing(current)) { |
860 | recursion_bug = 1; | 860 | recursion_bug = 1; |
861 | goto out_restore_irqs; | 861 | goto out_restore_irqs; |
862 | } | 862 | } |
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
962 | 962 | ||
963 | lockdep_on(); | 963 | lockdep_on(); |
964 | out_restore_irqs: | 964 | out_restore_irqs: |
965 | raw_local_irq_restore(flags); | 965 | local_irq_restore(flags); |
966 | 966 | ||
967 | preempt_enable(); | ||
968 | return printed_len; | 967 | return printed_len; |
969 | } | 968 | } |
970 | EXPORT_SYMBOL(printk); | 969 | EXPORT_SYMBOL(printk); |
@@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
1099 | return -1; | 1098 | return -1; |
1100 | } | 1099 | } |
1101 | 1100 | ||
1102 | int console_suspend_enabled = 1; | 1101 | bool console_suspend_enabled = 1; |
1103 | EXPORT_SYMBOL(console_suspend_enabled); | 1102 | EXPORT_SYMBOL(console_suspend_enabled); |
1104 | 1103 | ||
1105 | static int __init console_suspend_disable(char *str) | 1104 | static int __init console_suspend_disable(char *str) |
@@ -1293,10 +1292,11 @@ again: | |||
1293 | raw_spin_lock(&logbuf_lock); | 1292 | raw_spin_lock(&logbuf_lock); |
1294 | if (con_start != log_end) | 1293 | if (con_start != log_end) |
1295 | retry = 1; | 1294 | retry = 1; |
1295 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1296 | |||
1296 | if (retry && console_trylock()) | 1297 | if (retry && console_trylock()) |
1297 | goto again; | 1298 | goto again; |
1298 | 1299 | ||
1299 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1300 | if (wake_klogd) | 1300 | if (wake_klogd) |
1301 | wake_up_klogd(); | 1301 | wake_up_klogd(); |
1302 | } | 1302 | } |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 24d04477b257..00ab2ca5ed11 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child) | |||
96 | */ | 96 | */ |
97 | if (!(child->flags & PF_EXITING) && | 97 | if (!(child->flags & PF_EXITING) && |
98 | (child->signal->flags & SIGNAL_STOP_STOPPED || | 98 | (child->signal->flags & SIGNAL_STOP_STOPPED || |
99 | child->signal->group_stop_count)) | 99 | child->signal->group_stop_count)) { |
100 | child->jobctl |= JOBCTL_STOP_PENDING; | 100 | child->jobctl |= JOBCTL_STOP_PENDING; |
101 | 101 | ||
102 | /* | ||
103 | * This is only possible if this thread was cloned by the | ||
104 | * traced task running in the stopped group, set the signal | ||
105 | * for the future reports. | ||
106 | * FIXME: we should change ptrace_init_task() to handle this | ||
107 | * case. | ||
108 | */ | ||
109 | if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) | ||
110 | child->jobctl |= SIGSTOP; | ||
111 | } | ||
112 | |||
102 | /* | 113 | /* |
103 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick | 114 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick |
104 | * @child in the butt. Note that @resume should be used iff @child | 115 | * @child in the butt. Note that @resume should be used iff @child |
@@ -161,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state) | |||
161 | return ret; | 172 | return ret; |
162 | } | 173 | } |
163 | 174 | ||
175 | static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) | ||
176 | { | ||
177 | if (mode & PTRACE_MODE_NOAUDIT) | ||
178 | return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); | ||
179 | else | ||
180 | return has_ns_capability(current, ns, CAP_SYS_PTRACE); | ||
181 | } | ||
182 | |||
164 | int __ptrace_may_access(struct task_struct *task, unsigned int mode) | 183 | int __ptrace_may_access(struct task_struct *task, unsigned int mode) |
165 | { | 184 | { |
166 | const struct cred *cred = current_cred(), *tcred; | 185 | const struct cred *cred = current_cred(), *tcred; |
@@ -187,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
187 | cred->gid == tcred->sgid && | 206 | cred->gid == tcred->sgid && |
188 | cred->gid == tcred->gid)) | 207 | cred->gid == tcred->gid)) |
189 | goto ok; | 208 | goto ok; |
190 | if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) | 209 | if (ptrace_has_cap(tcred->user->user_ns, mode)) |
191 | goto ok; | 210 | goto ok; |
192 | rcu_read_unlock(); | 211 | rcu_read_unlock(); |
193 | return -EPERM; | 212 | return -EPERM; |
@@ -196,7 +215,7 @@ ok: | |||
196 | smp_rmb(); | 215 | smp_rmb(); |
197 | if (task->mm) | 216 | if (task->mm) |
198 | dumpable = get_dumpable(task->mm); | 217 | dumpable = get_dumpable(task->mm); |
199 | if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) | 218 | if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) |
200 | return -EPERM; | 219 | return -EPERM; |
201 | 220 | ||
202 | return security_ptrace_access_check(task, mode); | 221 | return security_ptrace_access_check(task, mode); |
@@ -266,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request, | |||
266 | task->ptrace = PT_PTRACED; | 285 | task->ptrace = PT_PTRACED; |
267 | if (seize) | 286 | if (seize) |
268 | task->ptrace |= PT_SEIZED; | 287 | task->ptrace |= PT_SEIZED; |
269 | if (task_ns_capable(task, CAP_SYS_PTRACE)) | 288 | if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) |
270 | task->ptrace |= PT_PTRACE_CAP; | 289 | task->ptrace |= PT_PTRACE_CAP; |
271 | 290 | ||
272 | __ptrace_link(task, current); | 291 | __ptrace_link(task, current); |
diff --git a/kernel/rcu.h b/kernel/rcu.h index f600868d550d..aa88baab5f78 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -30,6 +30,13 @@ | |||
30 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 30 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * Process-level increment to ->dynticks_nesting field. This allows for | ||
34 | * architectures that use half-interrupts and half-exceptions from | ||
35 | * process context. | ||
36 | */ | ||
37 | #define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) | ||
38 | |||
39 | /* | ||
33 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | 40 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally |
34 | * by call_rcu() and rcu callback execution, and are therefore not part of the | 41 | * by call_rcu() and rcu callback execution, and are therefore not part of the |
35 | * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. | 42 | * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c5b98e565aee..2bc4e135ff23 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void) | |||
93 | { | 93 | { |
94 | if (!debug_lockdep_rcu_enabled()) | 94 | if (!debug_lockdep_rcu_enabled()) |
95 | return 1; | 95 | return 1; |
96 | if (rcu_is_cpu_idle()) | ||
97 | return 0; | ||
96 | return in_softirq() || irqs_disabled(); | 98 | return in_softirq() || irqs_disabled(); |
97 | } | 99 | } |
98 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 100 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = { | |||
316 | }; | 318 | }; |
317 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 319 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
318 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 320 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
321 | |||
322 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | ||
323 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) | ||
324 | { | ||
325 | trace_rcu_torture_read(rcutorturename, rhp); | ||
326 | } | ||
327 | EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | ||
328 | #else | ||
329 | #define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) | ||
330 | #endif | ||
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 636af6d9c6e5..977296dca0a4 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head, | |||
53 | 53 | ||
54 | #include "rcutiny_plugin.h" | 54 | #include "rcutiny_plugin.h" |
55 | 55 | ||
56 | #ifdef CONFIG_NO_HZ | 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; |
57 | 57 | ||
58 | static long rcu_dynticks_nesting = 1; | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
59 | static void rcu_idle_enter_common(long long oldval) | ||
60 | { | ||
61 | if (rcu_dynticks_nesting) { | ||
62 | RCU_TRACE(trace_rcu_dyntick("--=", | ||
63 | oldval, rcu_dynticks_nesting)); | ||
64 | return; | ||
65 | } | ||
66 | RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); | ||
67 | if (!is_idle_task(current)) { | ||
68 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
69 | |||
70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | ||
71 | oldval, rcu_dynticks_nesting)); | ||
72 | ftrace_dump(DUMP_ALL); | ||
73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
74 | current->pid, current->comm, | ||
75 | idle->pid, idle->comm); /* must be idle task! */ | ||
76 | } | ||
77 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | ||
78 | } | ||
59 | 79 | ||
60 | /* | 80 | /* |
61 | * Enter dynticks-idle mode, which is an extended quiescent state | 81 | * Enter idle, which is an extended quiescent state if we have fully |
62 | * if we have fully entered that mode (i.e., if the new value of | 82 | * entered that mode (i.e., if the new value of dynticks_nesting is zero). |
63 | * dynticks_nesting is zero). | ||
64 | */ | 83 | */ |
65 | void rcu_enter_nohz(void) | 84 | void rcu_idle_enter(void) |
66 | { | 85 | { |
67 | if (--rcu_dynticks_nesting == 0) | 86 | unsigned long flags; |
68 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | 87 | long long oldval; |
88 | |||
89 | local_irq_save(flags); | ||
90 | oldval = rcu_dynticks_nesting; | ||
91 | rcu_dynticks_nesting = 0; | ||
92 | rcu_idle_enter_common(oldval); | ||
93 | local_irq_restore(flags); | ||
69 | } | 94 | } |
70 | 95 | ||
71 | /* | 96 | /* |
72 | * Exit dynticks-idle mode, so that we are no longer in an extended | 97 | * Exit an interrupt handler towards idle. |
73 | * quiescent state. | ||
74 | */ | 98 | */ |
75 | void rcu_exit_nohz(void) | 99 | void rcu_irq_exit(void) |
100 | { | ||
101 | unsigned long flags; | ||
102 | long long oldval; | ||
103 | |||
104 | local_irq_save(flags); | ||
105 | oldval = rcu_dynticks_nesting; | ||
106 | rcu_dynticks_nesting--; | ||
107 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | ||
108 | rcu_idle_enter_common(oldval); | ||
109 | local_irq_restore(flags); | ||
110 | } | ||
111 | |||
112 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ | ||
113 | static void rcu_idle_exit_common(long long oldval) | ||
76 | { | 114 | { |
115 | if (oldval) { | ||
116 | RCU_TRACE(trace_rcu_dyntick("++=", | ||
117 | oldval, rcu_dynticks_nesting)); | ||
118 | return; | ||
119 | } | ||
120 | RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); | ||
121 | if (!is_idle_task(current)) { | ||
122 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
123 | |||
124 | RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", | ||
125 | oldval, rcu_dynticks_nesting)); | ||
126 | ftrace_dump(DUMP_ALL); | ||
127 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
128 | current->pid, current->comm, | ||
129 | idle->pid, idle->comm); /* must be idle task! */ | ||
130 | } | ||
131 | } | ||
132 | |||
133 | /* | ||
134 | * Exit idle, so that we are no longer in an extended quiescent state. | ||
135 | */ | ||
136 | void rcu_idle_exit(void) | ||
137 | { | ||
138 | unsigned long flags; | ||
139 | long long oldval; | ||
140 | |||
141 | local_irq_save(flags); | ||
142 | oldval = rcu_dynticks_nesting; | ||
143 | WARN_ON_ONCE(oldval != 0); | ||
144 | rcu_dynticks_nesting = DYNTICK_TASK_NESTING; | ||
145 | rcu_idle_exit_common(oldval); | ||
146 | local_irq_restore(flags); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Enter an interrupt handler, moving away from idle. | ||
151 | */ | ||
152 | void rcu_irq_enter(void) | ||
153 | { | ||
154 | unsigned long flags; | ||
155 | long long oldval; | ||
156 | |||
157 | local_irq_save(flags); | ||
158 | oldval = rcu_dynticks_nesting; | ||
77 | rcu_dynticks_nesting++; | 159 | rcu_dynticks_nesting++; |
160 | WARN_ON_ONCE(rcu_dynticks_nesting == 0); | ||
161 | rcu_idle_exit_common(oldval); | ||
162 | local_irq_restore(flags); | ||
163 | } | ||
164 | |||
165 | #ifdef CONFIG_PROVE_RCU | ||
166 | |||
167 | /* | ||
168 | * Test whether RCU thinks that the current CPU is idle. | ||
169 | */ | ||
170 | int rcu_is_cpu_idle(void) | ||
171 | { | ||
172 | return !rcu_dynticks_nesting; | ||
78 | } | 173 | } |
174 | EXPORT_SYMBOL(rcu_is_cpu_idle); | ||
175 | |||
176 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
79 | 177 | ||
80 | #endif /* #ifdef CONFIG_NO_HZ */ | 178 | /* |
179 | * Test whether the current CPU was interrupted from idle. Nested | ||
180 | * interrupts don't count, we must be running at the first interrupt | ||
181 | * level. | ||
182 | */ | ||
183 | int rcu_is_cpu_rrupt_from_idle(void) | ||
184 | { | ||
185 | return rcu_dynticks_nesting <= 0; | ||
186 | } | ||
81 | 187 | ||
82 | /* | 188 | /* |
83 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). | 189 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu) | |||
126 | 232 | ||
127 | /* | 233 | /* |
128 | * Check to see if the scheduling-clock interrupt came from an extended | 234 | * Check to see if the scheduling-clock interrupt came from an extended |
129 | * quiescent state, and, if so, tell RCU about it. | 235 | * quiescent state, and, if so, tell RCU about it. This function must |
236 | * be called from hardirq context. It is normally called from the | ||
237 | * scheduling-clock interrupt. | ||
130 | */ | 238 | */ |
131 | void rcu_check_callbacks(int cpu, int user) | 239 | void rcu_check_callbacks(int cpu, int user) |
132 | { | 240 | { |
133 | if (user || | 241 | if (user || rcu_is_cpu_rrupt_from_idle()) |
134 | (idle_cpu(cpu) && | ||
135 | !in_softirq() && | ||
136 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) | ||
137 | rcu_sched_qs(cpu); | 242 | rcu_sched_qs(cpu); |
138 | else if (!in_softirq()) | 243 | else if (!in_softirq()) |
139 | rcu_bh_qs(cpu); | 244 | rcu_bh_qs(cpu); |
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
154 | /* If no RCU callbacks ready to invoke, just return. */ | 259 | /* If no RCU callbacks ready to invoke, just return. */ |
155 | if (&rcp->rcucblist == rcp->donetail) { | 260 | if (&rcp->rcucblist == rcp->donetail) { |
156 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | 261 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); |
157 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); | 262 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, |
263 | ACCESS_ONCE(rcp->rcucblist), | ||
264 | need_resched(), | ||
265 | is_idle_task(current), | ||
266 | rcu_is_callbacks_kthread())); | ||
158 | return; | 267 | return; |
159 | } | 268 | } |
160 | 269 | ||
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
183 | RCU_TRACE(cb_count++); | 292 | RCU_TRACE(cb_count++); |
184 | } | 293 | } |
185 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 294 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
186 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); | 295 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), |
296 | is_idle_task(current), | ||
297 | rcu_is_callbacks_kthread())); | ||
187 | } | 298 | } |
188 | 299 | ||
189 | static void rcu_process_callbacks(struct softirq_action *unused) | 300 | static void rcu_process_callbacks(struct softirq_action *unused) |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 2b0484a5dc28..9cb1ae4aabdd 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -312,8 +312,8 @@ static int rcu_boost(void) | |||
312 | rt_mutex_lock(&mtx); | 312 | rt_mutex_lock(&mtx); |
313 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 313 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
314 | 314 | ||
315 | return rcu_preempt_ctrlblk.boost_tasks != NULL || | 315 | return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || |
316 | rcu_preempt_ctrlblk.exp_tasks != NULL; | 316 | ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; |
317 | } | 317 | } |
318 | 318 | ||
319 | /* | 319 | /* |
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void) | |||
885 | wake_up(&rcu_kthread_wq); | 885 | wake_up(&rcu_kthread_wq); |
886 | } | 886 | } |
887 | 887 | ||
888 | #ifdef CONFIG_RCU_TRACE | ||
889 | |||
890 | /* | ||
891 | * Is the current CPU running the RCU-callbacks kthread? | ||
892 | * Caller must have preemption disabled. | ||
893 | */ | ||
894 | static bool rcu_is_callbacks_kthread(void) | ||
895 | { | ||
896 | return rcu_kthread_task == current; | ||
897 | } | ||
898 | |||
899 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
900 | |||
888 | /* | 901 | /* |
889 | * This kthread invokes RCU callbacks whose grace periods have | 902 | * This kthread invokes RCU callbacks whose grace periods have |
890 | * elapsed. It is awakened as needed, and takes the place of the | 903 | * elapsed. It is awakened as needed, and takes the place of the |
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void) | |||
938 | raise_softirq(RCU_SOFTIRQ); | 951 | raise_softirq(RCU_SOFTIRQ); |
939 | } | 952 | } |
940 | 953 | ||
954 | #ifdef CONFIG_RCU_TRACE | ||
955 | |||
956 | /* | ||
957 | * There is no callback kthread, so this thread is never it. | ||
958 | */ | ||
959 | static bool rcu_is_callbacks_kthread(void) | ||
960 | { | ||
961 | return false; | ||
962 | } | ||
963 | |||
964 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
965 | |||
941 | void rcu_init(void) | 966 | void rcu_init(void) |
942 | { | 967 | { |
943 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 968 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 764825c2685c..a58ac285fc69 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -56,14 +56,16 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | |||
56 | static int nfakewriters = 4; /* # fake writer threads */ | 56 | static int nfakewriters = 4; /* # fake writer threads */ |
57 | static int stat_interval; /* Interval between stats, in seconds. */ | 57 | static int stat_interval; /* Interval between stats, in seconds. */ |
58 | /* Defaults to "only at end of test". */ | 58 | /* Defaults to "only at end of test". */ |
59 | static int verbose; /* Print more debug info. */ | 59 | static bool verbose; /* Print more debug info. */ |
60 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 60 | static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
63 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | ||
68 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | ||
67 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | 69 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ |
68 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | 70 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ |
69 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | 71 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ |
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444); | |||
91 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 93 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
92 | module_param(fqs_stutter, int, 0444); | 94 | module_param(fqs_stutter, int, 0444); |
93 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 95 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
96 | module_param(onoff_interval, int, 0444); | ||
97 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
98 | module_param(shutdown_secs, int, 0444); | ||
99 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | ||
94 | module_param(test_boost, int, 0444); | 100 | module_param(test_boost, int, 0444); |
95 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 101 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
96 | module_param(test_boost_interval, int, 0444); | 102 | module_param(test_boost_interval, int, 0444); |
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task; | |||
119 | static struct task_struct *stutter_task; | 125 | static struct task_struct *stutter_task; |
120 | static struct task_struct *fqs_task; | 126 | static struct task_struct *fqs_task; |
121 | static struct task_struct *boost_tasks[NR_CPUS]; | 127 | static struct task_struct *boost_tasks[NR_CPUS]; |
128 | static struct task_struct *shutdown_task; | ||
129 | #ifdef CONFIG_HOTPLUG_CPU | ||
130 | static struct task_struct *onoff_task; | ||
131 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
122 | 132 | ||
123 | #define RCU_TORTURE_PIPE_LEN 10 | 133 | #define RCU_TORTURE_PIPE_LEN 10 |
124 | 134 | ||
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror; | |||
149 | static long n_rcu_torture_boost_failure; | 159 | static long n_rcu_torture_boost_failure; |
150 | static long n_rcu_torture_boosts; | 160 | static long n_rcu_torture_boosts; |
151 | static long n_rcu_torture_timers; | 161 | static long n_rcu_torture_timers; |
162 | static long n_offline_attempts; | ||
163 | static long n_offline_successes; | ||
164 | static long n_online_attempts; | ||
165 | static long n_online_successes; | ||
152 | static struct list_head rcu_torture_removed; | 166 | static struct list_head rcu_torture_removed; |
153 | static cpumask_var_t shuffle_tmp_mask; | 167 | static cpumask_var_t shuffle_tmp_mask; |
154 | 168 | ||
@@ -160,6 +174,8 @@ static int stutter_pause_test; | |||
160 | #define RCUTORTURE_RUNNABLE_INIT 0 | 174 | #define RCUTORTURE_RUNNABLE_INIT 0 |
161 | #endif | 175 | #endif |
162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 176 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
177 | module_param(rcutorture_runnable, int, 0444); | ||
178 | MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); | ||
163 | 179 | ||
164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | 180 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
165 | #define rcu_can_boost() 1 | 181 | #define rcu_can_boost() 1 |
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | |||
167 | #define rcu_can_boost() 0 | 183 | #define rcu_can_boost() 0 |
168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | 184 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
169 | 185 | ||
186 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | ||
170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 187 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 188 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
172 | /* and boost task create/destroy. */ | 189 | /* and boost task create/destroy. */ |
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD; | |||
182 | */ | 199 | */ |
183 | static DEFINE_MUTEX(fullstop_mutex); | 200 | static DEFINE_MUTEX(fullstop_mutex); |
184 | 201 | ||
202 | /* Forward reference. */ | ||
203 | static void rcu_torture_cleanup(void); | ||
204 | |||
185 | /* | 205 | /* |
186 | * Detect and respond to a system shutdown. | 206 | * Detect and respond to a system shutdown. |
187 | */ | 207 | */ |
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = { | |||
612 | .name = "srcu" | 632 | .name = "srcu" |
613 | }; | 633 | }; |
614 | 634 | ||
635 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | ||
636 | { | ||
637 | return srcu_read_lock_raw(&srcu_ctl); | ||
638 | } | ||
639 | |||
640 | static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) | ||
641 | { | ||
642 | srcu_read_unlock_raw(&srcu_ctl, idx); | ||
643 | } | ||
644 | |||
645 | static struct rcu_torture_ops srcu_raw_ops = { | ||
646 | .init = srcu_torture_init, | ||
647 | .cleanup = srcu_torture_cleanup, | ||
648 | .readlock = srcu_torture_read_lock_raw, | ||
649 | .read_delay = srcu_read_delay, | ||
650 | .readunlock = srcu_torture_read_unlock_raw, | ||
651 | .completed = srcu_torture_completed, | ||
652 | .deferred_free = rcu_sync_torture_deferred_free, | ||
653 | .sync = srcu_torture_synchronize, | ||
654 | .cb_barrier = NULL, | ||
655 | .stats = srcu_torture_stats, | ||
656 | .name = "srcu_raw" | ||
657 | }; | ||
658 | |||
615 | static void srcu_torture_synchronize_expedited(void) | 659 | static void srcu_torture_synchronize_expedited(void) |
616 | { | 660 | { |
617 | synchronize_srcu_expedited(&srcu_ctl); | 661 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg) | |||
913 | return 0; | 957 | return 0; |
914 | } | 958 | } |
915 | 959 | ||
960 | void rcutorture_trace_dump(void) | ||
961 | { | ||
962 | static atomic_t beenhere = ATOMIC_INIT(0); | ||
963 | |||
964 | if (atomic_read(&beenhere)) | ||
965 | return; | ||
966 | if (atomic_xchg(&beenhere, 1) != 0) | ||
967 | return; | ||
968 | do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); | ||
969 | ftrace_dump(DUMP_ALL); | ||
970 | } | ||
971 | |||
916 | /* | 972 | /* |
917 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, | 973 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, |
918 | * incrementing the corresponding element of the pipeline array. The | 974 | * incrementing the corresponding element of the pipeline array. The |
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused) | |||
934 | rcu_read_lock_bh_held() || | 990 | rcu_read_lock_bh_held() || |
935 | rcu_read_lock_sched_held() || | 991 | rcu_read_lock_sched_held() || |
936 | srcu_read_lock_held(&srcu_ctl)); | 992 | srcu_read_lock_held(&srcu_ctl)); |
993 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
937 | if (p == NULL) { | 994 | if (p == NULL) { |
938 | /* Leave because rcu_torture_writer is not yet underway */ | 995 | /* Leave because rcu_torture_writer is not yet underway */ |
939 | cur_ops->readunlock(idx); | 996 | cur_ops->readunlock(idx); |
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused) | |||
951 | /* Should not happen, but... */ | 1008 | /* Should not happen, but... */ |
952 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1009 | pipe_count = RCU_TORTURE_PIPE_LEN; |
953 | } | 1010 | } |
1011 | if (pipe_count > 1) | ||
1012 | rcutorture_trace_dump(); | ||
954 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1013 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
955 | completed = cur_ops->completed() - completed; | 1014 | completed = cur_ops->completed() - completed; |
956 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1015 | if (completed > RCU_TORTURE_PIPE_LEN) { |
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg) | |||
994 | rcu_read_lock_bh_held() || | 1053 | rcu_read_lock_bh_held() || |
995 | rcu_read_lock_sched_held() || | 1054 | rcu_read_lock_sched_held() || |
996 | srcu_read_lock_held(&srcu_ctl)); | 1055 | srcu_read_lock_held(&srcu_ctl)); |
1056 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
997 | if (p == NULL) { | 1057 | if (p == NULL) { |
998 | /* Wait for rcu_torture_writer to get underway */ | 1058 | /* Wait for rcu_torture_writer to get underway */ |
999 | cur_ops->readunlock(idx); | 1059 | cur_ops->readunlock(idx); |
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg) | |||
1009 | /* Should not happen, but... */ | 1069 | /* Should not happen, but... */ |
1010 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1070 | pipe_count = RCU_TORTURE_PIPE_LEN; |
1011 | } | 1071 | } |
1072 | if (pipe_count > 1) | ||
1073 | rcutorture_trace_dump(); | ||
1012 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1074 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
1013 | completed = cur_ops->completed() - completed; | 1075 | completed = cur_ops->completed() - completed; |
1014 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1076 | if (completed > RCU_TORTURE_PIPE_LEN) { |
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page) | |||
1056 | cnt += sprintf(&page[cnt], | 1118 | cnt += sprintf(&page[cnt], |
1057 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1119 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1058 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1120 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1059 | "rtbf: %ld rtb: %ld nt: %ld", | 1121 | "rtbf: %ld rtb: %ld nt: %ld " |
1122 | "onoff: %ld/%ld:%ld/%ld", | ||
1060 | rcu_torture_current, | 1123 | rcu_torture_current, |
1061 | rcu_torture_current_version, | 1124 | rcu_torture_current_version, |
1062 | list_empty(&rcu_torture_freelist), | 1125 | list_empty(&rcu_torture_freelist), |
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page) | |||
1068 | n_rcu_torture_boost_rterror, | 1131 | n_rcu_torture_boost_rterror, |
1069 | n_rcu_torture_boost_failure, | 1132 | n_rcu_torture_boost_failure, |
1070 | n_rcu_torture_boosts, | 1133 | n_rcu_torture_boosts, |
1071 | n_rcu_torture_timers); | 1134 | n_rcu_torture_timers, |
1135 | n_online_successes, | ||
1136 | n_online_attempts, | ||
1137 | n_offline_successes, | ||
1138 | n_offline_attempts); | ||
1072 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1139 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1073 | n_rcu_torture_boost_ktrerror != 0 || | 1140 | n_rcu_torture_boost_ktrerror != 0 || |
1074 | n_rcu_torture_boost_rterror != 0 || | 1141 | n_rcu_torture_boost_rterror != 0 || |
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | |||
1232 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1299 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1233 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1300 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1234 | "test_boost=%d/%d test_boost_interval=%d " | 1301 | "test_boost=%d/%d test_boost_interval=%d " |
1235 | "test_boost_duration=%d\n", | 1302 | "test_boost_duration=%d shutdown_secs=%d " |
1303 | "onoff_interval=%d\n", | ||
1236 | torture_type, tag, nrealreaders, nfakewriters, | 1304 | torture_type, tag, nrealreaders, nfakewriters, |
1237 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1305 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1238 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1306 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1239 | test_boost, cur_ops->can_boost, | 1307 | test_boost, cur_ops->can_boost, |
1240 | test_boost_interval, test_boost_duration); | 1308 | test_boost_interval, test_boost_duration, shutdown_secs, |
1309 | onoff_interval); | ||
1241 | } | 1310 | } |
1242 | 1311 | ||
1243 | static struct notifier_block rcutorture_shutdown_nb = { | 1312 | static struct notifier_block rcutorture_shutdown_nb = { |
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu) | |||
1287 | return 0; | 1356 | return 0; |
1288 | } | 1357 | } |
1289 | 1358 | ||
1359 | /* | ||
1360 | * Cause the rcutorture test to shutdown the system after the test has | ||
1361 | * run for the time specified by the shutdown_secs module parameter. | ||
1362 | */ | ||
1363 | static int | ||
1364 | rcu_torture_shutdown(void *arg) | ||
1365 | { | ||
1366 | long delta; | ||
1367 | unsigned long jiffies_snap; | ||
1368 | |||
1369 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); | ||
1370 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
1371 | while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && | ||
1372 | !kthread_should_stop()) { | ||
1373 | delta = shutdown_time - jiffies_snap; | ||
1374 | if (verbose) | ||
1375 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1376 | "rcu_torture_shutdown task: %lu " | ||
1377 | "jiffies remaining\n", | ||
1378 | torture_type, delta); | ||
1379 | schedule_timeout_interruptible(delta); | ||
1380 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
1381 | } | ||
1382 | if (kthread_should_stop()) { | ||
1383 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); | ||
1384 | return 0; | ||
1385 | } | ||
1386 | |||
1387 | /* OK, shut down the system. */ | ||
1388 | |||
1389 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); | ||
1390 | shutdown_task = NULL; /* Avoid self-kill deadlock. */ | ||
1391 | rcu_torture_cleanup(); /* Get the success/failure message. */ | ||
1392 | kernel_power_off(); /* Shut down the system. */ | ||
1393 | return 0; | ||
1394 | } | ||
1395 | |||
1396 | #ifdef CONFIG_HOTPLUG_CPU | ||
1397 | |||
1398 | /* | ||
1399 | * Execute random CPU-hotplug operations at the interval specified | ||
1400 | * by the onoff_interval. | ||
1401 | */ | ||
1402 | static int __cpuinit | ||
1403 | rcu_torture_onoff(void *arg) | ||
1404 | { | ||
1405 | int cpu; | ||
1406 | int maxcpu = -1; | ||
1407 | DEFINE_RCU_RANDOM(rand); | ||
1408 | |||
1409 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | ||
1410 | for_each_online_cpu(cpu) | ||
1411 | maxcpu = cpu; | ||
1412 | WARN_ON(maxcpu < 0); | ||
1413 | while (!kthread_should_stop()) { | ||
1414 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); | ||
1415 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | ||
1416 | if (verbose) | ||
1417 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1418 | "rcu_torture_onoff task: offlining %d\n", | ||
1419 | torture_type, cpu); | ||
1420 | n_offline_attempts++; | ||
1421 | if (cpu_down(cpu) == 0) { | ||
1422 | if (verbose) | ||
1423 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1424 | "rcu_torture_onoff task: " | ||
1425 | "offlined %d\n", | ||
1426 | torture_type, cpu); | ||
1427 | n_offline_successes++; | ||
1428 | } | ||
1429 | } else if (cpu_is_hotpluggable(cpu)) { | ||
1430 | if (verbose) | ||
1431 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1432 | "rcu_torture_onoff task: onlining %d\n", | ||
1433 | torture_type, cpu); | ||
1434 | n_online_attempts++; | ||
1435 | if (cpu_up(cpu) == 0) { | ||
1436 | if (verbose) | ||
1437 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1438 | "rcu_torture_onoff task: " | ||
1439 | "onlined %d\n", | ||
1440 | torture_type, cpu); | ||
1441 | n_online_successes++; | ||
1442 | } | ||
1443 | } | ||
1444 | schedule_timeout_interruptible(onoff_interval * HZ); | ||
1445 | } | ||
1446 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); | ||
1447 | return 0; | ||
1448 | } | ||
1449 | |||
1450 | static int __cpuinit | ||
1451 | rcu_torture_onoff_init(void) | ||
1452 | { | ||
1453 | if (onoff_interval <= 0) | ||
1454 | return 0; | ||
1455 | onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); | ||
1456 | if (IS_ERR(onoff_task)) { | ||
1457 | onoff_task = NULL; | ||
1458 | return PTR_ERR(onoff_task); | ||
1459 | } | ||
1460 | return 0; | ||
1461 | } | ||
1462 | |||
1463 | static void rcu_torture_onoff_cleanup(void) | ||
1464 | { | ||
1465 | if (onoff_task == NULL) | ||
1466 | return; | ||
1467 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | ||
1468 | kthread_stop(onoff_task); | ||
1469 | } | ||
1470 | |||
1471 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1472 | |||
/*
 * Stub used when CONFIG_HOTPLUG_CPU is not set: there is no kthread to
 * start, so report success.  Returns int so that the signature matches
 * the CONFIG_HOTPLUG_CPU variant.
 */
static int
rcu_torture_onoff_init(void)
{
	return 0;
}

/*
 * Stub used when CONFIG_HOTPLUG_CPU is not set: nothing to stop.
 */
static void rcu_torture_onoff_cleanup(void)
{
}
1481 | |||
1482 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
1483 | |||
1290 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1484 | static int rcutorture_cpu_notify(struct notifier_block *self, |
1291 | unsigned long action, void *hcpu) | 1485 | unsigned long action, void *hcpu) |
1292 | { | 1486 | { |
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void) | |||
1391 | for_each_possible_cpu(i) | 1585 | for_each_possible_cpu(i) |
1392 | rcutorture_booster_cleanup(i); | 1586 | rcutorture_booster_cleanup(i); |
1393 | } | 1587 | } |
1588 | if (shutdown_task != NULL) { | ||
1589 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | ||
1590 | kthread_stop(shutdown_task); | ||
1591 | } | ||
1592 | rcu_torture_onoff_cleanup(); | ||
1394 | 1593 | ||
1395 | /* Wait for all RCU callbacks to fire. */ | 1594 | /* Wait for all RCU callbacks to fire. */ |
1396 | 1595 | ||
@@ -1416,7 +1615,7 @@ rcu_torture_init(void) | |||
1416 | static struct rcu_torture_ops *torture_ops[] = | 1615 | static struct rcu_torture_ops *torture_ops[] = |
1417 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1616 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1418 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1617 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1419 | &srcu_ops, &srcu_expedited_ops, | 1618 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, |
1420 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1619 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1421 | 1620 | ||
1422 | mutex_lock(&fullstop_mutex); | 1621 | mutex_lock(&fullstop_mutex); |
@@ -1607,6 +1806,18 @@ rcu_torture_init(void) | |||
1607 | } | 1806 | } |
1608 | } | 1807 | } |
1609 | } | 1808 | } |
1809 | if (shutdown_secs > 0) { | ||
1810 | shutdown_time = jiffies + shutdown_secs * HZ; | ||
1811 | shutdown_task = kthread_run(rcu_torture_shutdown, NULL, | ||
1812 | "rcu_torture_shutdown"); | ||
1813 | if (IS_ERR(shutdown_task)) { | ||
1814 | firsterr = PTR_ERR(shutdown_task); | ||
1815 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); | ||
1816 | shutdown_task = NULL; | ||
1817 | goto unwind; | ||
1818 | } | ||
1819 | } | ||
1820 | rcu_torture_onoff_init(); | ||
1610 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1821 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1611 | rcutorture_record_test_transition(); | 1822 | rcutorture_record_test_transition(); |
1612 | mutex_unlock(&fullstop_mutex); | 1823 | mutex_unlock(&fullstop_mutex); |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b76d812740c..6c4a6722abfd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
69 | NUM_RCU_LVL_3, \ | 69 | NUM_RCU_LVL_3, \ |
70 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ | 70 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ |
71 | }, \ | 71 | }, \ |
72 | .signaled = RCU_GP_IDLE, \ | 72 | .fqs_state = RCU_GP_IDLE, \ |
73 | .gpnum = -300, \ | 73 | .gpnum = -300, \ |
74 | .completed = -300, \ | 74 | .completed = -300, \ |
75 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 75 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu) | |||
195 | } | 195 | } |
196 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 196 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
197 | 197 | ||
198 | #ifdef CONFIG_NO_HZ | ||
199 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 198 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
200 | .dynticks_nesting = 1, | 199 | .dynticks_nesting = DYNTICK_TASK_NESTING, |
201 | .dynticks = ATOMIC_INIT(1), | 200 | .dynticks = ATOMIC_INIT(1), |
202 | }; | 201 | }; |
203 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
204 | 202 | ||
205 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 203 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
206 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 204 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ |
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
328 | return 1; | 326 | return 1; |
329 | } | 327 | } |
330 | 328 | ||
331 | /* If preemptible RCU, no point in sending reschedule IPI. */ | 329 | /* |
332 | if (rdp->preemptible) | 330 | * The CPU is online, so send it a reschedule IPI. This forces |
333 | return 0; | 331 | * it through the scheduler, and (inefficiently) also handles cases |
334 | 332 | * where idle loops fail to inform RCU about the CPU being idle. | |
335 | /* The CPU is online, so send it a reschedule IPI. */ | 333 | */ |
336 | if (rdp->cpu != smp_processor_id()) | 334 | if (rdp->cpu != smp_processor_id()) |
337 | smp_send_reschedule(rdp->cpu); | 335 | smp_send_reschedule(rdp->cpu); |
338 | else | 336 | else |
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
343 | 341 | ||
344 | #endif /* #ifdef CONFIG_SMP */ | 342 | #endif /* #ifdef CONFIG_SMP */ |
345 | 343 | ||
346 | #ifdef CONFIG_NO_HZ | 344 | /* |
345 | * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle | ||
346 | * | ||
347 | * If the new value of the ->dynticks_nesting counter now is zero, | ||
348 | * we really have entered idle, and must do the appropriate accounting. | ||
349 | * The caller must have disabled interrupts. | ||
350 | */ | ||
351 | static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | ||
352 | { | ||
353 | trace_rcu_dyntick("Start", oldval, 0); | ||
354 | if (!is_idle_task(current)) { | ||
355 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
356 | |||
357 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | ||
358 | ftrace_dump(DUMP_ALL); | ||
359 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
360 | current->pid, current->comm, | ||
361 | idle->pid, idle->comm); /* must be idle task! */ | ||
362 | } | ||
363 | rcu_prepare_for_idle(smp_processor_id()); | ||
364 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
365 | smp_mb__before_atomic_inc(); /* See above. */ | ||
366 | atomic_inc(&rdtp->dynticks); | ||
367 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
368 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
369 | } | ||
347 | 370 | ||
348 | /** | 371 | /** |
349 | * rcu_enter_nohz - inform RCU that current CPU is entering nohz | 372 | * rcu_idle_enter - inform RCU that current CPU is entering idle |
350 | * | 373 | * |
351 | * Enter nohz mode, in other words, -leave- the mode in which RCU | 374 | * Enter idle mode, in other words, -leave- the mode in which RCU |
352 | * read-side critical sections can occur. (Though RCU read-side | 375 | * read-side critical sections can occur. (Though RCU read-side |
353 | * critical sections can occur in irq handlers in nohz mode, a possibility | 376 | * critical sections can occur in irq handlers in idle, a possibility |
354 | * handled by rcu_irq_enter() and rcu_irq_exit()). | 377 | * handled by irq_enter() and irq_exit().) |
378 | * | ||
379 | * We crowbar the ->dynticks_nesting field to zero to allow for | ||
380 | * the possibility of usermode upcalls having messed up our count | ||
381 | * of interrupt nesting level during the prior busy period. | ||
355 | */ | 382 | */ |
356 | void rcu_enter_nohz(void) | 383 | void rcu_idle_enter(void) |
357 | { | 384 | { |
358 | unsigned long flags; | 385 | unsigned long flags; |
386 | long long oldval; | ||
359 | struct rcu_dynticks *rdtp; | 387 | struct rcu_dynticks *rdtp; |
360 | 388 | ||
361 | local_irq_save(flags); | 389 | local_irq_save(flags); |
362 | rdtp = &__get_cpu_var(rcu_dynticks); | 390 | rdtp = &__get_cpu_var(rcu_dynticks); |
363 | if (--rdtp->dynticks_nesting) { | 391 | oldval = rdtp->dynticks_nesting; |
364 | local_irq_restore(flags); | 392 | rdtp->dynticks_nesting = 0; |
365 | return; | 393 | rcu_idle_enter_common(rdtp, oldval); |
366 | } | ||
367 | trace_rcu_dyntick("Start"); | ||
368 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
369 | smp_mb__before_atomic_inc(); /* See above. */ | ||
370 | atomic_inc(&rdtp->dynticks); | ||
371 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
372 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
373 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
374 | } | 395 | } |
375 | 396 | ||
376 | /* | 397 | /** |
377 | * rcu_exit_nohz - inform RCU that current CPU is leaving nohz | 398 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle |
399 | * | ||
400 | * Exit from an interrupt handler, which might possibly result in entering | ||
401 | * idle mode, in other words, leaving the mode in which read-side critical | ||
402 | * sections can occur. | ||
378 | * | 403 | * |
379 | * Exit nohz mode, in other words, -enter- the mode in which RCU | 404 | * This code assumes that the idle loop never does anything that might |
380 | * read-side critical sections normally occur. | 405 | * result in unbalanced calls to irq_enter() and irq_exit(). If your |
406 | * architecture violates this assumption, RCU will give you what you | ||
407 | * deserve, good and hard. But very infrequently and irreproducibly. | ||
408 | * | ||
409 | * Use things like work queues to work around this limitation. | ||
410 | * | ||
411 | * You have been warned. | ||
381 | */ | 412 | */ |
382 | void rcu_exit_nohz(void) | 413 | void rcu_irq_exit(void) |
383 | { | 414 | { |
384 | unsigned long flags; | 415 | unsigned long flags; |
416 | long long oldval; | ||
385 | struct rcu_dynticks *rdtp; | 417 | struct rcu_dynticks *rdtp; |
386 | 418 | ||
387 | local_irq_save(flags); | 419 | local_irq_save(flags); |
388 | rdtp = &__get_cpu_var(rcu_dynticks); | 420 | rdtp = &__get_cpu_var(rcu_dynticks); |
389 | if (rdtp->dynticks_nesting++) { | 421 | oldval = rdtp->dynticks_nesting; |
390 | local_irq_restore(flags); | 422 | rdtp->dynticks_nesting--; |
391 | return; | 423 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
392 | } | 424 | if (rdtp->dynticks_nesting) |
425 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | ||
426 | else | ||
427 | rcu_idle_enter_common(rdtp, oldval); | ||
428 | local_irq_restore(flags); | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle | ||
433 | * | ||
434 | * If the new value of the ->dynticks_nesting counter was previously zero, | ||
435 | * we really have exited idle, and must do the appropriate accounting. | ||
436 | * The caller must have disabled interrupts. | ||
437 | */ | ||
438 | static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | ||
439 | { | ||
393 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | 440 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ |
394 | atomic_inc(&rdtp->dynticks); | 441 | atomic_inc(&rdtp->dynticks); |
395 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 442 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
396 | smp_mb__after_atomic_inc(); /* See above. */ | 443 | smp_mb__after_atomic_inc(); /* See above. */ |
397 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 444 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
398 | trace_rcu_dyntick("End"); | 445 | rcu_cleanup_after_idle(smp_processor_id()); |
446 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | ||
447 | if (!is_idle_task(current)) { | ||
448 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
449 | |||
450 | trace_rcu_dyntick("Error on exit: not idle task", | ||
451 | oldval, rdtp->dynticks_nesting); | ||
452 | ftrace_dump(DUMP_ALL); | ||
453 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
454 | current->pid, current->comm, | ||
455 | idle->pid, idle->comm); /* must be idle task! */ | ||
456 | } | ||
457 | } | ||
458 | |||
459 | /** | ||
460 | * rcu_idle_exit - inform RCU that current CPU is leaving idle | ||
461 | * | ||
462 | * Exit idle mode, in other words, -enter- the mode in which RCU | ||
463 | * read-side critical sections can occur. | ||
464 | * | ||
465 | * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to | ||
466 | * allow for the possibility of usermode upcalls messing up our count | ||
467 | * of interrupt nesting level during the busy period that is just | ||
468 | * now starting. | ||
469 | */ | ||
470 | void rcu_idle_exit(void) | ||
471 | { | ||
472 | unsigned long flags; | ||
473 | struct rcu_dynticks *rdtp; | ||
474 | long long oldval; | ||
475 | |||
476 | local_irq_save(flags); | ||
477 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
478 | oldval = rdtp->dynticks_nesting; | ||
479 | WARN_ON_ONCE(oldval != 0); | ||
480 | rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; | ||
481 | rcu_idle_exit_common(rdtp, oldval); | ||
482 | local_irq_restore(flags); | ||
483 | } | ||
484 | |||
485 | /** | ||
486 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | ||
487 | * | ||
488 | * Enter an interrupt handler, which might possibly result in exiting | ||
489 | * idle mode, in other words, entering the mode in which read-side critical | ||
490 | * sections can occur. | ||
491 | * | ||
492 | * Note that the Linux kernel is fully capable of entering an interrupt | ||
493 | * handler that it never exits, for example when doing upcalls to | ||
494 | * user mode! This code assumes that the idle loop never does upcalls to | ||
495 | * user mode. If your architecture does do upcalls from the idle loop (or | ||
496 | * does anything else that results in unbalanced calls to the irq_enter() | ||
497 | * and irq_exit() functions), RCU will give you what you deserve, good | ||
498 | * and hard. But very infrequently and irreproducibly. | ||
499 | * | ||
500 | * Use things like work queues to work around this limitation. | ||
501 | * | ||
502 | * You have been warned. | ||
503 | */ | ||
504 | void rcu_irq_enter(void) | ||
505 | { | ||
506 | unsigned long flags; | ||
507 | struct rcu_dynticks *rdtp; | ||
508 | long long oldval; | ||
509 | |||
510 | local_irq_save(flags); | ||
511 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
512 | oldval = rdtp->dynticks_nesting; | ||
513 | rdtp->dynticks_nesting++; | ||
514 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | ||
515 | if (oldval) | ||
516 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | ||
517 | else | ||
518 | rcu_idle_exit_common(rdtp, oldval); | ||
399 | local_irq_restore(flags); | 519 | local_irq_restore(flags); |
400 | } | 520 | } |
401 | 521 | ||
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void) | |||
442 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 562 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
443 | } | 563 | } |
444 | 564 | ||
565 | #ifdef CONFIG_PROVE_RCU | ||
566 | |||
445 | /** | 567 | /** |
446 | * rcu_irq_enter - inform RCU of entry to hard irq context | 568 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle |
447 | * | 569 | * |
448 | * If the CPU was idle with dynamic ticks active, this updates the | 570 | * If the current CPU is in its idle loop and is neither in an interrupt |
449 | * rdtp->dynticks to let the RCU handling know that the CPU is active. | 571 | * or NMI handler, return true. |
450 | */ | 572 | */ |
451 | void rcu_irq_enter(void) | 573 | int rcu_is_cpu_idle(void) |
452 | { | 574 | { |
453 | rcu_exit_nohz(); | 575 | int ret; |
576 | |||
577 | preempt_disable(); | ||
578 | ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; | ||
579 | preempt_enable(); | ||
580 | return ret; | ||
454 | } | 581 | } |
582 | EXPORT_SYMBOL(rcu_is_cpu_idle); | ||
583 | |||
584 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
455 | 585 | ||
456 | /** | 586 | /** |
457 | * rcu_irq_exit - inform RCU of exit from hard irq context | 587 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle |
458 | * | 588 | * |
459 | * If the CPU was idle with dynamic ticks active, update the rdp->dynticks | 589 | * If the current CPU is idle or running at a first-level (not nested) |
460 | * to put let the RCU handling be aware that the CPU is going back to idle | 590 | * interrupt from idle, return true. The caller must have at least |
461 | * with no ticks. | 591 | * disabled preemption. |
462 | */ | 592 | */ |
463 | void rcu_irq_exit(void) | 593 | int rcu_is_cpu_rrupt_from_idle(void) |
464 | { | 594 | { |
465 | rcu_enter_nohz(); | 595 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; |
466 | } | 596 | } |
467 | 597 | ||
468 | #ifdef CONFIG_SMP | 598 | #ifdef CONFIG_SMP |
@@ -475,7 +605,7 @@ void rcu_irq_exit(void) | |||
475 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 605 | static int dyntick_save_progress_counter(struct rcu_data *rdp) |
476 | { | 606 | { |
477 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 607 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
478 | return 0; | 608 | return (rdp->dynticks_snap & 0x1) == 0; |
479 | } | 609 | } |
480 | 610 | ||
481 | /* | 611 | /* |
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
512 | 642 | ||
513 | #endif /* #ifdef CONFIG_SMP */ | 643 | #endif /* #ifdef CONFIG_SMP */ |
514 | 644 | ||
515 | #else /* #ifdef CONFIG_NO_HZ */ | ||
516 | |||
517 | #ifdef CONFIG_SMP | ||
518 | |||
519 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | ||
520 | { | ||
521 | return 0; | ||
522 | } | ||
523 | |||
524 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | ||
525 | { | ||
526 | return rcu_implicit_offline_qs(rdp); | ||
527 | } | ||
528 | |||
529 | #endif /* #ifdef CONFIG_SMP */ | ||
530 | |||
531 | #endif /* #else #ifdef CONFIG_NO_HZ */ | ||
532 | |||
533 | int rcu_cpu_stall_suppress __read_mostly; | ||
534 | |||
535 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 645 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
536 | { | 646 | { |
537 | rsp->gp_start = jiffies; | 647 | rsp->gp_start = jiffies; |
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
866 | /* Advance to a new grace period and initialize state. */ | 976 | /* Advance to a new grace period and initialize state. */ |
867 | rsp->gpnum++; | 977 | rsp->gpnum++; |
868 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | 978 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); |
869 | WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); | 979 | WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); |
870 | rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | 980 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ |
871 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 981 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
872 | record_gp_stall_check_time(rsp); | 982 | record_gp_stall_check_time(rsp); |
873 | 983 | ||
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
877 | rnp->qsmask = rnp->qsmaskinit; | 987 | rnp->qsmask = rnp->qsmaskinit; |
878 | rnp->gpnum = rsp->gpnum; | 988 | rnp->gpnum = rsp->gpnum; |
879 | rnp->completed = rsp->completed; | 989 | rnp->completed = rsp->completed; |
880 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 990 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ |
881 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 991 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
882 | rcu_preempt_boost_start_gp(rnp); | 992 | rcu_preempt_boost_start_gp(rnp); |
883 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | 993 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, |
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
927 | 1037 | ||
928 | rnp = rcu_get_root(rsp); | 1038 | rnp = rcu_get_root(rsp); |
929 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1039 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
930 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | 1040 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ |
931 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1041 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
932 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1042 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
933 | } | 1043 | } |
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
991 | 1101 | ||
992 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ | 1102 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ |
993 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1103 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); |
994 | rsp->signaled = RCU_GP_IDLE; | 1104 | rsp->fqs_state = RCU_GP_IDLE; |
995 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 1105 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
996 | } | 1106 | } |
997 | 1107 | ||
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1221 | else | 1331 | else |
1222 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1332 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1223 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1333 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1224 | rcu_report_exp_rnp(rsp, rnp); | 1334 | rcu_report_exp_rnp(rsp, rnp, true); |
1225 | rcu_node_kthread_setaffinity(rnp, -1); | 1335 | rcu_node_kthread_setaffinity(rnp, -1); |
1226 | } | 1336 | } |
1227 | 1337 | ||
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1263 | /* If no callbacks are ready, just return.*/ | 1373 | /* If no callbacks are ready, just return.*/ |
1264 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1374 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
1265 | trace_rcu_batch_start(rsp->name, 0, 0); | 1375 | trace_rcu_batch_start(rsp->name, 0, 0); |
1266 | trace_rcu_batch_end(rsp->name, 0); | 1376 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), |
1377 | need_resched(), is_idle_task(current), | ||
1378 | rcu_is_callbacks_kthread()); | ||
1267 | return; | 1379 | return; |
1268 | } | 1380 | } |
1269 | 1381 | ||
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1291 | debug_rcu_head_unqueue(list); | 1403 | debug_rcu_head_unqueue(list); |
1292 | __rcu_reclaim(rsp->name, list); | 1404 | __rcu_reclaim(rsp->name, list); |
1293 | list = next; | 1405 | list = next; |
1294 | if (++count >= bl) | 1406 | /* Stop only if limit reached and CPU has something to do. */ |
1407 | if (++count >= bl && | ||
1408 | (need_resched() || | ||
1409 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | ||
1295 | break; | 1410 | break; |
1296 | } | 1411 | } |
1297 | 1412 | ||
1298 | local_irq_save(flags); | 1413 | local_irq_save(flags); |
1299 | trace_rcu_batch_end(rsp->name, count); | 1414 | trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), |
1415 | is_idle_task(current), | ||
1416 | rcu_is_callbacks_kthread()); | ||
1300 | 1417 | ||
1301 | /* Update count, and requeue any remaining callbacks. */ | 1418 | /* Update count, and requeue any remaining callbacks. */ |
1302 | rdp->qlen -= count; | 1419 | rdp->qlen -= count; |
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1334 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). | 1451 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). |
1335 | * Also schedule RCU core processing. | 1452 | * Also schedule RCU core processing. |
1336 | * | 1453 | * |
1337 | * This function must be called with hardirqs disabled. It is normally | 1454 | * This function must be called from hardirq context. It is normally |
1338 | * invoked from the scheduling-clock interrupt. If rcu_pending returns | 1455 | * invoked from the scheduling-clock interrupt. If rcu_pending returns |
1339 | * false, there is no point in invoking rcu_check_callbacks(). | 1456 | * false, there is no point in invoking rcu_check_callbacks(). |
1340 | */ | 1457 | */ |
1341 | void rcu_check_callbacks(int cpu, int user) | 1458 | void rcu_check_callbacks(int cpu, int user) |
1342 | { | 1459 | { |
1343 | trace_rcu_utilization("Start scheduler-tick"); | 1460 | trace_rcu_utilization("Start scheduler-tick"); |
1344 | if (user || | 1461 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
1345 | (idle_cpu(cpu) && rcu_scheduler_active && | ||
1346 | !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
1347 | 1462 | ||
1348 | /* | 1463 | /* |
1349 | * Get here if this CPU took its interrupt from user | 1464 | * Get here if this CPU took its interrupt from user |
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1457 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ | 1572 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ |
1458 | } | 1573 | } |
1459 | rsp->fqs_active = 1; | 1574 | rsp->fqs_active = 1; |
1460 | switch (rsp->signaled) { | 1575 | switch (rsp->fqs_state) { |
1461 | case RCU_GP_IDLE: | 1576 | case RCU_GP_IDLE: |
1462 | case RCU_GP_INIT: | 1577 | case RCU_GP_INIT: |
1463 | 1578 | ||
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1473 | force_qs_rnp(rsp, dyntick_save_progress_counter); | 1588 | force_qs_rnp(rsp, dyntick_save_progress_counter); |
1474 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 1589 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
1475 | if (rcu_gp_in_progress(rsp)) | 1590 | if (rcu_gp_in_progress(rsp)) |
1476 | rsp->signaled = RCU_FORCE_QS; | 1591 | rsp->fqs_state = RCU_FORCE_QS; |
1477 | break; | 1592 | break; |
1478 | 1593 | ||
1479 | case RCU_FORCE_QS: | 1594 | case RCU_FORCE_QS: |
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu) | |||
1812 | * by the current CPU, even if none need be done immediately, returning | 1927 | * by the current CPU, even if none need be done immediately, returning |
1813 | * 1 if so. | 1928 | * 1 if so. |
1814 | */ | 1929 | */ |
1815 | static int rcu_needs_cpu_quick_check(int cpu) | 1930 | static int rcu_cpu_has_callbacks(int cpu) |
1816 | { | 1931 | { |
1817 | /* RCU callbacks either ready or pending? */ | 1932 | /* RCU callbacks either ready or pending? */ |
1818 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 1933 | return per_cpu(rcu_sched_data, cpu).nxtlist || |
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1913 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 2028 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1914 | rdp->nxttail[i] = &rdp->nxtlist; | 2029 | rdp->nxttail[i] = &rdp->nxtlist; |
1915 | rdp->qlen = 0; | 2030 | rdp->qlen = 0; |
1916 | #ifdef CONFIG_NO_HZ | ||
1917 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2031 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
1918 | #endif /* #ifdef CONFIG_NO_HZ */ | 2032 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); |
2033 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | ||
1919 | rdp->cpu = cpu; | 2034 | rdp->cpu = cpu; |
1920 | rdp->rsp = rsp; | 2035 | rdp->rsp = rsp; |
1921 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2036 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
1942 | rdp->qlen_last_fqs_check = 0; | 2057 | rdp->qlen_last_fqs_check = 0; |
1943 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2058 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1944 | rdp->blimit = blimit; | 2059 | rdp->blimit = blimit; |
2060 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; | ||
2061 | atomic_set(&rdp->dynticks->dynticks, | ||
2062 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | ||
2063 | rcu_prepare_for_idle_init(cpu); | ||
1945 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2064 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1946 | 2065 | ||
1947 | /* | 2066 | /* |
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2023 | rcu_send_cbs_to_online(&rcu_bh_state); | 2142 | rcu_send_cbs_to_online(&rcu_bh_state); |
2024 | rcu_send_cbs_to_online(&rcu_sched_state); | 2143 | rcu_send_cbs_to_online(&rcu_sched_state); |
2025 | rcu_preempt_send_cbs_to_online(); | 2144 | rcu_preempt_send_cbs_to_online(); |
2145 | rcu_cleanup_after_idle(cpu); | ||
2026 | break; | 2146 | break; |
2027 | case CPU_DEAD: | 2147 | case CPU_DEAD: |
2028 | case CPU_DEAD_FROZEN: | 2148 | case CPU_DEAD_FROZEN: |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 849ce9ec51fe..fddff92d6676 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -84,9 +84,10 @@ | |||
84 | * Dynticks per-CPU state. | 84 | * Dynticks per-CPU state. |
85 | */ | 85 | */ |
86 | struct rcu_dynticks { | 86 | struct rcu_dynticks { |
87 | int dynticks_nesting; /* Track irq/process nesting level. */ | 87 | long long dynticks_nesting; /* Track irq/process nesting level. */ |
88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 88 | /* Process level is worth LLONG_MAX/2. */ |
89 | atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ | 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | ||
90 | }; | 91 | }; |
91 | 92 | ||
92 | /* RCU's kthread states for tracing. */ | 93 | /* RCU's kthread states for tracing. */ |
@@ -274,16 +275,12 @@ struct rcu_data { | |||
274 | /* did other CPU force QS recently? */ | 275 | /* did other CPU force QS recently? */ |
275 | long blimit; /* Upper limit on a processed batch */ | 276 | long blimit; /* Upper limit on a processed batch */ |
276 | 277 | ||
277 | #ifdef CONFIG_NO_HZ | ||
278 | /* 3) dynticks interface. */ | 278 | /* 3) dynticks interface. */ |
279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ | 279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ |
280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ | 280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ |
281 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
282 | 281 | ||
283 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 282 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
284 | #ifdef CONFIG_NO_HZ | ||
285 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ | 283 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ |
286 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
287 | unsigned long offline_fqs; /* Kicked due to being offline. */ | 284 | unsigned long offline_fqs; /* Kicked due to being offline. */ |
288 | unsigned long resched_ipi; /* Sent a resched IPI. */ | 285 | unsigned long resched_ipi; /* Sent a resched IPI. */ |
289 | 286 | ||
@@ -302,16 +299,12 @@ struct rcu_data { | |||
302 | struct rcu_state *rsp; | 299 | struct rcu_state *rsp; |
303 | }; | 300 | }; |
304 | 301 | ||
305 | /* Values for signaled field in struct rcu_state. */ | 302 | /* Values for fqs_state field in struct rcu_state. */ |
306 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | 303 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ |
307 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | 304 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ |
308 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | 305 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ |
309 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | 306 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
310 | #ifdef CONFIG_NO_HZ | ||
311 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 307 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
312 | #else /* #ifdef CONFIG_NO_HZ */ | ||
313 | #define RCU_SIGNAL_INIT RCU_FORCE_QS | ||
314 | #endif /* #else #ifdef CONFIG_NO_HZ */ | ||
315 | 308 | ||
316 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 309 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
317 | 310 | ||
@@ -361,7 +354,7 @@ struct rcu_state { | |||
361 | 354 | ||
362 | /* The following fields are guarded by the root rcu_node's lock. */ | 355 | /* The following fields are guarded by the root rcu_node's lock. */ |
363 | 356 | ||
364 | u8 signaled ____cacheline_internodealigned_in_smp; | 357 | u8 fqs_state ____cacheline_internodealigned_in_smp; |
365 | /* Force QS state. */ | 358 | /* Force QS state. */ |
366 | u8 fqs_active; /* force_quiescent_state() */ | 359 | u8 fqs_active; /* force_quiescent_state() */ |
367 | /* is running. */ | 360 | /* is running. */ |
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu); | |||
451 | static void rcu_preempt_process_callbacks(void); | 444 | static void rcu_preempt_process_callbacks(void); |
452 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 445 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
453 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | 446 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) |
454 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); | 447 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
448 | bool wake); | ||
455 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 449 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ |
456 | static int rcu_preempt_pending(int cpu); | 450 | static int rcu_preempt_pending(int cpu); |
457 | static int rcu_preempt_needs_cpu(int cpu); | 451 | static int rcu_preempt_needs_cpu(int cpu); |
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void); | |||
461 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 455 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
462 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 456 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
463 | static void invoke_rcu_callbacks_kthread(void); | 457 | static void invoke_rcu_callbacks_kthread(void); |
458 | static bool rcu_is_callbacks_kthread(void); | ||
464 | #ifdef CONFIG_RCU_BOOST | 459 | #ifdef CONFIG_RCU_BOOST |
465 | static void rcu_preempt_do_callbacks(void); | 460 | static void rcu_preempt_do_callbacks(void); |
466 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | 461 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, |
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | |||
473 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 468 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
474 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | 469 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); |
475 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 470 | static void __cpuinit rcu_prepare_kthreads(int cpu); |
471 | static void rcu_prepare_for_idle_init(int cpu); | ||
472 | static void rcu_cleanup_after_idle(int cpu); | ||
473 | static void rcu_prepare_for_idle(int cpu); | ||
476 | 474 | ||
477 | #endif /* #ifndef RCU_TREE_NONCORE */ | 475 | #endif /* #ifndef RCU_TREE_NONCORE */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4b9b9f8a4184..8bb35d73e1f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
312 | { | 312 | { |
313 | int empty; | 313 | int empty; |
314 | int empty_exp; | 314 | int empty_exp; |
315 | int empty_exp_now; | ||
315 | unsigned long flags; | 316 | unsigned long flags; |
316 | struct list_head *np; | 317 | struct list_head *np; |
317 | #ifdef CONFIG_RCU_BOOST | 318 | #ifdef CONFIG_RCU_BOOST |
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
382 | /* | 383 | /* |
383 | * If this was the last task on the current list, and if | 384 | * If this was the last task on the current list, and if |
384 | * we aren't waiting on any CPUs, report the quiescent state. | 385 | * we aren't waiting on any CPUs, report the quiescent state. |
385 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. | 386 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
387 | * so we must take a snapshot of the expedited state. | ||
386 | */ | 388 | */ |
389 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | ||
387 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 390 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
388 | trace_rcu_quiescent_state_report("preempt_rcu", | 391 | trace_rcu_quiescent_state_report("preempt_rcu", |
389 | rnp->gpnum, | 392 | rnp->gpnum, |
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
406 | * If this was the last task on the expedited lists, | 409 | * If this was the last task on the expedited lists, |
407 | * then we need to report up the rcu_node hierarchy. | 410 | * then we need to report up the rcu_node hierarchy. |
408 | */ | 411 | */ |
409 | if (!empty_exp && !rcu_preempted_readers_exp(rnp)) | 412 | if (!empty_exp && empty_exp_now) |
410 | rcu_report_exp_rnp(&rcu_preempt_state, rnp); | 413 | rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); |
411 | } else { | 414 | } else { |
412 | local_irq_restore(flags); | 415 | local_irq_restore(flags); |
413 | } | 416 | } |
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | |||
729 | * recursively up the tree. (Calm down, calm down, we do the recursion | 732 | * recursively up the tree. (Calm down, calm down, we do the recursion |
730 | * iteratively!) | 733 | * iteratively!) |
731 | * | 734 | * |
735 | * Most callers will set the "wake" flag, but the task initiating the | ||
736 | * expedited grace period need not wake itself. | ||
737 | * | ||
732 | * Caller must hold sync_rcu_preempt_exp_mutex. | 738 | * Caller must hold sync_rcu_preempt_exp_mutex. |
733 | */ | 739 | */ |
734 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | 740 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
741 | bool wake) | ||
735 | { | 742 | { |
736 | unsigned long flags; | 743 | unsigned long flags; |
737 | unsigned long mask; | 744 | unsigned long mask; |
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
744 | } | 751 | } |
745 | if (rnp->parent == NULL) { | 752 | if (rnp->parent == NULL) { |
746 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 753 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
747 | wake_up(&sync_rcu_preempt_exp_wq); | 754 | if (wake) |
755 | wake_up(&sync_rcu_preempt_exp_wq); | ||
748 | break; | 756 | break; |
749 | } | 757 | } |
750 | mask = rnp->grpmask; | 758 | mask = rnp->grpmask; |
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
777 | must_wait = 1; | 785 | must_wait = 1; |
778 | } | 786 | } |
779 | if (!must_wait) | 787 | if (!must_wait) |
780 | rcu_report_exp_rnp(rsp, rnp); | 788 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ |
781 | } | 789 | } |
782 | 790 | ||
783 | /* | 791 | /* |
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
1069 | * report on tasks preempted in RCU read-side critical sections during | 1077 | * report on tasks preempted in RCU read-side critical sections during |
1070 | * expedited RCU grace periods. | 1078 | * expedited RCU grace periods. |
1071 | */ | 1079 | */ |
1072 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | 1080 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
1081 | bool wake) | ||
1073 | { | 1082 | { |
1074 | return; | ||
1075 | } | 1083 | } |
1076 | 1084 | ||
1077 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1085 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) | |||
1157 | 1165 | ||
1158 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 1166 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
1159 | 1167 | ||
1160 | static struct lock_class_key rcu_boost_class; | ||
1161 | |||
1162 | /* | 1168 | /* |
1163 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | 1169 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks |
1164 | * or ->boost_tasks, advancing the pointer to the next task in the | 1170 | * or ->boost_tasks, advancing the pointer to the next task in the |
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1221 | */ | 1227 | */ |
1222 | t = container_of(tb, struct task_struct, rcu_node_entry); | 1228 | t = container_of(tb, struct task_struct, rcu_node_entry); |
1223 | rt_mutex_init_proxy_locked(&mtx, t); | 1229 | rt_mutex_init_proxy_locked(&mtx, t); |
1224 | /* Avoid lockdep false positives. This rt_mutex is its own thing. */ | ||
1225 | lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, | ||
1226 | "rcu_boost_mutex"); | ||
1227 | t->rcu_boost_mutex = &mtx; | 1230 | t->rcu_boost_mutex = &mtx; |
1228 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1231 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1229 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | 1232 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ |
1230 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 1233 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
1231 | 1234 | ||
1232 | return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; | 1235 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || |
1236 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | ||
1233 | } | 1237 | } |
1234 | 1238 | ||
1235 | /* | 1239 | /* |
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1329 | } | 1333 | } |
1330 | 1334 | ||
1331 | /* | 1335 | /* |
1336 | * Is the current CPU running the RCU-callbacks kthread? | ||
1337 | * Caller must have preemption disabled. | ||
1338 | */ | ||
1339 | static bool rcu_is_callbacks_kthread(void) | ||
1340 | { | ||
1341 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | ||
1342 | } | ||
1343 | |||
1344 | /* | ||
1332 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | 1345 | * Set the affinity of the boost kthread. The CPU-hotplug locks are |
1333 | * held, so no one should be messing with the existence of the boost | 1346 | * held, so no one should be messing with the existence of the boost |
1334 | * kthread. | 1347 | * kthread. |
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1772 | WARN_ON_ONCE(1); | 1785 | WARN_ON_ONCE(1); |
1773 | } | 1786 | } |
1774 | 1787 | ||
1788 | static bool rcu_is_callbacks_kthread(void) | ||
1789 | { | ||
1790 | return false; | ||
1791 | } | ||
1792 | |||
1775 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | 1793 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) |
1776 | { | 1794 | { |
1777 | } | 1795 | } |
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void) | |||
1907 | * grace period works for us. | 1925 | * grace period works for us. |
1908 | */ | 1926 | */ |
1909 | get_online_cpus(); | 1927 | get_online_cpus(); |
1910 | snap = atomic_read(&sync_sched_expedited_started) - 1; | 1928 | snap = atomic_read(&sync_sched_expedited_started); |
1911 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 1929 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
1912 | } | 1930 | } |
1913 | 1931 | ||
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | |||
1939 | * 1 if so. This function is part of the RCU implementation; it is -not- | 1957 | * 1 if so. This function is part of the RCU implementation; it is -not- |
1940 | * an exported member of the RCU API. | 1958 | * an exported member of the RCU API. |
1941 | * | 1959 | * |
1942 | * Because we have preemptible RCU, just check whether this CPU needs | 1960 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs |
1943 | * any flavor of RCU. Do not chew up lots of CPU cycles with preemption | 1961 | * any flavor of RCU. |
1944 | * disabled in a most-likely vain attempt to cause RCU not to need this CPU. | ||
1945 | */ | 1962 | */ |
1946 | int rcu_needs_cpu(int cpu) | 1963 | int rcu_needs_cpu(int cpu) |
1947 | { | 1964 | { |
1948 | return rcu_needs_cpu_quick_check(cpu); | 1965 | return rcu_cpu_has_callbacks(cpu); |
1966 | } | ||
1967 | |||
1968 | /* | ||
1969 | * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. | ||
1970 | */ | ||
1971 | static void rcu_prepare_for_idle_init(int cpu) | ||
1972 | { | ||
1973 | } | ||
1974 | |||
1975 | /* | ||
1976 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up | ||
1977 | * after it. | ||
1978 | */ | ||
1979 | static void rcu_cleanup_after_idle(int cpu) | ||
1980 | { | ||
1981 | } | ||
1982 | |||
1983 | /* | ||
1984 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, | ||
1985 | * is nothing. | ||
1986 | */ | ||
1987 | static void rcu_prepare_for_idle(int cpu) | ||
1988 | { | ||
1949 | } | 1989 | } |
1950 | 1990 | ||
1951 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1991 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1952 | 1992 | ||
1953 | #define RCU_NEEDS_CPU_FLUSHES 5 | 1993 | /* |
1994 | * This code is invoked when a CPU goes idle, at which point we want | ||
1995 | * to have the CPU do everything required for RCU so that it can enter | ||
1996 | * the energy-efficient dyntick-idle mode. This is handled by a | ||
1997 | * state machine implemented by rcu_prepare_for_idle() below. | ||
1998 | * | ||
1999 | * The following three preprocessor symbols control this state machine: | ||
2000 | * | ||
2001 | * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt | ||
2002 | * to satisfy RCU. Beyond this point, it is better to incur a periodic | ||
2003 | * scheduling-clock interrupt than to loop through the state machine | ||
2004 | * at full power. | ||
2005 | * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are | ||
2006 | * optional if RCU does not need anything immediately from this | ||
2007 | * CPU, even if this CPU still has RCU callbacks queued. The first | ||
2008 | * times through the state machine are mandatory: we need to give | ||
2009 | * the state machine a chance to communicate a quiescent state | ||
2010 | * to the RCU core. | ||
2011 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted | ||
2012 | * to sleep in dyntick-idle mode with RCU callbacks pending. This | ||
2013 | * is sized to be roughly one RCU grace period. Those energy-efficiency | ||
2014 | * benchmarkers who might otherwise be tempted to set this to a large | ||
2015 | * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your | ||
2016 | * system. And if you are -that- concerned about energy efficiency, | ||
2017 | * just power the system down and be done with it! | ||
2018 | * | ||
2019 | * The values below work well in practice. If future workloads require | ||
2020 | * adjustment, they can be converted into kernel config parameters, though | ||
2021 | * making the state machine smarter might be a better option. | ||
2022 | */ | ||
2023 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | ||
2024 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | ||
2025 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | ||
2026 | |||
1954 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 2027 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
1955 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 2028 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
2029 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | ||
2030 | static ktime_t rcu_idle_gp_wait; | ||
1956 | 2031 | ||
1957 | /* | 2032 | /* |
1958 | * Check to see if any future RCU-related work will need to be done | 2033 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
1959 | * by the current CPU, even if none need be done immediately, returning | 2034 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter |
1960 | * 1 if so. This function is part of the RCU implementation; it is -not- | 2035 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to |
1961 | * an exported member of the RCU API. | 2036 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed |
2037 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
2038 | * it is better to incur scheduling-clock interrupts than to spin | ||
2039 | * continuously for the same time duration! | ||
2040 | */ | ||
2041 | int rcu_needs_cpu(int cpu) | ||
2042 | { | ||
2043 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
2044 | if (!rcu_cpu_has_callbacks(cpu)) | ||
2045 | return 0; | ||
2046 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
2047 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
2048 | } | ||
2049 | |||
2050 | /* | ||
2051 | * Timer handler used to force CPU to start pushing its remaining RCU | ||
2052 | * callbacks in the case where it entered dyntick-idle mode with callbacks | ||
2053 | * pending. The handler doesn't really need to do anything because the | ||
2054 | * real work is done upon re-entry to idle, or by the next scheduling-clock | ||
2055 | * interrupt should idle not be re-entered. | ||
2056 | */ | ||
2057 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | ||
2058 | { | ||
2059 | trace_rcu_prep_idle("Timer"); | ||
2060 | return HRTIMER_NORESTART; | ||
2061 | } | ||
2062 | |||
2063 | /* | ||
2064 | * Initialize the timer used to pull CPUs out of dyntick-idle mode. | ||
2065 | */ | ||
2066 | static void rcu_prepare_for_idle_init(int cpu) | ||
2067 | { | ||
2068 | static int firsttime = 1; | ||
2069 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2070 | |||
2071 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2072 | hrtp->function = rcu_idle_gp_timer_func; | ||
2073 | if (firsttime) { | ||
2074 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | ||
2075 | |||
2076 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2077 | firsttime = 0; | ||
2078 | } | ||
2079 | } | ||
2080 | |||
2081 | /* | ||
2082 | * Clean up for exit from idle. Because we are exiting from idle, there | ||
2083 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | ||
2084 | * do nothing if this timer is not active, so just cancel it unconditionally. | ||
2085 | */ | ||
2086 | static void rcu_cleanup_after_idle(int cpu) | ||
2087 | { | ||
2088 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | ||
2089 | } | ||
2090 | |||
2091 | /* | ||
2092 | * Check to see if any RCU-related work can be done by the current CPU, | ||
2093 | * and if so, schedule a softirq to get it done. This function is part | ||
2094 | * of the RCU implementation; it is -not- an exported member of the RCU API. | ||
1962 | * | 2095 | * |
1963 | * Because we are not supporting preemptible RCU, attempt to accelerate | 2096 | * The idea is for the current CPU to clear out all work required by the |
1964 | * any current grace periods so that RCU no longer needs this CPU, but | 2097 | * RCU core for the current grace period, so that this CPU can be permitted |
1965 | * only if all other CPUs are already in dynticks-idle mode. This will | 2098 | * to enter dyntick-idle mode. In some cases, it will need to be awakened |
1966 | * allow the CPU cores to be powered down immediately, as opposed to after | 2099 | * at the end of the grace period by whatever CPU ends the grace period. |
1967 | * waiting many milliseconds for grace periods to elapse. | 2100 | * This allows CPUs to go dyntick-idle more quickly, and to reduce the |
2101 | * number of wakeups by a modest integer factor. | ||
1968 | * | 2102 | * |
1969 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2103 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
1970 | * disabled, we do one pass of force_quiescent_state(), then do a | 2104 | * disabled, we do one pass of force_quiescent_state(), then do a |
1971 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2105 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
1972 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2106 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. |
2107 | * | ||
2108 | * The caller must have disabled interrupts. | ||
1973 | */ | 2109 | */ |
1974 | int rcu_needs_cpu(int cpu) | 2110 | static void rcu_prepare_for_idle(int cpu) |
1975 | { | 2111 | { |
1976 | int c = 0; | 2112 | unsigned long flags; |
1977 | int snap; | 2113 | |
1978 | int thatcpu; | 2114 | local_irq_save(flags); |
1979 | 2115 | ||
1980 | /* Check for being in the holdoff period. */ | 2116 | /* |
1981 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) | 2117 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
1982 | return rcu_needs_cpu_quick_check(cpu); | 2118 | * Also reset state to avoid prejudicing later attempts. |
1983 | 2119 | */ | |
1984 | /* Don't bother unless we are the last non-dyntick-idle CPU. */ | 2120 | if (!rcu_cpu_has_callbacks(cpu)) { |
1985 | for_each_online_cpu(thatcpu) { | 2121 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
1986 | if (thatcpu == cpu) | 2122 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
1987 | continue; | 2123 | local_irq_restore(flags); |
1988 | snap = atomic_add_return(0, &per_cpu(rcu_dynticks, | 2124 | trace_rcu_prep_idle("No callbacks"); |
1989 | thatcpu).dynticks); | 2125 | return; |
1990 | smp_mb(); /* Order sampling of snap with end of grace period. */ | 2126 | } |
1991 | if ((snap & 0x1) != 0) { | 2127 | |
1992 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2128 | /* |
1993 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2129 | * If in holdoff mode, just return. We will presumably have |
1994 | return rcu_needs_cpu_quick_check(cpu); | 2130 | * refrained from disabling the scheduling-clock tick. |
1995 | } | 2131 | */ |
2132 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | ||
2133 | local_irq_restore(flags); | ||
2134 | trace_rcu_prep_idle("In holdoff"); | ||
2135 | return; | ||
1996 | } | 2136 | } |
1997 | 2137 | ||
1998 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2138 | /* Check and update the rcu_dyntick_drain sequencing. */ |
1999 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2139 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2000 | /* First time through, initialize the counter. */ | 2140 | /* First time through, initialize the counter. */ |
2001 | per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; | 2141 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; |
2142 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | ||
2143 | !rcu_pending(cpu)) { | ||
2144 | /* Can we go dyntick-idle despite still having callbacks? */ | ||
2145 | trace_rcu_prep_idle("Dyntick with callbacks"); | ||
2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | ||
2147 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | ||
2148 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | ||
2149 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | ||
2150 | return; /* Nothing more to do immediately. */ | ||
2002 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2151 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2003 | /* We have hit the limit, so time to give up. */ | 2152 | /* We have hit the limit, so time to give up. */ |
2004 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2153 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2005 | return rcu_needs_cpu_quick_check(cpu); | 2154 | local_irq_restore(flags); |
2155 | trace_rcu_prep_idle("Begin holdoff"); | ||
2156 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | ||
2157 | return; | ||
2006 | } | 2158 | } |
2007 | 2159 | ||
2008 | /* Do one step pushing remaining RCU callbacks through. */ | 2160 | /* |
2161 | * Do one step of pushing the remaining RCU callbacks through | ||
2162 | * the RCU core state machine. | ||
2163 | */ | ||
2164 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
2165 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | ||
2166 | local_irq_restore(flags); | ||
2167 | rcu_preempt_qs(cpu); | ||
2168 | force_quiescent_state(&rcu_preempt_state, 0); | ||
2169 | local_irq_save(flags); | ||
2170 | } | ||
2171 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
2009 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | 2172 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { |
2173 | local_irq_restore(flags); | ||
2010 | rcu_sched_qs(cpu); | 2174 | rcu_sched_qs(cpu); |
2011 | force_quiescent_state(&rcu_sched_state, 0); | 2175 | force_quiescent_state(&rcu_sched_state, 0); |
2012 | c = c || per_cpu(rcu_sched_data, cpu).nxtlist; | 2176 | local_irq_save(flags); |
2013 | } | 2177 | } |
2014 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | 2178 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { |
2179 | local_irq_restore(flags); | ||
2015 | rcu_bh_qs(cpu); | 2180 | rcu_bh_qs(cpu); |
2016 | force_quiescent_state(&rcu_bh_state, 0); | 2181 | force_quiescent_state(&rcu_bh_state, 0); |
2017 | c = c || per_cpu(rcu_bh_data, cpu).nxtlist; | 2182 | local_irq_save(flags); |
2018 | } | 2183 | } |
2019 | 2184 | ||
2020 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | 2185 | /* |
2021 | if (c) | 2186 | * If RCU callbacks are still pending, RCU still needs this CPU. |
2187 | * So try forcing the callbacks through the grace period. | ||
2188 | */ | ||
2189 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2190 | local_irq_restore(flags); | ||
2191 | trace_rcu_prep_idle("More callbacks"); | ||
2022 | invoke_rcu_core(); | 2192 | invoke_rcu_core(); |
2023 | return c; | 2193 | } else { |
2194 | local_irq_restore(flags); | ||
2195 | trace_rcu_prep_idle("Callbacks drained"); | ||
2196 | } | ||
2024 | } | 2197 | } |
2025 | 2198 | ||
2026 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2199 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9feffa4c0695..654cfe67f0d1 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
67 | rdp->completed, rdp->gpnum, | 67 | rdp->completed, rdp->gpnum, |
68 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 68 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
69 | rdp->qs_pending); | 69 | rdp->qs_pending); |
70 | #ifdef CONFIG_NO_HZ | 70 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
71 | seq_printf(m, " dt=%d/%d/%d df=%lu", | ||
72 | atomic_read(&rdp->dynticks->dynticks), | 71 | atomic_read(&rdp->dynticks->dynticks), |
73 | rdp->dynticks->dynticks_nesting, | 72 | rdp->dynticks->dynticks_nesting, |
74 | rdp->dynticks->dynticks_nmi_nesting, | 73 | rdp->dynticks->dynticks_nmi_nesting, |
75 | rdp->dynticks_fqs); | 74 | rdp->dynticks_fqs); |
76 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
77 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 75 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
78 | seq_printf(m, " ql=%ld qs=%c%c%c%c", | 76 | seq_printf(m, " ql=%ld qs=%c%c%c%c", |
79 | rdp->qlen, | 77 | rdp->qlen, |
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
141 | rdp->completed, rdp->gpnum, | 139 | rdp->completed, rdp->gpnum, |
142 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 140 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
143 | rdp->qs_pending); | 141 | rdp->qs_pending); |
144 | #ifdef CONFIG_NO_HZ | 142 | seq_printf(m, ",%d,%llx,%d,%lu", |
145 | seq_printf(m, ",%d,%d,%d,%lu", | ||
146 | atomic_read(&rdp->dynticks->dynticks), | 143 | atomic_read(&rdp->dynticks->dynticks), |
147 | rdp->dynticks->dynticks_nesting, | 144 | rdp->dynticks->dynticks_nesting, |
148 | rdp->dynticks->dynticks_nmi_nesting, | 145 | rdp->dynticks->dynticks_nmi_nesting, |
149 | rdp->dynticks_fqs); | 146 | rdp->dynticks_fqs); |
150 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
151 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 147 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
152 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, | 148 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, |
153 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 149 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
171 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 167 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
172 | { | 168 | { |
173 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); | 169 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); |
174 | #ifdef CONFIG_NO_HZ | ||
175 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 170 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
176 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
177 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); | 171 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); |
178 | #ifdef CONFIG_RCU_BOOST | 172 | #ifdef CONFIG_RCU_BOOST |
179 | seq_puts(m, "\"kt\",\"ktl\""); | 173 | seq_puts(m, "\"kt\",\"ktl\""); |
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
278 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
279 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
280 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", |
281 | rsp->completed, gpnum, rsp->signaled, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
282 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
283 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
284 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
diff --git a/kernel/relay.c b/kernel/relay.c index 226fade4d727..ab56a1764d4d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -164,10 +164,14 @@ depopulate: | |||
164 | */ | 164 | */ |
165 | static struct rchan_buf *relay_create_buf(struct rchan *chan) | 165 | static struct rchan_buf *relay_create_buf(struct rchan *chan) |
166 | { | 166 | { |
167 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); | 167 | struct rchan_buf *buf; |
168 | if (!buf) | 168 | |
169 | if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) | ||
169 | return NULL; | 170 | return NULL; |
170 | 171 | ||
172 | buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); | ||
173 | if (!buf) | ||
174 | return NULL; | ||
171 | buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); | 175 | buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); |
172 | if (!buf->padding) | 176 | if (!buf->padding) |
173 | goto free_buf; | 177 | goto free_buf; |
@@ -302,7 +306,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, | |||
302 | */ | 306 | */ |
303 | static struct dentry *create_buf_file_default_callback(const char *filename, | 307 | static struct dentry *create_buf_file_default_callback(const char *filename, |
304 | struct dentry *parent, | 308 | struct dentry *parent, |
305 | int mode, | 309 | umode_t mode, |
306 | struct rchan_buf *buf, | 310 | struct rchan_buf *buf, |
307 | int *is_global) | 311 | int *is_global) |
308 | { | 312 | { |
@@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, | |||
574 | 578 | ||
575 | if (!(subbuf_size && n_subbufs)) | 579 | if (!(subbuf_size && n_subbufs)) |
576 | return NULL; | 580 | return NULL; |
581 | if (subbuf_size > UINT_MAX / n_subbufs) | ||
582 | return NULL; | ||
577 | 583 | ||
578 | chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); | 584 | chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); |
579 | if (!chan) | 585 | if (!chan) |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 34683efa2cce..d508363858b3 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -66,6 +66,31 @@ done: | |||
66 | return ret; | 66 | return ret; |
67 | } | 67 | } |
68 | 68 | ||
69 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | ||
70 | struct res_counter **limit_fail_at) | ||
71 | { | ||
72 | int ret, r; | ||
73 | unsigned long flags; | ||
74 | struct res_counter *c; | ||
75 | |||
76 | r = ret = 0; | ||
77 | *limit_fail_at = NULL; | ||
78 | local_irq_save(flags); | ||
79 | for (c = counter; c != NULL; c = c->parent) { | ||
80 | spin_lock(&c->lock); | ||
81 | r = res_counter_charge_locked(c, val); | ||
82 | if (r) | ||
83 | c->usage += val; | ||
84 | spin_unlock(&c->lock); | ||
85 | if (r < 0 && ret == 0) { | ||
86 | *limit_fail_at = c; | ||
87 | ret = r; | ||
88 | } | ||
89 | } | ||
90 | local_irq_restore(flags); | ||
91 | |||
92 | return ret; | ||
93 | } | ||
69 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 94 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
70 | { | 95 | { |
71 | if (WARN_ON(counter->usage < val)) | 96 | if (WARN_ON(counter->usage < val)) |
@@ -159,8 +184,7 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
159 | return 0; | 184 | return 0; |
160 | } | 185 | } |
161 | 186 | ||
162 | /* FIXME - make memparse() take const char* args */ | 187 | *res = memparse(buf, &end); |
163 | *res = memparse((char *)buf, &end); | ||
164 | if (*end != '\0') | 188 | if (*end != '\0') |
165 | return -EINVAL; | 189 | return -EINVAL; |
166 | 190 | ||
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 8eafd1bd273e..16502d3a71c8 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
101 | 101 | ||
102 | printk("\n============================================\n"); | 102 | printk("\n============================================\n"); |
103 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | 103 | printk( "[ BUG: circular locking deadlock detected! ]\n"); |
104 | printk("%s\n", print_tainted()); | ||
104 | printk( "--------------------------------------------\n"); | 105 | printk( "--------------------------------------------\n"); |
105 | printk("%s/%d is deadlocking current task %s/%d\n\n", | 106 | printk("%s/%d is deadlocking current task %s/%d\n\n", |
106 | task->comm, task_pid_nr(task), | 107 | task->comm, task_pid_nr(task), |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 3d9f31cd79e7..98ec49475460 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -6,11 +6,11 @@ | |||
6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | 6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | #include <linux/device.h> | ||
9 | #include <linux/kthread.h> | 10 | #include <linux/kthread.h> |
10 | #include <linux/export.h> | 11 | #include <linux/export.h> |
11 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
12 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
13 | #include <linux/sysdev.h> | ||
14 | #include <linux/timer.h> | 14 | #include <linux/timer.h> |
15 | #include <linux/freezer.h> | 15 | #include <linux/freezer.h> |
16 | 16 | ||
@@ -27,7 +27,7 @@ struct test_thread_data { | |||
27 | int opdata; | 27 | int opdata; |
28 | int mutexes[MAX_RT_TEST_MUTEXES]; | 28 | int mutexes[MAX_RT_TEST_MUTEXES]; |
29 | int event; | 29 | int event; |
30 | struct sys_device sysdev; | 30 | struct device dev; |
31 | }; | 31 | }; |
32 | 32 | ||
33 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | 33 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; |
@@ -271,7 +271,7 @@ static int test_func(void *data) | |||
271 | * | 271 | * |
272 | * opcode:data | 272 | * opcode:data |
273 | */ | 273 | */ |
274 | static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, | 274 | static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, |
275 | const char *buf, size_t count) | 275 | const char *buf, size_t count) |
276 | { | 276 | { |
277 | struct sched_param schedpar; | 277 | struct sched_param schedpar; |
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut | |||
279 | char cmdbuf[32]; | 279 | char cmdbuf[32]; |
280 | int op, dat, tid, ret; | 280 | int op, dat, tid, ret; |
281 | 281 | ||
282 | td = container_of(dev, struct test_thread_data, sysdev); | 282 | td = container_of(dev, struct test_thread_data, dev); |
283 | tid = td->sysdev.id; | 283 | tid = td->dev.id; |
284 | 284 | ||
285 | /* strings from sysfs write are not 0 terminated! */ | 285 | /* strings from sysfs write are not 0 terminated! */ |
286 | if (count >= sizeof(cmdbuf)) | 286 | if (count >= sizeof(cmdbuf)) |
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut | |||
334 | * @dev: thread to query | 334 | * @dev: thread to query |
335 | * @buf: char buffer to be filled with thread status info | 335 | * @buf: char buffer to be filled with thread status info |
336 | */ | 336 | */ |
337 | static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, | 337 | static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, |
338 | char *buf) | 338 | char *buf) |
339 | { | 339 | { |
340 | struct test_thread_data *td; | 340 | struct test_thread_data *td; |
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute | |||
342 | char *curr = buf; | 342 | char *curr = buf; |
343 | int i; | 343 | int i; |
344 | 344 | ||
345 | td = container_of(dev, struct test_thread_data, sysdev); | 345 | td = container_of(dev, struct test_thread_data, dev); |
346 | tsk = threads[td->sysdev.id]; | 346 | tsk = threads[td->dev.id]; |
347 | 347 | ||
348 | spin_lock(&rttest_lock); | 348 | spin_lock(&rttest_lock); |
349 | 349 | ||
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute | |||
360 | spin_unlock(&rttest_lock); | 360 | spin_unlock(&rttest_lock); |
361 | 361 | ||
362 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | 362 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, |
363 | mutexes[td->sysdev.id].owner); | 363 | mutexes[td->dev.id].owner); |
364 | 364 | ||
365 | return curr - buf; | 365 | return curr - buf; |
366 | } | 366 | } |
367 | 367 | ||
368 | static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | 368 | static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); |
369 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | 369 | static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); |
370 | 370 | ||
371 | static struct sysdev_class rttest_sysclass = { | 371 | static struct bus_type rttest_subsys = { |
372 | .name = "rttest", | 372 | .name = "rttest", |
373 | .dev_name = "rttest", | ||
373 | }; | 374 | }; |
374 | 375 | ||
375 | static int init_test_thread(int id) | 376 | static int init_test_thread(int id) |
376 | { | 377 | { |
377 | thread_data[id].sysdev.cls = &rttest_sysclass; | 378 | thread_data[id].dev.bus = &rttest_subsys; |
378 | thread_data[id].sysdev.id = id; | 379 | thread_data[id].dev.id = id; |
379 | 380 | ||
380 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | 381 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); |
381 | if (IS_ERR(threads[id])) | 382 | if (IS_ERR(threads[id])) |
382 | return PTR_ERR(threads[id]); | 383 | return PTR_ERR(threads[id]); |
383 | 384 | ||
384 | return sysdev_register(&thread_data[id].sysdev); | 385 | return device_register(&thread_data[id].dev); |
385 | } | 386 | } |
386 | 387 | ||
387 | static int init_rttest(void) | 388 | static int init_rttest(void) |
@@ -393,7 +394,7 @@ static int init_rttest(void) | |||
393 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | 394 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) |
394 | rt_mutex_init(&mutexes[i]); | 395 | rt_mutex_init(&mutexes[i]); |
395 | 396 | ||
396 | ret = sysdev_class_register(&rttest_sysclass); | 397 | ret = subsys_system_register(&rttest_subsys, NULL); |
397 | if (ret) | 398 | if (ret) |
398 | return ret; | 399 | return ret; |
399 | 400 | ||
@@ -401,10 +402,10 @@ static int init_rttest(void) | |||
401 | ret = init_test_thread(i); | 402 | ret = init_test_thread(i); |
402 | if (ret) | 403 | if (ret) |
403 | break; | 404 | break; |
404 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); | 405 | ret = device_create_file(&thread_data[i].dev, &dev_attr_status); |
405 | if (ret) | 406 | if (ret) |
406 | break; | 407 | break; |
407 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); | 408 | ret = device_create_file(&thread_data[i].dev, &dev_attr_command); |
408 | if (ret) | 409 | if (ret) |
409 | break; | 410 | break; |
410 | } | 411 | } |
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index f9d8482dd487..a242e691c993 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
579 | struct rt_mutex_waiter *waiter) | 579 | struct rt_mutex_waiter *waiter) |
580 | { | 580 | { |
581 | int ret = 0; | 581 | int ret = 0; |
582 | int was_disabled; | ||
583 | 582 | ||
584 | for (;;) { | 583 | for (;;) { |
585 | /* Try to acquire the lock: */ | 584 | /* Try to acquire the lock: */ |
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
602 | 601 | ||
603 | raw_spin_unlock(&lock->wait_lock); | 602 | raw_spin_unlock(&lock->wait_lock); |
604 | 603 | ||
605 | was_disabled = irqs_disabled(); | ||
606 | if (was_disabled) | ||
607 | local_irq_enable(); | ||
608 | |||
609 | debug_rt_mutex_print_deadlock(waiter); | 604 | debug_rt_mutex_print_deadlock(waiter); |
610 | 605 | ||
611 | schedule_rt_mutex(lock); | 606 | schedule_rt_mutex(lock); |
612 | 607 | ||
613 | if (was_disabled) | ||
614 | local_irq_disable(); | ||
615 | |||
616 | raw_spin_lock(&lock->wait_lock); | 608 | raw_spin_lock(&lock->wait_lock); |
617 | set_current_state(state); | 609 | set_current_state(state); |
618 | } | 610 | } |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 000000000000..9a7dd35102a3 --- /dev/null +++ b/kernel/sched/Makefile | |||
@@ -0,0 +1,20 @@ | |||
1 | ifdef CONFIG_FUNCTION_TRACER | ||
2 | CFLAGS_REMOVE_clock.o = -pg | ||
3 | endif | ||
4 | |||
5 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
6 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
7 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
8 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
9 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
10 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | ||
12 | endif | ||
13 | |||
14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | ||
15 | obj-$(CONFIG_SMP) += cpupri.o | ||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | ||
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | ||
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | ||
19 | |||
20 | |||
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c index 429242f3c484..e8a1f83ee0e7 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched/auto_group.c | |||
@@ -1,15 +1,19 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | #include "sched.h" | ||
4 | |||
3 | #include <linux/proc_fs.h> | 5 | #include <linux/proc_fs.h> |
4 | #include <linux/seq_file.h> | 6 | #include <linux/seq_file.h> |
5 | #include <linux/kallsyms.h> | 7 | #include <linux/kallsyms.h> |
6 | #include <linux/utsname.h> | 8 | #include <linux/utsname.h> |
9 | #include <linux/security.h> | ||
10 | #include <linux/export.h> | ||
7 | 11 | ||
8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 12 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
9 | static struct autogroup autogroup_default; | 13 | static struct autogroup autogroup_default; |
10 | static atomic_t autogroup_seq_nr; | 14 | static atomic_t autogroup_seq_nr; |
11 | 15 | ||
12 | static void __init autogroup_init(struct task_struct *init_task) | 16 | void __init autogroup_init(struct task_struct *init_task) |
13 | { | 17 | { |
14 | autogroup_default.tg = &root_task_group; | 18 | autogroup_default.tg = &root_task_group; |
15 | kref_init(&autogroup_default.kref); | 19 | kref_init(&autogroup_default.kref); |
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task) | |||
17 | init_task->signal->autogroup = &autogroup_default; | 21 | init_task->signal->autogroup = &autogroup_default; |
18 | } | 22 | } |
19 | 23 | ||
20 | static inline void autogroup_free(struct task_group *tg) | 24 | void autogroup_free(struct task_group *tg) |
21 | { | 25 | { |
22 | kfree(tg->autogroup); | 26 | kfree(tg->autogroup); |
23 | } | 27 | } |
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) | |||
59 | return ag; | 63 | return ag; |
60 | } | 64 | } |
61 | 65 | ||
62 | #ifdef CONFIG_RT_GROUP_SCHED | ||
63 | static void free_rt_sched_group(struct task_group *tg); | ||
64 | #endif | ||
65 | |||
66 | static inline struct autogroup *autogroup_create(void) | 66 | static inline struct autogroup *autogroup_create(void) |
67 | { | 67 | { |
68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | 68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); |
@@ -108,8 +108,7 @@ out_fail: | |||
108 | return autogroup_kref_get(&autogroup_default); | 108 | return autogroup_kref_get(&autogroup_default); |
109 | } | 109 | } |
110 | 110 | ||
111 | static inline bool | 111 | bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) |
112 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
113 | { | 112 | { |
114 | if (tg != &root_task_group) | 113 | if (tg != &root_task_group) |
115 | return false; | 114 | return false; |
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
127 | return true; | 126 | return true; |
128 | } | 127 | } |
129 | 128 | ||
130 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
131 | { | ||
132 | return !!tg->autogroup; | ||
133 | } | ||
134 | |||
135 | static inline struct task_group * | ||
136 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
137 | { | ||
138 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
139 | |||
140 | if (enabled && task_wants_autogroup(p, tg)) | ||
141 | return p->signal->autogroup->tg; | ||
142 | |||
143 | return tg; | ||
144 | } | ||
145 | |||
146 | static void | 129 | static void |
147 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | 130 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) |
148 | { | 131 | { |
@@ -263,7 +246,7 @@ out: | |||
263 | #endif /* CONFIG_PROC_FS */ | 246 | #endif /* CONFIG_PROC_FS */ |
264 | 247 | ||
265 | #ifdef CONFIG_SCHED_DEBUG | 248 | #ifdef CONFIG_SCHED_DEBUG |
266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 249 | int autogroup_path(struct task_group *tg, char *buf, int buflen) |
267 | { | 250 | { |
268 | if (!task_group_is_autogroup(tg)) | 251 | if (!task_group_is_autogroup(tg)) |
269 | return 0; | 252 | return 0; |
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h index c2f0e7248dca..8bd047142816 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched/auto_group.h | |||
@@ -1,5 +1,8 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | #include <linux/kref.h> | ||
4 | #include <linux/rwsem.h> | ||
5 | |||
3 | struct autogroup { | 6 | struct autogroup { |
4 | /* | 7 | /* |
5 | * reference doesn't mean how many thread attach to this | 8 | * reference doesn't mean how many thread attach to this |
@@ -13,9 +16,28 @@ struct autogroup { | |||
13 | int nice; | 16 | int nice; |
14 | }; | 17 | }; |
15 | 18 | ||
16 | static inline bool task_group_is_autogroup(struct task_group *tg); | 19 | extern void autogroup_init(struct task_struct *init_task); |
20 | extern void autogroup_free(struct task_group *tg); | ||
21 | |||
22 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
23 | { | ||
24 | return !!tg->autogroup; | ||
25 | } | ||
26 | |||
27 | extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); | ||
28 | |||
17 | static inline struct task_group * | 29 | static inline struct task_group * |
18 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | 30 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
31 | { | ||
32 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
33 | |||
34 | if (enabled && task_wants_autogroup(p, tg)) | ||
35 | return p->signal->autogroup->tg; | ||
36 | |||
37 | return tg; | ||
38 | } | ||
39 | |||
40 | extern int autogroup_path(struct task_group *tg, char *buf, int buflen); | ||
19 | 41 | ||
20 | #else /* !CONFIG_SCHED_AUTOGROUP */ | 42 | #else /* !CONFIG_SCHED_AUTOGROUP */ |
21 | 43 | ||
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c index c685e31492df..c685e31492df 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched/clock.c | |||
diff --git a/kernel/sched.c b/kernel/sched/core.c index 0e9344a71be3..33a0676ea744 100644 --- a/kernel/sched.c +++ b/kernel/sched/core.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched/core.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
@@ -56,7 +56,6 @@ | |||
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/proc_fs.h> | 57 | #include <linux/proc_fs.h> |
58 | #include <linux/seq_file.h> | 58 | #include <linux/seq_file.h> |
59 | #include <linux/stop_machine.h> | ||
60 | #include <linux/sysctl.h> | 59 | #include <linux/sysctl.h> |
61 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
62 | #include <linux/times.h> | 61 | #include <linux/times.h> |
@@ -71,6 +70,7 @@ | |||
71 | #include <linux/ctype.h> | 70 | #include <linux/ctype.h> |
72 | #include <linux/ftrace.h> | 71 | #include <linux/ftrace.h> |
73 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
73 | #include <linux/init_task.h> | ||
74 | 74 | ||
75 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
@@ -79,124 +79,13 @@ | |||
79 | #include <asm/paravirt.h> | 79 | #include <asm/paravirt.h> |
80 | #endif | 80 | #endif |
81 | 81 | ||
82 | #include "sched_cpupri.h" | 82 | #include "sched.h" |
83 | #include "workqueue_sched.h" | 83 | #include "../workqueue_sched.h" |
84 | #include "sched_autogroup.h" | ||
85 | 84 | ||
86 | #define CREATE_TRACE_POINTS | 85 | #define CREATE_TRACE_POINTS |
87 | #include <trace/events/sched.h> | 86 | #include <trace/events/sched.h> |
88 | 87 | ||
89 | /* | 88 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
90 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
91 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
92 | * and back. | ||
93 | */ | ||
94 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
95 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
96 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
97 | |||
98 | /* | ||
99 | * 'User priority' is the nice value converted to something we | ||
100 | * can work with better when scaling various scheduler parameters, | ||
101 | * it's a [ 0 ... 39 ] range. | ||
102 | */ | ||
103 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
104 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
105 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
106 | |||
107 | /* | ||
108 | * Helpers for converting nanosecond timing to jiffy resolution | ||
109 | */ | ||
110 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
111 | |||
112 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
113 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
114 | |||
115 | /* | ||
116 | * These are the 'tuning knobs' of the scheduler: | ||
117 | * | ||
118 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
119 | * Timeslices get refilled after they expire. | ||
120 | */ | ||
121 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
122 | |||
123 | /* | ||
124 | * single value that denotes runtime == period, ie unlimited time. | ||
125 | */ | ||
126 | #define RUNTIME_INF ((u64)~0ULL) | ||
127 | |||
128 | static inline int rt_policy(int policy) | ||
129 | { | ||
130 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
131 | return 1; | ||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | static inline int task_has_rt_policy(struct task_struct *p) | ||
136 | { | ||
137 | return rt_policy(p->policy); | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * This is the priority-queue data structure of the RT scheduling class: | ||
142 | */ | ||
143 | struct rt_prio_array { | ||
144 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
145 | struct list_head queue[MAX_RT_PRIO]; | ||
146 | }; | ||
147 | |||
148 | struct rt_bandwidth { | ||
149 | /* nests inside the rq lock: */ | ||
150 | raw_spinlock_t rt_runtime_lock; | ||
151 | ktime_t rt_period; | ||
152 | u64 rt_runtime; | ||
153 | struct hrtimer rt_period_timer; | ||
154 | }; | ||
155 | |||
156 | static struct rt_bandwidth def_rt_bandwidth; | ||
157 | |||
158 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
159 | |||
160 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
161 | { | ||
162 | struct rt_bandwidth *rt_b = | ||
163 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
164 | ktime_t now; | ||
165 | int overrun; | ||
166 | int idle = 0; | ||
167 | |||
168 | for (;;) { | ||
169 | now = hrtimer_cb_get_time(timer); | ||
170 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
171 | |||
172 | if (!overrun) | ||
173 | break; | ||
174 | |||
175 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
176 | } | ||
177 | |||
178 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
179 | } | ||
180 | |||
181 | static | ||
182 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
183 | { | ||
184 | rt_b->rt_period = ns_to_ktime(period); | ||
185 | rt_b->rt_runtime = runtime; | ||
186 | |||
187 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
188 | |||
189 | hrtimer_init(&rt_b->rt_period_timer, | ||
190 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
191 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
192 | } | ||
193 | |||
194 | static inline int rt_bandwidth_enabled(void) | ||
195 | { | ||
196 | return sysctl_sched_rt_runtime >= 0; | ||
197 | } | ||
198 | |||
199 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | ||
200 | { | 89 | { |
201 | unsigned long delta; | 90 | unsigned long delta; |
202 | ktime_t soft, hard, now; | 91 | ktime_t soft, hard, now; |
@@ -216,580 +105,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | |||
216 | } | 105 | } |
217 | } | 106 | } |
218 | 107 | ||
219 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 108 | DEFINE_MUTEX(sched_domains_mutex); |
220 | { | 109 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
221 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
222 | return; | ||
223 | |||
224 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
225 | return; | ||
226 | |||
227 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
228 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); | ||
229 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
230 | } | ||
231 | |||
232 | #ifdef CONFIG_RT_GROUP_SCHED | ||
233 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
234 | { | ||
235 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
236 | } | ||
237 | #endif | ||
238 | |||
239 | /* | ||
240 | * sched_domains_mutex serializes calls to init_sched_domains, | ||
241 | * detach_destroy_domains and partition_sched_domains. | ||
242 | */ | ||
243 | static DEFINE_MUTEX(sched_domains_mutex); | ||
244 | |||
245 | #ifdef CONFIG_CGROUP_SCHED | ||
246 | |||
247 | #include <linux/cgroup.h> | ||
248 | |||
249 | struct cfs_rq; | ||
250 | |||
251 | static LIST_HEAD(task_groups); | ||
252 | |||
253 | struct cfs_bandwidth { | ||
254 | #ifdef CONFIG_CFS_BANDWIDTH | ||
255 | raw_spinlock_t lock; | ||
256 | ktime_t period; | ||
257 | u64 quota, runtime; | ||
258 | s64 hierarchal_quota; | ||
259 | u64 runtime_expires; | ||
260 | |||
261 | int idle, timer_active; | ||
262 | struct hrtimer period_timer, slack_timer; | ||
263 | struct list_head throttled_cfs_rq; | ||
264 | |||
265 | /* statistics */ | ||
266 | int nr_periods, nr_throttled; | ||
267 | u64 throttled_time; | ||
268 | #endif | ||
269 | }; | ||
270 | |||
271 | /* task group related information */ | ||
272 | struct task_group { | ||
273 | struct cgroup_subsys_state css; | ||
274 | |||
275 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
276 | /* schedulable entities of this group on each cpu */ | ||
277 | struct sched_entity **se; | ||
278 | /* runqueue "owned" by this group on each cpu */ | ||
279 | struct cfs_rq **cfs_rq; | ||
280 | unsigned long shares; | ||
281 | |||
282 | atomic_t load_weight; | ||
283 | #endif | ||
284 | |||
285 | #ifdef CONFIG_RT_GROUP_SCHED | ||
286 | struct sched_rt_entity **rt_se; | ||
287 | struct rt_rq **rt_rq; | ||
288 | |||
289 | struct rt_bandwidth rt_bandwidth; | ||
290 | #endif | ||
291 | |||
292 | struct rcu_head rcu; | ||
293 | struct list_head list; | ||
294 | |||
295 | struct task_group *parent; | ||
296 | struct list_head siblings; | ||
297 | struct list_head children; | ||
298 | |||
299 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
300 | struct autogroup *autogroup; | ||
301 | #endif | ||
302 | |||
303 | struct cfs_bandwidth cfs_bandwidth; | ||
304 | }; | ||
305 | |||
306 | /* task_group_lock serializes the addition/removal of task groups */ | ||
307 | static DEFINE_SPINLOCK(task_group_lock); | ||
308 | |||
309 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
310 | |||
311 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
312 | |||
313 | /* | ||
314 | * A weight of 0 or 1 can cause arithmetics problems. | ||
315 | * A weight of a cfs_rq is the sum of weights of which entities | ||
316 | * are queued on this cfs_rq, so a weight of a entity should not be | ||
317 | * too large, so as the shares value of a task group. | ||
318 | * (The default weight is 1024 - so there's no practical | ||
319 | * limitation from this.) | ||
320 | */ | ||
321 | #define MIN_SHARES (1UL << 1) | ||
322 | #define MAX_SHARES (1UL << 18) | ||
323 | |||
324 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | ||
325 | #endif | ||
326 | |||
327 | /* Default task group. | ||
328 | * Every task in system belong to this group at bootup. | ||
329 | */ | ||
330 | struct task_group root_task_group; | ||
331 | |||
332 | #endif /* CONFIG_CGROUP_SCHED */ | ||
333 | |||
334 | /* CFS-related fields in a runqueue */ | ||
335 | struct cfs_rq { | ||
336 | struct load_weight load; | ||
337 | unsigned long nr_running, h_nr_running; | ||
338 | |||
339 | u64 exec_clock; | ||
340 | u64 min_vruntime; | ||
341 | #ifndef CONFIG_64BIT | ||
342 | u64 min_vruntime_copy; | ||
343 | #endif | ||
344 | |||
345 | struct rb_root tasks_timeline; | ||
346 | struct rb_node *rb_leftmost; | ||
347 | |||
348 | struct list_head tasks; | ||
349 | struct list_head *balance_iterator; | ||
350 | |||
351 | /* | ||
352 | * 'curr' points to currently running entity on this cfs_rq. | ||
353 | * It is set to NULL otherwise (i.e when none are currently running). | ||
354 | */ | ||
355 | struct sched_entity *curr, *next, *last, *skip; | ||
356 | |||
357 | #ifdef CONFIG_SCHED_DEBUG | ||
358 | unsigned int nr_spread_over; | ||
359 | #endif | ||
360 | |||
361 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
362 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
363 | |||
364 | /* | ||
365 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
366 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
367 | * (like users, containers etc.) | ||
368 | * | ||
369 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
370 | * list is used during load balance. | ||
371 | */ | ||
372 | int on_list; | ||
373 | struct list_head leaf_cfs_rq_list; | ||
374 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
375 | |||
376 | #ifdef CONFIG_SMP | ||
377 | /* | ||
378 | * the part of load.weight contributed by tasks | ||
379 | */ | ||
380 | unsigned long task_weight; | ||
381 | |||
382 | /* | ||
383 | * h_load = weight * f(tg) | ||
384 | * | ||
385 | * Where f(tg) is the recursive weight fraction assigned to | ||
386 | * this group. | ||
387 | */ | ||
388 | unsigned long h_load; | ||
389 | |||
390 | /* | ||
391 | * Maintaining per-cpu shares distribution for group scheduling | ||
392 | * | ||
393 | * load_stamp is the last time we updated the load average | ||
394 | * load_last is the last time we updated the load average and saw load | ||
395 | * load_unacc_exec_time is currently unaccounted execution time | ||
396 | */ | ||
397 | u64 load_avg; | ||
398 | u64 load_period; | ||
399 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
400 | |||
401 | unsigned long load_contribution; | ||
402 | #endif | ||
403 | #ifdef CONFIG_CFS_BANDWIDTH | ||
404 | int runtime_enabled; | ||
405 | u64 runtime_expires; | ||
406 | s64 runtime_remaining; | ||
407 | |||
408 | u64 throttled_timestamp; | ||
409 | int throttled, throttle_count; | ||
410 | struct list_head throttled_list; | ||
411 | #endif | ||
412 | #endif | ||
413 | }; | ||
414 | |||
415 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
416 | #ifdef CONFIG_CFS_BANDWIDTH | ||
417 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
418 | { | ||
419 | return &tg->cfs_bandwidth; | ||
420 | } | ||
421 | |||
422 | static inline u64 default_cfs_period(void); | ||
423 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
424 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
425 | |||
426 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
427 | { | ||
428 | struct cfs_bandwidth *cfs_b = | ||
429 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
430 | do_sched_cfs_slack_timer(cfs_b); | ||
431 | |||
432 | return HRTIMER_NORESTART; | ||
433 | } | ||
434 | |||
435 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
436 | { | ||
437 | struct cfs_bandwidth *cfs_b = | ||
438 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
439 | ktime_t now; | ||
440 | int overrun; | ||
441 | int idle = 0; | ||
442 | |||
443 | for (;;) { | ||
444 | now = hrtimer_cb_get_time(timer); | ||
445 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
446 | |||
447 | if (!overrun) | ||
448 | break; | ||
449 | |||
450 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
451 | } | ||
452 | |||
453 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
454 | } | ||
455 | |||
456 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
457 | { | ||
458 | raw_spin_lock_init(&cfs_b->lock); | ||
459 | cfs_b->runtime = 0; | ||
460 | cfs_b->quota = RUNTIME_INF; | ||
461 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
462 | |||
463 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
464 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
465 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
466 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
467 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
468 | } | ||
469 | |||
470 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
471 | { | ||
472 | cfs_rq->runtime_enabled = 0; | ||
473 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
474 | } | ||
475 | |||
476 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
477 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
478 | { | ||
479 | /* | ||
480 | * The timer may be active because we're trying to set a new bandwidth | ||
481 | * period or because we're racing with the tear-down path | ||
482 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
483 | * terminates). In either case we ensure that it's re-programmed | ||
484 | */ | ||
485 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
486 | raw_spin_unlock(&cfs_b->lock); | ||
487 | /* ensure cfs_b->lock is available while we wait */ | ||
488 | hrtimer_cancel(&cfs_b->period_timer); | ||
489 | |||
490 | raw_spin_lock(&cfs_b->lock); | ||
491 | /* if someone else restarted the timer then we're done */ | ||
492 | if (cfs_b->timer_active) | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | cfs_b->timer_active = 1; | ||
497 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
498 | } | ||
499 | |||
500 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
501 | { | ||
502 | hrtimer_cancel(&cfs_b->period_timer); | ||
503 | hrtimer_cancel(&cfs_b->slack_timer); | ||
504 | } | ||
505 | #else | ||
506 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
507 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
508 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
509 | |||
510 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
511 | { | ||
512 | return NULL; | ||
513 | } | ||
514 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
516 | |||
517 | /* Real-Time classes' related field in a runqueue: */ | ||
518 | struct rt_rq { | ||
519 | struct rt_prio_array active; | ||
520 | unsigned long rt_nr_running; | ||
521 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
522 | struct { | ||
523 | int curr; /* highest queued rt task prio */ | ||
524 | #ifdef CONFIG_SMP | ||
525 | int next; /* next highest */ | ||
526 | #endif | ||
527 | } highest_prio; | ||
528 | #endif | ||
529 | #ifdef CONFIG_SMP | ||
530 | unsigned long rt_nr_migratory; | ||
531 | unsigned long rt_nr_total; | ||
532 | int overloaded; | ||
533 | struct plist_head pushable_tasks; | ||
534 | #endif | ||
535 | int rt_throttled; | ||
536 | u64 rt_time; | ||
537 | u64 rt_runtime; | ||
538 | /* Nests inside the rq lock: */ | ||
539 | raw_spinlock_t rt_runtime_lock; | ||
540 | |||
541 | #ifdef CONFIG_RT_GROUP_SCHED | ||
542 | unsigned long rt_nr_boosted; | ||
543 | |||
544 | struct rq *rq; | ||
545 | struct list_head leaf_rt_rq_list; | ||
546 | struct task_group *tg; | ||
547 | #endif | ||
548 | }; | ||
549 | |||
550 | #ifdef CONFIG_SMP | ||
551 | |||
552 | /* | ||
553 | * We add the notion of a root-domain which will be used to define per-domain | ||
554 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
555 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
556 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
557 | * object. | ||
558 | * | ||
559 | */ | ||
560 | struct root_domain { | ||
561 | atomic_t refcount; | ||
562 | atomic_t rto_count; | ||
563 | struct rcu_head rcu; | ||
564 | cpumask_var_t span; | ||
565 | cpumask_var_t online; | ||
566 | |||
567 | /* | ||
568 | * The "RT overload" flag: it gets set if a CPU has more than | ||
569 | * one runnable RT task. | ||
570 | */ | ||
571 | cpumask_var_t rto_mask; | ||
572 | struct cpupri cpupri; | ||
573 | }; | ||
574 | |||
575 | /* | ||
576 | * By default the system creates a single root-domain with all cpus as | ||
577 | * members (mimicking the global state we have today). | ||
578 | */ | ||
579 | static struct root_domain def_root_domain; | ||
580 | |||
581 | #endif /* CONFIG_SMP */ | ||
582 | |||
583 | /* | ||
584 | * This is the main, per-CPU runqueue data structure. | ||
585 | * | ||
586 | * Locking rule: those places that want to lock multiple runqueues | ||
587 | * (such as the load balancing or the thread migration code), lock | ||
588 | * acquire operations must be ordered by ascending &runqueue. | ||
589 | */ | ||
590 | struct rq { | ||
591 | /* runqueue lock: */ | ||
592 | raw_spinlock_t lock; | ||
593 | |||
594 | /* | ||
595 | * nr_running and cpu_load should be in the same cacheline because | ||
596 | * remote CPUs use both these fields when doing load calculation. | ||
597 | */ | ||
598 | unsigned long nr_running; | ||
599 | #define CPU_LOAD_IDX_MAX 5 | ||
600 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
601 | unsigned long last_load_update_tick; | ||
602 | #ifdef CONFIG_NO_HZ | ||
603 | u64 nohz_stamp; | ||
604 | unsigned char nohz_balance_kick; | ||
605 | #endif | ||
606 | int skip_clock_update; | ||
607 | |||
608 | /* capture load from *all* tasks on this cpu: */ | ||
609 | struct load_weight load; | ||
610 | unsigned long nr_load_updates; | ||
611 | u64 nr_switches; | ||
612 | |||
613 | struct cfs_rq cfs; | ||
614 | struct rt_rq rt; | ||
615 | |||
616 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
617 | /* list of leaf cfs_rq on this cpu: */ | ||
618 | struct list_head leaf_cfs_rq_list; | ||
619 | #endif | ||
620 | #ifdef CONFIG_RT_GROUP_SCHED | ||
621 | struct list_head leaf_rt_rq_list; | ||
622 | #endif | ||
623 | |||
624 | /* | ||
625 | * This is part of a global counter where only the total sum | ||
626 | * over all CPUs matters. A task can increase this counter on | ||
627 | * one CPU and if it got migrated afterwards it may decrease | ||
628 | * it on another CPU. Always updated under the runqueue lock: | ||
629 | */ | ||
630 | unsigned long nr_uninterruptible; | ||
631 | |||
632 | struct task_struct *curr, *idle, *stop; | ||
633 | unsigned long next_balance; | ||
634 | struct mm_struct *prev_mm; | ||
635 | |||
636 | u64 clock; | ||
637 | u64 clock_task; | ||
638 | |||
639 | atomic_t nr_iowait; | ||
640 | |||
641 | #ifdef CONFIG_SMP | ||
642 | struct root_domain *rd; | ||
643 | struct sched_domain *sd; | ||
644 | |||
645 | unsigned long cpu_power; | ||
646 | |||
647 | unsigned char idle_balance; | ||
648 | /* For active balancing */ | ||
649 | int post_schedule; | ||
650 | int active_balance; | ||
651 | int push_cpu; | ||
652 | struct cpu_stop_work active_balance_work; | ||
653 | /* cpu of this runqueue: */ | ||
654 | int cpu; | ||
655 | int online; | ||
656 | |||
657 | u64 rt_avg; | ||
658 | u64 age_stamp; | ||
659 | u64 idle_stamp; | ||
660 | u64 avg_idle; | ||
661 | #endif | ||
662 | |||
663 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
664 | u64 prev_irq_time; | ||
665 | #endif | ||
666 | #ifdef CONFIG_PARAVIRT | ||
667 | u64 prev_steal_time; | ||
668 | #endif | ||
669 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
670 | u64 prev_steal_time_rq; | ||
671 | #endif | ||
672 | |||
673 | /* calc_load related fields */ | ||
674 | unsigned long calc_load_update; | ||
675 | long calc_load_active; | ||
676 | |||
677 | #ifdef CONFIG_SCHED_HRTICK | ||
678 | #ifdef CONFIG_SMP | ||
679 | int hrtick_csd_pending; | ||
680 | struct call_single_data hrtick_csd; | ||
681 | #endif | ||
682 | struct hrtimer hrtick_timer; | ||
683 | #endif | ||
684 | |||
685 | #ifdef CONFIG_SCHEDSTATS | ||
686 | /* latency stats */ | ||
687 | struct sched_info rq_sched_info; | ||
688 | unsigned long long rq_cpu_time; | ||
689 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | ||
690 | |||
691 | /* sys_sched_yield() stats */ | ||
692 | unsigned int yld_count; | ||
693 | |||
694 | /* schedule() stats */ | ||
695 | unsigned int sched_switch; | ||
696 | unsigned int sched_count; | ||
697 | unsigned int sched_goidle; | ||
698 | |||
699 | /* try_to_wake_up() stats */ | ||
700 | unsigned int ttwu_count; | ||
701 | unsigned int ttwu_local; | ||
702 | #endif | ||
703 | |||
704 | #ifdef CONFIG_SMP | ||
705 | struct llist_head wake_list; | ||
706 | #endif | ||
707 | }; | ||
708 | |||
709 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | ||
710 | |||
711 | |||
712 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
713 | |||
714 | static inline int cpu_of(struct rq *rq) | ||
715 | { | ||
716 | #ifdef CONFIG_SMP | ||
717 | return rq->cpu; | ||
718 | #else | ||
719 | return 0; | ||
720 | #endif | ||
721 | } | ||
722 | |||
723 | #define rcu_dereference_check_sched_domain(p) \ | ||
724 | rcu_dereference_check((p), \ | ||
725 | lockdep_is_held(&sched_domains_mutex)) | ||
726 | |||
727 | /* | ||
728 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
729 | * See detach_destroy_domains: synchronize_sched for details. | ||
730 | * | ||
731 | * The domain tree of any CPU may only be accessed from within | ||
732 | * preempt-disabled sections. | ||
733 | */ | ||
734 | #define for_each_domain(cpu, __sd) \ | ||
735 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | ||
736 | |||
737 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
738 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
739 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
740 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
741 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
742 | |||
743 | #ifdef CONFIG_CGROUP_SCHED | ||
744 | |||
745 | /* | ||
746 | * Return the group to which this tasks belongs. | ||
747 | * | ||
748 | * We use task_subsys_state_check() and extend the RCU verification with | ||
749 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | ||
750 | * task it moves into the cgroup. Therefore by holding either of those locks, | ||
751 | * we pin the task to the current cgroup. | ||
752 | */ | ||
753 | static inline struct task_group *task_group(struct task_struct *p) | ||
754 | { | ||
755 | struct task_group *tg; | ||
756 | struct cgroup_subsys_state *css; | ||
757 | |||
758 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
759 | lockdep_is_held(&p->pi_lock) || | ||
760 | lockdep_is_held(&task_rq(p)->lock)); | ||
761 | tg = container_of(css, struct task_group, css); | ||
762 | |||
763 | return autogroup_task_group(p, tg); | ||
764 | } | ||
765 | |||
766 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
767 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | ||
768 | { | ||
769 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
770 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | ||
771 | p->se.parent = task_group(p)->se[cpu]; | ||
772 | #endif | ||
773 | |||
774 | #ifdef CONFIG_RT_GROUP_SCHED | ||
775 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
776 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
777 | #endif | ||
778 | } | ||
779 | |||
780 | #else /* CONFIG_CGROUP_SCHED */ | ||
781 | |||
782 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | ||
783 | static inline struct task_group *task_group(struct task_struct *p) | ||
784 | { | ||
785 | return NULL; | ||
786 | } | ||
787 | |||
788 | #endif /* CONFIG_CGROUP_SCHED */ | ||
789 | 110 | ||
790 | static void update_rq_clock_task(struct rq *rq, s64 delta); | 111 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
791 | 112 | ||
792 | static void update_rq_clock(struct rq *rq) | 113 | void update_rq_clock(struct rq *rq) |
793 | { | 114 | { |
794 | s64 delta; | 115 | s64 delta; |
795 | 116 | ||
@@ -802,44 +123,14 @@ static void update_rq_clock(struct rq *rq) | |||
802 | } | 123 | } |
803 | 124 | ||
804 | /* | 125 | /* |
805 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
806 | */ | ||
807 | #ifdef CONFIG_SCHED_DEBUG | ||
808 | # define const_debug __read_mostly | ||
809 | #else | ||
810 | # define const_debug static const | ||
811 | #endif | ||
812 | |||
813 | /** | ||
814 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked | ||
815 | * @cpu: the processor in question. | ||
816 | * | ||
817 | * This interface allows printk to be called with the runqueue lock | ||
818 | * held and know whether or not it is OK to wake up the klogd. | ||
819 | */ | ||
820 | int runqueue_is_locked(int cpu) | ||
821 | { | ||
822 | return raw_spin_is_locked(&cpu_rq(cpu)->lock); | ||
823 | } | ||
824 | |||
825 | /* | ||
826 | * Debugging: various feature bits | 126 | * Debugging: various feature bits |
827 | */ | 127 | */ |
828 | 128 | ||
829 | #define SCHED_FEAT(name, enabled) \ | 129 | #define SCHED_FEAT(name, enabled) \ |
830 | __SCHED_FEAT_##name , | ||
831 | |||
832 | enum { | ||
833 | #include "sched_features.h" | ||
834 | }; | ||
835 | |||
836 | #undef SCHED_FEAT | ||
837 | |||
838 | #define SCHED_FEAT(name, enabled) \ | ||
839 | (1UL << __SCHED_FEAT_##name) * enabled | | 130 | (1UL << __SCHED_FEAT_##name) * enabled | |
840 | 131 | ||
841 | const_debug unsigned int sysctl_sched_features = | 132 | const_debug unsigned int sysctl_sched_features = |
842 | #include "sched_features.h" | 133 | #include "features.h" |
843 | 0; | 134 | 0; |
844 | 135 | ||
845 | #undef SCHED_FEAT | 136 | #undef SCHED_FEAT |
@@ -849,7 +140,7 @@ const_debug unsigned int sysctl_sched_features = | |||
849 | #name , | 140 | #name , |
850 | 141 | ||
851 | static __read_mostly char *sched_feat_names[] = { | 142 | static __read_mostly char *sched_feat_names[] = { |
852 | #include "sched_features.h" | 143 | #include "features.h" |
853 | NULL | 144 | NULL |
854 | }; | 145 | }; |
855 | 146 | ||
@@ -859,7 +150,7 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
859 | { | 150 | { |
860 | int i; | 151 | int i; |
861 | 152 | ||
862 | for (i = 0; sched_feat_names[i]; i++) { | 153 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
863 | if (!(sysctl_sched_features & (1UL << i))) | 154 | if (!(sysctl_sched_features & (1UL << i))) |
864 | seq_puts(m, "NO_"); | 155 | seq_puts(m, "NO_"); |
865 | seq_printf(m, "%s ", sched_feat_names[i]); | 156 | seq_printf(m, "%s ", sched_feat_names[i]); |
@@ -869,6 +160,36 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
869 | return 0; | 160 | return 0; |
870 | } | 161 | } |
871 | 162 | ||
163 | #ifdef HAVE_JUMP_LABEL | ||
164 | |||
165 | #define jump_label_key__true jump_label_key_enabled | ||
166 | #define jump_label_key__false jump_label_key_disabled | ||
167 | |||
168 | #define SCHED_FEAT(name, enabled) \ | ||
169 | jump_label_key__##enabled , | ||
170 | |||
171 | struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
172 | #include "features.h" | ||
173 | }; | ||
174 | |||
175 | #undef SCHED_FEAT | ||
176 | |||
177 | static void sched_feat_disable(int i) | ||
178 | { | ||
179 | if (jump_label_enabled(&sched_feat_keys[i])) | ||
180 | jump_label_dec(&sched_feat_keys[i]); | ||
181 | } | ||
182 | |||
183 | static void sched_feat_enable(int i) | ||
184 | { | ||
185 | if (!jump_label_enabled(&sched_feat_keys[i])) | ||
186 | jump_label_inc(&sched_feat_keys[i]); | ||
187 | } | ||
188 | #else | ||
189 | static void sched_feat_disable(int i) { }; | ||
190 | static void sched_feat_enable(int i) { }; | ||
191 | #endif /* HAVE_JUMP_LABEL */ | ||
192 | |||
872 | static ssize_t | 193 | static ssize_t |
873 | sched_feat_write(struct file *filp, const char __user *ubuf, | 194 | sched_feat_write(struct file *filp, const char __user *ubuf, |
874 | size_t cnt, loff_t *ppos) | 195 | size_t cnt, loff_t *ppos) |
@@ -892,17 +213,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
892 | cmp += 3; | 213 | cmp += 3; |
893 | } | 214 | } |
894 | 215 | ||
895 | for (i = 0; sched_feat_names[i]; i++) { | 216 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
896 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | 217 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
897 | if (neg) | 218 | if (neg) { |
898 | sysctl_sched_features &= ~(1UL << i); | 219 | sysctl_sched_features &= ~(1UL << i); |
899 | else | 220 | sched_feat_disable(i); |
221 | } else { | ||
900 | sysctl_sched_features |= (1UL << i); | 222 | sysctl_sched_features |= (1UL << i); |
223 | sched_feat_enable(i); | ||
224 | } | ||
901 | break; | 225 | break; |
902 | } | 226 | } |
903 | } | 227 | } |
904 | 228 | ||
905 | if (!sched_feat_names[i]) | 229 | if (i == __SCHED_FEAT_NR) |
906 | return -EINVAL; | 230 | return -EINVAL; |
907 | 231 | ||
908 | *ppos += cnt; | 232 | *ppos += cnt; |
@@ -931,10 +255,7 @@ static __init int sched_init_debug(void) | |||
931 | return 0; | 255 | return 0; |
932 | } | 256 | } |
933 | late_initcall(sched_init_debug); | 257 | late_initcall(sched_init_debug); |
934 | 258 | #endif /* CONFIG_SCHED_DEBUG */ | |
935 | #endif | ||
936 | |||
937 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
938 | 259 | ||
939 | /* | 260 | /* |
940 | * Number of tasks to iterate in a single balance run. | 261 | * Number of tasks to iterate in a single balance run. |
@@ -956,7 +277,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | |||
956 | */ | 277 | */ |
957 | unsigned int sysctl_sched_rt_period = 1000000; | 278 | unsigned int sysctl_sched_rt_period = 1000000; |
958 | 279 | ||
959 | static __read_mostly int scheduler_running; | 280 | __read_mostly int scheduler_running; |
960 | 281 | ||
961 | /* | 282 | /* |
962 | * part of the period that we allow rt tasks to run in us. | 283 | * part of the period that we allow rt tasks to run in us. |
@@ -964,112 +285,7 @@ static __read_mostly int scheduler_running; | |||
964 | */ | 285 | */ |
965 | int sysctl_sched_rt_runtime = 950000; | 286 | int sysctl_sched_rt_runtime = 950000; |
966 | 287 | ||
967 | static inline u64 global_rt_period(void) | ||
968 | { | ||
969 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
970 | } | ||
971 | |||
972 | static inline u64 global_rt_runtime(void) | ||
973 | { | ||
974 | if (sysctl_sched_rt_runtime < 0) | ||
975 | return RUNTIME_INF; | ||
976 | |||
977 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
978 | } | ||
979 | |||
980 | #ifndef prepare_arch_switch | ||
981 | # define prepare_arch_switch(next) do { } while (0) | ||
982 | #endif | ||
983 | #ifndef finish_arch_switch | ||
984 | # define finish_arch_switch(prev) do { } while (0) | ||
985 | #endif | ||
986 | |||
987 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
988 | { | ||
989 | return rq->curr == p; | ||
990 | } | ||
991 | |||
992 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
993 | { | ||
994 | #ifdef CONFIG_SMP | ||
995 | return p->on_cpu; | ||
996 | #else | ||
997 | return task_current(rq, p); | ||
998 | #endif | ||
999 | } | ||
1000 | |||
1001 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
1002 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
1003 | { | ||
1004 | #ifdef CONFIG_SMP | ||
1005 | /* | ||
1006 | * We can optimise this out completely for !SMP, because the | ||
1007 | * SMP rebalancing from interrupt is the only thing that cares | ||
1008 | * here. | ||
1009 | */ | ||
1010 | next->on_cpu = 1; | ||
1011 | #endif | ||
1012 | } | ||
1013 | 288 | ||
1014 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
1015 | { | ||
1016 | #ifdef CONFIG_SMP | ||
1017 | /* | ||
1018 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
1019 | * We must ensure this doesn't happen until the switch is completely | ||
1020 | * finished. | ||
1021 | */ | ||
1022 | smp_wmb(); | ||
1023 | prev->on_cpu = 0; | ||
1024 | #endif | ||
1025 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
1026 | /* this is a valid case when another task releases the spinlock */ | ||
1027 | rq->lock.owner = current; | ||
1028 | #endif | ||
1029 | /* | ||
1030 | * If we are tracking spinlock dependencies then we have to | ||
1031 | * fix up the runqueue lock - which gets 'carried over' from | ||
1032 | * prev into current: | ||
1033 | */ | ||
1034 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
1035 | |||
1036 | raw_spin_unlock_irq(&rq->lock); | ||
1037 | } | ||
1038 | |||
1039 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
1040 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
1041 | { | ||
1042 | #ifdef CONFIG_SMP | ||
1043 | /* | ||
1044 | * We can optimise this out completely for !SMP, because the | ||
1045 | * SMP rebalancing from interrupt is the only thing that cares | ||
1046 | * here. | ||
1047 | */ | ||
1048 | next->on_cpu = 1; | ||
1049 | #endif | ||
1050 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1051 | raw_spin_unlock_irq(&rq->lock); | ||
1052 | #else | ||
1053 | raw_spin_unlock(&rq->lock); | ||
1054 | #endif | ||
1055 | } | ||
1056 | |||
1057 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
1058 | { | ||
1059 | #ifdef CONFIG_SMP | ||
1060 | /* | ||
1061 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
1062 | * We must ensure this doesn't happen until the switch is completely | ||
1063 | * finished. | ||
1064 | */ | ||
1065 | smp_wmb(); | ||
1066 | prev->on_cpu = 0; | ||
1067 | #endif | ||
1068 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1069 | local_irq_enable(); | ||
1070 | #endif | ||
1071 | } | ||
1072 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
1073 | 289 | ||
1074 | /* | 290 | /* |
1075 | * __task_rq_lock - lock the rq @p resides on. | 291 | * __task_rq_lock - lock the rq @p resides on. |
@@ -1152,20 +368,6 @@ static struct rq *this_rq_lock(void) | |||
1152 | * rq->lock. | 368 | * rq->lock. |
1153 | */ | 369 | */ |
1154 | 370 | ||
1155 | /* | ||
1156 | * Use hrtick when: | ||
1157 | * - enabled by features | ||
1158 | * - hrtimer is actually high res | ||
1159 | */ | ||
1160 | static inline int hrtick_enabled(struct rq *rq) | ||
1161 | { | ||
1162 | if (!sched_feat(HRTICK)) | ||
1163 | return 0; | ||
1164 | if (!cpu_active(cpu_of(rq))) | ||
1165 | return 0; | ||
1166 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
1167 | } | ||
1168 | |||
1169 | static void hrtick_clear(struct rq *rq) | 371 | static void hrtick_clear(struct rq *rq) |
1170 | { | 372 | { |
1171 | if (hrtimer_active(&rq->hrtick_timer)) | 373 | if (hrtimer_active(&rq->hrtick_timer)) |
@@ -1209,7 +411,7 @@ static void __hrtick_start(void *arg) | |||
1209 | * | 411 | * |
1210 | * called with rq->lock held and irqs disabled | 412 | * called with rq->lock held and irqs disabled |
1211 | */ | 413 | */ |
1212 | static void hrtick_start(struct rq *rq, u64 delay) | 414 | void hrtick_start(struct rq *rq, u64 delay) |
1213 | { | 415 | { |
1214 | struct hrtimer *timer = &rq->hrtick_timer; | 416 | struct hrtimer *timer = &rq->hrtick_timer; |
1215 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 417 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
@@ -1253,7 +455,7 @@ static __init void init_hrtick(void) | |||
1253 | * | 455 | * |
1254 | * called with rq->lock held and irqs disabled | 456 | * called with rq->lock held and irqs disabled |
1255 | */ | 457 | */ |
1256 | static void hrtick_start(struct rq *rq, u64 delay) | 458 | void hrtick_start(struct rq *rq, u64 delay) |
1257 | { | 459 | { |
1258 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 460 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
1259 | HRTIMER_MODE_REL_PINNED, 0); | 461 | HRTIMER_MODE_REL_PINNED, 0); |
@@ -1304,7 +506,7 @@ static inline void init_hrtick(void) | |||
1304 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 506 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
1305 | #endif | 507 | #endif |
1306 | 508 | ||
1307 | static void resched_task(struct task_struct *p) | 509 | void resched_task(struct task_struct *p) |
1308 | { | 510 | { |
1309 | int cpu; | 511 | int cpu; |
1310 | 512 | ||
@@ -1325,7 +527,7 @@ static void resched_task(struct task_struct *p) | |||
1325 | smp_send_reschedule(cpu); | 527 | smp_send_reschedule(cpu); |
1326 | } | 528 | } |
1327 | 529 | ||
1328 | static void resched_cpu(int cpu) | 530 | void resched_cpu(int cpu) |
1329 | { | 531 | { |
1330 | struct rq *rq = cpu_rq(cpu); | 532 | struct rq *rq = cpu_rq(cpu); |
1331 | unsigned long flags; | 533 | unsigned long flags; |
@@ -1406,7 +608,8 @@ void wake_up_idle_cpu(int cpu) | |||
1406 | 608 | ||
1407 | static inline bool got_nohz_idle_kick(void) | 609 | static inline bool got_nohz_idle_kick(void) |
1408 | { | 610 | { |
1409 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | 611 | int cpu = smp_processor_id(); |
612 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | ||
1410 | } | 613 | } |
1411 | 614 | ||
1412 | #else /* CONFIG_NO_HZ */ | 615 | #else /* CONFIG_NO_HZ */ |
@@ -1418,12 +621,7 @@ static inline bool got_nohz_idle_kick(void) | |||
1418 | 621 | ||
1419 | #endif /* CONFIG_NO_HZ */ | 622 | #endif /* CONFIG_NO_HZ */ |
1420 | 623 | ||
1421 | static u64 sched_avg_period(void) | 624 | void sched_avg_update(struct rq *rq) |
1422 | { | ||
1423 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1424 | } | ||
1425 | |||
1426 | static void sched_avg_update(struct rq *rq) | ||
1427 | { | 625 | { |
1428 | s64 period = sched_avg_period(); | 626 | s64 period = sched_avg_period(); |
1429 | 627 | ||
@@ -1439,193 +637,23 @@ static void sched_avg_update(struct rq *rq) | |||
1439 | } | 637 | } |
1440 | } | 638 | } |
1441 | 639 | ||
1442 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1443 | { | ||
1444 | rq->rt_avg += rt_delta; | ||
1445 | sched_avg_update(rq); | ||
1446 | } | ||
1447 | |||
1448 | #else /* !CONFIG_SMP */ | 640 | #else /* !CONFIG_SMP */ |
1449 | static void resched_task(struct task_struct *p) | 641 | void resched_task(struct task_struct *p) |
1450 | { | 642 | { |
1451 | assert_raw_spin_locked(&task_rq(p)->lock); | 643 | assert_raw_spin_locked(&task_rq(p)->lock); |
1452 | set_tsk_need_resched(p); | 644 | set_tsk_need_resched(p); |
1453 | } | 645 | } |
1454 | |||
1455 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1456 | { | ||
1457 | } | ||
1458 | |||
1459 | static void sched_avg_update(struct rq *rq) | ||
1460 | { | ||
1461 | } | ||
1462 | #endif /* CONFIG_SMP */ | 646 | #endif /* CONFIG_SMP */ |
1463 | 647 | ||
1464 | #if BITS_PER_LONG == 32 | ||
1465 | # define WMULT_CONST (~0UL) | ||
1466 | #else | ||
1467 | # define WMULT_CONST (1UL << 32) | ||
1468 | #endif | ||
1469 | |||
1470 | #define WMULT_SHIFT 32 | ||
1471 | |||
1472 | /* | ||
1473 | * Shift right and round: | ||
1474 | */ | ||
1475 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
1476 | |||
1477 | /* | ||
1478 | * delta *= weight / lw | ||
1479 | */ | ||
1480 | static unsigned long | ||
1481 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
1482 | struct load_weight *lw) | ||
1483 | { | ||
1484 | u64 tmp; | ||
1485 | |||
1486 | /* | ||
1487 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1488 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1489 | * 2^SCHED_LOAD_RESOLUTION. | ||
1490 | */ | ||
1491 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1492 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1493 | else | ||
1494 | tmp = (u64)delta_exec; | ||
1495 | |||
1496 | if (!lw->inv_weight) { | ||
1497 | unsigned long w = scale_load_down(lw->weight); | ||
1498 | |||
1499 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1500 | lw->inv_weight = 1; | ||
1501 | else if (unlikely(!w)) | ||
1502 | lw->inv_weight = WMULT_CONST; | ||
1503 | else | ||
1504 | lw->inv_weight = WMULT_CONST / w; | ||
1505 | } | ||
1506 | |||
1507 | /* | ||
1508 | * Check whether we'd overflow the 64-bit multiplication: | ||
1509 | */ | ||
1510 | if (unlikely(tmp > WMULT_CONST)) | ||
1511 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
1512 | WMULT_SHIFT/2); | ||
1513 | else | ||
1514 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
1515 | |||
1516 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
1517 | } | ||
1518 | |||
1519 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
1520 | { | ||
1521 | lw->weight += inc; | ||
1522 | lw->inv_weight = 0; | ||
1523 | } | ||
1524 | |||
1525 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
1526 | { | ||
1527 | lw->weight -= dec; | ||
1528 | lw->inv_weight = 0; | ||
1529 | } | ||
1530 | |||
1531 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1532 | { | ||
1533 | lw->weight = w; | ||
1534 | lw->inv_weight = 0; | ||
1535 | } | ||
1536 | |||
1537 | /* | ||
1538 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
1539 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
1540 | * each task makes to its run queue's load is weighted according to its | ||
1541 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
1542 | * scaled version of the new time slice allocation that they receive on time | ||
1543 | * slice expiry etc. | ||
1544 | */ | ||
1545 | |||
1546 | #define WEIGHT_IDLEPRIO 3 | ||
1547 | #define WMULT_IDLEPRIO 1431655765 | ||
1548 | |||
1549 | /* | ||
1550 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
1551 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
1552 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
1553 | * that remained on nice 0. | ||
1554 | * | ||
1555 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
1556 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
1557 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
1558 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
1559 | * the relative distance between them is ~25%.) | ||
1560 | */ | ||
1561 | static const int prio_to_weight[40] = { | ||
1562 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
1563 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
1564 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
1565 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
1566 | /* 0 */ 1024, 820, 655, 526, 423, | ||
1567 | /* 5 */ 335, 272, 215, 172, 137, | ||
1568 | /* 10 */ 110, 87, 70, 56, 45, | ||
1569 | /* 15 */ 36, 29, 23, 18, 15, | ||
1570 | }; | ||
1571 | |||
1572 | /* | ||
1573 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
1574 | * | ||
1575 | * In cases where the weight does not change often, we can use the | ||
1576 | * precalculated inverse to speed up arithmetics by turning divisions | ||
1577 | * into multiplications: | ||
1578 | */ | ||
1579 | static const u32 prio_to_wmult[40] = { | ||
1580 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
1581 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
1582 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
1583 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
1584 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
1585 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
1586 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
1587 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
1588 | }; | ||
1589 | |||
1590 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
1591 | enum cpuacct_stat_index { | ||
1592 | CPUACCT_STAT_USER, /* ... user mode */ | ||
1593 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
1594 | |||
1595 | CPUACCT_STAT_NSTATS, | ||
1596 | }; | ||
1597 | |||
1598 | #ifdef CONFIG_CGROUP_CPUACCT | ||
1599 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
1600 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
1601 | enum cpuacct_stat_index idx, cputime_t val); | ||
1602 | #else | ||
1603 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
1604 | static inline void cpuacct_update_stats(struct task_struct *tsk, | ||
1605 | enum cpuacct_stat_index idx, cputime_t val) {} | ||
1606 | #endif | ||
1607 | |||
1608 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1609 | { | ||
1610 | update_load_add(&rq->load, load); | ||
1611 | } | ||
1612 | |||
1613 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1614 | { | ||
1615 | update_load_sub(&rq->load, load); | ||
1616 | } | ||
1617 | |||
1618 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 648 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1619 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | 649 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) |
1620 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
1621 | |||
1622 | /* | 650 | /* |
1623 | * Iterate task_group tree rooted at *from, calling @down when first entering a | 651 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1624 | * node and @up when leaving it for the final time. | 652 | * node and @up when leaving it for the final time. |
1625 | * | 653 | * |
1626 | * Caller must hold rcu_lock or sufficient equivalent. | 654 | * Caller must hold rcu_lock or sufficient equivalent. |
1627 | */ | 655 | */ |
1628 | static int walk_tg_tree_from(struct task_group *from, | 656 | int walk_tg_tree_from(struct task_group *from, |
1629 | tg_visitor down, tg_visitor up, void *data) | 657 | tg_visitor down, tg_visitor up, void *data) |
1630 | { | 658 | { |
1631 | struct task_group *parent, *child; | 659 | struct task_group *parent, *child; |
@@ -1656,270 +684,13 @@ out: | |||
1656 | return ret; | 684 | return ret; |
1657 | } | 685 | } |
1658 | 686 | ||
1659 | /* | 687 | int tg_nop(struct task_group *tg, void *data) |
1660 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1661 | * leaving it for the final time. | ||
1662 | * | ||
1663 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1664 | */ | ||
1665 | |||
1666 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1667 | { | ||
1668 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
1669 | } | ||
1670 | |||
1671 | static int tg_nop(struct task_group *tg, void *data) | ||
1672 | { | 688 | { |
1673 | return 0; | 689 | return 0; |
1674 | } | 690 | } |
1675 | #endif | 691 | #endif |
1676 | 692 | ||
1677 | #ifdef CONFIG_SMP | 693 | void update_cpu_load(struct rq *this_rq); |
1678 | /* Used instead of source_load when we know the type == 0 */ | ||
1679 | static unsigned long weighted_cpuload(const int cpu) | ||
1680 | { | ||
1681 | return cpu_rq(cpu)->load.weight; | ||
1682 | } | ||
1683 | |||
1684 | /* | ||
1685 | * Return a low guess at the load of a migration-source cpu weighted | ||
1686 | * according to the scheduling class and "nice" value. | ||
1687 | * | ||
1688 | * We want to under-estimate the load of migration sources, to | ||
1689 | * balance conservatively. | ||
1690 | */ | ||
1691 | static unsigned long source_load(int cpu, int type) | ||
1692 | { | ||
1693 | struct rq *rq = cpu_rq(cpu); | ||
1694 | unsigned long total = weighted_cpuload(cpu); | ||
1695 | |||
1696 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1697 | return total; | ||
1698 | |||
1699 | return min(rq->cpu_load[type-1], total); | ||
1700 | } | ||
1701 | |||
1702 | /* | ||
1703 | * Return a high guess at the load of a migration-target cpu weighted | ||
1704 | * according to the scheduling class and "nice" value. | ||
1705 | */ | ||
1706 | static unsigned long target_load(int cpu, int type) | ||
1707 | { | ||
1708 | struct rq *rq = cpu_rq(cpu); | ||
1709 | unsigned long total = weighted_cpuload(cpu); | ||
1710 | |||
1711 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1712 | return total; | ||
1713 | |||
1714 | return max(rq->cpu_load[type-1], total); | ||
1715 | } | ||
1716 | |||
1717 | static unsigned long power_of(int cpu) | ||
1718 | { | ||
1719 | return cpu_rq(cpu)->cpu_power; | ||
1720 | } | ||
1721 | |||
1722 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1723 | |||
1724 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1725 | { | ||
1726 | struct rq *rq = cpu_rq(cpu); | ||
1727 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
1728 | |||
1729 | if (nr_running) | ||
1730 | return rq->load.weight / nr_running; | ||
1731 | |||
1732 | return 0; | ||
1733 | } | ||
1734 | |||
1735 | #ifdef CONFIG_PREEMPT | ||
1736 | |||
1737 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
1738 | |||
1739 | /* | ||
1740 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | ||
1741 | * way at the expense of forcing extra atomic operations in all | ||
1742 | * invocations. This assures that the double_lock is acquired using the | ||
1743 | * same underlying policy as the spinlock_t on this architecture, which | ||
1744 | * reduces latency compared to the unfair variant below. However, it | ||
1745 | * also adds more overhead and therefore may reduce throughput. | ||
1746 | */ | ||
1747 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1748 | __releases(this_rq->lock) | ||
1749 | __acquires(busiest->lock) | ||
1750 | __acquires(this_rq->lock) | ||
1751 | { | ||
1752 | raw_spin_unlock(&this_rq->lock); | ||
1753 | double_rq_lock(this_rq, busiest); | ||
1754 | |||
1755 | return 1; | ||
1756 | } | ||
1757 | |||
1758 | #else | ||
1759 | /* | ||
1760 | * Unfair double_lock_balance: Optimizes throughput at the expense of | ||
1761 | * latency by eliminating extra atomic operations when the locks are | ||
1762 | * already in proper order on entry. This favors lower cpu-ids and will | ||
1763 | * grant the double lock to lower cpus over higher ids under contention, | ||
1764 | * regardless of entry order into the function. | ||
1765 | */ | ||
1766 | static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1767 | __releases(this_rq->lock) | ||
1768 | __acquires(busiest->lock) | ||
1769 | __acquires(this_rq->lock) | ||
1770 | { | ||
1771 | int ret = 0; | ||
1772 | |||
1773 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | ||
1774 | if (busiest < this_rq) { | ||
1775 | raw_spin_unlock(&this_rq->lock); | ||
1776 | raw_spin_lock(&busiest->lock); | ||
1777 | raw_spin_lock_nested(&this_rq->lock, | ||
1778 | SINGLE_DEPTH_NESTING); | ||
1779 | ret = 1; | ||
1780 | } else | ||
1781 | raw_spin_lock_nested(&busiest->lock, | ||
1782 | SINGLE_DEPTH_NESTING); | ||
1783 | } | ||
1784 | return ret; | ||
1785 | } | ||
1786 | |||
1787 | #endif /* CONFIG_PREEMPT */ | ||
1788 | |||
1789 | /* | ||
1790 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1791 | */ | ||
1792 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1793 | { | ||
1794 | if (unlikely(!irqs_disabled())) { | ||
1795 | /* printk() doesn't work good under rq->lock */ | ||
1796 | raw_spin_unlock(&this_rq->lock); | ||
1797 | BUG_ON(1); | ||
1798 | } | ||
1799 | |||
1800 | return _double_lock_balance(this_rq, busiest); | ||
1801 | } | ||
1802 | |||
1803 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
1804 | __releases(busiest->lock) | ||
1805 | { | ||
1806 | raw_spin_unlock(&busiest->lock); | ||
1807 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
1808 | } | ||
1809 | |||
1810 | /* | ||
1811 | * double_rq_lock - safely lock two runqueues | ||
1812 | * | ||
1813 | * Note this does not disable interrupts like task_rq_lock, | ||
1814 | * you need to do so manually before calling. | ||
1815 | */ | ||
1816 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1817 | __acquires(rq1->lock) | ||
1818 | __acquires(rq2->lock) | ||
1819 | { | ||
1820 | BUG_ON(!irqs_disabled()); | ||
1821 | if (rq1 == rq2) { | ||
1822 | raw_spin_lock(&rq1->lock); | ||
1823 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1824 | } else { | ||
1825 | if (rq1 < rq2) { | ||
1826 | raw_spin_lock(&rq1->lock); | ||
1827 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1828 | } else { | ||
1829 | raw_spin_lock(&rq2->lock); | ||
1830 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1831 | } | ||
1832 | } | ||
1833 | } | ||
1834 | |||
1835 | /* | ||
1836 | * double_rq_unlock - safely unlock two runqueues | ||
1837 | * | ||
1838 | * Note this does not restore interrupts like task_rq_unlock, | ||
1839 | * you need to do so manually after calling. | ||
1840 | */ | ||
1841 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1842 | __releases(rq1->lock) | ||
1843 | __releases(rq2->lock) | ||
1844 | { | ||
1845 | raw_spin_unlock(&rq1->lock); | ||
1846 | if (rq1 != rq2) | ||
1847 | raw_spin_unlock(&rq2->lock); | ||
1848 | else | ||
1849 | __release(rq2->lock); | ||
1850 | } | ||
1851 | |||
1852 | #else /* CONFIG_SMP */ | ||
1853 | |||
1854 | /* | ||
1855 | * double_rq_lock - safely lock two runqueues | ||
1856 | * | ||
1857 | * Note this does not disable interrupts like task_rq_lock, | ||
1858 | * you need to do so manually before calling. | ||
1859 | */ | ||
1860 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1861 | __acquires(rq1->lock) | ||
1862 | __acquires(rq2->lock) | ||
1863 | { | ||
1864 | BUG_ON(!irqs_disabled()); | ||
1865 | BUG_ON(rq1 != rq2); | ||
1866 | raw_spin_lock(&rq1->lock); | ||
1867 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1868 | } | ||
1869 | |||
1870 | /* | ||
1871 | * double_rq_unlock - safely unlock two runqueues | ||
1872 | * | ||
1873 | * Note this does not restore interrupts like task_rq_unlock, | ||
1874 | * you need to do so manually after calling. | ||
1875 | */ | ||
1876 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1877 | __releases(rq1->lock) | ||
1878 | __releases(rq2->lock) | ||
1879 | { | ||
1880 | BUG_ON(rq1 != rq2); | ||
1881 | raw_spin_unlock(&rq1->lock); | ||
1882 | __release(rq2->lock); | ||
1883 | } | ||
1884 | |||
1885 | #endif | ||
1886 | |||
1887 | static void calc_load_account_idle(struct rq *this_rq); | ||
1888 | static void update_sysctl(void); | ||
1889 | static int get_update_sysctl_factor(void); | ||
1890 | static void update_cpu_load(struct rq *this_rq); | ||
1891 | |||
1892 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
1893 | { | ||
1894 | set_task_rq(p, cpu); | ||
1895 | #ifdef CONFIG_SMP | ||
1896 | /* | ||
1897 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
1898 | * successfully executed on another CPU. We must ensure that updates of | ||
1899 | * per-task data have been completed by this moment. | ||
1900 | */ | ||
1901 | smp_wmb(); | ||
1902 | task_thread_info(p)->cpu = cpu; | ||
1903 | #endif | ||
1904 | } | ||
1905 | |||
1906 | static const struct sched_class rt_sched_class; | ||
1907 | |||
1908 | #define sched_class_highest (&stop_sched_class) | ||
1909 | #define for_each_class(class) \ | ||
1910 | for (class = sched_class_highest; class; class = class->next) | ||
1911 | |||
1912 | #include "sched_stats.h" | ||
1913 | |||
1914 | static void inc_nr_running(struct rq *rq) | ||
1915 | { | ||
1916 | rq->nr_running++; | ||
1917 | } | ||
1918 | |||
1919 | static void dec_nr_running(struct rq *rq) | ||
1920 | { | ||
1921 | rq->nr_running--; | ||
1922 | } | ||
1923 | 694 | ||
1924 | static void set_load_weight(struct task_struct *p) | 695 | static void set_load_weight(struct task_struct *p) |
1925 | { | 696 | { |
@@ -1953,10 +724,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1953 | p->sched_class->dequeue_task(rq, p, flags); | 724 | p->sched_class->dequeue_task(rq, p, flags); |
1954 | } | 725 | } |
1955 | 726 | ||
1956 | /* | 727 | void activate_task(struct rq *rq, struct task_struct *p, int flags) |
1957 | * activate_task - move a task to the runqueue. | ||
1958 | */ | ||
1959 | static void activate_task(struct rq *rq, struct task_struct *p, int flags) | ||
1960 | { | 728 | { |
1961 | if (task_contributes_to_load(p)) | 729 | if (task_contributes_to_load(p)) |
1962 | rq->nr_uninterruptible--; | 730 | rq->nr_uninterruptible--; |
@@ -1964,10 +732,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1964 | enqueue_task(rq, p, flags); | 732 | enqueue_task(rq, p, flags); |
1965 | } | 733 | } |
1966 | 734 | ||
1967 | /* | 735 | void deactivate_task(struct rq *rq, struct task_struct *p, int flags) |
1968 | * deactivate_task - remove a task from the runqueue. | ||
1969 | */ | ||
1970 | static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | ||
1971 | { | 736 | { |
1972 | if (task_contributes_to_load(p)) | 737 | if (task_contributes_to_load(p)) |
1973 | rq->nr_uninterruptible++; | 738 | rq->nr_uninterruptible++; |
@@ -2158,14 +923,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
2158 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 923 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
2159 | static int irqtime_account_hi_update(void) | 924 | static int irqtime_account_hi_update(void) |
2160 | { | 925 | { |
2161 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 926 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
2162 | unsigned long flags; | 927 | unsigned long flags; |
2163 | u64 latest_ns; | 928 | u64 latest_ns; |
2164 | int ret = 0; | 929 | int ret = 0; |
2165 | 930 | ||
2166 | local_irq_save(flags); | 931 | local_irq_save(flags); |
2167 | latest_ns = this_cpu_read(cpu_hardirq_time); | 932 | latest_ns = this_cpu_read(cpu_hardirq_time); |
2168 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | 933 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) |
2169 | ret = 1; | 934 | ret = 1; |
2170 | local_irq_restore(flags); | 935 | local_irq_restore(flags); |
2171 | return ret; | 936 | return ret; |
@@ -2173,14 +938,14 @@ static int irqtime_account_hi_update(void) | |||
2173 | 938 | ||
2174 | static int irqtime_account_si_update(void) | 939 | static int irqtime_account_si_update(void) |
2175 | { | 940 | { |
2176 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 941 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
2177 | unsigned long flags; | 942 | unsigned long flags; |
2178 | u64 latest_ns; | 943 | u64 latest_ns; |
2179 | int ret = 0; | 944 | int ret = 0; |
2180 | 945 | ||
2181 | local_irq_save(flags); | 946 | local_irq_save(flags); |
2182 | latest_ns = this_cpu_read(cpu_softirq_time); | 947 | latest_ns = this_cpu_read(cpu_softirq_time); |
2183 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | 948 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) |
2184 | ret = 1; | 949 | ret = 1; |
2185 | local_irq_restore(flags); | 950 | local_irq_restore(flags); |
2186 | return ret; | 951 | return ret; |
@@ -2192,15 +957,6 @@ static int irqtime_account_si_update(void) | |||
2192 | 957 | ||
2193 | #endif | 958 | #endif |
2194 | 959 | ||
2195 | #include "sched_idletask.c" | ||
2196 | #include "sched_fair.c" | ||
2197 | #include "sched_rt.c" | ||
2198 | #include "sched_autogroup.c" | ||
2199 | #include "sched_stoptask.c" | ||
2200 | #ifdef CONFIG_SCHED_DEBUG | ||
2201 | # include "sched_debug.c" | ||
2202 | #endif | ||
2203 | |||
2204 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 960 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
2205 | { | 961 | { |
2206 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 962 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -2298,7 +1054,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
2298 | p->sched_class->prio_changed(rq, p, oldprio); | 1054 | p->sched_class->prio_changed(rq, p, oldprio); |
2299 | } | 1055 | } |
2300 | 1056 | ||
2301 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 1057 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
2302 | { | 1058 | { |
2303 | const struct sched_class *class; | 1059 | const struct sched_class *class; |
2304 | 1060 | ||
@@ -2324,38 +1080,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2324 | } | 1080 | } |
2325 | 1081 | ||
2326 | #ifdef CONFIG_SMP | 1082 | #ifdef CONFIG_SMP |
2327 | /* | ||
2328 | * Is this task likely cache-hot: | ||
2329 | */ | ||
2330 | static int | ||
2331 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
2332 | { | ||
2333 | s64 delta; | ||
2334 | |||
2335 | if (p->sched_class != &fair_sched_class) | ||
2336 | return 0; | ||
2337 | |||
2338 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2339 | return 0; | ||
2340 | |||
2341 | /* | ||
2342 | * Buddy candidates are cache hot: | ||
2343 | */ | ||
2344 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
2345 | (&p->se == cfs_rq_of(&p->se)->next || | ||
2346 | &p->se == cfs_rq_of(&p->se)->last)) | ||
2347 | return 1; | ||
2348 | |||
2349 | if (sysctl_sched_migration_cost == -1) | ||
2350 | return 1; | ||
2351 | if (sysctl_sched_migration_cost == 0) | ||
2352 | return 0; | ||
2353 | |||
2354 | delta = now - p->se.exec_start; | ||
2355 | |||
2356 | return delta < (s64)sysctl_sched_migration_cost; | ||
2357 | } | ||
2358 | |||
2359 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1083 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
2360 | { | 1084 | { |
2361 | #ifdef CONFIG_SCHED_DEBUG | 1085 | #ifdef CONFIG_SCHED_DEBUG |
@@ -2782,6 +1506,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | |||
2782 | 1506 | ||
2783 | } | 1507 | } |
2784 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1508 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
1509 | |||
1510 | static inline int ttwu_share_cache(int this_cpu, int that_cpu) | ||
1511 | { | ||
1512 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | ||
1513 | } | ||
2785 | #endif /* CONFIG_SMP */ | 1514 | #endif /* CONFIG_SMP */ |
2786 | 1515 | ||
2787 | static void ttwu_queue(struct task_struct *p, int cpu) | 1516 | static void ttwu_queue(struct task_struct *p, int cpu) |
@@ -2789,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
2789 | struct rq *rq = cpu_rq(cpu); | 1518 | struct rq *rq = cpu_rq(cpu); |
2790 | 1519 | ||
2791 | #if defined(CONFIG_SMP) | 1520 | #if defined(CONFIG_SMP) |
2792 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | 1521 | if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { |
2793 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1522 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ |
2794 | ttwu_queue_remote(p, cpu); | 1523 | ttwu_queue_remote(p, cpu); |
2795 | return; | 1524 | return; |
@@ -3438,7 +2167,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
3438 | */ | 2167 | */ |
3439 | static atomic_long_t calc_load_tasks_idle; | 2168 | static atomic_long_t calc_load_tasks_idle; |
3440 | 2169 | ||
3441 | static void calc_load_account_idle(struct rq *this_rq) | 2170 | void calc_load_account_idle(struct rq *this_rq) |
3442 | { | 2171 | { |
3443 | long delta; | 2172 | long delta; |
3444 | 2173 | ||
@@ -3582,7 +2311,7 @@ static void calc_global_nohz(unsigned long ticks) | |||
3582 | */ | 2311 | */ |
3583 | } | 2312 | } |
3584 | #else | 2313 | #else |
3585 | static void calc_load_account_idle(struct rq *this_rq) | 2314 | void calc_load_account_idle(struct rq *this_rq) |
3586 | { | 2315 | { |
3587 | } | 2316 | } |
3588 | 2317 | ||
@@ -3725,7 +2454,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
3725 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2454 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
3726 | * every tick. We fix it up based on jiffies. | 2455 | * every tick. We fix it up based on jiffies. |
3727 | */ | 2456 | */ |
3728 | static void update_cpu_load(struct rq *this_rq) | 2457 | void update_cpu_load(struct rq *this_rq) |
3729 | { | 2458 | { |
3730 | unsigned long this_load = this_rq->load.weight; | 2459 | unsigned long this_load = this_rq->load.weight; |
3731 | unsigned long curr_jiffies = jiffies; | 2460 | unsigned long curr_jiffies = jiffies; |
@@ -3803,8 +2532,10 @@ unlock: | |||
3803 | #endif | 2532 | #endif |
3804 | 2533 | ||
3805 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 2534 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
2535 | DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); | ||
3806 | 2536 | ||
3807 | EXPORT_PER_CPU_SYMBOL(kstat); | 2537 | EXPORT_PER_CPU_SYMBOL(kstat); |
2538 | EXPORT_PER_CPU_SYMBOL(kernel_cpustat); | ||
3808 | 2539 | ||
3809 | /* | 2540 | /* |
3810 | * Return any ns on the sched_clock that have not yet been accounted in | 2541 | * Return any ns on the sched_clock that have not yet been accounted in |
@@ -3857,6 +2588,42 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3857 | return ns; | 2588 | return ns; |
3858 | } | 2589 | } |
3859 | 2590 | ||
2591 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2592 | struct cgroup_subsys cpuacct_subsys; | ||
2593 | struct cpuacct root_cpuacct; | ||
2594 | #endif | ||
2595 | |||
2596 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
2597 | u64 tmp) | ||
2598 | { | ||
2599 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2600 | struct kernel_cpustat *kcpustat; | ||
2601 | struct cpuacct *ca; | ||
2602 | #endif | ||
2603 | /* | ||
2604 | * Since all updates are sure to touch the root cgroup, we | ||
2605 | * get ourselves ahead and touch it first. If the root cgroup | ||
2606 | * is the only cgroup, then nothing else should be necessary. | ||
2607 | * | ||
2608 | */ | ||
2609 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
2610 | |||
2611 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2612 | if (unlikely(!cpuacct_subsys.active)) | ||
2613 | return; | ||
2614 | |||
2615 | rcu_read_lock(); | ||
2616 | ca = task_ca(p); | ||
2617 | while (ca && (ca != &root_cpuacct)) { | ||
2618 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
2619 | kcpustat->cpustat[index] += tmp; | ||
2620 | ca = parent_ca(ca); | ||
2621 | } | ||
2622 | rcu_read_unlock(); | ||
2623 | #endif | ||
2624 | } | ||
2625 | |||
2626 | |||
3860 | /* | 2627 | /* |
3861 | * Account user cpu time to a process. | 2628 | * Account user cpu time to a process. |
3862 | * @p: the process that the cpu time gets accounted to | 2629 | * @p: the process that the cpu time gets accounted to |
@@ -3866,22 +2633,18 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3866 | void account_user_time(struct task_struct *p, cputime_t cputime, | 2633 | void account_user_time(struct task_struct *p, cputime_t cputime, |
3867 | cputime_t cputime_scaled) | 2634 | cputime_t cputime_scaled) |
3868 | { | 2635 | { |
3869 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2636 | int index; |
3870 | cputime64_t tmp; | ||
3871 | 2637 | ||
3872 | /* Add user time to process. */ | 2638 | /* Add user time to process. */ |
3873 | p->utime = cputime_add(p->utime, cputime); | 2639 | p->utime += cputime; |
3874 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 2640 | p->utimescaled += cputime_scaled; |
3875 | account_group_user_time(p, cputime); | 2641 | account_group_user_time(p, cputime); |
3876 | 2642 | ||
2643 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
2644 | |||
3877 | /* Add user time to cpustat. */ | 2645 | /* Add user time to cpustat. */ |
3878 | tmp = cputime_to_cputime64(cputime); | 2646 | task_group_account_field(p, index, (__force u64) cputime); |
3879 | if (TASK_NICE(p) > 0) | ||
3880 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | ||
3881 | else | ||
3882 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
3883 | 2647 | ||
3884 | cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); | ||
3885 | /* Account for user time used */ | 2648 | /* Account for user time used */ |
3886 | acct_update_integrals(p); | 2649 | acct_update_integrals(p); |
3887 | } | 2650 | } |
@@ -3895,24 +2658,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime, | |||
3895 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | 2658 | static void account_guest_time(struct task_struct *p, cputime_t cputime, |
3896 | cputime_t cputime_scaled) | 2659 | cputime_t cputime_scaled) |
3897 | { | 2660 | { |
3898 | cputime64_t tmp; | 2661 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3899 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3900 | |||
3901 | tmp = cputime_to_cputime64(cputime); | ||
3902 | 2662 | ||
3903 | /* Add guest time to process. */ | 2663 | /* Add guest time to process. */ |
3904 | p->utime = cputime_add(p->utime, cputime); | 2664 | p->utime += cputime; |
3905 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 2665 | p->utimescaled += cputime_scaled; |
3906 | account_group_user_time(p, cputime); | 2666 | account_group_user_time(p, cputime); |
3907 | p->gtime = cputime_add(p->gtime, cputime); | 2667 | p->gtime += cputime; |
3908 | 2668 | ||
3909 | /* Add guest time to cpustat. */ | 2669 | /* Add guest time to cpustat. */ |
3910 | if (TASK_NICE(p) > 0) { | 2670 | if (TASK_NICE(p) > 0) { |
3911 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 2671 | cpustat[CPUTIME_NICE] += (__force u64) cputime; |
3912 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); | 2672 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; |
3913 | } else { | 2673 | } else { |
3914 | cpustat->user = cputime64_add(cpustat->user, tmp); | 2674 | cpustat[CPUTIME_USER] += (__force u64) cputime; |
3915 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 2675 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; |
3916 | } | 2676 | } |
3917 | } | 2677 | } |
3918 | 2678 | ||
@@ -3925,18 +2685,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3925 | */ | 2685 | */ |
3926 | static inline | 2686 | static inline |
3927 | void __account_system_time(struct task_struct *p, cputime_t cputime, | 2687 | void __account_system_time(struct task_struct *p, cputime_t cputime, |
3928 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | 2688 | cputime_t cputime_scaled, int index) |
3929 | { | 2689 | { |
3930 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3931 | |||
3932 | /* Add system time to process. */ | 2690 | /* Add system time to process. */ |
3933 | p->stime = cputime_add(p->stime, cputime); | 2691 | p->stime += cputime; |
3934 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | 2692 | p->stimescaled += cputime_scaled; |
3935 | account_group_system_time(p, cputime); | 2693 | account_group_system_time(p, cputime); |
3936 | 2694 | ||
3937 | /* Add system time to cpustat. */ | 2695 | /* Add system time to cpustat. */ |
3938 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | 2696 | task_group_account_field(p, index, (__force u64) cputime); |
3939 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3940 | 2697 | ||
3941 | /* Account for system time used */ | 2698 | /* Account for system time used */ |
3942 | acct_update_integrals(p); | 2699 | acct_update_integrals(p); |
@@ -3952,8 +2709,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, | |||
3952 | void account_system_time(struct task_struct *p, int hardirq_offset, | 2709 | void account_system_time(struct task_struct *p, int hardirq_offset, |
3953 | cputime_t cputime, cputime_t cputime_scaled) | 2710 | cputime_t cputime, cputime_t cputime_scaled) |
3954 | { | 2711 | { |
3955 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2712 | int index; |
3956 | cputime64_t *target_cputime64; | ||
3957 | 2713 | ||
3958 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 2714 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3959 | account_guest_time(p, cputime, cputime_scaled); | 2715 | account_guest_time(p, cputime, cputime_scaled); |
@@ -3961,13 +2717,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3961 | } | 2717 | } |
3962 | 2718 | ||
3963 | if (hardirq_count() - hardirq_offset) | 2719 | if (hardirq_count() - hardirq_offset) |
3964 | target_cputime64 = &cpustat->irq; | 2720 | index = CPUTIME_IRQ; |
3965 | else if (in_serving_softirq()) | 2721 | else if (in_serving_softirq()) |
3966 | target_cputime64 = &cpustat->softirq; | 2722 | index = CPUTIME_SOFTIRQ; |
3967 | else | 2723 | else |
3968 | target_cputime64 = &cpustat->system; | 2724 | index = CPUTIME_SYSTEM; |
3969 | 2725 | ||
3970 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); | 2726 | __account_system_time(p, cputime, cputime_scaled, index); |
3971 | } | 2727 | } |
3972 | 2728 | ||
3973 | /* | 2729 | /* |
@@ -3976,10 +2732,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3976 | */ | 2732 | */ |
3977 | void account_steal_time(cputime_t cputime) | 2733 | void account_steal_time(cputime_t cputime) |
3978 | { | 2734 | { |
3979 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2735 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3980 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | ||
3981 | 2736 | ||
3982 | cpustat->steal = cputime64_add(cpustat->steal, cputime64); | 2737 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; |
3983 | } | 2738 | } |
3984 | 2739 | ||
3985 | /* | 2740 | /* |
@@ -3988,14 +2743,13 @@ void account_steal_time(cputime_t cputime) | |||
3988 | */ | 2743 | */ |
3989 | void account_idle_time(cputime_t cputime) | 2744 | void account_idle_time(cputime_t cputime) |
3990 | { | 2745 | { |
3991 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2746 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3992 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | ||
3993 | struct rq *rq = this_rq(); | 2747 | struct rq *rq = this_rq(); |
3994 | 2748 | ||
3995 | if (atomic_read(&rq->nr_iowait) > 0) | 2749 | if (atomic_read(&rq->nr_iowait) > 0) |
3996 | cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); | 2750 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; |
3997 | else | 2751 | else |
3998 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); | 2752 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
3999 | } | 2753 | } |
4000 | 2754 | ||
4001 | static __always_inline bool steal_account_process_tick(void) | 2755 | static __always_inline bool steal_account_process_tick(void) |
@@ -4045,16 +2799,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
4045 | struct rq *rq) | 2799 | struct rq *rq) |
4046 | { | 2800 | { |
4047 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 2801 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
4048 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | 2802 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
4049 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
4050 | 2803 | ||
4051 | if (steal_account_process_tick()) | 2804 | if (steal_account_process_tick()) |
4052 | return; | 2805 | return; |
4053 | 2806 | ||
4054 | if (irqtime_account_hi_update()) { | 2807 | if (irqtime_account_hi_update()) { |
4055 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 2808 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; |
4056 | } else if (irqtime_account_si_update()) { | 2809 | } else if (irqtime_account_si_update()) { |
4057 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 2810 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; |
4058 | } else if (this_cpu_ksoftirqd() == p) { | 2811 | } else if (this_cpu_ksoftirqd() == p) { |
4059 | /* | 2812 | /* |
4060 | * ksoftirqd time do not get accounted in cpu_softirq_time. | 2813 | * ksoftirqd time do not get accounted in cpu_softirq_time. |
@@ -4062,7 +2815,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
4062 | * Also, p->stime needs to be updated for ksoftirqd. | 2815 | * Also, p->stime needs to be updated for ksoftirqd. |
4063 | */ | 2816 | */ |
4064 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2817 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
4065 | &cpustat->softirq); | 2818 | CPUTIME_SOFTIRQ); |
4066 | } else if (user_tick) { | 2819 | } else if (user_tick) { |
4067 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2820 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
4068 | } else if (p == rq->idle) { | 2821 | } else if (p == rq->idle) { |
@@ -4071,7 +2824,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
4071 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2824 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); |
4072 | } else { | 2825 | } else { |
4073 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2826 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
4074 | &cpustat->system); | 2827 | CPUTIME_SYSTEM); |
4075 | } | 2828 | } |
4076 | } | 2829 | } |
4077 | 2830 | ||
@@ -4170,7 +2923,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4170 | 2923 | ||
4171 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 2924 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
4172 | { | 2925 | { |
4173 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); | 2926 | cputime_t rtime, utime = p->utime, total = utime + p->stime; |
4174 | 2927 | ||
4175 | /* | 2928 | /* |
4176 | * Use CFS's precise accounting: | 2929 | * Use CFS's precise accounting: |
@@ -4178,11 +2931,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4178 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 2931 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
4179 | 2932 | ||
4180 | if (total) { | 2933 | if (total) { |
4181 | u64 temp = rtime; | 2934 | u64 temp = (__force u64) rtime; |
4182 | 2935 | ||
4183 | temp *= utime; | 2936 | temp *= (__force u64) utime; |
4184 | do_div(temp, total); | 2937 | do_div(temp, (__force u32) total); |
4185 | utime = (cputime_t)temp; | 2938 | utime = (__force cputime_t) temp; |
4186 | } else | 2939 | } else |
4187 | utime = rtime; | 2940 | utime = rtime; |
4188 | 2941 | ||
@@ -4190,7 +2943,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4190 | * Compare with previous values, to keep monotonicity: | 2943 | * Compare with previous values, to keep monotonicity: |
4191 | */ | 2944 | */ |
4192 | p->prev_utime = max(p->prev_utime, utime); | 2945 | p->prev_utime = max(p->prev_utime, utime); |
4193 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); | 2946 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); |
4194 | 2947 | ||
4195 | *ut = p->prev_utime; | 2948 | *ut = p->prev_utime; |
4196 | *st = p->prev_stime; | 2949 | *st = p->prev_stime; |
@@ -4207,21 +2960,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4207 | 2960 | ||
4208 | thread_group_cputime(p, &cputime); | 2961 | thread_group_cputime(p, &cputime); |
4209 | 2962 | ||
4210 | total = cputime_add(cputime.utime, cputime.stime); | 2963 | total = cputime.utime + cputime.stime; |
4211 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 2964 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
4212 | 2965 | ||
4213 | if (total) { | 2966 | if (total) { |
4214 | u64 temp = rtime; | 2967 | u64 temp = (__force u64) rtime; |
4215 | 2968 | ||
4216 | temp *= cputime.utime; | 2969 | temp *= (__force u64) cputime.utime; |
4217 | do_div(temp, total); | 2970 | do_div(temp, (__force u32) total); |
4218 | utime = (cputime_t)temp; | 2971 | utime = (__force cputime_t) temp; |
4219 | } else | 2972 | } else |
4220 | utime = rtime; | 2973 | utime = rtime; |
4221 | 2974 | ||
4222 | sig->prev_utime = max(sig->prev_utime, utime); | 2975 | sig->prev_utime = max(sig->prev_utime, utime); |
4223 | sig->prev_stime = max(sig->prev_stime, | 2976 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); |
4224 | cputime_sub(rtime, sig->prev_utime)); | ||
4225 | 2977 | ||
4226 | *ut = sig->prev_utime; | 2978 | *ut = sig->prev_utime; |
4227 | *st = sig->prev_stime; | 2979 | *st = sig->prev_stime; |
@@ -4320,6 +3072,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
4320 | { | 3072 | { |
4321 | struct pt_regs *regs = get_irq_regs(); | 3073 | struct pt_regs *regs = get_irq_regs(); |
4322 | 3074 | ||
3075 | if (oops_in_progress) | ||
3076 | return; | ||
3077 | |||
4323 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", | 3078 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", |
4324 | prev->comm, prev->pid, preempt_count()); | 3079 | prev->comm, prev->pid, preempt_count()); |
4325 | 3080 | ||
@@ -4810,6 +3565,9 @@ EXPORT_SYMBOL(wait_for_completion); | |||
4810 | * This waits for either a completion of a specific task to be signaled or for a | 3565 | * This waits for either a completion of a specific task to be signaled or for a |
4811 | * specified timeout to expire. The timeout is in jiffies. It is not | 3566 | * specified timeout to expire. The timeout is in jiffies. It is not |
4812 | * interruptible. | 3567 | * interruptible. |
3568 | * | ||
3569 | * The return value is 0 if timed out, and positive (at least 1, or number of | ||
3570 | * jiffies left till timeout) if completed. | ||
4813 | */ | 3571 | */ |
4814 | unsigned long __sched | 3572 | unsigned long __sched |
4815 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 3573 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
@@ -4824,6 +3582,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout); | |||
4824 | * | 3582 | * |
4825 | * This waits for completion of a specific task to be signaled. It is | 3583 | * This waits for completion of a specific task to be signaled. It is |
4826 | * interruptible. | 3584 | * interruptible. |
3585 | * | ||
3586 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | ||
4827 | */ | 3587 | */ |
4828 | int __sched wait_for_completion_interruptible(struct completion *x) | 3588 | int __sched wait_for_completion_interruptible(struct completion *x) |
4829 | { | 3589 | { |
@@ -4841,6 +3601,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4841 | * | 3601 | * |
4842 | * This waits for either a completion of a specific task to be signaled or for a | 3602 | * This waits for either a completion of a specific task to be signaled or for a |
4843 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 3603 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
3604 | * | ||
3605 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | ||
3606 | * positive (at least 1, or number of jiffies left till timeout) if completed. | ||
4844 | */ | 3607 | */ |
4845 | long __sched | 3608 | long __sched |
4846 | wait_for_completion_interruptible_timeout(struct completion *x, | 3609 | wait_for_completion_interruptible_timeout(struct completion *x, |
@@ -4856,6 +3619,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | |||
4856 | * | 3619 | * |
4857 | * This waits to be signaled for completion of a specific task. It can be | 3620 | * This waits to be signaled for completion of a specific task. It can be |
4858 | * interrupted by a kill signal. | 3621 | * interrupted by a kill signal. |
3622 | * | ||
3623 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | ||
4859 | */ | 3624 | */ |
4860 | int __sched wait_for_completion_killable(struct completion *x) | 3625 | int __sched wait_for_completion_killable(struct completion *x) |
4861 | { | 3626 | { |
@@ -4874,6 +3639,9 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4874 | * This waits for either a completion of a specific task to be | 3639 | * This waits for either a completion of a specific task to be |
4875 | * signaled or for a specified timeout to expire. It can be | 3640 | * signaled or for a specified timeout to expire. It can be |
4876 | * interrupted by a kill signal. The timeout is in jiffies. | 3641 | * interrupted by a kill signal. The timeout is in jiffies. |
3642 | * | ||
3643 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | ||
3644 | * positive (at least 1, or number of jiffies left till timeout) if completed. | ||
4877 | */ | 3645 | */ |
4878 | long __sched | 3646 | long __sched |
4879 | wait_for_completion_killable_timeout(struct completion *x, | 3647 | wait_for_completion_killable_timeout(struct completion *x, |
@@ -5360,7 +4128,7 @@ recheck: | |||
5360 | on_rq = p->on_rq; | 4128 | on_rq = p->on_rq; |
5361 | running = task_current(rq, p); | 4129 | running = task_current(rq, p); |
5362 | if (on_rq) | 4130 | if (on_rq) |
5363 | deactivate_task(rq, p, 0); | 4131 | dequeue_task(rq, p, 0); |
5364 | if (running) | 4132 | if (running) |
5365 | p->sched_class->put_prev_task(rq, p); | 4133 | p->sched_class->put_prev_task(rq, p); |
5366 | 4134 | ||
@@ -5373,7 +4141,7 @@ recheck: | |||
5373 | if (running) | 4141 | if (running) |
5374 | p->sched_class->set_curr_task(rq); | 4142 | p->sched_class->set_curr_task(rq); |
5375 | if (on_rq) | 4143 | if (on_rq) |
5376 | activate_task(rq, p, 0); | 4144 | enqueue_task(rq, p, 0); |
5377 | 4145 | ||
5378 | check_class_changed(rq, p, prev_class, oldprio); | 4146 | check_class_changed(rq, p, prev_class, oldprio); |
5379 | task_rq_unlock(rq, p, &flags); | 4147 | task_rq_unlock(rq, p, &flags); |
@@ -5556,7 +4324,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
5556 | goto out_free_cpus_allowed; | 4324 | goto out_free_cpus_allowed; |
5557 | } | 4325 | } |
5558 | retval = -EPERM; | 4326 | retval = -EPERM; |
5559 | if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) | 4327 | if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) |
5560 | goto out_unlock; | 4328 | goto out_unlock; |
5561 | 4329 | ||
5562 | retval = security_task_setscheduler(p); | 4330 | retval = security_task_setscheduler(p); |
@@ -5838,6 +4606,13 @@ again: | |||
5838 | */ | 4606 | */ |
5839 | if (preempt && rq != p_rq) | 4607 | if (preempt && rq != p_rq) |
5840 | resched_task(p_rq->curr); | 4608 | resched_task(p_rq->curr); |
4609 | } else { | ||
4610 | /* | ||
4611 | * We might have set it in task_yield_fair(), but are | ||
4612 | * not going to schedule(), so don't want to skip | ||
4613 | * the next update. | ||
4614 | */ | ||
4615 | rq->skip_clock_update = 0; | ||
5841 | } | 4616 | } |
5842 | 4617 | ||
5843 | out: | 4618 | out: |
@@ -6005,7 +4780,7 @@ void sched_show_task(struct task_struct *p) | |||
6005 | free = stack_not_used(p); | 4780 | free = stack_not_used(p); |
6006 | #endif | 4781 | #endif |
6007 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4782 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
6008 | task_pid_nr(p), task_pid_nr(p->real_parent), | 4783 | task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), |
6009 | (unsigned long)task_thread_info(p)->flags); | 4784 | (unsigned long)task_thread_info(p)->flags); |
6010 | 4785 | ||
6011 | show_stack(p, NULL); | 4786 | show_stack(p, NULL); |
@@ -6099,53 +4874,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
6099 | */ | 4874 | */ |
6100 | idle->sched_class = &idle_sched_class; | 4875 | idle->sched_class = &idle_sched_class; |
6101 | ftrace_graph_init_idle_task(idle, cpu); | 4876 | ftrace_graph_init_idle_task(idle, cpu); |
6102 | } | 4877 | #if defined(CONFIG_SMP) |
6103 | 4878 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); | |
6104 | /* | 4879 | #endif |
6105 | * Increase the granularity value when there are more CPUs, | ||
6106 | * because with more CPUs the 'effective latency' as visible | ||
6107 | * to users decreases. But the relationship is not linear, | ||
6108 | * so pick a second-best guess by going with the log2 of the | ||
6109 | * number of CPUs. | ||
6110 | * | ||
6111 | * This idea comes from the SD scheduler of Con Kolivas: | ||
6112 | */ | ||
6113 | static int get_update_sysctl_factor(void) | ||
6114 | { | ||
6115 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
6116 | unsigned int factor; | ||
6117 | |||
6118 | switch (sysctl_sched_tunable_scaling) { | ||
6119 | case SCHED_TUNABLESCALING_NONE: | ||
6120 | factor = 1; | ||
6121 | break; | ||
6122 | case SCHED_TUNABLESCALING_LINEAR: | ||
6123 | factor = cpus; | ||
6124 | break; | ||
6125 | case SCHED_TUNABLESCALING_LOG: | ||
6126 | default: | ||
6127 | factor = 1 + ilog2(cpus); | ||
6128 | break; | ||
6129 | } | ||
6130 | |||
6131 | return factor; | ||
6132 | } | ||
6133 | |||
6134 | static void update_sysctl(void) | ||
6135 | { | ||
6136 | unsigned int factor = get_update_sysctl_factor(); | ||
6137 | |||
6138 | #define SET_SYSCTL(name) \ | ||
6139 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
6140 | SET_SYSCTL(sched_min_granularity); | ||
6141 | SET_SYSCTL(sched_latency); | ||
6142 | SET_SYSCTL(sched_wakeup_granularity); | ||
6143 | #undef SET_SYSCTL | ||
6144 | } | ||
6145 | |||
6146 | static inline void sched_init_granularity(void) | ||
6147 | { | ||
6148 | update_sysctl(); | ||
6149 | } | 4880 | } |
6150 | 4881 | ||
6151 | #ifdef CONFIG_SMP | 4882 | #ifdef CONFIG_SMP |
@@ -6261,9 +4992,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
6261 | * placed properly. | 4992 | * placed properly. |
6262 | */ | 4993 | */ |
6263 | if (p->on_rq) { | 4994 | if (p->on_rq) { |
6264 | deactivate_task(rq_src, p, 0); | 4995 | dequeue_task(rq_src, p, 0); |
6265 | set_task_cpu(p, dest_cpu); | 4996 | set_task_cpu(p, dest_cpu); |
6266 | activate_task(rq_dest, p, 0); | 4997 | enqueue_task(rq_dest, p, 0); |
6267 | check_preempt_curr(rq_dest, p, 0); | 4998 | check_preempt_curr(rq_dest, p, 0); |
6268 | } | 4999 | } |
6269 | done: | 5000 | done: |
@@ -6334,30 +5065,6 @@ static void calc_global_load_remove(struct rq *rq) | |||
6334 | rq->calc_load_active = 0; | 5065 | rq->calc_load_active = 0; |
6335 | } | 5066 | } |
6336 | 5067 | ||
6337 | #ifdef CONFIG_CFS_BANDWIDTH | ||
6338 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
6339 | { | ||
6340 | struct cfs_rq *cfs_rq; | ||
6341 | |||
6342 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
6343 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
6344 | |||
6345 | if (!cfs_rq->runtime_enabled) | ||
6346 | continue; | ||
6347 | |||
6348 | /* | ||
6349 | * clock_task is not advancing so we just need to make sure | ||
6350 | * there's some valid quota amount | ||
6351 | */ | ||
6352 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
6353 | if (cfs_rq_throttled(cfs_rq)) | ||
6354 | unthrottle_cfs_rq(cfs_rq); | ||
6355 | } | ||
6356 | } | ||
6357 | #else | ||
6358 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
6359 | #endif | ||
6360 | |||
6361 | /* | 5068 | /* |
6362 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 5069 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
6363 | * try_to_wake_up()->select_task_rq(). | 5070 | * try_to_wake_up()->select_task_rq(). |
@@ -6463,7 +5170,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
6463 | static void | 5170 | static void |
6464 | set_table_entry(struct ctl_table *entry, | 5171 | set_table_entry(struct ctl_table *entry, |
6465 | const char *procname, void *data, int maxlen, | 5172 | const char *procname, void *data, int maxlen, |
6466 | mode_t mode, proc_handler *proc_handler) | 5173 | umode_t mode, proc_handler *proc_handler) |
6467 | { | 5174 | { |
6468 | entry->procname = procname; | 5175 | entry->procname = procname; |
6469 | entry->data = data; | 5176 | entry->data = data; |
@@ -6963,6 +5670,12 @@ out: | |||
6963 | return -ENOMEM; | 5670 | return -ENOMEM; |
6964 | } | 5671 | } |
6965 | 5672 | ||
5673 | /* | ||
5674 | * By default the system creates a single root-domain with all cpus as | ||
5675 | * members (mimicking the global state we have today). | ||
5676 | */ | ||
5677 | struct root_domain def_root_domain; | ||
5678 | |||
6966 | static void init_defrootdomain(void) | 5679 | static void init_defrootdomain(void) |
6967 | { | 5680 | { |
6968 | init_rootdomain(&def_root_domain); | 5681 | init_rootdomain(&def_root_domain); |
@@ -7034,6 +5747,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
7034 | } | 5747 | } |
7035 | 5748 | ||
7036 | /* | 5749 | /* |
5750 | * Keep a special pointer to the highest sched_domain that has | ||
5751 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this | ||
5752 | * allows us to avoid some pointer chasing select_idle_sibling(). | ||
5753 | * | ||
5754 | * Also keep a unique ID per domain (we use the first cpu number in | ||
5755 | * the cpumask of the domain), this allows us to quickly tell if | ||
5756 | * two cpus are in the same cache domain, see ttwu_share_cache(). | ||
5757 | */ | ||
5758 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
5759 | DEFINE_PER_CPU(int, sd_llc_id); | ||
5760 | |||
5761 | static void update_top_cache_domain(int cpu) | ||
5762 | { | ||
5763 | struct sched_domain *sd; | ||
5764 | int id = cpu; | ||
5765 | |||
5766 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
5767 | if (sd) | ||
5768 | id = cpumask_first(sched_domain_span(sd)); | ||
5769 | |||
5770 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
5771 | per_cpu(sd_llc_id, cpu) = id; | ||
5772 | } | ||
5773 | |||
5774 | /* | ||
7037 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 5775 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
7038 | * hold the hotplug lock. | 5776 | * hold the hotplug lock. |
7039 | */ | 5777 | */ |
@@ -7072,6 +5810,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
7072 | tmp = rq->sd; | 5810 | tmp = rq->sd; |
7073 | rcu_assign_pointer(rq->sd, sd); | 5811 | rcu_assign_pointer(rq->sd, sd); |
7074 | destroy_sched_domains(tmp, cpu); | 5812 | destroy_sched_domains(tmp, cpu); |
5813 | |||
5814 | update_top_cache_domain(cpu); | ||
7075 | } | 5815 | } |
7076 | 5816 | ||
7077 | /* cpus with isolated domains */ | 5817 | /* cpus with isolated domains */ |
@@ -7231,7 +5971,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
7231 | continue; | 5971 | continue; |
7232 | 5972 | ||
7233 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5973 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
7234 | GFP_KERNEL, cpu_to_node(i)); | 5974 | GFP_KERNEL, cpu_to_node(cpu)); |
7235 | 5975 | ||
7236 | if (!sg) | 5976 | if (!sg) |
7237 | goto fail; | 5977 | goto fail; |
@@ -7369,6 +6109,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7369 | return; | 6109 | return; |
7370 | 6110 | ||
7371 | update_group_power(sd, cpu); | 6111 | update_group_power(sd, cpu); |
6112 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); | ||
6113 | } | ||
6114 | |||
6115 | int __weak arch_sd_sibling_asym_packing(void) | ||
6116 | { | ||
6117 | return 0*SD_ASYM_PACKING; | ||
7372 | } | 6118 | } |
7373 | 6119 | ||
7374 | /* | 6120 | /* |
@@ -7923,54 +6669,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7923 | } | 6669 | } |
7924 | 6670 | ||
7925 | #ifdef CONFIG_SCHED_MC | 6671 | #ifdef CONFIG_SCHED_MC |
7926 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, | 6672 | static ssize_t sched_mc_power_savings_show(struct device *dev, |
7927 | struct sysdev_class_attribute *attr, | 6673 | struct device_attribute *attr, |
7928 | char *page) | 6674 | char *buf) |
7929 | { | 6675 | { |
7930 | return sprintf(page, "%u\n", sched_mc_power_savings); | 6676 | return sprintf(buf, "%u\n", sched_mc_power_savings); |
7931 | } | 6677 | } |
7932 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, | 6678 | static ssize_t sched_mc_power_savings_store(struct device *dev, |
7933 | struct sysdev_class_attribute *attr, | 6679 | struct device_attribute *attr, |
7934 | const char *buf, size_t count) | 6680 | const char *buf, size_t count) |
7935 | { | 6681 | { |
7936 | return sched_power_savings_store(buf, count, 0); | 6682 | return sched_power_savings_store(buf, count, 0); |
7937 | } | 6683 | } |
7938 | static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, | 6684 | static DEVICE_ATTR(sched_mc_power_savings, 0644, |
7939 | sched_mc_power_savings_show, | 6685 | sched_mc_power_savings_show, |
7940 | sched_mc_power_savings_store); | 6686 | sched_mc_power_savings_store); |
7941 | #endif | 6687 | #endif |
7942 | 6688 | ||
7943 | #ifdef CONFIG_SCHED_SMT | 6689 | #ifdef CONFIG_SCHED_SMT |
7944 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, | 6690 | static ssize_t sched_smt_power_savings_show(struct device *dev, |
7945 | struct sysdev_class_attribute *attr, | 6691 | struct device_attribute *attr, |
7946 | char *page) | 6692 | char *buf) |
7947 | { | 6693 | { |
7948 | return sprintf(page, "%u\n", sched_smt_power_savings); | 6694 | return sprintf(buf, "%u\n", sched_smt_power_savings); |
7949 | } | 6695 | } |
7950 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, | 6696 | static ssize_t sched_smt_power_savings_store(struct device *dev, |
7951 | struct sysdev_class_attribute *attr, | 6697 | struct device_attribute *attr, |
7952 | const char *buf, size_t count) | 6698 | const char *buf, size_t count) |
7953 | { | 6699 | { |
7954 | return sched_power_savings_store(buf, count, 1); | 6700 | return sched_power_savings_store(buf, count, 1); |
7955 | } | 6701 | } |
7956 | static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, | 6702 | static DEVICE_ATTR(sched_smt_power_savings, 0644, |
7957 | sched_smt_power_savings_show, | 6703 | sched_smt_power_savings_show, |
7958 | sched_smt_power_savings_store); | 6704 | sched_smt_power_savings_store); |
7959 | #endif | 6705 | #endif |
7960 | 6706 | ||
7961 | int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | 6707 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) |
7962 | { | 6708 | { |
7963 | int err = 0; | 6709 | int err = 0; |
7964 | 6710 | ||
7965 | #ifdef CONFIG_SCHED_SMT | 6711 | #ifdef CONFIG_SCHED_SMT |
7966 | if (smt_capable()) | 6712 | if (smt_capable()) |
7967 | err = sysfs_create_file(&cls->kset.kobj, | 6713 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); |
7968 | &attr_sched_smt_power_savings.attr); | ||
7969 | #endif | 6714 | #endif |
7970 | #ifdef CONFIG_SCHED_MC | 6715 | #ifdef CONFIG_SCHED_MC |
7971 | if (!err && mc_capable()) | 6716 | if (!err && mc_capable()) |
7972 | err = sysfs_create_file(&cls->kset.kobj, | 6717 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); |
7973 | &attr_sched_mc_power_savings.attr); | ||
7974 | #endif | 6718 | #endif |
7975 | return err; | 6719 | return err; |
7976 | } | 6720 | } |
@@ -7984,7 +6728,7 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7984 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | 6728 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, |
7985 | void *hcpu) | 6729 | void *hcpu) |
7986 | { | 6730 | { |
7987 | switch (action & ~CPU_TASKS_FROZEN) { | 6731 | switch (action) { |
7988 | case CPU_ONLINE: | 6732 | case CPU_ONLINE: |
7989 | case CPU_DOWN_FAILED: | 6733 | case CPU_DOWN_FAILED: |
7990 | cpuset_update_active_cpus(); | 6734 | cpuset_update_active_cpus(); |
@@ -7997,33 +6741,10 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
7997 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 6741 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
7998 | void *hcpu) | 6742 | void *hcpu) |
7999 | { | 6743 | { |
8000 | switch (action & ~CPU_TASKS_FROZEN) { | ||
8001 | case CPU_DOWN_PREPARE: | ||
8002 | cpuset_update_active_cpus(); | ||
8003 | return NOTIFY_OK; | ||
8004 | default: | ||
8005 | return NOTIFY_DONE; | ||
8006 | } | ||
8007 | } | ||
8008 | |||
8009 | static int update_runtime(struct notifier_block *nfb, | ||
8010 | unsigned long action, void *hcpu) | ||
8011 | { | ||
8012 | int cpu = (int)(long)hcpu; | ||
8013 | |||
8014 | switch (action) { | 6744 | switch (action) { |
8015 | case CPU_DOWN_PREPARE: | 6745 | case CPU_DOWN_PREPARE: |
8016 | case CPU_DOWN_PREPARE_FROZEN: | 6746 | cpuset_update_active_cpus(); |
8017 | disable_runtime(cpu_rq(cpu)); | ||
8018 | return NOTIFY_OK; | ||
8019 | |||
8020 | case CPU_DOWN_FAILED: | ||
8021 | case CPU_DOWN_FAILED_FROZEN: | ||
8022 | case CPU_ONLINE: | ||
8023 | case CPU_ONLINE_FROZEN: | ||
8024 | enable_runtime(cpu_rq(cpu)); | ||
8025 | return NOTIFY_OK; | 6747 | return NOTIFY_OK; |
8026 | |||
8027 | default: | 6748 | default: |
8028 | return NOTIFY_DONE; | 6749 | return NOTIFY_DONE; |
8029 | } | 6750 | } |
@@ -8077,104 +6798,11 @@ int in_sched_functions(unsigned long addr) | |||
8077 | && addr < (unsigned long)__sched_text_end); | 6798 | && addr < (unsigned long)__sched_text_end); |
8078 | } | 6799 | } |
8079 | 6800 | ||
8080 | static void init_cfs_rq(struct cfs_rq *cfs_rq) | 6801 | #ifdef CONFIG_CGROUP_SCHED |
8081 | { | 6802 | struct task_group root_task_group; |
8082 | cfs_rq->tasks_timeline = RB_ROOT; | ||
8083 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
8084 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
8085 | #ifndef CONFIG_64BIT | ||
8086 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
8087 | #endif | ||
8088 | } | ||
8089 | |||
8090 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
8091 | { | ||
8092 | struct rt_prio_array *array; | ||
8093 | int i; | ||
8094 | |||
8095 | array = &rt_rq->active; | ||
8096 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
8097 | INIT_LIST_HEAD(array->queue + i); | ||
8098 | __clear_bit(i, array->bitmap); | ||
8099 | } | ||
8100 | /* delimiter for bitsearch: */ | ||
8101 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
8102 | |||
8103 | #if defined CONFIG_SMP | ||
8104 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
8105 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
8106 | rt_rq->rt_nr_migratory = 0; | ||
8107 | rt_rq->overloaded = 0; | ||
8108 | plist_head_init(&rt_rq->pushable_tasks); | ||
8109 | #endif | ||
8110 | |||
8111 | rt_rq->rt_time = 0; | ||
8112 | rt_rq->rt_throttled = 0; | ||
8113 | rt_rq->rt_runtime = 0; | ||
8114 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
8115 | } | ||
8116 | |||
8117 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8118 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
8119 | struct sched_entity *se, int cpu, | ||
8120 | struct sched_entity *parent) | ||
8121 | { | ||
8122 | struct rq *rq = cpu_rq(cpu); | ||
8123 | |||
8124 | cfs_rq->tg = tg; | ||
8125 | cfs_rq->rq = rq; | ||
8126 | #ifdef CONFIG_SMP | ||
8127 | /* allow initial update_cfs_load() to truncate */ | ||
8128 | cfs_rq->load_stamp = 1; | ||
8129 | #endif | ||
8130 | init_cfs_rq_runtime(cfs_rq); | ||
8131 | |||
8132 | tg->cfs_rq[cpu] = cfs_rq; | ||
8133 | tg->se[cpu] = se; | ||
8134 | |||
8135 | /* se could be NULL for root_task_group */ | ||
8136 | if (!se) | ||
8137 | return; | ||
8138 | |||
8139 | if (!parent) | ||
8140 | se->cfs_rq = &rq->cfs; | ||
8141 | else | ||
8142 | se->cfs_rq = parent->my_q; | ||
8143 | |||
8144 | se->my_q = cfs_rq; | ||
8145 | update_load_set(&se->load, 0); | ||
8146 | se->parent = parent; | ||
8147 | } | ||
8148 | #endif | 6803 | #endif |
8149 | 6804 | ||
8150 | #ifdef CONFIG_RT_GROUP_SCHED | 6805 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
8151 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
8152 | struct sched_rt_entity *rt_se, int cpu, | ||
8153 | struct sched_rt_entity *parent) | ||
8154 | { | ||
8155 | struct rq *rq = cpu_rq(cpu); | ||
8156 | |||
8157 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
8158 | rt_rq->rt_nr_boosted = 0; | ||
8159 | rt_rq->rq = rq; | ||
8160 | rt_rq->tg = tg; | ||
8161 | |||
8162 | tg->rt_rq[cpu] = rt_rq; | ||
8163 | tg->rt_se[cpu] = rt_se; | ||
8164 | |||
8165 | if (!rt_se) | ||
8166 | return; | ||
8167 | |||
8168 | if (!parent) | ||
8169 | rt_se->rt_rq = &rq->rt; | ||
8170 | else | ||
8171 | rt_se->rt_rq = parent->my_q; | ||
8172 | |||
8173 | rt_se->my_q = rt_rq; | ||
8174 | rt_se->parent = parent; | ||
8175 | INIT_LIST_HEAD(&rt_se->run_list); | ||
8176 | } | ||
8177 | #endif | ||
8178 | 6806 | ||
8179 | void __init sched_init(void) | 6807 | void __init sched_init(void) |
8180 | { | 6808 | { |
@@ -8232,9 +6860,17 @@ void __init sched_init(void) | |||
8232 | #ifdef CONFIG_CGROUP_SCHED | 6860 | #ifdef CONFIG_CGROUP_SCHED |
8233 | list_add(&root_task_group.list, &task_groups); | 6861 | list_add(&root_task_group.list, &task_groups); |
8234 | INIT_LIST_HEAD(&root_task_group.children); | 6862 | INIT_LIST_HEAD(&root_task_group.children); |
6863 | INIT_LIST_HEAD(&root_task_group.siblings); | ||
8235 | autogroup_init(&init_task); | 6864 | autogroup_init(&init_task); |
6865 | |||
8236 | #endif /* CONFIG_CGROUP_SCHED */ | 6866 | #endif /* CONFIG_CGROUP_SCHED */ |
8237 | 6867 | ||
6868 | #ifdef CONFIG_CGROUP_CPUACCT | ||
6869 | root_cpuacct.cpustat = &kernel_cpustat; | ||
6870 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
6871 | /* Too early, not expected to fail */ | ||
6872 | BUG_ON(!root_cpuacct.cpuusage); | ||
6873 | #endif | ||
8238 | for_each_possible_cpu(i) { | 6874 | for_each_possible_cpu(i) { |
8239 | struct rq *rq; | 6875 | struct rq *rq; |
8240 | 6876 | ||
@@ -8246,7 +6882,7 @@ void __init sched_init(void) | |||
8246 | init_cfs_rq(&rq->cfs); | 6882 | init_cfs_rq(&rq->cfs); |
8247 | init_rt_rq(&rq->rt, rq); | 6883 | init_rt_rq(&rq->rt, rq); |
8248 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6884 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8249 | root_task_group.shares = root_task_group_load; | 6885 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
8250 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6886 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
8251 | /* | 6887 | /* |
8252 | * How much cpu bandwidth does root_task_group get? | 6888 | * How much cpu bandwidth does root_task_group get? |
@@ -8296,7 +6932,7 @@ void __init sched_init(void) | |||
8296 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6932 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
8297 | rq_attach_root(rq, &def_root_domain); | 6933 | rq_attach_root(rq, &def_root_domain); |
8298 | #ifdef CONFIG_NO_HZ | 6934 | #ifdef CONFIG_NO_HZ |
8299 | rq->nohz_balance_kick = 0; | 6935 | rq->nohz_flags = 0; |
8300 | #endif | 6936 | #endif |
8301 | #endif | 6937 | #endif |
8302 | init_rq_hrtick(rq); | 6938 | init_rq_hrtick(rq); |
@@ -8309,10 +6945,6 @@ void __init sched_init(void) | |||
8309 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6945 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
8310 | #endif | 6946 | #endif |
8311 | 6947 | ||
8312 | #ifdef CONFIG_SMP | ||
8313 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
8314 | #endif | ||
8315 | |||
8316 | #ifdef CONFIG_RT_MUTEXES | 6948 | #ifdef CONFIG_RT_MUTEXES |
8317 | plist_head_init(&init_task.pi_waiters); | 6949 | plist_head_init(&init_task.pi_waiters); |
8318 | #endif | 6950 | #endif |
@@ -8340,17 +6972,11 @@ void __init sched_init(void) | |||
8340 | 6972 | ||
8341 | #ifdef CONFIG_SMP | 6973 | #ifdef CONFIG_SMP |
8342 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 6974 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
8343 | #ifdef CONFIG_NO_HZ | ||
8344 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
8345 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | ||
8346 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
8347 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
8348 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
8349 | #endif | ||
8350 | /* May be allocated at isolcpus cmdline parse time */ | 6975 | /* May be allocated at isolcpus cmdline parse time */ |
8351 | if (cpu_isolated_map == NULL) | 6976 | if (cpu_isolated_map == NULL) |
8352 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 6977 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8353 | #endif /* SMP */ | 6978 | #endif |
6979 | init_sched_fair_class(); | ||
8354 | 6980 | ||
8355 | scheduler_running = 1; | 6981 | scheduler_running = 1; |
8356 | } | 6982 | } |
@@ -8400,10 +7026,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8400 | 7026 | ||
8401 | on_rq = p->on_rq; | 7027 | on_rq = p->on_rq; |
8402 | if (on_rq) | 7028 | if (on_rq) |
8403 | deactivate_task(rq, p, 0); | 7029 | dequeue_task(rq, p, 0); |
8404 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 7030 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
8405 | if (on_rq) { | 7031 | if (on_rq) { |
8406 | activate_task(rq, p, 0); | 7032 | enqueue_task(rq, p, 0); |
8407 | resched_task(rq->curr); | 7033 | resched_task(rq->curr); |
8408 | } | 7034 | } |
8409 | 7035 | ||
@@ -8502,169 +7128,10 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
8502 | 7128 | ||
8503 | #endif | 7129 | #endif |
8504 | 7130 | ||
8505 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8506 | static void free_fair_sched_group(struct task_group *tg) | ||
8507 | { | ||
8508 | int i; | ||
8509 | |||
8510 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8511 | |||
8512 | for_each_possible_cpu(i) { | ||
8513 | if (tg->cfs_rq) | ||
8514 | kfree(tg->cfs_rq[i]); | ||
8515 | if (tg->se) | ||
8516 | kfree(tg->se[i]); | ||
8517 | } | ||
8518 | |||
8519 | kfree(tg->cfs_rq); | ||
8520 | kfree(tg->se); | ||
8521 | } | ||
8522 | |||
8523 | static | ||
8524 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
8525 | { | ||
8526 | struct cfs_rq *cfs_rq; | ||
8527 | struct sched_entity *se; | ||
8528 | int i; | ||
8529 | |||
8530 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
8531 | if (!tg->cfs_rq) | ||
8532 | goto err; | ||
8533 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
8534 | if (!tg->se) | ||
8535 | goto err; | ||
8536 | |||
8537 | tg->shares = NICE_0_LOAD; | ||
8538 | |||
8539 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8540 | |||
8541 | for_each_possible_cpu(i) { | ||
8542 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
8543 | GFP_KERNEL, cpu_to_node(i)); | ||
8544 | if (!cfs_rq) | ||
8545 | goto err; | ||
8546 | |||
8547 | se = kzalloc_node(sizeof(struct sched_entity), | ||
8548 | GFP_KERNEL, cpu_to_node(i)); | ||
8549 | if (!se) | ||
8550 | goto err_free_rq; | ||
8551 | |||
8552 | init_cfs_rq(cfs_rq); | ||
8553 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
8554 | } | ||
8555 | |||
8556 | return 1; | ||
8557 | |||
8558 | err_free_rq: | ||
8559 | kfree(cfs_rq); | ||
8560 | err: | ||
8561 | return 0; | ||
8562 | } | ||
8563 | |||
8564 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
8565 | { | ||
8566 | struct rq *rq = cpu_rq(cpu); | ||
8567 | unsigned long flags; | ||
8568 | |||
8569 | /* | ||
8570 | * Only empty task groups can be destroyed; so we can speculatively | ||
8571 | * check on_list without danger of it being re-added. | ||
8572 | */ | ||
8573 | if (!tg->cfs_rq[cpu]->on_list) | ||
8574 | return; | ||
8575 | |||
8576 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8577 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8578 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8579 | } | ||
8580 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | ||
8581 | static inline void free_fair_sched_group(struct task_group *tg) | ||
8582 | { | ||
8583 | } | ||
8584 | |||
8585 | static inline | ||
8586 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
8587 | { | ||
8588 | return 1; | ||
8589 | } | ||
8590 | |||
8591 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
8592 | { | ||
8593 | } | ||
8594 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
8595 | |||
8596 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8597 | static void free_rt_sched_group(struct task_group *tg) | ||
8598 | { | ||
8599 | int i; | ||
8600 | |||
8601 | if (tg->rt_se) | ||
8602 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
8603 | |||
8604 | for_each_possible_cpu(i) { | ||
8605 | if (tg->rt_rq) | ||
8606 | kfree(tg->rt_rq[i]); | ||
8607 | if (tg->rt_se) | ||
8608 | kfree(tg->rt_se[i]); | ||
8609 | } | ||
8610 | |||
8611 | kfree(tg->rt_rq); | ||
8612 | kfree(tg->rt_se); | ||
8613 | } | ||
8614 | |||
8615 | static | ||
8616 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
8617 | { | ||
8618 | struct rt_rq *rt_rq; | ||
8619 | struct sched_rt_entity *rt_se; | ||
8620 | int i; | ||
8621 | |||
8622 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
8623 | if (!tg->rt_rq) | ||
8624 | goto err; | ||
8625 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
8626 | if (!tg->rt_se) | ||
8627 | goto err; | ||
8628 | |||
8629 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
8630 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
8631 | |||
8632 | for_each_possible_cpu(i) { | ||
8633 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
8634 | GFP_KERNEL, cpu_to_node(i)); | ||
8635 | if (!rt_rq) | ||
8636 | goto err; | ||
8637 | |||
8638 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
8639 | GFP_KERNEL, cpu_to_node(i)); | ||
8640 | if (!rt_se) | ||
8641 | goto err_free_rq; | ||
8642 | |||
8643 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
8644 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8645 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
8646 | } | ||
8647 | |||
8648 | return 1; | ||
8649 | |||
8650 | err_free_rq: | ||
8651 | kfree(rt_rq); | ||
8652 | err: | ||
8653 | return 0; | ||
8654 | } | ||
8655 | #else /* !CONFIG_RT_GROUP_SCHED */ | ||
8656 | static inline void free_rt_sched_group(struct task_group *tg) | ||
8657 | { | ||
8658 | } | ||
8659 | |||
8660 | static inline | ||
8661 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
8662 | { | ||
8663 | return 1; | ||
8664 | } | ||
8665 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
8666 | |||
8667 | #ifdef CONFIG_CGROUP_SCHED | 7131 | #ifdef CONFIG_CGROUP_SCHED |
7132 | /* task_group_lock serializes the addition/removal of task groups */ | ||
7133 | static DEFINE_SPINLOCK(task_group_lock); | ||
7134 | |||
8668 | static void free_sched_group(struct task_group *tg) | 7135 | static void free_sched_group(struct task_group *tg) |
8669 | { | 7136 | { |
8670 | free_fair_sched_group(tg); | 7137 | free_fair_sched_group(tg); |
@@ -8769,50 +7236,6 @@ void sched_move_task(struct task_struct *tsk) | |||
8769 | } | 7236 | } |
8770 | #endif /* CONFIG_CGROUP_SCHED */ | 7237 | #endif /* CONFIG_CGROUP_SCHED */ |
8771 | 7238 | ||
8772 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8773 | static DEFINE_MUTEX(shares_mutex); | ||
8774 | |||
8775 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
8776 | { | ||
8777 | int i; | ||
8778 | unsigned long flags; | ||
8779 | |||
8780 | /* | ||
8781 | * We can't change the weight of the root cgroup. | ||
8782 | */ | ||
8783 | if (!tg->se[0]) | ||
8784 | return -EINVAL; | ||
8785 | |||
8786 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
8787 | |||
8788 | mutex_lock(&shares_mutex); | ||
8789 | if (tg->shares == shares) | ||
8790 | goto done; | ||
8791 | |||
8792 | tg->shares = shares; | ||
8793 | for_each_possible_cpu(i) { | ||
8794 | struct rq *rq = cpu_rq(i); | ||
8795 | struct sched_entity *se; | ||
8796 | |||
8797 | se = tg->se[i]; | ||
8798 | /* Propagate contribution to hierarchy */ | ||
8799 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8800 | for_each_sched_entity(se) | ||
8801 | update_cfs_shares(group_cfs_rq(se)); | ||
8802 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8803 | } | ||
8804 | |||
8805 | done: | ||
8806 | mutex_unlock(&shares_mutex); | ||
8807 | return 0; | ||
8808 | } | ||
8809 | |||
8810 | unsigned long sched_group_shares(struct task_group *tg) | ||
8811 | { | ||
8812 | return tg->shares; | ||
8813 | } | ||
8814 | #endif | ||
8815 | |||
8816 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | 7239 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
8817 | static unsigned long to_ratio(u64 period, u64 runtime) | 7240 | static unsigned long to_ratio(u64 period, u64 runtime) |
8818 | { | 7241 | { |
@@ -8835,7 +7258,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
8835 | struct task_struct *g, *p; | 7258 | struct task_struct *g, *p; |
8836 | 7259 | ||
8837 | do_each_thread(g, p) { | 7260 | do_each_thread(g, p) { |
8838 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 7261 | if (rt_task(p) && task_rq(p)->rt.tg == tg) |
8839 | return 1; | 7262 | return 1; |
8840 | } while_each_thread(g, p); | 7263 | } while_each_thread(g, p); |
8841 | 7264 | ||
@@ -9127,24 +7550,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9127 | sched_destroy_group(tg); | 7550 | sched_destroy_group(tg); |
9128 | } | 7551 | } |
9129 | 7552 | ||
9130 | static int | 7553 | static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
9131 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 7554 | struct cgroup_taskset *tset) |
9132 | { | 7555 | { |
7556 | struct task_struct *task; | ||
7557 | |||
7558 | cgroup_taskset_for_each(task, cgrp, tset) { | ||
9133 | #ifdef CONFIG_RT_GROUP_SCHED | 7559 | #ifdef CONFIG_RT_GROUP_SCHED |
9134 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) | 7560 | if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) |
9135 | return -EINVAL; | 7561 | return -EINVAL; |
9136 | #else | 7562 | #else |
9137 | /* We don't support RT-tasks being in separate groups */ | 7563 | /* We don't support RT-tasks being in separate groups */ |
9138 | if (tsk->sched_class != &fair_sched_class) | 7564 | if (task->sched_class != &fair_sched_class) |
9139 | return -EINVAL; | 7565 | return -EINVAL; |
9140 | #endif | 7566 | #endif |
7567 | } | ||
9141 | return 0; | 7568 | return 0; |
9142 | } | 7569 | } |
9143 | 7570 | ||
9144 | static void | 7571 | static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
9145 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 7572 | struct cgroup_taskset *tset) |
9146 | { | 7573 | { |
9147 | sched_move_task(tsk); | 7574 | struct task_struct *task; |
7575 | |||
7576 | cgroup_taskset_for_each(task, cgrp, tset) | ||
7577 | sched_move_task(task); | ||
9148 | } | 7578 | } |
9149 | 7579 | ||
9150 | static void | 7580 | static void |
@@ -9186,8 +7616,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | |||
9186 | 7616 | ||
9187 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | 7617 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) |
9188 | { | 7618 | { |
9189 | int i, ret = 0, runtime_enabled; | 7619 | int i, ret = 0, runtime_enabled, runtime_was_enabled; |
9190 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7620 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
9191 | 7621 | ||
9192 | if (tg == &root_task_group) | 7622 | if (tg == &root_task_group) |
9193 | return -EINVAL; | 7623 | return -EINVAL; |
@@ -9214,6 +7644,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
9214 | goto out_unlock; | 7644 | goto out_unlock; |
9215 | 7645 | ||
9216 | runtime_enabled = quota != RUNTIME_INF; | 7646 | runtime_enabled = quota != RUNTIME_INF; |
7647 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; | ||
7648 | account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); | ||
9217 | raw_spin_lock_irq(&cfs_b->lock); | 7649 | raw_spin_lock_irq(&cfs_b->lock); |
9218 | cfs_b->period = ns_to_ktime(period); | 7650 | cfs_b->period = ns_to_ktime(period); |
9219 | cfs_b->quota = quota; | 7651 | cfs_b->quota = quota; |
@@ -9229,13 +7661,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
9229 | 7661 | ||
9230 | for_each_possible_cpu(i) { | 7662 | for_each_possible_cpu(i) { |
9231 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | 7663 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; |
9232 | struct rq *rq = rq_of(cfs_rq); | 7664 | struct rq *rq = cfs_rq->rq; |
9233 | 7665 | ||
9234 | raw_spin_lock_irq(&rq->lock); | 7666 | raw_spin_lock_irq(&rq->lock); |
9235 | cfs_rq->runtime_enabled = runtime_enabled; | 7667 | cfs_rq->runtime_enabled = runtime_enabled; |
9236 | cfs_rq->runtime_remaining = 0; | 7668 | cfs_rq->runtime_remaining = 0; |
9237 | 7669 | ||
9238 | if (cfs_rq_throttled(cfs_rq)) | 7670 | if (cfs_rq->throttled) |
9239 | unthrottle_cfs_rq(cfs_rq); | 7671 | unthrottle_cfs_rq(cfs_rq); |
9240 | raw_spin_unlock_irq(&rq->lock); | 7672 | raw_spin_unlock_irq(&rq->lock); |
9241 | } | 7673 | } |
@@ -9249,7 +7681,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | |||
9249 | { | 7681 | { |
9250 | u64 quota, period; | 7682 | u64 quota, period; |
9251 | 7683 | ||
9252 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | 7684 | period = ktime_to_ns(tg->cfs_bandwidth.period); |
9253 | if (cfs_quota_us < 0) | 7685 | if (cfs_quota_us < 0) |
9254 | quota = RUNTIME_INF; | 7686 | quota = RUNTIME_INF; |
9255 | else | 7687 | else |
@@ -9262,10 +7694,10 @@ long tg_get_cfs_quota(struct task_group *tg) | |||
9262 | { | 7694 | { |
9263 | u64 quota_us; | 7695 | u64 quota_us; |
9264 | 7696 | ||
9265 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | 7697 | if (tg->cfs_bandwidth.quota == RUNTIME_INF) |
9266 | return -1; | 7698 | return -1; |
9267 | 7699 | ||
9268 | quota_us = tg_cfs_bandwidth(tg)->quota; | 7700 | quota_us = tg->cfs_bandwidth.quota; |
9269 | do_div(quota_us, NSEC_PER_USEC); | 7701 | do_div(quota_us, NSEC_PER_USEC); |
9270 | 7702 | ||
9271 | return quota_us; | 7703 | return quota_us; |
@@ -9276,10 +7708,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | |||
9276 | u64 quota, period; | 7708 | u64 quota, period; |
9277 | 7709 | ||
9278 | period = (u64)cfs_period_us * NSEC_PER_USEC; | 7710 | period = (u64)cfs_period_us * NSEC_PER_USEC; |
9279 | quota = tg_cfs_bandwidth(tg)->quota; | 7711 | quota = tg->cfs_bandwidth.quota; |
9280 | |||
9281 | if (period <= 0) | ||
9282 | return -EINVAL; | ||
9283 | 7712 | ||
9284 | return tg_set_cfs_bandwidth(tg, period, quota); | 7713 | return tg_set_cfs_bandwidth(tg, period, quota); |
9285 | } | 7714 | } |
@@ -9288,7 +7717,7 @@ long tg_get_cfs_period(struct task_group *tg) | |||
9288 | { | 7717 | { |
9289 | u64 cfs_period_us; | 7718 | u64 cfs_period_us; |
9290 | 7719 | ||
9291 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | 7720 | cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); |
9292 | do_div(cfs_period_us, NSEC_PER_USEC); | 7721 | do_div(cfs_period_us, NSEC_PER_USEC); |
9293 | 7722 | ||
9294 | return cfs_period_us; | 7723 | return cfs_period_us; |
@@ -9348,13 +7777,13 @@ static u64 normalize_cfs_quota(struct task_group *tg, | |||
9348 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | 7777 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) |
9349 | { | 7778 | { |
9350 | struct cfs_schedulable_data *d = data; | 7779 | struct cfs_schedulable_data *d = data; |
9351 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7780 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
9352 | s64 quota = 0, parent_quota = -1; | 7781 | s64 quota = 0, parent_quota = -1; |
9353 | 7782 | ||
9354 | if (!tg->parent) { | 7783 | if (!tg->parent) { |
9355 | quota = RUNTIME_INF; | 7784 | quota = RUNTIME_INF; |
9356 | } else { | 7785 | } else { |
9357 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | 7786 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; |
9358 | 7787 | ||
9359 | quota = normalize_cfs_quota(tg, d); | 7788 | quota = normalize_cfs_quota(tg, d); |
9360 | parent_quota = parent_b->hierarchal_quota; | 7789 | parent_quota = parent_b->hierarchal_quota; |
@@ -9398,7 +7827,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
9398 | struct cgroup_map_cb *cb) | 7827 | struct cgroup_map_cb *cb) |
9399 | { | 7828 | { |
9400 | struct task_group *tg = cgroup_tg(cgrp); | 7829 | struct task_group *tg = cgroup_tg(cgrp); |
9401 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7830 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
9402 | 7831 | ||
9403 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7832 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); |
9404 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | 7833 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); |
@@ -9480,8 +7909,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9480 | .name = "cpu", | 7909 | .name = "cpu", |
9481 | .create = cpu_cgroup_create, | 7910 | .create = cpu_cgroup_create, |
9482 | .destroy = cpu_cgroup_destroy, | 7911 | .destroy = cpu_cgroup_destroy, |
9483 | .can_attach_task = cpu_cgroup_can_attach_task, | 7912 | .can_attach = cpu_cgroup_can_attach, |
9484 | .attach_task = cpu_cgroup_attach_task, | 7913 | .attach = cpu_cgroup_attach, |
9485 | .exit = cpu_cgroup_exit, | 7914 | .exit = cpu_cgroup_exit, |
9486 | .populate = cpu_cgroup_populate, | 7915 | .populate = cpu_cgroup_populate, |
9487 | .subsys_id = cpu_cgroup_subsys_id, | 7916 | .subsys_id = cpu_cgroup_subsys_id, |
@@ -9499,38 +7928,16 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9499 | * (balbir@in.ibm.com). | 7928 | * (balbir@in.ibm.com). |
9500 | */ | 7929 | */ |
9501 | 7930 | ||
9502 | /* track cpu usage of a group of tasks and its child groups */ | ||
9503 | struct cpuacct { | ||
9504 | struct cgroup_subsys_state css; | ||
9505 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
9506 | u64 __percpu *cpuusage; | ||
9507 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | ||
9508 | struct cpuacct *parent; | ||
9509 | }; | ||
9510 | |||
9511 | struct cgroup_subsys cpuacct_subsys; | ||
9512 | |||
9513 | /* return cpu accounting group corresponding to this container */ | ||
9514 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
9515 | { | ||
9516 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
9517 | struct cpuacct, css); | ||
9518 | } | ||
9519 | |||
9520 | /* return cpu accounting group to which this task belongs */ | ||
9521 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
9522 | { | ||
9523 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
9524 | struct cpuacct, css); | ||
9525 | } | ||
9526 | |||
9527 | /* create a new cpu accounting group */ | 7931 | /* create a new cpu accounting group */ |
9528 | static struct cgroup_subsys_state *cpuacct_create( | 7932 | static struct cgroup_subsys_state *cpuacct_create( |
9529 | struct cgroup_subsys *ss, struct cgroup *cgrp) | 7933 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
9530 | { | 7934 | { |
9531 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 7935 | struct cpuacct *ca; |
9532 | int i; | 7936 | |
7937 | if (!cgrp->parent) | ||
7938 | return &root_cpuacct.css; | ||
9533 | 7939 | ||
7940 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
9534 | if (!ca) | 7941 | if (!ca) |
9535 | goto out; | 7942 | goto out; |
9536 | 7943 | ||
@@ -9538,18 +7945,13 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
9538 | if (!ca->cpuusage) | 7945 | if (!ca->cpuusage) |
9539 | goto out_free_ca; | 7946 | goto out_free_ca; |
9540 | 7947 | ||
9541 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7948 | ca->cpustat = alloc_percpu(struct kernel_cpustat); |
9542 | if (percpu_counter_init(&ca->cpustat[i], 0)) | 7949 | if (!ca->cpustat) |
9543 | goto out_free_counters; | 7950 | goto out_free_cpuusage; |
9544 | |||
9545 | if (cgrp->parent) | ||
9546 | ca->parent = cgroup_ca(cgrp->parent); | ||
9547 | 7951 | ||
9548 | return &ca->css; | 7952 | return &ca->css; |
9549 | 7953 | ||
9550 | out_free_counters: | 7954 | out_free_cpuusage: |
9551 | while (--i >= 0) | ||
9552 | percpu_counter_destroy(&ca->cpustat[i]); | ||
9553 | free_percpu(ca->cpuusage); | 7955 | free_percpu(ca->cpuusage); |
9554 | out_free_ca: | 7956 | out_free_ca: |
9555 | kfree(ca); | 7957 | kfree(ca); |
@@ -9562,10 +7964,8 @@ static void | |||
9562 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 7964 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
9563 | { | 7965 | { |
9564 | struct cpuacct *ca = cgroup_ca(cgrp); | 7966 | struct cpuacct *ca = cgroup_ca(cgrp); |
9565 | int i; | ||
9566 | 7967 | ||
9567 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7968 | free_percpu(ca->cpustat); |
9568 | percpu_counter_destroy(&ca->cpustat[i]); | ||
9569 | free_percpu(ca->cpuusage); | 7969 | free_percpu(ca->cpuusage); |
9570 | kfree(ca); | 7970 | kfree(ca); |
9571 | } | 7971 | } |
@@ -9658,16 +8058,31 @@ static const char *cpuacct_stat_desc[] = { | |||
9658 | }; | 8058 | }; |
9659 | 8059 | ||
9660 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 8060 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, |
9661 | struct cgroup_map_cb *cb) | 8061 | struct cgroup_map_cb *cb) |
9662 | { | 8062 | { |
9663 | struct cpuacct *ca = cgroup_ca(cgrp); | 8063 | struct cpuacct *ca = cgroup_ca(cgrp); |
9664 | int i; | 8064 | int cpu; |
8065 | s64 val = 0; | ||
9665 | 8066 | ||
9666 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { | 8067 | for_each_online_cpu(cpu) { |
9667 | s64 val = percpu_counter_read(&ca->cpustat[i]); | 8068 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); |
9668 | val = cputime64_to_clock_t(val); | 8069 | val += kcpustat->cpustat[CPUTIME_USER]; |
9669 | cb->fill(cb, cpuacct_stat_desc[i], val); | 8070 | val += kcpustat->cpustat[CPUTIME_NICE]; |
9670 | } | 8071 | } |
8072 | val = cputime64_to_clock_t(val); | ||
8073 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
8074 | |||
8075 | val = 0; | ||
8076 | for_each_online_cpu(cpu) { | ||
8077 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8078 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
8079 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
8080 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
8081 | } | ||
8082 | |||
8083 | val = cputime64_to_clock_t(val); | ||
8084 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
8085 | |||
9671 | return 0; | 8086 | return 0; |
9672 | } | 8087 | } |
9673 | 8088 | ||
@@ -9697,7 +8112,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9697 | * | 8112 | * |
9698 | * called with rq->lock held. | 8113 | * called with rq->lock held. |
9699 | */ | 8114 | */ |
9700 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 8115 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
9701 | { | 8116 | { |
9702 | struct cpuacct *ca; | 8117 | struct cpuacct *ca; |
9703 | int cpu; | 8118 | int cpu; |
@@ -9711,7 +8126,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
9711 | 8126 | ||
9712 | ca = task_ca(tsk); | 8127 | ca = task_ca(tsk); |
9713 | 8128 | ||
9714 | for (; ca; ca = ca->parent) { | 8129 | for (; ca; ca = parent_ca(ca)) { |
9715 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 8130 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
9716 | *cpuusage += cputime; | 8131 | *cpuusage += cputime; |
9717 | } | 8132 | } |
@@ -9719,45 +8134,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
9719 | rcu_read_unlock(); | 8134 | rcu_read_unlock(); |
9720 | } | 8135 | } |
9721 | 8136 | ||
9722 | /* | ||
9723 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
9724 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
9725 | * percpu_counter_add with values large enough to always overflow the | ||
9726 | * per cpu batch limit causing bad SMP scalability. | ||
9727 | * | ||
9728 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
9729 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
9730 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
9731 | */ | ||
9732 | #ifdef CONFIG_SMP | ||
9733 | #define CPUACCT_BATCH \ | ||
9734 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
9735 | #else | ||
9736 | #define CPUACCT_BATCH 0 | ||
9737 | #endif | ||
9738 | |||
9739 | /* | ||
9740 | * Charge the system/user time to the task's accounting group. | ||
9741 | */ | ||
9742 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
9743 | enum cpuacct_stat_index idx, cputime_t val) | ||
9744 | { | ||
9745 | struct cpuacct *ca; | ||
9746 | int batch = CPUACCT_BATCH; | ||
9747 | |||
9748 | if (unlikely(!cpuacct_subsys.active)) | ||
9749 | return; | ||
9750 | |||
9751 | rcu_read_lock(); | ||
9752 | ca = task_ca(tsk); | ||
9753 | |||
9754 | do { | ||
9755 | __percpu_counter_add(&ca->cpustat[idx], val, batch); | ||
9756 | ca = ca->parent; | ||
9757 | } while (ca); | ||
9758 | rcu_read_unlock(); | ||
9759 | } | ||
9760 | |||
9761 | struct cgroup_subsys cpuacct_subsys = { | 8137 | struct cgroup_subsys cpuacct_subsys = { |
9762 | .name = "cpuacct", | 8138 | .name = "cpuacct", |
9763 | .create = cpuacct_create, | 8139 | .create = cpuacct_create, |
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c index a86cf9d9eb11..d72586fdf660 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched_cpupri.c | 2 | * kernel/sched/cpupri.c |
3 | * | 3 | * |
4 | * CPU priority management | 4 | * CPU priority management |
5 | * | 5 | * |
@@ -28,7 +28,7 @@ | |||
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/gfp.h> | 30 | #include <linux/gfp.h> |
31 | #include "sched_cpupri.h" | 31 | #include "cpupri.h" |
32 | 32 | ||
33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
34 | static int convert_prio(int prio) | 34 | static int convert_prio(int prio) |
@@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
129 | * cpupri_set - update the cpu priority setting | 129 | * cpupri_set - update the cpu priority setting |
130 | * @cp: The cpupri context | 130 | * @cp: The cpupri context |
131 | * @cpu: The target cpu | 131 | * @cpu: The target cpu |
132 | * @pri: The priority (INVALID-RT99) to assign to this CPU | 132 | * @newpri: The priority (INVALID-RT99) to assign to this CPU |
133 | * | 133 | * |
134 | * Note: Assumes cpu_rq(cpu)->lock is locked | 134 | * Note: Assumes cpu_rq(cpu)->lock is locked |
135 | * | 135 | * |
@@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
200 | /** | 200 | /** |
201 | * cpupri_init - initialize the cpupri structure | 201 | * cpupri_init - initialize the cpupri structure |
202 | * @cp: The cpupri context | 202 | * @cp: The cpupri context |
203 | * @bootmem: true if allocations need to use bootmem | ||
204 | * | 203 | * |
205 | * Returns: -ENOMEM if memory fails. | 204 | * Returns: -ENOMEM if memory fails. |
206 | */ | 205 | */ |
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h index f6d756173491..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched/cpupri.h | |||
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c index a6710a112b4f..2a075e10004b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched/debug.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/time/sched_debug.c | 2 | * kernel/sched/debug.c |
3 | * | 3 | * |
4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree |
5 | * | 5 | * |
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | 18 | ||
19 | #include "sched.h" | ||
20 | |||
19 | static DEFINE_SPINLOCK(sched_debug_lock); | 21 | static DEFINE_SPINLOCK(sched_debug_lock); |
20 | 22 | ||
21 | /* | 23 | /* |
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
373 | return 0; | 375 | return 0; |
374 | } | 376 | } |
375 | 377 | ||
376 | static void sysrq_sched_debug_show(void) | 378 | void sysrq_sched_debug_show(void) |
377 | { | 379 | { |
378 | sched_debug_show(NULL, NULL); | 380 | sched_debug_show(NULL, NULL); |
379 | } | 381 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c index 5c9e67923b7c..aca16b843b7e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched/fair.c | |||
@@ -23,6 +23,13 @@ | |||
23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
26 | #include <linux/slab.h> | ||
27 | #include <linux/profile.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | |||
30 | #include <trace/events/sched.h> | ||
31 | |||
32 | #include "sched.h" | ||
26 | 33 | ||
27 | /* | 34 | /* |
28 | * Targeted preemption latency for CPU-bound tasks: | 35 | * Targeted preemption latency for CPU-bound tasks: |
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
103 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 110 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
104 | #endif | 111 | #endif |
105 | 112 | ||
106 | static const struct sched_class fair_sched_class; | 113 | /* |
114 | * Increase the granularity value when there are more CPUs, | ||
115 | * because with more CPUs the 'effective latency' as visible | ||
116 | * to users decreases. But the relationship is not linear, | ||
117 | * so pick a second-best guess by going with the log2 of the | ||
118 | * number of CPUs. | ||
119 | * | ||
120 | * This idea comes from the SD scheduler of Con Kolivas: | ||
121 | */ | ||
122 | static int get_update_sysctl_factor(void) | ||
123 | { | ||
124 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
125 | unsigned int factor; | ||
126 | |||
127 | switch (sysctl_sched_tunable_scaling) { | ||
128 | case SCHED_TUNABLESCALING_NONE: | ||
129 | factor = 1; | ||
130 | break; | ||
131 | case SCHED_TUNABLESCALING_LINEAR: | ||
132 | factor = cpus; | ||
133 | break; | ||
134 | case SCHED_TUNABLESCALING_LOG: | ||
135 | default: | ||
136 | factor = 1 + ilog2(cpus); | ||
137 | break; | ||
138 | } | ||
139 | |||
140 | return factor; | ||
141 | } | ||
142 | |||
143 | static void update_sysctl(void) | ||
144 | { | ||
145 | unsigned int factor = get_update_sysctl_factor(); | ||
146 | |||
147 | #define SET_SYSCTL(name) \ | ||
148 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
149 | SET_SYSCTL(sched_min_granularity); | ||
150 | SET_SYSCTL(sched_latency); | ||
151 | SET_SYSCTL(sched_wakeup_granularity); | ||
152 | #undef SET_SYSCTL | ||
153 | } | ||
154 | |||
155 | void sched_init_granularity(void) | ||
156 | { | ||
157 | update_sysctl(); | ||
158 | } | ||
159 | |||
160 | #if BITS_PER_LONG == 32 | ||
161 | # define WMULT_CONST (~0UL) | ||
162 | #else | ||
163 | # define WMULT_CONST (1UL << 32) | ||
164 | #endif | ||
165 | |||
166 | #define WMULT_SHIFT 32 | ||
167 | |||
168 | /* | ||
169 | * Shift right and round: | ||
170 | */ | ||
171 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
172 | |||
173 | /* | ||
174 | * delta *= weight / lw | ||
175 | */ | ||
176 | static unsigned long | ||
177 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
178 | struct load_weight *lw) | ||
179 | { | ||
180 | u64 tmp; | ||
181 | |||
182 | /* | ||
183 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
184 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
185 | * 2^SCHED_LOAD_RESOLUTION. | ||
186 | */ | ||
187 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
188 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
189 | else | ||
190 | tmp = (u64)delta_exec; | ||
191 | |||
192 | if (!lw->inv_weight) { | ||
193 | unsigned long w = scale_load_down(lw->weight); | ||
194 | |||
195 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
196 | lw->inv_weight = 1; | ||
197 | else if (unlikely(!w)) | ||
198 | lw->inv_weight = WMULT_CONST; | ||
199 | else | ||
200 | lw->inv_weight = WMULT_CONST / w; | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Check whether we'd overflow the 64-bit multiplication: | ||
205 | */ | ||
206 | if (unlikely(tmp > WMULT_CONST)) | ||
207 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
208 | WMULT_SHIFT/2); | ||
209 | else | ||
210 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
211 | |||
212 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
213 | } | ||
214 | |||
215 | |||
216 | const struct sched_class fair_sched_class; | ||
107 | 217 | ||
108 | /************************************************************** | 218 | /************************************************************** |
109 | * CFS operations on generic schedulable entities: | 219 | * CFS operations on generic schedulable entities: |
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
413 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 523 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
414 | } | 524 | } |
415 | 525 | ||
416 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) | 526 | struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
417 | { | 527 | { |
418 | struct rb_node *left = cfs_rq->rb_leftmost; | 528 | struct rb_node *left = cfs_rq->rb_leftmost; |
419 | 529 | ||
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) | |||
434 | } | 544 | } |
435 | 545 | ||
436 | #ifdef CONFIG_SCHED_DEBUG | 546 | #ifdef CONFIG_SCHED_DEBUG |
437 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 547 | struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
438 | { | 548 | { |
439 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 549 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
440 | 550 | ||
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
684 | { | 794 | { |
685 | update_load_add(&cfs_rq->load, se->load.weight); | 795 | update_load_add(&cfs_rq->load, se->load.weight); |
686 | if (!parent_entity(se)) | 796 | if (!parent_entity(se)) |
687 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | 797 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
688 | if (entity_is_task(se)) { | 798 | if (entity_is_task(se)) { |
689 | add_cfs_task_weight(cfs_rq, se->load.weight); | 799 | add_cfs_task_weight(cfs_rq, se->load.weight); |
690 | list_add(&se->group_node, &cfs_rq->tasks); | 800 | list_add(&se->group_node, &cfs_rq->tasks); |
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
697 | { | 807 | { |
698 | update_load_sub(&cfs_rq->load, se->load.weight); | 808 | update_load_sub(&cfs_rq->load, se->load.weight); |
699 | if (!parent_entity(se)) | 809 | if (!parent_entity(se)) |
700 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | 810 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
701 | if (entity_is_task(se)) { | 811 | if (entity_is_task(se)) { |
702 | add_cfs_task_weight(cfs_rq, -se->load.weight); | 812 | add_cfs_task_weight(cfs_rq, -se->load.weight); |
703 | list_del_init(&se->group_node); | 813 | list_del_init(&se->group_node); |
@@ -772,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
772 | list_del_leaf_cfs_rq(cfs_rq); | 882 | list_del_leaf_cfs_rq(cfs_rq); |
773 | } | 883 | } |
774 | 884 | ||
885 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | ||
886 | { | ||
887 | long tg_weight; | ||
888 | |||
889 | /* | ||
890 | * Use this CPU's actual weight instead of the last load_contribution | ||
891 | * to gain a more accurate current total weight. See | ||
892 | * update_cfs_rq_load_contribution(). | ||
893 | */ | ||
894 | tg_weight = atomic_read(&tg->load_weight); | ||
895 | tg_weight -= cfs_rq->load_contribution; | ||
896 | tg_weight += cfs_rq->load.weight; | ||
897 | |||
898 | return tg_weight; | ||
899 | } | ||
900 | |||
775 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 901 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
776 | { | 902 | { |
777 | long load_weight, load, shares; | 903 | long tg_weight, load, shares; |
778 | 904 | ||
905 | tg_weight = calc_tg_weight(tg, cfs_rq); | ||
779 | load = cfs_rq->load.weight; | 906 | load = cfs_rq->load.weight; |
780 | 907 | ||
781 | load_weight = atomic_read(&tg->load_weight); | ||
782 | load_weight += load; | ||
783 | load_weight -= cfs_rq->load_contribution; | ||
784 | |||
785 | shares = (tg->shares * load); | 908 | shares = (tg->shares * load); |
786 | if (load_weight) | 909 | if (tg_weight) |
787 | shares /= load_weight; | 910 | shares /= tg_weight; |
788 | 911 | ||
789 | if (shares < MIN_SHARES) | 912 | if (shares < MIN_SHARES) |
790 | shares = MIN_SHARES; | 913 | shares = MIN_SHARES; |
@@ -907,6 +1030,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
907 | trace_sched_stat_iowait(tsk, delta); | 1030 | trace_sched_stat_iowait(tsk, delta); |
908 | } | 1031 | } |
909 | 1032 | ||
1033 | trace_sched_stat_blocked(tsk, delta); | ||
1034 | |||
910 | /* | 1035 | /* |
911 | * Blocking time is in units of nanosecs, so shift by | 1036 | * Blocking time is in units of nanosecs, so shift by |
912 | * 20 to get a milliseconds-range estimation of the | 1037 | * 20 to get a milliseconds-range estimation of the |
@@ -1274,6 +1399,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1274 | */ | 1399 | */ |
1275 | 1400 | ||
1276 | #ifdef CONFIG_CFS_BANDWIDTH | 1401 | #ifdef CONFIG_CFS_BANDWIDTH |
1402 | |||
1403 | #ifdef HAVE_JUMP_LABEL | ||
1404 | static struct jump_label_key __cfs_bandwidth_used; | ||
1405 | |||
1406 | static inline bool cfs_bandwidth_used(void) | ||
1407 | { | ||
1408 | return static_branch(&__cfs_bandwidth_used); | ||
1409 | } | ||
1410 | |||
1411 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | ||
1412 | { | ||
1413 | /* only need to count groups transitioning between enabled/!enabled */ | ||
1414 | if (enabled && !was_enabled) | ||
1415 | jump_label_inc(&__cfs_bandwidth_used); | ||
1416 | else if (!enabled && was_enabled) | ||
1417 | jump_label_dec(&__cfs_bandwidth_used); | ||
1418 | } | ||
1419 | #else /* HAVE_JUMP_LABEL */ | ||
1420 | static bool cfs_bandwidth_used(void) | ||
1421 | { | ||
1422 | return true; | ||
1423 | } | ||
1424 | |||
1425 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | ||
1426 | #endif /* HAVE_JUMP_LABEL */ | ||
1427 | |||
1277 | /* | 1428 | /* |
1278 | * default period for cfs group bandwidth. | 1429 | * default period for cfs group bandwidth. |
1279 | * default: 0.1s, units: nanoseconds | 1430 | * default: 0.1s, units: nanoseconds |
@@ -1295,7 +1446,7 @@ static inline u64 sched_cfs_bandwidth_slice(void) | |||
1295 | * | 1446 | * |
1296 | * requires cfs_b->lock | 1447 | * requires cfs_b->lock |
1297 | */ | 1448 | */ |
1298 | static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | 1449 | void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) |
1299 | { | 1450 | { |
1300 | u64 now; | 1451 | u64 now; |
1301 | 1452 | ||
@@ -1307,6 +1458,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | |||
1307 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | 1458 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); |
1308 | } | 1459 | } |
1309 | 1460 | ||
1461 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
1462 | { | ||
1463 | return &tg->cfs_bandwidth; | ||
1464 | } | ||
1465 | |||
1310 | /* returns 0 on failure to allocate runtime */ | 1466 | /* returns 0 on failure to allocate runtime */ |
1311 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1467 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1312 | { | 1468 | { |
@@ -1408,7 +1564,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
1408 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 1564 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
1409 | unsigned long delta_exec) | 1565 | unsigned long delta_exec) |
1410 | { | 1566 | { |
1411 | if (!cfs_rq->runtime_enabled) | 1567 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) |
1412 | return; | 1568 | return; |
1413 | 1569 | ||
1414 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | 1570 | __account_cfs_rq_runtime(cfs_rq, delta_exec); |
@@ -1416,13 +1572,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
1416 | 1572 | ||
1417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 1573 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) |
1418 | { | 1574 | { |
1419 | return cfs_rq->throttled; | 1575 | return cfs_bandwidth_used() && cfs_rq->throttled; |
1420 | } | 1576 | } |
1421 | 1577 | ||
1422 | /* check whether cfs_rq, or any parent, is throttled */ | 1578 | /* check whether cfs_rq, or any parent, is throttled */ |
1423 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | 1579 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) |
1424 | { | 1580 | { |
1425 | return cfs_rq->throttle_count; | 1581 | return cfs_bandwidth_used() && cfs_rq->throttle_count; |
1426 | } | 1582 | } |
1427 | 1583 | ||
1428 | /* | 1584 | /* |
@@ -1517,7 +1673,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
1517 | raw_spin_unlock(&cfs_b->lock); | 1673 | raw_spin_unlock(&cfs_b->lock); |
1518 | } | 1674 | } |
1519 | 1675 | ||
1520 | static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | 1676 | void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) |
1521 | { | 1677 | { |
1522 | struct rq *rq = rq_of(cfs_rq); | 1678 | struct rq *rq = rq_of(cfs_rq); |
1523 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 1679 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); |
@@ -1743,7 +1899,10 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
1743 | 1899 | ||
1744 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1900 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1745 | { | 1901 | { |
1746 | if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) | 1902 | if (!cfs_bandwidth_used()) |
1903 | return; | ||
1904 | |||
1905 | if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) | ||
1747 | return; | 1906 | return; |
1748 | 1907 | ||
1749 | __return_cfs_rq_runtime(cfs_rq); | 1908 | __return_cfs_rq_runtime(cfs_rq); |
@@ -1788,6 +1947,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
1788 | */ | 1947 | */ |
1789 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | 1948 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) |
1790 | { | 1949 | { |
1950 | if (!cfs_bandwidth_used()) | ||
1951 | return; | ||
1952 | |||
1791 | /* an active group must be handled by the update_curr()->put() path */ | 1953 | /* an active group must be handled by the update_curr()->put() path */ |
1792 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | 1954 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) |
1793 | return; | 1955 | return; |
@@ -1805,6 +1967,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
1805 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 1967 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
1806 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1968 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
1807 | { | 1969 | { |
1970 | if (!cfs_bandwidth_used()) | ||
1971 | return; | ||
1972 | |||
1808 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | 1973 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) |
1809 | return; | 1974 | return; |
1810 | 1975 | ||
@@ -1817,7 +1982,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
1817 | 1982 | ||
1818 | throttle_cfs_rq(cfs_rq); | 1983 | throttle_cfs_rq(cfs_rq); |
1819 | } | 1984 | } |
1820 | #else | 1985 | |
1986 | static inline u64 default_cfs_period(void); | ||
1987 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
1988 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
1989 | |||
1990 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
1991 | { | ||
1992 | struct cfs_bandwidth *cfs_b = | ||
1993 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
1994 | do_sched_cfs_slack_timer(cfs_b); | ||
1995 | |||
1996 | return HRTIMER_NORESTART; | ||
1997 | } | ||
1998 | |||
1999 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
2000 | { | ||
2001 | struct cfs_bandwidth *cfs_b = | ||
2002 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
2003 | ktime_t now; | ||
2004 | int overrun; | ||
2005 | int idle = 0; | ||
2006 | |||
2007 | for (;;) { | ||
2008 | now = hrtimer_cb_get_time(timer); | ||
2009 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
2010 | |||
2011 | if (!overrun) | ||
2012 | break; | ||
2013 | |||
2014 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
2015 | } | ||
2016 | |||
2017 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
2018 | } | ||
2019 | |||
2020 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2021 | { | ||
2022 | raw_spin_lock_init(&cfs_b->lock); | ||
2023 | cfs_b->runtime = 0; | ||
2024 | cfs_b->quota = RUNTIME_INF; | ||
2025 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
2026 | |||
2027 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
2028 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2029 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
2030 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2031 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
2032 | } | ||
2033 | |||
2034 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
2035 | { | ||
2036 | cfs_rq->runtime_enabled = 0; | ||
2037 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
2038 | } | ||
2039 | |||
2040 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
2041 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2042 | { | ||
2043 | /* | ||
2044 | * The timer may be active because we're trying to set a new bandwidth | ||
2045 | * period or because we're racing with the tear-down path | ||
2046 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
2047 | * terminates). In either case we ensure that it's re-programmed | ||
2048 | */ | ||
2049 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
2050 | raw_spin_unlock(&cfs_b->lock); | ||
2051 | /* ensure cfs_b->lock is available while we wait */ | ||
2052 | hrtimer_cancel(&cfs_b->period_timer); | ||
2053 | |||
2054 | raw_spin_lock(&cfs_b->lock); | ||
2055 | /* if someone else restarted the timer then we're done */ | ||
2056 | if (cfs_b->timer_active) | ||
2057 | return; | ||
2058 | } | ||
2059 | |||
2060 | cfs_b->timer_active = 1; | ||
2061 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
2062 | } | ||
2063 | |||
2064 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2065 | { | ||
2066 | hrtimer_cancel(&cfs_b->period_timer); | ||
2067 | hrtimer_cancel(&cfs_b->slack_timer); | ||
2068 | } | ||
2069 | |||
2070 | void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
2071 | { | ||
2072 | struct cfs_rq *cfs_rq; | ||
2073 | |||
2074 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
2075 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
2076 | |||
2077 | if (!cfs_rq->runtime_enabled) | ||
2078 | continue; | ||
2079 | |||
2080 | /* | ||
2081 | * clock_task is not advancing so we just need to make sure | ||
2082 | * there's some valid quota amount | ||
2083 | */ | ||
2084 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
2085 | if (cfs_rq_throttled(cfs_rq)) | ||
2086 | unthrottle_cfs_rq(cfs_rq); | ||
2087 | } | ||
2088 | } | ||
2089 | |||
2090 | #else /* CONFIG_CFS_BANDWIDTH */ | ||
1821 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 2091 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
1822 | unsigned long delta_exec) {} | 2092 | unsigned long delta_exec) {} |
1823 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2093 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
@@ -1839,8 +2109,22 @@ static inline int throttled_lb_pair(struct task_group *tg, | |||
1839 | { | 2109 | { |
1840 | return 0; | 2110 | return 0; |
1841 | } | 2111 | } |
2112 | |||
2113 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
2114 | |||
2115 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
2116 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1842 | #endif | 2117 | #endif |
1843 | 2118 | ||
2119 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
2120 | { | ||
2121 | return NULL; | ||
2122 | } | ||
2123 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
2124 | void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
2125 | |||
2126 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
2127 | |||
1844 | /************************************************** | 2128 | /************************************************** |
1845 | * CFS operations on tasks: | 2129 | * CFS operations on tasks: |
1846 | */ | 2130 | */ |
@@ -1853,7 +2137,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
1853 | 2137 | ||
1854 | WARN_ON(task_rq(p) != rq); | 2138 | WARN_ON(task_rq(p) != rq); |
1855 | 2139 | ||
1856 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | 2140 | if (cfs_rq->nr_running > 1) { |
1857 | u64 slice = sched_slice(cfs_rq, se); | 2141 | u64 slice = sched_slice(cfs_rq, se); |
1858 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 2142 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
1859 | s64 delta = slice - ran; | 2143 | s64 delta = slice - ran; |
@@ -1884,7 +2168,7 @@ static void hrtick_update(struct rq *rq) | |||
1884 | { | 2168 | { |
1885 | struct task_struct *curr = rq->curr; | 2169 | struct task_struct *curr = rq->curr; |
1886 | 2170 | ||
1887 | if (curr->sched_class != &fair_sched_class) | 2171 | if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) |
1888 | return; | 2172 | return; |
1889 | 2173 | ||
1890 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) | 2174 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) |
@@ -2007,6 +2291,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2007 | } | 2291 | } |
2008 | 2292 | ||
2009 | #ifdef CONFIG_SMP | 2293 | #ifdef CONFIG_SMP |
2294 | /* Used instead of source_load when we know the type == 0 */ | ||
2295 | static unsigned long weighted_cpuload(const int cpu) | ||
2296 | { | ||
2297 | return cpu_rq(cpu)->load.weight; | ||
2298 | } | ||
2299 | |||
2300 | /* | ||
2301 | * Return a low guess at the load of a migration-source cpu weighted | ||
2302 | * according to the scheduling class and "nice" value. | ||
2303 | * | ||
2304 | * We want to under-estimate the load of migration sources, to | ||
2305 | * balance conservatively. | ||
2306 | */ | ||
2307 | static unsigned long source_load(int cpu, int type) | ||
2308 | { | ||
2309 | struct rq *rq = cpu_rq(cpu); | ||
2310 | unsigned long total = weighted_cpuload(cpu); | ||
2311 | |||
2312 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2313 | return total; | ||
2314 | |||
2315 | return min(rq->cpu_load[type-1], total); | ||
2316 | } | ||
2317 | |||
2318 | /* | ||
2319 | * Return a high guess at the load of a migration-target cpu weighted | ||
2320 | * according to the scheduling class and "nice" value. | ||
2321 | */ | ||
2322 | static unsigned long target_load(int cpu, int type) | ||
2323 | { | ||
2324 | struct rq *rq = cpu_rq(cpu); | ||
2325 | unsigned long total = weighted_cpuload(cpu); | ||
2326 | |||
2327 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2328 | return total; | ||
2329 | |||
2330 | return max(rq->cpu_load[type-1], total); | ||
2331 | } | ||
2332 | |||
2333 | static unsigned long power_of(int cpu) | ||
2334 | { | ||
2335 | return cpu_rq(cpu)->cpu_power; | ||
2336 | } | ||
2337 | |||
2338 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
2339 | { | ||
2340 | struct rq *rq = cpu_rq(cpu); | ||
2341 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
2342 | |||
2343 | if (nr_running) | ||
2344 | return rq->load.weight / nr_running; | ||
2345 | |||
2346 | return 0; | ||
2347 | } | ||
2348 | |||
2010 | 2349 | ||
2011 | static void task_waking_fair(struct task_struct *p) | 2350 | static void task_waking_fair(struct task_struct *p) |
2012 | { | 2351 | { |
@@ -2036,36 +2375,100 @@ static void task_waking_fair(struct task_struct *p) | |||
2036 | * Adding load to a group doesn't make a group heavier, but can cause movement | 2375 | * Adding load to a group doesn't make a group heavier, but can cause movement |
2037 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 2376 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
2038 | * can calculate the shift in shares. | 2377 | * can calculate the shift in shares. |
2378 | * | ||
2379 | * Calculate the effective load difference if @wl is added (subtracted) to @tg | ||
2380 | * on this @cpu and results in a total addition (subtraction) of @wg to the | ||
2381 | * total group weight. | ||
2382 | * | ||
2383 | * Given a runqueue weight distribution (rw_i) we can compute a shares | ||
2384 | * distribution (s_i) using: | ||
2385 | * | ||
2386 | * s_i = rw_i / \Sum rw_j (1) | ||
2387 | * | ||
2388 | * Suppose we have 4 CPUs and our @tg is a direct child of the root group and | ||
2389 | * has 7 equal weight tasks, distributed as below (rw_i), with the resulting | ||
2390 | * shares distribution (s_i): | ||
2391 | * | ||
2392 | * rw_i = { 2, 4, 1, 0 } | ||
2393 | * s_i = { 2/7, 4/7, 1/7, 0 } | ||
2394 | * | ||
2395 | * As per wake_affine() we're interested in the load of two CPUs (the CPU the | ||
2396 | * task used to run on and the CPU the waker is running on), we need to | ||
2397 | * compute the effect of waking a task on either CPU and, in case of a sync | ||
2398 | * wakeup, compute the effect of the current task going to sleep. | ||
2399 | * | ||
2400 | * So for a change of @wl to the local @cpu with an overall group weight change | ||
2401 | * of @wg we can compute the new shares distribution (s'_i) using: | ||
2402 | * | ||
2403 | * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) | ||
2404 | * | ||
2405 | * Suppose we're interested in CPUs 0 and 1, and want to compute the load | ||
2406 | * differences in waking a task to CPU 0. The additional task changes the | ||
2407 | * weight and shares distributions like: | ||
2408 | * | ||
2409 | * rw'_i = { 3, 4, 1, 0 } | ||
2410 | * s'_i = { 3/8, 4/8, 1/8, 0 } | ||
2411 | * | ||
2412 | * We can then compute the difference in effective weight by using: | ||
2413 | * | ||
2414 | * dw_i = S * (s'_i - s_i) (3) | ||
2415 | * | ||
2416 | * Where 'S' is the group weight as seen by its parent. | ||
2417 | * | ||
2418 | * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) | ||
2419 | * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - | ||
2420 | * 4/7) times the weight of the group. | ||
2039 | */ | 2421 | */ |
2040 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | 2422 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
2041 | { | 2423 | { |
2042 | struct sched_entity *se = tg->se[cpu]; | 2424 | struct sched_entity *se = tg->se[cpu]; |
2043 | 2425 | ||
2044 | if (!tg->parent) | 2426 | if (!tg->parent) /* the trivial, non-cgroup case */ |
2045 | return wl; | 2427 | return wl; |
2046 | 2428 | ||
2047 | for_each_sched_entity(se) { | 2429 | for_each_sched_entity(se) { |
2048 | long lw, w; | 2430 | long w, W; |
2049 | 2431 | ||
2050 | tg = se->my_q->tg; | 2432 | tg = se->my_q->tg; |
2051 | w = se->my_q->load.weight; | ||
2052 | 2433 | ||
2053 | /* use this cpu's instantaneous contribution */ | 2434 | /* |
2054 | lw = atomic_read(&tg->load_weight); | 2435 | * W = @wg + \Sum rw_j |
2055 | lw -= se->my_q->load_contribution; | 2436 | */ |
2056 | lw += w + wg; | 2437 | W = wg + calc_tg_weight(tg, se->my_q); |
2057 | 2438 | ||
2058 | wl += w; | 2439 | /* |
2440 | * w = rw_i + @wl | ||
2441 | */ | ||
2442 | w = se->my_q->load.weight + wl; | ||
2059 | 2443 | ||
2060 | if (lw > 0 && wl < lw) | 2444 | /* |
2061 | wl = (wl * tg->shares) / lw; | 2445 | * wl = S * s'_i; see (2) |
2446 | */ | ||
2447 | if (W > 0 && w < W) | ||
2448 | wl = (w * tg->shares) / W; | ||
2062 | else | 2449 | else |
2063 | wl = tg->shares; | 2450 | wl = tg->shares; |
2064 | 2451 | ||
2065 | /* zero point is MIN_SHARES */ | 2452 | /* |
2453 | * Per the above, wl is the new se->load.weight value; since | ||
2454 | * those are clipped to [MIN_SHARES, ...) do so now. See | ||
2455 | * calc_cfs_shares(). | ||
2456 | */ | ||
2066 | if (wl < MIN_SHARES) | 2457 | if (wl < MIN_SHARES) |
2067 | wl = MIN_SHARES; | 2458 | wl = MIN_SHARES; |
2459 | |||
2460 | /* | ||
2461 | * wl = dw_i = S * (s'_i - s_i); see (3) | ||
2462 | */ | ||
2068 | wl -= se->load.weight; | 2463 | wl -= se->load.weight; |
2464 | |||
2465 | /* | ||
2466 | * Recursively apply this logic to all parent groups to compute | ||
2467 | * the final effective load change on the root group. Since | ||
2468 | * only the @tg group gets extra weight, all parent groups can | ||
2469 | * only redistribute existing shares. @wl is the shift in shares | ||
2470 | * resulting from this level per the above. | ||
2471 | */ | ||
2069 | wg = 0; | 2472 | wg = 0; |
2070 | } | 2473 | } |
2071 | 2474 | ||
@@ -2249,6 +2652,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2249 | int cpu = smp_processor_id(); | 2652 | int cpu = smp_processor_id(); |
2250 | int prev_cpu = task_cpu(p); | 2653 | int prev_cpu = task_cpu(p); |
2251 | struct sched_domain *sd; | 2654 | struct sched_domain *sd; |
2655 | struct sched_group *sg; | ||
2252 | int i; | 2656 | int i; |
2253 | 2657 | ||
2254 | /* | 2658 | /* |
@@ -2269,25 +2673,28 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2269 | * Otherwise, iterate the domains and find an eligible idle cpu. | 2673 | * Otherwise, iterate the domains and find an eligible idle cpu. |
2270 | */ | 2674 | */ |
2271 | rcu_read_lock(); | 2675 | rcu_read_lock(); |
2272 | for_each_domain(target, sd) { | ||
2273 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | ||
2274 | break; | ||
2275 | 2676 | ||
2276 | for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { | 2677 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
2277 | if (idle_cpu(i)) { | 2678 | for_each_lower_domain(sd) { |
2278 | target = i; | 2679 | sg = sd->groups; |
2279 | break; | 2680 | do { |
2681 | if (!cpumask_intersects(sched_group_cpus(sg), | ||
2682 | tsk_cpus_allowed(p))) | ||
2683 | goto next; | ||
2684 | |||
2685 | for_each_cpu(i, sched_group_cpus(sg)) { | ||
2686 | if (!idle_cpu(i)) | ||
2687 | goto next; | ||
2280 | } | 2688 | } |
2281 | } | ||
2282 | 2689 | ||
2283 | /* | 2690 | target = cpumask_first_and(sched_group_cpus(sg), |
2284 | * Lets stop looking for an idle sibling when we reached | 2691 | tsk_cpus_allowed(p)); |
2285 | * the domain that spans the current cpu and prev_cpu. | 2692 | goto done; |
2286 | */ | 2693 | next: |
2287 | if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && | 2694 | sg = sg->next; |
2288 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 2695 | } while (sg != sd->groups); |
2289 | break; | ||
2290 | } | 2696 | } |
2697 | done: | ||
2291 | rcu_read_unlock(); | 2698 | rcu_read_unlock(); |
2292 | 2699 | ||
2293 | return target; | 2700 | return target; |
@@ -2315,6 +2722,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2315 | int want_sd = 1; | 2722 | int want_sd = 1; |
2316 | int sync = wake_flags & WF_SYNC; | 2723 | int sync = wake_flags & WF_SYNC; |
2317 | 2724 | ||
2725 | if (p->rt.nr_cpus_allowed == 1) | ||
2726 | return prev_cpu; | ||
2727 | |||
2318 | if (sd_flag & SD_BALANCE_WAKE) { | 2728 | if (sd_flag & SD_BALANCE_WAKE) { |
2319 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | 2729 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
2320 | want_affine = 1; | 2730 | want_affine = 1; |
@@ -2599,7 +3009,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
2599 | } while (cfs_rq); | 3009 | } while (cfs_rq); |
2600 | 3010 | ||
2601 | p = task_of(se); | 3011 | p = task_of(se); |
2602 | hrtick_start_fair(rq, p); | 3012 | if (hrtick_enabled(rq)) |
3013 | hrtick_start_fair(rq, p); | ||
2603 | 3014 | ||
2604 | return p; | 3015 | return p; |
2605 | } | 3016 | } |
@@ -2643,6 +3054,12 @@ static void yield_task_fair(struct rq *rq) | |||
2643 | * Update run-time statistics of the 'current'. | 3054 | * Update run-time statistics of the 'current'. |
2644 | */ | 3055 | */ |
2645 | update_curr(cfs_rq); | 3056 | update_curr(cfs_rq); |
3057 | /* | ||
3058 | * Tell update_rq_clock() that we've just updated, | ||
3059 | * so we don't do microscopic update in schedule() | ||
3060 | * and double the fastpath cost. | ||
3061 | */ | ||
3062 | rq->skip_clock_update = 1; | ||
2646 | } | 3063 | } |
2647 | 3064 | ||
2648 | set_skip_buddy(se); | 3065 | set_skip_buddy(se); |
@@ -2683,12 +3100,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
2683 | } | 3100 | } |
2684 | 3101 | ||
2685 | /* | 3102 | /* |
3103 | * Is this task likely cache-hot: returns 1 when @p should be treated as cache-hot at time @now, 0 otherwise. Checks run in order: non-fair-class and SCHED_IDLE tasks are never hot; buddy candidates are hot; sysctl_sched_migration_cost of -1 means always hot and 0 means never hot; otherwise @p is hot when (now - p->se.exec_start) is below sysctl_sched_migration_cost. @sd is not referenced in this body. | ||
3104 | */ | ||
3105 | static int | ||
3106 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
3107 | { | ||
3108 | s64 delta; | ||
3109 | |||
3110 | if (p->sched_class != &fair_sched_class) | ||
3111 | return 0; | ||
3112 | |||
3113 | if (unlikely(p->policy == SCHED_IDLE)) | ||
3114 | return 0; | ||
3115 | |||
3116 | /* | ||
3117 | * Buddy candidates are cache hot: only checked when the CACHE_HOT_BUDDY feature is on and this_rq() has nr_running != 0; a candidate is @p's entity being its cfs_rq's ->next or ->last. | ||
3118 | */ | ||
3119 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
3120 | (&p->se == cfs_rq_of(&p->se)->next || | ||
3121 | &p->se == cfs_rq_of(&p->se)->last)) | ||
3122 | return 1; | ||
3123 | |||
3124 | if (sysctl_sched_migration_cost == -1) | ||
3125 | return 1; | ||
3126 | if (sysctl_sched_migration_cost == 0) | ||
3127 | return 0; | ||
3128 | |||
3129 | delta = now - p->se.exec_start; | ||
3130 | |||
3131 | return delta < (s64)sysctl_sched_migration_cost; /* signed compare: delta is s64 and the sysctl is cast to match */ | ||
3132 | } | ||
3133 | |||
3134 | #define LBF_ALL_PINNED 0x01 /* set by load_balance() before move_tasks(); cleared in can_migrate_task() */ | ||
3135 | #define LBF_NEED_BREAK 0x02 /* set when balance_tasks() stops after more than sysctl_sched_nr_migrate iterations; clears into HAD_BREAK */ | ||
3136 | #define LBF_HAD_BREAK 0x04 /* load_balance() adds (LBF_HAD_BREAK - LBF_NEED_BREAK) for each break it handles */ | ||
3137 | #define LBF_HAD_BREAKS 0x0C /* two-bit field (0x04|0x08): count HAD_BREAKs overflows into ABORT */ | ||
3138 | #define LBF_ABORT 0x10 /* when set, load_balance() jumps to out_balanced instead of retrying */ | ||
3139 | |||
3140 | /* | ||
2686 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 3141 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
2687 | */ | 3142 | */ |
2688 | static | 3143 | static |
2689 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 3144 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
2690 | struct sched_domain *sd, enum cpu_idle_type idle, | 3145 | struct sched_domain *sd, enum cpu_idle_type idle, |
2691 | int *all_pinned) | 3146 | int *lb_flags) |
2692 | { | 3147 | { |
2693 | int tsk_cache_hot = 0; | 3148 | int tsk_cache_hot = 0; |
2694 | /* | 3149 | /* |
@@ -2701,7 +3156,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2701 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3156 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
2702 | return 0; | 3157 | return 0; |
2703 | } | 3158 | } |
2704 | *all_pinned = 0; | 3159 | *lb_flags &= ~LBF_ALL_PINNED; |
2705 | 3160 | ||
2706 | if (task_running(rq, p)) { | 3161 | if (task_running(rq, p)) { |
2707 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 3162 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); |
@@ -2775,7 +3230,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2775 | static unsigned long | 3230 | static unsigned long |
2776 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3231 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2777 | unsigned long max_load_move, struct sched_domain *sd, | 3232 | unsigned long max_load_move, struct sched_domain *sd, |
2778 | enum cpu_idle_type idle, int *all_pinned, | 3233 | enum cpu_idle_type idle, int *lb_flags, |
2779 | struct cfs_rq *busiest_cfs_rq) | 3234 | struct cfs_rq *busiest_cfs_rq) |
2780 | { | 3235 | { |
2781 | int loops = 0, pulled = 0; | 3236 | int loops = 0, pulled = 0; |
@@ -2786,12 +3241,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2786 | goto out; | 3241 | goto out; |
2787 | 3242 | ||
2788 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 3243 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
2789 | if (loops++ > sysctl_sched_nr_migrate) | 3244 | if (loops++ > sysctl_sched_nr_migrate) { |
3245 | *lb_flags |= LBF_NEED_BREAK; | ||
2790 | break; | 3246 | break; |
3247 | } | ||
2791 | 3248 | ||
2792 | if ((p->se.load.weight >> 1) > rem_load_move || | 3249 | if ((p->se.load.weight >> 1) > rem_load_move || |
2793 | !can_migrate_task(p, busiest, this_cpu, sd, idle, | 3250 | !can_migrate_task(p, busiest, this_cpu, sd, idle, |
2794 | all_pinned)) | 3251 | lb_flags)) |
2795 | continue; | 3252 | continue; |
2796 | 3253 | ||
2797 | pull_task(busiest, p, this_rq, this_cpu); | 3254 | pull_task(busiest, p, this_rq, this_cpu); |
@@ -2804,8 +3261,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2804 | * kernels will stop after the first task is pulled to minimize | 3261 | * kernels will stop after the first task is pulled to minimize |
2805 | * the critical section. | 3262 | * the critical section. |
2806 | */ | 3263 | */ |
2807 | if (idle == CPU_NEWLY_IDLE) | 3264 | if (idle == CPU_NEWLY_IDLE) { |
3265 | *lb_flags |= LBF_ABORT; | ||
2808 | break; | 3266 | break; |
3267 | } | ||
2809 | #endif | 3268 | #endif |
2810 | 3269 | ||
2811 | /* | 3270 | /* |
@@ -2910,7 +3369,7 @@ static unsigned long | |||
2910 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3369 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2911 | unsigned long max_load_move, | 3370 | unsigned long max_load_move, |
2912 | struct sched_domain *sd, enum cpu_idle_type idle, | 3371 | struct sched_domain *sd, enum cpu_idle_type idle, |
2913 | int *all_pinned) | 3372 | int *lb_flags) |
2914 | { | 3373 | { |
2915 | long rem_load_move = max_load_move; | 3374 | long rem_load_move = max_load_move; |
2916 | struct cfs_rq *busiest_cfs_rq; | 3375 | struct cfs_rq *busiest_cfs_rq; |
@@ -2923,6 +3382,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2923 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | 3382 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; |
2924 | u64 rem_load, moved_load; | 3383 | u64 rem_load, moved_load; |
2925 | 3384 | ||
3385 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3386 | break; | ||
3387 | |||
2926 | /* | 3388 | /* |
2927 | * empty group or part of a throttled hierarchy | 3389 | * empty group or part of a throttled hierarchy |
2928 | */ | 3390 | */ |
@@ -2934,7 +3396,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2934 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 3396 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
2935 | 3397 | ||
2936 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 3398 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
2937 | rem_load, sd, idle, all_pinned, | 3399 | rem_load, sd, idle, lb_flags, |
2938 | busiest_cfs_rq); | 3400 | busiest_cfs_rq); |
2939 | 3401 | ||
2940 | if (!moved_load) | 3402 | if (!moved_load) |
@@ -2960,10 +3422,10 @@ static unsigned long | |||
2960 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3422 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2961 | unsigned long max_load_move, | 3423 | unsigned long max_load_move, |
2962 | struct sched_domain *sd, enum cpu_idle_type idle, | 3424 | struct sched_domain *sd, enum cpu_idle_type idle, |
2963 | int *all_pinned) | 3425 | int *lb_flags) |
2964 | { | 3426 | { |
2965 | return balance_tasks(this_rq, this_cpu, busiest, | 3427 | return balance_tasks(this_rq, this_cpu, busiest, |
2966 | max_load_move, sd, idle, all_pinned, | 3428 | max_load_move, sd, idle, lb_flags, |
2967 | &busiest->cfs); | 3429 | &busiest->cfs); |
2968 | } | 3430 | } |
2969 | #endif | 3431 | #endif |
@@ -2978,29 +3440,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2978 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3440 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2979 | unsigned long max_load_move, | 3441 | unsigned long max_load_move, |
2980 | struct sched_domain *sd, enum cpu_idle_type idle, | 3442 | struct sched_domain *sd, enum cpu_idle_type idle, |
2981 | int *all_pinned) | 3443 | int *lb_flags) |
2982 | { | 3444 | { |
2983 | unsigned long total_load_moved = 0, load_moved; | 3445 | unsigned long total_load_moved = 0, load_moved; |
2984 | 3446 | ||
2985 | do { | 3447 | do { |
2986 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 3448 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
2987 | max_load_move - total_load_moved, | 3449 | max_load_move - total_load_moved, |
2988 | sd, idle, all_pinned); | 3450 | sd, idle, lb_flags); |
2989 | 3451 | ||
2990 | total_load_moved += load_moved; | 3452 | total_load_moved += load_moved; |
2991 | 3453 | ||
3454 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3455 | break; | ||
3456 | |||
2992 | #ifdef CONFIG_PREEMPT | 3457 | #ifdef CONFIG_PREEMPT |
2993 | /* | 3458 | /* |
2994 | * NEWIDLE balancing is a source of latency, so preemptible | 3459 | * NEWIDLE balancing is a source of latency, so preemptible |
2995 | * kernels will stop after the first task is pulled to minimize | 3460 | * kernels will stop after the first task is pulled to minimize |
2996 | * the critical section. | 3461 | * the critical section. |
2997 | */ | 3462 | */ |
2998 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | 3463 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { |
2999 | break; | 3464 | *lb_flags |= LBF_ABORT; |
3000 | |||
3001 | if (raw_spin_is_contended(&this_rq->lock) || | ||
3002 | raw_spin_is_contended(&busiest->lock)) | ||
3003 | break; | 3465 | break; |
3466 | } | ||
3004 | #endif | 3467 | #endif |
3005 | } while (load_moved && max_load_move > total_load_moved); | 3468 | } while (load_moved && max_load_move > total_load_moved); |
3006 | 3469 | ||
@@ -3062,15 +3525,6 @@ struct sg_lb_stats { | |||
3062 | }; | 3525 | }; |
3063 | 3526 | ||
3064 | /** | 3527 | /** |
3065 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
3066 | * @group: The group whose first cpu is to be returned. | ||
3067 | */ | ||
3068 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
3069 | { | ||
3070 | return cpumask_first(sched_group_cpus(group)); | ||
3071 | } | ||
3072 | |||
3073 | /** | ||
3074 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 3528 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
3075 | * @sd: The sched_domain whose load_idx is to be obtained. | 3529 | * @sd: The sched_domain whose load_idx is to be obtained. |
3076 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. | 3530 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. |
@@ -3319,7 +3773,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
3319 | sdg->sgp->power = power; | 3773 | sdg->sgp->power = power; |
3320 | } | 3774 | } |
3321 | 3775 | ||
3322 | static void update_group_power(struct sched_domain *sd, int cpu) | 3776 | void update_group_power(struct sched_domain *sd, int cpu) |
3323 | { | 3777 | { |
3324 | struct sched_domain *child = sd->child; | 3778 | struct sched_domain *child = sd->child; |
3325 | struct sched_group *group, *sdg = sd->groups; | 3779 | struct sched_group *group, *sdg = sd->groups; |
@@ -3511,7 +3965,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3511 | } | 3965 | } |
3512 | 3966 | ||
3513 | /** | 3967 | /** |
3514 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | 3968 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
3515 | * @sd: sched_domain whose statistics are to be updated. | 3969 | * @sd: sched_domain whose statistics are to be updated. |
3516 | * @this_cpu: Cpu for which load balance is currently performed. | 3970 | * @this_cpu: Cpu for which load balance is currently performed. |
3517 | * @idle: Idle status of this_cpu | 3971 | * @idle: Idle status of this_cpu |
@@ -3585,11 +4039,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3585 | } while (sg != sd->groups); | 4039 | } while (sg != sd->groups); |
3586 | } | 4040 | } |
3587 | 4041 | ||
3588 | int __weak arch_sd_sibling_asym_packing(void) | ||
3589 | { | ||
3590 | return 0*SD_ASYM_PACKING; | ||
3591 | } | ||
3592 | |||
3593 | /** | 4042 | /** |
3594 | * check_asym_packing - Check to see if the group is packed into the | 4043 | * check_asym_packing - Check to see if the group is packed into the |
3595 | * sched domain. | 4044 | * sched domain. |
@@ -3953,7 +4402,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
3953 | #define MAX_PINNED_INTERVAL 512 | 4402 | #define MAX_PINNED_INTERVAL 512 |
3954 | 4403 | ||
3955 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4404 | /* Working cpumask for load_balance and load_balance_newidle. */ |
3956 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4405 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
3957 | 4406 | ||
3958 | static int need_active_balance(struct sched_domain *sd, int idle, | 4407 | static int need_active_balance(struct sched_domain *sd, int idle, |
3959 | int busiest_cpu, int this_cpu) | 4408 | int busiest_cpu, int this_cpu) |
@@ -4004,7 +4453,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4004 | struct sched_domain *sd, enum cpu_idle_type idle, | 4453 | struct sched_domain *sd, enum cpu_idle_type idle, |
4005 | int *balance) | 4454 | int *balance) |
4006 | { | 4455 | { |
4007 | int ld_moved, all_pinned = 0, active_balance = 0; | 4456 | int ld_moved, lb_flags = 0, active_balance = 0; |
4008 | struct sched_group *group; | 4457 | struct sched_group *group; |
4009 | unsigned long imbalance; | 4458 | unsigned long imbalance; |
4010 | struct rq *busiest; | 4459 | struct rq *busiest; |
@@ -4045,11 +4494,11 @@ redo: | |||
4045 | * still unbalanced. ld_moved simply stays zero, so it is | 4494 | * still unbalanced. ld_moved simply stays zero, so it is |
4046 | * correctly treated as an imbalance. | 4495 | * correctly treated as an imbalance. |
4047 | */ | 4496 | */ |
4048 | all_pinned = 1; | 4497 | lb_flags |= LBF_ALL_PINNED; |
4049 | local_irq_save(flags); | 4498 | local_irq_save(flags); |
4050 | double_rq_lock(this_rq, busiest); | 4499 | double_rq_lock(this_rq, busiest); |
4051 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 4500 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
4052 | imbalance, sd, idle, &all_pinned); | 4501 | imbalance, sd, idle, &lb_flags); |
4053 | double_rq_unlock(this_rq, busiest); | 4502 | double_rq_unlock(this_rq, busiest); |
4054 | local_irq_restore(flags); | 4503 | local_irq_restore(flags); |
4055 | 4504 | ||
@@ -4059,8 +4508,18 @@ redo: | |||
4059 | if (ld_moved && this_cpu != smp_processor_id()) | 4508 | if (ld_moved && this_cpu != smp_processor_id()) |
4060 | resched_cpu(this_cpu); | 4509 | resched_cpu(this_cpu); |
4061 | 4510 | ||
4511 | if (lb_flags & LBF_ABORT) | ||
4512 | goto out_balanced; | ||
4513 | |||
4514 | if (lb_flags & LBF_NEED_BREAK) { | ||
4515 | lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; | ||
4516 | if (lb_flags & LBF_ABORT) | ||
4517 | goto out_balanced; | ||
4518 | goto redo; | ||
4519 | } | ||
4520 | |||
4062 | /* All tasks on this runqueue were pinned by CPU affinity */ | 4521 | /* All tasks on this runqueue were pinned by CPU affinity */ |
4063 | if (unlikely(all_pinned)) { | 4522 | if (unlikely(lb_flags & LBF_ALL_PINNED)) { |
4064 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 4523 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
4065 | if (!cpumask_empty(cpus)) | 4524 | if (!cpumask_empty(cpus)) |
4066 | goto redo; | 4525 | goto redo; |
@@ -4090,7 +4549,7 @@ redo: | |||
4090 | tsk_cpus_allowed(busiest->curr))) { | 4549 | tsk_cpus_allowed(busiest->curr))) { |
4091 | raw_spin_unlock_irqrestore(&busiest->lock, | 4550 | raw_spin_unlock_irqrestore(&busiest->lock, |
4092 | flags); | 4551 | flags); |
4093 | all_pinned = 1; | 4552 | lb_flags |= LBF_ALL_PINNED; |
4094 | goto out_one_pinned; | 4553 | goto out_one_pinned; |
4095 | } | 4554 | } |
4096 | 4555 | ||
@@ -4143,7 +4602,8 @@ out_balanced: | |||
4143 | 4602 | ||
4144 | out_one_pinned: | 4603 | out_one_pinned: |
4145 | /* tune up the balancing interval */ | 4604 | /* tune up the balancing interval */ |
4146 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 4605 | if (((lb_flags & LBF_ALL_PINNED) && |
4606 | sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
4147 | (sd->balance_interval < sd->max_interval)) | 4607 | (sd->balance_interval < sd->max_interval)) |
4148 | sd->balance_interval *= 2; | 4608 | sd->balance_interval *= 2; |
4149 | 4609 | ||
@@ -4156,7 +4616,7 @@ out: | |||
4156 | * idle_balance is called by schedule() if this_cpu is about to become | 4616 | * idle_balance is called by schedule() if this_cpu is about to become |
4157 | * idle. Attempts to pull tasks from other CPUs. | 4617 | * idle. Attempts to pull tasks from other CPUs. |
4158 | */ | 4618 | */ |
4159 | static void idle_balance(int this_cpu, struct rq *this_rq) | 4619 | void idle_balance(int this_cpu, struct rq *this_rq) |
4160 | { | 4620 | { |
4161 | struct sched_domain *sd; | 4621 | struct sched_domain *sd; |
4162 | int pulled_task = 0; | 4622 | int pulled_task = 0; |
@@ -4271,28 +4731,16 @@ out_unlock: | |||
4271 | #ifdef CONFIG_NO_HZ | 4731 | #ifdef CONFIG_NO_HZ |
4272 | /* | 4732 | /* |
4273 | * idle load balancing details | 4733 | * idle load balancing details |
4274 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
4275 | * entering idle. | ||
4276 | * - This idle load balancer CPU will also go into tickless mode when | ||
4277 | * it is idle, just like all other idle CPUs | ||
4278 | * - When one of the busy CPUs notice that there may be an idle rebalancing | 4734 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
4279 | * needed, they will kick the idle load balancer, which then does idle | 4735 | * needed, they will kick the idle load balancer, which then does idle |
4280 | * load balancing for all the idle CPUs. | 4736 | * load balancing for all the idle CPUs. |
4281 | */ | 4737 | */ |
4282 | static struct { | 4738 | static struct { |
4283 | atomic_t load_balancer; | ||
4284 | atomic_t first_pick_cpu; | ||
4285 | atomic_t second_pick_cpu; | ||
4286 | cpumask_var_t idle_cpus_mask; | 4739 | cpumask_var_t idle_cpus_mask; |
4287 | cpumask_var_t grp_idle_mask; | 4740 | atomic_t nr_cpus; |
4288 | unsigned long next_balance; /* in jiffy units */ | 4741 | unsigned long next_balance; /* in jiffy units */ |
4289 | } nohz ____cacheline_aligned; | 4742 | } nohz ____cacheline_aligned; |
4290 | 4743 | ||
4291 | int get_nohz_load_balancer(void) | ||
4292 | { | ||
4293 | return atomic_read(&nohz.load_balancer); | ||
4294 | } | ||
4295 | |||
4296 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4744 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
4297 | /** | 4745 | /** |
4298 | * lowest_flag_domain - Return lowest sched_domain containing flag. | 4746 | * lowest_flag_domain - Return lowest sched_domain containing flag. |
@@ -4329,33 +4777,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
4329 | (sd && (sd->flags & flag)); sd = sd->parent) | 4777 | (sd && (sd->flags & flag)); sd = sd->parent) |
4330 | 4778 | ||
4331 | /** | 4779 | /** |
4332 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
4333 | * @ilb_group: group to be checked for semi-idleness | ||
4334 | * | ||
4335 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
4336 | * | ||
4337 | * We define a sched_group to be semi idle if it has atleast one idle-CPU | ||
4338 | * and atleast one non-idle CPU. This helper function checks if the given | ||
4339 | * sched_group is semi-idle or not. | ||
4340 | */ | ||
4341 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
4342 | { | ||
4343 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, | ||
4344 | sched_group_cpus(ilb_group)); | ||
4345 | |||
4346 | /* | ||
4347 | * A sched_group is semi-idle when it has atleast one busy cpu | ||
4348 | * and atleast one idle cpu. | ||
4349 | */ | ||
4350 | if (cpumask_empty(nohz.grp_idle_mask)) | ||
4351 | return 0; | ||
4352 | |||
4353 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) | ||
4354 | return 0; | ||
4355 | |||
4356 | return 1; | ||
4357 | } | ||
4358 | /** | ||
4359 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | 4780 | * find_new_ilb - Finds the optimum idle load balancer for nomination. |
4360 | * @cpu: The cpu which is nominating a new idle_load_balancer. | 4781 | * @cpu: The cpu which is nominating a new idle_load_balancer. |
4361 | * | 4782 | * |
@@ -4369,9 +4790,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group) | |||
4369 | */ | 4790 | */ |
4370 | static int find_new_ilb(int cpu) | 4791 | static int find_new_ilb(int cpu) |
4371 | { | 4792 | { |
4793 | int ilb = cpumask_first(nohz.idle_cpus_mask); | ||
4794 | struct sched_group *ilbg; | ||
4372 | struct sched_domain *sd; | 4795 | struct sched_domain *sd; |
4373 | struct sched_group *ilb_group; | ||
4374 | int ilb = nr_cpu_ids; | ||
4375 | 4796 | ||
4376 | /* | 4797 | /* |
4377 | * Have idle load balancer selection from semi-idle packages only | 4798 | * Have idle load balancer selection from semi-idle packages only |
@@ -4389,23 +4810,28 @@ static int find_new_ilb(int cpu) | |||
4389 | 4810 | ||
4390 | rcu_read_lock(); | 4811 | rcu_read_lock(); |
4391 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 4812 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
4392 | ilb_group = sd->groups; | 4813 | ilbg = sd->groups; |
4393 | 4814 | ||
4394 | do { | 4815 | do { |
4395 | if (is_semi_idle_group(ilb_group)) { | 4816 | if (ilbg->group_weight != |
4396 | ilb = cpumask_first(nohz.grp_idle_mask); | 4817 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { |
4818 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4819 | sched_group_cpus(ilbg)); | ||
4397 | goto unlock; | 4820 | goto unlock; |
4398 | } | 4821 | } |
4399 | 4822 | ||
4400 | ilb_group = ilb_group->next; | 4823 | ilbg = ilbg->next; |
4401 | 4824 | ||
4402 | } while (ilb_group != sd->groups); | 4825 | } while (ilbg != sd->groups); |
4403 | } | 4826 | } |
4404 | unlock: | 4827 | unlock: |
4405 | rcu_read_unlock(); | 4828 | rcu_read_unlock(); |
4406 | 4829 | ||
4407 | out_done: | 4830 | out_done: |
4408 | return ilb; | 4831 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4832 | return ilb; | ||
4833 | |||
4834 | return nr_cpu_ids; | ||
4409 | } | 4835 | } |
4410 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 4836 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
4411 | static inline int find_new_ilb(int call_cpu) | 4837 | static inline int find_new_ilb(int call_cpu) |
@@ -4425,102 +4851,98 @@ static void nohz_balancer_kick(int cpu) | |||
4425 | 4851 | ||
4426 | nohz.next_balance++; | 4852 | nohz.next_balance++; |
4427 | 4853 | ||
4428 | ilb_cpu = get_nohz_load_balancer(); | 4854 | ilb_cpu = find_new_ilb(cpu); |
4429 | 4855 | ||
4430 | if (ilb_cpu >= nr_cpu_ids) { | 4856 | if (ilb_cpu >= nr_cpu_ids) |
4431 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | 4857 | return; |
4432 | if (ilb_cpu >= nr_cpu_ids) | ||
4433 | return; | ||
4434 | } | ||
4435 | 4858 | ||
4436 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | 4859 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) |
4437 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | 4860 | return; |
4861 | /* | ||
4862 | * Use smp_send_reschedule() instead of resched_cpu(). | ||
4863 | * This way we generate a sched IPI on the target cpu which | ||
4864 | * is idle. And the softirq performing nohz idle load balance | ||
4865 | * will be run before returning from the IPI. | ||
4866 | */ | ||
4867 | smp_send_reschedule(ilb_cpu); | ||
4868 | return; | ||
4869 | } | ||
4438 | 4870 | ||
4439 | smp_mb(); | 4871 | static inline void clear_nohz_tick_stopped(int cpu) |
4440 | /* | 4872 | { |
4441 | * Use smp_send_reschedule() instead of resched_cpu(). | 4873 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { |
4442 | * This way we generate a sched IPI on the target cpu which | 4874 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
4443 | * is idle. And the softirq performing nohz idle load balance | 4875 | atomic_dec(&nohz.nr_cpus); |
4444 | * will be run before returning from the IPI. | 4876 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
4445 | */ | ||
4446 | smp_send_reschedule(ilb_cpu); | ||
4447 | } | 4877 | } |
4448 | return; | ||
4449 | } | 4878 | } |
4450 | 4879 | ||
4451 | /* | 4880 | static inline void set_cpu_sd_state_busy(void) |
4452 | * This routine will try to nominate the ilb (idle load balancing) | ||
4453 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
4454 | * load balancing on behalf of all those cpus. | ||
4455 | * | ||
4456 | * When the ilb owner becomes busy, we will not have new ilb owner until some | ||
4457 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick | ||
4458 | * idle load balancing by kicking one of the idle CPUs. | ||
4459 | * | ||
4460 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this | ||
4461 | * ilb owner CPU in future (when there is a need for idle load balancing on | ||
4462 | * behalf of all idle CPUs). | ||
4463 | */ | ||
4464 | void select_nohz_load_balancer(int stop_tick) | ||
4465 | { | 4881 | { |
4882 | struct sched_domain *sd; | ||
4466 | int cpu = smp_processor_id(); | 4883 | int cpu = smp_processor_id(); |
4467 | 4884 | ||
4468 | if (stop_tick) { | 4885 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
4469 | if (!cpu_active(cpu)) { | 4886 | return; |
4470 | if (atomic_read(&nohz.load_balancer) != cpu) | 4887 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); |
4471 | return; | ||
4472 | 4888 | ||
4473 | /* | 4889 | rcu_read_lock(); |
4474 | * If we are going offline and still the leader, | 4890 | for_each_domain(cpu, sd) |
4475 | * give up! | 4891 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
4476 | */ | 4892 | rcu_read_unlock(); |
4477 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | 4893 | } |
4478 | nr_cpu_ids) != cpu) | ||
4479 | BUG(); | ||
4480 | 4894 | ||
4481 | return; | 4895 | void set_cpu_sd_state_idle(void) |
4482 | } | 4896 | { |
4897 | struct sched_domain *sd; | ||
4898 | int cpu = smp_processor_id(); | ||
4483 | 4899 | ||
4484 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 4900 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
4901 | return; | ||
4902 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
4485 | 4903 | ||
4486 | if (atomic_read(&nohz.first_pick_cpu) == cpu) | 4904 | rcu_read_lock(); |
4487 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); | 4905 | for_each_domain(cpu, sd) |
4488 | if (atomic_read(&nohz.second_pick_cpu) == cpu) | 4906 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
4489 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | 4907 | rcu_read_unlock(); |
4908 | } | ||
4490 | 4909 | ||
4491 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { | 4910 | /* |
4492 | int new_ilb; | 4911 | * This routine will record that this cpu is going idle with tick stopped. |
4912 | * This info will be used in performing idle load balancing in the future. | ||
4913 | */ | ||
4914 | void select_nohz_load_balancer(int stop_tick) | ||
4915 | { | ||
4916 | int cpu = smp_processor_id(); | ||
4493 | 4917 | ||
4494 | /* make me the ilb owner */ | 4918 | /* |
4495 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, | 4919 | * If this cpu is going down, then nothing needs to be done. |
4496 | cpu) != nr_cpu_ids) | 4920 | */ |
4497 | return; | 4921 | if (!cpu_active(cpu)) |
4922 | return; | ||
4498 | 4923 | ||
4499 | /* | 4924 | if (stop_tick) { |
4500 | * Check to see if there is a more power-efficient | 4925 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
4501 | * ilb. | ||
4502 | */ | ||
4503 | new_ilb = find_new_ilb(cpu); | ||
4504 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
4505 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
4506 | resched_cpu(new_ilb); | ||
4507 | return; | ||
4508 | } | ||
4509 | return; | ||
4510 | } | ||
4511 | } else { | ||
4512 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) | ||
4513 | return; | 4926 | return; |
4514 | 4927 | ||
4515 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 4928 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
4516 | 4929 | atomic_inc(&nohz.nr_cpus); | |
4517 | if (atomic_read(&nohz.load_balancer) == cpu) | 4930 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
4518 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | ||
4519 | nr_cpu_ids) != cpu) | ||
4520 | BUG(); | ||
4521 | } | 4931 | } |
4522 | return; | 4932 | return; |
4523 | } | 4933 | } |
4934 | |||
4935 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | ||
4936 | unsigned long action, void *hcpu) | ||
4937 | { | ||
4938 | switch (action & ~CPU_TASKS_FROZEN) { | ||
4939 | case CPU_DYING: | ||
4940 | clear_nohz_tick_stopped(smp_processor_id()); | ||
4941 | return NOTIFY_OK; | ||
4942 | default: | ||
4943 | return NOTIFY_DONE; | ||
4944 | } | ||
4945 | } | ||
4524 | #endif | 4946 | #endif |
4525 | 4947 | ||
4526 | static DEFINE_SPINLOCK(balancing); | 4948 | static DEFINE_SPINLOCK(balancing); |
@@ -4531,7 +4953,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; | |||
4531 | * Scale the max load_balance interval with the number of CPUs in the system. | 4953 | * Scale the max load_balance interval with the number of CPUs in the system. |
4532 | * This trades load-balance latency on larger machines for less cross talk. | 4954 | * This trades load-balance latency on larger machines for less cross talk. |
4533 | */ | 4955 | */ |
4534 | static void update_max_interval(void) | 4956 | void update_max_interval(void) |
4535 | { | 4957 | { |
4536 | max_load_balance_interval = HZ*num_online_cpus()/10; | 4958 | max_load_balance_interval = HZ*num_online_cpus()/10; |
4537 | } | 4959 | } |
@@ -4623,11 +5045,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4623 | struct rq *rq; | 5045 | struct rq *rq; |
4624 | int balance_cpu; | 5046 | int balance_cpu; |
4625 | 5047 | ||
4626 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | 5048 | if (idle != CPU_IDLE || |
4627 | return; | 5049 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) |
5050 | goto end; | ||
4628 | 5051 | ||
4629 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 5052 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { |
4630 | if (balance_cpu == this_cpu) | 5053 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) |
4631 | continue; | 5054 | continue; |
4632 | 5055 | ||
4633 | /* | 5056 | /* |
@@ -4635,10 +5058,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4635 | * work being done for other cpus. Next load | 5058 | * work being done for other cpus. Next load |
4636 | * balancing owner will pick it up. | 5059 | * balancing owner will pick it up. |
4637 | */ | 5060 | */ |
4638 | if (need_resched()) { | 5061 | if (need_resched()) |
4639 | this_rq->nohz_balance_kick = 0; | ||
4640 | break; | 5062 | break; |
4641 | } | ||
4642 | 5063 | ||
4643 | raw_spin_lock_irq(&this_rq->lock); | 5064 | raw_spin_lock_irq(&this_rq->lock); |
4644 | update_rq_clock(this_rq); | 5065 | update_rq_clock(this_rq); |
@@ -4652,53 +5073,71 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4652 | this_rq->next_balance = rq->next_balance; | 5073 | this_rq->next_balance = rq->next_balance; |
4653 | } | 5074 | } |
4654 | nohz.next_balance = this_rq->next_balance; | 5075 | nohz.next_balance = this_rq->next_balance; |
4655 | this_rq->nohz_balance_kick = 0; | 5076 | end: |
5077 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | ||
4656 | } | 5078 | } |
4657 | 5079 | ||
4658 | /* | 5080 | /* |
4659 | * Current heuristic for kicking the idle load balancer | 5081 | * Current heuristic for kicking the idle load balancer in the presence |
4660 | * - first_pick_cpu is the one of the busy CPUs. It will kick | 5082 | * of an idle cpu is the system. |
4661 | * idle load balancer when it has more than one process active. This | 5083 | * - This rq has more than one task. |
4662 | * eliminates the need for idle load balancing altogether when we have | 5084 | * - At any scheduler domain level, this cpu's scheduler group has multiple |
4663 | * only one running process in the system (common case). | 5085 | * busy cpu's exceeding the group's power. |
4664 | * - If there are more than one busy CPU, idle load balancer may have | 5086 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
4665 | * to run for active_load_balance to happen (i.e., two busy CPUs are | 5087 | * domain span are idle. |
4666 | * SMT or core siblings and can run better if they move to different | ||
4667 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
4668 | * which will kick idle load balancer as soon as it has any load. | ||
4669 | */ | 5088 | */ |
4670 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 5089 | static inline int nohz_kick_needed(struct rq *rq, int cpu) |
4671 | { | 5090 | { |
4672 | unsigned long now = jiffies; | 5091 | unsigned long now = jiffies; |
4673 | int ret; | 5092 | struct sched_domain *sd; |
4674 | int first_pick_cpu, second_pick_cpu; | ||
4675 | 5093 | ||
4676 | if (time_before(now, nohz.next_balance)) | 5094 | if (unlikely(idle_cpu(cpu))) |
4677 | return 0; | 5095 | return 0; |
4678 | 5096 | ||
4679 | if (idle_cpu(cpu)) | 5097 | /* |
4680 | return 0; | 5098 | * We may be recently in ticked or tickless idle mode. At the first |
5099 | * busy tick after returning from idle, we will update the busy stats. | ||
5100 | */ | ||
5101 | set_cpu_sd_state_busy(); | ||
5102 | clear_nohz_tick_stopped(cpu); | ||
4681 | 5103 | ||
4682 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 5104 | /* |
4683 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | 5105 | * None are in tickless mode and hence no need for NOHZ idle load |
5106 | * balancing. | ||
5107 | */ | ||
5108 | if (likely(!atomic_read(&nohz.nr_cpus))) | ||
5109 | return 0; | ||
4684 | 5110 | ||
4685 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | 5111 | if (time_before(now, nohz.next_balance)) |
4686 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
4687 | return 0; | 5112 | return 0; |
4688 | 5113 | ||
4689 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | 5114 | if (rq->nr_running >= 2) |
4690 | if (ret == nr_cpu_ids || ret == cpu) { | 5115 | goto need_kick; |
4691 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | 5116 | |
4692 | if (rq->nr_running > 1) | 5117 | rcu_read_lock(); |
4693 | return 1; | 5118 | for_each_domain(cpu, sd) { |
4694 | } else { | 5119 | struct sched_group *sg = sd->groups; |
4695 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | 5120 | struct sched_group_power *sgp = sg->sgp; |
4696 | if (ret == nr_cpu_ids || ret == cpu) { | 5121 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); |
4697 | if (rq->nr_running) | 5122 | |
4698 | return 1; | 5123 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) |
4699 | } | 5124 | goto need_kick_unlock; |
5125 | |||
5126 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | ||
5127 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
5128 | sched_domain_span(sd)) < cpu)) | ||
5129 | goto need_kick_unlock; | ||
5130 | |||
5131 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
5132 | break; | ||
4700 | } | 5133 | } |
5134 | rcu_read_unlock(); | ||
4701 | return 0; | 5135 | return 0; |
5136 | |||
5137 | need_kick_unlock: | ||
5138 | rcu_read_unlock(); | ||
5139 | need_kick: | ||
5140 | return 1; | ||
4702 | } | 5141 | } |
4703 | #else | 5142 | #else |
4704 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 5143 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } |
@@ -4733,14 +5172,14 @@ static inline int on_null_domain(int cpu) | |||
4733 | /* | 5172 | /* |
4734 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 5173 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
4735 | */ | 5174 | */ |
4736 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 5175 | void trigger_load_balance(struct rq *rq, int cpu) |
4737 | { | 5176 | { |
4738 | /* Don't need to rebalance while attached to NULL domain */ | 5177 | /* Don't need to rebalance while attached to NULL domain */ |
4739 | if (time_after_eq(jiffies, rq->next_balance) && | 5178 | if (time_after_eq(jiffies, rq->next_balance) && |
4740 | likely(!on_null_domain(cpu))) | 5179 | likely(!on_null_domain(cpu))) |
4741 | raise_softirq(SCHED_SOFTIRQ); | 5180 | raise_softirq(SCHED_SOFTIRQ); |
4742 | #ifdef CONFIG_NO_HZ | 5181 | #ifdef CONFIG_NO_HZ |
4743 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 5182 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) |
4744 | nohz_balancer_kick(cpu); | 5183 | nohz_balancer_kick(cpu); |
4745 | #endif | 5184 | #endif |
4746 | } | 5185 | } |
@@ -4755,15 +5194,6 @@ static void rq_offline_fair(struct rq *rq) | |||
4755 | update_sysctl(); | 5194 | update_sysctl(); |
4756 | } | 5195 | } |
4757 | 5196 | ||
4758 | #else /* CONFIG_SMP */ | ||
4759 | |||
4760 | /* | ||
4761 | * on UP we do not need to balance between CPUs: | ||
4762 | */ | ||
4763 | static inline void idle_balance(int cpu, struct rq *rq) | ||
4764 | { | ||
4765 | } | ||
4766 | |||
4767 | #endif /* CONFIG_SMP */ | 5197 | #endif /* CONFIG_SMP */ |
4768 | 5198 | ||
4769 | /* | 5199 | /* |
@@ -4787,8 +5217,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
4787 | */ | 5217 | */ |
4788 | static void task_fork_fair(struct task_struct *p) | 5218 | static void task_fork_fair(struct task_struct *p) |
4789 | { | 5219 | { |
4790 | struct cfs_rq *cfs_rq = task_cfs_rq(current); | 5220 | struct cfs_rq *cfs_rq; |
4791 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; | 5221 | struct sched_entity *se = &p->se, *curr; |
4792 | int this_cpu = smp_processor_id(); | 5222 | int this_cpu = smp_processor_id(); |
4793 | struct rq *rq = this_rq(); | 5223 | struct rq *rq = this_rq(); |
4794 | unsigned long flags; | 5224 | unsigned long flags; |
@@ -4797,6 +5227,9 @@ static void task_fork_fair(struct task_struct *p) | |||
4797 | 5227 | ||
4798 | update_rq_clock(rq); | 5228 | update_rq_clock(rq); |
4799 | 5229 | ||
5230 | cfs_rq = task_cfs_rq(current); | ||
5231 | curr = cfs_rq->curr; | ||
5232 | |||
4800 | if (unlikely(task_cpu(p) != this_cpu)) { | 5233 | if (unlikely(task_cpu(p) != this_cpu)) { |
4801 | rcu_read_lock(); | 5234 | rcu_read_lock(); |
4802 | __set_task_cpu(p, this_cpu); | 5235 | __set_task_cpu(p, this_cpu); |
@@ -4906,6 +5339,16 @@ static void set_curr_task_fair(struct rq *rq) | |||
4906 | } | 5339 | } |
4907 | } | 5340 | } |
4908 | 5341 | ||
5342 | void init_cfs_rq(struct cfs_rq *cfs_rq) | ||
5343 | { | ||
5344 | cfs_rq->tasks_timeline = RB_ROOT; | ||
5345 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
5346 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
5347 | #ifndef CONFIG_64BIT | ||
5348 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
5349 | #endif | ||
5350 | } | ||
5351 | |||
4909 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5352 | #ifdef CONFIG_FAIR_GROUP_SCHED |
4910 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 5353 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
4911 | { | 5354 | { |
@@ -4922,13 +5365,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
4922 | * to another cgroup's rq. This does somewhat interfere with the | 5365 | * to another cgroup's rq. This does somewhat interfere with the |
4923 | * fair sleeper stuff for the first placement, but who cares. | 5366 | * fair sleeper stuff for the first placement, but who cares. |
4924 | */ | 5367 | */ |
5368 | /* | ||
5369 | * When !on_rq, vruntime of the task has usually NOT been normalized. | ||
5370 | * But there are some cases where it has already been normalized: | ||
5371 | * | ||
5372 | * - Moving a forked child which is waiting for being woken up by | ||
5373 | * wake_up_new_task(). | ||
5374 | * - Moving a task which has been woken up by try_to_wake_up() and | ||
5375 | * waiting for actually being woken up by sched_ttwu_pending(). | ||
5376 | * | ||
5377 | * To prevent boost or penalty in the new cfs_rq caused by delta | ||
5378 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | ||
5379 | */ | ||
5380 | if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) | ||
5381 | on_rq = 1; | ||
5382 | |||
4925 | if (!on_rq) | 5383 | if (!on_rq) |
4926 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 5384 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; |
4927 | set_task_rq(p, task_cpu(p)); | 5385 | set_task_rq(p, task_cpu(p)); |
4928 | if (!on_rq) | 5386 | if (!on_rq) |
4929 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; | 5387 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; |
4930 | } | 5388 | } |
5389 | |||
5390 | void free_fair_sched_group(struct task_group *tg) | ||
5391 | { | ||
5392 | int i; | ||
5393 | |||
5394 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
5395 | |||
5396 | for_each_possible_cpu(i) { | ||
5397 | if (tg->cfs_rq) | ||
5398 | kfree(tg->cfs_rq[i]); | ||
5399 | if (tg->se) | ||
5400 | kfree(tg->se[i]); | ||
5401 | } | ||
5402 | |||
5403 | kfree(tg->cfs_rq); | ||
5404 | kfree(tg->se); | ||
5405 | } | ||
5406 | |||
5407 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
5408 | { | ||
5409 | struct cfs_rq *cfs_rq; | ||
5410 | struct sched_entity *se; | ||
5411 | int i; | ||
5412 | |||
5413 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
5414 | if (!tg->cfs_rq) | ||
5415 | goto err; | ||
5416 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
5417 | if (!tg->se) | ||
5418 | goto err; | ||
5419 | |||
5420 | tg->shares = NICE_0_LOAD; | ||
5421 | |||
5422 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
5423 | |||
5424 | for_each_possible_cpu(i) { | ||
5425 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
5426 | GFP_KERNEL, cpu_to_node(i)); | ||
5427 | if (!cfs_rq) | ||
5428 | goto err; | ||
5429 | |||
5430 | se = kzalloc_node(sizeof(struct sched_entity), | ||
5431 | GFP_KERNEL, cpu_to_node(i)); | ||
5432 | if (!se) | ||
5433 | goto err_free_rq; | ||
5434 | |||
5435 | init_cfs_rq(cfs_rq); | ||
5436 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
5437 | } | ||
5438 | |||
5439 | return 1; | ||
5440 | |||
5441 | err_free_rq: | ||
5442 | kfree(cfs_rq); | ||
5443 | err: | ||
5444 | return 0; | ||
5445 | } | ||
5446 | |||
5447 | void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
5448 | { | ||
5449 | struct rq *rq = cpu_rq(cpu); | ||
5450 | unsigned long flags; | ||
5451 | |||
5452 | /* | ||
5453 | * Only empty task groups can be destroyed; so we can speculatively | ||
5454 | * check on_list without danger of it being re-added. | ||
5455 | */ | ||
5456 | if (!tg->cfs_rq[cpu]->on_list) | ||
5457 | return; | ||
5458 | |||
5459 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5460 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
5461 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5462 | } | ||
5463 | |||
5464 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
5465 | struct sched_entity *se, int cpu, | ||
5466 | struct sched_entity *parent) | ||
5467 | { | ||
5468 | struct rq *rq = cpu_rq(cpu); | ||
5469 | |||
5470 | cfs_rq->tg = tg; | ||
5471 | cfs_rq->rq = rq; | ||
5472 | #ifdef CONFIG_SMP | ||
5473 | /* allow initial update_cfs_load() to truncate */ | ||
5474 | cfs_rq->load_stamp = 1; | ||
4931 | #endif | 5475 | #endif |
5476 | init_cfs_rq_runtime(cfs_rq); | ||
5477 | |||
5478 | tg->cfs_rq[cpu] = cfs_rq; | ||
5479 | tg->se[cpu] = se; | ||
5480 | |||
5481 | /* se could be NULL for root_task_group */ | ||
5482 | if (!se) | ||
5483 | return; | ||
5484 | |||
5485 | if (!parent) | ||
5486 | se->cfs_rq = &rq->cfs; | ||
5487 | else | ||
5488 | se->cfs_rq = parent->my_q; | ||
5489 | |||
5490 | se->my_q = cfs_rq; | ||
5491 | update_load_set(&se->load, 0); | ||
5492 | se->parent = parent; | ||
5493 | } | ||
5494 | |||
5495 | static DEFINE_MUTEX(shares_mutex); | ||
5496 | |||
5497 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
5498 | { | ||
5499 | int i; | ||
5500 | unsigned long flags; | ||
5501 | |||
5502 | /* | ||
5503 | * We can't change the weight of the root cgroup. | ||
5504 | */ | ||
5505 | if (!tg->se[0]) | ||
5506 | return -EINVAL; | ||
5507 | |||
5508 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
5509 | |||
5510 | mutex_lock(&shares_mutex); | ||
5511 | if (tg->shares == shares) | ||
5512 | goto done; | ||
5513 | |||
5514 | tg->shares = shares; | ||
5515 | for_each_possible_cpu(i) { | ||
5516 | struct rq *rq = cpu_rq(i); | ||
5517 | struct sched_entity *se; | ||
5518 | |||
5519 | se = tg->se[i]; | ||
5520 | /* Propagate contribution to hierarchy */ | ||
5521 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5522 | for_each_sched_entity(se) | ||
5523 | update_cfs_shares(group_cfs_rq(se)); | ||
5524 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5525 | } | ||
5526 | |||
5527 | done: | ||
5528 | mutex_unlock(&shares_mutex); | ||
5529 | return 0; | ||
5530 | } | ||
5531 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
5532 | |||
5533 | void free_fair_sched_group(struct task_group *tg) { } | ||
5534 | |||
5535 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
5536 | { | ||
5537 | return 1; | ||
5538 | } | ||
5539 | |||
5540 | void unregister_fair_sched_group(struct task_group *tg, int cpu) { } | ||
5541 | |||
5542 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
5543 | |||
4932 | 5544 | ||
4933 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 5545 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
4934 | { | 5546 | { |
@@ -4948,7 +5560,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task | |||
4948 | /* | 5560 | /* |
4949 | * All the scheduling class methods: | 5561 | * All the scheduling class methods: |
4950 | */ | 5562 | */ |
4951 | static const struct sched_class fair_sched_class = { | 5563 | const struct sched_class fair_sched_class = { |
4952 | .next = &idle_sched_class, | 5564 | .next = &idle_sched_class, |
4953 | .enqueue_task = enqueue_task_fair, | 5565 | .enqueue_task = enqueue_task_fair, |
4954 | .dequeue_task = dequeue_task_fair, | 5566 | .dequeue_task = dequeue_task_fair, |
@@ -4985,7 +5597,7 @@ static const struct sched_class fair_sched_class = { | |||
4985 | }; | 5597 | }; |
4986 | 5598 | ||
4987 | #ifdef CONFIG_SCHED_DEBUG | 5599 | #ifdef CONFIG_SCHED_DEBUG |
4988 | static void print_cfs_stats(struct seq_file *m, int cpu) | 5600 | void print_cfs_stats(struct seq_file *m, int cpu) |
4989 | { | 5601 | { |
4990 | struct cfs_rq *cfs_rq; | 5602 | struct cfs_rq *cfs_rq; |
4991 | 5603 | ||
@@ -4995,3 +5607,16 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
4995 | rcu_read_unlock(); | 5607 | rcu_read_unlock(); |
4996 | } | 5608 | } |
4997 | #endif | 5609 | #endif |
5610 | |||
5611 | __init void init_sched_fair_class(void) | ||
5612 | { | ||
5613 | #ifdef CONFIG_SMP | ||
5614 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
5615 | |||
5616 | #ifdef CONFIG_NO_HZ | ||
5617 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
5618 | cpu_notifier(sched_ilb_notifier, 0); | ||
5619 | #endif | ||
5620 | #endif /* SMP */ | ||
5621 | |||
5622 | } | ||
diff --git a/kernel/sched_features.h b/kernel/sched/features.h index efa0a7b75dde..e61fd73913d0 100644 --- a/kernel/sched_features.h +++ b/kernel/sched/features.h | |||
@@ -3,13 +3,13 @@ | |||
3 | * them to run sooner, but does not allow tons of sleepers to | 3 | * them to run sooner, but does not allow tons of sleepers to |
4 | * rip the spread apart. | 4 | * rip the spread apart. |
5 | */ | 5 | */ |
6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | 6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) |
7 | 7 | ||
8 | /* | 8 | /* |
9 | * Place new tasks ahead so that they do not starve already running | 9 | * Place new tasks ahead so that they do not starve already running |
10 | * tasks | 10 | * tasks |
11 | */ | 11 | */ |
12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, true) |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
@@ -17,53 +17,54 @@ SCHED_FEAT(START_DEBIT, 1) | |||
17 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
18 | * generated by pipes and the like, see also SYNC_WAKEUPS. | 18 | * generated by pipes and the like, see also SYNC_WAKEUPS. |
19 | */ | 19 | */ |
20 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | 20 | SCHED_FEAT(AFFINE_WAKEUPS, true) |
21 | 21 | ||
22 | /* | 22 | /* |
23 | * Prefer to schedule the task we woke last (assuming it failed | 23 | * Prefer to schedule the task we woke last (assuming it failed |
24 | * wakeup-preemption), since its likely going to consume data we | 24 | * wakeup-preemption), since its likely going to consume data we |
25 | * touched, increases cache locality. | 25 | * touched, increases cache locality. |
26 | */ | 26 | */ |
27 | SCHED_FEAT(NEXT_BUDDY, 0) | 27 | SCHED_FEAT(NEXT_BUDDY, false) |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * Prefer to schedule the task that ran last (when we did | 30 | * Prefer to schedule the task that ran last (when we did |
31 | * wake-preempt) as that likely will touch the same data, increases | 31 | * wake-preempt) as that likely will touch the same data, increases |
32 | * cache locality. | 32 | * cache locality. |
33 | */ | 33 | */ |
34 | SCHED_FEAT(LAST_BUDDY, 1) | 34 | SCHED_FEAT(LAST_BUDDY, true) |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * Consider buddies to be cache hot, decreases the likelyness of a | 37 | * Consider buddies to be cache hot, decreases the likelyness of a |
38 | * cache buddy being migrated away, increases cache locality. | 38 | * cache buddy being migrated away, increases cache locality. |
39 | */ | 39 | */ |
40 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | 40 | SCHED_FEAT(CACHE_HOT_BUDDY, true) |
41 | 41 | ||
42 | /* | 42 | /* |
43 | * Use arch dependent cpu power functions | 43 | * Use arch dependent cpu power functions |
44 | */ | 44 | */ |
45 | SCHED_FEAT(ARCH_POWER, 0) | 45 | SCHED_FEAT(ARCH_POWER, false) |
46 | 46 | ||
47 | SCHED_FEAT(HRTICK, 0) | 47 | SCHED_FEAT(HRTICK, false) |
48 | SCHED_FEAT(DOUBLE_TICK, 0) | 48 | SCHED_FEAT(DOUBLE_TICK, false) |
49 | SCHED_FEAT(LB_BIAS, 1) | 49 | SCHED_FEAT(LB_BIAS, true) |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Spin-wait on mutex acquisition when the mutex owner is running on | 52 | * Spin-wait on mutex acquisition when the mutex owner is running on |
53 | * another cpu -- assumes that when the owner is running, it will soon | 53 | * another cpu -- assumes that when the owner is running, it will soon |
54 | * release the lock. Decreases scheduling overhead. | 54 | * release the lock. Decreases scheduling overhead. |
55 | */ | 55 | */ |
56 | SCHED_FEAT(OWNER_SPIN, 1) | 56 | SCHED_FEAT(OWNER_SPIN, true) |
57 | 57 | ||
58 | /* | 58 | /* |
59 | * Decrement CPU power based on time not spent running tasks | 59 | * Decrement CPU power based on time not spent running tasks |
60 | */ | 60 | */ |
61 | SCHED_FEAT(NONTASK_POWER, 1) | 61 | SCHED_FEAT(NONTASK_POWER, true) |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Queue remote wakeups on the target CPU and process them | 64 | * Queue remote wakeups on the target CPU and process them |
65 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | 65 | * using the scheduler IPI. Reduces rq->lock contention/bounces. |
66 | */ | 66 | */ |
67 | SCHED_FEAT(TTWU_QUEUE, 1) | 67 | SCHED_FEAT(TTWU_QUEUE, true) |
68 | 68 | ||
69 | SCHED_FEAT(FORCE_SD_OVERLAP, 0) | 69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c index 0a51882534ea..91b4c957f289 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched/idle_task.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #include "sched.h" | ||
2 | |||
1 | /* | 3 | /* |
2 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
3 | * | 5 | * |
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task | |||
71 | /* | 73 | /* |
72 | * Simple, special scheduling class for the per-CPU idle tasks: | 74 | * Simple, special scheduling class for the per-CPU idle tasks: |
73 | */ | 75 | */ |
74 | static const struct sched_class idle_sched_class = { | 76 | const struct sched_class idle_sched_class = { |
75 | /* .next is NULL */ | 77 | /* .next is NULL */ |
76 | /* no enqueue/yield_task for idle tasks */ | 78 | /* no enqueue/yield_task for idle tasks */ |
77 | 79 | ||
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c index 056cbd2e2a27..f42ae7fb5ec5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched/rt.c | |||
@@ -3,7 +3,92 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include "sched.h" | ||
7 | |||
8 | #include <linux/slab.h> | ||
9 | |||
10 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
11 | |||
12 | struct rt_bandwidth def_rt_bandwidth; | ||
13 | |||
14 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
15 | { | ||
16 | struct rt_bandwidth *rt_b = | ||
17 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
18 | ktime_t now; | ||
19 | int overrun; | ||
20 | int idle = 0; | ||
21 | |||
22 | for (;;) { | ||
23 | now = hrtimer_cb_get_time(timer); | ||
24 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
25 | |||
26 | if (!overrun) | ||
27 | break; | ||
28 | |||
29 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
30 | } | ||
31 | |||
32 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
33 | } | ||
34 | |||
35 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
36 | { | ||
37 | rt_b->rt_period = ns_to_ktime(period); | ||
38 | rt_b->rt_runtime = runtime; | ||
39 | |||
40 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
41 | |||
42 | hrtimer_init(&rt_b->rt_period_timer, | ||
43 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
44 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
45 | } | ||
46 | |||
47 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
48 | { | ||
49 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
50 | return; | ||
51 | |||
52 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
53 | return; | ||
54 | |||
55 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
56 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); | ||
57 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
58 | } | ||
59 | |||
60 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
61 | { | ||
62 | struct rt_prio_array *array; | ||
63 | int i; | ||
64 | |||
65 | array = &rt_rq->active; | ||
66 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
67 | INIT_LIST_HEAD(array->queue + i); | ||
68 | __clear_bit(i, array->bitmap); | ||
69 | } | ||
70 | /* delimiter for bitsearch: */ | ||
71 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
72 | |||
73 | #if defined CONFIG_SMP | ||
74 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
75 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
76 | rt_rq->rt_nr_migratory = 0; | ||
77 | rt_rq->overloaded = 0; | ||
78 | plist_head_init(&rt_rq->pushable_tasks); | ||
79 | #endif | ||
80 | |||
81 | rt_rq->rt_time = 0; | ||
82 | rt_rq->rt_throttled = 0; | ||
83 | rt_rq->rt_runtime = 0; | ||
84 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
85 | } | ||
86 | |||
6 | #ifdef CONFIG_RT_GROUP_SCHED | 87 | #ifdef CONFIG_RT_GROUP_SCHED |
88 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
89 | { | ||
90 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
91 | } | ||
7 | 92 | ||
8 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | 93 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) |
9 | 94 | ||
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
25 | return rt_se->rt_rq; | 110 | return rt_se->rt_rq; |
26 | } | 111 | } |
27 | 112 | ||
113 | void free_rt_sched_group(struct task_group *tg) | ||
114 | { | ||
115 | int i; | ||
116 | |||
117 | if (tg->rt_se) | ||
118 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
119 | |||
120 | for_each_possible_cpu(i) { | ||
121 | if (tg->rt_rq) | ||
122 | kfree(tg->rt_rq[i]); | ||
123 | if (tg->rt_se) | ||
124 | kfree(tg->rt_se[i]); | ||
125 | } | ||
126 | |||
127 | kfree(tg->rt_rq); | ||
128 | kfree(tg->rt_se); | ||
129 | } | ||
130 | |||
131 | void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
132 | struct sched_rt_entity *rt_se, int cpu, | ||
133 | struct sched_rt_entity *parent) | ||
134 | { | ||
135 | struct rq *rq = cpu_rq(cpu); | ||
136 | |||
137 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
138 | rt_rq->rt_nr_boosted = 0; | ||
139 | rt_rq->rq = rq; | ||
140 | rt_rq->tg = tg; | ||
141 | |||
142 | tg->rt_rq[cpu] = rt_rq; | ||
143 | tg->rt_se[cpu] = rt_se; | ||
144 | |||
145 | if (!rt_se) | ||
146 | return; | ||
147 | |||
148 | if (!parent) | ||
149 | rt_se->rt_rq = &rq->rt; | ||
150 | else | ||
151 | rt_se->rt_rq = parent->my_q; | ||
152 | |||
153 | rt_se->my_q = rt_rq; | ||
154 | rt_se->parent = parent; | ||
155 | INIT_LIST_HEAD(&rt_se->run_list); | ||
156 | } | ||
157 | |||
158 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
159 | { | ||
160 | struct rt_rq *rt_rq; | ||
161 | struct sched_rt_entity *rt_se; | ||
162 | int i; | ||
163 | |||
164 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
165 | if (!tg->rt_rq) | ||
166 | goto err; | ||
167 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
168 | if (!tg->rt_se) | ||
169 | goto err; | ||
170 | |||
171 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
172 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
173 | |||
174 | for_each_possible_cpu(i) { | ||
175 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
176 | GFP_KERNEL, cpu_to_node(i)); | ||
177 | if (!rt_rq) | ||
178 | goto err; | ||
179 | |||
180 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
181 | GFP_KERNEL, cpu_to_node(i)); | ||
182 | if (!rt_se) | ||
183 | goto err_free_rq; | ||
184 | |||
185 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
186 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
187 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
188 | } | ||
189 | |||
190 | return 1; | ||
191 | |||
192 | err_free_rq: | ||
193 | kfree(rt_rq); | ||
194 | err: | ||
195 | return 0; | ||
196 | } | ||
197 | |||
28 | #else /* CONFIG_RT_GROUP_SCHED */ | 198 | #else /* CONFIG_RT_GROUP_SCHED */ |
29 | 199 | ||
30 | #define rt_entity_is_task(rt_se) (1) | 200 | #define rt_entity_is_task(rt_se) (1) |
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
47 | return &rq->rt; | 217 | return &rq->rt; |
48 | } | 218 | } |
49 | 219 | ||
220 | void free_rt_sched_group(struct task_group *tg) { } | ||
221 | |||
222 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
223 | { | ||
224 | return 1; | ||
225 | } | ||
50 | #endif /* CONFIG_RT_GROUP_SCHED */ | 226 | #endif /* CONFIG_RT_GROUP_SCHED */ |
51 | 227 | ||
52 | #ifdef CONFIG_SMP | 228 | #ifdef CONFIG_SMP |
@@ -556,10 +732,35 @@ static void enable_runtime(struct rq *rq) | |||
556 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 732 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
557 | } | 733 | } |
558 | 734 | ||
735 | int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
736 | { | ||
737 | int cpu = (int)(long)hcpu; | ||
738 | |||
739 | switch (action) { | ||
740 | case CPU_DOWN_PREPARE: | ||
741 | case CPU_DOWN_PREPARE_FROZEN: | ||
742 | disable_runtime(cpu_rq(cpu)); | ||
743 | return NOTIFY_OK; | ||
744 | |||
745 | case CPU_DOWN_FAILED: | ||
746 | case CPU_DOWN_FAILED_FROZEN: | ||
747 | case CPU_ONLINE: | ||
748 | case CPU_ONLINE_FROZEN: | ||
749 | enable_runtime(cpu_rq(cpu)); | ||
750 | return NOTIFY_OK; | ||
751 | |||
752 | default: | ||
753 | return NOTIFY_DONE; | ||
754 | } | ||
755 | } | ||
756 | |||
559 | static int balance_runtime(struct rt_rq *rt_rq) | 757 | static int balance_runtime(struct rt_rq *rt_rq) |
560 | { | 758 | { |
561 | int more = 0; | 759 | int more = 0; |
562 | 760 | ||
761 | if (!sched_feat(RT_RUNTIME_SHARE)) | ||
762 | return more; | ||
763 | |||
563 | if (rt_rq->rt_time > rt_rq->rt_runtime) { | 764 | if (rt_rq->rt_time > rt_rq->rt_runtime) { |
564 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 765 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
565 | more = do_balance_runtime(rt_rq); | 766 | more = do_balance_runtime(rt_rq); |
@@ -645,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
645 | if (rt_rq->rt_throttled) | 846 | if (rt_rq->rt_throttled) |
646 | return rt_rq_throttled(rt_rq); | 847 | return rt_rq_throttled(rt_rq); |
647 | 848 | ||
648 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | 849 | if (runtime >= sched_rt_period(rt_rq)) |
649 | return 0; | 850 | return 0; |
650 | 851 | ||
651 | balance_runtime(rt_rq); | 852 | balance_runtime(rt_rq); |
@@ -954,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
954 | } | 1155 | } |
955 | 1156 | ||
956 | /* | 1157 | /* |
957 | * Put task to the end of the run list without the overhead of dequeue | 1158 | * Put task to the head or the end of the run list without the overhead of |
958 | * followed by enqueue. | 1159 | * dequeue followed by enqueue. |
959 | */ | 1160 | */ |
960 | static void | 1161 | static void |
961 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) | 1162 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) |
@@ -999,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
999 | 1200 | ||
1000 | cpu = task_cpu(p); | 1201 | cpu = task_cpu(p); |
1001 | 1202 | ||
1203 | if (p->rt.nr_cpus_allowed == 1) | ||
1204 | goto out; | ||
1205 | |||
1002 | /* For anything but wake ups, just return the task_cpu */ | 1206 | /* For anything but wake ups, just return the task_cpu */ |
1003 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 1207 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) |
1004 | goto out; | 1208 | goto out; |
@@ -1175,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1175 | /* Only try algorithms three times */ | 1379 | /* Only try algorithms three times */ |
1176 | #define RT_MAX_TRIES 3 | 1380 | #define RT_MAX_TRIES 3 |
1177 | 1381 | ||
1178 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | ||
1179 | |||
1180 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1382 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
1181 | { | 1383 | { |
1182 | if (!task_running(rq, p) && | 1384 | if (!task_running(rq, p) && |
@@ -1385,6 +1587,11 @@ static int push_rt_task(struct rq *rq) | |||
1385 | if (!next_task) | 1587 | if (!next_task) |
1386 | return 0; | 1588 | return 0; |
1387 | 1589 | ||
1590 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1591 | if (unlikely(task_running(rq, next_task))) | ||
1592 | return 0; | ||
1593 | #endif | ||
1594 | |||
1388 | retry: | 1595 | retry: |
1389 | if (unlikely(next_task == rq->curr)) { | 1596 | if (unlikely(next_task == rq->curr)) { |
1390 | WARN_ON(1); | 1597 | WARN_ON(1); |
@@ -1650,13 +1857,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1650 | pull_rt_task(rq); | 1857 | pull_rt_task(rq); |
1651 | } | 1858 | } |
1652 | 1859 | ||
1653 | static inline void init_sched_rt_class(void) | 1860 | void init_sched_rt_class(void) |
1654 | { | 1861 | { |
1655 | unsigned int i; | 1862 | unsigned int i; |
1656 | 1863 | ||
1657 | for_each_possible_cpu(i) | 1864 | for_each_possible_cpu(i) { |
1658 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), | 1865 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), |
1659 | GFP_KERNEL, cpu_to_node(i)); | 1866 | GFP_KERNEL, cpu_to_node(i)); |
1867 | } | ||
1660 | } | 1868 | } |
1661 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
1662 | 1870 | ||
@@ -1797,7 +2005,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | |||
1797 | return 0; | 2005 | return 0; |
1798 | } | 2006 | } |
1799 | 2007 | ||
1800 | static const struct sched_class rt_sched_class = { | 2008 | const struct sched_class rt_sched_class = { |
1801 | .next = &fair_sched_class, | 2009 | .next = &fair_sched_class, |
1802 | .enqueue_task = enqueue_task_rt, | 2010 | .enqueue_task = enqueue_task_rt, |
1803 | .dequeue_task = dequeue_task_rt, | 2011 | .dequeue_task = dequeue_task_rt, |
@@ -1832,7 +2040,7 @@ static const struct sched_class rt_sched_class = { | |||
1832 | #ifdef CONFIG_SCHED_DEBUG | 2040 | #ifdef CONFIG_SCHED_DEBUG |
1833 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | 2041 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); |
1834 | 2042 | ||
1835 | static void print_rt_stats(struct seq_file *m, int cpu) | 2043 | void print_rt_stats(struct seq_file *m, int cpu) |
1836 | { | 2044 | { |
1837 | rt_rq_iter_t iter; | 2045 | rt_rq_iter_t iter; |
1838 | struct rt_rq *rt_rq; | 2046 | struct rt_rq *rt_rq; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 000000000000..98c0c2623db8 --- /dev/null +++ b/kernel/sched/sched.h | |||
@@ -0,0 +1,1166 @@ | |||
1 | |||
2 | #include <linux/sched.h> | ||
3 | #include <linux/mutex.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/stop_machine.h> | ||
6 | |||
7 | #include "cpupri.h" | ||
8 | |||
9 | extern __read_mostly int scheduler_running; | ||
10 | |||
11 | /* | ||
12 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
13 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
14 | * and back. | ||
15 | */ | ||
16 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
17 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
18 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
19 | |||
20 | /* | ||
21 | * 'User priority' is the nice value converted to something we | ||
22 | * can work with better when scaling various scheduler parameters, | ||
23 | * it's a [ 0 ... 39 ] range. | ||
24 | */ | ||
25 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
26 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
27 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
28 | |||
29 | /* | ||
30 | * Helpers for converting nanosecond timing to jiffy resolution | ||
31 | */ | ||
32 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
33 | |||
34 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
35 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
36 | |||
37 | /* | ||
38 | * These are the 'tuning knobs' of the scheduler: | ||
39 | * | ||
40 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
41 | * Timeslices get refilled after they expire. | ||
42 | */ | ||
43 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
44 | |||
45 | /* | ||
46 | * single value that denotes runtime == period, ie unlimited time. | ||
47 | */ | ||
48 | #define RUNTIME_INF ((u64)~0ULL) | ||
49 | |||
50 | static inline int rt_policy(int policy) | ||
51 | { | ||
52 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
53 | return 1; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static inline int task_has_rt_policy(struct task_struct *p) | ||
58 | { | ||
59 | return rt_policy(p->policy); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This is the priority-queue data structure of the RT scheduling class: | ||
64 | */ | ||
65 | struct rt_prio_array { | ||
66 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
67 | struct list_head queue[MAX_RT_PRIO]; | ||
68 | }; | ||
69 | |||
70 | struct rt_bandwidth { | ||
71 | /* nests inside the rq lock: */ | ||
72 | raw_spinlock_t rt_runtime_lock; | ||
73 | ktime_t rt_period; | ||
74 | u64 rt_runtime; | ||
75 | struct hrtimer rt_period_timer; | ||
76 | }; | ||
77 | |||
78 | extern struct mutex sched_domains_mutex; | ||
79 | |||
80 | #ifdef CONFIG_CGROUP_SCHED | ||
81 | |||
82 | #include <linux/cgroup.h> | ||
83 | |||
84 | struct cfs_rq; | ||
85 | struct rt_rq; | ||
86 | |||
87 | static LIST_HEAD(task_groups); | ||
88 | |||
89 | struct cfs_bandwidth { | ||
90 | #ifdef CONFIG_CFS_BANDWIDTH | ||
91 | raw_spinlock_t lock; | ||
92 | ktime_t period; | ||
93 | u64 quota, runtime; | ||
94 | s64 hierarchal_quota; | ||
95 | u64 runtime_expires; | ||
96 | |||
97 | int idle, timer_active; | ||
98 | struct hrtimer period_timer, slack_timer; | ||
99 | struct list_head throttled_cfs_rq; | ||
100 | |||
101 | /* statistics */ | ||
102 | int nr_periods, nr_throttled; | ||
103 | u64 throttled_time; | ||
104 | #endif | ||
105 | }; | ||
106 | |||
107 | /* task group related information */ | ||
108 | struct task_group { | ||
109 | struct cgroup_subsys_state css; | ||
110 | |||
111 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
112 | /* schedulable entities of this group on each cpu */ | ||
113 | struct sched_entity **se; | ||
114 | /* runqueue "owned" by this group on each cpu */ | ||
115 | struct cfs_rq **cfs_rq; | ||
116 | unsigned long shares; | ||
117 | |||
118 | atomic_t load_weight; | ||
119 | #endif | ||
120 | |||
121 | #ifdef CONFIG_RT_GROUP_SCHED | ||
122 | struct sched_rt_entity **rt_se; | ||
123 | struct rt_rq **rt_rq; | ||
124 | |||
125 | struct rt_bandwidth rt_bandwidth; | ||
126 | #endif | ||
127 | |||
128 | struct rcu_head rcu; | ||
129 | struct list_head list; | ||
130 | |||
131 | struct task_group *parent; | ||
132 | struct list_head siblings; | ||
133 | struct list_head children; | ||
134 | |||
135 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
136 | struct autogroup *autogroup; | ||
137 | #endif | ||
138 | |||
139 | struct cfs_bandwidth cfs_bandwidth; | ||
140 | }; | ||
141 | |||
142 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
143 | #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
144 | |||
145 | /* | ||
146 | * A weight of 0 or 1 can cause arithmetic problems. | ||
147 | * The weight of a cfs_rq is the sum of the weights of the entities | ||
148 | * queued on this cfs_rq, so the weight of an entity should not be | ||
149 | * too large, and neither should the shares value of a task group. | ||
150 | * (The default weight is 1024 - so there's no practical | ||
151 | * limitation from this.) | ||
152 | */ | ||
153 | #define MIN_SHARES (1UL << 1) | ||
154 | #define MAX_SHARES (1UL << 18) | ||
155 | #endif | ||
156 | |||
157 | /* Default task group. | ||
158 | * Every task in the system belongs to this group at bootup. | ||
159 | */ | ||
160 | extern struct task_group root_task_group; | ||
161 | |||
162 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
163 | |||
164 | extern int walk_tg_tree_from(struct task_group *from, | ||
165 | tg_visitor down, tg_visitor up, void *data); | ||
166 | |||
167 | /* | ||
168 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
169 | * leaving it for the final time. | ||
170 | * | ||
171 | * Caller must hold rcu_lock or sufficient equivalent. | ||
172 | */ | ||
173 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
174 | { | ||
175 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
176 | } | ||
177 | |||
178 | extern int tg_nop(struct task_group *tg, void *data); | ||
179 | |||
180 | extern void free_fair_sched_group(struct task_group *tg); | ||
181 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | ||
182 | extern void unregister_fair_sched_group(struct task_group *tg, int cpu); | ||
183 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
184 | struct sched_entity *se, int cpu, | ||
185 | struct sched_entity *parent); | ||
186 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
187 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
188 | |||
189 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | ||
190 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
191 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); | ||
192 | |||
193 | extern void free_rt_sched_group(struct task_group *tg); | ||
194 | extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); | ||
195 | extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
196 | struct sched_rt_entity *rt_se, int cpu, | ||
197 | struct sched_rt_entity *parent); | ||
198 | |||
199 | #else /* CONFIG_CGROUP_SCHED */ | ||
200 | |||
201 | struct cfs_bandwidth { }; | ||
202 | |||
203 | #endif /* CONFIG_CGROUP_SCHED */ | ||
204 | |||
205 | /* CFS-related fields in a runqueue */ | ||
206 | struct cfs_rq { | ||
207 | struct load_weight load; | ||
208 | unsigned long nr_running, h_nr_running; | ||
209 | |||
210 | u64 exec_clock; | ||
211 | u64 min_vruntime; | ||
212 | #ifndef CONFIG_64BIT | ||
213 | u64 min_vruntime_copy; | ||
214 | #endif | ||
215 | |||
216 | struct rb_root tasks_timeline; | ||
217 | struct rb_node *rb_leftmost; | ||
218 | |||
219 | struct list_head tasks; | ||
220 | struct list_head *balance_iterator; | ||
221 | |||
222 | /* | ||
223 | * 'curr' points to currently running entity on this cfs_rq. | ||
224 | * It is set to NULL otherwise (i.e when none are currently running). | ||
225 | */ | ||
226 | struct sched_entity *curr, *next, *last, *skip; | ||
227 | |||
228 | #ifdef CONFIG_SCHED_DEBUG | ||
229 | unsigned int nr_spread_over; | ||
230 | #endif | ||
231 | |||
232 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
233 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
234 | |||
235 | /* | ||
236 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
237 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
238 | * (like users, containers etc.) | ||
239 | * | ||
240 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
241 | * list is used during load balance. | ||
242 | */ | ||
243 | int on_list; | ||
244 | struct list_head leaf_cfs_rq_list; | ||
245 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
246 | |||
247 | #ifdef CONFIG_SMP | ||
248 | /* | ||
249 | * the part of load.weight contributed by tasks | ||
250 | */ | ||
251 | unsigned long task_weight; | ||
252 | |||
253 | /* | ||
254 | * h_load = weight * f(tg) | ||
255 | * | ||
256 | * Where f(tg) is the recursive weight fraction assigned to | ||
257 | * this group. | ||
258 | */ | ||
259 | unsigned long h_load; | ||
260 | |||
261 | /* | ||
262 | * Maintaining per-cpu shares distribution for group scheduling | ||
263 | * | ||
264 | * load_stamp is the last time we updated the load average | ||
265 | * load_last is the last time we updated the load average and saw load | ||
266 | * load_unacc_exec_time is currently unaccounted execution time | ||
267 | */ | ||
268 | u64 load_avg; | ||
269 | u64 load_period; | ||
270 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
271 | |||
272 | unsigned long load_contribution; | ||
273 | #endif /* CONFIG_SMP */ | ||
274 | #ifdef CONFIG_CFS_BANDWIDTH | ||
275 | int runtime_enabled; | ||
276 | u64 runtime_expires; | ||
277 | s64 runtime_remaining; | ||
278 | |||
279 | u64 throttled_timestamp; | ||
280 | int throttled, throttle_count; | ||
281 | struct list_head throttled_list; | ||
282 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
283 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
284 | }; | ||
285 | |||
286 | static inline int rt_bandwidth_enabled(void) | ||
287 | { | ||
288 | return sysctl_sched_rt_runtime >= 0; | ||
289 | } | ||
290 | |||
291 | /* Real-Time class related fields in a runqueue: */ | ||
292 | struct rt_rq { | ||
293 | struct rt_prio_array active; | ||
294 | unsigned long rt_nr_running; | ||
295 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
296 | struct { | ||
297 | int curr; /* highest queued rt task prio */ | ||
298 | #ifdef CONFIG_SMP | ||
299 | int next; /* next highest */ | ||
300 | #endif | ||
301 | } highest_prio; | ||
302 | #endif | ||
303 | #ifdef CONFIG_SMP | ||
304 | unsigned long rt_nr_migratory; | ||
305 | unsigned long rt_nr_total; | ||
306 | int overloaded; | ||
307 | struct plist_head pushable_tasks; | ||
308 | #endif | ||
309 | int rt_throttled; | ||
310 | u64 rt_time; | ||
311 | u64 rt_runtime; | ||
312 | /* Nests inside the rq lock: */ | ||
313 | raw_spinlock_t rt_runtime_lock; | ||
314 | |||
315 | #ifdef CONFIG_RT_GROUP_SCHED | ||
316 | unsigned long rt_nr_boosted; | ||
317 | |||
318 | struct rq *rq; | ||
319 | struct list_head leaf_rt_rq_list; | ||
320 | struct task_group *tg; | ||
321 | #endif | ||
322 | }; | ||
323 | |||
324 | #ifdef CONFIG_SMP | ||
325 | |||
326 | /* | ||
327 | * We add the notion of a root-domain which will be used to define per-domain | ||
328 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
329 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
330 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
331 | * object. | ||
332 | * | ||
333 | */ | ||
334 | struct root_domain { | ||
335 | atomic_t refcount; | ||
336 | atomic_t rto_count; | ||
337 | struct rcu_head rcu; | ||
338 | cpumask_var_t span; | ||
339 | cpumask_var_t online; | ||
340 | |||
341 | /* | ||
342 | * The "RT overload" flag: it gets set if a CPU has more than | ||
343 | * one runnable RT task. | ||
344 | */ | ||
345 | cpumask_var_t rto_mask; | ||
346 | struct cpupri cpupri; | ||
347 | }; | ||
348 | |||
349 | extern struct root_domain def_root_domain; | ||
350 | |||
351 | #endif /* CONFIG_SMP */ | ||
352 | |||
353 | /* | ||
354 | * This is the main, per-CPU runqueue data structure. | ||
355 | * | ||
356 | * Locking rule: those places that want to lock multiple runqueues | ||
357 | * (such as the load balancing or the thread migration code), lock | ||
358 | * acquire operations must be ordered by ascending &runqueue. | ||
359 | */ | ||
360 | struct rq { | ||
361 | /* runqueue lock: */ | ||
362 | raw_spinlock_t lock; | ||
363 | |||
364 | /* | ||
365 | * nr_running and cpu_load should be in the same cacheline because | ||
366 | * remote CPUs use both these fields when doing load calculation. | ||
367 | */ | ||
368 | unsigned long nr_running; | ||
369 | #define CPU_LOAD_IDX_MAX 5 | ||
370 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
371 | unsigned long last_load_update_tick; | ||
372 | #ifdef CONFIG_NO_HZ | ||
373 | u64 nohz_stamp; | ||
374 | unsigned long nohz_flags; | ||
375 | #endif | ||
376 | int skip_clock_update; | ||
377 | |||
378 | /* capture load from *all* tasks on this cpu: */ | ||
379 | struct load_weight load; | ||
380 | unsigned long nr_load_updates; | ||
381 | u64 nr_switches; | ||
382 | |||
383 | struct cfs_rq cfs; | ||
384 | struct rt_rq rt; | ||
385 | |||
386 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
387 | /* list of leaf cfs_rq on this cpu: */ | ||
388 | struct list_head leaf_cfs_rq_list; | ||
389 | #endif | ||
390 | #ifdef CONFIG_RT_GROUP_SCHED | ||
391 | struct list_head leaf_rt_rq_list; | ||
392 | #endif | ||
393 | |||
394 | /* | ||
395 | * This is part of a global counter where only the total sum | ||
396 | * over all CPUs matters. A task can increase this counter on | ||
397 | * one CPU and if it got migrated afterwards it may decrease | ||
398 | * it on another CPU. Always updated under the runqueue lock: | ||
399 | */ | ||
400 | unsigned long nr_uninterruptible; | ||
401 | |||
402 | struct task_struct *curr, *idle, *stop; | ||
403 | unsigned long next_balance; | ||
404 | struct mm_struct *prev_mm; | ||
405 | |||
406 | u64 clock; | ||
407 | u64 clock_task; | ||
408 | |||
409 | atomic_t nr_iowait; | ||
410 | |||
411 | #ifdef CONFIG_SMP | ||
412 | struct root_domain *rd; | ||
413 | struct sched_domain *sd; | ||
414 | |||
415 | unsigned long cpu_power; | ||
416 | |||
417 | unsigned char idle_balance; | ||
418 | /* For active balancing */ | ||
419 | int post_schedule; | ||
420 | int active_balance; | ||
421 | int push_cpu; | ||
422 | struct cpu_stop_work active_balance_work; | ||
423 | /* cpu of this runqueue: */ | ||
424 | int cpu; | ||
425 | int online; | ||
426 | |||
427 | u64 rt_avg; | ||
428 | u64 age_stamp; | ||
429 | u64 idle_stamp; | ||
430 | u64 avg_idle; | ||
431 | #endif | ||
432 | |||
433 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
434 | u64 prev_irq_time; | ||
435 | #endif | ||
436 | #ifdef CONFIG_PARAVIRT | ||
437 | u64 prev_steal_time; | ||
438 | #endif | ||
439 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
440 | u64 prev_steal_time_rq; | ||
441 | #endif | ||
442 | |||
443 | /* calc_load related fields */ | ||
444 | unsigned long calc_load_update; | ||
445 | long calc_load_active; | ||
446 | |||
447 | #ifdef CONFIG_SCHED_HRTICK | ||
448 | #ifdef CONFIG_SMP | ||
449 | int hrtick_csd_pending; | ||
450 | struct call_single_data hrtick_csd; | ||
451 | #endif | ||
452 | struct hrtimer hrtick_timer; | ||
453 | #endif | ||
454 | |||
455 | #ifdef CONFIG_SCHEDSTATS | ||
456 | /* latency stats */ | ||
457 | struct sched_info rq_sched_info; | ||
458 | unsigned long long rq_cpu_time; | ||
459 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | ||
460 | |||
461 | /* sys_sched_yield() stats */ | ||
462 | unsigned int yld_count; | ||
463 | |||
464 | /* schedule() stats */ | ||
465 | unsigned int sched_switch; | ||
466 | unsigned int sched_count; | ||
467 | unsigned int sched_goidle; | ||
468 | |||
469 | /* try_to_wake_up() stats */ | ||
470 | unsigned int ttwu_count; | ||
471 | unsigned int ttwu_local; | ||
472 | #endif | ||
473 | |||
474 | #ifdef CONFIG_SMP | ||
475 | struct llist_head wake_list; | ||
476 | #endif | ||
477 | }; | ||
478 | |||
479 | static inline int cpu_of(struct rq *rq) | ||
480 | { | ||
481 | #ifdef CONFIG_SMP | ||
482 | return rq->cpu; | ||
483 | #else | ||
484 | return 0; | ||
485 | #endif | ||
486 | } | ||
487 | |||
488 | DECLARE_PER_CPU(struct rq, runqueues); | ||
489 | |||
490 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
491 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
492 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
493 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
494 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
495 | |||
496 | #ifdef CONFIG_SMP | ||
497 | |||
498 | #define rcu_dereference_check_sched_domain(p) \ | ||
499 | rcu_dereference_check((p), \ | ||
500 | lockdep_is_held(&sched_domains_mutex)) | ||
501 | |||
502 | /* | ||
503 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
504 | * See detach_destroy_domains: synchronize_sched for details. | ||
505 | * | ||
506 | * The domain tree of any CPU may only be accessed from within | ||
507 | * preempt-disabled sections. | ||
508 | */ | ||
509 | #define for_each_domain(cpu, __sd) \ | ||
510 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ | ||
511 | __sd; __sd = __sd->parent) | ||
512 | |||
513 | #define for_each_lower_domain(sd) for (; sd; sd = sd->child) | ||
514 | |||
515 | /** | ||
516 | * highest_flag_domain - Return highest sched_domain containing flag. | ||
517 | * @cpu: The cpu whose highest level of sched domain is to | ||
518 | * be returned. | ||
519 | * @flag: The flag to check for the highest sched_domain | ||
520 | * for the given cpu. | ||
521 | * | ||
522 | * Returns the highest sched_domain of a cpu which contains the given flag. | ||
523 | */ | ||
524 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | ||
525 | { | ||
526 | struct sched_domain *sd, *hsd = NULL; | ||
527 | |||
528 | for_each_domain(cpu, sd) { | ||
529 | if (!(sd->flags & flag)) | ||
530 | break; | ||
531 | hsd = sd; | ||
532 | } | ||
533 | |||
534 | return hsd; | ||
535 | } | ||
536 | |||
537 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | ||
538 | DECLARE_PER_CPU(int, sd_llc_id); | ||
539 | |||
540 | #endif /* CONFIG_SMP */ | ||
541 | |||
542 | #include "stats.h" | ||
543 | #include "auto_group.h" | ||
544 | |||
545 | #ifdef CONFIG_CGROUP_SCHED | ||
546 | |||
547 | /* | ||
548 | * Return the group to which this task belongs. | ||
549 | * | ||
550 | * We use task_subsys_state_check() and extend the RCU verification with | ||
551 | * p->pi_lock and rq->lock because cpu_cgroup_attach() holds those locks for | ||
552 | * each task it moves into the cgroup. Therefore by holding either of those | ||
553 | * locks, we pin the task to the current cgroup. | ||
554 | */ | ||
555 | static inline struct task_group *task_group(struct task_struct *p) | ||
556 | { | ||
557 | struct task_group *tg; | ||
558 | struct cgroup_subsys_state *css; | ||
559 | |||
560 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
561 | lockdep_is_held(&p->pi_lock) || | ||
562 | lockdep_is_held(&task_rq(p)->lock)); | ||
563 | tg = container_of(css, struct task_group, css); | ||
564 | |||
565 | return autogroup_task_group(p, tg); | ||
566 | } | ||
567 | |||
/*
 * Point the task's scheduling entities at the per-cpu runqueues and parent
 * entities of its task group for @cpu. Needed when a task moves across
 * CPUs or groups.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	struct task_group *grp = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* CFS side: runqueue and parent entity of the group on @cpu. */
	p->se.cfs_rq = grp->cfs_rq[cpu];
	p->se.parent = grp->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/* RT side: same, for the real-time entity. */
	p->rt.rt_rq = grp->rt_rq[cpu];
	p->rt.parent = grp->rt_se[cpu];
#endif
}
585 | |||
586 | #else /* CONFIG_CGROUP_SCHED */ | ||
587 | |||
588 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | ||
589 | static inline struct task_group *task_group(struct task_struct *p) | ||
590 | { | ||
591 | return NULL; | ||
592 | } | ||
593 | |||
594 | #endif /* CONFIG_CGROUP_SCHED */ | ||
595 | |||
/*
 * Record @cpu as the CPU of task @p.
 *
 * set_task_rq() is called first to update the per-task group pointers;
 * on SMP the thread_info ->cpu field is written afterwards, behind a
 * write barrier.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
609 | |||
610 | /* | ||
611 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
612 | */ | ||
613 | #ifdef CONFIG_SCHED_DEBUG | ||
614 | # include <linux/jump_label.h> | ||
615 | # define const_debug __read_mostly | ||
616 | #else | ||
617 | # define const_debug const | ||
618 | #endif | ||
619 | |||
620 | extern const_debug unsigned int sysctl_sched_features; | ||
621 | |||
622 | #define SCHED_FEAT(name, enabled) \ | ||
623 | __SCHED_FEAT_##name , | ||
624 | |||
625 | enum { | ||
626 | #include "features.h" | ||
627 | __SCHED_FEAT_NR, | ||
628 | }; | ||
629 | |||
630 | #undef SCHED_FEAT | ||
631 | |||
632 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | ||
633 | static __always_inline bool static_branch__true(struct jump_label_key *key) | ||
634 | { | ||
635 | return likely(static_branch(key)); /* Not out of line branch. */ | ||
636 | } | ||
637 | |||
638 | static __always_inline bool static_branch__false(struct jump_label_key *key) | ||
639 | { | ||
640 | return unlikely(static_branch(key)); /* Out of line branch. */ | ||
641 | } | ||
642 | |||
643 | #define SCHED_FEAT(name, enabled) \ | ||
644 | static __always_inline bool static_branch_##name(struct jump_label_key *key) \ | ||
645 | { \ | ||
646 | return static_branch__##enabled(key); \ | ||
647 | } | ||
648 | |||
649 | #include "features.h" | ||
650 | |||
651 | #undef SCHED_FEAT | ||
652 | |||
653 | extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; | ||
654 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) | ||
655 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ | ||
656 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
657 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | ||
658 | |||
659 | static inline u64 global_rt_period(void) | ||
660 | { | ||
661 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
662 | } | ||
663 | |||
664 | static inline u64 global_rt_runtime(void) | ||
665 | { | ||
666 | if (sysctl_sched_rt_runtime < 0) | ||
667 | return RUNTIME_INF; | ||
668 | |||
669 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
670 | } | ||
671 | |||
672 | |||
673 | |||
674 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
675 | { | ||
676 | return rq->curr == p; | ||
677 | } | ||
678 | |||
/*
 * Non-zero when @p is running: on SMP this is p->on_cpu, otherwise it is
 * whether @p is the current task of @rq.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->on_cpu;
#endif
}
687 | |||
688 | |||
689 | #ifndef prepare_arch_switch | ||
690 | # define prepare_arch_switch(next) do { } while (0) | ||
691 | #endif | ||
692 | #ifndef finish_arch_switch | ||
693 | # define finish_arch_switch(prev) do { } while (0) | ||
694 | #endif | ||
695 | |||
696 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
/*
 * Variant used when __ARCH_WANT_UNLOCKED_CTXSW is not defined: only flags
 * @next as being on a CPU (SMP builds); rq->lock is not touched here.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->on_cpu = 1;
#endif
}
708 | |||
/*
 * Variant used when __ARCH_WANT_UNLOCKED_CTXSW is not defined: clears
 * prev->on_cpu (SMP builds), fixes up the debug/lockdep ownership of
 * rq->lock and finally drops it with raw_spin_unlock_irq().
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->on_cpu = 0;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	raw_spin_unlock_irq(&rq->lock);
}
733 | |||
734 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
/*
 * Variant used when __ARCH_WANT_UNLOCKED_CTXSW is defined: flags @next as
 * being on a CPU (SMP builds) and then releases rq->lock. The _irq unlock
 * is used only when __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined; otherwise
 * a plain raw_spin_unlock() is used.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	raw_spin_unlock_irq(&rq->lock);
#else
	raw_spin_unlock(&rq->lock);
#endif
}
751 | |||
/*
 * Variant used when __ARCH_WANT_UNLOCKED_CTXSW is defined: clears
 * prev->on_cpu (SMP builds) and calls local_irq_enable() unless
 * __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined. rq->lock is not touched here.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
767 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
768 | |||
769 | |||
770 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
771 | { | ||
772 | lw->weight += inc; | ||
773 | lw->inv_weight = 0; | ||
774 | } | ||
775 | |||
776 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
777 | { | ||
778 | lw->weight -= dec; | ||
779 | lw->inv_weight = 0; | ||
780 | } | ||
781 | |||
782 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
783 | { | ||
784 | lw->weight = w; | ||
785 | lw->inv_weight = 0; | ||
786 | } | ||
787 | |||
788 | /* | ||
789 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
790 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
791 | * each task makes to its run queue's load is weighted according to its | ||
792 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
793 | * scaled version of the new time slice allocation that they receive on time | ||
794 | * slice expiry etc. | ||
795 | */ | ||
796 | |||
797 | #define WEIGHT_IDLEPRIO 3 | ||
798 | #define WMULT_IDLEPRIO 1431655765 | ||
799 | |||
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 *
 * The table has one entry per nice level, starting at nice -20, so the
 * entry for nice 0 (weight 1024) sits at index 20.
 */
static const int prio_to_weight[40] = {
	88761, 71755, 56483, 46273, 36291,	/* nice -20 .. -16 */
	29154, 23254, 18705, 14949, 11916,	/* nice -15 .. -11 */
	 9548,  7620,  6100,  4904,  3906,	/* nice -10 ..  -6 */
	 3121,  2501,  1991,  1586,  1277,	/* nice  -5 ..  -1 */
	 1024,   820,   655,   526,   423,	/* nice   0 ..   4 */
	  335,   272,   215,   172,   137,	/* nice   5 ..   9 */
	  110,    87,    70,    56,    45,	/* nice  10 ..  14 */
	   36,    29,    23,    18,    15,	/* nice  15 ..  19 */
};
822 | |||
823 | /* | ||
824 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
825 | * | ||
826 | * In cases where the weight does not change often, we can use the | ||
827 | * precalculated inverse to speed up arithmetics by turning divisions | ||
828 | * into multiplications: | ||
829 | */ | ||
830 | static const u32 prio_to_wmult[40] = { | ||
831 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
832 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
833 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
834 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
835 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
836 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
837 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
838 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
839 | }; | ||
840 | |||
/*
 * Index into the per-group cpu accounting statistics: time spent by the
 * tasks of the cpu accounting group executing in a given mode.
 */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER = 0,		/* time in user mode */
	CPUACCT_STAT_SYSTEM = 1,	/* time in kernel mode */

	CPUACCT_STAT_NSTATS = 2,	/* number of entries above */
};
848 | |||
849 | |||
850 | #define sched_class_highest (&stop_sched_class) | ||
851 | #define for_each_class(class) \ | ||
852 | for (class = sched_class_highest; class; class = class->next) | ||
853 | |||
854 | extern const struct sched_class stop_sched_class; | ||
855 | extern const struct sched_class rt_sched_class; | ||
856 | extern const struct sched_class fair_sched_class; | ||
857 | extern const struct sched_class idle_sched_class; | ||
858 | |||
859 | |||
860 | #ifdef CONFIG_SMP | ||
861 | |||
862 | extern void trigger_load_balance(struct rq *rq, int cpu); | ||
863 | extern void idle_balance(int this_cpu, struct rq *this_rq); | ||
864 | |||
865 | #else /* CONFIG_SMP */ | ||
866 | |||
867 | static inline void idle_balance(int cpu, struct rq *rq) | ||
868 | { | ||
869 | } | ||
870 | |||
871 | #endif | ||
872 | |||
873 | extern void sysrq_sched_debug_show(void); | ||
874 | extern void sched_init_granularity(void); | ||
875 | extern void update_max_interval(void); | ||
876 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
877 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | ||
878 | extern void init_sched_rt_class(void); | ||
879 | extern void init_sched_fair_class(void); | ||
880 | |||
881 | extern void resched_task(struct task_struct *p); | ||
882 | extern void resched_cpu(int cpu); | ||
883 | |||
884 | extern struct rt_bandwidth def_rt_bandwidth; | ||
885 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | ||
886 | |||
887 | extern void update_cpu_load(struct rq *this_rq); | ||
888 | |||
889 | #ifdef CONFIG_CGROUP_CPUACCT | ||
890 | #include <linux/cgroup.h> | ||
891 | /* track cpu usage of a group of tasks and its child groups */ | ||
892 | struct cpuacct { | ||
893 | struct cgroup_subsys_state css; | ||
894 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
895 | u64 __percpu *cpuusage; | ||
896 | struct kernel_cpustat __percpu *cpustat; | ||
897 | }; | ||
898 | |||
899 | /* return cpu accounting group corresponding to this container */ | ||
900 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
901 | { | ||
902 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
903 | struct cpuacct, css); | ||
904 | } | ||
905 | |||
906 | /* return cpu accounting group to which this task belongs */ | ||
907 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
908 | { | ||
909 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
910 | struct cpuacct, css); | ||
911 | } | ||
912 | |||
913 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
914 | { | ||
915 | if (!ca || !ca->css.cgroup->parent) | ||
916 | return NULL; | ||
917 | return cgroup_ca(ca->css.cgroup->parent); | ||
918 | } | ||
919 | |||
920 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
921 | #else | ||
922 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
923 | #endif | ||
924 | |||
925 | static inline void inc_nr_running(struct rq *rq) | ||
926 | { | ||
927 | rq->nr_running++; | ||
928 | } | ||
929 | |||
930 | static inline void dec_nr_running(struct rq *rq) | ||
931 | { | ||
932 | rq->nr_running--; | ||
933 | } | ||
934 | |||
935 | extern void update_rq_clock(struct rq *rq); | ||
936 | |||
937 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | ||
938 | extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); | ||
939 | |||
940 | extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
941 | |||
942 | extern const_debug unsigned int sysctl_sched_time_avg; | ||
943 | extern const_debug unsigned int sysctl_sched_nr_migrate; | ||
944 | extern const_debug unsigned int sysctl_sched_migration_cost; | ||
945 | |||
946 | static inline u64 sched_avg_period(void) | ||
947 | { | ||
948 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
949 | } | ||
950 | |||
951 | void calc_load_account_idle(struct rq *this_rq); | ||
952 | |||
953 | #ifdef CONFIG_SCHED_HRTICK | ||
954 | |||
955 | /* | ||
956 | * Use hrtick when: | ||
957 | * - enabled by features | ||
958 | * - hrtimer is actually high res | ||
959 | */ | ||
960 | static inline int hrtick_enabled(struct rq *rq) | ||
961 | { | ||
962 | if (!sched_feat(HRTICK)) | ||
963 | return 0; | ||
964 | if (!cpu_active(cpu_of(rq))) | ||
965 | return 0; | ||
966 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
967 | } | ||
968 | |||
969 | void hrtick_start(struct rq *rq, u64 delay); | ||
970 | |||
971 | #else | ||
972 | |||
973 | static inline int hrtick_enabled(struct rq *rq) | ||
974 | { | ||
975 | return 0; | ||
976 | } | ||
977 | |||
978 | #endif /* CONFIG_SCHED_HRTICK */ | ||
979 | |||
980 | #ifdef CONFIG_SMP | ||
981 | extern void sched_avg_update(struct rq *rq); | ||
982 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
983 | { | ||
984 | rq->rt_avg += rt_delta; | ||
985 | sched_avg_update(rq); | ||
986 | } | ||
987 | #else | ||
988 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } | ||
989 | static inline void sched_avg_update(struct rq *rq) { } | ||
990 | #endif | ||
991 | |||
992 | extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); | ||
993 | |||
994 | #ifdef CONFIG_SMP | ||
995 | #ifdef CONFIG_PREEMPT | ||
996 | |||
997 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
998 | |||
/*
 * fair double_lock_balance: Safely acquires both rq->locks in a fair
 * way at the expense of forcing extra atomic operations in all
 * invocations. This assures that the double_lock is acquired using the
 * same underlying policy as the spinlock_t on this architecture, which
 * reduces latency compared to the unfair variant below. However, it
 * also adds more overhead and therefore may reduce throughput.
 *
 * this_rq->lock is always dropped and both locks are retaken through
 * double_rq_lock(), so this variant always returns 1.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	raw_spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	return 1;
}
1017 | |||
1018 | #else | ||
1019 | /* | ||
1020 | * Unfair double_lock_balance: Optimizes throughput at the expense of | ||
1021 | * latency by eliminating extra atomic operations when the locks are | ||
1022 | * already in proper order on entry. This favors lower cpu-ids and will | ||
1023 | * grant the double lock to lower cpus over higher ids under contention, | ||
1024 | * regardless of entry order into the function. | ||
1025 | */ | ||
1026 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1027 | __releases(this_rq->lock) | ||
1028 | __acquires(busiest->lock) | ||
1029 | __acquires(this_rq->lock) | ||
1030 | { | ||
1031 | int ret = 0; | ||
1032 | |||
1033 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | ||
1034 | if (busiest < this_rq) { | ||
1035 | raw_spin_unlock(&this_rq->lock); | ||
1036 | raw_spin_lock(&busiest->lock); | ||
1037 | raw_spin_lock_nested(&this_rq->lock, | ||
1038 | SINGLE_DEPTH_NESTING); | ||
1039 | ret = 1; | ||
1040 | } else | ||
1041 | raw_spin_lock_nested(&busiest->lock, | ||
1042 | SINGLE_DEPTH_NESTING); | ||
1043 | } | ||
1044 | return ret; | ||
1045 | } | ||
1046 | |||
1047 | #endif /* CONFIG_PREEMPT */ | ||
1048 | |||
1049 | /* | ||
1050 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1051 | */ | ||
1052 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1053 | { | ||
1054 | if (unlikely(!irqs_disabled())) { | ||
1055 | /* printk() doesn't work good under rq->lock */ | ||
1056 | raw_spin_unlock(&this_rq->lock); | ||
1057 | BUG_ON(1); | ||
1058 | } | ||
1059 | |||
1060 | return _double_lock_balance(this_rq, busiest); | ||
1061 | } | ||
1062 | |||
/*
 * Release busiest->lock, then reset the lockdep subclass of this_rq->lock
 * to 0. this_rq->lock itself is not released here.
 */
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	raw_spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
1069 | |||
1070 | /* | ||
1071 | * double_rq_lock - safely lock two runqueues | ||
1072 | * | ||
1073 | * Note this does not disable interrupts like task_rq_lock, | ||
1074 | * you need to do so manually before calling. | ||
1075 | */ | ||
1076 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1077 | __acquires(rq1->lock) | ||
1078 | __acquires(rq2->lock) | ||
1079 | { | ||
1080 | BUG_ON(!irqs_disabled()); | ||
1081 | if (rq1 == rq2) { | ||
1082 | raw_spin_lock(&rq1->lock); | ||
1083 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1084 | } else { | ||
1085 | if (rq1 < rq2) { | ||
1086 | raw_spin_lock(&rq1->lock); | ||
1087 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1088 | } else { | ||
1089 | raw_spin_lock(&rq2->lock); | ||
1090 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1091 | } | ||
1092 | } | ||
1093 | } | ||
1094 | |||
1095 | /* | ||
1096 | * double_rq_unlock - safely unlock two runqueues | ||
1097 | * | ||
1098 | * Note this does not restore interrupts like task_rq_unlock, | ||
1099 | * you need to do so manually after calling. | ||
1100 | */ | ||
1101 | static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1102 | __releases(rq1->lock) | ||
1103 | __releases(rq2->lock) | ||
1104 | { | ||
1105 | raw_spin_unlock(&rq1->lock); | ||
1106 | if (rq1 != rq2) | ||
1107 | raw_spin_unlock(&rq2->lock); | ||
1108 | else | ||
1109 | __release(rq2->lock); | ||
1110 | } | ||
1111 | |||
1112 | #else /* CONFIG_SMP */ | ||
1113 | |||
/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 *
 * !CONFIG_SMP variant: both arguments must be the same runqueue (enforced
 * with BUG_ON), so only one lock is actually taken.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	BUG_ON(rq1 != rq2);
	raw_spin_lock(&rq1->lock);
	__acquire(rq2->lock);	/* Fake it out ;) */
}
1129 | |||
/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 *
 * !CONFIG_SMP variant: both arguments must be the same runqueue (enforced
 * with BUG_ON), so only one lock is actually released.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	BUG_ON(rq1 != rq2);
	raw_spin_unlock(&rq1->lock);
	__release(rq2->lock);
}
1144 | |||
1145 | #endif | ||
1146 | |||
1147 | extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); | ||
1148 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); | ||
1149 | extern void print_cfs_stats(struct seq_file *m, int cpu); | ||
1150 | extern void print_rt_stats(struct seq_file *m, int cpu); | ||
1151 | |||
1152 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | ||
1153 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | ||
1154 | extern void unthrottle_offline_cfs_rqs(struct rq *rq); | ||
1155 | |||
1156 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | ||
1157 | |||
1158 | #ifdef CONFIG_NO_HZ | ||
/* Bit numbers used with the per-runqueue nohz_flags word. */
enum rq_nohz_flag_bits {
	NOHZ_TICK_STOPPED = 0,
	NOHZ_BALANCE_KICK = 1,
	NOHZ_IDLE = 2,
};
1164 | |||
1165 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | ||
1166 | #endif | ||
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 000000000000..2a581ba8e190 --- /dev/null +++ b/kernel/sched/stats.c | |||
@@ -0,0 +1,111 @@ | |||
1 | |||
2 | #include <linux/slab.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/seq_file.h> | ||
5 | #include <linux/proc_fs.h> | ||
6 | |||
7 | #include "sched.h" | ||
8 | |||
9 | /* | ||
10 | * bump this up when changing the output format or the meaning of an existing | ||
11 | * format, so that tools can adapt (or abort) | ||
12 | */ | ||
13 | #define SCHEDSTAT_VERSION 15 | ||
14 | |||
15 | static int show_schedstat(struct seq_file *seq, void *v) | ||
16 | { | ||
17 | int cpu; | ||
18 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
19 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
20 | |||
21 | if (mask_str == NULL) | ||
22 | return -ENOMEM; | ||
23 | |||
24 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
25 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
26 | for_each_online_cpu(cpu) { | ||
27 | struct rq *rq = cpu_rq(cpu); | ||
28 | #ifdef CONFIG_SMP | ||
29 | struct sched_domain *sd; | ||
30 | int dcount = 0; | ||
31 | #endif | ||
32 | |||
33 | /* runqueue-specific stats */ | ||
34 | seq_printf(seq, | ||
35 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
36 | cpu, rq->yld_count, | ||
37 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
38 | rq->ttwu_count, rq->ttwu_local, | ||
39 | rq->rq_cpu_time, | ||
40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
41 | |||
42 | seq_printf(seq, "\n"); | ||
43 | |||
44 | #ifdef CONFIG_SMP | ||
45 | /* domain-specific stats */ | ||
46 | rcu_read_lock(); | ||
47 | for_each_domain(cpu, sd) { | ||
48 | enum cpu_idle_type itype; | ||
49 | |||
50 | cpumask_scnprintf(mask_str, mask_len, | ||
51 | sched_domain_span(sd)); | ||
52 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
53 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
54 | itype++) { | ||
55 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
56 | sd->lb_count[itype], | ||
57 | sd->lb_balanced[itype], | ||
58 | sd->lb_failed[itype], | ||
59 | sd->lb_imbalance[itype], | ||
60 | sd->lb_gained[itype], | ||
61 | sd->lb_hot_gained[itype], | ||
62 | sd->lb_nobusyq[itype], | ||
63 | sd->lb_nobusyg[itype]); | ||
64 | } | ||
65 | seq_printf(seq, | ||
66 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
67 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
68 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
69 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
70 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
71 | sd->ttwu_move_balance); | ||
72 | } | ||
73 | rcu_read_unlock(); | ||
74 | #endif | ||
75 | } | ||
76 | kfree(mask_str); | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | static int schedstat_open(struct inode *inode, struct file *file) | ||
81 | { | ||
82 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
83 | char *buf = kmalloc(size, GFP_KERNEL); | ||
84 | struct seq_file *m; | ||
85 | int res; | ||
86 | |||
87 | if (!buf) | ||
88 | return -ENOMEM; | ||
89 | res = single_open(file, show_schedstat, NULL); | ||
90 | if (!res) { | ||
91 | m = file->private_data; | ||
92 | m->buf = buf; | ||
93 | m->size = size; | ||
94 | } else | ||
95 | kfree(buf); | ||
96 | return res; | ||
97 | } | ||
98 | |||
99 | static const struct file_operations proc_schedstat_operations = { | ||
100 | .open = schedstat_open, | ||
101 | .read = seq_read, | ||
102 | .llseek = seq_lseek, | ||
103 | .release = single_release, | ||
104 | }; | ||
105 | |||
106 | static int __init proc_schedstat_init(void) | ||
107 | { | ||
108 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
109 | return 0; | ||
110 | } | ||
111 | module_init(proc_schedstat_init); | ||
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h index 87f9e36ea56e..2ef90a51ec5e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched/stats.h | |||
@@ -1,108 +1,5 @@ | |||
1 | 1 | ||
2 | #ifdef CONFIG_SCHEDSTATS | 2 | #ifdef CONFIG_SCHEDSTATS |
3 | /* | ||
4 | * bump this up when changing the output format or the meaning of an existing | ||
5 | * format, so that tools can adapt (or abort) | ||
6 | */ | ||
7 | #define SCHEDSTAT_VERSION 15 | ||
8 | |||
9 | static int show_schedstat(struct seq_file *seq, void *v) | ||
10 | { | ||
11 | int cpu; | ||
12 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
14 | |||
15 | if (mask_str == NULL) | ||
16 | return -ENOMEM; | ||
17 | |||
18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
19 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
20 | for_each_online_cpu(cpu) { | ||
21 | struct rq *rq = cpu_rq(cpu); | ||
22 | #ifdef CONFIG_SMP | ||
23 | struct sched_domain *sd; | ||
24 | int dcount = 0; | ||
25 | #endif | ||
26 | |||
27 | /* runqueue-specific stats */ | ||
28 | seq_printf(seq, | ||
29 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
30 | cpu, rq->yld_count, | ||
31 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
32 | rq->ttwu_count, rq->ttwu_local, | ||
33 | rq->rq_cpu_time, | ||
34 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
35 | |||
36 | seq_printf(seq, "\n"); | ||
37 | |||
38 | #ifdef CONFIG_SMP | ||
39 | /* domain-specific stats */ | ||
40 | rcu_read_lock(); | ||
41 | for_each_domain(cpu, sd) { | ||
42 | enum cpu_idle_type itype; | ||
43 | |||
44 | cpumask_scnprintf(mask_str, mask_len, | ||
45 | sched_domain_span(sd)); | ||
46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
48 | itype++) { | ||
49 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
50 | sd->lb_count[itype], | ||
51 | sd->lb_balanced[itype], | ||
52 | sd->lb_failed[itype], | ||
53 | sd->lb_imbalance[itype], | ||
54 | sd->lb_gained[itype], | ||
55 | sd->lb_hot_gained[itype], | ||
56 | sd->lb_nobusyq[itype], | ||
57 | sd->lb_nobusyg[itype]); | ||
58 | } | ||
59 | seq_printf(seq, | ||
60 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
61 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
62 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
63 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
65 | sd->ttwu_move_balance); | ||
66 | } | ||
67 | rcu_read_unlock(); | ||
68 | #endif | ||
69 | } | ||
70 | kfree(mask_str); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static int schedstat_open(struct inode *inode, struct file *file) | ||
75 | { | ||
76 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
77 | char *buf = kmalloc(size, GFP_KERNEL); | ||
78 | struct seq_file *m; | ||
79 | int res; | ||
80 | |||
81 | if (!buf) | ||
82 | return -ENOMEM; | ||
83 | res = single_open(file, show_schedstat, NULL); | ||
84 | if (!res) { | ||
85 | m = file->private_data; | ||
86 | m->buf = buf; | ||
87 | m->size = size; | ||
88 | } else | ||
89 | kfree(buf); | ||
90 | return res; | ||
91 | } | ||
92 | |||
93 | static const struct file_operations proc_schedstat_operations = { | ||
94 | .open = schedstat_open, | ||
95 | .read = seq_read, | ||
96 | .llseek = seq_lseek, | ||
97 | .release = single_release, | ||
98 | }; | ||
99 | |||
100 | static int __init proc_schedstat_init(void) | ||
101 | { | ||
102 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
103 | return 0; | ||
104 | } | ||
105 | module_init(proc_schedstat_init); | ||
106 | 3 | ||
107 | /* | 4 | /* |
108 | * Expects runqueue lock to be held for atomicity of update | 5 | * Expects runqueue lock to be held for atomicity of update |
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
283 | return; | 180 | return; |
284 | 181 | ||
285 | raw_spin_lock(&cputimer->lock); | 182 | raw_spin_lock(&cputimer->lock); |
286 | cputimer->cputime.utime = | 183 | cputimer->cputime.utime += cputime; |
287 | cputime_add(cputimer->cputime.utime, cputime); | ||
288 | raw_spin_unlock(&cputimer->lock); | 184 | raw_spin_unlock(&cputimer->lock); |
289 | } | 185 | } |
290 | 186 | ||
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
307 | return; | 203 | return; |
308 | 204 | ||
309 | raw_spin_lock(&cputimer->lock); | 205 | raw_spin_lock(&cputimer->lock); |
310 | cputimer->cputime.stime = | 206 | cputimer->cputime.stime += cputime; |
311 | cputime_add(cputimer->cputime.stime, cputime); | ||
312 | raw_spin_unlock(&cputimer->lock); | 207 | raw_spin_unlock(&cputimer->lock); |
313 | } | 208 | } |
314 | 209 | ||
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c index 8b44e7fa7fb3..7b386e86fd23 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched/stop_task.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #include "sched.h" | ||
2 | |||
1 | /* | 3 | /* |
2 | * stop-task scheduling class. | 4 | * stop-task scheduling class. |
3 | * | 5 | * |
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
80 | /* | 82 | /* |
81 | * Simple, special scheduling class for the per-CPU stop tasks: | 83 | * Simple, special scheduling class for the per-CPU stop tasks: |
82 | */ | 84 | */ |
83 | static const struct sched_class stop_sched_class = { | 85 | const struct sched_class stop_sched_class = { |
84 | .next = &rt_sched_class, | 86 | .next = &rt_sched_class, |
85 | 87 | ||
86 | .enqueue_task = enqueue_task_stop, | 88 | .enqueue_task = enqueue_task_stop, |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631d..e8d76c5895ea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -6,6 +6,7 @@ | |||
6 | * This defines a simple but solid secure-computing mode. | 6 | * This defines a simple but solid secure-computing mode. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/audit.h> | ||
9 | #include <linux/seccomp.h> | 10 | #include <linux/seccomp.h> |
10 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
11 | #include <linux/compat.h> | 12 | #include <linux/compat.h> |
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) | |||
54 | #ifdef SECCOMP_DEBUG | 55 | #ifdef SECCOMP_DEBUG |
55 | dump_stack(); | 56 | dump_stack(); |
56 | #endif | 57 | #endif |
58 | audit_seccomp(this_syscall); | ||
57 | do_exit(SIGKILL); | 59 | do_exit(SIGKILL); |
58 | } | 60 | } |
59 | 61 | ||
diff --git a/kernel/signal.c b/kernel/signal.c index b3f78d09a105..c73c4284160e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/freezer.h> | 28 | #include <linux/freezer.h> |
29 | #include <linux/pid_namespace.h> | 29 | #include <linux/pid_namespace.h> |
30 | #include <linux/nsproxy.h> | 30 | #include <linux/nsproxy.h> |
31 | #include <linux/user_namespace.h> | ||
31 | #define CREATE_TRACE_POINTS | 32 | #define CREATE_TRACE_POINTS |
32 | #include <trace/events/signal.h> | 33 | #include <trace/events/signal.h> |
33 | 34 | ||
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) | |||
1019 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | 1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); |
1020 | } | 1021 | } |
1021 | 1022 | ||
1023 | /* | ||
1024 | * map the uid in struct cred into user namespace *ns | ||
1025 | */ | ||
1026 | static inline uid_t map_cred_ns(const struct cred *cred, | ||
1027 | struct user_namespace *ns) | ||
1028 | { | ||
1029 | return user_ns_map_uid(ns, cred, cred->uid); | ||
1030 | } | ||
1031 | |||
1032 | #ifdef CONFIG_USER_NS | ||
1033 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | ||
1034 | { | ||
1035 | if (current_user_ns() == task_cred_xxx(t, user_ns)) | ||
1036 | return; | ||
1037 | |||
1038 | if (SI_FROMKERNEL(info)) | ||
1039 | return; | ||
1040 | |||
1041 | info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), | ||
1042 | current_cred(), info->si_uid); | ||
1043 | } | ||
1044 | #else | ||
1045 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | ||
1046 | { | ||
1047 | return; | ||
1048 | } | ||
1049 | #endif | ||
1050 | |||
1022 | static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | 1051 | static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, |
1023 | int group, int from_ancestor_ns) | 1052 | int group, int from_ancestor_ns) |
1024 | { | 1053 | { |
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1088 | q->info.si_pid = 0; | 1117 | q->info.si_pid = 0; |
1089 | break; | 1118 | break; |
1090 | } | 1119 | } |
1120 | |||
1121 | userns_fixup_signal_uid(&q->info, t); | ||
1122 | |||
1091 | } else if (!is_si_special(info)) { | 1123 | } else if (!is_si_special(info)) { |
1092 | if (sig >= SIGRTMIN && info->si_code != SI_USER) { | 1124 | if (sig >= SIGRTMIN && info->si_code != SI_USER) { |
1093 | /* | 1125 | /* |
@@ -1626,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1626 | */ | 1658 | */ |
1627 | rcu_read_lock(); | 1659 | rcu_read_lock(); |
1628 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); | 1660 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); |
1629 | info.si_uid = __task_cred(tsk)->uid; | 1661 | info.si_uid = map_cred_ns(__task_cred(tsk), |
1662 | task_cred_xxx(tsk->parent, user_ns)); | ||
1630 | rcu_read_unlock(); | 1663 | rcu_read_unlock(); |
1631 | 1664 | ||
1632 | info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, | 1665 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); |
1633 | tsk->signal->utime)); | 1666 | info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); |
1634 | info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, | ||
1635 | tsk->signal->stime)); | ||
1636 | 1667 | ||
1637 | info.si_status = tsk->exit_code & 0x7f; | 1668 | info.si_status = tsk->exit_code & 0x7f; |
1638 | if (tsk->exit_code & 0x80) | 1669 | if (tsk->exit_code & 0x80) |
@@ -1711,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1711 | */ | 1742 | */ |
1712 | rcu_read_lock(); | 1743 | rcu_read_lock(); |
1713 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1744 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
1714 | info.si_uid = __task_cred(tsk)->uid; | 1745 | info.si_uid = map_cred_ns(__task_cred(tsk), |
1746 | task_cred_xxx(parent, user_ns)); | ||
1715 | rcu_read_unlock(); | 1747 | rcu_read_unlock(); |
1716 | 1748 | ||
1717 | info.si_utime = cputime_to_clock_t(tsk->utime); | 1749 | info.si_utime = cputime_to_clock_t(tsk->utime); |
@@ -1994,8 +2026,6 @@ static bool do_signal_stop(int signr) | |||
1994 | */ | 2026 | */ |
1995 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) | 2027 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) |
1996 | sig->group_exit_code = signr; | 2028 | sig->group_exit_code = signr; |
1997 | else | ||
1998 | WARN_ON_ONCE(!current->ptrace); | ||
1999 | 2029 | ||
2000 | sig->group_stop_count = 0; | 2030 | sig->group_stop_count = 0; |
2001 | 2031 | ||
@@ -2129,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
2129 | info->si_signo = signr; | 2159 | info->si_signo = signr; |
2130 | info->si_errno = 0; | 2160 | info->si_errno = 0; |
2131 | info->si_code = SI_USER; | 2161 | info->si_code = SI_USER; |
2162 | rcu_read_lock(); | ||
2132 | info->si_pid = task_pid_vnr(current->parent); | 2163 | info->si_pid = task_pid_vnr(current->parent); |
2133 | info->si_uid = task_uid(current->parent); | 2164 | info->si_uid = map_cred_ns(__task_cred(current->parent), |
2165 | current_user_ns()); | ||
2166 | rcu_read_unlock(); | ||
2134 | } | 2167 | } |
2135 | 2168 | ||
2136 | /* If the (new) signal is now blocked, requeue it. */ | 2169 | /* If the (new) signal is now blocked, requeue it. */ |
@@ -2322,6 +2355,27 @@ relock: | |||
2322 | return signr; | 2355 | return signr; |
2323 | } | 2356 | } |
2324 | 2357 | ||
2358 | /** | ||
2359 | * block_sigmask - add @ka's signal mask to current->blocked | ||
2360 | * @ka: action for @signr | ||
2361 | * @signr: signal that has been successfully delivered | ||
2362 | * | ||
2363 | * This function should be called when a signal has successfully been | |
2364 | * delivered. It adds the mask of signals for @ka to current->blocked | ||
2365 | * so that they are blocked during the execution of the signal | ||
2366 | * handler. In addition, @signr will be blocked unless %SA_NODEFER is | ||
2367 | * set in @ka->sa.sa_flags. | ||
2368 | */ | ||
2369 | void block_sigmask(struct k_sigaction *ka, int signr) | ||
2370 | { | ||
2371 | sigset_t blocked; | ||
2372 | |||
2373 | sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); | |
2374 | if (!(ka->sa.sa_flags & SA_NODEFER)) | ||
2375 | sigaddset(&blocked, signr); | ||
2376 | set_current_blocked(&blocked); | ||
2377 | } | ||
2378 | |||
2325 | /* | 2379 | /* |
2326 | * It could be that complete_signal() picked us to notify about the | 2380 | * It could be that complete_signal() picked us to notify about the |
2327 | * group-wide signal. Other threads should be notified now to take | 2381 | * group-wide signal. Other threads should be notified now to take |
@@ -2359,8 +2413,15 @@ void exit_signals(struct task_struct *tsk) | |||
2359 | int group_stop = 0; | 2413 | int group_stop = 0; |
2360 | sigset_t unblocked; | 2414 | sigset_t unblocked; |
2361 | 2415 | ||
2416 | /* | ||
2417 | * @tsk is about to have PF_EXITING set - lock out users which | ||
2418 | * expect stable threadgroup. | ||
2419 | */ | ||
2420 | threadgroup_change_begin(tsk); | ||
2421 | |||
2362 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { | 2422 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { |
2363 | tsk->flags |= PF_EXITING; | 2423 | tsk->flags |= PF_EXITING; |
2424 | threadgroup_change_end(tsk); | ||
2364 | return; | 2425 | return; |
2365 | } | 2426 | } |
2366 | 2427 | ||
@@ -2370,6 +2431,9 @@ void exit_signals(struct task_struct *tsk) | |||
2370 | * see wants_signal(), do_signal_stop(). | 2431 | * see wants_signal(), do_signal_stop(). |
2371 | */ | 2432 | */ |
2372 | tsk->flags |= PF_EXITING; | 2433 | tsk->flags |= PF_EXITING; |
2434 | |||
2435 | threadgroup_change_end(tsk); | ||
2436 | |||
2373 | if (!signal_pending(tsk)) | 2437 | if (!signal_pending(tsk)) |
2374 | goto out; | 2438 | goto out; |
2375 | 2439 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c71d91efff0..4eb3a0fa351e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -347,12 +347,12 @@ void irq_exit(void) | |||
347 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
348 | invoke_softirq(); | 348 | invoke_softirq(); |
349 | 349 | ||
350 | rcu_irq_exit(); | ||
351 | #ifdef CONFIG_NO_HZ | 350 | #ifdef CONFIG_NO_HZ |
352 | /* Make sure that timer wheel updates are propagated */ | 351 | /* Make sure that timer wheel updates are propagated */ |
353 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) | 352 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) |
354 | tick_nohz_stop_sched_tick(0); | 353 | tick_nohz_irq_exit(); |
355 | #endif | 354 | #endif |
355 | rcu_irq_exit(); | ||
356 | preempt_enable_no_resched(); | 356 | preempt_enable_no_resched(); |
357 | } | 357 | } |
358 | 358 | ||
diff --git a/kernel/sys.c b/kernel/sys.c index 481611fbd079..40701538fbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1605 | unsigned long maxrss = 0; | 1605 | unsigned long maxrss = 0; |
1606 | 1606 | ||
1607 | memset((char *) r, 0, sizeof *r); | 1607 | memset((char *) r, 0, sizeof *r); |
1608 | utime = stime = cputime_zero; | 1608 | utime = stime = 0; |
1609 | 1609 | ||
1610 | if (who == RUSAGE_THREAD) { | 1610 | if (who == RUSAGE_THREAD) { |
1611 | task_times(current, &utime, &stime); | 1611 | task_times(current, &utime, &stime); |
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1635 | 1635 | ||
1636 | case RUSAGE_SELF: | 1636 | case RUSAGE_SELF: |
1637 | thread_group_times(p, &tgutime, &tgstime); | 1637 | thread_group_times(p, &tgutime, &tgstime); |
1638 | utime = cputime_add(utime, tgutime); | 1638 | utime += tgutime; |
1639 | stime = cputime_add(stime, tgstime); | 1639 | stime += tgstime; |
1640 | r->ru_nvcsw += p->signal->nvcsw; | 1640 | r->ru_nvcsw += p->signal->nvcsw; |
1641 | r->ru_nivcsw += p->signal->nivcsw; | 1641 | r->ru_nivcsw += p->signal->nivcsw; |
1642 | r->ru_minflt += p->signal->min_flt; | 1642 | r->ru_minflt += p->signal->min_flt; |
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1692 | return mask; | 1692 | return mask; |
1693 | } | 1693 | } |
1694 | 1694 | ||
1695 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1696 | static int prctl_set_mm(int opt, unsigned long addr, | ||
1697 | unsigned long arg4, unsigned long arg5) | ||
1698 | { | ||
1699 | unsigned long rlim = rlimit(RLIMIT_DATA); | ||
1700 | unsigned long vm_req_flags; | ||
1701 | unsigned long vm_bad_flags; | ||
1702 | struct vm_area_struct *vma; | ||
1703 | int error = 0; | ||
1704 | struct mm_struct *mm = current->mm; | ||
1705 | |||
1706 | if (arg4 | arg5) | ||
1707 | return -EINVAL; | ||
1708 | |||
1709 | if (!capable(CAP_SYS_ADMIN)) | ||
1710 | return -EPERM; | ||
1711 | |||
1712 | if (addr >= TASK_SIZE) | ||
1713 | return -EINVAL; | ||
1714 | |||
1715 | down_read(&mm->mmap_sem); | ||
1716 | vma = find_vma(mm, addr); | ||
1717 | |||
1718 | if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { | ||
1719 | /* It must be existing VMA */ | ||
1720 | if (!vma || vma->vm_start > addr) | ||
1721 | goto out; | ||
1722 | } | ||
1723 | |||
1724 | error = -EINVAL; | ||
1725 | switch (opt) { | ||
1726 | case PR_SET_MM_START_CODE: | ||
1727 | case PR_SET_MM_END_CODE: | ||
1728 | vm_req_flags = VM_READ | VM_EXEC; | ||
1729 | vm_bad_flags = VM_WRITE | VM_MAYSHARE; | ||
1730 | |||
1731 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1732 | (vma->vm_flags & vm_bad_flags)) | ||
1733 | goto out; | ||
1734 | |||
1735 | if (opt == PR_SET_MM_START_CODE) | ||
1736 | mm->start_code = addr; | ||
1737 | else | ||
1738 | mm->end_code = addr; | ||
1739 | break; | ||
1740 | |||
1741 | case PR_SET_MM_START_DATA: | ||
1742 | case PR_SET_MM_END_DATA: | ||
1743 | vm_req_flags = VM_READ | VM_WRITE; | ||
1744 | vm_bad_flags = VM_EXEC | VM_MAYSHARE; | ||
1745 | |||
1746 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1747 | (vma->vm_flags & vm_bad_flags)) | ||
1748 | goto out; | ||
1749 | |||
1750 | if (opt == PR_SET_MM_START_DATA) | ||
1751 | mm->start_data = addr; | ||
1752 | else | ||
1753 | mm->end_data = addr; | ||
1754 | break; | ||
1755 | |||
1756 | case PR_SET_MM_START_STACK: | ||
1757 | |||
1758 | #ifdef CONFIG_STACK_GROWSUP | ||
1759 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; | ||
1760 | #else | ||
1761 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; | ||
1762 | #endif | ||
1763 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags) | ||
1764 | goto out; | ||
1765 | |||
1766 | mm->start_stack = addr; | ||
1767 | break; | ||
1768 | |||
1769 | case PR_SET_MM_START_BRK: | ||
1770 | if (addr <= mm->end_data) | ||
1771 | goto out; | ||
1772 | |||
1773 | if (rlim < RLIM_INFINITY && | ||
1774 | (mm->brk - addr) + | ||
1775 | (mm->end_data - mm->start_data) > rlim) | ||
1776 | goto out; | ||
1777 | |||
1778 | mm->start_brk = addr; | ||
1779 | break; | ||
1780 | |||
1781 | case PR_SET_MM_BRK: | ||
1782 | if (addr <= mm->end_data) | ||
1783 | goto out; | ||
1784 | |||
1785 | if (rlim < RLIM_INFINITY && | ||
1786 | (addr - mm->start_brk) + | ||
1787 | (mm->end_data - mm->start_data) > rlim) | ||
1788 | goto out; | ||
1789 | |||
1790 | mm->brk = addr; | ||
1791 | break; | ||
1792 | |||
1793 | default: | ||
1794 | error = -EINVAL; | ||
1795 | goto out; | ||
1796 | } | ||
1797 | |||
1798 | error = 0; | ||
1799 | |||
1800 | out: | ||
1801 | up_read(&mm->mmap_sem); | ||
1802 | |||
1803 | return error; | ||
1804 | } | ||
1805 | #else /* CONFIG_CHECKPOINT_RESTORE */ | ||
1806 | static int prctl_set_mm(int opt, unsigned long addr, | ||
1807 | unsigned long arg4, unsigned long arg5) | ||
1808 | { | ||
1809 | return -EINVAL; | ||
1810 | } | ||
1811 | #endif | ||
1812 | |||
1695 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | 1813 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, |
1696 | unsigned long, arg4, unsigned long, arg5) | 1814 | unsigned long, arg4, unsigned long, arg5) |
1697 | { | 1815 | { |
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1841 | else | 1959 | else |
1842 | error = PR_MCE_KILL_DEFAULT; | 1960 | error = PR_MCE_KILL_DEFAULT; |
1843 | break; | 1961 | break; |
1962 | case PR_SET_MM: | ||
1963 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | ||
1964 | break; | ||
1844 | default: | 1965 | default: |
1845 | error = -EINVAL; | 1966 | error = -EINVAL; |
1846 | break; | 1967 | break; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae2719643854..f487f257e05e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = { | |||
803 | .mode = 0644, | 803 | .mode = 0644, |
804 | .proc_handler = proc_dointvec, | 804 | .proc_handler = proc_dointvec, |
805 | }, | 805 | }, |
806 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
807 | { | ||
808 | .procname = "panic_on_stackoverflow", | ||
809 | .data = &sysctl_panic_on_stackoverflow, | ||
810 | .maxlen = sizeof(int), | ||
811 | .mode = 0644, | ||
812 | .proc_handler = proc_dointvec, | ||
813 | }, | ||
814 | #endif | ||
806 | { | 815 | { |
807 | .procname = "bootloader_type", | 816 | .procname = "bootloader_type", |
808 | .data = &bootloader_type, | 817 | .data = &bootloader_type, |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6318b511afa1..a650694883a1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1354 | 1354 | ||
1355 | fput(file); | 1355 | fput(file); |
1356 | out_putname: | 1356 | out_putname: |
1357 | putname(pathname); | 1357 | __putname(pathname); |
1358 | out: | 1358 | out: |
1359 | return result; | 1359 | return result; |
1360 | } | 1360 | } |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b26c2228fe92..2cf9cc7aa103 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS | |||
25 | config GENERIC_CLOCKEVENTS_BUILD | 25 | config GENERIC_CLOCKEVENTS_BUILD |
26 | bool | 26 | bool |
27 | default y | 27 | default y |
28 | depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR | 28 | depends on GENERIC_CLOCKEVENTS |
29 | 29 | ||
30 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | 30 | config GENERIC_CLOCKEVENTS_MIN_ADJUST |
31 | bool | 31 | bool |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c436e790b21b..8a46f5d64504 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -195,7 +195,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
195 | struct alarm *alarm; | 195 | struct alarm *alarm; |
196 | ktime_t expired = next->expires; | 196 | ktime_t expired = next->expires; |
197 | 197 | ||
198 | if (expired.tv64 >= now.tv64) | 198 | if (expired.tv64 > now.tv64) |
199 | break; | 199 | break; |
200 | 200 | ||
201 | alarm = container_of(next, struct alarm, node); | 201 | alarm = container_of(next, struct alarm, node); |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ecd6ba36d6c..9cd928f7a7c6 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/sysdev.h> | ||
21 | 20 | ||
22 | #include "tick-internal.h" | 21 | #include "tick-internal.h" |
23 | 22 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cf52fda2e096..a45ca167ab24 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -23,8 +23,8 @@ | |||
23 | * o Allow clocksource drivers to be unregistered | 23 | * o Allow clocksource drivers to be unregistered |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/device.h> | ||
26 | #include <linux/clocksource.h> | 27 | #include <linux/clocksource.h> |
27 | #include <linux/sysdev.h> | ||
28 | #include <linux/init.h> | 28 | #include <linux/init.h> |
29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ | 30 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ |
@@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void) | |||
492 | } | 492 | } |
493 | 493 | ||
494 | /** | 494 | /** |
495 | * clocksource_max_adjustment - Returns max adjustment amount | |
496 | * @cs: Pointer to clocksource | ||
497 | * | ||
498 | */ | ||
499 | static u32 clocksource_max_adjustment(struct clocksource *cs) | ||
500 | { | ||
501 | u64 ret; | ||
502 | /* | ||
503 | * We won't try to correct for more than 11% adjustments (110,000 ppm). | |
504 | */ | ||
505 | ret = (u64)cs->mult * 11; | ||
506 | do_div(ret,100); | ||
507 | return (u32)ret; | ||
508 | } | ||
509 | |||
510 | /** | ||
495 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 511 | * clocksource_max_deferment - Returns max time the clocksource can be deferred |
496 | * @cs: Pointer to clocksource | 512 | * @cs: Pointer to clocksource |
497 | * | 513 | * |
@@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
503 | /* | 519 | /* |
504 | * Calculate the maximum number of cycles that we can pass to the | 520 | * Calculate the maximum number of cycles that we can pass to the |
505 | * cyc2ns function without overflowing a 64-bit signed result. The | 521 | * cyc2ns function without overflowing a 64-bit signed result. The |
506 | * maximum number of cycles is equal to ULLONG_MAX/cs->mult which | 522 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) |
507 | * is equivalent to the below. | 523 | * which is equivalent to the below. |
508 | * max_cycles < (2^63)/cs->mult | 524 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) |
509 | * max_cycles < 2^(log2((2^63)/cs->mult)) | 525 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) |
510 | * max_cycles < 2^(log2(2^63) - log2(cs->mult)) | 526 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) |
511 | * max_cycles < 2^(63 - log2(cs->mult)) | 527 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) |
512 | * max_cycles < 1 << (63 - log2(cs->mult)) | 528 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) |
513 | * Please note that we add 1 to the result of the log2 to account for | 529 | * Please note that we add 1 to the result of the log2 to account for |
514 | * any rounding errors, ensure the above inequality is satisfied and | 530 | * any rounding errors, ensure the above inequality is satisfied and |
515 | * no overflow will occur. | 531 | * no overflow will occur. |
516 | */ | 532 | */ |
517 | max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); | 533 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); |
518 | 534 | ||
519 | /* | 535 | /* |
520 | * The actual maximum number of cycles we can defer the clocksource is | 536 | * The actual maximum number of cycles we can defer the clocksource is |
521 | * determined by the minimum of max_cycles and cs->mask. | 537 | * determined by the minimum of max_cycles and cs->mask. |
538 | * Note: Here we subtract the maxadj to make sure we don't sleep for | ||
539 | * too long if there's a large negative adjustment. | ||
522 | */ | 540 | */ |
523 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 541 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); |
524 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); | 542 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, |
543 | cs->shift); | ||
525 | 544 | ||
526 | /* | 545 | /* |
527 | * To ensure that the clocksource does not wrap whilst we are idle, | 546 | * To ensure that the clocksource does not wrap whilst we are idle, |
@@ -529,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
529 | * note a margin of 12.5% is used because this can be computed with | 548 | * note a margin of 12.5% is used because this can be computed with |
530 | * a shift, versus say 10% which would require division. | 549 | * a shift, versus say 10% which would require division. |
531 | */ | 550 | */ |
532 | return max_nsecs - (max_nsecs >> 5); | 551 | return max_nsecs - (max_nsecs >> 3); |
533 | } | 552 | } |
534 | 553 | ||
535 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 554 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
@@ -628,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
628 | 647 | ||
629 | /** | 648 | /** |
630 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq | 649 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq |
631 | * @t: clocksource to be registered | 650 | * @cs: clocksource to be registered |
632 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz |
633 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale |
634 | * | 653 | * |
@@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
640 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 659 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
641 | { | 660 | { |
642 | u64 sec; | 661 | u64 sec; |
643 | |||
644 | /* | 662 | /* |
645 | * Calc the maximum number of seconds which we can run before | 663 | * Calc the maximum number of seconds which we can run before |
646 | * wrapping around. For clocksources which have a mask > 32bit | 664 | * wrapping around. For clocksources which have a mask > 32bit |
@@ -651,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
651 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | 669 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% |
652 | * margin as we do in clocksource_max_deferment() | 670 | * margin as we do in clocksource_max_deferment() |
653 | */ | 671 | */ |
654 | sec = (cs->mask - (cs->mask >> 5)); | 672 | sec = (cs->mask - (cs->mask >> 3)); |
655 | do_div(sec, freq); | 673 | do_div(sec, freq); |
656 | do_div(sec, scale); | 674 | do_div(sec, scale); |
657 | if (!sec) | 675 | if (!sec) |
@@ -661,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
661 | 679 | ||
662 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 680 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
663 | NSEC_PER_SEC / scale, sec * scale); | 681 | NSEC_PER_SEC / scale, sec * scale); |
682 | |||
683 | /* | ||
684 | * for clocksources that have large mults, to avoid overflow. | ||
685 | * Since mult may be adjusted by ntp, add an extra safety margin | |
686 | * | ||
687 | */ | ||
688 | cs->maxadj = clocksource_max_adjustment(cs); | ||
689 | while ((cs->mult + cs->maxadj < cs->mult) | ||
690 | || (cs->mult - cs->maxadj > cs->mult)) { | ||
691 | cs->mult >>= 1; | ||
692 | cs->shift--; | ||
693 | cs->maxadj = clocksource_max_adjustment(cs); | ||
694 | } | ||
695 | |||
664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 696 | cs->max_idle_ns = clocksource_max_deferment(cs); |
665 | } | 697 | } |
666 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 698 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
667 | 699 | ||
668 | /** | 700 | /** |
669 | * __clocksource_register_scale - Used to install new clocksources | 701 | * __clocksource_register_scale - Used to install new clocksources |
670 | * @t: clocksource to be registered | 702 | * @cs: clocksource to be registered |
671 | * @scale: Scale factor multiplied against freq to get clocksource hz | 703 | * @scale: Scale factor multiplied against freq to get clocksource hz |
672 | * @freq: clocksource frequency (cycles per second) divided by scale | 704 | * @freq: clocksource frequency (cycles per second) divided by scale |
673 | * | 705 | * |
@@ -695,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); | |||
695 | 727 | ||
696 | /** | 728 | /** |
697 | * clocksource_register - Used to install new clocksources | 729 | * clocksource_register - Used to install new clocksources |
698 | * @t: clocksource to be registered | 730 | * @cs: clocksource to be registered |
699 | * | 731 | * |
700 | * Returns -EBUSY if registration fails, zero otherwise. | 732 | * Returns -EBUSY if registration fails, zero otherwise. |
701 | */ | 733 | */ |
702 | int clocksource_register(struct clocksource *cs) | 734 | int clocksource_register(struct clocksource *cs) |
703 | { | 735 | { |
736 | /* calculate max adjustment for given mult/shift */ | ||
737 | cs->maxadj = clocksource_max_adjustment(cs); | ||
738 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
739 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
740 | cs->name); | ||
741 | |||
704 | /* calculate max idle time permitted for this clocksource */ | 742 | /* calculate max idle time permitted for this clocksource */ |
705 | cs->max_idle_ns = clocksource_max_deferment(cs); | 743 | cs->max_idle_ns = clocksource_max_deferment(cs); |
706 | 744 | ||
@@ -723,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) | |||
723 | 761 | ||
724 | /** | 762 | /** |
725 | * clocksource_change_rating - Change the rating of a registered clocksource | 763 | * clocksource_change_rating - Change the rating of a registered clocksource |
764 | * @cs: clocksource to be changed | ||
765 | * @rating: new rating | ||
726 | */ | 766 | */ |
727 | void clocksource_change_rating(struct clocksource *cs, int rating) | 767 | void clocksource_change_rating(struct clocksource *cs, int rating) |
728 | { | 768 | { |
@@ -734,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating); | |||
734 | 774 | ||
735 | /** | 775 | /** |
736 | * clocksource_unregister - remove a registered clocksource | 776 | * clocksource_unregister - remove a registered clocksource |
777 | * @cs: clocksource to be unregistered | ||
737 | */ | 778 | */ |
738 | void clocksource_unregister(struct clocksource *cs) | 779 | void clocksource_unregister(struct clocksource *cs) |
739 | { | 780 | { |
@@ -749,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister); | |||
749 | /** | 790 | /** |
750 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 791 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
751 | * @dev: unused | 792 | * @dev: unused |
793 | * @attr: unused | ||
752 | * @buf: char buffer to be filled with clocksource list | 794 | * @buf: char buffer to be filled with clocksource list |
753 | * | 795 | * |
754 | * Provides sysfs interface for listing current clocksource. | 796 | * Provides sysfs interface for listing current clocksource. |
755 | */ | 797 | */ |
756 | static ssize_t | 798 | static ssize_t |
757 | sysfs_show_current_clocksources(struct sys_device *dev, | 799 | sysfs_show_current_clocksources(struct device *dev, |
758 | struct sysdev_attribute *attr, char *buf) | 800 | struct device_attribute *attr, char *buf) |
759 | { | 801 | { |
760 | ssize_t count = 0; | 802 | ssize_t count = 0; |
761 | 803 | ||
@@ -769,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev, | |||
769 | /** | 811 | /** |
770 | * sysfs_override_clocksource - interface for manually overriding clocksource | 812 | * sysfs_override_clocksource - interface for manually overriding clocksource |
771 | * @dev: unused | 813 | * @dev: unused |
814 | * @attr: unused | ||
772 | * @buf: name of override clocksource | 815 | * @buf: name of override clocksource |
773 | * @count: length of buffer | 816 | * @count: length of buffer |
774 | * | 817 | * |
775 | * Takes input from sysfs interface for manually overriding the default | 818 | * Takes input from sysfs interface for manually overriding the default |
776 | * clocksource selection. | 819 | * clocksource selection. |
777 | */ | 820 | */ |
778 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | 821 | static ssize_t sysfs_override_clocksource(struct device *dev, |
779 | struct sysdev_attribute *attr, | 822 | struct device_attribute *attr, |
780 | const char *buf, size_t count) | 823 | const char *buf, size_t count) |
781 | { | 824 | { |
782 | size_t ret = count; | 825 | size_t ret = count; |
@@ -804,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, | |||
804 | /** | 847 | /** |
805 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource | 848 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource |
806 | * @dev: unused | 849 | * @dev: unused |
850 | * @attr: unused | ||
807 | * @buf: char buffer to be filled with clocksource list | 851 | * @buf: char buffer to be filled with clocksource list |
808 | * | 852 | * |
809 | * Provides sysfs interface for listing registered clocksources | 853 | * Provides sysfs interface for listing registered clocksources |
810 | */ | 854 | */ |
811 | static ssize_t | 855 | static ssize_t |
812 | sysfs_show_available_clocksources(struct sys_device *dev, | 856 | sysfs_show_available_clocksources(struct device *dev, |
813 | struct sysdev_attribute *attr, | 857 | struct device_attribute *attr, |
814 | char *buf) | 858 | char *buf) |
815 | { | 859 | { |
816 | struct clocksource *src; | 860 | struct clocksource *src; |
@@ -839,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev, | |||
839 | /* | 883 | /* |
840 | * Sysfs setup bits: | 884 | * Sysfs setup bits: |
841 | */ | 885 | */ |
842 | static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, | 886 | static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, |
843 | sysfs_override_clocksource); | 887 | sysfs_override_clocksource); |
844 | 888 | ||
845 | static SYSDEV_ATTR(available_clocksource, 0444, | 889 | static DEVICE_ATTR(available_clocksource, 0444, |
846 | sysfs_show_available_clocksources, NULL); | 890 | sysfs_show_available_clocksources, NULL); |
847 | 891 | ||
848 | static struct sysdev_class clocksource_sysclass = { | 892 | static struct bus_type clocksource_subsys = { |
849 | .name = "clocksource", | 893 | .name = "clocksource", |
894 | .dev_name = "clocksource", | ||
850 | }; | 895 | }; |
851 | 896 | ||
852 | static struct sys_device device_clocksource = { | 897 | static struct device device_clocksource = { |
853 | .id = 0, | 898 | .id = 0, |
854 | .cls = &clocksource_sysclass, | 899 | .bus = &clocksource_subsys, |
855 | }; | 900 | }; |
856 | 901 | ||
857 | static int __init init_clocksource_sysfs(void) | 902 | static int __init init_clocksource_sysfs(void) |
858 | { | 903 | { |
859 | int error = sysdev_class_register(&clocksource_sysclass); | 904 | int error = subsys_system_register(&clocksource_subsys, NULL); |
860 | 905 | ||
861 | if (!error) | 906 | if (!error) |
862 | error = sysdev_register(&device_clocksource); | 907 | error = device_register(&device_clocksource); |
863 | if (!error) | 908 | if (!error) |
864 | error = sysdev_create_file( | 909 | error = device_create_file( |
865 | &device_clocksource, | 910 | &device_clocksource, |
866 | &attr_current_clocksource); | 911 | &dev_attr_current_clocksource); |
867 | if (!error) | 912 | if (!error) |
868 | error = sysdev_create_file( | 913 | error = device_create_file( |
869 | &device_clocksource, | 914 | &device_clocksource, |
870 | &attr_available_clocksource); | 915 | &dev_attr_available_clocksource); |
871 | return error; | 916 | return error; |
872 | } | 917 | } |
873 | 918 | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f954282d9a82..fd4a7b1625a2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) | |||
71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | 71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) |
72 | return 0; | 72 | return 0; |
73 | 73 | ||
74 | clockevents_exchange_device(NULL, dev); | 74 | clockevents_exchange_device(tick_broadcast_device.evtdev, dev); |
75 | tick_broadcast_device.evtdev = dev; | 75 | tick_broadcast_device.evtdev = dev; |
76 | if (!cpumask_empty(tick_get_broadcast_mask())) | 76 | if (!cpumask_empty(tick_get_broadcast_mask())) |
77 | tick_broadcast_start_periodic(dev); | 77 | tick_broadcast_start_periodic(dev); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 40420644d0ba..7656642e4b8e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | |||
275 | } | 275 | } |
276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
277 | 277 | ||
278 | /** | 278 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
279 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | ||
280 | * | ||
281 | * When the next event is more than a tick into the future, stop the idle tick | ||
282 | * Called either from the idle loop or from irq_exit() when an idle period was | ||
283 | * just interrupted by an interrupt which did not cause a reschedule. | ||
284 | */ | ||
285 | void tick_nohz_stop_sched_tick(int inidle) | ||
286 | { | 279 | { |
287 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 280 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
288 | struct tick_sched *ts; | ||
289 | ktime_t last_update, expires, now; | 281 | ktime_t last_update, expires, now; |
290 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 282 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
291 | u64 time_delta; | 283 | u64 time_delta; |
292 | int cpu; | 284 | int cpu; |
293 | 285 | ||
294 | local_irq_save(flags); | ||
295 | |||
296 | cpu = smp_processor_id(); | 286 | cpu = smp_processor_id(); |
297 | ts = &per_cpu(tick_cpu_sched, cpu); | 287 | ts = &per_cpu(tick_cpu_sched, cpu); |
298 | 288 | ||
299 | /* | ||
300 | * Call to tick_nohz_start_idle stops the last_update_time from being | ||
301 | * updated. Thus, it must not be called in the event we are called from | ||
302 | * irq_exit() with the prior state different than idle. | ||
303 | */ | ||
304 | if (!inidle && !ts->inidle) | ||
305 | goto end; | ||
306 | |||
307 | /* | ||
308 | * Set ts->inidle unconditionally. Even if the system did not | ||
309 | * switch to NOHZ mode the cpu frequency governers rely on the | ||
310 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
311 | */ | ||
312 | ts->inidle = 1; | ||
313 | |||
314 | now = tick_nohz_start_idle(cpu, ts); | 289 | now = tick_nohz_start_idle(cpu, ts); |
315 | 290 | ||
316 | /* | 291 | /* |
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
326 | } | 301 | } |
327 | 302 | ||
328 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 303 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) |
329 | goto end; | 304 | return; |
330 | 305 | ||
331 | if (need_resched()) | 306 | if (need_resched()) |
332 | goto end; | 307 | return; |
333 | 308 | ||
334 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | 309 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { |
335 | static int ratelimit; | 310 | static int ratelimit; |
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
339 | (unsigned int) local_softirq_pending()); | 314 | (unsigned int) local_softirq_pending()); |
340 | ratelimit++; | 315 | ratelimit++; |
341 | } | 316 | } |
342 | goto end; | 317 | return; |
343 | } | 318 | } |
344 | 319 | ||
345 | ts->idle_calls++; | 320 | ts->idle_calls++; |
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
434 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 409 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
435 | ts->tick_stopped = 1; | 410 | ts->tick_stopped = 1; |
436 | ts->idle_jiffies = last_jiffies; | 411 | ts->idle_jiffies = last_jiffies; |
437 | rcu_enter_nohz(); | ||
438 | } | 412 | } |
439 | 413 | ||
440 | ts->idle_sleeps++; | 414 | ts->idle_sleeps++; |
@@ -472,8 +446,64 @@ out: | |||
472 | ts->next_jiffies = next_jiffies; | 446 | ts->next_jiffies = next_jiffies; |
473 | ts->last_jiffies = last_jiffies; | 447 | ts->last_jiffies = last_jiffies; |
474 | ts->sleep_length = ktime_sub(dev->next_event, now); | 448 | ts->sleep_length = ktime_sub(dev->next_event, now); |
475 | end: | 449 | } |
476 | local_irq_restore(flags); | 450 | |
451 | /** | ||
452 | * tick_nohz_idle_enter - stop the idle tick from the idle task | ||
453 | * | ||
454 | * When the next event is more than a tick into the future, stop the idle tick | ||
455 | * Called when we start the idle loop. | ||
456 | * | ||
457 | * The arch is responsible of calling: | ||
458 | * | ||
459 | * - rcu_idle_enter() after its last use of RCU before the CPU is put | ||
460 | * to sleep. | ||
461 | * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. | ||
462 | */ | ||
463 | void tick_nohz_idle_enter(void) | ||
464 | { | ||
465 | struct tick_sched *ts; | ||
466 | |||
467 | WARN_ON_ONCE(irqs_disabled()); | ||
468 | |||
469 | /* | ||
470 | * Update the idle state in the scheduler domain hierarchy | ||
471 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
472 | * State will be updated to busy during the first busy tick after | ||
473 | * exiting idle. | ||
474 | */ | ||
475 | set_cpu_sd_state_idle(); | ||
476 | |||
477 | local_irq_disable(); | ||
478 | |||
479 | ts = &__get_cpu_var(tick_cpu_sched); | ||
480 | /* | ||
481 | * set ts->inidle unconditionally. even if the system did not | ||
482 | * switch to nohz mode the cpu frequency governers rely on the | ||
483 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
484 | */ | ||
485 | ts->inidle = 1; | ||
486 | tick_nohz_stop_sched_tick(ts); | ||
487 | |||
488 | local_irq_enable(); | ||
489 | } | ||
490 | |||
491 | /** | ||
492 | * tick_nohz_irq_exit - update next tick event from interrupt exit | ||
493 | * | ||
494 | * When an interrupt fires while we are idle and it doesn't cause | ||
495 | * a reschedule, it may still add, modify or delete a timer, enqueue | ||
496 | * an RCU callback, etc... | ||
497 | * So we need to re-calculate and reprogram the next tick event. | ||
498 | */ | ||
499 | void tick_nohz_irq_exit(void) | ||
500 | { | ||
501 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
502 | |||
503 | if (!ts->inidle) | ||
504 | return; | ||
505 | |||
506 | tick_nohz_stop_sched_tick(ts); | ||
477 | } | 507 | } |
478 | 508 | ||
479 | /** | 509 | /** |
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
515 | } | 545 | } |
516 | 546 | ||
517 | /** | 547 | /** |
518 | * tick_nohz_restart_sched_tick - restart the idle tick from the idle task | 548 | * tick_nohz_idle_exit - restart the idle tick from the idle task |
519 | * | 549 | * |
520 | * Restart the idle tick when the CPU is woken up from idle | 550 | * Restart the idle tick when the CPU is woken up from idle |
551 | * This also exit the RCU extended quiescent state. The CPU | ||
552 | * can use RCU again after this function is called. | ||
521 | */ | 553 | */ |
522 | void tick_nohz_restart_sched_tick(void) | 554 | void tick_nohz_idle_exit(void) |
523 | { | 555 | { |
524 | int cpu = smp_processor_id(); | 556 | int cpu = smp_processor_id(); |
525 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 557 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void) | |||
529 | ktime_t now; | 561 | ktime_t now; |
530 | 562 | ||
531 | local_irq_disable(); | 563 | local_irq_disable(); |
564 | |||
532 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) | 565 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) |
533 | now = ktime_get(); | 566 | now = ktime_get(); |
534 | 567 | ||
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void) | |||
543 | 576 | ||
544 | ts->inidle = 0; | 577 | ts->inidle = 0; |
545 | 578 | ||
546 | rcu_exit_nohz(); | ||
547 | |||
548 | /* Update jiffies first */ | 579 | /* Update jiffies first */ |
549 | select_nohz_load_balancer(0); | 580 | select_nohz_load_balancer(0); |
550 | tick_do_update_jiffies64(now); | 581 | tick_do_update_jiffies64(now); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 169479994755..e6a5a6bc2769 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
131 | /* calculate the delta since the last update_wall_time: */ | 131 | /* calculate the delta since the last update_wall_time: */ |
132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
133 | 133 | ||
134 | /* return delta convert to nanoseconds using ntp adjusted mult. */ | 134 | /* return delta convert to nanoseconds. */ |
135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
136 | } | 136 | } |
137 | 137 | ||
@@ -249,6 +249,8 @@ ktime_t ktime_get(void) | |||
249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; | 249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; |
250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; | 250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; |
251 | nsecs += timekeeping_get_ns(); | 251 | nsecs += timekeeping_get_ns(); |
252 | /* If arch requires, add in gettimeoffset() */ | ||
253 | nsecs += arch_gettimeoffset(); | ||
252 | 254 | ||
253 | } while (read_seqretry(&xtime_lock, seq)); | 255 | } while (read_seqretry(&xtime_lock, seq)); |
254 | /* | 256 | /* |
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) | |||
280 | *ts = xtime; | 282 | *ts = xtime; |
281 | tomono = wall_to_monotonic; | 283 | tomono = wall_to_monotonic; |
282 | nsecs = timekeeping_get_ns(); | 284 | nsecs = timekeeping_get_ns(); |
285 | /* If arch requires, add in gettimeoffset() */ | ||
286 | nsecs += arch_gettimeoffset(); | ||
283 | 287 | ||
284 | } while (read_seqretry(&xtime_lock, seq)); | 288 | } while (read_seqretry(&xtime_lock, seq)); |
285 | 289 | ||
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset) | |||
802 | s64 error, interval = timekeeper.cycle_interval; | 806 | s64 error, interval = timekeeper.cycle_interval; |
803 | int adj; | 807 | int adj; |
804 | 808 | ||
809 | /* | ||
810 | * The point of this is to check if the error is greater then half | ||
811 | * an interval. | ||
812 | * | ||
813 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | ||
814 | * | ||
815 | * Note we subtract one in the shift, so that error is really error*2. | ||
816 | * This "saves" dividing(shifting) interval twice, but keeps the | ||
817 | * (error > interval) comparison as still measuring if error is | ||
818 | * larger then half an interval. | ||
819 | * | ||
820 | * Note: It does not "save" on aggravation when reading the code. | ||
821 | */ | ||
805 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); | 822 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); |
806 | if (error > interval) { | 823 | if (error > interval) { |
824 | /* | ||
825 | * We now divide error by 4(via shift), which checks if | ||
826 | * the error is greater then twice the interval. | ||
827 | * If it is greater, we need a bigadjust, if its smaller, | ||
828 | * we can adjust by 1. | ||
829 | */ | ||
807 | error >>= 2; | 830 | error >>= 2; |
831 | /* | ||
832 | * XXX - In update_wall_time, we round up to the next | ||
833 | * nanosecond, and store the amount rounded up into | ||
834 | * the error. This causes the likely below to be unlikely. | ||
835 | * | ||
836 | * The proper fix is to avoid rounding up by using | ||
837 | * the high precision timekeeper.xtime_nsec instead of | ||
838 | * xtime.tv_nsec everywhere. Fixing this will take some | ||
839 | * time. | ||
840 | */ | ||
808 | if (likely(error <= interval)) | 841 | if (likely(error <= interval)) |
809 | adj = 1; | 842 | adj = 1; |
810 | else | 843 | else |
811 | adj = timekeeping_bigadjust(error, &interval, &offset); | 844 | adj = timekeeping_bigadjust(error, &interval, &offset); |
812 | } else if (error < -interval) { | 845 | } else if (error < -interval) { |
846 | /* See comment above, this is just switched for the negative */ | ||
813 | error >>= 2; | 847 | error >>= 2; |
814 | if (likely(error >= -interval)) { | 848 | if (likely(error >= -interval)) { |
815 | adj = -1; | 849 | adj = -1; |
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset) | |||
817 | offset = -offset; | 851 | offset = -offset; |
818 | } else | 852 | } else |
819 | adj = timekeeping_bigadjust(error, &interval, &offset); | 853 | adj = timekeeping_bigadjust(error, &interval, &offset); |
820 | } else | 854 | } else /* No adjustment needed */ |
821 | return; | 855 | return; |
822 | 856 | ||
857 | WARN_ONCE(timekeeper.clock->maxadj && | ||
858 | (timekeeper.mult + adj > timekeeper.clock->mult + | ||
859 | timekeeper.clock->maxadj), | ||
860 | "Adjusting %s more then 11%% (%ld vs %ld)\n", | ||
861 | timekeeper.clock->name, (long)timekeeper.mult + adj, | ||
862 | (long)timekeeper.clock->mult + | ||
863 | timekeeper.clock->maxadj); | ||
864 | /* | ||
865 | * So the following can be confusing. | ||
866 | * | ||
867 | * To keep things simple, lets assume adj == 1 for now. | ||
868 | * | ||
869 | * When adj != 1, remember that the interval and offset values | ||
870 | * have been appropriately scaled so the math is the same. | ||
871 | * | ||
872 | * The basic idea here is that we're increasing the multiplier | ||
873 | * by one, this causes the xtime_interval to be incremented by | ||
874 | * one cycle_interval. This is because: | ||
875 | * xtime_interval = cycle_interval * mult | ||
876 | * So if mult is being incremented by one: | ||
877 | * xtime_interval = cycle_interval * (mult + 1) | ||
878 | * Its the same as: | ||
879 | * xtime_interval = (cycle_interval * mult) + cycle_interval | ||
880 | * Which can be shortened to: | ||
881 | * xtime_interval += cycle_interval | ||
882 | * | ||
883 | * So offset stores the non-accumulated cycles. Thus the current | ||
884 | * time (in shifted nanoseconds) is: | ||
885 | * now = (offset * adj) + xtime_nsec | ||
886 | * Now, even though we're adjusting the clock frequency, we have | ||
887 | * to keep time consistent. In other words, we can't jump back | ||
888 | * in time, and we also want to avoid jumping forward in time. | ||
889 | * | ||
890 | * So given the same offset value, we need the time to be the same | ||
891 | * both before and after the freq adjustment. | ||
892 | * now = (offset * adj_1) + xtime_nsec_1 | ||
893 | * now = (offset * adj_2) + xtime_nsec_2 | ||
894 | * So: | ||
895 | * (offset * adj_1) + xtime_nsec_1 = | ||
896 | * (offset * adj_2) + xtime_nsec_2 | ||
897 | * And we know: | ||
898 | * adj_2 = adj_1 + 1 | ||
899 | * So: | ||
900 | * (offset * adj_1) + xtime_nsec_1 = | ||
901 | * (offset * (adj_1+1)) + xtime_nsec_2 | ||
902 | * (offset * adj_1) + xtime_nsec_1 = | ||
903 | * (offset * adj_1) + offset + xtime_nsec_2 | ||
904 | * Canceling the sides: | ||
905 | * xtime_nsec_1 = offset + xtime_nsec_2 | ||
906 | * Which gives us: | ||
907 | * xtime_nsec_2 = xtime_nsec_1 - offset | ||
908 | * Which simplfies to: | ||
909 | * xtime_nsec -= offset | ||
910 | * | ||
911 | * XXX - TODO: Doc ntp_error calculation. | ||
912 | */ | ||
823 | timekeeper.mult += adj; | 913 | timekeeper.mult += adj; |
824 | timekeeper.xtime_interval += interval; | 914 | timekeeper.xtime_interval += interval; |
825 | timekeeper.xtime_nsec -= offset; | 915 | timekeeper.xtime_nsec -= offset; |
diff --git a/kernel/timer.c b/kernel/timer.c index dbaa62422b13..a297ffcf888e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) | |||
427 | } | 427 | } |
428 | } | 428 | } |
429 | 429 | ||
430 | /* Stub timer callback for improperly used timers. */ | ||
431 | static void stub_timer(unsigned long data) | ||
432 | { | ||
433 | WARN_ON(1); | ||
434 | } | ||
435 | |||
430 | /* | 436 | /* |
431 | * fixup_activate is called when: | 437 | * fixup_activate is called when: |
432 | * - an active object is activated | 438 | * - an active object is activated |
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) | |||
450 | debug_object_activate(timer, &timer_debug_descr); | 456 | debug_object_activate(timer, &timer_debug_descr); |
451 | return 0; | 457 | return 0; |
452 | } else { | 458 | } else { |
453 | WARN_ON_ONCE(1); | 459 | setup_timer(timer, stub_timer, 0); |
460 | return 1; | ||
454 | } | 461 | } |
455 | return 0; | 462 | return 0; |
456 | 463 | ||
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) | |||
480 | } | 487 | } |
481 | } | 488 | } |
482 | 489 | ||
490 | /* | ||
491 | * fixup_assert_init is called when: | ||
492 | * - an untracked/uninit-ed object is found | ||
493 | */ | ||
494 | static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) | ||
495 | { | ||
496 | struct timer_list *timer = addr; | ||
497 | |||
498 | switch (state) { | ||
499 | case ODEBUG_STATE_NOTAVAILABLE: | ||
500 | if (timer->entry.prev == TIMER_ENTRY_STATIC) { | ||
501 | /* | ||
502 | * This is not really a fixup. The timer was | ||
503 | * statically initialized. We just make sure that it | ||
504 | * is tracked in the object tracker. | ||
505 | */ | ||
506 | debug_object_init(timer, &timer_debug_descr); | ||
507 | return 0; | ||
508 | } else { | ||
509 | setup_timer(timer, stub_timer, 0); | ||
510 | return 1; | ||
511 | } | ||
512 | default: | ||
513 | return 0; | ||
514 | } | ||
515 | } | ||
516 | |||
483 | static struct debug_obj_descr timer_debug_descr = { | 517 | static struct debug_obj_descr timer_debug_descr = { |
484 | .name = "timer_list", | 518 | .name = "timer_list", |
485 | .debug_hint = timer_debug_hint, | 519 | .debug_hint = timer_debug_hint, |
486 | .fixup_init = timer_fixup_init, | 520 | .fixup_init = timer_fixup_init, |
487 | .fixup_activate = timer_fixup_activate, | 521 | .fixup_activate = timer_fixup_activate, |
488 | .fixup_free = timer_fixup_free, | 522 | .fixup_free = timer_fixup_free, |
523 | .fixup_assert_init = timer_fixup_assert_init, | ||
489 | }; | 524 | }; |
490 | 525 | ||
491 | static inline void debug_timer_init(struct timer_list *timer) | 526 | static inline void debug_timer_init(struct timer_list *timer) |
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer) | |||
508 | debug_object_free(timer, &timer_debug_descr); | 543 | debug_object_free(timer, &timer_debug_descr); |
509 | } | 544 | } |
510 | 545 | ||
546 | static inline void debug_timer_assert_init(struct timer_list *timer) | ||
547 | { | ||
548 | debug_object_assert_init(timer, &timer_debug_descr); | ||
549 | } | ||
550 | |||
511 | static void __init_timer(struct timer_list *timer, | 551 | static void __init_timer(struct timer_list *timer, |
512 | const char *name, | 552 | const char *name, |
513 | struct lock_class_key *key); | 553 | struct lock_class_key *key); |
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack); | |||
531 | static inline void debug_timer_init(struct timer_list *timer) { } | 571 | static inline void debug_timer_init(struct timer_list *timer) { } |
532 | static inline void debug_timer_activate(struct timer_list *timer) { } | 572 | static inline void debug_timer_activate(struct timer_list *timer) { } |
533 | static inline void debug_timer_deactivate(struct timer_list *timer) { } | 573 | static inline void debug_timer_deactivate(struct timer_list *timer) { } |
574 | static inline void debug_timer_assert_init(struct timer_list *timer) { } | ||
534 | #endif | 575 | #endif |
535 | 576 | ||
536 | static inline void debug_init(struct timer_list *timer) | 577 | static inline void debug_init(struct timer_list *timer) |
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer) | |||
552 | trace_timer_cancel(timer); | 593 | trace_timer_cancel(timer); |
553 | } | 594 | } |
554 | 595 | ||
596 | static inline void debug_assert_init(struct timer_list *timer) | ||
597 | { | ||
598 | debug_timer_assert_init(timer); | ||
599 | } | ||
600 | |||
555 | static void __init_timer(struct timer_list *timer, | 601 | static void __init_timer(struct timer_list *timer, |
556 | const char *name, | 602 | const char *name, |
557 | struct lock_class_key *key) | 603 | struct lock_class_key *key) |
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer) | |||
902 | unsigned long flags; | 948 | unsigned long flags; |
903 | int ret = 0; | 949 | int ret = 0; |
904 | 950 | ||
951 | debug_assert_init(timer); | ||
952 | |||
905 | timer_stats_timer_clear_start_info(timer); | 953 | timer_stats_timer_clear_start_info(timer); |
906 | if (timer_pending(timer)) { | 954 | if (timer_pending(timer)) { |
907 | base = lock_timer_base(timer, &flags); | 955 | base = lock_timer_base(timer, &flags); |
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
932 | unsigned long flags; | 980 | unsigned long flags; |
933 | int ret = -1; | 981 | int ret = -1; |
934 | 982 | ||
983 | debug_assert_init(timer); | ||
984 | |||
935 | base = lock_timer_base(timer, &flags); | 985 | base = lock_timer_base(timer, &flags); |
936 | 986 | ||
937 | if (base->running_timer == timer) | 987 | if (base->running_timer == timer) |
@@ -1368,7 +1418,7 @@ SYSCALL_DEFINE0(getppid) | |||
1368 | int pid; | 1418 | int pid; |
1369 | 1419 | ||
1370 | rcu_read_lock(); | 1420 | rcu_read_lock(); |
1371 | pid = task_tgid_vnr(current->real_parent); | 1421 | pid = task_tgid_vnr(rcu_dereference(current->real_parent)); |
1372 | rcu_read_unlock(); | 1422 | rcu_read_unlock(); |
1373 | 1423 | ||
1374 | return pid; | 1424 | return pid; |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 16fc34a0806f..cdea7b56b0c9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry) | |||
402 | 402 | ||
403 | static struct dentry *blk_create_buf_file_callback(const char *filename, | 403 | static struct dentry *blk_create_buf_file_callback(const char *filename, |
404 | struct dentry *parent, | 404 | struct dentry *parent, |
405 | int mode, | 405 | umode_t mode, |
406 | struct rchan_buf *buf, | 406 | struct rchan_buf *buf, |
407 | int *is_global) | 407 | int *is_global) |
408 | { | 408 | { |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 900b409543db..683d559a0eef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -22,11 +22,13 @@ | |||
22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
25 | #include <linux/bsearch.h> | ||
25 | #include <linux/module.h> | 26 | #include <linux/module.h> |
26 | #include <linux/ftrace.h> | 27 | #include <linux/ftrace.h> |
27 | #include <linux/sysctl.h> | 28 | #include <linux/sysctl.h> |
28 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
29 | #include <linux/ctype.h> | 30 | #include <linux/ctype.h> |
31 | #include <linux/sort.h> | ||
30 | #include <linux/list.h> | 32 | #include <linux/list.h> |
31 | #include <linux/hash.h> | 33 | #include <linux/hash.h> |
32 | #include <linux/rcupdate.h> | 34 | #include <linux/rcupdate.h> |
@@ -152,7 +154,6 @@ void clear_ftrace_function(void) | |||
152 | ftrace_pid_function = ftrace_stub; | 154 | ftrace_pid_function = ftrace_stub; |
153 | } | 155 | } |
154 | 156 | ||
155 | #undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
156 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 157 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
157 | /* | 158 | /* |
158 | * For those archs that do not test ftrace_trace_stop in their | 159 | * For those archs that do not test ftrace_trace_stop in their |
@@ -948,13 +949,6 @@ struct ftrace_func_probe { | |||
948 | struct rcu_head rcu; | 949 | struct rcu_head rcu; |
949 | }; | 950 | }; |
950 | 951 | ||
951 | enum { | ||
952 | FTRACE_ENABLE_CALLS = (1 << 0), | ||
953 | FTRACE_DISABLE_CALLS = (1 << 1), | ||
954 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), | ||
955 | FTRACE_START_FUNC_RET = (1 << 3), | ||
956 | FTRACE_STOP_FUNC_RET = (1 << 4), | ||
957 | }; | ||
958 | struct ftrace_func_entry { | 952 | struct ftrace_func_entry { |
959 | struct hlist_node hlist; | 953 | struct hlist_node hlist; |
960 | unsigned long ip; | 954 | unsigned long ip; |
@@ -985,18 +979,19 @@ static struct ftrace_ops global_ops = { | |||
985 | .filter_hash = EMPTY_HASH, | 979 | .filter_hash = EMPTY_HASH, |
986 | }; | 980 | }; |
987 | 981 | ||
988 | static struct dyn_ftrace *ftrace_new_addrs; | ||
989 | |||
990 | static DEFINE_MUTEX(ftrace_regex_lock); | 982 | static DEFINE_MUTEX(ftrace_regex_lock); |
991 | 983 | ||
992 | struct ftrace_page { | 984 | struct ftrace_page { |
993 | struct ftrace_page *next; | 985 | struct ftrace_page *next; |
986 | struct dyn_ftrace *records; | ||
994 | int index; | 987 | int index; |
995 | struct dyn_ftrace records[]; | 988 | int size; |
996 | }; | 989 | }; |
997 | 990 | ||
998 | #define ENTRIES_PER_PAGE \ | 991 | static struct ftrace_page *ftrace_new_pgs; |
999 | ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) | 992 | |
993 | #define ENTRY_SIZE sizeof(struct dyn_ftrace) | ||
994 | #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) | ||
1000 | 995 | ||
1001 | /* estimate from running different kernels */ | 996 | /* estimate from running different kernels */ |
1002 | #define NR_TO_INIT 10000 | 997 | #define NR_TO_INIT 10000 |
@@ -1004,7 +999,10 @@ struct ftrace_page { | |||
1004 | static struct ftrace_page *ftrace_pages_start; | 999 | static struct ftrace_page *ftrace_pages_start; |
1005 | static struct ftrace_page *ftrace_pages; | 1000 | static struct ftrace_page *ftrace_pages; |
1006 | 1001 | ||
1007 | static struct dyn_ftrace *ftrace_free_records; | 1002 | static bool ftrace_hash_empty(struct ftrace_hash *hash) |
1003 | { | ||
1004 | return !hash || !hash->count; | ||
1005 | } | ||
1008 | 1006 | ||
1009 | static struct ftrace_func_entry * | 1007 | static struct ftrace_func_entry * |
1010 | ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | 1008 | ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) |
@@ -1014,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | |||
1014 | struct hlist_head *hhd; | 1012 | struct hlist_head *hhd; |
1015 | struct hlist_node *n; | 1013 | struct hlist_node *n; |
1016 | 1014 | ||
1017 | if (!hash->count) | 1015 | if (ftrace_hash_empty(hash)) |
1018 | return NULL; | 1016 | return NULL; |
1019 | 1017 | ||
1020 | if (hash->size_bits > 0) | 1018 | if (hash->size_bits > 0) |
@@ -1158,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
1158 | return NULL; | 1156 | return NULL; |
1159 | 1157 | ||
1160 | /* Empty hash? */ | 1158 | /* Empty hash? */ |
1161 | if (!hash || !hash->count) | 1159 | if (ftrace_hash_empty(hash)) |
1162 | return new_hash; | 1160 | return new_hash; |
1163 | 1161 | ||
1164 | size = 1 << hash->size_bits; | 1162 | size = 1 << hash->size_bits; |
@@ -1212,7 +1210,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1212 | if (!src->count) { | 1210 | if (!src->count) { |
1213 | free_ftrace_hash_rcu(*dst); | 1211 | free_ftrace_hash_rcu(*dst); |
1214 | rcu_assign_pointer(*dst, EMPTY_HASH); | 1212 | rcu_assign_pointer(*dst, EMPTY_HASH); |
1215 | return 0; | 1213 | /* still need to update the function records */ |
1214 | ret = 0; | ||
1215 | goto out; | ||
1216 | } | 1216 | } |
1217 | 1217 | ||
1218 | /* | 1218 | /* |
@@ -1281,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1281 | filter_hash = rcu_dereference_raw(ops->filter_hash); | 1281 | filter_hash = rcu_dereference_raw(ops->filter_hash); |
1282 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); | 1282 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); |
1283 | 1283 | ||
1284 | if ((!filter_hash || !filter_hash->count || | 1284 | if ((ftrace_hash_empty(filter_hash) || |
1285 | ftrace_lookup_ip(filter_hash, ip)) && | 1285 | ftrace_lookup_ip(filter_hash, ip)) && |
1286 | (!notrace_hash || !notrace_hash->count || | 1286 | (ftrace_hash_empty(notrace_hash) || |
1287 | !ftrace_lookup_ip(notrace_hash, ip))) | 1287 | !ftrace_lookup_ip(notrace_hash, ip))) |
1288 | ret = 1; | 1288 | ret = 1; |
1289 | else | 1289 | else |
@@ -1306,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1306 | } \ | 1306 | } \ |
1307 | } | 1307 | } |
1308 | 1308 | ||
1309 | |||
1310 | static int ftrace_cmp_recs(const void *a, const void *b) | ||
1311 | { | ||
1312 | const struct dyn_ftrace *reca = a; | ||
1313 | const struct dyn_ftrace *recb = b; | ||
1314 | |||
1315 | if (reca->ip > recb->ip) | ||
1316 | return 1; | ||
1317 | if (reca->ip < recb->ip) | ||
1318 | return -1; | ||
1319 | return 0; | ||
1320 | } | ||
1321 | |||
1322 | /** | ||
1323 | * ftrace_location - return true if the ip given is a traced location | ||
1324 | * @ip: the instruction pointer to check | ||
1325 | * | ||
1326 | * Returns 1 if @ip given is a pointer to a ftrace location. | ||
1327 | * That is, the instruction that is either a NOP or call to | ||
1328 | * the function tracer. It checks the ftrace internal tables to | ||
1329 | * determine if the address belongs or not. | ||
1330 | */ | ||
1331 | int ftrace_location(unsigned long ip) | ||
1332 | { | ||
1333 | struct ftrace_page *pg; | ||
1334 | struct dyn_ftrace *rec; | ||
1335 | struct dyn_ftrace key; | ||
1336 | |||
1337 | key.ip = ip; | ||
1338 | |||
1339 | for (pg = ftrace_pages_start; pg; pg = pg->next) { | ||
1340 | rec = bsearch(&key, pg->records, pg->index, | ||
1341 | sizeof(struct dyn_ftrace), | ||
1342 | ftrace_cmp_recs); | ||
1343 | if (rec) | ||
1344 | return 1; | ||
1345 | } | ||
1346 | |||
1347 | return 0; | ||
1348 | } | ||
1349 | |||
1309 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1350 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1310 | int filter_hash, | 1351 | int filter_hash, |
1311 | bool inc) | 1352 | bool inc) |
@@ -1335,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1335 | if (filter_hash) { | 1376 | if (filter_hash) { |
1336 | hash = ops->filter_hash; | 1377 | hash = ops->filter_hash; |
1337 | other_hash = ops->notrace_hash; | 1378 | other_hash = ops->notrace_hash; |
1338 | if (!hash || !hash->count) | 1379 | if (ftrace_hash_empty(hash)) |
1339 | all = 1; | 1380 | all = 1; |
1340 | } else { | 1381 | } else { |
1341 | inc = !inc; | 1382 | inc = !inc; |
@@ -1345,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1345 | * If the notrace hash has no items, | 1386 | * If the notrace hash has no items, |
1346 | * then there's nothing to do. | 1387 | * then there's nothing to do. |
1347 | */ | 1388 | */ |
1348 | if (hash && !hash->count) | 1389 | if (ftrace_hash_empty(hash)) |
1349 | return; | 1390 | return; |
1350 | } | 1391 | } |
1351 | 1392 | ||
@@ -1362,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1362 | if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) | 1403 | if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) |
1363 | match = 1; | 1404 | match = 1; |
1364 | } else { | 1405 | } else { |
1365 | in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); | 1406 | in_hash = !!ftrace_lookup_ip(hash, rec->ip); |
1366 | in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); | 1407 | in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); |
1367 | 1408 | ||
1368 | /* | 1409 | /* |
1369 | * | 1410 | * |
@@ -1371,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1371 | if (filter_hash && in_hash && !in_other_hash) | 1412 | if (filter_hash && in_hash && !in_other_hash) |
1372 | match = 1; | 1413 | match = 1; |
1373 | else if (!filter_hash && in_hash && | 1414 | else if (!filter_hash && in_hash && |
1374 | (in_other_hash || !other_hash->count)) | 1415 | (in_other_hash || ftrace_hash_empty(other_hash))) |
1375 | match = 1; | 1416 | match = 1; |
1376 | } | 1417 | } |
1377 | if (!match) | 1418 | if (!match) |
@@ -1405,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
1405 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1446 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
1406 | } | 1447 | } |
1407 | 1448 | ||
1408 | static void ftrace_free_rec(struct dyn_ftrace *rec) | ||
1409 | { | ||
1410 | rec->freelist = ftrace_free_records; | ||
1411 | ftrace_free_records = rec; | ||
1412 | rec->flags |= FTRACE_FL_FREE; | ||
1413 | } | ||
1414 | |||
1415 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) | 1449 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) |
1416 | { | 1450 | { |
1417 | struct dyn_ftrace *rec; | 1451 | if (ftrace_pages->index == ftrace_pages->size) { |
1418 | 1452 | /* We should have allocated enough */ | |
1419 | /* First check for freed records */ | 1453 | if (WARN_ON(!ftrace_pages->next)) |
1420 | if (ftrace_free_records) { | ||
1421 | rec = ftrace_free_records; | ||
1422 | |||
1423 | if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { | ||
1424 | FTRACE_WARN_ON_ONCE(1); | ||
1425 | ftrace_free_records = NULL; | ||
1426 | return NULL; | 1454 | return NULL; |
1427 | } | ||
1428 | |||
1429 | ftrace_free_records = rec->freelist; | ||
1430 | memset(rec, 0, sizeof(*rec)); | ||
1431 | return rec; | ||
1432 | } | ||
1433 | |||
1434 | if (ftrace_pages->index == ENTRIES_PER_PAGE) { | ||
1435 | if (!ftrace_pages->next) { | ||
1436 | /* allocate another page */ | ||
1437 | ftrace_pages->next = | ||
1438 | (void *)get_zeroed_page(GFP_KERNEL); | ||
1439 | if (!ftrace_pages->next) | ||
1440 | return NULL; | ||
1441 | } | ||
1442 | ftrace_pages = ftrace_pages->next; | 1455 | ftrace_pages = ftrace_pages->next; |
1443 | } | 1456 | } |
1444 | 1457 | ||
@@ -1458,8 +1471,6 @@ ftrace_record_ip(unsigned long ip) | |||
1458 | return NULL; | 1471 | return NULL; |
1459 | 1472 | ||
1460 | rec->ip = ip; | 1473 | rec->ip = ip; |
1461 | rec->newlist = ftrace_new_addrs; | ||
1462 | ftrace_new_addrs = rec; | ||
1463 | 1474 | ||
1464 | return rec; | 1475 | return rec; |
1465 | } | 1476 | } |
@@ -1474,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p) | |||
1474 | printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); | 1485 | printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); |
1475 | } | 1486 | } |
1476 | 1487 | ||
1477 | static void ftrace_bug(int failed, unsigned long ip) | 1488 | /** |
1489 | * ftrace_bug - report and shutdown function tracer | ||
1490 | * @failed: The failed type (EFAULT, EINVAL, EPERM) | ||
1491 | * @ip: The address that failed | ||
1492 | * | ||
1493 | * The arch code that enables or disables the function tracing | ||
1494 | * can call ftrace_bug() when it has detected a problem in | ||
1495 | * modifying the code. @failed should be one of either: | ||
1496 | * EFAULT - if the problem happens on reading the @ip address | ||
1497 | * EINVAL - if what is read at @ip is not what was expected | ||
1498 | * EPERM - if the problem happens on writing to the @ip address | ||
1499 | */ | ||
1500 | void ftrace_bug(int failed, unsigned long ip) | ||
1478 | { | 1501 | { |
1479 | switch (failed) { | 1502 | switch (failed) { |
1480 | case -EFAULT: | 1503 | case -EFAULT: |
@@ -1516,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end) | |||
1516 | return 0; | 1539 | return 0; |
1517 | } | 1540 | } |
1518 | 1541 | ||
1519 | 1542 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |
1520 | static int | ||
1521 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | ||
1522 | { | 1543 | { |
1523 | unsigned long ftrace_addr; | ||
1524 | unsigned long flag = 0UL; | 1544 | unsigned long flag = 0UL; |
1525 | 1545 | ||
1526 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
1527 | |||
1528 | /* | 1546 | /* |
1529 | * If we are enabling tracing: | 1547 | * If we are updating calls: |
1530 | * | 1548 | * |
1531 | * If the record has a ref count, then we need to enable it | 1549 | * If the record has a ref count, then we need to enable it |
1532 | * because someone is using it. | 1550 | * because someone is using it. |
1533 | * | 1551 | * |
1534 | * Otherwise we make sure it's disabled. | 1552 | * Otherwise we make sure it's disabled. |
1535 | * | 1553 | * |
1536 | * If we are disabling tracing, then disable all records that | 1554 | * If we are disabling calls, then disable all records that |
1537 | * are enabled. | 1555 | * are enabled. |
1538 | */ | 1556 | */ |
1539 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) | 1557 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) |
@@ -1541,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1541 | 1559 | ||
1542 | /* If the state of this record hasn't changed, then do nothing */ | 1560 | /* If the state of this record hasn't changed, then do nothing */ |
1543 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1561 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
1544 | return 0; | 1562 | return FTRACE_UPDATE_IGNORE; |
1545 | 1563 | ||
1546 | if (flag) { | 1564 | if (flag) { |
1547 | rec->flags |= FTRACE_FL_ENABLED; | 1565 | if (update) |
1566 | rec->flags |= FTRACE_FL_ENABLED; | ||
1567 | return FTRACE_UPDATE_MAKE_CALL; | ||
1568 | } | ||
1569 | |||
1570 | if (update) | ||
1571 | rec->flags &= ~FTRACE_FL_ENABLED; | ||
1572 | |||
1573 | return FTRACE_UPDATE_MAKE_NOP; | ||
1574 | } | ||
1575 | |||
1576 | /** | ||
1577 | * ftrace_update_record, set a record that now is tracing or not | ||
1578 | * @rec: the record to update | ||
1579 | * @enable: set to 1 if the record is tracing, zero to force disable | ||
1580 | * | ||
1581 | * The records that represent all functions that can be traced need | ||
1582 | * to be updated when tracing has been enabled. | ||
1583 | */ | ||
1584 | int ftrace_update_record(struct dyn_ftrace *rec, int enable) | ||
1585 | { | ||
1586 | return ftrace_check_record(rec, enable, 1); | ||
1587 | } | ||
1588 | |||
1589 | /** | ||
1590 | * ftrace_test_record, check if the record has been enabled or not | ||
1591 | * @rec: the record to test | ||
1592 | * @enable: set to 1 to check if enabled, 0 if it is disabled | ||
1593 | * | ||
1594 | * The arch code may need to test if a record is already set to | ||
1595 | * tracing to determine how to modify the function code that it | ||
1596 | * represents. | ||
1597 | */ | ||
1598 | int ftrace_test_record(struct dyn_ftrace *rec, int enable) | ||
1599 | { | ||
1600 | return ftrace_check_record(rec, enable, 0); | ||
1601 | } | ||
1602 | |||
1603 | static int | ||
1604 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | ||
1605 | { | ||
1606 | unsigned long ftrace_addr; | ||
1607 | int ret; | ||
1608 | |||
1609 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
1610 | |||
1611 | ret = ftrace_update_record(rec, enable); | ||
1612 | |||
1613 | switch (ret) { | ||
1614 | case FTRACE_UPDATE_IGNORE: | ||
1615 | return 0; | ||
1616 | |||
1617 | case FTRACE_UPDATE_MAKE_CALL: | ||
1548 | return ftrace_make_call(rec, ftrace_addr); | 1618 | return ftrace_make_call(rec, ftrace_addr); |
1619 | |||
1620 | case FTRACE_UPDATE_MAKE_NOP: | ||
1621 | return ftrace_make_nop(NULL, rec, ftrace_addr); | ||
1549 | } | 1622 | } |
1550 | 1623 | ||
1551 | rec->flags &= ~FTRACE_FL_ENABLED; | 1624 | return -1; /* unknow ftrace bug */ |
1552 | return ftrace_make_nop(NULL, rec, ftrace_addr); | ||
1553 | } | 1625 | } |
1554 | 1626 | ||
1555 | static void ftrace_replace_code(int enable) | 1627 | static void ftrace_replace_code(int update) |
1556 | { | 1628 | { |
1557 | struct dyn_ftrace *rec; | 1629 | struct dyn_ftrace *rec; |
1558 | struct ftrace_page *pg; | 1630 | struct ftrace_page *pg; |
@@ -1562,11 +1634,7 @@ static void ftrace_replace_code(int enable) | |||
1562 | return; | 1634 | return; |
1563 | 1635 | ||
1564 | do_for_each_ftrace_rec(pg, rec) { | 1636 | do_for_each_ftrace_rec(pg, rec) { |
1565 | /* Skip over free records */ | 1637 | failed = __ftrace_replace_code(rec, update); |
1566 | if (rec->flags & FTRACE_FL_FREE) | ||
1567 | continue; | ||
1568 | |||
1569 | failed = __ftrace_replace_code(rec, enable); | ||
1570 | if (failed) { | 1638 | if (failed) { |
1571 | ftrace_bug(failed, rec->ip); | 1639 | ftrace_bug(failed, rec->ip); |
1572 | /* Stop processing */ | 1640 | /* Stop processing */ |
@@ -1575,6 +1643,78 @@ static void ftrace_replace_code(int enable) | |||
1575 | } while_for_each_ftrace_rec(); | 1643 | } while_for_each_ftrace_rec(); |
1576 | } | 1644 | } |
1577 | 1645 | ||
1646 | struct ftrace_rec_iter { | ||
1647 | struct ftrace_page *pg; | ||
1648 | int index; | ||
1649 | }; | ||
1650 | |||
1651 | /** | ||
1652 | * ftrace_rec_iter_start, start up iterating over traced functions | ||
1653 | * | ||
1654 | * Returns an iterator handle that is used to iterate over all | ||
1655 | * the records that represent address locations where functions | ||
1656 | * are traced. | ||
1657 | * | ||
1658 | * May return NULL if no records are available. | ||
1659 | */ | ||
1660 | struct ftrace_rec_iter *ftrace_rec_iter_start(void) | ||
1661 | { | ||
1662 | /* | ||
1663 | * We only use a single iterator. | ||
1664 | * Protected by the ftrace_lock mutex. | ||
1665 | */ | ||
1666 | static struct ftrace_rec_iter ftrace_rec_iter; | ||
1667 | struct ftrace_rec_iter *iter = &ftrace_rec_iter; | ||
1668 | |||
1669 | iter->pg = ftrace_pages_start; | ||
1670 | iter->index = 0; | ||
1671 | |||
1672 | /* Could have empty pages */ | ||
1673 | while (iter->pg && !iter->pg->index) | ||
1674 | iter->pg = iter->pg->next; | ||
1675 | |||
1676 | if (!iter->pg) | ||
1677 | return NULL; | ||
1678 | |||
1679 | return iter; | ||
1680 | } | ||
1681 | |||
1682 | /** | ||
1683 | * ftrace_rec_iter_next, get the next record to process. | ||
1684 | * @iter: The handle to the iterator. | ||
1685 | * | ||
1686 | * Returns the next iterator after the given iterator @iter. | ||
1687 | */ | ||
1688 | struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) | ||
1689 | { | ||
1690 | iter->index++; | ||
1691 | |||
1692 | if (iter->index >= iter->pg->index) { | ||
1693 | iter->pg = iter->pg->next; | ||
1694 | iter->index = 0; | ||
1695 | |||
1696 | /* Could have empty pages */ | ||
1697 | while (iter->pg && !iter->pg->index) | ||
1698 | iter->pg = iter->pg->next; | ||
1699 | } | ||
1700 | |||
1701 | if (!iter->pg) | ||
1702 | return NULL; | ||
1703 | |||
1704 | return iter; | ||
1705 | } | ||
1706 | |||
1707 | /** | ||
1708 | * ftrace_rec_iter_record, get the record at the iterator location | ||
1709 | * @iter: The current iterator location | ||
1710 | * | ||
1711 | * Returns the record that the current @iter is at. | ||
1712 | */ | ||
1713 | struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) | ||
1714 | { | ||
1715 | return &iter->pg->records[iter->index]; | ||
1716 | } | ||
1717 | |||
1578 | static int | 1718 | static int |
1579 | ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) | 1719 | ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) |
1580 | { | 1720 | { |
@@ -1616,13 +1756,7 @@ static int __ftrace_modify_code(void *data) | |||
1616 | { | 1756 | { |
1617 | int *command = data; | 1757 | int *command = data; |
1618 | 1758 | ||
1619 | /* | 1759 | if (*command & FTRACE_UPDATE_CALLS) |
1620 | * Do not call function tracer while we update the code. | ||
1621 | * We are in stop machine, no worrying about races. | ||
1622 | */ | ||
1623 | function_trace_stop++; | ||
1624 | |||
1625 | if (*command & FTRACE_ENABLE_CALLS) | ||
1626 | ftrace_replace_code(1); | 1760 | ftrace_replace_code(1); |
1627 | else if (*command & FTRACE_DISABLE_CALLS) | 1761 | else if (*command & FTRACE_DISABLE_CALLS) |
1628 | ftrace_replace_code(0); | 1762 | ftrace_replace_code(0); |
@@ -1635,21 +1769,33 @@ static int __ftrace_modify_code(void *data) | |||
1635 | else if (*command & FTRACE_STOP_FUNC_RET) | 1769 | else if (*command & FTRACE_STOP_FUNC_RET) |
1636 | ftrace_disable_ftrace_graph_caller(); | 1770 | ftrace_disable_ftrace_graph_caller(); |
1637 | 1771 | ||
1638 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
1639 | /* | ||
1640 | * For archs that call ftrace_test_stop_func(), we must | ||
1641 | * wait till after we update all the function callers | ||
1642 | * before we update the callback. This keeps different | ||
1643 | * ops that record different functions from corrupting | ||
1644 | * each other. | ||
1645 | */ | ||
1646 | __ftrace_trace_function = __ftrace_trace_function_delay; | ||
1647 | #endif | ||
1648 | function_trace_stop--; | ||
1649 | |||
1650 | return 0; | 1772 | return 0; |
1651 | } | 1773 | } |
1652 | 1774 | ||
1775 | /** | ||
1776 | * ftrace_run_stop_machine, go back to the stop machine method | ||
1777 | * @command: The command to tell ftrace what to do | ||
1778 | * | ||
1779 | * If an arch needs to fall back to the stop machine method, the | ||
1780 | * it can call this function. | ||
1781 | */ | ||
1782 | void ftrace_run_stop_machine(int command) | ||
1783 | { | ||
1784 | stop_machine(__ftrace_modify_code, &command, NULL); | ||
1785 | } | ||
1786 | |||
1787 | /** | ||
1788 | * arch_ftrace_update_code, modify the code to trace or not trace | ||
1789 | * @command: The command that needs to be done | ||
1790 | * | ||
1792 | * Archs can override this function if they do not need to | ||
1792 | * run stop_machine() to modify code. | ||
1793 | */ | ||
1794 | void __weak arch_ftrace_update_code(int command) | ||
1795 | { | ||
1796 | ftrace_run_stop_machine(command); | ||
1797 | } | ||
1798 | |||
1653 | static void ftrace_run_update_code(int command) | 1799 | static void ftrace_run_update_code(int command) |
1654 | { | 1800 | { |
1655 | int ret; | 1801 | int ret; |
@@ -1658,8 +1804,31 @@ static void ftrace_run_update_code(int command) | |||
1658 | FTRACE_WARN_ON(ret); | 1804 | FTRACE_WARN_ON(ret); |
1659 | if (ret) | 1805 | if (ret) |
1660 | return; | 1806 | return; |
1807 | /* | ||
1808 | * Do not call function tracer while we update the code. | ||
1809 | * We are in stop machine. | ||
1810 | */ | ||
1811 | function_trace_stop++; | ||
1661 | 1812 | ||
1662 | stop_machine(__ftrace_modify_code, &command, NULL); | 1813 | /* |
1814 | * By default we use stop_machine() to modify the code. | ||
1815 | * But archs can do what ever they want as long as it | ||
1816 | * is safe. The stop_machine() is the safest, but also | ||
1817 | * produces the most overhead. | ||
1818 | */ | ||
1819 | arch_ftrace_update_code(command); | ||
1820 | |||
1821 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
1822 | /* | ||
1823 | * For archs that call ftrace_test_stop_func(), we must | ||
1824 | * wait till after we update all the function callers | ||
1825 | * before we update the callback. This keeps different | ||
1826 | * ops that record different functions from corrupting | ||
1827 | * each other. | ||
1828 | */ | ||
1829 | __ftrace_trace_function = __ftrace_trace_function_delay; | ||
1830 | #endif | ||
1831 | function_trace_stop--; | ||
1663 | 1832 | ||
1664 | ret = ftrace_arch_code_modify_post_process(); | 1833 | ret = ftrace_arch_code_modify_post_process(); |
1665 | FTRACE_WARN_ON(ret); | 1834 | FTRACE_WARN_ON(ret); |
@@ -1690,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
1690 | return -ENODEV; | 1859 | return -ENODEV; |
1691 | 1860 | ||
1692 | ftrace_start_up++; | 1861 | ftrace_start_up++; |
1693 | command |= FTRACE_ENABLE_CALLS; | 1862 | command |= FTRACE_UPDATE_CALLS; |
1694 | 1863 | ||
1695 | /* ops marked global share the filter hashes */ | 1864 | /* ops marked global share the filter hashes */ |
1696 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 1865 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
@@ -1742,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
1742 | if (ops != &global_ops || !global_start_up) | 1911 | if (ops != &global_ops || !global_start_up) |
1743 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | 1912 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
1744 | 1913 | ||
1745 | if (!ftrace_start_up) | 1914 | command |= FTRACE_UPDATE_CALLS; |
1746 | command |= FTRACE_DISABLE_CALLS; | ||
1747 | 1915 | ||
1748 | if (saved_ftrace_func != ftrace_trace_function) { | 1916 | if (saved_ftrace_func != ftrace_trace_function) { |
1749 | saved_ftrace_func = ftrace_trace_function; | 1917 | saved_ftrace_func = ftrace_trace_function; |
@@ -1765,7 +1933,7 @@ static void ftrace_startup_sysctl(void) | |||
1765 | saved_ftrace_func = NULL; | 1933 | saved_ftrace_func = NULL; |
1766 | /* ftrace_start_up is true if we want ftrace running */ | 1934 | /* ftrace_start_up is true if we want ftrace running */ |
1767 | if (ftrace_start_up) | 1935 | if (ftrace_start_up) |
1768 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | 1936 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); |
1769 | } | 1937 | } |
1770 | 1938 | ||
1771 | static void ftrace_shutdown_sysctl(void) | 1939 | static void ftrace_shutdown_sysctl(void) |
@@ -1787,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops) | |||
1787 | struct ftrace_hash *hash; | 1955 | struct ftrace_hash *hash; |
1788 | 1956 | ||
1789 | hash = ops->filter_hash; | 1957 | hash = ops->filter_hash; |
1790 | return !!(!hash || !hash->count); | 1958 | return ftrace_hash_empty(hash); |
1791 | } | 1959 | } |
1792 | 1960 | ||
1793 | static int ftrace_update_code(struct module *mod) | 1961 | static int ftrace_update_code(struct module *mod) |
1794 | { | 1962 | { |
1963 | struct ftrace_page *pg; | ||
1795 | struct dyn_ftrace *p; | 1964 | struct dyn_ftrace *p; |
1796 | cycle_t start, stop; | 1965 | cycle_t start, stop; |
1797 | unsigned long ref = 0; | 1966 | unsigned long ref = 0; |
1967 | int i; | ||
1798 | 1968 | ||
1799 | /* | 1969 | /* |
1800 | * When adding a module, we need to check if tracers are | 1970 | * When adding a module, we need to check if tracers are |
@@ -1816,46 +1986,44 @@ static int ftrace_update_code(struct module *mod) | |||
1816 | start = ftrace_now(raw_smp_processor_id()); | 1986 | start = ftrace_now(raw_smp_processor_id()); |
1817 | ftrace_update_cnt = 0; | 1987 | ftrace_update_cnt = 0; |
1818 | 1988 | ||
1819 | while (ftrace_new_addrs) { | 1989 | for (pg = ftrace_new_pgs; pg; pg = pg->next) { |
1820 | 1990 | ||
1821 | /* If something went wrong, bail without enabling anything */ | 1991 | for (i = 0; i < pg->index; i++) { |
1822 | if (unlikely(ftrace_disabled)) | 1992 | /* If something went wrong, bail without enabling anything */ |
1823 | return -1; | 1993 | if (unlikely(ftrace_disabled)) |
1994 | return -1; | ||
1824 | 1995 | ||
1825 | p = ftrace_new_addrs; | 1996 | p = &pg->records[i]; |
1826 | ftrace_new_addrs = p->newlist; | 1997 | p->flags = ref; |
1827 | p->flags = ref; | ||
1828 | 1998 | ||
1829 | /* | 1999 | /* |
1830 | * Do the initial record conversion from mcount jump | 2000 | * Do the initial record conversion from mcount jump |
1831 | * to the NOP instructions. | 2001 | * to the NOP instructions. |
1832 | */ | 2002 | */ |
1833 | if (!ftrace_code_disable(mod, p)) { | 2003 | if (!ftrace_code_disable(mod, p)) |
1834 | ftrace_free_rec(p); | 2004 | break; |
1835 | /* Game over */ | ||
1836 | break; | ||
1837 | } | ||
1838 | 2005 | ||
1839 | ftrace_update_cnt++; | 2006 | ftrace_update_cnt++; |
1840 | 2007 | ||
1841 | /* | 2008 | /* |
1842 | * If the tracing is enabled, go ahead and enable the record. | 2009 | * If the tracing is enabled, go ahead and enable the record. |
1843 | * | 2010 | * |
1844 | * The reason not to enable the record immediately is the | 2011 | * The reason not to enable the record immediately is the |
1845 | * inherent check of ftrace_make_nop/ftrace_make_call for | 2012 | * inherent check of ftrace_make_nop/ftrace_make_call for |
1846 | * correct previous instructions. Making first the NOP | 2013 | * correct previous instructions. Making first the NOP |
1847 | * conversion puts the module to the correct state, thus | 2014 | * conversion puts the module to the correct state, thus |
1848 | * passing the ftrace_make_call check. | 2015 | * passing the ftrace_make_call check. |
1849 | */ | 2016 | */ |
1850 | if (ftrace_start_up && ref) { | 2017 | if (ftrace_start_up && ref) { |
1851 | int failed = __ftrace_replace_code(p, 1); | 2018 | int failed = __ftrace_replace_code(p, 1); |
1852 | if (failed) { | 2019 | if (failed) |
1853 | ftrace_bug(failed, p->ip); | 2020 | ftrace_bug(failed, p->ip); |
1854 | ftrace_free_rec(p); | ||
1855 | } | 2021 | } |
1856 | } | 2022 | } |
1857 | } | 2023 | } |
1858 | 2024 | ||
2025 | ftrace_new_pgs = NULL; | ||
2026 | |||
1859 | stop = ftrace_now(raw_smp_processor_id()); | 2027 | stop = ftrace_now(raw_smp_processor_id()); |
1860 | ftrace_update_time = stop - start; | 2028 | ftrace_update_time = stop - start; |
1861 | ftrace_update_tot_cnt += ftrace_update_cnt; | 2029 | ftrace_update_tot_cnt += ftrace_update_cnt; |
@@ -1863,57 +2031,108 @@ static int ftrace_update_code(struct module *mod) | |||
1863 | return 0; | 2031 | return 0; |
1864 | } | 2032 | } |
1865 | 2033 | ||
1866 | static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) | 2034 | static int ftrace_allocate_records(struct ftrace_page *pg, int count) |
1867 | { | 2035 | { |
1868 | struct ftrace_page *pg; | 2036 | int order; |
1869 | int cnt; | 2037 | int cnt; |
1870 | int i; | ||
1871 | 2038 | ||
1872 | /* allocate a few pages */ | 2039 | if (WARN_ON(!count)) |
1873 | ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); | 2040 | return -EINVAL; |
1874 | if (!ftrace_pages_start) | 2041 | |
1875 | return -1; | 2042 | order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); |
1876 | 2043 | ||
1877 | /* | 2044 | /* |
1878 | * Allocate a few more pages. | 2045 | * We want to fill as much as possible. No more than a page |
1879 | * | 2046 | * may be empty. |
1880 | * TODO: have some parser search vmlinux before | ||
1881 | * final linking to find all calls to ftrace. | ||
1882 | * Then we can: | ||
1883 | * a) know how many pages to allocate. | ||
1884 | * and/or | ||
1885 | * b) set up the table then. | ||
1886 | * | ||
1887 | * The dynamic code is still necessary for | ||
1888 | * modules. | ||
1889 | */ | 2047 | */ |
2048 | while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) | ||
2049 | order--; | ||
1890 | 2050 | ||
1891 | pg = ftrace_pages = ftrace_pages_start; | 2051 | again: |
2052 | pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); | ||
1892 | 2053 | ||
1893 | cnt = num_to_init / ENTRIES_PER_PAGE; | 2054 | if (!pg->records) { |
1894 | pr_info("ftrace: allocating %ld entries in %d pages\n", | 2055 | /* if we can't allocate this size, try something smaller */ |
1895 | num_to_init, cnt + 1); | 2056 | if (!order) |
2057 | return -ENOMEM; | ||
2058 | order >>= 1; | ||
2059 | goto again; | ||
2060 | } | ||
1896 | 2061 | ||
1897 | for (i = 0; i < cnt; i++) { | 2062 | cnt = (PAGE_SIZE << order) / ENTRY_SIZE; |
1898 | pg->next = (void *)get_zeroed_page(GFP_KERNEL); | 2063 | pg->size = cnt; |
1899 | 2064 | ||
1900 | /* If we fail, we'll try later anyway */ | 2065 | if (cnt > count) |
1901 | if (!pg->next) | 2066 | cnt = count; |
2067 | |||
2068 | return cnt; | ||
2069 | } | ||
2070 | |||
2071 | static struct ftrace_page * | ||
2072 | ftrace_allocate_pages(unsigned long num_to_init) | ||
2073 | { | ||
2074 | struct ftrace_page *start_pg; | ||
2075 | struct ftrace_page *pg; | ||
2076 | int order; | ||
2077 | int cnt; | ||
2078 | |||
2079 | if (!num_to_init) | ||
2080 | return 0; | ||
2081 | |||
2082 | start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); | ||
2083 | if (!pg) | ||
2084 | return NULL; | ||
2085 | |||
2086 | /* | ||
2087 | * Try to allocate as much as possible in one contiguous | ||
2088 | * location that fills in all of the space. We want to | ||
2089 | * waste as little space as possible. | ||
2090 | */ | ||
2091 | for (;;) { | ||
2092 | cnt = ftrace_allocate_records(pg, num_to_init); | ||
2093 | if (cnt < 0) | ||
2094 | goto free_pages; | ||
2095 | |||
2096 | num_to_init -= cnt; | ||
2097 | if (!num_to_init) | ||
1902 | break; | 2098 | break; |
1903 | 2099 | ||
2100 | pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); | ||
2101 | if (!pg->next) | ||
2102 | goto free_pages; | ||
2103 | |||
1904 | pg = pg->next; | 2104 | pg = pg->next; |
1905 | } | 2105 | } |
1906 | 2106 | ||
1907 | return 0; | 2107 | return start_pg; |
2108 | |||
2109 | free_pages: | ||
2110 | while (start_pg) { | ||
2111 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | ||
2112 | free_pages((unsigned long)pg->records, order); | ||
2113 | start_pg = pg->next; | ||
2114 | kfree(pg); | ||
2115 | pg = start_pg; | ||
2116 | } | ||
2117 | pr_info("ftrace: FAILED to allocate memory for functions\n"); | ||
2118 | return NULL; | ||
1908 | } | 2119 | } |
1909 | 2120 | ||
1910 | enum { | 2121 | static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) |
1911 | FTRACE_ITER_FILTER = (1 << 0), | 2122 | { |
1912 | FTRACE_ITER_NOTRACE = (1 << 1), | 2123 | int cnt; |
1913 | FTRACE_ITER_PRINTALL = (1 << 2), | 2124 | |
1914 | FTRACE_ITER_HASH = (1 << 3), | 2125 | if (!num_to_init) { |
1915 | FTRACE_ITER_ENABLED = (1 << 4), | 2126 | pr_info("ftrace: No functions to be traced?\n"); |
1916 | }; | 2127 | return -1; |
2128 | } | ||
2129 | |||
2130 | cnt = num_to_init / ENTRIES_PER_PAGE; | ||
2131 | pr_info("ftrace: allocating %ld entries in %d pages\n", | ||
2132 | num_to_init, cnt + 1); | ||
2133 | |||
2134 | return 0; | ||
2135 | } | ||
1917 | 2136 | ||
1918 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 2137 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
1919 | 2138 | ||
@@ -1979,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) | |||
1979 | void *p = NULL; | 2198 | void *p = NULL; |
1980 | loff_t l; | 2199 | loff_t l; |
1981 | 2200 | ||
2201 | if (!(iter->flags & FTRACE_ITER_DO_HASH)) | ||
2202 | return NULL; | ||
2203 | |||
1982 | if (iter->func_pos > *pos) | 2204 | if (iter->func_pos > *pos) |
1983 | return NULL; | 2205 | return NULL; |
1984 | 2206 | ||
@@ -2022,7 +2244,7 @@ static void * | |||
2022 | t_next(struct seq_file *m, void *v, loff_t *pos) | 2244 | t_next(struct seq_file *m, void *v, loff_t *pos) |
2023 | { | 2245 | { |
2024 | struct ftrace_iterator *iter = m->private; | 2246 | struct ftrace_iterator *iter = m->private; |
2025 | struct ftrace_ops *ops = &global_ops; | 2247 | struct ftrace_ops *ops = iter->ops; |
2026 | struct dyn_ftrace *rec = NULL; | 2248 | struct dyn_ftrace *rec = NULL; |
2027 | 2249 | ||
2028 | if (unlikely(ftrace_disabled)) | 2250 | if (unlikely(ftrace_disabled)) |
@@ -2046,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
2046 | } | 2268 | } |
2047 | } else { | 2269 | } else { |
2048 | rec = &iter->pg->records[iter->idx++]; | 2270 | rec = &iter->pg->records[iter->idx++]; |
2049 | if ((rec->flags & FTRACE_FL_FREE) || | 2271 | if (((iter->flags & FTRACE_ITER_FILTER) && |
2050 | |||
2051 | ((iter->flags & FTRACE_ITER_FILTER) && | ||
2052 | !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || | 2272 | !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || |
2053 | 2273 | ||
2054 | ((iter->flags & FTRACE_ITER_NOTRACE) && | 2274 | ((iter->flags & FTRACE_ITER_NOTRACE) && |
@@ -2080,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
2080 | static void *t_start(struct seq_file *m, loff_t *pos) | 2300 | static void *t_start(struct seq_file *m, loff_t *pos) |
2081 | { | 2301 | { |
2082 | struct ftrace_iterator *iter = m->private; | 2302 | struct ftrace_iterator *iter = m->private; |
2083 | struct ftrace_ops *ops = &global_ops; | 2303 | struct ftrace_ops *ops = iter->ops; |
2084 | void *p = NULL; | 2304 | void *p = NULL; |
2085 | loff_t l; | 2305 | loff_t l; |
2086 | 2306 | ||
@@ -2100,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
2100 | * off, we can short cut and just print out that all | 2320 | * off, we can short cut and just print out that all |
2101 | * functions are enabled. | 2321 | * functions are enabled. |
2102 | */ | 2322 | */ |
2103 | if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { | 2323 | if (iter->flags & FTRACE_ITER_FILTER && |
2324 | ftrace_hash_empty(ops->filter_hash)) { | ||
2104 | if (*pos > 0) | 2325 | if (*pos > 0) |
2105 | return t_hash_start(m, pos); | 2326 | return t_hash_start(m, pos); |
2106 | iter->flags |= FTRACE_ITER_PRINTALL; | 2327 | iter->flags |= FTRACE_ITER_PRINTALL; |
@@ -2125,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
2125 | break; | 2346 | break; |
2126 | } | 2347 | } |
2127 | 2348 | ||
2128 | if (!p) { | 2349 | if (!p) |
2129 | if (iter->flags & FTRACE_ITER_FILTER) | 2350 | return t_hash_start(m, pos); |
2130 | return t_hash_start(m, pos); | ||
2131 | |||
2132 | return NULL; | ||
2133 | } | ||
2134 | 2351 | ||
2135 | return iter; | 2352 | return iter; |
2136 | } | 2353 | } |
@@ -2188,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) | |||
2188 | return -ENOMEM; | 2405 | return -ENOMEM; |
2189 | 2406 | ||
2190 | iter->pg = ftrace_pages_start; | 2407 | iter->pg = ftrace_pages_start; |
2408 | iter->ops = &global_ops; | ||
2191 | 2409 | ||
2192 | ret = seq_open(file, &show_ftrace_seq_ops); | 2410 | ret = seq_open(file, &show_ftrace_seq_ops); |
2193 | if (!ret) { | 2411 | if (!ret) { |
@@ -2216,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file) | |||
2216 | 2434 | ||
2217 | iter->pg = ftrace_pages_start; | 2435 | iter->pg = ftrace_pages_start; |
2218 | iter->flags = FTRACE_ITER_ENABLED; | 2436 | iter->flags = FTRACE_ITER_ENABLED; |
2437 | iter->ops = &global_ops; | ||
2219 | 2438 | ||
2220 | ret = seq_open(file, &show_ftrace_seq_ops); | 2439 | ret = seq_open(file, &show_ftrace_seq_ops); |
2221 | if (!ret) { | 2440 | if (!ret) { |
@@ -2236,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) | |||
2236 | mutex_unlock(&ftrace_lock); | 2455 | mutex_unlock(&ftrace_lock); |
2237 | } | 2456 | } |
2238 | 2457 | ||
2239 | static int | 2458 | /** |
2459 | * ftrace_regex_open - initialize function tracer filter files | ||
2460 | * @ops: The ftrace_ops that hold the hash filters | ||
2461 | * @flag: The type of filter to process | ||
2462 | * @inode: The inode, usually passed in to your open routine | ||
2463 | * @file: The file, usually passed in to your open routine | ||
2464 | * | ||
2465 | * ftrace_regex_open() initializes the filter files for the | ||
2466 | * @ops. Depending on @flag it may process the filter hash or | ||
2467 | * the notrace hash of @ops. With this called from the open | ||
2468 | * routine, you can use ftrace_filter_write() for the write | ||
2469 | * routine if @flag has FTRACE_ITER_FILTER set, or | ||
2470 | * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. | ||
2471 | * ftrace_regex_lseek() should be used as the lseek routine, and | ||
2472 | * release must call ftrace_regex_release(). | ||
2473 | */ | ||
2474 | int | ||
2240 | ftrace_regex_open(struct ftrace_ops *ops, int flag, | 2475 | ftrace_regex_open(struct ftrace_ops *ops, int flag, |
2241 | struct inode *inode, struct file *file) | 2476 | struct inode *inode, struct file *file) |
2242 | { | 2477 | { |
@@ -2305,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
2305 | static int | 2540 | static int |
2306 | ftrace_filter_open(struct inode *inode, struct file *file) | 2541 | ftrace_filter_open(struct inode *inode, struct file *file) |
2307 | { | 2542 | { |
2308 | return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, | 2543 | return ftrace_regex_open(&global_ops, |
2309 | inode, file); | 2544 | FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, |
2545 | inode, file); | ||
2310 | } | 2546 | } |
2311 | 2547 | ||
2312 | static int | 2548 | static int |
@@ -2316,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) | |||
2316 | inode, file); | 2552 | inode, file); |
2317 | } | 2553 | } |
2318 | 2554 | ||
2319 | static loff_t | 2555 | loff_t |
2320 | ftrace_regex_lseek(struct file *file, loff_t offset, int origin) | 2556 | ftrace_regex_lseek(struct file *file, loff_t offset, int origin) |
2321 | { | 2557 | { |
2322 | loff_t ret; | 2558 | loff_t ret; |
@@ -2425,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff, | |||
2425 | goto out_unlock; | 2661 | goto out_unlock; |
2426 | 2662 | ||
2427 | do_for_each_ftrace_rec(pg, rec) { | 2663 | do_for_each_ftrace_rec(pg, rec) { |
2428 | |||
2429 | if (ftrace_match_record(rec, mod, search, search_len, type)) { | 2664 | if (ftrace_match_record(rec, mod, search, search_len, type)) { |
2430 | ret = enter_record(hash, rec, not); | 2665 | ret = enter_record(hash, rec, not); |
2431 | if (ret < 0) { | 2666 | if (ret < 0) { |
@@ -2870,14 +3105,14 @@ out_unlock: | |||
2870 | return ret; | 3105 | return ret; |
2871 | } | 3106 | } |
2872 | 3107 | ||
2873 | static ssize_t | 3108 | ssize_t |
2874 | ftrace_filter_write(struct file *file, const char __user *ubuf, | 3109 | ftrace_filter_write(struct file *file, const char __user *ubuf, |
2875 | size_t cnt, loff_t *ppos) | 3110 | size_t cnt, loff_t *ppos) |
2876 | { | 3111 | { |
2877 | return ftrace_regex_write(file, ubuf, cnt, ppos, 1); | 3112 | return ftrace_regex_write(file, ubuf, cnt, ppos, 1); |
2878 | } | 3113 | } |
2879 | 3114 | ||
2880 | static ssize_t | 3115 | ssize_t |
2881 | ftrace_notrace_write(struct file *file, const char __user *ubuf, | 3116 | ftrace_notrace_write(struct file *file, const char __user *ubuf, |
2882 | size_t cnt, loff_t *ppos) | 3117 | size_t cnt, loff_t *ppos) |
2883 | { | 3118 | { |
@@ -2918,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
2918 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 3153 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
2919 | if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED | 3154 | if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED |
2920 | && ftrace_enabled) | 3155 | && ftrace_enabled) |
2921 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | 3156 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); |
2922 | 3157 | ||
2923 | mutex_unlock(&ftrace_lock); | 3158 | mutex_unlock(&ftrace_lock); |
2924 | 3159 | ||
@@ -3044,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf) | |||
3044 | } | 3279 | } |
3045 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 3280 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
3046 | 3281 | ||
3047 | static void __init | 3282 | void __init |
3048 | set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) | 3283 | ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) |
3049 | { | 3284 | { |
3050 | char *func; | 3285 | char *func; |
3051 | 3286 | ||
@@ -3058,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) | |||
3058 | static void __init set_ftrace_early_filters(void) | 3293 | static void __init set_ftrace_early_filters(void) |
3059 | { | 3294 | { |
3060 | if (ftrace_filter_buf[0]) | 3295 | if (ftrace_filter_buf[0]) |
3061 | set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); | 3296 | ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); |
3062 | if (ftrace_notrace_buf[0]) | 3297 | if (ftrace_notrace_buf[0]) |
3063 | set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); | 3298 | ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); |
3064 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3299 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
3065 | if (ftrace_graph_buf[0]) | 3300 | if (ftrace_graph_buf[0]) |
3066 | set_ftrace_early_graph(ftrace_graph_buf); | 3301 | set_ftrace_early_graph(ftrace_graph_buf); |
3067 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 3302 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
3068 | } | 3303 | } |
3069 | 3304 | ||
3070 | static int | 3305 | int ftrace_regex_release(struct inode *inode, struct file *file) |
3071 | ftrace_regex_release(struct inode *inode, struct file *file) | ||
3072 | { | 3306 | { |
3073 | struct seq_file *m = (struct seq_file *)file->private_data; | 3307 | struct seq_file *m = (struct seq_file *)file->private_data; |
3074 | struct ftrace_iterator *iter; | 3308 | struct ftrace_iterator *iter; |
@@ -3106,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) | |||
3106 | orig_hash, iter->hash); | 3340 | orig_hash, iter->hash); |
3107 | if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) | 3341 | if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) |
3108 | && ftrace_enabled) | 3342 | && ftrace_enabled) |
3109 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | 3343 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); |
3110 | 3344 | ||
3111 | mutex_unlock(&ftrace_lock); | 3345 | mutex_unlock(&ftrace_lock); |
3112 | } | 3346 | } |
@@ -3269,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
3269 | 3503 | ||
3270 | do_for_each_ftrace_rec(pg, rec) { | 3504 | do_for_each_ftrace_rec(pg, rec) { |
3271 | 3505 | ||
3272 | if (rec->flags & FTRACE_FL_FREE) | ||
3273 | continue; | ||
3274 | |||
3275 | if (ftrace_match_record(rec, NULL, search, search_len, type)) { | 3506 | if (ftrace_match_record(rec, NULL, search, search_len, type)) { |
3276 | /* if it is in the array */ | 3507 | /* if it is in the array */ |
3277 | exists = false; | 3508 | exists = false; |
@@ -3380,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3380 | return 0; | 3611 | return 0; |
3381 | } | 3612 | } |
3382 | 3613 | ||
3614 | static void ftrace_swap_recs(void *a, void *b, int size) | ||
3615 | { | ||
3616 | struct dyn_ftrace *reca = a; | ||
3617 | struct dyn_ftrace *recb = b; | ||
3618 | struct dyn_ftrace t; | ||
3619 | |||
3620 | t = *reca; | ||
3621 | *reca = *recb; | ||
3622 | *recb = t; | ||
3623 | } | ||
3624 | |||
3383 | static int ftrace_process_locs(struct module *mod, | 3625 | static int ftrace_process_locs(struct module *mod, |
3384 | unsigned long *start, | 3626 | unsigned long *start, |
3385 | unsigned long *end) | 3627 | unsigned long *end) |
3386 | { | 3628 | { |
3629 | struct ftrace_page *pg; | ||
3630 | unsigned long count; | ||
3387 | unsigned long *p; | 3631 | unsigned long *p; |
3388 | unsigned long addr; | 3632 | unsigned long addr; |
3389 | unsigned long flags = 0; /* Shut up gcc */ | 3633 | unsigned long flags = 0; /* Shut up gcc */ |
3634 | int ret = -ENOMEM; | ||
3635 | |||
3636 | count = end - start; | ||
3637 | |||
3638 | if (!count) | ||
3639 | return 0; | ||
3640 | |||
3641 | pg = ftrace_allocate_pages(count); | ||
3642 | if (!pg) | ||
3643 | return -ENOMEM; | ||
3390 | 3644 | ||
3391 | mutex_lock(&ftrace_lock); | 3645 | mutex_lock(&ftrace_lock); |
3646 | |||
3647 | /* | ||
3648 | * Core and each module needs their own pages, as | ||
3649 | * modules will free them when they are removed. | ||
3650 | * Force a new page to be allocated for modules. | ||
3651 | */ | ||
3652 | if (!mod) { | ||
3653 | WARN_ON(ftrace_pages || ftrace_pages_start); | ||
3654 | /* First initialization */ | ||
3655 | ftrace_pages = ftrace_pages_start = pg; | ||
3656 | } else { | ||
3657 | if (!ftrace_pages) | ||
3658 | goto out; | ||
3659 | |||
3660 | if (WARN_ON(ftrace_pages->next)) { | ||
3661 | /* Hmm, we have free pages? */ | ||
3662 | while (ftrace_pages->next) | ||
3663 | ftrace_pages = ftrace_pages->next; | ||
3664 | } | ||
3665 | |||
3666 | ftrace_pages->next = pg; | ||
3667 | ftrace_pages = pg; | ||
3668 | } | ||
3669 | |||
3392 | p = start; | 3670 | p = start; |
3393 | while (p < end) { | 3671 | while (p < end) { |
3394 | addr = ftrace_call_adjust(*p++); | 3672 | addr = ftrace_call_adjust(*p++); |
@@ -3400,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod, | |||
3400 | */ | 3678 | */ |
3401 | if (!addr) | 3679 | if (!addr) |
3402 | continue; | 3680 | continue; |
3403 | ftrace_record_ip(addr); | 3681 | if (!ftrace_record_ip(addr)) |
3682 | break; | ||
3404 | } | 3683 | } |
3405 | 3684 | ||
3685 | /* These new locations need to be initialized */ | ||
3686 | ftrace_new_pgs = pg; | ||
3687 | |||
3688 | /* Make each individual set of pages sorted by ips */ | ||
3689 | for (; pg; pg = pg->next) | ||
3690 | sort(pg->records, pg->index, sizeof(struct dyn_ftrace), | ||
3691 | ftrace_cmp_recs, ftrace_swap_recs); | ||
3692 | |||
3406 | /* | 3693 | /* |
3407 | * We only need to disable interrupts on start up | 3694 | * We only need to disable interrupts on start up |
3408 | * because we are modifying code that an interrupt | 3695 | * because we are modifying code that an interrupt |
@@ -3416,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod, | |||
3416 | ftrace_update_code(mod); | 3703 | ftrace_update_code(mod); |
3417 | if (!mod) | 3704 | if (!mod) |
3418 | local_irq_restore(flags); | 3705 | local_irq_restore(flags); |
3706 | ret = 0; | ||
3707 | out: | ||
3419 | mutex_unlock(&ftrace_lock); | 3708 | mutex_unlock(&ftrace_lock); |
3420 | 3709 | ||
3421 | return 0; | 3710 | return ret; |
3422 | } | 3711 | } |
3423 | 3712 | ||
3424 | #ifdef CONFIG_MODULES | 3713 | #ifdef CONFIG_MODULES |
3714 | |||
3715 | #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) | ||
3716 | |||
3425 | void ftrace_release_mod(struct module *mod) | 3717 | void ftrace_release_mod(struct module *mod) |
3426 | { | 3718 | { |
3427 | struct dyn_ftrace *rec; | 3719 | struct dyn_ftrace *rec; |
3720 | struct ftrace_page **last_pg; | ||
3428 | struct ftrace_page *pg; | 3721 | struct ftrace_page *pg; |
3722 | int order; | ||
3429 | 3723 | ||
3430 | mutex_lock(&ftrace_lock); | 3724 | mutex_lock(&ftrace_lock); |
3431 | 3725 | ||
3432 | if (ftrace_disabled) | 3726 | if (ftrace_disabled) |
3433 | goto out_unlock; | 3727 | goto out_unlock; |
3434 | 3728 | ||
3435 | do_for_each_ftrace_rec(pg, rec) { | 3729 | /* |
3730 | * Each module has its own ftrace_pages, remove | ||
3731 | * them from the list. | ||
3732 | */ | ||
3733 | last_pg = &ftrace_pages_start; | ||
3734 | for (pg = ftrace_pages_start; pg; pg = *last_pg) { | ||
3735 | rec = &pg->records[0]; | ||
3436 | if (within_module_core(rec->ip, mod)) { | 3736 | if (within_module_core(rec->ip, mod)) { |
3437 | /* | 3737 | /* |
3438 | * rec->ip is changed in ftrace_free_rec() | 3738 | * As core pages are first, the first |
3439 | * It should not between s and e if record was freed. | 3739 | * page should never be a module page. |
3440 | */ | 3740 | */ |
3441 | FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); | 3741 | if (WARN_ON(pg == ftrace_pages_start)) |
3442 | ftrace_free_rec(rec); | 3742 | goto out_unlock; |
3443 | } | 3743 | |
3444 | } while_for_each_ftrace_rec(); | 3744 | /* Check if we are deleting the last page */ |
3745 | if (pg == ftrace_pages) | ||
3746 | ftrace_pages = next_to_ftrace_page(last_pg); | ||
3747 | |||
3748 | *last_pg = pg->next; | ||
3749 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | ||
3750 | free_pages((unsigned long)pg->records, order); | ||
3751 | kfree(pg); | ||
3752 | } else | ||
3753 | last_pg = &pg->next; | ||
3754 | } | ||
3445 | out_unlock: | 3755 | out_unlock: |
3446 | mutex_unlock(&ftrace_lock); | 3756 | mutex_unlock(&ftrace_lock); |
3447 | } | 3757 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f2bd275bb60f..a3f1bc5d2a00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
338 | /* trace_flags holds trace_options default values */ | 338 | /* trace_flags holds trace_options default values */ |
339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | |
342 | TRACE_ITER_IRQ_INFO; | ||
342 | 343 | ||
343 | static int trace_stop_count; | 344 | static int trace_stop_count; |
344 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); | 345 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
@@ -426,6 +427,7 @@ static const char *trace_options[] = { | |||
426 | "record-cmd", | 427 | "record-cmd", |
427 | "overwrite", | 428 | "overwrite", |
428 | "disable_on_free", | 429 | "disable_on_free", |
430 | "irq-info", | ||
429 | NULL | 431 | NULL |
430 | }; | 432 | }; |
431 | 433 | ||
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p) | |||
1843 | trace_event_read_unlock(); | 1845 | trace_event_read_unlock(); |
1844 | } | 1846 | } |
1845 | 1847 | ||
1848 | static void | ||
1849 | get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) | ||
1850 | { | ||
1851 | unsigned long count; | ||
1852 | int cpu; | ||
1853 | |||
1854 | *total = 0; | ||
1855 | *entries = 0; | ||
1856 | |||
1857 | for_each_tracing_cpu(cpu) { | ||
1858 | count = ring_buffer_entries_cpu(tr->buffer, cpu); | ||
1859 | /* | ||
1860 | * If this buffer has skipped entries, then we hold all | ||
1861 | * entries for the trace and we need to ignore the | ||
1862 | * ones before the time stamp. | ||
1863 | */ | ||
1864 | if (tr->data[cpu]->skipped_entries) { | ||
1865 | count -= tr->data[cpu]->skipped_entries; | ||
1866 | /* total is the same as the entries */ | ||
1867 | *total += count; | ||
1868 | } else | ||
1869 | *total += count + | ||
1870 | ring_buffer_overrun_cpu(tr->buffer, cpu); | ||
1871 | *entries += count; | ||
1872 | } | ||
1873 | } | ||
1874 | |||
1846 | static void print_lat_help_header(struct seq_file *m) | 1875 | static void print_lat_help_header(struct seq_file *m) |
1847 | { | 1876 | { |
1848 | seq_puts(m, "# _------=> CPU# \n"); | 1877 | seq_puts(m, "# _------=> CPU# \n"); |
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m) | |||
1855 | seq_puts(m, "# \\ / ||||| \\ | / \n"); | 1884 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
1856 | } | 1885 | } |
1857 | 1886 | ||
1858 | static void print_func_help_header(struct seq_file *m) | 1887 | static void print_event_info(struct trace_array *tr, struct seq_file *m) |
1888 | { | ||
1889 | unsigned long total; | ||
1890 | unsigned long entries; | ||
1891 | |||
1892 | get_total_entries(tr, &total, &entries); | ||
1893 | seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", | ||
1894 | entries, total, num_online_cpus()); | ||
1895 | seq_puts(m, "#\n"); | ||
1896 | } | ||
1897 | |||
1898 | static void print_func_help_header(struct trace_array *tr, struct seq_file *m) | ||
1859 | { | 1899 | { |
1860 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | 1900 | print_event_info(tr, m); |
1901 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | ||
1861 | seq_puts(m, "# | | | | |\n"); | 1902 | seq_puts(m, "# | | | | |\n"); |
1862 | } | 1903 | } |
1863 | 1904 | ||
1905 | static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) | ||
1906 | { | ||
1907 | print_event_info(tr, m); | ||
1908 | seq_puts(m, "# _-----=> irqs-off\n"); | ||
1909 | seq_puts(m, "# / _----=> need-resched\n"); | ||
1910 | seq_puts(m, "# | / _---=> hardirq/softirq\n"); | ||
1911 | seq_puts(m, "# || / _--=> preempt-depth\n"); | ||
1912 | seq_puts(m, "# ||| / delay\n"); | ||
1913 | seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); | ||
1914 | seq_puts(m, "# | | | |||| | |\n"); | ||
1915 | } | ||
1864 | 1916 | ||
1865 | void | 1917 | void |
1866 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) | 1918 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) |
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) | |||
1869 | struct trace_array *tr = iter->tr; | 1921 | struct trace_array *tr = iter->tr; |
1870 | struct trace_array_cpu *data = tr->data[tr->cpu]; | 1922 | struct trace_array_cpu *data = tr->data[tr->cpu]; |
1871 | struct tracer *type = current_trace; | 1923 | struct tracer *type = current_trace; |
1872 | unsigned long entries = 0; | 1924 | unsigned long entries; |
1873 | unsigned long total = 0; | 1925 | unsigned long total; |
1874 | unsigned long count; | ||
1875 | const char *name = "preemption"; | 1926 | const char *name = "preemption"; |
1876 | int cpu; | ||
1877 | 1927 | ||
1878 | if (type) | 1928 | if (type) |
1879 | name = type->name; | 1929 | name = type->name; |
1880 | 1930 | ||
1881 | 1931 | get_total_entries(tr, &total, &entries); | |
1882 | for_each_tracing_cpu(cpu) { | ||
1883 | count = ring_buffer_entries_cpu(tr->buffer, cpu); | ||
1884 | /* | ||
1885 | * If this buffer has skipped entries, then we hold all | ||
1886 | * entries for the trace and we need to ignore the | ||
1887 | * ones before the time stamp. | ||
1888 | */ | ||
1889 | if (tr->data[cpu]->skipped_entries) { | ||
1890 | count -= tr->data[cpu]->skipped_entries; | ||
1891 | /* total is the same as the entries */ | ||
1892 | total += count; | ||
1893 | } else | ||
1894 | total += count + | ||
1895 | ring_buffer_overrun_cpu(tr->buffer, cpu); | ||
1896 | entries += count; | ||
1897 | } | ||
1898 | 1932 | ||
1899 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", | 1933 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", |
1900 | name, UTS_RELEASE); | 1934 | name, UTS_RELEASE); |
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
2140 | return print_trace_fmt(iter); | 2174 | return print_trace_fmt(iter); |
2141 | } | 2175 | } |
2142 | 2176 | ||
2177 | void trace_latency_header(struct seq_file *m) | ||
2178 | { | ||
2179 | struct trace_iterator *iter = m->private; | ||
2180 | |||
2181 | /* print nothing if the buffers are empty */ | ||
2182 | if (trace_empty(iter)) | ||
2183 | return; | ||
2184 | |||
2185 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) | ||
2186 | print_trace_header(m, iter); | ||
2187 | |||
2188 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | ||
2189 | print_lat_help_header(m); | ||
2190 | } | ||
2191 | |||
2143 | void trace_default_header(struct seq_file *m) | 2192 | void trace_default_header(struct seq_file *m) |
2144 | { | 2193 | { |
2145 | struct trace_iterator *iter = m->private; | 2194 | struct trace_iterator *iter = m->private; |
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m) | |||
2155 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | 2204 | if (!(trace_flags & TRACE_ITER_VERBOSE)) |
2156 | print_lat_help_header(m); | 2205 | print_lat_help_header(m); |
2157 | } else { | 2206 | } else { |
2158 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | 2207 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { |
2159 | print_func_help_header(m); | 2208 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
2209 | print_func_help_header_irq(iter->tr, m); | ||
2210 | else | ||
2211 | print_func_help_header(iter->tr, m); | ||
2212 | } | ||
2160 | } | 2213 | } |
2161 | } | 2214 | } |
2162 | 2215 | ||
@@ -4385,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = { | |||
4385 | }; | 4438 | }; |
4386 | 4439 | ||
4387 | struct dentry *trace_create_file(const char *name, | 4440 | struct dentry *trace_create_file(const char *name, |
4388 | mode_t mode, | 4441 | umode_t mode, |
4389 | struct dentry *parent, | 4442 | struct dentry *parent, |
4390 | void *data, | 4443 | void *data, |
4391 | const struct file_operations *fops) | 4444 | const struct file_operations *fops) |
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
4775 | { | 4828 | { |
4776 | __ftrace_dump(true, oops_dump_mode); | 4829 | __ftrace_dump(true, oops_dump_mode); |
4777 | } | 4830 | } |
4831 | EXPORT_SYMBOL_GPL(ftrace_dump); | ||
4778 | 4832 | ||
4779 | __init static int tracer_alloc_buffers(void) | 4833 | __init static int tracer_alloc_buffers(void) |
4780 | { | 4834 | { |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18dc..b93ecbadad6d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu); | |||
312 | void tracing_reset_current_online_cpus(void); | 312 | void tracing_reset_current_online_cpus(void); |
313 | int tracing_open_generic(struct inode *inode, struct file *filp); | 313 | int tracing_open_generic(struct inode *inode, struct file *filp); |
314 | struct dentry *trace_create_file(const char *name, | 314 | struct dentry *trace_create_file(const char *name, |
315 | mode_t mode, | 315 | umode_t mode, |
316 | struct dentry *parent, | 316 | struct dentry *parent, |
317 | void *data, | 317 | void *data, |
318 | const struct file_operations *fops); | 318 | const struct file_operations *fops); |
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, | |||
370 | unsigned long ip, | 370 | unsigned long ip, |
371 | unsigned long parent_ip, | 371 | unsigned long parent_ip, |
372 | unsigned long flags, int pc); | 372 | unsigned long flags, int pc); |
373 | void trace_latency_header(struct seq_file *m); | ||
373 | void trace_default_header(struct seq_file *m); | 374 | void trace_default_header(struct seq_file *m); |
374 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); | 375 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); |
375 | int trace_empty(struct trace_iterator *iter); | 376 | int trace_empty(struct trace_iterator *iter); |
@@ -654,6 +655,7 @@ enum trace_iterator_flags { | |||
654 | TRACE_ITER_RECORD_CMD = 0x100000, | 655 | TRACE_ITER_RECORD_CMD = 0x100000, |
655 | TRACE_ITER_OVERWRITE = 0x200000, | 656 | TRACE_ITER_OVERWRITE = 0x200000, |
656 | TRACE_ITER_STOP_ON_FREE = 0x400000, | 657 | TRACE_ITER_STOP_ON_FREE = 0x400000, |
658 | TRACE_ITER_IRQ_INFO = 0x800000, | ||
657 | }; | 659 | }; |
658 | 660 | ||
659 | /* | 661 | /* |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f387..c212a7f934ec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) | |||
1078 | /* First see if we did not already create this dir */ | 1078 | /* First see if we did not already create this dir */ |
1079 | list_for_each_entry(system, &event_subsystems, list) { | 1079 | list_for_each_entry(system, &event_subsystems, list) { |
1080 | if (strcmp(system->name, name) == 0) { | 1080 | if (strcmp(system->name, name) == 0) { |
1081 | __get_system(system); | ||
1082 | system->nr_events++; | 1081 | system->nr_events++; |
1083 | return system->entry; | 1082 | return system->entry; |
1084 | } | 1083 | } |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 816d3d074979..24aee7127451 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -27,6 +27,12 @@ | |||
27 | #include "trace.h" | 27 | #include "trace.h" |
28 | #include "trace_output.h" | 28 | #include "trace_output.h" |
29 | 29 | ||
30 | #define DEFAULT_SYS_FILTER_MESSAGE \ | ||
31 | "### global filter ###\n" \ | ||
32 | "# Use this to set filters for multiple events.\n" \ | ||
33 | "# Only events with the given fields will be affected.\n" \ | ||
34 | "# If no events are modified, an error message will be displayed here" | ||
35 | |||
30 | enum filter_op_ids | 36 | enum filter_op_ids |
31 | { | 37 | { |
32 | OP_OR, | 38 | OP_OR, |
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
646 | if (filter && filter->filter_string) | 652 | if (filter && filter->filter_string) |
647 | trace_seq_printf(s, "%s\n", filter->filter_string); | 653 | trace_seq_printf(s, "%s\n", filter->filter_string); |
648 | else | 654 | else |
649 | trace_seq_printf(s, "none\n"); | 655 | trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); |
650 | mutex_unlock(&event_mutex); | 656 | mutex_unlock(&event_mutex); |
651 | } | 657 | } |
652 | 658 | ||
@@ -1649,7 +1655,9 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1649 | */ | 1655 | */ |
1650 | err = replace_preds(call, NULL, ps, filter_string, true); | 1656 | err = replace_preds(call, NULL, ps, filter_string, true); |
1651 | if (err) | 1657 | if (err) |
1652 | goto fail; | 1658 | call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; |
1659 | else | ||
1660 | call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; | ||
1653 | } | 1661 | } |
1654 | 1662 | ||
1655 | list_for_each_entry(call, &ftrace_events, list) { | 1663 | list_for_each_entry(call, &ftrace_events, list) { |
@@ -1658,6 +1666,9 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1658 | if (strcmp(call->class->system, system->name) != 0) | 1666 | if (strcmp(call->class->system, system->name) != 0) |
1659 | continue; | 1667 | continue; |
1660 | 1668 | ||
1669 | if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) | ||
1670 | continue; | ||
1671 | |||
1661 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); | 1672 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); |
1662 | if (!filter_item) | 1673 | if (!filter_item) |
1663 | goto fail_mem; | 1674 | goto fail_mem; |
@@ -1686,7 +1697,7 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1686 | * replace the filter for the call. | 1697 | * replace the filter for the call. |
1687 | */ | 1698 | */ |
1688 | filter = call->filter; | 1699 | filter = call->filter; |
1689 | call->filter = filter_item->filter; | 1700 | rcu_assign_pointer(call->filter, filter_item->filter); |
1690 | filter_item->filter = filter; | 1701 | filter_item->filter = filter; |
1691 | 1702 | ||
1692 | fail = false; | 1703 | fail = false; |
@@ -1727,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1727 | return -ENOMEM; | 1738 | return -ENOMEM; |
1728 | } | 1739 | } |
1729 | 1740 | ||
1741 | static int create_filter_start(char *filter_str, bool set_str, | ||
1742 | struct filter_parse_state **psp, | ||
1743 | struct event_filter **filterp) | ||
1744 | { | ||
1745 | struct event_filter *filter; | ||
1746 | struct filter_parse_state *ps = NULL; | ||
1747 | int err = 0; | ||
1748 | |||
1749 | WARN_ON_ONCE(*psp || *filterp); | ||
1750 | |||
1751 | /* allocate everything, and if any fails, free all and fail */ | ||
1752 | filter = __alloc_filter(); | ||
1753 | if (filter && set_str) | ||
1754 | err = replace_filter_string(filter, filter_str); | ||
1755 | |||
1756 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | ||
1757 | |||
1758 | if (!filter || !ps || err) { | ||
1759 | kfree(ps); | ||
1760 | __free_filter(filter); | ||
1761 | return -ENOMEM; | ||
1762 | } | ||
1763 | |||
1764 | /* we're committed to creating a new filter */ | ||
1765 | *filterp = filter; | ||
1766 | *psp = ps; | ||
1767 | |||
1768 | parse_init(ps, filter_ops, filter_str); | ||
1769 | err = filter_parse(ps); | ||
1770 | if (err && set_str) | ||
1771 | append_filter_err(ps, filter); | ||
1772 | return err; | ||
1773 | } | ||
1774 | |||
1775 | static void create_filter_finish(struct filter_parse_state *ps) | ||
1776 | { | ||
1777 | if (ps) { | ||
1778 | filter_opstack_clear(ps); | ||
1779 | postfix_clear(ps); | ||
1780 | kfree(ps); | ||
1781 | } | ||
1782 | } | ||
1783 | |||
1784 | /** | ||
1785 | * create_filter - create a filter for a ftrace_event_call | ||
1786 | * @call: ftrace_event_call to create a filter for | ||
1787 | * @filter_str: filter string | ||
1788 | * @set_str: remember @filter_str and enable detailed error in filter | ||
1789 | * @filterp: out param for created filter (always updated on return) | ||
1790 | * | ||
1791 | * Creates a filter for @call with @filter_str. If @set_str is %true, | ||
1792 | * @filter_str is copied and recorded in the new filter. | ||
1793 | * | ||
1794 | * On success, returns 0 and *@filterp points to the new filter. On | ||
1795 | * failure, returns -errno and *@filterp may point to %NULL or to a new | ||
1796 | * filter. In the latter case, the returned filter contains error | ||
1797 | * information if @set_str is %true and the caller is responsible for | ||
1798 | * freeing it. | ||
1799 | */ | ||
1800 | static int create_filter(struct ftrace_event_call *call, | ||
1801 | char *filter_str, bool set_str, | ||
1802 | struct event_filter **filterp) | ||
1803 | { | ||
1804 | struct event_filter *filter = NULL; | ||
1805 | struct filter_parse_state *ps = NULL; | ||
1806 | int err; | ||
1807 | |||
1808 | err = create_filter_start(filter_str, set_str, &ps, &filter); | ||
1809 | if (!err) { | ||
1810 | err = replace_preds(call, filter, ps, filter_str, false); | ||
1811 | if (err && set_str) | ||
1812 | append_filter_err(ps, filter); | ||
1813 | } | ||
1814 | create_filter_finish(ps); | ||
1815 | |||
1816 | *filterp = filter; | ||
1817 | return err; | ||
1818 | } | ||
1819 | |||
1820 | /** | ||
1821 | * create_system_filter - create a filter for an event_subsystem | ||
1822 | * @system: event_subsystem to create a filter for | ||
1823 | * @filter_str: filter string | ||
1824 | * @filterp: out param for created filter (always updated on return) | ||
1825 | * | ||
1826 | * Identical to create_filter() except that it creates a subsystem filter | ||
1827 | * and always remembers @filter_str. | ||
1828 | */ | ||
1829 | static int create_system_filter(struct event_subsystem *system, | ||
1830 | char *filter_str, struct event_filter **filterp) | ||
1831 | { | ||
1832 | struct event_filter *filter = NULL; | ||
1833 | struct filter_parse_state *ps = NULL; | ||
1834 | int err; | ||
1835 | |||
1836 | err = create_filter_start(filter_str, true, &ps, &filter); | ||
1837 | if (!err) { | ||
1838 | err = replace_system_preds(system, ps, filter_str); | ||
1839 | if (!err) { | ||
1840 | /* System filters just show a default message */ | ||
1841 | kfree(filter->filter_string); | ||
1842 | filter->filter_string = NULL; | ||
1843 | } else { | ||
1844 | append_filter_err(ps, filter); | ||
1845 | } | ||
1846 | } | ||
1847 | create_filter_finish(ps); | ||
1848 | |||
1849 | *filterp = filter; | ||
1850 | return err; | ||
1851 | } | ||
1852 | |||
1730 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | 1853 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) |
1731 | { | 1854 | { |
1732 | struct filter_parse_state *ps; | ||
1733 | struct event_filter *filter; | 1855 | struct event_filter *filter; |
1734 | struct event_filter *tmp; | ||
1735 | int err = 0; | 1856 | int err = 0; |
1736 | 1857 | ||
1737 | mutex_lock(&event_mutex); | 1858 | mutex_lock(&event_mutex); |
@@ -1741,56 +1862,37 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1741 | filter = call->filter; | 1862 | filter = call->filter; |
1742 | if (!filter) | 1863 | if (!filter) |
1743 | goto out_unlock; | 1864 | goto out_unlock; |
1744 | call->filter = NULL; | 1865 | RCU_INIT_POINTER(call->filter, NULL); |
1745 | /* Make sure the filter is not being used */ | 1866 | /* Make sure the filter is not being used */ |
1746 | synchronize_sched(); | 1867 | synchronize_sched(); |
1747 | __free_filter(filter); | 1868 | __free_filter(filter); |
1748 | goto out_unlock; | 1869 | goto out_unlock; |
1749 | } | 1870 | } |
1750 | 1871 | ||
1751 | err = -ENOMEM; | 1872 | err = create_filter(call, filter_string, true, &filter); |
1752 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | ||
1753 | if (!ps) | ||
1754 | goto out_unlock; | ||
1755 | |||
1756 | filter = __alloc_filter(); | ||
1757 | if (!filter) { | ||
1758 | kfree(ps); | ||
1759 | goto out_unlock; | ||
1760 | } | ||
1761 | 1873 | ||
1762 | replace_filter_string(filter, filter_string); | ||
1763 | |||
1764 | parse_init(ps, filter_ops, filter_string); | ||
1765 | err = filter_parse(ps); | ||
1766 | if (err) { | ||
1767 | append_filter_err(ps, filter); | ||
1768 | goto out; | ||
1769 | } | ||
1770 | |||
1771 | err = replace_preds(call, filter, ps, filter_string, false); | ||
1772 | if (err) { | ||
1773 | filter_disable(call); | ||
1774 | append_filter_err(ps, filter); | ||
1775 | } else | ||
1776 | call->flags |= TRACE_EVENT_FL_FILTERED; | ||
1777 | out: | ||
1778 | /* | 1874 | /* |
1779 | * Always swap the call filter with the new filter | 1875 | * Always swap the call filter with the new filter |
1780 | * even if there was an error. If there was an error | 1876 | * even if there was an error. If there was an error |
1781 | * in the filter, we disable the filter and show the error | 1877 | * in the filter, we disable the filter and show the error |
1782 | * string | 1878 | * string |
1783 | */ | 1879 | */ |
1784 | tmp = call->filter; | 1880 | if (filter) { |
1785 | call->filter = filter; | 1881 | struct event_filter *tmp = call->filter; |
1786 | if (tmp) { | 1882 | |
1787 | /* Make sure the call is done with the filter */ | 1883 | if (!err) |
1788 | synchronize_sched(); | 1884 | call->flags |= TRACE_EVENT_FL_FILTERED; |
1789 | __free_filter(tmp); | 1885 | else |
1886 | filter_disable(call); | ||
1887 | |||
1888 | rcu_assign_pointer(call->filter, filter); | ||
1889 | |||
1890 | if (tmp) { | ||
1891 | /* Make sure the call is done with the filter */ | ||
1892 | synchronize_sched(); | ||
1893 | __free_filter(tmp); | ||
1894 | } | ||
1790 | } | 1895 | } |
1791 | filter_opstack_clear(ps); | ||
1792 | postfix_clear(ps); | ||
1793 | kfree(ps); | ||
1794 | out_unlock: | 1896 | out_unlock: |
1795 | mutex_unlock(&event_mutex); | 1897 | mutex_unlock(&event_mutex); |
1796 | 1898 | ||
@@ -1800,7 +1902,6 @@ out_unlock: | |||
1800 | int apply_subsystem_event_filter(struct event_subsystem *system, | 1902 | int apply_subsystem_event_filter(struct event_subsystem *system, |
1801 | char *filter_string) | 1903 | char *filter_string) |
1802 | { | 1904 | { |
1803 | struct filter_parse_state *ps; | ||
1804 | struct event_filter *filter; | 1905 | struct event_filter *filter; |
1805 | int err = 0; | 1906 | int err = 0; |
1806 | 1907 | ||
@@ -1824,38 +1925,15 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1824 | goto out_unlock; | 1925 | goto out_unlock; |
1825 | } | 1926 | } |
1826 | 1927 | ||
1827 | err = -ENOMEM; | 1928 | err = create_system_filter(system, filter_string, &filter); |
1828 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | 1929 | if (filter) { |
1829 | if (!ps) | 1930 | /* |
1830 | goto out_unlock; | 1931 | * No event actually uses the system filter |
1831 | 1932 | * we can free it without synchronize_sched(). | |
1832 | filter = __alloc_filter(); | 1933 | */ |
1833 | if (!filter) | 1934 | __free_filter(system->filter); |
1834 | goto out; | 1935 | system->filter = filter; |
1835 | |||
1836 | replace_filter_string(filter, filter_string); | ||
1837 | /* | ||
1838 | * No event actually uses the system filter | ||
1839 | * we can free it without synchronize_sched(). | ||
1840 | */ | ||
1841 | __free_filter(system->filter); | ||
1842 | system->filter = filter; | ||
1843 | |||
1844 | parse_init(ps, filter_ops, filter_string); | ||
1845 | err = filter_parse(ps); | ||
1846 | if (err) { | ||
1847 | append_filter_err(ps, system->filter); | ||
1848 | goto out; | ||
1849 | } | 1936 | } |
1850 | |||
1851 | err = replace_system_preds(system, ps, filter_string); | ||
1852 | if (err) | ||
1853 | append_filter_err(ps, system->filter); | ||
1854 | |||
1855 | out: | ||
1856 | filter_opstack_clear(ps); | ||
1857 | postfix_clear(ps); | ||
1858 | kfree(ps); | ||
1859 | out_unlock: | 1937 | out_unlock: |
1860 | mutex_unlock(&event_mutex); | 1938 | mutex_unlock(&event_mutex); |
1861 | 1939 | ||
@@ -1877,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1877 | { | 1955 | { |
1878 | int err; | 1956 | int err; |
1879 | struct event_filter *filter; | 1957 | struct event_filter *filter; |
1880 | struct filter_parse_state *ps; | ||
1881 | struct ftrace_event_call *call; | 1958 | struct ftrace_event_call *call; |
1882 | 1959 | ||
1883 | mutex_lock(&event_mutex); | 1960 | mutex_lock(&event_mutex); |
@@ -1892,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1892 | if (event->filter) | 1969 | if (event->filter) |
1893 | goto out_unlock; | 1970 | goto out_unlock; |
1894 | 1971 | ||
1895 | filter = __alloc_filter(); | 1972 | err = create_filter(call, filter_str, false, &filter); |
1896 | if (!filter) { | ||
1897 | err = PTR_ERR(filter); | ||
1898 | goto out_unlock; | ||
1899 | } | ||
1900 | |||
1901 | err = -ENOMEM; | ||
1902 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | ||
1903 | if (!ps) | ||
1904 | goto free_filter; | ||
1905 | |||
1906 | parse_init(ps, filter_ops, filter_str); | ||
1907 | err = filter_parse(ps); | ||
1908 | if (err) | ||
1909 | goto free_ps; | ||
1910 | |||
1911 | err = replace_preds(call, filter, ps, filter_str, false); | ||
1912 | if (!err) | 1973 | if (!err) |
1913 | event->filter = filter; | 1974 | event->filter = filter; |
1914 | 1975 | else | |
1915 | free_ps: | ||
1916 | filter_opstack_clear(ps); | ||
1917 | postfix_clear(ps); | ||
1918 | kfree(ps); | ||
1919 | |||
1920 | free_filter: | ||
1921 | if (err) | ||
1922 | __free_filter(filter); | 1976 | __free_filter(filter); |
1923 | 1977 | ||
1924 | out_unlock: | 1978 | out_unlock: |
@@ -1937,43 +1991,6 @@ out_unlock: | |||
1937 | #define CREATE_TRACE_POINTS | 1991 | #define CREATE_TRACE_POINTS |
1938 | #include "trace_events_filter_test.h" | 1992 | #include "trace_events_filter_test.h" |
1939 | 1993 | ||
1940 | static int test_get_filter(char *filter_str, struct ftrace_event_call *call, | ||
1941 | struct event_filter **pfilter) | ||
1942 | { | ||
1943 | struct event_filter *filter; | ||
1944 | struct filter_parse_state *ps; | ||
1945 | int err = -ENOMEM; | ||
1946 | |||
1947 | filter = __alloc_filter(); | ||
1948 | if (!filter) | ||
1949 | goto out; | ||
1950 | |||
1951 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | ||
1952 | if (!ps) | ||
1953 | goto free_filter; | ||
1954 | |||
1955 | parse_init(ps, filter_ops, filter_str); | ||
1956 | err = filter_parse(ps); | ||
1957 | if (err) | ||
1958 | goto free_ps; | ||
1959 | |||
1960 | err = replace_preds(call, filter, ps, filter_str, false); | ||
1961 | if (!err) | ||
1962 | *pfilter = filter; | ||
1963 | |||
1964 | free_ps: | ||
1965 | filter_opstack_clear(ps); | ||
1966 | postfix_clear(ps); | ||
1967 | kfree(ps); | ||
1968 | |||
1969 | free_filter: | ||
1970 | if (err) | ||
1971 | __free_filter(filter); | ||
1972 | |||
1973 | out: | ||
1974 | return err; | ||
1975 | } | ||
1976 | |||
1977 | #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ | 1994 | #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ |
1978 | { \ | 1995 | { \ |
1979 | .filter = FILTER, \ | 1996 | .filter = FILTER, \ |
@@ -2092,12 +2109,13 @@ static __init int ftrace_test_event_filter(void) | |||
2092 | struct test_filter_data_t *d = &test_filter_data[i]; | 2109 | struct test_filter_data_t *d = &test_filter_data[i]; |
2093 | int err; | 2110 | int err; |
2094 | 2111 | ||
2095 | err = test_get_filter(d->filter, &event_ftrace_test_filter, | 2112 | err = create_filter(&event_ftrace_test_filter, d->filter, |
2096 | &filter); | 2113 | false, &filter); |
2097 | if (err) { | 2114 | if (err) { |
2098 | printk(KERN_INFO | 2115 | printk(KERN_INFO |
2099 | "Failed to get filter for '%s', err %d\n", | 2116 | "Failed to get filter for '%s', err %d\n", |
2100 | d->filter, err); | 2117 | d->filter, err); |
2118 | __free_filter(filter); | ||
2101 | break; | 2119 | break; |
2102 | } | 2120 | } |
2103 | 2121 | ||
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 20dad0d7a163..99d20e920368 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | |||
280 | } | 280 | } |
281 | 281 | ||
282 | static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } | 282 | static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } |
283 | static void irqsoff_print_header(struct seq_file *s) { } | ||
284 | static void irqsoff_trace_open(struct trace_iterator *iter) { } | 283 | static void irqsoff_trace_open(struct trace_iterator *iter) { } |
285 | static void irqsoff_trace_close(struct trace_iterator *iter) { } | 284 | static void irqsoff_trace_close(struct trace_iterator *iter) { } |
285 | |||
286 | #ifdef CONFIG_FUNCTION_TRACER | ||
287 | static void irqsoff_print_header(struct seq_file *s) | ||
288 | { | ||
289 | trace_default_header(s); | ||
290 | } | ||
291 | #else | ||
292 | static void irqsoff_print_header(struct seq_file *s) | ||
293 | { | ||
294 | trace_latency_header(s); | ||
295 | } | ||
296 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
286 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 297 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
287 | 298 | ||
288 | /* | 299 | /* |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 51999309a6cf..0d6ff3555942 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter) | |||
627 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 627 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
628 | unsigned long secs = (unsigned long)t; | 628 | unsigned long secs = (unsigned long)t; |
629 | char comm[TASK_COMM_LEN]; | 629 | char comm[TASK_COMM_LEN]; |
630 | int ret; | ||
630 | 631 | ||
631 | trace_find_cmdline(entry->pid, comm); | 632 | trace_find_cmdline(entry->pid, comm); |
632 | 633 | ||
633 | return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", | 634 | ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", |
634 | comm, entry->pid, iter->cpu, secs, usec_rem); | 635 | comm, entry->pid, iter->cpu); |
636 | if (!ret) | ||
637 | return 0; | ||
638 | |||
639 | if (trace_flags & TRACE_ITER_IRQ_INFO) { | ||
640 | ret = trace_print_lat_fmt(s, entry); | ||
641 | if (!ret) | ||
642 | return 0; | ||
643 | } | ||
644 | |||
645 | return trace_seq_printf(s, " %5lu.%06lu: ", | ||
646 | secs, usec_rem); | ||
635 | } | 647 | } |
636 | 648 | ||
637 | int trace_print_lat_context(struct trace_iterator *iter) | 649 | int trace_print_lat_context(struct trace_iterator *iter) |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b6..ff791ea48b57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | |||
280 | } | 280 | } |
281 | 281 | ||
282 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } | 282 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } |
283 | static void wakeup_print_header(struct seq_file *s) { } | ||
284 | static void wakeup_trace_open(struct trace_iterator *iter) { } | 283 | static void wakeup_trace_open(struct trace_iterator *iter) { } |
285 | static void wakeup_trace_close(struct trace_iterator *iter) { } | 284 | static void wakeup_trace_close(struct trace_iterator *iter) { } |
285 | |||
286 | #ifdef CONFIG_FUNCTION_TRACER | ||
287 | static void wakeup_print_header(struct seq_file *s) | ||
288 | { | ||
289 | trace_default_header(s); | ||
290 | } | ||
291 | #else | ||
292 | static void wakeup_print_header(struct seq_file *s) | ||
293 | { | ||
294 | trace_latency_header(s); | ||
295 | } | ||
296 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
286 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 297 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
287 | 298 | ||
288 | /* | 299 | /* |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 77575b386d97..d4545f49242e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -13,6 +13,9 @@ | |||
13 | #include <linux/sysctl.h> | 13 | #include <linux/sysctl.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | |||
17 | #include <asm/setup.h> | ||
18 | |||
16 | #include "trace.h" | 19 | #include "trace.h" |
17 | 20 | ||
18 | #define STACK_TRACE_ENTRIES 500 | 21 | #define STACK_TRACE_ENTRIES 500 |
@@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
133 | static struct ftrace_ops trace_ops __read_mostly = | 136 | static struct ftrace_ops trace_ops __read_mostly = |
134 | { | 137 | { |
135 | .func = stack_trace_call, | 138 | .func = stack_trace_call, |
136 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
137 | }; | 139 | }; |
138 | 140 | ||
139 | static ssize_t | 141 | static ssize_t |
@@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = { | |||
311 | .release = seq_release, | 313 | .release = seq_release, |
312 | }; | 314 | }; |
313 | 315 | ||
316 | static int | ||
317 | stack_trace_filter_open(struct inode *inode, struct file *file) | ||
318 | { | ||
319 | return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, | ||
320 | inode, file); | ||
321 | } | ||
322 | |||
323 | static const struct file_operations stack_trace_filter_fops = { | ||
324 | .open = stack_trace_filter_open, | ||
325 | .read = seq_read, | ||
326 | .write = ftrace_filter_write, | ||
327 | .llseek = ftrace_regex_lseek, | ||
328 | .release = ftrace_regex_release, | ||
329 | }; | ||
330 | |||
314 | int | 331 | int |
315 | stack_trace_sysctl(struct ctl_table *table, int write, | 332 | stack_trace_sysctl(struct ctl_table *table, int write, |
316 | void __user *buffer, size_t *lenp, | 333 | void __user *buffer, size_t *lenp, |
@@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, | |||
338 | return ret; | 355 | return ret; |
339 | } | 356 | } |
340 | 357 | ||
358 | static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; | ||
359 | |||
341 | static __init int enable_stacktrace(char *str) | 360 | static __init int enable_stacktrace(char *str) |
342 | { | 361 | { |
362 | if (strncmp(str, "_filter=", 8) == 0) | ||
363 | strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE); | ||
364 | |||
343 | stack_tracer_enabled = 1; | 365 | stack_tracer_enabled = 1; |
344 | last_stack_tracer_enabled = 1; | 366 | last_stack_tracer_enabled = 1; |
345 | return 1; | 367 | return 1; |
@@ -358,6 +380,12 @@ static __init int stack_trace_init(void) | |||
358 | trace_create_file("stack_trace", 0444, d_tracer, | 380 | trace_create_file("stack_trace", 0444, d_tracer, |
359 | NULL, &stack_trace_fops); | 381 | NULL, &stack_trace_fops); |
360 | 382 | ||
383 | trace_create_file("stack_trace_filter", 0444, d_tracer, | ||
384 | NULL, &stack_trace_filter_fops); | ||
385 | |||
386 | if (stack_trace_filter_buf[0]) | ||
387 | ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); | ||
388 | |||
361 | if (stack_tracer_enabled) | 389 | if (stack_tracer_enabled) |
362 | register_ftrace_function(&trace_ops); | 390 | register_ftrace_function(&trace_ops); |
363 | 391 | ||
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index db110b8ae030..f1539decd99d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod) | |||
634 | int ret = 0; | 634 | int ret = 0; |
635 | 635 | ||
636 | /* | 636 | /* |
637 | * We skip modules that tain the kernel, especially those with different | 637 | * We skip modules that taint the kernel, especially those with different |
638 | * module header (for forced load), to make sure we don't cause a crash. | 638 | * module headers (for forced load), to make sure we don't cause a crash. |
639 | * Staging and out-of-tree GPL modules are fine. | ||
639 | */ | 640 | */ |
640 | if (mod->taints) | 641 | if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) |
641 | return 0; | 642 | return 0; |
642 | mutex_lock(&tracepoints_mutex); | 643 | mutex_lock(&tracepoints_mutex); |
643 | tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); | 644 | tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 5bbfac85866e..23b4d784ebdd 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
127 | 127 | ||
128 | local_irq_save(flags); | 128 | local_irq_save(flags); |
129 | time = tsk->stime + tsk->utime; | 129 | time = tsk->stime + tsk->utime; |
130 | dtime = cputime_sub(time, tsk->acct_timexpd); | 130 | dtime = time - tsk->acct_timexpd; |
131 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); | 131 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); |
132 | delta = value.tv_sec; | 132 | delta = value.tv_sec; |
133 | delta = delta * USEC_PER_SEC + value.tv_usec; | 133 | delta = delta * USEC_PER_SEC + value.tv_usec; |
diff --git a/kernel/wait.c b/kernel/wait.c index 26fa7797f90f..7fdd9eaca2c3 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -10,10 +10,10 @@ | |||
10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
11 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
12 | 12 | ||
13 | void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) | 13 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) |
14 | { | 14 | { |
15 | spin_lock_init(&q->lock); | 15 | spin_lock_init(&q->lock); |
16 | lockdep_set_class(&q->lock, key); | 16 | lockdep_set_class_and_name(&q->lock, key, name); |
17 | INIT_LIST_HEAD(&q->task_list); | 17 | INIT_LIST_HEAD(&q->task_list); |
18 | } | 18 | } |
19 | 19 | ||
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1d7bca7f4f52..d117262deba3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
296 | if (__this_cpu_read(soft_watchdog_warn) == true) | 296 | if (__this_cpu_read(soft_watchdog_warn) == true) |
297 | return HRTIMER_RESTART; | 297 | return HRTIMER_RESTART; |
298 | 298 | ||
299 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 299 | printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
300 | smp_processor_id(), duration, | 300 | smp_processor_id(), duration, |
301 | current->comm, task_pid_nr(current)); | 301 | current->comm, task_pid_nr(current)); |
302 | print_modules(); | 302 | print_modules(); |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad0a810..bec7b5b53e03 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -242,10 +242,10 @@ struct workqueue_struct { | |||
242 | 242 | ||
243 | int nr_drainers; /* W: drain in progress */ | 243 | int nr_drainers; /* W: drain in progress */ |
244 | int saved_max_active; /* W: saved cwq max_active */ | 244 | int saved_max_active; /* W: saved cwq max_active */ |
245 | const char *name; /* I: workqueue name */ | ||
246 | #ifdef CONFIG_LOCKDEP | 245 | #ifdef CONFIG_LOCKDEP |
247 | struct lockdep_map lockdep_map; | 246 | struct lockdep_map lockdep_map; |
248 | #endif | 247 | #endif |
248 | char name[]; /* I: workqueue name */ | ||
249 | }; | 249 | }; |
250 | 250 | ||
251 | struct workqueue_struct *system_wq __read_mostly; | 251 | struct workqueue_struct *system_wq __read_mostly; |
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, | |||
2954 | return clamp_val(max_active, 1, lim); | 2954 | return clamp_val(max_active, 1, lim); |
2955 | } | 2955 | } |
2956 | 2956 | ||
2957 | struct workqueue_struct *__alloc_workqueue_key(const char *name, | 2957 | struct workqueue_struct *__alloc_workqueue_key(const char *fmt, |
2958 | unsigned int flags, | 2958 | unsigned int flags, |
2959 | int max_active, | 2959 | int max_active, |
2960 | struct lock_class_key *key, | 2960 | struct lock_class_key *key, |
2961 | const char *lock_name) | 2961 | const char *lock_name, ...) |
2962 | { | 2962 | { |
2963 | va_list args, args1; | ||
2963 | struct workqueue_struct *wq; | 2964 | struct workqueue_struct *wq; |
2964 | unsigned int cpu; | 2965 | unsigned int cpu; |
2966 | size_t namelen; | ||
2967 | |||
2968 | /* determine namelen, allocate wq and format name */ | ||
2969 | va_start(args, lock_name); | ||
2970 | va_copy(args1, args); | ||
2971 | namelen = vsnprintf(NULL, 0, fmt, args) + 1; | ||
2972 | |||
2973 | wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); | ||
2974 | if (!wq) | ||
2975 | goto err; | ||
2976 | |||
2977 | vsnprintf(wq->name, namelen, fmt, args1); | ||
2978 | va_end(args); | ||
2979 | va_end(args1); | ||
2965 | 2980 | ||
2966 | /* | 2981 | /* |
2967 | * Workqueues which may be used during memory reclaim should | 2982 | * Workqueues which may be used during memory reclaim should |
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2978 | flags |= WQ_HIGHPRI; | 2993 | flags |= WQ_HIGHPRI; |
2979 | 2994 | ||
2980 | max_active = max_active ?: WQ_DFL_ACTIVE; | 2995 | max_active = max_active ?: WQ_DFL_ACTIVE; |
2981 | max_active = wq_clamp_max_active(max_active, flags, name); | 2996 | max_active = wq_clamp_max_active(max_active, flags, wq->name); |
2982 | |||
2983 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); | ||
2984 | if (!wq) | ||
2985 | goto err; | ||
2986 | 2997 | ||
2998 | /* init wq */ | ||
2987 | wq->flags = flags; | 2999 | wq->flags = flags; |
2988 | wq->saved_max_active = max_active; | 3000 | wq->saved_max_active = max_active; |
2989 | mutex_init(&wq->flush_mutex); | 3001 | mutex_init(&wq->flush_mutex); |
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2991 | INIT_LIST_HEAD(&wq->flusher_queue); | 3003 | INIT_LIST_HEAD(&wq->flusher_queue); |
2992 | INIT_LIST_HEAD(&wq->flusher_overflow); | 3004 | INIT_LIST_HEAD(&wq->flusher_overflow); |
2993 | 3005 | ||
2994 | wq->name = name; | ||
2995 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); | 3006 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
2996 | INIT_LIST_HEAD(&wq->list); | 3007 | INIT_LIST_HEAD(&wq->list); |
2997 | 3008 | ||
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
3020 | if (!rescuer) | 3031 | if (!rescuer) |
3021 | goto err; | 3032 | goto err; |
3022 | 3033 | ||
3023 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); | 3034 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", |
3035 | wq->name); | ||
3024 | if (IS_ERR(rescuer->task)) | 3036 | if (IS_ERR(rescuer->task)) |
3025 | goto err; | 3037 | goto err; |
3026 | 3038 | ||