aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c222
-rw-r--r--kernel/cgroup.c309
-rw-r--r--kernel/cpu.c5
-rw-r--r--kernel/cpuset.c357
-rw-r--r--kernel/delayacct.c16
-rw-r--r--kernel/exit.c61
-rw-r--r--kernel/fork.c35
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kallsyms.c2
-rw-r--r--kernel/kmod.c9
-rw-r--r--kernel/kprobes.c132
-rw-r--r--kernel/marker.c25
-rw-r--r--kernel/ns_cgroup.c8
-rw-r--r--kernel/nsproxy.c8
-rw-r--r--kernel/panic.c22
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/pid_namespace.c10
-rw-r--r--kernel/posix-timers.c21
-rw-r--r--kernel/printk.c17
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/res_counter.c48
-rw-r--r--kernel/sched.c2
-rw-r--r--kernel/signal.c80
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/sysctl_check.c2
-rw-r--r--kernel/taskstats.c2
-rw-r--r--kernel/trace/trace_sysprof.c2
-rw-r--r--kernel/tsacct.c25
-rw-r--r--kernel/workqueue.c112
32 files changed, 871 insertions, 692 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 15ab63ffe64d..54f69837d35a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_sched.o = -mno-spe -pg 24CFLAGS_REMOVE_sched.o = -mno-spe -pg
25endif 25endif
26 26
27obj-$(CONFIG_PROFILING) += profile.o
27obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 28obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
28obj-$(CONFIG_STACKTRACE) += stacktrace.o 29obj-$(CONFIG_STACKTRACE) += stacktrace.o
29obj-y += time/ 30obj-y += time/
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d2..dd68b9059418 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
75/* 75/*
76 * External references and all of the globals. 76 * External references and all of the globals.
77 */ 77 */
78static void do_acct_process(struct pid_namespace *ns, struct file *); 78static void do_acct_process(struct bsd_acct_struct *acct,
79 struct pid_namespace *ns, struct file *);
79 80
80/* 81/*
81 * This structure is used so that all the data protected by lock 82 * This structure is used so that all the data protected by lock
82 * can be placed in the same cache line as the lock. This primes 83 * can be placed in the same cache line as the lock. This primes
83 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
84 */ 85 */
85struct acct_glbs { 86struct bsd_acct_struct {
86 spinlock_t lock;
87 volatile int active; 87 volatile int active;
88 volatile int needcheck; 88 volatile int needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer; 91 struct timer_list timer;
92 struct list_head list;
92}; 93};
93 94
94static struct acct_glbs acct_globals __cacheline_aligned = 95static DEFINE_SPINLOCK(acct_lock);
95 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; 96static LIST_HEAD(acct_list);
96 97
97/* 98/*
98 * Called whenever the timer says to check the free space. 99 * Called whenever the timer says to check the free space.
99 */ 100 */
100static void acct_timeout(unsigned long unused) 101static void acct_timeout(unsigned long x)
101{ 102{
102 acct_globals.needcheck = 1; 103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
103} 105}
104 106
105/* 107/*
106 * Check the amount of free space and suspend/resume accordingly. 108 * Check the amount of free space and suspend/resume accordingly.
107 */ 109 */
108static int check_free_space(struct file *file) 110static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
109{ 111{
110 struct kstatfs sbuf; 112 struct kstatfs sbuf;
111 int res; 113 int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
113 sector_t resume; 115 sector_t resume;
114 sector_t suspend; 116 sector_t suspend;
115 117
116 spin_lock(&acct_globals.lock); 118 spin_lock(&acct_lock);
117 res = acct_globals.active; 119 res = acct->active;
118 if (!file || !acct_globals.needcheck) 120 if (!file || !acct->needcheck)
119 goto out; 121 goto out;
120 spin_unlock(&acct_globals.lock); 122 spin_unlock(&acct_lock);
121 123
122 /* May block */ 124 /* May block */
123 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
136 act = 0; 138 act = 0;
137 139
138 /* 140 /*
139 * If some joker switched acct_globals.file under us we'ld better be 141 * If some joker switched acct->file under us we'ld better be
140 * silent and _not_ touch anything. 142 * silent and _not_ touch anything.
141 */ 143 */
142 spin_lock(&acct_globals.lock); 144 spin_lock(&acct_lock);
143 if (file != acct_globals.file) { 145 if (file != acct->file) {
144 if (act) 146 if (act)
145 res = act>0; 147 res = act>0;
146 goto out; 148 goto out;
147 } 149 }
148 150
149 if (acct_globals.active) { 151 if (acct->active) {
150 if (act < 0) { 152 if (act < 0) {
151 acct_globals.active = 0; 153 acct->active = 0;
152 printk(KERN_INFO "Process accounting paused\n"); 154 printk(KERN_INFO "Process accounting paused\n");
153 } 155 }
154 } else { 156 } else {
155 if (act > 0) { 157 if (act > 0) {
156 acct_globals.active = 1; 158 acct->active = 1;
157 printk(KERN_INFO "Process accounting resumed\n"); 159 printk(KERN_INFO "Process accounting resumed\n");
158 } 160 }
159 } 161 }
160 162
161 del_timer(&acct_globals.timer); 163 del_timer(&acct->timer);
162 acct_globals.needcheck = 0; 164 acct->needcheck = 0;
163 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
164 add_timer(&acct_globals.timer); 166 add_timer(&acct->timer);
165 res = acct_globals.active; 167 res = acct->active;
166out: 168out:
167 spin_unlock(&acct_globals.lock); 169 spin_unlock(&acct_lock);
168 return res; 170 return res;
169} 171}
170 172
@@ -172,39 +174,41 @@ out:
172 * Close the old accounting file (if currently open) and then replace 174 * Close the old accounting file (if currently open) and then replace
173 * it with file (if non-NULL). 175 * it with file (if non-NULL).
174 * 176 *
175 * NOTE: acct_globals.lock MUST be held on entry and exit. 177 * NOTE: acct_lock MUST be held on entry and exit.
176 */ 178 */
177static void acct_file_reopen(struct file *file) 179static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
180 struct pid_namespace *ns)
178{ 181{
179 struct file *old_acct = NULL; 182 struct file *old_acct = NULL;
180 struct pid_namespace *old_ns = NULL; 183 struct pid_namespace *old_ns = NULL;
181 184
182 if (acct_globals.file) { 185 if (acct->file) {
183 old_acct = acct_globals.file; 186 old_acct = acct->file;
184 old_ns = acct_globals.ns; 187 old_ns = acct->ns;
185 del_timer(&acct_globals.timer); 188 del_timer(&acct->timer);
186 acct_globals.active = 0; 189 acct->active = 0;
187 acct_globals.needcheck = 0; 190 acct->needcheck = 0;
188 acct_globals.file = NULL; 191 acct->file = NULL;
192 acct->ns = NULL;
193 list_del(&acct->list);
189 } 194 }
190 if (file) { 195 if (file) {
191 acct_globals.file = file; 196 acct->file = file;
192 acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); 197 acct->ns = ns;
193 acct_globals.needcheck = 0; 198 acct->needcheck = 0;
194 acct_globals.active = 1; 199 acct->active = 1;
200 list_add(&acct->list, &acct_list);
195 /* It's been deleted if it was used before so this is safe */ 201 /* It's been deleted if it was used before so this is safe */
196 init_timer(&acct_globals.timer); 202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
197 acct_globals.timer.function = acct_timeout; 203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
198 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 204 add_timer(&acct->timer);
199 add_timer(&acct_globals.timer);
200 } 205 }
201 if (old_acct) { 206 if (old_acct) {
202 mnt_unpin(old_acct->f_path.mnt); 207 mnt_unpin(old_acct->f_path.mnt);
203 spin_unlock(&acct_globals.lock); 208 spin_unlock(&acct_lock);
204 do_acct_process(old_ns, old_acct); 209 do_acct_process(acct, old_ns, old_acct);
205 filp_close(old_acct, NULL); 210 filp_close(old_acct, NULL);
206 put_pid_ns(old_ns); 211 spin_lock(&acct_lock);
207 spin_lock(&acct_globals.lock);
208 } 212 }
209} 213}
210 214
@@ -212,6 +216,8 @@ static int acct_on(char *name)
212{ 216{
213 struct file *file; 217 struct file *file;
214 int error; 218 int error;
219 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL;
215 221
216 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
217 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 223 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
228 return -EIO; 234 return -EIO;
229 } 235 }
230 236
237 ns = task_active_pid_ns(current);
238 if (ns->bacct == NULL) {
239 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
240 if (acct == NULL) {
241 filp_close(file, NULL);
242 return -ENOMEM;
243 }
244 }
245
231 error = security_acct(file); 246 error = security_acct(file);
232 if (error) { 247 if (error) {
248 kfree(acct);
233 filp_close(file, NULL); 249 filp_close(file, NULL);
234 return error; 250 return error;
235 } 251 }
236 252
237 spin_lock(&acct_globals.lock); 253 spin_lock(&acct_lock);
254 if (ns->bacct == NULL) {
255 ns->bacct = acct;
256 acct = NULL;
257 }
258
238 mnt_pin(file->f_path.mnt); 259 mnt_pin(file->f_path.mnt);
239 acct_file_reopen(file); 260 acct_file_reopen(ns->bacct, file, ns);
240 spin_unlock(&acct_globals.lock); 261 spin_unlock(&acct_lock);
241 262
242 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
264 kfree(acct);
243 265
244 return 0; 266 return 0;
245} 267}
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
269 error = acct_on(tmp); 291 error = acct_on(tmp);
270 putname(tmp); 292 putname(tmp);
271 } else { 293 } else {
294 struct bsd_acct_struct *acct;
295
296 acct = task_active_pid_ns(current)->bacct;
297 if (acct == NULL)
298 return 0;
299
272 error = security_acct(NULL); 300 error = security_acct(NULL);
273 if (!error) { 301 if (!error) {
274 spin_lock(&acct_globals.lock); 302 spin_lock(&acct_lock);
275 acct_file_reopen(NULL); 303 acct_file_reopen(acct, NULL, NULL);
276 spin_unlock(&acct_globals.lock); 304 spin_unlock(&acct_lock);
277 } 305 }
278 } 306 }
279 return error; 307 return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
288 */ 316 */
289void acct_auto_close_mnt(struct vfsmount *m) 317void acct_auto_close_mnt(struct vfsmount *m)
290{ 318{
291 spin_lock(&acct_globals.lock); 319 struct bsd_acct_struct *acct;
292 if (acct_globals.file && acct_globals.file->f_path.mnt == m) 320
293 acct_file_reopen(NULL); 321 spin_lock(&acct_lock);
294 spin_unlock(&acct_globals.lock); 322restart:
323 list_for_each_entry(acct, &acct_list, list)
324 if (acct->file && acct->file->f_path.mnt == m) {
325 acct_file_reopen(acct, NULL, NULL);
326 goto restart;
327 }
328 spin_unlock(&acct_lock);
295} 329}
296 330
297/** 331/**
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
303 */ 337 */
304void acct_auto_close(struct super_block *sb) 338void acct_auto_close(struct super_block *sb)
305{ 339{
306 spin_lock(&acct_globals.lock); 340 struct bsd_acct_struct *acct;
307 if (acct_globals.file && 341
308 acct_globals.file->f_path.mnt->mnt_sb == sb) { 342 spin_lock(&acct_lock);
309 acct_file_reopen(NULL); 343restart:
344 list_for_each_entry(acct, &acct_list, list)
345 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
346 acct_file_reopen(acct, NULL, NULL);
347 goto restart;
348 }
349 spin_unlock(&acct_lock);
350}
351
352void acct_exit_ns(struct pid_namespace *ns)
353{
354 struct bsd_acct_struct *acct;
355
356 spin_lock(&acct_lock);
357 acct = ns->bacct;
358 if (acct != NULL) {
359 if (acct->file != NULL)
360 acct_file_reopen(acct, NULL, NULL);
361
362 kfree(acct);
310 } 363 }
311 spin_unlock(&acct_globals.lock); 364 spin_unlock(&acct_lock);
312} 365}
313 366
314/* 367/*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
425/* 478/*
426 * do_acct_process does all actual work. Caller holds the reference to file. 479 * do_acct_process does all actual work. Caller holds the reference to file.
427 */ 480 */
428static void do_acct_process(struct pid_namespace *ns, struct file *file) 481static void do_acct_process(struct bsd_acct_struct *acct,
482 struct pid_namespace *ns, struct file *file)
429{ 483{
430 struct pacct_struct *pacct = &current->signal->pacct; 484 struct pacct_struct *pacct = &current->signal->pacct;
431 acct_t ac; 485 acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
440 * First check to see if there is enough free_space to continue 494 * First check to see if there is enough free_space to continue
441 * the process accounting system. 495 * the process accounting system.
442 */ 496 */
443 if (!check_free_space(file)) 497 if (!check_free_space(acct, file))
444 return; 498 return;
445 499
446 /* 500 /*
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
577 spin_unlock_irq(&current->sighand->siglock); 631 spin_unlock_irq(&current->sighand->siglock);
578} 632}
579 633
580/** 634static void acct_process_in_ns(struct pid_namespace *ns)
581 * acct_process - now just a wrapper around do_acct_process
582 * @exitcode: task exit code
583 *
584 * handles process accounting for an exiting task
585 */
586void acct_process(void)
587{ 635{
588 struct file *file = NULL; 636 struct file *file = NULL;
589 struct pid_namespace *ns; 637 struct bsd_acct_struct *acct;
590 638
639 acct = ns->bacct;
591 /* 640 /*
592 * accelerate the common fastpath: 641 * accelerate the common fastpath:
593 */ 642 */
594 if (!acct_globals.file) 643 if (!acct || !acct->file)
595 return; 644 return;
596 645
597 spin_lock(&acct_globals.lock); 646 spin_lock(&acct_lock);
598 file = acct_globals.file; 647 file = acct->file;
599 if (unlikely(!file)) { 648 if (unlikely(!file)) {
600 spin_unlock(&acct_globals.lock); 649 spin_unlock(&acct_lock);
601 return; 650 return;
602 } 651 }
603 get_file(file); 652 get_file(file);
604 ns = get_pid_ns(acct_globals.ns); 653 spin_unlock(&acct_lock);
605 spin_unlock(&acct_globals.lock);
606 654
607 do_acct_process(ns, file); 655 do_acct_process(acct, ns, file);
608 fput(file); 656 fput(file);
609 put_pid_ns(ns); 657}
658
659/**
660 * acct_process - now just a wrapper around acct_process_in_ns,
661 * which in turn is a wrapper around do_acct_process.
662 *
663 * handles process accounting for an exiting task
664 */
665void acct_process(void)
666{
667 struct pid_namespace *ns;
668
669 /*
670 * This loop is safe lockless, since current is still
671 * alive and holds its namespace, which in turn holds
672 * its parent.
673 */
674 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
675 acct_process_in_ns(ns);
610} 676}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..66ec9fd21e0c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -89,11 +89,7 @@ struct cgroupfs_root {
89 /* Hierarchy-specific flags */ 89 /* Hierarchy-specific flags */
90 unsigned long flags; 90 unsigned long flags;
91 91
92 /* The path to use for release notifications. No locking 92 /* The path to use for release notifications. */
93 * between setting and use - so if userspace updates this
94 * while child cgroups exist, you could miss a
95 * notification. We ensure that it's always a valid
96 * NUL-terminated string */
97 char release_agent_path[PATH_MAX]; 93 char release_agent_path[PATH_MAX];
98}; 94};
99 95
@@ -118,7 +114,7 @@ static int root_count;
118 * extra work in the fork/exit path if none of the subsystems need to 114 * extra work in the fork/exit path if none of the subsystems need to
119 * be called. 115 * be called.
120 */ 116 */
121static int need_forkexit_callback; 117static int need_forkexit_callback __read_mostly;
122static int need_mm_owner_callback __read_mostly; 118static int need_mm_owner_callback __read_mostly;
123 119
124/* convenient tests for these bits */ 120/* convenient tests for these bits */
@@ -220,7 +216,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
220 * task until after the first call to cgroup_iter_start(). This 216 * task until after the first call to cgroup_iter_start(). This
221 * reduces the fork()/exit() overhead for people who have cgroups 217 * reduces the fork()/exit() overhead for people who have cgroups
222 * compiled into their kernel but not actually in use */ 218 * compiled into their kernel but not actually in use */
223static int use_task_css_set_links; 219static int use_task_css_set_links __read_mostly;
224 220
225/* When we create or destroy a css_set, the operation simply 221/* When we create or destroy a css_set, the operation simply
226 * takes/releases a reference count on all the cgroups referenced 222 * takes/releases a reference count on all the cgroups referenced
@@ -241,17 +237,20 @@ static int use_task_css_set_links;
241 */ 237 */
242static void unlink_css_set(struct css_set *cg) 238static void unlink_css_set(struct css_set *cg)
243{ 239{
240 struct cg_cgroup_link *link;
241 struct cg_cgroup_link *saved_link;
242
244 write_lock(&css_set_lock); 243 write_lock(&css_set_lock);
245 hlist_del(&cg->hlist); 244 hlist_del(&cg->hlist);
246 css_set_count--; 245 css_set_count--;
247 while (!list_empty(&cg->cg_links)) { 246
248 struct cg_cgroup_link *link; 247 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
249 link = list_entry(cg->cg_links.next, 248 cg_link_list) {
250 struct cg_cgroup_link, cg_link_list);
251 list_del(&link->cg_link_list); 249 list_del(&link->cg_link_list);
252 list_del(&link->cgrp_link_list); 250 list_del(&link->cgrp_link_list);
253 kfree(link); 251 kfree(link);
254 } 252 }
253
255 write_unlock(&css_set_lock); 254 write_unlock(&css_set_lock);
256} 255}
257 256
@@ -363,15 +362,14 @@ static struct css_set *find_existing_css_set(
363static int allocate_cg_links(int count, struct list_head *tmp) 362static int allocate_cg_links(int count, struct list_head *tmp)
364{ 363{
365 struct cg_cgroup_link *link; 364 struct cg_cgroup_link *link;
365 struct cg_cgroup_link *saved_link;
366 int i; 366 int i;
367 INIT_LIST_HEAD(tmp); 367 INIT_LIST_HEAD(tmp);
368 for (i = 0; i < count; i++) { 368 for (i = 0; i < count; i++) {
369 link = kmalloc(sizeof(*link), GFP_KERNEL); 369 link = kmalloc(sizeof(*link), GFP_KERNEL);
370 if (!link) { 370 if (!link) {
371 while (!list_empty(tmp)) { 371 list_for_each_entry_safe(link, saved_link, tmp,
372 link = list_entry(tmp->next, 372 cgrp_link_list) {
373 struct cg_cgroup_link,
374 cgrp_link_list);
375 list_del(&link->cgrp_link_list); 373 list_del(&link->cgrp_link_list);
376 kfree(link); 374 kfree(link);
377 } 375 }
@@ -384,11 +382,10 @@ static int allocate_cg_links(int count, struct list_head *tmp)
384 382
385static void free_cg_links(struct list_head *tmp) 383static void free_cg_links(struct list_head *tmp)
386{ 384{
387 while (!list_empty(tmp)) { 385 struct cg_cgroup_link *link;
388 struct cg_cgroup_link *link; 386 struct cg_cgroup_link *saved_link;
389 link = list_entry(tmp->next, 387
390 struct cg_cgroup_link, 388 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
391 cgrp_link_list);
392 list_del(&link->cgrp_link_list); 389 list_del(&link->cgrp_link_list);
393 kfree(link); 390 kfree(link);
394 } 391 }
@@ -415,11 +412,11 @@ static struct css_set *find_css_set(
415 412
416 /* First see if we already have a cgroup group that matches 413 /* First see if we already have a cgroup group that matches
417 * the desired set */ 414 * the desired set */
418 write_lock(&css_set_lock); 415 read_lock(&css_set_lock);
419 res = find_existing_css_set(oldcg, cgrp, template); 416 res = find_existing_css_set(oldcg, cgrp, template);
420 if (res) 417 if (res)
421 get_css_set(res); 418 get_css_set(res);
422 write_unlock(&css_set_lock); 419 read_unlock(&css_set_lock);
423 420
424 if (res) 421 if (res)
425 return res; 422 return res;
@@ -507,10 +504,6 @@ static struct css_set *find_css_set(
507 * knows that the cgroup won't be removed, as cgroup_rmdir() 504 * knows that the cgroup won't be removed, as cgroup_rmdir()
508 * needs that mutex. 505 * needs that mutex.
509 * 506 *
510 * The cgroup_common_file_write handler for operations that modify
511 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
512 * single threading all such cgroup modifications across the system.
513 *
514 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 507 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
515 * (usually) take cgroup_mutex. These are the two most performance 508 * (usually) take cgroup_mutex. These are the two most performance
516 * critical pieces of code here. The exception occurs on cgroup_exit(), 509 * critical pieces of code here. The exception occurs on cgroup_exit(),
@@ -1093,6 +1086,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1093 struct cgroupfs_root *root = sb->s_fs_info; 1086 struct cgroupfs_root *root = sb->s_fs_info;
1094 struct cgroup *cgrp = &root->top_cgroup; 1087 struct cgroup *cgrp = &root->top_cgroup;
1095 int ret; 1088 int ret;
1089 struct cg_cgroup_link *link;
1090 struct cg_cgroup_link *saved_link;
1096 1091
1097 BUG_ON(!root); 1092 BUG_ON(!root);
1098 1093
@@ -1112,10 +1107,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1112 * root cgroup 1107 * root cgroup
1113 */ 1108 */
1114 write_lock(&css_set_lock); 1109 write_lock(&css_set_lock);
1115 while (!list_empty(&cgrp->css_sets)) { 1110
1116 struct cg_cgroup_link *link; 1111 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1117 link = list_entry(cgrp->css_sets.next, 1112 cgrp_link_list) {
1118 struct cg_cgroup_link, cgrp_link_list);
1119 list_del(&link->cg_link_list); 1113 list_del(&link->cg_link_list);
1120 list_del(&link->cgrp_link_list); 1114 list_del(&link->cgrp_link_list);
1121 kfree(link); 1115 kfree(link);
@@ -1281,18 +1275,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1281} 1275}
1282 1276
1283/* 1277/*
1284 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with 1278 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1285 * cgroup_mutex, may take task_lock of task 1279 * held. May take task_lock of task
1286 */ 1280 */
1287static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) 1281static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1288{ 1282{
1289 pid_t pid;
1290 struct task_struct *tsk; 1283 struct task_struct *tsk;
1291 int ret; 1284 int ret;
1292 1285
1293 if (sscanf(pidbuf, "%d", &pid) != 1)
1294 return -EIO;
1295
1296 if (pid) { 1286 if (pid) {
1297 rcu_read_lock(); 1287 rcu_read_lock();
1298 tsk = find_task_by_vpid(pid); 1288 tsk = find_task_by_vpid(pid);
@@ -1318,6 +1308,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1318 return ret; 1308 return ret;
1319} 1309}
1320 1310
1311static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1312{
1313 int ret;
1314 if (!cgroup_lock_live_group(cgrp))
1315 return -ENODEV;
1316 ret = attach_task_by_pid(cgrp, pid);
1317 cgroup_unlock();
1318 return ret;
1319}
1320
1321/* The various types of files and directories in a cgroup file system */ 1321/* The various types of files and directories in a cgroup file system */
1322enum cgroup_filetype { 1322enum cgroup_filetype {
1323 FILE_ROOT, 1323 FILE_ROOT,
@@ -1327,12 +1327,54 @@ enum cgroup_filetype {
1327 FILE_RELEASE_AGENT, 1327 FILE_RELEASE_AGENT,
1328}; 1328};
1329 1329
1330/**
1331 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1332 * @cgrp: the cgroup to be checked for liveness
1333 *
1334 * On success, returns true; the lock should be later released with
1335 * cgroup_unlock(). On failure returns false with no lock held.
1336 */
1337bool cgroup_lock_live_group(struct cgroup *cgrp)
1338{
1339 mutex_lock(&cgroup_mutex);
1340 if (cgroup_is_removed(cgrp)) {
1341 mutex_unlock(&cgroup_mutex);
1342 return false;
1343 }
1344 return true;
1345}
1346
1347static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1348 const char *buffer)
1349{
1350 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1351 if (!cgroup_lock_live_group(cgrp))
1352 return -ENODEV;
1353 strcpy(cgrp->root->release_agent_path, buffer);
1354 cgroup_unlock();
1355 return 0;
1356}
1357
1358static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
1359 struct seq_file *seq)
1360{
1361 if (!cgroup_lock_live_group(cgrp))
1362 return -ENODEV;
1363 seq_puts(seq, cgrp->root->release_agent_path);
1364 seq_putc(seq, '\n');
1365 cgroup_unlock();
1366 return 0;
1367}
1368
1369/* A buffer size big enough for numbers or short strings */
1370#define CGROUP_LOCAL_BUFFER_SIZE 64
1371
1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 1372static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1331 struct file *file, 1373 struct file *file,
1332 const char __user *userbuf, 1374 const char __user *userbuf,
1333 size_t nbytes, loff_t *unused_ppos) 1375 size_t nbytes, loff_t *unused_ppos)
1334{ 1376{
1335 char buffer[64]; 1377 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
1336 int retval = 0; 1378 int retval = 0;
1337 char *end; 1379 char *end;
1338 1380
@@ -1361,68 +1403,36 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1361 return retval; 1403 return retval;
1362} 1404}
1363 1405
1364static ssize_t cgroup_common_file_write(struct cgroup *cgrp, 1406static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1365 struct cftype *cft, 1407 struct file *file,
1366 struct file *file, 1408 const char __user *userbuf,
1367 const char __user *userbuf, 1409 size_t nbytes, loff_t *unused_ppos)
1368 size_t nbytes, loff_t *unused_ppos)
1369{ 1410{
1370 enum cgroup_filetype type = cft->private; 1411 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
1371 char *buffer;
1372 int retval = 0; 1412 int retval = 0;
1413 size_t max_bytes = cft->max_write_len;
1414 char *buffer = local_buffer;
1373 1415
1374 if (nbytes >= PATH_MAX) 1416 if (!max_bytes)
1417 max_bytes = sizeof(local_buffer) - 1;
1418 if (nbytes >= max_bytes)
1375 return -E2BIG; 1419 return -E2BIG;
1376 1420 /* Allocate a dynamic buffer if we need one */
1377 /* +1 for nul-terminator */ 1421 if (nbytes >= sizeof(local_buffer)) {
1378 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 1422 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1379 if (buffer == NULL) 1423 if (buffer == NULL)
1380 return -ENOMEM; 1424 return -ENOMEM;
1381
1382 if (copy_from_user(buffer, userbuf, nbytes)) {
1383 retval = -EFAULT;
1384 goto out1;
1385 } 1425 }
1386 buffer[nbytes] = 0; /* nul-terminate */ 1426 if (nbytes && copy_from_user(buffer, userbuf, nbytes))
1387 strstrip(buffer); /* strip -just- trailing whitespace */ 1427 return -EFAULT;
1388
1389 mutex_lock(&cgroup_mutex);
1390 1428
1391 /* 1429 buffer[nbytes] = 0; /* nul-terminate */
1392 * This was already checked for in cgroup_file_write(), but 1430 strstrip(buffer);
1393 * check again now we're holding cgroup_mutex. 1431 retval = cft->write_string(cgrp, cft, buffer);
1394 */ 1432 if (!retval)
1395 if (cgroup_is_removed(cgrp)) {
1396 retval = -ENODEV;
1397 goto out2;
1398 }
1399
1400 switch (type) {
1401 case FILE_TASKLIST:
1402 retval = attach_task_by_pid(cgrp, buffer);
1403 break;
1404 case FILE_NOTIFY_ON_RELEASE:
1405 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1406 if (simple_strtoul(buffer, NULL, 10) != 0)
1407 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1408 else
1409 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1410 break;
1411 case FILE_RELEASE_AGENT:
1412 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1413 strcpy(cgrp->root->release_agent_path, buffer);
1414 break;
1415 default:
1416 retval = -EINVAL;
1417 goto out2;
1418 }
1419
1420 if (retval == 0)
1421 retval = nbytes; 1433 retval = nbytes;
1422out2: 1434 if (buffer != local_buffer)
1423 mutex_unlock(&cgroup_mutex); 1435 kfree(buffer);
1424out1:
1425 kfree(buffer);
1426 return retval; 1436 return retval;
1427} 1437}
1428 1438
@@ -1438,6 +1448,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1448 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1439 if (cft->write_u64 || cft->write_s64) 1449 if (cft->write_u64 || cft->write_s64)
1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 1450 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1451 if (cft->write_string)
1452 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) { 1453 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 1454 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes; 1455 return ret ? ret : nbytes;
@@ -1450,7 +1462,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1450 char __user *buf, size_t nbytes, 1462 char __user *buf, size_t nbytes,
1451 loff_t *ppos) 1463 loff_t *ppos)
1452{ 1464{
1453 char tmp[64]; 1465 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1454 u64 val = cft->read_u64(cgrp, cft); 1466 u64 val = cft->read_u64(cgrp, cft);
1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1467 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1456 1468
@@ -1462,56 +1474,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1462 char __user *buf, size_t nbytes, 1474 char __user *buf, size_t nbytes,
1463 loff_t *ppos) 1475 loff_t *ppos)
1464{ 1476{
1465 char tmp[64]; 1477 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1466 s64 val = cft->read_s64(cgrp, cft); 1478 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val); 1479 int len = sprintf(tmp, "%lld\n", (long long) val);
1468 1480
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1481 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470} 1482}
1471 1483
1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1473 struct cftype *cft,
1474 struct file *file,
1475 char __user *buf,
1476 size_t nbytes, loff_t *ppos)
1477{
1478 enum cgroup_filetype type = cft->private;
1479 char *page;
1480 ssize_t retval = 0;
1481 char *s;
1482
1483 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1484 return -ENOMEM;
1485
1486 s = page;
1487
1488 switch (type) {
1489 case FILE_RELEASE_AGENT:
1490 {
1491 struct cgroupfs_root *root;
1492 size_t n;
1493 mutex_lock(&cgroup_mutex);
1494 root = cgrp->root;
1495 n = strnlen(root->release_agent_path,
1496 sizeof(root->release_agent_path));
1497 n = min(n, (size_t) PAGE_SIZE);
1498 strncpy(s, root->release_agent_path, n);
1499 mutex_unlock(&cgroup_mutex);
1500 s += n;
1501 break;
1502 }
1503 default:
1504 retval = -EINVAL;
1505 goto out;
1506 }
1507 *s++ = '\n';
1508
1509 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1510out:
1511 free_page((unsigned long)page);
1512 return retval;
1513}
1514
1515static ssize_t cgroup_file_read(struct file *file, char __user *buf, 1484static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1516 size_t nbytes, loff_t *ppos) 1485 size_t nbytes, loff_t *ppos)
1517{ 1486{
@@ -1569,6 +1538,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
1569 1538
1570static struct file_operations cgroup_seqfile_operations = { 1539static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read, 1540 .read = seq_read,
1541 .write = cgroup_file_write,
1572 .llseek = seq_lseek, 1542 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release, 1543 .release = cgroup_seqfile_release,
1574}; 1544};
@@ -1756,15 +1726,11 @@ int cgroup_add_files(struct cgroup *cgrp,
1756int cgroup_task_count(const struct cgroup *cgrp) 1726int cgroup_task_count(const struct cgroup *cgrp)
1757{ 1727{
1758 int count = 0; 1728 int count = 0;
1759 struct list_head *l; 1729 struct cg_cgroup_link *link;
1760 1730
1761 read_lock(&css_set_lock); 1731 read_lock(&css_set_lock);
1762 l = cgrp->css_sets.next; 1732 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1763 while (l != &cgrp->css_sets) {
1764 struct cg_cgroup_link *link =
1765 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1766 count += atomic_read(&link->cg->ref.refcount); 1733 count += atomic_read(&link->cg->ref.refcount);
1767 l = l->next;
1768 } 1734 }
1769 read_unlock(&css_set_lock); 1735 read_unlock(&css_set_lock);
1770 return count; 1736 return count;
@@ -2227,6 +2193,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2227 return notify_on_release(cgrp); 2193 return notify_on_release(cgrp);
2228} 2194}
2229 2195
2196static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2197 struct cftype *cft,
2198 u64 val)
2199{
2200 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
2201 if (val)
2202 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2203 else
2204 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2205 return 0;
2206}
2207
2230/* 2208/*
2231 * for the common functions, 'private' gives the type of file 2209 * for the common functions, 'private' gives the type of file
2232 */ 2210 */
@@ -2235,7 +2213,7 @@ static struct cftype files[] = {
2235 .name = "tasks", 2213 .name = "tasks",
2236 .open = cgroup_tasks_open, 2214 .open = cgroup_tasks_open,
2237 .read = cgroup_tasks_read, 2215 .read = cgroup_tasks_read,
2238 .write = cgroup_common_file_write, 2216 .write_u64 = cgroup_tasks_write,
2239 .release = cgroup_tasks_release, 2217 .release = cgroup_tasks_release,
2240 .private = FILE_TASKLIST, 2218 .private = FILE_TASKLIST,
2241 }, 2219 },
@@ -2243,15 +2221,16 @@ static struct cftype files[] = {
2243 { 2221 {
2244 .name = "notify_on_release", 2222 .name = "notify_on_release",
2245 .read_u64 = cgroup_read_notify_on_release, 2223 .read_u64 = cgroup_read_notify_on_release,
2246 .write = cgroup_common_file_write, 2224 .write_u64 = cgroup_write_notify_on_release,
2247 .private = FILE_NOTIFY_ON_RELEASE, 2225 .private = FILE_NOTIFY_ON_RELEASE,
2248 }, 2226 },
2249}; 2227};
2250 2228
2251static struct cftype cft_release_agent = { 2229static struct cftype cft_release_agent = {
2252 .name = "release_agent", 2230 .name = "release_agent",
2253 .read = cgroup_common_file_read, 2231 .read_seq_string = cgroup_release_agent_show,
2254 .write = cgroup_common_file_write, 2232 .write_string = cgroup_release_agent_write,
2233 .max_write_len = PATH_MAX,
2255 .private = FILE_RELEASE_AGENT, 2234 .private = FILE_RELEASE_AGENT,
2256}; 2235};
2257 2236
@@ -2869,16 +2848,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2869 * cgroup_clone - clone the cgroup the given subsystem is attached to 2848 * cgroup_clone - clone the cgroup the given subsystem is attached to
2870 * @tsk: the task to be moved 2849 * @tsk: the task to be moved
2871 * @subsys: the given subsystem 2850 * @subsys: the given subsystem
2851 * @nodename: the name for the new cgroup
2872 * 2852 *
2873 * Duplicate the current cgroup in the hierarchy that the given 2853 * Duplicate the current cgroup in the hierarchy that the given
2874 * subsystem is attached to, and move this task into the new 2854 * subsystem is attached to, and move this task into the new
2875 * child. 2855 * child.
2876 */ 2856 */
2877int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) 2857int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2858 char *nodename)
2878{ 2859{
2879 struct dentry *dentry; 2860 struct dentry *dentry;
2880 int ret = 0; 2861 int ret = 0;
2881 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2882 struct cgroup *parent, *child; 2862 struct cgroup *parent, *child;
2883 struct inode *inode; 2863 struct inode *inode;
2884 struct css_set *cg; 2864 struct css_set *cg;
@@ -2903,8 +2883,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2903 cg = tsk->cgroups; 2883 cg = tsk->cgroups;
2904 parent = task_cgroup(tsk, subsys->subsys_id); 2884 parent = task_cgroup(tsk, subsys->subsys_id);
2905 2885
2906 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
2907
2908 /* Pin the hierarchy */ 2886 /* Pin the hierarchy */
2909 atomic_inc(&parent->root->sb->s_active); 2887 atomic_inc(&parent->root->sb->s_active);
2910 2888
@@ -3078,27 +3056,24 @@ static void cgroup_release_agent(struct work_struct *work)
3078 while (!list_empty(&release_list)) { 3056 while (!list_empty(&release_list)) {
3079 char *argv[3], *envp[3]; 3057 char *argv[3], *envp[3];
3080 int i; 3058 int i;
3081 char *pathbuf; 3059 char *pathbuf = NULL, *agentbuf = NULL;
3082 struct cgroup *cgrp = list_entry(release_list.next, 3060 struct cgroup *cgrp = list_entry(release_list.next,
3083 struct cgroup, 3061 struct cgroup,
3084 release_list); 3062 release_list);
3085 list_del_init(&cgrp->release_list); 3063 list_del_init(&cgrp->release_list);
3086 spin_unlock(&release_list_lock); 3064 spin_unlock(&release_list_lock);
3087 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 3065 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3088 if (!pathbuf) { 3066 if (!pathbuf)
3089 spin_lock(&release_list_lock); 3067 goto continue_free;
3090 continue; 3068 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
3091 } 3069 goto continue_free;
3092 3070 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
3093 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { 3071 if (!agentbuf)
3094 kfree(pathbuf); 3072 goto continue_free;
3095 spin_lock(&release_list_lock);
3096 continue;
3097 }
3098 3073
3099 i = 0; 3074 i = 0;
3100 argv[i++] = cgrp->root->release_agent_path; 3075 argv[i++] = agentbuf;
3101 argv[i++] = (char *)pathbuf; 3076 argv[i++] = pathbuf;
3102 argv[i] = NULL; 3077 argv[i] = NULL;
3103 3078
3104 i = 0; 3079 i = 0;
@@ -3112,8 +3087,10 @@ static void cgroup_release_agent(struct work_struct *work)
3112 * be a slow process */ 3087 * be a slow process */
3113 mutex_unlock(&cgroup_mutex); 3088 mutex_unlock(&cgroup_mutex);
3114 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 3089 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
3115 kfree(pathbuf);
3116 mutex_lock(&cgroup_mutex); 3090 mutex_lock(&cgroup_mutex);
3091 continue_free:
3092 kfree(pathbuf);
3093 kfree(agentbuf);
3117 spin_lock(&release_list_lock); 3094 spin_lock(&release_list_lock);
3118 } 3095 }
3119 spin_unlock(&release_list_lock); 3096 spin_unlock(&release_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2cc409ce0a8f..10ba5f1004a5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -285,6 +285,11 @@ out_allowed:
285 set_cpus_allowed_ptr(current, &old_allowed); 285 set_cpus_allowed_ptr(current, &old_allowed);
286out_release: 286out_release:
287 cpu_hotplug_done(); 287 cpu_hotplug_done();
288 if (!err) {
289 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
290 hcpu) == NOTIFY_BAD)
291 BUG();
292 }
288 return err; 293 return err;
289} 294}
290 295
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5738910c34c..91cf85b36dd5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -227,10 +227,6 @@ static struct cpuset top_cpuset = {
227 * The task_struct fields mems_allowed and mems_generation may only 227 * The task_struct fields mems_allowed and mems_generation may only
228 * be accessed in the context of that task, so require no locks. 228 * be accessed in the context of that task, so require no locks.
229 * 229 *
230 * The cpuset_common_file_write handler for operations that modify
231 * the cpuset hierarchy holds cgroup_mutex across the entire operation,
232 * single threading all such cpuset modifications across the system.
233 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 230 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 231 * small pieces of code, such as when reading out possibly multi-word
236 * cpumasks and nodemasks. 232 * cpumasks and nodemasks.
@@ -369,7 +365,7 @@ void cpuset_update_task_memory_state(void)
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 365 my_cpusets_mem_gen = top_cpuset.mems_generation;
370 } else { 366 } else {
371 rcu_read_lock(); 367 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(current)->mems_generation; 368 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock(); 369 rcu_read_unlock();
374 } 370 }
375 371
@@ -500,11 +496,16 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
500/* 496/*
501 * rebuild_sched_domains() 497 * rebuild_sched_domains()
502 * 498 *
503 * If the flag 'sched_load_balance' of any cpuset with non-empty 499 * This routine will be called to rebuild the scheduler's dynamic
504 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 500 * sched domains:
505 * which has that flag enabled, or if any cpuset with a non-empty 501 * - if the flag 'sched_load_balance' of any cpuset with non-empty
506 * 'cpus' is removed, then call this routine to rebuild the 502 * 'cpus' changes,
507 * scheduler's dynamic sched domains. 503 * - or if the 'cpus' allowed changes in any cpuset which has that
504 * flag enabled,
505 * - or if the 'sched_relax_domain_level' of any cpuset which has
506 * that flag enabled and with non-empty 'cpus' changes,
507 * - or if any cpuset with non-empty 'cpus' is removed,
508 * - or if a cpu gets offlined.
508 * 509 *
509 * This routine builds a partial partition of the systems CPUs 510 * This routine builds a partial partition of the systems CPUs
510 * (the set of non-overlappping cpumask_t's in the array 'part' 511 * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -609,8 +610,13 @@ void rebuild_sched_domains(void)
609 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { 610 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
610 struct cgroup *cont; 611 struct cgroup *cont;
611 struct cpuset *child; /* scans child cpusets of cp */ 612 struct cpuset *child; /* scans child cpusets of cp */
613
614 if (cpus_empty(cp->cpus_allowed))
615 continue;
616
612 if (is_sched_load_balance(cp)) 617 if (is_sched_load_balance(cp))
613 csa[csn++] = cp; 618 csa[csn++] = cp;
619
614 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 620 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
615 child = cgroup_cs(cont); 621 child = cgroup_cs(cont);
616 __kfifo_put(q, (void *)&child, sizeof(cp)); 622 __kfifo_put(q, (void *)&child, sizeof(cp));
@@ -703,36 +709,6 @@ done:
703 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 709 /* Don't kfree(dattr) -- partition_sched_domains() does that. */
704} 710}
705 711
706static inline int started_after_time(struct task_struct *t1,
707 struct timespec *time,
708 struct task_struct *t2)
709{
710 int start_diff = timespec_compare(&t1->start_time, time);
711 if (start_diff > 0) {
712 return 1;
713 } else if (start_diff < 0) {
714 return 0;
715 } else {
716 /*
717 * Arbitrarily, if two processes started at the same
718 * time, we'll say that the lower pointer value
719 * started first. Note that t2 may have exited by now
720 * so this may not be a valid pointer any longer, but
721 * that's fine - it still serves to distinguish
722 * between two tasks started (effectively)
723 * simultaneously.
724 */
725 return t1 > t2;
726 }
727}
728
729static inline int started_after(void *p1, void *p2)
730{
731 struct task_struct *t1 = p1;
732 struct task_struct *t2 = p2;
733 return started_after_time(t1, &t2->start_time, t2);
734}
735
736/** 712/**
737 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's 713 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
738 * @tsk: task to test 714 * @tsk: task to test
@@ -768,15 +744,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
768} 744}
769 745
770/** 746/**
747 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
748 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
749 *
750 * Called with cgroup_mutex held
751 *
752 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
753 * calling callback functions for each.
754 *
755 * Return 0 if successful, -errno if not.
756 */
757static int update_tasks_cpumask(struct cpuset *cs)
758{
759 struct cgroup_scanner scan;
760 struct ptr_heap heap;
761 int retval;
762
763 /*
764 * cgroup_scan_tasks() will initialize heap->gt for us.
765 * heap_init() is still needed here for we should not change
766 * cs->cpus_allowed when heap_init() fails.
767 */
768 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
769 if (retval)
770 return retval;
771
772 scan.cg = cs->css.cgroup;
773 scan.test_task = cpuset_test_cpumask;
774 scan.process_task = cpuset_change_cpumask;
775 scan.heap = &heap;
776 retval = cgroup_scan_tasks(&scan);
777
778 heap_free(&heap);
779 return retval;
780}
781
782/**
771 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 783 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
772 * @cs: the cpuset to consider 784 * @cs: the cpuset to consider
773 * @buf: buffer of cpu numbers written to this cpuset 785 * @buf: buffer of cpu numbers written to this cpuset
774 */ 786 */
775static int update_cpumask(struct cpuset *cs, char *buf) 787static int update_cpumask(struct cpuset *cs, const char *buf)
776{ 788{
777 struct cpuset trialcs; 789 struct cpuset trialcs;
778 struct cgroup_scanner scan;
779 struct ptr_heap heap;
780 int retval; 790 int retval;
781 int is_load_balanced; 791 int is_load_balanced;
782 792
@@ -792,7 +802,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
792 * that parsing. The validate_change() call ensures that cpusets 802 * that parsing. The validate_change() call ensures that cpusets
793 * with tasks have cpus. 803 * with tasks have cpus.
794 */ 804 */
795 buf = strstrip(buf);
796 if (!*buf) { 805 if (!*buf) {
797 cpus_clear(trialcs.cpus_allowed); 806 cpus_clear(trialcs.cpus_allowed);
798 } else { 807 } else {
@@ -811,10 +820,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 820 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
812 return 0; 821 return 0;
813 822
814 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
815 if (retval)
816 return retval;
817
818 is_load_balanced = is_sched_load_balance(&trialcs); 823 is_load_balanced = is_sched_load_balance(&trialcs);
819 824
820 mutex_lock(&callback_mutex); 825 mutex_lock(&callback_mutex);
@@ -825,12 +830,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
825 * Scan tasks in the cpuset, and update the cpumasks of any 830 * Scan tasks in the cpuset, and update the cpumasks of any
826 * that need an update. 831 * that need an update.
827 */ 832 */
828 scan.cg = cs->css.cgroup; 833 retval = update_tasks_cpumask(cs);
829 scan.test_task = cpuset_test_cpumask; 834 if (retval < 0)
830 scan.process_task = cpuset_change_cpumask; 835 return retval;
831 scan.heap = &heap;
832 cgroup_scan_tasks(&scan);
833 heap_free(&heap);
834 836
835 if (is_load_balanced) 837 if (is_load_balanced)
836 rebuild_sched_domains(); 838 rebuild_sched_domains();
@@ -886,74 +888,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
886 mutex_unlock(&callback_mutex); 888 mutex_unlock(&callback_mutex);
887} 889}
888 890
889/*
890 * Handle user request to change the 'mems' memory placement
891 * of a cpuset. Needs to validate the request, update the
892 * cpusets mems_allowed and mems_generation, and for each
893 * task in the cpuset, rebind any vma mempolicies and if
894 * the cpuset is marked 'memory_migrate', migrate the tasks
895 * pages to the new memory.
896 *
897 * Call with cgroup_mutex held. May take callback_mutex during call.
898 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
899 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
900 * their mempolicies to the cpusets new mems_allowed.
901 */
902
903static void *cpuset_being_rebound; 891static void *cpuset_being_rebound;
904 892
905static int update_nodemask(struct cpuset *cs, char *buf) 893/**
894 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
895 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
896 * @oldmem: old mems_allowed of cpuset cs
897 *
898 * Called with cgroup_mutex held
899 * Return 0 if successful, -errno if not.
900 */
901static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
906{ 902{
907 struct cpuset trialcs;
908 nodemask_t oldmem;
909 struct task_struct *p; 903 struct task_struct *p;
910 struct mm_struct **mmarray; 904 struct mm_struct **mmarray;
911 int i, n, ntasks; 905 int i, n, ntasks;
912 int migrate; 906 int migrate;
913 int fudge; 907 int fudge;
914 int retval;
915 struct cgroup_iter it; 908 struct cgroup_iter it;
916 909 int retval;
917 /*
918 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
919 * it's read-only
920 */
921 if (cs == &top_cpuset)
922 return -EACCES;
923
924 trialcs = *cs;
925
926 /*
927 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
928 * Since nodelist_parse() fails on an empty mask, we special case
929 * that parsing. The validate_change() call ensures that cpusets
930 * with tasks have memory.
931 */
932 buf = strstrip(buf);
933 if (!*buf) {
934 nodes_clear(trialcs.mems_allowed);
935 } else {
936 retval = nodelist_parse(buf, trialcs.mems_allowed);
937 if (retval < 0)
938 goto done;
939
940 if (!nodes_subset(trialcs.mems_allowed,
941 node_states[N_HIGH_MEMORY]))
942 return -EINVAL;
943 }
944 oldmem = cs->mems_allowed;
945 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
946 retval = 0; /* Too easy - nothing to do */
947 goto done;
948 }
949 retval = validate_change(cs, &trialcs);
950 if (retval < 0)
951 goto done;
952
953 mutex_lock(&callback_mutex);
954 cs->mems_allowed = trialcs.mems_allowed;
955 cs->mems_generation = cpuset_mems_generation++;
956 mutex_unlock(&callback_mutex);
957 910
958 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 911 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
959 912
@@ -1020,7 +973,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1020 973
1021 mpol_rebind_mm(mm, &cs->mems_allowed); 974 mpol_rebind_mm(mm, &cs->mems_allowed);
1022 if (migrate) 975 if (migrate)
1023 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); 976 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1024 mmput(mm); 977 mmput(mm);
1025 } 978 }
1026 979
@@ -1032,6 +985,70 @@ done:
1032 return retval; 985 return retval;
1033} 986}
1034 987
988/*
989 * Handle user request to change the 'mems' memory placement
990 * of a cpuset. Needs to validate the request, update the
991 * cpusets mems_allowed and mems_generation, and for each
992 * task in the cpuset, rebind any vma mempolicies and if
993 * the cpuset is marked 'memory_migrate', migrate the tasks
994 * pages to the new memory.
995 *
996 * Call with cgroup_mutex held. May take callback_mutex during call.
997 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
998 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
999 * their mempolicies to the cpusets new mems_allowed.
1000 */
1001static int update_nodemask(struct cpuset *cs, const char *buf)
1002{
1003 struct cpuset trialcs;
1004 nodemask_t oldmem;
1005 int retval;
1006
1007 /*
1008 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1009 * it's read-only
1010 */
1011 if (cs == &top_cpuset)
1012 return -EACCES;
1013
1014 trialcs = *cs;
1015
1016 /*
1017 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1018 * Since nodelist_parse() fails on an empty mask, we special case
1019 * that parsing. The validate_change() call ensures that cpusets
1020 * with tasks have memory.
1021 */
1022 if (!*buf) {
1023 nodes_clear(trialcs.mems_allowed);
1024 } else {
1025 retval = nodelist_parse(buf, trialcs.mems_allowed);
1026 if (retval < 0)
1027 goto done;
1028
1029 if (!nodes_subset(trialcs.mems_allowed,
1030 node_states[N_HIGH_MEMORY]))
1031 return -EINVAL;
1032 }
1033 oldmem = cs->mems_allowed;
1034 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
1035 retval = 0; /* Too easy - nothing to do */
1036 goto done;
1037 }
1038 retval = validate_change(cs, &trialcs);
1039 if (retval < 0)
1040 goto done;
1041
1042 mutex_lock(&callback_mutex);
1043 cs->mems_allowed = trialcs.mems_allowed;
1044 cs->mems_generation = cpuset_mems_generation++;
1045 mutex_unlock(&callback_mutex);
1046
1047 retval = update_tasks_nodemask(cs, &oldmem);
1048done:
1049 return retval;
1050}
1051
1035int current_cpuset_is_being_rebound(void) 1052int current_cpuset_is_being_rebound(void)
1036{ 1053{
1037 return task_cs(current) == cpuset_being_rebound; 1054 return task_cs(current) == cpuset_being_rebound;
@@ -1044,7 +1061,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1044 1061
1045 if (val != cs->relax_domain_level) { 1062 if (val != cs->relax_domain_level) {
1046 cs->relax_domain_level = val; 1063 cs->relax_domain_level = val;
1047 rebuild_sched_domains(); 1064 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1065 rebuild_sched_domains();
1048 } 1066 }
1049 1067
1050 return 0; 1068 return 0;
@@ -1256,72 +1274,14 @@ typedef enum {
1256 FILE_SPREAD_SLAB, 1274 FILE_SPREAD_SLAB,
1257} cpuset_filetype_t; 1275} cpuset_filetype_t;
1258 1276
1259static ssize_t cpuset_common_file_write(struct cgroup *cont,
1260 struct cftype *cft,
1261 struct file *file,
1262 const char __user *userbuf,
1263 size_t nbytes, loff_t *unused_ppos)
1264{
1265 struct cpuset *cs = cgroup_cs(cont);
1266 cpuset_filetype_t type = cft->private;
1267 char *buffer;
1268 int retval = 0;
1269
1270 /* Crude upper limit on largest legitimate cpulist user might write. */
1271 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1272 return -E2BIG;
1273
1274 /* +1 for nul-terminator */
1275 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1276 if (!buffer)
1277 return -ENOMEM;
1278
1279 if (copy_from_user(buffer, userbuf, nbytes)) {
1280 retval = -EFAULT;
1281 goto out1;
1282 }
1283 buffer[nbytes] = 0; /* nul-terminate */
1284
1285 cgroup_lock();
1286
1287 if (cgroup_is_removed(cont)) {
1288 retval = -ENODEV;
1289 goto out2;
1290 }
1291
1292 switch (type) {
1293 case FILE_CPULIST:
1294 retval = update_cpumask(cs, buffer);
1295 break;
1296 case FILE_MEMLIST:
1297 retval = update_nodemask(cs, buffer);
1298 break;
1299 default:
1300 retval = -EINVAL;
1301 goto out2;
1302 }
1303
1304 if (retval == 0)
1305 retval = nbytes;
1306out2:
1307 cgroup_unlock();
1308out1:
1309 kfree(buffer);
1310 return retval;
1311}
1312
1313static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1277static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1314{ 1278{
1315 int retval = 0; 1279 int retval = 0;
1316 struct cpuset *cs = cgroup_cs(cgrp); 1280 struct cpuset *cs = cgroup_cs(cgrp);
1317 cpuset_filetype_t type = cft->private; 1281 cpuset_filetype_t type = cft->private;
1318 1282
1319 cgroup_lock(); 1283 if (!cgroup_lock_live_group(cgrp))
1320
1321 if (cgroup_is_removed(cgrp)) {
1322 cgroup_unlock();
1323 return -ENODEV; 1284 return -ENODEV;
1324 }
1325 1285
1326 switch (type) { 1286 switch (type) {
1327 case FILE_CPU_EXCLUSIVE: 1287 case FILE_CPU_EXCLUSIVE:
@@ -1367,12 +1327,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1367 struct cpuset *cs = cgroup_cs(cgrp); 1327 struct cpuset *cs = cgroup_cs(cgrp);
1368 cpuset_filetype_t type = cft->private; 1328 cpuset_filetype_t type = cft->private;
1369 1329
1370 cgroup_lock(); 1330 if (!cgroup_lock_live_group(cgrp))
1371
1372 if (cgroup_is_removed(cgrp)) {
1373 cgroup_unlock();
1374 return -ENODEV; 1331 return -ENODEV;
1375 } 1332
1376 switch (type) { 1333 switch (type) {
1377 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1334 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1378 retval = update_relax_domain_level(cs, val); 1335 retval = update_relax_domain_level(cs, val);
@@ -1386,6 +1343,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1386} 1343}
1387 1344
1388/* 1345/*
1346 * Common handling for a write to a "cpus" or "mems" file.
1347 */
1348static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1349 const char *buf)
1350{
1351 int retval = 0;
1352
1353 if (!cgroup_lock_live_group(cgrp))
1354 return -ENODEV;
1355
1356 switch (cft->private) {
1357 case FILE_CPULIST:
1358 retval = update_cpumask(cgroup_cs(cgrp), buf);
1359 break;
1360 case FILE_MEMLIST:
1361 retval = update_nodemask(cgroup_cs(cgrp), buf);
1362 break;
1363 default:
1364 retval = -EINVAL;
1365 break;
1366 }
1367 cgroup_unlock();
1368 return retval;
1369}
1370
1371/*
1389 * These ascii lists should be read in a single call, by using a user 1372 * These ascii lists should be read in a single call, by using a user
1390 * buffer large enough to hold the entire map. If read in smaller 1373 * buffer large enough to hold the entire map. If read in smaller
1391 * chunks, there is no guarantee of atomicity. Since the display format 1374 * chunks, there is no guarantee of atomicity. Since the display format
@@ -1504,14 +1487,16 @@ static struct cftype files[] = {
1504 { 1487 {
1505 .name = "cpus", 1488 .name = "cpus",
1506 .read = cpuset_common_file_read, 1489 .read = cpuset_common_file_read,
1507 .write = cpuset_common_file_write, 1490 .write_string = cpuset_write_resmask,
1491 .max_write_len = (100U + 6 * NR_CPUS),
1508 .private = FILE_CPULIST, 1492 .private = FILE_CPULIST,
1509 }, 1493 },
1510 1494
1511 { 1495 {
1512 .name = "mems", 1496 .name = "mems",
1513 .read = cpuset_common_file_read, 1497 .read = cpuset_common_file_read,
1514 .write = cpuset_common_file_write, 1498 .write_string = cpuset_write_resmask,
1499 .max_write_len = (100U + 6 * MAX_NUMNODES),
1515 .private = FILE_MEMLIST, 1500 .private = FILE_MEMLIST,
1516 }, 1501 },
1517 1502
@@ -1792,7 +1777,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1792 scan.scan.heap = NULL; 1777 scan.scan.heap = NULL;
1793 scan.to = to->css.cgroup; 1778 scan.to = to->css.cgroup;
1794 1779
1795 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) 1780 if (cgroup_scan_tasks(&scan.scan))
1796 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1781 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1797 "cgroup_scan_tasks failed\n"); 1782 "cgroup_scan_tasks failed\n");
1798} 1783}
@@ -1852,6 +1837,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1852 struct cpuset *child; /* scans child cpusets of cp */ 1837 struct cpuset *child; /* scans child cpusets of cp */
1853 struct list_head queue; 1838 struct list_head queue;
1854 struct cgroup *cont; 1839 struct cgroup *cont;
1840 nodemask_t oldmems;
1855 1841
1856 INIT_LIST_HEAD(&queue); 1842 INIT_LIST_HEAD(&queue);
1857 1843
@@ -1871,6 +1857,8 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1871 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 1857 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1872 continue; 1858 continue;
1873 1859
1860 oldmems = cp->mems_allowed;
1861
1874 /* Remove offline cpus and mems from this cpuset. */ 1862 /* Remove offline cpus and mems from this cpuset. */
1875 mutex_lock(&callback_mutex); 1863 mutex_lock(&callback_mutex);
1876 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 1864 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1882,6 +1870,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1882 if (cpus_empty(cp->cpus_allowed) || 1870 if (cpus_empty(cp->cpus_allowed) ||
1883 nodes_empty(cp->mems_allowed)) 1871 nodes_empty(cp->mems_allowed))
1884 remove_tasks_in_empty_cpuset(cp); 1872 remove_tasks_in_empty_cpuset(cp);
1873 else {
1874 update_tasks_cpumask(cp);
1875 update_tasks_nodemask(cp, &oldmems);
1876 }
1885 } 1877 }
1886} 1878}
1887 1879
@@ -1974,7 +1966,6 @@ void __init cpuset_init_smp(void)
1974} 1966}
1975 1967
1976/** 1968/**
1977
1978 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 1969 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1979 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1970 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1980 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 1971 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..b3179dad71be 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; 145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay; 146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
148 tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
149 d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
148 d->blkio_count += tsk->delays->blkio_count; 150 d->blkio_count += tsk->delays->blkio_count;
149 d->swapin_count += tsk->delays->swapin_count; 151 d->swapin_count += tsk->delays->swapin_count;
152 d->freepages_count += tsk->delays->freepages_count;
150 spin_unlock_irqrestore(&tsk->delays->lock, flags); 153 spin_unlock_irqrestore(&tsk->delays->lock, flags);
151 154
152done: 155done:
@@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
165 return ret; 168 return ret;
166} 169}
167 170
171void __delayacct_freepages_start(void)
172{
173 delayacct_start(&current->delays->freepages_start);
174}
175
176void __delayacct_freepages_end(void)
177{
178 delayacct_end(&current->delays->freepages_start,
179 &current->delays->freepages_end,
180 &current->delays->freepages_delay,
181 &current->delays->freepages_count);
182}
183
diff --git a/kernel/exit.c b/kernel/exit.c
index 93d2711b9381..ad933bb29ec7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 rcu_read_lock();
89 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference(tsk->sighand);
90 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
91 90
@@ -121,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk)
121 sig->nivcsw += tsk->nivcsw; 120 sig->nivcsw += tsk->nivcsw;
122 sig->inblock += task_io_get_inblock(tsk); 121 sig->inblock += task_io_get_inblock(tsk);
123 sig->oublock += task_io_get_oublock(tsk); 122 sig->oublock += task_io_get_oublock(tsk);
123#ifdef CONFIG_TASK_XACCT
124 sig->rchar += tsk->rchar;
125 sig->wchar += tsk->wchar;
126 sig->syscr += tsk->syscr;
127 sig->syscw += tsk->syscw;
128#endif /* CONFIG_TASK_XACCT */
129#ifdef CONFIG_TASK_IO_ACCOUNTING
130 sig->ioac.read_bytes += tsk->ioac.read_bytes;
131 sig->ioac.write_bytes += tsk->ioac.write_bytes;
132 sig->ioac.cancelled_write_bytes +=
133 tsk->ioac.cancelled_write_bytes;
134#endif /* CONFIG_TASK_IO_ACCOUNTING */
124 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 135 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
125 sig = NULL; /* Marker for below. */ 136 sig = NULL; /* Marker for below. */
126 } 137 }
@@ -136,7 +147,6 @@ static void __exit_signal(struct task_struct *tsk)
136 tsk->signal = NULL; 147 tsk->signal = NULL;
137 tsk->sighand = NULL; 148 tsk->sighand = NULL;
138 spin_unlock(&sighand->siglock); 149 spin_unlock(&sighand->siglock);
139 rcu_read_unlock();
140 150
141 __cleanup_sighand(sighand); 151 __cleanup_sighand(sighand);
142 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 152 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -432,7 +442,7 @@ void daemonize(const char *name, ...)
432 * We don't want to have TIF_FREEZE set if the system-wide hibernation 442 * We don't want to have TIF_FREEZE set if the system-wide hibernation
433 * or suspend transition begins right now. 443 * or suspend transition begins right now.
434 */ 444 */
435 current->flags |= PF_NOFREEZE; 445 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
436 446
437 if (current->nsproxy != &init_nsproxy) { 447 if (current->nsproxy != &init_nsproxy) {
438 get_nsproxy(&init_nsproxy); 448 get_nsproxy(&init_nsproxy);
@@ -666,26 +676,40 @@ assign_new_owner:
666static void exit_mm(struct task_struct * tsk) 676static void exit_mm(struct task_struct * tsk)
667{ 677{
668 struct mm_struct *mm = tsk->mm; 678 struct mm_struct *mm = tsk->mm;
679 struct core_state *core_state;
669 680
670 mm_release(tsk, mm); 681 mm_release(tsk, mm);
671 if (!mm) 682 if (!mm)
672 return; 683 return;
673 /* 684 /*
674 * Serialize with any possible pending coredump. 685 * Serialize with any possible pending coredump.
675 * We must hold mmap_sem around checking core_waiters 686 * We must hold mmap_sem around checking core_state
676 * and clearing tsk->mm. The core-inducing thread 687 * and clearing tsk->mm. The core-inducing thread
677 * will increment core_waiters for each thread in the 688 * will increment ->nr_threads for each thread in the
678 * group with ->mm != NULL. 689 * group with ->mm != NULL.
679 */ 690 */
680 down_read(&mm->mmap_sem); 691 down_read(&mm->mmap_sem);
681 if (mm->core_waiters) { 692 core_state = mm->core_state;
693 if (core_state) {
694 struct core_thread self;
682 up_read(&mm->mmap_sem); 695 up_read(&mm->mmap_sem);
683 down_write(&mm->mmap_sem);
684 if (!--mm->core_waiters)
685 complete(mm->core_startup_done);
686 up_write(&mm->mmap_sem);
687 696
688 wait_for_completion(&mm->core_done); 697 self.task = tsk;
698 self.next = xchg(&core_state->dumper.next, &self);
699 /*
700 * Implies mb(), the result of xchg() must be visible
701 * to core_state->dumper.
702 */
703 if (atomic_dec_and_test(&core_state->nr_threads))
704 complete(&core_state->startup);
705
706 for (;;) {
707 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
708 if (!self.task) /* see coredump_finish() */
709 break;
710 schedule();
711 }
712 __set_task_state(tsk, TASK_RUNNING);
689 down_read(&mm->mmap_sem); 713 down_read(&mm->mmap_sem);
690 } 714 }
691 atomic_inc(&mm->mm_count); 715 atomic_inc(&mm->mm_count);
@@ -1354,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options,
1354 psig->coublock += 1378 psig->coublock +=
1355 task_io_get_oublock(p) + 1379 task_io_get_oublock(p) +
1356 sig->oublock + sig->coublock; 1380 sig->oublock + sig->coublock;
1381#ifdef CONFIG_TASK_XACCT
1382 psig->rchar += p->rchar + sig->rchar;
1383 psig->wchar += p->wchar + sig->wchar;
1384 psig->syscr += p->syscr + sig->syscr;
1385 psig->syscw += p->syscw + sig->syscw;
1386#endif /* CONFIG_TASK_XACCT */
1387#ifdef CONFIG_TASK_IO_ACCOUNTING
1388 psig->ioac.read_bytes +=
1389 p->ioac.read_bytes + sig->ioac.read_bytes;
1390 psig->ioac.write_bytes +=
1391 p->ioac.write_bytes + sig->ioac.write_bytes;
1392 psig->ioac.cancelled_write_bytes +=
1393 p->ioac.cancelled_write_bytes +
1394 sig->ioac.cancelled_write_bytes;
1395#endif /* CONFIG_TASK_IO_ACCOUNTING */
1357 spin_unlock_irq(&p->parent->sighand->siglock); 1396 spin_unlock_irq(&p->parent->sighand->siglock);
1358 } 1397 }
1359 1398
diff --git a/kernel/fork.c b/kernel/fork.c
index 552c8d8e77ad..b99d73e971a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,6 +93,23 @@ int nr_processes(void)
93static struct kmem_cache *task_struct_cachep; 93static struct kmem_cache *task_struct_cachep;
94#endif 94#endif
95 95
96#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
97static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
98{
99#ifdef CONFIG_DEBUG_STACK_USAGE
100 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
101#else
102 gfp_t mask = GFP_KERNEL;
103#endif
104 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
105}
106
107static inline void free_thread_info(struct thread_info *ti)
108{
109 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
110}
111#endif
112
96/* SLAB cache for signal_struct structures (tsk->signal) */ 113/* SLAB cache for signal_struct structures (tsk->signal) */
97static struct kmem_cache *signal_cachep; 114static struct kmem_cache *signal_cachep;
98 115
@@ -383,7 +400,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
383 INIT_LIST_HEAD(&mm->mmlist); 400 INIT_LIST_HEAD(&mm->mmlist);
384 mm->flags = (current->mm) ? current->mm->flags 401 mm->flags = (current->mm) ? current->mm->flags
385 : MMF_DUMP_FILTER_DEFAULT; 402 : MMF_DUMP_FILTER_DEFAULT;
386 mm->core_waiters = 0; 403 mm->core_state = NULL;
387 mm->nr_ptes = 0; 404 mm->nr_ptes = 0;
388 set_mm_counter(mm, file_rss, 0); 405 set_mm_counter(mm, file_rss, 0);
389 set_mm_counter(mm, anon_rss, 0); 406 set_mm_counter(mm, anon_rss, 0);
@@ -457,7 +474,7 @@ EXPORT_SYMBOL_GPL(mmput);
457/** 474/**
458 * get_task_mm - acquire a reference to the task's mm 475 * get_task_mm - acquire a reference to the task's mm
459 * 476 *
460 * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning 477 * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
461 * this kernel workthread has transiently adopted a user mm with use_mm, 478 * this kernel workthread has transiently adopted a user mm with use_mm,
462 * to do its AIO) is not set and if so returns a reference to it, after 479 * to do its AIO) is not set and if so returns a reference to it, after
463 * bumping up the use count. User must release the mm via mmput() 480 * bumping up the use count. User must release the mm via mmput()
@@ -470,7 +487,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
470 task_lock(task); 487 task_lock(task);
471 mm = task->mm; 488 mm = task->mm;
472 if (mm) { 489 if (mm) {
473 if (task->flags & PF_BORROWED_MM) 490 if (task->flags & PF_KTHREAD)
474 mm = NULL; 491 mm = NULL;
475 else 492 else
476 atomic_inc(&mm->mm_users); 493 atomic_inc(&mm->mm_users);
@@ -795,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
795 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 812 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
796 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 813 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
797 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 814 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
815#ifdef CONFIG_TASK_XACCT
816 sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
817#endif
818#ifdef CONFIG_TASK_IO_ACCOUNTING
819 memset(&sig->ioac, 0, sizeof(sig->ioac));
820#endif
798 sig->sum_sched_runtime = 0; 821 sig->sum_sched_runtime = 0;
799 INIT_LIST_HEAD(&sig->cpu_timers[0]); 822 INIT_LIST_HEAD(&sig->cpu_timers[0]);
800 INIT_LIST_HEAD(&sig->cpu_timers[1]); 823 INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -1090,6 +1113,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1090 if (clone_flags & CLONE_THREAD) 1113 if (clone_flags & CLONE_THREAD)
1091 p->tgid = current->tgid; 1114 p->tgid = current->tgid;
1092 1115
1116 if (current->nsproxy != p->nsproxy) {
1117 retval = ns_cgroup_clone(p, pid);
1118 if (retval)
1119 goto bad_fork_free_pid;
1120 }
1121
1093 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1122 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1094 /* 1123 /*
1095 * Clear TID on mm_release()? 1124 * Clear TID on mm_release()?
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 63b93a935565..7d73e008fc3b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -267,9 +267,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
267 } 267 }
268 } else { 268 } else {
269 if (desc->wake_depth == 0) { 269 if (desc->wake_depth == 0) {
270 printk(KERN_WARNING "Unbalanced IRQ %d " 270 WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
271 "wake disable\n", irq);
272 WARN_ON(1);
273 } else if (--desc->wake_depth == 0) { 271 } else if (--desc->wake_depth == 0) {
274 ret = set_irq_wake_real(irq, on); 272 ret = set_irq_wake_real(irq, on);
275 if (ret) 273 if (ret)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3a..38fc10ac7541 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
176 high = kallsyms_num_syms; 176 high = kallsyms_num_syms;
177 177
178 while (high - low > 1) { 178 while (high - low > 1) {
179 mid = (low + high) / 2; 179 mid = low + (high - low) / 2;
180 if (kallsyms_addresses[mid] <= addr) 180 if (kallsyms_addresses[mid] <= addr)
181 low = mid; 181 low = mid;
182 else 182 else
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2989f67c4446..2456d1a0befb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {}
352 * @path: path to usermode executable 352 * @path: path to usermode executable
353 * @argv: arg vector for process 353 * @argv: arg vector for process
354 * @envp: environment for process 354 * @envp: environment for process
355 * @gfp_mask: gfp mask for memory allocation
355 * 356 *
356 * Returns either %NULL on allocation failure, or a subprocess_info 357 * Returns either %NULL on allocation failure, or a subprocess_info
357 * structure. This should be passed to call_usermodehelper_exec to 358 * structure. This should be passed to call_usermodehelper_exec to
358 * exec the process and free the structure. 359 * exec the process and free the structure.
359 */ 360 */
360struct subprocess_info *call_usermodehelper_setup(char *path, 361struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
361 char **argv, char **envp) 362 char **envp, gfp_t gfp_mask)
362{ 363{
363 struct subprocess_info *sub_info; 364 struct subprocess_info *sub_info;
364 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 365 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
365 if (!sub_info) 366 if (!sub_info)
366 goto out; 367 goto out;
367 368
@@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
494 struct subprocess_info *sub_info; 495 struct subprocess_info *sub_info;
495 int ret; 496 int ret;
496 497
497 sub_info = call_usermodehelper_setup(path, argv, envp); 498 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
498 if (sub_info == NULL) 499 if (sub_info == NULL)
499 return -ENOMEM; 500 return -ENOMEM;
500 501
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e00..75bc2cd9ebc6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) 62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
63#endif 63#endif
64 64
65static int kprobes_initialized;
65static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 66static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 67static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
67 68
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69static bool kprobe_enabled; 70static bool kprobe_enabled;
70 71
71DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct {
75 spinlock_t lock ____cacheline_aligned;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
79{
80 return &(kretprobe_table_locks[hash].lock);
81}
74 82
75/* 83/*
76 * Normally, functions that we'd want to prohibit kprobes in, are marked 84 * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
368 return; 376 return;
369} 377}
370 378
371/* Called with kretprobe_lock held */
372void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 379void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
373 struct hlist_head *head) 380 struct hlist_head *head)
374{ 381{
382 struct kretprobe *rp = ri->rp;
383
375 /* remove rp inst off the rprobe_inst_table */ 384 /* remove rp inst off the rprobe_inst_table */
376 hlist_del(&ri->hlist); 385 hlist_del(&ri->hlist);
377 if (ri->rp) { 386 INIT_HLIST_NODE(&ri->hlist);
378 /* remove rp inst off the used list */ 387 if (likely(rp)) {
379 hlist_del(&ri->uflist); 388 spin_lock(&rp->lock);
380 /* put rp inst back onto the free list */ 389 hlist_add_head(&ri->hlist, &rp->free_instances);
381 INIT_HLIST_NODE(&ri->uflist); 390 spin_unlock(&rp->lock);
382 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
383 } else 391 } else
384 /* Unregistering */ 392 /* Unregistering */
385 hlist_add_head(&ri->hlist, head); 393 hlist_add_head(&ri->hlist, head);
386} 394}
387 395
388struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) 396void kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags)
389{ 398{
390 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
400 spinlock_t *hlist_lock;
401
402 *head = &kretprobe_inst_table[hash];
403 hlist_lock = kretprobe_table_lock_ptr(hash);
404 spin_lock_irqsave(hlist_lock, *flags);
405}
406
407void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags);
411}
412
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
414{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock;
417
418 hlist_lock = kretprobe_table_lock_ptr(hash);
419 spin_unlock_irqrestore(hlist_lock, *flags);
420}
421
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags);
391} 426}
392 427
393/* 428/*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
401 struct kretprobe_instance *ri; 436 struct kretprobe_instance *ri;
402 struct hlist_head *head, empty_rp; 437 struct hlist_head *head, empty_rp;
403 struct hlist_node *node, *tmp; 438 struct hlist_node *node, *tmp;
404 unsigned long flags = 0; 439 unsigned long hash, flags = 0;
405 440
406 INIT_HLIST_HEAD(&empty_rp); 441 if (unlikely(!kprobes_initialized))
407 spin_lock_irqsave(&kretprobe_lock, flags); 442 /* Early boot. kretprobe_table_locks not yet initialized. */
408 head = kretprobe_inst_table_head(tk); 443 return;
444
445 hash = hash_ptr(tk, KPROBE_HASH_BITS);
446 head = &kretprobe_inst_table[hash];
447 kretprobe_table_lock(hash, &flags);
409 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 448 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
410 if (ri->task == tk) 449 if (ri->task == tk)
411 recycle_rp_inst(ri, &empty_rp); 450 recycle_rp_inst(ri, &empty_rp);
412 } 451 }
413 spin_unlock_irqrestore(&kretprobe_lock, flags); 452 kretprobe_table_unlock(hash, &flags);
414 453 INIT_HLIST_HEAD(&empty_rp);
415 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 454 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
416 hlist_del(&ri->hlist); 455 hlist_del(&ri->hlist);
417 kfree(ri); 456 kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
423 struct kretprobe_instance *ri; 462 struct kretprobe_instance *ri;
424 struct hlist_node *pos, *next; 463 struct hlist_node *pos, *next;
425 464
426 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { 465 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
427 hlist_del(&ri->uflist); 466 hlist_del(&ri->hlist);
428 kfree(ri); 467 kfree(ri);
429 } 468 }
430} 469}
431 470
432static void __kprobes cleanup_rp_inst(struct kretprobe *rp) 471static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
433{ 472{
434 unsigned long flags; 473 unsigned long flags, hash;
435 struct kretprobe_instance *ri; 474 struct kretprobe_instance *ri;
436 struct hlist_node *pos, *next; 475 struct hlist_node *pos, *next;
476 struct hlist_head *head;
477
437 /* No race here */ 478 /* No race here */
438 spin_lock_irqsave(&kretprobe_lock, flags); 479 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
439 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { 480 kretprobe_table_lock(hash, &flags);
440 ri->rp = NULL; 481 head = &kretprobe_inst_table[hash];
441 hlist_del(&ri->uflist); 482 hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
483 if (ri->rp == rp)
484 ri->rp = NULL;
485 }
486 kretprobe_table_unlock(hash, &flags);
442 } 487 }
443 spin_unlock_irqrestore(&kretprobe_lock, flags);
444 free_rp_inst(rp); 488 free_rp_inst(rp);
445} 489}
446 490
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
831 struct pt_regs *regs) 875 struct pt_regs *regs)
832{ 876{
833 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 877 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
834 unsigned long flags = 0; 878 unsigned long hash, flags = 0;
879 struct kretprobe_instance *ri;
835 880
836 /*TODO: consider to only swap the RA after the last pre_handler fired */ 881 /*TODO: consider to only swap the RA after the last pre_handler fired */
837 spin_lock_irqsave(&kretprobe_lock, flags); 882 hash = hash_ptr(current, KPROBE_HASH_BITS);
883 spin_lock_irqsave(&rp->lock, flags);
838 if (!hlist_empty(&rp->free_instances)) { 884 if (!hlist_empty(&rp->free_instances)) {
839 struct kretprobe_instance *ri;
840
841 ri = hlist_entry(rp->free_instances.first, 885 ri = hlist_entry(rp->free_instances.first,
842 struct kretprobe_instance, uflist); 886 struct kretprobe_instance, hlist);
887 hlist_del(&ri->hlist);
888 spin_unlock_irqrestore(&rp->lock, flags);
889
843 ri->rp = rp; 890 ri->rp = rp;
844 ri->task = current; 891 ri->task = current;
845 892
846 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 893 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
847 spin_unlock_irqrestore(&kretprobe_lock, flags); 894 spin_unlock_irqrestore(&rp->lock, flags);
848 return 0; 895 return 0;
849 } 896 }
850 897
851 arch_prepare_kretprobe(ri, regs); 898 arch_prepare_kretprobe(ri, regs);
852 899
853 /* XXX(hch): why is there no hlist_move_head? */ 900 /* XXX(hch): why is there no hlist_move_head? */
854 hlist_del(&ri->uflist); 901 INIT_HLIST_NODE(&ri->hlist);
855 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 902 kretprobe_table_lock(hash, &flags);
856 hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); 903 hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
857 } else 904 kretprobe_table_unlock(hash, &flags);
905 } else {
858 rp->nmissed++; 906 rp->nmissed++;
859 spin_unlock_irqrestore(&kretprobe_lock, flags); 907 spin_unlock_irqrestore(&rp->lock, flags);
908 }
860 return 0; 909 return 0;
861} 910}
862 911
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
892 rp->maxactive = NR_CPUS; 941 rp->maxactive = NR_CPUS;
893#endif 942#endif
894 } 943 }
895 INIT_HLIST_HEAD(&rp->used_instances); 944 spin_lock_init(&rp->lock);
896 INIT_HLIST_HEAD(&rp->free_instances); 945 INIT_HLIST_HEAD(&rp->free_instances);
897 for (i = 0; i < rp->maxactive; i++) { 946 for (i = 0; i < rp->maxactive; i++) {
898 inst = kmalloc(sizeof(struct kretprobe_instance) + 947 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
901 free_rp_inst(rp); 950 free_rp_inst(rp);
902 return -ENOMEM; 951 return -ENOMEM;
903 } 952 }
904 INIT_HLIST_NODE(&inst->uflist); 953 INIT_HLIST_NODE(&inst->hlist);
905 hlist_add_head(&inst->uflist, &rp->free_instances); 954 hlist_add_head(&inst->hlist, &rp->free_instances);
906 } 955 }
907 956
908 rp->nmissed = 0; 957 rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
1009 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1058 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1010 INIT_HLIST_HEAD(&kprobe_table[i]); 1059 INIT_HLIST_HEAD(&kprobe_table[i]);
1011 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1060 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1061 spin_lock_init(&(kretprobe_table_locks[i].lock));
1012 } 1062 }
1013 1063
1014 /* 1064 /*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
1050 err = arch_init_kprobes(); 1100 err = arch_init_kprobes();
1051 if (!err) 1101 if (!err)
1052 err = register_die_notifier(&kprobe_exceptions_nb); 1102 err = register_die_notifier(&kprobe_exceptions_nb);
1103 kprobes_initialized = (err == 0);
1053 1104
1054 if (!err) 1105 if (!err)
1055 init_test_probes(); 1106 init_test_probes();
@@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
1286EXPORT_SYMBOL_GPL(unregister_jprobe); 1337EXPORT_SYMBOL_GPL(unregister_jprobe);
1287EXPORT_SYMBOL_GPL(register_jprobes); 1338EXPORT_SYMBOL_GPL(register_jprobes);
1288EXPORT_SYMBOL_GPL(unregister_jprobes); 1339EXPORT_SYMBOL_GPL(unregister_jprobes);
1289#ifdef CONFIG_KPROBES
1290EXPORT_SYMBOL_GPL(jprobe_return); 1340EXPORT_SYMBOL_GPL(jprobe_return);
1291#endif
1292
1293#ifdef CONFIG_KPROBES
1294EXPORT_SYMBOL_GPL(register_kretprobe); 1341EXPORT_SYMBOL_GPL(register_kretprobe);
1295EXPORT_SYMBOL_GPL(unregister_kretprobe); 1342EXPORT_SYMBOL_GPL(unregister_kretprobe);
1296EXPORT_SYMBOL_GPL(register_kretprobes); 1343EXPORT_SYMBOL_GPL(register_kretprobes);
1297EXPORT_SYMBOL_GPL(unregister_kretprobes); 1344EXPORT_SYMBOL_GPL(unregister_kretprobes);
1298#endif
diff --git a/kernel/marker.c b/kernel/marker.c
index 1abfb923b761..971da5317903 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -441,7 +441,7 @@ static int remove_marker(const char *name)
441 hlist_del(&e->hlist); 441 hlist_del(&e->hlist);
442 /* Make sure the call_rcu has been executed */ 442 /* Make sure the call_rcu has been executed */
443 if (e->rcu_pending) 443 if (e->rcu_pending)
444 rcu_barrier(); 444 rcu_barrier_sched();
445 kfree(e); 445 kfree(e);
446 return 0; 446 return 0;
447} 447}
@@ -476,7 +476,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
476 hlist_del(&(*entry)->hlist); 476 hlist_del(&(*entry)->hlist);
477 /* Make sure the call_rcu has been executed */ 477 /* Make sure the call_rcu has been executed */
478 if ((*entry)->rcu_pending) 478 if ((*entry)->rcu_pending)
479 rcu_barrier(); 479 rcu_barrier_sched();
480 kfree(*entry); 480 kfree(*entry);
481 *entry = e; 481 *entry = e;
482 trace_mark(core_marker_format, "name %s format %s", 482 trace_mark(core_marker_format, "name %s format %s",
@@ -655,7 +655,7 @@ int marker_probe_register(const char *name, const char *format,
655 * make sure it's executed now. 655 * make sure it's executed now.
656 */ 656 */
657 if (entry->rcu_pending) 657 if (entry->rcu_pending)
658 rcu_barrier(); 658 rcu_barrier_sched();
659 old = marker_entry_add_probe(entry, probe, probe_private); 659 old = marker_entry_add_probe(entry, probe, probe_private);
660 if (IS_ERR(old)) { 660 if (IS_ERR(old)) {
661 ret = PTR_ERR(old); 661 ret = PTR_ERR(old);
@@ -670,10 +670,7 @@ int marker_probe_register(const char *name, const char *format,
670 entry->rcu_pending = 1; 670 entry->rcu_pending = 1;
671 /* write rcu_pending before calling the RCU callback */ 671 /* write rcu_pending before calling the RCU callback */
672 smp_wmb(); 672 smp_wmb();
673#ifdef CONFIG_PREEMPT_RCU 673 call_rcu_sched(&entry->rcu, free_old_closure);
674 synchronize_sched(); /* Until we have the call_rcu_sched() */
675#endif
676 call_rcu(&entry->rcu, free_old_closure);
677end: 674end:
678 mutex_unlock(&markers_mutex); 675 mutex_unlock(&markers_mutex);
679 return ret; 676 return ret;
@@ -704,7 +701,7 @@ int marker_probe_unregister(const char *name,
704 if (!entry) 701 if (!entry)
705 goto end; 702 goto end;
706 if (entry->rcu_pending) 703 if (entry->rcu_pending)
707 rcu_barrier(); 704 rcu_barrier_sched();
708 old = marker_entry_remove_probe(entry, probe, probe_private); 705 old = marker_entry_remove_probe(entry, probe, probe_private);
709 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
710 marker_update_probes(); /* may update entry */ 707 marker_update_probes(); /* may update entry */
@@ -716,10 +713,7 @@ int marker_probe_unregister(const char *name,
716 entry->rcu_pending = 1; 713 entry->rcu_pending = 1;
717 /* write rcu_pending before calling the RCU callback */ 714 /* write rcu_pending before calling the RCU callback */
718 smp_wmb(); 715 smp_wmb();
719#ifdef CONFIG_PREEMPT_RCU 716 call_rcu_sched(&entry->rcu, free_old_closure);
720 synchronize_sched(); /* Until we have the call_rcu_sched() */
721#endif
722 call_rcu(&entry->rcu, free_old_closure);
723 remove_marker(name); /* Ignore busy error message */ 717 remove_marker(name); /* Ignore busy error message */
724 ret = 0; 718 ret = 0;
725end: 719end:
@@ -786,7 +780,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
786 goto end; 780 goto end;
787 } 781 }
788 if (entry->rcu_pending) 782 if (entry->rcu_pending)
789 rcu_barrier(); 783 rcu_barrier_sched();
790 old = marker_entry_remove_probe(entry, NULL, probe_private); 784 old = marker_entry_remove_probe(entry, NULL, probe_private);
791 mutex_unlock(&markers_mutex); 785 mutex_unlock(&markers_mutex);
792 marker_update_probes(); /* may update entry */ 786 marker_update_probes(); /* may update entry */
@@ -797,10 +791,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
797 entry->rcu_pending = 1; 791 entry->rcu_pending = 1;
798 /* write rcu_pending before calling the RCU callback */ 792 /* write rcu_pending before calling the RCU callback */
799 smp_wmb(); 793 smp_wmb();
800#ifdef CONFIG_PREEMPT_RCU 794 call_rcu_sched(&entry->rcu, free_old_closure);
801 synchronize_sched(); /* Until we have the call_rcu_sched() */
802#endif
803 call_rcu(&entry->rcu, free_old_closure);
804 remove_marker(entry->name); /* Ignore busy error message */ 795 remove_marker(entry->name); /* Ignore busy error message */
805end: 796end:
806 mutex_unlock(&markers_mutex); 797 mutex_unlock(&markers_mutex);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a4..43c2111cd54d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/proc_fs.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/nsproxy.h> 12#include <linux/nsproxy.h>
12 13
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
24 struct ns_cgroup, css); 25 struct ns_cgroup, css);
25} 26}
26 27
27int ns_cgroup_clone(struct task_struct *task) 28int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{ 29{
29 return cgroup_clone(task, &ns_subsys); 30 char name[PROC_NUMBUF];
31
32 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
33 return cgroup_clone(task, &ns_subsys, name);
30} 34}
31 35
32/* 36/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1c..21575fc46d05 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
157 goto out; 157 goto out;
158 } 158 }
159 159
160 err = ns_cgroup_clone(tsk);
161 if (err) {
162 put_nsproxy(new_ns);
163 goto out;
164 }
165
166 tsk->nsproxy = new_ns; 160 tsk->nsproxy = new_ns;
167 161
168out: 162out:
@@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
209 goto out; 203 goto out;
210 } 204 }
211 205
212 err = ns_cgroup_clone(current); 206 err = ns_cgroup_clone(current, task_pid(current));
213 if (err) 207 if (err)
214 put_nsproxy(*new_nsp); 208 put_nsproxy(*new_nsp);
215 209
diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..12c5a0a6c89b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line)
318 add_taint(TAINT_WARN); 318 add_taint(TAINT_WARN);
319} 319}
320EXPORT_SYMBOL(warn_on_slowpath); 320EXPORT_SYMBOL(warn_on_slowpath);
321
322
323void warn_slowpath(const char *file, int line, const char *fmt, ...)
324{
325 va_list args;
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long)__builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 va_start(args, fmt);
334 vprintk(fmt, args);
335 va_end(args);
336
337 print_modules();
338 dump_stack();
339 print_oops_end_marker();
340 add_taint(TAINT_WARN);
341}
342EXPORT_SYMBOL(warn_slowpath);
321#endif 343#endif
322 344
323#ifdef CONFIG_CC_STACKPROTECTOR 345#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 30bd5d4b2ac7..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,12 +309,6 @@ struct pid *find_vpid(int nr)
309} 309}
310EXPORT_SYMBOL_GPL(find_vpid); 310EXPORT_SYMBOL_GPL(find_vpid);
311 311
312struct pid *find_pid(int nr)
313{
314 return find_pid_ns(nr, &init_pid_ns);
315}
316EXPORT_SYMBOL_GPL(find_pid);
317
318/* 312/*
319 * attach_pid() must be called with the tasklist_lock write-held. 313 * attach_pid() must be called with the tasklist_lock write-held.
320 */ 314 */
@@ -435,6 +429,7 @@ struct pid *find_get_pid(pid_t nr)
435 429
436 return pid; 430 return pid;
437} 431}
432EXPORT_SYMBOL_GPL(find_get_pid);
438 433
439pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) 434pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
440{ 435{
@@ -482,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
482/* 477/*
483 * Used by proc to find the first pid that is greater then or equal to nr. 478 * Used by proc to find the first pid that is greater then or equal to nr.
484 * 479 *
485 * If there is a pid at nr this function is exactly the same as find_pid. 480 * If there is a pid at nr this function is exactly the same as find_pid_ns.
486 */ 481 */
487struct pid *find_ge_pid(int nr, struct pid_namespace *ns) 482struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
488{ 483{
@@ -497,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
497 492
498 return pid; 493 return pid;
499} 494}
500EXPORT_SYMBOL_GPL(find_get_pid);
501 495
502/* 496/*
503 * The pid hash table is scaled according to the amount of memory in the 497 * The pid hash table is scaled according to the amount of memory in the
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b8851..ea567b78d1aa 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h>
15 16
16#define BITS_PER_PAGE (PAGE_SIZE*8) 17#define BITS_PER_PAGE (PAGE_SIZE*8)
17 18
@@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
71 struct pid_namespace *ns; 72 struct pid_namespace *ns;
72 int i; 73 int i;
73 74
74 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); 75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
75 if (ns == NULL) 76 if (ns == NULL)
76 goto out; 77 goto out;
77 78
@@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
84 goto out_free_map; 85 goto out_free_map;
85 86
86 kref_init(&ns->kref); 87 kref_init(&ns->kref);
87 ns->last_pid = 0;
88 ns->child_reaper = NULL;
89 ns->level = level; 88 ns->level = level;
90 89
91 set_bit(0, ns->pidmap[0].page); 90 set_bit(0, ns->pidmap[0].page);
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 92
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 93 for (i = 1; i < PIDMAP_ENTRIES; i++)
95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 94 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 }
98 95
99 return ns; 96 return ns;
100 97
@@ -185,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
185 182
186 /* Child reaper for the pid namespace is going away */ 183 /* Child reaper for the pid namespace is going away */
187 pid_ns->child_reaper = NULL; 184 pid_ns->child_reaper = NULL;
185 acct_exit_ns(pid_ns);
188 return; 186 return;
189} 187}
190 188
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..9a21681aa80f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
449 spin_unlock_irqrestore(&idr_lock, flags); 449 spin_unlock_irqrestore(&idr_lock, flags);
450 } 450 }
451 sigqueue_free(tmr->sigq); 451 sigqueue_free(tmr->sigq);
452 if (unlikely(tmr->it_process) &&
453 tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
454 put_task_struct(tmr->it_process);
455 kmem_cache_free(posix_timers_cache, tmr); 452 kmem_cache_free(posix_timers_cache, tmr);
456} 453}
457 454
@@ -856,11 +853,10 @@ retry_delete:
856 * This keeps any tasks waiting on the spin lock from thinking 853 * This keeps any tasks waiting on the spin lock from thinking
857 * they got something (see the lock code above). 854 * they got something (see the lock code above).
858 */ 855 */
859 if (timer->it_process) { 856 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
860 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 857 put_task_struct(timer->it_process);
861 put_task_struct(timer->it_process); 858 timer->it_process = NULL;
862 timer->it_process = NULL; 859
863 }
864 unlock_timer(timer, flags); 860 unlock_timer(timer, flags);
865 release_posix_timer(timer, IT_ID_SET); 861 release_posix_timer(timer, IT_ID_SET);
866 return 0; 862 return 0;
@@ -885,11 +881,10 @@ retry_delete:
885 * This keeps any tasks waiting on the spin lock from thinking 881 * This keeps any tasks waiting on the spin lock from thinking
886 * they got something (see the lock code above). 882 * they got something (see the lock code above).
887 */ 883 */
888 if (timer->it_process) { 884 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
889 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 885 put_task_struct(timer->it_process);
890 put_task_struct(timer->it_process); 886 timer->it_process = NULL;
891 timer->it_process = NULL; 887
892 }
893 unlock_timer(timer, flags); 888 unlock_timer(timer, flags);
894 release_posix_timer(timer, IT_ID_SET); 889 release_posix_timer(timer, IT_ID_SET);
895} 890}
diff --git a/kernel/printk.c b/kernel/printk.c
index 3f7a2a94583b..a7f7559c5f6c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1308,6 +1308,8 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1308} 1308}
1309 1309
1310#if defined CONFIG_PRINTK 1310#if defined CONFIG_PRINTK
1311
1312DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1311/* 1313/*
1312 * printk rate limiting, lifted from the networking subsystem. 1314 * printk rate limiting, lifted from the networking subsystem.
1313 * 1315 *
@@ -1315,22 +1317,9 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1315 * every printk_ratelimit_jiffies to make a denial-of-service 1317 * every printk_ratelimit_jiffies to make a denial-of-service
1316 * attack impossible. 1318 * attack impossible.
1317 */ 1319 */
1318int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1319{
1320 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1321}
1322EXPORT_SYMBOL(__printk_ratelimit);
1323
1324/* minimum time in jiffies between messages */
1325int printk_ratelimit_jiffies = 5 * HZ;
1326
1327/* number of messages we send before ratelimiting */
1328int printk_ratelimit_burst = 10;
1329
1330int printk_ratelimit(void) 1320int printk_ratelimit(void)
1331{ 1321{
1332 return __printk_ratelimit(printk_ratelimit_jiffies, 1322 return __ratelimit(&printk_ratelimit_state);
1333 printk_ratelimit_burst);
1334} 1323}
1335EXPORT_SYMBOL(printk_ratelimit); 1324EXPORT_SYMBOL(printk_ratelimit);
1336 1325
diff --git a/kernel/profile.c b/kernel/profile.c
index 58926411eb2a..cd26bed4cc26 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,8 +112,6 @@ void __init profile_init(void)
112 112
113/* Profile event notifications */ 113/* Profile event notifications */
114 114
115#ifdef CONFIG_PROFILING
116
117static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 115static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
118static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 116static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
119static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 117static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
203} 201}
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 202EXPORT_SYMBOL_GPL(unregister_timer_hook);
205 203
206#endif /* CONFIG_PROFILING */
207
208 204
209#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
210/* 206/*
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef2..f275c8eca772 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/res_counter.h> 14#include <linux/res_counter.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h>
16 17
17void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter)
18{ 19{
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
102 return *res_counter_member(counter, member); 103 return *res_counter_member(counter, member);
103} 104}
104 105
105ssize_t res_counter_write(struct res_counter *counter, int member, 106int res_counter_memparse_write_strategy(const char *buf,
106 const char __user *userbuf, size_t nbytes, loff_t *pos, 107 unsigned long long *res)
107 int (*write_strategy)(char *st_buf, unsigned long long *val))
108{ 108{
109 int ret; 109 char *end;
110 char *buf, *end; 110 /* FIXME - make memparse() take const char* args */
111 unsigned long flags; 111 *res = memparse((char *)buf, &end);
112 unsigned long long tmp, *val; 112 if (*end != '\0')
113 113 return -EINVAL;
114 buf = kmalloc(nbytes + 1, GFP_KERNEL);
115 ret = -ENOMEM;
116 if (buf == NULL)
117 goto out;
118 114
119 buf[nbytes] = '\0'; 115 *res = PAGE_ALIGN(*res);
120 ret = -EFAULT; 116 return 0;
121 if (copy_from_user(buf, userbuf, nbytes)) 117}
122 goto out_free;
123 118
124 ret = -EINVAL; 119int res_counter_write(struct res_counter *counter, int member,
120 const char *buf, write_strategy_fn write_strategy)
121{
122 char *end;
123 unsigned long flags;
124 unsigned long long tmp, *val;
125 125
126 strstrip(buf);
127 if (write_strategy) { 126 if (write_strategy) {
128 if (write_strategy(buf, &tmp)) { 127 if (write_strategy(buf, &tmp))
129 goto out_free; 128 return -EINVAL;
130 }
131 } else { 129 } else {
132 tmp = simple_strtoull(buf, &end, 10); 130 tmp = simple_strtoull(buf, &end, 10);
133 if (*end != '\0') 131 if (*end != '\0')
134 goto out_free; 132 return -EINVAL;
135 } 133 }
136 spin_lock_irqsave(&counter->lock, flags); 134 spin_lock_irqsave(&counter->lock, flags);
137 val = res_counter_member(counter, member); 135 val = res_counter_member(counter, member);
138 *val = tmp; 136 *val = tmp;
139 spin_unlock_irqrestore(&counter->lock, flags); 137 spin_unlock_irqrestore(&counter->lock, flags);
140 ret = nbytes; 138 return 0;
141out_free:
142 kfree(buf);
143out:
144 return ret;
145} 139}
diff --git a/kernel/sched.c b/kernel/sched.c
index 6acf749d3336..0047bd9b96aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4046 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4046 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4047 else 4047 else
4048 cpustat->user = cputime64_add(cpustat->user, tmp); 4048 cpustat->user = cputime64_add(cpustat->user, tmp);
4049 /* Account for user time used */
4050 acct_update_integrals(p);
4049} 4051}
4050 4052
4051/* 4053/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea7..82c3545596c5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -338,13 +338,9 @@ unblock_all_signals(void)
338 spin_unlock_irqrestore(&current->sighand->siglock, flags); 338 spin_unlock_irqrestore(&current->sighand->siglock, flags);
339} 339}
340 340
341static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) 341static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
342{ 342{
343 struct sigqueue *q, *first = NULL; 343 struct sigqueue *q, *first = NULL;
344 int still_pending = 0;
345
346 if (unlikely(!sigismember(&list->signal, sig)))
347 return 0;
348 344
349 /* 345 /*
350 * Collect the siginfo appropriate to this signal. Check if 346 * Collect the siginfo appropriate to this signal. Check if
@@ -352,33 +348,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
352 */ 348 */
353 list_for_each_entry(q, &list->list, list) { 349 list_for_each_entry(q, &list->list, list) {
354 if (q->info.si_signo == sig) { 350 if (q->info.si_signo == sig) {
355 if (first) { 351 if (first)
356 still_pending = 1; 352 goto still_pending;
357 break;
358 }
359 first = q; 353 first = q;
360 } 354 }
361 } 355 }
356
357 sigdelset(&list->signal, sig);
358
362 if (first) { 359 if (first) {
360still_pending:
363 list_del_init(&first->list); 361 list_del_init(&first->list);
364 copy_siginfo(info, &first->info); 362 copy_siginfo(info, &first->info);
365 __sigqueue_free(first); 363 __sigqueue_free(first);
366 if (!still_pending)
367 sigdelset(&list->signal, sig);
368 } else { 364 } else {
369
370 /* Ok, it wasn't in the queue. This must be 365 /* Ok, it wasn't in the queue. This must be
371 a fast-pathed signal or we must have been 366 a fast-pathed signal or we must have been
372 out of queue space. So zero out the info. 367 out of queue space. So zero out the info.
373 */ 368 */
374 sigdelset(&list->signal, sig);
375 info->si_signo = sig; 369 info->si_signo = sig;
376 info->si_errno = 0; 370 info->si_errno = 0;
377 info->si_code = 0; 371 info->si_code = 0;
378 info->si_pid = 0; 372 info->si_pid = 0;
379 info->si_uid = 0; 373 info->si_uid = 0;
380 } 374 }
381 return 1;
382} 375}
383 376
384static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 377static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -396,8 +389,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
396 } 389 }
397 } 390 }
398 391
399 if (!collect_signal(sig, pending, info)) 392 collect_signal(sig, pending, info);
400 sig = 0;
401 } 393 }
402 394
403 return sig; 395 return sig;
@@ -462,8 +454,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
462 * is to alert stop-signal processing code when another 454 * is to alert stop-signal processing code when another
463 * processor has come along and cleared the flag. 455 * processor has come along and cleared the flag.
464 */ 456 */
465 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 457 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
466 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
467 } 458 }
468 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 459 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
469 /* 460 /*
@@ -1125,7 +1116,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1125 * is probably wrong. Should make it like BSD or SYSV. 1116 * is probably wrong. Should make it like BSD or SYSV.
1126 */ 1117 */
1127 1118
1128static int kill_something_info(int sig, struct siginfo *info, int pid) 1119static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1129{ 1120{
1130 int ret; 1121 int ret;
1131 1122
@@ -1237,17 +1228,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
1237} 1228}
1238EXPORT_SYMBOL(kill_pid); 1229EXPORT_SYMBOL(kill_pid);
1239 1230
1240int
1241kill_proc(pid_t pid, int sig, int priv)
1242{
1243 int ret;
1244
1245 rcu_read_lock();
1246 ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
1247 rcu_read_unlock();
1248 return ret;
1249}
1250
1251/* 1231/*
1252 * These functions support sending signals using preallocated sigqueue 1232 * These functions support sending signals using preallocated sigqueue
1253 * structures. This is needed "because realtime applications cannot 1233 * structures. This is needed "because realtime applications cannot
@@ -1379,10 +1359,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1379 1359
1380 info.si_uid = tsk->uid; 1360 info.si_uid = tsk->uid;
1381 1361
1382 /* FIXME: find out whether or not this is supposed to be c*time. */ 1362 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1383 info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
1384 tsk->signal->utime)); 1363 tsk->signal->utime));
1385 info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, 1364 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1386 tsk->signal->stime)); 1365 tsk->signal->stime));
1387 1366
1388 info.si_status = tsk->exit_code & 0x7f; 1367 info.si_status = tsk->exit_code & 0x7f;
@@ -1450,9 +1429,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1450 1429
1451 info.si_uid = tsk->uid; 1430 info.si_uid = tsk->uid;
1452 1431
1453 /* FIXME: find out whether or not this is supposed to be c*time. */ 1432 info.si_utime = cputime_to_clock_t(tsk->utime);
1454 info.si_utime = cputime_to_jiffies(tsk->utime); 1433 info.si_stime = cputime_to_clock_t(tsk->stime);
1455 info.si_stime = cputime_to_jiffies(tsk->stime);
1456 1434
1457 info.si_code = why; 1435 info.si_code = why;
1458 switch (why) { 1436 switch (why) {
@@ -1491,10 +1469,10 @@ static inline int may_ptrace_stop(void)
1491 * is a deadlock situation, and pointless because our tracer 1469 * is a deadlock situation, and pointless because our tracer
1492 * is dead so don't allow us to stop. 1470 * is dead so don't allow us to stop.
1493 * If SIGKILL was already sent before the caller unlocked 1471 * If SIGKILL was already sent before the caller unlocked
1494 * ->siglock we must see ->core_waiters != 0. Otherwise it 1472 * ->siglock we must see ->core_state != NULL. Otherwise it
1495 * is safe to enter schedule(). 1473 * is safe to enter schedule().
1496 */ 1474 */
1497 if (unlikely(current->mm->core_waiters) && 1475 if (unlikely(current->mm->core_state) &&
1498 unlikely(current->mm == current->parent->mm)) 1476 unlikely(current->mm == current->parent->mm))
1499 return 0; 1477 return 0;
1500 1478
@@ -1507,9 +1485,8 @@ static inline int may_ptrace_stop(void)
1507 */ 1485 */
1508static int sigkill_pending(struct task_struct *tsk) 1486static int sigkill_pending(struct task_struct *tsk)
1509{ 1487{
1510 return ((sigismember(&tsk->pending.signal, SIGKILL) || 1488 return sigismember(&tsk->pending.signal, SIGKILL) ||
1511 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && 1489 sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
1512 !unlikely(sigismember(&tsk->blocked, SIGKILL)));
1513} 1490}
1514 1491
1515/* 1492/*
@@ -1525,8 +1502,6 @@ static int sigkill_pending(struct task_struct *tsk)
1525 */ 1502 */
1526static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1503static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1527{ 1504{
1528 int killed = 0;
1529
1530 if (arch_ptrace_stop_needed(exit_code, info)) { 1505 if (arch_ptrace_stop_needed(exit_code, info)) {
1531 /* 1506 /*
1532 * The arch code has something special to do before a 1507 * The arch code has something special to do before a
@@ -1542,7 +1517,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1542 spin_unlock_irq(&current->sighand->siglock); 1517 spin_unlock_irq(&current->sighand->siglock);
1543 arch_ptrace_stop(exit_code, info); 1518 arch_ptrace_stop(exit_code, info);
1544 spin_lock_irq(&current->sighand->siglock); 1519 spin_lock_irq(&current->sighand->siglock);
1545 killed = sigkill_pending(current); 1520 if (sigkill_pending(current))
1521 return;
1546 } 1522 }
1547 1523
1548 /* 1524 /*
@@ -1559,7 +1535,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1559 __set_current_state(TASK_TRACED); 1535 __set_current_state(TASK_TRACED);
1560 spin_unlock_irq(&current->sighand->siglock); 1536 spin_unlock_irq(&current->sighand->siglock);
1561 read_lock(&tasklist_lock); 1537 read_lock(&tasklist_lock);
1562 if (!unlikely(killed) && may_ptrace_stop()) { 1538 if (may_ptrace_stop()) {
1563 do_notify_parent_cldstop(current, CLD_TRAPPED); 1539 do_notify_parent_cldstop(current, CLD_TRAPPED);
1564 read_unlock(&tasklist_lock); 1540 read_unlock(&tasklist_lock);
1565 schedule(); 1541 schedule();
@@ -1658,8 +1634,7 @@ static int do_signal_stop(int signr)
1658 } else { 1634 } else {
1659 struct task_struct *t; 1635 struct task_struct *t;
1660 1636
1661 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) 1637 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
1662 != SIGNAL_STOP_DEQUEUED) ||
1663 unlikely(signal_group_exit(sig))) 1638 unlikely(signal_group_exit(sig)))
1664 return 0; 1639 return 0;
1665 /* 1640 /*
@@ -1920,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending);
1920EXPORT_SYMBOL_GPL(dequeue_signal); 1895EXPORT_SYMBOL_GPL(dequeue_signal);
1921EXPORT_SYMBOL(flush_signals); 1896EXPORT_SYMBOL(flush_signals);
1922EXPORT_SYMBOL(force_sig); 1897EXPORT_SYMBOL(force_sig);
1923EXPORT_SYMBOL(kill_proc);
1924EXPORT_SYMBOL(ptrace_notify); 1898EXPORT_SYMBOL(ptrace_notify);
1925EXPORT_SYMBOL(send_sig); 1899EXPORT_SYMBOL(send_sig);
1926EXPORT_SYMBOL(send_sig_info); 1900EXPORT_SYMBOL(send_sig_info);
@@ -2196,7 +2170,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2196} 2170}
2197 2171
2198asmlinkage long 2172asmlinkage long
2199sys_kill(int pid, int sig) 2173sys_kill(pid_t pid, int sig)
2200{ 2174{
2201 struct siginfo info; 2175 struct siginfo info;
2202 2176
@@ -2209,7 +2183,7 @@ sys_kill(int pid, int sig)
2209 return kill_something_info(sig, &info, pid); 2183 return kill_something_info(sig, &info, pid);
2210} 2184}
2211 2185
2212static int do_tkill(int tgid, int pid, int sig) 2186static int do_tkill(pid_t tgid, pid_t pid, int sig)
2213{ 2187{
2214 int error; 2188 int error;
2215 struct siginfo info; 2189 struct siginfo info;
@@ -2255,7 +2229,7 @@ static int do_tkill(int tgid, int pid, int sig)
2255 * exists but it's not belonging to the target process anymore. This 2229 * exists but it's not belonging to the target process anymore. This
2256 * method solves the problem of threads exiting and PIDs getting reused. 2230 * method solves the problem of threads exiting and PIDs getting reused.
2257 */ 2231 */
2258asmlinkage long sys_tgkill(int tgid, int pid, int sig) 2232asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
2259{ 2233{
2260 /* This is only valid for single tasks */ 2234 /* This is only valid for single tasks */
2261 if (pid <= 0 || tgid <= 0) 2235 if (pid <= 0 || tgid <= 0)
@@ -2268,7 +2242,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2268 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2242 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2269 */ 2243 */
2270asmlinkage long 2244asmlinkage long
2271sys_tkill(int pid, int sig) 2245sys_tkill(pid_t pid, int sig)
2272{ 2246{
2273 /* This is only valid for single tasks */ 2247 /* This is only valid for single tasks */
2274 if (pid <= 0) 2248 if (pid <= 0)
@@ -2278,7 +2252,7 @@ sys_tkill(int pid, int sig)
2278} 2252}
2279 2253
2280asmlinkage long 2254asmlinkage long
2281sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) 2255sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
2282{ 2256{
2283 siginfo_t info; 2257 siginfo_t info;
2284 2258
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6c..0c9d3fa1f5ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1343,8 +1343,6 @@ EXPORT_SYMBOL(in_egroup_p);
1343 1343
1344DECLARE_RWSEM(uts_sem); 1344DECLARE_RWSEM(uts_sem);
1345 1345
1346EXPORT_SYMBOL(uts_sem);
1347
1348asmlinkage long sys_newuname(struct new_utsname __user * name) 1346asmlinkage long sys_newuname(struct new_utsname __user * name)
1349{ 1347{
1350 int errno = 0; 1348 int errno = 0;
@@ -1795,7 +1793,7 @@ int orderly_poweroff(bool force)
1795 goto out; 1793 goto out;
1796 } 1794 }
1797 1795
1798 info = call_usermodehelper_setup(argv[0], argv, envp); 1796 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
1799 if (info == NULL) { 1797 if (info == NULL) {
1800 argv_free(argv); 1798 argv_free(argv);
1801 goto out; 1799 goto out;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bd66ac5406f3..08d6e1bb99ac 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -57,6 +57,7 @@ cond_syscall(compat_sys_set_robust_list);
57cond_syscall(sys_get_robust_list); 57cond_syscall(sys_get_robust_list);
58cond_syscall(compat_sys_get_robust_list); 58cond_syscall(compat_sys_get_robust_list);
59cond_syscall(sys_epoll_create); 59cond_syscall(sys_epoll_create);
60cond_syscall(sys_epoll_create1);
60cond_syscall(sys_epoll_ctl); 61cond_syscall(sys_epoll_ctl);
61cond_syscall(sys_epoll_wait); 62cond_syscall(sys_epoll_wait);
62cond_syscall(sys_epoll_pwait); 63cond_syscall(sys_epoll_pwait);
@@ -159,6 +160,7 @@ cond_syscall(sys_ioprio_get);
159cond_syscall(sys_signalfd); 160cond_syscall(sys_signalfd);
160cond_syscall(sys_signalfd4); 161cond_syscall(sys_signalfd4);
161cond_syscall(compat_sys_signalfd); 162cond_syscall(compat_sys_signalfd);
163cond_syscall(compat_sys_signalfd4);
162cond_syscall(sys_timerfd_create); 164cond_syscall(sys_timerfd_create);
163cond_syscall(sys_timerfd_settime); 165cond_syscall(sys_timerfd_settime);
164cond_syscall(sys_timerfd_gettime); 166cond_syscall(sys_timerfd_gettime);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1a8299d1fe59..35a50db9b6ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -624,7 +624,7 @@ static struct ctl_table kern_table[] = {
624 { 624 {
625 .ctl_name = KERN_PRINTK_RATELIMIT, 625 .ctl_name = KERN_PRINTK_RATELIMIT,
626 .procname = "printk_ratelimit", 626 .procname = "printk_ratelimit",
627 .data = &printk_ratelimit_jiffies, 627 .data = &printk_ratelimit_state.interval,
628 .maxlen = sizeof(int), 628 .maxlen = sizeof(int),
629 .mode = 0644, 629 .mode = 0644,
630 .proc_handler = &proc_dointvec_jiffies, 630 .proc_handler = &proc_dointvec_jiffies,
@@ -633,7 +633,7 @@ static struct ctl_table kern_table[] = {
633 { 633 {
634 .ctl_name = KERN_PRINTK_RATELIMIT_BURST, 634 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
635 .procname = "printk_ratelimit_burst", 635 .procname = "printk_ratelimit_burst",
636 .data = &printk_ratelimit_burst, 636 .data = &printk_ratelimit_state.burst,
637 .maxlen = sizeof(int), 637 .maxlen = sizeof(int),
638 .mode = 0644, 638 .mode = 0644,
639 .proc_handler = &proc_dointvec, 639 .proc_handler = &proc_dointvec,
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f2..c35da23ab8fb 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1532 sysctl_check_leaf(namespaces, table, &fail); 1532 sysctl_check_leaf(namespaces, table, &fail);
1533 } 1533 }
1534 sysctl_check_bin_path(table, &fail); 1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode");
1535 if (fail) { 1537 if (fail) {
1536 set_fail(&fail, table, NULL); 1538 set_fail(&fail, table, NULL);
1537 error = -EINVAL; 1539 error = -EINVAL;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 06b17547f4e7..bd6be76303cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
35 */ 35 */
36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
37 37
38static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 38static DEFINE_PER_CPU(__u32, taskstats_seqnum);
39static int family_registered; 39static int family_registered;
40struct kmem_cache *taskstats_cache; 40struct kmem_cache *taskstats_cache;
41 41
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 63528086337c..ce2d723c10e1 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
161 __trace_special(tr, data, 2, regs->ip, 0); 161 __trace_special(tr, data, 2, regs->ip, 0);
162 162
163 while (i < sample_max_depth) { 163 while (i < sample_max_depth) {
164 frame.next_fp = 0; 164 frame.next_fp = NULL;
165 frame.return_address = 0; 165 frame.return_address = 0;
166 if (!copy_stack_frame(fp, &frame)) 166 if (!copy_stack_frame(fp, &frame))
167 break; 167 break;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..3da47ccdc5e5 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
29{ 29{
30 struct timespec uptime, ts; 30 struct timespec uptime, ts;
31 s64 ac_etime; 31 u64 ac_etime;
32 32
33 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 33 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
34 34
35 /* calculate task elapsed time in timespec */ 35 /* calculate task elapsed time in timespec */
36 do_posix_clock_monotonic_gettime(&uptime); 36 do_posix_clock_monotonic_gettime(&uptime);
37 ts = timespec_sub(uptime, tsk->start_time); 37 ts = timespec_sub(uptime, tsk->start_time);
38 /* rebase elapsed time to usec */ 38 /* rebase elapsed time to usec (should never be negative) */
39 ac_etime = timespec_to_ns(&ts); 39 ac_etime = timespec_to_ns(&ts);
40 do_div(ac_etime, NSEC_PER_USEC); 40 do_div(ac_etime, NSEC_PER_USEC);
41 stats->ac_etime = ac_etime; 41 stats->ac_etime = ac_etime;
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
84{ 84{
85 struct mm_struct *mm; 85 struct mm_struct *mm;
86 86
87 /* convert pages-jiffies to Mbyte-usec */ 87 /* convert pages-usec to Mbyte-usec */
88 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; 88 stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
89 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; 89 stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
90 mm = get_task_mm(p); 90 mm = get_task_mm(p);
91 if (mm) { 91 if (mm) {
92 /* adjust to KB unit */ 92 /* adjust to KB unit */
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
118void acct_update_integrals(struct task_struct *tsk) 118void acct_update_integrals(struct task_struct *tsk)
119{ 119{
120 if (likely(tsk->mm)) { 120 if (likely(tsk->mm)) {
121 long delta = cputime_to_jiffies( 121 cputime_t time, dtime;
122 cputime_sub(tsk->stime, tsk->acct_stimexpd)); 122 struct timeval value;
123 u64 delta;
124
125 time = tsk->stime + tsk->utime;
126 dtime = cputime_sub(time, tsk->acct_timexpd);
127 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
128 delta = value.tv_sec;
129 delta = delta * USEC_PER_SEC + value.tv_usec;
123 130
124 if (delta == 0) 131 if (delta == 0)
125 return; 132 return;
126 tsk->acct_stimexpd = tsk->stime; 133 tsk->acct_timexpd = time;
127 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 134 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
128 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 135 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
129 } 136 }
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
135 */ 142 */
136void acct_clear_integrals(struct task_struct *tsk) 143void acct_clear_integrals(struct task_struct *tsk)
137{ 144{
138 tsk->acct_stimexpd = 0; 145 tsk->acct_timexpd = 0;
139 tsk->acct_rss_mem1 = 0; 146 tsk->acct_rss_mem1 = 0;
140 tsk->acct_vm_mem1 = 0; 147 tsk->acct_vm_mem1 = 0;
141} 148}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6fd158b21026..ec7e4f62aaff 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
125} 125}
126 126
127static void insert_work(struct cpu_workqueue_struct *cwq, 127static void insert_work(struct cpu_workqueue_struct *cwq,
128 struct work_struct *work, int tail) 128 struct work_struct *work, struct list_head *head)
129{ 129{
130 set_wq_data(work, cwq); 130 set_wq_data(work, cwq);
131 /* 131 /*
@@ -133,10 +133,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
133 * result of list_add() below, see try_to_grab_pending(). 133 * result of list_add() below, see try_to_grab_pending().
134 */ 134 */
135 smp_wmb(); 135 smp_wmb();
136 if (tail) 136 list_add_tail(&work->entry, head);
137 list_add_tail(&work->entry, &cwq->worklist);
138 else
139 list_add(&work->entry, &cwq->worklist);
140 wake_up(&cwq->more_work); 137 wake_up(&cwq->more_work);
141} 138}
142 139
@@ -146,7 +143,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
146 unsigned long flags; 143 unsigned long flags;
147 144
148 spin_lock_irqsave(&cwq->lock, flags); 145 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, 1); 146 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 147 spin_unlock_irqrestore(&cwq->lock, flags);
151} 148}
152 149
@@ -162,14 +159,11 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
162 */ 159 */
163int queue_work(struct workqueue_struct *wq, struct work_struct *work) 160int queue_work(struct workqueue_struct *wq, struct work_struct *work)
164{ 161{
165 int ret = 0; 162 int ret;
163
164 ret = queue_work_on(get_cpu(), wq, work);
165 put_cpu();
166 166
167 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
168 BUG_ON(!list_empty(&work->entry));
169 __queue_work(wq_per_cpu(wq, get_cpu()), work);
170 put_cpu();
171 ret = 1;
172 }
173 return ret; 167 return ret;
174} 168}
175EXPORT_SYMBOL_GPL(queue_work); 169EXPORT_SYMBOL_GPL(queue_work);
@@ -361,14 +355,14 @@ static void wq_barrier_func(struct work_struct *work)
361} 355}
362 356
363static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 357static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
364 struct wq_barrier *barr, int tail) 358 struct wq_barrier *barr, struct list_head *head)
365{ 359{
366 INIT_WORK(&barr->work, wq_barrier_func); 360 INIT_WORK(&barr->work, wq_barrier_func);
367 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 361 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
368 362
369 init_completion(&barr->done); 363 init_completion(&barr->done);
370 364
371 insert_work(cwq, &barr->work, tail); 365 insert_work(cwq, &barr->work, head);
372} 366}
373 367
374static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 368static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -388,7 +382,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
388 active = 0; 382 active = 0;
389 spin_lock_irq(&cwq->lock); 383 spin_lock_irq(&cwq->lock);
390 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 384 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
391 insert_wq_barrier(cwq, &barr, 1); 385 insert_wq_barrier(cwq, &barr, &cwq->worklist);
392 active = 1; 386 active = 1;
393 } 387 }
394 spin_unlock_irq(&cwq->lock); 388 spin_unlock_irq(&cwq->lock);
@@ -426,6 +420,57 @@ void flush_workqueue(struct workqueue_struct *wq)
426} 420}
427EXPORT_SYMBOL_GPL(flush_workqueue); 421EXPORT_SYMBOL_GPL(flush_workqueue);
428 422
423/**
424 * flush_work - block until a work_struct's callback has terminated
425 * @work: the work which is to be flushed
426 *
427 * Returns false if @work has already terminated.
428 *
429 * It is expected that, prior to calling flush_work(), the caller has
430 * arranged for the work to not be requeued, otherwise it doesn't make
431 * sense to use this function.
432 */
433int flush_work(struct work_struct *work)
434{
435 struct cpu_workqueue_struct *cwq;
436 struct list_head *prev;
437 struct wq_barrier barr;
438
439 might_sleep();
440 cwq = get_wq_data(work);
441 if (!cwq)
442 return 0;
443
444 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
445 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
446
447 prev = NULL;
448 spin_lock_irq(&cwq->lock);
449 if (!list_empty(&work->entry)) {
450 /*
451 * See the comment near try_to_grab_pending()->smp_rmb().
452 * If it was re-queued under us we are not going to wait.
453 */
454 smp_rmb();
455 if (unlikely(cwq != get_wq_data(work)))
456 goto out;
457 prev = &work->entry;
458 } else {
459 if (cwq->current_work != work)
460 goto out;
461 prev = &cwq->worklist;
462 }
463 insert_wq_barrier(cwq, &barr, prev->next);
464out:
465 spin_unlock_irq(&cwq->lock);
466 if (!prev)
467 return 0;
468
469 wait_for_completion(&barr.done);
470 return 1;
471}
472EXPORT_SYMBOL_GPL(flush_work);
473
429/* 474/*
430 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 475 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
431 * so this work can't be re-armed in any way. 476 * so this work can't be re-armed in any way.
@@ -473,7 +518,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
473 518
474 spin_lock_irq(&cwq->lock); 519 spin_lock_irq(&cwq->lock);
475 if (unlikely(cwq->current_work == work)) { 520 if (unlikely(cwq->current_work == work)) {
476 insert_wq_barrier(cwq, &barr, 0); 521 insert_wq_barrier(cwq, &barr, cwq->worklist.next);
477 running = 1; 522 running = 1;
478 } 523 }
479 spin_unlock_irq(&cwq->lock); 524 spin_unlock_irq(&cwq->lock);
@@ -644,10 +689,10 @@ int schedule_on_each_cpu(work_func_t func)
644 struct work_struct *work = per_cpu_ptr(works, cpu); 689 struct work_struct *work = per_cpu_ptr(works, cpu);
645 690
646 INIT_WORK(work, func); 691 INIT_WORK(work, func);
647 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 692 schedule_work_on(cpu, work);
648 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
649 } 693 }
650 flush_workqueue(keventd_wq); 694 for_each_online_cpu(cpu)
695 flush_work(per_cpu_ptr(works, cpu));
651 put_online_cpus(); 696 put_online_cpus();
652 free_percpu(works); 697 free_percpu(works);
653 return 0; 698 return 0;
@@ -784,7 +829,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
784 err = create_workqueue_thread(cwq, singlethread_cpu); 829 err = create_workqueue_thread(cwq, singlethread_cpu);
785 start_workqueue_thread(cwq, -1); 830 start_workqueue_thread(cwq, -1);
786 } else { 831 } else {
787 get_online_cpus(); 832 cpu_maps_update_begin();
788 spin_lock(&workqueue_lock); 833 spin_lock(&workqueue_lock);
789 list_add(&wq->list, &workqueues); 834 list_add(&wq->list, &workqueues);
790 spin_unlock(&workqueue_lock); 835 spin_unlock(&workqueue_lock);
@@ -796,7 +841,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
796 err = create_workqueue_thread(cwq, cpu); 841 err = create_workqueue_thread(cwq, cpu);
797 start_workqueue_thread(cwq, cpu); 842 start_workqueue_thread(cwq, cpu);
798 } 843 }
799 put_online_cpus(); 844 cpu_maps_update_done();
800 } 845 }
801 846
802 if (err) { 847 if (err) {
@@ -810,8 +855,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
810static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 855static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
811{ 856{
812 /* 857 /*
813 * Our caller is either destroy_workqueue() or CPU_DEAD, 858 * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
814 * get_online_cpus() protects cwq->thread. 859 * cpu_add_remove_lock protects cwq->thread.
815 */ 860 */
816 if (cwq->thread == NULL) 861 if (cwq->thread == NULL)
817 return; 862 return;
@@ -821,7 +866,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
821 866
822 flush_cpu_workqueue(cwq); 867 flush_cpu_workqueue(cwq);
823 /* 868 /*
824 * If the caller is CPU_DEAD and cwq->worklist was not empty, 869 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
825 * a concurrent flush_workqueue() can insert a barrier after us. 870 * a concurrent flush_workqueue() can insert a barrier after us.
826 * However, in that case run_workqueue() won't return and check 871 * However, in that case run_workqueue() won't return and check
827 * kthread_should_stop() until it flushes all work_struct's. 872 * kthread_should_stop() until it flushes all work_struct's.
@@ -845,14 +890,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
845 const cpumask_t *cpu_map = wq_cpu_map(wq); 890 const cpumask_t *cpu_map = wq_cpu_map(wq);
846 int cpu; 891 int cpu;
847 892
848 get_online_cpus(); 893 cpu_maps_update_begin();
849 spin_lock(&workqueue_lock); 894 spin_lock(&workqueue_lock);
850 list_del(&wq->list); 895 list_del(&wq->list);
851 spin_unlock(&workqueue_lock); 896 spin_unlock(&workqueue_lock);
852 897
853 for_each_cpu_mask_nr(cpu, *cpu_map) 898 for_each_cpu_mask_nr(cpu, *cpu_map)
854 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 899 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
855 put_online_cpus(); 900 cpu_maps_update_done();
856 901
857 free_percpu(wq->cpu_wq); 902 free_percpu(wq->cpu_wq);
858 kfree(wq); 903 kfree(wq);
@@ -866,6 +911,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
866 unsigned int cpu = (unsigned long)hcpu; 911 unsigned int cpu = (unsigned long)hcpu;
867 struct cpu_workqueue_struct *cwq; 912 struct cpu_workqueue_struct *cwq;
868 struct workqueue_struct *wq; 913 struct workqueue_struct *wq;
914 int ret = NOTIFY_OK;
869 915
870 action &= ~CPU_TASKS_FROZEN; 916 action &= ~CPU_TASKS_FROZEN;
871 917
@@ -873,7 +919,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
873 case CPU_UP_PREPARE: 919 case CPU_UP_PREPARE:
874 cpu_set(cpu, cpu_populated_map); 920 cpu_set(cpu, cpu_populated_map);
875 } 921 }
876 922undo:
877 list_for_each_entry(wq, &workqueues, list) { 923 list_for_each_entry(wq, &workqueues, list) {
878 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 924 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
879 925
@@ -883,7 +929,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
883 break; 929 break;
884 printk(KERN_ERR "workqueue [%s] for %i failed\n", 930 printk(KERN_ERR "workqueue [%s] for %i failed\n",
885 wq->name, cpu); 931 wq->name, cpu);
886 return NOTIFY_BAD; 932 action = CPU_UP_CANCELED;
933 ret = NOTIFY_BAD;
934 goto undo;
887 935
888 case CPU_ONLINE: 936 case CPU_ONLINE:
889 start_workqueue_thread(cwq, cpu); 937 start_workqueue_thread(cwq, cpu);
@@ -891,7 +939,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
891 939
892 case CPU_UP_CANCELED: 940 case CPU_UP_CANCELED:
893 start_workqueue_thread(cwq, -1); 941 start_workqueue_thread(cwq, -1);
894 case CPU_DEAD: 942 case CPU_POST_DEAD:
895 cleanup_workqueue_thread(cwq); 943 cleanup_workqueue_thread(cwq);
896 break; 944 break;
897 } 945 }
@@ -899,11 +947,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
899 947
900 switch (action) { 948 switch (action) {
901 case CPU_UP_CANCELED: 949 case CPU_UP_CANCELED:
902 case CPU_DEAD: 950 case CPU_POST_DEAD:
903 cpu_clear(cpu, cpu_populated_map); 951 cpu_clear(cpu, cpu_populated_map);
904 } 952 }
905 953
906 return NOTIFY_OK; 954 return ret;
907} 955}
908 956
909void __init init_workqueues(void) 957void __init init_workqueues(void)