aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-01-10 21:41:39 -0500
committerIngo Molnar <mingo@elte.hu>2009-01-10 21:41:39 -0500
commitabede81c4fb2e3b85d8760f25e3da39d2c69a134 (patch)
tree26c893ec108d837eb9171d678c55a1cea7b22af4 /kernel
parentc9d557c19f94df42db78d4a5de4d25feee694bad (diff)
parentc59765042f53a79a7a65585042ff463b69cb248c (diff)
Merge commit 'v2.6.29-rc1' into core/urgent
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/async.c335
-rw-r--r--kernel/cgroup.c276
-rw-r--r--kernel/cpuset.c251
-rw-r--r--kernel/cred.c5
-rw-r--r--kernel/fork.c8
-rw-r--r--kernel/irq/autoprobe.c5
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/ns_cgroup.c2
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/power/disk.c6
-rw-r--r--kernel/power/snapshot.c370
-rw-r--r--kernel/power/swsusp.c122
-rw-r--r--kernel/res_counter.c44
-rw-r--r--kernel/resource.c61
-rw-r--r--kernel/sched.c5
-rw-r--r--kernel/sched_fair.c2
-rw-r--r--kernel/sysctl.c14
-rw-r--r--kernel/trace/ring_buffer.c8
19 files changed, 1129 insertions, 396 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3365c0..2921d90ce32f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o
13 14
14ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
15# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000000..f286e9f2b736
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,335 @@
1/*
2 * async.c: Asynchronous function calls for boot performance
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13
14/*
15
16Goals and Theory of Operation
17
18The primary goal of this feature is to reduce the kernel boot time,
19by doing various independent hardware delays and discovery operations
20decoupled and not strictly serialized.
21
22More specifically, the asynchronous function call concept allows
23certain operations (primarily during system boot) to happen
24asynchronously, out of order, while these operations still
25have their externally visible parts happen sequentially and in-order.
26(not unlike how out-of-order CPUs retire their instructions in order)
27
28Key to the asynchronous function call implementation is the concept of
29a "sequence cookie" (which, although it has an abstracted type, can be
30thought of as a monotonically incrementing number).
31
32The async core will assign each scheduled event such a sequence cookie and
33pass this to the called functions.
34
35The asynchronously called function should before doing a globally visible
36operation, such as registering device numbers, call the
37async_synchronize_cookie() function and pass in its own cookie. The
38async_synchronize_cookie() function will make sure that all asynchronous
39operations that were scheduled prior to the operation corresponding with the
40cookie have completed.
41
42Subsystem/driver initialization code that scheduled asynchronous probe
43functions, but which shares global resources with other drivers/subsystems
44that do not use the asynchronous call feature, need to do a full
45synchronization with the async_synchronize_full() function, before returning
46from their init function. This is to maintain strict ordering between the
47asynchronous and synchronous parts of the kernel.
48
49*/
50
51#include <linux/async.h>
52#include <linux/module.h>
53#include <linux/wait.h>
54#include <linux/sched.h>
55#include <linux/init.h>
56#include <linux/kthread.h>
57#include <asm/atomic.h>
58
59static async_cookie_t next_cookie = 1;
60
61#define MAX_THREADS 256
62#define MAX_WORK 32768
63
64static LIST_HEAD(async_pending);
65static LIST_HEAD(async_running);
66static DEFINE_SPINLOCK(async_lock);
67
68static int async_enabled = 0;
69
70struct async_entry {
71 struct list_head list;
72 async_cookie_t cookie;
73 async_func_ptr *func;
74 void *data;
75 struct list_head *running;
76};
77
78static DECLARE_WAIT_QUEUE_HEAD(async_done);
79static DECLARE_WAIT_QUEUE_HEAD(async_new);
80
81static atomic_t entry_count;
82static atomic_t thread_count;
83
84extern int initcall_debug;
85
86
87/*
88 * MUST be called with the lock held!
89 */
90static async_cookie_t __lowest_in_progress(struct list_head *running)
91{
92 struct async_entry *entry;
93 if (!list_empty(&async_pending)) {
94 entry = list_first_entry(&async_pending,
95 struct async_entry, list);
96 return entry->cookie;
97 } else if (!list_empty(running)) {
98 entry = list_first_entry(running,
99 struct async_entry, list);
100 return entry->cookie;
101 } else {
102 /* nothing in progress... next_cookie is "infinity" */
103 return next_cookie;
104 }
105
106}
107/*
108 * pick the first pending entry and run it
109 */
110static void run_one_entry(void)
111{
112 unsigned long flags;
113 struct async_entry *entry;
114 ktime_t calltime, delta, rettime;
115
116 /* 1) pick one task from the pending queue */
117
118 spin_lock_irqsave(&async_lock, flags);
119 if (list_empty(&async_pending))
120 goto out;
121 entry = list_first_entry(&async_pending, struct async_entry, list);
122
123 /* 2) move it to the running queue */
124 list_del(&entry->list);
125 list_add_tail(&entry->list, &async_running);
126 spin_unlock_irqrestore(&async_lock, flags);
127
128 /* 3) run it (and print duration)*/
129 if (initcall_debug && system_state == SYSTEM_BOOTING) {
130 printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
131 calltime = ktime_get();
132 }
133 entry->func(entry->data, entry->cookie);
134 if (initcall_debug && system_state == SYSTEM_BOOTING) {
135 rettime = ktime_get();
136 delta = ktime_sub(rettime, calltime);
137 printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
138 entry->func, ktime_to_ns(delta) >> 10);
139 }
140
141 /* 4) remove it from the running queue */
142 spin_lock_irqsave(&async_lock, flags);
143 list_del(&entry->list);
144
145 /* 5) free the entry */
146 kfree(entry);
147 atomic_dec(&entry_count);
148
149 spin_unlock_irqrestore(&async_lock, flags);
150
151 /* 6) wake up any waiters. */
152 wake_up(&async_done);
153 return;
154
155out:
156 spin_unlock_irqrestore(&async_lock, flags);
157}
158
159
160static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
161{
162 struct async_entry *entry;
163 unsigned long flags;
164 async_cookie_t newcookie;
165
166
167 /* allow irq-off callers */
168 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
169
170 /*
171 * If we're out of memory or if there's too much work
172 * pending already, we execute synchronously.
173 */
174 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
175 kfree(entry);
176 spin_lock_irqsave(&async_lock, flags);
177 newcookie = next_cookie++;
178 spin_unlock_irqrestore(&async_lock, flags);
179
180 /* low on memory.. run synchronously */
181 ptr(data, newcookie);
182 return newcookie;
183 }
184 entry->func = ptr;
185 entry->data = data;
186 entry->running = running;
187
188 spin_lock_irqsave(&async_lock, flags);
189 newcookie = entry->cookie = next_cookie++;
190 list_add_tail(&entry->list, &async_pending);
191 atomic_inc(&entry_count);
192 spin_unlock_irqrestore(&async_lock, flags);
193 wake_up(&async_new);
194 return newcookie;
195}
196
197async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
198{
199 return __async_schedule(ptr, data, &async_pending);
200}
201EXPORT_SYMBOL_GPL(async_schedule);
202
203async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
204{
205 return __async_schedule(ptr, data, running);
206}
207EXPORT_SYMBOL_GPL(async_schedule_special);
208
209void async_synchronize_full(void)
210{
211 do {
212 async_synchronize_cookie(next_cookie);
213 } while (!list_empty(&async_running) || !list_empty(&async_pending));
214}
215EXPORT_SYMBOL_GPL(async_synchronize_full);
216
217void async_synchronize_full_special(struct list_head *list)
218{
219 async_synchronize_cookie_special(next_cookie, list);
220}
221EXPORT_SYMBOL_GPL(async_synchronize_full_special);
222
223void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
224{
225 ktime_t starttime, delta, endtime;
226
227 if (initcall_debug && system_state == SYSTEM_BOOTING) {
228 printk("async_waiting @ %i\n", task_pid_nr(current));
229 starttime = ktime_get();
230 }
231
232 wait_event(async_done, __lowest_in_progress(running) >= cookie);
233
234 if (initcall_debug && system_state == SYSTEM_BOOTING) {
235 endtime = ktime_get();
236 delta = ktime_sub(endtime, starttime);
237
238 printk("async_continuing @ %i after %lli usec\n",
239 task_pid_nr(current), ktime_to_ns(delta) >> 10);
240 }
241}
242EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
243
244void async_synchronize_cookie(async_cookie_t cookie)
245{
246 async_synchronize_cookie_special(cookie, &async_running);
247}
248EXPORT_SYMBOL_GPL(async_synchronize_cookie);
249
250
251static int async_thread(void *unused)
252{
253 DECLARE_WAITQUEUE(wq, current);
254 add_wait_queue(&async_new, &wq);
255
256 while (!kthread_should_stop()) {
257 int ret = HZ;
258 set_current_state(TASK_INTERRUPTIBLE);
259 /*
260 * check the list head without lock.. false positives
261 * are dealt with inside run_one_entry() while holding
262 * the lock.
263 */
264 rmb();
265 if (!list_empty(&async_pending))
266 run_one_entry();
267 else
268 ret = schedule_timeout(HZ);
269
270 if (ret == 0) {
271 /*
272 * we timed out, this means we as thread are redundant.
273 * we sign off and die, but we to avoid any races there
274 * is a last-straw check to see if work snuck in.
275 */
276 atomic_dec(&thread_count);
277 wmb(); /* manager must see our departure first */
278 if (list_empty(&async_pending))
279 break;
280 /*
281 * woops work came in between us timing out and us
282 * signing off; we need to stay alive and keep working.
283 */
284 atomic_inc(&thread_count);
285 }
286 }
287 remove_wait_queue(&async_new, &wq);
288
289 return 0;
290}
291
292static int async_manager_thread(void *unused)
293{
294 DECLARE_WAITQUEUE(wq, current);
295 add_wait_queue(&async_new, &wq);
296
297 while (!kthread_should_stop()) {
298 int tc, ec;
299
300 set_current_state(TASK_INTERRUPTIBLE);
301
302 tc = atomic_read(&thread_count);
303 rmb();
304 ec = atomic_read(&entry_count);
305
306 while (tc < ec && tc < MAX_THREADS) {
307 kthread_run(async_thread, NULL, "async/%i", tc);
308 atomic_inc(&thread_count);
309 tc++;
310 }
311
312 schedule();
313 }
314 remove_wait_queue(&async_new, &wq);
315
316 return 0;
317}
318
319static int __init async_init(void)
320{
321 if (async_enabled)
322 kthread_run(async_manager_thread, NULL, "async/mgr");
323 return 0;
324}
325
326static int __init setup_async(char *str)
327{
328 async_enabled = 1;
329 return 1;
330}
331
332__setup("fastboot", setup_async);
333
334
335core_initcall(async_init);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f221446aa02d..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
84 /* Tracks how many cgroups are currently defined in hierarchy.*/ 84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups; 85 int number_of_cgroups;
86 86
87 /* A list running through the mounted hierarchies */ 87 /* A list running through the active hierarchies */
88 struct list_head root_list; 88 struct list_head root_list;
89 89
90 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
@@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
148#define for_each_subsys(_root, _ss) \ 148#define for_each_subsys(_root, _ss) \
149list_for_each_entry(_ss, &_root->subsys_list, sibling) 149list_for_each_entry(_ss, &_root->subsys_list, sibling)
150 150
151/* for_each_root() allows you to iterate across the active hierarchies */ 151/* for_each_active_root() allows you to iterate across the active hierarchies */
152#define for_each_root(_root) \ 152#define for_each_active_root(_root) \
153list_for_each_entry(_root, &roots, root_list) 153list_for_each_entry(_root, &roots, root_list)
154 154
155/* the list of cgroups eligible for automatic release. Protected by 155/* the list of cgroups eligible for automatic release. Protected by
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
271 271
272 rcu_read_lock(); 272 rcu_read_lock();
273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
274 struct cgroup *cgrp = cg->subsys[i]->cgroup; 274 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
275 if (atomic_dec_and_test(&cgrp->count) && 275 if (atomic_dec_and_test(&cgrp->count) &&
276 notify_on_release(cgrp)) { 276 notify_on_release(cgrp)) {
277 if (taskexit) 277 if (taskexit)
@@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
384 return 0; 384 return 0;
385} 385}
386 386
387/**
388 * link_css_set - a helper function to link a css_set to a cgroup
389 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
390 * @cg: the css_set to be linked
391 * @cgrp: the destination cgroup
392 */
393static void link_css_set(struct list_head *tmp_cg_links,
394 struct css_set *cg, struct cgroup *cgrp)
395{
396 struct cg_cgroup_link *link;
397
398 BUG_ON(list_empty(tmp_cg_links));
399 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
400 cgrp_link_list);
401 link->cg = cg;
402 list_move(&link->cgrp_link_list, &cgrp->css_sets);
403 list_add(&link->cg_link_list, &cg->cg_links);
404}
405
387/* 406/*
388 * find_css_set() takes an existing cgroup group and a 407 * find_css_set() takes an existing cgroup group and a
389 * cgroup object, and returns a css_set object that's 408 * cgroup object, and returns a css_set object that's
@@ -399,7 +418,6 @@ static struct css_set *find_css_set(
399 int i; 418 int i;
400 419
401 struct list_head tmp_cg_links; 420 struct list_head tmp_cg_links;
402 struct cg_cgroup_link *link;
403 421
404 struct hlist_head *hhead; 422 struct hlist_head *hhead;
405 423
@@ -444,26 +462,11 @@ static struct css_set *find_css_set(
444 * only do it for the first subsystem in each 462 * only do it for the first subsystem in each
445 * hierarchy 463 * hierarchy
446 */ 464 */
447 if (ss->root->subsys_list.next == &ss->sibling) { 465 if (ss->root->subsys_list.next == &ss->sibling)
448 BUG_ON(list_empty(&tmp_cg_links)); 466 link_css_set(&tmp_cg_links, res, cgrp);
449 link = list_entry(tmp_cg_links.next,
450 struct cg_cgroup_link,
451 cgrp_link_list);
452 list_del(&link->cgrp_link_list);
453 list_add(&link->cgrp_link_list, &cgrp->css_sets);
454 link->cg = res;
455 list_add(&link->cg_link_list, &res->cg_links);
456 }
457 }
458 if (list_empty(&rootnode.subsys_list)) {
459 link = list_entry(tmp_cg_links.next,
460 struct cg_cgroup_link,
461 cgrp_link_list);
462 list_del(&link->cgrp_link_list);
463 list_add(&link->cgrp_link_list, &dummytop->css_sets);
464 link->cg = res;
465 list_add(&link->cg_link_list, &res->cg_links);
466 } 467 }
468 if (list_empty(&rootnode.subsys_list))
469 link_css_set(&tmp_cg_links, res, dummytop);
467 470
468 BUG_ON(!list_empty(&tmp_cg_links)); 471 BUG_ON(!list_empty(&tmp_cg_links));
469 472
@@ -586,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
586{ 589{
587 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
588 for_each_subsys(cgrp->root, ss) 591 for_each_subsys(cgrp->root, ss)
589 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 592 if (ss->pre_destroy)
590 ss->pre_destroy(ss, cgrp); 593 ss->pre_destroy(ss, cgrp);
591 return; 594 return;
592} 595}
593 596
597static void free_cgroup_rcu(struct rcu_head *obj)
598{
599 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
600
601 kfree(cgrp);
602}
603
594static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
595{ 605{
596 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -610,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
610 /* 620 /*
611 * Release the subsystem state objects. 621 * Release the subsystem state objects.
612 */ 622 */
613 for_each_subsys(cgrp->root, ss) { 623 for_each_subsys(cgrp->root, ss)
614 if (cgrp->subsys[ss->subsys_id]) 624 ss->destroy(ss, cgrp);
615 ss->destroy(ss, cgrp);
616 }
617 625
618 cgrp->root->number_of_cgroups--; 626 cgrp->root->number_of_cgroups--;
619 mutex_unlock(&cgroup_mutex); 627 mutex_unlock(&cgroup_mutex);
620 628
621 /* Drop the active superblock reference that we took when we 629 /*
622 * created the cgroup */ 630 * Drop the active superblock reference that we took when we
631 * created the cgroup
632 */
623 deactivate_super(cgrp->root->sb); 633 deactivate_super(cgrp->root->sb);
624 634
625 kfree(cgrp); 635 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
626 } 636 }
627 iput(inode); 637 iput(inode);
628} 638}
@@ -712,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
712 BUG_ON(cgrp->subsys[i]); 722 BUG_ON(cgrp->subsys[i]);
713 BUG_ON(!dummytop->subsys[i]); 723 BUG_ON(!dummytop->subsys[i]);
714 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 724 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
725 mutex_lock(&ss->hierarchy_mutex);
715 cgrp->subsys[i] = dummytop->subsys[i]; 726 cgrp->subsys[i] = dummytop->subsys[i];
716 cgrp->subsys[i]->cgroup = cgrp; 727 cgrp->subsys[i]->cgroup = cgrp;
717 list_add(&ss->sibling, &root->subsys_list); 728 list_move(&ss->sibling, &root->subsys_list);
718 rcu_assign_pointer(ss->root, root); 729 ss->root = root;
719 if (ss->bind) 730 if (ss->bind)
720 ss->bind(ss, cgrp); 731 ss->bind(ss, cgrp);
721 732 mutex_unlock(&ss->hierarchy_mutex);
722 } else if (bit & removed_bits) { 733 } else if (bit & removed_bits) {
723 /* We're removing this subsystem */ 734 /* We're removing this subsystem */
724 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 735 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
725 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 736 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
737 mutex_lock(&ss->hierarchy_mutex);
726 if (ss->bind) 738 if (ss->bind)
727 ss->bind(ss, dummytop); 739 ss->bind(ss, dummytop);
728 dummytop->subsys[i]->cgroup = dummytop; 740 dummytop->subsys[i]->cgroup = dummytop;
729 cgrp->subsys[i] = NULL; 741 cgrp->subsys[i] = NULL;
730 rcu_assign_pointer(subsys[i]->root, &rootnode); 742 subsys[i]->root = &rootnode;
731 list_del(&ss->sibling); 743 list_move(&ss->sibling, &rootnode.subsys_list);
744 mutex_unlock(&ss->hierarchy_mutex);
732 } else if (bit & final_bits) { 745 } else if (bit & final_bits) {
733 /* Subsystem state should already exist */ 746 /* Subsystem state should already exist */
734 BUG_ON(!cgrp->subsys[i]); 747 BUG_ON(!cgrp->subsys[i]);
@@ -990,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
990 root = NULL; 1003 root = NULL;
991 } else { 1004 } else {
992 /* New superblock */ 1005 /* New superblock */
993 struct cgroup *cgrp = &root->top_cgroup; 1006 struct cgroup *root_cgrp = &root->top_cgroup;
994 struct inode *inode; 1007 struct inode *inode;
995 int i; 1008 int i;
996 1009
@@ -1031,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1031 list_add(&root->root_list, &roots); 1044 list_add(&root->root_list, &roots);
1032 root_count++; 1045 root_count++;
1033 1046
1034 sb->s_root->d_fsdata = &root->top_cgroup; 1047 sb->s_root->d_fsdata = root_cgrp;
1035 root->top_cgroup.dentry = sb->s_root; 1048 root->top_cgroup.dentry = sb->s_root;
1036 1049
1037 /* Link the top cgroup in this hierarchy into all 1050 /* Link the top cgroup in this hierarchy into all
@@ -1042,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1042 struct hlist_node *node; 1055 struct hlist_node *node;
1043 struct css_set *cg; 1056 struct css_set *cg;
1044 1057
1045 hlist_for_each_entry(cg, node, hhead, hlist) { 1058 hlist_for_each_entry(cg, node, hhead, hlist)
1046 struct cg_cgroup_link *link; 1059 link_css_set(&tmp_cg_links, cg, root_cgrp);
1047
1048 BUG_ON(list_empty(&tmp_cg_links));
1049 link = list_entry(tmp_cg_links.next,
1050 struct cg_cgroup_link,
1051 cgrp_link_list);
1052 list_del(&link->cgrp_link_list);
1053 link->cg = cg;
1054 list_add(&link->cgrp_link_list,
1055 &root->top_cgroup.css_sets);
1056 list_add(&link->cg_link_list, &cg->cg_links);
1057 }
1058 } 1060 }
1059 write_unlock(&css_set_lock); 1061 write_unlock(&css_set_lock);
1060 1062
1061 free_cg_links(&tmp_cg_links); 1063 free_cg_links(&tmp_cg_links);
1062 1064
1063 BUG_ON(!list_empty(&cgrp->sibling)); 1065 BUG_ON(!list_empty(&root_cgrp->sibling));
1064 BUG_ON(!list_empty(&cgrp->children)); 1066 BUG_ON(!list_empty(&root_cgrp->children));
1065 BUG_ON(root->number_of_cgroups != 1); 1067 BUG_ON(root->number_of_cgroups != 1);
1066 1068
1067 cgroup_populate_dir(cgrp); 1069 cgroup_populate_dir(root_cgrp);
1068 mutex_unlock(&inode->i_mutex); 1070 mutex_unlock(&inode->i_mutex);
1069 mutex_unlock(&cgroup_mutex); 1071 mutex_unlock(&cgroup_mutex);
1070 } 1072 }
@@ -1113,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1113 } 1115 }
1114 write_unlock(&css_set_lock); 1116 write_unlock(&css_set_lock);
1115 1117
1116 if (!list_empty(&root->root_list)) { 1118 list_del(&root->root_list);
1117 list_del(&root->root_list); 1119 root_count--;
1118 root_count--; 1120
1119 }
1120 mutex_unlock(&cgroup_mutex); 1121 mutex_unlock(&cgroup_mutex);
1121 1122
1122 kfree(root); 1123 kfree(root);
@@ -1145,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1145 * @buf: the buffer to write the path into 1146 * @buf: the buffer to write the path into
1146 * @buflen: the length of the buffer 1147 * @buflen: the length of the buffer
1147 * 1148 *
1148 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1149 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1149 * Returns 0 on success, -errno on error. 1150 * reference. Writes path of cgroup into buf. Returns 0 on success,
1151 * -errno on error.
1150 */ 1152 */
1151int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1152{ 1154{
1153 char *start; 1155 char *start;
1156 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1154 1157
1155 if (cgrp == dummytop) { 1158 if (!dentry || cgrp == dummytop) {
1156 /* 1159 /*
1157 * Inactive subsystems have no dentry for their root 1160 * Inactive subsystems have no dentry for their root
1158 * cgroup 1161 * cgroup
@@ -1165,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1165 1168
1166 *--start = '\0'; 1169 *--start = '\0';
1167 for (;;) { 1170 for (;;) {
1168 int len = cgrp->dentry->d_name.len; 1171 int len = dentry->d_name.len;
1169 if ((start -= len) < buf) 1172 if ((start -= len) < buf)
1170 return -ENAMETOOLONG; 1173 return -ENAMETOOLONG;
1171 memcpy(start, cgrp->dentry->d_name.name, len); 1174 memcpy(start, cgrp->dentry->d_name.name, len);
1172 cgrp = cgrp->parent; 1175 cgrp = cgrp->parent;
1173 if (!cgrp) 1176 if (!cgrp)
1174 break; 1177 break;
1178 dentry = rcu_dereference(cgrp->dentry);
1175 if (!cgrp->parent) 1179 if (!cgrp->parent)
1176 continue; 1180 continue;
1177 if (--start < buf) 1181 if (--start < buf)
@@ -1216,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1216 int retval = 0; 1220 int retval = 0;
1217 struct cgroup_subsys *ss; 1221 struct cgroup_subsys *ss;
1218 struct cgroup *oldcgrp; 1222 struct cgroup *oldcgrp;
1219 struct css_set *cg = tsk->cgroups; 1223 struct css_set *cg;
1220 struct css_set *newcg; 1224 struct css_set *newcg;
1221 struct cgroupfs_root *root = cgrp->root; 1225 struct cgroupfs_root *root = cgrp->root;
1222 int subsys_id; 1226 int subsys_id;
@@ -1236,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1236 } 1240 }
1237 } 1241 }
1238 1242
1243 task_lock(tsk);
1244 cg = tsk->cgroups;
1245 get_css_set(cg);
1246 task_unlock(tsk);
1239 /* 1247 /*
1240 * Locate or allocate a new css_set for this task, 1248 * Locate or allocate a new css_set for this task,
1241 * based on its final set of cgroups 1249 * based on its final set of cgroups
1242 */ 1250 */
1243 newcg = find_css_set(cg, cgrp); 1251 newcg = find_css_set(cg, cgrp);
1252 put_css_set(cg);
1244 if (!newcg) 1253 if (!newcg)
1245 return -ENOMEM; 1254 return -ENOMEM;
1246 1255
@@ -1445,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1445 struct cftype *cft = __d_cft(file->f_dentry); 1454 struct cftype *cft = __d_cft(file->f_dentry);
1446 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1455 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1447 1456
1448 if (!cft || cgroup_is_removed(cgrp)) 1457 if (cgroup_is_removed(cgrp))
1449 return -ENODEV; 1458 return -ENODEV;
1450 if (cft->write) 1459 if (cft->write)
1451 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1460 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1490,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1490 struct cftype *cft = __d_cft(file->f_dentry); 1499 struct cftype *cft = __d_cft(file->f_dentry);
1491 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1500 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1492 1501
1493 if (!cft || cgroup_is_removed(cgrp)) 1502 if (cgroup_is_removed(cgrp))
1494 return -ENODEV; 1503 return -ENODEV;
1495 1504
1496 if (cft->read) 1505 if (cft->read)
@@ -1554,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1554 err = generic_file_open(inode, file); 1563 err = generic_file_open(inode, file);
1555 if (err) 1564 if (err)
1556 return err; 1565 return err;
1557
1558 cft = __d_cft(file->f_dentry); 1566 cft = __d_cft(file->f_dentry);
1559 if (!cft) 1567
1560 return -ENODEV;
1561 if (cft->read_map || cft->read_seq_string) { 1568 if (cft->read_map || cft->read_seq_string) {
1562 struct cgroup_seqfile_state *state = 1569 struct cgroup_seqfile_state *state =
1563 kzalloc(sizeof(*state), GFP_USER); 1570 kzalloc(sizeof(*state), GFP_USER);
@@ -1671,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1671 if (!error) { 1678 if (!error) {
1672 dentry->d_fsdata = cgrp; 1679 dentry->d_fsdata = cgrp;
1673 inc_nlink(parent->d_inode); 1680 inc_nlink(parent->d_inode);
1674 cgrp->dentry = dentry; 1681 rcu_assign_pointer(cgrp->dentry, dentry);
1675 dget(dentry); 1682 dget(dentry);
1676 } 1683 }
1677 dput(dentry); 1684 dput(dentry);
@@ -1812,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1812{ 1819{
1813 struct task_struct *res; 1820 struct task_struct *res;
1814 struct list_head *l = it->task; 1821 struct list_head *l = it->task;
1822 struct cg_cgroup_link *link;
1815 1823
1816 /* If the iterator cg is NULL, we have no tasks */ 1824 /* If the iterator cg is NULL, we have no tasks */
1817 if (!it->cg_link) 1825 if (!it->cg_link)
@@ -1819,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1819 res = list_entry(l, struct task_struct, cg_list); 1827 res = list_entry(l, struct task_struct, cg_list);
1820 /* Advance iterator to find next entry */ 1828 /* Advance iterator to find next entry */
1821 l = l->next; 1829 l = l->next;
1822 if (l == &res->cgroups->tasks) { 1830 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
1831 if (l == &link->cg->tasks) {
1823 /* We reached the end of this task list - move on to 1832 /* We reached the end of this task list - move on to
1824 * the next cg_cgroup_link */ 1833 * the next cg_cgroup_link */
1825 cgroup_advance_iter(cgrp, it); 1834 cgroup_advance_iter(cgrp, it);
@@ -2013,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2013 */ 2022 */
2014static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2023static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2015{ 2024{
2016 int n = 0; 2025 int n = 0, pid;
2017 struct cgroup_iter it; 2026 struct cgroup_iter it;
2018 struct task_struct *tsk; 2027 struct task_struct *tsk;
2019 cgroup_iter_start(cgrp, &it); 2028 cgroup_iter_start(cgrp, &it);
2020 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2029 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2021 if (unlikely(n == npids)) 2030 if (unlikely(n == npids))
2022 break; 2031 break;
2023 pidarray[n++] = task_pid_vnr(tsk); 2032 pid = task_pid_vnr(tsk);
2033 if (pid > 0)
2034 pidarray[n++] = pid;
2024 } 2035 }
2025 cgroup_iter_end(cgrp, &it); 2036 cgroup_iter_end(cgrp, &it);
2026 return n; 2037 return n;
@@ -2052,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2052 2063
2053 ret = 0; 2064 ret = 0;
2054 cgrp = dentry->d_fsdata; 2065 cgrp = dentry->d_fsdata;
2055 rcu_read_lock();
2056 2066
2057 cgroup_iter_start(cgrp, &it); 2067 cgroup_iter_start(cgrp, &it);
2058 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2068 while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2077,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2077 } 2087 }
2078 cgroup_iter_end(cgrp, &it); 2088 cgroup_iter_end(cgrp, &it);
2079 2089
2080 rcu_read_unlock();
2081err: 2090err:
2082 return ret; 2091 return ret;
2083} 2092}
@@ -2324,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2324 struct cgroup *cgrp) 2333 struct cgroup *cgrp)
2325{ 2334{
2326 css->cgroup = cgrp; 2335 css->cgroup = cgrp;
2327 atomic_set(&css->refcnt, 0); 2336 atomic_set(&css->refcnt, 1);
2328 css->flags = 0; 2337 css->flags = 0;
2329 if (cgrp == dummytop) 2338 if (cgrp == dummytop)
2330 set_bit(CSS_ROOT, &css->flags); 2339 set_bit(CSS_ROOT, &css->flags);
@@ -2332,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2332 cgrp->subsys[ss->subsys_id] = css; 2341 cgrp->subsys[ss->subsys_id] = css;
2333} 2342}
2334 2343
2344static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2345{
2346 /* We need to take each hierarchy_mutex in a consistent order */
2347 int i;
2348
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i);
2353 }
2354}
2355
2356static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2357{
2358 int i;
2359
2360 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2361 struct cgroup_subsys *ss = subsys[i];
2362 if (ss->root == root)
2363 mutex_unlock(&ss->hierarchy_mutex);
2364 }
2365}
2366
2335/* 2367/*
2336 * cgroup_create - create a cgroup 2368 * cgroup_create - create a cgroup
2337 * @parent: cgroup that will be parent of the new cgroup 2369 * @parent: cgroup that will be parent of the new cgroup
@@ -2380,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2380 init_cgroup_css(css, ss, cgrp); 2412 init_cgroup_css(css, ss, cgrp);
2381 } 2413 }
2382 2414
2415 cgroup_lock_hierarchy(root);
2383 list_add(&cgrp->sibling, &cgrp->parent->children); 2416 list_add(&cgrp->sibling, &cgrp->parent->children);
2417 cgroup_unlock_hierarchy(root);
2384 root->number_of_cgroups++; 2418 root->number_of_cgroups++;
2385 2419
2386 err = cgroup_create_dir(cgrp, dentry, mode); 2420 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2431,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2431{ 2465{
2432 /* Check the reference count on each subsystem. Since we 2466 /* Check the reference count on each subsystem. Since we
2433 * already established that there are no tasks in the 2467 * already established that there are no tasks in the
2434 * cgroup, if the css refcount is also 0, then there should 2468 * cgroup, if the css refcount is also 1, then there should
2435 * be no outstanding references, so the subsystem is safe to 2469 * be no outstanding references, so the subsystem is safe to
2436 * destroy. We scan across all subsystems rather than using 2470 * destroy. We scan across all subsystems rather than using
2437 * the per-hierarchy linked list of mounted subsystems since 2471 * the per-hierarchy linked list of mounted subsystems since
@@ -2452,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2452 * matter, since it can only happen if the cgroup 2486 * matter, since it can only happen if the cgroup
2453 * has been deleted and hence no longer needs the 2487 * has been deleted and hence no longer needs the
2454 * release agent to be called anyway. */ 2488 * release agent to be called anyway. */
2455 if (css && atomic_read(&css->refcnt)) 2489 if (css && (atomic_read(&css->refcnt) > 1))
2456 return 1; 2490 return 1;
2457 } 2491 }
2458 return 0; 2492 return 0;
2459} 2493}
2460 2494
2495/*
2496 * Atomically mark all (or else none) of the cgroup's CSS objects as
2497 * CSS_REMOVED. Return true on success, or false if the cgroup has
2498 * busy subsystems. Call with cgroup_mutex held
2499 */
2500
2501static int cgroup_clear_css_refs(struct cgroup *cgrp)
2502{
2503 struct cgroup_subsys *ss;
2504 unsigned long flags;
2505 bool failed = false;
2506 local_irq_save(flags);
2507 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt;
2510 do {
2511 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) {
2514 failed = true;
2515 goto done;
2516 }
2517 BUG_ON(!refcnt);
2518 /*
2519 * Drop the refcnt to 0 while we check other
2520 * subsystems. This will cause any racing
2521 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort
2523 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
2525 }
2526 done:
2527 for_each_subsys(cgrp->root, ss) {
2528 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2529 if (failed) {
2530 /*
2531 * Restore old refcnt if we previously managed
2532 * to clear it from 1 to 0
2533 */
2534 if (!atomic_read(&css->refcnt))
2535 atomic_set(&css->refcnt, 1);
2536 } else {
2537 /* Commit the fact that the CSS is removed */
2538 set_bit(CSS_REMOVED, &css->flags);
2539 }
2540 }
2541 local_irq_restore(flags);
2542 return !failed;
2543}
2544
2461static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2545static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2462{ 2546{
2463 struct cgroup *cgrp = dentry->d_fsdata; 2547 struct cgroup *cgrp = dentry->d_fsdata;
2464 struct dentry *d; 2548 struct dentry *d;
2465 struct cgroup *parent; 2549 struct cgroup *parent;
2466 struct super_block *sb;
2467 struct cgroupfs_root *root;
2468 2550
2469 /* the vfs holds both inode->i_mutex already */ 2551 /* the vfs holds both inode->i_mutex already */
2470 2552
@@ -2487,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2487 2569
2488 mutex_lock(&cgroup_mutex); 2570 mutex_lock(&cgroup_mutex);
2489 parent = cgrp->parent; 2571 parent = cgrp->parent;
2490 root = cgrp->root;
2491 sb = root->sb;
2492 2572
2493 if (atomic_read(&cgrp->count) 2573 if (atomic_read(&cgrp->count)
2494 || !list_empty(&cgrp->children) 2574 || !list_empty(&cgrp->children)
2495 || cgroup_has_css_refs(cgrp)) { 2575 || !cgroup_clear_css_refs(cgrp)) {
2496 mutex_unlock(&cgroup_mutex); 2576 mutex_unlock(&cgroup_mutex);
2497 return -EBUSY; 2577 return -EBUSY;
2498 } 2578 }
@@ -2502,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2502 if (!list_empty(&cgrp->release_list)) 2582 if (!list_empty(&cgrp->release_list))
2503 list_del(&cgrp->release_list); 2583 list_del(&cgrp->release_list);
2504 spin_unlock(&release_list_lock); 2584 spin_unlock(&release_list_lock);
2505 /* delete my sibling from parent->children */ 2585
2586 cgroup_lock_hierarchy(cgrp->root);
2587 /* delete this cgroup from parent->children */
2506 list_del(&cgrp->sibling); 2588 list_del(&cgrp->sibling);
2589 cgroup_unlock_hierarchy(cgrp->root);
2590
2507 spin_lock(&cgrp->dentry->d_lock); 2591 spin_lock(&cgrp->dentry->d_lock);
2508 d = dget(cgrp->dentry); 2592 d = dget(cgrp->dentry);
2509 spin_unlock(&d->d_lock); 2593 spin_unlock(&d->d_lock);
@@ -2525,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2525 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2609 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2526 2610
2527 /* Create the top cgroup state for this subsystem */ 2611 /* Create the top cgroup state for this subsystem */
2612 list_add(&ss->sibling, &rootnode.subsys_list);
2528 ss->root = &rootnode; 2613 ss->root = &rootnode;
2529 css = ss->create(ss, dummytop); 2614 css = ss->create(ss, dummytop);
2530 /* We don't handle early failures gracefully */ 2615 /* We don't handle early failures gracefully */
@@ -2544,6 +2629,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2544 * need to invoke fork callbacks here. */ 2629 * need to invoke fork callbacks here. */
2545 BUG_ON(!list_empty(&init_task.tasks)); 2630 BUG_ON(!list_empty(&init_task.tasks));
2546 2631
2632 mutex_init(&ss->hierarchy_mutex);
2547 ss->active = 1; 2633 ss->active = 1;
2548} 2634}
2549 2635
@@ -2562,7 +2648,6 @@ int __init cgroup_init_early(void)
2562 INIT_HLIST_NODE(&init_css_set.hlist); 2648 INIT_HLIST_NODE(&init_css_set.hlist);
2563 css_set_count = 1; 2649 css_set_count = 1;
2564 init_cgroup_root(&rootnode); 2650 init_cgroup_root(&rootnode);
2565 list_add(&rootnode.root_list, &roots);
2566 root_count = 1; 2651 root_count = 1;
2567 init_task.cgroups = &init_css_set; 2652 init_task.cgroups = &init_css_set;
2568 2653
@@ -2669,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2669 2754
2670 mutex_lock(&cgroup_mutex); 2755 mutex_lock(&cgroup_mutex);
2671 2756
2672 for_each_root(root) { 2757 for_each_active_root(root) {
2673 struct cgroup_subsys *ss; 2758 struct cgroup_subsys *ss;
2674 struct cgroup *cgrp; 2759 struct cgroup *cgrp;
2675 int subsys_id; 2760 int subsys_id;
2676 int count = 0; 2761 int count = 0;
2677 2762
2678 /* Skip this hierarchy if it has no active subsystems */
2679 if (!root->actual_subsys_bits)
2680 continue;
2681 seq_printf(m, "%lu:", root->subsys_bits); 2763 seq_printf(m, "%lu:", root->subsys_bits);
2682 for_each_subsys(root, ss) 2764 for_each_subsys(root, ss)
2683 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2765 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2800,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
2800{ 2882{
2801 if (use_task_css_set_links) { 2883 if (use_task_css_set_links) {
2802 write_lock(&css_set_lock); 2884 write_lock(&css_set_lock);
2885 task_lock(child);
2803 if (list_empty(&child->cg_list)) 2886 if (list_empty(&child->cg_list))
2804 list_add(&child->cg_list, &child->cgroups->tasks); 2887 list_add(&child->cg_list, &child->cgroups->tasks);
2888 task_unlock(child);
2805 write_unlock(&css_set_lock); 2889 write_unlock(&css_set_lock);
2806 } 2890 }
2807} 2891}
@@ -2907,6 +2991,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2907 mutex_unlock(&cgroup_mutex); 2991 mutex_unlock(&cgroup_mutex);
2908 return 0; 2992 return 0;
2909 } 2993 }
2994 task_lock(tsk);
2910 cg = tsk->cgroups; 2995 cg = tsk->cgroups;
2911 parent = task_cgroup(tsk, subsys->subsys_id); 2996 parent = task_cgroup(tsk, subsys->subsys_id);
2912 2997
@@ -2919,6 +3004,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2919 3004
2920 /* Keep the cgroup alive */ 3005 /* Keep the cgroup alive */
2921 get_css_set(cg); 3006 get_css_set(cg);
3007 task_unlock(tsk);
2922 mutex_unlock(&cgroup_mutex); 3008 mutex_unlock(&cgroup_mutex);
2923 3009
2924 /* Now do the VFS work to create a cgroup */ 3010 /* Now do the VFS work to create a cgroup */
@@ -2937,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2937 } 3023 }
2938 3024
2939 /* Create the cgroup directory, which also creates the cgroup */ 3025 /* Create the cgroup directory, which also creates the cgroup */
2940 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 3026 ret = vfs_mkdir(inode, dentry, 0755);
2941 child = __d_cgrp(dentry); 3027 child = __d_cgrp(dentry);
2942 dput(dentry); 3028 dput(dentry);
2943 if (ret) { 3029 if (ret) {
@@ -2947,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2947 goto out_release; 3033 goto out_release;
2948 } 3034 }
2949 3035
2950 if (!child) {
2951 printk(KERN_INFO
2952 "Couldn't find new cgroup %s\n", nodename);
2953 ret = -ENOMEM;
2954 goto out_release;
2955 }
2956
2957 /* The cgroup now exists. Retake cgroup_mutex and check 3036 /* The cgroup now exists. Retake cgroup_mutex and check
2958 * that we're still in the same state that we thought we 3037 * that we're still in the same state that we thought we
2959 * were. */ 3038 * were. */
@@ -3049,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
3049{ 3128{
3050 struct cgroup *cgrp = css->cgroup; 3129 struct cgroup *cgrp = css->cgroup;
3051 rcu_read_lock(); 3130 rcu_read_lock();
3052 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 3131 if ((atomic_dec_return(&css->refcnt) == 1) &&
3132 notify_on_release(cgrp)) {
3053 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3133 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3054 check_for_release(cgrp); 3134 check_for_release(cgrp);
3055 } 3135 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 345ace5117de..647c77a88fcb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -84,7 +84,7 @@ struct cpuset {
84 struct cgroup_subsys_state css; 84 struct cgroup_subsys_state css;
85 85
86 unsigned long flags; /* "unsigned long" so bitops work */ 86 unsigned long flags; /* "unsigned long" so bitops work */
87 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 89
90 struct cpuset *parent; /* my parent */ 90 struct cpuset *parent; /* my parent */
@@ -195,8 +195,6 @@ static int cpuset_mems_generation;
195 195
196static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
198 .cpus_allowed = CPU_MASK_ALL,
199 .mems_allowed = NODE_MASK_ALL,
200}; 198};
201 199
202/* 200/*
@@ -278,7 +276,7 @@ static struct file_system_type cpuset_fs_type = {
278}; 276};
279 277
280/* 278/*
281 * Return in *pmask the portion of a cpusets's cpus_allowed that 279 * Return in pmask the portion of a cpusets's cpus_allowed that
282 * are online. If none are online, walk up the cpuset hierarchy 280 * are online. If none are online, walk up the cpuset hierarchy
283 * until we find one that does have some online cpus. If we get 281 * until we find one that does have some online cpus. If we get
284 * all the way to the top and still haven't found any online cpus, 282 * all the way to the top and still haven't found any online cpus,
@@ -291,15 +289,16 @@ static struct file_system_type cpuset_fs_type = {
291 * Call with callback_mutex held. 289 * Call with callback_mutex held.
292 */ 290 */
293 291
294static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 292static void guarantee_online_cpus(const struct cpuset *cs,
293 struct cpumask *pmask)
295{ 294{
296 while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) 295 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
297 cs = cs->parent; 296 cs = cs->parent;
298 if (cs) 297 if (cs)
299 cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); 298 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
300 else 299 else
301 *pmask = cpu_online_map; 300 cpumask_copy(pmask, cpu_online_mask);
302 BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); 301 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
303} 302}
304 303
305/* 304/*
@@ -375,14 +374,9 @@ void cpuset_update_task_memory_state(void)
375 struct task_struct *tsk = current; 374 struct task_struct *tsk = current;
376 struct cpuset *cs; 375 struct cpuset *cs;
377 376
378 if (task_cs(tsk) == &top_cpuset) { 377 rcu_read_lock();
379 /* Don't need rcu for top_cpuset. It's never freed. */ 378 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
380 my_cpusets_mem_gen = top_cpuset.mems_generation; 379 rcu_read_unlock();
381 } else {
382 rcu_read_lock();
383 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
384 rcu_read_unlock();
385 }
386 380
387 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 381 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
388 mutex_lock(&callback_mutex); 382 mutex_lock(&callback_mutex);
@@ -414,12 +408,43 @@ void cpuset_update_task_memory_state(void)
414 408
415static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 409static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
416{ 410{
417 return cpus_subset(p->cpus_allowed, q->cpus_allowed) && 411 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
418 nodes_subset(p->mems_allowed, q->mems_allowed) && 412 nodes_subset(p->mems_allowed, q->mems_allowed) &&
419 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 413 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
420 is_mem_exclusive(p) <= is_mem_exclusive(q); 414 is_mem_exclusive(p) <= is_mem_exclusive(q);
421} 415}
422 416
417/**
418 * alloc_trial_cpuset - allocate a trial cpuset
419 * @cs: the cpuset that the trial cpuset duplicates
420 */
421static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
422{
423 struct cpuset *trial;
424
425 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
426 if (!trial)
427 return NULL;
428
429 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
430 kfree(trial);
431 return NULL;
432 }
433 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
434
435 return trial;
436}
437
438/**
439 * free_trial_cpuset - free the trial cpuset
440 * @trial: the trial cpuset to be freed
441 */
442static void free_trial_cpuset(struct cpuset *trial)
443{
444 free_cpumask_var(trial->cpus_allowed);
445 kfree(trial);
446}
447
423/* 448/*
424 * validate_change() - Used to validate that any proposed cpuset change 449 * validate_change() - Used to validate that any proposed cpuset change
425 * follows the structural rules for cpusets. 450 * follows the structural rules for cpusets.
@@ -469,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
469 c = cgroup_cs(cont); 494 c = cgroup_cs(cont);
470 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 495 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
471 c != cur && 496 c != cur &&
472 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 497 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
473 return -EINVAL; 498 return -EINVAL;
474 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 499 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
475 c != cur && 500 c != cur &&
@@ -479,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
479 504
480 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 505 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
481 if (cgroup_task_count(cur->css.cgroup)) { 506 if (cgroup_task_count(cur->css.cgroup)) {
482 if (cpus_empty(trial->cpus_allowed) || 507 if (cpumask_empty(trial->cpus_allowed) ||
483 nodes_empty(trial->mems_allowed)) { 508 nodes_empty(trial->mems_allowed)) {
484 return -ENOSPC; 509 return -ENOSPC;
485 } 510 }
@@ -494,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
494 */ 519 */
495static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 520static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
496{ 521{
497 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 522 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
498} 523}
499 524
500static void 525static void
@@ -519,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
519 cp = list_first_entry(&q, struct cpuset, stack_list); 544 cp = list_first_entry(&q, struct cpuset, stack_list);
520 list_del(q.next); 545 list_del(q.next);
521 546
522 if (cpus_empty(cp->cpus_allowed)) 547 if (cpumask_empty(cp->cpus_allowed))
523 continue; 548 continue;
524 549
525 if (is_sched_load_balance(cp)) 550 if (is_sched_load_balance(cp))
@@ -586,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
586 * element of the partition (one sched domain) to be passed to 611 * element of the partition (one sched domain) to be passed to
587 * partition_sched_domains(). 612 * partition_sched_domains().
588 */ 613 */
589static int generate_sched_domains(cpumask_t **domains, 614/* FIXME: see the FIXME in partition_sched_domains() */
615static int generate_sched_domains(struct cpumask **domains,
590 struct sched_domain_attr **attributes) 616 struct sched_domain_attr **attributes)
591{ 617{
592 LIST_HEAD(q); /* queue of cpusets to be scanned */ 618 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -594,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains,
594 struct cpuset **csa; /* array of all cpuset ptrs */ 620 struct cpuset **csa; /* array of all cpuset ptrs */
595 int csn; /* how many cpuset ptrs in csa so far */ 621 int csn; /* how many cpuset ptrs in csa so far */
596 int i, j, k; /* indices for partition finding loops */ 622 int i, j, k; /* indices for partition finding loops */
597 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 623 struct cpumask *doms; /* resulting partition; i.e. sched domains */
598 struct sched_domain_attr *dattr; /* attributes for custom domains */ 624 struct sched_domain_attr *dattr; /* attributes for custom domains */
599 int ndoms = 0; /* number of sched domains in result */ 625 int ndoms = 0; /* number of sched domains in result */
600 int nslot; /* next empty doms[] cpumask_t slot */ 626 int nslot; /* next empty doms[] struct cpumask slot */
601 627
602 doms = NULL; 628 doms = NULL;
603 dattr = NULL; 629 dattr = NULL;
@@ -605,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains,
605 631
606 /* Special case for the 99% of systems with one, full, sched domain */ 632 /* Special case for the 99% of systems with one, full, sched domain */
607 if (is_sched_load_balance(&top_cpuset)) { 633 if (is_sched_load_balance(&top_cpuset)) {
608 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 634 doms = kmalloc(cpumask_size(), GFP_KERNEL);
609 if (!doms) 635 if (!doms)
610 goto done; 636 goto done;
611 637
@@ -614,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains,
614 *dattr = SD_ATTR_INIT; 640 *dattr = SD_ATTR_INIT;
615 update_domain_attr_tree(dattr, &top_cpuset); 641 update_domain_attr_tree(dattr, &top_cpuset);
616 } 642 }
617 *doms = top_cpuset.cpus_allowed; 643 cpumask_copy(doms, top_cpuset.cpus_allowed);
618 644
619 ndoms = 1; 645 ndoms = 1;
620 goto done; 646 goto done;
@@ -633,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains,
633 cp = list_first_entry(&q, struct cpuset, stack_list); 659 cp = list_first_entry(&q, struct cpuset, stack_list);
634 list_del(q.next); 660 list_del(q.next);
635 661
636 if (cpus_empty(cp->cpus_allowed)) 662 if (cpumask_empty(cp->cpus_allowed))
637 continue; 663 continue;
638 664
639 /* 665 /*
@@ -684,7 +710,7 @@ restart:
684 * Now we know how many domains to create. 710 * Now we know how many domains to create.
685 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 711 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
686 */ 712 */
687 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 713 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
688 if (!doms) 714 if (!doms)
689 goto done; 715 goto done;
690 716
@@ -696,7 +722,7 @@ restart:
696 722
697 for (nslot = 0, i = 0; i < csn; i++) { 723 for (nslot = 0, i = 0; i < csn; i++) {
698 struct cpuset *a = csa[i]; 724 struct cpuset *a = csa[i];
699 cpumask_t *dp; 725 struct cpumask *dp;
700 int apn = a->pn; 726 int apn = a->pn;
701 727
702 if (apn < 0) { 728 if (apn < 0) {
@@ -719,14 +745,14 @@ restart:
719 continue; 745 continue;
720 } 746 }
721 747
722 cpus_clear(*dp); 748 cpumask_clear(dp);
723 if (dattr) 749 if (dattr)
724 *(dattr + nslot) = SD_ATTR_INIT; 750 *(dattr + nslot) = SD_ATTR_INIT;
725 for (j = i; j < csn; j++) { 751 for (j = i; j < csn; j++) {
726 struct cpuset *b = csa[j]; 752 struct cpuset *b = csa[j];
727 753
728 if (apn == b->pn) { 754 if (apn == b->pn) {
729 cpus_or(*dp, *dp, b->cpus_allowed); 755 cpumask_or(dp, dp, b->cpus_allowed);
730 if (dattr) 756 if (dattr)
731 update_domain_attr_tree(dattr + nslot, b); 757 update_domain_attr_tree(dattr + nslot, b);
732 758
@@ -766,7 +792,7 @@ done:
766static void do_rebuild_sched_domains(struct work_struct *unused) 792static void do_rebuild_sched_domains(struct work_struct *unused)
767{ 793{
768 struct sched_domain_attr *attr; 794 struct sched_domain_attr *attr;
769 cpumask_t *doms; 795 struct cpumask *doms;
770 int ndoms; 796 int ndoms;
771 797
772 get_online_cpus(); 798 get_online_cpus();
@@ -835,7 +861,7 @@ void rebuild_sched_domains(void)
835static int cpuset_test_cpumask(struct task_struct *tsk, 861static int cpuset_test_cpumask(struct task_struct *tsk,
836 struct cgroup_scanner *scan) 862 struct cgroup_scanner *scan)
837{ 863{
838 return !cpus_equal(tsk->cpus_allowed, 864 return !cpumask_equal(&tsk->cpus_allowed,
839 (cgroup_cs(scan->cg))->cpus_allowed); 865 (cgroup_cs(scan->cg))->cpus_allowed);
840} 866}
841 867
@@ -853,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
853static void cpuset_change_cpumask(struct task_struct *tsk, 879static void cpuset_change_cpumask(struct task_struct *tsk,
854 struct cgroup_scanner *scan) 880 struct cgroup_scanner *scan)
855{ 881{
856 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 882 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
857} 883}
858 884
859/** 885/**
@@ -885,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
885 * @cs: the cpuset to consider 911 * @cs: the cpuset to consider
886 * @buf: buffer of cpu numbers written to this cpuset 912 * @buf: buffer of cpu numbers written to this cpuset
887 */ 913 */
888static int update_cpumask(struct cpuset *cs, const char *buf) 914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf)
889{ 916{
890 struct ptr_heap heap; 917 struct ptr_heap heap;
891 struct cpuset trialcs;
892 int retval; 918 int retval;
893 int is_load_balanced; 919 int is_load_balanced;
894 920
@@ -896,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
896 if (cs == &top_cpuset) 922 if (cs == &top_cpuset)
897 return -EACCES; 923 return -EACCES;
898 924
899 trialcs = *cs;
900
901 /* 925 /*
902 * An empty cpus_allowed is ok only if the cpuset has no tasks. 926 * An empty cpus_allowed is ok only if the cpuset has no tasks.
903 * Since cpulist_parse() fails on an empty mask, we special case 927 * Since cpulist_parse() fails on an empty mask, we special case
@@ -905,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
905 * with tasks have cpus. 929 * with tasks have cpus.
906 */ 930 */
907 if (!*buf) { 931 if (!*buf) {
908 cpus_clear(trialcs.cpus_allowed); 932 cpumask_clear(trialcs->cpus_allowed);
909 } else { 933 } else {
910 retval = cpulist_parse(buf, &trialcs.cpus_allowed); 934 retval = cpulist_parse(buf, trialcs->cpus_allowed);
911 if (retval < 0) 935 if (retval < 0)
912 return retval; 936 return retval;
913 937
914 if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) 938 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
915 return -EINVAL; 939 return -EINVAL;
916 } 940 }
917 retval = validate_change(cs, &trialcs); 941 retval = validate_change(cs, trialcs);
918 if (retval < 0) 942 if (retval < 0)
919 return retval; 943 return retval;
920 944
921 /* Nothing to do if the cpus didn't change */ 945 /* Nothing to do if the cpus didn't change */
922 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 946 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
923 return 0; 947 return 0;
924 948
925 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 949 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
926 if (retval) 950 if (retval)
927 return retval; 951 return retval;
928 952
929 is_load_balanced = is_sched_load_balance(&trialcs); 953 is_load_balanced = is_sched_load_balance(trialcs);
930 954
931 mutex_lock(&callback_mutex); 955 mutex_lock(&callback_mutex);
932 cs->cpus_allowed = trialcs.cpus_allowed; 956 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
933 mutex_unlock(&callback_mutex); 957 mutex_unlock(&callback_mutex);
934 958
935 /* 959 /*
@@ -1017,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
1017 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1041 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1018 1042
1019 fudge = 10; /* spare mmarray[] slots */ 1043 fudge = 10; /* spare mmarray[] slots */
1020 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 1044 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
1021 retval = -ENOMEM; 1045 retval = -ENOMEM;
1022 1046
1023 /* 1047 /*
@@ -1104,9 +1128,9 @@ done:
1104 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1128 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1105 * their mempolicies to the cpusets new mems_allowed. 1129 * their mempolicies to the cpusets new mems_allowed.
1106 */ 1130 */
1107static int update_nodemask(struct cpuset *cs, const char *buf) 1131static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1132 const char *buf)
1108{ 1133{
1109 struct cpuset trialcs;
1110 nodemask_t oldmem; 1134 nodemask_t oldmem;
1111 int retval; 1135 int retval;
1112 1136
@@ -1117,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1117 if (cs == &top_cpuset) 1141 if (cs == &top_cpuset)
1118 return -EACCES; 1142 return -EACCES;
1119 1143
1120 trialcs = *cs;
1121
1122 /* 1144 /*
1123 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1145 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1124 * Since nodelist_parse() fails on an empty mask, we special case 1146 * Since nodelist_parse() fails on an empty mask, we special case
@@ -1126,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1126 * with tasks have memory. 1148 * with tasks have memory.
1127 */ 1149 */
1128 if (!*buf) { 1150 if (!*buf) {
1129 nodes_clear(trialcs.mems_allowed); 1151 nodes_clear(trialcs->mems_allowed);
1130 } else { 1152 } else {
1131 retval = nodelist_parse(buf, trialcs.mems_allowed); 1153 retval = nodelist_parse(buf, trialcs->mems_allowed);
1132 if (retval < 0) 1154 if (retval < 0)
1133 goto done; 1155 goto done;
1134 1156
1135 if (!nodes_subset(trialcs.mems_allowed, 1157 if (!nodes_subset(trialcs->mems_allowed,
1136 node_states[N_HIGH_MEMORY])) 1158 node_states[N_HIGH_MEMORY]))
1137 return -EINVAL; 1159 return -EINVAL;
1138 } 1160 }
1139 oldmem = cs->mems_allowed; 1161 oldmem = cs->mems_allowed;
1140 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 1162 if (nodes_equal(oldmem, trialcs->mems_allowed)) {
1141 retval = 0; /* Too easy - nothing to do */ 1163 retval = 0; /* Too easy - nothing to do */
1142 goto done; 1164 goto done;
1143 } 1165 }
1144 retval = validate_change(cs, &trialcs); 1166 retval = validate_change(cs, trialcs);
1145 if (retval < 0) 1167 if (retval < 0)
1146 goto done; 1168 goto done;
1147 1169
1148 mutex_lock(&callback_mutex); 1170 mutex_lock(&callback_mutex);
1149 cs->mems_allowed = trialcs.mems_allowed; 1171 cs->mems_allowed = trialcs->mems_allowed;
1150 cs->mems_generation = cpuset_mems_generation++; 1172 cs->mems_generation = cpuset_mems_generation++;
1151 mutex_unlock(&callback_mutex); 1173 mutex_unlock(&callback_mutex);
1152 1174
@@ -1167,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1167 1189
1168 if (val != cs->relax_domain_level) { 1190 if (val != cs->relax_domain_level) {
1169 cs->relax_domain_level = val; 1191 cs->relax_domain_level = val;
1170 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1192 if (!cpumask_empty(cs->cpus_allowed) &&
1193 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains(); 1194 async_rebuild_sched_domains();
1172 } 1195 }
1173 1196
@@ -1186,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1186static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1209static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1187 int turning_on) 1210 int turning_on)
1188{ 1211{
1189 struct cpuset trialcs; 1212 struct cpuset *trialcs;
1190 int err; 1213 int err;
1191 int balance_flag_changed; 1214 int balance_flag_changed;
1192 1215
1193 trialcs = *cs; 1216 trialcs = alloc_trial_cpuset(cs);
1217 if (!trialcs)
1218 return -ENOMEM;
1219
1194 if (turning_on) 1220 if (turning_on)
1195 set_bit(bit, &trialcs.flags); 1221 set_bit(bit, &trialcs->flags);
1196 else 1222 else
1197 clear_bit(bit, &trialcs.flags); 1223 clear_bit(bit, &trialcs->flags);
1198 1224
1199 err = validate_change(cs, &trialcs); 1225 err = validate_change(cs, trialcs);
1200 if (err < 0) 1226 if (err < 0)
1201 return err; 1227 goto out;
1202 1228
1203 balance_flag_changed = (is_sched_load_balance(cs) != 1229 balance_flag_changed = (is_sched_load_balance(cs) !=
1204 is_sched_load_balance(&trialcs)); 1230 is_sched_load_balance(trialcs));
1205 1231
1206 mutex_lock(&callback_mutex); 1232 mutex_lock(&callback_mutex);
1207 cs->flags = trialcs.flags; 1233 cs->flags = trialcs->flags;
1208 mutex_unlock(&callback_mutex); 1234 mutex_unlock(&callback_mutex);
1209 1235
1210 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) 1236 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1211 async_rebuild_sched_domains(); 1237 async_rebuild_sched_domains();
1212 1238
1213 return 0; 1239out:
1240 free_trial_cpuset(trialcs);
1241 return err;
1214} 1242}
1215 1243
1216/* 1244/*
@@ -1311,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp)
1311 return val; 1339 return val;
1312} 1340}
1313 1341
1342/* Protected by cgroup_lock */
1343static cpumask_var_t cpus_attach;
1344
1314/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1345/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1315static int cpuset_can_attach(struct cgroup_subsys *ss, 1346static int cpuset_can_attach(struct cgroup_subsys *ss,
1316 struct cgroup *cont, struct task_struct *tsk) 1347 struct cgroup *cont, struct task_struct *tsk)
1317{ 1348{
1318 struct cpuset *cs = cgroup_cs(cont); 1349 struct cpuset *cs = cgroup_cs(cont);
1350 int ret = 0;
1319 1351
1320 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1352 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1321 return -ENOSPC; 1353 return -ENOSPC;
1322 if (tsk->flags & PF_THREAD_BOUND) {
1323 cpumask_t mask;
1324 1354
1355 if (tsk->flags & PF_THREAD_BOUND) {
1325 mutex_lock(&callback_mutex); 1356 mutex_lock(&callback_mutex);
1326 mask = cs->cpus_allowed; 1357 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
1358 ret = -EINVAL;
1327 mutex_unlock(&callback_mutex); 1359 mutex_unlock(&callback_mutex);
1328 if (!cpus_equal(tsk->cpus_allowed, mask))
1329 return -EINVAL;
1330 } 1360 }
1331 1361
1332 return security_task_setscheduler(tsk, 0, NULL); 1362 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
1333} 1363}
1334 1364
1335static void cpuset_attach(struct cgroup_subsys *ss, 1365static void cpuset_attach(struct cgroup_subsys *ss,
1336 struct cgroup *cont, struct cgroup *oldcont, 1366 struct cgroup *cont, struct cgroup *oldcont,
1337 struct task_struct *tsk) 1367 struct task_struct *tsk)
1338{ 1368{
1339 cpumask_t cpus;
1340 nodemask_t from, to; 1369 nodemask_t from, to;
1341 struct mm_struct *mm; 1370 struct mm_struct *mm;
1342 struct cpuset *cs = cgroup_cs(cont); 1371 struct cpuset *cs = cgroup_cs(cont);
1343 struct cpuset *oldcs = cgroup_cs(oldcont); 1372 struct cpuset *oldcs = cgroup_cs(oldcont);
1344 int err; 1373 int err;
1345 1374
1346 mutex_lock(&callback_mutex); 1375 if (cs == &top_cpuset) {
1347 guarantee_online_cpus(cs, &cpus); 1376 cpumask_copy(cpus_attach, cpu_possible_mask);
1348 err = set_cpus_allowed_ptr(tsk, &cpus); 1377 } else {
1349 mutex_unlock(&callback_mutex); 1378 mutex_lock(&callback_mutex);
1379 guarantee_online_cpus(cs, cpus_attach);
1380 mutex_unlock(&callback_mutex);
1381 }
1382 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1350 if (err) 1383 if (err)
1351 return; 1384 return;
1352 1385
@@ -1359,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1359 cpuset_migrate_mm(mm, &from, &to); 1392 cpuset_migrate_mm(mm, &from, &to);
1360 mmput(mm); 1393 mmput(mm);
1361 } 1394 }
1362
1363} 1395}
1364 1396
1365/* The various types of files and directories in a cpuset file system */ 1397/* The various types of files and directories in a cpuset file system */
@@ -1454,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1454 const char *buf) 1486 const char *buf)
1455{ 1487{
1456 int retval = 0; 1488 int retval = 0;
1489 struct cpuset *cs = cgroup_cs(cgrp);
1490 struct cpuset *trialcs;
1457 1491
1458 if (!cgroup_lock_live_group(cgrp)) 1492 if (!cgroup_lock_live_group(cgrp))
1459 return -ENODEV; 1493 return -ENODEV;
1460 1494
1495 trialcs = alloc_trial_cpuset(cs);
1496 if (!trialcs)
1497 return -ENOMEM;
1498
1461 switch (cft->private) { 1499 switch (cft->private) {
1462 case FILE_CPULIST: 1500 case FILE_CPULIST:
1463 retval = update_cpumask(cgroup_cs(cgrp), buf); 1501 retval = update_cpumask(cs, trialcs, buf);
1464 break; 1502 break;
1465 case FILE_MEMLIST: 1503 case FILE_MEMLIST:
1466 retval = update_nodemask(cgroup_cs(cgrp), buf); 1504 retval = update_nodemask(cs, trialcs, buf);
1467 break; 1505 break;
1468 default: 1506 default:
1469 retval = -EINVAL; 1507 retval = -EINVAL;
1470 break; 1508 break;
1471 } 1509 }
1510
1511 free_trial_cpuset(trialcs);
1472 cgroup_unlock(); 1512 cgroup_unlock();
1473 return retval; 1513 return retval;
1474} 1514}
@@ -1487,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1487 1527
1488static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1528static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1489{ 1529{
1490 cpumask_t mask; 1530 int ret;
1491 1531
1492 mutex_lock(&callback_mutex); 1532 mutex_lock(&callback_mutex);
1493 mask = cs->cpus_allowed; 1533 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1494 mutex_unlock(&callback_mutex); 1534 mutex_unlock(&callback_mutex);
1495 1535
1496 return cpulist_scnprintf(page, PAGE_SIZE, &mask); 1536 return ret;
1497} 1537}
1498 1538
1499static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1539static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -1729,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1729 parent_cs = cgroup_cs(parent); 1769 parent_cs = cgroup_cs(parent);
1730 1770
1731 cs->mems_allowed = parent_cs->mems_allowed; 1771 cs->mems_allowed = parent_cs->mems_allowed;
1732 cs->cpus_allowed = parent_cs->cpus_allowed; 1772 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1733 return; 1773 return;
1734} 1774}
1735 1775
@@ -1755,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create(
1755 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1795 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1756 if (!cs) 1796 if (!cs)
1757 return ERR_PTR(-ENOMEM); 1797 return ERR_PTR(-ENOMEM);
1798 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1799 kfree(cs);
1800 return ERR_PTR(-ENOMEM);
1801 }
1758 1802
1759 cpuset_update_task_memory_state(); 1803 cpuset_update_task_memory_state();
1760 cs->flags = 0; 1804 cs->flags = 0;
@@ -1763,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create(
1763 if (is_spread_slab(parent)) 1807 if (is_spread_slab(parent))
1764 set_bit(CS_SPREAD_SLAB, &cs->flags); 1808 set_bit(CS_SPREAD_SLAB, &cs->flags);
1765 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1809 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1766 cpus_clear(cs->cpus_allowed); 1810 cpumask_clear(cs->cpus_allowed);
1767 nodes_clear(cs->mems_allowed); 1811 nodes_clear(cs->mems_allowed);
1768 cs->mems_generation = cpuset_mems_generation++; 1812 cs->mems_generation = cpuset_mems_generation++;
1769 fmeter_init(&cs->fmeter); 1813 fmeter_init(&cs->fmeter);
@@ -1790,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1790 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1834 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1791 1835
1792 number_of_cpusets--; 1836 number_of_cpusets--;
1837 free_cpumask_var(cs->cpus_allowed);
1793 kfree(cs); 1838 kfree(cs);
1794} 1839}
1795 1840
@@ -1813,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = {
1813 1858
1814int __init cpuset_init_early(void) 1859int __init cpuset_init_early(void)
1815{ 1860{
1861 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1862
1816 top_cpuset.mems_generation = cpuset_mems_generation++; 1863 top_cpuset.mems_generation = cpuset_mems_generation++;
1817 return 0; 1864 return 0;
1818} 1865}
@@ -1828,7 +1875,7 @@ int __init cpuset_init(void)
1828{ 1875{
1829 int err = 0; 1876 int err = 0;
1830 1877
1831 cpus_setall(top_cpuset.cpus_allowed); 1878 cpumask_setall(top_cpuset.cpus_allowed);
1832 nodes_setall(top_cpuset.mems_allowed); 1879 nodes_setall(top_cpuset.mems_allowed);
1833 1880
1834 fmeter_init(&top_cpuset.fmeter); 1881 fmeter_init(&top_cpuset.fmeter);
@@ -1840,6 +1887,9 @@ int __init cpuset_init(void)
1840 if (err < 0) 1887 if (err < 0)
1841 return err; 1888 return err;
1842 1889
1890 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1891 BUG();
1892
1843 number_of_cpusets = 1; 1893 number_of_cpusets = 1;
1844 return 0; 1894 return 0;
1845} 1895}
@@ -1914,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1914 * has online cpus, so can't be empty). 1964 * has online cpus, so can't be empty).
1915 */ 1965 */
1916 parent = cs->parent; 1966 parent = cs->parent;
1917 while (cpus_empty(parent->cpus_allowed) || 1967 while (cpumask_empty(parent->cpus_allowed) ||
1918 nodes_empty(parent->mems_allowed)) 1968 nodes_empty(parent->mems_allowed))
1919 parent = parent->parent; 1969 parent = parent->parent;
1920 1970
@@ -1955,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1955 } 2005 }
1956 2006
1957 /* Continue past cpusets with all cpus, mems online */ 2007 /* Continue past cpusets with all cpus, mems online */
1958 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 2008 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
1959 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2009 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1960 continue; 2010 continue;
1961 2011
@@ -1963,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1963 2013
1964 /* Remove offline cpus and mems from this cpuset. */ 2014 /* Remove offline cpus and mems from this cpuset. */
1965 mutex_lock(&callback_mutex); 2015 mutex_lock(&callback_mutex);
1966 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 2016 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2017 cpu_online_mask);
1967 nodes_and(cp->mems_allowed, cp->mems_allowed, 2018 nodes_and(cp->mems_allowed, cp->mems_allowed,
1968 node_states[N_HIGH_MEMORY]); 2019 node_states[N_HIGH_MEMORY]);
1969 mutex_unlock(&callback_mutex); 2020 mutex_unlock(&callback_mutex);
1970 2021
1971 /* Move tasks from the empty cpuset to a parent */ 2022 /* Move tasks from the empty cpuset to a parent */
1972 if (cpus_empty(cp->cpus_allowed) || 2023 if (cpumask_empty(cp->cpus_allowed) ||
1973 nodes_empty(cp->mems_allowed)) 2024 nodes_empty(cp->mems_allowed))
1974 remove_tasks_in_empty_cpuset(cp); 2025 remove_tasks_in_empty_cpuset(cp);
1975 else { 2026 else {
@@ -1995,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1995 unsigned long phase, void *unused_cpu) 2046 unsigned long phase, void *unused_cpu)
1996{ 2047{
1997 struct sched_domain_attr *attr; 2048 struct sched_domain_attr *attr;
1998 cpumask_t *doms; 2049 struct cpumask *doms;
1999 int ndoms; 2050 int ndoms;
2000 2051
2001 switch (phase) { 2052 switch (phase) {
@@ -2010,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2010 } 2061 }
2011 2062
2012 cgroup_lock(); 2063 cgroup_lock();
2013 top_cpuset.cpus_allowed = cpu_online_map; 2064 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2014 scan_for_empty_cpusets(&top_cpuset); 2065 scan_for_empty_cpusets(&top_cpuset);
2015 ndoms = generate_sched_domains(&doms, &attr); 2066 ndoms = generate_sched_domains(&doms, &attr);
2016 cgroup_unlock(); 2067 cgroup_unlock();
@@ -2055,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2055 2106
2056void __init cpuset_init_smp(void) 2107void __init cpuset_init_smp(void)
2057{ 2108{
2058 top_cpuset.cpus_allowed = cpu_online_map; 2109 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2059 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2110 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2060 2111
2061 hotcpu_notifier(cpuset_track_online_cpus, 0); 2112 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2065,15 +2116,15 @@ void __init cpuset_init_smp(void)
2065/** 2116/**
2066 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2117 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
2067 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2118 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2068 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2119 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
2069 * 2120 *
2070 * Description: Returns the cpumask_t cpus_allowed of the cpuset 2121 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2071 * attached to the specified @tsk. Guaranteed to return some non-empty 2122 * attached to the specified @tsk. Guaranteed to return some non-empty
2072 * subset of cpu_online_map, even if this means going outside the 2123 * subset of cpu_online_map, even if this means going outside the
2073 * tasks cpuset. 2124 * tasks cpuset.
2074 **/ 2125 **/
2075 2126
2076void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) 2127void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2077{ 2128{
2078 mutex_lock(&callback_mutex); 2129 mutex_lock(&callback_mutex);
2079 cpuset_cpus_allowed_locked(tsk, pmask); 2130 cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2084,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
2084 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 2135 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2085 * Must be called with callback_mutex held. 2136 * Must be called with callback_mutex held.
2086 **/ 2137 **/
2087void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) 2138void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2088{ 2139{
2089 task_lock(tsk); 2140 task_lock(tsk);
2090 guarantee_online_cpus(task_cs(tsk), pmask); 2141 guarantee_online_cpus(task_cs(tsk), pmask);
diff --git a/kernel/cred.c b/kernel/cred.c
index ff7bc071991c..3a039189d707 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -372,7 +372,8 @@ int commit_creds(struct cred *new)
372 old->fsuid != new->fsuid || 372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid || 373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable); 375 if (task->mm)
376 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0; 377 task->pdeath_signal = 0;
377 smp_wmb(); 378 smp_wmb();
378 } 379 }
@@ -506,6 +507,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
506 else 507 else
507 old = get_cred(&init_cred); 508 old = get_cred(&init_cred);
508 509
510 *new = *old;
509 get_uid(new->user); 511 get_uid(new->user);
510 get_group_info(new->group_info); 512 get_group_info(new->group_info);
511 513
@@ -529,6 +531,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
529 531
530error: 532error:
531 put_cred(new); 533 put_cred(new);
534 put_cred(old);
532 return NULL; 535 return NULL;
533} 536}
534EXPORT_SYMBOL(prepare_kernel_cred); 537EXPORT_SYMBOL(prepare_kernel_cred);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7b8f2a78be3d..1d68f1255dd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1126,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1126 1126
1127 if (pid != &init_struct_pid) { 1127 if (pid != &init_struct_pid) {
1128 retval = -ENOMEM; 1128 retval = -ENOMEM;
1129 pid = alloc_pid(task_active_pid_ns(p)); 1129 pid = alloc_pid(p->nsproxy->pid_ns);
1130 if (!pid) 1130 if (!pid)
1131 goto bad_fork_cleanup_io; 1131 goto bad_fork_cleanup_io;
1132 1132
1133 if (clone_flags & CLONE_NEWPID) { 1133 if (clone_flags & CLONE_NEWPID) {
1134 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1134 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1135 if (retval < 0) 1135 if (retval < 0)
1136 goto bad_fork_free_pid; 1136 goto bad_fork_free_pid;
1137 } 1137 }
@@ -1481,12 +1481,10 @@ void __init proc_caches_init(void)
1481 fs_cachep = kmem_cache_create("fs_cache", 1481 fs_cachep = kmem_cache_create("fs_cache",
1482 sizeof(struct fs_struct), 0, 1482 sizeof(struct fs_struct), 0,
1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1484 vm_area_cachep = kmem_cache_create("vm_area_struct",
1485 sizeof(struct vm_area_struct), 0,
1486 SLAB_PANIC, NULL);
1487 mm_cachep = kmem_cache_create("mm_struct", 1484 mm_cachep = kmem_cache_create("mm_struct",
1488 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1489 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1487 mmap_init();
1490} 1488}
1491 1489
1492/* 1490/*
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..1de9700f416e 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/async.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void)
34 unsigned int status; 35 unsigned int status;
35 int i; 36 int i;
36 37
38 /*
39 * quiesce the kernel, or at least the asynchronous portion
40 */
41 async_synchronize_full();
37 mutex_lock(&probing_active); 42 mutex_lock(&probing_active);
38 /* 43 /*
39 * something may have generated an irq long ago and we want to 44 * something may have generated an irq long ago and we want to
diff --git a/kernel/module.c b/kernel/module.c
index 496dcb57b608..c9332c90d5a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -50,6 +50,7 @@
50#include <asm/sections.h> 50#include <asm/sections.h>
51#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
52#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h>
53 54
54#if 0 55#if 0
55#define DEBUGP printk 56#define DEBUGP printk
@@ -816,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
816 mod->exit(); 817 mod->exit();
817 blocking_notifier_call_chain(&module_notify_list, 818 blocking_notifier_call_chain(&module_notify_list,
818 MODULE_STATE_GOING, mod); 819 MODULE_STATE_GOING, mod);
820 async_synchronize_full();
819 mutex_lock(&module_mutex); 821 mutex_lock(&module_mutex);
820 /* Store the name of the last unloaded module for diagnostic purposes */ 822 /* Store the name of the last unloaded module for diagnostic purposes */
821 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 823 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 43c2111cd54d..78bc3fdac0d2 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -13,7 +13,6 @@
13 13
14struct ns_cgroup { 14struct ns_cgroup {
15 struct cgroup_subsys_state css; 15 struct cgroup_subsys_state css;
16 spinlock_t lock;
17}; 16};
18 17
19struct cgroup_subsys ns_subsys; 18struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
84 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
85 if (!ns_cgroup) 84 if (!ns_cgroup)
86 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
87 spin_lock_init(&ns_cgroup->lock);
88 return &ns_cgroup->css; 86 return &ns_cgroup->css;
89} 87}
90 88
diff --git a/kernel/pid.c b/kernel/pid.c
index af9224cdd6c0..1b3586fe753a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,6 +474,12 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
474} 474}
475EXPORT_SYMBOL(task_session_nr_ns); 475EXPORT_SYMBOL(task_session_nr_ns);
476 476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{
479 return ns_of_pid(task_pid(tsk));
480}
481EXPORT_SYMBOL_GPL(task_active_pid_ns);
482
477/* 483/*
478 * Used by proc to find the first pid that is greater than or equal to nr. 484 * Used by proc to find the first pid that is greater than or equal to nr.
479 * 485 *
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f77d3819ef57..45e8541ab7e3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode)
258{ 258{
259 int error; 259 int error;
260 260
261 /* Free memory before shutting down devices. */ 261 error = platform_begin(platform_mode);
262 error = swsusp_shrink_memory();
263 if (error) 262 if (error)
264 return error; 263 return error;
265 264
266 error = platform_begin(platform_mode); 265 /* Free memory before shutting down devices. */
266 error = swsusp_shrink_memory();
267 if (error) 267 if (error)
268 goto Close; 268 goto Close;
269 269
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5d2ab836e998..f5fc2d7680f2 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -25,6 +25,7 @@
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/mmu_context.h> 31#include <asm/mmu_context.h>
@@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
192 return ret; 193 return ret;
193} 194}
194 195
195static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
196{
197 free_list_of_pages(ca->chain, clear_page_nosave);
198 memset(ca, 0, sizeof(struct chain_allocator));
199}
200
201/** 196/**
202 * Data types related to memory bitmaps. 197 * Data types related to memory bitmaps.
203 * 198 *
@@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
233#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 228#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
234 229
235struct bm_block { 230struct bm_block {
236 struct bm_block *next; /* next element of the list */ 231 struct list_head hook; /* hook into a list of bitmap blocks */
237 unsigned long start_pfn; /* pfn represented by the first bit */ 232 unsigned long start_pfn; /* pfn represented by the first bit */
238 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 233 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
239 unsigned long *data; /* bitmap representing pages */ 234 unsigned long *data; /* bitmap representing pages */
@@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb)
244 return bb->end_pfn - bb->start_pfn; 239 return bb->end_pfn - bb->start_pfn;
245} 240}
246 241
247struct zone_bitmap {
248 struct zone_bitmap *next; /* next element of the list */
249 unsigned long start_pfn; /* minimal pfn in this zone */
250 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
251 struct bm_block *bm_blocks; /* list of bitmap blocks */
252 struct bm_block *cur_block; /* recently used bitmap block */
253};
254
255/* strcut bm_position is used for browsing memory bitmaps */ 242/* strcut bm_position is used for browsing memory bitmaps */
256 243
257struct bm_position { 244struct bm_position {
258 struct zone_bitmap *zone_bm;
259 struct bm_block *block; 245 struct bm_block *block;
260 int bit; 246 int bit;
261}; 247};
262 248
263struct memory_bitmap { 249struct memory_bitmap {
264 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ 250 struct list_head blocks; /* list of bitmap blocks */
265 struct linked_page *p_list; /* list of pages used to store zone 251 struct linked_page *p_list; /* list of pages used to store zone
266 * bitmap objects and bitmap block 252 * bitmap objects and bitmap block
267 * objects 253 * objects
@@ -273,11 +259,7 @@ struct memory_bitmap {
273 259
274static void memory_bm_position_reset(struct memory_bitmap *bm) 260static void memory_bm_position_reset(struct memory_bitmap *bm)
275{ 261{
276 struct zone_bitmap *zone_bm; 262 bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
277
278 zone_bm = bm->zone_bm_list;
279 bm->cur.zone_bm = zone_bm;
280 bm->cur.block = zone_bm->bm_blocks;
281 bm->cur.bit = 0; 263 bm->cur.bit = 0;
282} 264}
283 265
@@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
285 267
286/** 268/**
287 * create_bm_block_list - create a list of block bitmap objects 269 * create_bm_block_list - create a list of block bitmap objects
270 * @nr_blocks - number of blocks to allocate
271 * @list - list to put the allocated blocks into
272 * @ca - chain allocator to be used for allocating memory
288 */ 273 */
289 274static int create_bm_block_list(unsigned long pages,
290static inline struct bm_block * 275 struct list_head *list,
291create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) 276 struct chain_allocator *ca)
292{ 277{
293 struct bm_block *bblist = NULL; 278 unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
294 279
295 while (nr_blocks-- > 0) { 280 while (nr_blocks-- > 0) {
296 struct bm_block *bb; 281 struct bm_block *bb;
297 282
298 bb = chain_alloc(ca, sizeof(struct bm_block)); 283 bb = chain_alloc(ca, sizeof(struct bm_block));
299 if (!bb) 284 if (!bb)
300 return NULL; 285 return -ENOMEM;
301 286 list_add(&bb->hook, list);
302 bb->next = bblist;
303 bblist = bb;
304 } 287 }
305 return bblist; 288
289 return 0;
306} 290}
307 291
292struct mem_extent {
293 struct list_head hook;
294 unsigned long start;
295 unsigned long end;
296};
297
308/** 298/**
309 * create_zone_bm_list - create a list of zone bitmap objects 299 * free_mem_extents - free a list of memory extents
300 * @list - list of extents to empty
310 */ 301 */
302static void free_mem_extents(struct list_head *list)
303{
304 struct mem_extent *ext, *aux;
311 305
312static inline struct zone_bitmap * 306 list_for_each_entry_safe(ext, aux, list, hook) {
313create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) 307 list_del(&ext->hook);
308 kfree(ext);
309 }
310}
311
312/**
313 * create_mem_extents - create a list of memory extents representing
314 * contiguous ranges of PFNs
315 * @list - list to put the extents into
316 * @gfp_mask - mask to use for memory allocations
317 */
318static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
314{ 319{
315 struct zone_bitmap *zbmlist = NULL; 320 struct zone *zone;
316 321
317 while (nr_zones-- > 0) { 322 INIT_LIST_HEAD(list);
318 struct zone_bitmap *zbm;
319 323
320 zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); 324 for_each_zone(zone) {
321 if (!zbm) 325 unsigned long zone_start, zone_end;
322 return NULL; 326 struct mem_extent *ext, *cur, *aux;
327
328 if (!populated_zone(zone))
329 continue;
323 330
324 zbm->next = zbmlist; 331 zone_start = zone->zone_start_pfn;
325 zbmlist = zbm; 332 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333
334 list_for_each_entry(ext, list, hook)
335 if (zone_start <= ext->end)
336 break;
337
338 if (&ext->hook == list || zone_end < ext->start) {
339 /* New extent is necessary */
340 struct mem_extent *new_ext;
341
342 new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
343 if (!new_ext) {
344 free_mem_extents(list);
345 return -ENOMEM;
346 }
347 new_ext->start = zone_start;
348 new_ext->end = zone_end;
349 list_add_tail(&new_ext->hook, &ext->hook);
350 continue;
351 }
352
353 /* Merge this zone's range of PFNs with the existing one */
354 if (zone_start < ext->start)
355 ext->start = zone_start;
356 if (zone_end > ext->end)
357 ext->end = zone_end;
358
359 /* More merging may be possible */
360 cur = ext;
361 list_for_each_entry_safe_continue(cur, aux, list, hook) {
362 if (zone_end < cur->start)
363 break;
364 if (zone_end < cur->end)
365 ext->end = cur->end;
366 list_del(&cur->hook);
367 kfree(cur);
368 }
326 } 369 }
327 return zbmlist; 370
371 return 0;
328} 372}
329 373
330/** 374/**
331 * memory_bm_create - allocate memory for a memory bitmap 375 * memory_bm_create - allocate memory for a memory bitmap
332 */ 376 */
333
334static int 377static int
335memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) 378memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
336{ 379{
337 struct chain_allocator ca; 380 struct chain_allocator ca;
338 struct zone *zone; 381 struct list_head mem_extents;
339 struct zone_bitmap *zone_bm; 382 struct mem_extent *ext;
340 struct bm_block *bb; 383 int error;
341 unsigned int nr;
342 384
343 chain_init(&ca, gfp_mask, safe_needed); 385 chain_init(&ca, gfp_mask, safe_needed);
386 INIT_LIST_HEAD(&bm->blocks);
344 387
345 /* Compute the number of zones */ 388 error = create_mem_extents(&mem_extents, gfp_mask);
346 nr = 0; 389 if (error)
347 for_each_zone(zone) 390 return error;
348 if (populated_zone(zone))
349 nr++;
350
351 /* Allocate the list of zones bitmap objects */
352 zone_bm = create_zone_bm_list(nr, &ca);
353 bm->zone_bm_list = zone_bm;
354 if (!zone_bm) {
355 chain_free(&ca, PG_UNSAFE_CLEAR);
356 return -ENOMEM;
357 }
358
359 /* Initialize the zone bitmap objects */
360 for_each_zone(zone) {
361 unsigned long pfn;
362 391
363 if (!populated_zone(zone)) 392 list_for_each_entry(ext, &mem_extents, hook) {
364 continue; 393 struct bm_block *bb;
394 unsigned long pfn = ext->start;
395 unsigned long pages = ext->end - ext->start;
365 396
366 zone_bm->start_pfn = zone->zone_start_pfn; 397 bb = list_entry(bm->blocks.prev, struct bm_block, hook);
367 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
368 /* Allocate the list of bitmap block objects */
369 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
370 bb = create_bm_block_list(nr, &ca);
371 zone_bm->bm_blocks = bb;
372 zone_bm->cur_block = bb;
373 if (!bb)
374 goto Free;
375 398
376 nr = zone->spanned_pages; 399 error = create_bm_block_list(pages, bm->blocks.prev, &ca);
377 pfn = zone->zone_start_pfn; 400 if (error)
378 /* Initialize the bitmap block objects */ 401 goto Error;
379 while (bb) {
380 unsigned long *ptr;
381 402
382 ptr = get_image_page(gfp_mask, safe_needed); 403 list_for_each_entry_continue(bb, &bm->blocks, hook) {
383 bb->data = ptr; 404 bb->data = get_image_page(gfp_mask, safe_needed);
384 if (!ptr) 405 if (!bb->data) {
385 goto Free; 406 error = -ENOMEM;
407 goto Error;
408 }
386 409
387 bb->start_pfn = pfn; 410 bb->start_pfn = pfn;
388 if (nr >= BM_BITS_PER_BLOCK) { 411 if (pages >= BM_BITS_PER_BLOCK) {
389 pfn += BM_BITS_PER_BLOCK; 412 pfn += BM_BITS_PER_BLOCK;
390 nr -= BM_BITS_PER_BLOCK; 413 pages -= BM_BITS_PER_BLOCK;
391 } else { 414 } else {
392 /* This is executed only once in the loop */ 415 /* This is executed only once in the loop */
393 pfn += nr; 416 pfn += pages;
394 } 417 }
395 bb->end_pfn = pfn; 418 bb->end_pfn = pfn;
396 bb = bb->next;
397 } 419 }
398 zone_bm = zone_bm->next;
399 } 420 }
421
400 bm->p_list = ca.chain; 422 bm->p_list = ca.chain;
401 memory_bm_position_reset(bm); 423 memory_bm_position_reset(bm);
402 return 0; 424 Exit:
425 free_mem_extents(&mem_extents);
426 return error;
403 427
404 Free: 428 Error:
405 bm->p_list = ca.chain; 429 bm->p_list = ca.chain;
406 memory_bm_free(bm, PG_UNSAFE_CLEAR); 430 memory_bm_free(bm, PG_UNSAFE_CLEAR);
407 return -ENOMEM; 431 goto Exit;
408} 432}
409 433
410/** 434/**
411 * memory_bm_free - free memory occupied by the memory bitmap @bm 435 * memory_bm_free - free memory occupied by the memory bitmap @bm
412 */ 436 */
413
414static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 437static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
415{ 438{
416 struct zone_bitmap *zone_bm; 439 struct bm_block *bb;
417 440
418 /* Free the list of bit blocks for each zone_bitmap object */ 441 list_for_each_entry(bb, &bm->blocks, hook)
419 zone_bm = bm->zone_bm_list; 442 if (bb->data)
420 while (zone_bm) { 443 free_image_page(bb->data, clear_nosave_free);
421 struct bm_block *bb;
422 444
423 bb = zone_bm->bm_blocks;
424 while (bb) {
425 if (bb->data)
426 free_image_page(bb->data, clear_nosave_free);
427 bb = bb->next;
428 }
429 zone_bm = zone_bm->next;
430 }
431 free_list_of_pages(bm->p_list, clear_nosave_free); 445 free_list_of_pages(bm->p_list, clear_nosave_free);
432 bm->zone_bm_list = NULL; 446
447 INIT_LIST_HEAD(&bm->blocks);
433} 448}
434 449
435/** 450/**
@@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
437 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 452 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
438 * of @bm->cur_zone_bm are updated. 453 * of @bm->cur_zone_bm are updated.
439 */ 454 */
440
441static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 455static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
442 void **addr, unsigned int *bit_nr) 456 void **addr, unsigned int *bit_nr)
443{ 457{
444 struct zone_bitmap *zone_bm;
445 struct bm_block *bb; 458 struct bm_block *bb;
446 459
447 /* Check if the pfn is from the current zone */ 460 /*
448 zone_bm = bm->cur.zone_bm; 461 * Check if the pfn corresponds to the current bitmap block and find
449 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { 462 * the block where it fits if this is not the case.
450 zone_bm = bm->zone_bm_list; 463 */
451 /* We don't assume that the zones are sorted by pfns */ 464 bb = bm->cur.block;
452 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
453 zone_bm = zone_bm->next;
454
455 if (!zone_bm)
456 return -EFAULT;
457 }
458 bm->cur.zone_bm = zone_bm;
459 }
460 /* Check if the pfn corresponds to the current bitmap block */
461 bb = zone_bm->cur_block;
462 if (pfn < bb->start_pfn) 465 if (pfn < bb->start_pfn)
463 bb = zone_bm->bm_blocks; 466 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
467 if (pfn >= bb->start_pfn)
468 break;
464 469
465 while (pfn >= bb->end_pfn) { 470 if (pfn >= bb->end_pfn)
466 bb = bb->next; 471 list_for_each_entry_continue(bb, &bm->blocks, hook)
472 if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
473 break;
467 474
468 BUG_ON(!bb); 475 if (&bb->hook == &bm->blocks)
469 } 476 return -EFAULT;
470 zone_bm->cur_block = bb; 477
478 /* The block has been found */
479 bm->cur.block = bb;
471 pfn -= bb->start_pfn; 480 pfn -= bb->start_pfn;
481 bm->cur.bit = pfn + 1;
472 *bit_nr = pfn; 482 *bit_nr = pfn;
473 *addr = bb->data; 483 *addr = bb->data;
474 return 0; 484 return 0;
@@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
519 return test_bit(bit, addr); 529 return test_bit(bit, addr);
520} 530}
521 531
532static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
533{
534 void *addr;
535 unsigned int bit;
536
537 return !memory_bm_find_bit(bm, pfn, &addr, &bit);
538}
539
522/** 540/**
523 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 541 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
524 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is 542 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
@@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
530 548
531static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 549static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
532{ 550{
533 struct zone_bitmap *zone_bm;
534 struct bm_block *bb; 551 struct bm_block *bb;
535 int bit; 552 int bit;
536 553
554 bb = bm->cur.block;
537 do { 555 do {
538 bb = bm->cur.block; 556 bit = bm->cur.bit;
539 do { 557 bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
540 bit = bm->cur.bit; 558 if (bit < bm_block_bits(bb))
541 bit = find_next_bit(bb->data, bm_block_bits(bb), bit); 559 goto Return_pfn;
542 if (bit < bm_block_bits(bb)) 560
543 goto Return_pfn; 561 bb = list_entry(bb->hook.next, struct bm_block, hook);
544 562 bm->cur.block = bb;
545 bb = bb->next; 563 bm->cur.bit = 0;
546 bm->cur.block = bb; 564 } while (&bb->hook != &bm->blocks);
547 bm->cur.bit = 0; 565
548 } while (bb);
549 zone_bm = bm->cur.zone_bm->next;
550 if (zone_bm) {
551 bm->cur.zone_bm = zone_bm;
552 bm->cur.block = zone_bm->bm_blocks;
553 bm->cur.bit = 0;
554 }
555 } while (zone_bm);
556 memory_bm_position_reset(bm); 566 memory_bm_position_reset(bm);
557 return BM_END_OF_MAP; 567 return BM_END_OF_MAP;
558 568
@@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void)
808 * We should save the page if it isn't Nosave or NosaveFree, or Reserved, 818 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
809 * and it isn't a part of a free chunk of pages. 819 * and it isn't a part of a free chunk of pages.
810 */ 820 */
811 821static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
812static struct page *saveable_highmem_page(unsigned long pfn)
813{ 822{
814 struct page *page; 823 struct page *page;
815 824
@@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
817 return NULL; 826 return NULL;
818 827
819 page = pfn_to_page(pfn); 828 page = pfn_to_page(pfn);
829 if (page_zone(page) != zone)
830 return NULL;
820 831
821 BUG_ON(!PageHighMem(page)); 832 BUG_ON(!PageHighMem(page));
822 833
@@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void)
846 mark_free_pages(zone); 857 mark_free_pages(zone);
847 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 858 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
848 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 859 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
849 if (saveable_highmem_page(pfn)) 860 if (saveable_highmem_page(zone, pfn))
850 n++; 861 n++;
851 } 862 }
852 return n; 863 return n;
853} 864}
854#else 865#else
855static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } 866static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
867{
868 return NULL;
869}
856#endif /* CONFIG_HIGHMEM */ 870#endif /* CONFIG_HIGHMEM */
857 871
858/** 872/**
@@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
863 * of pages statically defined as 'unsaveable', and it isn't a part of 877 * of pages statically defined as 'unsaveable', and it isn't a part of
864 * a free chunk of pages. 878 * a free chunk of pages.
865 */ 879 */
866 880static struct page *saveable_page(struct zone *zone, unsigned long pfn)
867static struct page *saveable_page(unsigned long pfn)
868{ 881{
869 struct page *page; 882 struct page *page;
870 883
@@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn)
872 return NULL; 885 return NULL;
873 886
874 page = pfn_to_page(pfn); 887 page = pfn_to_page(pfn);
888 if (page_zone(page) != zone)
889 return NULL;
875 890
876 BUG_ON(PageHighMem(page)); 891 BUG_ON(PageHighMem(page));
877 892
@@ -903,7 +918,7 @@ unsigned int count_data_pages(void)
903 mark_free_pages(zone); 918 mark_free_pages(zone);
904 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 919 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
905 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 920 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
906 if(saveable_page(pfn)) 921 if (saveable_page(zone, pfn))
907 n++; 922 n++;
908 } 923 }
909 return n; 924 return n;
@@ -944,7 +959,7 @@ static inline struct page *
944page_is_saveable(struct zone *zone, unsigned long pfn) 959page_is_saveable(struct zone *zone, unsigned long pfn)
945{ 960{
946 return is_highmem(zone) ? 961 return is_highmem(zone) ?
947 saveable_highmem_page(pfn) : saveable_page(pfn); 962 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
948} 963}
949 964
950static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 965static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
@@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
966 * data modified by kmap_atomic() 981 * data modified by kmap_atomic()
967 */ 982 */
968 safe_copy_page(buffer, s_page); 983 safe_copy_page(buffer, s_page);
969 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); 984 dst = kmap_atomic(d_page, KM_USER0);
970 memcpy(dst, buffer, PAGE_SIZE); 985 memcpy(dst, buffer, PAGE_SIZE);
971 kunmap_atomic(dst, KM_USER0); 986 kunmap_atomic(dst, KM_USER0);
972 } else { 987 } else {
@@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
975 } 990 }
976} 991}
977#else 992#else
978#define page_is_saveable(zone, pfn) saveable_page(pfn) 993#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
979 994
980static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 995static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
981{ 996{
@@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info)
1459 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set 1474 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
1460 * the corresponding bit in the memory bitmap @bm 1475 * the corresponding bit in the memory bitmap @bm
1461 */ 1476 */
1462 1477static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1463static inline void
1464unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1465{ 1478{
1466 int j; 1479 int j;
1467 1480
@@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1469 if (unlikely(buf[j] == BM_END_OF_MAP)) 1482 if (unlikely(buf[j] == BM_END_OF_MAP))
1470 break; 1483 break;
1471 1484
1472 memory_bm_set_bit(bm, buf[j]); 1485 if (memory_bm_pfn_present(bm, buf[j]))
1486 memory_bm_set_bit(bm, buf[j]);
1487 else
1488 return -EFAULT;
1473 } 1489 }
1490
1491 return 0;
1474} 1492}
1475 1493
1476/* List of "safe" pages that may be used to store data loaded from the suspend 1494/* List of "safe" pages that may be used to store data loaded from the suspend
@@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1608 pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); 1626 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1609 if (!pbe) { 1627 if (!pbe) {
1610 swsusp_free(); 1628 swsusp_free();
1611 return NULL; 1629 return ERR_PTR(-ENOMEM);
1612 } 1630 }
1613 pbe->orig_page = page; 1631 pbe->orig_page = page;
1614 if (safe_highmem_pages > 0) { 1632 if (safe_highmem_pages > 0) {
@@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1677static inline void * 1695static inline void *
1678get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) 1696get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1679{ 1697{
1680 return NULL; 1698 return ERR_PTR(-EINVAL);
1681} 1699}
1682 1700
1683static inline void copy_last_highmem_page(void) {} 1701static inline void copy_last_highmem_page(void) {}
@@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1788static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) 1806static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1789{ 1807{
1790 struct pbe *pbe; 1808 struct pbe *pbe;
1791 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1809 struct page *page;
1810 unsigned long pfn = memory_bm_next_pfn(bm);
1792 1811
1812 if (pfn == BM_END_OF_MAP)
1813 return ERR_PTR(-EFAULT);
1814
1815 page = pfn_to_page(pfn);
1793 if (PageHighMem(page)) 1816 if (PageHighMem(page))
1794 return get_highmem_page_buffer(page, ca); 1817 return get_highmem_page_buffer(page, ca);
1795 1818
@@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1805 pbe = chain_alloc(ca, sizeof(struct pbe)); 1828 pbe = chain_alloc(ca, sizeof(struct pbe));
1806 if (!pbe) { 1829 if (!pbe) {
1807 swsusp_free(); 1830 swsusp_free();
1808 return NULL; 1831 return ERR_PTR(-ENOMEM);
1809 } 1832 }
1810 pbe->orig_address = page_address(page); 1833 pbe->orig_address = page_address(page);
1811 pbe->address = safe_pages_list; 1834 pbe->address = safe_pages_list;
@@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1868 return error; 1891 return error;
1869 1892
1870 } else if (handle->prev <= nr_meta_pages) { 1893 } else if (handle->prev <= nr_meta_pages) {
1871 unpack_orig_pfns(buffer, &copy_bm); 1894 error = unpack_orig_pfns(buffer, &copy_bm);
1895 if (error)
1896 return error;
1897
1872 if (handle->prev == nr_meta_pages) { 1898 if (handle->prev == nr_meta_pages) {
1873 error = prepare_image(&orig_bm, &copy_bm); 1899 error = prepare_image(&orig_bm, &copy_bm);
1874 if (error) 1900 if (error)
@@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1879 restore_pblist = NULL; 1905 restore_pblist = NULL;
1880 handle->buffer = get_buffer(&orig_bm, &ca); 1906 handle->buffer = get_buffer(&orig_bm, &ca);
1881 handle->sync_read = 0; 1907 handle->sync_read = 0;
1882 if (!handle->buffer) 1908 if (IS_ERR(handle->buffer))
1883 return -ENOMEM; 1909 return PTR_ERR(handle->buffer);
1884 } 1910 }
1885 } else { 1911 } else {
1886 copy_last_highmem_page(); 1912 copy_last_highmem_page();
1887 handle->buffer = get_buffer(&orig_bm, &ca); 1913 handle->buffer = get_buffer(&orig_bm, &ca);
1914 if (IS_ERR(handle->buffer))
1915 return PTR_ERR(handle->buffer);
1888 if (handle->buffer != buffer) 1916 if (handle->buffer != buffer)
1889 handle->sync_read = 0; 1917 handle->sync_read = 0;
1890 } 1918 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 023ff2a31d89..a92c91451559 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -262,3 +262,125 @@ int swsusp_shrink_memory(void)
262 262
263 return 0; 263 return 0;
264} 264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8eca772..bf8e7534c803 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -15,10 +15,11 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17 17
18void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = (unsigned long long)LLONG_MAX;
22 counter->parent = parent;
22} 23}
23 24
24int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
34 return 0; 35 return 0;
35} 36}
36 37
37int res_counter_charge(struct res_counter *counter, unsigned long val) 38int res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at)
38{ 40{
39 int ret; 41 int ret;
40 unsigned long flags; 42 unsigned long flags;
41 43 struct res_counter *c, *u;
42 spin_lock_irqsave(&counter->lock, flags); 44
43 ret = res_counter_charge_locked(counter, val); 45 *limit_fail_at = NULL;
44 spin_unlock_irqrestore(&counter->lock, flags); 46 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val);
50 spin_unlock(&c->lock);
51 if (ret < 0) {
52 *limit_fail_at = c;
53 goto undo;
54 }
55 }
56 ret = 0;
57 goto done;
58undo:
59 for (u = counter; u != c; u = u->parent) {
60 spin_lock(&u->lock);
61 res_counter_uncharge_locked(u, val);
62 spin_unlock(&u->lock);
63 }
64done:
65 local_irq_restore(flags);
45 return ret; 66 return ret;
46} 67}
47 68
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
56void res_counter_uncharge(struct res_counter *counter, unsigned long val) 77void res_counter_uncharge(struct res_counter *counter, unsigned long val)
57{ 78{
58 unsigned long flags; 79 unsigned long flags;
80 struct res_counter *c;
59 81
60 spin_lock_irqsave(&counter->lock, flags); 82 local_irq_save(flags);
61 res_counter_uncharge_locked(counter, val); 83 for (c = counter; c != NULL; c = c->parent) {
62 spin_unlock_irqrestore(&counter->lock, flags); 84 spin_lock(&c->lock);
85 res_counter_uncharge_locked(c, val);
86 spin_unlock(&c->lock);
87 }
88 local_irq_restore(flags);
63} 89}
64 90
65 91
diff --git a/kernel/resource.c b/kernel/resource.c
index e633106b12f6..ca6a1536b205 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res)
623 */ 623 */
624struct resource * __request_region(struct resource *parent, 624struct resource * __request_region(struct resource *parent,
625 resource_size_t start, resource_size_t n, 625 resource_size_t start, resource_size_t n,
626 const char *name) 626 const char *name, int flags)
627{ 627{
628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
629 629
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent,
634 res->start = start; 634 res->start = start;
635 res->end = start + n - 1; 635 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY; 636 res->flags = IORESOURCE_BUSY;
637 res->flags |= flags;
637 638
638 write_lock(&resource_lock); 639 write_lock(&resource_lock);
639 640
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start,
679{ 680{
680 struct resource * res; 681 struct resource * res;
681 682
682 res = __request_region(parent, start, n, "check-region"); 683 res = __request_region(parent, start, n, "check-region", 0);
683 if (!res) 684 if (!res)
684 return -EBUSY; 685 return -EBUSY;
685 686
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev,
776 dr->start = start; 777 dr->start = start;
777 dr->n = n; 778 dr->n = n;
778 779
779 res = __request_region(parent, start, n, name); 780 res = __request_region(parent, start, n, name, 0);
780 if (res) 781 if (res)
781 devres_add(dev, dr); 782 devres_add(dev, dr);
782 else 783 else
@@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
876 877
877 return err; 878 return err;
878} 879}
880
881#ifdef CONFIG_STRICT_DEVMEM
882static int strict_iomem_checks = 1;
883#else
884static int strict_iomem_checks;
885#endif
886
887/*
888 * check if an address is reserved in the iomem resource tree
889 * returns 1 if reserved, 0 if not reserved.
890 */
891int iomem_is_exclusive(u64 addr)
892{
893 struct resource *p = &iomem_resource;
894 int err = 0;
895 loff_t l;
896 int size = PAGE_SIZE;
897
898 if (!strict_iomem_checks)
899 return 0;
900
901 addr = addr & PAGE_MASK;
902
903 read_lock(&resource_lock);
904 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
905 /*
906 * We can probably skip the resources without
907 * IORESOURCE_IO attribute?
908 */
909 if (p->start >= addr + size)
910 break;
911 if (p->end < addr)
912 continue;
913 if (p->flags & IORESOURCE_BUSY &&
914 p->flags & IORESOURCE_EXCLUSIVE) {
915 err = 1;
916 break;
917 }
918 }
919 read_unlock(&resource_lock);
920
921 return err;
922}
923
924static int __init strict_iomem(char *str)
925{
926 if (strstr(str, "relaxed"))
927 strict_iomem_checks = 0;
928 if (strstr(str, "strict"))
929 strict_iomem_checks = 1;
930 return 1;
931}
932
933__setup("iomem=", strict_iomem);
diff --git a/kernel/sched.c b/kernel/sched.c
index 2e3545f57e77..deb5ac8c12f3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3728,8 +3728,13 @@ redo:
3728 } 3728 }
3729 3729
3730 double_unlock_balance(this_rq, busiest); 3730 double_unlock_balance(this_rq, busiest);
3731 /*
3732 * Should not call ttwu while holding a rq->lock
3733 */
3734 spin_unlock(&this_rq->lock);
3731 if (active_balance) 3735 if (active_balance)
3732 wake_up_process(busiest->migration_thread); 3736 wake_up_process(busiest->migration_thread);
3737 spin_lock(&this_rq->lock);
3733 3738
3734 } else 3739 } else
3735 sd->nr_balance_failed = 0; 3740 sd->nr_balance_failed = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e0c0b4bc3f08..8e1352c75557 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1617,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1617 } 1617 }
1618} 1618}
1619 1619
1620#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1621
1622/* 1620/*
1623 * Share the fairness runtime between parent and child, thus the 1621 * Share the fairness runtime between parent and child, thus the
1624 * total amount of pressure for CPU stays equal - new tasks 1622 * total amount of pressure for CPU stays equal - new tasks
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 92f6e5bc3c24..89d74436318c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction;
82extern int compat_log; 82extern int compat_log;
83extern int latencytop_enabled; 83extern int latencytop_enabled;
84extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
85#ifndef CONFIG_MMU
86extern int sysctl_nr_trim_pages;
87#endif
85#ifdef CONFIG_RCU_TORTURE_TEST 88#ifdef CONFIG_RCU_TORTURE_TEST
86extern int rcutorture_runnable; 89extern int rcutorture_runnable;
87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 90#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = {
1102 .mode = 0644, 1105 .mode = 0644,
1103 .proc_handler = &proc_dointvec 1106 .proc_handler = &proc_dointvec
1104 }, 1107 },
1108#else
1109 {
1110 .ctl_name = CTL_UNNUMBERED,
1111 .procname = "nr_trim_pages",
1112 .data = &sysctl_nr_trim_pages,
1113 .maxlen = sizeof(sysctl_nr_trim_pages),
1114 .mode = 0644,
1115 .proc_handler = &proc_dointvec_minmax,
1116 .strategy = &sysctl_intvec,
1117 .extra1 = &zero,
1118 },
1105#endif 1119#endif
1106 { 1120 {
1107 .ctl_name = VM_LAPTOP_MODE, 1121 .ctl_name = VM_LAPTOP_MODE,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a9d9760dc7b6..8b0daf0662ef 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event)
168 */ 168 */
169unsigned ring_buffer_event_length(struct ring_buffer_event *event) 169unsigned ring_buffer_event_length(struct ring_buffer_event *event)
170{ 170{
171 return rb_event_length(event); 171 unsigned length = rb_event_length(event);
172 if (event->type != RINGBUF_TYPE_DATA)
173 return length;
174 length -= RB_EVNT_HDR_SIZE;
175 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
176 length -= sizeof(event->array[0]);
177 return length;
172} 178}
173EXPORT_SYMBOL_GPL(ring_buffer_event_length); 179EXPORT_SYMBOL_GPL(ring_buffer_event_length);
174 180