path: root/kernel
author    Ingo Molnar <mingo@elte.hu>  2009-01-10 21:43:52 -0500
committer Ingo Molnar <mingo@elte.hu>  2009-01-10 21:43:52 -0500
commit    99cd7074891f87c49660e3b2880564324a4733ac (patch)
tree      903d2665bcb445f1f265d1adf7a99f265bcefc15 /kernel
parent    e8a9cbf6ae620d9e5ba9cb42001c033287a284a3 (diff)
parent    c59765042f53a79a7a65585042ff463b69cb248c (diff)
Merge commit 'v2.6.29-rc1' into tracing/urgent
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            |    3
-rw-r--r--  kernel/async.c             |  335
-rw-r--r--  kernel/capability.c        |    2
-rw-r--r--  kernel/cgroup.c            |  309
-rw-r--r--  kernel/compat.c            |    5
-rw-r--r--  kernel/cpu.c               |    6
-rw-r--r--  kernel/cpuset.c            |  285
-rw-r--r--  kernel/cred.c              |    5
-rw-r--r--  kernel/dma-coherent.c      |   42
-rw-r--r--  kernel/exit.c              |   21
-rw-r--r--  kernel/fork.c              |   25
-rw-r--r--  kernel/futex.c             |   72
-rw-r--r--  kernel/hrtimer.c           |  142
-rw-r--r--  kernel/irq/autoprobe.c     |    5
-rw-r--r--  kernel/kmod.c              |    4
-rw-r--r--  kernel/kprobes.c           |  281
-rw-r--r--  kernel/ksysfs.c            |    4
-rw-r--r--  kernel/module.c            |   35
-rw-r--r--  kernel/ns_cgroup.c         |    2
-rw-r--r--  kernel/panic.c             |    2
-rw-r--r--  kernel/pid.c               |    8
-rw-r--r--  kernel/power/disk.c        |    6
-rw-r--r--  kernel/power/main.c        |    6
-rw-r--r--  kernel/power/snapshot.c    |  370
-rw-r--r--  kernel/power/swsusp.c      |  122
-rw-r--r--  kernel/printk.c            |    2
-rw-r--r--  kernel/profile.c           |    1
-rw-r--r--  kernel/rcupdate.c          |   11
-rw-r--r--  kernel/rcupreempt.c        |   11
-rw-r--r--  kernel/rcutorture.c        |   18
-rw-r--r--  kernel/rcutree.c           |   13
-rw-r--r--  kernel/res_counter.c       |   44
-rw-r--r--  kernel/resource.c          |   61
-rw-r--r--  kernel/sched.c             |   23
-rw-r--r--  kernel/sched_clock.c       |    5
-rw-r--r--  kernel/sched_cpupri.c      |    2
-rw-r--r--  kernel/sched_fair.c        |   32
-rw-r--r--  kernel/signal.c            |    3
-rw-r--r--  kernel/sys.c               |    4
-rw-r--r--  kernel/sysctl.c            |   41
-rw-r--r--  kernel/test_kprobes.c      |  210
-rw-r--r--  kernel/time.c              |    4
-rw-r--r--  kernel/time/jiffies.c      |    2
-rw-r--r--  kernel/time/timekeeping.c  |    7
-rw-r--r--  kernel/trace/ring_buffer.c |    8
-rw-r--r--  kernel/tsacct.c            |    4
46 files changed, 1821 insertions, 782 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3365c0..2921d90ce32f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+	    async.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000000..f286e9f2b736
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,335 @@
1/*
2 * async.c: Asynchronous function calls for boot performance
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13
14/*
15
16Goals and Theory of Operation
17
18The primary goal of this feature is to reduce kernel boot time, by
19running the various independent hardware delays and discovery operations
20decoupled from each other rather than strictly serialized.
21
22More specifically, the asynchronous function call concept allows
23certain operations (primarily during system boot) to happen
24asynchronously, out of order, while these operations still
25have their externally visible parts happen sequentially and in-order.
26(not unlike how out-of-order CPUs retire their instructions in order)
27
28Key to the asynchronous function call implementation is the concept of
29a "sequence cookie" (which, although it has an abstracted type, can be
30thought of as a monotonically incrementing number).
31
32The async core will assign each scheduled event such a sequence cookie and
33pass this to the called functions.
34
35Before doing a globally visible operation, such as registering device
36numbers, the asynchronously called function should call the
37async_synchronize_cookie() function and pass in its own cookie. The
38async_synchronize_cookie() function will make sure that all asynchronous
39operations that were scheduled prior to the operation corresponding with the
40cookie have completed.
41
42Subsystem/driver initialization code that schedules asynchronous probe
43functions, but which shares global resources with other drivers/subsystems
44that do not use the asynchronous call feature, needs to do a full
45synchronization with the async_synchronize_full() function before returning
46from its init function. This is to maintain strict ordering between the
47asynchronous and synchronous parts of the kernel.
48
49*/
50
51#include <linux/async.h>
52#include <linux/module.h>
53#include <linux/wait.h>
54#include <linux/sched.h>
55#include <linux/init.h>
56#include <linux/kthread.h>
57#include <asm/atomic.h>
58
59static async_cookie_t next_cookie = 1;
60
61#define MAX_THREADS 256
62#define MAX_WORK 32768
63
64static LIST_HEAD(async_pending);
65static LIST_HEAD(async_running);
66static DEFINE_SPINLOCK(async_lock);
67
68static int async_enabled = 0;
69
70struct async_entry {
71 struct list_head list;
72 async_cookie_t cookie;
73 async_func_ptr *func;
74 void *data;
75 struct list_head *running;
76};
77
78static DECLARE_WAIT_QUEUE_HEAD(async_done);
79static DECLARE_WAIT_QUEUE_HEAD(async_new);
80
81static atomic_t entry_count;
82static atomic_t thread_count;
83
84extern int initcall_debug;
85
86
87/*
88 * MUST be called with the lock held!
89 */
90static async_cookie_t __lowest_in_progress(struct list_head *running)
91{
92 struct async_entry *entry;
93 if (!list_empty(&async_pending)) {
94 entry = list_first_entry(&async_pending,
95 struct async_entry, list);
96 return entry->cookie;
97 } else if (!list_empty(running)) {
98 entry = list_first_entry(running,
99 struct async_entry, list);
100 return entry->cookie;
101 } else {
102 /* nothing in progress... next_cookie is "infinity" */
103 return next_cookie;
104 }
105
106}
107/*
108 * pick the first pending entry and run it
109 */
110static void run_one_entry(void)
111{
112 unsigned long flags;
113 struct async_entry *entry;
114 ktime_t calltime, delta, rettime;
115
116 /* 1) pick one task from the pending queue */
117
118 spin_lock_irqsave(&async_lock, flags);
119 if (list_empty(&async_pending))
120 goto out;
121 entry = list_first_entry(&async_pending, struct async_entry, list);
122
123 /* 2) move it to the running queue */
124 list_del(&entry->list);
125 list_add_tail(&entry->list, &async_running);
126 spin_unlock_irqrestore(&async_lock, flags);
127
128 /* 3) run it (and print duration)*/
129 if (initcall_debug && system_state == SYSTEM_BOOTING) {
130 printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
131 calltime = ktime_get();
132 }
133 entry->func(entry->data, entry->cookie);
134 if (initcall_debug && system_state == SYSTEM_BOOTING) {
135 rettime = ktime_get();
136 delta = ktime_sub(rettime, calltime);
137 printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
138 entry->func, ktime_to_ns(delta) >> 10);
139 }
140
141 /* 4) remove it from the running queue */
142 spin_lock_irqsave(&async_lock, flags);
143 list_del(&entry->list);
144
145 /* 5) free the entry */
146 kfree(entry);
147 atomic_dec(&entry_count);
148
149 spin_unlock_irqrestore(&async_lock, flags);
150
151 /* 6) wake up any waiters. */
152 wake_up(&async_done);
153 return;
154
155out:
156 spin_unlock_irqrestore(&async_lock, flags);
157}
158
159
160static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
161{
162 struct async_entry *entry;
163 unsigned long flags;
164 async_cookie_t newcookie;
165
166
167 /* allow irq-off callers */
168 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
169
170 /*
171 * If we're out of memory or if there's too much work
172 * pending already, we execute synchronously.
173 */
174 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
175 kfree(entry);
176 spin_lock_irqsave(&async_lock, flags);
177 newcookie = next_cookie++;
178 spin_unlock_irqrestore(&async_lock, flags);
179
180 /* low on memory.. run synchronously */
181 ptr(data, newcookie);
182 return newcookie;
183 }
184 entry->func = ptr;
185 entry->data = data;
186 entry->running = running;
187
188 spin_lock_irqsave(&async_lock, flags);
189 newcookie = entry->cookie = next_cookie++;
190 list_add_tail(&entry->list, &async_pending);
191 atomic_inc(&entry_count);
192 spin_unlock_irqrestore(&async_lock, flags);
193 wake_up(&async_new);
194 return newcookie;
195}
196
197async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
198{
199 return __async_schedule(ptr, data, &async_pending);
200}
201EXPORT_SYMBOL_GPL(async_schedule);
202
203async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
204{
205 return __async_schedule(ptr, data, running);
206}
207EXPORT_SYMBOL_GPL(async_schedule_special);
208
209void async_synchronize_full(void)
210{
211 do {
212 async_synchronize_cookie(next_cookie);
213 } while (!list_empty(&async_running) || !list_empty(&async_pending));
214}
215EXPORT_SYMBOL_GPL(async_synchronize_full);
216
217void async_synchronize_full_special(struct list_head *list)
218{
219 async_synchronize_cookie_special(next_cookie, list);
220}
221EXPORT_SYMBOL_GPL(async_synchronize_full_special);
222
223void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
224{
225 ktime_t starttime, delta, endtime;
226
227 if (initcall_debug && system_state == SYSTEM_BOOTING) {
228 printk("async_waiting @ %i\n", task_pid_nr(current));
229 starttime = ktime_get();
230 }
231
232 wait_event(async_done, __lowest_in_progress(running) >= cookie);
233
234 if (initcall_debug && system_state == SYSTEM_BOOTING) {
235 endtime = ktime_get();
236 delta = ktime_sub(endtime, starttime);
237
238 printk("async_continuing @ %i after %lli usec\n",
239 task_pid_nr(current), ktime_to_ns(delta) >> 10);
240 }
241}
242EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
243
244void async_synchronize_cookie(async_cookie_t cookie)
245{
246 async_synchronize_cookie_special(cookie, &async_running);
247}
248EXPORT_SYMBOL_GPL(async_synchronize_cookie);
249
250
251static int async_thread(void *unused)
252{
253 DECLARE_WAITQUEUE(wq, current);
254 add_wait_queue(&async_new, &wq);
255
256 while (!kthread_should_stop()) {
257 int ret = HZ;
258 set_current_state(TASK_INTERRUPTIBLE);
259 /*
260 * check the list head without lock.. false positives
261 * are dealt with inside run_one_entry() while holding
262 * the lock.
263 */
264 rmb();
265 if (!list_empty(&async_pending))
266 run_one_entry();
267 else
268 ret = schedule_timeout(HZ);
269
270 if (ret == 0) {
271 /*
272 * we timed out; this means we as a thread are redundant.
273 * we sign off and die, but to avoid any races there
274 * is a last-straw check to see if work snuck in.
275 */
276 atomic_dec(&thread_count);
277 wmb(); /* manager must see our departure first */
278 if (list_empty(&async_pending))
279 break;
280 /*
281 * woops work came in between us timing out and us
282 * signing off; we need to stay alive and keep working.
283 */
284 atomic_inc(&thread_count);
285 }
286 }
287 remove_wait_queue(&async_new, &wq);
288
289 return 0;
290}
291
292static int async_manager_thread(void *unused)
293{
294 DECLARE_WAITQUEUE(wq, current);
295 add_wait_queue(&async_new, &wq);
296
297 while (!kthread_should_stop()) {
298 int tc, ec;
299
300 set_current_state(TASK_INTERRUPTIBLE);
301
302 tc = atomic_read(&thread_count);
303 rmb();
304 ec = atomic_read(&entry_count);
305
306 while (tc < ec && tc < MAX_THREADS) {
307 kthread_run(async_thread, NULL, "async/%i", tc);
308 atomic_inc(&thread_count);
309 tc++;
310 }
311
312 schedule();
313 }
314 remove_wait_queue(&async_new, &wq);
315
316 return 0;
317}
318
319static int __init async_init(void)
320{
321 if (async_enabled)
322 kthread_run(async_manager_thread, NULL, "async/mgr");
323 return 0;
324}
325
326static int __init setup_async(char *str)
327{
328 async_enabled = 1;
329 return 1;
330}
331
332__setup("fastboot", setup_async);
333
334
335core_initcall(async_init);
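
The comment block at the top of async.c describes the API contract, but the file itself contains no caller. As a rough illustration of the intended usage, here is a minimal sketch of how an init path might drive it; the probe function, device object, and the two helpers it calls are hypothetical, not part of this patch:

#include <linux/async.h>
#include <linux/init.h>

/* hypothetical device state and helpers, for illustration only */
struct example_device { int id; };
static struct example_device example_device;
static void example_slow_hardware_scan(struct example_device *dev) { /* ... */ }
static void example_register_device_numbers(struct example_device *dev) { /* ... */ }

static void example_probe(void *data, async_cookie_t cookie)
{
	struct example_device *dev = data;

	/* slow, independent work may run out of order with other probes */
	example_slow_hardware_scan(dev);

	/* before the externally visible step, wait for all async work
	 * scheduled earlier so device numbering still appears in order */
	async_synchronize_cookie(cookie);
	example_register_device_numbers(dev);
}

static int __init example_init(void)
{
	async_schedule(example_probe, &example_device);

	/* a subsystem that shares global state with non-async code would
	 * instead drain everything before returning from its initcall */
	async_synchronize_full();
	return 0;
}
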
diff --git a/kernel/capability.c b/kernel/capability.c
index c598d9d5be4f..688926e496be 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
 		BUG();
 	}
 
-	if (has_capability(current, cap)) {
+	if (security_capable(cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
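
For context, the hunk above only swaps the permission check inside capable(); a sketch of the full function after the change (the surrounding lines are reconstructed from the 2.6.29-era source and may differ in detail) looks like this:

int capable(int cap)
{
	if (unlikely(!cap_valid(cap))) {
		printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
		BUG();
	}

	/* consult the security/commoncap code rather than reading the
	 * task's capability bits directly */
	if (security_capable(cap) == 0) {
		current->flags |= PF_SUPERPRIV;
		return 1;
	}
	return 0;
}
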
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 87bb0258fd27..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
84 /* Tracks how many cgroups are currently defined in hierarchy.*/ 84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups; 85 int number_of_cgroups;
86 86
87 /* A list running through the mounted hierarchies */ 87 /* A list running through the active hierarchies */
88 struct list_head root_list; 88 struct list_head root_list;
89 89
90 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
@@ -116,7 +116,6 @@ static int root_count;
116 * be called. 116 * be called.
117 */ 117 */
118static int need_forkexit_callback __read_mostly; 118static int need_forkexit_callback __read_mostly;
119static int need_mm_owner_callback __read_mostly;
120 119
121/* convenient tests for these bits */ 120/* convenient tests for these bits */
122inline int cgroup_is_removed(const struct cgroup *cgrp) 121inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
149#define for_each_subsys(_root, _ss) \ 148#define for_each_subsys(_root, _ss) \
150list_for_each_entry(_ss, &_root->subsys_list, sibling) 149list_for_each_entry(_ss, &_root->subsys_list, sibling)
151 150
152/* for_each_root() allows you to iterate across the active hierarchies */ 151/* for_each_active_root() allows you to iterate across the active hierarchies */
153#define for_each_root(_root) \ 152#define for_each_active_root(_root) \
154list_for_each_entry(_root, &roots, root_list) 153list_for_each_entry(_root, &roots, root_list)
155 154
156/* the list of cgroups eligible for automatic release. Protected by 155/* the list of cgroups eligible for automatic release. Protected by
@@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
272 271
273 rcu_read_lock(); 272 rcu_read_lock();
274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
275 struct cgroup *cgrp = cg->subsys[i]->cgroup; 274 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
276 if (atomic_dec_and_test(&cgrp->count) && 275 if (atomic_dec_and_test(&cgrp->count) &&
277 notify_on_release(cgrp)) { 276 notify_on_release(cgrp)) {
278 if (taskexit) 277 if (taskexit)
@@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
385 return 0; 384 return 0;
386} 385}
387 386
387/**
388 * link_css_set - a helper function to link a css_set to a cgroup
389 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
390 * @cg: the css_set to be linked
391 * @cgrp: the destination cgroup
392 */
393static void link_css_set(struct list_head *tmp_cg_links,
394 struct css_set *cg, struct cgroup *cgrp)
395{
396 struct cg_cgroup_link *link;
397
398 BUG_ON(list_empty(tmp_cg_links));
399 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
400 cgrp_link_list);
401 link->cg = cg;
402 list_move(&link->cgrp_link_list, &cgrp->css_sets);
403 list_add(&link->cg_link_list, &cg->cg_links);
404}
405
388/* 406/*
389 * find_css_set() takes an existing cgroup group and a 407 * find_css_set() takes an existing cgroup group and a
390 * cgroup object, and returns a css_set object that's 408 * cgroup object, and returns a css_set object that's
@@ -400,7 +418,6 @@ static struct css_set *find_css_set(
400 int i; 418 int i;
401 419
402 struct list_head tmp_cg_links; 420 struct list_head tmp_cg_links;
403 struct cg_cgroup_link *link;
404 421
405 struct hlist_head *hhead; 422 struct hlist_head *hhead;
406 423
@@ -445,26 +462,11 @@ static struct css_set *find_css_set(
445 * only do it for the first subsystem in each 462 * only do it for the first subsystem in each
446 * hierarchy 463 * hierarchy
447 */ 464 */
448 if (ss->root->subsys_list.next == &ss->sibling) { 465 if (ss->root->subsys_list.next == &ss->sibling)
449 BUG_ON(list_empty(&tmp_cg_links)); 466 link_css_set(&tmp_cg_links, res, cgrp);
450 link = list_entry(tmp_cg_links.next,
451 struct cg_cgroup_link,
452 cgrp_link_list);
453 list_del(&link->cgrp_link_list);
454 list_add(&link->cgrp_link_list, &cgrp->css_sets);
455 link->cg = res;
456 list_add(&link->cg_link_list, &res->cg_links);
457 }
458 }
459 if (list_empty(&rootnode.subsys_list)) {
460 link = list_entry(tmp_cg_links.next,
461 struct cg_cgroup_link,
462 cgrp_link_list);
463 list_del(&link->cgrp_link_list);
464 list_add(&link->cgrp_link_list, &dummytop->css_sets);
465 link->cg = res;
466 list_add(&link->cg_link_list, &res->cg_links);
467 } 467 }
468 if (list_empty(&rootnode.subsys_list))
469 link_css_set(&tmp_cg_links, res, dummytop);
468 470
469 BUG_ON(!list_empty(&tmp_cg_links)); 471 BUG_ON(!list_empty(&tmp_cg_links));
470 472
@@ -587,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
587{ 589{
588 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
589 for_each_subsys(cgrp->root, ss) 591 for_each_subsys(cgrp->root, ss)
590 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 592 if (ss->pre_destroy)
591 ss->pre_destroy(ss, cgrp); 593 ss->pre_destroy(ss, cgrp);
592 return; 594 return;
593} 595}
594 596
597static void free_cgroup_rcu(struct rcu_head *obj)
598{
599 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
600
601 kfree(cgrp);
602}
603
595static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
596{ 605{
597 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -611,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
611 /* 620 /*
612 * Release the subsystem state objects. 621 * Release the subsystem state objects.
613 */ 622 */
614 for_each_subsys(cgrp->root, ss) { 623 for_each_subsys(cgrp->root, ss)
615 if (cgrp->subsys[ss->subsys_id]) 624 ss->destroy(ss, cgrp);
616 ss->destroy(ss, cgrp);
617 }
618 625
619 cgrp->root->number_of_cgroups--; 626 cgrp->root->number_of_cgroups--;
620 mutex_unlock(&cgroup_mutex); 627 mutex_unlock(&cgroup_mutex);
621 628
622 /* Drop the active superblock reference that we took when we 629 /*
623 * created the cgroup */ 630 * Drop the active superblock reference that we took when we
631 * created the cgroup
632 */
624 deactivate_super(cgrp->root->sb); 633 deactivate_super(cgrp->root->sb);
625 634
626 kfree(cgrp); 635 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
627 } 636 }
628 iput(inode); 637 iput(inode);
629} 638}
@@ -713,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
713 BUG_ON(cgrp->subsys[i]); 722 BUG_ON(cgrp->subsys[i]);
714 BUG_ON(!dummytop->subsys[i]); 723 BUG_ON(!dummytop->subsys[i]);
715 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 724 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
725 mutex_lock(&ss->hierarchy_mutex);
716 cgrp->subsys[i] = dummytop->subsys[i]; 726 cgrp->subsys[i] = dummytop->subsys[i];
717 cgrp->subsys[i]->cgroup = cgrp; 727 cgrp->subsys[i]->cgroup = cgrp;
718 list_add(&ss->sibling, &root->subsys_list); 728 list_move(&ss->sibling, &root->subsys_list);
719 rcu_assign_pointer(ss->root, root); 729 ss->root = root;
720 if (ss->bind) 730 if (ss->bind)
721 ss->bind(ss, cgrp); 731 ss->bind(ss, cgrp);
722 732 mutex_unlock(&ss->hierarchy_mutex);
723 } else if (bit & removed_bits) { 733 } else if (bit & removed_bits) {
724 /* We're removing this subsystem */ 734 /* We're removing this subsystem */
725 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 735 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
726 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 736 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
737 mutex_lock(&ss->hierarchy_mutex);
727 if (ss->bind) 738 if (ss->bind)
728 ss->bind(ss, dummytop); 739 ss->bind(ss, dummytop);
729 dummytop->subsys[i]->cgroup = dummytop; 740 dummytop->subsys[i]->cgroup = dummytop;
730 cgrp->subsys[i] = NULL; 741 cgrp->subsys[i] = NULL;
731 rcu_assign_pointer(subsys[i]->root, &rootnode); 742 subsys[i]->root = &rootnode;
732 list_del(&ss->sibling); 743 list_move(&ss->sibling, &rootnode.subsys_list);
744 mutex_unlock(&ss->hierarchy_mutex);
733 } else if (bit & final_bits) { 745 } else if (bit & final_bits) {
734 /* Subsystem state should already exist */ 746 /* Subsystem state should already exist */
735 BUG_ON(!cgrp->subsys[i]); 747 BUG_ON(!cgrp->subsys[i]);
@@ -991,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
991 root = NULL; 1003 root = NULL;
992 } else { 1004 } else {
993 /* New superblock */ 1005 /* New superblock */
994 struct cgroup *cgrp = &root->top_cgroup; 1006 struct cgroup *root_cgrp = &root->top_cgroup;
995 struct inode *inode; 1007 struct inode *inode;
996 int i; 1008 int i;
997 1009
@@ -1032,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1032 list_add(&root->root_list, &roots); 1044 list_add(&root->root_list, &roots);
1033 root_count++; 1045 root_count++;
1034 1046
1035 sb->s_root->d_fsdata = &root->top_cgroup; 1047 sb->s_root->d_fsdata = root_cgrp;
1036 root->top_cgroup.dentry = sb->s_root; 1048 root->top_cgroup.dentry = sb->s_root;
1037 1049
1038 /* Link the top cgroup in this hierarchy into all 1050 /* Link the top cgroup in this hierarchy into all
@@ -1043,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1043 struct hlist_node *node; 1055 struct hlist_node *node;
1044 struct css_set *cg; 1056 struct css_set *cg;
1045 1057
1046 hlist_for_each_entry(cg, node, hhead, hlist) { 1058 hlist_for_each_entry(cg, node, hhead, hlist)
1047 struct cg_cgroup_link *link; 1059 link_css_set(&tmp_cg_links, cg, root_cgrp);
1048
1049 BUG_ON(list_empty(&tmp_cg_links));
1050 link = list_entry(tmp_cg_links.next,
1051 struct cg_cgroup_link,
1052 cgrp_link_list);
1053 list_del(&link->cgrp_link_list);
1054 link->cg = cg;
1055 list_add(&link->cgrp_link_list,
1056 &root->top_cgroup.css_sets);
1057 list_add(&link->cg_link_list, &cg->cg_links);
1058 }
1059 } 1060 }
1060 write_unlock(&css_set_lock); 1061 write_unlock(&css_set_lock);
1061 1062
1062 free_cg_links(&tmp_cg_links); 1063 free_cg_links(&tmp_cg_links);
1063 1064
1064 BUG_ON(!list_empty(&cgrp->sibling)); 1065 BUG_ON(!list_empty(&root_cgrp->sibling));
1065 BUG_ON(!list_empty(&cgrp->children)); 1066 BUG_ON(!list_empty(&root_cgrp->children));
1066 BUG_ON(root->number_of_cgroups != 1); 1067 BUG_ON(root->number_of_cgroups != 1);
1067 1068
1068 cgroup_populate_dir(cgrp); 1069 cgroup_populate_dir(root_cgrp);
1069 mutex_unlock(&inode->i_mutex); 1070 mutex_unlock(&inode->i_mutex);
1070 mutex_unlock(&cgroup_mutex); 1071 mutex_unlock(&cgroup_mutex);
1071 } 1072 }
@@ -1114,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1114 } 1115 }
1115 write_unlock(&css_set_lock); 1116 write_unlock(&css_set_lock);
1116 1117
1117 if (!list_empty(&root->root_list)) { 1118 list_del(&root->root_list);
1118 list_del(&root->root_list); 1119 root_count--;
1119 root_count--; 1120
1120 }
1121 mutex_unlock(&cgroup_mutex); 1121 mutex_unlock(&cgroup_mutex);
1122 1122
1123 kfree(root); 1123 kfree(root);
@@ -1146,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1146 * @buf: the buffer to write the path into 1146 * @buf: the buffer to write the path into
1147 * @buflen: the length of the buffer 1147 * @buflen: the length of the buffer
1148 * 1148 *
1149 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1149 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1150 * Returns 0 on success, -errno on error. 1150 * reference. Writes path of cgroup into buf. Returns 0 on success,
1151 * -errno on error.
1151 */ 1152 */
1152int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1153{ 1154{
1154 char *start; 1155 char *start;
1156 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1155 1157
1156 if (cgrp == dummytop) { 1158 if (!dentry || cgrp == dummytop) {
1157 /* 1159 /*
1158 * Inactive subsystems have no dentry for their root 1160 * Inactive subsystems have no dentry for their root
1159 * cgroup 1161 * cgroup
@@ -1166,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1166 1168
1167 *--start = '\0'; 1169 *--start = '\0';
1168 for (;;) { 1170 for (;;) {
1169 int len = cgrp->dentry->d_name.len; 1171 int len = dentry->d_name.len;
1170 if ((start -= len) < buf) 1172 if ((start -= len) < buf)
1171 return -ENAMETOOLONG; 1173 return -ENAMETOOLONG;
1172 memcpy(start, cgrp->dentry->d_name.name, len); 1174 memcpy(start, cgrp->dentry->d_name.name, len);
1173 cgrp = cgrp->parent; 1175 cgrp = cgrp->parent;
1174 if (!cgrp) 1176 if (!cgrp)
1175 break; 1177 break;
1178 dentry = rcu_dereference(cgrp->dentry);
1176 if (!cgrp->parent) 1179 if (!cgrp->parent)
1177 continue; 1180 continue;
1178 if (--start < buf) 1181 if (--start < buf)
@@ -1217,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1217 int retval = 0; 1220 int retval = 0;
1218 struct cgroup_subsys *ss; 1221 struct cgroup_subsys *ss;
1219 struct cgroup *oldcgrp; 1222 struct cgroup *oldcgrp;
1220 struct css_set *cg = tsk->cgroups; 1223 struct css_set *cg;
1221 struct css_set *newcg; 1224 struct css_set *newcg;
1222 struct cgroupfs_root *root = cgrp->root; 1225 struct cgroupfs_root *root = cgrp->root;
1223 int subsys_id; 1226 int subsys_id;
@@ -1237,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1237 } 1240 }
1238 } 1241 }
1239 1242
1243 task_lock(tsk);
1244 cg = tsk->cgroups;
1245 get_css_set(cg);
1246 task_unlock(tsk);
1240 /* 1247 /*
1241 * Locate or allocate a new css_set for this task, 1248 * Locate or allocate a new css_set for this task,
1242 * based on its final set of cgroups 1249 * based on its final set of cgroups
1243 */ 1250 */
1244 newcg = find_css_set(cg, cgrp); 1251 newcg = find_css_set(cg, cgrp);
1252 put_css_set(cg);
1245 if (!newcg) 1253 if (!newcg)
1246 return -ENOMEM; 1254 return -ENOMEM;
1247 1255
@@ -1446,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1446 struct cftype *cft = __d_cft(file->f_dentry); 1454 struct cftype *cft = __d_cft(file->f_dentry);
1447 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1455 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1448 1456
1449 if (!cft || cgroup_is_removed(cgrp)) 1457 if (cgroup_is_removed(cgrp))
1450 return -ENODEV; 1458 return -ENODEV;
1451 if (cft->write) 1459 if (cft->write)
1452 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1460 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1491,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1491 struct cftype *cft = __d_cft(file->f_dentry); 1499 struct cftype *cft = __d_cft(file->f_dentry);
1492 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1500 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1493 1501
1494 if (!cft || cgroup_is_removed(cgrp)) 1502 if (cgroup_is_removed(cgrp))
1495 return -ENODEV; 1503 return -ENODEV;
1496 1504
1497 if (cft->read) 1505 if (cft->read)
@@ -1555,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1555 err = generic_file_open(inode, file); 1563 err = generic_file_open(inode, file);
1556 if (err) 1564 if (err)
1557 return err; 1565 return err;
1558
1559 cft = __d_cft(file->f_dentry); 1566 cft = __d_cft(file->f_dentry);
1560 if (!cft) 1567
1561 return -ENODEV;
1562 if (cft->read_map || cft->read_seq_string) { 1568 if (cft->read_map || cft->read_seq_string) {
1563 struct cgroup_seqfile_state *state = 1569 struct cgroup_seqfile_state *state =
1564 kzalloc(sizeof(*state), GFP_USER); 1570 kzalloc(sizeof(*state), GFP_USER);
@@ -1672,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1672 if (!error) { 1678 if (!error) {
1673 dentry->d_fsdata = cgrp; 1679 dentry->d_fsdata = cgrp;
1674 inc_nlink(parent->d_inode); 1680 inc_nlink(parent->d_inode);
1675 cgrp->dentry = dentry; 1681 rcu_assign_pointer(cgrp->dentry, dentry);
1676 dget(dentry); 1682 dget(dentry);
1677 } 1683 }
1678 dput(dentry); 1684 dput(dentry);
@@ -1813,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1813{ 1819{
1814 struct task_struct *res; 1820 struct task_struct *res;
1815 struct list_head *l = it->task; 1821 struct list_head *l = it->task;
1822 struct cg_cgroup_link *link;
1816 1823
1817 /* If the iterator cg is NULL, we have no tasks */ 1824 /* If the iterator cg is NULL, we have no tasks */
1818 if (!it->cg_link) 1825 if (!it->cg_link)
@@ -1820,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1820 res = list_entry(l, struct task_struct, cg_list); 1827 res = list_entry(l, struct task_struct, cg_list);
1821 /* Advance iterator to find next entry */ 1828 /* Advance iterator to find next entry */
1822 l = l->next; 1829 l = l->next;
1823 if (l == &res->cgroups->tasks) { 1830 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
1831 if (l == &link->cg->tasks) {
1824 /* We reached the end of this task list - move on to 1832 /* We reached the end of this task list - move on to
1825 * the next cg_cgroup_link */ 1833 * the next cg_cgroup_link */
1826 cgroup_advance_iter(cgrp, it); 1834 cgroup_advance_iter(cgrp, it);
@@ -2014,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2014 */ 2022 */
2015static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2023static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2016{ 2024{
2017 int n = 0; 2025 int n = 0, pid;
2018 struct cgroup_iter it; 2026 struct cgroup_iter it;
2019 struct task_struct *tsk; 2027 struct task_struct *tsk;
2020 cgroup_iter_start(cgrp, &it); 2028 cgroup_iter_start(cgrp, &it);
2021 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2029 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2022 if (unlikely(n == npids)) 2030 if (unlikely(n == npids))
2023 break; 2031 break;
2024 pidarray[n++] = task_pid_vnr(tsk); 2032 pid = task_pid_vnr(tsk);
2033 if (pid > 0)
2034 pidarray[n++] = pid;
2025 } 2035 }
2026 cgroup_iter_end(cgrp, &it); 2036 cgroup_iter_end(cgrp, &it);
2027 return n; 2037 return n;
@@ -2053,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2053 2063
2054 ret = 0; 2064 ret = 0;
2055 cgrp = dentry->d_fsdata; 2065 cgrp = dentry->d_fsdata;
2056 rcu_read_lock();
2057 2066
2058 cgroup_iter_start(cgrp, &it); 2067 cgroup_iter_start(cgrp, &it);
2059 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2068 while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2078,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2078 } 2087 }
2079 cgroup_iter_end(cgrp, &it); 2088 cgroup_iter_end(cgrp, &it);
2080 2089
2081 rcu_read_unlock();
2082err: 2090err:
2083 return ret; 2091 return ret;
2084} 2092}
@@ -2325,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2325 struct cgroup *cgrp) 2333 struct cgroup *cgrp)
2326{ 2334{
2327 css->cgroup = cgrp; 2335 css->cgroup = cgrp;
2328 atomic_set(&css->refcnt, 0); 2336 atomic_set(&css->refcnt, 1);
2329 css->flags = 0; 2337 css->flags = 0;
2330 if (cgrp == dummytop) 2338 if (cgrp == dummytop)
2331 set_bit(CSS_ROOT, &css->flags); 2339 set_bit(CSS_ROOT, &css->flags);
@@ -2333,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2333 cgrp->subsys[ss->subsys_id] = css; 2341 cgrp->subsys[ss->subsys_id] = css;
2334} 2342}
2335 2343
2344static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2345{
2346 /* We need to take each hierarchy_mutex in a consistent order */
2347 int i;
2348
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i);
2353 }
2354}
2355
2356static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2357{
2358 int i;
2359
2360 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2361 struct cgroup_subsys *ss = subsys[i];
2362 if (ss->root == root)
2363 mutex_unlock(&ss->hierarchy_mutex);
2364 }
2365}
2366
2336/* 2367/*
2337 * cgroup_create - create a cgroup 2368 * cgroup_create - create a cgroup
2338 * @parent: cgroup that will be parent of the new cgroup 2369 * @parent: cgroup that will be parent of the new cgroup
@@ -2381,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2381 init_cgroup_css(css, ss, cgrp); 2412 init_cgroup_css(css, ss, cgrp);
2382 } 2413 }
2383 2414
2415 cgroup_lock_hierarchy(root);
2384 list_add(&cgrp->sibling, &cgrp->parent->children); 2416 list_add(&cgrp->sibling, &cgrp->parent->children);
2417 cgroup_unlock_hierarchy(root);
2385 root->number_of_cgroups++; 2418 root->number_of_cgroups++;
2386 2419
2387 err = cgroup_create_dir(cgrp, dentry, mode); 2420 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2432,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2432{ 2465{
2433 /* Check the reference count on each subsystem. Since we 2466 /* Check the reference count on each subsystem. Since we
2434 * already established that there are no tasks in the 2467 * already established that there are no tasks in the
2435 * cgroup, if the css refcount is also 0, then there should 2468 * cgroup, if the css refcount is also 1, then there should
2436 * be no outstanding references, so the subsystem is safe to 2469 * be no outstanding references, so the subsystem is safe to
2437 * destroy. We scan across all subsystems rather than using 2470 * destroy. We scan across all subsystems rather than using
2438 * the per-hierarchy linked list of mounted subsystems since 2471 * the per-hierarchy linked list of mounted subsystems since
@@ -2453,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2453 * matter, since it can only happen if the cgroup 2486 * matter, since it can only happen if the cgroup
2454 * has been deleted and hence no longer needs the 2487 * has been deleted and hence no longer needs the
2455 * release agent to be called anyway. */ 2488 * release agent to be called anyway. */
2456 if (css && atomic_read(&css->refcnt)) 2489 if (css && (atomic_read(&css->refcnt) > 1))
2457 return 1; 2490 return 1;
2458 } 2491 }
2459 return 0; 2492 return 0;
2460} 2493}
2461 2494
2495/*
2496 * Atomically mark all (or else none) of the cgroup's CSS objects as
2497 * CSS_REMOVED. Return true on success, or false if the cgroup has
2498 * busy subsystems. Call with cgroup_mutex held
2499 */
2500
2501static int cgroup_clear_css_refs(struct cgroup *cgrp)
2502{
2503 struct cgroup_subsys *ss;
2504 unsigned long flags;
2505 bool failed = false;
2506 local_irq_save(flags);
2507 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt;
2510 do {
2511 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) {
2514 failed = true;
2515 goto done;
2516 }
2517 BUG_ON(!refcnt);
2518 /*
2519 * Drop the refcnt to 0 while we check other
2520 * subsystems. This will cause any racing
2521 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort
2523 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
2525 }
2526 done:
2527 for_each_subsys(cgrp->root, ss) {
2528 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2529 if (failed) {
2530 /*
2531 * Restore old refcnt if we previously managed
2532 * to clear it from 1 to 0
2533 */
2534 if (!atomic_read(&css->refcnt))
2535 atomic_set(&css->refcnt, 1);
2536 } else {
2537 /* Commit the fact that the CSS is removed */
2538 set_bit(CSS_REMOVED, &css->flags);
2539 }
2540 }
2541 local_irq_restore(flags);
2542 return !failed;
2543}
2544
2462static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2545static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2463{ 2546{
2464 struct cgroup *cgrp = dentry->d_fsdata; 2547 struct cgroup *cgrp = dentry->d_fsdata;
2465 struct dentry *d; 2548 struct dentry *d;
2466 struct cgroup *parent; 2549 struct cgroup *parent;
2467 struct super_block *sb;
2468 struct cgroupfs_root *root;
2469 2550
2470 /* the vfs holds both inode->i_mutex already */ 2551 /* the vfs holds both inode->i_mutex already */
2471 2552
@@ -2488,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2488 2569
2489 mutex_lock(&cgroup_mutex); 2570 mutex_lock(&cgroup_mutex);
2490 parent = cgrp->parent; 2571 parent = cgrp->parent;
2491 root = cgrp->root;
2492 sb = root->sb;
2493 2572
2494 if (atomic_read(&cgrp->count) 2573 if (atomic_read(&cgrp->count)
2495 || !list_empty(&cgrp->children) 2574 || !list_empty(&cgrp->children)
2496 || cgroup_has_css_refs(cgrp)) { 2575 || !cgroup_clear_css_refs(cgrp)) {
2497 mutex_unlock(&cgroup_mutex); 2576 mutex_unlock(&cgroup_mutex);
2498 return -EBUSY; 2577 return -EBUSY;
2499 } 2578 }
@@ -2503,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2503 if (!list_empty(&cgrp->release_list)) 2582 if (!list_empty(&cgrp->release_list))
2504 list_del(&cgrp->release_list); 2583 list_del(&cgrp->release_list);
2505 spin_unlock(&release_list_lock); 2584 spin_unlock(&release_list_lock);
2506 /* delete my sibling from parent->children */ 2585
2586 cgroup_lock_hierarchy(cgrp->root);
2587 /* delete this cgroup from parent->children */
2507 list_del(&cgrp->sibling); 2588 list_del(&cgrp->sibling);
2589 cgroup_unlock_hierarchy(cgrp->root);
2590
2508 spin_lock(&cgrp->dentry->d_lock); 2591 spin_lock(&cgrp->dentry->d_lock);
2509 d = dget(cgrp->dentry); 2592 d = dget(cgrp->dentry);
2510 spin_unlock(&d->d_lock); 2593 spin_unlock(&d->d_lock);
@@ -2526,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2526 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2609 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2527 2610
2528 /* Create the top cgroup state for this subsystem */ 2611 /* Create the top cgroup state for this subsystem */
2612 list_add(&ss->sibling, &rootnode.subsys_list);
2529 ss->root = &rootnode; 2613 ss->root = &rootnode;
2530 css = ss->create(ss, dummytop); 2614 css = ss->create(ss, dummytop);
2531 /* We don't handle early failures gracefully */ 2615 /* We don't handle early failures gracefully */
@@ -2539,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2539 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2623 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2540 2624
2541 need_forkexit_callback |= ss->fork || ss->exit; 2625 need_forkexit_callback |= ss->fork || ss->exit;
2542 need_mm_owner_callback |= !!ss->mm_owner_changed;
2543 2626
2544 /* At system boot, before all subsystems have been 2627 /* At system boot, before all subsystems have been
2545 * registered, no tasks have been forked, so we don't 2628 * registered, no tasks have been forked, so we don't
2546 * need to invoke fork callbacks here. */ 2629 * need to invoke fork callbacks here. */
2547 BUG_ON(!list_empty(&init_task.tasks)); 2630 BUG_ON(!list_empty(&init_task.tasks));
2548 2631
2632 mutex_init(&ss->hierarchy_mutex);
2549 ss->active = 1; 2633 ss->active = 1;
2550} 2634}
2551 2635
@@ -2564,7 +2648,6 @@ int __init cgroup_init_early(void)
2564 INIT_HLIST_NODE(&init_css_set.hlist); 2648 INIT_HLIST_NODE(&init_css_set.hlist);
2565 css_set_count = 1; 2649 css_set_count = 1;
2566 init_cgroup_root(&rootnode); 2650 init_cgroup_root(&rootnode);
2567 list_add(&rootnode.root_list, &roots);
2568 root_count = 1; 2651 root_count = 1;
2569 init_task.cgroups = &init_css_set; 2652 init_task.cgroups = &init_css_set;
2570 2653
@@ -2671,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2671 2754
2672 mutex_lock(&cgroup_mutex); 2755 mutex_lock(&cgroup_mutex);
2673 2756
2674 for_each_root(root) { 2757 for_each_active_root(root) {
2675 struct cgroup_subsys *ss; 2758 struct cgroup_subsys *ss;
2676 struct cgroup *cgrp; 2759 struct cgroup *cgrp;
2677 int subsys_id; 2760 int subsys_id;
2678 int count = 0; 2761 int count = 0;
2679 2762
2680 /* Skip this hierarchy if it has no active subsystems */
2681 if (!root->actual_subsys_bits)
2682 continue;
2683 seq_printf(m, "%lu:", root->subsys_bits); 2763 seq_printf(m, "%lu:", root->subsys_bits);
2684 for_each_subsys(root, ss) 2764 for_each_subsys(root, ss)
2685 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2765 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2789,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
2789 } 2869 }
2790} 2870}
2791 2871
2792#ifdef CONFIG_MM_OWNER
2793/**
2794 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2795 * @p: the new owner
2796 *
2797 * Called on every change to mm->owner. mm_init_owner() does not
2798 * invoke this routine, since it assigns the mm->owner the first time
2799 * and does not change it.
2800 *
2801 * The callbacks are invoked with mmap_sem held in read mode.
2802 */
2803void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2804{
2805 struct cgroup *oldcgrp, *newcgrp = NULL;
2806
2807 if (need_mm_owner_callback) {
2808 int i;
2809 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2810 struct cgroup_subsys *ss = subsys[i];
2811 oldcgrp = task_cgroup(old, ss->subsys_id);
2812 if (new)
2813 newcgrp = task_cgroup(new, ss->subsys_id);
2814 if (oldcgrp == newcgrp)
2815 continue;
2816 if (ss->mm_owner_changed)
2817 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2818 }
2819 }
2820}
2821#endif /* CONFIG_MM_OWNER */
2822
2823/** 2872/**
2824 * cgroup_post_fork - called on a new task after adding it to the task list 2873 * cgroup_post_fork - called on a new task after adding it to the task list
2825 * @child: the task in question 2874 * @child: the task in question
@@ -2833,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
2833{ 2882{
2834 if (use_task_css_set_links) { 2883 if (use_task_css_set_links) {
2835 write_lock(&css_set_lock); 2884 write_lock(&css_set_lock);
2885 task_lock(child);
2836 if (list_empty(&child->cg_list)) 2886 if (list_empty(&child->cg_list))
2837 list_add(&child->cg_list, &child->cgroups->tasks); 2887 list_add(&child->cg_list, &child->cgroups->tasks);
2888 task_unlock(child);
2838 write_unlock(&css_set_lock); 2889 write_unlock(&css_set_lock);
2839 } 2890 }
2840} 2891}
@@ -2940,6 +2991,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2940 mutex_unlock(&cgroup_mutex); 2991 mutex_unlock(&cgroup_mutex);
2941 return 0; 2992 return 0;
2942 } 2993 }
2994 task_lock(tsk);
2943 cg = tsk->cgroups; 2995 cg = tsk->cgroups;
2944 parent = task_cgroup(tsk, subsys->subsys_id); 2996 parent = task_cgroup(tsk, subsys->subsys_id);
2945 2997
@@ -2952,6 +3004,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2952 3004
2953 /* Keep the cgroup alive */ 3005 /* Keep the cgroup alive */
2954 get_css_set(cg); 3006 get_css_set(cg);
3007 task_unlock(tsk);
2955 mutex_unlock(&cgroup_mutex); 3008 mutex_unlock(&cgroup_mutex);
2956 3009
2957 /* Now do the VFS work to create a cgroup */ 3010 /* Now do the VFS work to create a cgroup */
@@ -2970,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2970 } 3023 }
2971 3024
2972 /* Create the cgroup directory, which also creates the cgroup */ 3025 /* Create the cgroup directory, which also creates the cgroup */
2973 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 3026 ret = vfs_mkdir(inode, dentry, 0755);
2974 child = __d_cgrp(dentry); 3027 child = __d_cgrp(dentry);
2975 dput(dentry); 3028 dput(dentry);
2976 if (ret) { 3029 if (ret) {
@@ -2980,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2980 goto out_release; 3033 goto out_release;
2981 } 3034 }
2982 3035
2983 if (!child) {
2984 printk(KERN_INFO
2985 "Couldn't find new cgroup %s\n", nodename);
2986 ret = -ENOMEM;
2987 goto out_release;
2988 }
2989
2990 /* The cgroup now exists. Retake cgroup_mutex and check 3036 /* The cgroup now exists. Retake cgroup_mutex and check
2991 * that we're still in the same state that we thought we 3037 * that we're still in the same state that we thought we
2992 * were. */ 3038 * were. */
@@ -3082,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
3082{ 3128{
3083 struct cgroup *cgrp = css->cgroup; 3129 struct cgroup *cgrp = css->cgroup;
3084 rcu_read_lock(); 3130 rcu_read_lock();
3085 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 3131 if ((atomic_dec_return(&css->refcnt) == 1) &&
3132 notify_on_release(cgrp)) {
3086 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3133 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3087 check_for_release(cgrp); 3134 check_for_release(cgrp);
3088 } 3135 }
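
Several of the cgroup.c hunks above revolve around one convention change: a css refcount now starts at 1 (init_cgroup_css), "unused" means refcnt == 1 rather than 0, and cgroup_clear_css_refs() uses cmpxchg to drop the count to 0 atomically across all subsystems before setting CSS_REMOVED. The lookup side of that protocol lives in the cgroup header rather than in this file; a sketch of it, assuming the refcnt/flags fields shown above, is:

/*
 * Sketch only (the real helper is in include/linux/cgroup.h, not in this
 * diff): a refcount of 0 is a transient state that exists only while
 * cgroup_clear_css_refs() scans the subsystems, so a reader spins until
 * the count comes back or CSS_REMOVED is committed.
 */
static inline bool example_css_tryget(struct cgroup_subsys_state *css)
{
	while (!atomic_inc_not_zero(&css->refcnt)) {
		if (test_bit(CSS_REMOVED, &css->flags))
			return false;	/* removal committed; lookup fails */
		cpu_relax();		/* rmdir is still deciding; retry */
	}
	return true;
}
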
diff --git a/kernel/compat.c b/kernel/compat.c
index d52e2ec1deb5..42d56544460f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -24,6 +24,7 @@
 #include <linux/migrate.h>
 #include <linux/posix-timers.h>
 #include <linux/times.h>
+#include <linux/ptrace.h>
 
 #include <asm/uaccess.h>
 
@@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
 			return -EFAULT;
 	}
+	force_successful_syscall_return();
 	return compat_jiffies_to_clock_t(jiffies);
 }
 
@@ -894,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
 
 	if (tloc) {
 		if (put_user(i,tloc))
-			i = -EFAULT;
+			return -EFAULT;
 	}
+	force_successful_syscall_return();
 	return i;
 }
 
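
Both compat.c hunks add force_successful_syscall_return() before returning a value that is not an error but could look like one. The pattern is roughly as below; the syscall itself is a made-up stand-in, and on most architectures the macro is a no-op, while architectures that report errors out of band use it to mark the return value as a success:

#include <linux/ptrace.h>	/* force_successful_syscall_return() */
#include <linux/time.h>

/* hypothetical syscall, used only to illustrate the pattern */
asmlinkage long sys_example_seconds(void)
{
	long seconds = get_seconds();	/* a large positive value ... */

	/* ... which the syscall exit path might otherwise mistake for an
	 * errno on architectures with out-of-band error reporting */
	force_successful_syscall_return();
	return seconds;
}
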
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 30e74dd6d01b..79e40f00dcb8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -379,8 +379,11 @@ static cpumask_var_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
 {
-	int cpu, first_cpu, error = 0;
+	int cpu, first_cpu, error;
 
+	error = stop_machine_create();
+	if (error)
+		return error;
 	cpu_maps_update_begin();
 	first_cpu = cpumask_first(cpu_online_mask);
 	/* We take down all of the non-boot CPUs in one shot to avoid races
@@ -409,6 +412,7 @@ int disable_nonboot_cpus(void)
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 	}
 	cpu_maps_update_done();
+	stop_machine_destroy();
 	return error;
 }
 
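
The cpu.c change brackets the CPU-down path with stop_machine_create()/stop_machine_destroy(), which pre-allocate the stop_machine resources so the later calls cannot fail on memory allocation. A sketch of the same pattern in an assumed caller (the function name is hypothetical):

#include <linux/stop_machine.h>
#include <linux/cpu.h>

/* illustrative only: take one CPU offline with the resources pinned */
static int example_offline_cpu(unsigned int cpu)
{
	int err;

	err = stop_machine_create();	/* grab/allocate stop_machine workers */
	if (err)
		return err;

	err = cpu_down(cpu);		/* may invoke __stop_machine() internally */

	stop_machine_destroy();		/* drop the reference taken above */
	return err;
}
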
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 39c1a4c1c5a9..647c77a88fcb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -84,7 +84,7 @@ struct cpuset {
84 struct cgroup_subsys_state css; 84 struct cgroup_subsys_state css;
85 85
86 unsigned long flags; /* "unsigned long" so bitops work */ 86 unsigned long flags; /* "unsigned long" so bitops work */
87 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 89
90 struct cpuset *parent; /* my parent */ 90 struct cpuset *parent; /* my parent */
@@ -195,8 +195,6 @@ static int cpuset_mems_generation;
195 195
196static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
198 .cpus_allowed = CPU_MASK_ALL,
199 .mems_allowed = NODE_MASK_ALL,
200}; 198};
201 199
202/* 200/*
@@ -240,6 +238,17 @@ static struct cpuset top_cpuset = {
240static DEFINE_MUTEX(callback_mutex); 238static DEFINE_MUTEX(callback_mutex);
241 239
242/* 240/*
241 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
242 * buffers. They are statically allocated to prevent using excess stack
243 * when calling cpuset_print_task_mems_allowed().
244 */
245#define CPUSET_NAME_LEN (128)
246#define CPUSET_NODELIST_LEN (256)
247static char cpuset_name[CPUSET_NAME_LEN];
248static char cpuset_nodelist[CPUSET_NODELIST_LEN];
249static DEFINE_SPINLOCK(cpuset_buffer_lock);
250
251/*
243 * This is ugly, but preserves the userspace API for existing cpuset 252 * This is ugly, but preserves the userspace API for existing cpuset
244 * users. If someone tries to mount the "cpuset" filesystem, we 253 * users. If someone tries to mount the "cpuset" filesystem, we
245 * silently switch it to mount "cgroup" instead 254 * silently switch it to mount "cgroup" instead
@@ -267,7 +276,7 @@ static struct file_system_type cpuset_fs_type = {
267}; 276};
268 277
269/* 278/*
270 * Return in *pmask the portion of a cpusets's cpus_allowed that 279 * Return in pmask the portion of a cpusets's cpus_allowed that
271 * are online. If none are online, walk up the cpuset hierarchy 280 * are online. If none are online, walk up the cpuset hierarchy
272 * until we find one that does have some online cpus. If we get 281 * until we find one that does have some online cpus. If we get
273 * all the way to the top and still haven't found any online cpus, 282 * all the way to the top and still haven't found any online cpus,
@@ -280,15 +289,16 @@ static struct file_system_type cpuset_fs_type = {
280 * Call with callback_mutex held. 289 * Call with callback_mutex held.
281 */ 290 */
282 291
283static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 292static void guarantee_online_cpus(const struct cpuset *cs,
293 struct cpumask *pmask)
284{ 294{
285 while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) 295 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
286 cs = cs->parent; 296 cs = cs->parent;
287 if (cs) 297 if (cs)
288 cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); 298 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
289 else 299 else
290 *pmask = cpu_online_map; 300 cpumask_copy(pmask, cpu_online_mask);
291 BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); 301 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
292} 302}
293 303
294/* 304/*
@@ -364,14 +374,9 @@ void cpuset_update_task_memory_state(void)
364 struct task_struct *tsk = current; 374 struct task_struct *tsk = current;
365 struct cpuset *cs; 375 struct cpuset *cs;
366 376
367 if (task_cs(tsk) == &top_cpuset) { 377 rcu_read_lock();
368 /* Don't need rcu for top_cpuset. It's never freed. */ 378 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 379 rcu_read_unlock();
370 } else {
371 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock();
374 }
375 380
376 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 381 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
377 mutex_lock(&callback_mutex); 382 mutex_lock(&callback_mutex);
@@ -403,12 +408,43 @@ void cpuset_update_task_memory_state(void)
403 408
404static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 409static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
405{ 410{
406 return cpus_subset(p->cpus_allowed, q->cpus_allowed) && 411 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
407 nodes_subset(p->mems_allowed, q->mems_allowed) && 412 nodes_subset(p->mems_allowed, q->mems_allowed) &&
408 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 413 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
409 is_mem_exclusive(p) <= is_mem_exclusive(q); 414 is_mem_exclusive(p) <= is_mem_exclusive(q);
410} 415}
411 416
417/**
418 * alloc_trial_cpuset - allocate a trial cpuset
419 * @cs: the cpuset that the trial cpuset duplicates
420 */
421static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
422{
423 struct cpuset *trial;
424
425 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
426 if (!trial)
427 return NULL;
428
429 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
430 kfree(trial);
431 return NULL;
432 }
433 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
434
435 return trial;
436}
437
438/**
439 * free_trial_cpuset - free the trial cpuset
440 * @trial: the trial cpuset to be freed
441 */
442static void free_trial_cpuset(struct cpuset *trial)
443{
444 free_cpumask_var(trial->cpus_allowed);
445 kfree(trial);
446}
447
412/* 448/*
413 * validate_change() - Used to validate that any proposed cpuset change 449 * validate_change() - Used to validate that any proposed cpuset change
414 * follows the structural rules for cpusets. 450 * follows the structural rules for cpusets.
@@ -458,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
458 c = cgroup_cs(cont); 494 c = cgroup_cs(cont);
459 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 495 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
460 c != cur && 496 c != cur &&
461 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 497 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
462 return -EINVAL; 498 return -EINVAL;
463 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 499 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
464 c != cur && 500 c != cur &&
@@ -468,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
468 504
469 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 505 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
470 if (cgroup_task_count(cur->css.cgroup)) { 506 if (cgroup_task_count(cur->css.cgroup)) {
471 if (cpus_empty(trial->cpus_allowed) || 507 if (cpumask_empty(trial->cpus_allowed) ||
472 nodes_empty(trial->mems_allowed)) { 508 nodes_empty(trial->mems_allowed)) {
473 return -ENOSPC; 509 return -ENOSPC;
474 } 510 }
@@ -483,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
483 */ 519 */
484static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 520static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
485{ 521{
486 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 522 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
487} 523}
488 524
489static void 525static void
@@ -508,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
508 cp = list_first_entry(&q, struct cpuset, stack_list); 544 cp = list_first_entry(&q, struct cpuset, stack_list);
509 list_del(q.next); 545 list_del(q.next);
510 546
511 if (cpus_empty(cp->cpus_allowed)) 547 if (cpumask_empty(cp->cpus_allowed))
512 continue; 548 continue;
513 549
514 if (is_sched_load_balance(cp)) 550 if (is_sched_load_balance(cp))
@@ -575,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
575 * element of the partition (one sched domain) to be passed to 611 * element of the partition (one sched domain) to be passed to
576 * partition_sched_domains(). 612 * partition_sched_domains().
577 */ 613 */
578static int generate_sched_domains(cpumask_t **domains, 614/* FIXME: see the FIXME in partition_sched_domains() */
615static int generate_sched_domains(struct cpumask **domains,
579 struct sched_domain_attr **attributes) 616 struct sched_domain_attr **attributes)
580{ 617{
581 LIST_HEAD(q); /* queue of cpusets to be scanned */ 618 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -583,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains,
583 struct cpuset **csa; /* array of all cpuset ptrs */ 620 struct cpuset **csa; /* array of all cpuset ptrs */
584 int csn; /* how many cpuset ptrs in csa so far */ 621 int csn; /* how many cpuset ptrs in csa so far */
585 int i, j, k; /* indices for partition finding loops */ 622 int i, j, k; /* indices for partition finding loops */
586 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 623 struct cpumask *doms; /* resulting partition; i.e. sched domains */
587 struct sched_domain_attr *dattr; /* attributes for custom domains */ 624 struct sched_domain_attr *dattr; /* attributes for custom domains */
588 int ndoms = 0; /* number of sched domains in result */ 625 int ndoms = 0; /* number of sched domains in result */
589 int nslot; /* next empty doms[] cpumask_t slot */ 626 int nslot; /* next empty doms[] struct cpumask slot */
590 627
591 doms = NULL; 628 doms = NULL;
592 dattr = NULL; 629 dattr = NULL;
@@ -594,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains,
594 631
595 /* Special case for the 99% of systems with one, full, sched domain */ 632 /* Special case for the 99% of systems with one, full, sched domain */
596 if (is_sched_load_balance(&top_cpuset)) { 633 if (is_sched_load_balance(&top_cpuset)) {
597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 634 doms = kmalloc(cpumask_size(), GFP_KERNEL);
598 if (!doms) 635 if (!doms)
599 goto done; 636 goto done;
600 637
@@ -603,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains,
603 *dattr = SD_ATTR_INIT; 640 *dattr = SD_ATTR_INIT;
604 update_domain_attr_tree(dattr, &top_cpuset); 641 update_domain_attr_tree(dattr, &top_cpuset);
605 } 642 }
606 *doms = top_cpuset.cpus_allowed; 643 cpumask_copy(doms, top_cpuset.cpus_allowed);
607 644
608 ndoms = 1; 645 ndoms = 1;
609 goto done; 646 goto done;
@@ -622,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains,
622 cp = list_first_entry(&q, struct cpuset, stack_list); 659 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next); 660 list_del(q.next);
624 661
625 if (cpus_empty(cp->cpus_allowed)) 662 if (cpumask_empty(cp->cpus_allowed))
626 continue; 663 continue;
627 664
628 /* 665 /*
@@ -673,7 +710,7 @@ restart:
673 * Now we know how many domains to create. 710 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 711 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 712 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 713 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
677 if (!doms) 714 if (!doms)
678 goto done; 715 goto done;
679 716
@@ -685,7 +722,7 @@ restart:
685 722
686 for (nslot = 0, i = 0; i < csn; i++) { 723 for (nslot = 0, i = 0; i < csn; i++) {
687 struct cpuset *a = csa[i]; 724 struct cpuset *a = csa[i];
688 cpumask_t *dp; 725 struct cpumask *dp;
689 int apn = a->pn; 726 int apn = a->pn;
690 727
691 if (apn < 0) { 728 if (apn < 0) {
@@ -708,14 +745,14 @@ restart:
708 continue; 745 continue;
709 } 746 }
710 747
711 cpus_clear(*dp); 748 cpumask_clear(dp);
712 if (dattr) 749 if (dattr)
713 *(dattr + nslot) = SD_ATTR_INIT; 750 *(dattr + nslot) = SD_ATTR_INIT;
714 for (j = i; j < csn; j++) { 751 for (j = i; j < csn; j++) {
715 struct cpuset *b = csa[j]; 752 struct cpuset *b = csa[j];
716 753
717 if (apn == b->pn) { 754 if (apn == b->pn) {
718 cpus_or(*dp, *dp, b->cpus_allowed); 755 cpumask_or(dp, dp, b->cpus_allowed);
719 if (dattr) 756 if (dattr)
720 update_domain_attr_tree(dattr + nslot, b); 757 update_domain_attr_tree(dattr + nslot, b);
721 758
@@ -755,7 +792,7 @@ done:
755static void do_rebuild_sched_domains(struct work_struct *unused) 792static void do_rebuild_sched_domains(struct work_struct *unused)
756{ 793{
757 struct sched_domain_attr *attr; 794 struct sched_domain_attr *attr;
758 cpumask_t *doms; 795 struct cpumask *doms;
759 int ndoms; 796 int ndoms;
760 797
761 get_online_cpus(); 798 get_online_cpus();
@@ -824,7 +861,7 @@ void rebuild_sched_domains(void)
824static int cpuset_test_cpumask(struct task_struct *tsk, 861static int cpuset_test_cpumask(struct task_struct *tsk,
825 struct cgroup_scanner *scan) 862 struct cgroup_scanner *scan)
826{ 863{
827 return !cpus_equal(tsk->cpus_allowed, 864 return !cpumask_equal(&tsk->cpus_allowed,
828 (cgroup_cs(scan->cg))->cpus_allowed); 865 (cgroup_cs(scan->cg))->cpus_allowed);
829} 866}
830 867
@@ -842,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
842static void cpuset_change_cpumask(struct task_struct *tsk, 879static void cpuset_change_cpumask(struct task_struct *tsk,
843 struct cgroup_scanner *scan) 880 struct cgroup_scanner *scan)
844{ 881{
845 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 882 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
846} 883}
847 884
848/** 885/**
@@ -874,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
874 * @cs: the cpuset to consider 911 * @cs: the cpuset to consider
875 * @buf: buffer of cpu numbers written to this cpuset 912 * @buf: buffer of cpu numbers written to this cpuset
876 */ 913 */
877static int update_cpumask(struct cpuset *cs, const char *buf) 914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf)
878{ 916{
879 struct ptr_heap heap; 917 struct ptr_heap heap;
880 struct cpuset trialcs;
881 int retval; 918 int retval;
882 int is_load_balanced; 919 int is_load_balanced;
883 920
@@ -885,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
885 if (cs == &top_cpuset) 922 if (cs == &top_cpuset)
886 return -EACCES; 923 return -EACCES;
887 924
888 trialcs = *cs;
889
890 /* 925 /*
891 * An empty cpus_allowed is ok only if the cpuset has no tasks. 926 * An empty cpus_allowed is ok only if the cpuset has no tasks.
892 * Since cpulist_parse() fails on an empty mask, we special case 927 * Since cpulist_parse() fails on an empty mask, we special case
@@ -894,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
894 * with tasks have cpus. 929 * with tasks have cpus.
895 */ 930 */
896 if (!*buf) { 931 if (!*buf) {
897 cpus_clear(trialcs.cpus_allowed); 932 cpumask_clear(trialcs->cpus_allowed);
898 } else { 933 } else {
899 retval = cpulist_parse(buf, &trialcs.cpus_allowed); 934 retval = cpulist_parse(buf, trialcs->cpus_allowed);
900 if (retval < 0) 935 if (retval < 0)
901 return retval; 936 return retval;
902 937
903 if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) 938 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
904 return -EINVAL; 939 return -EINVAL;
905 } 940 }
906 retval = validate_change(cs, &trialcs); 941 retval = validate_change(cs, trialcs);
907 if (retval < 0) 942 if (retval < 0)
908 return retval; 943 return retval;
909 944
910 /* Nothing to do if the cpus didn't change */ 945 /* Nothing to do if the cpus didn't change */
911 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 946 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
912 return 0; 947 return 0;
913 948
914 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 949 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
915 if (retval) 950 if (retval)
916 return retval; 951 return retval;
917 952
918 is_load_balanced = is_sched_load_balance(&trialcs); 953 is_load_balanced = is_sched_load_balance(trialcs);
919 954
920 mutex_lock(&callback_mutex); 955 mutex_lock(&callback_mutex);
921 cs->cpus_allowed = trialcs.cpus_allowed; 956 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
922 mutex_unlock(&callback_mutex); 957 mutex_unlock(&callback_mutex);
923 958
924 /* 959 /*
@@ -1006,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
1006 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1041 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1007 1042
1008 fudge = 10; /* spare mmarray[] slots */ 1043 fudge = 10; /* spare mmarray[] slots */
1009 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 1044 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
1010 retval = -ENOMEM; 1045 retval = -ENOMEM;
1011 1046
1012 /* 1047 /*
@@ -1093,9 +1128,9 @@ done:
1093 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1128 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1094 * their mempolicies to the cpusets new mems_allowed. 1129 * their mempolicies to the cpusets new mems_allowed.
1095 */ 1130 */
1096static int update_nodemask(struct cpuset *cs, const char *buf) 1131static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1132 const char *buf)
1097{ 1133{
1098 struct cpuset trialcs;
1099 nodemask_t oldmem; 1134 nodemask_t oldmem;
1100 int retval; 1135 int retval;
1101 1136
@@ -1106,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1106 if (cs == &top_cpuset) 1141 if (cs == &top_cpuset)
1107 return -EACCES; 1142 return -EACCES;
1108 1143
1109 trialcs = *cs;
1110
1111 /* 1144 /*
1112 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1145 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1113 * Since nodelist_parse() fails on an empty mask, we special case 1146 * Since nodelist_parse() fails on an empty mask, we special case
@@ -1115,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1115 * with tasks have memory. 1148 * with tasks have memory.
1116 */ 1149 */
1117 if (!*buf) { 1150 if (!*buf) {
1118 nodes_clear(trialcs.mems_allowed); 1151 nodes_clear(trialcs->mems_allowed);
1119 } else { 1152 } else {
1120 retval = nodelist_parse(buf, trialcs.mems_allowed); 1153 retval = nodelist_parse(buf, trialcs->mems_allowed);
1121 if (retval < 0) 1154 if (retval < 0)
1122 goto done; 1155 goto done;
1123 1156
1124 if (!nodes_subset(trialcs.mems_allowed, 1157 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) 1158 node_states[N_HIGH_MEMORY]))
1126 return -EINVAL; 1159 return -EINVAL;
1127 } 1160 }
1128 oldmem = cs->mems_allowed; 1161 oldmem = cs->mems_allowed;
1129 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 1162 if (nodes_equal(oldmem, trialcs->mems_allowed)) {
1130 retval = 0; /* Too easy - nothing to do */ 1163 retval = 0; /* Too easy - nothing to do */
1131 goto done; 1164 goto done;
1132 } 1165 }
1133 retval = validate_change(cs, &trialcs); 1166 retval = validate_change(cs, trialcs);
1134 if (retval < 0) 1167 if (retval < 0)
1135 goto done; 1168 goto done;
1136 1169
1137 mutex_lock(&callback_mutex); 1170 mutex_lock(&callback_mutex);
1138 cs->mems_allowed = trialcs.mems_allowed; 1171 cs->mems_allowed = trialcs->mems_allowed;
1139 cs->mems_generation = cpuset_mems_generation++; 1172 cs->mems_generation = cpuset_mems_generation++;
1140 mutex_unlock(&callback_mutex); 1173 mutex_unlock(&callback_mutex);
1141 1174
@@ -1156,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1156 1189
1157 if (val != cs->relax_domain_level) { 1190 if (val != cs->relax_domain_level) {
1158 cs->relax_domain_level = val; 1191 cs->relax_domain_level = val;
1159 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1192 if (!cpumask_empty(cs->cpus_allowed) &&
1193 is_sched_load_balance(cs))
1160 async_rebuild_sched_domains(); 1194 async_rebuild_sched_domains();
1161 } 1195 }
1162 1196
@@ -1175,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1175static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1209static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1176 int turning_on) 1210 int turning_on)
1177{ 1211{
1178 struct cpuset trialcs; 1212 struct cpuset *trialcs;
1179 int err; 1213 int err;
1180 int balance_flag_changed; 1214 int balance_flag_changed;
1181 1215
1182 trialcs = *cs; 1216 trialcs = alloc_trial_cpuset(cs);
1217 if (!trialcs)
1218 return -ENOMEM;
1219
1183 if (turning_on) 1220 if (turning_on)
1184 set_bit(bit, &trialcs.flags); 1221 set_bit(bit, &trialcs->flags);
1185 else 1222 else
1186 clear_bit(bit, &trialcs.flags); 1223 clear_bit(bit, &trialcs->flags);
1187 1224
1188 err = validate_change(cs, &trialcs); 1225 err = validate_change(cs, trialcs);
1189 if (err < 0) 1226 if (err < 0)
1190 return err; 1227 goto out;
1191 1228
1192 balance_flag_changed = (is_sched_load_balance(cs) != 1229 balance_flag_changed = (is_sched_load_balance(cs) !=
1193 is_sched_load_balance(&trialcs)); 1230 is_sched_load_balance(trialcs));
1194 1231
1195 mutex_lock(&callback_mutex); 1232 mutex_lock(&callback_mutex);
1196 cs->flags = trialcs.flags; 1233 cs->flags = trialcs->flags;
1197 mutex_unlock(&callback_mutex); 1234 mutex_unlock(&callback_mutex);
1198 1235
1199 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) 1236 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1200 async_rebuild_sched_domains(); 1237 async_rebuild_sched_domains();
1201 1238
1202 return 0; 1239out:
1240 free_trial_cpuset(trialcs);
1241 return err;
1203} 1242}
1204 1243
1205/* 1244/*
@@ -1300,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp)
1300 return val; 1339 return val;
1301} 1340}
1302 1341
1342/* Protected by cgroup_lock */
1343static cpumask_var_t cpus_attach;
1344
1303/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1345/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1304static int cpuset_can_attach(struct cgroup_subsys *ss, 1346static int cpuset_can_attach(struct cgroup_subsys *ss,
1305 struct cgroup *cont, struct task_struct *tsk) 1347 struct cgroup *cont, struct task_struct *tsk)
1306{ 1348{
1307 struct cpuset *cs = cgroup_cs(cont); 1349 struct cpuset *cs = cgroup_cs(cont);
1350 int ret = 0;
1308 1351
1309 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1352 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1310 return -ENOSPC; 1353 return -ENOSPC;
1311 if (tsk->flags & PF_THREAD_BOUND) {
1312 cpumask_t mask;
1313 1354
1355 if (tsk->flags & PF_THREAD_BOUND) {
1314 mutex_lock(&callback_mutex); 1356 mutex_lock(&callback_mutex);
1315 mask = cs->cpus_allowed; 1357 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
1358 ret = -EINVAL;
1316 mutex_unlock(&callback_mutex); 1359 mutex_unlock(&callback_mutex);
1317 if (!cpus_equal(tsk->cpus_allowed, mask))
1318 return -EINVAL;
1319 } 1360 }
1320 1361
1321 return security_task_setscheduler(tsk, 0, NULL); 1362 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
1322} 1363}
1323 1364
1324static void cpuset_attach(struct cgroup_subsys *ss, 1365static void cpuset_attach(struct cgroup_subsys *ss,
1325 struct cgroup *cont, struct cgroup *oldcont, 1366 struct cgroup *cont, struct cgroup *oldcont,
1326 struct task_struct *tsk) 1367 struct task_struct *tsk)
1327{ 1368{
1328 cpumask_t cpus;
1329 nodemask_t from, to; 1369 nodemask_t from, to;
1330 struct mm_struct *mm; 1370 struct mm_struct *mm;
1331 struct cpuset *cs = cgroup_cs(cont); 1371 struct cpuset *cs = cgroup_cs(cont);
1332 struct cpuset *oldcs = cgroup_cs(oldcont); 1372 struct cpuset *oldcs = cgroup_cs(oldcont);
1333 int err; 1373 int err;
1334 1374
1335 mutex_lock(&callback_mutex); 1375 if (cs == &top_cpuset) {
1336 guarantee_online_cpus(cs, &cpus); 1376 cpumask_copy(cpus_attach, cpu_possible_mask);
1337 err = set_cpus_allowed_ptr(tsk, &cpus); 1377 } else {
1338 mutex_unlock(&callback_mutex); 1378 mutex_lock(&callback_mutex);
1379 guarantee_online_cpus(cs, cpus_attach);
1380 mutex_unlock(&callback_mutex);
1381 }
1382 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1339 if (err) 1383 if (err)
1340 return; 1384 return;
1341 1385
@@ -1348,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1348 cpuset_migrate_mm(mm, &from, &to); 1392 cpuset_migrate_mm(mm, &from, &to);
1349 mmput(mm); 1393 mmput(mm);
1350 } 1394 }
1351
1352} 1395}
1353 1396
1354/* The various types of files and directories in a cpuset file system */ 1397/* The various types of files and directories in a cpuset file system */
@@ -1443,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1443 const char *buf) 1486 const char *buf)
1444{ 1487{
1445 int retval = 0; 1488 int retval = 0;
1489 struct cpuset *cs = cgroup_cs(cgrp);
1490 struct cpuset *trialcs;
1446 1491
1447 if (!cgroup_lock_live_group(cgrp)) 1492 if (!cgroup_lock_live_group(cgrp))
1448 return -ENODEV; 1493 return -ENODEV;
1449 1494
1495 trialcs = alloc_trial_cpuset(cs);
1496 if (!trialcs)
1497 return -ENOMEM;
1498
1450 switch (cft->private) { 1499 switch (cft->private) {
1451 case FILE_CPULIST: 1500 case FILE_CPULIST:
1452 retval = update_cpumask(cgroup_cs(cgrp), buf); 1501 retval = update_cpumask(cs, trialcs, buf);
1453 break; 1502 break;
1454 case FILE_MEMLIST: 1503 case FILE_MEMLIST:
1455 retval = update_nodemask(cgroup_cs(cgrp), buf); 1504 retval = update_nodemask(cs, trialcs, buf);
1456 break; 1505 break;
1457 default: 1506 default:
1458 retval = -EINVAL; 1507 retval = -EINVAL;
1459 break; 1508 break;
1460 } 1509 }
1510
1511 free_trial_cpuset(trialcs);
1461 cgroup_unlock(); 1512 cgroup_unlock();
1462 return retval; 1513 return retval;
1463} 1514}
@@ -1476,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1476 1527
1477static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1528static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1478{ 1529{
1479 cpumask_t mask; 1530 int ret;
1480 1531
1481 mutex_lock(&callback_mutex); 1532 mutex_lock(&callback_mutex);
1482 mask = cs->cpus_allowed; 1533 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1483 mutex_unlock(&callback_mutex); 1534 mutex_unlock(&callback_mutex);
1484 1535
1485 return cpulist_scnprintf(page, PAGE_SIZE, &mask); 1536 return ret;
1486} 1537}
1487 1538
1488static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1539static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -1718,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1718 parent_cs = cgroup_cs(parent); 1769 parent_cs = cgroup_cs(parent);
1719 1770
1720 cs->mems_allowed = parent_cs->mems_allowed; 1771 cs->mems_allowed = parent_cs->mems_allowed;
1721 cs->cpus_allowed = parent_cs->cpus_allowed; 1772 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1722 return; 1773 return;
1723} 1774}
1724 1775
@@ -1744,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create(
1744 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1795 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1745 if (!cs) 1796 if (!cs)
1746 return ERR_PTR(-ENOMEM); 1797 return ERR_PTR(-ENOMEM);
1798 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1799 kfree(cs);
1800 return ERR_PTR(-ENOMEM);
1801 }
1747 1802
1748 cpuset_update_task_memory_state(); 1803 cpuset_update_task_memory_state();
1749 cs->flags = 0; 1804 cs->flags = 0;
@@ -1752,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create(
1752 if (is_spread_slab(parent)) 1807 if (is_spread_slab(parent))
1753 set_bit(CS_SPREAD_SLAB, &cs->flags); 1808 set_bit(CS_SPREAD_SLAB, &cs->flags);
1754 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1809 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1755 cpus_clear(cs->cpus_allowed); 1810 cpumask_clear(cs->cpus_allowed);
1756 nodes_clear(cs->mems_allowed); 1811 nodes_clear(cs->mems_allowed);
1757 cs->mems_generation = cpuset_mems_generation++; 1812 cs->mems_generation = cpuset_mems_generation++;
1758 fmeter_init(&cs->fmeter); 1813 fmeter_init(&cs->fmeter);
@@ -1779,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1779 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1834 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1780 1835
1781 number_of_cpusets--; 1836 number_of_cpusets--;
1837 free_cpumask_var(cs->cpus_allowed);
1782 kfree(cs); 1838 kfree(cs);
1783} 1839}
1784 1840
@@ -1802,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = {
1802 1858
1803int __init cpuset_init_early(void) 1859int __init cpuset_init_early(void)
1804{ 1860{
1861 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1862
1805 top_cpuset.mems_generation = cpuset_mems_generation++; 1863 top_cpuset.mems_generation = cpuset_mems_generation++;
1806 return 0; 1864 return 0;
1807} 1865}
@@ -1817,7 +1875,7 @@ int __init cpuset_init(void)
1817{ 1875{
1818 int err = 0; 1876 int err = 0;
1819 1877
1820 cpus_setall(top_cpuset.cpus_allowed); 1878 cpumask_setall(top_cpuset.cpus_allowed);
1821 nodes_setall(top_cpuset.mems_allowed); 1879 nodes_setall(top_cpuset.mems_allowed);
1822 1880
1823 fmeter_init(&top_cpuset.fmeter); 1881 fmeter_init(&top_cpuset.fmeter);
@@ -1829,6 +1887,9 @@ int __init cpuset_init(void)
1829 if (err < 0) 1887 if (err < 0)
1830 return err; 1888 return err;
1831 1889
1890 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1891 BUG();
1892
1832 number_of_cpusets = 1; 1893 number_of_cpusets = 1;
1833 return 0; 1894 return 0;
1834} 1895}
@@ -1903,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1903 * has online cpus, so can't be empty). 1964 * has online cpus, so can't be empty).
1904 */ 1965 */
1905 parent = cs->parent; 1966 parent = cs->parent;
1906 while (cpus_empty(parent->cpus_allowed) || 1967 while (cpumask_empty(parent->cpus_allowed) ||
1907 nodes_empty(parent->mems_allowed)) 1968 nodes_empty(parent->mems_allowed))
1908 parent = parent->parent; 1969 parent = parent->parent;
1909 1970
@@ -1944,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1944 } 2005 }
1945 2006
1946 /* Continue past cpusets with all cpus, mems online */ 2007 /* Continue past cpusets with all cpus, mems online */
1947 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 2008 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
1948 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2009 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1949 continue; 2010 continue;
1950 2011
@@ -1952,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1952 2013
1953 /* Remove offline cpus and mems from this cpuset. */ 2014 /* Remove offline cpus and mems from this cpuset. */
1954 mutex_lock(&callback_mutex); 2015 mutex_lock(&callback_mutex);
1955 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 2016 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2017 cpu_online_mask);
1956 nodes_and(cp->mems_allowed, cp->mems_allowed, 2018 nodes_and(cp->mems_allowed, cp->mems_allowed,
1957 node_states[N_HIGH_MEMORY]); 2019 node_states[N_HIGH_MEMORY]);
1958 mutex_unlock(&callback_mutex); 2020 mutex_unlock(&callback_mutex);
1959 2021
1960 /* Move tasks from the empty cpuset to a parent */ 2022 /* Move tasks from the empty cpuset to a parent */
1961 if (cpus_empty(cp->cpus_allowed) || 2023 if (cpumask_empty(cp->cpus_allowed) ||
1962 nodes_empty(cp->mems_allowed)) 2024 nodes_empty(cp->mems_allowed))
1963 remove_tasks_in_empty_cpuset(cp); 2025 remove_tasks_in_empty_cpuset(cp);
1964 else { 2026 else {
@@ -1984,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1984 unsigned long phase, void *unused_cpu) 2046 unsigned long phase, void *unused_cpu)
1985{ 2047{
1986 struct sched_domain_attr *attr; 2048 struct sched_domain_attr *attr;
1987 cpumask_t *doms; 2049 struct cpumask *doms;
1988 int ndoms; 2050 int ndoms;
1989 2051
1990 switch (phase) { 2052 switch (phase) {
@@ -1999,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1999 } 2061 }
2000 2062
2001 cgroup_lock(); 2063 cgroup_lock();
2002 top_cpuset.cpus_allowed = cpu_online_map; 2064 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2003 scan_for_empty_cpusets(&top_cpuset); 2065 scan_for_empty_cpusets(&top_cpuset);
2004 ndoms = generate_sched_domains(&doms, &attr); 2066 ndoms = generate_sched_domains(&doms, &attr);
2005 cgroup_unlock(); 2067 cgroup_unlock();
@@ -2044,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2044 2106
2045void __init cpuset_init_smp(void) 2107void __init cpuset_init_smp(void)
2046{ 2108{
2047 top_cpuset.cpus_allowed = cpu_online_map; 2109 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2110 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2049 2111
2050 hotcpu_notifier(cpuset_track_online_cpus, 0); 2112 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2054,15 +2116,15 @@ void __init cpuset_init_smp(void)
2054/** 2116/**
2055 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2117 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
2056 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2118 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2057 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2119 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
2058 * 2120 *
2059 * Description: Returns the cpumask_t cpus_allowed of the cpuset 2121 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2060 * attached to the specified @tsk. Guaranteed to return some non-empty 2122 * attached to the specified @tsk. Guaranteed to return some non-empty
2061 * subset of cpu_online_map, even if this means going outside the 2123 * subset of cpu_online_map, even if this means going outside the
2062 * tasks cpuset. 2124 * tasks cpuset.
2063 **/ 2125 **/
2064 2126
2065void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) 2127void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2066{ 2128{
2067 mutex_lock(&callback_mutex); 2129 mutex_lock(&callback_mutex);
2068 cpuset_cpus_allowed_locked(tsk, pmask); 2130 cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2073,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
2073 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 2135 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2074 * Must be called with callback_mutex held. 2136 * Must be called with callback_mutex held.
2075 **/ 2137 **/
2076void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) 2138void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2077{ 2139{
2078 task_lock(tsk); 2140 task_lock(tsk);
2079 guarantee_online_cpus(task_cs(tsk), pmask); 2141 guarantee_online_cpus(task_cs(tsk), pmask);
@@ -2356,6 +2418,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2356 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2418 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2357} 2419}
2358 2420
2421/**
2422 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2423 * @task: pointer to task_struct of some task.
2424 *
2425 * Description: Prints @task's name, cpuset name, and cached copy of its
2426 * mems_allowed to the kernel log. Must hold task_lock(task) to allow
2427 * dereferencing task_cs(task).
2428 */
2429void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2430{
2431 struct dentry *dentry;
2432
2433 dentry = task_cs(tsk)->css.cgroup->dentry;
2434 spin_lock(&cpuset_buffer_lock);
2435 snprintf(cpuset_name, CPUSET_NAME_LEN,
2436 dentry ? (const char *)dentry->d_name.name : "/");
2437 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2438 tsk->mems_allowed);
2439 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2440 tsk->comm, cpuset_name, cpuset_nodelist);
2441 spin_unlock(&cpuset_buffer_lock);
2442}
2443
2359/* 2444/*
2360 * Collection of memory_pressure is suppressed unless 2445 * Collection of memory_pressure is suppressed unless
2361 * this flag is enabled by writing "1" to the special 2446 * this flag is enabled by writing "1" to the special
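The cpuset.c changes above are dominated by one API migration: fixed-size cpumask_t values and the cpus_*() helpers become pointer-based struct cpumask / cpumask_var_t operations, with alloc_trial_cpuset()/free_trial_cpuset() managing the heap copy that validate_change() inspects. A minimal sketch of the cpumask_var_t lifecycle this relies on (the function name is illustrative; only the generic API from <linux/cpumask.h> is assumed):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>

static int cpumask_var_demo(void)
{
        cpumask_var_t mask;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_copy(mask, cpu_online_mask);    /* always operate through pointers */
        if (cpumask_empty(mask))
                pr_info("no CPUs online?\n");

        free_cpumask_var(mask);
        return 0;
}

With CONFIG_CPUMASK_OFFSTACK disabled, cpumask_var_t is a plain array and the alloc/free calls compile away, so the converted code costs nothing on small machines while avoiding large on-stack masks on big ones.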
diff --git a/kernel/cred.c b/kernel/cred.c
index ff7bc071991c..3a039189d707 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -372,7 +372,8 @@ int commit_creds(struct cred *new)
372 old->fsuid != new->fsuid || 372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid || 373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable); 375 if (task->mm)
376 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0; 377 task->pdeath_signal = 0;
377 smp_wmb(); 378 smp_wmb();
378 } 379 }
@@ -506,6 +507,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
506 else 507 else
507 old = get_cred(&init_cred); 508 old = get_cred(&init_cred);
508 509
510 *new = *old;
509 get_uid(new->user); 511 get_uid(new->user);
510 get_group_info(new->group_info); 512 get_group_info(new->group_info);
511 513
@@ -529,6 +531,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
529 531
530error: 532error:
531 put_cred(new); 533 put_cred(new);
534 put_cred(old);
532 return NULL; 535 return NULL;
533} 536}
534EXPORT_SYMBOL(prepare_kernel_cred); 537EXPORT_SYMBOL(prepare_kernel_cred);
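Both cred.c hunks are robustness fixes: commit_creds() must not dereference a missing mm (kernel threads have none), and prepare_kernel_cred() must start from a complete copy of the old credentials and drop its temporary reference on the error path. A small sketch of the first point, assuming only set_dumpable() and suid_dumpable from <linux/sched.h>; the helper name is illustrative:

static void mark_not_dumpable(struct task_struct *task)
{
        if (task->mm)                           /* NULL for kernel threads */
                set_dumpable(task->mm, suid_dumpable);
        task->pdeath_signal = 0;
}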
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index f013a0c2e111..038707404b76 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
109int dma_alloc_from_coherent(struct device *dev, ssize_t size, 109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret) 110 dma_addr_t *dma_handle, void **ret)
111{ 111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; 112 struct dma_coherent_mem *mem;
113 int order = get_order(size); 113 int order = get_order(size);
114 int pageno;
114 115
115 if (mem) { 116 if (!dev)
116 int page = bitmap_find_free_region(mem->bitmap, mem->size, 117 return 0;
117 order); 118 mem = dev->dma_mem;
118 if (page >= 0) { 119 if (!mem)
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT); 120 return 0;
120 *ret = mem->virt_base + (page << PAGE_SHIFT); 121 if (unlikely(size > mem->size))
121 memset(*ret, 0, size); 122 return 0;
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) 123
123 *ret = NULL; 124 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
125 if (pageno >= 0) {
126 /*
127 * Memory was found in the per-device arena.
128 */
129 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
130 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
131 memset(*ret, 0, size);
132 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
133 /*
134 * The per-device arena is exhausted and we are not
135 * permitted to fall back to generic memory.
136 */
137 *ret = NULL;
138 } else {
139 /*
140 * The per-device arena is exhausted and we are
141 * permitted to fall back to generic memory.
142 */
143 return 0;
124 } 144 }
125 return (mem != NULL); 145 return 1;
126} 146}
127EXPORT_SYMBOL(dma_alloc_from_coherent); 147EXPORT_SYMBOL(dma_alloc_from_coherent);
128 148
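The rewritten dma_alloc_from_coherent() keeps a three-way contract: return 1 with *ret set when the per-device arena satisfied the request, return 1 with *ret == NULL when the arena is exhausted but DMA_MEMORY_EXCLUSIVE forbids falling back, and return 0 when the caller should use generic memory. A hedged sketch of how an architecture allocator might consume that contract (the fallback path is simplified and assumes a 1:1 mapping; real implementations differ):

#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <asm/io.h>

void *example_dma_alloc(struct device *dev, size_t size,
                        dma_addr_t *handle, gfp_t gfp)
{
        void *ret;

        /* 1: handled by the per-device arena (ret may still be NULL,
         *    in which case the allocation simply fails);
         * 0: fall back to generic memory below.			*/
        if (dma_alloc_from_coherent(dev, size, handle, &ret))
                return ret;

        ret = (void *)__get_free_pages(gfp, get_order(size));
        if (ret)
                *handle = virt_to_phys(ret);
        return ret;
}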
diff --git a/kernel/exit.c b/kernel/exit.c
index c9e5a1c14e08..c7740fa3252c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -642,35 +642,31 @@ retry:
642 /* 642 /*
643 * We found no owner yet mm_users > 1: this implies that we are 643 * We found no owner yet mm_users > 1: this implies that we are
644 * most likely racing with swapoff (try_to_unuse()) or /proc or 644 * most likely racing with swapoff (try_to_unuse()) or /proc or
645 * ptrace or page migration (get_task_mm()). Mark owner as NULL, 645 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
646 * so that subsystems can understand the callback and take action.
647 */ 646 */
648 down_write(&mm->mmap_sem);
649 cgroup_mm_owner_callbacks(mm->owner, NULL);
650 mm->owner = NULL; 647 mm->owner = NULL;
651 up_write(&mm->mmap_sem);
652 return; 648 return;
653 649
654assign_new_owner: 650assign_new_owner:
655 BUG_ON(c == p); 651 BUG_ON(c == p);
656 get_task_struct(c); 652 get_task_struct(c);
657 read_unlock(&tasklist_lock);
658 down_write(&mm->mmap_sem);
659 /* 653 /*
660 * The task_lock protects c->mm from changing. 654 * The task_lock protects c->mm from changing.
661 * We always want mm->owner->mm == mm 655 * We always want mm->owner->mm == mm
662 */ 656 */
663 task_lock(c); 657 task_lock(c);
658 /*
659 * Delay read_unlock() till we have the task_lock()
660 * to ensure that c does not slip away underneath us
661 */
662 read_unlock(&tasklist_lock);
664 if (c->mm != mm) { 663 if (c->mm != mm) {
665 task_unlock(c); 664 task_unlock(c);
666 up_write(&mm->mmap_sem);
667 put_task_struct(c); 665 put_task_struct(c);
668 goto retry; 666 goto retry;
669 } 667 }
670 cgroup_mm_owner_callbacks(mm->owner, c);
671 mm->owner = c; 668 mm->owner = c;
672 task_unlock(c); 669 task_unlock(c);
673 up_write(&mm->mmap_sem);
674 put_task_struct(c); 670 put_task_struct(c);
675} 671}
676#endif /* CONFIG_MM_OWNER */ 672#endif /* CONFIG_MM_OWNER */
@@ -1055,10 +1051,7 @@ NORET_TYPE void do_exit(long code)
1055 preempt_count()); 1051 preempt_count());
1056 1052
1057 acct_update_integrals(tsk); 1053 acct_update_integrals(tsk);
1058 if (tsk->mm) { 1054
1059 update_hiwater_rss(tsk->mm);
1060 update_hiwater_vm(tsk->mm);
1061 }
1062 group_dead = atomic_dec_and_test(&tsk->signal->live); 1055 group_dead = atomic_dec_and_test(&tsk->signal->live);
1063 if (group_dead) { 1056 if (group_dead) {
1064 hrtimer_cancel(&tsk->signal->real_timer); 1057 hrtimer_cancel(&tsk->signal->real_timer);
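The mm_update_next_owner() rework above drops the mmap_sem juggling and the cgroup owner callback; the remaining subtlety is lock ordering: the candidate task must be pinned with task_lock() before tasklist_lock is released, or it could exit in between. Schematically, under CONFIG_MM_OWNER (find_candidate() is a hypothetical stand-in for the child/sibling/global scans):

static void assign_new_owner_sketch(struct mm_struct *mm)
{
        struct task_struct *c;

        read_lock(&tasklist_lock);
        c = find_candidate(mm);         /* hypothetical lookup */
        get_task_struct(c);
        task_lock(c);                   /* pin c->mm ... */
        read_unlock(&tasklist_lock);    /* ... before dropping the task list lock */
        if (c->mm == mm)
                mm->owner = c;
        task_unlock(c);
        put_task_struct(c);
}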
diff --git a/kernel/fork.c b/kernel/fork.c
index 43cbf30669e6..1d68f1255dd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
402 402
403static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
404
405static int __init coredump_filter_setup(char *s)
406{
407 default_dump_filter =
408 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
409 MMF_DUMP_FILTER_MASK;
410 return 1;
411}
412
413__setup("coredump_filter=", coredump_filter_setup);
414
403#include <linux/init_task.h> 415#include <linux/init_task.h>
404 416
405static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 417static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
@@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
408 atomic_set(&mm->mm_count, 1); 420 atomic_set(&mm->mm_count, 1);
409 init_rwsem(&mm->mmap_sem); 421 init_rwsem(&mm->mmap_sem);
410 INIT_LIST_HEAD(&mm->mmlist); 422 INIT_LIST_HEAD(&mm->mmlist);
411 mm->flags = (current->mm) ? current->mm->flags 423 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
412 : MMF_DUMP_FILTER_DEFAULT;
413 mm->core_state = NULL; 424 mm->core_state = NULL;
414 mm->nr_ptes = 0; 425 mm->nr_ptes = 0;
415 set_mm_counter(mm, file_rss, 0); 426 set_mm_counter(mm, file_rss, 0);
@@ -758,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
758{ 769{
759 struct sighand_struct *sig; 770 struct sighand_struct *sig;
760 771
761 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { 772 if (clone_flags & CLONE_SIGHAND) {
762 atomic_inc(&current->sighand->count); 773 atomic_inc(&current->sighand->count);
763 return 0; 774 return 0;
764 } 775 }
@@ -1115,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1115 1126
1116 if (pid != &init_struct_pid) { 1127 if (pid != &init_struct_pid) {
1117 retval = -ENOMEM; 1128 retval = -ENOMEM;
1118 pid = alloc_pid(task_active_pid_ns(p)); 1129 pid = alloc_pid(p->nsproxy->pid_ns);
1119 if (!pid) 1130 if (!pid)
1120 goto bad_fork_cleanup_io; 1131 goto bad_fork_cleanup_io;
1121 1132
1122 if (clone_flags & CLONE_NEWPID) { 1133 if (clone_flags & CLONE_NEWPID) {
1123 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1134 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1124 if (retval < 0) 1135 if (retval < 0)
1125 goto bad_fork_free_pid; 1136 goto bad_fork_free_pid;
1126 } 1137 }
@@ -1470,12 +1481,10 @@ void __init proc_caches_init(void)
1470 fs_cachep = kmem_cache_create("fs_cache", 1481 fs_cachep = kmem_cache_create("fs_cache",
1471 sizeof(struct fs_struct), 0, 1482 sizeof(struct fs_struct), 0,
1472 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1483 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1473 vm_area_cachep = kmem_cache_create("vm_area_struct",
1474 sizeof(struct vm_area_struct), 0,
1475 SLAB_PANIC, NULL);
1476 mm_cachep = kmem_cache_create("mm_struct", 1484 mm_cachep = kmem_cache_create("mm_struct",
1477 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1478 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1487 mmap_init();
1479} 1488}
1480 1489
1481/* 1490/*
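fork.c gains a coredump_filter= boot parameter that seeds default_dump_filter; a forked mm still inherits its parent's flags, so the default only matters for processes whose parent has no mm. The same bits remain adjustable per process at run time through procfs, which is the easier way to experiment (userspace sketch, relying on the long-standing /proc/<pid>/coredump_filter interface rather than anything introduced here):

#include <stdio.h>

/* Set this process's coredump filter bits, e.g. 0x23. */
static int set_coredump_filter(unsigned int bits)
{
        FILE *f = fopen("/proc/self/coredump_filter", "w");

        if (!f)
                return -1;
        fprintf(f, "%x\n", bits);
        return fclose(f);
}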
diff --git a/kernel/futex.c b/kernel/futex.c
index 7c6cbabe52b3..002aa189eb09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key)
170 */ 170 */
171static void drop_futex_key_refs(union futex_key *key) 171static void drop_futex_key_refs(union futex_key *key)
172{ 172{
173 if (!key->both.ptr) 173 if (!key->both.ptr) {
174 /* If we're here then we tried to put a key we failed to get */
175 WARN_ON_ONCE(1);
174 return; 176 return;
177 }
175 178
176 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 179 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
177 case FUT_OFF_INODE: 180 case FUT_OFF_INODE:
@@ -730,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
730 } 733 }
731 734
732 spin_unlock(&hb->lock); 735 spin_unlock(&hb->lock);
733out:
734 put_futex_key(fshared, &key); 736 put_futex_key(fshared, &key);
737out:
735 return ret; 738 return ret;
736} 739}
737 740
@@ -755,7 +758,7 @@ retryfull:
755 goto out; 758 goto out;
756 ret = get_futex_key(uaddr2, fshared, &key2); 759 ret = get_futex_key(uaddr2, fshared, &key2);
757 if (unlikely(ret != 0)) 760 if (unlikely(ret != 0))
758 goto out; 761 goto out_put_key1;
759 762
760 hb1 = hash_futex(&key1); 763 hb1 = hash_futex(&key1);
761 hb2 = hash_futex(&key2); 764 hb2 = hash_futex(&key2);
@@ -777,12 +780,12 @@ retry:
777 * but we might get them from range checking 780 * but we might get them from range checking
778 */ 781 */
779 ret = op_ret; 782 ret = op_ret;
780 goto out; 783 goto out_put_keys;
781#endif 784#endif
782 785
783 if (unlikely(op_ret != -EFAULT)) { 786 if (unlikely(op_ret != -EFAULT)) {
784 ret = op_ret; 787 ret = op_ret;
785 goto out; 788 goto out_put_keys;
786 } 789 }
787 790
788 /* 791 /*
@@ -796,7 +799,7 @@ retry:
796 ret = futex_handle_fault((unsigned long)uaddr2, 799 ret = futex_handle_fault((unsigned long)uaddr2,
797 attempt); 800 attempt);
798 if (ret) 801 if (ret)
799 goto out; 802 goto out_put_keys;
800 goto retry; 803 goto retry;
801 } 804 }
802 805
@@ -834,10 +837,11 @@ retry:
834 spin_unlock(&hb1->lock); 837 spin_unlock(&hb1->lock);
835 if (hb1 != hb2) 838 if (hb1 != hb2)
836 spin_unlock(&hb2->lock); 839 spin_unlock(&hb2->lock);
837out: 840out_put_keys:
838 put_futex_key(fshared, &key2); 841 put_futex_key(fshared, &key2);
842out_put_key1:
839 put_futex_key(fshared, &key1); 843 put_futex_key(fshared, &key1);
840 844out:
841 return ret; 845 return ret;
842} 846}
843 847
@@ -854,13 +858,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
854 struct futex_q *this, *next; 858 struct futex_q *this, *next;
855 int ret, drop_count = 0; 859 int ret, drop_count = 0;
856 860
857 retry: 861retry:
858 ret = get_futex_key(uaddr1, fshared, &key1); 862 ret = get_futex_key(uaddr1, fshared, &key1);
859 if (unlikely(ret != 0)) 863 if (unlikely(ret != 0))
860 goto out; 864 goto out;
861 ret = get_futex_key(uaddr2, fshared, &key2); 865 ret = get_futex_key(uaddr2, fshared, &key2);
862 if (unlikely(ret != 0)) 866 if (unlikely(ret != 0))
863 goto out; 867 goto out_put_key1;
864 868
865 hb1 = hash_futex(&key1); 869 hb1 = hash_futex(&key1);
866 hb2 = hash_futex(&key2); 870 hb2 = hash_futex(&key2);
@@ -882,7 +886,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
882 if (!ret) 886 if (!ret)
883 goto retry; 887 goto retry;
884 888
885 return ret; 889 goto out_put_keys;
886 } 890 }
887 if (curval != *cmpval) { 891 if (curval != *cmpval) {
888 ret = -EAGAIN; 892 ret = -EAGAIN;
@@ -927,9 +931,11 @@ out_unlock:
927 while (--drop_count >= 0) 931 while (--drop_count >= 0)
928 drop_futex_key_refs(&key1); 932 drop_futex_key_refs(&key1);
929 933
930out: 934out_put_keys:
931 put_futex_key(fshared, &key2); 935 put_futex_key(fshared, &key2);
936out_put_key1:
932 put_futex_key(fshared, &key1); 937 put_futex_key(fshared, &key1);
938out:
933 return ret; 939 return ret;
934} 940}
935 941
@@ -990,7 +996,7 @@ static int unqueue_me(struct futex_q *q)
990 int ret = 0; 996 int ret = 0;
991 997
992 /* In the common case we don't take the spinlock, which is nice. */ 998 /* In the common case we don't take the spinlock, which is nice. */
993 retry: 999retry:
994 lock_ptr = q->lock_ptr; 1000 lock_ptr = q->lock_ptr;
995 barrier(); 1001 barrier();
996 if (lock_ptr != NULL) { 1002 if (lock_ptr != NULL) {
@@ -1172,11 +1178,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1172 1178
1173 q.pi_state = NULL; 1179 q.pi_state = NULL;
1174 q.bitset = bitset; 1180 q.bitset = bitset;
1175 retry: 1181retry:
1176 q.key = FUTEX_KEY_INIT; 1182 q.key = FUTEX_KEY_INIT;
1177 ret = get_futex_key(uaddr, fshared, &q.key); 1183 ret = get_futex_key(uaddr, fshared, &q.key);
1178 if (unlikely(ret != 0)) 1184 if (unlikely(ret != 0))
1179 goto out_release_sem; 1185 goto out;
1180 1186
1181 hb = queue_lock(&q); 1187 hb = queue_lock(&q);
1182 1188
@@ -1204,6 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1204 1210
1205 if (unlikely(ret)) { 1211 if (unlikely(ret)) {
1206 queue_unlock(&q, hb); 1212 queue_unlock(&q, hb);
1213 put_futex_key(fshared, &q.key);
1207 1214
1208 ret = get_user(uval, uaddr); 1215 ret = get_user(uval, uaddr);
1209 1216
@@ -1213,7 +1220,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1213 } 1220 }
1214 ret = -EWOULDBLOCK; 1221 ret = -EWOULDBLOCK;
1215 if (uval != val) 1222 if (uval != val)
1216 goto out_unlock_release_sem; 1223 goto out_unlock_put_key;
1217 1224
1218 /* Only actually queue if *uaddr contained val. */ 1225 /* Only actually queue if *uaddr contained val. */
1219 queue_me(&q, hb); 1226 queue_me(&q, hb);
@@ -1305,11 +1312,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1305 return -ERESTART_RESTARTBLOCK; 1312 return -ERESTART_RESTARTBLOCK;
1306 } 1313 }
1307 1314
1308 out_unlock_release_sem: 1315out_unlock_put_key:
1309 queue_unlock(&q, hb); 1316 queue_unlock(&q, hb);
1310
1311 out_release_sem:
1312 put_futex_key(fshared, &q.key); 1317 put_futex_key(fshared, &q.key);
1318
1319out:
1313 return ret; 1320 return ret;
1314} 1321}
1315 1322
@@ -1358,16 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1358 } 1365 }
1359 1366
1360 q.pi_state = NULL; 1367 q.pi_state = NULL;
1361 retry: 1368retry:
1362 q.key = FUTEX_KEY_INIT; 1369 q.key = FUTEX_KEY_INIT;
1363 ret = get_futex_key(uaddr, fshared, &q.key); 1370 ret = get_futex_key(uaddr, fshared, &q.key);
1364 if (unlikely(ret != 0)) 1371 if (unlikely(ret != 0))
1365 goto out_release_sem; 1372 goto out;
1366 1373
1367 retry_unlocked: 1374retry_unlocked:
1368 hb = queue_lock(&q); 1375 hb = queue_lock(&q);
1369 1376
1370 retry_locked: 1377retry_locked:
1371 ret = lock_taken = 0; 1378 ret = lock_taken = 0;
1372 1379
1373 /* 1380 /*
@@ -1388,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1388 */ 1395 */
1389 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { 1396 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1390 ret = -EDEADLK; 1397 ret = -EDEADLK;
1391 goto out_unlock_release_sem; 1398 goto out_unlock_put_key;
1392 } 1399 }
1393 1400
1394 /* 1401 /*
1395 * Surprise - we got the lock. Just return to userspace: 1402 * Surprise - we got the lock. Just return to userspace:
1396 */ 1403 */
1397 if (unlikely(!curval)) 1404 if (unlikely(!curval))
1398 goto out_unlock_release_sem; 1405 goto out_unlock_put_key;
1399 1406
1400 uval = curval; 1407 uval = curval;
1401 1408
@@ -1431,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1431 * We took the lock due to owner died take over. 1438 * We took the lock due to owner died take over.
1432 */ 1439 */
1433 if (unlikely(lock_taken)) 1440 if (unlikely(lock_taken))
1434 goto out_unlock_release_sem; 1441 goto out_unlock_put_key;
1435 1442
1436 /* 1443 /*
1437 * We dont have the lock. Look up the PI state (or create it if 1444 * We dont have the lock. Look up the PI state (or create it if
@@ -1470,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1470 goto retry_locked; 1477 goto retry_locked;
1471 } 1478 }
1472 default: 1479 default:
1473 goto out_unlock_release_sem; 1480 goto out_unlock_put_key;
1474 } 1481 }
1475 } 1482 }
1476 1483
@@ -1561,16 +1568,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1561 destroy_hrtimer_on_stack(&to->timer); 1568 destroy_hrtimer_on_stack(&to->timer);
1562 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1569 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1563 1570
1564 out_unlock_release_sem: 1571out_unlock_put_key:
1565 queue_unlock(&q, hb); 1572 queue_unlock(&q, hb);
1566 1573
1567 out_release_sem: 1574out_put_key:
1568 put_futex_key(fshared, &q.key); 1575 put_futex_key(fshared, &q.key);
1576out:
1569 if (to) 1577 if (to)
1570 destroy_hrtimer_on_stack(&to->timer); 1578 destroy_hrtimer_on_stack(&to->timer);
1571 return ret; 1579 return ret;
1572 1580
1573 uaddr_faulted: 1581uaddr_faulted:
1574 /* 1582 /*
1575 * We have to r/w *(int __user *)uaddr, and we have to modify it 1583 * We have to r/w *(int __user *)uaddr, and we have to modify it
1576 * atomically. Therefore, if we continue to fault after get_user() 1584 * atomically. Therefore, if we continue to fault after get_user()
@@ -1583,7 +1591,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1583 if (attempt++) { 1591 if (attempt++) {
1584 ret = futex_handle_fault((unsigned long)uaddr, attempt); 1592 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1585 if (ret) 1593 if (ret)
1586 goto out_release_sem; 1594 goto out_put_key;
1587 goto retry_unlocked; 1595 goto retry_unlocked;
1588 } 1596 }
1589 1597
@@ -1675,9 +1683,9 @@ retry_unlocked:
1675 1683
1676out_unlock: 1684out_unlock:
1677 spin_unlock(&hb->lock); 1685 spin_unlock(&hb->lock);
1678out:
1679 put_futex_key(fshared, &key); 1686 put_futex_key(fshared, &key);
1680 1687
1688out:
1681 return ret; 1689 return ret;
1682 1690
1683pi_faulted: 1691pi_faulted:
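Most of the futex.c churn is a relabelling of the exit paths so that every successful get_futex_key() is balanced by exactly one put_futex_key() on every return, now that keys hold references. The idiom, as it would look inside kernel/futex.c (the function and its body are illustrative):

static int two_key_op_sketch(u32 __user *uaddr1, u32 __user *uaddr2, int fshared)
{
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        int ret;

        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
        ret = get_futex_key(uaddr2, fshared, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;

        /* ... work on both hash buckets ... */

        put_futex_key(fshared, &key2);
out_put_key1:
        put_futex_key(fshared, &key1);
out:
        return ret;
}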
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index eb2bfefa6dcc..1455b7651b6b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -634,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
634{ 634{
635} 635}
636 636
637static void __run_hrtimer(struct hrtimer *timer);
638 637
639/* 638/*
640 * When High resolution timers are active, try to reprogram. Note, that in case 639 * When High resolution timers are active, try to reprogram. Note, that in case
@@ -646,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
646 struct hrtimer_clock_base *base) 645 struct hrtimer_clock_base *base)
647{ 646{
648 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 647 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
649 /* 648 spin_unlock(&base->cpu_base->lock);
650 * XXX: recursion check? 649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
651 * hrtimer_forward() should round up with timer granularity 650 spin_lock(&base->cpu_base->lock);
652 * so that we never get into inf recursion here,
653 * it doesn't do that though
654 */
655 __run_hrtimer(timer);
656 return 1; 651 return 1;
657 } 652 }
658 return 0; 653 return 0;
@@ -705,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
705} 700}
706static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 701static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
707static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 702static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
708static inline int hrtimer_reprogram(struct hrtimer *timer,
709 struct hrtimer_clock_base *base)
710{
711 return 0;
712}
713 703
714#endif /* CONFIG_HIGH_RES_TIMERS */ 704#endif /* CONFIG_HIGH_RES_TIMERS */
715 705
@@ -780,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
780 * 770 *
781 * The timer is inserted in expiry order. Insertion into the 771 * The timer is inserted in expiry order. Insertion into the
782 * red black tree is O(log(n)). Must hold the base lock. 772 * red black tree is O(log(n)). Must hold the base lock.
773 *
774 * Returns 1 when the new timer is the leftmost timer in the tree.
783 */ 775 */
784static void enqueue_hrtimer(struct hrtimer *timer, 776static int enqueue_hrtimer(struct hrtimer *timer,
785 struct hrtimer_clock_base *base, int reprogram) 777 struct hrtimer_clock_base *base)
786{ 778{
787 struct rb_node **link = &base->active.rb_node; 779 struct rb_node **link = &base->active.rb_node;
788 struct rb_node *parent = NULL; 780 struct rb_node *parent = NULL;
@@ -814,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
814 * Insert the timer to the rbtree and check whether it 806 * Insert the timer to the rbtree and check whether it
815 * replaces the first pending timer 807 * replaces the first pending timer
816 */ 808 */
817 if (leftmost) { 809 if (leftmost)
818 /*
819 * Reprogram the clock event device. When the timer is already
820 * expired hrtimer_enqueue_reprogram has either called the
821 * callback or added it to the pending list and raised the
822 * softirq.
823 *
824 * This is a NOP for !HIGHRES
825 */
826 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
827 return;
828
829 base->first = &timer->node; 810 base->first = &timer->node;
830 }
831 811
832 rb_link_node(&timer->node, parent, link); 812 rb_link_node(&timer->node, parent, link);
833 rb_insert_color(&timer->node, &base->active); 813 rb_insert_color(&timer->node, &base->active);
@@ -836,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
836 * state of a possibly running callback. 816 * state of a possibly running callback.
837 */ 817 */
838 timer->state |= HRTIMER_STATE_ENQUEUED; 818 timer->state |= HRTIMER_STATE_ENQUEUED;
819
820 return leftmost;
839} 821}
840 822
841/* 823/*
@@ -912,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
912{ 894{
913 struct hrtimer_clock_base *base, *new_base; 895 struct hrtimer_clock_base *base, *new_base;
914 unsigned long flags; 896 unsigned long flags;
915 int ret; 897 int ret, leftmost;
916 898
917 base = lock_hrtimer_base(timer, &flags); 899 base = lock_hrtimer_base(timer, &flags);
918 900
@@ -940,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
940 922
941 timer_stats_hrtimer_set_start_info(timer); 923 timer_stats_hrtimer_set_start_info(timer);
942 924
925 leftmost = enqueue_hrtimer(timer, new_base);
926
943 /* 927 /*
944 * Only allow reprogramming if the new base is on this CPU. 928 * Only allow reprogramming if the new base is on this CPU.
945 * (it might still be on another CPU if the timer was pending) 929 * (it might still be on another CPU if the timer was pending)
930 *
931 * XXX send_remote_softirq() ?
946 */ 932 */
947 enqueue_hrtimer(timer, new_base, 933 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
948 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 934 hrtimer_enqueue_reprogram(timer, new_base);
949 935
950 unlock_hrtimer_base(timer, &flags); 936 unlock_hrtimer_base(timer, &flags);
951 937
@@ -1157,13 +1143,13 @@ static void __run_hrtimer(struct hrtimer *timer)
1157 spin_lock(&cpu_base->lock); 1143 spin_lock(&cpu_base->lock);
1158 1144
1159 /* 1145 /*
1160 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1146 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
1161 * reprogramming of the event hardware. This happens at the end of this 1147 * we do not reprogramm the event hardware. Happens either in
1162 * function anyway. 1148 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1163 */ 1149 */
1164 if (restart != HRTIMER_NORESTART) { 1150 if (restart != HRTIMER_NORESTART) {
1165 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1151 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1166 enqueue_hrtimer(timer, base, 0); 1152 enqueue_hrtimer(timer, base);
1167 } 1153 }
1168 timer->state &= ~HRTIMER_STATE_CALLBACK; 1154 timer->state &= ~HRTIMER_STATE_CALLBACK;
1169} 1155}
@@ -1243,6 +1229,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1243 } 1229 }
1244} 1230}
1245 1231
1232/*
1233 * local version of hrtimer_peek_ahead_timers() called with interrupts
1234 * disabled.
1235 */
1236static void __hrtimer_peek_ahead_timers(void)
1237{
1238 struct tick_device *td;
1239
1240 if (!hrtimer_hres_active())
1241 return;
1242
1243 td = &__get_cpu_var(tick_cpu_device);
1244 if (td && td->evtdev)
1245 hrtimer_interrupt(td->evtdev);
1246}
1247
1246/** 1248/**
1247 * hrtimer_peek_ahead_timers -- run soft-expired timers now 1249 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1248 * 1250 *
@@ -1254,20 +1256,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1254 */ 1256 */
1255void hrtimer_peek_ahead_timers(void) 1257void hrtimer_peek_ahead_timers(void)
1256{ 1258{
1257 struct tick_device *td;
1258 unsigned long flags; 1259 unsigned long flags;
1259 1260
1260 if (!hrtimer_hres_active())
1261 return;
1262
1263 local_irq_save(flags); 1261 local_irq_save(flags);
1264 td = &__get_cpu_var(tick_cpu_device); 1262 __hrtimer_peek_ahead_timers();
1265 if (td && td->evtdev)
1266 hrtimer_interrupt(td->evtdev);
1267 local_irq_restore(flags); 1263 local_irq_restore(flags);
1268} 1264}
1269 1265
1270#endif /* CONFIG_HIGH_RES_TIMERS */ 1266static void run_hrtimer_softirq(struct softirq_action *h)
1267{
1268 hrtimer_peek_ahead_timers();
1269}
1270
1271#else /* CONFIG_HIGH_RES_TIMERS */
1272
1273static inline void __hrtimer_peek_ahead_timers(void) { }
1274
1275#endif /* !CONFIG_HIGH_RES_TIMERS */
1271 1276
1272/* 1277/*
1273 * Called from timer softirq every jiffy, expire hrtimers: 1278 * Called from timer softirq every jiffy, expire hrtimers:
@@ -1513,39 +1518,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1513 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1518 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1514 timer->base = new_base; 1519 timer->base = new_base;
1515 /* 1520 /*
1516 * Enqueue the timers on the new cpu, but do not reprogram 1521 * Enqueue the timers on the new cpu. This does not
1517 * the timer as that would enable a deadlock between 1522 * reprogram the event device in case the timer
1518 * hrtimer_enqueue_reprogramm() running the timer and us still 1523 * expires before the earliest on this CPU, but we run
1519 * holding a nested base lock. 1524 * hrtimer_interrupt after we migrated everything to
1520 * 1525 * sort out already expired timers and reprogram the
1521 * Instead we tickle the hrtimer interrupt after the migration 1526 * event device.
1522 * is done, which will run all expired timers and re-programm
1523 * the timer device.
1524 */ 1527 */
1525 enqueue_hrtimer(timer, new_base, 0); 1528 enqueue_hrtimer(timer, new_base);
1526 1529
1527 /* Clear the migration state bit */ 1530 /* Clear the migration state bit */
1528 timer->state &= ~HRTIMER_STATE_MIGRATE; 1531 timer->state &= ~HRTIMER_STATE_MIGRATE;
1529 } 1532 }
1530} 1533}
1531 1534
1532static int migrate_hrtimers(int scpu) 1535static void migrate_hrtimers(int scpu)
1533{ 1536{
1534 struct hrtimer_cpu_base *old_base, *new_base; 1537 struct hrtimer_cpu_base *old_base, *new_base;
1535 int dcpu, i; 1538 int i;
1536 1539
1537 BUG_ON(cpu_online(scpu)); 1540 BUG_ON(cpu_online(scpu));
1538 old_base = &per_cpu(hrtimer_bases, scpu);
1539 new_base = &get_cpu_var(hrtimer_bases);
1540
1541 dcpu = smp_processor_id();
1542
1543 tick_cancel_sched_timer(scpu); 1541 tick_cancel_sched_timer(scpu);
1542
1543 local_irq_disable();
1544 old_base = &per_cpu(hrtimer_bases, scpu);
1545 new_base = &__get_cpu_var(hrtimer_bases);
1544 /* 1546 /*
1545 * The caller is globally serialized and nobody else 1547 * The caller is globally serialized and nobody else
1546 * takes two locks at once, deadlock is not possible. 1548 * takes two locks at once, deadlock is not possible.
1547 */ 1549 */
1548 spin_lock_irq(&new_base->lock); 1550 spin_lock(&new_base->lock);
1549 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1551 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1550 1552
1551 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1553 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
@@ -1554,15 +1556,11 @@ static int migrate_hrtimers(int scpu)
1554 } 1556 }
1555 1557
1556 spin_unlock(&old_base->lock); 1558 spin_unlock(&old_base->lock);
1557 spin_unlock_irq(&new_base->lock); 1559 spin_unlock(&new_base->lock);
1558 put_cpu_var(hrtimer_bases);
1559 1560
1560 return dcpu; 1561 /* Check, if we got expired work to do */
1561} 1562 __hrtimer_peek_ahead_timers();
1562 1563 local_irq_enable();
1563static void tickle_timers(void *arg)
1564{
1565 hrtimer_peek_ahead_timers();
1566} 1564}
1567 1565
1568#endif /* CONFIG_HOTPLUG_CPU */ 1566#endif /* CONFIG_HOTPLUG_CPU */
@@ -1583,11 +1581,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1583 case CPU_DEAD: 1581 case CPU_DEAD:
1584 case CPU_DEAD_FROZEN: 1582 case CPU_DEAD_FROZEN:
1585 { 1583 {
1586 int dcpu;
1587
1588 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); 1584 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1589 dcpu = migrate_hrtimers(scpu); 1585 migrate_hrtimers(scpu);
1590 smp_call_function_single(dcpu, tickle_timers, NULL, 0);
1591 break; 1586 break;
1592 } 1587 }
1593#endif 1588#endif
@@ -1608,6 +1603,9 @@ void __init hrtimers_init(void)
1608 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1603 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1609 (void *)(long)smp_processor_id()); 1604 (void *)(long)smp_processor_id());
1610 register_cpu_notifier(&hrtimers_nb); 1605 register_cpu_notifier(&hrtimers_nb);
1606#ifdef CONFIG_HIGH_RES_TIMERS
1607 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1608#endif
1611} 1609}
1612 1610
1613/** 1611/**
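
The hrtimer.c hunks above fold the peek-ahead logic into a single __hrtimer_peek_ahead_timers() helper, shared by the new HRTIMER_SOFTIRQ handler and by migrate_hrtimers(), which now takes the destination and source per-CPU base locks in a fixed nesting order and runs the catch-up pass once after the move. The following is a rough standalone analogue of that migration shape, not part of the patch; plain userspace C with pthreads, and every name in it is invented for illustration.

#include <pthread.h>
#include <stdio.h>

#define MAX_TIMERS 8

struct cpu_base {
	pthread_mutex_t lock;
	int timers[MAX_TIMERS];		/* expiry times, 0 = empty slot */
};

/* analogue of hrtimer_interrupt()/__hrtimer_peek_ahead_timers() */
static void run_expired(struct cpu_base *base, int now)
{
	pthread_mutex_lock(&base->lock);
	for (int i = 0; i < MAX_TIMERS; i++)
		if (base->timers[i] && base->timers[i] <= now) {
			printf("timer %d expired\n", base->timers[i]);
			base->timers[i] = 0;
		}
	pthread_mutex_unlock(&base->lock);
}

/* analogue of migrate_hrtimers(): destination lock first, then the dead CPU's */
static void migrate(struct cpu_base *dead, struct cpu_base *cur, int now)
{
	pthread_mutex_lock(&cur->lock);
	pthread_mutex_lock(&dead->lock);

	for (int i = 0; i < MAX_TIMERS; i++) {
		if (!dead->timers[i])
			continue;
		for (int j = 0; j < MAX_TIMERS; j++)
			if (!cur->timers[j]) {
				cur->timers[j] = dead->timers[i];
				dead->timers[i] = 0;
				break;
			}
	}

	pthread_mutex_unlock(&dead->lock);
	pthread_mutex_unlock(&cur->lock);

	/* check once, after the move, whether anything is already due */
	run_expired(cur, now);
}

int main(void)
{
	struct cpu_base dead = { PTHREAD_MUTEX_INITIALIZER, { 5, 20 } };
	struct cpu_base cur  = { PTHREAD_MUTEX_INITIALIZER, { 15 } };

	migrate(&dead, &cur, 10);	/* timer "5" is already overdue */
	return 0;
}

The two details that matter are the fixed lock order (destination before the dead CPU's base) and the single expired-work pass that runs after both locks are dropped.
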
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..1de9700f416e 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/async.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void)
34 unsigned int status; 35 unsigned int status;
35 int i; 36 int i;
36 37
38 /*
39 * quiesce the kernel, or at least the asynchronous portion
40 */
41 async_synchronize_full();
37 mutex_lock(&probing_active); 42 mutex_lock(&probing_active);
38 /* 43 /*
39 * something may have generated an irq long ago and we want to 44 * something may have generated an irq long ago and we want to
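
probe_irq_on() above now calls async_synchronize_full() before taking probing_active, so that asynchronous boot-time probing cannot still be generating interrupts while auto-detection runs. A minimal userspace analogue of that "flush all outstanding async work before entering a quiet phase" idea is sketched below with plain pthreads; the job names are invented and this is not the kernel async API.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NR_JOBS 3

/* stand-in for asynchronous device discovery work */
static void *async_job(void *arg)
{
	printf("probing device %ld\n", (long)arg);
	usleep(1000);
	return NULL;
}

int main(void)
{
	pthread_t jobs[NR_JOBS];

	for (long i = 0; i < NR_JOBS; i++)
		pthread_create(&jobs[i], NULL, async_job, (void *)i);

	/* analogue of async_synchronize_full(): wait for everything queued */
	for (int i = 0; i < NR_JOBS; i++)
		pthread_join(jobs[i], NULL);

	/* only now is it safe to enter the phase that needs quiescence */
	printf("all async work done, starting IRQ autoprobe\n");
	return 0;
}
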
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b46dbb908669..a27a5f64443d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * request_module - try to load a kernel module
54 * @fmt: printf style format string for the name of the module 54 * @fmt: printf style format string for the name of the module
55 * @varargs: arguements as specified in the format string 55 * @...: arguments as specified in the format string
56 * 56 *
57 * Load a module using the user mode module loader. The function returns 57 * Load a module using the user mode module loader. The function returns
58 * zero on success or a negative errno code on failure. Note that a 58 * zero on success or a negative errno code on failure. Note that a
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9f8a3f25259a..1b9cbdc0127a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69/* NOTE: change this value only with kprobe_mutex held */ 69/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 70static bool kprobe_enabled;
71 71
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned_in_smp; 75 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -115,6 +115,7 @@ enum kprobe_slot_state {
115 SLOT_USED = 2, 115 SLOT_USED = 2,
116}; 116};
117 117
118static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
118static struct hlist_head kprobe_insn_pages; 119static struct hlist_head kprobe_insn_pages;
119static int kprobe_garbage_slots; 120static int kprobe_garbage_slots;
120static int collect_garbage_slots(void); 121static int collect_garbage_slots(void);
@@ -144,10 +145,10 @@ loop_end:
144} 145}
145 146
146/** 147/**
147 * get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
148 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
149 */ 150 */
150kprobe_opcode_t __kprobes *get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(void)
151{ 152{
152 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
153 struct hlist_node *pos; 154 struct hlist_node *pos;
@@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
196 return kip->insns; 197 return kip->insns;
197} 198}
198 199
200kprobe_opcode_t __kprobes *get_insn_slot(void)
201{
202 kprobe_opcode_t *ret;
203 mutex_lock(&kprobe_insn_mutex);
204 ret = __get_insn_slot();
205 mutex_unlock(&kprobe_insn_mutex);
206 return ret;
207}
208
199/* Return 1 if all garbages are collected, otherwise 0. */ 209/* Return 1 if all garbages are collected, otherwise 0. */
200static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 210static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
201{ 211{
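
The get_insn_slot() change above is the usual lock-wrapping split: the real work moves into a static __get_insn_slot() that expects kprobe_insn_mutex to be held, and the exported function becomes a thin take-lock/call/drop-lock wrapper. A tiny sketch of the same shape in standalone C, with invented names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t slot_mutex = PTHREAD_MUTEX_INITIALIZER;
static int next_slot;

/* does the real work; callers must already hold slot_mutex */
static int __get_slot(void)
{
	return next_slot++;
}

/* public entry point: take the mutex, delegate, drop the mutex */
static int get_slot(void)
{
	int ret;

	pthread_mutex_lock(&slot_mutex);
	ret = __get_slot();
	pthread_mutex_unlock(&slot_mutex);
	return ret;
}

int main(void)
{
	printf("first slot: %d\n", get_slot());
	printf("second slot: %d\n", get_slot());
	return 0;
}
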
@@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void)
226{ 236{
227 struct kprobe_insn_page *kip; 237 struct kprobe_insn_page *kip;
228 struct hlist_node *pos, *next; 238 struct hlist_node *pos, *next;
239 int safety;
229 240
230 /* Ensure no-one is preepmted on the garbages */ 241 /* Ensure no-one is preepmted on the garbages */
231 if (check_safety() != 0) 242 mutex_unlock(&kprobe_insn_mutex);
243 safety = check_safety();
244 mutex_lock(&kprobe_insn_mutex);
245 if (safety != 0)
232 return -EAGAIN; 246 return -EAGAIN;
233 247
234 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 248 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
251 struct kprobe_insn_page *kip; 265 struct kprobe_insn_page *kip;
252 struct hlist_node *pos; 266 struct hlist_node *pos;
253 267
268 mutex_lock(&kprobe_insn_mutex);
254 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 269 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
255 if (kip->insns <= slot && 270 if (kip->insns <= slot &&
256 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 271 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
@@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
267 282
268 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 283 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
269 collect_garbage_slots(); 284 collect_garbage_slots();
285
286 mutex_unlock(&kprobe_insn_mutex);
270} 287}
271#endif 288#endif
272 289
@@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
310 struct kprobe *kp; 327 struct kprobe *kp;
311 328
312 list_for_each_entry_rcu(kp, &p->list, list) { 329 list_for_each_entry_rcu(kp, &p->list, list) {
313 if (kp->pre_handler) { 330 if (kp->pre_handler && !kprobe_gone(kp)) {
314 set_kprobe_instance(kp); 331 set_kprobe_instance(kp);
315 if (kp->pre_handler(kp, regs)) 332 if (kp->pre_handler(kp, regs))
316 return 1; 333 return 1;
@@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
326 struct kprobe *kp; 343 struct kprobe *kp;
327 344
328 list_for_each_entry_rcu(kp, &p->list, list) { 345 list_for_each_entry_rcu(kp, &p->list, list) {
329 if (kp->post_handler) { 346 if (kp->post_handler && !kprobe_gone(kp)) {
330 set_kprobe_instance(kp); 347 set_kprobe_instance(kp);
331 kp->post_handler(kp, regs, flags); 348 kp->post_handler(kp, regs, flags);
332 reset_kprobe_instance(); 349 reset_kprobe_instance();
@@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
393 hlist_add_head(&ri->hlist, head); 410 hlist_add_head(&ri->hlist, head);
394} 411}
395 412
396void kretprobe_hash_lock(struct task_struct *tsk, 413void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags) 414 struct hlist_head **head, unsigned long *flags)
398{ 415{
399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 416 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 421 spin_lock_irqsave(hlist_lock, *flags);
405} 422}
406 423
407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 424static void __kprobes kretprobe_table_lock(unsigned long hash,
425 unsigned long *flags)
408{ 426{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 427 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 428 spin_lock_irqsave(hlist_lock, *flags);
411} 429}
412 430
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) 431void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
432 unsigned long *flags)
414{ 433{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 434 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock; 435 spinlock_t *hlist_lock;
@@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
419 spin_unlock_irqrestore(hlist_lock, *flags); 438 spin_unlock_irqrestore(hlist_lock, *flags);
420} 439}
421 440
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 441void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{ 442{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 443 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags); 444 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
526 ap->addr = p->addr; 545 ap->addr = p->addr;
527 ap->pre_handler = aggr_pre_handler; 546 ap->pre_handler = aggr_pre_handler;
528 ap->fault_handler = aggr_fault_handler; 547 ap->fault_handler = aggr_fault_handler;
529 if (p->post_handler) 548 /* We don't care the kprobe which has gone. */
549 if (p->post_handler && !kprobe_gone(p))
530 ap->post_handler = aggr_post_handler; 550 ap->post_handler = aggr_post_handler;
531 if (p->break_handler) 551 if (p->break_handler && !kprobe_gone(p))
532 ap->break_handler = aggr_break_handler; 552 ap->break_handler = aggr_break_handler;
533 553
534 INIT_LIST_HEAD(&ap->list); 554 INIT_LIST_HEAD(&ap->list);
@@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
547 int ret = 0; 567 int ret = 0;
548 struct kprobe *ap; 568 struct kprobe *ap;
549 569
570 if (kprobe_gone(old_p)) {
571 /*
572 * Attempting to insert new probe at the same location that
573 * had a probe in the module vaddr area which already
574 * freed. So, the instruction slot has already been
575 * released. We need a new slot for the new probe.
576 */
577 ret = arch_prepare_kprobe(old_p);
578 if (ret)
579 return ret;
580 }
550 if (old_p->pre_handler == aggr_pre_handler) { 581 if (old_p->pre_handler == aggr_pre_handler) {
551 copy_kprobe(old_p, p); 582 copy_kprobe(old_p, p);
552 ret = add_new_kprobe(old_p, p); 583 ret = add_new_kprobe(old_p, p);
584 ap = old_p;
553 } else { 585 } else {
554 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
555 if (!ap) 587 if (!ap) {
588 if (kprobe_gone(old_p))
589 arch_remove_kprobe(old_p);
556 return -ENOMEM; 590 return -ENOMEM;
591 }
557 add_aggr_kprobe(ap, old_p); 592 add_aggr_kprobe(ap, old_p);
558 copy_kprobe(ap, p); 593 copy_kprobe(ap, p);
559 ret = add_new_kprobe(ap, p); 594 ret = add_new_kprobe(ap, p);
560 } 595 }
596 if (kprobe_gone(old_p)) {
597 /*
598 * If the old_p has gone, its breakpoint has been disarmed.
599 * We have to arm it again after preparing real kprobes.
600 */
601 ap->flags &= ~KPROBE_FLAG_GONE;
602 if (kprobe_enabled)
603 arch_arm_kprobe(ap);
604 }
561 return ret; 605 return ret;
562} 606}
563 607
@@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
600 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 644 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
601} 645}
602 646
603static int __kprobes __register_kprobe(struct kprobe *p, 647int __kprobes register_kprobe(struct kprobe *p)
604 unsigned long called_from)
605{ 648{
606 int ret = 0; 649 int ret = 0;
607 struct kprobe *old_p; 650 struct kprobe *old_p;
@@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p,
620 return -EINVAL; 663 return -EINVAL;
621 } 664 }
622 665
623 p->mod_refcounted = 0; 666 p->flags = 0;
624
625 /* 667 /*
626 * Check if are we probing a module. 668 * Check if are we probing a module.
627 */ 669 */
628 probed_mod = __module_text_address((unsigned long) p->addr); 670 probed_mod = __module_text_address((unsigned long) p->addr);
629 if (probed_mod) { 671 if (probed_mod) {
630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
632 /* 672 /*
633 * We must allow modules to probe themself and in this case 673 * We must hold a refcount of the probed module while updating
634 * avoid incrementing the module refcount, so as to allow 674 * its code to prohibit unexpected unloading.
635 * unloading of self probing modules.
636 */ 675 */
637 if (calling_mod && calling_mod != probed_mod) { 676 if (unlikely(!try_module_get(probed_mod))) {
638 if (unlikely(!try_module_get(probed_mod))) { 677 preempt_enable();
639 preempt_enable(); 678 return -EINVAL;
640 return -EINVAL; 679 }
641 } 680 /*
642 p->mod_refcounted = 1; 681 * If the module freed .init.text, we couldn't insert
643 } else 682 * kprobes in there.
644 probed_mod = NULL; 683 */
684 if (within_module_init((unsigned long)p->addr, probed_mod) &&
685 probed_mod->state != MODULE_STATE_COMING) {
686 module_put(probed_mod);
687 preempt_enable();
688 return -EINVAL;
689 }
645 } 690 }
646 preempt_enable(); 691 preempt_enable();
647 692
@@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p,
668out: 713out:
669 mutex_unlock(&kprobe_mutex); 714 mutex_unlock(&kprobe_mutex);
670 715
671 if (ret && probed_mod) 716 if (probed_mod)
672 module_put(probed_mod); 717 module_put(probed_mod);
718
673 return ret; 719 return ret;
674} 720}
675 721
@@ -697,16 +743,16 @@ valid_p:
697 list_is_singular(&old_p->list))) { 743 list_is_singular(&old_p->list))) {
698 /* 744 /*
699 * Only probe on the hash list. Disarm only if kprobes are 745 * Only probe on the hash list. Disarm only if kprobes are
700 * enabled - otherwise, the breakpoint would already have 746 * enabled and not gone - otherwise, the breakpoint would
701 * been removed. We save on flushing icache. 747 * already have been removed. We save on flushing icache.
702 */ 748 */
703 if (kprobe_enabled) 749 if (kprobe_enabled && !kprobe_gone(old_p))
704 arch_disarm_kprobe(p); 750 arch_disarm_kprobe(p);
705 hlist_del_rcu(&old_p->hlist); 751 hlist_del_rcu(&old_p->hlist);
706 } else { 752 } else {
707 if (p->break_handler) 753 if (p->break_handler && !kprobe_gone(p))
708 old_p->break_handler = NULL; 754 old_p->break_handler = NULL;
709 if (p->post_handler) { 755 if (p->post_handler && !kprobe_gone(p)) {
710 list_for_each_entry_rcu(list_p, &old_p->list, list) { 756 list_for_each_entry_rcu(list_p, &old_p->list, list) {
711 if ((list_p != p) && (list_p->post_handler)) 757 if ((list_p != p) && (list_p->post_handler))
712 goto noclean; 758 goto noclean;
@@ -721,39 +767,27 @@ noclean:
721 767
722static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 768static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
723{ 769{
724 struct module *mod;
725 struct kprobe *old_p; 770 struct kprobe *old_p;
726 771
727 if (p->mod_refcounted) { 772 if (list_empty(&p->list))
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
732 mod = module_text_address((unsigned long)p->addr);
733 if (mod)
734 module_put(mod);
735 }
736
737 if (list_empty(&p->list) || list_is_singular(&p->list)) {
738 if (!list_empty(&p->list)) {
739 /* "p" is the last child of an aggr_kprobe */
740 old_p = list_entry(p->list.next, struct kprobe, list);
741 list_del(&p->list);
742 kfree(old_p);
743 }
744 arch_remove_kprobe(p); 773 arch_remove_kprobe(p);
774 else if (list_is_singular(&p->list)) {
775 /* "p" is the last child of an aggr_kprobe */
776 old_p = list_entry(p->list.next, struct kprobe, list);
777 list_del(&p->list);
778 arch_remove_kprobe(old_p);
779 kfree(old_p);
745 } 780 }
746} 781}
747 782
748static int __register_kprobes(struct kprobe **kps, int num, 783int __kprobes register_kprobes(struct kprobe **kps, int num)
749 unsigned long called_from)
750{ 784{
751 int i, ret = 0; 785 int i, ret = 0;
752 786
753 if (num <= 0) 787 if (num <= 0)
754 return -EINVAL; 788 return -EINVAL;
755 for (i = 0; i < num; i++) { 789 for (i = 0; i < num; i++) {
756 ret = __register_kprobe(kps[i], called_from); 790 ret = register_kprobe(kps[i]);
757 if (ret < 0) { 791 if (ret < 0) {
758 if (i > 0) 792 if (i > 0)
759 unregister_kprobes(kps, i); 793 unregister_kprobes(kps, i);
@@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num,
763 return ret; 797 return ret;
764} 798}
765 799
766/*
767 * Registration and unregistration functions for kprobe.
768 */
769int __kprobes register_kprobe(struct kprobe *p)
770{
771 return __register_kprobes(&p, 1,
772 (unsigned long)__builtin_return_address(0));
773}
774
775void __kprobes unregister_kprobe(struct kprobe *p) 800void __kprobes unregister_kprobe(struct kprobe *p)
776{ 801{
777 unregister_kprobes(&p, 1); 802 unregister_kprobes(&p, 1);
778} 803}
779 804
780int __kprobes register_kprobes(struct kprobe **kps, int num)
781{
782 return __register_kprobes(kps, num,
783 (unsigned long)__builtin_return_address(0));
784}
785
786void __kprobes unregister_kprobes(struct kprobe **kps, int num) 805void __kprobes unregister_kprobes(struct kprobe **kps, int num)
787{ 806{
788 int i; 807 int i;
@@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
811 return (unsigned long)entry; 830 return (unsigned long)entry;
812} 831}
813 832
814static int __register_jprobes(struct jprobe **jps, int num, 833int __kprobes register_jprobes(struct jprobe **jps, int num)
815 unsigned long called_from)
816{ 834{
817 struct jprobe *jp; 835 struct jprobe *jp;
818 int ret = 0, i; 836 int ret = 0, i;
@@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
830 /* Todo: Verify probepoint is a function entry point */ 848 /* Todo: Verify probepoint is a function entry point */
831 jp->kp.pre_handler = setjmp_pre_handler; 849 jp->kp.pre_handler = setjmp_pre_handler;
832 jp->kp.break_handler = longjmp_break_handler; 850 jp->kp.break_handler = longjmp_break_handler;
833 ret = __register_kprobe(&jp->kp, called_from); 851 ret = register_kprobe(&jp->kp);
834 } 852 }
835 if (ret < 0) { 853 if (ret < 0) {
836 if (i > 0) 854 if (i > 0)
@@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
843 861
844int __kprobes register_jprobe(struct jprobe *jp) 862int __kprobes register_jprobe(struct jprobe *jp)
845{ 863{
846 return __register_jprobes(&jp, 1, 864 return register_jprobes(&jp, 1);
847 (unsigned long)__builtin_return_address(0));
848} 865}
849 866
850void __kprobes unregister_jprobe(struct jprobe *jp) 867void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
852 unregister_jprobes(&jp, 1); 869 unregister_jprobes(&jp, 1);
853} 870}
854 871
855int __kprobes register_jprobes(struct jprobe **jps, int num)
856{
857 return __register_jprobes(jps, num,
858 (unsigned long)__builtin_return_address(0));
859}
860
861void __kprobes unregister_jprobes(struct jprobe **jps, int num) 872void __kprobes unregister_jprobes(struct jprobe **jps, int num)
862{ 873{
863 int i; 874 int i;
@@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
920 return 0; 931 return 0;
921} 932}
922 933
923static int __kprobes __register_kretprobe(struct kretprobe *rp, 934int __kprobes register_kretprobe(struct kretprobe *rp)
924 unsigned long called_from)
925{ 935{
926 int ret = 0; 936 int ret = 0;
927 struct kretprobe_instance *inst; 937 struct kretprobe_instance *inst;
@@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
967 977
968 rp->nmissed = 0; 978 rp->nmissed = 0;
969 /* Establish function entry probe point */ 979 /* Establish function entry probe point */
970 ret = __register_kprobe(&rp->kp, called_from); 980 ret = register_kprobe(&rp->kp);
971 if (ret != 0) 981 if (ret != 0)
972 free_rp_inst(rp); 982 free_rp_inst(rp);
973 return ret; 983 return ret;
974} 984}
975 985
976static int __register_kretprobes(struct kretprobe **rps, int num, 986int __kprobes register_kretprobes(struct kretprobe **rps, int num)
977 unsigned long called_from)
978{ 987{
979 int ret = 0, i; 988 int ret = 0, i;
980 989
981 if (num <= 0) 990 if (num <= 0)
982 return -EINVAL; 991 return -EINVAL;
983 for (i = 0; i < num; i++) { 992 for (i = 0; i < num; i++) {
984 ret = __register_kretprobe(rps[i], called_from); 993 ret = register_kretprobe(rps[i]);
985 if (ret < 0) { 994 if (ret < 0) {
986 if (i > 0) 995 if (i > 0)
987 unregister_kretprobes(rps, i); 996 unregister_kretprobes(rps, i);
@@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num,
991 return ret; 1000 return ret;
992} 1001}
993 1002
994int __kprobes register_kretprobe(struct kretprobe *rp)
995{
996 return __register_kretprobes(&rp, 1,
997 (unsigned long)__builtin_return_address(0));
998}
999
1000void __kprobes unregister_kretprobe(struct kretprobe *rp) 1003void __kprobes unregister_kretprobe(struct kretprobe *rp)
1001{ 1004{
1002 unregister_kretprobes(&rp, 1); 1005 unregister_kretprobes(&rp, 1);
1003} 1006}
1004 1007
1005int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1006{
1007 return __register_kretprobes(rps, num,
1008 (unsigned long)__builtin_return_address(0));
1009}
1010
1011void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1008void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1012{ 1009{
1013 int i; 1010 int i;
@@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1055 1052
1056#endif /* CONFIG_KRETPROBES */ 1053#endif /* CONFIG_KRETPROBES */
1057 1054
1055/* Set the kprobe gone and remove its instruction buffer. */
1056static void __kprobes kill_kprobe(struct kprobe *p)
1057{
1058 struct kprobe *kp;
1059 p->flags |= KPROBE_FLAG_GONE;
1060 if (p->pre_handler == aggr_pre_handler) {
1061 /*
1062 * If this is an aggr_kprobe, we have to list all the
1063 * chained probes and mark them GONE.
1064 */
1065 list_for_each_entry_rcu(kp, &p->list, list)
1066 kp->flags |= KPROBE_FLAG_GONE;
1067 p->post_handler = NULL;
1068 p->break_handler = NULL;
1069 }
1070 /*
1071 * Here, we can remove insn_slot safely, because no thread calls
1072 * the original probed function (which will be freed soon) any more.
1073 */
1074 arch_remove_kprobe(p);
1075}
1076
1077/* Module notifier call back, checking kprobes on the module */
1078static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1079 unsigned long val, void *data)
1080{
1081 struct module *mod = data;
1082 struct hlist_head *head;
1083 struct hlist_node *node;
1084 struct kprobe *p;
1085 unsigned int i;
1086 int checkcore = (val == MODULE_STATE_GOING);
1087
1088 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
1089 return NOTIFY_DONE;
1090
1091 /*
1092 * When MODULE_STATE_GOING was notified, both of module .text and
1093 * .init.text sections would be freed. When MODULE_STATE_LIVE was
1094 * notified, only .init.text section would be freed. We need to
1095 * disable kprobes which have been inserted in the sections.
1096 */
1097 mutex_lock(&kprobe_mutex);
1098 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1099 head = &kprobe_table[i];
1100 hlist_for_each_entry_rcu(p, node, head, hlist)
1101 if (within_module_init((unsigned long)p->addr, mod) ||
1102 (checkcore &&
1103 within_module_core((unsigned long)p->addr, mod))) {
1104 /*
1105 * The vaddr this probe is installed will soon
1106 * be vfreed buy not synced to disk. Hence,
1107 * disarming the breakpoint isn't needed.
1108 */
1109 kill_kprobe(p);
1110 }
1111 }
1112 mutex_unlock(&kprobe_mutex);
1113 return NOTIFY_DONE;
1114}
1115
1116static struct notifier_block kprobe_module_nb = {
1117 .notifier_call = kprobes_module_callback,
1118 .priority = 0
1119};
1120
1058static int __init init_kprobes(void) 1121static int __init init_kprobes(void)
1059{ 1122{
1060 int i, err = 0; 1123 int i, err = 0;
@@ -1111,6 +1174,9 @@ static int __init init_kprobes(void)
1111 err = arch_init_kprobes(); 1174 err = arch_init_kprobes();
1112 if (!err) 1175 if (!err)
1113 err = register_die_notifier(&kprobe_exceptions_nb); 1176 err = register_die_notifier(&kprobe_exceptions_nb);
1177 if (!err)
1178 err = register_module_notifier(&kprobe_module_nb);
1179
1114 kprobes_initialized = (err == 0); 1180 kprobes_initialized = (err == 0);
1115 1181
1116 if (!err) 1182 if (!err)
@@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1131 else 1197 else
1132 kprobe_type = "k"; 1198 kprobe_type = "k";
1133 if (sym) 1199 if (sym)
1134 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, 1200 seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
1135 sym, offset, (modname ? modname : " ")); 1201 sym, offset, (modname ? modname : " "),
1202 (kprobe_gone(p) ? "[GONE]" : ""));
1136 else 1203 else
1137 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); 1204 seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
1205 (kprobe_gone(p) ? "[GONE]" : ""));
1138} 1206}
1139 1207
1140static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1208static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void)
1215 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1216 head = &kprobe_table[i]; 1284 head = &kprobe_table[i];
1217 hlist_for_each_entry_rcu(p, node, head, hlist) 1285 hlist_for_each_entry_rcu(p, node, head, hlist)
1218 arch_arm_kprobe(p); 1286 if (!kprobe_gone(p))
1287 arch_arm_kprobe(p);
1219 } 1288 }
1220 1289
1221 kprobe_enabled = true; 1290 kprobe_enabled = true;
@@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void)
1244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1245 head = &kprobe_table[i]; 1314 head = &kprobe_table[i];
1246 hlist_for_each_entry_rcu(p, node, head, hlist) { 1315 hlist_for_each_entry_rcu(p, node, head, hlist) {
1247 if (!arch_trampoline_kprobe(p)) 1316 if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
1248 arch_disarm_kprobe(p); 1317 arch_disarm_kprobe(p);
1249 } 1318 }
1250 } 1319 }
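
A large part of the kprobes.c diff is the new KPROBE_FLAG_GONE handling: kprobes_module_callback() walks the probe hash table when a module's text is about to be freed, kill_kprobe() marks the affected probes gone, and the arm/disarm and handler paths learn to skip such probes rather than touch freed text. Below is a condensed userspace analogue of that mark-and-skip pattern; the structures and names are made up and do not reflect the kprobes API.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct probe {
	const char *owner;	/* module the probed address lives in */
	bool gone;		/* set once the owner's text is freed */
};

static struct probe probes[] = {
	{ "usbcore", false },
	{ "e1000",   false },
	{ "usbcore", false },
};

#define NR_PROBES (sizeof(probes) / sizeof(probes[0]))

/* analogue of kprobes_module_callback(): mark probes in a dying module */
static void module_going(const char *name)
{
	for (size_t i = 0; i < NR_PROBES; i++)
		if (strcmp(probes[i].owner, name) == 0)
			probes[i].gone = true;
}

/* analogue of the handler paths: never touch a probe that has gone away */
static void fire_all(void)
{
	for (size_t i = 0; i < NR_PROBES; i++) {
		if (probes[i].gone)
			continue;
		printf("probe %zu (%s) fires\n", i, probes[i].owner);
	}
}

int main(void)
{
	fire_all();
	module_going("usbcore");
	fire_all();	/* only the e1000 probe is still armed */
	return 0;
}
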
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 08dd8ed86c77..528dd78e7e7e 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
24static struct kobj_attribute _name##_attr = \ 24static struct kobj_attribute _name##_attr = \
25 __ATTR(_name, 0644, _name##_show, _name##_store) 25 __ATTR(_name, 0644, _name##_show, _name##_store)
26 26
27#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 27#if defined(CONFIG_HOTPLUG)
28/* current uevent sequence number */ 28/* current uevent sequence number */
29static ssize_t uevent_seqnum_show(struct kobject *kobj, 29static ssize_t uevent_seqnum_show(struct kobject *kobj,
30 struct kobj_attribute *attr, char *buf) 30 struct kobj_attribute *attr, char *buf)
@@ -137,7 +137,7 @@ struct kobject *kernel_kobj;
137EXPORT_SYMBOL_GPL(kernel_kobj); 137EXPORT_SYMBOL_GPL(kernel_kobj);
138 138
139static struct attribute * kernel_attrs[] = { 139static struct attribute * kernel_attrs[] = {
140#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 140#if defined(CONFIG_HOTPLUG)
141 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
142 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
143#endif 143#endif
diff --git a/kernel/module.c b/kernel/module.c
index f47cce910f25..c9332c90d5a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
43#include <linux/device.h> 43#include <linux/device.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/unwind.h>
47#include <linux/rculist.h> 46#include <linux/rculist.h>
48#include <asm/uaccess.h> 47#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
@@ -51,6 +50,7 @@
51#include <asm/sections.h> 50#include <asm/sections.h>
52#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
53#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h>
54 54
55#if 0 55#if 0
56#define DEBUGP printk 56#define DEBUGP printk
@@ -817,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
817 mod->exit(); 817 mod->exit();
818 blocking_notifier_call_chain(&module_notify_list, 818 blocking_notifier_call_chain(&module_notify_list,
819 MODULE_STATE_GOING, mod); 819 MODULE_STATE_GOING, mod);
820 async_synchronize_full();
820 mutex_lock(&module_mutex); 821 mutex_lock(&module_mutex);
821 /* Store the name of the last unloaded module for diagnostic purposes */ 822 /* Store the name of the last unloaded module for diagnostic purposes */
822 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 823 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
@@ -1449,8 +1450,6 @@ static void free_module(struct module *mod)
1449 remove_sect_attrs(mod); 1450 remove_sect_attrs(mod);
1450 mod_kobject_remove(mod); 1451 mod_kobject_remove(mod);
1451 1452
1452 unwind_remove_table(mod->unwind_info, 0);
1453
1454 /* Arch-specific cleanup. */ 1453 /* Arch-specific cleanup. */
1455 module_arch_cleanup(mod); 1454 module_arch_cleanup(mod);
1456 1455
@@ -1867,7 +1866,6 @@ static noinline struct module *load_module(void __user *umod,
1867 unsigned int symindex = 0; 1866 unsigned int symindex = 0;
1868 unsigned int strindex = 0; 1867 unsigned int strindex = 0;
1869 unsigned int modindex, versindex, infoindex, pcpuindex; 1868 unsigned int modindex, versindex, infoindex, pcpuindex;
1870 unsigned int unwindex = 0;
1871 unsigned int num_kp, num_mcount; 1869 unsigned int num_kp, num_mcount;
1872 struct kernel_param *kp; 1870 struct kernel_param *kp;
1873 struct module *mod; 1871 struct module *mod;
@@ -1957,9 +1955,6 @@ static noinline struct module *load_module(void __user *umod,
1957 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1955 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1958 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1956 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1959 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1957 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1960#ifdef ARCH_UNWIND_SECTION_NAME
1961 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1962#endif
1963 1958
1964 /* Don't keep modinfo and version sections. */ 1959 /* Don't keep modinfo and version sections. */
1965 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1960 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1969,8 +1964,6 @@ static noinline struct module *load_module(void __user *umod,
1969 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1964 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1970 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1965 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1971#endif 1966#endif
1972 if (unwindex)
1973 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1974 1967
1975 /* Check module struct version now, before we try to use module. */ 1968 /* Check module struct version now, before we try to use module. */
1976 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1969 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2267,11 +2260,6 @@ static noinline struct module *load_module(void __user *umod,
2267 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2260 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2268 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2261 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2269 2262
2270 /* Size of section 0 is 0, so this works well if no unwind info. */
2271 mod->unwind_info = unwind_add_table(mod,
2272 (void *)sechdrs[unwindex].sh_addr,
2273 sechdrs[unwindex].sh_size);
2274
2275 /* Get rid of temporary copy */ 2263 /* Get rid of temporary copy */
2276 vfree(hdr); 2264 vfree(hdr);
2277 2265
@@ -2366,11 +2354,12 @@ sys_init_module(void __user *umod,
2366 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 2354 /* Now it's a first class citizen! Wake up anyone waiting for it. */
2367 mod->state = MODULE_STATE_LIVE; 2355 mod->state = MODULE_STATE_LIVE;
2368 wake_up(&module_wq); 2356 wake_up(&module_wq);
2357 blocking_notifier_call_chain(&module_notify_list,
2358 MODULE_STATE_LIVE, mod);
2369 2359
2370 mutex_lock(&module_mutex); 2360 mutex_lock(&module_mutex);
2371 /* Drop initial reference. */ 2361 /* Drop initial reference. */
2372 module_put(mod); 2362 module_put(mod);
2373 unwind_remove_table(mod->unwind_info, 1);
2374 module_free(mod, mod->module_init); 2363 module_free(mod, mod->module_init);
2375 mod->module_init = NULL; 2364 mod->module_init = NULL;
2376 mod->init_size = 0; 2365 mod->init_size = 0;
@@ -2405,7 +2394,7 @@ static const char *get_ksymbol(struct module *mod,
2405 unsigned long nextval; 2394 unsigned long nextval;
2406 2395
2407 /* At worse, next value is at end of module */ 2396 /* At worse, next value is at end of module */
2408 if (within(addr, mod->module_init, mod->init_size)) 2397 if (within_module_init(addr, mod))
2409 nextval = (unsigned long)mod->module_init+mod->init_text_size; 2398 nextval = (unsigned long)mod->module_init+mod->init_text_size;
2410 else 2399 else
2411 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2400 nextval = (unsigned long)mod->module_core+mod->core_text_size;
@@ -2453,8 +2442,8 @@ const char *module_address_lookup(unsigned long addr,
2453 2442
2454 preempt_disable(); 2443 preempt_disable();
2455 list_for_each_entry_rcu(mod, &modules, list) { 2444 list_for_each_entry_rcu(mod, &modules, list) {
2456 if (within(addr, mod->module_init, mod->init_size) 2445 if (within_module_init(addr, mod) ||
2457 || within(addr, mod->module_core, mod->core_size)) { 2446 within_module_core(addr, mod)) {
2458 if (modname) 2447 if (modname)
2459 *modname = mod->name; 2448 *modname = mod->name;
2460 ret = get_ksymbol(mod, addr, size, offset); 2449 ret = get_ksymbol(mod, addr, size, offset);
@@ -2476,8 +2465,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2476 2465
2477 preempt_disable(); 2466 preempt_disable();
2478 list_for_each_entry_rcu(mod, &modules, list) { 2467 list_for_each_entry_rcu(mod, &modules, list) {
2479 if (within(addr, mod->module_init, mod->init_size) || 2468 if (within_module_init(addr, mod) ||
2480 within(addr, mod->module_core, mod->core_size)) { 2469 within_module_core(addr, mod)) {
2481 const char *sym; 2470 const char *sym;
2482 2471
2483 sym = get_ksymbol(mod, addr, NULL, NULL); 2472 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -2500,8 +2489,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2500 2489
2501 preempt_disable(); 2490 preempt_disable();
2502 list_for_each_entry_rcu(mod, &modules, list) { 2491 list_for_each_entry_rcu(mod, &modules, list) {
2503 if (within(addr, mod->module_init, mod->init_size) || 2492 if (within_module_init(addr, mod) ||
2504 within(addr, mod->module_core, mod->core_size)) { 2493 within_module_core(addr, mod)) {
2505 const char *sym; 2494 const char *sym;
2506 2495
2507 sym = get_ksymbol(mod, addr, size, offset); 2496 sym = get_ksymbol(mod, addr, size, offset);
@@ -2720,7 +2709,7 @@ int is_module_address(unsigned long addr)
2720 preempt_disable(); 2709 preempt_disable();
2721 2710
2722 list_for_each_entry_rcu(mod, &modules, list) { 2711 list_for_each_entry_rcu(mod, &modules, list) {
2723 if (within(addr, mod->module_core, mod->core_size)) { 2712 if (within_module_core(addr, mod)) {
2724 preempt_enable(); 2713 preempt_enable();
2725 return 1; 2714 return 1;
2726 } 2715 }
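
The module.c hunks above replace open-coded within(addr, start, size) tests with within_module_init() and within_module_core(), which are simply half-open range checks against the two regions a module occupies. A self-contained sketch of helpers in that style follows; the struct layout is invented and is not the real struct module.

#include <stdbool.h>
#include <stdio.h>

struct region {
	unsigned long base;
	unsigned long size;
};

struct fake_module {
	struct region init;	/* freed once initialization is over */
	struct region core;	/* stays for the module's lifetime */
};

/* half-open check: base <= addr < base + size */
static bool within(unsigned long addr, const struct region *r)
{
	return addr >= r->base && addr < r->base + r->size;
}

static bool within_module_init(unsigned long addr, const struct fake_module *m)
{
	return within(addr, &m->init);
}

static bool within_module_core(unsigned long addr, const struct fake_module *m)
{
	return within(addr, &m->core);
}

int main(void)
{
	struct fake_module m = { { 0x1000, 0x100 }, { 0x2000, 0x800 } };

	printf("init: %d core: %d\n", within_module_init(0x1010, &m),
	       within_module_core(0x1010, &m));	/* prints "init: 1 core: 0" */
	return 0;
}
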
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 43c2111cd54d..78bc3fdac0d2 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -13,7 +13,6 @@
13 13
14struct ns_cgroup { 14struct ns_cgroup {
15 struct cgroup_subsys_state css; 15 struct cgroup_subsys_state css;
16 spinlock_t lock;
17}; 16};
18 17
19struct cgroup_subsys ns_subsys; 18struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
84 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
85 if (!ns_cgroup) 84 if (!ns_cgroup)
86 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
87 spin_lock_init(&ns_cgroup->lock);
88 return &ns_cgroup->css; 86 return &ns_cgroup->css;
89} 87}
90 88
diff --git a/kernel/panic.c b/kernel/panic.c
index 13f06349a786..2a2ff36ff44d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -299,6 +299,8 @@ static int init_oops_id(void)
299{ 299{
300 if (!oops_id) 300 if (!oops_id)
301 get_random_bytes(&oops_id, sizeof(oops_id)); 301 get_random_bytes(&oops_id, sizeof(oops_id));
302 else
303 oops_id++;
302 304
303 return 0; 305 return 0;
304} 306}
diff --git a/kernel/pid.c b/kernel/pid.c
index 064e76afa507..1b3586fe753a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,8 +474,14 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
474} 474}
475EXPORT_SYMBOL(task_session_nr_ns); 475EXPORT_SYMBOL(task_session_nr_ns);
476 476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{
479 return ns_of_pid(task_pid(tsk));
480}
481EXPORT_SYMBOL_GPL(task_active_pid_ns);
482
477/* 483/*
478 * Used by proc to find the first pid that is greater then or equal to nr. 484 * Used by proc to find the first pid that is greater than or equal to nr.
479 * 485 *
480 * If there is a pid at nr this function is exactly the same as find_pid_ns. 486 * If there is a pid at nr this function is exactly the same as find_pid_ns.
481 */ 487 */
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f77d3819ef57..45e8541ab7e3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode)
258{ 258{
259 int error; 259 int error;
260 260
261 /* Free memory before shutting down devices. */ 261 error = platform_begin(platform_mode);
262 error = swsusp_shrink_memory();
263 if (error) 262 if (error)
264 return error; 263 return error;
265 264
266 error = platform_begin(platform_mode); 265 /* Free memory before shutting down devices. */
266 error = swsusp_shrink_memory();
267 if (error) 267 if (error)
268 goto Close; 268 goto Close;
269 269
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 613f16941b85..239988873971 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
615 /* this may fail if the RTC hasn't been initialized */ 615 /* this may fail if the RTC hasn't been initialized */
616 status = rtc_read_time(rtc, &alm.time); 616 status = rtc_read_time(rtc, &alm.time);
617 if (status < 0) { 617 if (status < 0) {
618 printk(err_readtime, rtc->dev.bus_id, status); 618 printk(err_readtime, dev_name(&rtc->dev), status);
619 return; 619 return;
620 } 620 }
621 rtc_tm_to_time(&alm.time, &now); 621 rtc_tm_to_time(&alm.time, &now);
@@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
626 626
627 status = rtc_set_alarm(rtc, &alm); 627 status = rtc_set_alarm(rtc, &alm);
628 if (status < 0) { 628 if (status < 0) {
629 printk(err_wakealarm, rtc->dev.bus_id, status); 629 printk(err_wakealarm, dev_name(&rtc->dev), status);
630 return; 630 return;
631 } 631 }
632 632
@@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
660 if (!device_may_wakeup(candidate->dev.parent)) 660 if (!device_may_wakeup(candidate->dev.parent))
661 return 0; 661 return 0;
662 662
663 *(char **)name_ptr = dev->bus_id; 663 *(const char **)name_ptr = dev_name(dev);
664 return 1; 664 return 1;
665} 665}
666 666
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5d2ab836e998..f5fc2d7680f2 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -25,6 +25,7 @@
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/mmu_context.h> 31#include <asm/mmu_context.h>
@@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
192 return ret; 193 return ret;
193} 194}
194 195
195static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
196{
197 free_list_of_pages(ca->chain, clear_page_nosave);
198 memset(ca, 0, sizeof(struct chain_allocator));
199}
200
201/** 196/**
202 * Data types related to memory bitmaps. 197 * Data types related to memory bitmaps.
203 * 198 *
@@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
233#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 228#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
234 229
235struct bm_block { 230struct bm_block {
236 struct bm_block *next; /* next element of the list */ 231 struct list_head hook; /* hook into a list of bitmap blocks */
237 unsigned long start_pfn; /* pfn represented by the first bit */ 232 unsigned long start_pfn; /* pfn represented by the first bit */
238 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 233 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
239 unsigned long *data; /* bitmap representing pages */ 234 unsigned long *data; /* bitmap representing pages */
@@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb)
244 return bb->end_pfn - bb->start_pfn; 239 return bb->end_pfn - bb->start_pfn;
245} 240}
246 241
247struct zone_bitmap {
248 struct zone_bitmap *next; /* next element of the list */
249 unsigned long start_pfn; /* minimal pfn in this zone */
250 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
251 struct bm_block *bm_blocks; /* list of bitmap blocks */
252 struct bm_block *cur_block; /* recently used bitmap block */
253};
254
255/* strcut bm_position is used for browsing memory bitmaps */ 242/* strcut bm_position is used for browsing memory bitmaps */
256 243
257struct bm_position { 244struct bm_position {
258 struct zone_bitmap *zone_bm;
259 struct bm_block *block; 245 struct bm_block *block;
260 int bit; 246 int bit;
261}; 247};
262 248
263struct memory_bitmap { 249struct memory_bitmap {
264 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ 250 struct list_head blocks; /* list of bitmap blocks */
265 struct linked_page *p_list; /* list of pages used to store zone 251 struct linked_page *p_list; /* list of pages used to store zone
266 * bitmap objects and bitmap block 252 * bitmap objects and bitmap block
267 * objects 253 * objects
@@ -273,11 +259,7 @@ struct memory_bitmap {
273 259
274static void memory_bm_position_reset(struct memory_bitmap *bm) 260static void memory_bm_position_reset(struct memory_bitmap *bm)
275{ 261{
276 struct zone_bitmap *zone_bm; 262 bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
277
278 zone_bm = bm->zone_bm_list;
279 bm->cur.zone_bm = zone_bm;
280 bm->cur.block = zone_bm->bm_blocks;
281 bm->cur.bit = 0; 263 bm->cur.bit = 0;
282} 264}
283 265
@@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
285 267
286/** 268/**
287 * create_bm_block_list - create a list of block bitmap objects 269 * create_bm_block_list - create a list of block bitmap objects
270 * @nr_blocks - number of blocks to allocate
271 * @list - list to put the allocated blocks into
272 * @ca - chain allocator to be used for allocating memory
288 */ 273 */
289 274static int create_bm_block_list(unsigned long pages,
290static inline struct bm_block * 275 struct list_head *list,
291create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) 276 struct chain_allocator *ca)
292{ 277{
293 struct bm_block *bblist = NULL; 278 unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
294 279
295 while (nr_blocks-- > 0) { 280 while (nr_blocks-- > 0) {
296 struct bm_block *bb; 281 struct bm_block *bb;
297 282
298 bb = chain_alloc(ca, sizeof(struct bm_block)); 283 bb = chain_alloc(ca, sizeof(struct bm_block));
299 if (!bb) 284 if (!bb)
300 return NULL; 285 return -ENOMEM;
301 286 list_add(&bb->hook, list);
302 bb->next = bblist;
303 bblist = bb;
304 } 287 }
305 return bblist; 288
289 return 0;
306} 290}
307 291
292struct mem_extent {
293 struct list_head hook;
294 unsigned long start;
295 unsigned long end;
296};
297
308/** 298/**
309 * create_zone_bm_list - create a list of zone bitmap objects 299 * free_mem_extents - free a list of memory extents
300 * @list - list of extents to empty
310 */ 301 */
302static void free_mem_extents(struct list_head *list)
303{
304 struct mem_extent *ext, *aux;
311 305
312static inline struct zone_bitmap * 306 list_for_each_entry_safe(ext, aux, list, hook) {
313create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) 307 list_del(&ext->hook);
308 kfree(ext);
309 }
310}
311
312/**
313 * create_mem_extents - create a list of memory extents representing
314 * contiguous ranges of PFNs
315 * @list - list to put the extents into
316 * @gfp_mask - mask to use for memory allocations
317 */
318static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
314{ 319{
315 struct zone_bitmap *zbmlist = NULL; 320 struct zone *zone;
316 321
317 while (nr_zones-- > 0) { 322 INIT_LIST_HEAD(list);
318 struct zone_bitmap *zbm;
319 323
320 zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); 324 for_each_zone(zone) {
321 if (!zbm) 325 unsigned long zone_start, zone_end;
322 return NULL; 326 struct mem_extent *ext, *cur, *aux;
327
328 if (!populated_zone(zone))
329 continue;
323 330
324 zbm->next = zbmlist; 331 zone_start = zone->zone_start_pfn;
325 zbmlist = zbm; 332 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333
334 list_for_each_entry(ext, list, hook)
335 if (zone_start <= ext->end)
336 break;
337
338 if (&ext->hook == list || zone_end < ext->start) {
339 /* New extent is necessary */
340 struct mem_extent *new_ext;
341
342 new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
343 if (!new_ext) {
344 free_mem_extents(list);
345 return -ENOMEM;
346 }
347 new_ext->start = zone_start;
348 new_ext->end = zone_end;
349 list_add_tail(&new_ext->hook, &ext->hook);
350 continue;
351 }
352
353 /* Merge this zone's range of PFNs with the existing one */
354 if (zone_start < ext->start)
355 ext->start = zone_start;
356 if (zone_end > ext->end)
357 ext->end = zone_end;
358
359 /* More merging may be possible */
360 cur = ext;
361 list_for_each_entry_safe_continue(cur, aux, list, hook) {
362 if (zone_end < cur->start)
363 break;
364 if (zone_end < cur->end)
365 ext->end = cur->end;
366 list_del(&cur->hook);
367 kfree(cur);
368 }
326 } 369 }
327 return zbmlist; 370
371 return 0;
328} 372}
329 373
330/** 374/**
331 * memory_bm_create - allocate memory for a memory bitmap 375 * memory_bm_create - allocate memory for a memory bitmap
332 */ 376 */
333
334static int 377static int
335memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) 378memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
336{ 379{
337 struct chain_allocator ca; 380 struct chain_allocator ca;
338 struct zone *zone; 381 struct list_head mem_extents;
339 struct zone_bitmap *zone_bm; 382 struct mem_extent *ext;
340 struct bm_block *bb; 383 int error;
341 unsigned int nr;
342 384
343 chain_init(&ca, gfp_mask, safe_needed); 385 chain_init(&ca, gfp_mask, safe_needed);
386 INIT_LIST_HEAD(&bm->blocks);
344 387
345 /* Compute the number of zones */ 388 error = create_mem_extents(&mem_extents, gfp_mask);
346 nr = 0; 389 if (error)
347 for_each_zone(zone) 390 return error;
348 if (populated_zone(zone))
349 nr++;
350
351 /* Allocate the list of zones bitmap objects */
352 zone_bm = create_zone_bm_list(nr, &ca);
353 bm->zone_bm_list = zone_bm;
354 if (!zone_bm) {
355 chain_free(&ca, PG_UNSAFE_CLEAR);
356 return -ENOMEM;
357 }
358
359 /* Initialize the zone bitmap objects */
360 for_each_zone(zone) {
361 unsigned long pfn;
362 391
363 if (!populated_zone(zone)) 392 list_for_each_entry(ext, &mem_extents, hook) {
364 continue; 393 struct bm_block *bb;
394 unsigned long pfn = ext->start;
395 unsigned long pages = ext->end - ext->start;
365 396
366 zone_bm->start_pfn = zone->zone_start_pfn; 397 bb = list_entry(bm->blocks.prev, struct bm_block, hook);
367 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
368 /* Allocate the list of bitmap block objects */
369 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
370 bb = create_bm_block_list(nr, &ca);
371 zone_bm->bm_blocks = bb;
372 zone_bm->cur_block = bb;
373 if (!bb)
374 goto Free;
375 398
376 nr = zone->spanned_pages; 399 error = create_bm_block_list(pages, bm->blocks.prev, &ca);
377 pfn = zone->zone_start_pfn; 400 if (error)
378 /* Initialize the bitmap block objects */ 401 goto Error;
379 while (bb) {
380 unsigned long *ptr;
381 402
382 ptr = get_image_page(gfp_mask, safe_needed); 403 list_for_each_entry_continue(bb, &bm->blocks, hook) {
383 bb->data = ptr; 404 bb->data = get_image_page(gfp_mask, safe_needed);
384 if (!ptr) 405 if (!bb->data) {
385 goto Free; 406 error = -ENOMEM;
407 goto Error;
408 }
386 409
387 bb->start_pfn = pfn; 410 bb->start_pfn = pfn;
388 if (nr >= BM_BITS_PER_BLOCK) { 411 if (pages >= BM_BITS_PER_BLOCK) {
389 pfn += BM_BITS_PER_BLOCK; 412 pfn += BM_BITS_PER_BLOCK;
390 nr -= BM_BITS_PER_BLOCK; 413 pages -= BM_BITS_PER_BLOCK;
391 } else { 414 } else {
392 /* This is executed only once in the loop */ 415 /* This is executed only once in the loop */
393 pfn += nr; 416 pfn += pages;
394 } 417 }
395 bb->end_pfn = pfn; 418 bb->end_pfn = pfn;
396 bb = bb->next;
397 } 419 }
398 zone_bm = zone_bm->next;
399 } 420 }
421
400 bm->p_list = ca.chain; 422 bm->p_list = ca.chain;
401 memory_bm_position_reset(bm); 423 memory_bm_position_reset(bm);
402 return 0; 424 Exit:
425 free_mem_extents(&mem_extents);
426 return error;
403 427
404 Free: 428 Error:
405 bm->p_list = ca.chain; 429 bm->p_list = ca.chain;
406 memory_bm_free(bm, PG_UNSAFE_CLEAR); 430 memory_bm_free(bm, PG_UNSAFE_CLEAR);
407 return -ENOMEM; 431 goto Exit;
408} 432}
409 433
410/** 434/**
411 * memory_bm_free - free memory occupied by the memory bitmap @bm 435 * memory_bm_free - free memory occupied by the memory bitmap @bm
412 */ 436 */
413
414static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 437static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
415{ 438{
416 struct zone_bitmap *zone_bm; 439 struct bm_block *bb;
417 440
418 /* Free the list of bit blocks for each zone_bitmap object */ 441 list_for_each_entry(bb, &bm->blocks, hook)
419 zone_bm = bm->zone_bm_list; 442 if (bb->data)
420 while (zone_bm) { 443 free_image_page(bb->data, clear_nosave_free);
421 struct bm_block *bb;
422 444
423 bb = zone_bm->bm_blocks;
424 while (bb) {
425 if (bb->data)
426 free_image_page(bb->data, clear_nosave_free);
427 bb = bb->next;
428 }
429 zone_bm = zone_bm->next;
430 }
431 free_list_of_pages(bm->p_list, clear_nosave_free); 445 free_list_of_pages(bm->p_list, clear_nosave_free);
432 bm->zone_bm_list = NULL; 446
447 INIT_LIST_HEAD(&bm->blocks);
433} 448}
434 449
435/** 450/**
@@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
437 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 452 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
438 * of @bm->cur_zone_bm are updated. 453 * of @bm->cur_zone_bm are updated.
439 */ 454 */
440
441static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 455static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
442 void **addr, unsigned int *bit_nr) 456 void **addr, unsigned int *bit_nr)
443{ 457{
444 struct zone_bitmap *zone_bm;
445 struct bm_block *bb; 458 struct bm_block *bb;
446 459
447 /* Check if the pfn is from the current zone */ 460 /*
448 zone_bm = bm->cur.zone_bm; 461 * Check if the pfn corresponds to the current bitmap block and find
449 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { 462 * the block where it fits if this is not the case.
450 zone_bm = bm->zone_bm_list; 463 */
451 /* We don't assume that the zones are sorted by pfns */ 464 bb = bm->cur.block;
452 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
453 zone_bm = zone_bm->next;
454
455 if (!zone_bm)
456 return -EFAULT;
457 }
458 bm->cur.zone_bm = zone_bm;
459 }
460 /* Check if the pfn corresponds to the current bitmap block */
461 bb = zone_bm->cur_block;
462 if (pfn < bb->start_pfn) 465 if (pfn < bb->start_pfn)
463 bb = zone_bm->bm_blocks; 466 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
467 if (pfn >= bb->start_pfn)
468 break;
464 469
465 while (pfn >= bb->end_pfn) { 470 if (pfn >= bb->end_pfn)
466 bb = bb->next; 471 list_for_each_entry_continue(bb, &bm->blocks, hook)
472 if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
473 break;
467 474
468 BUG_ON(!bb); 475 if (&bb->hook == &bm->blocks)
469 } 476 return -EFAULT;
470 zone_bm->cur_block = bb; 477
478 /* The block has been found */
479 bm->cur.block = bb;
471 pfn -= bb->start_pfn; 480 pfn -= bb->start_pfn;
481 bm->cur.bit = pfn + 1;
472 *bit_nr = pfn; 482 *bit_nr = pfn;
473 *addr = bb->data; 483 *addr = bb->data;
474 return 0; 484 return 0;
@@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
519 return test_bit(bit, addr); 529 return test_bit(bit, addr);
520} 530}
521 531
532static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
533{
534 void *addr;
535 unsigned int bit;
536
537 return !memory_bm_find_bit(bm, pfn, &addr, &bit);
538}
539
522/** 540/**
523 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 541 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
524 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is 542 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
@@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
530 548
531static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 549static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
532{ 550{
533 struct zone_bitmap *zone_bm;
534 struct bm_block *bb; 551 struct bm_block *bb;
535 int bit; 552 int bit;
536 553
554 bb = bm->cur.block;
537 do { 555 do {
538 bb = bm->cur.block; 556 bit = bm->cur.bit;
539 do { 557 bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
540 bit = bm->cur.bit; 558 if (bit < bm_block_bits(bb))
541 bit = find_next_bit(bb->data, bm_block_bits(bb), bit); 559 goto Return_pfn;
542 if (bit < bm_block_bits(bb)) 560
543 goto Return_pfn; 561 bb = list_entry(bb->hook.next, struct bm_block, hook);
544 562 bm->cur.block = bb;
545 bb = bb->next; 563 bm->cur.bit = 0;
546 bm->cur.block = bb; 564 } while (&bb->hook != &bm->blocks);
547 bm->cur.bit = 0; 565
548 } while (bb);
549 zone_bm = bm->cur.zone_bm->next;
550 if (zone_bm) {
551 bm->cur.zone_bm = zone_bm;
552 bm->cur.block = zone_bm->bm_blocks;
553 bm->cur.bit = 0;
554 }
555 } while (zone_bm);
556 memory_bm_position_reset(bm); 566 memory_bm_position_reset(bm);
557 return BM_END_OF_MAP; 567 return BM_END_OF_MAP;
558 568
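
The rewritten memory_bm_next_pfn() above walks a single list of bm_block entries and keeps its scan position in bm->cur, instead of chasing the old per-zone chains. The following is a rough user-space model of that scan, assuming a simple singly linked block list and one bitmap word per block; the names mirror the kernel's but nothing here is the actual swsusp API.

#include <stdio.h>
#include <limits.h>

#define BM_END_OF_MAP   ULONG_MAX
#define BITS_PER_BLOCK  64              /* one word per block keeps the model tiny */

struct bm_block {
    struct bm_block *next;              /* stands in for the kernel's list_head */
    unsigned long start_pfn;            /* pfn covered by bit 0 of this block */
    unsigned long bits;                 /* the bitmap payload */
};

struct memory_bitmap {
    struct bm_block *blocks;            /* head of the block list */
    struct bm_block *cur_block;         /* scan position: current block ... */
    unsigned int cur_bit;               /* ... and first bit to test inside it */
};

/* Find the next set bit at or after the cursor, like memory_bm_next_pfn(). */
static unsigned long next_pfn(struct memory_bitmap *bm)
{
    struct bm_block *bb = bm->cur_block;

    while (bb) {
        unsigned int bit;

        for (bit = bm->cur_bit; bit < BITS_PER_BLOCK; bit++) {
            if (bb->bits & (1UL << bit)) {
                bm->cur_block = bb;
                bm->cur_bit = bit + 1;  /* resume after this bit next time */
                return bb->start_pfn + bit;
            }
        }
        bb = bb->next;                  /* block exhausted: move to the next one */
        bm->cur_block = bb;
        bm->cur_bit = 0;
    }
    /* Ran off the end of the list: reset the cursor and report end-of-map. */
    bm->cur_block = bm->blocks;
    bm->cur_bit = 0;
    return BM_END_OF_MAP;
}

int main(void)
{
    struct bm_block b1 = { NULL, 128, 1UL << 5 };   /* pfn 133 set */
    struct bm_block b0 = { &b1, 0, 1UL << 3 };      /* pfn 3 set   */
    struct memory_bitmap bm = { &b0, &b0, 0 };
    unsigned long pfn;

    while ((pfn = next_pfn(&bm)) != BM_END_OF_MAP)
        printf("set pfn %lu\n", pfn);
    return 0;
}
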
@@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void)
808 * We should save the page if it isn't Nosave or NosaveFree, or Reserved, 818 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
809 * and it isn't a part of a free chunk of pages. 819 * and it isn't a part of a free chunk of pages.
810 */ 820 */
811 821static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
812static struct page *saveable_highmem_page(unsigned long pfn)
813{ 822{
814 struct page *page; 823 struct page *page;
815 824
@@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
817 return NULL; 826 return NULL;
818 827
819 page = pfn_to_page(pfn); 828 page = pfn_to_page(pfn);
829 if (page_zone(page) != zone)
830 return NULL;
820 831
821 BUG_ON(!PageHighMem(page)); 832 BUG_ON(!PageHighMem(page));
822 833
@@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void)
846 mark_free_pages(zone); 857 mark_free_pages(zone);
847 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 858 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
848 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 859 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
849 if (saveable_highmem_page(pfn)) 860 if (saveable_highmem_page(zone, pfn))
850 n++; 861 n++;
851 } 862 }
852 return n; 863 return n;
853} 864}
854#else 865#else
855static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } 866static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
867{
868 return NULL;
869}
856#endif /* CONFIG_HIGHMEM */ 870#endif /* CONFIG_HIGHMEM */
857 871
858/** 872/**
@@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
863 * of pages statically defined as 'unsaveable', and it isn't a part of 877 * of pages statically defined as 'unsaveable', and it isn't a part of
864 * a free chunk of pages. 878 * a free chunk of pages.
865 */ 879 */
866 880static struct page *saveable_page(struct zone *zone, unsigned long pfn)
867static struct page *saveable_page(unsigned long pfn)
868{ 881{
869 struct page *page; 882 struct page *page;
870 883
@@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn)
872 return NULL; 885 return NULL;
873 886
874 page = pfn_to_page(pfn); 887 page = pfn_to_page(pfn);
888 if (page_zone(page) != zone)
889 return NULL;
875 890
876 BUG_ON(PageHighMem(page)); 891 BUG_ON(PageHighMem(page));
877 892
@@ -903,7 +918,7 @@ unsigned int count_data_pages(void)
903 mark_free_pages(zone); 918 mark_free_pages(zone);
904 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 919 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
905 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 920 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
906 if(saveable_page(pfn)) 921 if (saveable_page(zone, pfn))
907 n++; 922 n++;
908 } 923 }
909 return n; 924 return n;
@@ -944,7 +959,7 @@ static inline struct page *
944page_is_saveable(struct zone *zone, unsigned long pfn) 959page_is_saveable(struct zone *zone, unsigned long pfn)
945{ 960{
946 return is_highmem(zone) ? 961 return is_highmem(zone) ?
947 saveable_highmem_page(pfn) : saveable_page(pfn); 962 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
948} 963}
949 964
950static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 965static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
@@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
966 * data modified by kmap_atomic() 981 * data modified by kmap_atomic()
967 */ 982 */
968 safe_copy_page(buffer, s_page); 983 safe_copy_page(buffer, s_page);
969 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); 984 dst = kmap_atomic(d_page, KM_USER0);
970 memcpy(dst, buffer, PAGE_SIZE); 985 memcpy(dst, buffer, PAGE_SIZE);
971 kunmap_atomic(dst, KM_USER0); 986 kunmap_atomic(dst, KM_USER0);
972 } else { 987 } else {
@@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
975 } 990 }
976} 991}
977#else 992#else
978#define page_is_saveable(zone, pfn) saveable_page(pfn) 993#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
979 994
980static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 995static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
981{ 996{
@@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info)
1459 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set 1474 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
1460 * the corresponding bit in the memory bitmap @bm 1475 * the corresponding bit in the memory bitmap @bm
1461 */ 1476 */
1462 1477static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1463static inline void
1464unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1465{ 1478{
1466 int j; 1479 int j;
1467 1480
@@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1469 if (unlikely(buf[j] == BM_END_OF_MAP)) 1482 if (unlikely(buf[j] == BM_END_OF_MAP))
1470 break; 1483 break;
1471 1484
1472 memory_bm_set_bit(bm, buf[j]); 1485 if (memory_bm_pfn_present(bm, buf[j]))
1486 memory_bm_set_bit(bm, buf[j]);
1487 else
1488 return -EFAULT;
1473 } 1489 }
1490
1491 return 0;
1474} 1492}
1475 1493
1476/* List of "safe" pages that may be used to store data loaded from the suspend 1494/* List of "safe" pages that may be used to store data loaded from the suspend
@@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1608 pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); 1626 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1609 if (!pbe) { 1627 if (!pbe) {
1610 swsusp_free(); 1628 swsusp_free();
1611 return NULL; 1629 return ERR_PTR(-ENOMEM);
1612 } 1630 }
1613 pbe->orig_page = page; 1631 pbe->orig_page = page;
1614 if (safe_highmem_pages > 0) { 1632 if (safe_highmem_pages > 0) {
@@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1677static inline void * 1695static inline void *
1678get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) 1696get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1679{ 1697{
1680 return NULL; 1698 return ERR_PTR(-EINVAL);
1681} 1699}
1682 1700
1683static inline void copy_last_highmem_page(void) {} 1701static inline void copy_last_highmem_page(void) {}
@@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1788static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) 1806static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1789{ 1807{
1790 struct pbe *pbe; 1808 struct pbe *pbe;
1791 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1809 struct page *page;
1810 unsigned long pfn = memory_bm_next_pfn(bm);
1792 1811
1812 if (pfn == BM_END_OF_MAP)
1813 return ERR_PTR(-EFAULT);
1814
1815 page = pfn_to_page(pfn);
1793 if (PageHighMem(page)) 1816 if (PageHighMem(page))
1794 return get_highmem_page_buffer(page, ca); 1817 return get_highmem_page_buffer(page, ca);
1795 1818
@@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1805 pbe = chain_alloc(ca, sizeof(struct pbe)); 1828 pbe = chain_alloc(ca, sizeof(struct pbe));
1806 if (!pbe) { 1829 if (!pbe) {
1807 swsusp_free(); 1830 swsusp_free();
1808 return NULL; 1831 return ERR_PTR(-ENOMEM);
1809 } 1832 }
1810 pbe->orig_address = page_address(page); 1833 pbe->orig_address = page_address(page);
1811 pbe->address = safe_pages_list; 1834 pbe->address = safe_pages_list;
@@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1868 return error; 1891 return error;
1869 1892
1870 } else if (handle->prev <= nr_meta_pages) { 1893 } else if (handle->prev <= nr_meta_pages) {
1871 unpack_orig_pfns(buffer, &copy_bm); 1894 error = unpack_orig_pfns(buffer, &copy_bm);
1895 if (error)
1896 return error;
1897
1872 if (handle->prev == nr_meta_pages) { 1898 if (handle->prev == nr_meta_pages) {
1873 error = prepare_image(&orig_bm, &copy_bm); 1899 error = prepare_image(&orig_bm, &copy_bm);
1874 if (error) 1900 if (error)
@@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1879 restore_pblist = NULL; 1905 restore_pblist = NULL;
1880 handle->buffer = get_buffer(&orig_bm, &ca); 1906 handle->buffer = get_buffer(&orig_bm, &ca);
1881 handle->sync_read = 0; 1907 handle->sync_read = 0;
1882 if (!handle->buffer) 1908 if (IS_ERR(handle->buffer))
1883 return -ENOMEM; 1909 return PTR_ERR(handle->buffer);
1884 } 1910 }
1885 } else { 1911 } else {
1886 copy_last_highmem_page(); 1912 copy_last_highmem_page();
1887 handle->buffer = get_buffer(&orig_bm, &ca); 1913 handle->buffer = get_buffer(&orig_bm, &ca);
1914 if (IS_ERR(handle->buffer))
1915 return PTR_ERR(handle->buffer);
1888 if (handle->buffer != buffer) 1916 if (handle->buffer != buffer)
1889 handle->sync_read = 0; 1917 handle->sync_read = 0;
1890 } 1918 }
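
get_buffer() and get_highmem_page_buffer() now return ERR_PTR(-ENOMEM) or ERR_PTR(-EFAULT) instead of NULL, so snapshot_write_next() can propagate the precise error via PTR_ERR(). Below is a minimal sketch of the error-pointer idiom, re-implemented in user space purely for illustration; the kernel's versions live in <linux/err.h>.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno in the top of the address space. */
static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
    return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

/* A get_buffer()-style helper: a valid pointer on success, ERR_PTR on failure. */
static void *get_buffer(int missing_page)
{
    void *buf;

    if (missing_page)
        return ERR_PTR(-EFAULT);        /* pfn not present in the image */
    buf = malloc(4096);
    return buf ? buf : ERR_PTR(-ENOMEM);
}

int main(void)
{
    void *buf = get_buffer(1);

    if (IS_ERR(buf)) {
        printf("get_buffer failed: %s\n", strerror(-PTR_ERR(buf)));
        return 1;
    }
    free(buf);
    return 0;
}
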
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 023ff2a31d89..a92c91451559 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -262,3 +262,125 @@ int swsusp_shrink_memory(void)
262 262
263 return 0; 263 return 0;
264} 264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
diff --git a/kernel/printk.c b/kernel/printk.c
index e651ab05655f..7015733793e8 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
619static const char recursion_bug_msg [] = 619static const char recursion_bug_msg [] =
620 KERN_CRIT "BUG: recent printk recursion!\n"; 620 KERN_CRIT "BUG: recent printk recursion!\n";
621static int recursion_bug; 621static int recursion_bug;
622 static int new_text_line = 1; 622static int new_text_line = 1;
623static char printk_buf[1024]; 623static char printk_buf[1024];
624 624
625asmlinkage int vprintk(const char *fmt, va_list args) 625asmlinkage int vprintk(const char *fmt, va_list args)
diff --git a/kernel/profile.c b/kernel/profile.c
index d18e2d2654f2..784933acf5b8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -445,7 +445,6 @@ void profile_tick(int type)
445#ifdef CONFIG_PROC_FS 445#ifdef CONFIG_PROC_FS
446#include <linux/proc_fs.h> 446#include <linux/proc_fs.h>
447#include <asm/uaccess.h> 447#include <asm/uaccess.h>
448#include <asm/ptrace.h>
449 448
450static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 449static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
451 int count, int *eof, void *data) 450 int count, int *eof, void *data)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ad63af8b2521..d92a76a881aa 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */ 80void synchronize_rcu(void)
81synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81{
82 struct rcu_synchronize rcu;
83 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu);
86 /* Wait for it. */
87 wait_for_completion(&rcu.completion);
88}
82EXPORT_SYMBOL_GPL(synchronize_rcu); 89EXPORT_SYMBOL_GPL(synchronize_rcu);
83 90
84static void rcu_barrier_callback(struct rcu_head *notused) 91static void rcu_barrier_callback(struct rcu_head *notused)
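
synchronize_rcu() (and __synchronize_sched() below) is now open-coded: initialise a completion, queue a callback that completes it, and block until the callback has run. A hedged pthread sketch of that "post a callback, wait for it" control flow follows; it models only the waiting pattern, not RCU grace-period semantics.

#include <pthread.h>
#include <stdio.h>

/* A user-space stand-in for struct completion. */
struct completion {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    int done;
};

static void init_completion(struct completion *c)
{
    pthread_mutex_init(&c->lock, NULL);
    pthread_cond_init(&c->cond, NULL);
    c->done = 0;
}

static void complete(struct completion *c)
{
    pthread_mutex_lock(&c->lock);
    c->done = 1;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
    pthread_mutex_lock(&c->lock);
    while (!c->done)
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

/* Models call_rcu(): run the callback "later", here from a helper thread. */
static void *deferred_callback(void *arg)
{
    complete(arg);                      /* the wakeme_after_rcu() equivalent */
    return NULL;
}

/* Models the open-coded synchronize_rcu(): post the callback, then wait. */
static void synchronize(void)
{
    struct completion done;
    pthread_t worker;

    init_completion(&done);
    pthread_create(&worker, NULL, deferred_callback, &done);
    wait_for_completion(&done);
    pthread_join(worker, NULL);
}

int main(void)
{
    synchronize();
    printf("grace period (model) elapsed\n");
    return 0;
}
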
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index f9dc8f3720f6..33cfc50781f9 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1177,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
1177 * in -rt this does -not- necessarily result in all currently executing 1177 * in -rt this does -not- necessarily result in all currently executing
1178 * interrupt -handlers- having completed. 1178 * interrupt -handlers- having completed.
1179 */ 1179 */
1180synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) 1180void __synchronize_sched(void)
1181{
1182 struct rcu_synchronize rcu;
1183
1184 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1187 /* Wait for it. */
1188 wait_for_completion(&rcu.completion);
1189}
1181EXPORT_SYMBOL_GPL(__synchronize_sched); 1190EXPORT_SYMBOL_GPL(__synchronize_sched);
1182 1191
1183/* 1192/*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 3245b40952c6..1cff28db56b6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -136,7 +136,7 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ 139#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ 140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */ 141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ 142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
@@ -151,12 +151,10 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
151{ 151{
152 if (fullstop) 152 if (fullstop)
153 return NOTIFY_DONE; 153 return NOTIFY_DONE;
154 if (signal_pending(current)) { 154 mutex_lock(&fullstop_mutex);
155 mutex_lock(&fullstop_mutex); 155 if (!fullstop)
156 if (!ACCESS_ONCE(fullstop)) 156 fullstop = FULLSTOP_SHUTDOWN;
157 fullstop = FULLSTOP_SIGNALED; 157 mutex_unlock(&fullstop_mutex);
158 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE; 158 return NOTIFY_DONE;
161} 159}
162 160
@@ -624,7 +622,7 @@ rcu_torture_writer(void *arg)
624 rcu_stutter_wait(); 622 rcu_stutter_wait();
625 } while (!kthread_should_stop() && !fullstop); 623 } while (!kthread_should_stop() && !fullstop);
626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 624 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 625 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
628 schedule_timeout_uninterruptible(1); 626 schedule_timeout_uninterruptible(1);
629 return 0; 627 return 0;
630} 628}
@@ -649,7 +647,7 @@ rcu_torture_fakewriter(void *arg)
649 } while (!kthread_should_stop() && !fullstop); 647 } while (!kthread_should_stop() && !fullstop);
650 648
651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 649 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 650 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
653 schedule_timeout_uninterruptible(1); 651 schedule_timeout_uninterruptible(1);
654 return 0; 652 return 0;
655} 653}
@@ -759,7 +757,7 @@ rcu_torture_reader(void *arg)
759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 757 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
760 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irqcapable)
761 del_timer_sync(&t); 759 del_timer_sync(&t);
762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 760 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
763 schedule_timeout_uninterruptible(1); 761 schedule_timeout_uninterruptible(1);
764 return 0; 762 return 0;
765} 763}
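
The rcutorture shutdown notifier now sets FULLSTOP_SHUTDOWN under fullstop_mutex whenever the system goes down, and the reader/writer threads poll the flag until they may exit. A minimal pthread sketch of that stop-flag handshake is below; volatile stands in for the kernel's ACCESS_ONCE-style read and everything else is illustrative.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define FULLSTOP_SHUTDOWN 1

static volatile int fullstop;           /* stop generating work at shutdown */
static pthread_mutex_t fullstop_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Models rcutorture_shutdown_notify(): flag the stop exactly once. */
static void shutdown_notify(void)
{
    pthread_mutex_lock(&fullstop_mutex);
    if (!fullstop)
        fullstop = FULLSTOP_SHUTDOWN;
    pthread_mutex_unlock(&fullstop_mutex);
}

/* Models a torture thread: do work until fullstop, then stop cleanly. */
static void *worker(void *arg)
{
    (void)arg;
    while (!fullstop)
        usleep(1000);                   /* "generate callbacks" */
    printf("worker stopping cleanly\n");
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    usleep(10000);
    shutdown_notify();                  /* system is going down */
    pthread_join(t, NULL);
    return 0;
}
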
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a342b032112c..f2d8638e6c60 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 80
81#ifdef CONFIG_NO_HZ 81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); 82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1,
84 .dynticks = 1,
85};
83#endif /* #ifdef CONFIG_NO_HZ */ 86#endif /* #ifdef CONFIG_NO_HZ */
84 87
85static int blimit = 10; /* Maximum callbacks per softirq. */ 88static int blimit = 10; /* Maximum callbacks per softirq. */
@@ -572,6 +575,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
572 /* Special-case the common single-level case. */ 575 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) { 576 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit; 577 rnp->qsmask = rnp->qsmaskinit;
578 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
575 spin_unlock_irqrestore(&rnp->lock, flags); 579 spin_unlock_irqrestore(&rnp->lock, flags);
576 return; 580 return;
577 } 581 }
@@ -1379,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1379 1383
1380static void __cpuinit rcu_online_cpu(int cpu) 1384static void __cpuinit rcu_online_cpu(int cpu)
1381{ 1385{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state); 1386 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state); 1387 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1388 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8eca772..bf8e7534c803 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -15,10 +15,11 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17 17
18void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = (unsigned long long)LLONG_MAX;
22 counter->parent = parent;
22} 23}
23 24
24int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
34 return 0; 35 return 0;
35} 36}
36 37
37int res_counter_charge(struct res_counter *counter, unsigned long val) 38int res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at)
38{ 40{
39 int ret; 41 int ret;
40 unsigned long flags; 42 unsigned long flags;
41 43 struct res_counter *c, *u;
42 spin_lock_irqsave(&counter->lock, flags); 44
43 ret = res_counter_charge_locked(counter, val); 45 *limit_fail_at = NULL;
44 spin_unlock_irqrestore(&counter->lock, flags); 46 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val);
50 spin_unlock(&c->lock);
51 if (ret < 0) {
52 *limit_fail_at = c;
53 goto undo;
54 }
55 }
56 ret = 0;
57 goto done;
58undo:
59 for (u = counter; u != c; u = u->parent) {
60 spin_lock(&u->lock);
61 res_counter_uncharge_locked(u, val);
62 spin_unlock(&u->lock);
63 }
64done:
65 local_irq_restore(flags);
45 return ret; 66 return ret;
46} 67}
47 68
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
56void res_counter_uncharge(struct res_counter *counter, unsigned long val) 77void res_counter_uncharge(struct res_counter *counter, unsigned long val)
57{ 78{
58 unsigned long flags; 79 unsigned long flags;
80 struct res_counter *c;
59 81
60 spin_lock_irqsave(&counter->lock, flags); 82 local_irq_save(flags);
61 res_counter_uncharge_locked(counter, val); 83 for (c = counter; c != NULL; c = c->parent) {
62 spin_unlock_irqrestore(&counter->lock, flags); 84 spin_lock(&c->lock);
85 res_counter_uncharge_locked(c, val);
86 spin_unlock(&c->lock);
87 }
88 local_irq_restore(flags);
63} 89}
64 90
65 91
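
res_counter_charge() now charges every level of the parent chain and, if some ancestor is over its limit, reports that counter through *limit_fail_at and uncharges the levels that already succeeded. The following user-space model shows the charge-with-rollback walk, with the per-counter spinlocks and irq handling left out.

#include <stdio.h>

struct res_counter {
    unsigned long usage;
    unsigned long limit;
    struct res_counter *parent;
};

/* Charge one level; fail without side effects if the limit would be exceeded. */
static int charge_one(struct res_counter *c, unsigned long val)
{
    if (c->usage + val > c->limit)
        return -1;
    c->usage += val;
    return 0;
}

/*
 * Charge the whole hierarchy.  On failure, report which level failed and
 * uncharge every ancestor that was already charged (the "undo" loop).
 */
static int charge(struct res_counter *counter, unsigned long val,
                  struct res_counter **limit_fail_at)
{
    struct res_counter *c, *u;

    *limit_fail_at = NULL;
    for (c = counter; c != NULL; c = c->parent) {
        if (charge_one(c, val) < 0) {
            *limit_fail_at = c;
            for (u = counter; u != c; u = u->parent)
                u->usage -= val;        /* roll back the partial charge */
            return -1;
        }
    }
    return 0;
}

int main(void)
{
    struct res_counter root  = { 0, 100, NULL };
    struct res_counter child = { 0, 1000, &root };
    struct res_counter *fail;

    printf("first charge:  %d\n", charge(&child, 80, &fail));   /* succeeds      */
    printf("second charge: %d\n", charge(&child, 80, &fail));   /* root is full  */
    printf("child usage after rollback: %lu\n", child.usage);   /* still 80      */
    return 0;
}
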
diff --git a/kernel/resource.c b/kernel/resource.c
index e633106b12f6..ca6a1536b205 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res)
623 */ 623 */
624struct resource * __request_region(struct resource *parent, 624struct resource * __request_region(struct resource *parent,
625 resource_size_t start, resource_size_t n, 625 resource_size_t start, resource_size_t n,
626 const char *name) 626 const char *name, int flags)
627{ 627{
628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
629 629
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent,
634 res->start = start; 634 res->start = start;
635 res->end = start + n - 1; 635 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY; 636 res->flags = IORESOURCE_BUSY;
637 res->flags |= flags;
637 638
638 write_lock(&resource_lock); 639 write_lock(&resource_lock);
639 640
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start,
679{ 680{
680 struct resource * res; 681 struct resource * res;
681 682
682 res = __request_region(parent, start, n, "check-region"); 683 res = __request_region(parent, start, n, "check-region", 0);
683 if (!res) 684 if (!res)
684 return -EBUSY; 685 return -EBUSY;
685 686
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev,
776 dr->start = start; 777 dr->start = start;
777 dr->n = n; 778 dr->n = n;
778 779
779 res = __request_region(parent, start, n, name); 780 res = __request_region(parent, start, n, name, 0);
780 if (res) 781 if (res)
781 devres_add(dev, dr); 782 devres_add(dev, dr);
782 else 783 else
@@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
876 877
877 return err; 878 return err;
878} 879}
880
881#ifdef CONFIG_STRICT_DEVMEM
882static int strict_iomem_checks = 1;
883#else
884static int strict_iomem_checks;
885#endif
886
887/*
888 * check if an address is reserved in the iomem resource tree
889 * returns 1 if reserved, 0 if not reserved.
890 */
891int iomem_is_exclusive(u64 addr)
892{
893 struct resource *p = &iomem_resource;
894 int err = 0;
895 loff_t l;
896 int size = PAGE_SIZE;
897
898 if (!strict_iomem_checks)
899 return 0;
900
901 addr = addr & PAGE_MASK;
902
903 read_lock(&resource_lock);
904 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
905 /*
906 * We can probably skip the resources without
907 * IORESOURCE_IO attribute?
908 */
909 if (p->start >= addr + size)
910 break;
911 if (p->end < addr)
912 continue;
913 if (p->flags & IORESOURCE_BUSY &&
914 p->flags & IORESOURCE_EXCLUSIVE) {
915 err = 1;
916 break;
917 }
918 }
919 read_unlock(&resource_lock);
920
921 return err;
922}
923
924static int __init strict_iomem(char *str)
925{
926 if (strstr(str, "relaxed"))
927 strict_iomem_checks = 0;
928 if (strstr(str, "strict"))
929 strict_iomem_checks = 1;
930 return 1;
931}
932
933__setup("iomem=", strict_iomem);
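
iomem_is_exclusive() rounds the address down to a page and scans the top-level iomem children for a busy region marked IORESOURCE_EXCLUSIVE that overlaps that page. A simplified model of the overlap test follows, using a flat array instead of the r_next() tree walk, so it scans every entry rather than stopping early on the sorted list; flag values and layout are invented for the example.

#include <stdio.h>

#define PAGE_SIZE  4096ULL
#define PAGE_MASK  (~(PAGE_SIZE - 1))

#define IORESOURCE_BUSY       0x1
#define IORESOURCE_EXCLUSIVE  0x2

struct resource {
    unsigned long long start, end;      /* inclusive range, as in the kernel */
    unsigned int flags;
};

/* Return 1 if the page containing addr overlaps a busy, exclusive region. */
static int iomem_is_exclusive(const struct resource *res, int nr,
                              unsigned long long addr)
{
    unsigned long long size = PAGE_SIZE;
    int i;

    addr &= PAGE_MASK;
    for (i = 0; i < nr; i++) {
        const struct resource *p = &res[i];

        if (p->start >= addr + size)    /* starts past the page: no overlap  */
            continue;                   /* (the kernel breaks: tree is sorted) */
        if (p->end < addr)              /* ends before the page              */
            continue;
        if ((p->flags & IORESOURCE_BUSY) &&
            (p->flags & IORESOURCE_EXCLUSIVE))
            return 1;
    }
    return 0;
}

int main(void)
{
    struct resource map[] = {
        { 0x1000, 0x1fff, IORESOURCE_BUSY },
        { 0x2000, 0x2fff, IORESOURCE_BUSY | IORESOURCE_EXCLUSIVE },
    };

    printf("%d\n", iomem_is_exclusive(map, 2, 0x1800));   /* 0: busy only  */
    printf("%d\n", iomem_is_exclusive(map, 2, 0x2800));   /* 1: exclusive  */
    return 0;
}
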
diff --git a/kernel/sched.c b/kernel/sched.c
index 545c6fccd1dc..deb5ac8c12f3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3728,8 +3728,13 @@ redo:
3728 } 3728 }
3729 3729
3730 double_unlock_balance(this_rq, busiest); 3730 double_unlock_balance(this_rq, busiest);
3731 /*
3732 * Should not call ttwu while holding a rq->lock
3733 */
3734 spin_unlock(&this_rq->lock);
3731 if (active_balance) 3735 if (active_balance)
3732 wake_up_process(busiest->migration_thread); 3736 wake_up_process(busiest->migration_thread);
3737 spin_lock(&this_rq->lock);
3733 3738
3734 } else 3739 } else
3735 sd->nr_balance_failed = 0; 3740 sd->nr_balance_failed = 0;
@@ -6957,7 +6962,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6957 spin_unlock_irqrestore(&rq->lock, flags); 6962 spin_unlock_irqrestore(&rq->lock, flags);
6958} 6963}
6959 6964
6960static int init_rootdomain(struct root_domain *rd, bool bootmem) 6965static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
6961{ 6966{
6962 memset(rd, 0, sizeof(*rd)); 6967 memset(rd, 0, sizeof(*rd));
6963 6968
@@ -6970,7 +6975,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem)
6970 } 6975 }
6971 6976
6972 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 6977 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6973 goto free_rd; 6978 goto out;
6974 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 6979 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6975 goto free_span; 6980 goto free_span;
6976 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 6981 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
@@ -6986,8 +6991,7 @@ free_online:
6986 free_cpumask_var(rd->online); 6991 free_cpumask_var(rd->online);
6987free_span: 6992free_span:
6988 free_cpumask_var(rd->span); 6993 free_cpumask_var(rd->span);
6989free_rd: 6994out:
6990 kfree(rd);
6991 return -ENOMEM; 6995 return -ENOMEM;
6992} 6996}
6993 6997
@@ -7987,7 +7991,7 @@ match2:
7987} 7991}
7988 7992
7989#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7993#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7990int arch_reinit_sched_domains(void) 7994static void arch_reinit_sched_domains(void)
7991{ 7995{
7992 get_online_cpus(); 7996 get_online_cpus();
7993 7997
@@ -7996,13 +8000,10 @@ int arch_reinit_sched_domains(void)
7996 8000
7997 rebuild_sched_domains(); 8001 rebuild_sched_domains();
7998 put_online_cpus(); 8002 put_online_cpus();
7999
8000 return 0;
8001} 8003}
8002 8004
8003static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 8005static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
8004{ 8006{
8005 int ret;
8006 unsigned int level = 0; 8007 unsigned int level = 0;
8007 8008
8008 if (sscanf(buf, "%u", &level) != 1) 8009 if (sscanf(buf, "%u", &level) != 1)
@@ -8023,9 +8024,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
8023 else 8024 else
8024 sched_mc_power_savings = level; 8025 sched_mc_power_savings = level;
8025 8026
8026 ret = arch_reinit_sched_domains(); 8027 arch_reinit_sched_domains();
8027 8028
8028 return ret ? ret : count; 8029 return count;
8029} 8030}
8030 8031
8031#ifdef CONFIG_SCHED_MC 8032#ifdef CONFIG_SCHED_MC
@@ -8060,7 +8061,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
8060 sched_smt_power_savings_store); 8061 sched_smt_power_savings_store);
8061#endif 8062#endif
8062 8063
8063int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 8064int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
8064{ 8065{
8065 int err = 0; 8066 int err = 0;
8066 8067
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..a0b0852414cc 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
@@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
227 */ 227 */
228void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
229{ 229{
230 if (timekeeping_suspended)
231 return;
232
230 sched_clock_tick(); 233 sched_clock_tick();
231 touch_softlockup_watchdog(); 234 touch_softlockup_watchdog();
232} 235}
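
The sched_clock change widens the upper clamp to wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC), so the per-cpu clock can never step backwards even when the GTOD-based estimate lags behind. wrap_max()/wrap_min() compare through a signed difference so the clamp survives u64 wraparound; here is a small sketch of the idea with made-up numbers.

#include <stdio.h>

typedef unsigned long long u64;
typedef long long s64;

/* Wrap-safe max/min: compare the signed difference, as sched_clock.c does. */
static inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; }
static inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; }

/* Clamp a new clock sample into [min_clock, max_clock]. */
static u64 clamp_clock(u64 clock, u64 tick_gtod, u64 prev_clock, u64 tick_nsec)
{
    u64 min_clock = wrap_max(tick_gtod, prev_clock);
    /* Upper bound now includes prev_clock, so the clock never goes backwards. */
    u64 max_clock = wrap_max(prev_clock, tick_gtod + tick_nsec);

    clock = wrap_max(clock, min_clock);
    clock = wrap_min(clock, max_clock);
    return clock;
}

int main(void)
{
    /* prev_clock already ran ahead of tick_gtod + TICK_NSEC ... */
    u64 c = clamp_clock(900, 1000, 1005000, 1000000);
    /* ... so the result is pinned to prev_clock instead of snapping back. */
    printf("%llu\n", c);                /* prints 1005000 */
    return 0;
}
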
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 018b7be1db2e..1e00bfacf9b8 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -151,7 +151,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
151 * 151 *
152 * Returns: -ENOMEM if memory fails. 152 * Returns: -ENOMEM if memory fails.
153 */ 153 */
154int cpupri_init(struct cpupri *cp, bool bootmem) 154int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
155{ 155{
156 int i; 156 int i;
157 157
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 56c0efe902a7..8e1352c75557 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
386#endif 386#endif
387 387
388/* 388/*
389 * delta *= P[w / rw]
390 */
391static inline unsigned long
392calc_delta_weight(unsigned long delta, struct sched_entity *se)
393{
394 for_each_sched_entity(se) {
395 delta = calc_delta_mine(delta,
396 se->load.weight, &cfs_rq_of(se)->load);
397 }
398
399 return delta;
400}
401
402/*
403 * delta /= w 389 * delta /= w
404 */ 390 */
405static inline unsigned long 391static inline unsigned long
@@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running)
440 */ 426 */
441static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 427static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
442{ 428{
443 unsigned long nr_running = cfs_rq->nr_running; 429 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
430
431 for_each_sched_entity(se) {
432 struct load_weight *load = &cfs_rq->load;
444 433
445 if (unlikely(!se->on_rq)) 434 if (unlikely(!se->on_rq)) {
446 nr_running++; 435 struct load_weight lw = cfs_rq->load;
447 436
448 return calc_delta_weight(__sched_period(nr_running), se); 437 update_load_add(&lw, se->load.weight);
438 load = &lw;
439 }
440 slice = calc_delta_mine(slice, se->load.weight, load);
441 }
442 return slice;
449} 443}
450 444
451/* 445/*
@@ -1623,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1623 } 1617 }
1624} 1618}
1625 1619
1626#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1627
1628/* 1620/*
1629 * Share the fairness runtime between parent and child, thus the 1621 * Share the fairness runtime between parent and child, thus the
1630 * total amount of pressure for CPU stays equal - new tasks 1622 * total amount of pressure for CPU stays equal - new tasks
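
sched_slice() now scales the period by the entity's share of the queue load at every level of the group hierarchy, temporarily adding the entity's own weight when it is not yet on the runqueue. Below is a simplified integer model of that per-level scaling; plain 64-bit division replaces the kernel's fixed-point calc_delta_mine(), and the weights are invented.

#include <stdio.h>

typedef unsigned long long u64;

struct sched_entity {
    unsigned long weight;               /* this entity's load weight            */
    unsigned long queue_weight;         /* total weight of the queue it runs in */
    int on_rq;                          /* already accounted in queue_weight?   */
    struct sched_entity *parent;        /* group entity one level up, or NULL   */
};

/* Walk up the hierarchy, scaling the slice by weight/queue_weight per level. */
static u64 sched_slice(u64 period, struct sched_entity *se)
{
    u64 slice = period;

    for (; se; se = se->parent) {
        unsigned long qw = se->queue_weight;

        if (!se->on_rq)
            qw += se->weight;           /* count ourselves while still off the rq */
        slice = slice * se->weight / qw;
    }
    return slice;
}

int main(void)
{
    /* A task with weight 1024 in a group that owns half of its parent rq. */
    struct sched_entity group = { 2048, 4096, 1, NULL };
    struct sched_entity task  = { 1024, 3072, 0, &group };

    printf("slice = %llu ns\n", sched_slice(20000000ULL, &task));  /* 2500000 */
    return 0;
}
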
diff --git a/kernel/signal.c b/kernel/signal.c
index 8e95855ff3cf..3152ac3b62e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
858 q->info.si_signo = sig; 858 q->info.si_signo = sig;
859 q->info.si_errno = 0; 859 q->info.si_errno = 0;
860 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
861 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_tgid_nr_ns(current,
862 task_active_pid_ns(t));
862 q->info.si_uid = current_uid(); 863 q->info.si_uid = current_uid();
863 break; 864 break;
864 case (unsigned long) SEND_SIG_PRIV: 865 case (unsigned long) SEND_SIG_PRIV:
diff --git a/kernel/sys.c b/kernel/sys.c
index d356d79e84ac..763c3c17ded3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h>
36 37
37#include <linux/compat.h> 38#include <linux/compat.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
@@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
927 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 928 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
928 return -EFAULT; 929 return -EFAULT;
929 } 930 }
931 force_successful_syscall_return();
930 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 932 return (long) jiffies_64_to_clock_t(get_jiffies_64());
931} 933}
932 934
@@ -1627,6 +1629,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1627 utime = stime = cputime_zero; 1629 utime = stime = cputime_zero;
1628 1630
1629 if (who == RUSAGE_THREAD) { 1631 if (who == RUSAGE_THREAD) {
1632 utime = task_utime(current);
1633 stime = task_stime(current);
1630 accumulate_thread_rusage(p, r); 1634 accumulate_thread_rusage(p, r);
1631 goto out; 1635 goto out;
1632 } 1636 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ff6d45c7626f..89d74436318c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,15 +82,14 @@ extern int percpu_pagelist_fraction;
82extern int compat_log; 82extern int compat_log;
83extern int latencytop_enabled; 83extern int latencytop_enabled;
84extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
85#ifndef CONFIG_MMU
86extern int sysctl_nr_trim_pages;
87#endif
85#ifdef CONFIG_RCU_TORTURE_TEST 88#ifdef CONFIG_RCU_TORTURE_TEST
86extern int rcutorture_runnable; 89extern int rcutorture_runnable;
87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 90#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
88 91
89/* Constants used for minimum and maximum */ 92/* Constants used for minimum and maximum */
90#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
91static int one = 1;
92#endif
93
94#ifdef CONFIG_DETECT_SOFTLOCKUP 93#ifdef CONFIG_DETECT_SOFTLOCKUP
95static int sixty = 60; 94static int sixty = 60;
96static int neg_one = -1; 95static int neg_one = -1;
@@ -101,6 +100,7 @@ static int two = 2;
101#endif 100#endif
102 101
103static int zero; 102static int zero;
103static int one = 1;
104static int one_hundred = 100; 104static int one_hundred = 100;
105 105
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -952,12 +952,22 @@ static struct ctl_table vm_table[] = {
952 .data = &dirty_background_ratio, 952 .data = &dirty_background_ratio,
953 .maxlen = sizeof(dirty_background_ratio), 953 .maxlen = sizeof(dirty_background_ratio),
954 .mode = 0644, 954 .mode = 0644,
955 .proc_handler = &proc_dointvec_minmax, 955 .proc_handler = &dirty_background_ratio_handler,
956 .strategy = &sysctl_intvec, 956 .strategy = &sysctl_intvec,
957 .extra1 = &zero, 957 .extra1 = &zero,
958 .extra2 = &one_hundred, 958 .extra2 = &one_hundred,
959 }, 959 },
960 { 960 {
961 .ctl_name = CTL_UNNUMBERED,
962 .procname = "dirty_background_bytes",
963 .data = &dirty_background_bytes,
964 .maxlen = sizeof(dirty_background_bytes),
965 .mode = 0644,
966 .proc_handler = &dirty_background_bytes_handler,
967 .strategy = &sysctl_intvec,
968 .extra1 = &one,
969 },
970 {
961 .ctl_name = VM_DIRTY_RATIO, 971 .ctl_name = VM_DIRTY_RATIO,
962 .procname = "dirty_ratio", 972 .procname = "dirty_ratio",
963 .data = &vm_dirty_ratio, 973 .data = &vm_dirty_ratio,
@@ -969,6 +979,16 @@ static struct ctl_table vm_table[] = {
969 .extra2 = &one_hundred, 979 .extra2 = &one_hundred,
970 }, 980 },
971 { 981 {
982 .ctl_name = CTL_UNNUMBERED,
983 .procname = "dirty_bytes",
984 .data = &vm_dirty_bytes,
985 .maxlen = sizeof(vm_dirty_bytes),
986 .mode = 0644,
987 .proc_handler = &dirty_bytes_handler,
988 .strategy = &sysctl_intvec,
989 .extra1 = &one,
990 },
991 {
972 .procname = "dirty_writeback_centisecs", 992 .procname = "dirty_writeback_centisecs",
973 .data = &dirty_writeback_interval, 993 .data = &dirty_writeback_interval,
974 .maxlen = sizeof(dirty_writeback_interval), 994 .maxlen = sizeof(dirty_writeback_interval),
@@ -1085,6 +1105,17 @@ static struct ctl_table vm_table[] = {
1085 .mode = 0644, 1105 .mode = 0644,
1086 .proc_handler = &proc_dointvec 1106 .proc_handler = &proc_dointvec
1087 }, 1107 },
1108#else
1109 {
1110 .ctl_name = CTL_UNNUMBERED,
1111 .procname = "nr_trim_pages",
1112 .data = &sysctl_nr_trim_pages,
1113 .maxlen = sizeof(sysctl_nr_trim_pages),
1114 .mode = 0644,
1115 .proc_handler = &proc_dointvec_minmax,
1116 .strategy = &sysctl_intvec,
1117 .extra1 = &zero,
1118 },
1088#endif 1119#endif
1089 { 1120 {
1090 .ctl_name = VM_LAPTOP_MODE, 1121 .ctl_name = VM_LAPTOP_MODE,
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 06b6395b45b2..4f104515a19b 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,21 +22,11 @@
22 22
23static u32 rand1, preh_val, posth_val, jph_val; 23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests; 24static int errors, handler_errors, num_tests;
25static u32 (*target)(u32 value);
26static u32 (*target2)(u32 value);
25 27
26static noinline u32 kprobe_target(u32 value) 28static noinline u32 kprobe_target(u32 value)
27{ 29{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor); 30 return (value / div_factor);
41} 31}
42 32
@@ -74,7 +64,7 @@ static int test_kprobe(void)
74 return ret; 64 return ret;
75 } 65 }
76 66
77 ret = kprobe_target(rand1); 67 ret = target(rand1);
78 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
79 69
80 if (preh_val == 0) { 70 if (preh_val == 0) {
@@ -92,6 +82,84 @@ static int test_kprobe(void)
92 return 0; 82 return 0;
93} 83}
94 84
85static noinline u32 kprobe_target2(u32 value)
86{
87 return (value / div_factor) + 1;
88}
89
90static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
91{
92 preh_val = (rand1 / div_factor) + 1;
93 return 0;
94}
95
96static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
97 unsigned long flags)
98{
99 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: "
102 "incorrect value in post_handler2\n");
103 }
104 posth_val = preh_val + div_factor;
105}
106
107static struct kprobe kp2 = {
108 .symbol_name = "kprobe_target2",
109 .pre_handler = kp_pre_handler2,
110 .post_handler = kp_post_handler2
111};
112
113static int test_kprobes(void)
114{
115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2};
117
 118	kp.addr = 0; /* addr should be cleared for reusing kprobe. */
119 ret = register_kprobes(kps, 2);
120 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: "
122 "register_kprobes returned %d\n", ret);
123 return ret;
124 }
125
126 preh_val = 0;
127 posth_val = 0;
128 ret = target(rand1);
129
130 if (preh_val == 0) {
131 printk(KERN_ERR "Kprobe smoke test failed: "
132 "kprobe pre_handler not called\n");
133 handler_errors++;
134 }
135
136 if (posth_val == 0) {
137 printk(KERN_ERR "Kprobe smoke test failed: "
138 "kprobe post_handler not called\n");
139 handler_errors++;
140 }
141
142 preh_val = 0;
143 posth_val = 0;
144 ret = target2(rand1);
145
146 if (preh_val == 0) {
147 printk(KERN_ERR "Kprobe smoke test failed: "
148 "kprobe pre_handler2 not called\n");
149 handler_errors++;
150 }
151
152 if (posth_val == 0) {
153 printk(KERN_ERR "Kprobe smoke test failed: "
154 "kprobe post_handler2 not called\n");
155 handler_errors++;
156 }
157
158 unregister_kprobes(kps, 2);
159 return 0;
160
161}
162
95static u32 j_kprobe_target(u32 value) 163static u32 j_kprobe_target(u32 value)
96{ 164{
97 if (value != rand1) { 165 if (value != rand1) {
@@ -121,7 +189,7 @@ static int test_jprobe(void)
121 return ret; 189 return ret;
122 } 190 }
123 191
124 ret = kprobe_target(rand1); 192 ret = target(rand1);
125 unregister_jprobe(&jp); 193 unregister_jprobe(&jp);
126 if (jph_val == 0) { 194 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: " 195 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -132,6 +200,43 @@ static int test_jprobe(void)
132 return 0; 200 return 0;
133} 201}
134 202
203static struct jprobe jp2 = {
204 .entry = j_kprobe_target,
205 .kp.symbol_name = "kprobe_target2"
206};
207
208static int test_jprobes(void)
209{
210 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2};
212
 213	jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */
214 ret = register_jprobes(jps, 2);
215 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: "
217 "register_jprobes returned %d\n", ret);
218 return ret;
219 }
220
221 jph_val = 0;
222 ret = target(rand1);
223 if (jph_val == 0) {
224 printk(KERN_ERR "Kprobe smoke test failed: "
225 "jprobe handler not called\n");
226 handler_errors++;
227 }
228
229 jph_val = 0;
230 ret = target2(rand1);
231 if (jph_val == 0) {
232 printk(KERN_ERR "Kprobe smoke test failed: "
233 "jprobe handler2 not called\n");
234 handler_errors++;
235 }
236 unregister_jprobes(jps, 2);
237
238 return 0;
239}
135#ifdef CONFIG_KRETPROBES 240#ifdef CONFIG_KRETPROBES
136static u32 krph_val; 241static u32 krph_val;
137 242
@@ -177,7 +282,7 @@ static int test_kretprobe(void)
177 return ret; 282 return ret;
178 } 283 }
179 284
180 ret = kprobe_target(rand1); 285 ret = target(rand1);
181 unregister_kretprobe(&rp); 286 unregister_kretprobe(&rp);
182 if (krph_val != rand1) { 287 if (krph_val != rand1) {
183 printk(KERN_ERR "Kprobe smoke test failed: " 288 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -187,12 +292,72 @@ static int test_kretprobe(void)
187 292
188 return 0; 293 return 0;
189} 294}
295
296static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
297{
298 unsigned long ret = regs_return_value(regs);
299
300 if (ret != (rand1 / div_factor) + 1) {
301 handler_errors++;
302 printk(KERN_ERR "Kprobe smoke test failed: "
303 "incorrect value in kretprobe handler2\n");
304 }
305 if (krph_val == 0) {
306 handler_errors++;
307 printk(KERN_ERR "Kprobe smoke test failed: "
308 "call to kretprobe entry handler failed\n");
309 }
310
311 krph_val = rand1;
312 return 0;
313}
314
315static struct kretprobe rp2 = {
316 .handler = return_handler2,
317 .entry_handler = entry_handler,
318 .kp.symbol_name = "kprobe_target2"
319};
320
321static int test_kretprobes(void)
322{
323 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2};
325
 326	rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */
327 ret = register_kretprobes(rps, 2);
328 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: "
330 "register_kretprobe returned %d\n", ret);
331 return ret;
332 }
333
334 krph_val = 0;
335 ret = target(rand1);
336 if (krph_val != rand1) {
337 printk(KERN_ERR "Kprobe smoke test failed: "
338 "kretprobe handler not called\n");
339 handler_errors++;
340 }
341
342 krph_val = 0;
343 ret = target2(rand1);
344 if (krph_val != rand1) {
345 printk(KERN_ERR "Kprobe smoke test failed: "
346 "kretprobe handler2 not called\n");
347 handler_errors++;
348 }
349 unregister_kretprobes(rps, 2);
350 return 0;
351}
190#endif /* CONFIG_KRETPROBES */ 352#endif /* CONFIG_KRETPROBES */
191 353
192int init_test_probes(void) 354int init_test_probes(void)
193{ 355{
194 int ret; 356 int ret;
195 357
358 target = kprobe_target;
359 target2 = kprobe_target2;
360
196 do { 361 do {
197 rand1 = random32(); 362 rand1 = random32();
198 } while (rand1 <= div_factor); 363 } while (rand1 <= div_factor);
@@ -204,15 +369,30 @@ int init_test_probes(void)
204 errors++; 369 errors++;
205 370
206 num_tests++; 371 num_tests++;
372 ret = test_kprobes();
373 if (ret < 0)
374 errors++;
375
376 num_tests++;
207 ret = test_jprobe(); 377 ret = test_jprobe();
208 if (ret < 0) 378 if (ret < 0)
209 errors++; 379 errors++;
210 380
381 num_tests++;
382 ret = test_jprobes();
383 if (ret < 0)
384 errors++;
385
211#ifdef CONFIG_KRETPROBES 386#ifdef CONFIG_KRETPROBES
212 num_tests++; 387 num_tests++;
213 ret = test_kretprobe(); 388 ret = test_kretprobe();
214 if (ret < 0) 389 if (ret < 0)
215 errors++; 390 errors++;
391
392 num_tests++;
393 ret = test_kretprobes();
394 if (ret < 0)
395 errors++;
216#endif /* CONFIG_KRETPROBES */ 396#endif /* CONFIG_KRETPROBES */
217 397
218 if (errors) 398 if (errors)
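
The new test_kprobes()/test_jprobes()/test_kretprobes() cases hand an array of probes to the batch register_*probes() interfaces, which register the whole array and unwind on partial failure. Below is a hedged user-space model of that register-with-rollback pattern; it is not the kprobes API, only the shape of it.

#include <stdio.h>

struct probe {
    const char *name;
    int registered;
    int should_fail;                    /* simulate a registration error */
};

static int register_one(struct probe *p)
{
    if (p->should_fail)
        return -1;
    p->registered = 1;
    return 0;
}

static void unregister_one(struct probe *p)
{
    p->registered = 0;
}

/* Register an array of probes; on any failure, roll back the earlier ones. */
static int register_probes(struct probe **ps, int num)
{
    int i, ret;

    for (i = 0; i < num; i++) {
        ret = register_one(ps[i]);
        if (ret < 0) {
            while (--i >= 0)
                unregister_one(ps[i]);
            return ret;
        }
    }
    return 0;
}

static void unregister_probes(struct probe **ps, int num)
{
    int i;

    for (i = 0; i < num; i++)
        unregister_one(ps[i]);
}

int main(void)
{
    struct probe kp  = { "kprobe_target",  0, 0 };
    struct probe kp2 = { "kprobe_target2", 0, 1 };  /* this one fails */
    struct probe *kps[2] = { &kp, &kp2 };

    if (register_probes(kps, 2) < 0)
        printf("batch failed, kp registered=%d (rolled back)\n", kp.registered);

    kp2.should_fail = 0;
    if (!register_probes(kps, 2)) {
        printf("both registered\n");
        unregister_probes(kps, 2);
    }
    return 0;
}
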
diff --git a/kernel/time.c b/kernel/time.c
index d63a4336fad6..4886e3ce83a4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -37,6 +37,7 @@
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/math64.h> 39#include <linux/math64.h>
40#include <linux/ptrace.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc)
65 66
66 if (tloc) { 67 if (tloc) {
67 if (put_user(i,tloc)) 68 if (put_user(i,tloc))
68 i = -EFAULT; 69 return -EFAULT;
69 } 70 }
71 force_successful_syscall_return();
70 return i; 72 return i;
71} 73}
72 74
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1ca99557e929..06f197560f3b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -45,7 +45,7 @@
45 * 45 *
46 * The value 8 is somewhat carefully chosen, as anything 46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as 47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater then 8 overflow 32bits when 48 * HZ shrinks, so values greater than 8 overflow 32bits when
49 * HZ=100. 49 * HZ=100.
50 */ 50 */
51#define JIFFIES_SHIFT 8 51#define JIFFIES_SHIFT 8
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88aa76f..900f1b6598d1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 46struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 47static unsigned long total_sleep_time; /* seconds */
48 48
49/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended;
51
49static struct timespec xtime_cache __attribute__ ((aligned (16))); 52static struct timespec xtime_cache __attribute__ ((aligned (16)));
50void update_xtime_cache(u64 nsec) 53void update_xtime_cache(u64 nsec)
51{ 54{
@@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts)
92 unsigned long seq; 95 unsigned long seq;
93 s64 nsecs; 96 s64 nsecs;
94 97
98 WARN_ON(timekeeping_suspended);
99
95 do { 100 do {
96 seq = read_seqbegin(&xtime_lock); 101 seq = read_seqbegin(&xtime_lock);
97 102
@@ -299,8 +304,6 @@ void __init timekeeping_init(void)
299 write_sequnlock_irqrestore(&xtime_lock, flags); 304 write_sequnlock_irqrestore(&xtime_lock, flags);
300} 305}
301 306
302/* flag for if timekeeping is suspended */
303static int timekeeping_suspended;
304/* time in seconds when suspend began */ 307/* time in seconds when suspend began */
305static unsigned long timekeeping_suspend_time; 308static unsigned long timekeeping_suspend_time;
306 309
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a9d9760dc7b6..8b0daf0662ef 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event)
168 */ 168 */
169unsigned ring_buffer_event_length(struct ring_buffer_event *event) 169unsigned ring_buffer_event_length(struct ring_buffer_event *event)
170{ 170{
171 return rb_event_length(event); 171 unsigned length = rb_event_length(event);
172 if (event->type != RINGBUF_TYPE_DATA)
173 return length;
174 length -= RB_EVNT_HDR_SIZE;
175 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
176 length -= sizeof(event->array[0]);
177 return length;
172} 178}
173EXPORT_SYMBOL_GPL(ring_buffer_event_length); 179EXPORT_SYMBOL_GPL(ring_buffer_event_length);
174 180
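
ring_buffer_event_length() now reports the payload size the consumer actually sees: for data events it strips the event header, and when the payload needed the extended length word it strips that word as well. A small model of the adjustment follows; the sizes below are invented placeholders, not the real ring-buffer constants.

#include <stdio.h>

/* Illustrative sizes only; the real values live in ring_buffer.c. */
#define RB_EVNT_HDR_SIZE   4U
#define RB_MAX_SMALL_DATA  28U
#define RB_ARRAY_WORD      4U           /* sizeof(event->array[0]) */

enum { RINGBUF_TYPE_DATA, RINGBUF_TYPE_PADDING };

struct event {
    int type;
    unsigned stored_length;             /* length of the whole record, header included */
};

/* Return the payload length visible to the consumer, not the record length. */
static unsigned event_length(const struct event *ev)
{
    unsigned length = ev->stored_length;

    if (ev->type != RINGBUF_TYPE_DATA)
        return length;                  /* padding etc.: report the raw length */

    length -= RB_EVNT_HDR_SIZE;         /* drop the event header */
    if (length > RB_MAX_SMALL_DATA + RB_ARRAY_WORD)
        length -= RB_ARRAY_WORD;        /* large payloads carry a length word */
    return length;
}

int main(void)
{
    struct event small = { RINGBUF_TYPE_DATA, 20 };    /* 16-byte payload   */
    struct event big   = { RINGBUF_TYPE_DATA, 104 };   /* needs length word */

    printf("small payload: %u\n", event_length(&small));   /* 16 */
    printf("big payload:   %u\n", event_length(&big));     /* 96 */
    return 0;
}
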
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 2dc06ab35716..43f891b05a4b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
92 mm = get_task_mm(p); 92 mm = get_task_mm(p);
93 if (mm) { 93 if (mm) {
94 /* adjust to KB unit */ 94 /* adjust to KB unit */
95 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; 95 stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
96 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
97 mmput(mm); 97 mmput(mm);
98 } 98 }
99 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar;