Diffstat (limited to 'kernel')
42 files changed, 1553 insertions, 630 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3365c0..2921d90ce32f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o | 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ |
13 | async.o | ||
13 | 14 | ||
14 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
15 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000000..64cc916299a5
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,323 @@ | |||
1 | /* | ||
2 | * async.c: Asynchronous function calls for boot performance | ||
3 | * | ||
4 | * (C) Copyright 2009 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | |||
13 | |||
14 | /* | ||
15 | |||
16 | Goals and Theory of Operation | ||
17 | |||
18 | The primary goal of this feature is to reduce the kernel boot time, | ||
19 | by running various independent hardware delays and discovery operations | ||
20 | decoupled from each other rather than strictly serialized. | ||
21 | |||
22 | More specifically, the asynchronous function call concept allows | ||
23 | certain operations (primarily during system boot) to happen | ||
24 | asynchronously, out of order, while these operations still | ||
25 | have their externally visible parts happen sequentially and in-order. | ||
26 | (not unlike how out-of-order CPUs retire their instructions in order) | ||
27 | |||
28 | Key to the asynchronous function call implementation is the concept of | ||
29 | a "sequence cookie" (which, although it has an abstracted type, can be | ||
30 | thought of as a monotonically incrementing number). | ||
31 | |||
32 | The async core will assign each scheduled event such a sequence cookie and | ||
33 | pass this to the called functions. | ||
34 | |||
35 | Before doing a globally visible operation, such as registering device | ||
36 | numbers, the asynchronously called function should call the | ||
37 | async_synchronize_cookie() function and pass in its own cookie. The | ||
38 | async_synchronize_cookie() function will make sure that all asynchronous | ||
39 | operations that were scheduled prior to the operation corresponding with the | ||
40 | cookie have completed. | ||
41 | |||
42 | Subsystem/driver initialization code that schedules asynchronous probe | ||
43 | functions, but which shares global resources with other drivers/subsystems | ||
44 | that do not use the asynchronous call feature, needs to do a full | ||
45 | synchronization with the async_synchronize_full() function before returning | ||
46 | from their init function. This is to maintain strict ordering between the | ||
47 | asynchronous and synchronous parts of the kernel. | ||
48 | |||
49 | */ | ||
50 | |||
51 | #include <linux/async.h> | ||
52 | #include <linux/module.h> | ||
53 | #include <linux/wait.h> | ||
54 | #include <linux/sched.h> | ||
55 | #include <linux/init.h> | ||
56 | #include <linux/kthread.h> | ||
57 | #include <asm/atomic.h> | ||
58 | |||
59 | static async_cookie_t next_cookie = 1; | ||
60 | |||
61 | #define MAX_THREADS 256 | ||
62 | #define MAX_WORK 32768 | ||
63 | |||
64 | static LIST_HEAD(async_pending); | ||
65 | static LIST_HEAD(async_running); | ||
66 | static DEFINE_SPINLOCK(async_lock); | ||
67 | |||
68 | struct async_entry { | ||
69 | struct list_head list; | ||
70 | async_cookie_t cookie; | ||
71 | async_func_ptr *func; | ||
72 | void *data; | ||
73 | struct list_head *running; | ||
74 | }; | ||
75 | |||
76 | static DECLARE_WAIT_QUEUE_HEAD(async_done); | ||
77 | static DECLARE_WAIT_QUEUE_HEAD(async_new); | ||
78 | |||
79 | static atomic_t entry_count; | ||
80 | static atomic_t thread_count; | ||
81 | |||
82 | extern int initcall_debug; | ||
83 | |||
84 | |||
85 | /* | ||
86 | * MUST be called with the lock held! | ||
87 | */ | ||
88 | static async_cookie_t __lowest_in_progress(struct list_head *running) | ||
89 | { | ||
90 | struct async_entry *entry; | ||
91 | if (!list_empty(&async_pending)) { | ||
92 | entry = list_first_entry(&async_pending, | ||
93 | struct async_entry, list); | ||
94 | return entry->cookie; | ||
95 | } else if (!list_empty(running)) { | ||
96 | entry = list_first_entry(running, | ||
97 | struct async_entry, list); | ||
98 | return entry->cookie; | ||
99 | } else { | ||
100 | /* nothing in progress... next_cookie is "infinity" */ | ||
101 | return next_cookie; | ||
102 | } | ||
103 | |||
104 | } | ||
105 | /* | ||
106 | * pick the first pending entry and run it | ||
107 | */ | ||
108 | static void run_one_entry(void) | ||
109 | { | ||
110 | unsigned long flags; | ||
111 | struct async_entry *entry; | ||
112 | ktime_t calltime, delta, rettime; | ||
113 | |||
114 | /* 1) pick one task from the pending queue */ | ||
115 | |||
116 | spin_lock_irqsave(&async_lock, flags); | ||
117 | if (list_empty(&async_pending)) | ||
118 | goto out; | ||
119 | entry = list_first_entry(&async_pending, struct async_entry, list); | ||
120 | |||
121 | /* 2) move it to the running queue */ | ||
122 | list_del(&entry->list); | ||
123 | list_add_tail(&entry->list, &async_running); | ||
124 | spin_unlock_irqrestore(&async_lock, flags); | ||
125 | |||
126 | /* 3) run it (and print duration)*/ | ||
127 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | ||
128 | printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); | ||
129 | calltime = ktime_get(); | ||
130 | } | ||
131 | entry->func(entry->data, entry->cookie); | ||
132 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | ||
133 | rettime = ktime_get(); | ||
134 | delta = ktime_sub(rettime, calltime); | ||
135 | printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, | ||
136 | entry->func, ktime_to_ns(delta) >> 10); | ||
137 | } | ||
138 | |||
139 | /* 4) remove it from the running queue */ | ||
140 | spin_lock_irqsave(&async_lock, flags); | ||
141 | list_del(&entry->list); | ||
142 | |||
143 | /* 5) free the entry */ | ||
144 | kfree(entry); | ||
145 | atomic_dec(&entry_count); | ||
146 | |||
147 | spin_unlock_irqrestore(&async_lock, flags); | ||
148 | |||
149 | /* 6) wake up any waiters. */ | ||
150 | wake_up(&async_done); | ||
151 | return; | ||
152 | |||
153 | out: | ||
154 | spin_unlock_irqrestore(&async_lock, flags); | ||
155 | } | ||
156 | |||
157 | |||
158 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) | ||
159 | { | ||
160 | struct async_entry *entry; | ||
161 | unsigned long flags; | ||
162 | async_cookie_t newcookie; | ||
163 | |||
164 | |||
165 | /* allow irq-off callers */ | ||
166 | entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); | ||
167 | |||
168 | /* | ||
169 | * If we're out of memory or if there's too much work | ||
170 | * pending already, we execute synchronously. | ||
171 | */ | ||
172 | if (!entry || atomic_read(&entry_count) > MAX_WORK) { | ||
173 | kfree(entry); | ||
174 | spin_lock_irqsave(&async_lock, flags); | ||
175 | newcookie = next_cookie++; | ||
176 | spin_unlock_irqrestore(&async_lock, flags); | ||
177 | |||
178 | /* low on memory.. run synchronously */ | ||
179 | ptr(data, newcookie); | ||
180 | return newcookie; | ||
181 | } | ||
182 | entry->func = ptr; | ||
183 | entry->data = data; | ||
184 | entry->running = running; | ||
185 | |||
186 | spin_lock_irqsave(&async_lock, flags); | ||
187 | newcookie = entry->cookie = next_cookie++; | ||
188 | list_add_tail(&entry->list, &async_pending); | ||
189 | atomic_inc(&entry_count); | ||
190 | spin_unlock_irqrestore(&async_lock, flags); | ||
191 | wake_up(&async_new); | ||
192 | return newcookie; | ||
193 | } | ||
194 | |||
195 | async_cookie_t async_schedule(async_func_ptr *ptr, void *data) | ||
196 | { | ||
197 | return __async_schedule(ptr, data, &async_pending); | ||
198 | } | ||
199 | EXPORT_SYMBOL_GPL(async_schedule); | ||
200 | |||
201 | async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) | ||
202 | { | ||
203 | return __async_schedule(ptr, data, running); | ||
204 | } | ||
205 | EXPORT_SYMBOL_GPL(async_schedule_special); | ||
206 | |||
207 | void async_synchronize_full(void) | ||
208 | { | ||
209 | do { | ||
210 | async_synchronize_cookie(next_cookie); | ||
211 | } while (!list_empty(&async_running) || !list_empty(&async_pending)); | ||
212 | } | ||
213 | EXPORT_SYMBOL_GPL(async_synchronize_full); | ||
214 | |||
215 | void async_synchronize_full_special(struct list_head *list) | ||
216 | { | ||
217 | async_synchronize_cookie_special(next_cookie, list); | ||
218 | } | ||
219 | EXPORT_SYMBOL_GPL(async_synchronize_full_special); | ||
220 | |||
221 | void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) | ||
222 | { | ||
223 | ktime_t starttime, delta, endtime; | ||
224 | |||
225 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | ||
226 | printk("async_waiting @ %i\n", task_pid_nr(current)); | ||
227 | starttime = ktime_get(); | ||
228 | } | ||
229 | |||
230 | wait_event(async_done, __lowest_in_progress(running) >= cookie); | ||
231 | |||
232 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | ||
233 | endtime = ktime_get(); | ||
234 | delta = ktime_sub(endtime, starttime); | ||
235 | |||
236 | printk("async_continuing @ %i after %lli usec\n", | ||
237 | task_pid_nr(current), ktime_to_ns(delta) >> 10); | ||
238 | } | ||
239 | } | ||
240 | EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); | ||
241 | |||
242 | void async_synchronize_cookie(async_cookie_t cookie) | ||
243 | { | ||
244 | async_synchronize_cookie_special(cookie, &async_running); | ||
245 | } | ||
246 | EXPORT_SYMBOL_GPL(async_synchronize_cookie); | ||
247 | |||
248 | |||
249 | static int async_thread(void *unused) | ||
250 | { | ||
251 | DECLARE_WAITQUEUE(wq, current); | ||
252 | add_wait_queue(&async_new, &wq); | ||
253 | |||
254 | while (!kthread_should_stop()) { | ||
255 | int ret = HZ; | ||
256 | set_current_state(TASK_INTERRUPTIBLE); | ||
257 | /* | ||
258 | * check the list head without lock.. false positives | ||
259 | * are dealt with inside run_one_entry() while holding | ||
260 | * the lock. | ||
261 | */ | ||
262 | rmb(); | ||
263 | if (!list_empty(&async_pending)) | ||
264 | run_one_entry(); | ||
265 | else | ||
266 | ret = schedule_timeout(HZ); | ||
267 | |||
268 | if (ret == 0) { | ||
269 | /* | ||
270 | * we timed out, which means this thread is redundant. | ||
271 | * we sign off and die, but to avoid any races there | ||
272 | * is a last-straw check to see if work snuck in. | ||
273 | */ | ||
274 | atomic_dec(&thread_count); | ||
275 | wmb(); /* manager must see our departure first */ | ||
276 | if (list_empty(&async_pending)) | ||
277 | break; | ||
278 | /* | ||
279 | * whoops, work came in between us timing out and us | ||
280 | * signing off; we need to stay alive and keep working. | ||
281 | */ | ||
282 | atomic_inc(&thread_count); | ||
283 | } | ||
284 | } | ||
285 | remove_wait_queue(&async_new, &wq); | ||
286 | |||
287 | return 0; | ||
288 | } | ||
289 | |||
290 | static int async_manager_thread(void *unused) | ||
291 | { | ||
292 | DECLARE_WAITQUEUE(wq, current); | ||
293 | add_wait_queue(&async_new, &wq); | ||
294 | |||
295 | while (!kthread_should_stop()) { | ||
296 | int tc, ec; | ||
297 | |||
298 | set_current_state(TASK_INTERRUPTIBLE); | ||
299 | |||
300 | tc = atomic_read(&thread_count); | ||
301 | rmb(); | ||
302 | ec = atomic_read(&entry_count); | ||
303 | |||
304 | while (tc < ec && tc < MAX_THREADS) { | ||
305 | kthread_run(async_thread, NULL, "async/%i", tc); | ||
306 | atomic_inc(&thread_count); | ||
307 | tc++; | ||
308 | } | ||
309 | |||
310 | schedule(); | ||
311 | } | ||
312 | remove_wait_queue(&async_new, &wq); | ||
313 | |||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | static int __init async_init(void) | ||
318 | { | ||
319 | kthread_run(async_manager_thread, NULL, "async/mgr"); | ||
320 | return 0; | ||
321 | } | ||
322 | |||
323 | core_initcall(async_init); | ||
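
As a rough illustration of the API described in the theory-of-operation comment at the top of async.c, a driver probe path might use it as sketched below. The my_* names (my_device, my_probe_hw, my_register_device, my_devices, my_device_count, my_driver_init) are hypothetical; only the async_* calls come from this patch.

/*
 * Hypothetical driver-side use of the new async API (sketch only).
 */
#include <linux/async.h>
#include <linux/init.h>
#include <linux/module.h>

static void my_probe_one(void *data, async_cookie_t cookie)
{
	struct my_device *dev = data;	/* hypothetical device type */

	my_probe_hw(dev);		/* slow work, may run out of order */

	/*
	 * Before the externally visible step, wait until every probe
	 * scheduled before this one has completed, so device numbers
	 * are still handed out in the traditional order.
	 */
	async_synchronize_cookie(cookie);
	my_register_device(dev);
}

static int __init my_driver_init(void)
{
	int i;

	for (i = 0; i < my_device_count; i++)
		async_schedule(my_probe_one, &my_devices[i]);

	/*
	 * This init path shares global state with synchronous code,
	 * so flush all outstanding async work before returning.
	 */
	async_synchronize_full();
	return 0;
}
module_init(my_driver_init);

Note that waiting on one's own cookie does not deadlock: async_synchronize_cookie() only blocks until __lowest_in_progress() reaches that cookie, i.e. until all strictly earlier entries have left the pending and running lists.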
diff --git a/kernel/capability.c b/kernel/capability.c
index c598d9d5be4f..688926e496be 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap) | |||
306 | BUG(); | 306 | BUG(); |
307 | } | 307 | } |
308 | 308 | ||
309 | if (has_capability(current, cap)) { | 309 | if (security_capable(cap) == 0) { |
310 | current->flags |= PF_SUPERPRIV; | 310 | current->flags |= PF_SUPERPRIV; |
311 | return 1; | 311 | return 1; |
312 | } | 312 | } |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 891a84eb9d30..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root { | |||
84 | /* Tracks how many cgroups are currently defined in hierarchy.*/ | 84 | /* Tracks how many cgroups are currently defined in hierarchy.*/ |
85 | int number_of_cgroups; | 85 | int number_of_cgroups; |
86 | 86 | ||
87 | /* A list running through the mounted hierarchies */ | 87 | /* A list running through the active hierarchies */ |
88 | struct list_head root_list; | 88 | struct list_head root_list; |
89 | 89 | ||
90 | /* Hierarchy-specific flags */ | 90 | /* Hierarchy-specific flags */ |
@@ -116,7 +116,6 @@ static int root_count; | |||
116 | * be called. | 116 | * be called. |
117 | */ | 117 | */ |
118 | static int need_forkexit_callback __read_mostly; | 118 | static int need_forkexit_callback __read_mostly; |
119 | static int need_mm_owner_callback __read_mostly; | ||
120 | 119 | ||
121 | /* convenient tests for these bits */ | 120 | /* convenient tests for these bits */ |
122 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 121 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
149 | #define for_each_subsys(_root, _ss) \ | 148 | #define for_each_subsys(_root, _ss) \ |
150 | list_for_each_entry(_ss, &_root->subsys_list, sibling) | 149 | list_for_each_entry(_ss, &_root->subsys_list, sibling) |
151 | 150 | ||
152 | /* for_each_root() allows you to iterate across the active hierarchies */ | 151 | /* for_each_active_root() allows you to iterate across the active hierarchies */ |
153 | #define for_each_root(_root) \ | 152 | #define for_each_active_root(_root) \ |
154 | list_for_each_entry(_root, &roots, root_list) | 153 | list_for_each_entry(_root, &roots, root_list) |
155 | 154 | ||
156 | /* the list of cgroups eligible for automatic release. Protected by | 155 | /* the list of cgroups eligible for automatic release. Protected by |
@@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
272 | 271 | ||
273 | rcu_read_lock(); | 272 | rcu_read_lock(); |
274 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 273 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
275 | struct cgroup *cgrp = cg->subsys[i]->cgroup; | 274 | struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); |
276 | if (atomic_dec_and_test(&cgrp->count) && | 275 | if (atomic_dec_and_test(&cgrp->count) && |
277 | notify_on_release(cgrp)) { | 276 | notify_on_release(cgrp)) { |
278 | if (taskexit) | 277 | if (taskexit) |
@@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp) | |||
385 | return 0; | 384 | return 0; |
386 | } | 385 | } |
387 | 386 | ||
387 | /** | ||
388 | * link_css_set - a helper function to link a css_set to a cgroup | ||
389 | * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() | ||
390 | * @cg: the css_set to be linked | ||
391 | * @cgrp: the destination cgroup | ||
392 | */ | ||
393 | static void link_css_set(struct list_head *tmp_cg_links, | ||
394 | struct css_set *cg, struct cgroup *cgrp) | ||
395 | { | ||
396 | struct cg_cgroup_link *link; | ||
397 | |||
398 | BUG_ON(list_empty(tmp_cg_links)); | ||
399 | link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, | ||
400 | cgrp_link_list); | ||
401 | link->cg = cg; | ||
402 | list_move(&link->cgrp_link_list, &cgrp->css_sets); | ||
403 | list_add(&link->cg_link_list, &cg->cg_links); | ||
404 | } | ||
405 | |||
388 | /* | 406 | /* |
389 | * find_css_set() takes an existing cgroup group and a | 407 | * find_css_set() takes an existing cgroup group and a |
390 | * cgroup object, and returns a css_set object that's | 408 | * cgroup object, and returns a css_set object that's |
@@ -400,7 +418,6 @@ static struct css_set *find_css_set( | |||
400 | int i; | 418 | int i; |
401 | 419 | ||
402 | struct list_head tmp_cg_links; | 420 | struct list_head tmp_cg_links; |
403 | struct cg_cgroup_link *link; | ||
404 | 421 | ||
405 | struct hlist_head *hhead; | 422 | struct hlist_head *hhead; |
406 | 423 | ||
@@ -445,26 +462,11 @@ static struct css_set *find_css_set( | |||
445 | * only do it for the first subsystem in each | 462 | * only do it for the first subsystem in each |
446 | * hierarchy | 463 | * hierarchy |
447 | */ | 464 | */ |
448 | if (ss->root->subsys_list.next == &ss->sibling) { | 465 | if (ss->root->subsys_list.next == &ss->sibling) |
449 | BUG_ON(list_empty(&tmp_cg_links)); | 466 | link_css_set(&tmp_cg_links, res, cgrp); |
450 | link = list_entry(tmp_cg_links.next, | ||
451 | struct cg_cgroup_link, | ||
452 | cgrp_link_list); | ||
453 | list_del(&link->cgrp_link_list); | ||
454 | list_add(&link->cgrp_link_list, &cgrp->css_sets); | ||
455 | link->cg = res; | ||
456 | list_add(&link->cg_link_list, &res->cg_links); | ||
457 | } | ||
458 | } | ||
459 | if (list_empty(&rootnode.subsys_list)) { | ||
460 | link = list_entry(tmp_cg_links.next, | ||
461 | struct cg_cgroup_link, | ||
462 | cgrp_link_list); | ||
463 | list_del(&link->cgrp_link_list); | ||
464 | list_add(&link->cgrp_link_list, &dummytop->css_sets); | ||
465 | link->cg = res; | ||
466 | list_add(&link->cg_link_list, &res->cg_links); | ||
467 | } | 467 | } |
468 | if (list_empty(&rootnode.subsys_list)) | ||
469 | link_css_set(&tmp_cg_links, res, dummytop); | ||
468 | 470 | ||
469 | BUG_ON(!list_empty(&tmp_cg_links)); | 471 | BUG_ON(!list_empty(&tmp_cg_links)); |
470 | 472 | ||
@@ -573,7 +575,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | |||
573 | inode->i_mode = mode; | 575 | inode->i_mode = mode; |
574 | inode->i_uid = current_fsuid(); | 576 | inode->i_uid = current_fsuid(); |
575 | inode->i_gid = current_fsgid(); | 577 | inode->i_gid = current_fsgid(); |
576 | inode->i_blocks = 0; | ||
577 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 578 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
578 | inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; | 579 | inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; |
579 | } | 580 | } |
@@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
588 | { | 589 | { |
589 | struct cgroup_subsys *ss; | 590 | struct cgroup_subsys *ss; |
590 | for_each_subsys(cgrp->root, ss) | 591 | for_each_subsys(cgrp->root, ss) |
591 | if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) | 592 | if (ss->pre_destroy) |
592 | ss->pre_destroy(ss, cgrp); | 593 | ss->pre_destroy(ss, cgrp); |
593 | return; | 594 | return; |
594 | } | 595 | } |
595 | 596 | ||
597 | static void free_cgroup_rcu(struct rcu_head *obj) | ||
598 | { | ||
599 | struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); | ||
600 | |||
601 | kfree(cgrp); | ||
602 | } | ||
603 | |||
596 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 604 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
597 | { | 605 | { |
598 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 606 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
612 | /* | 620 | /* |
613 | * Release the subsystem state objects. | 621 | * Release the subsystem state objects. |
614 | */ | 622 | */ |
615 | for_each_subsys(cgrp->root, ss) { | 623 | for_each_subsys(cgrp->root, ss) |
616 | if (cgrp->subsys[ss->subsys_id]) | 624 | ss->destroy(ss, cgrp); |
617 | ss->destroy(ss, cgrp); | ||
618 | } | ||
619 | 625 | ||
620 | cgrp->root->number_of_cgroups--; | 626 | cgrp->root->number_of_cgroups--; |
621 | mutex_unlock(&cgroup_mutex); | 627 | mutex_unlock(&cgroup_mutex); |
622 | 628 | ||
623 | /* Drop the active superblock reference that we took when we | 629 | /* |
624 | * created the cgroup */ | 630 | * Drop the active superblock reference that we took when we |
631 | * created the cgroup | ||
632 | */ | ||
625 | deactivate_super(cgrp->root->sb); | 633 | deactivate_super(cgrp->root->sb); |
626 | 634 | ||
627 | kfree(cgrp); | 635 | call_rcu(&cgrp->rcu_head, free_cgroup_rcu); |
628 | } | 636 | } |
629 | iput(inode); | 637 | iput(inode); |
630 | } | 638 | } |
@@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
714 | BUG_ON(cgrp->subsys[i]); | 722 | BUG_ON(cgrp->subsys[i]); |
715 | BUG_ON(!dummytop->subsys[i]); | 723 | BUG_ON(!dummytop->subsys[i]); |
716 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); | 724 | BUG_ON(dummytop->subsys[i]->cgroup != dummytop); |
725 | mutex_lock(&ss->hierarchy_mutex); | ||
717 | cgrp->subsys[i] = dummytop->subsys[i]; | 726 | cgrp->subsys[i] = dummytop->subsys[i]; |
718 | cgrp->subsys[i]->cgroup = cgrp; | 727 | cgrp->subsys[i]->cgroup = cgrp; |
719 | list_add(&ss->sibling, &root->subsys_list); | 728 | list_move(&ss->sibling, &root->subsys_list); |
720 | rcu_assign_pointer(ss->root, root); | 729 | ss->root = root; |
721 | if (ss->bind) | 730 | if (ss->bind) |
722 | ss->bind(ss, cgrp); | 731 | ss->bind(ss, cgrp); |
723 | 732 | mutex_unlock(&ss->hierarchy_mutex); | |
724 | } else if (bit & removed_bits) { | 733 | } else if (bit & removed_bits) { |
725 | /* We're removing this subsystem */ | 734 | /* We're removing this subsystem */ |
726 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); | 735 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); |
727 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 736 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); |
737 | mutex_lock(&ss->hierarchy_mutex); | ||
728 | if (ss->bind) | 738 | if (ss->bind) |
729 | ss->bind(ss, dummytop); | 739 | ss->bind(ss, dummytop); |
730 | dummytop->subsys[i]->cgroup = dummytop; | 740 | dummytop->subsys[i]->cgroup = dummytop; |
731 | cgrp->subsys[i] = NULL; | 741 | cgrp->subsys[i] = NULL; |
732 | rcu_assign_pointer(subsys[i]->root, &rootnode); | 742 | subsys[i]->root = &rootnode; |
733 | list_del(&ss->sibling); | 743 | list_move(&ss->sibling, &rootnode.subsys_list); |
744 | mutex_unlock(&ss->hierarchy_mutex); | ||
734 | } else if (bit & final_bits) { | 745 | } else if (bit & final_bits) { |
735 | /* Subsystem state should already exist */ | 746 | /* Subsystem state should already exist */ |
736 | BUG_ON(!cgrp->subsys[i]); | 747 | BUG_ON(!cgrp->subsys[i]); |
@@ -992,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
992 | root = NULL; | 1003 | root = NULL; |
993 | } else { | 1004 | } else { |
994 | /* New superblock */ | 1005 | /* New superblock */ |
995 | struct cgroup *cgrp = &root->top_cgroup; | 1006 | struct cgroup *root_cgrp = &root->top_cgroup; |
996 | struct inode *inode; | 1007 | struct inode *inode; |
997 | int i; | 1008 | int i; |
998 | 1009 | ||
@@ -1033,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1033 | list_add(&root->root_list, &roots); | 1044 | list_add(&root->root_list, &roots); |
1034 | root_count++; | 1045 | root_count++; |
1035 | 1046 | ||
1036 | sb->s_root->d_fsdata = &root->top_cgroup; | 1047 | sb->s_root->d_fsdata = root_cgrp; |
1037 | root->top_cgroup.dentry = sb->s_root; | 1048 | root->top_cgroup.dentry = sb->s_root; |
1038 | 1049 | ||
1039 | /* Link the top cgroup in this hierarchy into all | 1050 | /* Link the top cgroup in this hierarchy into all |
@@ -1044,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
1044 | struct hlist_node *node; | 1055 | struct hlist_node *node; |
1045 | struct css_set *cg; | 1056 | struct css_set *cg; |
1046 | 1057 | ||
1047 | hlist_for_each_entry(cg, node, hhead, hlist) { | 1058 | hlist_for_each_entry(cg, node, hhead, hlist) |
1048 | struct cg_cgroup_link *link; | 1059 | link_css_set(&tmp_cg_links, cg, root_cgrp); |
1049 | |||
1050 | BUG_ON(list_empty(&tmp_cg_links)); | ||
1051 | link = list_entry(tmp_cg_links.next, | ||
1052 | struct cg_cgroup_link, | ||
1053 | cgrp_link_list); | ||
1054 | list_del(&link->cgrp_link_list); | ||
1055 | link->cg = cg; | ||
1056 | list_add(&link->cgrp_link_list, | ||
1057 | &root->top_cgroup.css_sets); | ||
1058 | list_add(&link->cg_link_list, &cg->cg_links); | ||
1059 | } | ||
1060 | } | 1060 | } |
1061 | write_unlock(&css_set_lock); | 1061 | write_unlock(&css_set_lock); |
1062 | 1062 | ||
1063 | free_cg_links(&tmp_cg_links); | 1063 | free_cg_links(&tmp_cg_links); |
1064 | 1064 | ||
1065 | BUG_ON(!list_empty(&cgrp->sibling)); | 1065 | BUG_ON(!list_empty(&root_cgrp->sibling)); |
1066 | BUG_ON(!list_empty(&cgrp->children)); | 1066 | BUG_ON(!list_empty(&root_cgrp->children)); |
1067 | BUG_ON(root->number_of_cgroups != 1); | 1067 | BUG_ON(root->number_of_cgroups != 1); |
1068 | 1068 | ||
1069 | cgroup_populate_dir(cgrp); | 1069 | cgroup_populate_dir(root_cgrp); |
1070 | mutex_unlock(&inode->i_mutex); | 1070 | mutex_unlock(&inode->i_mutex); |
1071 | mutex_unlock(&cgroup_mutex); | 1071 | mutex_unlock(&cgroup_mutex); |
1072 | } | 1072 | } |
@@ -1115,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1115 | } | 1115 | } |
1116 | write_unlock(&css_set_lock); | 1116 | write_unlock(&css_set_lock); |
1117 | 1117 | ||
1118 | if (!list_empty(&root->root_list)) { | 1118 | list_del(&root->root_list); |
1119 | list_del(&root->root_list); | 1119 | root_count--; |
1120 | root_count--; | 1120 | |
1121 | } | ||
1122 | mutex_unlock(&cgroup_mutex); | 1121 | mutex_unlock(&cgroup_mutex); |
1123 | 1122 | ||
1124 | kfree(root); | 1123 | kfree(root); |
@@ -1147,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
1147 | * @buf: the buffer to write the path into | 1146 | * @buf: the buffer to write the path into |
1148 | * @buflen: the length of the buffer | 1147 | * @buflen: the length of the buffer |
1149 | * | 1148 | * |
1150 | * Called with cgroup_mutex held. Writes path of cgroup into buf. | 1149 | * Called with cgroup_mutex held or else with an RCU-protected cgroup |
1151 | * Returns 0 on success, -errno on error. | 1150 | * reference. Writes path of cgroup into buf. Returns 0 on success, |
1151 | * -errno on error. | ||
1152 | */ | 1152 | */ |
1153 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1153 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1154 | { | 1154 | { |
1155 | char *start; | 1155 | char *start; |
1156 | struct dentry *dentry = rcu_dereference(cgrp->dentry); | ||
1156 | 1157 | ||
1157 | if (cgrp == dummytop) { | 1158 | if (!dentry || cgrp == dummytop) { |
1158 | /* | 1159 | /* |
1159 | * Inactive subsystems have no dentry for their root | 1160 | * Inactive subsystems have no dentry for their root |
1160 | * cgroup | 1161 | * cgroup |
@@ -1167,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1167 | 1168 | ||
1168 | *--start = '\0'; | 1169 | *--start = '\0'; |
1169 | for (;;) { | 1170 | for (;;) { |
1170 | int len = cgrp->dentry->d_name.len; | 1171 | int len = dentry->d_name.len; |
1171 | if ((start -= len) < buf) | 1172 | if ((start -= len) < buf) |
1172 | return -ENAMETOOLONG; | 1173 | return -ENAMETOOLONG; |
1173 | memcpy(start, cgrp->dentry->d_name.name, len); | 1174 | memcpy(start, cgrp->dentry->d_name.name, len); |
1174 | cgrp = cgrp->parent; | 1175 | cgrp = cgrp->parent; |
1175 | if (!cgrp) | 1176 | if (!cgrp) |
1176 | break; | 1177 | break; |
1178 | dentry = rcu_dereference(cgrp->dentry); | ||
1177 | if (!cgrp->parent) | 1179 | if (!cgrp->parent) |
1178 | continue; | 1180 | continue; |
1179 | if (--start < buf) | 1181 | if (--start < buf) |
@@ -1218,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1218 | int retval = 0; | 1220 | int retval = 0; |
1219 | struct cgroup_subsys *ss; | 1221 | struct cgroup_subsys *ss; |
1220 | struct cgroup *oldcgrp; | 1222 | struct cgroup *oldcgrp; |
1221 | struct css_set *cg = tsk->cgroups; | 1223 | struct css_set *cg; |
1222 | struct css_set *newcg; | 1224 | struct css_set *newcg; |
1223 | struct cgroupfs_root *root = cgrp->root; | 1225 | struct cgroupfs_root *root = cgrp->root; |
1224 | int subsys_id; | 1226 | int subsys_id; |
@@ -1238,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1238 | } | 1240 | } |
1239 | } | 1241 | } |
1240 | 1242 | ||
1243 | task_lock(tsk); | ||
1244 | cg = tsk->cgroups; | ||
1245 | get_css_set(cg); | ||
1246 | task_unlock(tsk); | ||
1241 | /* | 1247 | /* |
1242 | * Locate or allocate a new css_set for this task, | 1248 | * Locate or allocate a new css_set for this task, |
1243 | * based on its final set of cgroups | 1249 | * based on its final set of cgroups |
1244 | */ | 1250 | */ |
1245 | newcg = find_css_set(cg, cgrp); | 1251 | newcg = find_css_set(cg, cgrp); |
1252 | put_css_set(cg); | ||
1246 | if (!newcg) | 1253 | if (!newcg) |
1247 | return -ENOMEM; | 1254 | return -ENOMEM; |
1248 | 1255 | ||
@@ -1447,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | |||
1447 | struct cftype *cft = __d_cft(file->f_dentry); | 1454 | struct cftype *cft = __d_cft(file->f_dentry); |
1448 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 1455 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
1449 | 1456 | ||
1450 | if (!cft || cgroup_is_removed(cgrp)) | 1457 | if (cgroup_is_removed(cgrp)) |
1451 | return -ENODEV; | 1458 | return -ENODEV; |
1452 | if (cft->write) | 1459 | if (cft->write) |
1453 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 1460 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); |
@@ -1492,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
1492 | struct cftype *cft = __d_cft(file->f_dentry); | 1499 | struct cftype *cft = __d_cft(file->f_dentry); |
1493 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 1500 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
1494 | 1501 | ||
1495 | if (!cft || cgroup_is_removed(cgrp)) | 1502 | if (cgroup_is_removed(cgrp)) |
1496 | return -ENODEV; | 1503 | return -ENODEV; |
1497 | 1504 | ||
1498 | if (cft->read) | 1505 | if (cft->read) |
@@ -1556,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) | |||
1556 | err = generic_file_open(inode, file); | 1563 | err = generic_file_open(inode, file); |
1557 | if (err) | 1564 | if (err) |
1558 | return err; | 1565 | return err; |
1559 | |||
1560 | cft = __d_cft(file->f_dentry); | 1566 | cft = __d_cft(file->f_dentry); |
1561 | if (!cft) | 1567 | |
1562 | return -ENODEV; | ||
1563 | if (cft->read_map || cft->read_seq_string) { | 1568 | if (cft->read_map || cft->read_seq_string) { |
1564 | struct cgroup_seqfile_state *state = | 1569 | struct cgroup_seqfile_state *state = |
1565 | kzalloc(sizeof(*state), GFP_USER); | 1570 | kzalloc(sizeof(*state), GFP_USER); |
@@ -1673,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | |||
1673 | if (!error) { | 1678 | if (!error) { |
1674 | dentry->d_fsdata = cgrp; | 1679 | dentry->d_fsdata = cgrp; |
1675 | inc_nlink(parent->d_inode); | 1680 | inc_nlink(parent->d_inode); |
1676 | cgrp->dentry = dentry; | 1681 | rcu_assign_pointer(cgrp->dentry, dentry); |
1677 | dget(dentry); | 1682 | dget(dentry); |
1678 | } | 1683 | } |
1679 | dput(dentry); | 1684 | dput(dentry); |
@@ -1814,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
1814 | { | 1819 | { |
1815 | struct task_struct *res; | 1820 | struct task_struct *res; |
1816 | struct list_head *l = it->task; | 1821 | struct list_head *l = it->task; |
1822 | struct cg_cgroup_link *link; | ||
1817 | 1823 | ||
1818 | /* If the iterator cg is NULL, we have no tasks */ | 1824 | /* If the iterator cg is NULL, we have no tasks */ |
1819 | if (!it->cg_link) | 1825 | if (!it->cg_link) |
@@ -1821,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
1821 | res = list_entry(l, struct task_struct, cg_list); | 1827 | res = list_entry(l, struct task_struct, cg_list); |
1822 | /* Advance iterator to find next entry */ | 1828 | /* Advance iterator to find next entry */ |
1823 | l = l->next; | 1829 | l = l->next; |
1824 | if (l == &res->cgroups->tasks) { | 1830 | link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); |
1831 | if (l == &link->cg->tasks) { | ||
1825 | /* We reached the end of this task list - move on to | 1832 | /* We reached the end of this task list - move on to |
1826 | * the next cg_cgroup_link */ | 1833 | * the next cg_cgroup_link */ |
1827 | cgroup_advance_iter(cgrp, it); | 1834 | cgroup_advance_iter(cgrp, it); |
@@ -2015,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
2015 | */ | 2022 | */ |
2016 | static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) | 2023 | static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) |
2017 | { | 2024 | { |
2018 | int n = 0; | 2025 | int n = 0, pid; |
2019 | struct cgroup_iter it; | 2026 | struct cgroup_iter it; |
2020 | struct task_struct *tsk; | 2027 | struct task_struct *tsk; |
2021 | cgroup_iter_start(cgrp, &it); | 2028 | cgroup_iter_start(cgrp, &it); |
2022 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 2029 | while ((tsk = cgroup_iter_next(cgrp, &it))) { |
2023 | if (unlikely(n == npids)) | 2030 | if (unlikely(n == npids)) |
2024 | break; | 2031 | break; |
2025 | pidarray[n++] = task_pid_vnr(tsk); | 2032 | pid = task_pid_vnr(tsk); |
2033 | if (pid > 0) | ||
2034 | pidarray[n++] = pid; | ||
2026 | } | 2035 | } |
2027 | cgroup_iter_end(cgrp, &it); | 2036 | cgroup_iter_end(cgrp, &it); |
2028 | return n; | 2037 | return n; |
@@ -2054,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
2054 | 2063 | ||
2055 | ret = 0; | 2064 | ret = 0; |
2056 | cgrp = dentry->d_fsdata; | 2065 | cgrp = dentry->d_fsdata; |
2057 | rcu_read_lock(); | ||
2058 | 2066 | ||
2059 | cgroup_iter_start(cgrp, &it); | 2067 | cgroup_iter_start(cgrp, &it); |
2060 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 2068 | while ((tsk = cgroup_iter_next(cgrp, &it))) { |
@@ -2079,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
2079 | } | 2087 | } |
2080 | cgroup_iter_end(cgrp, &it); | 2088 | cgroup_iter_end(cgrp, &it); |
2081 | 2089 | ||
2082 | rcu_read_unlock(); | ||
2083 | err: | 2090 | err: |
2084 | return ret; | 2091 | return ret; |
2085 | } | 2092 | } |
@@ -2326,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
2326 | struct cgroup *cgrp) | 2333 | struct cgroup *cgrp) |
2327 | { | 2334 | { |
2328 | css->cgroup = cgrp; | 2335 | css->cgroup = cgrp; |
2329 | atomic_set(&css->refcnt, 0); | 2336 | atomic_set(&css->refcnt, 1); |
2330 | css->flags = 0; | 2337 | css->flags = 0; |
2331 | if (cgrp == dummytop) | 2338 | if (cgrp == dummytop) |
2332 | set_bit(CSS_ROOT, &css->flags); | 2339 | set_bit(CSS_ROOT, &css->flags); |
@@ -2334,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
2334 | cgrp->subsys[ss->subsys_id] = css; | 2341 | cgrp->subsys[ss->subsys_id] = css; |
2335 | } | 2342 | } |
2336 | 2343 | ||
2344 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | ||
2345 | { | ||
2346 | /* We need to take each hierarchy_mutex in a consistent order */ | ||
2347 | int i; | ||
2348 | |||
2349 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
2350 | struct cgroup_subsys *ss = subsys[i]; | ||
2351 | if (ss->root == root) | ||
2352 | mutex_lock_nested(&ss->hierarchy_mutex, i); | ||
2353 | } | ||
2354 | } | ||
2355 | |||
2356 | static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) | ||
2357 | { | ||
2358 | int i; | ||
2359 | |||
2360 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
2361 | struct cgroup_subsys *ss = subsys[i]; | ||
2362 | if (ss->root == root) | ||
2363 | mutex_unlock(&ss->hierarchy_mutex); | ||
2364 | } | ||
2365 | } | ||
2366 | |||
2337 | /* | 2367 | /* |
2338 | * cgroup_create - create a cgroup | 2368 | * cgroup_create - create a cgroup |
2339 | * @parent: cgroup that will be parent of the new cgroup | 2369 | * @parent: cgroup that will be parent of the new cgroup |
@@ -2382,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
2382 | init_cgroup_css(css, ss, cgrp); | 2412 | init_cgroup_css(css, ss, cgrp); |
2383 | } | 2413 | } |
2384 | 2414 | ||
2415 | cgroup_lock_hierarchy(root); | ||
2385 | list_add(&cgrp->sibling, &cgrp->parent->children); | 2416 | list_add(&cgrp->sibling, &cgrp->parent->children); |
2417 | cgroup_unlock_hierarchy(root); | ||
2386 | root->number_of_cgroups++; | 2418 | root->number_of_cgroups++; |
2387 | 2419 | ||
2388 | err = cgroup_create_dir(cgrp, dentry, mode); | 2420 | err = cgroup_create_dir(cgrp, dentry, mode); |
@@ -2433,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
2433 | { | 2465 | { |
2434 | /* Check the reference count on each subsystem. Since we | 2466 | /* Check the reference count on each subsystem. Since we |
2435 | * already established that there are no tasks in the | 2467 | * already established that there are no tasks in the |
2436 | * cgroup, if the css refcount is also 0, then there should | 2468 | * cgroup, if the css refcount is also 1, then there should |
2437 | * be no outstanding references, so the subsystem is safe to | 2469 | * be no outstanding references, so the subsystem is safe to |
2438 | * destroy. We scan across all subsystems rather than using | 2470 | * destroy. We scan across all subsystems rather than using |
2439 | * the per-hierarchy linked list of mounted subsystems since | 2471 | * the per-hierarchy linked list of mounted subsystems since |
@@ -2454,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
2454 | * matter, since it can only happen if the cgroup | 2486 | * matter, since it can only happen if the cgroup |
2455 | * has been deleted and hence no longer needs the | 2487 | * has been deleted and hence no longer needs the |
2456 | * release agent to be called anyway. */ | 2488 | * release agent to be called anyway. */ |
2457 | if (css && atomic_read(&css->refcnt)) | 2489 | if (css && (atomic_read(&css->refcnt) > 1)) |
2458 | return 1; | 2490 | return 1; |
2459 | } | 2491 | } |
2460 | return 0; | 2492 | return 0; |
2461 | } | 2493 | } |
2462 | 2494 | ||
2495 | /* | ||
2496 | * Atomically mark all (or else none) of the cgroup's CSS objects as | ||
2497 | * CSS_REMOVED. Return true on success, or false if the cgroup has | ||
2498 | * busy subsystems. Call with cgroup_mutex held | ||
2499 | */ | ||
2500 | |||
2501 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | ||
2502 | { | ||
2503 | struct cgroup_subsys *ss; | ||
2504 | unsigned long flags; | ||
2505 | bool failed = false; | ||
2506 | local_irq_save(flags); | ||
2507 | for_each_subsys(cgrp->root, ss) { | ||
2508 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
2509 | int refcnt; | ||
2510 | do { | ||
2511 | /* We can only remove a CSS with a refcnt==1 */ | ||
2512 | refcnt = atomic_read(&css->refcnt); | ||
2513 | if (refcnt > 1) { | ||
2514 | failed = true; | ||
2515 | goto done; | ||
2516 | } | ||
2517 | BUG_ON(!refcnt); | ||
2518 | /* | ||
2519 | * Drop the refcnt to 0 while we check other | ||
2520 | * subsystems. This will cause any racing | ||
2521 | * css_tryget() to spin until we set the | ||
2522 | * CSS_REMOVED bits or abort | ||
2523 | */ | ||
2524 | } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); | ||
2525 | } | ||
2526 | done: | ||
2527 | for_each_subsys(cgrp->root, ss) { | ||
2528 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
2529 | if (failed) { | ||
2530 | /* | ||
2531 | * Restore old refcnt if we previously managed | ||
2532 | * to clear it from 1 to 0 | ||
2533 | */ | ||
2534 | if (!atomic_read(&css->refcnt)) | ||
2535 | atomic_set(&css->refcnt, 1); | ||
2536 | } else { | ||
2537 | /* Commit the fact that the CSS is removed */ | ||
2538 | set_bit(CSS_REMOVED, &css->flags); | ||
2539 | } | ||
2540 | } | ||
2541 | local_irq_restore(flags); | ||
2542 | return !failed; | ||
2543 | } | ||
2544 | |||
2463 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 2545 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
2464 | { | 2546 | { |
2465 | struct cgroup *cgrp = dentry->d_fsdata; | 2547 | struct cgroup *cgrp = dentry->d_fsdata; |
2466 | struct dentry *d; | 2548 | struct dentry *d; |
2467 | struct cgroup *parent; | 2549 | struct cgroup *parent; |
2468 | struct super_block *sb; | ||
2469 | struct cgroupfs_root *root; | ||
2470 | 2550 | ||
2471 | /* the vfs holds both inode->i_mutex already */ | 2551 | /* the vfs holds both inode->i_mutex already */ |
2472 | 2552 | ||
@@ -2489,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
2489 | 2569 | ||
2490 | mutex_lock(&cgroup_mutex); | 2570 | mutex_lock(&cgroup_mutex); |
2491 | parent = cgrp->parent; | 2571 | parent = cgrp->parent; |
2492 | root = cgrp->root; | ||
2493 | sb = root->sb; | ||
2494 | 2572 | ||
2495 | if (atomic_read(&cgrp->count) | 2573 | if (atomic_read(&cgrp->count) |
2496 | || !list_empty(&cgrp->children) | 2574 | || !list_empty(&cgrp->children) |
2497 | || cgroup_has_css_refs(cgrp)) { | 2575 | || !cgroup_clear_css_refs(cgrp)) { |
2498 | mutex_unlock(&cgroup_mutex); | 2576 | mutex_unlock(&cgroup_mutex); |
2499 | return -EBUSY; | 2577 | return -EBUSY; |
2500 | } | 2578 | } |
@@ -2504,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
2504 | if (!list_empty(&cgrp->release_list)) | 2582 | if (!list_empty(&cgrp->release_list)) |
2505 | list_del(&cgrp->release_list); | 2583 | list_del(&cgrp->release_list); |
2506 | spin_unlock(&release_list_lock); | 2584 | spin_unlock(&release_list_lock); |
2507 | /* delete my sibling from parent->children */ | 2585 | |
2586 | cgroup_lock_hierarchy(cgrp->root); | ||
2587 | /* delete this cgroup from parent->children */ | ||
2508 | list_del(&cgrp->sibling); | 2588 | list_del(&cgrp->sibling); |
2589 | cgroup_unlock_hierarchy(cgrp->root); | ||
2590 | |||
2509 | spin_lock(&cgrp->dentry->d_lock); | 2591 | spin_lock(&cgrp->dentry->d_lock); |
2510 | d = dget(cgrp->dentry); | 2592 | d = dget(cgrp->dentry); |
2511 | spin_unlock(&d->d_lock); | 2593 | spin_unlock(&d->d_lock); |
@@ -2527,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
2527 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 2609 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
2528 | 2610 | ||
2529 | /* Create the top cgroup state for this subsystem */ | 2611 | /* Create the top cgroup state for this subsystem */ |
2612 | list_add(&ss->sibling, &rootnode.subsys_list); | ||
2530 | ss->root = &rootnode; | 2613 | ss->root = &rootnode; |
2531 | css = ss->create(ss, dummytop); | 2614 | css = ss->create(ss, dummytop); |
2532 | /* We don't handle early failures gracefully */ | 2615 | /* We don't handle early failures gracefully */ |
@@ -2540,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
2540 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; | 2623 | init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; |
2541 | 2624 | ||
2542 | need_forkexit_callback |= ss->fork || ss->exit; | 2625 | need_forkexit_callback |= ss->fork || ss->exit; |
2543 | need_mm_owner_callback |= !!ss->mm_owner_changed; | ||
2544 | 2626 | ||
2545 | /* At system boot, before all subsystems have been | 2627 | /* At system boot, before all subsystems have been |
2546 | * registered, no tasks have been forked, so we don't | 2628 | * registered, no tasks have been forked, so we don't |
2547 | * need to invoke fork callbacks here. */ | 2629 | * need to invoke fork callbacks here. */ |
2548 | BUG_ON(!list_empty(&init_task.tasks)); | 2630 | BUG_ON(!list_empty(&init_task.tasks)); |
2549 | 2631 | ||
2632 | mutex_init(&ss->hierarchy_mutex); | ||
2550 | ss->active = 1; | 2633 | ss->active = 1; |
2551 | } | 2634 | } |
2552 | 2635 | ||
@@ -2565,7 +2648,6 @@ int __init cgroup_init_early(void) | |||
2565 | INIT_HLIST_NODE(&init_css_set.hlist); | 2648 | INIT_HLIST_NODE(&init_css_set.hlist); |
2566 | css_set_count = 1; | 2649 | css_set_count = 1; |
2567 | init_cgroup_root(&rootnode); | 2650 | init_cgroup_root(&rootnode); |
2568 | list_add(&rootnode.root_list, &roots); | ||
2569 | root_count = 1; | 2651 | root_count = 1; |
2570 | init_task.cgroups = &init_css_set; | 2652 | init_task.cgroups = &init_css_set; |
2571 | 2653 | ||
@@ -2672,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v) | |||
2672 | 2754 | ||
2673 | mutex_lock(&cgroup_mutex); | 2755 | mutex_lock(&cgroup_mutex); |
2674 | 2756 | ||
2675 | for_each_root(root) { | 2757 | for_each_active_root(root) { |
2676 | struct cgroup_subsys *ss; | 2758 | struct cgroup_subsys *ss; |
2677 | struct cgroup *cgrp; | 2759 | struct cgroup *cgrp; |
2678 | int subsys_id; | 2760 | int subsys_id; |
2679 | int count = 0; | 2761 | int count = 0; |
2680 | 2762 | ||
2681 | /* Skip this hierarchy if it has no active subsystems */ | ||
2682 | if (!root->actual_subsys_bits) | ||
2683 | continue; | ||
2684 | seq_printf(m, "%lu:", root->subsys_bits); | 2763 | seq_printf(m, "%lu:", root->subsys_bits); |
2685 | for_each_subsys(root, ss) | 2764 | for_each_subsys(root, ss) |
2686 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 2765 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
@@ -2790,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child) | |||
2790 | } | 2869 | } |
2791 | } | 2870 | } |
2792 | 2871 | ||
2793 | #ifdef CONFIG_MM_OWNER | ||
2794 | /** | ||
2795 | * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes | ||
2796 | * @p: the new owner | ||
2797 | * | ||
2798 | * Called on every change to mm->owner. mm_init_owner() does not | ||
2799 | * invoke this routine, since it assigns the mm->owner the first time | ||
2800 | * and does not change it. | ||
2801 | * | ||
2802 | * The callbacks are invoked with mmap_sem held in read mode. | ||
2803 | */ | ||
2804 | void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) | ||
2805 | { | ||
2806 | struct cgroup *oldcgrp, *newcgrp = NULL; | ||
2807 | |||
2808 | if (need_mm_owner_callback) { | ||
2809 | int i; | ||
2810 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
2811 | struct cgroup_subsys *ss = subsys[i]; | ||
2812 | oldcgrp = task_cgroup(old, ss->subsys_id); | ||
2813 | if (new) | ||
2814 | newcgrp = task_cgroup(new, ss->subsys_id); | ||
2815 | if (oldcgrp == newcgrp) | ||
2816 | continue; | ||
2817 | if (ss->mm_owner_changed) | ||
2818 | ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); | ||
2819 | } | ||
2820 | } | ||
2821 | } | ||
2822 | #endif /* CONFIG_MM_OWNER */ | ||
2823 | |||
2824 | /** | 2872 | /** |
2825 | * cgroup_post_fork - called on a new task after adding it to the task list | 2873 | * cgroup_post_fork - called on a new task after adding it to the task list |
2826 | * @child: the task in question | 2874 | * @child: the task in question |
@@ -2834,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child) | |||
2834 | { | 2882 | { |
2835 | if (use_task_css_set_links) { | 2883 | if (use_task_css_set_links) { |
2836 | write_lock(&css_set_lock); | 2884 | write_lock(&css_set_lock); |
2885 | task_lock(child); | ||
2837 | if (list_empty(&child->cg_list)) | 2886 | if (list_empty(&child->cg_list)) |
2838 | list_add(&child->cg_list, &child->cgroups->tasks); | 2887 | list_add(&child->cg_list, &child->cgroups->tasks); |
2888 | task_unlock(child); | ||
2839 | write_unlock(&css_set_lock); | 2889 | write_unlock(&css_set_lock); |
2840 | } | 2890 | } |
2841 | } | 2891 | } |
@@ -2941,6 +2991,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, | |||
2941 | mutex_unlock(&cgroup_mutex); | 2991 | mutex_unlock(&cgroup_mutex); |
2942 | return 0; | 2992 | return 0; |
2943 | } | 2993 | } |
2994 | task_lock(tsk); | ||
2944 | cg = tsk->cgroups; | 2995 | cg = tsk->cgroups; |
2945 | parent = task_cgroup(tsk, subsys->subsys_id); | 2996 | parent = task_cgroup(tsk, subsys->subsys_id); |
2946 | 2997 | ||
@@ -2953,6 +3004,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, | |||
2953 | 3004 | ||
2954 | /* Keep the cgroup alive */ | 3005 | /* Keep the cgroup alive */ |
2955 | get_css_set(cg); | 3006 | get_css_set(cg); |
3007 | task_unlock(tsk); | ||
2956 | mutex_unlock(&cgroup_mutex); | 3008 | mutex_unlock(&cgroup_mutex); |
2957 | 3009 | ||
2958 | /* Now do the VFS work to create a cgroup */ | 3010 | /* Now do the VFS work to create a cgroup */ |
@@ -2971,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, | |||
2971 | } | 3023 | } |
2972 | 3024 | ||
2973 | /* Create the cgroup directory, which also creates the cgroup */ | 3025 | /* Create the cgroup directory, which also creates the cgroup */ |
2974 | ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); | 3026 | ret = vfs_mkdir(inode, dentry, 0755); |
2975 | child = __d_cgrp(dentry); | 3027 | child = __d_cgrp(dentry); |
2976 | dput(dentry); | 3028 | dput(dentry); |
2977 | if (ret) { | 3029 | if (ret) { |
@@ -2981,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, | |||
2981 | goto out_release; | 3033 | goto out_release; |
2982 | } | 3034 | } |
2983 | 3035 | ||
2984 | if (!child) { | ||
2985 | printk(KERN_INFO | ||
2986 | "Couldn't find new cgroup %s\n", nodename); | ||
2987 | ret = -ENOMEM; | ||
2988 | goto out_release; | ||
2989 | } | ||
2990 | |||
2991 | /* The cgroup now exists. Retake cgroup_mutex and check | 3036 | /* The cgroup now exists. Retake cgroup_mutex and check |
2992 | * that we're still in the same state that we thought we | 3037 | * that we're still in the same state that we thought we |
2993 | * were. */ | 3038 | * were. */ |
@@ -3083,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css) | |||
3083 | { | 3128 | { |
3084 | struct cgroup *cgrp = css->cgroup; | 3129 | struct cgroup *cgrp = css->cgroup; |
3085 | rcu_read_lock(); | 3130 | rcu_read_lock(); |
3086 | if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { | 3131 | if ((atomic_dec_return(&css->refcnt) == 1) && |
3132 | notify_on_release(cgrp)) { | ||
3087 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 3133 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
3088 | check_for_release(cgrp); | 3134 | check_for_release(cgrp); |
3089 | } | 3135 | } |
diff --git a/kernel/compat.c b/kernel/compat.c
index d52e2ec1deb5..42d56544460f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
25 | #include <linux/posix-timers.h> | 25 | #include <linux/posix-timers.h> |
26 | #include <linux/times.h> | 26 | #include <linux/times.h> |
27 | #include <linux/ptrace.h> | ||
27 | 28 | ||
28 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
29 | 30 | ||
@@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) | |||
229 | if (copy_to_user(tbuf, &tmp, sizeof(tmp))) | 230 | if (copy_to_user(tbuf, &tmp, sizeof(tmp))) |
230 | return -EFAULT; | 231 | return -EFAULT; |
231 | } | 232 | } |
233 | force_successful_syscall_return(); | ||
232 | return compat_jiffies_to_clock_t(jiffies); | 234 | return compat_jiffies_to_clock_t(jiffies); |
233 | } | 235 | } |
234 | 236 | ||
@@ -894,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) | |||
894 | 896 | ||
895 | if (tloc) { | 897 | if (tloc) { |
896 | if (put_user(i,tloc)) | 898 | if (put_user(i,tloc)) |
897 | i = -EFAULT; | 899 | return -EFAULT; |
898 | } | 900 | } |
901 | force_successful_syscall_return(); | ||
899 | return i; | 902 | return i; |
900 | } | 903 | } |
901 | 904 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 47fff3b63cbf..79e40f00dcb8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -269,8 +269,11 @@ out_release: | |||
269 | 269 | ||
270 | int __ref cpu_down(unsigned int cpu) | 270 | int __ref cpu_down(unsigned int cpu) |
271 | { | 271 | { |
272 | int err = 0; | 272 | int err; |
273 | 273 | ||
274 | err = stop_machine_create(); | ||
275 | if (err) | ||
276 | return err; | ||
274 | cpu_maps_update_begin(); | 277 | cpu_maps_update_begin(); |
275 | 278 | ||
276 | if (cpu_hotplug_disabled) { | 279 | if (cpu_hotplug_disabled) { |
@@ -297,6 +300,7 @@ int __ref cpu_down(unsigned int cpu) | |||
297 | 300 | ||
298 | out: | 301 | out: |
299 | cpu_maps_update_done(); | 302 | cpu_maps_update_done(); |
303 | stop_machine_destroy(); | ||
300 | return err; | 304 | return err; |
301 | } | 305 | } |
302 | EXPORT_SYMBOL(cpu_down); | 306 | EXPORT_SYMBOL(cpu_down); |
@@ -375,8 +379,11 @@ static cpumask_var_t frozen_cpus; | |||
375 | 379 | ||
376 | int disable_nonboot_cpus(void) | 380 | int disable_nonboot_cpus(void) |
377 | { | 381 | { |
378 | int cpu, first_cpu, error = 0; | 382 | int cpu, first_cpu, error; |
379 | 383 | ||
384 | error = stop_machine_create(); | ||
385 | if (error) | ||
386 | return error; | ||
380 | cpu_maps_update_begin(); | 387 | cpu_maps_update_begin(); |
381 | first_cpu = cpumask_first(cpu_online_mask); | 388 | first_cpu = cpumask_first(cpu_online_mask); |
382 | /* We take down all of the non-boot CPUs in one shot to avoid races | 389 | /* We take down all of the non-boot CPUs in one shot to avoid races |
@@ -405,6 +412,7 @@ int disable_nonboot_cpus(void) | |||
405 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 412 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); |
406 | } | 413 | } |
407 | cpu_maps_update_done(); | 414 | cpu_maps_update_done(); |
415 | stop_machine_destroy(); | ||
408 | return error; | 416 | return error; |
409 | } | 417 | } |
410 | 418 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 39c1a4c1c5a9..647c77a88fcb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -84,7 +84,7 @@ struct cpuset { | |||
84 | struct cgroup_subsys_state css; | 84 | struct cgroup_subsys_state css; |
85 | 85 | ||
86 | unsigned long flags; /* "unsigned long" so bitops work */ | 86 | unsigned long flags; /* "unsigned long" so bitops work */ |
87 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 87 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
88 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 88 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
89 | 89 | ||
90 | struct cpuset *parent; /* my parent */ | 90 | struct cpuset *parent; /* my parent */ |
@@ -195,8 +195,6 @@ static int cpuset_mems_generation; | |||
195 | 195 | ||
196 | static struct cpuset top_cpuset = { | 196 | static struct cpuset top_cpuset = { |
197 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | 197 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), |
198 | .cpus_allowed = CPU_MASK_ALL, | ||
199 | .mems_allowed = NODE_MASK_ALL, | ||
200 | }; | 198 | }; |
201 | 199 | ||
202 | /* | 200 | /* |
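
With cpus_allowed now a cpumask_var_t, top_cpuset can no longer be statically initialized with CPU_MASK_ALL; the mask has to be allocated and filled at boot. The corresponding init change is outside the lines shown in this section, but the setup it implies looks roughly like the following sketch (assumed to live in cpuset_init(); the allocation call is the standard cpumask API, not something this hunk adds).

/*
 * Sketch of the boot-time setup implied by the cpumask_var_t
 * conversion of top_cpuset.cpus_allowed.
 */
if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
	BUG();
cpumask_setall(top_cpuset.cpus_allowed);
top_cpuset.mems_allowed = NODE_MASK_ALL;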
@@ -240,6 +238,17 @@ static struct cpuset top_cpuset = { | |||
240 | static DEFINE_MUTEX(callback_mutex); | 238 | static DEFINE_MUTEX(callback_mutex); |
241 | 239 | ||
242 | /* | 240 | /* |
241 | * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist | ||
242 | * buffers. They are statically allocated to prevent using excess stack | ||
243 | * when calling cpuset_print_task_mems_allowed(). | ||
244 | */ | ||
245 | #define CPUSET_NAME_LEN (128) | ||
246 | #define CPUSET_NODELIST_LEN (256) | ||
247 | static char cpuset_name[CPUSET_NAME_LEN]; | ||
248 | static char cpuset_nodelist[CPUSET_NODELIST_LEN]; | ||
249 | static DEFINE_SPINLOCK(cpuset_buffer_lock); | ||
250 | |||
251 | /* | ||
243 | * This is ugly, but preserves the userspace API for existing cpuset | 252 | * This is ugly, but preserves the userspace API for existing cpuset |
244 | * users. If someone tries to mount the "cpuset" filesystem, we | 253 | * users. If someone tries to mount the "cpuset" filesystem, we |
245 | * silently switch it to mount "cgroup" instead | 254 | * silently switch it to mount "cgroup" instead |
@@ -267,7 +276,7 @@ static struct file_system_type cpuset_fs_type = { | |||
267 | }; | 276 | }; |
268 | 277 | ||
269 | /* | 278 | /* |
270 | * Return in *pmask the portion of a cpusets's cpus_allowed that | 279 | * Return in pmask the portion of a cpusets's cpus_allowed that |
271 | * are online. If none are online, walk up the cpuset hierarchy | 280 | * are online. If none are online, walk up the cpuset hierarchy |
272 | * until we find one that does have some online cpus. If we get | 281 | * until we find one that does have some online cpus. If we get |
273 | * all the way to the top and still haven't found any online cpus, | 282 | * all the way to the top and still haven't found any online cpus, |
@@ -280,15 +289,16 @@ static struct file_system_type cpuset_fs_type = { | |||
280 | * Call with callback_mutex held. | 289 | * Call with callback_mutex held. |
281 | */ | 290 | */ |
282 | 291 | ||
283 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 292 | static void guarantee_online_cpus(const struct cpuset *cs, |
293 | struct cpumask *pmask) | ||
284 | { | 294 | { |
285 | while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) | 295 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
286 | cs = cs->parent; | 296 | cs = cs->parent; |
287 | if (cs) | 297 | if (cs) |
288 | cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); | 298 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); |
289 | else | 299 | else |
290 | *pmask = cpu_online_map; | 300 | cpumask_copy(pmask, cpu_online_mask); |
291 | BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); | 301 | BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); |
292 | } | 302 | } |
293 | 303 | ||
294 | /* | 304 | /* |
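This hunk is representative of the mechanical conversion applied throughout cpuset.c: value-based cpus_*() operations on cpumask_t become pointer-based cpumask_*() operations on struct cpumask, and plain structure assignment becomes an explicit cpumask_copy(). An illustrative before/after pair, not taken from the patch:

    /* old style: cpumask_t handled by value */
    static void restrict_to_online_old(cpumask_t *pmask, cpumask_t allowed)
    {
            cpus_and(*pmask, allowed, cpu_online_map);
            if (cpus_empty(*pmask))
                    *pmask = cpu_online_map;                /* struct assignment */
    }

    /* new style: struct cpumask handled through pointers */
    static void restrict_to_online_new(struct cpumask *pmask,
                                       const struct cpumask *allowed)
    {
            cpumask_and(pmask, allowed, cpu_online_mask);
            if (cpumask_empty(pmask))
                    cpumask_copy(pmask, cpu_online_mask);   /* explicit copy */
    }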
@@ -364,14 +374,9 @@ void cpuset_update_task_memory_state(void) | |||
364 | struct task_struct *tsk = current; | 374 | struct task_struct *tsk = current; |
365 | struct cpuset *cs; | 375 | struct cpuset *cs; |
366 | 376 | ||
367 | if (task_cs(tsk) == &top_cpuset) { | 377 | rcu_read_lock(); |
368 | /* Don't need rcu for top_cpuset. It's never freed. */ | 378 | my_cpusets_mem_gen = task_cs(tsk)->mems_generation; |
369 | my_cpusets_mem_gen = top_cpuset.mems_generation; | 379 | rcu_read_unlock(); |
370 | } else { | ||
371 | rcu_read_lock(); | ||
372 | my_cpusets_mem_gen = task_cs(tsk)->mems_generation; | ||
373 | rcu_read_unlock(); | ||
374 | } | ||
375 | 380 | ||
376 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | 381 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { |
377 | mutex_lock(&callback_mutex); | 382 | mutex_lock(&callback_mutex); |
@@ -403,12 +408,43 @@ void cpuset_update_task_memory_state(void) | |||
403 | 408 | ||
404 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 409 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
405 | { | 410 | { |
406 | return cpus_subset(p->cpus_allowed, q->cpus_allowed) && | 411 | return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && |
407 | nodes_subset(p->mems_allowed, q->mems_allowed) && | 412 | nodes_subset(p->mems_allowed, q->mems_allowed) && |
408 | is_cpu_exclusive(p) <= is_cpu_exclusive(q) && | 413 | is_cpu_exclusive(p) <= is_cpu_exclusive(q) && |
409 | is_mem_exclusive(p) <= is_mem_exclusive(q); | 414 | is_mem_exclusive(p) <= is_mem_exclusive(q); |
410 | } | 415 | } |
411 | 416 | ||
417 | /** | ||
418 | * alloc_trial_cpuset - allocate a trial cpuset | ||
419 | * @cs: the cpuset that the trial cpuset duplicates | ||
420 | */ | ||
421 | static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) | ||
422 | { | ||
423 | struct cpuset *trial; | ||
424 | |||
425 | trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); | ||
426 | if (!trial) | ||
427 | return NULL; | ||
428 | |||
429 | if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { | ||
430 | kfree(trial); | ||
431 | return NULL; | ||
432 | } | ||
433 | cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); | ||
434 | |||
435 | return trial; | ||
436 | } | ||
437 | |||
438 | /** | ||
439 | * free_trial_cpuset - free the trial cpuset | ||
440 | * @trial: the trial cpuset to be freed | ||
441 | */ | ||
442 | static void free_trial_cpuset(struct cpuset *trial) | ||
443 | { | ||
444 | free_cpumask_var(trial->cpus_allowed); | ||
445 | kfree(trial); | ||
446 | } | ||
447 | |||
412 | /* | 448 | /* |
413 | * validate_change() - Used to validate that any proposed cpuset change | 449 | * validate_change() - Used to validate that any proposed cpuset change |
414 | * follows the structural rules for cpusets. | 450 | * follows the structural rules for cpusets. |
@@ -458,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
458 | c = cgroup_cs(cont); | 494 | c = cgroup_cs(cont); |
459 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 495 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
460 | c != cur && | 496 | c != cur && |
461 | cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) | 497 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
462 | return -EINVAL; | 498 | return -EINVAL; |
463 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && | 499 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && |
464 | c != cur && | 500 | c != cur && |
@@ -468,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
468 | 504 | ||
469 | /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ | 505 | /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ |
470 | if (cgroup_task_count(cur->css.cgroup)) { | 506 | if (cgroup_task_count(cur->css.cgroup)) { |
471 | if (cpus_empty(trial->cpus_allowed) || | 507 | if (cpumask_empty(trial->cpus_allowed) || |
472 | nodes_empty(trial->mems_allowed)) { | 508 | nodes_empty(trial->mems_allowed)) { |
473 | return -ENOSPC; | 509 | return -ENOSPC; |
474 | } | 510 | } |
@@ -483,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
483 | */ | 519 | */ |
484 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 520 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
485 | { | 521 | { |
486 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 522 | return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); |
487 | } | 523 | } |
488 | 524 | ||
489 | static void | 525 | static void |
@@ -508,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
508 | cp = list_first_entry(&q, struct cpuset, stack_list); | 544 | cp = list_first_entry(&q, struct cpuset, stack_list); |
509 | list_del(q.next); | 545 | list_del(q.next); |
510 | 546 | ||
511 | if (cpus_empty(cp->cpus_allowed)) | 547 | if (cpumask_empty(cp->cpus_allowed)) |
512 | continue; | 548 | continue; |
513 | 549 | ||
514 | if (is_sched_load_balance(cp)) | 550 | if (is_sched_load_balance(cp)) |
@@ -575,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
575 | * element of the partition (one sched domain) to be passed to | 611 | * element of the partition (one sched domain) to be passed to |
576 | * partition_sched_domains(). | 612 | * partition_sched_domains(). |
577 | */ | 613 | */ |
578 | static int generate_sched_domains(cpumask_t **domains, | 614 | /* FIXME: see the FIXME in partition_sched_domains() */ |
615 | static int generate_sched_domains(struct cpumask **domains, | ||
579 | struct sched_domain_attr **attributes) | 616 | struct sched_domain_attr **attributes) |
580 | { | 617 | { |
581 | LIST_HEAD(q); /* queue of cpusets to be scanned */ | 618 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
@@ -583,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains, | |||
583 | struct cpuset **csa; /* array of all cpuset ptrs */ | 620 | struct cpuset **csa; /* array of all cpuset ptrs */ |
584 | int csn; /* how many cpuset ptrs in csa so far */ | 621 | int csn; /* how many cpuset ptrs in csa so far */ |
585 | int i, j, k; /* indices for partition finding loops */ | 622 | int i, j, k; /* indices for partition finding loops */ |
586 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ | 623 | struct cpumask *doms; /* resulting partition; i.e. sched domains */ |
587 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 624 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
588 | int ndoms = 0; /* number of sched domains in result */ | 625 | int ndoms = 0; /* number of sched domains in result */ |
589 | int nslot; /* next empty doms[] cpumask_t slot */ | 626 | int nslot; /* next empty doms[] struct cpumask slot */ |
590 | 627 | ||
591 | doms = NULL; | 628 | doms = NULL; |
592 | dattr = NULL; | 629 | dattr = NULL; |
@@ -594,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains, | |||
594 | 631 | ||
595 | /* Special case for the 99% of systems with one, full, sched domain */ | 632 | /* Special case for the 99% of systems with one, full, sched domain */ |
596 | if (is_sched_load_balance(&top_cpuset)) { | 633 | if (is_sched_load_balance(&top_cpuset)) { |
597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 634 | doms = kmalloc(cpumask_size(), GFP_KERNEL); |
598 | if (!doms) | 635 | if (!doms) |
599 | goto done; | 636 | goto done; |
600 | 637 | ||
@@ -603,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains, | |||
603 | *dattr = SD_ATTR_INIT; | 640 | *dattr = SD_ATTR_INIT; |
604 | update_domain_attr_tree(dattr, &top_cpuset); | 641 | update_domain_attr_tree(dattr, &top_cpuset); |
605 | } | 642 | } |
606 | *doms = top_cpuset.cpus_allowed; | 643 | cpumask_copy(doms, top_cpuset.cpus_allowed); |
607 | 644 | ||
608 | ndoms = 1; | 645 | ndoms = 1; |
609 | goto done; | 646 | goto done; |
@@ -622,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains, | |||
622 | cp = list_first_entry(&q, struct cpuset, stack_list); | 659 | cp = list_first_entry(&q, struct cpuset, stack_list); |
623 | list_del(q.next); | 660 | list_del(q.next); |
624 | 661 | ||
625 | if (cpus_empty(cp->cpus_allowed)) | 662 | if (cpumask_empty(cp->cpus_allowed)) |
626 | continue; | 663 | continue; |
627 | 664 | ||
628 | /* | 665 | /* |
@@ -673,7 +710,7 @@ restart: | |||
673 | * Now we know how many domains to create. | 710 | * Now we know how many domains to create. |
674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | 711 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. |
675 | */ | 712 | */ |
676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 713 | doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); |
677 | if (!doms) | 714 | if (!doms) |
678 | goto done; | 715 | goto done; |
679 | 716 | ||
@@ -685,7 +722,7 @@ restart: | |||
685 | 722 | ||
686 | for (nslot = 0, i = 0; i < csn; i++) { | 723 | for (nslot = 0, i = 0; i < csn; i++) { |
687 | struct cpuset *a = csa[i]; | 724 | struct cpuset *a = csa[i]; |
688 | cpumask_t *dp; | 725 | struct cpumask *dp; |
689 | int apn = a->pn; | 726 | int apn = a->pn; |
690 | 727 | ||
691 | if (apn < 0) { | 728 | if (apn < 0) { |
@@ -708,14 +745,14 @@ restart: | |||
708 | continue; | 745 | continue; |
709 | } | 746 | } |
710 | 747 | ||
711 | cpus_clear(*dp); | 748 | cpumask_clear(dp); |
712 | if (dattr) | 749 | if (dattr) |
713 | *(dattr + nslot) = SD_ATTR_INIT; | 750 | *(dattr + nslot) = SD_ATTR_INIT; |
714 | for (j = i; j < csn; j++) { | 751 | for (j = i; j < csn; j++) { |
715 | struct cpuset *b = csa[j]; | 752 | struct cpuset *b = csa[j]; |
716 | 753 | ||
717 | if (apn == b->pn) { | 754 | if (apn == b->pn) { |
718 | cpus_or(*dp, *dp, b->cpus_allowed); | 755 | cpumask_or(dp, dp, b->cpus_allowed); |
719 | if (dattr) | 756 | if (dattr) |
720 | update_domain_attr_tree(dattr + nslot, b); | 757 | update_domain_attr_tree(dattr + nslot, b); |
721 | 758 | ||
@@ -755,7 +792,7 @@ done: | |||
755 | static void do_rebuild_sched_domains(struct work_struct *unused) | 792 | static void do_rebuild_sched_domains(struct work_struct *unused) |
756 | { | 793 | { |
757 | struct sched_domain_attr *attr; | 794 | struct sched_domain_attr *attr; |
758 | cpumask_t *doms; | 795 | struct cpumask *doms; |
759 | int ndoms; | 796 | int ndoms; |
760 | 797 | ||
761 | get_online_cpus(); | 798 | get_online_cpus(); |
@@ -824,7 +861,7 @@ void rebuild_sched_domains(void) | |||
824 | static int cpuset_test_cpumask(struct task_struct *tsk, | 861 | static int cpuset_test_cpumask(struct task_struct *tsk, |
825 | struct cgroup_scanner *scan) | 862 | struct cgroup_scanner *scan) |
826 | { | 863 | { |
827 | return !cpus_equal(tsk->cpus_allowed, | 864 | return !cpumask_equal(&tsk->cpus_allowed, |
828 | (cgroup_cs(scan->cg))->cpus_allowed); | 865 | (cgroup_cs(scan->cg))->cpus_allowed); |
829 | } | 866 | } |
830 | 867 | ||
@@ -842,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, | |||
842 | static void cpuset_change_cpumask(struct task_struct *tsk, | 879 | static void cpuset_change_cpumask(struct task_struct *tsk, |
843 | struct cgroup_scanner *scan) | 880 | struct cgroup_scanner *scan) |
844 | { | 881 | { |
845 | set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); | 882 | set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); |
846 | } | 883 | } |
847 | 884 | ||
848 | /** | 885 | /** |
@@ -874,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | |||
874 | * @cs: the cpuset to consider | 911 | * @cs: the cpuset to consider |
875 | * @buf: buffer of cpu numbers written to this cpuset | 912 | * @buf: buffer of cpu numbers written to this cpuset |
876 | */ | 913 | */ |
877 | static int update_cpumask(struct cpuset *cs, const char *buf) | 914 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, |
915 | const char *buf) | ||
878 | { | 916 | { |
879 | struct ptr_heap heap; | 917 | struct ptr_heap heap; |
880 | struct cpuset trialcs; | ||
881 | int retval; | 918 | int retval; |
882 | int is_load_balanced; | 919 | int is_load_balanced; |
883 | 920 | ||
@@ -885,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
885 | if (cs == &top_cpuset) | 922 | if (cs == &top_cpuset) |
886 | return -EACCES; | 923 | return -EACCES; |
887 | 924 | ||
888 | trialcs = *cs; | ||
889 | |||
890 | /* | 925 | /* |
891 | * An empty cpus_allowed is ok only if the cpuset has no tasks. | 926 | * An empty cpus_allowed is ok only if the cpuset has no tasks. |
892 | * Since cpulist_parse() fails on an empty mask, we special case | 927 | * Since cpulist_parse() fails on an empty mask, we special case |
@@ -894,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
894 | * with tasks have cpus. | 929 | * with tasks have cpus. |
895 | */ | 930 | */ |
896 | if (!*buf) { | 931 | if (!*buf) { |
897 | cpus_clear(trialcs.cpus_allowed); | 932 | cpumask_clear(trialcs->cpus_allowed); |
898 | } else { | 933 | } else { |
899 | retval = cpulist_parse(buf, &trialcs.cpus_allowed); | 934 | retval = cpulist_parse(buf, trialcs->cpus_allowed); |
900 | if (retval < 0) | 935 | if (retval < 0) |
901 | return retval; | 936 | return retval; |
902 | 937 | ||
903 | if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) | 938 | if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) |
904 | return -EINVAL; | 939 | return -EINVAL; |
905 | } | 940 | } |
906 | retval = validate_change(cs, &trialcs); | 941 | retval = validate_change(cs, trialcs); |
907 | if (retval < 0) | 942 | if (retval < 0) |
908 | return retval; | 943 | return retval; |
909 | 944 | ||
910 | /* Nothing to do if the cpus didn't change */ | 945 | /* Nothing to do if the cpus didn't change */ |
911 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | 946 | if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) |
912 | return 0; | 947 | return 0; |
913 | 948 | ||
914 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | 949 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); |
915 | if (retval) | 950 | if (retval) |
916 | return retval; | 951 | return retval; |
917 | 952 | ||
918 | is_load_balanced = is_sched_load_balance(&trialcs); | 953 | is_load_balanced = is_sched_load_balance(trialcs); |
919 | 954 | ||
920 | mutex_lock(&callback_mutex); | 955 | mutex_lock(&callback_mutex); |
921 | cs->cpus_allowed = trialcs.cpus_allowed; | 956 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
922 | mutex_unlock(&callback_mutex); | 957 | mutex_unlock(&callback_mutex); |
923 | 958 | ||
924 | /* | 959 | /* |
@@ -1006,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) | |||
1006 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1041 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
1007 | 1042 | ||
1008 | fudge = 10; /* spare mmarray[] slots */ | 1043 | fudge = 10; /* spare mmarray[] slots */ |
1009 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ | 1044 | fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ |
1010 | retval = -ENOMEM; | 1045 | retval = -ENOMEM; |
1011 | 1046 | ||
1012 | /* | 1047 | /* |
@@ -1093,9 +1128,9 @@ done: | |||
1093 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 1128 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
1094 | * their mempolicies to the cpusets new mems_allowed. | 1129 | * their mempolicies to the cpusets new mems_allowed. |
1095 | */ | 1130 | */ |
1096 | static int update_nodemask(struct cpuset *cs, const char *buf) | 1131 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
1132 | const char *buf) | ||
1097 | { | 1133 | { |
1098 | struct cpuset trialcs; | ||
1099 | nodemask_t oldmem; | 1134 | nodemask_t oldmem; |
1100 | int retval; | 1135 | int retval; |
1101 | 1136 | ||
@@ -1106,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf) | |||
1106 | if (cs == &top_cpuset) | 1141 | if (cs == &top_cpuset) |
1107 | return -EACCES; | 1142 | return -EACCES; |
1108 | 1143 | ||
1109 | trialcs = *cs; | ||
1110 | |||
1111 | /* | 1144 | /* |
1112 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | 1145 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. |
1113 | * Since nodelist_parse() fails on an empty mask, we special case | 1146 | * Since nodelist_parse() fails on an empty mask, we special case |
@@ -1115,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf) | |||
1115 | * with tasks have memory. | 1148 | * with tasks have memory. |
1116 | */ | 1149 | */ |
1117 | if (!*buf) { | 1150 | if (!*buf) { |
1118 | nodes_clear(trialcs.mems_allowed); | 1151 | nodes_clear(trialcs->mems_allowed); |
1119 | } else { | 1152 | } else { |
1120 | retval = nodelist_parse(buf, trialcs.mems_allowed); | 1153 | retval = nodelist_parse(buf, trialcs->mems_allowed); |
1121 | if (retval < 0) | 1154 | if (retval < 0) |
1122 | goto done; | 1155 | goto done; |
1123 | 1156 | ||
1124 | if (!nodes_subset(trialcs.mems_allowed, | 1157 | if (!nodes_subset(trialcs->mems_allowed, |
1125 | node_states[N_HIGH_MEMORY])) | 1158 | node_states[N_HIGH_MEMORY])) |
1126 | return -EINVAL; | 1159 | return -EINVAL; |
1127 | } | 1160 | } |
1128 | oldmem = cs->mems_allowed; | 1161 | oldmem = cs->mems_allowed; |
1129 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { | 1162 | if (nodes_equal(oldmem, trialcs->mems_allowed)) { |
1130 | retval = 0; /* Too easy - nothing to do */ | 1163 | retval = 0; /* Too easy - nothing to do */ |
1131 | goto done; | 1164 | goto done; |
1132 | } | 1165 | } |
1133 | retval = validate_change(cs, &trialcs); | 1166 | retval = validate_change(cs, trialcs); |
1134 | if (retval < 0) | 1167 | if (retval < 0) |
1135 | goto done; | 1168 | goto done; |
1136 | 1169 | ||
1137 | mutex_lock(&callback_mutex); | 1170 | mutex_lock(&callback_mutex); |
1138 | cs->mems_allowed = trialcs.mems_allowed; | 1171 | cs->mems_allowed = trialcs->mems_allowed; |
1139 | cs->mems_generation = cpuset_mems_generation++; | 1172 | cs->mems_generation = cpuset_mems_generation++; |
1140 | mutex_unlock(&callback_mutex); | 1173 | mutex_unlock(&callback_mutex); |
1141 | 1174 | ||
@@ -1156,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1156 | 1189 | ||
1157 | if (val != cs->relax_domain_level) { | 1190 | if (val != cs->relax_domain_level) { |
1158 | cs->relax_domain_level = val; | 1191 | cs->relax_domain_level = val; |
1159 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) | 1192 | if (!cpumask_empty(cs->cpus_allowed) && |
1193 | is_sched_load_balance(cs)) | ||
1160 | async_rebuild_sched_domains(); | 1194 | async_rebuild_sched_domains(); |
1161 | } | 1195 | } |
1162 | 1196 | ||
@@ -1175,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1175 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | 1209 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
1176 | int turning_on) | 1210 | int turning_on) |
1177 | { | 1211 | { |
1178 | struct cpuset trialcs; | 1212 | struct cpuset *trialcs; |
1179 | int err; | 1213 | int err; |
1180 | int balance_flag_changed; | 1214 | int balance_flag_changed; |
1181 | 1215 | ||
1182 | trialcs = *cs; | 1216 | trialcs = alloc_trial_cpuset(cs); |
1217 | if (!trialcs) | ||
1218 | return -ENOMEM; | ||
1219 | |||
1183 | if (turning_on) | 1220 | if (turning_on) |
1184 | set_bit(bit, &trialcs.flags); | 1221 | set_bit(bit, &trialcs->flags); |
1185 | else | 1222 | else |
1186 | clear_bit(bit, &trialcs.flags); | 1223 | clear_bit(bit, &trialcs->flags); |
1187 | 1224 | ||
1188 | err = validate_change(cs, &trialcs); | 1225 | err = validate_change(cs, trialcs); |
1189 | if (err < 0) | 1226 | if (err < 0) |
1190 | return err; | 1227 | goto out; |
1191 | 1228 | ||
1192 | balance_flag_changed = (is_sched_load_balance(cs) != | 1229 | balance_flag_changed = (is_sched_load_balance(cs) != |
1193 | is_sched_load_balance(&trialcs)); | 1230 | is_sched_load_balance(trialcs)); |
1194 | 1231 | ||
1195 | mutex_lock(&callback_mutex); | 1232 | mutex_lock(&callback_mutex); |
1196 | cs->flags = trialcs.flags; | 1233 | cs->flags = trialcs->flags; |
1197 | mutex_unlock(&callback_mutex); | 1234 | mutex_unlock(&callback_mutex); |
1198 | 1235 | ||
1199 | if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) | 1236 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
1200 | async_rebuild_sched_domains(); | 1237 | async_rebuild_sched_domains(); |
1201 | 1238 | ||
1202 | return 0; | 1239 | out: |
1240 | free_trial_cpuset(trialcs); | ||
1241 | return err; | ||
1203 | } | 1242 | } |
1204 | 1243 | ||
1205 | /* | 1244 | /* |
@@ -1300,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1300 | return val; | 1339 | return val; |
1301 | } | 1340 | } |
1302 | 1341 | ||
1342 | /* Protected by cgroup_lock */ | ||
1343 | static cpumask_var_t cpus_attach; | ||
1344 | |||
1303 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | 1345 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ |
1304 | static int cpuset_can_attach(struct cgroup_subsys *ss, | 1346 | static int cpuset_can_attach(struct cgroup_subsys *ss, |
1305 | struct cgroup *cont, struct task_struct *tsk) | 1347 | struct cgroup *cont, struct task_struct *tsk) |
1306 | { | 1348 | { |
1307 | struct cpuset *cs = cgroup_cs(cont); | 1349 | struct cpuset *cs = cgroup_cs(cont); |
1350 | int ret = 0; | ||
1308 | 1351 | ||
1309 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1352 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
1310 | return -ENOSPC; | 1353 | return -ENOSPC; |
1311 | if (tsk->flags & PF_THREAD_BOUND) { | ||
1312 | cpumask_t mask; | ||
1313 | 1354 | ||
1355 | if (tsk->flags & PF_THREAD_BOUND) { | ||
1314 | mutex_lock(&callback_mutex); | 1356 | mutex_lock(&callback_mutex); |
1315 | mask = cs->cpus_allowed; | 1357 | if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) |
1358 | ret = -EINVAL; | ||
1316 | mutex_unlock(&callback_mutex); | 1359 | mutex_unlock(&callback_mutex); |
1317 | if (!cpus_equal(tsk->cpus_allowed, mask)) | ||
1318 | return -EINVAL; | ||
1319 | } | 1360 | } |
1320 | 1361 | ||
1321 | return security_task_setscheduler(tsk, 0, NULL); | 1362 | return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); |
1322 | } | 1363 | } |
1323 | 1364 | ||
1324 | static void cpuset_attach(struct cgroup_subsys *ss, | 1365 | static void cpuset_attach(struct cgroup_subsys *ss, |
1325 | struct cgroup *cont, struct cgroup *oldcont, | 1366 | struct cgroup *cont, struct cgroup *oldcont, |
1326 | struct task_struct *tsk) | 1367 | struct task_struct *tsk) |
1327 | { | 1368 | { |
1328 | cpumask_t cpus; | ||
1329 | nodemask_t from, to; | 1369 | nodemask_t from, to; |
1330 | struct mm_struct *mm; | 1370 | struct mm_struct *mm; |
1331 | struct cpuset *cs = cgroup_cs(cont); | 1371 | struct cpuset *cs = cgroup_cs(cont); |
1332 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1372 | struct cpuset *oldcs = cgroup_cs(oldcont); |
1333 | int err; | 1373 | int err; |
1334 | 1374 | ||
1335 | mutex_lock(&callback_mutex); | 1375 | if (cs == &top_cpuset) { |
1336 | guarantee_online_cpus(cs, &cpus); | 1376 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1337 | err = set_cpus_allowed_ptr(tsk, &cpus); | 1377 | } else { |
1338 | mutex_unlock(&callback_mutex); | 1378 | mutex_lock(&callback_mutex); |
1379 | guarantee_online_cpus(cs, cpus_attach); | ||
1380 | mutex_unlock(&callback_mutex); | ||
1381 | } | ||
1382 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | ||
1339 | if (err) | 1383 | if (err) |
1340 | return; | 1384 | return; |
1341 | 1385 | ||
@@ -1348,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
1348 | cpuset_migrate_mm(mm, &from, &to); | 1392 | cpuset_migrate_mm(mm, &from, &to); |
1349 | mmput(mm); | 1393 | mmput(mm); |
1350 | } | 1394 | } |
1351 | |||
1352 | } | 1395 | } |
1353 | 1396 | ||
1354 | /* The various types of files and directories in a cpuset file system */ | 1397 | /* The various types of files and directories in a cpuset file system */ |
@@ -1443,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1443 | const char *buf) | 1486 | const char *buf) |
1444 | { | 1487 | { |
1445 | int retval = 0; | 1488 | int retval = 0; |
1489 | struct cpuset *cs = cgroup_cs(cgrp); | ||
1490 | struct cpuset *trialcs; | ||
1446 | 1491 | ||
1447 | if (!cgroup_lock_live_group(cgrp)) | 1492 | if (!cgroup_lock_live_group(cgrp)) |
1448 | return -ENODEV; | 1493 | return -ENODEV; |
1449 | 1494 | ||
1495 | trialcs = alloc_trial_cpuset(cs); | ||
1496 | if (!trialcs) | ||
1497 | return -ENOMEM; | ||
1498 | |||
1450 | switch (cft->private) { | 1499 | switch (cft->private) { |
1451 | case FILE_CPULIST: | 1500 | case FILE_CPULIST: |
1452 | retval = update_cpumask(cgroup_cs(cgrp), buf); | 1501 | retval = update_cpumask(cs, trialcs, buf); |
1453 | break; | 1502 | break; |
1454 | case FILE_MEMLIST: | 1503 | case FILE_MEMLIST: |
1455 | retval = update_nodemask(cgroup_cs(cgrp), buf); | 1504 | retval = update_nodemask(cs, trialcs, buf); |
1456 | break; | 1505 | break; |
1457 | default: | 1506 | default: |
1458 | retval = -EINVAL; | 1507 | retval = -EINVAL; |
1459 | break; | 1508 | break; |
1460 | } | 1509 | } |
1510 | |||
1511 | free_trial_cpuset(trialcs); | ||
1461 | cgroup_unlock(); | 1512 | cgroup_unlock(); |
1462 | return retval; | 1513 | return retval; |
1463 | } | 1514 | } |
@@ -1476,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1476 | 1527 | ||
1477 | static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | 1528 | static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) |
1478 | { | 1529 | { |
1479 | cpumask_t mask; | 1530 | int ret; |
1480 | 1531 | ||
1481 | mutex_lock(&callback_mutex); | 1532 | mutex_lock(&callback_mutex); |
1482 | mask = cs->cpus_allowed; | 1533 | ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); |
1483 | mutex_unlock(&callback_mutex); | 1534 | mutex_unlock(&callback_mutex); |
1484 | 1535 | ||
1485 | return cpulist_scnprintf(page, PAGE_SIZE, &mask); | 1536 | return ret; |
1486 | } | 1537 | } |
1487 | 1538 | ||
1488 | static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | 1539 | static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) |
@@ -1718,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, | |||
1718 | parent_cs = cgroup_cs(parent); | 1769 | parent_cs = cgroup_cs(parent); |
1719 | 1770 | ||
1720 | cs->mems_allowed = parent_cs->mems_allowed; | 1771 | cs->mems_allowed = parent_cs->mems_allowed; |
1721 | cs->cpus_allowed = parent_cs->cpus_allowed; | 1772 | cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); |
1722 | return; | 1773 | return; |
1723 | } | 1774 | } |
1724 | 1775 | ||
@@ -1744,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1744 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1795 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); |
1745 | if (!cs) | 1796 | if (!cs) |
1746 | return ERR_PTR(-ENOMEM); | 1797 | return ERR_PTR(-ENOMEM); |
1798 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { | ||
1799 | kfree(cs); | ||
1800 | return ERR_PTR(-ENOMEM); | ||
1801 | } | ||
1747 | 1802 | ||
1748 | cpuset_update_task_memory_state(); | 1803 | cpuset_update_task_memory_state(); |
1749 | cs->flags = 0; | 1804 | cs->flags = 0; |
@@ -1752,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1752 | if (is_spread_slab(parent)) | 1807 | if (is_spread_slab(parent)) |
1753 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1808 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
1754 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1809 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1755 | cpus_clear(cs->cpus_allowed); | 1810 | cpumask_clear(cs->cpus_allowed); |
1756 | nodes_clear(cs->mems_allowed); | 1811 | nodes_clear(cs->mems_allowed); |
1757 | cs->mems_generation = cpuset_mems_generation++; | 1812 | cs->mems_generation = cpuset_mems_generation++; |
1758 | fmeter_init(&cs->fmeter); | 1813 | fmeter_init(&cs->fmeter); |
@@ -1779,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1779 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | 1834 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
1780 | 1835 | ||
1781 | number_of_cpusets--; | 1836 | number_of_cpusets--; |
1837 | free_cpumask_var(cs->cpus_allowed); | ||
1782 | kfree(cs); | 1838 | kfree(cs); |
1783 | } | 1839 | } |
1784 | 1840 | ||
@@ -1802,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = { | |||
1802 | 1858 | ||
1803 | int __init cpuset_init_early(void) | 1859 | int __init cpuset_init_early(void) |
1804 | { | 1860 | { |
1861 | alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); | ||
1862 | |||
1805 | top_cpuset.mems_generation = cpuset_mems_generation++; | 1863 | top_cpuset.mems_generation = cpuset_mems_generation++; |
1806 | return 0; | 1864 | return 0; |
1807 | } | 1865 | } |
@@ -1817,7 +1875,7 @@ int __init cpuset_init(void) | |||
1817 | { | 1875 | { |
1818 | int err = 0; | 1876 | int err = 0; |
1819 | 1877 | ||
1820 | cpus_setall(top_cpuset.cpus_allowed); | 1878 | cpumask_setall(top_cpuset.cpus_allowed); |
1821 | nodes_setall(top_cpuset.mems_allowed); | 1879 | nodes_setall(top_cpuset.mems_allowed); |
1822 | 1880 | ||
1823 | fmeter_init(&top_cpuset.fmeter); | 1881 | fmeter_init(&top_cpuset.fmeter); |
@@ -1829,6 +1887,9 @@ int __init cpuset_init(void) | |||
1829 | if (err < 0) | 1887 | if (err < 0) |
1830 | return err; | 1888 | return err; |
1831 | 1889 | ||
1890 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) | ||
1891 | BUG(); | ||
1892 | |||
1832 | number_of_cpusets = 1; | 1893 | number_of_cpusets = 1; |
1833 | return 0; | 1894 | return 0; |
1834 | } | 1895 | } |
@@ -1903,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
1903 | * has online cpus, so can't be empty). | 1964 | * has online cpus, so can't be empty). |
1904 | */ | 1965 | */ |
1905 | parent = cs->parent; | 1966 | parent = cs->parent; |
1906 | while (cpus_empty(parent->cpus_allowed) || | 1967 | while (cpumask_empty(parent->cpus_allowed) || |
1907 | nodes_empty(parent->mems_allowed)) | 1968 | nodes_empty(parent->mems_allowed)) |
1908 | parent = parent->parent; | 1969 | parent = parent->parent; |
1909 | 1970 | ||
@@ -1944,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
1944 | } | 2005 | } |
1945 | 2006 | ||
1946 | /* Continue past cpusets with all cpus, mems online */ | 2007 | /* Continue past cpusets with all cpus, mems online */ |
1947 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && | 2008 | if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && |
1948 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | 2009 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) |
1949 | continue; | 2010 | continue; |
1950 | 2011 | ||
@@ -1952,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
1952 | 2013 | ||
1953 | /* Remove offline cpus and mems from this cpuset. */ | 2014 | /* Remove offline cpus and mems from this cpuset. */ |
1954 | mutex_lock(&callback_mutex); | 2015 | mutex_lock(&callback_mutex); |
1955 | cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); | 2016 | cpumask_and(cp->cpus_allowed, cp->cpus_allowed, |
2017 | cpu_online_mask); | ||
1956 | nodes_and(cp->mems_allowed, cp->mems_allowed, | 2018 | nodes_and(cp->mems_allowed, cp->mems_allowed, |
1957 | node_states[N_HIGH_MEMORY]); | 2019 | node_states[N_HIGH_MEMORY]); |
1958 | mutex_unlock(&callback_mutex); | 2020 | mutex_unlock(&callback_mutex); |
1959 | 2021 | ||
1960 | /* Move tasks from the empty cpuset to a parent */ | 2022 | /* Move tasks from the empty cpuset to a parent */ |
1961 | if (cpus_empty(cp->cpus_allowed) || | 2023 | if (cpumask_empty(cp->cpus_allowed) || |
1962 | nodes_empty(cp->mems_allowed)) | 2024 | nodes_empty(cp->mems_allowed)) |
1963 | remove_tasks_in_empty_cpuset(cp); | 2025 | remove_tasks_in_empty_cpuset(cp); |
1964 | else { | 2026 | else { |
@@ -1984,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
1984 | unsigned long phase, void *unused_cpu) | 2046 | unsigned long phase, void *unused_cpu) |
1985 | { | 2047 | { |
1986 | struct sched_domain_attr *attr; | 2048 | struct sched_domain_attr *attr; |
1987 | cpumask_t *doms; | 2049 | struct cpumask *doms; |
1988 | int ndoms; | 2050 | int ndoms; |
1989 | 2051 | ||
1990 | switch (phase) { | 2052 | switch (phase) { |
@@ -1999,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
1999 | } | 2061 | } |
2000 | 2062 | ||
2001 | cgroup_lock(); | 2063 | cgroup_lock(); |
2002 | top_cpuset.cpus_allowed = cpu_online_map; | 2064 | cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); |
2003 | scan_for_empty_cpusets(&top_cpuset); | 2065 | scan_for_empty_cpusets(&top_cpuset); |
2004 | ndoms = generate_sched_domains(&doms, &attr); | 2066 | ndoms = generate_sched_domains(&doms, &attr); |
2005 | cgroup_unlock(); | 2067 | cgroup_unlock(); |
@@ -2044,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, | |||
2044 | 2106 | ||
2045 | void __init cpuset_init_smp(void) | 2107 | void __init cpuset_init_smp(void) |
2046 | { | 2108 | { |
2047 | top_cpuset.cpus_allowed = cpu_online_map; | 2109 | cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); |
2048 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2110 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
2049 | 2111 | ||
2050 | hotcpu_notifier(cpuset_track_online_cpus, 0); | 2112 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
@@ -2054,15 +2116,15 @@ void __init cpuset_init_smp(void) | |||
2054 | /** | 2116 | /** |
2055 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 2117 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
2056 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 2118 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
2057 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. | 2119 | * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. |
2058 | * | 2120 | * |
2059 | * Description: Returns the cpumask_t cpus_allowed of the cpuset | 2121 | * Description: Returns the cpumask_var_t cpus_allowed of the cpuset |
2060 | * attached to the specified @tsk. Guaranteed to return some non-empty | 2122 | * attached to the specified @tsk. Guaranteed to return some non-empty |
2061 | * subset of cpu_online_map, even if this means going outside the | 2123 | * subset of cpu_online_map, even if this means going outside the |
2062 | * tasks cpuset. | 2124 | * tasks cpuset. |
2063 | **/ | 2125 | **/ |
2064 | 2126 | ||
2065 | void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) | 2127 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
2066 | { | 2128 | { |
2067 | mutex_lock(&callback_mutex); | 2129 | mutex_lock(&callback_mutex); |
2068 | cpuset_cpus_allowed_locked(tsk, pmask); | 2130 | cpuset_cpus_allowed_locked(tsk, pmask); |
@@ -2073,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) | |||
2073 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. | 2135 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. |
2074 | * Must be called with callback_mutex held. | 2136 | * Must be called with callback_mutex held. |
2075 | **/ | 2137 | **/ |
2076 | void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) | 2138 | void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) |
2077 | { | 2139 | { |
2078 | task_lock(tsk); | 2140 | task_lock(tsk); |
2079 | guarantee_online_cpus(task_cs(tsk), pmask); | 2141 | guarantee_online_cpus(task_cs(tsk), pmask); |
@@ -2356,6 +2418,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
2356 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); | 2418 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); |
2357 | } | 2419 | } |
2358 | 2420 | ||
2421 | /** | ||
2422 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed | ||
2423 | * @task: pointer to task_struct of some task. | ||
2424 | * | ||
2425 | * Description: Prints @task's name, cpuset name, and cached copy of its | ||
2426 | * mems_allowed to the kernel log. Must hold task_lock(task) to allow | ||
2427 | * dereferencing task_cs(task). | ||
2428 | */ | ||
2429 | void cpuset_print_task_mems_allowed(struct task_struct *tsk) | ||
2430 | { | ||
2431 | struct dentry *dentry; | ||
2432 | |||
2433 | dentry = task_cs(tsk)->css.cgroup->dentry; | ||
2434 | spin_lock(&cpuset_buffer_lock); | ||
2435 | snprintf(cpuset_name, CPUSET_NAME_LEN, | ||
2436 | dentry ? (const char *)dentry->d_name.name : "/"); | ||
2437 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, | ||
2438 | tsk->mems_allowed); | ||
2439 | printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", | ||
2440 | tsk->comm, cpuset_name, cpuset_nodelist); | ||
2441 | spin_unlock(&cpuset_buffer_lock); | ||
2442 | } | ||
2443 | |||
2359 | /* | 2444 | /* |
2360 | * Collection of memory_pressure is suppressed unless | 2445 | * Collection of memory_pressure is suppressed unless |
2361 | * this flag is enabled by writing "1" to the special | 2446 | * this flag is enabled by writing "1" to the special |
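Taken together, the cpuset.c hunks turn cpus_allowed from a cpumask_t embedded in struct cpuset into a cpumask_var_t, which is a real pointer only when CONFIG_CPUMASK_OFFSTACK=y (otherwise it degenerates to a one-element array and the allocation helpers are effectively no-ops). A minimal sketch of the lifecycle the file now follows, with hypothetical names:

    #include <linux/cpumask.h>
    #include <linux/slab.h>

    struct example_set {
            cpumask_var_t cpus;     /* pointer under CONFIG_CPUMASK_OFFSTACK */
    };

    static struct example_set *example_create(void)
    {
            struct example_set *s = kzalloc(sizeof(*s), GFP_KERNEL);

            if (!s)
                    return NULL;
            if (!alloc_cpumask_var(&s->cpus, GFP_KERNEL)) { /* may kmalloc */
                    kfree(s);
                    return NULL;
            }
            cpumask_clear(s->cpus);         /* always used as a pointer */
            return s;
    }

    static void example_destroy(struct example_set *s)
    {
            free_cpumask_var(s->cpus);      /* no-op unless OFFSTACK */
            kfree(s);
    }

The statically defined top_cpuset cannot take this path, which is why cpuset_init_early() switches to alloc_bootmem_cpumask_var(): the slab allocator is not available that early in boot.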
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index f013a0c2e111..038707404b76 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c | |||
@@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); | |||
109 | int dma_alloc_from_coherent(struct device *dev, ssize_t size, | 109 | int dma_alloc_from_coherent(struct device *dev, ssize_t size, |
110 | dma_addr_t *dma_handle, void **ret) | 110 | dma_addr_t *dma_handle, void **ret) |
111 | { | 111 | { |
112 | struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | 112 | struct dma_coherent_mem *mem; |
113 | int order = get_order(size); | 113 | int order = get_order(size); |
114 | int pageno; | ||
114 | 115 | ||
115 | if (mem) { | 116 | if (!dev) |
116 | int page = bitmap_find_free_region(mem->bitmap, mem->size, | 117 | return 0; |
117 | order); | 118 | mem = dev->dma_mem; |
118 | if (page >= 0) { | 119 | if (!mem) |
119 | *dma_handle = mem->device_base + (page << PAGE_SHIFT); | 120 | return 0; |
120 | *ret = mem->virt_base + (page << PAGE_SHIFT); | 121 | if (unlikely(size > mem->size)) |
121 | memset(*ret, 0, size); | 122 | return 0; |
122 | } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) | 123 | |
123 | *ret = NULL; | 124 | pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); |
125 | if (pageno >= 0) { | ||
126 | /* | ||
127 | * Memory was found in the per-device arena. | ||
128 | */ | ||
129 | *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); | ||
130 | *ret = mem->virt_base + (pageno << PAGE_SHIFT); | ||
131 | memset(*ret, 0, size); | ||
132 | } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { | ||
133 | /* | ||
134 | * The per-device arena is exhausted and we are not | ||
135 | * permitted to fall back to generic memory. | ||
136 | */ | ||
137 | *ret = NULL; | ||
138 | } else { | ||
139 | /* | ||
140 | * The per-device arena is exhausted and we are | ||
141 | * permitted to fall back to generic memory. | ||
142 | */ | ||
143 | return 0; | ||
124 | } | 144 | } |
125 | return (mem != NULL); | 145 | return 1; |
126 | } | 146 | } |
127 | EXPORT_SYMBOL(dma_alloc_from_coherent); | 147 | EXPORT_SYMBOL(dma_alloc_from_coherent); |
128 | 148 | ||
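The rewrite keeps the helper's contract: the return value says whether the per-device coherent area handled the request at all, while *ret distinguishes a successful allocation from an exhausted DMA_MEMORY_EXCLUSIVE area. A hedged sketch of how an architecture's dma_alloc_coherent() typically consumes that convention (the surrounding function is illustrative, not taken from any particular arch):

    void *example_dma_alloc_coherent(struct device *dev, size_t size,
                                     dma_addr_t *handle, gfp_t gfp)
    {
            void *ret;

            /* 1 means the per-device area was consulted; *ret may still be
             * NULL if the area is exclusive and currently exhausted. */
            if (dma_alloc_from_coherent(dev, size, handle, &ret))
                    return ret;

            /* 0 means no per-device area (or fallback is permitted), so use
             * the generic allocator; a real implementation would also fill
             * in *handle here. */
            return (void *)__get_free_pages(gfp, get_order(size));
    }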
diff --git a/kernel/exit.c b/kernel/exit.c index c9e5a1c14e08..c7740fa3252c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -642,35 +642,31 @@ retry: | |||
642 | /* | 642 | /* |
643 | * We found no owner yet mm_users > 1: this implies that we are | 643 | * We found no owner yet mm_users > 1: this implies that we are |
644 | * most likely racing with swapoff (try_to_unuse()) or /proc or | 644 | * most likely racing with swapoff (try_to_unuse()) or /proc or |
645 | * ptrace or page migration (get_task_mm()). Mark owner as NULL, | 645 | * ptrace or page migration (get_task_mm()). Mark owner as NULL. |
646 | * so that subsystems can understand the callback and take action. | ||
647 | */ | 646 | */ |
648 | down_write(&mm->mmap_sem); | ||
649 | cgroup_mm_owner_callbacks(mm->owner, NULL); | ||
650 | mm->owner = NULL; | 647 | mm->owner = NULL; |
651 | up_write(&mm->mmap_sem); | ||
652 | return; | 648 | return; |
653 | 649 | ||
654 | assign_new_owner: | 650 | assign_new_owner: |
655 | BUG_ON(c == p); | 651 | BUG_ON(c == p); |
656 | get_task_struct(c); | 652 | get_task_struct(c); |
657 | read_unlock(&tasklist_lock); | ||
658 | down_write(&mm->mmap_sem); | ||
659 | /* | 653 | /* |
660 | * The task_lock protects c->mm from changing. | 654 | * The task_lock protects c->mm from changing. |
661 | * We always want mm->owner->mm == mm | 655 | * We always want mm->owner->mm == mm |
662 | */ | 656 | */ |
663 | task_lock(c); | 657 | task_lock(c); |
658 | /* | ||
659 | * Delay read_unlock() till we have the task_lock() | ||
660 | * to ensure that c does not slip away underneath us | ||
661 | */ | ||
662 | read_unlock(&tasklist_lock); | ||
664 | if (c->mm != mm) { | 663 | if (c->mm != mm) { |
665 | task_unlock(c); | 664 | task_unlock(c); |
666 | up_write(&mm->mmap_sem); | ||
667 | put_task_struct(c); | 665 | put_task_struct(c); |
668 | goto retry; | 666 | goto retry; |
669 | } | 667 | } |
670 | cgroup_mm_owner_callbacks(mm->owner, c); | ||
671 | mm->owner = c; | 668 | mm->owner = c; |
672 | task_unlock(c); | 669 | task_unlock(c); |
673 | up_write(&mm->mmap_sem); | ||
674 | put_task_struct(c); | 670 | put_task_struct(c); |
675 | } | 671 | } |
676 | #endif /* CONFIG_MM_OWNER */ | 672 | #endif /* CONFIG_MM_OWNER */ |
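The interesting part of the mm_update_next_owner() change is the ordering in assign_new_owner: the candidate is pinned and task_lock()ed while tasklist_lock is still held, and mmap_sem is no longer taken at all now that the cgroup mm-owner callbacks are gone. Stripped of the diff markup, the resulting sequence looks like this sketch:

    /* inside mm_update_next_owner(), once a candidate task c is found */
    get_task_struct(c);             /* pin the task_struct              */
    task_lock(c);                   /* freeze c->mm ...                 */
    read_unlock(&tasklist_lock);    /* ... before dropping the list lock,
                                     * so c cannot exit under us        */
    if (c->mm != mm) {
            task_unlock(c);
            put_task_struct(c);
            goto retry;             /* lost the race, pick a new owner  */
    }
    mm->owner = c;
    task_unlock(c);
    put_task_struct(c);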
@@ -1055,10 +1051,7 @@ NORET_TYPE void do_exit(long code) | |||
1055 | preempt_count()); | 1051 | preempt_count()); |
1056 | 1052 | ||
1057 | acct_update_integrals(tsk); | 1053 | acct_update_integrals(tsk); |
1058 | if (tsk->mm) { | 1054 | |
1059 | update_hiwater_rss(tsk->mm); | ||
1060 | update_hiwater_vm(tsk->mm); | ||
1061 | } | ||
1062 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 1055 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
1063 | if (group_dead) { | 1056 | if (group_dead) { |
1064 | hrtimer_cancel(&tsk->signal->real_timer); | 1057 | hrtimer_cancel(&tsk->signal->real_timer); |
diff --git a/kernel/fork.c b/kernel/fork.c index 43cbf30669e6..4018308048cf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); | |||
400 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) | 400 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) |
401 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | 401 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) |
402 | 402 | ||
403 | static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; | ||
404 | |||
405 | static int __init coredump_filter_setup(char *s) | ||
406 | { | ||
407 | default_dump_filter = | ||
408 | (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & | ||
409 | MMF_DUMP_FILTER_MASK; | ||
410 | return 1; | ||
411 | } | ||
412 | |||
413 | __setup("coredump_filter=", coredump_filter_setup); | ||
414 | |||
403 | #include <linux/init_task.h> | 415 | #include <linux/init_task.h> |
404 | 416 | ||
405 | static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | 417 | static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) |
@@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
408 | atomic_set(&mm->mm_count, 1); | 420 | atomic_set(&mm->mm_count, 1); |
409 | init_rwsem(&mm->mmap_sem); | 421 | init_rwsem(&mm->mmap_sem); |
410 | INIT_LIST_HEAD(&mm->mmlist); | 422 | INIT_LIST_HEAD(&mm->mmlist); |
411 | mm->flags = (current->mm) ? current->mm->flags | 423 | mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; |
412 | : MMF_DUMP_FILTER_DEFAULT; | ||
413 | mm->core_state = NULL; | 424 | mm->core_state = NULL; |
414 | mm->nr_ptes = 0; | 425 | mm->nr_ptes = 0; |
415 | set_mm_counter(mm, file_rss, 0); | 426 | set_mm_counter(mm, file_rss, 0); |
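The new coredump_filter= parameter sets the default mm dump filter at boot, using the same bit layout userspace already writes to /proc/<pid>/coredump_filter. Because mm_init() only falls back to default_dump_filter when the forking task has no mm of its own, the value is picked up by init and other kernel-thread-spawned tasks and inherited from there. A small worked example of the computation (the literal value is only an example):

    /* booting with "coredump_filter=0x23" ends up storing: */
    default_dump_filter = (0x23UL << MMF_DUMP_FILTER_SHIFT)
                                    & MMF_DUMP_FILTER_MASK;
    /* i.e. the chosen MMF_DUMP_* bits become the initial mm->flags
     * dump filter for every mm that does not inherit one. */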
@@ -758,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | |||
758 | { | 769 | { |
759 | struct sighand_struct *sig; | 770 | struct sighand_struct *sig; |
760 | 771 | ||
761 | if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { | 772 | if (clone_flags & CLONE_SIGHAND) { |
762 | atomic_inc(&current->sighand->count); | 773 | atomic_inc(&current->sighand->count); |
763 | return 0; | 774 | return 0; |
764 | } | 775 | } |
@@ -1115,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1115 | 1126 | ||
1116 | if (pid != &init_struct_pid) { | 1127 | if (pid != &init_struct_pid) { |
1117 | retval = -ENOMEM; | 1128 | retval = -ENOMEM; |
1118 | pid = alloc_pid(task_active_pid_ns(p)); | 1129 | pid = alloc_pid(p->nsproxy->pid_ns); |
1119 | if (!pid) | 1130 | if (!pid) |
1120 | goto bad_fork_cleanup_io; | 1131 | goto bad_fork_cleanup_io; |
1121 | 1132 | ||
1122 | if (clone_flags & CLONE_NEWPID) { | 1133 | if (clone_flags & CLONE_NEWPID) { |
1123 | retval = pid_ns_prepare_proc(task_active_pid_ns(p)); | 1134 | retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); |
1124 | if (retval < 0) | 1135 | if (retval < 0) |
1125 | goto bad_fork_free_pid; | 1136 | goto bad_fork_free_pid; |
1126 | } | 1137 | } |
diff --git a/kernel/futex.c b/kernel/futex.c index 7c6cbabe52b3..002aa189eb09 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key) | |||
170 | */ | 170 | */ |
171 | static void drop_futex_key_refs(union futex_key *key) | 171 | static void drop_futex_key_refs(union futex_key *key) |
172 | { | 172 | { |
173 | if (!key->both.ptr) | 173 | if (!key->both.ptr) { |
174 | /* If we're here then we tried to put a key we failed to get */ | ||
175 | WARN_ON_ONCE(1); | ||
174 | return; | 176 | return; |
177 | } | ||
175 | 178 | ||
176 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { | 179 | switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { |
177 | case FUT_OFF_INODE: | 180 | case FUT_OFF_INODE: |
@@ -730,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) | |||
730 | } | 733 | } |
731 | 734 | ||
732 | spin_unlock(&hb->lock); | 735 | spin_unlock(&hb->lock); |
733 | out: | ||
734 | put_futex_key(fshared, &key); | 736 | put_futex_key(fshared, &key); |
737 | out: | ||
735 | return ret; | 738 | return ret; |
736 | } | 739 | } |
737 | 740 | ||
@@ -755,7 +758,7 @@ retryfull: | |||
755 | goto out; | 758 | goto out; |
756 | ret = get_futex_key(uaddr2, fshared, &key2); | 759 | ret = get_futex_key(uaddr2, fshared, &key2); |
757 | if (unlikely(ret != 0)) | 760 | if (unlikely(ret != 0)) |
758 | goto out; | 761 | goto out_put_key1; |
759 | 762 | ||
760 | hb1 = hash_futex(&key1); | 763 | hb1 = hash_futex(&key1); |
761 | hb2 = hash_futex(&key2); | 764 | hb2 = hash_futex(&key2); |
@@ -777,12 +780,12 @@ retry: | |||
777 | * but we might get them from range checking | 780 | * but we might get them from range checking |
778 | */ | 781 | */ |
779 | ret = op_ret; | 782 | ret = op_ret; |
780 | goto out; | 783 | goto out_put_keys; |
781 | #endif | 784 | #endif |
782 | 785 | ||
783 | if (unlikely(op_ret != -EFAULT)) { | 786 | if (unlikely(op_ret != -EFAULT)) { |
784 | ret = op_ret; | 787 | ret = op_ret; |
785 | goto out; | 788 | goto out_put_keys; |
786 | } | 789 | } |
787 | 790 | ||
788 | /* | 791 | /* |
@@ -796,7 +799,7 @@ retry: | |||
796 | ret = futex_handle_fault((unsigned long)uaddr2, | 799 | ret = futex_handle_fault((unsigned long)uaddr2, |
797 | attempt); | 800 | attempt); |
798 | if (ret) | 801 | if (ret) |
799 | goto out; | 802 | goto out_put_keys; |
800 | goto retry; | 803 | goto retry; |
801 | } | 804 | } |
802 | 805 | ||
@@ -834,10 +837,11 @@ retry: | |||
834 | spin_unlock(&hb1->lock); | 837 | spin_unlock(&hb1->lock); |
835 | if (hb1 != hb2) | 838 | if (hb1 != hb2) |
836 | spin_unlock(&hb2->lock); | 839 | spin_unlock(&hb2->lock); |
837 | out: | 840 | out_put_keys: |
838 | put_futex_key(fshared, &key2); | 841 | put_futex_key(fshared, &key2); |
842 | out_put_key1: | ||
839 | put_futex_key(fshared, &key1); | 843 | put_futex_key(fshared, &key1); |
840 | 844 | out: | |
841 | return ret; | 845 | return ret; |
842 | } | 846 | } |
843 | 847 | ||
@@ -854,13 +858,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | |||
854 | struct futex_q *this, *next; | 858 | struct futex_q *this, *next; |
855 | int ret, drop_count = 0; | 859 | int ret, drop_count = 0; |
856 | 860 | ||
857 | retry: | 861 | retry: |
858 | ret = get_futex_key(uaddr1, fshared, &key1); | 862 | ret = get_futex_key(uaddr1, fshared, &key1); |
859 | if (unlikely(ret != 0)) | 863 | if (unlikely(ret != 0)) |
860 | goto out; | 864 | goto out; |
861 | ret = get_futex_key(uaddr2, fshared, &key2); | 865 | ret = get_futex_key(uaddr2, fshared, &key2); |
862 | if (unlikely(ret != 0)) | 866 | if (unlikely(ret != 0)) |
863 | goto out; | 867 | goto out_put_key1; |
864 | 868 | ||
865 | hb1 = hash_futex(&key1); | 869 | hb1 = hash_futex(&key1); |
866 | hb2 = hash_futex(&key2); | 870 | hb2 = hash_futex(&key2); |
@@ -882,7 +886,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, | |||
882 | if (!ret) | 886 | if (!ret) |
883 | goto retry; | 887 | goto retry; |
884 | 888 | ||
885 | return ret; | 889 | goto out_put_keys; |
886 | } | 890 | } |
887 | if (curval != *cmpval) { | 891 | if (curval != *cmpval) { |
888 | ret = -EAGAIN; | 892 | ret = -EAGAIN; |
@@ -927,9 +931,11 @@ out_unlock: | |||
927 | while (--drop_count >= 0) | 931 | while (--drop_count >= 0) |
928 | drop_futex_key_refs(&key1); | 932 | drop_futex_key_refs(&key1); |
929 | 933 | ||
930 | out: | 934 | out_put_keys: |
931 | put_futex_key(fshared, &key2); | 935 | put_futex_key(fshared, &key2); |
936 | out_put_key1: | ||
932 | put_futex_key(fshared, &key1); | 937 | put_futex_key(fshared, &key1); |
938 | out: | ||
933 | return ret; | 939 | return ret; |
934 | } | 940 | } |
935 | 941 | ||
@@ -990,7 +996,7 @@ static int unqueue_me(struct futex_q *q) | |||
990 | int ret = 0; | 996 | int ret = 0; |
991 | 997 | ||
992 | /* In the common case we don't take the spinlock, which is nice. */ | 998 | /* In the common case we don't take the spinlock, which is nice. */ |
993 | retry: | 999 | retry: |
994 | lock_ptr = q->lock_ptr; | 1000 | lock_ptr = q->lock_ptr; |
995 | barrier(); | 1001 | barrier(); |
996 | if (lock_ptr != NULL) { | 1002 | if (lock_ptr != NULL) { |
@@ -1172,11 +1178,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1172 | 1178 | ||
1173 | q.pi_state = NULL; | 1179 | q.pi_state = NULL; |
1174 | q.bitset = bitset; | 1180 | q.bitset = bitset; |
1175 | retry: | 1181 | retry: |
1176 | q.key = FUTEX_KEY_INIT; | 1182 | q.key = FUTEX_KEY_INIT; |
1177 | ret = get_futex_key(uaddr, fshared, &q.key); | 1183 | ret = get_futex_key(uaddr, fshared, &q.key); |
1178 | if (unlikely(ret != 0)) | 1184 | if (unlikely(ret != 0)) |
1179 | goto out_release_sem; | 1185 | goto out; |
1180 | 1186 | ||
1181 | hb = queue_lock(&q); | 1187 | hb = queue_lock(&q); |
1182 | 1188 | ||
@@ -1204,6 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1204 | 1210 | ||
1205 | if (unlikely(ret)) { | 1211 | if (unlikely(ret)) { |
1206 | queue_unlock(&q, hb); | 1212 | queue_unlock(&q, hb); |
1213 | put_futex_key(fshared, &q.key); | ||
1207 | 1214 | ||
1208 | ret = get_user(uval, uaddr); | 1215 | ret = get_user(uval, uaddr); |
1209 | 1216 | ||
@@ -1213,7 +1220,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1213 | } | 1220 | } |
1214 | ret = -EWOULDBLOCK; | 1221 | ret = -EWOULDBLOCK; |
1215 | if (uval != val) | 1222 | if (uval != val) |
1216 | goto out_unlock_release_sem; | 1223 | goto out_unlock_put_key; |
1217 | 1224 | ||
1218 | /* Only actually queue if *uaddr contained val. */ | 1225 | /* Only actually queue if *uaddr contained val. */ |
1219 | queue_me(&q, hb); | 1226 | queue_me(&q, hb); |
@@ -1305,11 +1312,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
1305 | return -ERESTART_RESTARTBLOCK; | 1312 | return -ERESTART_RESTARTBLOCK; |
1306 | } | 1313 | } |
1307 | 1314 | ||
1308 | out_unlock_release_sem: | 1315 | out_unlock_put_key: |
1309 | queue_unlock(&q, hb); | 1316 | queue_unlock(&q, hb); |
1310 | |||
1311 | out_release_sem: | ||
1312 | put_futex_key(fshared, &q.key); | 1317 | put_futex_key(fshared, &q.key); |
1318 | |||
1319 | out: | ||
1313 | return ret; | 1320 | return ret; |
1314 | } | 1321 | } |
1315 | 1322 | ||
@@ -1358,16 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1358 | } | 1365 | } |
1359 | 1366 | ||
1360 | q.pi_state = NULL; | 1367 | q.pi_state = NULL; |
1361 | retry: | 1368 | retry: |
1362 | q.key = FUTEX_KEY_INIT; | 1369 | q.key = FUTEX_KEY_INIT; |
1363 | ret = get_futex_key(uaddr, fshared, &q.key); | 1370 | ret = get_futex_key(uaddr, fshared, &q.key); |
1364 | if (unlikely(ret != 0)) | 1371 | if (unlikely(ret != 0)) |
1365 | goto out_release_sem; | 1372 | goto out; |
1366 | 1373 | ||
1367 | retry_unlocked: | 1374 | retry_unlocked: |
1368 | hb = queue_lock(&q); | 1375 | hb = queue_lock(&q); |
1369 | 1376 | ||
1370 | retry_locked: | 1377 | retry_locked: |
1371 | ret = lock_taken = 0; | 1378 | ret = lock_taken = 0; |
1372 | 1379 | ||
1373 | /* | 1380 | /* |
@@ -1388,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1388 | */ | 1395 | */ |
1389 | if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { | 1396 | if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { |
1390 | ret = -EDEADLK; | 1397 | ret = -EDEADLK; |
1391 | goto out_unlock_release_sem; | 1398 | goto out_unlock_put_key; |
1392 | } | 1399 | } |
1393 | 1400 | ||
1394 | /* | 1401 | /* |
1395 | * Surprise - we got the lock. Just return to userspace: | 1402 | * Surprise - we got the lock. Just return to userspace: |
1396 | */ | 1403 | */ |
1397 | if (unlikely(!curval)) | 1404 | if (unlikely(!curval)) |
1398 | goto out_unlock_release_sem; | 1405 | goto out_unlock_put_key; |
1399 | 1406 | ||
1400 | uval = curval; | 1407 | uval = curval; |
1401 | 1408 | ||
@@ -1431,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1431 | * We took the lock due to owner died take over. | 1438 | * We took the lock due to owner died take over. |
1432 | */ | 1439 | */ |
1433 | if (unlikely(lock_taken)) | 1440 | if (unlikely(lock_taken)) |
1434 | goto out_unlock_release_sem; | 1441 | goto out_unlock_put_key; |
1435 | 1442 | ||
1436 | /* | 1443 | /* |
1437 | * We dont have the lock. Look up the PI state (or create it if | 1444 | * We dont have the lock. Look up the PI state (or create it if |
@@ -1470,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1470 | goto retry_locked; | 1477 | goto retry_locked; |
1471 | } | 1478 | } |
1472 | default: | 1479 | default: |
1473 | goto out_unlock_release_sem; | 1480 | goto out_unlock_put_key; |
1474 | } | 1481 | } |
1475 | } | 1482 | } |
1476 | 1483 | ||
@@ -1561,16 +1568,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1561 | destroy_hrtimer_on_stack(&to->timer); | 1568 | destroy_hrtimer_on_stack(&to->timer); |
1562 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | 1569 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
1563 | 1570 | ||
1564 | out_unlock_release_sem: | 1571 | out_unlock_put_key: |
1565 | queue_unlock(&q, hb); | 1572 | queue_unlock(&q, hb); |
1566 | 1573 | ||
1567 | out_release_sem: | 1574 | out_put_key: |
1568 | put_futex_key(fshared, &q.key); | 1575 | put_futex_key(fshared, &q.key); |
1576 | out: | ||
1569 | if (to) | 1577 | if (to) |
1570 | destroy_hrtimer_on_stack(&to->timer); | 1578 | destroy_hrtimer_on_stack(&to->timer); |
1571 | return ret; | 1579 | return ret; |
1572 | 1580 | ||
1573 | uaddr_faulted: | 1581 | uaddr_faulted: |
1574 | /* | 1582 | /* |
1575 | * We have to r/w *(int __user *)uaddr, and we have to modify it | 1583 | * We have to r/w *(int __user *)uaddr, and we have to modify it |
1576 | * atomically. Therefore, if we continue to fault after get_user() | 1584 | * atomically. Therefore, if we continue to fault after get_user() |
@@ -1583,7 +1591,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, | |||
1583 | if (attempt++) { | 1591 | if (attempt++) { |
1584 | ret = futex_handle_fault((unsigned long)uaddr, attempt); | 1592 | ret = futex_handle_fault((unsigned long)uaddr, attempt); |
1585 | if (ret) | 1593 | if (ret) |
1586 | goto out_release_sem; | 1594 | goto out_put_key; |
1587 | goto retry_unlocked; | 1595 | goto retry_unlocked; |
1588 | } | 1596 | } |
1589 | 1597 | ||
@@ -1675,9 +1683,9 @@ retry_unlocked: | |||
1675 | 1683 | ||
1676 | out_unlock: | 1684 | out_unlock: |
1677 | spin_unlock(&hb->lock); | 1685 | spin_unlock(&hb->lock); |
1678 | out: | ||
1679 | put_futex_key(fshared, &key); | 1686 | put_futex_key(fshared, &key); |
1680 | 1687 | ||
1688 | out: | ||
1681 | return ret; | 1689 | return ret; |
1682 | 1690 | ||
1683 | pi_faulted: | 1691 | pi_faulted: |
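
The futex_wait()/futex_lock_pi() hunks above rename the exit labels and make every path that has taken a futex key reference drop it: put_futex_key() now also runs on the early queue_unlock() retry path, and the labels unwind in strict reverse order of acquisition. A self-contained sketch of that cleanup-label idiom, using a mutex and an allocation as stand-ins (names and error codes are illustrative; this is not the futex code itself):

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_MUTEX(sketch_lock);

static int sketch_op(size_t len, int fail_late)
{
        void *buf;                      /* stands in for the futex key reference */
        int ret = -ENOMEM;

        buf = kmalloc(len, GFP_KERNEL);
        if (!buf)
                goto out;               /* nothing acquired yet */

        mutex_lock(&sketch_lock);       /* stands in for queue_lock() */
        if (fail_late) {
                ret = -EWOULDBLOCK;
                goto out_unlock_put;    /* undo both, in reverse order */
        }
        ret = 0;

out_unlock_put:
        mutex_unlock(&sketch_lock);
        kfree(buf);                     /* put_futex_key() counterpart */
out:
        return ret;
}
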
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index eb2bfefa6dcc..1455b7651b6b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -634,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | |||
634 | { | 634 | { |
635 | } | 635 | } |
636 | 636 | ||
637 | static void __run_hrtimer(struct hrtimer *timer); | ||
638 | 637 | ||
639 | /* | 638 | /* |
640 | * When High resolution timers are active, try to reprogram. Note, that in case | 639 | * When High resolution timers are active, try to reprogram. Note, that in case |
@@ -646,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
646 | struct hrtimer_clock_base *base) | 645 | struct hrtimer_clock_base *base) |
647 | { | 646 | { |
648 | if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { | 647 | if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { |
649 | /* | 648 | spin_unlock(&base->cpu_base->lock); |
650 | * XXX: recursion check? | 649 | raise_softirq_irqoff(HRTIMER_SOFTIRQ); |
651 | * hrtimer_forward() should round up with timer granularity | 650 | spin_lock(&base->cpu_base->lock); |
652 | * so that we never get into inf recursion here, | ||
653 | * it doesn't do that though | ||
654 | */ | ||
655 | __run_hrtimer(timer); | ||
656 | return 1; | 651 | return 1; |
657 | } | 652 | } |
658 | return 0; | 653 | return 0; |
@@ -705,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
705 | } | 700 | } |
706 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 701 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
707 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | 702 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } |
708 | static inline int hrtimer_reprogram(struct hrtimer *timer, | ||
709 | struct hrtimer_clock_base *base) | ||
710 | { | ||
711 | return 0; | ||
712 | } | ||
713 | 703 | ||
714 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 704 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
715 | 705 | ||
@@ -780,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); | |||
780 | * | 770 | * |
781 | * The timer is inserted in expiry order. Insertion into the | 771 | * The timer is inserted in expiry order. Insertion into the |
782 | * red black tree is O(log(n)). Must hold the base lock. | 772 | * red black tree is O(log(n)). Must hold the base lock. |
773 | * | ||
774 | * Returns 1 when the new timer is the leftmost timer in the tree. | ||
783 | */ | 775 | */ |
784 | static void enqueue_hrtimer(struct hrtimer *timer, | 776 | static int enqueue_hrtimer(struct hrtimer *timer, |
785 | struct hrtimer_clock_base *base, int reprogram) | 777 | struct hrtimer_clock_base *base) |
786 | { | 778 | { |
787 | struct rb_node **link = &base->active.rb_node; | 779 | struct rb_node **link = &base->active.rb_node; |
788 | struct rb_node *parent = NULL; | 780 | struct rb_node *parent = NULL; |
@@ -814,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, | |||
814 | * Insert the timer to the rbtree and check whether it | 806 | * Insert the timer to the rbtree and check whether it |
815 | * replaces the first pending timer | 807 | * replaces the first pending timer |
816 | */ | 808 | */ |
817 | if (leftmost) { | 809 | if (leftmost) |
818 | /* | ||
819 | * Reprogram the clock event device. When the timer is already | ||
820 | * expired hrtimer_enqueue_reprogram has either called the | ||
821 | * callback or added it to the pending list and raised the | ||
822 | * softirq. | ||
823 | * | ||
824 | * This is a NOP for !HIGHRES | ||
825 | */ | ||
826 | if (reprogram && hrtimer_enqueue_reprogram(timer, base)) | ||
827 | return; | ||
828 | |||
829 | base->first = &timer->node; | 810 | base->first = &timer->node; |
830 | } | ||
831 | 811 | ||
832 | rb_link_node(&timer->node, parent, link); | 812 | rb_link_node(&timer->node, parent, link); |
833 | rb_insert_color(&timer->node, &base->active); | 813 | rb_insert_color(&timer->node, &base->active); |
@@ -836,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, | |||
836 | * state of a possibly running callback. | 816 | * state of a possibly running callback. |
837 | */ | 817 | */ |
838 | timer->state |= HRTIMER_STATE_ENQUEUED; | 818 | timer->state |= HRTIMER_STATE_ENQUEUED; |
819 | |||
820 | return leftmost; | ||
839 | } | 821 | } |
840 | 822 | ||
841 | /* | 823 | /* |
@@ -912,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n | |||
912 | { | 894 | { |
913 | struct hrtimer_clock_base *base, *new_base; | 895 | struct hrtimer_clock_base *base, *new_base; |
914 | unsigned long flags; | 896 | unsigned long flags; |
915 | int ret; | 897 | int ret, leftmost; |
916 | 898 | ||
917 | base = lock_hrtimer_base(timer, &flags); | 899 | base = lock_hrtimer_base(timer, &flags); |
918 | 900 | ||
@@ -940,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n | |||
940 | 922 | ||
941 | timer_stats_hrtimer_set_start_info(timer); | 923 | timer_stats_hrtimer_set_start_info(timer); |
942 | 924 | ||
925 | leftmost = enqueue_hrtimer(timer, new_base); | ||
926 | |||
943 | /* | 927 | /* |
944 | * Only allow reprogramming if the new base is on this CPU. | 928 | * Only allow reprogramming if the new base is on this CPU. |
945 | * (it might still be on another CPU if the timer was pending) | 929 | * (it might still be on another CPU if the timer was pending) |
930 | * | ||
931 | * XXX send_remote_softirq() ? | ||
946 | */ | 932 | */ |
947 | enqueue_hrtimer(timer, new_base, | 933 | if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) |
948 | new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); | 934 | hrtimer_enqueue_reprogram(timer, new_base); |
949 | 935 | ||
950 | unlock_hrtimer_base(timer, &flags); | 936 | unlock_hrtimer_base(timer, &flags); |
951 | 937 | ||
@@ -1157,13 +1143,13 @@ static void __run_hrtimer(struct hrtimer *timer) | |||
1157 | spin_lock(&cpu_base->lock); | 1143 | spin_lock(&cpu_base->lock); |
1158 | 1144 | ||
1159 | /* | 1145 | /* |
1160 | * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid | 1146 | * Note: We clear the CALLBACK bit after enqueue_hrtimer and |
1161 | * reprogramming of the event hardware. This happens at the end of this | 1147 | * we do not reprogramm the event hardware. Happens either in |
1162 | * function anyway. | 1148 | * hrtimer_start_range_ns() or in hrtimer_interrupt() |
1163 | */ | 1149 | */ |
1164 | if (restart != HRTIMER_NORESTART) { | 1150 | if (restart != HRTIMER_NORESTART) { |
1165 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | 1151 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); |
1166 | enqueue_hrtimer(timer, base, 0); | 1152 | enqueue_hrtimer(timer, base); |
1167 | } | 1153 | } |
1168 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1154 | timer->state &= ~HRTIMER_STATE_CALLBACK; |
1169 | } | 1155 | } |
@@ -1243,6 +1229,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1243 | } | 1229 | } |
1244 | } | 1230 | } |
1245 | 1231 | ||
1232 | /* | ||
1233 | * local version of hrtimer_peek_ahead_timers() called with interrupts | ||
1234 | * disabled. | ||
1235 | */ | ||
1236 | static void __hrtimer_peek_ahead_timers(void) | ||
1237 | { | ||
1238 | struct tick_device *td; | ||
1239 | |||
1240 | if (!hrtimer_hres_active()) | ||
1241 | return; | ||
1242 | |||
1243 | td = &__get_cpu_var(tick_cpu_device); | ||
1244 | if (td && td->evtdev) | ||
1245 | hrtimer_interrupt(td->evtdev); | ||
1246 | } | ||
1247 | |||
1246 | /** | 1248 | /** |
1247 | * hrtimer_peek_ahead_timers -- run soft-expired timers now | 1249 | * hrtimer_peek_ahead_timers -- run soft-expired timers now |
1248 | * | 1250 | * |
@@ -1254,20 +1256,23 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1254 | */ | 1256 | */ |
1255 | void hrtimer_peek_ahead_timers(void) | 1257 | void hrtimer_peek_ahead_timers(void) |
1256 | { | 1258 | { |
1257 | struct tick_device *td; | ||
1258 | unsigned long flags; | 1259 | unsigned long flags; |
1259 | 1260 | ||
1260 | if (!hrtimer_hres_active()) | ||
1261 | return; | ||
1262 | |||
1263 | local_irq_save(flags); | 1261 | local_irq_save(flags); |
1264 | td = &__get_cpu_var(tick_cpu_device); | 1262 | __hrtimer_peek_ahead_timers(); |
1265 | if (td && td->evtdev) | ||
1266 | hrtimer_interrupt(td->evtdev); | ||
1267 | local_irq_restore(flags); | 1263 | local_irq_restore(flags); |
1268 | } | 1264 | } |
1269 | 1265 | ||
1270 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 1266 | static void run_hrtimer_softirq(struct softirq_action *h) |
1267 | { | ||
1268 | hrtimer_peek_ahead_timers(); | ||
1269 | } | ||
1270 | |||
1271 | #else /* CONFIG_HIGH_RES_TIMERS */ | ||
1272 | |||
1273 | static inline void __hrtimer_peek_ahead_timers(void) { } | ||
1274 | |||
1275 | #endif /* !CONFIG_HIGH_RES_TIMERS */ | ||
1271 | 1276 | ||
1272 | /* | 1277 | /* |
1273 | * Called from timer softirq every jiffy, expire hrtimers: | 1278 | * Called from timer softirq every jiffy, expire hrtimers: |
@@ -1513,39 +1518,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, | |||
1513 | __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); | 1518 | __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); |
1514 | timer->base = new_base; | 1519 | timer->base = new_base; |
1515 | /* | 1520 | /* |
1516 | * Enqueue the timers on the new cpu, but do not reprogram | 1521 | * Enqueue the timers on the new cpu. This does not |
1517 | * the timer as that would enable a deadlock between | 1522 | * reprogram the event device in case the timer |
1518 | * hrtimer_enqueue_reprogramm() running the timer and us still | 1523 | * expires before the earliest on this CPU, but we run |
1519 | * holding a nested base lock. | 1524 | * hrtimer_interrupt after we migrated everything to |
1520 | * | 1525 | * sort out already expired timers and reprogram the |
1521 | * Instead we tickle the hrtimer interrupt after the migration | 1526 | * event device. |
1522 | * is done, which will run all expired timers and re-programm | ||
1523 | * the timer device. | ||
1524 | */ | 1527 | */ |
1525 | enqueue_hrtimer(timer, new_base, 0); | 1528 | enqueue_hrtimer(timer, new_base); |
1526 | 1529 | ||
1527 | /* Clear the migration state bit */ | 1530 | /* Clear the migration state bit */ |
1528 | timer->state &= ~HRTIMER_STATE_MIGRATE; | 1531 | timer->state &= ~HRTIMER_STATE_MIGRATE; |
1529 | } | 1532 | } |
1530 | } | 1533 | } |
1531 | 1534 | ||
1532 | static int migrate_hrtimers(int scpu) | 1535 | static void migrate_hrtimers(int scpu) |
1533 | { | 1536 | { |
1534 | struct hrtimer_cpu_base *old_base, *new_base; | 1537 | struct hrtimer_cpu_base *old_base, *new_base; |
1535 | int dcpu, i; | 1538 | int i; |
1536 | 1539 | ||
1537 | BUG_ON(cpu_online(scpu)); | 1540 | BUG_ON(cpu_online(scpu)); |
1538 | old_base = &per_cpu(hrtimer_bases, scpu); | ||
1539 | new_base = &get_cpu_var(hrtimer_bases); | ||
1540 | |||
1541 | dcpu = smp_processor_id(); | ||
1542 | |||
1543 | tick_cancel_sched_timer(scpu); | 1541 | tick_cancel_sched_timer(scpu); |
1542 | |||
1543 | local_irq_disable(); | ||
1544 | old_base = &per_cpu(hrtimer_bases, scpu); | ||
1545 | new_base = &__get_cpu_var(hrtimer_bases); | ||
1544 | /* | 1546 | /* |
1545 | * The caller is globally serialized and nobody else | 1547 | * The caller is globally serialized and nobody else |
1546 | * takes two locks at once, deadlock is not possible. | 1548 | * takes two locks at once, deadlock is not possible. |
1547 | */ | 1549 | */ |
1548 | spin_lock_irq(&new_base->lock); | 1550 | spin_lock(&new_base->lock); |
1549 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); | 1551 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
1550 | 1552 | ||
1551 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1553 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
@@ -1554,15 +1556,11 @@ static int migrate_hrtimers(int scpu) | |||
1554 | } | 1556 | } |
1555 | 1557 | ||
1556 | spin_unlock(&old_base->lock); | 1558 | spin_unlock(&old_base->lock); |
1557 | spin_unlock_irq(&new_base->lock); | 1559 | spin_unlock(&new_base->lock); |
1558 | put_cpu_var(hrtimer_bases); | ||
1559 | 1560 | ||
1560 | return dcpu; | 1561 | /* Check, if we got expired work to do */ |
1561 | } | 1562 | __hrtimer_peek_ahead_timers(); |
1562 | 1563 | local_irq_enable(); | |
1563 | static void tickle_timers(void *arg) | ||
1564 | { | ||
1565 | hrtimer_peek_ahead_timers(); | ||
1566 | } | 1564 | } |
1567 | 1565 | ||
1568 | #endif /* CONFIG_HOTPLUG_CPU */ | 1566 | #endif /* CONFIG_HOTPLUG_CPU */ |
@@ -1583,11 +1581,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, | |||
1583 | case CPU_DEAD: | 1581 | case CPU_DEAD: |
1584 | case CPU_DEAD_FROZEN: | 1582 | case CPU_DEAD_FROZEN: |
1585 | { | 1583 | { |
1586 | int dcpu; | ||
1587 | |||
1588 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); | 1584 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); |
1589 | dcpu = migrate_hrtimers(scpu); | 1585 | migrate_hrtimers(scpu); |
1590 | smp_call_function_single(dcpu, tickle_timers, NULL, 0); | ||
1591 | break; | 1586 | break; |
1592 | } | 1587 | } |
1593 | #endif | 1588 | #endif |
@@ -1608,6 +1603,9 @@ void __init hrtimers_init(void) | |||
1608 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | 1603 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, |
1609 | (void *)(long)smp_processor_id()); | 1604 | (void *)(long)smp_processor_id()); |
1610 | register_cpu_notifier(&hrtimers_nb); | 1605 | register_cpu_notifier(&hrtimers_nb); |
1606 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1607 | open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); | ||
1608 | #endif | ||
1611 | } | 1609 | } |
1612 | 1610 | ||
1613 | /** | 1611 | /** |
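
Taken together, the hrtimer hunks make enqueue_hrtimer() only report whether the new timer became the leftmost one; the caller decides about reprogramming, and a timer that is already expired is no longer run recursively under cpu_base->lock but handed to HRTIMER_SOFTIRQ, whose handler lands back in hrtimer_interrupt() via hrtimer_peek_ahead_timers(). The core deferral idiom in isolation (a minimal sketch, assuming the caller holds the base lock with interrupts off):

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(sketch_base_lock);

/* caller holds sketch_base_lock with interrupts disabled */
static void sketch_defer_expired_timer(void)
{
        /*
         * Running the callback here could recurse into this lock, so drop
         * it only long enough to mark the softirq pending; the handler
         * registered with open_softirq() picks the work up later.
         */
        spin_unlock(&sketch_base_lock);
        raise_softirq_irqoff(HRTIMER_SOFTIRQ);
        spin_lock(&sketch_base_lock);
}
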
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cc0f7321b8ce..1de9700f416e 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | #include <linux/async.h> | ||
13 | 14 | ||
14 | #include "internals.h" | 15 | #include "internals.h" |
15 | 16 | ||
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void) | |||
34 | unsigned int status; | 35 | unsigned int status; |
35 | int i; | 36 | int i; |
36 | 37 | ||
38 | /* | ||
39 | * quiesce the kernel, or at least the asynchronous portion | ||
40 | */ | ||
41 | async_synchronize_full(); | ||
37 | mutex_lock(&probing_active); | 42 | mutex_lock(&probing_active); |
38 | /* | 43 | /* |
39 | * something may have generated an irq long ago and we want to | 44 | * something may have generated an irq long ago and we want to |
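
probe_irq_on() now drains all outstanding asynchronous work before sampling which interrupt lines fire on their own, so async driver probes cannot still be touching hardware during autoprobing. For reference, the producer side of that contract might look like the sketch below; sketch_probe_async() and struct sketch_dev are made-up stand-ins for real driver work:

#include <linux/async.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/kernel.h>

struct sketch_dev { int id; };
static struct sketch_dev sketch_device = { .id = 0 };

static void sketch_probe_async(void *data, async_cookie_t cookie)
{
        struct sketch_dev *dev = data;

        msleep(200);                    /* stands in for slow hardware discovery */

        /* wait for all earlier async work before the externally visible step */
        async_synchronize_cookie(cookie);
        pr_info("sketch: device %d ready\n", dev->id);
}

static int __init sketch_init(void)
{
        async_schedule(sketch_probe_async, &sketch_device);

        /*
         * This init path shares state with non-async drivers, so drain
         * everything before returning -- the same barrier probe_irq_on()
         * now takes.
         */
        async_synchronize_full();
        return 0;
}
device_initcall(sketch_init);
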
diff --git a/kernel/kmod.c b/kernel/kmod.c index b46dbb908669..a27a5f64443d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; | |||
51 | 51 | ||
52 | /** | 52 | /** |
53 | * request_module - try to load a kernel module | 53 | * request_module - try to load a kernel module |
54 | * @fmt: printf style format string for the name of the module | 54 | * @fmt: printf style format string for the name of the module |
55 | * @varargs: arguements as specified in the format string | 55 | * @...: arguments as specified in the format string |
56 | * | 56 | * |
57 | * Load a module using the user mode module loader. The function returns | 57 | * Load a module using the user mode module loader. The function returns |
58 | * zero on success or a negative errno code on failure. Note that a | 58 | * zero on success or a negative errno code on failure. Note that a |
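
The kernel-doc fix above documents that request_module() takes a printf-style format for the module name. A hedged usage sketch (the "foo-proto-%d" alias is invented):

#include <linux/kmod.h>

static int sketch_load_proto(int proto)
{
        int err = request_module("foo-proto-%d", proto);

        /*
         * A zero return only means modprobe completed; callers still have
         * to check that the facility they wanted actually showed up.
         */
        return err;
}
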
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9f8a3f25259a..1b9cbdc0127a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | |||
69 | /* NOTE: change this value only with kprobe_mutex held */ | 69 | /* NOTE: change this value only with kprobe_mutex held */ |
70 | static bool kprobe_enabled; | 70 | static bool kprobe_enabled; |
71 | 71 | ||
72 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 72 | static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
73 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 73 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
74 | static struct { | 74 | static struct { |
75 | spinlock_t lock ____cacheline_aligned_in_smp; | 75 | spinlock_t lock ____cacheline_aligned_in_smp; |
@@ -115,6 +115,7 @@ enum kprobe_slot_state { | |||
115 | SLOT_USED = 2, | 115 | SLOT_USED = 2, |
116 | }; | 116 | }; |
117 | 117 | ||
118 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ | ||
118 | static struct hlist_head kprobe_insn_pages; | 119 | static struct hlist_head kprobe_insn_pages; |
119 | static int kprobe_garbage_slots; | 120 | static int kprobe_garbage_slots; |
120 | static int collect_garbage_slots(void); | 121 | static int collect_garbage_slots(void); |
@@ -144,10 +145,10 @@ loop_end: | |||
144 | } | 145 | } |
145 | 146 | ||
146 | /** | 147 | /** |
147 | * get_insn_slot() - Find a slot on an executable page for an instruction. | 148 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
148 | * We allocate an executable page if there's no room on existing ones. | 149 | * We allocate an executable page if there's no room on existing ones. |
149 | */ | 150 | */ |
150 | kprobe_opcode_t __kprobes *get_insn_slot(void) | 151 | static kprobe_opcode_t __kprobes *__get_insn_slot(void) |
151 | { | 152 | { |
152 | struct kprobe_insn_page *kip; | 153 | struct kprobe_insn_page *kip; |
153 | struct hlist_node *pos; | 154 | struct hlist_node *pos; |
@@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
196 | return kip->insns; | 197 | return kip->insns; |
197 | } | 198 | } |
198 | 199 | ||
200 | kprobe_opcode_t __kprobes *get_insn_slot(void) | ||
201 | { | ||
202 | kprobe_opcode_t *ret; | ||
203 | mutex_lock(&kprobe_insn_mutex); | ||
204 | ret = __get_insn_slot(); | ||
205 | mutex_unlock(&kprobe_insn_mutex); | ||
206 | return ret; | ||
207 | } | ||
208 | |||
199 | /* Return 1 if all garbages are collected, otherwise 0. */ | 209 | /* Return 1 if all garbages are collected, otherwise 0. */ |
200 | static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | 210 | static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) |
201 | { | 211 | { |
@@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void) | |||
226 | { | 236 | { |
227 | struct kprobe_insn_page *kip; | 237 | struct kprobe_insn_page *kip; |
228 | struct hlist_node *pos, *next; | 238 | struct hlist_node *pos, *next; |
239 | int safety; | ||
229 | 240 | ||
230 | /* Ensure no-one is preepmted on the garbages */ | 241 | /* Ensure no-one is preepmted on the garbages */ |
231 | if (check_safety() != 0) | 242 | mutex_unlock(&kprobe_insn_mutex); |
243 | safety = check_safety(); | ||
244 | mutex_lock(&kprobe_insn_mutex); | ||
245 | if (safety != 0) | ||
232 | return -EAGAIN; | 246 | return -EAGAIN; |
233 | 247 | ||
234 | hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { | 248 | hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { |
@@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | |||
251 | struct kprobe_insn_page *kip; | 265 | struct kprobe_insn_page *kip; |
252 | struct hlist_node *pos; | 266 | struct hlist_node *pos; |
253 | 267 | ||
268 | mutex_lock(&kprobe_insn_mutex); | ||
254 | hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { | 269 | hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { |
255 | if (kip->insns <= slot && | 270 | if (kip->insns <= slot && |
256 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 271 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { |
@@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | |||
267 | 282 | ||
268 | if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) | 283 | if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) |
269 | collect_garbage_slots(); | 284 | collect_garbage_slots(); |
285 | |||
286 | mutex_unlock(&kprobe_insn_mutex); | ||
270 | } | 287 | } |
271 | #endif | 288 | #endif |
272 | 289 | ||
@@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
310 | struct kprobe *kp; | 327 | struct kprobe *kp; |
311 | 328 | ||
312 | list_for_each_entry_rcu(kp, &p->list, list) { | 329 | list_for_each_entry_rcu(kp, &p->list, list) { |
313 | if (kp->pre_handler) { | 330 | if (kp->pre_handler && !kprobe_gone(kp)) { |
314 | set_kprobe_instance(kp); | 331 | set_kprobe_instance(kp); |
315 | if (kp->pre_handler(kp, regs)) | 332 | if (kp->pre_handler(kp, regs)) |
316 | return 1; | 333 | return 1; |
@@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
326 | struct kprobe *kp; | 343 | struct kprobe *kp; |
327 | 344 | ||
328 | list_for_each_entry_rcu(kp, &p->list, list) { | 345 | list_for_each_entry_rcu(kp, &p->list, list) { |
329 | if (kp->post_handler) { | 346 | if (kp->post_handler && !kprobe_gone(kp)) { |
330 | set_kprobe_instance(kp); | 347 | set_kprobe_instance(kp); |
331 | kp->post_handler(kp, regs, flags); | 348 | kp->post_handler(kp, regs, flags); |
332 | reset_kprobe_instance(); | 349 | reset_kprobe_instance(); |
@@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
393 | hlist_add_head(&ri->hlist, head); | 410 | hlist_add_head(&ri->hlist, head); |
394 | } | 411 | } |
395 | 412 | ||
396 | void kretprobe_hash_lock(struct task_struct *tsk, | 413 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, |
397 | struct hlist_head **head, unsigned long *flags) | 414 | struct hlist_head **head, unsigned long *flags) |
398 | { | 415 | { |
399 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 416 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
@@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk, | |||
404 | spin_lock_irqsave(hlist_lock, *flags); | 421 | spin_lock_irqsave(hlist_lock, *flags); |
405 | } | 422 | } |
406 | 423 | ||
407 | static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) | 424 | static void __kprobes kretprobe_table_lock(unsigned long hash, |
425 | unsigned long *flags) | ||
408 | { | 426 | { |
409 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 427 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
410 | spin_lock_irqsave(hlist_lock, *flags); | 428 | spin_lock_irqsave(hlist_lock, *flags); |
411 | } | 429 | } |
412 | 430 | ||
413 | void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) | 431 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, |
432 | unsigned long *flags) | ||
414 | { | 433 | { |
415 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 434 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
416 | spinlock_t *hlist_lock; | 435 | spinlock_t *hlist_lock; |
@@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) | |||
419 | spin_unlock_irqrestore(hlist_lock, *flags); | 438 | spin_unlock_irqrestore(hlist_lock, *flags); |
420 | } | 439 | } |
421 | 440 | ||
422 | void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) | 441 | void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) |
423 | { | 442 | { |
424 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 443 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
425 | spin_unlock_irqrestore(hlist_lock, *flags); | 444 | spin_unlock_irqrestore(hlist_lock, *flags); |
@@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
526 | ap->addr = p->addr; | 545 | ap->addr = p->addr; |
527 | ap->pre_handler = aggr_pre_handler; | 546 | ap->pre_handler = aggr_pre_handler; |
528 | ap->fault_handler = aggr_fault_handler; | 547 | ap->fault_handler = aggr_fault_handler; |
529 | if (p->post_handler) | 548 | /* We don't care the kprobe which has gone. */ |
549 | if (p->post_handler && !kprobe_gone(p)) | ||
530 | ap->post_handler = aggr_post_handler; | 550 | ap->post_handler = aggr_post_handler; |
531 | if (p->break_handler) | 551 | if (p->break_handler && !kprobe_gone(p)) |
532 | ap->break_handler = aggr_break_handler; | 552 | ap->break_handler = aggr_break_handler; |
533 | 553 | ||
534 | INIT_LIST_HEAD(&ap->list); | 554 | INIT_LIST_HEAD(&ap->list); |
@@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
547 | int ret = 0; | 567 | int ret = 0; |
548 | struct kprobe *ap; | 568 | struct kprobe *ap; |
549 | 569 | ||
570 | if (kprobe_gone(old_p)) { | ||
571 | /* | ||
572 | * Attempting to insert new probe at the same location that | ||
573 | * had a probe in the module vaddr area which already | ||
574 | * freed. So, the instruction slot has already been | ||
575 | * released. We need a new slot for the new probe. | ||
576 | */ | ||
577 | ret = arch_prepare_kprobe(old_p); | ||
578 | if (ret) | ||
579 | return ret; | ||
580 | } | ||
550 | if (old_p->pre_handler == aggr_pre_handler) { | 581 | if (old_p->pre_handler == aggr_pre_handler) { |
551 | copy_kprobe(old_p, p); | 582 | copy_kprobe(old_p, p); |
552 | ret = add_new_kprobe(old_p, p); | 583 | ret = add_new_kprobe(old_p, p); |
584 | ap = old_p; | ||
553 | } else { | 585 | } else { |
554 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); | 586 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); |
555 | if (!ap) | 587 | if (!ap) { |
588 | if (kprobe_gone(old_p)) | ||
589 | arch_remove_kprobe(old_p); | ||
556 | return -ENOMEM; | 590 | return -ENOMEM; |
591 | } | ||
557 | add_aggr_kprobe(ap, old_p); | 592 | add_aggr_kprobe(ap, old_p); |
558 | copy_kprobe(ap, p); | 593 | copy_kprobe(ap, p); |
559 | ret = add_new_kprobe(ap, p); | 594 | ret = add_new_kprobe(ap, p); |
560 | } | 595 | } |
596 | if (kprobe_gone(old_p)) { | ||
597 | /* | ||
598 | * If the old_p has gone, its breakpoint has been disarmed. | ||
599 | * We have to arm it again after preparing real kprobes. | ||
600 | */ | ||
601 | ap->flags &= ~KPROBE_FLAG_GONE; | ||
602 | if (kprobe_enabled) | ||
603 | arch_arm_kprobe(ap); | ||
604 | } | ||
561 | return ret; | 605 | return ret; |
562 | } | 606 | } |
563 | 607 | ||
@@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | |||
600 | return (kprobe_opcode_t *)(((char *)addr) + p->offset); | 644 | return (kprobe_opcode_t *)(((char *)addr) + p->offset); |
601 | } | 645 | } |
602 | 646 | ||
603 | static int __kprobes __register_kprobe(struct kprobe *p, | 647 | int __kprobes register_kprobe(struct kprobe *p) |
604 | unsigned long called_from) | ||
605 | { | 648 | { |
606 | int ret = 0; | 649 | int ret = 0; |
607 | struct kprobe *old_p; | 650 | struct kprobe *old_p; |
@@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
620 | return -EINVAL; | 663 | return -EINVAL; |
621 | } | 664 | } |
622 | 665 | ||
623 | p->mod_refcounted = 0; | 666 | p->flags = 0; |
624 | |||
625 | /* | 667 | /* |
626 | * Check if are we probing a module. | 668 | * Check if are we probing a module. |
627 | */ | 669 | */ |
628 | probed_mod = __module_text_address((unsigned long) p->addr); | 670 | probed_mod = __module_text_address((unsigned long) p->addr); |
629 | if (probed_mod) { | 671 | if (probed_mod) { |
630 | struct module *calling_mod; | ||
631 | calling_mod = __module_text_address(called_from); | ||
632 | /* | 672 | /* |
633 | * We must allow modules to probe themself and in this case | 673 | * We must hold a refcount of the probed module while updating |
634 | * avoid incrementing the module refcount, so as to allow | 674 | * its code to prohibit unexpected unloading. |
635 | * unloading of self probing modules. | ||
636 | */ | 675 | */ |
637 | if (calling_mod && calling_mod != probed_mod) { | 676 | if (unlikely(!try_module_get(probed_mod))) { |
638 | if (unlikely(!try_module_get(probed_mod))) { | 677 | preempt_enable(); |
639 | preempt_enable(); | 678 | return -EINVAL; |
640 | return -EINVAL; | 679 | } |
641 | } | 680 | /* |
642 | p->mod_refcounted = 1; | 681 | * If the module freed .init.text, we couldn't insert |
643 | } else | 682 | * kprobes in there. |
644 | probed_mod = NULL; | 683 | */ |
684 | if (within_module_init((unsigned long)p->addr, probed_mod) && | ||
685 | probed_mod->state != MODULE_STATE_COMING) { | ||
686 | module_put(probed_mod); | ||
687 | preempt_enable(); | ||
688 | return -EINVAL; | ||
689 | } | ||
645 | } | 690 | } |
646 | preempt_enable(); | 691 | preempt_enable(); |
647 | 692 | ||
@@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
668 | out: | 713 | out: |
669 | mutex_unlock(&kprobe_mutex); | 714 | mutex_unlock(&kprobe_mutex); |
670 | 715 | ||
671 | if (ret && probed_mod) | 716 | if (probed_mod) |
672 | module_put(probed_mod); | 717 | module_put(probed_mod); |
718 | |||
673 | return ret; | 719 | return ret; |
674 | } | 720 | } |
675 | 721 | ||
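
With __register_kprobe() folded into register_kprobe(), the exported entry point itself now pins the probed module via try_module_get() and refuses addresses that sit in already-freed .init.text. The caller-visible API is unchanged; a minimal registration, with an illustrative handler and probe symbol, still looks like:

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int sketch_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("kprobe hit at %p\n", p->addr);
        return 0;                       /* let the original instruction run */
}

static struct kprobe sketch_kp = {
        .symbol_name    = "do_fork",    /* illustrative probe point */
        .pre_handler    = sketch_pre,
};

static int __init sketch_kp_init(void)
{
        return register_kprobe(&sketch_kp);
}

static void __exit sketch_kp_exit(void)
{
        unregister_kprobe(&sketch_kp);
}

module_init(sketch_kp_init);
module_exit(sketch_kp_exit);
MODULE_LICENSE("GPL");
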
@@ -697,16 +743,16 @@ valid_p: | |||
697 | list_is_singular(&old_p->list))) { | 743 | list_is_singular(&old_p->list))) { |
698 | /* | 744 | /* |
699 | * Only probe on the hash list. Disarm only if kprobes are | 745 | * Only probe on the hash list. Disarm only if kprobes are |
700 | * enabled - otherwise, the breakpoint would already have | 746 | * enabled and not gone - otherwise, the breakpoint would |
701 | * been removed. We save on flushing icache. | 747 | * already have been removed. We save on flushing icache. |
702 | */ | 748 | */ |
703 | if (kprobe_enabled) | 749 | if (kprobe_enabled && !kprobe_gone(old_p)) |
704 | arch_disarm_kprobe(p); | 750 | arch_disarm_kprobe(p); |
705 | hlist_del_rcu(&old_p->hlist); | 751 | hlist_del_rcu(&old_p->hlist); |
706 | } else { | 752 | } else { |
707 | if (p->break_handler) | 753 | if (p->break_handler && !kprobe_gone(p)) |
708 | old_p->break_handler = NULL; | 754 | old_p->break_handler = NULL; |
709 | if (p->post_handler) { | 755 | if (p->post_handler && !kprobe_gone(p)) { |
710 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | 756 | list_for_each_entry_rcu(list_p, &old_p->list, list) { |
711 | if ((list_p != p) && (list_p->post_handler)) | 757 | if ((list_p != p) && (list_p->post_handler)) |
712 | goto noclean; | 758 | goto noclean; |
@@ -721,39 +767,27 @@ noclean: | |||
721 | 767 | ||
722 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | 768 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) |
723 | { | 769 | { |
724 | struct module *mod; | ||
725 | struct kprobe *old_p; | 770 | struct kprobe *old_p; |
726 | 771 | ||
727 | if (p->mod_refcounted) { | 772 | if (list_empty(&p->list)) |
728 | /* | ||
729 | * Since we've already incremented refcount, | ||
730 | * we don't need to disable preemption. | ||
731 | */ | ||
732 | mod = module_text_address((unsigned long)p->addr); | ||
733 | if (mod) | ||
734 | module_put(mod); | ||
735 | } | ||
736 | |||
737 | if (list_empty(&p->list) || list_is_singular(&p->list)) { | ||
738 | if (!list_empty(&p->list)) { | ||
739 | /* "p" is the last child of an aggr_kprobe */ | ||
740 | old_p = list_entry(p->list.next, struct kprobe, list); | ||
741 | list_del(&p->list); | ||
742 | kfree(old_p); | ||
743 | } | ||
744 | arch_remove_kprobe(p); | 773 | arch_remove_kprobe(p); |
774 | else if (list_is_singular(&p->list)) { | ||
775 | /* "p" is the last child of an aggr_kprobe */ | ||
776 | old_p = list_entry(p->list.next, struct kprobe, list); | ||
777 | list_del(&p->list); | ||
778 | arch_remove_kprobe(old_p); | ||
779 | kfree(old_p); | ||
745 | } | 780 | } |
746 | } | 781 | } |
747 | 782 | ||
748 | static int __register_kprobes(struct kprobe **kps, int num, | 783 | int __kprobes register_kprobes(struct kprobe **kps, int num) |
749 | unsigned long called_from) | ||
750 | { | 784 | { |
751 | int i, ret = 0; | 785 | int i, ret = 0; |
752 | 786 | ||
753 | if (num <= 0) | 787 | if (num <= 0) |
754 | return -EINVAL; | 788 | return -EINVAL; |
755 | for (i = 0; i < num; i++) { | 789 | for (i = 0; i < num; i++) { |
756 | ret = __register_kprobe(kps[i], called_from); | 790 | ret = register_kprobe(kps[i]); |
757 | if (ret < 0) { | 791 | if (ret < 0) { |
758 | if (i > 0) | 792 | if (i > 0) |
759 | unregister_kprobes(kps, i); | 793 | unregister_kprobes(kps, i); |
@@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num, | |||
763 | return ret; | 797 | return ret; |
764 | } | 798 | } |
765 | 799 | ||
766 | /* | ||
767 | * Registration and unregistration functions for kprobe. | ||
768 | */ | ||
769 | int __kprobes register_kprobe(struct kprobe *p) | ||
770 | { | ||
771 | return __register_kprobes(&p, 1, | ||
772 | (unsigned long)__builtin_return_address(0)); | ||
773 | } | ||
774 | |||
775 | void __kprobes unregister_kprobe(struct kprobe *p) | 800 | void __kprobes unregister_kprobe(struct kprobe *p) |
776 | { | 801 | { |
777 | unregister_kprobes(&p, 1); | 802 | unregister_kprobes(&p, 1); |
778 | } | 803 | } |
779 | 804 | ||
780 | int __kprobes register_kprobes(struct kprobe **kps, int num) | ||
781 | { | ||
782 | return __register_kprobes(kps, num, | ||
783 | (unsigned long)__builtin_return_address(0)); | ||
784 | } | ||
785 | |||
786 | void __kprobes unregister_kprobes(struct kprobe **kps, int num) | 805 | void __kprobes unregister_kprobes(struct kprobe **kps, int num) |
787 | { | 806 | { |
788 | int i; | 807 | int i; |
@@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) | |||
811 | return (unsigned long)entry; | 830 | return (unsigned long)entry; |
812 | } | 831 | } |
813 | 832 | ||
814 | static int __register_jprobes(struct jprobe **jps, int num, | 833 | int __kprobes register_jprobes(struct jprobe **jps, int num) |
815 | unsigned long called_from) | ||
816 | { | 834 | { |
817 | struct jprobe *jp; | 835 | struct jprobe *jp; |
818 | int ret = 0, i; | 836 | int ret = 0, i; |
@@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num, | |||
830 | /* Todo: Verify probepoint is a function entry point */ | 848 | /* Todo: Verify probepoint is a function entry point */ |
831 | jp->kp.pre_handler = setjmp_pre_handler; | 849 | jp->kp.pre_handler = setjmp_pre_handler; |
832 | jp->kp.break_handler = longjmp_break_handler; | 850 | jp->kp.break_handler = longjmp_break_handler; |
833 | ret = __register_kprobe(&jp->kp, called_from); | 851 | ret = register_kprobe(&jp->kp); |
834 | } | 852 | } |
835 | if (ret < 0) { | 853 | if (ret < 0) { |
836 | if (i > 0) | 854 | if (i > 0) |
@@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num, | |||
843 | 861 | ||
844 | int __kprobes register_jprobe(struct jprobe *jp) | 862 | int __kprobes register_jprobe(struct jprobe *jp) |
845 | { | 863 | { |
846 | return __register_jprobes(&jp, 1, | 864 | return register_jprobes(&jp, 1); |
847 | (unsigned long)__builtin_return_address(0)); | ||
848 | } | 865 | } |
849 | 866 | ||
850 | void __kprobes unregister_jprobe(struct jprobe *jp) | 867 | void __kprobes unregister_jprobe(struct jprobe *jp) |
@@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp) | |||
852 | unregister_jprobes(&jp, 1); | 869 | unregister_jprobes(&jp, 1); |
853 | } | 870 | } |
854 | 871 | ||
855 | int __kprobes register_jprobes(struct jprobe **jps, int num) | ||
856 | { | ||
857 | return __register_jprobes(jps, num, | ||
858 | (unsigned long)__builtin_return_address(0)); | ||
859 | } | ||
860 | |||
861 | void __kprobes unregister_jprobes(struct jprobe **jps, int num) | 872 | void __kprobes unregister_jprobes(struct jprobe **jps, int num) |
862 | { | 873 | { |
863 | int i; | 874 | int i; |
@@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
920 | return 0; | 931 | return 0; |
921 | } | 932 | } |
922 | 933 | ||
923 | static int __kprobes __register_kretprobe(struct kretprobe *rp, | 934 | int __kprobes register_kretprobe(struct kretprobe *rp) |
924 | unsigned long called_from) | ||
925 | { | 935 | { |
926 | int ret = 0; | 936 | int ret = 0; |
927 | struct kretprobe_instance *inst; | 937 | struct kretprobe_instance *inst; |
@@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, | |||
967 | 977 | ||
968 | rp->nmissed = 0; | 978 | rp->nmissed = 0; |
969 | /* Establish function entry probe point */ | 979 | /* Establish function entry probe point */ |
970 | ret = __register_kprobe(&rp->kp, called_from); | 980 | ret = register_kprobe(&rp->kp); |
971 | if (ret != 0) | 981 | if (ret != 0) |
972 | free_rp_inst(rp); | 982 | free_rp_inst(rp); |
973 | return ret; | 983 | return ret; |
974 | } | 984 | } |
975 | 985 | ||
976 | static int __register_kretprobes(struct kretprobe **rps, int num, | 986 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) |
977 | unsigned long called_from) | ||
978 | { | 987 | { |
979 | int ret = 0, i; | 988 | int ret = 0, i; |
980 | 989 | ||
981 | if (num <= 0) | 990 | if (num <= 0) |
982 | return -EINVAL; | 991 | return -EINVAL; |
983 | for (i = 0; i < num; i++) { | 992 | for (i = 0; i < num; i++) { |
984 | ret = __register_kretprobe(rps[i], called_from); | 993 | ret = register_kretprobe(rps[i]); |
985 | if (ret < 0) { | 994 | if (ret < 0) { |
986 | if (i > 0) | 995 | if (i > 0) |
987 | unregister_kretprobes(rps, i); | 996 | unregister_kretprobes(rps, i); |
@@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num, | |||
991 | return ret; | 1000 | return ret; |
992 | } | 1001 | } |
993 | 1002 | ||
994 | int __kprobes register_kretprobe(struct kretprobe *rp) | ||
995 | { | ||
996 | return __register_kretprobes(&rp, 1, | ||
997 | (unsigned long)__builtin_return_address(0)); | ||
998 | } | ||
999 | |||
1000 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 1003 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
1001 | { | 1004 | { |
1002 | unregister_kretprobes(&rp, 1); | 1005 | unregister_kretprobes(&rp, 1); |
1003 | } | 1006 | } |
1004 | 1007 | ||
1005 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) | ||
1006 | { | ||
1007 | return __register_kretprobes(rps, num, | ||
1008 | (unsigned long)__builtin_return_address(0)); | ||
1009 | } | ||
1010 | |||
1011 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) | 1008 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) |
1012 | { | 1009 | { |
1013 | int i; | 1010 | int i; |
@@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
1055 | 1052 | ||
1056 | #endif /* CONFIG_KRETPROBES */ | 1053 | #endif /* CONFIG_KRETPROBES */ |
1057 | 1054 | ||
1055 | /* Set the kprobe gone and remove its instruction buffer. */ | ||
1056 | static void __kprobes kill_kprobe(struct kprobe *p) | ||
1057 | { | ||
1058 | struct kprobe *kp; | ||
1059 | p->flags |= KPROBE_FLAG_GONE; | ||
1060 | if (p->pre_handler == aggr_pre_handler) { | ||
1061 | /* | ||
1062 | * If this is an aggr_kprobe, we have to list all the | ||
1063 | * chained probes and mark them GONE. | ||
1064 | */ | ||
1065 | list_for_each_entry_rcu(kp, &p->list, list) | ||
1066 | kp->flags |= KPROBE_FLAG_GONE; | ||
1067 | p->post_handler = NULL; | ||
1068 | p->break_handler = NULL; | ||
1069 | } | ||
1070 | /* | ||
1071 | * Here, we can remove insn_slot safely, because no thread calls | ||
1072 | * the original probed function (which will be freed soon) any more. | ||
1073 | */ | ||
1074 | arch_remove_kprobe(p); | ||
1075 | } | ||
1076 | |||
1077 | /* Module notifier call back, checking kprobes on the module */ | ||
1078 | static int __kprobes kprobes_module_callback(struct notifier_block *nb, | ||
1079 | unsigned long val, void *data) | ||
1080 | { | ||
1081 | struct module *mod = data; | ||
1082 | struct hlist_head *head; | ||
1083 | struct hlist_node *node; | ||
1084 | struct kprobe *p; | ||
1085 | unsigned int i; | ||
1086 | int checkcore = (val == MODULE_STATE_GOING); | ||
1087 | |||
1088 | if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) | ||
1089 | return NOTIFY_DONE; | ||
1090 | |||
1091 | /* | ||
1092 | * When MODULE_STATE_GOING was notified, both of module .text and | ||
1093 | * .init.text sections would be freed. When MODULE_STATE_LIVE was | ||
1094 | * notified, only .init.text section would be freed. We need to | ||
1095 | * disable kprobes which have been inserted in the sections. | ||
1096 | */ | ||
1097 | mutex_lock(&kprobe_mutex); | ||
1098 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
1099 | head = &kprobe_table[i]; | ||
1100 | hlist_for_each_entry_rcu(p, node, head, hlist) | ||
1101 | if (within_module_init((unsigned long)p->addr, mod) || | ||
1102 | (checkcore && | ||
1103 | within_module_core((unsigned long)p->addr, mod))) { | ||
1104 | /* | ||
1105 | * The vaddr this probe is installed will soon | ||
1106 | * be vfreed buy not synced to disk. Hence, | ||
1107 | * disarming the breakpoint isn't needed. | ||
1108 | */ | ||
1109 | kill_kprobe(p); | ||
1110 | } | ||
1111 | } | ||
1112 | mutex_unlock(&kprobe_mutex); | ||
1113 | return NOTIFY_DONE; | ||
1114 | } | ||
1115 | |||
1116 | static struct notifier_block kprobe_module_nb = { | ||
1117 | .notifier_call = kprobes_module_callback, | ||
1118 | .priority = 0 | ||
1119 | }; | ||
1120 | |||
1058 | static int __init init_kprobes(void) | 1121 | static int __init init_kprobes(void) |
1059 | { | 1122 | { |
1060 | int i, err = 0; | 1123 | int i, err = 0; |
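
kill_kprobe() and kprobes_module_callback() hook the module notifier chain so probes sitting in vanishing module text are marked [GONE] instead of being left armed on freed memory. The same notifier interface is open to any subsystem; a minimal sketch mirroring the kprobe_module_nb registration above (callback body is illustrative):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int sketch_module_cb(struct notifier_block *nb,
                            unsigned long val, void *data)
{
        struct module *mod = data;

        if (val == MODULE_STATE_GOING)
                pr_info("sketch: %s is on its way out\n", mod->name);
        return NOTIFY_DONE;
}

static struct notifier_block sketch_module_nb = {
        .notifier_call  = sketch_module_cb,
        .priority       = 0,
};

/* registered once at subsystem init, unregistered on teardown:
 *      register_module_notifier(&sketch_module_nb);
 *      unregister_module_notifier(&sketch_module_nb);
 */
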
@@ -1111,6 +1174,9 @@ static int __init init_kprobes(void) | |||
1111 | err = arch_init_kprobes(); | 1174 | err = arch_init_kprobes(); |
1112 | if (!err) | 1175 | if (!err) |
1113 | err = register_die_notifier(&kprobe_exceptions_nb); | 1176 | err = register_die_notifier(&kprobe_exceptions_nb); |
1177 | if (!err) | ||
1178 | err = register_module_notifier(&kprobe_module_nb); | ||
1179 | |||
1114 | kprobes_initialized = (err == 0); | 1180 | kprobes_initialized = (err == 0); |
1115 | 1181 | ||
1116 | if (!err) | 1182 | if (!err) |
@@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
1131 | else | 1197 | else |
1132 | kprobe_type = "k"; | 1198 | kprobe_type = "k"; |
1133 | if (sym) | 1199 | if (sym) |
1134 | seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, | 1200 | seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, |
1135 | sym, offset, (modname ? modname : " ")); | 1201 | sym, offset, (modname ? modname : " "), |
1202 | (kprobe_gone(p) ? "[GONE]" : "")); | ||
1136 | else | 1203 | else |
1137 | seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); | 1204 | seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, |
1205 | (kprobe_gone(p) ? "[GONE]" : "")); | ||
1138 | } | 1206 | } |
1139 | 1207 | ||
1140 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 1208 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
@@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void) | |||
1215 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1283 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1216 | head = &kprobe_table[i]; | 1284 | head = &kprobe_table[i]; |
1217 | hlist_for_each_entry_rcu(p, node, head, hlist) | 1285 | hlist_for_each_entry_rcu(p, node, head, hlist) |
1218 | arch_arm_kprobe(p); | 1286 | if (!kprobe_gone(p)) |
1287 | arch_arm_kprobe(p); | ||
1219 | } | 1288 | } |
1220 | 1289 | ||
1221 | kprobe_enabled = true; | 1290 | kprobe_enabled = true; |
@@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void) | |||
1244 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1313 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1245 | head = &kprobe_table[i]; | 1314 | head = &kprobe_table[i]; |
1246 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1315 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
1247 | if (!arch_trampoline_kprobe(p)) | 1316 | if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) |
1248 | arch_disarm_kprobe(p); | 1317 | arch_disarm_kprobe(p); |
1249 | } | 1318 | } |
1250 | } | 1319 | } |
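
The kretprobe paths get the same flattening: register_kretprobe()/register_kretprobes() are now the real implementations rather than wrappers recording the caller's return address. Usage is unchanged; a minimal return probe, with an illustrative handler and probe symbol:

#include <linux/kernel.h>
#include <linux/kprobes.h>

static int sketch_ret_handler(struct kretprobe_instance *ri,
                              struct pt_regs *regs)
{
        pr_info("return to %p\n", ri->ret_addr);
        return 0;
}

static struct kretprobe sketch_krp = {
        .handler        = sketch_ret_handler,
        .maxactive      = 16,           /* concurrently tracked instances */
        .kp.symbol_name = "do_fork",    /* illustrative probe point */
};

/* register_kretprobe(&sketch_krp) from module init,
 * unregister_kretprobe(&sketch_krp) from module exit. */
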
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 08dd8ed86c77..528dd78e7e7e 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | |||
24 | static struct kobj_attribute _name##_attr = \ | 24 | static struct kobj_attribute _name##_attr = \ |
25 | __ATTR(_name, 0644, _name##_show, _name##_store) | 25 | __ATTR(_name, 0644, _name##_show, _name##_store) |
26 | 26 | ||
27 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 27 | #if defined(CONFIG_HOTPLUG) |
28 | /* current uevent sequence number */ | 28 | /* current uevent sequence number */ |
29 | static ssize_t uevent_seqnum_show(struct kobject *kobj, | 29 | static ssize_t uevent_seqnum_show(struct kobject *kobj, |
30 | struct kobj_attribute *attr, char *buf) | 30 | struct kobj_attribute *attr, char *buf) |
@@ -137,7 +137,7 @@ struct kobject *kernel_kobj; | |||
137 | EXPORT_SYMBOL_GPL(kernel_kobj); | 137 | EXPORT_SYMBOL_GPL(kernel_kobj); |
138 | 138 | ||
139 | static struct attribute * kernel_attrs[] = { | 139 | static struct attribute * kernel_attrs[] = { |
140 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 140 | #if defined(CONFIG_HOTPLUG) |
141 | &uevent_seqnum_attr.attr, | 141 | &uevent_seqnum_attr.attr, |
142 | &uevent_helper_attr.attr, | 142 | &uevent_helper_attr.attr, |
143 | #endif | 143 | #endif |
diff --git a/kernel/module.c b/kernel/module.c index dd2a54155b54..c9332c90d5a0 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -43,7 +43,6 @@ | |||
43 | #include <linux/device.h> | 43 | #include <linux/device.h> |
44 | #include <linux/string.h> | 44 | #include <linux/string.h> |
45 | #include <linux/mutex.h> | 45 | #include <linux/mutex.h> |
46 | #include <linux/unwind.h> | ||
47 | #include <linux/rculist.h> | 46 | #include <linux/rculist.h> |
48 | #include <asm/uaccess.h> | 47 | #include <asm/uaccess.h> |
49 | #include <asm/cacheflush.h> | 48 | #include <asm/cacheflush.h> |
@@ -51,6 +50,7 @@ | |||
51 | #include <asm/sections.h> | 50 | #include <asm/sections.h> |
52 | #include <linux/tracepoint.h> | 51 | #include <linux/tracepoint.h> |
53 | #include <linux/ftrace.h> | 52 | #include <linux/ftrace.h> |
53 | #include <linux/async.h> | ||
54 | 54 | ||
55 | #if 0 | 55 | #if 0 |
56 | #define DEBUGP printk | 56 | #define DEBUGP printk |
@@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
757 | return -EFAULT; | 757 | return -EFAULT; |
758 | name[MODULE_NAME_LEN-1] = '\0'; | 758 | name[MODULE_NAME_LEN-1] = '\0'; |
759 | 759 | ||
760 | if (mutex_lock_interruptible(&module_mutex) != 0) | 760 | /* Create stop_machine threads since free_module relies on |
761 | return -EINTR; | 761 | * a non-failing stop_machine call. */ |
762 | ret = stop_machine_create(); | ||
763 | if (ret) | ||
764 | return ret; | ||
765 | |||
766 | if (mutex_lock_interruptible(&module_mutex) != 0) { | ||
767 | ret = -EINTR; | ||
768 | goto out_stop; | ||
769 | } | ||
762 | 770 | ||
763 | mod = find_module(name); | 771 | mod = find_module(name); |
764 | if (!mod) { | 772 | if (!mod) { |
@@ -809,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
809 | mod->exit(); | 817 | mod->exit(); |
810 | blocking_notifier_call_chain(&module_notify_list, | 818 | blocking_notifier_call_chain(&module_notify_list, |
811 | MODULE_STATE_GOING, mod); | 819 | MODULE_STATE_GOING, mod); |
820 | async_synchronize_full(); | ||
812 | mutex_lock(&module_mutex); | 821 | mutex_lock(&module_mutex); |
813 | /* Store the name of the last unloaded module for diagnostic purposes */ | 822 | /* Store the name of the last unloaded module for diagnostic purposes */ |
814 | strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); | 823 | strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); |
@@ -817,10 +826,12 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
817 | 826 | ||
818 | out: | 827 | out: |
819 | mutex_unlock(&module_mutex); | 828 | mutex_unlock(&module_mutex); |
829 | out_stop: | ||
830 | stop_machine_destroy(); | ||
820 | return ret; | 831 | return ret; |
821 | } | 832 | } |
822 | 833 | ||
823 | static void print_unload_info(struct seq_file *m, struct module *mod) | 834 | static inline void print_unload_info(struct seq_file *m, struct module *mod) |
824 | { | 835 | { |
825 | struct module_use *use; | 836 | struct module_use *use; |
826 | int printed_something = 0; | 837 | int printed_something = 0; |
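
Both sys_delete_module() above and load_module() below now pre-create the stop_machine threads, because the later stop_machine() call sits past the point where an allocation failure could still be handled. The bracketing pattern in isolation (the critical callback is supplied by the caller in this sketch):

#include <linux/errno.h>
#include <linux/stop_machine.h>

static int sketch_critical_section(int (*critical)(void *), void *arg)
{
        int ret;

        /* create the per-cpu stopper threads while failure is still cheap */
        ret = stop_machine_create();
        if (ret)
                return ret;

        /* past the point of no return this can no longer fail for
         * lack of threads */
        ret = stop_machine(critical, arg, NULL);

        stop_machine_destroy();
        return ret;
}
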
@@ -893,7 +904,7 @@ void module_put(struct module *module) | |||
893 | EXPORT_SYMBOL(module_put); | 904 | EXPORT_SYMBOL(module_put); |
894 | 905 | ||
895 | #else /* !CONFIG_MODULE_UNLOAD */ | 906 | #else /* !CONFIG_MODULE_UNLOAD */ |
896 | static void print_unload_info(struct seq_file *m, struct module *mod) | 907 | static inline void print_unload_info(struct seq_file *m, struct module *mod) |
897 | { | 908 | { |
898 | /* We don't know the usage count, or what modules are using. */ | 909 | /* We don't know the usage count, or what modules are using. */ |
899 | seq_printf(m, " - -"); | 910 | seq_printf(m, " - -"); |
@@ -1439,8 +1450,6 @@ static void free_module(struct module *mod) | |||
1439 | remove_sect_attrs(mod); | 1450 | remove_sect_attrs(mod); |
1440 | mod_kobject_remove(mod); | 1451 | mod_kobject_remove(mod); |
1441 | 1452 | ||
1442 | unwind_remove_table(mod->unwind_info, 0); | ||
1443 | |||
1444 | /* Arch-specific cleanup. */ | 1453 | /* Arch-specific cleanup. */ |
1445 | module_arch_cleanup(mod); | 1454 | module_arch_cleanup(mod); |
1446 | 1455 | ||
@@ -1578,11 +1587,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1578 | return ret; | 1587 | return ret; |
1579 | } | 1588 | } |
1580 | 1589 | ||
1590 | /* Additional bytes needed by arch in front of individual sections */ | ||
1591 | unsigned int __weak arch_mod_section_prepend(struct module *mod, | ||
1592 | unsigned int section) | ||
1593 | { | ||
1594 | /* default implementation just returns zero */ | ||
1595 | return 0; | ||
1596 | } | ||
1597 | |||
1581 | /* Update size with this section: return offset. */ | 1598 | /* Update size with this section: return offset. */ |
1582 | static long get_offset(unsigned int *size, Elf_Shdr *sechdr) | 1599 | static long get_offset(struct module *mod, unsigned int *size, |
1600 | Elf_Shdr *sechdr, unsigned int section) | ||
1583 | { | 1601 | { |
1584 | long ret; | 1602 | long ret; |
1585 | 1603 | ||
1604 | *size += arch_mod_section_prepend(mod, section); | ||
1586 | ret = ALIGN(*size, sechdr->sh_addralign ?: 1); | 1605 | ret = ALIGN(*size, sechdr->sh_addralign ?: 1); |
1587 | *size = ret + sechdr->sh_size; | 1606 | *size = ret + sechdr->sh_size; |
1588 | return ret; | 1607 | return ret; |
@@ -1622,7 +1641,7 @@ static void layout_sections(struct module *mod, | |||
1622 | || strncmp(secstrings + s->sh_name, | 1641 | || strncmp(secstrings + s->sh_name, |
1623 | ".init", 5) == 0) | 1642 | ".init", 5) == 0) |
1624 | continue; | 1643 | continue; |
1625 | s->sh_entsize = get_offset(&mod->core_size, s); | 1644 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1626 | DEBUGP("\t%s\n", secstrings + s->sh_name); | 1645 | DEBUGP("\t%s\n", secstrings + s->sh_name); |
1627 | } | 1646 | } |
1628 | if (m == 0) | 1647 | if (m == 0) |
@@ -1640,7 +1659,7 @@ static void layout_sections(struct module *mod, | |||
1640 | || strncmp(secstrings + s->sh_name, | 1659 | || strncmp(secstrings + s->sh_name, |
1641 | ".init", 5) != 0) | 1660 | ".init", 5) != 0) |
1642 | continue; | 1661 | continue; |
1643 | s->sh_entsize = (get_offset(&mod->init_size, s) | 1662 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) |
1644 | | INIT_OFFSET_MASK); | 1663 | | INIT_OFFSET_MASK); |
1645 | DEBUGP("\t%s\n", secstrings + s->sh_name); | 1664 | DEBUGP("\t%s\n", secstrings + s->sh_name); |
1646 | } | 1665 | } |
@@ -1725,15 +1744,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
1725 | return NULL; | 1744 | return NULL; |
1726 | } | 1745 | } |
1727 | 1746 | ||
1728 | static int is_exported(const char *name, const struct module *mod) | 1747 | static int is_exported(const char *name, unsigned long value, |
1748 | const struct module *mod) | ||
1729 | { | 1749 | { |
1730 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) | 1750 | const struct kernel_symbol *ks; |
1731 | return 1; | 1751 | if (!mod) |
1752 | ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); | ||
1732 | else | 1753 | else |
1733 | if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) | 1754 | ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); |
1734 | return 1; | 1755 | return ks != NULL && ks->value == value; |
1735 | else | ||
1736 | return 0; | ||
1737 | } | 1756 | } |
1738 | 1757 | ||
1739 | /* As per nm */ | 1758 | /* As per nm */ |
@@ -1847,7 +1866,6 @@ static noinline struct module *load_module(void __user *umod, | |||
1847 | unsigned int symindex = 0; | 1866 | unsigned int symindex = 0; |
1848 | unsigned int strindex = 0; | 1867 | unsigned int strindex = 0; |
1849 | unsigned int modindex, versindex, infoindex, pcpuindex; | 1868 | unsigned int modindex, versindex, infoindex, pcpuindex; |
1850 | unsigned int unwindex = 0; | ||
1851 | unsigned int num_kp, num_mcount; | 1869 | unsigned int num_kp, num_mcount; |
1852 | struct kernel_param *kp; | 1870 | struct kernel_param *kp; |
1853 | struct module *mod; | 1871 | struct module *mod; |
@@ -1865,6 +1883,13 @@ static noinline struct module *load_module(void __user *umod, | |||
1865 | /* vmalloc barfs on "unusual" numbers. Check here */ | 1883 | /* vmalloc barfs on "unusual" numbers. Check here */ |
1866 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) | 1884 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) |
1867 | return ERR_PTR(-ENOMEM); | 1885 | return ERR_PTR(-ENOMEM); |
1886 | |||
1887 | /* Create stop_machine threads since the error path relies on | ||
1888 | * a non-failing stop_machine call. */ | ||
1889 | err = stop_machine_create(); | ||
1890 | if (err) | ||
1891 | goto free_hdr; | ||
1892 | |||
1868 | if (copy_from_user(hdr, umod, len) != 0) { | 1893 | if (copy_from_user(hdr, umod, len) != 0) { |
1869 | err = -EFAULT; | 1894 | err = -EFAULT; |
1870 | goto free_hdr; | 1895 | goto free_hdr; |
@@ -1930,9 +1955,6 @@ static noinline struct module *load_module(void __user *umod, | |||
1930 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | 1955 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); |
1931 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | 1956 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); |
1932 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | 1957 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); |
1933 | #ifdef ARCH_UNWIND_SECTION_NAME | ||
1934 | unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); | ||
1935 | #endif | ||
1936 | 1958 | ||
1937 | /* Don't keep modinfo and version sections. */ | 1959 | /* Don't keep modinfo and version sections. */ |
1938 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 1960 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; |
@@ -1942,8 +1964,6 @@ static noinline struct module *load_module(void __user *umod, | |||
1942 | sechdrs[symindex].sh_flags |= SHF_ALLOC; | 1964 | sechdrs[symindex].sh_flags |= SHF_ALLOC; |
1943 | sechdrs[strindex].sh_flags |= SHF_ALLOC; | 1965 | sechdrs[strindex].sh_flags |= SHF_ALLOC; |
1944 | #endif | 1966 | #endif |
1945 | if (unwindex) | ||
1946 | sechdrs[unwindex].sh_flags |= SHF_ALLOC; | ||
1947 | 1967 | ||
1948 | /* Check module struct version now, before we try to use module. */ | 1968 | /* Check module struct version now, before we try to use module. */ |
1949 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | 1969 | if (!check_modstruct_version(sechdrs, versindex, mod)) { |
@@ -2240,14 +2260,10 @@ static noinline struct module *load_module(void __user *umod, | |||
2240 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2260 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2241 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2261 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2242 | 2262 | ||
2243 | /* Size of section 0 is 0, so this works well if no unwind info. */ | ||
2244 | mod->unwind_info = unwind_add_table(mod, | ||
2245 | (void *)sechdrs[unwindex].sh_addr, | ||
2246 | sechdrs[unwindex].sh_size); | ||
2247 | |||
2248 | /* Get rid of temporary copy */ | 2263 | /* Get rid of temporary copy */ |
2249 | vfree(hdr); | 2264 | vfree(hdr); |
2250 | 2265 | ||
2266 | stop_machine_destroy(); | ||
2251 | /* Done! */ | 2267 | /* Done! */ |
2252 | return mod; | 2268 | return mod; |
2253 | 2269 | ||
@@ -2270,6 +2286,7 @@ static noinline struct module *load_module(void __user *umod, | |||
2270 | kfree(args); | 2286 | kfree(args); |
2271 | free_hdr: | 2287 | free_hdr: |
2272 | vfree(hdr); | 2288 | vfree(hdr); |
2289 | stop_machine_destroy(); | ||
2273 | return ERR_PTR(err); | 2290 | return ERR_PTR(err); |
2274 | 2291 | ||
2275 | truncated: | 2292 | truncated: |
@@ -2337,11 +2354,12 @@ sys_init_module(void __user *umod, | |||
2337 | /* Now it's a first class citizen! Wake up anyone waiting for it. */ | 2354 | /* Now it's a first class citizen! Wake up anyone waiting for it. */ |
2338 | mod->state = MODULE_STATE_LIVE; | 2355 | mod->state = MODULE_STATE_LIVE; |
2339 | wake_up(&module_wq); | 2356 | wake_up(&module_wq); |
2357 | blocking_notifier_call_chain(&module_notify_list, | ||
2358 | MODULE_STATE_LIVE, mod); | ||
2340 | 2359 | ||
2341 | mutex_lock(&module_mutex); | 2360 | mutex_lock(&module_mutex); |
2342 | /* Drop initial reference. */ | 2361 | /* Drop initial reference. */ |
2343 | module_put(mod); | 2362 | module_put(mod); |
2344 | unwind_remove_table(mod->unwind_info, 1); | ||
2345 | module_free(mod, mod->module_init); | 2363 | module_free(mod, mod->module_init); |
2346 | mod->module_init = NULL; | 2364 | mod->module_init = NULL; |
2347 | mod->init_size = 0; | 2365 | mod->init_size = 0; |
@@ -2376,7 +2394,7 @@ static const char *get_ksymbol(struct module *mod, | |||
2376 | unsigned long nextval; | 2394 | unsigned long nextval; |
2377 | 2395 | ||
2378 | /* At worst, next value is at end of module */ | 2396 | /* At worst, next value is at end of module */ |
2379 | if (within(addr, mod->module_init, mod->init_size)) | 2397 | if (within_module_init(addr, mod)) |
2380 | nextval = (unsigned long)mod->module_init+mod->init_text_size; | 2398 | nextval = (unsigned long)mod->module_init+mod->init_text_size; |
2381 | else | 2399 | else |
2382 | nextval = (unsigned long)mod->module_core+mod->core_text_size; | 2400 | nextval = (unsigned long)mod->module_core+mod->core_text_size; |
@@ -2424,8 +2442,8 @@ const char *module_address_lookup(unsigned long addr, | |||
2424 | 2442 | ||
2425 | preempt_disable(); | 2443 | preempt_disable(); |
2426 | list_for_each_entry_rcu(mod, &modules, list) { | 2444 | list_for_each_entry_rcu(mod, &modules, list) { |
2427 | if (within(addr, mod->module_init, mod->init_size) | 2445 | if (within_module_init(addr, mod) || |
2428 | || within(addr, mod->module_core, mod->core_size)) { | 2446 | within_module_core(addr, mod)) { |
2429 | if (modname) | 2447 | if (modname) |
2430 | *modname = mod->name; | 2448 | *modname = mod->name; |
2431 | ret = get_ksymbol(mod, addr, size, offset); | 2449 | ret = get_ksymbol(mod, addr, size, offset); |
@@ -2447,8 +2465,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) | |||
2447 | 2465 | ||
2448 | preempt_disable(); | 2466 | preempt_disable(); |
2449 | list_for_each_entry_rcu(mod, &modules, list) { | 2467 | list_for_each_entry_rcu(mod, &modules, list) { |
2450 | if (within(addr, mod->module_init, mod->init_size) || | 2468 | if (within_module_init(addr, mod) || |
2451 | within(addr, mod->module_core, mod->core_size)) { | 2469 | within_module_core(addr, mod)) { |
2452 | const char *sym; | 2470 | const char *sym; |
2453 | 2471 | ||
2454 | sym = get_ksymbol(mod, addr, NULL, NULL); | 2472 | sym = get_ksymbol(mod, addr, NULL, NULL); |
@@ -2471,8 +2489,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
2471 | 2489 | ||
2472 | preempt_disable(); | 2490 | preempt_disable(); |
2473 | list_for_each_entry_rcu(mod, &modules, list) { | 2491 | list_for_each_entry_rcu(mod, &modules, list) { |
2474 | if (within(addr, mod->module_init, mod->init_size) || | 2492 | if (within_module_init(addr, mod) || |
2475 | within(addr, mod->module_core, mod->core_size)) { | 2493 | within_module_core(addr, mod)) { |
2476 | const char *sym; | 2494 | const char *sym; |
2477 | 2495 | ||
2478 | sym = get_ksymbol(mod, addr, size, offset); | 2496 | sym = get_ksymbol(mod, addr, size, offset); |
@@ -2504,7 +2522,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
2504 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, | 2522 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, |
2505 | KSYM_NAME_LEN); | 2523 | KSYM_NAME_LEN); |
2506 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); | 2524 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); |
2507 | *exported = is_exported(name, mod); | 2525 | *exported = is_exported(name, *value, mod); |
2508 | preempt_enable(); | 2526 | preempt_enable(); |
2509 | return 0; | 2527 | return 0; |
2510 | } | 2528 | } |
@@ -2691,7 +2709,7 @@ int is_module_address(unsigned long addr) | |||
2691 | preempt_disable(); | 2709 | preempt_disable(); |
2692 | 2710 | ||
2693 | list_for_each_entry_rcu(mod, &modules, list) { | 2711 | list_for_each_entry_rcu(mod, &modules, list) { |
2694 | if (within(addr, mod->module_core, mod->core_size)) { | 2712 | if (within_module_core(addr, mod)) { |
2695 | preempt_enable(); | 2713 | preempt_enable(); |
2696 | return 1; | 2714 | return 1; |
2697 | } | 2715 | } |
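The open-coded within(addr, base, size) checks in module.c are replaced by within_module_init()/within_module_core(). Those helpers are defined elsewhere (presumably as simple inline range checks in include/linux/module.h); a sketch of what they are assumed to look like:

	/* Assumed shape of the new helpers: half-open range checks. */
	static inline int within_module_core(unsigned long addr,
					     struct module *mod)
	{
		return (unsigned long)mod->module_core <= addr &&
		       addr < (unsigned long)mod->module_core + mod->core_size;
	}

	static inline int within_module_init(unsigned long addr,
					     struct module *mod)
	{
		return (unsigned long)mod->module_init <= addr &&
		       addr < (unsigned long)mod->module_init + mod->init_size;
	}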
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 43c2111cd54d..78bc3fdac0d2 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c | |||
@@ -13,7 +13,6 @@ | |||
13 | 13 | ||
14 | struct ns_cgroup { | 14 | struct ns_cgroup { |
15 | struct cgroup_subsys_state css; | 15 | struct cgroup_subsys_state css; |
16 | spinlock_t lock; | ||
17 | }; | 16 | }; |
18 | 17 | ||
19 | struct cgroup_subsys ns_subsys; | 18 | struct cgroup_subsys ns_subsys; |
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, | |||
84 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); | 83 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); |
85 | if (!ns_cgroup) | 84 | if (!ns_cgroup) |
86 | return ERR_PTR(-ENOMEM); | 85 | return ERR_PTR(-ENOMEM); |
87 | spin_lock_init(&ns_cgroup->lock); | ||
88 | return &ns_cgroup->css; | 86 | return &ns_cgroup->css; |
89 | } | 87 | } |
90 | 88 | ||
diff --git a/kernel/panic.c b/kernel/panic.c index 13f06349a786..2a2ff36ff44d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -299,6 +299,8 @@ static int init_oops_id(void) | |||
299 | { | 299 | { |
300 | if (!oops_id) | 300 | if (!oops_id) |
301 | get_random_bytes(&oops_id, sizeof(oops_id)); | 301 | get_random_bytes(&oops_id, sizeof(oops_id)); |
302 | else | ||
303 | oops_id++; | ||
302 | 304 | ||
303 | return 0; | 305 | return 0; |
304 | } | 306 | } |
diff --git a/kernel/pid.c b/kernel/pid.c index 064e76afa507..1b3586fe753a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -474,8 +474,14 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) | |||
474 | } | 474 | } |
475 | EXPORT_SYMBOL(task_session_nr_ns); | 475 | EXPORT_SYMBOL(task_session_nr_ns); |
476 | 476 | ||
477 | struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) | ||
478 | { | ||
479 | return ns_of_pid(task_pid(tsk)); | ||
480 | } | ||
481 | EXPORT_SYMBOL_GPL(task_active_pid_ns); | ||
482 | |||
477 | /* | 483 | /* |
478 | * Used by proc to find the first pid that is greater then or equal to nr. | 484 | * Used by proc to find the first pid that is greater than or equal to nr. |
479 | * | 485 | * |
480 | * If there is a pid at nr this function is exactly the same as find_pid_ns. | 486 | * If there is a pid at nr this function is exactly the same as find_pid_ns. |
481 | */ | 487 | */ |
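The new task_active_pid_ns() helper resolves a task's pid namespace from its struct pid. A small usage sketch, mirroring how the signal.c hunk further down translates the sender's tgid into the receiver's namespace (the helper name tgid_as_seen_by() is illustrative):

	/* Sketch: report current's tgid as task 't' would see it. */
	static pid_t tgid_as_seen_by(struct task_struct *t)
	{
		struct pid_namespace *ns = task_active_pid_ns(t);

		return task_tgid_nr_ns(current, ns);
	}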
diff --git a/kernel/power/main.c b/kernel/power/main.c index 613f16941b85..239988873971 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
615 | /* this may fail if the RTC hasn't been initialized */ | 615 | /* this may fail if the RTC hasn't been initialized */ |
616 | status = rtc_read_time(rtc, &alm.time); | 616 | status = rtc_read_time(rtc, &alm.time); |
617 | if (status < 0) { | 617 | if (status < 0) { |
618 | printk(err_readtime, rtc->dev.bus_id, status); | 618 | printk(err_readtime, dev_name(&rtc->dev), status); |
619 | return; | 619 | return; |
620 | } | 620 | } |
621 | rtc_tm_to_time(&alm.time, &now); | 621 | rtc_tm_to_time(&alm.time, &now); |
@@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
626 | 626 | ||
627 | status = rtc_set_alarm(rtc, &alm); | 627 | status = rtc_set_alarm(rtc, &alm); |
628 | if (status < 0) { | 628 | if (status < 0) { |
629 | printk(err_wakealarm, rtc->dev.bus_id, status); | 629 | printk(err_wakealarm, dev_name(&rtc->dev), status); |
630 | return; | 630 | return; |
631 | } | 631 | } |
632 | 632 | ||
@@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) | |||
660 | if (!device_may_wakeup(candidate->dev.parent)) | 660 | if (!device_may_wakeup(candidate->dev.parent)) |
661 | return 0; | 661 | return 0; |
662 | 662 | ||
663 | *(char **)name_ptr = dev->bus_id; | 663 | *(const char **)name_ptr = dev_name(dev); |
664 | return 1; | 664 | return 1; |
665 | } | 665 | } |
666 | 666 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index e651ab05655f..7015733793e8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) | |||
619 | static const char recursion_bug_msg [] = | 619 | static const char recursion_bug_msg [] = |
620 | KERN_CRIT "BUG: recent printk recursion!\n"; | 620 | KERN_CRIT "BUG: recent printk recursion!\n"; |
621 | static int recursion_bug; | 621 | static int recursion_bug; |
622 | static int new_text_line = 1; | 622 | static int new_text_line = 1; |
623 | static char printk_buf[1024]; | 623 | static char printk_buf[1024]; |
624 | 624 | ||
625 | asmlinkage int vprintk(const char *fmt, va_list args) | 625 | asmlinkage int vprintk(const char *fmt, va_list args) |
diff --git a/kernel/profile.c b/kernel/profile.c index d18e2d2654f2..784933acf5b8 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -445,7 +445,6 @@ void profile_tick(int type) | |||
445 | #ifdef CONFIG_PROC_FS | 445 | #ifdef CONFIG_PROC_FS |
446 | #include <linux/proc_fs.h> | 446 | #include <linux/proc_fs.h> |
447 | #include <asm/uaccess.h> | 447 | #include <asm/uaccess.h> |
448 | #include <asm/ptrace.h> | ||
449 | 448 | ||
450 | static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, | 449 | static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, |
451 | int count, int *eof, void *data) | 450 | int count, int *eof, void *data) |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ad63af8b2521..d92a76a881aa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head) | |||
77 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 77 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), |
78 | * and may be nested. | 78 | * and may be nested. |
79 | */ | 79 | */ |
80 | void synchronize_rcu(void); /* Makes kernel-doc tools happy */ | 80 | void synchronize_rcu(void) |
81 | synchronize_rcu_xxx(synchronize_rcu, call_rcu) | 81 | { |
82 | struct rcu_synchronize rcu; | ||
83 | init_completion(&rcu.completion); | ||
84 | /* Will wake me after RCU finished. */ | ||
85 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
86 | /* Wait for it. */ | ||
87 | wait_for_completion(&rcu.completion); | ||
88 | } | ||
82 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 89 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
83 | 90 | ||
84 | static void rcu_barrier_callback(struct rcu_head *notused) | 91 | static void rcu_barrier_callback(struct rcu_head *notused) |
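With the synchronize_rcu_xxx() macro gone, synchronize_rcu() is open-coded here: it queues a callback with call_rcu() and sleeps on a completion that wakeme_after_rcu() signals after a grace period. For context, a minimal updater-side sketch of how the primitive is typically used (struct foo, global_ptr, old and new are illustrative names):

	/* Sketch: publish a new version, wait out pre-existing readers,
	 * then free the old version. */
	struct foo *old = global_ptr;

	rcu_assign_pointer(global_ptr, new);
	synchronize_rcu();	/* all readers that could see 'old' are done */
	kfree(old);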
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index f9dc8f3720f6..33cfc50781f9 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c | |||
@@ -1177,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); | |||
1177 | * in -rt this does -not- necessarily result in all currently executing | 1177 | * in -rt this does -not- necessarily result in all currently executing |
1178 | * interrupt -handlers- having completed. | 1178 | * interrupt -handlers- having completed. |
1179 | */ | 1179 | */ |
1180 | synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) | 1180 | void __synchronize_sched(void) |
1181 | { | ||
1182 | struct rcu_synchronize rcu; | ||
1183 | |||
1184 | init_completion(&rcu.completion); | ||
1185 | /* Will wake me after RCU finished. */ | ||
1186 | call_rcu_sched(&rcu.head, wakeme_after_rcu); | ||
1187 | /* Wait for it. */ | ||
1188 | wait_for_completion(&rcu.completion); | ||
1189 | } | ||
1181 | EXPORT_SYMBOL_GPL(__synchronize_sched); | 1190 | EXPORT_SYMBOL_GPL(__synchronize_sched); |
1182 | 1191 | ||
1183 | /* | 1192 | /* |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 3245b40952c6..1cff28db56b6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -136,7 +136,7 @@ static int stutter_pause_test = 0; | |||
136 | #endif | 136 | #endif |
137 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 137 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
138 | 138 | ||
139 | #define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ | 139 | #define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */ |
140 | #define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ | 140 | #define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ |
141 | static int fullstop; /* stop generating callbacks at test end. */ | 141 | static int fullstop; /* stop generating callbacks at test end. */ |
142 | DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ | 142 | DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ |
@@ -151,12 +151,10 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
151 | { | 151 | { |
152 | if (fullstop) | 152 | if (fullstop) |
153 | return NOTIFY_DONE; | 153 | return NOTIFY_DONE; |
154 | if (signal_pending(current)) { | 154 | mutex_lock(&fullstop_mutex); |
155 | mutex_lock(&fullstop_mutex); | 155 | if (!fullstop) |
156 | if (!ACCESS_ONCE(fullstop)) | 156 | fullstop = FULLSTOP_SHUTDOWN; |
157 | fullstop = FULLSTOP_SIGNALED; | 157 | mutex_unlock(&fullstop_mutex); |
158 | mutex_unlock(&fullstop_mutex); | ||
159 | } | ||
160 | return NOTIFY_DONE; | 158 | return NOTIFY_DONE; |
161 | } | 159 | } |
162 | 160 | ||
@@ -624,7 +622,7 @@ rcu_torture_writer(void *arg) | |||
624 | rcu_stutter_wait(); | 622 | rcu_stutter_wait(); |
625 | } while (!kthread_should_stop() && !fullstop); | 623 | } while (!kthread_should_stop() && !fullstop); |
626 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 624 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
627 | while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) | 625 | while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) |
628 | schedule_timeout_uninterruptible(1); | 626 | schedule_timeout_uninterruptible(1); |
629 | return 0; | 627 | return 0; |
630 | } | 628 | } |
@@ -649,7 +647,7 @@ rcu_torture_fakewriter(void *arg) | |||
649 | } while (!kthread_should_stop() && !fullstop); | 647 | } while (!kthread_should_stop() && !fullstop); |
650 | 648 | ||
651 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); | 649 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); |
652 | while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) | 650 | while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) |
653 | schedule_timeout_uninterruptible(1); | 651 | schedule_timeout_uninterruptible(1); |
654 | return 0; | 652 | return 0; |
655 | } | 653 | } |
@@ -759,7 +757,7 @@ rcu_torture_reader(void *arg) | |||
759 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 757 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
760 | if (irqreader && cur_ops->irqcapable) | 758 | if (irqreader && cur_ops->irqcapable) |
761 | del_timer_sync(&t); | 759 | del_timer_sync(&t); |
762 | while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) | 760 | while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) |
763 | schedule_timeout_uninterruptible(1); | 761 | schedule_timeout_uninterruptible(1); |
764 | return 0; | 762 | return 0; |
765 | } | 763 | } |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a342b032112c..f2d8638e6c60 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | |||
79 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 79 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
80 | 80 | ||
81 | #ifdef CONFIG_NO_HZ | 81 | #ifdef CONFIG_NO_HZ |
82 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); | 82 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
83 | .dynticks_nesting = 1, | ||
84 | .dynticks = 1, | ||
85 | }; | ||
83 | #endif /* #ifdef CONFIG_NO_HZ */ | 86 | #endif /* #ifdef CONFIG_NO_HZ */ |
84 | 87 | ||
85 | static int blimit = 10; /* Maximum callbacks per softirq. */ | 88 | static int blimit = 10; /* Maximum callbacks per softirq. */ |
@@ -572,6 +575,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
572 | /* Special-case the common single-level case. */ | 575 | /* Special-case the common single-level case. */ |
573 | if (NUM_RCU_NODES == 1) { | 576 | if (NUM_RCU_NODES == 1) { |
574 | rnp->qsmask = rnp->qsmaskinit; | 577 | rnp->qsmask = rnp->qsmaskinit; |
578 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | ||
575 | spin_unlock_irqrestore(&rnp->lock, flags); | 579 | spin_unlock_irqrestore(&rnp->lock, flags); |
576 | return; | 580 | return; |
577 | } | 581 | } |
@@ -1379,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1379 | 1383 | ||
1380 | static void __cpuinit rcu_online_cpu(int cpu) | 1384 | static void __cpuinit rcu_online_cpu(int cpu) |
1381 | { | 1385 | { |
1382 | #ifdef CONFIG_NO_HZ | ||
1383 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1384 | |||
1385 | rdtp->dynticks_nesting = 1; | ||
1386 | rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */ | ||
1387 | rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1; | ||
1388 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
1389 | rcu_init_percpu_data(cpu, &rcu_state); | 1386 | rcu_init_percpu_data(cpu, &rcu_state); |
1390 | rcu_init_percpu_data(cpu, &rcu_bh_state); | 1387 | rcu_init_percpu_data(cpu, &rcu_bh_state); |
1391 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 1388 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index f275c8eca772..bf8e7534c803 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -15,10 +15,11 @@ | |||
15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | 17 | ||
18 | void res_counter_init(struct res_counter *counter) | 18 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) |
19 | { | 19 | { |
20 | spin_lock_init(&counter->lock); | 20 | spin_lock_init(&counter->lock); |
21 | counter->limit = (unsigned long long)LLONG_MAX; | 21 | counter->limit = (unsigned long long)LLONG_MAX; |
22 | counter->parent = parent; | ||
22 | } | 23 | } |
23 | 24 | ||
24 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | 25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) |
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | |||
34 | return 0; | 35 | return 0; |
35 | } | 36 | } |
36 | 37 | ||
37 | int res_counter_charge(struct res_counter *counter, unsigned long val) | 38 | int res_counter_charge(struct res_counter *counter, unsigned long val, |
39 | struct res_counter **limit_fail_at) | ||
38 | { | 40 | { |
39 | int ret; | 41 | int ret; |
40 | unsigned long flags; | 42 | unsigned long flags; |
41 | 43 | struct res_counter *c, *u; | |
42 | spin_lock_irqsave(&counter->lock, flags); | 44 | |
43 | ret = res_counter_charge_locked(counter, val); | 45 | *limit_fail_at = NULL; |
44 | spin_unlock_irqrestore(&counter->lock, flags); | 46 | local_irq_save(flags); |
47 | for (c = counter; c != NULL; c = c->parent) { | ||
48 | spin_lock(&c->lock); | ||
49 | ret = res_counter_charge_locked(c, val); | ||
50 | spin_unlock(&c->lock); | ||
51 | if (ret < 0) { | ||
52 | *limit_fail_at = c; | ||
53 | goto undo; | ||
54 | } | ||
55 | } | ||
56 | ret = 0; | ||
57 | goto done; | ||
58 | undo: | ||
59 | for (u = counter; u != c; u = u->parent) { | ||
60 | spin_lock(&u->lock); | ||
61 | res_counter_uncharge_locked(u, val); | ||
62 | spin_unlock(&u->lock); | ||
63 | } | ||
64 | done: | ||
65 | local_irq_restore(flags); | ||
45 | return ret; | 66 | return ret; |
46 | } | 67 | } |
47 | 68 | ||
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | |||
56 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | 77 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) |
57 | { | 78 | { |
58 | unsigned long flags; | 79 | unsigned long flags; |
80 | struct res_counter *c; | ||
59 | 81 | ||
60 | spin_lock_irqsave(&counter->lock, flags); | 82 | local_irq_save(flags); |
61 | res_counter_uncharge_locked(counter, val); | 83 | for (c = counter; c != NULL; c = c->parent) { |
62 | spin_unlock_irqrestore(&counter->lock, flags); | 84 | spin_lock(&c->lock); |
85 | res_counter_uncharge_locked(c, val); | ||
86 | spin_unlock(&c->lock); | ||
87 | } | ||
88 | local_irq_restore(flags); | ||
63 | } | 89 | } |
64 | 90 | ||
65 | 91 | ||
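res_counter_charge() now walks the parent chain, charging every ancestor and rolling back the levels already charged if one of them hits its limit; the failing counter is reported through the new limit_fail_at argument. A caller-side sketch (cnt and reclaim_from() are hypothetical names):

	/* Sketch: charge one page against a hierarchical counter. */
	struct res_counter *fail_at;
	int ret;

	ret = res_counter_charge(cnt, PAGE_SIZE, &fail_at);
	if (ret)
		/* fail_at is the ancestor whose limit was exceeded */
		reclaim_from(fail_at);	/* hypothetical recovery step */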
diff --git a/kernel/resource.c b/kernel/resource.c index e633106b12f6..ca6a1536b205 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res) | |||
623 | */ | 623 | */ |
624 | struct resource * __request_region(struct resource *parent, | 624 | struct resource * __request_region(struct resource *parent, |
625 | resource_size_t start, resource_size_t n, | 625 | resource_size_t start, resource_size_t n, |
626 | const char *name) | 626 | const char *name, int flags) |
627 | { | 627 | { |
628 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); | 628 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); |
629 | 629 | ||
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent, | |||
634 | res->start = start; | 634 | res->start = start; |
635 | res->end = start + n - 1; | 635 | res->end = start + n - 1; |
636 | res->flags = IORESOURCE_BUSY; | 636 | res->flags = IORESOURCE_BUSY; |
637 | res->flags |= flags; | ||
637 | 638 | ||
638 | write_lock(&resource_lock); | 639 | write_lock(&resource_lock); |
639 | 640 | ||
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start, | |||
679 | { | 680 | { |
680 | struct resource * res; | 681 | struct resource * res; |
681 | 682 | ||
682 | res = __request_region(parent, start, n, "check-region"); | 683 | res = __request_region(parent, start, n, "check-region", 0); |
683 | if (!res) | 684 | if (!res) |
684 | return -EBUSY; | 685 | return -EBUSY; |
685 | 686 | ||
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev, | |||
776 | dr->start = start; | 777 | dr->start = start; |
777 | dr->n = n; | 778 | dr->n = n; |
778 | 779 | ||
779 | res = __request_region(parent, start, n, name); | 780 | res = __request_region(parent, start, n, name, 0); |
780 | if (res) | 781 | if (res) |
781 | devres_add(dev, dr); | 782 | devres_add(dev, dr); |
782 | else | 783 | else |
@@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) | |||
876 | 877 | ||
877 | return err; | 878 | return err; |
878 | } | 879 | } |
880 | |||
881 | #ifdef CONFIG_STRICT_DEVMEM | ||
882 | static int strict_iomem_checks = 1; | ||
883 | #else | ||
884 | static int strict_iomem_checks; | ||
885 | #endif | ||
886 | |||
887 | /* | ||
888 | * check if an address is reserved in the iomem resource tree | ||
889 | * returns 1 if reserved, 0 if not reserved. | ||
890 | */ | ||
891 | int iomem_is_exclusive(u64 addr) | ||
892 | { | ||
893 | struct resource *p = &iomem_resource; | ||
894 | int err = 0; | ||
895 | loff_t l; | ||
896 | int size = PAGE_SIZE; | ||
897 | |||
898 | if (!strict_iomem_checks) | ||
899 | return 0; | ||
900 | |||
901 | addr = addr & PAGE_MASK; | ||
902 | |||
903 | read_lock(&resource_lock); | ||
904 | for (p = p->child; p ; p = r_next(NULL, p, &l)) { | ||
905 | /* | ||
906 | * We can probably skip the resources without | ||
907 | * IORESOURCE_IO attribute? | ||
908 | */ | ||
909 | if (p->start >= addr + size) | ||
910 | break; | ||
911 | if (p->end < addr) | ||
912 | continue; | ||
913 | if (p->flags & IORESOURCE_BUSY && | ||
914 | p->flags & IORESOURCE_EXCLUSIVE) { | ||
915 | err = 1; | ||
916 | break; | ||
917 | } | ||
918 | } | ||
919 | read_unlock(&resource_lock); | ||
920 | |||
921 | return err; | ||
922 | } | ||
923 | |||
924 | static int __init strict_iomem(char *str) | ||
925 | { | ||
926 | if (strstr(str, "relaxed")) | ||
927 | strict_iomem_checks = 0; | ||
928 | if (strstr(str, "strict")) | ||
929 | strict_iomem_checks = 1; | ||
930 | return 1; | ||
931 | } | ||
932 | |||
933 | __setup("iomem=", strict_iomem); | ||
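__request_region() gains a flags argument so a caller can mark a busy region IORESOURCE_EXCLUSIVE, and iomem_is_exclusive() lets strict /dev/mem-style checks refuse pages inside such regions (the new iomem= boot parameter switches between strict and relaxed checking). A driver-side sketch (base, size and the "mydev mmio" label are illustrative):

	/* Sketch: claim an MMIO window and keep userspace mappers out. */
	struct resource *res;

	res = __request_region(&iomem_resource, base, size,
			       "mydev mmio", IORESOURCE_EXCLUSIVE);
	if (!res)
		return -EBUSY;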
diff --git a/kernel/sched.c b/kernel/sched.c index 545c6fccd1dc..deb5ac8c12f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -3728,8 +3728,13 @@ redo: | |||
3728 | } | 3728 | } |
3729 | 3729 | ||
3730 | double_unlock_balance(this_rq, busiest); | 3730 | double_unlock_balance(this_rq, busiest); |
3731 | /* | ||
3732 | * Should not call ttwu while holding a rq->lock | ||
3733 | */ | ||
3734 | spin_unlock(&this_rq->lock); | ||
3731 | if (active_balance) | 3735 | if (active_balance) |
3732 | wake_up_process(busiest->migration_thread); | 3736 | wake_up_process(busiest->migration_thread); |
3737 | spin_lock(&this_rq->lock); | ||
3733 | 3738 | ||
3734 | } else | 3739 | } else |
3735 | sd->nr_balance_failed = 0; | 3740 | sd->nr_balance_failed = 0; |
@@ -6957,7 +6962,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6957 | spin_unlock_irqrestore(&rq->lock, flags); | 6962 | spin_unlock_irqrestore(&rq->lock, flags); |
6958 | } | 6963 | } |
6959 | 6964 | ||
6960 | static int init_rootdomain(struct root_domain *rd, bool bootmem) | 6965 | static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) |
6961 | { | 6966 | { |
6962 | memset(rd, 0, sizeof(*rd)); | 6967 | memset(rd, 0, sizeof(*rd)); |
6963 | 6968 | ||
@@ -6970,7 +6975,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) | |||
6970 | } | 6975 | } |
6971 | 6976 | ||
6972 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) | 6977 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) |
6973 | goto free_rd; | 6978 | goto out; |
6974 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) | 6979 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
6975 | goto free_span; | 6980 | goto free_span; |
6976 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 6981 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
@@ -6986,8 +6991,7 @@ free_online: | |||
6986 | free_cpumask_var(rd->online); | 6991 | free_cpumask_var(rd->online); |
6987 | free_span: | 6992 | free_span: |
6988 | free_cpumask_var(rd->span); | 6993 | free_cpumask_var(rd->span); |
6989 | free_rd: | 6994 | out: |
6990 | kfree(rd); | ||
6991 | return -ENOMEM; | 6995 | return -ENOMEM; |
6992 | } | 6996 | } |
6993 | 6997 | ||
@@ -7987,7 +7991,7 @@ match2: | |||
7987 | } | 7991 | } |
7988 | 7992 | ||
7989 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7993 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7990 | int arch_reinit_sched_domains(void) | 7994 | static void arch_reinit_sched_domains(void) |
7991 | { | 7995 | { |
7992 | get_online_cpus(); | 7996 | get_online_cpus(); |
7993 | 7997 | ||
@@ -7996,13 +8000,10 @@ int arch_reinit_sched_domains(void) | |||
7996 | 8000 | ||
7997 | rebuild_sched_domains(); | 8001 | rebuild_sched_domains(); |
7998 | put_online_cpus(); | 8002 | put_online_cpus(); |
7999 | |||
8000 | return 0; | ||
8001 | } | 8003 | } |
8002 | 8004 | ||
8003 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | 8005 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) |
8004 | { | 8006 | { |
8005 | int ret; | ||
8006 | unsigned int level = 0; | 8007 | unsigned int level = 0; |
8007 | 8008 | ||
8008 | if (sscanf(buf, "%u", &level) != 1) | 8009 | if (sscanf(buf, "%u", &level) != 1) |
@@ -8023,9 +8024,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
8023 | else | 8024 | else |
8024 | sched_mc_power_savings = level; | 8025 | sched_mc_power_savings = level; |
8025 | 8026 | ||
8026 | ret = arch_reinit_sched_domains(); | 8027 | arch_reinit_sched_domains(); |
8027 | 8028 | ||
8028 | return ret ? ret : count; | 8029 | return count; |
8029 | } | 8030 | } |
8030 | 8031 | ||
8031 | #ifdef CONFIG_SCHED_MC | 8032 | #ifdef CONFIG_SCHED_MC |
@@ -8060,7 +8061,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, | |||
8060 | sched_smt_power_savings_store); | 8061 | sched_smt_power_savings_store); |
8061 | #endif | 8062 | #endif |
8062 | 8063 | ||
8063 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | 8064 | int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) |
8064 | { | 8065 | { |
8065 | int err = 0; | 8066 | int err = 0; |
8066 | 8067 | ||
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096ddfe3..a0b0852414cc 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) | |||
124 | 124 | ||
125 | clock = scd->tick_gtod + delta; | 125 | clock = scd->tick_gtod + delta; |
126 | min_clock = wrap_max(scd->tick_gtod, scd->clock); | 126 | min_clock = wrap_max(scd->tick_gtod, scd->clock); |
127 | max_clock = scd->tick_gtod + TICK_NSEC; | 127 | max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); |
128 | 128 | ||
129 | clock = wrap_max(clock, min_clock); | 129 | clock = wrap_max(clock, min_clock); |
130 | clock = wrap_min(clock, max_clock); | 130 | clock = wrap_min(clock, max_clock); |
@@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); | |||
227 | */ | 227 | */ |
228 | void sched_clock_idle_wakeup_event(u64 delta_ns) | 228 | void sched_clock_idle_wakeup_event(u64 delta_ns) |
229 | { | 229 | { |
230 | if (timekeeping_suspended) | ||
231 | return; | ||
232 | |||
230 | sched_clock_tick(); | 233 | sched_clock_tick(); |
231 | touch_softlockup_watchdog(); | 234 | touch_softlockup_watchdog(); |
232 | } | 235 | } |
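The max_clock fix keeps the per-CPU clock from being clamped below a value it has already returned when the GTOD base lags behind. The clamp relies on wrap_max()/wrap_min(), defined earlier in this file; they are presumably wrap-safe comparisons along these lines (a sketch, not quoted from the source):

	/* Assumed shape of the wrap-safe min/max used by the clamp. */
	static inline u64 wrap_max(u64 x, u64 y)
	{
		return (s64)(x - y) > 0 ? x : y;
	}

	static inline u64 wrap_min(u64 x, u64 y)
	{
		return (s64)(x - y) < 0 ? x : y;
	}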
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 018b7be1db2e..1e00bfacf9b8 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -151,7 +151,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
151 | * | 151 | * |
152 | * Returns: -ENOMEM if memory fails. | 152 | * Returns: -ENOMEM if memory fails. |
153 | */ | 153 | */ |
154 | int cpupri_init(struct cpupri *cp, bool bootmem) | 154 | int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) |
155 | { | 155 | { |
156 | int i; | 156 | int i; |
157 | 157 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 56c0efe902a7..8e1352c75557 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, | |||
386 | #endif | 386 | #endif |
387 | 387 | ||
388 | /* | 388 | /* |
389 | * delta *= P[w / rw] | ||
390 | */ | ||
391 | static inline unsigned long | ||
392 | calc_delta_weight(unsigned long delta, struct sched_entity *se) | ||
393 | { | ||
394 | for_each_sched_entity(se) { | ||
395 | delta = calc_delta_mine(delta, | ||
396 | se->load.weight, &cfs_rq_of(se)->load); | ||
397 | } | ||
398 | |||
399 | return delta; | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * delta /= w | 389 | * delta /= w |
404 | */ | 390 | */ |
405 | static inline unsigned long | 391 | static inline unsigned long |
@@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running) | |||
440 | */ | 426 | */ |
441 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) | 427 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
442 | { | 428 | { |
443 | unsigned long nr_running = cfs_rq->nr_running; | 429 | u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); |
430 | |||
431 | for_each_sched_entity(se) { | ||
432 | struct load_weight *load = &cfs_rq->load; | ||
444 | 433 | ||
445 | if (unlikely(!se->on_rq)) | 434 | if (unlikely(!se->on_rq)) { |
446 | nr_running++; | 435 | struct load_weight lw = cfs_rq->load; |
447 | 436 | ||
448 | return calc_delta_weight(__sched_period(nr_running), se); | 437 | update_load_add(&lw, se->load.weight); |
438 | load = &lw; | ||
439 | } | ||
440 | slice = calc_delta_mine(slice, se->load.weight, load); | ||
441 | } | ||
442 | return slice; | ||
449 | } | 443 | } |
450 | 444 | ||
451 | /* | 445 | /* |
@@ -1623,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
1623 | } | 1617 | } |
1624 | } | 1618 | } |
1625 | 1619 | ||
1626 | #define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) | ||
1627 | |||
1628 | /* | 1620 | /* |
1629 | * Share the fairness runtime between parent and child, thus the | 1621 | * Share the fairness runtime between parent and child, thus the |
1630 | * total amount of pressure for CPU stays equal - new tasks | 1622 | * total amount of pressure for CPU stays equal - new tasks |
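The reworked sched_slice() scales the period by the entity's weight against its runqueue load at every level of the group hierarchy, taking the load as it would be with the entity enqueued. A rough worked example with assumed numbers: for a 20 ms period and a task of weight 1024 on a cfs_rq whose load including the task is 3072,

	slice = 20 ms * 1024 / 3072 ~= 6.7 ms

and if that cfs_rq belongs to a group entity holding weight 512 of 2048 on its parent runqueue, the slice is scaled again to roughly 6.7 ms * 512 / 2048 ~= 1.7 ms. The figures are illustrative; calc_delta_mine() performs the same weight/load scaling with fixed-point inverses.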
diff --git a/kernel/signal.c b/kernel/signal.c index 8e95855ff3cf..3152ac3b62e2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
858 | q->info.si_signo = sig; | 858 | q->info.si_signo = sig; |
859 | q->info.si_errno = 0; | 859 | q->info.si_errno = 0; |
860 | q->info.si_code = SI_USER; | 860 | q->info.si_code = SI_USER; |
861 | q->info.si_pid = task_pid_vnr(current); | 861 | q->info.si_pid = task_tgid_nr_ns(current, |
862 | task_active_pid_ns(t)); | ||
862 | q->info.si_uid = current_uid(); | 863 | q->info.si_uid = current_uid(); |
863 | break; | 864 | break; |
864 | case (unsigned long) SEND_SIG_PRIV: | 865 | case (unsigned long) SEND_SIG_PRIV: |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 286c41722e8c..0cd415ee62a2 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -38,7 +38,10 @@ struct stop_machine_data { | |||
38 | static unsigned int num_threads; | 38 | static unsigned int num_threads; |
39 | static atomic_t thread_ack; | 39 | static atomic_t thread_ack; |
40 | static DEFINE_MUTEX(lock); | 40 | static DEFINE_MUTEX(lock); |
41 | 41 | /* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ | |
42 | static DEFINE_MUTEX(setup_lock); | ||
43 | /* Users of stop_machine. */ | ||
44 | static int refcount; | ||
42 | static struct workqueue_struct *stop_machine_wq; | 45 | static struct workqueue_struct *stop_machine_wq; |
43 | static struct stop_machine_data active, idle; | 46 | static struct stop_machine_data active, idle; |
44 | static const cpumask_t *active_cpus; | 47 | static const cpumask_t *active_cpus; |
@@ -109,6 +112,43 @@ static int chill(void *unused) | |||
109 | return 0; | 112 | return 0; |
110 | } | 113 | } |
111 | 114 | ||
115 | int stop_machine_create(void) | ||
116 | { | ||
117 | mutex_lock(&setup_lock); | ||
118 | if (refcount) | ||
119 | goto done; | ||
120 | stop_machine_wq = create_rt_workqueue("kstop"); | ||
121 | if (!stop_machine_wq) | ||
122 | goto err_out; | ||
123 | stop_machine_work = alloc_percpu(struct work_struct); | ||
124 | if (!stop_machine_work) | ||
125 | goto err_out; | ||
126 | done: | ||
127 | refcount++; | ||
128 | mutex_unlock(&setup_lock); | ||
129 | return 0; | ||
130 | |||
131 | err_out: | ||
132 | if (stop_machine_wq) | ||
133 | destroy_workqueue(stop_machine_wq); | ||
134 | mutex_unlock(&setup_lock); | ||
135 | return -ENOMEM; | ||
136 | } | ||
137 | EXPORT_SYMBOL_GPL(stop_machine_create); | ||
138 | |||
139 | void stop_machine_destroy(void) | ||
140 | { | ||
141 | mutex_lock(&setup_lock); | ||
142 | refcount--; | ||
143 | if (refcount) | ||
144 | goto done; | ||
145 | destroy_workqueue(stop_machine_wq); | ||
146 | free_percpu(stop_machine_work); | ||
147 | done: | ||
148 | mutex_unlock(&setup_lock); | ||
149 | } | ||
150 | EXPORT_SYMBOL_GPL(stop_machine_destroy); | ||
151 | |||
112 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 152 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
113 | { | 153 | { |
114 | struct work_struct *sm_work; | 154 | struct work_struct *sm_work; |
@@ -146,19 +186,14 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
146 | { | 186 | { |
147 | int ret; | 187 | int ret; |
148 | 188 | ||
189 | ret = stop_machine_create(); | ||
190 | if (ret) | ||
191 | return ret; | ||
149 | /* No CPUs can come up or down during this. */ | 192 | /* No CPUs can come up or down during this. */ |
150 | get_online_cpus(); | 193 | get_online_cpus(); |
151 | ret = __stop_machine(fn, data, cpus); | 194 | ret = __stop_machine(fn, data, cpus); |
152 | put_online_cpus(); | 195 | put_online_cpus(); |
153 | 196 | stop_machine_destroy(); | |
154 | return ret; | 197 | return ret; |
155 | } | 198 | } |
156 | EXPORT_SYMBOL_GPL(stop_machine); | 199 | EXPORT_SYMBOL_GPL(stop_machine); |
157 | |||
158 | static int __init stop_machine_init(void) | ||
159 | { | ||
160 | stop_machine_wq = create_rt_workqueue("kstop"); | ||
161 | stop_machine_work = alloc_percpu(struct work_struct); | ||
162 | return 0; | ||
163 | } | ||
164 | core_initcall(stop_machine_init); | ||
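stop_machine() now pins the kstop workqueue through the same create/destroy pair, and callers that only reach stop_machine() on an error path (such as load_module() above) must pin it up front so that the call cannot fail late. A compact sketch of that calling pattern (do_risky_setup() and undo_fn() are hypothetical):

	/* Sketch: guarantee stop_machine() is usable on the error path. */
	err = stop_machine_create();
	if (err)
		return err;

	err = do_risky_setup();
	if (err)
		stop_machine(undo_fn, NULL, NULL);	/* cannot fail to start */

	stop_machine_destroy();
	return err;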
diff --git a/kernel/sys.c b/kernel/sys.c index d356d79e84ac..763c3c17ded3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/task_io_accounting_ops.h> | 33 | #include <linux/task_io_accounting_ops.h> |
34 | #include <linux/seccomp.h> | 34 | #include <linux/seccomp.h> |
35 | #include <linux/cpu.h> | 35 | #include <linux/cpu.h> |
36 | #include <linux/ptrace.h> | ||
36 | 37 | ||
37 | #include <linux/compat.h> | 38 | #include <linux/compat.h> |
38 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
@@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
927 | if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) | 928 | if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) |
928 | return -EFAULT; | 929 | return -EFAULT; |
929 | } | 930 | } |
931 | force_successful_syscall_return(); | ||
930 | return (long) jiffies_64_to_clock_t(get_jiffies_64()); | 932 | return (long) jiffies_64_to_clock_t(get_jiffies_64()); |
931 | } | 933 | } |
932 | 934 | ||
@@ -1627,6 +1629,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1627 | utime = stime = cputime_zero; | 1629 | utime = stime = cputime_zero; |
1628 | 1630 | ||
1629 | if (who == RUSAGE_THREAD) { | 1631 | if (who == RUSAGE_THREAD) { |
1632 | utime = task_utime(current); | ||
1633 | stime = task_stime(current); | ||
1630 | accumulate_thread_rusage(p, r); | 1634 | accumulate_thread_rusage(p, r); |
1631 | goto out; | 1635 | goto out; |
1632 | } | 1636 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff6d45c7626f..92f6e5bc3c24 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -87,10 +87,6 @@ extern int rcutorture_runnable; | |||
87 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ | 87 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ |
88 | 88 | ||
89 | /* Constants used for minimum and maximum */ | 89 | /* Constants used for minimum and maximum */ |
90 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) | ||
91 | static int one = 1; | ||
92 | #endif | ||
93 | |||
94 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 90 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
95 | static int sixty = 60; | 91 | static int sixty = 60; |
96 | static int neg_one = -1; | 92 | static int neg_one = -1; |
@@ -101,6 +97,7 @@ static int two = 2; | |||
101 | #endif | 97 | #endif |
102 | 98 | ||
103 | static int zero; | 99 | static int zero; |
100 | static int one = 1; | ||
104 | static int one_hundred = 100; | 101 | static int one_hundred = 100; |
105 | 102 | ||
106 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 103 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
@@ -952,12 +949,22 @@ static struct ctl_table vm_table[] = { | |||
952 | .data = &dirty_background_ratio, | 949 | .data = &dirty_background_ratio, |
953 | .maxlen = sizeof(dirty_background_ratio), | 950 | .maxlen = sizeof(dirty_background_ratio), |
954 | .mode = 0644, | 951 | .mode = 0644, |
955 | .proc_handler = &proc_dointvec_minmax, | 952 | .proc_handler = &dirty_background_ratio_handler, |
956 | .strategy = &sysctl_intvec, | 953 | .strategy = &sysctl_intvec, |
957 | .extra1 = &zero, | 954 | .extra1 = &zero, |
958 | .extra2 = &one_hundred, | 955 | .extra2 = &one_hundred, |
959 | }, | 956 | }, |
960 | { | 957 | { |
958 | .ctl_name = CTL_UNNUMBERED, | ||
959 | .procname = "dirty_background_bytes", | ||
960 | .data = &dirty_background_bytes, | ||
961 | .maxlen = sizeof(dirty_background_bytes), | ||
962 | .mode = 0644, | ||
963 | .proc_handler = &dirty_background_bytes_handler, | ||
964 | .strategy = &sysctl_intvec, | ||
965 | .extra1 = &one, | ||
966 | }, | ||
967 | { | ||
961 | .ctl_name = VM_DIRTY_RATIO, | 968 | .ctl_name = VM_DIRTY_RATIO, |
962 | .procname = "dirty_ratio", | 969 | .procname = "dirty_ratio", |
963 | .data = &vm_dirty_ratio, | 970 | .data = &vm_dirty_ratio, |
@@ -969,6 +976,16 @@ static struct ctl_table vm_table[] = { | |||
969 | .extra2 = &one_hundred, | 976 | .extra2 = &one_hundred, |
970 | }, | 977 | }, |
971 | { | 978 | { |
979 | .ctl_name = CTL_UNNUMBERED, | ||
980 | .procname = "dirty_bytes", | ||
981 | .data = &vm_dirty_bytes, | ||
982 | .maxlen = sizeof(vm_dirty_bytes), | ||
983 | .mode = 0644, | ||
984 | .proc_handler = &dirty_bytes_handler, | ||
985 | .strategy = &sysctl_intvec, | ||
986 | .extra1 = &one, | ||
987 | }, | ||
988 | { | ||
972 | .procname = "dirty_writeback_centisecs", | 989 | .procname = "dirty_writeback_centisecs", |
973 | .data = &dirty_writeback_interval, | 990 | .data = &dirty_writeback_interval, |
974 | .maxlen = sizeof(dirty_writeback_interval), | 991 | .maxlen = sizeof(dirty_writeback_interval), |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 06b6395b45b2..4f104515a19b 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
@@ -22,21 +22,11 @@ | |||
22 | 22 | ||
23 | static u32 rand1, preh_val, posth_val, jph_val; | 23 | static u32 rand1, preh_val, posth_val, jph_val; |
24 | static int errors, handler_errors, num_tests; | 24 | static int errors, handler_errors, num_tests; |
25 | static u32 (*target)(u32 value); | ||
26 | static u32 (*target2)(u32 value); | ||
25 | 27 | ||
26 | static noinline u32 kprobe_target(u32 value) | 28 | static noinline u32 kprobe_target(u32 value) |
27 | { | 29 | { |
28 | /* | ||
29 | * gcc ignores noinline on some architectures unless we stuff | ||
30 | * sufficient lard into the function. The get_kprobe() here is | ||
31 | * just for that. | ||
32 | * | ||
33 | * NOTE: We aren't concerned about the correctness of get_kprobe() | ||
34 | * here; hence, this call is neither under !preempt nor with the | ||
35 | * kprobe_mutex held. This is fine(tm) | ||
36 | */ | ||
37 | if (get_kprobe((void *)0xdeadbeef)) | ||
38 | printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); | ||
39 | |||
40 | return (value / div_factor); | 30 | return (value / div_factor); |
41 | } | 31 | } |
42 | 32 | ||
@@ -74,7 +64,7 @@ static int test_kprobe(void) | |||
74 | return ret; | 64 | return ret; |
75 | } | 65 | } |
76 | 66 | ||
77 | ret = kprobe_target(rand1); | 67 | ret = target(rand1); |
78 | unregister_kprobe(&kp); | 68 | unregister_kprobe(&kp); |
79 | 69 | ||
80 | if (preh_val == 0) { | 70 | if (preh_val == 0) { |
@@ -92,6 +82,84 @@ static int test_kprobe(void) | |||
92 | return 0; | 82 | return 0; |
93 | } | 83 | } |
94 | 84 | ||
85 | static noinline u32 kprobe_target2(u32 value) | ||
86 | { | ||
87 | return (value / div_factor) + 1; | ||
88 | } | ||
89 | |||
90 | static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) | ||
91 | { | ||
92 | preh_val = (rand1 / div_factor) + 1; | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, | ||
97 | unsigned long flags) | ||
98 | { | ||
99 | if (preh_val != (rand1 / div_factor) + 1) { | ||
100 | handler_errors++; | ||
101 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
102 | "incorrect value in post_handler2\n"); | ||
103 | } | ||
104 | posth_val = preh_val + div_factor; | ||
105 | } | ||
106 | |||
107 | static struct kprobe kp2 = { | ||
108 | .symbol_name = "kprobe_target2", | ||
109 | .pre_handler = kp_pre_handler2, | ||
110 | .post_handler = kp_post_handler2 | ||
111 | }; | ||
112 | |||
113 | static int test_kprobes(void) | ||
114 | { | ||
115 | int ret; | ||
116 | struct kprobe *kps[2] = {&kp, &kp2}; | ||
117 | |||
118 | kp.addr = 0; /* addr should be cleared for reusing kprobe. */ | ||
119 | ret = register_kprobes(kps, 2); | ||
120 | if (ret < 0) { | ||
121 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
122 | "register_kprobes returned %d\n", ret); | ||
123 | return ret; | ||
124 | } | ||
125 | |||
126 | preh_val = 0; | ||
127 | posth_val = 0; | ||
128 | ret = target(rand1); | ||
129 | |||
130 | if (preh_val == 0) { | ||
131 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
132 | "kprobe pre_handler not called\n"); | ||
133 | handler_errors++; | ||
134 | } | ||
135 | |||
136 | if (posth_val == 0) { | ||
137 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
138 | "kprobe post_handler not called\n"); | ||
139 | handler_errors++; | ||
140 | } | ||
141 | |||
142 | preh_val = 0; | ||
143 | posth_val = 0; | ||
144 | ret = target2(rand1); | ||
145 | |||
146 | if (preh_val == 0) { | ||
147 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
148 | "kprobe pre_handler2 not called\n"); | ||
149 | handler_errors++; | ||
150 | } | ||
151 | |||
152 | if (posth_val == 0) { | ||
153 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
154 | "kprobe post_handler2 not called\n"); | ||
155 | handler_errors++; | ||
156 | } | ||
157 | |||
158 | unregister_kprobes(kps, 2); | ||
159 | return 0; | ||
160 | |||
161 | } | ||
162 | |||
95 | static u32 j_kprobe_target(u32 value) | 163 | static u32 j_kprobe_target(u32 value) |
96 | { | 164 | { |
97 | if (value != rand1) { | 165 | if (value != rand1) { |
@@ -121,7 +189,7 @@ static int test_jprobe(void) | |||
121 | return ret; | 189 | return ret; |
122 | } | 190 | } |
123 | 191 | ||
124 | ret = kprobe_target(rand1); | 192 | ret = target(rand1); |
125 | unregister_jprobe(&jp); | 193 | unregister_jprobe(&jp); |
126 | if (jph_val == 0) { | 194 | if (jph_val == 0) { |
127 | printk(KERN_ERR "Kprobe smoke test failed: " | 195 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -132,6 +200,43 @@ static int test_jprobe(void) | |||
132 | return 0; | 200 | return 0; |
133 | } | 201 | } |
134 | 202 | ||
203 | static struct jprobe jp2 = { | ||
204 | .entry = j_kprobe_target, | ||
205 | .kp.symbol_name = "kprobe_target2" | ||
206 | }; | ||
207 | |||
208 | static int test_jprobes(void) | ||
209 | { | ||
210 | int ret; | ||
211 | struct jprobe *jps[2] = {&jp, &jp2}; | ||
212 | |||
213 | jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ | ||
214 | ret = register_jprobes(jps, 2); | ||
215 | if (ret < 0) { | ||
216 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
217 | "register_jprobes returned %d\n", ret); | ||
218 | return ret; | ||
219 | } | ||
220 | |||
221 | jph_val = 0; | ||
222 | ret = target(rand1); | ||
223 | if (jph_val == 0) { | ||
224 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
225 | "jprobe handler not called\n"); | ||
226 | handler_errors++; | ||
227 | } | ||
228 | |||
229 | jph_val = 0; | ||
230 | ret = target2(rand1); | ||
231 | if (jph_val == 0) { | ||
232 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
233 | "jprobe handler2 not called\n"); | ||
234 | handler_errors++; | ||
235 | } | ||
236 | unregister_jprobes(jps, 2); | ||
237 | |||
238 | return 0; | ||
239 | } | ||
135 | #ifdef CONFIG_KRETPROBES | 240 | #ifdef CONFIG_KRETPROBES |
136 | static u32 krph_val; | 241 | static u32 krph_val; |
137 | 242 | ||
@@ -177,7 +282,7 @@ static int test_kretprobe(void) | |||
177 | return ret; | 282 | return ret; |
178 | } | 283 | } |
179 | 284 | ||
180 | ret = kprobe_target(rand1); | 285 | ret = target(rand1); |
181 | unregister_kretprobe(&rp); | 286 | unregister_kretprobe(&rp); |
182 | if (krph_val != rand1) { | 287 | if (krph_val != rand1) { |
183 | printk(KERN_ERR "Kprobe smoke test failed: " | 288 | printk(KERN_ERR "Kprobe smoke test failed: " |
@@ -187,12 +292,72 @@ static int test_kretprobe(void) | |||
187 | 292 | ||
188 | return 0; | 293 | return 0; |
189 | } | 294 | } |
295 | |||
296 | static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) | ||
297 | { | ||
298 | unsigned long ret = regs_return_value(regs); | ||
299 | |||
300 | if (ret != (rand1 / div_factor) + 1) { | ||
301 | handler_errors++; | ||
302 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
303 | "incorrect value in kretprobe handler2\n"); | ||
304 | } | ||
305 | if (krph_val == 0) { | ||
306 | handler_errors++; | ||
307 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
308 | "call to kretprobe entry handler failed\n"); | ||
309 | } | ||
310 | |||
311 | krph_val = rand1; | ||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | static struct kretprobe rp2 = { | ||
316 | .handler = return_handler2, | ||
317 | .entry_handler = entry_handler, | ||
318 | .kp.symbol_name = "kprobe_target2" | ||
319 | }; | ||
320 | |||
321 | static int test_kretprobes(void) | ||
322 | { | ||
323 | int ret; | ||
324 | struct kretprobe *rps[2] = {&rp, &rp2}; | ||
325 | |||
326 | rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ | ||
327 | ret = register_kretprobes(rps, 2); | ||
328 | if (ret < 0) { | ||
329 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
330 | "register_kretprobe returned %d\n", ret); | ||
331 | return ret; | ||
332 | } | ||
333 | |||
334 | krph_val = 0; | ||
335 | ret = target(rand1); | ||
336 | if (krph_val != rand1) { | ||
337 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
338 | "kretprobe handler not called\n"); | ||
339 | handler_errors++; | ||
340 | } | ||
341 | |||
342 | krph_val = 0; | ||
343 | ret = target2(rand1); | ||
344 | if (krph_val != rand1) { | ||
345 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
346 | "kretprobe handler2 not called\n"); | ||
347 | handler_errors++; | ||
348 | } | ||
349 | unregister_kretprobes(rps, 2); | ||
350 | return 0; | ||
351 | } | ||
190 | #endif /* CONFIG_KRETPROBES */ | 352 | #endif /* CONFIG_KRETPROBES */ |
191 | 353 | ||
192 | int init_test_probes(void) | 354 | int init_test_probes(void) |
193 | { | 355 | { |
194 | int ret; | 356 | int ret; |
195 | 357 | ||
358 | target = kprobe_target; | ||
359 | target2 = kprobe_target2; | ||
360 | |||
196 | do { | 361 | do { |
197 | rand1 = random32(); | 362 | rand1 = random32(); |
198 | } while (rand1 <= div_factor); | 363 | } while (rand1 <= div_factor); |
@@ -204,15 +369,30 @@ int init_test_probes(void) | |||
204 | errors++; | 369 | errors++; |
205 | 370 | ||
206 | num_tests++; | 371 | num_tests++; |
372 | ret = test_kprobes(); | ||
373 | if (ret < 0) | ||
374 | errors++; | ||
375 | |||
376 | num_tests++; | ||
207 | ret = test_jprobe(); | 377 | ret = test_jprobe(); |
208 | if (ret < 0) | 378 | if (ret < 0) |
209 | errors++; | 379 | errors++; |
210 | 380 | ||
381 | num_tests++; | ||
382 | ret = test_jprobes(); | ||
383 | if (ret < 0) | ||
384 | errors++; | ||
385 | |||
211 | #ifdef CONFIG_KRETPROBES | 386 | #ifdef CONFIG_KRETPROBES |
212 | num_tests++; | 387 | num_tests++; |
213 | ret = test_kretprobe(); | 388 | ret = test_kretprobe(); |
214 | if (ret < 0) | 389 | if (ret < 0) |
215 | errors++; | 390 | errors++; |
391 | |||
392 | num_tests++; | ||
393 | ret = test_kretprobes(); | ||
394 | if (ret < 0) | ||
395 | errors++; | ||
216 | #endif /* CONFIG_KRETPROBES */ | 396 | #endif /* CONFIG_KRETPROBES */ |
217 | 397 | ||
218 | if (errors) | 398 | if (errors) |
diff --git a/kernel/time.c b/kernel/time.c index d63a4336fad6..4886e3ce83a4 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/fs.h> | 37 | #include <linux/fs.h> |
38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <linux/math64.h> | 39 | #include <linux/math64.h> |
40 | #include <linux/ptrace.h> | ||
40 | 41 | ||
41 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
42 | #include <asm/unistd.h> | 43 | #include <asm/unistd.h> |
@@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc) | |||
65 | 66 | ||
66 | if (tloc) { | 67 | if (tloc) { |
67 | if (put_user(i,tloc)) | 68 | if (put_user(i,tloc)) |
68 | i = -EFAULT; | 69 | return -EFAULT; |
69 | } | 70 | } |
71 | force_successful_syscall_return(); | ||
70 | return i; | 72 | return i; |
71 | } | 73 | } |
72 | 74 | ||
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1ca99557e929..06f197560f3b 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -45,7 +45,7 @@ | |||
45 | * | 45 | * |
46 | * The value 8 is somewhat carefully chosen, as anything | 46 | * The value 8 is somewhat carefully chosen, as anything |
47 | * larger can result in overflows. NSEC_PER_JIFFY grows as | 47 | * larger can result in overflows. NSEC_PER_JIFFY grows as |
48 | * HZ shrinks, so values greater then 8 overflow 32bits when | 48 | * HZ shrinks, so values greater than 8 overflow 32bits when |
49 | * HZ=100. | 49 | * HZ=100. |
50 | */ | 50 | */ |
51 | #define JIFFIES_SHIFT 8 | 51 | #define JIFFIES_SHIFT 8 |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fa05e88aa76f..900f1b6598d1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16))); | |||
46 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 46 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
47 | static unsigned long total_sleep_time; /* seconds */ | 47 | static unsigned long total_sleep_time; /* seconds */ |
48 | 48 | ||
49 | /* flag for if timekeeping is suspended */ | ||
50 | int __read_mostly timekeeping_suspended; | ||
51 | |||
49 | static struct timespec xtime_cache __attribute__ ((aligned (16))); | 52 | static struct timespec xtime_cache __attribute__ ((aligned (16))); |
50 | void update_xtime_cache(u64 nsec) | 53 | void update_xtime_cache(u64 nsec) |
51 | { | 54 | { |
@@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts) | |||
92 | unsigned long seq; | 95 | unsigned long seq; |
93 | s64 nsecs; | 96 | s64 nsecs; |
94 | 97 | ||
98 | WARN_ON(timekeeping_suspended); | ||
99 | |||
95 | do { | 100 | do { |
96 | seq = read_seqbegin(&xtime_lock); | 101 | seq = read_seqbegin(&xtime_lock); |
97 | 102 | ||
@@ -299,8 +304,6 @@ void __init timekeeping_init(void) | |||
299 | write_sequnlock_irqrestore(&xtime_lock, flags); | 304 | write_sequnlock_irqrestore(&xtime_lock, flags); |
300 | } | 305 | } |
301 | 306 | ||
302 | /* flag for if timekeeping is suspended */ | ||
303 | static int timekeeping_suspended; | ||
304 | /* time in seconds when suspend began */ | 307 | /* time in seconds when suspend began */ |
305 | static unsigned long timekeeping_suspend_time; | 308 | static unsigned long timekeeping_suspend_time; |
306 | 309 | ||
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab35716..43f891b05a4b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
92 | mm = get_task_mm(p); | 92 | mm = get_task_mm(p); |
93 | if (mm) { | 93 | if (mm) { |
94 | /* adjust to KB unit */ | 94 | /* adjust to KB unit */ |
95 | stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; | 95 | stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; |
96 | stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; | 96 | stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; |
97 | mmput(mm); | 97 | mmput(mm); |
98 | } | 98 | } |
99 | stats->read_char = p->ioac.rchar; | 99 | stats->read_char = p->ioac.rchar; |