aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJeff Garzik <jgarzik@pretzel.yyz.us>2005-06-26 23:42:30 -0400
committerJeff Garzik <jgarzik@pobox.com>2005-06-26 23:42:30 -0400
commitf45727d52d1581e9ff4df9d1a12a60789ad2d1eb (patch)
tree773ae25f98542e6d382c688f7e85e8137d065614 /kernel
parent4c925f452cfd16c690209e96821ee094e09a2404 (diff)
parent5696c1944a33b4434a9a1ebb6383b906afd43a10 (diff)
Merge /spare/repo/netdev-2.6/ branch 'ieee80211'
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.hz46
-rw-r--r--kernel/Kconfig.preempt65
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c587
-rw-r--r--kernel/auditsc.c259
-rw-r--r--kernel/cpu.c14
-rw-r--r--kernel/cpuset.c97
-rw-r--r--kernel/crash_dump.c52
-rw-r--r--kernel/exit.c20
-rw-r--r--kernel/fork.c28
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/manage.c8
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/kexec.c1063
-rw-r--r--kernel/kmod.c17
-rw-r--r--kernel/kprobes.c288
-rw-r--r--kernel/ksysfs.c13
-rw-r--r--kernel/module.c105
-rw-r--r--kernel/panic.c23
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/posix-timers.c35
-rw-r--r--kernel/power/Kconfig8
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/disk.c35
-rw-r--r--kernel/power/main.c16
-rw-r--r--kernel/power/process.c26
-rw-r--r--kernel/power/smp.c89
-rw-r--r--kernel/power/swsusp.c95
-rw-r--r--kernel/printk.c15
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/sched.c1063
-rw-r--r--kernel/signal.c11
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys.c133
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/timer.c353
37 files changed, 3338 insertions, 1263 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000000..248e1c396f8b
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,46 @@
1#
2# Timer Interrupt Frequency Configuration
3#
4
5choice
6 prompt "Timer frequency"
7 default HZ_250
8 help
9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts.
14 Note that the timer interrupt occurs on each processor in an SMP
15 environment leading to NR_CPUS * HZ number of timer interrupts
16 per second.
17
18
19 config HZ_100
20 bool "100 HZ"
21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring.
25
26 config HZ_250
27 bool "250 HZ"
28 help
29 250 HZ is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems.
32
33 config HZ_1000
34 bool "1000 HZ"
35 help
36 1000 HZ is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events.
38
39endchoice
40
41config HZ
42 int
43 default 100 if HZ_100
44 default 250 if HZ_250
45 default 1000 if HZ_1000
46
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..0b46a5dff4c0
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,65 @@
1
2choice
3 prompt "Preemption Model"
4 default PREEMPT_NONE
5
6config PREEMPT_NONE
7 bool "No Forced Preemption (Server)"
8 help
9 This is the traditional Linux preemption model, geared towards
10 throughput. It will still provide good latencies most of the
11 time, but there are no guarantees and occasional longer delays
12 are possible.
13
14 Select this option if you are building a kernel for a server or
15 scientific/computation system, or if you want to maximize the
16 raw processing power of the kernel, irrespective of scheduling
17 latencies.
18
19config PREEMPT_VOLUNTARY
20 bool "Voluntary Kernel Preemption (Desktop)"
21 help
22 This option reduces the latency of the kernel by adding more
23 "explicit preemption points" to the kernel code. These new
24 preemption points have been selected to reduce the maximum
25 latency of rescheduling, providing faster application reactions,
26 at the cost of slightly lower throughput.
27
28 This allows reaction to interactive events by allowing a
29 low priority process to voluntarily preempt itself even if it
30 is in kernel mode executing a system call. This allows
31 applications to run more 'smoothly' even when the system is
32 under load.
33
34 Select this if you are building a kernel for a desktop system.
35
36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 help
39 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section)
41 preemptible. This allows reaction to interactive events by
42 permitting a low priority process to be preempted involuntarily
43 even if it is in kernel mode executing a system call and would
44 otherwise not be about to reach a natural preemption point.
45 This allows applications to run more 'smoothly' even when the
46 system is under load, at the cost of slightly lower throughput
47 and a slight runtime overhead to kernel code.
48
49 Select this if you are building a kernel for a desktop or
50 embedded system with latency requirements in the milliseconds
51 range.
52
53endchoice
54
55config PREEMPT_BKL
56 bool "Preempt The Big Kernel Lock"
57 depends on SMP || PREEMPT
58 default y
59 help
60 This option reduces the latency of the kernel by making the
61 big kernel lock preemptible.
62
63 Say Y here if you are building a kernel for a desktop system.
64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o 17obj-$(CONFIG_KALLSYMS) += kallsyms.o
18obj-$(CONFIG_PM) += power/ 18obj-$(CONFIG_PM) += power/
19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
20obj-$(CONFIG_KEXEC) += kexec.o
20obj-$(CONFIG_COMPAT) += compat.o 21obj-$(CONFIG_COMPAT) += compat.o
21obj-$(CONFIG_CPUSETS) += cpuset.o 22obj-$(CONFIG_CPUSETS) += cpuset.o
22obj-$(CONFIG_IKCONFIG) += configs.o 23obj-$(CONFIG_IKCONFIG) += configs.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
27obj-$(CONFIG_KPROBES) += kprobes.o 28obj-$(CONFIG_KPROBES) += kprobes.o
28obj-$(CONFIG_SYSFS) += ksysfs.o 29obj-$(CONFIG_SYSFS) += ksysfs.o
29obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
30obj-$(CONFIG_SECCOMP) += seccomp.o 32obj-$(CONFIG_SECCOMP) += seccomp.o
31 33
32ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 34ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/audit.c b/kernel/audit.c
index 9c4f1af0c794..ef35166fdc29 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,8 @@
46#include <asm/types.h> 46#include <asm/types.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/err.h>
50#include <linux/kthread.h>
49 51
50#include <linux/audit.h> 52#include <linux/audit.h>
51 53
@@ -68,7 +70,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
68 70
69/* If audit records are to be written to the netlink socket, audit_pid 71/* If audit records are to be written to the netlink socket, audit_pid
70 * contains the (non-zero) pid. */ 72 * contains the (non-zero) pid. */
71static int audit_pid; 73int audit_pid;
72 74
73/* If audit_limit is non-zero, limit the rate of sending audit records 75/* If audit_limit is non-zero, limit the rate of sending audit records
74 * to that number per second. This prevents DoS attacks, but results in 76 * to that number per second. This prevents DoS attacks, but results in
@@ -77,7 +79,10 @@ static int audit_rate_limit;
77 79
78/* Number of outstanding audit_buffers allowed. */ 80/* Number of outstanding audit_buffers allowed. */
79static int audit_backlog_limit = 64; 81static int audit_backlog_limit = 64;
80static atomic_t audit_backlog = ATOMIC_INIT(0); 82
83/* The identity of the user shutting down the audit system. */
84uid_t audit_sig_uid = -1;
85pid_t audit_sig_pid = -1;
81 86
82/* Records can be lost in several ways: 87/* Records can be lost in several ways:
83 0) [suppressed in audit_alloc] 88 0) [suppressed in audit_alloc]
@@ -91,19 +96,17 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
91/* The netlink socket. */ 96/* The netlink socket. */
92static struct sock *audit_sock; 97static struct sock *audit_sock;
93 98
94/* There are two lists of audit buffers. The txlist contains audit 99/* The audit_freelist is a list of pre-allocated audit buffers (if more
95 * buffers that cannot be sent immediately to the netlink device because
96 * we are in an irq context (these are sent later in a tasklet).
97 *
98 * The second list is a list of pre-allocated audit buffers (if more
99 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of 100 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
100 * being placed on the freelist). */ 101 * being placed on the freelist). */
101static DEFINE_SPINLOCK(audit_txlist_lock);
102static DEFINE_SPINLOCK(audit_freelist_lock); 102static DEFINE_SPINLOCK(audit_freelist_lock);
103static int audit_freelist_count = 0; 103static int audit_freelist_count = 0;
104static LIST_HEAD(audit_txlist);
105static LIST_HEAD(audit_freelist); 104static LIST_HEAD(audit_freelist);
106 105
106static struct sk_buff_head audit_skb_queue;
107static struct task_struct *kauditd_task;
108static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
109
107/* There are three lists of rules -- one to search at task creation 110/* There are three lists of rules -- one to search at task creation
108 * time, one to search at syscall entry time, and another to search at 111 * time, one to search at syscall entry time, and another to search at
109 * syscall exit time. */ 112 * syscall exit time. */
@@ -112,7 +115,7 @@ static LIST_HEAD(audit_entlist);
112static LIST_HEAD(audit_extlist); 115static LIST_HEAD(audit_extlist);
113 116
114/* The netlink socket is only to be read by 1 CPU, which lets us assume 117/* The netlink socket is only to be read by 1 CPU, which lets us assume
115 * that list additions and deletions never happen simultaneiously in 118 * that list additions and deletions never happen simultaneously in
116 * auditsc.c */ 119 * auditsc.c */
117static DECLARE_MUTEX(audit_netlink_sem); 120static DECLARE_MUTEX(audit_netlink_sem);
118 121
@@ -132,21 +135,14 @@ static DECLARE_MUTEX(audit_netlink_sem);
132 * use simultaneously. */ 135 * use simultaneously. */
133struct audit_buffer { 136struct audit_buffer {
134 struct list_head list; 137 struct list_head list;
135 struct sk_buff_head sklist; /* formatted skbs ready to send */ 138 struct sk_buff *skb; /* formatted skb ready to send */
136 struct audit_context *ctx; /* NULL or associated context */ 139 struct audit_context *ctx; /* NULL or associated context */
137 int len; /* used area of tmp */
138 char tmp[AUDIT_BUFSIZ];
139
140 /* Pointer to header and contents */
141 struct nlmsghdr *nlh;
142 int total;
143 int type;
144 int pid;
145}; 140};
146 141
147void audit_set_type(struct audit_buffer *ab, int type) 142static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
148{ 143{
149 ab->type = type; 144 struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
145 nlh->nlmsg_pid = pid;
150} 146}
151 147
152struct audit_entry { 148struct audit_entry {
@@ -154,9 +150,6 @@ struct audit_entry {
154 struct audit_rule rule; 150 struct audit_rule rule;
155}; 151};
156 152
157static void audit_log_end_irq(struct audit_buffer *ab);
158static void audit_log_end_fast(struct audit_buffer *ab);
159
160static void audit_panic(const char *message) 153static void audit_panic(const char *message)
161{ 154{
162 switch (audit_failure) 155 switch (audit_failure)
@@ -227,10 +220,8 @@ void audit_log_lost(const char *message)
227 220
228 if (print) { 221 if (print) {
229 printk(KERN_WARNING 222 printk(KERN_WARNING
230 "audit: audit_lost=%d audit_backlog=%d" 223 "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n",
231 " audit_rate_limit=%d audit_backlog_limit=%d\n",
232 atomic_read(&audit_lost), 224 atomic_read(&audit_lost),
233 atomic_read(&audit_backlog),
234 audit_rate_limit, 225 audit_rate_limit,
235 audit_backlog_limit); 226 audit_backlog_limit);
236 audit_panic(message); 227 audit_panic(message);
@@ -242,7 +233,8 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
242{ 233{
243 int old = audit_rate_limit; 234 int old = audit_rate_limit;
244 audit_rate_limit = limit; 235 audit_rate_limit = limit;
245 audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u", 236 audit_log(NULL, AUDIT_CONFIG_CHANGE,
237 "audit_rate_limit=%d old=%d by auid=%u",
246 audit_rate_limit, old, loginuid); 238 audit_rate_limit, old, loginuid);
247 return old; 239 return old;
248} 240}
@@ -251,7 +243,8 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
251{ 243{
252 int old = audit_backlog_limit; 244 int old = audit_backlog_limit;
253 audit_backlog_limit = limit; 245 audit_backlog_limit = limit;
254 audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u", 246 audit_log(NULL, AUDIT_CONFIG_CHANGE,
247 "audit_backlog_limit=%d old=%d by auid=%u",
255 audit_backlog_limit, old, loginuid); 248 audit_backlog_limit, old, loginuid);
256 return old; 249 return old;
257} 250}
@@ -262,8 +255,9 @@ static int audit_set_enabled(int state, uid_t loginuid)
262 if (state != 0 && state != 1) 255 if (state != 0 && state != 1)
263 return -EINVAL; 256 return -EINVAL;
264 audit_enabled = state; 257 audit_enabled = state;
265 audit_log(NULL, "audit_enabled=%d old=%d by auid %u", 258 audit_log(NULL, AUDIT_CONFIG_CHANGE,
266 audit_enabled, old, loginuid); 259 "audit_enabled=%d old=%d by auid=%u",
260 audit_enabled, old, loginuid);
267 return old; 261 return old;
268} 262}
269 263
@@ -275,12 +269,44 @@ static int audit_set_failure(int state, uid_t loginuid)
275 && state != AUDIT_FAIL_PANIC) 269 && state != AUDIT_FAIL_PANIC)
276 return -EINVAL; 270 return -EINVAL;
277 audit_failure = state; 271 audit_failure = state;
278 audit_log(NULL, "audit_failure=%d old=%d by auid %u", 272 audit_log(NULL, AUDIT_CONFIG_CHANGE,
279 audit_failure, old, loginuid); 273 "audit_failure=%d old=%d by auid=%u",
274 audit_failure, old, loginuid);
280 return old; 275 return old;
281} 276}
282 277
283#ifdef CONFIG_NET 278int kauditd_thread(void *dummy)
279{
280 struct sk_buff *skb;
281
282 while (1) {
283 skb = skb_dequeue(&audit_skb_queue);
284 if (skb) {
285 if (audit_pid) {
286 int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
287 if (err < 0) {
288 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
289 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
290 audit_pid = 0;
291 }
292 } else {
293 printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
294 kfree_skb(skb);
295 }
296 } else {
297 DECLARE_WAITQUEUE(wait, current);
298 set_current_state(TASK_INTERRUPTIBLE);
299 add_wait_queue(&kauditd_wait, &wait);
300
301 if (!skb_queue_len(&audit_skb_queue))
302 schedule();
303
304 __set_current_state(TASK_RUNNING);
305 remove_wait_queue(&kauditd_wait, &wait);
306 }
307 }
308}
309
284void audit_send_reply(int pid, int seq, int type, int done, int multi, 310void audit_send_reply(int pid, int seq, int type, int done, int multi,
285 void *payload, int size) 311 void *payload, int size)
286{ 312{
@@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
293 319
294 skb = alloc_skb(len, GFP_KERNEL); 320 skb = alloc_skb(len, GFP_KERNEL);
295 if (!skb) 321 if (!skb)
296 goto nlmsg_failure; 322 return;
297 323
298 nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); 324 nlh = NLMSG_PUT(skb, pid, seq, t, size);
299 nlh->nlmsg_flags = flags; 325 nlh->nlmsg_flags = flags;
300 data = NLMSG_DATA(nlh); 326 data = NLMSG_DATA(nlh);
301 memcpy(data, payload, size); 327 memcpy(data, payload, size);
302 netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); 328
329 /* Ignore failure. It'll only happen if the sender goes away,
330 because our timeout is set to infinite. */
331 netlink_unicast(audit_sock, skb, pid, 0);
303 return; 332 return;
304 333
305nlmsg_failure: /* Used by NLMSG_PUT */ 334nlmsg_failure: /* Used by NLMSG_PUT */
@@ -321,10 +350,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
321 case AUDIT_SET: 350 case AUDIT_SET:
322 case AUDIT_ADD: 351 case AUDIT_ADD:
323 case AUDIT_DEL: 352 case AUDIT_DEL:
353 case AUDIT_SIGNAL_INFO:
324 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) 354 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
325 err = -EPERM; 355 err = -EPERM;
326 break; 356 break;
327 case AUDIT_USER: 357 case AUDIT_USER:
358 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
328 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) 359 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
329 err = -EPERM; 360 err = -EPERM;
330 break; 361 break;
@@ -344,11 +375,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
344 struct audit_buffer *ab; 375 struct audit_buffer *ab;
345 u16 msg_type = nlh->nlmsg_type; 376 u16 msg_type = nlh->nlmsg_type;
346 uid_t loginuid; /* loginuid of sender */ 377 uid_t loginuid; /* loginuid of sender */
378 struct audit_sig_info sig_data;
347 379
348 err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); 380 err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
349 if (err) 381 if (err)
350 return err; 382 return err;
351 383
384 /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
385 if (!kauditd_task)
386 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
387 if (IS_ERR(kauditd_task)) {
388 err = PTR_ERR(kauditd_task);
389 kauditd_task = NULL;
390 return err;
391 }
392
352 pid = NETLINK_CREDS(skb)->pid; 393 pid = NETLINK_CREDS(skb)->pid;
353 uid = NETLINK_CREDS(skb)->uid; 394 uid = NETLINK_CREDS(skb)->uid;
354 loginuid = NETLINK_CB(skb).loginuid; 395 loginuid = NETLINK_CB(skb).loginuid;
@@ -363,7 +404,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
363 status_set.rate_limit = audit_rate_limit; 404 status_set.rate_limit = audit_rate_limit;
364 status_set.backlog_limit = audit_backlog_limit; 405 status_set.backlog_limit = audit_backlog_limit;
365 status_set.lost = atomic_read(&audit_lost); 406 status_set.lost = atomic_read(&audit_lost);
366 status_set.backlog = atomic_read(&audit_backlog); 407 status_set.backlog = skb_queue_len(&audit_skb_queue);
367 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, 408 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
368 &status_set, sizeof(status_set)); 409 &status_set, sizeof(status_set));
369 break; 410 break;
@@ -382,7 +423,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
382 if (status_get->mask & AUDIT_STATUS_PID) { 423 if (status_get->mask & AUDIT_STATUS_PID) {
383 int old = audit_pid; 424 int old = audit_pid;
384 audit_pid = status_get->pid; 425 audit_pid = status_get->pid;
385 audit_log(NULL, "audit_pid=%d old=%d by auid %u", 426 audit_log(NULL, AUDIT_CONFIG_CHANGE,
427 "audit_pid=%d old=%d by auid=%u",
386 audit_pid, old, loginuid); 428 audit_pid, old, loginuid);
387 } 429 }
388 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 430 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
@@ -392,18 +434,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
392 loginuid); 434 loginuid);
393 break; 435 break;
394 case AUDIT_USER: 436 case AUDIT_USER:
395 ab = audit_log_start(NULL); 437 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
438 ab = audit_log_start(NULL, msg_type);
396 if (!ab) 439 if (!ab)
397 break; /* audit_panic has been called */ 440 break; /* audit_panic has been called */
398 audit_log_format(ab, 441 audit_log_format(ab,
399 "user pid=%d uid=%d length=%d loginuid=%u" 442 "user pid=%d uid=%u auid=%u"
400 " msg='%.1024s'", 443 " msg='%.1024s'",
401 pid, uid, 444 pid, uid, loginuid, (char *)data);
402 (int)(nlh->nlmsg_len 445 audit_set_pid(ab, pid);
403 - ((char *)data - (char *)nlh)),
404 loginuid, (char *)data);
405 ab->type = AUDIT_USER;
406 ab->pid = pid;
407 audit_log_end(ab); 446 audit_log_end(ab);
408 break; 447 break;
409 case AUDIT_ADD: 448 case AUDIT_ADD:
@@ -412,12 +451,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
412 return -EINVAL; 451 return -EINVAL;
413 /* fallthrough */ 452 /* fallthrough */
414 case AUDIT_LIST: 453 case AUDIT_LIST:
415#ifdef CONFIG_AUDITSYSCALL
416 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 454 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
417 uid, seq, data, loginuid); 455 uid, seq, data, loginuid);
418#else 456 break;
419 err = -EOPNOTSUPP; 457 case AUDIT_SIGNAL_INFO:
420#endif 458 sig_data.uid = audit_sig_uid;
459 sig_data.pid = audit_sig_pid;
460 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
461 0, 0, &sig_data, sizeof(sig_data));
421 break; 462 break;
422 default: 463 default:
423 err = -EINVAL; 464 err = -EINVAL;
@@ -467,87 +508,6 @@ static void audit_receive(struct sock *sk, int length)
467 up(&audit_netlink_sem); 508 up(&audit_netlink_sem);
468} 509}
469 510
470/* Move data from tmp buffer into an skb. This is an extra copy, and
471 * that is unfortunate. However, the copy will only occur when a record
472 * is being written to user space, which is already a high-overhead
473 * operation. (Elimination of the copy is possible, for example, by
474 * writing directly into a pre-allocated skb, at the cost of wasting
475 * memory. */
476static void audit_log_move(struct audit_buffer *ab)
477{
478 struct sk_buff *skb;
479 char *start;
480 int extra = ab->nlh ? 0 : NLMSG_SPACE(0);
481
482 /* possible resubmission */
483 if (ab->len == 0)
484 return;
485
486 skb = skb_peek_tail(&ab->sklist);
487 if (!skb || skb_tailroom(skb) <= ab->len + extra) {
488 skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
489 if (!skb) {
490 ab->len = 0; /* Lose information in ab->tmp */
491 audit_log_lost("out of memory in audit_log_move");
492 return;
493 }
494 __skb_queue_tail(&ab->sklist, skb);
495 if (!ab->nlh)
496 ab->nlh = (struct nlmsghdr *)skb_put(skb,
497 NLMSG_SPACE(0));
498 }
499 start = skb_put(skb, ab->len);
500 memcpy(start, ab->tmp, ab->len);
501 ab->len = 0;
502}
503
504/* Iterate over the skbuff in the audit_buffer, sending their contents
505 * to user space. */
506static inline int audit_log_drain(struct audit_buffer *ab)
507{
508 struct sk_buff *skb;
509
510 while ((skb = skb_dequeue(&ab->sklist))) {
511 int retval = 0;
512
513 if (audit_pid) {
514 if (ab->nlh) {
515 ab->nlh->nlmsg_len = ab->total;
516 ab->nlh->nlmsg_type = ab->type;
517 ab->nlh->nlmsg_flags = 0;
518 ab->nlh->nlmsg_seq = 0;
519 ab->nlh->nlmsg_pid = ab->pid;
520 }
521 skb_get(skb); /* because netlink_* frees */
522 retval = netlink_unicast(audit_sock, skb, audit_pid,
523 MSG_DONTWAIT);
524 }
525 if (retval == -EAGAIN &&
526 (atomic_read(&audit_backlog)) < audit_backlog_limit) {
527 skb_queue_head(&ab->sklist, skb);
528 audit_log_end_irq(ab);
529 return 1;
530 }
531 if (retval < 0) {
532 if (retval == -ECONNREFUSED) {
533 printk(KERN_ERR
534 "audit: *NO* daemon at audit_pid=%d\n",
535 audit_pid);
536 audit_pid = 0;
537 } else
538 audit_log_lost("netlink socket too busy");
539 }
540 if (!audit_pid) { /* No daemon */
541 int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
542 int len = skb->len - offset;
543 skb->data[offset + len] = '\0';
544 printk(KERN_ERR "%s\n", skb->data + offset);
545 }
546 kfree_skb(skb);
547 ab->nlh = NULL;
548 }
549 return 0;
550}
551 511
552/* Initialize audit support at boot time. */ 512/* Initialize audit support at boot time. */
553static int __init audit_init(void) 513static int __init audit_init(void)
@@ -558,40 +518,13 @@ static int __init audit_init(void)
558 if (!audit_sock) 518 if (!audit_sock)
559 audit_panic("cannot initialize netlink socket"); 519 audit_panic("cannot initialize netlink socket");
560 520
521 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
522 skb_queue_head_init(&audit_skb_queue);
561 audit_initialized = 1; 523 audit_initialized = 1;
562 audit_enabled = audit_default; 524 audit_enabled = audit_default;
563 audit_log(NULL, "initialized"); 525 audit_log(NULL, AUDIT_KERNEL, "initialized");
564 return 0;
565}
566
567#else
568/* Without CONFIG_NET, we have no skbuffs. For now, print what we have
569 * in the buffer. */
570static void audit_log_move(struct audit_buffer *ab)
571{
572 printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
573 ab->len = 0;
574}
575
576static inline int audit_log_drain(struct audit_buffer *ab)
577{
578 return 0;
579}
580
581/* Initialize audit support at boot time. */
582int __init audit_init(void)
583{
584 printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
585 audit_sock = NULL;
586 audit_pid = 0;
587
588 audit_initialized = 1;
589 audit_enabled = audit_default;
590 audit_log(NULL, "initialized");
591 return 0; 526 return 0;
592} 527}
593#endif
594
595__initcall(audit_init); 528__initcall(audit_init);
596 529
597/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ 530/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
@@ -608,6 +541,102 @@ static int __init audit_enable(char *str)
608 541
609__setup("audit=", audit_enable); 542__setup("audit=", audit_enable);
610 543
544static void audit_buffer_free(struct audit_buffer *ab)
545{
546 unsigned long flags;
547
548 if (!ab)
549 return;
550
551 if (ab->skb)
552 kfree_skb(ab->skb);
553
554 spin_lock_irqsave(&audit_freelist_lock, flags);
555 if (++audit_freelist_count > AUDIT_MAXFREE)
556 kfree(ab);
557 else
558 list_add(&ab->list, &audit_freelist);
559 spin_unlock_irqrestore(&audit_freelist_lock, flags);
560}
561
562static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
563 int gfp_mask, int type)
564{
565 unsigned long flags;
566 struct audit_buffer *ab = NULL;
567 struct nlmsghdr *nlh;
568
569 spin_lock_irqsave(&audit_freelist_lock, flags);
570 if (!list_empty(&audit_freelist)) {
571 ab = list_entry(audit_freelist.next,
572 struct audit_buffer, list);
573 list_del(&ab->list);
574 --audit_freelist_count;
575 }
576 spin_unlock_irqrestore(&audit_freelist_lock, flags);
577
578 if (!ab) {
579 ab = kmalloc(sizeof(*ab), gfp_mask);
580 if (!ab)
581 goto err;
582 }
583
584 ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
585 if (!ab->skb)
586 goto err;
587
588 ab->ctx = ctx;
589 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
590 nlh->nlmsg_type = type;
591 nlh->nlmsg_flags = 0;
592 nlh->nlmsg_pid = 0;
593 nlh->nlmsg_seq = 0;
594 return ab;
595err:
596 audit_buffer_free(ab);
597 return NULL;
598}
599
600/* Compute a serial number for the audit record. Audit records are
601 * written to user-space as soon as they are generated, so a complete
602 * audit record may be written in several pieces. The timestamp of the
603 * record and this serial number are used by the user-space tools to
604 * determine which pieces belong to the same audit record. The
605 * (timestamp,serial) tuple is unique for each syscall and is live from
606 * syscall entry to syscall exit.
607 *
608 * Atomic values are only guaranteed to be 24-bit, so we count down.
609 *
610 * NOTE: Another possibility is to store the formatted records off the
611 * audit context (for those records that have a context), and emit them
612 * all at syscall exit. However, this could delay the reporting of
613 * significant errors until syscall exit (or never, if the system
614 * halts). */
615unsigned int audit_serial(void)
616{
617 static atomic_t serial = ATOMIC_INIT(0xffffff);
618 unsigned int a, b;
619
620 do {
621 a = atomic_read(&serial);
622 if (atomic_dec_and_test(&serial))
623 atomic_set(&serial, 0xffffff);
624 b = atomic_read(&serial);
625 } while (b != a - 1);
626
627 return 0xffffff - b;
628}
629
630static inline void audit_get_stamp(struct audit_context *ctx,
631 struct timespec *t, unsigned int *serial)
632{
633 if (ctx)
634 auditsc_get_stamp(ctx, t, serial);
635 else {
636 *t = CURRENT_TIME;
637 *serial = audit_serial();
638 }
639}
611 640
612/* Obtain an audit buffer. This routine does locking to obtain the 641/* Obtain an audit buffer. This routine does locking to obtain the
613 * audit buffer, but then no locking is required for calls to 642 * audit buffer, but then no locking is required for calls to
@@ -615,10 +644,9 @@ __setup("audit=", audit_enable);
615 * syscall, then the syscall is marked as auditable and an audit record 644 * syscall, then the syscall is marked as auditable and an audit record
616 * will be written at syscall exit. If there is no associated task, tsk 645 * will be written at syscall exit. If there is no associated task, tsk
617 * should be NULL. */ 646 * should be NULL. */
618struct audit_buffer *audit_log_start(struct audit_context *ctx) 647struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
619{ 648{
620 struct audit_buffer *ab = NULL; 649 struct audit_buffer *ab = NULL;
621 unsigned long flags;
622 struct timespec t; 650 struct timespec t;
623 unsigned int serial; 651 unsigned int serial;
624 652
@@ -626,57 +654,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
626 return NULL; 654 return NULL;
627 655
628 if (audit_backlog_limit 656 if (audit_backlog_limit
629 && atomic_read(&audit_backlog) > audit_backlog_limit) { 657 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
630 if (audit_rate_check()) 658 if (audit_rate_check())
631 printk(KERN_WARNING 659 printk(KERN_WARNING
632 "audit: audit_backlog=%d > " 660 "audit: audit_backlog=%d > "
633 "audit_backlog_limit=%d\n", 661 "audit_backlog_limit=%d\n",
634 atomic_read(&audit_backlog), 662 skb_queue_len(&audit_skb_queue),
635 audit_backlog_limit); 663 audit_backlog_limit);
636 audit_log_lost("backlog limit exceeded"); 664 audit_log_lost("backlog limit exceeded");
637 return NULL; 665 return NULL;
638 } 666 }
639 667
640 spin_lock_irqsave(&audit_freelist_lock, flags); 668 ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
641 if (!list_empty(&audit_freelist)) {
642 ab = list_entry(audit_freelist.next,
643 struct audit_buffer, list);
644 list_del(&ab->list);
645 --audit_freelist_count;
646 }
647 spin_unlock_irqrestore(&audit_freelist_lock, flags);
648
649 if (!ab)
650 ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
651 if (!ab) { 669 if (!ab) {
652 audit_log_lost("out of memory in audit_log_start"); 670 audit_log_lost("out of memory in audit_log_start");
653 return NULL; 671 return NULL;
654 } 672 }
655 673
656 atomic_inc(&audit_backlog); 674 audit_get_stamp(ab->ctx, &t, &serial);
657 skb_queue_head_init(&ab->sklist);
658
659 ab->ctx = ctx;
660 ab->len = 0;
661 ab->nlh = NULL;
662 ab->total = 0;
663 ab->type = AUDIT_KERNEL;
664 ab->pid = 0;
665 675
666#ifdef CONFIG_AUDITSYSCALL
667 if (ab->ctx)
668 audit_get_stamp(ab->ctx, &t, &serial);
669 else
670#endif
671 {
672 t = CURRENT_TIME;
673 serial = 0;
674 }
675 audit_log_format(ab, "audit(%lu.%03lu:%u): ", 676 audit_log_format(ab, "audit(%lu.%03lu:%u): ",
676 t.tv_sec, t.tv_nsec/1000000, serial); 677 t.tv_sec, t.tv_nsec/1000000, serial);
677 return ab; 678 return ab;
678} 679}
679 680
681/**
682 * audit_expand - expand skb in the audit buffer
683 * @ab: audit_buffer
684 *
685 * Returns 0 (no space) on failed expansion, or available space if
686 * successful.
687 */
688static inline int audit_expand(struct audit_buffer *ab, int extra)
689{
690 struct sk_buff *skb = ab->skb;
691 int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
692 GFP_ATOMIC);
693 if (ret < 0) {
694 audit_log_lost("out of memory in audit_expand");
695 return 0;
696 }
697 return skb_tailroom(skb);
698}
680 699
681/* Format an audit message into the audit buffer. If there isn't enough 700/* Format an audit message into the audit buffer. If there isn't enough
682 * room in the audit buffer, more room will be allocated and vsnprint 701 * room in the audit buffer, more room will be allocated and vsnprint
@@ -686,26 +705,35 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
686 va_list args) 705 va_list args)
687{ 706{
688 int len, avail; 707 int len, avail;
708 struct sk_buff *skb;
709 va_list args2;
689 710
690 if (!ab) 711 if (!ab)
691 return; 712 return;
692 713
693 avail = sizeof(ab->tmp) - ab->len; 714 BUG_ON(!ab->skb);
694 if (avail <= 0) { 715 skb = ab->skb;
695 audit_log_move(ab); 716 avail = skb_tailroom(skb);
696 avail = sizeof(ab->tmp) - ab->len; 717 if (avail == 0) {
718 avail = audit_expand(ab, AUDIT_BUFSIZ);
719 if (!avail)
720 goto out;
697 } 721 }
698 len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); 722 va_copy(args2, args);
723 len = vsnprintf(skb->tail, avail, fmt, args);
699 if (len >= avail) { 724 if (len >= avail) {
700 /* The printk buffer is 1024 bytes long, so if we get 725 /* The printk buffer is 1024 bytes long, so if we get
701 * here and AUDIT_BUFSIZ is at least 1024, then we can 726 * here and AUDIT_BUFSIZ is at least 1024, then we can
702 * log everything that printk could have logged. */ 727 * log everything that printk could have logged. */
703 audit_log_move(ab); 728 avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
704 avail = sizeof(ab->tmp) - ab->len; 729 if (!avail)
705 len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); 730 goto out;
731 len = vsnprintf(skb->tail, avail, fmt, args2);
706 } 732 }
707 ab->len += (len < avail) ? len : avail; 733 if (len > 0)
708 ab->total += (len < avail) ? len : avail; 734 skb_put(skb, len);
735out:
736 return;
709} 737}
710 738
711/* Format a message into the audit buffer. All the work is done in 739/* Format a message into the audit buffer. All the work is done in
@@ -721,20 +749,47 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
721 va_end(args); 749 va_end(args);
722} 750}
723 751
724void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) 752/* This function will take the passed buf and convert it into a string of
753 * ascii hex digits. The new string is placed onto the skb. */
754void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
755 size_t len)
725{ 756{
726 int i; 757 int i, avail, new_len;
758 unsigned char *ptr;
759 struct sk_buff *skb;
760 static const unsigned char *hex = "0123456789ABCDEF";
761
762 BUG_ON(!ab->skb);
763 skb = ab->skb;
764 avail = skb_tailroom(skb);
765 new_len = len<<1;
766 if (new_len >= avail) {
767 /* Round the buffer request up to the next multiple */
768 new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
769 avail = audit_expand(ab, new_len);
770 if (!avail)
771 return;
772 }
727 773
728 for (i=0; i<len; i++) 774 ptr = skb->tail;
729 audit_log_format(ab, "%02x", buf[i]); 775 for (i=0; i<len; i++) {
776 *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
777 *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
778 }
779 *ptr = 0;
780 skb_put(skb, len << 1); /* new string is twice the old string */
730} 781}
731 782
783/* This code will escape a string that is passed to it if the string
784 * contains a control character, unprintable character, double quote mark,
785 * or a space. Unescaped strings will start and end with a double quote mark.
786 * Strings that are escaped are printed in hex (2 digits per char). */
732void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 787void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
733{ 788{
734 const unsigned char *p = string; 789 const unsigned char *p = string;
735 790
736 while (*p) { 791 while (*p) {
737 if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { 792 if (*p == '"' || *p < 0x21 || *p > 0x7f) {
738 audit_log_hex(ab, string, strlen(string)); 793 audit_log_hex(ab, string, strlen(string));
739 return; 794 return;
740 } 795 }
@@ -743,117 +798,63 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
743 audit_log_format(ab, "\"%s\"", string); 798 audit_log_format(ab, "\"%s\"", string);
744} 799}
745 800
746 801/* This is a helper-function to print the escaped d_path */
747/* This is a helper-function to print the d_path without using a static
748 * buffer or allocating another buffer in addition to the one in
749 * audit_buffer. */
750void audit_log_d_path(struct audit_buffer *ab, const char *prefix, 802void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
751 struct dentry *dentry, struct vfsmount *vfsmnt) 803 struct dentry *dentry, struct vfsmount *vfsmnt)
752{ 804{
753 char *p; 805 char *p, *path;
754 int len, avail;
755 806
756 if (prefix) audit_log_format(ab, " %s", prefix); 807 if (prefix)
757 808 audit_log_format(ab, " %s", prefix);
758 if (ab->len > 128)
759 audit_log_move(ab);
760 avail = sizeof(ab->tmp) - ab->len;
761 p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
762 if (IS_ERR(p)) {
763 /* FIXME: can we save some information here? */
764 audit_log_format(ab, "<toolong>");
765 } else {
766 /* path isn't at start of buffer */
767 len = (ab->tmp + sizeof(ab->tmp) - 1) - p;
768 memmove(ab->tmp + ab->len, p, len);
769 ab->len += len;
770 ab->total += len;
771 }
772}
773
774/* Remove queued messages from the audit_txlist and send them to userspace. */
775static void audit_tasklet_handler(unsigned long arg)
776{
777 LIST_HEAD(list);
778 struct audit_buffer *ab;
779 unsigned long flags;
780 809
781 spin_lock_irqsave(&audit_txlist_lock, flags); 810 /* We will allow 11 spaces for ' (deleted)' to be appended */
782 list_splice_init(&audit_txlist, &list); 811 path = kmalloc(PATH_MAX+11, GFP_KERNEL);
783 spin_unlock_irqrestore(&audit_txlist_lock, flags); 812 if (!path) {
784 813 audit_log_format(ab, "<no memory>");
785 while (!list_empty(&list)) { 814 return;
786 ab = list_entry(list.next, struct audit_buffer, list);
787 list_del(&ab->list);
788 audit_log_end_fast(ab);
789 } 815 }
816 p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
817 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
818 /* FIXME: can we save some information here? */
819 audit_log_format(ab, "<too long>");
820 } else
821 audit_log_untrustedstring(ab, p);
822 kfree(path);
790} 823}
791 824
792static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
793
794/* The netlink_* functions cannot be called inside an irq context, so 825/* The netlink_* functions cannot be called inside an irq context, so
795 * the audit buffer is places on a queue and a tasklet is scheduled to 826 * the audit buffer is places on a queue and a tasklet is scheduled to
796 * remove them from the queue outside the irq context. May be called in 827 * remove them from the queue outside the irq context. May be called in
797 * any context. */ 828 * any context. */
798static void audit_log_end_irq(struct audit_buffer *ab) 829void audit_log_end(struct audit_buffer *ab)
799{
800 unsigned long flags;
801
802 if (!ab)
803 return;
804 spin_lock_irqsave(&audit_txlist_lock, flags);
805 list_add_tail(&ab->list, &audit_txlist);
806 spin_unlock_irqrestore(&audit_txlist_lock, flags);
807
808 tasklet_schedule(&audit_tasklet);
809}
810
811/* Send the message in the audit buffer directly to user space. May not
812 * be called in an irq context. */
813static void audit_log_end_fast(struct audit_buffer *ab)
814{ 830{
815 unsigned long flags;
816
817 BUG_ON(in_irq());
818 if (!ab) 831 if (!ab)
819 return; 832 return;
820 if (!audit_rate_check()) { 833 if (!audit_rate_check()) {
821 audit_log_lost("rate limit exceeded"); 834 audit_log_lost("rate limit exceeded");
822 } else { 835 } else {
823 audit_log_move(ab); 836 if (audit_pid) {
824 if (audit_log_drain(ab)) 837 struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
825 return; 838 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
839 skb_queue_tail(&audit_skb_queue, ab->skb);
840 ab->skb = NULL;
841 wake_up_interruptible(&kauditd_wait);
842 } else {
843 printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
844 }
826 } 845 }
827 846 audit_buffer_free(ab);
828 atomic_dec(&audit_backlog);
829 spin_lock_irqsave(&audit_freelist_lock, flags);
830 if (++audit_freelist_count > AUDIT_MAXFREE)
831 kfree(ab);
832 else
833 list_add(&ab->list, &audit_freelist);
834 spin_unlock_irqrestore(&audit_freelist_lock, flags);
835}
836
837/* Send or queue the message in the audit buffer, depending on the
838 * current context. (A convenience function that may be called in any
839 * context.) */
840void audit_log_end(struct audit_buffer *ab)
841{
842 if (in_irq())
843 audit_log_end_irq(ab);
844 else
845 audit_log_end_fast(ab);
846} 847}
847 848
848/* Log an audit record. This is a convenience function that calls 849/* Log an audit record. This is a convenience function that calls
849 * audit_log_start, audit_log_vformat, and audit_log_end. It may be 850 * audit_log_start, audit_log_vformat, and audit_log_end. It may be
850 * called in any context. */ 851 * called in any context. */
851void audit_log(struct audit_context *ctx, const char *fmt, ...) 852void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
852{ 853{
853 struct audit_buffer *ab; 854 struct audit_buffer *ab;
854 va_list args; 855 va_list args;
855 856
856 ab = audit_log_start(ctx); 857 ab = audit_log_start(ctx, type);
857 if (ab) { 858 if (ab) {
858 va_start(args, fmt); 859 va_start(args, fmt);
859 audit_log_vformat(ab, fmt, args); 860 audit_log_vformat(ab, fmt, args);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37b3ac94bc47..e75f84e1a1a0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -34,7 +34,8 @@
34#include <asm/types.h> 34#include <asm/types.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/module.h> 36#include <linux/module.h>
37 37#include <linux/mount.h>
38#include <linux/socket.h>
38#include <linux/audit.h> 39#include <linux/audit.h>
39#include <linux/personality.h> 40#include <linux/personality.h>
40#include <linux/time.h> 41#include <linux/time.h>
@@ -112,6 +113,23 @@ struct audit_aux_data_ipcctl {
112 mode_t mode; 113 mode_t mode;
113}; 114};
114 115
116struct audit_aux_data_socketcall {
117 struct audit_aux_data d;
118 int nargs;
119 unsigned long args[0];
120};
121
122struct audit_aux_data_sockaddr {
123 struct audit_aux_data d;
124 int len;
125 char a[0];
126};
127
128struct audit_aux_data_path {
129 struct audit_aux_data d;
130 struct dentry *dentry;
131 struct vfsmount *mnt;
132};
115 133
116/* The per-task audit context. */ 134/* The per-task audit context. */
117struct audit_context { 135struct audit_context {
@@ -127,6 +145,8 @@ struct audit_context {
127 int auditable; /* 1 if record should be written */ 145 int auditable; /* 1 if record should be written */
128 int name_count; 146 int name_count;
129 struct audit_names names[AUDIT_NAMES]; 147 struct audit_names names[AUDIT_NAMES];
148 struct dentry * pwd;
149 struct vfsmount * pwdmnt;
130 struct audit_context *previous; /* For nested syscalls */ 150 struct audit_context *previous; /* For nested syscalls */
131 struct audit_aux_data *aux; 151 struct audit_aux_data *aux;
132 152
@@ -157,6 +177,8 @@ struct audit_entry {
157 struct audit_rule rule; 177 struct audit_rule rule;
158}; 178};
159 179
180extern int audit_pid;
181
160/* Check to see if two rules are identical. It is called from 182/* Check to see if two rules are identical. It is called from
161 * audit_del_rule during AUDIT_DEL. */ 183 * audit_del_rule during AUDIT_DEL. */
162static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) 184static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
@@ -226,7 +248,6 @@ static inline int audit_del_rule(struct audit_rule *rule,
226 return -EFAULT; /* No matching rule */ 248 return -EFAULT; /* No matching rule */
227} 249}
228 250
229#ifdef CONFIG_NET
230/* Copy rule from user-space to kernel-space. Called during 251/* Copy rule from user-space to kernel-space. Called during
231 * AUDIT_ADD. */ 252 * AUDIT_ADD. */
232static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) 253static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
@@ -287,7 +308,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
287 err = audit_add_rule(entry, &audit_entlist); 308 err = audit_add_rule(entry, &audit_entlist);
288 if (!err && (flags & AUDIT_AT_EXIT)) 309 if (!err && (flags & AUDIT_AT_EXIT))
289 err = audit_add_rule(entry, &audit_extlist); 310 err = audit_add_rule(entry, &audit_extlist);
290 audit_log(NULL, "auid %u added an audit rule\n", loginuid); 311 audit_log(NULL, AUDIT_CONFIG_CHANGE,
312 "auid=%u added an audit rule\n", loginuid);
291 break; 313 break;
292 case AUDIT_DEL: 314 case AUDIT_DEL:
293 flags =((struct audit_rule *)data)->flags; 315 flags =((struct audit_rule *)data)->flags;
@@ -297,7 +319,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
297 err = audit_del_rule(data, &audit_entlist); 319 err = audit_del_rule(data, &audit_entlist);
298 if (!err && (flags & AUDIT_AT_EXIT)) 320 if (!err && (flags & AUDIT_AT_EXIT))
299 err = audit_del_rule(data, &audit_extlist); 321 err = audit_del_rule(data, &audit_extlist);
300 audit_log(NULL, "auid %u removed an audit rule\n", loginuid); 322 audit_log(NULL, AUDIT_CONFIG_CHANGE,
323 "auid=%u removed an audit rule\n", loginuid);
301 break; 324 break;
302 default: 325 default:
303 return -EINVAL; 326 return -EINVAL;
@@ -305,7 +328,6 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
305 328
306 return err; 329 return err;
307} 330}
308#endif
309 331
310/* Compare a task_struct with an audit_rule. Return 1 on match, 0 332/* Compare a task_struct with an audit_rule. Return 1 on match, 0
311 * otherwise. */ 333 * otherwise. */
@@ -444,7 +466,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
444 466
445/* At syscall entry and exit time, this filter is called if the 467/* At syscall entry and exit time, this filter is called if the
446 * audit_state is not low enough that auditing cannot take place, but is 468 * audit_state is not low enough that auditing cannot take place, but is
447 * also not high enough that we already know we have to write and audit 469 * also not high enough that we already know we have to write an audit
448 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 470 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
449 */ 471 */
450static enum audit_state audit_filter_syscall(struct task_struct *tsk, 472static enum audit_state audit_filter_syscall(struct task_struct *tsk,
@@ -532,6 +554,12 @@ static inline void audit_free_names(struct audit_context *context)
532 if (context->names[i].name) 554 if (context->names[i].name)
533 __putname(context->names[i].name); 555 __putname(context->names[i].name);
534 context->name_count = 0; 556 context->name_count = 0;
557 if (context->pwd)
558 dput(context->pwd);
559 if (context->pwdmnt)
560 mntput(context->pwdmnt);
561 context->pwd = NULL;
562 context->pwdmnt = NULL;
535} 563}
536 564
537static inline void audit_free_aux(struct audit_context *context) 565static inline void audit_free_aux(struct audit_context *context)
@@ -539,6 +567,11 @@ static inline void audit_free_aux(struct audit_context *context)
539 struct audit_aux_data *aux; 567 struct audit_aux_data *aux;
540 568
541 while ((aux = context->aux)) { 569 while ((aux = context->aux)) {
570 if (aux->type == AUDIT_AVC_PATH) {
571 struct audit_aux_data_path *axi = (void *)aux;
572 dput(axi->dentry);
573 mntput(axi->mnt);
574 }
542 context->aux = aux->next; 575 context->aux = aux->next;
543 kfree(aux); 576 kfree(aux);
544 } 577 }
@@ -625,7 +658,8 @@ static void audit_log_task_info(struct audit_buffer *ab)
625 struct vm_area_struct *vma; 658 struct vm_area_struct *vma;
626 659
627 get_task_comm(name, current); 660 get_task_comm(name, current);
628 audit_log_format(ab, " comm=%s", name); 661 audit_log_format(ab, " comm=");
662 audit_log_untrustedstring(ab, name);
629 663
630 if (!mm) 664 if (!mm)
631 return; 665 return;
@@ -649,23 +683,24 @@ static void audit_log_exit(struct audit_context *context)
649{ 683{
650 int i; 684 int i;
651 struct audit_buffer *ab; 685 struct audit_buffer *ab;
686 struct audit_aux_data *aux;
652 687
653 ab = audit_log_start(context); 688 ab = audit_log_start(context, AUDIT_SYSCALL);
654 if (!ab) 689 if (!ab)
655 return; /* audit_panic has been called */ 690 return; /* audit_panic has been called */
656 audit_log_format(ab, "syscall=%d", context->major); 691 audit_log_format(ab, "arch=%x syscall=%d",
692 context->arch, context->major);
657 if (context->personality != PER_LINUX) 693 if (context->personality != PER_LINUX)
658 audit_log_format(ab, " per=%lx", context->personality); 694 audit_log_format(ab, " per=%lx", context->personality);
659 audit_log_format(ab, " arch=%x", context->arch);
660 if (context->return_valid) 695 if (context->return_valid)
661 audit_log_format(ab, " success=%s exit=%ld", 696 audit_log_format(ab, " success=%s exit=%ld",
662 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 697 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
663 context->return_code); 698 context->return_code);
664 audit_log_format(ab, 699 audit_log_format(ab,
665 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 700 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
666 " pid=%d loginuid=%d uid=%d gid=%d" 701 " pid=%d auid=%u uid=%u gid=%u"
667 " euid=%d suid=%d fsuid=%d" 702 " euid=%u suid=%u fsuid=%u"
668 " egid=%d sgid=%d fsgid=%d", 703 " egid=%u sgid=%u fsgid=%u",
669 context->argv[0], 704 context->argv[0],
670 context->argv[1], 705 context->argv[1],
671 context->argv[2], 706 context->argv[2],
@@ -679,33 +714,57 @@ static void audit_log_exit(struct audit_context *context)
679 context->egid, context->sgid, context->fsgid); 714 context->egid, context->sgid, context->fsgid);
680 audit_log_task_info(ab); 715 audit_log_task_info(ab);
681 audit_log_end(ab); 716 audit_log_end(ab);
682 while (context->aux) {
683 struct audit_aux_data *aux;
684 717
685 ab = audit_log_start(context); 718 for (aux = context->aux; aux; aux = aux->next) {
719
720 ab = audit_log_start(context, aux->type);
686 if (!ab) 721 if (!ab)
687 continue; /* audit_panic has been called */ 722 continue; /* audit_panic has been called */
688 723
689 aux = context->aux;
690 context->aux = aux->next;
691
692 audit_log_format(ab, "auxitem=%d", aux->type);
693 switch (aux->type) { 724 switch (aux->type) {
694 case AUDIT_AUX_IPCPERM: { 725 case AUDIT_IPC: {
695 struct audit_aux_data_ipcctl *axi = (void *)aux; 726 struct audit_aux_data_ipcctl *axi = (void *)aux;
696 audit_log_format(ab, 727 audit_log_format(ab,
697 " qbytes=%lx uid=%d gid=%d mode=%x", 728 " qbytes=%lx iuid=%u igid=%u mode=%x",
698 axi->qbytes, axi->uid, axi->gid, axi->mode); 729 axi->qbytes, axi->uid, axi->gid, axi->mode);
699 } 730 break; }
731
732 case AUDIT_SOCKETCALL: {
733 int i;
734 struct audit_aux_data_socketcall *axs = (void *)aux;
735 audit_log_format(ab, "nargs=%d", axs->nargs);
736 for (i=0; i<axs->nargs; i++)
737 audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
738 break; }
739
740 case AUDIT_SOCKADDR: {
741 struct audit_aux_data_sockaddr *axs = (void *)aux;
742
743 audit_log_format(ab, "saddr=");
744 audit_log_hex(ab, axs->a, axs->len);
745 break; }
746
747 case AUDIT_AVC_PATH: {
748 struct audit_aux_data_path *axi = (void *)aux;
749 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
750 break; }
751
700 } 752 }
701 audit_log_end(ab); 753 audit_log_end(ab);
702 kfree(aux);
703 } 754 }
704 755
756 if (context->pwd && context->pwdmnt) {
757 ab = audit_log_start(context, AUDIT_CWD);
758 if (ab) {
759 audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
760 audit_log_end(ab);
761 }
762 }
705 for (i = 0; i < context->name_count; i++) { 763 for (i = 0; i < context->name_count; i++) {
706 ab = audit_log_start(context); 764 ab = audit_log_start(context, AUDIT_PATH);
707 if (!ab) 765 if (!ab)
708 continue; /* audit_panic has been called */ 766 continue; /* audit_panic has been called */
767
709 audit_log_format(ab, "item=%d", i); 768 audit_log_format(ab, "item=%d", i);
710 if (context->names[i].name) { 769 if (context->names[i].name) {
711 audit_log_format(ab, " name="); 770 audit_log_format(ab, " name=");
@@ -713,7 +772,7 @@ static void audit_log_exit(struct audit_context *context)
713 } 772 }
714 if (context->names[i].ino != (unsigned long)-1) 773 if (context->names[i].ino != (unsigned long)-1)
715 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" 774 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
716 " uid=%d gid=%d rdev=%02x:%02x", 775 " ouid=%u ogid=%u rdev=%02x:%02x",
717 context->names[i].ino, 776 context->names[i].ino,
718 MAJOR(context->names[i].dev), 777 MAJOR(context->names[i].dev),
719 MINOR(context->names[i].dev), 778 MINOR(context->names[i].dev),
@@ -741,42 +800,12 @@ void audit_free(struct task_struct *tsk)
741 800
742 /* Check for system calls that do not go through the exit 801 /* Check for system calls that do not go through the exit
743 * function (e.g., exit_group), then free context block. */ 802 * function (e.g., exit_group), then free context block. */
744 if (context->in_syscall && context->auditable) 803 if (context->in_syscall && context->auditable && context->pid != audit_pid)
745 audit_log_exit(context); 804 audit_log_exit(context);
746 805
747 audit_free_context(context); 806 audit_free_context(context);
748} 807}
749 808
750/* Compute a serial number for the audit record. Audit records are
751 * written to user-space as soon as they are generated, so a complete
752 * audit record may be written in several pieces. The timestamp of the
753 * record and this serial number are used by the user-space daemon to
754 * determine which pieces belong to the same audit record. The
755 * (timestamp,serial) tuple is unique for each syscall and is live from
756 * syscall entry to syscall exit.
757 *
758 * Atomic values are only guaranteed to be 24-bit, so we count down.
759 *
760 * NOTE: Another possibility is to store the formatted records off the
761 * audit context (for those records that have a context), and emit them
762 * all at syscall exit. However, this could delay the reporting of
763 * significant errors until syscall exit (or never, if the system
764 * halts). */
765static inline unsigned int audit_serial(void)
766{
767 static atomic_t serial = ATOMIC_INIT(0xffffff);
768 unsigned int a, b;
769
770 do {
771 a = atomic_read(&serial);
772 if (atomic_dec_and_test(&serial))
773 atomic_set(&serial, 0xffffff);
774 b = atomic_read(&serial);
775 } while (b != a - 1);
776
777 return 0xffffff - b;
778}
779
780/* Fill in audit context at syscall entry. This only happens if the 809/* Fill in audit context at syscall entry. This only happens if the
781 * audit context was created when the task was created and the state or 810 * audit context was created when the task was created and the state or
782 * filters demand the audit context be built. If the state from the 811 * filters demand the audit context be built. If the state from the
@@ -876,7 +905,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
876 if (likely(!context)) 905 if (likely(!context))
877 return; 906 return;
878 907
879 if (context->in_syscall && context->auditable) 908 if (context->in_syscall && context->auditable && context->pid != audit_pid)
880 audit_log_exit(context); 909 audit_log_exit(context);
881 910
882 context->in_syscall = 0; 911 context->in_syscall = 0;
@@ -916,6 +945,13 @@ void audit_getname(const char *name)
916 context->names[context->name_count].name = name; 945 context->names[context->name_count].name = name;
917 context->names[context->name_count].ino = (unsigned long)-1; 946 context->names[context->name_count].ino = (unsigned long)-1;
918 ++context->name_count; 947 ++context->name_count;
948 if (!context->pwd) {
949 read_lock(&current->fs->lock);
950 context->pwd = dget(current->fs->pwd);
951 context->pwdmnt = mntget(current->fs->pwdmnt);
952 read_unlock(&current->fs->lock);
953 }
954
919} 955}
920 956
921/* Intercept a putname request. Called from 957/* Intercept a putname request. Called from
@@ -994,34 +1030,26 @@ void audit_inode(const char *name, const struct inode *inode)
994 context->names[idx].rdev = inode->i_rdev; 1030 context->names[idx].rdev = inode->i_rdev;
995} 1031}
996 1032
997void audit_get_stamp(struct audit_context *ctx, 1033void auditsc_get_stamp(struct audit_context *ctx,
998 struct timespec *t, unsigned int *serial) 1034 struct timespec *t, unsigned int *serial)
999{ 1035{
1000 if (ctx) { 1036 t->tv_sec = ctx->ctime.tv_sec;
1001 t->tv_sec = ctx->ctime.tv_sec; 1037 t->tv_nsec = ctx->ctime.tv_nsec;
1002 t->tv_nsec = ctx->ctime.tv_nsec; 1038 *serial = ctx->serial;
1003 *serial = ctx->serial; 1039 ctx->auditable = 1;
1004 ctx->auditable = 1;
1005 } else {
1006 *t = CURRENT_TIME;
1007 *serial = 0;
1008 }
1009} 1040}
1010 1041
1011extern int audit_set_type(struct audit_buffer *ab, int type);
1012
1013int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1042int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1014{ 1043{
1015 if (task->audit_context) { 1044 if (task->audit_context) {
1016 struct audit_buffer *ab; 1045 struct audit_buffer *ab;
1017 1046
1018 ab = audit_log_start(NULL); 1047 ab = audit_log_start(NULL, AUDIT_LOGIN);
1019 if (ab) { 1048 if (ab) {
1020 audit_log_format(ab, "login pid=%d uid=%u " 1049 audit_log_format(ab, "login pid=%d uid=%u "
1021 "old loginuid=%u new loginuid=%u", 1050 "old auid=%u new auid=%u",
1022 task->pid, task->uid, 1051 task->pid, task->uid,
1023 task->audit_context->loginuid, loginuid); 1052 task->audit_context->loginuid, loginuid);
1024 audit_set_type(ab, AUDIT_LOGIN);
1025 audit_log_end(ab); 1053 audit_log_end(ab);
1026 } 1054 }
1027 task->audit_context->loginuid = loginuid; 1055 task->audit_context->loginuid = loginuid;
@@ -1051,8 +1079,89 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1051 ax->gid = gid; 1079 ax->gid = gid;
1052 ax->mode = mode; 1080 ax->mode = mode;
1053 1081
1054 ax->d.type = AUDIT_AUX_IPCPERM; 1082 ax->d.type = AUDIT_IPC;
1083 ax->d.next = context->aux;
1084 context->aux = (void *)ax;
1085 return 0;
1086}
1087
1088int audit_socketcall(int nargs, unsigned long *args)
1089{
1090 struct audit_aux_data_socketcall *ax;
1091 struct audit_context *context = current->audit_context;
1092
1093 if (likely(!context))
1094 return 0;
1095
1096 ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
1097 if (!ax)
1098 return -ENOMEM;
1099
1100 ax->nargs = nargs;
1101 memcpy(ax->args, args, nargs * sizeof(unsigned long));
1102
1103 ax->d.type = AUDIT_SOCKETCALL;
1104 ax->d.next = context->aux;
1105 context->aux = (void *)ax;
1106 return 0;
1107}
1108
1109int audit_sockaddr(int len, void *a)
1110{
1111 struct audit_aux_data_sockaddr *ax;
1112 struct audit_context *context = current->audit_context;
1113
1114 if (likely(!context))
1115 return 0;
1116
1117 ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
1118 if (!ax)
1119 return -ENOMEM;
1120
1121 ax->len = len;
1122 memcpy(ax->a, a, len);
1123
1124 ax->d.type = AUDIT_SOCKADDR;
1055 ax->d.next = context->aux; 1125 ax->d.next = context->aux;
1056 context->aux = (void *)ax; 1126 context->aux = (void *)ax;
1057 return 0; 1127 return 0;
1058} 1128}
1129
1130int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1131{
1132 struct audit_aux_data_path *ax;
1133 struct audit_context *context = current->audit_context;
1134
1135 if (likely(!context))
1136 return 0;
1137
1138 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1139 if (!ax)
1140 return -ENOMEM;
1141
1142 ax->dentry = dget(dentry);
1143 ax->mnt = mntget(mnt);
1144
1145 ax->d.type = AUDIT_AVC_PATH;
1146 ax->d.next = context->aux;
1147 context->aux = (void *)ax;
1148 return 0;
1149}
1150
1151void audit_signal_info(int sig, struct task_struct *t)
1152{
1153 extern pid_t audit_sig_pid;
1154 extern uid_t audit_sig_uid;
1155
1156 if (unlikely(audit_pid && t->pid == audit_pid)) {
1157 if (sig == SIGTERM || sig == SIGHUP) {
1158 struct audit_context *ctx = current->audit_context;
1159 audit_sig_pid = current->pid;
1160 if (ctx)
1161 audit_sig_uid = ctx->loginuid;
1162 else
1163 audit_sig_uid = current->uid;
1164 }
1165 }
1166}
1167
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
63{ 63{
64 int err; 64 int err;
65 65
66 /* Take offline: makes arch_cpu_down somewhat easier. */
67 cpu_clear(smp_processor_id(), cpu_online_map);
68
69 /* Ensure this CPU doesn't handle any more interrupts. */ 66 /* Ensure this CPU doesn't handle any more interrupts. */
70 err = __cpu_disable(); 67 err = __cpu_disable();
71 if (err < 0) 68 if (err < 0)
72 cpu_set(smp_processor_id(), cpu_online_map); 69 return err;
73 else
74 /* Force idle task to run as soon as we yield: it should
75 immediately notice cpu is offline and die quickly. */
76 sched_idle_next();
77 70
78 return err; 71 /* Force idle task to run as soon as we yield: it should
72 immediately notice cpu is offline and die quickly. */
73 sched_idle_next();
74 return 0;
79} 75}
80 76
81int cpu_down(unsigned int cpu) 77int cpu_down(unsigned int cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00e8f2575512..984c0bf3807f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = {
228 228
229static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) 229static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
230{ 230{
231 struct qstr qstr; 231 struct dentry *d = lookup_one_len(name, parent, strlen(name));
232 struct dentry *d;
233
234 qstr.name = name;
235 qstr.len = strlen(name);
236 qstr.hash = full_name_hash(name, qstr.len);
237 d = lookup_hash(&qstr, parent);
238 if (!IS_ERR(d)) 232 if (!IS_ERR(d))
239 d->d_op = &cpuset_dops; 233 d->d_op = &cpuset_dops;
240 return d; 234 return d;
@@ -601,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
601 return 0; 595 return 0;
602} 596}
603 597
598/*
599 * For a given cpuset cur, partition the system as follows
600 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
601 * exclusive child cpusets
602 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
603 * exclusive child cpusets
604 * Build these two partitions by calling partition_sched_domains
605 *
606 * Call with cpuset_sem held. May nest a call to the
607 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
608 */
609static void update_cpu_domains(struct cpuset *cur)
610{
611 struct cpuset *c, *par = cur->parent;
612 cpumask_t pspan, cspan;
613
614 if (par == NULL || cpus_empty(cur->cpus_allowed))
615 return;
616
617 /*
618 * Get all cpus from parent's cpus_allowed not part of exclusive
619 * children
620 */
621 pspan = par->cpus_allowed;
622 list_for_each_entry(c, &par->children, sibling) {
623 if (is_cpu_exclusive(c))
624 cpus_andnot(pspan, pspan, c->cpus_allowed);
625 }
626 if (is_removed(cur) || !is_cpu_exclusive(cur)) {
627 cpus_or(pspan, pspan, cur->cpus_allowed);
628 if (cpus_equal(pspan, cur->cpus_allowed))
629 return;
630 cspan = CPU_MASK_NONE;
631 } else {
632 if (cpus_empty(pspan))
633 return;
634 cspan = cur->cpus_allowed;
635 /*
636 * Get all cpus from current cpuset's cpus_allowed not part
637 * of exclusive children
638 */
639 list_for_each_entry(c, &cur->children, sibling) {
640 if (is_cpu_exclusive(c))
641 cpus_andnot(cspan, cspan, c->cpus_allowed);
642 }
643 }
644
645 lock_cpu_hotplug();
646 partition_sched_domains(&pspan, &cspan);
647 unlock_cpu_hotplug();
648}
649
604static int update_cpumask(struct cpuset *cs, char *buf) 650static int update_cpumask(struct cpuset *cs, char *buf)
605{ 651{
606 struct cpuset trialcs; 652 struct cpuset trialcs;
607 int retval; 653 int retval, cpus_unchanged;
608 654
609 trialcs = *cs; 655 trialcs = *cs;
610 retval = cpulist_parse(buf, trialcs.cpus_allowed); 656 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -614,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
614 if (cpus_empty(trialcs.cpus_allowed)) 660 if (cpus_empty(trialcs.cpus_allowed))
615 return -ENOSPC; 661 return -ENOSPC;
616 retval = validate_change(cs, &trialcs); 662 retval = validate_change(cs, &trialcs);
617 if (retval == 0) 663 if (retval < 0)
618 cs->cpus_allowed = trialcs.cpus_allowed; 664 return retval;
619 return retval; 665 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
666 cs->cpus_allowed = trialcs.cpus_allowed;
667 if (is_cpu_exclusive(cs) && !cpus_unchanged)
668 update_cpu_domains(cs);
669 return 0;
620} 670}
621 671
622static int update_nodemask(struct cpuset *cs, char *buf) 672static int update_nodemask(struct cpuset *cs, char *buf)
@@ -652,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
652{ 702{
653 int turning_on; 703 int turning_on;
654 struct cpuset trialcs; 704 struct cpuset trialcs;
655 int err; 705 int err, cpu_exclusive_changed;
656 706
657 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 707 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
658 708
@@ -663,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
663 clear_bit(bit, &trialcs.flags); 713 clear_bit(bit, &trialcs.flags);
664 714
665 err = validate_change(cs, &trialcs); 715 err = validate_change(cs, &trialcs);
666 if (err == 0) { 716 if (err < 0)
667 if (turning_on) 717 return err;
668 set_bit(bit, &cs->flags); 718 cpu_exclusive_changed =
669 else 719 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
670 clear_bit(bit, &cs->flags); 720 if (turning_on)
671 } 721 set_bit(bit, &cs->flags);
672 return err; 722 else
723 clear_bit(bit, &cs->flags);
724
725 if (cpu_exclusive_changed)
726 update_cpu_domains(cs);
727 return 0;
673} 728}
674 729
675static int attach_task(struct cpuset *cs, char *buf) 730static int attach_task(struct cpuset *cs, char *buf)
@@ -1315,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1315 up(&cpuset_sem); 1370 up(&cpuset_sem);
1316 return -EBUSY; 1371 return -EBUSY;
1317 } 1372 }
1318 spin_lock(&cs->dentry->d_lock);
1319 parent = cs->parent; 1373 parent = cs->parent;
1320 set_bit(CS_REMOVED, &cs->flags); 1374 set_bit(CS_REMOVED, &cs->flags);
1375 if (is_cpu_exclusive(cs))
1376 update_cpu_domains(cs);
1321 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1377 list_del(&cs->sibling); /* delete my sibling from parent->children */
1322 if (list_empty(&parent->children)) 1378 if (list_empty(&parent->children))
1323 check_for_release(parent); 1379 check_for_release(parent);
1380 spin_lock(&cs->dentry->d_lock);
1324 d = dget(cs->dentry); 1381 d = dget(cs->dentry);
1325 cs->dentry = NULL; 1382 cs->dentry = NULL;
1326 spin_unlock(&d->d_lock); 1383 spin_unlock(&d->d_lock);
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..459ba49e376a
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,52 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/smp_lock.h>
9#include <linux/errno.h>
10#include <linux/proc_fs.h>
11#include <linux/bootmem.h>
12#include <linux/highmem.h>
13#include <linux/crash_dump.h>
14
15#include <asm/io.h>
16#include <asm/uaccess.h>
17
18/* Stores the physical address of elf header of crash image. */
19unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
20
21/*
22 * Copy a page from "oldmem". For this page, there is no pte mapped
23 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
24 */
25ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
26 size_t csize, unsigned long offset, int userbuf)
27{
28 void *page, *vaddr;
29
30 if (!csize)
31 return 0;
32
33 page = kmalloc(PAGE_SIZE, GFP_KERNEL);
34 if (!page)
35 return -ENOMEM;
36
37 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
38 copy_page(page, vaddr);
39 kunmap_atomic(vaddr, KM_PTE0);
40
41 if (userbuf) {
42 if (copy_to_user(buf, (page + offset), csize)) {
43 kfree(page);
44 return -EFAULT;
45 }
46 } else {
47 memcpy(buf, (page + offset), csize);
48 }
49
50 kfree(page);
51 return csize;
52}
diff --git a/kernel/exit.c b/kernel/exit.c
index edaa50b5bbfa..3ebcd60a19c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,11 @@ repeat:
72 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 72 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
73 __exit_signal(p); 73 __exit_signal(p);
74 __exit_sighand(p); 74 __exit_sighand(p);
75 /*
76 * Note that the fastpath in sys_times depends on __exit_signal having
77 * updated the counters before a task is removed from the tasklist of
78 * the process by __unhash_process.
79 */
75 __unhash_process(p); 80 __unhash_process(p);
76 81
77 /* 82 /*
@@ -793,6 +798,17 @@ fastcall NORET_TYPE void do_exit(long code)
793 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); 798 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
794 } 799 }
795 800
801 /*
802 * We're taking recursive faults here in do_exit. Safest is to just
803 * leave this task alone and wait for reboot.
804 */
805 if (unlikely(tsk->flags & PF_EXITING)) {
806 printk(KERN_ALERT
807 "Fixing recursive fault but reboot is needed!\n");
808 set_current_state(TASK_UNINTERRUPTIBLE);
809 schedule();
810 }
811
796 tsk->flags |= PF_EXITING; 812 tsk->flags |= PF_EXITING;
797 813
798 /* 814 /*
@@ -811,10 +827,8 @@ fastcall NORET_TYPE void do_exit(long code)
811 acct_update_integrals(tsk); 827 acct_update_integrals(tsk);
812 update_mem_hiwater(tsk); 828 update_mem_hiwater(tsk);
813 group_dead = atomic_dec_and_test(&tsk->signal->live); 829 group_dead = atomic_dec_and_test(&tsk->signal->live);
814 if (group_dead) { 830 if (group_dead)
815 del_timer_sync(&tsk->signal->real_timer);
816 acct_process(code); 831 acct_process(code);
817 }
818 exit_mm(tsk); 832 exit_mm(tsk);
819 833
820 exit_sem(tsk); 834 exit_sem(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index f42a17f88699..2c7806873bfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
194 mm->mmap = NULL; 194 mm->mmap = NULL;
195 mm->mmap_cache = NULL; 195 mm->mmap_cache = NULL;
196 mm->free_area_cache = oldmm->mmap_base; 196 mm->free_area_cache = oldmm->mmap_base;
197 mm->cached_hole_size = ~0UL;
197 mm->map_count = 0; 198 mm->map_count = 0;
198 set_mm_counter(mm, rss, 0); 199 set_mm_counter(mm, rss, 0);
199 set_mm_counter(mm, anon_rss, 0); 200 set_mm_counter(mm, anon_rss, 0);
@@ -249,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
249 250
250 /* 251 /*
251 * Link in the new vma and copy the page table entries: 252 * Link in the new vma and copy the page table entries:
252 * link in first so that swapoff can see swap entries, 253 * link in first so that swapoff can see swap entries.
253 * and try_to_unmap_one's find_vma find the new vma. 254 * Note that, exceptionally, here the vma is inserted
255 * without holding mm->mmap_sem.
254 */ 256 */
255 spin_lock(&mm->page_table_lock); 257 spin_lock(&mm->page_table_lock);
256 *pprev = tmp; 258 *pprev = tmp;
@@ -322,6 +324,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
322 mm->ioctx_list = NULL; 324 mm->ioctx_list = NULL;
323 mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); 325 mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
324 mm->free_area_cache = TASK_UNMAPPED_BASE; 326 mm->free_area_cache = TASK_UNMAPPED_BASE;
327 mm->cached_hole_size = ~0UL;
325 328
326 if (likely(!mm_alloc_pgd(mm))) { 329 if (likely(!mm_alloc_pgd(mm))) {
327 mm->def_flags = 0; 330 mm->def_flags = 0;
@@ -1000,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags,
1000 p->pdeath_signal = 0; 1003 p->pdeath_signal = 0;
1001 p->exit_state = 0; 1004 p->exit_state = 0;
1002 1005
1003 /* Perform scheduler related setup */
1004 sched_fork(p);
1005
1006 /* 1006 /*
1007 * Ok, make it visible to the rest of the system. 1007 * Ok, make it visible to the rest of the system.
1008 * We dont wake it up yet. 1008 * We dont wake it up yet.
@@ -1011,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags,
1011 INIT_LIST_HEAD(&p->ptrace_children); 1011 INIT_LIST_HEAD(&p->ptrace_children);
1012 INIT_LIST_HEAD(&p->ptrace_list); 1012 INIT_LIST_HEAD(&p->ptrace_list);
1013 1013
1014 /* Perform scheduler related setup. Assign this task to a CPU. */
1015 sched_fork(p, clone_flags);
1016
1014 /* Need tasklist lock for parent etc handling! */ 1017 /* Need tasklist lock for parent etc handling! */
1015 write_lock_irq(&tasklist_lock); 1018 write_lock_irq(&tasklist_lock);
1016 1019
1017 /* 1020 /*
1018 * The task hasn't been attached yet, so cpus_allowed mask cannot 1021 * The task hasn't been attached yet, so its cpus_allowed mask will
1019 * have changed. The cpus_allowed mask of the parent may have 1022 * not be changed, nor will its assigned CPU.
1020 * changed after it was copied first time, and it may then move to 1023 *
1021 * another CPU - so we re-copy it here and set the child's CPU to 1024 * The cpus_allowed mask of the parent may have changed after it was
1022 * the parent's CPU. This avoids alot of nasty races. 1025 * copied first time - so re-copy it here, then check the child's CPU
1026 * to ensure it is on a valid CPU (and if not, just force it back to
1027 * parent's CPU). This avoids alot of nasty races.
1023 */ 1028 */
1024 p->cpus_allowed = current->cpus_allowed; 1029 p->cpus_allowed = current->cpus_allowed;
1025 set_task_cpu(p, smp_processor_id()); 1030 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
1031 set_task_cpu(p, smp_processor_id());
1026 1032
1027 /* 1033 /*
1028 * Check for pending SIGKILL! The new thread should not be allowed 1034 * Check for pending SIGKILL! The new thread should not be allowed
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 06b5a6323998..436c7d93c00a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -119,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
119 */ 119 */
120 desc->handler->ack(irq); 120 desc->handler->ack(irq);
121 action_ret = handle_IRQ_event(irq, regs, desc->action); 121 action_ret = handle_IRQ_event(irq, regs, desc->action);
122 if (!noirqdebug)
123 note_interrupt(irq, desc, action_ret);
124 desc->handler->end(irq); 122 desc->handler->end(irq);
125 return 1; 123 return 1;
126 } 124 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5202e4c4a5b6..ac6700985705 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -6,6 +6,7 @@
6 * This file contains driver APIs to the irq subsystem. 6 * This file contains driver APIs to the irq subsystem.
7 */ 7 */
8 8
9#include <linux/config.h>
9#include <linux/irq.h> 10#include <linux/irq.h>
10#include <linux/module.h> 11#include <linux/module.h>
11#include <linux/random.h> 12#include <linux/random.h>
@@ -255,6 +256,13 @@ void free_irq(unsigned int irq, void *dev_id)
255 256
256 /* Found it - now remove it from the list of entries */ 257 /* Found it - now remove it from the list of entries */
257 *pp = action->next; 258 *pp = action->next;
259
260 /* Currently used only by UML, might disappear one day.*/
261#ifdef CONFIG_IRQ_RELEASE_METHOD
262 if (desc->handler->release)
263 desc->handler->release(irq, dev_id);
264#endif
265
258 if (!desc->action) { 266 if (!desc->action) {
259 desc->status |= IRQ_DISABLED; 267 desc->status |= IRQ_DISABLED;
260 if (desc->handler->shutdown) 268 if (desc->handler->shutdown)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f6297c306905..ba039e827d58 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -45,7 +45,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
45 } 45 }
46} 46}
47 47
48void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 48static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
49{ 49{
50 static int count = 100; 50 static int count = 100;
51 51
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..7843548cf2d9
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1063 @@
1/*
2 * kexec.c - kexec system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/file.h>
11#include <linux/slab.h>
12#include <linux/fs.h>
13#include <linux/kexec.h>
14#include <linux/spinlock.h>
15#include <linux/list.h>
16#include <linux/highmem.h>
17#include <linux/syscalls.h>
18#include <linux/reboot.h>
19#include <linux/syscalls.h>
20#include <linux/ioport.h>
21#include <linux/hardirq.h>
22
23#include <asm/page.h>
24#include <asm/uaccess.h>
25#include <asm/io.h>
26#include <asm/system.h>
27#include <asm/semaphore.h>
28
29/* Location of the reserved area for the crash kernel */
30struct resource crashk_res = {
31 .name = "Crash kernel",
32 .start = 0,
33 .end = 0,
34 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
35};
36
37int kexec_should_crash(struct task_struct *p)
38{
39 if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
40 return 1;
41 return 0;
42}
43
44/*
45 * When kexec transitions to the new kernel there is a one-to-one
46 * mapping between physical and virtual addresses. On processors
47 * where you can disable the MMU this is trivial, and easy. For
48 * others it is still a simple predictable page table to setup.
49 *
50 * In that environment kexec copies the new kernel to its final
51 * resting place. This means I can only support memory whose
52 * physical address can fit in an unsigned long. In particular
53 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
54 * If the assembly stub has more restrictive requirements
55 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
56 * defined more restrictively in <asm/kexec.h>.
57 *
58 * The code for the transition from the current kernel to the
59 * new kernel is placed in the control_code_buffer, whose size
60 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
61 * page of memory is necessary, but some architectures require more.
62 * Because this memory must be identity mapped in the transition from
63 * virtual to physical addresses it must live in the range
64 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
65 * modifiable.
66 *
67 * The assembly stub in the control code buffer is passed a linked list
68 * of descriptor pages detailing the source pages of the new kernel,
69 * and the destination addresses of those source pages. As this data
70 * structure is not used in the context of the current OS, it must
71 * be self-contained.
72 *
73 * The code has been made to work with highmem pages and will use a
74 * destination page in its final resting place (if it happens
75 * to allocate it). The end product of this is that most of the
76 * physical address space, and most of RAM can be used.
77 *
78 * Future directions include:
79 * - allocating a page table with the control code buffer identity
80 * mapped, to simplify machine_kexec and make kexec_on_panic more
81 * reliable.
82 */
83
84/*
85 * KIMAGE_NO_DEST is an impossible destination address..., for
86 * allocating pages whose destination address we do not care about.
87 */
88#define KIMAGE_NO_DEST (-1UL)
89
90static int kimage_is_destination_range(struct kimage *image,
91 unsigned long start, unsigned long end);
92static struct page *kimage_alloc_page(struct kimage *image,
93 unsigned int gfp_mask,
94 unsigned long dest);
95
96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
97 unsigned long nr_segments,
98 struct kexec_segment __user *segments)
99{
100 size_t segment_bytes;
101 struct kimage *image;
102 unsigned long i;
103 int result;
104
105 /* Allocate a controlling structure */
106 result = -ENOMEM;
107 image = kmalloc(sizeof(*image), GFP_KERNEL);
108 if (!image)
109 goto out;
110
111 memset(image, 0, sizeof(*image));
112 image->head = 0;
113 image->entry = &image->head;
114 image->last_entry = &image->head;
115 image->control_page = ~0; /* By default this does not apply */
116 image->start = entry;
117 image->type = KEXEC_TYPE_DEFAULT;
118
119 /* Initialize the list of control pages */
120 INIT_LIST_HEAD(&image->control_pages);
121
122 /* Initialize the list of destination pages */
123 INIT_LIST_HEAD(&image->dest_pages);
124
125 /* Initialize the list of unuseable pages */
126 INIT_LIST_HEAD(&image->unuseable_pages);
127
128 /* Read in the segments */
129 image->nr_segments = nr_segments;
130 segment_bytes = nr_segments * sizeof(*segments);
131 result = copy_from_user(image->segment, segments, segment_bytes);
132 if (result)
133 goto out;
134
135 /*
136 * Verify we have good destination addresses. The caller is
137 * responsible for making certain we don't attempt to load
138 * the new image into invalid or reserved areas of RAM. This
139 * just verifies it is an address we can use.
140 *
141 * Since the kernel does everything in page size chunks ensure
142 * the destination addreses are page aligned. Too many
143 * special cases crop of when we don't do this. The most
144 * insidious is getting overlapping destination addresses
145 * simply because addresses are changed to page size
146 * granularity.
147 */
148 result = -EADDRNOTAVAIL;
149 for (i = 0; i < nr_segments; i++) {
150 unsigned long mstart, mend;
151
152 mstart = image->segment[i].mem;
153 mend = mstart + image->segment[i].memsz;
154 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
155 goto out;
156 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
157 goto out;
158 }
159
160 /* Verify our destination addresses do not overlap.
161 * If we alloed overlapping destination addresses
162 * through very weird things can happen with no
163 * easy explanation as one segment stops on another.
164 */
165 result = -EINVAL;
166 for (i = 0; i < nr_segments; i++) {
167 unsigned long mstart, mend;
168 unsigned long j;
169
170 mstart = image->segment[i].mem;
171 mend = mstart + image->segment[i].memsz;
172 for (j = 0; j < i; j++) {
173 unsigned long pstart, pend;
174 pstart = image->segment[j].mem;
175 pend = pstart + image->segment[j].memsz;
176 /* Do the segments overlap ? */
177 if ((mend > pstart) && (mstart < pend))
178 goto out;
179 }
180 }
181
182 /* Ensure our buffer sizes are strictly less than
183 * our memory sizes. This should always be the case,
184 * and it is easier to check up front than to be surprised
185 * later on.
186 */
187 result = -EINVAL;
188 for (i = 0; i < nr_segments; i++) {
189 if (image->segment[i].bufsz > image->segment[i].memsz)
190 goto out;
191 }
192
193 result = 0;
194out:
195 if (result == 0)
196 *rimage = image;
197 else
198 kfree(image);
199
200 return result;
201
202}
203
204static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
205 unsigned long nr_segments,
206 struct kexec_segment __user *segments)
207{
208 int result;
209 struct kimage *image;
210
211 /* Allocate and initialize a controlling structure */
212 image = NULL;
213 result = do_kimage_alloc(&image, entry, nr_segments, segments);
214 if (result)
215 goto out;
216
217 *rimage = image;
218
219 /*
220 * Find a location for the control code buffer, and add it
221 * the vector of segments so that it's pages will also be
222 * counted as destination pages.
223 */
224 result = -ENOMEM;
225 image->control_code_page = kimage_alloc_control_pages(image,
226 get_order(KEXEC_CONTROL_CODE_SIZE));
227 if (!image->control_code_page) {
228 printk(KERN_ERR "Could not allocate control_code_buffer\n");
229 goto out;
230 }
231
232 result = 0;
233 out:
234 if (result == 0)
235 *rimage = image;
236 else
237 kfree(image);
238
239 return result;
240}
241
242static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
243 unsigned long nr_segments,
244 struct kexec_segment *segments)
245{
246 int result;
247 struct kimage *image;
248 unsigned long i;
249
250 image = NULL;
251 /* Verify we have a valid entry point */
252 if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
253 result = -EADDRNOTAVAIL;
254 goto out;
255 }
256
257 /* Allocate and initialize a controlling structure */
258 result = do_kimage_alloc(&image, entry, nr_segments, segments);
259 if (result)
260 goto out;
261
262 /* Enable the special crash kernel control page
263 * allocation policy.
264 */
265 image->control_page = crashk_res.start;
266 image->type = KEXEC_TYPE_CRASH;
267
268 /*
269 * Verify we have good destination addresses. Normally
270 * the caller is responsible for making certain we don't
271 * attempt to load the new image into invalid or reserved
272 * areas of RAM. But crash kernels are preloaded into a
273 * reserved area of ram. We must ensure the addresses
274 * are in the reserved area otherwise preloading the
275 * kernel could corrupt things.
276 */
277 result = -EADDRNOTAVAIL;
278 for (i = 0; i < nr_segments; i++) {
279 unsigned long mstart, mend;
280
281 mstart = image->segment[i].mem;
282 mend = mstart + image->segment[i].memsz - 1;
283 /* Ensure we are within the crash kernel limits */
284 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
285 goto out;
286 }
287
288 /*
289 * Find a location for the control code buffer, and add
290 * the vector of segments so that it's pages will also be
291 * counted as destination pages.
292 */
293 result = -ENOMEM;
294 image->control_code_page = kimage_alloc_control_pages(image,
295 get_order(KEXEC_CONTROL_CODE_SIZE));
296 if (!image->control_code_page) {
297 printk(KERN_ERR "Could not allocate control_code_buffer\n");
298 goto out;
299 }
300
301 result = 0;
302out:
303 if (result == 0)
304 *rimage = image;
305 else
306 kfree(image);
307
308 return result;
309}
310
311static int kimage_is_destination_range(struct kimage *image,
312 unsigned long start,
313 unsigned long end)
314{
315 unsigned long i;
316
317 for (i = 0; i < image->nr_segments; i++) {
318 unsigned long mstart, mend;
319
320 mstart = image->segment[i].mem;
321 mend = mstart + image->segment[i].memsz;
322 if ((end > mstart) && (start < mend))
323 return 1;
324 }
325
326 return 0;
327}
328
329static struct page *kimage_alloc_pages(unsigned int gfp_mask,
330 unsigned int order)
331{
332 struct page *pages;
333
334 pages = alloc_pages(gfp_mask, order);
335 if (pages) {
336 unsigned int count, i;
337 pages->mapping = NULL;
338 pages->private = order;
339 count = 1 << order;
340 for (i = 0; i < count; i++)
341 SetPageReserved(pages + i);
342 }
343
344 return pages;
345}
346
347static void kimage_free_pages(struct page *page)
348{
349 unsigned int order, count, i;
350
351 order = page->private;
352 count = 1 << order;
353 for (i = 0; i < count; i++)
354 ClearPageReserved(page + i);
355 __free_pages(page, order);
356}
357
358static void kimage_free_page_list(struct list_head *list)
359{
360 struct list_head *pos, *next;
361
362 list_for_each_safe(pos, next, list) {
363 struct page *page;
364
365 page = list_entry(pos, struct page, lru);
366 list_del(&page->lru);
367 kimage_free_pages(page);
368 }
369}
370
371static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
372 unsigned int order)
373{
374 /* Control pages are special, they are the intermediaries
375 * that are needed while we copy the rest of the pages
376 * to their final resting place. As such they must
377 * not conflict with either the destination addresses
378 * or memory the kernel is already using.
379 *
380 * The only case where we really need more than one of
381 * these are for architectures where we cannot disable
382 * the MMU and must instead generate an identity mapped
383 * page table for all of the memory.
384 *
385 * At worst this runs in O(N) of the image size.
386 */
387 struct list_head extra_pages;
388 struct page *pages;
389 unsigned int count;
390
391 count = 1 << order;
392 INIT_LIST_HEAD(&extra_pages);
393
394 /* Loop while I can allocate a page and the page allocated
395 * is a destination page.
396 */
397 do {
398 unsigned long pfn, epfn, addr, eaddr;
399
400 pages = kimage_alloc_pages(GFP_KERNEL, order);
401 if (!pages)
402 break;
403 pfn = page_to_pfn(pages);
404 epfn = pfn + count;
405 addr = pfn << PAGE_SHIFT;
406 eaddr = epfn << PAGE_SHIFT;
407 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
408 kimage_is_destination_range(image, addr, eaddr)) {
409 list_add(&pages->lru, &extra_pages);
410 pages = NULL;
411 }
412 } while (!pages);
413
414 if (pages) {
415 /* Remember the allocated page... */
416 list_add(&pages->lru, &image->control_pages);
417
418 /* Because the page is already in it's destination
419 * location we will never allocate another page at
420 * that address. Therefore kimage_alloc_pages
421 * will not return it (again) and we don't need
422 * to give it an entry in image->segment[].
423 */
424 }
425 /* Deal with the destination pages I have inadvertently allocated.
426 *
427 * Ideally I would convert multi-page allocations into single
428 * page allocations, and add everyting to image->dest_pages.
429 *
430 * For now it is simpler to just free the pages.
431 */
432 kimage_free_page_list(&extra_pages);
433
434 return pages;
435}
436
437static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
438 unsigned int order)
439{
440 /* Control pages are special, they are the intermediaries
441 * that are needed while we copy the rest of the pages
442 * to their final resting place. As such they must
443 * not conflict with either the destination addresses
444 * or memory the kernel is already using.
445 *
446 * Control pages are also the only pags we must allocate
447 * when loading a crash kernel. All of the other pages
448 * are specified by the segments and we just memcpy
449 * into them directly.
450 *
451 * The only case where we really need more than one of
452 * these are for architectures where we cannot disable
453 * the MMU and must instead generate an identity mapped
454 * page table for all of the memory.
455 *
456 * Given the low demand this implements a very simple
457 * allocator that finds the first hole of the appropriate
458 * size in the reserved memory region, and allocates all
459 * of the memory up to and including the hole.
460 */
461 unsigned long hole_start, hole_end, size;
462 struct page *pages;
463
464 pages = NULL;
465 size = (1 << order) << PAGE_SHIFT;
466 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
467 hole_end = hole_start + size - 1;
468 while (hole_end <= crashk_res.end) {
469 unsigned long i;
470
471 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
472 break;
473 if (hole_end > crashk_res.end)
474 break;
475 /* See if I overlap any of the segments */
476 for (i = 0; i < image->nr_segments; i++) {
477 unsigned long mstart, mend;
478
479 mstart = image->segment[i].mem;
480 mend = mstart + image->segment[i].memsz - 1;
481 if ((hole_end >= mstart) && (hole_start <= mend)) {
482 /* Advance the hole to the end of the segment */
483 hole_start = (mend + (size - 1)) & ~(size - 1);
484 hole_end = hole_start + size - 1;
485 break;
486 }
487 }
488 /* If I don't overlap any segments I have found my hole! */
489 if (i == image->nr_segments) {
490 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
491 break;
492 }
493 }
494 if (pages)
495 image->control_page = hole_end;
496
497 return pages;
498}
499
500
501struct page *kimage_alloc_control_pages(struct kimage *image,
502 unsigned int order)
503{
504 struct page *pages = NULL;
505
506 switch (image->type) {
507 case KEXEC_TYPE_DEFAULT:
508 pages = kimage_alloc_normal_control_pages(image, order);
509 break;
510 case KEXEC_TYPE_CRASH:
511 pages = kimage_alloc_crash_control_pages(image, order);
512 break;
513 }
514
515 return pages;
516}
517
518static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
519{
520 if (*image->entry != 0)
521 image->entry++;
522
523 if (image->entry == image->last_entry) {
524 kimage_entry_t *ind_page;
525 struct page *page;
526
527 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
528 if (!page)
529 return -ENOMEM;
530
531 ind_page = page_address(page);
532 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
533 image->entry = ind_page;
534 image->last_entry = ind_page +
535 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
536 }
537 *image->entry = entry;
538 image->entry++;
539 *image->entry = 0;
540
541 return 0;
542}
543
544static int kimage_set_destination(struct kimage *image,
545 unsigned long destination)
546{
547 int result;
548
549 destination &= PAGE_MASK;
550 result = kimage_add_entry(image, destination | IND_DESTINATION);
551 if (result == 0)
552 image->destination = destination;
553
554 return result;
555}
556
557
558static int kimage_add_page(struct kimage *image, unsigned long page)
559{
560 int result;
561
562 page &= PAGE_MASK;
563 result = kimage_add_entry(image, page | IND_SOURCE);
564 if (result == 0)
565 image->destination += PAGE_SIZE;
566
567 return result;
568}
569
570
571static void kimage_free_extra_pages(struct kimage *image)
572{
573 /* Walk through and free any extra destination pages I may have */
574 kimage_free_page_list(&image->dest_pages);
575
576 /* Walk through and free any unuseable pages I have cached */
577 kimage_free_page_list(&image->unuseable_pages);
578
579}
580static int kimage_terminate(struct kimage *image)
581{
582 if (*image->entry != 0)
583 image->entry++;
584
585 *image->entry = IND_DONE;
586
587 return 0;
588}
589
/*
 * Iterate over the entry list of 'image'.
 *
 * 'ptr' (a kimage_entry_t *) walks the slots starting at image->head and
 * 'entry' receives the value of each slot.  The walk stops at a zero
 * entry or at an entry with IND_DONE set.  After an IND_INDIRECTION
 * entry has been visited, the walk continues in the table it points to;
 * otherwise it moves on to the next slot.
 *
 * The arguments are evaluated several times, so they must be free of
 * side effects; 'ptr' and 'entry' must be assignable lvalues.
 */
#define for_each_kimage_entry(image, ptr, entry) \
	for ((ptr) = &(image)->head; \
		((entry) = *(ptr)) && !((entry) & IND_DONE); \
		(ptr) = ((entry) & IND_INDIRECTION) ? \
			phys_to_virt(((entry) & PAGE_MASK)) : (ptr) + 1)
594
595static void kimage_free_entry(kimage_entry_t entry)
596{
597 struct page *page;
598
599 page = pfn_to_page(entry >> PAGE_SHIFT);
600 kimage_free_pages(page);
601}
602
603static void kimage_free(struct kimage *image)
604{
605 kimage_entry_t *ptr, entry;
606 kimage_entry_t ind = 0;
607
608 if (!image)
609 return;
610
611 kimage_free_extra_pages(image);
612 for_each_kimage_entry(image, ptr, entry) {
613 if (entry & IND_INDIRECTION) {
614 /* Free the previous indirection page */
615 if (ind & IND_INDIRECTION)
616 kimage_free_entry(ind);
617 /* Save this indirection page until we are
618 * done with it.
619 */
620 ind = entry;
621 }
622 else if (entry & IND_SOURCE)
623 kimage_free_entry(entry);
624 }
625 /* Free the final indirection page */
626 if (ind & IND_INDIRECTION)
627 kimage_free_entry(ind);
628
629 /* Handle any machine specific cleanup */
630 machine_kexec_cleanup(image);
631
632 /* Free the kexec control pages... */
633 kimage_free_page_list(&image->control_pages);
634 kfree(image);
635}
636
637static kimage_entry_t *kimage_dst_used(struct kimage *image,
638 unsigned long page)
639{
640 kimage_entry_t *ptr, entry;
641 unsigned long destination = 0;
642
643 for_each_kimage_entry(image, ptr, entry) {
644 if (entry & IND_DESTINATION)
645 destination = entry & PAGE_MASK;
646 else if (entry & IND_SOURCE) {
647 if (page == destination)
648 return ptr;
649 destination += PAGE_SIZE;
650 }
651 }
652
653 return 0;
654}
655
656static struct page *kimage_alloc_page(struct kimage *image,
657 unsigned int gfp_mask,
658 unsigned long destination)
659{
660 /*
661 * Here we implement safeguards to ensure that a source page
662 * is not copied to its destination page before the data on
663 * the destination page is no longer useful.
664 *
665 * To do this we maintain the invariant that a source page is
666 * either its own destination page, or it is not a
667 * destination page at all.
668 *
669 * That is slightly stronger than required, but the proof
670 * that no problems will not occur is trivial, and the
671 * implementation is simply to verify.
672 *
673 * When allocating all pages normally this algorithm will run
674 * in O(N) time, but in the worst case it will run in O(N^2)
675 * time. If the runtime is a problem the data structures can
676 * be fixed.
677 */
678 struct page *page;
679 unsigned long addr;
680
681 /*
682 * Walk through the list of destination pages, and see if I
683 * have a match.
684 */
685 list_for_each_entry(page, &image->dest_pages, lru) {
686 addr = page_to_pfn(page) << PAGE_SHIFT;
687 if (addr == destination) {
688 list_del(&page->lru);
689 return page;
690 }
691 }
692 page = NULL;
693 while (1) {
694 kimage_entry_t *old;
695
696 /* Allocate a page, if we run out of memory give up */
697 page = kimage_alloc_pages(gfp_mask, 0);
698 if (!page)
699 return 0;
700 /* If the page cannot be used file it away */
701 if (page_to_pfn(page) >
702 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
703 list_add(&page->lru, &image->unuseable_pages);
704 continue;
705 }
706 addr = page_to_pfn(page) << PAGE_SHIFT;
707
708 /* If it is the destination page we want use it */
709 if (addr == destination)
710 break;
711
712 /* If the page is not a destination page use it */
713 if (!kimage_is_destination_range(image, addr,
714 addr + PAGE_SIZE))
715 break;
716
717 /*
718 * I know that the page is someones destination page.
719 * See if there is already a source page for this
720 * destination page. And if so swap the source pages.
721 */
722 old = kimage_dst_used(image, addr);
723 if (old) {
724 /* If so move it */
725 unsigned long old_addr;
726 struct page *old_page;
727
728 old_addr = *old & PAGE_MASK;
729 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
730 copy_highpage(page, old_page);
731 *old = addr | (*old & ~PAGE_MASK);
732
733 /* The old page I have found cannot be a
734 * destination page, so return it.
735 */
736 addr = old_addr;
737 page = old_page;
738 break;
739 }
740 else {
741 /* Place the page on the destination list I
742 * will use it later.
743 */
744 list_add(&page->lru, &image->dest_pages);
745 }
746 }
747
748 return page;
749}
750
751static int kimage_load_normal_segment(struct kimage *image,
752 struct kexec_segment *segment)
753{
754 unsigned long maddr;
755 unsigned long ubytes, mbytes;
756 int result;
757 unsigned char *buf;
758
759 result = 0;
760 buf = segment->buf;
761 ubytes = segment->bufsz;
762 mbytes = segment->memsz;
763 maddr = segment->mem;
764
765 result = kimage_set_destination(image, maddr);
766 if (result < 0)
767 goto out;
768
769 while (mbytes) {
770 struct page *page;
771 char *ptr;
772 size_t uchunk, mchunk;
773
774 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
775 if (page == 0) {
776 result = -ENOMEM;
777 goto out;
778 }
779 result = kimage_add_page(image, page_to_pfn(page)
780 << PAGE_SHIFT);
781 if (result < 0)
782 goto out;
783
784 ptr = kmap(page);
785 /* Start with a clear page */
786 memset(ptr, 0, PAGE_SIZE);
787 ptr += maddr & ~PAGE_MASK;
788 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
789 if (mchunk > mbytes)
790 mchunk = mbytes;
791
792 uchunk = mchunk;
793 if (uchunk > ubytes)
794 uchunk = ubytes;
795
796 result = copy_from_user(ptr, buf, uchunk);
797 kunmap(page);
798 if (result) {
799 result = (result < 0) ? result : -EIO;
800 goto out;
801 }
802 ubytes -= uchunk;
803 maddr += mchunk;
804 buf += mchunk;
805 mbytes -= mchunk;
806 }
807out:
808 return result;
809}
810
811static int kimage_load_crash_segment(struct kimage *image,
812 struct kexec_segment *segment)
813{
814 /* For crash dumps kernels we simply copy the data from
815 * user space to it's destination.
816 * We do things a page at a time for the sake of kmap.
817 */
818 unsigned long maddr;
819 unsigned long ubytes, mbytes;
820 int result;
821 unsigned char *buf;
822
823 result = 0;
824 buf = segment->buf;
825 ubytes = segment->bufsz;
826 mbytes = segment->memsz;
827 maddr = segment->mem;
828 while (mbytes) {
829 struct page *page;
830 char *ptr;
831 size_t uchunk, mchunk;
832
833 page = pfn_to_page(maddr >> PAGE_SHIFT);
834 if (page == 0) {
835 result = -ENOMEM;
836 goto out;
837 }
838 ptr = kmap(page);
839 ptr += maddr & ~PAGE_MASK;
840 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
841 if (mchunk > mbytes)
842 mchunk = mbytes;
843
844 uchunk = mchunk;
845 if (uchunk > ubytes) {
846 uchunk = ubytes;
847 /* Zero the trailing part of the page */
848 memset(ptr + uchunk, 0, mchunk - uchunk);
849 }
850 result = copy_from_user(ptr, buf, uchunk);
851 kunmap(page);
852 if (result) {
853 result = (result < 0) ? result : -EIO;
854 goto out;
855 }
856 ubytes -= uchunk;
857 maddr += mchunk;
858 buf += mchunk;
859 mbytes -= mchunk;
860 }
861out:
862 return result;
863}
864
865static int kimage_load_segment(struct kimage *image,
866 struct kexec_segment *segment)
867{
868 int result = -ENOMEM;
869
870 switch (image->type) {
871 case KEXEC_TYPE_DEFAULT:
872 result = kimage_load_normal_segment(image, segment);
873 break;
874 case KEXEC_TYPE_CRASH:
875 result = kimage_load_crash_segment(image, segment);
876 break;
877 }
878
879 return result;
880}
881
882/*
883 * Exec Kernel system call: for obvious reasons only root may call it.
884 *
885 * This call breaks up into three pieces.
886 * - A generic part which loads the new kernel from the current
887 * address space, and very carefully places the data in the
888 * allocated pages.
889 *
890 * - A generic part that interacts with the kernel and tells all of
891 * the devices to shut down. Preventing on-going dmas, and placing
892 * the devices in a consistent state so a later kernel can
893 * reinitialize them.
894 *
895 * - A machine specific part that includes the syscall number
896 * and then copies the image to its final destination, and
897 * jumps into the image at entry.
898 *
899 * kexec does not sync, or unmount filesystems so if you need
900 * that to happen you need to do that yourself.
901 */
902struct kimage *kexec_image = NULL;
903static struct kimage *kexec_crash_image = NULL;
904/*
905 * A home grown binary mutex.
906 * Nothing can wait so this mutex is safe to use
907 * in interrupt context :)
908 */
909static int kexec_lock = 0;
910
911asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
912 struct kexec_segment __user *segments,
913 unsigned long flags)
914{
915 struct kimage **dest_image, *image;
916 int locked;
917 int result;
918
919 /* We only trust the superuser with rebooting the system. */
920 if (!capable(CAP_SYS_BOOT))
921 return -EPERM;
922
923 /*
924 * Verify we have a legal set of flags
925 * This leaves us room for future extensions.
926 */
927 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
928 return -EINVAL;
929
930 /* Verify we are on the appropriate architecture */
931 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
932 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
933 return -EINVAL;
934
935 /* Put an artificial cap on the number
936 * of segments passed to kexec_load.
937 */
938 if (nr_segments > KEXEC_SEGMENT_MAX)
939 return -EINVAL;
940
941 image = NULL;
942 result = 0;
943
944 /* Because we write directly to the reserved memory
945 * region when loading crash kernels we need a mutex here to
946 * prevent multiple crash kernels from attempting to load
947 * simultaneously, and to prevent a crash kernel from loading
948 * over the top of a in use crash kernel.
949 *
950 * KISS: always take the mutex.
951 */
952 locked = xchg(&kexec_lock, 1);
953 if (locked)
954 return -EBUSY;
955
956 dest_image = &kexec_image;
957 if (flags & KEXEC_ON_CRASH)
958 dest_image = &kexec_crash_image;
959 if (nr_segments > 0) {
960 unsigned long i;
961
962 /* Loading another kernel to reboot into */
963 if ((flags & KEXEC_ON_CRASH) == 0)
964 result = kimage_normal_alloc(&image, entry,
965 nr_segments, segments);
966 /* Loading another kernel to switch to if this one crashes */
967 else if (flags & KEXEC_ON_CRASH) {
968 /* Free any current crash dump kernel before
969 * we corrupt it.
970 */
971 kimage_free(xchg(&kexec_crash_image, NULL));
972 result = kimage_crash_alloc(&image, entry,
973 nr_segments, segments);
974 }
975 if (result)
976 goto out;
977
978 result = machine_kexec_prepare(image);
979 if (result)
980 goto out;
981
982 for (i = 0; i < nr_segments; i++) {
983 result = kimage_load_segment(image, &image->segment[i]);
984 if (result)
985 goto out;
986 }
987 result = kimage_terminate(image);
988 if (result)
989 goto out;
990 }
991 /* Install the new kernel, and Uninstall the old */
992 image = xchg(dest_image, image);
993
994out:
995 xchg(&kexec_lock, 0); /* Release the mutex */
996 kimage_free(image);
997
998 return result;
999}
1000
1001#ifdef CONFIG_COMPAT
1002asmlinkage long compat_sys_kexec_load(unsigned long entry,
1003 unsigned long nr_segments,
1004 struct compat_kexec_segment __user *segments,
1005 unsigned long flags)
1006{
1007 struct compat_kexec_segment in;
1008 struct kexec_segment out, __user *ksegments;
1009 unsigned long i, result;
1010
1011 /* Don't allow clients that don't understand the native
1012 * architecture to do anything.
1013 */
1014 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1015 return -EINVAL;
1016
1017 if (nr_segments > KEXEC_SEGMENT_MAX)
1018 return -EINVAL;
1019
1020 ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1021 for (i=0; i < nr_segments; i++) {
1022 result = copy_from_user(&in, &segments[i], sizeof(in));
1023 if (result)
1024 return -EFAULT;
1025
1026 out.buf = compat_ptr(in.buf);
1027 out.bufsz = in.bufsz;
1028 out.mem = in.mem;
1029 out.memsz = in.memsz;
1030
1031 result = copy_to_user(&ksegments[i], &out, sizeof(out));
1032 if (result)
1033 return -EFAULT;
1034 }
1035
1036 return sys_kexec_load(entry, nr_segments, ksegments, flags);
1037}
1038#endif
1039
1040void crash_kexec(struct pt_regs *regs)
1041{
1042 struct kimage *image;
1043 int locked;
1044
1045
1046 /* Take the kexec_lock here to prevent sys_kexec_load
1047 * running on one cpu from replacing the crash kernel
1048 * we are using after a panic on a different cpu.
1049 *
1050 * If the crash kernel was not located in a fixed area
1051 * of memory the xchg(&kexec_crash_image) would be
1052 * sufficient. But since I reuse the memory...
1053 */
1054 locked = xchg(&kexec_lock, 1);
1055 if (!locked) {
1056 image = xchg(&kexec_crash_image, NULL);
1057 if (image) {
1058 machine_crash_shutdown(regs);
1059 machine_kexec(image);
1060 }
1061 xchg(&kexec_lock, 0);
1062 }
1063}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index eed53d4f5230..44166e3bb8af 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,6 +120,7 @@ struct subprocess_info {
120 char *path; 120 char *path;
121 char **argv; 121 char **argv;
122 char **envp; 122 char **envp;
123 struct key *ring;
123 int wait; 124 int wait;
124 int retval; 125 int retval;
125}; 126};
@@ -130,16 +131,21 @@ struct subprocess_info {
130static int ____call_usermodehelper(void *data) 131static int ____call_usermodehelper(void *data)
131{ 132{
132 struct subprocess_info *sub_info = data; 133 struct subprocess_info *sub_info = data;
134 struct key *old_session;
133 int retval; 135 int retval;
134 136
135 /* Unblock all signals. */ 137 /* Unblock all signals and set the session keyring. */
138 key_get(sub_info->ring);
136 flush_signals(current); 139 flush_signals(current);
137 spin_lock_irq(&current->sighand->siglock); 140 spin_lock_irq(&current->sighand->siglock);
141 old_session = __install_session_keyring(current, sub_info->ring);
138 flush_signal_handlers(current, 1); 142 flush_signal_handlers(current, 1);
139 sigemptyset(&current->blocked); 143 sigemptyset(&current->blocked);
140 recalc_sigpending(); 144 recalc_sigpending();
141 spin_unlock_irq(&current->sighand->siglock); 145 spin_unlock_irq(&current->sighand->siglock);
142 146
147 key_put(old_session);
148
143 /* We can run anywhere, unlike our parent keventd(). */ 149 /* We can run anywhere, unlike our parent keventd(). */
144 set_cpus_allowed(current, CPU_MASK_ALL); 150 set_cpus_allowed(current, CPU_MASK_ALL);
145 151
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data)
211} 217}
212 218
213/** 219/**
214 * call_usermodehelper - start a usermode application 220 * call_usermodehelper_keys - start a usermode application
215 * @path: pathname for the application 221 * @path: pathname for the application
216 * @argv: null-terminated argument list 222 * @argv: null-terminated argument list
217 * @envp: null-terminated environment list 223 * @envp: null-terminated environment list
224 * @session_keyring: session keyring for process (NULL for an empty keyring)
218 * @wait: wait for the application to finish and return status. 225 * @wait: wait for the application to finish and return status.
219 * 226 *
220 * Runs a user-space application. The application is started 227 * Runs a user-space application. The application is started
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data)
224 * Must be called from process context. Returns a negative error code 231 * Must be called from process context. Returns a negative error code
225 * if program was not execed successfully, or 0. 232 * if program was not execed successfully, or 0.
226 */ 233 */
227int call_usermodehelper(char *path, char **argv, char **envp, int wait) 234int call_usermodehelper_keys(char *path, char **argv, char **envp,
235 struct key *session_keyring, int wait)
228{ 236{
229 DECLARE_COMPLETION(done); 237 DECLARE_COMPLETION(done);
230 struct subprocess_info sub_info = { 238 struct subprocess_info sub_info = {
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
232 .path = path, 240 .path = path,
233 .argv = argv, 241 .argv = argv,
234 .envp = envp, 242 .envp = envp,
243 .ring = session_keyring,
235 .wait = wait, 244 .wait = wait,
236 .retval = 0, 245 .retval = 0,
237 }; 246 };
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
247 wait_for_completion(&done); 256 wait_for_completion(&done);
248 return sub_info.retval; 257 return sub_info.retval;
249} 258}
250EXPORT_SYMBOL(call_usermodehelper); 259EXPORT_SYMBOL(call_usermodehelper_keys);
251 260
252void __init usermodehelper_init(void) 261void __init usermodehelper_init(void)
253{ 262{
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 037142b72a49..334f37472c56 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -27,6 +27,9 @@
27 * interface to access function arguments. 27 * interface to access function arguments.
28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes 28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
29 * exceptions notifier to be first on the priority list. 29 * exceptions notifier to be first on the priority list.
30 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
31 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
32 * <prasanna@in.ibm.com> added function-return probes.
30 */ 33 */
31#include <linux/kprobes.h> 34#include <linux/kprobes.h>
32#include <linux/spinlock.h> 35#include <linux/spinlock.h>
@@ -41,6 +44,7 @@
41#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 44#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
42 45
43static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 46static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
47static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
44 48
45unsigned int kprobe_cpu = NR_CPUS; 49unsigned int kprobe_cpu = NR_CPUS;
46static DEFINE_SPINLOCK(kprobe_lock); 50static DEFINE_SPINLOCK(kprobe_lock);
@@ -78,22 +82,23 @@ struct kprobe *get_kprobe(void *addr)
78 * Aggregate handlers for multiple kprobes support - these handlers 82 * Aggregate handlers for multiple kprobes support - these handlers
79 * take care of invoking the individual kprobe handlers on p->list 83 * take care of invoking the individual kprobe handlers on p->list
80 */ 84 */
81int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 85static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
82{ 86{
83 struct kprobe *kp; 87 struct kprobe *kp;
84 88
85 list_for_each_entry(kp, &p->list, list) { 89 list_for_each_entry(kp, &p->list, list) {
86 if (kp->pre_handler) { 90 if (kp->pre_handler) {
87 curr_kprobe = kp; 91 curr_kprobe = kp;
88 kp->pre_handler(kp, regs); 92 if (kp->pre_handler(kp, regs))
89 curr_kprobe = NULL; 93 return 1;
90 } 94 }
95 curr_kprobe = NULL;
91 } 96 }
92 return 0; 97 return 0;
93} 98}
94 99
95void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 100static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
96 unsigned long flags) 101 unsigned long flags)
97{ 102{
98 struct kprobe *kp; 103 struct kprobe *kp;
99 104
@@ -107,7 +112,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
107 return; 112 return;
108} 113}
109 114
110int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) 115static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
116 int trapnr)
111{ 117{
112 /* 118 /*
113 * if we faulted "during" the execution of a user specified 119 * if we faulted "during" the execution of a user specified
@@ -120,19 +126,191 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
120 return 0; 126 return 0;
121} 127}
122 128
129static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
130{
131 struct kprobe *kp = curr_kprobe;
132 if (curr_kprobe && kp->break_handler) {
133 if (kp->break_handler(kp, regs)) {
134 curr_kprobe = NULL;
135 return 1;
136 }
137 }
138 curr_kprobe = NULL;
139 return 0;
140}
141
/*
 * Kprobe placed at the address of kretprobe_trampoline.  Its pre and
 * post handlers are trampoline_probe_handler() and
 * trampoline_post_handler(); it is registered from init_kprobes().
 */
142struct kprobe trampoline_p = {
143 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
144 .pre_handler = trampoline_probe_handler,
145 .post_handler = trampoline_post_handler
146};
147
148struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
149{
150 struct hlist_node *node;
151 struct kretprobe_instance *ri;
152 hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
153 return ri;
154 return NULL;
155}
156
157static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
158{
159 struct hlist_node *node;
160 struct kretprobe_instance *ri;
161 hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
162 return ri;
163 return NULL;
164}
165
166struct kretprobe_instance *get_rp_inst(void *sara)
167{
168 struct hlist_head *head;
169 struct hlist_node *node;
170 struct task_struct *tsk;
171 struct kretprobe_instance *ri;
172
173 tsk = arch_get_kprobe_task(sara);
174 head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
175 hlist_for_each_entry(ri, node, head, hlist) {
176 if (ri->stack_addr == sara)
177 return ri;
178 }
179 return NULL;
180}
181
182void add_rp_inst(struct kretprobe_instance *ri)
183{
184 struct task_struct *tsk;
185 /*
186 * Remove rp inst off the free list -
187 * Add it back when probed function returns
188 */
189 hlist_del(&ri->uflist);
190 tsk = arch_get_kprobe_task(ri->stack_addr);
191 /* Add rp inst onto table */
192 INIT_HLIST_NODE(&ri->hlist);
193 hlist_add_head(&ri->hlist,
194 &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]);
195
196 /* Also add this rp inst to the used list. */
197 INIT_HLIST_NODE(&ri->uflist);
198 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
199}
200
201void recycle_rp_inst(struct kretprobe_instance *ri)
202{
203 /* remove rp inst off the rprobe_inst_table */
204 hlist_del(&ri->hlist);
205 if (ri->rp) {
206 /* remove rp inst off the used list */
207 hlist_del(&ri->uflist);
208 /* put rp inst back onto the free list */
209 INIT_HLIST_NODE(&ri->uflist);
210 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
211 } else
212 /* Unregistering */
213 kfree(ri);
214}
215
216struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
217{
218 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
219}
220
221struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
222{
223 struct task_struct *tsk;
224 struct hlist_head *head;
225 struct hlist_node *node;
226 struct kretprobe_instance *ri;
227
228 head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)];
229
230 hlist_for_each_entry(ri, node, head, hlist) {
231 tsk = arch_get_kprobe_task(ri->stack_addr);
232 if (tsk == tk)
233 return ri;
234 }
235 return NULL;
236}
237
238/*
239 * This function is called from do_exit or do_execv when task tk's stack is
240 * about to be recycled. Recycle any function-return probe instances
241 * associated with this task. These represent probed functions that have
242 * been called but may never return.
243 */
244void kprobe_flush_task(struct task_struct *tk)
245{
246 unsigned long flags = 0;
247 spin_lock_irqsave(&kprobe_lock, flags);
248 arch_kprobe_flush_task(tk);
249 spin_unlock_irqrestore(&kprobe_lock, flags);
250}
251
252/*
253 * This kprobe pre_handler is registered with every kretprobe. When probe
254 * hits it will set up the return probe.
255 */
256static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
257{
258 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
259
260 /*TODO: consider to only swap the RA after the last pre_handler fired */
261 arch_prepare_kretprobe(rp, regs);
262 return 0;
263}
264
265static inline void free_rp_inst(struct kretprobe *rp)
266{
267 struct kretprobe_instance *ri;
268 while ((ri = get_free_rp_inst(rp)) != NULL) {
269 hlist_del(&ri->uflist);
270 kfree(ri);
271 }
272}
273
274/*
275 * Keep all fields in the kprobe consistent
276 */
277static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
278{
279 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
280 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
281}
282
283/*
284* Add the new probe to old_p->list. Fail if this is the
285* second jprobe at the address - two jprobes can't coexist
286*/
287static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
288{
289 struct kprobe *kp;
290
291 if (p->break_handler) {
292 list_for_each_entry(kp, &old_p->list, list) {
293 if (kp->break_handler)
294 return -EEXIST;
295 }
296 list_add_tail(&p->list, &old_p->list);
297 } else
298 list_add(&p->list, &old_p->list);
299 return 0;
300}
301
123/* 302/*
124 * Fill in the required fields of the "manager kprobe". Replace the 303 * Fill in the required fields of the "manager kprobe". Replace the
125 * earlier kprobe in the hlist with the manager kprobe 304 * earlier kprobe in the hlist with the manager kprobe
126 */ 305 */
127static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 306static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
128{ 307{
308 copy_kprobe(p, ap);
129 ap->addr = p->addr; 309 ap->addr = p->addr;
130 ap->opcode = p->opcode;
131 memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
132
133 ap->pre_handler = aggr_pre_handler; 310 ap->pre_handler = aggr_pre_handler;
134 ap->post_handler = aggr_post_handler; 311 ap->post_handler = aggr_post_handler;
135 ap->fault_handler = aggr_fault_handler; 312 ap->fault_handler = aggr_fault_handler;
313 ap->break_handler = aggr_break_handler;
136 314
137 INIT_LIST_HEAD(&ap->list); 315 INIT_LIST_HEAD(&ap->list);
138 list_add(&p->list, &ap->list); 316 list_add(&p->list, &ap->list);
@@ -153,16 +331,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
153 int ret = 0; 331 int ret = 0;
154 struct kprobe *ap; 332 struct kprobe *ap;
155 333
156 if (old_p->break_handler || p->break_handler) { 334 if (old_p->pre_handler == aggr_pre_handler) {
157 ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ 335 copy_kprobe(old_p, p);
158 } else if (old_p->pre_handler == aggr_pre_handler) { 336 ret = add_new_kprobe(old_p, p);
159 list_add(&p->list, &old_p->list);
160 } else { 337 } else {
161 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); 338 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
162 if (!ap) 339 if (!ap)
163 return -ENOMEM; 340 return -ENOMEM;
164 add_aggr_kprobe(ap, old_p); 341 add_aggr_kprobe(ap, old_p);
165 list_add(&p->list, &ap->list); 342 copy_kprobe(ap, p);
343 ret = add_new_kprobe(ap, p);
166 } 344 }
167 return ret; 345 return ret;
168} 346}
@@ -170,10 +348,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
170/* kprobe removal house-keeping routines */ 348/* kprobe removal house-keeping routines */
171static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) 349static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
172{ 350{
173 *p->addr = p->opcode; 351 arch_disarm_kprobe(p);
174 hlist_del(&p->hlist); 352 hlist_del(&p->hlist);
175 flush_icache_range((unsigned long) p->addr,
176 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
177 spin_unlock_irqrestore(&kprobe_lock, flags); 353 spin_unlock_irqrestore(&kprobe_lock, flags);
178 arch_remove_kprobe(p); 354 arch_remove_kprobe(p);
179} 355}
@@ -200,6 +376,7 @@ int register_kprobe(struct kprobe *p)
200 } 376 }
201 spin_lock_irqsave(&kprobe_lock, flags); 377 spin_lock_irqsave(&kprobe_lock, flags);
202 old_p = get_kprobe(p->addr); 378 old_p = get_kprobe(p->addr);
379 p->nmissed = 0;
203 if (old_p) { 380 if (old_p) {
204 ret = register_aggr_kprobe(old_p, p); 381 ret = register_aggr_kprobe(old_p, p);
205 goto out; 382 goto out;
@@ -210,10 +387,8 @@ int register_kprobe(struct kprobe *p)
210 hlist_add_head(&p->hlist, 387 hlist_add_head(&p->hlist,
211 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 388 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
212 389
213 p->opcode = *p->addr; 390 arch_arm_kprobe(p);
214 *p->addr = BREAKPOINT_INSTRUCTION; 391
215 flush_icache_range((unsigned long) p->addr,
216 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
217out: 392out:
218 spin_unlock_irqrestore(&kprobe_lock, flags); 393 spin_unlock_irqrestore(&kprobe_lock, flags);
219rm_kprobe: 394rm_kprobe:
@@ -257,16 +432,82 @@ void unregister_jprobe(struct jprobe *jp)
257 unregister_kprobe(&jp->kp); 432 unregister_kprobe(&jp->kp);
258} 433}
259 434
435#ifdef ARCH_SUPPORTS_KRETPROBES
436
437int register_kretprobe(struct kretprobe *rp)
438{
439 int ret = 0;
440 struct kretprobe_instance *inst;
441 int i;
442
443 rp->kp.pre_handler = pre_handler_kretprobe;
444
445 /* Pre-allocate memory for max kretprobe instances */
446 if (rp->maxactive <= 0) {
447#ifdef CONFIG_PREEMPT
448 rp->maxactive = max(10, 2 * NR_CPUS);
449#else
450 rp->maxactive = NR_CPUS;
451#endif
452 }
453 INIT_HLIST_HEAD(&rp->used_instances);
454 INIT_HLIST_HEAD(&rp->free_instances);
455 for (i = 0; i < rp->maxactive; i++) {
456 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
457 if (inst == NULL) {
458 free_rp_inst(rp);
459 return -ENOMEM;
460 }
461 INIT_HLIST_NODE(&inst->uflist);
462 hlist_add_head(&inst->uflist, &rp->free_instances);
463 }
464
465 rp->nmissed = 0;
466 /* Establish function entry probe point */
467 if ((ret = register_kprobe(&rp->kp)) != 0)
468 free_rp_inst(rp);
469 return ret;
470}
471
472#else /* ARCH_SUPPORTS_KRETPROBES */
473
474int register_kretprobe(struct kretprobe *rp)
475{
476 return -ENOSYS;
477}
478
479#endif /* ARCH_SUPPORTS_KRETPROBES */
480
481void unregister_kretprobe(struct kretprobe *rp)
482{
483 unsigned long flags;
484 struct kretprobe_instance *ri;
485
486 unregister_kprobe(&rp->kp);
487 /* No race here */
488 spin_lock_irqsave(&kprobe_lock, flags);
489 free_rp_inst(rp);
490 while ((ri = get_used_rp_inst(rp)) != NULL) {
491 ri->rp = NULL;
492 hlist_del(&ri->uflist);
493 }
494 spin_unlock_irqrestore(&kprobe_lock, flags);
495}
496
260static int __init init_kprobes(void) 497static int __init init_kprobes(void)
261{ 498{
262 int i, err = 0; 499 int i, err = 0;
263 500
264 /* FIXME allocate the probe table, currently defined statically */ 501 /* FIXME allocate the probe table, currently defined statically */
265 /* initialize all list heads */ 502 /* initialize all list heads */
266 for (i = 0; i < KPROBE_TABLE_SIZE; i++) 503 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
267 INIT_HLIST_HEAD(&kprobe_table[i]); 504 INIT_HLIST_HEAD(&kprobe_table[i]);
505 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
506 }
268 507
269 err = register_die_notifier(&kprobe_exceptions_nb); 508 err = register_die_notifier(&kprobe_exceptions_nb);
509 /* Register the trampoline probe for return probe */
510 register_kprobe(&trampoline_p);
270 return err; 511 return err;
271} 512}
272 513
@@ -277,3 +518,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe);
277EXPORT_SYMBOL_GPL(register_jprobe); 518EXPORT_SYMBOL_GPL(register_jprobe);
278EXPORT_SYMBOL_GPL(unregister_jprobe); 519EXPORT_SYMBOL_GPL(unregister_jprobe);
279EXPORT_SYMBOL_GPL(jprobe_return); 520EXPORT_SYMBOL_GPL(jprobe_return);
521EXPORT_SYMBOL_GPL(register_kretprobe);
522EXPORT_SYMBOL_GPL(unregister_kretprobe);
523
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8cf..015fb69ad94d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
30KERNEL_ATTR_RO(hotplug_seqnum); 30KERNEL_ATTR_RO(hotplug_seqnum);
31#endif 31#endif
32 32
33#ifdef CONFIG_KEXEC
34#include <asm/kexec.h>
35
/*
 * sysfs show routine for the crash_notes attribute: writes the value
 * of crash_notes, formatted as a pointer followed by a newline, into
 * @page and returns the number of characters written.
 */
36static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
37{
38 return sprintf(page, "%p\n", (void *)crash_notes);
39}
40KERNEL_ATTR_RO(crash_notes);
41#endif
42
33decl_subsys(kernel, NULL, NULL); 43decl_subsys(kernel, NULL, NULL);
34EXPORT_SYMBOL_GPL(kernel_subsys); 44EXPORT_SYMBOL_GPL(kernel_subsys);
35 45
@@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = {
37#ifdef CONFIG_HOTPLUG 47#ifdef CONFIG_HOTPLUG
38 &hotplug_seqnum_attr.attr, 48 &hotplug_seqnum_attr.attr,
39#endif 49#endif
50#ifdef CONFIG_KEXEC
51 &crash_notes_attr.attr,
52#endif
40 NULL 53 NULL
41}; 54};
42 55
diff --git a/kernel/module.c b/kernel/module.c
index 5734ab09d3f9..068e271ab3a5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -35,6 +35,7 @@
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/stop_machine.h> 36#include <linux/stop_machine.h>
37#include <linux/device.h> 37#include <linux/device.h>
38#include <linux/string.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include <asm/semaphore.h> 40#include <asm/semaphore.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
@@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
370#endif /* CONFIG_SMP */ 371#endif /* CONFIG_SMP */
371 372
372#ifdef CONFIG_MODULE_UNLOAD 373#ifdef CONFIG_MODULE_UNLOAD
/*
 * MODINFO_ATTR(field) - define a read-only (0444) module attribute
 * named after @field and backed by the string mod->field.
 *
 * Generates four helpers plus the module_attribute that ties them
 * together:
 *   setup_modinfo_<field>   - stores a kstrdup() copy of the string in
 *                             mod->field (NULL if the copy fails)
 *   show_modinfo_<field>    - prints mod->field followed by a newline
 *   modinfo_<field>_exists  - true when mod->field is non-NULL
 *   free_modinfo_<field>    - frees mod->field and resets it to NULL
 */
374#define MODINFO_ATTR(field) \
375static void setup_modinfo_##field(struct module *mod, const char *s) \
376{ \
377 mod->field = kstrdup(s, GFP_KERNEL); \
378} \
379static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
380 struct module *mod, char *buffer) \
381{ \
382 return sprintf(buffer, "%s\n", mod->field); \
383} \
384static int modinfo_##field##_exists(struct module *mod) \
385{ \
386 return mod->field != NULL; \
387} \
388static void free_modinfo_##field(struct module *mod) \
389{ \
390 kfree(mod->field); \
391 mod->field = NULL; \
392} \
393static struct module_attribute modinfo_##field = { \
394 .attr = { .name = __stringify(field), .mode = 0444, \
395 .owner = THIS_MODULE }, \
396 .show = show_modinfo_##field, \
397 .setup = setup_modinfo_##field, \
398 .test = modinfo_##field##_exists, \
399 .free = free_modinfo_##field, \
400};
401
/* Instantiate the attributes for mod->version and mod->srcversion. */
402MODINFO_ATTR(version);
403MODINFO_ATTR(srcversion);
404
/*
 * NULL-terminated table of the modinfo attributes; it is walked by
 * module_add_modinfo_attrs() and module_remove_modinfo_attrs().
 */
405static struct module_attribute *modinfo_attrs[] = {
406 &modinfo_version,
407 &modinfo_srcversion,
408 NULL,
409};
410
373/* Init the unload section of the module. */ 411/* Init the unload section of the module. */
374static void module_unload_init(struct module *mod) 412static void module_unload_init(struct module *mod)
375{ 413{
@@ -379,7 +417,7 @@ static void module_unload_init(struct module *mod)
379 for (i = 0; i < NR_CPUS; i++) 417 for (i = 0; i < NR_CPUS; i++)
380 local_set(&mod->ref[i].count, 0); 418 local_set(&mod->ref[i].count, 0);
381 /* Hold reference count during initialization. */ 419 /* Hold reference count during initialization. */
382 local_set(&mod->ref[_smp_processor_id()].count, 1); 420 local_set(&mod->ref[raw_smp_processor_id()].count, 1);
383 /* Backwards compatibility macros put refcount during init. */ 421 /* Backwards compatibility macros put refcount during init. */
384 mod->waiter = current; 422 mod->waiter = current;
385} 423}
@@ -692,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp)
692 return 0; 730 return 0;
693} 731}
694 732
695int set_obsolete(const char *val, struct kernel_param *kp) 733static int set_obsolete(const char *val, struct kernel_param *kp)
696{ 734{
697 unsigned int min, max; 735 unsigned int min, max;
698 unsigned int size, maxsize; 736 unsigned int size, maxsize;
@@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod)
1031} 1069}
1032#endif 1070#endif
1033 1071
1072#ifdef CONFIG_MODULE_UNLOAD
1073static int module_add_modinfo_attrs(struct module *mod)
1074{
1075 struct module_attribute *attr;
1076 int error = 0;
1077 int i;
1078
1079 for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
1080 if (!attr->test ||
1081 (attr->test && attr->test(mod)))
1082 error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
1083 }
1084 return error;
1085}
1086
1087static void module_remove_modinfo_attrs(struct module *mod)
1088{
1089 struct module_attribute *attr;
1090 int i;
1091
1092 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1093 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
1094 attr->free(mod);
1095 }
1096}
1097#endif
1034 1098
1035static int mod_sysfs_setup(struct module *mod, 1099static int mod_sysfs_setup(struct module *mod,
1036 struct kernel_param *kparam, 1100 struct kernel_param *kparam,
@@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod,
1056 if (err) 1120 if (err)
1057 goto out_unreg; 1121 goto out_unreg;
1058 1122
1123#ifdef CONFIG_MODULE_UNLOAD
1124 err = module_add_modinfo_attrs(mod);
1125 if (err)
1126 goto out_unreg;
1127#endif
1128
1059 return 0; 1129 return 0;
1060 1130
1061out_unreg: 1131out_unreg:
@@ -1066,6 +1136,9 @@ out:
1066 1136
1067static void mod_kobject_remove(struct module *mod) 1137static void mod_kobject_remove(struct module *mod)
1068{ 1138{
1139#ifdef CONFIG_MODULE_UNLOAD
1140 module_remove_modinfo_attrs(mod);
1141#endif
1069 module_remove_refcnt_attr(mod); 1142 module_remove_refcnt_attr(mod);
1070 module_param_sysfs_remove(mod); 1143 module_param_sysfs_remove(mod);
1071 1144
@@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
1311 return NULL; 1384 return NULL;
1312} 1385}
1313 1386
1387#ifdef CONFIG_MODULE_UNLOAD
1388static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1389 unsigned int infoindex)
1390{
1391 struct module_attribute *attr;
1392 int i;
1393
1394 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1395 if (attr->setup)
1396 attr->setup(mod,
1397 get_modinfo(sechdrs,
1398 infoindex,
1399 attr->attr.name));
1400 }
1401}
1402#endif
1403
1314#ifdef CONFIG_KALLSYMS 1404#ifdef CONFIG_KALLSYMS
1315int is_exported(const char *name, const struct module *mod) 1405int is_exported(const char *name, const struct module *mod)
1316{ 1406{
@@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod,
1615 /* Set up license info based on the info section */ 1705 /* Set up license info based on the info section */
1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1706 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1617 1707
1708#ifdef CONFIG_MODULE_UNLOAD
1709 /* Set up MODINFO_ATTR fields */
1710 setup_modinfo(mod, sechdrs, infoindex);
1711#endif
1712
1618 /* Fix up syms, so that st_value is a pointer to location. */ 1713 /* Fix up syms, so that st_value is a pointer to location. */
1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, 1714 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
1620 mod); 1715 mod);
@@ -1758,6 +1853,7 @@ sys_init_module(void __user *umod,
1758 const char __user *uargs) 1853 const char __user *uargs)
1759{ 1854{
1760 struct module *mod; 1855 struct module *mod;
1856 mm_segment_t old_fs = get_fs();
1761 int ret = 0; 1857 int ret = 0;
1762 1858
1763 /* Must have permission */ 1859 /* Must have permission */
@@ -1775,6 +1871,9 @@ sys_init_module(void __user *umod,
1775 return PTR_ERR(mod); 1871 return PTR_ERR(mod);
1776 } 1872 }
1777 1873
1874 /* flush the icache in correct context */
1875 set_fs(KERNEL_DS);
1876
1778 /* Flush the instruction cache, since we've played with text */ 1877 /* Flush the instruction cache, since we've played with text */
1779 if (mod->module_init) 1878 if (mod->module_init)
1780 flush_icache_range((unsigned long)mod->module_init, 1879 flush_icache_range((unsigned long)mod->module_init,
@@ -1783,6 +1882,8 @@ sys_init_module(void __user *umod,
1783 flush_icache_range((unsigned long)mod->module_core, 1882 flush_icache_range((unsigned long)mod->module_core,
1784 (unsigned long)mod->module_core + mod->core_size); 1883 (unsigned long)mod->module_core + mod->core_size);
1785 1884
1885 set_fs(old_fs);
1886
1786 /* Now sew it into the lists. They won't access us, since 1887 /* Now sew it into the lists. They won't access us, since
1787 strong_try_module_get() will fail. */ 1888 strong_try_module_get() will fail. */
1788 stop_machine_run(__link_module, mod, NR_CPUS); 1889 stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..74ba5f3e46c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
18#include <linux/sysrq.h> 18#include <linux/sysrq.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h>
21 22
22int panic_timeout; 23int panic_timeout;
23int panic_on_oops; 24int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
63 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
64#endif 65#endif
65 66
67 /*
68 * It's possible to come here directly from a panic-assertion and not
69 * have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though...
71 */
72 preempt_disable();
73
66 bust_spinlocks(1); 74 bust_spinlocks(1);
67 va_start(args, fmt); 75 va_start(args, fmt);
68 vsnprintf(buf, sizeof(buf), fmt, args); 76 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
70 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 78 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
71 bust_spinlocks(0); 79 bust_spinlocks(0);
72 80
81 /*
82 * If we have crashed and we have a crash kernel loaded let it handle
83 * everything else.
84 * Do we want to call this before we try to display a message?
85 */
86 crash_kexec(NULL);
87
73#ifdef CONFIG_SMP 88#ifdef CONFIG_SMP
89 /*
90 * Note smp_send_stop is the usual smp shutdown function, which
91 * unfortunately means it may not be hardened to work in a panic
92 * situation.
93 */
74 smp_send_stop(); 94 smp_send_stop();
75#endif 95#endif
76 96
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
79 if (!panic_blink) 99 if (!panic_blink)
80 panic_blink = no_blink; 100 panic_blink = no_blink;
81 101
82 if (panic_timeout > 0) 102 if (panic_timeout > 0) {
83 {
84 /* 103 /*
85 * Delay timeout seconds before rebooting the machine. 104 * Delay timeout seconds before rebooting the machine.
86 * We can't use the "normal" timers since we just panicked.. 105 * We can't use the "normal" timers since we just panicked..
diff --git a/kernel/params.c b/kernel/params.c
index 5513844bec13..d586c35ef8fc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
629 mk = to_module_kobject(kobj); 629 mk = to_module_kobject(kobj);
630 630
631 if (!attribute->show) 631 if (!attribute->show)
632 return -EPERM; 632 return -EIO;
633 633
634 if (!try_module_get(mk->mod)) 634 if (!try_module_get(mk->mod))
635 return -ENODEV; 635 return -ENODEV;
@@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
653 mk = to_module_kobject(kobj); 653 mk = to_module_kobject(kobj);
654 654
655 if (!attribute->store) 655 if (!attribute->store)
656 return -EPERM; 656 return -EIO;
657 657
658 if (!try_module_get(mk->mod)) 658 if (!try_module_get(mk->mod))
659 return -ENODEV; 659 return -ENODEV;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fd316c272260..5b7b4736d82b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -89,23 +89,6 @@ static struct idr posix_timers_id;
89static DEFINE_SPINLOCK(idr_lock); 89static DEFINE_SPINLOCK(idr_lock);
90 90
91/* 91/*
92 * Just because the timer is not in the timer list does NOT mean it is
93 * inactive. It could be in the "fire" routine getting a new expire time.
94 */
95#define TIMER_INACTIVE 1
96
97#ifdef CONFIG_SMP
98# define timer_active(tmr) \
99 ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
100# define set_timer_inactive(tmr) \
101 do { \
102 (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
103 } while (0)
104#else
105# define timer_active(tmr) BARFY // error to use outside of SMP
106# define set_timer_inactive(tmr) do { } while (0)
107#endif
108/*
109 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 92 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
110 * SIGEV values. Here we put out an error if this assumption fails. 93 * SIGEV values. Here we put out an error if this assumption fails.
111 */ 94 */
@@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer)
226 init_timer(&new_timer->it.real.timer); 209 init_timer(&new_timer->it.real.timer);
227 new_timer->it.real.timer.data = (unsigned long) new_timer; 210 new_timer->it.real.timer.data = (unsigned long) new_timer;
228 new_timer->it.real.timer.function = posix_timer_fn; 211 new_timer->it.real.timer.function = posix_timer_fn;
229 set_timer_inactive(new_timer);
230 return 0; 212 return 0;
231} 213}
232 214
@@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data)
480 int do_notify = 1; 462 int do_notify = 1;
481 463
482 spin_lock_irqsave(&timr->it_lock, flags); 464 spin_lock_irqsave(&timr->it_lock, flags);
483 set_timer_inactive(timr);
484 if (!list_empty(&timr->it.real.abs_timer_entry)) { 465 if (!list_empty(&timr->it.real.abs_timer_entry)) {
485 spin_lock(&abs_list.lock); 466 spin_lock(&abs_list.lock);
486 do { 467 do {
@@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags,
983 * careful here. If smp we could be in the "fire" routine which will 964 * careful here. If smp we could be in the "fire" routine which will
984 * be spinning as we hold the lock. But this is ONLY an SMP issue. 965 * be spinning as we hold the lock. But this is ONLY an SMP issue.
985 */ 966 */
967 if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
986#ifdef CONFIG_SMP 968#ifdef CONFIG_SMP
987 if (timer_active(timr) && !del_timer(&timr->it.real.timer))
988 /* 969 /*
989 * It can only be active if on an other cpu. Since 970 * It can only be active if on an other cpu. Since
990 * we have cleared the interval stuff above, it should 971 * we have cleared the interval stuff above, it should
@@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags,
994 * a "retry" exit status. 975 * a "retry" exit status.
995 */ 976 */
996 return TIMER_RETRY; 977 return TIMER_RETRY;
997
998 set_timer_inactive(timr);
999#else
1000 del_timer(&timr->it.real.timer);
1001#endif 978#endif
979 }
980
1002 remove_from_abslist(timr); 981 remove_from_abslist(timr);
1003 982
1004 timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 983 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
@@ -1083,8 +1062,9 @@ retry:
1083static inline int common_timer_del(struct k_itimer *timer) 1062static inline int common_timer_del(struct k_itimer *timer)
1084{ 1063{
1085 timer->it.real.incr = 0; 1064 timer->it.real.incr = 0;
1065
1066 if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
1086#ifdef CONFIG_SMP 1067#ifdef CONFIG_SMP
1087 if (timer_active(timer) && !del_timer(&timer->it.real.timer))
1088 /* 1068 /*
1089 * It can only be active if on an other cpu. Since 1069 * It can only be active if on an other cpu. Since
1090 * we have cleared the interval stuff above, it should 1070 * we have cleared the interval stuff above, it should
@@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer)
1094 * a "retry" exit status. 1074 * a "retry" exit status.
1095 */ 1075 */
1096 return TIMER_RETRY; 1076 return TIMER_RETRY;
1097#else
1098 del_timer(&timer->it.real.timer);
1099#endif 1077#endif
1078 }
1079
1100 remove_from_abslist(timer); 1080 remove_from_abslist(timer);
1101 1081
1102 return 0; 1082 return 0;
@@ -1197,6 +1177,7 @@ void exit_itimers(struct signal_struct *sig)
1197 tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); 1177 tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
1198 itimer_delete(tmr); 1178 itimer_delete(tmr);
1199 } 1179 }
1180 del_timer_sync(&sig->real_timer);
1200} 1181}
1201 1182
1202/* 1183/*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..2c7121d9bff1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,8 +27,8 @@ config PM_DEBUG
27 like suspend support. 27 like suspend support.
28 28
29config SOFTWARE_SUSPEND 29config SOFTWARE_SUSPEND
30 bool "Software Suspend (EXPERIMENTAL)" 30 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP 31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
32 ---help--- 32 ---help---
33 Enable the possibility of suspending the machine. 33 Enable the possibility of suspending the machine.
34 It doesn't need APM. 34 It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
72 suspended image to. It will simply pick the first available swap 72 suspended image to. It will simply pick the first available swap
73 device. 73 device.
74 74
75config SUSPEND_SMP
76 bool
77 depends on HOTPLUG_CPU && X86 && PM
78 default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6swsusp-smp-$(CONFIG_SMP) += smp.o
7
8obj-y := main.o process.o console.o pm.o 6obj-y := main.o process.o console.o pm.o
9obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o 7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o
8
9obj-$(CONFIG_SUSPEND_SMP) += smp.o
10 10
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..fb8de63c2919 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,8 +117,8 @@ static void finish(void)
117{ 117{
118 device_resume(); 118 device_resume();
119 platform_finish(); 119 platform_finish();
120 enable_nonboot_cpus();
121 thaw_processes(); 120 thaw_processes();
121 enable_nonboot_cpus();
122 pm_restore_console(); 122 pm_restore_console();
123} 123}
124 124
@@ -131,28 +131,35 @@ static int prepare_processes(void)
131 131
132 sys_sync(); 132 sys_sync();
133 133
134 disable_nonboot_cpus();
135
134 if (freeze_processes()) { 136 if (freeze_processes()) {
135 error = -EBUSY; 137 error = -EBUSY;
136 return error; 138 goto thaw;
137 } 139 }
138 140
139 if (pm_disk_mode == PM_DISK_PLATFORM) { 141 if (pm_disk_mode == PM_DISK_PLATFORM) {
140 if (pm_ops && pm_ops->prepare) { 142 if (pm_ops && pm_ops->prepare) {
141 if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) 143 if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
142 return error; 144 goto thaw;
143 } 145 }
144 } 146 }
145 147
146 /* Free memory before shutting down devices. */ 148 /* Free memory before shutting down devices. */
147 free_some_memory(); 149 free_some_memory();
148
149 return 0; 150 return 0;
151thaw:
152 thaw_processes();
153 enable_nonboot_cpus();
154 pm_restore_console();
155 return error;
150} 156}
151 157
152static void unprepare_processes(void) 158static void unprepare_processes(void)
153{ 159{
154 enable_nonboot_cpus(); 160 platform_finish();
155 thaw_processes(); 161 thaw_processes();
162 enable_nonboot_cpus();
156 pm_restore_console(); 163 pm_restore_console();
157} 164}
158 165
@@ -160,15 +167,9 @@ static int prepare_devices(void)
160{ 167{
161 int error; 168 int error;
162 169
163 disable_nonboot_cpus(); 170 if ((error = device_suspend(PMSG_FREEZE)))
164 if ((error = device_suspend(PMSG_FREEZE))) {
165 printk("Some devices failed to suspend\n"); 171 printk("Some devices failed to suspend\n");
166 platform_finish(); 172 return error;
167 enable_nonboot_cpus();
168 return error;
169 }
170
171 return 0;
172} 173}
173 174
174/** 175/**
@@ -185,9 +186,9 @@ int pm_suspend_disk(void)
185 int error; 186 int error;
186 187
187 error = prepare_processes(); 188 error = prepare_processes();
188 if (!error) { 189 if (error)
189 error = prepare_devices(); 190 return error;
190 } 191 error = prepare_devices();
191 192
192 if (error) { 193 if (error) {
193 unprepare_processes(); 194 unprepare_processes();
@@ -250,7 +251,7 @@ static int software_resume(void)
250 251
251 if ((error = prepare_processes())) { 252 if ((error = prepare_processes())) {
252 swsusp_close(); 253 swsusp_close();
253 goto Cleanup; 254 goto Done;
254 } 255 }
255 256
256 pr_debug("PM: Reading swsusp image.\n"); 257 pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..c94cb9e95090 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state)
55 55
56 pm_prepare_console(); 56 pm_prepare_console();
57 57
58 disable_nonboot_cpus();
59
60 if (num_online_cpus() != 1) {
61 error = -EPERM;
62 goto Enable_cpu;
63 }
64
58 if (freeze_processes()) { 65 if (freeze_processes()) {
59 error = -EAGAIN; 66 error = -EAGAIN;
60 goto Thaw; 67 goto Thaw;
@@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state)
75 pm_ops->finish(state); 82 pm_ops->finish(state);
76 Thaw: 83 Thaw:
77 thaw_processes(); 84 thaw_processes();
85 Enable_cpu:
86 enable_nonboot_cpus();
78 pm_restore_console(); 87 pm_restore_console();
79 return error; 88 return error;
80} 89}
@@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state)
113 if (pm_ops && pm_ops->finish) 122 if (pm_ops && pm_ops->finish)
114 pm_ops->finish(state); 123 pm_ops->finish(state);
115 thaw_processes(); 124 thaw_processes();
125 enable_nonboot_cpus();
116 pm_restore_console(); 126 pm_restore_console();
117} 127}
118 128
@@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state)
150 goto Unlock; 160 goto Unlock;
151 } 161 }
152 162
153 /* Suspend is hard to get right on SMP. */
154 if (num_online_cpus() != 1) {
155 error = -EPERM;
156 goto Unlock;
157 }
158
159 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 163 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
160 if ((error = suspend_prepare(state))) 164 if ((error = suspend_prepare(state)))
161 goto Unlock; 165 goto Unlock;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..0a086640bcfc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
32} 32}
33 33
34/* Refrigerator is place where frozen processes are stored :-). */ 34/* Refrigerator is place where frozen processes are stored :-). */
35void refrigerator(unsigned long flag) 35void refrigerator(void)
36{ 36{
37 /* Hmm, should we be allowed to suspend when there are realtime 37 /* Hmm, should we be allowed to suspend when there are realtime
38 processes around? */ 38 processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
41 current->state = TASK_UNINTERRUPTIBLE; 41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm); 42 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("="); 43 printk("=");
44 current->flags &= ~PF_FREEZE;
45 44
45 frozen_process(current);
46 spin_lock_irq(&current->sighand->siglock); 46 spin_lock_irq(&current->sighand->siglock);
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 47 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 48 spin_unlock_irq(&current->sighand->siglock);
49 49
50 current->flags |= PF_FROZEN; 50 while (frozen(current))
51 while (current->flags & PF_FROZEN)
52 schedule(); 51 schedule();
53 pr_debug("%s left refrigerator\n", current->comm); 52 pr_debug("%s left refrigerator\n", current->comm);
54 current->state = save; 53 current->state = save;
@@ -57,10 +56,10 @@ void refrigerator(unsigned long flag)
57/* 0 = success, else # of processes that we failed to stop */ 56/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 57int freeze_processes(void)
59{ 58{
60 int todo; 59 int todo;
61 unsigned long start_time; 60 unsigned long start_time;
62 struct task_struct *g, *p; 61 struct task_struct *g, *p;
63 62
64 printk( "Stopping tasks: " ); 63 printk( "Stopping tasks: " );
65 start_time = jiffies; 64 start_time = jiffies;
66 do { 65 do {
@@ -70,14 +69,12 @@ int freeze_processes(void)
70 unsigned long flags; 69 unsigned long flags;
71 if (!freezeable(p)) 70 if (!freezeable(p))
72 continue; 71 continue;
73 if ((p->flags & PF_FROZEN) || 72 if ((frozen(p)) ||
74 (p->state == TASK_TRACED) || 73 (p->state == TASK_TRACED) ||
75 (p->state == TASK_STOPPED)) 74 (p->state == TASK_STOPPED))
76 continue; 75 continue;
77 76
78 /* FIXME: smp problem here: we may not access other process' flags 77 freeze(p);
79 without locking */
80 p->flags |= PF_FREEZE;
81 spin_lock_irqsave(&p->sighand->siglock, flags); 78 spin_lock_irqsave(&p->sighand->siglock, flags);
82 signal_wake_up(p, 0); 79 signal_wake_up(p, 0);
83 spin_unlock_irqrestore(&p->sighand->siglock, flags); 80 spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +88,7 @@ int freeze_processes(void)
91 return todo; 88 return todo;
92 } 89 }
93 } while(todo); 90 } while(todo);
94 91
95 printk( "|\n" ); 92 printk( "|\n" );
96 BUG_ON(in_atomic()); 93 BUG_ON(in_atomic());
97 return 0; 94 return 0;
@@ -106,10 +103,7 @@ void thaw_processes(void)
106 do_each_thread(g, p) { 103 do_each_thread(g, p) {
107 if (!freezeable(p)) 104 if (!freezeable(p))
108 continue; 105 continue;
109 if (p->flags & PF_FROZEN) { 106 if (!thaw_process(p))
110 p->flags &= ~PF_FROZEN;
111 wake_up_process(p);
112 } else
113 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 107 printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
114 } while_each_thread(g, p); 108 } while_each_thread(g, p);
115 109
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index cba3584b80fe..bbe23079c62c 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/suspend.h> 14#include <linux/suspend.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/cpu.h>
16#include <asm/atomic.h> 17#include <asm/atomic.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18 19
19static atomic_t cpu_counter, freeze; 20/* This is protected by pm_sem semaphore */
20 21static cpumask_t frozen_cpus;
21
22static void smp_pause(void * data)
23{
24 struct saved_context ctxt;
25 __save_processor_state(&ctxt);
26 printk("Sleeping in:\n");
27 dump_stack();
28 atomic_inc(&cpu_counter);
29 while (atomic_read(&freeze)) {
30 /* FIXME: restore takes place at random piece inside this.
31 This should probably be written in assembly, and
32 preserve general-purpose registers, too
33
34 What about stack? We may need to move to new stack here.
35
36 This should better be ran with interrupts disabled.
37 */
38 cpu_relax();
39 barrier();
40 }
41 atomic_dec(&cpu_counter);
42 __restore_processor_state(&ctxt);
43}
44
45static cpumask_t oldmask;
46 22
47void disable_nonboot_cpus(void) 23void disable_nonboot_cpus(void)
48{ 24{
49 oldmask = current->cpus_allowed; 25 int cpu, error;
50 set_cpus_allowed(current, cpumask_of_cpu(0));
51 printk("Freezing CPUs (at %d)", _smp_processor_id());
52 current->state = TASK_INTERRUPTIBLE;
53 schedule_timeout(HZ);
54 printk("...");
55 BUG_ON(_smp_processor_id() != 0);
56
57 /* FIXME: for this to work, all the CPUs must be running
58 * "idle" thread (or we deadlock). Is that guaranteed? */
59 26
60 atomic_set(&cpu_counter, 0); 27 error = 0;
61 atomic_set(&freeze, 1); 28 cpus_clear(frozen_cpus);
62 smp_call_function(smp_pause, NULL, 0, 0); 29 printk("Freezing cpus ...\n");
63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { 30 for_each_online_cpu(cpu) {
64 cpu_relax(); 31 if (cpu == 0)
65 barrier(); 32 continue;
33 error = cpu_down(cpu);
34 if (!error) {
35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
66 } 40 }
67 printk("ok\n"); 41 BUG_ON(smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
68} 44}
69 45
70void enable_nonboot_cpus(void) 46void enable_nonboot_cpus(void)
71{ 47{
72 printk("Restarting CPUs"); 48 int cpu, error;
73 atomic_set(&freeze, 0);
74 while (atomic_read(&cpu_counter)) {
75 cpu_relax();
76 barrier();
77 }
78 printk("...");
79 set_cpus_allowed(current, oldmask);
80 schedule();
81 printk("ok\n");
82 49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = smp_prepare_cpu(cpu);
53 if (!error)
54 error = cpu_up(cpu);
55 if (!error) {
56 printk("CPU%d is up\n", cpu);
57 continue;
58 }
59 printk("Error taking cpu %d up: %d\n", cpu, error);
60 panic("Not enough cpus");
61 }
62 cpus_clear(frozen_cpus);
83} 63}
84 64
85
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 90b3b68dee3f..c285fc5a2320 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -10,12 +10,12 @@
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
11 * 11 *
12 * I'd like to thank the following people for their work: 12 * I'd like to thank the following people for their work:
13 * 13 *
14 * Pavel Machek <pavel@ucw.cz>: 14 * Pavel Machek <pavel@ucw.cz>:
15 * Modifications, defectiveness pointing, being with me at the very beginning, 15 * Modifications, defectiveness pointing, being with me at the very beginning,
16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. 16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17 * 17 *
18 * Steve Doddi <dirk@loth.demon.co.uk>: 18 * Steve Doddi <dirk@loth.demon.co.uk>:
19 * Support the possibility of hardware state restoring. 19 * Support the possibility of hardware state restoring.
20 * 20 *
21 * Raph <grey.havens@earthling.net>: 21 * Raph <grey.havens@earthling.net>:
@@ -81,14 +81,14 @@ static int nr_copy_pages_check;
81extern char resume_file[]; 81extern char resume_file[];
82 82
83/* Local variables that should not be affected by save */ 83/* Local variables that should not be affected by save */
84unsigned int nr_copy_pages __nosavedata = 0; 84static unsigned int nr_copy_pages __nosavedata = 0;
85 85
86/* Suspend pagedir is allocated before final copy, therefore it 86/* Suspend pagedir is allocated before final copy, therefore it
87 must be freed after resume 87 must be freed after resume
88 88
89 Warning: this is evil. There are actually two pagedirs at time of 89 Warning: this is evil. There are actually two pagedirs at time of
90 resume. One is "pagedir_save", which is empty frame allocated at 90 resume. One is "pagedir_save", which is empty frame allocated at
91 time of suspend, that must be freed. Second is "pagedir_nosave", 91 time of suspend, that must be freed. Second is "pagedir_nosave",
92 allocated at time of resume, that travels through memory not to 92 allocated at time of resume, that travels through memory not to
93 collide with anything. 93 collide with anything.
94 94
@@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev)
132{ 132{
133 int error; 133 int error;
134 134
135 rw_swap_page_sync(READ, 135 rw_swap_page_sync(READ,
136 swp_entry(root_swap, 0), 136 swp_entry(root_swap, 0),
137 virt_to_page((unsigned long)&swsusp_header)); 137 virt_to_page((unsigned long)&swsusp_header));
138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
@@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
142 swsusp_header.swsusp_info = prev; 142 swsusp_header.swsusp_info = prev;
143 error = rw_swap_page_sync(WRITE, 143 error = rw_swap_page_sync(WRITE,
144 swp_entry(root_swap, 0), 144 swp_entry(root_swap, 0),
145 virt_to_page((unsigned long) 145 virt_to_page((unsigned long)
146 &swsusp_header)); 146 &swsusp_header));
@@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
174static int swsusp_swap_check(void) /* This is called before saving image */ 174static int swsusp_swap_check(void) /* This is called before saving image */
175{ 175{
176 int i, len; 176 int i, len;
177 177
178 len=strlen(resume_file); 178 len=strlen(resume_file);
179 root_swap = 0xFFFF; 179 root_swap = 0xFFFF;
180 180
181 swap_list_lock(); 181 swap_list_lock();
182 for(i=0; i<MAX_SWAPFILES; i++) { 182 for (i=0; i<MAX_SWAPFILES; i++) {
183 if (swap_info[i].flags == 0) { 183 if (swap_info[i].flags == 0) {
184 swapfile_used[i]=SWAPFILE_UNUSED; 184 swapfile_used[i]=SWAPFILE_UNUSED;
185 } else { 185 } else {
186 if(!len) { 186 if (!len) {
187 printk(KERN_WARNING "resume= option should be used to set suspend device" ); 187 printk(KERN_WARNING "resume= option should be used to set suspend device" );
188 if(root_swap == 0xFFFF) { 188 if (root_swap == 0xFFFF) {
189 swapfile_used[i] = SWAPFILE_SUSPEND; 189 swapfile_used[i] = SWAPFILE_SUSPEND;
190 root_swap = i; 190 root_swap = i;
191 } else 191 } else
192 swapfile_used[i] = SWAPFILE_IGNORED; 192 swapfile_used[i] = SWAPFILE_IGNORED;
193 } else { 193 } else {
194 /* we ignore all swap devices that are not the resume_file */ 194 /* we ignore all swap devices that are not the resume_file */
195 if (is_resume_device(&swap_info[i])) { 195 if (is_resume_device(&swap_info[i])) {
@@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
209 * This is called after saving image so modification 209 * This is called after saving image so modification
210 * will be lost after resume... and that's what we want. 210 * will be lost after resume... and that's what we want.
211 * we make the device unusable. A new call to 211 * we make the device unusable. A new call to
212 * lock_swapdevices can unlock the devices. 212 * lock_swapdevices can unlock the devices.
213 */ 213 */
214static void lock_swapdevices(void) 214static void lock_swapdevices(void)
215{ 215{
216 int i; 216 int i;
217 217
218 swap_list_lock(); 218 swap_list_lock();
219 for(i = 0; i< MAX_SWAPFILES; i++) 219 for (i = 0; i< MAX_SWAPFILES; i++)
220 if(swapfile_used[i] == SWAPFILE_IGNORED) { 220 if (swapfile_used[i] == SWAPFILE_IGNORED) {
221 swap_info[i].flags ^= 0xFF; 221 swap_info[i].flags ^= 0xFF;
222 } 222 }
223 swap_list_unlock(); 223 swap_list_unlock();
@@ -229,7 +229,7 @@ static void lock_swapdevices(void)
229 * @loc: Place to store the entry we used. 229 * @loc: Place to store the entry we used.
230 * 230 *
231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO 231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
232 * errors. That is an artifact left over from swsusp. It did not 232 * errors. That is an artifact left over from swsusp. It did not
233 * check the return of rw_swap_page_sync() at all, since most pages 233 * check the return of rw_swap_page_sync() at all, since most pages
234 * written back to swap would return -EIO. 234 * written back to swap would return -EIO.
235 * This is a partial improvement, since we will at least return other 235 * This is a partial improvement, since we will at least return other
@@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
241 int error = 0; 241 int error = 0;
242 242
243 entry = get_swap_page(); 243 entry = get_swap_page();
244 if (swp_offset(entry) && 244 if (swp_offset(entry) &&
245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { 245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
246 error = rw_swap_page_sync(WRITE, entry, 246 error = rw_swap_page_sync(WRITE, entry,
247 virt_to_page(addr)); 247 virt_to_page(addr));
@@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
257/** 257/**
258 * data_free - Free the swap entries used by the saved image. 258 * data_free - Free the swap entries used by the saved image.
259 * 259 *
260 * Walk the list of used swap entries and free each one. 260 * Walk the list of used swap entries and free each one.
261 * This is only used for cleanup when suspend fails. 261 * This is only used for cleanup when suspend fails.
262 */ 262 */
263static void data_free(void) 263static void data_free(void)
@@ -290,7 +290,7 @@ static int data_write(void)
290 mod = 1; 290 mod = 1;
291 291
292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); 292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
293 for_each_pbe(p, pagedir_nosave) { 293 for_each_pbe (p, pagedir_nosave) {
294 if (!(i%mod)) 294 if (!(i%mod))
295 printk( "\b\b\b\b%3d%%", i / mod ); 295 printk( "\b\b\b\b%3d%%", i / mod );
296 if ((error = write_page(p->address, &(p->swap_address)))) 296 if ((error = write_page(p->address, &(p->swap_address))))
@@ -335,7 +335,7 @@ static int close_swap(void)
335 335
336 dump_info(); 336 dump_info();
337 error = write_page((unsigned long)&swsusp_info, &entry); 337 error = write_page((unsigned long)&swsusp_info, &entry);
338 if (!error) { 338 if (!error) {
339 printk( "S" ); 339 printk( "S" );
340 error = mark_swapfiles(entry); 340 error = mark_swapfiles(entry);
341 printk( "|\n" ); 341 printk( "|\n" );
@@ -370,7 +370,7 @@ static int write_pagedir(void)
370 struct pbe * pbe; 370 struct pbe * pbe;
371 371
372 printk( "Writing pagedir..."); 372 printk( "Writing pagedir...");
373 for_each_pb_page(pbe, pagedir_nosave) { 373 for_each_pb_page (pbe, pagedir_nosave) {
374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) 374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
375 return error; 375 return error;
376 } 376 }
@@ -472,7 +472,7 @@ static int save_highmem(void)
472 int res = 0; 472 int res = 0;
473 473
474 pr_debug("swsusp: Saving Highmem\n"); 474 pr_debug("swsusp: Saving Highmem\n");
475 for_each_zone(zone) { 475 for_each_zone (zone) {
476 if (is_highmem(zone)) 476 if (is_highmem(zone))
477 res = save_highmem_zone(zone); 477 res = save_highmem_zone(zone);
478 if (res) 478 if (res)
@@ -547,7 +547,7 @@ static void count_data_pages(void)
547 547
548 nr_copy_pages = 0; 548 nr_copy_pages = 0;
549 549
550 for_each_zone(zone) { 550 for_each_zone (zone) {
551 if (is_highmem(zone)) 551 if (is_highmem(zone))
552 continue; 552 continue;
553 mark_free_pages(zone); 553 mark_free_pages(zone);
@@ -562,9 +562,9 @@ static void copy_data_pages(void)
562 struct zone *zone; 562 struct zone *zone;
563 unsigned long zone_pfn; 563 unsigned long zone_pfn;
564 struct pbe * pbe = pagedir_nosave; 564 struct pbe * pbe = pagedir_nosave;
565 565
566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); 566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
567 for_each_zone(zone) { 567 for_each_zone (zone) {
568 if (is_highmem(zone)) 568 if (is_highmem(zone))
569 continue; 569 continue;
570 mark_free_pages(zone); 570 mark_free_pages(zone);
@@ -702,7 +702,7 @@ static void free_image_pages(void)
702{ 702{
703 struct pbe * p; 703 struct pbe * p;
704 704
705 for_each_pbe(p, pagedir_save) { 705 for_each_pbe (p, pagedir_save) {
706 if (p->address) { 706 if (p->address) {
707 ClearPageNosave(virt_to_page(p->address)); 707 ClearPageNosave(virt_to_page(p->address));
708 free_page(p->address); 708 free_page(p->address);
@@ -719,7 +719,7 @@ static int alloc_image_pages(void)
719{ 719{
720 struct pbe * p; 720 struct pbe * p;
721 721
722 for_each_pbe(p, pagedir_save) { 722 for_each_pbe (p, pagedir_save) {
723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); 723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
724 if (!p->address) 724 if (!p->address)
725 return -ENOMEM; 725 return -ENOMEM;
@@ -740,7 +740,7 @@ void swsusp_free(void)
740/** 740/**
741 * enough_free_mem - Make sure we have enough free memory to snapshot. 741 * enough_free_mem - Make sure we have enough free memory to snapshot.
742 * 742 *
743 * Returns TRUE or FALSE after checking the number of available 743 * Returns TRUE or FALSE after checking the number of available
744 * free pages. 744 * free pages.
745 */ 745 */
746 746
@@ -758,11 +758,11 @@ static int enough_free_mem(void)
758/** 758/**
759 * enough_swap - Make sure we have enough swap to save the image. 759 * enough_swap - Make sure we have enough swap to save the image.
760 * 760 *
761 * Returns TRUE or FALSE after checking the total amount of swap 761 * Returns TRUE or FALSE after checking the total amount of swap
762 * space available. 762 * space available.
763 * 763 *
764 * FIXME: si_swapinfo(&i) returns all swap devices information. 764 * FIXME: si_swapinfo(&i) returns all swap devices information.
765 * We should only consider resume_device. 765 * We should only consider resume_device.
766 */ 766 */
767 767
768static int enough_swap(void) 768static int enough_swap(void)
@@ -781,18 +781,18 @@ static int swsusp_alloc(void)
781{ 781{
782 int error; 782 int error;
783 783
784 pagedir_nosave = NULL;
785 nr_copy_pages = calc_nr(nr_copy_pages);
786
784 pr_debug("suspend: (pages needed: %d + %d free: %d)\n", 787 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
785 nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); 788 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
786 789
787 pagedir_nosave = NULL;
788 if (!enough_free_mem()) 790 if (!enough_free_mem())
789 return -ENOMEM; 791 return -ENOMEM;
790 792
791 if (!enough_swap()) 793 if (!enough_swap())
792 return -ENOSPC; 794 return -ENOSPC;
793 795
794 nr_copy_pages = calc_nr(nr_copy_pages);
795
796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { 796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
798 return -ENOMEM; 798 return -ENOMEM;
@@ -827,8 +827,8 @@ static int suspend_prepare_image(void)
827 error = swsusp_alloc(); 827 error = swsusp_alloc();
828 if (error) 828 if (error)
829 return error; 829 return error;
830 830
831 /* During allocating of suspend pagedir, new cold pages may appear. 831 /* During allocating of suspend pagedir, new cold pages may appear.
832 * Kill them. 832 * Kill them.
833 */ 833 */
834 drain_local_pages(); 834 drain_local_pages();
@@ -929,21 +929,6 @@ int swsusp_resume(void)
929 return error; 929 return error;
930} 930}
931 931
932/* More restore stuff */
933
934/*
935 * Returns true if given address/order collides with any orig_address
936 */
937static int does_collide_order(unsigned long addr, int order)
938{
939 int i;
940
941 for (i=0; i < (1<<order); i++)
942 if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
943 return 1;
944 return 0;
945}
946
947/** 932/**
948 * On resume, for storing the PBE list and the image, 933 * On resume, for storing the PBE list and the image,
949 * we can only use memory pages that do not conflict with the pages 934 * we can only use memory pages that do not conflict with the pages
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
973 unsigned long m; 958 unsigned long m;
974 959
975 m = get_zeroed_page(gfp_mask); 960 m = get_zeroed_page(gfp_mask);
976 while (does_collide_order(m, 0)) { 961 while (!PageNosaveFree(virt_to_page(m))) {
977 eat_page((void *)m); 962 eat_page((void *)m);
978 m = get_zeroed_page(gfp_mask); 963 m = get_zeroed_page(gfp_mask);
979 if (!m) 964 if (!m)
@@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1045 1030
1046 /* Set page flags */ 1031 /* Set page flags */
1047 1032
1048 for_each_zone(zone) { 1033 for_each_zone (zone) {
1049 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1034 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
1050 SetPageNosaveFree(pfn_to_page(zone_pfn + 1035 SetPageNosaveFree(pfn_to_page(zone_pfn +
1051 zone->zone_start_pfn)); 1036 zone->zone_start_pfn));
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1061 /* Relocate colliding pages */ 1046 /* Relocate colliding pages */
1062 1047
1063 for_each_pb_page (pbpage, pblist) { 1048 for_each_pb_page (pbpage, pblist) {
1064 if (does_collide_order((unsigned long)pbpage, 0)) { 1049 if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
1065 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); 1050 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
1066 if (!m) { 1051 if (!m) {
1067 error = -ENOMEM; 1052 error = -ENOMEM;
@@ -1193,8 +1178,10 @@ static const char * sanity_check(void)
1193 return "version"; 1178 return "version";
1194 if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) 1179 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1195 return "machine"; 1180 return "machine";
1181#if 0
1196 if(swsusp_info.cpus != num_online_cpus()) 1182 if(swsusp_info.cpus != num_online_cpus())
1197 return "number of cpus"; 1183 return "number of cpus";
1184#endif
1198 return NULL; 1185 return NULL;
1199} 1186}
1200 1187
diff --git a/kernel/printk.c b/kernel/printk.c
index 01b58d7d17ff..5092397fac29 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
588 log_level_unknown = 1; 588 log_level_unknown = 1;
589 } 589 }
590 590
591 if (!cpu_online(smp_processor_id()) && 591 if (!cpu_online(smp_processor_id())) {
592 system_state != SYSTEM_RUNNING) {
593 /* 592 /*
594 * Some console drivers may assume that per-cpu resources have 593 * Some console drivers may assume that per-cpu resources have
595 * been allocated. So don't allow them to be called by this 594 * been allocated. So don't allow them to be called by this
@@ -876,8 +875,10 @@ void register_console(struct console * console)
876 break; 875 break;
877 console->flags |= CON_ENABLED; 876 console->flags |= CON_ENABLED;
878 console->index = console_cmdline[i].index; 877 console->index = console_cmdline[i].index;
879 if (i == preferred_console) 878 if (i == selected_console) {
880 console->flags |= CON_CONSDEV; 879 console->flags |= CON_CONSDEV;
880 preferred_console = selected_console;
881 }
881 break; 882 break;
882 } 883 }
883 884
@@ -897,6 +898,8 @@ void register_console(struct console * console)
897 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 898 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
898 console->next = console_drivers; 899 console->next = console_drivers;
899 console_drivers = console; 900 console_drivers = console;
901 if (console->next)
902 console->next->flags &= ~CON_CONSDEV;
900 } else { 903 } else {
901 console->next = console_drivers->next; 904 console->next = console_drivers->next;
902 console_drivers->next = console; 905 console_drivers->next = console;
@@ -937,10 +940,14 @@ int unregister_console(struct console * console)
937 /* If last console is removed, we re-enable picking the first 940 /* If last console is removed, we re-enable picking the first
938 * one that gets registered. Without that, pmac early boot console 941 * one that gets registered. Without that, pmac early boot console
939 * would prevent fbcon from taking over. 942 * would prevent fbcon from taking over.
943 *
944 * If this isn't the last console and it has CON_CONSDEV set, we
945 * need to set it on the next preferred console.
940 */ 946 */
941 if (console_drivers == NULL) 947 if (console_drivers == NULL)
942 preferred_console = selected_console; 948 preferred_console = selected_console;
943 949 else if (console->flags & CON_CONSDEV)
950 console_drivers->flags |= CON_CONSDEV;
944 951
945 release_console_sem(); 952 release_console_sem();
946 return res; 953 return res;
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
263 new->start = min; 263 new->start = min;
264 if (new->end > max) 264 if (new->end > max)
265 new->end = max; 265 new->end = max;
266 new->start = (new->start + align - 1) & ~(align - 1); 266 new->start = ALIGN(new->start, align);
267 if (alignf) 267 if (alignf)
268 alignf(alignf_data, new, size, align); 268 alignf(alignf_data, new, size, align);
269 if (new->start < new->end && new->end - new->start >= size - 1) { 269 if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 66b2ed784822..a07cff90d849 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
166#define SCALE_PRIO(x, prio) \ 166#define SCALE_PRIO(x, prio) \
167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168 168
169static inline unsigned int task_timeslice(task_t *p) 169static unsigned int task_timeslice(task_t *p)
170{ 170{
171 if (p->static_prio < NICE_TO_PRIO(0)) 171 if (p->static_prio < NICE_TO_PRIO(0))
172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
209 unsigned long cpu_load; 209 unsigned long cpu_load[3];
210#endif 210#endif
211 unsigned long long nr_switches; 211 unsigned long long nr_switches;
212 212
@@ -260,23 +260,87 @@ struct runqueue {
260 260
261static DEFINE_PER_CPU(struct runqueue, runqueues); 261static DEFINE_PER_CPU(struct runqueue, runqueues);
262 262
263/*
264 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
265 * See detach_destroy_domains: synchronize_sched for details.
266 *
267 * The domain tree of any CPU may only be accessed from within
268 * preempt-disabled sections.
269 */
263#define for_each_domain(cpu, domain) \ 270#define for_each_domain(cpu, domain) \
264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 271for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
265 272
266#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 273#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
267#define this_rq() (&__get_cpu_var(runqueues)) 274#define this_rq() (&__get_cpu_var(runqueues))
268#define task_rq(p) cpu_rq(task_cpu(p)) 275#define task_rq(p) cpu_rq(task_cpu(p))
269#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 276#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
270 277
271/*
272 * Default context-switch locking:
273 */
274#ifndef prepare_arch_switch 278#ifndef prepare_arch_switch
275# define prepare_arch_switch(rq, next) do { } while (0) 279# define prepare_arch_switch(next) do { } while (0)
276# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 280#endif
277# define task_running(rq, p) ((rq)->curr == (p)) 281#ifndef finish_arch_switch
282# define finish_arch_switch(prev) do { } while (0)
278#endif 283#endif
279 284
285#ifndef __ARCH_WANT_UNLOCKED_CTXSW
286static inline int task_running(runqueue_t *rq, task_t *p)
287{
288 return rq->curr == p;
289}
290
291static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
292{
293}
294
295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
296{
297 spin_unlock_irq(&rq->lock);
298}
299
300#else /* __ARCH_WANT_UNLOCKED_CTXSW */
301static inline int task_running(runqueue_t *rq, task_t *p)
302{
303#ifdef CONFIG_SMP
304 return p->oncpu;
305#else
306 return rq->curr == p;
307#endif
308}
309
310static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
311{
312#ifdef CONFIG_SMP
313 /*
314 * We can optimise this out completely for !SMP, because the
315 * SMP rebalancing from interrupt is the only thing that cares
316 * here.
317 */
318 next->oncpu = 1;
319#endif
320#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
321 spin_unlock_irq(&rq->lock);
322#else
323 spin_unlock(&rq->lock);
324#endif
325}
326
327static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
328{
329#ifdef CONFIG_SMP
330 /*
331 * After ->oncpu is cleared, the task can be moved to a different CPU.
332 * We must ensure this doesn't happen until the switch is completely
333 * finished.
334 */
335 smp_wmb();
336 prev->oncpu = 0;
337#endif
338#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
339 local_irq_enable();
340#endif
341}
342#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
343
280/* 344/*
281 * task_rq_lock - lock the runqueue a given task resides on and disable 345 * task_rq_lock - lock the runqueue a given task resides on and disable
282 * interrupts. Note the ordering: we can safely lookup the task_rq without 346 * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
309 * bump this up when changing the output format or the meaning of an existing 373 * bump this up when changing the output format or the meaning of an existing
310 * format, so that tools can adapt (or abort) 374 * format, so that tools can adapt (or abort)
311 */ 375 */
312#define SCHEDSTAT_VERSION 11 376#define SCHEDSTAT_VERSION 12
313 377
314static int show_schedstat(struct seq_file *seq, void *v) 378static int show_schedstat(struct seq_file *seq, void *v)
315{ 379{
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
338 402
339#ifdef CONFIG_SMP 403#ifdef CONFIG_SMP
340 /* domain-specific stats */ 404 /* domain-specific stats */
405 preempt_disable();
341 for_each_domain(cpu, sd) { 406 for_each_domain(cpu, sd) {
342 enum idle_type itype; 407 enum idle_type itype;
343 char mask_str[NR_CPUS]; 408 char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
356 sd->lb_nobusyq[itype], 421 sd->lb_nobusyq[itype],
357 sd->lb_nobusyg[itype]); 422 sd->lb_nobusyg[itype]);
358 } 423 }
359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", 424 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 425 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361 sd->sbe_pushed, sd->sbe_attempts, 426 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
427 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 428 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363 } 429 }
430 preempt_enable();
364#endif 431#endif
365 } 432 }
366 return 0; 433 return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
414 return rq; 481 return rq;
415} 482}
416 483
417#ifdef CONFIG_SCHED_SMT
418static int cpu_and_siblings_are_idle(int cpu)
419{
420 int sib;
421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
422 if (idle_cpu(sib))
423 continue;
424 return 0;
425 }
426
427 return 1;
428}
429#else
430#define cpu_and_siblings_are_idle(A) idle_cpu(A)
431#endif
432
433#ifdef CONFIG_SCHEDSTATS 484#ifdef CONFIG_SCHEDSTATS
434/* 485/*
435 * Called when a process is dequeued from the active array and given 486 * Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
622 rq->nr_running++; 673 rq->nr_running++;
623} 674}
624 675
625static void recalc_task_prio(task_t *p, unsigned long long now) 676static int recalc_task_prio(task_t *p, unsigned long long now)
626{ 677{
627 /* Caller must always ensure 'now >= p->timestamp' */ 678 /* Caller must always ensure 'now >= p->timestamp' */
628 unsigned long long __sleep_time = now - p->timestamp; 679 unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
681 } 732 }
682 } 733 }
683 734
684 p->prio = effective_prio(p); 735 return effective_prio(p);
685} 736}
686 737
687/* 738/*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
704 } 755 }
705#endif 756#endif
706 757
707 recalc_task_prio(p, now); 758 p->prio = recalc_task_prio(p, now);
708 759
709 /* 760 /*
710 * This checks to make sure it's not an uninterruptible task 761 * This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
782} 833}
783 834
784#ifdef CONFIG_SMP 835#ifdef CONFIG_SMP
785enum request_type {
786 REQ_MOVE_TASK,
787 REQ_SET_DOMAIN,
788};
789
790typedef struct { 836typedef struct {
791 struct list_head list; 837 struct list_head list;
792 enum request_type type;
793 838
794 /* For REQ_MOVE_TASK */
795 task_t *task; 839 task_t *task;
796 int dest_cpu; 840 int dest_cpu;
797 841
798 /* For REQ_SET_DOMAIN */
799 struct sched_domain *sd;
800
801 struct completion done; 842 struct completion done;
802} migration_req_t; 843} migration_req_t;
803 844
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
819 } 860 }
820 861
821 init_completion(&req->done); 862 init_completion(&req->done);
822 req->type = REQ_MOVE_TASK;
823 req->task = p; 863 req->task = p;
824 req->dest_cpu = dest_cpu; 864 req->dest_cpu = dest_cpu;
825 list_add(&req->list, &rq->migration_queue); 865 list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
886 * We want to under-estimate the load of migration sources, to 926 * We want to under-estimate the load of migration sources, to
887 * balance conservatively. 927 * balance conservatively.
888 */ 928 */
889static inline unsigned long source_load(int cpu) 929static inline unsigned long source_load(int cpu, int type)
890{ 930{
891 runqueue_t *rq = cpu_rq(cpu); 931 runqueue_t *rq = cpu_rq(cpu);
892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 932 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
933 if (type == 0)
934 return load_now;
893 935
894 return min(rq->cpu_load, load_now); 936 return min(rq->cpu_load[type-1], load_now);
895} 937}
896 938
897/* 939/*
898 * Return a high guess at the load of a migration-target cpu 940 * Return a high guess at the load of a migration-target cpu
899 */ 941 */
900static inline unsigned long target_load(int cpu) 942static inline unsigned long target_load(int cpu, int type)
901{ 943{
902 runqueue_t *rq = cpu_rq(cpu); 944 runqueue_t *rq = cpu_rq(cpu);
903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 945 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
946 if (type == 0)
947 return load_now;
904 948
905 return max(rq->cpu_load, load_now); 949 return max(rq->cpu_load[type-1], load_now);
906} 950}
907 951
908#endif 952/*
953 * find_idlest_group finds and returns the least busy CPU group within the
954 * domain.
955 */
956static struct sched_group *
957find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
958{
959 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
960 unsigned long min_load = ULONG_MAX, this_load = 0;
961 int load_idx = sd->forkexec_idx;
962 int imbalance = 100 + (sd->imbalance_pct-100)/2;
963
964 do {
965 unsigned long load, avg_load;
966 int local_group;
967 int i;
968
969 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971
972 /* Tally up the load of all CPUs in the group */
973 avg_load = 0;
974
975 for_each_cpu_mask(i, group->cpumask) {
976 /* Bias balancing toward cpus of our domain */
977 if (local_group)
978 load = source_load(i, load_idx);
979 else
980 load = target_load(i, load_idx);
981
982 avg_load += load;
983 }
984
985 /* Adjust by relative CPU power of the group */
986 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
987
988 if (local_group) {
989 this_load = avg_load;
990 this = group;
991 } else if (avg_load < min_load) {
992 min_load = avg_load;
993 idlest = group;
994 }
995 group = group->next;
996 } while (group != sd->groups);
997
998 if (!idlest || 100*this_load < imbalance*min_load)
999 return NULL;
1000 return idlest;
1001}
1002
1003/*
1004 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1005 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu)
1007{
1008 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1;
1010 int i;
1011
1012 for_each_cpu_mask(i, group->cpumask) {
1013 load = source_load(i, 0);
1014
1015 if (load < min_load || (load == min_load && i == this_cpu)) {
1016 min_load = load;
1017 idlest = i;
1018 }
1019 }
1020
1021 return idlest;
1022}
1023
1024/*
1025 * sched_balance_self: balance the current task (running on cpu) in domains
1026 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1027 * SD_BALANCE_EXEC.
1028 *
1029 * Balance, ie. select the least loaded group.
1030 *
1031 * Returns the target CPU number, or the same CPU if no balancing is needed.
1032 *
1033 * preempt must be disabled.
1034 */
1035static int sched_balance_self(int cpu, int flag)
1036{
1037 struct task_struct *t = current;
1038 struct sched_domain *tmp, *sd = NULL;
1039
1040 for_each_domain(cpu, tmp)
1041 if (tmp->flags & flag)
1042 sd = tmp;
1043
1044 while (sd) {
1045 cpumask_t span;
1046 struct sched_group *group;
1047 int new_cpu;
1048 int weight;
1049
1050 span = sd->span;
1051 group = find_idlest_group(sd, t, cpu);
1052 if (!group)
1053 goto nextlevel;
1054
1055 new_cpu = find_idlest_cpu(group, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel;
1058
1059 /* Now try balancing at a lower domain level */
1060 cpu = new_cpu;
1061nextlevel:
1062 sd = NULL;
1063 weight = cpus_weight(span);
1064 for_each_domain(cpu, tmp) {
1065 if (weight <= cpus_weight(tmp->span))
1066 break;
1067 if (tmp->flags & flag)
1068 sd = tmp;
1069 }
1070 /* while loop will break here if sd == NULL */
1071 }
1072
1073 return cpu;
1074}
1075
1076#endif /* CONFIG_SMP */
909 1077
910/* 1078/*
911 * wake_idle() will wake a task on an idle cpu if task->cpu is 1079 * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
927 1095
928 for_each_domain(cpu, sd) { 1096 for_each_domain(cpu, sd) {
929 if (sd->flags & SD_WAKE_IDLE) { 1097 if (sd->flags & SD_WAKE_IDLE) {
930 cpus_and(tmp, sd->span, cpu_online_map); 1098 cpus_and(tmp, sd->span, p->cpus_allowed);
931 cpus_and(tmp, tmp, p->cpus_allowed);
932 for_each_cpu_mask(i, tmp) { 1099 for_each_cpu_mask(i, tmp) {
933 if (idle_cpu(i)) 1100 if (idle_cpu(i))
934 return i; 1101 return i;
935 } 1102 }
936 } 1103 }
937 else break; 1104 else
1105 break;
938 } 1106 }
939 return cpu; 1107 return cpu;
940} 1108}
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
967 runqueue_t *rq; 1135 runqueue_t *rq;
968#ifdef CONFIG_SMP 1136#ifdef CONFIG_SMP
969 unsigned long load, this_load; 1137 unsigned long load, this_load;
970 struct sched_domain *sd; 1138 struct sched_domain *sd, *this_sd = NULL;
971 int new_cpu; 1139 int new_cpu;
972#endif 1140#endif
973 1141
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
986 if (unlikely(task_running(rq, p))) 1154 if (unlikely(task_running(rq, p)))
987 goto out_activate; 1155 goto out_activate;
988 1156
989#ifdef CONFIG_SCHEDSTATS 1157 new_cpu = cpu;
1158
990 schedstat_inc(rq, ttwu_cnt); 1159 schedstat_inc(rq, ttwu_cnt);
991 if (cpu == this_cpu) { 1160 if (cpu == this_cpu) {
992 schedstat_inc(rq, ttwu_local); 1161 schedstat_inc(rq, ttwu_local);
993 } else { 1162 goto out_set_cpu;
994 for_each_domain(this_cpu, sd) { 1163 }
995 if (cpu_isset(cpu, sd->span)) { 1164
996 schedstat_inc(sd, ttwu_wake_remote); 1165 for_each_domain(this_cpu, sd) {
997 break; 1166 if (cpu_isset(cpu, sd->span)) {
998 } 1167 schedstat_inc(sd, ttwu_wake_remote);
1168 this_sd = sd;
1169 break;
999 } 1170 }
1000 } 1171 }
1001#endif
1002 1172
1003 new_cpu = cpu; 1173 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu; 1174 goto out_set_cpu;
1006 1175
1007 load = source_load(cpu);
1008 this_load = target_load(this_cpu);
1009
1010 /* 1176 /*
1011 * If sync wakeup then subtract the (maximum possible) effect of 1177 * Check for affine wakeup and passive balancing possibilities.
1012 * the currently running task from the load of the current CPU:
1013 */ 1178 */
1014 if (sync) 1179 if (this_sd) {
1015 this_load -= SCHED_LOAD_SCALE; 1180 int idx = this_sd->wake_idx;
1181 unsigned int imbalance;
1016 1182
1017 /* Don't pull the task off an idle CPU to a busy one */ 1183 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1019 goto out_set_cpu;
1020 1184
1021 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1185 load = source_load(cpu, idx);
1186 this_load = target_load(this_cpu, idx);
1022 1187
1023 /* 1188 new_cpu = this_cpu; /* Wake to this CPU if we can */
1024 * Scan domains for affine wakeup and passive balancing
1025 * possibilities.
1026 */
1027 for_each_domain(this_cpu, sd) {
1028 unsigned int imbalance;
1029 /*
1030 * Start passive balancing when half the imbalance_pct
1031 * limit is reached.
1032 */
1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1034 1189
1035 if ((sd->flags & SD_WAKE_AFFINE) && 1190 if (this_sd->flags & SD_WAKE_AFFINE) {
1036 !task_hot(p, rq->timestamp_last_tick, sd)) { 1191 unsigned long tl = this_load;
1037 /* 1192 /*
1038 * This domain has SD_WAKE_AFFINE and p is cache cold 1193 * If sync wakeup then subtract the (maximum possible)
1039 * in this domain. 1194 * effect of the currently running task from the load
1195 * of the current CPU:
1040 */ 1196 */
1041 if (cpu_isset(cpu, sd->span)) { 1197 if (sync)
1042 schedstat_inc(sd, ttwu_move_affine); 1198 tl -= SCHED_LOAD_SCALE;
1199
1200 if ((tl <= load &&
1201 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1202 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1203 /*
1204 * This domain has SD_WAKE_AFFINE and
1205 * p is cache cold in this domain, and
1206 * there is no bad imbalance.
1207 */
1208 schedstat_inc(this_sd, ttwu_move_affine);
1043 goto out_set_cpu; 1209 goto out_set_cpu;
1044 } 1210 }
1045 } else if ((sd->flags & SD_WAKE_BALANCE) && 1211 }
1046 imbalance*this_load <= 100*load) { 1212
1047 /* 1213 /*
1048 * This domain has SD_WAKE_BALANCE and there is 1214 * Start passive balancing when half the imbalance_pct
1049 * an imbalance. 1215 * limit is reached.
1050 */ 1216 */
1051 if (cpu_isset(cpu, sd->span)) { 1217 if (this_sd->flags & SD_WAKE_BALANCE) {
1052 schedstat_inc(sd, ttwu_move_balance); 1218 if (imbalance*this_load <= 100*load) {
1219 schedstat_inc(this_sd, ttwu_move_balance);
1053 goto out_set_cpu; 1220 goto out_set_cpu;
1054 } 1221 }
1055 } 1222 }
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1120 return try_to_wake_up(p, state, 0); 1287 return try_to_wake_up(p, state, 0);
1121} 1288}
1122 1289
1123#ifdef CONFIG_SMP
1124static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1125 struct sched_domain *sd);
1126#endif
1127
1128/* 1290/*
1129 * Perform scheduler related setup for a newly forked process p. 1291 * Perform scheduler related setup for a newly forked process p.
1130 * p is forked by current. 1292 * p is forked by current.
1131 */ 1293 */
1132void fastcall sched_fork(task_t *p) 1294void fastcall sched_fork(task_t *p, int clone_flags)
1133{ 1295{
1296 int cpu = get_cpu();
1297
1298#ifdef CONFIG_SMP
1299 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1300#endif
1301 set_task_cpu(p, cpu);
1302
1134 /* 1303 /*
1135 * We mark the process as running here, but have not actually 1304 * We mark the process as running here, but have not actually
1136 * inserted it onto the runqueue yet. This guarantees that 1305 * inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
1140 p->state = TASK_RUNNING; 1309 p->state = TASK_RUNNING;
1141 INIT_LIST_HEAD(&p->run_list); 1310 INIT_LIST_HEAD(&p->run_list);
1142 p->array = NULL; 1311 p->array = NULL;
1143 spin_lock_init(&p->switch_lock);
1144#ifdef CONFIG_SCHEDSTATS 1312#ifdef CONFIG_SCHEDSTATS
1145 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1313 memset(&p->sched_info, 0, sizeof(p->sched_info));
1146#endif 1314#endif
1315#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1316 p->oncpu = 0;
1317#endif
1147#ifdef CONFIG_PREEMPT 1318#ifdef CONFIG_PREEMPT
1148 /* 1319 /* Want to start with kernel preemption disabled. */
1149 * During context-switch we hold precisely one spinlock, which
1150 * schedule_tail drops. (in the common case it's this_rq()->lock,
1151 * but it also can be p->switch_lock.) So we compensate with a count
1152 * of 1. Also, we want to start with kernel preemption disabled.
1153 */
1154 p->thread_info->preempt_count = 1; 1320 p->thread_info->preempt_count = 1;
1155#endif 1321#endif
1156 /* 1322 /*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
1174 * runqueue lock is not a problem. 1340 * runqueue lock is not a problem.
1175 */ 1341 */
1176 current->time_slice = 1; 1342 current->time_slice = 1;
1177 preempt_disable();
1178 scheduler_tick(); 1343 scheduler_tick();
1179 local_irq_enable(); 1344 }
1180 preempt_enable(); 1345 local_irq_enable();
1181 } else 1346 put_cpu();
1182 local_irq_enable();
1183} 1347}
1184 1348
1185/* 1349/*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1196 runqueue_t *rq, *this_rq; 1360 runqueue_t *rq, *this_rq;
1197 1361
1198 rq = task_rq_lock(p, &flags); 1362 rq = task_rq_lock(p, &flags);
1199 cpu = task_cpu(p);
1200 this_cpu = smp_processor_id();
1201
1202 BUG_ON(p->state != TASK_RUNNING); 1363 BUG_ON(p->state != TASK_RUNNING);
1364 this_cpu = smp_processor_id();
1365 cpu = task_cpu(p);
1203 1366
1204 /* 1367 /*
1205 * We decrease the sleep average of forking parents 1368 * We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
1296} 1459}
1297 1460
1298/** 1461/**
1462 * prepare_task_switch - prepare to switch tasks
1463 * @rq: the runqueue preparing to switch
1464 * @next: the task we are going to switch to.
1465 *
1466 * This is called with the rq lock held and interrupts off. It must
1467 * be paired with a subsequent finish_task_switch after the context
1468 * switch.
1469 *
1470 * prepare_task_switch sets up locking and calls architecture specific
1471 * hooks.
1472 */
1473static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1474{
1475 prepare_lock_switch(rq, next);
1476 prepare_arch_switch(next);
1477}
1478
1479/**
1299 * finish_task_switch - clean up after a task-switch 1480 * finish_task_switch - clean up after a task-switch
1300 * @prev: the thread we just switched away from. 1481 * @prev: the thread we just switched away from.
1301 * 1482 *
1302 * We enter this with the runqueue still locked, and finish_arch_switch() 1483 * finish_task_switch must be called after the context switch, paired
1303 * will unlock it along with doing any other architecture-specific cleanup 1484 * with a prepare_task_switch call before the context switch.
1304 * actions. 1485 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1486 * and do any other architecture-specific cleanup actions.
1305 * 1487 *
1306 * Note that we may have delayed dropping an mm in context_switch(). If 1488 * Note that we may have delayed dropping an mm in context_switch(). If
1307 * so, we finish that here outside of the runqueue lock. (Doing it 1489 * so, we finish that here outside of the runqueue lock. (Doing it
1308 * with the lock held can cause deadlocks; see schedule() for 1490 * with the lock held can cause deadlocks; see schedule() for
1309 * details.) 1491 * details.)
1310 */ 1492 */
1311static inline void finish_task_switch(task_t *prev) 1493static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1312 __releases(rq->lock) 1494 __releases(rq->lock)
1313{ 1495{
1314 runqueue_t *rq = this_rq();
1315 struct mm_struct *mm = rq->prev_mm; 1496 struct mm_struct *mm = rq->prev_mm;
1316 unsigned long prev_task_flags; 1497 unsigned long prev_task_flags;
1317 1498
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
1329 * Manfred Spraul <manfred@colorfullife.com> 1510 * Manfred Spraul <manfred@colorfullife.com>
1330 */ 1511 */
1331 prev_task_flags = prev->flags; 1512 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev); 1513 finish_arch_switch(prev);
1514 finish_lock_switch(rq, prev);
1333 if (mm) 1515 if (mm)
1334 mmdrop(mm); 1516 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD)) 1517 if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
1343asmlinkage void schedule_tail(task_t *prev) 1525asmlinkage void schedule_tail(task_t *prev)
1344 __releases(rq->lock) 1526 __releases(rq->lock)
1345{ 1527{
1346 finish_task_switch(prev); 1528 runqueue_t *rq = this_rq();
1347 1529 finish_task_switch(rq, prev);
1530#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1531 /* In this case, finish_task_switch does not reenable preemption */
1532 preempt_enable();
1533#endif
1348 if (current->set_child_tid) 1534 if (current->set_child_tid)
1349 put_user(current->pid, current->set_child_tid); 1535 put_user(current->pid, current->set_child_tid);
1350} 1536}
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1494} 1680}
1495 1681
1496/* 1682/*
1497 * find_idlest_cpu - find the least busy runqueue.
1498 */
1499static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1500 struct sched_domain *sd)
1501{
1502 unsigned long load, min_load, this_load;
1503 int i, min_cpu;
1504 cpumask_t mask;
1505
1506 min_cpu = UINT_MAX;
1507 min_load = ULONG_MAX;
1508
1509 cpus_and(mask, sd->span, p->cpus_allowed);
1510
1511 for_each_cpu_mask(i, mask) {
1512 load = target_load(i);
1513
1514 if (load < min_load) {
1515 min_cpu = i;
1516 min_load = load;
1517
1518 /* break out early on an idle CPU: */
1519 if (!min_load)
1520 break;
1521 }
1522 }
1523
1524 /* add +1 to account for the new task */
1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1526
1527 /*
1528 * Would with the addition of the new task to the
1529 * current CPU there be an imbalance between this
1530 * CPU and the idlest CPU?
1531 *
1532 * Use half of the balancing threshold - new-context is
1533 * a good opportunity to balance.
1534 */
1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1536 return min_cpu;
1537
1538 return this_cpu;
1539}
1540
1541/*
1542 * If dest_cpu is allowed for this process, migrate the task to it. 1683 * If dest_cpu is allowed for this process, migrate the task to it.
1543 * This is accomplished by forcing the cpu_allowed mask to only 1684 * This is accomplished by forcing the cpu_allowed mask to only
1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1685 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1571,37 +1712,16 @@ out:
1571} 1712}
1572 1713
1573/* 1714/*
1574 * sched_exec(): find the highest-level, exec-balance-capable 1715 * sched_exec - execve() is a valuable balancing opportunity, because at
1575 * domain and try to migrate the task to the least loaded CPU. 1716 * this point the task has the smallest effective memory and cache footprint.
1576 *
1577 * execve() is a valuable balancing opportunity, because at this point
1578 * the task has the smallest effective memory and cache footprint.
1579 */ 1717 */
1580void sched_exec(void) 1718void sched_exec(void)
1581{ 1719{
1582 struct sched_domain *tmp, *sd = NULL;
1583 int new_cpu, this_cpu = get_cpu(); 1720 int new_cpu, this_cpu = get_cpu();
1584 1721 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1585 /* Prefer the current CPU if there's only this task running */
1586 if (this_rq()->nr_running <= 1)
1587 goto out;
1588
1589 for_each_domain(this_cpu, tmp)
1590 if (tmp->flags & SD_BALANCE_EXEC)
1591 sd = tmp;
1592
1593 if (sd) {
1594 schedstat_inc(sd, sbe_attempts);
1595 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1596 if (new_cpu != this_cpu) {
1597 schedstat_inc(sd, sbe_pushed);
1598 put_cpu();
1599 sched_migrate_task(current, new_cpu);
1600 return;
1601 }
1602 }
1603out:
1604 put_cpu(); 1722 put_cpu();
1723 if (new_cpu != this_cpu)
1724 sched_migrate_task(current, new_cpu);
1605} 1725}
1606 1726
1607/* 1727/*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1632 */ 1752 */
1633static inline 1753static inline
1634int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle) 1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1636{ 1756{
1637 /* 1757 /*
1638 * We do not migrate tasks that are: 1758 * We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1760 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1641 * 3) are cache-hot on their current CPU. 1761 * 3) are cache-hot on their current CPU.
1642 */ 1762 */
1643 if (task_running(rq, p))
1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1763 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0; 1764 return 0;
1765 *all_pinned = 0;
1766
1767 if (task_running(rq, p))
1768 return 0;
1647 1769
1648 /* 1770 /*
1649 * Aggressive migration if: 1771 * Aggressive migration if:
1650 * 1) the [whole] cpu is idle, or 1772 * 1) task is cache cold, or
1651 * 2) too many balance attempts have failed. 1773 * 2) too many balance attempts have failed.
1652 */ 1774 */
1653 1775
1654 if (cpu_and_siblings_are_idle(this_cpu) || \ 1776 if (sd->nr_balance_failed > sd->cache_nice_tries)
1655 sd->nr_balance_failed > sd->cache_nice_tries)
1656 return 1; 1777 return 1;
1657 1778
1658 if (task_hot(p, rq->timestamp_last_tick, sd)) 1779 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0; 1780 return 0;
1660 return 1; 1781 return 1;
1661} 1782}
1662 1783
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1669 */ 1790 */
1670static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1791static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd, 1792 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle) 1793 enum idle_type idle, int *all_pinned)
1673{ 1794{
1674 prio_array_t *array, *dst_array; 1795 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr; 1796 struct list_head *head, *curr;
1676 int idx, pulled = 0; 1797 int idx, pulled = 0, pinned = 0;
1677 task_t *tmp; 1798 task_t *tmp;
1678 1799
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1) 1800 if (max_nr_move == 0)
1680 goto out; 1801 goto out;
1681 1802
1803 pinned = 1;
1804
1682 /* 1805 /*
1683 * We first consider expired tasks. Those will likely not be 1806 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to 1807 * executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
1717 1840
1718 curr = curr->prev; 1841 curr = curr->prev;
1719 1842
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { 1843 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1721 if (curr != head) 1844 if (curr != head)
1722 goto skip_queue; 1845 goto skip_queue;
1723 idx++; 1846 idx++;
@@ -1746,6 +1869,9 @@ out:
1746 * inside pull_task(). 1869 * inside pull_task().
1747 */ 1870 */
1748 schedstat_add(sd, lb_gained[idle], pulled); 1871 schedstat_add(sd, lb_gained[idle], pulled);
1872
1873 if (all_pinned)
1874 *all_pinned = pinned;
1749 return pulled; 1875 return pulled;
1750} 1876}
1751 1877
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1760{ 1886{
1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1889 int load_idx;
1763 1890
1764 max_load = this_load = total_load = total_pwr = 0; 1891 max_load = this_load = total_load = total_pwr = 0;
1892 if (idle == NOT_IDLE)
1893 load_idx = sd->busy_idx;
1894 else if (idle == NEWLY_IDLE)
1895 load_idx = sd->newidle_idx;
1896 else
1897 load_idx = sd->idle_idx;
1765 1898
1766 do { 1899 do {
1767 unsigned long load; 1900 unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1776 for_each_cpu_mask(i, group->cpumask) { 1909 for_each_cpu_mask(i, group->cpumask) {
1777 /* Bias balancing toward cpus of our domain */ 1910 /* Bias balancing toward cpus of our domain */
1778 if (local_group) 1911 if (local_group)
1779 load = target_load(i); 1912 load = target_load(i, load_idx);
1780 else 1913 else
1781 load = source_load(i); 1914 load = source_load(i, load_idx);
1782 1915
1783 avg_load += load; 1916 avg_load += load;
1784 } 1917 }
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1792 if (local_group) { 1925 if (local_group) {
1793 this_load = avg_load; 1926 this_load = avg_load;
1794 this = group; 1927 this = group;
1795 goto nextgroup;
1796 } else if (avg_load > max_load) { 1928 } else if (avg_load > max_load) {
1797 max_load = avg_load; 1929 max_load = avg_load;
1798 busiest = group; 1930 busiest = group;
1799 } 1931 }
1800nextgroup:
1801 group = group->next; 1932 group = group->next;
1802 } while (group != sd->groups); 1933 } while (group != sd->groups);
1803 1934
@@ -1870,15 +2001,9 @@ nextgroup:
1870 2001
1871 /* Get rid of the scaling factor, rounding down as we divide */ 2002 /* Get rid of the scaling factor, rounding down as we divide */
1872 *imbalance = *imbalance / SCHED_LOAD_SCALE; 2003 *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873
1874 return busiest; 2004 return busiest;
1875 2005
1876out_balanced: 2006out_balanced:
1877 if (busiest && (idle == NEWLY_IDLE ||
1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1879 *imbalance = 1;
1880 return busiest;
1881 }
1882 2007
1883 *imbalance = 0; 2008 *imbalance = 0;
1884 return NULL; 2009 return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1894 int i; 2019 int i;
1895 2020
1896 for_each_cpu_mask(i, group->cpumask) { 2021 for_each_cpu_mask(i, group->cpumask) {
1897 load = source_load(i); 2022 load = source_load(i, 0);
1898 2023
1899 if (load > max_load) { 2024 if (load > max_load) {
1900 max_load = load; 2025 max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1906} 2031}
1907 2032
1908/* 2033/*
2034 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2035 * so long as it is large enough.
2036 */
2037#define MAX_PINNED_INTERVAL 512
2038
2039/*
1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2040 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1910 * tasks if there is an imbalance. 2041 * tasks if there is an imbalance.
1911 * 2042 *
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1917 struct sched_group *group; 2048 struct sched_group *group;
1918 runqueue_t *busiest; 2049 runqueue_t *busiest;
1919 unsigned long imbalance; 2050 unsigned long imbalance;
1920 int nr_moved; 2051 int nr_moved, all_pinned = 0;
2052 int active_balance = 0;
1921 2053
1922 spin_lock(&this_rq->lock); 2054 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]); 2055 schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1934 goto out_balanced; 2066 goto out_balanced;
1935 } 2067 }
1936 2068
1937 /* 2069 BUG_ON(busiest == this_rq);
1938 * This should be "impossible", but since load
1939 * balancing is inherently racy and statistical,
1940 * it could happen in theory.
1941 */
1942 if (unlikely(busiest == this_rq)) {
1943 WARN_ON(1);
1944 goto out_balanced;
1945 }
1946 2070
1947 schedstat_add(sd, lb_imbalance[idle], imbalance); 2071 schedstat_add(sd, lb_imbalance[idle], imbalance);
1948 2072
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1956 */ 2080 */
1957 double_lock_balance(this_rq, busiest); 2081 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2082 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle); 2083 imbalance, sd, idle,
2084 &all_pinned);
1960 spin_unlock(&busiest->lock); 2085 spin_unlock(&busiest->lock);
2086
2087 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned))
2089 goto out_balanced;
1961 } 2090 }
2091
1962 spin_unlock(&this_rq->lock); 2092 spin_unlock(&this_rq->lock);
1963 2093
1964 if (!nr_moved) { 2094 if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1966 sd->nr_balance_failed++; 2096 sd->nr_balance_failed++;
1967 2097
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0;
1970 2099
1971 spin_lock(&busiest->lock); 2100 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) { 2101 if (!busiest->active_balance) {
1973 busiest->active_balance = 1; 2102 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu; 2103 busiest->push_cpu = this_cpu;
1975 wake = 1; 2104 active_balance = 1;
1976 } 2105 }
1977 spin_unlock(&busiest->lock); 2106 spin_unlock(&busiest->lock);
1978 if (wake) 2107 if (active_balance)
1979 wake_up_process(busiest->migration_thread); 2108 wake_up_process(busiest->migration_thread);
1980 2109
1981 /* 2110 /*
1982 * We've kicked active balancing, reset the failure 2111 * We've kicked active balancing, reset the failure
1983 * counter. 2112 * counter.
1984 */ 2113 */
1985 sd->nr_balance_failed = sd->cache_nice_tries; 2114 sd->nr_balance_failed = sd->cache_nice_tries+1;
1986 } 2115 }
1987 2116 } else
1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention.
1991 */
1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++;
1994 } else {
1995 sd->nr_balance_failed = 0; 2117 sd->nr_balance_failed = 0;
1996 2118
2119 if (likely(!active_balance)) {
1997 /* We were unbalanced, so reset the balancing interval */ 2120 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval; 2121 sd->balance_interval = sd->min_interval;
2122 } else {
2123 /*
2124 * If we've begun active balancing, start to back off. This
2125 * case may not be covered by the all_pinned logic if there
2126 * is only 1 task on the busy runqueue (because we don't call
2127 * move_tasks).
2128 */
2129 if (sd->balance_interval < sd->max_interval)
2130 sd->balance_interval *= 2;
1999 } 2131 }
2000 2132
2001 return nr_moved; 2133 return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
2005 2137
2006 schedstat_inc(sd, lb_balanced[idle]); 2138 schedstat_inc(sd, lb_balanced[idle]);
2007 2139
2140 sd->nr_balance_failed = 0;
2008 /* tune up the balancing interval */ 2141 /* tune up the balancing interval */
2009 if (sd->balance_interval < sd->max_interval) 2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval))
2010 sd->balance_interval *= 2; 2144 sd->balance_interval *= 2;
2011 2145
2012 return 0; 2146 return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2032 if (!group) { 2166 if (!group) {
2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2035 goto out; 2168 goto out_balanced;
2036 } 2169 }
2037 2170
2038 busiest = find_busiest_queue(group); 2171 busiest = find_busiest_queue(group);
2039 if (!busiest || busiest == this_rq) { 2172 if (!busiest) {
2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2173 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2042 goto out; 2174 goto out_balanced;
2043 } 2175 }
2044 2176
2177 BUG_ON(busiest == this_rq);
2178
2045 /* Attempt to move tasks */ 2179 /* Attempt to move tasks */
2046 double_lock_balance(this_rq, busiest); 2180 double_lock_balance(this_rq, busiest);
2047 2181
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2183 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE); 2184 imbalance, sd, NEWLY_IDLE, NULL);
2051 if (!nr_moved) 2185 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else
2188 sd->nr_balance_failed = 0;
2053 2189
2054 spin_unlock(&busiest->lock); 2190 spin_unlock(&busiest->lock);
2055
2056out:
2057 return nr_moved; 2191 return nr_moved;
2192
2193out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2195 sd->nr_balance_failed = 0;
2196 return 0;
2058} 2197}
2059 2198
2060/* 2199/*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2086static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2225static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2087{ 2226{
2088 struct sched_domain *sd; 2227 struct sched_domain *sd;
2089 struct sched_group *cpu_group;
2090 runqueue_t *target_rq; 2228 runqueue_t *target_rq;
2091 cpumask_t visited_cpus; 2229 int target_cpu = busiest_rq->push_cpu;
2092 int cpu; 2230
2231 if (busiest_rq->nr_running <= 1)
2232 /* no task to move */
2233 return;
2234
2235 target_rq = cpu_rq(target_cpu);
2093 2236
2094 /* 2237 /*
2095 * Search for suitable CPUs to push tasks to in successively higher 2238 * This condition is "impossible", if it occurs
2096 * domains with SD_LOAD_BALANCE set. 2239 * we need to fix it. Originally reported by
2240 * Bjorn Helgaas on a 128-cpu setup.
2097 */ 2241 */
2098 visited_cpus = CPU_MASK_NONE; 2242 BUG_ON(busiest_rq == target_rq);
2099 for_each_domain(busiest_cpu, sd) {
2100 if (!(sd->flags & SD_LOAD_BALANCE))
2101 /* no more domains to search */
2102 break;
2103 2243
2104 schedstat_inc(sd, alb_cnt); 2244 /* move a task from busiest_rq to target_rq */
2245 double_lock_balance(busiest_rq, target_rq);
2105 2246
2106 cpu_group = sd->groups; 2247 /* Search for an sd spanning us and the target CPU. */
2107 do { 2248 for_each_domain(target_cpu, sd)
2108 for_each_cpu_mask(cpu, cpu_group->cpumask) { 2249 if ((sd->flags & SD_LOAD_BALANCE) &&
2109 if (busiest_rq->nr_running <= 1) 2250 cpu_isset(busiest_cpu, sd->span))
2110 /* no more tasks left to move */ 2251 break;
2111 return; 2252
2112 if (cpu_isset(cpu, visited_cpus)) 2253 if (unlikely(sd == NULL))
2113 continue; 2254 goto out;
2114 cpu_set(cpu, visited_cpus); 2255
2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) 2256 schedstat_inc(sd, alb_cnt);
2116 continue; 2257
2117 2258 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2118 target_rq = cpu_rq(cpu); 2259 schedstat_inc(sd, alb_pushed);
2119 /* 2260 else
2120 * This condition is "impossible", if it occurs 2261 schedstat_inc(sd, alb_failed);
2121 * we need to fix it. Originally reported by 2262out:
2122 * Bjorn Helgaas on a 128-cpu setup. 2263 spin_unlock(&target_rq->lock);
2123 */
2124 BUG_ON(busiest_rq == target_rq);
2125
2126 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) {
2130 schedstat_inc(sd, alb_pushed);
2131 } else {
2132 schedstat_inc(sd, alb_failed);
2133 }
2134 spin_unlock(&target_rq->lock);
2135 }
2136 cpu_group = cpu_group->next;
2137 } while (cpu_group != sd->groups);
2138 }
2139} 2264}
2140 2265
2141/* 2266/*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2156 unsigned long old_load, this_load; 2281 unsigned long old_load, this_load;
2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2282 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2158 struct sched_domain *sd; 2283 struct sched_domain *sd;
2284 int i;
2159 2285
2160 /* Update our load */
2161 old_load = this_rq->cpu_load;
2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2286 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2163 /* 2287 /* Update our load */
2164 * Round up the averaging division if load is increasing. This 2288 for (i = 0; i < 3; i++) {
2165 * prevents us from getting stuck on 9 if the load is 10, for 2289 unsigned long new_load = this_load;
2166 * example. 2290 int scale = 1 << i;
2167 */ 2291 old_load = this_rq->cpu_load[i];
2168 if (this_load > old_load) 2292 /*
2169 old_load++; 2293 * Round up the averaging division if load is increasing. This
2170 this_rq->cpu_load = (old_load + this_load) / 2; 2294 * prevents us from getting stuck on 9 if the load is 10, for
2295 * example.
2296 */
2297 if (new_load > old_load)
2298 new_load += scale-1;
2299 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2300 }
2171 2301
2172 for_each_domain(this_cpu, sd) { 2302 for_each_domain(this_cpu, sd) {
2173 unsigned long interval; 2303 unsigned long interval;
@@ -2447,11 +2577,15 @@ out:
2447#ifdef CONFIG_SCHED_SMT 2577#ifdef CONFIG_SCHED_SMT
2448static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2449{ 2579{
2450 struct sched_domain *sd = this_rq->sd; 2580 struct sched_domain *tmp, *sd = NULL;
2451 cpumask_t sibling_map; 2581 cpumask_t sibling_map;
2452 int i; 2582 int i;
2453 2583
2454 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2584 for_each_domain(this_cpu, tmp)
2585 if (tmp->flags & SD_SHARE_CPUPOWER)
2586 sd = tmp;
2587
2588 if (!sd)
2455 return; 2589 return;
2456 2590
2457 /* 2591 /*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2492 2626
2493static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2494{ 2628{
2495 struct sched_domain *sd = this_rq->sd; 2629 struct sched_domain *tmp, *sd = NULL;
2496 cpumask_t sibling_map; 2630 cpumask_t sibling_map;
2497 prio_array_t *array; 2631 prio_array_t *array;
2498 int ret = 0, i; 2632 int ret = 0, i;
2499 task_t *p; 2633 task_t *p;
2500 2634
2501 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2635 for_each_domain(this_cpu, tmp)
2636 if (tmp->flags & SD_SHARE_CPUPOWER)
2637 sd = tmp;
2638
2639 if (!sd)
2502 return 0; 2640 return 0;
2503 2641
2504 /* 2642 /*
@@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val)
2576 /* 2714 /*
2577 * Underflow? 2715 * Underflow?
2578 */ 2716 */
2579 BUG_ON(((int)preempt_count() < 0)); 2717 BUG_ON((preempt_count() < 0));
2580 preempt_count() += val; 2718 preempt_count() += val;
2581 /* 2719 /*
2582 * Spinlock count overflowing soon? 2720 * Spinlock count overflowing soon?
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
2613 struct list_head *queue; 2751 struct list_head *queue;
2614 unsigned long long now; 2752 unsigned long long now;
2615 unsigned long run_time; 2753 unsigned long run_time;
2616 int cpu, idx; 2754 int cpu, idx, new_prio;
2617 2755
2618 /* 2756 /*
2619 * Test if we are atomic. Since do_exit() needs to call into 2757 * Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2873 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2736 2874
2737 array = next->array; 2875 array = next->array;
2738 dequeue_task(next, array); 2876 new_prio = recalc_task_prio(next, next->timestamp + delta);
2739 recalc_task_prio(next, next->timestamp + delta); 2877
2740 enqueue_task(next, array); 2878 if (unlikely(next->prio != new_prio)) {
2879 dequeue_task(next, array);
2880 next->prio = new_prio;
2881 enqueue_task(next, array);
2882 } else
2883 requeue_task(next, array);
2741 } 2884 }
2742 next->activated = 0; 2885 next->activated = 0;
2743switch_tasks: 2886switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
2761 rq->curr = next; 2904 rq->curr = next;
2762 ++*switch_count; 2905 ++*switch_count;
2763 2906
2764 prepare_arch_switch(rq, next); 2907 prepare_task_switch(rq, next);
2765 prev = context_switch(rq, prev, next); 2908 prev = context_switch(rq, prev, next);
2766 barrier(); 2909 barrier();
2767 2910 /*
2768 finish_task_switch(prev); 2911 * this_rq must be evaluated again because prev may have moved
2912 * CPUs since it called schedule(), thus the 'rq' on its stack
2913 * frame will be invalid.
2914 */
2915 finish_task_switch(this_rq(), prev);
2769 } else 2916 } else
2770 spin_unlock_irq(&rq->lock); 2917 spin_unlock_irq(&rq->lock);
2771 2918
@@ -2869,7 +3016,7 @@ need_resched:
2869 3016
2870int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 3017int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2871{ 3018{
2872 task_t *p = curr->task; 3019 task_t *p = curr->private;
2873 return try_to_wake_up(p, mode, sync); 3020 return try_to_wake_up(p, mode, sync);
2874} 3021}
2875 3022
@@ -3384,13 +3531,24 @@ recheck:
3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3531 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3385 return -EINVAL; 3532 return -EINVAL;
3386 3533
3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) && 3534 /*
3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && 3535 * Allow unprivileged RT tasks to decrease priority:
3389 !capable(CAP_SYS_NICE)) 3536 */
3390 return -EPERM; 3537 if (!capable(CAP_SYS_NICE)) {
3391 if ((current->euid != p->euid) && (current->euid != p->uid) && 3538 /* can't change policy */
3392 !capable(CAP_SYS_NICE)) 3539 if (policy != p->policy)
3393 return -EPERM; 3540 return -EPERM;
3541 /* can't increase priority */
3542 if (policy != SCHED_NORMAL &&
3543 param->sched_priority > p->rt_priority &&
3544 param->sched_priority >
3545 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3546 return -EPERM;
3547 /* can't change other user's priorities */
3548 if ((current->euid != p->euid) &&
3549 (current->euid != p->uid))
3550 return -EPERM;
3551 }
3394 3552
3395 retval = security_task_setscheduler(p, policy, param); 3553 retval = security_task_setscheduler(p, policy, param);
3396 if (retval) 3554 if (retval)
@@ -3755,19 +3913,22 @@ EXPORT_SYMBOL(cond_resched);
3755 */ 3913 */
3756int cond_resched_lock(spinlock_t * lock) 3914int cond_resched_lock(spinlock_t * lock)
3757{ 3915{
3916 int ret = 0;
3917
3758 if (need_lockbreak(lock)) { 3918 if (need_lockbreak(lock)) {
3759 spin_unlock(lock); 3919 spin_unlock(lock);
3760 cpu_relax(); 3920 cpu_relax();
3921 ret = 1;
3761 spin_lock(lock); 3922 spin_lock(lock);
3762 } 3923 }
3763 if (need_resched()) { 3924 if (need_resched()) {
3764 _raw_spin_unlock(lock); 3925 _raw_spin_unlock(lock);
3765 preempt_enable_no_resched(); 3926 preempt_enable_no_resched();
3766 __cond_resched(); 3927 __cond_resched();
3928 ret = 1;
3767 spin_lock(lock); 3929 spin_lock(lock);
3768 return 1;
3769 } 3930 }
3770 return 0; 3931 return ret;
3771} 3932}
3772 3933
3773EXPORT_SYMBOL(cond_resched_lock); 3934EXPORT_SYMBOL(cond_resched_lock);
@@ -3811,7 +3972,7 @@ EXPORT_SYMBOL(yield);
3811 */ 3972 */
3812void __sched io_schedule(void) 3973void __sched io_schedule(void)
3813{ 3974{
3814 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3975 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3815 3976
3816 atomic_inc(&rq->nr_iowait); 3977 atomic_inc(&rq->nr_iowait);
3817 schedule(); 3978 schedule();
@@ -3822,7 +3983,7 @@ EXPORT_SYMBOL(io_schedule);
3822 3983
3823long __sched io_schedule_timeout(long timeout) 3984long __sched io_schedule_timeout(long timeout)
3824{ 3985{
3825 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3986 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3826 long ret; 3987 long ret;
3827 3988
3828 atomic_inc(&rq->nr_iowait); 3989 atomic_inc(&rq->nr_iowait);
@@ -4027,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)
4027 4188
4028 spin_lock_irqsave(&rq->lock, flags); 4189 spin_lock_irqsave(&rq->lock, flags);
4029 rq->curr = rq->idle = idle; 4190 rq->curr = rq->idle = idle;
4191#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4192 idle->oncpu = 1;
4193#endif
4030 set_tsk_need_resched(idle); 4194 set_tsk_need_resched(idle);
4031 spin_unlock_irqrestore(&rq->lock, flags); 4195 spin_unlock_irqrestore(&rq->lock, flags);
4032 4196
@@ -4171,8 +4335,7 @@ static int migration_thread(void * data)
4171 struct list_head *head; 4335 struct list_head *head;
4172 migration_req_t *req; 4336 migration_req_t *req;
4173 4337
4174 if (current->flags & PF_FREEZE) 4338 try_to_freeze();
4175 refrigerator(PF_FREEZE);
4176 4339
4177 spin_lock_irq(&rq->lock); 4340 spin_lock_irq(&rq->lock);
4178 4341
@@ -4197,17 +4360,9 @@ static int migration_thread(void * data)
4197 req = list_entry(head->next, migration_req_t, list); 4360 req = list_entry(head->next, migration_req_t, list);
4198 list_del_init(head->next); 4361 list_del_init(head->next);
4199 4362
4200 if (req->type == REQ_MOVE_TASK) { 4363 spin_unlock(&rq->lock);
4201 spin_unlock(&rq->lock); 4364 __migrate_task(req->task, cpu, req->dest_cpu);
4202 __migrate_task(req->task, cpu, req->dest_cpu); 4365 local_irq_enable();
4203 local_irq_enable();
4204 } else if (req->type == REQ_SET_DOMAIN) {
4205 rq->sd = req->sd;
4206 spin_unlock_irq(&rq->lock);
4207 } else {
4208 spin_unlock_irq(&rq->lock);
4209 WARN_ON(1);
4210 }
4211 4366
4212 complete(&req->done); 4367 complete(&req->done);
4213 } 4368 }
@@ -4438,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4438 migration_req_t *req; 4593 migration_req_t *req;
4439 req = list_entry(rq->migration_queue.next, 4594 req = list_entry(rq->migration_queue.next,
4440 migration_req_t, list); 4595 migration_req_t, list);
4441 BUG_ON(req->type != REQ_MOVE_TASK);
4442 list_del_init(&req->list); 4596 list_del_init(&req->list);
4443 complete(&req->done); 4597 complete(&req->done);
4444 } 4598 }
@@ -4469,12 +4623,17 @@ int __init migration_init(void)
4469#endif 4623#endif
4470 4624
4471#ifdef CONFIG_SMP 4625#ifdef CONFIG_SMP
4472#define SCHED_DOMAIN_DEBUG 4626#undef SCHED_DOMAIN_DEBUG
4473#ifdef SCHED_DOMAIN_DEBUG 4627#ifdef SCHED_DOMAIN_DEBUG
4474static void sched_domain_debug(struct sched_domain *sd, int cpu) 4628static void sched_domain_debug(struct sched_domain *sd, int cpu)
4475{ 4629{
4476 int level = 0; 4630 int level = 0;
4477 4631
4632 if (!sd) {
4633 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4634 return;
4635 }
4636
4478 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4637 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4479 4638
4480 do { 4639 do {
@@ -4557,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4557#define sched_domain_debug(sd, cpu) {} 4716#define sched_domain_debug(sd, cpu) {}
4558#endif 4717#endif
4559 4718
4719static int sd_degenerate(struct sched_domain *sd)
4720{
4721 if (cpus_weight(sd->span) == 1)
4722 return 1;
4723
4724 /* Following flags need at least 2 groups */
4725 if (sd->flags & (SD_LOAD_BALANCE |
4726 SD_BALANCE_NEWIDLE |
4727 SD_BALANCE_FORK |
4728 SD_BALANCE_EXEC)) {
4729 if (sd->groups != sd->groups->next)
4730 return 0;
4731 }
4732
4733 /* Following flags don't use groups */
4734 if (sd->flags & (SD_WAKE_IDLE |
4735 SD_WAKE_AFFINE |
4736 SD_WAKE_BALANCE))
4737 return 0;
4738
4739 return 1;
4740}
4741
4742static int sd_parent_degenerate(struct sched_domain *sd,
4743 struct sched_domain *parent)
4744{
4745 unsigned long cflags = sd->flags, pflags = parent->flags;
4746
4747 if (sd_degenerate(parent))
4748 return 1;
4749
4750 if (!cpus_equal(sd->span, parent->span))
4751 return 0;
4752
4753 /* Does parent contain flags not in child? */
4754 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
4755 if (cflags & SD_WAKE_AFFINE)
4756 pflags &= ~SD_WAKE_BALANCE;
4757 /* Flags needing groups don't count if only 1 group in parent */
4758 if (parent->groups == parent->groups->next) {
4759 pflags &= ~(SD_LOAD_BALANCE |
4760 SD_BALANCE_NEWIDLE |
4761 SD_BALANCE_FORK |
4762 SD_BALANCE_EXEC);
4763 }
4764 if (~cflags & pflags)
4765 return 0;
4766
4767 return 1;
4768}
4769
4560/* 4770/*
4561 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4771 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4562 * hold the hotplug lock. 4772 * hold the hotplug lock.
4563 */ 4773 */
4564void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) 4774void cpu_attach_domain(struct sched_domain *sd, int cpu)
4565{ 4775{
4566 migration_req_t req;
4567 unsigned long flags;
4568 runqueue_t *rq = cpu_rq(cpu); 4776 runqueue_t *rq = cpu_rq(cpu);
4569 int local = 1; 4777 struct sched_domain *tmp;
4570
4571 sched_domain_debug(sd, cpu);
4572 4778
4573 spin_lock_irqsave(&rq->lock, flags); 4779 /* Remove the sched domains which do not contribute to scheduling. */
4574 4780 for (tmp = sd; tmp; tmp = tmp->parent) {
4575 if (cpu == smp_processor_id() || !cpu_online(cpu)) { 4781 struct sched_domain *parent = tmp->parent;
4576 rq->sd = sd; 4782 if (!parent)
4577 } else { 4783 break;
4578 init_completion(&req.done); 4784 if (sd_parent_degenerate(tmp, parent))
4579 req.type = REQ_SET_DOMAIN; 4785 tmp->parent = parent->parent;
4580 req.sd = sd;
4581 list_add(&req.list, &rq->migration_queue);
4582 local = 0;
4583 } 4786 }
4584 4787
4585 spin_unlock_irqrestore(&rq->lock, flags); 4788 if (sd && sd_degenerate(sd))
4789 sd = sd->parent;
4586 4790
4587 if (!local) { 4791 sched_domain_debug(sd, cpu);
4588 wake_up_process(rq->migration_thread); 4792
4589 wait_for_completion(&req.done); 4793 rcu_assign_pointer(rq->sd, sd);
4590 }
4591} 4794}
4592 4795
4593/* cpus with isolated domains */ 4796/* cpus with isolated domains */
@@ -4619,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
4619 * covered by the given span, and will set each group's ->cpumask correctly, 4822 * covered by the given span, and will set each group's ->cpumask correctly,
4620 * and ->cpu_power to 0. 4823 * and ->cpu_power to 0.
4621 */ 4824 */
4622void __devinit init_sched_build_groups(struct sched_group groups[], 4825void init_sched_build_groups(struct sched_group groups[],
4623 cpumask_t span, int (*group_fn)(int cpu)) 4826 cpumask_t span, int (*group_fn)(int cpu))
4624{ 4827{
4625 struct sched_group *first = NULL, *last = NULL; 4828 struct sched_group *first = NULL, *last = NULL;
@@ -4655,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
4655 4858
4656 4859
4657#ifdef ARCH_HAS_SCHED_DOMAIN 4860#ifdef ARCH_HAS_SCHED_DOMAIN
4658extern void __devinit arch_init_sched_domains(void); 4861extern void build_sched_domains(const cpumask_t *cpu_map);
4659extern void __devinit arch_destroy_sched_domains(void); 4862extern void arch_init_sched_domains(const cpumask_t *cpu_map);
4863extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
4660#else 4864#else
4661#ifdef CONFIG_SCHED_SMT 4865#ifdef CONFIG_SCHED_SMT
4662static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4866static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4663static struct sched_group sched_group_cpus[NR_CPUS]; 4867static struct sched_group sched_group_cpus[NR_CPUS];
4664static int __devinit cpu_to_cpu_group(int cpu) 4868static int cpu_to_cpu_group(int cpu)
4665{ 4869{
4666 return cpu; 4870 return cpu;
4667} 4871}
@@ -4669,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
4669 4873
4670static DEFINE_PER_CPU(struct sched_domain, phys_domains); 4874static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4671static struct sched_group sched_group_phys[NR_CPUS]; 4875static struct sched_group sched_group_phys[NR_CPUS];
4672static int __devinit cpu_to_phys_group(int cpu) 4876static int cpu_to_phys_group(int cpu)
4673{ 4877{
4674#ifdef CONFIG_SCHED_SMT 4878#ifdef CONFIG_SCHED_SMT
4675 return first_cpu(cpu_sibling_map[cpu]); 4879 return first_cpu(cpu_sibling_map[cpu]);
@@ -4682,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu)
4682 4886
4683static DEFINE_PER_CPU(struct sched_domain, node_domains); 4887static DEFINE_PER_CPU(struct sched_domain, node_domains);
4684static struct sched_group sched_group_nodes[MAX_NUMNODES]; 4888static struct sched_group sched_group_nodes[MAX_NUMNODES];
4685static int __devinit cpu_to_node_group(int cpu) 4889static int cpu_to_node_group(int cpu)
4686{ 4890{
4687 return cpu_to_node(cpu); 4891 return cpu_to_node(cpu);
4688} 4892}
@@ -4713,39 +4917,28 @@ static void check_sibling_maps(void)
4713#endif 4917#endif
4714 4918
4715/* 4919/*
4716 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 4920 * Build sched domains for a given set of cpus and attach the sched domains
4921 * to the individual cpus
4717 */ 4922 */
4718static void __devinit arch_init_sched_domains(void) 4923static void build_sched_domains(const cpumask_t *cpu_map)
4719{ 4924{
4720 int i; 4925 int i;
4721 cpumask_t cpu_default_map;
4722
4723#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4724 check_sibling_maps();
4725#endif
4726 /*
4727 * Setup mask for cpus without special case scheduling requirements.
4728 * For now this just excludes isolated cpus, but could be used to
4729 * exclude other special cases in the future.
4730 */
4731 cpus_complement(cpu_default_map, cpu_isolated_map);
4732 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4733 4926
4734 /* 4927 /*
4735 * Set up domains. Isolated domains just stay on the dummy domain. 4928 * Set up domains for cpus specified by the cpu_map.
4736 */ 4929 */
4737 for_each_cpu_mask(i, cpu_default_map) { 4930 for_each_cpu_mask(i, *cpu_map) {
4738 int group; 4931 int group;
4739 struct sched_domain *sd = NULL, *p; 4932 struct sched_domain *sd = NULL, *p;
4740 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 4933 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4741 4934
4742 cpus_and(nodemask, nodemask, cpu_default_map); 4935 cpus_and(nodemask, nodemask, *cpu_map);
4743 4936
4744#ifdef CONFIG_NUMA 4937#ifdef CONFIG_NUMA
4745 sd = &per_cpu(node_domains, i); 4938 sd = &per_cpu(node_domains, i);
4746 group = cpu_to_node_group(i); 4939 group = cpu_to_node_group(i);
4747 *sd = SD_NODE_INIT; 4940 *sd = SD_NODE_INIT;
4748 sd->span = cpu_default_map; 4941 sd->span = *cpu_map;
4749 sd->groups = &sched_group_nodes[group]; 4942 sd->groups = &sched_group_nodes[group];
4750#endif 4943#endif
4751 4944
@@ -4763,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void)
4763 group = cpu_to_cpu_group(i); 4956 group = cpu_to_cpu_group(i);
4764 *sd = SD_SIBLING_INIT; 4957 *sd = SD_SIBLING_INIT;
4765 sd->span = cpu_sibling_map[i]; 4958 sd->span = cpu_sibling_map[i];
4766 cpus_and(sd->span, sd->span, cpu_default_map); 4959 cpus_and(sd->span, sd->span, *cpu_map);
4767 sd->parent = p; 4960 sd->parent = p;
4768 sd->groups = &sched_group_cpus[group]; 4961 sd->groups = &sched_group_cpus[group];
4769#endif 4962#endif
@@ -4773,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void)
4773 /* Set up CPU (sibling) groups */ 4966 /* Set up CPU (sibling) groups */
4774 for_each_online_cpu(i) { 4967 for_each_online_cpu(i) {
4775 cpumask_t this_sibling_map = cpu_sibling_map[i]; 4968 cpumask_t this_sibling_map = cpu_sibling_map[i];
4776 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); 4969 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4777 if (i != first_cpu(this_sibling_map)) 4970 if (i != first_cpu(this_sibling_map))
4778 continue; 4971 continue;
4779 4972
@@ -4786,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void)
4786 for (i = 0; i < MAX_NUMNODES; i++) { 4979 for (i = 0; i < MAX_NUMNODES; i++) {
4787 cpumask_t nodemask = node_to_cpumask(i); 4980 cpumask_t nodemask = node_to_cpumask(i);
4788 4981
4789 cpus_and(nodemask, nodemask, cpu_default_map); 4982 cpus_and(nodemask, nodemask, *cpu_map);
4790 if (cpus_empty(nodemask)) 4983 if (cpus_empty(nodemask))
4791 continue; 4984 continue;
4792 4985
@@ -4796,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void)
4796 4989
4797#ifdef CONFIG_NUMA 4990#ifdef CONFIG_NUMA
4798 /* Set up node groups */ 4991 /* Set up node groups */
4799 init_sched_build_groups(sched_group_nodes, cpu_default_map, 4992 init_sched_build_groups(sched_group_nodes, *cpu_map,
4800 &cpu_to_node_group); 4993 &cpu_to_node_group);
4801#endif 4994#endif
4802 4995
4803 /* Calculate CPU power for physical packages and nodes */ 4996 /* Calculate CPU power for physical packages and nodes */
4804 for_each_cpu_mask(i, cpu_default_map) { 4997 for_each_cpu_mask(i, *cpu_map) {
4805 int power; 4998 int power;
4806 struct sched_domain *sd; 4999 struct sched_domain *sd;
4807#ifdef CONFIG_SCHED_SMT 5000#ifdef CONFIG_SCHED_SMT
@@ -4825,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void)
4825 } 5018 }
4826 5019
4827 /* Attach the domains */ 5020 /* Attach the domains */
4828 for_each_online_cpu(i) { 5021 for_each_cpu_mask(i, *cpu_map) {
4829 struct sched_domain *sd; 5022 struct sched_domain *sd;
4830#ifdef CONFIG_SCHED_SMT 5023#ifdef CONFIG_SCHED_SMT
4831 sd = &per_cpu(cpu_domains, i); 5024 sd = &per_cpu(cpu_domains, i);
@@ -4835,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void)
4835 cpu_attach_domain(sd, i); 5028 cpu_attach_domain(sd, i);
4836 } 5029 }
4837} 5030}
5031/*
5032 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5033 */
5034static void arch_init_sched_domains(cpumask_t *cpu_map)
5035{
5036 cpumask_t cpu_default_map;
5037
5038#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5039 check_sibling_maps();
5040#endif
5041 /*
5042 * Setup mask for cpus without special case scheduling requirements.
5043 * For now this just excludes isolated cpus, but could be used to
5044 * exclude other special cases in the future.
5045 */
5046 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5047
5048 build_sched_domains(&cpu_default_map);
5049}
4838 5050
4839#ifdef CONFIG_HOTPLUG_CPU 5051static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
4840static void __devinit arch_destroy_sched_domains(void)
4841{ 5052{
4842 /* Do nothing: everything is statically allocated. */ 5053 /* Do nothing: everything is statically allocated. */
4843} 5054}
4844#endif
4845 5055
4846#endif /* ARCH_HAS_SCHED_DOMAIN */ 5056#endif /* ARCH_HAS_SCHED_DOMAIN */
4847 5057
4848/* 5058/*
4849 * Initial dummy domain for early boot and for hotplug cpu. Being static, 5059 * Detach sched domains from a group of cpus specified in cpu_map
4850 * it is initialized to zero, so all balancing flags are cleared which is 5060 * These cpus will now be attached to the NULL domain
4851 * what we want.
4852 */ 5061 */
4853static struct sched_domain sched_domain_dummy; 5062static inline void detach_destroy_domains(const cpumask_t *cpu_map)
5063{
5064 int i;
5065
5066 for_each_cpu_mask(i, *cpu_map)
5067 cpu_attach_domain(NULL, i);
5068 synchronize_sched();
5069 arch_destroy_sched_domains(cpu_map);
5070}
5071
5072/*
5073 * Partition sched domains as specified by the cpumasks below.
5074 * This attaches all cpus from the cpumasks to the NULL domain,
5075 * waits for a RCU quiescent period, recalculates sched
5076 * domain information and then attaches them back to the
5077 * correct sched domains
5078 * Call with hotplug lock held
5079 */
5080void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
5081{
5082 cpumask_t change_map;
5083
5084 cpus_and(*partition1, *partition1, cpu_online_map);
5085 cpus_and(*partition2, *partition2, cpu_online_map);
5086 cpus_or(change_map, *partition1, *partition2);
5087
5088 /* Detach sched domains from all of the affected cpus */
5089 detach_destroy_domains(&change_map);
5090 if (!cpus_empty(*partition1))
5091 build_sched_domains(partition1);
5092 if (!cpus_empty(*partition2))
5093 build_sched_domains(partition2);
5094}
4854 5095
4855#ifdef CONFIG_HOTPLUG_CPU 5096#ifdef CONFIG_HOTPLUG_CPU
4856/* 5097/*
4857 * Force a reinitialization of the sched domains hierarchy. The domains 5098 * Force a reinitialization of the sched domains hierarchy. The domains
4858 * and groups cannot be updated in place without racing with the balancing 5099 * and groups cannot be updated in place without racing with the balancing
4859 * code, so we temporarily attach all running cpus to a "dummy" domain 5100 * code, so we temporarily attach all running cpus to the NULL domain
4860 * which will prevent rebalancing while the sched domains are recalculated. 5101 * which will prevent rebalancing while the sched domains are recalculated.
4861 */ 5102 */
4862static int update_sched_domains(struct notifier_block *nfb, 5103static int update_sched_domains(struct notifier_block *nfb,
4863 unsigned long action, void *hcpu) 5104 unsigned long action, void *hcpu)
4864{ 5105{
4865 int i;
4866
4867 switch (action) { 5106 switch (action) {
4868 case CPU_UP_PREPARE: 5107 case CPU_UP_PREPARE:
4869 case CPU_DOWN_PREPARE: 5108 case CPU_DOWN_PREPARE:
4870 for_each_online_cpu(i) 5109 detach_destroy_domains(&cpu_online_map);
4871 cpu_attach_domain(&sched_domain_dummy, i);
4872 arch_destroy_sched_domains();
4873 return NOTIFY_OK; 5110 return NOTIFY_OK;
4874 5111
4875 case CPU_UP_CANCELED: 5112 case CPU_UP_CANCELED:
@@ -4885,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4885 } 5122 }
4886 5123
4887 /* The hotplug lock is already held by cpu_up/cpu_down */ 5124 /* The hotplug lock is already held by cpu_up/cpu_down */
4888 arch_init_sched_domains(); 5125 arch_init_sched_domains(&cpu_online_map);
4889 5126
4890 return NOTIFY_OK; 5127 return NOTIFY_OK;
4891} 5128}
@@ -4894,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4894void __init sched_init_smp(void) 5131void __init sched_init_smp(void)
4895{ 5132{
4896 lock_cpu_hotplug(); 5133 lock_cpu_hotplug();
4897 arch_init_sched_domains(); 5134 arch_init_sched_domains(&cpu_online_map);
4898 unlock_cpu_hotplug(); 5135 unlock_cpu_hotplug();
4899 /* XXX: Theoretical race here - CPU may be hotplugged now */ 5136 /* XXX: Theoretical race here - CPU may be hotplugged now */
4900 hotcpu_notifier(update_sched_domains, 0); 5137 hotcpu_notifier(update_sched_domains, 0);
@@ -4924,13 +5161,15 @@ void __init sched_init(void)
4924 5161
4925 rq = cpu_rq(i); 5162 rq = cpu_rq(i);
4926 spin_lock_init(&rq->lock); 5163 spin_lock_init(&rq->lock);
5164 rq->nr_running = 0;
4927 rq->active = rq->arrays; 5165 rq->active = rq->arrays;
4928 rq->expired = rq->arrays + 1; 5166 rq->expired = rq->arrays + 1;
4929 rq->best_expired_prio = MAX_PRIO; 5167 rq->best_expired_prio = MAX_PRIO;
4930 5168
4931#ifdef CONFIG_SMP 5169#ifdef CONFIG_SMP
4932 rq->sd = &sched_domain_dummy; 5170 rq->sd = NULL;
4933 rq->cpu_load = 0; 5171 for (j = 1; j < 3; j++)
5172 rq->cpu_load[j] = 0;
4934 rq->active_balance = 0; 5173 rq->active_balance = 0;
4935 rq->push_cpu = 0; 5174 rq->push_cpu = 0;
4936 rq->migration_thread = NULL; 5175 rq->migration_thread = NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index b3c24c732c5a..ca1186eef938 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -24,6 +24,7 @@
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/signal.h> 26#include <linux/signal.h>
27#include <linux/audit.h>
27#include <asm/param.h> 28#include <asm/param.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <asm/unistd.h> 30#include <asm/unistd.h>
@@ -212,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
212fastcall void recalc_sigpending_tsk(struct task_struct *t) 213fastcall void recalc_sigpending_tsk(struct task_struct *t)
213{ 214{
214 if (t->signal->group_stop_count > 0 || 215 if (t->signal->group_stop_count > 0 ||
216 (freezing(t)) ||
215 PENDING(&t->pending, &t->blocked) || 217 PENDING(&t->pending, &t->blocked) ||
216 PENDING(&t->signal->shared_pending, &t->blocked)) 218 PENDING(&t->signal->shared_pending, &t->blocked))
217 set_tsk_thread_flag(t, TIF_SIGPENDING); 219 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -667,7 +669,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
667 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 669 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
668 && !capable(CAP_KILL)) 670 && !capable(CAP_KILL))
669 return error; 671 return error;
670 return security_task_kill(t, info, sig); 672
673 error = security_task_kill(t, info, sig);
674 if (!error)
675 audit_signal_info(sig, t); /* Let audit system see the signal */
676 return error;
671} 677}
672 678
673/* forward decl */ 679/* forward decl */
@@ -2225,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2225 current->state = TASK_INTERRUPTIBLE; 2231 current->state = TASK_INTERRUPTIBLE;
2226 timeout = schedule_timeout(timeout); 2232 timeout = schedule_timeout(timeout);
2227 2233
2228 if (current->flags & PF_FREEZE) 2234 try_to_freeze();
2229 refrigerator(PF_FREEZE);
2230 spin_lock_irq(&current->sighand->siglock); 2235 spin_lock_irq(&current->sighand->siglock);
2231 sig = dequeue_signal(current, &these, &info); 2236 sig = dequeue_signal(current, &these, &info);
2232 current->blocked = current->real_blocked; 2237 current->blocked = current->real_blocked;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6116b25aa7cf..84a9d18aa8da 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -100,7 +100,7 @@ static int stop_machine(void)
100 stopmachine_state = STOPMACHINE_WAIT; 100 stopmachine_state = STOPMACHINE_WAIT;
101 101
102 for_each_online_cpu(i) { 102 for_each_online_cpu(i) {
103 if (i == _smp_processor_id()) 103 if (i == raw_smp_processor_id())
104 continue; 104 continue;
105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
106 if (ret < 0) 106 if (ret < 0)
@@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
182 182
183 /* If they don't care which CPU fn runs on, bind to any online one. */ 183 /* If they don't care which CPU fn runs on, bind to any online one. */
184 if (cpu == NR_CPUS) 184 if (cpu == NR_CPUS)
185 cpu = _smp_processor_id(); 185 cpu = raw_smp_processor_id();
186 186
187 p = kthread_create(do_stop, &smdata, "kstopmachine"); 187 p = kthread_create(do_stop, &smdata, "kstopmachine");
188 if (!IS_ERR(p)) { 188 if (!IS_ERR(p)) {
diff --git a/kernel/sys.c b/kernel/sys.c
index f006632c2ba7..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/highuid.h> 17#include <linux/highuid.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/kexec.h>
19#include <linux/workqueue.h> 21#include <linux/workqueue.h>
20#include <linux/device.h> 22#include <linux/device.h>
21#include <linux/key.h> 23#include <linux/key.h>
@@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
405 case LINUX_REBOOT_CMD_HALT: 407 case LINUX_REBOOT_CMD_HALT:
406 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 408 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
407 system_state = SYSTEM_HALT; 409 system_state = SYSTEM_HALT;
410 device_suspend(PMSG_SUSPEND);
408 device_shutdown(); 411 device_shutdown();
409 printk(KERN_EMERG "System halted.\n"); 412 printk(KERN_EMERG "System halted.\n");
410 machine_halt(); 413 machine_halt();
@@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
415 case LINUX_REBOOT_CMD_POWER_OFF: 418 case LINUX_REBOOT_CMD_POWER_OFF:
416 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 419 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
417 system_state = SYSTEM_POWER_OFF; 420 system_state = SYSTEM_POWER_OFF;
421 device_suspend(PMSG_SUSPEND);
418 device_shutdown(); 422 device_shutdown();
419 printk(KERN_EMERG "Power down.\n"); 423 printk(KERN_EMERG "Power down.\n");
420 machine_power_off(); 424 machine_power_off();
@@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
431 435
432 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); 436 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
433 system_state = SYSTEM_RESTART; 437 system_state = SYSTEM_RESTART;
438 device_suspend(PMSG_FREEZE);
434 device_shutdown(); 439 device_shutdown();
435 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); 440 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
436 machine_restart(buffer); 441 machine_restart(buffer);
437 break; 442 break;
438 443
444#ifdef CONFIG_KEXEC
445 case LINUX_REBOOT_CMD_KEXEC:
446 {
447 struct kimage *image;
448 image = xchg(&kexec_image, 0);
449 if (!image) {
450 unlock_kernel();
451 return -EINVAL;
452 }
453 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
454 system_state = SYSTEM_RESTART;
455 device_shutdown();
456 printk(KERN_EMERG "Starting new kernel\n");
457 machine_shutdown();
458 machine_kexec(image);
459 break;
460 }
461#endif
439#ifdef CONFIG_SOFTWARE_SUSPEND 462#ifdef CONFIG_SOFTWARE_SUSPEND
440 case LINUX_REBOOT_CMD_SW_SUSPEND: 463 case LINUX_REBOOT_CMD_SW_SUSPEND:
441 { 464 {
@@ -525,7 +548,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
525 } 548 }
526 if (new_egid != old_egid) 549 if (new_egid != old_egid)
527 { 550 {
528 current->mm->dumpable = 0; 551 current->mm->dumpable = suid_dumpable;
529 smp_wmb(); 552 smp_wmb();
530 } 553 }
531 if (rgid != (gid_t) -1 || 554 if (rgid != (gid_t) -1 ||
@@ -556,7 +579,7 @@ asmlinkage long sys_setgid(gid_t gid)
556 { 579 {
557 if(old_egid != gid) 580 if(old_egid != gid)
558 { 581 {
559 current->mm->dumpable=0; 582 current->mm->dumpable = suid_dumpable;
560 smp_wmb(); 583 smp_wmb();
561 } 584 }
562 current->gid = current->egid = current->sgid = current->fsgid = gid; 585 current->gid = current->egid = current->sgid = current->fsgid = gid;
@@ -565,7 +588,7 @@ asmlinkage long sys_setgid(gid_t gid)
565 { 588 {
566 if(old_egid != gid) 589 if(old_egid != gid)
567 { 590 {
568 current->mm->dumpable=0; 591 current->mm->dumpable = suid_dumpable;
569 smp_wmb(); 592 smp_wmb();
570 } 593 }
571 current->egid = current->fsgid = gid; 594 current->egid = current->fsgid = gid;
@@ -596,7 +619,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
596 619
597 if(dumpclear) 620 if(dumpclear)
598 { 621 {
599 current->mm->dumpable = 0; 622 current->mm->dumpable = suid_dumpable;
600 smp_wmb(); 623 smp_wmb();
601 } 624 }
602 current->uid = new_ruid; 625 current->uid = new_ruid;
@@ -653,7 +676,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
653 676
654 if (new_euid != old_euid) 677 if (new_euid != old_euid)
655 { 678 {
656 current->mm->dumpable=0; 679 current->mm->dumpable = suid_dumpable;
657 smp_wmb(); 680 smp_wmb();
658 } 681 }
659 current->fsuid = current->euid = new_euid; 682 current->fsuid = current->euid = new_euid;
@@ -703,7 +726,7 @@ asmlinkage long sys_setuid(uid_t uid)
703 726
704 if (old_euid != uid) 727 if (old_euid != uid)
705 { 728 {
706 current->mm->dumpable = 0; 729 current->mm->dumpable = suid_dumpable;
707 smp_wmb(); 730 smp_wmb();
708 } 731 }
709 current->fsuid = current->euid = uid; 732 current->fsuid = current->euid = uid;
@@ -748,7 +771,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
748 if (euid != (uid_t) -1) { 771 if (euid != (uid_t) -1) {
749 if (euid != current->euid) 772 if (euid != current->euid)
750 { 773 {
751 current->mm->dumpable = 0; 774 current->mm->dumpable = suid_dumpable;
752 smp_wmb(); 775 smp_wmb();
753 } 776 }
754 current->euid = euid; 777 current->euid = euid;
@@ -798,7 +821,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
798 if (egid != (gid_t) -1) { 821 if (egid != (gid_t) -1) {
799 if (egid != current->egid) 822 if (egid != current->egid)
800 { 823 {
801 current->mm->dumpable = 0; 824 current->mm->dumpable = suid_dumpable;
802 smp_wmb(); 825 smp_wmb();
803 } 826 }
804 current->egid = egid; 827 current->egid = egid;
@@ -845,7 +868,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
845 { 868 {
846 if (uid != old_fsuid) 869 if (uid != old_fsuid)
847 { 870 {
848 current->mm->dumpable = 0; 871 current->mm->dumpable = suid_dumpable;
849 smp_wmb(); 872 smp_wmb();
850 } 873 }
851 current->fsuid = uid; 874 current->fsuid = uid;
@@ -875,7 +898,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
875 { 898 {
876 if (gid != old_fsgid) 899 if (gid != old_fsgid)
877 { 900 {
878 current->mm->dumpable = 0; 901 current->mm->dumpable = suid_dumpable;
879 smp_wmb(); 902 smp_wmb();
880 } 903 }
881 current->fsgid = gid; 904 current->fsgid = gid;
@@ -894,35 +917,69 @@ asmlinkage long sys_times(struct tms __user * tbuf)
894 */ 917 */
895 if (tbuf) { 918 if (tbuf) {
896 struct tms tmp; 919 struct tms tmp;
897 struct task_struct *tsk = current;
898 struct task_struct *t;
899 cputime_t utime, stime, cutime, cstime; 920 cputime_t utime, stime, cutime, cstime;
900 921
901 read_lock(&tasklist_lock); 922#ifdef CONFIG_SMP
902 utime = tsk->signal->utime; 923 if (thread_group_empty(current)) {
903 stime = tsk->signal->stime; 924 /*
904 t = tsk; 925 * Single thread case without the use of any locks.
905 do { 926 *
906 utime = cputime_add(utime, t->utime); 927 * We may race with release_task if two threads are
 907 stime = cputime_add(stime, t->stime); 928 * executing. However, release_task first adds up the
908 t = next_thread(t); 929 * counters (__exit_signal) before removing the task
909 } while (t != tsk); 930 * from the process tasklist (__unhash_process).
910 931 * __exit_signal also acquires and releases the
911 /* 932 * siglock which results in the proper memory ordering
912 * While we have tasklist_lock read-locked, no dying thread 933 * so that the list modifications are always visible
913 * can be updating current->signal->[us]time. Instead, 934 * after the counters have been updated.
914 * we got their counts included in the live thread loop. 935 *
915 * However, another thread can come in right now and 936 * If the counters have been updated by the second thread
916 * do a wait call that updates current->signal->c[us]time. 937 * but the thread has not yet been removed from the list
917 * To make sure we always see that pair updated atomically, 938 * then the other branch will be executing which will
918 * we take the siglock around fetching them. 939 * block on tasklist_lock until the exit handling of the
919 */ 940 * other task is finished.
920 spin_lock_irq(&tsk->sighand->siglock); 941 *
921 cutime = tsk->signal->cutime; 942 * This also implies that the sighand->siglock cannot
922 cstime = tsk->signal->cstime; 943 * be held by another processor. So we can also
923 spin_unlock_irq(&tsk->sighand->siglock); 944 * skip acquiring that lock.
924 read_unlock(&tasklist_lock); 945 */
946 utime = cputime_add(current->signal->utime, current->utime);
947 stime = cputime_add(current->signal->utime, current->stime);
948 cutime = current->signal->cutime;
949 cstime = current->signal->cstime;
950 } else
951#endif
952 {
925 953
954 /* Process with multiple threads */
955 struct task_struct *tsk = current;
956 struct task_struct *t;
957
958 read_lock(&tasklist_lock);
959 utime = tsk->signal->utime;
960 stime = tsk->signal->stime;
961 t = tsk;
962 do {
963 utime = cputime_add(utime, t->utime);
964 stime = cputime_add(stime, t->stime);
965 t = next_thread(t);
966 } while (t != tsk);
967
968 /*
969 * While we have tasklist_lock read-locked, no dying thread
970 * can be updating current->signal->[us]time. Instead,
971 * we got their counts included in the live thread loop.
972 * However, another thread can come in right now and
973 * do a wait call that updates current->signal->c[us]time.
974 * To make sure we always see that pair updated atomically,
975 * we take the siglock around fetching them.
976 */
977 spin_lock_irq(&tsk->sighand->siglock);
978 cutime = tsk->signal->cutime;
979 cstime = tsk->signal->cstime;
980 spin_unlock_irq(&tsk->sighand->siglock);
981 read_unlock(&tasklist_lock);
982 }
926 tmp.tms_utime = cputime_to_clock_t(utime); 983 tmp.tms_utime = cputime_to_clock_t(utime);
927 tmp.tms_stime = cputime_to_clock_t(stime); 984 tmp.tms_stime = cputime_to_clock_t(stime);
928 tmp.tms_cutime = cputime_to_clock_t(cutime); 985 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1225,7 +1282,7 @@ static void groups_sort(struct group_info *group_info)
1225} 1282}
1226 1283
1227/* a simple bsearch */ 1284/* a simple bsearch */
1228static int groups_search(struct group_info *group_info, gid_t grp) 1285int groups_search(struct group_info *group_info, gid_t grp)
1229{ 1286{
1230 int left, right; 1287 int left, right;
1231 1288
@@ -1652,7 +1709,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1652 error = 1; 1709 error = 1;
1653 break; 1710 break;
1654 case PR_SET_DUMPABLE: 1711 case PR_SET_DUMPABLE:
1655 if (arg2 != 0 && arg2 != 1) { 1712 if (arg2 < 0 || arg2 > 2) {
1656 error = -EINVAL; 1713 error = -EINVAL;
1657 break; 1714 break;
1658 } 1715 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0dda70ed1f98..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 18cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 19cond_syscall(sys_swapon);
20cond_syscall(sys_swapoff); 20cond_syscall(sys_swapoff);
21cond_syscall(sys_kexec_load);
22cond_syscall(compat_sys_kexec_load);
21cond_syscall(sys_init_module); 23cond_syscall(sys_init_module);
22cond_syscall(sys_delete_module); 24cond_syscall(sys_delete_module);
23cond_syscall(sys_socketpair); 25cond_syscall(sys_socketpair);
@@ -77,6 +79,7 @@ cond_syscall(sys_request_key);
77cond_syscall(sys_keyctl); 79cond_syscall(sys_keyctl);
78cond_syscall(compat_sys_keyctl); 80cond_syscall(compat_sys_keyctl);
79cond_syscall(compat_sys_socketcall); 81cond_syscall(compat_sys_socketcall);
82cond_syscall(sys_set_zone_reclaim);
80 83
81/* arch-specific weak syscall entries */ 84/* arch-specific weak syscall entries */
82cond_syscall(sys_pciconfig_read); 85cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 701d12c63068..270ee7fadbd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio;
58extern int max_threads; 58extern int max_threads;
59extern int sysrq_enabled; 59extern int sysrq_enabled;
60extern int core_uses_pid; 60extern int core_uses_pid;
61extern int suid_dumpable;
61extern char core_pattern[]; 62extern char core_pattern[];
62extern int cad_pid; 63extern int cad_pid;
63extern int pid_max; 64extern int pid_max;
@@ -950,6 +951,14 @@ static ctl_table fs_table[] = {
950 .proc_handler = &proc_dointvec, 951 .proc_handler = &proc_dointvec,
951 }, 952 },
952#endif 953#endif
954 {
955 .ctl_name = KERN_SETUID_DUMPABLE,
956 .procname = "suid_dumpable",
957 .data = &suid_dumpable,
958 .maxlen = sizeof(int),
959 .mode = 0644,
960 .proc_handler = &proc_dointvec,
961 },
953 { .ctl_name = 0 } 962 { .ctl_name = 0 }
954}; 963};
955 964
@@ -991,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
991 int error = parse_table(name, nlen, oldval, oldlenp, 1000 int error = parse_table(name, nlen, oldval, oldlenp,
992 newval, newlen, head->ctl_table, 1001 newval, newlen, head->ctl_table,
993 &context); 1002 &context);
994 if (context) 1003 kfree(context);
995 kfree(context);
996 if (error != -ENOTDIR) 1004 if (error != -ENOTDIR)
997 return error; 1005 return error;
998 tmp = tmp->next; 1006 tmp = tmp->next;
diff --git a/kernel/timer.c b/kernel/timer.c
index 207aa4f0aa10..f2a11887a726 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec);
57#define TVN_MASK (TVN_SIZE - 1) 57#define TVN_MASK (TVN_SIZE - 1)
58#define TVR_MASK (TVR_SIZE - 1) 58#define TVR_MASK (TVR_SIZE - 1)
59 59
60struct timer_base_s {
61 spinlock_t lock;
62 struct timer_list *running_timer;
63};
64
60typedef struct tvec_s { 65typedef struct tvec_s {
61 struct list_head vec[TVN_SIZE]; 66 struct list_head vec[TVN_SIZE];
62} tvec_t; 67} tvec_t;
@@ -66,9 +71,8 @@ typedef struct tvec_root_s {
66} tvec_root_t; 71} tvec_root_t;
67 72
68struct tvec_t_base_s { 73struct tvec_t_base_s {
69 spinlock_t lock; 74 struct timer_base_s t_base;
70 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
71 struct timer_list *running_timer;
72 tvec_root_t tv1; 76 tvec_root_t tv1;
73 tvec_t tv2; 77 tvec_t tv2;
74 tvec_t tv3; 78 tvec_t tv3;
@@ -77,18 +81,16 @@ struct tvec_t_base_s {
77} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
78 82
79typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
84static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
80 85
81static inline void set_running_timer(tvec_base_t *base, 86static inline void set_running_timer(tvec_base_t *base,
82 struct timer_list *timer) 87 struct timer_list *timer)
83{ 88{
84#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
85 base->running_timer = timer; 90 base->t_base.running_timer = timer;
86#endif 91#endif
87} 92}
88 93
89/* Fake initialization */
90static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
91
92static void check_timer_failed(struct timer_list *timer) 94static void check_timer_failed(struct timer_list *timer)
93{ 95{
94 static int whine_count; 96 static int whine_count;
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer)
103 /* 105 /*
104 * Now fix it up 106 * Now fix it up
105 */ 107 */
106 spin_lock_init(&timer->lock);
107 timer->magic = TIMER_MAGIC; 108 timer->magic = TIMER_MAGIC;
108} 109}
109 110
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
156 list_add_tail(&timer->entry, vec); 157 list_add_tail(&timer->entry, vec);
157} 158}
158 159
160typedef struct timer_base_s timer_base_t;
161/*
162 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
163 * at compile time, and we need timer->base to lock the timer.
164 */
165timer_base_t __init_timer_base
166 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
167EXPORT_SYMBOL(__init_timer_base);
168
169/***
170 * init_timer - initialize a timer.
171 * @timer: the timer to be initialized
172 *
173 * init_timer() must be done to a timer prior calling *any* of the
174 * other timer functions.
175 */
176void fastcall init_timer(struct timer_list *timer)
177{
178 timer->entry.next = NULL;
179 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
180 timer->magic = TIMER_MAGIC;
181}
182EXPORT_SYMBOL(init_timer);
183
184static inline void detach_timer(struct timer_list *timer,
185 int clear_pending)
186{
187 struct list_head *entry = &timer->entry;
188
189 __list_del(entry->prev, entry->next);
190 if (clear_pending)
191 entry->next = NULL;
192 entry->prev = LIST_POISON2;
193}
194
195/*
196 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
197 * means that all timers which are tied to this base via timer->base are
198 * locked, and the base itself is locked too.
199 *
200 * So __run_timers/migrate_timers can safely modify all timers which could
201 * be found on ->tvX lists.
202 *
203 * When the timer's base is locked, and the timer removed from list, it is
204 * possible to set timer->base = NULL and drop the lock: the timer remains
205 * locked.
206 */
207static timer_base_t *lock_timer_base(struct timer_list *timer,
208 unsigned long *flags)
209{
210 timer_base_t *base;
211
212 for (;;) {
213 base = timer->base;
214 if (likely(base != NULL)) {
215 spin_lock_irqsave(&base->lock, *flags);
216 if (likely(base == timer->base))
217 return base;
218 /* The timer has migrated to another CPU */
219 spin_unlock_irqrestore(&base->lock, *flags);
220 }
221 cpu_relax();
222 }
223}
224
159int __mod_timer(struct timer_list *timer, unsigned long expires) 225int __mod_timer(struct timer_list *timer, unsigned long expires)
160{ 226{
161 tvec_base_t *old_base, *new_base; 227 timer_base_t *base;
228 tvec_base_t *new_base;
162 unsigned long flags; 229 unsigned long flags;
163 int ret = 0; 230 int ret = 0;
164 231
165 BUG_ON(!timer->function); 232 BUG_ON(!timer->function);
166
167 check_timer(timer); 233 check_timer(timer);
168 234
169 spin_lock_irqsave(&timer->lock, flags); 235 base = lock_timer_base(timer, &flags);
236
237 if (timer_pending(timer)) {
238 detach_timer(timer, 0);
239 ret = 1;
240 }
241
170 new_base = &__get_cpu_var(tvec_bases); 242 new_base = &__get_cpu_var(tvec_bases);
171repeat:
172 old_base = timer->base;
173 243
174 /* 244 if (base != &new_base->t_base) {
175 * Prevent deadlocks via ordering by old_base < new_base.
176 */
177 if (old_base && (new_base != old_base)) {
178 if (old_base < new_base) {
179 spin_lock(&new_base->lock);
180 spin_lock(&old_base->lock);
181 } else {
182 spin_lock(&old_base->lock);
183 spin_lock(&new_base->lock);
184 }
185 /* 245 /*
186 * The timer base might have been cancelled while we were 246 * We are trying to schedule the timer on the local CPU.
187 * trying to take the lock(s): 247 * However we can't change timer's base while it is running,
248 * otherwise del_timer_sync() can't detect that the timer's
249 * handler yet has not finished. This also guarantees that
250 * the timer is serialized wrt itself.
188 */ 251 */
189 if (timer->base != old_base) { 252 if (unlikely(base->running_timer == timer)) {
190 spin_unlock(&new_base->lock); 253 /* The timer remains on a former base */
191 spin_unlock(&old_base->lock); 254 new_base = container_of(base, tvec_base_t, t_base);
192 goto repeat; 255 } else {
193 } 256 /* See the comment in lock_timer_base() */
194 } else { 257 timer->base = NULL;
195 spin_lock(&new_base->lock); 258 spin_unlock(&base->lock);
196 if (timer->base != old_base) { 259 spin_lock(&new_base->t_base.lock);
197 spin_unlock(&new_base->lock); 260 timer->base = &new_base->t_base;
198 goto repeat;
199 } 261 }
200 } 262 }
201 263
202 /*
203 * Delete the previous timeout (if there was any), and install
204 * the new one:
205 */
206 if (old_base) {
207 list_del(&timer->entry);
208 ret = 1;
209 }
210 timer->expires = expires; 264 timer->expires = expires;
211 internal_add_timer(new_base, timer); 265 internal_add_timer(new_base, timer);
212 timer->base = new_base; 266 spin_unlock_irqrestore(&new_base->t_base.lock, flags);
213
214 if (old_base && (new_base != old_base))
215 spin_unlock(&old_base->lock);
216 spin_unlock(&new_base->lock);
217 spin_unlock_irqrestore(&timer->lock, flags);
218 267
219 return ret; 268 return ret;
220} 269}
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
232{ 281{
233 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 282 tvec_base_t *base = &per_cpu(tvec_bases, cpu);
234 unsigned long flags; 283 unsigned long flags;
235 284
236 BUG_ON(timer_pending(timer) || !timer->function); 285 BUG_ON(timer_pending(timer) || !timer->function);
237 286
238 check_timer(timer); 287 check_timer(timer);
239 288
240 spin_lock_irqsave(&base->lock, flags); 289 spin_lock_irqsave(&base->t_base.lock, flags);
290 timer->base = &base->t_base;
241 internal_add_timer(base, timer); 291 internal_add_timer(base, timer);
242 timer->base = base; 292 spin_unlock_irqrestore(&base->t_base.lock, flags);
243 spin_unlock_irqrestore(&base->lock, flags);
244} 293}
245 294
246 295
@@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer);
295 */ 344 */
296int del_timer(struct timer_list *timer) 345int del_timer(struct timer_list *timer)
297{ 346{
347 timer_base_t *base;
298 unsigned long flags; 348 unsigned long flags;
299 tvec_base_t *base; 349 int ret = 0;
300 350
301 check_timer(timer); 351 check_timer(timer);
302 352
303repeat: 353 if (timer_pending(timer)) {
304 base = timer->base; 354 base = lock_timer_base(timer, &flags);
305 if (!base) 355 if (timer_pending(timer)) {
306 return 0; 356 detach_timer(timer, 1);
307 spin_lock_irqsave(&base->lock, flags); 357 ret = 1;
308 if (base != timer->base) { 358 }
309 spin_unlock_irqrestore(&base->lock, flags); 359 spin_unlock_irqrestore(&base->lock, flags);
310 goto repeat;
311 } 360 }
312 list_del(&timer->entry);
313 /* Need to make sure that anybody who sees a NULL base also sees the list ops */
314 smp_wmb();
315 timer->base = NULL;
316 spin_unlock_irqrestore(&base->lock, flags);
317 361
318 return 1; 362 return ret;
319} 363}
320 364
321EXPORT_SYMBOL(del_timer); 365EXPORT_SYMBOL(del_timer);
322 366
323#ifdef CONFIG_SMP 367#ifdef CONFIG_SMP
324/*** 368/*
325 * del_timer_sync - deactivate a timer and wait for the handler to finish. 369 * This function tries to deactivate a timer. Upon successful (ret >= 0)
326 * @timer: the timer to be deactivated 370 * exit the timer is not queued and the handler is not running on any CPU.
327 *
328 * This function only differs from del_timer() on SMP: besides deactivating
329 * the timer it also makes sure the handler has finished executing on other
330 * CPUs.
331 *
332 * Synchronization rules: callers must prevent restarting of the timer,
333 * otherwise this function is meaningless. It must not be called from
334 * interrupt contexts. The caller must not hold locks which would prevent
335 * completion of the timer's handler. Upon exit the timer is not queued and
336 * the handler is not running on any CPU.
337 *
338 * The function returns whether it has deactivated a pending timer or not.
339 * 371 *
340 * del_timer_sync() is slow and complicated because it copes with timer 372 * It must not be called from interrupt contexts.
341 * handlers which re-arm the timer (periodic timers). If the timer handler
342 * is known to not do this (a single shot timer) then use
343 * del_singleshot_timer_sync() instead.
344 */ 373 */
345int del_timer_sync(struct timer_list *timer) 374int try_to_del_timer_sync(struct timer_list *timer)
346{ 375{
347 tvec_base_t *base; 376 timer_base_t *base;
348 int i, ret = 0; 377 unsigned long flags;
378 int ret = -1;
349 379
350 check_timer(timer); 380 base = lock_timer_base(timer, &flags);
351 381
352del_again: 382 if (base->running_timer == timer)
353 ret += del_timer(timer); 383 goto out;
354 384
355 for_each_online_cpu(i) { 385 ret = 0;
356 base = &per_cpu(tvec_bases, i); 386 if (timer_pending(timer)) {
357 if (base->running_timer == timer) { 387 detach_timer(timer, 1);
358 while (base->running_timer == timer) { 388 ret = 1;
359 cpu_relax();
360 preempt_check_resched();
361 }
362 break;
363 }
364 } 389 }
365 smp_rmb(); 390out:
366 if (timer_pending(timer)) 391 spin_unlock_irqrestore(&base->lock, flags);
367 goto del_again;
368 392
369 return ret; 393 return ret;
370} 394}
371EXPORT_SYMBOL(del_timer_sync);
372 395
373/*** 396/***
374 * del_singleshot_timer_sync - deactivate a non-recursive timer 397 * del_timer_sync - deactivate a timer and wait for the handler to finish.
375 * @timer: the timer to be deactivated 398 * @timer: the timer to be deactivated
376 * 399 *
377 * This function is an optimization of del_timer_sync for the case where the 400 * This function only differs from del_timer() on SMP: besides deactivating
378 * caller can guarantee the timer does not reschedule itself in its timer 401 * the timer it also makes sure the handler has finished executing on other
379 * function. 402 * CPUs.
380 * 403 *
381 * Synchronization rules: callers must prevent restarting of the timer, 404 * Synchronization rules: callers must prevent restarting of the timer,
382 * otherwise this function is meaningless. It must not be called from 405 * otherwise this function is meaningless. It must not be called from
383 * interrupt contexts. The caller must not hold locks which wold prevent 406 * interrupt contexts. The caller must not hold locks which would prevent
384 * completion of the timer's handler. Upon exit the timer is not queued and 407 * completion of the timer's handler. The timer's handler must not call
385 * the handler is not running on any CPU. 408 * add_timer_on(). Upon exit the timer is not queued and the handler is
409 * not running on any CPU.
386 * 410 *
387 * The function returns whether it has deactivated a pending timer or not. 411 * The function returns whether it has deactivated a pending timer or not.
388 */ 412 */
389int del_singleshot_timer_sync(struct timer_list *timer) 413int del_timer_sync(struct timer_list *timer)
390{ 414{
391 int ret = del_timer(timer); 415 check_timer(timer);
392 416
393 if (!ret) { 417 for (;;) {
394 ret = del_timer_sync(timer); 418 int ret = try_to_del_timer_sync(timer);
395 BUG_ON(ret); 419 if (ret >= 0)
420 return ret;
396 } 421 }
397
398 return ret;
399} 422}
400EXPORT_SYMBOL(del_singleshot_timer_sync); 423
424EXPORT_SYMBOL(del_timer_sync);
401#endif 425#endif
402 426
403static int cascade(tvec_base_t *base, tvec_t *tv, int index) 427static int cascade(tvec_base_t *base, tvec_t *tv, int index)
@@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
415 struct timer_list *tmp; 439 struct timer_list *tmp;
416 440
417 tmp = list_entry(curr, struct timer_list, entry); 441 tmp = list_entry(curr, struct timer_list, entry);
418 BUG_ON(tmp->base != base); 442 BUG_ON(tmp->base != &base->t_base);
419 curr = curr->next; 443 curr = curr->next;
420 internal_add_timer(base, tmp); 444 internal_add_timer(base, tmp);
421 } 445 }
@@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base)
437{ 461{
438 struct timer_list *timer; 462 struct timer_list *timer;
439 463
440 spin_lock_irq(&base->lock); 464 spin_lock_irq(&base->t_base.lock);
441 while (time_after_eq(jiffies, base->timer_jiffies)) { 465 while (time_after_eq(jiffies, base->timer_jiffies)) {
442 struct list_head work_list = LIST_HEAD_INIT(work_list); 466 struct list_head work_list = LIST_HEAD_INIT(work_list);
443 struct list_head *head = &work_list; 467 struct list_head *head = &work_list;
@@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base)
453 cascade(base, &base->tv5, INDEX(3)); 477 cascade(base, &base->tv5, INDEX(3));
454 ++base->timer_jiffies; 478 ++base->timer_jiffies;
455 list_splice_init(base->tv1.vec + index, &work_list); 479 list_splice_init(base->tv1.vec + index, &work_list);
456repeat: 480 while (!list_empty(head)) {
457 if (!list_empty(head)) {
458 void (*fn)(unsigned long); 481 void (*fn)(unsigned long);
459 unsigned long data; 482 unsigned long data;
460 483
@@ -462,25 +485,26 @@ repeat:
462 fn = timer->function; 485 fn = timer->function;
463 data = timer->data; 486 data = timer->data;
464 487
465 list_del(&timer->entry);
466 set_running_timer(base, timer); 488 set_running_timer(base, timer);
467 smp_wmb(); 489 detach_timer(timer, 1);
468 timer->base = NULL; 490 spin_unlock_irq(&base->t_base.lock);
469 spin_unlock_irq(&base->lock);
470 { 491 {
471 u32 preempt_count = preempt_count(); 492 int preempt_count = preempt_count();
472 fn(data); 493 fn(data);
473 if (preempt_count != preempt_count()) { 494 if (preempt_count != preempt_count()) {
474 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); 495 printk(KERN_WARNING "huh, entered %p "
496 "with preempt_count %08x, exited"
497 " with %08x?\n",
498 fn, preempt_count,
499 preempt_count());
475 BUG(); 500 BUG();
476 } 501 }
477 } 502 }
478 spin_lock_irq(&base->lock); 503 spin_lock_irq(&base->t_base.lock);
479 goto repeat;
480 } 504 }
481 } 505 }
482 set_running_timer(base, NULL); 506 set_running_timer(base, NULL);
483 spin_unlock_irq(&base->lock); 507 spin_unlock_irq(&base->t_base.lock);
484} 508}
485 509
486#ifdef CONFIG_NO_IDLE_HZ 510#ifdef CONFIG_NO_IDLE_HZ
@@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void)
499 int i, j; 523 int i, j;
500 524
501 base = &__get_cpu_var(tvec_bases); 525 base = &__get_cpu_var(tvec_bases);
502 spin_lock(&base->lock); 526 spin_lock(&base->t_base.lock);
503 expires = base->timer_jiffies + (LONG_MAX >> 1); 527 expires = base->timer_jiffies + (LONG_MAX >> 1);
504 list = 0; 528 list = 0;
505 529
@@ -547,7 +571,7 @@ found:
547 expires = nte->expires; 571 expires = nte->expires;
548 } 572 }
549 } 573 }
550 spin_unlock(&base->lock); 574 spin_unlock(&base->t_base.lock);
551 return expires; 575 return expires;
552} 576}
553#endif 577#endif
@@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu)
1286{ 1310{
1287 int j; 1311 int j;
1288 tvec_base_t *base; 1312 tvec_base_t *base;
1289 1313
1290 base = &per_cpu(tvec_bases, cpu); 1314 base = &per_cpu(tvec_bases, cpu);
1291 spin_lock_init(&base->lock); 1315 spin_lock_init(&base->t_base.lock);
1292 for (j = 0; j < TVN_SIZE; j++) { 1316 for (j = 0; j < TVN_SIZE; j++) {
1293 INIT_LIST_HEAD(base->tv5.vec + j); 1317 INIT_LIST_HEAD(base->tv5.vec + j);
1294 INIT_LIST_HEAD(base->tv4.vec + j); 1318 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu)
1302} 1326}
1303 1327
1304#ifdef CONFIG_HOTPLUG_CPU 1328#ifdef CONFIG_HOTPLUG_CPU
1305static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1329static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1306{ 1330{
1307 struct timer_list *timer; 1331 struct timer_list *timer;
1308 1332
1309 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1310 timer = list_entry(head->next, struct timer_list, entry); 1334 timer = list_entry(head->next, struct timer_list, entry);
1311 /* We're locking backwards from __mod_timer order here, 1335 detach_timer(timer, 0);
1312 beware deadlock. */ 1336 timer->base = &new_base->t_base;
1313 if (!spin_trylock(&timer->lock))
1314 return 0;
1315 list_del(&timer->entry);
1316 internal_add_timer(new_base, timer); 1337 internal_add_timer(new_base, timer);
1317 timer->base = new_base;
1318 spin_unlock(&timer->lock);
1319 } 1338 }
1320 return 1;
1321} 1339}
1322 1340
1323static void __devinit migrate_timers(int cpu) 1341static void __devinit migrate_timers(int cpu)
@@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu)
1331 new_base = &get_cpu_var(tvec_bases); 1349 new_base = &get_cpu_var(tvec_bases);
1332 1350
1333 local_irq_disable(); 1351 local_irq_disable();
1334again: 1352 spin_lock(&new_base->t_base.lock);
1335 /* Prevent deadlocks via ordering by old_base < new_base. */ 1353 spin_lock(&old_base->t_base.lock);
1336 if (old_base < new_base) {
1337 spin_lock(&new_base->lock);
1338 spin_lock(&old_base->lock);
1339 } else {
1340 spin_lock(&old_base->lock);
1341 spin_lock(&new_base->lock);
1342 }
1343 1354
1344 if (old_base->running_timer) 1355 if (old_base->t_base.running_timer)
1345 BUG(); 1356 BUG();
1346 for (i = 0; i < TVR_SIZE; i++) 1357 for (i = 0; i < TVR_SIZE; i++)
1347 if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) 1358 migrate_timer_list(new_base, old_base->tv1.vec + i);
1348 goto unlock_again; 1359 for (i = 0; i < TVN_SIZE; i++) {
1349 for (i = 0; i < TVN_SIZE; i++) 1360 migrate_timer_list(new_base, old_base->tv2.vec + i);
1350 if (!migrate_timer_list(new_base, old_base->tv2.vec + i) 1361 migrate_timer_list(new_base, old_base->tv3.vec + i);
1351 || !migrate_timer_list(new_base, old_base->tv3.vec + i) 1362 migrate_timer_list(new_base, old_base->tv4.vec + i);
1352 || !migrate_timer_list(new_base, old_base->tv4.vec + i) 1363 migrate_timer_list(new_base, old_base->tv5.vec + i);
1353 || !migrate_timer_list(new_base, old_base->tv5.vec + i)) 1364 }
1354 goto unlock_again; 1365
1355 spin_unlock(&old_base->lock); 1366 spin_unlock(&old_base->t_base.lock);
1356 spin_unlock(&new_base->lock); 1367 spin_unlock(&new_base->t_base.lock);
1357 local_irq_enable(); 1368 local_irq_enable();
1358 put_cpu_var(tvec_bases); 1369 put_cpu_var(tvec_bases);
1359 return;
1360
1361unlock_again:
1362 /* Avoid deadlock with __mod_timer, by backing off. */
1363 spin_unlock(&old_base->lock);
1364 spin_unlock(&new_base->lock);
1365 cpu_relax();
1366 goto again;
1367} 1370}
1368#endif /* CONFIG_HOTPLUG_CPU */ 1371#endif /* CONFIG_HOTPLUG_CPU */
1369 1372
@@ -1594,7 +1597,7 @@ void msleep(unsigned int msecs)
1594EXPORT_SYMBOL(msleep); 1597EXPORT_SYMBOL(msleep);
1595 1598
1596/** 1599/**
1597 * msleep_interruptible - sleep waiting for waitqueue interruptions 1600 * msleep_interruptible - sleep waiting for signals
1598 * @msecs: Time in milliseconds to sleep for 1601 * @msecs: Time in milliseconds to sleep for
1599 */ 1602 */
1600unsigned long msleep_interruptible(unsigned int msecs) 1603unsigned long msleep_interruptible(unsigned int msecs)