Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            |    2
-rw-r--r--  kernel/acct.c              |   45
-rw-r--r--  kernel/audit.c             |  128
-rw-r--r--  kernel/auditsc.c           |  327
-rw-r--r--  kernel/compat.c            |    9
-rw-r--r--  kernel/cpuset.c            |  229
-rw-r--r--  kernel/exit.c              |   35
-rw-r--r--  kernel/fork.c              |  106
-rw-r--r--  kernel/futex.c             |  137
-rw-r--r--  kernel/intermodule.c       |    3
-rw-r--r--  kernel/irq/handle.c        |    2
-rw-r--r--  kernel/irq/manage.c        |    4
-rw-r--r--  kernel/irq/proc.c          |   14
-rw-r--r--  kernel/kfifo.c             |    4
-rw-r--r--  kernel/kprobes.c           |   94
-rw-r--r--  kernel/module.c            |   44
-rw-r--r--  kernel/params.c            |   14
-rw-r--r--  kernel/posix-cpu-timers.c  |   91
-rw-r--r--  kernel/posix-timers.c      |   30
-rw-r--r--  kernel/power/Kconfig       |    3
-rw-r--r--  kernel/power/disk.c        |    6
-rw-r--r--  kernel/power/pm.c          |    3
-rw-r--r--  kernel/power/power.h       |    7
-rw-r--r--  kernel/power/swsusp.c      |   37
-rw-r--r--  kernel/printk.c            |   20
-rw-r--r--  kernel/ptrace.c            |   41
-rw-r--r--  kernel/rcupdate.c          |   27
-rw-r--r--  kernel/resource.c          |    3
-rw-r--r--  kernel/sched.c             |  618
-rw-r--r--  kernel/signal.c            |  173
-rw-r--r--  kernel/softirq.c           |    2
-rw-r--r--  kernel/softlockup.c        |  151
-rw-r--r--  kernel/spinlock.c          |   15
-rw-r--r--  kernel/sys.c               |   61
-rw-r--r--  kernel/time.c              |    1
-rw-r--r--  kernel/timer.c             |   55
-rw-r--r--  kernel/workqueue.c         |    5
37 files changed, 1866 insertions(+), 680 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..ff4dc02ce170 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..b756f527497e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
 }
 
 /*
- * Close the old accouting file (if currently open) and then replace
+ * Close the old accounting file (if currently open) and then replace
  * it with file (if non-NULL).
  *
  * NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
 }
 }
 
-/*
- * sys_acct() is the only system call needed to implement process
- * accounting. It takes the name of the file where accounting records
- * should be written. If the filename is NULL, accounting will be
- * shutdown.
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
  */
 asmlinkage long sys_acct(const char __user *name)
 {
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name)
 return (PTR_ERR(tmp));
 }
 /* Difference from BSD - they don't do O_APPEND */
- file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+ file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 putname(tmp);
 if (IS_ERR(file)) {
 return (PTR_ERR(file));
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
 return (0);
 }
 
-/*
- * If the accouting is turned on for a file in the filesystem pointed
- * to by sb, turn accouting off.
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
  */
 void acct_auto_close(struct super_block *sb)
 {
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
 set_fs(fs);
 }
 
-/*
+/**
  * acct_process - now just a wrapper around do_acct_process
+ * @exitcode: task exit code
+ *
+ * handles process accounting for an exiting task
  */
 void acct_process(long exitcode)
 {
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
 }
 
 
-/*
- * acct_update_integrals
- * - update mm integral fields in task_struct
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
  */
 void acct_update_integrals(struct task_struct *tsk)
 {
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk)
 }
 }
 
-/*
- * acct_clear_integrals
- * - clear the mm integral fields in task_struct
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
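The acct.c hunks above mostly convert plain block comments into kernel-doc. For reference, a minimal sketch of the kernel-doc layout those hunks follow; the function here is hypothetical and only illustrates the format, it is not part of the patch:

/**
 * example_enable - switch the example feature on or off
 * @name: file to write records to, or NULL to disable the feature
 *
 * Returns 0 for success or a negative errno value for failure.
 *
 * The short one-line summary comes first, then one line per parameter,
 * then the longer free-form description, exactly as in the sys_acct()
 * and acct_auto_close() comments above.
 */
static int example_enable(const char *name);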
diff --git a/kernel/audit.c b/kernel/audit.c
index 7f0699790d46..aefa73a8a586 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -79,6 +79,8 @@ static int audit_rate_limit;
 
 /* Number of outstanding audit_buffers allowed. */
 static int audit_backlog_limit = 64;
+static int audit_backlog_wait_time = 60 * HZ;
+static int audit_backlog_wait_overflow = 0;
 
 /* The identity of the user shutting down the audit system. */
 uid_t audit_sig_uid = -1;
@@ -106,18 +108,12 @@ static LIST_HEAD(audit_freelist);
 static struct sk_buff_head audit_skb_queue;
 static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
-
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
 /* The netlink socket is only to be read by 1 CPU, which lets us assume
  * that list additions and deletions never happen simultaneously in
  * auditsc.c */
-static DECLARE_MUTEX(audit_netlink_sem);
+DECLARE_MUTEX(audit_netlink_sem);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -137,6 +133,7 @@ struct audit_buffer {
 struct list_head list;
 struct sk_buff *skb; /* formatted skb ready to send */
 struct audit_context *ctx; /* NULL or associated context */
+ int gfp_mask;
 };
 
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
@@ -145,11 +142,6 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 nlh->nlmsg_pid = pid;
 }
 
-struct audit_entry {
- struct list_head list;
- struct audit_rule rule;
-};
-
 static void audit_panic(const char *message)
 {
 switch (audit_failure)
@@ -233,7 +225,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
 {
 int old = audit_rate_limit;
 audit_rate_limit = limit;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_rate_limit=%d old=%d by auid=%u",
 audit_rate_limit, old, loginuid);
 return old;
@@ -243,7 +235,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
 {
 int old = audit_backlog_limit;
 audit_backlog_limit = limit;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_backlog_limit=%d old=%d by auid=%u",
 audit_backlog_limit, old, loginuid);
 return old;
@@ -255,7 +247,7 @@ static int audit_set_enabled(int state, uid_t loginuid)
 if (state != 0 && state != 1)
 return -EINVAL;
 audit_enabled = state;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_enabled=%d old=%d by auid=%u",
 audit_enabled, old, loginuid);
 return old;
@@ -269,7 +261,7 @@ static int audit_set_failure(int state, uid_t loginuid)
 && state != AUDIT_FAIL_PANIC)
 return -EINVAL;
 audit_failure = state;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_failure=%d old=%d by auid=%u",
 audit_failure, old, loginuid);
 return old;
@@ -281,6 +273,7 @@ int kauditd_thread(void *dummy)
 
 while (1) {
 skb = skb_dequeue(&audit_skb_queue);
+ wake_up(&audit_backlog_wait);
 if (skb) {
 if (audit_pid) {
 int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
@@ -290,7 +283,7 @@ int kauditd_thread(void *dummy)
 audit_pid = 0;
 }
 } else {
- printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
+ printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
 kfree_skb(skb);
 }
 } else {
@@ -423,7 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 if (status_get->mask & AUDIT_STATUS_PID) {
 int old = audit_pid;
 audit_pid = status_get->pid;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_pid=%d old=%d by auid=%u",
 audit_pid, old, loginuid);
 }
@@ -435,15 +428,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 break;
 case AUDIT_USER:
 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- ab = audit_log_start(NULL, msg_type);
- if (!ab)
- break; /* audit_panic has been called */
- audit_log_format(ab,
- "user pid=%d uid=%u auid=%u"
- " msg='%.1024s'",
- pid, uid, loginuid, (char *)data);
- audit_set_pid(ab, pid);
- audit_log_end(ab);
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+ return 0;
+
+ err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+ if (err == 1) {
+ err = 0;
+ ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ if (ab) {
+ audit_log_format(ab,
+ "user pid=%d uid=%u auid=%u msg='%.1024s'",
+ pid, uid, loginuid, (char *)data);
+ audit_set_pid(ab, pid);
+ audit_log_end(ab);
+ }
+ }
 break;
 case AUDIT_ADD:
 case AUDIT_DEL:
@@ -523,7 +522,7 @@ static int __init audit_init(void)
 skb_queue_head_init(&audit_skb_queue);
 audit_initialized = 1;
 audit_enabled = audit_default;
- audit_log(NULL, AUDIT_KERNEL, "initialized");
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 return 0;
 }
 __initcall(audit_init);
@@ -561,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
 }
 
 static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
- int gfp_mask, int type)
+ gfp_t gfp_mask, int type)
 {
 unsigned long flags;
 struct audit_buffer *ab = NULL;
@@ -587,6 +586,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 goto err;
 
 ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
 nlh->nlmsg_type = type;
 nlh->nlmsg_flags = 0;
@@ -606,26 +606,27 @@ err:
  * (timestamp,serial) tuple is unique for each syscall and is live from
  * syscall entry to syscall exit.
  *
- * Atomic values are only guaranteed to be 24-bit, so we count down.
- *
  * NOTE: Another possibility is to store the formatted records off the
  * audit context (for those records that have a context), and emit them
  * all at syscall exit. However, this could delay the reporting of
  * significant errors until syscall exit (or never, if the system
  * halts). */
+
 unsigned int audit_serial(void)
 {
- static atomic_t serial = ATOMIC_INIT(0xffffff);
- unsigned int a, b;
+ static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+ static unsigned int serial = 0;
+
+ unsigned long flags;
+ unsigned int ret;
 
+ spin_lock_irqsave(&serial_lock, flags);
 do {
- a = atomic_read(&serial);
- if (atomic_dec_and_test(&serial))
- atomic_set(&serial, 0xffffff);
- b = atomic_read(&serial);
- } while (b != a - 1);
+ ret = ++serial;
+ } while (unlikely(!ret));
+ spin_unlock_irqrestore(&serial_lock, flags);
 
- return 0xffffff - b;
+ return ret;
 }
 
 static inline void audit_get_stamp(struct audit_context *ctx,
@@ -645,17 +646,43 @@ static inline void audit_get_stamp(struct audit_context *ctx,
  * syscall, then the syscall is marked as auditable and an audit record
  * will be written at syscall exit. If there is no associated task, tsk
  * should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
+
+struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask,
+ int type)
 {
 struct audit_buffer *ab = NULL;
 struct timespec t;
 unsigned int serial;
+ int reserve;
+ unsigned long timeout_start = jiffies;
 
 if (!audit_initialized)
 return NULL;
 
- if (audit_backlog_limit
- && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+ if (gfp_mask & __GFP_WAIT)
+ reserve = 0;
+ else
+ reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
+
+ while (audit_backlog_limit
+ && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
+ && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+
+ /* Wait for auditd to drain the queue a little */
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ continue;
+ }
 if (audit_rate_check())
 printk(KERN_WARNING
 "audit: audit_backlog=%d > "
@@ -663,10 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
 skb_queue_len(&audit_skb_queue),
 audit_backlog_limit);
 audit_log_lost("backlog limit exceeded");
+ audit_backlog_wait_time = audit_backlog_wait_overflow;
+ wake_up(&audit_backlog_wait);
 return NULL;
 }
 
- ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
+ ab = audit_buffer_alloc(ctx, gfp_mask, type);
 if (!ab) {
 audit_log_lost("out of memory in audit_log_start");
 return NULL;
@@ -690,7 +719,7 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
 {
 struct sk_buff *skb = ab->skb;
 int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
- GFP_ATOMIC);
+ ab->gfp_mask);
 if (ret < 0) {
 audit_log_lost("out of memory in audit_expand");
 return 0;
@@ -809,7 +838,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 audit_log_format(ab, " %s", prefix);
 
 /* We will allow 11 spaces for ' (deleted)' to be appended */
- path = kmalloc(PATH_MAX+11, GFP_KERNEL);
+ path = kmalloc(PATH_MAX+11, ab->gfp_mask);
 if (!path) {
 audit_log_format(ab, "<no memory>");
 return;
@@ -841,7 +870,7 @@ void audit_log_end(struct audit_buffer *ab)
 ab->skb = NULL;
 wake_up_interruptible(&kauditd_wait);
 } else {
- printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
+ printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0));
 }
 }
 audit_buffer_free(ab);
@@ -850,12 +879,13 @@ void audit_log_end(struct audit_buffer *ab)
 /* Log an audit record. This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end. It may be
  * called in any context. */
-void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
+void audit_log(struct audit_context *ctx, int gfp_mask, int type,
+ const char *fmt, ...)
 {
 struct audit_buffer *ab;
 va_list args;
 
- ab = audit_log_start(ctx, type);
+ ab = audit_log_start(ctx, gfp_mask, type);
 if (ab) {
 va_start(args, fmt);
 audit_log_vformat(ab, fmt, args);
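The audit.c hunks above replace the 24-bit atomic count-down in audit_serial() with a spinlock-protected counter that wraps but never hands out zero (zero now means "no serial assigned yet"; see the context->serial change in auditsc.c below). A condensed, self-contained sketch of that pattern, using illustrative names rather than the patch's statics:

#include <linux/spinlock.h>
#include <linux/compiler.h>

static spinlock_t example_serial_lock = SPIN_LOCK_UNLOCKED;
static unsigned int example_serial;

unsigned int example_next_serial(void)
{
	unsigned long flags;
	unsigned int ret;

	spin_lock_irqsave(&example_serial_lock, flags);
	do {
		ret = ++example_serial;		/* wraps to 0 after 0xffffffff... */
	} while (unlikely(!ret));		/* ...so skip 0, which means "unset" */
	spin_unlock_irqrestore(&example_serial_lock, flags);

	return ret;
}

A spinlock is used instead of an atomic_t because, as the removed comment noted, atomic values were only guaranteed to be 24 bits wide on some architectures; the lock gives the counter the full 32-bit range.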
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e75f84e1a1a0..88696f639aab 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -39,6 +39,9 @@
 #include <linux/audit.h>
 #include <linux/personality.h>
 #include <linux/time.h>
+#include <linux/kthread.h>
+#include <linux/netlink.h>
+#include <linux/compiler.h>
 #include <asm/unistd.h>
 
 /* 0 = no checking
@@ -95,6 +98,7 @@ struct audit_names {
 uid_t uid;
 gid_t gid;
 dev_t rdev;
+ unsigned flags;
 };
 
 struct audit_aux_data {
@@ -167,9 +171,16 @@ struct audit_context {
 /* There are three lists of rules -- one to search at task creation
  * time, one to search at syscall entry time, and another to search at
  * syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+ LIST_HEAD_INIT(audit_filter_list[0]),
+ LIST_HEAD_INIT(audit_filter_list[1]),
+ LIST_HEAD_INIT(audit_filter_list[2]),
+ LIST_HEAD_INIT(audit_filter_list[3]),
+ LIST_HEAD_INIT(audit_filter_list[4]),
+#if AUDIT_NR_FILTERS != 5
+#error Fix audit_filter_list initialiser
+#endif
+};
 
 struct audit_entry {
 struct list_head list;
@@ -179,9 +190,36 @@ struct audit_entry {
 
 extern int audit_pid;
 
+/* Copy rule from user-space to kernel-space. Called from
+ * audit_add_rule during AUDIT_ADD. */
+static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+ int i;
+
+ if (s->action != AUDIT_NEVER
+ && s->action != AUDIT_POSSIBLE
+ && s->action != AUDIT_ALWAYS)
+ return -1;
+ if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+ return -1;
+ if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
+ return -1;
+
+ d->flags = s->flags;
+ d->action = s->action;
+ d->field_count = s->field_count;
+ for (i = 0; i < d->field_count; i++) {
+ d->fields[i] = s->fields[i];
+ d->values[i] = s->values[i];
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ return 0;
+}
+
 /* Check to see if two rules are identical. It is called from
+ * audit_add_rule during AUDIT_ADD and
  * audit_del_rule during AUDIT_DEL. */
-static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 {
 int i;
 
@@ -210,19 +248,37 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 /* Note that audit_add_rule and audit_del_rule are called via
  * audit_receive() in audit.c, and are protected by
  * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_entry *entry,
+static inline int audit_add_rule(struct audit_rule *rule,
 struct list_head *list)
 {
- if (entry->rule.flags & AUDIT_PREPEND) {
- entry->rule.flags &= ~AUDIT_PREPEND;
+ struct audit_entry *entry;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * addition routine. */
+ list_for_each_entry(entry, list, list) {
+ if (!audit_compare_rule(rule, &entry->rule)) {
+ return -EEXIST;
+ }
+ }
+
+ if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+ return -ENOMEM;
+ if (audit_copy_rule(&entry->rule, rule)) {
+ kfree(entry);
+ return -EINVAL;
+ }
+
+ if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+ entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
 list_add_rcu(&entry->list, list);
 } else {
 list_add_tail_rcu(&entry->list, list);
 }
+
 return 0;
 }
 
-static void audit_free_rule(struct rcu_head *head)
+static inline void audit_free_rule(struct rcu_head *head)
 {
 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
 kfree(e);
@@ -245,82 +301,82 @@ static inline int audit_del_rule(struct audit_rule *rule,
 return 0;
 }
 }
- return -EFAULT; /* No matching rule */
+ return -ENOENT; /* No matching rule */
 }
 
-/* Copy rule from user-space to kernel-space. Called during
- * AUDIT_ADD. */
-static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+static int audit_list_rules(void *_dest)
 {
+ int pid, seq;
+ int *dest = _dest;
+ struct audit_entry *entry;
 int i;
 
- if (s->action != AUDIT_NEVER
- && s->action != AUDIT_POSSIBLE
- && s->action != AUDIT_ALWAYS)
- return -1;
- if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
- return -1;
+ pid = dest[0];
+ seq = dest[1];
+ kfree(dest);
 
- d->flags = s->flags;
- d->action = s->action;
- d->field_count = s->field_count;
- for (i = 0; i < d->field_count; i++) {
- d->fields[i] = s->fields[i];
- d->values[i] = s->values[i];
+ down(&audit_netlink_sem);
+
+ /* The *_rcu iterators not needed here because we are
+ always called with audit_netlink_sem held. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(entry, &audit_filter_list[i], list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
 }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+
+ up(&audit_netlink_sem);
 return 0;
 }
 
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 uid_t loginuid)
 {
- u32 flags;
- struct audit_entry *entry;
+ struct task_struct *tsk;
+ int *dest;
 int err = 0;
+ unsigned listnr;
 
 switch (type) {
 case AUDIT_LIST:
- /* The *_rcu iterators not needed here because we are
- always called with audit_netlink_sem held. */
- list_for_each_entry(entry, &audit_tsklist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- list_for_each_entry(entry, &audit_entlist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- list_for_each_entry(entry, &audit_extlist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest[0] = pid;
+ dest[1] = seq;
+
+ tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+ if (IS_ERR(tsk)) {
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
 break;
 case AUDIT_ADD:
- if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
- return -ENOMEM;
- if (audit_copy_rule(&entry->rule, data)) {
- kfree(entry);
+ listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+ if (listnr >= AUDIT_NR_FILTERS)
 return -EINVAL;
- }
- flags = entry->rule.flags;
- if (!err && (flags & AUDIT_PER_TASK))
- err = audit_add_rule(entry, &audit_tsklist);
- if (!err && (flags & AUDIT_AT_ENTRY))
- err = audit_add_rule(entry, &audit_entlist);
- if (!err && (flags & AUDIT_AT_EXIT))
- err = audit_add_rule(entry, &audit_extlist);
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
- "auid=%u added an audit rule\n", loginuid);
+
+ err = audit_add_rule(data, &audit_filter_list[listnr]);
+ if (!err)
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u added an audit rule\n", loginuid);
 break;
 case AUDIT_DEL:
- flags =((struct audit_rule *)data)->flags;
- if (!err && (flags & AUDIT_PER_TASK))
- err = audit_del_rule(data, &audit_tsklist);
- if (!err && (flags & AUDIT_AT_ENTRY))
- err = audit_del_rule(data, &audit_entlist);
- if (!err && (flags & AUDIT_AT_EXIT))
- err = audit_del_rule(data, &audit_extlist);
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
- "auid=%u removed an audit rule\n", loginuid);
+ listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+ if (listnr >= AUDIT_NR_FILTERS)
+ return -EINVAL;
+
+ err = audit_del_rule(data, &audit_filter_list[listnr]);
+ if (!err)
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u removed an audit rule\n", loginuid);
 break;
 default:
 return -EINVAL;
@@ -384,8 +440,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 result = (ctx->return_code == value);
 break;
 case AUDIT_SUCCESS:
- if (ctx && ctx->return_valid)
- result = (ctx->return_valid == AUDITSC_SUCCESS);
+ if (ctx && ctx->return_valid) {
+ if (value)
+ result = (ctx->return_valid == AUDITSC_SUCCESS);
+ else
+ result = (ctx->return_valid == AUDITSC_FAILURE);
+ }
 break;
 case AUDIT_DEVMAJOR:
 if (ctx) {
@@ -454,7 +514,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
 enum audit_state state;
 
 rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_tsklist, list) {
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
 rcu_read_unlock();
 return state;
@@ -474,20 +534,84 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 struct list_head *list)
 {
 struct audit_entry *e;
+ enum audit_state state;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return AUDIT_DISABLED;
+
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit
+ && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+ struct audit_rule *rule,
+ enum audit_state *state)
+{
+ int i;
+
+ for (i = 0; i < rule->field_count; i++) {
+ u32 field = rule->fields[i] & ~AUDIT_NEGATE;
+ u32 value = rule->values[i];
+ int result = 0;
+
+ switch (field) {
+ case AUDIT_PID:
+ result = (cb->creds.pid == value);
+ break;
+ case AUDIT_UID:
+ result = (cb->creds.uid == value);
+ break;
+ case AUDIT_GID:
+ result = (cb->creds.gid == value);
+ break;
+ case AUDIT_LOGINUID:
+ result = (cb->loginuid == value);
+ break;
+ }
+
+ if (rule->fields[i] & AUDIT_NEGATE)
+ result = !result;
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb, int type)
+{
+ struct audit_entry *e;
 enum audit_state state;
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
+ int ret = 1;
 
 rcu_read_lock();
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit
- && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
- rcu_read_unlock();
- return state;
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+ if (audit_filter_user_rules(cb, &e->rule, &state)) {
+ if (state == AUDIT_DISABLED)
+ ret = 0;
+ break;
 }
 }
 rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+
+ return ret; /* Audit by default */
 }
 
 /* This should be called with task_lock() held. */
@@ -504,7 +628,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 
 if (context->in_syscall && !context->auditable) {
 enum audit_state state;
- state = audit_filter_syscall(tsk, context, &audit_extlist);
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
 if (state == AUDIT_RECORD_CONTEXT)
 context->auditable = 1;
 }
@@ -679,13 +803,13 @@ static void audit_log_task_info(struct audit_buffer *ab)
 up_read(&mm->mmap_sem);
 }
 
-static void audit_log_exit(struct audit_context *context)
+static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask)
 {
 int i;
 struct audit_buffer *ab;
 struct audit_aux_data *aux;
 
- ab = audit_log_start(context, AUDIT_SYSCALL);
+ ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL);
 if (!ab)
 return; /* audit_panic has been called */
 audit_log_format(ab, "arch=%x syscall=%d",
@@ -717,7 +841,7 @@ static void audit_log_exit(struct audit_context *context)
 
 for (aux = context->aux; aux; aux = aux->next) {
 
- ab = audit_log_start(context, aux->type);
+ ab = audit_log_start(context, GFP_KERNEL, aux->type);
 if (!ab)
 continue; /* audit_panic has been called */
 
@@ -754,14 +878,14 @@ static void audit_log_exit(struct audit_context *context)
 }
 
 if (context->pwd && context->pwdmnt) {
- ab = audit_log_start(context, AUDIT_CWD);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 if (ab) {
 audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
 audit_log_end(ab);
 }
 }
 for (i = 0; i < context->name_count; i++) {
- ab = audit_log_start(context, AUDIT_PATH);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
 if (!ab)
 continue; /* audit_panic has been called */
 
@@ -770,6 +894,8 @@ static void audit_log_exit(struct audit_context *context)
 audit_log_format(ab, " name=");
 audit_log_untrustedstring(ab, context->names[i].name);
 }
+ audit_log_format(ab, " flags=%x\n", context->names[i].flags);
+
 if (context->names[i].ino != (unsigned long)-1)
 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
 " ouid=%u ogid=%u rdev=%02x:%02x",
@@ -799,9 +925,11 @@ void audit_free(struct task_struct *tsk)
 return;
 
 /* Check for system calls that do not go through the exit
- * function (e.g., exit_group), then free context block. */
- if (context->in_syscall && context->auditable && context->pid != audit_pid)
- audit_log_exit(context);
+ * function (e.g., exit_group), then free context block.
+ * We use GFP_ATOMIC here because we might be doing this
+ * in the context of the idle thread */
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, GFP_ATOMIC);
 
 audit_free_context(context);
 }
@@ -876,11 +1004,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 
 state = context->state;
 if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
- state = audit_filter_syscall(tsk, context, &audit_entlist);
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
 if (likely(state == AUDIT_DISABLED))
 return;
 
- context->serial = audit_serial();
+ context->serial = 0;
 context->ctime = CURRENT_TIME;
 context->in_syscall = 1;
 context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
@@ -903,10 +1031,10 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 /* Not having a context here is ok, since the parent may have
  * called __put_task_struct. */
 if (likely(!context))
- return;
+ goto out;
 
- if (context->in_syscall && context->auditable && context->pid != audit_pid)
- audit_log_exit(context);
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, GFP_KERNEL);
 
 context->in_syscall = 0;
 context->auditable = 0;
@@ -919,9 +1047,9 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 } else {
 audit_free_names(context);
 audit_free_aux(context);
- audit_zero_context(context, context->state);
 tsk->audit_context = context;
 }
+ out:
 put_task_struct(tsk);
 }
 
@@ -996,7 +1124,7 @@ void audit_putname(const char *name)
 
 /* Store the inode and device from a lookup. Called from
  * fs/namei.c:path_lookup(). */
-void audit_inode(const char *name, const struct inode *inode)
+void audit_inode(const char *name, const struct inode *inode, unsigned flags)
 {
 int idx;
 struct audit_context *context = current->audit_context;
@@ -1022,17 +1150,20 @@ void audit_inode(const char *name, const struct inode *inode)
 ++context->ino_count;
 #endif
 }
- context->names[idx].ino = inode->i_ino;
- context->names[idx].dev = inode->i_sb->s_dev;
- context->names[idx].mode = inode->i_mode;
- context->names[idx].uid = inode->i_uid;
- context->names[idx].gid = inode->i_gid;
- context->names[idx].rdev = inode->i_rdev;
+ context->names[idx].flags = flags;
+ context->names[idx].ino = inode->i_ino;
+ context->names[idx].dev = inode->i_sb->s_dev;
+ context->names[idx].mode = inode->i_mode;
+ context->names[idx].uid = inode->i_uid;
+ context->names[idx].gid = inode->i_gid;
+ context->names[idx].rdev = inode->i_rdev;
 }
 
 void auditsc_get_stamp(struct audit_context *ctx,
 struct timespec *t, unsigned int *serial)
 {
+ if (!ctx->serial)
+ ctx->serial = audit_serial();
 t->tv_sec = ctx->ctime.tv_sec;
 t->tv_nsec = ctx->ctime.tv_nsec;
 *serial = ctx->serial;
@@ -1044,7 +1175,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 if (task->audit_context) {
 struct audit_buffer *ab;
 
- ab = audit_log_start(NULL, AUDIT_LOGIN);
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
 if (ab) {
 audit_log_format(ab, "login pid=%d uid=%u "
 "old auid=%u new auid=%u",
@@ -1153,7 +1284,7 @@ void audit_signal_info(int sig, struct task_struct *t)
 extern pid_t audit_sig_pid;
 extern uid_t audit_sig_uid;
 
- if (unlikely(audit_pid && t->pid == audit_pid)) {
+ if (unlikely(audit_pid && t->tgid == audit_pid)) {
 if (sig == SIGTERM || sig == SIGHUP) {
 struct audit_context *ctx = current->audit_context;
 audit_sig_pid = current->pid;
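The auditsc.c hunks fold the old audit_tsklist/audit_entlist/audit_extlist into a single audit_filter_list[] array indexed by filter class (the diff references AUDIT_FILTER_TASK, AUDIT_FILTER_ENTRY, AUDIT_FILTER_EXIT and AUDIT_FILTER_USER; AUDIT_NR_FILTERS is 5), still read under RCU and written only under audit_netlink_sem. A minimal sketch of that read-side pattern; the struct and function names here are illustrative, only the list/iterator usage mirrors the patch:

#include <linux/list.h>
#include <linux/rcupdate.h>

struct example_rule {
	struct list_head list;
	int		 value;
};

#define EXAMPLE_NR_CLASSES 5
static struct list_head example_filter_list[EXAMPLE_NR_CLASSES];	/* INIT_LIST_HEAD() at init time */

/* Read side: walk one class under rcu_read_lock(), the way
 * audit_filter_task() and audit_filter_syscall() do after this patch. */
static int example_match(int class, int value)
{
	struct example_rule *r;
	int hit = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(r, &example_filter_list[class], list) {
		if (r->value == value) {
			hit = 1;
			break;
		}
	}
	rcu_read_unlock();
	return hit;
}

The write side stays as the audit_add_rule()/audit_free_rule() hunks above show: entries are added with list_add_rcu()/list_add_tail_rcu() and freed through an RCU callback, so readers never see a half-updated list.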
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
 if (!time_after(expire, now))
 return 0;
 
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ expire = schedule_timeout_interruptible(expire - now);
 if (expire == 0)
 return 0;
 
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 return -EINVAL;
 
 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = schedule_timeout_interruptible(expire);
 if (expire == 0)
 return 0;
 
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 recalc_sigpending();
 spin_unlock_irq(&current->sighand->siglock);
 
- current->state = TASK_INTERRUPTIBLE;
- timeout = schedule_timeout(timeout);
+ timeout = schedule_timeout_interruptible(timeout);
 
 spin_lock_irq(&current->sighand->siglock);
 sig = dequeue_signal(current, &s, &info);
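Each compat.c hunk collapses the open-coded "set TASK_INTERRUPTIBLE, then schedule_timeout()" pair into schedule_timeout_interruptible(). The diffstat shows kernel/timer.c changing in the same series, where that helper lives; assuming it is the obvious wrapper, the before/after looks like this (function names here are illustrative):

#include <linux/sched.h>

/* Before: the caller has to remember to set the task state first. */
static signed long example_sleep_old(signed long timeout)
{
	current->state = TASK_INTERRUPTIBLE;
	return schedule_timeout(timeout);
}

/* After: one call that sets the state and sleeps. */
static signed long example_sleep_new(signed long timeout)
{
	return schedule_timeout_interruptible(timeout);
}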
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..28176d083f7b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
  */
 
 static DECLARE_MUTEX(cpuset_sem);
+static struct task_struct *cpuset_sem_owner;
+static int cpuset_sem_depth;
+
+/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a tasks mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem. Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+ if (cpuset_sem_owner != current) {
+ down(psem);
+ cpuset_sem_owner = current;
+ }
+ cpuset_sem_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+ if (--cpuset_sem_depth == 0) {
+ cpuset_sem_owner = NULL;
+ up(psem);
+ }
+}
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Refresh current tasks mems_allowed and mems_generation from
  * current tasks cpuset. Call with cpuset_sem held.
  *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation. Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem. Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the tasks context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
@@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
 
-/*
- * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
- * Disable letting 'cpu_exclusive' cpusets define dynamic sched
- * domains, until the sched domain can handle partial nodes.
- * Remove this #if hackery when sched domains fixed.
- */
-#if 0
 static void update_cpu_domains(struct cpuset *cur)
 {
 struct cpuset *c, *par = cur->parent;
@@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur)
 partition_sched_domains(&pspan, &cspan);
 unlock_cpu_hotplug();
 }
-#else
-static void update_cpu_domains(struct cpuset *cur)
-{
-}
-#endif
 
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
@@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 }
 buffer[nbytes] = 0; /* nul-terminate */
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 
 if (is_removed(cs)) {
 retval = -ENODEV;
@@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 if (retval == 0)
 retval = nbytes;
 out2:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 cpuset_release_agent(pathbuf);
 out1:
 kfree(buffer);
@@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 cpumask_t mask;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 mask = cs->cpus_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 nodemask_t mask;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 mask = cs->mems_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -953,8 +968,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 char *page;
 ssize_t retval = 0;
 char *s;
- char *start;
- size_t n;
 
 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
 return -ENOMEM;
@@ -984,10 +997,7 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 *s++ = '\n';
 *s = '\0';
 
- start = page + *ppos;
- n = s - start;
- retval = n - copy_to_user(buf, start, min(n, nbytes));
- *ppos += retval;
+ retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
 out:
 free_page((unsigned long)page);
 return retval;
@@ -1342,8 +1352,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 if (!cs)
 return -ENOMEM;
 
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
 cs->flags = 0;
 if (notify_on_release(parent))
 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1368,14 +1377,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
  * will down() this new directory's i_sem and if we race with
  * another mkdir, we might deadlock.
  */
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 err = cpuset_populate_dir(cs->dentry);
 /* If err < 0, we have a half-filled directory - oh well ;) */
 return 0;
 err:
 list_del(&cs->sibling);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 kfree(cs);
 return err;
 }
@@ -1397,14 +1406,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 /* the vfs holds both inode->i_sem already */
 
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
 if (atomic_read(&cs->count) > 0) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 return -EBUSY;
 }
 if (!list_empty(&cs->children)) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 return -EBUSY;
 }
 parent = cs->parent;
@@ -1420,7 +1428,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 spin_unlock(&d->d_lock);
 cpuset_d_remove_dir(d);
 dput(d);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 cpuset_release_agent(pathbuf);
 return 0;
 }
@@ -1523,10 +1531,10 @@ void cpuset_exit(struct task_struct *tsk)
 if (notify_on_release(cs)) {
 char *pathbuf = NULL;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 if (atomic_dec_and_test(&cs->count))
 check_for_release(cs, &pathbuf);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 cpuset_release_agent(pathbuf);
 } else {
 atomic_dec(&cs->count);
@@ -1547,11 +1555,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 cpumask_t mask;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 task_lock((struct task_struct *)tsk);
 guarantee_online_cpus(tsk->cpuset, &mask);
 task_unlock((struct task_struct *)tsk);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 return mask;
 }
@@ -1576,9 +1584,9 @@ void cpuset_update_current_mems_allowed(void)
 if (!cs)
 return; /* task is exiting */
 if (current->cpuset_mems_generation != cs->mems_generation) {
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 refresh_mems();
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 }
 }
 
@@ -1611,17 +1619,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1611 return 0; 1619 return 0;
1612} 1620}
1613 1621
1622/*
1623 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1624 * ancestor to the specified cpuset. Call while holding cpuset_sem.
1625 * If no ancestor is mem_exclusive (an unusual configuration), then
1626 * returns the root cpuset.
1627 */
1628static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1629{
1630 while (!is_mem_exclusive(cs) && cs->parent)
1631 cs = cs->parent;
1632 return cs;
1633}
1634
1614/** 1635/**
1615 * cpuset_zone_allowed - is zone z allowed in current->mems_allowed 1636 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
1616 * @z: zone in question 1637 * @z: is this zone on an allowed node?
1638 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
1617 * 1639 *
1618 * Is zone z allowed in current->mems_allowed, or is 1640 * If we're in interrupt, yes, we can always allocate. If zone
1619 * the CPU in interrupt context? (zone is always allowed in this case) 1641 * z's node is in our task's mems_allowed, yes. If it's not a
1620 */ 1642 * __GFP_HARDWALL request and this zone's node is in the nearest
1621int cpuset_zone_allowed(struct zone *z) 1643 * mem_exclusive cpuset ancestor to this task's cpuset, yes.
1644 * Otherwise, no.
1645 *
1646 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
1647 * and do not allow allocations outside the current task's cpuset.
1648 * GFP_KERNEL allocations are not so marked, so can escape to the
1649 * nearest mem_exclusive ancestor cpuset.
1650 *
1651 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
1652 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1653 * it's a GFP_KERNEL allocation, and all nodes in the current task's
1654 * mems_allowed came up empty on the first pass over the zonelist.
1655 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1656 * short of memory, might require taking the cpuset_sem semaphore.
1657 *
1658 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1659 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
1660 * hardwall cpusets - no allocation on a node outside the cpuset is
1661 * allowed (unless in interrupt, of course).
1662 *
1663 * The second loop doesn't even call here for GFP_ATOMIC requests
1664 * (if the __alloc_pages() local variable 'wait' is set). That check
1665 * and the checks below have the combined effect in the second loop of
1666 * the __alloc_pages() routine that:
1667 * in_interrupt - any node ok (current task context irrelevant)
1668 * GFP_ATOMIC - any node ok
1669 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
1670 * GFP_USER - only nodes in the current task's mems_allowed ok.
1671 **/
1672
1673int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1674{
1675 int node; /* node that zone z is on */
1676 const struct cpuset *cs; /* current cpuset ancestors */
1677 int allowed = 1; /* is allocation in zone z allowed? */
1678
1679 if (in_interrupt())
1680 return 1;
1681 node = z->zone_pgdat->node_id;
1682 if (node_isset(node, current->mems_allowed))
1683 return 1;
1684 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1685 return 0;
1686
1687 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1688 cpuset_down(&cpuset_sem);
1689 cs = current->cpuset;
1690 if (!cs)
1691 goto done; /* current task exiting */
1692 cs = nearest_exclusive_ancestor(cs);
1693 allowed = node_isset(node, cs->mems_allowed);
1694done:
1695 cpuset_up(&cpuset_sem);
1696 return allowed;
1697}
1698
1699/**
1700 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1701 * @p: pointer to task_struct of some other task.
1702 *
1703 * Description: Return true if the nearest mem_exclusive ancestor
1704 * cpusets of tasks @p and current overlap. Used by oom killer to
1705 * determine if task @p's memory usage might impact the memory
1706 * available to the current task.
1707 *
1708 * Acquires cpuset_sem - not suitable for calling from a fast path.
1709 **/
1710
1711int cpuset_excl_nodes_overlap(const struct task_struct *p)
1622{ 1712{
1623 return in_interrupt() || 1713 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1624 node_isset(z->zone_pgdat->node_id, current->mems_allowed); 1714 int overlap = 0; /* do cpusets overlap? */
1715
1716 cpuset_down(&cpuset_sem);
1717 cs1 = current->cpuset;
1718 if (!cs1)
1719 goto done; /* current task exiting */
1720 cs2 = p->cpuset;
1721 if (!cs2)
1722 goto done; /* task p is exiting */
1723 cs1 = nearest_exclusive_ancestor(cs1);
1724 cs2 = nearest_exclusive_ancestor(cs2);
1725 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1726done:
1727 cpuset_up(&cpuset_sem);
1728
1729 return overlap;
1625} 1730}
1626 1731
1627/* 1732/*
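The kernel-doc above boils down to a two-pass policy: the first scan of the zonelist is hardwalled to current->mems_allowed, and only a second, GFP_KERNEL-only pass may spill into the nearest mem_exclusive ancestor. A minimal sketch of that call pattern follows; try_alloc_from() is a hypothetical stand-in for the real per-zone allocation path in mm/page_alloc.c, so this only illustrates the order of checks, not the allocator itself.

#include <linux/mmzone.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *sketch_alloc(gfp_t gfp_mask, struct zonelist *zl)
{
        struct zone **z;
        struct page *page;

        /* Pass 1: hardwall - only nodes in current->mems_allowed. */
        for (z = zl->zones; *z; z++) {
                if (!cpuset_zone_allowed(*z, gfp_mask | __GFP_HARDWALL))
                        continue;
                page = try_alloc_from(*z, gfp_mask);    /* hypothetical helper */
                if (page)
                        return page;
        }

        /*
         * Pass 2: GFP_KERNEL may escape to the nearest mem_exclusive
         * ancestor.  For GFP_USER, which already carries __GFP_HARDWALL,
         * this pass is no more permissive than the first.
         */
        for (z = zl->zones; *z; z++) {
                if (!cpuset_zone_allowed(*z, gfp_mask))
                        continue;
                page = try_alloc_from(*z, gfp_mask);    /* hypothetical helper */
                if (page)
                        return page;
        }
        return NULL;
}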
@@ -1642,7 +1747,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1642 return -ENOMEM; 1747 return -ENOMEM;
1643 1748
1644 tsk = m->private; 1749 tsk = m->private;
1645 down(&cpuset_sem); 1750 cpuset_down(&cpuset_sem);
1646 task_lock(tsk); 1751 task_lock(tsk);
1647 cs = tsk->cpuset; 1752 cs = tsk->cpuset;
1648 task_unlock(tsk); 1753 task_unlock(tsk);
@@ -1657,7 +1762,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1657 seq_puts(m, buf); 1762 seq_puts(m, buf);
1658 seq_putc(m, '\n'); 1763 seq_putc(m, '\n');
1659out: 1764out:
1660 up(&cpuset_sem); 1765 cpuset_up(&cpuset_sem);
1661 kfree(buf); 1766 kfree(buf);
1662 return retval; 1767 return retval;
1663} 1768}
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b0fb9f09f21..3b25b182d2be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,25 @@ EXPORT_SYMBOL(daemonize);
368static inline void close_files(struct files_struct * files) 368static inline void close_files(struct files_struct * files)
369{ 369{
370 int i, j; 370 int i, j;
371 struct fdtable *fdt;
371 372
372 j = 0; 373 j = 0;
374
375 /*
376 * It is safe to dereference the fd table without RCU or
377 * ->file_lock because this is the last reference to the
378 * files structure.
379 */
380 fdt = files_fdtable(files);
373 for (;;) { 381 for (;;) {
374 unsigned long set; 382 unsigned long set;
375 i = j * __NFDBITS; 383 i = j * __NFDBITS;
376 if (i >= files->max_fdset || i >= files->max_fds) 384 if (i >= fdt->max_fdset || i >= fdt->max_fds)
377 break; 385 break;
378 set = files->open_fds->fds_bits[j++]; 386 set = fdt->open_fds->fds_bits[j++];
379 while (set) { 387 while (set) {
380 if (set & 1) { 388 if (set & 1) {
381 struct file * file = xchg(&files->fd[i], NULL); 389 struct file * file = xchg(&fdt->fd[i], NULL);
382 if (file) 390 if (file)
383 filp_close(file, files); 391 filp_close(file, files);
384 } 392 }
@@ -403,18 +411,22 @@ struct files_struct *get_files_struct(struct task_struct *task)
403 411
404void fastcall put_files_struct(struct files_struct *files) 412void fastcall put_files_struct(struct files_struct *files)
405{ 413{
414 struct fdtable *fdt;
415
406 if (atomic_dec_and_test(&files->count)) { 416 if (atomic_dec_and_test(&files->count)) {
407 close_files(files); 417 close_files(files);
408 /* 418 /*
409 * Free the fd and fdset arrays if we expanded them. 419 * Free the fd and fdset arrays if we expanded them.
420 * If the fdtable was embedded, pass files for freeing
421 * at the end of the RCU grace period. Otherwise,
422 * you can free files immediately.
410 */ 423 */
411 if (files->fd != &files->fd_array[0]) 424 fdt = files_fdtable(files);
412 free_fd_array(files->fd, files->max_fds); 425 if (fdt == &files->fdtab)
413 if (files->max_fdset > __FD_SETSIZE) { 426 fdt->free_files = files;
414 free_fdset(files->open_fds, files->max_fdset); 427 else
415 free_fdset(files->close_on_exec, files->max_fdset); 428 kmem_cache_free(files_cachep, files);
416 } 429 free_fdtable(fdt);
417 kmem_cache_free(files_cachep, files);
418 } 430 }
419} 431}
420 432
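Now that the descriptor table hangs off an RCU-published pointer, lockless readers are expected to bracket their access with rcu_read_lock() and re-fetch the pointer through files_fdtable() every time, instead of caching the old open-coded fields. A minimal read-side sketch, assuming files_fdtable() hides the rcu_dereference() (which the rcu_assign_pointer() calls added to kernel/fork.c below suggest):

#include <linux/file.h>
#include <linux/rcupdate.h>

/* Count a task's open descriptors without taking ->file_lock. */
static int count_open_fds(struct files_struct *files)
{
        struct fdtable *fdt;
        int i, count = 0;

        rcu_read_lock();
        fdt = files_fdtable(files);     /* re-fetch under RCU, never cache */
        for (i = 0; i < fdt->max_fdset; i++)
                if (FD_ISSET(i, fdt->open_fds))
                        count++;
        rcu_read_unlock();
        return count;
}

put_files_struct() itself needs none of this because, as the new comment in close_files() notes, the final reference holder cannot race with anybody.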
@@ -831,6 +843,7 @@ fastcall NORET_TYPE void do_exit(long code)
831 group_dead = atomic_dec_and_test(&tsk->signal->live); 843 group_dead = atomic_dec_and_test(&tsk->signal->live);
832 if (group_dead) { 844 if (group_dead) {
833 del_timer_sync(&tsk->signal->real_timer); 845 del_timer_sync(&tsk->signal->real_timer);
846 exit_itimers(tsk->signal);
834 acct_process(code); 847 acct_process(code);
835 } 848 }
836 exit_mm(tsk); 849 exit_mm(tsk);
@@ -1191,7 +1204,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
1191 1204
1192 exit_code = p->exit_code; 1205 exit_code = p->exit_code;
1193 if (unlikely(!exit_code) || 1206 if (unlikely(!exit_code) ||
1194 unlikely(p->state > TASK_STOPPED)) 1207 unlikely(p->state & TASK_TRACED))
1195 goto bail_ref; 1208 goto bail_ref;
1196 return wait_noreap_copyout(p, pid, uid, 1209 return wait_noreap_copyout(p, pid, uid,
1197 why, (exit_code << 8) | 0x7f, 1210 why, (exit_code << 8) | 0x7f,
diff --git a/kernel/fork.c b/kernel/fork.c
index 7e1ead9a6ba4..280bd44ac441 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/jiffies.h> 36#include <linux/jiffies.h>
37#include <linux/futex.h> 37#include <linux/futex.h>
38#include <linux/rcupdate.h>
38#include <linux/ptrace.h> 39#include <linux/ptrace.h>
39#include <linux/mount.h> 40#include <linux/mount.h>
40#include <linux/audit.h> 41#include <linux/audit.h>
@@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
176 177
177 /* One for us, one for whoever does the "release_task()" (usually parent) */ 178 /* One for us, one for whoever does the "release_task()" (usually parent) */
178 atomic_set(&tsk->usage,2); 179 atomic_set(&tsk->usage,2);
180 atomic_set(&tsk->fs_excl, 0);
179 return tsk; 181 return tsk;
180} 182}
181 183
@@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
564 return 0; 566 return 0;
565} 567}
566 568
567static int count_open_files(struct files_struct *files, int size) 569static int count_open_files(struct fdtable *fdt)
568{ 570{
571 int size = fdt->max_fdset;
569 int i; 572 int i;
570 573
571 /* Find the last open fd */ 574 /* Find the last open fd */
572 for (i = size/(8*sizeof(long)); i > 0; ) { 575 for (i = size/(8*sizeof(long)); i > 0; ) {
573 if (files->open_fds->fds_bits[--i]) 576 if (fdt->open_fds->fds_bits[--i])
574 break; 577 break;
575 } 578 }
576 i = (i+1) * 8 * sizeof(long); 579 i = (i+1) * 8 * sizeof(long);
577 return i; 580 return i;
578} 581}
579 582
583static struct files_struct *alloc_files(void)
584{
585 struct files_struct *newf;
586 struct fdtable *fdt;
587
588 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
589 if (!newf)
590 goto out;
591
592 atomic_set(&newf->count, 1);
593
594 spin_lock_init(&newf->file_lock);
595 fdt = &newf->fdtab;
596 fdt->next_fd = 0;
597 fdt->max_fds = NR_OPEN_DEFAULT;
598 fdt->max_fdset = __FD_SETSIZE;
599 fdt->close_on_exec = &newf->close_on_exec_init;
600 fdt->open_fds = &newf->open_fds_init;
601 fdt->fd = &newf->fd_array[0];
602 INIT_RCU_HEAD(&fdt->rcu);
603 fdt->free_files = NULL;
604 fdt->next = NULL;
605 rcu_assign_pointer(newf->fdt, fdt);
606out:
607 return newf;
608}
609
580static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 610static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
581{ 611{
582 struct files_struct *oldf, *newf; 612 struct files_struct *oldf, *newf;
583 struct file **old_fds, **new_fds; 613 struct file **old_fds, **new_fds;
584 int open_files, size, i, error = 0, expand; 614 int open_files, size, i, error = 0, expand;
615 struct fdtable *old_fdt, *new_fdt;
585 616
586 /* 617 /*
587 * A background process may not have any files ... 618 * A background process may not have any files ...
@@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
602 */ 633 */
603 tsk->files = NULL; 634 tsk->files = NULL;
604 error = -ENOMEM; 635 error = -ENOMEM;
605 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 636 newf = alloc_files();
606 if (!newf) 637 if (!newf)
607 goto out; 638 goto out;
608 639
609 atomic_set(&newf->count, 1);
610
611 spin_lock_init(&newf->file_lock);
612 newf->next_fd = 0;
613 newf->max_fds = NR_OPEN_DEFAULT;
614 newf->max_fdset = __FD_SETSIZE;
615 newf->close_on_exec = &newf->close_on_exec_init;
616 newf->open_fds = &newf->open_fds_init;
617 newf->fd = &newf->fd_array[0];
618
619 spin_lock(&oldf->file_lock); 640 spin_lock(&oldf->file_lock);
620 641 old_fdt = files_fdtable(oldf);
621 open_files = count_open_files(oldf, oldf->max_fdset); 642 new_fdt = files_fdtable(newf);
643 size = old_fdt->max_fdset;
644 open_files = count_open_files(old_fdt);
622 expand = 0; 645 expand = 0;
623 646
624 /* 647 /*
625 * Check whether we need to allocate a larger fd array or fd set. 648 * Check whether we need to allocate a larger fd array or fd set.
626 * Note: we're not a clone task, so the open count won't change. 649 * Note: we're not a clone task, so the open count won't change.
627 */ 650 */
628 if (open_files > newf->max_fdset) { 651 if (open_files > new_fdt->max_fdset) {
629 newf->max_fdset = 0; 652 new_fdt->max_fdset = 0;
630 expand = 1; 653 expand = 1;
631 } 654 }
632 if (open_files > newf->max_fds) { 655 if (open_files > new_fdt->max_fds) {
633 newf->max_fds = 0; 656 new_fdt->max_fds = 0;
634 expand = 1; 657 expand = 1;
635 } 658 }
636 659
@@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
642 spin_unlock(&newf->file_lock); 665 spin_unlock(&newf->file_lock);
643 if (error < 0) 666 if (error < 0)
644 goto out_release; 667 goto out_release;
668 new_fdt = files_fdtable(newf);
669 /*
670 * Reacquire the oldf lock and a pointer to its fd table;
671 * it may have grown a new, bigger fd table in the meantime,
672 * so we need the latest pointer.
673 */
645 spin_lock(&oldf->file_lock); 674 spin_lock(&oldf->file_lock);
675 old_fdt = files_fdtable(oldf);
646 } 676 }
647 677
648 old_fds = oldf->fd; 678 old_fds = old_fdt->fd;
649 new_fds = newf->fd; 679 new_fds = new_fdt->fd;
650 680
651 memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); 681 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
652 memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); 682 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
653 683
654 for (i = open_files; i != 0; i--) { 684 for (i = open_files; i != 0; i--) {
655 struct file *f = *old_fds++; 685 struct file *f = *old_fds++;
@@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
662 * is partway through open(). So make sure that this 692 * is partway through open(). So make sure that this
663 * fd is available to the new process. 693 * fd is available to the new process.
664 */ 694 */
665 FD_CLR(open_files - i, newf->open_fds); 695 FD_CLR(open_files - i, new_fdt->open_fds);
666 } 696 }
667 *new_fds++ = f; 697 rcu_assign_pointer(*new_fds++, f);
668 } 698 }
669 spin_unlock(&oldf->file_lock); 699 spin_unlock(&oldf->file_lock);
670 700
671 /* compute the remainder to be cleared */ 701 /* compute the remainder to be cleared */
672 size = (newf->max_fds - open_files) * sizeof(struct file *); 702 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
673 703
674 /* This is long word aligned thus could use an optimized version */ 704 /* This is long word aligned thus could use an optimized version */
675 memset(new_fds, 0, size); 705 memset(new_fds, 0, size);
676 706
677 if (newf->max_fdset > open_files) { 707 if (new_fdt->max_fdset > open_files) {
678 int left = (newf->max_fdset-open_files)/8; 708 int left = (new_fdt->max_fdset-open_files)/8;
679 int start = open_files / (8 * sizeof(unsigned long)); 709 int start = open_files / (8 * sizeof(unsigned long));
680 710
681 memset(&newf->open_fds->fds_bits[start], 0, left); 711 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
682 memset(&newf->close_on_exec->fds_bits[start], 0, left); 712 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
683 } 713 }
684 714
685 tsk->files = newf; 715 tsk->files = newf;
@@ -688,9 +718,9 @@ out:
688 return error; 718 return error;
689 719
690out_release: 720out_release:
691 free_fdset (newf->close_on_exec, newf->max_fdset); 721 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
692 free_fdset (newf->open_fds, newf->max_fdset); 722 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
693 free_fd_array(newf->fd, newf->max_fds); 723 free_fd_array(new_fdt->fd, new_fdt->max_fds);
694 kmem_cache_free(files_cachep, newf); 724 kmem_cache_free(files_cachep, newf);
695 goto out; 725 goto out;
696} 726}
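Pieced together from alloc_files() and the field accesses in this hunk, the split looks roughly like the declarations below. Field order, exact types and anything not visible in this diff are guesses rather than the real include/linux/file.h definitions; the point is that everything that may be reallocated on expansion now sits behind one RCU-published struct fdtable pointer, while files_struct keeps an embedded table for the common NR_OPEN_DEFAULT case.

struct fdtable {
        unsigned int max_fds;
        int max_fdset;
        int next_fd;
        struct file **fd;               /* current fd array */
        fd_set *close_on_exec;
        fd_set *open_fds;
        struct rcu_head rcu;            /* deferred freeing of old tables */
        struct files_struct *free_files;
        struct fdtable *next;
};

struct files_struct {
        atomic_t count;
        spinlock_t file_lock;
        struct fdtable *fdt;            /* RCU-published, may point at fdtab */
        struct fdtable fdtab;           /* embedded default table */
        fd_set close_on_exec_init;
        fd_set open_fds_init;
        struct file *fd_array[NR_OPEN_DEFAULT];
};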
@@ -818,7 +848,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
818{ 848{
819 unsigned long new_flags = p->flags; 849 unsigned long new_flags = p->flags;
820 850
821 new_flags &= ~PF_SUPERPRIV; 851 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
822 new_flags |= PF_FORKNOEXEC; 852 new_flags |= PF_FORKNOEXEC;
823 if (!(clone_flags & CLONE_PTRACE)) 853 if (!(clone_flags & CLONE_PTRACE))
824 p->ptrace = 0; 854 p->ptrace = 0;
@@ -1032,7 +1062,8 @@ static task_t *copy_process(unsigned long clone_flags,
1032 * parent's CPU). This avoids a lot of nasty races. 1062 * parent's CPU). This avoids a lot of nasty races.
1033 */ 1063 */
1034 p->cpus_allowed = current->cpus_allowed; 1064 p->cpus_allowed = current->cpus_allowed;
1035 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) 1065 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1066 !cpu_online(task_cpu(p))))
1036 set_task_cpu(p, smp_processor_id()); 1067 set_task_cpu(p, smp_processor_id());
1037 1068
1038 /* 1069 /*
@@ -1115,6 +1146,9 @@ static task_t *copy_process(unsigned long clone_flags,
1115 __get_cpu_var(process_counts)++; 1146 __get_cpu_var(process_counts)++;
1116 } 1147 }
1117 1148
1149 if (!current->signal->tty && p->signal->tty)
1150 p->signal->tty = NULL;
1151
1118 nr_threads++; 1152 nr_threads++;
1119 total_forks++; 1153 total_forks++;
1120 write_unlock_irq(&tasklist_lock); 1154 write_unlock_irq(&tasklist_lock);
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/syscalls.h> 41#include <linux/syscalls.h>
42#include <linux/signal.h> 42#include <linux/signal.h>
43#include <asm/futex.h>
43 44
44#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 45#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
45 46
@@ -327,6 +328,118 @@ out:
327} 328}
328 329
329/* 330/*
331 * Wake up all waiters hashed on the physical page that is mapped
332 * to this virtual address:
333 */
334static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
335{
336 union futex_key key1, key2;
337 struct futex_hash_bucket *bh1, *bh2;
338 struct list_head *head;
339 struct futex_q *this, *next;
340 int ret, op_ret, attempt = 0;
341
342retryfull:
343 down_read(&current->mm->mmap_sem);
344
345 ret = get_futex_key(uaddr1, &key1);
346 if (unlikely(ret != 0))
347 goto out;
348 ret = get_futex_key(uaddr2, &key2);
349 if (unlikely(ret != 0))
350 goto out;
351
352 bh1 = hash_futex(&key1);
353 bh2 = hash_futex(&key2);
354
355retry:
356 if (bh1 < bh2)
357 spin_lock(&bh1->lock);
358 spin_lock(&bh2->lock);
359 if (bh1 > bh2)
360 spin_lock(&bh1->lock);
361
362 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
363 if (unlikely(op_ret < 0)) {
364 int dummy;
365
366 spin_unlock(&bh1->lock);
367 if (bh1 != bh2)
368 spin_unlock(&bh2->lock);
369
370 /* futex_atomic_op_inuser needs to both read and write
371 * *(int __user *)uaddr2, but we can't modify it
372 * non-atomically. Therefore, if get_user below is not
373 * enough, we need to handle the fault ourselves, while
374 * still holding the mmap_sem. */
375 if (attempt++) {
376 struct vm_area_struct * vma;
377 struct mm_struct *mm = current->mm;
378
379 ret = -EFAULT;
380 if (attempt >= 2 ||
381 !(vma = find_vma(mm, uaddr2)) ||
382 vma->vm_start > uaddr2 ||
383 !(vma->vm_flags & VM_WRITE))
384 goto out;
385
386 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
387 case VM_FAULT_MINOR:
388 current->min_flt++;
389 break;
390 case VM_FAULT_MAJOR:
391 current->maj_flt++;
392 break;
393 default:
394 goto out;
395 }
396 goto retry;
397 }
398
399 /* If we would have faulted, release mmap_sem,
400 * fault it in and start all over again. */
401 up_read(&current->mm->mmap_sem);
402
403 ret = get_user(dummy, (int __user *)uaddr2);
404 if (ret)
405 return ret;
406
407 goto retryfull;
408 }
409
410 head = &bh1->chain;
411
412 list_for_each_entry_safe(this, next, head, list) {
413 if (match_futex (&this->key, &key1)) {
414 wake_futex(this);
415 if (++ret >= nr_wake)
416 break;
417 }
418 }
419
420 if (op_ret > 0) {
421 head = &bh2->chain;
422
423 op_ret = 0;
424 list_for_each_entry_safe(this, next, head, list) {
425 if (match_futex (&this->key, &key2)) {
426 wake_futex(this);
427 if (++op_ret >= nr_wake2)
428 break;
429 }
430 }
431 ret += op_ret;
432 }
433
434 spin_unlock(&bh1->lock);
435 if (bh1 != bh2)
436 spin_unlock(&bh2->lock);
437out:
438 up_read(&current->mm->mmap_sem);
439 return ret;
440}
441
442/*
330 * Requeue all waiters hashed on one physical page to another 443 * Requeue all waiters hashed on one physical page to another
331 * physical page. 444 * physical page.
332 */ 445 */
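futex_wake_op() above must hold two hash-bucket locks at once, and it dodges an ABBA deadlock by always taking the lower-addressed lock first, taking only one lock when both keys hash to the same bucket. The same idiom in isolation, as a small sketch on plain spinlocks:

#include <linux/spinlock.h>

static void double_lock(spinlock_t *a, spinlock_t *b)
{
        if (a < b) {
                spin_lock(a);
                spin_lock(b);
        } else if (a > b) {
                spin_lock(b);
                spin_lock(a);
        } else {
                spin_lock(a);           /* same bucket: lock it once */
        }
}

The unlock side mirrors this, dropping the second lock only when the buckets differ, exactly as the spin_unlock() pairs in the function above do.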
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
673 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 786 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
674 787
675 if (signal) { 788 if (signal) {
676 int err;
677 err = f_setown(filp, current->pid, 1); 789 err = f_setown(filp, current->pid, 1);
678 if (err < 0) { 790 if (err < 0) {
679 put_unused_fd(ret); 791 goto error;
680 put_filp(filp);
681 ret = err;
682 goto out;
683 } 792 }
684 filp->f_owner.signum = signal; 793 filp->f_owner.signum = signal;
685 } 794 }
686 795
687 q = kmalloc(sizeof(*q), GFP_KERNEL); 796 q = kmalloc(sizeof(*q), GFP_KERNEL);
688 if (!q) { 797 if (!q) {
689 put_unused_fd(ret); 798 err = -ENOMEM;
690 put_filp(filp); 799 goto error;
691 ret = -ENOMEM;
692 goto out;
693 } 800 }
694 801
695 down_read(&current->mm->mmap_sem); 802 down_read(&current->mm->mmap_sem);
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
697 804
698 if (unlikely(err != 0)) { 805 if (unlikely(err != 0)) {
699 up_read(&current->mm->mmap_sem); 806 up_read(&current->mm->mmap_sem);
700 put_unused_fd(ret);
701 put_filp(filp);
702 kfree(q); 807 kfree(q);
703 return err; 808 goto error;
704 } 809 }
705 810
706 /* 811 /*
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
716 fd_install(ret, filp); 821 fd_install(ret, filp);
717out: 822out:
718 return ret; 823 return ret;
824error:
825 put_unused_fd(ret);
826 put_filp(filp);
827 ret = err;
828 goto out;
719} 829}
720 830
721long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 831long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
740 case FUTEX_CMP_REQUEUE: 850 case FUTEX_CMP_REQUEUE:
741 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 851 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
742 break; 852 break;
853 case FUTEX_WAKE_OP:
854 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
855 break;
743 default: 856 default:
744 ret = -ENOSYS; 857 ret = -ENOSYS;
745 } 858 }
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
39 struct list_head *tmp; 39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new; 40 struct inter_module_entry *ime, *ime_new;
41 41
42 if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { 42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */ 43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR 44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", 45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
47 kmalloc_failed = 1; 47 kmalloc_failed = 1;
48 return; 48 return;
49 } 49 }
50 memset(ime_new, 0, sizeof(*ime_new));
51 ime_new->im_name = im_name; 50 ime_new->im_name = im_name;
52 ime_new->owner = owner; 51 ime_new->owner = owner;
53 ime_new->userdata = userdata; 52 ime_new->userdata = userdata;
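This conversion, like the matching one in kernel/params.c further down, leans on kzalloc() behaving as a kmalloc() followed by a memset() of the whole object. A sketch of that equivalence, for readers who have not met the helper:

#include <linux/slab.h>
#include <linux/string.h>

static inline void *kzalloc_sketch(size_t size, gfp_t flags)
{
        void *p = kmalloc(size, flags);

        if (p)
                memset(p, 0, size);     /* kzalloc() zeroes on the caller's behalf */
        return p;
}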
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
111 unsigned int status; 111 unsigned int status;
112 112
113 kstat_this_cpu.irqs[irq]++; 113 kstat_this_cpu.irqs[irq]++;
114 if (desc->status & IRQ_PER_CPU) { 114 if (CHECK_IRQ_PER_CPU(desc->status)) {
115 irqreturn_t action_ret; 115 irqreturn_t action_ret;
116 116
117 /* 117 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; 19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20 20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
21/** 25/**
22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
23 * 27 *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
19 */ 19 */
20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; 20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21 21
22void __attribute__((weak)) 22#ifdef CONFIG_GENERIC_PENDING_IRQ
23proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 23void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{
25 /*
26 * Save these away for later use. Re-program when the
27 * interrupt is pending
28 */
29 set_pending_irq(irq, mask_val);
30}
31#else
32void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{ 33{
25 irq_affinity[irq] = mask_val; 34 irq_affinity[irq] = mask_val;
26 irq_desc[irq].handler->set_affinity(irq, mask_val); 35 irq_desc[irq].handler->set_affinity(irq, mask_val);
27} 36}
37#endif
28 38
29static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
30 int count, int *eof, void *data) 40 int count, int *eof, void *data)
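With CONFIG_GENERIC_PENDING_IRQ the /proc write path above no longer pokes the interrupt controller directly; it only records the request through set_pending_irq(). The other half of the scheme, re-programming the controller from the interrupt path once it is safe, is not part of this hunk, so the sketch below uses a hypothetical apply_pending_affinity() only to show the intended shape.

#include <linux/irq.h>

static void apply_pending_affinity(unsigned int irq)    /* hypothetical */
{
        cpumask_t mask = pending_irq_cpumask[irq];

        if (cpus_empty(mask))
                return;                                 /* nothing was requested */

        irq_affinity[irq] = mask;
        irq_desc[irq].handler->set_affinity(irq, mask);
        cpus_clear(pending_irq_cpumask[irq]);
}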
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 179baafcdd96..64ab045c3d9d 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -36,7 +36,7 @@
36 * struct kfifo with kfree(). 36 * struct kfifo with kfree().
37 */ 37 */
38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
39 unsigned int __nocast gfp_mask, spinlock_t *lock) 39 gfp_t gfp_mask, spinlock_t *lock)
40{ 40{
41 struct kfifo *fifo; 41 struct kfifo *fifo;
42 42
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init);
64 * 64 *
65 * The size will be rounded-up to a power of 2. 65 * The size will be rounded-up to a power of 2.
66 */ 66 */
67struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) 67struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
68{ 68{
69 unsigned char *buffer; 69 unsigned char *buffer;
70 struct kfifo *ret; 70 struct kfifo *ret;
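Only the type of the allocation-flags parameter changes here, so callers keep the same shape. A usage sketch, assuming the byte-oriented kfifo_put()/kfifo_get() helpers of this kernel generation (they take the spinlock passed at creation time) and the ERR_PTR-style failure return of kfifo_alloc():

#include <linux/kfifo.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/err.h>

static spinlock_t demo_lock = SPIN_LOCK_UNLOCKED;

static int kfifo_demo(void)
{
        static unsigned char msg[] = "ping";
        unsigned char out[4];
        struct kfifo *fifo;

        fifo = kfifo_alloc(64, GFP_KERNEL, &demo_lock); /* rounded up to a power of 2 */
        if (IS_ERR(fifo))
                return PTR_ERR(fifo);

        kfifo_put(fifo, msg, 4);        /* copy bytes in, under demo_lock */
        kfifo_get(fifo, out, 4);        /* copy them back out */
        kfifo_free(fifo);
        return 0;
}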
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <asm-generic/sections.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
41#include <asm/errno.h> 42#include <asm/errno.h>
42#include <asm/kdebug.h> 43#include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
72 * get_insn_slot() - Find a slot on an executable page for an instruction. 73 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones. 74 * We allocate an executable page if there's no room on existing ones.
74 */ 75 */
75kprobe_opcode_t *get_insn_slot(void) 76kprobe_opcode_t __kprobes *get_insn_slot(void)
76{ 77{
77 struct kprobe_insn_page *kip; 78 struct kprobe_insn_page *kip;
78 struct hlist_node *pos; 79 struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
117 return kip->insns; 118 return kip->insns;
118} 119}
119 120
120void free_insn_slot(kprobe_opcode_t *slot) 121void __kprobes free_insn_slot(kprobe_opcode_t *slot)
121{ 122{
122 struct kprobe_insn_page *kip; 123 struct kprobe_insn_page *kip;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
152} 153}
153 154
154/* Locks kprobe: irqs must be disabled */ 155/* Locks kprobe: irqs must be disabled */
155void lock_kprobes(void) 156void __kprobes lock_kprobes(void)
156{ 157{
158 unsigned long flags = 0;
159
160 /* Keep local interrupts from firing right after we take the kprobe_lock
161 * and before we get a chance to update kprobe_cpu; this prevents a
162 * deadlock when we have a kprobe on an ISR routine and a kprobe on a
163 * task routine
164 */
165 local_irq_save(flags);
166
157 spin_lock(&kprobe_lock); 167 spin_lock(&kprobe_lock);
158 kprobe_cpu = smp_processor_id(); 168 kprobe_cpu = smp_processor_id();
169
170 local_irq_restore(flags);
159} 171}
160 172
161void unlock_kprobes(void) 173void __kprobes unlock_kprobes(void)
162{ 174{
175 unsigned long flags = 0;
176
177 /* Keep local interrupts from firing right after we update
178 * kprobe_cpu and before we get a chance to release kprobe_lock;
179 * this prevents a deadlock when we have a kprobe on an ISR routine
180 * and a kprobe on a task routine
181 */
182 local_irq_save(flags);
183
163 kprobe_cpu = NR_CPUS; 184 kprobe_cpu = NR_CPUS;
164 spin_unlock(&kprobe_lock); 185 spin_unlock(&kprobe_lock);
186
187 local_irq_restore(flags);
165} 188}
166 189
167/* You have to be holding the kprobe_lock */ 190/* You have to be holding the kprobe_lock */
168struct kprobe *get_kprobe(void *addr) 191struct kprobe __kprobes *get_kprobe(void *addr)
169{ 192{
170 struct hlist_head *head; 193 struct hlist_head *head;
171 struct hlist_node *node; 194 struct hlist_node *node;
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr)
183 * Aggregate handlers for multiple kprobes support - these handlers 206 * Aggregate handlers for multiple kprobes support - these handlers
184 * take care of invoking the individual kprobe handlers on p->list 207 * take care of invoking the individual kprobe handlers on p->list
185 */ 208 */
186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 209static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
187{ 210{
188 struct kprobe *kp; 211 struct kprobe *kp;
189 212
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
198 return 0; 221 return 0;
199} 222}
200 223
201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 224static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
202 unsigned long flags) 225 unsigned long flags)
203{ 226{
204 struct kprobe *kp; 227 struct kprobe *kp;
205 228
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
213 return; 236 return;
214} 237}
215 238
216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 239static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr) 240 int trapnr)
218{ 241{
219 /* 242 /*
220 * if we faulted "during" the execution of a user specified 243 * if we faulted "during" the execution of a user specified
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
227 return 0; 250 return 0;
228} 251}
229 252
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 253static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{ 254{
232 struct kprobe *kp = curr_kprobe; 255 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) { 256 if (curr_kprobe && kp->break_handler) {
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
240 return 0; 263 return 0;
241} 264}
242 265
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) 266struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
244{ 267{
245 struct hlist_node *node; 268 struct hlist_node *node;
246 struct kretprobe_instance *ri; 269 struct kretprobe_instance *ri;
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
249 return NULL; 272 return NULL;
250} 273}
251 274
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) 275static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp)
253{ 277{
254 struct hlist_node *node; 278 struct hlist_node *node;
255 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
258 return NULL; 282 return NULL;
259} 283}
260 284
261void add_rp_inst(struct kretprobe_instance *ri) 285void __kprobes add_rp_inst(struct kretprobe_instance *ri)
262{ 286{
263 /* 287 /*
264 * Remove rp inst off the free list - 288 * Remove rp inst off the free list -
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 300 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277} 301}
278 302
279void recycle_rp_inst(struct kretprobe_instance *ri) 303void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
280{ 304{
281 /* remove rp inst off the rprobe_inst_table */ 305 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist); 306 hlist_del(&ri->hlist);
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
291 kfree(ri); 315 kfree(ri);
292} 316}
293 317
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) 318struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
295{ 319{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 320 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297} 321}
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
302 * instances associated with this task. These left over instances represent 326 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return. 327 * probed functions that have been called but will never return.
304 */ 328 */
305void kprobe_flush_task(struct task_struct *tk) 329void __kprobes kprobe_flush_task(struct task_struct *tk)
306{ 330{
307 struct kretprobe_instance *ri; 331 struct kretprobe_instance *ri;
308 struct hlist_head *head; 332 struct hlist_head *head;
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk)
322 * This kprobe pre_handler is registered with every kretprobe. When probe 346 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe. 347 * hits it will set up the return probe.
324 */ 348 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) 349static int __kprobes pre_handler_kretprobe(struct kprobe *p,
350 struct pt_regs *regs)
326{ 351{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 352 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328 353
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
353* Add the new probe to old_p->list. Fail if this is the 378* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist 379* second jprobe at the address - two jprobes can't coexist
355*/ 380*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 381static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{ 382{
358 struct kprobe *kp; 383 struct kprobe *kp;
359 384
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
395 * the intricacies 420 * the intricacies
396 * TODO: Move kcalloc outside the spinlock 421 * TODO: Move kcalloc outside the spinlock
397 */ 422 */
398static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) 423static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
424 struct kprobe *p)
399{ 425{
400 int ret = 0; 426 int ret = 0;
401 struct kprobe *ap; 427 struct kprobe *ap;
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
434 spin_unlock_irqrestore(&kprobe_lock, flags); 460 spin_unlock_irqrestore(&kprobe_lock, flags);
435} 461}
436 462
437int register_kprobe(struct kprobe *p) 463static int __kprobes in_kprobes_functions(unsigned long addr)
464{
465 if (addr >= (unsigned long)__kprobes_text_start
466 && addr < (unsigned long)__kprobes_text_end)
467 return -EINVAL;
468 return 0;
469}
470
471int __kprobes register_kprobe(struct kprobe *p)
438{ 472{
439 int ret = 0; 473 int ret = 0;
440 unsigned long flags = 0; 474 unsigned long flags = 0;
441 struct kprobe *old_p; 475 struct kprobe *old_p;
442 476
443 if ((ret = arch_prepare_kprobe(p)) != 0) { 477 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
478 return ret;
479 if ((ret = arch_prepare_kprobe(p)) != 0)
444 goto rm_kprobe; 480 goto rm_kprobe;
445 } 481
446 spin_lock_irqsave(&kprobe_lock, flags); 482 spin_lock_irqsave(&kprobe_lock, flags);
447 old_p = get_kprobe(p->addr); 483 old_p = get_kprobe(p->addr);
448 p->nmissed = 0; 484 p->nmissed = 0;
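register_kprobe() now refuses probes placed inside the __kprobes text section; handlers and the kprobes machinery itself are annotated with __kprobes precisely so a probe cannot recurse into the code that services it. A minimal, hedged usage sketch follows; the probed function and the module boilerplate are illustrative, not part of this patch.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static noinline int demo_target(int x)          /* illustrative probe target */
{
        return x + 1;
}

static int __kprobes demo_pre(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe hit at %p\n", p->addr);
        return 0;
}

static struct kprobe demo_kp;

static int __init demo_init(void)
{
        int ret;

        demo_kp.addr = (kprobe_opcode_t *)demo_target;
        demo_kp.pre_handler = demo_pre;
        ret = register_kprobe(&demo_kp);        /* would be -EINVAL inside __kprobes text */
        if (ret < 0)
                return ret;
        demo_target(1);                         /* trigger the probe once */
        return 0;
}

static void __exit demo_exit(void)
{
        unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");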
@@ -466,7 +502,7 @@ rm_kprobe:
466 return ret; 502 return ret;
467} 503}
468 504
469void unregister_kprobe(struct kprobe *p) 505void __kprobes unregister_kprobe(struct kprobe *p)
470{ 506{
471 unsigned long flags; 507 unsigned long flags;
472 struct kprobe *old_p; 508 struct kprobe *old_p;
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = {
487 .priority = 0x7fffffff /* we need to notified first */ 523 .priority = 0x7fffffff /* we need to notified first */
488}; 524};
489 525
490int register_jprobe(struct jprobe *jp) 526int __kprobes register_jprobe(struct jprobe *jp)
491{ 527{
492 /* Todo: Verify probepoint is a function entry point */ 528 /* Todo: Verify probepoint is a function entry point */
493 jp->kp.pre_handler = setjmp_pre_handler; 529 jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp)
496 return register_kprobe(&jp->kp); 532 return register_kprobe(&jp->kp);
497} 533}
498 534
499void unregister_jprobe(struct jprobe *jp) 535void __kprobes unregister_jprobe(struct jprobe *jp)
500{ 536{
501 unregister_kprobe(&jp->kp); 537 unregister_kprobe(&jp->kp);
502} 538}
503 539
504#ifdef ARCH_SUPPORTS_KRETPROBES 540#ifdef ARCH_SUPPORTS_KRETPROBES
505 541
506int register_kretprobe(struct kretprobe *rp) 542int __kprobes register_kretprobe(struct kretprobe *rp)
507{ 543{
508 int ret = 0; 544 int ret = 0;
509 struct kretprobe_instance *inst; 545 struct kretprobe_instance *inst;
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp)
540 576
541#else /* ARCH_SUPPORTS_KRETPROBES */ 577#else /* ARCH_SUPPORTS_KRETPROBES */
542 578
543int register_kretprobe(struct kretprobe *rp) 579int __kprobes register_kretprobe(struct kretprobe *rp)
544{ 580{
545 return -ENOSYS; 581 return -ENOSYS;
546} 582}
547 583
548#endif /* ARCH_SUPPORTS_KRETPROBES */ 584#endif /* ARCH_SUPPORTS_KRETPROBES */
549 585
550void unregister_kretprobe(struct kretprobe *rp) 586void __kprobes unregister_kretprobe(struct kretprobe *rp)
551{ 587{
552 unsigned long flags; 588 unsigned long flags;
553 struct kretprobe_instance *ri; 589 struct kretprobe_instance *ri;
diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..ff5c500ab625 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,6 +20,7 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/moduleloader.h> 21#include <linux/moduleloader.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kernel.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
25#include <linux/elf.h> 26#include <linux/elf.h>
@@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags)
498{ 499{
499 int ret = (flags & O_TRUNC); 500 int ret = (flags & O_TRUNC);
500 if (ret) 501 if (ret)
501 tainted |= TAINT_FORCED_MODULE; 502 add_taint(TAINT_FORCED_MODULE);
502 return ret; 503 return ret;
503} 504}
504#else 505#else
@@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs,
897 if (!(tainted & TAINT_FORCED_MODULE)) { 898 if (!(tainted & TAINT_FORCED_MODULE)) {
898 printk("%s: no version for \"%s\" found: kernel tainted.\n", 899 printk("%s: no version for \"%s\" found: kernel tainted.\n",
899 mod->name, symname); 900 mod->name, symname);
900 tainted |= TAINT_FORCED_MODULE; 901 add_taint(TAINT_FORCED_MODULE);
901 } 902 }
902 return 1; 903 return 1;
903} 904}
@@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license)
1352 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { 1353 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
1353 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1354 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1354 mod->name, license); 1355 mod->name, license);
1355 tainted |= TAINT_PROPRIETARY_MODULE; 1356 add_taint(TAINT_PROPRIETARY_MODULE);
1356 } 1357 }
1357} 1358}
1358 1359
@@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod,
1509 long err = 0; 1510 long err = 0;
1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1511 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1511 struct exception_table_entry *extable; 1512 struct exception_table_entry *extable;
1513 mm_segment_t old_fs;
1512 1514
1513 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1515 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1514 umod, len, uargs); 1516 umod, len, uargs);
@@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod,
1609 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1611 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1610 /* This is allowed: modprobe --force will invalidate it. */ 1612 /* This is allowed: modprobe --force will invalidate it. */
1611 if (!modmagic) { 1613 if (!modmagic) {
1612 tainted |= TAINT_FORCED_MODULE; 1614 add_taint(TAINT_FORCED_MODULE);
1613 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1615 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1614 mod->name); 1616 mod->name);
1615 } else if (!same_magic(modmagic, vermagic)) { 1617 } else if (!same_magic(modmagic, vermagic)) {
@@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod,
1738 (mod->num_gpl_syms && !gplcrcindex)) { 1740 (mod->num_gpl_syms && !gplcrcindex)) {
1739 printk(KERN_WARNING "%s: No versions for exported symbols." 1741 printk(KERN_WARNING "%s: No versions for exported symbols."
1740 " Tainting kernel.\n", mod->name); 1742 " Tainting kernel.\n", mod->name);
1741 tainted |= TAINT_FORCED_MODULE; 1743 add_taint(TAINT_FORCED_MODULE);
1742 } 1744 }
1743#endif 1745#endif
1744 1746
@@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod,
1779 if (err < 0) 1781 if (err < 0)
1780 goto cleanup; 1782 goto cleanup;
1781 1783
1784 /* flush the icache in correct context */
1785 old_fs = get_fs();
1786 set_fs(KERNEL_DS);
1787
1788 /*
1789 * Flush the instruction cache, since we've played with text.
1790 * Do it before processing the module parameters, so the module
1791 * can provide parameter accessor functions of its own.
1792 */
1793 if (mod->module_init)
1794 flush_icache_range((unsigned long)mod->module_init,
1795 (unsigned long)mod->module_init
1796 + mod->init_size);
1797 flush_icache_range((unsigned long)mod->module_core,
1798 (unsigned long)mod->module_core + mod->core_size);
1799
1800 set_fs(old_fs);
1801
1782 mod->args = args; 1802 mod->args = args;
1783 if (obsparmindex) { 1803 if (obsparmindex) {
1784 err = obsolete_params(mod->name, mod->args, 1804 err = obsolete_params(mod->name, mod->args,
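Moving the flush into load_module() keeps the existing get_fs()/set_fs(KERNEL_DS) bracket, which is there because flush_icache_range() on some architectures can go through helpers that check addresses against the current segment limit. The pattern in isolation, as a hedged sketch:

#include <asm/uaccess.h>

static void call_with_kernel_ds(void (*fn)(unsigned long, unsigned long),
                                unsigned long start, unsigned long end)
{
        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);              /* let 'fn' accept kernel addresses */
        fn(start, end);
        set_fs(old_fs);                 /* always restore the previous limit */
}

Doing the flush before parameter parsing matters because, as the comment above says, a module may supply parameter accessor functions that live in the freshly written text.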
@@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod,
1860 const char __user *uargs) 1880 const char __user *uargs)
1861{ 1881{
1862 struct module *mod; 1882 struct module *mod;
1863 mm_segment_t old_fs = get_fs();
1864 int ret = 0; 1883 int ret = 0;
1865 1884
1866 /* Must have permission */ 1885 /* Must have permission */
@@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod,
1878 return PTR_ERR(mod); 1897 return PTR_ERR(mod);
1879 } 1898 }
1880 1899
1881 /* flush the icache in correct context */
1882 set_fs(KERNEL_DS);
1883
1884 /* Flush the instruction cache, since we've played with text */
1885 if (mod->module_init)
1886 flush_icache_range((unsigned long)mod->module_init,
1887 (unsigned long)mod->module_init
1888 + mod->init_size);
1889 flush_icache_range((unsigned long)mod->module_core,
1890 (unsigned long)mod->module_core + mod->core_size);
1891
1892 set_fs(old_fs);
1893
1894 /* Now sew it into the lists. They won't access us, since 1900 /* Now sew it into the lists. They won't access us, since
1895 strong_try_module_get() will fail. */ 1901 strong_try_module_get() will fail. */
1896 stop_machine_run(__link_module, mod, NR_CPUS); 1902 stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..1a8614bac5d5 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -80,8 +80,6 @@ static char *next_arg(char *args, char **param, char **val)
80 int in_quote = 0, quoted = 0; 80 int in_quote = 0, quoted = 0;
81 char *next; 81 char *next;
82 82
83 /* Chew any extra spaces */
84 while (*args == ' ') args++;
85 if (*args == '"') { 83 if (*args == '"') {
86 args++; 84 args++;
87 in_quote = 1; 85 in_quote = 1;
@@ -121,6 +119,10 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i + 1; 119 next = args + i + 1;
122 } else 120 } else
123 next = args + i; 121 next = args + i;
122
123 /* Chew up trailing spaces. */
124 while (*next == ' ')
125 next++;
124 return next; 126 return next;
125} 127}
126 128
@@ -135,6 +137,10 @@ int parse_args(const char *name,
135 137
136 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
137 139
140 /* Chew leading spaces */
141 while (*args == ' ')
142 args++;
143
138 while (*args) { 144 while (*args) {
139 int ret; 145 int ret;
140 146
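The net effect of the two hunks above: leading spaces are chewed once, up front in parse_args(), and next_arg() now swallows the run of spaces after each argument it consumes, so the while (*args) loop always resumes either at the next parameter or at the terminating NUL. A short worked example (the command line is made up, and the quote handling is summarised rather than traced step by step):

/*
 * args = "  foo=1   bar=\"a b\"  "
 *
 * parse_args():  chews the two leading spaces once, before the loop.
 * 1st next_arg(): param = "foo", val = "1"; trailing spaces chewed,
 *                 returns a pointer at the 'b' of "bar=...".
 * 2nd next_arg(): param = "bar", val = "a b" (quotes stripped);
 *                 trailing spaces chewed, returns a pointer at the
 *                 final '\0', which ends the loop.
 */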
@@ -542,8 +548,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
542{ 548{
543 struct module_kobject *mk; 549 struct module_kobject *mk;
544 550
545 mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); 551 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
546 memset(mk, 0, sizeof(struct module_kobject)); 552 BUG_ON(!mk);
547 553
548 mk->mod = THIS_MODULE; 554 mk->mod = THIS_MODULE;
549 kobj_set_kset_s(mk, module_subsys); 555 kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ad85d3f0dcc4..bf374fceb39c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
91 * Update expiry time from increment, and increase overrun count, 91 * Update expiry time from increment, and increase overrun count,
92 * given the current clock sample. 92 * given the current clock sample.
93 */ 93 */
94static inline void bump_cpu_timer(struct k_itimer *timer, 94static void bump_cpu_timer(struct k_itimer *timer,
95 union cpu_time_count now) 95 union cpu_time_count now)
96{ 96{
97 int i; 97 int i;
@@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
110 for (i = 0; incr < delta - incr; i++) 110 for (i = 0; incr < delta - incr; i++)
111 incr = incr << 1; 111 incr = incr << 1;
112 for (; i >= 0; incr >>= 1, i--) { 112 for (; i >= 0; incr >>= 1, i--) {
113 if (delta <= incr) 113 if (delta < incr)
114 continue; 114 continue;
115 timer->it.cpu.expires.sched += incr; 115 timer->it.cpu.expires.sched += incr;
116 timer->it_overrun += 1 << i; 116 timer->it_overrun += 1 << i;
@@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
128 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 128 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
129 incr = cputime_add(incr, incr); 129 incr = cputime_add(incr, incr);
130 for (; i >= 0; incr = cputime_halve(incr), i--) { 130 for (; i >= 0; incr = cputime_halve(incr), i--) {
131 if (cputime_le(delta, incr)) 131 if (cputime_lt(delta, incr))
132 continue; 132 continue;
133 timer->it.cpu.expires.cpu = 133 timer->it.cpu.expires.cpu =
134 cputime_add(timer->it.cpu.expires.cpu, incr); 134 cputime_add(timer->it.cpu.expires.cpu, incr);
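bump_cpu_timer() advances the expiry past the current time in whole increments, but rather than stepping one increment at a time it doubles the step until it covers the gap and then walks back down, accumulating the overrun count in power-of-two chunks. The switch from cputime_le()/<= to cputime_lt()/< stops the loop from skipping the final step when the gap is an exact multiple of the increment. The same algorithm on plain integers, as a hedged sketch:

/* Advance *expires past 'now' in whole 'incr' steps, counting the
 * steps taken into *overrun in O(log n) iterations. */
static void bump_sketch(unsigned long long *expires, unsigned long long incr,
                        unsigned long long now, int *overrun)
{
        unsigned long long delta, step = incr;
        int i;

        if (now < *expires || !incr)
                return;
        delta = now + incr - *expires;          /* overshoot just past 'now' */

        /* Don't test (2*step < delta): doubling first could overflow. */
        for (i = 0; step < delta - step; i++)
                step <<= 1;

        for (; i >= 0; step >>= 1, i--) {
                if (delta < step)               /* '<', matching the fix above */
                        continue;
                *expires += step;
                *overrun += 1 << i;
                delta -= step;
        }
}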
@@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
380int posix_cpu_timer_del(struct k_itimer *timer) 380int posix_cpu_timer_del(struct k_itimer *timer)
381{ 381{
382 struct task_struct *p = timer->it.cpu.task; 382 struct task_struct *p = timer->it.cpu.task;
383 int ret = 0;
383 384
384 if (timer->it.cpu.firing) 385 if (likely(p != NULL)) {
385 return TIMER_RETRY;
386
387 if (unlikely(p == NULL))
388 return 0;
389
390 if (!list_empty(&timer->it.cpu.entry)) {
391 read_lock(&tasklist_lock); 386 read_lock(&tasklist_lock);
392 if (unlikely(p->signal == NULL)) { 387 if (unlikely(p->signal == NULL)) {
393 /* 388 /*
@@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer)
396 */ 391 */
397 BUG_ON(!list_empty(&timer->it.cpu.entry)); 392 BUG_ON(!list_empty(&timer->it.cpu.entry));
398 } else { 393 } else {
399 /*
400 * Take us off the task's timer list.
401 */
402 spin_lock(&p->sighand->siglock); 394 spin_lock(&p->sighand->siglock);
403 list_del(&timer->it.cpu.entry); 395 if (timer->it.cpu.firing)
396 ret = TIMER_RETRY;
397 else
398 list_del(&timer->it.cpu.entry);
404 spin_unlock(&p->sighand->siglock); 399 spin_unlock(&p->sighand->siglock);
405 } 400 }
406 read_unlock(&tasklist_lock); 401 read_unlock(&tasklist_lock);
402
403 if (!ret)
404 put_task_struct(p);
407 } 405 }
408 put_task_struct(p);
409 406
410 return 0; 407 return ret;
411} 408}
412 409
413/* 410/*
@@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head,
424 cputime_t ptime = cputime_add(utime, stime); 421 cputime_t ptime = cputime_add(utime, stime);
425 422
426 list_for_each_entry_safe(timer, next, head, entry) { 423 list_for_each_entry_safe(timer, next, head, entry) {
427 timer->task = NULL;
428 list_del_init(&timer->entry); 424 list_del_init(&timer->entry);
429 if (cputime_lt(timer->expires.cpu, ptime)) { 425 if (cputime_lt(timer->expires.cpu, ptime)) {
430 timer->expires.cpu = cputime_zero; 426 timer->expires.cpu = cputime_zero;
@@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head,
436 432
437 ++head; 433 ++head;
438 list_for_each_entry_safe(timer, next, head, entry) { 434 list_for_each_entry_safe(timer, next, head, entry) {
439 timer->task = NULL;
440 list_del_init(&timer->entry); 435 list_del_init(&timer->entry);
441 if (cputime_lt(timer->expires.cpu, utime)) { 436 if (cputime_lt(timer->expires.cpu, utime)) {
442 timer->expires.cpu = cputime_zero; 437 timer->expires.cpu = cputime_zero;
@@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head,
448 443
449 ++head; 444 ++head;
450 list_for_each_entry_safe(timer, next, head, entry) { 445 list_for_each_entry_safe(timer, next, head, entry) {
451 timer->task = NULL;
452 list_del_init(&timer->entry); 446 list_del_init(&timer->entry);
453 if (timer->expires.sched < sched_time) { 447 if (timer->expires.sched < sched_time) {
454 timer->expires.sched = 0; 448 timer->expires.sched = 0;
@@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p,
492 struct task_struct *t = p; 486 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live); 487 unsigned int nthreads = atomic_read(&p->signal->live);
494 488
489 if (!nthreads)
490 return;
491
495 switch (clock_idx) { 492 switch (clock_idx) {
496 default: 493 default:
497 BUG(); 494 BUG();
@@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p,
500 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 497 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
501 nthreads); 498 nthreads);
502 do { 499 do {
503 if (!unlikely(t->exit_state)) { 500 if (!unlikely(t->flags & PF_EXITING)) {
504 ticks = cputime_add(prof_ticks(t), left); 501 ticks = cputime_add(prof_ticks(t), left);
505 if (cputime_eq(t->it_prof_expires, 502 if (cputime_eq(t->it_prof_expires,
506 cputime_zero) || 503 cputime_zero) ||
@@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p,
515 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 512 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
516 nthreads); 513 nthreads);
517 do { 514 do {
518 if (!unlikely(t->exit_state)) { 515 if (!unlikely(t->flags & PF_EXITING)) {
519 ticks = cputime_add(virt_ticks(t), left); 516 ticks = cputime_add(virt_ticks(t), left);
520 if (cputime_eq(t->it_virt_expires, 517 if (cputime_eq(t->it_virt_expires,
521 cputime_zero) || 518 cputime_zero) ||
@@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p,
530 nsleft = expires.sched - val.sched; 527 nsleft = expires.sched - val.sched;
531 do_div(nsleft, nthreads); 528 do_div(nsleft, nthreads);
532 do { 529 do {
533 if (!unlikely(t->exit_state)) { 530 if (!unlikely(t->flags & PF_EXITING)) {
534 ns = t->sched_time + nsleft; 531 ns = t->sched_time + nsleft;
535 if (t->it_sched_expires == 0 || 532 if (t->it_sched_expires == 0 ||
536 t->it_sched_expires > ns) { 533 t->it_sched_expires > ns) {
@@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
569 struct cpu_timer_list *next; 566 struct cpu_timer_list *next;
570 unsigned long i; 567 unsigned long i;
571 568
569 if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
570 return;
571
572 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 572 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
573 p->cpu_timers : p->signal->cpu_timers); 573 p->cpu_timers : p->signal->cpu_timers);
574 head += CPUCLOCK_WHICH(timer->it_clock); 574 head += CPUCLOCK_WHICH(timer->it_clock);
@@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
579 listpos = head; 579 listpos = head;
580 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 580 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
581 list_for_each_entry(next, head, entry) { 581 list_for_each_entry(next, head, entry) {
582 if (next->expires.sched > nt->expires.sched) { 582 if (next->expires.sched > nt->expires.sched)
583 listpos = &next->entry;
584 break; 583 break;
585 } 584 listpos = &next->entry;
586 } 585 }
587 } else { 586 } else {
588 list_for_each_entry(next, head, entry) { 587 list_for_each_entry(next, head, entry) {
589 if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { 588 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
590 listpos = &next->entry;
591 break; 589 break;
592 } 590 listpos = &next->entry;
593 } 591 }
594 } 592 }
595 list_add(&nt->entry, listpos); 593 list_add(&nt->entry, listpos);
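The rewritten loops change where the insertion cursor stops: listpos is advanced past every timer that expires no later than the new one, and the loop breaks on the first later timer, so list_add(), which inserts after the given position, keeps the list in expiry order (the old cursor handling could leave the new timer on the wrong side of its neighbours). The idiom in isolation, on a plain integer-keyed node with illustrative names:

#include <linux/list.h>

struct sketch_timer {
        struct list_head entry;
        unsigned long long expires;
};

static void sorted_insert(struct list_head *head, struct sketch_timer *nt)
{
        struct list_head *pos = head;
        struct sketch_timer *next;

        list_for_each_entry(next, head, entry) {
                if (next->expires > nt->expires)
                        break;                  /* insert before this one */
                pos = &next->entry;             /* advance past earlier timers */
        }
        list_add(&nt->entry, pos);              /* list_add() inserts after 'pos' */
}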
@@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
733 * Disarm any old timer after extracting its expiry time. 731 * Disarm any old timer after extracting its expiry time.
734 */ 732 */
735 BUG_ON(!irqs_disabled()); 733 BUG_ON(!irqs_disabled());
734
735 ret = 0;
736 spin_lock(&p->sighand->siglock); 736 spin_lock(&p->sighand->siglock);
737 old_expires = timer->it.cpu.expires; 737 old_expires = timer->it.cpu.expires;
738 list_del_init(&timer->it.cpu.entry); 738 if (unlikely(timer->it.cpu.firing)) {
739 timer->it.cpu.firing = -1;
740 ret = TIMER_RETRY;
741 } else
742 list_del_init(&timer->it.cpu.entry);
739 spin_unlock(&p->sighand->siglock); 743 spin_unlock(&p->sighand->siglock);
740 744
741 /* 745 /*
@@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
783 } 787 }
784 } 788 }
785 789
786 if (unlikely(timer->it.cpu.firing)) { 790 if (unlikely(ret)) {
787 /* 791 /*
788 * We are colliding with the timer actually firing. 792 * We are colliding with the timer actually firing.
789 * Punt after filling in the timer's old value, and 793 * Punt after filling in the timer's old value, and
@@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 * it as an overrun (thanks to bump_cpu_timer above). 795 * it as an overrun (thanks to bump_cpu_timer above).
792 */ 796 */
793 read_unlock(&tasklist_lock); 797 read_unlock(&tasklist_lock);
794 timer->it.cpu.firing = -1;
795 ret = TIMER_RETRY;
796 goto out; 798 goto out;
797 } 799 }
798 800
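
[Editor's sketch] The posix_cpu_timer_set() hunks move the "timer is currently firing" check under the siglock: instead of unconditionally unlinking the timer, the setter marks it with firing = -1 and reports TIMER_RETRY so the caller backs off and tries again once the expiry path has finished. A compressed model of that handshake under an assumed lock (the constants mirror the patch, the surrounding scaffolding is invented for illustration):

        #include <stdio.h>

        #define TIMER_RETRY 1

        struct cpu_timer {
                int firing;     /* > 0 while the expiry path is delivering it */
                int queued;     /* still on the expiry list?                  */
                unsigned long expires;
        };

        /* Called with the (assumed) siglock held.  If the timer is in the
         * middle of firing we must not unlink it; instead tell the firing
         * code to drop it (firing = -1) and ask our caller to retry. */
        static int disarm_timer(struct cpu_timer *t)
        {
                if (t->firing) {
                        t->firing = -1;
                        return TIMER_RETRY;
                }
                t->queued = 0;          /* models list_del_init() */
                return 0;
        }

        int main(void)
        {
                struct cpu_timer idle = { 0, 1, 100 }, busy = { 1, 1, 100 };

                printf("idle: %d busy: %d\n",
                       disarm_timer(&idle), disarm_timer(&busy));
                return 0;       /* prints "idle: 0 busy: 1" */
        }
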
@@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
958static void check_thread_timers(struct task_struct *tsk, 960static void check_thread_timers(struct task_struct *tsk,
959 struct list_head *firing) 961 struct list_head *firing)
960{ 962{
963 int maxfire;
961 struct list_head *timers = tsk->cpu_timers; 964 struct list_head *timers = tsk->cpu_timers;
962 965
966 maxfire = 20;
963 tsk->it_prof_expires = cputime_zero; 967 tsk->it_prof_expires = cputime_zero;
964 while (!list_empty(timers)) { 968 while (!list_empty(timers)) {
965 struct cpu_timer_list *t = list_entry(timers->next, 969 struct cpu_timer_list *t = list_entry(timers->next,
966 struct cpu_timer_list, 970 struct cpu_timer_list,
967 entry); 971 entry);
968 if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 972 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
969 tsk->it_prof_expires = t->expires.cpu; 973 tsk->it_prof_expires = t->expires.cpu;
970 break; 974 break;
971 } 975 }
@@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk,
974 } 978 }
975 979
976 ++timers; 980 ++timers;
981 maxfire = 20;
977 tsk->it_virt_expires = cputime_zero; 982 tsk->it_virt_expires = cputime_zero;
978 while (!list_empty(timers)) { 983 while (!list_empty(timers)) {
979 struct cpu_timer_list *t = list_entry(timers->next, 984 struct cpu_timer_list *t = list_entry(timers->next,
980 struct cpu_timer_list, 985 struct cpu_timer_list,
981 entry); 986 entry);
982 if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 987 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
983 tsk->it_virt_expires = t->expires.cpu; 988 tsk->it_virt_expires = t->expires.cpu;
984 break; 989 break;
985 } 990 }
@@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk,
988 } 993 }
989 994
990 ++timers; 995 ++timers;
996 maxfire = 20;
991 tsk->it_sched_expires = 0; 997 tsk->it_sched_expires = 0;
992 while (!list_empty(timers)) { 998 while (!list_empty(timers)) {
993 struct cpu_timer_list *t = list_entry(timers->next, 999 struct cpu_timer_list *t = list_entry(timers->next,
994 struct cpu_timer_list, 1000 struct cpu_timer_list,
995 entry); 1001 entry);
996 if (tsk->sched_time < t->expires.sched) { 1002 if (!--maxfire || tsk->sched_time < t->expires.sched) {
997 tsk->it_sched_expires = t->expires.sched; 1003 tsk->it_sched_expires = t->expires.sched;
998 break; 1004 break;
999 } 1005 }
@@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk,
1010static void check_process_timers(struct task_struct *tsk, 1016static void check_process_timers(struct task_struct *tsk,
1011 struct list_head *firing) 1017 struct list_head *firing)
1012{ 1018{
1019 int maxfire;
1013 struct signal_struct *const sig = tsk->signal; 1020 struct signal_struct *const sig = tsk->signal;
1014 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1021 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1015 unsigned long long sched_time, sched_expires; 1022 unsigned long long sched_time, sched_expires;
@@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk,
1042 } while (t != tsk); 1049 } while (t != tsk);
1043 ptime = cputime_add(utime, stime); 1050 ptime = cputime_add(utime, stime);
1044 1051
1052 maxfire = 20;
1045 prof_expires = cputime_zero; 1053 prof_expires = cputime_zero;
1046 while (!list_empty(timers)) { 1054 while (!list_empty(timers)) {
1047 struct cpu_timer_list *t = list_entry(timers->next, 1055 struct cpu_timer_list *t = list_entry(timers->next,
1048 struct cpu_timer_list, 1056 struct cpu_timer_list,
1049 entry); 1057 entry);
1050 if (cputime_lt(ptime, t->expires.cpu)) { 1058 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
1051 prof_expires = t->expires.cpu; 1059 prof_expires = t->expires.cpu;
1052 break; 1060 break;
1053 } 1061 }
@@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk,
1056 } 1064 }
1057 1065
1058 ++timers; 1066 ++timers;
1067 maxfire = 20;
1059 virt_expires = cputime_zero; 1068 virt_expires = cputime_zero;
1060 while (!list_empty(timers)) { 1069 while (!list_empty(timers)) {
1061 struct cpu_timer_list *t = list_entry(timers->next, 1070 struct cpu_timer_list *t = list_entry(timers->next,
1062 struct cpu_timer_list, 1071 struct cpu_timer_list,
1063 entry); 1072 entry);
1064 if (cputime_lt(utime, t->expires.cpu)) { 1073 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
1065 virt_expires = t->expires.cpu; 1074 virt_expires = t->expires.cpu;
1066 break; 1075 break;
1067 } 1076 }
@@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk,
1070 } 1079 }
1071 1080
1072 ++timers; 1081 ++timers;
1082 maxfire = 20;
1073 sched_expires = 0; 1083 sched_expires = 0;
1074 while (!list_empty(timers)) { 1084 while (!list_empty(timers)) {
1075 struct cpu_timer_list *t = list_entry(timers->next, 1085 struct cpu_timer_list *t = list_entry(timers->next,
1076 struct cpu_timer_list, 1086 struct cpu_timer_list,
1077 entry); 1087 entry);
1078 if (sched_time < t->expires.sched) { 1088 if (!--maxfire || sched_time < t->expires.sched) {
1079 sched_expires = t->expires.sched; 1089 sched_expires = t->expires.sched;
1080 break; 1090 break;
1081 } 1091 }
@@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk,
1158 unsigned long long sched_left, sched; 1168 unsigned long long sched_left, sched;
1159 const unsigned int nthreads = atomic_read(&sig->live); 1169 const unsigned int nthreads = atomic_read(&sig->live);
1160 1170
1171 if (!nthreads)
1172 return;
1173
1161 prof_left = cputime_sub(prof_expires, utime); 1174 prof_left = cputime_sub(prof_expires, utime);
1162 prof_left = cputime_sub(prof_left, stime); 1175 prof_left = cputime_sub(prof_left, stime);
1163 prof_left = cputime_div(prof_left, nthreads); 1176 prof_left = cputime_div(prof_left, nthreads);
@@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk,
1194 1207
1195 do { 1208 do {
1196 t = next_thread(t); 1209 t = next_thread(t);
1197 } while (unlikely(t->exit_state)); 1210 } while (unlikely(t->flags & PF_EXITING));
1198 } while (t != tsk); 1211 } while (t != tsk);
1199 } 1212 }
1200} 1213}
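
[Editor's sketch] The check_thread_timers()/check_process_timers() hunks bound each tick's work with a maxfire counter: only a limited batch of queued timers is moved to the firing list per pass, and the first timer that is refused becomes the next wake-up target, so an enormous backlog cannot stall the tick. A standalone model of that cap over a sorted expiry array (the counter value of 20 comes from the patch; everything else is illustrative):

        #include <stdio.h>

        /* Walk a sorted expiry list and "fire" entries that are already due,
         * but stop once the maxfire budget runs out; the first entry we
         * refuse to fire becomes the new wake-up target, exactly like the
         * it_*_expires updates in the patch. */
        static unsigned long check_timers(const unsigned long *expires, int n,
                                          unsigned long now, int *fired)
        {
                int maxfire = 20;

                *fired = 0;
                for (int i = 0; i < n; i++) {
                        if (!--maxfire || now < expires[i])
                                return expires[i];      /* next expiry to arm */
                        (*fired)++;                     /* goes on "firing" list */
                }
                return 0;                               /* nothing left queued */
        }

        int main(void)
        {
                unsigned long q[40];
                int fired;

                for (int i = 0; i < 40; i++)
                        q[i] = i + 1;           /* all already expired at t = 100 */

                unsigned long next = check_timers(q, 40, 100, &fired);
                printf("fired %d, re-arm at %lu\n", fired, next);  /* fired 19, re-arm at 20 */
                return 0;
        }
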
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 38798a2ff994..dda3cda73c77 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
427 timr->sigq->info.si_code = SI_TIMER; 427 timr->sigq->info.si_code = SI_TIMER;
428 timr->sigq->info.si_tid = timr->it_id; 428 timr->sigq->info.si_tid = timr->it_id;
429 timr->sigq->info.si_value = timr->it_sigev_value; 429 timr->sigq->info.si_value = timr->it_sigev_value;
430
430 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 431 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
431 if (unlikely(timr->it_process->flags & PF_EXITING)) { 432 struct task_struct *leader;
432 timr->it_sigev_notify = SIGEV_SIGNAL; 433 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
433 put_task_struct(timr->it_process); 434 timr->it_process);
434 timr->it_process = timr->it_process->group_leader; 435
435 goto group; 436 if (likely(ret >= 0))
436 } 437 return ret;
437 return send_sigqueue(timr->it_sigev_signo, timr->sigq, 438
438 timr->it_process); 439 timr->it_sigev_notify = SIGEV_SIGNAL;
439 } 440 leader = timr->it_process->group_leader;
440 else { 441 put_task_struct(timr->it_process);
441 group: 442 timr->it_process = leader;
442 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
443 timr->it_process);
444 } 443 }
444
445 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
446 timr->it_process);
445} 447}
446EXPORT_SYMBOL_GPL(posix_timer_event); 448EXPORT_SYMBOL_GPL(posix_timer_event);
447 449
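
[Editor's sketch] The posix_timer_event() rewrite tries the thread-directed delivery first and only falls back to the group leader when send_sigqueue() actually fails, instead of sampling PF_EXITING up front (which was racy). A model of that "try the specific target, fall back to the group" shape with stubbed-out delivery functions (the stubs and return codes are invented for illustration):

        #include <stdio.h>

        /* Stand-ins for send_sigqueue()/send_group_sigqueue(); in this model
         * a negative return means the targeted thread could not take the
         * signal (e.g. it is exiting). */
        static int send_to_thread(int alive) { return alive ? 0 : -1; }
        static int send_to_group(void)       { return 0; }

        /* Returns 1 if the specific thread got the signal, 2 if delivery
         * fell through to the whole group. */
        static int deliver(int thread_directed, int thread_alive)
        {
                if (thread_directed) {
                        int ret = send_to_thread(thread_alive);

                        if (ret >= 0)
                                return 1;
                        /* thread is gone: retarget to the group below */
                }
                return send_to_group() == 0 ? 2 : -1;
        }

        int main(void)
        {
                printf("%d %d %d\n",
                       deliver(1, 1),   /* thread delivery works          */
                       deliver(1, 0),   /* thread gone -> group fallback  */
                       deliver(0, 0));  /* group-directed from the start  */
                return 0;               /* prints "1 2 2" */
        }
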
@@ -1155,7 +1157,7 @@ retry_delete:
1155} 1157}
1156 1158
1157/* 1159/*
1158 * This is called by __exit_signal, only when there are no more 1160 * This is called by do_exit or de_thread, only when there are no more
1159 * references to the shared signal_struct. 1161 * references to the shared signal_struct.
1160 */ 1162 */
1161void exit_itimers(struct signal_struct *sig) 1163void exit_itimers(struct signal_struct *sig)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 917066a5767c..46a5e5acff97 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,5 +1,6 @@
1config PM 1config PM
2 bool "Power Management support" 2 bool "Power Management support"
3 depends on !IA64_HP_SIM
3 ---help--- 4 ---help---
4 "Power Management" means that parts of your computer are shut 5 "Power Management" means that parts of your computer are shut
5 off or put into a power conserving "sleep" mode if they are not 6 off or put into a power conserving "sleep" mode if they are not
@@ -28,7 +29,7 @@ config PM_DEBUG
28 29
29config SOFTWARE_SUSPEND 30config SOFTWARE_SUSPEND
30 bool "Software Suspend" 31 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) 32 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP)
32 ---help--- 33 ---help---
33 Enable the possibility of suspending the machine. 34 Enable the possibility of suspending the machine.
34 It doesn't need APM. 35 It doesn't need APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d8bf054d036..761956e813f5 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -17,12 +17,12 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h>
20 21
21#include "power.h" 22#include "power.h"
22 23
23 24
24extern suspend_disk_method_t pm_disk_mode; 25extern suspend_disk_method_t pm_disk_mode;
25extern struct pm_ops * pm_ops;
26 26
27extern int swsusp_suspend(void); 27extern int swsusp_suspend(void);
28extern int swsusp_write(void); 28extern int swsusp_write(void);
@@ -49,13 +49,11 @@ dev_t swsusp_resume_device;
49 49
50static void power_down(suspend_disk_method_t mode) 50static void power_down(suspend_disk_method_t mode)
51{ 51{
52 unsigned long flags;
53 int error = 0; 52 int error = 0;
54 53
55 local_irq_save(flags);
56 switch(mode) { 54 switch(mode) {
57 case PM_DISK_PLATFORM: 55 case PM_DISK_PLATFORM:
58 device_shutdown(); 56 kernel_power_off_prepare();
59 error = pm_ops->enter(PM_SUSPEND_DISK); 57 error = pm_ops->enter(PM_SUSPEND_DISK);
60 break; 58 break;
61 case PM_DISK_SHUTDOWN: 59 case PM_DISK_SHUTDOWN:
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id, 60 unsigned long id,
61 pm_callback callback) 61 pm_callback callback)
62{ 62{
63 struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); 63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) { 64 if (dev) {
65 memset(dev, 0, sizeof(*dev));
66 dev->type = type; 65 dev->type = type;
67 dev->id = id; 66 dev->id = id;
68 dev->callback = callback; 67 dev->callback = callback;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index cd6a3493cc0d..6748de23e83c 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,7 +1,7 @@
1#include <linux/suspend.h> 1#include <linux/suspend.h>
2#include <linux/utsname.h> 2#include <linux/utsname.h>
3 3
4/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but 4/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but
5 we probably do not take enough locks for switching consoles, etc, 5 we probably do not take enough locks for switching consoles, etc,
6 so bad things might happen. 6 so bad things might happen.
7*/ 7*/
@@ -9,6 +9,9 @@
9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
10#endif 10#endif
11 11
12#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \
13 - 4 - 3*sizeof(unsigned long) - sizeof(int) \
14 - sizeof(void *)) / sizeof(swp_entry_t))
12 15
13struct swsusp_info { 16struct swsusp_info {
14 struct new_utsname uts; 17 struct new_utsname uts;
@@ -18,7 +21,7 @@ struct swsusp_info {
18 unsigned long image_pages; 21 unsigned long image_pages;
19 unsigned long pagedir_pages; 22 unsigned long pagedir_pages;
20 suspend_pagedir_t * suspend_pagedir; 23 suspend_pagedir_t * suspend_pagedir;
21 swp_entry_t pagedir[768]; 24 swp_entry_t pagedir[MAX_PBES];
22} __attribute__((aligned(PAGE_SIZE))); 25} __attribute__((aligned(PAGE_SIZE)));
23 26
24 27
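
[Editor's sketch] MAX_PBES sizes the pagedir[] array to whatever is left of the page-aligned swsusp_info header once its fixed fields are accounted for, replacing the hard-coded 768. A userspace check of the same arithmetic with stand-in sizes (PAGE_SIZE, the utsname size and swp_entry_t size are illustrative; the real values are architecture dependent):

        #include <stdio.h>

        int main(void)
        {
                /* Illustrative stand-ins for the kernel types/constants. */
                const unsigned long PAGE_SIZE    = 4096;
                const unsigned long UTSNAME_SZ   = 390; /* ~struct new_utsname */
                const unsigned long SWP_ENTRY_SZ = sizeof(unsigned long);

                /* Mirrors the MAX_PBES expression: page minus the header
                 * fields, divided by the size of one pagedir entry. */
                unsigned long max_pbes =
                        (PAGE_SIZE - UTSNAME_SZ - 4 - 3 * sizeof(unsigned long)
                         - sizeof(int) - sizeof(void *)) / SWP_ENTRY_SZ;

                printf("MAX_PBES would be %lu on this layout\n", max_pbes);
                return 0;
        }
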
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index eaacd5cb5889..2d5c45676442 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -363,7 +363,7 @@ static void lock_swapdevices(void)
363} 363}
364 364
365/** 365/**
366 * write_swap_page - Write one page to a fresh swap location. 366 * write_page - Write one page to a fresh swap location.
367 * @addr: Address we're writing. 367 * @addr: Address we're writing.
368 * @loc: Place to store the entry we used. 368 * @loc: Place to store the entry we used.
369 * 369 *
@@ -402,15 +402,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
402static void data_free(void) 402static void data_free(void)
403{ 403{
404 swp_entry_t entry; 404 swp_entry_t entry;
405 int i; 405 struct pbe * p;
406 406
407 for (i = 0; i < nr_copy_pages; i++) { 407 for_each_pbe(p, pagedir_nosave) {
408 entry = (pagedir_nosave + i)->swap_address; 408 entry = p->swap_address;
409 if (entry.val) 409 if (entry.val)
410 swap_free(entry); 410 swap_free(entry);
411 else 411 else
412 break; 412 break;
413 (pagedir_nosave + i)->swap_address = (swp_entry_t){0};
414 } 413 }
415} 414}
416 415
@@ -863,6 +862,9 @@ static int alloc_image_pages(void)
863 return 0; 862 return 0;
864} 863}
865 864
865/* Free pages we allocated for suspend. Suspend pages are alocated
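 866 * (note: "alocated" in the original comment is a typo for "allocated")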
866 * before atomic copy, so we need to free them after resume.
867 */
866void swsusp_free(void) 868void swsusp_free(void)
867{ 869{
868 BUG_ON(PageNosave(virt_to_page(pagedir_save))); 870 BUG_ON(PageNosave(virt_to_page(pagedir_save)));
@@ -918,6 +920,7 @@ static int swsusp_alloc(void)
918 920
919 pagedir_nosave = NULL; 921 pagedir_nosave = NULL;
920 nr_copy_pages = calc_nr(nr_copy_pages); 922 nr_copy_pages = calc_nr(nr_copy_pages);
923 nr_copy_pages_check = nr_copy_pages;
921 924
922 pr_debug("suspend: (pages needed: %d + %d free: %d)\n", 925 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
923 nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); 926 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
@@ -928,6 +931,10 @@ static int swsusp_alloc(void)
928 if (!enough_swap()) 931 if (!enough_swap())
929 return -ENOSPC; 932 return -ENOSPC;
930 933
934 if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
935 !!(nr_copy_pages % PBES_PER_PAGE))
936 return -ENOSPC;
937
931 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { 938 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
932 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 939 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
933 return -ENOMEM; 940 return -ENOMEM;
@@ -940,7 +947,6 @@ static int swsusp_alloc(void)
940 return error; 947 return error;
941 } 948 }
942 949
943 nr_copy_pages_check = nr_copy_pages;
944 return 0; 950 return 0;
945} 951}
946 952
@@ -1059,6 +1065,7 @@ int swsusp_resume(void)
1059 BUG_ON(!error); 1065 BUG_ON(!error);
1060 restore_processor_state(); 1066 restore_processor_state();
1061 restore_highmem(); 1067 restore_highmem();
1068 touch_softlockup_watchdog();
1062 device_power_up(); 1069 device_power_up();
1063 local_irq_enable(); 1070 local_irq_enable();
1064 return error; 1071 return error;
@@ -1088,7 +1095,7 @@ static inline void eat_page(void *page)
1088 *eaten_memory = c; 1095 *eaten_memory = c;
1089} 1096}
1090 1097
1091static unsigned long get_usable_page(unsigned gfp_mask) 1098unsigned long get_usable_page(unsigned gfp_mask)
1092{ 1099{
1093 unsigned long m; 1100 unsigned long m;
1094 1101
@@ -1102,7 +1109,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
1102 return m; 1109 return m;
1103} 1110}
1104 1111
1105static void free_eaten_memory(void) 1112void free_eaten_memory(void)
1106{ 1113{
1107 unsigned long m; 1114 unsigned long m;
1108 void **c; 1115 void **c;
@@ -1212,8 +1219,9 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1212 free_pagedir(pblist); 1219 free_pagedir(pblist);
1213 free_eaten_memory(); 1220 free_eaten_memory();
1214 pblist = NULL; 1221 pblist = NULL;
1215 } 1222 /* Is this even worth handling? It should never ever happen, and we
1216 else 1223 have just lost user's state, anyway... */
1224 } else
1217 printk("swsusp: Relocated %d pages\n", rel); 1225 printk("swsusp: Relocated %d pages\n", rel);
1218 1226
1219 return pblist; 1227 return pblist;
@@ -1433,9 +1441,9 @@ static int read_pagedir(struct pbe *pblist)
1433 } 1441 }
1434 1442
1435 if (error) 1443 if (error)
1436 free_page((unsigned long)pblist); 1444 free_pagedir(pblist);
1437 1445 else
1438 BUG_ON(i != swsusp_info.pagedir_pages); 1446 BUG_ON(i != swsusp_info.pagedir_pages);
1439 1447
1440 return error; 1448 return error;
1441} 1449}
@@ -1473,11 +1481,12 @@ static int read_suspend_image(void)
1473 /* Allocate memory for the image and read the data from swap */ 1481 /* Allocate memory for the image and read the data from swap */
1474 1482
1475 error = check_pagedir(pagedir_nosave); 1483 error = check_pagedir(pagedir_nosave);
1476 free_eaten_memory(); 1484
1477 if (!error) 1485 if (!error)
1478 error = data_read(pagedir_nosave); 1486 error = data_read(pagedir_nosave);
1479 1487
1480 if (error) { /* We fail cleanly */ 1488 if (error) { /* We fail cleanly */
1489 free_eaten_memory();
1481 for_each_pbe (p, pagedir_nosave) 1490 for_each_pbe (p, pagedir_nosave)
1482 if (p->address) { 1491 if (p->address) {
1483 free_page(p->address); 1492 free_page(p->address);
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..4b8f0f9230a4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -488,6 +488,11 @@ static int __init printk_time_setup(char *str)
488 488
489__setup("time", printk_time_setup); 489__setup("time", printk_time_setup);
490 490
491__attribute__((weak)) unsigned long long printk_clock(void)
492{
493 return sched_clock();
494}
495
491/* 496/*
492 * This is printk. It can be called from any context. We want it to work. 497 * This is printk. It can be called from any context. We want it to work.
493 * 498 *
@@ -514,6 +519,9 @@ asmlinkage int printk(const char *fmt, ...)
514 return r; 519 return r;
515} 520}
516 521
522/* cpu currently holding logbuf_lock */
523static volatile unsigned int printk_cpu = UINT_MAX;
524
517asmlinkage int vprintk(const char *fmt, va_list args) 525asmlinkage int vprintk(const char *fmt, va_list args)
518{ 526{
519 unsigned long flags; 527 unsigned long flags;
@@ -522,11 +530,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
522 static char printk_buf[1024]; 530 static char printk_buf[1024];
523 static int log_level_unknown = 1; 531 static int log_level_unknown = 1;
524 532
525 if (unlikely(oops_in_progress)) 533 preempt_disable();
534 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
535 /* If a crash is occurring during printk() on this CPU,
536 * make sure we can't deadlock */
526 zap_locks(); 537 zap_locks();
527 538
528 /* This stops the holder of console_sem just where we want him */ 539 /* This stops the holder of console_sem just where we want him */
529 spin_lock_irqsave(&logbuf_lock, flags); 540 spin_lock_irqsave(&logbuf_lock, flags);
541 printk_cpu = smp_processor_id();
530 542
531 /* Emit the output into the temporary buffer */ 543 /* Emit the output into the temporary buffer */
532 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 544 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -558,7 +570,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
558 loglev_char = default_message_loglevel 570 loglev_char = default_message_loglevel
559 + '0'; 571 + '0';
560 } 572 }
561 t = sched_clock(); 573 t = printk_clock();
562 nanosec_rem = do_div(t, 1000000000); 574 nanosec_rem = do_div(t, 1000000000);
563 tlen = sprintf(tbuf, 575 tlen = sprintf(tbuf,
564 "<%c>[%5lu.%06lu] ", 576 "<%c>[%5lu.%06lu] ",
@@ -595,6 +607,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 * CPU until it is officially up. We shouldn't be calling into 607 * CPU until it is officially up. We shouldn't be calling into
596 * random console drivers on a CPU which doesn't exist yet.. 608 * random console drivers on a CPU which doesn't exist yet..
597 */ 609 */
610 printk_cpu = UINT_MAX;
598 spin_unlock_irqrestore(&logbuf_lock, flags); 611 spin_unlock_irqrestore(&logbuf_lock, flags);
599 goto out; 612 goto out;
600 } 613 }
@@ -604,6 +617,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
604 * We own the drivers. We can drop the spinlock and let 617 * We own the drivers. We can drop the spinlock and let
605 * release_console_sem() print the text 618 * release_console_sem() print the text
606 */ 619 */
620 printk_cpu = UINT_MAX;
607 spin_unlock_irqrestore(&logbuf_lock, flags); 621 spin_unlock_irqrestore(&logbuf_lock, flags);
608 console_may_schedule = 0; 622 console_may_schedule = 0;
609 release_console_sem(); 623 release_console_sem();
@@ -613,9 +627,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
613 * allows the semaphore holder to proceed and to call the 627 * allows the semaphore holder to proceed and to call the
614 * console drivers with the output which we just produced. 628 * console drivers with the output which we just produced.
615 */ 629 */
630 printk_cpu = UINT_MAX;
616 spin_unlock_irqrestore(&logbuf_lock, flags); 631 spin_unlock_irqrestore(&logbuf_lock, flags);
617 } 632 }
618out: 633out:
634 preempt_enable();
619 return printed_len; 635 return printed_len;
620} 636}
621EXPORT_SYMBOL(printk); 637EXPORT_SYMBOL(printk);
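
[Editor's sketch] The printk() changes record which CPU currently owns logbuf_lock in printk_cpu and only call zap_locks() when an oops is in progress on that same CPU, i.e. when printk has recursed into itself while holding the lock; any other CPU just waits as usual. A single-threaded model of that "am I already the lock owner?" test (the flag-based lock and names are stand-ins, not the kernel's spinlock):

        #include <stdio.h>

        #define NO_OWNER -1

        static int logbuf_locked;
        static int printk_cpu = NO_OWNER;  /* "cpu" currently holding the lock */

        static void zap_locks(void) { logbuf_locked = 0; }

        /* Model of the patched vprintk() entry: break the log lock only when
         * the oops is happening on the CPU that already owns it -- the one
         * case where waiting would deadlock. */
        static void vprintk_model(int cpu, int oops_in_progress)
        {
                if (oops_in_progress && printk_cpu == cpu)
                        zap_locks();

                while (logbuf_locked)
                        ;                       /* spin_lock_irqsave() stand-in */
                logbuf_locked = 1;
                printk_cpu = cpu;

                /* ... format and store the message ... */

                printk_cpu = NO_OWNER;
                logbuf_locked = 0;              /* spin_unlock_irqrestore() stand-in */
        }

        int main(void)
        {
                vprintk_model(0, 0);            /* normal path */

                /* Simulate recursion: cpu 0 oopses while it holds the lock. */
                logbuf_locked = 1;
                printk_cpu = 0;
                vprintk_model(0, 1);            /* zaps the lock instead of spinning */

                printf("no deadlock\n");
                return 0;
        }
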
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
118 return ret; 118 return ret;
119} 119}
120 120
121static int may_attach(struct task_struct *task)
122{
123 if (!task->mm)
124 return -EPERM;
125 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) ||
127 (current->uid != task->uid) ||
128 (current->gid != task->egid) ||
129 (current->gid != task->sgid) ||
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM;
132 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM;
135
136 return security_ptrace(current, task);
137}
138
139int ptrace_may_attach(struct task_struct *task)
140{
141 int err;
142 task_lock(task);
143 err = may_attach(task);
144 task_unlock(task);
145 return !err;
146}
147
121int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
122{ 149{
123 int retval; 150 int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
127 goto bad; 154 goto bad;
128 if (task == current) 155 if (task == current)
129 goto bad; 156 goto bad;
130 if (!task->mm)
131 goto bad;
132 if(((current->uid != task->euid) ||
133 (current->uid != task->suid) ||
134 (current->uid != task->uid) ||
135 (current->gid != task->egid) ||
136 (current->gid != task->sgid) ||
137 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
138 goto bad;
139 smp_rmb();
140 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
141 goto bad;
142 /* the same process cannot be attached many times */ 157 /* the same process cannot be attached many times */
143 if (task->ptrace & PT_PTRACED) 158 if (task->ptrace & PT_PTRACED)
144 goto bad; 159 goto bad;
145 retval = security_ptrace(current, task); 160 retval = may_attach(task);
146 if (retval) 161 if (retval)
147 goto bad; 162 goto bad;
148 163
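
[Editor's sketch] The ptrace hunk hoists the permission checks out of ptrace_attach() into may_attach(), and adds a ptrace_may_attach() wrapper that takes the task lock and collapses the result to a boolean, so other callers can reuse the same policy without duplicating it. A toy version of that "one policy function, two entry points" refactor (the credential check, struct and locking are stand-ins):

        #include <pthread.h>
        #include <stdio.h>

        struct task {
                pthread_mutex_t lock;
                int uid;
                int dumpable;
        };

        /* The single place the policy lives; returns 0 or an -EPERM-style
         * error.  Callers are expected to hold task->lock. */
        static int may_attach(const struct task *tracer, const struct task *task)
        {
                if (tracer->uid != task->uid && tracer->uid != 0)
                        return -1;
                if (!task->dumpable && tracer->uid != 0)
                        return -1;
                return 0;
        }

        /* Boolean convenience wrapper, mirroring ptrace_may_attach(). */
        static int ptrace_may_attach(struct task *tracer, struct task *task)
        {
                int err;

                pthread_mutex_lock(&task->lock);
                err = may_attach(tracer, task);
                pthread_mutex_unlock(&task->lock);
                return !err;
        }

        int main(void)
        {
                struct task root  = { PTHREAD_MUTEX_INITIALIZER, 0, 1 };
                struct task user  = { PTHREAD_MUTEX_INITIALIZER, 1000, 1 };
                struct task other = { PTHREAD_MUTEX_INITIALIZER, 1001, 0 };

                printf("%d %d\n", ptrace_may_attach(&user, &other),   /* 0 */
                                  ptrace_may_attach(&root, &other));  /* 1 */
                return 0;
        }
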
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f436993bd590..2559d4b8f23f 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
45#include <linux/percpu.h> 45#include <linux/percpu.h>
46#include <linux/notifier.h> 46#include <linux/notifier.h>
47#include <linux/rcupdate.h> 47#include <linux/rcupdate.h>
48#include <linux/rcuref.h>
48#include <linux/cpu.h> 49#include <linux/cpu.h>
49 50
50/* Definition for rcupdate control block. */ 51/* Definition for rcupdate control block. */
@@ -70,7 +71,20 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
70 71
71/* Fake initialization required by compiler */ 72/* Fake initialization required by compiler */
72static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
73static int maxbatch = 10; 74static int maxbatch = 10000;
75
76#ifndef __HAVE_ARCH_CMPXCHG
77/*
78 * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
79 * 32 bit atomic_t implementations, and a hash function similar to that
80 * for our refcounting needs.
 82 * Can't help multiprocessors which do not have cmpxchg :(
82 */
83
84spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
85 [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
86};
87#endif
74 88
75/** 89/**
76 * call_rcu - Queue an RCU callback for invocation after a grace period. 90 * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -95,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head,
95 rdp = &__get_cpu_var(rcu_data); 109 rdp = &__get_cpu_var(rcu_data);
96 *rdp->nxttail = head; 110 *rdp->nxttail = head;
97 rdp->nxttail = &head->next; 111 rdp->nxttail = &head->next;
112
113 if (unlikely(++rdp->count > 10000))
114 set_need_resched();
115
98 local_irq_restore(flags); 116 local_irq_restore(flags);
99} 117}
100 118
@@ -126,6 +144,12 @@ void fastcall call_rcu_bh(struct rcu_head *head,
126 rdp = &__get_cpu_var(rcu_bh_data); 144 rdp = &__get_cpu_var(rcu_bh_data);
127 *rdp->nxttail = head; 145 *rdp->nxttail = head;
128 rdp->nxttail = &head->next; 146 rdp->nxttail = &head->next;
147 rdp->count++;
148/*
149 * Should we directly call rcu_do_batch() here ?
150 * if (unlikely(rdp->count > 10000))
151 * rcu_do_batch(rdp);
152 */
129 local_irq_restore(flags); 153 local_irq_restore(flags);
130} 154}
131 155
@@ -143,6 +167,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
143 next = rdp->donelist = list->next; 167 next = rdp->donelist = list->next;
144 list->func(list); 168 list->func(list);
145 list = next; 169 list = next;
170 rdp->count--;
146 if (++count >= maxbatch) 171 if (++count >= maxbatch)
147 break; 172 break;
148 } 173 }
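
[Editor's sketch] On SMP machines without a usable cmpxchg, the new rcuref code falls back to an array of spinlocks: the counter's address is hashed to pick one of RCUREF_HASH_SIZE locks, and the "atomic" operation runs under that lock. A userspace model of the hashed-lock trick protecting plain reference counts (the hash function, sizes and pthread locks are illustrative):

        #include <pthread.h>
        #include <stdint.h>
        #include <stdio.h>

        #define HASH_SIZE 16

        static pthread_mutex_t ref_hash[HASH_SIZE];

        /* Pick a lock based on the counter's address, like __rcuref_hash:
         * unrelated counters usually map to different locks, so some
         * concurrency remains even without a hardware cmpxchg. */
        static pthread_mutex_t *lock_for(const void *addr)
        {
                uintptr_t h = (uintptr_t)addr >> 4;     /* crude hash */

                return &ref_hash[h % HASH_SIZE];
        }

        static void ref_get(int *ref)
        {
                pthread_mutex_t *l = lock_for(ref);

                pthread_mutex_lock(l);
                (*ref)++;
                pthread_mutex_unlock(l);
        }

        static int ref_put(int *ref)            /* returns 1 when it hit zero */
        {
                pthread_mutex_t *l = lock_for(ref);
                int zero;

                pthread_mutex_lock(l);
                zero = (--*ref == 0);
                pthread_mutex_unlock(l);
                return zero;
        }

        int main(void)
        {
                int refcount = 1;

                for (int i = 0; i < HASH_SIZE; i++)
                        pthread_mutex_init(&ref_hash[i], NULL);

                ref_get(&refcount);
                int a = ref_put(&refcount);
                int b = ref_put(&refcount);
                printf("%d %d\n", a, b);        /* 0 1 */
                return 0;
        }
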
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
430 */ 430 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
432{ 432{
433 struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); 433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 434
435 if (res) { 435 if (res) {
436 memset(res, 0, sizeof(*res));
437 res->name = name; 436 res->name = name;
438 res->start = start; 437 res->start = start;
439 res->end = start + n - 1; 438 res->end = start + n - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..1e5cafdf4e27 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
294 294
295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
296{ 296{
297#ifdef CONFIG_DEBUG_SPINLOCK
298 /* this is a valid case when another task releases the spinlock */
299 rq->lock.owner = current;
300#endif
297 spin_unlock_irq(&rq->lock); 301 spin_unlock_irq(&rq->lock);
298} 302}
299 303
@@ -875,7 +879,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
875 * smp_call_function() if an IPI is sent by the same process we are 879 * smp_call_function() if an IPI is sent by the same process we are
876 * waiting to become inactive. 880 * waiting to become inactive.
877 */ 881 */
878void wait_task_inactive(task_t * p) 882void wait_task_inactive(task_t *p)
879{ 883{
880 unsigned long flags; 884 unsigned long flags;
881 runqueue_t *rq; 885 runqueue_t *rq;
@@ -966,8 +970,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
966 int local_group; 970 int local_group;
967 int i; 971 int i;
968 972
973 /* Skip over this group if it has no CPUs allowed */
974 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
975 goto nextgroup;
976
969 local_group = cpu_isset(this_cpu, group->cpumask); 977 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971 978
972 /* Tally up the load of all CPUs in the group */ 979 /* Tally up the load of all CPUs in the group */
973 avg_load = 0; 980 avg_load = 0;
@@ -992,6 +999,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
992 min_load = avg_load; 999 min_load = avg_load;
993 idlest = group; 1000 idlest = group;
994 } 1001 }
1002nextgroup:
995 group = group->next; 1003 group = group->next;
996 } while (group != sd->groups); 1004 } while (group != sd->groups);
997 1005
@@ -1003,13 +1011,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1003/* 1011/*
1004 * find_idlest_queue - find the idlest runqueue among the cpus in group. 1012 * find_idlest_queue - find the idlest runqueue among the cpus in group.
1005 */ 1013 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu) 1014static int
1015find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1007{ 1016{
1017 cpumask_t tmp;
1008 unsigned long load, min_load = ULONG_MAX; 1018 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1; 1019 int idlest = -1;
1010 int i; 1020 int i;
1011 1021
1012 for_each_cpu_mask(i, group->cpumask) { 1022 /* Traverse only the allowed CPUs */
1023 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1024
1025 for_each_cpu_mask(i, tmp) {
1013 load = source_load(i, 0); 1026 load = source_load(i, 0);
1014 1027
1015 if (load < min_load || (load == min_load && i == this_cpu)) { 1028 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1052,7 +1065,7 @@ static int sched_balance_self(int cpu, int flag)
1052 if (!group) 1065 if (!group)
1053 goto nextlevel; 1066 goto nextlevel;
1054 1067
1055 new_cpu = find_idlest_cpu(group, cpu); 1068 new_cpu = find_idlest_cpu(group, t, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu) 1069 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel; 1070 goto nextlevel;
1058 1071
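
[Editor's sketch] find_idlest_group()/find_idlest_cpu() now honour the task's cpus_allowed mask: groups with no permitted CPU are skipped, and the per-CPU scan runs over the intersection of the group mask and the task mask. A bitmask model of that filtering, with a 64-bit word standing in for cpumask_t and a made-up load table:

        #include <stdint.h>
        #include <stdio.h>

        static const unsigned long load[8] = { 90, 10, 50, 5, 70, 30, 20, 40 };

        /* Scan only CPUs that are both in the group and allowed for the task,
         * mirroring the cpus_and(tmp, group->cpumask, p->cpus_allowed) step. */
        static int find_idlest_cpu(uint64_t group_mask, uint64_t allowed)
        {
                uint64_t tmp = group_mask & allowed;
                unsigned long min_load = ~0UL;
                int idlest = -1;

                for (int cpu = 0; cpu < 8; cpu++) {
                        if (!(tmp & (1ULL << cpu)))
                                continue;
                        if (load[cpu] < min_load) {
                                min_load = load[cpu];
                                idlest = cpu;
                        }
                }
                return idlest;  /* -1: no allowed CPU, skip this group */
        }

        int main(void)
        {
                printf("%d\n", find_idlest_cpu(0x0F, 0xFF));    /* cpu 3 (load 5)  */
                printf("%d\n", find_idlest_cpu(0x0F, 0xF4));    /* cpu 2 (load 50) */
                printf("%d\n", find_idlest_cpu(0x0F, 0xF0));    /* -1: group skipped */
                return 0;
        }
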
@@ -1127,7 +1140,7 @@ static inline int wake_idle(int cpu, task_t *p)
1127 * 1140 *
1128 * returns failure only if the task is already active. 1141 * returns failure only if the task is already active.
1129 */ 1142 */
1130static int try_to_wake_up(task_t * p, unsigned int state, int sync) 1143static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1131{ 1144{
1132 int cpu, this_cpu, success = 0; 1145 int cpu, this_cpu, success = 0;
1133 unsigned long flags; 1146 unsigned long flags;
@@ -1252,6 +1265,16 @@ out_activate:
1252 } 1265 }
1253 1266
1254 /* 1267 /*
1268 * Tasks that have marked their sleep as noninteractive get
1269 * woken up without updating their sleep average. (i.e. their
1270 * sleep is handled in a priority-neutral manner, no priority
1271 * boost and no penalty.)
1272 */
1273 if (old_state & TASK_NONINTERACTIVE)
1274 __activate_task(p, rq);
1275 else
1276 activate_task(p, rq, cpu == this_cpu);
1277 /*
1255 * Sync wakeups (i.e. those types of wakeups where the waker 1278 * Sync wakeups (i.e. those types of wakeups where the waker
1256 * has indicated that it will leave the CPU in short order) 1279 * has indicated that it will leave the CPU in short order)
1257 * don't trigger a preemption, if the woken up task will run on 1280 * don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1282,6 @@ out_activate:
1259 * the waker guarantees that the freshly woken up task is going 1282 * the waker guarantees that the freshly woken up task is going
1260 * to be considered on this CPU.) 1283 * to be considered on this CPU.)
1261 */ 1284 */
1262 activate_task(p, rq, cpu == this_cpu);
1263 if (!sync || cpu != this_cpu) { 1285 if (!sync || cpu != this_cpu) {
1264 if (TASK_PREEMPTS_CURR(p, rq)) 1286 if (TASK_PREEMPTS_CURR(p, rq))
1265 resched_task(rq->curr); 1287 resched_task(rq->curr);
@@ -1274,7 +1296,7 @@ out:
1274 return success; 1296 return success;
1275} 1297}
1276 1298
1277int fastcall wake_up_process(task_t * p) 1299int fastcall wake_up_process(task_t *p)
1278{ 1300{
1279 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1301 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1280 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1302 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1375,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1353 * that must be done for every newly created context, then puts the task 1375 * that must be done for every newly created context, then puts the task
1354 * on the runqueue and wakes it. 1376 * on the runqueue and wakes it.
1355 */ 1377 */
1356void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) 1378void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1357{ 1379{
1358 unsigned long flags; 1380 unsigned long flags;
1359 int this_cpu, cpu; 1381 int this_cpu, cpu;
@@ -1436,7 +1458,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1436 * artificially, because any timeslice recovered here 1458 * artificially, because any timeslice recovered here
1437 * was given away by the parent in the first place.) 1459 * was given away by the parent in the first place.)
1438 */ 1460 */
1439void fastcall sched_exit(task_t * p) 1461void fastcall sched_exit(task_t *p)
1440{ 1462{
1441 unsigned long flags; 1463 unsigned long flags;
1442 runqueue_t *rq; 1464 runqueue_t *rq;
@@ -1478,6 +1500,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1478 1500
1479/** 1501/**
1480 * finish_task_switch - clean up after a task-switch 1502 * finish_task_switch - clean up after a task-switch
1503 * @rq: runqueue associated with task-switch
1481 * @prev: the thread we just switched away from. 1504 * @prev: the thread we just switched away from.
1482 * 1505 *
1483 * finish_task_switch must be called after the context switch, paired 1506 * finish_task_switch must be called after the context switch, paired
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1752 */ 1775 */
1753static inline 1776static inline
1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1777int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned) 1778 struct sched_domain *sd, enum idle_type idle,
1779 int *all_pinned)
1756{ 1780{
1757 /* 1781 /*
1758 * We do not migrate tasks that are: 1782 * We do not migrate tasks that are:
@@ -1882,10 +1906,11 @@ out:
1882 */ 1906 */
1883static struct sched_group * 1907static struct sched_group *
1884find_busiest_group(struct sched_domain *sd, int this_cpu, 1908find_busiest_group(struct sched_domain *sd, int this_cpu,
1885 unsigned long *imbalance, enum idle_type idle) 1909 unsigned long *imbalance, enum idle_type idle, int *sd_idle)
1886{ 1910{
1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1911 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1912 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1913 unsigned long max_pull;
1889 int load_idx; 1914 int load_idx;
1890 1915
1891 max_load = this_load = total_load = total_pwr = 0; 1916 max_load = this_load = total_load = total_pwr = 0;
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1907 avg_load = 0; 1932 avg_load = 0;
1908 1933
1909 for_each_cpu_mask(i, group->cpumask) { 1934 for_each_cpu_mask(i, group->cpumask) {
1935 if (*sd_idle && !idle_cpu(i))
1936 *sd_idle = 0;
1937
1910 /* Bias balancing toward cpus of our domain */ 1938 /* Bias balancing toward cpus of our domain */
1911 if (local_group) 1939 if (local_group)
1912 load = target_load(i, load_idx); 1940 load = target_load(i, load_idx);
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1932 group = group->next; 1960 group = group->next;
1933 } while (group != sd->groups); 1961 } while (group != sd->groups);
1934 1962
1935 if (!busiest || this_load >= max_load) 1963 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
1936 goto out_balanced; 1964 goto out_balanced;
1937 1965
1938 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1966 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1952 * by pulling tasks to us. Be careful of negative numbers as they'll 1980 * by pulling tasks to us. Be careful of negative numbers as they'll
1953 * appear as very large values with unsigned longs. 1981 * appear as very large values with unsigned longs.
1954 */ 1982 */
1983
1984 /* Don't want to pull so many tasks that a group would go idle */
1985 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
1986
1955 /* How much load to actually move to equalise the imbalance */ 1987 /* How much load to actually move to equalise the imbalance */
1956 *imbalance = min((max_load - avg_load) * busiest->cpu_power, 1988 *imbalance = min(max_pull * busiest->cpu_power,
1957 (avg_load - this_load) * this->cpu_power) 1989 (avg_load - this_load) * this->cpu_power)
1958 / SCHED_LOAD_SCALE; 1990 / SCHED_LOAD_SCALE;
1959 1991
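
[Editor's sketch] The find_busiest_group() change clamps how much load a balance pass may pull: max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE), so the busiest group is never drained below one full unit of load, and the final imbalance is still the minimum of what the busiest group can give and what this group can take. A small numeric check of that formula (SCHED_LOAD_SCALE of 128, equal cpu_power on both sides and the sample loads are illustrative):

        #include <stdio.h>

        #define SCHED_LOAD_SCALE 128UL

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        /* cpu_power of both groups is taken as SCHED_LOAD_SCALE here so the
         * scaling cancels out and the numbers stay readable. */
        static unsigned long imbalance(unsigned long max_load,
                                       unsigned long avg_load,
                                       unsigned long this_load)
        {
                unsigned long max_pull = min_ul(max_load - avg_load,
                                                max_load - SCHED_LOAD_SCALE);

                return min_ul(max_pull * SCHED_LOAD_SCALE,
                              (avg_load - this_load) * SCHED_LOAD_SCALE)
                        / SCHED_LOAD_SCALE;
        }

        int main(void)
        {
                /* Well above SCHED_LOAD_SCALE the clamp changes nothing (10),
                 * but when the busiest group is barely above one unit of load
                 * the pull is capped so it is not drained below it (2). */
                printf("%lu\n", imbalance(160, 150, 140));      /* 10 */
                printf("%lu\n", imbalance(130, 120, 110));      /* 2  */
                return 0;
        }
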
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2050 unsigned long imbalance; 2082 unsigned long imbalance;
2051 int nr_moved, all_pinned = 0; 2083 int nr_moved, all_pinned = 0;
2052 int active_balance = 0; 2084 int active_balance = 0;
2085 int sd_idle = 0;
2086
2087 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
2088 sd_idle = 1;
2053 2089
2054 spin_lock(&this_rq->lock);
2055 schedstat_inc(sd, lb_cnt[idle]); 2090 schedstat_inc(sd, lb_cnt[idle]);
2056 2091
2057 group = find_busiest_group(sd, this_cpu, &imbalance, idle); 2092 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
2058 if (!group) { 2093 if (!group) {
2059 schedstat_inc(sd, lb_nobusyg[idle]); 2094 schedstat_inc(sd, lb_nobusyg[idle]);
2060 goto out_balanced; 2095 goto out_balanced;
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2078 * still unbalanced. nr_moved simply stays zero, so it is 2113 * still unbalanced. nr_moved simply stays zero, so it is
2079 * correctly treated as an imbalance. 2114 * correctly treated as an imbalance.
2080 */ 2115 */
2081 double_lock_balance(this_rq, busiest); 2116 double_rq_lock(this_rq, busiest);
2082 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2117 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2083 imbalance, sd, idle, 2118 imbalance, sd, idle, &all_pinned);
2084 &all_pinned); 2119 double_rq_unlock(this_rq, busiest);
2085 spin_unlock(&busiest->lock);
2086 2120
2087 /* All tasks on this runqueue were pinned by CPU affinity */ 2121 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned)) 2122 if (unlikely(all_pinned))
2089 goto out_balanced; 2123 goto out_balanced;
2090 } 2124 }
2091 2125
2092 spin_unlock(&this_rq->lock);
2093
2094 if (!nr_moved) { 2126 if (!nr_moved) {
2095 schedstat_inc(sd, lb_failed[idle]); 2127 schedstat_inc(sd, lb_failed[idle]);
2096 sd->nr_balance_failed++; 2128 sd->nr_balance_failed++;
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2130 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2099 2131
2100 spin_lock(&busiest->lock); 2132 spin_lock(&busiest->lock);
2133
2134 /* don't kick the migration_thread, if the curr
2135 * task on busiest cpu can't be moved to this_cpu
2136 */
2137 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2138 spin_unlock(&busiest->lock);
2139 all_pinned = 1;
2140 goto out_one_pinned;
2141 }
2142
2101 if (!busiest->active_balance) { 2143 if (!busiest->active_balance) {
2102 busiest->active_balance = 1; 2144 busiest->active_balance = 1;
2103 busiest->push_cpu = this_cpu; 2145 busiest->push_cpu = this_cpu;
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2130 sd->balance_interval *= 2; 2172 sd->balance_interval *= 2;
2131 } 2173 }
2132 2174
2175 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2176 return -1;
2133 return nr_moved; 2177 return nr_moved;
2134 2178
2135out_balanced: 2179out_balanced:
2136 spin_unlock(&this_rq->lock);
2137
2138 schedstat_inc(sd, lb_balanced[idle]); 2180 schedstat_inc(sd, lb_balanced[idle]);
2139 2181
2140 sd->nr_balance_failed = 0; 2182 sd->nr_balance_failed = 0;
2183
2184out_one_pinned:
2141 /* tune up the balancing interval */ 2185 /* tune up the balancing interval */
2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2186 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval)) 2187 (sd->balance_interval < sd->max_interval))
2144 sd->balance_interval *= 2; 2188 sd->balance_interval *= 2;
2145 2189
2190 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2191 return -1;
2146 return 0; 2192 return 0;
2147} 2193}
2148 2194
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2160 runqueue_t *busiest = NULL; 2206 runqueue_t *busiest = NULL;
2161 unsigned long imbalance; 2207 unsigned long imbalance;
2162 int nr_moved = 0; 2208 int nr_moved = 0;
2209 int sd_idle = 0;
2210
2211 if (sd->flags & SD_SHARE_CPUPOWER)
2212 sd_idle = 1;
2163 2213
2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2214 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2215 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
2166 if (!group) { 2216 if (!group) {
2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2217 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2168 goto out_balanced; 2218 goto out_balanced;
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2176 2226
2177 BUG_ON(busiest == this_rq); 2227 BUG_ON(busiest == this_rq);
2178 2228
2179 /* Attempt to move tasks */
2180 double_lock_balance(this_rq, busiest);
2181
2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2229 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2183 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2230
2231 nr_moved = 0;
2232 if (busiest->nr_running > 1) {
2233 /* Attempt to move tasks */
2234 double_lock_balance(this_rq, busiest);
2235 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2184 imbalance, sd, NEWLY_IDLE, NULL); 2236 imbalance, sd, NEWLY_IDLE, NULL);
2185 if (!nr_moved) 2237 spin_unlock(&busiest->lock);
2238 }
2239
2240 if (!nr_moved) {
2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2241 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else 2242 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2243 return -1;
2244 } else
2188 sd->nr_balance_failed = 0; 2245 sd->nr_balance_failed = 0;
2189 2246
2190 spin_unlock(&busiest->lock);
2191 return nr_moved; 2247 return nr_moved;
2192 2248
2193out_balanced: 2249out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2250 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2251 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2252 return -1;
2195 sd->nr_balance_failed = 0; 2253 sd->nr_balance_failed = 0;
2196 return 0; 2254 return 0;
2197} 2255}
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2316 2374
2317 if (j - sd->last_balance >= interval) { 2375 if (j - sd->last_balance >= interval) {
2318 if (load_balance(this_cpu, this_rq, sd, idle)) { 2376 if (load_balance(this_cpu, this_rq, sd, idle)) {
2319 /* We've pulled tasks over so no longer idle */ 2377 /*
2378 * We've pulled tasks over so either we're no
2379 * longer idle, or one of our SMT siblings is
2380 * not idle.
2381 */
2320 idle = NOT_IDLE; 2382 idle = NOT_IDLE;
2321 } 2383 }
2322 sd->last_balance += interval; 2384 sd->last_balance += interval;
@@ -2575,6 +2637,13 @@ out:
2575} 2637}
2576 2638
2577#ifdef CONFIG_SCHED_SMT 2639#ifdef CONFIG_SCHED_SMT
2640static inline void wakeup_busy_runqueue(runqueue_t *rq)
2641{
2642 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2643 if (rq->curr == rq->idle && rq->nr_running)
2644 resched_task(rq->idle);
2645}
2646
2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2647static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2579{ 2648{
2580 struct sched_domain *tmp, *sd = NULL; 2649 struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2608 for_each_cpu_mask(i, sibling_map) { 2677 for_each_cpu_mask(i, sibling_map) {
2609 runqueue_t *smt_rq = cpu_rq(i); 2678 runqueue_t *smt_rq = cpu_rq(i);
2610 2679
2611 /* 2680 wakeup_busy_runqueue(smt_rq);
2612 * If an SMT sibling task is sleeping due to priority
2613 * reasons wake it up now.
2614 */
2615 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2616 resched_task(smt_rq->idle);
2617 } 2681 }
2618 2682
2619 for_each_cpu_mask(i, sibling_map) 2683 for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2624 */ 2688 */
2625} 2689}
2626 2690
2691/*
 2692 * number of 'lost' timeslices this task won't be able to fully
2693 * utilize, if another task runs on a sibling. This models the
2694 * slowdown effect of other tasks running on siblings:
2695 */
2696static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2697{
2698 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2699}
2700
2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2701static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2628{ 2702{
2629 struct sched_domain *tmp, *sd = NULL; 2703 struct sched_domain *tmp, *sd = NULL;
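
[Editor's sketch] smt_slice() models how much of a timeslice a task effectively keeps when an SMT sibling is busy: p->time_slice scaled by (100 - per_cpu_gain) / 100. Worked in C with example numbers (the per_cpu_gain of 25 and the 100-tick slice are illustrative, not taken from a specific sched_domain setup):

        #include <stdio.h>

        /* Mirrors smt_slice(): the share of a timeslice that survives when a
         * sibling thread eats per_cpu_gain percent of the core. */
        static unsigned long smt_slice(unsigned long time_slice, int per_cpu_gain)
        {
                return time_slice * (100 - per_cpu_gain) / 100;
        }

        int main(void)
        {
                /* A 100-tick slice with per_cpu_gain = 25 -> 75 effective ticks,
                 * which dependent_sleeper() compares against the other task's
                 * full timeslice. */
                printf("%lu\n", smt_slice(100, 25));
                return 0;
        }
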
@@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2667 runqueue_t *smt_rq = cpu_rq(i); 2741 runqueue_t *smt_rq = cpu_rq(i);
2668 task_t *smt_curr = smt_rq->curr; 2742 task_t *smt_curr = smt_rq->curr;
2669 2743
2744 /* Kernel threads do not participate in dependent sleeping */
2745 if (!p->mm || !smt_curr->mm || rt_task(p))
2746 goto check_smt_task;
2747
2670 /* 2748 /*
2671 * If a user task with lower static priority than the 2749 * If a user task with lower static priority than the
2672 * running task on the SMT sibling is trying to schedule, 2750 * running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2675 * task from using an unfair proportion of the 2753 * task from using an unfair proportion of the
2676 * physical cpu's resources. -ck 2754 * physical cpu's resources. -ck
2677 */ 2755 */
2678 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > 2756 if (rt_task(smt_curr)) {
2679 task_timeslice(p) || rt_task(smt_curr)) && 2757 /*
2680 p->mm && smt_curr->mm && !rt_task(p)) 2758 * With real time tasks we run non-rt tasks only
2681 ret = 1; 2759 * per_cpu_gain% of the time.
2760 */
2761 if ((jiffies % DEF_TIMESLICE) >
2762 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2763 ret = 1;
2764 } else
2765 if (smt_curr->static_prio < p->static_prio &&
2766 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2767 smt_slice(smt_curr, sd) > task_timeslice(p))
2768 ret = 1;
2769
2770check_smt_task:
2771 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2772 rt_task(smt_curr))
2773 continue;
2774 if (!p->mm) {
2775 wakeup_busy_runqueue(smt_rq);
2776 continue;
2777 }
2682 2778
2683 /* 2779 /*
2684 * Reschedule a lower priority task on the SMT sibling, 2780 * Reschedule a lower priority task on the SMT sibling for
2685 * or wake it up if it has been put to sleep for priority 2781 * it to be put to sleep, or wake it up if it has been put to
2686 * reasons. 2782 * sleep for priority reasons to see if it should run now.
2687 */ 2783 */
2688 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > 2784 if (rt_task(p)) {
2689 task_timeslice(smt_curr) || rt_task(p)) && 2785 if ((jiffies % DEF_TIMESLICE) >
2690 smt_curr->mm && p->mm && !rt_task(smt_curr)) || 2786 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2691 (smt_curr == smt_rq->idle && smt_rq->nr_running)) 2787 resched_task(smt_curr);
2692 resched_task(smt_curr); 2788 } else {
2789 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2790 smt_slice(p, sd) > task_timeslice(smt_curr))
2791 resched_task(smt_curr);
2792 else
2793 wakeup_busy_runqueue(smt_rq);
2794 }
2693 } 2795 }
2694out_unlock: 2796out_unlock:
2695 for_each_cpu_mask(i, sibling_map) 2797 for_each_cpu_mask(i, sibling_map)
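
[Editor's sketch] When a real-time task runs on one sibling, the rewritten dependent_sleeper() lets the other sibling run normal tasks only per_cpu_gain percent of the time, using (jiffies % DEF_TIMESLICE) as a cheap rotating window. A model of that gate (DEF_TIMESLICE of 100 and per_cpu_gain of 25 are illustrative values):

        #include <stdio.h>

        #define DEF_TIMESLICE 100

        /* Returns 1 when the non-RT sibling should stay off the CPU because
         * the current tick falls outside the per_cpu_gain window. */
        static int must_yield(unsigned long jiffies, int per_cpu_gain)
        {
                return (jiffies % DEF_TIMESLICE) >
                       (per_cpu_gain * DEF_TIMESLICE / 100);
        }

        int main(void)
        {
                int yield_ticks = 0;

                for (unsigned long j = 0; j < DEF_TIMESLICE; j++)
                        yield_ticks += must_yield(j, 25);

                /* roughly three quarters of every 100 ticks are denied to the
                 * non-RT task while the RT sibling is running */
                printf("%d/%d ticks yielded\n", yield_ticks, DEF_TIMESLICE);
                return 0;
        }
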
@@ -2887,6 +2989,7 @@ switch_tasks:
2887 if (next == rq->idle) 2989 if (next == rq->idle)
2888 schedstat_inc(rq, sched_goidle); 2990 schedstat_inc(rq, sched_goidle);
2889 prefetch(next); 2991 prefetch(next);
2992 prefetch_stack(next);
2890 clear_tsk_need_resched(prev); 2993 clear_tsk_need_resched(prev);
2891 rcu_qsctr_inc(task_cpu(prev)); 2994 rcu_qsctr_inc(task_cpu(prev));
2892 2995
@@ -3014,7 +3117,8 @@ need_resched:
3014 3117
3015#endif /* CONFIG_PREEMPT */ 3118#endif /* CONFIG_PREEMPT */
3016 3119
3017int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 3120int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3121 void *key)
3018{ 3122{
3019 task_t *p = curr->private; 3123 task_t *p = curr->private;
3020 return try_to_wake_up(p, mode, sync); 3124 return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3056 * @key: is directly passed to the wakeup function 3160 * @key: is directly passed to the wakeup function
3057 */ 3161 */
3058void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3162void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3059 int nr_exclusive, void *key) 3163 int nr_exclusive, void *key)
3060{ 3164{
3061 unsigned long flags; 3165 unsigned long flags;
3062 3166
@@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3088 * 3192 *
3089 * On UP it can prevent extra preemption. 3193 * On UP it can prevent extra preemption.
3090 */ 3194 */
3091void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3195void fastcall
3196__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3092{ 3197{
3093 unsigned long flags; 3198 unsigned long flags;
3094 int sync = 1; 3199 int sync = 1;
@@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3279 3384
3280EXPORT_SYMBOL(interruptible_sleep_on); 3385EXPORT_SYMBOL(interruptible_sleep_on);
3281 3386
3282long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3387long fastcall __sched
3388interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3283{ 3389{
3284 SLEEP_ON_VAR 3390 SLEEP_ON_VAR
3285 3391
@@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3498 * @policy: new policy. 3604 * @policy: new policy.
3499 * @param: structure containing the new RT priority. 3605 * @param: structure containing the new RT priority.
3500 */ 3606 */
3501int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) 3607int sched_setscheduler(struct task_struct *p, int policy,
3608 struct sched_param *param)
3502{ 3609{
3503 int retval; 3610 int retval;
3504 int oldprio, oldpolicy = -1; 3611 int oldprio, oldpolicy = -1;
@@ -3518,7 +3625,7 @@ recheck:
3518 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3625 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3519 */ 3626 */
3520 if (param->sched_priority < 0 || 3627 if (param->sched_priority < 0 ||
3521 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3628 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3522 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3629 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3523 return -EINVAL; 3630 return -EINVAL;
3524 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3631 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3581,7 +3688,8 @@ recheck:
3581} 3688}
3582EXPORT_SYMBOL_GPL(sched_setscheduler); 3689EXPORT_SYMBOL_GPL(sched_setscheduler);
3583 3690
3584static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3691static int
3692do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3585{ 3693{
3586 int retval; 3694 int retval;
3587 struct sched_param lparam; 3695 struct sched_param lparam;
@@ -3771,6 +3879,7 @@ EXPORT_SYMBOL(cpu_present_map);
3771 3879
3772#ifndef CONFIG_SMP 3880#ifndef CONFIG_SMP
3773cpumask_t cpu_online_map = CPU_MASK_ALL; 3881cpumask_t cpu_online_map = CPU_MASK_ALL;
3882EXPORT_SYMBOL_GPL(cpu_online_map);
3774cpumask_t cpu_possible_map = CPU_MASK_ALL; 3883cpumask_t cpu_possible_map = CPU_MASK_ALL;
3775#endif 3884#endif
3776 3885
@@ -3848,7 +3957,7 @@ asmlinkage long sys_sched_yield(void)
3848 if (rt_task(current)) 3957 if (rt_task(current))
3849 target = rq->active; 3958 target = rq->active;
3850 3959
3851 if (current->array->nr_active == 1) { 3960 if (array->nr_active == 1) {
3852 schedstat_inc(rq, yld_act_empty); 3961 schedstat_inc(rq, yld_act_empty);
3853 if (!rq->expired->nr_active) 3962 if (!rq->expired->nr_active)
3854 schedstat_inc(rq, yld_both_empty); 3963 schedstat_inc(rq, yld_both_empty);
@@ -3912,7 +4021,7 @@ EXPORT_SYMBOL(cond_resched);
3912 * operations here to prevent schedule() from being called twice (once via 4021 * operations here to prevent schedule() from being called twice (once via
3913 * spin_unlock(), once by hand). 4022 * spin_unlock(), once by hand).
3914 */ 4023 */
3915int cond_resched_lock(spinlock_t * lock) 4024int cond_resched_lock(spinlock_t *lock)
3916{ 4025{
3917 int ret = 0; 4026 int ret = 0;
3918 4027
@@ -4095,7 +4204,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
4095 return list_entry(p->sibling.next,struct task_struct,sibling); 4204 return list_entry(p->sibling.next,struct task_struct,sibling);
4096} 4205}
4097 4206
4098static void show_task(task_t * p) 4207static void show_task(task_t *p)
4099{ 4208{
4100 task_t *relative; 4209 task_t *relative;
4101 unsigned state; 4210 unsigned state;
@@ -4121,7 +4230,7 @@ static void show_task(task_t * p)
4121#endif 4230#endif
4122#ifdef CONFIG_DEBUG_STACK_USAGE 4231#ifdef CONFIG_DEBUG_STACK_USAGE
4123 { 4232 {
4124 unsigned long * n = (unsigned long *) (p->thread_info+1); 4233 unsigned long *n = (unsigned long *) (p->thread_info+1);
4125 while (!*n) 4234 while (!*n)
4126 n++; 4235 n++;
4127 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 4236 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4330,7 +4439,7 @@ out:
4330 * thread migration by bumping thread off CPU then 'pushing' onto 4439 * thread migration by bumping thread off CPU then 'pushing' onto
4331 * another runqueue. 4440 * another runqueue.
4332 */ 4441 */
4333static int migration_thread(void * data) 4442static int migration_thread(void *data)
4334{ 4443{
4335 runqueue_t *rq; 4444 runqueue_t *rq;
4336 int cpu = (long)data; 4445 int cpu = (long)data;
@@ -4779,7 +4888,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4888 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4780 * hold the hotplug lock. 4889 * hold the hotplug lock.
4781 */ 4890 */
4782void cpu_attach_domain(struct sched_domain *sd, int cpu) 4891static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4783{ 4892{
4784 runqueue_t *rq = cpu_rq(cpu); 4893 runqueue_t *rq = cpu_rq(cpu);
4785 struct sched_domain *tmp; 4894 struct sched_domain *tmp;
@@ -4802,7 +4911,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
4802} 4911}
4803 4912
4804/* cpus with isolated domains */ 4913/* cpus with isolated domains */
4805cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4914static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4806 4915
4807/* Setup the mask of cpus configured for isolated domains */ 4916/* Setup the mask of cpus configured for isolated domains */
4808static int __init isolated_cpu_setup(char *str) 4917static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4939,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
4830 * covered by the given span, and will set each group's ->cpumask correctly, 4939 * covered by the given span, and will set each group's ->cpumask correctly,
4831 * and ->cpu_power to 0. 4940 * and ->cpu_power to 0.
4832 */ 4941 */
4833void init_sched_build_groups(struct sched_group groups[], 4942static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4834 cpumask_t span, int (*group_fn)(int cpu)) 4943 int (*group_fn)(int cpu))
4835{ 4944{
4836 struct sched_group *first = NULL, *last = NULL; 4945 struct sched_group *first = NULL, *last = NULL;
4837 cpumask_t covered = CPU_MASK_NONE; 4946 cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4973,85 @@ void init_sched_build_groups(struct sched_group groups[],
4864 last->next = first; 4973 last->next = first;
4865} 4974}
4866 4975
4976#define SD_NODES_PER_DOMAIN 16
4867 4977
4868#ifdef ARCH_HAS_SCHED_DOMAIN 4978#ifdef CONFIG_NUMA
4869extern void build_sched_domains(const cpumask_t *cpu_map); 4979/**
4870extern void arch_init_sched_domains(const cpumask_t *cpu_map); 4980 * find_next_best_node - find the next node to include in a sched_domain
4871extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); 4981 * @node: node whose sched_domain we're building
4872#else 4982 * @used_nodes: nodes already in the sched_domain
4983 *
4984 * Find the next node to include in a given scheduling domain. Simply
4985 * finds the closest node not already in the @used_nodes map.
4986 *
4987 * Should use nodemask_t.
4988 */
4989static int find_next_best_node(int node, unsigned long *used_nodes)
4990{
4991 int i, n, val, min_val, best_node = 0;
4992
4993 min_val = INT_MAX;
4994
4995 for (i = 0; i < MAX_NUMNODES; i++) {
4996 /* Start at @node */
4997 n = (node + i) % MAX_NUMNODES;
4998
4999 if (!nr_cpus_node(n))
5000 continue;
5001
5002 /* Skip already used nodes */
5003 if (test_bit(n, used_nodes))
5004 continue;
5005
5006 /* Simple min distance search */
5007 val = node_distance(node, n);
5008
5009 if (val < min_val) {
5010 min_val = val;
5011 best_node = n;
5012 }
5013 }
5014
5015 set_bit(best_node, used_nodes);
5016 return best_node;
5017}
5018
5019/**
5020 * sched_domain_node_span - get a cpumask for a node's sched_domain
5021 * @node: node whose cpumask we're constructing
5022 * @size: number of nodes to include in this span
5023 *
5024 * Given a node, construct a good cpumask for its sched_domain to span. It
5025 * should be one that prevents unnecessary balancing, but also spreads tasks
5026 * out optimally.
5027 */
5028static cpumask_t sched_domain_node_span(int node)
5029{
5030 int i;
5031 cpumask_t span, nodemask;
5032 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5033
5034 cpus_clear(span);
5035 bitmap_zero(used_nodes, MAX_NUMNODES);
5036
5037 nodemask = node_to_cpumask(node);
5038 cpus_or(span, span, nodemask);
5039 set_bit(node, used_nodes);
5040
5041 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5042 int next_node = find_next_best_node(node, used_nodes);
5043 nodemask = node_to_cpumask(next_node);
5044 cpus_or(span, span, nodemask);
5045 }
5046
5047 return span;
5048}
5049#endif
5050
5051/*
5052 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5053 * can switch it on easily if needed.
5054 */
4873#ifdef CONFIG_SCHED_SMT 5055#ifdef CONFIG_SCHED_SMT
4874static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5056static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4875static struct sched_group sched_group_cpus[NR_CPUS]; 5057static struct sched_group sched_group_cpus[NR_CPUS];
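The find_next_best_node()/sched_domain_node_span() helpers added above describe a simple greedy construction: starting from a node, repeatedly add the closest node not yet in the span, up to SD_NODES_PER_DOMAIN nodes. As a rough illustration only (not part of the patch), here is a user-space sketch of the same idea, where MAX_NODES, SPAN_NODES and the distance[][] table are made-up stand-ins for MAX_NUMNODES, SD_NODES_PER_DOMAIN and node_distance():

/*
 * Illustrative sketch only -- a user-space analogue of the greedy
 * node-span construction above.  MAX_NODES, SPAN_NODES and the
 * distance[][] table are made-up stand-ins for MAX_NUMNODES,
 * SD_NODES_PER_DOMAIN and node_distance().
 */
#include <limits.h>
#include <stdio.h>

#define MAX_NODES  4
#define SPAN_NODES 3

static const int distance[MAX_NODES][MAX_NODES] = {
    { 10, 20, 30, 40 },
    { 20, 10, 20, 30 },
    { 30, 20, 10, 20 },
    { 40, 30, 20, 10 },
};

/* Pick the closest node to @node that is not yet marked in @used. */
static int next_best_node(int node, int *used)
{
    int i, best = -1, min_val = INT_MAX;

    for (i = 0; i < MAX_NODES; i++) {
        int n = (node + i) % MAX_NODES;    /* start the scan at @node */

        if (used[n])
            continue;
        if (distance[node][n] < min_val) {
            min_val = distance[node][n];
            best = n;
        }
    }
    if (best >= 0)
        used[best] = 1;
    return best;
}

int main(void)
{
    int used[MAX_NODES] = { 0 };
    int i, node = 1;

    used[node] = 1;
    printf("span for node %d: %d", node, node);
    for (i = 1; i < SPAN_NODES; i++)
        printf(" %d", next_best_node(node, used));
    printf("\n");    /* prints: span for node 1: 1 2 0 */
    return 0;
}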
@@ -4891,36 +5073,20 @@ static int cpu_to_phys_group(int cpu)
4891} 5073}
4892 5074
4893#ifdef CONFIG_NUMA 5075#ifdef CONFIG_NUMA
4894
4895static DEFINE_PER_CPU(struct sched_domain, node_domains);
4896static struct sched_group sched_group_nodes[MAX_NUMNODES];
4897static int cpu_to_node_group(int cpu)
4898{
4899 return cpu_to_node(cpu);
4900}
4901#endif
4902
4903#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4904/* 5076/*
4905 * The domains setup code relies on siblings not spanning 5077 * The init_sched_build_groups can't handle what we want to do with node
4906 * multiple nodes. Make sure the architecture has a proper 5078 * groups, so roll our own. Now each node has its own list of groups which
4907 * siblings map: 5079 * gets dynamically allocated.
4908 */ 5080 */
4909static void check_sibling_maps(void) 5081static DEFINE_PER_CPU(struct sched_domain, node_domains);
4910{ 5082static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
4911 int i, j;
4912 5083
4913 for_each_online_cpu(i) { 5084static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
4914 for_each_cpu_mask(j, cpu_sibling_map[i]) { 5085static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
4915 if (cpu_to_node(i) != cpu_to_node(j)) { 5086
4916 printk(KERN_INFO "warning: CPU %d siblings map " 5087static int cpu_to_allnodes_group(int cpu)
4917 "to different node - isolating " 5088{
4918 "them.\n", i); 5089 return cpu_to_node(cpu);
4919 cpu_sibling_map[i] = cpumask_of_cpu(i);
4920 break;
4921 }
4922 }
4923 }
4924} 5090}
4925#endif 5091#endif
4926 5092
@@ -4928,9 +5094,24 @@ static void check_sibling_maps(void)
4928 * Build sched domains for a given set of cpus and attach the sched domains 5094 * Build sched domains for a given set of cpus and attach the sched domains
4929 * to the individual cpus 5095 * to the individual cpus
4930 */ 5096 */
4931static void build_sched_domains(const cpumask_t *cpu_map) 5097void build_sched_domains(const cpumask_t *cpu_map)
4932{ 5098{
4933 int i; 5099 int i;
5100#ifdef CONFIG_NUMA
5101 struct sched_group **sched_group_nodes = NULL;
5102 struct sched_group *sched_group_allnodes = NULL;
5103
5104 /*
5105 * Allocate the per-node list of sched groups
5106 */
5107 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5108 GFP_ATOMIC);
5109 if (!sched_group_nodes) {
5110 printk(KERN_WARNING "Can not alloc sched group node list\n");
5111 return;
5112 }
5113 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5114#endif
4934 5115
4935 /* 5116 /*
4936 * Set up domains for cpus specified by the cpu_map. 5117 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5124,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4943 cpus_and(nodemask, nodemask, *cpu_map); 5124 cpus_and(nodemask, nodemask, *cpu_map);
4944 5125
4945#ifdef CONFIG_NUMA 5126#ifdef CONFIG_NUMA
5127 if (cpus_weight(*cpu_map)
5128 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5129 if (!sched_group_allnodes) {
5130 sched_group_allnodes
5131 = kmalloc(sizeof(struct sched_group)
5132 * MAX_NUMNODES,
5133 GFP_KERNEL);
5134 if (!sched_group_allnodes) {
5135 printk(KERN_WARNING
5136 "Can not alloc allnodes sched group\n");
5137 break;
5138 }
5139 sched_group_allnodes_bycpu[i]
5140 = sched_group_allnodes;
5141 }
5142 sd = &per_cpu(allnodes_domains, i);
5143 *sd = SD_ALLNODES_INIT;
5144 sd->span = *cpu_map;
5145 group = cpu_to_allnodes_group(i);
5146 sd->groups = &sched_group_allnodes[group];
5147 p = sd;
5148 } else
5149 p = NULL;
5150
4946 sd = &per_cpu(node_domains, i); 5151 sd = &per_cpu(node_domains, i);
4947 group = cpu_to_node_group(i);
4948 *sd = SD_NODE_INIT; 5152 *sd = SD_NODE_INIT;
4949 sd->span = *cpu_map; 5153 sd->span = sched_domain_node_span(cpu_to_node(i));
4950 sd->groups = &sched_group_nodes[group]; 5154 sd->parent = p;
5155 cpus_and(sd->span, sd->span, *cpu_map);
4951#endif 5156#endif
4952 5157
4953 p = sd; 5158 p = sd;
@@ -4972,7 +5177,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4972 5177
4973#ifdef CONFIG_SCHED_SMT 5178#ifdef CONFIG_SCHED_SMT
4974 /* Set up CPU (sibling) groups */ 5179 /* Set up CPU (sibling) groups */
4975 for_each_online_cpu(i) { 5180 for_each_cpu_mask(i, *cpu_map) {
4976 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5181 cpumask_t this_sibling_map = cpu_sibling_map[i];
4977 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5182 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4978 if (i != first_cpu(this_sibling_map)) 5183 if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5202,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4997 5202
4998#ifdef CONFIG_NUMA 5203#ifdef CONFIG_NUMA
4999 /* Set up node groups */ 5204 /* Set up node groups */
5000 init_sched_build_groups(sched_group_nodes, *cpu_map, 5205 if (sched_group_allnodes)
5001 &cpu_to_node_group); 5206 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5207 &cpu_to_allnodes_group);
5208
5209 for (i = 0; i < MAX_NUMNODES; i++) {
5210 /* Set up node groups */
5211 struct sched_group *sg, *prev;
5212 cpumask_t nodemask = node_to_cpumask(i);
5213 cpumask_t domainspan;
5214 cpumask_t covered = CPU_MASK_NONE;
5215 int j;
5216
5217 cpus_and(nodemask, nodemask, *cpu_map);
5218 if (cpus_empty(nodemask)) {
5219 sched_group_nodes[i] = NULL;
5220 continue;
5221 }
5222
5223 domainspan = sched_domain_node_span(i);
5224 cpus_and(domainspan, domainspan, *cpu_map);
5225
5226 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5227 sched_group_nodes[i] = sg;
5228 for_each_cpu_mask(j, nodemask) {
5229 struct sched_domain *sd;
5230 sd = &per_cpu(node_domains, j);
5231 sd->groups = sg;
5232 if (sd->groups == NULL) {
5233 /* Turn off balancing if we have no groups */
5234 sd->flags = 0;
5235 }
5236 }
5237 if (!sg) {
5238 printk(KERN_WARNING
5239 "Can not alloc domain group for node %d\n", i);
5240 continue;
5241 }
5242 sg->cpu_power = 0;
5243 sg->cpumask = nodemask;
5244 cpus_or(covered, covered, nodemask);
5245 prev = sg;
5246
5247 for (j = 0; j < MAX_NUMNODES; j++) {
5248 cpumask_t tmp, notcovered;
5249 int n = (i + j) % MAX_NUMNODES;
5250
5251 cpus_complement(notcovered, covered);
5252 cpus_and(tmp, notcovered, *cpu_map);
5253 cpus_and(tmp, tmp, domainspan);
5254 if (cpus_empty(tmp))
5255 break;
5256
5257 nodemask = node_to_cpumask(n);
5258 cpus_and(tmp, tmp, nodemask);
5259 if (cpus_empty(tmp))
5260 continue;
5261
5262 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5263 if (!sg) {
5264 printk(KERN_WARNING
5265 "Can not alloc domain group for node %d\n", j);
5266 break;
5267 }
5268 sg->cpu_power = 0;
5269 sg->cpumask = tmp;
5270 cpus_or(covered, covered, tmp);
5271 prev->next = sg;
5272 prev = sg;
5273 }
5274 prev->next = sched_group_nodes[i];
5275 }
5002#endif 5276#endif
5003 5277
5004 /* Calculate CPU power for physical packages and nodes */ 5278 /* Calculate CPU power for physical packages and nodes */
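The comment above notes that init_sched_build_groups() cannot express what is wanted for node groups, so the patch builds each node's groups as a dynamically allocated circular singly linked list (elements are appended via prev->next and the last element is pointed back at the head), which the next_sg loops later walk and, in arch_destroy_sched_domains(), free. A small user-space sketch of that list shape, with a hypothetical struct group standing in for struct sched_group:

/*
 * Illustrative sketch only -- the per-node groups above form a circular,
 * singly linked list: new elements are appended via prev->next and the
 * last element is pointed back at the head.  struct group is a made-up
 * stand-in for struct sched_group.
 */
#include <stdio.h>
#include <stdlib.h>

struct group {
    int id;
    struct group *next;
};

/* Build a ring of @n groups and return its head (NULL if nothing built). */
static struct group *build_ring(int n)
{
    struct group *head = NULL, *prev = NULL;
    int i;

    for (i = 0; i < n; i++) {
        struct group *g = malloc(sizeof(*g));

        if (!g)
            break;
        g->id = i;
        g->next = NULL;
        if (!head)
            head = g;
        else
            prev->next = g;
        prev = g;
    }
    if (prev)
        prev->next = head;    /* close the ring */
    return head;
}

/* Free every element, stopping once the walk returns to the head. */
static void free_ring(struct group *head)
{
    struct group *sg = head->next;

    while (sg != head) {
        struct group *old = sg;

        sg = sg->next;
        free(old);
    }
    free(head);
}

int main(void)
{
    struct group *head = build_ring(3), *sg = head;

    do {
        printf("group %d\n", sg->id);
        sg = sg->next;
    } while (sg != head);

    free_ring(head);
    return 0;
}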
@@ -5017,14 +5291,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5017 sd->groups->cpu_power = power; 5291 sd->groups->cpu_power = power;
5018 5292
5019#ifdef CONFIG_NUMA 5293#ifdef CONFIG_NUMA
5020 if (i == first_cpu(sd->groups->cpumask)) { 5294 sd = &per_cpu(allnodes_domains, i);
5021 /* Only add "power" once for each physical package. */ 5295 if (sd->groups) {
5022 sd = &per_cpu(node_domains, i); 5296 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5023 sd->groups->cpu_power += power; 5297 (cpus_weight(sd->groups->cpumask)-1) / 10;
5298 sd->groups->cpu_power = power;
5024 } 5299 }
5025#endif 5300#endif
5026 } 5301 }
5027 5302
5303#ifdef CONFIG_NUMA
5304 for (i = 0; i < MAX_NUMNODES; i++) {
5305 struct sched_group *sg = sched_group_nodes[i];
5306 int j;
5307
5308 if (sg == NULL)
5309 continue;
5310next_sg:
5311 for_each_cpu_mask(j, sg->cpumask) {
5312 struct sched_domain *sd;
5313 int power;
5314
5315 sd = &per_cpu(phys_domains, j);
5316 if (j != first_cpu(sd->groups->cpumask)) {
5317 /*
5318 * Only add "power" once for each
5319 * physical package.
5320 */
5321 continue;
5322 }
5323 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5324 (cpus_weight(sd->groups->cpumask)-1) / 10;
5325
5326 sg->cpu_power += power;
5327 }
5328 sg = sg->next;
5329 if (sg != sched_group_nodes[i])
5330 goto next_sg;
5331 }
5332#endif
5333
5028 /* Attach the domains */ 5334 /* Attach the domains */
5029 for_each_cpu_mask(i, *cpu_map) { 5335 for_each_cpu_mask(i, *cpu_map) {
5030 struct sched_domain *sd; 5336 struct sched_domain *sd;
@@ -5039,13 +5345,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5039/* 5345/*
5040 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5346 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5041 */ 5347 */
5042static void arch_init_sched_domains(cpumask_t *cpu_map) 5348static void arch_init_sched_domains(const cpumask_t *cpu_map)
5043{ 5349{
5044 cpumask_t cpu_default_map; 5350 cpumask_t cpu_default_map;
5045 5351
5046#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5047 check_sibling_maps();
5048#endif
5049 /* 5352 /*
5050 * Setup mask for cpus without special case scheduling requirements. 5353 * Setup mask for cpus without special case scheduling requirements.
5051 * For now this just excludes isolated cpus, but could be used to 5354 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5361,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
5058 5361
5059static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5362static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5060{ 5363{
5061 /* Do nothing: everything is statically allocated. */ 5364#ifdef CONFIG_NUMA
5062} 5365 int i;
5366 int cpu;
5063 5367
5064#endif /* ARCH_HAS_SCHED_DOMAIN */ 5368 for_each_cpu_mask(cpu, *cpu_map) {
5369 struct sched_group *sched_group_allnodes
5370 = sched_group_allnodes_bycpu[cpu];
5371 struct sched_group **sched_group_nodes
5372 = sched_group_nodes_bycpu[cpu];
5373
5374 if (sched_group_allnodes) {
5375 kfree(sched_group_allnodes);
5376 sched_group_allnodes_bycpu[cpu] = NULL;
5377 }
5378
5379 if (!sched_group_nodes)
5380 continue;
5381
5382 for (i = 0; i < MAX_NUMNODES; i++) {
5383 cpumask_t nodemask = node_to_cpumask(i);
5384 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5385
5386 cpus_and(nodemask, nodemask, *cpu_map);
5387 if (cpus_empty(nodemask))
5388 continue;
5389
5390 if (sg == NULL)
5391 continue;
5392 sg = sg->next;
5393next_sg:
5394 oldsg = sg;
5395 sg = sg->next;
5396 kfree(oldsg);
5397 if (oldsg != sched_group_nodes[i])
5398 goto next_sg;
5399 }
5400 kfree(sched_group_nodes);
5401 sched_group_nodes_bycpu[cpu] = NULL;
5402 }
5403#endif
5404}
5065 5405
5066/* 5406/*
5067 * Detach sched domains from a group of cpus specified in cpu_map 5407 * Detach sched domains from a group of cpus specified in cpu_map
@@ -5263,3 +5603,47 @@ void normalize_rt_tasks(void)
5263} 5603}
5264 5604
5265#endif /* CONFIG_MAGIC_SYSRQ */ 5605#endif /* CONFIG_MAGIC_SYSRQ */
5606
5607#ifdef CONFIG_IA64
5608/*
5609 * These functions are only useful for the IA64 MCA handling.
5610 *
5611 * They can only be called when the whole system has been
5612 * stopped - every CPU needs to be quiescent, and no scheduling
5613 * activity can take place. Using them for anything else would
5614 * be a serious bug, and as a result, they aren't even visible
5615 * under any other configuration.
5616 */
5617
5618/**
5619 * curr_task - return the current task for a given cpu.
5620 * @cpu: the processor in question.
5621 *
5622 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5623 */
5624task_t *curr_task(int cpu)
5625{
5626 return cpu_curr(cpu);
5627}
5628
5629/**
5630 * set_curr_task - set the current task for a given cpu.
5631 * @cpu: the processor in question.
5632 * @p: the task pointer to set.
5633 *
5634 * Description: This function must only be used when non-maskable interrupts
5635 * are serviced on a separate stack. It allows the architecture to switch the
5636 * notion of the current task on a cpu in a non-blocking manner. This function
 5637 * must be called with all CPUs synchronized, and interrupts disabled, and
 5638 * the caller must save the original value of the current task (see
5639 * curr_task() above) and restore that value before reenabling interrupts and
5640 * re-starting the system.
5641 *
5642 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5643 */
5644void set_curr_task(int cpu, task_t *p)
5645{
5646 cpu_curr(cpu) = p;
5647}
5648
5649#endif
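The kernel-doc above spells out the protocol for these IA64-only helpers: with the whole system stopped, save the result of curr_task(), install the replacement with set_curr_task(), and restore the saved task before interrupts are re-enabled. A user-space mock of that save/switch/restore sequence (task_t, curr_task() and set_curr_task() below are stand-ins, not the kernel implementations):

/*
 * Illustrative sketch only -- a user-space mock of the save/switch/
 * restore protocol documented above.  task_t, curr_task() and
 * set_curr_task() here are stand-ins, not the kernel implementations.
 */
#include <stdio.h>

typedef struct { const char *comm; } task_t;

static task_t idle = { "swapper" }, mca = { "mca-handler" };
static task_t *cpu_curr_task[1] = { &idle };

static task_t *curr_task(int cpu)             { return cpu_curr_task[cpu]; }
static void set_curr_task(int cpu, task_t *p) { cpu_curr_task[cpu] = p; }

int main(void)
{
    int cpu = 0;
    task_t *orig;

    /* The system is assumed stopped and interrupts disabled here. */
    orig = curr_task(cpu);          /* 1. save the original task       */
    set_curr_task(cpu, &mca);       /* 2. switch the notion of current */
    printf("current on cpu%d: %s\n", cpu, curr_task(cpu)->comm);
    set_curr_task(cpu, orig);       /* 3. restore before enabling irqs */
    printf("current on cpu%d: %s\n", cpu, curr_task(cpu)->comm);
    return 0;
}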
diff --git a/kernel/signal.c b/kernel/signal.c
index d282fea81138..f2b96b08fb44 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask)
262 return sig; 262 return sig;
263} 263}
264 264
265static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, 265static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
266 int override_rlimit) 266 int override_rlimit)
267{ 267{
268 struct sigqueue *q = NULL; 268 struct sigqueue *q = NULL;
@@ -397,20 +397,8 @@ void __exit_signal(struct task_struct *tsk)
397 flush_sigqueue(&tsk->pending); 397 flush_sigqueue(&tsk->pending);
398 if (sig) { 398 if (sig) {
399 /* 399 /*
400 * We are cleaning up the signal_struct here. We delayed 400 * We are cleaning up the signal_struct here.
401 * calling exit_itimers until after flush_sigqueue, just in
402 * case our thread-local pending queue contained a queued
403 * timer signal that would have been cleared in
404 * exit_itimers. When that called sigqueue_free, it would
405 * attempt to re-take the tasklist_lock and deadlock. This
406 * can never happen if we ensure that all queues the
407 * timer's signal might be queued on have been flushed
408 * first. The shared_pending queue, and our own pending
409 * queue are the only queues the timer could be on, since
410 * there are no other threads left in the group and timer
411 * signals are constrained to threads inside the group.
412 */ 401 */
413 exit_itimers(sig);
414 exit_thread_group_keys(sig); 402 exit_thread_group_keys(sig);
415 kmem_cache_free(signal_cachep, sig); 403 kmem_cache_free(signal_cachep, sig);
416 } 404 }
@@ -578,7 +566,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
578 * is to alert stop-signal processing code when another 566 * is to alert stop-signal processing code when another
579 * processor has come along and cleared the flag. 567 * processor has come along and cleared the flag.
580 */ 568 */
581 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 569 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
570 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
582 } 571 }
583 if ( signr && 572 if ( signr &&
584 ((info->si_code & __SI_MASK) == __SI_TIMER) && 573 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
@@ -678,7 +667,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
678 667
679/* forward decl */ 668/* forward decl */
680static void do_notify_parent_cldstop(struct task_struct *tsk, 669static void do_notify_parent_cldstop(struct task_struct *tsk,
681 struct task_struct *parent, 670 int to_self,
682 int why); 671 int why);
683 672
684/* 673/*
@@ -729,14 +718,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
729 p->signal->group_stop_count = 0; 718 p->signal->group_stop_count = 0;
730 p->signal->flags = SIGNAL_STOP_CONTINUED; 719 p->signal->flags = SIGNAL_STOP_CONTINUED;
731 spin_unlock(&p->sighand->siglock); 720 spin_unlock(&p->sighand->siglock);
732 if (p->ptrace & PT_PTRACED) 721 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
733 do_notify_parent_cldstop(p, p->parent,
734 CLD_STOPPED);
735 else
736 do_notify_parent_cldstop(
737 p->group_leader,
738 p->group_leader->real_parent,
739 CLD_STOPPED);
740 spin_lock(&p->sighand->siglock); 722 spin_lock(&p->sighand->siglock);
741 } 723 }
742 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 724 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +759,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
777 p->signal->flags = SIGNAL_STOP_CONTINUED; 759 p->signal->flags = SIGNAL_STOP_CONTINUED;
778 p->signal->group_exit_code = 0; 760 p->signal->group_exit_code = 0;
779 spin_unlock(&p->sighand->siglock); 761 spin_unlock(&p->sighand->siglock);
780 if (p->ptrace & PT_PTRACED) 762 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
781 do_notify_parent_cldstop(p, p->parent,
782 CLD_CONTINUED);
783 else
784 do_notify_parent_cldstop(
785 p->group_leader,
786 p->group_leader->real_parent,
787 CLD_CONTINUED);
788 spin_lock(&p->sighand->siglock); 763 spin_lock(&p->sighand->siglock);
789 } else { 764 } else {
790 /* 765 /*
@@ -950,34 +925,31 @@ force_sig_specific(int sig, struct task_struct *t)
950 * as soon as they're available, so putting the signal on the shared queue 925 * as soon as they're available, so putting the signal on the shared queue
951 * will be equivalent to sending it to one such thread. 926 * will be equivalent to sending it to one such thread.
952 */ 927 */
953#define wants_signal(sig, p, mask) \ 928static inline int wants_signal(int sig, struct task_struct *p)
954 (!sigismember(&(p)->blocked, sig) \ 929{
955 && !((p)->state & mask) \ 930 if (sigismember(&p->blocked, sig))
956 && !((p)->flags & PF_EXITING) \ 931 return 0;
957 && (task_curr(p) || !signal_pending(p))) 932 if (p->flags & PF_EXITING)
958 933 return 0;
934 if (sig == SIGKILL)
935 return 1;
936 if (p->state & (TASK_STOPPED | TASK_TRACED))
937 return 0;
938 return task_curr(p) || !signal_pending(p);
939}
959 940
960static void 941static void
961__group_complete_signal(int sig, struct task_struct *p) 942__group_complete_signal(int sig, struct task_struct *p)
962{ 943{
963 unsigned int mask;
964 struct task_struct *t; 944 struct task_struct *t;
965 945
966 /* 946 /*
967 * Don't bother traced and stopped tasks (but
968 * SIGKILL will punch through that).
969 */
970 mask = TASK_STOPPED | TASK_TRACED;
971 if (sig == SIGKILL)
972 mask = 0;
973
974 /*
975 * Now find a thread we can wake up to take the signal off the queue. 947 * Now find a thread we can wake up to take the signal off the queue.
976 * 948 *
977 * If the main thread wants the signal, it gets first crack. 949 * If the main thread wants the signal, it gets first crack.
978 * Probably the least surprising to the average bear. 950 * Probably the least surprising to the average bear.
979 */ 951 */
980 if (wants_signal(sig, p, mask)) 952 if (wants_signal(sig, p))
981 t = p; 953 t = p;
982 else if (thread_group_empty(p)) 954 else if (thread_group_empty(p))
983 /* 955 /*
@@ -995,7 +967,7 @@ __group_complete_signal(int sig, struct task_struct *p)
995 t = p->signal->curr_target = p; 967 t = p->signal->curr_target = p;
996 BUG_ON(t->tgid != p->tgid); 968 BUG_ON(t->tgid != p->tgid);
997 969
998 while (!wants_signal(sig, t, mask)) { 970 while (!wants_signal(sig, t)) {
999 t = next_thread(t); 971 t = next_thread(t);
1000 if (t == p->signal->curr_target) 972 if (t == p->signal->curr_target)
1001 /* 973 /*
@@ -1209,6 +1181,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1209 return error; 1181 return error;
1210} 1182}
1211 1183
1184/* like kill_proc_info(), but doesn't use uid/euid of "current" */
1185int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1186 uid_t uid, uid_t euid)
1187{
1188 int ret = -EINVAL;
1189 struct task_struct *p;
1190
1191 if (!valid_signal(sig))
1192 return ret;
1193
1194 read_lock(&tasklist_lock);
1195 p = find_task_by_pid(pid);
1196 if (!p) {
1197 ret = -ESRCH;
1198 goto out_unlock;
1199 }
1200 if ((!info || ((unsigned long)info != 1 &&
1201 (unsigned long)info != 2 && SI_FROMUSER(info)))
1202 && (euid != p->suid) && (euid != p->uid)
1203 && (uid != p->suid) && (uid != p->uid)) {
1204 ret = -EPERM;
1205 goto out_unlock;
1206 }
1207 if (sig && p->sighand) {
1208 unsigned long flags;
1209 spin_lock_irqsave(&p->sighand->siglock, flags);
1210 ret = __group_send_sig_info(sig, info, p);
1211 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1212 }
1213out_unlock:
1214 read_unlock(&tasklist_lock);
1215 return ret;
1216}
1217EXPORT_SYMBOL_GPL(kill_proc_info_as_uid);
1212 1218
1213/* 1219/*
1214 * kill_something_info() interprets pid in interesting ways just like kill(2). 1220 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1380,16 +1386,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1380 unsigned long flags; 1386 unsigned long flags;
1381 int ret = 0; 1387 int ret = 0;
1382 1388
1383 /*
1384 * We need the tasklist lock even for the specific
1385 * thread case (when we don't need to follow the group
1386 * lists) in order to avoid races with "p->sighand"
1387 * going away or changing from under us.
1388 */
1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1390 read_lock(&tasklist_lock); 1390 read_lock(&tasklist_lock);
1391
1392 if (unlikely(p->flags & PF_EXITING)) {
1393 ret = -1;
1394 goto out_err;
1395 }
1396
1391 spin_lock_irqsave(&p->sighand->siglock, flags); 1397 spin_lock_irqsave(&p->sighand->siglock, flags);
1392 1398
1393 if (unlikely(!list_empty(&q->list))) { 1399 if (unlikely(!list_empty(&q->list))) {
1394 /* 1400 /*
 1395 * If an SI_TIMER entry is already queued just increment 1401
@@ -1399,7 +1405,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1399 BUG(); 1405 BUG();
1400 q->info.si_overrun++; 1406 q->info.si_overrun++;
1401 goto out; 1407 goto out;
1402 } 1408 }
1403 /* Short-circuit ignored signals. */ 1409 /* Short-circuit ignored signals. */
1404 if (sig_ignored(p, sig)) { 1410 if (sig_ignored(p, sig)) {
1405 ret = 1; 1411 ret = 1;
@@ -1414,8 +1420,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1414 1420
1415out: 1421out:
1416 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1422 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1423out_err:
1417 read_unlock(&tasklist_lock); 1424 read_unlock(&tasklist_lock);
1418 return(ret); 1425
1426 return ret;
1419} 1427}
1420 1428
1421int 1429int
@@ -1542,14 +1550,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1542 spin_unlock_irqrestore(&psig->siglock, flags); 1550 spin_unlock_irqrestore(&psig->siglock, flags);
1543} 1551}
1544 1552
1545static void 1553static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
1546do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
1547 int why)
1548{ 1554{
1549 struct siginfo info; 1555 struct siginfo info;
1550 unsigned long flags; 1556 unsigned long flags;
1557 struct task_struct *parent;
1551 struct sighand_struct *sighand; 1558 struct sighand_struct *sighand;
1552 1559
1560 if (to_self)
1561 parent = tsk->parent;
1562 else {
1563 tsk = tsk->group_leader;
1564 parent = tsk->real_parent;
1565 }
1566
1553 info.si_signo = SIGCHLD; 1567 info.si_signo = SIGCHLD;
1554 info.si_errno = 0; 1568 info.si_errno = 0;
1555 info.si_pid = tsk->pid; 1569 info.si_pid = tsk->pid;
@@ -1618,8 +1632,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1618 !(current->ptrace & PT_ATTACHED)) && 1632 !(current->ptrace & PT_ATTACHED)) &&
1619 (likely(current->parent->signal != current->signal) || 1633 (likely(current->parent->signal != current->signal) ||
1620 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1634 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1621 do_notify_parent_cldstop(current, current->parent, 1635 do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
1622 CLD_TRAPPED);
1623 read_unlock(&tasklist_lock); 1636 read_unlock(&tasklist_lock);
1624 schedule(); 1637 schedule();
1625 } else { 1638 } else {
@@ -1668,25 +1681,25 @@ void ptrace_notify(int exit_code)
1668static void 1681static void
1669finish_stop(int stop_count) 1682finish_stop(int stop_count)
1670{ 1683{
1684 int to_self;
1685
1671 /* 1686 /*
1672 * If there are no other threads in the group, or if there is 1687 * If there are no other threads in the group, or if there is
1673 * a group stop in progress and we are the last to stop, 1688 * a group stop in progress and we are the last to stop,
1674 * report to the parent. When ptraced, every thread reports itself. 1689 * report to the parent. When ptraced, every thread reports itself.
1675 */ 1690 */
1676 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { 1691 if (stop_count < 0 || (current->ptrace & PT_PTRACED))
1677 read_lock(&tasklist_lock); 1692 to_self = 1;
1678 do_notify_parent_cldstop(current, current->parent, 1693 else if (stop_count == 0)
1679 CLD_STOPPED); 1694 to_self = 0;
1680 read_unlock(&tasklist_lock); 1695 else
1681 } 1696 goto out;
1682 else if (stop_count == 0) {
1683 read_lock(&tasklist_lock);
1684 do_notify_parent_cldstop(current->group_leader,
1685 current->group_leader->real_parent,
1686 CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689 1697
1698 read_lock(&tasklist_lock);
1699 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1700 read_unlock(&tasklist_lock);
1701
1702out:
1690 schedule(); 1703 schedule();
1691 /* 1704 /*
1692 * Now we don't run again until continued. 1705 * Now we don't run again until continued.
@@ -1773,7 +1786,8 @@ do_signal_stop(int signr)
1773 * stop is always done with the siglock held, 1786 * stop is always done with the siglock held,
1774 * so this check has no races. 1787 * so this check has no races.
1775 */ 1788 */
1776 if (t->state < TASK_STOPPED) { 1789 if (!t->exit_state &&
1790 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1777 stop_count++; 1791 stop_count++;
1778 signal_wake_up(t, 0); 1792 signal_wake_up(t, 0);
1779 } 1793 }
@@ -2228,8 +2242,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2228 recalc_sigpending(); 2242 recalc_sigpending();
2229 spin_unlock_irq(&current->sighand->siglock); 2243 spin_unlock_irq(&current->sighand->siglock);
2230 2244
2231 current->state = TASK_INTERRUPTIBLE; 2245 timeout = schedule_timeout_interruptible(timeout);
2232 timeout = schedule_timeout(timeout);
2233 2246
2234 try_to_freeze(); 2247 try_to_freeze();
2235 spin_lock_irq(&current->sighand->siglock); 2248 spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4ab6af1dea8..f766b2fc48be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void)
84 cpu = smp_processor_id(); 84 cpu = smp_processor_id();
85restart: 85restart:
86 /* Reset the pending bitmask before enabling irqs */ 86 /* Reset the pending bitmask before enabling irqs */
87 local_softirq_pending() = 0; 87 set_softirq_pending(0);
88 88
89 local_irq_enable(); 89 local_irq_enable();
90 90
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat
5 *
 6 * this code detects soft lockups: incidents where, on a CPU,
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9
10#include <linux/mm.h>
11#include <linux/cpu.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/kthread.h>
15#include <linux/notifier.h>
16#include <linux/module.h>
17
18static DEFINE_SPINLOCK(print_lock);
19
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23
24static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event,
26 void *ptr)
27{
28 did_panic = 1;
29
30 return NOTIFY_DONE;
31}
32
33static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic,
35};
36
37void touch_softlockup_watchdog(void)
38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
40}
41EXPORT_SYMBOL(touch_softlockup_watchdog);
42
43/*
44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not:
46 */
47void softlockup_tick(struct pt_regs *regs)
48{
49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu);
51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp)
53 return;
54
55 /* Do not cause a second panic when there already was one */
56 if (did_panic)
57 return;
58
59 if (time_after(jiffies, timestamp + 10*HZ)) {
60 per_cpu(print_timestamp, this_cpu) = timestamp;
61
62 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu);
65 show_regs(regs);
66 spin_unlock(&print_lock);
67 }
68}
69
70/*
71 * The watchdog thread - runs every second and touches the timestamp.
72 */
73static int watchdog(void * __bind_cpu)
74{
75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79
80 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE;
82
83 set_current_state(TASK_INTERRUPTIBLE);
84
85 /*
86 * Run briefly once per second - if this gets delayed for
87 * more than 10 seconds then the debug-printout triggers
88 * in softlockup_tick():
89 */
90 while (!kthread_should_stop()) {
91 msleep_interruptible(1000);
92 touch_softlockup_watchdog();
93 }
94 __set_current_state(TASK_RUNNING);
95
96 return 0;
97}
98
99/*
100 * Create/destroy watchdog threads as CPUs come and go:
101 */
102static int __devinit
103cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
104{
105 int hotcpu = (unsigned long)hcpu;
106 struct task_struct *p;
107
108 switch (action) {
109 case CPU_UP_PREPARE:
110 BUG_ON(per_cpu(watchdog_task, hotcpu));
111 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
112 if (IS_ERR(p)) {
113 printk("watchdog for %i failed\n", hotcpu);
114 return NOTIFY_BAD;
115 }
116 per_cpu(watchdog_task, hotcpu) = p;
117 kthread_bind(p, hotcpu);
118 break;
119 case CPU_ONLINE:
120
121 wake_up_process(per_cpu(watchdog_task, hotcpu));
122 break;
123#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
127 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL;
130 kthread_stop(p);
131 break;
132#endif /* CONFIG_HOTPLUG_CPU */
133 }
134 return NOTIFY_OK;
135}
136
137static struct notifier_block __devinitdata cpu_nfb = {
138 .notifier_call = cpu_callback
139};
140
141__init void spawn_softlockup_task(void)
142{
143 void *cpu = (void *)(long)smp_processor_id();
144
145 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
146 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
147 register_cpu_notifier(&cpu_nfb);
148
149 notifier_chain_register(&panic_notifier_list, &panic_block);
150}
151
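The new detector works as a heartbeat: a per-CPU watchdog thread refreshes a timestamp roughly once a second, and the timer interrupt reports a soft lockup when that timestamp falls more than 10 seconds behind. A user-space analogue of the same scheme (the thread, the 10 second threshold and the polling interval below are illustrative, not kernel code):

/*
 * Illustrative sketch only -- a user-space analogue of the heartbeat
 * scheme above: one thread refreshes a timestamp, the main loop flags
 * a "lockup" when the timestamp goes stale.  The 10 second threshold
 * and 100 ms polling interval are arbitrary.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile time_t heartbeat;

static void *watchdog(void *unused)
{
    (void)unused;
    for (;;) {
        heartbeat = time(NULL);    /* like touch_softlockup_watchdog() */
        sleep(1);
    }
    return NULL;
}

/* Like softlockup_tick(): report if the watchdog fell too far behind. */
static void check_tick(void)
{
    if (time(NULL) > heartbeat + 10)
        fprintf(stderr, "BUG: soft lockup detected\n");
}

int main(void)
{
    pthread_t tid;

    heartbeat = time(NULL);
    pthread_create(&tid, NULL, watchdog, NULL);
    for (;;) {
        check_tick();
        usleep(100 * 1000);    /* stand-in for the timer interrupt */
    }
    return 0;
}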
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe17..0375fcd5921d 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
3 * 3 *
4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com> 4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
5 * 5 *
6 * Copyright (2004) Ingo Molnar 6 * Copyright (2004, 2005) Ingo Molnar
7 *
8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
7 */ 10 */
8 11
9#include <linux/config.h> 12#include <linux/config.h>
@@ -17,12 +20,12 @@
17 * Generic declaration of the raw read_trylock() function, 20 * Generic declaration of the raw read_trylock() function,
18 * architectures are supposed to optimize this: 21 * architectures are supposed to optimize this:
19 */ 22 */
20int __lockfunc generic_raw_read_trylock(rwlock_t *lock) 23int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
21{ 24{
22 _raw_read_lock(lock); 25 __raw_read_lock(lock);
23 return 1; 26 return 1;
24} 27}
25EXPORT_SYMBOL(generic_raw_read_trylock); 28EXPORT_SYMBOL(generic__raw_read_trylock);
26 29
27int __lockfunc _spin_trylock(spinlock_t *lock) 30int __lockfunc _spin_trylock(spinlock_t *lock)
28{ 31{
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
57} 60}
58EXPORT_SYMBOL(_write_trylock); 61EXPORT_SYMBOL(_write_trylock);
59 62
60#ifndef CONFIG_PREEMPT 63#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
61 64
62void __lockfunc _read_lock(rwlock_t *lock) 65void __lockfunc _read_lock(rwlock_t *lock)
63{ 66{
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
72 75
73 local_irq_save(flags); 76 local_irq_save(flags);
74 preempt_disable(); 77 preempt_disable();
75 _raw_spin_lock_flags(lock, flags); 78 _raw_spin_lock_flags(lock, &flags);
76 return flags; 79 return flags;
77} 80}
78EXPORT_SYMBOL(_spin_lock_irqsave); 81EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bcaed6560ac..2fa1ed18123c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,17 +361,35 @@ out_unlock:
361 return retval; 361 return retval;
362} 362}
363 363
364/**
365 * emergency_restart - reboot the system
366 *
 367 * Without shutting down any hardware or taking any locks,
 368 * reboot the system. This is called when we know we are in
 369 * trouble, so this is our best effort to reboot. This is
370 * safe to call in interrupt context.
371 */
364void emergency_restart(void) 372void emergency_restart(void)
365{ 373{
366 machine_emergency_restart(); 374 machine_emergency_restart();
367} 375}
368EXPORT_SYMBOL_GPL(emergency_restart); 376EXPORT_SYMBOL_GPL(emergency_restart);
369 377
370void kernel_restart(char *cmd) 378/**
379 * kernel_restart - reboot the system
380 *
381 * Shutdown everything and perform a clean reboot.
382 * This is not safe to call in interrupt context.
383 */
384void kernel_restart_prepare(char *cmd)
371{ 385{
372 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 386 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
373 system_state = SYSTEM_RESTART; 387 system_state = SYSTEM_RESTART;
374 device_shutdown(); 388 device_shutdown();
389}
390void kernel_restart(char *cmd)
391{
392 kernel_restart_prepare(cmd);
375 if (!cmd) { 393 if (!cmd) {
376 printk(KERN_EMERG "Restarting system.\n"); 394 printk(KERN_EMERG "Restarting system.\n");
377 } else { 395 } else {
@@ -382,6 +400,12 @@ void kernel_restart(char *cmd)
382} 400}
383EXPORT_SYMBOL_GPL(kernel_restart); 401EXPORT_SYMBOL_GPL(kernel_restart);
384 402
403/**
404 * kernel_kexec - reboot the system
405 *
406 * Move into place and start executing a preloaded standalone
407 * executable. If nothing was preloaded return an error.
408 */
385void kernel_kexec(void) 409void kernel_kexec(void)
386{ 410{
387#ifdef CONFIG_KEXEC 411#ifdef CONFIG_KEXEC
@@ -390,9 +414,7 @@ void kernel_kexec(void)
390 if (!image) { 414 if (!image) {
391 return; 415 return;
392 } 416 }
393 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); 417 kernel_restart_prepare(NULL);
394 system_state = SYSTEM_RESTART;
395 device_shutdown();
396 printk(KERN_EMERG "Starting new kernel\n"); 418 printk(KERN_EMERG "Starting new kernel\n");
397 machine_shutdown(); 419 machine_shutdown();
398 machine_kexec(image); 420 machine_kexec(image);
@@ -400,21 +422,39 @@ void kernel_kexec(void)
400} 422}
401EXPORT_SYMBOL_GPL(kernel_kexec); 423EXPORT_SYMBOL_GPL(kernel_kexec);
402 424
403void kernel_halt(void) 425/**
426 * kernel_halt - halt the system
427 *
428 * Shutdown everything and perform a clean system halt.
429 */
430void kernel_halt_prepare(void)
404{ 431{
405 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 432 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
406 system_state = SYSTEM_HALT; 433 system_state = SYSTEM_HALT;
407 device_shutdown(); 434 device_shutdown();
435}
436void kernel_halt(void)
437{
438 kernel_halt_prepare();
408 printk(KERN_EMERG "System halted.\n"); 439 printk(KERN_EMERG "System halted.\n");
409 machine_halt(); 440 machine_halt();
410} 441}
411EXPORT_SYMBOL_GPL(kernel_halt); 442EXPORT_SYMBOL_GPL(kernel_halt);
412 443
413void kernel_power_off(void) 444/**
445 * kernel_power_off - power_off the system
446 *
447 * Shutdown everything and perform a clean system power_off.
448 */
449void kernel_power_off_prepare(void)
414{ 450{
415 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 451 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
416 system_state = SYSTEM_POWER_OFF; 452 system_state = SYSTEM_POWER_OFF;
417 device_shutdown(); 453 device_shutdown();
454}
455void kernel_power_off(void)
456{
457 kernel_power_off_prepare();
418 printk(KERN_EMERG "Power down.\n"); 458 printk(KERN_EMERG "Power down.\n");
419 machine_power_off(); 459 machine_power_off();
420} 460}
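The kernel_restart_prepare()/kernel_halt_prepare()/kernel_power_off_prepare() helpers added above share one shape: run the reboot notifier chain with the right event, update system_state, shut devices down, and only then let the caller perform the machine-level action. A user-space mock of that notifier-then-act pattern (struct notifier, flush_disk_cache() and the event code are invented for illustration):

/*
 * Illustrative sketch only -- a user-space mock of the notifier-then-act
 * pattern used by the kernel_*_prepare() helpers above.  struct notifier,
 * flush_disk_cache() and the event code are invented for illustration.
 */
#include <stdio.h>

#define SYS_RESTART 1

struct notifier {
    int (*call)(int event);
    struct notifier *next;
};

static struct notifier *reboot_chain;

static void notifier_register(struct notifier *n)
{
    n->next = reboot_chain;
    reboot_chain = n;
}

/* Walk every registered callback in order, handing it the event code. */
static void notifier_call_chain(int event)
{
    struct notifier *n;

    for (n = reboot_chain; n; n = n->next)
        n->call(event);
}

static int flush_disk_cache(int event)
{
    printf("notifier: flushing caches for event %d\n", event);
    return 0;
}

static struct notifier disk_notifier = { .call = flush_disk_cache };

/* The "prepare" half: notify listeners and quiesce, but do not reboot. */
static void restart_prepare(void)
{
    notifier_call_chain(SYS_RESTART);
    printf("system_state = RESTART, shutting devices down\n");
}

int main(void)
{
    notifier_register(&disk_notifier);
    restart_prepare();
    printf("machine-level restart would happen here\n");
    return 0;
}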
@@ -1711,7 +1751,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1711 unsigned long arg4, unsigned long arg5) 1751 unsigned long arg4, unsigned long arg5)
1712{ 1752{
1713 long error; 1753 long error;
1714 int sig;
1715 1754
1716 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1755 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1717 if (error) 1756 if (error)
@@ -1719,19 +1758,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1719 1758
1720 switch (option) { 1759 switch (option) {
1721 case PR_SET_PDEATHSIG: 1760 case PR_SET_PDEATHSIG:
1722 sig = arg2; 1761 if (!valid_signal(arg2)) {
1723 if (!valid_signal(sig)) {
1724 error = -EINVAL; 1762 error = -EINVAL;
1725 break; 1763 break;
1726 } 1764 }
1727 current->pdeath_signal = sig; 1765 current->pdeath_signal = arg2;
1728 break; 1766 break;
1729 case PR_GET_PDEATHSIG: 1767 case PR_GET_PDEATHSIG:
1730 error = put_user(current->pdeath_signal, (int __user *)arg2); 1768 error = put_user(current->pdeath_signal, (int __user *)arg2);
1731 break; 1769 break;
1732 case PR_GET_DUMPABLE: 1770 case PR_GET_DUMPABLE:
1733 if (current->mm->dumpable) 1771 error = current->mm->dumpable;
1734 error = 1;
1735 break; 1772 break;
1736 case PR_SET_DUMPABLE: 1773 case PR_SET_DUMPABLE:
1737 if (arg2 < 0 || arg2 > 2) { 1774 if (arg2 < 0 || arg2 > 2) {
diff --git a/kernel/time.c b/kernel/time.c
index dd5ae1162a8f..40c2410ac99a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -570,6 +570,7 @@ void getnstimeofday(struct timespec *tv)
570 tv->tv_sec = x.tv_sec; 570 tv->tv_sec = x.tv_sec;
571 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; 571 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
572} 572}
573EXPORT_SYMBOL_GPL(getnstimeofday);
573#endif 574#endif
574 575
575#if (BITS_PER_LONG < 64) 576#if (BITS_PER_LONG < 64)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..3ba10fa35b60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
950{ 950{
951 jiffies_64++; 951 jiffies_64++;
952 update_times(); 952 update_times();
953 softlockup_tick(regs);
953} 954}
954 955
955#ifdef __ARCH_WANT_SYS_ALARM 956#ifdef __ARCH_WANT_SYS_ALARM
@@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1150 out: 1151 out:
1151 return timeout < 0 ? 0 : timeout; 1152 return timeout < 0 ? 0 : timeout;
1152} 1153}
1153
1154EXPORT_SYMBOL(schedule_timeout); 1154EXPORT_SYMBOL(schedule_timeout);
1155 1155
1156/*
1157 * We can use __set_current_state() here because schedule_timeout() calls
1158 * schedule() unconditionally.
1159 */
1160signed long __sched schedule_timeout_interruptible(signed long timeout)
1161{
1162 __set_current_state(TASK_INTERRUPTIBLE);
1163 return schedule_timeout(timeout);
1164}
1165EXPORT_SYMBOL(schedule_timeout_interruptible);
1166
1167signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1168{
1169 __set_current_state(TASK_UNINTERRUPTIBLE);
1170 return schedule_timeout(timeout);
1171}
1172EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1173
1156/* Thread ID - the internal kernel "pid" */ 1174/* Thread ID - the internal kernel "pid" */
1157asmlinkage long sys_gettid(void) 1175asmlinkage long sys_gettid(void)
1158{ 1176{
@@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
1169 if (!time_after(expire, now)) 1187 if (!time_after(expire, now))
1170 return 0; 1188 return 0;
1171 1189
1172 current->state = TASK_INTERRUPTIBLE; 1190 expire = schedule_timeout_interruptible(expire - now);
1173 expire = schedule_timeout(expire - now);
1174 1191
1175 ret = 0; 1192 ret = 0;
1176 if (expire) { 1193 if (expire) {
@@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
1198 return -EINVAL; 1215 return -EINVAL;
1199 1216
1200 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 1217 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1201 current->state = TASK_INTERRUPTIBLE; 1218 expire = schedule_timeout_interruptible(expire);
1202 expire = schedule_timeout(expire);
1203 1219
1204 ret = 0; 1220 ret = 0;
1205 if (expire) { 1221 if (expire) {
@@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1428 } 1444 }
1429} 1445}
1430 1446
1431static inline u64 time_interpolator_get_counter(void) 1447static inline u64 time_interpolator_get_counter(int writelock)
1432{ 1448{
1433 unsigned int src = time_interpolator->source; 1449 unsigned int src = time_interpolator->source;
1434 1450
@@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void)
1442 now = time_interpolator_get_cycles(src); 1458 now = time_interpolator_get_cycles(src);
1443 if (lcycle && time_after(lcycle, now)) 1459 if (lcycle && time_after(lcycle, now))
1444 return lcycle; 1460 return lcycle;
1461
1462 /* When holding the xtime write lock, there's no need
1463 * to add the overhead of the cmpxchg. Readers are
 1464 * forced to retry until the write lock is released.
1465 */
1466 if (writelock) {
1467 time_interpolator->last_cycle = now;
1468 return now;
1469 }
1445 /* Keep track of the last timer value returned. The use of cmpxchg here 1470 /* Keep track of the last timer value returned. The use of cmpxchg here
1446 * will cause contention in an SMP environment. 1471 * will cause contention in an SMP environment.
1447 */ 1472 */
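The comments above explain the trade-off: on the lockless read path, cmpxchg keeps last_cycle monotonic at the cost of SMP contention, while the writelock path can simply store the new value because readers retry until the write lock is dropped. A user-space sketch of the monotonic last-value idea, using GCC's __sync_val_compare_and_swap as a stand-in for the kernel's cmpxchg (the counter values in main() are arbitrary):

/*
 * Illustrative sketch only -- the lockless path above uses cmpxchg to keep
 * a monotonic "last cycle" value.  GCC's __sync_val_compare_and_swap is
 * used here as a stand-in for the kernel's cmpxchg; the counter values in
 * main() are arbitrary.
 */
#include <stdio.h>

static unsigned long last_cycle;

static unsigned long read_counter(unsigned long hw_now)
{
    unsigned long prev = last_cycle;

    /* If the hardware counter appears to go backwards, reuse the old value. */
    if (prev && prev > hw_now)
        return prev;

    /* Publish hw_now; if another reader raced us, still return hw_now. */
    (void)__sync_val_compare_and_swap(&last_cycle, prev, hw_now);
    return hw_now;
}

int main(void)
{
    printf("%lu\n", read_counter(100));    /* 100                        */
    printf("%lu\n", read_counter(90));     /* still 100: never goes back */
    printf("%lu\n", read_counter(120));    /* 120                        */
    return 0;
}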
@@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void)
1455void time_interpolator_reset(void) 1480void time_interpolator_reset(void)
1456{ 1481{
1457 time_interpolator->offset = 0; 1482 time_interpolator->offset = 0;
1458 time_interpolator->last_counter = time_interpolator_get_counter(); 1483 time_interpolator->last_counter = time_interpolator_get_counter(1);
1459} 1484}
1460 1485
1461#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1486#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
@@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void)
1467 return 0; 1492 return 0;
1468 1493
1469 return time_interpolator->offset + 1494 return time_interpolator->offset +
1470 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); 1495 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1471} 1496}
1472 1497
1473#define INTERPOLATOR_ADJUST 65536 1498#define INTERPOLATOR_ADJUST 65536
@@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec)
1490 * and the tuning logic insures that. 1515 * and the tuning logic insures that.
1491 */ 1516 */
1492 1517
1493 counter = time_interpolator_get_counter(); 1518 counter = time_interpolator_get_counter(1);
1494 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1519 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1495 1520
1496 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1521 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
@@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs)
1588{ 1613{
1589 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1614 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1590 1615
1591 while (timeout) { 1616 while (timeout)
1592 set_current_state(TASK_UNINTERRUPTIBLE); 1617 timeout = schedule_timeout_uninterruptible(timeout);
1593 timeout = schedule_timeout(timeout);
1594 }
1595} 1618}
1596 1619
1597EXPORT_SYMBOL(msleep); 1620EXPORT_SYMBOL(msleep);
@@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs)
1604{ 1627{
1605 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1628 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1606 1629
1607 while (timeout && !signal_pending(current)) { 1630 while (timeout && !signal_pending(current))
1608 set_current_state(TASK_INTERRUPTIBLE); 1631 timeout = schedule_timeout_interruptible(timeout);
1609 timeout = schedule_timeout(timeout);
1610 }
1611 return jiffies_to_msecs(timeout); 1632 return jiffies_to_msecs(timeout);
1612} 1633}
1613 1634
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c7e36d4a70ca..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
308 struct workqueue_struct *wq; 308 struct workqueue_struct *wq;
309 struct task_struct *p; 309 struct task_struct *p;
310 310
311 wq = kmalloc(sizeof(*wq), GFP_KERNEL); 311 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
312 if (!wq) 312 if (!wq)
313 return NULL; 313 return NULL;
314 memset(wq, 0, sizeof(*wq));
315 314
316 wq->name = name; 315 wq->name = name;
317 /* We don't need the distraction of CPUs appearing and vanishing. */ 316 /* We don't need the distraction of CPUs appearing and vanishing. */
@@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
499 case CPU_UP_PREPARE: 498 case CPU_UP_PREPARE:
500 /* Create a new workqueue thread for it. */ 499 /* Create a new workqueue thread for it. */
501 list_for_each_entry(wq, &workqueues, list) { 500 list_for_each_entry(wq, &workqueues, list) {
502 if (create_workqueue_thread(wq, hotcpu) < 0) { 501 if (!create_workqueue_thread(wq, hotcpu)) {
503 printk("workqueue for %i failed\n", hotcpu); 502 printk("workqueue for %i failed\n", hotcpu);
504 return NOTIFY_BAD; 503 return NOTIFY_BAD;
505 } 504 }