Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/audit.c | 175
-rw-r--r--  kernel/audit.h | 88
-rw-r--r--  kernel/auditfilter.c | 630
-rw-r--r--  kernel/auditsc.c | 836
-rw-r--r--  kernel/capability.c | 16
-rw-r--r--  kernel/compat.c | 82
-rw-r--r--  kernel/cpu.c | 32
-rw-r--r--  kernel/cpuset.c | 364
-rw-r--r--  kernel/exec_domain.c | 1
-rw-r--r--  kernel/exit.c | 141
-rw-r--r--  kernel/fork.c | 147
-rw-r--r--  kernel/futex.c | 170
-rw-r--r--  kernel/futex_compat.c | 142
-rw-r--r--  kernel/hrtimer.c | 193
-rw-r--r--  kernel/irq/Makefile | 3
-rw-r--r--  kernel/irq/manage.c | 24
-rw-r--r--  kernel/irq/migration.c | 65
-rw-r--r--  kernel/itimer.c | 117
-rw-r--r--  kernel/kmod.c | 2
-rw-r--r--  kernel/kprobes.c | 24
-rw-r--r--  kernel/ksysfs.c | 4
-rw-r--r--  kernel/kthread.c | 9
-rw-r--r--  kernel/module.c | 263
-rw-r--r--  kernel/panic.c | 101
-rw-r--r--  kernel/params.c | 14
-rw-r--r--  kernel/pid.c | 40
-rw-r--r--  kernel/posix-timers.c | 68
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/disk.c | 20
-rw-r--r--  kernel/power/main.c | 2
-rw-r--r--  kernel/power/pm.c | 21
-rw-r--r--  kernel/power/power.h | 75
-rw-r--r--  kernel/power/process.c | 61
-rw-r--r--  kernel/power/smp.c | 4
-rw-r--r--  kernel/power/snapshot.c | 335
-rw-r--r--  kernel/power/swap.c | 545
-rw-r--r--  kernel/power/swsusp.c | 887
-rw-r--r--  kernel/power/user.c | 333
-rw-r--r--  kernel/printk.c | 76
-rw-r--r--  kernel/profile.c | 64
-rw-r--r--  kernel/ptrace.c | 8
-rw-r--r--  kernel/rcupdate.c | 19
-rw-r--r--  kernel/rcutorture.c | 37
-rw-r--r--  kernel/relay.c | 1012
-rw-r--r--  kernel/sched.c | 180
-rw-r--r--  kernel/signal.c | 355
-rw-r--r--  kernel/softirq.c | 20
-rw-r--r--  kernel/softlockup.c | 57
-rw-r--r--  kernel/spinlock.c | 9
-rw-r--r--  kernel/sys.c | 514
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 19
-rw-r--r--  kernel/time.c | 63
-rw-r--r--  kernel/timer.c | 74
-rw-r--r--  kernel/user.c | 10
-rw-r--r--  kernel/workqueue.c | 29
57 files changed, 5814 insertions(+), 2778 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde81..58908f9d15 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
@@ -26,7 +29,7 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
-obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -34,6 +37,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RELAY) += relay.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a813d2883..04fe2e301b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -52,6 +52,7 @@
52#include <linux/audit.h> 52#include <linux/audit.h>
53 53
54#include <net/sock.h> 54#include <net/sock.h>
55#include <net/netlink.h>
55#include <linux/skbuff.h> 56#include <linux/skbuff.h>
56#include <linux/netlink.h> 57#include <linux/netlink.h>
57 58
@@ -72,7 +73,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
72 * contains the (non-zero) pid. */ 73 * contains the (non-zero) pid. */
73int audit_pid; 74int audit_pid;
74 75
75/* If audit_limit is non-zero, limit the rate of sending audit records 76/* If audit_rate_limit is non-zero, limit the rate of sending audit records
76 * to that number per second. This prevents DoS attacks, but results in 77 * to that number per second. This prevents DoS attacks, but results in
77 * audit records being dropped. */ 78 * audit records being dropped. */
78static int audit_rate_limit; 79static int audit_rate_limit;
@@ -102,7 +103,7 @@ static struct sock *audit_sock;
102 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of 103 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
103 * being placed on the freelist). */ 104 * being placed on the freelist). */
104static DEFINE_SPINLOCK(audit_freelist_lock); 105static DEFINE_SPINLOCK(audit_freelist_lock);
105static int audit_freelist_count = 0; 106static int audit_freelist_count;
106static LIST_HEAD(audit_freelist); 107static LIST_HEAD(audit_freelist);
107 108
108static struct sk_buff_head audit_skb_queue; 109static struct sk_buff_head audit_skb_queue;
@@ -113,7 +114,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
113/* The netlink socket is only to be read by 1 CPU, which lets us assume 114/* The netlink socket is only to be read by 1 CPU, which lets us assume
114 * that list additions and deletions never happen simultaneously in 115 * that list additions and deletions never happen simultaneously in
115 * auditsc.c */ 116 * auditsc.c */
116DECLARE_MUTEX(audit_netlink_sem); 117DEFINE_MUTEX(audit_netlink_mutex);
117 118
118/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 119/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
119 * audit records. Since printk uses a 1024 byte buffer, this buffer 120 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -142,7 +143,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
142 nlh->nlmsg_pid = pid; 143 nlh->nlmsg_pid = pid;
143} 144}
144 145
145static void audit_panic(const char *message) 146void audit_panic(const char *message)
146{ 147{
147 switch (audit_failure) 148 switch (audit_failure)
148 { 149 {
@@ -186,8 +187,14 @@ static inline int audit_rate_check(void)
186 return retval; 187 return retval;
187} 188}
188 189
189/* Emit at least 1 message per second, even if audit_rate_check is 190/**
190 * throttling. */ 191 * audit_log_lost - conditionally log lost audit message event
192 * @message: the message stating reason for lost audit message
193 *
194 * Emit at least 1 message per second, even if audit_rate_check is
195 * throttling.
196 * Always increment the lost messages counter.
197*/
191void audit_log_lost(const char *message) 198void audit_log_lost(const char *message)
192{ 199{
193 static unsigned long last_msg = 0; 200 static unsigned long last_msg = 0;
@@ -218,7 +225,6 @@ void audit_log_lost(const char *message)
218 audit_backlog_limit); 225 audit_backlog_limit);
219 audit_panic(message); 226 audit_panic(message);
220 } 227 }
221
222} 228}
223 229
224static int audit_set_rate_limit(int limit, uid_t loginuid) 230static int audit_set_rate_limit(int limit, uid_t loginuid)
@@ -300,8 +306,22 @@ static int kauditd_thread(void *dummy)
300 remove_wait_queue(&kauditd_wait, &wait); 306 remove_wait_queue(&kauditd_wait, &wait);
301 } 307 }
302 } 308 }
309 return 0;
303} 310}
304 311
312/**
313 * audit_send_reply - send an audit reply message via netlink
314 * @pid: process id to send reply to
315 * @seq: sequence number
316 * @type: audit message type
317 * @done: done (last) flag
318 * @multi: multi-part message flag
319 * @payload: payload data
320 * @size: payload size
321 *
322 * Allocates an skb, builds the netlink message, and sends it to the pid.
323 * No failure notifications.
324 */
305void audit_send_reply(int pid, int seq, int type, int done, int multi, 325void audit_send_reply(int pid, int seq, int type, int done, int multi,
306 void *payload, int size) 326 void *payload, int size)
307{ 327{
@@ -342,15 +362,19 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
342 switch (msg_type) { 362 switch (msg_type) {
343 case AUDIT_GET: 363 case AUDIT_GET:
344 case AUDIT_LIST: 364 case AUDIT_LIST:
365 case AUDIT_LIST_RULES:
345 case AUDIT_SET: 366 case AUDIT_SET:
346 case AUDIT_ADD: 367 case AUDIT_ADD:
368 case AUDIT_ADD_RULE:
347 case AUDIT_DEL: 369 case AUDIT_DEL:
370 case AUDIT_DEL_RULE:
348 case AUDIT_SIGNAL_INFO: 371 case AUDIT_SIGNAL_INFO:
349 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) 372 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
350 err = -EPERM; 373 err = -EPERM;
351 break; 374 break;
352 case AUDIT_USER: 375 case AUDIT_USER:
353 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 376 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
377 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
354 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) 378 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
355 err = -EPERM; 379 err = -EPERM;
356 break; 380 break;
@@ -376,7 +400,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
376 if (err) 400 if (err)
377 return err; 401 return err;
378 402
379 /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ 403 /* As soon as there's any sign of userspace auditd,
404 * start kauditd to talk to it */
380 if (!kauditd_task) 405 if (!kauditd_task)
381 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); 406 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
382 if (IS_ERR(kauditd_task)) { 407 if (IS_ERR(kauditd_task)) {
@@ -430,6 +455,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
430 break; 455 break;
431 case AUDIT_USER: 456 case AUDIT_USER:
432 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 457 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
458 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
433 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 459 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
434 return 0; 460 return 0;
435 461
@@ -448,12 +474,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
448 break; 474 break;
449 case AUDIT_ADD: 475 case AUDIT_ADD:
450 case AUDIT_DEL: 476 case AUDIT_DEL:
451 if (nlh->nlmsg_len < sizeof(struct audit_rule)) 477 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
452 return -EINVAL; 478 return -EINVAL;
453 /* fallthrough */ 479 /* fallthrough */
454 case AUDIT_LIST: 480 case AUDIT_LIST:
455 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 481 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
456 uid, seq, data, loginuid); 482 uid, seq, data, nlmsg_len(nlh),
483 loginuid);
484 break;
485 case AUDIT_ADD_RULE:
486 case AUDIT_DEL_RULE:
487 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
488 return -EINVAL;
489 /* fallthrough */
490 case AUDIT_LIST_RULES:
491 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
492 uid, seq, data, nlmsg_len(nlh),
493 loginuid);
457 break; 494 break;
458 case AUDIT_SIGNAL_INFO: 495 case AUDIT_SIGNAL_INFO:
459 sig_data.uid = audit_sig_uid; 496 sig_data.uid = audit_sig_uid;
@@ -469,9 +506,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
469 return err < 0 ? err : 0; 506 return err < 0 ? err : 0;
470} 507}
471 508
472/* Get message from skb (based on rtnetlink_rcv_skb). Each message is 509/*
510 * Get message from skb (based on rtnetlink_rcv_skb). Each message is
473 * processed by audit_receive_msg. Malformed skbs with wrong length are 511 * processed by audit_receive_msg. Malformed skbs with wrong length are
474 * discarded silently. */ 512 * discarded silently.
513 */
475static void audit_receive_skb(struct sk_buff *skb) 514static void audit_receive_skb(struct sk_buff *skb)
476{ 515{
477 int err; 516 int err;
@@ -499,14 +538,14 @@ static void audit_receive(struct sock *sk, int length)
499 struct sk_buff *skb; 538 struct sk_buff *skb;
500 unsigned int qlen; 539 unsigned int qlen;
501 540
502 down(&audit_netlink_sem); 541 mutex_lock(&audit_netlink_mutex);
503 542
504 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { 543 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
505 skb = skb_dequeue(&sk->sk_receive_queue); 544 skb = skb_dequeue(&sk->sk_receive_queue);
506 audit_receive_skb(skb); 545 audit_receive_skb(skb);
507 kfree_skb(skb); 546 kfree_skb(skb);
508 } 547 }
509 up(&audit_netlink_sem); 548 mutex_unlock(&audit_netlink_mutex);
510} 549}
511 550
512 551
@@ -519,8 +558,9 @@ static int __init audit_init(void)
519 THIS_MODULE); 558 THIS_MODULE);
520 if (!audit_sock) 559 if (!audit_sock)
521 audit_panic("cannot initialize netlink socket"); 560 audit_panic("cannot initialize netlink socket");
561 else
562 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
522 563
523 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
524 skb_queue_head_init(&audit_skb_queue); 564 skb_queue_head_init(&audit_skb_queue);
525 audit_initialized = 1; 565 audit_initialized = 1;
526 audit_enabled = audit_default; 566 audit_enabled = audit_default;
@@ -600,7 +640,10 @@ err:
600 return NULL; 640 return NULL;
601} 641}
602 642
603/* Compute a serial number for the audit record. Audit records are 643/**
644 * audit_serial - compute a serial number for the audit record
645 *
646 * Compute a serial number for the audit record. Audit records are
604 * written to user-space as soon as they are generated, so a complete 647 * written to user-space as soon as they are generated, so a complete
605 * audit record may be written in several pieces. The timestamp of the 648 * audit record may be written in several pieces. The timestamp of the
606 * record and this serial number are used by the user-space tools to 649 * record and this serial number are used by the user-space tools to
@@ -612,8 +655,8 @@ err:
612 * audit context (for those records that have a context), and emit them 655 * audit context (for those records that have a context), and emit them
613 * all at syscall exit. However, this could delay the reporting of 656 * all at syscall exit. However, this could delay the reporting of
614 * significant errors until syscall exit (or never, if the system 657 * significant errors until syscall exit (or never, if the system
615 * halts). */ 658 * halts).
616 659 */
617unsigned int audit_serial(void) 660unsigned int audit_serial(void)
618{ 661{
619 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; 662 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
@@ -649,6 +692,21 @@ static inline void audit_get_stamp(struct audit_context *ctx,
649 * will be written at syscall exit. If there is no associated task, tsk 692 * will be written at syscall exit. If there is no associated task, tsk
650 * should be NULL. */ 693 * should be NULL. */
651 694
695/**
696 * audit_log_start - obtain an audit buffer
697 * @ctx: audit_context (may be NULL)
698 * @gfp_mask: type of allocation
699 * @type: audit message type
700 *
701 * Returns audit_buffer pointer on success or NULL on error.
702 *
703 * Obtain an audit buffer. This routine does locking to obtain the
704 * audit buffer, but then no locking is required for calls to
705 * audit_log_*format. If the task (ctx) is a task that is currently in a
706 * syscall, then the syscall is marked as auditable and an audit record
707 * will be written at syscall exit. If there is no associated task, then
708 * task context (ctx) should be NULL.
709 */
652struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, 710struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
653 int type) 711 int type)
654{ 712{
@@ -661,6 +719,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
661 if (!audit_initialized) 719 if (!audit_initialized)
662 return NULL; 720 return NULL;
663 721
722 if (unlikely(audit_filter_type(type)))
723 return NULL;
724
664 if (gfp_mask & __GFP_WAIT) 725 if (gfp_mask & __GFP_WAIT)
665 reserve = 0; 726 reserve = 0;
666 else 727 else
@@ -713,6 +774,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
713/** 774/**
714 * audit_expand - expand skb in the audit buffer 775 * audit_expand - expand skb in the audit buffer
715 * @ab: audit_buffer 776 * @ab: audit_buffer
777 * @extra: space to add at tail of the skb
716 * 778 *
717 * Returns 0 (no space) on failed expansion, or available space if 779 * Returns 0 (no space) on failed expansion, or available space if
718 * successful. 780 * successful.
@@ -729,10 +791,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
729 return skb_tailroom(skb); 791 return skb_tailroom(skb);
730} 792}
731 793
732/* Format an audit message into the audit buffer. If there isn't enough 794/*
795 * Format an audit message into the audit buffer. If there isn't enough
733 * room in the audit buffer, more room will be allocated and vsnprint 796 * room in the audit buffer, more room will be allocated and vsnprint
734 * will be called a second time. Currently, we assume that a printk 797 * will be called a second time. Currently, we assume that a printk
735 * can't format message larger than 1024 bytes, so we don't either. */ 798 * can't format message larger than 1024 bytes, so we don't either.
799 */
736static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, 800static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
737 va_list args) 801 va_list args)
738{ 802{
@@ -757,7 +821,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
757 /* The printk buffer is 1024 bytes long, so if we get 821 /* The printk buffer is 1024 bytes long, so if we get
758 * here and AUDIT_BUFSIZ is at least 1024, then we can 822 * here and AUDIT_BUFSIZ is at least 1024, then we can
759 * log everything that printk could have logged. */ 823 * log everything that printk could have logged. */
760 avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 824 avail = audit_expand(ab,
825 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
761 if (!avail) 826 if (!avail)
762 goto out; 827 goto out;
763 len = vsnprintf(skb->tail, avail, fmt, args2); 828 len = vsnprintf(skb->tail, avail, fmt, args2);
@@ -768,8 +833,14 @@ out:
768 return; 833 return;
769} 834}
770 835
771/* Format a message into the audit buffer. All the work is done in 836/**
772 * audit_log_vformat. */ 837 * audit_log_format - format a message into the audit buffer.
838 * @ab: audit_buffer
839 * @fmt: format string
840 * @...: optional parameters matching @fmt string
841 *
842 * All the work is done in audit_log_vformat.
843 */
773void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 844void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
774{ 845{
775 va_list args; 846 va_list args;
@@ -781,9 +852,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
781 va_end(args); 852 va_end(args);
782} 853}
783 854
784/* This function will take the passed buf and convert it into a string of 855/**
785 * ascii hex digits. The new string is placed onto the skb. */ 856 * audit_log_hex - convert a buffer to hex and append it to the audit skb
786void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 857 * @ab: the audit_buffer
858 * @buf: buffer to convert to hex
859 * @len: length of @buf to be converted
860 *
861 * No return value; failure to expand is silently ignored.
862 *
863 * This function will take the passed buf and convert it into a string of
864 * ascii hex digits. The new string is placed onto the skb.
865 */
866void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
787 size_t len) 867 size_t len)
788{ 868{
789 int i, avail, new_len; 869 int i, avail, new_len;
@@ -812,10 +892,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
812 skb_put(skb, len << 1); /* new string is twice the old string */ 892 skb_put(skb, len << 1); /* new string is twice the old string */
813} 893}
814 894
815/* This code will escape a string that is passed to it if the string 895/**
816 * contains a control character, unprintable character, double quote mark, 896 * audit_log_untrustedstring - log a string that may contain random characters
897 * @ab: audit_buffer
898 * @string: string to be logged
899 *
900 * This code will escape a string that is passed to it if the string
901 * contains a control character, unprintable character, double quote mark,
817 * or a space. Unescaped strings will start and end with a double quote mark. 902 * or a space. Unescaped strings will start and end with a double quote mark.
818 * Strings that are escaped are printed in hex (2 digits per char). */ 903 * Strings that are escaped are printed in hex (2 digits per char).
904 */
819void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 905void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
820{ 906{
821 const unsigned char *p = string; 907 const unsigned char *p = string;
@@ -854,10 +940,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
854 kfree(path); 940 kfree(path);
855} 941}
856 942
857/* The netlink_* functions cannot be called inside an irq context, so 943/**
858 * the audit buffer is places on a queue and a tasklet is scheduled to 944 * audit_log_end - end one audit record
945 * @ab: the audit_buffer
946 *
947 * The netlink_* functions cannot be called inside an irq context, so
948 * the audit buffer is placed on a queue and a tasklet is scheduled to
859 * remove them from the queue outside the irq context. May be called in 949 * remove them from the queue outside the irq context. May be called in
860 * any context. */ 950 * any context.
951 */
861void audit_log_end(struct audit_buffer *ab) 952void audit_log_end(struct audit_buffer *ab)
862{ 953{
863 if (!ab) 954 if (!ab)
@@ -878,9 +969,18 @@ void audit_log_end(struct audit_buffer *ab)
878 audit_buffer_free(ab); 969 audit_buffer_free(ab);
879} 970}
880 971
881/* Log an audit record. This is a convenience function that calls 972/**
882 * audit_log_start, audit_log_vformat, and audit_log_end. It may be 973 * audit_log - Log an audit record
883 * called in any context. */ 974 * @ctx: audit context
975 * @gfp_mask: type of allocation
976 * @type: audit message type
977 * @fmt: format string to use
978 * @...: variable parameters matching the format string
979 *
980 * This is a convenience function that calls audit_log_start,
981 * audit_log_vformat, and audit_log_end. It may be called
982 * in any context.
983 */
884void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 984void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
885 const char *fmt, ...) 985 const char *fmt, ...)
886{ 986{
@@ -895,3 +995,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
895 audit_log_end(ab); 995 audit_log_end(ab);
896 } 996 }
897} 997}
998
999EXPORT_SYMBOL(audit_log_start);
1000EXPORT_SYMBOL(audit_log_end);
1001EXPORT_SYMBOL(audit_log_format);
1002EXPORT_SYMBOL(audit_log);
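
The kernel-doc comments added above document the exported logging entry points (audit_log_start, audit_log_format, audit_log_end, and the audit_log convenience wrapper). As a minimal usage sketch, not part of this patch, here is how another kernel subsystem might emit a one-off record with them; AUDIT_KERNEL is used as an illustrative message type, and the NULL context means the record is not tied to a syscall:

#include <linux/audit.h>
#include <linux/gfp.h>

static void example_log_event(int value)
{
	struct audit_buffer *ab;

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)
		return;		/* auditing disabled, type filtered, or no memory */
	audit_log_format(ab, "example event value=%d", value);
	audit_log_end(ab);	/* queue the record for kauditd (or printk fallback) */
}

The same record can be produced in a single call with the wrapper:
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "example event value=%d", value);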
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 0000000000..bc5392076e
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,88 @@
1/* audit -- definition of audit_context structure and supporting types
2 *
3 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/mutex.h>
23#include <linux/fs.h>
24#include <linux/audit.h>
25
26/* 0 = no checking
27 1 = put_count checking
28 2 = verbose put_count checking
29*/
30#define AUDIT_DEBUG 0
31
32/* At task start time, the audit_state is set in the audit_context using
33 a per-task filter. At syscall entry, the audit_state is augmented by
34 the syscall filter. */
35enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can
38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
44 * and always fill it in at syscall
45 * entry time. This makes a full
46 * syscall record available if some
47 * other part of the kernel decides it
48 * should be recorded. */
49 AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
50 * always fill it in at syscall entry
51 * time, and always write out the audit
52 * record at syscall exit time. */
53};
54
55/* Rule lists */
56struct audit_field {
57 u32 type;
58 u32 val;
59 u32 op;
60};
61
62struct audit_krule {
63 int vers_ops;
64 u32 flags;
65 u32 listnr;
66 u32 action;
67 u32 mask[AUDIT_BITMASK_SIZE];
68 u32 buflen; /* for data alloc on list rules */
69 u32 field_count;
70 struct audit_field *fields;
71};
72
73struct audit_entry {
74 struct list_head list;
75 struct rcu_head rcu;
76 struct audit_krule rule;
77};
78
79
80extern int audit_pid;
81extern int audit_comparator(const u32 left, const u32 op, const u32 right);
82
83extern void audit_send_reply(int pid, int seq, int type,
84 int done, int multi,
85 void *payload, int size);
86extern void audit_log_lost(const char *message);
87extern void audit_panic(const char *message);
88extern struct mutex audit_netlink_mutex;
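
For orientation (an illustration, not part of the patch): a rule such as "uid == 0" destined for the syscall-exit list would be held in kernel memory using the structures above roughly as follows. Real entries are built from userspace data by audit_rule_to_entry()/audit_data_to_entry() in kernel/auditfilter.c; this sketch builds one by hand:

#include <linux/audit.h>
#include <linux/slab.h>
#include "audit.h"

static struct audit_entry *example_uid_rule(void)
{
	struct audit_entry *entry;
	struct audit_field *field;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	field = kzalloc(sizeof(*field), GFP_KERNEL);
	if (!entry || !field) {
		kfree(entry);
		kfree(field);
		return NULL;
	}

	field->type = AUDIT_UID;	/* attribute to test */
	field->op   = AUDIT_EQUAL;	/* new-style comparison operator */
	field->val  = 0;		/* ...against uid 0 */

	entry->rule.vers_ops    = 2;			/* operator-style rule */
	entry->rule.listnr      = AUDIT_FILTER_EXIT;	/* syscall-exit filter list */
	entry->rule.action      = AUDIT_ALWAYS;
	entry->rule.field_count = 1;
	entry->rule.fields      = field;
	/* rule.mask[] (the per-syscall bitmap) is left zero here for brevity;
	 * userspace normally sets the bits for the syscalls of interest. */
	return entry;
}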
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 0000000000..d3a8539f3a
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,630 @@
1/* auditfilter.c -- filtering of audit events
2 *
3 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/netlink.h>
26#include "audit.h"
27
28/* There are three lists of rules -- one to search at task creation
29 * time, one to search at syscall entry time, and another to search at
30 * syscall exit time. */
31struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
32 LIST_HEAD_INIT(audit_filter_list[0]),
33 LIST_HEAD_INIT(audit_filter_list[1]),
34 LIST_HEAD_INIT(audit_filter_list[2]),
35 LIST_HEAD_INIT(audit_filter_list[3]),
36 LIST_HEAD_INIT(audit_filter_list[4]),
37 LIST_HEAD_INIT(audit_filter_list[5]),
38#if AUDIT_NR_FILTERS != 6
39#error Fix audit_filter_list initialiser
40#endif
41};
42
43static inline void audit_free_rule(struct audit_entry *e)
44{
45 kfree(e->rule.fields);
46 kfree(e);
47}
48
49static inline void audit_free_rule_rcu(struct rcu_head *head)
50{
51 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
52 audit_free_rule(e);
53}
54
55/* Unpack a filter field's string representation from user-space
56 * buffer. */
57static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
58{
59 char *str;
60
61 if (!*bufp || (len == 0) || (len > *remain))
62 return ERR_PTR(-EINVAL);
63
64 /* Of the currently implemented string fields, PATH_MAX
65 * defines the longest valid length.
66 */
67 if (len > PATH_MAX)
68 return ERR_PTR(-ENAMETOOLONG);
69
70 str = kmalloc(len + 1, GFP_KERNEL);
71 if (unlikely(!str))
72 return ERR_PTR(-ENOMEM);
73
74 memcpy(str, *bufp, len);
75 str[len] = 0;
76 *bufp += len;
77 *remain -= len;
78
79 return str;
80}
81
82/* Common user-space to kernel rule translation. */
83static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
84{
85 unsigned listnr;
86 struct audit_entry *entry;
87 struct audit_field *fields;
88 int i, err;
89
90 err = -EINVAL;
91 listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
92 switch(listnr) {
93 default:
94 goto exit_err;
95 case AUDIT_FILTER_USER:
96 case AUDIT_FILTER_TYPE:
97#ifdef CONFIG_AUDITSYSCALL
98 case AUDIT_FILTER_ENTRY:
99 case AUDIT_FILTER_EXIT:
100 case AUDIT_FILTER_TASK:
101#endif
102 ;
103 }
104 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
105 rule->action != AUDIT_ALWAYS)
106 goto exit_err;
107 if (rule->field_count > AUDIT_MAX_FIELDS)
108 goto exit_err;
109
110 err = -ENOMEM;
111 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
112 if (unlikely(!entry))
113 goto exit_err;
114 fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL);
115 if (unlikely(!fields)) {
116 kfree(entry);
117 goto exit_err;
118 }
119
120 memset(&entry->rule, 0, sizeof(struct audit_krule));
121 memset(fields, 0, sizeof(struct audit_field));
122
123 entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
124 entry->rule.listnr = listnr;
125 entry->rule.action = rule->action;
126 entry->rule.field_count = rule->field_count;
127 entry->rule.fields = fields;
128
129 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
130 entry->rule.mask[i] = rule->mask[i];
131
132 return entry;
133
134exit_err:
135 return ERR_PTR(err);
136}
137
138/* Translate struct audit_rule to kernel's rule representation.
139 * Exists for backward compatibility with userspace. */
140static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
141{
142 struct audit_entry *entry;
143 int err = 0;
144 int i;
145
146 entry = audit_to_entry_common(rule);
147 if (IS_ERR(entry))
148 goto exit_nofree;
149
150 for (i = 0; i < rule->field_count; i++) {
151 struct audit_field *f = &entry->rule.fields[i];
152
153 if (rule->fields[i] & AUDIT_UNUSED_BITS) {
154 err = -EINVAL;
155 goto exit_free;
156 }
157
158 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
159 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
160 f->val = rule->values[i];
161
162 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
163
164 /* Support for legacy operators where
165 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
166 if (f->op & AUDIT_NEGATE)
167 f->op = AUDIT_NOT_EQUAL;
168 else if (!f->op)
169 f->op = AUDIT_EQUAL;
170 else if (f->op == AUDIT_OPERATORS) {
171 err = -EINVAL;
172 goto exit_free;
173 }
174 }
175
176exit_nofree:
177 return entry;
178
179exit_free:
180 audit_free_rule(entry);
181 return ERR_PTR(err);
182}
183
184/* Translate struct audit_rule_data to kernel's rule representation. */
185static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
186 size_t datasz)
187{
188 int err = 0;
189 struct audit_entry *entry;
190 void *bufp;
191 /* size_t remain = datasz - sizeof(struct audit_rule_data); */
192 int i;
193
194 entry = audit_to_entry_common((struct audit_rule *)data);
195 if (IS_ERR(entry))
196 goto exit_nofree;
197
198 bufp = data->buf;
199 entry->rule.vers_ops = 2;
200 for (i = 0; i < data->field_count; i++) {
201 struct audit_field *f = &entry->rule.fields[i];
202
203 err = -EINVAL;
204 if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
205 data->fieldflags[i] & ~AUDIT_OPERATORS)
206 goto exit_free;
207
208 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
209 f->type = data->fields[i];
210 switch(f->type) {
211 /* call type-specific conversion routines here */
212 default:
213 f->val = data->values[i];
214 }
215 }
216
217exit_nofree:
218 return entry;
219
220exit_free:
221 audit_free_rule(entry);
222 return ERR_PTR(err);
223}
224
225/* Pack a filter field's string representation into data block. */
226static inline size_t audit_pack_string(void **bufp, char *str)
227{
228 size_t len = strlen(str);
229
230 memcpy(*bufp, str, len);
231 *bufp += len;
232
233 return len;
234}
235
236/* Translate kernel rule representation to struct audit_rule.
237 * Exists for backward compatibility with userspace. */
238static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
239{
240 struct audit_rule *rule;
241 int i;
242
243 rule = kmalloc(sizeof(*rule), GFP_KERNEL);
244 if (unlikely(!rule))
245 return ERR_PTR(-ENOMEM);
246 memset(rule, 0, sizeof(*rule));
247
248 rule->flags = krule->flags | krule->listnr;
249 rule->action = krule->action;
250 rule->field_count = krule->field_count;
251 for (i = 0; i < rule->field_count; i++) {
252 rule->values[i] = krule->fields[i].val;
253 rule->fields[i] = krule->fields[i].type;
254
255 if (krule->vers_ops == 1) {
256 if (krule->fields[i].op & AUDIT_NOT_EQUAL)
257 rule->fields[i] |= AUDIT_NEGATE;
258 } else {
259 rule->fields[i] |= krule->fields[i].op;
260 }
261 }
262 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
263
264 return rule;
265}
266
267/* Translate kernel rule representation to struct audit_rule_data. */
268static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
269{
270 struct audit_rule_data *data;
271 void *bufp;
272 int i;
273
274 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
275 if (unlikely(!data))
276 return ERR_PTR(-ENOMEM);
277 memset(data, 0, sizeof(*data));
278
279 data->flags = krule->flags | krule->listnr;
280 data->action = krule->action;
281 data->field_count = krule->field_count;
282 bufp = data->buf;
283 for (i = 0; i < data->field_count; i++) {
284 struct audit_field *f = &krule->fields[i];
285
286 data->fields[i] = f->type;
287 data->fieldflags[i] = f->op;
288 switch(f->type) {
289 /* call type-specific conversion routines here */
290 default:
291 data->values[i] = f->val;
292 }
293 }
294 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
295
296 return data;
297}
298
299/* Compare two rules in kernel format. Considered success if rules
300 * don't match. */
301static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
302{
303 int i;
304
305 if (a->flags != b->flags ||
306 a->listnr != b->listnr ||
307 a->action != b->action ||
308 a->field_count != b->field_count)
309 return 1;
310
311 for (i = 0; i < a->field_count; i++) {
312 if (a->fields[i].type != b->fields[i].type ||
313 a->fields[i].op != b->fields[i].op)
314 return 1;
315
316 switch(a->fields[i].type) {
317 /* call type-specific comparison routines here */
318 default:
319 if (a->fields[i].val != b->fields[i].val)
320 return 1;
321 }
322 }
323
324 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
325 if (a->mask[i] != b->mask[i])
326 return 1;
327
328 return 0;
329}
330
331/* Add rule to given filterlist if not a duplicate. Protected by
332 * audit_netlink_mutex. */
333static inline int audit_add_rule(struct audit_entry *entry,
334 struct list_head *list)
335{
336 struct audit_entry *e;
337
338 /* Do not use the _rcu iterator here, since this is the only
339 * addition routine. */
340 list_for_each_entry(e, list, list) {
341 if (!audit_compare_rule(&entry->rule, &e->rule))
342 return -EEXIST;
343 }
344
345 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
346 list_add_rcu(&entry->list, list);
347 } else {
348 list_add_tail_rcu(&entry->list, list);
349 }
350
351 return 0;
352}
353
354/* Remove an existing rule from filterlist. Protected by
355 * audit_netlink_mutex. */
356static inline int audit_del_rule(struct audit_entry *entry,
357 struct list_head *list)
358{
359 struct audit_entry *e;
360
361 /* Do not use the _rcu iterator here, since this is the only
362 * deletion routine. */
363 list_for_each_entry(e, list, list) {
364 if (!audit_compare_rule(&entry->rule, &e->rule)) {
365 list_del_rcu(&e->list);
366 call_rcu(&e->rcu, audit_free_rule_rcu);
367 return 0;
368 }
369 }
370 return -ENOENT; /* No matching rule */
371}
372
373/* List rules using struct audit_rule. Exists for backward
374 * compatibility with userspace. */
375static int audit_list(void *_dest)
376{
377 int pid, seq;
378 int *dest = _dest;
379 struct audit_entry *entry;
380 int i;
381
382 pid = dest[0];
383 seq = dest[1];
384 kfree(dest);
385
386 mutex_lock(&audit_netlink_mutex);
387
388 /* The *_rcu iterators not needed here because we are
389 always called with audit_netlink_mutex held. */
390 for (i=0; i<AUDIT_NR_FILTERS; i++) {
391 list_for_each_entry(entry, &audit_filter_list[i], list) {
392 struct audit_rule *rule;
393
394 rule = audit_krule_to_rule(&entry->rule);
395 if (unlikely(!rule))
396 break;
397 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
398 rule, sizeof(*rule));
399 kfree(rule);
400 }
401 }
402 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
403
404 mutex_unlock(&audit_netlink_mutex);
405 return 0;
406}
407
408/* List rules using struct audit_rule_data. */
409static int audit_list_rules(void *_dest)
410{
411 int pid, seq;
412 int *dest = _dest;
413 struct audit_entry *e;
414 int i;
415
416 pid = dest[0];
417 seq = dest[1];
418 kfree(dest);
419
420 mutex_lock(&audit_netlink_mutex);
421
422 /* The *_rcu iterators not needed here because we are
423 always called with audit_netlink_mutex held. */
424 for (i=0; i<AUDIT_NR_FILTERS; i++) {
425 list_for_each_entry(e, &audit_filter_list[i], list) {
426 struct audit_rule_data *data;
427
428 data = audit_krule_to_data(&e->rule);
429 if (unlikely(!data))
430 break;
431 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
432 data, sizeof(*data));
433 kfree(data);
434 }
435 }
436 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
437
438 mutex_unlock(&audit_netlink_mutex);
439 return 0;
440}
441
442/**
443 * audit_receive_filter - apply all rules to the specified message type
444 * @type: audit message type
445 * @pid: target pid for netlink audit messages
446 * @uid: target uid for netlink audit messages
447 * @seq: netlink audit message sequence (serial) number
448 * @data: payload data
449 * @datasz: size of payload data
450 * @loginuid: loginuid of sender
451 */
452int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
453 size_t datasz, uid_t loginuid)
454{
455 struct task_struct *tsk;
456 int *dest;
457 int err = 0;
458 struct audit_entry *entry;
459
460 switch (type) {
461 case AUDIT_LIST:
462 case AUDIT_LIST_RULES:
463 /* We can't just spew out the rules here because we might fill
464 * the available socket buffer space and deadlock waiting for
465 * auditctl to read from it... which isn't ever going to
466 * happen if we're actually running in the context of auditctl
467 * trying to _send_ the stuff */
468
469 dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
470 if (!dest)
471 return -ENOMEM;
472 dest[0] = pid;
473 dest[1] = seq;
474
475 if (type == AUDIT_LIST)
476 tsk = kthread_run(audit_list, dest, "audit_list");
477 else
478 tsk = kthread_run(audit_list_rules, dest,
479 "audit_list_rules");
480 if (IS_ERR(tsk)) {
481 kfree(dest);
482 err = PTR_ERR(tsk);
483 }
484 break;
485 case AUDIT_ADD:
486 case AUDIT_ADD_RULE:
487 if (type == AUDIT_ADD)
488 entry = audit_rule_to_entry(data);
489 else
490 entry = audit_data_to_entry(data, datasz);
491 if (IS_ERR(entry))
492 return PTR_ERR(entry);
493
494 err = audit_add_rule(entry,
495 &audit_filter_list[entry->rule.listnr]);
496 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
497 "auid=%u add rule to list=%d res=%d\n",
498 loginuid, entry->rule.listnr, !err);
499
500 if (err)
501 audit_free_rule(entry);
502 break;
503 case AUDIT_DEL:
504 case AUDIT_DEL_RULE:
505 if (type == AUDIT_DEL)
506 entry = audit_rule_to_entry(data);
507 else
508 entry = audit_data_to_entry(data, datasz);
509 if (IS_ERR(entry))
510 return PTR_ERR(entry);
511
512 err = audit_del_rule(entry,
513 &audit_filter_list[entry->rule.listnr]);
514 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
515 "auid=%u remove rule from list=%d res=%d\n",
516 loginuid, entry->rule.listnr, !err);
517
518 audit_free_rule(entry);
519 break;
520 default:
521 return -EINVAL;
522 }
523
524 return err;
525}
526
527int audit_comparator(const u32 left, const u32 op, const u32 right)
528{
529 switch (op) {
530 case AUDIT_EQUAL:
531 return (left == right);
532 case AUDIT_NOT_EQUAL:
533 return (left != right);
534 case AUDIT_LESS_THAN:
535 return (left < right);
536 case AUDIT_LESS_THAN_OR_EQUAL:
537 return (left <= right);
538 case AUDIT_GREATER_THAN:
539 return (left > right);
540 case AUDIT_GREATER_THAN_OR_EQUAL:
541 return (left >= right);
542 }
543 BUG();
544 return 0;
545}
546
547
548
549static int audit_filter_user_rules(struct netlink_skb_parms *cb,
550 struct audit_krule *rule,
551 enum audit_state *state)
552{
553 int i;
554
555 for (i = 0; i < rule->field_count; i++) {
556 struct audit_field *f = &rule->fields[i];
557 int result = 0;
558
559 switch (f->type) {
560 case AUDIT_PID:
561 result = audit_comparator(cb->creds.pid, f->op, f->val);
562 break;
563 case AUDIT_UID:
564 result = audit_comparator(cb->creds.uid, f->op, f->val);
565 break;
566 case AUDIT_GID:
567 result = audit_comparator(cb->creds.gid, f->op, f->val);
568 break;
569 case AUDIT_LOGINUID:
570 result = audit_comparator(cb->loginuid, f->op, f->val);
571 break;
572 }
573
574 if (!result)
575 return 0;
576 }
577 switch (rule->action) {
578 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
579 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
580 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
581 }
582 return 1;
583}
584
585int audit_filter_user(struct netlink_skb_parms *cb, int type)
586{
587 struct audit_entry *e;
588 enum audit_state state;
589 int ret = 1;
590
591 rcu_read_lock();
592 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
593 if (audit_filter_user_rules(cb, &e->rule, &state)) {
594 if (state == AUDIT_DISABLED)
595 ret = 0;
596 break;
597 }
598 }
599 rcu_read_unlock();
600
601 return ret; /* Audit by default */
602}
603
604int audit_filter_type(int type)
605{
606 struct audit_entry *e;
607 int result = 0;
608
609 rcu_read_lock();
610 if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
611 goto unlock_and_return;
612
613 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
614 list) {
615 int i;
616 for (i = 0; i < e->rule.field_count; i++) {
617 struct audit_field *f = &e->rule.fields[i];
618 if (f->type == AUDIT_MSGTYPE) {
619 result = audit_comparator(type, f->op, f->val);
620 if (!result)
621 break;
622 }
623 }
624 if (result)
625 goto unlock_and_return;
626 }
627unlock_and_return:
628 rcu_read_unlock();
629 return result;
630}
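
audit_data_to_entry() and audit_list_rules() above speak the new variable-length struct audit_rule_data format carried by AUDIT_ADD_RULE, AUDIT_DEL_RULE and AUDIT_LIST_RULES netlink messages. As a hedged userspace-side sketch (the struct layout comes from include/linux/audit.h; netlink framing and the NETLINK_AUDIT socket handling, as done by auditctl, are omitted), a rule meaning "always audit at syscall exit when loginuid >= 500" could be packed like this:

#include <linux/audit.h>
#include <stdlib.h>

/* Illustrative only: build the payload of an AUDIT_ADD_RULE request.
 * The caller wraps it in an nlmsghdr and sends it on a NETLINK_AUDIT
 * socket; the payload length is sizeof(*d) + d->buflen. */
static struct audit_rule_data *example_pack_rule(void)
{
	struct audit_rule_data *d = calloc(1, sizeof(*d));
	int i;

	if (!d)
		return NULL;

	d->flags  = AUDIT_FILTER_EXIT;		/* which filter list */
	d->action = AUDIT_ALWAYS;

	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
		d->mask[i] = ~0U;		/* apply to every syscall */

	d->field_count   = 1;
	d->fields[0]     = AUDIT_LOGINUID;
	d->fieldflags[0] = AUDIT_GREATER_THAN_OR_EQUAL;
	d->values[0]     = 500;
	/* no string-valued fields, so buflen stays 0 and buf[] is empty */

	return d;
}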
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7e7e637b9..7f160df21a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,8 @@
2 * Handles all system-call specific auditing features. 2 * Handles all system-call specific auditing features.
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation
5 * All Rights Reserved. 7 * All Rights Reserved.
6 * 8 *
7 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -27,11 +29,22 @@
27 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
28 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
29 * 31 *
32 * The support of additional filter rules compares (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 *
35 * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
36 * filesystem information.
37 *
38 * Subject and object context labeling support added by <danjones@us.ibm.com>
39 * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
30 */ 40 */
31 41
32#include <linux/init.h> 42#include <linux/init.h>
33#include <asm/types.h> 43#include <asm/types.h>
34#include <asm/atomic.h> 44#include <asm/atomic.h>
45#include <asm/types.h>
46#include <linux/fs.h>
47#include <linux/namei.h>
35#include <linux/mm.h> 48#include <linux/mm.h>
36#include <linux/module.h> 49#include <linux/module.h>
37#include <linux/mount.h> 50#include <linux/mount.h>
@@ -39,16 +52,16 @@
39#include <linux/audit.h> 52#include <linux/audit.h>
40#include <linux/personality.h> 53#include <linux/personality.h>
41#include <linux/time.h> 54#include <linux/time.h>
42#include <linux/kthread.h>
43#include <linux/netlink.h> 55#include <linux/netlink.h>
44#include <linux/compiler.h> 56#include <linux/compiler.h>
45#include <asm/unistd.h> 57#include <asm/unistd.h>
58#include <linux/security.h>
59#include <linux/list.h>
60#include <linux/tty.h>
61
62#include "audit.h"
46 63
47/* 0 = no checking 64extern struct list_head audit_filter_list[];
48 1 = put_count checking
49 2 = verbose put_count checking
50*/
51#define AUDIT_DEBUG 0
52 65
53/* No syscall auditing will take place unless audit_enabled != 0. */ 66/* No syscall auditing will take place unless audit_enabled != 0. */
54extern int audit_enabled; 67extern int audit_enabled;
@@ -62,29 +75,6 @@ extern int audit_enabled;
62 * path_lookup. */ 75 * path_lookup. */
63#define AUDIT_NAMES_RESERVED 7 76#define AUDIT_NAMES_RESERVED 7
64 77
65/* At task start time, the audit_state is set in the audit_context using
66 a per-task filter. At syscall entry, the audit_state is augmented by
67 the syscall filter. */
68enum audit_state {
69 AUDIT_DISABLED, /* Do not create per-task audit_context.
70 * No syscall-specific audit records can
71 * be generated. */
72 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
73 * but don't necessarily fill it in at
74 * syscall entry time (i.e., filter
75 * instead). */
76 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
77 * and always fill it in at syscall
78 * entry time. This makes a full
79 * syscall record available if some
80 * other part of the kernel decides it
81 * should be recorded. */
82 AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
83 * always fill it in at syscall entry
84 * time, and always write out the audit
85 * record at syscall exit time. */
86};
87
88/* When fs/namei.c:getname() is called, we store the pointer in name and 78/* When fs/namei.c:getname() is called, we store the pointer in name and
89 * we don't let putname() free it (instead we free all of the saved 79 * we don't let putname() free it (instead we free all of the saved
90 * pointers at syscall exit time). 80 * pointers at syscall exit time).
@@ -93,12 +83,13 @@ enum audit_state {
93struct audit_names { 83struct audit_names {
94 const char *name; 84 const char *name;
95 unsigned long ino; 85 unsigned long ino;
86 unsigned long pino;
96 dev_t dev; 87 dev_t dev;
97 umode_t mode; 88 umode_t mode;
98 uid_t uid; 89 uid_t uid;
99 gid_t gid; 90 gid_t gid;
100 dev_t rdev; 91 dev_t rdev;
101 unsigned flags; 92 char *ctx;
102}; 93};
103 94
104struct audit_aux_data { 95struct audit_aux_data {
@@ -115,6 +106,7 @@ struct audit_aux_data_ipcctl {
115 uid_t uid; 106 uid_t uid;
116 gid_t gid; 107 gid_t gid;
117 mode_t mode; 108 mode_t mode;
109 char *ctx;
118}; 110};
119 111
120struct audit_aux_data_socketcall { 112struct audit_aux_data_socketcall {
@@ -167,290 +159,72 @@ struct audit_context {
167#endif 159#endif
168}; 160};
169 161
170 /* Public API */
171/* There are three lists of rules -- one to search at task creation
172 * time, one to search at syscall entry time, and another to search at
173 * syscall exit time. */
174static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
175 LIST_HEAD_INIT(audit_filter_list[0]),
176 LIST_HEAD_INIT(audit_filter_list[1]),
177 LIST_HEAD_INIT(audit_filter_list[2]),
178 LIST_HEAD_INIT(audit_filter_list[3]),
179 LIST_HEAD_INIT(audit_filter_list[4]),
180#if AUDIT_NR_FILTERS != 5
181#error Fix audit_filter_list initialiser
182#endif
183};
184
185struct audit_entry {
186 struct list_head list;
187 struct rcu_head rcu;
188 struct audit_rule rule;
189};
190
191extern int audit_pid;
192
193/* Copy rule from user-space to kernel-space. Called from
194 * audit_add_rule during AUDIT_ADD. */
195static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
196{
197 int i;
198
199 if (s->action != AUDIT_NEVER
200 && s->action != AUDIT_POSSIBLE
201 && s->action != AUDIT_ALWAYS)
202 return -1;
203 if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
204 return -1;
205 if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
206 return -1;
207
208 d->flags = s->flags;
209 d->action = s->action;
210 d->field_count = s->field_count;
211 for (i = 0; i < d->field_count; i++) {
212 d->fields[i] = s->fields[i];
213 d->values[i] = s->values[i];
214 }
215 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
216 return 0;
217}
218
219/* Check to see if two rules are identical. It is called from
220 * audit_add_rule during AUDIT_ADD and
221 * audit_del_rule during AUDIT_DEL. */
222static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
223{
224 int i;
225
226 if (a->flags != b->flags)
227 return 1;
228
229 if (a->action != b->action)
230 return 1;
231
232 if (a->field_count != b->field_count)
233 return 1;
234
235 for (i = 0; i < a->field_count; i++) {
236 if (a->fields[i] != b->fields[i]
237 || a->values[i] != b->values[i])
238 return 1;
239 }
240
241 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
242 if (a->mask[i] != b->mask[i])
243 return 1;
244
245 return 0;
246}
247
248/* Note that audit_add_rule and audit_del_rule are called via
249 * audit_receive() in audit.c, and are protected by
250 * audit_netlink_sem. */
251static inline int audit_add_rule(struct audit_rule *rule,
252 struct list_head *list)
253{
254 struct audit_entry *entry;
255
256 /* Do not use the _rcu iterator here, since this is the only
257 * addition routine. */
258 list_for_each_entry(entry, list, list) {
259 if (!audit_compare_rule(rule, &entry->rule)) {
260 return -EEXIST;
261 }
262 }
263
264 if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
265 return -ENOMEM;
266 if (audit_copy_rule(&entry->rule, rule)) {
267 kfree(entry);
268 return -EINVAL;
269 }
270
271 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
272 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
273 list_add_rcu(&entry->list, list);
274 } else {
275 list_add_tail_rcu(&entry->list, list);
276 }
277
278 return 0;
279}
280
281static inline void audit_free_rule(struct rcu_head *head)
282{
283 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
284 kfree(e);
285}
286
287/* Note that audit_add_rule and audit_del_rule are called via
288 * audit_receive() in audit.c, and are protected by
289 * audit_netlink_sem. */
290static inline int audit_del_rule(struct audit_rule *rule,
291 struct list_head *list)
292{
293 struct audit_entry *e;
294
295 /* Do not use the _rcu iterator here, since this is the only
296 * deletion routine. */
297 list_for_each_entry(e, list, list) {
298 if (!audit_compare_rule(rule, &e->rule)) {
299 list_del_rcu(&e->list);
300 call_rcu(&e->rcu, audit_free_rule);
301 return 0;
302 }
303 }
304 return -ENOENT; /* No matching rule */
305}
306
307static int audit_list_rules(void *_dest)
308{
309 int pid, seq;
310 int *dest = _dest;
311 struct audit_entry *entry;
312 int i;
313
314 pid = dest[0];
315 seq = dest[1];
316 kfree(dest);
317
318 down(&audit_netlink_sem);
319
320 /* The *_rcu iterators not needed here because we are
321 always called with audit_netlink_sem held. */
322 for (i=0; i<AUDIT_NR_FILTERS; i++) {
323 list_for_each_entry(entry, &audit_filter_list[i], list)
324 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
325 &entry->rule, sizeof(entry->rule));
326 }
327 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
328
329 up(&audit_netlink_sem);
330 return 0;
331}
332
333int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
334 uid_t loginuid)
335{
336 struct task_struct *tsk;
337 int *dest;
338 int err = 0;
339 unsigned listnr;
340
341 switch (type) {
342 case AUDIT_LIST:
343 /* We can't just spew out the rules here because we might fill
344 * the available socket buffer space and deadlock waiting for
345 * auditctl to read from it... which isn't ever going to
346 * happen if we're actually running in the context of auditctl
347 * trying to _send_ the stuff */
348
349 dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
350 if (!dest)
351 return -ENOMEM;
352 dest[0] = pid;
353 dest[1] = seq;
354
355 tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
356 if (IS_ERR(tsk)) {
357 kfree(dest);
358 err = PTR_ERR(tsk);
359 }
360 break;
361 case AUDIT_ADD:
362 listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
363 if (listnr >= AUDIT_NR_FILTERS)
364 return -EINVAL;
365
366 err = audit_add_rule(data, &audit_filter_list[listnr]);
367 if (!err)
368 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
369 "auid=%u added an audit rule\n", loginuid);
370 break;
371 case AUDIT_DEL:
372 listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
373 if (listnr >= AUDIT_NR_FILTERS)
374 return -EINVAL;
375
376 err = audit_del_rule(data, &audit_filter_list[listnr]);
377 if (!err)
378 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
379 "auid=%u removed an audit rule\n", loginuid);
380 break;
381 default:
382 return -EINVAL;
383 }
384
385 return err;
386}
387 162
388/* Compare a task_struct with an audit_rule. Return 1 on match, 0 163/* Compare a task_struct with an audit_rule. Return 1 on match, 0
389 * otherwise. */ 164 * otherwise. */
390static int audit_filter_rules(struct task_struct *tsk, 165static int audit_filter_rules(struct task_struct *tsk,
391 struct audit_rule *rule, 166 struct audit_krule *rule,
392 struct audit_context *ctx, 167 struct audit_context *ctx,
393 enum audit_state *state) 168 enum audit_state *state)
394{ 169{
395 int i, j; 170 int i, j;
396 171
397 for (i = 0; i < rule->field_count; i++) { 172 for (i = 0; i < rule->field_count; i++) {
398 u32 field = rule->fields[i] & ~AUDIT_NEGATE; 173 struct audit_field *f = &rule->fields[i];
399 u32 value = rule->values[i];
400 int result = 0; 174 int result = 0;
401 175
402 switch (field) { 176 switch (f->type) {
403 case AUDIT_PID: 177 case AUDIT_PID:
404 result = (tsk->pid == value); 178 result = audit_comparator(tsk->pid, f->op, f->val);
405 break; 179 break;
406 case AUDIT_UID: 180 case AUDIT_UID:
407 result = (tsk->uid == value); 181 result = audit_comparator(tsk->uid, f->op, f->val);
408 break; 182 break;
409 case AUDIT_EUID: 183 case AUDIT_EUID:
410 result = (tsk->euid == value); 184 result = audit_comparator(tsk->euid, f->op, f->val);
411 break; 185 break;
412 case AUDIT_SUID: 186 case AUDIT_SUID:
413 result = (tsk->suid == value); 187 result = audit_comparator(tsk->suid, f->op, f->val);
414 break; 188 break;
415 case AUDIT_FSUID: 189 case AUDIT_FSUID:
416 result = (tsk->fsuid == value); 190 result = audit_comparator(tsk->fsuid, f->op, f->val);
417 break; 191 break;
418 case AUDIT_GID: 192 case AUDIT_GID:
419 result = (tsk->gid == value); 193 result = audit_comparator(tsk->gid, f->op, f->val);
420 break; 194 break;
421 case AUDIT_EGID: 195 case AUDIT_EGID:
422 result = (tsk->egid == value); 196 result = audit_comparator(tsk->egid, f->op, f->val);
423 break; 197 break;
424 case AUDIT_SGID: 198 case AUDIT_SGID:
425 result = (tsk->sgid == value); 199 result = audit_comparator(tsk->sgid, f->op, f->val);
426 break; 200 break;
427 case AUDIT_FSGID: 201 case AUDIT_FSGID:
428 result = (tsk->fsgid == value); 202 result = audit_comparator(tsk->fsgid, f->op, f->val);
429 break; 203 break;
430 case AUDIT_PERS: 204 case AUDIT_PERS:
431 result = (tsk->personality == value); 205 result = audit_comparator(tsk->personality, f->op, f->val);
432 break; 206 break;
433 case AUDIT_ARCH: 207 case AUDIT_ARCH:
434 if (ctx) 208 if (ctx)
435 result = (ctx->arch == value); 209 result = audit_comparator(ctx->arch, f->op, f->val);
436 break; 210 break;
437 211
438 case AUDIT_EXIT: 212 case AUDIT_EXIT:
439 if (ctx && ctx->return_valid) 213 if (ctx && ctx->return_valid)
440 result = (ctx->return_code == value); 214 result = audit_comparator(ctx->return_code, f->op, f->val);
441 break; 215 break;
442 case AUDIT_SUCCESS: 216 case AUDIT_SUCCESS:
443 if (ctx && ctx->return_valid) { 217 if (ctx && ctx->return_valid) {
444 if (value) 218 if (f->val)
445 result = (ctx->return_valid == AUDITSC_SUCCESS); 219 result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
446 else 220 else
447 result = (ctx->return_valid == AUDITSC_FAILURE); 221 result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
448 } 222 }
449 break; 223 break;
450 case AUDIT_DEVMAJOR: 224 case AUDIT_DEVMAJOR:
451 if (ctx) { 225 if (ctx) {
452 for (j = 0; j < ctx->name_count; j++) { 226 for (j = 0; j < ctx->name_count; j++) {
453 if (MAJOR(ctx->names[j].dev)==value) { 227 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
454 ++result; 228 ++result;
455 break; 229 break;
456 } 230 }
@@ -460,7 +234,7 @@ static int audit_filter_rules(struct task_struct *tsk,
460 case AUDIT_DEVMINOR: 234 case AUDIT_DEVMINOR:
461 if (ctx) { 235 if (ctx) {
462 for (j = 0; j < ctx->name_count; j++) { 236 for (j = 0; j < ctx->name_count; j++) {
463 if (MINOR(ctx->names[j].dev)==value) { 237 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
464 ++result; 238 ++result;
465 break; 239 break;
466 } 240 }
@@ -470,7 +244,8 @@ static int audit_filter_rules(struct task_struct *tsk,
470 case AUDIT_INODE: 244 case AUDIT_INODE:
471 if (ctx) { 245 if (ctx) {
472 for (j = 0; j < ctx->name_count; j++) { 246 for (j = 0; j < ctx->name_count; j++) {
473 if (ctx->names[j].ino == value) { 247 if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
248 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
474 ++result; 249 ++result;
475 break; 250 break;
476 } 251 }
@@ -480,19 +255,17 @@ static int audit_filter_rules(struct task_struct *tsk,
480 case AUDIT_LOGINUID: 255 case AUDIT_LOGINUID:
481 result = 0; 256 result = 0;
482 if (ctx) 257 if (ctx)
483 result = (ctx->loginuid == value); 258 result = audit_comparator(ctx->loginuid, f->op, f->val);
484 break; 259 break;
485 case AUDIT_ARG0: 260 case AUDIT_ARG0:
486 case AUDIT_ARG1: 261 case AUDIT_ARG1:
487 case AUDIT_ARG2: 262 case AUDIT_ARG2:
488 case AUDIT_ARG3: 263 case AUDIT_ARG3:
489 if (ctx) 264 if (ctx)
490 result = (ctx->argv[field-AUDIT_ARG0]==value); 265 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
491 break; 266 break;
492 } 267 }
493 268
494 if (rule->fields[i] & AUDIT_NEGATE)
495 result = !result;
496 if (!result) 269 if (!result)
497 return 0; 270 return 0;
498 } 271 }
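
The hunk above replaces the open-coded equality tests with audit_comparator(value, f->op, f->val), so each rule field now carries its own comparison operator instead of relying on AUDIT_NEGATE alone. The helper itself lives in the new kernel/auditfilter.c and is not shown here; the sketch below only illustrates the shape such a comparator takes, with invented operator names (EX_EQ, EX_NE, ...).

/* Illustrative sketch only; operator names are invented for the example,
 * the real constants and audit_comparator() live in kernel/auditfilter.c. */
enum { EX_EQ, EX_NE, EX_LT, EX_LE, EX_GT, EX_GE };

static int example_comparator(unsigned int left, unsigned int op,
			      unsigned int right)
{
	switch (op) {
	case EX_EQ:	return left == right;
	case EX_NE:	return left != right;
	case EX_LT:	return left < right;
	case EX_LE:	return left <= right;
	case EX_GT:	return left > right;
	case EX_GE:	return left >= right;
	}
	return 0;	/* unknown operator: treat as no match */
}
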
@@ -527,7 +300,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
527/* At syscall entry and exit time, this filter is called if the 300/* At syscall entry and exit time, this filter is called if the
528 * audit_state is not low enough that auditing cannot take place, but is 301 * audit_state is not low enough that auditing cannot take place, but is
529 * also not high enough that we already know we have to write an audit 302 * also not high enough that we already know we have to write an audit
530 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 303 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
531 */ 304 */
532static enum audit_state audit_filter_syscall(struct task_struct *tsk, 305static enum audit_state audit_filter_syscall(struct task_struct *tsk,
533 struct audit_context *ctx, 306 struct audit_context *ctx,
@@ -541,77 +314,19 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
541 314
542 rcu_read_lock(); 315 rcu_read_lock();
543 if (!list_empty(list)) { 316 if (!list_empty(list)) {
544 int word = AUDIT_WORD(ctx->major); 317 int word = AUDIT_WORD(ctx->major);
545 int bit = AUDIT_BIT(ctx->major); 318 int bit = AUDIT_BIT(ctx->major);
546 319
547 list_for_each_entry_rcu(e, list, list) { 320 list_for_each_entry_rcu(e, list, list) {
548 if ((e->rule.mask[word] & bit) == bit 321 if ((e->rule.mask[word] & bit) == bit
549 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 322 && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
550 rcu_read_unlock(); 323 rcu_read_unlock();
551 return state; 324 return state;
552 } 325 }
553 }
554 }
555 rcu_read_unlock();
556 return AUDIT_BUILD_CONTEXT;
557}
558
559static int audit_filter_user_rules(struct netlink_skb_parms *cb,
560 struct audit_rule *rule,
561 enum audit_state *state)
562{
563 int i;
564
565 for (i = 0; i < rule->field_count; i++) {
566 u32 field = rule->fields[i] & ~AUDIT_NEGATE;
567 u32 value = rule->values[i];
568 int result = 0;
569
570 switch (field) {
571 case AUDIT_PID:
572 result = (cb->creds.pid == value);
573 break;
574 case AUDIT_UID:
575 result = (cb->creds.uid == value);
576 break;
577 case AUDIT_GID:
578 result = (cb->creds.gid == value);
579 break;
580 case AUDIT_LOGINUID:
581 result = (cb->loginuid == value);
582 break;
583 }
584
585 if (rule->fields[i] & AUDIT_NEGATE)
586 result = !result;
587 if (!result)
588 return 0;
589 }
590 switch (rule->action) {
591 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
592 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
593 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
594 }
595 return 1;
596}
597
598int audit_filter_user(struct netlink_skb_parms *cb, int type)
599{
600 struct audit_entry *e;
601 enum audit_state state;
602 int ret = 1;
603
604 rcu_read_lock();
605 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
606 if (audit_filter_user_rules(cb, &e->rule, &state)) {
607 if (state == AUDIT_DISABLED)
608 ret = 0;
609 break;
610 } 326 }
611 } 327 }
612 rcu_read_unlock(); 328 rcu_read_unlock();
613 329 return AUDIT_BUILD_CONTEXT;
614 return ret; /* Audit by default */
615} 330}
616 331
617/* This should be called with task_lock() held. */ 332/* This should be called with task_lock() held. */
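
audit_filter_syscall() above only walks a rule's fields after a cheap bitmask test: the rule's mask[] has one bit per syscall, indexed by AUDIT_WORD()/AUDIT_BIT(). Below is a small user-space illustration of that arithmetic, assuming the usual semantics (word = nr / 32, bit = 1 << (nr % 32)); the authoritative macros are in include/linux/audit.h.

#include <stdio.h>

/* Stand-ins for AUDIT_WORD()/AUDIT_BIT(); assumed semantics only. */
#define EX_WORD(nr)	((nr) / 32)
#define EX_BIT(nr)	(1U << ((nr) % 32))

int main(void)
{
	unsigned int mask[64] = { 0 };	/* enough 32-bit words for the syscall table */
	int nr = 39;			/* an arbitrary syscall number */

	mask[EX_WORD(nr)] |= EX_BIT(nr);	/* the rule covers syscall 39 */

	/* the same test the filter performs per rule */
	if ((mask[EX_WORD(nr)] & EX_BIT(nr)) == EX_BIT(nr))
		printf("rule applies to syscall %d\n", nr);
	return 0;
}
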
@@ -654,17 +369,18 @@ static inline void audit_free_names(struct audit_context *context)
654#if AUDIT_DEBUG == 2 369#if AUDIT_DEBUG == 2
655 if (context->auditable 370 if (context->auditable
656 ||context->put_count + context->ino_count != context->name_count) { 371 ||context->put_count + context->ino_count != context->name_count) {
657 printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" 372 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
658 " name_count=%d put_count=%d" 373 " name_count=%d put_count=%d"
659 " ino_count=%d [NOT freeing]\n", 374 " ino_count=%d [NOT freeing]\n",
660 __LINE__, 375 __FILE__, __LINE__,
661 context->serial, context->major, context->in_syscall, 376 context->serial, context->major, context->in_syscall,
662 context->name_count, context->put_count, 377 context->name_count, context->put_count,
663 context->ino_count); 378 context->ino_count);
664 for (i = 0; i < context->name_count; i++) 379 for (i = 0; i < context->name_count; i++) {
665 printk(KERN_ERR "names[%d] = %p = %s\n", i, 380 printk(KERN_ERR "names[%d] = %p = %s\n", i,
666 context->names[i].name, 381 context->names[i].name,
667 context->names[i].name); 382 context->names[i].name ?: "(null)");
383 }
668 dump_stack(); 384 dump_stack();
669 return; 385 return;
670 } 386 }
@@ -674,9 +390,13 @@ static inline void audit_free_names(struct audit_context *context)
674 context->ino_count = 0; 390 context->ino_count = 0;
675#endif 391#endif
676 392
677 for (i = 0; i < context->name_count; i++) 393 for (i = 0; i < context->name_count; i++) {
394 char *p = context->names[i].ctx;
395 context->names[i].ctx = NULL;
396 kfree(p);
678 if (context->names[i].name) 397 if (context->names[i].name)
679 __putname(context->names[i].name); 398 __putname(context->names[i].name);
399 }
680 context->name_count = 0; 400 context->name_count = 0;
681 if (context->pwd) 401 if (context->pwd)
682 dput(context->pwd); 402 dput(context->pwd);
@@ -696,6 +416,12 @@ static inline void audit_free_aux(struct audit_context *context)
696 dput(axi->dentry); 416 dput(axi->dentry);
697 mntput(axi->mnt); 417 mntput(axi->mnt);
698 } 418 }
419 if ( aux->type == AUDIT_IPC ) {
420 struct audit_aux_data_ipcctl *axi = (void *)aux;
421 if (axi->ctx)
422 kfree(axi->ctx);
423 }
424
699 context->aux = aux->next; 425 context->aux = aux->next;
700 kfree(aux); 426 kfree(aux);
701 } 427 }
@@ -721,10 +447,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
721 return context; 447 return context;
722} 448}
723 449
724/* Filter on the task information and allocate a per-task audit context 450/**
451 * audit_alloc - allocate an audit context block for a task
452 * @tsk: task
453 *
454 * Filter on the task information and allocate a per-task audit context
725 * if necessary. Doing so turns on system call auditing for the 455 * if necessary. Doing so turns on system call auditing for the
726 * specified task. This is called from copy_process, so no lock is 456 * specified task. This is called from copy_process, so no lock is
727 * needed. */ 457 * needed.
458 */
728int audit_alloc(struct task_struct *tsk) 459int audit_alloc(struct task_struct *tsk)
729{ 460{
730 struct audit_context *context; 461 struct audit_context *context;
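
The new kernel-doc above notes that audit_alloc() is invoked from copy_process() with no locking required. The actual call site in kernel/fork.c is not part of this diff; the fragment below is only a hedged sketch of what such a caller looks like.

/* Hypothetical call-site sketch, not the real kernel/fork.c code. */
static int example_copy_process_audit(struct task_struct *p)
{
	int retval;

	retval = audit_alloc(p);	/* may attach p->audit_context */
	if (retval)
		return retval;		/* typically -ENOMEM */

	/* ... continue setting up the child task ... */
	return 0;
}
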
@@ -775,7 +506,37 @@ static inline void audit_free_context(struct audit_context *context)
775 printk(KERN_ERR "audit: freed %d contexts\n", count); 506 printk(KERN_ERR "audit: freed %d contexts\n", count);
776} 507}
777 508
778static void audit_log_task_info(struct audit_buffer *ab) 509static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
510{
511 char *ctx = NULL;
512 ssize_t len = 0;
513
514 len = security_getprocattr(current, "current", NULL, 0);
515 if (len < 0) {
516 if (len != -EINVAL)
517 goto error_path;
518 return;
519 }
520
521 ctx = kmalloc(len, gfp_mask);
522 if (!ctx)
523 goto error_path;
524
525 len = security_getprocattr(current, "current", ctx, len);
526 if (len < 0 )
527 goto error_path;
528
529 audit_log_format(ab, " subj=%s", ctx);
530 return;
531
532error_path:
533 if (ctx)
534 kfree(ctx);
535 audit_panic("error in audit_log_task_context");
536 return;
537}
538
539static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask)
779{ 540{
780 char name[sizeof(current->comm)]; 541 char name[sizeof(current->comm)];
781 struct mm_struct *mm = current->mm; 542 struct mm_struct *mm = current->mm;
@@ -788,6 +549,10 @@ static void audit_log_task_info(struct audit_buffer *ab)
788 if (!mm) 549 if (!mm)
789 return; 550 return;
790 551
552 /*
553 * this is brittle; all callers that pass GFP_ATOMIC will have
554 * NULL current->mm and we won't get here.
555 */
791 down_read(&mm->mmap_sem); 556 down_read(&mm->mmap_sem);
792 vma = mm->mmap; 557 vma = mm->mmap;
793 while (vma) { 558 while (vma) {
@@ -801,6 +566,7 @@ static void audit_log_task_info(struct audit_buffer *ab)
801 vma = vma->vm_next; 566 vma = vma->vm_next;
802 } 567 }
803 up_read(&mm->mmap_sem); 568 up_read(&mm->mmap_sem);
569 audit_log_task_context(ab, gfp_mask);
804} 570}
805 571
806static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) 572static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
@@ -808,6 +574,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
808 int i; 574 int i;
809 struct audit_buffer *ab; 575 struct audit_buffer *ab;
810 struct audit_aux_data *aux; 576 struct audit_aux_data *aux;
577 const char *tty;
811 578
812 ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); 579 ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL);
813 if (!ab) 580 if (!ab)
@@ -820,11 +587,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
820 audit_log_format(ab, " success=%s exit=%ld", 587 audit_log_format(ab, " success=%s exit=%ld",
821 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 588 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
822 context->return_code); 589 context->return_code);
590 if (current->signal->tty && current->signal->tty->name)
591 tty = current->signal->tty->name;
592 else
593 tty = "(none)";
823 audit_log_format(ab, 594 audit_log_format(ab,
824 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 595 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
825 " pid=%d auid=%u uid=%u gid=%u" 596 " pid=%d auid=%u uid=%u gid=%u"
826 " euid=%u suid=%u fsuid=%u" 597 " euid=%u suid=%u fsuid=%u"
827 " egid=%u sgid=%u fsgid=%u", 598 " egid=%u sgid=%u fsgid=%u tty=%s",
828 context->argv[0], 599 context->argv[0],
829 context->argv[1], 600 context->argv[1],
830 context->argv[2], 601 context->argv[2],
@@ -835,8 +606,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
835 context->uid, 606 context->uid,
836 context->gid, 607 context->gid,
837 context->euid, context->suid, context->fsuid, 608 context->euid, context->suid, context->fsuid,
838 context->egid, context->sgid, context->fsgid); 609 context->egid, context->sgid, context->fsgid, tty);
839 audit_log_task_info(ab); 610 audit_log_task_info(ab, gfp_mask);
840 audit_log_end(ab); 611 audit_log_end(ab);
841 612
842 for (aux = context->aux; aux; aux = aux->next) { 613 for (aux = context->aux; aux; aux = aux->next) {
@@ -849,8 +620,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
849 case AUDIT_IPC: { 620 case AUDIT_IPC: {
850 struct audit_aux_data_ipcctl *axi = (void *)aux; 621 struct audit_aux_data_ipcctl *axi = (void *)aux;
851 audit_log_format(ab, 622 audit_log_format(ab,
852 " qbytes=%lx iuid=%u igid=%u mode=%x", 623 " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s",
853 axi->qbytes, axi->uid, axi->gid, axi->mode); 624 axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx);
854 break; } 625 break; }
855 626
856 case AUDIT_SOCKETCALL: { 627 case AUDIT_SOCKETCALL: {
@@ -885,42 +656,62 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
885 } 656 }
886 } 657 }
887 for (i = 0; i < context->name_count; i++) { 658 for (i = 0; i < context->name_count; i++) {
659 unsigned long ino = context->names[i].ino;
660 unsigned long pino = context->names[i].pino;
661
888 ab = audit_log_start(context, gfp_mask, AUDIT_PATH); 662 ab = audit_log_start(context, gfp_mask, AUDIT_PATH);
889 if (!ab) 663 if (!ab)
890 continue; /* audit_panic has been called */ 664 continue; /* audit_panic has been called */
891 665
892 audit_log_format(ab, "item=%d", i); 666 audit_log_format(ab, "item=%d", i);
893 if (context->names[i].name) { 667
894 audit_log_format(ab, " name="); 668 audit_log_format(ab, " name=");
669 if (context->names[i].name)
895 audit_log_untrustedstring(ab, context->names[i].name); 670 audit_log_untrustedstring(ab, context->names[i].name);
896 } 671 else
897 audit_log_format(ab, " flags=%x\n", context->names[i].flags); 672 audit_log_format(ab, "(null)");
898 673
899 if (context->names[i].ino != (unsigned long)-1) 674 if (pino != (unsigned long)-1)
900 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" 675 audit_log_format(ab, " parent=%lu", pino);
901 " ouid=%u ogid=%u rdev=%02x:%02x", 676 if (ino != (unsigned long)-1)
902 context->names[i].ino, 677 audit_log_format(ab, " inode=%lu", ino);
903 MAJOR(context->names[i].dev), 678 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
904 MINOR(context->names[i].dev), 679 audit_log_format(ab, " dev=%02x:%02x mode=%#o"
905 context->names[i].mode, 680 " ouid=%u ogid=%u rdev=%02x:%02x",
906 context->names[i].uid, 681 MAJOR(context->names[i].dev),
907 context->names[i].gid, 682 MINOR(context->names[i].dev),
908 MAJOR(context->names[i].rdev), 683 context->names[i].mode,
684 context->names[i].uid,
685 context->names[i].gid,
686 MAJOR(context->names[i].rdev),
909 MINOR(context->names[i].rdev)); 687 MINOR(context->names[i].rdev));
688 if (context->names[i].ctx) {
689 audit_log_format(ab, " obj=%s",
690 context->names[i].ctx);
691 }
692
910 audit_log_end(ab); 693 audit_log_end(ab);
911 } 694 }
912} 695}
913 696
914/* Free a per-task audit context. Called from copy_process and 697/**
915 * __put_task_struct. */ 698 * audit_free - free a per-task audit context
699 * @tsk: task whose audit context block to free
700 *
701 * Called from copy_process and __put_task_struct.
702 */
916void audit_free(struct task_struct *tsk) 703void audit_free(struct task_struct *tsk)
917{ 704{
918 struct audit_context *context; 705 struct audit_context *context;
919 706
920 task_lock(tsk); 707 /*
708 * No need to lock the task - when we execute audit_free()
709 * then the task has no external references anymore, and
710 * we are tearing it down. (The locking also confuses
711 * DEBUG_LOCKDEP - this freeing may occur in softirq
712 * contexts as well, via RCU.)
713 */
921 context = audit_get_context(tsk, 0, 0); 714 context = audit_get_context(tsk, 0, 0);
922 task_unlock(tsk);
923
924 if (likely(!context)) 715 if (likely(!context))
925 return; 716 return;
926 717
@@ -934,13 +725,24 @@ void audit_free(struct task_struct *tsk)
934 audit_free_context(context); 725 audit_free_context(context);
935} 726}
936 727
937/* Fill in audit context at syscall entry. This only happens if the 728/**
729 * audit_syscall_entry - fill in an audit record at syscall entry
730 * @tsk: task being audited
731 * @arch: architecture type
732 * @major: major syscall type (function)
733 * @a1: additional syscall register 1
734 * @a2: additional syscall register 2
735 * @a3: additional syscall register 3
736 * @a4: additional syscall register 4
737 *
738 * Fill in audit context at syscall entry. This only happens if the
938 * audit context was created when the task was created and the state or 739 * audit context was created when the task was created and the state or
939 * filters demand the audit context be built. If the state from the 740 * filters demand the audit context be built. If the state from the
940 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, 741 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
941 * then the record will be written at syscall exit time (otherwise, it 742 * then the record will be written at syscall exit time (otherwise, it
942 * will only be written if another part of the kernel requests that it 743 * will only be written if another part of the kernel requests that it
943 * be written). */ 744 * be written).
745 */
944void audit_syscall_entry(struct task_struct *tsk, int arch, int major, 746void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
945 unsigned long a1, unsigned long a2, 747 unsigned long a1, unsigned long a2,
946 unsigned long a3, unsigned long a4) 748 unsigned long a3, unsigned long a4)
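
audit_syscall_entry() is called from each architecture's syscall entry path with the syscall number and the first four argument registers. The caller below is a made-up i386-flavoured sketch; the register names and AUDIT_ARCH_I386 are assumptions about the arch side, not part of this diff.

/* Hypothetical arch hook; the real callers live in per-arch entry/ptrace code. */
static void example_syscall_trace_enter(struct task_struct *tsk,
					struct pt_regs *regs)
{
	if (unlikely(tsk->audit_context))
		audit_syscall_entry(tsk, AUDIT_ARCH_I386,	/* arch id */
				    regs->orig_eax,		/* major: syscall nr */
				    regs->ebx, regs->ecx,	/* a1, a2 */
				    regs->edx, regs->esi);	/* a3, a4 */
}
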
@@ -950,7 +752,8 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
950 752
951 BUG_ON(!context); 753 BUG_ON(!context);
952 754
953 /* This happens only on certain architectures that make system 755 /*
756 * This happens only on certain architectures that make system
954 * calls in kernel_thread via the entry.S interface, instead of 757 * calls in kernel_thread via the entry.S interface, instead of
955 * with direct calls. (If you are porting to a new 758 * with direct calls. (If you are porting to a new
956 * architecture, hitting this condition can indicate that you 759 * architecture, hitting this condition can indicate that you
@@ -958,7 +761,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
958 * 761 *
959 * i386 no 762 * i386 no
960 * x86_64 no 763 * x86_64 no
961 * ppc64 yes (see arch/ppc64/kernel/misc.S) 764 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
962 * 765 *
963 * This also happens with vm86 emulation in a non-nested manner 766 * This also happens with vm86 emulation in a non-nested manner
964 * (entries without exits), so this case must be caught. 767 * (entries without exits), so this case must be caught.
@@ -966,11 +769,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
966 if (context->in_syscall) { 769 if (context->in_syscall) {
967 struct audit_context *newctx; 770 struct audit_context *newctx;
968 771
969#if defined(__NR_vm86) && defined(__NR_vm86old)
970 /* vm86 mode should only be entered once */
971 if (major == __NR_vm86 || major == __NR_vm86old)
972 return;
973#endif
974#if AUDIT_DEBUG 772#if AUDIT_DEBUG
975 printk(KERN_ERR 773 printk(KERN_ERR
976 "audit(:%d) pid=%d in syscall=%d;" 774 "audit(:%d) pid=%d in syscall=%d;"
@@ -1014,11 +812,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
1014 context->auditable = !!(state == AUDIT_RECORD_CONTEXT); 812 context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
1015} 813}
1016 814
1017/* Tear down after system call. If the audit context has been marked as 815/**
816 * audit_syscall_exit - deallocate audit context after a system call
817 * @tsk: task being audited
818 * @valid: success/failure flag
819 * @return_code: syscall return value
820 *
821 * Tear down after system call. If the audit context has been marked as
1018 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 822 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1019 * filtering, or because some other part of the kernel writes an audit 823 * filtering, or because some other part of the kernel writes an audit
1020 * message), then write out the syscall information. In all cases, 824 * message), then write out the syscall information. In all cases,
1021 * free the names stored from getname(). */ 825 * free the names stored from getname().
826 */
1022void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) 827void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
1023{ 828{
1024 struct audit_context *context; 829 struct audit_context *context;
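
For symmetry with the entry-side sketch earlier, a hedged sketch of an exit-side caller; mapping the raw return value to AUDITSC_FAILURE/AUDITSC_SUCCESS this way is an assumption, as the real arch code is not shown in this diff.

/* Hypothetical exit-side hook, illustration only. */
static void example_syscall_trace_exit(struct task_struct *tsk, long ret)
{
	if (unlikely(tsk->audit_context))
		audit_syscall_exit(tsk,
				   ret < 0 ? AUDITSC_FAILURE : AUDITSC_SUCCESS,
				   ret);
}
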
@@ -1053,7 +858,13 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
1053 put_task_struct(tsk); 858 put_task_struct(tsk);
1054} 859}
1055 860
1056/* Add a name to the list. Called from fs/namei.c:getname(). */ 861/**
862 * audit_getname - add a name to the list
863 * @name: name to add
864 *
865 * Add a name to the list of audit names for this context.
866 * Called from fs/namei.c:getname().
867 */
1057void audit_getname(const char *name) 868void audit_getname(const char *name)
1058{ 869{
1059 struct audit_context *context = current->audit_context; 870 struct audit_context *context = current->audit_context;
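
The comment above describes the hand-off from fs/namei.c:getname(): once a name has been copied in from user space, it is registered with audit_getname() and its putname() is deferred until syscall exit. The sketch below is a simplified, hypothetical getname(); the real one also handles empty and over-long names.

/* Rough sketch only, not the actual fs/namei.c implementation. */
static char *example_getname(const char __user *uname)
{
	char *name = __getname();		/* names_cache allocation */

	if (!name)
		return ERR_PTR(-ENOMEM);
	if (strncpy_from_user(name, uname, PATH_MAX) < 0) {
		__putname(name);
		return ERR_PTR(-EFAULT);
	}
	audit_getname(name);	/* record it; freeing is now deferred */
	return name;
}
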
@@ -1082,10 +893,13 @@ void audit_getname(const char *name)
1082 893
1083} 894}
1084 895
1085/* Intercept a putname request. Called from 896/* audit_putname - intercept a putname request
1086 * include/linux/fs.h:putname(). If we have stored the name from 897 * @name: name to intercept and delay for putname
1087 * getname in the audit context, then we delay the putname until syscall 898 *
1088 * exit. */ 899 * If we have stored the name from getname in the audit context,
900 * then we delay the putname until syscall exit.
901 * Called from include/linux/fs.h:putname().
902 */
1089void audit_putname(const char *name) 903void audit_putname(const char *name)
1090{ 904{
1091 struct audit_context *context = current->audit_context; 905 struct audit_context *context = current->audit_context;
@@ -1100,7 +914,7 @@ void audit_putname(const char *name)
1100 for (i = 0; i < context->name_count; i++) 914 for (i = 0; i < context->name_count; i++)
1101 printk(KERN_ERR "name[%d] = %p = %s\n", i, 915 printk(KERN_ERR "name[%d] = %p = %s\n", i,
1102 context->names[i].name, 916 context->names[i].name,
1103 context->names[i].name); 917 context->names[i].name ?: "(null)");
1104 } 918 }
1105#endif 919#endif
1106 __putname(name); 920 __putname(name);
@@ -1122,9 +936,52 @@ void audit_putname(const char *name)
1122#endif 936#endif
1123} 937}
1124 938
1125/* Store the inode and device from a lookup. Called from 939void audit_inode_context(int idx, const struct inode *inode)
1126 * fs/namei.c:path_lookup(). */ 940{
1127void audit_inode(const char *name, const struct inode *inode, unsigned flags) 941 struct audit_context *context = current->audit_context;
942 const char *suffix = security_inode_xattr_getsuffix();
943 char *ctx = NULL;
944 int len = 0;
945
946 if (!suffix)
947 goto ret;
948
949 len = security_inode_getsecurity(inode, suffix, NULL, 0, 0);
950 if (len == -EOPNOTSUPP)
951 goto ret;
952 if (len < 0)
953 goto error_path;
954
955 ctx = kmalloc(len, GFP_KERNEL);
956 if (!ctx)
957 goto error_path;
958
959 len = security_inode_getsecurity(inode, suffix, ctx, len, 0);
960 if (len < 0)
961 goto error_path;
962
963 kfree(context->names[idx].ctx);
964 context->names[idx].ctx = ctx;
965 goto ret;
966
967error_path:
968 if (ctx)
969 kfree(ctx);
970 audit_panic("error in audit_inode_context");
971ret:
972 return;
973}
974
975
976/**
977 * audit_inode - store the inode and device from a lookup
978 * @name: name being audited
979 * @inode: inode being audited
980 * @flags: lookup flags (as used in path_lookup())
981 *
982 * Called from fs/namei.c:path_lookup().
983 */
984void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1128{ 985{
1129 int idx; 986 int idx;
1130 struct audit_context *context = current->audit_context; 987 struct audit_context *context = current->audit_context;
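
Renaming audit_inode() to __audit_inode() suggests a light-weight wrapper that skips the call when the task has no audit context. That wrapper is not part of this hunk, so the form below is an assumption about its shape.

/* Assumed wrapper shape; the real inline/macro would live in include/linux/audit.h. */
static inline void example_audit_inode(const char *name,
				       const struct inode *inode,
				       unsigned flags)
{
	if (unlikely(current->audit_context))
		__audit_inode(name, inode, flags);
}
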
@@ -1150,15 +1007,105 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags)
1150 ++context->ino_count; 1007 ++context->ino_count;
1151#endif 1008#endif
1152 } 1009 }
1153 context->names[idx].flags = flags;
1154 context->names[idx].ino = inode->i_ino;
1155 context->names[idx].dev = inode->i_sb->s_dev; 1010 context->names[idx].dev = inode->i_sb->s_dev;
1156 context->names[idx].mode = inode->i_mode; 1011 context->names[idx].mode = inode->i_mode;
1157 context->names[idx].uid = inode->i_uid; 1012 context->names[idx].uid = inode->i_uid;
1158 context->names[idx].gid = inode->i_gid; 1013 context->names[idx].gid = inode->i_gid;
1159 context->names[idx].rdev = inode->i_rdev; 1014 context->names[idx].rdev = inode->i_rdev;
1015 audit_inode_context(idx, inode);
1016 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1017 (strcmp(name, ".") != 0)) {
1018 context->names[idx].ino = (unsigned long)-1;
1019 context->names[idx].pino = inode->i_ino;
1020 } else {
1021 context->names[idx].ino = inode->i_ino;
1022 context->names[idx].pino = (unsigned long)-1;
1023 }
1024}
1025
1026/**
1027 * audit_inode_child - collect inode info for created/removed objects
1028 * @dname: inode's dentry name
1029 * @inode: inode being audited
1030 * @pino: inode number of dentry parent
1031 *
1032 * For syscalls that create or remove filesystem objects, audit_inode
1033 * can only collect information for the filesystem object's parent.
1034 * This call updates the audit context with the child's information.
1035 * Syscalls that create a new filesystem object must be hooked after
1036 * the object is created. Syscalls that remove a filesystem object
1037 * must be hooked prior, in order to capture the target inode during
1038 * unsuccessful attempts.
1039 */
1040void __audit_inode_child(const char *dname, const struct inode *inode,
1041 unsigned long pino)
1042{
1043 int idx;
1044 struct audit_context *context = current->audit_context;
1045
1046 if (!context->in_syscall)
1047 return;
1048
1049 /* determine matching parent */
1050 if (dname)
1051 for (idx = 0; idx < context->name_count; idx++)
1052 if (context->names[idx].pino == pino) {
1053 const char *n;
1054 const char *name = context->names[idx].name;
1055 int dlen = strlen(dname);
1056 int nlen = name ? strlen(name) : 0;
1057
1058 if (nlen < dlen)
1059 continue;
1060
1061 /* disregard trailing slashes */
1062 n = name + nlen - 1;
1063 while ((*n == '/') && (n > name))
1064 n--;
1065
1066 /* find last path component */
1067 n = n - dlen + 1;
1068 if (n < name)
1069 continue;
1070 else if (n > name) {
1071 if (*--n != '/')
1072 continue;
1073 else
1074 n++;
1075 }
1076
1077 if (strncmp(n, dname, dlen) == 0)
1078 goto update_context;
1079 }
1080
1081 /* catch-all in case match not found */
1082 idx = context->name_count++;
1083 context->names[idx].name = NULL;
1084 context->names[idx].pino = pino;
1085#if AUDIT_DEBUG
1086 context->ino_count++;
1087#endif
1088
1089update_context:
1090 if (inode) {
1091 context->names[idx].ino = inode->i_ino;
1092 context->names[idx].dev = inode->i_sb->s_dev;
1093 context->names[idx].mode = inode->i_mode;
1094 context->names[idx].uid = inode->i_uid;
1095 context->names[idx].gid = inode->i_gid;
1096 context->names[idx].rdev = inode->i_rdev;
1097 audit_inode_context(idx, inode);
1098 }
1160} 1099}
1161 1100
1101/**
1102 * auditsc_get_stamp - get local copies of audit_context values
1103 * @ctx: audit_context for the task
1104 * @t: timespec to store time recorded in the audit_context
1105 * @serial: serial value that is recorded in the audit_context
1106 *
1107 * Also sets the context as auditable.
1108 */
1162void auditsc_get_stamp(struct audit_context *ctx, 1109void auditsc_get_stamp(struct audit_context *ctx,
1163 struct timespec *t, unsigned int *serial) 1110 struct timespec *t, unsigned int *serial)
1164{ 1111{
@@ -1170,6 +1117,15 @@ void auditsc_get_stamp(struct audit_context *ctx,
1170 ctx->auditable = 1; 1117 ctx->auditable = 1;
1171} 1118}
1172 1119
1120/**
1121 * audit_set_loginuid - set a task's audit_context loginuid
1122 * @task: task whose audit context is being modified
1123 * @loginuid: loginuid value
1124 *
1125 * Returns 0.
1126 *
1127 * Called (set) from fs/proc/base.c::proc_loginuid_write().
1128 */
1173int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1129int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1174{ 1130{
1175 if (task->audit_context) { 1131 if (task->audit_context) {
@@ -1188,12 +1144,59 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1188 return 0; 1144 return 0;
1189} 1145}
1190 1146
1147/**
1148 * audit_get_loginuid - get the loginuid for an audit_context
1149 * @ctx: the audit_context
1150 *
1151 * Returns the context's loginuid or -1 if @ctx is NULL.
1152 */
1191uid_t audit_get_loginuid(struct audit_context *ctx) 1153uid_t audit_get_loginuid(struct audit_context *ctx)
1192{ 1154{
1193 return ctx ? ctx->loginuid : -1; 1155 return ctx ? ctx->loginuid : -1;
1194} 1156}
1195 1157
1196int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 1158static char *audit_ipc_context(struct kern_ipc_perm *ipcp)
1159{
1160 struct audit_context *context = current->audit_context;
1161 char *ctx = NULL;
1162 int len = 0;
1163
1164 if (likely(!context))
1165 return NULL;
1166
1167 len = security_ipc_getsecurity(ipcp, NULL, 0);
1168 if (len == -EOPNOTSUPP)
1169 goto ret;
1170 if (len < 0)
1171 goto error_path;
1172
1173 ctx = kmalloc(len, GFP_ATOMIC);
1174 if (!ctx)
1175 goto error_path;
1176
1177 len = security_ipc_getsecurity(ipcp, ctx, len);
1178 if (len < 0)
1179 goto error_path;
1180
1181 return ctx;
1182
1183error_path:
1184 kfree(ctx);
1185 audit_panic("error in audit_ipc_context");
1186ret:
1187 return NULL;
1188}
1189
1190/**
1191 * audit_ipc_perms - record audit data for ipc
1192 * @qbytes: msgq bytes
1193 * @uid: msgq user id
1194 * @gid: msgq group id
1195 * @mode: msgq mode (permissions)
1196 *
1197 * Returns 0 for success or NULL context or < 0 on error.
1198 */
1199int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
1197{ 1200{
1198 struct audit_aux_data_ipcctl *ax; 1201 struct audit_aux_data_ipcctl *ax;
1199 struct audit_context *context = current->audit_context; 1202 struct audit_context *context = current->audit_context;
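
audit_ipc_perms() now also takes the object's struct kern_ipc_perm so the obj= security context can be recorded. The caller below is a made-up msgctl-style example; the real call sites in ipc/ are not part of this diff.

/* Hypothetical caller sketch. */
static int example_msgctl_audit(struct msg_queue *msq)
{
	struct kern_ipc_perm *ipcp = &msq->q_perm;

	return audit_ipc_perms(msq->q_qbytes, ipcp->uid, ipcp->gid,
			       ipcp->mode, ipcp);
}
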
@@ -1201,7 +1204,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1201 if (likely(!context)) 1204 if (likely(!context))
1202 return 0; 1205 return 0;
1203 1206
1204 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 1207 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1205 if (!ax) 1208 if (!ax)
1206 return -ENOMEM; 1209 return -ENOMEM;
1207 1210
@@ -1209,6 +1212,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1209 ax->uid = uid; 1212 ax->uid = uid;
1210 ax->gid = gid; 1213 ax->gid = gid;
1211 ax->mode = mode; 1214 ax->mode = mode;
1215 ax->ctx = audit_ipc_context(ipcp);
1212 1216
1213 ax->d.type = AUDIT_IPC; 1217 ax->d.type = AUDIT_IPC;
1214 ax->d.next = context->aux; 1218 ax->d.next = context->aux;
@@ -1216,6 +1220,13 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1216 return 0; 1220 return 0;
1217} 1221}
1218 1222
1223/**
1224 * audit_socketcall - record audit data for sys_socketcall
1225 * @nargs: number of args
1226 * @args: args array
1227 *
1228 * Returns 0 for success or NULL context or < 0 on error.
1229 */
1219int audit_socketcall(int nargs, unsigned long *args) 1230int audit_socketcall(int nargs, unsigned long *args)
1220{ 1231{
1221 struct audit_aux_data_socketcall *ax; 1232 struct audit_aux_data_socketcall *ax;
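
audit_socketcall() records the multiplexed socketcall arguments as an aux record. Below is a hedged sketch of the caller's side, after the argument block has been copied from user space; it is not the actual net/socket.c code.

/* Illustration only; argument copying and dispatch are elided. */
static long example_socketcall(int call, unsigned long args[6], int nargs)
{
	int err = audit_socketcall(nargs, args);

	if (err)
		return err;		/* -ENOMEM while building the aux record */

	/* ... dispatch to sys_socket(), sys_bind(), sys_connect(), ... */
	return 0;
}
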
@@ -1237,6 +1248,13 @@ int audit_socketcall(int nargs, unsigned long *args)
1237 return 0; 1248 return 0;
1238} 1249}
1239 1250
1251/**
1252 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
1253 * @len: data length in user space
1254 * @a: data address in kernel space
1255 *
1256 * Returns 0 for success or NULL context or < 0 on error.
1257 */
1240int audit_sockaddr(int len, void *a) 1258int audit_sockaddr(int len, void *a)
1241{ 1259{
1242 struct audit_aux_data_sockaddr *ax; 1260 struct audit_aux_data_sockaddr *ax;
@@ -1258,6 +1276,15 @@ int audit_sockaddr(int len, void *a)
1258 return 0; 1276 return 0;
1259} 1277}
1260 1278
1279/**
1280 * audit_avc_path - record the granting or denial of permissions
1281 * @dentry: dentry to record
1282 * @mnt: mnt to record
1283 *
1284 * Returns 0 for success or NULL context or < 0 on error.
1285 *
1286 * Called from security/selinux/avc.c::avc_audit()
1287 */
1261int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) 1288int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1262{ 1289{
1263 struct audit_aux_data_path *ax; 1290 struct audit_aux_data_path *ax;
@@ -1279,6 +1306,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1279 return 0; 1306 return 0;
1280} 1307}
1281 1308
1309/**
1310 * audit_signal_info - record signal info for shutting down audit subsystem
1311 * @sig: signal value
1312 * @t: task being signaled
1313 *
1314 * If the audit subsystem is being terminated, record the task (pid)
1315 * and uid that is doing that.
1316 */
1282void audit_signal_info(int sig, struct task_struct *t) 1317void audit_signal_info(int sig, struct task_struct *t)
1283{ 1318{
1284 extern pid_t audit_sig_pid; 1319 extern pid_t audit_sig_pid;
@@ -1295,4 +1330,3 @@ void audit_signal_info(int sig, struct task_struct *t)
1295 } 1330 }
1296 } 1331 }
1297} 1332}
1298
diff --git a/kernel/capability.c b/kernel/capability.c
index bfa3c92e16..1a4d8a40d3 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -233,3 +233,19 @@ out:
233 233
234 return ret; 234 return ret;
235} 235}
236
237int __capable(struct task_struct *t, int cap)
238{
239 if (security_capable(t, cap) == 0) {
240 t->flags |= PF_SUPERPRIV;
241 return 1;
242 }
243 return 0;
244}
245EXPORT_SYMBOL(__capable);
246
247int capable(int cap)
248{
249 return __capable(current, cap);
250}
251EXPORT_SYMBOL(capable);
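
capable() now simply delegates to __capable(current, cap), which asks the security module and sets PF_SUPERPRIV on success. A trivial usage sketch follows; the surrounding function is made up, CAP_SYS_ADMIN is a standard capability.

static int example_set_policy(void)
{
	if (!capable(CAP_SYS_ADMIN))	/* sets PF_SUPERPRIV when granted */
		return -EPERM;

	/* ... privileged work ... */
	return 0;
}
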
diff --git a/kernel/compat.c b/kernel/compat.c
index 8c9cd88b67..c1601a84f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,10 +17,10 @@
17#include <linux/time.h> 17#include <linux/time.h>
18#include <linux/signal.h> 18#include <linux/signal.h>
19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ 19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
20#include <linux/futex.h> /* for FUTEX_WAIT */
21#include <linux/syscalls.h> 20#include <linux/syscalls.h>
22#include <linux/unistd.h> 21#include <linux/unistd.h>
23#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h>
24 24
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26 26
@@ -238,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
238 return ret; 238 return ret;
239} 239}
240 240
241#ifdef CONFIG_FUTEX
242asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
243 struct compat_timespec __user *utime, u32 __user *uaddr2,
244 int val3)
245{
246 struct timespec t;
247 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
248 int val2 = 0;
249
250 if ((op == FUTEX_WAIT) && utime) {
251 if (get_compat_timespec(&t, utime))
252 return -EFAULT;
253 timeout = timespec_to_jiffies(&t) + 1;
254 }
255 if (op >= FUTEX_REQUEUE)
256 val2 = (int) (unsigned long) utime;
257
258 return do_futex((unsigned long)uaddr, op, val, timeout,
259 (unsigned long)uaddr2, val2, val3);
260}
261#endif
262
263asmlinkage long compat_sys_setrlimit(unsigned int resource, 241asmlinkage long compat_sys_setrlimit(unsigned int resource,
264 struct compat_rlimit __user *rlim) 242 struct compat_rlimit __user *rlim)
265{ 243{
@@ -898,3 +876,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
898 return -ERESTARTNOHAND; 876 return -ERESTARTNOHAND;
899} 877}
900#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 878#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
879
880asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
881{
882 struct timex txc;
883 int ret;
884
885 memset(&txc, 0, sizeof(struct timex));
886
887 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
888 __get_user(txc.modes, &utp->modes) ||
889 __get_user(txc.offset, &utp->offset) ||
890 __get_user(txc.freq, &utp->freq) ||
891 __get_user(txc.maxerror, &utp->maxerror) ||
892 __get_user(txc.esterror, &utp->esterror) ||
893 __get_user(txc.status, &utp->status) ||
894 __get_user(txc.constant, &utp->constant) ||
895 __get_user(txc.precision, &utp->precision) ||
896 __get_user(txc.tolerance, &utp->tolerance) ||
897 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
898 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
899 __get_user(txc.tick, &utp->tick) ||
900 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
901 __get_user(txc.jitter, &utp->jitter) ||
902 __get_user(txc.shift, &utp->shift) ||
903 __get_user(txc.stabil, &utp->stabil) ||
904 __get_user(txc.jitcnt, &utp->jitcnt) ||
905 __get_user(txc.calcnt, &utp->calcnt) ||
906 __get_user(txc.errcnt, &utp->errcnt) ||
907 __get_user(txc.stbcnt, &utp->stbcnt))
908 return -EFAULT;
909
910 ret = do_adjtimex(&txc);
911
912 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
913 __put_user(txc.modes, &utp->modes) ||
914 __put_user(txc.offset, &utp->offset) ||
915 __put_user(txc.freq, &utp->freq) ||
916 __put_user(txc.maxerror, &utp->maxerror) ||
917 __put_user(txc.esterror, &utp->esterror) ||
918 __put_user(txc.status, &utp->status) ||
919 __put_user(txc.constant, &utp->constant) ||
920 __put_user(txc.precision, &utp->precision) ||
921 __put_user(txc.tolerance, &utp->tolerance) ||
922 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
923 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
924 __put_user(txc.tick, &utp->tick) ||
925 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
926 __put_user(txc.jitter, &utp->jitter) ||
927 __put_user(txc.shift, &utp->shift) ||
928 __put_user(txc.stabil, &utp->stabil) ||
929 __put_user(txc.jitcnt, &utp->jitcnt) ||
930 __put_user(txc.calcnt, &utp->calcnt) ||
931 __put_user(txc.errcnt, &utp->errcnt) ||
932 __put_user(txc.stbcnt, &utp->stbcnt))
933 ret = -EFAULT;
934
935 return ret;
936}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e882c6babf..fe2b8d0bfe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DECLARE_MUTEX(cpucontrol);
20 20
21static struct notifier_block *cpu_chain; 21static BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 int ret; 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75
76 if ((ret = lock_cpu_hotplug_interruptible()) != 0)
77 return ret;
78 ret = notifier_chain_register(&cpu_chain, nb);
79 unlock_cpu_hotplug();
80 return ret;
81} 75}
82EXPORT_SYMBOL(register_cpu_notifier); 76EXPORT_SYMBOL(register_cpu_notifier);
83 77
84void unregister_cpu_notifier(struct notifier_block *nb) 78void unregister_cpu_notifier(struct notifier_block *nb)
85{ 79{
86 lock_cpu_hotplug(); 80 blocking_notifier_chain_unregister(&cpu_chain, nb);
87 notifier_chain_unregister(&cpu_chain, nb);
88 unlock_cpu_hotplug();
89} 81}
90EXPORT_SYMBOL(unregister_cpu_notifier); 82EXPORT_SYMBOL(unregister_cpu_notifier);
91 83
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
141 goto out; 133 goto out;
142 } 134 }
143 135
144 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 136 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
145 (void *)(long)cpu); 137 (void *)(long)cpu);
146 if (err == NOTIFY_BAD) { 138 if (err == NOTIFY_BAD) {
147 printk("%s: attempt to take down CPU %u failed\n", 139 printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
159 p = __stop_machine_run(take_cpu_down, NULL, cpu); 151 p = __stop_machine_run(take_cpu_down, NULL, cpu);
160 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
161 /* CPU didn't die: tell everyone. Can't complain. */ 153 /* CPU didn't die: tell everyone. Can't complain. */
162 if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 154 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
163 (void *)(long)cpu) == NOTIFY_BAD) 155 (void *)(long)cpu) == NOTIFY_BAD)
164 BUG(); 156 BUG();
165 157
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
182 put_cpu(); 174 put_cpu();
183 175
184 /* CPU is completely dead: tell everyone. Too late to complain. */ 176 /* CPU is completely dead: tell everyone. Too late to complain. */
185 if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) 177 if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
186 == NOTIFY_BAD) 178 (void *)(long)cpu) == NOTIFY_BAD)
187 BUG(); 179 BUG();
188 180
189 check_for_tasks(cpu); 181 check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
211 goto out; 203 goto out;
212 } 204 }
213 205
214 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 206 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
215 if (ret == NOTIFY_BAD) { 207 if (ret == NOTIFY_BAD) {
216 printk("%s: attempt to bring up CPU %u failed\n", 208 printk("%s: attempt to bring up CPU %u failed\n",
217 __FUNCTION__, cpu); 209 __FUNCTION__, cpu);
@@ -223,15 +215,15 @@ int __devinit cpu_up(unsigned int cpu)
223 ret = __cpu_up(cpu); 215 ret = __cpu_up(cpu);
224 if (ret != 0) 216 if (ret != 0)
225 goto out_notify; 217 goto out_notify;
226 if (!cpu_online(cpu)) 218 BUG_ON(!cpu_online(cpu));
227 BUG();
228 219
229 /* Now call notifier in preparation. */ 220 /* Now call notifier in preparation. */
230 notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 221 blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
231 222
232out_notify: 223out_notify:
233 if (ret != 0) 224 if (ret != 0)
234 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); 225 blocking_notifier_call_chain(&cpu_chain,
226 CPU_UP_CANCELED, hcpu);
235out: 227out:
236 unlock_cpu_hotplug(); 228 unlock_cpu_hotplug();
237 return ret; 229 return ret;
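
With cpu_chain converted to a blocking notifier head, registration no longer has to take the hotplug lock by hand. A hedged sketch of a typical subscriber (callback and symbol names are made up):

static int example_cpu_callback(struct notifier_block *nb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		printk(KERN_INFO "example: cpu %u online\n", cpu);
		break;
	case CPU_DEAD:
		printk(KERN_INFO "example: cpu %u gone\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_nb = {
	.notifier_call = example_cpu_callback,
};

static int __init example_init(void)
{
	return register_cpu_notifier(&example_cpu_nb);
}
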
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a..18aea1bd12 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
4 * Processor and Memory placement constraints for sets of tasks. 4 * Processor and Memory placement constraints for sets of tasks.
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 * Portions derived from Patrick Mochel's sysfs code. 9 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 10 * sysfs is Copyright (c) 2001-3 Patrick Mochel
11 * Portions Copyright (c) 2004 Silicon Graphics, Inc.
12 * 11 *
13 * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> 12 * 2003-10-10 Written by Simon Derr.
14 * 2003-10-22 Updates by Stephen Hemminger. 13 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson <pj@sgi.com> 14 * 2004 May-July Rework by Paul Jackson.
16 * 15 *
17 * This file is subject to the terms and conditions of the GNU General Public 16 * This file is subject to the terms and conditions of the GNU General Public
18 * License. See the file COPYING in the main directory of the Linux 17 * License. See the file COPYING in the main directory of the Linux
@@ -53,7 +52,7 @@
53 52
54#include <asm/uaccess.h> 53#include <asm/uaccess.h>
55#include <asm/atomic.h> 54#include <asm/atomic.h>
56#include <asm/semaphore.h> 55#include <linux/mutex.h>
57 56
58#define CPUSET_SUPER_MAGIC 0x27e0eb 57#define CPUSET_SUPER_MAGIC 0x27e0eb
59 58
@@ -108,37 +107,49 @@ typedef enum {
108 CS_MEM_EXCLUSIVE, 107 CS_MEM_EXCLUSIVE,
109 CS_MEMORY_MIGRATE, 108 CS_MEMORY_MIGRATE,
110 CS_REMOVED, 109 CS_REMOVED,
111 CS_NOTIFY_ON_RELEASE 110 CS_NOTIFY_ON_RELEASE,
111 CS_SPREAD_PAGE,
112 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 113} cpuset_flagbits_t;
113 114
114/* convenient tests for these bits */ 115/* convenient tests for these bits */
115static inline int is_cpu_exclusive(const struct cpuset *cs) 116static inline int is_cpu_exclusive(const struct cpuset *cs)
116{ 117{
117 return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 118 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
118} 119}
119 120
120static inline int is_mem_exclusive(const struct cpuset *cs) 121static inline int is_mem_exclusive(const struct cpuset *cs)
121{ 122{
122 return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 123 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
123} 124}
124 125
125static inline int is_removed(const struct cpuset *cs) 126static inline int is_removed(const struct cpuset *cs)
126{ 127{
127 return !!test_bit(CS_REMOVED, &cs->flags); 128 return test_bit(CS_REMOVED, &cs->flags);
128} 129}
129 130
130static inline int notify_on_release(const struct cpuset *cs) 131static inline int notify_on_release(const struct cpuset *cs)
131{ 132{
132 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 133 return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
133} 134}
134 135
135static inline int is_memory_migrate(const struct cpuset *cs) 136static inline int is_memory_migrate(const struct cpuset *cs)
136{ 137{
137 return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); 138 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
139}
140
141static inline int is_spread_page(const struct cpuset *cs)
142{
143 return test_bit(CS_SPREAD_PAGE, &cs->flags);
144}
145
146static inline int is_spread_slab(const struct cpuset *cs)
147{
148 return test_bit(CS_SPREAD_SLAB, &cs->flags);
138} 149}
139 150
140/* 151/*
141 * Increment this atomic integer everytime any cpuset changes its 152 * Increment this integer everytime any cpuset changes its
142 * mems_allowed value. Users of cpusets can track this generation 153 * mems_allowed value. Users of cpusets can track this generation
143 * number, and avoid having to lock and reload mems_allowed unless 154 * number, and avoid having to lock and reload mems_allowed unless
144 * the cpuset they're using changes generation. 155 * the cpuset they're using changes generation.
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
152 * on every visit to __alloc_pages(), to efficiently check whether 163 * on every visit to __alloc_pages(), to efficiently check whether
153 * its current->cpuset->mems_allowed has changed, requiring an update 164 * its current->cpuset->mems_allowed has changed, requiring an update
154 * of its current->mems_allowed. 165 * of its current->mems_allowed.
166 *
167 * Since cpuset_mems_generation is guarded by manage_mutex,
168 * there is no need to mark it atomic.
155 */ 169 */
156static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); 170static int cpuset_mems_generation;
157 171
158static struct cpuset top_cpuset = { 172static struct cpuset top_cpuset = {
159 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 173 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
@@ -168,63 +182,57 @@ static struct vfsmount *cpuset_mount;
168static struct super_block *cpuset_sb; 182static struct super_block *cpuset_sb;
169 183
170/* 184/*
171 * We have two global cpuset semaphores below. They can nest. 185 * We have two global cpuset mutexes below. They can nest.
172 * It is ok to first take manage_sem, then nest callback_sem. We also 186 * It is ok to first take manage_mutex, then nest callback_mutex. We also
173 * require taking task_lock() when dereferencing a tasks cpuset pointer. 187 * require taking task_lock() when dereferencing a tasks cpuset pointer.
174 * See "The task_lock() exception", at the end of this comment. 188 * See "The task_lock() exception", at the end of this comment.
175 * 189 *
176 * A task must hold both semaphores to modify cpusets. If a task 190 * A task must hold both mutexes to modify cpusets. If a task
177 * holds manage_sem, then it blocks others wanting that semaphore, 191 * holds manage_mutex, then it blocks others wanting that mutex,
178 * ensuring that it is the only task able to also acquire callback_sem 192 * ensuring that it is the only task able to also acquire callback_mutex
179 * and be able to modify cpusets. It can perform various checks on 193 * and be able to modify cpusets. It can perform various checks on
180 * the cpuset structure first, knowing nothing will change. It can 194 * the cpuset structure first, knowing nothing will change. It can
181 * also allocate memory while just holding manage_sem. While it is 195 * also allocate memory while just holding manage_mutex. While it is
182 * performing these checks, various callback routines can briefly 196 * performing these checks, various callback routines can briefly
183 * acquire callback_sem to query cpusets. Once it is ready to make 197 * acquire callback_mutex to query cpusets. Once it is ready to make
184 * the changes, it takes callback_sem, blocking everyone else. 198 * the changes, it takes callback_mutex, blocking everyone else.
185 * 199 *
186 * Calls to the kernel memory allocator can not be made while holding 200 * Calls to the kernel memory allocator can not be made while holding
187 * callback_sem, as that would risk double tripping on callback_sem 201 * callback_mutex, as that would risk double tripping on callback_mutex
188 * from one of the callbacks into the cpuset code from within 202 * from one of the callbacks into the cpuset code from within
189 * __alloc_pages(). 203 * __alloc_pages().
190 * 204 *
191 * If a task is only holding callback_sem, then it has read-only 205 * If a task is only holding callback_mutex, then it has read-only
192 * access to cpusets. 206 * access to cpusets.
193 * 207 *
194 * The task_struct fields mems_allowed and mems_generation may only 208 * The task_struct fields mems_allowed and mems_generation may only
195 * be accessed in the context of that task, so require no locks. 209 * be accessed in the context of that task, so require no locks.
196 * 210 *
197 * Any task can increment and decrement the count field without lock. 211 * Any task can increment and decrement the count field without lock.
198 * So in general, code holding manage_sem or callback_sem can't rely 212 * So in general, code holding manage_mutex or callback_mutex can't rely
199 * on the count field not changing. However, if the count goes to 213 * on the count field not changing. However, if the count goes to
200 * zero, then only attach_task(), which holds both semaphores, can 214 * zero, then only attach_task(), which holds both mutexes, can
201 * increment it again. Because a count of zero means that no tasks 215 * increment it again. Because a count of zero means that no tasks
202 * are currently attached, therefore there is no way a task attached 216 * are currently attached, therefore there is no way a task attached
203 * to that cpuset can fork (the other way to increment the count). 217 * to that cpuset can fork (the other way to increment the count).
204 * So code holding manage_sem or callback_sem can safely assume that 218 * So code holding manage_mutex or callback_mutex can safely assume that
205 * if the count is zero, it will stay zero. Similarly, if a task 219 * if the count is zero, it will stay zero. Similarly, if a task
206 * holds manage_sem or callback_sem on a cpuset with zero count, it 220 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs 221 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
208 * both of those semaphores. 222 * both of those mutexes.
209 *
210 * A possible optimization to improve parallelism would be to make
211 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
212 * to proceed in parallel, with read access, until the holder of
213 * manage_sem needed to take this rwsem for exclusive write access
214 * and modify some cpusets.
215 * 223 *
216 * The cpuset_common_file_write handler for operations that modify 224 * The cpuset_common_file_write handler for operations that modify
217 * the cpuset hierarchy holds manage_sem across the entire operation, 225 * the cpuset hierarchy holds manage_mutex across the entire operation,
218 * single threading all such cpuset modifications across the system. 226 * single threading all such cpuset modifications across the system.
219 * 227 *
220 * The cpuset_common_file_read() handlers only hold callback_sem across 228 * The cpuset_common_file_read() handlers only hold callback_mutex across
221 * small pieces of code, such as when reading out possibly multi-word 229 * small pieces of code, such as when reading out possibly multi-word
222 * cpumasks and nodemasks. 230 * cpumasks and nodemasks.
223 * 231 *
224 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't 232 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
225 * (usually) take either semaphore. These are the two most performance 233 * (usually) take either mutex. These are the two most performance
226 * critical pieces of code here. The exception occurs on cpuset_exit(), 234 * critical pieces of code here. The exception occurs on cpuset_exit(),
227 * when a task in a notify_on_release cpuset exits. Then manage_sem 235 * when a task in a notify_on_release cpuset exits. Then manage_mutex
228 * is taken, and if the cpuset count is zero, a usermode call made 236 * is taken, and if the cpuset count is zero, a usermode call made
229 * to /sbin/cpuset_release_agent with the name of the cpuset (path 237 * to /sbin/cpuset_release_agent with the name of the cpuset (path
230 * relative to the root of cpuset file system) as the argument. 238 * relative to the root of cpuset file system) as the argument.
@@ -242,9 +250,9 @@ static struct super_block *cpuset_sb;
242 * 250 *
243 * The need for this exception arises from the action of attach_task(), 251 * The need for this exception arises from the action of attach_task(),
244 * which overwrites one tasks cpuset pointer with another. It does 252 * which overwrites one tasks cpuset pointer with another. It does
245 * so using both semaphores, however there are several performance 253 * so using both mutexes, however there are several performance
246 * critical places that need to reference task->cpuset without the 254 * critical places that need to reference task->cpuset without the
247 * expense of grabbing a system global semaphore. Therefore except as 255 * expense of grabbing a system global mutex. Therefore except as
248 * noted below, when dereferencing or, as in attach_task(), modifying 256 * noted below, when dereferencing or, as in attach_task(), modifying
249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 257 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
250 * (task->alloc_lock) already in the task_struct routinely used for 258 * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +264,8 @@ static struct super_block *cpuset_sb;
256 * the routine cpuset_update_task_memory_state(). 264 * the routine cpuset_update_task_memory_state().
257 */ 265 */
258 266
259static DECLARE_MUTEX(manage_sem); 267static DEFINE_MUTEX(manage_mutex);
260static DECLARE_MUTEX(callback_sem); 268static DEFINE_MUTEX(callback_mutex);
261 269
262/* 270/*
263 * A couple of forward declarations required, due to cyclic reference loop: 271 * A couple of forward declarations required, due to cyclic reference loop:
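The hunk above is the heart of the conversion: the two cpuset locks become real mutexes, statically defined with DEFINE_MUTEX() and taken with mutex_lock()/mutex_unlock() instead of down()/up(), while the two-level rule stays the same: manage_mutex is the outer, long-held lock for modifications, callback_mutex the inner, briefly-held lock for snapshots. A minimal userspace sketch of that ordering, using POSIX threads rather than the kernel API (the names are simply mirrored from the patch, the int mask stands in for the allowed-CPU/node masks):

#include <pthread.h>
#include <stdio.h>

/* Outer lock: serializes all modifications (kernel: manage_mutex). */
static pthread_mutex_t manage_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Inner lock: guards brief reads/updates of shared masks (kernel: callback_mutex). */
static pthread_mutex_t callback_mutex = PTHREAD_MUTEX_INITIALIZER;

static int shared_mask;                 /* stands in for cpus_allowed/mems_allowed */

/* Writer: always takes manage_mutex first, callback_mutex second. */
static void modify(int newmask)
{
        pthread_mutex_lock(&manage_mutex);
        /* ... validation work that may sleep or allocate goes here ... */
        pthread_mutex_lock(&callback_mutex);
        shared_mask = newmask;          /* the short critical section */
        pthread_mutex_unlock(&callback_mutex);
        pthread_mutex_unlock(&manage_mutex);
}

/* Reader: only needs the inner lock for a consistent snapshot. */
static int query(void)
{
        pthread_mutex_lock(&callback_mutex);
        int snapshot = shared_mask;
        pthread_mutex_unlock(&callback_mutex);
        return snapshot;
}

int main(void)
{
        modify(0x3);
        printf("mask = %#x\n", query());
        return 0;
}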
@@ -432,7 +440,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
432} 440}
433 441
434/* 442/*
435 * Call with manage_sem held. Writes path of cpuset into buf. 443 * Call with manage_mutex held. Writes path of cpuset into buf.
436 * Returns 0 on success, -errno on error. 444 * Returns 0 on success, -errno on error.
437 */ 445 */
438 446
@@ -484,11 +492,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
484 * status of the /sbin/cpuset_release_agent task, so no sense holding 492 * status of the /sbin/cpuset_release_agent task, so no sense holding
485 * our caller up for that. 493 * our caller up for that.
486 * 494 *
487 * When we had only one cpuset semaphore, we had to call this 495 * When we had only one cpuset mutex, we had to call this
488 * without holding it, to avoid deadlock when call_usermodehelper() 496 * without holding it, to avoid deadlock when call_usermodehelper()
489 * allocated memory. With two locks, we could now call this while 497 * allocated memory. With two locks, we could now call this while
490 * holding manage_sem, but we still don't, so as to minimize 498 * holding manage_mutex, but we still don't, so as to minimize
491 * the time manage_sem is held. 499 * the time manage_mutex is held.
492 */ 500 */
493 501
494static void cpuset_release_agent(const char *pathbuf) 502static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +528,15 @@ static void cpuset_release_agent(const char *pathbuf)
520 * cs is notify_on_release() and now both the user count is zero and 528 * cs is notify_on_release() and now both the user count is zero and
521 * the list of children is empty, prepare cpuset path in a kmalloc'd 529 * the list of children is empty, prepare cpuset path in a kmalloc'd
522 * buffer, to be returned via ppathbuf, so that the caller can invoke 530 * buffer, to be returned via ppathbuf, so that the caller can invoke
523 * cpuset_release_agent() with it later on, once manage_sem is dropped. 531 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
524 * Call here with manage_sem held. 532 * Call here with manage_mutex held.
525 * 533 *
526 * This check_for_release() routine is responsible for kmalloc'ing 534 * This check_for_release() routine is responsible for kmalloc'ing
527 * pathbuf. The above cpuset_release_agent() is responsible for 535 * pathbuf. The above cpuset_release_agent() is responsible for
528 * kfree'ing pathbuf. The caller of these routines is responsible 536 * kfree'ing pathbuf. The caller of these routines is responsible
529 * for providing a pathbuf pointer, initialized to NULL, then 537 * for providing a pathbuf pointer, initialized to NULL, then
530 * calling check_for_release() with manage_sem held and the address 538 * calling check_for_release() with manage_mutex held and the address
531 * of the pathbuf pointer, then dropping manage_sem, then calling 539 * of the pathbuf pointer, then dropping manage_mutex, then calling
532 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 540 * cpuset_release_agent() with pathbuf, as set by check_for_release().
533 */ 541 */
534 542
@@ -559,7 +567,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
559 * One way or another, we guarantee to return some non-empty subset 567 * One way or another, we guarantee to return some non-empty subset
560 * of cpu_online_map. 568 * of cpu_online_map.
561 * 569 *
562 * Call with callback_sem held. 570 * Call with callback_mutex held.
563 */ 571 */
564 572
565static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 573static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +591,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
583 * One way or another, we guarantee to return some non-empty subset 591 * One way or another, we guarantee to return some non-empty subset
584 * of node_online_map. 592 * of node_online_map.
585 * 593 *
586 * Call with callback_sem held. 594 * Call with callback_mutex held.
587 */ 595 */
588 596
589static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 597static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +616,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
608 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
609 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
610 * 618 *
611 * Call without callback_sem or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be called
612 * with or without manage_sem held. Doesn't need task_lock to guard 620 * with or without manage_mutex held. Doesn't need task_lock to guard
613 * against another task changing a non-NULL cpuset pointer to NULL, 621 * against another task changing a non-NULL cpuset pointer to NULL,
614 * as that is only done by a task on itself, and if the current task 622 * as that is only done by a task on itself, and if the current task
615 * is here, it is not simultaneously in the exit code NULL'ing its 623 * is here, it is not simultaneously in the exit code NULL'ing its
616 * cpuset pointer. This routine also might acquire callback_sem and 624 * cpuset pointer. This routine also might acquire callback_mutex and
617 * current->mm->mmap_sem during call. 625 * current->mm->mmap_sem during call.
618 * 626 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock 627 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +666,21 @@ void cpuset_update_task_memory_state(void)
658 } 666 }
659 667
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 668 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
661 down(&callback_sem); 669 mutex_lock(&callback_mutex);
662 task_lock(tsk); 670 task_lock(tsk);
663 cs = tsk->cpuset; /* Maybe changed when task not locked */ 671 cs = tsk->cpuset; /* Maybe changed when task not locked */
664 guarantee_online_mems(cs, &tsk->mems_allowed); 672 guarantee_online_mems(cs, &tsk->mems_allowed);
665 tsk->cpuset_mems_generation = cs->mems_generation; 673 tsk->cpuset_mems_generation = cs->mems_generation;
674 if (is_spread_page(cs))
675 tsk->flags |= PF_SPREAD_PAGE;
676 else
677 tsk->flags &= ~PF_SPREAD_PAGE;
678 if (is_spread_slab(cs))
679 tsk->flags |= PF_SPREAD_SLAB;
680 else
681 tsk->flags &= ~PF_SPREAD_SLAB;
666 task_unlock(tsk); 682 task_unlock(tsk);
667 up(&callback_sem); 683 mutex_unlock(&callback_mutex);
668 mpol_rebind_task(tsk, &tsk->mems_allowed); 684 mpol_rebind_task(tsk, &tsk->mems_allowed);
669 } 685 }
670} 686}
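The new lines in this hunk copy the cpuset's spread settings into the task's flag word whenever the task notices a new memory generation, so allocators can later test a per-task bit without touching the cpuset or its locks. The pattern is simply "mirror a boolean into a flag bit"; a small illustration in plain C (the PF_SPREAD_PAGE/PF_SPREAD_SLAB values here are made up for the sketch, not the kernel's):

#include <stdio.h>

#define PF_SPREAD_PAGE  0x01000000UL    /* illustrative values only */
#define PF_SPREAD_SLAB  0x02000000UL

/* Set or clear one bit in *flags according to a boolean condition. */
static void mirror_flag(unsigned long *flags, unsigned long bit, int on)
{
        if (on)
                *flags |= bit;
        else
                *flags &= ~bit;
}

int main(void)
{
        unsigned long task_flags = 0;
        int cpuset_spread_page = 1, cpuset_spread_slab = 0;

        mirror_flag(&task_flags, PF_SPREAD_PAGE, cpuset_spread_page);
        mirror_flag(&task_flags, PF_SPREAD_SLAB, cpuset_spread_slab);
        printf("task flags = %#lx\n", task_flags);
        return 0;
}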
@@ -674,7 +690,7 @@ void cpuset_update_task_memory_state(void)
674 * 690 *
675 * One cpuset is a subset of another if all its allowed CPUs and 691 * One cpuset is a subset of another if all its allowed CPUs and
676 * Memory Nodes are a subset of the other, and its exclusive flags 692 * Memory Nodes are a subset of the other, and its exclusive flags
677 * are only set if the other's are set. Call holding manage_sem. 693 * are only set if the other's are set. Call holding manage_mutex.
678 */ 694 */
679 695
680static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 696static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +708,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
692 * If we replaced the flag and mask values of the current cpuset 708 * If we replaced the flag and mask values of the current cpuset
693 * (cur) with those values in the trial cpuset (trial), would 709 * (cur) with those values in the trial cpuset (trial), would
694 * our various subset and exclusive rules still be valid? Presumes 710 * our various subset and exclusive rules still be valid? Presumes
695 * manage_sem held. 711 * manage_mutex held.
696 * 712 *
697 * 'cur' is the address of an actual, in-use cpuset. Operations 713 * 'cur' is the address of an actual, in-use cpuset. Operations
698 * such as list traversal that depend on the actual address of the 714 * such as list traversal that depend on the actual address of the
@@ -746,7 +762,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
746 * exclusive child cpusets 762 * exclusive child cpusets
747 * Build these two partitions by calling partition_sched_domains 763 * Build these two partitions by calling partition_sched_domains
748 * 764 *
749 * Call with manage_sem held. May nest a call to the 765 * Call with manage_mutex held. May nest a call to the
750 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 766 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
751 */ 767 */
752 768
@@ -792,7 +808,7 @@ static void update_cpu_domains(struct cpuset *cur)
792} 808}
793 809
794/* 810/*
795 * Call with manage_sem held. May take callback_sem during call. 811 * Call with manage_mutex held. May take callback_mutex during call.
796 */ 812 */
797 813
798static int update_cpumask(struct cpuset *cs, char *buf) 814static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,9 +827,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (retval < 0) 827 if (retval < 0)
812 return retval; 828 return retval;
813 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 829 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
814 down(&callback_sem); 830 mutex_lock(&callback_mutex);
815 cs->cpus_allowed = trialcs.cpus_allowed; 831 cs->cpus_allowed = trialcs.cpus_allowed;
816 up(&callback_sem); 832 mutex_unlock(&callback_mutex);
817 if (is_cpu_exclusive(cs) && !cpus_unchanged) 833 if (is_cpu_exclusive(cs) && !cpus_unchanged)
818 update_cpu_domains(cs); 834 update_cpu_domains(cs);
819 return 0; 835 return 0;
@@ -827,7 +843,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
827 * the cpuset is marked 'memory_migrate', migrate the tasks 843 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory. 844 * pages to the new memory.
829 * 845 *
830 * Call with manage_sem held. May take callback_sem during call. 846 * Call with manage_mutex held. May take callback_mutex during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 847 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 848 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed. 849 * their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +878,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
862 if (retval < 0) 878 if (retval < 0)
863 goto done; 879 goto done;
864 880
865 down(&callback_sem); 881 mutex_lock(&callback_mutex);
866 cs->mems_allowed = trialcs.mems_allowed; 882 cs->mems_allowed = trialcs.mems_allowed;
867 atomic_inc(&cpuset_mems_generation); 883 cs->mems_generation = cpuset_mems_generation++;
868 cs->mems_generation = atomic_read(&cpuset_mems_generation); 884 mutex_unlock(&callback_mutex);
869 up(&callback_sem);
870 885
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 886 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872 887
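Note that cpuset_mems_generation stops being an atomic_t in this hunk: every writer already holds callback_mutex, so a plain post-increment is race-free and the old atomic_inc()/atomic_read() pair collapses into one assignment. A userspace analogue of trading an atomic counter for a plain one once a lock already covers every access:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t callback_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Plain counter: safe because it is only ever touched under callback_mutex. */
static unsigned int mems_generation = 1;
static unsigned int my_generation;

static void bump_generation(void)
{
        pthread_mutex_lock(&callback_mutex);
        my_generation = mems_generation++;      /* read and advance in one place */
        pthread_mutex_unlock(&callback_mutex);
}

int main(void)
{
        bump_generation();
        bump_generation();
        printf("generation now %u (mine %u)\n", mems_generation, my_generation);
        return 0;
}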
@@ -922,7 +937,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 * tasklist_lock. Forks can happen again now - the mpol_copy() 937 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind 938 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global 939 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will 940 * cpuset manage_mutex, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound. 941 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 942 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes. 943 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -948,7 +963,7 @@ done:
948} 963}
949 964
950/* 965/*
951 * Call with manage_sem held. 966 * Call with manage_mutex held.
952 */ 967 */
953 968
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 969static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -963,11 +978,12 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
963/* 978/*
964 * update_flag - read a 0 or a 1 in a file and update associated flag 979 * update_flag - read a 0 or a 1 in a file and update associated flag
965 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 980 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
966 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) 981 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
982 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
967 * cs: the cpuset to update 983 * cs: the cpuset to update
968 * buf: the buffer where we read the 0 or 1 984 * buf: the buffer where we read the 0 or 1
969 * 985 *
970 * Call with manage_sem held. 986 * Call with manage_mutex held.
971 */ 987 */
972 988
973static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 989static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +1005,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
989 return err; 1005 return err;
990 cpu_exclusive_changed = 1006 cpu_exclusive_changed =
991 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1007 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
992 down(&callback_sem); 1008 mutex_lock(&callback_mutex);
993 if (turning_on) 1009 if (turning_on)
994 set_bit(bit, &cs->flags); 1010 set_bit(bit, &cs->flags);
995 else 1011 else
996 clear_bit(bit, &cs->flags); 1012 clear_bit(bit, &cs->flags);
997 up(&callback_sem); 1013 mutex_unlock(&callback_mutex);
998 1014
999 if (cpu_exclusive_changed) 1015 if (cpu_exclusive_changed)
1000 update_cpu_domains(cs); 1016 update_cpu_domains(cs);
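update_flag(), like update_cpumask() and update_nodemask() above it, follows a trial-copy shape: build a modified copy of the cpuset, validate it against the hierarchy rules, and only then publish the change under callback_mutex. A compressed sketch of that shape, with the cpuset reduced to a single flags word and a toy validator standing in for validate_change():

#include <pthread.h>
#include <errno.h>
#include <stdio.h>

struct toy_cpuset { unsigned long flags; };

static pthread_mutex_t callback_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for validate_change(): reject a trial that clears every flag. */
static int validate_change(const struct toy_cpuset *cur,
                           const struct toy_cpuset *trial)
{
        (void)cur;
        return trial->flags ? 0 : -EINVAL;
}

static int update_flag(unsigned long bit, struct toy_cpuset *cs, int turning_on)
{
        struct toy_cpuset trial = *cs;          /* 1. trial copy */
        int err;

        if (turning_on)
                trial.flags |= bit;
        else
                trial.flags &= ~bit;

        err = validate_change(cs, &trial);      /* 2. validate the trial */
        if (err)
                return err;

        pthread_mutex_lock(&callback_mutex);    /* 3. commit under the lock */
        cs->flags = trial.flags;
        pthread_mutex_unlock(&callback_mutex);
        return 0;
}

int main(void)
{
        struct toy_cpuset cs = { .flags = 0x1 };

        printf("set bit 2: %d, flags now %#lx\n", update_flag(0x2, &cs, 1), cs.flags);
        printf("clear all: %d, flags now %#lx\n", update_flag(0x3, &cs, 0), cs.flags);
        return 0;
}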
@@ -1104,7 +1120,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1120 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1105 * notified on release. 1121 * notified on release.
1106 * 1122 *
1107 * Call holding manage_sem. May take callback_sem and task_lock of 1123 * Call holding manage_mutex. May take callback_mutex and task_lock of
1108 * the task 'pid' during call. 1124 * the task 'pid' during call.
1109 */ 1125 */
1110 1126
@@ -1144,13 +1160,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1144 get_task_struct(tsk); 1160 get_task_struct(tsk);
1145 } 1161 }
1146 1162
1147 down(&callback_sem); 1163 mutex_lock(&callback_mutex);
1148 1164
1149 task_lock(tsk); 1165 task_lock(tsk);
1150 oldcs = tsk->cpuset; 1166 oldcs = tsk->cpuset;
1151 if (!oldcs) { 1167 if (!oldcs) {
1152 task_unlock(tsk); 1168 task_unlock(tsk);
1153 up(&callback_sem); 1169 mutex_unlock(&callback_mutex);
1154 put_task_struct(tsk); 1170 put_task_struct(tsk);
1155 return -ESRCH; 1171 return -ESRCH;
1156 } 1172 }
@@ -1164,7 +1180,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1164 from = oldcs->mems_allowed; 1180 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed; 1181 to = cs->mems_allowed;
1166 1182
1167 up(&callback_sem); 1183 mutex_unlock(&callback_mutex);
1168 1184
1169 mm = get_task_mm(tsk); 1185 mm = get_task_mm(tsk);
1170 if (mm) { 1186 if (mm) {
@@ -1194,6 +1210,8 @@ typedef enum {
1194 FILE_NOTIFY_ON_RELEASE, 1210 FILE_NOTIFY_ON_RELEASE,
1195 FILE_MEMORY_PRESSURE_ENABLED, 1211 FILE_MEMORY_PRESSURE_ENABLED,
1196 FILE_MEMORY_PRESSURE, 1212 FILE_MEMORY_PRESSURE,
1213 FILE_SPREAD_PAGE,
1214 FILE_SPREAD_SLAB,
1197 FILE_TASKLIST, 1215 FILE_TASKLIST,
1198} cpuset_filetype_t; 1216} cpuset_filetype_t;
1199 1217
@@ -1221,7 +1239,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1221 } 1239 }
1222 buffer[nbytes] = 0; /* nul-terminate */ 1240 buffer[nbytes] = 0; /* nul-terminate */
1223 1241
1224 down(&manage_sem); 1242 mutex_lock(&manage_mutex);
1225 1243
1226 if (is_removed(cs)) { 1244 if (is_removed(cs)) {
1227 retval = -ENODEV; 1245 retval = -ENODEV;
@@ -1253,6 +1271,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1253 case FILE_MEMORY_PRESSURE: 1271 case FILE_MEMORY_PRESSURE:
1254 retval = -EACCES; 1272 retval = -EACCES;
1255 break; 1273 break;
1274 case FILE_SPREAD_PAGE:
1275 retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
1276 cs->mems_generation = cpuset_mems_generation++;
1277 break;
1278 case FILE_SPREAD_SLAB:
1279 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1280 cs->mems_generation = cpuset_mems_generation++;
1281 break;
1256 case FILE_TASKLIST: 1282 case FILE_TASKLIST:
1257 retval = attach_task(cs, buffer, &pathbuf); 1283 retval = attach_task(cs, buffer, &pathbuf);
1258 break; 1284 break;
@@ -1264,7 +1290,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1264 if (retval == 0) 1290 if (retval == 0)
1265 retval = nbytes; 1291 retval = nbytes;
1266out2: 1292out2:
1267 up(&manage_sem); 1293 mutex_unlock(&manage_mutex);
1268 cpuset_release_agent(pathbuf); 1294 cpuset_release_agent(pathbuf);
1269out1: 1295out1:
1270 kfree(buffer); 1296 kfree(buffer);
@@ -1304,9 +1330,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1304{ 1330{
1305 cpumask_t mask; 1331 cpumask_t mask;
1306 1332
1307 down(&callback_sem); 1333 mutex_lock(&callback_mutex);
1308 mask = cs->cpus_allowed; 1334 mask = cs->cpus_allowed;
1309 up(&callback_sem); 1335 mutex_unlock(&callback_mutex);
1310 1336
1311 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1337 return cpulist_scnprintf(page, PAGE_SIZE, mask);
1312} 1338}
@@ -1315,9 +1341,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1315{ 1341{
1316 nodemask_t mask; 1342 nodemask_t mask;
1317 1343
1318 down(&callback_sem); 1344 mutex_lock(&callback_mutex);
1319 mask = cs->mems_allowed; 1345 mask = cs->mems_allowed;
1320 up(&callback_sem); 1346 mutex_unlock(&callback_mutex);
1321 1347
1322 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1348 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1323} 1349}
@@ -1362,6 +1388,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1362 case FILE_MEMORY_PRESSURE: 1388 case FILE_MEMORY_PRESSURE:
1363 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); 1389 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1364 break; 1390 break;
1391 case FILE_SPREAD_PAGE:
1392 *s++ = is_spread_page(cs) ? '1' : '0';
1393 break;
1394 case FILE_SPREAD_SLAB:
1395 *s++ = is_spread_slab(cs) ? '1' : '0';
1396 break;
1365 default: 1397 default:
1366 retval = -EINVAL; 1398 retval = -EINVAL;
1367 goto out; 1399 goto out;
@@ -1598,7 +1630,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1598 * Handle an open on 'tasks' file. Prepare a buffer listing the 1630 * Handle an open on 'tasks' file. Prepare a buffer listing the
1599 * process id's of tasks currently attached to the cpuset being opened. 1631 * process id's of tasks currently attached to the cpuset being opened.
1600 * 1632 *
1601 * Does not require any specific cpuset semaphores, and does not take any. 1633 * Does not require any specific cpuset mutexes, and does not take any.
1602 */ 1634 */
1603static int cpuset_tasks_open(struct inode *unused, struct file *file) 1635static int cpuset_tasks_open(struct inode *unused, struct file *file)
1604{ 1636{
@@ -1725,6 +1757,16 @@ static struct cftype cft_memory_pressure = {
1725 .private = FILE_MEMORY_PRESSURE, 1757 .private = FILE_MEMORY_PRESSURE,
1726}; 1758};
1727 1759
1760static struct cftype cft_spread_page = {
1761 .name = "memory_spread_page",
1762 .private = FILE_SPREAD_PAGE,
1763};
1764
1765static struct cftype cft_spread_slab = {
1766 .name = "memory_spread_slab",
1767 .private = FILE_SPREAD_SLAB,
1768};
1769
1728static int cpuset_populate_dir(struct dentry *cs_dentry) 1770static int cpuset_populate_dir(struct dentry *cs_dentry)
1729{ 1771{
1730 int err; 1772 int err;
@@ -1743,6 +1785,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1743 return err; 1785 return err;
1744 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1786 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
1745 return err; 1787 return err;
1788 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
1789 return err;
1790 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
1791 return err;
1746 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1792 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1747 return err; 1793 return err;
1748 return 0; 1794 return 0;
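With the two cftype entries wired into cpuset_populate_dir() above, every cpuset directory gains memory_spread_page and memory_spread_slab files holding '0' or '1'. Assuming the cpuset filesystem is mounted at /dev/cpuset and a cpuset named "mycs" exists (both assumptions for this sketch), enabling page spreading from userspace is a one-byte write:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Assumed layout: cpusetfs mounted at /dev/cpuset, cpuset "mycs". */
        const char *path = "/dev/cpuset/mycs/memory_spread_page";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "1", 1) != 1)     /* "1" enables, "0" disables */
                perror("write");
        close(fd);
        return 0;
}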
@@ -1754,7 +1800,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1754 * name: name of the new cpuset. Will be strcpy'ed. 1800 * name: name of the new cpuset. Will be strcpy'ed.
1755 * mode: mode to set on new inode 1801 * mode: mode to set on new inode
1756 * 1802 *
1757 * Must be called with the semaphore on the parent inode held 1803 * Must be called with the mutex on the parent inode held
1758 */ 1804 */
1759 1805
1760static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1806static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,44 +1812,47 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1766 if (!cs) 1812 if (!cs)
1767 return -ENOMEM; 1813 return -ENOMEM;
1768 1814
1769 down(&manage_sem); 1815 mutex_lock(&manage_mutex);
1770 cpuset_update_task_memory_state(); 1816 cpuset_update_task_memory_state();
1771 cs->flags = 0; 1817 cs->flags = 0;
1772 if (notify_on_release(parent)) 1818 if (notify_on_release(parent))
1773 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1819 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1820 if (is_spread_page(parent))
1821 set_bit(CS_SPREAD_PAGE, &cs->flags);
1822 if (is_spread_slab(parent))
1823 set_bit(CS_SPREAD_SLAB, &cs->flags);
1774 cs->cpus_allowed = CPU_MASK_NONE; 1824 cs->cpus_allowed = CPU_MASK_NONE;
1775 cs->mems_allowed = NODE_MASK_NONE; 1825 cs->mems_allowed = NODE_MASK_NONE;
1776 atomic_set(&cs->count, 0); 1826 atomic_set(&cs->count, 0);
1777 INIT_LIST_HEAD(&cs->sibling); 1827 INIT_LIST_HEAD(&cs->sibling);
1778 INIT_LIST_HEAD(&cs->children); 1828 INIT_LIST_HEAD(&cs->children);
1779 atomic_inc(&cpuset_mems_generation); 1829 cs->mems_generation = cpuset_mems_generation++;
1780 cs->mems_generation = atomic_read(&cpuset_mems_generation);
1781 fmeter_init(&cs->fmeter); 1830 fmeter_init(&cs->fmeter);
1782 1831
1783 cs->parent = parent; 1832 cs->parent = parent;
1784 1833
1785 down(&callback_sem); 1834 mutex_lock(&callback_mutex);
1786 list_add(&cs->sibling, &cs->parent->children); 1835 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++; 1836 number_of_cpusets++;
1788 up(&callback_sem); 1837 mutex_unlock(&callback_mutex);
1789 1838
1790 err = cpuset_create_dir(cs, name, mode); 1839 err = cpuset_create_dir(cs, name, mode);
1791 if (err < 0) 1840 if (err < 0)
1792 goto err; 1841 goto err;
1793 1842
1794 /* 1843 /*
1795 * Release manage_sem before cpuset_populate_dir() because it 1844 * Release manage_mutex before cpuset_populate_dir() because it
1796 * will down() this new directory's i_mutex and if we race with 1845 * will down() this new directory's i_mutex and if we race with
1797 * another mkdir, we might deadlock. 1846 * another mkdir, we might deadlock.
1798 */ 1847 */
1799 up(&manage_sem); 1848 mutex_unlock(&manage_mutex);
1800 1849
1801 err = cpuset_populate_dir(cs->dentry); 1850 err = cpuset_populate_dir(cs->dentry);
1802 /* If err < 0, we have a half-filled directory - oh well ;) */ 1851 /* If err < 0, we have a half-filled directory - oh well ;) */
1803 return 0; 1852 return 0;
1804err: 1853err:
1805 list_del(&cs->sibling); 1854 list_del(&cs->sibling);
1806 up(&manage_sem); 1855 mutex_unlock(&manage_mutex);
1807 kfree(cs); 1856 kfree(cs);
1808 return err; 1857 return err;
1809} 1858}
@@ -1825,18 +1874,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1825 1874
1826 /* the vfs holds both inode->i_mutex already */ 1875 /* the vfs holds both inode->i_mutex already */
1827 1876
1828 down(&manage_sem); 1877 mutex_lock(&manage_mutex);
1829 cpuset_update_task_memory_state(); 1878 cpuset_update_task_memory_state();
1830 if (atomic_read(&cs->count) > 0) { 1879 if (atomic_read(&cs->count) > 0) {
1831 up(&manage_sem); 1880 mutex_unlock(&manage_mutex);
1832 return -EBUSY; 1881 return -EBUSY;
1833 } 1882 }
1834 if (!list_empty(&cs->children)) { 1883 if (!list_empty(&cs->children)) {
1835 up(&manage_sem); 1884 mutex_unlock(&manage_mutex);
1836 return -EBUSY; 1885 return -EBUSY;
1837 } 1886 }
1838 parent = cs->parent; 1887 parent = cs->parent;
1839 down(&callback_sem); 1888 mutex_lock(&callback_mutex);
1840 set_bit(CS_REMOVED, &cs->flags); 1889 set_bit(CS_REMOVED, &cs->flags);
1841 if (is_cpu_exclusive(cs)) 1890 if (is_cpu_exclusive(cs))
1842 update_cpu_domains(cs); 1891 update_cpu_domains(cs);
@@ -1848,10 +1897,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1848 cpuset_d_remove_dir(d); 1897 cpuset_d_remove_dir(d);
1849 dput(d); 1898 dput(d);
1850 number_of_cpusets--; 1899 number_of_cpusets--;
1851 up(&callback_sem); 1900 mutex_unlock(&callback_mutex);
1852 if (list_empty(&parent->children)) 1901 if (list_empty(&parent->children))
1853 check_for_release(parent, &pathbuf); 1902 check_for_release(parent, &pathbuf);
1854 up(&manage_sem); 1903 mutex_unlock(&manage_mutex);
1855 cpuset_release_agent(pathbuf); 1904 cpuset_release_agent(pathbuf);
1856 return 0; 1905 return 0;
1857} 1906}
@@ -1867,7 +1916,7 @@ int __init cpuset_init_early(void)
1867 struct task_struct *tsk = current; 1916 struct task_struct *tsk = current;
1868 1917
1869 tsk->cpuset = &top_cpuset; 1918 tsk->cpuset = &top_cpuset;
1870 tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); 1919 tsk->cpuset->mems_generation = cpuset_mems_generation++;
1871 return 0; 1920 return 0;
1872} 1921}
1873 1922
@@ -1886,8 +1935,7 @@ int __init cpuset_init(void)
1886 top_cpuset.mems_allowed = NODE_MASK_ALL; 1935 top_cpuset.mems_allowed = NODE_MASK_ALL;
1887 1936
1888 fmeter_init(&top_cpuset.fmeter); 1937 fmeter_init(&top_cpuset.fmeter);
1889 atomic_inc(&cpuset_mems_generation); 1938 top_cpuset.mems_generation = cpuset_mems_generation++;
1890 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
1891 1939
1892 init_task.cpuset = &top_cpuset; 1940 init_task.cpuset = &top_cpuset;
1893 1941
@@ -1960,25 +2008,25 @@ void cpuset_fork(struct task_struct *child)
1960 * Description: Detach cpuset from @tsk and release it. 2008 * Description: Detach cpuset from @tsk and release it.
1961 * 2009 *
1962 * Note that cpusets marked notify_on_release force every task in 2010 * Note that cpusets marked notify_on_release force every task in
1963 * them to take the global manage_sem semaphore when exiting. 2011 * them to take the global manage_mutex mutex when exiting.
1964 * This could impact scaling on very large systems. Be reluctant to 2012 * This could impact scaling on very large systems. Be reluctant to
1965 * use notify_on_release cpusets where very high task exit scaling 2013 * use notify_on_release cpusets where very high task exit scaling
1966 * is required on large systems. 2014 * is required on large systems.
1967 * 2015 *
1968 * Don't even think about dereferencing 'cs' after the cpuset use count 2016 * Don't even think about dereferencing 'cs' after the cpuset use count
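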
1969 * goes to zero, except inside a critical section guarded by manage_sem 2017 * goes to zero, except inside a critical section guarded by manage_mutex
1970 * or callback_sem. Otherwise a zero cpuset use count is a license to 2018 * or callback_mutex. Otherwise a zero cpuset use count is a license to
1971 * any other task to nuke the cpuset immediately, via cpuset_rmdir(). 2019 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1972 * 2020 *
1973 * This routine has to take manage_sem, not callback_sem, because 2021 * This routine has to take manage_mutex, not callback_mutex, because
1974 * it is holding that semaphore while calling check_for_release(), 2022 * it is holding that mutex while calling check_for_release(),
1975 * which calls kmalloc(), so can't be called holding callback__sem(). 2023 * which calls kmalloc(), so can't be called holding callback_mutex().
1976 * 2024 *
1977 * We don't need to task_lock() this reference to tsk->cpuset, 2025 * We don't need to task_lock() this reference to tsk->cpuset,
1978 * because tsk is already marked PF_EXITING, so attach_task() won't 2026 * because tsk is already marked PF_EXITING, so attach_task() won't
1979 * mess with it, or task is a failed fork, never visible to attach_task. 2027 * mess with it, or task is a failed fork, never visible to attach_task.
1980 * 2028 *
1981 * Hack: 2029 * the_top_cpuset_hack:
1982 * 2030 *
1983 * Set the exiting tasks cpuset to the root cpuset (top_cpuset). 2031 * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
1984 * 2032 *
@@ -2017,15 +2065,15 @@ void cpuset_exit(struct task_struct *tsk)
2017 struct cpuset *cs; 2065 struct cpuset *cs;
2018 2066
2019 cs = tsk->cpuset; 2067 cs = tsk->cpuset;
2020 tsk->cpuset = &top_cpuset; /* Hack - see comment above */ 2068 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
2021 2069
2022 if (notify_on_release(cs)) { 2070 if (notify_on_release(cs)) {
2023 char *pathbuf = NULL; 2071 char *pathbuf = NULL;
2024 2072
2025 down(&manage_sem); 2073 mutex_lock(&manage_mutex);
2026 if (atomic_dec_and_test(&cs->count)) 2074 if (atomic_dec_and_test(&cs->count))
2027 check_for_release(cs, &pathbuf); 2075 check_for_release(cs, &pathbuf);
2028 up(&manage_sem); 2076 mutex_unlock(&manage_mutex);
2029 cpuset_release_agent(pathbuf); 2077 cpuset_release_agent(pathbuf);
2030 } else { 2078 } else {
2031 atomic_dec(&cs->count); 2079 atomic_dec(&cs->count);
@@ -2046,11 +2094,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2046{ 2094{
2047 cpumask_t mask; 2095 cpumask_t mask;
2048 2096
2049 down(&callback_sem); 2097 mutex_lock(&callback_mutex);
2050 task_lock(tsk); 2098 task_lock(tsk);
2051 guarantee_online_cpus(tsk->cpuset, &mask); 2099 guarantee_online_cpus(tsk->cpuset, &mask);
2052 task_unlock(tsk); 2100 task_unlock(tsk);
2053 up(&callback_sem); 2101 mutex_unlock(&callback_mutex);
2054 2102
2055 return mask; 2103 return mask;
2056} 2104}
@@ -2074,11 +2122,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2074{ 2122{
2075 nodemask_t mask; 2123 nodemask_t mask;
2076 2124
2077 down(&callback_sem); 2125 mutex_lock(&callback_mutex);
2078 task_lock(tsk); 2126 task_lock(tsk);
2079 guarantee_online_mems(tsk->cpuset, &mask); 2127 guarantee_online_mems(tsk->cpuset, &mask);
2080 task_unlock(tsk); 2128 task_unlock(tsk);
2081 up(&callback_sem); 2129 mutex_unlock(&callback_mutex);
2082 2130
2083 return mask; 2131 return mask;
2084} 2132}
@@ -2104,7 +2152,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2104 2152
2105/* 2153/*
2106 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 2154 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
2107 * ancestor to the specified cpuset. Call holding callback_sem. 2155 * ancestor to the specified cpuset. Call holding callback_mutex.
2108 * If no ancestor is mem_exclusive (an unusual configuration), then 2156 * If no ancestor is mem_exclusive (an unusual configuration), then
2109 * returns the root cpuset. 2157 * returns the root cpuset.
2110 */ 2158 */
@@ -2131,12 +2179,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2131 * GFP_KERNEL allocations are not so marked, so can escape to the 2179 * GFP_KERNEL allocations are not so marked, so can escape to the
2132 * nearest mem_exclusive ancestor cpuset. 2180 * nearest mem_exclusive ancestor cpuset.
2133 * 2181 *
2134 * Scanning up parent cpusets requires callback_sem. The __alloc_pages() 2182 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
2135 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2183 * routine only calls here with __GFP_HARDWALL bit _not_ set if
2136 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2184 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
2137 * mems_allowed came up empty on the first pass over the zonelist. 2185 * mems_allowed came up empty on the first pass over the zonelist.
2138 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2186 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2139 * short of memory, might require taking the callback_sem semaphore. 2187 * short of memory, might require taking the callback_mutex mutex.
2140 * 2188 *
2141 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2189 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
2142 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2190 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -2157,7 +2205,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2157{ 2205{
2158 int node; /* node that zone z is on */ 2206 int node; /* node that zone z is on */
2159 const struct cpuset *cs; /* current cpuset ancestors */ 2207 const struct cpuset *cs; /* current cpuset ancestors */
2160 int allowed = 1; /* is allocation in zone z allowed? */ 2208 int allowed; /* is allocation in zone z allowed? */
2161 2209
2162 if (in_interrupt()) 2210 if (in_interrupt())
2163 return 1; 2211 return 1;
@@ -2171,31 +2219,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2171 return 1; 2219 return 1;
2172 2220
2173 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2221 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2174 down(&callback_sem); 2222 mutex_lock(&callback_mutex);
2175 2223
2176 task_lock(current); 2224 task_lock(current);
2177 cs = nearest_exclusive_ancestor(current->cpuset); 2225 cs = nearest_exclusive_ancestor(current->cpuset);
2178 task_unlock(current); 2226 task_unlock(current);
2179 2227
2180 allowed = node_isset(node, cs->mems_allowed); 2228 allowed = node_isset(node, cs->mems_allowed);
2181 up(&callback_sem); 2229 mutex_unlock(&callback_mutex);
2182 return allowed; 2230 return allowed;
2183} 2231}
2184 2232
2185/** 2233/**
2186 * cpuset_lock - lock out any changes to cpuset structures 2234 * cpuset_lock - lock out any changes to cpuset structures
2187 * 2235 *
2188 * The out of memory (oom) code needs to lock down cpusets 2236 * The out of memory (oom) code needs to mutex_lock cpusets
2189 * from being changed while it scans the tasklist looking for a 2237 * from being changed while it scans the tasklist looking for a
2190 * task in an overlapping cpuset. Expose callback_sem via this 2238 * task in an overlapping cpuset. Expose callback_mutex via this
2191 * cpuset_lock() routine, so the oom code can lock it, before 2239 * cpuset_lock() routine, so the oom code can lock it, before
2192 * locking the task list. The tasklist_lock is a spinlock, so 2240 * locking the task list. The tasklist_lock is a spinlock, so
2193 * must be taken inside callback_sem. 2241 * must be taken inside callback_mutex.
2194 */ 2242 */
2195 2243
2196void cpuset_lock(void) 2244void cpuset_lock(void)
2197{ 2245{
2198 down(&callback_sem); 2246 mutex_lock(&callback_mutex);
2199} 2247}
2200 2248
2201/** 2249/**
@@ -2206,10 +2254,48 @@ void cpuset_lock(void)
2206 2254
2207void cpuset_unlock(void) 2255void cpuset_unlock(void)
2208{ 2256{
2209 up(&callback_sem); 2257 mutex_unlock(&callback_mutex);
2210} 2258}
2211 2259
2212/** 2260/**
2261 * cpuset_mem_spread_node() - On which node to begin search for a page
2262 *
2263 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2264 * tasks in a cpuset with is_spread_page or is_spread_slab set),
2265 * and if the memory allocation used cpuset_mem_spread_node()
2266 * to determine on which node to start looking, as it will for
2267 * certain page cache or slab cache pages such as used for file
2268 * system buffers and inode caches, then instead of starting on the
2269 * local node to look for a free page, rather spread the starting
2270 * node around the tasks mems_allowed nodes.
2271 *
2272 * We don't have to worry about the returned node being offline
2273 * because "it can't happen", and even if it did, it would be ok.
2274 *
2275 * The routines calling guarantee_online_mems() are careful to
2276 * only set nodes in task->mems_allowed that are online. So it
2277 * should not be possible for the following code to return an
2278 * offline node. But if it did, that would be ok, as this routine
2279 * is not returning the node where the allocation must be, only
2280 * the node where the search should start. The zonelist passed to
2281 * __alloc_pages() will include all nodes. If the slab allocator
2282 * is passed an offline node, it will fall back to the local node.
2283 * See kmem_cache_alloc_node().
2284 */
2285
2286int cpuset_mem_spread_node(void)
2287{
2288 int node;
2289
2290 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
2291 if (node == MAX_NUMNODES)
2292 node = first_node(current->mems_allowed);
2293 current->cpuset_mem_spread_rotor = node;
2294 return node;
2295}
2296EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2297
2298/**
2213 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2299 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
2214 * @p: pointer to task_struct of some other task. 2300 * @p: pointer to task_struct of some other task.
2215 * 2301 *
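The new cpuset_mem_spread_node() above is a per-task round-robin rotor over mems_allowed: advance to the next allowed node, wrap to the first allowed node when the end of the mask is reached, and remember the position for the next call. The same wrap-around walk, sketched in plain C with a 64-bit word standing in for a nodemask_t and a local int for the per-task rotor:

#include <stdint.h>
#include <stdio.h>

#define MAX_NODES 64

/* Return the lowest set bit index >= start in mask, or MAX_NODES if none. */
static int next_node(int start, uint64_t mask)
{
        for (int n = start; n < MAX_NODES; n++)
                if (mask & (UINT64_C(1) << n))
                        return n;
        return MAX_NODES;
}

/* Round-robin spread: mirrors the rotor kept in current->cpuset_mem_spread_rotor. */
static int mem_spread_node(int *rotor, uint64_t mems_allowed)
{
        int node = next_node(*rotor + 1, mems_allowed);

        if (node == MAX_NODES)                  /* wrap to the first allowed node */
                node = next_node(0, mems_allowed);
        *rotor = node;
        return node;
}

int main(void)
{
        uint64_t mems_allowed = 0x15;           /* nodes 0, 2, 4 allowed */
        int rotor = 0;

        for (int i = 0; i < 6; i++)
                printf("%d ", mem_spread_node(&rotor, mems_allowed));
        printf("\n");                           /* prints: 2 4 0 2 4 0 */
        return 0;
}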
@@ -2218,7 +2304,7 @@ void cpuset_unlock(void)
2218 * determine if task @p's memory usage might impact the memory 2304 * determine if task @p's memory usage might impact the memory
2219 * available to the current task. 2305 * available to the current task.
2220 * 2306 *
2221 * Call while holding callback_sem. 2307 * Call while holding callback_mutex.
2222 **/ 2308 **/
2223 2309
2224int cpuset_excl_nodes_overlap(const struct task_struct *p) 2310int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2289,13 +2375,13 @@ void __cpuset_memory_pressure_bump(void)
2289 * - Used for /proc/<pid>/cpuset. 2375 * - Used for /proc/<pid>/cpuset.
2290 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2376 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2291 * doesn't really matter if tsk->cpuset changes after we read it, 2377 * doesn't really matter if tsk->cpuset changes after we read it,
2292 * and we take manage_sem, keeping attach_task() from changing it 2378 * and we take manage_mutex, keeping attach_task() from changing it
2293 * anyway. 2379 * anyway. No need to check that tsk->cpuset != NULL, thanks to
2380 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2381 * cpuset to top_cpuset.
2294 */ 2382 */
2295
2296static int proc_cpuset_show(struct seq_file *m, void *v) 2383static int proc_cpuset_show(struct seq_file *m, void *v)
2297{ 2384{
2298 struct cpuset *cs;
2299 struct task_struct *tsk; 2385 struct task_struct *tsk;
2300 char *buf; 2386 char *buf;
2301 int retval = 0; 2387 int retval = 0;
@@ -2305,20 +2391,14 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2305 return -ENOMEM; 2391 return -ENOMEM;
2306 2392
2307 tsk = m->private; 2393 tsk = m->private;
2308 down(&manage_sem); 2394 mutex_lock(&manage_mutex);
2309 cs = tsk->cpuset; 2395 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2310 if (!cs) {
2311 retval = -EINVAL;
2312 goto out;
2313 }
2314
2315 retval = cpuset_path(cs, buf, PAGE_SIZE);
2316 if (retval < 0) 2396 if (retval < 0)
2317 goto out; 2397 goto out;
2318 seq_puts(m, buf); 2398 seq_puts(m, buf);
2319 seq_putc(m, '\n'); 2399 seq_putc(m, '\n');
2320out: 2400out:
2321 up(&manage_sem); 2401 mutex_unlock(&manage_mutex);
2322 kfree(buf); 2402 kfree(buf);
2323 return retval; 2403 return retval;
2324} 2404}
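The simplified proc_cpuset_show() above can drop its NULL check because the_top_cpuset_hack in cpuset_exit() points exiting tasks at top_cpuset, so tsk->cpuset is always valid and the handler just prints the cpuset path. From userspace that path shows up in /proc/<pid>/cpuset; a trivial reader, using /proc/self/cpuset here purely for illustration:

#include <stdio.h>

int main(void)
{
        /* Any /proc/<pid>/cpuset works the same way; "self" avoids picking a pid. */
        FILE *f = fopen("/proc/self/cpuset", "r");
        char path[4096];

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(path, sizeof(path), f))
                printf("current cpuset: %s", path);     /* e.g. "/" or "/mycs" */
        fclose(f);
        return 0;
}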
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 867d6dbeb5..c01cead2cf 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -140,6 +140,7 @@ __set_personality(u_long personality)
140 ep = lookup_exec_domain(personality); 140 ep = lookup_exec_domain(personality);
141 if (ep == current_thread_info()->exec_domain) { 141 if (ep == current_thread_info()->exec_domain) {
142 current->personality = personality; 142 current->personality = personality;
143 module_put(ep->module);
143 return 0; 144 return 0;
144 } 145 }
145 146
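The one-line exec_domain change above plugs a module reference leak: lookup_exec_domain() returns the domain with a reference held on its module, and the early-return path for "personality already matches" previously returned without dropping that reference. The get/put discipline it restores, in a hedged userspace sketch with a plain counter standing in for the module refcount:

#include <stdio.h>

struct module { int refcount; };

static struct module ep_module = { .refcount = 0 };

static struct module *module_get(struct module *m) { m->refcount++; return m; }
static void module_put(struct module *m) { m->refcount--; }

/* Every exit path after module_get() must be matched by a module_put(). */
static int set_personality(int wanted, int current_personality)
{
        struct module *m = module_get(&ep_module);      /* lookup takes a reference */

        if (wanted == current_personality) {
                module_put(m);          /* the fix: drop the reference on early return */
                return 0;
        }
        /* ... switch domains; the real code keeps this reference longer ... */
        module_put(m);                  /* simplified: released here in this sketch */
        return 0;
}

int main(void)
{
        set_personality(0, 0);
        printf("refcount after early return: %d\n", ep_module.refcount);  /* 0 */
        return 0;
}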
diff --git a/kernel/exit.c b/kernel/exit.c
index d1e8d500a7..bc0ec674d3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,8 +29,11 @@
29#include <linux/cpuset.h> 29#include <linux/cpuset.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/posix-timers.h>
32#include <linux/cn_proc.h> 33#include <linux/cn_proc.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/futex.h>
36#include <linux/compat.h>
34 37
35#include <asm/uaccess.h> 38#include <asm/uaccess.h>
36#include <asm/unistd.h> 39#include <asm/unistd.h>
@@ -48,15 +51,80 @@ static void __unhash_process(struct task_struct *p)
48{ 51{
49 nr_threads--; 52 nr_threads--;
50 detach_pid(p, PIDTYPE_PID); 53 detach_pid(p, PIDTYPE_PID);
51 detach_pid(p, PIDTYPE_TGID);
52 if (thread_group_leader(p)) { 54 if (thread_group_leader(p)) {
53 detach_pid(p, PIDTYPE_PGID); 55 detach_pid(p, PIDTYPE_PGID);
54 detach_pid(p, PIDTYPE_SID); 56 detach_pid(p, PIDTYPE_SID);
55 if (p->pid) 57
56 __get_cpu_var(process_counts)--; 58 list_del_init(&p->tasks);
59 __get_cpu_var(process_counts)--;
57 } 60 }
61 list_del_rcu(&p->thread_group);
62 remove_parent(p);
63}
58 64
59 REMOVE_LINKS(p); 65/*
66 * This function expects the tasklist_lock write-locked.
67 */
68static void __exit_signal(struct task_struct *tsk)
69{
70 struct signal_struct *sig = tsk->signal;
71 struct sighand_struct *sighand;
72
73 BUG_ON(!sig);
74 BUG_ON(!atomic_read(&sig->count));
75
76 rcu_read_lock();
77 sighand = rcu_dereference(tsk->sighand);
78 spin_lock(&sighand->siglock);
79
80 posix_cpu_timers_exit(tsk);
81 if (atomic_dec_and_test(&sig->count))
82 posix_cpu_timers_exit_group(tsk);
83 else {
84 /*
85 * If there is any task waiting for the group exit
86 * then notify it:
87 */
88 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
89 wake_up_process(sig->group_exit_task);
90 sig->group_exit_task = NULL;
91 }
92 if (tsk == sig->curr_target)
93 sig->curr_target = next_thread(tsk);
94 /*
95 * Accumulate here the counters for all threads but the
96 * group leader as they die, so they can be added into
97 * the process-wide totals when those are taken.
98 * The group leader stays around as a zombie as long
99 * as there are other threads. When it gets reaped,
100 * the exit.c code will add its counts into these totals.
101 * We won't ever get here for the group leader, since it
102 * will have been the last reference on the signal_struct.
103 */
104 sig->utime = cputime_add(sig->utime, tsk->utime);
105 sig->stime = cputime_add(sig->stime, tsk->stime);
106 sig->min_flt += tsk->min_flt;
107 sig->maj_flt += tsk->maj_flt;
108 sig->nvcsw += tsk->nvcsw;
109 sig->nivcsw += tsk->nivcsw;
110 sig->sched_time += tsk->sched_time;
111 sig = NULL; /* Marker for below. */
112 }
113
114 __unhash_process(tsk);
115
116 tsk->signal = NULL;
117 tsk->sighand = NULL;
118 spin_unlock(&sighand->siglock);
119 rcu_read_unlock();
120
121 __cleanup_sighand(sighand);
122 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
123 flush_sigqueue(&tsk->pending);
124 if (sig) {
125 flush_sigqueue(&sig->shared_pending);
126 __cleanup_signal(sig);
127 }
60} 128}
61 129
62void release_task(struct task_struct * p) 130void release_task(struct task_struct * p)
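The new __exit_signal() above does the per-thread teardown under siglock and, for every thread except the last, folds its CPU time and fault counters into the shared signal_struct so the process-wide totals survive the thread. The accumulate-on-exit pattern, reduced to a userspace sketch with pthreads and a lock playing the role of siglock:

#include <pthread.h>
#include <stdio.h>

/* Process-wide totals, guarded by one lock (kernel: signal_struct under siglock). */
static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long total_utime, total_min_flt;
static int live_threads = 3;

struct thread_stats { unsigned long utime, min_flt; };

/* Called as a thread dies: fold its private counters into the shared totals. */
static void exit_signal(const struct thread_stats *ts)
{
        pthread_mutex_lock(&siglock);
        live_threads--;
        total_utime += ts->utime;
        total_min_flt += ts->min_flt;
        pthread_mutex_unlock(&siglock);
}

static void *worker(void *arg)
{
        struct thread_stats ts = { .utime = 10, .min_flt = 2 };

        (void)arg;
        /* ... do work, accounting into ts ... */
        exit_signal(&ts);
        return NULL;
}

int main(void)
{
        pthread_t t[3];

        for (int i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        printf("utime=%lu min_flt=%lu live=%d\n", total_utime, total_min_flt, live_threads);
        return 0;
}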
@@ -65,21 +133,14 @@ void release_task(struct task_struct * p)
65 task_t *leader; 133 task_t *leader;
66 struct dentry *proc_dentry; 134 struct dentry *proc_dentry;
67 135
68repeat: 136repeat:
69 atomic_dec(&p->user->processes); 137 atomic_dec(&p->user->processes);
70 spin_lock(&p->proc_lock); 138 spin_lock(&p->proc_lock);
71 proc_dentry = proc_pid_unhash(p); 139 proc_dentry = proc_pid_unhash(p);
72 write_lock_irq(&tasklist_lock); 140 write_lock_irq(&tasklist_lock);
73 if (unlikely(p->ptrace)) 141 ptrace_unlink(p);
74 __ptrace_unlink(p);
75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 142 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
76 __exit_signal(p); 143 __exit_signal(p);
77 /*
78 * Note that the fastpath in sys_times depends on __exit_signal having
79 * updated the counters before a task is removed from the tasklist of
80 * the process by __unhash_process.
81 */
82 __unhash_process(p);
83 144
84 /* 145 /*
85 * If we are the last non-leader member of the thread 146 * If we are the last non-leader member of the thread
@@ -114,21 +175,6 @@ repeat:
114 goto repeat; 175 goto repeat;
115} 176}
116 177
117/* we are using it only for SMP init */
118
119void unhash_process(struct task_struct *p)
120{
121 struct dentry *proc_dentry;
122
123 spin_lock(&p->proc_lock);
124 proc_dentry = proc_pid_unhash(p);
125 write_lock_irq(&tasklist_lock);
126 __unhash_process(p);
127 write_unlock_irq(&tasklist_lock);
128 spin_unlock(&p->proc_lock);
129 proc_pid_flush(proc_dentry);
130}
131
132/* 178/*
133 * This checks not only the pgrp, but falls back on the pid if no 179 * This checks not only the pgrp, but falls back on the pid if no
134 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 180 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
@@ -236,10 +282,10 @@ static void reparent_to_init(void)
236 282
237 ptrace_unlink(current); 283 ptrace_unlink(current);
238 /* Reparent to init */ 284 /* Reparent to init */
239 REMOVE_LINKS(current); 285 remove_parent(current);
240 current->parent = child_reaper; 286 current->parent = child_reaper;
241 current->real_parent = child_reaper; 287 current->real_parent = child_reaper;
242 SET_LINKS(current); 288 add_parent(current);
243 289
244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 290 /* Set the exit signal to SIGCHLD so we signal init on exit */
245 current->exit_signal = SIGCHLD; 291 current->exit_signal = SIGCHLD;
@@ -345,9 +391,9 @@ void daemonize(const char *name, ...)
345 exit_mm(current); 391 exit_mm(current);
346 392
347 set_special_pids(1, 1); 393 set_special_pids(1, 1);
348 down(&tty_sem); 394 mutex_lock(&tty_mutex);
349 current->signal->tty = NULL; 395 current->signal->tty = NULL;
350 up(&tty_sem); 396 mutex_unlock(&tty_mutex);
351 397
352 /* Block and flush all signals */ 398 /* Block and flush all signals */
353 sigfillset(&blocked); 399 sigfillset(&blocked);
@@ -536,13 +582,13 @@ static void exit_mm(struct task_struct * tsk)
536 mmput(mm); 582 mmput(mm);
537} 583}
538 584
539static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) 585static inline void choose_new_parent(task_t *p, task_t *reaper)
540{ 586{
541 /* 587 /*
542 * Make sure we're not reparenting to ourselves and that 588 * Make sure we're not reparenting to ourselves and that
543 * the parent is not a zombie. 589 * the parent is not a zombie.
544 */ 590 */
545 BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); 591 BUG_ON(p == reaper || reaper->exit_state);
546 p->real_parent = reaper; 592 p->real_parent = reaper;
547} 593}
548 594
@@ -567,9 +613,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
567 * anyway, so let go of it. 613 * anyway, so let go of it.
568 */ 614 */
569 p->ptrace = 0; 615 p->ptrace = 0;
570 list_del_init(&p->sibling); 616 remove_parent(p);
571 p->parent = p->real_parent; 617 p->parent = p->real_parent;
572 list_add_tail(&p->sibling, &p->parent->children); 618 add_parent(p);
573 619
574 /* If we'd notified the old parent about this child's death, 620 /* If we'd notified the old parent about this child's death,
575 * also notify the new parent. 621 * also notify the new parent.
@@ -643,7 +689,7 @@ static void forget_original_parent(struct task_struct * father,
643 689
644 if (father == p->real_parent) { 690 if (father == p->real_parent) {
645 /* reparent with a reaper, real father it's us */ 691 /* reparent with a reaper, real father it's us */
646 choose_new_parent(p, reaper, child_reaper); 692 choose_new_parent(p, reaper);
647 reparent_thread(p, father, 0); 693 reparent_thread(p, father, 0);
648 } else { 694 } else {
649 /* reparent ptraced task to its real parent */ 695 /* reparent ptraced task to its real parent */
@@ -664,7 +710,7 @@ static void forget_original_parent(struct task_struct * father,
664 } 710 }
665 list_for_each_safe(_p, _n, &father->ptrace_children) { 711 list_for_each_safe(_p, _n, &father->ptrace_children) {
666 p = list_entry(_p,struct task_struct,ptrace_list); 712 p = list_entry(_p,struct task_struct,ptrace_list);
667 choose_new_parent(p, reaper, child_reaper); 713 choose_new_parent(p, reaper);
668 reparent_thread(p, father, 1); 714 reparent_thread(p, father, 1);
669 } 715 }
670} 716}
@@ -805,7 +851,7 @@ fastcall NORET_TYPE void do_exit(long code)
805 panic("Aiee, killing interrupt handler!"); 851 panic("Aiee, killing interrupt handler!");
806 if (unlikely(!tsk->pid)) 852 if (unlikely(!tsk->pid))
807 panic("Attempted to kill the idle task!"); 853 panic("Attempted to kill the idle task!");
808 if (unlikely(tsk->pid == 1)) 854 if (unlikely(tsk == child_reaper))
809 panic("Attempted to kill init!"); 855 panic("Attempted to kill init!");
810 856
811 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 857 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
@@ -852,6 +898,12 @@ fastcall NORET_TYPE void do_exit(long code)
852 exit_itimers(tsk->signal); 898 exit_itimers(tsk->signal);
853 acct_process(code); 899 acct_process(code);
854 } 900 }
901 if (unlikely(tsk->robust_list))
902 exit_robust_list(tsk);
903#ifdef CONFIG_COMPAT
904 if (unlikely(tsk->compat_robust_list))
905 compat_exit_robust_list(tsk);
906#endif
855 exit_mm(tsk); 907 exit_mm(tsk);
856 908
857 exit_sem(tsk); 909 exit_sem(tsk);
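The do_exit() hunk above walks the task's robust futex list (and, under CONFIG_COMPAT, the compat list) so that user-space locks held by a dying thread can be marked owner-dead instead of staying locked forever. A much-simplified illustration of the idea, a per-thread list of held lock records walked and flagged at exit; this is not the real futex protocol, just the list-walk shape:

#include <stdio.h>

/* One record per lock the thread currently holds. */
struct held_lock {
        const char *name;
        int owner_died;                 /* set when the owner exits */
        struct held_lock *next;
};

/* Walk the per-thread list at exit and flag every still-held lock. */
static void exit_robust_list(struct held_lock *head)
{
        for (struct held_lock *l = head; l; l = l->next) {
                l->owner_died = 1;
                printf("lock %s marked owner-died\n", l->name);
        }
}

int main(void)
{
        struct held_lock b = { "buf_lock", 0, NULL };
        struct held_lock a = { "dev_lock", 0, &b };

        exit_robust_list(&a);           /* conceptually what the kernel does at exit */
        return 0;
}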
@@ -912,13 +964,6 @@ asmlinkage long sys_exit(int error_code)
912 do_exit((error_code&0xff)<<8); 964 do_exit((error_code&0xff)<<8);
913} 965}
914 966
915task_t fastcall *next_thread(const task_t *p)
916{
917 return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
918}
919
920EXPORT_SYMBOL(next_thread);
921
922/* 967/*
923 * Take down every thread in the group. This is called by fatal signals 968 * Take down every thread in the group. This is called by fatal signals
924 * as well as by sys_exit_group (below). 969 * as well as by sys_exit_group (below).
@@ -933,7 +978,6 @@ do_group_exit(int exit_code)
933 else if (!thread_group_empty(current)) { 978 else if (!thread_group_empty(current)) {
934 struct signal_struct *const sig = current->signal; 979 struct signal_struct *const sig = current->signal;
935 struct sighand_struct *const sighand = current->sighand; 980 struct sighand_struct *const sighand = current->sighand;
936 read_lock(&tasklist_lock);
937 spin_lock_irq(&sighand->siglock); 981 spin_lock_irq(&sighand->siglock);
938 if (sig->flags & SIGNAL_GROUP_EXIT) 982 if (sig->flags & SIGNAL_GROUP_EXIT)
939 /* Another thread got here before we took the lock. */ 983 /* Another thread got here before we took the lock. */
@@ -943,7 +987,6 @@ do_group_exit(int exit_code)
943 zap_other_threads(current); 987 zap_other_threads(current);
944 } 988 }
945 spin_unlock_irq(&sighand->siglock); 989 spin_unlock_irq(&sighand->siglock);
946 read_unlock(&tasklist_lock);
947 } 990 }
948 991
949 do_exit(exit_code); 992 do_exit(exit_code);
@@ -1273,7 +1316,7 @@ bail_ref:
1273 1316
1274 /* move to end of parent's list to avoid starvation */ 1317 /* move to end of parent's list to avoid starvation */
1275 remove_parent(p); 1318 remove_parent(p);
1276 add_parent(p, p->parent); 1319 add_parent(p);
1277 1320
1278 write_unlock_irq(&tasklist_lock); 1321 write_unlock_irq(&tasklist_lock);
1279 1322
diff --git a/kernel/fork.c b/kernel/fork.c
index b373322ca4..b3f7a1bb5e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
84#endif 84#endif
85 85
86/* SLAB cache for signal_struct structures (tsk->signal) */ 86/* SLAB cache for signal_struct structures (tsk->signal) */
87kmem_cache_t *signal_cachep; 87static kmem_cache_t *signal_cachep;
88 88
89/* SLAB cache for sighand_struct structures (tsk->sighand) */ 89/* SLAB cache for sighand_struct structures (tsk->sighand) */
90kmem_cache_t *sighand_cachep; 90kmem_cache_t *sighand_cachep;
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
181 /* One for us, one for whoever does the "release_task()" (usually parent) */ 181 /* One for us, one for whoever does the "release_task()" (usually parent) */
182 atomic_set(&tsk->usage,2); 182 atomic_set(&tsk->usage,2);
183 atomic_set(&tsk->fs_excl, 0); 183 atomic_set(&tsk->fs_excl, 0);
184 tsk->btrace_seq = 0;
184 return tsk; 185 return tsk;
185} 186}
186 187
@@ -607,12 +608,12 @@ static struct files_struct *alloc_files(void)
607 atomic_set(&newf->count, 1); 608 atomic_set(&newf->count, 1);
608 609
609 spin_lock_init(&newf->file_lock); 610 spin_lock_init(&newf->file_lock);
611 newf->next_fd = 0;
610 fdt = &newf->fdtab; 612 fdt = &newf->fdtab;
611 fdt->next_fd = 0;
612 fdt->max_fds = NR_OPEN_DEFAULT; 613 fdt->max_fds = NR_OPEN_DEFAULT;
613 fdt->max_fdset = __FD_SETSIZE; 614 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
614 fdt->close_on_exec = &newf->close_on_exec_init; 615 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
615 fdt->open_fds = &newf->open_fds_init; 616 fdt->open_fds = (fd_set *)&newf->open_fds_init;
616 fdt->fd = &newf->fd_array[0]; 617 fdt->fd = &newf->fd_array[0];
617 INIT_RCU_HEAD(&fdt->rcu); 618 INIT_RCU_HEAD(&fdt->rcu);
618 fdt->free_files = NULL; 619 fdt->free_files = NULL;
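The alloc_files() changes above shrink the default descriptor table: the close-on-exec and open-fds sets start out as small bitmaps embedded in files_struct (EMBEDDED_FD_SET_SIZE) rather than full __FD_SETSIZE fd_sets, and next_fd moves from the fdtable into files_struct itself. The embedded-small-set idea, sketched generically: keep the common case inside the struct and fall back to a heap bitmap only when a larger descriptor is needed.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BITS_PER_WORD ((unsigned int)(8 * sizeof(unsigned long)))

struct fd_table {
        unsigned long embedded_open_fds;        /* small set lives inside the struct */
        unsigned long *open_fds;                /* points at the embedded word or a heap array */
        unsigned int max_fds;
};

static void fd_table_init(struct fd_table *t)
{
        t->embedded_open_fds = 0;
        t->open_fds = &t->embedded_open_fds;    /* common case: no extra allocation */
        t->max_fds = BITS_PER_WORD;
}

/* Grow to a heap-allocated bitmap only when a larger descriptor is needed. */
static int fd_table_expand(struct fd_table *t, unsigned int need)
{
        if (need <= t->max_fds)
                return 0;
        unsigned int words = (need + BITS_PER_WORD - 1) / BITS_PER_WORD;
        unsigned long *big = calloc(words, sizeof(*big));
        if (!big)
                return -1;
        /* carry over whatever set was in use, embedded or previously grown */
        memcpy(big, t->open_fds, (t->max_fds / BITS_PER_WORD) * sizeof(*big));
        if (t->open_fds != &t->embedded_open_fds)
                free(t->open_fds);
        t->open_fds = big;
        t->max_fds = words * BITS_PER_WORD;
        return 0;
}

int main(void)
{
        struct fd_table t;

        fd_table_init(&t);
        t.open_fds[0] |= 1UL << 3;              /* mark fd 3 open in the embedded set */
        fd_table_expand(&t, 200);               /* the bit survives the expansion */
        printf("max_fds=%u, fd 3 open: %lu\n", t.max_fds, (t.open_fds[0] >> 3) & 1UL);
        if (t.open_fds != &t.embedded_open_fds)
                free(t.open_fds);
        return 0;
}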
@@ -768,8 +769,7 @@ int unshare_files(void)
768 struct files_struct *files = current->files; 769 struct files_struct *files = current->files;
769 int rc; 770 int rc;
770 771
771 if(!files) 772 BUG_ON(!files);
772 BUG();
773 773
774 /* This can race but the race causes us to copy when we don't 774 /* This can race but the race causes us to copy when we don't
775 need to and drop the copy */ 775 need to and drop the copy */
@@ -786,14 +786,6 @@ int unshare_files(void)
786 786
787EXPORT_SYMBOL(unshare_files); 787EXPORT_SYMBOL(unshare_files);
788 788
789void sighand_free_cb(struct rcu_head *rhp)
790{
791 struct sighand_struct *sp;
792
793 sp = container_of(rhp, struct sighand_struct, rcu);
794 kmem_cache_free(sighand_cachep, sp);
795}
796
797static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 789static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
798{ 790{
799 struct sighand_struct *sig; 791 struct sighand_struct *sig;
@@ -806,12 +798,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
806 rcu_assign_pointer(tsk->sighand, sig); 798 rcu_assign_pointer(tsk->sighand, sig);
807 if (!sig) 799 if (!sig)
808 return -ENOMEM; 800 return -ENOMEM;
809 spin_lock_init(&sig->siglock);
810 atomic_set(&sig->count, 1); 801 atomic_set(&sig->count, 1);
811 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 802 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
812 return 0; 803 return 0;
813} 804}
814 805
806void __cleanup_sighand(struct sighand_struct *sighand)
807{
808 if (atomic_dec_and_test(&sighand->count))
809 kmem_cache_free(sighand_cachep, sighand);
810}
811
815static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 812static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
816{ 813{
817 struct signal_struct *sig; 814 struct signal_struct *sig;
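The new __cleanup_sighand() above replaces the old RCU free callback (sighand_free_cb, removed from fork.c further down) with a plain drop-the-last-reference-and-free helper; __cleanup_signal() in the next hunks follows the same shape. The pattern in a self-contained C11 sketch using a userspace atomic counter in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct sighand { atomic_int count; /* ... handler table would live here ... */ };

static struct sighand *sighand_alloc(void)
{
        struct sighand *s = malloc(sizeof(*s));

        if (s)
                atomic_init(&s->count, 1);      /* creator holds one reference */
        return s;
}

static void sighand_get(struct sighand *s)
{
        atomic_fetch_add(&s->count, 1);
}

/* Free only when the last reference goes away, like __cleanup_sighand(). */
static void cleanup_sighand(struct sighand *s)
{
        if (atomic_fetch_sub(&s->count, 1) == 1)
                free(s);
}

int main(void)
{
        struct sighand *s = sighand_alloc();

        sighand_get(s);         /* a second user, e.g. a CLONE_SIGHAND child */
        cleanup_sighand(s);     /* first put: object survives */
        cleanup_sighand(s);     /* last put: freed here */
        printf("done\n");
        return 0;
}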
@@ -847,7 +844,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
847 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 844 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
848 sig->it_real_incr.tv64 = 0; 845 sig->it_real_incr.tv64 = 0;
849 sig->real_timer.function = it_real_fn; 846 sig->real_timer.function = it_real_fn;
850 sig->real_timer.data = tsk; 847 sig->tsk = tsk;
851 848
852 sig->it_virt_expires = cputime_zero; 849 sig->it_virt_expires = cputime_zero;
853 sig->it_virt_incr = cputime_zero; 850 sig->it_virt_incr = cputime_zero;
@@ -881,6 +878,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
881 return 0; 878 return 0;
882} 879}
883 880
881void __cleanup_signal(struct signal_struct *sig)
882{
883 exit_thread_group_keys(sig);
884 kmem_cache_free(signal_cachep, sig);
885}
886
887static inline void cleanup_signal(struct task_struct *tsk)
888{
889 struct signal_struct *sig = tsk->signal;
890
891 atomic_dec(&sig->live);
892
893 if (atomic_dec_and_test(&sig->count))
894 __cleanup_signal(sig);
895}
896
884static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 897static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
885{ 898{
886 unsigned long new_flags = p->flags; 899 unsigned long new_flags = p->flags;
@@ -1020,6 +1033,7 @@ static task_t *copy_process(unsigned long clone_flags,
1020 p->mempolicy = NULL; 1033 p->mempolicy = NULL;
1021 goto bad_fork_cleanup_cpuset; 1034 goto bad_fork_cleanup_cpuset;
1022 } 1035 }
1036 mpol_fix_fork_child_flag(p);
1023#endif 1037#endif
1024 1038
1025#ifdef CONFIG_DEBUG_MUTEXES 1039#ifdef CONFIG_DEBUG_MUTEXES
@@ -1060,7 +1074,10 @@ static task_t *copy_process(unsigned long clone_flags,
1060 * Clear TID on mm_release()? 1074 * Clear TID on mm_release()?
1061 */ 1075 */
1062 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1076 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1063 1077 p->robust_list = NULL;
1078#ifdef CONFIG_COMPAT
1079 p->compat_robust_list = NULL;
1080#endif
1064 /* 1081 /*
1065 * sigaltstack should be cleared when sharing the same VM 1082 * sigaltstack should be cleared when sharing the same VM
1066 */ 1083 */
@@ -1091,6 +1108,7 @@ static task_t *copy_process(unsigned long clone_flags,
1091 * We don't wake it up yet. 1108 * We don't wake it up yet.
1092 */ 1109 */
1093 p->group_leader = p; 1110 p->group_leader = p;
1111 INIT_LIST_HEAD(&p->thread_group);
1094 INIT_LIST_HEAD(&p->ptrace_children); 1112 INIT_LIST_HEAD(&p->ptrace_children);
1095 INIT_LIST_HEAD(&p->ptrace_list); 1113 INIT_LIST_HEAD(&p->ptrace_list);
1096 1114
@@ -1114,16 +1132,6 @@ static task_t *copy_process(unsigned long clone_flags,
1114 !cpu_online(task_cpu(p)))) 1132 !cpu_online(task_cpu(p))))
1115 set_task_cpu(p, smp_processor_id()); 1133 set_task_cpu(p, smp_processor_id());
1116 1134
1117 /*
1118 * Check for pending SIGKILL! The new thread should not be allowed
1119 * to slip out of an OOM kill. (or normal SIGKILL.)
1120 */
1121 if (sigismember(&current->pending.signal, SIGKILL)) {
1122 write_unlock_irq(&tasklist_lock);
1123 retval = -EINTR;
1124 goto bad_fork_cleanup_namespace;
1125 }
1126
1127 /* CLONE_PARENT re-uses the old parent */ 1135 /* CLONE_PARENT re-uses the old parent */
1128 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1136 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1129 p->real_parent = current->real_parent; 1137 p->real_parent = current->real_parent;
@@ -1132,6 +1140,23 @@ static task_t *copy_process(unsigned long clone_flags,
1132 p->parent = p->real_parent; 1140 p->parent = p->real_parent;
1133 1141
1134 spin_lock(&current->sighand->siglock); 1142 spin_lock(&current->sighand->siglock);
1143
1144 /*
1145 * Process group and session signals need to be delivered to just the
1146 * parent before the fork or both the parent and the child after the
1147 * fork. Restart if a signal comes in before we add the new process to
1148 * its process group.
1149 * A fatal signal pending means that current will exit, so the new
1150 * thread can't slip out of an OOM kill (or normal SIGKILL).
1151 */
1152 recalc_sigpending();
1153 if (signal_pending(current)) {
1154 spin_unlock(&current->sighand->siglock);
1155 write_unlock_irq(&tasklist_lock);
1156 retval = -ERESTARTNOINTR;
1157 goto bad_fork_cleanup_namespace;
1158 }
1159
1135 if (clone_flags & CLONE_THREAD) { 1160 if (clone_flags & CLONE_THREAD) {
1136 /* 1161 /*
1137 * Important: if an exit-all has been started then 1162 * Important: if an exit-all has been started then
@@ -1144,17 +1169,9 @@ static task_t *copy_process(unsigned long clone_flags,
1144 retval = -EAGAIN; 1169 retval = -EAGAIN;
1145 goto bad_fork_cleanup_namespace; 1170 goto bad_fork_cleanup_namespace;
1146 } 1171 }
1147 p->group_leader = current->group_leader;
1148 1172
1149 if (current->signal->group_stop_count > 0) { 1173 p->group_leader = current->group_leader;
1150 /* 1174 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1151 * There is an all-stop in progress for the group.
1152 * We ourselves will stop as soon as we check signals.
1153 * Make the new thread part of that group stop too.
1154 */
1155 current->signal->group_stop_count++;
1156 set_tsk_thread_flag(p, TIF_SIGPENDING);
1157 }
1158 1175
1159 if (!cputime_eq(current->signal->it_virt_expires, 1176 if (!cputime_eq(current->signal->it_virt_expires,
1160 cputime_zero) || 1177 cputime_zero) ||
@@ -1177,23 +1194,25 @@ static task_t *copy_process(unsigned long clone_flags,
1177 */ 1194 */
1178 p->ioprio = current->ioprio; 1195 p->ioprio = current->ioprio;
1179 1196
1180 SET_LINKS(p); 1197 if (likely(p->pid)) {
1181 if (unlikely(p->ptrace & PT_PTRACED)) 1198 add_parent(p);
1182 __ptrace_link(p, current->parent); 1199 if (unlikely(p->ptrace & PT_PTRACED))
1183 1200 __ptrace_link(p, current->parent);
1184 if (thread_group_leader(p)) { 1201
1185 p->signal->tty = current->signal->tty; 1202 if (thread_group_leader(p)) {
1186 p->signal->pgrp = process_group(current); 1203 p->signal->tty = current->signal->tty;
1187 p->signal->session = current->signal->session; 1204 p->signal->pgrp = process_group(current);
1188 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1205 p->signal->session = current->signal->session;
1189 attach_pid(p, PIDTYPE_SID, p->signal->session); 1206 attach_pid(p, PIDTYPE_PGID, process_group(p));
1190 if (p->pid) 1207 attach_pid(p, PIDTYPE_SID, p->signal->session);
1208
1209 list_add_tail(&p->tasks, &init_task.tasks);
1191 __get_cpu_var(process_counts)++; 1210 __get_cpu_var(process_counts)++;
1211 }
1212 attach_pid(p, PIDTYPE_PID, p->pid);
1213 nr_threads++;
1192 } 1214 }
1193 attach_pid(p, PIDTYPE_TGID, p->tgid);
1194 attach_pid(p, PIDTYPE_PID, p->pid);
1195 1215
1196 nr_threads++;
1197 total_forks++; 1216 total_forks++;
1198 spin_unlock(&current->sighand->siglock); 1217 spin_unlock(&current->sighand->siglock);
1199 write_unlock_irq(&tasklist_lock); 1218 write_unlock_irq(&tasklist_lock);
@@ -1208,9 +1227,9 @@ bad_fork_cleanup_mm:
1208 if (p->mm) 1227 if (p->mm)
1209 mmput(p->mm); 1228 mmput(p->mm);
1210bad_fork_cleanup_signal: 1229bad_fork_cleanup_signal:
1211 exit_signal(p); 1230 cleanup_signal(p);
1212bad_fork_cleanup_sighand: 1231bad_fork_cleanup_sighand:
1213 exit_sighand(p); 1232 __cleanup_sighand(p->sighand);
1214bad_fork_cleanup_fs: 1233bad_fork_cleanup_fs:
1215 exit_fs(p); /* blocking */ 1234 exit_fs(p); /* blocking */
1216bad_fork_cleanup_files: 1235bad_fork_cleanup_files:
@@ -1257,7 +1276,7 @@ task_t * __devinit fork_idle(int cpu)
1257 if (!task) 1276 if (!task)
1258 return ERR_PTR(-ENOMEM); 1277 return ERR_PTR(-ENOMEM);
1259 init_idle(task, cpu); 1278 init_idle(task, cpu);
1260 unhash_process(task); 1279
1261 return task; 1280 return task;
1262} 1281}
1263 1282
@@ -1349,11 +1368,21 @@ long do_fork(unsigned long clone_flags,
1349#define ARCH_MIN_MMSTRUCT_ALIGN 0 1368#define ARCH_MIN_MMSTRUCT_ALIGN 0
1350#endif 1369#endif
1351 1370
1371static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
1372{
1373 struct sighand_struct *sighand = data;
1374
1375 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1376 SLAB_CTOR_CONSTRUCTOR)
1377 spin_lock_init(&sighand->siglock);
1378}
1379
1352void __init proc_caches_init(void) 1380void __init proc_caches_init(void)
1353{ 1381{
1354 sighand_cachep = kmem_cache_create("sighand_cache", 1382 sighand_cachep = kmem_cache_create("sighand_cache",
1355 sizeof(struct sighand_struct), 0, 1383 sizeof(struct sighand_struct), 0,
1356 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1384 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1385 sighand_ctor, NULL);
1357 signal_cachep = kmem_cache_create("signal_cache", 1386 signal_cachep = kmem_cache_create("signal_cache",
1358 sizeof(struct signal_struct), 0, 1387 sizeof(struct signal_struct), 0,
1359 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1388 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
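The sighand cache above now carries SLAB_DESTROY_BY_RCU and a constructor that initializes ->siglock once per allocated object (copy_sighand() no longer does it). That combination is what lets a lookup lock a possibly-recycled sighand_struct and re-check its identity afterwards. A minimal sketch of that read-side pattern, with helper and variable names that are illustrative and not part of this patch:

/*
 * Sketch only: the object may be reused for another task while we look at
 * it, but SLAB_DESTROY_BY_RCU keeps the memory out of the page allocator
 * for the duration of the RCU read-side section, and the constructor keeps
 * ->siglock valid across reuse, so lock first and re-check identity after.
 */
static struct sighand_struct *lock_sighand_sketch(struct task_struct *tsk,
                                                  unsigned long *flags)
{
        struct sighand_struct *sighand;

        for (;;) {
                rcu_read_lock();
                sighand = rcu_dereference(tsk->sighand);
                if (unlikely(!sighand)) {
                        rcu_read_unlock();
                        break;
                }
                spin_lock_irqsave(&sighand->siglock, *flags);
                if (likely(sighand == tsk->sighand)) {
                        rcu_read_unlock();
                        break;                  /* locked, still ours */
                }
                /* recycled for another task: drop it and retry */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
                rcu_read_unlock();
        }
        return sighand;
}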
@@ -1534,6 +1563,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1534 1563
1535 check_unshare_flags(&unshare_flags); 1564 check_unshare_flags(&unshare_flags);
1536 1565
1566 /* Return -EINVAL for all unsupported flags */
1567 err = -EINVAL;
1568 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1569 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
1570 goto bad_unshare_out;
1571
1537 if ((err = unshare_thread(unshare_flags))) 1572 if ((err = unshare_thread(unshare_flags)))
1538 goto bad_unshare_out; 1573 goto bad_unshare_out;
1539 if ((err = unshare_fs(unshare_flags, &new_fs))) 1574 if ((err = unshare_fs(unshare_flags, &new_fs)))
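From userspace, the effect of the new flag check in sys_unshare() is that unsupported bits now fail cleanly with EINVAL instead of being silently ignored. A small hedged illustration, assuming a libc that exposes the unshare() wrapper (otherwise syscall(__NR_unshare, ...) behaves the same):

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>

int main(void)
{
        /* CLONE_FS and CLONE_FILES are in the supported set above. */
        if (unshare(CLONE_FS | CLONE_FILES) != 0)
                perror("unshare(CLONE_FS|CLONE_FILES)");

        /* CLONE_VFORK is outside the supported set, so with this change
         * the call is expected to fail with EINVAL. */
        if (unshare(CLONE_VFORK) != 0 && errno == EINVAL)
                printf("unsupported unshare flags rejected with EINVAL\n");
        return 0;
}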
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f9780..9c9b2b6b22 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew 16 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation. 17 * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,172 @@ error:
829 goto out; 833 goto out;
830} 834}
831 835
836/*
837 * Support for robust futexes: the kernel cleans up held futexes at
838 * thread exit time.
839 *
840 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after
847 * acquiring the lock, but just before it could have added itself to
848 * the list. There can only be one such pending lock.
849 */
850
851/**
852 * sys_set_robust_list - set the robust-futex list head of a task
853 * @head: pointer to the list-head
854 * @len: length of the list-head, as userspace expects
855 */
856asmlinkage long
857sys_set_robust_list(struct robust_list_head __user *head,
858 size_t len)
859{
860 /*
861 * The kernel knows only one size for now:
862 */
863 if (unlikely(len != sizeof(*head)))
864 return -EINVAL;
865
866 current->robust_list = head;
867
868 return 0;
869}
870
871/**
872 * sys_get_robust_list - get the robust-futex list head of a task
873 * @pid: pid of the process [zero for current task]
874 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
875 * @len_ptr: pointer to a length field, the kernel fills in the header size
876 */
877asmlinkage long
878sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
879 size_t __user *len_ptr)
880{
881 struct robust_list_head *head;
882 unsigned long ret;
883
884 if (!pid)
885 head = current->robust_list;
886 else {
887 struct task_struct *p;
888
889 ret = -ESRCH;
890 read_lock(&tasklist_lock);
891 p = find_task_by_pid(pid);
892 if (!p)
893 goto err_unlock;
894 ret = -EPERM;
895 if ((current->euid != p->euid) && (current->euid != p->uid) &&
896 !capable(CAP_SYS_PTRACE))
897 goto err_unlock;
898 head = p->robust_list;
899 read_unlock(&tasklist_lock);
900 }
901
902 if (put_user(sizeof(*head), len_ptr))
903 return -EFAULT;
904 return put_user(head, head_ptr);
905
906err_unlock:
907 read_unlock(&tasklist_lock);
908
909 return ret;
910}
911
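For orientation, this is roughly how a thread library would use the two syscalls above. The struct layout mirrors the fields the kernel dereferences (list.next, futex_offset, list_op_pending); the syscall number macro is an assumption that depends on the architecture's headers.

#include <unistd.h>
#include <sys/syscall.h>

/* Userspace mirror of the list head registered with the kernel. */
struct robust_list { struct robust_list *next; };
struct robust_list_head {
        struct robust_list list;              /* circular list of held locks */
        long futex_offset;                    /* entry -> futex word offset  */
        struct robust_list *list_op_pending;  /* lock being acquired, if any */
};

/* A real thread library keeps one of these per thread (e.g. in the TCB);
 * an empty list simply points back at itself. */
static struct robust_list_head robust_head = {
        .list            = { &robust_head.list },
        .futex_offset    = 0,
        .list_op_pending = NULL,
};

static long register_robust_list(void)
{
        /* __NR_set_robust_list must be defined for the target arch. */
        return syscall(__NR_set_robust_list, &robust_head,
                       sizeof(robust_head));
}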
912/*
913 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so:
915 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{
918 u32 uval;
919
920retry:
921 if (get_user(uval, uaddr))
922 return -1;
923
924 if ((uval & FUTEX_TID_MASK) == curr->pid) {
925 /*
926 * Ok, this dying thread is truly holding a futex
927 * of interest. Set the OWNER_DIED bit atomically
928 * via cmpxchg, and if the value had FUTEX_WAITERS
929 * set, wake up a waiter (if any). (We have to do a
930 * futex_wake() even if OWNER_DIED is already set -
931 * to handle the rare but possible case of recursive
932 * thread-death.) The rest of the cleanup is done in
933 * userspace.
934 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval)
937 goto retry;
938
939 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1);
941 }
942 return 0;
943}
944
945/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters.
948 *
949 * We silently return on any sign of list-walking problem.
950 */
951void exit_robust_list(struct task_struct *curr)
952{
953 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT;
956 unsigned long futex_offset;
957
958 /*
959 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()):
961 */
962 if (get_user(entry, &head->list.next))
963 return;
964 /*
965 * Fetch the relative futex offset:
966 */
967 if (get_user(futex_offset, &head->futex_offset))
968 return;
969 /*
970 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists:
972 */
973 if (get_user(pending, &head->list_op_pending))
974 return;
975 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr);
977
978 while (entry != &head->list) {
979 /*
980 * A pending lock might already be on the list, so
981 * don't process it twice:
982 */
983 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset,
985 curr))
986 return;
987 /*
988 * Fetch the next entry in the list:
989 */
990 if (get_user(entry, &entry->next))
991 return;
992 /*
993 * Avoid excessively long or circular lists:
994 */
995 if (!--limit)
996 break;
997
998 cond_resched();
999 }
1000}
1001
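The other half of the protocol lives in userspace: a waiter that handle_futex_death() wakes finds FUTEX_OWNER_DIED set in the futex word and can take the lock over, knowing the protected state may need repair. A hedged sketch of that waiter-side step; the constant values match include/linux/futex.h at the time of this change, and the GCC __sync builtins stand in for whatever atomics the library really uses:

#include <stdint.h>

#define FUTEX_WAITERS    0x80000000u
#define FUTEX_OWNER_DIED 0x40000000u
#define FUTEX_TID_MASK   0x3fffffffu

/* Returns 1 on a clean acquire, 2 if the lock was inherited from a dead
 * owner (caller must recover the protected data), 0 if it is still busy. */
static int robust_trylock(uint32_t *uaddr, uint32_t tid)
{
        uint32_t old = __sync_val_compare_and_swap(uaddr, 0, tid);

        if (old == 0)
                return 1;

        if ((old & FUTEX_OWNER_DIED) &&
            __sync_bool_compare_and_swap(uaddr, old,
                        tid | FUTEX_OWNER_DIED | (old & FUTEX_WAITERS)))
                return 2;

        return 0;
}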
832long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
833 unsigned long uaddr2, int val2, int val3) 1003 unsigned long uaddr2, int val2, int val3)
834{ 1004{
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 0000000000..54274fc853
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,142 @@
1/*
2 * linux/kernel/futex_compat.c
3 *
4 * Futex compatibility routines.
5 *
6 * Copyright 2006, Red Hat, Inc., Ingo Molnar
7 */
8
9#include <linux/linkage.h>
10#include <linux/compat.h>
11#include <linux/futex.h>
12
13#include <asm/uaccess.h>
14
15/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters.
18 *
19 * We silently return on any sign of list-walking problem.
20 */
21void compat_exit_robust_list(struct task_struct *curr)
22{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending;
25 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset;
28
29 /*
30 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()):
32 */
33 if (get_user(uentry, &head->list.next))
34 return;
35 entry = compat_ptr(uentry);
36 /*
37 * Fetch the relative futex offset:
38 */
39 if (get_user(futex_offset, &head->futex_offset))
40 return;
41 /*
42 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists:
44 */
45 if (get_user(upending, &head->list_op_pending))
46 return;
47 pending = compat_ptr(upending);
48 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr);
50
51 while (compat_ptr(uentry) != &head->list) {
52 /*
53 * A pending lock might already be on the list, so
54 * don't process it twice:
55 */
56 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset,
58 curr))
59 return;
60
61 /*
62 * Fetch the next entry in the list:
63 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next))
65 return;
66 entry = compat_ptr(uentry);
67 /*
68 * Avoid excessively long or circular lists:
69 */
70 if (!--limit)
71 break;
72
73 cond_resched();
74 }
75}
76
77asmlinkage long
78compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
79 compat_size_t len)
80{
81 if (unlikely(len != sizeof(*head)))
82 return -EINVAL;
83
84 current->compat_robust_list = head;
85
86 return 0;
87}
88
89asmlinkage long
90compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
91 compat_size_t __user *len_ptr)
92{
93 struct compat_robust_list_head *head;
94 unsigned long ret;
95
96 if (!pid)
97 head = current->compat_robust_list;
98 else {
99 struct task_struct *p;
100
101 ret = -ESRCH;
102 read_lock(&tasklist_lock);
103 p = find_task_by_pid(pid);
104 if (!p)
105 goto err_unlock;
106 ret = -EPERM;
107 if ((current->euid != p->euid) && (current->euid != p->uid) &&
108 !capable(CAP_SYS_PTRACE))
109 goto err_unlock;
110 head = p->compat_robust_list;
111 read_unlock(&tasklist_lock);
112 }
113
114 if (put_user(sizeof(*head), len_ptr))
115 return -EFAULT;
116 return put_user(ptr_to_compat(head), head_ptr);
117
118err_unlock:
119 read_unlock(&tasklist_lock);
120
121 return ret;
122}
123
124asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
125 struct compat_timespec __user *utime, u32 __user *uaddr2,
126 u32 val3)
127{
128 struct timespec t;
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0;
131
132 if ((op == FUTEX_WAIT) && utime) {
133 if (get_compat_timespec(&t, utime))
134 return -EFAULT;
135 timeout = timespec_to_jiffies(&t) + 1;
136 }
137 if (op >= FUTEX_REQUEUE)
138 val2 = (int) (unsigned long) utime;
139
140 return do_futex((unsigned long)uaddr, op, val, timeout,
141 (unsigned long)uaddr2, val2, val3);
142}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14bc9cfa63..0237a556eb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts)
123EXPORT_SYMBOL_GPL(ktime_get_ts); 123EXPORT_SYMBOL_GPL(ktime_get_ts);
124 124
125/* 125/*
126 * Get the coarse grained time at the softirq based on xtime and
127 * wall_to_monotonic.
128 */
129static void hrtimer_get_softirq_time(struct hrtimer_base *base)
130{
131 ktime_t xtim, tomono;
132 unsigned long seq;
133
134 do {
135 seq = read_seqbegin(&xtime_lock);
136 xtim = timespec_to_ktime(xtime);
137 tomono = timespec_to_ktime(wall_to_monotonic);
138
139 } while (read_seqretry(&xtime_lock, seq));
140
141 base[CLOCK_REALTIME].softirq_time = xtim;
142 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
143}
144
145/*
126 * Functions and macros which are different for UP/SMP systems are kept in a 146 * Functions and macros which are different for UP/SMP systems are kept in a
127 * single place 147 * single place
128 */ 148 */
@@ -246,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
246/* 266/*
247 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
248 */ 268 */
249static unsigned long ktime_divns(const ktime_t kt, nsec_t div) 269static unsigned long ktime_divns(const ktime_t kt, s64 div)
250{ 270{
251 u64 dclc, inc, dns; 271 u64 dclc, inc, dns;
252 int sft = 0; 272 int sft = 0;
@@ -281,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
281 * hrtimer_forward - forward the timer expiry 301 * hrtimer_forward - forward the timer expiry
282 * 302 *
283 * @timer: hrtimer to forward 303 * @timer: hrtimer to forward
304 * @now: forward past this time
284 * @interval: the interval to forward 305 * @interval: the interval to forward
285 * 306 *
286 * Forward the timer expiry so it will expire in the future. 307 * Forward the timer expiry so it will expire in the future.
287 * Returns the number of overruns. 308 * Returns the number of overruns.
288 */ 309 */
289unsigned long 310unsigned long
290hrtimer_forward(struct hrtimer *timer, ktime_t interval) 311hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
291{ 312{
292 unsigned long orun = 1; 313 unsigned long orun = 1;
293 ktime_t delta, now; 314 ktime_t delta;
294
295 now = timer->base->get_time();
296 315
297 delta = ktime_sub(now, timer->expires); 316 delta = ktime_sub(now, timer->expires);
298 317
@@ -303,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t interval)
303 interval.tv64 = timer->base->resolution.tv64; 322 interval.tv64 = timer->base->resolution.tv64;
304 323
305 if (unlikely(delta.tv64 >= interval.tv64)) { 324 if (unlikely(delta.tv64 >= interval.tv64)) {
306 nsec_t incr = ktime_to_ns(interval); 325 s64 incr = ktime_to_ns(interval);
307 326
308 orun = ktime_divns(delta, incr); 327 orun = ktime_divns(delta, incr);
309 timer->expires = ktime_add_ns(timer->expires, incr * orun); 328 timer->expires = ktime_add_ns(timer->expires, incr * orun);
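Callers now pass the reference time explicitly, so a periodic callback forwards against base->softirq_time instead of re-reading the clock. A minimal sketch of a restarting callback under the new signature (the function and period names are illustrative; it_real_fn() later in this patch is the real example):

static ktime_t my_period;               /* set when the timer is armed */

static int my_periodic_fn(struct hrtimer *timer)
{
        /* ... periodic work ... */

        /* Push the expiry past the coarse time stamped for this softirq run. */
        hrtimer_forward(timer, timer->base->softirq_time, my_period);
        return HRTIMER_RESTART;
}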
@@ -355,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
355 rb_link_node(&timer->node, parent, link); 374 rb_link_node(&timer->node, parent, link);
356 rb_insert_color(&timer->node, &base->active); 375 rb_insert_color(&timer->node, &base->active);
357 376
358 timer->state = HRTIMER_PENDING;
359
360 if (!base->first || timer->expires.tv64 < 377 if (!base->first || timer->expires.tv64 <
361 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 378 rb_entry(base->first, struct hrtimer, node)->expires.tv64)
362 base->first = &timer->node; 379 base->first = &timer->node;
@@ -376,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
376 if (base->first == &timer->node) 393 if (base->first == &timer->node)
377 base->first = rb_next(&timer->node); 394 base->first = rb_next(&timer->node);
378 rb_erase(&timer->node, &base->active); 395 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE;
379} 397}
380 398
381/* 399/*
@@ -386,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
386{ 404{
387 if (hrtimer_active(timer)) { 405 if (hrtimer_active(timer)) {
388 __remove_hrtimer(timer, base); 406 __remove_hrtimer(timer, base);
389 timer->state = HRTIMER_INACTIVE;
390 return 1; 407 return 1;
391 } 408 }
392 return 0; 409 return 0;
@@ -560,6 +577,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
560 clock_id = CLOCK_MONOTONIC; 577 clock_id = CLOCK_MONOTONIC;
561 578
562 timer->base = &bases[clock_id]; 579 timer->base = &bases[clock_id];
580 timer->node.rb_parent = HRTIMER_INACTIVE;
563} 581}
564 582
565/** 583/**
@@ -586,48 +604,35 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
586 */ 604 */
587static inline void run_hrtimer_queue(struct hrtimer_base *base) 605static inline void run_hrtimer_queue(struct hrtimer_base *base)
588{ 606{
589 ktime_t now = base->get_time();
590 struct rb_node *node; 607 struct rb_node *node;
591 608
609 if (base->get_softirq_time)
610 base->softirq_time = base->get_softirq_time();
611
592 spin_lock_irq(&base->lock); 612 spin_lock_irq(&base->lock);
593 613
594 while ((node = base->first)) { 614 while ((node = base->first)) {
595 struct hrtimer *timer; 615 struct hrtimer *timer;
596 int (*fn)(void *); 616 int (*fn)(struct hrtimer *);
597 int restart; 617 int restart;
598 void *data;
599 618
600 timer = rb_entry(node, struct hrtimer, node); 619 timer = rb_entry(node, struct hrtimer, node);
601 if (now.tv64 <= timer->expires.tv64) 620 if (base->softirq_time.tv64 <= timer->expires.tv64)
602 break; 621 break;
603 622
604 fn = timer->function; 623 fn = timer->function;
605 data = timer->data;
606 set_curr_timer(base, timer); 624 set_curr_timer(base, timer);
607 timer->state = HRTIMER_RUNNING;
608 __remove_hrtimer(timer, base); 625 __remove_hrtimer(timer, base);
609 spin_unlock_irq(&base->lock); 626 spin_unlock_irq(&base->lock);
610 627
611 /* 628 restart = fn(timer);
612 * fn == NULL is special case for the simplest timer
613 * variant - wake up process and do not restart:
614 */
615 if (!fn) {
616 wake_up_process(data);
617 restart = HRTIMER_NORESTART;
618 } else
619 restart = fn(data);
620 629
621 spin_lock_irq(&base->lock); 630 spin_lock_irq(&base->lock);
622 631
623 /* Another CPU has added back the timer */ 632 if (restart != HRTIMER_NORESTART) {
624 if (timer->state != HRTIMER_RUNNING) 633 BUG_ON(hrtimer_active(timer));
625 continue;
626
627 if (restart == HRTIMER_RESTART)
628 enqueue_hrtimer(timer, base); 634 enqueue_hrtimer(timer, base);
629 else 635 }
630 timer->state = HRTIMER_EXPIRED;
631 } 636 }
632 set_curr_timer(base, NULL); 637 set_curr_timer(base, NULL);
633 spin_unlock_irq(&base->lock); 638 spin_unlock_irq(&base->lock);
@@ -641,6 +646,8 @@ void hrtimer_run_queues(void)
641 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 646 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
642 int i; 647 int i;
643 648
649 hrtimer_get_softirq_time(base);
650
644 for (i = 0; i < MAX_HRTIMER_BASES; i++) 651 for (i = 0; i < MAX_HRTIMER_BASES; i++)
645 run_hrtimer_queue(&base[i]); 652 run_hrtimer_queue(&base[i]);
646} 653}
@@ -649,79 +656,70 @@ void hrtimer_run_queues(void)
649 * Sleep related functions: 656 * Sleep related functions:
650 */ 657 */
651 658
652/** 659struct sleep_hrtimer {
653 * schedule_hrtimer - sleep until timeout 660 struct hrtimer timer;
654 * 661 struct task_struct *task;
655 * @timer: hrtimer variable initialized with the correct clock base 662 int expired;
656 * @mode: timeout value is abs/rel 663};
657 *
658 * Make the current task sleep until @timeout is
659 * elapsed.
660 *
661 * You can set the task state as follows -
662 *
663 * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
664 * pass before the routine returns. The routine will return 0
665 *
666 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
667 * delivered to the current task. In this case the remaining time
668 * will be returned
669 *
670 * The current task state is guaranteed to be TASK_RUNNING when this
671 * routine returns.
672 */
673static ktime_t __sched
674schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
675{
676 /* fn stays NULL, meaning single-shot wakeup: */
677 timer->data = current;
678 664
679 hrtimer_start(timer, timer->expires, mode); 665static int nanosleep_wakeup(struct hrtimer *timer)
666{
667 struct sleep_hrtimer *t =
668 container_of(timer, struct sleep_hrtimer, timer);
680 669
681 schedule(); 670 t->expired = 1;
682 hrtimer_cancel(timer); 671 wake_up_process(t->task);
683 672
684 /* Return the remaining time: */ 673 return HRTIMER_NORESTART;
685 if (timer->state != HRTIMER_EXPIRED)
686 return ktime_sub(timer->expires, timer->base->get_time());
687 else
688 return (ktime_t) {.tv64 = 0 };
689} 674}
690 675
691static inline ktime_t __sched 676static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
692schedule_hrtimer_interruptible(struct hrtimer *timer,
693 const enum hrtimer_mode mode)
694{ 677{
695 set_current_state(TASK_INTERRUPTIBLE); 678 t->timer.function = nanosleep_wakeup;
679 t->task = current;
680 t->expired = 0;
681
682 do {
683 set_current_state(TASK_INTERRUPTIBLE);
684 hrtimer_start(&t->timer, t->timer.expires, mode);
685
686 schedule();
687
688 if (unlikely(!t->expired)) {
689 hrtimer_cancel(&t->timer);
690 mode = HRTIMER_ABS;
691 }
692 } while (!t->expired && !signal_pending(current));
696 693
697 return schedule_hrtimer(timer, mode); 694 return t->expired;
698} 695}
699 696
700static long __sched nanosleep_restart(struct restart_block *restart) 697static long __sched nanosleep_restart(struct restart_block *restart)
701{ 698{
699 struct sleep_hrtimer t;
702 struct timespec __user *rmtp; 700 struct timespec __user *rmtp;
703 struct timespec tu; 701 struct timespec tu;
704 void *rfn_save = restart->fn; 702 ktime_t time;
705 struct hrtimer timer;
706 ktime_t rem;
707 703
708 restart->fn = do_no_restart_syscall; 704 restart->fn = do_no_restart_syscall;
709 705
710 hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); 706 hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
711 707 t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
712 timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
713
714 rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
715 708
716 if (rem.tv64 <= 0) 709 if (do_nanosleep(&t, HRTIMER_ABS))
717 return 0; 710 return 0;
718 711
719 rmtp = (struct timespec __user *) restart->arg2; 712 rmtp = (struct timespec __user *) restart->arg2;
720 tu = ktime_to_timespec(rem); 713 if (rmtp) {
721 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 714 time = ktime_sub(t.timer.expires, t.timer.base->get_time());
722 return -EFAULT; 715 if (time.tv64 <= 0)
716 return 0;
717 tu = ktime_to_timespec(time);
718 if (copy_to_user(rmtp, &tu, sizeof(tu)))
719 return -EFAULT;
720 }
723 721
724 restart->fn = rfn_save; 722 restart->fn = nanosleep_restart;
725 723
726 /* The other values in restart are already filled in */ 724 /* The other values in restart are already filled in */
727 return -ERESTART_RESTARTBLOCK; 725 return -ERESTART_RESTARTBLOCK;
@@ -731,33 +729,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
731 const enum hrtimer_mode mode, const clockid_t clockid) 729 const enum hrtimer_mode mode, const clockid_t clockid)
732{ 730{
733 struct restart_block *restart; 731 struct restart_block *restart;
734 struct hrtimer timer; 732 struct sleep_hrtimer t;
735 struct timespec tu; 733 struct timespec tu;
736 ktime_t rem; 734 ktime_t rem;
737 735
738 hrtimer_init(&timer, clockid, mode); 736 hrtimer_init(&t.timer, clockid, mode);
739 737 t.timer.expires = timespec_to_ktime(*rqtp);
740 timer.expires = timespec_to_ktime(*rqtp); 738 if (do_nanosleep(&t, mode))
741
742 rem = schedule_hrtimer_interruptible(&timer, mode);
743 if (rem.tv64 <= 0)
744 return 0; 739 return 0;
745 740
746 /* Absolute timers do not update the rmtp value and restart: */ 741 /* Absolute timers do not update the rmtp value and restart: */
747 if (mode == HRTIMER_ABS) 742 if (mode == HRTIMER_ABS)
748 return -ERESTARTNOHAND; 743 return -ERESTARTNOHAND;
749 744
750 tu = ktime_to_timespec(rem); 745 if (rmtp) {
751 746 rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
752 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 747 if (rem.tv64 <= 0)
753 return -EFAULT; 748 return 0;
749 tu = ktime_to_timespec(rem);
750 if (copy_to_user(rmtp, &tu, sizeof(tu)))
751 return -EFAULT;
752 }
754 753
755 restart = &current_thread_info()->restart_block; 754 restart = &current_thread_info()->restart_block;
756 restart->fn = nanosleep_restart; 755 restart->fn = nanosleep_restart;
757 restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; 756 restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
758 restart->arg1 = timer.expires.tv64 >> 32; 757 restart->arg1 = t.timer.expires.tv64 >> 32;
759 restart->arg2 = (unsigned long) rmtp; 758 restart->arg2 = (unsigned long) rmtp;
760 restart->arg3 = (unsigned long) timer.base->index; 759 restart->arg3 = (unsigned long) t.timer.base->index;
761 760
762 return -ERESTART_RESTARTBLOCK; 761 return -ERESTART_RESTARTBLOCK;
763} 762}
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 49378738ff..2b33f852be 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,4 @@
1 1
2obj-y := handle.o manage.o spurious.o 2obj-y := handle.o manage.o spurious.o migration.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 97d5559997..ac766ad573 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
204 p = &desc->action; 204 p = &desc->action;
205 if ((old = *p) != NULL) { 205 if ((old = *p) != NULL) {
206 /* Can't share interrupts unless both agree to */ 206 /* Can't share interrupts unless both agree to */
207 if (!(old->flags & new->flags & SA_SHIRQ)) { 207 if (!(old->flags & new->flags & SA_SHIRQ))
208 spin_unlock_irqrestore(&desc->lock,flags); 208 goto mismatch;
209 return -EBUSY; 209
210 } 210#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
211 /* All handlers must agree on per-cpuness */
212 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
213 goto mismatch;
214#endif
211 215
212 /* add new interrupt at end of irq queue */ 216 /* add new interrupt at end of irq queue */
213 do { 217 do {
@@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
218 } 222 }
219 223
220 *p = new; 224 *p = new;
221 225#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
226 if (new->flags & SA_PERCPU_IRQ)
227 desc->status |= IRQ_PER_CPU;
228#endif
222 if (!shared) { 229 if (!shared) {
223 desc->depth = 0; 230 desc->depth = 0;
224 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | 231 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
@@ -236,6 +243,12 @@ int setup_irq(unsigned int irq, struct irqaction * new)
236 register_handler_proc(irq, new); 243 register_handler_proc(irq, new);
237 244
238 return 0; 245 return 0;
246
247mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags);
249 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
250 dump_stack();
251 return -EBUSY;
239} 252}
240 253
241/** 254/**
@@ -258,6 +271,7 @@ void free_irq(unsigned int irq, void *dev_id)
258 struct irqaction **p; 271 struct irqaction **p;
259 unsigned long flags; 272 unsigned long flags;
260 273
274 WARN_ON(in_interrupt());
261 if (irq >= NR_IRQS) 275 if (irq >= NR_IRQS)
262 return; 276 return;
263 277
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 0000000000..52a8655fa0
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,65 @@
1#include <linux/irq.h>
2
3#if defined(CONFIG_GENERIC_PENDING_IRQ)
4
5void set_pending_irq(unsigned int irq, cpumask_t mask)
6{
7 irq_desc_t *desc = irq_desc + irq;
8 unsigned long flags;
9
10 spin_lock_irqsave(&desc->lock, flags);
11 desc->move_irq = 1;
12 pending_irq_cpumask[irq] = mask;
13 spin_unlock_irqrestore(&desc->lock, flags);
14}
15
16void move_native_irq(int irq)
17{
18 cpumask_t tmp;
19 irq_desc_t *desc = irq_descp(irq);
20
21 if (likely(!desc->move_irq))
22 return;
23
24 /*
25 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
26 */
27 if (CHECK_IRQ_PER_CPU(desc->status)) {
28 WARN_ON(1);
29 return;
30 }
31
32 desc->move_irq = 0;
33
34 if (likely(cpus_empty(pending_irq_cpumask[irq])))
35 return;
36
37 if (!desc->handler->set_affinity)
38 return;
39
40 assert_spin_locked(&desc->lock);
41
42 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
43
44 /*
 45 * If there is a valid mask to work with, do the
 46 * disable, re-program, enable sequence.
 47 * This is *not* particularly important for level-triggered
 48 * interrupts, but in an edge-triggered case we might be
 49 * setting the RTE while an active trigger is coming in,
 50 * which could cause some IO-APICs to malfunction.
 51 * Being paranoid, I guess!
52 */
53 if (unlikely(!cpus_empty(tmp))) {
54 if (likely(!(desc->status & IRQ_DISABLED)))
55 desc->handler->disable(irq);
56
57 desc->handler->set_affinity(irq,tmp);
58
59 if (likely(!(desc->status & IRQ_DISABLED)))
60 desc->handler->enable(irq);
61 }
62 cpus_clear(pending_irq_cpumask[irq]);
63}
64
65#endif
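set_pending_irq() only records the wanted mask; the re-targeting happens the next time the interrupt fires, when the architecture calls move_native_irq() from its ->ack() (or equivalent) handler, which the generic __do_IRQ() path invokes with desc->lock already held, satisfying the assert above. A hedged fragment of such a call site, with a made-up handler name:

/* Illustrative arch-side hook, not part of this file. */
static void my_ack_edge_irq(unsigned int irq)
{
        /* desc->lock is held by the generic __do_IRQ() path here. */
        move_native_irq(irq);   /* applies any mask queued by set_pending_irq() */

        /* ... acknowledge the interrupt at the controller ... */
}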
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 379be2f8c8..204ed7939e 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,21 +128,75 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(void *data) 131int it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct task_struct *tsk = (struct task_struct *) data; 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer);
134 135
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
136
137 if (tsk->signal->it_real_incr.tv64 != 0) {
138 hrtimer_forward(&tsk->signal->real_timer,
139 tsk->signal->it_real_incr);
140 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART; 141 return HRTIMER_RESTART;
142 } 142 }
143 return HRTIMER_NORESTART; 143 return HRTIMER_NORESTART;
144} 144}
145 145
146/*
147 * We do not care about correctness. We just sanitize the values so
148 * the ktime_t operations which expect normalized values do not
149 * break. This converts negative values to long timeouts similar to
150 * the code in kernel versions < 2.6.16
151 *
152 * Print a limited number of warning messages when an invalid timeval
153 * is detected.
154 */
155static void fixup_timeval(struct timeval *tv, int interval)
156{
157 static int warnlimit = 10;
158 unsigned long tmp;
159
160 if (warnlimit > 0) {
161 warnlimit--;
162 printk(KERN_WARNING
163 "setitimer: %s (pid = %d) provided "
164 "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
165 current->comm, current->pid,
166 interval ? "it_interval" : "it_value",
167 tv->tv_sec, (long) tv->tv_usec);
168 }
169
170 tmp = tv->tv_usec;
171 if (tmp >= USEC_PER_SEC) {
172 tv->tv_usec = tmp % USEC_PER_SEC;
173 tv->tv_sec += tmp / USEC_PER_SEC;
174 }
175
176 tmp = tv->tv_sec;
177 if (tmp > LONG_MAX)
178 tv->tv_sec = LONG_MAX;
179}
180
181/*
182 * Returns true if the timeval is in canonical form
183 */
184#define timeval_valid(t) \
185 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
186
187/*
188 * Check for invalid timevals, sanitize them and print a limited
189 * number of warnings.
190 */
191static void check_itimerval(struct itimerval *value) {
192
193 if (unlikely(!timeval_valid(&value->it_value)))
194 fixup_timeval(&value->it_value, 0);
195
196 if (unlikely(!timeval_valid(&value->it_interval)))
197 fixup_timeval(&value->it_interval, 1);
198}
199
146int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 200int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
147{ 201{
148 struct task_struct *tsk = current; 202 struct task_struct *tsk = current;
@@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
150 ktime_t expires; 204 ktime_t expires;
151 cputime_t cval, cinterval, nval, ninterval; 205 cputime_t cval, cinterval, nval, ninterval;
152 206
207 /*
208 * Validate the timevals in value.
209 *
210 * Note: Although the spec requires that invalid values shall
211 * return -EINVAL, we just fixup the value and print a limited
212 * number of warnings in order not to break users of this
213 * historical misfeature.
214 *
215 * Scheduled for replacement in March 2007
216 */
217 check_itimerval(value);
218
153 switch (which) { 219 switch (which) {
154 case ITIMER_REAL: 220 case ITIMER_REAL:
155again: 221again:
@@ -226,6 +292,43 @@ again:
226 return 0; 292 return 0;
227} 293}
228 294
295/**
296 * alarm_setitimer - set alarm in seconds
297 *
298 * @seconds: number of seconds until alarm
299 * 0 disables the alarm
300 *
301 * Returns the remaining time in seconds of a pending timer or 0 when
302 * the timer is not active.
303 *
304 * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
305 * negative timeval settings which would cause immediate expiry.
306 */
307unsigned int alarm_setitimer(unsigned int seconds)
308{
309 struct itimerval it_new, it_old;
310
311#if BITS_PER_LONG < 64
312 if (seconds > INT_MAX)
313 seconds = INT_MAX;
314#endif
315 it_new.it_value.tv_sec = seconds;
316 it_new.it_value.tv_usec = 0;
317 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
318
319 do_setitimer(ITIMER_REAL, &it_new, &it_old);
320
321 /*
322 * We can't return 0 if we have an alarm pending ... And we'd
323 * better return too much than too little anyway
324 */
325 if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
326 it_old.it_value.tv_usec >= 500000)
327 it_old.it_value.tv_sec++;
328
329 return it_old.it_value.tv_sec;
330}
331
229asmlinkage long sys_setitimer(int which, 332asmlinkage long sys_setitimer(int which,
230 struct itimerval __user *value, 333 struct itimerval __user *value,
231 struct itimerval __user *ovalue) 334 struct itimerval __user *ovalue)
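The userspace-visible outcome of the two itimer changes: a non-canonical timeval passed to setitimer() is now normalized (with a rate-limited kernel warning) rather than rejected, and alarm() built on alarm_setitimer() keeps rounding the remaining time up so it never reports 0 while a timer is still pending. A small hedged illustration:

#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
        struct itimerval it = {
                /* Deliberately non-canonical: tv_usec >= 1000000.  The kernel
                 * folds the excess into tv_sec and logs a warning instead of
                 * returning EINVAL. */
                .it_value    = { .tv_sec = 1, .tv_usec = 1500000 },
                .it_interval = { 0, 0 },
        };

        if (setitimer(ITIMER_REAL, &it, NULL) == 0)
                printf("accepted; effective expiry is ~2.5s away\n");

        /* alarm() returns the old timer's remaining seconds, rounded up:
         * here typically 3, and never 0 while a timer is pending. */
        printf("previous timer: ~%u s left\n", alarm(10));
        return 0;
}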
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063a..20a997c73c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
170 sa.sa.sa_handler = SIG_IGN; 170 sa.sa.sa_handler = SIG_IGN;
171 sa.sa.sa_flags = 0; 171 sa.sa.sa_flags = 0;
172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); 172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
173 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); 173 do_sigaction(SIGCHLD, &sa, NULL);
174 allow_signal(SIGCHLD); 174 allow_signal(SIGCHLD);
175 175
176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73..1156eb0977 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
@@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
323} 323}
324 324
325/* 325/*
326 * This function is called from exit_thread or flush_thread when task tk's 326 * This function is called from finish_task_switch when task tk becomes dead,
327 * stack is being recycled so that we can recycle any function-return probe 327 * so that we can recycle any function-return probe instances associated
328 * instances associated with this task. These left over instances represent 328 * with this task. These left over instances represent probed functions
329 * probed functions that have been called but will never return. 329 * that have been called but will never return.
330 */ 330 */
331void __kprobes kprobe_flush_task(struct task_struct *tk) 331void __kprobes kprobe_flush_task(struct task_struct *tk)
332{ 332{
@@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
336 unsigned long flags = 0; 336 unsigned long flags = 0;
337 337
338 spin_lock_irqsave(&kretprobe_lock, flags); 338 spin_lock_irqsave(&kretprobe_lock, flags);
339 head = kretprobe_inst_table_head(current); 339 head = kretprobe_inst_table_head(tk);
340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
341 if (ri->task == tk) 341 if (ri->task == tk)
342 recycle_rp_inst(ri); 342 recycle_rp_inst(ri);
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
460 } 460 }
461 461
462 p->nmissed = 0; 462 p->nmissed = 0;
463 down(&kprobe_mutex); 463 mutex_lock(&kprobe_mutex);
464 old_p = get_kprobe(p->addr); 464 old_p = get_kprobe(p->addr);
465 if (old_p) { 465 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 466 ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
477 arch_arm_kprobe(p); 477 arch_arm_kprobe(p);
478 478
479out: 479out:
480 up(&kprobe_mutex); 480 mutex_unlock(&kprobe_mutex);
481 481
482 if (ret && probed_mod) 482 if (ret && probed_mod)
483 module_put(probed_mod); 483 module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
496 struct kprobe *old_p, *list_p; 496 struct kprobe *old_p, *list_p;
497 int cleanup_p; 497 int cleanup_p;
498 498
499 down(&kprobe_mutex); 499 mutex_lock(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
501 if (unlikely(!old_p)) { 501 if (unlikely(!old_p)) {
502 up(&kprobe_mutex); 502 mutex_unlock(&kprobe_mutex);
503 return; 503 return;
504 } 504 }
505 if (p != old_p) { 505 if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
507 if (list_p == p) 507 if (list_p == p)
508 /* kprobe p is a valid probe */ 508 /* kprobe p is a valid probe */
509 goto valid_p; 509 goto valid_p;
510 up(&kprobe_mutex); 510 mutex_unlock(&kprobe_mutex);
511 return; 511 return;
512 } 512 }
513valid_p: 513valid_p:
@@ -523,7 +523,7 @@ valid_p:
523 cleanup_p = 0; 523 cleanup_p = 0;
524 } 524 }
525 525
526 up(&kprobe_mutex); 526 mutex_unlock(&kprobe_mutex);
527 527
528 synchronize_sched(); 528 synchronize_sched();
529 if (p->mod_refcounted && 529 if (p->mod_refcounted &&
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f2690ed745..f119e098e6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -22,7 +22,7 @@ static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
22static struct subsys_attribute _name##_attr = \ 22static struct subsys_attribute _name##_attr = \
23 __ATTR(_name, 0644, _name##_show, _name##_store) 23 __ATTR(_name, 0644, _name##_show, _name##_store)
24 24
25#ifdef CONFIG_HOTPLUG 25#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
26/* current uevent sequence number */ 26/* current uevent sequence number */
27static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) 27static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
28{ 28{
@@ -52,7 +52,7 @@ decl_subsys(kernel, NULL, NULL);
52EXPORT_SYMBOL_GPL(kernel_subsys); 52EXPORT_SYMBOL_GPL(kernel_subsys);
53 53
54static struct attribute * kernel_attrs[] = { 54static struct attribute * kernel_attrs[] = {
55#ifdef CONFIG_HOTPLUG 55#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
56 &uevent_seqnum_attr.attr, 56 &uevent_seqnum_attr.attr,
57 &uevent_helper_attr.attr, 57 &uevent_helper_attr.attr,
58#endif 58#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a109..c5f3c6613b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
12#include <linux/unistd.h> 12#include <linux/unistd.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h>
15#include <asm/semaphore.h> 16#include <asm/semaphore.h>
16 17
17/* 18/*
@@ -41,7 +42,7 @@ struct kthread_stop_info
41 42
42/* Thread stopping is done by setthing this var: lock serializes 43/* Thread stopping is done by setthing this var: lock serializes
43 * multiple kthread_stop calls. */ 44 * multiple kthread_stop calls. */
44static DECLARE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
45static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
46 47
47int kthread_should_stop(void) 48int kthread_should_stop(void)
@@ -114,7 +115,9 @@ static void keventd_create_kthread(void *_create)
114 create->result = ERR_PTR(pid); 115 create->result = ERR_PTR(pid);
115 } else { 116 } else {
116 wait_for_completion(&create->started); 117 wait_for_completion(&create->started);
118 read_lock(&tasklist_lock);
117 create->result = find_task_by_pid(pid); 119 create->result = find_task_by_pid(pid);
120 read_unlock(&tasklist_lock);
118 } 121 }
119 complete(&create->done); 122 complete(&create->done);
120} 123}
@@ -173,7 +176,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
173{ 176{
174 int ret; 177 int ret;
175 178
176 down(&kthread_stop_lock); 179 mutex_lock(&kthread_stop_lock);
177 180
178 /* It could exit after stop_info.k set, but before wake_up_process. */ 181 /* It could exit after stop_info.k set, but before wake_up_process. */
179 get_task_struct(k); 182 get_task_struct(k);
@@ -194,7 +197,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
194 wait_for_completion(&kthread_stop_info.done); 197 wait_for_completion(&kthread_stop_info.done);
195 kthread_stop_info.k = NULL; 198 kthread_stop_info.k = NULL;
196 ret = kthread_stop_info.err; 199 ret = kthread_stop_info.err;
197 up(&kthread_stop_lock); 200 mutex_unlock(&kthread_stop_lock);
198 201
199 return ret; 202 return ret;
200} 203}
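kprobes, kthread and (below) module all get the same mechanical conversion from a semaphore used as a mutex to the dedicated mutex API. The pattern, as a minimal sketch with illustrative names:

#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);           /* was: static DECLARE_MUTEX(my_lock); */

static void touch_shared_state(void)
{
        mutex_lock(&my_lock);           /* was: down(&my_lock); */
        /* ... critical section ... */
        mutex_unlock(&my_lock);         /* was: up(&my_lock);   */
}

mutex_lock_interruptible() replaces down_interruptible() in the same way, as the sys_delete_module() hunk below shows.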
diff --git a/kernel/module.c b/kernel/module.c
index 77764f22f0..bd088a7c14 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/semaphore.h> 44#include <asm/semaphore.h>
44#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
@@ -60,29 +61,20 @@
60static DEFINE_SPINLOCK(modlist_lock); 61static DEFINE_SPINLOCK(modlist_lock);
61 62
62/* List of modules, protected by module_mutex AND modlist_lock */ 63/* List of modules, protected by module_mutex AND modlist_lock */
63static DECLARE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
64static LIST_HEAD(modules); 65static LIST_HEAD(modules);
65 66
66static DECLARE_MUTEX(notify_mutex); 67static BLOCKING_NOTIFIER_HEAD(module_notify_list);
67static struct notifier_block * module_notify_list;
68 68
69int register_module_notifier(struct notifier_block * nb) 69int register_module_notifier(struct notifier_block * nb)
70{ 70{
71 int err; 71 return blocking_notifier_chain_register(&module_notify_list, nb);
72 down(&notify_mutex);
73 err = notifier_chain_register(&module_notify_list, nb);
74 up(&notify_mutex);
75 return err;
76} 72}
77EXPORT_SYMBOL(register_module_notifier); 73EXPORT_SYMBOL(register_module_notifier);
78 74
79int unregister_module_notifier(struct notifier_block * nb) 75int unregister_module_notifier(struct notifier_block * nb)
80{ 76{
81 int err; 77 return blocking_notifier_chain_unregister(&module_notify_list, nb);
82 down(&notify_mutex);
83 err = notifier_chain_unregister(&module_notify_list, nb);
84 up(&notify_mutex);
85 return err;
86} 78}
87EXPORT_SYMBOL(unregister_module_notifier); 79EXPORT_SYMBOL(unregister_module_notifier);
88 80
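Since the chain is now a BLOCKING_NOTIFIER_HEAD, it carries its own locking and the private notify_mutex goes away; callers of register/unregister are unchanged. A hedged sketch of a client of this interface (the callback body is illustrative):

#include <linux/module.h>
#include <linux/notifier.h>

static int my_module_event(struct notifier_block *nb,
                           unsigned long action, void *data)
{
        /* 'data' is the struct module the notification is about. */
        return NOTIFY_OK;
}

static struct notifier_block my_module_nb = {
        .notifier_call = my_module_event,
};

static int __init my_client_init(void)
{
        return register_module_notifier(&my_module_nb);
}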
@@ -135,7 +127,7 @@ extern const unsigned long __start___kcrctab_gpl_future[];
135#ifndef CONFIG_MODVERSIONS 127#ifndef CONFIG_MODVERSIONS
136#define symversion(base, idx) NULL 128#define symversion(base, idx) NULL
137#else 129#else
138#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 130#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
139#endif 131#endif
140 132
141/* lookup symbol in given range of kernel_symbols */ 133/* lookup symbol in given range of kernel_symbols */
@@ -232,24 +224,6 @@ static unsigned long __find_symbol(const char *name,
232 return 0; 224 return 0;
233} 225}
234 226
235/* Find a symbol in this elf symbol table */
236static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
237 unsigned int symindex,
238 const char *strtab,
239 const char *name)
240{
241 unsigned int i;
242 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
243
244 /* Search (defined) internal symbols first. */
245 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
246 if (sym[i].st_shndx != SHN_UNDEF
247 && strcmp(name, strtab + sym[i].st_name) == 0)
248 return sym[i].st_value;
249 }
250 return 0;
251}
252
253/* Search for module by name: must hold module_mutex. */ 227/* Search for module by name: must hold module_mutex. */
254static struct module *find_module(const char *name) 228static struct module *find_module(const char *name)
255{ 229{
@@ -601,7 +575,7 @@ static void free_module(struct module *mod);
601static void wait_for_zero_refcount(struct module *mod) 575static void wait_for_zero_refcount(struct module *mod)
602{ 576{
603 /* Since we might sleep for some time, drop the semaphore first */ 577 /* Since we might sleep for some time, drop the semaphore first */
604 up(&module_mutex); 578 mutex_unlock(&module_mutex);
605 for (;;) { 579 for (;;) {
606 DEBUGP("Looking at refcount...\n"); 580 DEBUGP("Looking at refcount...\n");
607 set_current_state(TASK_UNINTERRUPTIBLE); 581 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -610,7 +584,7 @@ static void wait_for_zero_refcount(struct module *mod)
610 schedule(); 584 schedule();
611 } 585 }
612 current->state = TASK_RUNNING; 586 current->state = TASK_RUNNING;
613 down(&module_mutex); 587 mutex_lock(&module_mutex);
614} 588}
615 589
616asmlinkage long 590asmlinkage long
@@ -627,7 +601,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
627 return -EFAULT; 601 return -EFAULT;
628 name[MODULE_NAME_LEN-1] = '\0'; 602 name[MODULE_NAME_LEN-1] = '\0';
629 603
630 if (down_interruptible(&module_mutex) != 0) 604 if (mutex_lock_interruptible(&module_mutex) != 0)
631 return -EINTR; 605 return -EINTR;
632 606
633 mod = find_module(name); 607 mod = find_module(name);
@@ -676,14 +650,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
676 650
677 /* Final destruction now no one is using it. */ 651 /* Final destruction now no one is using it. */
678 if (mod->exit != NULL) { 652 if (mod->exit != NULL) {
679 up(&module_mutex); 653 mutex_unlock(&module_mutex);
680 mod->exit(); 654 mod->exit();
681 down(&module_mutex); 655 mutex_lock(&module_mutex);
682 } 656 }
683 free_module(mod); 657 free_module(mod);
684 658
685 out: 659 out:
686 up(&module_mutex); 660 mutex_unlock(&module_mutex);
687 return ret; 661 return ret;
688} 662}
689 663
@@ -784,139 +758,6 @@ static struct module_attribute *modinfo_attrs[] = {
784 NULL, 758 NULL,
785}; 759};
786 760
787#ifdef CONFIG_OBSOLETE_MODPARM
788/* Bounds checking done below */
789static int obsparm_copy_string(const char *val, struct kernel_param *kp)
790{
791 strcpy(kp->arg, val);
792 return 0;
793}
794
795static int set_obsolete(const char *val, struct kernel_param *kp)
796{
797 unsigned int min, max;
798 unsigned int size, maxsize;
799 int dummy;
800 char *endp;
801 const char *p;
802 struct obsolete_modparm *obsparm = kp->arg;
803
804 if (!val) {
805 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
806 return -EINVAL;
807 }
808
809 /* type is: [min[-max]]{b,h,i,l,s} */
810 p = obsparm->type;
811 min = simple_strtol(p, &endp, 10);
812 if (endp == obsparm->type)
813 min = max = 1;
814 else if (*endp == '-') {
815 p = endp+1;
816 max = simple_strtol(p, &endp, 10);
817 } else
818 max = min;
819 switch (*endp) {
820 case 'b':
821 return param_array(kp->name, val, min, max, obsparm->addr,
822 1, param_set_byte, &dummy);
823 case 'h':
824 return param_array(kp->name, val, min, max, obsparm->addr,
825 sizeof(short), param_set_short, &dummy);
826 case 'i':
827 return param_array(kp->name, val, min, max, obsparm->addr,
828 sizeof(int), param_set_int, &dummy);
829 case 'l':
830 return param_array(kp->name, val, min, max, obsparm->addr,
831 sizeof(long), param_set_long, &dummy);
832 case 's':
833 return param_array(kp->name, val, min, max, obsparm->addr,
834 sizeof(char *), param_set_charp, &dummy);
835
836 case 'c':
837 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
838 and the decl is "char xxx[5][50];" */
839 p = endp+1;
840 maxsize = simple_strtol(p, &endp, 10);
841 /* We check lengths here (yes, this is a hack). */
842 p = val;
843 while (p[size = strcspn(p, ",")]) {
844 if (size >= maxsize)
845 goto oversize;
846 p += size+1;
847 }
848 if (size >= maxsize)
849 goto oversize;
850 return param_array(kp->name, val, min, max, obsparm->addr,
851 maxsize, obsparm_copy_string, &dummy);
852 }
853 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
854 return -EINVAL;
855 oversize:
856 printk(KERN_ERR
857 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
858 return -EINVAL;
859}
860
861static int obsolete_params(const char *name,
862 char *args,
863 struct obsolete_modparm obsparm[],
864 unsigned int num,
865 Elf_Shdr *sechdrs,
866 unsigned int symindex,
867 const char *strtab)
868{
869 struct kernel_param *kp;
870 unsigned int i;
871 int ret;
872
873 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
874 if (!kp)
875 return -ENOMEM;
876
877 for (i = 0; i < num; i++) {
878 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
879
880 snprintf(sym_name, sizeof(sym_name), "%s%s",
881 MODULE_SYMBOL_PREFIX, obsparm[i].name);
882
883 kp[i].name = obsparm[i].name;
884 kp[i].perm = 000;
885 kp[i].set = set_obsolete;
886 kp[i].get = NULL;
887 obsparm[i].addr
888 = (void *)find_local_symbol(sechdrs, symindex, strtab,
889 sym_name);
890 if (!obsparm[i].addr) {
891 printk("%s: falsely claims to have parameter %s\n",
892 name, obsparm[i].name);
893 ret = -EINVAL;
894 goto out;
895 }
896 kp[i].arg = &obsparm[i];
897 }
898
899 ret = parse_args(name, args, kp, num, NULL);
900 out:
901 kfree(kp);
902 return ret;
903}
904#else
905static int obsolete_params(const char *name,
906 char *args,
907 struct obsolete_modparm obsparm[],
908 unsigned int num,
909 Elf_Shdr *sechdrs,
910 unsigned int symindex,
911 const char *strtab)
912{
913 if (num != 0)
914 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
915 name);
916 return 0;
917}
918#endif /* CONFIG_OBSOLETE_MODPARM */
919
920static const char vermagic[] = VERMAGIC_STRING; 761static const char vermagic[] = VERMAGIC_STRING;
921 762
922#ifdef CONFIG_MODVERSIONS 763#ifdef CONFIG_MODVERSIONS
@@ -1571,7 +1412,6 @@ static struct module *load_module(void __user *umod,
1571 exportindex, modindex, obsparmindex, infoindex, gplindex, 1412 exportindex, modindex, obsparmindex, infoindex, gplindex,
1572 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1413 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
1573 gplfuturecrcindex; 1414 gplfuturecrcindex;
1574 long arglen;
1575 struct module *mod; 1415 struct module *mod;
1576 long err = 0; 1416 long err = 0;
1577 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1417 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1690,23 +1530,11 @@ static struct module *load_module(void __user *umod,
1690 } 1530 }
1691 1531
1692 /* Now copy in args */ 1532 /* Now copy in args */
1693 arglen = strlen_user(uargs); 1533 args = strndup_user(uargs, ~0UL >> 1);
1694 if (!arglen) { 1534 if (IS_ERR(args)) {
1695 err = -EFAULT; 1535 err = PTR_ERR(args);
1696 goto free_hdr; 1536 goto free_hdr;
1697 } 1537 }
1698 args = kmalloc(arglen, GFP_KERNEL);
1699 if (!args) {
1700 err = -ENOMEM;
1701 goto free_hdr;
1702 }
1703 if (copy_from_user(args, uargs, arglen) != 0) {
1704 err = -EFAULT;
1705 goto free_mod;
1706 }
1707
1708 /* Userspace could have altered the string after the strlen_user() */
1709 args[arglen - 1] = '\0';
1710 1538
1711 if (find_module(mod->name)) { 1539 if (find_module(mod->name)) {
1712 err = -EEXIST; 1540 err = -EEXIST;
@@ -1886,27 +1714,17 @@ static struct module *load_module(void __user *umod,
1886 set_fs(old_fs); 1714 set_fs(old_fs);
1887 1715
1888 mod->args = args; 1716 mod->args = args;
1889 if (obsparmindex) { 1717 if (obsparmindex)
1890 err = obsolete_params(mod->name, mod->args, 1718 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
1891 (struct obsolete_modparm *) 1719 mod->name);
1892 sechdrs[obsparmindex].sh_addr, 1720
1893 sechdrs[obsparmindex].sh_size 1721 /* Size of section 0 is 0, so this works well if no params */
1894 / sizeof(struct obsolete_modparm), 1722 err = parse_args(mod->name, mod->args,
1895 sechdrs, symindex, 1723 (struct kernel_param *)
1896 (char *)sechdrs[strindex].sh_addr); 1724 sechdrs[setupindex].sh_addr,
1897 if (setupindex) 1725 sechdrs[setupindex].sh_size
1898 printk(KERN_WARNING "%s: Ignoring new-style " 1726 / sizeof(struct kernel_param),
1899 "parameters in presence of obsolete ones\n", 1727 NULL);
1900 mod->name);
1901 } else {
1902 /* Size of section 0 is 0, so this works well if no params */
1903 err = parse_args(mod->name, mod->args,
1904 (struct kernel_param *)
1905 sechdrs[setupindex].sh_addr,
1906 sechdrs[setupindex].sh_size
1907 / sizeof(struct kernel_param),
1908 NULL);
1909 }
1910 if (err < 0) 1728 if (err < 0)
1911 goto arch_cleanup; 1729 goto arch_cleanup;
1912 1730
@@ -1972,13 +1790,13 @@ sys_init_module(void __user *umod,
1972 return -EPERM; 1790 return -EPERM;
1973 1791
1974 /* Only one module load at a time, please */ 1792 /* Only one module load at a time, please */
1975 if (down_interruptible(&module_mutex) != 0) 1793 if (mutex_lock_interruptible(&module_mutex) != 0)
1976 return -EINTR; 1794 return -EINTR;
1977 1795
1978 /* Do all the hard work */ 1796 /* Do all the hard work */
1979 mod = load_module(umod, len, uargs); 1797 mod = load_module(umod, len, uargs);
1980 if (IS_ERR(mod)) { 1798 if (IS_ERR(mod)) {
1981 up(&module_mutex); 1799 mutex_unlock(&module_mutex);
1982 return PTR_ERR(mod); 1800 return PTR_ERR(mod);
1983 } 1801 }
1984 1802
@@ -1987,11 +1805,10 @@ sys_init_module(void __user *umod,
1987 stop_machine_run(__link_module, mod, NR_CPUS); 1805 stop_machine_run(__link_module, mod, NR_CPUS);
1988 1806
1989 /* Drop lock so they can recurse */ 1807 /* Drop lock so they can recurse */
1990 up(&module_mutex); 1808 mutex_unlock(&module_mutex);
1991 1809
1992 down(&notify_mutex); 1810 blocking_notifier_call_chain(&module_notify_list,
1993 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1811 MODULE_STATE_COMING, mod);
1994 up(&notify_mutex);
1995 1812
1996 /* Start the module */ 1813 /* Start the module */
1997 if (mod->init != NULL) 1814 if (mod->init != NULL)
@@ -2006,15 +1823,15 @@ sys_init_module(void __user *umod,
2006 mod->name); 1823 mod->name);
2007 else { 1824 else {
2008 module_put(mod); 1825 module_put(mod);
2009 down(&module_mutex); 1826 mutex_lock(&module_mutex);
2010 free_module(mod); 1827 free_module(mod);
2011 up(&module_mutex); 1828 mutex_unlock(&module_mutex);
2012 } 1829 }
2013 return ret; 1830 return ret;
2014 } 1831 }
2015 1832
2016 /* Now it's a first class citizen! */ 1833 /* Now it's a first class citizen! */
2017 down(&module_mutex); 1834 mutex_lock(&module_mutex);
2018 mod->state = MODULE_STATE_LIVE; 1835 mod->state = MODULE_STATE_LIVE;
2019 /* Drop initial reference. */ 1836 /* Drop initial reference. */
2020 module_put(mod); 1837 module_put(mod);
@@ -2022,7 +1839,7 @@ sys_init_module(void __user *umod,
2022 mod->module_init = NULL; 1839 mod->module_init = NULL;
2023 mod->init_size = 0; 1840 mod->init_size = 0;
2024 mod->init_text_size = 0; 1841 mod->init_text_size = 0;
2025 up(&module_mutex); 1842 mutex_unlock(&module_mutex);
2026 1843
2027 return 0; 1844 return 0;
2028} 1845}
@@ -2112,7 +1929,7 @@ struct module *module_get_kallsym(unsigned int symnum,
2112{ 1929{
2113 struct module *mod; 1930 struct module *mod;
2114 1931
2115 down(&module_mutex); 1932 mutex_lock(&module_mutex);
2116 list_for_each_entry(mod, &modules, list) { 1933 list_for_each_entry(mod, &modules, list) {
2117 if (symnum < mod->num_symtab) { 1934 if (symnum < mod->num_symtab) {
2118 *value = mod->symtab[symnum].st_value; 1935 *value = mod->symtab[symnum].st_value;
@@ -2120,12 +1937,12 @@ struct module *module_get_kallsym(unsigned int symnum,
2120 strncpy(namebuf, 1937 strncpy(namebuf,
2121 mod->strtab + mod->symtab[symnum].st_name, 1938 mod->strtab + mod->symtab[symnum].st_name,
2122 127); 1939 127);
2123 up(&module_mutex); 1940 mutex_unlock(&module_mutex);
2124 return mod; 1941 return mod;
2125 } 1942 }
2126 symnum -= mod->num_symtab; 1943 symnum -= mod->num_symtab;
2127 } 1944 }
2128 up(&module_mutex); 1945 mutex_unlock(&module_mutex);
2129 return NULL; 1946 return NULL;
2130} 1947}
2131 1948
@@ -2168,7 +1985,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
2168 struct list_head *i; 1985 struct list_head *i;
2169 loff_t n = 0; 1986 loff_t n = 0;
2170 1987
2171 down(&module_mutex); 1988 mutex_lock(&module_mutex);
2172 list_for_each(i, &modules) { 1989 list_for_each(i, &modules) {
2173 if (n++ == *pos) 1990 if (n++ == *pos)
2174 break; 1991 break;
@@ -2189,7 +2006,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2189 2006
2190static void m_stop(struct seq_file *m, void *p) 2007static void m_stop(struct seq_file *m, void *p)
2191{ 2008{
2192 up(&module_mutex); 2009 mutex_unlock(&module_mutex);
2193} 2010}
2194 2011
2195static int m_show(struct seq_file *m, void *p) 2012static int m_show(struct seq_file *m, void *p)
diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c..f895c7c01d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,13 +20,16 @@
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h> 21#include <linux/kexec.h>
22 22
23int panic_timeout;
24int panic_on_oops; 23int panic_on_oops;
25int tainted; 24int tainted;
25static int pause_on_oops;
26static int pause_on_oops_flag;
27static DEFINE_SPINLOCK(pause_on_oops_lock);
26 28
29int panic_timeout;
27EXPORT_SYMBOL(panic_timeout); 30EXPORT_SYMBOL(panic_timeout);
28 31
29struct notifier_block *panic_notifier_list; 32ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
30 33
31EXPORT_SYMBOL(panic_notifier_list); 34EXPORT_SYMBOL(panic_notifier_list);
32 35
@@ -94,7 +97,7 @@ NORET_TYPE void panic(const char * fmt, ...)
94 smp_send_stop(); 97 smp_send_stop();
95#endif 98#endif
96 99
97 notifier_call_chain(&panic_notifier_list, 0, buf); 100 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
98 101
99 if (!panic_blink) 102 if (!panic_blink)
100 panic_blink = no_blink; 103 panic_blink = no_blink;
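The two hunks above convert the panic notifier chain to the new atomic notifier API. As a hedged sketch (the callback and function names below are made up, but atomic_notifier_chain_register() and the notifier_block callback signature are the standard interface), a driver that wants to log something at panic time would now hook the chain roughly like this:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>

/* Illustrative callback: called atomically with the panic message buffer. */
static int example_panic_event(struct notifier_block *nb,
			       unsigned long event, void *buf)
{
	printk(KERN_EMERG "example: panic: %s\n", (char *)buf);
	return NOTIFY_DONE;
}

static struct notifier_block example_panic_nb = {
	.notifier_call = example_panic_event,
};

/* Hypothetical init path for the hypothetical driver. */
static int __init example_init(void)
{
	return atomic_notifier_chain_register(&panic_notifier_list,
					      &example_panic_nb);
}
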
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
174 tainted |= flag; 177 tainted |= flag;
175} 178}
176EXPORT_SYMBOL(add_taint); 179EXPORT_SYMBOL(add_taint);
180
181static int __init pause_on_oops_setup(char *str)
182{
183 pause_on_oops = simple_strtoul(str, NULL, 0);
184 return 1;
185}
186__setup("pause_on_oops=", pause_on_oops_setup);
187
188static void spin_msec(int msecs)
189{
190 int i;
191
192 for (i = 0; i < msecs; i++) {
193 touch_nmi_watchdog();
194 mdelay(1);
195 }
196}
197
198/*
199 * It just happens that oops_enter() and oops_exit() are identically
200 * implemented...
201 */
202static void do_oops_enter_exit(void)
203{
204 unsigned long flags;
205 static int spin_counter;
206
207 if (!pause_on_oops)
208 return;
209
210 spin_lock_irqsave(&pause_on_oops_lock, flags);
211 if (pause_on_oops_flag == 0) {
212 /* This CPU may now print the oops message */
213 pause_on_oops_flag = 1;
214 } else {
215 /* We need to stall this CPU */
216 if (!spin_counter) {
217 /* This CPU gets to do the counting */
218 spin_counter = pause_on_oops;
219 do {
220 spin_unlock(&pause_on_oops_lock);
221 spin_msec(MSEC_PER_SEC);
222 spin_lock(&pause_on_oops_lock);
223 } while (--spin_counter);
224 pause_on_oops_flag = 0;
225 } else {
226 /* This CPU waits for a different one */
227 while (spin_counter) {
228 spin_unlock(&pause_on_oops_lock);
229 spin_msec(1);
230 spin_lock(&pause_on_oops_lock);
231 }
232 }
233 }
234 spin_unlock_irqrestore(&pause_on_oops_lock, flags);
235}
236
237/*
238 * Return true if the calling CPU is allowed to print oops-related info. This
239 * is a bit racy..
240 */
241int oops_may_print(void)
242{
243 return pause_on_oops_flag == 0;
244}
245
246/*
247 * Called when the architecture enters its oops handler, before it prints
248 * anything. If this is the first CPU to oops, and it's oopsing the first time
249 * then let it proceed.
250 *
251 * This is all enabled by the pause_on_oops kernel boot option. We do all this
252 * to ensure that oopses don't scroll off the screen. It has the side-effect
253 * of preventing later-oopsing CPUs from mucking up the display, too.
254 *
255 * It turns out that the CPU which is allowed to print ends up pausing for the
256 * right duration, whereas all the other CPUs pause for twice as long: once in
257 * oops_enter(), once in oops_exit().
258 */
259void oops_enter(void)
260{
261 do_oops_enter_exit();
262}
263
264/*
265 * Called when the architecture exits its oops handler, after printing
266 * everything.
267 */
268void oops_exit(void)
269{
270 do_oops_enter_exit();
271}
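The helpers above are meant to bracket an architecture's oops printing; booting with, e.g., pause_on_oops=30 makes every other oopsing CPU spin in roughly thirty one-second steps so the first report stays on the screen. A rough, illustrative sketch of the arch-side shape (example_die() is hypothetical; real per-architecture die() handlers differ):

#include <linux/kernel.h>
#include <asm/ptrace.h>

/* Illustrative only: how an arch oops path would use oops_enter()/oops_exit(). */
void example_die(const char *str, struct pt_regs *regs, long err)
{
	oops_enter();		/* may spin while another CPU finishes its oops */
	console_verbose();
	printk(KERN_EMERG "%s: %04lx\n", str, err & 0xffff);
	/* ... dump registers and a backtrace from *regs ... */
	oops_exit();		/* later-oopsing CPUs pause here a second time */
}
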
diff --git a/kernel/params.c b/kernel/params.c
index a291505823..af43ecdc8d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34static inline int dash2underscore(char c) 34static inline char dash2underscore(char c)
35{ 35{
36 if (c == '-') 36 if (c == '-')
37 return '_'; 37 return '_';
@@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
265} 265}
266 266
267/* We cheat here and temporarily mangle the string. */ 267/* We cheat here and temporarily mangle the string. */
268int param_array(const char *name, 268static int param_array(const char *name,
269 const char *val, 269 const char *val,
270 unsigned int min, unsigned int max, 270 unsigned int min, unsigned int max,
271 void *elem, int elemsize, 271 void *elem, int elemsize,
272 int (*set)(const char *, struct kernel_param *kp), 272 int (*set)(const char *, struct kernel_param *kp),
273 int *num) 273 int *num)
274{ 274{
275 int ret; 275 int ret;
276 struct kernel_param kp; 276 struct kernel_param kp;
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc072469..a9f2dfd006 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -218,36 +218,6 @@ task_t *find_task_by_pid_type(int type, int nr)
218EXPORT_SYMBOL(find_task_by_pid_type); 218EXPORT_SYMBOL(find_task_by_pid_type);
219 219
220/* 220/*
221 * This function switches the PIDs if a non-leader thread calls
222 * sys_execve() - this must be done without releasing the PID.
223 * (which a detach_pid() would eventually do.)
224 */
225void switch_exec_pids(task_t *leader, task_t *thread)
226{
227 __detach_pid(leader, PIDTYPE_PID);
228 __detach_pid(leader, PIDTYPE_TGID);
229 __detach_pid(leader, PIDTYPE_PGID);
230 __detach_pid(leader, PIDTYPE_SID);
231
232 __detach_pid(thread, PIDTYPE_PID);
233 __detach_pid(thread, PIDTYPE_TGID);
234
235 leader->pid = leader->tgid = thread->pid;
236 thread->pid = thread->tgid;
237
238 attach_pid(thread, PIDTYPE_PID, thread->pid);
239 attach_pid(thread, PIDTYPE_TGID, thread->tgid);
240 attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
241 attach_pid(thread, PIDTYPE_SID, thread->signal->session);
242 list_add_tail(&thread->tasks, &init_task.tasks);
243
244 attach_pid(leader, PIDTYPE_PID, leader->pid);
245 attach_pid(leader, PIDTYPE_TGID, leader->tgid);
246 attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
247 attach_pid(leader, PIDTYPE_SID, leader->signal->session);
248}
249
250/*
251 * The pid hash table is scaled according to the amount of memory in the 221 * The pid hash table is scaled according to the amount of memory in the
252 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 222 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
253 * more. 223 * more.
@@ -277,16 +247,8 @@ void __init pidhash_init(void)
277 247
278void __init pidmap_init(void) 248void __init pidmap_init(void)
279{ 249{
280 int i;
281
282 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); 250 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
251 /* Reserve PID 0. We never call free_pidmap(0) */
283 set_bit(0, pidmap_array->page); 252 set_bit(0, pidmap_array->page);
284 atomic_dec(&pidmap_array->nr_free); 253 atomic_dec(&pidmap_array->nr_free);
285
286 /*
287 * Allocate PID 0, and hash it via all PID types:
288 */
289
290 for (i = 0; i < PIDTYPE_MAX; i++)
291 attach_pid(current, i, 0);
292} 254}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fa895fc2ec..ac6dc87444 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/mutex.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40#include <asm/semaphore.h> 41#include <asm/semaphore.h>
@@ -144,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
144 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
145static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
146 147
147static int posix_timer_fn(void *data); 148static int posix_timer_fn(struct hrtimer *data);
148 149
149static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
150 151
@@ -250,15 +251,18 @@ __initcall(init_posix_timers);
250 251
251static void schedule_next_timer(struct k_itimer *timr) 252static void schedule_next_timer(struct k_itimer *timr)
252{ 253{
254 struct hrtimer *timer = &timr->it.real.timer;
255
253 if (timr->it.real.interval.tv64 == 0) 256 if (timr->it.real.interval.tv64 == 0)
254 return; 257 return;
255 258
256 timr->it_overrun += hrtimer_forward(&timr->it.real.timer, 259 timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
257 timr->it.real.interval); 260 timr->it.real.interval);
261
258 timr->it_overrun_last = timr->it_overrun; 262 timr->it_overrun_last = timr->it_overrun;
259 timr->it_overrun = -1; 263 timr->it_overrun = -1;
260 ++timr->it_requeue_pending; 264 ++timr->it_requeue_pending;
261 hrtimer_restart(&timr->it.real.timer); 265 hrtimer_restart(timer);
262} 266}
263 267
264/* 268/*
@@ -330,13 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
330 334
331 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
332 */ 336 */
333static int posix_timer_fn(void *data) 337static int posix_timer_fn(struct hrtimer *timer)
334{ 338{
335 struct k_itimer *timr = data; 339 struct k_itimer *timr;
336 unsigned long flags; 340 unsigned long flags;
337 int si_private = 0; 341 int si_private = 0;
338 int ret = HRTIMER_NORESTART; 342 int ret = HRTIMER_NORESTART;
339 343
344 timr = container_of(timer, struct k_itimer, it.real.timer);
340 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
341 346
342 if (timr->it.real.interval.tv64 != 0) 347 if (timr->it.real.interval.tv64 != 0)
@@ -350,7 +355,8 @@ static int posix_timer_fn(void *data)
350 */ 355 */
351 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
352 timr->it_overrun += 357 timr->it_overrun +=
353 hrtimer_forward(&timr->it.real.timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time,
354 timr->it.real.interval); 360 timr->it.real.interval);
355 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
356 ++timr->it_requeue_pending; 362 ++timr->it_requeue_pending;
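This hunk follows the new hrtimer callback convention used throughout the patch: the callback receives the struct hrtimer itself and recovers its enclosing object with container_of() instead of going through a .data pointer. A minimal sketch of that pattern outside posix-timers (names are illustrative; the HRTIMER_REL mode and int-returning callback match the API as it appears in this diff):

#include <linux/kernel.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/time.h>

struct example_obj {
	struct hrtimer	timer;		/* embedded, no .data field needed */
	int		fired;
};

static int example_timer_fn(struct hrtimer *timer)
{
	struct example_obj *obj = container_of(timer, struct example_obj, timer);

	obj->fired = 1;
	return HRTIMER_NORESTART;
}

static void example_arm(struct example_obj *obj)
{
	hrtimer_init(&obj->timer, CLOCK_MONOTONIC, HRTIMER_REL);
	obj->timer.function = example_timer_fn;
	hrtimer_start(&obj->timer, ktime_set(1, 0), HRTIMER_REL);	/* 1 s, relative */
}
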
@@ -602,38 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
602static void 608static void
603common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 609common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
604{ 610{
605 ktime_t remaining; 611 ktime_t now, remaining, iv;
606 struct hrtimer *timer = &timr->it.real.timer; 612 struct hrtimer *timer = &timr->it.real.timer;
607 613
608 memset(cur_setting, 0, sizeof(struct itimerspec)); 614 memset(cur_setting, 0, sizeof(struct itimerspec));
609 remaining = hrtimer_get_remaining(timer);
610 615
611 /* Time left ? or timer pending */ 616 iv = timr->it.real.interval;
612 if (remaining.tv64 > 0 || hrtimer_active(timer)) 617
613 goto calci;
614 /* interval timer ? */ 618 /* interval timer ? */
615 if (timr->it.real.interval.tv64 == 0) 619 if (iv.tv64)
620 cur_setting->it_interval = ktime_to_timespec(iv);
621 else if (!hrtimer_active(timer) &&
622 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
616 return; 623 return;
624
625 now = timer->base->get_time();
626
617 /* 627 /*
618 * When a requeue is pending or this is a SIGEV_NONE timer 628 * When a requeue is pending or this is a SIGEV_NONE
619 * move the expiry time forward by intervals, so expiry is > 629 * timer move the expiry time forward by intervals, so
620 * now. 630 * expiry is > now.
621 */ 631 */
622 if (timr->it_requeue_pending & REQUEUE_PENDING || 632 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
623 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { 633 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
624 timr->it_overrun += 634 timr->it_overrun += hrtimer_forward(timer, now, iv);
625 hrtimer_forward(timer, timr->it.real.interval); 635
626 remaining = hrtimer_get_remaining(timer); 636 remaining = ktime_sub(timer->expires, now);
627 }
628 calci:
629 /* interval timer ? */
630 if (timr->it.real.interval.tv64 != 0)
631 cur_setting->it_interval =
632 ktime_to_timespec(timr->it.real.interval);
633 /* Return 0 only, when the timer is expired and not pending */ 637 /* Return 0 only, when the timer is expired and not pending */
634 if (remaining.tv64 <= 0) 638 if (remaining.tv64 <= 0) {
635 cur_setting->it_value.tv_nsec = 1; 639 /*
636 else 640 * A single shot SIGEV_NONE timer must return 0, when
641 * it is expired !
642 */
643 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
644 cur_setting->it_value.tv_nsec = 1;
645 } else
637 cur_setting->it_value = ktime_to_timespec(remaining); 646 cur_setting->it_value = ktime_to_timespec(remaining);
638} 647}
639 648
@@ -716,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
716 725
717 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 726 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
718 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 727 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
719 timr->it.real.timer.data = timr;
720 timr->it.real.timer.function = posix_timer_fn; 728 timr->it.real.timer.function = posix_timer_fn;
721 729
722 timer->expires = timespec_to_ktime(new_setting->it_value); 730 timer->expires = timespec_to_ktime(new_setting->it_value);
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96..8d0af3d37a 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
5 5
6obj-y := main.o process.o console.o 6obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o 10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11 11
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc9..81d4d982f3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
22#include "power.h" 22#include "power.h"
23 23
24 24
25extern suspend_disk_method_t pm_disk_mode;
26
27extern int swsusp_shrink_memory(void);
28extern int swsusp_suspend(void);
29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
30extern int swsusp_check(void);
31extern int swsusp_read(struct pbe **pblist_ptr);
32extern void swsusp_close(void);
33extern int swsusp_resume(void);
34
35
36static int noresume = 0; 25static int noresume = 0;
37char resume_file[256] = CONFIG_PM_STD_PARTITION; 26char resume_file[256] = CONFIG_PM_STD_PARTITION;
38dev_t swsusp_resume_device; 27dev_t swsusp_resume_device;
@@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode)
70 while(1); 59 while(1);
71} 60}
72 61
73
74static int in_suspend __nosavedata = 0;
75
76
77static inline void platform_finish(void) 62static inline void platform_finish(void)
78{ 63{
79 if (pm_disk_mode == PM_DISK_PLATFORM) { 64 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -87,7 +72,6 @@ static int prepare_processes(void)
87 int error; 72 int error;
88 73
89 pm_prepare_console(); 74 pm_prepare_console();
90 sys_sync();
91 disable_nonboot_cpus(); 75 disable_nonboot_cpus();
92 76
93 if (freeze_processes()) { 77 if (freeze_processes()) {
@@ -145,7 +129,7 @@ int pm_suspend_disk(void)
145 if (in_suspend) { 129 if (in_suspend) {
146 device_resume(); 130 device_resume();
147 pr_debug("PM: writing image.\n"); 131 pr_debug("PM: writing image.\n");
148 error = swsusp_write(pagedir_nosave, nr_copy_pages); 132 error = swsusp_write();
149 if (!error) 133 if (!error)
150 power_down(pm_disk_mode); 134 power_down(pm_disk_mode);
151 else { 135 else {
@@ -216,7 +200,7 @@ static int software_resume(void)
216 200
217 pr_debug("PM: Reading swsusp image.\n"); 201 pr_debug("PM: Reading swsusp image.\n");
218 202
219 if ((error = swsusp_read(&pagedir_nosave))) { 203 if ((error = swsusp_read())) {
220 swsusp_free(); 204 swsusp_free();
221 goto Thaw; 205 goto Thaw;
222 } 206 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4..ee371f50cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
103} 103}
104 104
105 105
106static int suspend_enter(suspend_state_t state) 106int suspend_enter(suspend_state_t state)
107{ 107{
108 int error = 0; 108 int error = 0;
109 unsigned long flags; 109 unsigned long flags;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857..0f6908cce1 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/pm_legacy.h> 26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/mutex.h>
28 29
29int pm_active; 30int pm_active;
30 31
@@ -40,7 +41,7 @@ int pm_active;
40 * until a resume but that will be fine. 41 * until a resume but that will be fine.
41 */ 42 */
42 43
43static DECLARE_MUTEX(pm_devs_lock); 44static DEFINE_MUTEX(pm_devs_lock);
44static LIST_HEAD(pm_devs); 45static LIST_HEAD(pm_devs);
45 46
46/** 47/**
@@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type,
67 dev->id = id; 68 dev->id = id;
68 dev->callback = callback; 69 dev->callback = callback;
69 70
70 down(&pm_devs_lock); 71 mutex_lock(&pm_devs_lock);
71 list_add(&dev->entry, &pm_devs); 72 list_add(&dev->entry, &pm_devs);
72 up(&pm_devs_lock); 73 mutex_unlock(&pm_devs_lock);
73 } 74 }
74 return dev; 75 return dev;
75} 76}
@@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type,
85void pm_unregister(struct pm_dev *dev) 86void pm_unregister(struct pm_dev *dev)
86{ 87{
87 if (dev) { 88 if (dev) {
88 down(&pm_devs_lock); 89 mutex_lock(&pm_devs_lock);
89 list_del(&dev->entry); 90 list_del(&dev->entry);
90 up(&pm_devs_lock); 91 mutex_unlock(&pm_devs_lock);
91 92
92 kfree(dev); 93 kfree(dev);
93 } 94 }
@@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback)
118 if (!callback) 119 if (!callback)
119 return; 120 return;
120 121
121 down(&pm_devs_lock); 122 mutex_lock(&pm_devs_lock);
122 entry = pm_devs.next; 123 entry = pm_devs.next;
123 while (entry != &pm_devs) { 124 while (entry != &pm_devs) {
124 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 125 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback)
126 if (dev->callback == callback) 127 if (dev->callback == callback)
127 __pm_unregister(dev); 128 __pm_unregister(dev);
128 } 129 }
129 up(&pm_devs_lock); 130 mutex_unlock(&pm_devs_lock);
130} 131}
131 132
132/** 133/**
@@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data)
234{ 235{
235 struct list_head *entry; 236 struct list_head *entry;
236 237
237 down(&pm_devs_lock); 238 mutex_lock(&pm_devs_lock);
238 entry = pm_devs.next; 239 entry = pm_devs.next;
239 while (entry != &pm_devs) { 240 while (entry != &pm_devs) {
240 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 241 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data)
246 */ 247 */
247 if (rqst == PM_SUSPEND) 248 if (rqst == PM_SUSPEND)
248 pm_undo_all(dev); 249 pm_undo_all(dev);
249 up(&pm_devs_lock); 250 mutex_unlock(&pm_devs_lock);
250 return status; 251 return status;
251 } 252 }
252 } 253 }
253 entry = entry->next; 254 entry = entry->next;
254 } 255 }
255 up(&pm_devs_lock); 256 mutex_unlock(&pm_devs_lock);
256 return 0; 257 return 0;
257} 258}
258 259
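The pm.c hunks above repeat the mechanical semaphore-to-mutex conversion applied throughout this patch (module.c earlier in the diff gets the same treatment). A condensed before/after sketch with made-up names:

#include <linux/mutex.h>
#include <linux/list.h>

/* Before: static DECLARE_MUTEX(example_lock); protected with down()/up(). */
static DEFINE_MUTEX(example_lock);
static LIST_HEAD(example_list);

static void example_add(struct list_head *entry)
{
	mutex_lock(&example_lock);	/* was: down(&example_lock); */
	list_add(entry, &example_list);
	mutex_unlock(&example_lock);	/* was: up(&example_lock); */
}
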
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba6808..f06f12f217 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
8 int cpus; 8 int cpus;
9 unsigned long image_pages; 9 unsigned long image_pages;
10 unsigned long pages; 10 unsigned long pages;
11 unsigned long size;
11} __attribute__((aligned(PAGE_SIZE))); 12} __attribute__((aligned(PAGE_SIZE)));
12 13
13 14
@@ -37,21 +38,79 @@ extern struct subsystem power_subsys;
37/* References to section boundaries */ 38/* References to section boundaries */
38extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
39 40
40extern unsigned int nr_copy_pages;
41extern struct pbe *pagedir_nosave; 41extern struct pbe *pagedir_nosave;
42 42
43/* Preferred image size in bytes (default 500 MB) */ 43/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 44extern unsigned long image_size;
45extern int in_suspend;
46extern dev_t swsusp_resume_device;
45 47
46extern asmlinkage int swsusp_arch_suspend(void); 48extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 49extern asmlinkage int swsusp_arch_resume(void);
48 50
49extern unsigned int count_data_pages(void); 51extern unsigned int count_data_pages(void);
50extern void free_pagedir(struct pbe *pblist); 52
51extern void release_eaten_pages(void); 53struct snapshot_handle {
52extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 54 loff_t offset;
55 unsigned int page;
56 unsigned int page_offset;
57 unsigned int prev;
58 struct pbe *pbe;
59 void *buffer;
60 unsigned int buf_offset;
61};
62
63#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle);
68
69#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
71#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
72#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
73#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
74#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
75#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
76#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
77#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
78#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
79#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
80#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
81#define SNAPSHOT_IOC_MAXNR 11
82
83/**
84 * The bitmap is used for tracing allocated swap pages
85 *
86 * The entire bitmap consists of a number of bitmap_page
87 * structures linked with the help of the .next member.
88 * Thus each page can be allocated individually, so we only
89 * need to make 0-order memory allocations to create
90 * the bitmap.
91 */
92
93#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
94#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
95#define BITS_PER_CHUNK (sizeof(long) * 8)
96#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
97
98struct bitmap_page {
99 unsigned long chunks[BITMAP_PAGE_CHUNKS];
100 struct bitmap_page *next;
101};
102
103extern void free_bitmap(struct bitmap_page *bitmap);
104extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
105extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
106extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
107
108extern int swsusp_check(void);
109extern int swsusp_shrink_memory(void);
53extern void swsusp_free(void); 110extern void swsusp_free(void);
54extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 111extern int swsusp_suspend(void);
55extern unsigned int snapshot_nr_pages(void); 112extern int swsusp_resume(void);
56extern struct pbe *snapshot_pblist(void); 113extern int swsusp_read(void);
57extern void snapshot_pblist_set(struct pbe *pblist); 114extern int swsusp_write(void);
115extern void swsusp_close(void);
116extern int suspend_enter(suspend_state_t state);
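The bitmap declarations above back the swap-page bookkeeping added in kernel/power/swap.c later in this diff. With 4 KiB pages and 8-byte longs and pointers, one bitmap_page covers (4096 - 8) / 8 = 511 chunks of 64 bits, i.e. 32704 swap pages. A simplified sketch of the intended calling pattern (the function name is made up, and the error handling and root_swap plumbing are reduced to a minimum; the real writer is get_swap_writer()/swap_write_page() in swap.c):

#include <linux/errno.h>
#include <linux/swap.h>
#include "power.h"		/* the declarations shown above */

static int example_reserve_image_swap(int swap, unsigned int nr_pages)
{
	struct bitmap_page *bitmap;
	unsigned long offset;
	unsigned int i;

	bitmap = alloc_bitmap(count_swap_pages(swap, 0));
	if (!bitmap)
		return -ENOMEM;
	for (i = 0; i < nr_pages; i++) {
		offset = alloc_swap_page(swap, bitmap);
		if (!offset) {
			free_all_swap_pages(swap, bitmap);	/* roll back */
			free_bitmap(bitmap);
			return -ENOSPC;
		}
		/* ... write page i of the image to this swap offset ... */
	}
	free_bitmap(bitmap);
	return 0;
}
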
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a..8ac7c35fad 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h>
15 16
16/* 17/*
17 * Timeout for stopping processes 18 * Timeout for stopping processes
18 */ 19 */
19#define TIMEOUT (6 * HZ) 20#define TIMEOUT (20 * HZ)
20 21
21 22
22static inline int freezeable(struct task_struct * p) 23static inline int freezeable(struct task_struct * p)
@@ -54,38 +55,62 @@ void refrigerator(void)
54 current->state = save; 55 current->state = save;
55} 56}
56 57
58static inline void freeze_process(struct task_struct *p)
59{
60 unsigned long flags;
61
62 if (!freezing(p)) {
63 freeze(p);
64 spin_lock_irqsave(&p->sighand->siglock, flags);
65 signal_wake_up(p, 0);
66 spin_unlock_irqrestore(&p->sighand->siglock, flags);
67 }
68}
69
57/* 0 = success, else # of processes that we failed to stop */ 70/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 71int freeze_processes(void)
59{ 72{
60 int todo; 73 int todo, nr_user, user_frozen;
61 unsigned long start_time; 74 unsigned long start_time;
62 struct task_struct *g, *p; 75 struct task_struct *g, *p;
63 unsigned long flags; 76 unsigned long flags;
64 77
65 printk( "Stopping tasks: " ); 78 printk( "Stopping tasks: " );
66 start_time = jiffies; 79 start_time = jiffies;
80 user_frozen = 0;
67 do { 81 do {
68 todo = 0; 82 nr_user = todo = 0;
69 read_lock(&tasklist_lock); 83 read_lock(&tasklist_lock);
70 do_each_thread(g, p) { 84 do_each_thread(g, p) {
71 if (!freezeable(p)) 85 if (!freezeable(p))
72 continue; 86 continue;
73 if (frozen(p)) 87 if (frozen(p))
74 continue; 88 continue;
75 89 if (p->mm && !(p->flags & PF_BORROWED_MM)) {
76 freeze(p); 90 /* The task is a user-space one.
77 spin_lock_irqsave(&p->sighand->siglock, flags); 91 * Freeze it unless there's a vfork completion
78 signal_wake_up(p, 0); 92 * pending
79 spin_unlock_irqrestore(&p->sighand->siglock, flags); 93 */
80 todo++; 94 if (!p->vfork_done)
95 freeze_process(p);
96 nr_user++;
97 } else {
98 /* Freeze only if the user space is frozen */
99 if (user_frozen)
100 freeze_process(p);
101 todo++;
102 }
81 } while_each_thread(g, p); 103 } while_each_thread(g, p);
82 read_unlock(&tasklist_lock); 104 read_unlock(&tasklist_lock);
105 todo += nr_user;
106 if (!user_frozen && !nr_user) {
107 sys_sync();
108 start_time = jiffies;
109 }
110 user_frozen = !nr_user;
83 yield(); /* Yield is okay here */ 111 yield(); /* Yield is okay here */
84 if (todo && time_after(jiffies, start_time + TIMEOUT)) { 112 if (todo && time_after(jiffies, start_time + TIMEOUT))
85 printk( "\n" );
86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
87 break; 113 break;
88 }
89 } while(todo); 114 } while(todo);
90 115
91 /* This does not unfreeze processes that are already frozen 116 /* This does not unfreeze processes that are already frozen
@@ -94,8 +119,14 @@ int freeze_processes(void)
94 * but it cleans up leftover PF_FREEZE requests. 119 * but it cleans up leftover PF_FREEZE requests.
95 */ 120 */
96 if (todo) { 121 if (todo) {
122 printk( "\n" );
123 printk(KERN_ERR " stopping tasks timed out "
124 "after %d seconds (%d tasks remaining):\n",
125 TIMEOUT / HZ, todo);
97 read_lock(&tasklist_lock); 126 read_lock(&tasklist_lock);
98 do_each_thread(g, p) 127 do_each_thread(g, p) {
128 if (freezeable(p) && !frozen(p))
129 printk(KERN_ERR " %s\n", p->comm);
99 if (freezing(p)) { 130 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm); 131 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE; 132 p->flags &= ~PF_FREEZE;
@@ -103,7 +134,7 @@ int freeze_processes(void)
103 recalc_sigpending_tsk(p); 134 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags); 135 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 } 136 }
106 while_each_thread(g, p); 137 } while_each_thread(g, p);
107 read_unlock(&tasklist_lock); 138 read_unlock(&tasklist_lock);
108 return todo; 139 return todo;
109 } 140 }
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 911fc62b82..5957312b2d 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -49,9 +49,7 @@ void enable_nonboot_cpus(void)
49 49
50 printk("Thawing cpus ...\n"); 50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) { 51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = smp_prepare_cpu(cpu); 52 error = cpu_up(cpu);
53 if (!error)
54 error = cpu_up(cpu);
55 if (!error) { 53 if (!error) {
56 printk("CPU%d is up\n", cpu); 54 printk("CPU%d is up\n", cpu);
57 continue; 55 continue;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d6..c5863d02c8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12 12
13#include <linux/version.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/suspend.h> 16#include <linux/suspend.h>
@@ -34,7 +35,9 @@
34#include "power.h" 35#include "power.h"
35 36
36struct pbe *pagedir_nosave; 37struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages; 38static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages;
40static unsigned long *buffer;
38 41
39#ifdef CONFIG_HIGHMEM 42#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void) 43unsigned int count_highmem_pages(void)
@@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
80 void *kaddr; 83 void *kaddr;
81 unsigned long pfn = zone_pfn + zone->zone_start_pfn; 84 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
82 85
83 if (!(pfn%1000)) 86 if (!(pfn%10000))
84 printk("."); 87 printk(".");
85 if (!pfn_valid(pfn)) 88 if (!pfn_valid(pfn))
86 continue; 89 continue;
@@ -119,13 +122,15 @@ int save_highmem(void)
119 struct zone *zone; 122 struct zone *zone;
120 int res = 0; 123 int res = 0;
121 124
122 pr_debug("swsusp: Saving Highmem\n"); 125 pr_debug("swsusp: Saving Highmem");
126 drain_local_pages();
123 for_each_zone (zone) { 127 for_each_zone (zone) {
124 if (is_highmem(zone)) 128 if (is_highmem(zone))
125 res = save_highmem_zone(zone); 129 res = save_highmem_zone(zone);
126 if (res) 130 if (res)
127 return res; 131 return res;
128 } 132 }
133 printk("\n");
129 return 0; 134 return 0;
130} 135}
131 136
@@ -235,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist)
235 * free_pagedir - free pages allocated with alloc_pagedir() 240 * free_pagedir - free pages allocated with alloc_pagedir()
236 */ 241 */
237 242
238void free_pagedir(struct pbe *pblist) 243static void free_pagedir(struct pbe *pblist)
239{ 244{
240 struct pbe *pbe; 245 struct pbe *pbe;
241 246
@@ -301,7 +306,7 @@ struct eaten_page {
301 306
302static struct eaten_page *eaten_pages = NULL; 307static struct eaten_page *eaten_pages = NULL;
303 308
304void release_eaten_pages(void) 309static void release_eaten_pages(void)
305{ 310{
306 struct eaten_page *p, *q; 311 struct eaten_page *p, *q;
307 312
@@ -376,7 +381,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
376 if (!nr_pages) 381 if (!nr_pages)
377 return NULL; 382 return NULL;
378 383
379 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
380 pblist = alloc_image_page(gfp_mask, safe_needed); 384 pblist = alloc_image_page(gfp_mask, safe_needed);
381 /* FIXME: rewrite this ugly loop */ 385 /* FIXME: rewrite this ugly loop */
382 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 386 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -388,7 +392,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
388 free_pagedir(pblist); 392 free_pagedir(pblist);
389 pblist = NULL; 393 pblist = NULL;
390 } else 394 } else
391 create_pbe_list(pblist, nr_pages); 395 create_pbe_list(pblist, nr_pages);
392 return pblist; 396 return pblist;
393} 397}
394 398
@@ -414,6 +418,10 @@ void swsusp_free(void)
414 } 418 }
415 } 419 }
416 } 420 }
421 nr_copy_pages = 0;
422 nr_meta_pages = 0;
423 pagedir_nosave = NULL;
424 buffer = NULL;
417} 425}
418 426
419 427
@@ -437,7 +445,7 @@ static int enough_free_mem(unsigned int nr_pages)
437 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 445 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
438} 446}
439 447
440int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 448static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
441{ 449{
442 struct pbe *p; 450 struct pbe *p;
443 451
@@ -504,7 +512,318 @@ asmlinkage int swsusp_save(void)
504 */ 512 */
505 513
506 nr_copy_pages = nr_pages; 514 nr_copy_pages = nr_pages;
515 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
507 516
508 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 517 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
509 return 0; 518 return 0;
510} 519}
520
521static void init_header(struct swsusp_info *info)
522{
523 memset(info, 0, sizeof(struct swsusp_info));
524 info->version_code = LINUX_VERSION_CODE;
525 info->num_physpages = num_physpages;
526 memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
527 info->cpus = num_online_cpus();
528 info->image_pages = nr_copy_pages;
529 info->pages = nr_copy_pages + nr_meta_pages + 1;
530 info->size = info->pages;
531 info->size <<= PAGE_SHIFT;
532}
533
534/**
535 * pack_orig_addresses - the .orig_address fields of the PBEs from the
536 * list starting at @pbe are stored in the array @buf[] (1 page)
537 */
538
539static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
540{
541 int j;
542
543 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
544 buf[j] = pbe->orig_address;
545 pbe = pbe->next;
546 }
547 if (!pbe)
548 for (; j < PAGE_SIZE / sizeof(long); j++)
549 buf[j] = 0;
550 return pbe;
551}
552
553/**
554 * snapshot_read_next - used for reading the system memory snapshot.
555 *
556 * On the first call to it @handle should point to a zeroed
557 * snapshot_handle structure. The structure gets updated and a pointer
558 * to it should be passed to this function on every subsequent call.
559 *
560 * The @count parameter should contain the number of bytes the caller
561 * wants to read from the snapshot. It must not be zero.
562 *
563 * On success the function returns a positive number. Then, the caller
564 * is allowed to read up to the returned number of bytes from the memory
565 * location computed by the data_of() macro. The number returned
566 * may be smaller than @count, but this only happens if the read would
567 * cross a page boundary otherwise.
568 *
569 * The function returns 0 to indicate the end of data stream condition,
570 * and a negative number is returned on error. In such cases the
571 * structure pointed to by @handle is not updated and should not be used
572 * any more.
573 */
574
575int snapshot_read_next(struct snapshot_handle *handle, size_t count)
576{
577 if (handle->page > nr_meta_pages + nr_copy_pages)
578 return 0;
579 if (!buffer) {
580 /* This makes the buffer be freed by swsusp_free() */
581 buffer = alloc_image_page(GFP_ATOMIC, 0);
582 if (!buffer)
583 return -ENOMEM;
584 }
585 if (!handle->offset) {
586 init_header((struct swsusp_info *)buffer);
587 handle->buffer = buffer;
588 handle->pbe = pagedir_nosave;
589 }
590 if (handle->prev < handle->page) {
591 if (handle->page <= nr_meta_pages) {
592 handle->pbe = pack_orig_addresses(buffer, handle->pbe);
593 if (!handle->pbe)
594 handle->pbe = pagedir_nosave;
595 } else {
596 handle->buffer = (void *)handle->pbe->address;
597 handle->pbe = handle->pbe->next;
598 }
599 handle->prev = handle->page;
600 }
601 handle->buf_offset = handle->page_offset;
602 if (handle->page_offset + count >= PAGE_SIZE) {
603 count = PAGE_SIZE - handle->page_offset;
604 handle->page_offset = 0;
605 handle->page++;
606 } else {
607 handle->page_offset += count;
608 }
609 handle->offset += count;
610 return count;
611}
612
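The calling convention documented above is what the new swap writer (save_image() in kernel/power/swap.c, later in this diff) relies on. A stripped-down consumer loop, with the per-chunk writer supplied by the caller, might look like this (illustrative only):

#include <linux/string.h>
#include <linux/mm.h>
#include "power.h"

static int example_dump_image(int (*write_out_page)(void *buf, size_t size))
{
	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(struct snapshot_handle));
	do {
		ret = snapshot_read_next(&handle, PAGE_SIZE);
		if (ret > 0) {
			int error = write_out_page(data_of(handle), ret);

			if (error)
				return error;
		}
	} while (ret > 0);

	return ret;	/* 0: end of data, < 0: error from snapshot_read_next() */
}
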
613/**
614 * mark_unsafe_pages - mark the pages that cannot be used for storing
615 * the image during resume, because they conflict with the pages that
616 * had been used before suspend
617 */
618
619static int mark_unsafe_pages(struct pbe *pblist)
620{
621 struct zone *zone;
622 unsigned long zone_pfn;
623 struct pbe *p;
624
625 if (!pblist) /* a sanity check */
626 return -EINVAL;
627
628 /* Clear page flags */
629 for_each_zone (zone) {
630 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
631 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
632 ClearPageNosaveFree(pfn_to_page(zone_pfn +
633 zone->zone_start_pfn));
634 }
635
636 /* Mark orig addresses */
637 for_each_pbe (p, pblist) {
638 if (virt_addr_valid(p->orig_address))
639 SetPageNosaveFree(virt_to_page(p->orig_address));
640 else
641 return -EFAULT;
642 }
643
644 return 0;
645}
646
647static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
648{
649 /* We assume both lists contain the same number of elements */
650 while (src) {
651 dst->orig_address = src->orig_address;
652 dst = dst->next;
653 src = src->next;
654 }
655}
656
657static int check_header(struct swsusp_info *info)
658{
659 char *reason = NULL;
660
661 if (info->version_code != LINUX_VERSION_CODE)
662 reason = "kernel version";
663 if (info->num_physpages != num_physpages)
664 reason = "memory size";
665 if (strcmp(info->uts.sysname,system_utsname.sysname))
666 reason = "system type";
667 if (strcmp(info->uts.release,system_utsname.release))
668 reason = "kernel release";
669 if (strcmp(info->uts.version,system_utsname.version))
670 reason = "version";
671 if (strcmp(info->uts.machine,system_utsname.machine))
672 reason = "machine";
673 if (reason) {
674 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
675 return -EPERM;
676 }
677 return 0;
678}
679
680/**
681 * load header - check the image header and copy data from it
682 */
683
684static int load_header(struct snapshot_handle *handle,
685 struct swsusp_info *info)
686{
687 int error;
688 struct pbe *pblist;
689
690 error = check_header(info);
691 if (!error) {
692 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
693 if (!pblist)
694 return -ENOMEM;
695 pagedir_nosave = pblist;
696 handle->pbe = pblist;
697 nr_copy_pages = info->image_pages;
698 nr_meta_pages = info->pages - info->image_pages - 1;
699 }
700 return error;
701}
702
703/**
704 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
705 * the PBEs in the list starting at @pbe
706 */
707
708static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
709 struct pbe *pbe)
710{
711 int j;
712
713 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
714 pbe->orig_address = buf[j];
715 pbe = pbe->next;
716 }
717 return pbe;
718}
719
720/**
721 * create_image - use metadata contained in the PBE list
722 * pointed to by pagedir_nosave to mark the pages that will
723 * be overwritten in the process of restoring the system
724 * memory state from the image and allocate memory for
725 * the image avoiding these pages
726 */
727
728static int create_image(struct snapshot_handle *handle)
729{
730 int error = 0;
731 struct pbe *p, *pblist;
732
733 p = pagedir_nosave;
734 error = mark_unsafe_pages(p);
735 if (!error) {
736 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
737 if (pblist)
738 copy_page_backup_list(pblist, p);
739 free_pagedir(p);
740 if (!pblist)
741 error = -ENOMEM;
742 }
743 if (!error)
744 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
745 if (!error) {
746 release_eaten_pages();
747 pagedir_nosave = pblist;
748 } else {
749 pagedir_nosave = NULL;
750 handle->pbe = NULL;
751 nr_copy_pages = 0;
752 nr_meta_pages = 0;
753 }
754 return error;
755}
756
757/**
758 * snapshot_write_next - used for writing the system memory snapshot.
759 *
760 * On the first call to it @handle should point to a zeroed
761 * snapshot_handle structure. The structure gets updated and a pointer
762 * to it should be passed to this function on every subsequent call.
763 *
764 * The @count parameter should contain the number of bytes the caller
765 * wants to write to the image. It must not be zero.
766 *
767 * On success the function returns a positive number. Then, the caller
768 * is allowed to write up to the returned number of bytes to the memory
769 * location computed by the data_of() macro. The number returned
770 * may be smaller than @count, but this only happens if the write would
771 * cross a page boundary otherwise.
772 *
773 * The function returns 0 to indicate the "end of file" condition,
774 * and a negative number is returned on error. In such cases the
775 * structure pointed to by @handle is not updated and should not be used
776 * any more.
777 */
778
779int snapshot_write_next(struct snapshot_handle *handle, size_t count)
780{
781 int error = 0;
782
783 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
784 return 0;
785 if (!buffer) {
786 /* This makes the buffer be freed by swsusp_free() */
787 buffer = alloc_image_page(GFP_ATOMIC, 0);
788 if (!buffer)
789 return -ENOMEM;
790 }
791 if (!handle->offset)
792 handle->buffer = buffer;
793 if (handle->prev < handle->page) {
794 if (!handle->prev) {
795 error = load_header(handle, (struct swsusp_info *)buffer);
796 if (error)
797 return error;
798 } else if (handle->prev <= nr_meta_pages) {
799 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
800 if (!handle->pbe) {
801 error = create_image(handle);
802 if (error)
803 return error;
804 handle->pbe = pagedir_nosave;
805 handle->buffer = (void *)handle->pbe->address;
806 }
807 } else {
808 handle->pbe = handle->pbe->next;
809 handle->buffer = (void *)handle->pbe->address;
810 }
811 handle->prev = handle->page;
812 }
813 handle->buf_offset = handle->page_offset;
814 if (handle->page_offset + count >= PAGE_SIZE) {
815 count = PAGE_SIZE - handle->page_offset;
816 handle->page_offset = 0;
817 handle->page++;
818 } else {
819 handle->page_offset += count;
820 }
821 handle->offset += count;
822 return count;
823}
824
825int snapshot_image_loaded(struct snapshot_handle *handle)
826{
827 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
828 handle->page <= nr_meta_pages + nr_copy_pages);
829}
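The write side mirrors the read side: feed the image back one chunk at a time, let snapshot_write_next() rebuild the page backup list, then confirm completeness with snapshot_image_loaded(). A condensed, illustrative resume-side loop (the per-chunk reader is supplied by the caller):

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/mm.h>
#include "power.h"

static int example_load_image(int (*read_in_page)(void *buf, size_t size))
{
	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(struct snapshot_handle));
	do {
		ret = snapshot_write_next(&handle, PAGE_SIZE);
		if (ret > 0) {
			int error = read_in_page(data_of(handle), ret);

			if (error)
				return error;
		}
	} while (ret > 0);

	if (ret < 0)
		return ret;
	/* Make sure the metadata and all data pages really arrived. */
	return snapshot_image_loaded(&handle) ? 0 : -ENODATA;
}
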
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 0000000000..044b8e0c10
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,545 @@
1/*
2 * linux/kernel/power/swap.c
3 *
4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition.
6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 *
10 * This file is released under the GPLv2.
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/version.h>
19#include <linux/delay.h>
20#include <linux/bitops.h>
21#include <linux/genhd.h>
22#include <linux/device.h>
23#include <linux/buffer_head.h>
24#include <linux/bio.h>
25#include <linux/swap.h>
26#include <linux/swapops.h>
27#include <linux/pm.h>
28
29#include "power.h"
30
31extern char resume_file[];
32
33#define SWSUSP_SIG "S1SUSPEND"
34
35static struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
37 swp_entry_t image;
38 char orig_sig[10];
39 char sig[10];
40} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
41
42/*
43 * Saving part...
44 */
45
46static unsigned short root_swap = 0xffff;
47
48static int mark_swapfiles(swp_entry_t start)
49{
50 int error;
51
52 rw_swap_page_sync(READ,
53 swp_entry(root_swap, 0),
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE,
61 swp_entry(root_swap, 0),
62 virt_to_page((unsigned long)
63 &swsusp_header));
64 } else {
65 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV;
67 }
68 return error;
69}
70
71/**
72 * swsusp_swap_check - check if the resume device is a swap device
73 * and get its index (if so)
74 */
75
76static int swsusp_swap_check(void) /* This is called before saving image */
77{
78 int res = swap_type_of(swsusp_resume_device);
79
80 if (res >= 0) {
81 root_swap = res;
82 return 0;
83 }
84 return res;
85}
86
87/**
88 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to.
91 */
92
93static int write_page(void *buf, unsigned long offset)
94{
95 swp_entry_t entry;
96 int error = -ENOSPC;
97
98 if (offset) {
99 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
101 }
102 return error;
103}
104
105/*
106 * The swap map is a data structure used for keeping track of each page
107 * written to a swap partition. It consists of many swap_map_page
108 * structures, each holding an array of MAP_PAGE_ENTRIES swap entries.
109 * These structures are stored on the swap and linked together with the
110 * help of the .next_swap member.
111 *
112 * The swap map is created during suspend. The swap map pages are
113 * allocated and populated one at a time, so we only need one memory
114 * page to set up the entire structure.
115 *
116 * During resume we also only need to use one swap_map_page structure
117 * at a time.
118 */
119
120#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)
121
122struct swap_map_page {
123 unsigned long entries[MAP_PAGE_ENTRIES];
124 unsigned long next_swap;
125};
126
127/**
128 * The swap_map_handle structure is used for handling swap in
129 * a file-like way
130 */
131
132struct swap_map_handle {
133 struct swap_map_page *cur;
134 unsigned long cur_swap;
135 struct bitmap_page *bitmap;
136 unsigned int k;
137};
138
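On resume the same chain is walked in the other direction: load one swap_map_page, consume its entries[] one by one, then follow .next_swap. A sketch of that walk (both helpers are caller-supplied stand-ins; a zero entry is taken to mark the unused tail of a partially filled map page, which matches the zero-filled pages produced by swap_write_page() below):

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/gfp.h>

static int example_walk_swap_map(unsigned long start,
				 int (*read_page)(unsigned long offset, void *buf),
				 int (*consume)(unsigned long entry))
{
	struct swap_map_page *map;
	unsigned long next = start;
	unsigned int i;
	int error = 0;

	map = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
	if (!map)
		return -ENOMEM;
	while (next && !error) {
		error = read_page(next, map);
		if (error)
			break;
		for (i = 0; i < MAP_PAGE_ENTRIES && map->entries[i]; i++) {
			error = consume(map->entries[i]);
			if (error)
				break;
		}
		next = map->next_swap;
	}
	free_page((unsigned long)map);
	return error;
}
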
139static void release_swap_writer(struct swap_map_handle *handle)
140{
141 if (handle->cur)
142 free_page((unsigned long)handle->cur);
143 handle->cur = NULL;
144 if (handle->bitmap)
145 free_bitmap(handle->bitmap);
146 handle->bitmap = NULL;
147}
148
149static int get_swap_writer(struct swap_map_handle *handle)
150{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
152 if (!handle->cur)
153 return -ENOMEM;
154 handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
155 if (!handle->bitmap) {
156 release_swap_writer(handle);
157 return -ENOMEM;
158 }
159 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
160 if (!handle->cur_swap) {
161 release_swap_writer(handle);
162 return -ENOSPC;
163 }
164 handle->k = 0;
165 return 0;
166}
167
168static int swap_write_page(struct swap_map_handle *handle, void *buf)
169{
170 int error;
171 unsigned long offset;
172
173 if (!handle->cur)
174 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset);
177 if (error)
178 return error;
179 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) {
181 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset)
183 return -ENOSPC;
184 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap);
186 if (error)
187 return error;
188 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset;
190 handle->k = 0;
191 }
192 return 0;
193}
194
195static int flush_swap_writer(struct swap_map_handle *handle)
196{
197 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap);
199 else
200 return -EINVAL;
201}
202
203/**
204 * save_image - save the suspend image data
205 */
206
207static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot,
209 unsigned int nr_pages)
210{
211 unsigned int m;
212 int ret;
213 int error = 0;
214
215 printk("Saving image data pages (%u pages) ... ", nr_pages);
216 m = nr_pages / 100;
217 if (!m)
218 m = 1;
219 nr_pages = 0;
220 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot));
224 if (error)
225 break;
226 if (!(nr_pages % m))
227 printk("\b\b\b\b%3d%%", nr_pages / m);
228 nr_pages++;
229 }
230 } while (ret > 0);
231 if (!error)
232 printk("\b\b\b\bdone\n");
233 return error;
234}
235
236/**
237 * enough_swap - Make sure we have enough swap to save the image.
238 *
239 * Returns TRUE or FALSE after checking the total amount of swap
240 * space available from the resume partition.
241 */
242
243static int enough_swap(unsigned int nr_pages)
244{
245 unsigned int free_swap = count_swap_pages(root_swap, 1);
246
247 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO +
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250}
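Worked example of the check above, with hypothetical values for the two constants from power.h (PBES_PER_PAGE = 512 and PAGES_FOR_IO = 1024 are assumptions for illustration only): a 100000-page image needs 100000 data pages + ceil(100000 / 512) = 196 metadata pages + 1024 pages of headroom for I/O, a total of 101220, so at least 101221 free swap pages (just under 400 MiB with 4 KiB pages) must be available.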
251
252/**
253 * swsusp_write - Write entire image and metadata.
254 *
255 * It is important _NOT_ to umount filesystems at this point. We want
256 * them synced (in case something goes wrong) but we do NOT want to mark
257 * the filesystems clean: they are not. (And it does not matter; if we resume
258 * correctly, we'll mark the system clean anyway.)
259 */
260
261int swsusp_write(void)
262{
263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot;
265 struct swsusp_info *header;
266 unsigned long start;
267 int error;
268
269 if ((error = swsusp_swap_check())) {
270 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
271 return error;
272 }
273 memset(&snapshot, 0, sizeof(struct snapshot_handle));
274 error = snapshot_read_next(&snapshot, PAGE_SIZE);
275 if (error < PAGE_SIZE)
276 return error < 0 ? error : -EFAULT;
277 header = (struct swsusp_info *)data_of(snapshot);
278 if (!enough_swap(header->pages)) {
279 printk(KERN_ERR "swsusp: Not enough free swap\n");
280 return -ENOSPC;
281 }
282 error = get_swap_writer(&handle);
283 if (!error) {
284 start = handle.cur_swap;
285 error = swap_write_page(&handle, header);
286 }
287 if (!error)
288 error = save_image(&handle, &snapshot, header->pages - 1);
289 if (!error) {
290 flush_swap_writer(&handle);
291 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n");
294 }
295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap);
297 release_swap_writer(&handle);
298 return error;
299}
300
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
304 * but it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
315 panic("I/O error reading memory image");
316 atomic_set(&io_done, 0);
317 return 0;
318}
319
320static struct block_device *resume_bdev;
321
322/**
323 * submit - submit BIO request.
324 * @rw: READ or WRITE.
325 * @page_off: physical offset of page.
326 * @page: page we're reading or writing.
327 *
328 * Straight from the textbook - allocate and initialize the bio.
329 * If we're writing, make sure the page is marked as dirty.
330 * Then submit it and wait.
331 */
332
333static int submit(int rw, pgoff_t page_off, void *page)
334{
335 int error = 0;
336 struct bio *bio;
337
338 bio = bio_alloc(GFP_ATOMIC, 1);
339 if (!bio)
340 return -ENOMEM;
341 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
342 bio->bi_bdev = resume_bdev;
343 bio->bi_end_io = end_io;
344
345 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
346 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
347 error = -EFAULT;
348 goto Done;
349 }
350
351 atomic_set(&io_done, 1);
352 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
353 while (atomic_read(&io_done))
354 yield();
355 if (rw == READ)
356 bio_set_pages_dirty(bio);
357 Done:
358 bio_put(bio);
359 return error;
360}
361
362static int bio_read_page(pgoff_t page_off, void *page)
363{
364 return submit(READ, page_off, page);
365}
366
367static int bio_write_page(pgoff_t page_off, void *page)
368{
369 return submit(WRITE, page_off, page);
370}
371
372/**
373 * The following functions allow us to read data using a swap map
374 * in a file-like way
375 */
376
377static void release_swap_reader(struct swap_map_handle *handle)
378{
379 if (handle->cur)
380 free_page((unsigned long)handle->cur);
381 handle->cur = NULL;
382}
383
384static int get_swap_reader(struct swap_map_handle *handle,
385 swp_entry_t start)
386{
387 int error;
388
389 if (!swp_offset(start))
390 return -EINVAL;
391 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
392 if (!handle->cur)
393 return -ENOMEM;
394 error = bio_read_page(swp_offset(start), handle->cur);
395 if (error) {
396 release_swap_reader(handle);
397 return error;
398 }
399 handle->k = 0;
400 return 0;
401}
402
403static int swap_read_page(struct swap_map_handle *handle, void *buf)
404{
405 unsigned long offset;
406 int error;
407
408 if (!handle->cur)
409 return -EINVAL;
410 offset = handle->cur->entries[handle->k];
411 if (!offset)
412 return -EFAULT;
413 error = bio_read_page(offset, buf);
414 if (error)
415 return error;
416 if (++handle->k >= MAP_PAGE_ENTRIES) {
417 handle->k = 0;
418 offset = handle->cur->next_swap;
419 if (!offset)
420 release_swap_reader(handle);
421 else
422 error = bio_read_page(offset, handle->cur);
423 }
424 return error;
425}
426
427/**
428 * load_image - load the image using the swap map handle
429 * @handle and the snapshot handle @snapshot
430 * (assume there are @nr_pages pages to load)
431 */
432
433static int load_image(struct swap_map_handle *handle,
434 struct snapshot_handle *snapshot,
435 unsigned int nr_pages)
436{
437 unsigned int m;
438 int ret;
439 int error = 0;
440
441 printk("Loading image data pages (%u pages) ... ", nr_pages);
442 m = nr_pages / 100;
443 if (!m)
444 m = 1;
445 nr_pages = 0;
446 do {
447 ret = snapshot_write_next(snapshot, PAGE_SIZE);
448 if (ret > 0) {
449 error = swap_read_page(handle, data_of(*snapshot));
450 if (error)
451 break;
452 if (!(nr_pages % m))
453 printk("\b\b\b\b%3d%%", nr_pages / m);
454 nr_pages++;
455 }
456 } while (ret > 0);
457 if (!error) {
458 printk("\b\b\b\bdone\n");
459 if (!snapshot_image_loaded(snapshot))
460 error = -ENODATA;
461 }
462 return error;
463}
464
465int swsusp_read(void)
466{
467 int error;
468 struct swap_map_handle handle;
469 struct snapshot_handle snapshot;
470 struct swsusp_info *header;
471
472 if (IS_ERR(resume_bdev)) {
473 pr_debug("swsusp: block device not initialised\n");
474 return PTR_ERR(resume_bdev);
475 }
476
477 memset(&snapshot, 0, sizeof(struct snapshot_handle));
478 error = snapshot_write_next(&snapshot, PAGE_SIZE);
479 if (error < PAGE_SIZE)
480 return error < 0 ? error : -EFAULT;
481 header = (struct swsusp_info *)data_of(snapshot);
482 error = get_swap_reader(&handle, swsusp_header.image);
483 if (!error)
484 error = swap_read_page(&handle, header);
485 if (!error)
486 error = load_image(&handle, &snapshot, header->pages - 1);
487 release_swap_reader(&handle);
488
489 blkdev_put(resume_bdev);
490
491 if (!error)
492 pr_debug("swsusp: Reading resume file was successful\n");
493 else
494 pr_debug("swsusp: Error %d resuming\n", error);
495 return error;
496}
497
498/**
499 * swsusp_check - Check for swsusp signature in the resume device
500 */
501
502int swsusp_check(void)
503{
504 int error;
505
506 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
507 if (!IS_ERR(resume_bdev)) {
508 set_blocksize(resume_bdev, PAGE_SIZE);
509 memset(&swsusp_header, 0, sizeof(swsusp_header));
510 if ((error = bio_read_page(0, &swsusp_header)))
511 return error;
512 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
513 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
514 /* Reset swap signature now */
515 error = bio_write_page(0, &swsusp_header);
516 } else {
517 return -EINVAL;
518 }
519 if (error)
520 blkdev_put(resume_bdev);
521 else
522 pr_debug("swsusp: Signature found, resuming\n");
523 } else {
524 error = PTR_ERR(resume_bdev);
525 }
526
527 if (error)
528		pr_debug("swsusp: Error %d checking for resume file\n", error);
529
530 return error;
531}
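For context: the S1SUSPEND signature checked above is planted by mark_swapfiles(), called from swsusp_write(); the previous swsusp.c implementation of it, visible in the removal hunk below, saves the partition's original swap signature ("SWAP-SPACE" or "SWAPSPACE2") in swsusp_header.orig_sig and writes SWSUSP_SIG plus the location of the first swap-map page in its place. swsusp_check() restores the original signature as soon as it finds the image, so an aborted resume does not leave the partition looking as if it still holds one.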
532
533/**
534 * swsusp_close - close swap device.
535 */
536
537void swsusp_close(void)
538{
539 if (IS_ERR(resume_bdev)) {
540 pr_debug("swsusp: block device not initialised\n");
541 return;
542 }
543
544 blkdev_put(resume_bdev);
545}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d9d08f72f..c4016cbbd3 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Rafael J. Wysocki <rjw@sisk.pl> 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added the swap map data structure and reworked the handling of swap 34 * Reworked the freeing of memory and the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */ 39 */
40 40
41#include <linux/module.h>
42#include <linux/mm.h> 41#include <linux/mm.h>
43#include <linux/suspend.h> 42#include <linux/suspend.h>
44#include <linux/smp_lock.h>
45#include <linux/file.h>
46#include <linux/utsname.h>
47#include <linux/version.h>
48#include <linux/delay.h>
49#include <linux/bitops.h>
50#include <linux/spinlock.h> 43#include <linux/spinlock.h>
51#include <linux/genhd.h>
52#include <linux/kernel.h> 44#include <linux/kernel.h>
53#include <linux/major.h> 45#include <linux/major.h>
54#include <linux/swap.h> 46#include <linux/swap.h>
55#include <linux/pm.h> 47#include <linux/pm.h>
56#include <linux/device.h>
57#include <linux/buffer_head.h>
58#include <linux/swapops.h> 48#include <linux/swapops.h>
59#include <linux/bootmem.h> 49#include <linux/bootmem.h>
60#include <linux/syscalls.h> 50#include <linux/syscalls.h>
61#include <linux/highmem.h> 51#include <linux/highmem.h>
62#include <linux/bio.h>
63
64#include <asm/uaccess.h>
65#include <asm/mmu_context.h>
66#include <asm/pgtable.h>
67#include <asm/tlbflush.h>
68#include <asm/io.h>
69 52
70#include "power.h" 53#include "power.h"
71 54
@@ -77,6 +60,8 @@
77 */ 60 */
78unsigned long image_size = 500 * 1024 * 1024; 61unsigned long image_size = 500 * 1024 * 1024;
79 62
63int in_suspend __nosavedata = 0;
64
80#ifdef CONFIG_HIGHMEM 65#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void); 66unsigned int count_highmem_pages(void);
82int save_highmem(void); 67int save_highmem(void);
@@ -87,471 +72,97 @@ static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; } 72static unsigned int count_highmem_pages(void) { return 0; }
88#endif 73#endif
89 74
90extern char resume_file[];
91
92#define SWSUSP_SIG "S1SUSPEND"
93
94static struct swsusp_header {
95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
96 swp_entry_t image;
97 char orig_sig[10];
98 char sig[10];
99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
100
101static struct swsusp_info swsusp_info;
102
103/*
104 * Saving part...
105 */
106
107static unsigned short root_swap = 0xffff;
108
109static int mark_swapfiles(swp_entry_t start)
110{
111 int error;
112
113 rw_swap_page_sync(READ,
114 swp_entry(root_swap, 0),
115 virt_to_page((unsigned long)&swsusp_header));
116 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
120 swsusp_header.image = start;
121 error = rw_swap_page_sync(WRITE,
122 swp_entry(root_swap, 0),
123 virt_to_page((unsigned long)
124 &swsusp_header));
125 } else {
126 pr_debug("swsusp: Partition is not swap space.\n");
127 error = -ENODEV;
128 }
129 return error;
130}
131
132/*
133 * Check whether the swap device is the specified resume
134 * device, irrespective of whether they are specified by
135 * identical names.
136 *
137 * (Thus, device inode aliasing is allowed. You can say /dev/hda4
138 * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
139 * and they'll be considered the same device. This is *necessary* for
140 * devfs, since the resume code can only recognize the form /dev/hda4,
141 * but the suspend code would see the long name.)
142 */
143static inline int is_resume_device(const struct swap_info_struct *swap_info)
144{
145 struct file *file = swap_info->swap_file;
146 struct inode *inode = file->f_dentry->d_inode;
147
148 return S_ISBLK(inode->i_mode) &&
149 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
150}
151
152static int swsusp_swap_check(void) /* This is called before saving image */
153{
154 int i;
155
156 spin_lock(&swap_lock);
157 for (i = 0; i < MAX_SWAPFILES; i++) {
158 if (!(swap_info[i].flags & SWP_WRITEOK))
159 continue;
160 if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
161 spin_unlock(&swap_lock);
162 root_swap = i;
163 return 0;
164 }
165 }
166 spin_unlock(&swap_lock);
167 return -ENODEV;
168}
169
170/**
171 * write_page - Write one page to a fresh swap location.
172 * @addr: Address we're writing.
173 * @loc: Place to store the entry we used.
174 *
175 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
176 * errors. That is an artifact left over from swsusp. It did not
177 * check the return of rw_swap_page_sync() at all, since most pages
178 * written back to swap would return -EIO.
179 * This is a partial improvement, since we will at least return other
180 * errors, though we need to eventually fix the damn code.
181 */
182static int write_page(unsigned long addr, swp_entry_t *loc)
183{
184 swp_entry_t entry;
185 int error = -ENOSPC;
186
187 entry = get_swap_page_of_type(root_swap);
188 if (swp_offset(entry)) {
189 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
190 if (!error || error == -EIO)
191 *loc = entry;
192 }
193 return error;
194}
195
196/** 75/**
197 * Swap map-handling functions 76 * The following functions are used for tracing the allocated
198 * 77 * swap pages, so that they can be freed in case of an error.
199 * The swap map is a data structure used for keeping track of each page
200 * written to the swap. It consists of many swap_map_page structures
201 * that contain each an array of MAP_PAGE_SIZE swap entries.
202 * These structures are linked together with the help of either the
203 * .next (in memory) or the .next_swap (in swap) member.
204 * 78 *
205 * The swap map is created during suspend. At that time we need to keep 79 * The functions operate on a linked bitmap structure defined
206 * it in memory, because we have to free all of the allocated swap 80 * in power.h
207 * entries if an error occurs. The memory needed is preallocated
208 * so that we know in advance if there's enough of it.
209 *
210 * The first swap_map_page structure is filled with the swap entries that
211 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
212 * so on. After the all of the data pages have been written, the order
213 * of the swap_map_page structures in the map is reversed so that they
214 * can be read from swap in the original order. This causes the data
215 * pages to be loaded in exactly the same order in which they have been
216 * saved.
217 *
218 * During resume we only need to use one swap_map_page structure
219 * at a time, which means that we only need to use two memory pages for
220 * reading the image - one for reading the swap_map_page structures
221 * and the second for reading the data pages from swap.
222 */ 81 */
223 82
224#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ 83void free_bitmap(struct bitmap_page *bitmap)
225 / sizeof(swp_entry_t))
226
227struct swap_map_page {
228 swp_entry_t entries[MAP_PAGE_SIZE];
229 swp_entry_t next_swap;
230 struct swap_map_page *next;
231};
232
233static inline void free_swap_map(struct swap_map_page *swap_map)
234{ 84{
235 struct swap_map_page *swp; 85 struct bitmap_page *bp;
236 86
237 while (swap_map) { 87 while (bitmap) {
238 swp = swap_map->next; 88 bp = bitmap->next;
239 free_page((unsigned long)swap_map); 89 free_page((unsigned long)bitmap);
240 swap_map = swp; 90 bitmap = bp;
241 } 91 }
242} 92}
243 93
244static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) 94struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
245{ 95{
246 struct swap_map_page *swap_map, *swp; 96 struct bitmap_page *bitmap, *bp;
247 unsigned n = 0; 97 unsigned int n;
248 98
249 if (!nr_pages) 99 if (!nr_bits)
250 return NULL; 100 return NULL;
251 101
252 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); 102 bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
253 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 103 bp = bitmap;
254 swp = swap_map; 104 for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
255 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { 105 bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
256 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 106 bp = bp->next;
257 swp = swp->next; 107 if (!bp) {
258 if (!swp) { 108 free_bitmap(bitmap);
259 free_swap_map(swap_map);
260 return NULL; 109 return NULL;
261 } 110 }
262 } 111 }
263 return swap_map; 112 return bitmap;
264} 113}
265 114
266/** 115static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
267 * reverse_swap_map - reverse the order of pages in the swap map
268 * @swap_map
269 */
270
271static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
272{
273 struct swap_map_page *prev, *next;
274
275 prev = NULL;
276 while (swap_map) {
277 next = swap_map->next;
278 swap_map->next = prev;
279 prev = swap_map;
280 swap_map = next;
281 }
282 return prev;
283}
284
285/**
286 * free_swap_map_entries - free the swap entries allocated to store
287 * the swap map @swap_map (this is only called in case of an error)
288 */
289static inline void free_swap_map_entries(struct swap_map_page *swap_map)
290{
291 while (swap_map) {
292 if (swap_map->next_swap.val)
293 swap_free(swap_map->next_swap);
294 swap_map = swap_map->next;
295 }
296}
297
298/**
299 * save_swap_map - save the swap map used for tracing the data pages
300 * stored in the swap
301 */
302
303static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
304{
305 swp_entry_t entry = (swp_entry_t){0};
306 int error;
307
308 while (swap_map) {
309 swap_map->next_swap = entry;
310 if ((error = write_page((unsigned long)swap_map, &entry)))
311 return error;
312 swap_map = swap_map->next;
313 }
314 *start = entry;
315 return 0;
316}
317
318/**
319 * free_image_entries - free the swap entries allocated to store
320 * the image data pages (this is only called in case of an error)
321 */
322
323static inline void free_image_entries(struct swap_map_page *swp)
324{ 116{
325 unsigned k; 117 unsigned int n;
326 118
327 while (swp) { 119 n = BITMAP_PAGE_BITS;
328 for (k = 0; k < MAP_PAGE_SIZE; k++) 120 while (bitmap && n <= bit) {
329 if (swp->entries[k].val) 121 n += BITMAP_PAGE_BITS;
330 swap_free(swp->entries[k]); 122 bitmap = bitmap->next;
331 swp = swp->next;
332 } 123 }
333} 124 if (!bitmap)
334 125 return -EINVAL;
335/** 126 n -= BITMAP_PAGE_BITS;
336 * The swap_map_handle structure is used for handling the swap map in 127 bit -= n;
337 * a file-alike way 128 n = 0;
338 */ 129 while (bit >= BITS_PER_CHUNK) {
339 130 bit -= BITS_PER_CHUNK;
340struct swap_map_handle { 131 n++;
341 struct swap_map_page *cur;
342 unsigned int k;
343};
344
345static inline void init_swap_map_handle(struct swap_map_handle *handle,
346 struct swap_map_page *map)
347{
348 handle->cur = map;
349 handle->k = 0;
350}
351
352static inline int swap_map_write_page(struct swap_map_handle *handle,
353 unsigned long addr)
354{
355 int error;
356
357 error = write_page(addr, handle->cur->entries + handle->k);
358 if (error)
359 return error;
360 if (++handle->k >= MAP_PAGE_SIZE) {
361 handle->cur = handle->cur->next;
362 handle->k = 0;
363 } 132 }
133 bitmap->chunks[n] |= (1UL << bit);
364 return 0; 134 return 0;
365} 135}
366 136
367/** 137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
368 * save_image_data - save the data pages pointed to by the PBEs
369 * from the list @pblist using the swap map handle @handle
370 * (assume there are @nr_pages data pages to save)
371 */
372
373static int save_image_data(struct pbe *pblist,
374 struct swap_map_handle *handle,
375 unsigned int nr_pages)
376{
377 unsigned int m;
378 struct pbe *p;
379 int error = 0;
380
381 printk("Saving image data pages (%u pages) ... ", nr_pages);
382 m = nr_pages / 100;
383 if (!m)
384 m = 1;
385 nr_pages = 0;
386 for_each_pbe (p, pblist) {
387 error = swap_map_write_page(handle, p->address);
388 if (error)
389 break;
390 if (!(nr_pages % m))
391 printk("\b\b\b\b%3d%%", nr_pages / m);
392 nr_pages++;
393 }
394 if (!error)
395 printk("\b\b\b\bdone\n");
396 return error;
397}
398
399static void dump_info(void)
400{
401 pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
402 pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
403 pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
404 pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
405 pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
406 pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
407 pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
408 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
409 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
410 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
411 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
412}
413
414static void init_header(unsigned int nr_pages)
415{
416 memset(&swsusp_info, 0, sizeof(swsusp_info));
417 swsusp_info.version_code = LINUX_VERSION_CODE;
418 swsusp_info.num_physpages = num_physpages;
419 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
420
421 swsusp_info.cpus = num_online_cpus();
422 swsusp_info.image_pages = nr_pages;
423 swsusp_info.pages = nr_pages +
424 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
425}
426
427/**
428 * pack_orig_addresses - the .orig_address fields of the PBEs from the
429 * list starting at @pbe are stored in the array @buf[] (1 page)
430 */
431
432static inline struct pbe *pack_orig_addresses(unsigned long *buf,
433 struct pbe *pbe)
434{
435 int j;
436
437 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
438 buf[j] = pbe->orig_address;
439 pbe = pbe->next;
440 }
441 if (!pbe)
442 for (; j < PAGE_SIZE / sizeof(long); j++)
443 buf[j] = 0;
444 return pbe;
445}
446
447/**
448 * save_image_metadata - save the .orig_address fields of the PBEs
449 * from the list @pblist using the swap map handle @handle
450 */
451
452static int save_image_metadata(struct pbe *pblist,
453 struct swap_map_handle *handle)
454{ 138{
455 unsigned long *buf; 139 unsigned long offset;
456 unsigned int n = 0;
457 struct pbe *p;
458 int error = 0;
459 140
460 printk("Saving image metadata ... "); 141 offset = swp_offset(get_swap_page_of_type(swap));
461 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); 142 if (offset) {
462 if (!buf) 143 if (bitmap_set(bitmap, offset)) {
463 return -ENOMEM; 144 swap_free(swp_entry(swap, offset));
464 p = pblist; 145 offset = 0;
465 while (p) { 146 }
466 p = pack_orig_addresses(buf, p);
467 error = swap_map_write_page(handle, (unsigned long)buf);
468 if (error)
469 break;
470 n++;
471 } 147 }
472 free_page((unsigned long)buf); 148 return offset;
473 if (!error)
474 printk("done (%u pages saved)\n", n);
475 return error;
476} 149}
477 150
478/** 151void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
479 * enough_swap - Make sure we have enough swap to save the image.
480 *
481 * Returns TRUE or FALSE after checking the total amount of swap
482 * space avaiable from the resume partition.
483 */
484
485static int enough_swap(unsigned int nr_pages)
486{ 152{
487 unsigned int free_swap = swap_info[root_swap].pages - 153 unsigned int bit, n;
488 swap_info[root_swap].inuse_pages; 154 unsigned long test;
489
490 pr_debug("swsusp: free swap pages: %u\n", free_swap);
491 return free_swap > (nr_pages + PAGES_FOR_IO +
492 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
493}
494 155
495/** 156 bit = 0;
496 * swsusp_write - Write entire image and metadata. 157 while (bitmap) {
497 * 158 for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
498 * It is important _NOT_ to umount filesystems at this point. We want 159 for (test = 1UL; test; test <<= 1) {
499 * them synced (in case something goes wrong) but we DO not want to mark 160 if (bitmap->chunks[n] & test)
500 * filesystem clean: it is not. (And it does not matter, if we resume 161 swap_free(swp_entry(swap, bit));
501 * correctly, we'll mark system clean, anyway.) 162 bit++;
502 */ 163 }
503 164 bitmap = bitmap->next;
504int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
505{
506 struct swap_map_page *swap_map;
507 struct swap_map_handle handle;
508 swp_entry_t start;
509 int error;
510
511 if ((error = swsusp_swap_check())) {
512 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
513 return error;
514 }
515 if (!enough_swap(nr_pages)) {
516 printk(KERN_ERR "swsusp: Not enough free swap\n");
517 return -ENOSPC;
518 } 165 }
519
520 init_header(nr_pages);
521 swap_map = alloc_swap_map(swsusp_info.pages);
522 if (!swap_map)
523 return -ENOMEM;
524 init_swap_map_handle(&handle, swap_map);
525
526 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
527 if (!error)
528 error = save_image_metadata(pblist, &handle);
529 if (!error)
530 error = save_image_data(pblist, &handle, nr_pages);
531 if (error)
532 goto Free_image_entries;
533
534 swap_map = reverse_swap_map(swap_map);
535 error = save_swap_map(swap_map, &start);
536 if (error)
537 goto Free_map_entries;
538
539 dump_info();
540 printk( "S" );
541 error = mark_swapfiles(start);
542 printk( "|\n" );
543 if (error)
544 goto Free_map_entries;
545
546Free_swap_map:
547 free_swap_map(swap_map);
548 return error;
549
550Free_map_entries:
551 free_swap_map_entries(swap_map);
552Free_image_entries:
553 free_image_entries(swap_map);
554 goto Free_swap_map;
555} 166}
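The bitmap_set() arithmetic above decomposes a swap offset into a bitmap page, a chunk within that page and a bit within the chunk. A standalone sketch of the same decomposition follows, with made-up constants (BITS_PER_CHUNK and BITMAP_PAGE_CHUNKS really come from power.h and are not shown in this patch):

/* Illustration only; the constants below are assumptions. */
#include <stdio.h>

#define BITS_PER_CHUNK     (8 * sizeof(unsigned long))
#define BITMAP_PAGE_CHUNKS 500UL                 /* made-up value */
#define BITMAP_PAGE_BITS   (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)

int main(void)
{
	unsigned long offset = 70000;    /* as returned by alloc_swap_page() */
	unsigned long page   = offset / BITMAP_PAGE_BITS;
	unsigned long rem    = offset % BITMAP_PAGE_BITS;
	unsigned long chunk  = rem / BITS_PER_CHUNK;
	unsigned long bit    = rem % BITS_PER_CHUNK;

	/* bitmap_set() reaches the same (page, chunk, bit) triple by walking
	 * the linked bitmap_page list instead of dividing */
	printf("bitmap page %lu, chunk %lu, bit %lu\n", page, chunk, bit);
	return 0;
}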
556 167
557/** 168/**
@@ -660,379 +271,3 @@ int swsusp_resume(void)
660 local_irq_enable(); 271 local_irq_enable();
661 return error; 272 return error;
662} 273}
663
664/**
665 * mark_unsafe_pages - mark the pages that cannot be used for storing
666 * the image during resume, because they conflict with the pages that
667 * had been used before suspend
668 */
669
670static void mark_unsafe_pages(struct pbe *pblist)
671{
672 struct zone *zone;
673 unsigned long zone_pfn;
674 struct pbe *p;
675
676 if (!pblist) /* a sanity check */
677 return;
678
679 /* Clear page flags */
680 for_each_zone (zone) {
681 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
682 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
683 ClearPageNosaveFree(pfn_to_page(zone_pfn +
684 zone->zone_start_pfn));
685 }
686
687 /* Mark orig addresses */
688 for_each_pbe (p, pblist)
689 SetPageNosaveFree(virt_to_page(p->orig_address));
690
691}
692
693static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
694{
695 /* We assume both lists contain the same number of elements */
696 while (src) {
697 dst->orig_address = src->orig_address;
698 dst = dst->next;
699 src = src->next;
700 }
701}
702
703/*
704 * Using bio to read from swap.
705 * This code requires a bit more work than just using buffer heads
706 * but, it is the recommended way for 2.5/2.6.
707 * The following are to signal the beginning and end of I/O. Bios
708 * finish asynchronously, while we want them to happen synchronously.
709 * A simple atomic_t, and a wait loop take care of this problem.
710 */
711
712static atomic_t io_done = ATOMIC_INIT(0);
713
714static int end_io(struct bio *bio, unsigned int num, int err)
715{
716 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
717 panic("I/O error reading memory image");
718 atomic_set(&io_done, 0);
719 return 0;
720}
721
722static struct block_device *resume_bdev;
723
724/**
725 * submit - submit BIO request.
726 * @rw: READ or WRITE.
727 * @off physical offset of page.
728 * @page: page we're reading or writing.
729 *
730 * Straight from the textbook - allocate and initialize the bio.
731 * If we're writing, make sure the page is marked as dirty.
732 * Then submit it and wait.
733 */
734
735static int submit(int rw, pgoff_t page_off, void *page)
736{
737 int error = 0;
738 struct bio *bio;
739
740 bio = bio_alloc(GFP_ATOMIC, 1);
741 if (!bio)
742 return -ENOMEM;
743 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
744 bio->bi_bdev = resume_bdev;
745 bio->bi_end_io = end_io;
746
747 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
748 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
749 error = -EFAULT;
750 goto Done;
751 }
752
753
754 atomic_set(&io_done, 1);
755 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
756 while (atomic_read(&io_done))
757 yield();
758 if (rw == READ)
759 bio_set_pages_dirty(bio);
760 Done:
761 bio_put(bio);
762 return error;
763}
764
765static int bio_read_page(pgoff_t page_off, void *page)
766{
767 return submit(READ, page_off, page);
768}
769
770static int bio_write_page(pgoff_t page_off, void *page)
771{
772 return submit(WRITE, page_off, page);
773}
774
775/**
776 * The following functions allow us to read data using a swap map
777 * in a file-alike way
778 */
779
780static inline void release_swap_map_reader(struct swap_map_handle *handle)
781{
782 if (handle->cur)
783 free_page((unsigned long)handle->cur);
784 handle->cur = NULL;
785}
786
787static inline int get_swap_map_reader(struct swap_map_handle *handle,
788 swp_entry_t start)
789{
790 int error;
791
792 if (!swp_offset(start))
793 return -EINVAL;
794 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
795 if (!handle->cur)
796 return -ENOMEM;
797 error = bio_read_page(swp_offset(start), handle->cur);
798 if (error) {
799 release_swap_map_reader(handle);
800 return error;
801 }
802 handle->k = 0;
803 return 0;
804}
805
806static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
807{
808 unsigned long offset;
809 int error;
810
811 if (!handle->cur)
812 return -EINVAL;
813 offset = swp_offset(handle->cur->entries[handle->k]);
814 if (!offset)
815 return -EINVAL;
816 error = bio_read_page(offset, buf);
817 if (error)
818 return error;
819 if (++handle->k >= MAP_PAGE_SIZE) {
820 handle->k = 0;
821 offset = swp_offset(handle->cur->next_swap);
822 if (!offset)
823 release_swap_map_reader(handle);
824 else
825 error = bio_read_page(offset, handle->cur);
826 }
827 return error;
828}
829
830static int check_header(void)
831{
832 char *reason = NULL;
833
834 dump_info();
835 if (swsusp_info.version_code != LINUX_VERSION_CODE)
836 reason = "kernel version";
837 if (swsusp_info.num_physpages != num_physpages)
838 reason = "memory size";
839 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
840 reason = "system type";
841 if (strcmp(swsusp_info.uts.release,system_utsname.release))
842 reason = "kernel release";
843 if (strcmp(swsusp_info.uts.version,system_utsname.version))
844 reason = "version";
845 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
846 reason = "machine";
847 if (reason) {
848 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
849 return -EPERM;
850 }
851 return 0;
852}
853
854/**
855 * load_image_data - load the image data using the swap map handle
856 * @handle and store them using the page backup list @pblist
857 * (assume there are @nr_pages pages to load)
858 */
859
860static int load_image_data(struct pbe *pblist,
861 struct swap_map_handle *handle,
862 unsigned int nr_pages)
863{
864 int error;
865 unsigned int m;
866 struct pbe *p;
867
868 if (!pblist)
869 return -EINVAL;
870 printk("Loading image data pages (%u pages) ... ", nr_pages);
871 m = nr_pages / 100;
872 if (!m)
873 m = 1;
874 nr_pages = 0;
875 p = pblist;
876 while (p) {
877 error = swap_map_read_page(handle, (void *)p->address);
878 if (error)
879 break;
880 p = p->next;
881 if (!(nr_pages % m))
882 printk("\b\b\b\b%3d%%", nr_pages / m);
883 nr_pages++;
884 }
885 if (!error)
886 printk("\b\b\b\bdone\n");
887 return error;
888}
889
890/**
891 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
892 * the PBEs in the list starting at @pbe
893 */
894
895static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
896 struct pbe *pbe)
897{
898 int j;
899
900 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
901 pbe->orig_address = buf[j];
902 pbe = pbe->next;
903 }
904 return pbe;
905}
906
907/**
908 * load_image_metadata - load the image metadata using the swap map
909 * handle @handle and put them into the PBEs in the list @pblist
910 */
911
912static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
913{
914 struct pbe *p;
915 unsigned long *buf;
916 unsigned int n = 0;
917 int error = 0;
918
919 printk("Loading image metadata ... ");
920 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
921 if (!buf)
922 return -ENOMEM;
923 p = pblist;
924 while (p) {
925 error = swap_map_read_page(handle, buf);
926 if (error)
927 break;
928 p = unpack_orig_addresses(buf, p);
929 n++;
930 }
931 free_page((unsigned long)buf);
932 if (!error)
933 printk("done (%u pages loaded)\n", n);
934 return error;
935}
936
937int swsusp_read(struct pbe **pblist_ptr)
938{
939 int error;
940 struct pbe *p, *pblist;
941 struct swap_map_handle handle;
942 unsigned int nr_pages;
943
944 if (IS_ERR(resume_bdev)) {
945 pr_debug("swsusp: block device not initialised\n");
946 return PTR_ERR(resume_bdev);
947 }
948
949 error = get_swap_map_reader(&handle, swsusp_header.image);
950 if (!error)
951 error = swap_map_read_page(&handle, &swsusp_info);
952 if (!error)
953 error = check_header();
954 if (error)
955 return error;
956 nr_pages = swsusp_info.image_pages;
957 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
958 if (!p)
959 return -ENOMEM;
960 error = load_image_metadata(p, &handle);
961 if (!error) {
962 mark_unsafe_pages(p);
963 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
964 if (pblist)
965 copy_page_backup_list(pblist, p);
966 free_pagedir(p);
967 if (!pblist)
968 error = -ENOMEM;
969
970 /* Allocate memory for the image and read the data from swap */
971 if (!error)
972 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
973 if (!error) {
974 release_eaten_pages();
975 error = load_image_data(pblist, &handle, nr_pages);
976 }
977 if (!error)
978 *pblist_ptr = pblist;
979 }
980 release_swap_map_reader(&handle);
981
982 blkdev_put(resume_bdev);
983
984 if (!error)
985 pr_debug("swsusp: Reading resume file was successful\n");
986 else
987 pr_debug("swsusp: Error %d resuming\n", error);
988 return error;
989}
990
991/**
992 * swsusp_check - Check for swsusp signature in the resume device
993 */
994
995int swsusp_check(void)
996{
997 int error;
998
999 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1000 if (!IS_ERR(resume_bdev)) {
1001 set_blocksize(resume_bdev, PAGE_SIZE);
1002 memset(&swsusp_header, 0, sizeof(swsusp_header));
1003 if ((error = bio_read_page(0, &swsusp_header)))
1004 return error;
1005 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1006 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1007 /* Reset swap signature now */
1008 error = bio_write_page(0, &swsusp_header);
1009 } else {
1010 return -EINVAL;
1011 }
1012 if (error)
1013 blkdev_put(resume_bdev);
1014 else
1015 pr_debug("swsusp: Signature found, resuming\n");
1016 } else {
1017 error = PTR_ERR(resume_bdev);
1018 }
1019
1020 if (error)
1021 pr_debug("swsusp: Error %d check for resume file\n", error);
1022
1023 return error;
1024}
1025
1026/**
1027 * swsusp_close - close swap device.
1028 */
1029
1030void swsusp_close(void)
1031{
1032 if (IS_ERR(resume_bdev)) {
1033 pr_debug("swsusp: block device not initialised\n");
1034 return;
1035 }
1036
1037 blkdev_put(resume_bdev);
1038}
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 0000000000..3f1539fbe4
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,333 @@
1/*
2 * linux/kernel/power/user.c
3 *
4 * This file provides the user space interface for software suspend/resume.
5 *
6 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This file is released under the GPLv2.
9 *
10 */
11
12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/string.h>
15#include <linux/device.h>
16#include <linux/miscdevice.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/swapops.h>
20#include <linux/pm.h>
21#include <linux/fs.h>
22
23#include <asm/uaccess.h>
24
25#include "power.h"
26
27#define SNAPSHOT_MINOR 231
28
29static struct snapshot_data {
30 struct snapshot_handle handle;
31 int swap;
32 struct bitmap_page *bitmap;
33 int mode;
34 char frozen;
35 char ready;
36} snapshot_state;
37
38static atomic_t device_available = ATOMIC_INIT(1);
39
40static int snapshot_open(struct inode *inode, struct file *filp)
41{
42 struct snapshot_data *data;
43
44 if (!atomic_add_unless(&device_available, -1, 0))
45 return -EBUSY;
46
47 if ((filp->f_flags & O_ACCMODE) == O_RDWR)
48 return -ENOSYS;
49
50 nonseekable_open(inode, filp);
51 data = &snapshot_state;
52 filp->private_data = data;
53 memset(&data->handle, 0, sizeof(struct snapshot_handle));
54 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
55 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
56 data->mode = O_RDONLY;
57 } else {
58 data->swap = -1;
59 data->mode = O_WRONLY;
60 }
61 data->bitmap = NULL;
62 data->frozen = 0;
63 data->ready = 0;
64
65 return 0;
66}
67
68static int snapshot_release(struct inode *inode, struct file *filp)
69{
70 struct snapshot_data *data;
71
72 swsusp_free();
73 data = filp->private_data;
74 free_all_swap_pages(data->swap, data->bitmap);
75 free_bitmap(data->bitmap);
76 if (data->frozen) {
77 down(&pm_sem);
78 thaw_processes();
79 enable_nonboot_cpus();
80 up(&pm_sem);
81 }
82 atomic_inc(&device_available);
83 return 0;
84}
85
86static ssize_t snapshot_read(struct file *filp, char __user *buf,
87 size_t count, loff_t *offp)
88{
89 struct snapshot_data *data;
90 ssize_t res;
91
92 data = filp->private_data;
93 res = snapshot_read_next(&data->handle, count);
94 if (res > 0) {
95 if (copy_to_user(buf, data_of(data->handle), res))
96 res = -EFAULT;
97 else
98 *offp = data->handle.offset;
99 }
100 return res;
101}
102
103static ssize_t snapshot_write(struct file *filp, const char __user *buf,
104 size_t count, loff_t *offp)
105{
106 struct snapshot_data *data;
107 ssize_t res;
108
109 data = filp->private_data;
110 res = snapshot_write_next(&data->handle, count);
111 if (res > 0) {
112 if (copy_from_user(data_of(data->handle), buf, res))
113 res = -EFAULT;
114 else
115 *offp = data->handle.offset;
116 }
117 return res;
118}
119
120static int snapshot_ioctl(struct inode *inode, struct file *filp,
121 unsigned int cmd, unsigned long arg)
122{
123 int error = 0;
124 struct snapshot_data *data;
125 loff_t offset, avail;
126
127 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
128 return -ENOTTY;
129 if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
130 return -ENOTTY;
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133
134 data = filp->private_data;
135
136 switch (cmd) {
137
138 case SNAPSHOT_FREEZE:
139 if (data->frozen)
140 break;
141 down(&pm_sem);
142 disable_nonboot_cpus();
143 if (freeze_processes()) {
144 thaw_processes();
145 enable_nonboot_cpus();
146 error = -EBUSY;
147 }
148 up(&pm_sem);
149 if (!error)
150 data->frozen = 1;
151 break;
152
153 case SNAPSHOT_UNFREEZE:
154 if (!data->frozen)
155 break;
156 down(&pm_sem);
157 thaw_processes();
158 enable_nonboot_cpus();
159 up(&pm_sem);
160 data->frozen = 0;
161 break;
162
163 case SNAPSHOT_ATOMIC_SNAPSHOT:
164 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
165 error = -EPERM;
166 break;
167 }
168 down(&pm_sem);
169 /* Free memory before shutting down devices. */
170 error = swsusp_shrink_memory();
171 if (!error) {
172 error = device_suspend(PMSG_FREEZE);
173 if (!error) {
174 in_suspend = 1;
175 error = swsusp_suspend();
176 device_resume();
177 }
178 }
179 up(&pm_sem);
180 if (!error)
181 error = put_user(in_suspend, (unsigned int __user *)arg);
182 if (!error)
183 data->ready = 1;
184 break;
185
186 case SNAPSHOT_ATOMIC_RESTORE:
187 if (data->mode != O_WRONLY || !data->frozen ||
188 !snapshot_image_loaded(&data->handle)) {
189 error = -EPERM;
190 break;
191 }
192 down(&pm_sem);
193 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE);
195 if (!error) {
196 error = swsusp_resume();
197 device_resume();
198 }
199 pm_restore_console();
200 up(&pm_sem);
201 break;
202
203 case SNAPSHOT_FREE:
204 swsusp_free();
205 memset(&data->handle, 0, sizeof(struct snapshot_handle));
206 data->ready = 0;
207 break;
208
209 case SNAPSHOT_SET_IMAGE_SIZE:
210 image_size = arg;
211 break;
212
213 case SNAPSHOT_AVAIL_SWAP:
214 avail = count_swap_pages(data->swap, 1);
215 avail <<= PAGE_SHIFT;
216 error = put_user(avail, (loff_t __user *)arg);
217 break;
218
219 case SNAPSHOT_GET_SWAP_PAGE:
220 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
221 error = -ENODEV;
222 break;
223 }
224 if (!data->bitmap) {
225 data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
226 if (!data->bitmap) {
227 error = -ENOMEM;
228 break;
229 }
230 }
231 offset = alloc_swap_page(data->swap, data->bitmap);
232 if (offset) {
233 offset <<= PAGE_SHIFT;
234 error = put_user(offset, (loff_t __user *)arg);
235 } else {
236 error = -ENOSPC;
237 }
238 break;
239
240 case SNAPSHOT_FREE_SWAP_PAGES:
241 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
242 error = -ENODEV;
243 break;
244 }
245 free_all_swap_pages(data->swap, data->bitmap);
246 free_bitmap(data->bitmap);
247 data->bitmap = NULL;
248 break;
249
250 case SNAPSHOT_SET_SWAP_FILE:
251 if (!data->bitmap) {
252 /*
253 * User space encodes device types as two-byte values,
254 * so we need to recode them
255 */
256 if (old_decode_dev(arg)) {
257 data->swap = swap_type_of(old_decode_dev(arg));
258 if (data->swap < 0)
259 error = -ENODEV;
260 } else {
261 data->swap = -1;
262 error = -EINVAL;
263 }
264 } else {
265 error = -EPERM;
266 }
267 break;
268
269 case SNAPSHOT_S2RAM:
270 if (!data->frozen) {
271 error = -EPERM;
272 break;
273 }
274
275 if (down_trylock(&pm_sem)) {
276 error = -EBUSY;
277 break;
278 }
279
280 if (pm_ops->prepare) {
281 error = pm_ops->prepare(PM_SUSPEND_MEM);
282 if (error)
283 goto OutS3;
284 }
285
286 /* Put devices to sleep */
287 error = device_suspend(PMSG_SUSPEND);
288 if (error) {
289 printk(KERN_ERR "Failed to suspend some devices.\n");
290 } else {
291 /* Enter S3, system is already frozen */
292 suspend_enter(PM_SUSPEND_MEM);
293
294 /* Wake up devices */
295 device_resume();
296 }
297
298 if (pm_ops->finish)
299 pm_ops->finish(PM_SUSPEND_MEM);
300
301OutS3:
302 up(&pm_sem);
303 break;
304
305 default:
306 error = -ENOTTY;
307
308 }
309
310 return error;
311}
312
313static struct file_operations snapshot_fops = {
314 .open = snapshot_open,
315 .release = snapshot_release,
316 .read = snapshot_read,
317 .write = snapshot_write,
318 .llseek = no_llseek,
319 .ioctl = snapshot_ioctl,
320};
321
322static struct miscdevice snapshot_device = {
323 .minor = SNAPSHOT_MINOR,
324 .name = "snapshot",
325 .fops = &snapshot_fops,
326};
327
328static int __init snapshot_device_init(void)
329{
330 return misc_register(&snapshot_device);
331};
332
333device_initcall(snapshot_device_init);
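A minimal user-space sketch of the suspend side of this interface. It assumes the SNAPSHOT_* ioctl numbers have been made available to user space (this patch defines them in kernel headers only, so a real tool would copy or include those definitions; the <linux/suspend.h> include below is an assumption), that /dev/snapshot exists with the minor registered above, and that the caller has CAP_SYS_ADMIN. Error handling and the actual storing of the image are left out.

/* Sketch only -- not part of the patch. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/suspend.h>      /* assumed source of the SNAPSHOT_* ioctls */

int main(void)
{
	char page[4096];
	unsigned int in_suspend = 0;
	ssize_t n;
	int fd = open("/dev/snapshot", O_RDONLY);  /* O_RDONLY = suspend mode */

	if (fd < 0)
		return 1;
	ioctl(fd, SNAPSHOT_FREEZE, 0);                     /* freeze user space */
	ioctl(fd, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend);  /* create the image */
	if (in_suspend) {
		/* we are the instance that should save the image */
		while ((n = read(fd, page, sizeof(page))) > 0) {
			/* hand each page to whatever stores the image ... */
		}
	}
	ioctl(fd, SNAPSHOT_UNFREEZE, 0);
	close(fd);
	return 0;
}

After saving the image a real tool would power the machine down; the restore side opens the device O_WRONLY, writes the image pages back and issues SNAPSHOT_ATOMIC_RESTORE, matching the mode checks in snapshot_ioctl() above.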
diff --git a/kernel/printk.c b/kernel/printk.c
index 13ced0f782..8cc19431e7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -122,44 +122,6 @@ static char *log_buf = __log_buf;
122static int log_buf_len = __LOG_BUF_LEN; 122static int log_buf_len = __LOG_BUF_LEN;
123static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ 123static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
124 124
125/*
126 * Setup a list of consoles. Called from init/main.c
127 */
128static int __init console_setup(char *str)
129{
130 char name[sizeof(console_cmdline[0].name)];
131 char *s, *options;
132 int idx;
133
134 /*
135 * Decode str into name, index, options.
136 */
137 if (str[0] >= '0' && str[0] <= '9') {
138 strcpy(name, "ttyS");
139 strncpy(name + 4, str, sizeof(name) - 5);
140 } else
141 strncpy(name, str, sizeof(name) - 1);
142 name[sizeof(name) - 1] = 0;
143 if ((options = strchr(str, ',')) != NULL)
144 *(options++) = 0;
145#ifdef __sparc__
146 if (!strcmp(str, "ttya"))
147 strcpy(name, "ttyS0");
148 if (!strcmp(str, "ttyb"))
149 strcpy(name, "ttyS1");
150#endif
151 for (s = name; *s; s++)
152 if ((*s >= '0' && *s <= '9') || *s == ',')
153 break;
154 idx = simple_strtoul(s, NULL, 10);
155 *s = 0;
156
157 add_preferred_console(name, idx, options);
158 return 1;
159}
160
161__setup("console=", console_setup);
162
163static int __init log_buf_len_setup(char *str) 125static int __init log_buf_len_setup(char *str)
164{ 126{
165 unsigned long size = memparse(str, &str); 127 unsigned long size = memparse(str, &str);
@@ -659,6 +621,44 @@ static void call_console_drivers(unsigned long start, unsigned long end)
659 621
660#endif 622#endif
661 623
624/*
625 * Set up a list of consoles. Called from init/main.c
626 */
627static int __init console_setup(char *str)
628{
629 char name[sizeof(console_cmdline[0].name)];
630 char *s, *options;
631 int idx;
632
633 /*
634 * Decode str into name, index, options.
635 */
636 if (str[0] >= '0' && str[0] <= '9') {
637 strcpy(name, "ttyS");
638 strncpy(name + 4, str, sizeof(name) - 5);
639 } else {
640 strncpy(name, str, sizeof(name) - 1);
641 }
642 name[sizeof(name) - 1] = 0;
643 if ((options = strchr(str, ',')) != NULL)
644 *(options++) = 0;
645#ifdef __sparc__
646 if (!strcmp(str, "ttya"))
647 strcpy(name, "ttyS0");
648 if (!strcmp(str, "ttyb"))
649 strcpy(name, "ttyS1");
650#endif
651 for (s = name; *s; s++)
652 if ((*s >= '0' && *s <= '9') || *s == ',')
653 break;
654 idx = simple_strtoul(s, NULL, 10);
655 *s = 0;
656
657 add_preferred_console(name, idx, options);
658 return 1;
659}
660__setup("console=", console_setup);
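Tracing the relocated console_setup() above: console=ttyS0,115200n8 copies the string into name, splits the options off at the comma, truncates name at its first digit and ends up calling add_preferred_console("ttyS", 0, "115200n8"); a bare console=42 takes the leading-digit branch and becomes add_preferred_console("ttyS", 42, NULL).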
661
662/** 662/**
663 * add_preferred_console - add a device to the list of preferred consoles. 663 * add_preferred_console - add a device to the list of preferred consoles.
664 * @name: device name 664 * @name: device name
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d7..5a730fdb1a 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/mutex.h>
26#include <asm/sections.h> 27#include <asm/sections.h>
27#include <asm/semaphore.h> 28#include <asm/semaphore.h>
28 29
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
44#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
46static DEFINE_PER_CPU(int, cpu_profile_flip); 47static DEFINE_PER_CPU(int, cpu_profile_flip);
47static DECLARE_MUTEX(profile_flip_mutex); 48static DEFINE_MUTEX(profile_flip_mutex);
48#endif /* CONFIG_SMP */ 49#endif /* CONFIG_SMP */
49 50
50static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
@@ -86,72 +87,52 @@ void __init profile_init(void)
86 87
87#ifdef CONFIG_PROFILING 88#ifdef CONFIG_PROFILING
88 89
89static DECLARE_RWSEM(profile_rwsem); 90static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
90static DEFINE_RWLOCK(handoff_lock); 91static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
91static struct notifier_block * task_exit_notifier; 92static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
92static struct notifier_block * task_free_notifier;
93static struct notifier_block * munmap_notifier;
94 93
95void profile_task_exit(struct task_struct * task) 94void profile_task_exit(struct task_struct * task)
96{ 95{
97 down_read(&profile_rwsem); 96 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
98 notifier_call_chain(&task_exit_notifier, 0, task);
99 up_read(&profile_rwsem);
100} 97}
101 98
102int profile_handoff_task(struct task_struct * task) 99int profile_handoff_task(struct task_struct * task)
103{ 100{
104 int ret; 101 int ret;
105 read_lock(&handoff_lock); 102 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
106 ret = notifier_call_chain(&task_free_notifier, 0, task);
107 read_unlock(&handoff_lock);
108 return (ret == NOTIFY_OK) ? 1 : 0; 103 return (ret == NOTIFY_OK) ? 1 : 0;
109} 104}
110 105
111void profile_munmap(unsigned long addr) 106void profile_munmap(unsigned long addr)
112{ 107{
113 down_read(&profile_rwsem); 108 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
114 notifier_call_chain(&munmap_notifier, 0, (void *)addr);
115 up_read(&profile_rwsem);
116} 109}
117 110
118int task_handoff_register(struct notifier_block * n) 111int task_handoff_register(struct notifier_block * n)
119{ 112{
120 int err = -EINVAL; 113 return atomic_notifier_chain_register(&task_free_notifier, n);
121
122 write_lock(&handoff_lock);
123 err = notifier_chain_register(&task_free_notifier, n);
124 write_unlock(&handoff_lock);
125 return err;
126} 114}
127 115
128int task_handoff_unregister(struct notifier_block * n) 116int task_handoff_unregister(struct notifier_block * n)
129{ 117{
130 int err = -EINVAL; 118 return atomic_notifier_chain_unregister(&task_free_notifier, n);
131
132 write_lock(&handoff_lock);
133 err = notifier_chain_unregister(&task_free_notifier, n);
134 write_unlock(&handoff_lock);
135 return err;
136} 119}
137 120
138int profile_event_register(enum profile_type type, struct notifier_block * n) 121int profile_event_register(enum profile_type type, struct notifier_block * n)
139{ 122{
140 int err = -EINVAL; 123 int err = -EINVAL;
141 124
142 down_write(&profile_rwsem);
143
144 switch (type) { 125 switch (type) {
145 case PROFILE_TASK_EXIT: 126 case PROFILE_TASK_EXIT:
146 err = notifier_chain_register(&task_exit_notifier, n); 127 err = blocking_notifier_chain_register(
128 &task_exit_notifier, n);
147 break; 129 break;
148 case PROFILE_MUNMAP: 130 case PROFILE_MUNMAP:
149 err = notifier_chain_register(&munmap_notifier, n); 131 err = blocking_notifier_chain_register(
132 &munmap_notifier, n);
150 break; 133 break;
151 } 134 }
152 135
153 up_write(&profile_rwsem);
154
155 return err; 136 return err;
156} 137}
157 138
@@ -160,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
160{ 141{
161 int err = -EINVAL; 142 int err = -EINVAL;
162 143
163 down_write(&profile_rwsem);
164
165 switch (type) { 144 switch (type) {
166 case PROFILE_TASK_EXIT: 145 case PROFILE_TASK_EXIT:
167 err = notifier_chain_unregister(&task_exit_notifier, n); 146 err = blocking_notifier_chain_unregister(
147 &task_exit_notifier, n);
168 break; 148 break;
169 case PROFILE_MUNMAP: 149 case PROFILE_MUNMAP:
170 err = notifier_chain_unregister(&munmap_notifier, n); 150 err = blocking_notifier_chain_unregister(
151 &munmap_notifier, n);
171 break; 152 break;
172 } 153 }
173 154
174 up_write(&profile_rwsem);
175 return err; 155 return err;
176} 156}
177 157
@@ -243,7 +223,7 @@ static void profile_flip_buffers(void)
243{ 223{
244 int i, j, cpu; 224 int i, j, cpu;
245 225
246 down(&profile_flip_mutex); 226 mutex_lock(&profile_flip_mutex);
247 j = per_cpu(cpu_profile_flip, get_cpu()); 227 j = per_cpu(cpu_profile_flip, get_cpu());
248 put_cpu(); 228 put_cpu();
249 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 229 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +239,14 @@ static void profile_flip_buffers(void)
259 hits[i].hits = hits[i].pc = 0; 239 hits[i].hits = hits[i].pc = 0;
260 } 240 }
261 } 241 }
262 up(&profile_flip_mutex); 242 mutex_unlock(&profile_flip_mutex);
263} 243}
264 244
265static void profile_discard_flip_buffers(void) 245static void profile_discard_flip_buffers(void)
266{ 246{
267 int i, cpu; 247 int i, cpu;
268 248
269 down(&profile_flip_mutex); 249 mutex_lock(&profile_flip_mutex);
270 i = per_cpu(cpu_profile_flip, get_cpu()); 250 i = per_cpu(cpu_profile_flip, get_cpu());
271 put_cpu(); 251 put_cpu();
272 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 252 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +254,7 @@ static void profile_discard_flip_buffers(void)
274 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; 254 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
275 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); 255 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
276 } 256 }
277 up(&profile_flip_mutex); 257 mutex_unlock(&profile_flip_mutex);
278} 258}
279 259
280void profile_hit(int type, void *__pc) 260void profile_hit(int type, void *__pc)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d95a72c927..86a7f6c60c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -35,9 +35,9 @@ void __ptrace_link(task_t *child, task_t *new_parent)
35 if (child->parent == new_parent) 35 if (child->parent == new_parent)
36 return; 36 return;
37 list_add(&child->ptrace_list, &child->parent->ptrace_children); 37 list_add(&child->ptrace_list, &child->parent->ptrace_children);
38 REMOVE_LINKS(child); 38 remove_parent(child);
39 child->parent = new_parent; 39 child->parent = new_parent;
40 SET_LINKS(child); 40 add_parent(child);
41} 41}
42 42
43/* 43/*
@@ -77,9 +77,9 @@ void __ptrace_unlink(task_t *child)
77 child->ptrace = 0; 77 child->ptrace = 0;
78 if (!list_empty(&child->ptrace_list)) { 78 if (!list_empty(&child->ptrace_list)) {
79 list_del_init(&child->ptrace_list); 79 list_del_init(&child->ptrace_list);
80 REMOVE_LINKS(child); 80 remove_parent(child);
81 child->parent = child->real_parent; 81 child->parent = child->real_parent;
82 SET_LINKS(child); 82 add_parent(child);
83 } 83 }
84 84
85 ptrace_untrace(child); 85 ptrace_untrace(child);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index fedf5e3697..13458bbaa1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,15 +47,16 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/mutex.h>
50 51
51/* Definition for rcupdate control block. */ 52/* Definition for rcupdate control block. */
52struct rcu_ctrlblk rcu_ctrlblk = { 53static struct rcu_ctrlblk rcu_ctrlblk = {
53 .cur = -300, 54 .cur = -300,
54 .completed = -300, 55 .completed = -300,
55 .lock = SPIN_LOCK_UNLOCKED, 56 .lock = SPIN_LOCK_UNLOCKED,
56 .cpumask = CPU_MASK_NONE, 57 .cpumask = CPU_MASK_NONE,
57}; 58};
58struct rcu_ctrlblk rcu_bh_ctrlblk = { 59static struct rcu_ctrlblk rcu_bh_ctrlblk = {
59 .cur = -300, 60 .cur = -300,
60 .completed = -300, 61 .completed = -300,
61 .lock = SPIN_LOCK_UNLOCKED, 62 .lock = SPIN_LOCK_UNLOCKED,
@@ -75,7 +76,7 @@ static int rsinterval = 1000;
75#endif 76#endif
76 77
77static atomic_t rcu_barrier_cpu_count; 78static atomic_t rcu_barrier_cpu_count;
78static struct semaphore rcu_barrier_sema; 79static DEFINE_MUTEX(rcu_barrier_mutex);
79static struct completion rcu_barrier_completion; 80static struct completion rcu_barrier_completion;
80 81
81#ifdef CONFIG_SMP 82#ifdef CONFIG_SMP
@@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused)
207void rcu_barrier(void) 208void rcu_barrier(void)
208{ 209{
209 BUG_ON(in_interrupt()); 210 BUG_ON(in_interrupt());
210 /* Take cpucontrol semaphore to protect against CPU hotplug */ 211 /* Take cpucontrol mutex to protect against CPU hotplug */
211 down(&rcu_barrier_sema); 212 mutex_lock(&rcu_barrier_mutex);
212 init_completion(&rcu_barrier_completion); 213 init_completion(&rcu_barrier_completion);
213 atomic_set(&rcu_barrier_cpu_count, 0); 214 atomic_set(&rcu_barrier_cpu_count, 0);
214 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 215 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
215 wait_for_completion(&rcu_barrier_completion); 216 wait_for_completion(&rcu_barrier_completion);
216 up(&rcu_barrier_sema); 217 mutex_unlock(&rcu_barrier_mutex);
217} 218}
218EXPORT_SYMBOL_GPL(rcu_barrier); 219EXPORT_SYMBOL_GPL(rcu_barrier);
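For reference, rcu_barrier() waits until every call_rcu() callback already queued on any CPU has run; its typical caller is a module-unload path that must not return (and let the module text or data be freed) while callbacks are still pending. The change above only swaps how concurrent callers serialize (a statically initialized mutex instead of a semaphore set up in rcu_init()) and does not alter that guarantee.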
219 220
@@ -415,8 +416,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
415 rdp->curtail = &rdp->curlist; 416 rdp->curtail = &rdp->curlist;
416 } 417 }
417 418
418 local_irq_disable();
419 if (rdp->nxtlist && !rdp->curlist) { 419 if (rdp->nxtlist && !rdp->curlist) {
420 local_irq_disable();
420 rdp->curlist = rdp->nxtlist; 421 rdp->curlist = rdp->nxtlist;
421 rdp->curtail = rdp->nxttail; 422 rdp->curtail = rdp->nxttail;
422 rdp->nxtlist = NULL; 423 rdp->nxtlist = NULL;
@@ -441,9 +442,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
441 rcu_start_batch(rcp); 442 rcu_start_batch(rcp);
442 spin_unlock(&rcp->lock); 443 spin_unlock(&rcp->lock);
443 } 444 }
444 } else {
445 local_irq_enable();
446 } 445 }
446
447 rcu_check_quiescent_state(rcp, rdp); 447 rcu_check_quiescent_state(rcp, rdp);
448 if (rdp->donelist) 448 if (rdp->donelist)
449 rcu_do_batch(rdp); 449 rcu_do_batch(rdp);
@@ -549,7 +549,6 @@ static struct notifier_block __devinitdata rcu_nb = {
549 */ 549 */
550void __init rcu_init(void) 550void __init rcu_init(void)
551{ 551{
552 sema_init(&rcu_barrier_sema, 1);
553 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 552 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
554 (void *)(long)smp_processor_id()); 553 (void *)(long)smp_processor_id());
555 /* Register notifier for non-boot CPUs */ 554 /* Register notifier for non-boot CPUs */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7712912dbc..8154e7589d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -54,15 +54,15 @@ static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56 56
57MODULE_PARM(nreaders, "i"); 57module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
59MODULE_PARM(stat_interval, "i"); 59module_param(stat_interval, int, 0);
60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
61MODULE_PARM(verbose, "i"); 61module_param(verbose, bool, 0);
62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
63MODULE_PARM(test_no_idle_hz, "i"); 63module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65MODULE_PARM(shuffle_interval, "i"); 65module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 67#define TORTURE_FLAG "rcutorture: "
68#define PRINTK_STRING(s) \ 68#define PRINTK_STRING(s) \
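A side note on the hunk above (my gloss, not part of the patch): the old MODULE_PARM() string codes are replaced by module_param(), which takes a C type and a sysfs permission mask. rcutorture passes 0, so its knobs stay invisible at runtime; a nonzero mask would publish them under /sys/module/<module>/parameters/. A hypothetical declaration:

#include <linux/moduleparam.h>

static int example_interval = 5;
module_param(example_interval, int, 0444);      /* world-readable in sysfs */
MODULE_PARM_DESC(example_interval, "Illustrative tunable, in seconds");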
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
303 303
304 for_each_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -441,6 +441,16 @@ rcu_torture_shuffle(void *arg)
441 return 0; 441 return 0;
442} 442}
443 443
444static inline void
445rcu_torture_print_module_parms(char *tag)
446{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
451 shuffle_interval);
452}
453
444static void 454static void
445rcu_torture_cleanup(void) 455rcu_torture_cleanup(void)
446{ 456{
@@ -483,9 +493,10 @@ rcu_torture_cleanup(void)
483 rcu_barrier(); 493 rcu_barrier();
484 494
485 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
486 printk(KERN_ALERT TORTURE_FLAG 496 if (atomic_read(&n_rcu_torture_error))
487 "--- End of test: %s\n", 497 rcu_torture_print_module_parms("End of test: FAILURE");
488 atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); 498 else
499 rcu_torture_print_module_parms("End of test: SUCCESS");
489} 500}
490 501
491static int 502static int
@@ -501,11 +512,7 @@ rcu_torture_init(void)
501 nrealreaders = nreaders; 512 nrealreaders = nreaders;
502 else 513 else
503 nrealreaders = 2 * num_online_cpus(); 514 nrealreaders = 2 * num_online_cpus();
504 printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " 515 rcu_torture_print_module_parms("Start of test");
505 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
506 "shuffle_interval = %d\n",
507 nrealreaders, stat_interval, verbose, test_no_idle_hz,
508 shuffle_interval);
509 fullstop = 0; 516 fullstop = 0;
510 517
511 /* Set up the freelist. */ 518 /* Set up the freelist. */
@@ -528,7 +535,7 @@ rcu_torture_init(void)
528 atomic_set(&n_rcu_torture_error, 0); 535 atomic_set(&n_rcu_torture_error, 0);
529 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
530 atomic_set(&rcu_torture_wcount[i], 0); 537 atomic_set(&rcu_torture_wcount[i], 0);
531 for_each_cpu(cpu) { 538 for_each_possible_cpu(cpu) {
532 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
533 per_cpu(rcu_torture_count, cpu)[i] = 0; 540 per_cpu(rcu_torture_count, cpu)[i] = 0;
534 per_cpu(rcu_torture_batch, cpu)[i] = 0; 541 per_cpu(rcu_torture_batch, cpu)[i] = 0;
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 0000000000..33345e7348
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,1012 @@
1/*
2 * Public API and common code for kernel->userspace relay file support.
3 *
4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
5 *
6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
8 *
9 * Moved to kernel/relay.c by Paul Mundt, 2006.
10 *
11 * This file is released under the GPL.
12 */
13#include <linux/errno.h>
14#include <linux/stddef.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/relay.h>
19#include <linux/vmalloc.h>
20#include <linux/mm.h>
21
22/*
23 * close() vm_op implementation for relay file mapping.
24 */
25static void relay_file_mmap_close(struct vm_area_struct *vma)
26{
27 struct rchan_buf *buf = vma->vm_private_data;
28 buf->chan->cb->buf_unmapped(buf, vma->vm_file);
29}
30
31/*
32 * nopage() vm_op implementation for relay file mapping.
33 */
34static struct page *relay_buf_nopage(struct vm_area_struct *vma,
35 unsigned long address,
36 int *type)
37{
38 struct page *page;
39 struct rchan_buf *buf = vma->vm_private_data;
40 unsigned long offset = address - vma->vm_start;
41
42 if (address > vma->vm_end)
43 return NOPAGE_SIGBUS; /* Disallow mremap */
44 if (!buf)
45 return NOPAGE_OOM;
46
47 page = vmalloc_to_page(buf->start + offset);
48 if (!page)
49 return NOPAGE_OOM;
50 get_page(page);
51
52 if (type)
53 *type = VM_FAULT_MINOR;
54
55 return page;
56}
57
58/*
59 * vm_ops for relay file mappings.
60 */
61static struct vm_operations_struct relay_file_mmap_ops = {
62 .nopage = relay_buf_nopage,
63 .close = relay_file_mmap_close,
64};
65
66/**
67 * relay_mmap_buf: - mmap channel buffer to process address space
68 * @buf: relay channel buffer
69 * @vma: vm_area_struct describing memory to be mapped
70 *
71 * Returns 0 if ok, negative on error
72 *
73 * Caller should already have grabbed mmap_sem.
74 */
75int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
76{
77 unsigned long length = vma->vm_end - vma->vm_start;
78 struct file *filp = vma->vm_file;
79
80 if (!buf)
81 return -EBADF;
82
83 if (length != (unsigned long)buf->chan->alloc_size)
84 return -EINVAL;
85
86 vma->vm_ops = &relay_file_mmap_ops;
87 vma->vm_private_data = buf;
88 buf->chan->cb->buf_mapped(buf, filp);
89
90 return 0;
91}
92
93/**
94 * relay_alloc_buf - allocate a channel buffer
95 * @buf: the buffer struct
96 * @size: total size of the buffer
97 *
98 * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
99 * passed in size will get page aligned, if it isn't already.
100 */
101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
102{
103 void *mem;
104 unsigned int i, j, n_pages;
105
106 *size = PAGE_ALIGN(*size);
107 n_pages = *size >> PAGE_SHIFT;
108
109 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
110 if (!buf->page_array)
111 return NULL;
112
113 for (i = 0; i < n_pages; i++) {
114 buf->page_array[i] = alloc_page(GFP_KERNEL);
115 if (unlikely(!buf->page_array[i]))
116 goto depopulate;
117 }
118 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
119 if (!mem)
120 goto depopulate;
121
122 memset(mem, 0, *size);
123 buf->page_count = n_pages;
124 return mem;
125
126depopulate:
127 for (j = 0; j < i; j++)
128 __free_page(buf->page_array[j]);
129 kfree(buf->page_array);
130 return NULL;
131}
132
133/**
134 * relay_create_buf - allocate and initialize a channel buffer
135 * @chan: the relay channel; the buffer size and sub-buffer count
136 * are taken from chan->alloc_size and chan->n_subbufs
137 *
138 * Returns channel buffer if successful, NULL otherwise
139 */
140struct rchan_buf *relay_create_buf(struct rchan *chan)
141{
142 struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
143 if (!buf)
144 return NULL;
145
146 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
147 if (!buf->padding)
148 goto free_buf;
149
150 buf->start = relay_alloc_buf(buf, &chan->alloc_size);
151 if (!buf->start)
152 goto free_buf;
153
154 buf->chan = chan;
155 kref_get(&buf->chan->kref);
156 return buf;
157
158free_buf:
159 kfree(buf->padding);
160 kfree(buf);
161 return NULL;
162}
163
164/**
165 * relay_destroy_channel - free the channel struct
166 *
167 * Should only be called from kref_put().
168 */
169void relay_destroy_channel(struct kref *kref)
170{
171 struct rchan *chan = container_of(kref, struct rchan, kref);
172 kfree(chan);
173}
174
175/**
176 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
177 * @buf: the buffer struct
178 */
179void relay_destroy_buf(struct rchan_buf *buf)
180{
181 struct rchan *chan = buf->chan;
182 unsigned int i;
183
184 if (likely(buf->start)) {
185 vunmap(buf->start);
186 for (i = 0; i < buf->page_count; i++)
187 __free_page(buf->page_array[i]);
188 kfree(buf->page_array);
189 }
190 kfree(buf->padding);
191 kfree(buf);
192 kref_put(&chan->kref, relay_destroy_channel);
193}
194
195/**
196 * relay_remove_buf - remove a channel buffer
197 *
198 * Removes the file from the filesystem, which also frees the
199 * rchan_buf struct and the channel buffer. Should only be called from
200 * kref_put().
201 */
202void relay_remove_buf(struct kref *kref)
203{
204 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
205 buf->chan->cb->remove_buf_file(buf->dentry);
206 relay_destroy_buf(buf);
207}
208
209/**
210 * relay_buf_empty - boolean, is the channel buffer empty?
211 * @buf: channel buffer
212 *
213 * Returns 1 if the buffer is empty, 0 otherwise.
214 */
215int relay_buf_empty(struct rchan_buf *buf)
216{
217 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
218}
219EXPORT_SYMBOL_GPL(relay_buf_empty);
220
221/**
222 * relay_buf_full - boolean, is the channel buffer full?
223 * @buf: channel buffer
224 *
225 * Returns 1 if the buffer is full, 0 otherwise.
226 */
227int relay_buf_full(struct rchan_buf *buf)
228{
229 size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
230 return (ready >= buf->chan->n_subbufs) ? 1 : 0;
231}
232EXPORT_SYMBOL_GPL(relay_buf_full);
233
234/*
235 * High-level relay kernel API and associated functions.
236 */
237
238/*
239 * rchan_callback implementations defining default channel behavior. Used
240 * in place of corresponding NULL values in client callback struct.
241 */
242
243/*
244 * subbuf_start() default callback. Does nothing.
245 */
246static int subbuf_start_default_callback (struct rchan_buf *buf,
247 void *subbuf,
248 void *prev_subbuf,
249 size_t prev_padding)
250{
251 if (relay_buf_full(buf))
252 return 0;
253
254 return 1;
255}
256
257/*
258 * buf_mapped() default callback. Does nothing.
259 */
260static void buf_mapped_default_callback(struct rchan_buf *buf,
261 struct file *filp)
262{
263}
264
265/*
266 * buf_unmapped() default callback. Does nothing.
267 */
268static void buf_unmapped_default_callback(struct rchan_buf *buf,
269 struct file *filp)
270{
271}
272
273/*
274 * create_buf_file() default callback. Does nothing.
275 */
276static struct dentry *create_buf_file_default_callback(const char *filename,
277 struct dentry *parent,
278 int mode,
279 struct rchan_buf *buf,
280 int *is_global)
281{
282 return NULL;
283}
284
285/*
286 * remove_buf_file() default callback. Does nothing.
287 */
288static int remove_buf_file_default_callback(struct dentry *dentry)
289{
290 return -EINVAL;
291}
292
293/* relay channel default callbacks */
294static struct rchan_callbacks default_channel_callbacks = {
295 .subbuf_start = subbuf_start_default_callback,
296 .buf_mapped = buf_mapped_default_callback,
297 .buf_unmapped = buf_unmapped_default_callback,
298 .create_buf_file = create_buf_file_default_callback,
299 .remove_buf_file = remove_buf_file_default_callback,
300};
301
302/**
303 * wakeup_readers - wake up readers waiting on a channel
304 * @private: the channel buffer
305 *
306 * This is the work function used to defer reader waking. The
307 * reason waking is deferred is that calling it directly from write
308 * causes problems if you're writing from, say, the scheduler.
309 */
310static void wakeup_readers(void *private)
311{
312 struct rchan_buf *buf = private;
313 wake_up_interruptible(&buf->read_wait);
314}
315
316/**
317 * __relay_reset - reset a channel buffer
318 * @buf: the channel buffer
319 * @init: 1 if this is a first-time initialization
320 *
321 * See relay_reset for description of effect.
322 */
323static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
324{
325 size_t i;
326
327 if (init) {
328 init_waitqueue_head(&buf->read_wait);
329 kref_init(&buf->kref);
330 INIT_WORK(&buf->wake_readers, NULL, NULL);
331 } else {
332 cancel_delayed_work(&buf->wake_readers);
333 flush_scheduled_work();
334 }
335
336 buf->subbufs_produced = 0;
337 buf->subbufs_consumed = 0;
338 buf->bytes_consumed = 0;
339 buf->finalized = 0;
340 buf->data = buf->start;
341 buf->offset = 0;
342
343 for (i = 0; i < buf->chan->n_subbufs; i++)
344 buf->padding[i] = 0;
345
346 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
347}
348
349/**
350 * relay_reset - reset the channel
351 * @chan: the channel
352 *
353 * This has the effect of erasing all data from all channel buffers
354 * and restarting the channel in its initial state. The buffers
355 * are not freed, so any mappings are still in effect.
356 *
357 * NOTE: Care should be taken that the channel isn't actually
358 * being used by anything when this call is made.
359 */
360void relay_reset(struct rchan *chan)
361{
362 unsigned int i;
363 struct rchan_buf *prev = NULL;
364
365 if (!chan)
366 return;
367
368 for (i = 0; i < NR_CPUS; i++) {
369 if (!chan->buf[i] || chan->buf[i] == prev)
370 break;
371 __relay_reset(chan->buf[i], 0);
372 prev = chan->buf[i];
373 }
374}
375EXPORT_SYMBOL_GPL(relay_reset);
376
377/**
378 * relay_open_buf - create a new relay channel buffer
379 *
380 * Internal - used by relay_open().
381 */
382static struct rchan_buf *relay_open_buf(struct rchan *chan,
383 const char *filename,
384 struct dentry *parent,
385 int *is_global)
386{
387 struct rchan_buf *buf;
388 struct dentry *dentry;
389
390 if (*is_global)
391 return chan->buf[0];
392
393 buf = relay_create_buf(chan);
394 if (!buf)
395 return NULL;
396
397 /* Create file in fs */
398 dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
399 buf, is_global);
400 if (!dentry) {
401 relay_destroy_buf(buf);
402 return NULL;
403 }
404
405 buf->dentry = dentry;
406 __relay_reset(buf, 1);
407
408 return buf;
409}
410
411/**
412 * relay_close_buf - close a channel buffer
413 * @buf: channel buffer
414 *
415 * Marks the buffer finalized and cancels any deferred reader wakeup.
416 * The channel buffer and channel buffer data structure are then freed
417 * automatically when the last reference is given up.
418 */
419static inline void relay_close_buf(struct rchan_buf *buf)
420{
421 buf->finalized = 1;
422 cancel_delayed_work(&buf->wake_readers);
423 flush_scheduled_work();
424 kref_put(&buf->kref, relay_remove_buf);
425}
426
427static inline void setup_callbacks(struct rchan *chan,
428 struct rchan_callbacks *cb)
429{
430 if (!cb) {
431 chan->cb = &default_channel_callbacks;
432 return;
433 }
434
435 if (!cb->subbuf_start)
436 cb->subbuf_start = subbuf_start_default_callback;
437 if (!cb->buf_mapped)
438 cb->buf_mapped = buf_mapped_default_callback;
439 if (!cb->buf_unmapped)
440 cb->buf_unmapped = buf_unmapped_default_callback;
441 if (!cb->create_buf_file)
442 cb->create_buf_file = create_buf_file_default_callback;
443 if (!cb->remove_buf_file)
444 cb->remove_buf_file = remove_buf_file_default_callback;
445 chan->cb = cb;
446}
447
448/**
449 * relay_open - create a new relay channel
450 * @base_filename: base name of files to create
451 * @parent: dentry of parent directory, NULL for root directory
452 * @subbuf_size: size of sub-buffers
453 * @n_subbufs: number of sub-buffers
454 * @cb: client callback functions
455 *
456 * Returns channel pointer if successful, NULL otherwise.
457 *
458 * Creates a channel buffer for each cpu using the sizes and
459 * attributes specified. The created channel buffer files
460 * will be named base_filename0...base_filenameN-1. File
461 * permissions will be S_IRUSR.
462 */
463struct rchan *relay_open(const char *base_filename,
464 struct dentry *parent,
465 size_t subbuf_size,
466 size_t n_subbufs,
467 struct rchan_callbacks *cb)
468{
469 unsigned int i;
470 struct rchan *chan;
471 char *tmpname;
472 int is_global = 0;
473
474 if (!base_filename)
475 return NULL;
476
477 if (!(subbuf_size && n_subbufs))
478 return NULL;
479
480 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
481 if (!chan)
482 return NULL;
483
484 chan->version = RELAYFS_CHANNEL_VERSION;
485 chan->n_subbufs = n_subbufs;
486 chan->subbuf_size = subbuf_size;
487 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
488 setup_callbacks(chan, cb);
489 kref_init(&chan->kref);
490
491 tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
492 if (!tmpname)
493 goto free_chan;
494
495 for_each_online_cpu(i) {
496 sprintf(tmpname, "%s%d", base_filename, i);
497 chan->buf[i] = relay_open_buf(chan, tmpname, parent,
498 &is_global);
499 if (!chan->buf[i])
500 goto free_bufs;
501
502 chan->buf[i]->cpu = i;
503 }
504
505 kfree(tmpname);
506 return chan;
507
508free_bufs:
509 for (i = 0; i < NR_CPUS; i++) {
510 if (!chan->buf[i])
511 break;
512 relay_close_buf(chan->buf[i]);
513 if (is_global)
514 break;
515 }
516 kfree(tmpname);
517
518free_chan:
519 kref_put(&chan->kref, relay_destroy_channel);
520 return NULL;
521}
522EXPORT_SYMBOL_GPL(relay_open);
523
524/**
525 * relay_switch_subbuf - switch to a new sub-buffer
526 * @buf: channel buffer
527 * @length: size of current event
528 *
529 * Returns either the length passed in or 0 if full.
530 *
531 * Performs sub-buffer-switch tasks such as invoking callbacks,
532 * updating padding counts, waking up readers, etc.
533 */
534size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
535{
536 void *old, *new;
537 size_t old_subbuf, new_subbuf;
538
539 if (unlikely(length > buf->chan->subbuf_size))
540 goto toobig;
541
542 if (buf->offset != buf->chan->subbuf_size + 1) {
543 buf->prev_padding = buf->chan->subbuf_size - buf->offset;
544 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
545 buf->padding[old_subbuf] = buf->prev_padding;
546 buf->subbufs_produced++;
547 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
548 buf->padding[old_subbuf];
549 smp_mb();
550 if (waitqueue_active(&buf->read_wait)) {
551 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
552 schedule_delayed_work(&buf->wake_readers, 1);
553 }
554 }
555
556 old = buf->data;
557 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
558 new = buf->start + new_subbuf * buf->chan->subbuf_size;
559 buf->offset = 0;
560 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
561 buf->offset = buf->chan->subbuf_size + 1;
562 return 0;
563 }
564 buf->data = new;
565 buf->padding[new_subbuf] = 0;
566
567 if (unlikely(length + buf->offset > buf->chan->subbuf_size))
568 goto toobig;
569
570 return length;
571
572toobig:
573 buf->chan->last_toobig = length;
574 return 0;
575}
576EXPORT_SYMBOL_GPL(relay_switch_subbuf);
577
578/**
579 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
580 * @chan: the channel
581 * @cpu: the cpu associated with the channel buffer to update
582 * @subbufs_consumed: number of sub-buffers to add to current buf's count
583 *
584 * Adds to the channel buffer's consumed sub-buffer count.
585 * subbufs_consumed should be the number of sub-buffers newly consumed,
586 * not the total consumed.
587 *
588 * NOTE: kernel clients don't need to call this function if the channel
589 * mode is 'overwrite'.
590 */
591void relay_subbufs_consumed(struct rchan *chan,
592 unsigned int cpu,
593 size_t subbufs_consumed)
594{
595 struct rchan_buf *buf;
596
597 if (!chan)
598 return;
599
600 if (cpu >= NR_CPUS || !chan->buf[cpu])
601 return;
602
603 buf = chan->buf[cpu];
604 buf->subbufs_consumed += subbufs_consumed;
605 if (buf->subbufs_consumed > buf->subbufs_produced)
606 buf->subbufs_consumed = buf->subbufs_produced;
607}
608EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
609
610/**
611 * relay_close - close the channel
612 * @chan: the channel
613 *
614 * Closes all channel buffers and frees the channel.
615 */
616void relay_close(struct rchan *chan)
617{
618 unsigned int i;
619 struct rchan_buf *prev = NULL;
620
621 if (!chan)
622 return;
623
624 for (i = 0; i < NR_CPUS; i++) {
625 if (!chan->buf[i] || chan->buf[i] == prev)
626 break;
627 relay_close_buf(chan->buf[i]);
628 prev = chan->buf[i];
629 }
630
631 if (chan->last_toobig)
632 printk(KERN_WARNING "relay: one or more items not logged "
633 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
634 chan->last_toobig, chan->subbuf_size);
635
636 kref_put(&chan->kref, relay_destroy_channel);
637}
638EXPORT_SYMBOL_GPL(relay_close);
639
640/**
641 * relay_flush - flush the channel
642 * @chan: the channel
643 *
644 * Flushes all channel buffers i.e. forces buffer switch.
645 */
646void relay_flush(struct rchan *chan)
647{
648 unsigned int i;
649 struct rchan_buf *prev = NULL;
650
651 if (!chan)
652 return;
653
654 for (i = 0; i < NR_CPUS; i++) {
655 if (!chan->buf[i] || chan->buf[i] == prev)
656 break;
657 relay_switch_subbuf(chan->buf[i], 0);
658 prev = chan->buf[i];
659 }
660}
661EXPORT_SYMBOL_GPL(relay_flush);
662
663/**
664 * relay_file_open - open file op for relay files
665 * @inode: the inode
666 * @filp: the file
667 *
668 * Increments the channel buffer refcount.
669 */
670static int relay_file_open(struct inode *inode, struct file *filp)
671{
672 struct rchan_buf *buf = inode->u.generic_ip;
673 kref_get(&buf->kref);
674 filp->private_data = buf;
675
676 return 0;
677}
678
679/**
680 * relay_file_mmap - mmap file op for relay files
681 * @filp: the file
682 * @vma: the vma describing what to map
683 *
684 * Calls upon relay_mmap_buf to map the file into user space.
685 */
686static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
687{
688 struct rchan_buf *buf = filp->private_data;
689 return relay_mmap_buf(buf, vma);
690}
691
692/**
693 * relay_file_poll - poll file op for relay files
694 * @filp: the file
695 * @wait: poll table
696 *
697 * Poll implementation.
698 */
699static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
700{
701 unsigned int mask = 0;
702 struct rchan_buf *buf = filp->private_data;
703
704 if (buf->finalized)
705 return POLLERR;
706
707 if (filp->f_mode & FMODE_READ) {
708 poll_wait(filp, &buf->read_wait, wait);
709 if (!relay_buf_empty(buf))
710 mask |= POLLIN | POLLRDNORM;
711 }
712
713 return mask;
714}
715
716/**
717 * relay_file_release - release file op for relay files
718 * @inode: the inode
719 * @filp: the file
720 *
721 * Decrements the channel buffer refcount, as the filesystem is
722 * no longer using it.
723 */
724static int relay_file_release(struct inode *inode, struct file *filp)
725{
726 struct rchan_buf *buf = filp->private_data;
727 kref_put(&buf->kref, relay_remove_buf);
728
729 return 0;
730}
731
732/**
733 * relay_file_read_consume - update the consumed count for the buffer
734 */
735static void relay_file_read_consume(struct rchan_buf *buf,
736 size_t read_pos,
737 size_t bytes_consumed)
738{
739 size_t subbuf_size = buf->chan->subbuf_size;
740 size_t n_subbufs = buf->chan->n_subbufs;
741 size_t read_subbuf;
742
743 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
744 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
745 buf->bytes_consumed = 0;
746 }
747
748 buf->bytes_consumed += bytes_consumed;
749 read_subbuf = read_pos / buf->chan->subbuf_size;
750 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
751 if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
752 (buf->offset == subbuf_size))
753 return;
754 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
755 buf->bytes_consumed = 0;
756 }
757}
758
759/**
760 * relay_file_read_avail - boolean, are there unconsumed bytes available?
761 */
762static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
763{
764 size_t subbuf_size = buf->chan->subbuf_size;
765 size_t n_subbufs = buf->chan->n_subbufs;
766 size_t produced = buf->subbufs_produced;
767 size_t consumed = buf->subbufs_consumed;
768
769 relay_file_read_consume(buf, read_pos, 0);
770
771 if (unlikely(buf->offset > subbuf_size)) {
772 if (produced == consumed)
773 return 0;
774 return 1;
775 }
776
777 if (unlikely(produced - consumed >= n_subbufs)) {
778 consumed = (produced / n_subbufs) * n_subbufs;
779 buf->subbufs_consumed = consumed;
780 }
781
782 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
783 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
784
785 if (consumed > produced)
786 produced += n_subbufs * subbuf_size;
787
788 if (consumed == produced)
789 return 0;
790
791 return 1;
792}
793
794/**
795 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
796 */
797static size_t relay_file_read_subbuf_avail(size_t read_pos,
798 struct rchan_buf *buf)
799{
800 size_t padding, avail = 0;
801 size_t read_subbuf, read_offset, write_subbuf, write_offset;
802 size_t subbuf_size = buf->chan->subbuf_size;
803
804 write_subbuf = (buf->data - buf->start) / subbuf_size;
805 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
806 read_subbuf = read_pos / subbuf_size;
807 read_offset = read_pos % subbuf_size;
808 padding = buf->padding[read_subbuf];
809
810 if (read_subbuf == write_subbuf) {
811 if (read_offset + padding < write_offset)
812 avail = write_offset - (read_offset + padding);
813 } else
814 avail = (subbuf_size - padding) - read_offset;
815
816 return avail;
817}
818
819/**
820 * relay_file_read_start_pos - find the first available byte to read
821 *
822 * If the read_pos is in the middle of padding, return the
823 * position of the first actually available byte, otherwise
824 * return the original value.
825 */
826static size_t relay_file_read_start_pos(size_t read_pos,
827 struct rchan_buf *buf)
828{
829 size_t read_subbuf, padding, padding_start, padding_end;
830 size_t subbuf_size = buf->chan->subbuf_size;
831 size_t n_subbufs = buf->chan->n_subbufs;
832
833 read_subbuf = read_pos / subbuf_size;
834 padding = buf->padding[read_subbuf];
835 padding_start = (read_subbuf + 1) * subbuf_size - padding;
836 padding_end = (read_subbuf + 1) * subbuf_size;
837 if (read_pos >= padding_start && read_pos < padding_end) {
838 read_subbuf = (read_subbuf + 1) % n_subbufs;
839 read_pos = read_subbuf * subbuf_size;
840 }
841
842 return read_pos;
843}
844
845/**
846 * relay_file_read_end_pos - return the new read position
847 */
848static size_t relay_file_read_end_pos(struct rchan_buf *buf,
849 size_t read_pos,
850 size_t count)
851{
852 size_t read_subbuf, padding, end_pos;
853 size_t subbuf_size = buf->chan->subbuf_size;
854 size_t n_subbufs = buf->chan->n_subbufs;
855
856 read_subbuf = read_pos / subbuf_size;
857 padding = buf->padding[read_subbuf];
858 if (read_pos % subbuf_size + count + padding == subbuf_size)
859 end_pos = (read_subbuf + 1) * subbuf_size;
860 else
861 end_pos = read_pos + count;
862 if (end_pos >= subbuf_size * n_subbufs)
863 end_pos = 0;
864
865 return end_pos;
866}
867
868/**
869 * subbuf_read_actor - read up to one subbuf's worth of data
870 */
871static int subbuf_read_actor(size_t read_start,
872 struct rchan_buf *buf,
873 size_t avail,
874 read_descriptor_t *desc,
875 read_actor_t actor)
876{
877 void *from;
878 int ret = 0;
879
880 from = buf->start + read_start;
881 ret = avail;
882 if (copy_to_user(desc->arg.data, from, avail)) {
883 desc->error = -EFAULT;
884 ret = 0;
885 }
886 desc->arg.data += ret;
887 desc->written += ret;
888 desc->count -= ret;
889
890 return ret;
891}
892
893/**
894 * subbuf_send_actor - send up to one subbuf's worth of data
895 */
896static int subbuf_send_actor(size_t read_start,
897 struct rchan_buf *buf,
898 size_t avail,
899 read_descriptor_t *desc,
900 read_actor_t actor)
901{
902 unsigned long pidx, poff;
903 unsigned int subbuf_pages;
904 int ret = 0;
905
906 subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
907 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
908 poff = read_start & ~PAGE_MASK;
909 while (avail) {
910 struct page *p = buf->page_array[pidx];
911 unsigned int len;
912
913 len = PAGE_SIZE - poff;
914 if (len > avail)
915 len = avail;
916
917 len = actor(desc, p, poff, len);
918 if (desc->error)
919 break;
920
921 avail -= len;
922 ret += len;
923 poff = 0;
924 pidx = (pidx + 1) % subbuf_pages;
925 }
926
927 return ret;
928}
929
930typedef int (*subbuf_actor_t) (size_t read_start,
931 struct rchan_buf *buf,
932 size_t avail,
933 read_descriptor_t *desc,
934 read_actor_t actor);
935
936/**
937 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
938 */
939static inline ssize_t relay_file_read_subbufs(struct file *filp,
940 loff_t *ppos,
941 size_t count,
942 subbuf_actor_t subbuf_actor,
943 read_actor_t actor,
944 void *target)
945{
946 struct rchan_buf *buf = filp->private_data;
947 size_t read_start, avail;
948 read_descriptor_t desc;
949 int ret;
950
951 if (!count)
952 return 0;
953
954 desc.written = 0;
955 desc.count = count;
956 desc.arg.data = target;
957 desc.error = 0;
958
959 mutex_lock(&filp->f_dentry->d_inode->i_mutex);
960 do {
961 if (!relay_file_read_avail(buf, *ppos))
962 break;
963
964 read_start = relay_file_read_start_pos(*ppos, buf);
965 avail = relay_file_read_subbuf_avail(read_start, buf);
966 if (!avail)
967 break;
968
969 avail = min(desc.count, avail);
970 ret = subbuf_actor(read_start, buf, avail, &desc, actor);
971 if (desc.error < 0)
972 break;
973
974 if (ret) {
975 relay_file_read_consume(buf, read_start, ret);
976 *ppos = relay_file_read_end_pos(buf, read_start, ret);
977 }
978 } while (desc.count && ret);
979 mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
980
981 return desc.written;
982}
983
984static ssize_t relay_file_read(struct file *filp,
985 char __user *buffer,
986 size_t count,
987 loff_t *ppos)
988{
989 return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
990 NULL, buffer);
991}
992
993static ssize_t relay_file_sendfile(struct file *filp,
994 loff_t *ppos,
995 size_t count,
996 read_actor_t actor,
997 void *target)
998{
999 return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
1000 actor, target);
1001}
1002
1003struct file_operations relay_file_operations = {
1004 .open = relay_file_open,
1005 .poll = relay_file_poll,
1006 .mmap = relay_file_mmap,
1007 .read = relay_file_read,
1008 .llseek = no_llseek,
1009 .release = relay_file_release,
1010 .sendfile = relay_file_sendfile,
1011};
1012EXPORT_SYMBOL_GPL(relay_file_operations);
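For orientation, here is a minimal, hypothetical client of the relay API added above. The example_* names, the 16 KB x 8 sub-buffer sizing, and the use of debugfs for the buffer files are illustrative assumptions; relay_write(), used by producers, is declared in include/linux/relay.h, which this diff does not touch.

#include <linux/module.h>
#include <linux/relay.h>
#include <linux/debugfs.h>

static struct rchan *example_chan;

/* Expose each per-cpu buffer through debugfs, backed by relay_file_operations. */
static struct dentry *example_create_buf_file(const char *filename,
                                              struct dentry *parent,
                                              int mode,
                                              struct rchan_buf *buf,
                                              int *is_global)
{
        return debugfs_create_file(filename, mode, parent, buf,
                                   &relay_file_operations);
}

static int example_remove_buf_file(struct dentry *dentry)
{
        debugfs_remove(dentry);
        return 0;
}

static struct rchan_callbacks example_callbacks = {
        .create_buf_file        = example_create_buf_file,
        .remove_buf_file        = example_remove_buf_file,
        /* unset callbacks are filled in by setup_callbacks() above */
};

static int __init example_init(void)
{
        /* 8 sub-buffers of 16 KB each; files appear as example0..exampleN-1 */
        example_chan = relay_open("example", NULL, 16384, 8, &example_callbacks);
        return example_chan ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        relay_close(example_chan);      /* releases every per-cpu buffer and the channel */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");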
diff --git a/kernel/sched.c b/kernel/sched.c
index 4d46e90f59..a9ecac398b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,6 +49,7 @@
49#include <linux/syscalls.h> 49#include <linux/syscalls.h>
50#include <linux/times.h> 50#include <linux/times.h>
51#include <linux/acct.h> 51#include <linux/acct.h>
52#include <linux/kprobes.h>
52#include <asm/tlb.h> 53#include <asm/tlb.h>
53 54
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -144,7 +145,8 @@
144 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
145 146
146#define DELTA(p) \ 147#define DELTA(p) \
147 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
149 INTERACTIVE_DELTA)
148 150
149#define TASK_INTERACTIVE(p) \ 151#define TASK_INTERACTIVE(p) \
150 ((p)->prio <= (p)->static_prio - DELTA(p)) 152 ((p)->prio <= (p)->static_prio - DELTA(p))
@@ -237,6 +239,7 @@ struct runqueue {
237 239
238 task_t *migration_thread; 240 task_t *migration_thread;
239 struct list_head migration_queue; 241 struct list_head migration_queue;
242 int cpu;
240#endif 243#endif
241 244
242#ifdef CONFIG_SCHEDSTATS 245#ifdef CONFIG_SCHEDSTATS
@@ -707,12 +710,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
707 DEF_TIMESLICE); 710 DEF_TIMESLICE);
708 } else { 711 } else {
709 /* 712 /*
710 * The lower the sleep avg a task has the more
711 * rapidly it will rise with sleep time.
712 */
713 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
714
715 /*
716 * Tasks waking from uninterruptible sleep are 713 * Tasks waking from uninterruptible sleep are
717 * limited in their sleep_avg rise as they 714 * limited in their sleep_avg rise as they
718 * are likely to be waiting on I/O 715 * are likely to be waiting on I/O
@@ -1551,8 +1548,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1551 finish_lock_switch(rq, prev); 1548 finish_lock_switch(rq, prev);
1552 if (mm) 1549 if (mm)
1553 mmdrop(mm); 1550 mmdrop(mm);
1554 if (unlikely(prev_task_flags & PF_DEAD)) 1551 if (unlikely(prev_task_flags & PF_DEAD)) {
1552 /*
1553 * Remove function-return probe instances associated with this
1554 * task and put them back on the free list.
1555 */
1556 kprobe_flush_task(prev);
1555 put_task_struct(prev); 1557 put_task_struct(prev);
1558 }
1556} 1559}
1557 1560
1558/** 1561/**
@@ -1622,7 +1625,7 @@ unsigned long nr_uninterruptible(void)
1622{ 1625{
1623 unsigned long i, sum = 0; 1626 unsigned long i, sum = 0;
1624 1627
1625 for_each_cpu(i) 1628 for_each_possible_cpu(i)
1626 sum += cpu_rq(i)->nr_uninterruptible; 1629 sum += cpu_rq(i)->nr_uninterruptible;
1627 1630
1628 /* 1631 /*
@@ -1639,7 +1642,7 @@ unsigned long long nr_context_switches(void)
1639{ 1642{
1640 unsigned long long i, sum = 0; 1643 unsigned long long i, sum = 0;
1641 1644
1642 for_each_cpu(i) 1645 for_each_possible_cpu(i)
1643 sum += cpu_rq(i)->nr_switches; 1646 sum += cpu_rq(i)->nr_switches;
1644 1647
1645 return sum; 1648 return sum;
@@ -1649,7 +1652,7 @@ unsigned long nr_iowait(void)
1649{ 1652{
1650 unsigned long i, sum = 0; 1653 unsigned long i, sum = 0;
1651 1654
1652 for_each_cpu(i) 1655 for_each_possible_cpu(i)
1653 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1656 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1654 1657
1655 return sum; 1658 return sum;
@@ -1660,6 +1663,9 @@ unsigned long nr_iowait(void)
1660/* 1663/*
1661 * double_rq_lock - safely lock two runqueues 1664 * double_rq_lock - safely lock two runqueues
1662 * 1665 *
1666 * We must take them in cpu order to match code in
1667 * dependent_sleeper and wake_dependent_sleeper.
1668 *
1663 * Note this does not disable interrupts like task_rq_lock, 1669 * Note this does not disable interrupts like task_rq_lock,
1664 * you need to do so manually before calling. 1670 * you need to do so manually before calling.
1665 */ 1671 */
@@ -1671,7 +1677,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1671 spin_lock(&rq1->lock); 1677 spin_lock(&rq1->lock);
1672 __acquire(rq2->lock); /* Fake it out ;) */ 1678 __acquire(rq2->lock); /* Fake it out ;) */
1673 } else { 1679 } else {
1674 if (rq1 < rq2) { 1680 if (rq1->cpu < rq2->cpu) {
1675 spin_lock(&rq1->lock); 1681 spin_lock(&rq1->lock);
1676 spin_lock(&rq2->lock); 1682 spin_lock(&rq2->lock);
1677 } else { 1683 } else {
@@ -1707,7 +1713,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1707 __acquires(this_rq->lock) 1713 __acquires(this_rq->lock)
1708{ 1714{
1709 if (unlikely(!spin_trylock(&busiest->lock))) { 1715 if (unlikely(!spin_trylock(&busiest->lock))) {
1710 if (busiest < this_rq) { 1716 if (busiest->cpu < this_rq->cpu) {
1711 spin_unlock(&this_rq->lock); 1717 spin_unlock(&this_rq->lock);
1712 spin_lock(&busiest->lock); 1718 spin_lock(&busiest->lock);
1713 spin_lock(&this_rq->lock); 1719 spin_lock(&this_rq->lock);
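These two hunks stop ordering nested runqueue locks by pointer value and instead order them by the new rq->cpu field, so every path that takes two runqueue locks (including dependent_sleeper()/wake_dependent_sleeper()) agrees on the order and cannot deadlock. A generic restatement of the idiom, as a hypothetical helper inside sched.c (not part of the patch):

/* Illustrative only: always lock the lower-numbered CPU's runqueue first. */
static void example_lock_rq_pair(runqueue_t *a, runqueue_t *b)
{
        if (a == b) {
                spin_lock(&a->lock);
        } else if (a->cpu < b->cpu) {
                spin_lock(&a->lock);
                spin_lock(&b->lock);
        } else {
                spin_lock(&b->lock);
                spin_lock(&a->lock);
        }
}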
@@ -2873,13 +2879,11 @@ asmlinkage void __sched schedule(void)
2873 * schedule() atomically, we ignore that path for now. 2879 * schedule() atomically, we ignore that path for now.
2874 * Otherwise, whine if we are scheduling when we should not be. 2880 * Otherwise, whine if we are scheduling when we should not be.
2875 */ 2881 */
2876 if (likely(!current->exit_state)) { 2882 if (unlikely(in_atomic() && !current->exit_state)) {
2877 if (unlikely(in_atomic())) { 2883 printk(KERN_ERR "BUG: scheduling while atomic: "
2878 printk(KERN_ERR "scheduling while atomic: " 2884 "%s/0x%08x/%d\n",
2879 "%s/0x%08x/%d\n", 2885 current->comm, preempt_count(), current->pid);
2880 current->comm, preempt_count(), current->pid); 2886 dump_stack();
2881 dump_stack();
2882 }
2883 } 2887 }
2884 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2888 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2885 2889
@@ -5570,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
5570} 5574}
5571#endif 5575#endif
5572 5576
5577#ifdef CONFIG_SCHED_MC
5578static DEFINE_PER_CPU(struct sched_domain, core_domains);
5579static struct sched_group sched_group_core[NR_CPUS];
5580#endif
5581
5582#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5583static int cpu_to_core_group(int cpu)
5584{
5585 return first_cpu(cpu_sibling_map[cpu]);
5586}
5587#elif defined(CONFIG_SCHED_MC)
5588static int cpu_to_core_group(int cpu)
5589{
5590 return cpu;
5591}
5592#endif
5593
5573static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5594static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5574static struct sched_group sched_group_phys[NR_CPUS]; 5595static struct sched_group sched_group_phys[NR_CPUS];
5575static int cpu_to_phys_group(int cpu) 5596static int cpu_to_phys_group(int cpu)
5576{ 5597{
5577#ifdef CONFIG_SCHED_SMT 5598#if defined(CONFIG_SCHED_MC)
5599 cpumask_t mask = cpu_coregroup_map(cpu);
5600 return first_cpu(mask);
5601#elif defined(CONFIG_SCHED_SMT)
5578 return first_cpu(cpu_sibling_map[cpu]); 5602 return first_cpu(cpu_sibling_map[cpu]);
5579#else 5603#else
5580 return cpu; 5604 return cpu;
@@ -5597,6 +5621,32 @@ static int cpu_to_allnodes_group(int cpu)
5597{ 5621{
5598 return cpu_to_node(cpu); 5622 return cpu_to_node(cpu);
5599} 5623}
5624static void init_numa_sched_groups_power(struct sched_group *group_head)
5625{
5626 struct sched_group *sg = group_head;
5627 int j;
5628
5629 if (!sg)
5630 return;
5631next_sg:
5632 for_each_cpu_mask(j, sg->cpumask) {
5633 struct sched_domain *sd;
5634
5635 sd = &per_cpu(phys_domains, j);
5636 if (j != first_cpu(sd->groups->cpumask)) {
5637 /*
5638 * Only add "power" once for each
5639 * physical package.
5640 */
5641 continue;
5642 }
5643
5644 sg->cpu_power += sd->groups->cpu_power;
5645 }
5646 sg = sg->next;
5647 if (sg != group_head)
5648 goto next_sg;
5649}
5600#endif 5650#endif
5601 5651
5602/* 5652/*
@@ -5672,6 +5722,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
5672 sd->parent = p; 5722 sd->parent = p;
5673 sd->groups = &sched_group_phys[group]; 5723 sd->groups = &sched_group_phys[group];
5674 5724
5725#ifdef CONFIG_SCHED_MC
5726 p = sd;
5727 sd = &per_cpu(core_domains, i);
5728 group = cpu_to_core_group(i);
5729 *sd = SD_MC_INIT;
5730 sd->span = cpu_coregroup_map(i);
5731 cpus_and(sd->span, sd->span, *cpu_map);
5732 sd->parent = p;
5733 sd->groups = &sched_group_core[group];
5734#endif
5735
5675#ifdef CONFIG_SCHED_SMT 5736#ifdef CONFIG_SCHED_SMT
5676 p = sd; 5737 p = sd;
5677 sd = &per_cpu(cpu_domains, i); 5738 sd = &per_cpu(cpu_domains, i);
@@ -5697,6 +5758,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
5697 } 5758 }
5698#endif 5759#endif
5699 5760
5761#ifdef CONFIG_SCHED_MC
5762 /* Set up multi-core groups */
5763 for_each_cpu_mask(i, *cpu_map) {
5764 cpumask_t this_core_map = cpu_coregroup_map(i);
5765 cpus_and(this_core_map, this_core_map, *cpu_map);
5766 if (i != first_cpu(this_core_map))
5767 continue;
5768 init_sched_build_groups(sched_group_core, this_core_map,
5769 &cpu_to_core_group);
5770 }
5771#endif
5772
5773
5700 /* Set up physical groups */ 5774 /* Set up physical groups */
5701 for (i = 0; i < MAX_NUMNODES; i++) { 5775 for (i = 0; i < MAX_NUMNODES; i++) {
5702 cpumask_t nodemask = node_to_cpumask(i); 5776 cpumask_t nodemask = node_to_cpumask(i);
@@ -5793,51 +5867,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
5793 power = SCHED_LOAD_SCALE; 5867 power = SCHED_LOAD_SCALE;
5794 sd->groups->cpu_power = power; 5868 sd->groups->cpu_power = power;
5795#endif 5869#endif
5870#ifdef CONFIG_SCHED_MC
5871 sd = &per_cpu(core_domains, i);
5872 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5873 * SCHED_LOAD_SCALE / 10;
5874 sd->groups->cpu_power = power;
5875
5876 sd = &per_cpu(phys_domains, i);
5796 5877
5878 /*
5879 * This has to be < 2 * SCHED_LOAD_SCALE
5880 * Let's keep it SCHED_LOAD_SCALE, so that
5881 * while calculating NUMA group's cpu_power
5882 * we can simply do
5883 * numa_group->cpu_power += phys_group->cpu_power;
5884 *
5885 * See "only add power once for each physical pkg"
5886 * comment below
5887 */
5888 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5889#else
5797 sd = &per_cpu(phys_domains, i); 5890 sd = &per_cpu(phys_domains, i);
5798 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5891 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5799 (cpus_weight(sd->groups->cpumask)-1) / 10; 5892 (cpus_weight(sd->groups->cpumask)-1) / 10;
5800 sd->groups->cpu_power = power; 5893 sd->groups->cpu_power = power;
5801
5802#ifdef CONFIG_NUMA
5803 sd = &per_cpu(allnodes_domains, i);
5804 if (sd->groups) {
5805 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5806 (cpus_weight(sd->groups->cpumask)-1) / 10;
5807 sd->groups->cpu_power = power;
5808 }
5809#endif 5894#endif
5810 } 5895 }
5811 5896
5812#ifdef CONFIG_NUMA 5897#ifdef CONFIG_NUMA
5813 for (i = 0; i < MAX_NUMNODES; i++) { 5898 for (i = 0; i < MAX_NUMNODES; i++)
5814 struct sched_group *sg = sched_group_nodes[i]; 5899 init_numa_sched_groups_power(sched_group_nodes[i]);
5815 int j;
5816
5817 if (sg == NULL)
5818 continue;
5819next_sg:
5820 for_each_cpu_mask(j, sg->cpumask) {
5821 struct sched_domain *sd;
5822 int power;
5823 5900
5824 sd = &per_cpu(phys_domains, j); 5901 init_numa_sched_groups_power(sched_group_allnodes);
5825 if (j != first_cpu(sd->groups->cpumask)) {
5826 /*
5827 * Only add "power" once for each
5828 * physical package.
5829 */
5830 continue;
5831 }
5832 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5833 (cpus_weight(sd->groups->cpumask)-1) / 10;
5834
5835 sg->cpu_power += power;
5836 }
5837 sg = sg->next;
5838 if (sg != sched_group_nodes[i])
5839 goto next_sg;
5840 }
5841#endif 5902#endif
5842 5903
5843 /* Attach the domains */ 5904 /* Attach the domains */
@@ -5845,6 +5906,8 @@ next_sg:
5845 struct sched_domain *sd; 5906 struct sched_domain *sd;
5846#ifdef CONFIG_SCHED_SMT 5907#ifdef CONFIG_SCHED_SMT
5847 sd = &per_cpu(cpu_domains, i); 5908 sd = &per_cpu(cpu_domains, i);
5909#elif defined(CONFIG_SCHED_MC)
5910 sd = &per_cpu(core_domains, i);
5848#else 5911#else
5849 sd = &per_cpu(phys_domains, i); 5912 sd = &per_cpu(phys_domains, i);
5850#endif 5913#endif
@@ -6017,7 +6080,7 @@ void __init sched_init(void)
6017 runqueue_t *rq; 6080 runqueue_t *rq;
6018 int i, j, k; 6081 int i, j, k;
6019 6082
6020 for_each_cpu(i) { 6083 for_each_possible_cpu(i) {
6021 prio_array_t *array; 6084 prio_array_t *array;
6022 6085
6023 rq = cpu_rq(i); 6086 rq = cpu_rq(i);
@@ -6035,6 +6098,7 @@ void __init sched_init(void)
6035 rq->push_cpu = 0; 6098 rq->push_cpu = 0;
6036 rq->migration_thread = NULL; 6099 rq->migration_thread = NULL;
6037 INIT_LIST_HEAD(&rq->migration_queue); 6100 INIT_LIST_HEAD(&rq->migration_queue);
6101 rq->cpu = i;
6038#endif 6102#endif
6039 atomic_set(&rq->nr_iowait, 0); 6103 atomic_set(&rq->nr_iowait, 0);
6040 6104
@@ -6075,7 +6139,7 @@ void __might_sleep(char *file, int line)
6075 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6139 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6076 return; 6140 return;
6077 prev_jiffy = jiffies; 6141 prev_jiffy = jiffies;
6078 printk(KERN_ERR "Debug: sleeping function called from invalid" 6142 printk(KERN_ERR "BUG: sleeping function called from invalid"
6079 " context at %s:%d\n", file, line); 6143 " context at %s:%d\n", file, line);
6080 printk("in_atomic():%d, irqs_disabled():%d\n", 6144 printk("in_atomic():%d, irqs_disabled():%d\n",
6081 in_atomic(), irqs_disabled()); 6145 in_atomic(), irqs_disabled());
diff --git a/kernel/signal.c b/kernel/signal.c
index ea154104a0..4922928d91 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/posix-timers.h>
26#include <linux/signal.h> 25#include <linux/signal.h>
27#include <linux/audit.h> 26#include <linux/audit.h>
28#include <linux/capability.h> 27#include <linux/capability.h>
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep;
147#define sig_kernel_stop(sig) \ 146#define sig_kernel_stop(sig) \
148 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) 147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
149 148
149#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
150
150#define sig_user_defined(t, signr) \ 151#define sig_user_defined(t, signr) \
151 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ 152 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
152 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) 153 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q)
292 kmem_cache_free(sigqueue_cachep, q); 293 kmem_cache_free(sigqueue_cachep, q);
293} 294}
294 295
295static void flush_sigqueue(struct sigpending *queue) 296void flush_sigqueue(struct sigpending *queue)
296{ 297{
297 struct sigqueue *q; 298 struct sigqueue *q;
298 299
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue)
307/* 308/*
308 * Flush all pending signals for a task. 309 * Flush all pending signals for a task.
309 */ 310 */
310 311void flush_signals(struct task_struct *t)
311void
312flush_signals(struct task_struct *t)
313{ 312{
314 unsigned long flags; 313 unsigned long flags;
315 314
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t)
321} 320}
322 321
323/* 322/*
324 * This function expects the tasklist_lock write-locked.
325 */
326void __exit_sighand(struct task_struct *tsk)
327{
328 struct sighand_struct * sighand = tsk->sighand;
329
330 /* Ok, we're done with the signal handlers */
331 tsk->sighand = NULL;
332 if (atomic_dec_and_test(&sighand->count))
333 sighand_free(sighand);
334}
335
336void exit_sighand(struct task_struct *tsk)
337{
338 write_lock_irq(&tasklist_lock);
339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
347 write_unlock_irq(&tasklist_lock);
348}
349
350/*
351 * This function expects the tasklist_lock write-locked.
352 */
353void __exit_signal(struct task_struct *tsk)
354{
355 struct signal_struct * sig = tsk->signal;
356 struct sighand_struct * sighand;
357
358 if (!sig)
359 BUG();
360 if (!atomic_read(&sig->count))
361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
364 spin_lock(&sighand->siglock);
365 posix_cpu_timers_exit(tsk);
366 if (atomic_dec_and_test(&sig->count)) {
367 posix_cpu_timers_exit_group(tsk);
368 tsk->signal = NULL;
369 __exit_sighand(tsk);
370 spin_unlock(&sighand->siglock);
371 flush_sigqueue(&sig->shared_pending);
372 } else {
373 /*
374 * If there is any task waiting for the group exit
375 * then notify it:
376 */
377 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
378 wake_up_process(sig->group_exit_task);
379 sig->group_exit_task = NULL;
380 }
381 if (tsk == sig->curr_target)
382 sig->curr_target = next_thread(tsk);
383 tsk->signal = NULL;
384 /*
385 * Accumulate here the counters for all threads but the
386 * group leader as they die, so they can be added into
387 * the process-wide totals when those are taken.
388 * The group leader stays around as a zombie as long
389 * as there are other threads. When it gets reaped,
390 * the exit.c code will add its counts into these totals.
391 * We won't ever get here for the group leader, since it
392 * will have been the last reference on the signal_struct.
393 */
394 sig->utime = cputime_add(sig->utime, tsk->utime);
395 sig->stime = cputime_add(sig->stime, tsk->stime);
396 sig->min_flt += tsk->min_flt;
397 sig->maj_flt += tsk->maj_flt;
398 sig->nvcsw += tsk->nvcsw;
399 sig->nivcsw += tsk->nivcsw;
400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
402 spin_unlock(&sighand->siglock);
403 sig = NULL; /* Marker for below. */
404 }
405 rcu_read_unlock();
406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
407 flush_sigqueue(&tsk->pending);
408 if (sig) {
409 /*
410 * We are cleaning up the signal_struct here.
411 */
412 exit_thread_group_keys(sig);
413 kmem_cache_free(signal_cachep, sig);
414 }
415}
416
417void exit_signal(struct task_struct *tsk)
418{
419 atomic_dec(&tsk->signal->live);
420
421 write_lock_irq(&tasklist_lock);
422 __exit_signal(tsk);
423 write_unlock_irq(&tasklist_lock);
424}
425
426/*
427 * Flush all handlers for a task. 323 * Flush all handlers for a task.
428 */ 324 */
429 325
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
695} 591}
696 592
697/* forward decl */ 593/* forward decl */
698static void do_notify_parent_cldstop(struct task_struct *tsk, 594static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
699 int to_self,
700 int why);
701 595
702/* 596/*
703 * Handle magic process-wide effects of stop/continue signals. 597 * Handle magic process-wide effects of stop/continue signals.
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
747 p->signal->group_stop_count = 0; 641 p->signal->group_stop_count = 0;
748 p->signal->flags = SIGNAL_STOP_CONTINUED; 642 p->signal->flags = SIGNAL_STOP_CONTINUED;
749 spin_unlock(&p->sighand->siglock); 643 spin_unlock(&p->sighand->siglock);
750 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); 644 do_notify_parent_cldstop(p, CLD_STOPPED);
751 spin_lock(&p->sighand->siglock); 645 spin_lock(&p->sighand->siglock);
752 } 646 }
753 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 647 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
788 p->signal->flags = SIGNAL_STOP_CONTINUED; 682 p->signal->flags = SIGNAL_STOP_CONTINUED;
789 p->signal->group_exit_code = 0; 683 p->signal->group_exit_code = 0;
790 spin_unlock(&p->sighand->siglock); 684 spin_unlock(&p->sighand->siglock);
791 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); 685 do_notify_parent_cldstop(p, CLD_CONTINUED);
792 spin_lock(&p->sighand->siglock); 686 spin_lock(&p->sighand->siglock);
793 } else { 687 } else {
794 /* 688 /*
@@ -1120,27 +1014,37 @@ void zap_other_threads(struct task_struct *p)
1120/* 1014/*
1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 1015 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1122 */ 1016 */
1017struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1018{
1019 struct sighand_struct *sighand;
1020
1021 for (;;) {
1022 sighand = rcu_dereference(tsk->sighand);
1023 if (unlikely(sighand == NULL))
1024 break;
1025
1026 spin_lock_irqsave(&sighand->siglock, *flags);
1027 if (likely(sighand == tsk->sighand))
1028 break;
1029 spin_unlock_irqrestore(&sighand->siglock, *flags);
1030 }
1031
1032 return sighand;
1033}
1034
1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1035int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1124{ 1036{
1125 unsigned long flags; 1037 unsigned long flags;
1126 struct sighand_struct *sp;
1127 int ret; 1038 int ret;
1128 1039
1129retry:
1130 ret = check_kill_permission(sig, info, p); 1040 ret = check_kill_permission(sig, info, p);
1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) { 1041
1132 spin_lock_irqsave(&sp->siglock, flags); 1042 if (!ret && sig) {
1133 if (p->sighand != sp) { 1043 ret = -ESRCH;
1134 spin_unlock_irqrestore(&sp->siglock, flags); 1044 if (lock_task_sighand(p, &flags)) {
1135 goto retry; 1045 ret = __group_send_sig_info(sig, info, p);
1136 } 1046 unlock_task_sighand(p, &flags);
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 } 1047 }
1142 ret = __group_send_sig_info(sig, info, p);
1143 spin_unlock_irqrestore(&sp->siglock, flags);
1144 } 1048 }
1145 1049
1146 return ret; 1050 return ret;
@@ -1189,7 +1093,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1189 struct task_struct *p; 1093 struct task_struct *p;
1190 1094
1191 rcu_read_lock(); 1095 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { 1096 if (unlikely(sig_needs_tasklist(sig))) {
1193 read_lock(&tasklist_lock); 1097 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1; 1098 acquired_tasklist_lock = 1;
1195 } 1099 }
@@ -1405,12 +1309,10 @@ void sigqueue_free(struct sigqueue *q)
1405 __sigqueue_free(q); 1309 __sigqueue_free(q);
1406} 1310}
1407 1311
1408int 1312int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1409send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1410{ 1313{
1411 unsigned long flags; 1314 unsigned long flags;
1412 int ret = 0; 1315 int ret = 0;
1413 struct sighand_struct *sh;
1414 1316
1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1317 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1416 1318
@@ -1424,48 +1326,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1424 */ 1326 */
1425 rcu_read_lock(); 1327 rcu_read_lock();
1426 1328
1427 if (unlikely(p->flags & PF_EXITING)) { 1329 if (!likely(lock_task_sighand(p, &flags))) {
1428 ret = -1; 1330 ret = -1;
1429 goto out_err; 1331 goto out_err;
1430 } 1332 }
1431 1333
1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1461
1462 if (unlikely(!list_empty(&q->list))) { 1334 if (unlikely(!list_empty(&q->list))) {
1463 /* 1335 /*
1464 * If an SI_TIMER entry is already queued, just increment 1336
1465 * the overrun count. 1337 * the overrun count.
1466 */ 1338 */
1467 if (q->info.si_code != SI_TIMER) 1339 BUG_ON(q->info.si_code != SI_TIMER);
1468 BUG();
1469 q->info.si_overrun++; 1340 q->info.si_overrun++;
1470 goto out; 1341 goto out;
1471 } 1342 }
@@ -1481,7 +1352,7 @@ retry:
1481 signal_wake_up(p, sig == SIGKILL); 1352 signal_wake_up(p, sig == SIGKILL);
1482 1353
1483out: 1354out:
1484 spin_unlock_irqrestore(&sh->siglock, flags); 1355 unlock_task_sighand(p, &flags);
1485out_err: 1356out_err:
1486 rcu_read_unlock(); 1357 rcu_read_unlock();
1487 1358
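
With the simplification above, send_sigqueue() still keeps at most one queued entry per preallocated SI_TIMER sigqueue and folds further expirations into si_overrun. The observable effect is the POSIX timer overrun count; below is a small user-space demo (hypothetical, error checking omitted, may need -lrt on older glibc) that lets expirations pile up on a blocked timer signal and then reads the count back.

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGRTMIN };
	struct itimerspec its = {
		.it_value    = { .tv_sec = 0, .tv_nsec = 1000000 },	/* 1 ms */
		.it_interval = { .tv_sec = 0, .tv_nsec = 1000000 },
	};
	sigset_t set;
	siginfo_t info;

	sigemptyset(&set);
	sigaddset(&set, SIGRTMIN);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* keep the timer signal pending */

	timer_create(CLOCK_REALTIME, &sev, &tid);
	timer_settime(tid, 0, &its, NULL);

	sleep(1);				/* many expirations accumulate */
	sigwaitinfo(&set, &info);		/* dequeue the single queued entry */
	printf("overruns: %d\n", timer_getoverrun(tid));
	return 0;
}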
@@ -1613,14 +1484,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1613 spin_unlock_irqrestore(&psig->siglock, flags); 1484 spin_unlock_irqrestore(&psig->siglock, flags);
1614} 1485}
1615 1486
1616static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) 1487static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1617{ 1488{
1618 struct siginfo info; 1489 struct siginfo info;
1619 unsigned long flags; 1490 unsigned long flags;
1620 struct task_struct *parent; 1491 struct task_struct *parent;
1621 struct sighand_struct *sighand; 1492 struct sighand_struct *sighand;
1622 1493
1623 if (to_self) 1494 if (tsk->ptrace & PT_PTRACED)
1624 parent = tsk->parent; 1495 parent = tsk->parent;
1625 else { 1496 else {
1626 tsk = tsk->group_leader; 1497 tsk = tsk->group_leader;
@@ -1695,7 +1566,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1695 !(current->ptrace & PT_ATTACHED)) && 1566 !(current->ptrace & PT_ATTACHED)) &&
1696 (likely(current->parent->signal != current->signal) || 1567 (likely(current->parent->signal != current->signal) ||
1697 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1568 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1698 do_notify_parent_cldstop(current, 1, CLD_TRAPPED); 1569 do_notify_parent_cldstop(current, CLD_TRAPPED);
1699 read_unlock(&tasklist_lock); 1570 read_unlock(&tasklist_lock);
1700 schedule(); 1571 schedule();
1701 } else { 1572 } else {
@@ -1744,25 +1615,17 @@ void ptrace_notify(int exit_code)
1744static void 1615static void
1745finish_stop(int stop_count) 1616finish_stop(int stop_count)
1746{ 1617{
1747 int to_self;
1748
1749 /* 1618 /*
1750 * If there are no other threads in the group, or if there is 1619 * If there are no other threads in the group, or if there is
1751 * a group stop in progress and we are the last to stop, 1620 * a group stop in progress and we are the last to stop,
1752 * report to the parent. When ptraced, every thread reports itself. 1621 * report to the parent. When ptraced, every thread reports itself.
1753 */ 1622 */
1754 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) 1623 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
1755 to_self = 1; 1624 read_lock(&tasklist_lock);
1756 else if (stop_count == 0) 1625 do_notify_parent_cldstop(current, CLD_STOPPED);
1757 to_self = 0; 1626 read_unlock(&tasklist_lock);
1758 else 1627 }
1759 goto out;
1760
1761 read_lock(&tasklist_lock);
1762 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1763 read_unlock(&tasklist_lock);
1764 1628
1765out:
1766 schedule(); 1629 schedule();
1767 /* 1630 /*
1768 * Now we don't run again until continued. 1631 * Now we don't run again until continued.
@@ -1776,12 +1639,10 @@ out:
1776 * Returns nonzero if we've actually stopped and released the siglock. 1639 * Returns nonzero if we've actually stopped and released the siglock.
1777 * Returns zero if we didn't stop and still hold the siglock. 1640 * Returns zero if we didn't stop and still hold the siglock.
1778 */ 1641 */
1779static int 1642static int do_signal_stop(int signr)
1780do_signal_stop(int signr)
1781{ 1643{
1782 struct signal_struct *sig = current->signal; 1644 struct signal_struct *sig = current->signal;
1783 struct sighand_struct *sighand = current->sighand; 1645 int stop_count;
1784 int stop_count = -1;
1785 1646
1786 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) 1647 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1787 return 0; 1648 return 0;
@@ -1791,86 +1652,37 @@ do_signal_stop(int signr)
1791 * There is a group stop in progress. We don't need to 1652 * There is a group stop in progress. We don't need to
1792 * start another one. 1653 * start another one.
1793 */ 1654 */
1794 signr = sig->group_exit_code;
1795 stop_count = --sig->group_stop_count; 1655 stop_count = --sig->group_stop_count;
1796 current->exit_code = signr; 1656 } else {
1797 set_current_state(TASK_STOPPED);
1798 if (stop_count == 0)
1799 sig->flags = SIGNAL_STOP_STOPPED;
1800 spin_unlock_irq(&sighand->siglock);
1801 }
1802 else if (thread_group_empty(current)) {
1803 /*
1804 * Lock must be held through transition to stopped state.
1805 */
1806 current->exit_code = current->signal->group_exit_code = signr;
1807 set_current_state(TASK_STOPPED);
1808 sig->flags = SIGNAL_STOP_STOPPED;
1809 spin_unlock_irq(&sighand->siglock);
1810 }
1811 else {
1812 /* 1657 /*
1813 * There is no group stop already in progress. 1658 * There is no group stop already in progress.
1814 * We must initiate one now, but that requires 1659 * We must initiate one now.
1815 * dropping siglock to get both the tasklist lock
1816 * and siglock again in the proper order. Note that
1817 * this allows an intervening SIGCONT to be posted.
1818 * We need to check for that and bail out if necessary.
1819 */ 1660 */
1820 struct task_struct *t; 1661 struct task_struct *t;
1821 1662
1822 spin_unlock_irq(&sighand->siglock); 1663 sig->group_exit_code = signr;
1823
1824 /* signals can be posted during this window */
1825 1664
1826 read_lock(&tasklist_lock); 1665 stop_count = 0;
1827 spin_lock_irq(&sighand->siglock); 1666 for (t = next_thread(current); t != current; t = next_thread(t))
1828
1829 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
1830 /* 1667 /*
1831 * Another stop or continue happened while we 1668 * Setting state to TASK_STOPPED for a group
1832 * didn't have the lock. We can just swallow this 1669 * stop is always done with the siglock held,
1833 * signal now. If we raced with a SIGCONT, that 1670 * so this check has no races.
1834 * should have just cleared it now. If we raced
1835 * with another processor delivering a stop signal,
1836 * then the SIGCONT that wakes us up should clear it.
1837 */ 1671 */
1838 read_unlock(&tasklist_lock); 1672 if (!t->exit_state &&
1839 return 0; 1673 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1840 } 1674 stop_count++;
1841 1675 signal_wake_up(t, 0);
1842 if (sig->group_stop_count == 0) { 1676 }
1843 sig->group_exit_code = signr; 1677 sig->group_stop_count = stop_count;
1844 stop_count = 0;
1845 for (t = next_thread(current); t != current;
1846 t = next_thread(t))
1847 /*
1848 * Setting state to TASK_STOPPED for a group
1849 * stop is always done with the siglock held,
1850 * so this check has no races.
1851 */
1852 if (!t->exit_state &&
1853 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1854 stop_count++;
1855 signal_wake_up(t, 0);
1856 }
1857 sig->group_stop_count = stop_count;
1858 }
1859 else {
1860 /* A race with another thread while unlocked. */
1861 signr = sig->group_exit_code;
1862 stop_count = --sig->group_stop_count;
1863 }
1864
1865 current->exit_code = signr;
1866 set_current_state(TASK_STOPPED);
1867 if (stop_count == 0)
1868 sig->flags = SIGNAL_STOP_STOPPED;
1869
1870 spin_unlock_irq(&sighand->siglock);
1871 read_unlock(&tasklist_lock);
1872 } 1678 }
1873 1679
1680 if (stop_count == 0)
1681 sig->flags = SIGNAL_STOP_STOPPED;
1682 current->exit_code = sig->group_exit_code;
1683 __set_current_state(TASK_STOPPED);
1684
1685 spin_unlock_irq(&current->sighand->siglock);
1874 finish_stop(stop_count); 1686 finish_stop(stop_count);
1875 return 1; 1687 return 1;
1876} 1688}
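
do_signal_stop() now initiates or joins the group stop entirely under the siglock, and finish_stop() reports CLD_STOPPED to the parent once stop_count reaches zero (or unconditionally when ptraced). From user space the contract is unchanged: a stopped child is visible to the parent through waitpid() with WUNTRACED, as in this small hypothetical demo.

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if (pid == 0) {				/* child stops itself */
		raise(SIGSTOP);
		_exit(0);
	}

	waitpid(pid, &status, WUNTRACED);	/* parent sees the CLD_STOPPED report */
	if (WIFSTOPPED(status))
		printf("child stopped by signal %d\n", WSTOPSIG(status));

	kill(pid, SIGCONT);			/* resume and reap */
	waitpid(pid, &status, 0);
	return 0;
}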
@@ -1922,6 +1734,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1922 sigset_t *mask = &current->blocked; 1734 sigset_t *mask = &current->blocked;
1923 int signr = 0; 1735 int signr = 0;
1924 1736
1737 try_to_freeze();
1738
1925relock: 1739relock:
1926 spin_lock_irq(&current->sighand->siglock); 1740 spin_lock_irq(&current->sighand->siglock);
1927 for (;;) { 1741 for (;;) {
@@ -1988,7 +1802,7 @@ relock:
1988 continue; 1802 continue;
1989 1803
1990 /* Init gets no signals it doesn't want. */ 1804 /* Init gets no signals it doesn't want. */
1991 if (current->pid == 1) 1805 if (current == child_reaper)
1992 continue; 1806 continue;
1993 1807
1994 if (sig_kernel_stop(signr)) { 1808 if (sig_kernel_stop(signr)) {
@@ -2099,10 +1913,11 @@ long do_no_restart_syscall(struct restart_block *param)
2099int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 1913int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2100{ 1914{
2101 int error; 1915 int error;
2102 sigset_t old_block;
2103 1916
2104 spin_lock_irq(&current->sighand->siglock); 1917 spin_lock_irq(&current->sighand->siglock);
2105 old_block = current->blocked; 1918 if (oldset)
1919 *oldset = current->blocked;
1920
2106 error = 0; 1921 error = 0;
2107 switch (how) { 1922 switch (how) {
2108 case SIG_BLOCK: 1923 case SIG_BLOCK:
@@ -2119,8 +1934,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2119 } 1934 }
2120 recalc_sigpending(); 1935 recalc_sigpending();
2121 spin_unlock_irq(&current->sighand->siglock); 1936 spin_unlock_irq(&current->sighand->siglock);
2122 if (oldset) 1937
2123 *oldset = old_block;
2124 return error; 1938 return error;
2125} 1939}
2126 1940
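
The rewritten sigprocmask() stores the previous mask into *oldset directly while holding the siglock instead of copying it through a local old_block afterwards; the behaviour seen through the system call is unchanged. For reference, the usual save/restore pattern it serves:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGINT);

	sigprocmask(SIG_BLOCK, &block, &old);	/* old receives the previous mask */
	printf("SIGINT was %sblocked before\n",
	       sigismember(&old, SIGINT) ? "" : "not ");
	sigprocmask(SIG_SETMASK, &old, NULL);	/* restore the original mask */
	return 0;
}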
@@ -2307,7 +2121,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2307 2121
2308 timeout = schedule_timeout_interruptible(timeout); 2122 timeout = schedule_timeout_interruptible(timeout);
2309 2123
2310 try_to_freeze();
2311 spin_lock_irq(&current->sighand->siglock); 2124 spin_lock_irq(&current->sighand->siglock);
2312 sig = dequeue_signal(current, &these, &info); 2125 sig = dequeue_signal(current, &these, &info);
2313 current->blocked = current->real_blocked; 2126 current->blocked = current->real_blocked;
@@ -2429,8 +2242,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2429 return kill_proc_info(sig, &info, pid); 2242 return kill_proc_info(sig, &info, pid);
2430} 2243}
2431 2244
2432int 2245int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2433do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2434{ 2246{
2435 struct k_sigaction *k; 2247 struct k_sigaction *k;
2436 sigset_t mask; 2248 sigset_t mask;
@@ -2456,6 +2268,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2456 if (act) { 2268 if (act) {
2457 sigdelsetmask(&act->sa.sa_mask, 2269 sigdelsetmask(&act->sa.sa_mask,
2458 sigmask(SIGKILL) | sigmask(SIGSTOP)); 2270 sigmask(SIGKILL) | sigmask(SIGSTOP));
2271 *k = *act;
2459 /* 2272 /*
2460 * POSIX 3.3.1.3: 2273 * POSIX 3.3.1.3:
2461 * "Setting a signal action to SIG_IGN for a signal that is 2274 * "Setting a signal action to SIG_IGN for a signal that is
@@ -2468,19 +2281,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2468 * be discarded, whether or not it is blocked" 2281 * be discarded, whether or not it is blocked"
2469 */ 2282 */
2470 if (act->sa.sa_handler == SIG_IGN || 2283 if (act->sa.sa_handler == SIG_IGN ||
2471 (act->sa.sa_handler == SIG_DFL && 2284 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2472 sig_kernel_ignore(sig))) {
2473 /*
2474 * This is a fairly rare case, so we only take the
2475 * tasklist_lock once we're sure we'll need it.
2476 * Now we must do this little unlock and relock
2477 * dance to maintain the lock hierarchy.
2478 */
2479 struct task_struct *t = current; 2285 struct task_struct *t = current;
2480 spin_unlock_irq(&t->sighand->siglock);
2481 read_lock(&tasklist_lock);
2482 spin_lock_irq(&t->sighand->siglock);
2483 *k = *act;
2484 sigemptyset(&mask); 2286 sigemptyset(&mask);
2485 sigaddset(&mask, sig); 2287 sigaddset(&mask, sig);
2486 rm_from_queue_full(&mask, &t->signal->shared_pending); 2288 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2489,12 +2291,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2489 recalc_sigpending_tsk(t); 2291 recalc_sigpending_tsk(t);
2490 t = next_thread(t); 2292 t = next_thread(t);
2491 } while (t != current); 2293 } while (t != current);
2492 spin_unlock_irq(&current->sighand->siglock);
2493 read_unlock(&tasklist_lock);
2494 return 0;
2495 } 2294 }
2496
2497 *k = *act;
2498 } 2295 }
2499 2296
2500 spin_unlock_irq(&current->sighand->siglock); 2297 spin_unlock_irq(&current->sighand->siglock);
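
do_sigaction() keeps the POSIX 3.3.1.3 behaviour quoted above: installing SIG_IGN (or SIG_DFL for a default-ignore signal) discards matching pending signals in every thread, only now without dropping the siglock to take tasklist_lock first. A runnable user-space check of that behaviour (hypothetical demo, no error handling):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	sigset_t set, pending;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);

	raise(SIGUSR1);				/* now pending and blocked */
	sigpending(&pending);
	printf("pending before SIG_IGN: %d\n", sigismember(&pending, SIGUSR1));

	signal(SIGUSR1, SIG_IGN);		/* kernel discards the pending signal */
	sigpending(&pending);
	printf("pending after  SIG_IGN: %d\n", sigismember(&pending, SIGUSR1));
	return 0;
}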
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdde..ec8fed42a8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h>
19 20
20#include <asm/irq.h> 21#include <asm/irq.h>
21/* 22/*
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
495 register_cpu_notifier(&cpu_nfb); 496 register_cpu_notifier(&cpu_nfb);
496 return 0; 497 return 0;
497} 498}
499
500#ifdef CONFIG_SMP
501/*
502 * Call a function on all processors
503 */
504int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
505{
506 int ret = 0;
507
508 preempt_disable();
509 ret = smp_call_function(func, info, retry, wait);
510 local_irq_disable();
511 func(info);
512 local_irq_enable();
513 preempt_enable();
514 return ret;
515}
516EXPORT_SYMBOL(on_each_cpu);
517#endif
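
on_each_cpu() as added here is smp_call_function() on the remote CPUs plus a direct call with local interrupts disabled, all under preempt_disable(). A sketch of a caller using the four-argument form this tree defines; show_cpu() and ping_all_cpus() are hypothetical names, <linux/smp.h> is assumed to be included, and the callback must be IPI-safe (it must not sleep).

static void show_cpu(void *info)
{
	/* runs with interrupts disabled on every online CPU */
	printk(KERN_INFO "hello from CPU %d\n", smp_processor_id());
}

static void ping_all_cpus(void)
{
	/* retry = 0, wait = 1: block until every CPU has run show_cpu() */
	on_each_cpu(show_cpu, NULL, 0, 1);
}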
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c67189a25d..ced91e1ff5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -1,12 +1,11 @@
1/* 1/*
2 * Detect Soft Lockups 2 * Detect Soft Lockups
3 * 3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat 4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 * 5 *
6 * this code detects soft lockups: incidents in where on a CPU 6 * this code detects soft lockups: incidents in where on a CPU
7 * the kernel does not reschedule for 10 seconds or more. 7 * the kernel does not reschedule for 10 seconds or more.
8 */ 8 */
9
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/cpu.h> 10#include <linux/cpu.h>
12#include <linux/init.h> 11#include <linux/init.h>
@@ -17,13 +16,14 @@
17 16
18static DEFINE_SPINLOCK(print_lock); 17static DEFINE_SPINLOCK(print_lock);
19 18
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0; 19static DEFINE_PER_CPU(unsigned long, touch_timestamp);
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; 20static DEFINE_PER_CPU(unsigned long, print_timestamp);
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 21static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23 22
24static int did_panic = 0; 23static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event, 24
26 void *ptr) 25static int
26softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
27{ 27{
28 did_panic = 1; 28 did_panic = 1;
29 29
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies; 39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog);
44 * This callback runs from the timer interrupt, and checks 44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not: 45 * whether the watchdog thread has hung or not:
46 */ 46 */
47void softlockup_tick(struct pt_regs *regs) 47void softlockup_tick(void)
48{ 48{
49 int this_cpu = smp_processor_id(); 49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu); 50 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
51 51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp) 52 /* prevent double reports: */
53 if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
54 did_panic ||
55 !per_cpu(watchdog_task, this_cpu))
53 return; 56 return;
54 57
55 /* Do not cause a second panic when there already was one */ 58 /* do not print during early bootup: */
56 if (did_panic) 59 if (unlikely(system_state != SYSTEM_RUNNING)) {
60 touch_softlockup_watchdog();
57 return; 61 return;
62 }
58 63
59 if (time_after(jiffies, timestamp + 10*HZ)) { 64 /* Wake up the high-prio watchdog task every second: */
60 per_cpu(print_timestamp, this_cpu) = timestamp; 65 if (time_after(jiffies, touch_timestamp + HZ))
66 wake_up_process(per_cpu(watchdog_task, this_cpu));
67
68 /* Warn about unreasonable 10+ seconds delays: */
69 if (time_after(jiffies, touch_timestamp + 10*HZ)) {
70 per_cpu(print_timestamp, this_cpu) = touch_timestamp;
61 71
62 spin_lock(&print_lock); 72 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", 73 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu); 74 this_cpu);
65 show_regs(regs); 75 dump_stack();
66 spin_unlock(&print_lock); 76 spin_unlock(&print_lock);
67 } 77 }
68} 78}
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu)
77 sched_setscheduler(current, SCHED_FIFO, &param); 87 sched_setscheduler(current, SCHED_FIFO, &param);
78 current->flags |= PF_NOFREEZE; 88 current->flags |= PF_NOFREEZE;
79 89
80 set_current_state(TASK_INTERRUPTIBLE);
81
82 /* 90 /*
83 * Run briefly once per second - if this gets delayed for 91 * Run briefly once per second to reset the softlockup timestamp.
84 * more than 10 seconds then the debug-printout triggers 92 * If this gets delayed for more than 10 seconds then the
85 * in softlockup_tick(): 93 * debug-printout triggers in softlockup_tick().
86 */ 94 */
87 while (!kthread_should_stop()) { 95 while (!kthread_should_stop()) {
88 msleep_interruptible(1000); 96 set_current_state(TASK_INTERRUPTIBLE);
89 touch_softlockup_watchdog(); 97 touch_softlockup_watchdog();
98 schedule();
90 } 99 }
91 __set_current_state(TASK_RUNNING);
92 100
93 return 0; 101 return 0;
94} 102}
@@ -110,11 +118,11 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
110 printk("watchdog for %i failed\n", hotcpu); 118 printk("watchdog for %i failed\n", hotcpu);
111 return NOTIFY_BAD; 119 return NOTIFY_BAD;
112 } 120 }
121 per_cpu(touch_timestamp, hotcpu) = jiffies;
113 per_cpu(watchdog_task, hotcpu) = p; 122 per_cpu(watchdog_task, hotcpu) = p;
114 kthread_bind(p, hotcpu); 123 kthread_bind(p, hotcpu);
115 break; 124 break;
116 case CPU_ONLINE: 125 case CPU_ONLINE:
117
118 wake_up_process(per_cpu(watchdog_task, hotcpu)); 126 wake_up_process(per_cpu(watchdog_task, hotcpu));
119 break; 127 break;
120#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
@@ -144,6 +152,5 @@ __init void spawn_softlockup_task(void)
144 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
145 register_cpu_notifier(&cpu_nfb); 153 register_cpu_notifier(&cpu_nfb);
146 154
147 notifier_chain_register(&panic_notifier_list, &panic_block); 155 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
148} 156}
149
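
After this rework the per-CPU watchdog thread sleeps in TASK_INTERRUPTIBLE and is woken about once a second by softlockup_tick(), which also resets the timestamp during early boot and only warns after a 10-second gap. Kernel code that legitimately monopolises a CPU with interrupts enabled is expected to call touch_softlockup_watchdog() itself; a sketch follows (wait_for_hardware() and the register layout are hypothetical, <asm/io.h> assumed).

/* Sketch: a long polling loop that must not trip the soft-lockup check. */
static void wait_for_hardware(void __iomem *status_reg)
{
	while (!(readl(status_reg) & 0x1)) {
		cpu_relax();
		touch_softlockup_watchdog();	/* refresh this CPU's touch_timestamp */
	}
}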
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd592..d1b810782b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
179#define BUILD_LOCK_OPS(op, locktype) \ 179#define BUILD_LOCK_OPS(op, locktype) \
180void __lockfunc _##op##_lock(locktype##_t *lock) \ 180void __lockfunc _##op##_lock(locktype##_t *lock) \
181{ \ 181{ \
182 preempt_disable(); \
183 for (;;) { \ 182 for (;;) { \
183 preempt_disable(); \
184 if (likely(_raw_##op##_trylock(lock))) \ 184 if (likely(_raw_##op##_trylock(lock))) \
185 break; \ 185 break; \
186 preempt_enable(); \ 186 preempt_enable(); \
187 \
187 if (!(lock)->break_lock) \ 188 if (!(lock)->break_lock) \
188 (lock)->break_lock = 1; \ 189 (lock)->break_lock = 1; \
189 while (!op##_can_lock(lock) && (lock)->break_lock) \ 190 while (!op##_can_lock(lock) && (lock)->break_lock) \
190 cpu_relax(); \ 191 cpu_relax(); \
191 preempt_disable(); \
192 } \ 192 } \
193 (lock)->break_lock = 0; \ 193 (lock)->break_lock = 0; \
194} \ 194} \
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
199{ \ 199{ \
200 unsigned long flags; \ 200 unsigned long flags; \
201 \ 201 \
202 preempt_disable(); \
203 for (;;) { \ 202 for (;;) { \
203 preempt_disable(); \
204 local_irq_save(flags); \ 204 local_irq_save(flags); \
205 if (likely(_raw_##op##_trylock(lock))) \ 205 if (likely(_raw_##op##_trylock(lock))) \
206 break; \ 206 break; \
207 local_irq_restore(flags); \ 207 local_irq_restore(flags); \
208 \
209 preempt_enable(); \ 208 preempt_enable(); \
209 \
210 if (!(lock)->break_lock) \ 210 if (!(lock)->break_lock) \
211 (lock)->break_lock = 1; \ 211 (lock)->break_lock = 1; \
212 while (!op##_can_lock(lock) && (lock)->break_lock) \ 212 while (!op##_can_lock(lock) && (lock)->break_lock) \
213 cpu_relax(); \ 213 cpu_relax(); \
214 preempt_disable(); \
215 } \ 214 } \
216 (lock)->break_lock = 0; \ 215 (lock)->break_lock = 0; \
217 return flags; \ 216 return flags; \
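
The point of moving preempt_disable() inside the loop is that a CPU spinning in the break_lock/cpu_relax() phase now does so with preemption enabled, and preemption is only disabled again for the next trylock attempt. Roughly what BUILD_LOCK_OPS() now expands to for the plain spinlock case (a paraphrased expansion of the macro above, not a separate implementation):

void __lockfunc _spin_lock(spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(_raw_spin_trylock(lock)))
			break;
		preempt_enable();		/* spin preemptibly ... */

		if (!(lock)->break_lock)
			(lock)->break_lock = 1;
		while (!spin_can_lock(lock) && (lock)->break_lock)
			cpu_relax();
		/* ... and only re-disable preemption for the next attempt */
	}
	(lock)->break_lock = 0;
}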
diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a546..7ef7f6054c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
95 * and the like. 95 * and the like.
96 */ 96 */
97 97
98static struct notifier_block *reboot_notifier_list; 98static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
99static DEFINE_RWLOCK(notifier_lock); 99
100/*
101 * Notifier chain core routines. The exported routines below
102 * are layered on top of these, with appropriate locking added.
103 */
104
105static int notifier_chain_register(struct notifier_block **nl,
106 struct notifier_block *n)
107{
108 while ((*nl) != NULL) {
109 if (n->priority > (*nl)->priority)
110 break;
111 nl = &((*nl)->next);
112 }
113 n->next = *nl;
114 rcu_assign_pointer(*nl, n);
115 return 0;
116}
117
118static int notifier_chain_unregister(struct notifier_block **nl,
119 struct notifier_block *n)
120{
121 while ((*nl) != NULL) {
122 if ((*nl) == n) {
123 rcu_assign_pointer(*nl, n->next);
124 return 0;
125 }
126 nl = &((*nl)->next);
127 }
128 return -ENOENT;
129}
130
131static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v)
133{
134 int ret = NOTIFY_DONE;
135 struct notifier_block *nb;
136
137 nb = rcu_dereference(*nl);
138 while (nb) {
139 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break;
142 nb = rcu_dereference(nb->next);
143 }
144 return ret;
145}
146
147/*
148 * Atomic notifier chain routines. Registration and unregistration
149 * use a mutex, and call_chain is synchronized by RCU (no locks).
150 */
100 151
101/** 152/**
102 * notifier_chain_register - Add notifier to a notifier chain 153 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
103 * @list: Pointer to root list pointer 154 * @nh: Pointer to head of the atomic notifier chain
104 * @n: New entry in notifier chain 155 * @n: New entry in notifier chain
105 * 156 *
106 * Adds a notifier to a notifier chain. 157 * Adds a notifier to an atomic notifier chain.
107 * 158 *
108 * Currently always returns zero. 159 * Currently always returns zero.
109 */ 160 */
161
162int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
163 struct notifier_block *n)
164{
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&nh->lock, flags);
169 ret = notifier_chain_register(&nh->head, n);
170 spin_unlock_irqrestore(&nh->lock, flags);
171 return ret;
172}
173
174EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
175
176/**
177 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
178 * @nh: Pointer to head of the atomic notifier chain
179 * @n: Entry to remove from notifier chain
180 *
181 * Removes a notifier from an atomic notifier chain.
182 *
183 * Returns zero on success or %-ENOENT on failure.
184 */
185int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
186 struct notifier_block *n)
187{
188 unsigned long flags;
189 int ret;
190
191 spin_lock_irqsave(&nh->lock, flags);
192 ret = notifier_chain_unregister(&nh->head, n);
193 spin_unlock_irqrestore(&nh->lock, flags);
194 synchronize_rcu();
195 return ret;
196}
197
198EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
199
200/**
201 * atomic_notifier_call_chain - Call functions in an atomic notifier chain
202 * @nh: Pointer to head of the atomic notifier chain
203 * @val: Value passed unmodified to notifier function
204 * @v: Pointer passed unmodified to notifier function
205 *
206 * Calls each function in a notifier chain in turn. The functions
207 * run in an atomic context, so they must not block.
208 * This routine uses RCU to synchronize with changes to the chain.
209 *
210 * If the return value of the notifier can be and'ed
211 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
212 * will return immediately, with the return value of
213 * the notifier function which halted execution.
214 * Otherwise the return value is the return value
215 * of the last notifier function called.
216 */
110 217
111int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) 218int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
219 unsigned long val, void *v)
112{ 220{
113 write_lock(&notifier_lock); 221 int ret;
114 while(*list) 222
115 { 223 rcu_read_lock();
116 if(n->priority > (*list)->priority) 224 ret = notifier_call_chain(&nh->head, val, v);
117 break; 225 rcu_read_unlock();
118 list= &((*list)->next); 226 return ret;
119 }
120 n->next = *list;
121 *list=n;
122 write_unlock(&notifier_lock);
123 return 0;
124} 227}
125 228
126EXPORT_SYMBOL(notifier_chain_register); 229EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
230
231/*
232 * Blocking notifier chain routines. All access to the chain is
233 * synchronized by an rwsem.
234 */
127 235
128/** 236/**
129 * notifier_chain_unregister - Remove notifier from a notifier chain 237 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
130 * @nl: Pointer to root list pointer 238 * @nh: Pointer to head of the blocking notifier chain
131 * @n: New entry in notifier chain 239 * @n: New entry in notifier chain
132 * 240 *
133 * Removes a notifier from a notifier chain. 241 * Adds a notifier to a blocking notifier chain.
242 * Must be called in process context.
134 * 243 *
135 * Returns zero on success, or %-ENOENT on failure. 244 * Currently always returns zero.
136 */ 245 */
137 246
138int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) 247int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
248 struct notifier_block *n)
139{ 249{
140 write_lock(&notifier_lock); 250 int ret;
141 while((*nl)!=NULL) 251
142 { 252 /*
143 if((*nl)==n) 253 * This code gets used during boot-up, when task switching is
144 { 254 * not yet working and interrupts must remain disabled. At
145 *nl=n->next; 255 * such times we must not call down_write().
146 write_unlock(&notifier_lock); 256 */
147 return 0; 257 if (unlikely(system_state == SYSTEM_BOOTING))
148 } 258 return notifier_chain_register(&nh->head, n);
149 nl=&((*nl)->next); 259
150 } 260 down_write(&nh->rwsem);
151 write_unlock(&notifier_lock); 261 ret = notifier_chain_register(&nh->head, n);
152 return -ENOENT; 262 up_write(&nh->rwsem);
263 return ret;
153} 264}
154 265
155EXPORT_SYMBOL(notifier_chain_unregister); 266EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
156 267
157/** 268/**
158 * notifier_call_chain - Call functions in a notifier chain 269 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
159 * @n: Pointer to root pointer of notifier chain 270 * @nh: Pointer to head of the blocking notifier chain
271 * @n: Entry to remove from notifier chain
272 *
273 * Removes a notifier from a blocking notifier chain.
274 * Must be called from process context.
275 *
276 * Returns zero on success or %-ENOENT on failure.
277 */
278int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
279 struct notifier_block *n)
280{
281 int ret;
282
283 /*
284 * This code gets used during boot-up, when task switching is
285 * not yet working and interrupts must remain disabled. At
286 * such times we must not call down_write().
287 */
288 if (unlikely(system_state == SYSTEM_BOOTING))
289 return notifier_chain_unregister(&nh->head, n);
290
291 down_write(&nh->rwsem);
292 ret = notifier_chain_unregister(&nh->head, n);
293 up_write(&nh->rwsem);
294 return ret;
295}
296
297EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
298
299/**
300 * blocking_notifier_call_chain - Call functions in a blocking notifier chain
301 * @nh: Pointer to head of the blocking notifier chain
160 * @val: Value passed unmodified to notifier function 302 * @val: Value passed unmodified to notifier function
161 * @v: Pointer passed unmodified to notifier function 303 * @v: Pointer passed unmodified to notifier function
162 * 304 *
163 * Calls each function in a notifier chain in turn. 305 * Calls each function in a notifier chain in turn. The functions
306 * run in a process context, so they are allowed to block.
164 * 307 *
165 * If the return value of the notifier can be and'd 308 * If the return value of the notifier can be and'ed
166 * with %NOTIFY_STOP_MASK, then notifier_call_chain 309 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
167 * will return immediately, with the return value of 310 * will return immediately, with the return value of
168 * the notifier function which halted execution. 311 * the notifier function which halted execution.
169 * Otherwise, the return value is the return value 312 * Otherwise the return value is the return value
170 * of the last notifier function called. 313 * of the last notifier function called.
171 */ 314 */
172 315
173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 316int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
317 unsigned long val, void *v)
174{ 318{
175 int ret=NOTIFY_DONE; 319 int ret;
176 struct notifier_block *nb = *n;
177 320
178 while(nb) 321 down_read(&nh->rwsem);
179 { 322 ret = notifier_call_chain(&nh->head, val, v);
180 ret=nb->notifier_call(nb,val,v); 323 up_read(&nh->rwsem);
181 if(ret&NOTIFY_STOP_MASK)
182 {
183 return ret;
184 }
185 nb=nb->next;
186 }
187 return ret; 324 return ret;
188} 325}
189 326
190EXPORT_SYMBOL(notifier_call_chain); 327EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
328
329/*
330 * Raw notifier chain routines. There is no protection;
331 * the caller must provide it. Use at your own risk!
332 */
333
334/**
335 * raw_notifier_chain_register - Add notifier to a raw notifier chain
336 * @nh: Pointer to head of the raw notifier chain
337 * @n: New entry in notifier chain
338 *
339 * Adds a notifier to a raw notifier chain.
340 * All locking must be provided by the caller.
341 *
342 * Currently always returns zero.
343 */
344
345int raw_notifier_chain_register(struct raw_notifier_head *nh,
346 struct notifier_block *n)
347{
348 return notifier_chain_register(&nh->head, n);
349}
350
351EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
352
353/**
354 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
355 * @nh: Pointer to head of the raw notifier chain
356 * @n: Entry to remove from notifier chain
357 *
358 * Removes a notifier from a raw notifier chain.
359 * All locking must be provided by the caller.
360 *
361 * Returns zero on success or %-ENOENT on failure.
362 */
363int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
364 struct notifier_block *n)
365{
366 return notifier_chain_unregister(&nh->head, n);
367}
368
369EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
370
371/**
372 * raw_notifier_call_chain - Call functions in a raw notifier chain
373 * @nh: Pointer to head of the raw notifier chain
374 * @val: Value passed unmodified to notifier function
375 * @v: Pointer passed unmodified to notifier function
376 *
377 * Calls each function in a notifier chain in turn. The functions
378 * run in an undefined context.
379 * All locking must be provided by the caller.
380 *
381 * If the return value of the notifier can be and'ed
382 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain
383 * will return immediately, with the return value of
384 * the notifier function which halted execution.
385 * Otherwise the return value is the return value
386 * of the last notifier function called.
387 */
388
389int raw_notifier_call_chain(struct raw_notifier_head *nh,
390 unsigned long val, void *v)
391{
392 return notifier_call_chain(&nh->head, val, v);
393}
394
395EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
191 396
192/** 397/**
193 * register_reboot_notifier - Register function to be called at reboot time 398 * register_reboot_notifier - Register function to be called at reboot time
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
196 * Registers a function with the list of functions 401 * Registers a function with the list of functions
197 * to be called at reboot time. 402 * to be called at reboot time.
198 * 403 *
199 * Currently always returns zero, as notifier_chain_register 404 * Currently always returns zero, as blocking_notifier_chain_register
200 * always returns zero. 405 * always returns zero.
201 */ 406 */
202 407
203int register_reboot_notifier(struct notifier_block * nb) 408int register_reboot_notifier(struct notifier_block * nb)
204{ 409{
205 return notifier_chain_register(&reboot_notifier_list, nb); 410 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
206} 411}
207 412
208EXPORT_SYMBOL(register_reboot_notifier); 413EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,23 +424,11 @@ EXPORT_SYMBOL(register_reboot_notifier);
219 424
220int unregister_reboot_notifier(struct notifier_block * nb) 425int unregister_reboot_notifier(struct notifier_block * nb)
221{ 426{
222 return notifier_chain_unregister(&reboot_notifier_list, nb); 427 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
223} 428}
224 429
225EXPORT_SYMBOL(unregister_reboot_notifier); 430EXPORT_SYMBOL(unregister_reboot_notifier);
226 431
227#ifndef CONFIG_SECURITY
228int capable(int cap)
229{
230 if (cap_raised(current->cap_effective, cap)) {
231 current->flags |= PF_SUPERPRIV;
232 return 1;
233 }
234 return 0;
235}
236EXPORT_SYMBOL(capable);
237#endif
238
239static int set_one_prio(struct task_struct *p, int niceval, int error) 432static int set_one_prio(struct task_struct *p, int niceval, int error)
240{ 433{
241 int no_nice; 434 int no_nice;
@@ -392,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
392 585
393void kernel_restart_prepare(char *cmd) 586void kernel_restart_prepare(char *cmd)
394{ 587{
395 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
396 system_state = SYSTEM_RESTART; 589 system_state = SYSTEM_RESTART;
397 device_shutdown(); 590 device_shutdown();
398} 591}
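
With the chains split by calling context, the reboot list above becomes a blocking chain: registration and the call path take an rwsem, so callbacks may sleep, while atomic chains use a spinlock plus RCU and raw chains leave all locking to the caller. A sketch of the blocking-chain API from a client's point of view; my_chain, my_nb and my_event_cb are hypothetical, and reboot callbacks should keep using register_reboot_notifier() as before.

static int my_event_cb(struct notifier_block *nb, unsigned long event, void *data)
{
	printk(KERN_INFO "event %lu\n", event);
	return NOTIFY_OK;	/* NOTIFY_STOP would end the chain walk early */
}

static struct notifier_block my_nb = {
	.notifier_call = my_event_cb,
	.priority      = 0,	/* higher priority runs earlier */
};

/* Process-context chain: callbacks are allowed to sleep. */
static BLOCKING_NOTIFIER_HEAD(my_chain);

static void notifier_example(void)
{
	blocking_notifier_chain_register(&my_chain, &my_nb);
	blocking_notifier_call_chain(&my_chain, 42, NULL);
	blocking_notifier_chain_unregister(&my_chain, &my_nb);
}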
@@ -442,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
442 635
443void kernel_shutdown_prepare(enum system_states state) 636void kernel_shutdown_prepare(enum system_states state)
444{ 637{
445 notifier_call_chain(&reboot_notifier_list, 638 blocking_notifier_call_chain(&reboot_notifier_list,
446 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 639 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
447 system_state = state; 640 system_state = state;
448 device_shutdown(); 641 device_shutdown();
@@ -1009,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1009 */ 1202 */
1010 if (tbuf) { 1203 if (tbuf) {
1011 struct tms tmp; 1204 struct tms tmp;
1205 struct task_struct *tsk = current;
1206 struct task_struct *t;
1012 cputime_t utime, stime, cutime, cstime; 1207 cputime_t utime, stime, cutime, cstime;
1013 1208
1014#ifdef CONFIG_SMP 1209 spin_lock_irq(&tsk->sighand->siglock);
1015 if (thread_group_empty(current)) { 1210 utime = tsk->signal->utime;
1016 /* 1211 stime = tsk->signal->stime;
1017 * Single thread case without the use of any locks. 1212 t = tsk;
1018 * 1213 do {
1019 * We may race with release_task if two threads are 1214 utime = cputime_add(utime, t->utime);
1020 * executing. However, release task first adds up the 1215 stime = cputime_add(stime, t->stime);
1021 * counters (__exit_signal) before removing the task 1216 t = next_thread(t);
1022 * from the process tasklist (__unhash_process). 1217 } while (t != tsk);
1023 * __exit_signal also acquires and releases the
1024 * siglock which results in the proper memory ordering
1025 * so that the list modifications are always visible
1026 * after the counters have been updated.
1027 *
1028 * If the counters have been updated by the second thread
1029 * but the thread has not yet been removed from the list
1030 * then the other branch will be executing which will
1031 * block on tasklist_lock until the exit handling of the
1032 * other task is finished.
1033 *
1034 * This also implies that the sighand->siglock cannot
1035 * be held by another processor. So we can also
1036 * skip acquiring that lock.
1037 */
1038 utime = cputime_add(current->signal->utime, current->utime);
1039 stime = cputime_add(current->signal->utime, current->stime);
1040 cutime = current->signal->cutime;
1041 cstime = current->signal->cstime;
1042 } else
1043#endif
1044 {
1045 1218
1046 /* Process with multiple threads */ 1219 cutime = tsk->signal->cutime;
1047 struct task_struct *tsk = current; 1220 cstime = tsk->signal->cstime;
1048 struct task_struct *t; 1221 spin_unlock_irq(&tsk->sighand->siglock);
1049
1050 read_lock(&tasklist_lock);
1051 utime = tsk->signal->utime;
1052 stime = tsk->signal->stime;
1053 t = tsk;
1054 do {
1055 utime = cputime_add(utime, t->utime);
1056 stime = cputime_add(stime, t->stime);
1057 t = next_thread(t);
1058 } while (t != tsk);
1059 1222
1060 /*
1061 * While we have tasklist_lock read-locked, no dying thread
1062 * can be updating current->signal->[us]time. Instead,
1063 * we got their counts included in the live thread loop.
1064 * However, another thread can come in right now and
1065 * do a wait call that updates current->signal->c[us]time.
1066 * To make sure we always see that pair updated atomically,
1067 * we take the siglock around fetching them.
1068 */
1069 spin_lock_irq(&tsk->sighand->siglock);
1070 cutime = tsk->signal->cutime;
1071 cstime = tsk->signal->cstime;
1072 spin_unlock_irq(&tsk->sighand->siglock);
1073 read_unlock(&tasklist_lock);
1074 }
1075 tmp.tms_utime = cputime_to_clock_t(utime); 1223 tmp.tms_utime = cputime_to_clock_t(utime);
1076 tmp.tms_stime = cputime_to_clock_t(stime); 1224 tmp.tms_stime = cputime_to_clock_t(stime);
1077 tmp.tms_cutime = cputime_to_clock_t(cutime); 1225 tmp.tms_cutime = cputime_to_clock_t(cutime);
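
sys_times() now sums the thread group's utime/stime under the siglock alone, dropping both the tasklist_lock path and the single-threaded SMP special case; the values returned to user space are the same. For reference, the library call it backs:

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
	struct tms t;
	long hz = sysconf(_SC_CLK_TCK);

	if (times(&t) == (clock_t)-1)
		perror("times");
	printf("utime %.2fs, stime %.2fs (%ld clock ticks per second)\n",
	       (double)t.tms_utime / hz, (double)t.tms_stime / hz, hz);
	return 0;
}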
@@ -1227,7 +1375,7 @@ asmlinkage long sys_setsid(void)
1227 struct pid *pid; 1375 struct pid *pid;
1228 int err = -EPERM; 1376 int err = -EPERM;
1229 1377
1230 down(&tty_sem); 1378 mutex_lock(&tty_mutex);
1231 write_lock_irq(&tasklist_lock); 1379 write_lock_irq(&tasklist_lock);
1232 1380
1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1381 pid = find_pid(PIDTYPE_PGID, group_leader->pid);
@@ -1241,7 +1389,7 @@ asmlinkage long sys_setsid(void)
1241 err = process_group(group_leader); 1389 err = process_group(group_leader);
1242out: 1390out:
1243 write_unlock_irq(&tasklist_lock); 1391 write_unlock_irq(&tasklist_lock);
1244 up(&tty_sem); 1392 mutex_unlock(&tty_mutex);
1245 return err; 1393 return err;
1246} 1394}
1247 1395
@@ -1375,7 +1523,7 @@ static void groups_sort(struct group_info *group_info)
1375/* a simple bsearch */ 1523/* a simple bsearch */
1376int groups_search(struct group_info *group_info, gid_t grp) 1524int groups_search(struct group_info *group_info, gid_t grp)
1377{ 1525{
1378 int left, right; 1526 unsigned int left, right;
1379 1527
1380 if (!group_info) 1528 if (!group_info)
1381 return 0; 1529 return 0;
@@ -1383,7 +1531,7 @@ int groups_search(struct group_info *group_info, gid_t grp)
1383 left = 0; 1531 left = 0;
1384 right = group_info->ngroups; 1532 right = group_info->ngroups;
1385 while (left < right) { 1533 while (left < right) {
1386 int mid = (left+right)/2; 1534 unsigned int mid = (left+right)/2;
1387 int cmp = grp - GROUP_AT(group_info, mid); 1535 int cmp = grp - GROUP_AT(group_info, mid);
1388 if (cmp > 0) 1536 if (cmp > 0)
1389 left = mid + 1; 1537 left = mid + 1;
@@ -1433,7 +1581,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1433 return -EINVAL; 1581 return -EINVAL;
1434 1582
1435 /* no need to grab task_lock here; it cannot change */ 1583 /* no need to grab task_lock here; it cannot change */
1436 get_group_info(current->group_info);
1437 i = current->group_info->ngroups; 1584 i = current->group_info->ngroups;
1438 if (gidsetsize) { 1585 if (gidsetsize) {
1439 if (i > gidsetsize) { 1586 if (i > gidsetsize) {
@@ -1446,7 +1593,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1446 } 1593 }
1447 } 1594 }
1448out: 1595out:
1449 put_group_info(current->group_info);
1450 return i; 1596 return i;
1451} 1597}
1452 1598
@@ -1487,9 +1633,7 @@ int in_group_p(gid_t grp)
1487{ 1633{
1488 int retval = 1; 1634 int retval = 1;
1489 if (grp != current->fsgid) { 1635 if (grp != current->fsgid) {
1490 get_group_info(current->group_info);
1491 retval = groups_search(current->group_info, grp); 1636 retval = groups_search(current->group_info, grp);
1492 put_group_info(current->group_info);
1493 } 1637 }
1494 return retval; 1638 return retval;
1495} 1639}
@@ -1500,9 +1644,7 @@ int in_egroup_p(gid_t grp)
1500{ 1644{
1501 int retval = 1; 1645 int retval = 1;
1502 if (grp != current->egid) { 1646 if (grp != current->egid) {
1503 get_group_info(current->group_info);
1504 retval = groups_search(current->group_info, grp); 1647 retval = groups_search(current->group_info, grp);
1505 put_group_info(current->group_info);
1506 } 1648 }
1507 return retval; 1649 return retval;
1508} 1650}
@@ -1630,20 +1772,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1630asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1772asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1631{ 1773{
1632 struct rlimit new_rlim, *old_rlim; 1774 struct rlimit new_rlim, *old_rlim;
1775 unsigned long it_prof_secs;
1633 int retval; 1776 int retval;
1634 1777
1635 if (resource >= RLIM_NLIMITS) 1778 if (resource >= RLIM_NLIMITS)
1636 return -EINVAL; 1779 return -EINVAL;
1637 if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1780 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1638 return -EFAULT; 1781 return -EFAULT;
1639 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1782 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1640 return -EINVAL; 1783 return -EINVAL;
1641 old_rlim = current->signal->rlim + resource; 1784 old_rlim = current->signal->rlim + resource;
1642 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1785 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1643 !capable(CAP_SYS_RESOURCE)) 1786 !capable(CAP_SYS_RESOURCE))
1644 return -EPERM; 1787 return -EPERM;
1645 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) 1788 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
1646 return -EPERM; 1789 return -EPERM;
1647 1790
1648 retval = security_task_setrlimit(resource, &new_rlim); 1791 retval = security_task_setrlimit(resource, &new_rlim);
1649 if (retval) 1792 if (retval)
@@ -1653,19 +1796,40 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1653 *old_rlim = new_rlim; 1796 *old_rlim = new_rlim;
1654 task_unlock(current->group_leader); 1797 task_unlock(current->group_leader);
1655 1798
1656 if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && 1799 if (resource != RLIMIT_CPU)
1657 (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 1800 goto out;
1658 new_rlim.rlim_cur <= cputime_to_secs( 1801
1659 current->signal->it_prof_expires))) { 1802 /*
1660 cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); 1803 * RLIMIT_CPU handling. Note that the kernel fails to return an error
1804 * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
1805 * very long-standing error, and fixing it now risks breakage of
1806 * applications, so we live with it
1807 */
1808 if (new_rlim.rlim_cur == RLIM_INFINITY)
1809 goto out;
1810
1811 it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
1812 if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
1813 unsigned long rlim_cur = new_rlim.rlim_cur;
1814 cputime_t cputime;
1815
1816 if (rlim_cur == 0) {
1817 /*
1818 * The caller is asking for an immediate RLIMIT_CPU
1819 * expiry. But we use the zero value to mean "it was
1820 * never set". So let's cheat and make it one second
1821 * instead
1822 */
1823 rlim_cur = 1;
1824 }
1825 cputime = secs_to_cputime(rlim_cur);
1661 read_lock(&tasklist_lock); 1826 read_lock(&tasklist_lock);
1662 spin_lock_irq(&current->sighand->siglock); 1827 spin_lock_irq(&current->sighand->siglock);
1663 set_process_cpu_timer(current, CPUCLOCK_PROF, 1828 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
1664 &cputime, NULL);
1665 spin_unlock_irq(&current->sighand->siglock); 1829 spin_unlock_irq(&current->sighand->siglock);
1666 read_unlock(&tasklist_lock); 1830 read_unlock(&tasklist_lock);
1667 } 1831 }
1668 1832out:
1669 return 0; 1833 return 0;
1670} 1834}
1671 1835
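
Two quirks are deliberately preserved above: a rejected RLIMIT_CPU update still returns 0, and rlim_cur == 0 is quietly treated as one second because zero internally means "it was never set". A small hypothetical demonstration; on this kernel the busy loop below should be killed by SIGXCPU after roughly one second of CPU time rather than immediately.

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl = { .rlim_cur = 0, .rlim_max = RLIM_INFINITY };

	/* a zero soft limit is silently promoted to one second of CPU time */
	if (setrlimit(RLIMIT_CPU, &rl) != 0)
		perror("setrlimit");

	for (;;)
		;	/* burn CPU; expect SIGXCPU after about one second */
	return 0;
}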
@@ -1677,9 +1841,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1677 * a lot simpler! (Which we're not doing right now because we're not 1841 * a lot simpler! (Which we're not doing right now because we're not
1678 * measuring them yet). 1842 * measuring them yet).
1679 * 1843 *
1680 * This expects to be called with tasklist_lock read-locked or better,
1681 * and the siglock not locked. It may momentarily take the siglock.
1682 *
1683 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1844 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1684 * races with threads incrementing their own counters. But since word 1845 * races with threads incrementing their own counters. But since word
1685 * reads are atomic, we either get new values or old values and we don't 1846 * reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1848,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1687 * the c* fields from p->signal from races with exit.c updating those 1848 * the c* fields from p->signal from races with exit.c updating those
1688 * fields when reaping, so a sample either gets all the additions of a 1849 * fields when reaping, so a sample either gets all the additions of a
1689 * given child after it's reaped, or none so this sample is before reaping. 1850 * given child after it's reaped, or none so this sample is before reaping.
1851 *
1852 * tasklist_lock locking optimisation:
1853 * If we are current and single threaded, we do not need to take the tasklist
1854 * lock or the siglock. No one else can take our signal_struct away,
1855 * no one else can reap the children to update signal->c* counters, and
1856 * no one else can race with the signal-> fields.
1857 * If we do not take the tasklist_lock, the signal-> fields could be read
1858 * out of order while another thread was just exiting. So we place a
1859 * read memory barrier when we avoid the lock. On the writer side,
1860 * write memory barrier is implied in __exit_signal as __exit_signal releases
1861 * the siglock spinlock after updating the signal-> fields.
1862 *
1863 * We don't really need the siglock when we access the non c* fields
1864 * of the signal_struct (for RUSAGE_SELF) even in multithreaded
1865 * case, since we take the tasklist lock for read and the non c* signal->
1866 * fields are updated only in __exit_signal, which is called with
1867 * tasklist_lock taken for write, hence these two threads cannot execute
1868 * concurrently.
1869 *
1690 */ 1870 */
1691 1871
1692static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1872static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
@@ -1694,13 +1874,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1694 struct task_struct *t; 1874 struct task_struct *t;
1695 unsigned long flags; 1875 unsigned long flags;
1696 cputime_t utime, stime; 1876 cputime_t utime, stime;
1877 int need_lock = 0;
1697 1878
1698 memset((char *) r, 0, sizeof *r); 1879 memset((char *) r, 0, sizeof *r);
1880 utime = stime = cputime_zero;
1699 1881
1700 if (unlikely(!p->signal)) 1882 if (p != current || !thread_group_empty(p))
1701 return; 1883 need_lock = 1;
1702 1884
1703 utime = stime = cputime_zero; 1885 if (need_lock) {
1886 read_lock(&tasklist_lock);
1887 if (unlikely(!p->signal)) {
1888 read_unlock(&tasklist_lock);
1889 return;
1890 }
1891 } else
1892 /* See locking comments above */
1893 smp_rmb();
1704 1894
1705 switch (who) { 1895 switch (who) {
1706 case RUSAGE_BOTH: 1896 case RUSAGE_BOTH:
@@ -1740,6 +1930,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1740 BUG(); 1930 BUG();
1741 } 1931 }
1742 1932
1933 if (need_lock)
1934 read_unlock(&tasklist_lock);
1743 cputime_to_timeval(utime, &r->ru_utime); 1935 cputime_to_timeval(utime, &r->ru_utime);
1744 cputime_to_timeval(stime, &r->ru_stime); 1936 cputime_to_timeval(stime, &r->ru_stime);
1745} 1937}
@@ -1747,9 +1939,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1747int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1939int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1748{ 1940{
1749 struct rusage r; 1941 struct rusage r;
1750 read_lock(&tasklist_lock);
1751 k_getrusage(p, who, &r); 1942 k_getrusage(p, who, &r);
1752 read_unlock(&tasklist_lock);
1753 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1943 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1754} 1944}
1755 1945
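
getrusage() no longer wraps k_getrusage() in tasklist_lock; the helper now takes the lock itself, and only when the target is another task or the caller is multithreaded, relying on the memory-barrier argument documented above for the single-threaded fast path. The user-visible interface is untouched:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_SELF, &ru) == 0)
		printf("user %ld.%06lds, system %ld.%06lds\n",
		       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
		       (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec);
	return 0;
}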
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6..d82864c4a6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
42cond_syscall(sys_socketcall); 42cond_syscall(sys_socketcall);
43cond_syscall(sys_futex); 43cond_syscall(sys_futex);
44cond_syscall(compat_sys_futex); 44cond_syscall(compat_sys_futex);
45cond_syscall(sys_set_robust_list);
46cond_syscall(compat_sys_set_robust_list);
47cond_syscall(sys_get_robust_list);
48cond_syscall(compat_sys_get_robust_list);
45cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
46cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
47cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 32b48e8ee3..e82726faee 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -742,18 +742,18 @@ static ctl_table vm_table[] = {
742 { 742 {
743 .ctl_name = VM_DIRTY_WB_CS, 743 .ctl_name = VM_DIRTY_WB_CS,
744 .procname = "dirty_writeback_centisecs", 744 .procname = "dirty_writeback_centisecs",
745 .data = &dirty_writeback_centisecs, 745 .data = &dirty_writeback_interval,
746 .maxlen = sizeof(dirty_writeback_centisecs), 746 .maxlen = sizeof(dirty_writeback_interval),
747 .mode = 0644, 747 .mode = 0644,
748 .proc_handler = &dirty_writeback_centisecs_handler, 748 .proc_handler = &dirty_writeback_centisecs_handler,
749 }, 749 },
750 { 750 {
751 .ctl_name = VM_DIRTY_EXPIRE_CS, 751 .ctl_name = VM_DIRTY_EXPIRE_CS,
752 .procname = "dirty_expire_centisecs", 752 .procname = "dirty_expire_centisecs",
753 .data = &dirty_expire_centisecs, 753 .data = &dirty_expire_interval,
754 .maxlen = sizeof(dirty_expire_centisecs), 754 .maxlen = sizeof(dirty_expire_interval),
755 .mode = 0644, 755 .mode = 0644,
756 .proc_handler = &proc_dointvec, 756 .proc_handler = &proc_dointvec_userhz_jiffies,
757 }, 757 },
758 { 758 {
759 .ctl_name = VM_NR_PDFLUSH_THREADS, 759 .ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -848,9 +848,8 @@ static ctl_table vm_table[] = {
848 .data = &laptop_mode, 848 .data = &laptop_mode,
849 .maxlen = sizeof(laptop_mode), 849 .maxlen = sizeof(laptop_mode),
850 .mode = 0644, 850 .mode = 0644,
851 .proc_handler = &proc_dointvec, 851 .proc_handler = &proc_dointvec_jiffies,
852 .strategy = &sysctl_intvec, 852 .strategy = &sysctl_jiffies,
853 .extra1 = &zero,
854 }, 853 },
855 { 854 {
856 .ctl_name = VM_BLOCK_DUMP, 855 .ctl_name = VM_BLOCK_DUMP,
@@ -2054,6 +2053,8 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2054 int write, void *data) 2053 int write, void *data)
2055{ 2054{
2056 if (write) { 2055 if (write) {
2056 if (*lvalp > LONG_MAX / HZ)
2057 return 1;
2057 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); 2058 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
2058 } else { 2059 } else {
2059 int val = *valp; 2060 int val = *valp;
@@ -2075,6 +2076,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2075 int write, void *data) 2076 int write, void *data)
2076{ 2077{
2077 if (write) { 2078 if (write) {
2079 if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
2080 return 1;
2078 *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); 2081 *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
2079 } else { 2082 } else {
2080 int val = *valp; 2083 int val = *valp;
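
Besides switching dirty_expire_centisecs and laptop_mode to the jiffies-aware handlers, this hunk adds overflow guards: a written value is rejected (the conversion helper returns 1) when multiplying it up to jiffies would exceed LONG_MAX, instead of silently wrapping. A stand-alone model of the two checks; HZ and USER_HZ here are example values, not the running kernel's.

#include <limits.h>
#include <stdio.h>

#define HZ      1000	/* assumed kernel tick rate for the example */
#define USER_HZ  100	/* rate the centisecs interfaces are expressed in */

/* Mirror of the kernel's guards: reject values whose conversion
 * to jiffies would exceed LONG_MAX. */
static int seconds_would_overflow(unsigned long secs)
{
	return secs > LONG_MAX / HZ;
}

static int userhz_would_overflow(unsigned long val)
{
	return USER_HZ < HZ && val > (LONG_MAX / HZ) * USER_HZ;
}

int main(void)
{
	printf("%d %d\n",
	       seconds_would_overflow(LONG_MAX / HZ + 1),	/* 1: rejected */
	       userhz_would_overflow(1000));			/* 0: accepted */
	return 0;
}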
diff --git a/kernel/time.c b/kernel/time.c
index 804539165d..ff8e7019c4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
203} 203}
204 204
205long pps_offset; /* pps time offset (us) */
206long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
207
208long pps_freq; /* frequency offset (scaled ppm) */
209long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
210
211long pps_valid = PPS_VALID; /* pps signal watchdog counter */
212
213int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
214
215long pps_jitcnt; /* jitter limit exceeded */
216long pps_calcnt; /* calibration intervals */
217long pps_errcnt; /* calibration errors */
218long pps_stbcnt; /* stability limit exceeded */
219
220/* hook for a loadable hardpps kernel module */
221void (*hardpps_ptr)(struct timeval *);
222
223/* we call this to notify the arch when the clock is being 205/* we call this to notify the arch when the clock is being
224 * controlled. If no such arch routine, do nothing. 206 * controlled. If no such arch routine, do nothing.
225 */ 207 */
@@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc)
279 result = -EINVAL; 261 result = -EINVAL;
280 goto leave; 262 goto leave;
281 } 263 }
282 time_freq = txc->freq - pps_freq; 264 time_freq = txc->freq;
283 } 265 }
284 266
285 if (txc->modes & ADJ_MAXERROR) { 267 if (txc->modes & ADJ_MAXERROR) {
@@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc)
312 if ((time_next_adjust = txc->offset) == 0) 294 if ((time_next_adjust = txc->offset) == 0)
313 time_adjust = 0; 295 time_adjust = 0;
314 } 296 }
315 else if ( time_status & (STA_PLL | STA_PPSTIME) ) { 297 else if (time_status & STA_PLL) {
316 ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == 298 ltemp = txc->offset;
317 (STA_PPSTIME | STA_PPSSIGNAL) ?
318 pps_offset : txc->offset;
319 299
320 /* 300 /*
321 * Scale the phase adjustment and 301 * Scale the phase adjustment and
@@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc)
356 } 336 }
357 time_freq = min(time_freq, time_tolerance); 337 time_freq = min(time_freq, time_tolerance);
358 time_freq = max(time_freq, -time_tolerance); 338 time_freq = max(time_freq, -time_tolerance);
359 } /* STA_PLL || STA_PPSTIME */ 339 } /* STA_PLL */
360 } /* txc->modes & ADJ_OFFSET */ 340 } /* txc->modes & ADJ_OFFSET */
361 if (txc->modes & ADJ_TICK) { 341 if (txc->modes & ADJ_TICK) {
362 tick_usec = txc->tick; 342 tick_usec = txc->tick;
363 tick_nsec = TICK_USEC_TO_NSEC(tick_usec); 343 tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
364 } 344 }
365 } /* txc->modes */ 345 } /* txc->modes */
366leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 346leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
367 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
368 && (time_status & STA_PPSSIGNAL) == 0)
369 /* p. 24, (b) */
370 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
371 == (STA_PPSTIME|STA_PPSJITTER))
372 /* p. 24, (c) */
373 || ((time_status & STA_PPSFREQ) != 0
374 && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
375 /* p. 24, (d) */
376 result = TIME_ERROR; 347 result = TIME_ERROR;
377 348
378 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 349 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
@@ -380,7 +351,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
380 else { 351 else {
381 txc->offset = shift_right(time_offset, SHIFT_UPDATE); 352 txc->offset = shift_right(time_offset, SHIFT_UPDATE);
382 } 353 }
383 txc->freq = time_freq + pps_freq; 354 txc->freq = time_freq;
384 txc->maxerror = time_maxerror; 355 txc->maxerror = time_maxerror;
385 txc->esterror = time_esterror; 356 txc->esterror = time_esterror;
386 txc->status = time_status; 357 txc->status = time_status;
@@ -388,14 +359,16 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
388 txc->precision = time_precision; 359 txc->precision = time_precision;
389 txc->tolerance = time_tolerance; 360 txc->tolerance = time_tolerance;
390 txc->tick = tick_usec; 361 txc->tick = tick_usec;
391 txc->ppsfreq = pps_freq; 362
392 txc->jitter = pps_jitter >> PPS_AVG; 363 /* PPS is not implemented, so these are zero */
393 txc->shift = pps_shift; 364 txc->ppsfreq = 0;
394 txc->stabil = pps_stabil; 365 txc->jitter = 0;
395 txc->jitcnt = pps_jitcnt; 366 txc->shift = 0;
396 txc->calcnt = pps_calcnt; 367 txc->stabil = 0;
397 txc->errcnt = pps_errcnt; 368 txc->jitcnt = 0;
398 txc->stbcnt = pps_stbcnt; 369 txc->calcnt = 0;
370 txc->errcnt = 0;
371 txc->stbcnt = 0;
399 write_sequnlock_irq(&xtime_lock); 372 write_sequnlock_irq(&xtime_lock);
400 do_gettimeofday(&txc->time); 373 do_gettimeofday(&txc->time);
401 notify_arch_cmos_timer(); 374 notify_arch_cmos_timer();
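
With the in-kernel PPS discipline removed, do_adjtimex() stops folding pps_freq into the reported frequency, returns all PPS read-outs as zero, and the STA_PPS* status bits no longer force TIME_ERROR. A small user-space query that shows the effect (a sketch; it uses the standard adjtimex() wrapper from <sys/timex.h> with modes = 0 for a read-only call):

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	/* On kernels with this change the PPS read-outs are all zero. */
	printf("freq=%ld ppsfreq=%ld jitter=%ld shift=%d stabil=%ld\n",
	       tx.freq, tx.ppsfreq, tx.jitter, tx.shift, tx.stabil);
	printf("clock state=%d (TIME_ERROR=%d)\n", state, TIME_ERROR);
	return 0;
}
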
@@ -637,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
637 * 610 *
638 * Returns the timespec representation of the nsec parameter. 611 * Returns the timespec representation of the nsec parameter.
639 */ 612 */
640struct timespec ns_to_timespec(const nsec_t nsec) 613struct timespec ns_to_timespec(const s64 nsec)
641{ 614{
642 struct timespec ts; 615 struct timespec ts;
643 616
@@ -657,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec)
657 * 630 *
658 * Returns the timeval representation of the nsec parameter. 631 * Returns the timeval representation of the nsec parameter.
659 */ 632 */
660struct timeval ns_to_timeval(const nsec_t nsec) 633struct timeval ns_to_timeval(const s64 nsec)
661{ 634{
662 struct timespec ts = ns_to_timespec(nsec); 635 struct timespec ts = ns_to_timespec(nsec);
663 struct timeval tv; 636 struct timeval tv;
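
Here the nsec_t typedef gives way to plain s64 in the ns_to_timespec()/ns_to_timeval() prototypes; the conversion itself is untouched. For reference, a user-space sketch of the same split into seconds plus a normalized nanosecond remainder (int64_t standing in for the kernel's s64, hypothetical function name):

#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Split a signed nanosecond count so that 0 <= tv_nsec < NSEC_PER_SEC,
 * mirroring what ns_to_timespec() produces. */
static struct timespec ns_to_timespec_sketch(int64_t nsec)
{
	struct timespec ts;

	ts.tv_sec = nsec / NSEC_PER_SEC;
	ts.tv_nsec = nsec % NSEC_PER_SEC;
	if (ts.tv_nsec < 0) {		/* normalize negative remainders */
		ts.tv_sec -= 1;
		ts.tv_nsec += NSEC_PER_SEC;
	}
	return ts;
}
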
diff --git a/kernel/timer.c b/kernel/timer.c
index 2410c18dbe..ab189dd187 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -86,7 +86,8 @@ struct tvec_t_base_s {
86} ____cacheline_aligned_in_smp; 86} ____cacheline_aligned_in_smp;
87 87
88typedef struct tvec_t_base_s tvec_base_t; 88typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t, tvec_bases); 89static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
90static tvec_base_t boot_tvec_bases;
90 91
91static inline void set_running_timer(tvec_base_t *base, 92static inline void set_running_timer(tvec_base_t *base,
92 struct timer_list *timer) 93 struct timer_list *timer)
@@ -157,7 +158,7 @@ EXPORT_SYMBOL(__init_timer_base);
157void fastcall init_timer(struct timer_list *timer) 158void fastcall init_timer(struct timer_list *timer)
158{ 159{
159 timer->entry.next = NULL; 160 timer->entry.next = NULL;
160 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; 161 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
161} 162}
162EXPORT_SYMBOL(init_timer); 163EXPORT_SYMBOL(init_timer);
163 164
@@ -218,7 +219,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
218 ret = 1; 219 ret = 1;
219 } 220 }
220 221
221 new_base = &__get_cpu_var(tvec_bases); 222 new_base = __get_cpu_var(tvec_bases);
222 223
223 if (base != &new_base->t_base) { 224 if (base != &new_base->t_base) {
224 /* 225 /*
@@ -258,7 +259,7 @@ EXPORT_SYMBOL(__mod_timer);
258 */ 259 */
259void add_timer_on(struct timer_list *timer, int cpu) 260void add_timer_on(struct timer_list *timer, int cpu)
260{ 261{
261 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 262 tvec_base_t *base = per_cpu(tvec_bases, cpu);
262 unsigned long flags; 263 unsigned long flags;
263 264
264 BUG_ON(timer_pending(timer) || !timer->function); 265 BUG_ON(timer_pending(timer) || !timer->function);
@@ -504,7 +505,7 @@ unsigned long next_timer_interrupt(void)
504 } 505 }
505 hr_expires += jiffies; 506 hr_expires += jiffies;
506 507
507 base = &__get_cpu_var(tvec_bases); 508 base = __get_cpu_var(tvec_bases);
508 spin_lock(&base->t_base.lock); 509 spin_lock(&base->t_base.lock);
509 expires = base->timer_jiffies + (LONG_MAX >> 1); 510 expires = base->timer_jiffies + (LONG_MAX >> 1);
510 list = NULL; 511 list = NULL;
@@ -696,18 +697,9 @@ static void second_overflow(void)
696 697
697 /* 698 /*
698 * Compute the frequency estimate and additional phase adjustment due 699 * Compute the frequency estimate and additional phase adjustment due
699 * to frequency error for the next second. When the PPS signal is 700 * to frequency error for the next second.
700 * engaged, gnaw on the watchdog counter and update the frequency
701 * computed by the pll and the PPS signal.
702 */ 701 */
703 pps_valid++; 702 ltemp = time_freq;
704 if (pps_valid == PPS_VALID) { /* PPS signal lost */
705 pps_jitter = MAXTIME;
706 pps_stabil = MAXFREQ;
707 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
708 STA_PPSWANDER | STA_PPSERROR);
709 }
710 ltemp = time_freq + pps_freq;
711 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); 703 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
712 704
713#if HZ == 100 705#if HZ == 100
@@ -901,7 +893,7 @@ EXPORT_SYMBOL(xtime_lock);
901 */ 893 */
902static void run_timer_softirq(struct softirq_action *h) 894static void run_timer_softirq(struct softirq_action *h)
903{ 895{
904 tvec_base_t *base = &__get_cpu_var(tvec_bases); 896 tvec_base_t *base = __get_cpu_var(tvec_bases);
905 897
906 hrtimer_run_queues(); 898 hrtimer_run_queues();
907 if (time_after_eq(jiffies, base->timer_jiffies)) 899 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -914,6 +906,7 @@ static void run_timer_softirq(struct softirq_action *h)
914void run_local_timers(void) 906void run_local_timers(void)
915{ 907{
916 raise_softirq(TIMER_SOFTIRQ); 908 raise_softirq(TIMER_SOFTIRQ);
909 softlockup_tick();
917} 910}
918 911
919/* 912/*
@@ -944,7 +937,6 @@ void do_timer(struct pt_regs *regs)
944 /* prevent loading jiffies before storing new jiffies_64 value. */ 937 /* prevent loading jiffies before storing new jiffies_64 value. */
945 barrier(); 938 barrier();
946 update_times(); 939 update_times();
947 softlockup_tick(regs);
948} 940}
949 941
950#ifdef __ARCH_WANT_SYS_ALARM 942#ifdef __ARCH_WANT_SYS_ALARM
@@ -955,19 +947,7 @@ void do_timer(struct pt_regs *regs)
955 */ 947 */
956asmlinkage unsigned long sys_alarm(unsigned int seconds) 948asmlinkage unsigned long sys_alarm(unsigned int seconds)
957{ 949{
958 struct itimerval it_new, it_old; 950 return alarm_setitimer(seconds);
959 unsigned int oldalarm;
960
961 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
962 it_new.it_value.tv_sec = seconds;
963 it_new.it_value.tv_usec = 0;
964 do_setitimer(ITIMER_REAL, &it_new, &it_old);
965 oldalarm = it_old.it_value.tv_sec;
966 /* ehhh.. We can't return 0 if we have an alarm pending.. */
967 /* And we'd better return too much than too little anyway */
968 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
969 oldalarm++;
970 return oldalarm;
971} 951}
972 952
973#endif 953#endif
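
sys_alarm() is reduced to a call to alarm_setitimer(), presumably the helper added to kernel/itimer.c in this series (itimer.c is touched in the diffstat). The user-visible contract is unchanged: the return value is the time left on any previously pending alarm, rounded up so that a pending alarm is never reported as zero. A user-space sketch of that contract:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned int left;

	alarm(10);		/* arm a 10 second alarm */
	sleep(3);
	left = alarm(5);	/* re-arm; returns time left on the old one */

	/* Roughly 7 seconds should remain; fractional seconds round up,
	 * so a pending alarm never reports 0. */
	printf("previous alarm had %u second(s) left\n", left);
	return 0;
}
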
@@ -1256,12 +1236,32 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1256 return 0; 1236 return 0;
1257} 1237}
1258 1238
1259static void __devinit init_timers_cpu(int cpu) 1239static int __devinit init_timers_cpu(int cpu)
1260{ 1240{
1261 int j; 1241 int j;
1262 tvec_base_t *base; 1242 tvec_base_t *base;
1263 1243
1264 base = &per_cpu(tvec_bases, cpu); 1244 base = per_cpu(tvec_bases, cpu);
1245 if (!base) {
1246 static char boot_done;
1247
1248 /*
1249 * Cannot do allocation in init_timers as that runs before the
1250 * allocator initializes (and would waste memory if there are
1251 * more possible CPUs than will ever be installed/brought up).
1252 */
1253 if (boot_done) {
1254 base = kmalloc_node(sizeof(*base), GFP_KERNEL,
1255 cpu_to_node(cpu));
1256 if (!base)
1257 return -ENOMEM;
1258 memset(base, 0, sizeof(*base));
1259 } else {
1260 base = &boot_tvec_bases;
1261 boot_done = 1;
1262 }
1263 per_cpu(tvec_bases, cpu) = base;
1264 }
1265 spin_lock_init(&base->t_base.lock); 1265 spin_lock_init(&base->t_base.lock);
1266 for (j = 0; j < TVN_SIZE; j++) { 1266 for (j = 0; j < TVN_SIZE; j++) {
1267 INIT_LIST_HEAD(base->tv5.vec + j); 1267 INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1273,6 +1273,7 @@ static void __devinit init_timers_cpu(int cpu)
1273 INIT_LIST_HEAD(base->tv1.vec + j); 1273 INIT_LIST_HEAD(base->tv1.vec + j);
1274 1274
1275 base->timer_jiffies = jiffies; 1275 base->timer_jiffies = jiffies;
1276 return 0;
1276} 1277}
1277 1278
1278#ifdef CONFIG_HOTPLUG_CPU 1279#ifdef CONFIG_HOTPLUG_CPU
@@ -1295,8 +1296,8 @@ static void __devinit migrate_timers(int cpu)
1295 int i; 1296 int i;
1296 1297
1297 BUG_ON(cpu_online(cpu)); 1298 BUG_ON(cpu_online(cpu));
1298 old_base = &per_cpu(tvec_bases, cpu); 1299 old_base = per_cpu(tvec_bases, cpu);
1299 new_base = &get_cpu_var(tvec_bases); 1300 new_base = get_cpu_var(tvec_bases);
1300 1301
1301 local_irq_disable(); 1302 local_irq_disable();
1302 spin_lock(&new_base->t_base.lock); 1303 spin_lock(&new_base->t_base.lock);
@@ -1326,7 +1327,8 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1326 long cpu = (long)hcpu; 1327 long cpu = (long)hcpu;
1327 switch(action) { 1328 switch(action) {
1328 case CPU_UP_PREPARE: 1329 case CPU_UP_PREPARE:
1329 init_timers_cpu(cpu); 1330 if (init_timers_cpu(cpu) < 0)
1331 return NOTIFY_BAD;
1330 break; 1332 break;
1331#ifdef CONFIG_HOTPLUG_CPU 1333#ifdef CONFIG_HOTPLUG_CPU
1332 case CPU_DEAD: 1334 case CPU_DEAD:
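
tvec_bases is now a per-CPU pointer rather than a per-CPU structure: the first CPU initialized uses the static boot_tvec_bases (init_timers() runs before the allocator is usable), later CPUs get their base from kmalloc_node(), and an allocation failure propagates out of init_timers_cpu() so the CPU_UP_PREPARE notifier can return NOTIFY_BAD. A plain-C sketch of the "static first, heap afterwards" pattern, with hypothetical names:

#include <stdlib.h>

struct base { int timer_jiffies; /* ... */ };

static struct base boot_base;		/* static storage for the boot CPU */
static struct base *bases[64];		/* stand-in for the per-CPU pointers */

/* Returns 0 on success, -1 if a late allocation fails, in which case
 * the caller refuses to bring the CPU up (NOTIFY_BAD in the notifier). */
static int init_base_for(int cpu)
{
	static int boot_done;

	if (!bases[cpu]) {
		if (boot_done) {
			bases[cpu] = calloc(1, sizeof(*bases[cpu]));
			if (!bases[cpu])
				return -1;
		} else {
			bases[cpu] = &boot_base;	/* too early to allocate */
			boot_done = 1;
		}
	}
	/* ...per-CPU field initialization would follow here... */
	return 0;
}
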
diff --git a/kernel/user.c b/kernel/user.c
index d9deae43a9..2116642f42 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -105,15 +105,19 @@ void free_uid(struct user_struct *up)
105{ 105{
106 unsigned long flags; 106 unsigned long flags;
107 107
108 if (!up)
109 return;
110
108 local_irq_save(flags); 111 local_irq_save(flags);
109 if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { 112 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
110 uid_hash_remove(up); 113 uid_hash_remove(up);
114 spin_unlock_irqrestore(&uidhash_lock, flags);
111 key_put(up->uid_keyring); 115 key_put(up->uid_keyring);
112 key_put(up->session_keyring); 116 key_put(up->session_keyring);
113 kmem_cache_free(uid_cachep, up); 117 kmem_cache_free(uid_cachep, up);
114 spin_unlock(&uidhash_lock); 118 } else {
119 local_irq_restore(flags);
115 } 120 }
116 local_irq_restore(flags);
117} 121}
118 122
119struct user_struct * alloc_uid(uid_t uid) 123struct user_struct * alloc_uid(uid_t uid)
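
The free_uid() fix centers on atomic_dec_and_lock(): the count is dropped and, only when it hits zero, the function returns with uidhash_lock held so the final holder can unhash the user; the lock (and interrupts) are then released before the keyring puts and kmem_cache_free(). A user-space sketch of the dec-and-lock idiom itself, using C11 atomics and pthreads with hypothetical names (the kernel primitive is an arch-optimized equivalent):

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>

/* Drop a reference; return true with the lock held only if this was
 * the last reference, so the caller can unhash and free the object. */
static bool dec_and_lock(atomic_int *count, pthread_mutex_t *lock)
{
	int old = atomic_load(count);

	/* Fast path: clearly not the last reference, just decrement. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(count, &old, old - 1))
			return false;
	}
	/* Possibly the last reference: re-check under the lock. */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(count, 1) == 1)
		return true;		/* caller unlocks after the removal */
	pthread_mutex_unlock(lock);
	return false;
}

In free_uid() the removal step is uid_hash_remove(); key_put() and kmem_cache_free() then run outside the lock, with interrupts restored.
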
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b052e2c4c7..e9e464a903 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -27,6 +27,7 @@
27#include <linux/cpu.h> 27#include <linux/cpu.h>
28#include <linux/notifier.h> 28#include <linux/notifier.h>
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h>
30 31
31/* 32/*
32 * The per-CPU workqueue (if single thread, we always use the first 33 * The per-CPU workqueue (if single thread, we always use the first
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work)
476} 477}
477EXPORT_SYMBOL(cancel_rearming_delayed_work); 478EXPORT_SYMBOL(cancel_rearming_delayed_work);
478 479
480/**
481 * execute_in_process_context - reliably execute the routine with user context
482 * @fn: the function to execute
483 * @data: data to pass to the function
484 * @ew: guaranteed storage for the execute work structure (must
485 * be available when the work executes)
486 *
487 * Executes the function immediately if process context is available,
488 * otherwise schedules the function for delayed execution.
489 *
490 * Returns: 0 - function was executed
491 * 1 - function was scheduled for execution
492 */
493int execute_in_process_context(void (*fn)(void *data), void *data,
494 struct execute_work *ew)
495{
496 if (!in_interrupt()) {
497 fn(data);
498 return 0;
499 }
500
501 INIT_WORK(&ew->work, fn, data);
502 schedule_work(&ew->work);
503
504 return 1;
505}
506EXPORT_SYMBOL_GPL(execute_in_process_context);
507
479int keventd_up(void) 508int keventd_up(void)
480{ 509{
481 return keventd_wq != NULL; 510 return keventd_wq != NULL;
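
A usage sketch for the new helper, with a hypothetical driver object (it assumes struct execute_work is declared next to the helper in <linux/workqueue.h>): the execute_work storage is embedded in the object so it stays valid until the deferred routine has run.

#include <linux/workqueue.h>
#include <linux/slab.h>

/* Hypothetical driver object. */
struct mydev {
	struct execute_work	ew;
	void			*resources;
};

static void mydev_release(void *data)
{
	struct mydev *dev = data;

	/* Runs in process context, either immediately or from keventd. */
	kfree(dev->resources);
	kfree(dev);
}

/* May be called from interrupt context; the helper decides whether to
 * run the release now or schedule it. */
static void mydev_put_final(struct mydev *dev)
{
	execute_in_process_context(mydev_release, dev, &dev->ew);
}
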