Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/audit.c | 97
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/auditfilter.c | 25
-rw-r--r--  kernel/auditsc.c | 140
-rw-r--r--  kernel/cpu.c | 16
-rw-r--r--  kernel/cpuset.c | 11
-rw-r--r--  kernel/exit.c | 40
-rw-r--r--  kernel/fork.c | 31
-rw-r--r--  kernel/futex.c | 159
-rw-r--r--  kernel/hrtimer.c | 17
-rw-r--r--  kernel/irq/proc.c | 10
-rw-r--r--  kernel/irq/spurious.c | 12
-rw-r--r--  kernel/kallsyms.c | 27
-rw-r--r--  kernel/kfifo.c | 3
-rw-r--r--  kernel/kmod.c | 303
-rw-r--r--  kernel/kprobes.c | 9
-rw-r--r--  kernel/ksysfs.c | 28
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/lockdep.c | 1501
-rw-r--r--  kernel/lockdep_proc.c | 301
-rw-r--r--  kernel/module.c | 63
-rw-r--r--  kernel/mutex.c | 8
-rw-r--r--  kernel/nsproxy.c | 72
-rw-r--r--  kernel/panic.c | 5
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 29
-rw-r--r--  kernel/power/disk.c | 251
-rw-r--r--  kernel/power/main.c | 108
-rw-r--r--  kernel/power/power.h | 29
-rw-r--r--  kernel/power/process.c | 90
-rw-r--r--  kernel/power/swap.c | 20
-rw-r--r--  kernel/power/user.c | 154
-rw-r--r--  kernel/printk.c | 55
-rw-r--r--  kernel/ptrace.c | 28
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/relay.c | 13
-rw-r--r--  kernel/rtmutex-debug.c | 6
-rw-r--r--  kernel/rtmutex-tester.c | 1
-rw-r--r--  kernel/rtmutex.c | 6
-rw-r--r--  kernel/rtmutex_common.h | 9
-rw-r--r--  kernel/rwsem.c | 8
-rw-r--r--  kernel/sched.c | 63
-rw-r--r--  kernel/seccomp.c | 29
-rw-r--r--  kernel/signal.c | 43
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/softlockup.c | 2
-rw-r--r--  kernel/spinlock.c | 32
-rw-r--r--  kernel/stop_machine.c | 8
-rw-r--r--  kernel/sys.c | 104
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 111
-rw-r--r--  kernel/taskstats.c | 4
-rw-r--r--  kernel/time.c | 79
-rw-r--r--  kernel/time/clockevents.c | 41
-rw-r--r--  kernel/time/ntp.c | 71
-rw-r--r--  kernel/time/tick-broadcast.c | 35
-rw-r--r--  kernel/time/tick-common.c | 16
-rw-r--r--  kernel/time/tick-oneshot.c | 15
-rw-r--r--  kernel/time/tick-sched.c | 7
-rw-r--r--  kernel/time/timekeeping.c | 45
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 16
-rw-r--r--  kernel/timer.c | 231
-rw-r--r--  kernel/user.c | 20
-rw-r--r--  kernel/user_namespace.c | 87
-rw-r--r--  kernel/utsname.c | 12
-rw-r--r--  kernel/utsname_sysctl.c | 5
-rw-r--r--  kernel/workqueue.c | 60
70 files changed, 3112 insertions, 1738 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 642d4277c2ea..2a999836ca18 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,11 +4,12 @@
 
 obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
       exit.o itimer.o time.o softirq.o resource.o \
-      sysctl.o capability.o ptrace.o timer.o user.o \
+      sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
       signal.o sys.o kmod.o workqueue.o pid.o \
       rcupdate.o extable.o params.o posix-timers.o \
       kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-      hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
+      hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
+      utsname.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -48,7 +49,6 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
-obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 
diff --git a/kernel/audit.c b/kernel/audit.c
index d13276d41410..eb0f9165b401 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,7 @@
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h> 60#include <linux/freezer.h>
61#include <linux/tty.h>
61 62
62#include "audit.h" 63#include "audit.h"
63 64
@@ -391,6 +392,7 @@ static int kauditd_thread(void *dummy)
391{ 392{
392 struct sk_buff *skb; 393 struct sk_buff *skb;
393 394
395 set_freezable();
394 while (!kthread_should_stop()) { 396 while (!kthread_should_stop()) {
395 skb = skb_dequeue(&audit_skb_queue); 397 skb = skb_dequeue(&audit_skb_queue);
396 wake_up(&audit_backlog_wait); 398 wake_up(&audit_backlog_wait);
@@ -423,6 +425,31 @@ static int kauditd_thread(void *dummy)
423 return 0; 425 return 0;
424} 426}
425 427
428static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
429{
430 struct task_struct *tsk;
431 int err;
432
433 read_lock(&tasklist_lock);
434 tsk = find_task_by_pid(pid);
435 err = -ESRCH;
436 if (!tsk)
437 goto out;
438 err = 0;
439
440 spin_lock_irq(&tsk->sighand->siglock);
441 if (!tsk->signal->audit_tty)
442 err = -EPERM;
443 spin_unlock_irq(&tsk->sighand->siglock);
444 if (err)
445 goto out;
446
447 tty_audit_push_task(tsk, loginuid);
448out:
449 read_unlock(&tasklist_lock);
450 return err;
451}
452
426int audit_send_list(void *_dest) 453int audit_send_list(void *_dest)
427{ 454{
428 struct audit_netlink_list *dest = _dest; 455 struct audit_netlink_list *dest = _dest;
@@ -511,6 +538,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
511 case AUDIT_DEL: 538 case AUDIT_DEL:
512 case AUDIT_DEL_RULE: 539 case AUDIT_DEL_RULE:
513 case AUDIT_SIGNAL_INFO: 540 case AUDIT_SIGNAL_INFO:
541 case AUDIT_TTY_GET:
542 case AUDIT_TTY_SET:
514 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) 543 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
515 err = -EPERM; 544 err = -EPERM;
516 break; 545 break;
@@ -622,6 +651,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
622 err = audit_filter_user(&NETLINK_CB(skb), msg_type); 651 err = audit_filter_user(&NETLINK_CB(skb), msg_type);
623 if (err == 1) { 652 if (err == 1) {
624 err = 0; 653 err = 0;
654 if (msg_type == AUDIT_USER_TTY) {
655 err = audit_prepare_user_tty(pid, loginuid);
656 if (err)
657 break;
658 }
625 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 659 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
626 if (ab) { 660 if (ab) {
627 audit_log_format(ab, 661 audit_log_format(ab,
@@ -638,8 +672,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
638 " subj=%s", ctx); 672 " subj=%s", ctx);
639 kfree(ctx); 673 kfree(ctx);
640 } 674 }
641 audit_log_format(ab, " msg='%.1024s'", 675 if (msg_type != AUDIT_USER_TTY)
642 (char *)data); 676 audit_log_format(ab, " msg='%.1024s'",
677 (char *)data);
678 else {
679 int size;
680
681 audit_log_format(ab, " msg=");
682 size = nlmsg_len(nlh);
683 audit_log_n_untrustedstring(ab, size,
684 data);
685 }
643 audit_set_pid(ab, pid); 686 audit_set_pid(ab, pid);
644 audit_log_end(ab); 687 audit_log_end(ab);
645 } 688 }
@@ -730,6 +773,45 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
730 0, 0, sig_data, sizeof(*sig_data) + len); 773 0, 0, sig_data, sizeof(*sig_data) + len);
731 kfree(sig_data); 774 kfree(sig_data);
732 break; 775 break;
776 case AUDIT_TTY_GET: {
777 struct audit_tty_status s;
778 struct task_struct *tsk;
779
780 read_lock(&tasklist_lock);
781 tsk = find_task_by_pid(pid);
782 if (!tsk)
783 err = -ESRCH;
784 else {
785 spin_lock_irq(&tsk->sighand->siglock);
786 s.enabled = tsk->signal->audit_tty != 0;
787 spin_unlock_irq(&tsk->sighand->siglock);
788 }
789 read_unlock(&tasklist_lock);
790 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
791 &s, sizeof(s));
792 break;
793 }
794 case AUDIT_TTY_SET: {
795 struct audit_tty_status *s;
796 struct task_struct *tsk;
797
798 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
799 return -EINVAL;
800 s = data;
801 if (s->enabled != 0 && s->enabled != 1)
802 return -EINVAL;
803 read_lock(&tasklist_lock);
804 tsk = find_task_by_pid(pid);
805 if (!tsk)
806 err = -ESRCH;
807 else {
808 spin_lock_irq(&tsk->sighand->siglock);
809 tsk->signal->audit_tty = s->enabled != 0;
810 spin_unlock_irq(&tsk->sighand->siglock);
811 }
812 read_unlock(&tasklist_lock);
813 break;
814 }
733 default: 815 default:
734 err = -EINVAL; 816 err = -EINVAL;
735 break; 817 break;
@@ -1185,7 +1267,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1185} 1267}
1186 1268
1187/** 1269/**
1188 * audit_log_n_unstrustedstring - log a string that may contain random characters 1270 * audit_log_n_untrustedstring - log a string that may contain random characters
1189 * @ab: audit_buffer 1271 * @ab: audit_buffer
1190 * @len: lenth of string (not including trailing null) 1272 * @len: lenth of string (not including trailing null)
1191 * @string: string to be logged 1273 * @string: string to be logged
@@ -1201,25 +1283,24 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1201const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, 1283const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1202 const char *string) 1284 const char *string)
1203{ 1285{
1204 const unsigned char *p = string; 1286 const unsigned char *p;
1205 1287
1206 while (*p) { 1288 for (p = string; p < (const unsigned char *)string + len && *p; p++) {
1207 if (*p == '"' || *p < 0x21 || *p > 0x7f) { 1289 if (*p == '"' || *p < 0x21 || *p > 0x7f) {
1208 audit_log_hex(ab, string, len); 1290 audit_log_hex(ab, string, len);
1209 return string + len + 1; 1291 return string + len + 1;
1210 } 1292 }
1211 p++;
1212 } 1293 }
1213 audit_log_n_string(ab, len, string); 1294 audit_log_n_string(ab, len, string);
1214 return p + 1; 1295 return p + 1;
1215} 1296}
1216 1297
1217/** 1298/**
1218 * audit_log_unstrustedstring - log a string that may contain random characters 1299 * audit_log_untrustedstring - log a string that may contain random characters
1219 * @ab: audit_buffer 1300 * @ab: audit_buffer
1220 * @string: string to be logged 1301 * @string: string to be logged
1221 * 1302 *
1222 * Same as audit_log_n_unstrustedstring(), except that strlen is used to 1303 * Same as audit_log_n_untrustedstring(), except that strlen is used to
1223 * determine string length. 1304 * determine string length.
1224 */ 1305 */
1225const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1306const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
diff --git a/kernel/audit.h b/kernel/audit.h
index 815d6f5c04ee..95877435c347 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -115,7 +115,6 @@ extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
 extern void audit_send_reply(int pid, int seq, int type,
                              int done, int multi,
                              void *payload, int size);
-extern void audit_log_lost(const char *message);
 extern void audit_panic(const char *message);
 
 struct audit_netlink_list {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce61f423542c..359645cff5b2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -304,7 +304,7 @@ int __init audit_register_class(int class, unsigned *list)
 
 int audit_match_class(int class, unsigned syscall)
 {
-    if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32)))
+    if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
        return 0;
     if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
        return 0;
@@ -456,6 +456,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
        case AUDIT_DEVMINOR:
        case AUDIT_EXIT:
        case AUDIT_SUCCESS:
+           /* bit ops are only useful on syscall args */
+           if (f->op == AUDIT_BIT_MASK ||
+               f->op == AUDIT_BIT_TEST) {
+               err = -EINVAL;
+               goto exit_free;
+           }
+           break;
        case AUDIT_ARG0:
        case AUDIT_ARG1:
        case AUDIT_ARG2:
@@ -1210,8 +1217,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
     struct audit_entry *e;
     struct audit_field *inode_f = entry->rule.inode_f;
     struct audit_watch *watch = entry->rule.watch;
-    struct nameidata *ndp, *ndw;
-    int h, err, putnd_needed = 0;
+    struct nameidata *ndp = NULL, *ndw = NULL;
+    int h, err;
 #ifdef CONFIG_AUDITSYSCALL
     int dont_count = 0;
 
@@ -1239,7 +1246,6 @@ static inline int audit_add_rule(struct audit_entry *entry,
        err = audit_get_nd(watch->path, &ndp, &ndw);
        if (err)
            goto error;
-       putnd_needed = 1;
     }
 
     mutex_lock(&audit_filter_mutex);
@@ -1269,14 +1275,11 @@ static inline int audit_add_rule(struct audit_entry *entry,
 #endif
     mutex_unlock(&audit_filter_mutex);
 
-    if (putnd_needed)
-       audit_put_nd(ndp, ndw);
-
+    audit_put_nd(ndp, ndw);     /* NULL args OK */
     return 0;
 
 error:
-    if (putnd_needed)
-       audit_put_nd(ndp, ndw);
+    audit_put_nd(ndp, ndw);     /* NULL args OK */
     if (watch)
        audit_put_watch(watch); /* tmp watch, matches initial get */
     return err;
@@ -1570,6 +1573,10 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
        return (left > right);
     case AUDIT_GREATER_THAN_OR_EQUAL:
        return (left >= right);
+    case AUDIT_BIT_MASK:
+       return (left & right);
+    case AUDIT_BIT_TEST:
+       return ((left & right) == right);
     }
     BUG();
     return 0;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e36481ed61b4..bde1124d5908 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -71,9 +71,6 @@
71 71
72extern struct list_head audit_filter_list[]; 72extern struct list_head audit_filter_list[];
73 73
74/* No syscall auditing will take place unless audit_enabled != 0. */
75extern int audit_enabled;
76
77/* AUDIT_NAMES is the number of slots we reserve in the audit_context 74/* AUDIT_NAMES is the number of slots we reserve in the audit_context
78 * for saving names from getname(). */ 75 * for saving names from getname(). */
79#define AUDIT_NAMES 20 76#define AUDIT_NAMES 20
@@ -156,7 +153,7 @@ struct audit_aux_data_execve {
156 struct audit_aux_data d; 153 struct audit_aux_data d;
157 int argc; 154 int argc;
158 int envc; 155 int envc;
159 char mem[0]; 156 struct mm_struct *mm;
160}; 157};
161 158
162struct audit_aux_data_socketcall { 159struct audit_aux_data_socketcall {
@@ -176,12 +173,6 @@ struct audit_aux_data_fd_pair {
176 int fd[2]; 173 int fd[2];
177}; 174};
178 175
179struct audit_aux_data_path {
180 struct audit_aux_data d;
181 struct dentry *dentry;
182 struct vfsmount *mnt;
183};
184
185struct audit_aux_data_pids { 176struct audit_aux_data_pids {
186 struct audit_aux_data d; 177 struct audit_aux_data d;
187 pid_t target_pid[AUDIT_AUX_PIDS]; 178 pid_t target_pid[AUDIT_AUX_PIDS];
@@ -657,12 +648,6 @@ static inline void audit_free_aux(struct audit_context *context)
657 struct audit_aux_data *aux; 648 struct audit_aux_data *aux;
658 649
659 while ((aux = context->aux)) { 650 while ((aux = context->aux)) {
660 if (aux->type == AUDIT_AVC_PATH) {
661 struct audit_aux_data_path *axi = (void *)aux;
662 dput(axi->dentry);
663 mntput(axi->mnt);
664 }
665
666 context->aux = aux->next; 651 context->aux = aux->next;
667 kfree(aux); 652 kfree(aux);
668 } 653 }
@@ -834,6 +819,55 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
834 return rc; 819 return rc;
835} 820}
836 821
822static void audit_log_execve_info(struct audit_buffer *ab,
823 struct audit_aux_data_execve *axi)
824{
825 int i;
826 long len, ret;
827 const char __user *p = (const char __user *)axi->mm->arg_start;
828 char *buf;
829
830 if (axi->mm != current->mm)
831 return; /* execve failed, no additional info */
832
833 for (i = 0; i < axi->argc; i++, p += len) {
834 len = strnlen_user(p, MAX_ARG_STRLEN);
835 /*
836 * We just created this mm, if we can't find the strings
837 * we just copied into it something is _very_ wrong. Similar
838 * for strings that are too long, we should not have created
839 * any.
840 */
841 if (!len || len > MAX_ARG_STRLEN) {
842 WARN_ON(1);
843 send_sig(SIGKILL, current, 0);
844 }
845
846 buf = kmalloc(len, GFP_KERNEL);
847 if (!buf) {
848 audit_panic("out of memory for argv string\n");
849 break;
850 }
851
852 ret = copy_from_user(buf, p, len);
853 /*
854 * There is no reason for this copy to be short. We just
855 * copied them here, and the mm hasn't been exposed to user-
856 * space yet.
857 */
858 if (!ret) {
859 WARN_ON(1);
860 send_sig(SIGKILL, current, 0);
861 }
862
863 audit_log_format(ab, "a%d=", i);
864 audit_log_untrustedstring(ab, buf);
865 audit_log_format(ab, "\n");
866
867 kfree(buf);
868 }
869}
870
837static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 871static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
838{ 872{
839 int i, call_panic = 0; 873 int i, call_panic = 0;
@@ -949,7 +983,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
949 case AUDIT_IPC: { 983 case AUDIT_IPC: {
950 struct audit_aux_data_ipcctl *axi = (void *)aux; 984 struct audit_aux_data_ipcctl *axi = (void *)aux;
951 audit_log_format(ab, 985 audit_log_format(ab,
952 "ouid=%u ogid=%u mode=%x", 986 "ouid=%u ogid=%u mode=%#o",
953 axi->uid, axi->gid, axi->mode); 987 axi->uid, axi->gid, axi->mode);
954 if (axi->osid != 0) { 988 if (axi->osid != 0) {
955 char *ctx = NULL; 989 char *ctx = NULL;
@@ -968,19 +1002,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
968 case AUDIT_IPC_SET_PERM: { 1002 case AUDIT_IPC_SET_PERM: {
969 struct audit_aux_data_ipcctl *axi = (void *)aux; 1003 struct audit_aux_data_ipcctl *axi = (void *)aux;
970 audit_log_format(ab, 1004 audit_log_format(ab,
971 "qbytes=%lx ouid=%u ogid=%u mode=%x", 1005 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
972 axi->qbytes, axi->uid, axi->gid, axi->mode); 1006 axi->qbytes, axi->uid, axi->gid, axi->mode);
973 break; } 1007 break; }
974 1008
975 case AUDIT_EXECVE: { 1009 case AUDIT_EXECVE: {
976 struct audit_aux_data_execve *axi = (void *)aux; 1010 struct audit_aux_data_execve *axi = (void *)aux;
977 int i; 1011 audit_log_execve_info(ab, axi);
978 const char *p;
979 for (i = 0, p = axi->mem; i < axi->argc; i++) {
980 audit_log_format(ab, "a%d=", i);
981 p = audit_log_untrustedstring(ab, p);
982 audit_log_format(ab, "\n");
983 }
984 break; } 1012 break; }
985 1013
986 case AUDIT_SOCKETCALL: { 1014 case AUDIT_SOCKETCALL: {
@@ -998,11 +1026,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
998 audit_log_hex(ab, axs->a, axs->len); 1026 audit_log_hex(ab, axs->a, axs->len);
999 break; } 1027 break; }
1000 1028
1001 case AUDIT_AVC_PATH: {
1002 struct audit_aux_data_path *axi = (void *)aux;
1003 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
1004 break; }
1005
1006 case AUDIT_FD_PAIR: { 1029 case AUDIT_FD_PAIR: {
1007 struct audit_aux_data_fd_pair *axs = (void *)aux; 1030 struct audit_aux_data_fd_pair *axs = (void *)aux;
1008 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); 1031 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
@@ -1824,32 +1847,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode
1824 return 0; 1847 return 0;
1825} 1848}
1826 1849
1850int audit_argv_kb = 32;
1851
1827int audit_bprm(struct linux_binprm *bprm) 1852int audit_bprm(struct linux_binprm *bprm)
1828{ 1853{
1829 struct audit_aux_data_execve *ax; 1854 struct audit_aux_data_execve *ax;
1830 struct audit_context *context = current->audit_context; 1855 struct audit_context *context = current->audit_context;
1831 unsigned long p, next;
1832 void *to;
1833 1856
1834 if (likely(!audit_enabled || !context || context->dummy)) 1857 if (likely(!audit_enabled || !context || context->dummy))
1835 return 0; 1858 return 0;
1836 1859
1837 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, 1860 /*
1838 GFP_KERNEL); 1861 * Even though the stack code doesn't limit the arg+env size any more,
1862 * the audit code requires that _all_ arguments be logged in a single
1863 * netlink skb. Hence cap it :-(
1864 */
1865 if (bprm->argv_len > (audit_argv_kb << 10))
1866 return -E2BIG;
1867
1868 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
1839 if (!ax) 1869 if (!ax)
1840 return -ENOMEM; 1870 return -ENOMEM;
1841 1871
1842 ax->argc = bprm->argc; 1872 ax->argc = bprm->argc;
1843 ax->envc = bprm->envc; 1873 ax->envc = bprm->envc;
1844 for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { 1874 ax->mm = bprm->mm;
1845 struct page *page = bprm->page[p / PAGE_SIZE];
1846 void *kaddr = kmap(page);
1847 next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1848 memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
1849 to += next - p;
1850 kunmap(page);
1851 }
1852
1853 ax->d.type = AUDIT_EXECVE; 1875 ax->d.type = AUDIT_EXECVE;
1854 ax->d.next = context->aux; 1876 ax->d.next = context->aux;
1855 context->aux = (void *)ax; 1877 context->aux = (void *)ax;
@@ -1952,36 +1974,6 @@ void __audit_ptrace(struct task_struct *t)
1952} 1974}
1953 1975
1954/** 1976/**
1955 * audit_avc_path - record the granting or denial of permissions
1956 * @dentry: dentry to record
1957 * @mnt: mnt to record
1958 *
1959 * Returns 0 for success or NULL context or < 0 on error.
1960 *
1961 * Called from security/selinux/avc.c::avc_audit()
1962 */
1963int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1964{
1965 struct audit_aux_data_path *ax;
1966 struct audit_context *context = current->audit_context;
1967
1968 if (likely(!context))
1969 return 0;
1970
1971 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1972 if (!ax)
1973 return -ENOMEM;
1974
1975 ax->dentry = dget(dentry);
1976 ax->mnt = mntget(mnt);
1977
1978 ax->d.type = AUDIT_AVC_PATH;
1979 ax->d.next = context->aux;
1980 context->aux = (void *)ax;
1981 return 0;
1982}
1983
1984/**
1985 * audit_signal_info - record signal info for shutting down audit subsystem 1977 * audit_signal_info - record signal info for shutting down audit subsystem
1986 * @sig: signal value 1978 * @sig: signal value
1987 * @t: task being signaled 1979 * @t: task being signaled
@@ -2040,7 +2032,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2040 2032
2041/** 2033/**
2042 * audit_core_dumps - record information about processes that end abnormally 2034 * audit_core_dumps - record information about processes that end abnormally
2043 * @sig: signal value 2035 * @signr: signal value
2044 * 2036 *
2045 * If a process ends with a core dump, something fishy is going on and we 2037 * If a process ends with a core dump, something fishy is going on and we
2046 * should record the event for investigation. 2038 * should record the event for investigation.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 208cf3497c10..181ae7086029 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu)
     write_unlock_irq(&tasklist_lock);
 }
 
+struct take_cpu_down_param {
+    unsigned long mod;
+    void *hcpu;
+};
+
 /* Take this CPU down. */
-static int take_cpu_down(void *unused)
+static int take_cpu_down(void *_param)
 {
+    struct take_cpu_down_param *param = _param;
     int err;
 
+    raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+                            param->hcpu);
     /* Ensure this CPU doesn't handle any more interrupts. */
     err = __cpu_disable();
     if (err < 0)
@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
     cpumask_t old_allowed, tmp;
     void *hcpu = (void *)(long)cpu;
     unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+    struct take_cpu_down_param tcd_param = {
+        .mod = mod,
+        .hcpu = hcpu,
+    };
 
     if (num_online_cpus() == 1)
        return -EBUSY;
@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
     set_cpus_allowed(current, tmp);
 
     mutex_lock(&cpu_bitmask_lock);
-    p = __stop_machine_run(take_cpu_down, NULL, cpu);
+    p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
     mutex_unlock(&cpu_bitmask_lock);
 
     if (IS_ERR(p) || cpu_online(cpu)) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4c49188cc49b..57e6448b171e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
     envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
     envp[i] = NULL;
 
-    call_usermodehelper(argv[0], argv, envp, 0);
+    call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
     kfree(pathbuf);
 }
 
@@ -981,10 +981,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
        mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
        if (!mmarray)
            goto done;
-       write_lock_irq(&tasklist_lock);  /* block fork */
+       read_lock(&tasklist_lock);       /* block fork */
        if (atomic_read(&cs->count) <= ntasks)
            break;                       /* got enough */
-       write_unlock_irq(&tasklist_lock); /* try again */
+       read_unlock(&tasklist_lock);     /* try again */
        kfree(mmarray);
     }
 
@@ -1006,7 +1006,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
            continue;
        mmarray[n++] = mm;
     } while_each_thread(g, p);
-    write_unlock_irq(&tasklist_lock);
+    read_unlock(&tasklist_lock);
 
     /*
      * Now that we've dropped the tasklist spinlock, we can
@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void)
 static int cpuset_handle_cpuhp(struct notifier_block *nb,
                               unsigned long phase, void *cpu)
 {
+    if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
+        return NOTIFY_DONE;
+
     common_cpu_mem_hotplug_unplug();
     return 0;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index ca6a11b73023..464c2b172f07 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 32#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 33#include <linux/delayacct.h>
34#include <linux/freezer.h>
34#include <linux/cpuset.h> 35#include <linux/cpuset.h>
35#include <linux/syscalls.h> 36#include <linux/syscalls.h>
36#include <linux/signal.h> 37#include <linux/signal.h>
@@ -44,6 +45,7 @@
44#include <linux/resource.h> 45#include <linux/resource.h>
45#include <linux/blkdev.h> 46#include <linux/blkdev.h>
46#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/freezer.h>
47 49
48#include <asm/uaccess.h> 50#include <asm/uaccess.h>
49#include <asm/unistd.h> 51#include <asm/unistd.h>
@@ -387,6 +389,11 @@ void daemonize(const char *name, ...)
387 * they would be locked into memory. 389 * they would be locked into memory.
388 */ 390 */
389 exit_mm(current); 391 exit_mm(current);
392 /*
393 * We don't want to have TIF_FREEZE set if the system-wide hibernation
394 * or suspend transition begins right now.
395 */
396 current->flags |= PF_NOFREEZE;
390 397
391 set_special_pids(1, 1); 398 set_special_pids(1, 1);
392 proc_clear_tty(current); 399 proc_clear_tty(current);
@@ -588,6 +595,8 @@ static void exit_mm(struct task_struct * tsk)
588 tsk->mm = NULL; 595 tsk->mm = NULL;
589 up_read(&mm->mmap_sem); 596 up_read(&mm->mmap_sem);
590 enter_lazy_tlb(mm, current); 597 enter_lazy_tlb(mm, current);
598 /* We don't want this task to be frozen prematurely */
599 clear_freeze_flag(tsk);
591 task_unlock(tsk); 600 task_unlock(tsk);
592 mmput(mm); 601 mmput(mm);
593} 602}
@@ -858,6 +867,34 @@ static void exit_notify(struct task_struct *tsk)
858 release_task(tsk); 867 release_task(tsk);
859} 868}
860 869
870#ifdef CONFIG_DEBUG_STACK_USAGE
871static void check_stack_usage(void)
872{
873 static DEFINE_SPINLOCK(low_water_lock);
874 static int lowest_to_date = THREAD_SIZE;
875 unsigned long *n = end_of_stack(current);
876 unsigned long free;
877
878 while (*n == 0)
879 n++;
880 free = (unsigned long)n - (unsigned long)end_of_stack(current);
881
882 if (free >= lowest_to_date)
883 return;
884
885 spin_lock(&low_water_lock);
886 if (free < lowest_to_date) {
887 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
888 "left\n",
889 current->comm, free);
890 lowest_to_date = free;
891 }
892 spin_unlock(&low_water_lock);
893}
894#else
895static inline void check_stack_usage(void) {}
896#endif
897
861fastcall NORET_TYPE void do_exit(long code) 898fastcall NORET_TYPE void do_exit(long code)
862{ 899{
863 struct task_struct *tsk = current; 900 struct task_struct *tsk = current;
@@ -937,6 +974,8 @@ fastcall NORET_TYPE void do_exit(long code)
937 if (unlikely(tsk->compat_robust_list)) 974 if (unlikely(tsk->compat_robust_list))
938 compat_exit_robust_list(tsk); 975 compat_exit_robust_list(tsk);
939#endif 976#endif
977 if (group_dead)
978 tty_audit_exit();
940 if (unlikely(tsk->audit_context)) 979 if (unlikely(tsk->audit_context))
941 audit_free(tsk); 980 audit_free(tsk);
942 981
@@ -949,6 +988,7 @@ fastcall NORET_TYPE void do_exit(long code)
949 exit_sem(tsk); 988 exit_sem(tsk);
950 __exit_files(tsk); 989 __exit_files(tsk);
951 __exit_fs(tsk); 990 __exit_fs(tsk);
991 check_stack_usage();
952 exit_thread(); 992 exit_thread();
953 cpuset_exit(tsk); 993 cpuset_exit(tsk);
954 exit_keys(tsk); 994 exit_keys(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index da3a155bba0d..7332e236d367 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/delayacct.h> 49#include <linux/delayacct.h>
50#include <linux/taskstats_kern.h> 50#include <linux/taskstats_kern.h>
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h>
52 53
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/pgalloc.h> 55#include <asm/pgalloc.h>
@@ -136,7 +137,7 @@ void __init fork_init(unsigned long mempages)
136 /* create a slab on which task_structs can be allocated */ 137 /* create a slab on which task_structs can be allocated */
137 task_struct_cachep = 138 task_struct_cachep =
138 kmem_cache_create("task_struct", sizeof(struct task_struct), 139 kmem_cache_create("task_struct", sizeof(struct task_struct),
139 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); 140 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
140#endif 141#endif
141 142
142 /* 143 /*
@@ -333,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
333 atomic_set(&mm->mm_count, 1); 334 atomic_set(&mm->mm_count, 1);
334 init_rwsem(&mm->mmap_sem); 335 init_rwsem(&mm->mmap_sem);
335 INIT_LIST_HEAD(&mm->mmlist); 336 INIT_LIST_HEAD(&mm->mmlist);
337 mm->flags = (current->mm) ? current->mm->flags
338 : MMF_DUMP_FILTER_DEFAULT;
336 mm->core_waiters = 0; 339 mm->core_waiters = 0;
337 mm->nr_ptes = 0; 340 mm->nr_ptes = 0;
338 set_mm_counter(mm, file_rss, 0); 341 set_mm_counter(mm, file_rss, 0);
@@ -897,6 +900,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
897 } 900 }
898 acct_init_pacct(&sig->pacct); 901 acct_init_pacct(&sig->pacct);
899 902
903 tty_audit_fork(sig);
904
900 return 0; 905 return 0;
901} 906}
902 907
@@ -920,7 +925,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
920{ 925{
921 unsigned long new_flags = p->flags; 926 unsigned long new_flags = p->flags;
922 927
923 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); 928 new_flags &= ~PF_SUPERPRIV;
924 new_flags |= PF_FORKNOEXEC; 929 new_flags |= PF_FORKNOEXEC;
925 if (!(clone_flags & CLONE_PTRACE)) 930 if (!(clone_flags & CLONE_PTRACE))
926 p->ptrace = 0; 931 p->ptrace = 0;
@@ -999,7 +1004,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
999 if (atomic_read(&p->user->processes) >= 1004 if (atomic_read(&p->user->processes) >=
1000 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1005 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
1001 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1006 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1002 p->user != &root_user) 1007 p->user != current->nsproxy->user_ns->root_user)
1003 goto bad_fork_free; 1008 goto bad_fork_free;
1004 } 1009 }
1005 1010
@@ -1059,6 +1064,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1059 1064
1060 p->lock_depth = -1; /* -1 = no lock */ 1065 p->lock_depth = -1; /* -1 = no lock */
1061 do_posix_clock_monotonic_gettime(&p->start_time); 1066 do_posix_clock_monotonic_gettime(&p->start_time);
1067 p->real_start_time = p->start_time;
1068 monotonic_to_bootbased(&p->real_start_time);
1062 p->security = NULL; 1069 p->security = NULL;
1063 p->io_context = NULL; 1070 p->io_context = NULL;
1064 p->io_wait = NULL; 1071 p->io_wait = NULL;
@@ -1439,22 +1446,22 @@ void __init proc_caches_init(void)
1439 sighand_cachep = kmem_cache_create("sighand_cache", 1446 sighand_cachep = kmem_cache_create("sighand_cache",
1440 sizeof(struct sighand_struct), 0, 1447 sizeof(struct sighand_struct), 0,
1441 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1448 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1442 sighand_ctor, NULL); 1449 sighand_ctor);
1443 signal_cachep = kmem_cache_create("signal_cache", 1450 signal_cachep = kmem_cache_create("signal_cache",
1444 sizeof(struct signal_struct), 0, 1451 sizeof(struct signal_struct), 0,
1445 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1452 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1446 files_cachep = kmem_cache_create("files_cache", 1453 files_cachep = kmem_cache_create("files_cache",
1447 sizeof(struct files_struct), 0, 1454 sizeof(struct files_struct), 0,
1448 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1455 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1449 fs_cachep = kmem_cache_create("fs_cache", 1456 fs_cachep = kmem_cache_create("fs_cache",
1450 sizeof(struct fs_struct), 0, 1457 sizeof(struct fs_struct), 0,
1451 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1458 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1452 vm_area_cachep = kmem_cache_create("vm_area_struct", 1459 vm_area_cachep = kmem_cache_create("vm_area_struct",
1453 sizeof(struct vm_area_struct), 0, 1460 sizeof(struct vm_area_struct), 0,
1454 SLAB_PANIC, NULL, NULL); 1461 SLAB_PANIC, NULL);
1455 mm_cachep = kmem_cache_create("mm_struct", 1462 mm_cachep = kmem_cache_create("mm_struct",
1456 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1463 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1457 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1464 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1458} 1465}
1459 1466
1460/* 1467/*
@@ -1601,7 +1608,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1601 err = -EINVAL; 1608 err = -EINVAL;
1602 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1609 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1603 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1610 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1604 CLONE_NEWUTS|CLONE_NEWIPC)) 1611 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
1605 goto bad_unshare_out; 1612 goto bad_unshare_out;
1606 1613
1607 if ((err = unshare_thread(unshare_flags))) 1614 if ((err = unshare_thread(unshare_flags)))
diff --git a/kernel/futex.c b/kernel/futex.c
index 45490bec5831..a12425051ee9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -121,6 +121,24 @@ static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
121static struct vfsmount *futex_mnt; 121static struct vfsmount *futex_mnt;
122 122
123/* 123/*
124 * Take mm->mmap_sem, when futex is shared
125 */
126static inline void futex_lock_mm(struct rw_semaphore *fshared)
127{
128 if (fshared)
129 down_read(fshared);
130}
131
132/*
133 * Release mm->mmap_sem, when the futex is shared
134 */
135static inline void futex_unlock_mm(struct rw_semaphore *fshared)
136{
137 if (fshared)
138 up_read(fshared);
139}
140
141/*
124 * We hash on the keys returned from get_futex_key (see below). 142 * We hash on the keys returned from get_futex_key (see below).
125 */ 143 */
126static struct futex_hash_bucket *hash_futex(union futex_key *key) 144static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -287,7 +305,18 @@ void drop_futex_key_refs(union futex_key *key)
287} 305}
288EXPORT_SYMBOL_GPL(drop_futex_key_refs); 306EXPORT_SYMBOL_GPL(drop_futex_key_refs);
289 307
290static inline int get_futex_value_locked(u32 *dest, u32 __user *from) 308static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
309{
310 u32 curval;
311
312 pagefault_disable();
313 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
314 pagefault_enable();
315
316 return curval;
317}
318
319static int get_futex_value_locked(u32 *dest, u32 __user *from)
291{ 320{
292 int ret; 321 int ret;
293 322
@@ -317,15 +346,20 @@ static int futex_handle_fault(unsigned long address,
317 vma = find_vma(mm, address); 346 vma = find_vma(mm, address);
318 if (vma && address >= vma->vm_start && 347 if (vma && address >= vma->vm_start &&
319 (vma->vm_flags & VM_WRITE)) { 348 (vma->vm_flags & VM_WRITE)) {
320 switch (handle_mm_fault(mm, vma, address, 1)) { 349 int fault;
321 case VM_FAULT_MINOR: 350 fault = handle_mm_fault(mm, vma, address, 1);
322 ret = 0; 351 if (unlikely((fault & VM_FAULT_ERROR))) {
323 current->min_flt++; 352#if 0
324 break; 353 /* XXX: let's do this when we verify it is OK */
325 case VM_FAULT_MAJOR: 354 if (ret & VM_FAULT_OOM)
355 ret = -ENOMEM;
356#endif
357 } else {
326 ret = 0; 358 ret = 0;
327 current->maj_flt++; 359 if (fault & VM_FAULT_MAJOR)
328 break; 360 current->maj_flt++;
361 else
362 current->min_flt++;
329 } 363 }
330 } 364 }
331 if (!fshared) 365 if (!fshared)
@@ -620,9 +654,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
620 654
621 newval = FUTEX_WAITERS | new_owner->pid; 655 newval = FUTEX_WAITERS | new_owner->pid;
622 656
623 pagefault_disable(); 657 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
624 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
625 pagefault_enable();
626 658
627 if (curval == -EFAULT) 659 if (curval == -EFAULT)
628 ret = -EFAULT; 660 ret = -EFAULT;
@@ -659,9 +691,7 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
659 * There is no waiter, so we unlock the futex. The owner died 691 * There is no waiter, so we unlock the futex. The owner died
660 * bit has not to be preserved here. We are the owner: 692 * bit has not to be preserved here. We are the owner:
661 */ 693 */
662 pagefault_disable(); 694 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
663 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
664 pagefault_enable();
665 695
666 if (oldval == -EFAULT) 696 if (oldval == -EFAULT)
667 return oldval; 697 return oldval;
@@ -700,8 +730,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
700 union futex_key key; 730 union futex_key key;
701 int ret; 731 int ret;
702 732
703 if (fshared) 733 futex_lock_mm(fshared);
704 down_read(fshared);
705 734
706 ret = get_futex_key(uaddr, fshared, &key); 735 ret = get_futex_key(uaddr, fshared, &key);
707 if (unlikely(ret != 0)) 736 if (unlikely(ret != 0))
@@ -725,8 +754,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
725 754
726 spin_unlock(&hb->lock); 755 spin_unlock(&hb->lock);
727out: 756out:
728 if (fshared) 757 futex_unlock_mm(fshared);
729 up_read(fshared);
730 return ret; 758 return ret;
731} 759}
732 760
@@ -746,8 +774,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
746 int ret, op_ret, attempt = 0; 774 int ret, op_ret, attempt = 0;
747 775
748retryfull: 776retryfull:
749 if (fshared) 777 futex_lock_mm(fshared);
750 down_read(fshared);
751 778
752 ret = get_futex_key(uaddr1, fshared, &key1); 779 ret = get_futex_key(uaddr1, fshared, &key1);
753 if (unlikely(ret != 0)) 780 if (unlikely(ret != 0))
@@ -793,7 +820,7 @@ retry:
793 */ 820 */
794 if (attempt++) { 821 if (attempt++) {
795 ret = futex_handle_fault((unsigned long)uaddr2, 822 ret = futex_handle_fault((unsigned long)uaddr2,
796 fshared, attempt); 823 fshared, attempt);
797 if (ret) 824 if (ret)
798 goto out; 825 goto out;
799 goto retry; 826 goto retry;
@@ -803,8 +830,7 @@ retry:
803 * If we would have faulted, release mmap_sem, 830 * If we would have faulted, release mmap_sem,
804 * fault it in and start all over again. 831 * fault it in and start all over again.
805 */ 832 */
806 if (fshared) 833 futex_unlock_mm(fshared);
807 up_read(fshared);
808 834
809 ret = get_user(dummy, uaddr2); 835 ret = get_user(dummy, uaddr2);
810 if (ret) 836 if (ret)
@@ -841,8 +867,8 @@ retry:
841 if (hb1 != hb2) 867 if (hb1 != hb2)
842 spin_unlock(&hb2->lock); 868 spin_unlock(&hb2->lock);
843out: 869out:
844 if (fshared) 870 futex_unlock_mm(fshared);
845 up_read(fshared); 871
846 return ret; 872 return ret;
847} 873}
848 874
@@ -861,8 +887,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
861 int ret, drop_count = 0; 887 int ret, drop_count = 0;
862 888
863 retry: 889 retry:
864 if (fshared) 890 futex_lock_mm(fshared);
865 down_read(fshared);
866 891
867 ret = get_futex_key(uaddr1, fshared, &key1); 892 ret = get_futex_key(uaddr1, fshared, &key1);
868 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
@@ -890,8 +915,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
890 * If we would have faulted, release mmap_sem, fault 915 * If we would have faulted, release mmap_sem, fault
891 * it in and start all over again. 916 * it in and start all over again.
892 */ 917 */
893 if (fshared) 918 futex_unlock_mm(fshared);
894 up_read(fshared);
895 919
896 ret = get_user(curval, uaddr1); 920 ret = get_user(curval, uaddr1);
897 921
@@ -944,8 +968,7 @@ out_unlock:
944 drop_futex_key_refs(&key1); 968 drop_futex_key_refs(&key1);
945 969
946out: 970out:
947 if (fshared) 971 futex_unlock_mm(fshared);
948 up_read(fshared);
949 return ret; 972 return ret;
950} 973}
951 974
@@ -1113,10 +1136,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1113 while (!ret) { 1136 while (!ret) {
1114 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1137 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1115 1138
1116 pagefault_disable(); 1139 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1117 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1118 uval, newval);
1119 pagefault_enable();
1120 1140
1121 if (curval == -EFAULT) 1141 if (curval == -EFAULT)
1122 ret = -EFAULT; 1142 ret = -EFAULT;
@@ -1134,6 +1154,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1134#define ARG3_SHARED 1 1154#define ARG3_SHARED 1
1135 1155
1136static long futex_wait_restart(struct restart_block *restart); 1156static long futex_wait_restart(struct restart_block *restart);
1157
1137static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1158static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1138 u32 val, ktime_t *abs_time) 1159 u32 val, ktime_t *abs_time)
1139{ 1160{
@@ -1148,8 +1169,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1148 1169
1149 q.pi_state = NULL; 1170 q.pi_state = NULL;
1150 retry: 1171 retry:
1151 if (fshared) 1172 futex_lock_mm(fshared);
1152 down_read(fshared);
1153 1173
1154 ret = get_futex_key(uaddr, fshared, &q.key); 1174 ret = get_futex_key(uaddr, fshared, &q.key);
1155 if (unlikely(ret != 0)) 1175 if (unlikely(ret != 0))
@@ -1186,8 +1206,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1186 * If we would have faulted, release mmap_sem, fault it in and 1206 * If we would have faulted, release mmap_sem, fault it in and
1187 * start all over again. 1207 * start all over again.
1188 */ 1208 */
1189 if (fshared) 1209 futex_unlock_mm(fshared);
1190 up_read(fshared);
1191 1210
1192 ret = get_user(uval, uaddr); 1211 ret = get_user(uval, uaddr);
1193 1212
@@ -1206,8 +1225,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1206 * Now the futex is queued and we have checked the data, we 1225 * Now the futex is queued and we have checked the data, we
1207 * don't want to hold mmap_sem while we sleep. 1226 * don't want to hold mmap_sem while we sleep.
1208 */ 1227 */
1209 if (fshared) 1228 futex_unlock_mm(fshared);
1210 up_read(fshared);
1211 1229
1212 /* 1230 /*
1213 * There might have been scheduling since the queue_me(), as we 1231 * There might have been scheduling since the queue_me(), as we
@@ -1285,8 +1303,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1285 queue_unlock(&q, hb); 1303 queue_unlock(&q, hb);
1286 1304
1287 out_release_sem: 1305 out_release_sem:
1288 if (fshared) 1306 futex_unlock_mm(fshared);
1289 up_read(fshared);
1290 return ret; 1307 return ret;
1291} 1308}
1292 1309
@@ -1333,8 +1350,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 1350
1334 q.pi_state = NULL; 1351 q.pi_state = NULL;
1335 retry: 1352 retry:
1336 if (fshared) 1353 futex_lock_mm(fshared);
1337 down_read(fshared);
1338 1354
1339 ret = get_futex_key(uaddr, fshared, &q.key); 1355 ret = get_futex_key(uaddr, fshared, &q.key);
1340 if (unlikely(ret != 0)) 1356 if (unlikely(ret != 0))
@@ -1353,9 +1369,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1353 */ 1369 */
1354 newval = current->pid; 1370 newval = current->pid;
1355 1371
1356 pagefault_disable(); 1372 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1357 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1358 pagefault_enable();
1359 1373
1360 if (unlikely(curval == -EFAULT)) 1374 if (unlikely(curval == -EFAULT))
1361 goto uaddr_faulted; 1375 goto uaddr_faulted;
@@ -1398,9 +1412,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1398 lock_taken = 1; 1412 lock_taken = 1;
1399 } 1413 }
1400 1414
1401 pagefault_disable(); 1415 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1402 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1403 pagefault_enable();
1404 1416
1405 if (unlikely(curval == -EFAULT)) 1417 if (unlikely(curval == -EFAULT))
1406 goto uaddr_faulted; 1418 goto uaddr_faulted;
@@ -1428,8 +1440,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1428 * exit to complete. 1440 * exit to complete.
1429 */ 1441 */
1430 queue_unlock(&q, hb); 1442 queue_unlock(&q, hb);
1431 if (fshared) 1443 futex_unlock_mm(fshared);
1432 up_read(fshared);
1433 cond_resched(); 1444 cond_resched();
1434 goto retry; 1445 goto retry;
1435 1446
@@ -1465,8 +1476,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1465 * Now the futex is queued and we have checked the data, we 1476 * Now the futex is queued and we have checked the data, we
1466 * don't want to hold mmap_sem while we sleep. 1477 * don't want to hold mmap_sem while we sleep.
1467 */ 1478 */
1468 if (fshared) 1479 futex_unlock_mm(fshared);
1469 up_read(fshared);
1470 1480
1471 WARN_ON(!q.pi_state); 1481 WARN_ON(!q.pi_state);
1472 /* 1482 /*
@@ -1480,8 +1490,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1480 ret = ret ? 0 : -EWOULDBLOCK; 1490 ret = ret ? 0 : -EWOULDBLOCK;
1481 } 1491 }
1482 1492
1483 if (fshared) 1493 futex_lock_mm(fshared);
1484 down_read(fshared);
1485 spin_lock(q.lock_ptr); 1494 spin_lock(q.lock_ptr);
1486 1495
1487 if (!ret) { 1496 if (!ret) {
@@ -1518,8 +1527,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1518 1527
1519 /* Unqueue and drop the lock */ 1528 /* Unqueue and drop the lock */
1520 unqueue_me_pi(&q); 1529 unqueue_me_pi(&q);
1521 if (fshared) 1530 futex_unlock_mm(fshared);
1522 up_read(fshared);
1523 1531
1524 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1532 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1525 1533
@@ -1527,8 +1535,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1527 queue_unlock(&q, hb); 1535 queue_unlock(&q, hb);
1528 1536
1529 out_release_sem: 1537 out_release_sem:
1530 if (fshared) 1538 futex_unlock_mm(fshared);
1531 up_read(fshared);
1532 return ret; 1539 return ret;
1533 1540
1534 uaddr_faulted: 1541 uaddr_faulted:
@@ -1550,8 +1557,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1550 goto retry_unlocked; 1557 goto retry_unlocked;
1551 } 1558 }
1552 1559
1553 if (fshared) 1560 futex_unlock_mm(fshared);
1554 up_read(fshared);
1555 1561
1556 ret = get_user(uval, uaddr); 1562 ret = get_user(uval, uaddr);
1557 if (!ret && (uval != -EFAULT)) 1563 if (!ret && (uval != -EFAULT))
@@ -1585,8 +1591,7 @@ retry:
1585 /* 1591 /*
1586 * First take all the futex related locks: 1592 * First take all the futex related locks:
1587 */ 1593 */
1588 if (fshared) 1594 futex_lock_mm(fshared);
1589 down_read(fshared);
1590 1595
1591 ret = get_futex_key(uaddr, fshared, &key); 1596 ret = get_futex_key(uaddr, fshared, &key);
1592 if (unlikely(ret != 0)) 1597 if (unlikely(ret != 0))
@@ -1601,11 +1606,9 @@ retry_unlocked:
1601 * again. If it succeeds then we can return without waking 1606 * again. If it succeeds then we can return without waking
1602 * anyone else up: 1607 * anyone else up:
1603 */ 1608 */
1604 if (!(uval & FUTEX_OWNER_DIED)) { 1609 if (!(uval & FUTEX_OWNER_DIED))
1605 pagefault_disable(); 1610 uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0);
1606 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1611
1607 pagefault_enable();
1608 }
1609 1612
1610 if (unlikely(uval == -EFAULT)) 1613 if (unlikely(uval == -EFAULT))
1611 goto pi_faulted; 1614 goto pi_faulted;
@@ -1647,8 +1650,7 @@ retry_unlocked:
1647out_unlock: 1650out_unlock:
1648 spin_unlock(&hb->lock); 1651 spin_unlock(&hb->lock);
1649out: 1652out:
1650 if (fshared) 1653 futex_unlock_mm(fshared);
1651 up_read(fshared);
1652 1654
1653 return ret; 1655 return ret;
1654 1656
@@ -1671,8 +1673,7 @@ pi_faulted:
1671 goto retry_unlocked; 1673 goto retry_unlocked;
1672 } 1674 }
1673 1675
1674 if (fshared) 1676 futex_unlock_mm(fshared);
1675 up_read(fshared);
1676 1677
1677 ret = get_user(uval, uaddr); 1678 ret = get_user(uval, uaddr);
1678 if (!ret && (uval != -EFAULT)) 1679 if (!ret && (uval != -EFAULT))
@@ -1729,8 +1730,8 @@ static int futex_fd(u32 __user *uaddr, int signal)
1729 1730
1730 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 1731 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1731 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " 1732 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1732 "will be removed from the kernel in June 2007\n", 1733 "will be removed from the kernel in June 2007\n",
1733 current->comm); 1734 current->comm);
1734 } 1735 }
1735 1736
1736 ret = -EINVAL; 1737 ret = -EINVAL;
@@ -1908,10 +1909,8 @@ retry:
1908 * Wake robust non-PI futexes here. The wakeup of 1909 * Wake robust non-PI futexes here. The wakeup of
1909 * PI futexes happens in exit_pi_state(): 1910 * PI futexes happens in exit_pi_state():
1910 */ 1911 */
1911 if (!pi) { 1912 if (!pi && (uval & FUTEX_WAITERS))
1912 if (uval & FUTEX_WAITERS)
1913 futex_wake(uaddr, &curr->mm->mmap_sem, 1); 1913 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1914 }
1915 } 1914 }
1916 return 0; 1915 return 0;
1917} 1916}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 23c03f43e196..eb1ddebd2c04 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -558,7 +558,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
  */
 static int hrtimer_switch_to_hres(void)
 {
-    struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+    int cpu = smp_processor_id();
+    struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
     unsigned long flags;
 
     if (base->hres_active)
@@ -568,6 +569,8 @@ static int hrtimer_switch_to_hres(void)
 
     if (tick_init_highres()) {
        local_irq_restore(flags);
+       printk(KERN_WARNING "Could not switch to high resolution "
+              "mode on CPU %d\n", cpu);
        return 0;
     }
     base->hres_active = 1;
@@ -683,6 +686,7 @@ static void enqueue_hrtimer(struct hrtimer *timer,
     struct rb_node **link = &base->active.rb_node;
     struct rb_node *parent = NULL;
     struct hrtimer *entry;
+    int leftmost = 1;
 
     /*
      * Find the right place in the rbtree:
@@ -694,18 +698,19 @@ static void enqueue_hrtimer(struct hrtimer *timer,
         * We dont care about collisions. Nodes with
         * the same expiry time stay together.
         */
-       if (timer->expires.tv64 < entry->expires.tv64)
+       if (timer->expires.tv64 < entry->expires.tv64) {
            link = &(*link)->rb_left;
-       else
+       } else {
            link = &(*link)->rb_right;
+           leftmost = 0;
+       }
     }
 
     /*
      * Insert the timer to the rbtree and check whether it
      * replaces the first pending timer
      */
-    if (!base->first || timer->expires.tv64 <
-        rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
+    if (leftmost) {
        /*
         * Reprogram the clock event device. When the timer is already
         * expired hrtimer_enqueue_reprogram has either called the
@@ -1406,7 +1411,7 @@ static void migrate_hrtimers(int cpu)
 static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
                                        unsigned long action, void *hcpu)
 {
-    long cpu = (long)hcpu;
+    unsigned int cpu = (long)hcpu;
 
     switch (action) {
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index b4f1674fca79..50b81b98046a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
                                  int count, int *eof, void *data)
 {
-    int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
+    struct irq_desc *desc = irq_desc + (long)data;
+    cpumask_t *mask = &desc->affinity;
+    int len;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+    if (desc->status & IRQ_MOVE_PENDING)
+        mask = &desc->pending_mask;
+#endif
+    len = cpumask_scnprintf(page, count, *mask);
 
     if (count - len < 2)
        return -EINVAL;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd9e272d55e9..32b161972fad 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -172,7 +172,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
172 irqreturn_t action_ret) 172 irqreturn_t action_ret)
173{ 173{
174 if (unlikely(action_ret != IRQ_HANDLED)) { 174 if (unlikely(action_ret != IRQ_HANDLED)) {
175 desc->irqs_unhandled++; 175 /*
176 * If we are seeing only the odd spurious IRQ caused by
177 * bus asynchronicity then don't eventually trigger an error,
 178 * otherwise the counter becomes a doomsday timer for otherwise
179 * working systems
180 */
181 if (jiffies - desc->last_unhandled > HZ/10)
182 desc->irqs_unhandled = 1;
183 else
184 desc->irqs_unhandled++;
185 desc->last_unhandled = jiffies;
176 if (unlikely(action_ret != IRQ_NONE)) 186 if (unlikely(action_ret != IRQ_NONE))
177 report_bad_irq(irq, desc, action_ret); 187 report_bad_irq(irq, desc, action_ret);
178 } 188 }
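The new note_interrupt() bookkeeping above only escalates when unhandled interrupts arrive in bursts: a gap longer than HZ/10 resets the count to 1, so isolated glitches never accumulate toward the report threshold. A standalone sketch of the same decay pattern, with illustrative names and tick-based units:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative model of the decay logic in note_interrupt() above. */
struct spurious_state {
	uint64_t last_unhandled;	/* timestamp of last unhandled event, in ticks */
	unsigned int unhandled;		/* running count of unhandled events */
};

/* Returns true once enough unhandled events pile up inside the window. */
static bool note_unhandled(struct spurious_state *s, uint64_t now_ticks,
			   uint64_t window_ticks, unsigned int threshold)
{
	if (now_ticks - s->last_unhandled > window_ticks)
		s->unhandled = 1;	/* isolated glitch: restart the count */
	else
		s->unhandled++;		/* burst: keep counting toward the threshold */
	s->last_unhandled = now_ticks;

	return s->unhandled >= threshold;
}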
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fed54418626c..474219a41929 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -152,7 +152,7 @@ static unsigned int get_symbol_offset(unsigned long pos)
152/* Lookup the address for this symbol. Returns 0 if not found. */ 152/* Lookup the address for this symbol. Returns 0 if not found. */
153unsigned long kallsyms_lookup_name(const char *name) 153unsigned long kallsyms_lookup_name(const char *name)
154{ 154{
155 char namebuf[KSYM_NAME_LEN+1]; 155 char namebuf[KSYM_NAME_LEN];
156 unsigned long i; 156 unsigned long i;
157 unsigned int off; 157 unsigned int off;
158 158
@@ -248,7 +248,7 @@ const char *kallsyms_lookup(unsigned long addr,
248{ 248{
249 const char *msym; 249 const char *msym;
250 250
251 namebuf[KSYM_NAME_LEN] = 0; 251 namebuf[KSYM_NAME_LEN - 1] = 0;
252 namebuf[0] = 0; 252 namebuf[0] = 0;
253 253
254 if (is_ksym_addr(addr)) { 254 if (is_ksym_addr(addr)) {
@@ -265,7 +265,7 @@ const char *kallsyms_lookup(unsigned long addr,
265 /* see if it's in a module */ 265 /* see if it's in a module */
266 msym = module_address_lookup(addr, symbolsize, offset, modname); 266 msym = module_address_lookup(addr, symbolsize, offset, modname);
267 if (msym) 267 if (msym)
268 return strncpy(namebuf, msym, KSYM_NAME_LEN); 268 return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
269 269
270 return NULL; 270 return NULL;
271} 271}
@@ -273,7 +273,7 @@ const char *kallsyms_lookup(unsigned long addr,
273int lookup_symbol_name(unsigned long addr, char *symname) 273int lookup_symbol_name(unsigned long addr, char *symname)
274{ 274{
275 symname[0] = '\0'; 275 symname[0] = '\0';
276 symname[KSYM_NAME_LEN] = '\0'; 276 symname[KSYM_NAME_LEN - 1] = '\0';
277 277
278 if (is_ksym_addr(addr)) { 278 if (is_ksym_addr(addr)) {
279 unsigned long pos; 279 unsigned long pos;
@@ -291,7 +291,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
291 unsigned long *offset, char *modname, char *name) 291 unsigned long *offset, char *modname, char *name)
292{ 292{
293 name[0] = '\0'; 293 name[0] = '\0';
294 name[KSYM_NAME_LEN] = '\0'; 294 name[KSYM_NAME_LEN - 1] = '\0';
295 295
296 if (is_ksym_addr(addr)) { 296 if (is_ksym_addr(addr)) {
297 unsigned long pos; 297 unsigned long pos;
@@ -312,18 +312,17 @@ int sprint_symbol(char *buffer, unsigned long address)
312 char *modname; 312 char *modname;
313 const char *name; 313 const char *name;
314 unsigned long offset, size; 314 unsigned long offset, size;
315 char namebuf[KSYM_NAME_LEN+1]; 315 char namebuf[KSYM_NAME_LEN];
316 316
317 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 317 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
318 if (!name) 318 if (!name)
319 return sprintf(buffer, "0x%lx", address); 319 return sprintf(buffer, "0x%lx", address);
320 else { 320
321 if (modname) 321 if (modname)
322 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, 322 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
323 size, modname); 323 size, modname);
324 else 324 else
325 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); 325 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
326 }
327} 326}
328 327
329/* Look up a kernel symbol and print it to the kernel messages. */ 328/* Look up a kernel symbol and print it to the kernel messages. */
@@ -343,8 +342,8 @@ struct kallsym_iter
343 unsigned long value; 342 unsigned long value;
344 unsigned int nameoff; /* If iterating in core kernel symbols */ 343 unsigned int nameoff; /* If iterating in core kernel symbols */
345 char type; 344 char type;
346 char name[KSYM_NAME_LEN+1]; 345 char name[KSYM_NAME_LEN];
347 char module_name[MODULE_NAME_LEN + 1]; 346 char module_name[MODULE_NAME_LEN];
348 int exported; 347 int exported;
349}; 348};
350 349
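All of the kallsyms changes above follow one convention switch: KSYM_NAME_LEN (and MODULE_NAME_LEN) are now taken to include the terminating NUL, so buffers are declared with exactly that many bytes and the last writable index is KSYM_NAME_LEN - 1. A minimal sketch of the resulting copy pattern; the helper name is illustrative:

#include <linux/kallsyms.h>	/* KSYM_NAME_LEN */
#include <linux/string.h>

/* Illustrative helper, not from the patch. */
static void copy_symbol_name(char dst[KSYM_NAME_LEN], const char *src)
{
	dst[KSYM_NAME_LEN - 1] = '\0';		/* guaranteed terminator */
	strncpy(dst, src, KSYM_NAME_LEN - 1);	/* never touches the guard byte */
}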
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index cee419143fd4..bc41ad0f24f8 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/kfifo.h> 26#include <linux/kfifo.h>
27#include <linux/log2.h>
27 28
28/** 29/**
29 * kfifo_init - allocates a new FIFO using a preallocated buffer 30 * kfifo_init - allocates a new FIFO using a preallocated buffer
@@ -41,7 +42,7 @@ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
41 struct kfifo *fifo; 42 struct kfifo *fifo;
42 43
43 /* size must be a power of 2 */ 44 /* size must be a power of 2 */
44 BUG_ON(size & (size - 1)); 45 BUG_ON(!is_power_of_2(size));
45 46
46 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
47 if (!fifo) 48 if (!fifo)
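The BUG_ON() rewrite above is behaviour-preserving for non-zero sizes; is_power_of_2() from <linux/log2.h> is the same bit trick plus an explicit zero check, roughly:

/* Rough equivalent of the helper used above (see <linux/log2.h>). */
static inline int power_of_two(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}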
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb077179..beedbdc64608 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -33,6 +33,8 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/resource.h> 35#include <linux/resource.h>
36#include <linux/notifier.h>
37#include <linux/suspend.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37 39
38extern int max_threads; 40extern int max_threads;
@@ -119,9 +121,10 @@ struct subprocess_info {
119 char **argv; 121 char **argv;
120 char **envp; 122 char **envp;
121 struct key *ring; 123 struct key *ring;
122 int wait; 124 enum umh_wait wait;
123 int retval; 125 int retval;
124 struct file *stdin; 126 struct file *stdin;
127 void (*cleanup)(char **argv, char **envp);
125}; 128};
126 129
127/* 130/*
@@ -180,6 +183,14 @@ static int ____call_usermodehelper(void *data)
180 do_exit(0); 183 do_exit(0);
181} 184}
182 185
186void call_usermodehelper_freeinfo(struct subprocess_info *info)
187{
188 if (info->cleanup)
189 (*info->cleanup)(info->argv, info->envp);
190 kfree(info);
191}
192EXPORT_SYMBOL(call_usermodehelper_freeinfo);
193
183/* Keventd can't block, but this (a child) can. */ 194/* Keventd can't block, but this (a child) can. */
184static int wait_for_helper(void *data) 195static int wait_for_helper(void *data)
185{ 196{
@@ -216,8 +227,8 @@ static int wait_for_helper(void *data)
216 sub_info->retval = ret; 227 sub_info->retval = ret;
217 } 228 }
218 229
219 if (sub_info->wait < 0) 230 if (sub_info->wait == UMH_NO_WAIT)
220 kfree(sub_info); 231 call_usermodehelper_freeinfo(sub_info);
221 else 232 else
222 complete(sub_info->complete); 233 complete(sub_info->complete);
223 return 0; 234 return 0;
@@ -229,34 +240,204 @@ static void __call_usermodehelper(struct work_struct *work)
229 struct subprocess_info *sub_info = 240 struct subprocess_info *sub_info =
230 container_of(work, struct subprocess_info, work); 241 container_of(work, struct subprocess_info, work);
231 pid_t pid; 242 pid_t pid;
232 int wait = sub_info->wait; 243 enum umh_wait wait = sub_info->wait;
233 244
234 /* CLONE_VFORK: wait until the usermode helper has execve'd 245 /* CLONE_VFORK: wait until the usermode helper has execve'd
235 * successfully We need the data structures to stay around 246 * successfully We need the data structures to stay around
236 * until that is done. */ 247 * until that is done. */
237 if (wait) 248 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
238 pid = kernel_thread(wait_for_helper, sub_info, 249 pid = kernel_thread(wait_for_helper, sub_info,
239 CLONE_FS | CLONE_FILES | SIGCHLD); 250 CLONE_FS | CLONE_FILES | SIGCHLD);
240 else 251 else
241 pid = kernel_thread(____call_usermodehelper, sub_info, 252 pid = kernel_thread(____call_usermodehelper, sub_info,
242 CLONE_VFORK | SIGCHLD); 253 CLONE_VFORK | SIGCHLD);
243 254
244 if (wait < 0) 255 switch (wait) {
245 return; 256 case UMH_NO_WAIT:
257 break;
246 258
247 if (pid < 0) { 259 case UMH_WAIT_PROC:
260 if (pid > 0)
261 break;
248 sub_info->retval = pid; 262 sub_info->retval = pid;
263 /* FALLTHROUGH */
264
265 case UMH_WAIT_EXEC:
249 complete(sub_info->complete); 266 complete(sub_info->complete);
250 } else if (!wait) 267 }
251 complete(sub_info->complete); 268}
269
270#ifdef CONFIG_PM
271/*
272 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
273 * (used for preventing user land processes from being created after the user
274 * land has been frozen during a system-wide hibernation or suspend operation).
275 */
276static int usermodehelper_disabled;
277
278/* Number of helpers running */
279static atomic_t running_helpers = ATOMIC_INIT(0);
280
281/*
282 * Wait queue head used by usermodehelper_pm_callback() to wait for all running
283 * helpers to finish.
284 */
285static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
286
287/*
288 * Time to wait for running_helpers to become zero before the setting of
289 * usermodehelper_disabled in usermodehelper_pm_callback() fails
290 */
291#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
292
293static int usermodehelper_pm_callback(struct notifier_block *nfb,
294 unsigned long action,
295 void *ignored)
296{
297 long retval;
298
299 switch (action) {
300 case PM_HIBERNATION_PREPARE:
301 case PM_SUSPEND_PREPARE:
302 usermodehelper_disabled = 1;
303 smp_mb();
304 /*
305 * From now on call_usermodehelper_exec() won't start any new
306 * helpers, so it is sufficient if running_helpers turns out to
307 * be zero at one point (it may be increased later, but that
308 * doesn't matter).
309 */
310 retval = wait_event_timeout(running_helpers_waitq,
311 atomic_read(&running_helpers) == 0,
312 RUNNING_HELPERS_TIMEOUT);
313 if (retval) {
314 return NOTIFY_OK;
315 } else {
316 usermodehelper_disabled = 0;
317 return NOTIFY_BAD;
318 }
319 case PM_POST_HIBERNATION:
320 case PM_POST_SUSPEND:
321 usermodehelper_disabled = 0;
322 return NOTIFY_OK;
323 }
324
325 return NOTIFY_DONE;
326}
327
328static void helper_lock(void)
329{
330 atomic_inc(&running_helpers);
331 smp_mb__after_atomic_inc();
332}
333
334static void helper_unlock(void)
335{
336 if (atomic_dec_and_test(&running_helpers))
337 wake_up(&running_helpers_waitq);
338}
339
340static void register_pm_notifier_callback(void)
341{
342 pm_notifier(usermodehelper_pm_callback, 0);
252} 343}
344#else /* CONFIG_PM */
345#define usermodehelper_disabled 0
346
347static inline void helper_lock(void) {}
348static inline void helper_unlock(void) {}
349static inline void register_pm_notifier_callback(void) {}
350#endif /* CONFIG_PM */
253 351
254/** 352/**
255 * call_usermodehelper_keys - start a usermode application 353 * call_usermodehelper_setup - prepare to call a usermode helper
 256 * @path: pathname for the application 354 * @path: path to usermode executable
 257 * @argv: null-terminated argument list 355 * @argv: arg vector for process
 258 * @envp: null-terminated environment list 356 * @envp: environment for process
259 * @session_keyring: session keyring for process (NULL for an empty keyring) 357 *
358 * Returns either NULL on allocation failure, or a subprocess_info
359 * structure. This should be passed to call_usermodehelper_exec to
360 * exec the process and free the structure.
361 */
362struct subprocess_info *call_usermodehelper_setup(char *path,
363 char **argv, char **envp)
364{
365 struct subprocess_info *sub_info;
366 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
367 if (!sub_info)
368 goto out;
369
370 INIT_WORK(&sub_info->work, __call_usermodehelper);
371 sub_info->path = path;
372 sub_info->argv = argv;
373 sub_info->envp = envp;
374
375 out:
376 return sub_info;
377}
378EXPORT_SYMBOL(call_usermodehelper_setup);
379
380/**
381 * call_usermodehelper_setkeys - set the session keys for usermode helper
382 * @info: a subprocess_info returned by call_usermodehelper_setup
383 * @session_keyring: the session keyring for the process
384 */
385void call_usermodehelper_setkeys(struct subprocess_info *info,
386 struct key *session_keyring)
387{
388 info->ring = session_keyring;
389}
390EXPORT_SYMBOL(call_usermodehelper_setkeys);
391
392/**
393 * call_usermodehelper_setcleanup - set a cleanup function
394 * @info: a subprocess_info returned by call_usermodehelper_setup
395 * @cleanup: a cleanup function
396 *
 397 * The cleanup function is called just before the subprocess_info is about to
 398 * be freed. This can be used for freeing the argv and envp. The
 399 * function must be runnable in either a process context or the
400 * context in which call_usermodehelper_exec is called.
401 */
402void call_usermodehelper_setcleanup(struct subprocess_info *info,
403 void (*cleanup)(char **argv, char **envp))
404{
405 info->cleanup = cleanup;
406}
407EXPORT_SYMBOL(call_usermodehelper_setcleanup);
408
409/**
410 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
411 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
412 * @filp: set to the write-end of a pipe
413 *
414 * This constructs a pipe, and sets the read end to be the stdin of the
415 * subprocess, and returns the write-end in *@filp.
416 */
417int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
418 struct file **filp)
419{
420 struct file *f;
421
422 f = create_write_pipe();
423 if (IS_ERR(f))
424 return PTR_ERR(f);
425 *filp = f;
426
427 f = create_read_pipe(f);
428 if (IS_ERR(f)) {
429 free_write_pipe(*filp);
430 return PTR_ERR(f);
431 }
432 sub_info->stdin = f;
433
434 return 0;
435}
436EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
437
438/**
439 * call_usermodehelper_exec - start a usermode application
 440 * @sub_info: information about the subprocess
260 * @wait: wait for the application to finish and return status. 441 * @wait: wait for the application to finish and return status.
261 * when -1 don't wait at all, but you get no useful error back when 442 * when -1 don't wait at all, but you get no useful error back when
262 * the program couldn't be exec'ed. This makes it safe to call 443 * the program couldn't be exec'ed. This makes it safe to call
@@ -265,81 +446,70 @@ static void __call_usermodehelper(struct work_struct *work)
265 * Runs a user-space application. The application is started 446 * Runs a user-space application. The application is started
266 * asynchronously if wait is not set, and runs as a child of keventd. 447 * asynchronously if wait is not set, and runs as a child of keventd.
267 * (ie. it runs with full root capabilities). 448 * (ie. it runs with full root capabilities).
268 *
269 * Must be called from process context. Returns a negative error code
270 * if program was not execed successfully, or 0.
271 */ 449 */
272int call_usermodehelper_keys(char *path, char **argv, char **envp, 450int call_usermodehelper_exec(struct subprocess_info *sub_info,
273 struct key *session_keyring, int wait) 451 enum umh_wait wait)
274{ 452{
275 DECLARE_COMPLETION_ONSTACK(done); 453 DECLARE_COMPLETION_ONSTACK(done);
276 struct subprocess_info *sub_info;
277 int retval; 454 int retval;
278 455
279 if (!khelper_wq) 456 helper_lock();
280 return -EBUSY; 457 if (sub_info->path[0] == '\0') {
281 458 retval = 0;
282 if (path[0] == '\0') 459 goto out;
283 return 0; 460 }
284 461
285 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 462 if (!khelper_wq || usermodehelper_disabled) {
286 if (!sub_info) 463 retval = -EBUSY;
287 return -ENOMEM; 464 goto out;
465 }
288 466
289 INIT_WORK(&sub_info->work, __call_usermodehelper);
290 sub_info->complete = &done; 467 sub_info->complete = &done;
291 sub_info->path = path;
292 sub_info->argv = argv;
293 sub_info->envp = envp;
294 sub_info->ring = session_keyring;
295 sub_info->wait = wait; 468 sub_info->wait = wait;
296 469
297 queue_work(khelper_wq, &sub_info->work); 470 queue_work(khelper_wq, &sub_info->work);
298 if (wait < 0) /* task has freed sub_info */ 471 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
299 return 0; 472 return 0;
300 wait_for_completion(&done); 473 wait_for_completion(&done);
301 retval = sub_info->retval; 474 retval = sub_info->retval;
302 kfree(sub_info); 475
476 out:
477 call_usermodehelper_freeinfo(sub_info);
478 helper_unlock();
303 return retval; 479 return retval;
304} 480}
305EXPORT_SYMBOL(call_usermodehelper_keys); 481EXPORT_SYMBOL(call_usermodehelper_exec);
306 482
483/**
484 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
485 * @path: path to usermode executable
486 * @argv: arg vector for process
487 * @envp: environment for process
488 * @filp: set to the write-end of a pipe
489 *
490 * This is a simple wrapper which executes a usermode-helper function
491 * with a pipe as stdin. It is implemented entirely in terms of
492 * lower-level call_usermodehelper_* functions.
493 */
307int call_usermodehelper_pipe(char *path, char **argv, char **envp, 494int call_usermodehelper_pipe(char *path, char **argv, char **envp,
308 struct file **filp) 495 struct file **filp)
309{ 496{
310 DECLARE_COMPLETION(done); 497 struct subprocess_info *sub_info;
311 struct subprocess_info sub_info = { 498 int ret;
312 .work = __WORK_INITIALIZER(sub_info.work,
313 __call_usermodehelper),
314 .complete = &done,
315 .path = path,
316 .argv = argv,
317 .envp = envp,
318 .retval = 0,
319 };
320 struct file *f;
321
322 if (!khelper_wq)
323 return -EBUSY;
324 499
325 if (path[0] == '\0') 500 sub_info = call_usermodehelper_setup(path, argv, envp);
326 return 0; 501 if (sub_info == NULL)
502 return -ENOMEM;
327 503
328 f = create_write_pipe(); 504 ret = call_usermodehelper_stdinpipe(sub_info, filp);
329 if (IS_ERR(f)) 505 if (ret < 0)
330 return PTR_ERR(f); 506 goto out;
331 *filp = f;
332 507
333 f = create_read_pipe(f); 508 return call_usermodehelper_exec(sub_info, 1);
334 if (IS_ERR(f)) {
335 free_write_pipe(*filp);
336 return PTR_ERR(f);
337 }
338 sub_info.stdin = f;
339 509
340 queue_work(khelper_wq, &sub_info.work); 510 out:
341 wait_for_completion(&done); 511 call_usermodehelper_freeinfo(sub_info);
342 return sub_info.retval; 512 return ret;
343} 513}
344EXPORT_SYMBOL(call_usermodehelper_pipe); 514EXPORT_SYMBOL(call_usermodehelper_pipe);
345 515
@@ -347,4 +517,5 @@ void __init usermodehelper_init(void)
347{ 517{
348 khelper_wq = create_singlethread_workqueue("khelper"); 518 khelper_wq = create_singlethread_workqueue("khelper");
349 BUG_ON(!khelper_wq); 519 BUG_ON(!khelper_wq);
520 register_pm_notifier_callback();
350} 521}
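With the API split above, the old one-shot call_usermodehelper_keys() becomes a composition of setup, setkeys and exec. A hedged sketch of how a caller might chain the new functions, assuming the enum umh_wait values (UMH_NO_WAIT, UMH_WAIT_EXEC, UMH_WAIT_PROC) land in <linux/kmod.h> alongside these declarations; run_helper() and its arguments are illustrative, not part of the patch:

#include <linux/errno.h>
#include <linux/key.h>
#include <linux/kmod.h>

/* Illustrative wrapper, not part of the patch: run argv[0] with the given
 * environment and keyring, waiting only until the exec has happened. */
static int run_helper(char **argv, char **envp, struct key *keyring)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (!info)
		return -ENOMEM;

	call_usermodehelper_setkeys(info, keyring);

	/* for the waiting modes, exec frees info before returning;
	 * UMH_NO_WAIT would instead hand it to the helper thread */
	return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}

Passing UMH_WAIT_PROC instead would block until the helper exits and return its exit status, while UMH_NO_WAIT returns immediately and leaves freeing of the subprocess_info to the helper thread, as the __call_usermodehelper() and wait_for_helper() changes above show.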
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9e47d8c493f3..3e9f513a728d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = {
675 .priority = 0x7fffffff /* we need to be notified first */ 675 .priority = 0x7fffffff /* we need to be notified first */
676}; 676};
677 677
678unsigned long __weak arch_deref_entry_point(void *entry)
679{
680 return (unsigned long)entry;
681}
678 682
679int __kprobes register_jprobe(struct jprobe *jp) 683int __kprobes register_jprobe(struct jprobe *jp)
680{ 684{
685 unsigned long addr = arch_deref_entry_point(jp->entry);
686
687 if (!kernel_text_address(addr))
688 return -EINVAL;
689
681 /* Todo: Verify probepoint is a function entry point */ 690 /* Todo: Verify probepoint is a function entry point */
682 jp->kp.pre_handler = setjmp_pre_handler; 691 jp->kp.pre_handler = setjmp_pre_handler;
683 jp->kp.break_handler = longjmp_break_handler; 692 jp->kp.break_handler = longjmp_break_handler;
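arch_deref_entry_point() above is a __weak default that treats jp->entry as the code address directly; an architecture whose function pointers are really descriptors can override it. A purely illustrative sketch of such an override; the descriptor layout and field names here are hypothetical:

/* Hypothetical descriptor-based ABI; not taken from any real architecture. */
struct func_descriptor {
	unsigned long entry;	/* actual code address */
	unsigned long toc;	/* table-of-contents / GOT pointer */
};

unsigned long arch_deref_entry_point(void *entry)
{
	return ((struct func_descriptor *)entry)->entry;
}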
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 559deca5ed15..d0e5c48e18c7 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
62KERNEL_ATTR_RO(kexec_crash_loaded); 62KERNEL_ATTR_RO(kexec_crash_loaded);
63#endif /* CONFIG_KEXEC */ 63#endif /* CONFIG_KEXEC */
64 64
65/*
66 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
67 */
68extern const void __start_notes __attribute__((weak));
69extern const void __stop_notes __attribute__((weak));
70#define notes_size (&__stop_notes - &__start_notes)
71
72static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr,
73 char *buf, loff_t off, size_t count)
74{
75 memcpy(buf, &__start_notes + off, count);
76 return count;
77}
78
79static struct bin_attribute notes_attr = {
80 .attr = {
81 .name = "notes",
82 .mode = S_IRUGO,
83 },
84 .read = &notes_read,
85};
86
65decl_subsys(kernel, NULL, NULL); 87decl_subsys(kernel, NULL, NULL);
66EXPORT_SYMBOL_GPL(kernel_subsys); 88EXPORT_SYMBOL_GPL(kernel_subsys);
67 89
@@ -88,6 +110,12 @@ static int __init ksysfs_init(void)
88 error = sysfs_create_group(&kernel_subsys.kobj, 110 error = sysfs_create_group(&kernel_subsys.kobj,
89 &kernel_attr_group); 111 &kernel_attr_group);
90 112
113 if (!error && notes_size > 0) {
114 notes_attr.size = notes_size;
115 error = sysfs_create_bin_file(&kernel_subsys.kobj,
116 &notes_attr);
117 }
118
91 return error; 119 return error;
92} 120}
93 121
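The new binary attribute exposes the kernel's raw .notes ELF section as /sys/kernel/notes. A small illustrative userspace reader, not part of the patch:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/notes", "rb");
	unsigned char buf[4096];
	size_t n;

	if (!f) {
		perror("/sys/kernel/notes");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);	/* raw ELF note records */
	fclose(f);
	return 0;
}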
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bbd51b81a3e8..a404f7ee7395 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -215,7 +215,7 @@ int kthread_stop(struct task_struct *k)
215EXPORT_SYMBOL(kthread_stop); 215EXPORT_SYMBOL(kthread_stop);
216 216
217 217
218static __init void kthreadd_setup(void) 218static noinline __init_refok void kthreadd_setup(void)
219{ 219{
220 struct task_struct *tsk = current; 220 struct task_struct *tsk = current;
221 221
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1a5ff2211d88..734da579ad13 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Started by Ingo Molnar: 6 * Started by Ingo Molnar:
7 * 7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * 10 *
10 * this code maps all the lock dependencies as they occur in a live kernel 11 * this code maps all the lock dependencies as they occur in a live kernel
11 * and will warn about the following classes of locking bugs: 12 * and will warn about the following classes of locking bugs:
@@ -37,11 +38,26 @@
37#include <linux/debug_locks.h> 38#include <linux/debug_locks.h>
38#include <linux/irqflags.h> 39#include <linux/irqflags.h>
39#include <linux/utsname.h> 40#include <linux/utsname.h>
41#include <linux/hash.h>
40 42
41#include <asm/sections.h> 43#include <asm/sections.h>
42 44
43#include "lockdep_internals.h" 45#include "lockdep_internals.h"
44 46
47#ifdef CONFIG_PROVE_LOCKING
48int prove_locking = 1;
49module_param(prove_locking, int, 0644);
50#else
51#define prove_locking 0
52#endif
53
54#ifdef CONFIG_LOCK_STAT
55int lock_stat = 1;
56module_param(lock_stat, int, 0644);
57#else
58#define lock_stat 0
59#endif
60
45/* 61/*
46 * lockdep_lock: protects the lockdep graph, the hashes and the 62 * lockdep_lock: protects the lockdep graph, the hashes and the
47 * class/list/hash allocators. 63 * class/list/hash allocators.
@@ -96,23 +112,6 @@ unsigned long nr_list_entries;
96static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; 112static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
97 113
98/* 114/*
99 * Allocate a lockdep entry. (assumes the graph_lock held, returns
100 * with NULL on failure)
101 */
102static struct lock_list *alloc_list_entry(void)
103{
104 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
105 if (!debug_locks_off_graph_unlock())
106 return NULL;
107
108 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
109 printk("turning off the locking correctness validator.\n");
110 return NULL;
111 }
112 return list_entries + nr_list_entries++;
113}
114
115/*
116 * All data structures here are protected by the global debug_lock. 115 * All data structures here are protected by the global debug_lock.
117 * 116 *
118 * Mutex key structs only get allocated, once during bootup, and never 117 * Mutex key structs only get allocated, once during bootup, and never
@@ -121,6 +120,117 @@ static struct lock_list *alloc_list_entry(void)
121unsigned long nr_lock_classes; 120unsigned long nr_lock_classes;
122static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 121static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
123 122
123#ifdef CONFIG_LOCK_STAT
124static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
125
126static int lock_contention_point(struct lock_class *class, unsigned long ip)
127{
128 int i;
129
130 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
131 if (class->contention_point[i] == 0) {
132 class->contention_point[i] = ip;
133 break;
134 }
135 if (class->contention_point[i] == ip)
136 break;
137 }
138
139 return i;
140}
141
142static void lock_time_inc(struct lock_time *lt, s64 time)
143{
144 if (time > lt->max)
145 lt->max = time;
146
147 if (time < lt->min || !lt->min)
148 lt->min = time;
149
150 lt->total += time;
151 lt->nr++;
152}
153
154static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
155{
156 dst->min += src->min;
157 dst->max += src->max;
158 dst->total += src->total;
159 dst->nr += src->nr;
160}
161
162struct lock_class_stats lock_stats(struct lock_class *class)
163{
164 struct lock_class_stats stats;
165 int cpu, i;
166
167 memset(&stats, 0, sizeof(struct lock_class_stats));
168 for_each_possible_cpu(cpu) {
169 struct lock_class_stats *pcs =
170 &per_cpu(lock_stats, cpu)[class - lock_classes];
171
172 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
173 stats.contention_point[i] += pcs->contention_point[i];
174
175 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
176 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
177
178 lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
179 lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
180
181 for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
182 stats.bounces[i] += pcs->bounces[i];
183 }
184
185 return stats;
186}
187
188void clear_lock_stats(struct lock_class *class)
189{
190 int cpu;
191
192 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *cpu_stats =
194 &per_cpu(lock_stats, cpu)[class - lock_classes];
195
196 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
197 }
198 memset(class->contention_point, 0, sizeof(class->contention_point));
199}
200
201static struct lock_class_stats *get_lock_stats(struct lock_class *class)
202{
203 return &get_cpu_var(lock_stats)[class - lock_classes];
204}
205
206static void put_lock_stats(struct lock_class_stats *stats)
207{
208 put_cpu_var(lock_stats);
209}
210
211static void lock_release_holdtime(struct held_lock *hlock)
212{
213 struct lock_class_stats *stats;
214 s64 holdtime;
215
216 if (!lock_stat)
217 return;
218
219 holdtime = sched_clock() - hlock->holdtime_stamp;
220
221 stats = get_lock_stats(hlock->class);
222 if (hlock->read)
223 lock_time_inc(&stats->read_holdtime, holdtime);
224 else
225 lock_time_inc(&stats->write_holdtime, holdtime);
226 put_lock_stats(stats);
227}
228#else
229static inline void lock_release_holdtime(struct held_lock *hlock)
230{
231}
232#endif
233
124/* 234/*
125 * We keep a global list of all lock classes. The list only grows, 235 * We keep a global list of all lock classes. The list only grows,
126 * never shrinks. The list is only accessed with the lockdep 236 * never shrinks. The list is only accessed with the lockdep
@@ -133,24 +243,18 @@ LIST_HEAD(all_lock_classes);
133 */ 243 */
134#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) 244#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
135#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) 245#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
136#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) 246#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
137#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
138#define classhashentry(key) (classhash_table + __classhashfn((key))) 247#define classhashentry(key) (classhash_table + __classhashfn((key)))
139 248
140static struct list_head classhash_table[CLASSHASH_SIZE]; 249static struct list_head classhash_table[CLASSHASH_SIZE];
141 250
142unsigned long nr_lock_chains;
143static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
144
145/* 251/*
146 * We put the lock dependency chains into a hash-table as well, to cache 252 * We put the lock dependency chains into a hash-table as well, to cache
147 * their existence: 253 * their existence:
148 */ 254 */
149#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) 255#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
150#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) 256#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
151#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) 257#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
152#define __chainhashfn(chain) \
153 (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
154#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) 258#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
155 259
156static struct list_head chainhash_table[CHAINHASH_SIZE]; 260static struct list_head chainhash_table[CHAINHASH_SIZE];
@@ -223,26 +327,6 @@ static int verbose(struct lock_class *class)
223 return 0; 327 return 0;
224} 328}
225 329
226#ifdef CONFIG_TRACE_IRQFLAGS
227
228static int hardirq_verbose(struct lock_class *class)
229{
230#if HARDIRQ_VERBOSE
231 return class_filter(class);
232#endif
233 return 0;
234}
235
236static int softirq_verbose(struct lock_class *class)
237{
238#if SOFTIRQ_VERBOSE
239 return class_filter(class);
240#endif
241 return 0;
242}
243
244#endif
245
246/* 330/*
247 * Stack-trace: tightly packed array of stack backtrace 331 * Stack-trace: tightly packed array of stack backtrace
248 * addresses. Protected by the graph_lock. 332 * addresses. Protected by the graph_lock.
@@ -291,6 +375,11 @@ unsigned int max_recursion_depth;
291 * about it later on, in lockdep_info(). 375 * about it later on, in lockdep_info().
292 */ 376 */
293static int lockdep_init_error; 377static int lockdep_init_error;
378static unsigned long lockdep_init_trace_data[20];
379static struct stack_trace lockdep_init_trace = {
380 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
381 .entries = lockdep_init_trace_data,
382};
294 383
295/* 384/*
296 * Various lockdep statistics: 385 * Various lockdep statistics:
@@ -379,7 +468,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
379 468
380static void print_lock_name(struct lock_class *class) 469static void print_lock_name(struct lock_class *class)
381{ 470{
382 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; 471 char str[KSYM_NAME_LEN], c1, c2, c3, c4;
383 const char *name; 472 const char *name;
384 473
385 get_usage_chars(class, &c1, &c2, &c3, &c4); 474 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -401,7 +490,7 @@ static void print_lock_name(struct lock_class *class)
401static void print_lockdep_cache(struct lockdep_map *lock) 490static void print_lockdep_cache(struct lockdep_map *lock)
402{ 491{
403 const char *name; 492 const char *name;
404 char str[KSYM_NAME_LEN + 1]; 493 char str[KSYM_NAME_LEN];
405 494
406 name = lock->name; 495 name = lock->name;
407 if (!name) 496 if (!name)
@@ -482,6 +571,262 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
482 } 571 }
483} 572}
484 573
574static void print_kernel_version(void)
575{
576 printk("%s %.*s\n", init_utsname()->release,
577 (int)strcspn(init_utsname()->version, " "),
578 init_utsname()->version);
579}
580
581static int very_verbose(struct lock_class *class)
582{
583#if VERY_VERBOSE
584 return class_filter(class);
585#endif
586 return 0;
587}
588
589/*
590 * Is this the address of a static object:
591 */
592static int static_obj(void *obj)
593{
594 unsigned long start = (unsigned long) &_stext,
595 end = (unsigned long) &_end,
596 addr = (unsigned long) obj;
597#ifdef CONFIG_SMP
598 int i;
599#endif
600
601 /*
602 * static variable?
603 */
604 if ((addr >= start) && (addr < end))
605 return 1;
606
607#ifdef CONFIG_SMP
608 /*
609 * percpu var?
610 */
611 for_each_possible_cpu(i) {
612 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
613 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
614 + per_cpu_offset(i);
615
616 if ((addr >= start) && (addr < end))
617 return 1;
618 }
619#endif
620
621 /*
622 * module var?
623 */
624 return is_module_address(addr);
625}
626
627/*
628 * To make lock name printouts unique, we calculate a unique
629 * class->name_version generation counter:
630 */
631static int count_matching_names(struct lock_class *new_class)
632{
633 struct lock_class *class;
634 int count = 0;
635
636 if (!new_class->name)
637 return 0;
638
639 list_for_each_entry(class, &all_lock_classes, lock_entry) {
640 if (new_class->key - new_class->subclass == class->key)
641 return class->name_version;
642 if (class->name && !strcmp(class->name, new_class->name))
643 count = max(count, class->name_version);
644 }
645
646 return count + 1;
647}
648
649/*
650 * Register a lock's class in the hash-table, if the class is not present
651 * yet. Otherwise we look it up. We cache the result in the lock object
652 * itself, so actual lookup of the hash should be once per lock object.
653 */
654static inline struct lock_class *
655look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
656{
657 struct lockdep_subclass_key *key;
658 struct list_head *hash_head;
659 struct lock_class *class;
660
661#ifdef CONFIG_DEBUG_LOCKDEP
662 /*
663 * If the architecture calls into lockdep before initializing
664 * the hashes then we'll warn about it later. (we cannot printk
665 * right now)
666 */
667 if (unlikely(!lockdep_initialized)) {
668 lockdep_init();
669 lockdep_init_error = 1;
670 save_stack_trace(&lockdep_init_trace);
671 }
672#endif
673
674 /*
675 * Static locks do not have their class-keys yet - for them the key
676 * is the lock object itself:
677 */
678 if (unlikely(!lock->key))
679 lock->key = (void *)lock;
680
681 /*
682 * NOTE: the class-key must be unique. For dynamic locks, a static
683 * lock_class_key variable is passed in through the mutex_init()
684 * (or spin_lock_init()) call - which acts as the key. For static
685 * locks we use the lock object itself as the key.
686 */
687 BUILD_BUG_ON(sizeof(struct lock_class_key) >
688 sizeof(struct lockdep_map));
689
690 key = lock->key->subkeys + subclass;
691
692 hash_head = classhashentry(key);
693
694 /*
695 * We can walk the hash lockfree, because the hash only
696 * grows, and we are careful when adding entries to the end:
697 */
698 list_for_each_entry(class, hash_head, hash_entry) {
699 if (class->key == key) {
700 WARN_ON_ONCE(class->name != lock->name);
701 return class;
702 }
703 }
704
705 return NULL;
706}
707
708/*
709 * Register a lock's class in the hash-table, if the class is not present
710 * yet. Otherwise we look it up. We cache the result in the lock object
711 * itself, so actual lookup of the hash should be once per lock object.
712 */
713static inline struct lock_class *
714register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
715{
716 struct lockdep_subclass_key *key;
717 struct list_head *hash_head;
718 struct lock_class *class;
719 unsigned long flags;
720
721 class = look_up_lock_class(lock, subclass);
722 if (likely(class))
723 return class;
724
725 /*
726 * Debug-check: all keys must be persistent!
727 */
728 if (!static_obj(lock->key)) {
729 debug_locks_off();
730 printk("INFO: trying to register non-static key.\n");
731 printk("the code is fine but needs lockdep annotation.\n");
732 printk("turning off the locking correctness validator.\n");
733 dump_stack();
734
735 return NULL;
736 }
737
738 key = lock->key->subkeys + subclass;
739 hash_head = classhashentry(key);
740
741 raw_local_irq_save(flags);
742 if (!graph_lock()) {
743 raw_local_irq_restore(flags);
744 return NULL;
745 }
746 /*
747 * We have to do the hash-walk again, to avoid races
748 * with another CPU:
749 */
750 list_for_each_entry(class, hash_head, hash_entry)
751 if (class->key == key)
752 goto out_unlock_set;
753 /*
754 * Allocate a new key from the static array, and add it to
755 * the hash:
756 */
757 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
758 if (!debug_locks_off_graph_unlock()) {
759 raw_local_irq_restore(flags);
760 return NULL;
761 }
762 raw_local_irq_restore(flags);
763
764 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
765 printk("turning off the locking correctness validator.\n");
766 return NULL;
767 }
768 class = lock_classes + nr_lock_classes++;
769 debug_atomic_inc(&nr_unused_locks);
770 class->key = key;
771 class->name = lock->name;
772 class->subclass = subclass;
773 INIT_LIST_HEAD(&class->lock_entry);
774 INIT_LIST_HEAD(&class->locks_before);
775 INIT_LIST_HEAD(&class->locks_after);
776 class->name_version = count_matching_names(class);
777 /*
778 * We use RCU's safe list-add method to make
779 * parallel walking of the hash-list safe:
780 */
781 list_add_tail_rcu(&class->hash_entry, hash_head);
782
783 if (verbose(class)) {
784 graph_unlock();
785 raw_local_irq_restore(flags);
786
787 printk("\nnew class %p: %s", class->key, class->name);
788 if (class->name_version > 1)
789 printk("#%d", class->name_version);
790 printk("\n");
791 dump_stack();
792
793 raw_local_irq_save(flags);
794 if (!graph_lock()) {
795 raw_local_irq_restore(flags);
796 return NULL;
797 }
798 }
799out_unlock_set:
800 graph_unlock();
801 raw_local_irq_restore(flags);
802
803 if (!subclass || force)
804 lock->class_cache = class;
805
806 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
807 return NULL;
808
809 return class;
810}
811
812#ifdef CONFIG_PROVE_LOCKING
813/*
814 * Allocate a lockdep entry. (assumes the graph_lock held, returns
815 * with NULL on failure)
816 */
817static struct lock_list *alloc_list_entry(void)
818{
819 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
820 if (!debug_locks_off_graph_unlock())
821 return NULL;
822
823 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
824 printk("turning off the locking correctness validator.\n");
825 return NULL;
826 }
827 return list_entries + nr_list_entries++;
828}
829
485/* 830/*
486 * Add a new dependency to the head of the list: 831 * Add a new dependency to the head of the list:
487 */ 832 */
@@ -542,13 +887,6 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
542 return 0; 887 return 0;
543} 888}
544 889
545static void print_kernel_version(void)
546{
547 printk("%s %.*s\n", init_utsname()->release,
548 (int)strcspn(init_utsname()->version, " "),
549 init_utsname()->version);
550}
551
552/* 890/*
553 * When a circular dependency is detected, print the 891 * When a circular dependency is detected, print the
554 * header first: 892 * header first:
@@ -640,15 +978,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
640 return 1; 978 return 1;
641} 979}
642 980
643static int very_verbose(struct lock_class *class)
644{
645#if VERY_VERBOSE
646 return class_filter(class);
647#endif
648 return 0;
649}
650#ifdef CONFIG_TRACE_IRQFLAGS 981#ifdef CONFIG_TRACE_IRQFLAGS
651
652/* 982/*
653 * Forwards and backwards subgraph searching, for the purposes of 983 * Forwards and backwards subgraph searching, for the purposes of
654 * proving that two subgraphs can be connected by a new dependency 984 * proving that two subgraphs can be connected by a new dependency
@@ -821,6 +1151,78 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
821 bit_backwards, bit_forwards, irqclass); 1151 bit_backwards, bit_forwards, irqclass);
822} 1152}
823 1153
1154static int
1155check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1156 struct held_lock *next)
1157{
1158 /*
1159 * Prove that the new dependency does not connect a hardirq-safe
1160 * lock with a hardirq-unsafe lock - to achieve this we search
1161 * the backwards-subgraph starting at <prev>, and the
1162 * forwards-subgraph starting at <next>:
1163 */
1164 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
1165 LOCK_ENABLED_HARDIRQS, "hard"))
1166 return 0;
1167
1168 /*
1169 * Prove that the new dependency does not connect a hardirq-safe-read
1170 * lock with a hardirq-unsafe lock - to achieve this we search
1171 * the backwards-subgraph starting at <prev>, and the
1172 * forwards-subgraph starting at <next>:
1173 */
1174 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
1175 LOCK_ENABLED_HARDIRQS, "hard-read"))
1176 return 0;
1177
1178 /*
1179 * Prove that the new dependency does not connect a softirq-safe
1180 * lock with a softirq-unsafe lock - to achieve this we search
1181 * the backwards-subgraph starting at <prev>, and the
1182 * forwards-subgraph starting at <next>:
1183 */
1184 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
1185 LOCK_ENABLED_SOFTIRQS, "soft"))
1186 return 0;
1187 /*
1188 * Prove that the new dependency does not connect a softirq-safe-read
1189 * lock with a softirq-unsafe lock - to achieve this we search
1190 * the backwards-subgraph starting at <prev>, and the
1191 * forwards-subgraph starting at <next>:
1192 */
1193 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
1194 LOCK_ENABLED_SOFTIRQS, "soft"))
1195 return 0;
1196
1197 return 1;
1198}
1199
1200static void inc_chains(void)
1201{
1202 if (current->hardirq_context)
1203 nr_hardirq_chains++;
1204 else {
1205 if (current->softirq_context)
1206 nr_softirq_chains++;
1207 else
1208 nr_process_chains++;
1209 }
1210}
1211
1212#else
1213
1214static inline int
1215check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1216 struct held_lock *next)
1217{
1218 return 1;
1219}
1220
1221static inline void inc_chains(void)
1222{
1223 nr_process_chains++;
1224}
1225
824#endif 1226#endif
825 1227
826static int 1228static int
@@ -922,47 +1324,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
922 if (!(check_noncircular(next->class, 0))) 1324 if (!(check_noncircular(next->class, 0)))
923 return print_circular_bug_tail(); 1325 return print_circular_bug_tail();
924 1326
925#ifdef CONFIG_TRACE_IRQFLAGS 1327 if (!check_prev_add_irq(curr, prev, next))
926 /*
927 * Prove that the new dependency does not connect a hardirq-safe
928 * lock with a hardirq-unsafe lock - to achieve this we search
929 * the backwards-subgraph starting at <prev>, and the
930 * forwards-subgraph starting at <next>:
931 */
932 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
933 LOCK_ENABLED_HARDIRQS, "hard"))
934 return 0; 1328 return 0;
935 1329
936 /* 1330 /*
937 * Prove that the new dependency does not connect a hardirq-safe-read
938 * lock with a hardirq-unsafe lock - to achieve this we search
939 * the backwards-subgraph starting at <prev>, and the
940 * forwards-subgraph starting at <next>:
941 */
942 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
943 LOCK_ENABLED_HARDIRQS, "hard-read"))
944 return 0;
945
946 /*
947 * Prove that the new dependency does not connect a softirq-safe
948 * lock with a softirq-unsafe lock - to achieve this we search
949 * the backwards-subgraph starting at <prev>, and the
950 * forwards-subgraph starting at <next>:
951 */
952 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
953 LOCK_ENABLED_SOFTIRQS, "soft"))
954 return 0;
955 /*
956 * Prove that the new dependency does not connect a softirq-safe-read
957 * lock with a softirq-unsafe lock - to achieve this we search
958 * the backwards-subgraph starting at <prev>, and the
959 * forwards-subgraph starting at <next>:
960 */
961 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
962 LOCK_ENABLED_SOFTIRQS, "soft"))
963 return 0;
964#endif
965 /*
966 * For recursive read-locks we do all the dependency checks, 1331 * For recursive read-locks we do all the dependency checks,
967 * but we dont store read-triggered dependencies (only 1332 * but we dont store read-triggered dependencies (only
968 * write-triggered dependencies). This ensures that only the 1333 * write-triggered dependencies). This ensures that only the
@@ -1088,224 +1453,8 @@ out_bug:
1088 return 0; 1453 return 0;
1089} 1454}
1090 1455
1091 1456unsigned long nr_lock_chains;
1092/* 1457static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
1093 * Is this the address of a static object:
1094 */
1095static int static_obj(void *obj)
1096{
1097 unsigned long start = (unsigned long) &_stext,
1098 end = (unsigned long) &_end,
1099 addr = (unsigned long) obj;
1100#ifdef CONFIG_SMP
1101 int i;
1102#endif
1103
1104 /*
1105 * static variable?
1106 */
1107 if ((addr >= start) && (addr < end))
1108 return 1;
1109
1110#ifdef CONFIG_SMP
1111 /*
1112 * percpu var?
1113 */
1114 for_each_possible_cpu(i) {
1115 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1116 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
1117 + per_cpu_offset(i);
1118
1119 if ((addr >= start) && (addr < end))
1120 return 1;
1121 }
1122#endif
1123
1124 /*
1125 * module var?
1126 */
1127 return is_module_address(addr);
1128}
1129
1130/*
1131 * To make lock name printouts unique, we calculate a unique
1132 * class->name_version generation counter:
1133 */
1134static int count_matching_names(struct lock_class *new_class)
1135{
1136 struct lock_class *class;
1137 int count = 0;
1138
1139 if (!new_class->name)
1140 return 0;
1141
1142 list_for_each_entry(class, &all_lock_classes, lock_entry) {
1143 if (new_class->key - new_class->subclass == class->key)
1144 return class->name_version;
1145 if (class->name && !strcmp(class->name, new_class->name))
1146 count = max(count, class->name_version);
1147 }
1148
1149 return count + 1;
1150}
1151
1152/*
1153 * Register a lock's class in the hash-table, if the class is not present
1154 * yet. Otherwise we look it up. We cache the result in the lock object
1155 * itself, so actual lookup of the hash should be once per lock object.
1156 */
1157static inline struct lock_class *
1158look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1159{
1160 struct lockdep_subclass_key *key;
1161 struct list_head *hash_head;
1162 struct lock_class *class;
1163
1164#ifdef CONFIG_DEBUG_LOCKDEP
1165 /*
1166 * If the architecture calls into lockdep before initializing
1167 * the hashes then we'll warn about it later. (we cannot printk
1168 * right now)
1169 */
1170 if (unlikely(!lockdep_initialized)) {
1171 lockdep_init();
1172 lockdep_init_error = 1;
1173 }
1174#endif
1175
1176 /*
1177 * Static locks do not have their class-keys yet - for them the key
1178 * is the lock object itself:
1179 */
1180 if (unlikely(!lock->key))
1181 lock->key = (void *)lock;
1182
1183 /*
1184 * NOTE: the class-key must be unique. For dynamic locks, a static
1185 * lock_class_key variable is passed in through the mutex_init()
1186 * (or spin_lock_init()) call - which acts as the key. For static
1187 * locks we use the lock object itself as the key.
1188 */
1189 BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class));
1190
1191 key = lock->key->subkeys + subclass;
1192
1193 hash_head = classhashentry(key);
1194
1195 /*
1196 * We can walk the hash lockfree, because the hash only
1197 * grows, and we are careful when adding entries to the end:
1198 */
1199 list_for_each_entry(class, hash_head, hash_entry)
1200 if (class->key == key)
1201 return class;
1202
1203 return NULL;
1204}
1205
1206/*
1207 * Register a lock's class in the hash-table, if the class is not present
1208 * yet. Otherwise we look it up. We cache the result in the lock object
1209 * itself, so actual lookup of the hash should be once per lock object.
1210 */
1211static inline struct lock_class *
1212register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1213{
1214 struct lockdep_subclass_key *key;
1215 struct list_head *hash_head;
1216 struct lock_class *class;
1217 unsigned long flags;
1218
1219 class = look_up_lock_class(lock, subclass);
1220 if (likely(class))
1221 return class;
1222
1223 /*
1224 * Debug-check: all keys must be persistent!
1225 */
1226 if (!static_obj(lock->key)) {
1227 debug_locks_off();
1228 printk("INFO: trying to register non-static key.\n");
1229 printk("the code is fine but needs lockdep annotation.\n");
1230 printk("turning off the locking correctness validator.\n");
1231 dump_stack();
1232
1233 return NULL;
1234 }
1235
1236 key = lock->key->subkeys + subclass;
1237 hash_head = classhashentry(key);
1238
1239 raw_local_irq_save(flags);
1240 if (!graph_lock()) {
1241 raw_local_irq_restore(flags);
1242 return NULL;
1243 }
1244 /*
1245 * We have to do the hash-walk again, to avoid races
1246 * with another CPU:
1247 */
1248 list_for_each_entry(class, hash_head, hash_entry)
1249 if (class->key == key)
1250 goto out_unlock_set;
1251 /*
1252 * Allocate a new key from the static array, and add it to
1253 * the hash:
1254 */
1255 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1256 if (!debug_locks_off_graph_unlock()) {
1257 raw_local_irq_restore(flags);
1258 return NULL;
1259 }
1260 raw_local_irq_restore(flags);
1261
1262 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1263 printk("turning off the locking correctness validator.\n");
1264 return NULL;
1265 }
1266 class = lock_classes + nr_lock_classes++;
1267 debug_atomic_inc(&nr_unused_locks);
1268 class->key = key;
1269 class->name = lock->name;
1270 class->subclass = subclass;
1271 INIT_LIST_HEAD(&class->lock_entry);
1272 INIT_LIST_HEAD(&class->locks_before);
1273 INIT_LIST_HEAD(&class->locks_after);
1274 class->name_version = count_matching_names(class);
1275 /*
1276 * We use RCU's safe list-add method to make
1277 * parallel walking of the hash-list safe:
1278 */
1279 list_add_tail_rcu(&class->hash_entry, hash_head);
1280
1281 if (verbose(class)) {
1282 graph_unlock();
1283 raw_local_irq_restore(flags);
1284
1285 printk("\nnew class %p: %s", class->key, class->name);
1286 if (class->name_version > 1)
1287 printk("#%d", class->name_version);
1288 printk("\n");
1289 dump_stack();
1290
1291 raw_local_irq_save(flags);
1292 if (!graph_lock()) {
1293 raw_local_irq_restore(flags);
1294 return NULL;
1295 }
1296 }
1297out_unlock_set:
1298 graph_unlock();
1299 raw_local_irq_restore(flags);
1300
1301 if (!subclass || force)
1302 lock->class_cache = class;
1303
1304 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
1305 return NULL;
1306
1307 return class;
1308}
1309 1458
1310/* 1459/*
1311 * Look up a dependency chain. If the key is not present yet then 1460 * Look up a dependency chain. If the key is not present yet then
@@ -1366,21 +1515,72 @@ cache_hit:
1366 chain->chain_key = chain_key; 1515 chain->chain_key = chain_key;
1367 list_add_tail_rcu(&chain->entry, hash_head); 1516 list_add_tail_rcu(&chain->entry, hash_head);
1368 debug_atomic_inc(&chain_lookup_misses); 1517 debug_atomic_inc(&chain_lookup_misses);
1369#ifdef CONFIG_TRACE_IRQFLAGS 1518 inc_chains();
1370 if (current->hardirq_context) 1519
1371 nr_hardirq_chains++; 1520 return 1;
1372 else { 1521}
1373 if (current->softirq_context) 1522
1374 nr_softirq_chains++; 1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1375 else 1524 struct held_lock *hlock, int chain_head)
1376 nr_process_chains++; 1525{
1377 } 1526 /*
1378#else 1527 * Trylock needs to maintain the stack of held locks, but it
1379 nr_process_chains++; 1528 * does not add new dependencies, because trylock can be done
1380#endif 1529 * in any order.
1530 *
1531 * We look up the chain_key and do the O(N^2) check and update of
1532 * the dependencies only if this is a new dependency chain.
1533 * (If lookup_chain_cache() returns with 1 it acquires
1534 * graph_lock for us)
1535 */
1536 if (!hlock->trylock && (hlock->check == 2) &&
1537 lookup_chain_cache(curr->curr_chain_key, hlock->class)) {
1538 /*
1539 * Check whether last held lock:
1540 *
1541 * - is irq-safe, if this lock is irq-unsafe
1542 * - is softirq-safe, if this lock is hardirq-unsafe
1543 *
1544 * And check whether the new lock's dependency graph
1545 * could lead back to the previous lock.
1546 *
1547 * any of these scenarios could lead to a deadlock. If
1548 * All validations
1549 */
1550 int ret = check_deadlock(curr, hlock, lock, hlock->read);
1551
1552 if (!ret)
1553 return 0;
1554 /*
1555 * Mark recursive read, as we jump over it when
1556 * building dependencies (just like we jump over
1557 * trylock entries):
1558 */
1559 if (ret == 2)
1560 hlock->read = 2;
1561 /*
1562 * Add dependency only if this lock is not the head
1563 * of the chain, and if it's not a secondary read-lock:
1564 */
1565 if (!chain_head && ret != 2)
1566 if (!check_prevs_add(curr, hlock))
1567 return 0;
1568 graph_unlock();
1569 } else
1570 /* after lookup_chain_cache(): */
1571 if (unlikely(!debug_locks))
1572 return 0;
1381 1573
1382 return 1; 1574 return 1;
1383} 1575}
1576#else
1577static inline int validate_chain(struct task_struct *curr,
1578 struct lockdep_map *lock, struct held_lock *hlock,
1579 int chain_head)
1580{
1581 return 1;
1582}
1583#endif
1384 1584
1385/* 1585/*
1386 * We are building curr_chain_key incrementally, so double-check 1586 * We are building curr_chain_key incrementally, so double-check
@@ -1425,6 +1625,57 @@ static void check_chain_key(struct task_struct *curr)
1425#endif 1625#endif
1426} 1626}
1427 1627
1628static int
1629print_usage_bug(struct task_struct *curr, struct held_lock *this,
1630 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1631{
1632 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1633 return 0;
1634
1635 printk("\n=================================\n");
1636 printk( "[ INFO: inconsistent lock state ]\n");
1637 print_kernel_version();
1638 printk( "---------------------------------\n");
1639
1640 printk("inconsistent {%s} -> {%s} usage.\n",
1641 usage_str[prev_bit], usage_str[new_bit]);
1642
1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1644 curr->comm, curr->pid,
1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1647 trace_hardirqs_enabled(curr),
1648 trace_softirqs_enabled(curr));
1649 print_lock(this);
1650
1651 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1652 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1653
1654 print_irqtrace_events(curr);
1655 printk("\nother info that might help us debug this:\n");
1656 lockdep_print_held_locks(curr);
1657
1658 printk("\nstack backtrace:\n");
1659 dump_stack();
1660
1661 return 0;
1662}
1663
1664/*
1665 * Print out an error if an invalid bit is set:
1666 */
1667static inline int
1668valid_state(struct task_struct *curr, struct held_lock *this,
1669 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1670{
1671 if (unlikely(this->class->usage_mask & (1 << bad_bit)))
1672 return print_usage_bug(curr, this, bad_bit, new_bit);
1673 return 1;
1674}
1675
1676static int mark_lock(struct task_struct *curr, struct held_lock *this,
1677 enum lock_usage_bit new_bit);
1678
1428#ifdef CONFIG_TRACE_IRQFLAGS 1679#ifdef CONFIG_TRACE_IRQFLAGS
1429 1680
1430/* 1681/*
@@ -1518,90 +1769,30 @@ void print_irqtrace_events(struct task_struct *curr)
1518 print_ip_sym(curr->softirq_disable_ip); 1769 print_ip_sym(curr->softirq_disable_ip);
1519} 1770}
1520 1771
1521#endif 1772static int hardirq_verbose(struct lock_class *class)
1522
1523static int
1524print_usage_bug(struct task_struct *curr, struct held_lock *this,
1525 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1526{ 1773{
1527 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1774#if HARDIRQ_VERBOSE
1528 return 0; 1775 return class_filter(class);
1529 1776#endif
1530 printk("\n=================================\n");
1531 printk( "[ INFO: inconsistent lock state ]\n");
1532 print_kernel_version();
1533 printk( "---------------------------------\n");
1534
1535 printk("inconsistent {%s} -> {%s} usage.\n",
1536 usage_str[prev_bit], usage_str[new_bit]);
1537
1538 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1539 curr->comm, curr->pid,
1540 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1541 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1542 trace_hardirqs_enabled(curr),
1543 trace_softirqs_enabled(curr));
1544 print_lock(this);
1545
1546 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1547 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1548
1549 print_irqtrace_events(curr);
1550 printk("\nother info that might help us debug this:\n");
1551 lockdep_print_held_locks(curr);
1552
1553 printk("\nstack backtrace:\n");
1554 dump_stack();
1555
1556 return 0; 1777 return 0;
1557} 1778}
1558 1779
1559/* 1780static int softirq_verbose(struct lock_class *class)
1560 * Print out an error if an invalid bit is set:
1561 */
1562static inline int
1563valid_state(struct task_struct *curr, struct held_lock *this,
1564 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1565{ 1781{
1566 if (unlikely(this->class->usage_mask & (1 << bad_bit))) 1782#if SOFTIRQ_VERBOSE
1567 return print_usage_bug(curr, this, bad_bit, new_bit); 1783 return class_filter(class);
1568 return 1; 1784#endif
1785 return 0;
1569} 1786}
1570 1787
1571#define STRICT_READ_CHECKS 1 1788#define STRICT_READ_CHECKS 1
1572 1789
1573/* 1790static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1574 * Mark a lock with a usage bit, and validate the state transition: 1791 enum lock_usage_bit new_bit)
1575 */
1576static int mark_lock(struct task_struct *curr, struct held_lock *this,
1577 enum lock_usage_bit new_bit)
1578{ 1792{
1579 unsigned int new_mask = 1 << new_bit, ret = 1; 1793 int ret = 1;
1580
1581 /*
1582 * If already set then do not dirty the cacheline,
1583 * nor do any checks:
1584 */
1585 if (likely(this->class->usage_mask & new_mask))
1586 return 1;
1587
1588 if (!graph_lock())
1589 return 0;
1590 /*
1591 * Make sure we didnt race:
1592 */
1593 if (unlikely(this->class->usage_mask & new_mask)) {
1594 graph_unlock();
1595 return 1;
1596 }
1597
1598 this->class->usage_mask |= new_mask;
1599 1794
1600 if (!save_trace(this->class->usage_traces + new_bit)) 1795 switch(new_bit) {
1601 return 0;
1602
1603 switch (new_bit) {
1604#ifdef CONFIG_TRACE_IRQFLAGS
1605 case LOCK_USED_IN_HARDIRQ: 1796 case LOCK_USED_IN_HARDIRQ:
1606 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) 1797 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
1607 return 0; 1798 return 0;
@@ -1760,37 +1951,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1760 if (softirq_verbose(this->class)) 1951 if (softirq_verbose(this->class))
1761 ret = 2; 1952 ret = 2;
1762 break; 1953 break;
1763#endif
1764 case LOCK_USED:
1765 /*
1766 * Add it to the global list of classes:
1767 */
1768 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
1769 debug_atomic_dec(&nr_unused_locks);
1770 break;
1771 default: 1954 default:
1772 if (!debug_locks_off_graph_unlock())
1773 return 0;
1774 WARN_ON(1); 1955 WARN_ON(1);
1775 return 0; 1956 break;
1776 }
1777
1778 graph_unlock();
1779
1780 /*
1781 * We must printk outside of the graph_lock:
1782 */
1783 if (ret == 2) {
1784 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
1785 print_lock(this);
1786 print_irqtrace_events(curr);
1787 dump_stack();
1788 } 1957 }
1789 1958
1790 return ret; 1959 return ret;
1791} 1960}
1792 1961
1793#ifdef CONFIG_TRACE_IRQFLAGS
1794/* 1962/*
1795 * Mark all held locks with a usage bit: 1963 * Mark all held locks with a usage bit:
1796 */ 1964 */
@@ -1973,9 +2141,176 @@ void trace_softirqs_off(unsigned long ip)
1973 debug_atomic_inc(&redundant_softirqs_off); 2141 debug_atomic_inc(&redundant_softirqs_off);
1974} 2142}
1975 2143
2144static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2145{
2146 /*
2147 * If non-trylock use in a hardirq or softirq context, then
2148 * mark the lock as used in these contexts:
2149 */
2150 if (!hlock->trylock) {
2151 if (hlock->read) {
2152 if (curr->hardirq_context)
2153 if (!mark_lock(curr, hlock,
2154 LOCK_USED_IN_HARDIRQ_READ))
2155 return 0;
2156 if (curr->softirq_context)
2157 if (!mark_lock(curr, hlock,
2158 LOCK_USED_IN_SOFTIRQ_READ))
2159 return 0;
2160 } else {
2161 if (curr->hardirq_context)
2162 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2163 return 0;
2164 if (curr->softirq_context)
2165 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2166 return 0;
2167 }
2168 }
2169 if (!hlock->hardirqs_off) {
2170 if (hlock->read) {
2171 if (!mark_lock(curr, hlock,
2172 LOCK_ENABLED_HARDIRQS_READ))
2173 return 0;
2174 if (curr->softirqs_enabled)
2175 if (!mark_lock(curr, hlock,
2176 LOCK_ENABLED_SOFTIRQS_READ))
2177 return 0;
2178 } else {
2179 if (!mark_lock(curr, hlock,
2180 LOCK_ENABLED_HARDIRQS))
2181 return 0;
2182 if (curr->softirqs_enabled)
2183 if (!mark_lock(curr, hlock,
2184 LOCK_ENABLED_SOFTIRQS))
2185 return 0;
2186 }
2187 }
2188
2189 return 1;
2190}
2191
2192static int separate_irq_context(struct task_struct *curr,
2193 struct held_lock *hlock)
2194{
2195 unsigned int depth = curr->lockdep_depth;
2196
2197 /*
2198 * Keep track of points where we cross into an interrupt context:
2199 */
2200 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2201 curr->softirq_context;
2202 if (depth) {
2203 struct held_lock *prev_hlock;
2204
2205 prev_hlock = curr->held_locks + depth-1;
2206 /*
2207 * If we cross into another context, reset the
2208 * hash key (this also prevents the checking and the
2209 * adding of the dependency to 'prev'):
2210 */
2211 if (prev_hlock->irq_context != hlock->irq_context)
2212 return 1;
2213 }
2214 return 0;
2215}
2216
2217#else
2218
2219static inline
2220int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2221 enum lock_usage_bit new_bit)
2222{
2223 WARN_ON(1);
2224 return 1;
2225}
2226
2227static inline int mark_irqflags(struct task_struct *curr,
2228 struct held_lock *hlock)
2229{
2230 return 1;
2231}
2232
2233static inline int separate_irq_context(struct task_struct *curr,
2234 struct held_lock *hlock)
2235{
2236 return 0;
2237}
2238
1976#endif 2239#endif
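The irq_context value computed in separate_irq_context() above packs the hardirq/softirq state into a small code so that crossing from one context into another is a single comparison. A minimal user-space sketch of that encoding (illustrative names only, and it simplifies the raw softirq_context counter to a flag):

	#include <stdio.h>

	/* Encode (hardirq, softirq) the way separate_irq_context() does:
	 * 0 = process context, 1 = softirq, 2 = hardirq, 3 = hardirq inside softirq. */
	static unsigned int irq_context(int in_hardirq, int in_softirq)
	{
		return 2 * (in_hardirq ? 1 : 0) + (in_softirq ? 1 : 0);
	}

	int main(void)
	{
		unsigned int prev = irq_context(0, 0);	/* previous lock taken in a task */
		unsigned int curr = irq_context(1, 0);	/* next lock taken in a hardirq  */

		if (prev != curr)
			printf("context changed: restart the chain key\n");
		return 0;
	}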
1977 2240
1978/* 2241/*
2242 * Mark a lock with a usage bit, and validate the state transition:
2243 */
2244static int mark_lock(struct task_struct *curr, struct held_lock *this,
2245 enum lock_usage_bit new_bit)
2246{
2247 unsigned int new_mask = 1 << new_bit, ret = 1;
2248
2249 /*
2250 * If already set then do not dirty the cacheline,
2251 * nor do any checks:
2252 */
2253 if (likely(this->class->usage_mask & new_mask))
2254 return 1;
2255
2256 if (!graph_lock())
2257 return 0;
2258 /*
2259	 * Make sure we didn't race:
2260 */
2261 if (unlikely(this->class->usage_mask & new_mask)) {
2262 graph_unlock();
2263 return 1;
2264 }
2265
2266 this->class->usage_mask |= new_mask;
2267
2268 if (!save_trace(this->class->usage_traces + new_bit))
2269 return 0;
2270
2271 switch (new_bit) {
2272 case LOCK_USED_IN_HARDIRQ:
2273 case LOCK_USED_IN_SOFTIRQ:
2274 case LOCK_USED_IN_HARDIRQ_READ:
2275 case LOCK_USED_IN_SOFTIRQ_READ:
2276 case LOCK_ENABLED_HARDIRQS:
2277 case LOCK_ENABLED_SOFTIRQS:
2278 case LOCK_ENABLED_HARDIRQS_READ:
2279 case LOCK_ENABLED_SOFTIRQS_READ:
2280 ret = mark_lock_irq(curr, this, new_bit);
2281 if (!ret)
2282 return 0;
2283 break;
2284 case LOCK_USED:
2285 /*
2286 * Add it to the global list of classes:
2287 */
2288 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
2289 debug_atomic_dec(&nr_unused_locks);
2290 break;
2291 default:
2292 if (!debug_locks_off_graph_unlock())
2293 return 0;
2294 WARN_ON(1);
2295 return 0;
2296 }
2297
2298 graph_unlock();
2299
2300 /*
2301 * We must printk outside of the graph_lock:
2302 */
2303 if (ret == 2) {
2304 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
2305 print_lock(this);
2306 print_irqtrace_events(curr);
2307 dump_stack();
2308 }
2309
2310 return ret;
2311}
2312
2313/*
1979 * Initialize a lock instance's lock-class mapping info: 2314 * Initialize a lock instance's lock-class mapping info:
1980 */ 2315 */
1981void lockdep_init_map(struct lockdep_map *lock, const char *name, 2316void lockdep_init_map(struct lockdep_map *lock, const char *name,
@@ -1999,6 +2334,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
1999 lock->name = name; 2334 lock->name = name;
2000 lock->key = key; 2335 lock->key = key;
2001 lock->class_cache = NULL; 2336 lock->class_cache = NULL;
2337#ifdef CONFIG_LOCK_STAT
2338 lock->cpu = raw_smp_processor_id();
2339#endif
2002 if (subclass) 2340 if (subclass)
2003 register_lock_class(lock, subclass, 1); 2341 register_lock_class(lock, subclass, 1);
2004} 2342}
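lockdep_init_map() is normally reached through the lock-init macros, but a primitive that carries its own lockdep_map registers it the same way. A hedged sketch with a hypothetical wrapper lock (my_lock, my_lock_key and my_lock_init are illustrative, not kernel API):

	#include <linux/spinlock.h>
	#include <linux/lockdep.h>

	/* Hypothetical driver lock wrapping a spinlock with its own class. */
	struct my_lock {
		spinlock_t		slock;
		struct lockdep_map	dep_map;
	};

	static struct lock_class_key my_lock_key;	/* one key => one lock class */

	static void my_lock_init(struct my_lock *l)
	{
		spin_lock_init(&l->slock);
		/* subclass 0; with CONFIG_LOCK_STAT the hunk above also records
		 * the CPU that initialized the map in lock->cpu. */
		lockdep_init_map(&l->dep_map, "my_lock", &my_lock_key, 0);
	}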
@@ -2020,6 +2358,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2020 int chain_head = 0; 2358 int chain_head = 0;
2021 u64 chain_key; 2359 u64 chain_key;
2022 2360
2361 if (!prove_locking)
2362 check = 1;
2363
2023 if (unlikely(!debug_locks)) 2364 if (unlikely(!debug_locks))
2024 return 0; 2365 return 0;
2025 2366
@@ -2070,57 +2411,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2070 hlock->read = read; 2411 hlock->read = read;
2071 hlock->check = check; 2412 hlock->check = check;
2072 hlock->hardirqs_off = hardirqs_off; 2413 hlock->hardirqs_off = hardirqs_off;
2073 2414#ifdef CONFIG_LOCK_STAT
2074 if (check != 2) 2415 hlock->waittime_stamp = 0;
2075 goto out_calc_hash; 2416 hlock->holdtime_stamp = sched_clock();
2076#ifdef CONFIG_TRACE_IRQFLAGS
2077 /*
2078 * If non-trylock use in a hardirq or softirq context, then
2079 * mark the lock as used in these contexts:
2080 */
2081 if (!trylock) {
2082 if (read) {
2083 if (curr->hardirq_context)
2084 if (!mark_lock(curr, hlock,
2085 LOCK_USED_IN_HARDIRQ_READ))
2086 return 0;
2087 if (curr->softirq_context)
2088 if (!mark_lock(curr, hlock,
2089 LOCK_USED_IN_SOFTIRQ_READ))
2090 return 0;
2091 } else {
2092 if (curr->hardirq_context)
2093 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2094 return 0;
2095 if (curr->softirq_context)
2096 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2097 return 0;
2098 }
2099 }
2100 if (!hardirqs_off) {
2101 if (read) {
2102 if (!mark_lock(curr, hlock,
2103 LOCK_ENABLED_HARDIRQS_READ))
2104 return 0;
2105 if (curr->softirqs_enabled)
2106 if (!mark_lock(curr, hlock,
2107 LOCK_ENABLED_SOFTIRQS_READ))
2108 return 0;
2109 } else {
2110 if (!mark_lock(curr, hlock,
2111 LOCK_ENABLED_HARDIRQS))
2112 return 0;
2113 if (curr->softirqs_enabled)
2114 if (!mark_lock(curr, hlock,
2115 LOCK_ENABLED_SOFTIRQS))
2116 return 0;
2117 }
2118 }
2119#endif 2417#endif
2418
2419 if (check == 2 && !mark_irqflags(curr, hlock))
2420 return 0;
2421
2120 /* mark it as used: */ 2422 /* mark it as used: */
2121 if (!mark_lock(curr, hlock, LOCK_USED)) 2423 if (!mark_lock(curr, hlock, LOCK_USED))
2122 return 0; 2424 return 0;
2123out_calc_hash: 2425
2124 /* 2426 /*
2125	 * Calculate the chain hash: it's the combined hash of all the 2427
2126 * lock keys along the dependency chain. We save the hash value 2428 * lock keys along the dependency chain. We save the hash value
@@ -2143,77 +2445,15 @@ out_calc_hash:
2143 } 2445 }
2144 2446
2145 hlock->prev_chain_key = chain_key; 2447 hlock->prev_chain_key = chain_key;
2146 2448 if (separate_irq_context(curr, hlock)) {
2147#ifdef CONFIG_TRACE_IRQFLAGS 2449 chain_key = 0;
2148 /* 2450 chain_head = 1;
2149 * Keep track of points where we cross into an interrupt context:
2150 */
2151 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2152 curr->softirq_context;
2153 if (depth) {
2154 struct held_lock *prev_hlock;
2155
2156 prev_hlock = curr->held_locks + depth-1;
2157 /*
2158 * If we cross into another context, reset the
2159 * hash key (this also prevents the checking and the
2160 * adding of the dependency to 'prev'):
2161 */
2162 if (prev_hlock->irq_context != hlock->irq_context) {
2163 chain_key = 0;
2164 chain_head = 1;
2165 }
2166 } 2451 }
2167#endif
2168 chain_key = iterate_chain_key(chain_key, id); 2452 chain_key = iterate_chain_key(chain_key, id);
2169 curr->curr_chain_key = chain_key; 2453 curr->curr_chain_key = chain_key;
2170 2454
2171 /* 2455 if (!validate_chain(curr, lock, hlock, chain_head))
2172 * Trylock needs to maintain the stack of held locks, but it 2456 return 0;
2173 * does not add new dependencies, because trylock can be done
2174 * in any order.
2175 *
2176 * We look up the chain_key and do the O(N^2) check and update of
2177 * the dependencies only if this is a new dependency chain.
2178 * (If lookup_chain_cache() returns with 1 it acquires
2179 * graph_lock for us)
2180 */
2181 if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) {
2182 /*
2183 * Check whether last held lock:
2184 *
2185 * - is irq-safe, if this lock is irq-unsafe
2186 * - is softirq-safe, if this lock is hardirq-unsafe
2187 *
2188 * And check whether the new lock's dependency graph
2189 * could lead back to the previous lock.
2190 *
2191 * any of these scenarios could lead to a deadlock. If
2192 * All validations
2193 */
2194 int ret = check_deadlock(curr, hlock, lock, read);
2195
2196 if (!ret)
2197 return 0;
2198 /*
2199 * Mark recursive read, as we jump over it when
2200 * building dependencies (just like we jump over
2201 * trylock entries):
2202 */
2203 if (ret == 2)
2204 hlock->read = 2;
2205 /*
2206 * Add dependency only if this lock is not the head
2207 * of the chain, and if it's not a secondary read-lock:
2208 */
2209 if (!chain_head && ret != 2)
2210 if (!check_prevs_add(curr, hlock))
2211 return 0;
2212 graph_unlock();
2213 } else
2214 /* after lookup_chain_cache(): */
2215 if (unlikely(!debug_locks))
2216 return 0;
2217 2457
2218 curr->lockdep_depth++; 2458 curr->lockdep_depth++;
2219 check_chain_key(curr); 2459 check_chain_key(curr);
@@ -2315,6 +2555,8 @@ lock_release_non_nested(struct task_struct *curr,
2315 return print_unlock_inbalance_bug(curr, lock, ip); 2555 return print_unlock_inbalance_bug(curr, lock, ip);
2316 2556
2317found_it: 2557found_it:
2558 lock_release_holdtime(hlock);
2559
2318 /* 2560 /*
2319 * We have the right lock to unlock, 'hlock' points to it. 2561 * We have the right lock to unlock, 'hlock' points to it.
2320 * Now we remove it from the stack, and add back the other 2562 * Now we remove it from the stack, and add back the other
@@ -2367,6 +2609,8 @@ static int lock_release_nested(struct task_struct *curr,
2367 2609
2368 curr->curr_chain_key = hlock->prev_chain_key; 2610 curr->curr_chain_key = hlock->prev_chain_key;
2369 2611
2612 lock_release_holdtime(hlock);
2613
2370#ifdef CONFIG_DEBUG_LOCKDEP 2614#ifdef CONFIG_DEBUG_LOCKDEP
2371 hlock->prev_chain_key = 0; 2615 hlock->prev_chain_key = 0;
2372 hlock->class = NULL; 2616 hlock->class = NULL;
@@ -2441,6 +2685,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2441{ 2685{
2442 unsigned long flags; 2686 unsigned long flags;
2443 2687
2688 if (unlikely(!lock_stat && !prove_locking))
2689 return;
2690
2444 if (unlikely(current->lockdep_recursion)) 2691 if (unlikely(current->lockdep_recursion))
2445 return; 2692 return;
2446 2693
@@ -2460,6 +2707,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2460{ 2707{
2461 unsigned long flags; 2708 unsigned long flags;
2462 2709
2710 if (unlikely(!lock_stat && !prove_locking))
2711 return;
2712
2463 if (unlikely(current->lockdep_recursion)) 2713 if (unlikely(current->lockdep_recursion))
2464 return; 2714 return;
2465 2715
@@ -2473,6 +2723,166 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2473 2723
2474EXPORT_SYMBOL_GPL(lock_release); 2724EXPORT_SYMBOL_GPL(lock_release);
2475 2725
2726#ifdef CONFIG_LOCK_STAT
2727static int
2728print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
2729 unsigned long ip)
2730{
2731 if (!debug_locks_off())
2732 return 0;
2733 if (debug_locks_silent)
2734 return 0;
2735
2736 printk("\n=================================\n");
2737 printk( "[ BUG: bad contention detected! ]\n");
2738 printk( "---------------------------------\n");
2739 printk("%s/%d is trying to contend lock (",
2740 curr->comm, curr->pid);
2741 print_lockdep_cache(lock);
2742 printk(") at:\n");
2743 print_ip_sym(ip);
2744 printk("but there are no locks held!\n");
2745 printk("\nother info that might help us debug this:\n");
2746 lockdep_print_held_locks(curr);
2747
2748 printk("\nstack backtrace:\n");
2749 dump_stack();
2750
2751 return 0;
2752}
2753
2754static void
2755__lock_contended(struct lockdep_map *lock, unsigned long ip)
2756{
2757 struct task_struct *curr = current;
2758 struct held_lock *hlock, *prev_hlock;
2759 struct lock_class_stats *stats;
2760 unsigned int depth;
2761 int i, point;
2762
2763 depth = curr->lockdep_depth;
2764 if (DEBUG_LOCKS_WARN_ON(!depth))
2765 return;
2766
2767 prev_hlock = NULL;
2768 for (i = depth-1; i >= 0; i--) {
2769 hlock = curr->held_locks + i;
2770 /*
2771 * We must not cross into another context:
2772 */
2773 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2774 break;
2775 if (hlock->instance == lock)
2776 goto found_it;
2777 prev_hlock = hlock;
2778 }
2779 print_lock_contention_bug(curr, lock, ip);
2780 return;
2781
2782found_it:
2783 hlock->waittime_stamp = sched_clock();
2784
2785 point = lock_contention_point(hlock->class, ip);
2786
2787 stats = get_lock_stats(hlock->class);
2788 if (point < ARRAY_SIZE(stats->contention_point))
2789		stats->contention_point[point]++;
2790 if (lock->cpu != smp_processor_id())
2791 stats->bounces[bounce_contended + !!hlock->read]++;
2792 put_lock_stats(stats);
2793}
2794
2795static void
2796__lock_acquired(struct lockdep_map *lock)
2797{
2798 struct task_struct *curr = current;
2799 struct held_lock *hlock, *prev_hlock;
2800 struct lock_class_stats *stats;
2801 unsigned int depth;
2802 u64 now;
2803 s64 waittime = 0;
2804 int i, cpu;
2805
2806 depth = curr->lockdep_depth;
2807 if (DEBUG_LOCKS_WARN_ON(!depth))
2808 return;
2809
2810 prev_hlock = NULL;
2811 for (i = depth-1; i >= 0; i--) {
2812 hlock = curr->held_locks + i;
2813 /*
2814 * We must not cross into another context:
2815 */
2816 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2817 break;
2818 if (hlock->instance == lock)
2819 goto found_it;
2820 prev_hlock = hlock;
2821 }
2822 print_lock_contention_bug(curr, lock, _RET_IP_);
2823 return;
2824
2825found_it:
2826 cpu = smp_processor_id();
2827 if (hlock->waittime_stamp) {
2828 now = sched_clock();
2829 waittime = now - hlock->waittime_stamp;
2830 hlock->holdtime_stamp = now;
2831 }
2832
2833 stats = get_lock_stats(hlock->class);
2834 if (waittime) {
2835 if (hlock->read)
2836 lock_time_inc(&stats->read_waittime, waittime);
2837 else
2838 lock_time_inc(&stats->write_waittime, waittime);
2839 }
2840 if (lock->cpu != cpu)
2841 stats->bounces[bounce_acquired + !!hlock->read]++;
2842 put_lock_stats(stats);
2843
2844 lock->cpu = cpu;
2845}
2846
2847void lock_contended(struct lockdep_map *lock, unsigned long ip)
2848{
2849 unsigned long flags;
2850
2851 if (unlikely(!lock_stat))
2852 return;
2853
2854 if (unlikely(current->lockdep_recursion))
2855 return;
2856
2857 raw_local_irq_save(flags);
2858 check_flags(flags);
2859 current->lockdep_recursion = 1;
2860 __lock_contended(lock, ip);
2861 current->lockdep_recursion = 0;
2862 raw_local_irq_restore(flags);
2863}
2864EXPORT_SYMBOL_GPL(lock_contended);
2865
2866void lock_acquired(struct lockdep_map *lock)
2867{
2868 unsigned long flags;
2869
2870 if (unlikely(!lock_stat))
2871 return;
2872
2873 if (unlikely(current->lockdep_recursion))
2874 return;
2875
2876 raw_local_irq_save(flags);
2877 check_flags(flags);
2878 current->lockdep_recursion = 1;
2879 __lock_acquired(lock);
2880 current->lockdep_recursion = 0;
2881 raw_local_irq_restore(flags);
2882}
2883EXPORT_SYMBOL_GPL(lock_acquired);
2884#endif
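The statistics gathered above reduce to two sched_clock() samples per contended acquisition: __lock_contended() stamps the start of the wait, __lock_acquired() converts it into a wait time and restarts the clock for the hold time. A user-space model of that bookkeeping (fake_sched_clock() is a stand-in for the kernel's nanosecond clock):

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t fake_sched_clock(void) { static uint64_t t; return t += 1000; }

	struct held_lock_times { uint64_t waittime_stamp, holdtime_stamp; };

	int main(void)
	{
		struct held_lock_times h = { 0, 0 };

		h.waittime_stamp = fake_sched_clock();		/* lock_contended()        */
		uint64_t now = fake_sched_clock();		/* lock_acquired()         */
		uint64_t waittime = now - h.waittime_stamp;	/* feeds read/write_waittime */
		h.holdtime_stamp = now;				/* hold time starts here   */

		printf("waited %llu ns\n", (unsigned long long)waittime);
		return 0;
	}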
2885
2476/* 2886/*
2477 * Used by the testsuite, sanitize the validator state 2887 * Used by the testsuite, sanitize the validator state
2478 * after a simulated failure: 2888 * after a simulated failure:
@@ -2636,8 +3046,11 @@ void __init lockdep_info(void)
2636 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3046 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
2637 3047
2638#ifdef CONFIG_DEBUG_LOCKDEP 3048#ifdef CONFIG_DEBUG_LOCKDEP
2639 if (lockdep_init_error) 3049 if (lockdep_init_error) {
2640 printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); 3050 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
3051 printk("Call stack leading to lockdep invocation was:\n");
3052 print_stack_trace(&lockdep_init_trace, 0);
3053 }
2641#endif 3054#endif
2642} 3055}
2643 3056
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 58f35e586ee3..9f17af4a2490 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Started by Ingo Molnar: 6 * Started by Ingo Molnar:
7 * 7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * 10 *
10 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
11 * 12 *
@@ -15,6 +16,10 @@
15#include <linux/seq_file.h> 16#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
17#include <linux/debug_locks.h> 18#include <linux/debug_locks.h>
19#include <linux/vmalloc.h>
20#include <linux/sort.h>
21#include <asm/uaccess.h>
22#include <asm/div64.h>
18 23
19#include "lockdep_internals.h" 24#include "lockdep_internals.h"
20 25
@@ -271,8 +276,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
271 if (nr_list_entries) 276 if (nr_list_entries)
272 factor = sum_forward_deps / nr_list_entries; 277 factor = sum_forward_deps / nr_list_entries;
273 278
279#ifdef CONFIG_PROVE_LOCKING
274 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 280 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
275 nr_lock_chains, MAX_LOCKDEP_CHAINS); 281 nr_lock_chains, MAX_LOCKDEP_CHAINS);
282#endif
276 283
277#ifdef CONFIG_TRACE_IRQFLAGS 284#ifdef CONFIG_TRACE_IRQFLAGS
278 seq_printf(m, " in-hardirq chains: %11u\n", 285 seq_printf(m, " in-hardirq chains: %11u\n",
@@ -342,6 +349,292 @@ static const struct file_operations proc_lockdep_stats_operations = {
342 .release = seq_release, 349 .release = seq_release,
343}; 350};
344 351
352#ifdef CONFIG_LOCK_STAT
353
354struct lock_stat_data {
355 struct lock_class *class;
356 struct lock_class_stats stats;
357};
358
359struct lock_stat_seq {
360 struct lock_stat_data *iter;
361 struct lock_stat_data *iter_end;
362 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
363};
364
365/*
366 * sort on absolute number of contentions
367 */
368static int lock_stat_cmp(const void *l, const void *r)
369{
370 const struct lock_stat_data *dl = l, *dr = r;
371 unsigned long nl, nr;
372
373 nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
374 nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
375
376 return nr - nl;
377}
378
379static void seq_line(struct seq_file *m, char c, int offset, int length)
380{
381 int i;
382
383 for (i = 0; i < offset; i++)
384 seq_puts(m, " ");
385 for (i = 0; i < length; i++)
386 seq_printf(m, "%c", c);
387 seq_puts(m, "\n");
388}
389
390static void snprint_time(char *buf, size_t bufsiz, s64 nr)
391{
392 unsigned long rem;
393
394 rem = do_div(nr, 1000); /* XXX: do_div_signed */
395 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10);
396}
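snprint_time() above prints a nanosecond count as microseconds with two decimals, rounding the sub-microsecond remainder. A plain-C model of the same arithmetic (ordinary division stands in for do_div(); the carry when the remainder rounds up to 100 is ignored, as in the original):

	#include <stdio.h>
	#include <stdint.h>

	static void print_time_ns(int64_t ns)
	{
		int64_t us  = ns / 1000;
		int     rem = (int)(ns % 1000);		/* do_div() in the kernel */

		printf("%lld.%02d\n", (long long)us, (rem + 5) / 10);
	}

	int main(void)
	{
		print_time_ns(1234567);		/* prints 1234.57 */
		return 0;
	}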
397
398static void seq_time(struct seq_file *m, s64 time)
399{
400 char num[15];
401
402 snprint_time(num, sizeof(num), time);
403 seq_printf(m, " %14s", num);
404}
405
406static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
407{
408 seq_printf(m, "%14lu", lt->nr);
409 seq_time(m, lt->min);
410 seq_time(m, lt->max);
411 seq_time(m, lt->total);
412}
413
414static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
415{
416 char name[39];
417 struct lock_class *class;
418 struct lock_class_stats *stats;
419 int i, namelen;
420
421 class = data->class;
422 stats = &data->stats;
423
424 namelen = 38;
425 if (class->name_version > 1)
426 namelen -= 2; /* XXX truncates versions > 9 */
427 if (class->subclass)
428 namelen -= 2;
429
430 if (!class->name) {
431 char str[KSYM_NAME_LEN];
432 const char *key_name;
433
434 key_name = __get_key_name(class->key, str);
435 snprintf(name, namelen, "%s", key_name);
436 } else {
437 snprintf(name, namelen, "%s", class->name);
438 }
439 namelen = strlen(name);
440 if (class->name_version > 1) {
441 snprintf(name+namelen, 3, "#%d", class->name_version);
442 namelen += 2;
443 }
444 if (class->subclass) {
445 snprintf(name+namelen, 3, "/%d", class->subclass);
446 namelen += 2;
447 }
448
449 if (stats->write_holdtime.nr) {
450 if (stats->read_holdtime.nr)
451 seq_printf(m, "%38s-W:", name);
452 else
453 seq_printf(m, "%40s:", name);
454
455 seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
456 seq_lock_time(m, &stats->write_waittime);
457 seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
458 seq_lock_time(m, &stats->write_holdtime);
459 seq_puts(m, "\n");
460 }
461
462 if (stats->read_holdtime.nr) {
463 seq_printf(m, "%38s-R:", name);
464 seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
465 seq_lock_time(m, &stats->read_waittime);
466 seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
467 seq_lock_time(m, &stats->read_holdtime);
468 seq_puts(m, "\n");
469 }
470
471 if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
472 return;
473
474 if (stats->read_holdtime.nr)
475 namelen += 2;
476
477 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
478 char sym[KSYM_SYMBOL_LEN];
479 char ip[32];
480
481 if (class->contention_point[i] == 0)
482 break;
483
484 if (!i)
485 seq_line(m, '-', 40-namelen, namelen);
486
487 sprint_symbol(sym, class->contention_point[i]);
488 snprintf(ip, sizeof(ip), "[<%p>]",
489 (void *)class->contention_point[i]);
490 seq_printf(m, "%40s %14lu %29s %s\n", name,
491 stats->contention_point[i],
492 ip, sym);
493 }
494 if (i) {
495 seq_puts(m, "\n");
496 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
497 seq_puts(m, "\n");
498 }
499}
500
501static void seq_header(struct seq_file *m)
502{
503 seq_printf(m, "lock_stat version 0.2\n");
504 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
505 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
506 "%14s %14s\n",
507 "class name",
508 "con-bounces",
509 "contentions",
510 "waittime-min",
511 "waittime-max",
512 "waittime-total",
513 "acq-bounces",
514 "acquisitions",
515 "holdtime-min",
516 "holdtime-max",
517 "holdtime-total");
518 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
519 seq_printf(m, "\n");
520}
521
522static void *ls_start(struct seq_file *m, loff_t *pos)
523{
524 struct lock_stat_seq *data = m->private;
525
526 if (data->iter == data->stats)
527 seq_header(m);
528
529 if (data->iter == data->iter_end)
530 data->iter = NULL;
531
532 return data->iter;
533}
534
535static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
536{
537 struct lock_stat_seq *data = m->private;
538
539 (*pos)++;
540
541 data->iter = v;
542 data->iter++;
543 if (data->iter == data->iter_end)
544 data->iter = NULL;
545
546 return data->iter;
547}
548
549static void ls_stop(struct seq_file *m, void *v)
550{
551}
552
553static int ls_show(struct seq_file *m, void *v)
554{
555 struct lock_stat_seq *data = m->private;
556
557 seq_stats(m, data->iter);
558 return 0;
559}
560
561static struct seq_operations lockstat_ops = {
562 .start = ls_start,
563 .next = ls_next,
564 .stop = ls_stop,
565 .show = ls_show,
566};
567
568static int lock_stat_open(struct inode *inode, struct file *file)
569{
570 int res;
571 struct lock_class *class;
572 struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
573
574 if (!data)
575 return -ENOMEM;
576
577 res = seq_open(file, &lockstat_ops);
578 if (!res) {
579 struct lock_stat_data *iter = data->stats;
580 struct seq_file *m = file->private_data;
581
582 data->iter = iter;
583 list_for_each_entry(class, &all_lock_classes, lock_entry) {
584 iter->class = class;
585 iter->stats = lock_stats(class);
586 iter++;
587 }
588 data->iter_end = iter;
589
590 sort(data->stats, data->iter_end - data->iter,
591 sizeof(struct lock_stat_data),
592 lock_stat_cmp, NULL);
593
594 m->private = data;
595 } else
596 vfree(data);
597
598 return res;
599}
600
601static ssize_t lock_stat_write(struct file *file, const char __user *buf,
602 size_t count, loff_t *ppos)
603{
604 struct lock_class *class;
605 char c;
606
607 if (count) {
608 if (get_user(c, buf))
609 return -EFAULT;
610
611 if (c != '0')
612 return count;
613
614 list_for_each_entry(class, &all_lock_classes, lock_entry)
615 clear_lock_stats(class);
616 }
617 return count;
618}
619
620static int lock_stat_release(struct inode *inode, struct file *file)
621{
622 struct seq_file *seq = file->private_data;
623
624 vfree(seq->private);
625 seq->private = NULL;
626 return seq_release(inode, file);
627}
628
629static const struct file_operations proc_lock_stat_operations = {
630 .open = lock_stat_open,
631 .write = lock_stat_write,
632 .read = seq_read,
633 .llseek = seq_lseek,
634 .release = lock_stat_release,
635};
636#endif /* CONFIG_LOCK_STAT */
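Per lock_stat_write() above, writing the single character '0' to /proc/lock_stat clears every class's statistics and any other input is silently accepted. From a shell, `echo 0 > /proc/lock_stat` does the job; a small user-space helper doing the same (requires CONFIG_LOCK_STAT and root):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/lock_stat", "w");

		if (!f) {
			perror("/proc/lock_stat");
			return 1;
		}
		fputc('0', f);		/* handled by lock_stat_write() */
		fclose(f);
		return 0;
	}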
637
345static int __init lockdep_proc_init(void) 638static int __init lockdep_proc_init(void)
346{ 639{
347 struct proc_dir_entry *entry; 640 struct proc_dir_entry *entry;
@@ -354,6 +647,12 @@ static int __init lockdep_proc_init(void)
354 if (entry) 647 if (entry)
355 entry->proc_fops = &proc_lockdep_stats_operations; 648 entry->proc_fops = &proc_lockdep_stats_operations;
356 649
650#ifdef CONFIG_LOCK_STAT
651 entry = create_proc_entry("lock_stat", S_IRUSR, NULL);
652 if (entry)
653 entry->proc_fops = &proc_lock_stat_operations;
654#endif
655
357 return 0; 656 return 0;
358} 657}
359 658
diff --git a/kernel/module.c b/kernel/module.c
index 015d60cfd90e..33c04ad51175 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -61,10 +61,8 @@ extern int module_sysfs_initialized;
61/* If this is set, the section belongs in the init part of the module */ 61/* If this is set, the section belongs in the init part of the module */
62#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 62#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
63 63
64/* Protects module list */ 64/* List of modules, protected by module_mutex or preempt_disable
65static DEFINE_SPINLOCK(modlist_lock); 65 * (add/delete uses stop_machine). */
66
67/* List of modules, protected by module_mutex AND modlist_lock */
68static DEFINE_MUTEX(module_mutex); 66static DEFINE_MUTEX(module_mutex);
69static LIST_HEAD(modules); 67static LIST_HEAD(modules);
70 68
@@ -760,14 +758,13 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
760void __symbol_put(const char *symbol) 758void __symbol_put(const char *symbol)
761{ 759{
762 struct module *owner; 760 struct module *owner;
763 unsigned long flags;
764 const unsigned long *crc; 761 const unsigned long *crc;
765 762
766 spin_lock_irqsave(&modlist_lock, flags); 763 preempt_disable();
767 if (!__find_symbol(symbol, &owner, &crc, 1)) 764 if (!__find_symbol(symbol, &owner, &crc, 1))
768 BUG(); 765 BUG();
769 module_put(owner); 766 module_put(owner);
770 spin_unlock_irqrestore(&modlist_lock, flags); 767 preempt_enable();
771} 768}
772EXPORT_SYMBOL(__symbol_put); 769EXPORT_SYMBOL(__symbol_put);
773 770
@@ -1228,14 +1225,14 @@ static void free_module(struct module *mod)
1228void *__symbol_get(const char *symbol) 1225void *__symbol_get(const char *symbol)
1229{ 1226{
1230 struct module *owner; 1227 struct module *owner;
1231 unsigned long value, flags; 1228 unsigned long value;
1232 const unsigned long *crc; 1229 const unsigned long *crc;
1233 1230
1234 spin_lock_irqsave(&modlist_lock, flags); 1231 preempt_disable();
1235 value = __find_symbol(symbol, &owner, &crc, 1); 1232 value = __find_symbol(symbol, &owner, &crc, 1);
1236 if (value && !strong_try_module_get(owner)) 1233 if (value && !strong_try_module_get(owner))
1237 value = 0; 1234 value = 0;
1238 spin_unlock_irqrestore(&modlist_lock, flags); 1235 preempt_enable();
1239 1236
1240 return (void *)value; 1237 return (void *)value;
1241} 1238}
@@ -2136,7 +2133,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2136 sym = get_ksymbol(mod, addr, NULL, NULL); 2133 sym = get_ksymbol(mod, addr, NULL, NULL);
2137 if (!sym) 2134 if (!sym)
2138 goto out; 2135 goto out;
2139 strlcpy(symname, sym, KSYM_NAME_LEN + 1); 2136 strlcpy(symname, sym, KSYM_NAME_LEN);
2140 mutex_unlock(&module_mutex); 2137 mutex_unlock(&module_mutex);
2141 return 0; 2138 return 0;
2142 } 2139 }
@@ -2161,9 +2158,9 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2161 if (!sym) 2158 if (!sym)
2162 goto out; 2159 goto out;
2163 if (modname) 2160 if (modname)
2164 strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); 2161 strlcpy(modname, mod->name, MODULE_NAME_LEN);
2165 if (name) 2162 if (name)
2166 strlcpy(name, sym, KSYM_NAME_LEN + 1); 2163 strlcpy(name, sym, KSYM_NAME_LEN);
2167 mutex_unlock(&module_mutex); 2164 mutex_unlock(&module_mutex);
2168 return 0; 2165 return 0;
2169 } 2166 }
@@ -2184,8 +2181,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2184 *value = mod->symtab[symnum].st_value; 2181 *value = mod->symtab[symnum].st_value;
2185 *type = mod->symtab[symnum].st_info; 2182 *type = mod->symtab[symnum].st_info;
2186 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2183 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2187 KSYM_NAME_LEN + 1); 2184 KSYM_NAME_LEN);
2188 strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); 2185 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2189 *exported = is_exported(name, mod); 2186 *exported = is_exported(name, mod);
2190 mutex_unlock(&module_mutex); 2187 mutex_unlock(&module_mutex);
2191 return 0; 2188 return 0;
@@ -2232,26 +2229,13 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2232/* Called by the /proc file system to return a list of modules. */ 2229/* Called by the /proc file system to return a list of modules. */
2233static void *m_start(struct seq_file *m, loff_t *pos) 2230static void *m_start(struct seq_file *m, loff_t *pos)
2234{ 2231{
2235 struct list_head *i;
2236 loff_t n = 0;
2237
2238 mutex_lock(&module_mutex); 2232 mutex_lock(&module_mutex);
2239 list_for_each(i, &modules) { 2233 return seq_list_start(&modules, *pos);
2240 if (n++ == *pos)
2241 break;
2242 }
2243 if (i == &modules)
2244 return NULL;
2245 return i;
2246} 2234}
2247 2235
2248static void *m_next(struct seq_file *m, void *p, loff_t *pos) 2236static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2249{ 2237{
2250 struct list_head *i = p; 2238 return seq_list_next(p, &modules, pos);
2251 (*pos)++;
2252 if (i->next == &modules)
2253 return NULL;
2254 return i->next;
2255} 2239}
2256 2240
2257static void m_stop(struct seq_file *m, void *p) 2241static void m_stop(struct seq_file *m, void *p)
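The conversion above swaps the open-coded list walk for the generic seq_list helpers; any seq_file iterating a lock-protected list_head takes the same shape. A sketch with hypothetical names (my_items, my_items_lock, my_start/my_next/my_stop):

	#include <linux/seq_file.h>
	#include <linux/list.h>
	#include <linux/mutex.h>

	static LIST_HEAD(my_items);
	static DEFINE_MUTEX(my_items_lock);

	static void *my_start(struct seq_file *m, loff_t *pos)
	{
		mutex_lock(&my_items_lock);
		return seq_list_start(&my_items, *pos);	/* NULL once past the end */
	}

	static void *my_next(struct seq_file *m, void *p, loff_t *pos)
	{
		return seq_list_next(p, &my_items, pos);	/* advances *pos itself */
	}

	static void my_stop(struct seq_file *m, void *p)
	{
		mutex_unlock(&my_items_lock);
	}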
@@ -2321,11 +2305,10 @@ const struct seq_operations modules_op = {
2321/* Given an address, look for it in the module exception tables. */ 2305/* Given an address, look for it in the module exception tables. */
2322const struct exception_table_entry *search_module_extables(unsigned long addr) 2306const struct exception_table_entry *search_module_extables(unsigned long addr)
2323{ 2307{
2324 unsigned long flags;
2325 const struct exception_table_entry *e = NULL; 2308 const struct exception_table_entry *e = NULL;
2326 struct module *mod; 2309 struct module *mod;
2327 2310
2328 spin_lock_irqsave(&modlist_lock, flags); 2311 preempt_disable();
2329 list_for_each_entry(mod, &modules, list) { 2312 list_for_each_entry(mod, &modules, list) {
2330 if (mod->num_exentries == 0) 2313 if (mod->num_exentries == 0)
2331 continue; 2314 continue;
@@ -2336,7 +2319,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2336 if (e) 2319 if (e)
2337 break; 2320 break;
2338 } 2321 }
2339 spin_unlock_irqrestore(&modlist_lock, flags); 2322 preempt_enable();
2340 2323
2341 /* Now, if we found one, we are running inside it now, hence 2324 /* Now, if we found one, we are running inside it now, hence
2342 we cannot unload the module, hence no refcnt needed. */ 2325 we cannot unload the module, hence no refcnt needed. */
@@ -2348,25 +2331,24 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2348 */ 2331 */
2349int is_module_address(unsigned long addr) 2332int is_module_address(unsigned long addr)
2350{ 2333{
2351 unsigned long flags;
2352 struct module *mod; 2334 struct module *mod;
2353 2335
2354 spin_lock_irqsave(&modlist_lock, flags); 2336 preempt_disable();
2355 2337
2356 list_for_each_entry(mod, &modules, list) { 2338 list_for_each_entry(mod, &modules, list) {
2357 if (within(addr, mod->module_core, mod->core_size)) { 2339 if (within(addr, mod->module_core, mod->core_size)) {
2358 spin_unlock_irqrestore(&modlist_lock, flags); 2340 preempt_enable();
2359 return 1; 2341 return 1;
2360 } 2342 }
2361 } 2343 }
2362 2344
2363 spin_unlock_irqrestore(&modlist_lock, flags); 2345 preempt_enable();
2364 2346
2365 return 0; 2347 return 0;
2366} 2348}
2367 2349
2368 2350
2369/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ 2351/* Is this a valid kernel address? */
2370struct module *__module_text_address(unsigned long addr) 2352struct module *__module_text_address(unsigned long addr)
2371{ 2353{
2372 struct module *mod; 2354 struct module *mod;
@@ -2381,11 +2363,10 @@ struct module *__module_text_address(unsigned long addr)
2381struct module *module_text_address(unsigned long addr) 2363struct module *module_text_address(unsigned long addr)
2382{ 2364{
2383 struct module *mod; 2365 struct module *mod;
2384 unsigned long flags;
2385 2366
2386 spin_lock_irqsave(&modlist_lock, flags); 2367 preempt_disable();
2387 mod = __module_text_address(addr); 2368 mod = __module_text_address(addr);
2388 spin_unlock_irqrestore(&modlist_lock, flags); 2369 preempt_enable();
2389 2370
2390 return mod; 2371 return mod;
2391} 2372}
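Dropping modlist_lock works because the module list is only modified under stop_machine(), so a reader merely has to stay on its CPU for the duration of the walk. The same read-side pattern, sketched as a hypothetical walker (count_loaded_modules is not a real kernel function):

	#include <linux/module.h>
	#include <linux/preempt.h>

	/* Read-only walk over a stop_machine-protected module list: disabling
	 * preemption keeps this CPU out of the writer's way for the whole loop. */
	static int count_loaded_modules(struct list_head *modules)
	{
		struct module *mod;
		int n = 0;

		preempt_disable();
		list_for_each_entry(mod, modules, list)
			n++;
		preempt_enable();

		return n;
	}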
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 303eab18484b..691b86564dd9 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
139 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
140 waiter.task = task; 140 waiter.task = task;
141 141
142 old_val = atomic_xchg(&lock->count, -1);
143 if (old_val == 1)
144 goto done;
145
146 lock_contended(&lock->dep_map, _RET_IP_);
147
142 for (;;) { 148 for (;;) {
143 /* 149 /*
144 * Lets try to take the lock again - this is needed even if 150 * Lets try to take the lock again - this is needed even if
@@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
174 spin_lock_mutex(&lock->wait_lock, flags); 180 spin_lock_mutex(&lock->wait_lock, flags);
175 } 181 }
176 182
183done:
184 lock_acquired(&lock->dep_map);
177 /* got the lock - rejoice! */ 185 /* got the lock - rejoice! */
178 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 186 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
179 debug_mutex_set_owner(lock, task_thread_info(task)); 187 debug_mutex_set_owner(lock, task_thread_info(task));
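The mutex hunk above also shows the intended ordering of the new hooks: lock_contended() fires only after the fast path has failed and the task is about to wait, lock_acquired() fires once the lock is finally owned, contended or not. A condensed sketch of that shape for a hypothetical sleeping lock (try_fast() and wait_until_granted() are placeholders, not real kernel calls):

	#include <linux/kernel.h>
	#include <linux/types.h>
	#include <linux/lockdep.h>

	struct my_sleeping_lock {
		struct lockdep_map dep_map;
	};

	static bool try_fast(struct my_sleeping_lock *lock)		{ return false; }
	static void wait_until_granted(struct my_sleeping_lock *lock)	{ }

	static void my_lock_slowpath(struct my_sleeping_lock *lock)
	{
		if (try_fast(lock))
			goto done;			/* uncontended: no wait time recorded */

		lock_contended(&lock->dep_map, _RET_IP_);	/* stamps waittime_stamp */
		wait_until_granted(lock);
	done:
		lock_acquired(&lock->dep_map);		/* wait time -> hold time */
	}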
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9e83b589f754..a4fb7d46971f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,8 @@
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23 23
24static struct kmem_cache *nsproxy_cachep;
25
24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 26struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
25 27
26static inline void get_nsproxy(struct nsproxy *ns) 28static inline void get_nsproxy(struct nsproxy *ns)
@@ -43,9 +45,11 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
43{ 45{
44 struct nsproxy *ns; 46 struct nsproxy *ns;
45 47
46 ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); 48 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
47 if (ns) 49 if (ns) {
50 memcpy(ns, orig, sizeof(struct nsproxy));
48 atomic_set(&ns->count, 1); 51 atomic_set(&ns->count, 1);
52 }
49 return ns; 53 return ns;
50} 54}
51 55
@@ -54,33 +58,51 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
54 * Return the newly created nsproxy. Do not attach this to the task, 58 * Return the newly created nsproxy. Do not attach this to the task,
55 * leave it to the caller to do proper locking and attach it to task. 59 * leave it to the caller to do proper locking and attach it to task.
56 */ 60 */
57static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, 61static struct nsproxy *create_new_namespaces(unsigned long flags,
58 struct fs_struct *new_fs) 62 struct task_struct *tsk, struct fs_struct *new_fs)
59{ 63{
60 struct nsproxy *new_nsp; 64 struct nsproxy *new_nsp;
65 int err;
61 66
62 new_nsp = clone_nsproxy(tsk->nsproxy); 67 new_nsp = clone_nsproxy(tsk->nsproxy);
63 if (!new_nsp) 68 if (!new_nsp)
64 return ERR_PTR(-ENOMEM); 69 return ERR_PTR(-ENOMEM);
65 70
66 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 71 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
67 if (IS_ERR(new_nsp->mnt_ns)) 72 if (IS_ERR(new_nsp->mnt_ns)) {
73 err = PTR_ERR(new_nsp->mnt_ns);
68 goto out_ns; 74 goto out_ns;
75 }
69 76
70 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 77 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
71 if (IS_ERR(new_nsp->uts_ns)) 78 if (IS_ERR(new_nsp->uts_ns)) {
79 err = PTR_ERR(new_nsp->uts_ns);
72 goto out_uts; 80 goto out_uts;
81 }
73 82
74 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 83 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
75 if (IS_ERR(new_nsp->ipc_ns)) 84 if (IS_ERR(new_nsp->ipc_ns)) {
85 err = PTR_ERR(new_nsp->ipc_ns);
76 goto out_ipc; 86 goto out_ipc;
87 }
77 88
78 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); 89 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
79 if (IS_ERR(new_nsp->pid_ns)) 90 if (IS_ERR(new_nsp->pid_ns)) {
91 err = PTR_ERR(new_nsp->pid_ns);
80 goto out_pid; 92 goto out_pid;
93 }
94
95 new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
96 if (IS_ERR(new_nsp->user_ns)) {
97 err = PTR_ERR(new_nsp->user_ns);
98 goto out_user;
99 }
81 100
82 return new_nsp; 101 return new_nsp;
83 102
103out_user:
104 if (new_nsp->pid_ns)
105 put_pid_ns(new_nsp->pid_ns);
84out_pid: 106out_pid:
85 if (new_nsp->ipc_ns) 107 if (new_nsp->ipc_ns)
86 put_ipc_ns(new_nsp->ipc_ns); 108 put_ipc_ns(new_nsp->ipc_ns);
@@ -91,15 +113,15 @@ out_uts:
91 if (new_nsp->mnt_ns) 113 if (new_nsp->mnt_ns)
92 put_mnt_ns(new_nsp->mnt_ns); 114 put_mnt_ns(new_nsp->mnt_ns);
93out_ns: 115out_ns:
94 kfree(new_nsp); 116 kmem_cache_free(nsproxy_cachep, new_nsp);
95 return ERR_PTR(-ENOMEM); 117 return ERR_PTR(err);
96} 118}
97 119
98/* 120/*
99 * called from clone. This now handles copy for nsproxy and all 121 * called from clone. This now handles copy for nsproxy and all
100 * namespaces therein. 122 * namespaces therein.
101 */ 123 */
102int copy_namespaces(int flags, struct task_struct *tsk) 124int copy_namespaces(unsigned long flags, struct task_struct *tsk)
103{ 125{
104 struct nsproxy *old_ns = tsk->nsproxy; 126 struct nsproxy *old_ns = tsk->nsproxy;
105 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
@@ -110,7 +132,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
110 132
111 get_nsproxy(old_ns); 133 get_nsproxy(old_ns);
112 134
113 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
114 return 0; 136 return 0;
115 137
116 if (!capable(CAP_SYS_ADMIN)) { 138 if (!capable(CAP_SYS_ADMIN)) {
@@ -140,7 +162,9 @@ void free_nsproxy(struct nsproxy *ns)
140 put_ipc_ns(ns->ipc_ns); 162 put_ipc_ns(ns->ipc_ns);
141 if (ns->pid_ns) 163 if (ns->pid_ns)
142 put_pid_ns(ns->pid_ns); 164 put_pid_ns(ns->pid_ns);
143 kfree(ns); 165 if (ns->user_ns)
166 put_user_ns(ns->user_ns);
167 kmem_cache_free(nsproxy_cachep, ns);
144} 168}
145 169
146/* 170/*
@@ -152,19 +176,10 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
152{ 176{
153 int err = 0; 177 int err = 0;
154 178
155 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 179 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
180 CLONE_NEWUSER)))
156 return 0; 181 return 0;
157 182
158#ifndef CONFIG_IPC_NS
159 if (unshare_flags & CLONE_NEWIPC)
160 return -EINVAL;
161#endif
162
163#ifndef CONFIG_UTS_NS
164 if (unshare_flags & CLONE_NEWUTS)
165 return -EINVAL;
166#endif
167
168 if (!capable(CAP_SYS_ADMIN)) 183 if (!capable(CAP_SYS_ADMIN))
169 return -EPERM; 184 return -EPERM;
170 185
@@ -174,3 +189,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
174 err = PTR_ERR(*new_nsp); 189 err = PTR_ERR(*new_nsp);
175 return err; 190 return err;
176} 191}
192
193static int __init nsproxy_cache_init(void)
194{
195 nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
196 0, SLAB_PANIC, NULL);
197 return 0;
198}
199
200module_init(nsproxy_cache_init);
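The nsproxy allocation now comes from a dedicated slab cache: create the cache once at init, then alloc/free per object. The same pattern, sketched with a hypothetical structure and using the five-argument kmem_cache_create() form this tree uses (no destructor):

	#include <linux/slab.h>
	#include <linux/init.h>

	struct my_obj {			/* hypothetical cached object */
		int refcount;
	};

	static struct kmem_cache *my_cachep;

	static int __init my_cache_init(void)
	{
		/* SLAB_PANIC: no error path needed, boot fails loudly instead */
		my_cachep = kmem_cache_create("my_obj", sizeof(struct my_obj),
					      0, SLAB_PANIC, NULL);
		return 0;
	}
	module_init(my_cache_init);

	static struct my_obj *my_obj_alloc(void)
	{
		return kmem_cache_alloc(my_cachep, GFP_KERNEL);
	}

	static void my_obj_free(struct my_obj *obj)
	{
		kmem_cache_free(my_cachep, obj);
	}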
diff --git a/kernel/panic.c b/kernel/panic.c
index 623d1828259a..f64f4c1ac11f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -159,14 +159,15 @@ const char *print_tainted(void)
159{ 159{
160 static char buf[20]; 160 static char buf[20];
161 if (tainted) { 161 if (tainted) {
162 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", 162 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c",
163 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 163 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
168 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 168 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
169 tainted & TAINT_USER ? 'U' : ' '); 169 tainted & TAINT_USER ? 'U' : ' ',
170 tainted & TAINT_DIE ? 'D' : ' ');
170 } 171 }
171 else 172 else
172 snprintf(buf, sizeof(buf), "Not tainted"); 173 snprintf(buf, sizeof(buf), "Not tainted");
diff --git a/kernel/pid.c b/kernel/pid.c
index eb66bd2953ab..c6e3f9ffff87 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -365,7 +365,7 @@ struct pid *find_ge_pid(int nr)
365} 365}
366EXPORT_SYMBOL_GPL(find_get_pid); 366EXPORT_SYMBOL_GPL(find_get_pid);
367 367
368struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) 368struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
369{ 369{
370 BUG_ON(!old_ns); 370 BUG_ON(!old_ns);
371 get_pid_ns(old_ns); 371 get_pid_ns(old_ns);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 329ce0172074..55b3761edaa9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -241,7 +241,7 @@ static __init int init_posix_timers(void)
241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
242 242
243 posix_timers_cache = kmem_cache_create("posix_timers_cache", 243 posix_timers_cache = kmem_cache_create("posix_timers_cache",
244 sizeof (struct k_itimer), 0, 0, NULL, NULL); 244 sizeof (struct k_itimer), 0, 0, NULL);
245 idr_init(&posix_timers_id); 245 idr_init(&posix_timers_id);
246 return 0; 246 return 0;
247} 247}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 495b7d4dd330..c1a106d87d90 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -33,13 +33,20 @@ config PM_DEBUG
33 bool "Power Management Debug Support" 33 bool "Power Management Debug Support"
34 depends on PM 34 depends on PM
35 ---help--- 35 ---help---
36 This option enables verbose debugging support in the Power Management 36 This option enables various debugging support in the Power Management
37 code. This is helpful when debugging and reporting various PM bugs, 37 code. This is helpful when debugging and reporting PM bugs, like
38 like suspend support. 38 suspend support.
39
40config PM_VERBOSE
41 bool "Verbose Power Management debugging"
42 depends on PM_DEBUG
43 default n
44 ---help---
45 This option enables verbose messages from the Power Management code.
39 46
40config DISABLE_CONSOLE_SUSPEND 47config DISABLE_CONSOLE_SUSPEND
41 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" 48 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
42 depends on PM && PM_DEBUG 49 depends on PM_DEBUG
43 default n 50 default n
44 ---help--- 51 ---help---
45 This option turns off the console suspend mechanism that prevents 52 This option turns off the console suspend mechanism that prevents
@@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND
50 57
51config PM_TRACE 58config PM_TRACE
52 bool "Suspend/resume event tracing" 59 bool "Suspend/resume event tracing"
53 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL 60 depends on PM_DEBUG && X86 && EXPERIMENTAL
54 default n 61 default n
55 ---help--- 62 ---help---
56 This enables some cheesy code to save the last PM event point in the 63 This enables some cheesy code to save the last PM event point in the
@@ -65,18 +72,6 @@ config PM_TRACE
65 CAUTION: this option will cause your machine's real-time clock to be 72 CAUTION: this option will cause your machine's real-time clock to be
66 set to an invalid time after a resume. 73 set to an invalid time after a resume.
67 74
68config PM_SYSFS_DEPRECATED
69 bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
70 depends on PM && SYSFS
71 default n
72 help
73 The driver model started out with a sysfs file intended to provide
74 a userspace hook for device power management. This feature has never
75 worked very well, except for limited testing purposes, and so it will
76 be removed. It's not clear that a generic mechanism could really
77 handle the wide variability of device power states; any replacements
78 are likely to be bus or driver specific.
79
80config SOFTWARE_SUSPEND 75config SOFTWARE_SUSPEND
81 bool "Software Suspend (Hibernation)" 76 bool "Software Suspend (Hibernation)"
82 depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) 77 depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f445b9cd60fb..324ac0188ce1 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -45,7 +45,7 @@ enum {
45 45
46static int hibernation_mode = HIBERNATION_SHUTDOWN; 46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47 47
48struct hibernation_ops *hibernation_ops; 48static struct hibernation_ops *hibernation_ops;
49 49
50/** 50/**
51 * hibernation_set_ops - set the global hibernate operations 51 * hibernation_set_ops - set the global hibernate operations
@@ -54,7 +54,8 @@ struct hibernation_ops *hibernation_ops;
54 54
55void hibernation_set_ops(struct hibernation_ops *ops) 55void hibernation_set_ops(struct hibernation_ops *ops)
56{ 56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish)) { 57 if (ops && !(ops->prepare && ops->enter && ops->finish
58 && ops->pre_restore && ops->restore_cleanup)) {
58 WARN_ON(1); 59 WARN_ON(1);
59 return; 60 return;
60 } 61 }
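With this check a platform driver must now provide all five callbacks or hibernation_set_ops() refuses the whole structure. A hedged sketch of a registration, assuming the declarations from <linux/suspend.h>; the callback bodies are placeholders, not a real platform driver:

	#include <linux/suspend.h>
	#include <linux/init.h>

	static int  my_hib_prepare(void)		{ return 0; }
	static int  my_hib_enter(void)			{ return 0; }
	static void my_hib_finish(void)			{ }
	static int  my_hib_pre_restore(void)		{ return 0; }
	static void my_hib_restore_cleanup(void)	{ }

	static struct hibernation_ops my_hibernation_ops = {
		.prepare		= my_hib_prepare,
		.enter			= my_hib_enter,
		.finish			= my_hib_finish,
		/* both of these are now mandatory, see the check above */
		.pre_restore		= my_hib_pre_restore,
		.restore_cleanup	= my_hib_restore_cleanup,
	};

	static int __init my_platform_init(void)
	{
		hibernation_set_ops(&my_hibernation_ops);
		return 0;
	}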
@@ -74,9 +75,9 @@ void hibernation_set_ops(struct hibernation_ops *ops)
74 * platform driver if so configured and return an error code if it fails 75 * platform driver if so configured and return an error code if it fails
75 */ 76 */
76 77
77static int platform_prepare(void) 78static int platform_prepare(int platform_mode)
78{ 79{
79 return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? 80 return (platform_mode && hibernation_ops) ?
80 hibernation_ops->prepare() : 0; 81 hibernation_ops->prepare() : 0;
81} 82}
82 83
@@ -85,13 +86,145 @@ static int platform_prepare(void)
85 * using the platform driver (must be called after platform_prepare()) 86 * using the platform driver (must be called after platform_prepare())
86 */ 87 */
87 88
88static void platform_finish(void) 89static void platform_finish(int platform_mode)
89{ 90{
90 if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) 91 if (platform_mode && hibernation_ops)
91 hibernation_ops->finish(); 92 hibernation_ops->finish();
92} 93}
93 94
94/** 95/**
96 * platform_pre_restore - prepare the platform for the restoration from a
97 * hibernation image. If the restore fails after this function has been
98 * called, platform_restore_cleanup() must be called.
99 */
100
101static int platform_pre_restore(int platform_mode)
102{
103 return (platform_mode && hibernation_ops) ?
104 hibernation_ops->pre_restore() : 0;
105}
106
107/**
108 * platform_restore_cleanup - switch the platform to the normal mode of
109 * operation after a failing restore. If platform_pre_restore() has been
110 * called before the failing restore, this function must be called too,
111 * regardless of the result of platform_pre_restore().
112 */
113
114static void platform_restore_cleanup(int platform_mode)
115{
116 if (platform_mode && hibernation_ops)
117 hibernation_ops->restore_cleanup();
118}
119
120/**
121 * hibernation_snapshot - quiesce devices and create the hibernation
122 * snapshot image.
123 * @platform_mode - if set, use the platform driver, if available, to
124 * prepare the platform firmware for the power transition.
125 *
126 * Must be called with pm_mutex held
127 */
128
129int hibernation_snapshot(int platform_mode)
130{
131 int error;
132
133 /* Free memory before shutting down devices. */
134 error = swsusp_shrink_memory();
135 if (error)
136 return error;
137
138 suspend_console();
139 error = device_suspend(PMSG_FREEZE);
140 if (error)
141 goto Resume_console;
142
143 error = platform_prepare(platform_mode);
144 if (error)
145 goto Resume_devices;
146
147 error = disable_nonboot_cpus();
148 if (!error) {
149 if (hibernation_mode != HIBERNATION_TEST) {
150 in_suspend = 1;
151 error = swsusp_suspend();
152 /* Control returns here after successful restore */
153 } else {
154 printk("swsusp debug: Waiting for 5 seconds.\n");
155 mdelay(5000);
156 }
157 }
158 enable_nonboot_cpus();
159 Resume_devices:
160 platform_finish(platform_mode);
161 device_resume();
162 Resume_console:
163 resume_console();
164 return error;
165}
166
167/**
168 * hibernation_restore - quiesce devices and restore the hibernation
169 * snapshot image. If successful, control returns in hibernation_snapshot()
170 * @platform_mode - if set, use the platform driver, if available, to
171 * prepare the platform firmware for the transition.
172 *
173 * Must be called with pm_mutex held
174 */
175
176int hibernation_restore(int platform_mode)
177{
178 int error;
179
180 pm_prepare_console();
181 suspend_console();
182 error = device_suspend(PMSG_PRETHAW);
183 if (error)
184 goto Finish;
185
186 error = platform_pre_restore(platform_mode);
187 if (!error) {
188 error = disable_nonboot_cpus();
189 if (!error)
190 error = swsusp_resume();
191 enable_nonboot_cpus();
192 }
193 platform_restore_cleanup(platform_mode);
194 device_resume();
195 Finish:
196 resume_console();
197 pm_restore_console();
198 return error;
199}
200
201/**
202 * hibernation_platform_enter - enter the hibernation state using the
203 * platform driver (if available)
204 */
205
206int hibernation_platform_enter(void)
207{
208 int error;
209
210 if (hibernation_ops) {
211 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
212 /*
213 * We have cancelled the power transition by running
214 * hibernation_ops->finish() before saving the image, so we
215 * should let the firmware know that we're going to enter the
216 * sleep state after all
217 */
218 error = hibernation_ops->prepare();
219 if (!error)
220 error = hibernation_ops->enter();
221 } else {
222 error = -ENOSYS;
223 }
224 return error;
225}
226
227/**
95 * power_down - Shut the machine down for hibernation. 228 * power_down - Shut the machine down for hibernation.
96 * 229 *
97 * Use the platform driver, if configured so; otherwise try 230 * Use the platform driver, if configured so; otherwise try
@@ -111,11 +244,7 @@ static void power_down(void)
111 kernel_restart(NULL); 244 kernel_restart(NULL);
112 break; 245 break;
113 case HIBERNATION_PLATFORM: 246 case HIBERNATION_PLATFORM:
114 if (hibernation_ops) { 247 hibernation_platform_enter();
115 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
116 hibernation_ops->enter();
117 break;
118 }
119 } 248 }
120 kernel_halt(); 249 kernel_halt();
121 /* 250 /*
@@ -152,9 +281,16 @@ int hibernate(void)
152{ 281{
153 int error; 282 int error;
154 283
284 mutex_lock(&pm_mutex);
155 /* The snapshot device should not be opened while we're running */ 285 /* The snapshot device should not be opened while we're running */
156 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) 286 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
157 return -EBUSY; 287 error = -EBUSY;
288 goto Unlock;
289 }
290
291 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
292 if (error)
293 goto Exit;
158 294
159 /* Allocate memory management structures */ 295 /* Allocate memory management structures */
160 error = create_basic_memory_bitmaps(); 296 error = create_basic_memory_bitmaps();
@@ -165,75 +301,35 @@ int hibernate(void)
165 if (error) 301 if (error)
166 goto Finish; 302 goto Finish;
167 303
168 mutex_lock(&pm_mutex);
169 if (hibernation_mode == HIBERNATION_TESTPROC) { 304 if (hibernation_mode == HIBERNATION_TESTPROC) {
170 printk("swsusp debug: Waiting for 5 seconds.\n"); 305 printk("swsusp debug: Waiting for 5 seconds.\n");
171 mdelay(5000); 306 mdelay(5000);
172 goto Thaw; 307 goto Thaw;
173 } 308 }
309 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
310 if (in_suspend && !error) {
311 unsigned int flags = 0;
174 312
175 /* Free memory before shutting down devices. */ 313 if (hibernation_mode == HIBERNATION_PLATFORM)
176 error = swsusp_shrink_memory(); 314 flags |= SF_PLATFORM_MODE;
177 if (error)
178 goto Thaw;
179
180 error = platform_prepare();
181 if (error)
182 goto Thaw;
183
184 suspend_console();
185 error = device_suspend(PMSG_FREEZE);
186 if (error) {
187 printk(KERN_ERR "PM: Some devices failed to suspend\n");
188 goto Resume_devices;
189 }
190 error = disable_nonboot_cpus();
191 if (error)
192 goto Enable_cpus;
193
194 if (hibernation_mode == HIBERNATION_TEST) {
195 printk("swsusp debug: Waiting for 5 seconds.\n");
196 mdelay(5000);
197 goto Enable_cpus;
198 }
199
200 pr_debug("PM: snapshotting memory.\n");
201 in_suspend = 1;
202 error = swsusp_suspend();
203 if (error)
204 goto Enable_cpus;
205
206 if (in_suspend) {
207 enable_nonboot_cpus();
208 platform_finish();
209 device_resume();
210 resume_console();
211 pr_debug("PM: writing image.\n"); 315 pr_debug("PM: writing image.\n");
212 error = swsusp_write(); 316 error = swsusp_write(flags);
317 swsusp_free();
213 if (!error) 318 if (!error)
214 power_down(); 319 power_down();
215 else {
216 swsusp_free();
217 goto Thaw;
218 }
219 } else { 320 } else {
220 pr_debug("PM: Image restored successfully.\n"); 321 pr_debug("PM: Image restored successfully.\n");
322 swsusp_free();
221 } 323 }
222
223 swsusp_free();
224 Enable_cpus:
225 enable_nonboot_cpus();
226 Resume_devices:
227 platform_finish();
228 device_resume();
229 resume_console();
230 Thaw: 324 Thaw:
231 mutex_unlock(&pm_mutex);
232 unprepare_processes(); 325 unprepare_processes();
233 Finish: 326 Finish:
234 free_basic_memory_bitmaps(); 327 free_basic_memory_bitmaps();
235 Exit: 328 Exit:
329 pm_notifier_call_chain(PM_POST_HIBERNATION);
236 atomic_inc(&snapshot_device_available); 330 atomic_inc(&snapshot_device_available);
331 Unlock:
332 mutex_unlock(&pm_mutex);
237 return error; 333 return error;
238} 334}
239 335
@@ -253,6 +349,7 @@ int hibernate(void)
253static int software_resume(void) 349static int software_resume(void)
254{ 350{
255 int error; 351 int error;
352 unsigned int flags;
256 353
257 mutex_lock(&pm_mutex); 354 mutex_lock(&pm_mutex);
258 if (!swsusp_resume_device) { 355 if (!swsusp_resume_device) {
@@ -300,30 +397,12 @@ static int software_resume(void)
300 397
301 pr_debug("PM: Reading swsusp image.\n"); 398 pr_debug("PM: Reading swsusp image.\n");
302 399
303 error = swsusp_read(); 400 error = swsusp_read(&flags);
304 if (error) {
305 swsusp_free();
306 goto Thaw;
307 }
308
309 pr_debug("PM: Preparing devices for restore.\n");
310
311 suspend_console();
312 error = device_suspend(PMSG_PRETHAW);
313 if (error)
314 goto Free;
315
316 error = disable_nonboot_cpus();
317 if (!error) 401 if (!error)
318 swsusp_resume(); 402 hibernation_restore(flags & SF_PLATFORM_MODE);
319 403
320 enable_nonboot_cpus();
321 Free:
322 swsusp_free();
323 device_resume();
324 resume_console();
325 Thaw:
326 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 404 printk(KERN_ERR "PM: Restore failed, recovering.\n");
405 swsusp_free();
327 unprepare_processes(); 406 unprepare_processes();
328 Done: 407 Done:
329 free_basic_memory_bitmaps(); 408 free_basic_memory_bitmaps();
@@ -333,7 +412,7 @@ static int software_resume(void)
333 Unlock: 412 Unlock:
334 mutex_unlock(&pm_mutex); 413 mutex_unlock(&pm_mutex);
335 pr_debug("PM: Resume from disk failed.\n"); 414 pr_debug("PM: Resume from disk failed.\n");
336 return 0; 415 return error;
337} 416}
338 417
339late_initcall(software_resume); 418late_initcall(software_resume);
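
For orientation, the rewritten hibernate()/software_resume() pair above boils down to the following condensed sketch. This is editorial, not part of the patch: locking, the PM notifier calls and most error handling are elided, and only identifiers that appear in the hunks above are used.

static int hibernate_core_flow(void)		/* illustrative sketch only */
{
	int error;

	/* Snapshot, with the platform firmware prepared if requested. */
	error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
	if (!error && in_suspend) {
		unsigned int flags = 0;

		if (hibernation_mode == HIBERNATION_PLATFORM)
			flags |= SF_PLATFORM_MODE;	/* tell the "boot" kernel */
		error = swsusp_write(flags);		/* image header carries flags */
		swsusp_free();
		if (!error)
			power_down();	/* may use hibernation_platform_enter() */
	} else {
		/* Control came back here after a successful restore. */
		swsusp_free();
	}
	return error;
}

On the resume side, software_resume() reads the same flags back via swsusp_read(&flags) and forwards (flags & SF_PLATFORM_MODE) to hibernation_restore(), as shown in the hunk above.
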
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fc45ed22620f..32147b57c3bf 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -23,6 +23,8 @@
23 23
24#include "power.h" 24#include "power.h"
25 25
26BLOCKING_NOTIFIER_HEAD(pm_chain_head);
27
26/*This is just an arbitrary number */ 28/*This is just an arbitrary number */
27#define FREE_PAGE_NUMBER (100) 29#define FREE_PAGE_NUMBER (100)
28 30
@@ -63,14 +65,11 @@ static inline void pm_finish(suspend_state_t state)
63 65
64/** 66/**
65 * suspend_prepare - Do prep work before entering low-power state. 67 * suspend_prepare - Do prep work before entering low-power state.
66 * @state: State we're entering.
67 * 68 *
68 * This is common code that is called for each state that we're 69 * This is common code that is called for each state that we're entering.
69 * entering. Allocate a console, stop all processes, then make sure 70 * Run suspend notifiers, allocate a console and stop all processes.
70 * the platform can enter the requested state.
71 */ 71 */
72 72static int suspend_prepare(void)
73static int suspend_prepare(suspend_state_t state)
74{ 73{
75 int error; 74 int error;
76 unsigned int free_pages; 75 unsigned int free_pages;
@@ -78,6 +77,10 @@ static int suspend_prepare(suspend_state_t state)
78 if (!pm_ops || !pm_ops->enter) 77 if (!pm_ops || !pm_ops->enter)
79 return -EPERM; 78 return -EPERM;
80 79
80 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
81 if (error)
82 goto Finish;
83
81 pm_prepare_console(); 84 pm_prepare_console();
82 85
83 if (freeze_processes()) { 86 if (freeze_processes()) {
@@ -85,46 +88,23 @@ static int suspend_prepare(suspend_state_t state)
85 goto Thaw; 88 goto Thaw;
86 } 89 }
87 90
88 if ((free_pages = global_page_state(NR_FREE_PAGES)) 91 free_pages = global_page_state(NR_FREE_PAGES);
89 < FREE_PAGE_NUMBER) { 92 if (free_pages < FREE_PAGE_NUMBER) {
90 pr_debug("PM: free some memory\n"); 93 pr_debug("PM: free some memory\n");
91 shrink_all_memory(FREE_PAGE_NUMBER - free_pages); 94 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
92 if (nr_free_pages() < FREE_PAGE_NUMBER) { 95 if (nr_free_pages() < FREE_PAGE_NUMBER) {
93 error = -ENOMEM; 96 error = -ENOMEM;
94 printk(KERN_ERR "PM: No enough memory\n"); 97 printk(KERN_ERR "PM: No enough memory\n");
95 goto Thaw;
96 } 98 }
97 } 99 }
98
99 if (pm_ops->set_target) {
100 error = pm_ops->set_target(state);
101 if (error)
102 goto Thaw;
103 }
104 suspend_console();
105 error = device_suspend(PMSG_SUSPEND);
106 if (error) {
107 printk(KERN_ERR "Some devices failed to suspend\n");
108 goto Resume_console;
109 }
110 if (pm_ops->prepare) {
111 if ((error = pm_ops->prepare(state)))
112 goto Resume_devices;
113 }
114
115 error = disable_nonboot_cpus();
116 if (!error) 100 if (!error)
117 return 0; 101 return 0;
118 102
119 enable_nonboot_cpus();
120 pm_finish(state);
121 Resume_devices:
122 device_resume();
123 Resume_console:
124 resume_console();
125 Thaw: 103 Thaw:
126 thaw_processes(); 104 thaw_processes();
127 pm_restore_console(); 105 pm_restore_console();
106 Finish:
107 pm_notifier_call_chain(PM_POST_SUSPEND);
128 return error; 108 return error;
129} 109}
130 110
@@ -140,6 +120,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
140 local_irq_enable(); 120 local_irq_enable();
141} 121}
142 122
123/**
124 * suspend_enter - enter the desired system sleep state.
125 * @state: state to enter
126 *
127 * This function should be called after devices have been suspended.
128 */
143int suspend_enter(suspend_state_t state) 129int suspend_enter(suspend_state_t state)
144{ 130{
145 int error = 0; 131 int error = 0;
@@ -159,23 +145,58 @@ int suspend_enter(suspend_state_t state)
159 return error; 145 return error;
160} 146}
161 147
148/**
149 * suspend_devices_and_enter - suspend devices and enter the desired system sleep
150 * state.
151 * @state: state to enter
152 */
153int suspend_devices_and_enter(suspend_state_t state)
154{
155 int error;
156
157 if (!pm_ops)
158 return -ENOSYS;
159
160 if (pm_ops->set_target) {
161 error = pm_ops->set_target(state);
162 if (error)
163 return error;
164 }
165 suspend_console();
166 error = device_suspend(PMSG_SUSPEND);
167 if (error) {
168 printk(KERN_ERR "Some devices failed to suspend\n");
169 goto Resume_console;
170 }
171 if (pm_ops->prepare) {
172 error = pm_ops->prepare(state);
173 if (error)
174 goto Resume_devices;
175 }
176 error = disable_nonboot_cpus();
177 if (!error)
178 suspend_enter(state);
179
180 enable_nonboot_cpus();
181 pm_finish(state);
182 Resume_devices:
183 device_resume();
184 Resume_console:
185 resume_console();
186 return error;
187}
162 188
163/** 189/**
164 * suspend_finish - Do final work before exiting suspend sequence. 190 * suspend_finish - Do final work before exiting suspend sequence.
165 * @state: State we're coming out of.
166 * 191 *
167 * Call platform code to clean up, restart processes, and free the 192 * Call platform code to clean up, restart processes, and free the
168 * console that we've allocated. This is not called for suspend-to-disk. 193 * console that we've allocated. This is not called for suspend-to-disk.
169 */ 194 */
170 195static void suspend_finish(void)
171static void suspend_finish(suspend_state_t state)
172{ 196{
173 enable_nonboot_cpus();
174 pm_finish(state);
175 device_resume();
176 resume_console();
177 thaw_processes(); 197 thaw_processes();
178 pm_restore_console(); 198 pm_restore_console();
199 pm_notifier_call_chain(PM_POST_SUSPEND);
179} 200}
180 201
181 202
@@ -207,7 +228,6 @@ static inline int valid_state(suspend_state_t state)
207 * Then, do the setup for suspend, enter the state, and cleanup (after 228 * we've woken up).
208 * we've woken up). 229 * we've woken up).
209 */ 230 */
210
211static int enter_state(suspend_state_t state) 231static int enter_state(suspend_state_t state)
212{ 232{
213 int error; 233 int error;
@@ -218,14 +238,14 @@ static int enter_state(suspend_state_t state)
218 return -EBUSY; 238 return -EBUSY;
219 239
220 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 240 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
221 if ((error = suspend_prepare(state))) 241 if ((error = suspend_prepare()))
222 goto Unlock; 242 goto Unlock;
223 243
224 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 244 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
225 error = suspend_enter(state); 245 error = suspend_devices_and_enter(state);
226 246
227 pr_debug("PM: Finishing wakeup.\n"); 247 pr_debug("PM: Finishing wakeup.\n");
228 suspend_finish(state); 248 suspend_finish();
229 Unlock: 249 Unlock:
230 mutex_unlock(&pm_mutex); 250 mutex_unlock(&pm_mutex);
231 return error; 251 return error;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 51381487103f..5f24c786f8ec 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,7 +25,10 @@ struct swsusp_info {
25 */ 25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
27 27
28extern struct hibernation_ops *hibernation_ops; 28/* kernel/power/disk.c */
29extern int hibernation_snapshot(int platform_mode);
30extern int hibernation_restore(int platform_mode);
31extern int hibernation_platform_enter(void);
29#endif 32#endif
30 33
31extern int pfn_is_nosave(unsigned long); 34extern int pfn_is_nosave(unsigned long);
@@ -152,16 +155,34 @@ extern sector_t alloc_swapdev_block(int swap);
152extern void free_all_swap_pages(int swap); 155extern void free_all_swap_pages(int swap);
153extern int swsusp_swap_in_use(void); 156extern int swsusp_swap_in_use(void);
154 157
158/*
 159 * Flags that can be passed from the hibernating kernel to the "boot" kernel in
160 * the image header.
161 */
162#define SF_PLATFORM_MODE 1
163
164/* kernel/power/disk.c */
155extern int swsusp_check(void); 165extern int swsusp_check(void);
156extern int swsusp_shrink_memory(void); 166extern int swsusp_shrink_memory(void);
157extern void swsusp_free(void); 167extern void swsusp_free(void);
158extern int swsusp_suspend(void); 168extern int swsusp_suspend(void);
159extern int swsusp_resume(void); 169extern int swsusp_resume(void);
160extern int swsusp_read(void); 170extern int swsusp_read(unsigned int *flags_p);
161extern int swsusp_write(void); 171extern int swsusp_write(unsigned int flags);
162extern void swsusp_close(void); 172extern void swsusp_close(void);
163extern int suspend_enter(suspend_state_t state);
164 173
165struct timeval; 174struct timeval;
175/* kernel/power/swsusp.c */
166extern void swsusp_show_speed(struct timeval *, struct timeval *, 176extern void swsusp_show_speed(struct timeval *, struct timeval *,
167 unsigned int, char *); 177 unsigned int, char *);
178
179/* kernel/power/main.c */
180extern int suspend_enter(suspend_state_t state);
181extern int suspend_devices_and_enter(suspend_state_t state);
182extern struct blocking_notifier_head pm_chain_head;
183
184static inline int pm_notifier_call_chain(unsigned long val)
185{
186 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
187 == NOTIFY_BAD) ? -EINVAL : 0;
188}
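
A quick sketch of how the new notifier chain is meant to be consumed. This is not part of the patch: the callback and registration below are hypothetical, and since pm_chain_head is only visible inside kernel/power/, a real user would go through a registration helper that this hunk does not show.

#include <linux/notifier.h>

/* Hypothetical callback reacting to the events pm_notifier_call_chain() sends. */
static int example_pm_callback(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	switch (action) {
	case PM_HIBERNATION_PREPARE:
	case PM_SUSPEND_PREPARE:
		/* Quiesce before tasks are frozen; NOTIFY_BAD aborts with -EINVAL. */
		return NOTIFY_OK;
	case PM_POST_HIBERNATION:
	case PM_POST_SUSPEND:
		/* Undo the above after thawing. */
		return NOTIFY_OK;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_pm_nb = {
	.notifier_call = example_pm_callback,
};

static int example_pm_register(void)
{
	/* Direct registration shown only for illustration. */
	return blocking_notifier_chain_register(&pm_chain_head, &example_pm_nb);
}
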
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e0233d8422b9..3434940a3df1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,7 +40,7 @@ static inline void frozen_process(void)
40 current->flags |= PF_FROZEN; 40 current->flags |= PF_FROZEN;
41 wmb(); 41 wmb();
42 } 42 }
43 clear_tsk_thread_flag(current, TIF_FREEZE); 43 clear_freeze_flag(current);
44} 44}
45 45
46/* Refrigerator is place where frozen processes are stored :-). */ 46/* Refrigerator is place where frozen processes are stored :-). */
@@ -72,20 +72,19 @@ void refrigerator(void)
72 schedule(); 72 schedule();
73 } 73 }
74 pr_debug("%s left refrigerator\n", current->comm); 74 pr_debug("%s left refrigerator\n", current->comm);
75 current->state = save; 75 __set_current_state(save);
76} 76}
77 77
78static inline void freeze_process(struct task_struct *p) 78static void freeze_task(struct task_struct *p)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 81
82 if (!freezing(p)) { 82 if (!freezing(p)) {
83 rmb(); 83 rmb();
84 if (!frozen(p)) { 84 if (!frozen(p)) {
85 set_freeze_flag(p);
85 if (p->state == TASK_STOPPED) 86 if (p->state == TASK_STOPPED)
86 force_sig_specific(SIGSTOP, p); 87 force_sig_specific(SIGSTOP, p);
87
88 freeze(p);
89 spin_lock_irqsave(&p->sighand->siglock, flags); 88 spin_lock_irqsave(&p->sighand->siglock, flags);
90 signal_wake_up(p, p->state == TASK_STOPPED); 89 signal_wake_up(p, p->state == TASK_STOPPED);
91 spin_unlock_irqrestore(&p->sighand->siglock, flags); 90 spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -99,19 +98,14 @@ static void cancel_freezing(struct task_struct *p)
99 98
100 if (freezing(p)) { 99 if (freezing(p)) {
101 pr_debug(" clean up: %s\n", p->comm); 100 pr_debug(" clean up: %s\n", p->comm);
102 do_not_freeze(p); 101 clear_freeze_flag(p);
103 spin_lock_irqsave(&p->sighand->siglock, flags); 102 spin_lock_irqsave(&p->sighand->siglock, flags);
104 recalc_sigpending_and_wake(p); 103 recalc_sigpending_and_wake(p);
105 spin_unlock_irqrestore(&p->sighand->siglock, flags); 104 spin_unlock_irqrestore(&p->sighand->siglock, flags);
106 } 105 }
107} 106}
108 107
109static inline int is_user_space(struct task_struct *p) 108static int try_to_freeze_tasks(int freeze_user_space)
110{
111 return p->mm && !(p->flags & PF_BORROWED_MM);
112}
113
114static unsigned int try_to_freeze_tasks(int freeze_user_space)
115{ 109{
116 struct task_struct *g, *p; 110 struct task_struct *g, *p;
117 unsigned long end_time; 111 unsigned long end_time;
@@ -122,26 +116,40 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
122 todo = 0; 116 todo = 0;
123 read_lock(&tasklist_lock); 117 read_lock(&tasklist_lock);
124 do_each_thread(g, p) { 118 do_each_thread(g, p) {
125 if (!freezeable(p)) 119 if (frozen(p) || !freezeable(p))
126 continue; 120 continue;
127 121
128 if (frozen(p)) 122 if (freeze_user_space) {
129 continue; 123 if (p->state == TASK_TRACED &&
130 124 frozen(p->parent)) {
131 if (p->state == TASK_TRACED && frozen(p->parent)) { 125 cancel_freezing(p);
132 cancel_freezing(p); 126 continue;
133 continue; 127 }
128 /*
129 * Kernel threads should not have TIF_FREEZE set
130 * at this point, so we must ensure that either
131 * p->mm is not NULL *and* PF_BORROWED_MM is
 131 * unset, or TIF_FREEZE is left unset.
133 * The task_lock() is necessary to prevent races
134 * with exit_mm() or use_mm()/unuse_mm() from
135 * occuring.
136 */
137 task_lock(p);
138 if (!p->mm || (p->flags & PF_BORROWED_MM)) {
139 task_unlock(p);
140 continue;
141 }
142 freeze_task(p);
143 task_unlock(p);
144 } else {
145 freeze_task(p);
134 } 146 }
135 if (freeze_user_space && !is_user_space(p))
136 continue;
137
138 freeze_process(p);
139 if (!freezer_should_skip(p)) 147 if (!freezer_should_skip(p))
140 todo++; 148 todo++;
141 } while_each_thread(g, p); 149 } while_each_thread(g, p);
142 read_unlock(&tasklist_lock); 150 read_unlock(&tasklist_lock);
143 yield(); /* Yield is okay here */ 151 yield(); /* Yield is okay here */
144 if (todo && time_after(jiffies, end_time)) 152 if (time_after(jiffies, end_time))
145 break; 153 break;
146 } while (todo); 154 } while (todo);
147 155
@@ -152,49 +160,41 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
152 * but it cleans up leftover PF_FREEZE requests. 160 * but it cleans up leftover PF_FREEZE requests.
153 */ 161 */
154 printk("\n"); 162 printk("\n");
155 printk(KERN_ERR "Stopping %s timed out after %d seconds " 163 printk(KERN_ERR "Freezing of %s timed out after %d seconds "
156 "(%d tasks refusing to freeze):\n", 164 "(%d tasks refusing to freeze):\n",
157 freeze_user_space ? "user space processes" : 165 freeze_user_space ? "user space " : "tasks ",
158 "kernel threads",
159 TIMEOUT / HZ, todo); 166 TIMEOUT / HZ, todo);
167 show_state();
160 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
161 do_each_thread(g, p) { 169 do_each_thread(g, p) {
162 if (freeze_user_space && !is_user_space(p))
163 continue;
164
165 task_lock(p); 170 task_lock(p);
166 if (freezeable(p) && !frozen(p) && 171 if (freezing(p) && !freezer_should_skip(p))
167 !freezer_should_skip(p))
168 printk(KERN_ERR " %s\n", p->comm); 172 printk(KERN_ERR " %s\n", p->comm);
169
170 cancel_freezing(p); 173 cancel_freezing(p);
171 task_unlock(p); 174 task_unlock(p);
172 } while_each_thread(g, p); 175 } while_each_thread(g, p);
173 read_unlock(&tasklist_lock); 176 read_unlock(&tasklist_lock);
174 } 177 }
175 178
176 return todo; 179 return todo ? -EBUSY : 0;
177} 180}
178 181
179/** 182/**
180 * freeze_processes - tell processes to enter the refrigerator 183 * freeze_processes - tell processes to enter the refrigerator
181 *
182 * Returns 0 on success, or the number of processes that didn't freeze,
183 * although they were told to.
184 */ 184 */
185int freeze_processes(void) 185int freeze_processes(void)
186{ 186{
187 unsigned int nr_unfrozen; 187 int error;
188 188
189 printk("Stopping tasks ... "); 189 printk("Stopping tasks ... ");
190 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); 190 error = try_to_freeze_tasks(FREEZER_USER_SPACE);
191 if (nr_unfrozen) 191 if (error)
192 return nr_unfrozen; 192 return error;
193 193
194 sys_sync(); 194 sys_sync();
195 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); 195 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
196 if (nr_unfrozen) 196 if (error)
197 return nr_unfrozen; 197 return error;
198 198
199 printk("done.\n"); 199 printk("done.\n");
200 BUG_ON(in_atomic()); 200 BUG_ON(in_atomic());
@@ -210,7 +210,7 @@ static void thaw_tasks(int thaw_user_space)
210 if (!freezeable(p)) 210 if (!freezeable(p))
211 continue; 211 continue;
212 212
213 if (is_user_space(p) == !thaw_user_space) 213 if (!p->mm == thaw_user_space)
214 continue; 214 continue;
215 215
216 thaw_process(p); 216 thaw_process(p);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8b1a1b837145..917aba100575 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -33,8 +33,9 @@ extern char resume_file[];
33#define SWSUSP_SIG "S1SUSPEND" 33#define SWSUSP_SIG "S1SUSPEND"
34 34
35struct swsusp_header { 35struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; 36 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
37 sector_t image; 37 sector_t image;
38 unsigned int flags; /* Flags to pass to the "boot" kernel */
38 char orig_sig[10]; 39 char orig_sig[10];
39 char sig[10]; 40 char sig[10];
40} __attribute__((packed)); 41} __attribute__((packed));
@@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain)
138 * Saving part 139 * Saving part
139 */ 140 */
140 141
141static int mark_swapfiles(sector_t start) 142static int mark_swapfiles(sector_t start, unsigned int flags)
142{ 143{
143 int error; 144 int error;
144 145
@@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start)
148 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 149 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
149 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 150 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
150 swsusp_header->image = start; 151 swsusp_header->image = start;
152 swsusp_header->flags = flags;
151 error = bio_write_page(swsusp_resume_block, 153 error = bio_write_page(swsusp_resume_block,
152 swsusp_header, NULL); 154 swsusp_header, NULL);
153 } else { 155 } else {
@@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages)
369 371
370/** 372/**
371 * swsusp_write - Write entire image and metadata. 373 * swsusp_write - Write entire image and metadata.
374 * @flags: flags to pass to the "boot" kernel in the image header
372 * 375 *
373 * It is important _NOT_ to umount filesystems at this point. We want 376 * It is important _NOT_ to umount filesystems at this point. We want
374 * them synced (in case something goes wrong) but we DO not want to mark 377 * them synced (in case something goes wrong) but we DO not want to mark
@@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages)
376 * correctly, we'll mark system clean, anyway.) 379 * correctly, we'll mark system clean, anyway.)
377 */ 380 */
378 381
379int swsusp_write(void) 382int swsusp_write(unsigned int flags)
380{ 383{
381 struct swap_map_handle handle; 384 struct swap_map_handle handle;
382 struct snapshot_handle snapshot; 385 struct snapshot_handle snapshot;
@@ -415,7 +418,7 @@ int swsusp_write(void)
415 if (!error) { 418 if (!error) {
416 flush_swap_writer(&handle); 419 flush_swap_writer(&handle);
417 printk("S"); 420 printk("S");
418 error = mark_swapfiles(start); 421 error = mark_swapfiles(start, flags);
419 printk("|\n"); 422 printk("|\n");
420 } 423 }
421 } 424 }
@@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle,
540 return error; 543 return error;
541} 544}
542 545
543int swsusp_read(void) 546/**
547 * swsusp_read - read the hibernation image.
548 * @flags_p: flags passed by the "frozen" kernel in the image header should
 549 * be written into this memory location
550 */
551
552int swsusp_read(unsigned int *flags_p)
544{ 553{
545 int error; 554 int error;
546 struct swap_map_handle handle; 555 struct swap_map_handle handle;
547 struct snapshot_handle snapshot; 556 struct snapshot_handle snapshot;
548 struct swsusp_info *header; 557 struct swsusp_info *header;
549 558
559 *flags_p = swsusp_header->flags;
550 if (IS_ERR(resume_bdev)) { 560 if (IS_ERR(resume_bdev)) {
551 pr_debug("swsusp: block device not initialised\n"); 561 pr_debug("swsusp: block device not initialised\n");
552 return PTR_ERR(resume_bdev); 562 return PTR_ERR(resume_bdev);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d65305b515b1..bd0723a7df3f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -128,92 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
128 return res; 128 return res;
129} 129}
130 130
131static inline int platform_prepare(void)
132{
133 int error = 0;
134
135 if (hibernation_ops)
136 error = hibernation_ops->prepare();
137
138 return error;
139}
140
141static inline void platform_finish(void)
142{
143 if (hibernation_ops)
144 hibernation_ops->finish();
145}
146
147static inline int snapshot_suspend(int platform_suspend)
148{
149 int error;
150
151 mutex_lock(&pm_mutex);
152 /* Free memory before shutting down devices. */
153 error = swsusp_shrink_memory();
154 if (error)
155 goto Finish;
156
157 if (platform_suspend) {
158 error = platform_prepare();
159 if (error)
160 goto Finish;
161 }
162 suspend_console();
163 error = device_suspend(PMSG_FREEZE);
164 if (error)
165 goto Resume_devices;
166
167 error = disable_nonboot_cpus();
168 if (!error) {
169 in_suspend = 1;
170 error = swsusp_suspend();
171 }
172 enable_nonboot_cpus();
173 Resume_devices:
174 if (platform_suspend)
175 platform_finish();
176
177 device_resume();
178 resume_console();
179 Finish:
180 mutex_unlock(&pm_mutex);
181 return error;
182}
183
184static inline int snapshot_restore(int platform_suspend)
185{
186 int error;
187
188 mutex_lock(&pm_mutex);
189 pm_prepare_console();
190 if (platform_suspend) {
191 error = platform_prepare();
192 if (error)
193 goto Finish;
194 }
195 suspend_console();
196 error = device_suspend(PMSG_PRETHAW);
197 if (error)
198 goto Resume_devices;
199
200 error = disable_nonboot_cpus();
201 if (!error)
202 error = swsusp_resume();
203
204 enable_nonboot_cpus();
205 Resume_devices:
206 if (platform_suspend)
207 platform_finish();
208
209 device_resume();
210 resume_console();
211 Finish:
212 pm_restore_console();
213 mutex_unlock(&pm_mutex);
214 return error;
215}
216
217static int snapshot_ioctl(struct inode *inode, struct file *filp, 131static int snapshot_ioctl(struct inode *inode, struct file *filp,
218 unsigned int cmd, unsigned long arg) 132 unsigned int cmd, unsigned long arg)
219{ 133{
@@ -237,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
237 if (data->frozen) 151 if (data->frozen)
238 break; 152 break;
239 mutex_lock(&pm_mutex); 153 mutex_lock(&pm_mutex);
240 if (freeze_processes()) { 154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
241 thaw_processes(); 155 if (!error) {
242 error = -EBUSY; 156 error = freeze_processes();
157 if (error)
158 thaw_processes();
243 } 159 }
160 if (error)
161 pm_notifier_call_chain(PM_POST_HIBERNATION);
244 mutex_unlock(&pm_mutex); 162 mutex_unlock(&pm_mutex);
245 if (!error) 163 if (!error)
246 data->frozen = 1; 164 data->frozen = 1;
@@ -251,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
251 break; 169 break;
252 mutex_lock(&pm_mutex); 170 mutex_lock(&pm_mutex);
253 thaw_processes(); 171 thaw_processes();
172 pm_notifier_call_chain(PM_POST_HIBERNATION);
254 mutex_unlock(&pm_mutex); 173 mutex_unlock(&pm_mutex);
255 data->frozen = 0; 174 data->frozen = 0;
256 break; 175 break;
@@ -260,7 +179,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
260 error = -EPERM; 179 error = -EPERM;
261 break; 180 break;
262 } 181 }
263 error = snapshot_suspend(data->platform_suspend); 182 error = hibernation_snapshot(data->platform_suspend);
264 if (!error) 183 if (!error)
265 error = put_user(in_suspend, (unsigned int __user *)arg); 184 error = put_user(in_suspend, (unsigned int __user *)arg);
266 if (!error) 185 if (!error)
@@ -274,7 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
274 error = -EPERM; 193 error = -EPERM;
275 break; 194 break;
276 } 195 }
277 error = snapshot_restore(data->platform_suspend); 196 error = hibernation_restore(data->platform_suspend);
278 break; 197 break;
279 198
280 case SNAPSHOT_FREE: 199 case SNAPSHOT_FREE:
@@ -336,47 +255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
336 break; 255 break;
337 256
338 case SNAPSHOT_S2RAM: 257 case SNAPSHOT_S2RAM:
339 if (!pm_ops) {
340 error = -ENOSYS;
341 break;
342 }
343
344 if (!data->frozen) { 258 if (!data->frozen) {
345 error = -EPERM; 259 error = -EPERM;
346 break; 260 break;
347 } 261 }
348
349 if (!mutex_trylock(&pm_mutex)) { 262 if (!mutex_trylock(&pm_mutex)) {
350 error = -EBUSY; 263 error = -EBUSY;
351 break; 264 break;
352 } 265 }
353 266 /*
354 if (pm_ops->prepare) { 267 * Tasks are frozen and the notifiers have been called with
355 error = pm_ops->prepare(PM_SUSPEND_MEM); 268 * PM_HIBERNATION_PREPARE
356 if (error) 269 */
357 goto OutS3; 270 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
358 }
359
360 /* Put devices to sleep */
361 suspend_console();
362 error = device_suspend(PMSG_SUSPEND);
363 if (error) {
364 printk(KERN_ERR "Failed to suspend some devices.\n");
365 } else {
366 error = disable_nonboot_cpus();
367 if (!error) {
368 /* Enter S3, system is already frozen */
369 suspend_enter(PM_SUSPEND_MEM);
370 enable_nonboot_cpus();
371 }
372 /* Wake up devices */
373 device_resume();
374 }
375 resume_console();
376 if (pm_ops->finish)
377 pm_ops->finish(PM_SUSPEND_MEM);
378
379 OutS3:
380 mutex_unlock(&pm_mutex); 271 mutex_unlock(&pm_mutex);
381 break; 272 break;
382 273
@@ -386,19 +277,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
386 switch (arg) { 277 switch (arg) {
387 278
388 case PMOPS_PREPARE: 279 case PMOPS_PREPARE:
389 if (hibernation_ops) { 280 data->platform_suspend = 1;
390 data->platform_suspend = 1; 281 error = 0;
391 error = 0;
392 } else {
393 error = -ENOSYS;
394 }
395 break; 282 break;
396 283
397 case PMOPS_ENTER: 284 case PMOPS_ENTER:
398 if (data->platform_suspend) { 285 if (data->platform_suspend)
399 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 286 error = hibernation_platform_enter();
400 error = hibernation_ops->enter(); 287
401 }
402 break; 288 break;
403 289
404 case PMOPS_FINISH: 290 case PMOPS_FINISH:
diff --git a/kernel/printk.c b/kernel/printk.c
index 0bbdeac2810c..051d27e36a6c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -449,13 +449,16 @@ static int printk_time = 1;
449#else 449#else
450static int printk_time = 0; 450static int printk_time = 0;
451#endif 451#endif
452module_param(printk_time, int, S_IRUGO | S_IWUSR); 452module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
453 453
454static int __init printk_time_setup(char *str) 454static int __init printk_time_setup(char *str)
455{ 455{
456 if (*str) 456 if (*str)
457 return 0; 457 return 0;
458 printk_time = 1; 458 printk_time = 1;
459 printk(KERN_NOTICE "The 'time' option is deprecated and "
460 "is scheduled for removal in early 2008\n");
461 printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n");
459 return 1; 462 return 1;
460} 463}
461 464
@@ -483,6 +486,9 @@ static int have_callable_console(void)
483 * @fmt: format string 486 * @fmt: format string
484 * 487 *
485 * This is printk(). It can be called from any context. We want it to work. 488 * This is printk(). It can be called from any context. We want it to work.
489 * Be aware of the fact that if oops_in_progress is not set, we might try to
490 * wake klogd up which could deadlock on runqueue lock if printk() is called
491 * from scheduler code.
486 * 492 *
487 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 493 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
488 * call the console drivers. If we fail to get the semaphore we place the output 494 * call the console drivers. If we fail to get the semaphore we place the output
@@ -654,7 +660,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
654 */ 660 */
655static int __init console_setup(char *str) 661static int __init console_setup(char *str)
656{ 662{
657 char name[sizeof(console_cmdline[0].name)]; 663 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
658 char *s, *options; 664 char *s, *options;
659 int idx; 665 int idx;
660 666
@@ -662,27 +668,27 @@ static int __init console_setup(char *str)
662 * Decode str into name, index, options. 668 * Decode str into name, index, options.
663 */ 669 */
664 if (str[0] >= '0' && str[0] <= '9') { 670 if (str[0] >= '0' && str[0] <= '9') {
665 strcpy(name, "ttyS"); 671 strcpy(buf, "ttyS");
666 strncpy(name + 4, str, sizeof(name) - 5); 672 strncpy(buf + 4, str, sizeof(buf) - 5);
667 } else { 673 } else {
668 strncpy(name, str, sizeof(name) - 1); 674 strncpy(buf, str, sizeof(buf) - 1);
669 } 675 }
670 name[sizeof(name) - 1] = 0; 676 buf[sizeof(buf) - 1] = 0;
671 if ((options = strchr(str, ',')) != NULL) 677 if ((options = strchr(str, ',')) != NULL)
672 *(options++) = 0; 678 *(options++) = 0;
673#ifdef __sparc__ 679#ifdef __sparc__
674 if (!strcmp(str, "ttya")) 680 if (!strcmp(str, "ttya"))
675 strcpy(name, "ttyS0"); 681 strcpy(buf, "ttyS0");
676 if (!strcmp(str, "ttyb")) 682 if (!strcmp(str, "ttyb"))
677 strcpy(name, "ttyS1"); 683 strcpy(buf, "ttyS1");
678#endif 684#endif
679 for (s = name; *s; s++) 685 for (s = buf; *s; s++)
680 if ((*s >= '0' && *s <= '9') || *s == ',') 686 if ((*s >= '0' && *s <= '9') || *s == ',')
681 break; 687 break;
682 idx = simple_strtoul(s, NULL, 10); 688 idx = simple_strtoul(s, NULL, 10);
683 *s = 0; 689 *s = 0;
684 690
685 add_preferred_console(name, idx, options); 691 add_preferred_console(buf, idx, options);
686 return 1; 692 return 1;
687} 693}
688__setup("console=", console_setup); 694__setup("console=", console_setup);
@@ -709,7 +715,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
709 * See if this tty is not yet registered, and 715 * See if this tty is not yet registered, and
710 * if we have a slot free. 716 * if we have a slot free.
711 */ 717 */
712 for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 718 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
713 if (strcmp(console_cmdline[i].name, name) == 0 && 719 if (strcmp(console_cmdline[i].name, name) == 0 &&
714 console_cmdline[i].index == idx) { 720 console_cmdline[i].index == idx) {
715 selected_console = i; 721 selected_console = i;
@@ -726,6 +732,25 @@ int __init add_preferred_console(char *name, int idx, char *options)
726 return 0; 732 return 0;
727} 733}
728 734
735int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
736{
737 struct console_cmdline *c;
738 int i;
739
740 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
741 if (strcmp(console_cmdline[i].name, name) == 0 &&
742 console_cmdline[i].index == idx) {
743 c = &console_cmdline[i];
744 memcpy(c->name, name_new, sizeof(c->name));
745 c->name[sizeof(c->name) - 1] = 0;
746 c->options = options;
747 c->index = idx_new;
748 return i;
749 }
750 /* not found */
751 return -1;
752}
753
729#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND 754#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
730/** 755/**
731 * suspend_console - suspend the console subsystem 756 * suspend_console - suspend the console subsystem
@@ -942,6 +967,9 @@ void register_console(struct console *console)
942 if (preferred_console < 0 || bootconsole || !console_drivers) 967 if (preferred_console < 0 || bootconsole || !console_drivers)
943 preferred_console = selected_console; 968 preferred_console = selected_console;
944 969
970 if (console->early_setup)
971 console->early_setup();
972
945 /* 973 /*
946 * See if we want to use this console driver. If we 974 * See if we want to use this console driver. If we
947 * didn't select a console we take the first one 975 * didn't select a console we take the first one
@@ -985,12 +1013,15 @@ void register_console(struct console *console)
985 if (!(console->flags & CON_ENABLED)) 1013 if (!(console->flags & CON_ENABLED))
986 return; 1014 return;
987 1015
988 if (bootconsole) { 1016 if (bootconsole && (console->flags & CON_CONSDEV)) {
989 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1017 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
990 bootconsole->name, bootconsole->index, 1018 bootconsole->name, bootconsole->index,
991 console->name, console->index); 1019 console->name, console->index);
992 unregister_console(bootconsole); 1020 unregister_console(bootconsole);
993 console->flags &= ~CON_PRINTBUFFER; 1021 console->flags &= ~CON_PRINTBUFFER;
1022 } else {
1023 printk(KERN_INFO "console [%s%d] enabled\n",
1024 console->name, console->index);
994 } 1025 }
995 1026
996 /* 1027 /*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ad7949a589dd..82a558b655da 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task)
142 return -EPERM; 142 return -EPERM;
143 smp_rmb(); 143 smp_rmb();
144 if (task->mm) 144 if (task->mm)
145 dumpable = task->mm->dumpable; 145 dumpable = get_dumpable(task->mm);
146 if (!dumpable && !capable(CAP_SYS_PTRACE)) 146 if (!dumpable && !capable(CAP_SYS_PTRACE))
147 return -EPERM; 147 return -EPERM;
148 148
@@ -161,6 +161,7 @@ int ptrace_may_attach(struct task_struct *task)
161int ptrace_attach(struct task_struct *task) 161int ptrace_attach(struct task_struct *task)
162{ 162{
163 int retval; 163 int retval;
164 unsigned long flags;
164 165
165 audit_ptrace(task); 166 audit_ptrace(task);
166 167
@@ -181,9 +182,7 @@ repeat:
181 * cpu's that may have task_lock). 182 * cpu's that may have task_lock).
182 */ 183 */
183 task_lock(task); 184 task_lock(task);
184 local_irq_disable(); 185 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
185 if (!write_trylock(&tasklist_lock)) {
186 local_irq_enable();
187 task_unlock(task); 186 task_unlock(task);
188 do { 187 do {
189 cpu_relax(); 188 cpu_relax();
@@ -211,7 +210,7 @@ repeat:
211 force_sig_specific(SIGSTOP, task); 210 force_sig_specific(SIGSTOP, task);
212 211
213bad: 212bad:
214 write_unlock_irq(&tasklist_lock); 213 write_unlock_irqrestore(&tasklist_lock, flags);
215 task_unlock(task); 214 task_unlock(task);
216out: 215out:
217 return retval; 216 return retval;
@@ -491,3 +490,22 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
491 return ret; 490 return ret;
492} 491}
493#endif /* __ARCH_SYS_PTRACE */ 492#endif /* __ARCH_SYS_PTRACE */
493
494int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
495{
496 unsigned long tmp;
497 int copied;
498
499 copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
500 if (copied != sizeof(tmp))
501 return -EIO;
502 return put_user(tmp, (unsigned long __user *)data);
503}
504
505int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
506{
507 int copied;
508
509 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
510 return (copied == sizeof(data)) ? 0 : -EIO;
511}
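
The two generic helpers added above are meant to replace the PTRACE_PEEKDATA/POKEDATA boilerplate that each architecture currently duplicates. A rough sketch of how an arch_ptrace() might use them; the dispatch below is illustrative and assumed, not taken from this patch.

#include <linux/ptrace.h>
#include <linux/sched.h>

long arch_ptrace(struct task_struct *child, long request, long addr, long data)
{
	switch (request) {
	case PTRACE_PEEKTEXT:
	case PTRACE_PEEKDATA:
		/* Read one word from the tracee and put_user() it at 'data'. */
		return generic_ptrace_peekdata(child, addr, data);
	case PTRACE_POKETEXT:
	case PTRACE_POKEDATA:
		/* Write the word 'data' into the tracee's address space. */
		return generic_ptrace_pokedata(child, addr, data);
	default:
		return ptrace_request(child, request, addr, data);
	}
}
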
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 55ba82a85a66..ddff33247785 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -40,6 +40,7 @@
40#include <linux/moduleparam.h> 40#include <linux/moduleparam.h>
41#include <linux/percpu.h> 41#include <linux/percpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/random.h> 45#include <linux/random.h>
45#include <linux/delay.h> 46#include <linux/delay.h>
@@ -518,7 +519,6 @@ rcu_torture_writer(void *arg)
518 519
519 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 520 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
520 set_user_nice(current, 19); 521 set_user_nice(current, 19);
521 current->flags |= PF_NOFREEZE;
522 522
523 do { 523 do {
524 schedule_timeout_uninterruptible(1); 524 schedule_timeout_uninterruptible(1);
@@ -558,7 +558,6 @@ rcu_torture_fakewriter(void *arg)
558 558
559 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 559 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
560 set_user_nice(current, 19); 560 set_user_nice(current, 19);
561 current->flags |= PF_NOFREEZE;
562 561
563 do { 562 do {
564 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 563 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
@@ -589,7 +588,6 @@ rcu_torture_reader(void *arg)
589 588
590 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 589 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
591 set_user_nice(current, 19); 590 set_user_nice(current, 19);
592 current->flags |= PF_NOFREEZE;
593 591
594 do { 592 do {
595 idx = cur_ops->readlock(); 593 idx = cur_ops->readlock();
diff --git a/kernel/relay.c b/kernel/relay.c
index a615a8f513fc..510fbbd7b500 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -80,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = {
80 * 80 *
81 * Caller should already have grabbed mmap_sem. 81 * Caller should already have grabbed mmap_sem.
82 */ 82 */
83int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) 83static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
84{ 84{
85 unsigned long length = vma->vm_end - vma->vm_start; 85 unsigned long length = vma->vm_end - vma->vm_start;
86 struct file *filp = vma->vm_file; 86 struct file *filp = vma->vm_file;
@@ -145,7 +145,7 @@ depopulate:
145 * 145 *
146 * Returns channel buffer if successful, %NULL otherwise. 146 * Returns channel buffer if successful, %NULL otherwise.
147 */ 147 */
148struct rchan_buf *relay_create_buf(struct rchan *chan) 148static struct rchan_buf *relay_create_buf(struct rchan *chan)
149{ 149{
150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); 150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
151 if (!buf) 151 if (!buf)
@@ -175,7 +175,7 @@ free_buf:
175 * 175 *
176 * Should only be called from kref_put(). 176 * Should only be called from kref_put().
177 */ 177 */
178void relay_destroy_channel(struct kref *kref) 178static void relay_destroy_channel(struct kref *kref)
179{ 179{
180 struct rchan *chan = container_of(kref, struct rchan, kref); 180 struct rchan *chan = container_of(kref, struct rchan, kref);
181 kfree(chan); 181 kfree(chan);
@@ -185,7 +185,7 @@ void relay_destroy_channel(struct kref *kref)
185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer 185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
186 * @buf: the buffer struct 186 * @buf: the buffer struct
187 */ 187 */
188void relay_destroy_buf(struct rchan_buf *buf) 188static void relay_destroy_buf(struct rchan_buf *buf)
189{ 189{
190 struct rchan *chan = buf->chan; 190 struct rchan *chan = buf->chan;
191 unsigned int i; 191 unsigned int i;
@@ -210,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
210 * rchan_buf_struct and the channel buffer. Should only be called from 210 * rchan_buf_struct and the channel buffer. Should only be called from
211 * kref_put(). 211 * kref_put().
212 */ 212 */
213void relay_remove_buf(struct kref *kref) 213static void relay_remove_buf(struct kref *kref)
214{ 214{
215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
216 buf->chan->cb->remove_buf_file(buf->dentry); 216 buf->chan->cb->remove_buf_file(buf->dentry);
@@ -223,11 +223,10 @@ void relay_remove_buf(struct kref *kref)
223 * 223 *
224 * Returns 1 if the buffer is empty, 0 otherwise. 224 * Returns 1 if the buffer is empty, 0 otherwise.
225 */ 225 */
226int relay_buf_empty(struct rchan_buf *buf) 226static int relay_buf_empty(struct rchan_buf *buf)
227{ 227{
228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; 228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
229} 229}
230EXPORT_SYMBOL_GPL(relay_buf_empty);
231 230
232/** 231/**
233 * relay_buf_full - boolean, is the channel buffer full? 232 * relay_buf_full - boolean, is the channel buffer full?
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index da8d6bf46457..5aedbee014df 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -29,12 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32#ifdef CONFIG_DEBUG_RT_MUTEXES
33# include "rtmutex-debug.h"
34#else
35# include "rtmutex.h"
36#endif
37
38# define TRACE_WARN_ON(x) WARN_ON(x) 32# define TRACE_WARN_ON(x) WARN_ON(x)
39# define TRACE_BUG_ON(x) BUG_ON(x) 33# define TRACE_BUG_ON(x) BUG_ON(x)
40 34
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 015fc633c96c..e3055ba69159 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -260,6 +260,7 @@ static int test_func(void *data)
260 int ret; 260 int ret;
261 261
262 current->flags |= PF_MUTEX_TESTER; 262 current->flags |= PF_MUTEX_TESTER;
263 set_freezable();
263 allow_signal(SIGHUP); 264 allow_signal(SIGHUP);
264 265
265 for(;;) { 266 for(;;) {
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 17d28ce20300..8cd9bd2cdb34 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -17,12 +17,6 @@
17 17
18#include "rtmutex_common.h" 18#include "rtmutex_common.h"
19 19
20#ifdef CONFIG_DEBUG_RT_MUTEXES
21# include "rtmutex-debug.h"
22#else
23# include "rtmutex.h"
24#endif
25
26/* 20/*
27 * lock->owner state tracking: 21 * lock->owner state tracking:
28 * 22 *
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791e..2d3b83593ca3 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -103,7 +103,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
103 103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) 104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{ 105{
106 return (struct task_struct *) 106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); 107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108} 108}
109 109
@@ -120,4 +120,11 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123
124#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h"
126#else
127# include "rtmutex.h"
128#endif
129
123#endif 130#endif
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9a87886b022e..1ec620c03064 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem)
20 might_sleep(); 20 might_sleep();
21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
22 22
23 __down_read(sem); 23 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
24} 24}
25 25
26EXPORT_SYMBOL(down_read); 26EXPORT_SYMBOL(down_read);
@@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem)
47 might_sleep(); 47 might_sleep();
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49 49
50 __down_write(sem); 50 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
51} 51}
52 52
53EXPORT_SYMBOL(down_write); 53EXPORT_SYMBOL(down_write);
@@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
111 might_sleep(); 111 might_sleep();
112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
113 113
114 __down_read(sem); 114 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
115} 115}
116 116
117EXPORT_SYMBOL(down_read_nested); 117EXPORT_SYMBOL(down_read_nested);
@@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
130 might_sleep(); 130 might_sleep();
131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
132 132
133 __down_write_nested(sem, subclass); 133 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
134} 134}
135 135
136EXPORT_SYMBOL(down_write_nested); 136EXPORT_SYMBOL(down_write_nested);
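
The switch from calling __down_read()/__down_write() directly to LOCK_CONTENDED() is what lets the new lock statistics attribute contention to rwsems. Roughly, and only as an approximation of the lockdep macro (not its exact definition):

/* Approximate shape of LOCK_CONTENDED(lock, try, block): take the fast path
 * if possible, otherwise record a contention event before blocking, and
 * record the acquisition afterwards so wait time can be charged to the lock. */
#define LOCK_CONTENDED_SKETCH(_lock, try, lock)				\
do {									\
	if (!try(_lock)) {						\
		lock_contended(&(_lock)->dep_map, _RET_IP_);		\
		lock(_lock);						\
	}								\
	lock_acquired(&(_lock)->dep_map);				\
} while (0)
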
diff --git a/kernel/sched.c b/kernel/sched.c
index 3332bbb5d5cf..93cf241cfbe9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -301,7 +301,7 @@ struct rq {
301 struct lock_class_key rq_lock_key; 301 struct lock_class_key rq_lock_key;
302}; 302};
303 303
304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
305static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
306 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
@@ -379,6 +379,23 @@ static inline unsigned long long rq_clock(struct rq *rq)
379#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
380#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
381 381
382/*
383 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
384 * clock constructed from sched_clock():
385 */
386unsigned long long cpu_clock(int cpu)
387{
388 struct rq *rq = cpu_rq(cpu);
389 unsigned long long now;
390 unsigned long flags;
391
392 spin_lock_irqsave(&rq->lock, flags);
393 now = rq_clock(rq);
394 spin_unlock_irqrestore(&rq->lock, flags);
395
396 return now;
397}
398
382#ifdef CONFIG_FAIR_GROUP_SCHED 399#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */ 400/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p) 401static inline void set_task_cfs_rq(struct task_struct *p)
@@ -736,7 +753,9 @@ static void update_curr_load(struct rq *rq, u64 now)
736 * 753 *
737 * The "10% effect" is relative and cumulative: from _any_ nice level, 754 * The "10% effect" is relative and cumulative: from _any_ nice level,
738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 755 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
739 * it's +10% CPU usage. 756 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
757 * If a task goes up by ~10% and another task goes down by ~10% then
758 * the relative distance between them is ~25%.)
740 */ 759 */
741static const int prio_to_weight[40] = { 760static const int prio_to_weight[40] = {
742/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, 761/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
@@ -746,15 +765,22 @@ static const int prio_to_weight[40] = {
746/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, 765/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
747}; 766};
748 767
768/*
769 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
770 *
771 * In cases where the weight does not change often, we can use the
772 * precalculated inverse to speed up arithmetics by turning divisions
773 * into multiplications:
774 */
749static const u32 prio_to_wmult[40] = { 775static const u32 prio_to_wmult[40] = {
750 48356, 60446, 75558, 94446, 118058, 147573, 776/* -20 */ 48356, 60446, 75558, 94446, 118058,
751 184467, 230589, 288233, 360285, 450347, 777/* -15 */ 147573, 184467, 230589, 288233, 360285,
752 562979, 703746, 879575, 1099582, 1374389, 778/* -10 */ 450347, 562979, 703746, 879575, 1099582,
753 1717986, 2147483, 2684354, 3355443, 4194304, 779/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443,
754 5244160, 6557201, 8196502, 10250518, 12782640, 780/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518,
755 16025997, 19976592, 24970740, 31350126, 39045157, 781/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126,
756 49367440, 61356675, 76695844, 95443717, 119304647, 782/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717,
757 148102320, 186737708, 238609294, 286331153, 783/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
758}; 784};
759 785
760static inline void 786static inline void
@@ -2226,7 +2252,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2226 2252
2227 rq = cpu_rq(i); 2253 rq = cpu_rq(i);
2228 2254
2229 if (*sd_idle && !idle_cpu(i)) 2255 if (*sd_idle && rq->nr_running)
2230 *sd_idle = 0; 2256 *sd_idle = 0;
2231 2257
2232 /* Bias balancing toward cpus of our domain */ 2258 /* Bias balancing toward cpus of our domain */
@@ -2248,9 +2274,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2248 /* 2274 /*
2249 * First idle cpu or the first cpu(busiest) in this sched group 2275 * First idle cpu or the first cpu(busiest) in this sched group
2250 * is eligible for doing load balancing at this and above 2276 * is eligible for doing load balancing at this and above
2251 * domains. 2277 * domains. In the newly idle case, we will allow all the cpu's
2278 * to do the newly idle load balance.
2252 */ 2279 */
2253 if (local_group && balance_cpu != this_cpu && balance) { 2280 if (idle != CPU_NEWLY_IDLE && local_group &&
2281 balance_cpu != this_cpu && balance) {
2254 *balance = 0; 2282 *balance = 0;
2255 goto ret; 2283 goto ret;
2256 } 2284 }
@@ -2668,6 +2696,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2668 unsigned long imbalance; 2696 unsigned long imbalance;
2669 int nr_moved = 0; 2697 int nr_moved = 0;
2670 int sd_idle = 0; 2698 int sd_idle = 0;
2699 int all_pinned = 0;
2671 cpumask_t cpus = CPU_MASK_ALL; 2700 cpumask_t cpus = CPU_MASK_ALL;
2672 2701
2673 /* 2702 /*
@@ -2706,10 +2735,11 @@ redo:
2706 double_lock_balance(this_rq, busiest); 2735 double_lock_balance(this_rq, busiest);
2707 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2736 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2708 minus_1_or_zero(busiest->nr_running), 2737 minus_1_or_zero(busiest->nr_running),
2709 imbalance, sd, CPU_NEWLY_IDLE, NULL); 2738 imbalance, sd, CPU_NEWLY_IDLE,
2739 &all_pinned);
2710 spin_unlock(&busiest->lock); 2740 spin_unlock(&busiest->lock);
2711 2741
2712 if (!nr_moved) { 2742 if (unlikely(all_pinned)) {
2713 cpu_clear(cpu_of(busiest), cpus); 2743 cpu_clear(cpu_of(busiest), cpus);
2714 if (!cpus_empty(cpus)) 2744 if (!cpus_empty(cpus))
2715 goto redo; 2745 goto redo;
@@ -4903,8 +4933,6 @@ static int migration_thread(void *data)
4903 struct migration_req *req; 4933 struct migration_req *req;
4904 struct list_head *head; 4934 struct list_head *head;
4905 4935
4906 try_to_freeze();
4907
4908 spin_lock_irq(&rq->lock); 4936 spin_lock_irq(&rq->lock);
4909 4937
4910 if (cpu_is_offline(cpu)) { 4938 if (cpu_is_offline(cpu)) {
@@ -5138,7 +5166,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5138 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 5166 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5139 if (IS_ERR(p)) 5167 if (IS_ERR(p))
5140 return NOTIFY_BAD; 5168 return NOTIFY_BAD;
5141 p->flags |= PF_NOFREEZE;
5142 kthread_bind(p, cpu); 5169 kthread_bind(p, cpu);
5143 /* Must be high prio: stop_machine expects to yield to it. */ 5170 /* Must be high prio: stop_machine expects to yield to it. */
5144 rq = task_rq_lock(p, &flags); 5171 rq = task_rq_lock(p, &flags);
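
The new cpu_clock() above gives callers a per-CPU nanosecond timestamp taken from the runqueue clock under rq->lock. A minimal usage sketch, with a hypothetical placeholder for the work being timed:

#include <linux/smp.h>

extern void do_some_work(void);		/* hypothetical workload */

static void time_something(void)
{
	int cpu = raw_smp_processor_id();
	unsigned long long t0, t1;

	t0 = cpu_clock(cpu);
	do_some_work();
	t1 = cpu_clock(cpu);
	printk(KERN_DEBUG "work took %llu ns on cpu %d\n", t1 - t0, cpu);
}
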
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index c3391b6020e8..ad64fcb731f2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -10,6 +10,7 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11 11
12/* #define SECCOMP_DEBUG 1 */ 12/* #define SECCOMP_DEBUG 1 */
13#define NR_SECCOMP_MODES 1
13 14
14/* 15/*
15 * Secure computing mode 1 allows only read/write/exit/sigreturn. 16 * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -54,3 +55,31 @@ void __secure_computing(int this_syscall)
54#endif 55#endif
55 do_exit(SIGKILL); 56 do_exit(SIGKILL);
56} 57}
58
59long prctl_get_seccomp(void)
60{
61 return current->seccomp.mode;
62}
63
64long prctl_set_seccomp(unsigned long seccomp_mode)
65{
66 long ret;
67
68 /* can set it only once to be even more secure */
69 ret = -EPERM;
70 if (unlikely(current->seccomp.mode))
71 goto out;
72
73 ret = -EINVAL;
74 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
75 current->seccomp.mode = seccomp_mode;
76 set_thread_flag(TIF_SECCOMP);
77#ifdef TIF_NOTSC
78 disable_TSC();
79#endif
80 ret = 0;
81 }
82
83 out:
84 return ret;
85}
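
The two helpers above back the PR_GET_SECCOMP/PR_SET_SECCOMP prctl options that kernel/sys.c wires up later in this diff. A minimal user-space sketch of the intended usage; the constants 21 and 22 are the values assumed by this series and are only defined here in case the installed headers do not provide them yet:

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_SET_SECCOMP
#define PR_GET_SECCOMP  21      /* assumed values from this patch series */
#define PR_SET_SECCOMP  22
#endif

int main(void)
{
        /* 0 = seccomp disabled, 1 = strict mode */
        printf("seccomp mode: %ld\n", (long)prctl(PR_GET_SECCOMP, 0, 0, 0, 0));

        /* Can be set only once per task; a second attempt returns -EPERM. */
        if (prctl(PR_SET_SECCOMP, 1, 0, 0, 0) != 0) {
                perror("prctl(PR_SET_SECCOMP)");
                return 1;
        }

        /* From here on only read/write/exit/sigreturn are permitted. */
        write(1, "entered strict mode\n", 20);
        _exit(0);
}

Once in strict mode any other syscall, including a second prctl(), kills the task, which is why the example ends with _exit().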
diff --git a/kernel/signal.c b/kernel/signal.c
index f9405609774e..ef8156a6aad5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default)
255 } 255 }
256} 256}
257 257
258int unhandled_signal(struct task_struct *tsk, int sig)
259{
260 if (is_init(tsk))
261 return 1;
262 if (tsk->ptrace & PT_PTRACED)
263 return 0;
264 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
265 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
266}
267
258 268
259/* Notify the system that a driver wants to block all signals for this 269/* Notify the system that a driver wants to block all signals for this
260 * process, and wants to be notified if any signals at all were to be 270 * process, and wants to be notified if any signals at all were to be
@@ -718,6 +728,37 @@ out_set:
718#define LEGACY_QUEUE(sigptr, sig) \ 728#define LEGACY_QUEUE(sigptr, sig) \
719 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) 729 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
720 730
731int print_fatal_signals;
732
733static void print_fatal_signal(struct pt_regs *regs, int signr)
734{
735 printk("%s/%d: potentially unexpected fatal signal %d.\n",
736 current->comm, current->pid, signr);
737
738#ifdef __i386__
739 printk("code at %08lx: ", regs->eip);
740 {
741 int i;
742 for (i = 0; i < 16; i++) {
743 unsigned char insn;
744
745 __get_user(insn, (unsigned char *)(regs->eip + i));
746 printk("%02x ", insn);
747 }
748 }
749#endif
750 printk("\n");
751 show_regs(regs);
752}
753
754static int __init setup_print_fatal_signals(char *str)
755{
756 get_option (&str, &print_fatal_signals);
757
758 return 1;
759}
760
761__setup("print-fatal-signals=", setup_print_fatal_signals);
721 762
722static int 763static int
723specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) 764specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -1855,6 +1896,8 @@ relock:
1855 * Anything else is fatal, maybe with a core dump. 1896 * Anything else is fatal, maybe with a core dump.
1856 */ 1897 */
1857 current->flags |= PF_SIGNALED; 1898 current->flags |= PF_SIGNALED;
1899 if ((signr != SIGKILL) && print_fatal_signals)
1900 print_fatal_signal(regs, signr);
1858 if (sig_kernel_coredump(signr)) { 1901 if (sig_kernel_coredump(signr)) {
1859 /* 1902 /*
1860 * If it was able to dump core, this kills all 1903 * If it was able to dump core, this kills all
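
print_fatal_signals defaults to 0 and can be enabled either with the print-fatal-signals= boot parameter registered above or through the matching sysctl added to kernel/sysctl.c further down. A small sketch (run as root, with procfs mounted at /proc) that turns it on and then takes an unhandled SIGSEGV so the "potentially unexpected fatal signal" report appears in the kernel log:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/kernel/print-fatal-signals", O_WRONLY);

        if (fd >= 0) {
                write(fd, "1\n", 2);    /* enable the diagnostic */
                close(fd);
        }

        /* Die on an unhandled fatal signal; the kernel now logs the
         * faulting task, the signal number and (on i386) the code bytes. */
        *(volatile int *)0 = 0;
        return 0;
}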
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 73217a9e2875..0f546ddea43d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -14,6 +14,7 @@
14#include <linux/notifier.h> 14#include <linux/notifier.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/freezer.h>
17#include <linux/kthread.h> 18#include <linux/kthread.h>
18#include <linux/rcupdate.h> 19#include <linux/rcupdate.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
@@ -488,8 +489,6 @@ void __init softirq_init(void)
488 489
489static int ksoftirqd(void * __bind_cpu) 490static int ksoftirqd(void * __bind_cpu)
490{ 491{
491 current->flags |= PF_NOFREEZE;
492
493 set_current_state(TASK_INTERRUPTIBLE); 492 set_current_state(TASK_INTERRUPTIBLE);
494 493
495 while (!kthread_should_stop()) { 494 while (!kthread_should_stop()) {
@@ -614,12 +613,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
614 kthread_bind(per_cpu(ksoftirqd, hotcpu), 613 kthread_bind(per_cpu(ksoftirqd, hotcpu),
615 any_online_cpu(cpu_online_map)); 614 any_online_cpu(cpu_online_map));
616 case CPU_DEAD: 615 case CPU_DEAD:
617 case CPU_DEAD_FROZEN: 616 case CPU_DEAD_FROZEN: {
617 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
618
618 p = per_cpu(ksoftirqd, hotcpu); 619 p = per_cpu(ksoftirqd, hotcpu);
619 per_cpu(ksoftirqd, hotcpu) = NULL; 620 per_cpu(ksoftirqd, hotcpu) = NULL;
621 sched_setscheduler(p, SCHED_FIFO, &param);
620 kthread_stop(p); 622 kthread_stop(p);
621 takeover_tasklets(hotcpu); 623 takeover_tasklets(hotcpu);
622 break; 624 break;
625 }
623#endif /* CONFIG_HOTPLUG_CPU */ 626#endif /* CONFIG_HOTPLUG_CPU */
624 } 627 }
625 return NOTIFY_OK; 628 return NOTIFY_OK;
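
The CPU_DEAD/CPU_DEAD_FROZEN branch now raises the dying CPU's ksoftirqd to SCHED_FIFO before stopping it, so kthread_stop() is not starved while the rest of the hotplug teardown waits. A hedged sketch of that generic boost-then-stop pattern; the function name is illustrative, not part of the patch:

#include <linux/kthread.h>
#include <linux/sched.h>

/* Sketch: stop a per-cpu kernel thread promptly by boosting it first. */
static void boost_and_stop(struct task_struct *p)
{
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

        sched_setscheduler(p, SCHED_FIFO, &param);      /* make sure it runs soon */
        kthread_stop(p);                                /* then wait for it to exit */
}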
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0131e296ffb4..708d4882c0c3 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -10,6 +10,7 @@
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/freezer.h>
13#include <linux/kthread.h> 14#include <linux/kthread.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -116,7 +117,6 @@ static int watchdog(void * __bind_cpu)
116 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 117 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
117 118
118 sched_setscheduler(current, SCHED_FIFO, &param); 119 sched_setscheduler(current, SCHED_FIFO, &param);
119 current->flags |= PF_NOFREEZE;
120 120
121 /* initialize timestamp */ 121 /* initialize timestamp */
122 touch_softlockup_watchdog(); 122 touch_softlockup_watchdog();
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 2c6c2bf85514..cd72424c2662 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock)
72{ 72{
73 preempt_disable(); 73 preempt_disable();
74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
75 _raw_read_lock(lock); 75 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
76} 76}
77EXPORT_SYMBOL(_read_lock); 77EXPORT_SYMBOL(_read_lock);
78 78
@@ -88,8 +88,8 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
88 * _raw_spin_lock_flags() code, because lockdep assumes 88 * _raw_spin_lock_flags() code, because lockdep assumes
89 * that interrupts are not re-enabled during lock-acquire: 89 * that interrupts are not re-enabled during lock-acquire:
90 */ 90 */
91#ifdef CONFIG_PROVE_LOCKING 91#ifdef CONFIG_LOCKDEP
92 _raw_spin_lock(lock); 92 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
93#else 93#else
94 _raw_spin_lock_flags(lock, &flags); 94 _raw_spin_lock_flags(lock, &flags);
95#endif 95#endif
@@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
102 local_irq_disable(); 102 local_irq_disable();
103 preempt_disable(); 103 preempt_disable();
104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
105 _raw_spin_lock(lock); 105 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
106} 106}
107EXPORT_SYMBOL(_spin_lock_irq); 107EXPORT_SYMBOL(_spin_lock_irq);
108 108
@@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
111 local_bh_disable(); 111 local_bh_disable();
112 preempt_disable(); 112 preempt_disable();
113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
114 _raw_spin_lock(lock); 114 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
115} 115}
116EXPORT_SYMBOL(_spin_lock_bh); 116EXPORT_SYMBOL(_spin_lock_bh);
117 117
@@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
122 local_irq_save(flags); 122 local_irq_save(flags);
123 preempt_disable(); 123 preempt_disable();
124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
125 _raw_read_lock(lock); 125 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
126 return flags; 126 return flags;
127} 127}
128EXPORT_SYMBOL(_read_lock_irqsave); 128EXPORT_SYMBOL(_read_lock_irqsave);
@@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
132 local_irq_disable(); 132 local_irq_disable();
133 preempt_disable(); 133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 _raw_read_lock(lock); 135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 136}
137EXPORT_SYMBOL(_read_lock_irq); 137EXPORT_SYMBOL(_read_lock_irq);
138 138
@@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
141 local_bh_disable(); 141 local_bh_disable();
142 preempt_disable(); 142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 _raw_read_lock(lock); 144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 145}
146EXPORT_SYMBOL(_read_lock_bh); 146EXPORT_SYMBOL(_read_lock_bh);
147 147
@@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
152 local_irq_save(flags); 152 local_irq_save(flags);
153 preempt_disable(); 153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 _raw_write_lock(lock); 155 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
156 return flags; 156 return flags;
157} 157}
158EXPORT_SYMBOL(_write_lock_irqsave); 158EXPORT_SYMBOL(_write_lock_irqsave);
@@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
162 local_irq_disable(); 162 local_irq_disable();
163 preempt_disable(); 163 preempt_disable();
164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
165 _raw_write_lock(lock); 165 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
166} 166}
167EXPORT_SYMBOL(_write_lock_irq); 167EXPORT_SYMBOL(_write_lock_irq);
168 168
@@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
171 local_bh_disable(); 171 local_bh_disable();
172 preempt_disable(); 172 preempt_disable();
173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
174 _raw_write_lock(lock); 174 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
175} 175}
176EXPORT_SYMBOL(_write_lock_bh); 176EXPORT_SYMBOL(_write_lock_bh);
177 177
@@ -179,7 +179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock)
179{ 179{
180 preempt_disable(); 180 preempt_disable();
181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
182 _raw_spin_lock(lock); 182 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
183} 183}
184 184
185EXPORT_SYMBOL(_spin_lock); 185EXPORT_SYMBOL(_spin_lock);
@@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock)
188{ 188{
189 preempt_disable(); 189 preempt_disable();
190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
191 _raw_write_lock(lock); 191 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
192} 192}
193 193
194EXPORT_SYMBOL(_write_lock); 194EXPORT_SYMBOL(_write_lock);
@@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
289{ 289{
290 preempt_disable(); 290 preempt_disable();
291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
292 _raw_spin_lock(lock); 292 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
293} 293}
294 294
295EXPORT_SYMBOL(_spin_lock_nested); 295EXPORT_SYMBOL(_spin_lock_nested);
@@ -305,8 +305,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
305 * _raw_spin_lock_flags() code, because lockdep assumes 305 * _raw_spin_lock_flags() code, because lockdep assumes
306 * that interrupts are not re-enabled during lock-acquire: 306 * that interrupts are not re-enabled during lock-acquire:
307 */ 307 */
308#ifdef CONFIG_PROVE_SPIN_LOCKING 308#ifdef CONFIG_LOCKDEP
309 _raw_spin_lock(lock); 309 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
310#else 310#else
311 _raw_spin_lock_flags(lock, &flags); 311 _raw_spin_lock_flags(lock, &flags);
312#endif 312#endif
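
Every _raw_*_lock() call above is funnelled through LOCK_CONTENDED() so the new lock-statistics code can tell contended acquisitions from uncontended ones. Roughly, the macro (defined in include/linux/lockdep.h; this is a sketch and the exact definition may differ in detail) tries the trylock fast path first and only records a contention event when that fails; with CONFIG_LOCK_STAT disabled it collapses back to a plain lock(_lock):

#define LOCK_CONTENDED(_lock, try, lock)                        \
do {                                                            \
        if (!try(_lock)) {                                      \
                lock_contended(&(_lock)->dep_map, _RET_IP_);    \
                lock(_lock);                                    \
        }                                                       \
        lock_acquired(&(_lock)->dep_map);                       \
} while (0)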
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fcee2a8e6da3..319821ef78af 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -93,10 +93,6 @@ static void stopmachine_set_state(enum stopmachine_state state)
93static int stop_machine(void) 93static int stop_machine(void)
94{ 94{
95 int i, ret = 0; 95 int i, ret = 0;
96 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
97
98 /* One high-prio thread per cpu. We'll do this one. */
99 sched_setscheduler(current, SCHED_FIFO, &param);
100 96
101 atomic_set(&stopmachine_thread_ack, 0); 97 atomic_set(&stopmachine_thread_ack, 0);
102 stopmachine_num_threads = 0; 98 stopmachine_num_threads = 0;
@@ -189,6 +185,10 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
189 185
190 p = kthread_create(do_stop, &smdata, "kstopmachine"); 186 p = kthread_create(do_stop, &smdata, "kstopmachine");
191 if (!IS_ERR(p)) { 187 if (!IS_ERR(p)) {
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 /* One high-prio thread per cpu. We'll do this one. */
191 sched_setscheduler(p, SCHED_FIFO, &param);
192 kthread_bind(p, cpu); 192 kthread_bind(p, cpu);
193 wake_up_process(p); 193 wake_up_process(p);
194 wait_for_completion(&smdata.done); 194 wait_for_completion(&smdata.done);
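
The SCHED_FIFO promotion moves from the kstopmachine thread itself into its creator, so the priority is already in place before the thread is bound and woken. For context, a hedged sketch of how a caller typically uses this facility; the function and flag names below are illustrative:

#include <linux/cpumask.h>
#include <linux/stop_machine.h>

static int __disable_feature(void *data)
{
        /* Runs while every other online CPU spins with interrupts disabled. */
        *(int *)data = 0;
        return 0;
}

static int disable_feature_atomically(int *flag)
{
        /* NR_CPUS here means "run on any CPU". */
        return stop_machine_run(__disable_feature, flag, NR_CPUS);
}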
diff --git a/kernel/sys.c b/kernel/sys.c
index 872271ccc384..08562f419768 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,10 +31,12 @@
31#include <linux/cn_proc.h> 31#include <linux/cn_proc.h>
32#include <linux/getcpu.h> 32#include <linux/getcpu.h>
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h>
34 35
35#include <linux/compat.h> 36#include <linux/compat.h>
36#include <linux/syscalls.h> 37#include <linux/syscalls.h>
37#include <linux/kprobes.h> 38#include <linux/kprobes.h>
39#include <linux/user_namespace.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40#include <asm/io.h> 42#include <asm/io.h>
@@ -98,6 +100,13 @@ struct pid *cad_pid;
98EXPORT_SYMBOL(cad_pid); 100EXPORT_SYMBOL(cad_pid);
99 101
100/* 102/*
103 * If set, this is used for preparing the system to power off.
104 */
105
106void (*pm_power_off_prepare)(void);
107EXPORT_SYMBOL(pm_power_off_prepare);
108
109/*
101 * Notifier list for kernel code which wants to be called 110 * Notifier list for kernel code which wants to be called
102 * at shutdown. This is used to stop any idling DMA operations 111 * at shutdown. This is used to stop any idling DMA operations
103 * and the like. 112 * and the like.
@@ -865,6 +874,8 @@ EXPORT_SYMBOL_GPL(kernel_halt);
865void kernel_power_off(void) 874void kernel_power_off(void)
866{ 875{
867 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 876 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
877 if (pm_power_off_prepare)
878 pm_power_off_prepare();
868 printk(KERN_EMERG "Power down.\n"); 879 printk(KERN_EMERG "Power down.\n");
869 machine_power_off(); 880 machine_power_off();
870} 881}
@@ -1025,7 +1036,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
1025 return -EPERM; 1036 return -EPERM;
1026 } 1037 }
1027 if (new_egid != old_egid) { 1038 if (new_egid != old_egid) {
1028 current->mm->dumpable = suid_dumpable; 1039 set_dumpable(current->mm, suid_dumpable);
1029 smp_wmb(); 1040 smp_wmb();
1030 } 1041 }
1031 if (rgid != (gid_t) -1 || 1042 if (rgid != (gid_t) -1 ||
@@ -1055,13 +1066,13 @@ asmlinkage long sys_setgid(gid_t gid)
1055 1066
1056 if (capable(CAP_SETGID)) { 1067 if (capable(CAP_SETGID)) {
1057 if (old_egid != gid) { 1068 if (old_egid != gid) {
1058 current->mm->dumpable = suid_dumpable; 1069 set_dumpable(current->mm, suid_dumpable);
1059 smp_wmb(); 1070 smp_wmb();
1060 } 1071 }
1061 current->gid = current->egid = current->sgid = current->fsgid = gid; 1072 current->gid = current->egid = current->sgid = current->fsgid = gid;
1062 } else if ((gid == current->gid) || (gid == current->sgid)) { 1073 } else if ((gid == current->gid) || (gid == current->sgid)) {
1063 if (old_egid != gid) { 1074 if (old_egid != gid) {
1064 current->mm->dumpable = suid_dumpable; 1075 set_dumpable(current->mm, suid_dumpable);
1065 smp_wmb(); 1076 smp_wmb();
1066 } 1077 }
1067 current->egid = current->fsgid = gid; 1078 current->egid = current->fsgid = gid;
@@ -1078,13 +1089,13 @@ static int set_user(uid_t new_ruid, int dumpclear)
1078{ 1089{
1079 struct user_struct *new_user; 1090 struct user_struct *new_user;
1080 1091
1081 new_user = alloc_uid(new_ruid); 1092 new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
1082 if (!new_user) 1093 if (!new_user)
1083 return -EAGAIN; 1094 return -EAGAIN;
1084 1095
1085 if (atomic_read(&new_user->processes) >= 1096 if (atomic_read(&new_user->processes) >=
1086 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 1097 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
1087 new_user != &root_user) { 1098 new_user != current->nsproxy->user_ns->root_user) {
1088 free_uid(new_user); 1099 free_uid(new_user);
1089 return -EAGAIN; 1100 return -EAGAIN;
1090 } 1101 }
@@ -1092,7 +1103,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
1092 switch_uid(new_user); 1103 switch_uid(new_user);
1093 1104
1094 if (dumpclear) { 1105 if (dumpclear) {
1095 current->mm->dumpable = suid_dumpable; 1106 set_dumpable(current->mm, suid_dumpable);
1096 smp_wmb(); 1107 smp_wmb();
1097 } 1108 }
1098 current->uid = new_ruid; 1109 current->uid = new_ruid;
@@ -1148,7 +1159,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1148 return -EAGAIN; 1159 return -EAGAIN;
1149 1160
1150 if (new_euid != old_euid) { 1161 if (new_euid != old_euid) {
1151 current->mm->dumpable = suid_dumpable; 1162 set_dumpable(current->mm, suid_dumpable);
1152 smp_wmb(); 1163 smp_wmb();
1153 } 1164 }
1154 current->fsuid = current->euid = new_euid; 1165 current->fsuid = current->euid = new_euid;
@@ -1198,7 +1209,7 @@ asmlinkage long sys_setuid(uid_t uid)
1198 return -EPERM; 1209 return -EPERM;
1199 1210
1200 if (old_euid != uid) { 1211 if (old_euid != uid) {
1201 current->mm->dumpable = suid_dumpable; 1212 set_dumpable(current->mm, suid_dumpable);
1202 smp_wmb(); 1213 smp_wmb();
1203 } 1214 }
1204 current->fsuid = current->euid = uid; 1215 current->fsuid = current->euid = uid;
@@ -1243,7 +1254,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
1243 } 1254 }
1244 if (euid != (uid_t) -1) { 1255 if (euid != (uid_t) -1) {
1245 if (euid != current->euid) { 1256 if (euid != current->euid) {
1246 current->mm->dumpable = suid_dumpable; 1257 set_dumpable(current->mm, suid_dumpable);
1247 smp_wmb(); 1258 smp_wmb();
1248 } 1259 }
1249 current->euid = euid; 1260 current->euid = euid;
@@ -1293,7 +1304,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
1293 } 1304 }
1294 if (egid != (gid_t) -1) { 1305 if (egid != (gid_t) -1) {
1295 if (egid != current->egid) { 1306 if (egid != current->egid) {
1296 current->mm->dumpable = suid_dumpable; 1307 set_dumpable(current->mm, suid_dumpable);
1297 smp_wmb(); 1308 smp_wmb();
1298 } 1309 }
1299 current->egid = egid; 1310 current->egid = egid;
@@ -1339,7 +1350,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
1339 uid == current->suid || uid == current->fsuid || 1350 uid == current->suid || uid == current->fsuid ||
1340 capable(CAP_SETUID)) { 1351 capable(CAP_SETUID)) {
1341 if (uid != old_fsuid) { 1352 if (uid != old_fsuid) {
1342 current->mm->dumpable = suid_dumpable; 1353 set_dumpable(current->mm, suid_dumpable);
1343 smp_wmb(); 1354 smp_wmb();
1344 } 1355 }
1345 current->fsuid = uid; 1356 current->fsuid = uid;
@@ -1368,7 +1379,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
1368 gid == current->sgid || gid == current->fsgid || 1379 gid == current->sgid || gid == current->fsgid ||
1369 capable(CAP_SETGID)) { 1380 capable(CAP_SETGID)) {
1370 if (gid != old_fsgid) { 1381 if (gid != old_fsgid) {
1371 current->mm->dumpable = suid_dumpable; 1382 set_dumpable(current->mm, suid_dumpable);
1372 smp_wmb(); 1383 smp_wmb();
1373 } 1384 }
1374 current->fsgid = gid; 1385 current->fsgid = gid;
@@ -2165,14 +2176,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2165 error = put_user(current->pdeath_signal, (int __user *)arg2); 2176 error = put_user(current->pdeath_signal, (int __user *)arg2);
2166 break; 2177 break;
2167 case PR_GET_DUMPABLE: 2178 case PR_GET_DUMPABLE:
2168 error = current->mm->dumpable; 2179 error = get_dumpable(current->mm);
2169 break; 2180 break;
2170 case PR_SET_DUMPABLE: 2181 case PR_SET_DUMPABLE:
2171 if (arg2 < 0 || arg2 > 1) { 2182 if (arg2 < 0 || arg2 > 1) {
2172 error = -EINVAL; 2183 error = -EINVAL;
2173 break; 2184 break;
2174 } 2185 }
2175 current->mm->dumpable = arg2; 2186 set_dumpable(current->mm, arg2);
2176 break; 2187 break;
2177 2188
2178 case PR_SET_UNALIGN: 2189 case PR_SET_UNALIGN:
@@ -2241,6 +2252,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2241 error = SET_ENDIAN(current, arg2); 2252 error = SET_ENDIAN(current, arg2);
2242 break; 2253 break;
2243 2254
2255 case PR_GET_SECCOMP:
2256 error = prctl_get_seccomp();
2257 break;
2258 case PR_SET_SECCOMP:
2259 error = prctl_set_seccomp(arg2);
2260 break;
2261
2244 default: 2262 default:
2245 error = -EINVAL; 2263 error = -EINVAL;
2246 break; 2264 break;
@@ -2277,3 +2295,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2277 } 2295 }
2278 return err ? -EFAULT : 0; 2296 return err ? -EFAULT : 0;
2279} 2297}
2298
2299char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2300
2301static void argv_cleanup(char **argv, char **envp)
2302{
2303 argv_free(argv);
2304}
2305
2306/**
2307 * orderly_poweroff - Trigger an orderly system poweroff
2308 * @force: force poweroff if command execution fails
2309 *
2310 * This may be called from any context to trigger a system shutdown.
2311 * If the orderly shutdown fails, it will force an immediate shutdown.
2312 */
2313int orderly_poweroff(bool force)
2314{
2315 int argc;
2316 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2317 static char *envp[] = {
2318 "HOME=/",
2319 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2320 NULL
2321 };
2322 int ret = -ENOMEM;
2323 struct subprocess_info *info;
2324
2325 if (argv == NULL) {
2326 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2327 __func__, poweroff_cmd);
2328 goto out;
2329 }
2330
2331 info = call_usermodehelper_setup(argv[0], argv, envp);
2332 if (info == NULL) {
2333 argv_free(argv);
2334 goto out;
2335 }
2336
2337 call_usermodehelper_setcleanup(info, argv_cleanup);
2338
2339 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2340
2341 out:
2342 if (ret && force) {
2343 printk(KERN_WARNING "Failed to start orderly shutdown: "
2344 "forcing the issue\n");
2345
2346 /* I guess this should try to kick off some daemon to
2347 sync and poweroff asap. Or not even bother syncing
2348 if we're doing an emergency shutdown? */
2349 emergency_sync();
2350 kernel_power_off();
2351 }
2352
2353 return ret;
2354}
2355EXPORT_SYMBOL_GPL(orderly_poweroff);
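
orderly_poweroff() gives drivers a single call for "shut the machine down cleanly, or force it if that fails". A hedged sketch of a typical caller, for example a thermal or UPS driver reacting to a critical event; the handler name is illustrative and the header is assumed to be the one that ends up declaring orderly_poweroff():

#include <linux/types.h>
#include <linux/reboot.h>       /* assumption: declares orderly_poweroff() */

static void critical_condition_handler(void)
{
        /*
         * Runs poweroff_cmd ("/sbin/poweroff" by default, overridable via
         * the poweroff_cmd sysctl below); forces an immediate power-off if
         * the usermode helper cannot be started.
         */
        orderly_poweroff(true);
}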
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7e11e2c98bf9..b0ec498a18d9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -14,6 +14,7 @@ asmlinkage long sys_ni_syscall(void)
14 14
15cond_syscall(sys_nfsservctl); 15cond_syscall(sys_nfsservctl);
16cond_syscall(sys_quotactl); 16cond_syscall(sys_quotactl);
17cond_syscall(sys32_quotactl);
17cond_syscall(sys_acct); 18cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 19cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 20cond_syscall(sys_swapon);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d93e13d93f24..ddebf3f2affe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -29,6 +29,7 @@
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/kernel.h> 34#include <linux/kernel.h>
34#include <linux/kobject.h> 35#include <linux/kobject.h>
@@ -45,13 +46,11 @@
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
47#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
50#include <asm/processor.h> 52#include <asm/processor.h>
51 53
52extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
53 void __user *buffer, size_t *lenp, loff_t *ppos);
54
55#ifdef CONFIG_X86 54#ifdef CONFIG_X86
56#include <asm/nmi.h> 55#include <asm/nmi.h>
57#include <asm/stacktrace.h> 56#include <asm/stacktrace.h>
@@ -61,6 +60,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
61 60
62/* External variables not in a header file. */ 61/* External variables not in a header file. */
63extern int C_A_D; 62extern int C_A_D;
63extern int print_fatal_signals;
64extern int sysctl_overcommit_memory; 64extern int sysctl_overcommit_memory;
65extern int sysctl_overcommit_ratio; 65extern int sysctl_overcommit_ratio;
66extern int sysctl_panic_on_oom; 66extern int sysctl_panic_on_oom;
@@ -78,6 +78,7 @@ extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect; 79extern int maps_protect;
80extern int sysctl_stat_interval; 80extern int sysctl_stat_interval;
81extern int audit_argv_kb;
81 82
82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 83/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
83static int maxolduid = 65535; 84static int maxolduid = 65535;
@@ -160,6 +161,8 @@ extern ctl_table inotify_table[];
160int sysctl_legacy_va_layout; 161int sysctl_legacy_va_layout;
161#endif 162#endif
162 163
164extern int prove_locking;
165extern int lock_stat;
163 166
164/* The default sysctl tables: */ 167/* The default sysctl tables: */
165 168
@@ -202,7 +205,10 @@ static ctl_table root_table[] = {
202 .mode = 0555, 205 .mode = 0555,
203 .child = dev_table, 206 .child = dev_table,
204 }, 207 },
205 208/*
209 * NOTE: do not add new entries to this table unless you have read
210 * Documentation/sysctl/ctl_unnumbered.txt
211 */
206 { .ctl_name = 0 } 212 { .ctl_name = 0 }
207}; 213};
208 214
@@ -278,6 +284,26 @@ static ctl_table kern_table[] = {
278 .mode = 0644, 284 .mode = 0644,
279 .proc_handler = &proc_dointvec, 285 .proc_handler = &proc_dointvec,
280 }, 286 },
287#ifdef CONFIG_PROVE_LOCKING
288 {
289 .ctl_name = CTL_UNNUMBERED,
290 .procname = "prove_locking",
291 .data = &prove_locking,
292 .maxlen = sizeof(int),
293 .mode = 0644,
294 .proc_handler = &proc_dointvec,
295 },
296#endif
297#ifdef CONFIG_LOCK_STAT
298 {
299 .ctl_name = CTL_UNNUMBERED,
300 .procname = "lock_stat",
301 .data = &lock_stat,
302 .maxlen = sizeof(int),
303 .mode = 0644,
304 .proc_handler = &proc_dointvec,
305 },
306#endif
281 { 307 {
282 .ctl_name = CTL_UNNUMBERED, 308 .ctl_name = CTL_UNNUMBERED,
283 .procname = "sched_features", 309 .procname = "sched_features",
@@ -303,6 +329,16 @@ static ctl_table kern_table[] = {
303 .mode = 0644, 329 .mode = 0644,
304 .proc_handler = &proc_dointvec, 330 .proc_handler = &proc_dointvec,
305 }, 331 },
332#ifdef CONFIG_AUDITSYSCALL
333 {
334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "audit_argv_kb",
336 .data = &audit_argv_kb,
337 .maxlen = sizeof(int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec,
340 },
341#endif
306 { 342 {
307 .ctl_name = KERN_CORE_PATTERN, 343 .ctl_name = KERN_CORE_PATTERN,
308 .procname = "core_pattern", 344 .procname = "core_pattern",
@@ -340,6 +376,14 @@ static ctl_table kern_table[] = {
340 .proc_handler = &proc_dointvec, 376 .proc_handler = &proc_dointvec,
341 }, 377 },
342#endif 378#endif
379 {
380 .ctl_name = CTL_UNNUMBERED,
381 .procname = "print-fatal-signals",
382 .data = &print_fatal_signals,
383 .maxlen = sizeof(int),
384 .mode = 0644,
385 .proc_handler = &proc_dointvec,
386 },
343#ifdef __sparc__ 387#ifdef __sparc__
344 { 388 {
345 .ctl_name = KERN_SPARC_REBOOT, 389 .ctl_name = KERN_SPARC_REBOOT,
@@ -649,7 +693,7 @@ static ctl_table kern_table[] = {
649 { 693 {
650 .ctl_name = KERN_ACPI_VIDEO_FLAGS, 694 .ctl_name = KERN_ACPI_VIDEO_FLAGS,
651 .procname = "acpi_video_flags", 695 .procname = "acpi_video_flags",
652 .data = &acpi_video_flags, 696 .data = &acpi_realmode_flags,
653 .maxlen = sizeof (unsigned long), 697 .maxlen = sizeof (unsigned long),
654 .mode = 0644, 698 .mode = 0644,
655 .proc_handler = &proc_doulongvec_minmax, 699 .proc_handler = &proc_doulongvec_minmax,
@@ -695,13 +739,26 @@ static ctl_table kern_table[] = {
695 .proc_handler = &proc_dointvec, 739 .proc_handler = &proc_dointvec,
696 }, 740 },
697#endif 741#endif
698 742 {
743 .ctl_name = CTL_UNNUMBERED,
744 .procname = "poweroff_cmd",
745 .data = &poweroff_cmd,
746 .maxlen = POWEROFF_CMD_PATH_LEN,
747 .mode = 0644,
748 .proc_handler = &proc_dostring,
749 .strategy = &sysctl_string,
750 },
751/*
752 * NOTE: do not add new entries to this table unless you have read
753 * Documentation/sysctl/ctl_unnumbered.txt
754 */
699 { .ctl_name = 0 } 755 { .ctl_name = 0 }
700}; 756};
701 757
702/* Constants for minimum and maximum testing in vm_table. 758/* Constants for minimum and maximum testing in vm_table.
703 We use these as one-element integer vectors. */ 759 We use these as one-element integer vectors. */
704static int zero; 760static int zero;
761static int two = 2;
705static int one_hundred = 100; 762static int one_hundred = 100;
706 763
707 764
@@ -814,6 +871,14 @@ static ctl_table vm_table[] = {
814 .mode = 0644, 871 .mode = 0644,
815 .proc_handler = &proc_dointvec, 872 .proc_handler = &proc_dointvec,
816 }, 873 },
874 {
875 .ctl_name = CTL_UNNUMBERED,
876 .procname = "hugepages_treat_as_movable",
877 .data = &hugepages_treat_as_movable,
878 .maxlen = sizeof(int),
879 .mode = 0644,
880 .proc_handler = &hugetlb_treat_movable_handler,
881 },
817#endif 882#endif
818 { 883 {
819 .ctl_name = VM_LOWMEM_RESERVE_RATIO, 884 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
@@ -958,6 +1023,17 @@ static ctl_table vm_table[] = {
958 .mode = 0644, 1023 .mode = 0644,
959 .proc_handler = &proc_doulongvec_minmax, 1024 .proc_handler = &proc_doulongvec_minmax,
960 }, 1025 },
1026#ifdef CONFIG_NUMA
1027 {
1028 .ctl_name = CTL_UNNUMBERED,
1029 .procname = "numa_zonelist_order",
1030 .data = &numa_zonelist_order,
1031 .maxlen = NUMA_ZONELIST_ORDER_LEN,
1032 .mode = 0644,
1033 .proc_handler = &numa_zonelist_order_handler,
1034 .strategy = &sysctl_string,
1035 },
1036#endif
961#endif 1037#endif
962#if defined(CONFIG_X86_32) || \ 1038#if defined(CONFIG_X86_32) || \
963 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1039 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
@@ -972,6 +1048,10 @@ static ctl_table vm_table[] = {
972 .extra1 = &zero, 1048 .extra1 = &zero,
973 }, 1049 },
974#endif 1050#endif
1051/*
1052 * NOTE: do not add new entries to this table unless you have read
1053 * Documentation/sysctl/ctl_unnumbered.txt
1054 */
975 { .ctl_name = 0 } 1055 { .ctl_name = 0 }
976}; 1056};
977 1057
@@ -1069,7 +1149,10 @@ static ctl_table fs_table[] = {
1069 .data = &lease_break_time, 1149 .data = &lease_break_time,
1070 .maxlen = sizeof(int), 1150 .maxlen = sizeof(int),
1071 .mode = 0644, 1151 .mode = 0644,
1072 .proc_handler = &proc_dointvec, 1152 .proc_handler = &proc_dointvec_minmax,
1153 .strategy = &sysctl_intvec,
1154 .extra1 = &zero,
1155 .extra2 = &two,
1073 }, 1156 },
1074 { 1157 {
1075 .ctl_name = FS_AIO_NR, 1158 .ctl_name = FS_AIO_NR,
@@ -1112,10 +1195,24 @@ static ctl_table fs_table[] = {
1112 .child = binfmt_misc_table, 1195 .child = binfmt_misc_table,
1113 }, 1196 },
1114#endif 1197#endif
1198/*
1199 * NOTE: do not add new entries to this table unless you have read
1200 * Documentation/sysctl/ctl_unnumbered.txt
1201 */
1115 { .ctl_name = 0 } 1202 { .ctl_name = 0 }
1116}; 1203};
1117 1204
1118static ctl_table debug_table[] = { 1205static ctl_table debug_table[] = {
1206#ifdef CONFIG_X86
1207 {
1208 .ctl_name = CTL_UNNUMBERED,
1209 .procname = "exception-trace",
1210 .data = &show_unhandled_signals,
1211 .maxlen = sizeof(int),
1212 .mode = 0644,
1213 .proc_handler = proc_dointvec
1214 },
1215#endif
1119 { .ctl_name = 0 } 1216 { .ctl_name = 0 }
1120}; 1217};
1121 1218
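
All of the new entries use CTL_UNNUMBERED, which is what the repeated NOTE comments are about: new sysctls get a /proc/sys name but no binary sysctl number. A sketch of such an entry for a hypothetical integer knob, following the same layout as the tables above (names are illustrative):

#include <linux/sysctl.h>

static int my_feature_enabled;

static ctl_table my_kern_entries[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,       /* no sys_sysctl() number */
                .procname       = "my_feature_enabled", /* /proc/sys/kernel/... */
                .data           = &my_feature_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

Such a table is then hooked under the appropriate parent with register_sysctl_table(), as the existing tables are.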
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 906cae771585..059431ed67db 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -196,6 +196,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
196 196
197 /* fill in basic acct fields */ 197 /* fill in basic acct fields */
198 stats->version = TASKSTATS_VERSION; 198 stats->version = TASKSTATS_VERSION;
199 stats->nvcsw = tsk->nvcsw;
200 stats->nivcsw = tsk->nivcsw;
199 bacct_add_tsk(stats, tsk); 201 bacct_add_tsk(stats, tsk);
200 202
201 /* fill in extended acct fields */ 203 /* fill in extended acct fields */
@@ -242,6 +244,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
242 */ 244 */
243 delayacct_add_tsk(stats, tsk); 245 delayacct_add_tsk(stats, tsk);
244 246
247 stats->nvcsw += tsk->nvcsw;
248 stats->nivcsw += tsk->nivcsw;
245 } while_each_thread(first, tsk); 249 } while_each_thread(first, tsk);
246 250
247 unlock_task_sighand(first, &flags); 251 unlock_task_sighand(first, &flags);
diff --git a/kernel/time.c b/kernel/time.c
index f04791f69408..5b81da08bbdb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -58,9 +58,9 @@ EXPORT_SYMBOL(sys_tz);
58asmlinkage long sys_time(time_t __user * tloc) 58asmlinkage long sys_time(time_t __user * tloc)
59{ 59{
60 time_t i; 60 time_t i;
61 struct timeval tv; 61 struct timespec tv;
62 62
63 do_gettimeofday(&tv); 63 getnstimeofday(&tv);
64 i = tv.tv_sec; 64 i = tv.tv_sec;
65 65
66 if (tloc) { 66 if (tloc) {
@@ -133,7 +133,6 @@ static inline void warp_clock(void)
133 write_seqlock_irq(&xtime_lock); 133 write_seqlock_irq(&xtime_lock);
134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
135 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 135 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
136 time_interpolator_reset();
137 write_sequnlock_irq(&xtime_lock); 136 write_sequnlock_irq(&xtime_lock);
138 clock_was_set(); 137 clock_was_set();
139} 138}
@@ -306,79 +305,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
306} 305}
307EXPORT_SYMBOL(timespec_trunc); 306EXPORT_SYMBOL(timespec_trunc);
308 307
309#ifdef CONFIG_TIME_INTERPOLATION
310void getnstimeofday (struct timespec *tv)
311{
312 unsigned long seq,sec,nsec;
313
314 do {
315 seq = read_seqbegin(&xtime_lock);
316 sec = xtime.tv_sec;
317 nsec = xtime.tv_nsec+time_interpolator_get_offset();
318 } while (unlikely(read_seqretry(&xtime_lock, seq)));
319
320 while (unlikely(nsec >= NSEC_PER_SEC)) {
321 nsec -= NSEC_PER_SEC;
322 ++sec;
323 }
324 tv->tv_sec = sec;
325 tv->tv_nsec = nsec;
326}
327EXPORT_SYMBOL_GPL(getnstimeofday);
328
329int do_settimeofday (struct timespec *tv)
330{
331 time_t wtm_sec, sec = tv->tv_sec;
332 long wtm_nsec, nsec = tv->tv_nsec;
333
334 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
335 return -EINVAL;
336
337 write_seqlock_irq(&xtime_lock);
338 {
339 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
340 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
341
342 set_normalized_timespec(&xtime, sec, nsec);
343 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
344
345 time_adjust = 0; /* stop active adjtime() */
346 time_status |= STA_UNSYNC;
347 time_maxerror = NTP_PHASE_LIMIT;
348 time_esterror = NTP_PHASE_LIMIT;
349 time_interpolator_reset();
350 }
351 write_sequnlock_irq(&xtime_lock);
352 clock_was_set();
353 return 0;
354}
355EXPORT_SYMBOL(do_settimeofday);
356
357void do_gettimeofday (struct timeval *tv)
358{
359 unsigned long seq, nsec, usec, sec, offset;
360 do {
361 seq = read_seqbegin(&xtime_lock);
362 offset = time_interpolator_get_offset();
363 sec = xtime.tv_sec;
364 nsec = xtime.tv_nsec;
365 } while (unlikely(read_seqretry(&xtime_lock, seq)));
366
367 usec = (nsec + offset) / 1000;
368
369 while (unlikely(usec >= USEC_PER_SEC)) {
370 usec -= USEC_PER_SEC;
371 ++sec;
372 }
373
374 tv->tv_sec = sec;
375 tv->tv_usec = usec;
376}
377
378EXPORT_SYMBOL(do_gettimeofday);
379
380
381#else
382#ifndef CONFIG_GENERIC_TIME 308#ifndef CONFIG_GENERIC_TIME
383/* 309/*
384 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 310 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
@@ -394,7 +320,6 @@ void getnstimeofday(struct timespec *tv)
394} 320}
395EXPORT_SYMBOL_GPL(getnstimeofday); 321EXPORT_SYMBOL_GPL(getnstimeofday);
396#endif 322#endif
397#endif
398 323
399/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 324/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
400 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 325 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
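
With CONFIG_TIME_INTERPOLATION gone, sys_time() and friends rely on the remaining getnstimeofday() implementations. On !CONFIG_GENERIC_TIME builds the fallback kept by this hunk boils down to something like the sketch below, with microsecond resolution only, as the comment above notes:

#include <linux/time.h>

void getnstimeofday(struct timespec *tv)
{
        struct timeval x;

        do_gettimeofday(&x);
        tv->tv_sec = x.tv_sec;
        tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
}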
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 76212b2a99de..2ad1c37b8dfe 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -205,47 +205,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
205} 205}
206 206
207/** 207/**
208 * clockevents_request_device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
236 * clockevents_release_device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events 208 * clockevents_notify - notification about relevant events
250 */ 209 */
251void clockevents_notify(unsigned long reason, void *arg) 210void clockevents_notify(unsigned long reason, void *arg)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index cf53bb5814cb..cd91237dbfe3 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,10 +10,11 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/timer.h>
13#include <linux/timex.h> 14#include <linux/timex.h>
14#include <linux/jiffies.h> 15#include <linux/jiffies.h>
15#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
16 17#include <linux/capability.h>
17#include <asm/div64.h> 18#include <asm/div64.h>
18#include <asm/timex.h> 19#include <asm/timex.h>
19 20
@@ -116,11 +117,6 @@ void second_overflow(void)
116 if (xtime.tv_sec % 86400 == 0) { 117 if (xtime.tv_sec % 86400 == 0) {
117 xtime.tv_sec--; 118 xtime.tv_sec--;
118 wall_to_monotonic.tv_sec++; 119 wall_to_monotonic.tv_sec++;
119 /*
120 * The timer interpolator will make time change
121 * gradually instead of an immediate jump by one second
122 */
123 time_interpolator_update(-NSEC_PER_SEC);
124 time_state = TIME_OOP; 120 time_state = TIME_OOP;
125 printk(KERN_NOTICE "Clock: inserting leap second " 121 printk(KERN_NOTICE "Clock: inserting leap second "
126 "23:59:60 UTC\n"); 122 "23:59:60 UTC\n");
@@ -130,11 +126,6 @@ void second_overflow(void)
130 if ((xtime.tv_sec + 1) % 86400 == 0) { 126 if ((xtime.tv_sec + 1) % 86400 == 0) {
131 xtime.tv_sec++; 127 xtime.tv_sec++;
132 wall_to_monotonic.tv_sec--; 128 wall_to_monotonic.tv_sec--;
133 /*
134 * Use of time interpolator for a gradual change of
135 * time
136 */
137 time_interpolator_update(NSEC_PER_SEC);
138 time_state = TIME_WAIT; 129 time_state = TIME_WAIT;
139 printk(KERN_NOTICE "Clock: deleting leap second " 130 printk(KERN_NOTICE "Clock: deleting leap second "
140 "23:59:59 UTC\n"); 131 "23:59:59 UTC\n");
@@ -185,12 +176,64 @@ u64 current_tick_length(void)
185 return tick_length; 176 return tick_length;
186} 177}
187 178
179#ifdef CONFIG_GENERIC_CMOS_UPDATE
180
181/* Disable the cmos update - used by virtualization and embedded */
182int no_sync_cmos_clock __read_mostly;
183
184static void sync_cmos_clock(unsigned long dummy);
185
186static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
188 187
189void __attribute__ ((weak)) notify_arch_cmos_timer(void) 188static void sync_cmos_clock(unsigned long dummy)
190{ 189{
191 return; 190 struct timespec now, next;
191 int fail = 1;
192
193 /*
194 * If we have an externally synchronized Linux clock, then update
195 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
196 * called as close as possible to 500 ms before the new second starts.
197 * This code is run on a timer. If the clock is set, that timer
198 * may not expire at the correct time. Thus, we adjust...
199 */
200 if (!ntp_synced())
201 /*
202 * Not synced, exit, do not restart a timer (if one is
203 * running, let it run out).
204 */
205 return;
206
207 getnstimeofday(&now);
208 if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
209 fail = update_persistent_clock(now);
210
211 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
212 if (next.tv_nsec <= 0)
213 next.tv_nsec += NSEC_PER_SEC;
214
215 if (!fail)
216 next.tv_sec = 659;
217 else
218 next.tv_sec = 0;
219
220 if (next.tv_nsec >= NSEC_PER_SEC) {
221 next.tv_sec++;
222 next.tv_nsec -= NSEC_PER_SEC;
223 }
224 mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
192} 225}
193 226
227static void notify_cmos_timer(void)
228{
 229	if (!no_sync_cmos_clock)
230 mod_timer(&sync_cmos_timer, jiffies + 1);
231}
232
233#else
234static inline void notify_cmos_timer(void) { }
235#endif
236
194/* adjtimex mainly allows reading (and writing, if superuser) of 237/* adjtimex mainly allows reading (and writing, if superuser) of
195 * kernel time-keeping variables. used by xntpd. 238 * kernel time-keeping variables. used by xntpd.
196 */ 239 */
@@ -355,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
355 txc->stbcnt = 0; 398 txc->stbcnt = 0;
356 write_sequnlock_irq(&xtime_lock); 399 write_sequnlock_irq(&xtime_lock);
357 do_gettimeofday(&txc->time); 400 do_gettimeofday(&txc->time);
358 notify_arch_cmos_timer(); 401 notify_cmos_timer();
359 return(result); 402 return(result);
360} 403}
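
sync_cmos_clock() expects the architecture to provide update_persistent_clock() when CONFIG_GENERIC_CMOS_UPDATE is selected, returning 0 once the RTC has been written. A hedged sketch of such an implementation; my_rtc_set_mmss() is a placeholder for the platform's real RTC write routine, not an existing kernel function:

#include <linux/time.h>

int update_persistent_clock(struct timespec now)
{
        /* 0 on success; any non-zero value makes the caller retry sooner. */
        return my_rtc_set_mmss(now.tv_sec);
}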
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 8001d37071f5..db8e0f3d409b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock); 32static DEFINE_SPINLOCK(tick_broadcast_lock);
33 33
34#ifdef CONFIG_TICK_ONESHOT
35static void tick_broadcast_clear_oneshot(int cpu);
36#else
37static inline void tick_broadcast_clear_oneshot(int cpu) { }
38#endif
39
34/* 40/*
35 * Debugging: see timer_list.c 41 * Debugging: see timer_list.c
36 */ 42 */
@@ -49,7 +55,7 @@ cpumask_t *tick_get_broadcast_mask(void)
49 */ 55 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc) 56static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{ 57{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) 58 if (bc)
53 tick_setup_periodic(bc, 1); 59 tick_setup_periodic(bc, 1);
54} 60}
55 61
@@ -99,8 +105,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
99 cpu_set(cpu, tick_broadcast_mask); 105 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 106 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1; 107 ret = 1;
102 } 108 } else {
109 /*
110 * When the new device is not affected by the stop
111 * feature and the cpu is marked in the broadcast mask
112 * then clear the broadcast bit.
113 */
114 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
115 int cpu = smp_processor_id();
103 116
117 cpu_clear(cpu, tick_broadcast_mask);
118 tick_broadcast_clear_oneshot(cpu);
119 }
120 }
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 121 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret; 122 return ret;
106} 123}
@@ -299,7 +316,7 @@ void tick_suspend_broadcast(void)
299 spin_lock_irqsave(&tick_broadcast_lock, flags); 316 spin_lock_irqsave(&tick_broadcast_lock, flags);
300 317
301 bc = tick_broadcast_device.evtdev; 318 bc = tick_broadcast_device.evtdev;
302 if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 319 if (bc)
303 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 320 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
304 321
305 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 322 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -316,6 +333,8 @@ int tick_resume_broadcast(void)
316 bc = tick_broadcast_device.evtdev; 333 bc = tick_broadcast_device.evtdev;
317 334
318 if (bc) { 335 if (bc) {
336 clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
337
319 switch (tick_broadcast_device.mode) { 338 switch (tick_broadcast_device.mode) {
320 case TICKDEV_MODE_PERIODIC: 339 case TICKDEV_MODE_PERIODIC:
321 if(!cpus_empty(tick_broadcast_mask)) 340 if(!cpus_empty(tick_broadcast_mask))
@@ -485,6 +504,16 @@ out:
485 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 504 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
486} 505}
487 506
507/*
508 * Reset the one shot broadcast for a cpu
509 *
510 * Called with tick_broadcast_lock held
511 */
512static void tick_broadcast_clear_oneshot(int cpu)
513{
514 cpu_clear(cpu, tick_broadcast_oneshot_mask);
515}
516
488/** 517/**
489 * tick_broadcast_setup_highres - setup the broadcast device for highres 518 * tick_broadcast_setup_highres - setup the broadcast device for highres
490 */ 519 */
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a96ec9ab3454..77a21abc8716 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -318,12 +318,17 @@ static void tick_resume(void)
318{ 318{
319 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 319 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
320 unsigned long flags; 320 unsigned long flags;
321 int broadcast = tick_resume_broadcast();
321 322
322 spin_lock_irqsave(&tick_device_lock, flags); 323 spin_lock_irqsave(&tick_device_lock, flags);
323 if (td->mode == TICKDEV_MODE_PERIODIC) 324 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
324 tick_setup_periodic(td->evtdev, 0); 325
325 else 326 if (!broadcast) {
326 tick_resume_oneshot(); 327 if (td->mode == TICKDEV_MODE_PERIODIC)
328 tick_setup_periodic(td->evtdev, 0);
329 else
330 tick_resume_oneshot();
331 }
327 spin_unlock_irqrestore(&tick_device_lock, flags); 332 spin_unlock_irqrestore(&tick_device_lock, flags);
328} 333}
329 334
@@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
360 break; 365 break;
361 366
362 case CLOCK_EVT_NOTIFY_RESUME: 367 case CLOCK_EVT_NOTIFY_RESUME:
363 if (!tick_resume_broadcast()) 368 tick_resume();
364 tick_resume();
365 break; 369 break;
366 370
367 default: 371 default:
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f6997ab0c3c9..0258d3115d54 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
73 struct clock_event_device *dev = td->evtdev; 73 struct clock_event_device *dev = td->evtdev;
74 74
75 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || 75 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
76 !tick_device_is_functional(dev)) 76 !tick_device_is_functional(dev)) {
77
78 printk(KERN_INFO "Clockevents: "
79 "could not switch to one-shot mode:");
80 if (!dev) {
81 printk(" no tick device\n");
82 } else {
83 if (!tick_device_is_functional(dev))
84 printk(" %s is not functional.\n", dev->name);
85 else
86 printk(" %s does not support one-shot mode.\n",
87 dev->name);
88 }
77 return -EINVAL; 89 return -EINVAL;
90 }
78 91
79 td->mode = TICKDEV_MODE_ONESHOT; 92 td->mode = TICKDEV_MODE_ONESHOT;
80 dev->event_handler = handler; 93 dev->event_handler = handler;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 52db9e3c526e..b416995b9757 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -546,6 +546,7 @@ void tick_setup_sched_timer(void)
546{ 546{
547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
548 ktime_t now = ktime_get(); 548 ktime_t now = ktime_get();
549 u64 offset;
549 550
550 /* 551 /*
551 * Emulate tick processing via per-CPU hrtimers: 552 * Emulate tick processing via per-CPU hrtimers:
@@ -554,8 +555,12 @@ void tick_setup_sched_timer(void)
554 ts->sched_timer.function = tick_sched_timer; 555 ts->sched_timer.function = tick_sched_timer;
555 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 556 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
556 557
557 /* Get the next period */ 558 /* Get the next period (per cpu) */
558 ts->sched_timer.expires = tick_init_jiffy_update(); 559 ts->sched_timer.expires = tick_init_jiffy_update();
560 offset = ktime_to_ns(tick_period) >> 1;
561 do_div(offset, NR_CPUS);
562 offset *= smp_processor_id();
563 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
559 564
560 for (;;) { 565 for (;;) {
561 hrtimer_forward(&ts->sched_timer, now, tick_period); 566 hrtimer_forward(&ts->sched_timer, now, tick_period);
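
The added offset staggers each CPU's sched-tick hrtimer across the first half of a tick period so the per-CPU timers no longer all expire at the same instant. A small stand-alone sketch of the arithmetic, assuming HZ=1000 (tick_period = 1,000,000 ns) and NR_CPUS=4:

#include <stdio.h>

int main(void)
{
        const unsigned long long tick_period_ns = 1000000ULL;  /* HZ=1000 assumed */
        const unsigned int nr_cpus = 4;                         /* NR_CPUS assumed */
        unsigned int cpu;

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                /* offset = (tick_period / 2) / NR_CPUS * cpu, as in the hunk above */
                unsigned long long offset = tick_period_ns / 2 / nr_cpus * cpu;

                printf("cpu %u: first expiry shifted by %llu ns\n", cpu, offset);
        }
        return 0;
}

For these assumptions the shifts come out to 0, 125000, 250000 and 375000 ns.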
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3d1042f82a68..88c81026e003 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -36,9 +36,17 @@ EXPORT_SYMBOL(xtime_lock);
36 * at zero at system boot time, so wall_to_monotonic will be negative, 36 * at zero at system boot time, so wall_to_monotonic will be negative,
37 * however, we will ALWAYS keep the tv_nsec part positive so we can use 37 * however, we will ALWAYS keep the tv_nsec part positive so we can use
38 * the usual normalization. 38 * the usual normalization.
39 *
40 * wall_to_monotonic is moved after resume from suspend for the monotonic
41 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
42 * to get the real boot based time offset.
43 *
44 * - wall_to_monotonic is no longer the boot time, getboottime must be
45 * used instead.
39 */ 46 */
40struct timespec xtime __attribute__ ((aligned (16))); 47struct timespec xtime __attribute__ ((aligned (16)));
41struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 48struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
49static unsigned long total_sleep_time; /* seconds */
42 50
43EXPORT_SYMBOL(xtime); 51EXPORT_SYMBOL(xtime);
44 52
@@ -251,6 +259,7 @@ void __init timekeeping_init(void)
251 xtime.tv_nsec = 0; 259 xtime.tv_nsec = 0;
252 set_normalized_timespec(&wall_to_monotonic, 260 set_normalized_timespec(&wall_to_monotonic,
253 -xtime.tv_sec, -xtime.tv_nsec); 261 -xtime.tv_sec, -xtime.tv_nsec);
262 total_sleep_time = 0;
254 263
255 write_sequnlock_irqrestore(&xtime_lock, flags); 264 write_sequnlock_irqrestore(&xtime_lock, flags);
256} 265}
@@ -282,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev)
282 291
283 xtime.tv_sec += sleep_length; 292 xtime.tv_sec += sleep_length;
284 wall_to_monotonic.tv_sec -= sleep_length; 293 wall_to_monotonic.tv_sec -= sleep_length;
294 total_sleep_time += sleep_length;
285 } 295 }
286 /* re-base the last cycle value */ 296 /* re-base the last cycle value */
287 clock->cycle_last = clocksource_read(clock); 297 clock->cycle_last = clocksource_read(clock);
@@ -391,7 +401,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
391 * this is optimized for the most common adjustments of -1,0,1, 401 * this is optimized for the most common adjustments of -1,0,1,
392 * for other values we can do a bit more work. 402 * for other values we can do a bit more work.
393 */ 403 */
394static void clocksource_adjust(struct clocksource *clock, s64 offset) 404static void clocksource_adjust(s64 offset)
395{ 405{
396 s64 error, interval = clock->cycle_interval; 406 s64 error, interval = clock->cycle_interval;
397 int adj; 407 int adj;
@@ -456,17 +466,13 @@ void update_wall_time(void)
456 second_overflow(); 466 second_overflow();
457 } 467 }
458 468
459 /* interpolator bits */
460 time_interpolator_update(clock->xtime_interval
461 >> clock->shift);
462
463 /* accumulate error between NTP and clock interval */ 469 /* accumulate error between NTP and clock interval */
464 clock->error += current_tick_length(); 470 clock->error += current_tick_length();
465 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); 471 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
466 } 472 }
467 473
468 /* correct the clock when NTP error is too big */ 474 /* correct the clock when NTP error is too big */
469 clocksource_adjust(clock, offset); 475 clocksource_adjust(offset);
470 476
471 /* store full nanoseconds into xtime */ 477 /* store full nanoseconds into xtime */
472 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; 478 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
@@ -476,3 +482,30 @@ void update_wall_time(void)
476 change_clocksource(); 482 change_clocksource();
477 update_vsyscall(&xtime, clock); 483 update_vsyscall(&xtime, clock);
478} 484}
485
486/**
487 * getboottime - Return the real time of system boot.
488 * @ts: pointer to the timespec to be set
489 *
490 * Returns the time of day in a timespec.
491 *
492 * This is based on the wall_to_monotonic offset and the total suspend
493 * time. Calls to settimeofday will affect the value returned (which
494 * basically means that however wrong your real time clock is at boot time,
495 * you get the right time here).
496 */
497void getboottime(struct timespec *ts)
498{
499 set_normalized_timespec(ts,
500 - (wall_to_monotonic.tv_sec + total_sleep_time),
501 - wall_to_monotonic.tv_nsec);
502}
503
504/**
505 * monotonic_to_bootbased - Convert the monotonic time to boot based.
506 * @ts: pointer to the timespec to be converted
507 */
508void monotonic_to_bootbased(struct timespec *ts)
509{
510 ts->tv_sec += total_sleep_time;
511}
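
getboottime() and monotonic_to_bootbased() are the new way to answer "when did the system boot" and "how long since boot including suspend", now that wall_to_monotonic alone no longer encodes the boot time. A hedged sketch of a caller (illustrative, not from this patch):

#include <linux/kernel.h>
#include <linux/time.h>

static void report_boot_time(void)
{
        struct timespec boot;

        /* boot = -(wall_to_monotonic + total_sleep_time) */
        getboottime(&boot);
        printk(KERN_INFO "booted at %ld.%09ld seconds after the epoch\n",
               boot.tv_sec, boot.tv_nsec);
}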
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 8bbcfb77f7d2..e5edc3a22a08 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,7 +38,7 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
38 38
39static void print_name_offset(struct seq_file *m, void *sym) 39static void print_name_offset(struct seq_file *m, void *sym)
40{ 40{
41 char symname[KSYM_NAME_LEN+1]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%p>", sym);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 321693724ad7..8ed62fda16c6 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,6 +68,7 @@ struct entry {
68 * Number of timeout events: 68 * Number of timeout events:
69 */ 69 */
70 unsigned long count; 70 unsigned long count;
71 unsigned int timer_flag;
71 72
72 /* 73 /*
73 * We save the command-line string to preserve 74 * We save the command-line string to preserve
@@ -231,7 +232,8 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
231 * incremented. Otherwise the timer is registered in a free slot. 232 * incremented. Otherwise the timer is registered in a free slot.
232 */ 233 */
233void timer_stats_update_stats(void *timer, pid_t pid, void *startf, 234void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
234 void *timerf, char * comm) 235 void *timerf, char *comm,
236 unsigned int timer_flag)
235{ 237{
236 /* 238 /*
237 * It doesnt matter which lock we take: 239 * It doesnt matter which lock we take:
@@ -249,6 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
249 input.start_func = startf; 251 input.start_func = startf;
250 input.expire_func = timerf; 252 input.expire_func = timerf;
251 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag;
252 255
253 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
254 if (!active) 257 if (!active)
@@ -266,7 +269,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
266 269
267static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
268{ 271{
269 char symname[KSYM_NAME_LEN+1]; 272 char symname[KSYM_NAME_LEN];
270 273
271 if (lookup_symbol_name(addr, symname) < 0) 274 if (lookup_symbol_name(addr, symname) < 0)
272 seq_printf(m, "<%p>", (void *)addr); 275 seq_printf(m, "<%p>", (void *)addr);
@@ -295,7 +298,7 @@ static int tstats_show(struct seq_file *m, void *v)
295 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
296 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
297 300
298 seq_puts(m, "Timer Stats Version: v0.1\n"); 301 seq_puts(m, "Timer Stats Version: v0.2\n");
299 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
300 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
301 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n",
@@ -303,8 +306,13 @@ static int tstats_show(struct seq_file *m, void *v)
303 306
304 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
305 entry = entries + i; 308 entry = entries + i;
306 seq_printf(m, "%4lu, %5d %-16s ", 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ",
307 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else {
313 seq_printf(m, " %4lu, %5d %-16s ",
314 entry->count, entry->pid, entry->comm);
315 }
308 316
309 print_name_offset(m, (unsigned long)entry->start_func); 317 print_name_offset(m, (unsigned long)entry->start_func);
310 seq_puts(m, " ("); 318 seq_puts(m, " (");
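
The new timer_flag field above lets the /proc/timer_stats output (now v0.2) mark deferrable timers by appending a "D" to the event count. A toy userspace model of the two output branches, using invented sample entries:

/* Toy model of the v0.2 /proc/timer_stats line format: deferrable timers get
 * a "D" suffix after the count. Entries and the flag bit are sample data. */
#include <stdio.h>

#define TIMER_STATS_FLAG_DEFERRABLE 0x1

struct entry {
	unsigned long count;
	int pid;
	const char *comm;
	unsigned int timer_flag;
};

int main(void)
{
	static const struct entry entries[] = {
		{ 26, 1, "init",    0 },
		{ 15, 0, "swapper", TIMER_STATS_FLAG_DEFERRABLE },
	};
	unsigned int i;

	for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
		const struct entry *e = &entries[i];

		if (e->timer_flag & TIMER_STATS_FLAG_DEFERRABLE)
			printf("%4luD, %5d %-16s\n", e->count, e->pid, e->comm);
		else
			printf(" %4lu, %5d %-16s\n", e->count, e->pid, e->comm);
	}
	return 0;
}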
diff --git a/kernel/timer.c b/kernel/timer.c
index 1a69705c2fb9..6ce1952eea7d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
103static inline void timer_set_deferrable(struct timer_list *timer) 103static inline void timer_set_deferrable(struct timer_list *timer)
104{ 104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
106 TBASE_DEFERRABLE_FLAG)); 106 TBASE_DEFERRABLE_FLAG));
107} 107}
108 108
109static inline void 109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{ 111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 112 timer->base = (tvec_base_t *)((unsigned long)(new_base) |
113 tbase_get_deferrable(timer->base)); 113 tbase_get_deferrable(timer->base));
114} 114}
115 115
116/** 116/**
@@ -305,6 +305,20 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
305 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 305 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
306 timer->start_pid = current->pid; 306 timer->start_pid = current->pid;
307} 307}
308
309static void timer_stats_account_timer(struct timer_list *timer)
310{
311 unsigned int flag = 0;
312
313 if (unlikely(tbase_get_deferrable(timer->base)))
314 flag |= TIMER_STATS_FLAG_DEFERRABLE;
315
316 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
317 timer->function, timer->start_comm, flag);
318}
319
320#else
321static void timer_stats_account_timer(struct timer_list *timer) {}
308#endif 322#endif
309 323
310/** 324/**
@@ -431,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer);
431void add_timer_on(struct timer_list *timer, int cpu) 445void add_timer_on(struct timer_list *timer, int cpu)
432{ 446{
433 tvec_base_t *base = per_cpu(tvec_bases, cpu); 447 tvec_base_t *base = per_cpu(tvec_bases, cpu);
434 unsigned long flags; 448 unsigned long flags;
435 449
436 timer_stats_timer_set_start_info(timer); 450 timer_stats_timer_set_start_info(timer);
437 BUG_ON(timer_pending(timer) || !timer->function); 451 BUG_ON(timer_pending(timer) || !timer->function);
438 spin_lock_irqsave(&base->lock, flags); 452 spin_lock_irqsave(&base->lock, flags);
439 timer_set_base(timer, base); 453 timer_set_base(timer, base);
440 internal_add_timer(base, timer); 454 internal_add_timer(base, timer);
@@ -613,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base)
613 while (time_after_eq(jiffies, base->timer_jiffies)) { 627 while (time_after_eq(jiffies, base->timer_jiffies)) {
614 struct list_head work_list; 628 struct list_head work_list;
615 struct list_head *head = &work_list; 629 struct list_head *head = &work_list;
616 int index = base->timer_jiffies & TVR_MASK; 630 int index = base->timer_jiffies & TVR_MASK;
617 631
618 /* 632 /*
619 * Cascade timers: 633 * Cascade timers:
@@ -630,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base)
630 unsigned long data; 644 unsigned long data;
631 645
632 timer = list_first_entry(head, struct timer_list,entry); 646 timer = list_first_entry(head, struct timer_list,entry);
633 fn = timer->function; 647 fn = timer->function;
634 data = timer->data; 648 data = timer->data;
635 649
636 timer_stats_account_timer(timer); 650 timer_stats_account_timer(timer);
637 651
@@ -675,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
675 index = slot = timer_jiffies & TVR_MASK; 689 index = slot = timer_jiffies & TVR_MASK;
676 do { 690 do {
677 list_for_each_entry(nte, base->tv1.vec + slot, entry) { 691 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
678 if (tbase_get_deferrable(nte->base)) 692 if (tbase_get_deferrable(nte->base))
679 continue; 693 continue;
680 694
681 found = 1; 695 found = 1;
682 expires = nte->expires; 696 expires = nte->expires;
@@ -820,7 +834,7 @@ void update_process_times(int user_tick)
820 if (rcu_pending(cpu)) 834 if (rcu_pending(cpu))
821 rcu_check_callbacks(cpu, user_tick); 835 rcu_check_callbacks(cpu, user_tick);
822 scheduler_tick(); 836 scheduler_tick();
823 run_posix_cpu_timers(p); 837 run_posix_cpu_timers(p);
824} 838}
825 839
826/* 840/*
@@ -895,7 +909,7 @@ static inline void update_times(unsigned long ticks)
895 update_wall_time(); 909 update_wall_time();
896 calc_load(ticks); 910 calc_load(ticks);
897} 911}
898 912
899/* 913/*
900 * The 64-bit jiffies value is not atomic - you MUST NOT read it 914 * The 64-bit jiffies value is not atomic - you MUST NOT read it
901 * without sampling the sequence number in xtime_lock. 915 * without sampling the sequence number in xtime_lock.
@@ -1091,7 +1105,7 @@ asmlinkage long sys_gettid(void)
1091/** 1105/**
1092 * do_sysinfo - fill in sysinfo struct 1106 * do_sysinfo - fill in sysinfo struct
1093 * @info: pointer to buffer to fill 1107 * @info: pointer to buffer to fill
1094 */ 1108 */
1095int do_sysinfo(struct sysinfo *info) 1109int do_sysinfo(struct sysinfo *info)
1096{ 1110{
1097 unsigned long mem_total, sav_total; 1111 unsigned long mem_total, sav_total;
@@ -1114,6 +1128,7 @@ int do_sysinfo(struct sysinfo *info)
1114 getnstimeofday(&tp); 1128 getnstimeofday(&tp);
1115 tp.tv_sec += wall_to_monotonic.tv_sec; 1129 tp.tv_sec += wall_to_monotonic.tv_sec;
1116 tp.tv_nsec += wall_to_monotonic.tv_nsec; 1130 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1131 monotonic_to_bootbased(&tp);
1117 if (tp.tv_nsec - NSEC_PER_SEC >= 0) { 1132 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1118 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; 1133 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1119 tp.tv_sec++; 1134 tp.tv_sec++;
@@ -1206,7 +1221,8 @@ static int __devinit init_timers_cpu(int cpu)
1206 /* 1221 /*
1207 * The APs use this path later in boot 1222 * The APs use this path later in boot
1208 */ 1223 */
1209 base = kmalloc_node(sizeof(*base), GFP_KERNEL, 1224 base = kmalloc_node(sizeof(*base),
1225 GFP_KERNEL | __GFP_ZERO,
1210 cpu_to_node(cpu)); 1226 cpu_to_node(cpu));
1211 if (!base) 1227 if (!base)
1212 return -ENOMEM; 1228 return -ENOMEM;
@@ -1217,7 +1233,6 @@ static int __devinit init_timers_cpu(int cpu)
1217 kfree(base); 1233 kfree(base);
1218 return -ENOMEM; 1234 return -ENOMEM;
1219 } 1235 }
1220 memset(base, 0, sizeof(*base));
1221 per_cpu(tvec_bases, cpu) = base; 1236 per_cpu(tvec_bases, cpu) = base;
1222 } else { 1237 } else {
1223 /* 1238 /*
@@ -1334,194 +1349,6 @@ void __init init_timers(void)
1334 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1349 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1335} 1350}
1336 1351
1337#ifdef CONFIG_TIME_INTERPOLATION
1338
1339struct time_interpolator *time_interpolator __read_mostly;
1340static struct time_interpolator *time_interpolator_list __read_mostly;
1341static DEFINE_SPINLOCK(time_interpolator_lock);
1342
1343static inline cycles_t time_interpolator_get_cycles(unsigned int src)
1344{
1345 unsigned long (*x)(void);
1346
1347 switch (src)
1348 {
1349 case TIME_SOURCE_FUNCTION:
1350 x = time_interpolator->addr;
1351 return x();
1352
1353 case TIME_SOURCE_MMIO64 :
1354 return readq_relaxed((void __iomem *)time_interpolator->addr);
1355
1356 case TIME_SOURCE_MMIO32 :
1357 return readl_relaxed((void __iomem *)time_interpolator->addr);
1358
1359 default: return get_cycles();
1360 }
1361}
1362
1363static inline u64 time_interpolator_get_counter(int writelock)
1364{
1365 unsigned int src = time_interpolator->source;
1366
1367 if (time_interpolator->jitter)
1368 {
1369 cycles_t lcycle;
1370 cycles_t now;
1371
1372 do {
1373 lcycle = time_interpolator->last_cycle;
1374 now = time_interpolator_get_cycles(src);
1375 if (lcycle && time_after(lcycle, now))
1376 return lcycle;
1377
1378 /* When holding the xtime write lock, there's no need
1379 * to add the overhead of the cmpxchg. Readers are
1380 * force to retry until the write lock is released.
1381 */
1382 if (writelock) {
1383 time_interpolator->last_cycle = now;
1384 return now;
1385 }
1386 /* Keep track of the last timer value returned. The use of cmpxchg here
1387 * will cause contention in an SMP environment.
1388 */
1389 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1390 return now;
1391 }
1392 else
1393 return time_interpolator_get_cycles(src);
1394}
1395
1396void time_interpolator_reset(void)
1397{
1398 time_interpolator->offset = 0;
1399 time_interpolator->last_counter = time_interpolator_get_counter(1);
1400}
1401
1402#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1403
1404unsigned long time_interpolator_get_offset(void)
1405{
1406 /* If we do not have a time interpolator set up then just return zero */
1407 if (!time_interpolator)
1408 return 0;
1409
1410 return time_interpolator->offset +
1411 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1412}
1413
1414#define INTERPOLATOR_ADJUST 65536
1415#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
1416
1417void time_interpolator_update(long delta_nsec)
1418{
1419 u64 counter;
1420 unsigned long offset;
1421
1422 /* If there is no time interpolator set up then do nothing */
1423 if (!time_interpolator)
1424 return;
1425
1426 /*
1427 * The interpolator compensates for late ticks by accumulating the late
1428 * time in time_interpolator->offset. A tick earlier than expected will
1429 * lead to a reset of the offset and a corresponding jump of the clock
1430 * forward. Again this only works if the interpolator clock is running
1431 * slightly slower than the regular clock and the tuning logic insures
1432 * that.
1433 */
1434
1435 counter = time_interpolator_get_counter(1);
1436 offset = time_interpolator->offset +
1437 GET_TI_NSECS(counter, time_interpolator);
1438
1439 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1440 time_interpolator->offset = offset - delta_nsec;
1441 else {
1442 time_interpolator->skips++;
1443 time_interpolator->ns_skipped += delta_nsec - offset;
1444 time_interpolator->offset = 0;
1445 }
1446 time_interpolator->last_counter = counter;
1447
1448 /* Tuning logic for time interpolator invoked every minute or so.
1449 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1450 * Increase interpolator clock speed if we skip too much time.
1451 */
1452 if (jiffies % INTERPOLATOR_ADJUST == 0)
1453 {
1454 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1455 time_interpolator->nsec_per_cyc--;
1456 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1457 time_interpolator->nsec_per_cyc++;
1458 time_interpolator->skips = 0;
1459 time_interpolator->ns_skipped = 0;
1460 }
1461}
1462
1463static inline int
1464is_better_time_interpolator(struct time_interpolator *new)
1465{
1466 if (!time_interpolator)
1467 return 1;
1468 return new->frequency > 2*time_interpolator->frequency ||
1469 (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1470}
1471
1472void
1473register_time_interpolator(struct time_interpolator *ti)
1474{
1475 unsigned long flags;
1476
1477 /* Sanity check */
1478 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1479
1480 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1481 spin_lock(&time_interpolator_lock);
1482 write_seqlock_irqsave(&xtime_lock, flags);
1483 if (is_better_time_interpolator(ti)) {
1484 time_interpolator = ti;
1485 time_interpolator_reset();
1486 }
1487 write_sequnlock_irqrestore(&xtime_lock, flags);
1488
1489 ti->next = time_interpolator_list;
1490 time_interpolator_list = ti;
1491 spin_unlock(&time_interpolator_lock);
1492}
1493
1494void
1495unregister_time_interpolator(struct time_interpolator *ti)
1496{
1497 struct time_interpolator *curr, **prev;
1498 unsigned long flags;
1499
1500 spin_lock(&time_interpolator_lock);
1501 prev = &time_interpolator_list;
1502 for (curr = *prev; curr; curr = curr->next) {
1503 if (curr == ti) {
1504 *prev = curr->next;
1505 break;
1506 }
1507 prev = &curr->next;
1508 }
1509
1510 write_seqlock_irqsave(&xtime_lock, flags);
1511 if (ti == time_interpolator) {
1512 /* we lost the best time-interpolator: */
1513 time_interpolator = NULL;
1514 /* find the next-best interpolator */
1515 for (curr = time_interpolator_list; curr; curr = curr->next)
1516 if (is_better_time_interpolator(curr))
1517 time_interpolator = curr;
1518 time_interpolator_reset();
1519 }
1520 write_sequnlock_irqrestore(&xtime_lock, flags);
1521 spin_unlock(&time_interpolator_lock);
1522}
1523#endif /* CONFIG_TIME_INTERPOLATION */
1524
1525/** 1352/**
1526 * msleep - sleep safely even with waitqueue interruptions 1353 * msleep - sleep safely even with waitqueue interruptions
1527 * @msecs: Time in milliseconds to sleep for 1354 * @msecs: Time in milliseconds to sleep for
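
Several of the timer.c hunks above rely on the deferrable flag living in bit 0 of timer->base (TBASE_DEFERRABLE_FLAG), which is free because the base structure is aligned. A hedged userspace model of that pointer-tagging idiom, with simplified stand-in types:

/* Userspace model of the TBASE_DEFERRABLE_FLAG trick used above: the flag
 * lives in bit 0 of the base pointer, which is free because the base
 * structure is at least 2-byte (in practice cache-line) aligned. Types and
 * names here are simplified stand-ins for the kernel ones. */
#include <stdio.h>

#define TBASE_DEFERRABLE_FLAG 0x1UL

struct tvec_base {
	int dummy;
} __attribute__((aligned(8)));	/* alignment guarantees bit 0 is zero */

struct timer_list {
	struct tvec_base *base;		/* low bit doubles as the deferrable flag */
};

static unsigned long tbase_get_deferrable(struct tvec_base *base)
{
	return (unsigned long)base & TBASE_DEFERRABLE_FLAG;
}

static struct tvec_base *tbase_get_base(struct tvec_base *base)
{
	return (struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
}

static void timer_set_deferrable(struct timer_list *timer)
{
	timer->base = (struct tvec_base *)((unsigned long)timer->base |
					   TBASE_DEFERRABLE_FLAG);
}

static void timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
{
	/* Preserve the flag bit while switching the timer to a new base. */
	timer->base = (struct tvec_base *)((unsigned long)new_base |
					   tbase_get_deferrable(timer->base));
}

int main(void)
{
	static struct tvec_base base_a, base_b;
	struct timer_list t = { .base = &base_a };

	timer_set_deferrable(&t);
	timer_set_base(&t, &base_b);

	printf("deferrable=%lu base_ok=%d\n",
	       tbase_get_deferrable(t.base), tbase_get_base(t.base) == &base_b);
	return 0;
}

This is why timer_set_base() masks the flag back in: a plain pointer assignment would silently drop the deferrable bit when a timer migrates to another CPU's base.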
diff --git a/kernel/user.c b/kernel/user.c
index 4869563080e9..e7d11cef6998 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,20 +14,19 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h>
18#include <linux/user_namespace.h>
17 19
18/* 20/*
19 * UID task count cache, to get fast user lookup in "alloc_uid" 21 * UID task count cache, to get fast user lookup in "alloc_uid"
20 * when changing user ID's (ie setuid() and friends). 22 * when changing user ID's (ie setuid() and friends).
21 */ 23 */
22 24
23#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
24#define UIDHASH_SZ (1 << UIDHASH_BITS)
25#define UIDHASH_MASK (UIDHASH_SZ - 1) 25#define UIDHASH_MASK (UIDHASH_SZ - 1)
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
28 28
29static struct kmem_cache *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ];
31 30
32/* 31/*
33 * The uidhash_lock is mostly taken from process context, but it is 32 * The uidhash_lock is mostly taken from process context, but it is
@@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid)
94{ 93{
95 struct user_struct *ret; 94 struct user_struct *ret;
96 unsigned long flags; 95 unsigned long flags;
96 struct user_namespace *ns = current->nsproxy->user_ns;
97 97
98 spin_lock_irqsave(&uidhash_lock, flags); 98 spin_lock_irqsave(&uidhash_lock, flags);
99 ret = uid_hash_find(uid, uidhashentry(uid)); 99 ret = uid_hash_find(uid, uidhashentry(ns, uid));
100 spin_unlock_irqrestore(&uidhash_lock, flags); 100 spin_unlock_irqrestore(&uidhash_lock, flags);
101 return ret; 101 return ret;
102} 102}
@@ -120,9 +120,9 @@ void free_uid(struct user_struct *up)
120 } 120 }
121} 121}
122 122
123struct user_struct * alloc_uid(uid_t uid) 123struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
124{ 124{
125 struct list_head *hashent = uidhashentry(uid); 125 struct list_head *hashent = uidhashentry(ns, uid);
126 struct user_struct *up; 126 struct user_struct *up;
127 127
128 spin_lock_irq(&uidhash_lock); 128 spin_lock_irq(&uidhash_lock);
@@ -208,14 +208,14 @@ static int __init uid_cache_init(void)
208 int n; 208 int n;
209 209
210 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 210 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
212 212
213 for(n = 0; n < UIDHASH_SZ; ++n) 213 for(n = 0; n < UIDHASH_SZ; ++n)
214 INIT_LIST_HEAD(uidhash_table + n); 214 INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
215 215
216 /* Insert the root user immediately (init already runs as root) */ 216 /* Insert the root user immediately (init already runs as root) */
217 spin_lock_irq(&uidhash_lock); 217 spin_lock_irq(&uidhash_lock);
218 uid_hash_insert(&root_user, uidhashentry(0)); 218 uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
219 spin_unlock_irq(&uidhash_lock); 219 spin_unlock_irq(&uidhash_lock);
220 220
221 return 0; 221 return 0;
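
With the uid hash moved into struct user_namespace, uidhashentry() now takes the namespace as well as the uid, so the same uid can map to different user_structs in different namespaces. A cut-down userspace sketch of the hash and per-namespace buckets (UIDHASH_BITS of 3 is the CONFIG_BASE_SMALL value; list handling is simplified):

/* Simplified model of the per-namespace uid hash introduced above: the same
 * __uidhashfn() hash, but the bucket array lives inside each namespace.
 * Structures are cut down to what the example needs. */
#include <stdio.h>

#define UIDHASH_BITS 3
#define UIDHASH_SZ   (1 << UIDHASH_BITS)
#define UIDHASH_MASK (UIDHASH_SZ - 1)
#define __uidhashfn(uid) ((((uid) >> UIDHASH_BITS) + (uid)) & UIDHASH_MASK)
#define uidhashentry(ns, uid) (&(ns)->uidhash_table[__uidhashfn(uid)])

struct user_struct {
	unsigned int uid;
	struct user_struct *next;
};

struct user_namespace {
	struct user_struct *uidhash_table[UIDHASH_SZ];
};

static void uid_hash_insert(struct user_namespace *ns, struct user_struct *up)
{
	struct user_struct **head = uidhashentry(ns, up->uid);

	up->next = *head;
	*head = up;
}

static struct user_struct *uid_hash_find(struct user_namespace *ns, unsigned int uid)
{
	struct user_struct *up;

	for (up = *uidhashentry(ns, uid); up; up = up->next)
		if (up->uid == uid)
			return up;
	return NULL;
}

int main(void)
{
	static struct user_namespace ns_a, ns_b;
	static struct user_struct root = { .uid = 0 }, alice = { .uid = 1000 };

	uid_hash_insert(&ns_a, &root);
	uid_hash_insert(&ns_a, &alice);

	/* uid 1000 exists in ns_a but not in ns_b. */
	printf("ns_a: %s, ns_b: %s\n",
	       uid_hash_find(&ns_a, 1000) ? "found" : "missing",
	       uid_hash_find(&ns_b, 1000) ? "found" : "missing");
	return 0;
}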
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
new file mode 100644
index 000000000000..d055d987850c
--- /dev/null
+++ b/kernel/user_namespace.c
@@ -0,0 +1,87 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7
8#include <linux/module.h>
9#include <linux/version.h>
10#include <linux/nsproxy.h>
11#include <linux/user_namespace.h>
12
13struct user_namespace init_user_ns = {
14 .kref = {
15 .refcount = ATOMIC_INIT(2),
16 },
17 .root_user = &root_user,
18};
19
20EXPORT_SYMBOL_GPL(init_user_ns);
21
22#ifdef CONFIG_USER_NS
23
24/*
25 * Clone a new ns copying an original user ns, setting refcount to 1
26 * @old_ns: namespace to clone
27 * Return NULL on error (failure to kmalloc), new ns otherwise
28 */
29static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
30{
31 struct user_namespace *ns;
32 struct user_struct *new_user;
33 int n;
34
35 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
36 if (!ns)
37 return ERR_PTR(-ENOMEM);
38
39 kref_init(&ns->kref);
40
41 for (n = 0; n < UIDHASH_SZ; ++n)
42 INIT_LIST_HEAD(ns->uidhash_table + n);
43
44 /* Insert new root user. */
45 ns->root_user = alloc_uid(ns, 0);
46 if (!ns->root_user) {
47 kfree(ns);
48 return ERR_PTR(-ENOMEM);
49 }
50
51 /* Reset current->user with a new one */
52 new_user = alloc_uid(ns, current->uid);
53 if (!new_user) {
54 free_uid(ns->root_user);
55 kfree(ns);
56 return ERR_PTR(-ENOMEM);
57 }
58
59 switch_uid(new_user);
60 return ns;
61}
62
63struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
64{
65 struct user_namespace *new_ns;
66
67 BUG_ON(!old_ns);
68 get_user_ns(old_ns);
69
70 if (!(flags & CLONE_NEWUSER))
71 return old_ns;
72
73 new_ns = clone_user_ns(old_ns);
74
75 put_user_ns(old_ns);
76 return new_ns;
77}
78
79void free_user_ns(struct kref *kref)
80{
81 struct user_namespace *ns;
82
83 ns = container_of(kref, struct user_namespace, kref);
84 kfree(ns);
85}
86
87#endif /* CONFIG_USER_NS */
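
copy_user_ns() above reuses the old namespace, with a reference taken, unless CLONE_NEWUSER is set, in which case it clones a fresh one. A toy model of that flow; a plain counter stands in for the kref and the uid bookkeeping is omitted:

/* Toy model of the copy_user_ns() flow added above: reuse the old namespace
 * (with a reference taken) unless CLONE_NEWUSER is set, otherwise clone a
 * fresh one. A plain counter stands in for the kernel's kref. */
#include <stdio.h>
#include <stdlib.h>

#define CLONE_NEWUSER 0x10000000	/* value borrowed from the clone() flags */

struct user_namespace {
	int refcount;
};

static struct user_namespace init_user_ns = { .refcount = 2 };

static void get_user_ns(struct user_namespace *ns) { ns->refcount++; }

static void put_user_ns(struct user_namespace *ns)
{
	if (--ns->refcount == 0 && ns != &init_user_ns)
		free(ns);
}

static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
{
	struct user_namespace *ns = malloc(sizeof(*ns));

	(void)old_ns;		/* a real clone would set up the uid hash here */
	if (!ns)
		return NULL;
	ns->refcount = 1;
	return ns;
}

static struct user_namespace *copy_user_ns(int flags, struct user_namespace *old_ns)
{
	struct user_namespace *new_ns;

	get_user_ns(old_ns);
	if (!(flags & CLONE_NEWUSER))
		return old_ns;

	new_ns = clone_user_ns(old_ns);
	put_user_ns(old_ns);	/* drop the temporary reference taken above */
	return new_ns;
}

int main(void)
{
	struct user_namespace *shared = copy_user_ns(0, &init_user_ns);
	struct user_namespace *fresh  = copy_user_ns(CLONE_NEWUSER, &init_user_ns);

	printf("shared==init: %d, init refcount now %d, fresh refcount %d\n",
	       shared == &init_user_ns, init_user_ns.refcount,
	       fresh ? fresh->refcount : -1);

	put_user_ns(shared);
	if (fresh)
		put_user_ns(fresh);
	return 0;
}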
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 160c8c5136bd..9d8180a0f0d8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/err.h>
16 17
17/* 18/*
18 * Clone a new ns copying an original utsname, setting refcount to 1 19 * Clone a new ns copying an original utsname, setting refcount to 1
@@ -24,10 +25,11 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24 struct uts_namespace *ns; 25 struct uts_namespace *ns;
25 26
26 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
27 if (ns) { 28 if (!ns)
28 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 29 return ERR_PTR(-ENOMEM);
29 kref_init(&ns->kref); 30
30 } 31 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
32 kref_init(&ns->kref);
31 return ns; 33 return ns;
32} 34}
33 35
@@ -37,7 +39,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
37 * utsname of this process won't be seen by parent, and vice 39 * utsname of this process won't be seen by parent, and vice
38 * versa. 40 * versa.
39 */ 41 */
40struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) 42struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
41{ 43{
42 struct uts_namespace *new_ns; 44 struct uts_namespace *new_ns;
43 45
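
clone_uts_ns() now reports allocation failure as ERR_PTR(-ENOMEM) instead of NULL, so callers can tell an error code from a missing namespace. A userspace sketch of the ERR_PTR/IS_ERR/PTR_ERR encoding that convention relies on; the helpers here only mirror the kernel ones in spirit:

/* Sketch of the ERR_PTR convention that clone_uts_ns() switches to above:
 * a small negative errno is encoded at the top of the pointer range, where
 * no valid allocation can live. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct uts_namespace {
	char nodename[65];
};

static struct uts_namespace *clone_uts_ns(const struct uts_namespace *old_ns)
{
	struct uts_namespace *ns = malloc(sizeof(*ns));

	if (!ns)
		return ERR_PTR(-ENOMEM);	/* an error code, not just "no namespace" */
	memcpy(ns->nodename, old_ns->nodename, sizeof(ns->nodename));
	return ns;
}

int main(void)
{
	struct uts_namespace old = { .nodename = "host0" };
	struct uts_namespace *ns = clone_uts_ns(&old);

	if (IS_ERR(ns)) {
		fprintf(stderr, "clone failed: %ld\n", PTR_ERR(ns));
		return 1;
	}
	printf("cloned nodename: %s\n", ns->nodename);
	free(ns);
	return 0;
}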
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index f22b9dbd2a9c..c76c06466bfd 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -18,10 +18,7 @@
18static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
19{ 19{
20 char *which = table->data; 20 char *which = table->data;
21#ifdef CONFIG_UTS_NS 21
22 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
24#endif
25 if (!write) 22 if (!write)
26 down_read(&uts_sem); 23 down_read(&uts_sem);
27 else 24 else
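
The #ifdef CONFIG_UTS_NS block removed above rebased the sysctl table's data pointer from the global init_uts_ns onto the current task's uts namespace by byte-offset arithmetic. For reference, a small sketch of that rebasing idiom with stand-in types:

/* Illustration of the pointer-rebasing idiom in the lines removed above:
 * given a pointer to a field of one struct instance, compute the same
 * field's address inside another instance by carrying over the byte offset.
 * Types here are stand-ins for the kernel's uts_namespace. */
#include <stdio.h>

struct uts_ns {
	char sysname[65];
	char nodename[65];
};

static struct uts_ns init_uts = { "Linux", "init-node" };
static struct uts_ns other_uts = { "Linux", "container-node" };

int main(void)
{
	/* "which" starts out pointing into the global instance... */
	char *which = init_uts.nodename;

	/* ...and is rebased onto another instance, field for field. */
	which = (which - (char *)&init_uts) + (char *)&other_uts;

	printf("rebased field reads: %s\n", which);	/* "container-node" */
	return 0;
}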
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3bebf73be976..58e5c152a6bb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -282,8 +282,8 @@ static int worker_thread(void *__cwq)
282 struct cpu_workqueue_struct *cwq = __cwq; 282 struct cpu_workqueue_struct *cwq = __cwq;
283 DEFINE_WAIT(wait); 283 DEFINE_WAIT(wait);
284 284
285 if (!cwq->wq->freezeable) 285 if (cwq->wq->freezeable)
286 current->flags |= PF_NOFREEZE; 286 set_freezable();
287 287
288 set_user_nice(current, -5); 288 set_user_nice(current, -5);
289 289
@@ -382,16 +382,16 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
382EXPORT_SYMBOL_GPL(flush_workqueue); 382EXPORT_SYMBOL_GPL(flush_workqueue);
383 383
384/* 384/*
385 * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, 385 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
386 * so this work can't be re-armed in any way. 386 * so this work can't be re-armed in any way.
387 */ 387 */
388static int try_to_grab_pending(struct work_struct *work) 388static int try_to_grab_pending(struct work_struct *work)
389{ 389{
390 struct cpu_workqueue_struct *cwq; 390 struct cpu_workqueue_struct *cwq;
391 int ret = 0; 391 int ret = -1;
392 392
393 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 393 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
394 return 1; 394 return 0;
395 395
396 /* 396 /*
397 * The queueing is in progress, or it is already queued. Try to 397 * The queueing is in progress, or it is already queued. Try to
@@ -457,10 +457,28 @@ static void wait_on_work(struct work_struct *work)
457 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 457 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
458} 458}
459 459
460static int __cancel_work_timer(struct work_struct *work,
461 struct timer_list* timer)
462{
463 int ret;
464
465 do {
466 ret = (timer && likely(del_timer(timer)));
467 if (!ret)
468 ret = try_to_grab_pending(work);
469 wait_on_work(work);
470 } while (unlikely(ret < 0));
471
472 work_clear_pending(work);
473 return ret;
474}
475
460/** 476/**
461 * cancel_work_sync - block until a work_struct's callback has terminated 477 * cancel_work_sync - block until a work_struct's callback has terminated
462 * @work: the work which is to be flushed 478 * @work: the work which is to be flushed
463 * 479 *
480 * Returns true if @work was pending.
481 *
464 * cancel_work_sync() will cancel the work if it is queued. If the work's 482 * cancel_work_sync() will cancel the work if it is queued. If the work's
465 * callback appears to be running, cancel_work_sync() will block until it 483 * callback appears to be running, cancel_work_sync() will block until it
466 * has completed. 484 * has completed.
@@ -476,31 +494,26 @@ static void wait_on_work(struct work_struct *work)
476 * The caller must ensure that workqueue_struct on which this work was last 494 * The caller must ensure that workqueue_struct on which this work was last
477 * queued can't be destroyed before this function returns. 495 * queued can't be destroyed before this function returns.
478 */ 496 */
479void cancel_work_sync(struct work_struct *work) 497int cancel_work_sync(struct work_struct *work)
480{ 498{
481 while (!try_to_grab_pending(work)) 499 return __cancel_work_timer(work, NULL);
482 cpu_relax();
483 wait_on_work(work);
484 work_clear_pending(work);
485} 500}
486EXPORT_SYMBOL_GPL(cancel_work_sync); 501EXPORT_SYMBOL_GPL(cancel_work_sync);
487 502
488/** 503/**
489 * cancel_rearming_delayed_work - reliably kill off a delayed work. 504 * cancel_delayed_work_sync - reliably kill off a delayed work.
490 * @dwork: the delayed work struct 505 * @dwork: the delayed work struct
491 * 506 *
507 * Returns true if @dwork was pending.
508 *
492 * It is possible to use this function if @dwork rearms itself via queue_work() 509 * It is possible to use this function if @dwork rearms itself via queue_work()
493 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 510 * or queue_delayed_work(). See also the comment for cancel_work_sync().
494 */ 511 */
495void cancel_rearming_delayed_work(struct delayed_work *dwork) 512int cancel_delayed_work_sync(struct delayed_work *dwork)
496{ 513{
497 while (!del_timer(&dwork->timer) && 514 return __cancel_work_timer(&dwork->work, &dwork->timer);
498 !try_to_grab_pending(&dwork->work))
499 cpu_relax();
500 wait_on_work(&dwork->work);
501 work_clear_pending(&dwork->work);
502} 515}
503EXPORT_SYMBOL(cancel_rearming_delayed_work); 516EXPORT_SYMBOL(cancel_delayed_work_sync);
504 517
505static struct workqueue_struct *keventd_wq __read_mostly; 518static struct workqueue_struct *keventd_wq __read_mostly;
506 519
@@ -739,18 +752,17 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
739 if (cwq->thread == NULL) 752 if (cwq->thread == NULL)
740 return; 753 return;
741 754
755 flush_cpu_workqueue(cwq);
742 /* 756 /*
743 * If the caller is CPU_DEAD the single flush_cpu_workqueue() 757 * If the caller is CPU_DEAD and cwq->worklist was not empty,
744 * is not enough, a concurrent flush_workqueue() can insert a 758 * a concurrent flush_workqueue() can insert a barrier after us.
745 * barrier after us. 759 * However, in that case run_workqueue() won't return and check
760 * kthread_should_stop() until it flushes all work_struct's.
746 * When ->worklist becomes empty it is safe to exit because no 761 * When ->worklist becomes empty it is safe to exit because no
747 * more work_structs can be queued on this cwq: flush_workqueue 762 * more work_structs can be queued on this cwq: flush_workqueue
748 * checks list_empty(), and a "normal" queue_work() can't use 763 * checks list_empty(), and a "normal" queue_work() can't use
749 * a dead CPU. 764 * a dead CPU.
750 */ 765 */
751 while (flush_cpu_workqueue(cwq))
752 ;
753
754 kthread_stop(cwq->thread); 766 kthread_stop(cwq->thread);
755 cwq->thread = NULL; 767 cwq->thread = NULL;
756} 768}
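
try_to_grab_pending() now returns -1 while a queueing is still in flight, 0 if the work was idle and 1 if a pending instance was grabbed, which lets __cancel_work_timer() retry until it has a definitive answer and lets cancel_work_sync()/cancel_delayed_work_sync() report whether the work was pending. A toy userspace model of that retry loop with simulated work state:

/* Toy model of the __cancel_work_timer() retry loop added above: keep trying
 * until the result is non-negative, where -1 means "queueing in progress,
 * try again", 0 means "was idle" and 1 means "a pending instance was
 * cancelled". The work/timer state below is simulated. */
#include <stdio.h>

struct fake_work {
	int timer_armed;	/* stands in for a pending delayed-work timer */
	int pending;		/* stands in for WORK_STRUCT_PENDING */
	int in_flux;		/* how many grab attempts still race with a queueing */
};

static int del_timer(struct fake_work *w)
{
	int was_armed = w->timer_armed;

	w->timer_armed = 0;
	return was_armed;
}

static int try_to_grab_pending(struct fake_work *w)
{
	if (w->in_flux > 0) {
		w->in_flux--;		/* simulate a concurrent queue_work() racing us */
		return -1;
	}
	if (w->pending) {
		w->pending = 0;
		return 1;
	}
	return 0;
}

static void wait_on_work(struct fake_work *w) { (void)w; }

static int cancel_work_timer(struct fake_work *w, int has_timer)
{
	int ret;

	do {
		ret = has_timer && del_timer(w);
		if (!ret)
			ret = try_to_grab_pending(w);
		wait_on_work(w);
	} while (ret < 0);

	return ret;		/* the "was it pending?" answer for the caller */
}

int main(void)
{
	struct fake_work w = { .timer_armed = 0, .pending = 1, .in_flux = 2 };

	printf("cancelled pending work: %d\n", cancel_work_timer(&w, 0));
	printf("cancelling again: %d\n", cancel_work_timer(&w, 0));
	return 0;
}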