Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |    2
-rw-r--r--  kernel/audit.c               |    8
-rw-r--r--  kernel/cpuset.c              |   45
-rw-r--r--  kernel/die_notifier.c        |   38
-rw-r--r--  kernel/exit.c                |    1
-rw-r--r--  kernel/fork.c                |   86
-rw-r--r--  kernel/futex.c               |  106
-rw-r--r--  kernel/hrtimer.c             |    1
-rw-r--r--  kernel/irq/handle.c          |    4
-rw-r--r--  kernel/irq/manage.c          |   10
-rw-r--r--  kernel/irq/proc.c            |   15
-rw-r--r--  kernel/irq/spurious.c        |    4
-rw-r--r--  kernel/itimer.c              |   60
-rw-r--r--  kernel/kallsyms.c            |   81
-rw-r--r--  kernel/kexec.c               |    4
-rw-r--r--  kernel/kmod.c                |    7
-rw-r--r--  kernel/kprobes.c             |  293
-rw-r--r--  kernel/lockdep.c             |   51
-rw-r--r--  kernel/module.c              |   79
-rw-r--r--  kernel/nsproxy.c             |  139
-rw-r--r--  kernel/params.c              |    2
-rw-r--r--  kernel/pid.c                 |   11
-rw-r--r--  kernel/posix-cpu-timers.c    |   14
-rw-r--r--  kernel/posix-timers.c        |    1
-rw-r--r--  kernel/power/process.c       |    6
-rw-r--r--  kernel/power/snapshot.c      |    1
-rw-r--r--  kernel/power/swap.c          |    1
-rw-r--r--  kernel/printk.c              |   27
-rw-r--r--  kernel/rcutorture.c          |   45
-rw-r--r--  kernel/rwsem.c               |    2
-rw-r--r--  kernel/sched.c               |  365
-rw-r--r--  kernel/signal.c              |    1
-rw-r--r--  kernel/softlockup.c          |   48
-rw-r--r--  kernel/stop_machine.c        |    8
-rw-r--r--  kernel/sys.c                 |   19
-rw-r--r--  kernel/sysctl.c              |   11
-rw-r--r--  kernel/time.c                |   61
-rw-r--r--  kernel/time/Makefile         |    2
-rw-r--r--  kernel/time/tick-common.c    |    8
-rw-r--r--  kernel/time/tick-internal.h  |    1
-rw-r--r--  kernel/time/tick-sched.c     |   51
-rw-r--r--  kernel/time/timekeeping.c    |  476
-rw-r--r--  kernel/time/timer_list.c     |   15
-rw-r--r--  kernel/time/timer_stats.c    |   14
-rw-r--r--  kernel/timer.c               |  528
-rw-r--r--  kernel/uid16.c               |    1
-rw-r--r--  kernel/utsname.c             |   41
47 files changed, 1670 insertions, 1124 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ac6b27abb1ad..642d4277c2ea 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o 11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += time/ 14obj-y += time/
diff --git a/kernel/audit.c b/kernel/audit.c
index 4e9d20829681..d13276d41410 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
515 err = -EPERM; 515 err = -EPERM;
516 break; 516 break;
517 case AUDIT_USER: 517 case AUDIT_USER:
518 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 518 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
519 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: 519 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
520 if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) 520 if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
521 err = -EPERM; 521 err = -EPERM;
522 break; 522 break;
@@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
614 loginuid, sid); 614 loginuid, sid);
615 break; 615 break;
616 case AUDIT_USER: 616 case AUDIT_USER:
617 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 617 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
618 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: 618 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
619 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 619 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
620 return 0; 620 return 0;
621 621
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d240349cbf0f..88b416dfbc72 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -42,7 +42,6 @@
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/slab.h> 44#include <linux/slab.h>
45#include <linux/smp_lock.h>
46#include <linux/spinlock.h> 45#include <linux/spinlock.h>
47#include <linux/stat.h> 46#include <linux/stat.h>
48#include <linux/string.h> 47#include <linux/string.h>
@@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf)
822 return -EACCES; 821 return -EACCES;
823 822
824 trialcs = *cs; 823 trialcs = *cs;
825 retval = cpulist_parse(buf, trialcs.cpus_allowed); 824
826 if (retval < 0) 825 /*
827 return retval; 826 * We allow a cpuset's cpus_allowed to be empty; if it has attached
827 * tasks, we'll catch it later when we validate the change and return
828 * -ENOSPC.
829 */
830 if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
831 cpus_clear(trialcs.cpus_allowed);
832 } else {
833 retval = cpulist_parse(buf, trialcs.cpus_allowed);
834 if (retval < 0)
835 return retval;
836 }
828 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); 837 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
829 if (cpus_empty(trialcs.cpus_allowed)) 838 /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
839 if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
830 return -ENOSPC; 840 return -ENOSPC;
831 retval = validate_change(cs, &trialcs); 841 retval = validate_change(cs, &trialcs);
832 if (retval < 0) 842 if (retval < 0)
@@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf)
919 return -EACCES; 929 return -EACCES;
920 930
921 trialcs = *cs; 931 trialcs = *cs;
922 retval = nodelist_parse(buf, trialcs.mems_allowed); 932
923 if (retval < 0) 933 /*
924 goto done; 934 * We allow a cpuset's mems_allowed to be empty; if it has attached
935 * tasks, we'll catch it later when we validate the change and return
936 * -ENOSPC.
937 */
938 if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
939 nodes_clear(trialcs.mems_allowed);
940 } else {
941 retval = nodelist_parse(buf, trialcs.mems_allowed);
942 if (retval < 0)
943 goto done;
944 }
925 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); 945 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
926 oldmem = cs->mems_allowed; 946 oldmem = cs->mems_allowed;
927 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 947 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
928 retval = 0; /* Too easy - nothing to do */ 948 retval = 0; /* Too easy - nothing to do */
929 goto done; 949 goto done;
930 } 950 }
931 if (nodes_empty(trialcs.mems_allowed)) { 951 /* mems_allowed cannot be empty for a cpuset with attached tasks. */
952 if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
932 retval = -ENOSPC; 953 retval = -ENOSPC;
933 goto done; 954 goto done;
934 } 955 }
@@ -2200,10 +2221,6 @@ void cpuset_fork(struct task_struct *child)
2200 * it is holding that mutex while calling check_for_release(), 2221 * it is holding that mutex while calling check_for_release(),
2201 * which calls kmalloc(), so can't be called holding callback_mutex(). 2222 * which calls kmalloc(), so can't be called holding callback_mutex().
2202 * 2223 *
2203 * We don't need to task_lock() this reference to tsk->cpuset,
2204 * because tsk is already marked PF_EXITING, so attach_task() won't
2205 * mess with it, or task is a failed fork, never visible to attach_task.
2206 *
2207 * the_top_cpuset_hack: 2224 * the_top_cpuset_hack:
2208 * 2225 *
2209 * Set the exiting tasks cpuset to the root cpuset (top_cpuset). 2226 * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2242,8 +2259,10 @@ void cpuset_exit(struct task_struct *tsk)
2242{ 2259{
2243 struct cpuset *cs; 2260 struct cpuset *cs;
2244 2261
2262 task_lock(current);
2245 cs = tsk->cpuset; 2263 cs = tsk->cpuset;
2246 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ 2264 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
2265 task_unlock(current);
2247 2266
2248 if (notify_on_release(cs)) { 2267 if (notify_on_release(cs)) {
2249 char *pathbuf = NULL; 2268 char *pathbuf = NULL;
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
new file mode 100644
index 000000000000..0d98827887a7
--- /dev/null
+++ b/kernel/die_notifier.c
@@ -0,0 +1,38 @@
1
2#include <linux/module.h>
3#include <linux/notifier.h>
4#include <linux/vmalloc.h>
5#include <linux/kdebug.h>
6
7
8static ATOMIC_NOTIFIER_HEAD(die_chain);
9
10int notify_die(enum die_val val, const char *str,
11 struct pt_regs *regs, long err, int trap, int sig)
12{
13 struct die_args args = {
14 .regs = regs,
15 .str = str,
16 .err = err,
17 .trapnr = trap,
18 .signr = sig,
19
20 };
21
22 return atomic_notifier_call_chain(&die_chain, val, &args);
23}
24
25int register_die_notifier(struct notifier_block *nb)
26{
27 vmalloc_sync_all();
28 return atomic_notifier_chain_register(&die_chain, nb);
29}
30EXPORT_SYMBOL_GPL(register_die_notifier);
31
32int unregister_die_notifier(struct notifier_block *nb)
33{
34 return atomic_notifier_chain_unregister(&die_chain, nb);
35}
36EXPORT_SYMBOL_GPL(unregister_die_notifier);
37
38
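The new kernel/die_notifier.c moves the die notifier chain into common code. For illustration only (not part of the patch), a minimal sketch of a module hooking this chain, assuming the standard notifier callback signature from <linux/notifier.h> and struct die_args from <linux/kdebug.h>; the handler and module names are made up:

	#include <linux/module.h>
	#include <linux/notifier.h>
	#include <linux/kdebug.h>

	/* Runs on every notify_die() event; the chain is atomic, so no sleeping. */
	static int sample_die_handler(struct notifier_block *nb, unsigned long val,
				      void *data)
	{
		struct die_args *args = data;

		printk(KERN_INFO "die event %lu: %s (trap %d, signal %d)\n",
		       val, args->str, args->trapnr, args->signr);
		return NOTIFY_DONE;
	}

	static struct notifier_block sample_die_nb = {
		.notifier_call = sample_die_handler,
	};

	static int __init sample_init(void)
	{
		return register_die_notifier(&sample_die_nb);
	}

	static void __exit sample_exit(void)
	{
		unregister_die_notifier(&sample_die_nb);
	}

	module_init(sample_init);
	module_exit(sample_exit);
	MODULE_LICENSE("GPL");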
diff --git a/kernel/exit.c b/kernel/exit.c
index 92369240d91d..f5a7abb621f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -7,7 +7,6 @@
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/smp_lock.h>
11#include <linux/module.h> 10#include <linux/module.h>
12#include <linux/capability.h> 11#include <linux/capability.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index b7d169def942..a8dd75d4992b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -14,7 +14,6 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/unistd.h> 16#include <linux/unistd.h>
17#include <linux/smp_lock.h>
18#include <linux/module.h> 17#include <linux/module.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/completion.h> 19#include <linux/completion.h>
@@ -1516,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1516} 1515}
1517 1516
1518/* 1517/*
1519 * Unshare the mnt_namespace structure if it is being shared
1520 */
1521static int unshare_mnt_namespace(unsigned long unshare_flags,
1522 struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
1523{
1524 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
1525
1526 if ((unshare_flags & CLONE_NEWNS) && ns) {
1527 if (!capable(CAP_SYS_ADMIN))
1528 return -EPERM;
1529
1530 *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
1531 if (!*new_nsp)
1532 return -ENOMEM;
1533 }
1534
1535 return 0;
1536}
1537
1538/*
1539 * Unsharing of sighand is not supported yet 1518 * Unsharing of sighand is not supported yet
1540 */ 1519 */
1541static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1520static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
@@ -1593,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
1593 return 0; 1572 return 0;
1594} 1573}
1595 1574
1596#ifndef CONFIG_IPC_NS
1597static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
1598{
1599 if (flags & CLONE_NEWIPC)
1600 return -EINVAL;
1601
1602 return 0;
1603}
1604#endif
1605
1606/* 1575/*
1607 * unshare allows a process to 'unshare' part of the process 1576 * unshare allows a process to 'unshare' part of the process
1608 * context which was originally shared using clone. copy_* 1577 * context which was originally shared using clone. copy_*
@@ -1615,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1615{ 1584{
1616 int err = 0; 1585 int err = 0;
1617 struct fs_struct *fs, *new_fs = NULL; 1586 struct fs_struct *fs, *new_fs = NULL;
1618 struct mnt_namespace *ns, *new_ns = NULL;
1619 struct sighand_struct *new_sigh = NULL; 1587 struct sighand_struct *new_sigh = NULL;
1620 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1588 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1621 struct files_struct *fd, *new_fd = NULL; 1589 struct files_struct *fd, *new_fd = NULL;
1622 struct sem_undo_list *new_ulist = NULL; 1590 struct sem_undo_list *new_ulist = NULL;
1623 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; 1591 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
1624 struct uts_namespace *uts, *new_uts = NULL;
1625 struct ipc_namespace *ipc, *new_ipc = NULL;
1626 1592
1627 check_unshare_flags(&unshare_flags); 1593 check_unshare_flags(&unshare_flags);
1628 1594
@@ -1637,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1637 goto bad_unshare_out; 1603 goto bad_unshare_out;
1638 if ((err = unshare_fs(unshare_flags, &new_fs))) 1604 if ((err = unshare_fs(unshare_flags, &new_fs)))
1639 goto bad_unshare_cleanup_thread; 1605 goto bad_unshare_cleanup_thread;
1640 if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
1641 goto bad_unshare_cleanup_fs;
1642 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1606 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1643 goto bad_unshare_cleanup_ns; 1607 goto bad_unshare_cleanup_fs;
1644 if ((err = unshare_vm(unshare_flags, &new_mm))) 1608 if ((err = unshare_vm(unshare_flags, &new_mm)))
1645 goto bad_unshare_cleanup_sigh; 1609 goto bad_unshare_cleanup_sigh;
1646 if ((err = unshare_fd(unshare_flags, &new_fd))) 1610 if ((err = unshare_fd(unshare_flags, &new_fd)))
1647 goto bad_unshare_cleanup_vm; 1611 goto bad_unshare_cleanup_vm;
1648 if ((err = unshare_semundo(unshare_flags, &new_ulist))) 1612 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1649 goto bad_unshare_cleanup_fd; 1613 goto bad_unshare_cleanup_fd;
1650 if ((err = unshare_utsname(unshare_flags, &new_uts))) 1614 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1615 new_fs)))
1651 goto bad_unshare_cleanup_semundo; 1616 goto bad_unshare_cleanup_semundo;
1652 if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
1653 goto bad_unshare_cleanup_uts;
1654
1655 if (new_ns || new_uts || new_ipc) {
1656 old_nsproxy = current->nsproxy;
1657 new_nsproxy = dup_namespaces(old_nsproxy);
1658 if (!new_nsproxy) {
1659 err = -ENOMEM;
1660 goto bad_unshare_cleanup_ipc;
1661 }
1662 }
1663 1617
1664 if (new_fs || new_ns || new_mm || new_fd || new_ulist || 1618 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
1665 new_uts || new_ipc) {
1666 1619
1667 task_lock(current); 1620 task_lock(current);
1668 1621
1669 if (new_nsproxy) { 1622 if (new_nsproxy) {
1623 old_nsproxy = current->nsproxy;
1670 current->nsproxy = new_nsproxy; 1624 current->nsproxy = new_nsproxy;
1671 new_nsproxy = old_nsproxy; 1625 new_nsproxy = old_nsproxy;
1672 } 1626 }
@@ -1677,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1677 new_fs = fs; 1631 new_fs = fs;
1678 } 1632 }
1679 1633
1680 if (new_ns) {
1681 ns = current->nsproxy->mnt_ns;
1682 current->nsproxy->mnt_ns = new_ns;
1683 new_ns = ns;
1684 }
1685
1686 if (new_mm) { 1634 if (new_mm) {
1687 mm = current->mm; 1635 mm = current->mm;
1688 active_mm = current->active_mm; 1636 active_mm = current->active_mm;
@@ -1698,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1698 new_fd = fd; 1646 new_fd = fd;
1699 } 1647 }
1700 1648
1701 if (new_uts) {
1702 uts = current->nsproxy->uts_ns;
1703 current->nsproxy->uts_ns = new_uts;
1704 new_uts = uts;
1705 }
1706
1707 if (new_ipc) {
1708 ipc = current->nsproxy->ipc_ns;
1709 current->nsproxy->ipc_ns = new_ipc;
1710 new_ipc = ipc;
1711 }
1712
1713 task_unlock(current); 1649 task_unlock(current);
1714 } 1650 }
1715 1651
1716 if (new_nsproxy) 1652 if (new_nsproxy)
1717 put_nsproxy(new_nsproxy); 1653 put_nsproxy(new_nsproxy);
1718 1654
1719bad_unshare_cleanup_ipc:
1720 if (new_ipc)
1721 put_ipc_ns(new_ipc);
1722
1723bad_unshare_cleanup_uts:
1724 if (new_uts)
1725 put_uts_ns(new_uts);
1726
1727bad_unshare_cleanup_semundo: 1655bad_unshare_cleanup_semundo:
1728bad_unshare_cleanup_fd: 1656bad_unshare_cleanup_fd:
1729 if (new_fd) 1657 if (new_fd)
@@ -1738,10 +1666,6 @@ bad_unshare_cleanup_sigh:
1738 if (atomic_dec_and_test(&new_sigh->count)) 1666 if (atomic_dec_and_test(&new_sigh->count))
1739 kmem_cache_free(sighand_cachep, new_sigh); 1667 kmem_cache_free(sighand_cachep, new_sigh);
1740 1668
1741bad_unshare_cleanup_ns:
1742 if (new_ns)
1743 put_mnt_ns(new_ns);
1744
1745bad_unshare_cleanup_fs: 1669bad_unshare_cleanup_fs:
1746 if (new_fs) 1670 if (new_fs)
1747 put_fs_struct(new_fs); 1671 put_fs_struct(new_fs);
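The fork.c hunks above drop the per-namespace unshare helpers (mnt, uts, ipc) and route everything through unshare_nsproxy_namespaces(), so sys_unshare() swaps a single nsproxy instead of individual namespace pointers. For context, a minimal userspace sketch (illustration only, not part of the patch) of the unshare() call that exercises this path:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* CLONE_NEWUTS is now handled by unshare_nsproxy_namespaces(),
		 * which builds a fresh nsproxy rather than swapping uts_ns alone. */
		if (unshare(CLONE_NEWUTS) < 0) {
			perror("unshare");
			return 1;
		}
		/* Hostname changes are now private to this namespace. */
		if (sethostname("sandbox", 7) < 0)
			perror("sethostname");
		return 0;
	}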
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a270b5e3f95..600bc9d801f2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -48,6 +48,7 @@
48#include <linux/pagemap.h> 48#include <linux/pagemap.h>
49#include <linux/syscalls.h> 49#include <linux/syscalls.h>
50#include <linux/signal.h> 50#include <linux/signal.h>
51#include <linux/module.h>
51#include <asm/futex.h> 52#include <asm/futex.h>
52 53
53#include "rtmutex_common.h" 54#include "rtmutex_common.h"
@@ -55,32 +56,6 @@
55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 56#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
56 57
57/* 58/*
58 * Futexes are matched on equal values of this key.
59 * The key type depends on whether it's a shared or private mapping.
60 * Don't rearrange members without looking at hash_futex().
61 *
62 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
63 * We set bit 0 to indicate if it's an inode-based key.
64 */
65union futex_key {
66 struct {
67 unsigned long pgoff;
68 struct inode *inode;
69 int offset;
70 } shared;
71 struct {
72 unsigned long address;
73 struct mm_struct *mm;
74 int offset;
75 } private;
76 struct {
77 unsigned long word;
78 void *ptr;
79 int offset;
80 } both;
81};
82
83/*
84 * Priority Inheritance state: 59 * Priority Inheritance state:
85 */ 60 */
86struct futex_pi_state { 61struct futex_pi_state {
@@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
175 * 150 *
176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 151 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
177 */ 152 */
178static int get_futex_key(u32 __user *uaddr, union futex_key *key) 153int get_futex_key(u32 __user *uaddr, union futex_key *key)
179{ 154{
180 unsigned long address = (unsigned long)uaddr; 155 unsigned long address = (unsigned long)uaddr;
181 struct mm_struct *mm = current->mm; 156 struct mm_struct *mm = current->mm;
@@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
246 } 221 }
247 return err; 222 return err;
248} 223}
224EXPORT_SYMBOL_GPL(get_futex_key);
249 225
250/* 226/*
251 * Take a reference to the resource addressed by a key. 227 * Take a reference to the resource addressed by a key.
@@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
254 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this 230 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
255 * function, if it is called at all. mmap_sem keeps key->shared.inode valid. 231 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
256 */ 232 */
257static inline void get_key_refs(union futex_key *key) 233inline void get_futex_key_refs(union futex_key *key)
258{ 234{
259 if (key->both.ptr != 0) { 235 if (key->both.ptr != 0) {
260 if (key->both.offset & 1) 236 if (key->both.offset & 1)
@@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key)
263 atomic_inc(&key->private.mm->mm_count); 239 atomic_inc(&key->private.mm->mm_count);
264 } 240 }
265} 241}
242EXPORT_SYMBOL_GPL(get_futex_key_refs);
266 243
267/* 244/*
268 * Drop a reference to the resource addressed by a key. 245 * Drop a reference to the resource addressed by a key.
269 * The hash bucket spinlock must not be held. 246 * The hash bucket spinlock must not be held.
270 */ 247 */
271static void drop_key_refs(union futex_key *key) 248void drop_futex_key_refs(union futex_key *key)
272{ 249{
273 if (key->both.ptr != 0) { 250 if (key->both.ptr != 0) {
274 if (key->both.offset & 1) 251 if (key->both.offset & 1)
@@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key)
277 mmdrop(key->private.mm); 254 mmdrop(key->private.mm);
278 } 255 }
279} 256}
257EXPORT_SYMBOL_GPL(drop_futex_key_refs);
280 258
281static inline int get_futex_value_locked(u32 *dest, u32 __user *from) 259static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
282{ 260{
@@ -873,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
873 this->lock_ptr = &hb2->lock; 851 this->lock_ptr = &hb2->lock;
874 } 852 }
875 this->key = key2; 853 this->key = key2;
876 get_key_refs(&key2); 854 get_futex_key_refs(&key2);
877 drop_count++; 855 drop_count++;
878 856
879 if (ret - nr_wake >= nr_requeue) 857 if (ret - nr_wake >= nr_requeue)
@@ -886,9 +864,9 @@ out_unlock:
886 if (hb1 != hb2) 864 if (hb1 != hb2)
887 spin_unlock(&hb2->lock); 865 spin_unlock(&hb2->lock);
888 866
889 /* drop_key_refs() must be called outside the spinlocks. */ 867 /* drop_futex_key_refs() must be called outside the spinlocks. */
890 while (--drop_count >= 0) 868 while (--drop_count >= 0)
891 drop_key_refs(&key1); 869 drop_futex_key_refs(&key1);
892 870
893out: 871out:
894 up_read(&current->mm->mmap_sem); 872 up_read(&current->mm->mmap_sem);
@@ -906,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
906 884
907 init_waitqueue_head(&q->waiters); 885 init_waitqueue_head(&q->waiters);
908 886
909 get_key_refs(&q->key); 887 get_futex_key_refs(&q->key);
910 hb = hash_futex(&q->key); 888 hb = hash_futex(&q->key);
911 q->lock_ptr = &hb->lock; 889 q->lock_ptr = &hb->lock;
912 890
@@ -925,7 +903,7 @@ static inline void
925queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 903queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
926{ 904{
927 spin_unlock(&hb->lock); 905 spin_unlock(&hb->lock);
928 drop_key_refs(&q->key); 906 drop_futex_key_refs(&q->key);
929} 907}
930 908
931/* 909/*
@@ -980,7 +958,7 @@ static int unqueue_me(struct futex_q *q)
980 ret = 1; 958 ret = 1;
981 } 959 }
982 960
983 drop_key_refs(&q->key); 961 drop_futex_key_refs(&q->key);
984 return ret; 962 return ret;
985} 963}
986 964
@@ -999,15 +977,18 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
999 977
1000 spin_unlock(&hb->lock); 978 spin_unlock(&hb->lock);
1001 979
1002 drop_key_refs(&q->key); 980 drop_futex_key_refs(&q->key);
1003} 981}
1004 982
1005static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) 983static long futex_wait_restart(struct restart_block *restart);
984static int futex_wait_abstime(u32 __user *uaddr, u32 val,
985 int timed, unsigned long abs_time)
1006{ 986{
1007 struct task_struct *curr = current; 987 struct task_struct *curr = current;
1008 DECLARE_WAITQUEUE(wait, curr); 988 DECLARE_WAITQUEUE(wait, curr);
1009 struct futex_hash_bucket *hb; 989 struct futex_hash_bucket *hb;
1010 struct futex_q q; 990 struct futex_q q;
991 unsigned long time_left = 0;
1011 u32 uval; 992 u32 uval;
1012 int ret; 993 int ret;
1013 994
@@ -1087,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
1087 * !list_empty() is safe here without any lock. 1068 * !list_empty() is safe here without any lock.
1088 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1069 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1089 */ 1070 */
1090 if (likely(!list_empty(&q.list))) 1071 time_left = 0;
1091 time = schedule_timeout(time); 1072 if (likely(!list_empty(&q.list))) {
1073 unsigned long rel_time;
1074
1075 if (timed) {
1076 unsigned long now = jiffies;
1077 if (time_after(now, abs_time))
1078 rel_time = 0;
1079 else
1080 rel_time = abs_time - now;
1081 } else
1082 rel_time = MAX_SCHEDULE_TIMEOUT;
1083
1084 time_left = schedule_timeout(rel_time);
1085 }
1092 __set_current_state(TASK_RUNNING); 1086 __set_current_state(TASK_RUNNING);
1093 1087
1094 /* 1088 /*
@@ -1099,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
1099 /* If we were woken (and unqueued), we succeeded, whatever. */ 1093 /* If we were woken (and unqueued), we succeeded, whatever. */
1100 if (!unqueue_me(&q)) 1094 if (!unqueue_me(&q))
1101 return 0; 1095 return 0;
1102 if (time == 0) 1096 if (time_left == 0)
1103 return -ETIMEDOUT; 1097 return -ETIMEDOUT;
1098
1104 /* 1099 /*
1105 * We expect signal_pending(current), but another thread may 1100 * We expect signal_pending(current), but another thread may
1106 * have handled it for us already. 1101 * have handled it for us already.
1107 */ 1102 */
1108 return -EINTR; 1103 if (time_left == MAX_SCHEDULE_TIMEOUT)
1104 return -ERESTARTSYS;
1105 else {
1106 struct restart_block *restart;
1107 restart = &current_thread_info()->restart_block;
1108 restart->fn = futex_wait_restart;
1109 restart->arg0 = (unsigned long)uaddr;
1110 restart->arg1 = (unsigned long)val;
1111 restart->arg2 = (unsigned long)timed;
1112 restart->arg3 = abs_time;
1113 return -ERESTART_RESTARTBLOCK;
1114 }
1109 1115
1110 out_unlock_release_sem: 1116 out_unlock_release_sem:
1111 queue_unlock(&q, hb); 1117 queue_unlock(&q, hb);
@@ -1115,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
1115 return ret; 1121 return ret;
1116} 1122}
1117 1123
1124static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
1125{
1126 int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
1127 return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
1128}
1129
1130static long futex_wait_restart(struct restart_block *restart)
1131{
1132 u32 __user *uaddr = (u32 __user *)restart->arg0;
1133 u32 val = (u32)restart->arg1;
1134 int timed = (int)restart->arg2;
1135 unsigned long abs_time = restart->arg3;
1136
1137 restart->fn = do_no_restart_syscall;
1138 return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
1139}
1140
1141
1118/* 1142/*
1119 * Userspace tried a 0 -> TID atomic transition of the futex value 1143 * Userspace tried a 0 -> TID atomic transition of the futex value
1120 * and failed. The kernel side here does the whole locking operation: 1144 * and failed. The kernel side here does the whole locking operation:
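The futex_wait() rework above converts the caller's relative timeout into an absolute jiffies deadline (futex_wait_abstime) and, when a timed wait is interrupted by a signal, returns -ERESTART_RESTARTBLOCK with futex_wait_restart() saved in the thread's restart_block, so the restarted wait resumes against the original deadline instead of re-arming the full interval. A minimal userspace sketch (illustration only; the wrapper name is made up) of the FUTEX_WAIT call whose timeout semantics this preserves:

	#include <linux/futex.h>
	#include <sys/syscall.h>
	#include <time.h>
	#include <unistd.h>

	/* Block while *uaddr still holds val, for at most rel_timeout.
	 * If a signal interrupts the wait, the kernel-side restart block
	 * re-enters the wait with the remaining time, not the full timeout. */
	static long futex_wait(int *uaddr, int val, const struct timespec *rel_timeout)
	{
		return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, rel_timeout, NULL, 0);
	}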
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1b3033105b40..c9f4f044a8a8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
669 669
670 return orun; 670 return orun;
671} 671}
672EXPORT_SYMBOL_GPL(hrtimer_forward);
672 673
673/* 674/*
674 * enqueue_hrtimer - internal function to (re)start a timer 675 * enqueue_hrtimer - internal function to (re)start a timer
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index aff1f0fabb0d..32e1ab1477d1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
48 * 48 *
49 * Controller mappings for all interrupt sources: 49 * Controller mappings for all interrupt sources:
50 */ 50 */
51struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { 51struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
52 [0 ... NR_IRQS-1] = { 52 [0 ... NR_IRQS-1] = {
53 .status = IRQ_DISABLED, 53 .status = IRQ_DISABLED,
54 .chip = &no_irq_chip, 54 .chip = &no_irq_chip,
@@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
180 if (desc->chip->ack) 180 if (desc->chip->ack)
181 desc->chip->ack(irq); 181 desc->chip->ack(irq);
182 action_ret = handle_IRQ_event(irq, desc->action); 182 action_ret = handle_IRQ_event(irq, desc->action);
183 if (!noirqdebug)
184 note_interrupt(irq, desc, action_ret);
183 desc->chip->end(irq); 185 desc->chip->end(irq);
184 return 1; 186 return 1;
185 } 187 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5597c157442a..203a518b6f14 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
317 } 317 }
318 318
319 *p = new; 319 *p = new;
320#if defined(CONFIG_IRQ_PER_CPU) 320
321 if (new->flags & IRQF_PERCPU)
322 desc->status |= IRQ_PER_CPU;
323#endif
324 /* Exclude IRQ from balancing */ 321 /* Exclude IRQ from balancing */
325 if (new->flags & IRQF_NOBALANCING) 322 if (new->flags & IRQF_NOBALANCING)
326 desc->status |= IRQ_NO_BALANCING; 323 desc->status |= IRQ_NO_BALANCING;
@@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new)
328 if (!shared) { 325 if (!shared) {
329 irq_chip_set_defaults(desc->chip); 326 irq_chip_set_defaults(desc->chip);
330 327
328#if defined(CONFIG_IRQ_PER_CPU)
329 if (new->flags & IRQF_PERCPU)
330 desc->status |= IRQ_PER_CPU;
331#endif
332
331 /* Setup the type (level, edge polarity) if configured: */ 333 /* Setup the type (level, edge polarity) if configured: */
332 if (new->flags & IRQF_TRIGGER_MASK) { 334 if (new->flags & IRQF_TRIGGER_MASK) {
333 if (desc->chip && desc->chip->set_type) 335 if (desc->chip && desc->chip->set_type)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2db91eb54ad8..ddde0ef9ccdc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
66{ 66{
67 struct irq_desc *desc = irq_desc + irq; 67 struct irq_desc *desc = irq_desc + irq;
68 struct irqaction *action; 68 struct irqaction *action;
69 unsigned long flags;
70 int ret = 1;
69 71
70 for (action = desc->action ; action; action = action->next) 72 spin_lock_irqsave(&desc->lock, flags);
73 for (action = desc->action ; action; action = action->next) {
71 if ((action != new_action) && action->name && 74 if ((action != new_action) && action->name &&
72 !strcmp(new_action->name, action->name)) 75 !strcmp(new_action->name, action->name)) {
73 return 0; 76 ret = 0;
74 return 1; 77 break;
78 }
79 }
80 spin_unlock_irqrestore(&desc->lock, flags);
81 return ret;
75} 82}
76 83
77void register_handler_proc(unsigned int irq, struct irqaction *action) 84void register_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 9d8c79b48823..b0d81aae472f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
146 146
147 if (unlikely(irqfixup)) { 147 if (unlikely(irqfixup)) {
148 /* Don't punish working computers */ 148 /* Don't punish working computers */
149 if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { 149 if ((irqfixup == 2 && ((irq == 0) ||
150 (desc->action->flags & IRQF_IRQPOLL))) ||
151 action_ret == IRQ_NONE) {
150 int ok = misrouted_irq(irq); 152 int ok = misrouted_irq(irq);
151 if (action_ret == IRQ_NONE) 153 if (action_ret == IRQ_NONE)
152 desc->irqs_unhandled -= ok; 154 desc->irqs_unhandled -= ok;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 307c6a632ef6..3205e8e114fa 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -7,7 +7,6 @@
7/* These are all the functions necessary to implement itimers */ 7/* These are all the functions necessary to implement itimers */
8 8
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/smp_lock.h>
11#include <linux/interrupt.h> 10#include <linux/interrupt.h>
12#include <linux/syscalls.h> 11#include <linux/syscalls.h>
13#include <linux/time.h> 12#include <linux/time.h>
@@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
139} 138}
140 139
141/* 140/*
142 * We do not care about correctness. We just sanitize the values so
143 * the ktime_t operations which expect normalized values do not
144 * break. This converts negative values to long timeouts similar to
145 * the code in kernel versions < 2.6.16
146 *
147 * Print a limited number of warning messages when an invalid timeval
148 * is detected.
149 */
150static void fixup_timeval(struct timeval *tv, int interval)
151{
152 static int warnlimit = 10;
153 unsigned long tmp;
154
155 if (warnlimit > 0) {
156 warnlimit--;
157 printk(KERN_WARNING
158 "setitimer: %s (pid = %d) provided "
159 "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
160 current->comm, current->pid,
161 interval ? "it_interval" : "it_value",
162 tv->tv_sec, (long) tv->tv_usec);
163 }
164
165 tmp = tv->tv_usec;
166 if (tmp >= USEC_PER_SEC) {
167 tv->tv_usec = tmp % USEC_PER_SEC;
168 tv->tv_sec += tmp / USEC_PER_SEC;
169 }
170
171 tmp = tv->tv_sec;
172 if (tmp > LONG_MAX)
173 tv->tv_sec = LONG_MAX;
174}
175
176/*
177 * Returns true if the timeval is in canonical form 141 * Returns true if the timeval is in canonical form
178 */ 142 */
179#define timeval_valid(t) \ 143#define timeval_valid(t) \
180 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) 144 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
181 145
182/*
183 * Check for invalid timevals, sanitize them and print a limited
184 * number of warnings.
185 */
186static void check_itimerval(struct itimerval *value) {
187
188 if (unlikely(!timeval_valid(&value->it_value)))
189 fixup_timeval(&value->it_value, 0);
190
191 if (unlikely(!timeval_valid(&value->it_interval)))
192 fixup_timeval(&value->it_interval, 1);
193}
194
195int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 146int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
196{ 147{
197 struct task_struct *tsk = current; 148 struct task_struct *tsk = current;
@@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
201 152
202 /* 153 /*
203 * Validate the timevals in value. 154 * Validate the timevals in value.
204 *
205 * Note: Although the spec requires that invalid values shall
206 * return -EINVAL, we just fixup the value and print a limited
207 * number of warnings in order not to break users of this
208 * historical misfeature.
209 *
210 * Scheduled for replacement in March 2007
211 */ 155 */
212 check_itimerval(value); 156 if (!timeval_valid(&value->it_value) ||
157 !timeval_valid(&value->it_interval))
158 return -EINVAL;
213 159
214 switch (which) { 160 switch (which) {
215 case ITIMER_REAL: 161 case ITIMER_REAL:
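With the hunks above, do_setitimer() stops sanitizing bad timevals and instead returns -EINVAL when it_value or it_interval is not in canonical form. A minimal userspace sketch (illustration only, not part of the patch) of a call that the old code warned about and fixed up, but that now fails:

	#include <sys/time.h>
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct itimerval it = {
			/* tv_usec >= 1000000 is not canonical; previously it was
			 * silently normalized, now setitimer() returns EINVAL. */
			.it_value    = { .tv_sec = 0, .tv_usec = 1500000 },
			.it_interval = { .tv_sec = 0, .tv_usec = 0       },
		};

		if (setitimer(ITIMER_REAL, &it, NULL) < 0)
			printf("setitimer: %s\n", strerror(errno));
		return 0;
	}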
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5a0de8409739..f1bda23140b2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
214 symbol_end = (unsigned long)_etext; 214 symbol_end = (unsigned long)_etext;
215 } 215 }
216 216
217 *symbolsize = symbol_end - symbol_start; 217 if (symbolsize)
218 *offset = addr - symbol_start; 218 *symbolsize = symbol_end - symbol_start;
219 if (offset)
220 *offset = addr - symbol_start;
219 221
220 return low; 222 return low;
221} 223}
@@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr,
267 return NULL; 269 return NULL;
268} 270}
269 271
272int lookup_symbol_name(unsigned long addr, char *symname)
273{
274 symname[0] = '\0';
275 symname[KSYM_NAME_LEN] = '\0';
276
277 if (is_ksym_addr(addr)) {
278 unsigned long pos;
279
280 pos = get_symbol_pos(addr, NULL, NULL);
281 /* Grab name */
282 kallsyms_expand_symbol(get_symbol_offset(pos), symname);
283 return 0;
284 }
285 /* see if it's in a module */
286 return lookup_module_symbol_name(addr, symname);
287}
288
289int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
290 unsigned long *offset, char *modname, char *name)
291{
292 name[0] = '\0';
293 name[KSYM_NAME_LEN] = '\0';
294
295 if (is_ksym_addr(addr)) {
296 unsigned long pos;
297
298 pos = get_symbol_pos(addr, size, offset);
299 /* Grab name */
300 kallsyms_expand_symbol(get_symbol_offset(pos), name);
301 modname[0] = '\0';
302 return 0;
303 }
304 /* see if it's in a module */
305 return lookup_module_symbol_attrs(addr, size, offset, modname, name);
306}
307
270/* Look up a kernel symbol and return it in a text buffer. */ 308/* Look up a kernel symbol and return it in a text buffer. */
271int sprint_symbol(char *buffer, unsigned long address) 309int sprint_symbol(char *buffer, unsigned long address)
272{ 310{
@@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address)
301struct kallsym_iter 339struct kallsym_iter
302{ 340{
303 loff_t pos; 341 loff_t pos;
304 struct module *owner;
305 unsigned long value; 342 unsigned long value;
306 unsigned int nameoff; /* If iterating in core kernel symbols */ 343 unsigned int nameoff; /* If iterating in core kernel symbols */
307 char type; 344 char type;
308 char name[KSYM_NAME_LEN+1]; 345 char name[KSYM_NAME_LEN+1];
346 char module_name[MODULE_NAME_LEN + 1];
347 int exported;
309}; 348};
310 349
311static int get_ksymbol_mod(struct kallsym_iter *iter) 350static int get_ksymbol_mod(struct kallsym_iter *iter)
312{ 351{
313 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 352 if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
314 &iter->value, &iter->type, 353 &iter->type, iter->name, iter->module_name,
315 iter->name, sizeof(iter->name)); 354 &iter->exported) < 0)
316 if (iter->owner == NULL)
317 return 0; 355 return 0;
318
319 /* Label it "global" if it is exported, "local" if not exported. */
320 iter->type = is_exported(iter->name, iter->owner)
321 ? toupper(iter->type) : tolower(iter->type);
322
323 return 1; 356 return 1;
324} 357}
325 358
@@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
328{ 361{
329 unsigned off = iter->nameoff; 362 unsigned off = iter->nameoff;
330 363
331 iter->owner = NULL; 364 iter->module_name[0] = '\0';
332 iter->value = kallsyms_addresses[iter->pos]; 365 iter->value = kallsyms_addresses[iter->pos];
333 366
334 iter->type = kallsyms_get_symbol_type(off); 367 iter->type = kallsyms_get_symbol_type(off);
@@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p)
392 if (!iter->name[0]) 425 if (!iter->name[0])
393 return 0; 426 return 0;
394 427
395 if (iter->owner) 428 if (iter->module_name[0]) {
429 char type;
430
431 /* Label it "global" if it is exported,
432 * "local" if not exported. */
433 type = iter->exported ? toupper(iter->type) :
434 tolower(iter->type);
396 seq_printf(m, "%0*lx %c %s\t[%s]\n", 435 seq_printf(m, "%0*lx %c %s\t[%s]\n",
397 (int)(2*sizeof(void*)), 436 (int)(2*sizeof(void*)),
398 iter->value, iter->type, iter->name, 437 iter->value, type, iter->name, iter->module_name);
399 module_name(iter->owner)); 438 } else
400 else
401 seq_printf(m, "%0*lx %c %s\n", 439 seq_printf(m, "%0*lx %c %s\n",
402 (int)(2*sizeof(void*)), 440 (int)(2*sizeof(void*)),
403 iter->value, iter->type, iter->name); 441 iter->value, iter->type, iter->name);
@@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
432 return ret; 470 return ret;
433} 471}
434 472
435static int kallsyms_release(struct inode *inode, struct file *file)
436{
437 struct seq_file *m = (struct seq_file *)file->private_data;
438 kfree(m->private);
439 return seq_release(inode, file);
440}
441
442static const struct file_operations kallsyms_operations = { 473static const struct file_operations kallsyms_operations = {
443 .open = kallsyms_open, 474 .open = kallsyms_open,
444 .read = seq_read, 475 .read = seq_read,
445 .llseek = seq_lseek, 476 .llseek = seq_lseek,
446 .release = kallsyms_release, 477 .release = seq_release_private,
447}; 478};
448 479
449static int __init kallsyms_init(void) 480static int __init kallsyms_init(void)
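The kallsyms changes add lookup_symbol_name() and lookup_symbol_attrs(), which resolve only the name (and optionally size/offset/module) for an address, and switch the /proc/kallsyms iterator from a struct module pointer to a copied module name. A minimal in-kernel sketch (illustration only; the helper name is made up) built on the signature added above:

	#include <linux/kallsyms.h>
	#include <linux/kernel.h>

	/* Print the symbol backing a text address; the buffer must hold
	 * KSYM_NAME_LEN + 1 bytes, matching lookup_symbol_name()'s contract. */
	static void show_symbol(unsigned long addr)
	{
		char name[KSYM_NAME_LEN + 1];

		if (!lookup_symbol_name(addr, name))
			printk(KERN_DEBUG "%lx is %s\n", addr, name);
		else
			printk(KERN_DEBUG "%lx: symbol not found\n", addr);
	}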
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a59c8a01ae0..25db14b89e82 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1118 memset(&prstatus, 0, sizeof(prstatus)); 1118 memset(&prstatus, 0, sizeof(prstatus));
1119 prstatus.pr_pid = current->pid; 1119 prstatus.pr_pid = current->pid;
1120 elf_core_copy_regs(&prstatus.pr_reg, regs); 1120 elf_core_copy_regs(&prstatus.pr_reg, regs);
1121 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, 1121 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1122 sizeof(prstatus)); 1122 &prstatus, sizeof(prstatus));
1123 final_note(buf); 1123 final_note(buf);
1124} 1124}
1125 1125
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 796276141e51..49cc4b9c1a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -23,7 +23,6 @@
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/smp_lock.h>
27#include <linux/slab.h> 26#include <linux/slab.h>
28#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
29#include <linux/completion.h> 28#include <linux/completion.h>
@@ -166,6 +165,12 @@ static int ____call_usermodehelper(void *data)
166 /* We can run anywhere, unlike our parent keventd(). */ 165 /* We can run anywhere, unlike our parent keventd(). */
167 set_cpus_allowed(current, CPU_MASK_ALL); 166 set_cpus_allowed(current, CPU_MASK_ALL);
168 167
168 /*
169 * Our parent is keventd, which runs with elevated scheduling priority.
170 * Avoid propagating that into the userspace child.
171 */
172 set_user_nice(current, 0);
173
169 retval = -EPERM; 174 retval = -EPERM;
170 if (current->fs->root) 175 if (current->fs->root)
171 retval = kernel_execve(sub_info->path, 176 retval = kernel_execve(sub_info->path,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d25a9ada3f8e..9e47d8c493f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,16 +35,19 @@
35#include <linux/hash.h> 35#include <linux/hash.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h>
38#include <linux/module.h> 39#include <linux/module.h>
39#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
41#include <linux/freezer.h> 42#include <linux/freezer.h>
42#include <linux/seq_file.h> 43#include <linux/seq_file.h>
43#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h>
46
44#include <asm-generic/sections.h> 47#include <asm-generic/sections.h>
45#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
46#include <asm/errno.h> 49#include <asm/errno.h>
47#include <asm/kdebug.h> 50#include <asm/uaccess.h>
48 51
49#define KPROBE_HASH_BITS 6 52#define KPROBE_HASH_BITS 6
50#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 53#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
@@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
63static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
64static atomic_t kprobe_count; 67static atomic_t kprobe_count;
65 68
69/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled;
71
66DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
67DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 73DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
68static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 74static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
132 struct kprobe_insn_page *kip; 138 struct kprobe_insn_page *kip;
133 struct hlist_node *pos; 139 struct hlist_node *pos;
134 140
135 retry: 141 retry:
136 hlist_for_each(pos, &kprobe_insn_pages) { 142 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
137 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
138 if (kip->nused < INSNS_PER_PAGE) { 143 if (kip->nused < INSNS_PER_PAGE) {
139 int i; 144 int i;
140 for (i = 0; i < INSNS_PER_PAGE; i++) { 145 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
155 } 160 }
156 /* All out of space. Need to allocate a new page. Use slot 0. */ 161 /* All out of space. Need to allocate a new page. Use slot 0. */
157 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 162 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
158 if (!kip) { 163 if (!kip)
159 return NULL; 164 return NULL;
160 }
161 165
162 /* 166 /*
163 * Use module_alloc so this page is within +/- 2GB of where the 167 * Use module_alloc so this page is within +/- 2GB of where the
@@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void)
213 if (check_safety() != 0) 217 if (check_safety() != 0)
214 return -EAGAIN; 218 return -EAGAIN;
215 219
216 hlist_for_each_safe(pos, next, &kprobe_insn_pages) { 220 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
217 int i; 221 int i;
218 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
219 if (kip->ngarbage == 0) 222 if (kip->ngarbage == 0)
220 continue; 223 continue;
221 kip->ngarbage = 0; /* we will collect all garbages */ 224 kip->ngarbage = 0; /* we will collect all garbages */
@@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
234 struct kprobe_insn_page *kip; 237 struct kprobe_insn_page *kip;
235 struct hlist_node *pos; 238 struct hlist_node *pos;
236 239
237 hlist_for_each(pos, &kprobe_insn_pages) { 240 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
238 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
239 if (kip->insns <= slot && 241 if (kip->insns <= slot &&
240 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 242 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
241 int i = (slot - kip->insns) / MAX_INSN_SIZE; 243 int i = (slot - kip->insns) / MAX_INSN_SIZE;
@@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
248 break; 250 break;
249 } 251 }
250 } 252 }
251 if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { 253
254 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
252 collect_garbage_slots(); 255 collect_garbage_slots();
253 }
254} 256}
255#endif 257#endif
256 258
@@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
316 reset_kprobe_instance(); 318 reset_kprobe_instance();
317 } 319 }
318 } 320 }
319 return;
320} 321}
321 322
322static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 323static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
@@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
362} 363}
363 364
364/* Called with kretprobe_lock held */ 365/* Called with kretprobe_lock held */
365struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
366{
367 struct hlist_node *node;
368 struct kretprobe_instance *ri;
369 hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
370 return ri;
371 return NULL;
372}
373
374/* Called with kretprobe_lock held */
375static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
376 *rp)
377{
378 struct hlist_node *node;
379 struct kretprobe_instance *ri;
380 hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
381 return ri;
382 return NULL;
383}
384
385/* Called with kretprobe_lock held */
386void __kprobes add_rp_inst(struct kretprobe_instance *ri)
387{
388 /*
389 * Remove rp inst off the free list -
390 * Add it back when probed function returns
391 */
392 hlist_del(&ri->uflist);
393
394 /* Add rp inst onto table */
395 INIT_HLIST_NODE(&ri->hlist);
396 hlist_add_head(&ri->hlist,
397 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
398
399 /* Also add this rp inst to the used list. */
400 INIT_HLIST_NODE(&ri->uflist);
401 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
402}
403
404/* Called with kretprobe_lock held */
405void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 366void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
406 struct hlist_head *head) 367 struct hlist_head *head)
407{ 368{
@@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
454static inline void free_rp_inst(struct kretprobe *rp) 415static inline void free_rp_inst(struct kretprobe *rp)
455{ 416{
456 struct kretprobe_instance *ri; 417 struct kretprobe_instance *ri;
457 while ((ri = get_free_rp_inst(rp)) != NULL) { 418 struct hlist_node *pos, *next;
419
420 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
458 hlist_del(&ri->uflist); 421 hlist_del(&ri->uflist);
459 kfree(ri); 422 kfree(ri);
460 } 423 }
@@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
535 498
536static int __kprobes in_kprobes_functions(unsigned long addr) 499static int __kprobes in_kprobes_functions(unsigned long addr)
537{ 500{
538 if (addr >= (unsigned long)__kprobes_text_start 501 if (addr >= (unsigned long)__kprobes_text_start &&
539 && addr < (unsigned long)__kprobes_text_end) 502 addr < (unsigned long)__kprobes_text_end)
540 return -EINVAL; 503 return -EINVAL;
541 return 0; 504 return 0;
542} 505}
@@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p,
563 return -EINVAL; 526 return -EINVAL;
564 p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); 527 p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
565 528
566 if ((!kernel_text_address((unsigned long) p->addr)) || 529 if (!kernel_text_address((unsigned long) p->addr) ||
567 in_kprobes_functions((unsigned long) p->addr)) 530 in_kprobes_functions((unsigned long) p->addr))
568 return -EINVAL; 531 return -EINVAL;
569 532
570 p->mod_refcounted = 0; 533 p->mod_refcounted = 0;
571 /* Check are we probing a module */ 534
572 if ((probed_mod = module_text_address((unsigned long) p->addr))) { 535 /*
536 * Check if are we probing a module.
537 */
538 probed_mod = module_text_address((unsigned long) p->addr);
539 if (probed_mod) {
573 struct module *calling_mod = module_text_address(called_from); 540 struct module *calling_mod = module_text_address(called_from);
574 /* We must allow modules to probe themself and 541 /*
575 * in this case avoid incrementing the module refcount, 542 * We must allow modules to probe themself and in this case
576 * so as to allow unloading of self probing modules. 543 * avoid incrementing the module refcount, so as to allow
544 * unloading of self probing modules.
577 */ 545 */
578 if (calling_mod && (calling_mod != probed_mod)) { 546 if (calling_mod && calling_mod != probed_mod) {
579 if (unlikely(!try_module_get(probed_mod))) 547 if (unlikely(!try_module_get(probed_mod)))
580 return -EINVAL; 548 return -EINVAL;
581 p->mod_refcounted = 1; 549 p->mod_refcounted = 1;
@@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
593 goto out; 561 goto out;
594 } 562 }
595 563
596 if ((ret = arch_prepare_kprobe(p)) != 0) 564 ret = arch_prepare_kprobe(p);
565 if (ret)
597 goto out; 566 goto out;
598 567
599 INIT_HLIST_NODE(&p->hlist); 568 INIT_HLIST_NODE(&p->hlist);
600 hlist_add_head_rcu(&p->hlist, 569 hlist_add_head_rcu(&p->hlist,
601 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 570 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
602 571
603 if (atomic_add_return(1, &kprobe_count) == \ 572 if (kprobe_enabled) {
573 if (atomic_add_return(1, &kprobe_count) == \
604 (ARCH_INACTIVE_KPROBE_COUNT + 1)) 574 (ARCH_INACTIVE_KPROBE_COUNT + 1))
605 register_page_fault_notifier(&kprobe_page_fault_nb); 575 register_page_fault_notifier(&kprobe_page_fault_nb);
606
607 arch_arm_kprobe(p);
608 576
577 arch_arm_kprobe(p);
578 }
609out: 579out:
610 mutex_unlock(&kprobe_mutex); 580 mutex_unlock(&kprobe_mutex);
611 581
@@ -616,8 +586,7 @@ out:
616 586
617int __kprobes register_kprobe(struct kprobe *p) 587int __kprobes register_kprobe(struct kprobe *p)
618{ 588{
619 return __register_kprobe(p, 589 return __register_kprobe(p, (unsigned long)__builtin_return_address(0));
620 (unsigned long)__builtin_return_address(0));
621} 590}
622 591
623void __kprobes unregister_kprobe(struct kprobe *p) 592void __kprobes unregister_kprobe(struct kprobe *p)
@@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
641 return; 610 return;
642 } 611 }
643valid_p: 612valid_p:
644 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && 613 if (old_p == p ||
645 (p->list.next == &old_p->list) && 614 (old_p->pre_handler == aggr_pre_handler &&
646 (p->list.prev == &old_p->list))) { 615 p->list.next == &old_p->list && p->list.prev == &old_p->list)) {
647 /* Only probe on the hash list */ 616 /*
648 arch_disarm_kprobe(p); 617 * Only probe on the hash list. Disarm only if kprobes are
618 * enabled - otherwise, the breakpoint would already have
619 * been removed. We save on flushing icache.
620 */
621 if (kprobe_enabled)
622 arch_disarm_kprobe(p);
649 hlist_del_rcu(&old_p->hlist); 623 hlist_del_rcu(&old_p->hlist);
650 cleanup_p = 1; 624 cleanup_p = 1;
651 } else { 625 } else {
@@ -656,9 +630,11 @@ valid_p:
656 mutex_unlock(&kprobe_mutex); 630 mutex_unlock(&kprobe_mutex);
657 631
658 synchronize_sched(); 632 synchronize_sched();
659 if (p->mod_refcounted && 633 if (p->mod_refcounted) {
660 (mod = module_text_address((unsigned long)p->addr))) 634 mod = module_text_address((unsigned long)p->addr);
661 module_put(mod); 635 if (mod)
636 module_put(mod);
637 }
662 638
663 if (cleanup_p) { 639 if (cleanup_p) {
664 if (p != old_p) { 640 if (p != old_p) {
@@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
729 705
730 /*TODO: consider to only swap the RA after the last pre_handler fired */ 706 /*TODO: consider to only swap the RA after the last pre_handler fired */
731 spin_lock_irqsave(&kretprobe_lock, flags); 707 spin_lock_irqsave(&kretprobe_lock, flags);
732 arch_prepare_kretprobe(rp, regs); 708 if (!hlist_empty(&rp->free_instances)) {
709 struct kretprobe_instance *ri;
710
711 ri = hlist_entry(rp->free_instances.first,
712 struct kretprobe_instance, uflist);
713 ri->rp = rp;
714 ri->task = current;
715 arch_prepare_kretprobe(ri, regs);
716
717 /* XXX(hch): why is there no hlist_move_head? */
718 hlist_del(&ri->uflist);
719 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
720 hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
721 } else
722 rp->nmissed++;
733 spin_unlock_irqrestore(&kretprobe_lock, flags); 723 spin_unlock_irqrestore(&kretprobe_lock, flags);
734 return 0; 724 return 0;
735} 725}
@@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
792{ 782{
793 unsigned long flags; 783 unsigned long flags;
794 struct kretprobe_instance *ri; 784 struct kretprobe_instance *ri;
785 struct hlist_node *pos, *next;
795 786
796 unregister_kprobe(&rp->kp); 787 unregister_kprobe(&rp->kp);
788
797 /* No race here */ 789 /* No race here */
798 spin_lock_irqsave(&kretprobe_lock, flags); 790 spin_lock_irqsave(&kretprobe_lock, flags);
799 while ((ri = get_used_rp_inst(rp)) != NULL) { 791 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
800 ri->rp = NULL; 792 ri->rp = NULL;
801 hlist_del(&ri->uflist); 793 hlist_del(&ri->uflist);
802 } 794 }
@@ -816,6 +808,9 @@ static int __init init_kprobes(void)
816 } 808 }
817 atomic_set(&kprobe_count, 0); 809 atomic_set(&kprobe_count, 0);
818 810
811 /* By default, kprobes are enabled */
812 kprobe_enabled = true;
813
819 err = arch_init_kprobes(); 814 err = arch_init_kprobes();
820 if (!err) 815 if (!err)
821 err = register_die_notifier(&kprobe_exceptions_nb); 816 err = register_die_notifier(&kprobe_exceptions_nb);
@@ -825,7 +820,7 @@ static int __init init_kprobes(void)
825 820
826#ifdef CONFIG_DEBUG_FS 821#ifdef CONFIG_DEBUG_FS
827static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 822static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
828 const char *sym, int offset,char *modname) 823 const char *sym, int offset,char *modname)
829{ 824{
830 char *kprobe_type; 825 char *kprobe_type;
831 826
@@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
867 struct kprobe *p, *kp; 862 struct kprobe *p, *kp;
868 const char *sym = NULL; 863 const char *sym = NULL;
869 unsigned int i = *(loff_t *) v; 864 unsigned int i = *(loff_t *) v;
870 unsigned long size, offset = 0; 865 unsigned long offset = 0;
871 char *modname, namebuf[128]; 866 char *modname, namebuf[128];
872 867
873 head = &kprobe_table[i]; 868 head = &kprobe_table[i];
874 preempt_disable(); 869 preempt_disable();
875 hlist_for_each_entry_rcu(p, node, head, hlist) { 870 hlist_for_each_entry_rcu(p, node, head, hlist) {
876 sym = kallsyms_lookup((unsigned long)p->addr, &size, 871 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
877 &offset, &modname, namebuf); 872 &offset, &modname, namebuf);
878 if (p->pre_handler == aggr_pre_handler) { 873 if (p->pre_handler == aggr_pre_handler) {
879 list_for_each_entry_rcu(kp, &p->list, list) 874 list_for_each_entry_rcu(kp, &p->list, list)
@@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = {
904 .release = seq_release, 899 .release = seq_release,
905}; 900};
906 901
902static void __kprobes enable_all_kprobes(void)
903{
904 struct hlist_head *head;
905 struct hlist_node *node;
906 struct kprobe *p;
907 unsigned int i;
908
909 mutex_lock(&kprobe_mutex);
910
911 /* If kprobes are already enabled, just return */
912 if (kprobe_enabled)
913 goto already_enabled;
914
915 /*
916 * Re-register the page fault notifier only if there are any
917 * active probes at the time of enabling kprobes globally
918 */
919 if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT)
920 register_page_fault_notifier(&kprobe_page_fault_nb);
921
922 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
923 head = &kprobe_table[i];
924 hlist_for_each_entry_rcu(p, node, head, hlist)
925 arch_arm_kprobe(p);
926 }
927
928 kprobe_enabled = true;
929 printk(KERN_INFO "Kprobes globally enabled\n");
930
931already_enabled:
932 mutex_unlock(&kprobe_mutex);
933 return;
934}
935
936static void __kprobes disable_all_kprobes(void)
937{
938 struct hlist_head *head;
939 struct hlist_node *node;
940 struct kprobe *p;
941 unsigned int i;
942
943 mutex_lock(&kprobe_mutex);
944
945 /* If kprobes are already disabled, just return */
946 if (!kprobe_enabled)
947 goto already_disabled;
948
949 kprobe_enabled = false;
950 printk(KERN_INFO "Kprobes globally disabled\n");
951 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
952 head = &kprobe_table[i];
953 hlist_for_each_entry_rcu(p, node, head, hlist) {
954 if (!arch_trampoline_kprobe(p))
955 arch_disarm_kprobe(p);
956 }
957 }
958
959 mutex_unlock(&kprobe_mutex);
960 /* Allow all currently running kprobes to complete */
961 synchronize_sched();
962
963 mutex_lock(&kprobe_mutex);
964 /* Unconditionally unregister the page_fault notifier */
965 unregister_page_fault_notifier(&kprobe_page_fault_nb);
966
967already_disabled:
968 mutex_unlock(&kprobe_mutex);
969 return;
970}
971
972/*
973 * XXX: The debugfs bool file interface doesn't allow for callbacks
974 * when the bool state is switched. We can reuse that facility when
975 * available
976 */
977static ssize_t read_enabled_file_bool(struct file *file,
978 char __user *user_buf, size_t count, loff_t *ppos)
979{
980 char buf[3];
981
982 if (kprobe_enabled)
983 buf[0] = '1';
984 else
985 buf[0] = '0';
986 buf[1] = '\n';
987 buf[2] = 0x00;
988 return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
989}
990
991static ssize_t write_enabled_file_bool(struct file *file,
992 const char __user *user_buf, size_t count, loff_t *ppos)
993{
994 char buf[32];
995 int buf_size;
996
997 buf_size = min(count, (sizeof(buf)-1));
998 if (copy_from_user(buf, user_buf, buf_size))
999 return -EFAULT;
1000
1001 switch (buf[0]) {
1002 case 'y':
1003 case 'Y':
1004 case '1':
1005 enable_all_kprobes();
1006 break;
1007 case 'n':
1008 case 'N':
1009 case '0':
1010 disable_all_kprobes();
1011 break;
1012 }
1013
1014 return count;
1015}
1016
1017static struct file_operations fops_kp = {
1018 .read = read_enabled_file_bool,
1019 .write = write_enabled_file_bool,
1020};
1021
907static int __kprobes debugfs_kprobe_init(void) 1022static int __kprobes debugfs_kprobe_init(void)
908{ 1023{
909 struct dentry *dir, *file; 1024 struct dentry *dir, *file;
1025 unsigned int value = 1;
910 1026
911 dir = debugfs_create_dir("kprobes", NULL); 1027 dir = debugfs_create_dir("kprobes", NULL);
912 if (!dir) 1028 if (!dir)
913 return -ENOMEM; 1029 return -ENOMEM;
914 1030
915 file = debugfs_create_file("list", 0444, dir , 0 , 1031 file = debugfs_create_file("list", 0444, dir, NULL,
916 &debugfs_kprobes_operations); 1032 &debugfs_kprobes_operations);
917 if (!file) { 1033 if (!file) {
918 debugfs_remove(dir); 1034 debugfs_remove(dir);
919 return -ENOMEM; 1035 return -ENOMEM;
920 } 1036 }
921 1037
1038 file = debugfs_create_file("enabled", 0600, dir,
1039 &value, &fops_kp);
1040 if (!file) {
1041 debugfs_remove(dir);
1042 return -ENOMEM;
1043 }
1044
922 return 0; 1045 return 0;
923} 1046}
924 1047
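For context, the "enabled" file created above is a plain debugfs boolean: writing '1', 'y' or 'Y' re-arms every registered probe via enable_all_kprobes(), writing '0', 'n' or 'N' disarms them via disable_all_kprobes(), and reading it returns the current state. A small userspace sketch of driving that control, assuming debugfs is mounted at the usual /sys/kernel/debug:

/* Userspace sketch: toggle the kprobes "enabled" debugfs file.
 * Assumes debugfs is mounted at /sys/kernel/debug. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int write_enabled(const char *val)
{
	int fd = open("/sys/kernel/debug/kprobes/enabled", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	if (write_enabled("0"))		/* globally disarm all kprobes */
		perror("disable");
	if (write_enabled("1"))		/* re-arm them */
		perror("enable");
	return 0;
}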
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7065a687ac54..1a5ff2211d88 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace)
257 trace->entries = stack_trace + nr_stack_trace_entries; 257 trace->entries = stack_trace + nr_stack_trace_entries;
258 258
259 trace->skip = 3; 259 trace->skip = 3;
260 trace->all_contexts = 0;
261 260
262 save_stack_trace(trace, NULL); 261 save_stack_trace(trace);
263 262
264 trace->max_entries = trace->nr_entries; 263 trace->max_entries = trace->nr_entries;
265 264
@@ -341,10 +340,7 @@ static const char *usage_str[] =
341 340
342const char * __get_key_name(struct lockdep_subclass_key *key, char *str) 341const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
343{ 342{
344 unsigned long offs, size; 343 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
345 char *modname;
346
347 return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
348} 344}
349 345
350void 346void
@@ -1313,8 +1309,9 @@ out_unlock_set:
1313 1309
1314/* 1310/*
1315 * Look up a dependency chain. If the key is not present yet then 1311 * Look up a dependency chain. If the key is not present yet then
1316 * add it and return 0 - in this case the new dependency chain is 1312 * add it and return 1 - in this case the new dependency chain is
1317 * validated. If the key is already hashed, return 1. 1313 * validated. If the key is already hashed, return 0.
1314 * (On return with 1 graph_lock is held.)
1318 */ 1315 */
1319static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) 1316static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
1320{ 1317{
@@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
1577 * Mark a lock with a usage bit, and validate the state transition: 1574 * Mark a lock with a usage bit, and validate the state transition:
1578 */ 1575 */
1579static int mark_lock(struct task_struct *curr, struct held_lock *this, 1576static int mark_lock(struct task_struct *curr, struct held_lock *this,
1580 enum lock_usage_bit new_bit, unsigned long ip) 1577 enum lock_usage_bit new_bit)
1581{ 1578{
1582 unsigned int new_mask = 1 << new_bit, ret = 1; 1579 unsigned int new_mask = 1 << new_bit, ret = 1;
1583 1580
@@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1600 1597
1601 this->class->usage_mask |= new_mask; 1598 this->class->usage_mask |= new_mask;
1602 1599
1603#ifdef CONFIG_TRACE_IRQFLAGS
1604 if (new_bit == LOCK_ENABLED_HARDIRQS ||
1605 new_bit == LOCK_ENABLED_HARDIRQS_READ)
1606 ip = curr->hardirq_enable_ip;
1607 else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
1608 new_bit == LOCK_ENABLED_SOFTIRQS_READ)
1609 ip = curr->softirq_enable_ip;
1610#endif
1611 if (!save_trace(this->class->usage_traces + new_bit)) 1600 if (!save_trace(this->class->usage_traces + new_bit))
1612 return 0; 1601 return 0;
1613 1602
@@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1806 * Mark all held locks with a usage bit: 1795 * Mark all held locks with a usage bit:
1807 */ 1796 */
1808static int 1797static int
1809mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) 1798mark_held_locks(struct task_struct *curr, int hardirq)
1810{ 1799{
1811 enum lock_usage_bit usage_bit; 1800 enum lock_usage_bit usage_bit;
1812 struct held_lock *hlock; 1801 struct held_lock *hlock;
@@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
1826 else 1815 else
1827 usage_bit = LOCK_ENABLED_SOFTIRQS; 1816 usage_bit = LOCK_ENABLED_SOFTIRQS;
1828 } 1817 }
1829 if (!mark_lock(curr, hlock, usage_bit, ip)) 1818 if (!mark_lock(curr, hlock, usage_bit))
1830 return 0; 1819 return 0;
1831 } 1820 }
1832 1821
@@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void)
1879 * We are going to turn hardirqs on, so set the 1868 * We are going to turn hardirqs on, so set the
1880 * usage bit for all held locks: 1869 * usage bit for all held locks:
1881 */ 1870 */
1882 if (!mark_held_locks(curr, 1, ip)) 1871 if (!mark_held_locks(curr, 1))
1883 return; 1872 return;
1884 /* 1873 /*
1885 * If we have softirqs enabled, then set the usage 1874 * If we have softirqs enabled, then set the usage
@@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void)
1887 * this bit from being set before) 1876 * this bit from being set before)
1888 */ 1877 */
1889 if (curr->softirqs_enabled) 1878 if (curr->softirqs_enabled)
1890 if (!mark_held_locks(curr, 0, ip)) 1879 if (!mark_held_locks(curr, 0))
1891 return; 1880 return;
1892 1881
1893 curr->hardirq_enable_ip = ip; 1882 curr->hardirq_enable_ip = ip;
@@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip)
1955 * enabled too: 1944 * enabled too:
1956 */ 1945 */
1957 if (curr->hardirqs_enabled) 1946 if (curr->hardirqs_enabled)
1958 mark_held_locks(curr, 0, ip); 1947 mark_held_locks(curr, 0);
1959} 1948}
1960 1949
1961/* 1950/*
@@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2093 if (read) { 2082 if (read) {
2094 if (curr->hardirq_context) 2083 if (curr->hardirq_context)
2095 if (!mark_lock(curr, hlock, 2084 if (!mark_lock(curr, hlock,
2096 LOCK_USED_IN_HARDIRQ_READ, ip)) 2085 LOCK_USED_IN_HARDIRQ_READ))
2097 return 0; 2086 return 0;
2098 if (curr->softirq_context) 2087 if (curr->softirq_context)
2099 if (!mark_lock(curr, hlock, 2088 if (!mark_lock(curr, hlock,
2100 LOCK_USED_IN_SOFTIRQ_READ, ip)) 2089 LOCK_USED_IN_SOFTIRQ_READ))
2101 return 0; 2090 return 0;
2102 } else { 2091 } else {
2103 if (curr->hardirq_context) 2092 if (curr->hardirq_context)
2104 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) 2093 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2105 return 0; 2094 return 0;
2106 if (curr->softirq_context) 2095 if (curr->softirq_context)
2107 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) 2096 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2108 return 0; 2097 return 0;
2109 } 2098 }
2110 } 2099 }
2111 if (!hardirqs_off) { 2100 if (!hardirqs_off) {
2112 if (read) { 2101 if (read) {
2113 if (!mark_lock(curr, hlock, 2102 if (!mark_lock(curr, hlock,
2114 LOCK_ENABLED_HARDIRQS_READ, ip)) 2103 LOCK_ENABLED_HARDIRQS_READ))
2115 return 0; 2104 return 0;
2116 if (curr->softirqs_enabled) 2105 if (curr->softirqs_enabled)
2117 if (!mark_lock(curr, hlock, 2106 if (!mark_lock(curr, hlock,
2118 LOCK_ENABLED_SOFTIRQS_READ, ip)) 2107 LOCK_ENABLED_SOFTIRQS_READ))
2119 return 0; 2108 return 0;
2120 } else { 2109 } else {
2121 if (!mark_lock(curr, hlock, 2110 if (!mark_lock(curr, hlock,
2122 LOCK_ENABLED_HARDIRQS, ip)) 2111 LOCK_ENABLED_HARDIRQS))
2123 return 0; 2112 return 0;
2124 if (curr->softirqs_enabled) 2113 if (curr->softirqs_enabled)
2125 if (!mark_lock(curr, hlock, 2114 if (!mark_lock(curr, hlock,
2126 LOCK_ENABLED_SOFTIRQS, ip)) 2115 LOCK_ENABLED_SOFTIRQS))
2127 return 0; 2116 return 0;
2128 } 2117 }
2129 } 2118 }
2130#endif 2119#endif
2131 /* mark it as used: */ 2120 /* mark it as used: */
2132 if (!mark_lock(curr, hlock, LOCK_USED, ip)) 2121 if (!mark_lock(curr, hlock, LOCK_USED))
2133 return 0; 2122 return 0;
2134out_calc_hash: 2123out_calc_hash:
2135 /* 2124 /*
diff --git a/kernel/module.c b/kernel/module.c
index 1eb8ca565ba0..d36e45477fac 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -19,6 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kallsyms.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
@@ -310,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size)
310{ 311{
311 /* Reallocation required? */ 312 /* Reallocation required? */
312 if (pcpu_num_used + 1 > pcpu_num_allocated) { 313 if (pcpu_num_used + 1 > pcpu_num_allocated) {
313 int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, 314 int *new;
314 GFP_KERNEL); 315
316 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
317 GFP_KERNEL);
315 if (!new) 318 if (!new)
316 return 0; 319 return 0;
317 320
318 memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
319 pcpu_num_allocated *= 2; 321 pcpu_num_allocated *= 2;
320 kfree(pcpu_size);
321 pcpu_size = new; 322 pcpu_size = new;
322 } 323 }
323 324
@@ -1471,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1471} 1472}
1472 1473
1473#ifdef CONFIG_KALLSYMS 1474#ifdef CONFIG_KALLSYMS
1474int is_exported(const char *name, const struct module *mod) 1475static int is_exported(const char *name, const struct module *mod)
1475{ 1476{
1476 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1477 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1477 return 1; 1478 return 1;
@@ -2097,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod,
2097 if (!best) 2098 if (!best)
2098 return NULL; 2099 return NULL;
2099 2100
2100 *size = nextval - mod->symtab[best].st_value; 2101 if (size)
2101 *offset = addr - mod->symtab[best].st_value; 2102 *size = nextval - mod->symtab[best].st_value;
2103 if (offset)
2104 *offset = addr - mod->symtab[best].st_value;
2102 return mod->strtab + mod->symtab[best].st_name; 2105 return mod->strtab + mod->symtab[best].st_name;
2103} 2106}
2104 2107
@@ -2123,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr,
2123 return NULL; 2126 return NULL;
2124} 2127}
2125 2128
2126struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, 2129int lookup_module_symbol_name(unsigned long addr, char *symname)
2127 char *type, char *name, size_t namelen) 2130{
2131 struct module *mod;
2132
2133 mutex_lock(&module_mutex);
2134 list_for_each_entry(mod, &modules, list) {
2135 if (within(addr, mod->module_init, mod->init_size) ||
2136 within(addr, mod->module_core, mod->core_size)) {
2137 const char *sym;
2138
2139 sym = get_ksymbol(mod, addr, NULL, NULL);
2140 if (!sym)
2141 goto out;
2142 strlcpy(symname, sym, KSYM_NAME_LEN + 1);
2143 mutex_unlock(&module_mutex);
2144 return 0;
2145 }
2146 }
2147out:
2148 mutex_unlock(&module_mutex);
2149 return -ERANGE;
2150}
2151
2152int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2153 unsigned long *offset, char *modname, char *name)
2154{
2155 struct module *mod;
2156
2157 mutex_lock(&module_mutex);
2158 list_for_each_entry(mod, &modules, list) {
2159 if (within(addr, mod->module_init, mod->init_size) ||
2160 within(addr, mod->module_core, mod->core_size)) {
2161 const char *sym;
2162
2163 sym = get_ksymbol(mod, addr, size, offset);
2164 if (!sym)
2165 goto out;
2166 if (modname)
2167 strlcpy(modname, mod->name, MODULE_NAME_LEN + 1);
2168 if (name)
2169 strlcpy(name, sym, KSYM_NAME_LEN + 1);
2170 mutex_unlock(&module_mutex);
2171 return 0;
2172 }
2173 }
2174out:
2175 mutex_unlock(&module_mutex);
2176 return -ERANGE;
2177}
2178
2179int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2180 char *name, char *module_name, int *exported)
2128{ 2181{
2129 struct module *mod; 2182 struct module *mod;
2130 2183
@@ -2134,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
2134 *value = mod->symtab[symnum].st_value; 2187 *value = mod->symtab[symnum].st_value;
2135 *type = mod->symtab[symnum].st_info; 2188 *type = mod->symtab[symnum].st_info;
2136 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2189 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2137 namelen); 2190 KSYM_NAME_LEN + 1);
2191 strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1);
2192 *exported = is_exported(name, mod);
2138 mutex_unlock(&module_mutex); 2193 mutex_unlock(&module_mutex);
2139 return mod; 2194 return 0;
2140 } 2195 }
2141 symnum -= mod->num_symtab; 2196 symnum -= mod->num_symtab;
2142 } 2197 }
2143 mutex_unlock(&module_mutex); 2198 mutex_unlock(&module_mutex);
2144 return NULL; 2199 return -ERANGE;
2145} 2200}
2146 2201
2147static unsigned long mod_find_symname(struct module *mod, const char *name) 2202static unsigned long mod_find_symname(struct module *mod, const char *name)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5b9ee6f6bbb..1bc4b55241a8 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk)
38 38
39/* 39/*
40 * creates a copy of "orig" with refcount 1. 40 * creates a copy of "orig" with refcount 1.
41 * This does not grab references to the contained namespaces,
42 * so that needs to be done by dup_namespaces.
43 */ 41 */
44static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) 42static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
45{ 43{
46 struct nsproxy *ns; 44 struct nsproxy *ns;
47 45
@@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
52} 50}
53 51
54/* 52/*
55 * copies the nsproxy, setting refcount to 1, and grabbing a 53 * Create new nsproxy and all of its associated namespaces.
56 * reference to all contained namespaces. Called from 54 * Return the newly created nsproxy. Do not attach this to the task,
57 * sys_unshare() 55 * leave it to the caller to do proper locking and attach it to task.
58 */ 56 */
59struct nsproxy *dup_namespaces(struct nsproxy *orig) 57static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
58 struct fs_struct *new_fs)
60{ 59{
61 struct nsproxy *ns = clone_namespaces(orig); 60 struct nsproxy *new_nsp;
62 61
63 if (ns) { 62 new_nsp = clone_nsproxy(tsk->nsproxy);
64 if (ns->mnt_ns) 63 if (!new_nsp)
65 get_mnt_ns(ns->mnt_ns); 64 return ERR_PTR(-ENOMEM);
66 if (ns->uts_ns)
67 get_uts_ns(ns->uts_ns);
68 if (ns->ipc_ns)
69 get_ipc_ns(ns->ipc_ns);
70 if (ns->pid_ns)
71 get_pid_ns(ns->pid_ns);
72 }
73 65
74 return ns; 66 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
67 if (IS_ERR(new_nsp->mnt_ns))
68 goto out_ns;
69
70 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
71 if (IS_ERR(new_nsp->uts_ns))
72 goto out_uts;
73
74 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
75 if (IS_ERR(new_nsp->ipc_ns))
76 goto out_ipc;
77
78 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
79 if (IS_ERR(new_nsp->pid_ns))
80 goto out_pid;
81
82 return new_nsp;
83
84out_pid:
85 if (new_nsp->ipc_ns)
86 put_ipc_ns(new_nsp->ipc_ns);
87out_ipc:
88 if (new_nsp->uts_ns)
89 put_uts_ns(new_nsp->uts_ns);
90out_uts:
91 if (new_nsp->mnt_ns)
92 put_mnt_ns(new_nsp->mnt_ns);
93out_ns:
94 kfree(new_nsp);
95 return ERR_PTR(-ENOMEM);
75} 96}
76 97
77/* 98/*
@@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk)
92 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 113 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
93 return 0; 114 return 0;
94 115
95 new_ns = clone_namespaces(old_ns); 116 if (!capable(CAP_SYS_ADMIN)) {
96 if (!new_ns) { 117 err = -EPERM;
97 err = -ENOMEM;
98 goto out; 118 goto out;
99 } 119 }
100 120
101 tsk->nsproxy = new_ns; 121 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
102 122 if (IS_ERR(new_ns)) {
103 err = copy_mnt_ns(flags, tsk); 123 err = PTR_ERR(new_ns);
104 if (err) 124 goto out;
105 goto out_ns; 125 }
106
107 err = copy_utsname(flags, tsk);
108 if (err)
109 goto out_uts;
110
111 err = copy_ipcs(flags, tsk);
112 if (err)
113 goto out_ipc;
114
115 err = copy_pid_ns(flags, tsk);
116 if (err)
117 goto out_pid;
118 126
127 tsk->nsproxy = new_ns;
119out: 128out:
120 put_nsproxy(old_ns); 129 put_nsproxy(old_ns);
121 return err; 130 return err;
122
123out_pid:
124 if (new_ns->ipc_ns)
125 put_ipc_ns(new_ns->ipc_ns);
126out_ipc:
127 if (new_ns->uts_ns)
128 put_uts_ns(new_ns->uts_ns);
129out_uts:
130 if (new_ns->mnt_ns)
131 put_mnt_ns(new_ns->mnt_ns);
132out_ns:
133 tsk->nsproxy = old_ns;
134 kfree(new_ns);
135 goto out;
136} 131}
137 132
138void free_nsproxy(struct nsproxy *ns) 133void free_nsproxy(struct nsproxy *ns)
@@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns)
147 put_pid_ns(ns->pid_ns); 142 put_pid_ns(ns->pid_ns);
148 kfree(ns); 143 kfree(ns);
149} 144}
145
146/*
147 * Called from unshare. Unshare all the namespaces that are part of nsproxy.
148 * On success, returns the new nsproxy and a reference to the old nsproxy
149 * to make sure it stays around.
150 */
151int unshare_nsproxy_namespaces(unsigned long unshare_flags,
152 struct nsproxy **new_nsp, struct fs_struct *new_fs)
153{
154 struct nsproxy *old_ns = current->nsproxy;
155 int err = 0;
156
157 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
158 return 0;
159
160#ifndef CONFIG_IPC_NS
161 if (unshare_flags & CLONE_NEWIPC)
162 return -EINVAL;
163#endif
164
165#ifndef CONFIG_UTS_NS
166 if (unshare_flags & CLONE_NEWUTS)
167 return -EINVAL;
168#endif
169
170 if (!capable(CAP_SYS_ADMIN))
171 return -EPERM;
172
173 get_nsproxy(old_ns);
174
175 *new_nsp = create_new_namespaces(unshare_flags, current,
176 new_fs ? new_fs : current->fs);
177 if (IS_ERR(*new_nsp)) {
178 err = PTR_ERR(*new_nsp);
179 put_nsproxy(old_ns);
180 }
181 return err;
182}
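The error path of create_new_namespaces() above is the standard goto-unwind idiom: each namespace copied so far is released in reverse order before the half-built nsproxy is freed. A self-contained userspace sketch of the same control flow, with plain malloc()/free() standing in for the namespace get/put helpers:

/* Userspace sketch of the goto-unwind idiom used above: acquire
 * resources in order, release the ones already held in reverse
 * order on failure, then free the container itself. */
#include <stdio.h>
#include <stdlib.h>

struct pair {
	char *a;
	char *b;
};

static struct pair *make_pair(int fail_second)
{
	struct pair *p = malloc(sizeof(*p));

	if (!p)
		return NULL;

	p->a = malloc(16);			/* resource #1 */
	if (!p->a)
		goto out_free;

	p->b = fail_second ? NULL : malloc(16);	/* resource #2, may "fail" */
	if (!p->b)
		goto out_put_a;

	return p;

out_put_a:
	free(p->a);				/* undo in reverse order */
out_free:
	free(p);
	return NULL;
}

int main(void)
{
	printf("%s\n", make_pair(1) ? "built" : "unwound cleanly");
	return 0;
}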
diff --git a/kernel/params.c b/kernel/params.c
index 312172320b4c..e61c46c97ce7 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
269 return param_get_bool(buffer, &dummy); 269 return param_get_bool(buffer, &dummy);
270} 270}
271 271
272/* We cheat here and temporarily mangle the string. */ 272/* We break the rule and mangle the string. */
273static int param_array(const char *name, 273static int param_array(const char *name,
274 const char *val, 274 const char *val,
275 unsigned int min, unsigned int max, 275 unsigned int min, unsigned int max,
diff --git a/kernel/pid.c b/kernel/pid.c
index 9c80bc23d6b8..d3ad724afa83 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr)
360} 360}
361EXPORT_SYMBOL_GPL(find_get_pid); 361EXPORT_SYMBOL_GPL(find_get_pid);
362 362
363int copy_pid_ns(int flags, struct task_struct *tsk) 363struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns)
364{ 364{
365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; 365 BUG_ON(!old_ns);
366 int err = 0;
367
368 if (!old_ns)
369 return 0;
370
371 get_pid_ns(old_ns); 366 get_pid_ns(old_ns);
372 return err; 367 return old_ns;
373} 368}
374 369
375void free_pid_ns(struct kref *kref) 370void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 657f77697415..1de710e18373 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk,
971 maxfire = 20; 971 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 972 tsk->it_prof_expires = cputime_zero;
973 while (!list_empty(timers)) { 973 while (!list_empty(timers)) {
974 struct cpu_timer_list *t = list_entry(timers->next, 974 struct cpu_timer_list *t = list_first_entry(timers,
975 struct cpu_timer_list, 975 struct cpu_timer_list,
976 entry); 976 entry);
977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
@@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk,
986 maxfire = 20; 986 maxfire = 20;
987 tsk->it_virt_expires = cputime_zero; 987 tsk->it_virt_expires = cputime_zero;
988 while (!list_empty(timers)) { 988 while (!list_empty(timers)) {
989 struct cpu_timer_list *t = list_entry(timers->next, 989 struct cpu_timer_list *t = list_first_entry(timers,
990 struct cpu_timer_list, 990 struct cpu_timer_list,
991 entry); 991 entry);
992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
@@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk,
1001 maxfire = 20; 1001 maxfire = 20;
1002 tsk->it_sched_expires = 0; 1002 tsk->it_sched_expires = 0;
1003 while (!list_empty(timers)) { 1003 while (!list_empty(timers)) {
1004 struct cpu_timer_list *t = list_entry(timers->next, 1004 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 1005 struct cpu_timer_list,
1006 entry); 1006 entry);
1007 if (!--maxfire || tsk->sched_time < t->expires.sched) { 1007 if (!--maxfire || tsk->sched_time < t->expires.sched) {
@@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk,
1057 maxfire = 20; 1057 maxfire = 20;
1058 prof_expires = cputime_zero; 1058 prof_expires = cputime_zero;
1059 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1060 struct cpu_timer_list *t = list_entry(timers->next, 1060 struct cpu_timer_list *t = list_first_entry(timers,
1061 struct cpu_timer_list, 1061 struct cpu_timer_list,
1062 entry); 1062 entry);
1063 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { 1063 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
@@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk,
1072 maxfire = 20; 1072 maxfire = 20;
1073 virt_expires = cputime_zero; 1073 virt_expires = cputime_zero;
1074 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1075 struct cpu_timer_list *t = list_entry(timers->next, 1075 struct cpu_timer_list *t = list_first_entry(timers,
1076 struct cpu_timer_list, 1076 struct cpu_timer_list,
1077 entry); 1077 entry);
1078 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { 1078 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
@@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk,
1087 maxfire = 20; 1087 maxfire = 20;
1088 sched_expires = 0; 1088 sched_expires = 0;
1089 while (!list_empty(timers)) { 1089 while (!list_empty(timers)) {
1090 struct cpu_timer_list *t = list_entry(timers->next, 1090 struct cpu_timer_list *t = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || sched_time < t->expires.sched) { 1093 if (!--maxfire || sched_time < t->expires.sched) {
@@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1400 */ 1400 */
1401 head = &tsk->signal->cpu_timers[clock_idx]; 1401 head = &tsk->signal->cpu_timers[clock_idx];
1402 if (list_empty(head) || 1402 if (list_empty(head) ||
1403 cputime_ge(list_entry(head->next, 1403 cputime_ge(list_first_entry(head,
1404 struct cpu_timer_list, entry)->expires.cpu, 1404 struct cpu_timer_list, entry)->expires.cpu,
1405 *newval)) { 1405 *newval)) {
1406 /* 1406 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 44318ca71978..588c99da0307 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -31,7 +31,6 @@
31 * POSIX clocks & timers 31 * POSIX clocks & timers
32 */ 32 */
33#include <linux/mm.h> 33#include <linux/mm.h>
34#include <linux/smp_lock.h>
35#include <linux/interrupt.h> 34#include <linux/interrupt.h>
36#include <linux/slab.h> 35#include <linux/slab.h>
37#include <linux/time.h> 36#include <linux/time.h>
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0eb5c420e8ed..088419387388 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -8,7 +8,6 @@
8 8
9#undef DEBUG 9#undef DEBUG
10 10
11#include <linux/smp_lock.h>
12#include <linux/interrupt.h> 11#include <linux/interrupt.h>
13#include <linux/suspend.h> 12#include <linux/suspend.h>
14#include <linux/module.h> 13#include <linux/module.h>
@@ -25,10 +24,9 @@
25 24
26static inline int freezeable(struct task_struct * p) 25static inline int freezeable(struct task_struct * p)
27{ 26{
28 if ((p == current) || 27 if ((p == current) ||
29 (p->flags & PF_NOFREEZE) || 28 (p->flags & PF_NOFREEZE) ||
30 (p->exit_state == EXIT_ZOMBIE) || 29 (p->exit_state != 0))
31 (p->exit_state == EXIT_DEAD))
32 return 0; 30 return 0;
33 return 1; 31 return 1;
34} 32}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 128da11f01c2..b7039772b05c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/smp_lock.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bitops.h> 18#include <linux/bitops.h>
20#include <linux/spinlock.h> 19#include <linux/spinlock.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e83ed9945a80..b8b235cc19d1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,7 +12,6 @@
12 */ 12 */
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/file.h> 15#include <linux/file.h>
17#include <linux/utsname.h> 16#include <linux/utsname.h>
18#include <linux/version.h> 17#include <linux/version.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b47e59248df..0bbdeac2810c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -20,7 +20,6 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/tty.h> 21#include <linux/tty.h>
22#include <linux/tty_driver.h> 22#include <linux/tty_driver.h>
23#include <linux/smp_lock.h>
24#include <linux/console.h> 23#include <linux/console.h>
25#include <linux/init.h> 24#include <linux/init.h>
26#include <linux/module.h> 25#include <linux/module.h>
@@ -931,8 +930,16 @@ void register_console(struct console *console)
931{ 930{
932 int i; 931 int i;
933 unsigned long flags; 932 unsigned long flags;
933 struct console *bootconsole = NULL;
934 934
935 if (preferred_console < 0) 935 if (console_drivers) {
936 if (console->flags & CON_BOOT)
937 return;
938 if (console_drivers->flags & CON_BOOT)
939 bootconsole = console_drivers;
940 }
941
942 if (preferred_console < 0 || bootconsole || !console_drivers)
936 preferred_console = selected_console; 943 preferred_console = selected_console;
937 944
938 /* 945 /*
@@ -978,8 +985,11 @@ void register_console(struct console *console)
978 if (!(console->flags & CON_ENABLED)) 985 if (!(console->flags & CON_ENABLED))
979 return; 986 return;
980 987
981 if (console_drivers && (console_drivers->flags & CON_BOOT)) { 988 if (bootconsole) {
982 unregister_console(console_drivers); 989 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
990 bootconsole->name, bootconsole->index,
991 console->name, console->index);
992 unregister_console(bootconsole);
983 console->flags &= ~CON_PRINTBUFFER; 993 console->flags &= ~CON_PRINTBUFFER;
984 } 994 }
985 995
@@ -1030,16 +1040,11 @@ int unregister_console(struct console *console)
1030 } 1040 }
1031 } 1041 }
1032 1042
1033 /* If last console is removed, we re-enable picking the first 1043 /*
1034 * one that gets registered. Without that, pmac early boot console
1035 * would prevent fbcon from taking over.
1036 *
1037 * If this isn't the last console and it has CON_CONSDEV set, we 1044 * If this isn't the last console and it has CON_CONSDEV set, we
1038 * need to set it on the next preferred console. 1045 * need to set it on the next preferred console.
1039 */ 1046 */
1040 if (console_drivers == NULL) 1047 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1041 preferred_console = selected_console;
1042 else if (console->flags & CON_CONSDEV)
1043 console_drivers->flags |= CON_CONSDEV; 1048 console_drivers->flags |= CON_CONSDEV;
1044 1049
1045 release_console_sem(); 1050 release_console_sem();
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index bcd14e83ef39..55ba82a85a66 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = {
502 .name = "sched" 502 .name = "sched"
503}; 503};
504 504
505static struct rcu_torture_ops *torture_ops[] =
506 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
507 &sched_ops, NULL };
508
509/* 505/*
510 * RCU torture writer kthread. Repeatedly substitutes a new structure 506 * RCU torture writer kthread. Repeatedly substitutes a new structure
511 * for that pointed to by rcu_torture_current, freeing the old structure 507 * for that pointed to by rcu_torture_current, freeing the old structure
@@ -534,7 +530,7 @@ rcu_torture_writer(void *arg)
534 rp->rtort_mbtest = 1; 530 rp->rtort_mbtest = 1;
535 rcu_assign_pointer(rcu_torture_current, rp); 531 rcu_assign_pointer(rcu_torture_current, rp);
536 smp_wmb(); 532 smp_wmb();
537 if (old_rp != NULL) { 533 if (old_rp) {
538 i = old_rp->rtort_pipe_count; 534 i = old_rp->rtort_pipe_count;
539 if (i > RCU_TORTURE_PIPE_LEN) 535 if (i > RCU_TORTURE_PIPE_LEN)
540 i = RCU_TORTURE_PIPE_LEN; 536 i = RCU_TORTURE_PIPE_LEN;
@@ -685,7 +681,7 @@ rcu_torture_printk(char *page)
685 atomic_read(&rcu_torture_wcount[i])); 681 atomic_read(&rcu_torture_wcount[i]));
686 } 682 }
687 cnt += sprintf(&page[cnt], "\n"); 683 cnt += sprintf(&page[cnt], "\n");
688 if (cur_ops->stats != NULL) 684 if (cur_ops->stats)
689 cnt += cur_ops->stats(&page[cnt]); 685 cnt += cur_ops->stats(&page[cnt]);
690 return cnt; 686 return cnt;
691} 687}
@@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void)
749 745
750 set_cpus_allowed(current, tmp_mask); 746 set_cpus_allowed(current, tmp_mask);
751 747
752 if (reader_tasks != NULL) { 748 if (reader_tasks) {
753 for (i = 0; i < nrealreaders; i++) 749 for (i = 0; i < nrealreaders; i++)
754 if (reader_tasks[i]) 750 if (reader_tasks[i])
755 set_cpus_allowed(reader_tasks[i], tmp_mask); 751 set_cpus_allowed(reader_tasks[i], tmp_mask);
756 } 752 }
757 753
758 if (fakewriter_tasks != NULL) { 754 if (fakewriter_tasks) {
759 for (i = 0; i < nfakewriters; i++) 755 for (i = 0; i < nfakewriters; i++)
760 if (fakewriter_tasks[i]) 756 if (fakewriter_tasks[i])
761 set_cpus_allowed(fakewriter_tasks[i], tmp_mask); 757 set_cpus_allowed(fakewriter_tasks[i], tmp_mask);
@@ -808,21 +804,21 @@ rcu_torture_cleanup(void)
808 int i; 804 int i;
809 805
810 fullstop = 1; 806 fullstop = 1;
811 if (shuffler_task != NULL) { 807 if (shuffler_task) {
812 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 808 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
813 kthread_stop(shuffler_task); 809 kthread_stop(shuffler_task);
814 } 810 }
815 shuffler_task = NULL; 811 shuffler_task = NULL;
816 812
817 if (writer_task != NULL) { 813 if (writer_task) {
818 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
819 kthread_stop(writer_task); 815 kthread_stop(writer_task);
820 } 816 }
821 writer_task = NULL; 817 writer_task = NULL;
822 818
823 if (reader_tasks != NULL) { 819 if (reader_tasks) {
824 for (i = 0; i < nrealreaders; i++) { 820 for (i = 0; i < nrealreaders; i++) {
825 if (reader_tasks[i] != NULL) { 821 if (reader_tasks[i]) {
826 VERBOSE_PRINTK_STRING( 822 VERBOSE_PRINTK_STRING(
827 "Stopping rcu_torture_reader task"); 823 "Stopping rcu_torture_reader task");
828 kthread_stop(reader_tasks[i]); 824 kthread_stop(reader_tasks[i]);
@@ -834,9 +830,9 @@ rcu_torture_cleanup(void)
834 } 830 }
835 rcu_torture_current = NULL; 831 rcu_torture_current = NULL;
836 832
837 if (fakewriter_tasks != NULL) { 833 if (fakewriter_tasks) {
838 for (i = 0; i < nfakewriters; i++) { 834 for (i = 0; i < nfakewriters; i++) {
839 if (fakewriter_tasks[i] != NULL) { 835 if (fakewriter_tasks[i]) {
840 VERBOSE_PRINTK_STRING( 836 VERBOSE_PRINTK_STRING(
841 "Stopping rcu_torture_fakewriter task"); 837 "Stopping rcu_torture_fakewriter task");
842 kthread_stop(fakewriter_tasks[i]); 838 kthread_stop(fakewriter_tasks[i]);
@@ -847,7 +843,7 @@ rcu_torture_cleanup(void)
847 fakewriter_tasks = NULL; 843 fakewriter_tasks = NULL;
848 } 844 }
849 845
850 if (stats_task != NULL) { 846 if (stats_task) {
851 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 847 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
852 kthread_stop(stats_task); 848 kthread_stop(stats_task);
853 } 849 }
@@ -858,7 +854,7 @@ rcu_torture_cleanup(void)
858 854
859 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 855 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
860 856
861 if (cur_ops->cleanup != NULL) 857 if (cur_ops->cleanup)
862 cur_ops->cleanup(); 858 cur_ops->cleanup();
863 if (atomic_read(&n_rcu_torture_error)) 859 if (atomic_read(&n_rcu_torture_error))
864 rcu_torture_print_module_parms("End of test: FAILURE"); 860 rcu_torture_print_module_parms("End of test: FAILURE");
@@ -866,27 +862,28 @@ rcu_torture_cleanup(void)
866 rcu_torture_print_module_parms("End of test: SUCCESS"); 862 rcu_torture_print_module_parms("End of test: SUCCESS");
867} 863}
868 864
869static int 865static int __init
870rcu_torture_init(void) 866rcu_torture_init(void)
871{ 867{
872 int i; 868 int i;
873 int cpu; 869 int cpu;
874 int firsterr = 0; 870 int firsterr = 0;
871 static struct rcu_torture_ops *torture_ops[] =
872 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
873 &srcu_ops, &sched_ops, };
875 874
876 /* Process args and tell the world that the torturer is on the job. */ 875 /* Process args and tell the world that the torturer is on the job. */
877 876 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
878 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
879 cur_ops = torture_ops[i]; 877 cur_ops = torture_ops[i];
880 if (strcmp(torture_type, cur_ops->name) == 0) { 878 if (strcmp(torture_type, cur_ops->name) == 0)
881 break; 879 break;
882 }
883 } 880 }
884 if (cur_ops == NULL) { 881 if (i == ARRAY_SIZE(torture_ops)) {
885 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 882 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
886 torture_type); 883 torture_type);
887 return (-EINVAL); 884 return (-EINVAL);
888 } 885 }
889 if (cur_ops->init != NULL) 886 if (cur_ops->init)
890 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 887 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
891 888
892 if (nreaders >= 0) 889 if (nreaders >= 0)
@@ -899,7 +896,7 @@ rcu_torture_init(void)
899 /* Set up the freelist. */ 896 /* Set up the freelist. */
900 897
901 INIT_LIST_HEAD(&rcu_torture_freelist); 898 INIT_LIST_HEAD(&rcu_torture_freelist);
902 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { 899 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
903 rcu_tortures[i].rtort_mbtest = 0; 900 rcu_tortures[i].rtort_mbtest = 0;
904 list_add_tail(&rcu_tortures[i].rtort_free, 901 list_add_tail(&rcu_tortures[i].rtort_free,
905 &rcu_torture_freelist); 902 &rcu_torture_freelist);
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 291ded556aa0..9a87886b022e 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem)
60 int ret = __down_write_trylock(sem); 60 int ret = __down_write_trylock(sem);
61 61
62 if (ret == 1) 62 if (ret == 1)
63 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 63 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
64 return ret; 64 return ret;
65} 65}
66 66
diff --git a/kernel/sched.c b/kernel/sched.c
index 0227f1625a75..a3a04085e794 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
52#include <linux/tsacct_kern.h> 52#include <linux/tsacct_kern.h>
53#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h> 54#include <linux/delayacct.h>
55#include <asm/tlb.h> 55#include <linux/reciprocal_div.h>
56 56
57#include <asm/tlb.h>
57#include <asm/unistd.h> 58#include <asm/unistd.h>
58 59
59/* 60/*
@@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
168 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
169 170
170#define TASK_PREEMPTS_CURR(p, rq) \ 171#define TASK_PREEMPTS_CURR(p, rq) \
171 ((p)->prio < (rq)->curr->prio) 172 (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active))
172 173
173#define SCALE_PRIO(x, prio) \ 174#define SCALE_PRIO(x, prio) \
174 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) 175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
181 return SCALE_PRIO(DEF_TIMESLICE, static_prio); 182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
182} 183}
183 184
185#ifdef CONFIG_SMP
186/*
187 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
188 * Since cpu_power is a 'constant', we can use a reciprocal divide.
189 */
190static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
191{
192 return reciprocal_divide(load, sg->reciprocal_cpu_power);
193}
194
195/*
196 * Each time a sched group cpu_power is changed,
197 * we must compute its reciprocal value
198 */
199static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
200{
201 sg->__cpu_power += val;
202 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
203}
204#endif
205
184/* 206/*
185 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
186 * to time slice values: [800ms ... 100ms ... 5ms] 208 * to time slice values: [800ms ... 100ms ... 5ms]
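A quick worked example of the reciprocal-divide trick introduced above, under the assumption that reciprocal_value(k) computes roughly (2^32 + k - 1) / k and reciprocal_divide(a, r) computes ((u64)a * r) >> 32, as the <linux/reciprocal_div.h> helpers of this era do. The point is that the division by a sched group's cpu_power turns into a multiply and a shift once sg_inc_cpu_power() has cached the reciprocal:

/* Worked example of the reciprocal divide used by sg_div_cpu_power():
 * dividing by a (rarely changing) value k becomes a multiply and a
 * shift once r = reciprocal_value(k) has been cached. */
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_value(uint32_t k)
{
	/* Assumed formula, matching the kernel helper of this era. */
	return (uint32_t)((((uint64_t)1 << 32) + k - 1) / k);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t cpu_power = 4096;			/* stand-in for sg->__cpu_power */
	uint32_t r = reciprocal_value(cpu_power);	/* recomputed when cpu_power changes */

	/* 524288 / 4096 == 128, with no integer division at this point */
	printf("%u\n", reciprocal_divide(524288, r));
	return 0;
}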
@@ -223,6 +245,10 @@ struct rq {
223 unsigned long raw_weighted_load; 245 unsigned long raw_weighted_load;
224#ifdef CONFIG_SMP 246#ifdef CONFIG_SMP
225 unsigned long cpu_load[3]; 247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently;
251#endif
226#endif 252#endif
227 unsigned long long nr_switches; 253 unsigned long long nr_switches;
228 254
@@ -278,7 +304,7 @@ struct rq {
278 struct lock_class_key rq_lock_key; 304 struct lock_class_key rq_lock_key;
279}; 305};
280 306
281static DEFINE_PER_CPU(struct rq, runqueues); 307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
282 308
283static inline int cpu_of(struct rq *rq) 309static inline int cpu_of(struct rq *rq)
284{ 310{
@@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p)
1049 if (!tsk_is_polling(p)) 1075 if (!tsk_is_polling(p))
1050 smp_send_reschedule(cpu); 1076 smp_send_reschedule(cpu);
1051} 1077}
1078
1079static void resched_cpu(int cpu)
1080{
1081 struct rq *rq = cpu_rq(cpu);
1082 unsigned long flags;
1083
1084 if (!spin_trylock_irqsave(&rq->lock, flags))
1085 return;
1086 resched_task(cpu_curr(cpu));
1087 spin_unlock_irqrestore(&rq->lock, flags);
1088}
1052#else 1089#else
1053static inline void resched_task(struct task_struct *p) 1090static inline void resched_task(struct task_struct *p)
1054{ 1091{
@@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1241 } 1278 }
1242 1279
1243 /* Adjust by relative CPU power of the group */ 1280 /* Adjust by relative CPU power of the group */
1244 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1281 avg_load = sg_div_cpu_power(group,
1282 avg_load * SCHED_LOAD_SCALE);
1245 1283
1246 if (local_group) { 1284 if (local_group) {
1247 this_load = avg_load; 1285 this_load = avg_load;
@@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p)
1368 struct sched_domain *sd; 1406 struct sched_domain *sd;
1369 int i; 1407 int i;
1370 1408
1371 if (idle_cpu(cpu)) 1409 /*
1410 * If it is idle, then it is the best cpu to run this task.
1411 *
1412 * This cpu is also the best, if it has more than one task already.
1413 * Siblings must be also busy(in most cases) as they didn't already
1414 * pickup the extra load from this cpu and hence we need not check
1415 * sibling runqueue info. This will avoid the checks and cache miss
1416 * penalities associated with that.
1417 */
1418 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1372 return cpu; 1419 return cpu;
1373 1420
1374 for_each_domain(cpu, sd) { 1421 for_each_domain(cpu, sd) {
@@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2352 } 2399 }
2353 2400
2354 total_load += avg_load; 2401 total_load += avg_load;
2355 total_pwr += group->cpu_power; 2402 total_pwr += group->__cpu_power;
2356 2403
2357 /* Adjust by relative CPU power of the group */ 2404 /* Adjust by relative CPU power of the group */
2358 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2405 avg_load = sg_div_cpu_power(group,
2406 avg_load * SCHED_LOAD_SCALE);
2359 2407
2360 group_capacity = group->cpu_power / SCHED_LOAD_SCALE; 2408 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2361 2409
2362 if (local_group) { 2410 if (local_group) {
2363 this_load = avg_load; 2411 this_load = avg_load;
@@ -2468,8 +2516,8 @@ group_next:
2468 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 2516 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2469 2517
2470 /* How much load to actually move to equalise the imbalance */ 2518 /* How much load to actually move to equalise the imbalance */
2471 *imbalance = min(max_pull * busiest->cpu_power, 2519 *imbalance = min(max_pull * busiest->__cpu_power,
2472 (avg_load - this_load) * this->cpu_power) 2520 (avg_load - this_load) * this->__cpu_power)
2473 / SCHED_LOAD_SCALE; 2521 / SCHED_LOAD_SCALE;
2474 2522
2475 /* 2523 /*
@@ -2503,28 +2551,29 @@ small_imbalance:
2503 * moving them. 2551 * moving them.
2504 */ 2552 */
2505 2553
2506 pwr_now += busiest->cpu_power * 2554 pwr_now += busiest->__cpu_power *
2507 min(busiest_load_per_task, max_load); 2555 min(busiest_load_per_task, max_load);
2508 pwr_now += this->cpu_power * 2556 pwr_now += this->__cpu_power *
2509 min(this_load_per_task, this_load); 2557 min(this_load_per_task, this_load);
2510 pwr_now /= SCHED_LOAD_SCALE; 2558 pwr_now /= SCHED_LOAD_SCALE;
2511 2559
2512 /* Amount of load we'd subtract */ 2560 /* Amount of load we'd subtract */
2513 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2561 tmp = sg_div_cpu_power(busiest,
2514 busiest->cpu_power; 2562 busiest_load_per_task * SCHED_LOAD_SCALE);
2515 if (max_load > tmp) 2563 if (max_load > tmp)
2516 pwr_move += busiest->cpu_power * 2564 pwr_move += busiest->__cpu_power *
2517 min(busiest_load_per_task, max_load - tmp); 2565 min(busiest_load_per_task, max_load - tmp);
2518 2566
2519 /* Amount of load we'd add */ 2567 /* Amount of load we'd add */
2520 if (max_load * busiest->cpu_power < 2568 if (max_load * busiest->__cpu_power <
2521 busiest_load_per_task * SCHED_LOAD_SCALE) 2569 busiest_load_per_task * SCHED_LOAD_SCALE)
2522 tmp = max_load * busiest->cpu_power / this->cpu_power; 2570 tmp = sg_div_cpu_power(this,
2571 max_load * busiest->__cpu_power);
2523 else 2572 else
2524 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2573 tmp = sg_div_cpu_power(this,
2525 this->cpu_power; 2574 busiest_load_per_task * SCHED_LOAD_SCALE);
2526 pwr_move += this->cpu_power * 2575 pwr_move += this->__cpu_power *
2527 min(this_load_per_task, this_load + tmp); 2576 min(this_load_per_task, this_load + tmp);
2528 pwr_move /= SCHED_LOAD_SCALE; 2577 pwr_move /= SCHED_LOAD_SCALE;
2529 2578
2530 /* Move if we gain throughput */ 2579 /* Move if we gain throughput */
@@ -2657,6 +2706,12 @@ redo:
2657 double_rq_unlock(this_rq, busiest); 2706 double_rq_unlock(this_rq, busiest);
2658 local_irq_restore(flags); 2707 local_irq_restore(flags);
2659 2708
2709 /*
2710 * some other cpu did the load balance for us.
2711 */
2712 if (nr_moved && this_cpu != smp_processor_id())
2713 resched_cpu(this_cpu);
2714
2660 /* All tasks on this runqueue were pinned by CPU affinity */ 2715 /* All tasks on this runqueue were pinned by CPU affinity */
2661 if (unlikely(all_pinned)) { 2716 if (unlikely(all_pinned)) {
2662 cpu_clear(cpu_of(busiest), cpus); 2717 cpu_clear(cpu_of(busiest), cpus);
@@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq)
2927 } 2982 }
2928} 2983}
2929 2984
2985#ifdef CONFIG_NO_HZ
2986static struct {
2987 atomic_t load_balancer;
2988 cpumask_t cpu_mask;
2989} nohz ____cacheline_aligned = {
2990 .load_balancer = ATOMIC_INIT(-1),
2991 .cpu_mask = CPU_MASK_NONE,
2992};
2993
2930/* 2994/*
2931 * run_rebalance_domains is triggered when needed from the scheduler tick. 2995 * This routine will try to nominate the ilb (idle load balancing)
2996 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2997 * load balancing on behalf of all those cpus. If all the cpus in the system
2998 * go into this tickless mode, then there will be no ilb owner (as there is
2999 * no need for one) and all the cpus will sleep till the next wakeup event
3000 * arrives...
2932 * 3001 *
3002 * For the ilb owner, tick is not stopped. And this tick will be used
3003 * for idle load balancing. ilb owner will still be part of
3004 * nohz.cpu_mask..
3005 *
3006 * While stopping the tick, this cpu will become the ilb owner if there
3007 * is no other owner. And will be the owner till that cpu becomes busy
3008 * or if all cpus in the system stop their ticks at which point
3009 * there is no need for ilb owner.
3010 *
3011 * When the ilb owner becomes busy, it nominates another owner, during the
3012 * next busy scheduler_tick()
3013 */
3014int select_nohz_load_balancer(int stop_tick)
3015{
3016 int cpu = smp_processor_id();
3017
3018 if (stop_tick) {
3019 cpu_set(cpu, nohz.cpu_mask);
3020 cpu_rq(cpu)->in_nohz_recently = 1;
3021
3022 /*
3023 * If we are going offline and still the leader, give up!
3024 */
3025 if (cpu_is_offline(cpu) &&
3026 atomic_read(&nohz.load_balancer) == cpu) {
3027 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3028 BUG();
3029 return 0;
3030 }
3031
3032 /* time for ilb owner also to sleep */
3033 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3034 if (atomic_read(&nohz.load_balancer) == cpu)
3035 atomic_set(&nohz.load_balancer, -1);
3036 return 0;
3037 }
3038
3039 if (atomic_read(&nohz.load_balancer) == -1) {
3040 /* make me the ilb owner */
3041 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3042 return 1;
3043 } else if (atomic_read(&nohz.load_balancer) == cpu)
3044 return 1;
3045 } else {
3046 if (!cpu_isset(cpu, nohz.cpu_mask))
3047 return 0;
3048
3049 cpu_clear(cpu, nohz.cpu_mask);
3050
3051 if (atomic_read(&nohz.load_balancer) == cpu)
3052 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3053 BUG();
3054 }
3055 return 0;
3056}
3057#endif
3058
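The cmpxchg-based election above can be illustrated outside the kernel: whichever tick-stopping cpu wins the atomic compare-and-swap of the owner field from -1 to its own id becomes the idle-load-balance owner, and it hands ownership back the same way when it turns busy. A hedged, self-contained C sketch of just that handshake, with C11 atomics standing in for the kernel's atomic_cmpxchg():

/* Userspace model of the ilb-owner election: -1 means "no owner",
 * and atomic compare-and-swap decides races between idle cpus. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;	/* like nohz.load_balancer */

/* Tick is being stopped on @cpu: returns 1 if it is (now) the ilb owner. */
static int stop_tick(int cpu)
{
	int none = -1;

	if (atomic_load(&load_balancer) == cpu)
		return 1;
	return atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

/* @cpu became busy again: drop ownership if it held it. */
static void start_tick(int cpu)
{
	int me = cpu;

	atomic_compare_exchange_strong(&load_balancer, &me, -1);
}

int main(void)
{
	printf("cpu0 owner: %d\n", stop_tick(0));	/* 1, first to claim */
	printf("cpu1 owner: %d\n", stop_tick(1));	/* 0, cpu0 already owns */
	start_tick(0);					/* cpu0 goes busy */
	printf("cpu1 owner: %d\n", stop_tick(1));	/* 1, ownership is free */
	return 0;
}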
3059static DEFINE_SPINLOCK(balancing);
3060
3061/*
2933 * It checks each scheduling domain to see if it is due to be balanced, 3062 * It checks each scheduling domain to see if it is due to be balanced,
2934 * and initiates a balancing operation if so. 3063 * and initiates a balancing operation if so.
2935 * 3064 *
2936 * Balancing parameters are set up in arch_init_sched_domains. 3065 * Balancing parameters are set up in arch_init_sched_domains.
2937 */ 3066 */
2938static DEFINE_SPINLOCK(balancing); 3067static inline void rebalance_domains(int cpu, enum idle_type idle)
2939
2940static void run_rebalance_domains(struct softirq_action *h)
2941{ 3068{
2942 int this_cpu = smp_processor_id(), balance = 1; 3069 int balance = 1;
2943 struct rq *this_rq = cpu_rq(this_cpu); 3070 struct rq *rq = cpu_rq(cpu);
2944 unsigned long interval; 3071 unsigned long interval;
2945 struct sched_domain *sd; 3072 struct sched_domain *sd;
2946 /* 3073 /* Earliest time when we have to do rebalance again */
2947 * We are idle if there are no processes running. This
2948 * is valid even if we are the idle process (SMT).
2949 */
2950 enum idle_type idle = !this_rq->nr_running ?
2951 SCHED_IDLE : NOT_IDLE;
2952 /* Earliest time when we have to call run_rebalance_domains again */
2953 unsigned long next_balance = jiffies + 60*HZ; 3074 unsigned long next_balance = jiffies + 60*HZ;
2954 3075
2955 for_each_domain(this_cpu, sd) { 3076 for_each_domain(cpu, sd) {
2956 if (!(sd->flags & SD_LOAD_BALANCE)) 3077 if (!(sd->flags & SD_LOAD_BALANCE))
2957 continue; 3078 continue;
2958 3079
@@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h)
2971 } 3092 }
2972 3093
2973 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3094 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2974 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { 3095 if (load_balance(cpu, rq, sd, idle, &balance)) {
2975 /* 3096 /*
2976 * We've pulled tasks over so either we're no 3097 * We've pulled tasks over so either we're no
2977 * longer idle, or one of our SMT siblings is 3098 * longer idle, or one of our SMT siblings is
@@ -2995,7 +3116,114 @@ out:
2995 if (!balance) 3116 if (!balance)
2996 break; 3117 break;
2997 } 3118 }
2998 this_rq->next_balance = next_balance; 3119 rq->next_balance = next_balance;
3120}
3121
3122/*
3123 * run_rebalance_domains is triggered when needed from the scheduler tick.
3124 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3125 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3126 */
3127static void run_rebalance_domains(struct softirq_action *h)
3128{
3129 int local_cpu = smp_processor_id();
3130 struct rq *local_rq = cpu_rq(local_cpu);
3131 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
3132
3133 rebalance_domains(local_cpu, idle);
3134
3135#ifdef CONFIG_NO_HZ
3136 /*
3137 * If this cpu is the owner for idle load balancing, then do the
3138 * balancing on behalf of the other idle cpus whose ticks are
3139 * stopped.
3140 */
3141 if (local_rq->idle_at_tick &&
3142 atomic_read(&nohz.load_balancer) == local_cpu) {
3143 cpumask_t cpus = nohz.cpu_mask;
3144 struct rq *rq;
3145 int balance_cpu;
3146
3147 cpu_clear(local_cpu, cpus);
3148 for_each_cpu_mask(balance_cpu, cpus) {
3149 /*
3150 * If this cpu gets work to do, stop the load balancing
3151 * work being done for other cpus. Next load
3152 * balancing owner will pick it up.
3153 */
3154 if (need_resched())
3155 break;
3156
3157 rebalance_domains(balance_cpu, SCHED_IDLE);
3158
3159 rq = cpu_rq(balance_cpu);
3160 if (time_after(local_rq->next_balance, rq->next_balance))
3161 local_rq->next_balance = rq->next_balance;
3162 }
3163 }
3164#endif
3165}
3166
3167/*
3168 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3169 *
3170 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3171 * idle load balancing owner or decide to stop the periodic load balancing,
3172 * if the whole system is idle.
3173 */
3174static inline void trigger_load_balance(int cpu)
3175{
3176 struct rq *rq = cpu_rq(cpu);
3177#ifdef CONFIG_NO_HZ
3178 /*
3179 * If we were in the nohz mode recently and busy at the current
3180 * scheduler tick, then check if we need to nominate new idle
3181 * load balancer.
3182 */
3183 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3184 rq->in_nohz_recently = 0;
3185
3186 if (atomic_read(&nohz.load_balancer) == cpu) {
3187 cpu_clear(cpu, nohz.cpu_mask);
3188 atomic_set(&nohz.load_balancer, -1);
3189 }
3190
3191 if (atomic_read(&nohz.load_balancer) == -1) {
3192 /*
3193 * simple selection for now: Nominate the
3194 * first cpu in the nohz list to be the next
3195 * ilb owner.
3196 *
3197 * TBD: Traverse the sched domains and nominate
3198 * the nearest cpu in the nohz.cpu_mask.
3199 */
3200 int ilb = first_cpu(nohz.cpu_mask);
3201
3202 if (ilb != NR_CPUS)
3203 resched_cpu(ilb);
3204 }
3205 }
3206
3207 /*
3208 * If this cpu is idle and doing idle load balancing for all the
3209 * cpus with ticks stopped, is it time for that to stop?
3210 */
3211 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3212 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3213 resched_cpu(cpu);
3214 return;
3215 }
3216
3217 /*
3218 * If this cpu is idle and the idle load balancing is done by
3219 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3220 */
3221 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3222 cpu_isset(cpu, nohz.cpu_mask))
3223 return;
3224#endif
3225 if (time_after_eq(jiffies, rq->next_balance))
3226 raise_softirq(SCHED_SOFTIRQ);
2999} 3227}
3000#else 3228#else
3001/* 3229/*
@@ -3218,16 +3446,17 @@ void scheduler_tick(void)
3218 unsigned long long now = sched_clock(); 3446 unsigned long long now = sched_clock();
3219 struct task_struct *p = current; 3447 struct task_struct *p = current;
3220 int cpu = smp_processor_id(); 3448 int cpu = smp_processor_id();
3449 int idle_at_tick = idle_cpu(cpu);
3221 struct rq *rq = cpu_rq(cpu); 3450 struct rq *rq = cpu_rq(cpu);
3222 3451
3223 update_cpu_clock(p, rq, now); 3452 update_cpu_clock(p, rq, now);
3224 3453
3225 if (p != rq->idle) 3454 if (!idle_at_tick)
3226 task_running_tick(rq, p); 3455 task_running_tick(rq, p);
3227#ifdef CONFIG_SMP 3456#ifdef CONFIG_SMP
3228 update_load(rq); 3457 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance)) 3458 rq->idle_at_tick = idle_at_tick;
3230 raise_softirq(SCHED_SOFTIRQ); 3459 trigger_load_balance(cpu);
3231#endif 3460#endif
3232} 3461}
3233 3462
@@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3847 struct prio_array *array; 4076 struct prio_array *array;
3848 unsigned long flags; 4077 unsigned long flags;
3849 struct rq *rq; 4078 struct rq *rq;
3850 int oldprio; 4079 int delta;
3851 4080
3852 BUG_ON(prio < 0 || prio > MAX_PRIO); 4081 BUG_ON(prio < 0 || prio > MAX_PRIO);
3853 4082
3854 rq = task_rq_lock(p, &flags); 4083 rq = task_rq_lock(p, &flags);
3855 4084
3856 oldprio = p->prio; 4085 delta = prio - p->prio;
3857 array = p->array; 4086 array = p->array;
3858 if (array) 4087 if (array)
3859 dequeue_task(p, array); 4088 dequeue_task(p, array);
@@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3869 enqueue_task(p, array); 4098 enqueue_task(p, array);
3870 /* 4099 /*
3871 * Reschedule if we are currently running on this runqueue and 4100 * Reschedule if we are currently running on this runqueue and
3872 * our priority decreased, or if we are not currently running on 4101 * our priority decreased, or if our priority became higher
3873 * this runqueue and our priority is higher than the current's 4102 * than the current's.
3874 */ 4103 */
3875 if (task_running(rq, p)) { 4104 if (TASK_PREEMPTS_CURR(p, rq) ||
3876 if (p->prio > oldprio) 4105 (delta > 0 && task_running(rq, p)))
3877 resched_task(rq->curr);
3878 } else if (TASK_PREEMPTS_CURR(p, rq))
3879 resched_task(rq->curr); 4106 resched_task(rq->curr);
3880 } 4107 }
3881 task_rq_unlock(rq, &flags); 4108 task_rq_unlock(rq, &flags);
@@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice)
3923 enqueue_task(p, array); 4150 enqueue_task(p, array);
3924 inc_raw_weighted_load(rq, p); 4151 inc_raw_weighted_load(rq, p);
3925 /* 4152 /*
3926 * If the task increased its priority or is running and 4153 * Reschedule if we are currently running on this runqueue and
3927 * lowered its priority, then reschedule its CPU: 4154 * our priority decreased, or if our priority became higher
4155 * than the current's.
3928 */ 4156 */
3929 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4157 if (TASK_PREEMPTS_CURR(p, rq) ||
4158 (delta > 0 && task_running(rq, p)))
3930 resched_task(rq->curr); 4159 resched_task(rq->curr);
3931 } 4160 }
3932out_unlock: 4161out_unlock:
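
The rt_mutex_setprio() and set_user_nice() hunks above converge on one reschedule rule: kick the current task if the re-queued task would now preempt it, or if the task whose priority just got worse (delta > 0) is the one currently running. A tiny stand-alone predicate with the kernel macros reduced to booleans (need_resched_curr() is a name invented for this sketch):

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether the current task on the runqueue should be asked to
 * reschedule after p's priority changed by `delta` (positive == worse).
 * `preempts_curr` models TASK_PREEMPTS_CURR(p, rq); `running` models
 * task_running(rq, p).
 */
static bool need_resched_curr(bool preempts_curr, int delta, bool running)
{
        return preempts_curr || (delta > 0 && running);
}

int main(void)
{
        /* p got better and would preempt the current task: resched */
        printf("%d\n", need_resched_curr(true, -5, false));    /* 1 */
        /* p is running and was just deprioritised: resched so something
           better can take over */
        printf("%d\n", need_resched_curr(false, +3, true));     /* 1 */
        /* p got worse but is not running: nothing to do */
        printf("%d\n", need_resched_curr(false, +3, false));    /* 0 */
        return 0;
}
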
@@ -4153,13 +4382,11 @@ recheck:
4153 __activate_task(p, rq); 4382 __activate_task(p, rq);
4154 /* 4383 /*
4155 * Reschedule if we are currently running on this runqueue and 4384 * Reschedule if we are currently running on this runqueue and
4156 * our priority decreased, or if we are not currently running on 4385 * our priority decreased, or our priority became higher
4157 * this runqueue and our priority is higher than the current's 4386 * than the current's.
4158 */ 4387 */
4159 if (task_running(rq, p)) { 4388 if (TASK_PREEMPTS_CURR(p, rq) ||
4160 if (p->prio > oldprio) 4389 (task_running(rq, p) && p->prio > oldprio))
4161 resched_task(rq->curr);
4162 } else if (TASK_PREEMPTS_CURR(p, rq))
4163 resched_task(rq->curr); 4390 resched_task(rq->curr);
4164 } 4391 }
4165 __task_rq_unlock(rq); 4392 __task_rq_unlock(rq);
@@ -4750,6 +4977,8 @@ void show_state_filter(unsigned long state_filter)
4750 show_task(p); 4977 show_task(p);
4751 } while_each_thread(g, p); 4978 } while_each_thread(g, p);
4752 4979
4980 touch_all_softlockup_watchdogs();
4981
4753 read_unlock(&tasklist_lock); 4982 read_unlock(&tasklist_lock);
4754 /* 4983 /*
4755 * Only show locks if all tasks are dumped: 4984 * Only show locks if all tasks are dumped:
@@ -5304,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5304 break; 5533 break;
5305 } 5534 }
5306 5535
5307 if (!group->cpu_power) { 5536 if (!group->__cpu_power) {
5308 printk("\n"); 5537 printk("\n");
5309 printk(KERN_ERR "ERROR: domain->cpu_power not " 5538 printk(KERN_ERR "ERROR: domain->cpu_power not "
5310 "set\n"); 5539 "set\n");
@@ -5481,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5481 continue; 5710 continue;
5482 5711
5483 sg->cpumask = CPU_MASK_NONE; 5712 sg->cpumask = CPU_MASK_NONE;
5484 sg->cpu_power = 0; 5713 sg->__cpu_power = 0;
5485 5714
5486 for_each_cpu_mask(j, span) { 5715 for_each_cpu_mask(j, span) {
5487 if (group_fn(j, cpu_map, NULL) != group) 5716 if (group_fn(j, cpu_map, NULL) != group)
@@ -6170,7 +6399,7 @@ next_sg:
6170 continue; 6399 continue;
6171 } 6400 }
6172 6401
6173 sg->cpu_power += sd->groups->cpu_power; 6402 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
6174 } 6403 }
6175 sg = sg->next; 6404 sg = sg->next;
6176 if (sg != group_head) 6405 if (sg != group_head)
@@ -6245,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6245 6474
6246 child = sd->child; 6475 child = sd->child;
6247 6476
6477 sd->groups->__cpu_power = 0;
6478
6248 /* 6479 /*
6249 * For perf policy, if the groups in child domain share resources 6480 * For perf policy, if the groups in child domain share resources
6250 * (for example cores sharing some portions of the cache hierarchy 6481 * (for example cores sharing some portions of the cache hierarchy
@@ -6255,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6255 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 6486 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6256 (child->flags & 6487 (child->flags &
6257 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 6488 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6258 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6489 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6259 return; 6490 return;
6260 } 6491 }
6261 6492
6262 sd->groups->cpu_power = 0;
6263
6264 /* 6493 /*
6265 * add cpu_power of each child group to this groups cpu_power 6494 * add cpu_power of each child group to this groups cpu_power
6266 */ 6495 */
6267 group = child->groups; 6496 group = child->groups;
6268 do { 6497 do {
6269 sd->groups->cpu_power += group->cpu_power; 6498 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6270 group = group->next; 6499 group = group->next;
6271 } while (group != child->groups); 6500 } while (group != child->groups);
6272} 6501}
@@ -6426,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6426 sd = &per_cpu(node_domains, j); 6655 sd = &per_cpu(node_domains, j);
6427 sd->groups = sg; 6656 sd->groups = sg;
6428 } 6657 }
6429 sg->cpu_power = 0; 6658 sg->__cpu_power = 0;
6430 sg->cpumask = nodemask; 6659 sg->cpumask = nodemask;
6431 sg->next = sg; 6660 sg->next = sg;
6432 cpus_or(covered, covered, nodemask); 6661 cpus_or(covered, covered, nodemask);
@@ -6454,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6454 "Can not alloc domain group for node %d\n", j); 6683 "Can not alloc domain group for node %d\n", j);
6455 goto error; 6684 goto error;
6456 } 6685 }
6457 sg->cpu_power = 0; 6686 sg->__cpu_power = 0;
6458 sg->cpumask = tmp; 6687 sg->cpumask = tmp;
6459 sg->next = prev->next; 6688 sg->next = prev->next;
6460 cpus_or(covered, covered, tmp); 6689 cpus_or(covered, covered, tmp);
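
The __cpu_power hunks above also change how a parent group's power is built up: init_sched_groups_power() zeroes sd->groups->__cpu_power first, then walks the child's circular ->next ring exactly once, adding each child group's power through sg_inc_cpu_power(). A sketch of that accumulation with a minimal struct; group_inc_power() here is a plain add, whereas the real helper presumably also keeps derived state (such as a cached reciprocal) in sync:

#include <stdio.h>

/* minimal stand-in for struct sched_group: power plus circular next link */
struct group {
        unsigned long power;
        struct group *next;
};

/* modelled sg_inc_cpu_power(): just accumulate into the parent group */
static void group_inc_power(struct group *sg, unsigned long val)
{
        sg->power += val;
}

int main(void)
{
        struct group child[3] = {
                { .power = 128 }, { .power = 128 }, { .power = 256 },
        };
        /* make the child groups a circular list, as the scheduler does */
        child[0].next = &child[1];
        child[1].next = &child[2];
        child[2].next = &child[0];

        struct group parent = { .power = 0, .next = &parent };

        /* walk the ring exactly once, summing child power into the parent */
        struct group *g = &child[0];
        do {
                group_inc_power(&parent, g->power);
                g = g->next;
        } while (g != &child[0]);

        printf("parent power = %lu\n", parent.power);   /* 512 */
        return 0;
}
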
diff --git a/kernel/signal.c b/kernel/signal.c
index 2b4087d545a3..1368e67c8482 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -12,7 +12,6 @@
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/sched.h> 16#include <linux/sched.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 50afeb813305..8fa7040247ad 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -34,12 +34,32 @@ static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic, 34 .notifier_call = softlock_panic,
35}; 35};
36 36
37/*
38 * Returns seconds, approximately. We don't need nanosecond
39 * resolution, and we don't need to waste time with a big divide when
40 * 2^30ns == 1.074s.
41 */
42static unsigned long get_timestamp(void)
43{
44 return sched_clock() >> 30; /* 2^30 ~= 10^9 */
45}
46
37void touch_softlockup_watchdog(void) 47void touch_softlockup_watchdog(void)
38{ 48{
39 __raw_get_cpu_var(touch_timestamp) = jiffies; 49 __raw_get_cpu_var(touch_timestamp) = get_timestamp();
40} 50}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 51EXPORT_SYMBOL(touch_softlockup_watchdog);
42 52
53void touch_all_softlockup_watchdogs(void)
54{
55 int cpu;
56
57 /* Cause each CPU to re-update its timestamp rather than complain */
58 for_each_online_cpu(cpu)
59 per_cpu(touch_timestamp, cpu) = 0;
60}
61EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
62
43/* 63/*
44 * This callback runs from the timer interrupt, and checks 64 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not: 65 * whether the watchdog thread has hung or not:
@@ -48,9 +68,18 @@ void softlockup_tick(void)
48{ 68{
49 int this_cpu = smp_processor_id(); 69 int this_cpu = smp_processor_id();
50 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 70 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
71 unsigned long print_timestamp;
72 unsigned long now;
73
74 if (touch_timestamp == 0) {
75 touch_softlockup_watchdog();
76 return;
77 }
78
79 print_timestamp = per_cpu(print_timestamp, this_cpu);
51 80
52 /* prevent double reports: */ 81 /* report at most once a second */
53 if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || 82 if (print_timestamp < (touch_timestamp + 1) ||
54 did_panic || 83 did_panic ||
55 !per_cpu(watchdog_task, this_cpu)) 84 !per_cpu(watchdog_task, this_cpu))
56 return; 85 return;
@@ -61,12 +90,14 @@ void softlockup_tick(void)
61 return; 90 return;
62 } 91 }
63 92
93 now = get_timestamp();
94
64 /* Wake up the high-prio watchdog task every second: */ 95 /* Wake up the high-prio watchdog task every second: */
65 if (time_after(jiffies, touch_timestamp + HZ)) 96 if (now > (touch_timestamp + 1))
66 wake_up_process(per_cpu(watchdog_task, this_cpu)); 97 wake_up_process(per_cpu(watchdog_task, this_cpu));
67 98
68 /* Warn about unreasonable 10+ seconds delays: */ 99 /* Warn about unreasonable 10+ seconds delays: */
69 if (time_after(jiffies, touch_timestamp + 10*HZ)) { 100 if (now > (touch_timestamp + 10)) {
70 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 101 per_cpu(print_timestamp, this_cpu) = touch_timestamp;
71 102
72 spin_lock(&print_lock); 103 spin_lock(&print_lock);
@@ -82,11 +113,14 @@ void softlockup_tick(void)
82 */ 113 */
83static int watchdog(void * __bind_cpu) 114static int watchdog(void * __bind_cpu)
84{ 115{
85 struct sched_param param = { .sched_priority = 99 }; 116 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
86 117
87 sched_setscheduler(current, SCHED_FIFO, &param); 118 sched_setscheduler(current, SCHED_FIFO, &param);
88 current->flags |= PF_NOFREEZE; 119 current->flags |= PF_NOFREEZE;
89 120
121 /* initialize timestamp */
122 touch_softlockup_watchdog();
123
90 /* 124 /*
91 * Run briefly once per second to reset the softlockup timestamp. 125 * Run briefly once per second to reset the softlockup timestamp.
92 * If this gets delayed for more than 10 seconds then the 126 * If this gets delayed for more than 10 seconds then the
@@ -118,7 +152,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
118 printk("watchdog for %i failed\n", hotcpu); 152 printk("watchdog for %i failed\n", hotcpu);
119 return NOTIFY_BAD; 153 return NOTIFY_BAD;
120 } 154 }
121 per_cpu(touch_timestamp, hotcpu) = jiffies; 155 per_cpu(touch_timestamp, hotcpu) = 0;
122 per_cpu(watchdog_task, hotcpu) = p; 156 per_cpu(watchdog_task, hotcpu) = p;
123 kthread_bind(p, hotcpu); 157 kthread_bind(p, hotcpu);
124 break; 158 break;
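
The softlockup hunk above switches the watchdog from jiffies to an approximate-seconds timestamp: sched_clock() nanoseconds shifted right by 30 (2^30 ns ~= 1.074 s), with a stored 0 meaning "just re-arm, don't report". A user-space sketch of the same arithmetic and the 1 s / 10 s thresholds, using clock_gettime(CLOCK_MONOTONIC) as a stand-in for sched_clock():

#include <stdio.h>
#include <time.h>

/* sched_clock() stand-in: monotonic nanoseconds since some arbitrary point */
static unsigned long long now_ns(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* get_timestamp() equivalent: cheap ns -> ~seconds conversion */
static unsigned long get_timestamp(void)
{
        return (unsigned long)(now_ns() >> 30);         /* 2^30 ns ~= 1.074 s */
}

int main(void)
{
        unsigned long touch = get_timestamp();          /* watchdog was touched */
        unsigned long now = get_timestamp();

        if (now > touch + 1)
                printf("wake the watchdog thread (about once per second)\n");
        if (now > touch + 10)
                printf("BUG: soft lockup detected\n");
        else
                printf("all quiet: %lu s since last touch\n", now - touch);
        return 0;
}
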
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12458040e665..daabb74ee0bc 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,11 +1,12 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version. 2 * GPL v2 and any later version.
3 */ 3 */
4#include <linux/stop_machine.h>
5#include <linux/kthread.h>
6#include <linux/sched.h>
7#include <linux/cpu.h> 4#include <linux/cpu.h>
8#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/kthread.h>
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/stop_machine.h>
9#include <linux/syscalls.h> 10#include <linux/syscalls.h>
10#include <asm/atomic.h> 11#include <asm/atomic.h>
11#include <asm/semaphore.h> 12#include <asm/semaphore.h>
@@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
208 209
209 return ret; 210 return ret;
210} 211}
212EXPORT_SYMBOL_GPL(stop_machine_run);
diff --git a/kernel/sys.c b/kernel/sys.c
index fe1f3ab20477..926bf9d7ac45 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1923 if (retval) 1923 if (retval)
1924 return retval; 1924 return retval;
1925 1925
1926 if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
1927 /*
1928 * The caller is asking for an immediate RLIMIT_CPU
1929 * expiry. But we use the zero value to mean "it was
1930 * never set". So let's cheat and make it one second
1931 * instead
1932 */
1933 new_rlim.rlim_cur = 1;
1934 }
1935
1926 task_lock(current->group_leader); 1936 task_lock(current->group_leader);
1927 *old_rlim = new_rlim; 1937 *old_rlim = new_rlim;
1928 task_unlock(current->group_leader); 1938 task_unlock(current->group_leader);
@@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1944 unsigned long rlim_cur = new_rlim.rlim_cur; 1954 unsigned long rlim_cur = new_rlim.rlim_cur;
1945 cputime_t cputime; 1955 cputime_t cputime;
1946 1956
1947 if (rlim_cur == 0) {
1948 /*
1949 * The caller is asking for an immediate RLIMIT_CPU
1950 * expiry. But we use the zero value to mean "it was
1951 * never set". So let's cheat and make it one second
1952 * instead
1953 */
1954 rlim_cur = 1;
1955 }
1956 cputime = secs_to_cputime(rlim_cur); 1957 cputime = secs_to_cputime(rlim_cur);
1957 read_lock(&tasklist_lock); 1958 read_lock(&tasklist_lock);
1958 spin_lock_irq(&current->sighand->siglock); 1959 spin_lock_irq(&current->sighand->siglock);
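
The sys_setrlimit() hunk moves the RLIMIT_CPU special case ahead of the store, so a requested soft limit of 0 (which the kernel reserves to mean "never set") is rounded up to one second before it ever lands in *old_rlim. A minimal sketch of that normalization using the ordinary struct rlimit from <sys/resource.h>:

#include <stdio.h>
#include <sys/resource.h>

/*
 * An RLIMIT_CPU of 0 would otherwise be indistinguishable from "never set",
 * so an immediate-expiry request is rounded up to one second before the
 * limit is stored.
 */
static void normalize_cpu_rlimit(struct rlimit *rlim)
{
        if (rlim->rlim_cur == 0)
                rlim->rlim_cur = 1;
}

int main(void)
{
        struct rlimit r = { .rlim_cur = 0, .rlim_max = RLIM_INFINITY };

        normalize_cpu_rlimit(&r);
        printf("effective RLIMIT_CPU soft limit: %llu s\n",
               (unsigned long long)r.rlim_cur);         /* 1 */
        return 0;
}
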
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c904748f2290..f0664bd5011c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max;
76extern int sysctl_drop_caches; 76extern int sysctl_drop_caches;
77extern int percpu_pagelist_fraction; 77extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect;
79 80
80/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
81static int maxolduid = 65535; 82static int maxolduid = 65535;
@@ -603,6 +604,16 @@ static ctl_table kern_table[] = {
603 .proc_handler = &proc_dointvec, 604 .proc_handler = &proc_dointvec,
604 }, 605 },
605#endif 606#endif
607#ifdef CONFIG_PROC_FS
608 {
609 .ctl_name = CTL_UNNUMBERED,
610 .procname = "maps_protect",
611 .data = &maps_protect,
612 .maxlen = sizeof(int),
613 .mode = 0644,
614 .proc_handler = &proc_dointvec,
615 },
616#endif
606 617
607 { .ctl_name = 0 } 618 { .ctl_name = 0 }
608}; 619};
diff --git a/kernel/time.c b/kernel/time.c
index ba18ec4899bd..f04791f69408 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -31,7 +31,6 @@
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/errno.h> 33#include <linux/errno.h>
34#include <linux/smp_lock.h>
35#include <linux/syscalls.h> 34#include <linux/syscalls.h>
36#include <linux/security.h> 35#include <linux/security.h>
37#include <linux/fs.h> 36#include <linux/fs.h>
@@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb)
247} 246}
248EXPORT_SYMBOL(current_fs_time); 247EXPORT_SYMBOL(current_fs_time);
249 248
249/*
250 * Convert jiffies to milliseconds and back.
251 *
252 * Avoid unnecessary multiplications/divisions in the
253 * two most common HZ cases:
254 */
255unsigned int inline jiffies_to_msecs(const unsigned long j)
256{
257#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
258 return (MSEC_PER_SEC / HZ) * j;
259#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
260 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
261#else
262 return (j * MSEC_PER_SEC) / HZ;
263#endif
264}
265EXPORT_SYMBOL(jiffies_to_msecs);
266
267unsigned int inline jiffies_to_usecs(const unsigned long j)
268{
269#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
270 return (USEC_PER_SEC / HZ) * j;
271#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
272 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
273#else
274 return (j * USEC_PER_SEC) / HZ;
275#endif
276}
277EXPORT_SYMBOL(jiffies_to_usecs);
278
250/** 279/**
251 * timespec_trunc - Truncate timespec to a granularity 280 * timespec_trunc - Truncate timespec to a granularity
252 * @t: Timespec 281 * @t: Timespec
@@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec)
473EXPORT_SYMBOL(ns_to_timeval); 502EXPORT_SYMBOL(ns_to_timeval);
474 503
475/* 504/*
476 * Convert jiffies to milliseconds and back.
477 *
478 * Avoid unnecessary multiplications/divisions in the
479 * two most common HZ cases:
480 */
481unsigned int jiffies_to_msecs(const unsigned long j)
482{
483#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
484 return (MSEC_PER_SEC / HZ) * j;
485#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
486 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
487#else
488 return (j * MSEC_PER_SEC) / HZ;
489#endif
490}
491EXPORT_SYMBOL(jiffies_to_msecs);
492
493unsigned int jiffies_to_usecs(const unsigned long j)
494{
495#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
496 return (USEC_PER_SEC / HZ) * j;
497#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
498 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
499#else
500 return (j * USEC_PER_SEC) / HZ;
501#endif
502}
503EXPORT_SYMBOL(jiffies_to_usecs);
504
505/*
506 * When we convert to jiffies then we interpret incoming values 505 * When we convert to jiffies then we interpret incoming values
507 * the following way: 506 * the following way:
508 * 507 *
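
The relocated jiffies_to_msecs()/jiffies_to_usecs() helpers choose one of three formulas at preprocessor time: an exact multiply when HZ divides the target rate, a round-up divide when the target rate divides HZ, and a multiply-then-divide fallback otherwise. The same three-way split compiles stand-alone; HZ is hard-coded to 250 here purely as an example value:

#include <stdio.h>

#define HZ           250                /* example tick rate (assumed) */
#define MSEC_PER_SEC 1000U

static unsigned int jiffies_to_msecs(unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* HZ divides 1000: one multiplication, exact */
        return (MSEC_PER_SEC / HZ) * j;
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
        /* 1000 divides HZ: divide, rounding up */
        return (j + (HZ / MSEC_PER_SEC) - 1) / (HZ / MSEC_PER_SEC);
#else
        /* general case */
        return (j * MSEC_PER_SEC) / HZ;
#endif
}

int main(void)
{
        /* with HZ == 250 each jiffy is 4 ms, so 25 jiffies -> 100 ms */
        printf("%u ms\n", jiffies_to_msecs(25));
        return 0;
}
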
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 93bccba1f265..99b6034fc86b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bfda3f7f0716..a96ec9ab3454 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
31 */ 31 */
32ktime_t tick_next_period; 32ktime_t tick_next_period;
33ktime_t tick_period; 33ktime_t tick_period;
34static int tick_do_timer_cpu = -1; 34int tick_do_timer_cpu __read_mostly = -1;
35DEFINE_SPINLOCK(tick_device_lock); 35DEFINE_SPINLOCK(tick_device_lock);
36 36
37/* 37/*
@@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup)
295 clockevents_exchange_device(dev, NULL); 295 clockevents_exchange_device(dev, NULL);
296 td->evtdev = NULL; 296 td->evtdev = NULL;
297 } 297 }
298 /* Transfer the do_timer job away from this cpu */
299 if (*cpup == tick_do_timer_cpu) {
300 int cpu = first_cpu(cpu_online_map);
301
302 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
303 }
298 spin_unlock_irqrestore(&tick_device_lock, flags); 304 spin_unlock_irqrestore(&tick_device_lock, flags);
299} 305}
300 306
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c9d203bde518..bb13f2724905 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 6extern ktime_t tick_next_period;
7extern ktime_t tick_period; 7extern ktime_t tick_period;
8extern int tick_do_timer_cpu __read_mostly;
8 9
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev); 11extern void tick_handle_periodic(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 51556b95f60f..3483e6cb9549 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void)
217 * the scheduler tick in nohz_restart_sched_tick. 217 * the scheduler tick in nohz_restart_sched_tick.
218 */ 218 */
219 if (!ts->tick_stopped) { 219 if (!ts->tick_stopped) {
220 if (select_nohz_load_balancer(1)) {
221 /*
222 * sched tick not stopped!
223 */
224 cpu_clear(cpu, nohz_cpu_mask);
225 goto out;
226 }
227
220 ts->idle_tick = ts->sched_timer.expires; 228 ts->idle_tick = ts->sched_timer.expires;
221 ts->tick_stopped = 1; 229 ts->tick_stopped = 1;
222 ts->idle_jiffies = last_jiffies; 230 ts->idle_jiffies = last_jiffies;
223 } 231 }
232
233 /*
234 * If this cpu is the one which updates jiffies, then
235 * give up the assignment and let it be taken by the
236 * cpu which runs the tick timer next, which might be
 237	 * this cpu as well. If we don't drop it here, the
 238	 * jiffies value might be stale and do_timer() would
 239	 * never be invoked.
240 */
241 if (cpu == tick_do_timer_cpu)
242 tick_do_timer_cpu = -1;
243
224 /* 244 /*
225 * calculate the expiry time for the next timer wheel 245 * calculate the expiry time for the next timer wheel
226 * timer 246 * timer
@@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
273 now = ktime_get(); 293 now = ktime_get();
274 294
275 local_irq_disable(); 295 local_irq_disable();
296 select_nohz_load_balancer(0);
276 tick_do_update_jiffies64(now); 297 tick_do_update_jiffies64(now);
277 cpu_clear(cpu, nohz_cpu_mask); 298 cpu_clear(cpu, nohz_cpu_mask);
278 299
@@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev)
338{ 359{
339 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 360 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
340 struct pt_regs *regs = get_irq_regs(); 361 struct pt_regs *regs = get_irq_regs();
362 int cpu = smp_processor_id();
341 ktime_t now = ktime_get(); 363 ktime_t now = ktime_get();
342 364
343 dev->next_event.tv64 = KTIME_MAX; 365 dev->next_event.tv64 = KTIME_MAX;
344 366
367 /*
368 * Check if the do_timer duty was dropped. We don't care about
369 * concurrency: This happens only when the cpu in charge went
 370	 * into a long sleep. If two cpus happen to assign themselves to
371 * this duty, then the jiffies update is still serialized by
372 * xtime_lock.
373 */
374 if (unlikely(tick_do_timer_cpu == -1))
375 tick_do_timer_cpu = cpu;
376
345 /* Check, if the jiffies need an update */ 377 /* Check, if the jiffies need an update */
346 tick_do_update_jiffies64(now); 378 if (tick_do_timer_cpu == cpu)
379 tick_do_update_jiffies64(now);
347 380
348 /* 381 /*
349 * When we are idle and the tick is stopped, we have to touch 382 * When we are idle and the tick is stopped, we have to touch
@@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
431 struct hrtimer_cpu_base *base = timer->base->cpu_base; 464 struct hrtimer_cpu_base *base = timer->base->cpu_base;
432 struct pt_regs *regs = get_irq_regs(); 465 struct pt_regs *regs = get_irq_regs();
433 ktime_t now = ktime_get(); 466 ktime_t now = ktime_get();
467 int cpu = smp_processor_id();
468
469#ifdef CONFIG_NO_HZ
470 /*
471 * Check if the do_timer duty was dropped. We don't care about
472 * concurrency: This happens only when the cpu in charge went
 473	 * into a long sleep. If two cpus happen to assign themselves to
474 * this duty, then the jiffies update is still serialized by
475 * xtime_lock.
476 */
477 if (unlikely(tick_do_timer_cpu == -1))
478 tick_do_timer_cpu = cpu;
479#endif
434 480
435 /* Check, if the jiffies need an update */ 481 /* Check, if the jiffies need an update */
436 tick_do_update_jiffies64(now); 482 if (tick_do_timer_cpu == cpu)
483 tick_do_update_jiffies64(now);
437 484
438 /* 485 /*
439 * Do not call, when we are not in irq context and have 486 * Do not call, when we are not in irq context and have
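
Taken together, the tick-common.c and tick-sched.c hunks implement a simple ownership protocol for the jiffies update: tick_do_timer_cpu == -1 means the duty is unowned, a CPU that stops its tick (or is shut down) drops it, and whichever CPU's tick handler fires next claims it. A single-threaded sketch of that claim/drop dance (TICK_DO_TIMER_NONE is a name used only in this sketch; the patch uses the literal -1, and the real jiffies update is still serialized by xtime_lock):

#include <stdio.h>

#define TICK_DO_TIMER_NONE  (-1)        /* no CPU owns the do_timer() duty */

static int tick_do_timer_cpu = TICK_DO_TIMER_NONE;

/* a CPU entering nohz idle (or going offline) gives the duty up */
static void drop_do_timer_duty(int cpu)
{
        if (tick_do_timer_cpu == cpu)
                tick_do_timer_cpu = TICK_DO_TIMER_NONE;
}

/* the tick handler on any CPU claims an unowned duty; only the owner
 * then advances jiffies */
static int tick_handler(int cpu)
{
        if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
                tick_do_timer_cpu = cpu;
        return tick_do_timer_cpu == cpu;        /* 1: update jiffies */
}

int main(void)
{
        printf("cpu0 updates jiffies: %d\n", tick_handler(0));  /* claims it */
        printf("cpu1 updates jiffies: %d\n", tick_handler(1));  /* no */
        drop_do_timer_duty(0);                  /* cpu0 stops its tick */
        printf("cpu1 updates jiffies: %d\n", tick_handler(1));  /* claims it */
        return 0;
}
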
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 000000000000..f9217bf644f6
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,476 @@
1/*
2 * linux/kernel/time/timekeeping.c
3 *
4 * Kernel timekeeping code and accessor functions
5 *
6 * This code was moved from linux/kernel/timer.c.
7 * Please see that file for copyright and history logs.
8 *
9 */
10
11#include <linux/module.h>
12#include <linux/interrupt.h>
13#include <linux/percpu.h>
14#include <linux/init.h>
15#include <linux/mm.h>
16#include <linux/sysdev.h>
17#include <linux/clocksource.h>
18#include <linux/jiffies.h>
19#include <linux/time.h>
20#include <linux/tick.h>
21
22
23/*
24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun.
26 */
27__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28
29EXPORT_SYMBOL(xtime_lock);
30
31
32/*
33 * The current time
34 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
35 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
36 * at zero at system boot time, so wall_to_monotonic will be negative,
37 * however, we will ALWAYS keep the tv_nsec part positive so we can use
38 * the usual normalization.
39 */
40struct timespec xtime __attribute__ ((aligned (16)));
41struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
42
43EXPORT_SYMBOL(xtime);
44
45
46static struct clocksource *clock; /* pointer to current clocksource */
47
48
49#ifdef CONFIG_GENERIC_TIME
50/**
51 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
52 *
53 * private function, must hold xtime_lock lock when being
54 * called. Returns the number of nanoseconds since the
55 * last call to update_wall_time() (adjusted by NTP scaling)
56 */
57static inline s64 __get_nsec_offset(void)
58{
59 cycle_t cycle_now, cycle_delta;
60 s64 ns_offset;
61
62 /* read clocksource: */
63 cycle_now = clocksource_read(clock);
64
65 /* calculate the delta since the last update_wall_time: */
66 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
67
68 /* convert to nanoseconds: */
69 ns_offset = cyc2ns(clock, cycle_delta);
70
71 return ns_offset;
72}
73
74/**
75 * __get_realtime_clock_ts - Returns the time of day in a timespec
76 * @ts: pointer to the timespec to be set
77 *
78 * Returns the time of day in a timespec. Used by
79 * do_gettimeofday() and get_realtime_clock_ts().
80 */
81static inline void __get_realtime_clock_ts(struct timespec *ts)
82{
83 unsigned long seq;
84 s64 nsecs;
85
86 do {
87 seq = read_seqbegin(&xtime_lock);
88
89 *ts = xtime;
90 nsecs = __get_nsec_offset();
91
92 } while (read_seqretry(&xtime_lock, seq));
93
94 timespec_add_ns(ts, nsecs);
95}
96
97/**
98 * getnstimeofday - Returns the time of day in a timespec
99 * @ts: pointer to the timespec to be set
100 *
101 * Returns the time of day in a timespec.
102 */
103void getnstimeofday(struct timespec *ts)
104{
105 __get_realtime_clock_ts(ts);
106}
107
108EXPORT_SYMBOL(getnstimeofday);
109
110/**
111 * do_gettimeofday - Returns the time of day in a timeval
112 * @tv: pointer to the timeval to be set
113 *
114 * NOTE: Users should be converted to using get_realtime_clock_ts()
115 */
116void do_gettimeofday(struct timeval *tv)
117{
118 struct timespec now;
119
120 __get_realtime_clock_ts(&now);
121 tv->tv_sec = now.tv_sec;
122 tv->tv_usec = now.tv_nsec/1000;
123}
124
125EXPORT_SYMBOL(do_gettimeofday);
126/**
127 * do_settimeofday - Sets the time of day
128 * @tv: pointer to the timespec variable containing the new time
129 *
 130 * Sets the time of day to the new time, updates NTP and notifies hrtimers
131 */
132int do_settimeofday(struct timespec *tv)
133{
134 unsigned long flags;
135 time_t wtm_sec, sec = tv->tv_sec;
136 long wtm_nsec, nsec = tv->tv_nsec;
137
138 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
139 return -EINVAL;
140
141 write_seqlock_irqsave(&xtime_lock, flags);
142
143 nsec -= __get_nsec_offset();
144
145 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
146 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
147
148 set_normalized_timespec(&xtime, sec, nsec);
149 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
150
151 clock->error = 0;
152 ntp_clear();
153
154 update_vsyscall(&xtime, clock);
155
156 write_sequnlock_irqrestore(&xtime_lock, flags);
157
158 /* signal hrtimers about time change */
159 clock_was_set();
160
161 return 0;
162}
163
164EXPORT_SYMBOL(do_settimeofday);
165
166/**
167 * change_clocksource - Swaps clocksources if a new one is available
168 *
169 * Accumulates current time interval and initializes new clocksource
170 */
171static void change_clocksource(void)
172{
173 struct clocksource *new;
174 cycle_t now;
175 u64 nsec;
176
177 new = clocksource_get_next();
178
179 if (clock == new)
180 return;
181
182 now = clocksource_read(new);
183 nsec = __get_nsec_offset();
184 timespec_add_ns(&xtime, nsec);
185
186 clock = new;
187 clock->cycle_last = now;
188
189 clock->error = 0;
190 clock->xtime_nsec = 0;
191 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
192
193 tick_clock_notify();
194
195 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
196 clock->name);
197}
198#else
199static inline void change_clocksource(void) { }
200#endif
201
202/**
203 * timekeeping_is_continuous - check to see if timekeeping is free running
204 */
205int timekeeping_is_continuous(void)
206{
207 unsigned long seq;
208 int ret;
209
210 do {
211 seq = read_seqbegin(&xtime_lock);
212
213 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
214
215 } while (read_seqretry(&xtime_lock, seq));
216
217 return ret;
218}
219
220/**
221 * read_persistent_clock - Return time in seconds from the persistent clock.
222 *
223 * Weak dummy function for arches that do not yet support it.
224 * Returns seconds from epoch using the battery backed persistent clock.
225 * Returns zero if unsupported.
226 *
227 * XXX - Do be sure to remove it once all arches implement it.
228 */
229unsigned long __attribute__((weak)) read_persistent_clock(void)
230{
231 return 0;
232}
233
234/*
235 * timekeeping_init - Initializes the clocksource and common timekeeping values
236 */
237void __init timekeeping_init(void)
238{
239 unsigned long flags;
240 unsigned long sec = read_persistent_clock();
241
242 write_seqlock_irqsave(&xtime_lock, flags);
243
244 ntp_clear();
245
246 clock = clocksource_get_next();
247 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
248 clock->cycle_last = clocksource_read(clock);
249
250 xtime.tv_sec = sec;
251 xtime.tv_nsec = 0;
252 set_normalized_timespec(&wall_to_monotonic,
253 -xtime.tv_sec, -xtime.tv_nsec);
254
255 write_sequnlock_irqrestore(&xtime_lock, flags);
256}
257
258/* flag for if timekeeping is suspended */
259static int timekeeping_suspended;
260/* time in seconds when suspend began */
261static unsigned long timekeeping_suspend_time;
262
263/**
264 * timekeeping_resume - Resumes the generic timekeeping subsystem.
265 * @dev: unused
266 *
267 * This is for the generic clocksource timekeeping.
268 * xtime/wall_to_monotonic/jiffies/etc are
269 * still managed by arch specific suspend/resume code.
270 */
271static int timekeeping_resume(struct sys_device *dev)
272{
273 unsigned long flags;
274 unsigned long now = read_persistent_clock();
275
276 write_seqlock_irqsave(&xtime_lock, flags);
277
278 if (now && (now > timekeeping_suspend_time)) {
279 unsigned long sleep_length = now - timekeeping_suspend_time;
280
281 xtime.tv_sec += sleep_length;
282 wall_to_monotonic.tv_sec -= sleep_length;
283 }
284 /* re-base the last cycle value */
285 clock->cycle_last = clocksource_read(clock);
286 clock->error = 0;
287 timekeeping_suspended = 0;
288 write_sequnlock_irqrestore(&xtime_lock, flags);
289
290 touch_softlockup_watchdog();
291
292 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
293
294 /* Resume hrtimers */
295 hres_timers_resume();
296
297 return 0;
298}
299
300static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
301{
302 unsigned long flags;
303
304 write_seqlock_irqsave(&xtime_lock, flags);
305 timekeeping_suspended = 1;
306 timekeeping_suspend_time = read_persistent_clock();
307 write_sequnlock_irqrestore(&xtime_lock, flags);
308
309 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
310
311 return 0;
312}
313
314/* sysfs resume/suspend bits for timekeeping */
315static struct sysdev_class timekeeping_sysclass = {
316 .resume = timekeeping_resume,
317 .suspend = timekeeping_suspend,
318 set_kset_name("timekeeping"),
319};
320
321static struct sys_device device_timer = {
322 .id = 0,
323 .cls = &timekeeping_sysclass,
324};
325
326static int __init timekeeping_init_device(void)
327{
328 int error = sysdev_class_register(&timekeeping_sysclass);
329 if (!error)
330 error = sysdev_register(&device_timer);
331 return error;
332}
333
334device_initcall(timekeeping_init_device);
335
336/*
337 * If the error is already larger, we look ahead even further
338 * to compensate for late or lost adjustments.
339 */
340static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
341 s64 *offset)
342{
343 s64 tick_error, i;
344 u32 look_ahead, adj;
345 s32 error2, mult;
346
347 /*
348 * Use the current error value to determine how much to look ahead.
349 * The larger the error the slower we adjust for it to avoid problems
350 * with losing too many ticks, otherwise we would overadjust and
351 * produce an even larger error. The smaller the adjustment the
352 * faster we try to adjust for it, as lost ticks can do less harm
 353	 * here. This is tuned so that an error of about 1 msec is adjusted
354 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
355 */
356 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
357 error2 = abs(error2);
358 for (look_ahead = 0; error2 > 0; look_ahead++)
359 error2 >>= 2;
360
361 /*
362 * Now calculate the error in (1 << look_ahead) ticks, but first
363 * remove the single look ahead already included in the error.
364 */
365 tick_error = current_tick_length() >>
366 (TICK_LENGTH_SHIFT - clock->shift + 1);
367 tick_error -= clock->xtime_interval >> 1;
368 error = ((error - tick_error) >> look_ahead) + tick_error;
369
370 /* Finally calculate the adjustment shift value. */
371 i = *interval;
372 mult = 1;
373 if (error < 0) {
374 error = -error;
375 *interval = -*interval;
376 *offset = -*offset;
377 mult = -1;
378 }
379 for (adj = 0; error > i; adj++)
380 error >>= 1;
381
382 *interval <<= adj;
383 *offset <<= adj;
384 return mult << adj;
385}
386
387/*
388 * Adjust the multiplier to reduce the error value,
389 * this is optimized for the most common adjustments of -1,0,1,
390 * for other values we can do a bit more work.
391 */
392static void clocksource_adjust(struct clocksource *clock, s64 offset)
393{
394 s64 error, interval = clock->cycle_interval;
395 int adj;
396
397 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
398 if (error > interval) {
399 error >>= 2;
400 if (likely(error <= interval))
401 adj = 1;
402 else
403 adj = clocksource_bigadjust(error, &interval, &offset);
404 } else if (error < -interval) {
405 error >>= 2;
406 if (likely(error >= -interval)) {
407 adj = -1;
408 interval = -interval;
409 offset = -offset;
410 } else
411 adj = clocksource_bigadjust(error, &interval, &offset);
412 } else
413 return;
414
415 clock->mult += adj;
416 clock->xtime_interval += interval;
417 clock->xtime_nsec -= offset;
418 clock->error -= (interval - offset) <<
419 (TICK_LENGTH_SHIFT - clock->shift);
420}
421
422/**
423 * update_wall_time - Uses the current clocksource to increment the wall time
424 *
425 * Called from the timer interrupt, must hold a write on xtime_lock.
426 */
427void update_wall_time(void)
428{
429 cycle_t offset;
430
431 /* Make sure we're fully resumed: */
432 if (unlikely(timekeeping_suspended))
433 return;
434
435#ifdef CONFIG_GENERIC_TIME
436 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
437#else
438 offset = clock->cycle_interval;
439#endif
440 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
441
442 /* normally this loop will run just once, however in the
443 * case of lost or late ticks, it will accumulate correctly.
444 */
445 while (offset >= clock->cycle_interval) {
446 /* accumulate one interval */
447 clock->xtime_nsec += clock->xtime_interval;
448 clock->cycle_last += clock->cycle_interval;
449 offset -= clock->cycle_interval;
450
451 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
452 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
453 xtime.tv_sec++;
454 second_overflow();
455 }
456
457 /* interpolator bits */
458 time_interpolator_update(clock->xtime_interval
459 >> clock->shift);
460
461 /* accumulate error between NTP and clock interval */
462 clock->error += current_tick_length();
463 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
464 }
465
466 /* correct the clock when NTP error is too big */
467 clocksource_adjust(clock, offset);
468
469 /* store full nanoseconds into xtime */
470 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
471 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
472
473 /* check to see if there is a new clocksource to use */
474 change_clocksource();
475 update_vsyscall(&xtime, clock);
476}
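
Most of the new timekeeping.c turns on one calculation: read the clocksource, mask the cycle delta against cycle_last, and scale it to nanoseconds, where cyc2ns() is essentially (cycles * mult) >> shift. A self-contained sketch of __get_nsec_offset() with a fake free-running counter; the mult/shift values are invented for the example (a 1 MHz counter, so one cycle is 1000 ns):

#include <stdint.h>
#include <stdio.h>

/* trimmed-down clocksource: just what the offset calculation needs */
struct fake_clocksource {
        uint64_t (*read)(void);
        uint64_t cycle_last;            /* counter value at last update */
        uint64_t mask;                  /* handles counter wrap-around */
        uint32_t mult;                  /* cycles -> ns scale factor ... */
        uint32_t shift;                 /* ... ns = (cycles * mult) >> shift */
};

static uint64_t counter;                /* pretend hardware counter */
static uint64_t read_counter(void) { return counter; }

/* __get_nsec_offset() equivalent: ns elapsed since cycle_last */
static int64_t nsec_offset(struct fake_clocksource *cs)
{
        uint64_t now = cs->read();
        uint64_t delta = (now - cs->cycle_last) & cs->mask;
        return (int64_t)((delta * cs->mult) >> cs->shift);
}

int main(void)
{
        /* 1 MHz counter: 1 cycle = 1000 ns, so mult = 1000 << shift */
        struct fake_clocksource cs = {
                .read = read_counter,
                .cycle_last = 0,
                .mask = UINT64_MAX,
                .mult = 1000u << 10,
                .shift = 10,
        };

        counter = 2500;                 /* 2500 cycles have elapsed */
        printf("offset = %lld ns\n", (long long)nsec_offset(&cs)); /* 2500000 */
        return 0;
}
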
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 59df5e8555a8..b734ca4bc75e 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
38 38
39static void print_name_offset(struct seq_file *m, void *sym) 39static void print_name_offset(struct seq_file *m, void *sym)
40{ 40{
41 unsigned long addr = (unsigned long)sym; 41 char symname[KSYM_NAME_LEN+1];
42 char namebuf[KSYM_NAME_LEN+1]; 42
43 unsigned long size, offset; 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%p>", sym);
45 else
46 SEQ_printf(m, "%s", symname);
52} 47}
53 48
54static void 49static void
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1bc4882e28e0..868f1bceb07f 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
257 257
258static void print_name_offset(struct seq_file *m, unsigned long addr) 258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{ 259{
260 char namebuf[KSYM_NAME_LEN+1]; 260 char symname[KSYM_NAME_LEN+1];
261 unsigned long size, offset; 261
262 const char *sym_name; 262 if (lookup_symbol_name(addr, symname) < 0)
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr); 263 seq_printf(m, "<%p>", (void *)addr);
264 else
265 seq_printf(m, "%s", symname);
270} 266}
271 267
272static int tstats_show(struct seq_file *m, void *v) 268static int tstats_show(struct seq_file *m, void *v)
diff --git a/kernel/timer.c b/kernel/timer.c
index b22bd39740dd..7a6448340f90 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, kernel timekeeping, basic process system calls 4 * Kernel internal timers, basic process system calls
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
@@ -74,7 +74,7 @@ struct tvec_t_base_s {
74 tvec_t tv3; 74 tvec_t tv3;
75 tvec_t tv4; 75 tvec_t tv4;
76 tvec_t tv5; 76 tvec_t tv5;
77} ____cacheline_aligned_in_smp; 77} ____cacheline_aligned;
78 78
79typedef struct tvec_t_base_s tvec_base_t; 79typedef struct tvec_t_base_s tvec_base_t;
80 80
@@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases;
82EXPORT_SYMBOL(boot_tvec_bases); 82EXPORT_SYMBOL(boot_tvec_bases);
83static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 83static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
84 84
85/*
 86 * Note that all tvec_bases are 2 byte aligned and the lower bit of
87 * base in timer_list is guaranteed to be zero. Use the LSB for
88 * the new flag to indicate whether the timer is deferrable
89 */
90#define TBASE_DEFERRABLE_FLAG (0x1)
91
92/* Functions below help us manage 'deferrable' flag */
93static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
94{
95 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
96}
97
98static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
99{
100 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
101}
102
103static inline void timer_set_deferrable(struct timer_list *timer)
104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
106 TBASE_DEFERRABLE_FLAG));
107}
108
109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) |
113 tbase_get_deferrable(timer->base));
114}
115
85/** 116/**
86 * __round_jiffies - function to round jiffies to a full second 117 * __round_jiffies - function to round jiffies to a full second
87 * @j: the time in (absolute) jiffies that should be rounded 118 * @j: the time in (absolute) jiffies that should be rounded
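
The deferrable-timer hunks rely on pointer tagging: tvec_base_t is at least 2-byte aligned, so bit 0 of timer->base is always zero for a real base pointer and can carry the deferrable flag instead. Reading the base means masking the flag off, and re-pointing the timer must preserve it, which is what tbase_get_base()/timer_set_base() do above. A stand-alone sketch of the same trick on an ordinary aligned struct:

#include <stdint.h>
#include <stdio.h>

#define DEFERRABLE_FLAG 0x1UL           /* lives in bit 0 of the pointer */

/* any type with alignment >= 2 leaves bit 0 of its address free */
struct base { int dummy; };

struct timer { struct base *base; };    /* pointer doubles as flag carrier */

static unsigned long get_deferrable(struct timer *t)
{
        return (uintptr_t)t->base & DEFERRABLE_FLAG;
}

static struct base *get_base(struct timer *t)
{
        return (struct base *)((uintptr_t)t->base & ~DEFERRABLE_FLAG);
}

static void set_deferrable(struct timer *t)
{
        t->base = (struct base *)((uintptr_t)t->base | DEFERRABLE_FLAG);
}

/* re-point the timer at a new base while keeping the flag bit intact */
static void set_base(struct timer *t, struct base *new_base)
{
        t->base = (struct base *)((uintptr_t)new_base | get_deferrable(t));
}

int main(void)
{
        struct base a, b;
        struct timer t = { .base = &a };

        set_deferrable(&t);
        set_base(&t, &b);               /* migrate; the flag must survive */

        printf("deferrable=%lu, base ok=%d\n",
               get_deferrable(&t), get_base(&t) == &b);   /* 1, 1 */
        return 0;
}
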
@@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer)
295} 326}
296EXPORT_SYMBOL(init_timer); 327EXPORT_SYMBOL(init_timer);
297 328
329void fastcall init_timer_deferrable(struct timer_list *timer)
330{
331 init_timer(timer);
332 timer_set_deferrable(timer);
333}
334EXPORT_SYMBOL(init_timer_deferrable);
335
298static inline void detach_timer(struct timer_list *timer, 336static inline void detach_timer(struct timer_list *timer,
299 int clear_pending) 337 int clear_pending)
300{ 338{
@@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
325 tvec_base_t *base; 363 tvec_base_t *base;
326 364
327 for (;;) { 365 for (;;) {
328 base = timer->base; 366 tvec_base_t *prelock_base = timer->base;
367 base = tbase_get_base(prelock_base);
329 if (likely(base != NULL)) { 368 if (likely(base != NULL)) {
330 spin_lock_irqsave(&base->lock, *flags); 369 spin_lock_irqsave(&base->lock, *flags);
331 if (likely(base == timer->base)) 370 if (likely(prelock_base == timer->base))
332 return base; 371 return base;
333 /* The timer has migrated to another CPU */ 372 /* The timer has migrated to another CPU */
334 spin_unlock_irqrestore(&base->lock, *flags); 373 spin_unlock_irqrestore(&base->lock, *flags);
@@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
365 */ 404 */
366 if (likely(base->running_timer != timer)) { 405 if (likely(base->running_timer != timer)) {
367 /* See the comment in lock_timer_base() */ 406 /* See the comment in lock_timer_base() */
368 timer->base = NULL; 407 timer_set_base(timer, NULL);
369 spin_unlock(&base->lock); 408 spin_unlock(&base->lock);
370 base = new_base; 409 base = new_base;
371 spin_lock(&base->lock); 410 spin_lock(&base->lock);
372 timer->base = base; 411 timer_set_base(timer, base);
373 } 412 }
374 } 413 }
375 414
@@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
397 timer_stats_timer_set_start_info(timer); 436 timer_stats_timer_set_start_info(timer);
398 BUG_ON(timer_pending(timer) || !timer->function); 437 BUG_ON(timer_pending(timer) || !timer->function);
399 spin_lock_irqsave(&base->lock, flags); 438 spin_lock_irqsave(&base->lock, flags);
400 timer->base = base; 439 timer_set_base(timer, base);
401 internal_add_timer(base, timer); 440 internal_add_timer(base, timer);
402 spin_unlock_irqrestore(&base->lock, flags); 441 spin_unlock_irqrestore(&base->lock, flags);
403} 442}
@@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
550 * don't have to detach them individually. 589 * don't have to detach them individually.
551 */ 590 */
552 list_for_each_entry_safe(timer, tmp, &tv_list, entry) { 591 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
553 BUG_ON(timer->base != base); 592 BUG_ON(tbase_get_base(timer->base) != base);
554 internal_add_timer(base, timer); 593 internal_add_timer(base, timer);
555 } 594 }
556 595
@@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base)
590 void (*fn)(unsigned long); 629 void (*fn)(unsigned long);
591 unsigned long data; 630 unsigned long data;
592 631
593 timer = list_entry(head->next,struct timer_list,entry); 632 timer = list_first_entry(head, struct timer_list,entry);
594 fn = timer->function; 633 fn = timer->function;
595 data = timer->data; 634 data = timer->data;
596 635
@@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
636 index = slot = timer_jiffies & TVR_MASK; 675 index = slot = timer_jiffies & TVR_MASK;
637 do { 676 do {
638 list_for_each_entry(nte, base->tv1.vec + slot, entry) { 677 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
678 if (tbase_get_deferrable(nte->base))
679 continue;
680
639 found = 1; 681 found = 1;
640 expires = nte->expires; 682 expires = nte->expires;
641 /* Look at the cascade bucket(s)? */ 683 /* Look at the cascade bucket(s)? */
@@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void)
752 794
753#endif 795#endif
754 796
755/******************************************************************/
756
757/*
758 * The current time
759 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
760 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
761 * at zero at system boot time, so wall_to_monotonic will be negative,
762 * however, we will ALWAYS keep the tv_nsec part positive so we can use
763 * the usual normalization.
764 */
765struct timespec xtime __attribute__ ((aligned (16)));
766struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
767
768EXPORT_SYMBOL(xtime);
769
770
771/* XXX - all of this timekeeping code should be later moved to time.c */
772#include <linux/clocksource.h>
773static struct clocksource *clock; /* pointer to current clocksource */
774
775#ifdef CONFIG_GENERIC_TIME
776/**
777 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
778 *
779 * private function, must hold xtime_lock lock when being
780 * called. Returns the number of nanoseconds since the
781 * last call to update_wall_time() (adjusted by NTP scaling)
782 */
783static inline s64 __get_nsec_offset(void)
784{
785 cycle_t cycle_now, cycle_delta;
786 s64 ns_offset;
787
788 /* read clocksource: */
789 cycle_now = clocksource_read(clock);
790
791 /* calculate the delta since the last update_wall_time: */
792 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
793
794 /* convert to nanoseconds: */
795 ns_offset = cyc2ns(clock, cycle_delta);
796
797 return ns_offset;
798}
799
800/**
801 * __get_realtime_clock_ts - Returns the time of day in a timespec
802 * @ts: pointer to the timespec to be set
803 *
804 * Returns the time of day in a timespec. Used by
805 * do_gettimeofday() and get_realtime_clock_ts().
806 */
807static inline void __get_realtime_clock_ts(struct timespec *ts)
808{
809 unsigned long seq;
810 s64 nsecs;
811
812 do {
813 seq = read_seqbegin(&xtime_lock);
814
815 *ts = xtime;
816 nsecs = __get_nsec_offset();
817
818 } while (read_seqretry(&xtime_lock, seq));
819
820 timespec_add_ns(ts, nsecs);
821}
822
823/**
824 * getnstimeofday - Returns the time of day in a timespec
825 * @ts: pointer to the timespec to be set
826 *
827 * Returns the time of day in a timespec.
828 */
829void getnstimeofday(struct timespec *ts)
830{
831 __get_realtime_clock_ts(ts);
832}
833
834EXPORT_SYMBOL(getnstimeofday);
835
836/**
837 * do_gettimeofday - Returns the time of day in a timeval
838 * @tv: pointer to the timeval to be set
839 *
840 * NOTE: Users should be converted to using get_realtime_clock_ts()
841 */
842void do_gettimeofday(struct timeval *tv)
843{
844 struct timespec now;
845
846 __get_realtime_clock_ts(&now);
847 tv->tv_sec = now.tv_sec;
848 tv->tv_usec = now.tv_nsec/1000;
849}
850
851EXPORT_SYMBOL(do_gettimeofday);
852/**
853 * do_settimeofday - Sets the time of day
854 * @tv: pointer to the timespec variable containing the new time
855 *
856 * Sets the time of day to the new time and update NTP and notify hrtimers
857 */
858int do_settimeofday(struct timespec *tv)
859{
860 unsigned long flags;
861 time_t wtm_sec, sec = tv->tv_sec;
862 long wtm_nsec, nsec = tv->tv_nsec;
863
864 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
865 return -EINVAL;
866
867 write_seqlock_irqsave(&xtime_lock, flags);
868
869 nsec -= __get_nsec_offset();
870
871 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
872 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
873
874 set_normalized_timespec(&xtime, sec, nsec);
875 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
876
877 clock->error = 0;
878 ntp_clear();
879
880 update_vsyscall(&xtime, clock);
881
882 write_sequnlock_irqrestore(&xtime_lock, flags);
883
884 /* signal hrtimers about time change */
885 clock_was_set();
886
887 return 0;
888}
889
890EXPORT_SYMBOL(do_settimeofday);
891
892/**
893 * change_clocksource - Swaps clocksources if a new one is available
894 *
895 * Accumulates current time interval and initializes new clocksource
896 */
897static void change_clocksource(void)
898{
899 struct clocksource *new;
900 cycle_t now;
901 u64 nsec;
902
903 new = clocksource_get_next();
904
905 if (clock == new)
906 return;
907
908 now = clocksource_read(new);
909 nsec = __get_nsec_offset();
910 timespec_add_ns(&xtime, nsec);
911
912 clock = new;
913 clock->cycle_last = now;
914
915 clock->error = 0;
916 clock->xtime_nsec = 0;
917 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
918
919 tick_clock_notify();
920
921 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
922 clock->name);
923}
924#else
925static inline void change_clocksource(void) { }
926#endif
927
928/**
929 * timekeeping_is_continuous - check to see if timekeeping is free running
930 */
931int timekeeping_is_continuous(void)
932{
933 unsigned long seq;
934 int ret;
935
936 do {
937 seq = read_seqbegin(&xtime_lock);
938
939 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
940
941 } while (read_seqretry(&xtime_lock, seq));
942
943 return ret;
944}
945
946/**
947 * read_persistent_clock - Return time in seconds from the persistent clock.
948 *
949 * Weak dummy function for arches that do not yet support it.
950 * Returns seconds from epoch using the battery backed persistent clock.
951 * Returns zero if unsupported.
952 *
953 * XXX - Do be sure to remove it once all arches implement it.
954 */
955unsigned long __attribute__((weak)) read_persistent_clock(void)
956{
957 return 0;
958}
959
960/*
961 * timekeeping_init - Initializes the clocksource and common timekeeping values
962 */
963void __init timekeeping_init(void)
964{
965 unsigned long flags;
966 unsigned long sec = read_persistent_clock();
967
968 write_seqlock_irqsave(&xtime_lock, flags);
969
970 ntp_clear();
971
972 clock = clocksource_get_next();
973 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
974 clock->cycle_last = clocksource_read(clock);
975
976 xtime.tv_sec = sec;
977 xtime.tv_nsec = 0;
978 set_normalized_timespec(&wall_to_monotonic,
979 -xtime.tv_sec, -xtime.tv_nsec);
980
981 write_sequnlock_irqrestore(&xtime_lock, flags);
982}
983
984/* flag for if timekeeping is suspended */
985static int timekeeping_suspended;
986/* time in seconds when suspend began */
987static unsigned long timekeeping_suspend_time;
988
989/**
990 * timekeeping_resume - Resumes the generic timekeeping subsystem.
991 * @dev: unused
992 *
993 * This is for the generic clocksource timekeeping.
994 * xtime/wall_to_monotonic/jiffies/etc are
995 * still managed by arch specific suspend/resume code.
996 */
997static int timekeeping_resume(struct sys_device *dev)
998{
999 unsigned long flags;
1000 unsigned long now = read_persistent_clock();
1001
1002 write_seqlock_irqsave(&xtime_lock, flags);
1003
1004 if (now && (now > timekeeping_suspend_time)) {
1005 unsigned long sleep_length = now - timekeeping_suspend_time;
1006
1007 xtime.tv_sec += sleep_length;
1008 wall_to_monotonic.tv_sec -= sleep_length;
1009 }
1010 /* re-base the last cycle value */
1011 clock->cycle_last = clocksource_read(clock);
1012 clock->error = 0;
1013 timekeeping_suspended = 0;
1014 write_sequnlock_irqrestore(&xtime_lock, flags);
1015
1016 touch_softlockup_watchdog();
1017
1018 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
1019
1020 /* Resume hrtimers */
1021 hres_timers_resume();
1022
1023 return 0;
1024}
1025
1026static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
1027{
1028 unsigned long flags;
1029
1030 write_seqlock_irqsave(&xtime_lock, flags);
1031 timekeeping_suspended = 1;
1032 timekeeping_suspend_time = read_persistent_clock();
1033 write_sequnlock_irqrestore(&xtime_lock, flags);
1034
1035 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
1036
1037 return 0;
1038}
1039
1040/* sysfs resume/suspend bits for timekeeping */
1041static struct sysdev_class timekeeping_sysclass = {
1042 .resume = timekeeping_resume,
1043 .suspend = timekeeping_suspend,
1044 set_kset_name("timekeeping"),
1045};
1046
1047static struct sys_device device_timer = {
1048 .id = 0,
1049 .cls = &timekeeping_sysclass,
1050};
1051
1052static int __init timekeeping_init_device(void)
1053{
1054 int error = sysdev_class_register(&timekeeping_sysclass);
1055 if (!error)
1056 error = sysdev_register(&device_timer);
1057 return error;
1058}
1059
1060device_initcall(timekeeping_init_device);
1061
1062/*
1063 * If the error is already larger, we look ahead even further
1064 * to compensate for late or lost adjustments.
1065 */
1066static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
1067 s64 *offset)
1068{
1069 s64 tick_error, i;
1070 u32 look_ahead, adj;
1071 s32 error2, mult;
1072
1073 /*
1074 * Use the current error value to determine how much to look ahead.
1075 * The larger the error the slower we adjust for it to avoid problems
1076 * with losing too many ticks, otherwise we would overadjust and
1077 * produce an even larger error. The smaller the adjustment the
1078 * faster we try to adjust for it, as lost ticks can do less harm
1079 * here. This is tuned so that an error of about 1 msec is adusted
1080 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1081 */
1082 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
1083 error2 = abs(error2);
1084 for (look_ahead = 0; error2 > 0; look_ahead++)
1085 error2 >>= 2;
1086
1087 /*
1088 * Now calculate the error in (1 << look_ahead) ticks, but first
1089 * remove the single look ahead already included in the error.
1090 */
1091 tick_error = current_tick_length() >>
1092 (TICK_LENGTH_SHIFT - clock->shift + 1);
1093 tick_error -= clock->xtime_interval >> 1;
1094 error = ((error - tick_error) >> look_ahead) + tick_error;
1095
1096 /* Finally calculate the adjustment shift value. */
1097 i = *interval;
1098 mult = 1;
1099 if (error < 0) {
1100 error = -error;
1101 *interval = -*interval;
1102 *offset = -*offset;
1103 mult = -1;
1104 }
1105 for (adj = 0; error > i; adj++)
1106 error >>= 1;
1107
1108 *interval <<= adj;
1109 *offset <<= adj;
1110 return mult << adj;
1111}
1112
1113/*
1114 * Adjust the multiplier to reduce the error value,
1115 * this is optimized for the most common adjustments of -1,0,1,
1116 * for other values we can do a bit more work.
1117 */
1118static void clocksource_adjust(struct clocksource *clock, s64 offset)
1119{
1120 s64 error, interval = clock->cycle_interval;
1121 int adj;
1122
1123 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1124 if (error > interval) {
1125 error >>= 2;
1126 if (likely(error <= interval))
1127 adj = 1;
1128 else
1129 adj = clocksource_bigadjust(error, &interval, &offset);
1130 } else if (error < -interval) {
1131 error >>= 2;
1132 if (likely(error >= -interval)) {
1133 adj = -1;
1134 interval = -interval;
1135 offset = -offset;
1136 } else
1137 adj = clocksource_bigadjust(error, &interval, &offset);
1138 } else
1139 return;
1140
1141 clock->mult += adj;
1142 clock->xtime_interval += interval;
1143 clock->xtime_nsec -= offset;
1144 clock->error -= (interval - offset) <<
1145 (TICK_LENGTH_SHIFT - clock->shift);
1146}
1147
1148/**
1149 * update_wall_time - Uses the current clocksource to increment the wall time
1150 *
1151 * Called from the timer interrupt, must hold a write on xtime_lock.
1152 */
1153static void update_wall_time(void)
1154{
1155 cycle_t offset;
1156
1157 /* Make sure we're fully resumed: */
1158 if (unlikely(timekeeping_suspended))
1159 return;
1160
1161#ifdef CONFIG_GENERIC_TIME
1162 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1163#else
1164 offset = clock->cycle_interval;
1165#endif
1166 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1167
1168	/* Normally this loop will run just once; however, in the
1169	 * case of lost or late ticks, it will accumulate correctly.
1170 */
1171 while (offset >= clock->cycle_interval) {
1172 /* accumulate one interval */
1173 clock->xtime_nsec += clock->xtime_interval;
1174 clock->cycle_last += clock->cycle_interval;
1175 offset -= clock->cycle_interval;
1176
1177 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1178 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
1179 xtime.tv_sec++;
1180 second_overflow();
1181 }
1182
1183 /* interpolator bits */
1184 time_interpolator_update(clock->xtime_interval
1185 >> clock->shift);
1186
1187 /* accumulate error between NTP and clock interval */
1188 clock->error += current_tick_length();
1189 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1190 }
1191
1192 /* correct the clock when NTP error is too big */
1193 clocksource_adjust(clock, offset);
1194
1195 /* store full nanoseconds into xtime */
1196 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
1197 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1198
1199 /* check to see if there is a new clocksource to use */
1200 change_clocksource();
1201 update_vsyscall(&xtime, clock);
1202}
1203
1204/* 797/*
1205 * Called from the timer interrupt handler to charge one tick to the current 798 * Called from the timer interrupt handler to charge one tick to the current
1206 * process. user_tick is 1 if the tick is user time, 0 for system. 799 * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks)
1264} 857}
1265 858
1266/* 859/*
1267 * This read-write spinlock protects us from races in SMP while
1268 * playing with xtime and avenrun.
1269 */
1270__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1271
1272EXPORT_SYMBOL(xtime_lock);
1273
1274/*
1275 * This function runs timers and the timer-tq in bottom half context. 860 * This function runs timers and the timer-tq in bottom half context.
1276 */ 861 */
1277static void run_timer_softirq(struct softirq_action *h) 862static void run_timer_softirq(struct softirq_action *h)
@@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu)
1617 cpu_to_node(cpu)); 1202 cpu_to_node(cpu));
1618 if (!base) 1203 if (!base)
1619 return -ENOMEM; 1204 return -ENOMEM;
1205
1206	/* Make sure that tvec_base is 2-byte aligned */
1207 if (tbase_get_deferrable(base)) {
1208 WARN_ON(1);
1209 kfree(base);
1210 return -ENOMEM;
1211 }
1620 memset(base, 0, sizeof(*base)); 1212 memset(base, 0, sizeof(*base));
1621 per_cpu(tvec_bases, cpu) = base; 1213 per_cpu(tvec_bases, cpu) = base;
1622 } else { 1214 } else {
@@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1656 struct timer_list *timer; 1248 struct timer_list *timer;
1657 1249
1658 while (!list_empty(head)) { 1250 while (!list_empty(head)) {
1659 timer = list_entry(head->next, struct timer_list, entry); 1251 timer = list_first_entry(head, struct timer_list, entry);
1660 detach_timer(timer, 0); 1252 detach_timer(timer, 0);
1661 timer->base = new_base; 1253 timer_set_base(timer, new_base);
1662 internal_add_timer(new_base, timer); 1254 internal_add_timer(new_base, timer);
1663 } 1255 }
1664} 1256}
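
The timekeeping code removed from kernel/timer.c above (and moved into kernel/time/timekeeping.c) centers on the accumulation loop of update_wall_time(): elapsed clocksource cycles are folded into shifted nanoseconds one tick interval at a time, carrying whole seconds into xtime.tv_sec and keeping a sub-shift remainder between calls. The following is a minimal user-space sketch of that idea, not kernel code; FAKE_SHIFT, struct fake_clock and accumulate() are invented names used only for illustration.

/*
 * Minimal sketch of the update_wall_time() accumulation loop.
 * All names here are invented; this is not the kernel implementation.
 */
#include <stdio.h>
#include <stdint.h>

#define FAKE_NSEC_PER_SEC 1000000000ULL
#define FAKE_SHIFT        10	/* fixed-point shift, like clock->shift */

struct fake_clock {
	uint64_t cycle_last;		/* cycle count at the last accumulation */
	uint64_t cycle_interval;	/* clocksource cycles per tick */
	uint64_t xtime_interval;	/* shifted nanoseconds per tick */
	uint64_t xtime_nsec;		/* shifted nanosecond remainder */
};

/* Fold "offset" elapsed cycles into tv_sec/tv_nsec, one tick interval at a time. */
static void accumulate(struct fake_clock *c, uint64_t offset,
		       uint64_t *tv_sec, uint64_t *tv_nsec)
{
	c->xtime_nsec += *tv_nsec << FAKE_SHIFT;

	while (offset >= c->cycle_interval) {
		c->xtime_nsec += c->xtime_interval;
		c->cycle_last += c->cycle_interval;
		offset -= c->cycle_interval;

		if (c->xtime_nsec >= (FAKE_NSEC_PER_SEC << FAKE_SHIFT)) {
			c->xtime_nsec -= FAKE_NSEC_PER_SEC << FAKE_SHIFT;
			(*tv_sec)++;
		}
	}

	/* Store whole nanoseconds back, keep the sub-shift remainder. */
	*tv_nsec = c->xtime_nsec >> FAKE_SHIFT;
	c->xtime_nsec -= *tv_nsec << FAKE_SHIFT;
}

int main(void)
{
	struct fake_clock c = {
		.cycle_interval = 1000,				/* 1000 cycles per tick */
		.xtime_interval = 1000000ULL << FAKE_SHIFT,	/* 1 ms per tick */
	};
	uint64_t sec = 0, nsec = 0;

	/* Pretend 3.5 tick intervals of cycles elapsed since the last update. */
	accumulate(&c, 3500, &sec, &nsec);
	printf("accumulated %llu s, %llu ns\n",
	       (unsigned long long)sec, (unsigned long long)nsec);
	return 0;
}

Keeping xtime_nsec shifted left by the clock's shift is what lets the error between the NTP tick length and the clock interval be accumulated without floating point; clocksource_adjust() then feeds that error back into clock->mult in small steps.
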
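The new tbase_get_deferrable() check added to init_timers_cpu() relies on tvec_base pointers being at least 2-byte aligned, so that bit 0 of the pointer can carry a per-base "deferrable" flag; an odd allocation would be indistinguishable from a flagged pointer, which is exactly what the WARN_ON() plus kfree() rejects. The stand-alone sketch below illustrates that pointer-tagging idea under the same alignment assumption; the names (flagged_base_t, set_deferrable(), get_base(), ...) are invented and this is not the kernel's implementation.

/*
 * Minimal sketch of storing a one-bit flag in the low bit of an
 * aligned pointer.  All names here are invented.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uintptr_t flagged_base_t;

static flagged_base_t set_deferrable(void *base)
{
	/* Only works if the allocation is at least 2-byte aligned. */
	assert(((uintptr_t)base & 1) == 0);
	return (uintptr_t)base | 1;		/* stash the flag in bit 0 */
}

static int get_deferrable(flagged_base_t b)
{
	return (int)(b & 1);
}

static void *get_base(flagged_base_t b)
{
	return (void *)(b & ~(uintptr_t)1);	/* mask the flag back out */
}

int main(void)
{
	long *base = malloc(sizeof(*base));	/* malloc() returns suitably aligned memory */
	if (!base)
		return 1;

	flagged_base_t b = set_deferrable(base);

	printf("deferrable=%d, pointer preserved=%d\n",
	       get_deferrable(b), get_base(b) == (void *)base);

	free(base);
	return 0;
}
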
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 187e2a423878..dd308ba4e03b 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -6,7 +6,6 @@
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h> 7#include <linux/utsname.h>
8#include <linux/mman.h> 8#include <linux/mman.h>
9#include <linux/smp_lock.h>
10#include <linux/notifier.h> 9#include <linux/notifier.h>
11#include <linux/reboot.h> 10#include <linux/reboot.h>
12#include <linux/prctl.h> 11#include <linux/prctl.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index c859164a6993..160c8c5136bd 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
32} 32}
33 33
34/* 34/*
35 * unshare the current process' utsname namespace.
36 * called only in sys_unshare()
37 */
38int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
39{
40 if (unshare_flags & CLONE_NEWUTS) {
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
43
44 *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
45 if (!*new_uts)
46 return -ENOMEM;
47 }
48
49 return 0;
50}
51
52/*
53 * Copy task tsk's utsname namespace, or clone it if flags 35 * Copy task tsk's utsname namespace, or clone it if flags
54 * specifies CLONE_NEWUTS. In latter case, changes to the 36 * specifies CLONE_NEWUTS. In latter case, changes to the
55 * utsname of this process won't be seen by parent, and vice 37 * utsname of this process won't be seen by parent, and vice
56 * versa. 38 * versa.
57 */ 39 */
58int copy_utsname(int flags, struct task_struct *tsk) 40struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns)
59{ 41{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 42 struct uts_namespace *new_ns;
62 int err = 0;
63
64 if (!old_ns)
65 return 0;
66 43
44 BUG_ON(!old_ns);
67 get_uts_ns(old_ns); 45 get_uts_ns(old_ns);
68 46
69 if (!(flags & CLONE_NEWUTS)) 47 if (!(flags & CLONE_NEWUTS))
70 return 0; 48 return old_ns;
71
72 if (!capable(CAP_SYS_ADMIN)) {
73 err = -EPERM;
74 goto out;
75 }
76 49
77 new_ns = clone_uts_ns(old_ns); 50 new_ns = clone_uts_ns(old_ns);
78 if (!new_ns) {
79 err = -ENOMEM;
80 goto out;
81 }
82 tsk->nsproxy->uts_ns = new_ns;
83 51
84out:
85 put_uts_ns(old_ns); 52 put_uts_ns(old_ns);
86 return err; 53 return new_ns;
87} 54}
88 55
89void free_uts_ns(struct kref *kref) 56void free_uts_ns(struct kref *kref)