Diffstat (limited to 'kernel')
47 files changed, 1670 insertions, 1124 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ac6b27abb1ad..642d4277c2ea 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o latency.o nsproxy.o srcu.o | 11 | hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o |
12 | 12 | ||
13 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 13 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
14 | obj-y += time/ | 14 | obj-y += time/ |
diff --git a/kernel/audit.c b/kernel/audit.c
index 4e9d20829681..d13276d41410 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
515 | err = -EPERM; | 515 | err = -EPERM; |
516 | break; | 516 | break; |
517 | case AUDIT_USER: | 517 | case AUDIT_USER: |
518 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 518 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: |
519 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: | 519 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: |
520 | if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) | 520 | if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) |
521 | err = -EPERM; | 521 | err = -EPERM; |
522 | break; | 522 | break; |
@@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
614 | loginuid, sid); | 614 | loginuid, sid); |
615 | break; | 615 | break; |
616 | case AUDIT_USER: | 616 | case AUDIT_USER: |
617 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 617 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: |
618 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: | 618 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: |
619 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 619 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
620 | return 0; | 620 | return 0; |
621 | 621 | ||
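Note: the two audit.c hunks above are whitespace-only; " ... " is GCC's case-range extension, and kernel style wants spaces around it. A minimal sketch of the construct, with illustrative values standing in for the AUDIT_*_USER_MSG* constants from include/linux/audit.h:

    /* Illustrative only: a GCC case range matches every value in [low, high]. */
    static int is_user_audit_msg(int msg_type)
    {
            switch (msg_type) {
            case 1100 ... 1199:     /* stand-in for FIRST_USER_MSG..LAST_USER_MSG */
            case 2100 ... 2999:     /* stand-in for the second user-message range */
                    return 1;
            default:
                    return 0;
            }
    }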
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d240349cbf0f..88b416dfbc72 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -42,7 +42,6 @@ | |||
42 | #include <linux/seq_file.h> | 42 | #include <linux/seq_file.h> |
43 | #include <linux/security.h> | 43 | #include <linux/security.h> |
44 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
45 | #include <linux/smp_lock.h> | ||
46 | #include <linux/spinlock.h> | 45 | #include <linux/spinlock.h> |
47 | #include <linux/stat.h> | 46 | #include <linux/stat.h> |
48 | #include <linux/string.h> | 47 | #include <linux/string.h> |
@@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
822 | return -EACCES; | 821 | return -EACCES; |
823 | 822 | ||
824 | trialcs = *cs; | 823 | trialcs = *cs; |
825 | retval = cpulist_parse(buf, trialcs.cpus_allowed); | 824 | |
826 | if (retval < 0) | 825 | /* |
827 | return retval; | 826 | * We allow a cpuset's cpus_allowed to be empty; if it has attached |
827 | * tasks, we'll catch it later when we validate the change and return | ||
828 | * -ENOSPC. | ||
829 | */ | ||
830 | if (!buf[0] || (buf[0] == '\n' && !buf[1])) { | ||
831 | cpus_clear(trialcs.cpus_allowed); | ||
832 | } else { | ||
833 | retval = cpulist_parse(buf, trialcs.cpus_allowed); | ||
834 | if (retval < 0) | ||
835 | return retval; | ||
836 | } | ||
828 | cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); | 837 | cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); |
829 | if (cpus_empty(trialcs.cpus_allowed)) | 838 | /* cpus_allowed cannot be empty for a cpuset with attached tasks. */ |
839 | if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed)) | ||
830 | return -ENOSPC; | 840 | return -ENOSPC; |
831 | retval = validate_change(cs, &trialcs); | 841 | retval = validate_change(cs, &trialcs); |
832 | if (retval < 0) | 842 | if (retval < 0) |
@@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
919 | return -EACCES; | 929 | return -EACCES; |
920 | 930 | ||
921 | trialcs = *cs; | 931 | trialcs = *cs; |
922 | retval = nodelist_parse(buf, trialcs.mems_allowed); | 932 | |
923 | if (retval < 0) | 933 | /* |
924 | goto done; | 934 | * We allow a cpuset's mems_allowed to be empty; if it has attached |
935 | * tasks, we'll catch it later when we validate the change and return | ||
936 | * -ENOSPC. | ||
937 | */ | ||
938 | if (!buf[0] || (buf[0] == '\n' && !buf[1])) { | ||
939 | nodes_clear(trialcs.mems_allowed); | ||
940 | } else { | ||
941 | retval = nodelist_parse(buf, trialcs.mems_allowed); | ||
942 | if (retval < 0) | ||
943 | goto done; | ||
944 | } | ||
925 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); | 945 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); |
926 | oldmem = cs->mems_allowed; | 946 | oldmem = cs->mems_allowed; |
927 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { | 947 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { |
928 | retval = 0; /* Too easy - nothing to do */ | 948 | retval = 0; /* Too easy - nothing to do */ |
929 | goto done; | 949 | goto done; |
930 | } | 950 | } |
931 | if (nodes_empty(trialcs.mems_allowed)) { | 951 | /* mems_allowed cannot be empty for a cpuset with attached tasks. */ |
952 | if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) { | ||
932 | retval = -ENOSPC; | 953 | retval = -ENOSPC; |
933 | goto done; | 954 | goto done; |
934 | } | 955 | } |
@@ -2200,10 +2221,6 @@ void cpuset_fork(struct task_struct *child) | |||
2200 | * it is holding that mutex while calling check_for_release(), | 2221 | * it is holding that mutex while calling check_for_release(), |
2201 | * which calls kmalloc(), so can't be called holding callback_mutex(). | 2222 | * which calls kmalloc(), so can't be called holding callback_mutex(). |
2202 | * | 2223 | * |
2203 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
2204 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
2205 | * mess with it, or task is a failed fork, never visible to attach_task. | ||
2206 | * | ||
2207 | * the_top_cpuset_hack: | 2224 | * the_top_cpuset_hack: |
2208 | * | 2225 | * |
2209 | * Set the exiting tasks cpuset to the root cpuset (top_cpuset). | 2226 | * Set the exiting tasks cpuset to the root cpuset (top_cpuset). |
@@ -2242,8 +2259,10 @@ void cpuset_exit(struct task_struct *tsk) | |||
2242 | { | 2259 | { |
2243 | struct cpuset *cs; | 2260 | struct cpuset *cs; |
2244 | 2261 | ||
2262 | task_lock(current); | ||
2245 | cs = tsk->cpuset; | 2263 | cs = tsk->cpuset; |
2246 | tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ | 2264 | tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ |
2265 | task_unlock(current); | ||
2247 | 2266 | ||
2248 | if (notify_on_release(cs)) { | 2267 | if (notify_on_release(cs)) { |
2249 | char *pathbuf = NULL; | 2268 | char *pathbuf = NULL; |
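Note: the cpuset.c change lets a task-less cpuset have its cpus or mems cleared by writing an empty string (or a lone newline); the -ENOSPC rejection now applies only while tasks are attached. It also takes task_lock() around the tsk->cpuset handoff in cpuset_exit(). A userspace sketch of the newly allowed write, assuming a cpuset filesystem mounted at /dev/cpuset and an empty child cpuset named "foo" (both paths are illustrative):

    #include <fcntl.h>
    #include <unistd.h>

    /* Clear foo's cpus_allowed; only succeeds once no tasks are attached. */
    static int clear_cpus(void)
    {
            int fd, ret;

            fd = open("/dev/cpuset/foo/cpus", O_WRONLY);
            if (fd < 0)
                    return -1;
            ret = write(fd, "\n", 1);   /* the old code handed this straight to cpulist_parse() */
            close(fd);
            return ret < 0 ? -1 : 0;
    }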
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
new file mode 100644
index 000000000000..0d98827887a7
--- /dev/null
+++ b/kernel/die_notifier.c
@@ -0,0 +1,38 @@ | |||
1 | |||
2 | #include <linux/module.h> | ||
3 | #include <linux/notifier.h> | ||
4 | #include <linux/vmalloc.h> | ||
5 | #include <linux/kdebug.h> | ||
6 | |||
7 | |||
8 | static ATOMIC_NOTIFIER_HEAD(die_chain); | ||
9 | |||
10 | int notify_die(enum die_val val, const char *str, | ||
11 | struct pt_regs *regs, long err, int trap, int sig) | ||
12 | { | ||
13 | struct die_args args = { | ||
14 | .regs = regs, | ||
15 | .str = str, | ||
16 | .err = err, | ||
17 | .trapnr = trap, | ||
18 | .signr = sig, | ||
19 | |||
20 | }; | ||
21 | |||
22 | return atomic_notifier_call_chain(&die_chain, val, &args); | ||
23 | } | ||
24 | |||
25 | int register_die_notifier(struct notifier_block *nb) | ||
26 | { | ||
27 | vmalloc_sync_all(); | ||
28 | return atomic_notifier_chain_register(&die_chain, nb); | ||
29 | } | ||
30 | EXPORT_SYMBOL_GPL(register_die_notifier); | ||
31 | |||
32 | int unregister_die_notifier(struct notifier_block *nb) | ||
33 | { | ||
34 | return atomic_notifier_chain_unregister(&die_chain, nb); | ||
35 | } | ||
36 | EXPORT_SYMBOL_GPL(unregister_die_notifier); | ||
37 | |||
38 | |||
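Note: the new kernel/die_notifier.c gives the die notifier chain a single generic home (register/unregister plus notify_die()) rather than per-architecture copies. A hedged sketch of a module hooking that chain; DIE_OOPS comes from the architecture's enum die_val, so treat the value checked here as an assumption:

    #include <linux/module.h>
    #include <linux/notifier.h>
    #include <linux/kdebug.h>

    static int my_die_handler(struct notifier_block *nb, unsigned long val,
                              void *data)
    {
            struct die_args *args = data;

            if (val == DIE_OOPS)    /* arch-defined die_val; illustrative check */
                    printk(KERN_ERR "oops (%s), err=%ld\n", args->str, args->err);
            return NOTIFY_DONE;
    }

    static struct notifier_block my_die_nb = {
            .notifier_call = my_die_handler,
    };

    static int __init my_init(void)
    {
            return register_die_notifier(&my_die_nb);
    }

    static void __exit my_exit(void)
    {
            unregister_die_notifier(&my_die_nb);
    }

    module_init(my_init);
    module_exit(my_exit);
    MODULE_LICENSE("GPL");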
diff --git a/kernel/exit.c b/kernel/exit.c
index 92369240d91d..f5a7abb621f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -7,7 +7,6 @@ | |||
7 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
10 | #include <linux/smp_lock.h> | ||
11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
12 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
13 | #include <linux/completion.h> | 12 | #include <linux/completion.h> |
diff --git a/kernel/fork.c b/kernel/fork.c
index b7d169def942..a8dd75d4992b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/unistd.h> | 16 | #include <linux/unistd.h> |
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/module.h> | 17 | #include <linux/module.h> |
19 | #include <linux/vmalloc.h> | 18 | #include <linux/vmalloc.h> |
20 | #include <linux/completion.h> | 19 | #include <linux/completion.h> |
@@ -1516,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | |||
1516 | } | 1515 | } |
1517 | 1516 | ||
1518 | /* | 1517 | /* |
1519 | * Unshare the mnt_namespace structure if it is being shared | ||
1520 | */ | ||
1521 | static int unshare_mnt_namespace(unsigned long unshare_flags, | ||
1522 | struct mnt_namespace **new_nsp, struct fs_struct *new_fs) | ||
1523 | { | ||
1524 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; | ||
1525 | |||
1526 | if ((unshare_flags & CLONE_NEWNS) && ns) { | ||
1527 | if (!capable(CAP_SYS_ADMIN)) | ||
1528 | return -EPERM; | ||
1529 | |||
1530 | *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); | ||
1531 | if (!*new_nsp) | ||
1532 | return -ENOMEM; | ||
1533 | } | ||
1534 | |||
1535 | return 0; | ||
1536 | } | ||
1537 | |||
1538 | /* | ||
1539 | * Unsharing of sighand is not supported yet | 1518 | * Unsharing of sighand is not supported yet |
1540 | */ | 1519 | */ |
1541 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | 1520 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) |
@@ -1593,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n | |||
1593 | return 0; | 1572 | return 0; |
1594 | } | 1573 | } |
1595 | 1574 | ||
1596 | #ifndef CONFIG_IPC_NS | ||
1597 | static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) | ||
1598 | { | ||
1599 | if (flags & CLONE_NEWIPC) | ||
1600 | return -EINVAL; | ||
1601 | |||
1602 | return 0; | ||
1603 | } | ||
1604 | #endif | ||
1605 | |||
1606 | /* | 1575 | /* |
1607 | * unshare allows a process to 'unshare' part of the process | 1576 | * unshare allows a process to 'unshare' part of the process |
1608 | * context which was originally shared using clone. copy_* | 1577 | * context which was originally shared using clone. copy_* |
@@ -1615,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1615 | { | 1584 | { |
1616 | int err = 0; | 1585 | int err = 0; |
1617 | struct fs_struct *fs, *new_fs = NULL; | 1586 | struct fs_struct *fs, *new_fs = NULL; |
1618 | struct mnt_namespace *ns, *new_ns = NULL; | ||
1619 | struct sighand_struct *new_sigh = NULL; | 1587 | struct sighand_struct *new_sigh = NULL; |
1620 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | 1588 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; |
1621 | struct files_struct *fd, *new_fd = NULL; | 1589 | struct files_struct *fd, *new_fd = NULL; |
1622 | struct sem_undo_list *new_ulist = NULL; | 1590 | struct sem_undo_list *new_ulist = NULL; |
1623 | struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; | 1591 | struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; |
1624 | struct uts_namespace *uts, *new_uts = NULL; | ||
1625 | struct ipc_namespace *ipc, *new_ipc = NULL; | ||
1626 | 1592 | ||
1627 | check_unshare_flags(&unshare_flags); | 1593 | check_unshare_flags(&unshare_flags); |
1628 | 1594 | ||
@@ -1637,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1637 | goto bad_unshare_out; | 1603 | goto bad_unshare_out; |
1638 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1604 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
1639 | goto bad_unshare_cleanup_thread; | 1605 | goto bad_unshare_cleanup_thread; |
1640 | if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) | ||
1641 | goto bad_unshare_cleanup_fs; | ||
1642 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | 1606 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) |
1643 | goto bad_unshare_cleanup_ns; | 1607 | goto bad_unshare_cleanup_fs; |
1644 | if ((err = unshare_vm(unshare_flags, &new_mm))) | 1608 | if ((err = unshare_vm(unshare_flags, &new_mm))) |
1645 | goto bad_unshare_cleanup_sigh; | 1609 | goto bad_unshare_cleanup_sigh; |
1646 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1610 | if ((err = unshare_fd(unshare_flags, &new_fd))) |
1647 | goto bad_unshare_cleanup_vm; | 1611 | goto bad_unshare_cleanup_vm; |
1648 | if ((err = unshare_semundo(unshare_flags, &new_ulist))) | 1612 | if ((err = unshare_semundo(unshare_flags, &new_ulist))) |
1649 | goto bad_unshare_cleanup_fd; | 1613 | goto bad_unshare_cleanup_fd; |
1650 | if ((err = unshare_utsname(unshare_flags, &new_uts))) | 1614 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, |
1615 | new_fs))) | ||
1651 | goto bad_unshare_cleanup_semundo; | 1616 | goto bad_unshare_cleanup_semundo; |
1652 | if ((err = unshare_ipcs(unshare_flags, &new_ipc))) | ||
1653 | goto bad_unshare_cleanup_uts; | ||
1654 | |||
1655 | if (new_ns || new_uts || new_ipc) { | ||
1656 | old_nsproxy = current->nsproxy; | ||
1657 | new_nsproxy = dup_namespaces(old_nsproxy); | ||
1658 | if (!new_nsproxy) { | ||
1659 | err = -ENOMEM; | ||
1660 | goto bad_unshare_cleanup_ipc; | ||
1661 | } | ||
1662 | } | ||
1663 | 1617 | ||
1664 | if (new_fs || new_ns || new_mm || new_fd || new_ulist || | 1618 | if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { |
1665 | new_uts || new_ipc) { | ||
1666 | 1619 | ||
1667 | task_lock(current); | 1620 | task_lock(current); |
1668 | 1621 | ||
1669 | if (new_nsproxy) { | 1622 | if (new_nsproxy) { |
1623 | old_nsproxy = current->nsproxy; | ||
1670 | current->nsproxy = new_nsproxy; | 1624 | current->nsproxy = new_nsproxy; |
1671 | new_nsproxy = old_nsproxy; | 1625 | new_nsproxy = old_nsproxy; |
1672 | } | 1626 | } |
@@ -1677,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1677 | new_fs = fs; | 1631 | new_fs = fs; |
1678 | } | 1632 | } |
1679 | 1633 | ||
1680 | if (new_ns) { | ||
1681 | ns = current->nsproxy->mnt_ns; | ||
1682 | current->nsproxy->mnt_ns = new_ns; | ||
1683 | new_ns = ns; | ||
1684 | } | ||
1685 | |||
1686 | if (new_mm) { | 1634 | if (new_mm) { |
1687 | mm = current->mm; | 1635 | mm = current->mm; |
1688 | active_mm = current->active_mm; | 1636 | active_mm = current->active_mm; |
@@ -1698,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
1698 | new_fd = fd; | 1646 | new_fd = fd; |
1699 | } | 1647 | } |
1700 | 1648 | ||
1701 | if (new_uts) { | ||
1702 | uts = current->nsproxy->uts_ns; | ||
1703 | current->nsproxy->uts_ns = new_uts; | ||
1704 | new_uts = uts; | ||
1705 | } | ||
1706 | |||
1707 | if (new_ipc) { | ||
1708 | ipc = current->nsproxy->ipc_ns; | ||
1709 | current->nsproxy->ipc_ns = new_ipc; | ||
1710 | new_ipc = ipc; | ||
1711 | } | ||
1712 | |||
1713 | task_unlock(current); | 1649 | task_unlock(current); |
1714 | } | 1650 | } |
1715 | 1651 | ||
1716 | if (new_nsproxy) | 1652 | if (new_nsproxy) |
1717 | put_nsproxy(new_nsproxy); | 1653 | put_nsproxy(new_nsproxy); |
1718 | 1654 | ||
1719 | bad_unshare_cleanup_ipc: | ||
1720 | if (new_ipc) | ||
1721 | put_ipc_ns(new_ipc); | ||
1722 | |||
1723 | bad_unshare_cleanup_uts: | ||
1724 | if (new_uts) | ||
1725 | put_uts_ns(new_uts); | ||
1726 | |||
1727 | bad_unshare_cleanup_semundo: | 1655 | bad_unshare_cleanup_semundo: |
1728 | bad_unshare_cleanup_fd: | 1656 | bad_unshare_cleanup_fd: |
1729 | if (new_fd) | 1657 | if (new_fd) |
@@ -1738,10 +1666,6 @@ bad_unshare_cleanup_sigh: | |||
1738 | if (atomic_dec_and_test(&new_sigh->count)) | 1666 | if (atomic_dec_and_test(&new_sigh->count)) |
1739 | kmem_cache_free(sighand_cachep, new_sigh); | 1667 | kmem_cache_free(sighand_cachep, new_sigh); |
1740 | 1668 | ||
1741 | bad_unshare_cleanup_ns: | ||
1742 | if (new_ns) | ||
1743 | put_mnt_ns(new_ns); | ||
1744 | |||
1745 | bad_unshare_cleanup_fs: | 1669 | bad_unshare_cleanup_fs: |
1746 | if (new_fs) | 1670 | if (new_fs) |
1747 | put_fs_struct(new_fs); | 1671 | put_fs_struct(new_fs); |
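Note: sys_unshare() now hands all namespace unsharing (mount, UTS, IPC) to one unshare_nsproxy_namespaces() helper and swaps a single nsproxy, which removes the per-namespace locals, the open-coded swap blocks, and two error labels. The userspace interface is unchanged; for reference, a minimal hedged caller that unshares just the UTS namespace:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef CLONE_NEWUTS
    #define CLONE_NEWUTS 0x04000000  /* value from the kernel headers; older libcs lack it */
    #endif

    int main(void)
    {
            if (unshare(CLONE_NEWUTS) != 0) {       /* needs CAP_SYS_ADMIN */
                    perror("unshare");
                    return 1;
            }
            /* Hostname changes are now private to this process tree. */
            if (sethostname("sandbox", 7) != 0)
                    perror("sethostname");
            return 0;
    }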
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a270b5e3f95..600bc9d801f2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/pagemap.h> | 48 | #include <linux/pagemap.h> |
49 | #include <linux/syscalls.h> | 49 | #include <linux/syscalls.h> |
50 | #include <linux/signal.h> | 50 | #include <linux/signal.h> |
51 | #include <linux/module.h> | ||
51 | #include <asm/futex.h> | 52 | #include <asm/futex.h> |
52 | 53 | ||
53 | #include "rtmutex_common.h" | 54 | #include "rtmutex_common.h" |
@@ -55,32 +56,6 @@ | |||
55 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 56 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
56 | 57 | ||
57 | /* | 58 | /* |
58 | * Futexes are matched on equal values of this key. | ||
59 | * The key type depends on whether it's a shared or private mapping. | ||
60 | * Don't rearrange members without looking at hash_futex(). | ||
61 | * | ||
62 | * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. | ||
63 | * We set bit 0 to indicate if it's an inode-based key. | ||
64 | */ | ||
65 | union futex_key { | ||
66 | struct { | ||
67 | unsigned long pgoff; | ||
68 | struct inode *inode; | ||
69 | int offset; | ||
70 | } shared; | ||
71 | struct { | ||
72 | unsigned long address; | ||
73 | struct mm_struct *mm; | ||
74 | int offset; | ||
75 | } private; | ||
76 | struct { | ||
77 | unsigned long word; | ||
78 | void *ptr; | ||
79 | int offset; | ||
80 | } both; | ||
81 | }; | ||
82 | |||
83 | /* | ||
84 | * Priority Inheritance state: | 59 | * Priority Inheritance state: |
85 | */ | 60 | */ |
86 | struct futex_pi_state { | 61 | struct futex_pi_state { |
@@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
175 | * | 150 | * |
176 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. | 151 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. |
177 | */ | 152 | */ |
178 | static int get_futex_key(u32 __user *uaddr, union futex_key *key) | 153 | int get_futex_key(u32 __user *uaddr, union futex_key *key) |
179 | { | 154 | { |
180 | unsigned long address = (unsigned long)uaddr; | 155 | unsigned long address = (unsigned long)uaddr; |
181 | struct mm_struct *mm = current->mm; | 156 | struct mm_struct *mm = current->mm; |
@@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) | |||
246 | } | 221 | } |
247 | return err; | 222 | return err; |
248 | } | 223 | } |
224 | EXPORT_SYMBOL_GPL(get_futex_key); | ||
249 | 225 | ||
250 | /* | 226 | /* |
251 | * Take a reference to the resource addressed by a key. | 227 | * Take a reference to the resource addressed by a key. |
@@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) | |||
254 | * NOTE: mmap_sem MUST be held between get_futex_key() and calling this | 230 | * NOTE: mmap_sem MUST be held between get_futex_key() and calling this |
255 | * function, if it is called at all. mmap_sem keeps key->shared.inode valid. | 231 | * function, if it is called at all. mmap_sem keeps key->shared.inode valid. |
256 | */ | 232 | */ |
257 | static inline void get_key_refs(union futex_key *key) | 233 | inline void get_futex_key_refs(union futex_key *key) |
258 | { | 234 | { |
259 | if (key->both.ptr != 0) { | 235 | if (key->both.ptr != 0) { |
260 | if (key->both.offset & 1) | 236 | if (key->both.offset & 1) |
@@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key) | |||
263 | atomic_inc(&key->private.mm->mm_count); | 239 | atomic_inc(&key->private.mm->mm_count); |
264 | } | 240 | } |
265 | } | 241 | } |
242 | EXPORT_SYMBOL_GPL(get_futex_key_refs); | ||
266 | 243 | ||
267 | /* | 244 | /* |
268 | * Drop a reference to the resource addressed by a key. | 245 | * Drop a reference to the resource addressed by a key. |
269 | * The hash bucket spinlock must not be held. | 246 | * The hash bucket spinlock must not be held. |
270 | */ | 247 | */ |
271 | static void drop_key_refs(union futex_key *key) | 248 | void drop_futex_key_refs(union futex_key *key) |
272 | { | 249 | { |
273 | if (key->both.ptr != 0) { | 250 | if (key->both.ptr != 0) { |
274 | if (key->both.offset & 1) | 251 | if (key->both.offset & 1) |
@@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key) | |||
277 | mmdrop(key->private.mm); | 254 | mmdrop(key->private.mm); |
278 | } | 255 | } |
279 | } | 256 | } |
257 | EXPORT_SYMBOL_GPL(drop_futex_key_refs); | ||
280 | 258 | ||
281 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) | 259 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) |
282 | { | 260 | { |
@@ -873,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, | |||
873 | this->lock_ptr = &hb2->lock; | 851 | this->lock_ptr = &hb2->lock; |
874 | } | 852 | } |
875 | this->key = key2; | 853 | this->key = key2; |
876 | get_key_refs(&key2); | 854 | get_futex_key_refs(&key2); |
877 | drop_count++; | 855 | drop_count++; |
878 | 856 | ||
879 | if (ret - nr_wake >= nr_requeue) | 857 | if (ret - nr_wake >= nr_requeue) |
@@ -886,9 +864,9 @@ out_unlock: | |||
886 | if (hb1 != hb2) | 864 | if (hb1 != hb2) |
887 | spin_unlock(&hb2->lock); | 865 | spin_unlock(&hb2->lock); |
888 | 866 | ||
889 | /* drop_key_refs() must be called outside the spinlocks. */ | 867 | /* drop_futex_key_refs() must be called outside the spinlocks. */ |
890 | while (--drop_count >= 0) | 868 | while (--drop_count >= 0) |
891 | drop_key_refs(&key1); | 869 | drop_futex_key_refs(&key1); |
892 | 870 | ||
893 | out: | 871 | out: |
894 | up_read(&current->mm->mmap_sem); | 872 | up_read(&current->mm->mmap_sem); |
@@ -906,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) | |||
906 | 884 | ||
907 | init_waitqueue_head(&q->waiters); | 885 | init_waitqueue_head(&q->waiters); |
908 | 886 | ||
909 | get_key_refs(&q->key); | 887 | get_futex_key_refs(&q->key); |
910 | hb = hash_futex(&q->key); | 888 | hb = hash_futex(&q->key); |
911 | q->lock_ptr = &hb->lock; | 889 | q->lock_ptr = &hb->lock; |
912 | 890 | ||
@@ -925,7 +903,7 @@ static inline void | |||
925 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 903 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
926 | { | 904 | { |
927 | spin_unlock(&hb->lock); | 905 | spin_unlock(&hb->lock); |
928 | drop_key_refs(&q->key); | 906 | drop_futex_key_refs(&q->key); |
929 | } | 907 | } |
930 | 908 | ||
931 | /* | 909 | /* |
@@ -980,7 +958,7 @@ static int unqueue_me(struct futex_q *q) | |||
980 | ret = 1; | 958 | ret = 1; |
981 | } | 959 | } |
982 | 960 | ||
983 | drop_key_refs(&q->key); | 961 | drop_futex_key_refs(&q->key); |
984 | return ret; | 962 | return ret; |
985 | } | 963 | } |
986 | 964 | ||
@@ -999,15 +977,18 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) | |||
999 | 977 | ||
1000 | spin_unlock(&hb->lock); | 978 | spin_unlock(&hb->lock); |
1001 | 979 | ||
1002 | drop_key_refs(&q->key); | 980 | drop_futex_key_refs(&q->key); |
1003 | } | 981 | } |
1004 | 982 | ||
1005 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | 983 | static long futex_wait_restart(struct restart_block *restart); |
984 | static int futex_wait_abstime(u32 __user *uaddr, u32 val, | ||
985 | int timed, unsigned long abs_time) | ||
1006 | { | 986 | { |
1007 | struct task_struct *curr = current; | 987 | struct task_struct *curr = current; |
1008 | DECLARE_WAITQUEUE(wait, curr); | 988 | DECLARE_WAITQUEUE(wait, curr); |
1009 | struct futex_hash_bucket *hb; | 989 | struct futex_hash_bucket *hb; |
1010 | struct futex_q q; | 990 | struct futex_q q; |
991 | unsigned long time_left = 0; | ||
1011 | u32 uval; | 992 | u32 uval; |
1012 | int ret; | 993 | int ret; |
1013 | 994 | ||
@@ -1087,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | |||
1087 | * !list_empty() is safe here without any lock. | 1068 | * !list_empty() is safe here without any lock. |
1088 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. | 1069 | * q.lock_ptr != 0 is not safe, because of ordering against wakeup. |
1089 | */ | 1070 | */ |
1090 | if (likely(!list_empty(&q.list))) | 1071 | time_left = 0; |
1091 | time = schedule_timeout(time); | 1072 | if (likely(!list_empty(&q.list))) { |
1073 | unsigned long rel_time; | ||
1074 | |||
1075 | if (timed) { | ||
1076 | unsigned long now = jiffies; | ||
1077 | if (time_after(now, abs_time)) | ||
1078 | rel_time = 0; | ||
1079 | else | ||
1080 | rel_time = abs_time - now; | ||
1081 | } else | ||
1082 | rel_time = MAX_SCHEDULE_TIMEOUT; | ||
1083 | |||
1084 | time_left = schedule_timeout(rel_time); | ||
1085 | } | ||
1092 | __set_current_state(TASK_RUNNING); | 1086 | __set_current_state(TASK_RUNNING); |
1093 | 1087 | ||
1094 | /* | 1088 | /* |
@@ -1099,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | |||
1099 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1093 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
1100 | if (!unqueue_me(&q)) | 1094 | if (!unqueue_me(&q)) |
1101 | return 0; | 1095 | return 0; |
1102 | if (time == 0) | 1096 | if (time_left == 0) |
1103 | return -ETIMEDOUT; | 1097 | return -ETIMEDOUT; |
1098 | |||
1104 | /* | 1099 | /* |
1105 | * We expect signal_pending(current), but another thread may | 1100 | * We expect signal_pending(current), but another thread may |
1106 | * have handled it for us already. | 1101 | * have handled it for us already. |
1107 | */ | 1102 | */ |
1108 | return -EINTR; | 1103 | if (time_left == MAX_SCHEDULE_TIMEOUT) |
1104 | return -ERESTARTSYS; | ||
1105 | else { | ||
1106 | struct restart_block *restart; | ||
1107 | restart = &current_thread_info()->restart_block; ||
1108 | restart->fn = futex_wait_restart; | ||
1109 | restart->arg0 = (unsigned long)uaddr; | ||
1110 | restart->arg1 = (unsigned long)val; | ||
1111 | restart->arg2 = (unsigned long)timed; | ||
1112 | restart->arg3 = abs_time; | ||
1113 | return -ERESTART_RESTARTBLOCK; | ||
1114 | } | ||
1109 | 1115 | ||
1110 | out_unlock_release_sem: | 1116 | out_unlock_release_sem: |
1111 | queue_unlock(&q, hb); | 1117 | queue_unlock(&q, hb); |
@@ -1115,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | |||
1115 | return ret; | 1121 | return ret; |
1116 | } | 1122 | } |
1117 | 1123 | ||
1124 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time) | ||
1125 | { | ||
1126 | int timed = (rel_time != MAX_SCHEDULE_TIMEOUT); | ||
1127 | return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time); | ||
1128 | } | ||
1129 | |||
1130 | static long futex_wait_restart(struct restart_block *restart) | ||
1131 | { | ||
1132 | u32 __user *uaddr = (u32 __user *)restart->arg0; | ||
1133 | u32 val = (u32)restart->arg1; | ||
1134 | int timed = (int)restart->arg2; | ||
1135 | unsigned long abs_time = restart->arg3; | ||
1136 | |||
1137 | restart->fn = do_no_restart_syscall; | ||
1138 | return (long)futex_wait_abstime(uaddr, val, timed, abs_time); | ||
1139 | } | ||
1140 | |||
1141 | |||
1118 | /* | 1142 | /* |
1119 | * Userspace tried a 0 -> TID atomic transition of the futex value | 1143 | * Userspace tried a 0 -> TID atomic transition of the futex value |
1120 | * and failed. The kernel side here does the whole locking operation: | 1144 | * and failed. The kernel side here does the whole locking operation: |
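Note: futex_wait() is split into futex_wait_abstime(), which works on an absolute jiffies deadline. A timed wait interrupted by a signal now stores uaddr/val/timed/abs_time in the thread's restart_block and returns -ERESTART_RESTARTBLOCK, so the restarted call waits only for the time actually left (untimed waits keep returning -ERESTARTSYS). The key helpers get_futex_key(), get_futex_key_refs() and drop_futex_key_refs() are also un-static'd and exported. A hedged, generic sketch of the restart_block pattern this relies on (my_wait and its deadline handling are invented for illustration):

    #include <linux/errno.h>
    #include <linux/jiffies.h>
    #include <linux/sched.h>
    #include <linux/thread_info.h>

    static long my_wait_restart(struct restart_block *restart);

    static long my_wait(unsigned long abs_deadline)
    {
            long left = schedule_timeout_interruptible(
                            time_after(abs_deadline, jiffies) ?
                            abs_deadline - jiffies : 0);

            if (left == 0)
                    return -ETIMEDOUT;
            if (signal_pending(current)) {
                    struct restart_block *restart =
                            &current_thread_info()->restart_block;

                    restart->fn = my_wait_restart;
                    restart->arg0 = abs_deadline;   /* absolute, so nothing is re-added */
                    return -ERESTART_RESTARTBLOCK;
            }
            return 0;                               /* woken up normally */
    }

    static long my_wait_restart(struct restart_block *restart)
    {
            restart->fn = do_no_restart_syscall;
            return my_wait(restart->arg0);
    }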
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1b3033105b40..c9f4f044a8a8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) | |||
669 | 669 | ||
670 | return orun; | 670 | return orun; |
671 | } | 671 | } |
672 | EXPORT_SYMBOL_GPL(hrtimer_forward); | ||
672 | 673 | ||
673 | /* | 674 | /* |
674 | * enqueue_hrtimer - internal function to (re)start a timer | 675 | * enqueue_hrtimer - internal function to (re)start a timer |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index aff1f0fabb0d..32e1ab1477d1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) | |||
48 | * | 48 | * |
49 | * Controller mappings for all interrupt sources: | 49 | * Controller mappings for all interrupt sources: |
50 | */ | 50 | */ |
51 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { | 51 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { |
52 | [0 ... NR_IRQS-1] = { | 52 | [0 ... NR_IRQS-1] = { |
53 | .status = IRQ_DISABLED, | 53 | .status = IRQ_DISABLED, |
54 | .chip = &no_irq_chip, | 54 | .chip = &no_irq_chip, |
@@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq) | |||
180 | if (desc->chip->ack) | 180 | if (desc->chip->ack) |
181 | desc->chip->ack(irq); | 181 | desc->chip->ack(irq); |
182 | action_ret = handle_IRQ_event(irq, desc->action); | 182 | action_ret = handle_IRQ_event(irq, desc->action); |
183 | if (!noirqdebug) | ||
184 | note_interrupt(irq, desc, action_ret); | ||
183 | desc->chip->end(irq); | 185 | desc->chip->end(irq); |
184 | return 1; | 186 | return 1; |
185 | } | 187 | } |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5597c157442a..203a518b6f14 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
317 | } | 317 | } |
318 | 318 | ||
319 | *p = new; | 319 | *p = new; |
320 | #if defined(CONFIG_IRQ_PER_CPU) | 320 | |
321 | if (new->flags & IRQF_PERCPU) | ||
322 | desc->status |= IRQ_PER_CPU; | ||
323 | #endif | ||
324 | /* Exclude IRQ from balancing */ | 321 | /* Exclude IRQ from balancing */ |
325 | if (new->flags & IRQF_NOBALANCING) | 322 | if (new->flags & IRQF_NOBALANCING) |
326 | desc->status |= IRQ_NO_BALANCING; | 323 | desc->status |= IRQ_NO_BALANCING; |
@@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new) | |||
328 | if (!shared) { | 325 | if (!shared) { |
329 | irq_chip_set_defaults(desc->chip); | 326 | irq_chip_set_defaults(desc->chip); |
330 | 327 | ||
328 | #if defined(CONFIG_IRQ_PER_CPU) | ||
329 | if (new->flags & IRQF_PERCPU) | ||
330 | desc->status |= IRQ_PER_CPU; | ||
331 | #endif | ||
332 | |||
331 | /* Setup the type (level, edge polarity) if configured: */ | 333 | /* Setup the type (level, edge polarity) if configured: */ |
332 | if (new->flags & IRQF_TRIGGER_MASK) { | 334 | if (new->flags & IRQF_TRIGGER_MASK) { |
333 | if (desc->chip && desc->chip->set_type) | 335 | if (desc->chip && desc->chip->set_type) |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2db91eb54ad8..ddde0ef9ccdc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) | |||
66 | { | 66 | { |
67 | struct irq_desc *desc = irq_desc + irq; | 67 | struct irq_desc *desc = irq_desc + irq; |
68 | struct irqaction *action; | 68 | struct irqaction *action; |
69 | unsigned long flags; | ||
70 | int ret = 1; | ||
69 | 71 | ||
70 | for (action = desc->action ; action; action = action->next) | 72 | spin_lock_irqsave(&desc->lock, flags); |
73 | for (action = desc->action ; action; action = action->next) { | ||
71 | if ((action != new_action) && action->name && | 74 | if ((action != new_action) && action->name && |
72 | !strcmp(new_action->name, action->name)) | 75 | !strcmp(new_action->name, action->name)) { |
73 | return 0; | 76 | ret = 0; |
74 | return 1; | 77 | break; |
78 | } | ||
79 | } | ||
80 | spin_unlock_irqrestore(&desc->lock, flags); | ||
81 | return ret; | ||
75 | } | 82 | } |
76 | 83 | ||
77 | void register_handler_proc(unsigned int irq, struct irqaction *action) | 84 | void register_handler_proc(unsigned int irq, struct irqaction *action) |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 9d8c79b48823..b0d81aae472f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
146 | 146 | ||
147 | if (unlikely(irqfixup)) { | 147 | if (unlikely(irqfixup)) { |
148 | /* Don't punish working computers */ | 148 | /* Don't punish working computers */ |
149 | if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { | 149 | if ((irqfixup == 2 && ((irq == 0) || |
150 | (desc->action->flags & IRQF_IRQPOLL))) || | ||
151 | action_ret == IRQ_NONE) { | ||
150 | int ok = misrouted_irq(irq); | 152 | int ok = misrouted_irq(irq); |
151 | if (action_ret == IRQ_NONE) | 153 | if (action_ret == IRQ_NONE) |
152 | desc->irqs_unhandled -= ok; | 154 | desc->irqs_unhandled -= ok; |
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 307c6a632ef6..3205e8e114fa 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -7,7 +7,6 @@ | |||
7 | /* These are all the functions necessary to implement itimers */ | 7 | /* These are all the functions necessary to implement itimers */ |
8 | 8 | ||
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/smp_lock.h> | ||
11 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
12 | #include <linux/syscalls.h> | 11 | #include <linux/syscalls.h> |
13 | #include <linux/time.h> | 12 | #include <linux/time.h> |
@@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) | |||
139 | } | 138 | } |
140 | 139 | ||
141 | /* | 140 | /* |
142 | * We do not care about correctness. We just sanitize the values so | ||
143 | * the ktime_t operations which expect normalized values do not | ||
144 | * break. This converts negative values to long timeouts similar to | ||
145 | * the code in kernel versions < 2.6.16 | ||
146 | * | ||
147 | * Print a limited number of warning messages when an invalid timeval | ||
148 | * is detected. | ||
149 | */ | ||
150 | static void fixup_timeval(struct timeval *tv, int interval) | ||
151 | { | ||
152 | static int warnlimit = 10; | ||
153 | unsigned long tmp; | ||
154 | |||
155 | if (warnlimit > 0) { | ||
156 | warnlimit--; | ||
157 | printk(KERN_WARNING | ||
158 | "setitimer: %s (pid = %d) provided " | ||
159 | "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", | ||
160 | current->comm, current->pid, | ||
161 | interval ? "it_interval" : "it_value", | ||
162 | tv->tv_sec, (long) tv->tv_usec); | ||
163 | } | ||
164 | |||
165 | tmp = tv->tv_usec; | ||
166 | if (tmp >= USEC_PER_SEC) { | ||
167 | tv->tv_usec = tmp % USEC_PER_SEC; | ||
168 | tv->tv_sec += tmp / USEC_PER_SEC; | ||
169 | } | ||
170 | |||
171 | tmp = tv->tv_sec; | ||
172 | if (tmp > LONG_MAX) | ||
173 | tv->tv_sec = LONG_MAX; | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * Returns true if the timeval is in canonical form | 141 | * Returns true if the timeval is in canonical form |
178 | */ | 142 | */ |
179 | #define timeval_valid(t) \ | 143 | #define timeval_valid(t) \ |
180 | (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) | 144 | (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) |
181 | 145 | ||
182 | /* | ||
183 | * Check for invalid timevals, sanitize them and print a limited | ||
184 | * number of warnings. | ||
185 | */ | ||
186 | static void check_itimerval(struct itimerval *value) { | ||
187 | |||
188 | if (unlikely(!timeval_valid(&value->it_value))) | ||
189 | fixup_timeval(&value->it_value, 0); | ||
190 | |||
191 | if (unlikely(!timeval_valid(&value->it_interval))) | ||
192 | fixup_timeval(&value->it_interval, 1); | ||
193 | } | ||
194 | |||
195 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | 146 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) |
196 | { | 147 | { |
197 | struct task_struct *tsk = current; | 148 | struct task_struct *tsk = current; |
@@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | |||
201 | 152 | ||
202 | /* | 153 | /* |
203 | * Validate the timevals in value. | 154 | * Validate the timevals in value. |
204 | * | ||
205 | * Note: Although the spec requires that invalid values shall | ||
206 | * return -EINVAL, we just fixup the value and print a limited | ||
207 | * number of warnings in order not to break users of this | ||
208 | * historical misfeature. | ||
209 | * | ||
210 | * Scheduled for replacement in March 2007 | ||
211 | */ | 155 | */ |
212 | check_itimerval(value); | 156 | if (!timeval_valid(&value->it_value) || |
157 | !timeval_valid(&value->it_interval)) | ||
158 | return -EINVAL; | ||
213 | 159 | ||
214 | switch (which) { | 160 | switch (which) { |
215 | case ITIMER_REAL: | 161 | case ITIMER_REAL: |
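Note: do_setitimer() now returns -EINVAL for non-canonical timevals instead of silently sanitizing them; fixup_timeval() and check_itimerval() are gone (the removed comment had scheduled this historical misfeature for replacement in March 2007). The userspace-visible difference, sketched:

    #include <stdio.h>
    #include <sys/time.h>

    int main(void)
    {
            struct itimerval it = {
                    .it_value    = { .tv_sec = 0, .tv_usec = 1500000 }, /* >= USEC_PER_SEC */
                    .it_interval = { .tv_sec = 0, .tv_usec = 0 },
            };

            /* Previously the kernel fixed this up (with a warning); now it fails. */
            if (setitimer(ITIMER_REAL, &it, NULL) != 0)
                    perror("setitimer");            /* EINVAL */
            return 0;
    }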
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5a0de8409739..f1bda23140b2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr, | |||
214 | symbol_end = (unsigned long)_etext; | 214 | symbol_end = (unsigned long)_etext; |
215 | } | 215 | } |
216 | 216 | ||
217 | *symbolsize = symbol_end - symbol_start; | 217 | if (symbolsize) |
218 | *offset = addr - symbol_start; | 218 | *symbolsize = symbol_end - symbol_start; |
219 | if (offset) | ||
220 | *offset = addr - symbol_start; | ||
219 | 221 | ||
220 | return low; | 222 | return low; |
221 | } | 223 | } |
@@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr, | |||
267 | return NULL; | 269 | return NULL; |
268 | } | 270 | } |
269 | 271 | ||
272 | int lookup_symbol_name(unsigned long addr, char *symname) | ||
273 | { | ||
274 | symname[0] = '\0'; | ||
275 | symname[KSYM_NAME_LEN] = '\0'; | ||
276 | |||
277 | if (is_ksym_addr(addr)) { | ||
278 | unsigned long pos; | ||
279 | |||
280 | pos = get_symbol_pos(addr, NULL, NULL); | ||
281 | /* Grab name */ | ||
282 | kallsyms_expand_symbol(get_symbol_offset(pos), symname); | ||
283 | return 0; | ||
284 | } | ||
285 | /* see if it's in a module */ | ||
286 | return lookup_module_symbol_name(addr, symname); | ||
287 | } | ||
288 | |||
289 | int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | ||
290 | unsigned long *offset, char *modname, char *name) | ||
291 | { | ||
292 | name[0] = '\0'; | ||
293 | name[KSYM_NAME_LEN] = '\0'; | ||
294 | |||
295 | if (is_ksym_addr(addr)) { | ||
296 | unsigned long pos; | ||
297 | |||
298 | pos = get_symbol_pos(addr, size, offset); | ||
299 | /* Grab name */ | ||
300 | kallsyms_expand_symbol(get_symbol_offset(pos), name); | ||
301 | modname[0] = '\0'; | ||
302 | return 0; | ||
303 | } | ||
304 | /* see if it's in a module */ | ||
305 | return lookup_module_symbol_attrs(addr, size, offset, modname, name); | ||
306 | } | ||
307 | |||
270 | /* Look up a kernel symbol and return it in a text buffer. */ | 308 | /* Look up a kernel symbol and return it in a text buffer. */ |
271 | int sprint_symbol(char *buffer, unsigned long address) | 309 | int sprint_symbol(char *buffer, unsigned long address) |
272 | { | 310 | { |
@@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address) | |||
301 | struct kallsym_iter | 339 | struct kallsym_iter |
302 | { | 340 | { |
303 | loff_t pos; | 341 | loff_t pos; |
304 | struct module *owner; | ||
305 | unsigned long value; | 342 | unsigned long value; |
306 | unsigned int nameoff; /* If iterating in core kernel symbols */ | 343 | unsigned int nameoff; /* If iterating in core kernel symbols */ |
307 | char type; | 344 | char type; |
308 | char name[KSYM_NAME_LEN+1]; | 345 | char name[KSYM_NAME_LEN+1]; |
346 | char module_name[MODULE_NAME_LEN + 1]; | ||
347 | int exported; | ||
309 | }; | 348 | }; |
310 | 349 | ||
311 | static int get_ksymbol_mod(struct kallsym_iter *iter) | 350 | static int get_ksymbol_mod(struct kallsym_iter *iter) |
312 | { | 351 | { |
313 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, | 352 | if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, |
314 | &iter->value, &iter->type, | 353 | &iter->type, iter->name, iter->module_name, |
315 | iter->name, sizeof(iter->name)); | 354 | &iter->exported) < 0) |
316 | if (iter->owner == NULL) | ||
317 | return 0; | 355 | return 0; |
318 | |||
319 | /* Label it "global" if it is exported, "local" if not exported. */ | ||
320 | iter->type = is_exported(iter->name, iter->owner) | ||
321 | ? toupper(iter->type) : tolower(iter->type); | ||
322 | |||
323 | return 1; | 356 | return 1; |
324 | } | 357 | } |
325 | 358 | ||
@@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) | |||
328 | { | 361 | { |
329 | unsigned off = iter->nameoff; | 362 | unsigned off = iter->nameoff; |
330 | 363 | ||
331 | iter->owner = NULL; | 364 | iter->module_name[0] = '\0'; |
332 | iter->value = kallsyms_addresses[iter->pos]; | 365 | iter->value = kallsyms_addresses[iter->pos]; |
333 | 366 | ||
334 | iter->type = kallsyms_get_symbol_type(off); | 367 | iter->type = kallsyms_get_symbol_type(off); |
@@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p) | |||
392 | if (!iter->name[0]) | 425 | if (!iter->name[0]) |
393 | return 0; | 426 | return 0; |
394 | 427 | ||
395 | if (iter->owner) | 428 | if (iter->module_name[0]) { |
429 | char type; | ||
430 | |||
431 | /* Label it "global" if it is exported, | ||
432 | * "local" if not exported. */ | ||
433 | type = iter->exported ? toupper(iter->type) : | ||
434 | tolower(iter->type); | ||
396 | seq_printf(m, "%0*lx %c %s\t[%s]\n", | 435 | seq_printf(m, "%0*lx %c %s\t[%s]\n", |
397 | (int)(2*sizeof(void*)), | 436 | (int)(2*sizeof(void*)), |
398 | iter->value, iter->type, iter->name, | 437 | iter->value, type, iter->name, iter->module_name); |
399 | module_name(iter->owner)); | 438 | } else |
400 | else | ||
401 | seq_printf(m, "%0*lx %c %s\n", | 439 | seq_printf(m, "%0*lx %c %s\n", |
402 | (int)(2*sizeof(void*)), | 440 | (int)(2*sizeof(void*)), |
403 | iter->value, iter->type, iter->name); | 441 | iter->value, iter->type, iter->name); |
@@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) | |||
432 | return ret; | 470 | return ret; |
433 | } | 471 | } |
434 | 472 | ||
435 | static int kallsyms_release(struct inode *inode, struct file *file) | ||
436 | { | ||
437 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
438 | kfree(m->private); | ||
439 | return seq_release(inode, file); | ||
440 | } | ||
441 | |||
442 | static const struct file_operations kallsyms_operations = { | 473 | static const struct file_operations kallsyms_operations = { |
443 | .open = kallsyms_open, | 474 | .open = kallsyms_open, |
444 | .read = seq_read, | 475 | .read = seq_read, |
445 | .llseek = seq_lseek, | 476 | .llseek = seq_lseek, |
446 | .release = kallsyms_release, | 477 | .release = seq_release_private, |
447 | }; | 478 | }; |
448 | 479 | ||
449 | static int __init kallsyms_init(void) | 480 | static int __init kallsyms_init(void) |
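Note: kallsyms gains two narrower helpers, lookup_symbol_name() and lookup_symbol_attrs(), built on get_symbol_pos(), which now tolerates NULL size/offset pointers; the /proc/kallsyms iterator also records the module name and export status instead of holding a struct module pointer. A hedged sketch of a lookup_symbol_name() caller (the surrounding printk context is invented):

    #include <linux/kallsyms.h>
    #include <linux/kernel.h>

    static void print_symbol_name(unsigned long addr)
    {
            char name[KSYM_NAME_LEN + 1];   /* the helper NUL-terminates at KSYM_NAME_LEN */

            if (lookup_symbol_name(addr, name) == 0)
                    printk(KERN_DEBUG "%#lx is %s\n", addr, name);
            else
                    printk(KERN_DEBUG "no name for %#lx\n", addr);
    }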
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a59c8a01ae0..25db14b89e82 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) | |||
1118 | memset(&prstatus, 0, sizeof(prstatus)); | 1118 | memset(&prstatus, 0, sizeof(prstatus)); |
1119 | prstatus.pr_pid = current->pid; | 1119 | prstatus.pr_pid = current->pid; |
1120 | elf_core_copy_regs(&prstatus.pr_reg, regs); | 1120 | elf_core_copy_regs(&prstatus.pr_reg, regs); |
1121 | buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, | 1121 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, |
1122 | sizeof(prstatus)); | 1122 | &prstatus, sizeof(prstatus)); |
1123 | final_note(buf); | 1123 | final_note(buf); |
1124 | } | 1124 | } |
1125 | 1125 | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 796276141e51..49cc4b9c1a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -23,7 +23,6 @@ | |||
23 | #include <linux/syscalls.h> | 23 | #include <linux/syscalls.h> |
24 | #include <linux/unistd.h> | 24 | #include <linux/unistd.h> |
25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
26 | #include <linux/smp_lock.h> | ||
27 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
28 | #include <linux/mnt_namespace.h> | 27 | #include <linux/mnt_namespace.h> |
29 | #include <linux/completion.h> | 28 | #include <linux/completion.h> |
@@ -166,6 +165,12 @@ static int ____call_usermodehelper(void *data) | |||
166 | /* We can run anywhere, unlike our parent keventd(). */ | 165 | /* We can run anywhere, unlike our parent keventd(). */ |
167 | set_cpus_allowed(current, CPU_MASK_ALL); | 166 | set_cpus_allowed(current, CPU_MASK_ALL); |
168 | 167 | ||
168 | /* | ||
169 | * Our parent is keventd, which runs with elevated scheduling priority. | ||
170 | * Avoid propagating that into the userspace child. | ||
171 | */ | ||
172 | set_user_nice(current, 0); | ||
173 | |||
169 | retval = -EPERM; | 174 | retval = -EPERM; |
170 | if (current->fs->root) | 175 | if (current->fs->root) |
171 | retval = kernel_execve(sub_info->path, | 176 | retval = kernel_execve(sub_info->path, |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d25a9ada3f8e..9e47d8c493f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,16 +35,19 @@ | |||
35 | #include <linux/hash.h> | 35 | #include <linux/hash.h> |
36 | #include <linux/init.h> | 36 | #include <linux/init.h> |
37 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
38 | #include <linux/stddef.h> | ||
38 | #include <linux/module.h> | 39 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | 40 | #include <linux/moduleloader.h> |
40 | #include <linux/kallsyms.h> | 41 | #include <linux/kallsyms.h> |
41 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
42 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
43 | #include <linux/debugfs.h> | 44 | #include <linux/debugfs.h> |
45 | #include <linux/kdebug.h> | ||
46 | |||
44 | #include <asm-generic/sections.h> | 47 | #include <asm-generic/sections.h> |
45 | #include <asm/cacheflush.h> | 48 | #include <asm/cacheflush.h> |
46 | #include <asm/errno.h> | 49 | #include <asm/errno.h> |
47 | #include <asm/kdebug.h> | 50 | #include <asm/uaccess.h> |
48 | 51 | ||
49 | #define KPROBE_HASH_BITS 6 | 52 | #define KPROBE_HASH_BITS 6 |
50 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) | 53 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) |
@@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | |||
63 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 66 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
64 | static atomic_t kprobe_count; | 67 | static atomic_t kprobe_count; |
65 | 68 | ||
69 | /* NOTE: change this value only with kprobe_mutex held */ | ||
70 | static bool kprobe_enabled; | ||
71 | |||
66 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 72 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
67 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 73 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
68 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 74 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
@@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
132 | struct kprobe_insn_page *kip; | 138 | struct kprobe_insn_page *kip; |
133 | struct hlist_node *pos; | 139 | struct hlist_node *pos; |
134 | 140 | ||
135 | retry: | 141 | retry: |
136 | hlist_for_each(pos, &kprobe_insn_pages) { | 142 | hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { |
137 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
138 | if (kip->nused < INSNS_PER_PAGE) { | 143 | if (kip->nused < INSNS_PER_PAGE) { |
139 | int i; | 144 | int i; |
140 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 145 | for (i = 0; i < INSNS_PER_PAGE; i++) { |
@@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
155 | } | 160 | } |
156 | /* All out of space. Need to allocate a new page. Use slot 0. */ | 161 | /* All out of space. Need to allocate a new page. Use slot 0. */ |
157 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 162 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); |
158 | if (!kip) { | 163 | if (!kip) |
159 | return NULL; | 164 | return NULL; |
160 | } | ||
161 | 165 | ||
162 | /* | 166 | /* |
163 | * Use module_alloc so this page is within +/- 2GB of where the | 167 | * Use module_alloc so this page is within +/- 2GB of where the |
@@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void) | |||
213 | if (check_safety() != 0) | 217 | if (check_safety() != 0) |
214 | return -EAGAIN; | 218 | return -EAGAIN; |
215 | 219 | ||
216 | hlist_for_each_safe(pos, next, &kprobe_insn_pages) { | 220 | hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { |
217 | int i; | 221 | int i; |
218 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
219 | if (kip->ngarbage == 0) | 222 | if (kip->ngarbage == 0) |
220 | continue; | 223 | continue; |
221 | kip->ngarbage = 0; /* we will collect all garbages */ | 224 | kip->ngarbage = 0; /* we will collect all garbages */ |
@@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | |||
234 | struct kprobe_insn_page *kip; | 237 | struct kprobe_insn_page *kip; |
235 | struct hlist_node *pos; | 238 | struct hlist_node *pos; |
236 | 239 | ||
237 | hlist_for_each(pos, &kprobe_insn_pages) { | 240 | hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { |
238 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
239 | if (kip->insns <= slot && | 241 | if (kip->insns <= slot && |
240 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 242 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { |
241 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | 243 | int i = (slot - kip->insns) / MAX_INSN_SIZE; |
@@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | |||
248 | break; | 250 | break; |
249 | } | 251 | } |
250 | } | 252 | } |
251 | if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { | 253 | |
254 | if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) | ||
252 | collect_garbage_slots(); | 255 | collect_garbage_slots(); |
253 | } | ||
254 | } | 256 | } |
255 | #endif | 257 | #endif |
256 | 258 | ||
@@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
316 | reset_kprobe_instance(); | 318 | reset_kprobe_instance(); |
317 | } | 319 | } |
318 | } | 320 | } |
319 | return; | ||
320 | } | 321 | } |
321 | 322 | ||
322 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 323 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
@@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | |||
362 | } | 363 | } |
363 | 364 | ||
364 | /* Called with kretprobe_lock held */ | 365 | /* Called with kretprobe_lock held */ |
365 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) | ||
366 | { | ||
367 | struct hlist_node *node; | ||
368 | struct kretprobe_instance *ri; | ||
369 | hlist_for_each_entry(ri, node, &rp->free_instances, uflist) | ||
370 | return ri; | ||
371 | return NULL; | ||
372 | } | ||
373 | |||
374 | /* Called with kretprobe_lock held */ | ||
375 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe | ||
376 | *rp) | ||
377 | { | ||
378 | struct hlist_node *node; | ||
379 | struct kretprobe_instance *ri; | ||
380 | hlist_for_each_entry(ri, node, &rp->used_instances, uflist) | ||
381 | return ri; | ||
382 | return NULL; | ||
383 | } | ||
384 | |||
385 | /* Called with kretprobe_lock held */ | ||
386 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) | ||
387 | { | ||
388 | /* | ||
389 | * Remove rp inst off the free list - | ||
390 | * Add it back when probed function returns | ||
391 | */ | ||
392 | hlist_del(&ri->uflist); | ||
393 | |||
394 | /* Add rp inst onto table */ | ||
395 | INIT_HLIST_NODE(&ri->hlist); | ||
396 | hlist_add_head(&ri->hlist, | ||
397 | &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); | ||
398 | |||
399 | /* Also add this rp inst to the used list. */ | ||
400 | INIT_HLIST_NODE(&ri->uflist); | ||
401 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | ||
402 | } | ||
403 | |||
404 | /* Called with kretprobe_lock held */ | ||
405 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | 366 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, |
406 | struct hlist_head *head) | 367 | struct hlist_head *head) |
407 | { | 368 | { |
@@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
454 | static inline void free_rp_inst(struct kretprobe *rp) | 415 | static inline void free_rp_inst(struct kretprobe *rp) |
455 | { | 416 | { |
456 | struct kretprobe_instance *ri; | 417 | struct kretprobe_instance *ri; |
457 | while ((ri = get_free_rp_inst(rp)) != NULL) { | 418 | struct hlist_node *pos, *next; |
419 | |||
420 | hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { | ||
458 | hlist_del(&ri->uflist); | 421 | hlist_del(&ri->uflist); |
459 | kfree(ri); | 422 | kfree(ri); |
460 | } | 423 | } |
@@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
535 | 498 | ||
536 | static int __kprobes in_kprobes_functions(unsigned long addr) | 499 | static int __kprobes in_kprobes_functions(unsigned long addr) |
537 | { | 500 | { |
538 | if (addr >= (unsigned long)__kprobes_text_start | 501 | if (addr >= (unsigned long)__kprobes_text_start && |
539 | && addr < (unsigned long)__kprobes_text_end) | 502 | addr < (unsigned long)__kprobes_text_end) |
540 | return -EINVAL; | 503 | return -EINVAL; |
541 | return 0; | 504 | return 0; |
542 | } | 505 | } |
@@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
563 | return -EINVAL; | 526 | return -EINVAL; |
564 | p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); | 527 | p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); |
565 | 528 | ||
566 | if ((!kernel_text_address((unsigned long) p->addr)) || | 529 | if (!kernel_text_address((unsigned long) p->addr) || |
567 | in_kprobes_functions((unsigned long) p->addr)) | 530 | in_kprobes_functions((unsigned long) p->addr)) |
568 | return -EINVAL; | 531 | return -EINVAL; |
569 | 532 | ||
570 | p->mod_refcounted = 0; | 533 | p->mod_refcounted = 0; |
571 | /* Check are we probing a module */ | 534 | |
572 | if ((probed_mod = module_text_address((unsigned long) p->addr))) { | 535 | /* |
536 | * Check if are we probing a module. | ||
537 | */ | ||
538 | probed_mod = module_text_address((unsigned long) p->addr); | ||
539 | if (probed_mod) { | ||
573 | struct module *calling_mod = module_text_address(called_from); | 540 | struct module *calling_mod = module_text_address(called_from); |
574 | /* We must allow modules to probe themself and | 541 | /* |
575 | * in this case avoid incrementing the module refcount, | 542 | * We must allow modules to probe themself and in this case |
576 | * so as to allow unloading of self probing modules. | 543 | * avoid incrementing the module refcount, so as to allow |
544 | * unloading of self probing modules. | ||
577 | */ | 545 | */ |
578 | if (calling_mod && (calling_mod != probed_mod)) { | 546 | if (calling_mod && calling_mod != probed_mod) { |
579 | if (unlikely(!try_module_get(probed_mod))) | 547 | if (unlikely(!try_module_get(probed_mod))) |
580 | return -EINVAL; | 548 | return -EINVAL; |
581 | p->mod_refcounted = 1; | 549 | p->mod_refcounted = 1; |
@@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
593 | goto out; | 561 | goto out; |
594 | } | 562 | } |
595 | 563 | ||
596 | if ((ret = arch_prepare_kprobe(p)) != 0) | 564 | ret = arch_prepare_kprobe(p); |
565 | if (ret) | ||
597 | goto out; | 566 | goto out; |
598 | 567 | ||
599 | INIT_HLIST_NODE(&p->hlist); | 568 | INIT_HLIST_NODE(&p->hlist); |
600 | hlist_add_head_rcu(&p->hlist, | 569 | hlist_add_head_rcu(&p->hlist, |
601 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 570 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
602 | 571 | ||
603 | if (atomic_add_return(1, &kprobe_count) == \ | 572 | if (kprobe_enabled) { |
573 | if (atomic_add_return(1, &kprobe_count) == \ | ||
604 | (ARCH_INACTIVE_KPROBE_COUNT + 1)) | 574 | (ARCH_INACTIVE_KPROBE_COUNT + 1)) |
605 | register_page_fault_notifier(&kprobe_page_fault_nb); | 575 | register_page_fault_notifier(&kprobe_page_fault_nb); |
606 | |||
607 | arch_arm_kprobe(p); | ||
608 | 576 | ||
577 | arch_arm_kprobe(p); | ||
578 | } | ||
609 | out: | 579 | out: |
610 | mutex_unlock(&kprobe_mutex); | 580 | mutex_unlock(&kprobe_mutex); |
611 | 581 | ||
@@ -616,8 +586,7 @@ out: | |||
616 | 586 | ||
617 | int __kprobes register_kprobe(struct kprobe *p) | 587 | int __kprobes register_kprobe(struct kprobe *p) |
618 | { | 588 | { |
619 | return __register_kprobe(p, | 589 | return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); |
620 | (unsigned long)__builtin_return_address(0)); | ||
621 | } | 590 | } |
622 | 591 | ||
623 | void __kprobes unregister_kprobe(struct kprobe *p) | 592 | void __kprobes unregister_kprobe(struct kprobe *p) |
@@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p) | |||
641 | return; | 610 | return; |
642 | } | 611 | } |
643 | valid_p: | 612 | valid_p: |
644 | if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && | 613 | if (old_p == p || |
645 | (p->list.next == &old_p->list) && | 614 | (old_p->pre_handler == aggr_pre_handler && |
646 | (p->list.prev == &old_p->list))) { | 615 | p->list.next == &old_p->list && p->list.prev == &old_p->list)) { |
647 | /* Only probe on the hash list */ | 616 | /* |
648 | arch_disarm_kprobe(p); | 617 | * Only probe on the hash list. Disarm only if kprobes are |
618 | * enabled - otherwise, the breakpoint would already have | ||
619 | * been removed. We save on flushing icache. | ||
620 | */ | ||
621 | if (kprobe_enabled) | ||
622 | arch_disarm_kprobe(p); | ||
649 | hlist_del_rcu(&old_p->hlist); | 623 | hlist_del_rcu(&old_p->hlist); |
650 | cleanup_p = 1; | 624 | cleanup_p = 1; |
651 | } else { | 625 | } else { |
@@ -656,9 +630,11 @@ valid_p: | |||
656 | mutex_unlock(&kprobe_mutex); | 630 | mutex_unlock(&kprobe_mutex); |
657 | 631 | ||
658 | synchronize_sched(); | 632 | synchronize_sched(); |
659 | if (p->mod_refcounted && | 633 | if (p->mod_refcounted) { |
660 | (mod = module_text_address((unsigned long)p->addr))) | 634 | mod = module_text_address((unsigned long)p->addr); |
661 | module_put(mod); | 635 | if (mod) |
636 | module_put(mod); | ||
637 | } | ||
662 | 638 | ||
663 | if (cleanup_p) { | 639 | if (cleanup_p) { |
664 | if (p != old_p) { | 640 | if (p != old_p) { |
@@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
729 | 705 | ||
730 | /*TODO: consider to only swap the RA after the last pre_handler fired */ | 706 | /*TODO: consider to only swap the RA after the last pre_handler fired */ |
731 | spin_lock_irqsave(&kretprobe_lock, flags); | 707 | spin_lock_irqsave(&kretprobe_lock, flags); |
732 | arch_prepare_kretprobe(rp, regs); | 708 | if (!hlist_empty(&rp->free_instances)) { |
709 | struct kretprobe_instance *ri; | ||
710 | |||
711 | ri = hlist_entry(rp->free_instances.first, | ||
712 | struct kretprobe_instance, uflist); | ||
713 | ri->rp = rp; | ||
714 | ri->task = current; | ||
715 | arch_prepare_kretprobe(ri, regs); | ||
716 | |||
717 | /* XXX(hch): why is there no hlist_move_head? */ | ||
718 | hlist_del(&ri->uflist); | ||
719 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | ||
720 | hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); | ||
721 | } else | ||
722 | rp->nmissed++; | ||
733 | spin_unlock_irqrestore(&kretprobe_lock, flags); | 723 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
734 | return 0; | 724 | return 0; |
735 | } | 725 | } |
@@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) | |||
792 | { | 782 | { |
793 | unsigned long flags; | 783 | unsigned long flags; |
794 | struct kretprobe_instance *ri; | 784 | struct kretprobe_instance *ri; |
785 | struct hlist_node *pos, *next; | ||
795 | 786 | ||
796 | unregister_kprobe(&rp->kp); | 787 | unregister_kprobe(&rp->kp); |
788 | |||
797 | /* No race here */ | 789 | /* No race here */ |
798 | spin_lock_irqsave(&kretprobe_lock, flags); | 790 | spin_lock_irqsave(&kretprobe_lock, flags); |
799 | while ((ri = get_used_rp_inst(rp)) != NULL) { | 791 | hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { |
800 | ri->rp = NULL; | 792 | ri->rp = NULL; |
801 | hlist_del(&ri->uflist); | 793 | hlist_del(&ri->uflist); |
802 | } | 794 | } |
@@ -816,6 +808,9 @@ static int __init init_kprobes(void) | |||
816 | } | 808 | } |
817 | atomic_set(&kprobe_count, 0); | 809 | atomic_set(&kprobe_count, 0); |
818 | 810 | ||
811 | /* By default, kprobes are enabled */ | ||
812 | kprobe_enabled = true; | ||
813 | |||
819 | err = arch_init_kprobes(); | 814 | err = arch_init_kprobes(); |
820 | if (!err) | 815 | if (!err) |
821 | err = register_die_notifier(&kprobe_exceptions_nb); | 816 | err = register_die_notifier(&kprobe_exceptions_nb); |
@@ -825,7 +820,7 @@ static int __init init_kprobes(void) | |||
825 | 820 | ||
826 | #ifdef CONFIG_DEBUG_FS | 821 | #ifdef CONFIG_DEBUG_FS |
827 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | 822 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, |
828 | const char *sym, int offset,char *modname) | 823 | const char *sym, int offset,char *modname) |
829 | { | 824 | { |
830 | char *kprobe_type; | 825 | char *kprobe_type; |
831 | 826 | ||
@@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
867 | struct kprobe *p, *kp; | 862 | struct kprobe *p, *kp; |
868 | const char *sym = NULL; | 863 | const char *sym = NULL; |
869 | unsigned int i = *(loff_t *) v; | 864 | unsigned int i = *(loff_t *) v; |
870 | unsigned long size, offset = 0; | 865 | unsigned long offset = 0; |
871 | char *modname, namebuf[128]; | 866 | char *modname, namebuf[128]; |
872 | 867 | ||
873 | head = &kprobe_table[i]; | 868 | head = &kprobe_table[i]; |
874 | preempt_disable(); | 869 | preempt_disable(); |
875 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 870 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
876 | sym = kallsyms_lookup((unsigned long)p->addr, &size, | 871 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, |
877 | &offset, &modname, namebuf); | 872 | &offset, &modname, namebuf); |
878 | if (p->pre_handler == aggr_pre_handler) { | 873 | if (p->pre_handler == aggr_pre_handler) { |
879 | list_for_each_entry_rcu(kp, &p->list, list) | 874 | list_for_each_entry_rcu(kp, &p->list, list) |
@@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = { | |||
904 | .release = seq_release, | 899 | .release = seq_release, |
905 | }; | 900 | }; |
906 | 901 | ||
902 | static void __kprobes enable_all_kprobes(void) | ||
903 | { | ||
904 | struct hlist_head *head; | ||
905 | struct hlist_node *node; | ||
906 | struct kprobe *p; | ||
907 | unsigned int i; | ||
908 | |||
909 | mutex_lock(&kprobe_mutex); | ||
910 | |||
911 | /* If kprobes are already enabled, just return */ | ||
912 | if (kprobe_enabled) | ||
913 | goto already_enabled; | ||
914 | |||
915 | /* | ||
916 | * Re-register the page fault notifier only if there are any | ||
917 | * active probes at the time of enabling kprobes globally | ||
918 | */ | ||
919 | if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT) | ||
920 | register_page_fault_notifier(&kprobe_page_fault_nb); | ||
921 | |||
922 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
923 | head = &kprobe_table[i]; | ||
924 | hlist_for_each_entry_rcu(p, node, head, hlist) | ||
925 | arch_arm_kprobe(p); | ||
926 | } | ||
927 | |||
928 | kprobe_enabled = true; | ||
929 | printk(KERN_INFO "Kprobes globally enabled\n"); | ||
930 | |||
931 | already_enabled: | ||
932 | mutex_unlock(&kprobe_mutex); | ||
933 | return; | ||
934 | } | ||
935 | |||
936 | static void __kprobes disable_all_kprobes(void) | ||
937 | { | ||
938 | struct hlist_head *head; | ||
939 | struct hlist_node *node; | ||
940 | struct kprobe *p; | ||
941 | unsigned int i; | ||
942 | |||
943 | mutex_lock(&kprobe_mutex); | ||
944 | |||
945 | /* If kprobes are already disabled, just return */ | ||
946 | if (!kprobe_enabled) | ||
947 | goto already_disabled; | ||
948 | |||
949 | kprobe_enabled = false; | ||
950 | printk(KERN_INFO "Kprobes globally disabled\n"); | ||
951 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
952 | head = &kprobe_table[i]; | ||
953 | hlist_for_each_entry_rcu(p, node, head, hlist) { | ||
954 | if (!arch_trampoline_kprobe(p)) | ||
955 | arch_disarm_kprobe(p); | ||
956 | } | ||
957 | } | ||
958 | |||
959 | mutex_unlock(&kprobe_mutex); | ||
960 | /* Allow all currently running kprobes to complete */ | ||
961 | synchronize_sched(); | ||
962 | |||
963 | mutex_lock(&kprobe_mutex); | ||
964 | /* Unconditionally unregister the page_fault notifier */ | ||
965 | unregister_page_fault_notifier(&kprobe_page_fault_nb); | ||
966 | |||
967 | already_disabled: | ||
968 | mutex_unlock(&kprobe_mutex); | ||
969 | return; | ||
970 | } | ||
971 | |||
972 | /* | ||
973 | * XXX: The debugfs bool file interface doesn't allow for callbacks | ||
974 | * when the bool state is switched. We can reuse that facility when | ||
975 | * it becomes available. | ||
976 | */ | ||
977 | static ssize_t read_enabled_file_bool(struct file *file, | ||
978 | char __user *user_buf, size_t count, loff_t *ppos) | ||
979 | { | ||
980 | char buf[3]; | ||
981 | |||
982 | if (kprobe_enabled) | ||
983 | buf[0] = '1'; | ||
984 | else | ||
985 | buf[0] = '0'; | ||
986 | buf[1] = '\n'; | ||
987 | buf[2] = 0x00; | ||
988 | return simple_read_from_buffer(user_buf, count, ppos, buf, 2); | ||
989 | } | ||
990 | |||
991 | static ssize_t write_enabled_file_bool(struct file *file, | ||
992 | const char __user *user_buf, size_t count, loff_t *ppos) | ||
993 | { | ||
994 | char buf[32]; | ||
995 | int buf_size; | ||
996 | |||
997 | buf_size = min(count, (sizeof(buf)-1)); | ||
998 | if (copy_from_user(buf, user_buf, buf_size)) | ||
999 | return -EFAULT; | ||
1000 | |||
1001 | switch (buf[0]) { | ||
1002 | case 'y': | ||
1003 | case 'Y': | ||
1004 | case '1': | ||
1005 | enable_all_kprobes(); | ||
1006 | break; | ||
1007 | case 'n': | ||
1008 | case 'N': | ||
1009 | case '0': | ||
1010 | disable_all_kprobes(); | ||
1011 | break; | ||
1012 | } | ||
1013 | |||
1014 | return count; | ||
1015 | } | ||
1016 | |||
1017 | static struct file_operations fops_kp = { | ||
1018 | .read = read_enabled_file_bool, | ||
1019 | .write = write_enabled_file_bool, | ||
1020 | }; | ||
1021 | |||
907 | static int __kprobes debugfs_kprobe_init(void) | 1022 | static int __kprobes debugfs_kprobe_init(void) |
908 | { | 1023 | { |
909 | struct dentry *dir, *file; | 1024 | struct dentry *dir, *file; |
1025 | unsigned int value = 1; | ||
910 | 1026 | ||
911 | dir = debugfs_create_dir("kprobes", NULL); | 1027 | dir = debugfs_create_dir("kprobes", NULL); |
912 | if (!dir) | 1028 | if (!dir) |
913 | return -ENOMEM; | 1029 | return -ENOMEM; |
914 | 1030 | ||
915 | file = debugfs_create_file("list", 0444, dir , 0 , | 1031 | file = debugfs_create_file("list", 0444, dir, NULL, |
916 | &debugfs_kprobes_operations); | 1032 | &debugfs_kprobes_operations); |
917 | if (!file) { | 1033 | if (!file) { |
918 | debugfs_remove(dir); | 1034 | debugfs_remove(dir); |
919 | return -ENOMEM; | 1035 | return -ENOMEM; |
920 | } | 1036 | } |
921 | 1037 | ||
1038 | file = debugfs_create_file("enabled", 0600, dir, | ||
1039 | &value, &fops_kp); | ||
1040 | if (!file) { | ||
1041 | debugfs_remove(dir); | ||
1042 | return -ENOMEM; | ||
1043 | } | ||
1044 | |||
922 | return 0; | 1045 | return 0; |
923 | } | 1046 | } |
924 | 1047 | ||
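The "enabled" file added above accepts 'y'/'Y'/'1' to arm every registered kprobe and 'n'/'N'/'0' to disarm them all. A minimal userspace sketch of driving it, assuming debugfs is mounted at /sys/kernel/debug (the helper name and error handling are illustrative, not part of the patch):

/* toggle_kprobes.c - illustrative userspace helper */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_kprobes_enabled(int enable)
{
	const char *val = enable ? "1\n" : "0\n";
	int fd = open("/sys/kernel/debug/kprobes/enabled", O_WRONLY);

	if (fd < 0) {
		perror("open kprobes/enabled");
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror("write");
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	return set_kprobes_enabled(0) ? 1 : 0;	/* globally disarm all kprobes */
}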
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7065a687ac54..1a5ff2211d88 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace) | |||
257 | trace->entries = stack_trace + nr_stack_trace_entries; | 257 | trace->entries = stack_trace + nr_stack_trace_entries; |
258 | 258 | ||
259 | trace->skip = 3; | 259 | trace->skip = 3; |
260 | trace->all_contexts = 0; | ||
261 | 260 | ||
262 | save_stack_trace(trace, NULL); | 261 | save_stack_trace(trace); |
263 | 262 | ||
264 | trace->max_entries = trace->nr_entries; | 263 | trace->max_entries = trace->nr_entries; |
265 | 264 | ||
@@ -341,10 +340,7 @@ static const char *usage_str[] = | |||
341 | 340 | ||
342 | const char * __get_key_name(struct lockdep_subclass_key *key, char *str) | 341 | const char * __get_key_name(struct lockdep_subclass_key *key, char *str) |
343 | { | 342 | { |
344 | unsigned long offs, size; | 343 | return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); |
345 | char *modname; | ||
346 | |||
347 | return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); | ||
348 | } | 344 | } |
349 | 345 | ||
350 | void | 346 | void |
@@ -1313,8 +1309,9 @@ out_unlock_set: | |||
1313 | 1309 | ||
1314 | /* | 1310 | /* |
1315 | * Look up a dependency chain. If the key is not present yet then | 1311 | * Look up a dependency chain. If the key is not present yet then |
1316 | * add it and return 0 - in this case the new dependency chain is | 1312 | * add it and return 1 - in this case the new dependency chain is |
1317 | * validated. If the key is already hashed, return 1. | 1313 | * validated. If the key is already hashed, return 0. |
1314 | * (On return with 1 graph_lock is held.) | ||
1318 | */ | 1315 | */ |
1319 | static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) | 1316 | static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) |
1320 | { | 1317 | { |
@@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, | |||
1577 | * Mark a lock with a usage bit, and validate the state transition: | 1574 | * Mark a lock with a usage bit, and validate the state transition: |
1578 | */ | 1575 | */ |
1579 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | 1576 | static int mark_lock(struct task_struct *curr, struct held_lock *this, |
1580 | enum lock_usage_bit new_bit, unsigned long ip) | 1577 | enum lock_usage_bit new_bit) |
1581 | { | 1578 | { |
1582 | unsigned int new_mask = 1 << new_bit, ret = 1; | 1579 | unsigned int new_mask = 1 << new_bit, ret = 1; |
1583 | 1580 | ||
@@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
1600 | 1597 | ||
1601 | this->class->usage_mask |= new_mask; | 1598 | this->class->usage_mask |= new_mask; |
1602 | 1599 | ||
1603 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1604 | if (new_bit == LOCK_ENABLED_HARDIRQS || | ||
1605 | new_bit == LOCK_ENABLED_HARDIRQS_READ) | ||
1606 | ip = curr->hardirq_enable_ip; | ||
1607 | else if (new_bit == LOCK_ENABLED_SOFTIRQS || | ||
1608 | new_bit == LOCK_ENABLED_SOFTIRQS_READ) | ||
1609 | ip = curr->softirq_enable_ip; | ||
1610 | #endif | ||
1611 | if (!save_trace(this->class->usage_traces + new_bit)) | 1600 | if (!save_trace(this->class->usage_traces + new_bit)) |
1612 | return 0; | 1601 | return 0; |
1613 | 1602 | ||
@@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
1806 | * Mark all held locks with a usage bit: | 1795 | * Mark all held locks with a usage bit: |
1807 | */ | 1796 | */ |
1808 | static int | 1797 | static int |
1809 | mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) | 1798 | mark_held_locks(struct task_struct *curr, int hardirq) |
1810 | { | 1799 | { |
1811 | enum lock_usage_bit usage_bit; | 1800 | enum lock_usage_bit usage_bit; |
1812 | struct held_lock *hlock; | 1801 | struct held_lock *hlock; |
@@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) | |||
1826 | else | 1815 | else |
1827 | usage_bit = LOCK_ENABLED_SOFTIRQS; | 1816 | usage_bit = LOCK_ENABLED_SOFTIRQS; |
1828 | } | 1817 | } |
1829 | if (!mark_lock(curr, hlock, usage_bit, ip)) | 1818 | if (!mark_lock(curr, hlock, usage_bit)) |
1830 | return 0; | 1819 | return 0; |
1831 | } | 1820 | } |
1832 | 1821 | ||
@@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void) | |||
1879 | * We are going to turn hardirqs on, so set the | 1868 | * We are going to turn hardirqs on, so set the |
1880 | * usage bit for all held locks: | 1869 | * usage bit for all held locks: |
1881 | */ | 1870 | */ |
1882 | if (!mark_held_locks(curr, 1, ip)) | 1871 | if (!mark_held_locks(curr, 1)) |
1883 | return; | 1872 | return; |
1884 | /* | 1873 | /* |
1885 | * If we have softirqs enabled, then set the usage | 1874 | * If we have softirqs enabled, then set the usage |
@@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void) | |||
1887 | * this bit from being set before) | 1876 | * this bit from being set before) |
1888 | */ | 1877 | */ |
1889 | if (curr->softirqs_enabled) | 1878 | if (curr->softirqs_enabled) |
1890 | if (!mark_held_locks(curr, 0, ip)) | 1879 | if (!mark_held_locks(curr, 0)) |
1891 | return; | 1880 | return; |
1892 | 1881 | ||
1893 | curr->hardirq_enable_ip = ip; | 1882 | curr->hardirq_enable_ip = ip; |
@@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip) | |||
1955 | * enabled too: | 1944 | * enabled too: |
1956 | */ | 1945 | */ |
1957 | if (curr->hardirqs_enabled) | 1946 | if (curr->hardirqs_enabled) |
1958 | mark_held_locks(curr, 0, ip); | 1947 | mark_held_locks(curr, 0); |
1959 | } | 1948 | } |
1960 | 1949 | ||
1961 | /* | 1950 | /* |
@@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2093 | if (read) { | 2082 | if (read) { |
2094 | if (curr->hardirq_context) | 2083 | if (curr->hardirq_context) |
2095 | if (!mark_lock(curr, hlock, | 2084 | if (!mark_lock(curr, hlock, |
2096 | LOCK_USED_IN_HARDIRQ_READ, ip)) | 2085 | LOCK_USED_IN_HARDIRQ_READ)) |
2097 | return 0; | 2086 | return 0; |
2098 | if (curr->softirq_context) | 2087 | if (curr->softirq_context) |
2099 | if (!mark_lock(curr, hlock, | 2088 | if (!mark_lock(curr, hlock, |
2100 | LOCK_USED_IN_SOFTIRQ_READ, ip)) | 2089 | LOCK_USED_IN_SOFTIRQ_READ)) |
2101 | return 0; | 2090 | return 0; |
2102 | } else { | 2091 | } else { |
2103 | if (curr->hardirq_context) | 2092 | if (curr->hardirq_context) |
2104 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) | 2093 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) |
2105 | return 0; | 2094 | return 0; |
2106 | if (curr->softirq_context) | 2095 | if (curr->softirq_context) |
2107 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) | 2096 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) |
2108 | return 0; | 2097 | return 0; |
2109 | } | 2098 | } |
2110 | } | 2099 | } |
2111 | if (!hardirqs_off) { | 2100 | if (!hardirqs_off) { |
2112 | if (read) { | 2101 | if (read) { |
2113 | if (!mark_lock(curr, hlock, | 2102 | if (!mark_lock(curr, hlock, |
2114 | LOCK_ENABLED_HARDIRQS_READ, ip)) | 2103 | LOCK_ENABLED_HARDIRQS_READ)) |
2115 | return 0; | 2104 | return 0; |
2116 | if (curr->softirqs_enabled) | 2105 | if (curr->softirqs_enabled) |
2117 | if (!mark_lock(curr, hlock, | 2106 | if (!mark_lock(curr, hlock, |
2118 | LOCK_ENABLED_SOFTIRQS_READ, ip)) | 2107 | LOCK_ENABLED_SOFTIRQS_READ)) |
2119 | return 0; | 2108 | return 0; |
2120 | } else { | 2109 | } else { |
2121 | if (!mark_lock(curr, hlock, | 2110 | if (!mark_lock(curr, hlock, |
2122 | LOCK_ENABLED_HARDIRQS, ip)) | 2111 | LOCK_ENABLED_HARDIRQS)) |
2123 | return 0; | 2112 | return 0; |
2124 | if (curr->softirqs_enabled) | 2113 | if (curr->softirqs_enabled) |
2125 | if (!mark_lock(curr, hlock, | 2114 | if (!mark_lock(curr, hlock, |
2126 | LOCK_ENABLED_SOFTIRQS, ip)) | 2115 | LOCK_ENABLED_SOFTIRQS)) |
2127 | return 0; | 2116 | return 0; |
2128 | } | 2117 | } |
2129 | } | 2118 | } |
2130 | #endif | 2119 | #endif |
2131 | /* mark it as used: */ | 2120 | /* mark it as used: */ |
2132 | if (!mark_lock(curr, hlock, LOCK_USED, ip)) | 2121 | if (!mark_lock(curr, hlock, LOCK_USED)) |
2133 | return 0; | 2122 | return 0; |
2134 | out_calc_hash: | 2123 | out_calc_hash: |
2135 | /* | 2124 | /* |
diff --git a/kernel/module.c b/kernel/module.c index 1eb8ca565ba0..d36e45477fac 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/moduleloader.h> | 20 | #include <linux/moduleloader.h> |
21 | #include <linux/init.h> | 21 | #include <linux/init.h> |
22 | #include <linux/kallsyms.h> | ||
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
@@ -310,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size) | |||
310 | { | 311 | { |
311 | /* Reallocation required? */ | 312 | /* Reallocation required? */ |
312 | if (pcpu_num_used + 1 > pcpu_num_allocated) { | 313 | if (pcpu_num_used + 1 > pcpu_num_allocated) { |
313 | int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, | 314 | int *new; |
314 | GFP_KERNEL); | 315 | |
316 | new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, | ||
317 | GFP_KERNEL); | ||
315 | if (!new) | 318 | if (!new) |
316 | return 0; | 319 | return 0; |
317 | 320 | ||
318 | memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); | ||
319 | pcpu_num_allocated *= 2; | 321 | pcpu_num_allocated *= 2; |
320 | kfree(pcpu_size); | ||
321 | pcpu_size = new; | 322 | pcpu_size = new; |
322 | } | 323 | } |
323 | 324 | ||
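The split_block() hunk above replaces the open-coded kmalloc()/memcpy()/kfree() sequence with krealloc(), which copies the old contents and releases the old buffer on success while leaving it untouched on failure. A short sketch of the caller-side pattern, with illustrative names:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/types.h>

static int *buf;			/* illustrative growable array */
static size_t buf_count;

static int ensure_capacity(size_t want)
{
	int *new;

	if (want <= buf_count)
		return 0;
	new = krealloc(buf, want * sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;		/* 'buf' is still valid here */
	buf = new;
	buf_count = want;
	return 0;
}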
@@ -1471,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | |||
1471 | } | 1472 | } |
1472 | 1473 | ||
1473 | #ifdef CONFIG_KALLSYMS | 1474 | #ifdef CONFIG_KALLSYMS |
1474 | int is_exported(const char *name, const struct module *mod) | 1475 | static int is_exported(const char *name, const struct module *mod) |
1475 | { | 1476 | { |
1476 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) | 1477 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) |
1477 | return 1; | 1478 | return 1; |
@@ -2097,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod, | |||
2097 | if (!best) | 2098 | if (!best) |
2098 | return NULL; | 2099 | return NULL; |
2099 | 2100 | ||
2100 | *size = nextval - mod->symtab[best].st_value; | 2101 | if (size) |
2101 | *offset = addr - mod->symtab[best].st_value; | 2102 | *size = nextval - mod->symtab[best].st_value; |
2103 | if (offset) | ||
2104 | *offset = addr - mod->symtab[best].st_value; | ||
2102 | return mod->strtab + mod->symtab[best].st_name; | 2105 | return mod->strtab + mod->symtab[best].st_name; |
2103 | } | 2106 | } |
2104 | 2107 | ||
@@ -2123,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr, | |||
2123 | return NULL; | 2126 | return NULL; |
2124 | } | 2127 | } |
2125 | 2128 | ||
2126 | struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, | 2129 | int lookup_module_symbol_name(unsigned long addr, char *symname) |
2127 | char *type, char *name, size_t namelen) | 2130 | { |
2131 | struct module *mod; | ||
2132 | |||
2133 | mutex_lock(&module_mutex); | ||
2134 | list_for_each_entry(mod, &modules, list) { | ||
2135 | if (within(addr, mod->module_init, mod->init_size) || | ||
2136 | within(addr, mod->module_core, mod->core_size)) { | ||
2137 | const char *sym; | ||
2138 | |||
2139 | sym = get_ksymbol(mod, addr, NULL, NULL); | ||
2140 | if (!sym) | ||
2141 | goto out; | ||
2142 | strlcpy(symname, sym, KSYM_NAME_LEN + 1); | ||
2143 | mutex_unlock(&module_mutex); | ||
2144 | return 0; | ||
2145 | } | ||
2146 | } | ||
2147 | out: | ||
2148 | mutex_unlock(&module_mutex); | ||
2149 | return -ERANGE; | ||
2150 | } | ||
2151 | |||
2152 | int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | ||
2153 | unsigned long *offset, char *modname, char *name) | ||
2154 | { | ||
2155 | struct module *mod; | ||
2156 | |||
2157 | mutex_lock(&module_mutex); | ||
2158 | list_for_each_entry(mod, &modules, list) { | ||
2159 | if (within(addr, mod->module_init, mod->init_size) || | ||
2160 | within(addr, mod->module_core, mod->core_size)) { | ||
2161 | const char *sym; | ||
2162 | |||
2163 | sym = get_ksymbol(mod, addr, size, offset); | ||
2164 | if (!sym) | ||
2165 | goto out; | ||
2166 | if (modname) | ||
2167 | strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); | ||
2168 | if (name) | ||
2169 | strlcpy(name, sym, KSYM_NAME_LEN + 1); | ||
2170 | mutex_unlock(&module_mutex); | ||
2171 | return 0; | ||
2172 | } | ||
2173 | } | ||
2174 | out: | ||
2175 | mutex_unlock(&module_mutex); | ||
2176 | return -ERANGE; | ||
2177 | } | ||
2178 | |||
2179 | int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | ||
2180 | char *name, char *module_name, int *exported) | ||
2128 | { | 2181 | { |
2129 | struct module *mod; | 2182 | struct module *mod; |
2130 | 2183 | ||
@@ -2134,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, | |||
2134 | *value = mod->symtab[symnum].st_value; | 2187 | *value = mod->symtab[symnum].st_value; |
2135 | *type = mod->symtab[symnum].st_info; | 2188 | *type = mod->symtab[symnum].st_info; |
2136 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, | 2189 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, |
2137 | namelen); | 2190 | KSYM_NAME_LEN + 1); |
2191 | strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); | ||
2192 | *exported = is_exported(name, mod); | ||
2138 | mutex_unlock(&module_mutex); | 2193 | mutex_unlock(&module_mutex); |
2139 | return mod; | 2194 | return 0; |
2140 | } | 2195 | } |
2141 | symnum -= mod->num_symtab; | 2196 | symnum -= mod->num_symtab; |
2142 | } | 2197 | } |
2143 | mutex_unlock(&module_mutex); | 2198 | mutex_unlock(&module_mutex); |
2144 | return NULL; | 2199 | return -ERANGE; |
2145 | } | 2200 | } |
2146 | 2201 | ||
2147 | static unsigned long mod_find_symname(struct module *mod, const char *name) | 2202 | static unsigned long mod_find_symname(struct module *mod, const char *name) |
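The new helpers above give callers a narrower interface than module_address_lookup(): lookup_module_symbol_name() fills in only the name, and module_get_kallsym() now returns 0 or -ERANGE instead of a module pointer. A hedged in-kernel sketch of the name-only helper; the calling function is illustrative:

#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/module.h>

static void report_module_symbol(unsigned long addr)
{
	char symname[KSYM_NAME_LEN + 1];

	/* 0 on success, -ERANGE if addr is not inside any module */
	if (lookup_module_symbol_name(addr, symname) == 0)
		printk(KERN_DEBUG "%#lx is %s\n", addr, symname);
	else
		printk(KERN_DEBUG "%#lx is not a module address\n", addr);
}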
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5b9ee6f6bbb..1bc4b55241a8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk) | |||
38 | 38 | ||
39 | /* | 39 | /* |
40 | * creates a copy of "orig" with refcount 1. | 40 | * creates a copy of "orig" with refcount 1. |
41 | * This does not grab references to the contained namespaces, | ||
42 | * so that needs to be done by dup_namespaces. | ||
43 | */ | 41 | */ |
44 | static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) | 42 | static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) |
45 | { | 43 | { |
46 | struct nsproxy *ns; | 44 | struct nsproxy *ns; |
47 | 45 | ||
@@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) | |||
52 | } | 50 | } |
53 | 51 | ||
54 | /* | 52 | /* |
55 | * copies the nsproxy, setting refcount to 1, and grabbing a | 53 | * Create a new nsproxy and all of its associated namespaces. |
56 | * reference to all contained namespaces. Called from | 54 | * Return the newly created nsproxy. Do not attach this to the task; |
57 | * sys_unshare() | 55 | * leave it to the caller to do proper locking and attach it to the task. |
58 | */ | 56 | */ |
59 | struct nsproxy *dup_namespaces(struct nsproxy *orig) | 57 | static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, |
58 | struct fs_struct *new_fs) | ||
60 | { | 59 | { |
61 | struct nsproxy *ns = clone_namespaces(orig); | 60 | struct nsproxy *new_nsp; |
62 | 61 | ||
63 | if (ns) { | 62 | new_nsp = clone_nsproxy(tsk->nsproxy); |
64 | if (ns->mnt_ns) | 63 | if (!new_nsp) |
65 | get_mnt_ns(ns->mnt_ns); | 64 | return ERR_PTR(-ENOMEM); |
66 | if (ns->uts_ns) | ||
67 | get_uts_ns(ns->uts_ns); | ||
68 | if (ns->ipc_ns) | ||
69 | get_ipc_ns(ns->ipc_ns); | ||
70 | if (ns->pid_ns) | ||
71 | get_pid_ns(ns->pid_ns); | ||
72 | } | ||
73 | 65 | ||
74 | return ns; | 66 | new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); |
67 | if (IS_ERR(new_nsp->mnt_ns)) | ||
68 | goto out_ns; | ||
69 | |||
70 | new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); | ||
71 | if (IS_ERR(new_nsp->uts_ns)) | ||
72 | goto out_uts; | ||
73 | |||
74 | new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); | ||
75 | if (IS_ERR(new_nsp->ipc_ns)) | ||
76 | goto out_ipc; | ||
77 | |||
78 | new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); | ||
79 | if (IS_ERR(new_nsp->pid_ns)) | ||
80 | goto out_pid; | ||
81 | |||
82 | return new_nsp; | ||
83 | |||
84 | out_pid: | ||
85 | if (new_nsp->ipc_ns) | ||
86 | put_ipc_ns(new_nsp->ipc_ns); | ||
87 | out_ipc: | ||
88 | if (new_nsp->uts_ns) | ||
89 | put_uts_ns(new_nsp->uts_ns); | ||
90 | out_uts: | ||
91 | if (new_nsp->mnt_ns) | ||
92 | put_mnt_ns(new_nsp->mnt_ns); | ||
93 | out_ns: | ||
94 | kfree(new_nsp); | ||
95 | return ERR_PTR(-ENOMEM); | ||
75 | } | 96 | } |
76 | 97 | ||
77 | /* | 98 | /* |
@@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
92 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) | 113 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) |
93 | return 0; | 114 | return 0; |
94 | 115 | ||
95 | new_ns = clone_namespaces(old_ns); | 116 | if (!capable(CAP_SYS_ADMIN)) { |
96 | if (!new_ns) { | 117 | err = -EPERM; |
97 | err = -ENOMEM; | ||
98 | goto out; | 118 | goto out; |
99 | } | 119 | } |
100 | 120 | ||
101 | tsk->nsproxy = new_ns; | 121 | new_ns = create_new_namespaces(flags, tsk, tsk->fs); |
102 | 122 | if (IS_ERR(new_ns)) { | |
103 | err = copy_mnt_ns(flags, tsk); | 123 | err = PTR_ERR(new_ns); |
104 | if (err) | 124 | goto out; |
105 | goto out_ns; | 125 | } |
106 | |||
107 | err = copy_utsname(flags, tsk); | ||
108 | if (err) | ||
109 | goto out_uts; | ||
110 | |||
111 | err = copy_ipcs(flags, tsk); | ||
112 | if (err) | ||
113 | goto out_ipc; | ||
114 | |||
115 | err = copy_pid_ns(flags, tsk); | ||
116 | if (err) | ||
117 | goto out_pid; | ||
118 | 126 | ||
127 | tsk->nsproxy = new_ns; | ||
119 | out: | 128 | out: |
120 | put_nsproxy(old_ns); | 129 | put_nsproxy(old_ns); |
121 | return err; | 130 | return err; |
122 | |||
123 | out_pid: | ||
124 | if (new_ns->ipc_ns) | ||
125 | put_ipc_ns(new_ns->ipc_ns); | ||
126 | out_ipc: | ||
127 | if (new_ns->uts_ns) | ||
128 | put_uts_ns(new_ns->uts_ns); | ||
129 | out_uts: | ||
130 | if (new_ns->mnt_ns) | ||
131 | put_mnt_ns(new_ns->mnt_ns); | ||
132 | out_ns: | ||
133 | tsk->nsproxy = old_ns; | ||
134 | kfree(new_ns); | ||
135 | goto out; | ||
136 | } | 131 | } |
137 | 132 | ||
138 | void free_nsproxy(struct nsproxy *ns) | 133 | void free_nsproxy(struct nsproxy *ns) |
@@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns) | |||
147 | put_pid_ns(ns->pid_ns); | 142 | put_pid_ns(ns->pid_ns); |
148 | kfree(ns); | 143 | kfree(ns); |
149 | } | 144 | } |
145 | |||
146 | /* | ||
147 | * Called from unshare. Unshare all the namespaces that are part of nsproxy. | ||
148 | * On success, returns the new nsproxy and a reference to the old nsproxy | ||
149 | * to make sure it stays around. | ||
150 | */ | ||
151 | int unshare_nsproxy_namespaces(unsigned long unshare_flags, | ||
152 | struct nsproxy **new_nsp, struct fs_struct *new_fs) | ||
153 | { | ||
154 | struct nsproxy *old_ns = current->nsproxy; | ||
155 | int err = 0; | ||
156 | |||
157 | if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) | ||
158 | return 0; | ||
159 | |||
160 | #ifndef CONFIG_IPC_NS | ||
161 | if (unshare_flags & CLONE_NEWIPC) | ||
162 | return -EINVAL; | ||
163 | #endif | ||
164 | |||
165 | #ifndef CONFIG_UTS_NS | ||
166 | if (unshare_flags & CLONE_NEWUTS) | ||
167 | return -EINVAL; | ||
168 | #endif | ||
169 | |||
170 | if (!capable(CAP_SYS_ADMIN)) | ||
171 | return -EPERM; | ||
172 | |||
173 | get_nsproxy(old_ns); | ||
174 | |||
175 | *new_nsp = create_new_namespaces(unshare_flags, current, | ||
176 | new_fs ? new_fs : current->fs); | ||
177 | if (IS_ERR(*new_nsp)) { | ||
178 | err = PTR_ERR(*new_nsp); | ||
179 | put_nsproxy(old_ns); | ||
180 | } | ||
181 | return err; | ||
182 | } | ||
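unshare_nsproxy_namespaces() above is the hook sys_unshare() uses for CLONE_NEWNS/CLONE_NEWUTS/CLONE_NEWIPC: it builds a complete replacement nsproxy and leaves attaching it to the caller. From userspace this path is exercised with unshare(2). A minimal sketch, assuming a kernel built with CONFIG_UTS_NS and a process holding CAP_SYS_ADMIN:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Detach from the parent's UTS namespace; fails with EPERM
	 * without CAP_SYS_ADMIN. */
	if (unshare(CLONE_NEWUTS) < 0) {
		perror("unshare(CLONE_NEWUTS)");
		return 1;
	}
	/* Hostname changes are now private to this namespace. */
	if (sethostname("sandbox", strlen("sandbox")) < 0)
		perror("sethostname");
	return 0;
}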
diff --git a/kernel/params.c b/kernel/params.c index 312172320b4c..e61c46c97ce7 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) | |||
269 | return param_get_bool(buffer, &dummy); | 269 | return param_get_bool(buffer, &dummy); |
270 | } | 270 | } |
271 | 271 | ||
272 | /* We cheat here and temporarily mangle the string. */ | 272 | /* We break the rule and mangle the string. */ |
273 | static int param_array(const char *name, | 273 | static int param_array(const char *name, |
274 | const char *val, | 274 | const char *val, |
275 | unsigned int min, unsigned int max, | 275 | unsigned int min, unsigned int max, |
diff --git a/kernel/pid.c b/kernel/pid.c index 9c80bc23d6b8..d3ad724afa83 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr) | |||
360 | } | 360 | } |
361 | EXPORT_SYMBOL_GPL(find_get_pid); | 361 | EXPORT_SYMBOL_GPL(find_get_pid); |
362 | 362 | ||
363 | int copy_pid_ns(int flags, struct task_struct *tsk) | 363 | struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) |
364 | { | 364 | { |
365 | struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; | 365 | BUG_ON(!old_ns); |
366 | int err = 0; | ||
367 | |||
368 | if (!old_ns) | ||
369 | return 0; | ||
370 | |||
371 | get_pid_ns(old_ns); | 366 | get_pid_ns(old_ns); |
372 | return err; | 367 | return old_ns; |
373 | } | 368 | } |
374 | 369 | ||
375 | void free_pid_ns(struct kref *kref) | 370 | void free_pid_ns(struct kref *kref) |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 657f77697415..1de710e18373 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
971 | maxfire = 20; | 971 | maxfire = 20; |
972 | tsk->it_prof_expires = cputime_zero; | 972 | tsk->it_prof_expires = cputime_zero; |
973 | while (!list_empty(timers)) { | 973 | while (!list_empty(timers)) { |
974 | struct cpu_timer_list *t = list_entry(timers->next, | 974 | struct cpu_timer_list *t = list_first_entry(timers, |
975 | struct cpu_timer_list, | 975 | struct cpu_timer_list, |
976 | entry); | 976 | entry); |
977 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | 977 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { |
@@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
986 | maxfire = 20; | 986 | maxfire = 20; |
987 | tsk->it_virt_expires = cputime_zero; | 987 | tsk->it_virt_expires = cputime_zero; |
988 | while (!list_empty(timers)) { | 988 | while (!list_empty(timers)) { |
989 | struct cpu_timer_list *t = list_entry(timers->next, | 989 | struct cpu_timer_list *t = list_first_entry(timers, |
990 | struct cpu_timer_list, | 990 | struct cpu_timer_list, |
991 | entry); | 991 | entry); |
992 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | 992 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { |
@@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1001 | maxfire = 20; | 1001 | maxfire = 20; |
1002 | tsk->it_sched_expires = 0; | 1002 | tsk->it_sched_expires = 0; |
1003 | while (!list_empty(timers)) { | 1003 | while (!list_empty(timers)) { |
1004 | struct cpu_timer_list *t = list_entry(timers->next, | 1004 | struct cpu_timer_list *t = list_first_entry(timers, |
1005 | struct cpu_timer_list, | 1005 | struct cpu_timer_list, |
1006 | entry); | 1006 | entry); |
1007 | if (!--maxfire || tsk->sched_time < t->expires.sched) { | 1007 | if (!--maxfire || tsk->sched_time < t->expires.sched) { |
@@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1057 | maxfire = 20; | 1057 | maxfire = 20; |
1058 | prof_expires = cputime_zero; | 1058 | prof_expires = cputime_zero; |
1059 | while (!list_empty(timers)) { | 1059 | while (!list_empty(timers)) { |
1060 | struct cpu_timer_list *t = list_entry(timers->next, | 1060 | struct cpu_timer_list *t = list_first_entry(timers, |
1061 | struct cpu_timer_list, | 1061 | struct cpu_timer_list, |
1062 | entry); | 1062 | entry); |
1063 | if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { | 1063 | if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { |
@@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1072 | maxfire = 20; | 1072 | maxfire = 20; |
1073 | virt_expires = cputime_zero; | 1073 | virt_expires = cputime_zero; |
1074 | while (!list_empty(timers)) { | 1074 | while (!list_empty(timers)) { |
1075 | struct cpu_timer_list *t = list_entry(timers->next, | 1075 | struct cpu_timer_list *t = list_first_entry(timers, |
1076 | struct cpu_timer_list, | 1076 | struct cpu_timer_list, |
1077 | entry); | 1077 | entry); |
1078 | if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { | 1078 | if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { |
@@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1087 | maxfire = 20; | 1087 | maxfire = 20; |
1088 | sched_expires = 0; | 1088 | sched_expires = 0; |
1089 | while (!list_empty(timers)) { | 1089 | while (!list_empty(timers)) { |
1090 | struct cpu_timer_list *t = list_entry(timers->next, | 1090 | struct cpu_timer_list *t = list_first_entry(timers, |
1091 | struct cpu_timer_list, | 1091 | struct cpu_timer_list, |
1092 | entry); | 1092 | entry); |
1093 | if (!--maxfire || sched_time < t->expires.sched) { | 1093 | if (!--maxfire || sched_time < t->expires.sched) { |
@@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1400 | */ | 1400 | */ |
1401 | head = &tsk->signal->cpu_timers[clock_idx]; | 1401 | head = &tsk->signal->cpu_timers[clock_idx]; |
1402 | if (list_empty(head) || | 1402 | if (list_empty(head) || |
1403 | cputime_ge(list_entry(head->next, | 1403 | cputime_ge(list_first_entry(head, |
1404 | struct cpu_timer_list, entry)->expires.cpu, | 1404 | struct cpu_timer_list, entry)->expires.cpu, |
1405 | *newval)) { | 1405 | *newval)) { |
1406 | /* | 1406 | /* |
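The list_first_entry() conversions above are cosmetic: the macro is shorthand for list_entry(head->next, type, member) and, like the open-coded form, is only meaningful on a list already known to be non-empty. A small sketch with an illustrative element type:

#include <linux/list.h>

struct waiter {				/* illustrative type, not from this patch */
	int ticket;
	struct list_head entry;
};

static int first_ticket(struct list_head *head)
{
	struct waiter *w;

	if (list_empty(head))
		return -1;
	/* Equivalent to list_entry(head->next, struct waiter, entry) */
	w = list_first_entry(head, struct waiter, entry);
	return w->ticket;
}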
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44318ca71978..588c99da0307 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -31,7 +31,6 @@ | |||
31 | * POSIX clocks & timers | 31 | * POSIX clocks & timers |
32 | */ | 32 | */ |
33 | #include <linux/mm.h> | 33 | #include <linux/mm.h> |
34 | #include <linux/smp_lock.h> | ||
35 | #include <linux/interrupt.h> | 34 | #include <linux/interrupt.h> |
36 | #include <linux/slab.h> | 35 | #include <linux/slab.h> |
37 | #include <linux/time.h> | 36 | #include <linux/time.h> |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0eb5c420e8ed..088419387388 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -8,7 +8,6 @@ | |||
8 | 8 | ||
9 | #undef DEBUG | 9 | #undef DEBUG |
10 | 10 | ||
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
13 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
14 | #include <linux/module.h> | 13 | #include <linux/module.h> |
@@ -25,10 +24,9 @@ | |||
25 | 24 | ||
26 | static inline int freezeable(struct task_struct * p) | 25 | static inline int freezeable(struct task_struct * p) |
27 | { | 26 | { |
28 | if ((p == current) || | 27 | if ((p == current) || |
29 | (p->flags & PF_NOFREEZE) || | 28 | (p->flags & PF_NOFREEZE) || |
30 | (p->exit_state == EXIT_ZOMBIE) || | 29 | (p->exit_state != 0)) |
31 | (p->exit_state == EXIT_DEAD)) | ||
32 | return 0; | 30 | return 0; |
33 | return 1; | 31 | return 1; |
34 | } | 32 | } |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 128da11f01c2..b7039772b05c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/suspend.h> | 16 | #include <linux/suspend.h> |
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
19 | #include <linux/bitops.h> | 18 | #include <linux/bitops.h> |
20 | #include <linux/spinlock.h> | 19 | #include <linux/spinlock.h> |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e83ed9945a80..b8b235cc19d1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -12,7 +12,6 @@ | |||
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/smp_lock.h> | ||
16 | #include <linux/file.h> | 15 | #include <linux/file.h> |
17 | #include <linux/utsname.h> | 16 | #include <linux/utsname.h> |
18 | #include <linux/version.h> | 17 | #include <linux/version.h> |
diff --git a/kernel/printk.c b/kernel/printk.c index 4b47e59248df..0bbdeac2810c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/tty.h> | 21 | #include <linux/tty.h> |
22 | #include <linux/tty_driver.h> | 22 | #include <linux/tty_driver.h> |
23 | #include <linux/smp_lock.h> | ||
24 | #include <linux/console.h> | 23 | #include <linux/console.h> |
25 | #include <linux/init.h> | 24 | #include <linux/init.h> |
26 | #include <linux/module.h> | 25 | #include <linux/module.h> |
@@ -931,8 +930,16 @@ void register_console(struct console *console) | |||
931 | { | 930 | { |
932 | int i; | 931 | int i; |
933 | unsigned long flags; | 932 | unsigned long flags; |
933 | struct console *bootconsole = NULL; | ||
934 | 934 | ||
935 | if (preferred_console < 0) | 935 | if (console_drivers) { |
936 | if (console->flags & CON_BOOT) | ||
937 | return; | ||
938 | if (console_drivers->flags & CON_BOOT) | ||
939 | bootconsole = console_drivers; | ||
940 | } | ||
941 | |||
942 | if (preferred_console < 0 || bootconsole || !console_drivers) | ||
936 | preferred_console = selected_console; | 943 | preferred_console = selected_console; |
937 | 944 | ||
938 | /* | 945 | /* |
@@ -978,8 +985,11 @@ void register_console(struct console *console) | |||
978 | if (!(console->flags & CON_ENABLED)) | 985 | if (!(console->flags & CON_ENABLED)) |
979 | return; | 986 | return; |
980 | 987 | ||
981 | if (console_drivers && (console_drivers->flags & CON_BOOT)) { | 988 | if (bootconsole) { |
982 | unregister_console(console_drivers); | 989 | printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", |
990 | bootconsole->name, bootconsole->index, | ||
991 | console->name, console->index); | ||
992 | unregister_console(bootconsole); | ||
983 | console->flags &= ~CON_PRINTBUFFER; | 993 | console->flags &= ~CON_PRINTBUFFER; |
984 | } | 994 | } |
985 | 995 | ||
@@ -1030,16 +1040,11 @@ int unregister_console(struct console *console) | |||
1030 | } | 1040 | } |
1031 | } | 1041 | } |
1032 | 1042 | ||
1033 | /* If last console is removed, we re-enable picking the first | 1043 | /* |
1034 | * one that gets registered. Without that, pmac early boot console | ||
1035 | * would prevent fbcon from taking over. | ||
1036 | * | ||
1037 | * If this isn't the last console and it has CON_CONSDEV set, we | 1044 | * If this isn't the last console and it has CON_CONSDEV set, we |
1038 | * need to set it on the next preferred console. | 1045 | * need to set it on the next preferred console. |
1039 | */ | 1046 | */ |
1040 | if (console_drivers == NULL) | 1047 | if (console_drivers != NULL && console->flags & CON_CONSDEV) |
1041 | preferred_console = selected_console; | ||
1042 | else if (console->flags & CON_CONSDEV) | ||
1043 | console_drivers->flags |= CON_CONSDEV; | 1048 | console_drivers->flags |= CON_CONSDEV; |
1044 | 1049 | ||
1045 | release_console_sem(); | 1050 | release_console_sem(); |
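With the register_console() change above, a console flagged CON_BOOT is treated as temporary: once any console is registered, further CON_BOOT consoles are ignored, and the first real console to register triggers the logged handover and unregisters the boot console. A hedged sketch of what such an early console might look like; the UART write callback and names are illustrative:

#include <linux/console.h>
#include <linux/init.h>

static void early_uart_write(struct console *con, const char *s, unsigned n)
{
	/* poll characters out of a fixed UART here */
}

static struct console early_con = {
	.name	= "earlycon",
	.write	= early_uart_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,
	.index	= -1,
};

static int __init setup_early_con(char *buf)
{
	register_console(&early_con);	/* dropped automatically on handover */
	return 0;
}
early_param("myearlycon", setup_early_con);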
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index bcd14e83ef39..55ba82a85a66 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = { | |||
502 | .name = "sched" | 502 | .name = "sched" |
503 | }; | 503 | }; |
504 | 504 | ||
505 | static struct rcu_torture_ops *torture_ops[] = | ||
506 | { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, | ||
507 | &sched_ops, NULL }; | ||
508 | |||
509 | /* | 505 | /* |
510 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 506 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
511 | * for that pointed to by rcu_torture_current, freeing the old structure | 507 | * for that pointed to by rcu_torture_current, freeing the old structure |
@@ -534,7 +530,7 @@ rcu_torture_writer(void *arg) | |||
534 | rp->rtort_mbtest = 1; | 530 | rp->rtort_mbtest = 1; |
535 | rcu_assign_pointer(rcu_torture_current, rp); | 531 | rcu_assign_pointer(rcu_torture_current, rp); |
536 | smp_wmb(); | 532 | smp_wmb(); |
537 | if (old_rp != NULL) { | 533 | if (old_rp) { |
538 | i = old_rp->rtort_pipe_count; | 534 | i = old_rp->rtort_pipe_count; |
539 | if (i > RCU_TORTURE_PIPE_LEN) | 535 | if (i > RCU_TORTURE_PIPE_LEN) |
540 | i = RCU_TORTURE_PIPE_LEN; | 536 | i = RCU_TORTURE_PIPE_LEN; |
@@ -685,7 +681,7 @@ rcu_torture_printk(char *page) | |||
685 | atomic_read(&rcu_torture_wcount[i])); | 681 | atomic_read(&rcu_torture_wcount[i])); |
686 | } | 682 | } |
687 | cnt += sprintf(&page[cnt], "\n"); | 683 | cnt += sprintf(&page[cnt], "\n"); |
688 | if (cur_ops->stats != NULL) | 684 | if (cur_ops->stats) |
689 | cnt += cur_ops->stats(&page[cnt]); | 685 | cnt += cur_ops->stats(&page[cnt]); |
690 | return cnt; | 686 | return cnt; |
691 | } | 687 | } |
@@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void) | |||
749 | 745 | ||
750 | set_cpus_allowed(current, tmp_mask); | 746 | set_cpus_allowed(current, tmp_mask); |
751 | 747 | ||
752 | if (reader_tasks != NULL) { | 748 | if (reader_tasks) { |
753 | for (i = 0; i < nrealreaders; i++) | 749 | for (i = 0; i < nrealreaders; i++) |
754 | if (reader_tasks[i]) | 750 | if (reader_tasks[i]) |
755 | set_cpus_allowed(reader_tasks[i], tmp_mask); | 751 | set_cpus_allowed(reader_tasks[i], tmp_mask); |
756 | } | 752 | } |
757 | 753 | ||
758 | if (fakewriter_tasks != NULL) { | 754 | if (fakewriter_tasks) { |
759 | for (i = 0; i < nfakewriters; i++) | 755 | for (i = 0; i < nfakewriters; i++) |
760 | if (fakewriter_tasks[i]) | 756 | if (fakewriter_tasks[i]) |
761 | set_cpus_allowed(fakewriter_tasks[i], tmp_mask); | 757 | set_cpus_allowed(fakewriter_tasks[i], tmp_mask); |
@@ -808,21 +804,21 @@ rcu_torture_cleanup(void) | |||
808 | int i; | 804 | int i; |
809 | 805 | ||
810 | fullstop = 1; | 806 | fullstop = 1; |
811 | if (shuffler_task != NULL) { | 807 | if (shuffler_task) { |
812 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); | 808 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); |
813 | kthread_stop(shuffler_task); | 809 | kthread_stop(shuffler_task); |
814 | } | 810 | } |
815 | shuffler_task = NULL; | 811 | shuffler_task = NULL; |
816 | 812 | ||
817 | if (writer_task != NULL) { | 813 | if (writer_task) { |
818 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | 814 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); |
819 | kthread_stop(writer_task); | 815 | kthread_stop(writer_task); |
820 | } | 816 | } |
821 | writer_task = NULL; | 817 | writer_task = NULL; |
822 | 818 | ||
823 | if (reader_tasks != NULL) { | 819 | if (reader_tasks) { |
824 | for (i = 0; i < nrealreaders; i++) { | 820 | for (i = 0; i < nrealreaders; i++) { |
825 | if (reader_tasks[i] != NULL) { | 821 | if (reader_tasks[i]) { |
826 | VERBOSE_PRINTK_STRING( | 822 | VERBOSE_PRINTK_STRING( |
827 | "Stopping rcu_torture_reader task"); | 823 | "Stopping rcu_torture_reader task"); |
828 | kthread_stop(reader_tasks[i]); | 824 | kthread_stop(reader_tasks[i]); |
@@ -834,9 +830,9 @@ rcu_torture_cleanup(void) | |||
834 | } | 830 | } |
835 | rcu_torture_current = NULL; | 831 | rcu_torture_current = NULL; |
836 | 832 | ||
837 | if (fakewriter_tasks != NULL) { | 833 | if (fakewriter_tasks) { |
838 | for (i = 0; i < nfakewriters; i++) { | 834 | for (i = 0; i < nfakewriters; i++) { |
839 | if (fakewriter_tasks[i] != NULL) { | 835 | if (fakewriter_tasks[i]) { |
840 | VERBOSE_PRINTK_STRING( | 836 | VERBOSE_PRINTK_STRING( |
841 | "Stopping rcu_torture_fakewriter task"); | 837 | "Stopping rcu_torture_fakewriter task"); |
842 | kthread_stop(fakewriter_tasks[i]); | 838 | kthread_stop(fakewriter_tasks[i]); |
@@ -847,7 +843,7 @@ rcu_torture_cleanup(void) | |||
847 | fakewriter_tasks = NULL; | 843 | fakewriter_tasks = NULL; |
848 | } | 844 | } |
849 | 845 | ||
850 | if (stats_task != NULL) { | 846 | if (stats_task) { |
851 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | 847 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); |
852 | kthread_stop(stats_task); | 848 | kthread_stop(stats_task); |
853 | } | 849 | } |
@@ -858,7 +854,7 @@ rcu_torture_cleanup(void) | |||
858 | 854 | ||
859 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 855 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
860 | 856 | ||
861 | if (cur_ops->cleanup != NULL) | 857 | if (cur_ops->cleanup) |
862 | cur_ops->cleanup(); | 858 | cur_ops->cleanup(); |
863 | if (atomic_read(&n_rcu_torture_error)) | 859 | if (atomic_read(&n_rcu_torture_error)) |
864 | rcu_torture_print_module_parms("End of test: FAILURE"); | 860 | rcu_torture_print_module_parms("End of test: FAILURE"); |
@@ -866,27 +862,28 @@ rcu_torture_cleanup(void) | |||
866 | rcu_torture_print_module_parms("End of test: SUCCESS"); | 862 | rcu_torture_print_module_parms("End of test: SUCCESS"); |
867 | } | 863 | } |
868 | 864 | ||
869 | static int | 865 | static int __init |
870 | rcu_torture_init(void) | 866 | rcu_torture_init(void) |
871 | { | 867 | { |
872 | int i; | 868 | int i; |
873 | int cpu; | 869 | int cpu; |
874 | int firsterr = 0; | 870 | int firsterr = 0; |
871 | static struct rcu_torture_ops *torture_ops[] = | ||
872 | { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, | ||
873 | &srcu_ops, &sched_ops, }; | ||
875 | 874 | ||
876 | /* Process args and tell the world that the torturer is on the job. */ | 875 | /* Process args and tell the world that the torturer is on the job. */ |
877 | 876 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | |
878 | for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { | ||
879 | cur_ops = torture_ops[i]; | 877 | cur_ops = torture_ops[i]; |
880 | if (strcmp(torture_type, cur_ops->name) == 0) { | 878 | if (strcmp(torture_type, cur_ops->name) == 0) |
881 | break; | 879 | break; |
882 | } | ||
883 | } | 880 | } |
884 | if (cur_ops == NULL) { | 881 | if (i == ARRAY_SIZE(torture_ops)) { |
885 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", | 882 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", |
886 | torture_type); | 883 | torture_type); |
887 | return (-EINVAL); | 884 | return (-EINVAL); |
888 | } | 885 | } |
889 | if (cur_ops->init != NULL) | 886 | if (cur_ops->init) |
890 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 887 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
891 | 888 | ||
892 | if (nreaders >= 0) | 889 | if (nreaders >= 0) |
@@ -899,7 +896,7 @@ rcu_torture_init(void) | |||
899 | /* Set up the freelist. */ | 896 | /* Set up the freelist. */ |
900 | 897 | ||
901 | INIT_LIST_HEAD(&rcu_torture_freelist); | 898 | INIT_LIST_HEAD(&rcu_torture_freelist); |
902 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { | 899 | for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { |
903 | rcu_tortures[i].rtort_mbtest = 0; | 900 | rcu_tortures[i].rtort_mbtest = 0; |
904 | list_add_tail(&rcu_tortures[i].rtort_free, | 901 | list_add_tail(&rcu_tortures[i].rtort_free, |
905 | &rcu_torture_freelist); | 902 | &rcu_torture_freelist); |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 291ded556aa0..9a87886b022e 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem) | |||
60 | int ret = __down_write_trylock(sem); | 60 | int ret = __down_write_trylock(sem); |
61 | 61 | ||
62 | if (ret == 1) | 62 | if (ret == 1) |
63 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 63 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); |
64 | return ret; | 64 | return ret; |
65 | } | 65 | } |
66 | 66 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 0227f1625a75..a3a04085e794 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -52,8 +52,9 @@ | |||
52 | #include <linux/tsacct_kern.h> | 52 | #include <linux/tsacct_kern.h> |
53 | #include <linux/kprobes.h> | 53 | #include <linux/kprobes.h> |
54 | #include <linux/delayacct.h> | 54 | #include <linux/delayacct.h> |
55 | #include <asm/tlb.h> | 55 | #include <linux/reciprocal_div.h> |
56 | 56 | ||
57 | #include <asm/tlb.h> | ||
57 | #include <asm/unistd.h> | 58 | #include <asm/unistd.h> |
58 | 59 | ||
59 | /* | 60 | /* |
@@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
168 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | 169 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) |
169 | 170 | ||
170 | #define TASK_PREEMPTS_CURR(p, rq) \ | 171 | #define TASK_PREEMPTS_CURR(p, rq) \ |
171 | ((p)->prio < (rq)->curr->prio) | 172 | (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active)) |
172 | 173 | ||
173 | #define SCALE_PRIO(x, prio) \ | 174 | #define SCALE_PRIO(x, prio) \ |
174 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | 175 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio) | |||
181 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | 182 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
182 | } | 183 | } |
183 | 184 | ||
185 | #ifdef CONFIG_SMP | ||
186 | /* | ||
187 | * Divide a load by a sched group cpu_power: (load / sg->__cpu_power) | ||
188 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
189 | */ | ||
190 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
191 | { | ||
192 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Each time a sched group cpu_power is changed, | ||
197 | * we must compute its reciprocal value | ||
198 | */ | ||
199 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
200 | { | ||
201 | sg->__cpu_power += val; | ||
202 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
203 | } | ||
204 | #endif | ||
205 | |||
184 | /* | 206 | /* |
185 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | 207 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
186 | * to time slice values: [800ms ... 100ms ... 5ms] | 208 | * to time slice values: [800ms ... 100ms ... 5ms] |
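The two helpers above capture the reciprocal-divide trick from the reciprocal_div header included earlier: whenever __cpu_power changes (rare), its reciprocal is recomputed, so the hot-path division by cpu_power becomes a multiply and a shift. A hedged standalone sketch of the same pattern; the struct and function names are illustrative, not the scheduler's:

#include <linux/reciprocal_div.h>
#include <linux/types.h>

struct scaled_power {
	u32 power;
	u32 reciprocal_power;
};

/* Slow path: runs only when the divisor changes */
static void set_power(struct scaled_power *p, u32 val)
{
	p->power = val;
	p->reciprocal_power = reciprocal_value(val);
}

/* Hot path: load / p->power without an integer divide */
static u32 scale_load(const struct scaled_power *p, u32 load)
{
	return reciprocal_divide(load, p->reciprocal_power);
}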
@@ -223,6 +245,10 @@ struct rq { | |||
223 | unsigned long raw_weighted_load; | 245 | unsigned long raw_weighted_load; |
224 | #ifdef CONFIG_SMP | 246 | #ifdef CONFIG_SMP |
225 | unsigned long cpu_load[3]; | 247 | unsigned long cpu_load[3]; |
248 | unsigned char idle_at_tick; | ||
249 | #ifdef CONFIG_NO_HZ | ||
250 | unsigned char in_nohz_recently; | ||
251 | #endif | ||
226 | #endif | 252 | #endif |
227 | unsigned long long nr_switches; | 253 | unsigned long long nr_switches; |
228 | 254 | ||
@@ -278,7 +304,7 @@ struct rq { | |||
278 | struct lock_class_key rq_lock_key; | 304 | struct lock_class_key rq_lock_key; |
279 | }; | 305 | }; |
280 | 306 | ||
281 | static DEFINE_PER_CPU(struct rq, runqueues); | 307 | static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; |
282 | 308 | ||
283 | static inline int cpu_of(struct rq *rq) | 309 | static inline int cpu_of(struct rq *rq) |
284 | { | 310 | { |
@@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p) | |||
1049 | if (!tsk_is_polling(p)) | 1075 | if (!tsk_is_polling(p)) |
1050 | smp_send_reschedule(cpu); | 1076 | smp_send_reschedule(cpu); |
1051 | } | 1077 | } |
1078 | |||
1079 | static void resched_cpu(int cpu) | ||
1080 | { | ||
1081 | struct rq *rq = cpu_rq(cpu); | ||
1082 | unsigned long flags; | ||
1083 | |||
1084 | if (!spin_trylock_irqsave(&rq->lock, flags)) | ||
1085 | return; | ||
1086 | resched_task(cpu_curr(cpu)); | ||
1087 | spin_unlock_irqrestore(&rq->lock, flags); | ||
1088 | } | ||
1052 | #else | 1089 | #else |
1053 | static inline void resched_task(struct task_struct *p) | 1090 | static inline void resched_task(struct task_struct *p) |
1054 | { | 1091 | { |
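The new resched_cpu() above is deliberately best-effort: it uses spin_trylock_irqsave() and simply gives up if the target runqueue lock is contended, since missing one kick is harmless. A rough userspace analogue of that pattern using a pthread mutex (function and variable names are made up for illustration):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* best-effort kick: skip the work entirely if someone else holds the lock */
static void try_kick(void)
{
        if (pthread_mutex_trylock(&rq_lock) != 0) {
                printf("lock busy, skipping the kick\n");
                return;
        }
        printf("lock taken, kicking\n");
        pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
        try_kick();                     /* uncontended: succeeds */
        pthread_mutex_lock(&rq_lock);
        try_kick();                     /* contended: silently skipped */
        pthread_mutex_unlock(&rq_lock);
        return 0;
}

The second call in main() exercises the skip path: with the lock already held, the kick is dropped instead of waited for.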
@@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1241 | } | 1278 | } |
1242 | 1279 | ||
1243 | /* Adjust by relative CPU power of the group */ | 1280 | /* Adjust by relative CPU power of the group */ |
1244 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1281 | avg_load = sg_div_cpu_power(group, |
1282 | avg_load * SCHED_LOAD_SCALE); | ||
1245 | 1283 | ||
1246 | if (local_group) { | 1284 | if (local_group) { |
1247 | this_load = avg_load; | 1285 | this_load = avg_load; |
@@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
1368 | struct sched_domain *sd; | 1406 | struct sched_domain *sd; |
1369 | int i; | 1407 | int i; |
1370 | 1408 | ||
1371 | if (idle_cpu(cpu)) | 1409 | /* |
1410 | * If it is idle, then it is the best cpu to run this task. | ||
1411 | * | ||
1412 | * This cpu is also the best, if it has more than one task already. | ||
1413 | * Siblings must also be busy (in most cases) as they didn't already | ||
1414 | * pick up the extra load from this cpu and hence we need not check | ||
1415 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1416 | * penalties associated with that. | ||
1417 | */ | ||
1418 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
1372 | return cpu; | 1419 | return cpu; |
1373 | 1420 | ||
1374 | for_each_domain(cpu, sd) { | 1421 | for_each_domain(cpu, sd) { |
@@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2352 | } | 2399 | } |
2353 | 2400 | ||
2354 | total_load += avg_load; | 2401 | total_load += avg_load; |
2355 | total_pwr += group->cpu_power; | 2402 | total_pwr += group->__cpu_power; |
2356 | 2403 | ||
2357 | /* Adjust by relative CPU power of the group */ | 2404 | /* Adjust by relative CPU power of the group */ |
2358 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2405 | avg_load = sg_div_cpu_power(group, |
2406 | avg_load * SCHED_LOAD_SCALE); | ||
2359 | 2407 | ||
2360 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | 2408 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
2361 | 2409 | ||
2362 | if (local_group) { | 2410 | if (local_group) { |
2363 | this_load = avg_load; | 2411 | this_load = avg_load; |
@@ -2468,8 +2516,8 @@ group_next: | |||
2468 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); | 2516 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2469 | 2517 | ||
2470 | /* How much load to actually move to equalise the imbalance */ | 2518 | /* How much load to actually move to equalise the imbalance */ |
2471 | *imbalance = min(max_pull * busiest->cpu_power, | 2519 | *imbalance = min(max_pull * busiest->__cpu_power, |
2472 | (avg_load - this_load) * this->cpu_power) | 2520 | (avg_load - this_load) * this->__cpu_power) |
2473 | / SCHED_LOAD_SCALE; | 2521 | / SCHED_LOAD_SCALE; |
2474 | 2522 | ||
2475 | /* | 2523 | /* |
@@ -2503,28 +2551,29 @@ small_imbalance: | |||
2503 | * moving them. | 2551 | * moving them. |
2504 | */ | 2552 | */ |
2505 | 2553 | ||
2506 | pwr_now += busiest->cpu_power * | 2554 | pwr_now += busiest->__cpu_power * |
2507 | min(busiest_load_per_task, max_load); | 2555 | min(busiest_load_per_task, max_load); |
2508 | pwr_now += this->cpu_power * | 2556 | pwr_now += this->__cpu_power * |
2509 | min(this_load_per_task, this_load); | 2557 | min(this_load_per_task, this_load); |
2510 | pwr_now /= SCHED_LOAD_SCALE; | 2558 | pwr_now /= SCHED_LOAD_SCALE; |
2511 | 2559 | ||
2512 | /* Amount of load we'd subtract */ | 2560 | /* Amount of load we'd subtract */ |
2513 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / | 2561 | tmp = sg_div_cpu_power(busiest, |
2514 | busiest->cpu_power; | 2562 | busiest_load_per_task * SCHED_LOAD_SCALE); |
2515 | if (max_load > tmp) | 2563 | if (max_load > tmp) |
2516 | pwr_move += busiest->cpu_power * | 2564 | pwr_move += busiest->__cpu_power * |
2517 | min(busiest_load_per_task, max_load - tmp); | 2565 | min(busiest_load_per_task, max_load - tmp); |
2518 | 2566 | ||
2519 | /* Amount of load we'd add */ | 2567 | /* Amount of load we'd add */ |
2520 | if (max_load * busiest->cpu_power < | 2568 | if (max_load * busiest->__cpu_power < |
2521 | busiest_load_per_task * SCHED_LOAD_SCALE) | 2569 | busiest_load_per_task * SCHED_LOAD_SCALE) |
2522 | tmp = max_load * busiest->cpu_power / this->cpu_power; | 2570 | tmp = sg_div_cpu_power(this, |
2571 | max_load * busiest->__cpu_power); | ||
2523 | else | 2572 | else |
2524 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / | 2573 | tmp = sg_div_cpu_power(this, |
2525 | this->cpu_power; | 2574 | busiest_load_per_task * SCHED_LOAD_SCALE); |
2526 | pwr_move += this->cpu_power * | 2575 | pwr_move += this->__cpu_power * |
2527 | min(this_load_per_task, this_load + tmp); | 2576 | min(this_load_per_task, this_load + tmp); |
2528 | pwr_move /= SCHED_LOAD_SCALE; | 2577 | pwr_move /= SCHED_LOAD_SCALE; |
2529 | 2578 | ||
2530 | /* Move if we gain throughput */ | 2579 | /* Move if we gain throughput */ |
@@ -2657,6 +2706,12 @@ redo: | |||
2657 | double_rq_unlock(this_rq, busiest); | 2706 | double_rq_unlock(this_rq, busiest); |
2658 | local_irq_restore(flags); | 2707 | local_irq_restore(flags); |
2659 | 2708 | ||
2709 | /* | ||
2710 | * some other cpu did the load balance for us. | ||
2711 | */ | ||
2712 | if (nr_moved && this_cpu != smp_processor_id()) | ||
2713 | resched_cpu(this_cpu); | ||
2714 | |||
2660 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2715 | /* All tasks on this runqueue were pinned by CPU affinity */ |
2661 | if (unlikely(all_pinned)) { | 2716 | if (unlikely(all_pinned)) { |
2662 | cpu_clear(cpu_of(busiest), cpus); | 2717 | cpu_clear(cpu_of(busiest), cpus); |
@@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq) | |||
2927 | } | 2982 | } |
2928 | } | 2983 | } |
2929 | 2984 | ||
2985 | #ifdef CONFIG_NO_HZ | ||
2986 | static struct { | ||
2987 | atomic_t load_balancer; | ||
2988 | cpumask_t cpu_mask; | ||
2989 | } nohz ____cacheline_aligned = { | ||
2990 | .load_balancer = ATOMIC_INIT(-1), | ||
2991 | .cpu_mask = CPU_MASK_NONE, | ||
2992 | }; | ||
2993 | |||
2930 | /* | 2994 | /* |
2931 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 2995 | * This routine will try to nominate the ilb (idle load balancing) |
2996 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
2997 | * load balancing on behalf of all those cpus. If all the cpus in the system | ||
2998 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
2999 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
3000 | * arrives... | ||
2932 | * | 3001 | * |
3002 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
3003 | * for idle load balancing. ilb owner will still be part of | ||
3004 | * nohz.cpu_mask. | ||
3005 | * | ||
3006 | * While stopping the tick, this cpu will become the ilb owner if there | ||
3007 | * is no other owner, and will remain the owner till this cpu becomes busy | ||
3008 | * or till all cpus in the system stop their ticks, at which point | ||
3009 | * there is no need for ilb owner. | ||
3010 | * | ||
3011 | * When the ilb owner becomes busy, it nominates another owner, during the | ||
3012 | * next busy scheduler_tick() | ||
3013 | */ | ||
3014 | int select_nohz_load_balancer(int stop_tick) | ||
3015 | { | ||
3016 | int cpu = smp_processor_id(); | ||
3017 | |||
3018 | if (stop_tick) { | ||
3019 | cpu_set(cpu, nohz.cpu_mask); | ||
3020 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
3021 | |||
3022 | /* | ||
3023 | * If we are going offline and still the leader, give up! | ||
3024 | */ | ||
3025 | if (cpu_is_offline(cpu) && | ||
3026 | atomic_read(&nohz.load_balancer) == cpu) { | ||
3027 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
3028 | BUG(); | ||
3029 | return 0; | ||
3030 | } | ||
3031 | |||
3032 | /* time for ilb owner also to sleep */ | ||
3033 | if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3034 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
3035 | atomic_set(&nohz.load_balancer, -1); | ||
3036 | return 0; | ||
3037 | } | ||
3038 | |||
3039 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3040 | /* make me the ilb owner */ | ||
3041 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
3042 | return 1; | ||
3043 | } else if (atomic_read(&nohz.load_balancer) == cpu) | ||
3044 | return 1; | ||
3045 | } else { | ||
3046 | if (!cpu_isset(cpu, nohz.cpu_mask)) | ||
3047 | return 0; | ||
3048 | |||
3049 | cpu_clear(cpu, nohz.cpu_mask); | ||
3050 | |||
3051 | if (atomic_read(&nohz.load_balancer) == cpu) | ||
3052 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | ||
3053 | BUG(); | ||
3054 | } | ||
3055 | return 0; | ||
3056 | } | ||
3057 | #endif | ||
3058 | |||
3059 | static DEFINE_SPINLOCK(balancing); | ||
3060 | |||
3061 | /* | ||
2933 | * It checks each scheduling domain to see if it is due to be balanced, | 3062 | * It checks each scheduling domain to see if it is due to be balanced, |
2934 | * and initiates a balancing operation if so. | 3063 | * and initiates a balancing operation if so. |
2935 | * | 3064 | * |
2936 | * Balancing parameters are set up in arch_init_sched_domains. | 3065 | * Balancing parameters are set up in arch_init_sched_domains. |
2937 | */ | 3066 | */ |
2938 | static DEFINE_SPINLOCK(balancing); | 3067 | static inline void rebalance_domains(int cpu, enum idle_type idle) |
2939 | |||
2940 | static void run_rebalance_domains(struct softirq_action *h) | ||
2941 | { | 3068 | { |
2942 | int this_cpu = smp_processor_id(), balance = 1; | 3069 | int balance = 1; |
2943 | struct rq *this_rq = cpu_rq(this_cpu); | 3070 | struct rq *rq = cpu_rq(cpu); |
2944 | unsigned long interval; | 3071 | unsigned long interval; |
2945 | struct sched_domain *sd; | 3072 | struct sched_domain *sd; |
2946 | /* | 3073 | /* Earliest time when we have to do rebalance again */ |
2947 | * We are idle if there are no processes running. This | ||
2948 | * is valid even if we are the idle process (SMT). | ||
2949 | */ | ||
2950 | enum idle_type idle = !this_rq->nr_running ? | ||
2951 | SCHED_IDLE : NOT_IDLE; | ||
2952 | /* Earliest time when we have to call run_rebalance_domains again */ | ||
2953 | unsigned long next_balance = jiffies + 60*HZ; | 3074 | unsigned long next_balance = jiffies + 60*HZ; |
2954 | 3075 | ||
2955 | for_each_domain(this_cpu, sd) { | 3076 | for_each_domain(cpu, sd) { |
2956 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3077 | if (!(sd->flags & SD_LOAD_BALANCE)) |
2957 | continue; | 3078 | continue; |
2958 | 3079 | ||
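The core of select_nohz_load_balancer() above is a small lock-free election: nohz.load_balancer holds -1 when nobody owns idle load balancing, and a cpu claims or releases the role with atomic_cmpxchg(). A compact userspace sketch of that claim/release protocol using C11 atomics (the -1 sentinel mirrors the patch; the function names and single-threaded driver are illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);  /* -1: no ilb owner */

/* claim ownership iff nobody owns it yet; returns 1 if we are the owner */
static int claim_ilb(int cpu)
{
        int expected = -1;

        if (atomic_compare_exchange_strong(&load_balancer, &expected, cpu))
                return 1;
        return atomic_load(&load_balancer) == cpu;      /* already ours? */
}

/* give up ownership when this cpu gets busy again */
static void release_ilb(int cpu)
{
        int expected = cpu;

        atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
        printf("cpu0 claims: %d\n", claim_ilb(0));      /* 1: cpu0 becomes owner */
        printf("cpu1 claims: %d\n", claim_ilb(1));      /* 0: already owned by cpu0 */
        release_ilb(0);
        printf("cpu1 claims: %d\n", claim_ilb(1));      /* 1: cpu1 takes over */
        return 0;
}

Because the claim is a compare-and-swap against -1, two cpus racing to become the owner cannot both win, which is the property the BUG() checks in the kernel version lean on.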
@@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
2971 | } | 3092 | } |
2972 | 3093 | ||
2973 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 3094 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
2974 | if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { | 3095 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
2975 | /* | 3096 | /* |
2976 | * We've pulled tasks over so either we're no | 3097 | * We've pulled tasks over so either we're no |
2977 | * longer idle, or one of our SMT siblings is | 3098 | * longer idle, or one of our SMT siblings is |
@@ -2995,7 +3116,114 @@ out: | |||
2995 | if (!balance) | 3116 | if (!balance) |
2996 | break; | 3117 | break; |
2997 | } | 3118 | } |
2998 | this_rq->next_balance = next_balance; | 3119 | rq->next_balance = next_balance; |
3120 | } | ||
3121 | |||
3122 | /* | ||
3123 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
3124 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
3125 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
3126 | */ | ||
3127 | static void run_rebalance_domains(struct softirq_action *h) | ||
3128 | { | ||
3129 | int local_cpu = smp_processor_id(); | ||
3130 | struct rq *local_rq = cpu_rq(local_cpu); | ||
3131 | enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; | ||
3132 | |||
3133 | rebalance_domains(local_cpu, idle); | ||
3134 | |||
3135 | #ifdef CONFIG_NO_HZ | ||
3136 | /* | ||
3137 | * If this cpu is the owner for idle load balancing, then do the | ||
3138 | * balancing on behalf of the other idle cpus whose ticks are | ||
3139 | * stopped. | ||
3140 | */ | ||
3141 | if (local_rq->idle_at_tick && | ||
3142 | atomic_read(&nohz.load_balancer) == local_cpu) { | ||
3143 | cpumask_t cpus = nohz.cpu_mask; | ||
3144 | struct rq *rq; | ||
3145 | int balance_cpu; | ||
3146 | |||
3147 | cpu_clear(local_cpu, cpus); | ||
3148 | for_each_cpu_mask(balance_cpu, cpus) { | ||
3149 | /* | ||
3150 | * If this cpu gets work to do, stop the load balancing | ||
3151 | * work being done for other cpus. Next load | ||
3152 | * balancing owner will pick it up. | ||
3153 | */ | ||
3154 | if (need_resched()) | ||
3155 | break; | ||
3156 | |||
3157 | rebalance_domains(balance_cpu, SCHED_IDLE); | ||
3158 | |||
3159 | rq = cpu_rq(balance_cpu); | ||
3160 | if (time_after(local_rq->next_balance, rq->next_balance)) | ||
3161 | local_rq->next_balance = rq->next_balance; | ||
3162 | } | ||
3163 | } | ||
3164 | #endif | ||
3165 | } | ||
3166 | |||
3167 | /* | ||
3168 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | ||
3169 | * | ||
3170 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
3171 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
3172 | * if the whole system is idle. | ||
3173 | */ | ||
3174 | static inline void trigger_load_balance(int cpu) | ||
3175 | { | ||
3176 | struct rq *rq = cpu_rq(cpu); | ||
3177 | #ifdef CONFIG_NO_HZ | ||
3178 | /* | ||
3179 | * If we were in the nohz mode recently and busy at the current | ||
3180 | * scheduler tick, then check if we need to nominate new idle | ||
3181 | * load balancer. | ||
3182 | */ | ||
3183 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
3184 | rq->in_nohz_recently = 0; | ||
3185 | |||
3186 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3187 | cpu_clear(cpu, nohz.cpu_mask); | ||
3188 | atomic_set(&nohz.load_balancer, -1); | ||
3189 | } | ||
3190 | |||
3191 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3192 | /* | ||
3193 | * simple selection for now: Nominate the | ||
3194 | * first cpu in the nohz list to be the next | ||
3195 | * ilb owner. | ||
3196 | * | ||
3197 | * TBD: Traverse the sched domains and nominate | ||
3198 | * the nearest cpu in the nohz.cpu_mask. | ||
3199 | */ | ||
3200 | int ilb = first_cpu(nohz.cpu_mask); | ||
3201 | |||
3202 | if (ilb != NR_CPUS) | ||
3203 | resched_cpu(ilb); | ||
3204 | } | ||
3205 | } | ||
3206 | |||
3207 | /* | ||
3208 | * If this cpu is idle and doing idle load balancing for all the | ||
3209 | * cpus with ticks stopped, is it time for that to stop? | ||
3210 | */ | ||
3211 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
3212 | cpus_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3213 | resched_cpu(cpu); | ||
3214 | return; | ||
3215 | } | ||
3216 | |||
3217 | /* | ||
3218 | * If this cpu is idle and the idle load balancing is done by | ||
3219 | * someone else, then there is no need to raise the SCHED_SOFTIRQ. | ||
3220 | */ | ||
3221 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
3222 | cpu_isset(cpu, nohz.cpu_mask)) | ||
3223 | return; | ||
3224 | #endif | ||
3225 | if (time_after_eq(jiffies, rq->next_balance)) | ||
3226 | raise_softirq(SCHED_SOFTIRQ); | ||
2999 | } | 3227 | } |
3000 | #else | 3228 | #else |
3001 | /* | 3229 | /* |
@@ -3218,16 +3446,17 @@ void scheduler_tick(void) | |||
3218 | unsigned long long now = sched_clock(); | 3446 | unsigned long long now = sched_clock(); |
3219 | struct task_struct *p = current; | 3447 | struct task_struct *p = current; |
3220 | int cpu = smp_processor_id(); | 3448 | int cpu = smp_processor_id(); |
3449 | int idle_at_tick = idle_cpu(cpu); | ||
3221 | struct rq *rq = cpu_rq(cpu); | 3450 | struct rq *rq = cpu_rq(cpu); |
3222 | 3451 | ||
3223 | update_cpu_clock(p, rq, now); | 3452 | update_cpu_clock(p, rq, now); |
3224 | 3453 | ||
3225 | if (p != rq->idle) | 3454 | if (!idle_at_tick) |
3226 | task_running_tick(rq, p); | 3455 | task_running_tick(rq, p); |
3227 | #ifdef CONFIG_SMP | 3456 | #ifdef CONFIG_SMP |
3228 | update_load(rq); | 3457 | update_load(rq); |
3229 | if (time_after_eq(jiffies, rq->next_balance)) | 3458 | rq->idle_at_tick = idle_at_tick; |
3230 | raise_softirq(SCHED_SOFTIRQ); | 3459 | trigger_load_balance(cpu); |
3231 | #endif | 3460 | #endif |
3232 | } | 3461 | } |
3233 | 3462 | ||
@@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3847 | struct prio_array *array; | 4076 | struct prio_array *array; |
3848 | unsigned long flags; | 4077 | unsigned long flags; |
3849 | struct rq *rq; | 4078 | struct rq *rq; |
3850 | int oldprio; | 4079 | int delta; |
3851 | 4080 | ||
3852 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4081 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
3853 | 4082 | ||
3854 | rq = task_rq_lock(p, &flags); | 4083 | rq = task_rq_lock(p, &flags); |
3855 | 4084 | ||
3856 | oldprio = p->prio; | 4085 | delta = prio - p->prio; |
3857 | array = p->array; | 4086 | array = p->array; |
3858 | if (array) | 4087 | if (array) |
3859 | dequeue_task(p, array); | 4088 | dequeue_task(p, array); |
@@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3869 | enqueue_task(p, array); | 4098 | enqueue_task(p, array); |
3870 | /* | 4099 | /* |
3871 | * Reschedule if we are currently running on this runqueue and | 4100 | * Reschedule if we are currently running on this runqueue and |
3872 | * our priority decreased, or if we are not currently running on | 4101 | * our priority decreased, or if our priority became higher |
3873 | * this runqueue and our priority is higher than the current's | 4102 | * than the current's. |
3874 | */ | 4103 | */ |
3875 | if (task_running(rq, p)) { | 4104 | if (TASK_PREEMPTS_CURR(p, rq) || |
3876 | if (p->prio > oldprio) | 4105 | (delta > 0 && task_running(rq, p))) |
3877 | resched_task(rq->curr); | ||
3878 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
3879 | resched_task(rq->curr); | 4106 | resched_task(rq->curr); |
3880 | } | 4107 | } |
3881 | task_rq_unlock(rq, &flags); | 4108 | task_rq_unlock(rq, &flags); |
@@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3923 | enqueue_task(p, array); | 4150 | enqueue_task(p, array); |
3924 | inc_raw_weighted_load(rq, p); | 4151 | inc_raw_weighted_load(rq, p); |
3925 | /* | 4152 | /* |
3926 | * If the task increased its priority or is running and | 4153 | * Reschedule if we are currently running on this runqueue and |
3927 | * lowered its priority, then reschedule its CPU: | 4154 | * our priority decreased, or if our priority became higher |
4155 | * than the current's. | ||
3928 | */ | 4156 | */ |
3929 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 4157 | if (TASK_PREEMPTS_CURR(p, rq) || |
4158 | (delta > 0 && task_running(rq, p))) | ||
3930 | resched_task(rq->curr); | 4159 | resched_task(rq->curr); |
3931 | } | 4160 | } |
3932 | out_unlock: | 4161 | out_unlock: |
@@ -4153,13 +4382,11 @@ recheck: | |||
4153 | __activate_task(p, rq); | 4382 | __activate_task(p, rq); |
4154 | /* | 4383 | /* |
4155 | * Reschedule if we are currently running on this runqueue and | 4384 | * Reschedule if we are currently running on this runqueue and |
4156 | * our priority decreased, or if we are not currently running on | 4385 | * our priority decreased, or our priority became higher |
4157 | * this runqueue and our priority is higher than the current's | 4386 | * than the current's. |
4158 | */ | 4387 | */ |
4159 | if (task_running(rq, p)) { | 4388 | if (TASK_PREEMPTS_CURR(p, rq) || |
4160 | if (p->prio > oldprio) | 4389 | (task_running(rq, p) && p->prio > oldprio)) |
4161 | resched_task(rq->curr); | ||
4162 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
4163 | resched_task(rq->curr); | 4390 | resched_task(rq->curr); |
4164 | } | 4391 | } |
4165 | __task_rq_unlock(rq); | 4392 | __task_rq_unlock(rq); |
@@ -4750,6 +4977,8 @@ void show_state_filter(unsigned long state_filter) | |||
4750 | show_task(p); | 4977 | show_task(p); |
4751 | } while_each_thread(g, p); | 4978 | } while_each_thread(g, p); |
4752 | 4979 | ||
4980 | touch_all_softlockup_watchdogs(); | ||
4981 | |||
4753 | read_unlock(&tasklist_lock); | 4982 | read_unlock(&tasklist_lock); |
4754 | /* | 4983 | /* |
4755 | * Only show locks if all tasks are dumped: | 4984 | * Only show locks if all tasks are dumped: |
@@ -5304,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5304 | break; | 5533 | break; |
5305 | } | 5534 | } |
5306 | 5535 | ||
5307 | if (!group->cpu_power) { | 5536 | if (!group->__cpu_power) { |
5308 | printk("\n"); | 5537 | printk("\n"); |
5309 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5538 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5310 | "set\n"); | 5539 | "set\n"); |
@@ -5481,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | |||
5481 | continue; | 5710 | continue; |
5482 | 5711 | ||
5483 | sg->cpumask = CPU_MASK_NONE; | 5712 | sg->cpumask = CPU_MASK_NONE; |
5484 | sg->cpu_power = 0; | 5713 | sg->__cpu_power = 0; |
5485 | 5714 | ||
5486 | for_each_cpu_mask(j, span) { | 5715 | for_each_cpu_mask(j, span) { |
5487 | if (group_fn(j, cpu_map, NULL) != group) | 5716 | if (group_fn(j, cpu_map, NULL) != group) |
@@ -6170,7 +6399,7 @@ next_sg: | |||
6170 | continue; | 6399 | continue; |
6171 | } | 6400 | } |
6172 | 6401 | ||
6173 | sg->cpu_power += sd->groups->cpu_power; | 6402 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); |
6174 | } | 6403 | } |
6175 | sg = sg->next; | 6404 | sg = sg->next; |
6176 | if (sg != group_head) | 6405 | if (sg != group_head) |
@@ -6245,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6245 | 6474 | ||
6246 | child = sd->child; | 6475 | child = sd->child; |
6247 | 6476 | ||
6477 | sd->groups->__cpu_power = 0; | ||
6478 | |||
6248 | /* | 6479 | /* |
6249 | * For perf policy, if the groups in child domain share resources | 6480 | * For perf policy, if the groups in child domain share resources |
6250 | * (for example cores sharing some portions of the cache hierarchy | 6481 | * (for example cores sharing some portions of the cache hierarchy |
@@ -6255,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6255 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 6486 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && |
6256 | (child->flags & | 6487 | (child->flags & |
6257 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 6488 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { |
6258 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6489 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); |
6259 | return; | 6490 | return; |
6260 | } | 6491 | } |
6261 | 6492 | ||
6262 | sd->groups->cpu_power = 0; | ||
6263 | |||
6264 | /* | 6493 | /* |
6265 | * add cpu_power of each child group to this groups cpu_power | 6494 | * add cpu_power of each child group to this groups cpu_power |
6266 | */ | 6495 | */ |
6267 | group = child->groups; | 6496 | group = child->groups; |
6268 | do { | 6497 | do { |
6269 | sd->groups->cpu_power += group->cpu_power; | 6498 | sg_inc_cpu_power(sd->groups, group->__cpu_power); |
6270 | group = group->next; | 6499 | group = group->next; |
6271 | } while (group != child->groups); | 6500 | } while (group != child->groups); |
6272 | } | 6501 | } |
@@ -6426,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6426 | sd = &per_cpu(node_domains, j); | 6655 | sd = &per_cpu(node_domains, j); |
6427 | sd->groups = sg; | 6656 | sd->groups = sg; |
6428 | } | 6657 | } |
6429 | sg->cpu_power = 0; | 6658 | sg->__cpu_power = 0; |
6430 | sg->cpumask = nodemask; | 6659 | sg->cpumask = nodemask; |
6431 | sg->next = sg; | 6660 | sg->next = sg; |
6432 | cpus_or(covered, covered, nodemask); | 6661 | cpus_or(covered, covered, nodemask); |
@@ -6454,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6454 | "Can not alloc domain group for node %d\n", j); | 6683 | "Can not alloc domain group for node %d\n", j); |
6455 | goto error; | 6684 | goto error; |
6456 | } | 6685 | } |
6457 | sg->cpu_power = 0; | 6686 | sg->__cpu_power = 0; |
6458 | sg->cpumask = tmp; | 6687 | sg->cpumask = tmp; |
6459 | sg->next = prev->next; | 6688 | sg->next = prev->next; |
6460 | cpus_or(covered, covered, tmp); | 6689 | cpus_or(covered, covered, tmp); |
diff --git a/kernel/signal.c b/kernel/signal.c index 2b4087d545a3..1368e67c8482 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -12,7 +12,6 @@ | |||
12 | 12 | ||
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/smp_lock.h> | ||
16 | #include <linux/init.h> | 15 | #include <linux/init.h> |
17 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
18 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 50afeb813305..8fa7040247ad 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -34,12 +34,32 @@ static struct notifier_block panic_block = { | |||
34 | .notifier_call = softlock_panic, | 34 | .notifier_call = softlock_panic, |
35 | }; | 35 | }; |
36 | 36 | ||
37 | /* | ||
38 | * Returns seconds, approximately. We don't need nanosecond | ||
39 | * resolution, and we don't need to waste time with a big divide when | ||
40 | * 2^30ns == 1.074s. | ||
41 | */ | ||
42 | static unsigned long get_timestamp(void) | ||
43 | { | ||
44 | return sched_clock() >> 30; /* 2^30 ~= 10^9 */ | ||
45 | } | ||
46 | |||
37 | void touch_softlockup_watchdog(void) | 47 | void touch_softlockup_watchdog(void) |
38 | { | 48 | { |
39 | __raw_get_cpu_var(touch_timestamp) = jiffies; | 49 | __raw_get_cpu_var(touch_timestamp) = get_timestamp(); |
40 | } | 50 | } |
41 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 51 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
42 | 52 | ||
53 | void touch_all_softlockup_watchdogs(void) | ||
54 | { | ||
55 | int cpu; | ||
56 | |||
57 | /* Cause each CPU to re-update its timestamp rather than complain */ | ||
58 | for_each_online_cpu(cpu) | ||
59 | per_cpu(touch_timestamp, cpu) = 0; | ||
60 | } | ||
61 | EXPORT_SYMBOL(touch_all_softlockup_watchdogs); | ||
62 | |||
43 | /* | 63 | /* |
44 | * This callback runs from the timer interrupt, and checks | 64 | * This callback runs from the timer interrupt, and checks |
45 | * whether the watchdog thread has hung or not: | 65 | * whether the watchdog thread has hung or not: |
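A note on the get_timestamp() helper added above: shifting sched_clock()'s nanosecond value right by 30 gives units of 2^30 ns (about 1.074 s), close enough to seconds for a 10-second lockup threshold while avoiding a 64-bit divide on every tick. A quick userspace check of the approximation, with arbitrary sample clock values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* pretend sched_clock() readings, in nanoseconds */
        uint64_t samples[] = { 1000000000ULL,           /*  1 s */
                               10000000000ULL,          /* 10 s */
                               3600000000000ULL };      /*  1 h */

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                uint64_t ns = samples[i];

                printf("%llu ns: >>30 = %llu, /1e9 = %llu\n",
                       (unsigned long long)ns,
                       (unsigned long long)(ns >> 30),
                       (unsigned long long)(ns / 1000000000ULL));
        }
        return 0;
}

The third sample shows the systematic ~7% undercount, which does not matter here because both the touch and the check use the same units.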
@@ -48,9 +68,18 @@ void softlockup_tick(void) | |||
48 | { | 68 | { |
49 | int this_cpu = smp_processor_id(); | 69 | int this_cpu = smp_processor_id(); |
50 | unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); | 70 | unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); |
71 | unsigned long print_timestamp; | ||
72 | unsigned long now; | ||
73 | |||
74 | if (touch_timestamp == 0) { | ||
75 | touch_softlockup_watchdog(); | ||
76 | return; | ||
77 | } | ||
78 | |||
79 | print_timestamp = per_cpu(print_timestamp, this_cpu); | ||
51 | 80 | ||
52 | /* prevent double reports: */ | 81 | /* report at most once a second */ |
53 | if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || | 82 | if (print_timestamp < (touch_timestamp + 1) || |
54 | did_panic || | 83 | did_panic || |
55 | !per_cpu(watchdog_task, this_cpu)) | 84 | !per_cpu(watchdog_task, this_cpu)) |
56 | return; | 85 | return; |
@@ -61,12 +90,14 @@ void softlockup_tick(void) | |||
61 | return; | 90 | return; |
62 | } | 91 | } |
63 | 92 | ||
93 | now = get_timestamp(); | ||
94 | |||
64 | /* Wake up the high-prio watchdog task every second: */ | 95 | /* Wake up the high-prio watchdog task every second: */ |
65 | if (time_after(jiffies, touch_timestamp + HZ)) | 96 | if (now > (touch_timestamp + 1)) |
66 | wake_up_process(per_cpu(watchdog_task, this_cpu)); | 97 | wake_up_process(per_cpu(watchdog_task, this_cpu)); |
67 | 98 | ||
68 | /* Warn about unreasonable 10+ seconds delays: */ | 99 | /* Warn about unreasonable 10+ seconds delays: */ |
69 | if (time_after(jiffies, touch_timestamp + 10*HZ)) { | 100 | if (now > (touch_timestamp + 10)) { |
70 | per_cpu(print_timestamp, this_cpu) = touch_timestamp; | 101 | per_cpu(print_timestamp, this_cpu) = touch_timestamp; |
71 | 102 | ||
72 | spin_lock(&print_lock); | 103 | spin_lock(&print_lock); |
@@ -82,11 +113,14 @@ void softlockup_tick(void) | |||
82 | */ | 113 | */ |
83 | static int watchdog(void * __bind_cpu) | 114 | static int watchdog(void * __bind_cpu) |
84 | { | 115 | { |
85 | struct sched_param param = { .sched_priority = 99 }; | 116 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
86 | 117 | ||
87 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 118 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
88 | current->flags |= PF_NOFREEZE; | 119 | current->flags |= PF_NOFREEZE; |
89 | 120 | ||
121 | /* initialize timestamp */ | ||
122 | touch_softlockup_watchdog(); | ||
123 | |||
90 | /* | 124 | /* |
91 | * Run briefly once per second to reset the softlockup timestamp. | 125 | * Run briefly once per second to reset the softlockup timestamp. |
92 | * If this gets delayed for more than 10 seconds then the | 126 | * If this gets delayed for more than 10 seconds then the |
@@ -118,7 +152,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
118 | printk("watchdog for %i failed\n", hotcpu); | 152 | printk("watchdog for %i failed\n", hotcpu); |
119 | return NOTIFY_BAD; | 153 | return NOTIFY_BAD; |
120 | } | 154 | } |
121 | per_cpu(touch_timestamp, hotcpu) = jiffies; | 155 | per_cpu(touch_timestamp, hotcpu) = 0; |
122 | per_cpu(watchdog_task, hotcpu) = p; | 156 | per_cpu(watchdog_task, hotcpu) = p; |
123 | kthread_bind(p, hotcpu); | 157 | kthread_bind(p, hotcpu); |
124 | break; | 158 | break; |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12458040e665..daabb74ee0bc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -1,11 +1,12 @@ | |||
1 | /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. | 1 | /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. |
2 | * GPL v2 and any later version. | 2 | * GPL v2 and any later version. |
3 | */ | 3 | */ |
4 | #include <linux/stop_machine.h> | ||
5 | #include <linux/kthread.h> | ||
6 | #include <linux/sched.h> | ||
7 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> |
8 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/kthread.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/stop_machine.h> | ||
9 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
10 | #include <asm/atomic.h> | 11 | #include <asm/atomic.h> |
11 | #include <asm/semaphore.h> | 12 | #include <asm/semaphore.h> |
@@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | |||
208 | 209 | ||
209 | return ret; | 210 | return ret; |
210 | } | 211 | } |
212 | EXPORT_SYMBOL_GPL(stop_machine_run); | ||
diff --git a/kernel/sys.c b/kernel/sys.c index fe1f3ab20477..926bf9d7ac45 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | |||
1923 | if (retval) | 1923 | if (retval) |
1924 | return retval; | 1924 | return retval; |
1925 | 1925 | ||
1926 | if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { | ||
1927 | /* | ||
1928 | * The caller is asking for an immediate RLIMIT_CPU | ||
1929 | * expiry. But we use the zero value to mean "it was | ||
1930 | * never set". So let's cheat and make it one second | ||
1931 | * instead | ||
1932 | */ | ||
1933 | new_rlim.rlim_cur = 1; | ||
1934 | } | ||
1935 | |||
1926 | task_lock(current->group_leader); | 1936 | task_lock(current->group_leader); |
1927 | *old_rlim = new_rlim; | 1937 | *old_rlim = new_rlim; |
1928 | task_unlock(current->group_leader); | 1938 | task_unlock(current->group_leader); |
@@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) | |||
1944 | unsigned long rlim_cur = new_rlim.rlim_cur; | 1954 | unsigned long rlim_cur = new_rlim.rlim_cur; |
1945 | cputime_t cputime; | 1955 | cputime_t cputime; |
1946 | 1956 | ||
1947 | if (rlim_cur == 0) { | ||
1948 | /* | ||
1949 | * The caller is asking for an immediate RLIMIT_CPU | ||
1950 | * expiry. But we use the zero value to mean "it was | ||
1951 | * never set". So let's cheat and make it one second | ||
1952 | * instead | ||
1953 | */ | ||
1954 | rlim_cur = 1; | ||
1955 | } | ||
1956 | cputime = secs_to_cputime(rlim_cur); | 1957 | cputime = secs_to_cputime(rlim_cur); |
1957 | read_lock(&tasklist_lock); | 1958 | read_lock(&tasklist_lock); |
1958 | spin_lock_irq(¤t->sighand->siglock); | 1959 | spin_lock_irq(¤t->sighand->siglock); |
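The sys_setrlimit() change above moves the RLIMIT_CPU zero-means-one-second fixup to before the limit is stored, so the substituted value is also what a later getrlimit() reports. A small userspace probe of that behaviour; it only lowers the soft limit of the calling process, and the printed value of 1 assumes a kernel carrying this change:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_CPU, &rl) != 0)
                return 1;

        rl.rlim_cur = 0;                /* ask for "immediate" CPU-time expiry */
        if (setrlimit(RLIMIT_CPU, &rl) != 0)
                return 1;

        if (getrlimit(RLIMIT_CPU, &rl) != 0)
                return 1;

        /* with this patch the kernel stores 1 (second) instead of 0 */
        printf("RLIMIT_CPU soft limit now: %llu\n",
               (unsigned long long)rl.rlim_cur);
        return 0;
}

After this runs the process has roughly one second of CPU time before SIGXCPU, which is why the probe exits immediately.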
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c904748f2290..f0664bd5011c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max; | |||
76 | extern int sysctl_drop_caches; | 76 | extern int sysctl_drop_caches; |
77 | extern int percpu_pagelist_fraction; | 77 | extern int percpu_pagelist_fraction; |
78 | extern int compat_log; | 78 | extern int compat_log; |
79 | extern int maps_protect; | ||
79 | 80 | ||
80 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 81 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
81 | static int maxolduid = 65535; | 82 | static int maxolduid = 65535; |
@@ -603,6 +604,16 @@ static ctl_table kern_table[] = { | |||
603 | .proc_handler = &proc_dointvec, | 604 | .proc_handler = &proc_dointvec, |
604 | }, | 605 | }, |
605 | #endif | 606 | #endif |
607 | #ifdef CONFIG_PROC_FS | ||
608 | { | ||
609 | .ctl_name = CTL_UNNUMBERED, | ||
610 | .procname = "maps_protect", | ||
611 | .data = &maps_protect, | ||
612 | .maxlen = sizeof(int), | ||
613 | .mode = 0644, | ||
614 | .proc_handler = &proc_dointvec, | ||
615 | }, | ||
616 | #endif | ||
606 | 617 | ||
607 | { .ctl_name = 0 } | 618 | { .ctl_name = 0 } |
608 | }; | 619 | }; |
diff --git a/kernel/time.c b/kernel/time.c index ba18ec4899bd..f04791f69408 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/timex.h> | 31 | #include <linux/timex.h> |
32 | #include <linux/capability.h> | 32 | #include <linux/capability.h> |
33 | #include <linux/errno.h> | 33 | #include <linux/errno.h> |
34 | #include <linux/smp_lock.h> | ||
35 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
36 | #include <linux/security.h> | 35 | #include <linux/security.h> |
37 | #include <linux/fs.h> | 36 | #include <linux/fs.h> |
@@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb) | |||
247 | } | 246 | } |
248 | EXPORT_SYMBOL(current_fs_time); | 247 | EXPORT_SYMBOL(current_fs_time); |
249 | 248 | ||
249 | /* | ||
250 | * Convert jiffies to milliseconds and back. | ||
251 | * | ||
252 | * Avoid unnecessary multiplications/divisions in the | ||
253 | * two most common HZ cases: | ||
254 | */ | ||
255 | unsigned int inline jiffies_to_msecs(const unsigned long j) | ||
256 | { | ||
257 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | ||
258 | return (MSEC_PER_SEC / HZ) * j; | ||
259 | #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) | ||
260 | return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); | ||
261 | #else | ||
262 | return (j * MSEC_PER_SEC) / HZ; | ||
263 | #endif | ||
264 | } | ||
265 | EXPORT_SYMBOL(jiffies_to_msecs); | ||
266 | |||
267 | unsigned int inline jiffies_to_usecs(const unsigned long j) | ||
268 | { | ||
269 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | ||
270 | return (USEC_PER_SEC / HZ) * j; | ||
271 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | ||
272 | return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); | ||
273 | #else | ||
274 | return (j * USEC_PER_SEC) / HZ; | ||
275 | #endif | ||
276 | } | ||
277 | EXPORT_SYMBOL(jiffies_to_usecs); | ||
278 | |||
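The jiffies_to_msecs()/jiffies_to_usecs() definitions moved above select one of three formulas at compile time depending on HZ. The userspace sketch below evaluates the same ladder at run time so the rounding of each branch is visible; the hz parameter merely stands in for the compile-time HZ constant:

#include <stdio.h>

#define MSEC_PER_SEC 1000UL

/* run-time version of the compile-time #if ladder in jiffies_to_msecs() */
static unsigned long to_msecs(unsigned long j, unsigned long hz)
{
        if (hz <= MSEC_PER_SEC && !(MSEC_PER_SEC % hz))
                return (MSEC_PER_SEC / hz) * j;
        else if (hz > MSEC_PER_SEC && !(hz % MSEC_PER_SEC))
                return (j + (hz / MSEC_PER_SEC) - 1) / (hz / MSEC_PER_SEC);
        else
                return (j * MSEC_PER_SEC) / hz;
}

int main(void)
{
        unsigned long hzs[] = { 100, 250, 1000, 1024 };

        for (unsigned i = 0; i < sizeof(hzs) / sizeof(hzs[0]); i++)
                printf("HZ=%-4lu: 3 jiffies = %lu ms\n",
                       hzs[i], to_msecs(3, hzs[i]));
        return 0;
}

With HZ=1024 neither exact branch applies, so 3 jiffies truncates to 2 ms, whereas the common HZ values map exactly (30, 12 and 3 ms respectively).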
250 | /** | 279 | /** |
251 | * timespec_trunc - Truncate timespec to a granularity | 280 | * timespec_trunc - Truncate timespec to a granularity |
252 | * @t: Timespec | 281 | * @t: Timespec |
@@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec) | |||
473 | EXPORT_SYMBOL(ns_to_timeval); | 502 | EXPORT_SYMBOL(ns_to_timeval); |
474 | 503 | ||
475 | /* | 504 | /* |
476 | * Convert jiffies to milliseconds and back. | ||
477 | * | ||
478 | * Avoid unnecessary multiplications/divisions in the | ||
479 | * two most common HZ cases: | ||
480 | */ | ||
481 | unsigned int jiffies_to_msecs(const unsigned long j) | ||
482 | { | ||
483 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | ||
484 | return (MSEC_PER_SEC / HZ) * j; | ||
485 | #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) | ||
486 | return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); | ||
487 | #else | ||
488 | return (j * MSEC_PER_SEC) / HZ; | ||
489 | #endif | ||
490 | } | ||
491 | EXPORT_SYMBOL(jiffies_to_msecs); | ||
492 | |||
493 | unsigned int jiffies_to_usecs(const unsigned long j) | ||
494 | { | ||
495 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | ||
496 | return (USEC_PER_SEC / HZ) * j; | ||
497 | #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) | ||
498 | return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); | ||
499 | #else | ||
500 | return (j * USEC_PER_SEC) / HZ; | ||
501 | #endif | ||
502 | } | ||
503 | EXPORT_SYMBOL(jiffies_to_usecs); | ||
504 | |||
505 | /* | ||
506 | * When we convert to jiffies then we interpret incoming values | 505 | * When we convert to jiffies then we interpret incoming values |
507 | * the following way: | 506 | * the following way: |
508 | * | 507 | * |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 93bccba1f265..99b6034fc86b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | obj-y += ntp.o clocksource.o jiffies.o timer_list.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
2 | 2 | ||
3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o | 3 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o |
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bfda3f7f0716..a96ec9ab3454 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); | |||
31 | */ | 31 | */ |
32 | ktime_t tick_next_period; | 32 | ktime_t tick_next_period; |
33 | ktime_t tick_period; | 33 | ktime_t tick_period; |
34 | static int tick_do_timer_cpu = -1; | 34 | int tick_do_timer_cpu __read_mostly = -1; |
35 | DEFINE_SPINLOCK(tick_device_lock); | 35 | DEFINE_SPINLOCK(tick_device_lock); |
36 | 36 | ||
37 | /* | 37 | /* |
@@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup) | |||
295 | clockevents_exchange_device(dev, NULL); | 295 | clockevents_exchange_device(dev, NULL); |
296 | td->evtdev = NULL; | 296 | td->evtdev = NULL; |
297 | } | 297 | } |
298 | /* Transfer the do_timer job away from this cpu */ | ||
299 | if (*cpup == tick_do_timer_cpu) { | ||
300 | int cpu = first_cpu(cpu_online_map); | ||
301 | |||
302 | tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; | ||
303 | } | ||
298 | spin_unlock_irqrestore(&tick_device_lock, flags); | 304 | spin_unlock_irqrestore(&tick_device_lock, flags); |
299 | } | 305 | } |
300 | 306 | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c9d203bde518..bb13f2724905 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | |||
5 | extern spinlock_t tick_device_lock; | 5 | extern spinlock_t tick_device_lock; |
6 | extern ktime_t tick_next_period; | 6 | extern ktime_t tick_next_period; |
7 | extern ktime_t tick_period; | 7 | extern ktime_t tick_period; |
8 | extern int tick_do_timer_cpu __read_mostly; | ||
8 | 9 | ||
9 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 10 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); |
10 | extern void tick_handle_periodic(struct clock_event_device *dev); | 11 | extern void tick_handle_periodic(struct clock_event_device *dev); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 51556b95f60f..3483e6cb9549 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void) | |||
217 | * the scheduler tick in nohz_restart_sched_tick. | 217 | * the scheduler tick in nohz_restart_sched_tick. |
218 | */ | 218 | */ |
219 | if (!ts->tick_stopped) { | 219 | if (!ts->tick_stopped) { |
220 | if (select_nohz_load_balancer(1)) { | ||
221 | /* | ||
222 | * sched tick not stopped! | ||
223 | */ | ||
224 | cpu_clear(cpu, nohz_cpu_mask); | ||
225 | goto out; | ||
226 | } | ||
227 | |||
220 | ts->idle_tick = ts->sched_timer.expires; | 228 | ts->idle_tick = ts->sched_timer.expires; |
221 | ts->tick_stopped = 1; | 229 | ts->tick_stopped = 1; |
222 | ts->idle_jiffies = last_jiffies; | 230 | ts->idle_jiffies = last_jiffies; |
223 | } | 231 | } |
232 | |||
233 | /* | ||
234 | * If this cpu is the one which updates jiffies, then | ||
235 | * give up the assignment and let it be taken by the | ||
236 | * cpu which runs the tick timer next, which might be | ||
237 | * this cpu as well. If we don't drop this here the | ||
238 | * jiffies might be stale and do_timer() never | ||
239 | * invoked. | ||
240 | */ | ||
241 | if (cpu == tick_do_timer_cpu) | ||
242 | tick_do_timer_cpu = -1; | ||
243 | |||
224 | /* | 244 | /* |
225 | * calculate the expiry time for the next timer wheel | 245 | * calculate the expiry time for the next timer wheel |
226 | * timer | 246 | * timer |
@@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void) | |||
273 | now = ktime_get(); | 293 | now = ktime_get(); |
274 | 294 | ||
275 | local_irq_disable(); | 295 | local_irq_disable(); |
296 | select_nohz_load_balancer(0); | ||
276 | tick_do_update_jiffies64(now); | 297 | tick_do_update_jiffies64(now); |
277 | cpu_clear(cpu, nohz_cpu_mask); | 298 | cpu_clear(cpu, nohz_cpu_mask); |
278 | 299 | ||
@@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev) | |||
338 | { | 359 | { |
339 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 360 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
340 | struct pt_regs *regs = get_irq_regs(); | 361 | struct pt_regs *regs = get_irq_regs(); |
362 | int cpu = smp_processor_id(); | ||
341 | ktime_t now = ktime_get(); | 363 | ktime_t now = ktime_get(); |
342 | 364 | ||
343 | dev->next_event.tv64 = KTIME_MAX; | 365 | dev->next_event.tv64 = KTIME_MAX; |
344 | 366 | ||
367 | /* | ||
368 | * Check if the do_timer duty was dropped. We don't care about | ||
369 | * concurrency: This happens only when the cpu in charge went | ||
370 | * into a long sleep. If two cpus happen to assign themselves to | ||
371 | * this duty, then the jiffies update is still serialized by | ||
372 | * xtime_lock. | ||
373 | */ | ||
374 | if (unlikely(tick_do_timer_cpu == -1)) | ||
375 | tick_do_timer_cpu = cpu; | ||
376 | |||
345 | /* Check, if the jiffies need an update */ | 377 | /* Check, if the jiffies need an update */ |
346 | tick_do_update_jiffies64(now); | 378 | if (tick_do_timer_cpu == cpu) |
379 | tick_do_update_jiffies64(now); | ||
347 | 380 | ||
348 | /* | 381 | /* |
349 | * When we are idle and the tick is stopped, we have to touch | 382 | * When we are idle and the tick is stopped, we have to touch |
@@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
431 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | 464 | struct hrtimer_cpu_base *base = timer->base->cpu_base; |
432 | struct pt_regs *regs = get_irq_regs(); | 465 | struct pt_regs *regs = get_irq_regs(); |
433 | ktime_t now = ktime_get(); | 466 | ktime_t now = ktime_get(); |
467 | int cpu = smp_processor_id(); | ||
468 | |||
469 | #ifdef CONFIG_NO_HZ | ||
470 | /* | ||
471 | * Check if the do_timer duty was dropped. We don't care about | ||
472 | * concurrency: This happens only when the cpu in charge went | ||
473 | * into a long sleep. If two cpus happen to assign themselves to | ||
474 | * this duty, then the jiffies update is still serialized by | ||
475 | * xtime_lock. | ||
476 | */ | ||
477 | if (unlikely(tick_do_timer_cpu == -1)) | ||
478 | tick_do_timer_cpu = cpu; | ||
479 | #endif | ||
434 | 480 | ||
435 | /* Check, if the jiffies need an update */ | 481 | /* Check, if the jiffies need an update */ |
436 | tick_do_update_jiffies64(now); | 482 | if (tick_do_timer_cpu == cpu) |
483 | tick_do_update_jiffies64(now); | ||
437 | 484 | ||
438 | /* | 485 | /* |
439 | * Do not call, when we are not in irq context and have | 486 | * Do not call, when we are not in irq context and have |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 000000000000..f9217bf644f6 --- /dev/null +++ b/kernel/time/timekeeping.c | |||
@@ -0,0 +1,476 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/timekeeping.c | ||
3 | * | ||
4 | * Kernel timekeeping code and accessor functions | ||
5 | * | ||
6 | * This code was moved from linux/kernel/timer.c. | ||
7 | * Please see that file for copyright and history logs. | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/interrupt.h> | ||
13 | #include <linux/percpu.h> | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/sysdev.h> | ||
17 | #include <linux/clocksource.h> | ||
18 | #include <linux/jiffies.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/tick.h> | ||
21 | |||
22 | |||
23 | /* | ||
24 | * This read-write spinlock protects us from races in SMP while | ||
25 | * playing with xtime and avenrun. | ||
26 | */ | ||
27 | __attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
28 | |||
29 | EXPORT_SYMBOL(xtime_lock); | ||
30 | |||
31 | |||
32 | /* | ||
33 | * The current time | ||
34 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | ||
35 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | ||
36 | * at zero at system boot time, so wall_to_monotonic will be negative, | ||
37 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | ||
38 | * the usual normalization. | ||
39 | */ | ||
40 | struct timespec xtime __attribute__ ((aligned (16))); | ||
41 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | ||
42 | |||
43 | EXPORT_SYMBOL(xtime); | ||
44 | |||
45 | |||
46 | static struct clocksource *clock; /* pointer to current clocksource */ | ||
47 | |||
48 | |||
49 | #ifdef CONFIG_GENERIC_TIME | ||
50 | /** | ||
51 | * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook | ||
52 | * | ||
53 | * private function, must hold xtime_lock lock when being | ||
54 | * called. Returns the number of nanoseconds since the | ||
55 | * last call to update_wall_time() (adjusted by NTP scaling) | ||
56 | */ | ||
57 | static inline s64 __get_nsec_offset(void) | ||
58 | { | ||
59 | cycle_t cycle_now, cycle_delta; | ||
60 | s64 ns_offset; | ||
61 | |||
62 | /* read clocksource: */ | ||
63 | cycle_now = clocksource_read(clock); | ||
64 | |||
65 | /* calculate the delta since the last update_wall_time: */ | ||
66 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | ||
67 | |||
68 | /* convert to nanoseconds: */ | ||
69 | ns_offset = cyc2ns(clock, cycle_delta); | ||
70 | |||
71 | return ns_offset; | ||
72 | } | ||
73 | |||
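__get_nsec_offset() above turns a clocksource cycle delta into nanoseconds via cyc2ns(), i.e. the usual fixed-point conversion ns = (cycles * mult) >> shift. A standalone illustration with an invented 1 MHz clocksource whose mult/shift are chosen to give 1000 ns per cycle; the mask step is omitted by assuming a free-running 64-bit counter:

#include <stdint.h>
#include <stdio.h>

/* cyc2ns(): fixed-point cycles -> nanoseconds conversion */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* hypothetical 1 MHz clocksource: 1000 ns per cycle */
        uint32_t shift = 22;
        uint32_t mult = 1000u << 22;

        uint64_t cycle_last = 1000000;  /* counter at last update_wall_time() */
        uint64_t cycle_now = 1000123;   /* counter read "now" */
        uint64_t delta = cycle_now - cycle_last;

        printf("offset = %llu ns\n",
               (unsigned long long)cyc2ns(delta, mult, shift));  /* 123000 ns */
        return 0;
}

Real clocksources pick the largest shift that keeps cycles * mult from overflowing 64 bits over the expected update interval.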
74 | /** | ||
75 | * __get_realtime_clock_ts - Returns the time of day in a timespec | ||
76 | * @ts: pointer to the timespec to be set | ||
77 | * | ||
78 | * Returns the time of day in a timespec. Used by | ||
79 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
80 | */ | ||
81 | static inline void __get_realtime_clock_ts(struct timespec *ts) | ||
82 | { | ||
83 | unsigned long seq; | ||
84 | s64 nsecs; | ||
85 | |||
86 | do { | ||
87 | seq = read_seqbegin(&xtime_lock); | ||
88 | |||
89 | *ts = xtime; | ||
90 | nsecs = __get_nsec_offset(); | ||
91 | |||
92 | } while (read_seqretry(&xtime_lock, seq)); | ||
93 | |||
94 | timespec_add_ns(ts, nsecs); | ||
95 | } | ||
96 | |||
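Once the seqlock read loop above has a consistent xtime snapshot and nanosecond offset, timespec_add_ns() folds the offset in and renormalizes so that 0 <= tv_nsec < NSEC_PER_SEC. A minimal standalone version of that normalization (the struct and loop are illustrative, not the kernel's exact implementation, and a non-negative offset is assumed):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { long tv_sec; long tv_nsec; };

/* add ns nanoseconds to *t, keeping tv_nsec in [0, NSEC_PER_SEC) */
static void ts_add_ns(struct ts *t, long long ns)
{
        t->tv_sec += ns / NSEC_PER_SEC;
        t->tv_nsec += ns % NSEC_PER_SEC;
        while (t->tv_nsec >= NSEC_PER_SEC) {
                t->tv_nsec -= NSEC_PER_SEC;
                t->tv_sec++;
        }
}

int main(void)
{
        struct ts now = { .tv_sec = 100, .tv_nsec = 900000000 };

        ts_add_ns(&now, 250000000);     /* 0.25 s of clocksource offset */
        printf("%ld.%09ld\n", now.tv_sec, now.tv_nsec);   /* 101.150000000 */
        return 0;
}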
97 | /** | ||
98 | * getnstimeofday - Returns the time of day in a timespec | ||
99 | * @ts: pointer to the timespec to be set | ||
100 | * | ||
101 | * Returns the time of day in a timespec. | ||
102 | */ | ||
103 | void getnstimeofday(struct timespec *ts) | ||
104 | { | ||
105 | __get_realtime_clock_ts(ts); | ||
106 | } | ||
107 | |||
108 | EXPORT_SYMBOL(getnstimeofday); | ||
109 | |||
110 | /** | ||
111 | * do_gettimeofday - Returns the time of day in a timeval | ||
112 | * @tv: pointer to the timeval to be set | ||
113 | * | ||
114 | * NOTE: Users should be converted to using get_realtime_clock_ts() | ||
115 | */ | ||
116 | void do_gettimeofday(struct timeval *tv) | ||
117 | { | ||
118 | struct timespec now; | ||
119 | |||
120 | __get_realtime_clock_ts(&now); | ||
121 | tv->tv_sec = now.tv_sec; | ||
122 | tv->tv_usec = now.tv_nsec/1000; | ||
123 | } | ||
124 | |||
125 | EXPORT_SYMBOL(do_gettimeofday); | ||
126 | /** | ||
127 | * do_settimeofday - Sets the time of day | ||
128 | * @tv: pointer to the timespec variable containing the new time | ||
129 | * | ||
130 | * Sets the time of day to the new time and update NTP and notify hrtimers | ||
131 | */ | ||
132 | int do_settimeofday(struct timespec *tv) | ||
133 | { | ||
134 | unsigned long flags; | ||
135 | time_t wtm_sec, sec = tv->tv_sec; | ||
136 | long wtm_nsec, nsec = tv->tv_nsec; | ||
137 | |||
138 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
139 | return -EINVAL; | ||
140 | |||
141 | write_seqlock_irqsave(&xtime_lock, flags); | ||
142 | |||
143 | nsec -= __get_nsec_offset(); | ||
144 | |||
145 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
146 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
147 | |||
148 | set_normalized_timespec(&xtime, sec, nsec); | ||
149 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
150 | |||
151 | clock->error = 0; | ||
152 | ntp_clear(); | ||
153 | |||
154 | update_vsyscall(&xtime, clock); | ||
155 | |||
156 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
157 | |||
158 | /* signal hrtimers about time change */ | ||
159 | clock_was_set(); | ||
160 | |||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | EXPORT_SYMBOL(do_settimeofday); | ||
165 | |||
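On the bookkeeping in do_settimeofday() above: wall_to_monotonic is shifted by exactly the opposite of the change applied to xtime, so the monotonic clock (xtime + wall_to_monotonic) does not jump when wall time is set. A quick arithmetic check of that invariant, seconds only and with invented values:

#include <stdio.h>

int main(void)
{
        long xtime = 1000000;           /* current wall time, seconds */
        long wtm = -999990;             /* wall_to_monotonic, seconds */
        long mono_before = xtime + wtm;

        long new_wall = 2000000;        /* settimeofday() jumps wall time forward */
        wtm += xtime - new_wall;        /* mirrors: wtm_sec = wtm + (xtime - sec) */
        xtime = new_wall;

        printf("monotonic before: %ld, after: %ld\n", mono_before, xtime + wtm);
        return 0;
}

Both printed values are the same, which is the property hrtimers and CLOCK_MONOTONIC users depend on across a clock change.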
166 | /** | ||
167 | * change_clocksource - Swaps clocksources if a new one is available | ||
168 | * | ||
169 | * Accumulates current time interval and initializes new clocksource | ||
170 | */ | ||
171 | static void change_clocksource(void) | ||
172 | { | ||
173 | struct clocksource *new; | ||
174 | cycle_t now; | ||
175 | u64 nsec; | ||
176 | |||
177 | new = clocksource_get_next(); | ||
178 | |||
179 | if (clock == new) | ||
180 | return; | ||
181 | |||
182 | now = clocksource_read(new); | ||
183 | nsec = __get_nsec_offset(); | ||
184 | timespec_add_ns(&xtime, nsec); | ||
185 | |||
186 | clock = new; | ||
187 | clock->cycle_last = now; | ||
188 | |||
189 | clock->error = 0; | ||
190 | clock->xtime_nsec = 0; | ||
191 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | ||
192 | |||
193 | tick_clock_notify(); | ||
194 | |||
195 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
196 | clock->name); | ||
197 | } | ||
198 | #else | ||
199 | static inline void change_clocksource(void) { } | ||
200 | #endif | ||
201 | |||
202 | /** | ||
203 | * timekeeping_is_continuous - check to see if timekeeping is free running | ||
204 | */ | ||
205 | int timekeeping_is_continuous(void) | ||
206 | { | ||
207 | unsigned long seq; | ||
208 | int ret; | ||
209 | |||
210 | do { | ||
211 | seq = read_seqbegin(&xtime_lock); | ||
212 | |||
213 | ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | ||
214 | |||
215 | } while (read_seqretry(&xtime_lock, seq)); | ||
216 | |||
217 | return ret; | ||
218 | } | ||
219 | |||
220 | /** | ||
221 | * read_persistent_clock - Return time in seconds from the persistent clock. | ||
222 | * | ||
223 | * Weak dummy function for arches that do not yet support it. | ||
224 | * Returns seconds from epoch using the battery backed persistent clock. | ||
225 | * Returns zero if unsupported. | ||
226 | * | ||
227 | * XXX - Do be sure to remove it once all arches implement it. | ||
228 | */ | ||
229 | unsigned long __attribute__((weak)) read_persistent_clock(void) | ||
230 | { | ||
231 | return 0; | ||
232 | } | ||
233 | |||
234 | /* | ||
235 | * timekeeping_init - Initializes the clocksource and common timekeeping values | ||
236 | */ | ||
237 | void __init timekeeping_init(void) | ||
238 | { | ||
239 | unsigned long flags; | ||
240 | unsigned long sec = read_persistent_clock(); | ||
241 | |||
242 | write_seqlock_irqsave(&xtime_lock, flags); | ||
243 | |||
244 | ntp_clear(); | ||
245 | |||
246 | clock = clocksource_get_next(); | ||
247 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | ||
248 | clock->cycle_last = clocksource_read(clock); | ||
249 | |||
250 | xtime.tv_sec = sec; | ||
251 | xtime.tv_nsec = 0; | ||
252 | set_normalized_timespec(&wall_to_monotonic, | ||
253 | -xtime.tv_sec, -xtime.tv_nsec); | ||
254 | |||
255 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
256 | } | ||
257 | |||
258 | /* flag for if timekeeping is suspended */ | ||
259 | static int timekeeping_suspended; | ||
260 | /* time in seconds when suspend began */ | ||
261 | static unsigned long timekeeping_suspend_time; | ||
262 | |||
263 | /** | ||
264 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | ||
265 | * @dev: unused | ||
266 | * | ||
267 | * This is for the generic clocksource timekeeping. | ||
268 | * xtime/wall_to_monotonic/jiffies/etc are | ||
269 | * still managed by arch specific suspend/resume code. | ||
270 | */ | ||
271 | static int timekeeping_resume(struct sys_device *dev) | ||
272 | { | ||
273 | unsigned long flags; | ||
274 | unsigned long now = read_persistent_clock(); | ||
275 | |||
276 | write_seqlock_irqsave(&xtime_lock, flags); | ||
277 | |||
278 | if (now && (now > timekeeping_suspend_time)) { | ||
279 | unsigned long sleep_length = now - timekeeping_suspend_time; | ||
280 | |||
281 | xtime.tv_sec += sleep_length; | ||
282 | wall_to_monotonic.tv_sec -= sleep_length; | ||
283 | } | ||
284 | /* re-base the last cycle value */ | ||
285 | clock->cycle_last = clocksource_read(clock); | ||
286 | clock->error = 0; | ||
287 | timekeeping_suspended = 0; | ||
288 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
289 | |||
290 | touch_softlockup_watchdog(); | ||
291 | |||
292 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | ||
293 | |||
294 | /* Resume hrtimers */ | ||
295 | hres_timers_resume(); | ||
296 | |||
297 | return 0; | ||
298 | } | ||
299 | |||
300 | static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | ||
301 | { | ||
302 | unsigned long flags; | ||
303 | |||
304 | write_seqlock_irqsave(&xtime_lock, flags); | ||
305 | timekeeping_suspended = 1; | ||
306 | timekeeping_suspend_time = read_persistent_clock(); | ||
307 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
308 | |||
309 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | ||
310 | |||
311 | return 0; | ||
312 | } | ||
313 | |||
314 | /* sysfs resume/suspend bits for timekeeping */ | ||
315 | static struct sysdev_class timekeeping_sysclass = { | ||
316 | .resume = timekeeping_resume, | ||
317 | .suspend = timekeeping_suspend, | ||
318 | set_kset_name("timekeeping"), | ||
319 | }; | ||
320 | |||
321 | static struct sys_device device_timer = { | ||
322 | .id = 0, | ||
323 | .cls = &timekeeping_sysclass, | ||
324 | }; | ||
325 | |||
326 | static int __init timekeeping_init_device(void) | ||
327 | { | ||
328 | int error = sysdev_class_register(&timekeeping_sysclass); | ||
329 | if (!error) | ||
330 | error = sysdev_register(&device_timer); | ||
331 | return error; | ||
332 | } | ||
333 | |||
334 | device_initcall(timekeeping_init_device); | ||
335 | |||
336 | /* | ||
337 | * If the error is already larger, we look ahead even further | ||
338 | * to compensate for late or lost adjustments. | ||
339 | */ | ||
340 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, | ||
341 | s64 *offset) | ||
342 | { | ||
343 | s64 tick_error, i; | ||
344 | u32 look_ahead, adj; | ||
345 | s32 error2, mult; | ||
346 | |||
347 | /* | ||
348 | * Use the current error value to determine how much to look ahead. | ||
349 | * The larger the error the slower we adjust for it to avoid problems | ||
350 | * with losing too many ticks, otherwise we would overadjust and | ||
351 | * produce an even larger error. The smaller the adjustment the | ||
352 | * faster we try to adjust for it, as lost ticks can do less harm | ||
353 | * here. This is tuned so that an error of about 1 msec is adjusted | ||
354 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | ||
355 | */ | ||
356 | error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); | ||
357 | error2 = abs(error2); | ||
358 | for (look_ahead = 0; error2 > 0; look_ahead++) | ||
359 | error2 >>= 2; | ||
360 | |||
361 | /* | ||
362 | * Now calculate the error in (1 << look_ahead) ticks, but first | ||
363 | * remove the single look ahead already included in the error. | ||
364 | */ | ||
365 | tick_error = current_tick_length() >> | ||
366 | (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
367 | tick_error -= clock->xtime_interval >> 1; | ||
368 | error = ((error - tick_error) >> look_ahead) + tick_error; | ||
369 | |||
370 | /* Finally calculate the adjustment shift value. */ | ||
371 | i = *interval; | ||
372 | mult = 1; | ||
373 | if (error < 0) { | ||
374 | error = -error; | ||
375 | *interval = -*interval; | ||
376 | *offset = -*offset; | ||
377 | mult = -1; | ||
378 | } | ||
379 | for (adj = 0; error > i; adj++) | ||
380 | error >>= 1; | ||
381 | |||
382 | *interval <<= adj; | ||
383 | *offset <<= adj; | ||
384 | return mult << adj; | ||
385 | } | ||
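The final loop above can be read in isolation; a stand-alone restatement (not kernel code) of how the adjustment shift is derived:

#include <stdio.h>

/*
 * Returns the smallest shift for which (error >> shift) no longer
 * exceeds interval -- roughly log2(error / interval).  The caller then
 * changes the clock multiplier by +/- (1 << shift).
 */
static int adj_shift(long long error, long long interval)
{
	int adj = 0;

	while (error > interval) {
		error >>= 1;
		adj++;
	}
	return adj;
}

int main(void)
{
	/* an error of 9 intervals: 9 -> 4 -> 2 -> 1, so the shift is 3 */
	printf("shift = %d\n", adj_shift(9, 1));
	return 0;
}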
386 | |||
387 | /* | ||
388 | * Adjust the multiplier to reduce the error value; this is optimized | ||
389 | * for the most common adjustments of -1, 0 and 1, while for other | ||
390 | * values we can do a bit more work. | ||
391 | */ | ||
392 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | ||
393 | { | ||
394 | s64 error, interval = clock->cycle_interval; | ||
395 | int adj; | ||
396 | |||
397 | error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); | ||
398 | if (error > interval) { | ||
399 | error >>= 2; | ||
400 | if (likely(error <= interval)) | ||
401 | adj = 1; | ||
402 | else | ||
403 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
404 | } else if (error < -interval) { | ||
405 | error >>= 2; | ||
406 | if (likely(error >= -interval)) { | ||
407 | adj = -1; | ||
408 | interval = -interval; | ||
409 | offset = -offset; | ||
410 | } else | ||
411 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
412 | } else | ||
413 | return; | ||
414 | |||
415 | clock->mult += adj; | ||
416 | clock->xtime_interval += interval; | ||
417 | clock->xtime_nsec -= offset; | ||
418 | clock->error -= (interval - offset) << | ||
419 | (TICK_LENGTH_SHIFT - clock->shift); | ||
420 | } | ||
421 | |||
422 | /** | ||
423 | * update_wall_time - Uses the current clocksource to increment the wall time | ||
424 | * | ||
425 | * Called from the timer interrupt, must hold a write on xtime_lock. | ||
426 | */ | ||
427 | void update_wall_time(void) | ||
428 | { | ||
429 | cycle_t offset; | ||
430 | |||
431 | /* Make sure we're fully resumed: */ | ||
432 | if (unlikely(timekeeping_suspended)) | ||
433 | return; | ||
434 | |||
435 | #ifdef CONFIG_GENERIC_TIME | ||
436 | offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; | ||
437 | #else | ||
438 | offset = clock->cycle_interval; | ||
439 | #endif | ||
440 | clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; | ||
441 | |||
442 | /* Normally this loop will run just once; however, in the case | ||
443 | * of lost or late ticks it will accumulate correctly. | ||
444 | */ | ||
445 | while (offset >= clock->cycle_interval) { | ||
446 | /* accumulate one interval */ | ||
447 | clock->xtime_nsec += clock->xtime_interval; | ||
448 | clock->cycle_last += clock->cycle_interval; | ||
449 | offset -= clock->cycle_interval; | ||
450 | |||
451 | if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { | ||
452 | clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; | ||
453 | xtime.tv_sec++; | ||
454 | second_overflow(); | ||
455 | } | ||
456 | |||
457 | /* interpolator bits */ | ||
458 | time_interpolator_update(clock->xtime_interval | ||
459 | >> clock->shift); | ||
460 | |||
461 | /* accumulate error between NTP and clock interval */ | ||
462 | clock->error += current_tick_length(); | ||
463 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | ||
464 | } | ||
465 | |||
466 | /* correct the clock when NTP error is too big */ | ||
467 | clocksource_adjust(clock, offset); | ||
468 | |||
469 | /* store full nanoseconds into xtime */ | ||
470 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; | ||
471 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | ||
472 | |||
473 | /* check to see if there is a new clocksource to use */ | ||
474 | change_clocksource(); | ||
475 | update_vsyscall(&xtime, clock); | ||
476 | } | ||
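A stand-alone model of the accumulation loop in update_wall_time() (illustrative only; the 1 MHz clocksource, the HZ=1000 tick and the shift value are made up): nanoseconds are kept left-shifted so the sub-nanosecond remainder of each interval is not lost, and any backlog of intervals is consumed one at a time.

#include <stdint.h>
#include <stdio.h>

#define SHIFT		10
#define NSEC_PER_SEC	1000000000ULL

int main(void)
{
	uint64_t cycle_interval = 1000;			/* counter cycles per tick */
	uint64_t xtime_interval = 1000000ULL << SHIFT;	/* 1 ms per tick, shifted */
	uint64_t xtime_nsec = 0;			/* shifted ns accumulator */
	uint64_t tv_sec = 0;
	uint64_t offset = 3500;				/* pretend 3.5 ticks elapsed */

	while (offset >= cycle_interval) {
		xtime_nsec += xtime_interval;		/* accumulate one interval */
		offset -= cycle_interval;

		if (xtime_nsec >= (NSEC_PER_SEC << SHIFT)) {
			xtime_nsec -= NSEC_PER_SEC << SHIFT;
			tv_sec++;			/* roll over into seconds */
		}
	}

	printf("accumulated %llu s + %llu ns, %llu cycles carried over\n",
	       (unsigned long long)tv_sec,
	       (unsigned long long)(xtime_nsec >> SHIFT),
	       (unsigned long long)offset);
	return 0;
}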
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 59df5e8555a8..b734ca4bc75e 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); | |||
38 | 38 | ||
39 | static void print_name_offset(struct seq_file *m, void *sym) | 39 | static void print_name_offset(struct seq_file *m, void *sym) |
40 | { | 40 | { |
41 | unsigned long addr = (unsigned long)sym; | 41 | char symname[KSYM_NAME_LEN+1]; |
42 | char namebuf[KSYM_NAME_LEN+1]; | 42 | |
43 | unsigned long size, offset; | 43 | if (lookup_symbol_name((unsigned long)sym, symname) < 0) |
44 | const char *sym_name; | ||
45 | char *modname; | ||
46 | |||
47 | sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); | ||
48 | if (sym_name) | ||
49 | SEQ_printf(m, "%s", sym_name); | ||
50 | else | ||
51 | SEQ_printf(m, "<%p>", sym); | 44 | SEQ_printf(m, "<%p>", sym); |
45 | else | ||
46 | SEQ_printf(m, "%s", symname); | ||
52 | } | 47 | } |
53 | 48 | ||
54 | static void | 49 | static void |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1bc4882e28e0..868f1bceb07f 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | |||
257 | 257 | ||
258 | static void print_name_offset(struct seq_file *m, unsigned long addr) | 258 | static void print_name_offset(struct seq_file *m, unsigned long addr) |
259 | { | 259 | { |
260 | char namebuf[KSYM_NAME_LEN+1]; | 260 | char symname[KSYM_NAME_LEN+1]; |
261 | unsigned long size, offset; | 261 | |
262 | const char *sym_name; | 262 | if (lookup_symbol_name(addr, symname) < 0) |
263 | char *modname; | ||
264 | |||
265 | sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); | ||
266 | if (sym_name) | ||
267 | seq_printf(m, "%s", sym_name); | ||
268 | else | ||
269 | seq_printf(m, "<%p>", (void *)addr); | 263 | seq_printf(m, "<%p>", (void *)addr); |
264 | else | ||
265 | seq_printf(m, "%s", symname); | ||
270 | } | 266 | } |
271 | 267 | ||
272 | static int tstats_show(struct seq_file *m, void *v) | 268 | static int tstats_show(struct seq_file *m, void *v) |
diff --git a/kernel/timer.c b/kernel/timer.c index b22bd39740dd..7a6448340f90 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/timer.c | 2 | * linux/kernel/timer.c |
3 | * | 3 | * |
4 | * Kernel internal timers, kernel timekeeping, basic process system calls | 4 | * Kernel internal timers, basic process system calls |
5 | * | 5 | * |
6 | * Copyright (C) 1991, 1992 Linus Torvalds | 6 | * Copyright (C) 1991, 1992 Linus Torvalds |
7 | * | 7 | * |
@@ -74,7 +74,7 @@ struct tvec_t_base_s { | |||
74 | tvec_t tv3; | 74 | tvec_t tv3; |
75 | tvec_t tv4; | 75 | tvec_t tv4; |
76 | tvec_t tv5; | 76 | tvec_t tv5; |
77 | } ____cacheline_aligned_in_smp; | 77 | } ____cacheline_aligned; |
78 | 78 | ||
79 | typedef struct tvec_t_base_s tvec_base_t; | 79 | typedef struct tvec_t_base_s tvec_base_t; |
80 | 80 | ||
@@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases; | |||
82 | EXPORT_SYMBOL(boot_tvec_bases); | 82 | EXPORT_SYMBOL(boot_tvec_bases); |
83 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 83 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; |
84 | 84 | ||
85 | /* | ||
86 | * Note that all tvec_base pointers are at least 2-byte aligned, so the | ||
87 | * lower bit of base in timer_list is guaranteed to be zero. Use that | ||
88 | * LSB as the new flag to indicate whether the timer is deferrable. | ||
89 | */ | ||
90 | #define TBASE_DEFERRABLE_FLAG (0x1) | ||
91 | |||
92 | /* Functions below help us manage 'deferrable' flag */ | ||
93 | static inline unsigned int tbase_get_deferrable(tvec_base_t *base) | ||
94 | { | ||
95 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); | ||
96 | } | ||
97 | |||
98 | static inline tvec_base_t *tbase_get_base(tvec_base_t *base) | ||
99 | { | ||
100 | return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); | ||
101 | } | ||
102 | |||
103 | static inline void timer_set_deferrable(struct timer_list *timer) | ||
104 | { | ||
105 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | | ||
106 | TBASE_DEFERRABLE_FLAG)); | ||
107 | } | ||
108 | |||
109 | static inline void | ||
110 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) | ||
111 | { | ||
112 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | | ||
113 | tbase_get_deferrable(timer->base)); | ||
114 | } | ||
115 | |||
85 | /** | 116 | /** |
86 | * __round_jiffies - function to round jiffies to a full second | 117 | * __round_jiffies - function to round jiffies to a full second |
87 | * @j: the time in (absolute) jiffies that should be rounded | 118 | * @j: the time in (absolute) jiffies that should be rounded |
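The helpers added above rely on pointer tagging: since every tvec_base_t is at least 2-byte aligned, bit 0 of the base pointer is free to carry the deferrable flag. A stand-alone illustration of the same trick (not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define FLAG_DEFERRABLE	0x1UL

struct base { int dummy; };

static inline struct base *clear_flag(struct base *p)
{
	return (struct base *)((uintptr_t)p & ~FLAG_DEFERRABLE);
}

static inline unsigned int get_flag(struct base *p)
{
	return (uintptr_t)p & FLAG_DEFERRABLE;
}

int main(void)
{
	/* malloc() returns at least word-aligned memory, so bit 0 is clear */
	struct base *b = malloc(sizeof(*b));
	struct base *tagged = (struct base *)((uintptr_t)b | FLAG_DEFERRABLE);

	assert(get_flag(tagged) == 1);
	assert(clear_flag(tagged) == b);
	free(b);
	return 0;
}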
@@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer) | |||
295 | } | 326 | } |
296 | EXPORT_SYMBOL(init_timer); | 327 | EXPORT_SYMBOL(init_timer); |
297 | 328 | ||
329 | void fastcall init_timer_deferrable(struct timer_list *timer) | ||
330 | { | ||
331 | init_timer(timer); | ||
332 | timer_set_deferrable(timer); | ||
333 | } | ||
334 | EXPORT_SYMBOL(init_timer_deferrable); | ||
335 | |||
298 | static inline void detach_timer(struct timer_list *timer, | 336 | static inline void detach_timer(struct timer_list *timer, |
299 | int clear_pending) | 337 | int clear_pending) |
300 | { | 338 | { |
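A hypothetical in-kernel user of the new init_timer_deferrable() interface (a sketch: my_poll_timer, my_poll_fn and the 10-second period are made up): a low-priority housekeeping timer that, per the __next_timer_interrupt() change below, will not on its own pull an idle CPU out of sleep.

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_poll_timer;

static void my_poll_fn(unsigned long data)
{
	/* ... periodic, non-urgent work ... */

	/* re-arm; the deferrable flag stays encoded in timer->base */
	mod_timer(&my_poll_timer, jiffies + 10 * HZ);
}

static void my_poll_start(void)
{
	init_timer_deferrable(&my_poll_timer);
	my_poll_timer.function = my_poll_fn;
	my_poll_timer.data = 0;
	mod_timer(&my_poll_timer, jiffies + 10 * HZ);
}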
@@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, | |||
325 | tvec_base_t *base; | 363 | tvec_base_t *base; |
326 | 364 | ||
327 | for (;;) { | 365 | for (;;) { |
328 | base = timer->base; | 366 | tvec_base_t *prelock_base = timer->base; |
367 | base = tbase_get_base(prelock_base); | ||
329 | if (likely(base != NULL)) { | 368 | if (likely(base != NULL)) { |
330 | spin_lock_irqsave(&base->lock, *flags); | 369 | spin_lock_irqsave(&base->lock, *flags); |
331 | if (likely(base == timer->base)) | 370 | if (likely(prelock_base == timer->base)) |
332 | return base; | 371 | return base; |
333 | /* The timer has migrated to another CPU */ | 372 | /* The timer has migrated to another CPU */ |
334 | spin_unlock_irqrestore(&base->lock, *flags); | 373 | spin_unlock_irqrestore(&base->lock, *flags); |
@@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
365 | */ | 404 | */ |
366 | if (likely(base->running_timer != timer)) { | 405 | if (likely(base->running_timer != timer)) { |
367 | /* See the comment in lock_timer_base() */ | 406 | /* See the comment in lock_timer_base() */ |
368 | timer->base = NULL; | 407 | timer_set_base(timer, NULL); |
369 | spin_unlock(&base->lock); | 408 | spin_unlock(&base->lock); |
370 | base = new_base; | 409 | base = new_base; |
371 | spin_lock(&base->lock); | 410 | spin_lock(&base->lock); |
372 | timer->base = base; | 411 | timer_set_base(timer, base); |
373 | } | 412 | } |
374 | } | 413 | } |
375 | 414 | ||
@@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
397 | timer_stats_timer_set_start_info(timer); | 436 | timer_stats_timer_set_start_info(timer); |
398 | BUG_ON(timer_pending(timer) || !timer->function); | 437 | BUG_ON(timer_pending(timer) || !timer->function); |
399 | spin_lock_irqsave(&base->lock, flags); | 438 | spin_lock_irqsave(&base->lock, flags); |
400 | timer->base = base; | 439 | timer_set_base(timer, base); |
401 | internal_add_timer(base, timer); | 440 | internal_add_timer(base, timer); |
402 | spin_unlock_irqrestore(&base->lock, flags); | 441 | spin_unlock_irqrestore(&base->lock, flags); |
403 | } | 442 | } |
@@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
550 | * don't have to detach them individually. | 589 | * don't have to detach them individually. |
551 | */ | 590 | */ |
552 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { | 591 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { |
553 | BUG_ON(timer->base != base); | 592 | BUG_ON(tbase_get_base(timer->base) != base); |
554 | internal_add_timer(base, timer); | 593 | internal_add_timer(base, timer); |
555 | } | 594 | } |
556 | 595 | ||
@@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
590 | void (*fn)(unsigned long); | 629 | void (*fn)(unsigned long); |
591 | unsigned long data; | 630 | unsigned long data; |
592 | 631 | ||
593 | timer = list_entry(head->next,struct timer_list,entry); | 632 | timer = list_first_entry(head, struct timer_list,entry); |
594 | fn = timer->function; | 633 | fn = timer->function; |
595 | data = timer->data; | 634 | data = timer->data; |
596 | 635 | ||
@@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) | |||
636 | index = slot = timer_jiffies & TVR_MASK; | 675 | index = slot = timer_jiffies & TVR_MASK; |
637 | do { | 676 | do { |
638 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { | 677 | list_for_each_entry(nte, base->tv1.vec + slot, entry) { |
678 | if (tbase_get_deferrable(nte->base)) | ||
679 | continue; | ||
680 | |||
639 | found = 1; | 681 | found = 1; |
640 | expires = nte->expires; | 682 | expires = nte->expires; |
641 | /* Look at the cascade bucket(s)? */ | 683 | /* Look at the cascade bucket(s)? */ |
@@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void) | |||
752 | 794 | ||
753 | #endif | 795 | #endif |
754 | 796 | ||
755 | /******************************************************************/ | ||
756 | |||
757 | /* | ||
758 | * The current time | ||
759 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | ||
760 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | ||
761 | * at zero at system boot time, so wall_to_monotonic will be negative, | ||
762 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | ||
763 | * the usual normalization. | ||
764 | */ | ||
765 | struct timespec xtime __attribute__ ((aligned (16))); | ||
766 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | ||
767 | |||
768 | EXPORT_SYMBOL(xtime); | ||
769 | |||
770 | |||
771 | /* XXX - all of this timekeeping code should be later moved to time.c */ | ||
772 | #include <linux/clocksource.h> | ||
773 | static struct clocksource *clock; /* pointer to current clocksource */ | ||
774 | |||
775 | #ifdef CONFIG_GENERIC_TIME | ||
776 | /** | ||
777 | * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook | ||
778 | * | ||
779 | * private function, must hold xtime_lock lock when being | ||
780 | * called. Returns the number of nanoseconds since the | ||
781 | * last call to update_wall_time() (adjusted by NTP scaling) | ||
782 | */ | ||
783 | static inline s64 __get_nsec_offset(void) | ||
784 | { | ||
785 | cycle_t cycle_now, cycle_delta; | ||
786 | s64 ns_offset; | ||
787 | |||
788 | /* read clocksource: */ | ||
789 | cycle_now = clocksource_read(clock); | ||
790 | |||
791 | /* calculate the delta since the last update_wall_time: */ | ||
792 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | ||
793 | |||
794 | /* convert to nanoseconds: */ | ||
795 | ns_offset = cyc2ns(clock, cycle_delta); | ||
796 | |||
797 | return ns_offset; | ||
798 | } | ||
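The cyc2ns() step above is a fixed-point multiply; a stand-alone equivalent (illustrative, with a made-up mult/shift pair that pretends the counter ticks at 1 MHz, i.e. 1000 ns per cycle):

#include <stdint.h>
#include <stdio.h>

/* ns = (cycles * mult) >> shift, the usual clocksource conversion */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	uint32_t shift = 20;
	uint32_t mult = 1000u << 20;	/* 1000 ns per cycle in 20-bit fixed point */

	/* 1234 cycles at 1 MHz -> 1234000 ns */
	printf("%llu\n", (unsigned long long)cycles_to_ns(1234, mult, shift));
	return 0;
}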
799 | |||
800 | /** | ||
801 | * __get_realtime_clock_ts - Returns the time of day in a timespec | ||
802 | * @ts: pointer to the timespec to be set | ||
803 | * | ||
804 | * Returns the time of day in a timespec. Used by | ||
805 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
806 | */ | ||
807 | static inline void __get_realtime_clock_ts(struct timespec *ts) | ||
808 | { | ||
809 | unsigned long seq; | ||
810 | s64 nsecs; | ||
811 | |||
812 | do { | ||
813 | seq = read_seqbegin(&xtime_lock); | ||
814 | |||
815 | *ts = xtime; | ||
816 | nsecs = __get_nsec_offset(); | ||
817 | |||
818 | } while (read_seqretry(&xtime_lock, seq)); | ||
819 | |||
820 | timespec_add_ns(ts, nsecs); | ||
821 | } | ||
822 | |||
823 | /** | ||
824 | * getnstimeofday - Returns the time of day in a timespec | ||
825 | * @ts: pointer to the timespec to be set | ||
826 | * | ||
827 | * Returns the time of day in a timespec. | ||
828 | */ | ||
829 | void getnstimeofday(struct timespec *ts) | ||
830 | { | ||
831 | __get_realtime_clock_ts(ts); | ||
832 | } | ||
833 | |||
834 | EXPORT_SYMBOL(getnstimeofday); | ||
835 | |||
836 | /** | ||
837 | * do_gettimeofday - Returns the time of day in a timeval | ||
838 | * @tv: pointer to the timeval to be set | ||
839 | * | ||
840 | * NOTE: Users should be converted to using get_realtime_clock_ts() | ||
841 | */ | ||
842 | void do_gettimeofday(struct timeval *tv) | ||
843 | { | ||
844 | struct timespec now; | ||
845 | |||
846 | __get_realtime_clock_ts(&now); | ||
847 | tv->tv_sec = now.tv_sec; | ||
848 | tv->tv_usec = now.tv_nsec/1000; | ||
849 | } | ||
850 | |||
851 | EXPORT_SYMBOL(do_gettimeofday); | ||
852 | /** | ||
853 | * do_settimeofday - Sets the time of day | ||
854 | * @tv: pointer to the timespec variable containing the new time | ||
855 | * | ||
856 | * Sets the time of day to the new time and update NTP and notify hrtimers | ||
857 | */ | ||
858 | int do_settimeofday(struct timespec *tv) | ||
859 | { | ||
860 | unsigned long flags; | ||
861 | time_t wtm_sec, sec = tv->tv_sec; | ||
862 | long wtm_nsec, nsec = tv->tv_nsec; | ||
863 | |||
864 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
865 | return -EINVAL; | ||
866 | |||
867 | write_seqlock_irqsave(&xtime_lock, flags); | ||
868 | |||
869 | nsec -= __get_nsec_offset(); | ||
870 | |||
871 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
872 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
873 | |||
874 | set_normalized_timespec(&xtime, sec, nsec); | ||
875 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
876 | |||
877 | clock->error = 0; | ||
878 | ntp_clear(); | ||
879 | |||
880 | update_vsyscall(&xtime, clock); | ||
881 | |||
882 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
883 | |||
884 | /* signal hrtimers about time change */ | ||
885 | clock_was_set(); | ||
886 | |||
887 | return 0; | ||
888 | } | ||
889 | |||
890 | EXPORT_SYMBOL(do_settimeofday); | ||
891 | |||
892 | /** | ||
893 | * change_clocksource - Swaps clocksources if a new one is available | ||
894 | * | ||
895 | * Accumulates current time interval and initializes new clocksource | ||
896 | */ | ||
897 | static void change_clocksource(void) | ||
898 | { | ||
899 | struct clocksource *new; | ||
900 | cycle_t now; | ||
901 | u64 nsec; | ||
902 | |||
903 | new = clocksource_get_next(); | ||
904 | |||
905 | if (clock == new) | ||
906 | return; | ||
907 | |||
908 | now = clocksource_read(new); | ||
909 | nsec = __get_nsec_offset(); | ||
910 | timespec_add_ns(&xtime, nsec); | ||
911 | |||
912 | clock = new; | ||
913 | clock->cycle_last = now; | ||
914 | |||
915 | clock->error = 0; | ||
916 | clock->xtime_nsec = 0; | ||
917 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | ||
918 | |||
919 | tick_clock_notify(); | ||
920 | |||
921 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
922 | clock->name); | ||
923 | } | ||
924 | #else | ||
925 | static inline void change_clocksource(void) { } | ||
926 | #endif | ||
927 | |||
928 | /** | ||
929 | * timekeeping_is_continuous - check to see if timekeeping is free running | ||
930 | */ | ||
931 | int timekeeping_is_continuous(void) | ||
932 | { | ||
933 | unsigned long seq; | ||
934 | int ret; | ||
935 | |||
936 | do { | ||
937 | seq = read_seqbegin(&xtime_lock); | ||
938 | |||
939 | ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | ||
940 | |||
941 | } while (read_seqretry(&xtime_lock, seq)); | ||
942 | |||
943 | return ret; | ||
944 | } | ||
945 | |||
946 | /** | ||
947 | * read_persistent_clock - Return time in seconds from the persistent clock. | ||
948 | * | ||
949 | * Weak dummy function for arches that do not yet support it. | ||
950 | * Returns seconds from epoch using the battery backed persistent clock. | ||
951 | * Returns zero if unsupported. | ||
952 | * | ||
953 | * XXX - Do be sure to remove it once all arches implement it. | ||
954 | */ | ||
955 | unsigned long __attribute__((weak)) read_persistent_clock(void) | ||
956 | { | ||
957 | return 0; | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * timekeeping_init - Initializes the clocksource and common timekeeping values | ||
962 | */ | ||
963 | void __init timekeeping_init(void) | ||
964 | { | ||
965 | unsigned long flags; | ||
966 | unsigned long sec = read_persistent_clock(); | ||
967 | |||
968 | write_seqlock_irqsave(&xtime_lock, flags); | ||
969 | |||
970 | ntp_clear(); | ||
971 | |||
972 | clock = clocksource_get_next(); | ||
973 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | ||
974 | clock->cycle_last = clocksource_read(clock); | ||
975 | |||
976 | xtime.tv_sec = sec; | ||
977 | xtime.tv_nsec = 0; | ||
978 | set_normalized_timespec(&wall_to_monotonic, | ||
979 | -xtime.tv_sec, -xtime.tv_nsec); | ||
980 | |||
981 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
982 | } | ||
983 | |||
984 | /* flag for if timekeeping is suspended */ | ||
985 | static int timekeeping_suspended; | ||
986 | /* time in seconds when suspend began */ | ||
987 | static unsigned long timekeeping_suspend_time; | ||
988 | |||
989 | /** | ||
990 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | ||
991 | * @dev: unused | ||
992 | * | ||
993 | * This is for the generic clocksource timekeeping. | ||
994 | * xtime/wall_to_monotonic/jiffies/etc are | ||
995 | * still managed by arch specific suspend/resume code. | ||
996 | */ | ||
997 | static int timekeeping_resume(struct sys_device *dev) | ||
998 | { | ||
999 | unsigned long flags; | ||
1000 | unsigned long now = read_persistent_clock(); | ||
1001 | |||
1002 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1003 | |||
1004 | if (now && (now > timekeeping_suspend_time)) { | ||
1005 | unsigned long sleep_length = now - timekeeping_suspend_time; | ||
1006 | |||
1007 | xtime.tv_sec += sleep_length; | ||
1008 | wall_to_monotonic.tv_sec -= sleep_length; | ||
1009 | } | ||
1010 | /* re-base the last cycle value */ | ||
1011 | clock->cycle_last = clocksource_read(clock); | ||
1012 | clock->error = 0; | ||
1013 | timekeeping_suspended = 0; | ||
1014 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1015 | |||
1016 | touch_softlockup_watchdog(); | ||
1017 | |||
1018 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | ||
1019 | |||
1020 | /* Resume hrtimers */ | ||
1021 | hres_timers_resume(); | ||
1022 | |||
1023 | return 0; | ||
1024 | } | ||
1025 | |||
1026 | static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | ||
1027 | { | ||
1028 | unsigned long flags; | ||
1029 | |||
1030 | write_seqlock_irqsave(&xtime_lock, flags); | ||
1031 | timekeeping_suspended = 1; | ||
1032 | timekeeping_suspend_time = read_persistent_clock(); | ||
1033 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1034 | |||
1035 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | ||
1036 | |||
1037 | return 0; | ||
1038 | } | ||
1039 | |||
1040 | /* sysfs resume/suspend bits for timekeeping */ | ||
1041 | static struct sysdev_class timekeeping_sysclass = { | ||
1042 | .resume = timekeeping_resume, | ||
1043 | .suspend = timekeeping_suspend, | ||
1044 | set_kset_name("timekeeping"), | ||
1045 | }; | ||
1046 | |||
1047 | static struct sys_device device_timer = { | ||
1048 | .id = 0, | ||
1049 | .cls = &timekeeping_sysclass, | ||
1050 | }; | ||
1051 | |||
1052 | static int __init timekeeping_init_device(void) | ||
1053 | { | ||
1054 | int error = sysdev_class_register(&timekeeping_sysclass); | ||
1055 | if (!error) | ||
1056 | error = sysdev_register(&device_timer); | ||
1057 | return error; | ||
1058 | } | ||
1059 | |||
1060 | device_initcall(timekeeping_init_device); | ||
1061 | |||
1062 | /* | ||
1063 | * If the error is already larger, we look ahead even further | ||
1064 | * to compensate for late or lost adjustments. | ||
1065 | */ | ||
1066 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, | ||
1067 | s64 *offset) | ||
1068 | { | ||
1069 | s64 tick_error, i; | ||
1070 | u32 look_ahead, adj; | ||
1071 | s32 error2, mult; | ||
1072 | |||
1073 | /* | ||
1074 | * Use the current error value to determine how much to look ahead. | ||
1075 | * The larger the error the slower we adjust for it to avoid problems | ||
1076 | * with losing too many ticks, otherwise we would overadjust and | ||
1077 | * produce an even larger error. The smaller the adjustment the | ||
1078 | * faster we try to adjust for it, as lost ticks can do less harm | ||
1079 | * here. This is tuned so that an error of about 1 msec is adusted | ||
1080 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | ||
1081 | */ | ||
1082 | error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); | ||
1083 | error2 = abs(error2); | ||
1084 | for (look_ahead = 0; error2 > 0; look_ahead++) | ||
1085 | error2 >>= 2; | ||
1086 | |||
1087 | /* | ||
1088 | * Now calculate the error in (1 << look_ahead) ticks, but first | ||
1089 | * remove the single look ahead already included in the error. | ||
1090 | */ | ||
1091 | tick_error = current_tick_length() >> | ||
1092 | (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
1093 | tick_error -= clock->xtime_interval >> 1; | ||
1094 | error = ((error - tick_error) >> look_ahead) + tick_error; | ||
1095 | |||
1096 | /* Finally calculate the adjustment shift value. */ | ||
1097 | i = *interval; | ||
1098 | mult = 1; | ||
1099 | if (error < 0) { | ||
1100 | error = -error; | ||
1101 | *interval = -*interval; | ||
1102 | *offset = -*offset; | ||
1103 | mult = -1; | ||
1104 | } | ||
1105 | for (adj = 0; error > i; adj++) | ||
1106 | error >>= 1; | ||
1107 | |||
1108 | *interval <<= adj; | ||
1109 | *offset <<= adj; | ||
1110 | return mult << adj; | ||
1111 | } | ||
1112 | |||
1113 | /* | ||
1114 | * Adjust the multiplier to reduce the error value, | ||
1115 | * this is optimized for the most common adjustments of -1,0,1, | ||
1116 | * for other values we can do a bit more work. | ||
1117 | */ | ||
1118 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | ||
1119 | { | ||
1120 | s64 error, interval = clock->cycle_interval; | ||
1121 | int adj; | ||
1122 | |||
1123 | error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); | ||
1124 | if (error > interval) { | ||
1125 | error >>= 2; | ||
1126 | if (likely(error <= interval)) | ||
1127 | adj = 1; | ||
1128 | else | ||
1129 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
1130 | } else if (error < -interval) { | ||
1131 | error >>= 2; | ||
1132 | if (likely(error >= -interval)) { | ||
1133 | adj = -1; | ||
1134 | interval = -interval; | ||
1135 | offset = -offset; | ||
1136 | } else | ||
1137 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
1138 | } else | ||
1139 | return; | ||
1140 | |||
1141 | clock->mult += adj; | ||
1142 | clock->xtime_interval += interval; | ||
1143 | clock->xtime_nsec -= offset; | ||
1144 | clock->error -= (interval - offset) << | ||
1145 | (TICK_LENGTH_SHIFT - clock->shift); | ||
1146 | } | ||
1147 | |||
1148 | /** | ||
1149 | * update_wall_time - Uses the current clocksource to increment the wall time | ||
1150 | * | ||
1151 | * Called from the timer interrupt, must hold a write on xtime_lock. | ||
1152 | */ | ||
1153 | static void update_wall_time(void) | ||
1154 | { | ||
1155 | cycle_t offset; | ||
1156 | |||
1157 | /* Make sure we're fully resumed: */ | ||
1158 | if (unlikely(timekeeping_suspended)) | ||
1159 | return; | ||
1160 | |||
1161 | #ifdef CONFIG_GENERIC_TIME | ||
1162 | offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; | ||
1163 | #else | ||
1164 | offset = clock->cycle_interval; | ||
1165 | #endif | ||
1166 | clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; | ||
1167 | |||
1168 | /* normally this loop will run just once, however in the | ||
1169 | * case of lost or late ticks, it will accumulate correctly. | ||
1170 | */ | ||
1171 | while (offset >= clock->cycle_interval) { | ||
1172 | /* accumulate one interval */ | ||
1173 | clock->xtime_nsec += clock->xtime_interval; | ||
1174 | clock->cycle_last += clock->cycle_interval; | ||
1175 | offset -= clock->cycle_interval; | ||
1176 | |||
1177 | if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { | ||
1178 | clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; | ||
1179 | xtime.tv_sec++; | ||
1180 | second_overflow(); | ||
1181 | } | ||
1182 | |||
1183 | /* interpolator bits */ | ||
1184 | time_interpolator_update(clock->xtime_interval | ||
1185 | >> clock->shift); | ||
1186 | |||
1187 | /* accumulate error between NTP and clock interval */ | ||
1188 | clock->error += current_tick_length(); | ||
1189 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | ||
1190 | } | ||
1191 | |||
1192 | /* correct the clock when NTP error is too big */ | ||
1193 | clocksource_adjust(clock, offset); | ||
1194 | |||
1195 | /* store full nanoseconds into xtime */ | ||
1196 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; | ||
1197 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | ||
1198 | |||
1199 | /* check to see if there is a new clocksource to use */ | ||
1200 | change_clocksource(); | ||
1201 | update_vsyscall(&xtime, clock); | ||
1202 | } | ||
1203 | |||
1204 | /* | 797 | /* |
1205 | * Called from the timer interrupt handler to charge one tick to the current | 798 | * Called from the timer interrupt handler to charge one tick to the current |
1206 | * process. user_tick is 1 if the tick is user time, 0 for system. | 799 | * process. user_tick is 1 if the tick is user time, 0 for system. |
@@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks) | |||
1264 | } | 857 | } |
1265 | 858 | ||
1266 | /* | 859 | /* |
1267 | * This read-write spinlock protects us from races in SMP while | ||
1268 | * playing with xtime and avenrun. | ||
1269 | */ | ||
1270 | __attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
1271 | |||
1272 | EXPORT_SYMBOL(xtime_lock); | ||
1273 | |||
1274 | /* | ||
1275 | * This function runs timers and the timer-tq in bottom half context. | 860 | * This function runs timers and the timer-tq in bottom half context. |
1276 | */ | 861 | */ |
1277 | static void run_timer_softirq(struct softirq_action *h) | 862 | static void run_timer_softirq(struct softirq_action *h) |
@@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu) | |||
1617 | cpu_to_node(cpu)); | 1202 | cpu_to_node(cpu)); |
1618 | if (!base) | 1203 | if (!base) |
1619 | return -ENOMEM; | 1204 | return -ENOMEM; |
1205 | |||
1206 | /* Make sure the allocated tvec_base has its low bit clear, i.e. is 2-byte aligned */ | ||
1207 | if (tbase_get_deferrable(base)) { | ||
1208 | WARN_ON(1); | ||
1209 | kfree(base); | ||
1210 | return -ENOMEM; | ||
1211 | } | ||
1620 | memset(base, 0, sizeof(*base)); | 1212 | memset(base, 0, sizeof(*base)); |
1621 | per_cpu(tvec_bases, cpu) = base; | 1213 | per_cpu(tvec_bases, cpu) = base; |
1622 | } else { | 1214 | } else { |
@@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | |||
1656 | struct timer_list *timer; | 1248 | struct timer_list *timer; |
1657 | 1249 | ||
1658 | while (!list_empty(head)) { | 1250 | while (!list_empty(head)) { |
1659 | timer = list_entry(head->next, struct timer_list, entry); | 1251 | timer = list_first_entry(head, struct timer_list, entry); |
1660 | detach_timer(timer, 0); | 1252 | detach_timer(timer, 0); |
1661 | timer->base = new_base; | 1253 | timer_set_base(timer, new_base); |
1662 | internal_add_timer(new_base, timer); | 1254 | internal_add_timer(new_base, timer); |
1663 | } | 1255 | } |
1664 | } | 1256 | } |
diff --git a/kernel/uid16.c b/kernel/uid16.c index 187e2a423878..dd308ba4e03b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -6,7 +6,6 @@ | |||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/utsname.h> | 7 | #include <linux/utsname.h> |
8 | #include <linux/mman.h> | 8 | #include <linux/mman.h> |
9 | #include <linux/smp_lock.h> | ||
10 | #include <linux/notifier.h> | 9 | #include <linux/notifier.h> |
11 | #include <linux/reboot.h> | 10 | #include <linux/reboot.h> |
12 | #include <linux/prctl.h> | 11 | #include <linux/prctl.h> |
diff --git a/kernel/utsname.c b/kernel/utsname.c index c859164a6993..160c8c5136bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
32 | } | 32 | } |
33 | 33 | ||
34 | /* | 34 | /* |
35 | * unshare the current process' utsname namespace. | ||
36 | * called only in sys_unshare() | ||
37 | */ | ||
38 | int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) | ||
39 | { | ||
40 | if (unshare_flags & CLONE_NEWUTS) { | ||
41 | if (!capable(CAP_SYS_ADMIN)) | ||
42 | return -EPERM; | ||
43 | |||
44 | *new_uts = clone_uts_ns(current->nsproxy->uts_ns); | ||
45 | if (!*new_uts) | ||
46 | return -ENOMEM; | ||
47 | } | ||
48 | |||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * Copy task tsk's utsname namespace, or clone it if flags | 35 | * Copy task tsk's utsname namespace, or clone it if flags |
54 | * specifies CLONE_NEWUTS. In latter case, changes to the | 36 | * specifies CLONE_NEWUTS. In latter case, changes to the |
55 | * utsname of this process won't be seen by parent, and vice | 37 | * utsname of this process won't be seen by parent, and vice |
56 | * versa. | 38 | * versa. |
57 | */ | 39 | */ |
58 | int copy_utsname(int flags, struct task_struct *tsk) | 40 | struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) |
59 | { | 41 | { |
60 | struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; | ||
61 | struct uts_namespace *new_ns; | 42 | struct uts_namespace *new_ns; |
62 | int err = 0; | ||
63 | |||
64 | if (!old_ns) | ||
65 | return 0; | ||
66 | 43 | ||
44 | BUG_ON(!old_ns); | ||
67 | get_uts_ns(old_ns); | 45 | get_uts_ns(old_ns); |
68 | 46 | ||
69 | if (!(flags & CLONE_NEWUTS)) | 47 | if (!(flags & CLONE_NEWUTS)) |
70 | return 0; | 48 | return old_ns; |
71 | |||
72 | if (!capable(CAP_SYS_ADMIN)) { | ||
73 | err = -EPERM; | ||
74 | goto out; | ||
75 | } | ||
76 | 49 | ||
77 | new_ns = clone_uts_ns(old_ns); | 50 | new_ns = clone_uts_ns(old_ns); |
78 | if (!new_ns) { | ||
79 | err = -ENOMEM; | ||
80 | goto out; | ||
81 | } | ||
82 | tsk->nsproxy->uts_ns = new_ns; | ||
83 | 51 | ||
84 | out: | ||
85 | put_uts_ns(old_ns); | 52 | put_uts_ns(old_ns); |
86 | return err; | 53 | return new_ns; |
87 | } | 54 | } |
88 | 55 | ||
89 | void free_uts_ns(struct kref *kref) | 56 | void free_uts_ns(struct kref *kref) |