Diffstat (limited to 'kernel')
106 files changed, 7918 insertions, 3660 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 
 config MUTEX_SPIN_ON_OWNER
-	def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
+	def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_perf_event.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -103,8 +101,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b33513a08beb..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
 
 /* Determine if any context name data matches a rule's watch data */
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
- * otherwise. */
+ * otherwise.
+ *
+ * If task_creation is true, this is an explicit indication that we are
+ * filtering a task rule at task creation time.  This and tsk == current are
+ * the only situations where tsk->cred may be accessed without an rcu read lock.
+ */
 static int audit_filter_rules(struct task_struct *tsk,
 			      struct audit_krule *rule,
 			      struct audit_context *ctx,
 			      struct audit_names *name,
-			      enum audit_state *state)
+			      enum audit_state *state,
+			      bool task_creation)
 {
-	const struct cred *cred = get_task_cred(tsk);
+	const struct cred *cred;
 	int i, j, need_sid = 1;
 	u32 sid;
 
+	cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
 	for (i = 0; i < rule->field_count; i++) {
 		struct audit_field *f = &rule->fields[i];
 		int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 			break;
 		}
 
-		if (!result) {
-			put_cred(cred);
+		if (!result)
 			return 0;
-		}
 	}
 
 	if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
 	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
 	}
-	put_cred(cred);
 	return 1;
 }
 
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
-		if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+		if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+				       &state, true)) {
 			if (state == AUDIT_RECORD_CONTEXT)
 				*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
 			rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 	list_for_each_entry_rcu(e, list, list) {
 		if ((e->rule.mask[word] & bit) == bit &&
 		    audit_filter_rules(tsk, &e->rule, ctx, NULL,
-				       &state)) {
+				       &state, false)) {
 			rcu_read_unlock();
 			ctx->current_state = state;
 			return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 
 	list_for_each_entry_rcu(e, list, list) {
 		if ((e->rule.mask[word] & bit) == bit &&
-		    audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
+		    audit_filter_rules(tsk, &e->rule, ctx, n,
+				       &state, false)) {
 			rcu_read_unlock();
 			ctx->current_state = state;
 			return;
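In the audit_filter_rules() change above, an unconditional get_task_cred()/put_cred() pair is replaced by rcu_dereference_check(), whose second argument is a condition naming the cases in which the pointer may be read without rcu_read_lock(). The sketch below only illustrates that call shape, using a hypothetical struct conf and conf_lock rather than the audit code's task credentials.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct conf {
	int level;
};

/* hypothetical example state, not part of the audit patch */
static struct conf __rcu *active_conf;
static DEFINE_SPINLOCK(conf_lock);

static int conf_level(void)
{
	struct conf *c;

	/*
	 * Callers either hold rcu_read_lock() or conf_lock; the second
	 * argument tells lockdep that holding conf_lock alone is also
	 * sufficient, much like "tsk == current || task_creation" above.
	 */
	c = rcu_dereference_check(active_conf, lockdep_is_held(&conf_lock));
	return c ? c->level : 0;
}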
diff --git a/kernel/capability.c b/kernel/capability.c
index 32a80e08ff4b..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
  */
 
 const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
 
 EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
 
 int file_caps_enabled = 1;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25c7eb52de1a..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
 
 #include <asm/atomic.h>
 
@@ -326,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 	return &css_set_table[index];
 }
 
-static void free_css_set_rcu(struct rcu_head *obj)
-{
-	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
-	kfree(cg);
-}
-
 /* We don't maintain the lists running through each css_set to its
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 	}
 
 	write_unlock(&css_set_lock);
-	call_rcu(&cg->rcu_head, free_css_set_rcu);
+	kfree_rcu(cg, rcu_head);
 }
 
 /*
@@ -812,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	return ret;
 }
 
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
-	struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
-	kfree(cgrp);
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +844,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
-		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
+		kfree_rcu(cgrp, rcu_head);
 	}
 	iput(inode);
 }
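The two conversions above (and the free_css_id() one later in this file) follow the same mechanical pattern: when an RCU callback does nothing but container_of() plus kfree(), it can be collapsed into kfree_rcu(), which takes the pointer and the name of the struct rcu_head member. A hedged before/after sketch with a hypothetical struct item:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
	int value;
	struct rcu_head rcu_head;
};

/* Old style: a dedicated callback that only frees the enclosing object. */
static void item_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct item, rcu_head));
}

static void item_release_old(struct item *it)
{
	call_rcu(&it->rcu_head, item_free_rcu);
}

/* New style: kfree_rcu() generates an equivalent callback internally. */
static void item_release_new(struct item *it)
{
	kfree_rcu(it, rcu_head);
}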
@@ -1748,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+			       struct task_struct *tsk, bool guarantee)
+{
+	struct css_set *oldcg;
+	struct css_set *newcg;
+
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	/* locate or allocate a new css_set for this task. */
+	if (guarantee) {
+		/* we know the css_set we want already exists. */
+		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+		read_lock(&css_set_lock);
+		newcg = find_existing_css_set(oldcg, cgrp, template);
+		BUG_ON(!newcg);
+		get_css_set(newcg);
+		read_unlock(&css_set_lock);
+	} else {
+		might_sleep();
+		/* find_css_set will give us newcg already referenced. */
+		newcg = find_css_set(oldcg, cgrp);
+		if (!newcg) {
+			put_css_set(oldcg);
+			return -ENOMEM;
+		}
+	}
+	put_css_set(oldcg);
+
+	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		put_css_set(newcg);
+		return -ESRCH;
+	}
+	rcu_assign_pointer(tsk->cgroups, newcg);
+	task_unlock(tsk);
+
+	/* Update the css_set linked lists if we're using them */
+	write_lock(&css_set_lock);
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
+	write_unlock(&css_set_lock);
+
+	/*
+	 * We just gained a reference on oldcg by taking it from the task. As
+	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+	 * it here; it will be freed under RCU.
+	 */
+	put_css_set(oldcg);
+
+	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1758,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval = 0;
+	int retval;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
-	struct css_set *cg;
-	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 
 	/* Nothing to do if the task is already in that cgroup */
@@ -1772,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk, false);
+			retval = ss->can_attach(ss, cgrp, tsk);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1784,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 				goto out;
 			}
 		}
+		if (ss->can_attach_task) {
+			retval = ss->can_attach_task(cgrp, tsk);
+			if (retval) {
+				failed_ss = ss;
+				goto out;
+			}
+		}
 	}
 
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-	/*
-	 * Locate or allocate a new css_set for this task,
-	 * based on its final set of cgroups
-	 */
-	newcg = find_css_set(cg, cgrp);
-	put_css_set(cg);
-	if (!newcg) {
-		retval = -ENOMEM;
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval)
 		goto out;
-	}
-
-	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		retval = -ESRCH;
-		goto out;
-	}
-	rcu_assign_pointer(tsk->cgroups, newcg);
-	task_unlock(tsk);
-
-	/* Update the css_set linked lists if we're using them */
-	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list))
-		list_move(&tsk->cg_list, &newcg->tasks);
-	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+		if (ss->attach_task)
+			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk, false);
+			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
-	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
 	synchronize_rcu();
-	put_css_set(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1842,7 +1881,7 @@ out:
 				 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, tsk, false);
+				ss->cancel_attach(ss, cgrp, tsk);
 		}
 	}
 	return retval;
@@ -1873,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;
+	struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+			    struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+
+	/* ensure a new css_set will exist for this thread */
+	newcg = find_css_set(cg, cgrp);
+	if (!newcg)
+		return -ENOMEM;
+	/* add it to the list */
+	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+	if (!cg_entry) {
+		put_css_set(newcg);
+		return -ENOMEM;
+	}
+	cg_entry->cg = newcg;
+	list_add(&cg_entry->links, newcg_list);
+	return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+	int retval, i, group_size;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	bool cancel_failed_ss = false;
+	/* guaranteed to be initialized later, but the compiler needs this */
+	struct cgroup *oldcgrp = NULL;
+	struct css_set *oldcg;
+	struct cgroupfs_root *root = cgrp->root;
+	/* threadgroup list cursor and array */
+	struct task_struct *tsk;
+	struct flex_array *group;
+	/*
+	 * we need to make sure we have css_sets for all the tasks we're
+	 * going to move -before- we actually start moving them, so that in
+	 * case we get an ENOMEM we can bail out before making any changes.
+	 */
+	struct list_head newcg_list;
+	struct cg_list_entry *cg_entry, *temp_nobe;
+
+	/*
+	 * step 0: in order to do expensive, possibly blocking operations for
+	 * every thread, we cannot iterate the thread group list, since it needs
+	 * rcu or tasklist locked. instead, build an array of all threads in the
+	 * group - threadgroup_fork_lock prevents new threads from appearing,
+	 * and if threads exit, this will just be an over-estimate.
+	 */
+	group_size = get_nr_threads(leader);
+	/* flex_array supports very large thread-groups better than kmalloc. */
+	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+				 GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+	/* pre-allocate to guarantee space while iterating in rcu read-side. */
+	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+	if (retval)
+		goto out_free_group_list;
+
+	/* prevent changes to the threadgroup list while we take a snapshot. */
+	rcu_read_lock();
+	if (!thread_group_leader(leader)) {
+		/*
+		 * a race with de_thread from another thread's exec() may strip
+		 * us of our leadership, making while_each_thread unsafe to use
+		 * on this task. if this happens, there is no choice but to
+		 * throw this task away and try again (from cgroup_procs_write);
+		 * this is "double-double-toil-and-trouble-check locking".
+		 */
+		rcu_read_unlock();
+		retval = -EAGAIN;
+		goto out_free_group_list;
+	}
+	/* take a reference on each task in the group to go in the array. */
+	tsk = leader;
+	i = 0;
+	do {
+		/* as per above, nr_threads may decrease, but not increase. */
+		BUG_ON(i >= group_size);
+		get_task_struct(tsk);
+		/*
+		 * saying GFP_ATOMIC has no effect here because we did prealloc
+		 * earlier, but it's good form to communicate our expectations.
+		 */
+		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		BUG_ON(retval != 0);
+		i++;
+	} while_each_thread(leader, tsk);
+	/* remember the number of threads in the array for later. */
+	group_size = i;
+	rcu_read_unlock();
+
+	/*
+	 * step 1: check that we can legitimately attach to the cgroup.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, leader);
+			if (retval) {
+				failed_ss = ss;
+				goto out_cancel_attach;
+			}
+		}
+		/* a callback to be run on every thread in the threadgroup. */
+		if (ss->can_attach_task) {
+			/* run on each task in the threadgroup. */
+			for (i = 0; i < group_size; i++) {
+				tsk = flex_array_get_ptr(group, i);
+				retval = ss->can_attach_task(cgrp, tsk);
+				if (retval) {
+					failed_ss = ss;
+					cancel_failed_ss = true;
+					goto out_cancel_attach;
+				}
+			}
+		}
+	}
+
+	/*
+	 * step 2: make sure css_sets exist for all threads to be migrated.
+	 * we use find_css_set, which allocates a new one if necessary.
+	 */
+	INIT_LIST_HEAD(&newcg_list);
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* nothing to do if this task is already in the cgroup */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* get old css_set pointer */
+		task_lock(tsk);
+		if (tsk->flags & PF_EXITING) {
+			/* ignore this task if it's going away */
+			task_unlock(tsk);
+			continue;
+		}
+		oldcg = tsk->cgroups;
+		get_css_set(oldcg);
+		task_unlock(tsk);
+		/* see if the new one for us is already in the list? */
+		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+			/* was already there, nothing to do. */
+			put_css_set(oldcg);
+		} else {
+			/* we don't already have it. get new one. */
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			put_css_set(oldcg);
+			if (retval)
+				goto out_list_teardown;
+		}
+	}
+
+	/*
+	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
+	 * to move all tasks to the new cgroup, calling ss->attach_task for each
+	 * one along the way. there are no failure cases after here, so this is
+	 * the commit point.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+	}
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* leave current thread as it is if it's already there */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
+		}
+		/* if the thread is PF_EXITING, it can just get skipped. */
+		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		BUG_ON(retval != 0 && retval != -ESRCH);
+	}
+	/* nothing is sensitive to fork() after this point. */
+
+	/*
+	 * step 4: do expensive, non-thread-specific subsystem callbacks.
+	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
+	 * being moved, this call will need to be reworked to communicate that.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, leader);
+	}
+
+	/*
+	 * step 5: success! and cleanup
+	 */
+	synchronize_rcu();
+	cgroup_wakeup_rmdir_waiter(cgrp);
+	retval = 0;
+out_list_teardown:
+	/* clean up the list of prefetched css_sets. */
+	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+		list_del(&cg_entry->links);
+		put_css_set(cg_entry->cg);
+		kfree(cg_entry);
+	}
+out_cancel_attach:
+	/* same deal as in cgroup_attach_task */
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss) {
+				if (cancel_failed_ss && ss->cancel_attach)
+					ss->cancel_attach(ss, cgrp, leader);
+				break;
+			}
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, leader);
+		}
+	}
+	/* clean up the array of referenced threads in the group. */
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		put_task_struct(tsk);
+	}
+out_free_group_list:
+	flex_array_free(group);
+	return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
-		if (!tsk || tsk->flags & PF_EXITING) {
+		if (!tsk) {
 			rcu_read_unlock();
+			cgroup_unlock();
+			return -ESRCH;
+		}
+		if (threadgroup) {
+			/*
+			 * RCU protects this access, since tsk was found in the
+			 * tid map. a race with de_thread may cause group_leader
+			 * to stop being the leader, but cgroup_attach_proc will
+			 * detect it later.
+			 */
+			tsk = tsk->group_leader;
+		} else if (tsk->flags & PF_EXITING) {
+			/* optimization for the single-task-only case */
+			rcu_read_unlock();
+			cgroup_unlock();
 			return -ESRCH;
 		}
 
+		/*
+		 * even if we're attaching all tasks in the thread group, we
+		 * only need to check permissions on one of them.
+		 */
 		tcred = __task_cred(tsk);
 		if (cred->euid &&
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -EACCES;
 		}
 		get_task_struct(tsk);
 		rcu_read_unlock();
 	} else {
-		tsk = current;
+		if (threadgroup)
+			tsk = current->group_leader;
+		else
+			tsk = current;
 		get_task_struct(tsk);
 	}
 
-	ret = cgroup_attach_task(cgrp, tsk);
+	if (threadgroup) {
+		threadgroup_fork_write_lock(tsk);
+		ret = cgroup_attach_proc(cgrp, tsk);
+		threadgroup_fork_write_unlock(tsk);
+	} else {
+		ret = cgroup_attach_task(cgrp, tsk);
+	}
 	put_task_struct(tsk);
+	cgroup_unlock();
 	return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
+	return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
 	int ret;
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
-	ret = attach_task_by_pid(cgrp, pid);
-	cgroup_unlock();
+	do {
+		/*
+		 * attach_proc fails with -EAGAIN if threadgroup leadership
+		 * changes in the middle of the operation, in which case we need
+		 * to find the task_struct for the new leader and start over.
+		 */
+		ret = attach_task_by_pid(cgrp, tgid, true);
+	} while (ret == -EAGAIN);
 	return ret;
 }
 
@@ -3272,9 +3632,9 @@ static struct cftype files[] = {
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
 		.open = cgroup_procs_open,
-		/* .write_u64 = cgroup_procs_write, TODO */
+		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
-		.mode = S_IRUGO,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
@@ -4270,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 }
 
 /**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
-		 char *nodename)
-{
-	struct dentry *dentry;
-	int ret = 0;
-	struct cgroup *parent, *child;
-	struct inode *inode;
-	struct css_set *cg;
-	struct cgroupfs_root *root;
-	struct cgroup_subsys *ss;
-
-	/* We shouldn't be called by an unregistered subsystem */
-	BUG_ON(!subsys->active);
-
-	/* First figure out what hierarchy and cgroup we're dealing
-	 * with, and pin them so we can drop cgroup_mutex */
-	mutex_lock(&cgroup_mutex);
- again:
-	root = subsys->root;
-	if (root == &rootnode) {
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Pin the hierarchy */
-	if (!atomic_inc_not_zero(&root->sb->s_active)) {
-		/* We race with the final deactivate_super() */
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Keep the cgroup alive */
-	task_lock(tsk);
-	parent = task_cgroup(tsk, subsys->subsys_id);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-
-	mutex_unlock(&cgroup_mutex);
-
-	/* Now do the VFS work to create a cgroup */
-	inode = parent->dentry->d_inode;
-
-	/* Hold the parent directory mutex across this operation to
-	 * stop anyone else deleting the new cgroup */
-	mutex_lock(&inode->i_mutex);
-	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
-	if (IS_ERR(dentry)) {
-		printk(KERN_INFO
-		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
-		       PTR_ERR(dentry));
-		ret = PTR_ERR(dentry);
-		goto out_release;
-	}
-
-	/* Create the cgroup directory, which also creates the cgroup */
-	ret = vfs_mkdir(inode, dentry, 0755);
-	child = __d_cgrp(dentry);
-	dput(dentry);
-	if (ret) {
-		printk(KERN_INFO
-		       "Failed to create cgroup %s: %d\n", nodename,
-		       ret);
-		goto out_release;
-	}
-
-	/* The cgroup now exists. Retake cgroup_mutex and check
-	 * that we're still in the same state that we thought we
-	 * were. */
-	mutex_lock(&cgroup_mutex);
-	if ((root != subsys->root) ||
-	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
-		/* Aargh, we raced ... */
-		mutex_unlock(&inode->i_mutex);
-		put_css_set(cg);
-
-		deactivate_super(root->sb);
-		/* The cgroup is still accessible in the VFS, but
-		 * we're not going to try to rmdir() it at this
-		 * point. */
-		printk(KERN_INFO
-		       "Race in cgroup_clone() - leaking cgroup %s\n",
-		       nodename);
-		goto again;
-	}
-
-	/* do any required auto-setup */
-	for_each_subsys(root, ss) {
-		if (ss->post_clone)
-			ss->post_clone(ss, child);
-	}
-
-	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = cgroup_attach_task(child, tsk);
-	mutex_unlock(&cgroup_mutex);
-
- out_release:
-	mutex_unlock(&inode->i_mutex);
-
-	mutex_lock(&cgroup_mutex);
-	put_css_set(cg);
-	mutex_unlock(&cgroup_mutex);
-	deactivate_super(root->sb);
-	return ret;
-}
-
-/**
  * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
  * @task: the task in question
@@ -4623,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 	return ret;
 }
 
-static void __free_css_id_cb(struct rcu_head *head)
-{
-	struct css_id *id;
-
-	id = container_of(head, struct css_id, rcu_head);
-	kfree(id);
-}
-
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 {
 	struct css_id *id = css->id;
@@ -4645,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
 	spin_unlock(&ss->id_lock);
-	call_rcu(&id->rcu_head, __free_css_id_cb);
+	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
 
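Taken together, the cgroup.c changes make the per-cgroup "cgroup.procs" file writable: writing a TGID migrates every thread of that process in one operation (the existing "tasks" file still moves a single TID), and cgroup_procs_write() retries internally when -EAGAIN reports a leadership change. A small userspace illustration, assuming a hierarchy is already mounted at the hypothetical path /sys/fs/cgroup/freezer/demo:

#include <stdio.h>
#include <stdlib.h>

/* Example path only; adjust to wherever the hierarchy is mounted. */
#define PROCS_FILE "/sys/fs/cgroup/freezer/demo/cgroup.procs"

int main(int argc, char **argv)
{
	FILE *f;
	long tgid;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <tgid>\n", argv[0]);
		return 1;
	}
	tgid = strtol(argv[1], NULL, 10);

	f = fopen(PROCS_FILE, "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Writing a thread-group id moves all of its threads at once. */
	if (fprintf(f, "%ld\n", tgid) < 0 || fclose(f) == EOF) {
		perror(PROCS_FILE);
		return 1;
	}
	return 0;
}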
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
 			      struct cgroup *new_cgroup,
-			      struct task_struct *task, bool threadgroup)
+			      struct task_struct *task)
 {
 	struct freezer *freezer;
 
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 	if (freezer->state != CGROUP_THAWED)
 		return -EBUSY;
 
+	return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
 	rcu_read_lock();
-	if (__cgroup_freezing_or_frozen(task)) {
+	if (__cgroup_freezing_or_frozen(tsk)) {
 		rcu_read_unlock();
 		return -EBUSY;
 	}
 	rcu_read_unlock();
-
-	if (threadgroup) {
-		struct task_struct *c;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
-			if (__cgroup_freezing_or_frozen(c)) {
-				rcu_read_unlock();
-				return -EBUSY;
-			}
-		}
-		rcu_read_unlock();
-	}
-
 	return 0;
 }
 
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
 	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
+	.can_attach_task = freezer_can_attach_task,
+	.pre_attach	= NULL,
+	.attach_task	= NULL,
 	.attach		= NULL,
 	.fork		= freezer_fork,
 	.exit		= NULL,
diff --git a/kernel/compat.c b/kernel/compat.c
index 38b1d2c1cbe8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 	return compat_jiffies_to_clock_t(jiffies);
 }
 
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
 /*
  * Assumption: old_sigset_t and compat_old_sigset_t are both
  * types that can be passed to put_user()/get_user().
@@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
 	return ret;
 }
 
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
 asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
 		compat_old_sigset_t __user *oset)
 {
@@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
 	return ret;
 }
 
+#endif
+
 asmlinkage long compat_sys_setrlimit(unsigned int resource,
 		struct compat_rlimit __user *rlim)
 {
@@ -890,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 {
 	compat_sigset_t s32;
 	sigset_t s;
-	int sig;
 	struct timespec t;
 	siginfo_t info;
-	long ret, timeout = 0;
+	long ret;
 
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
@@ -901,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
 		return -EFAULT;
 	sigset_from_compat(&s, &s32);
-	sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
-	signotset(&s);
 
 	if (uts) {
-		if (get_compat_timespec (&t, uts))
+		if (get_compat_timespec(&t, uts))
 			return -EFAULT;
-		if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
-				|| t.tv_sec < 0)
-			return -EINVAL;
 	}
 
-	spin_lock_irq(&current->sighand->siglock);
-	sig = dequeue_signal(current, &s, &info);
-	if (!sig) {
-		timeout = MAX_SCHEDULE_TIMEOUT;
-		if (uts)
-			timeout = timespec_to_jiffies(&t)
-				+(t.tv_sec || t.tv_nsec);
-		if (timeout) {
-			current->real_blocked = current->blocked;
-			sigandsets(&current->blocked, &current->blocked, &s);
-
-			recalc_sigpending();
-			spin_unlock_irq(&current->sighand->siglock);
-
-			timeout = schedule_timeout_interruptible(timeout);
-
-			spin_lock_irq(&current->sighand->siglock);
-			sig = dequeue_signal(current, &s, &info);
-			current->blocked = current->real_blocked;
-			siginitset(&current->real_blocked, 0);
-			recalc_sigpending();
-		}
-	}
-	spin_unlock_irq(&current->sighand->siglock);
+	ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
 
-	if (sig) {
-		ret = sig;
-		if (uinfo) {
-			if (copy_siginfo_to_user32(uinfo, &info))
-				ret = -EFAULT;
-		}
-	}else {
-		ret = timeout?-EINTR:-EAGAIN;
+	if (ret > 0 && uinfo) {
+		if (copy_siginfo_to_user32(uinfo, &info))
+			ret = -EFAULT;
 	}
+
 	return ret;
 
 }
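After this change compat_sys_rt_sigtimedwait() converts its arguments and hands them to do_sigtimedwait(), the helper used by the native syscall, so both entry points share one wait loop: a positive return is the dequeued signal number, -EAGAIN means the timeout expired, and -EINTR means the wait was interrupted (glibc folds the negatives into errno). A small native-C illustration of those user-visible semantics, not of the compat code itself:

#define _POSIX_C_SOURCE 199309L
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec timeout = { .tv_sec = 2, .tv_nsec = 0 };
	int sig;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	/* Block SIGUSR1 so it stays pending until sigtimedwait() collects it. */
	sigprocmask(SIG_BLOCK, &set, NULL);

	sig = sigtimedwait(&set, &info, &timeout);
	if (sig < 0)
		perror("sigtimedwait");	/* errno EAGAIN on timeout, EINTR if interrupted */
	else
		printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
	return 0;
}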
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
-	if (val < -1 || val >= SD_LV_MAX)
+	if (val < -1 || val >= sched_domain_level_max)
 		return -EINVAL;
 #endif
 
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 	return val;
 }
 
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			     struct task_struct *tsk, bool threadgroup)
+			     struct task_struct *tsk)
 {
-	int ret;
 	struct cpuset *cs = cgroup_cs(cont);
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	if (tsk->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
-	ret = security_task_setscheduler(tsk);
-	if (ret)
-		return ret;
-	if (threadgroup) {
-		struct task_struct *c;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			ret = security_task_setscheduler(c);
-			if (ret) {
-				rcu_read_unlock();
-				return ret;
-			}
-		}
-		rcu_read_unlock();
-	}
 	return 0;
 }
 
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
-			       struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+	return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+	struct cpuset *cs = cgroup_cs(cont);
+
+	if (cs == &top_cpuset)
+		cpumask_copy(cpus_attach, cpu_possible_mask);
+	else
+		guarantee_online_cpus(cs, cpus_attach);
+
+	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
 {
 	int err;
+	struct cpuset *cs = cgroup_cs(cont);
+
 	/*
 	 * can_attach beforehand should guarantee that this doesn't fail.
 	 * TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	cpuset_change_task_nodemask(tsk, to);
+	cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
 	cpuset_update_task_spread_flag(cs, tsk);
-
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			  struct cgroup *oldcont, struct task_struct *tsk,
-			  bool threadgroup)
+			  struct cgroup *oldcont, struct task_struct *tsk)
 {
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	static nodemask_t to;		/* protected by cgroup_mutex */
 
-	if (cs == &top_cpuset) {
-		cpumask_copy(cpus_attach, cpu_possible_mask);
-	} else {
-		guarantee_online_cpus(cs, cpus_attach);
-	}
-	guarantee_online_mems(cs, &to);
-
-	/* do per-task migration stuff possibly for each in the threadgroup */
-	cpuset_attach_task(tsk, &to, cs);
-	if (threadgroup) {
-		struct task_struct *c;
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			cpuset_attach_task(c, &to, cs);
-		}
-		rcu_read_unlock();
-	}
-
-	/* change mm; only needs to be done once even if threadgroup */
-	to = cs->mems_allowed;
+	/*
+	 * Change mm, possibly for multiple threads in a threadgroup. This is
+	 * expensive and may sleep.
+	 */
+	cpuset_attach_nodemask_from = oldcs->mems_allowed;
+	cpuset_attach_nodemask_to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, &to);
+		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
+			cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+					  &cpuset_attach_nodemask_to);
 		mmput(mm);
 	}
 }
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 }
 
 /*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
  *
  * Currently we refuse to set up the cgroup - thereby
  * refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.create = cpuset_create,
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
+	.can_attach_task = cpuset_can_attach_task,
+	.pre_attach = cpuset_pre_attach,
+	.attach_task = cpuset_attach_task,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs = task_cs(tsk);
 	if (cs)
-		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+		do_set_cpus_allowed(tsk, cs->cpus_allowed);
 	rcu_read_unlock();
 
 	/*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 		 * Like above we can temporary set any mask and rely on
 		 * set_cpus_allowed_ptr() as synchronization point.
 		 */
-		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+		do_set_cpus_allowed(tsk, cpu_possible_mask);
 		cpu = cpumask_any(cpu_active_mask);
 	}
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 8093c16b84b1..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
@@ -49,10 +49,10 @@ struct cred init_cred = {
 	.magic			= CRED_MAGIC,
 #endif
 	.securebits		= SECUREBITS_DEFAULT,
-	.cap_inheritable	= CAP_INIT_INH_SET,
+	.cap_inheritable	= CAP_EMPTY_SET,
 	.cap_permitted		= CAP_FULL_SET,
-	.cap_effective		= CAP_INIT_EFF_SET,
-	.cap_bset		= CAP_INIT_BSET,
+	.cap_effective		= CAP_FULL_SET,
+	.cap_bset		= CAP_FULL_SET,
 	.user			= INIT_USER,
 	.user_ns		= &init_user_ns,
 	.group_info		= &init_groups,
diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 000000000000..1ce23d3d8394 --- /dev/null +++ b/kernel/events/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | ifdef CONFIG_FUNCTION_TRACER | ||
2 | CFLAGS_REMOVE_core.o = -pg | ||
3 | endif | ||
4 | |||
5 | obj-y := core.o | ||
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | ||
diff --git a/kernel/perf_event.c b/kernel/events/core.c index 8e81a9860a0d..9efe7108ccaf 100644 --- a/kernel/perf_event.c +++ b/kernel/events/core.c | |||
@@ -2,8 +2,8 @@ | |||
2 | * Performance events core code: | 2 | * Performance events core code: |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * | 8 | * |
9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
@@ -39,10 +39,10 @@ | |||
39 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
40 | 40 | ||
41 | struct remote_function_call { | 41 | struct remote_function_call { |
42 | struct task_struct *p; | 42 | struct task_struct *p; |
43 | int (*func)(void *info); | 43 | int (*func)(void *info); |
44 | void *info; | 44 | void *info; |
45 | int ret; | 45 | int ret; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | static void remote_function(void *data) | 48 | static void remote_function(void *data) |
@@ -76,10 +76,10 @@ static int | |||
76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | 76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) |
77 | { | 77 | { |
78 | struct remote_function_call data = { | 78 | struct remote_function_call data = { |
79 | .p = p, | 79 | .p = p, |
80 | .func = func, | 80 | .func = func, |
81 | .info = info, | 81 | .info = info, |
82 | .ret = -ESRCH, /* No such (running) process */ | 82 | .ret = -ESRCH, /* No such (running) process */ |
83 | }; | 83 | }; |
84 | 84 | ||
85 | if (task_curr(p)) | 85 | if (task_curr(p)) |
@@ -100,10 +100,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | |||
100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | 100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) |
101 | { | 101 | { |
102 | struct remote_function_call data = { | 102 | struct remote_function_call data = { |
103 | .p = NULL, | 103 | .p = NULL, |
104 | .func = func, | 104 | .func = func, |
105 | .info = info, | 105 | .info = info, |
106 | .ret = -ENXIO, /* No such CPU */ | 106 | .ret = -ENXIO, /* No such CPU */ |
107 | }; | 107 | }; |
108 | 108 | ||
109 | smp_call_function_single(cpu, remote_function, &data, 1); | 109 | smp_call_function_single(cpu, remote_function, &data, 1); |
@@ -125,7 +125,7 @@ enum event_type_t { | |||
125 | * perf_sched_events : >0 events exist | 125 | * perf_sched_events : >0 events exist |
126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | 126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu |
127 | */ | 127 | */ |
128 | atomic_t perf_sched_events __read_mostly; | 128 | struct jump_label_key perf_sched_events __read_mostly; |
129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
130 | 130 | ||
131 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
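
perf_sched_events (and, further down, perf_swevent_enabled) change type here from atomic_t to struct jump_label_key. Conceptually a jump label is a reference-counted boolean whose fast-path test is patched to a no-op while the count is zero; the sketch below models only the counting and the fast-path check, not the code patching, and all names are invented.

    #include <stdio.h>

    struct sketch_key { int enabled; };      /* the kernel version patches code instead */

    static struct sketch_key sched_events;

    static void key_inc(struct sketch_key *k) { k->enabled++; }  /* event created   */
    static void key_dec(struct sketch_key *k) { k->enabled--; }  /* event destroyed */

    static int key_enabled(const struct sketch_key *k)           /* hot-path test */
    {
            return k->enabled > 0;
    }

    static void context_switch_hot_path(void)
    {
            if (key_enabled(&sched_events))  /* ideally compiled down to a NOP or jump */
                    printf("perf bookkeeping runs\n");
            else
                    printf("fast path, no perf overhead\n");
    }

    int main(void)
    {
            context_switch_hot_path();
            key_inc(&sched_events);
            context_switch_hot_path();
            key_dec(&sched_events);
            return 0;
    }
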
@@ -586,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx) | |||
586 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 586 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); |
587 | } | 587 | } |
588 | 588 | ||
589 | static void free_ctx(struct rcu_head *head) | ||
590 | { | ||
591 | struct perf_event_context *ctx; | ||
592 | |||
593 | ctx = container_of(head, struct perf_event_context, rcu_head); | ||
594 | kfree(ctx); | ||
595 | } | ||
596 | |||
597 | static void put_ctx(struct perf_event_context *ctx) | 589 | static void put_ctx(struct perf_event_context *ctx) |
598 | { | 590 | { |
599 | if (atomic_dec_and_test(&ctx->refcount)) { | 591 | if (atomic_dec_and_test(&ctx->refcount)) { |
@@ -601,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
601 | put_ctx(ctx->parent_ctx); | 593 | put_ctx(ctx->parent_ctx); |
602 | if (ctx->task) | 594 | if (ctx->task) |
603 | put_task_struct(ctx->task); | 595 | put_task_struct(ctx->task); |
604 | call_rcu(&ctx->rcu_head, free_ctx); | 596 | kfree_rcu(ctx, rcu_head); |
605 | } | 597 | } |
606 | } | 598 | } |
607 | 599 | ||
@@ -5036,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
5036 | else | 5028 | else |
5037 | perf_event_output(event, nmi, data, regs); | 5029 | perf_event_output(event, nmi, data, regs); |
5038 | 5030 | ||
5031 | if (event->fasync && event->pending_kill) { | ||
5032 | if (nmi) { | ||
5033 | event->pending_wakeup = 1; | ||
5034 | irq_work_queue(&event->pending); | ||
5035 | } else | ||
5036 | perf_event_wakeup(event); | ||
5037 | } | ||
5038 | |||
5039 | return ret; | 5039 | return ret; |
5040 | } | 5040 | } |
5041 | 5041 | ||
@@ -5331,14 +5331,6 @@ swevent_hlist_deref(struct swevent_htable *swhash) | |||
5331 | lockdep_is_held(&swhash->hlist_mutex)); | 5331 | lockdep_is_held(&swhash->hlist_mutex)); |
5332 | } | 5332 | } |
5333 | 5333 | ||
5334 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | ||
5335 | { | ||
5336 | struct swevent_hlist *hlist; | ||
5337 | |||
5338 | hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); | ||
5339 | kfree(hlist); | ||
5340 | } | ||
5341 | |||
5342 | static void swevent_hlist_release(struct swevent_htable *swhash) | 5334 | static void swevent_hlist_release(struct swevent_htable *swhash) |
5343 | { | 5335 | { |
5344 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); | 5336 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
@@ -5347,7 +5339,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) | |||
5347 | return; | 5339 | return; |
5348 | 5340 | ||
5349 | rcu_assign_pointer(swhash->swevent_hlist, NULL); | 5341 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
5350 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 5342 | kfree_rcu(hlist, rcu_head); |
5351 | } | 5343 | } |
5352 | 5344 | ||
5353 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 5345 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
@@ -5429,7 +5421,7 @@ fail: | |||
5429 | return err; | 5421 | return err; |
5430 | } | 5422 | } |
5431 | 5423 | ||
5432 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 5424 | struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
5433 | 5425 | ||
5434 | static void sw_perf_event_destroy(struct perf_event *event) | 5426 | static void sw_perf_event_destroy(struct perf_event *event) |
5435 | { | 5427 | { |
@@ -7410,26 +7402,12 @@ static int __perf_cgroup_move(void *info) | |||
7410 | return 0; | 7402 | return 0; |
7411 | } | 7403 | } |
7412 | 7404 | ||
7413 | static void perf_cgroup_move(struct task_struct *task) | 7405 | static void |
7406 | perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) | ||
7414 | { | 7407 | { |
7415 | task_function_call(task, __perf_cgroup_move, task); | 7408 | task_function_call(task, __perf_cgroup_move, task); |
7416 | } | 7409 | } |
7417 | 7410 | ||
7418 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7419 | struct cgroup *old_cgrp, struct task_struct *task, | ||
7420 | bool threadgroup) | ||
7421 | { | ||
7422 | perf_cgroup_move(task); | ||
7423 | if (threadgroup) { | ||
7424 | struct task_struct *c; | ||
7425 | rcu_read_lock(); | ||
7426 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
7427 | perf_cgroup_move(c); | ||
7428 | } | ||
7429 | rcu_read_unlock(); | ||
7430 | } | ||
7431 | } | ||
7432 | |||
7433 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7411 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
7434 | struct cgroup *old_cgrp, struct task_struct *task) | 7412 | struct cgroup *old_cgrp, struct task_struct *task) |
7435 | { | 7413 | { |
@@ -7441,15 +7419,15 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7441 | if (!(task->flags & PF_EXITING)) | 7419 | if (!(task->flags & PF_EXITING)) |
7442 | return; | 7420 | return; |
7443 | 7421 | ||
7444 | perf_cgroup_move(task); | 7422 | perf_cgroup_attach_task(cgrp, task); |
7445 | } | 7423 | } |
7446 | 7424 | ||
7447 | struct cgroup_subsys perf_subsys = { | 7425 | struct cgroup_subsys perf_subsys = { |
7448 | .name = "perf_event", | 7426 | .name = "perf_event", |
7449 | .subsys_id = perf_subsys_id, | 7427 | .subsys_id = perf_subsys_id, |
7450 | .create = perf_cgroup_create, | 7428 | .create = perf_cgroup_create, |
7451 | .destroy = perf_cgroup_destroy, | 7429 | .destroy = perf_cgroup_destroy, |
7452 | .exit = perf_cgroup_exit, | 7430 | .exit = perf_cgroup_exit, |
7453 | .attach = perf_cgroup_attach, | 7431 | .attach_task = perf_cgroup_attach_task, |
7454 | }; | 7432 | }; |
7455 | #endif /* CONFIG_CGROUP_PERF */ | 7433 | #endif /* CONFIG_CGROUP_PERF */ |
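
The core.c hunks above drop the free_ctx() and swevent_hlist_release_rcu() wrappers, whose only job was container_of() plus kfree(), in favour of kfree_rcu(). Below is a user-space sketch of that pattern with the grace-period deferral stubbed out (GNU C for __typeof__); none of this is the kernel implementation.

    #include <stddef.h>
    #include <stdlib.h>

    struct rcu_head { void (*func)(struct rcu_head *); };

    struct ctx {
            int data;
            struct rcu_head rcu_head;
    };

    /* stub: a real call_rcu() would defer the callback past a grace period */
    static void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
    {
            func(head);
    }

    /* old pattern: one wrapper per type, only to recover the object and free it */
    static void free_ctx(struct rcu_head *head)
    {
            free((char *)head - offsetof(struct ctx, rcu_head));
    }

    /* new pattern: a kfree_rcu()-style macro derives the offset itself, so the
     * per-type wrapper disappears (deferral again stubbed out here) */
    #define sketch_free_rcu(ptr, field) \
            free((char *)&(ptr)->field - offsetof(__typeof__(*(ptr)), field))

    int main(void)
    {
            struct ctx *a = calloc(1, sizeof(*a));
            struct ctx *b = calloc(1, sizeof(*b));

            call_rcu(&a->rcu_head, free_ctx);  /* what the removed code did        */
            sketch_free_rcu(b, rcu_head);      /* what kfree_rcu() replaces it with */
            return 0;
    }
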
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 086adf25a55e..086adf25a55e 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
diff --git a/kernel/exit.c b/kernel/exit.c index 8dd874181542..f2b321bae440 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -561,29 +561,28 @@ void exit_files(struct task_struct *tsk) | |||
561 | 561 | ||
562 | #ifdef CONFIG_MM_OWNER | 562 | #ifdef CONFIG_MM_OWNER |
563 | /* | 563 | /* |
564 | * Task p is exiting and it owned mm, lets find a new owner for it | 564 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
565 | */ | 565 | */ |
566 | static inline int | ||
567 | mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) | ||
568 | { | ||
569 | /* | ||
570 | * If there are other users of the mm and the owner (us) is exiting | ||
571 | * we need to find a new owner to take on the responsibility. | ||
572 | */ | ||
573 | if (atomic_read(&mm->mm_users) <= 1) | ||
574 | return 0; | ||
575 | if (mm->owner != p) | ||
576 | return 0; | ||
577 | return 1; | ||
578 | } | ||
579 | |||
580 | void mm_update_next_owner(struct mm_struct *mm) | 566 | void mm_update_next_owner(struct mm_struct *mm) |
581 | { | 567 | { |
582 | struct task_struct *c, *g, *p = current; | 568 | struct task_struct *c, *g, *p = current; |
583 | 569 | ||
584 | retry: | 570 | retry: |
585 | if (!mm_need_new_owner(mm, p)) | 571 | /* |
572 | * If the exiting or execing task is not the owner, it's | ||
573 | * someone else's problem. | ||
574 | */ | ||
575 | if (mm->owner != p) | ||
586 | return; | 576 | return; |
577 | /* | ||
578 | * The current owner is exiting/execing and there are no other | ||
579 | * candidates. Do not leave the mm pointing to a possibly | ||
580 | * freed task structure. | ||
581 | */ | ||
582 | if (atomic_read(&mm->mm_users) <= 1) { | ||
583 | mm->owner = NULL; | ||
584 | return; | ||
585 | } | ||
587 | 586 | ||
588 | read_lock(&tasklist_lock); | 587 | read_lock(&tasklist_lock); |
589 | /* | 588 | /* |
@@ -1377,11 +1376,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) | |||
1377 | return NULL; | 1376 | return NULL; |
1378 | } | 1377 | } |
1379 | 1378 | ||
1380 | /* | 1379 | /** |
1381 | * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold | 1380 | * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED |
1382 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1381 | * @wo: wait options |
1383 | * the lock and this task is uninteresting. If we return nonzero, we have | 1382 | * @ptrace: is the wait for ptrace |
1384 | * released the lock and the system call should return. | 1383 | * @p: task to wait for |
1384 | * | ||
1385 | * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. | ||
1386 | * | ||
1387 | * CONTEXT: | ||
1388 | * read_lock(&tasklist_lock), which is released if return value is | ||
1389 | * non-zero. Also, grabs and releases @p->sighand->siglock. | ||
1390 | * | ||
1391 | * RETURNS: | ||
1392 | * 0 if wait condition didn't exist and search for other wait conditions | ||
1393 | * should continue. Non-zero return, -errno on failure and @p's pid on | ||
1394 | * success, implies that tasklist_lock is released and wait condition | ||
1395 | * search should terminate. | ||
1385 | */ | 1396 | */ |
1386 | static int wait_task_stopped(struct wait_opts *wo, | 1397 | static int wait_task_stopped(struct wait_opts *wo, |
1387 | int ptrace, struct task_struct *p) | 1398 | int ptrace, struct task_struct *p) |
@@ -1397,6 +1408,9 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1397 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) | 1408 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) |
1398 | return 0; | 1409 | return 0; |
1399 | 1410 | ||
1411 | if (!task_stopped_code(p, ptrace)) | ||
1412 | return 0; | ||
1413 | |||
1400 | exit_code = 0; | 1414 | exit_code = 0; |
1401 | spin_lock_irq(&p->sighand->siglock); | 1415 | spin_lock_irq(&p->sighand->siglock); |
1402 | 1416 | ||
@@ -1538,33 +1552,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
1538 | return 0; | 1552 | return 0; |
1539 | } | 1553 | } |
1540 | 1554 | ||
1541 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { | 1555 | /* dead body doesn't have much to contribute */ |
1556 | if (p->exit_state == EXIT_DEAD) | ||
1557 | return 0; | ||
1558 | |||
1559 | /* slay zombie? */ | ||
1560 | if (p->exit_state == EXIT_ZOMBIE) { | ||
1561 | /* | ||
1562 | * A zombie ptracee is only visible to its ptracer. | ||
1563 | * Notification and reaping will be cascaded to the real | ||
1564 | * parent when the ptracer detaches. | ||
1565 | */ | ||
1566 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { | ||
1567 | /* it will become visible, clear notask_error */ | ||
1568 | wo->notask_error = 0; | ||
1569 | return 0; | ||
1570 | } | ||
1571 | |||
1572 | /* we don't reap group leaders with subthreads */ | ||
1573 | if (!delay_group_leader(p)) | ||
1574 | return wait_task_zombie(wo, p); | ||
1575 | |||
1542 | /* | 1576 | /* |
1543 | * This child is hidden by ptrace. | 1577 | * Allow access to stopped/continued state via zombie by |
1544 | * We aren't allowed to see it now, but eventually we will. | 1578 | * falling through. Clearing of notask_error is complex. |
1579 | * | ||
1580 | * When !@ptrace: | ||
1581 | * | ||
1582 | * If WEXITED is set, notask_error should naturally be | ||
1583 | * cleared. If not, subset of WSTOPPED|WCONTINUED is set, | ||
1584 | * so, if there are live subthreads, there are events to | ||
1585 | * wait for. If all subthreads are dead, it's still safe | ||
1586 | * to clear - this function will be called again in finite | ||
1587 | * amount of time once all the subthreads are released and | ||
1588 | * will then return without clearing. | ||
1589 | * | ||
1590 | * When @ptrace: | ||
1591 | * | ||
1592 | * Stopped state is per-task and thus can't change once the | ||
1593 | * target task dies. Only continued and exited can happen. | ||
1594 | * Clear notask_error if WCONTINUED | WEXITED. | ||
1595 | */ | ||
1596 | if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) | ||
1597 | wo->notask_error = 0; | ||
1598 | } else { | ||
1599 | /* | ||
1600 | * If @p is ptraced by a task in its real parent's group, | ||
1601 | * hide group stop/continued state when looking at @p as | ||
1602 | * the real parent; otherwise, a single stop can be | ||
1603 | * reported twice as group and ptrace stops. | ||
1604 | * | ||
1605 | * If a ptracer wants to distinguish the two events for its | ||
1606 | * own children, it should create a separate process which | ||
1607 | * takes the role of real parent. | ||
1608 | */ | ||
1609 | if (likely(!ptrace) && task_ptrace(p) && | ||
1610 | same_thread_group(p->parent, p->real_parent)) | ||
1611 | return 0; | ||
1612 | |||
1613 | /* | ||
1614 | * @p is alive and it's gonna stop, continue or exit, so | ||
1615 | * there always is something to wait for. | ||
1545 | */ | 1616 | */ |
1546 | wo->notask_error = 0; | 1617 | wo->notask_error = 0; |
1547 | return 0; | ||
1548 | } | 1618 | } |
1549 | 1619 | ||
1550 | if (p->exit_state == EXIT_DEAD) | ||
1551 | return 0; | ||
1552 | |||
1553 | /* | 1620 | /* |
1554 | * We don't reap group leaders with subthreads. | 1621 | * Wait for stopped. Depending on @ptrace, different stopped state |
1622 | * is used and the two don't interact with each other. | ||
1555 | */ | 1623 | */ |
1556 | if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) | 1624 | ret = wait_task_stopped(wo, ptrace, p); |
1557 | return wait_task_zombie(wo, p); | 1625 | if (ret) |
1626 | return ret; | ||
1558 | 1627 | ||
1559 | /* | 1628 | /* |
1560 | * It's stopped or running now, so it might | 1629 | * Wait for continued. There's only one continued state and the |
1561 | * later continue, exit, or stop again. | 1630 | * ptracer can consume it which can confuse the real parent. Don't |
1631 | * use WCONTINUED from ptracer. You don't need or want it. | ||
1562 | */ | 1632 | */ |
1563 | wo->notask_error = 0; | ||
1564 | |||
1565 | if (task_stopped_code(p, ptrace)) | ||
1566 | return wait_task_stopped(wo, ptrace, p); | ||
1567 | |||
1568 | return wait_task_continued(wo, p); | 1633 | return wait_task_continued(wo, p); |
1569 | } | 1634 | } |
1570 | 1635 | ||
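
The mm_update_next_owner() change above folds the old helper into two guard clauses and, when the exiting owner is the last user, clears mm->owner instead of leaving it dangling. Restated as a standalone sketch with stub types (the search for a new owner is only hinted at; this is not kernel code):

    #include <stdio.h>

    struct task { int pid; };
    struct mm   { struct task *owner; int users; };

    static void update_next_owner(struct mm *mm, struct task *exiting)
    {
            /* not the owner: someone else's problem */
            if (mm->owner != exiting)
                    return;

            /* owner is going away and nobody else uses the mm:
             * do not leave mm->owner pointing at a freed task */
            if (mm->users <= 1) {
                    mm->owner = NULL;
                    return;
            }

            /* here the real code walks children, siblings and then all
             * tasks under tasklist_lock to pick a new owner */
            printf("would search for a new owner for mm of pid %d\n", exiting->pid);
    }

    int main(void)
    {
            struct task t = { .pid = 1234 };
            struct mm m = { .owner = &t, .users = 1 };

            update_next_owner(&m, &t);
            printf("owner now %s\n", m.owner ? "set" : "NULL");
            return 0;
    }
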
diff --git a/kernel/extable.c b/kernel/extable.c index 7f8f263f8524..5339705b8241 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr) | |||
72 | return 0; | 72 | return 0; |
73 | } | 73 | } |
74 | 74 | ||
75 | /** | ||
76 | * core_kernel_data - tell if addr points to kernel data | ||
77 | * @addr: address to test | ||
78 | * | ||
79 | * Returns true if @addr passed in is from the core kernel data | ||
80 | * section. | ||
81 | * | ||
82 | * Note: On some archs it may return true for core RODATA, and false | ||
83 | * for others. But will always be true for core RW data. | ||
84 | */ | ||
85 | int core_kernel_data(unsigned long addr) | ||
86 | { | ||
87 | if (addr >= (unsigned long)_sdata && | ||
88 | addr < (unsigned long)_edata) | ||
89 | return 1; | ||
90 | return 0; | ||
91 | } | ||
92 | |||
75 | int __kernel_text_address(unsigned long addr) | 93 | int __kernel_text_address(unsigned long addr) |
76 | { | 94 | { |
77 | if (core_kernel_text(addr)) | 95 | if (core_kernel_text(addr)) |
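
The new core_kernel_data() above is a half-open range check against the _sdata.._edata section bounds. A trivial user-space analogue with made-up bounds, for illustration only:

    #include <stdio.h>

    static unsigned long sdata = 0xc0800000UL;   /* pretend start of .data */
    static unsigned long edata = 0xc0900000UL;   /* pretend end of .data   */

    /* returns non-zero if addr falls inside [sdata, edata) */
    static int in_core_data(unsigned long addr)
    {
            return addr >= sdata && addr < edata;
    }

    int main(void)
    {
            printf("%d %d\n", in_core_data(0xc0810000UL), in_core_data(0xc1000000UL));
            return 0;
    }
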
diff --git a/kernel/fork.c b/kernel/fork.c index e7548dee636b..0276c30401a0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -59,7 +59,6 @@ | |||
59 | #include <linux/taskstats_kern.h> | 59 | #include <linux/taskstats_kern.h> |
60 | #include <linux/random.h> | 60 | #include <linux/random.h> |
61 | #include <linux/tty.h> | 61 | #include <linux/tty.h> |
62 | #include <linux/proc_fs.h> | ||
63 | #include <linux/blkdev.h> | 62 | #include <linux/blkdev.h> |
64 | #include <linux/fs_struct.h> | 63 | #include <linux/fs_struct.h> |
65 | #include <linux/magic.h> | 64 | #include <linux/magic.h> |
@@ -383,15 +382,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
383 | get_file(file); | 382 | get_file(file); |
384 | if (tmp->vm_flags & VM_DENYWRITE) | 383 | if (tmp->vm_flags & VM_DENYWRITE) |
385 | atomic_dec(&inode->i_writecount); | 384 | atomic_dec(&inode->i_writecount); |
386 | spin_lock(&mapping->i_mmap_lock); | 385 | mutex_lock(&mapping->i_mmap_mutex); |
387 | if (tmp->vm_flags & VM_SHARED) | 386 | if (tmp->vm_flags & VM_SHARED) |
388 | mapping->i_mmap_writable++; | 387 | mapping->i_mmap_writable++; |
389 | tmp->vm_truncate_count = mpnt->vm_truncate_count; | ||
390 | flush_dcache_mmap_lock(mapping); | 388 | flush_dcache_mmap_lock(mapping); |
391 | /* insert tmp into the share list, just after mpnt */ | 389 | /* insert tmp into the share list, just after mpnt */ |
392 | vma_prio_tree_add(tmp, mpnt); | 390 | vma_prio_tree_add(tmp, mpnt); |
393 | flush_dcache_mmap_unlock(mapping); | 391 | flush_dcache_mmap_unlock(mapping); |
394 | spin_unlock(&mapping->i_mmap_lock); | 392 | mutex_unlock(&mapping->i_mmap_mutex); |
395 | } | 393 | } |
396 | 394 | ||
397 | /* | 395 | /* |
@@ -522,11 +520,12 @@ struct mm_struct * mm_alloc(void) | |||
522 | struct mm_struct * mm; | 520 | struct mm_struct * mm; |
523 | 521 | ||
524 | mm = allocate_mm(); | 522 | mm = allocate_mm(); |
525 | if (mm) { | 523 | if (!mm) |
526 | memset(mm, 0, sizeof(*mm)); | 524 | return NULL; |
527 | mm = mm_init(mm, current); | 525 | |
528 | } | 526 | memset(mm, 0, sizeof(*mm)); |
529 | return mm; | 527 | mm_init_cpumask(mm); |
528 | return mm_init(mm, current); | ||
530 | } | 529 | } |
531 | 530 | ||
532 | /* | 531 | /* |
@@ -573,6 +572,57 @@ void mmput(struct mm_struct *mm) | |||
573 | } | 572 | } |
574 | EXPORT_SYMBOL_GPL(mmput); | 573 | EXPORT_SYMBOL_GPL(mmput); |
575 | 574 | ||
575 | /* | ||
576 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
577 | * during exec and are not mapped with the mmap system call. | ||
578 | * Callers must hold down_write() on the mm's mmap_sem for these | ||
579 | */ | ||
580 | void added_exe_file_vma(struct mm_struct *mm) | ||
581 | { | ||
582 | mm->num_exe_file_vmas++; | ||
583 | } | ||
584 | |||
585 | void removed_exe_file_vma(struct mm_struct *mm) | ||
586 | { | ||
587 | mm->num_exe_file_vmas--; | ||
588 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ | ||
589 | fput(mm->exe_file); | ||
590 | mm->exe_file = NULL; | ||
591 | } | ||
592 | |||
593 | } | ||
594 | |||
595 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | ||
596 | { | ||
597 | if (new_exe_file) | ||
598 | get_file(new_exe_file); | ||
599 | if (mm->exe_file) | ||
600 | fput(mm->exe_file); | ||
601 | mm->exe_file = new_exe_file; | ||
602 | mm->num_exe_file_vmas = 0; | ||
603 | } | ||
604 | |||
605 | struct file *get_mm_exe_file(struct mm_struct *mm) | ||
606 | { | ||
607 | struct file *exe_file; | ||
608 | |||
609 | /* We need mmap_sem to protect against races with removal of | ||
610 | * VM_EXECUTABLE vmas */ | ||
611 | down_read(&mm->mmap_sem); | ||
612 | exe_file = mm->exe_file; | ||
613 | if (exe_file) | ||
614 | get_file(exe_file); | ||
615 | up_read(&mm->mmap_sem); | ||
616 | return exe_file; | ||
617 | } | ||
618 | |||
619 | static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) | ||
620 | { | ||
621 | /* It's safe to write the exe_file pointer without exe_file_lock because | ||
622 | * this is called during fork when the task is not yet in /proc */ | ||
623 | newmm->exe_file = get_mm_exe_file(oldmm); | ||
624 | } | ||
625 | |||
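
The exe_file helpers added above keep a reference on the executable's struct file plus a count of executable vmas, and drop the reference when the last such vma is removed. A user-space model of that ownership scheme (invented types, locking omitted), purely for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    struct file { int refs; };
    struct mm   { struct file *exe_file; int num_exe_file_vmas; };

    static struct file *get_file(struct file *f) { if (f) f->refs++; return f; }
    static void fput(struct file *f)             { if (f && --f->refs == 0) free(f); }

    static void set_exe_file(struct mm *mm, struct file *newf)
    {
            get_file(newf);
            fput(mm->exe_file);              /* drop the old one, if any */
            mm->exe_file = newf;
            mm->num_exe_file_vmas = 0;
    }

    static void added_exe_vma(struct mm *mm)   { mm->num_exe_file_vmas++; }

    static void removed_exe_vma(struct mm *mm)
    {
            if (--mm->num_exe_file_vmas == 0 && mm->exe_file) {
                    fput(mm->exe_file);      /* last executable mapping gone */
                    mm->exe_file = NULL;
            }
    }

    int main(void)
    {
            struct mm mm = { 0 };
            struct file *f = calloc(1, sizeof(*f));
            f->refs = 1;

            set_exe_file(&mm, f);
            fput(f);                         /* caller's own reference */
            added_exe_vma(&mm);
            removed_exe_vma(&mm);
            printf("exe_file %s\n", mm.exe_file ? "still set" : "released");
            return 0;
    }
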
576 | /** | 626 | /** |
577 | * get_task_mm - acquire a reference to the task's mm | 627 | * get_task_mm - acquire a reference to the task's mm |
578 | * | 628 | * |
@@ -679,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
679 | goto fail_nomem; | 729 | goto fail_nomem; |
680 | 730 | ||
681 | memcpy(mm, oldmm, sizeof(*mm)); | 731 | memcpy(mm, oldmm, sizeof(*mm)); |
732 | mm_init_cpumask(mm); | ||
682 | 733 | ||
683 | /* Initializing for Swap token stuff */ | 734 | /* Initializing for Swap token stuff */ |
684 | mm->token_priority = 0; | 735 | mm->token_priority = 0; |
@@ -927,6 +978,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
927 | tty_audit_fork(sig); | 978 | tty_audit_fork(sig); |
928 | sched_autogroup_fork(sig); | 979 | sched_autogroup_fork(sig); |
929 | 980 | ||
981 | #ifdef CONFIG_CGROUPS | ||
982 | init_rwsem(&sig->threadgroup_fork_lock); | ||
983 | #endif | ||
984 | |||
930 | sig->oom_adj = current->signal->oom_adj; | 985 | sig->oom_adj = current->signal->oom_adj; |
931 | sig->oom_score_adj = current->signal->oom_score_adj; | 986 | sig->oom_score_adj = current->signal->oom_score_adj; |
932 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 987 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
@@ -1103,12 +1158,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1103 | 1158 | ||
1104 | posix_cpu_timers_init(p); | 1159 | posix_cpu_timers_init(p); |
1105 | 1160 | ||
1106 | p->lock_depth = -1; /* -1 = no lock */ | ||
1107 | do_posix_clock_monotonic_gettime(&p->start_time); | 1161 | do_posix_clock_monotonic_gettime(&p->start_time); |
1108 | p->real_start_time = p->start_time; | 1162 | p->real_start_time = p->start_time; |
1109 | monotonic_to_bootbased(&p->real_start_time); | 1163 | monotonic_to_bootbased(&p->real_start_time); |
1110 | p->io_context = NULL; | 1164 | p->io_context = NULL; |
1111 | p->audit_context = NULL; | 1165 | p->audit_context = NULL; |
1166 | if (clone_flags & CLONE_THREAD) | ||
1167 | threadgroup_fork_read_lock(current); | ||
1112 | cgroup_fork(p); | 1168 | cgroup_fork(p); |
1113 | #ifdef CONFIG_NUMA | 1169 | #ifdef CONFIG_NUMA |
1114 | p->mempolicy = mpol_dup(p->mempolicy); | 1170 | p->mempolicy = mpol_dup(p->mempolicy); |
@@ -1153,7 +1209,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1153 | #endif | 1209 | #endif |
1154 | 1210 | ||
1155 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1211 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1156 | sched_fork(p, clone_flags); | 1212 | sched_fork(p); |
1157 | 1213 | ||
1158 | retval = perf_event_init_task(p); | 1214 | retval = perf_event_init_task(p); |
1159 | if (retval) | 1215 | if (retval) |
@@ -1194,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1194 | if (clone_flags & CLONE_THREAD) | 1250 | if (clone_flags & CLONE_THREAD) |
1195 | p->tgid = current->tgid; | 1251 | p->tgid = current->tgid; |
1196 | 1252 | ||
1197 | if (current->nsproxy != p->nsproxy) { | ||
1198 | retval = ns_cgroup_clone(p, pid); | ||
1199 | if (retval) | ||
1200 | goto bad_fork_free_pid; | ||
1201 | } | ||
1202 | |||
1203 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1253 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
1204 | /* | 1254 | /* |
1205 | * Clear TID on mm_release()? | 1255 | * Clear TID on mm_release()? |
@@ -1313,6 +1363,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1313 | write_unlock_irq(&tasklist_lock); | 1363 | write_unlock_irq(&tasklist_lock); |
1314 | proc_fork_connector(p); | 1364 | proc_fork_connector(p); |
1315 | cgroup_post_fork(p); | 1365 | cgroup_post_fork(p); |
1366 | if (clone_flags & CLONE_THREAD) | ||
1367 | threadgroup_fork_read_unlock(current); | ||
1316 | perf_event_fork(p); | 1368 | perf_event_fork(p); |
1317 | return p; | 1369 | return p; |
1318 | 1370 | ||
@@ -1351,6 +1403,8 @@ bad_fork_cleanup_policy: | |||
1351 | mpol_put(p->mempolicy); | 1403 | mpol_put(p->mempolicy); |
1352 | bad_fork_cleanup_cgroup: | 1404 | bad_fork_cleanup_cgroup: |
1353 | #endif | 1405 | #endif |
1406 | if (clone_flags & CLONE_THREAD) | ||
1407 | threadgroup_fork_read_unlock(current); | ||
1354 | cgroup_exit(p, cgroup_callbacks_done); | 1408 | cgroup_exit(p, cgroup_callbacks_done); |
1355 | delayacct_tsk_free(p); | 1409 | delayacct_tsk_free(p); |
1356 | module_put(task_thread_info(p)->exec_domain->module); | 1410 | module_put(task_thread_info(p)->exec_domain->module); |
@@ -1464,7 +1518,7 @@ long do_fork(unsigned long clone_flags, | |||
1464 | */ | 1518 | */ |
1465 | p->flags &= ~PF_STARTING; | 1519 | p->flags &= ~PF_STARTING; |
1466 | 1520 | ||
1467 | wake_up_new_task(p, clone_flags); | 1521 | wake_up_new_task(p); |
1468 | 1522 | ||
1469 | tracehook_report_clone_complete(trace, regs, | 1523 | tracehook_report_clone_complete(trace, regs, |
1470 | clone_flags, nr, p); | 1524 | clone_flags, nr, p); |
@@ -1508,6 +1562,13 @@ void __init proc_caches_init(void) | |||
1508 | fs_cachep = kmem_cache_create("fs_cache", | 1562 | fs_cachep = kmem_cache_create("fs_cache", |
1509 | sizeof(struct fs_struct), 0, | 1563 | sizeof(struct fs_struct), 0, |
1510 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1564 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
1565 | /* | ||
1566 | * FIXME! The "sizeof(struct mm_struct)" currently includes the | ||
1567 | * whole struct cpumask for the OFFSTACK case. We could change | ||
1568 | * this to *only* allocate as much of it as required by the | ||
1569 | * maximum number of CPUs we can ever have. The cpumask_allocation | ||
1570 | * is at the end of the structure, exactly for that reason. | ||
1571 | */ | ||
1511 | mm_cachep = kmem_cache_create("mm_struct", | 1572 | mm_cachep = kmem_cache_create("mm_struct", |
1512 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 1573 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
1513 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1574 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
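
copy_process() now takes signal->threadgroup_fork_lock for read around the cgroup-visible part of a CLONE_THREAD fork. The apparent intent, as read from this diff, is that a writer wanting to operate on a whole thread group (for example moving it between cgroups) can take the lock for write and not race with half-created threads. A user-space sketch of that read/write pairing; the names are invented and this is not the kernel locking code:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t threadgroup_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int nr_threads = 1;

    static void fork_thread(void)
    {
            pthread_rwlock_rdlock(&threadgroup_lock);   /* many forks may run concurrently */
            nr_threads++;                               /* wire the new thread into the group */
            pthread_rwlock_unlock(&threadgroup_lock);
    }

    static void move_whole_group(void)
    {
            pthread_rwlock_wrlock(&threadgroup_lock);   /* excludes in-flight forks */
            printf("moving %d threads atomically\n", nr_threads);
            pthread_rwlock_unlock(&threadgroup_lock);
    }

    int main(void)
    {
            fork_thread();
            fork_thread();
            move_whole_group();
            return 0;
    }
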
diff --git a/kernel/freezer.c b/kernel/freezer.c index 66ecd2ead215..7b01de98bb6a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -17,7 +17,7 @@ static inline void frozen_process(void) | |||
17 | { | 17 | { |
18 | if (!unlikely(current->flags & PF_NOFREEZE)) { | 18 | if (!unlikely(current->flags & PF_NOFREEZE)) { |
19 | current->flags |= PF_FROZEN; | 19 | current->flags |= PF_FROZEN; |
20 | wmb(); | 20 | smp_wmb(); |
21 | } | 21 | } |
22 | clear_freeze_flag(current); | 22 | clear_freeze_flag(current); |
23 | } | 23 | } |
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
93 | * the task as frozen and next clears its TIF_FREEZE. | 93 | * the task as frozen and next clears its TIF_FREEZE. |
94 | */ | 94 | */ |
95 | if (!freezing(p)) { | 95 | if (!freezing(p)) { |
96 | rmb(); | 96 | smp_rmb(); |
97 | if (frozen(p)) | 97 | if (frozen(p)) |
98 | return false; | 98 | return false; |
99 | 99 | ||
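
The freezer change above only demotes mandatory barriers (wmb/rmb) to SMP barriers, since the ordering matters against other CPUs rather than device I/O. Below is a user-space analogue of the same publish/observe pairing using C11 fences; the flag/data names are invented and this is only a model of the barrier pairing, not the freezer itself:

    #include <stdatomic.h>
    #include <stdio.h>

    static int data;
    static atomic_int flag;

    static void writer(void)
    {
            data = 42;                                         /* like setting PF_FROZEN        */
            atomic_thread_fence(memory_order_release);         /* smp_wmb() analogue            */
            atomic_store_explicit(&flag, 1, memory_order_relaxed);
    }

    static void reader(void)
    {
            if (atomic_load_explicit(&flag, memory_order_relaxed)) {
                    atomic_thread_fence(memory_order_acquire); /* smp_rmb() analogue            */
                    printf("data=%d\n", data);                 /* guaranteed to observe 42      */
            }
    }

    int main(void)
    {
            writer();
            reader();
            return 0;
    }
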
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index b8cadf70b1fb..5bf924d80b5c 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling" | |||
2 | 2 | ||
3 | config GCOV_KERNEL | 3 | config GCOV_KERNEL |
4 | bool "Enable gcov-based kernel profiling" | 4 | bool "Enable gcov-based kernel profiling" |
5 | depends on DEBUG_FS && CONSTRUCTORS | 5 | depends on DEBUG_FS |
6 | select CONSTRUCTORS | ||
6 | default n | 7 | default n |
7 | ---help--- | 8 | ---help--- |
8 | This option enables gcov-based code profiling (e.g. for code coverage | 9 | This option enables gcov-based code profiling (e.g. for code coverage |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 87fdb3f8db14..a9205e32a059 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -64,24 +64,27 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |||
64 | .clock_base = | 64 | .clock_base = |
65 | { | 65 | { |
66 | { | 66 | { |
67 | .index = CLOCK_REALTIME, | 67 | .index = HRTIMER_BASE_MONOTONIC, |
68 | .get_time = &ktime_get_real, | 68 | .clockid = CLOCK_MONOTONIC, |
69 | .get_time = &ktime_get, | ||
69 | .resolution = KTIME_LOW_RES, | 70 | .resolution = KTIME_LOW_RES, |
70 | }, | 71 | }, |
71 | { | 72 | { |
72 | .index = CLOCK_MONOTONIC, | 73 | .index = HRTIMER_BASE_REALTIME, |
73 | .get_time = &ktime_get, | 74 | .clockid = CLOCK_REALTIME, |
75 | .get_time = &ktime_get_real, | ||
74 | .resolution = KTIME_LOW_RES, | 76 | .resolution = KTIME_LOW_RES, |
75 | }, | 77 | }, |
76 | { | 78 | { |
77 | .index = CLOCK_BOOTTIME, | 79 | .index = HRTIMER_BASE_BOOTTIME, |
80 | .clockid = CLOCK_BOOTTIME, | ||
78 | .get_time = &ktime_get_boottime, | 81 | .get_time = &ktime_get_boottime, |
79 | .resolution = KTIME_LOW_RES, | 82 | .resolution = KTIME_LOW_RES, |
80 | }, | 83 | }, |
81 | } | 84 | } |
82 | }; | 85 | }; |
83 | 86 | ||
84 | static int hrtimer_clock_to_base_table[MAX_CLOCKS] = { | 87 | static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { |
85 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, | 88 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, |
86 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, | 89 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, |
87 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, | 90 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, |
@@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, | |||
196 | struct hrtimer_cpu_base *new_cpu_base; | 199 | struct hrtimer_cpu_base *new_cpu_base; |
197 | int this_cpu = smp_processor_id(); | 200 | int this_cpu = smp_processor_id(); |
198 | int cpu = hrtimer_get_target(this_cpu, pinned); | 201 | int cpu = hrtimer_get_target(this_cpu, pinned); |
199 | int basenum = hrtimer_clockid_to_base(base->index); | 202 | int basenum = base->index; |
200 | 203 | ||
201 | again: | 204 | again: |
202 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); | 205 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); |
@@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
621 | return res; | 624 | return res; |
622 | } | 625 | } |
623 | 626 | ||
624 | |||
625 | /* | ||
626 | * Retrigger next event is called after clock was set | ||
627 | * | ||
628 | * Called with interrupts disabled via on_each_cpu() | ||
629 | */ | ||
630 | static void retrigger_next_event(void *arg) | ||
631 | { | ||
632 | struct hrtimer_cpu_base *base; | ||
633 | struct timespec realtime_offset, wtm, sleep; | ||
634 | |||
635 | if (!hrtimer_hres_active()) | ||
636 | return; | ||
637 | |||
638 | get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, | ||
639 | &sleep); | ||
640 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
641 | |||
642 | base = &__get_cpu_var(hrtimer_bases); | ||
643 | |||
644 | /* Adjust CLOCK_REALTIME offset */ | ||
645 | raw_spin_lock(&base->lock); | ||
646 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | ||
647 | timespec_to_ktime(realtime_offset); | ||
648 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
649 | timespec_to_ktime(sleep); | ||
650 | |||
651 | hrtimer_force_reprogram(base, 0); | ||
652 | raw_spin_unlock(&base->lock); | ||
653 | } | ||
654 | |||
655 | /* | ||
656 | * Clock realtime was set | ||
657 | * | ||
658 | * Change the offset of the realtime clock vs. the monotonic | ||
659 | * clock. | ||
660 | * | ||
661 | * We might have to reprogram the high resolution timer interrupt. On | ||
662 | * SMP we call the architecture specific code to retrigger _all_ high | ||
663 | * resolution timer interrupts. On UP we just disable interrupts and | ||
664 | * call the high resolution interrupt code. | ||
665 | */ | ||
666 | void clock_was_set(void) | ||
667 | { | ||
668 | /* Retrigger the CPU local events everywhere */ | ||
669 | on_each_cpu(retrigger_next_event, NULL, 1); | ||
670 | } | ||
671 | |||
672 | /* | ||
673 | * During resume we might have to reprogram the high resolution timer | ||
674 | * interrupt (on the local CPU): | ||
675 | */ | ||
676 | void hres_timers_resume(void) | ||
677 | { | ||
678 | WARN_ONCE(!irqs_disabled(), | ||
679 | KERN_INFO "hres_timers_resume() called with IRQs enabled!"); | ||
680 | |||
681 | retrigger_next_event(NULL); | ||
682 | } | ||
683 | |||
684 | /* | 627 | /* |
685 | * Initialize the high resolution related parts of cpu_base | 628 | * Initialize the high resolution related parts of cpu_base |
686 | */ | 629 | */ |
@@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
715 | } | 658 | } |
716 | 659 | ||
717 | /* | 660 | /* |
661 | * Retrigger next event is called after clock was set | ||
662 | * | ||
663 | * Called with interrupts disabled via on_each_cpu() | ||
664 | */ | ||
665 | static void retrigger_next_event(void *arg) | ||
666 | { | ||
667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | ||
668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
669 | |||
670 | if (!hrtimer_hres_active()) | ||
671 | return; | ||
672 | |||
673 | /* Optimized out for !HIGH_RES */ | ||
674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
676 | |||
677 | /* Adjust CLOCK_REALTIME offset */ | ||
678 | raw_spin_lock(&base->lock); | ||
679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | ||
680 | timespec_to_ktime(realtime_offset); | ||
681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
682 | timespec_to_ktime(sleep); | ||
683 | |||
684 | hrtimer_force_reprogram(base, 0); | ||
685 | raw_spin_unlock(&base->lock); | ||
686 | } | ||
687 | |||
688 | /* | ||
718 | * Switch to high resolution mode | 689 | * Switch to high resolution mode |
719 | */ | 690 | */ |
720 | static int hrtimer_switch_to_hres(void) | 691 | static int hrtimer_switch_to_hres(void) |
721 | { | 692 | { |
722 | int cpu = smp_processor_id(); | 693 | int i, cpu = smp_processor_id(); |
723 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); | 694 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); |
724 | unsigned long flags; | 695 | unsigned long flags; |
725 | 696 | ||
@@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void) | |||
735 | return 0; | 706 | return 0; |
736 | } | 707 | } |
737 | base->hres_active = 1; | 708 | base->hres_active = 1; |
738 | base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; | 709 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
739 | base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; | 710 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
740 | base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; | ||
741 | 711 | ||
742 | tick_setup_sched_timer(); | 712 | tick_setup_sched_timer(); |
743 | 713 | ||
@@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
761 | return 0; | 731 | return 0; |
762 | } | 732 | } |
763 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 733 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
734 | static inline void retrigger_next_event(void *arg) { } | ||
764 | 735 | ||
765 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 736 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
766 | 737 | ||
738 | /* | ||
739 | * Clock realtime was set | ||
740 | * | ||
741 | * Change the offset of the realtime clock vs. the monotonic | ||
742 | * clock. | ||
743 | * | ||
744 | * We might have to reprogram the high resolution timer interrupt. On | ||
745 | * SMP we call the architecture specific code to retrigger _all_ high | ||
746 | * resolution timer interrupts. On UP we just disable interrupts and | ||
747 | * call the high resolution interrupt code. | ||
748 | */ | ||
749 | void clock_was_set(void) | ||
750 | { | ||
751 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
752 | /* Retrigger the CPU local events everywhere */ | ||
753 | on_each_cpu(retrigger_next_event, NULL, 1); | ||
754 | #endif | ||
755 | timerfd_clock_was_set(); | ||
756 | } | ||
757 | |||
758 | /* | ||
759 | * During resume we might have to reprogram the high resolution timer | ||
760 | * interrupt (on the local CPU): | ||
761 | */ | ||
762 | void hrtimers_resume(void) | ||
763 | { | ||
764 | WARN_ONCE(!irqs_disabled(), | ||
765 | KERN_INFO "hrtimers_resume() called with IRQs enabled!"); | ||
766 | |||
767 | retrigger_next_event(NULL); | ||
768 | timerfd_clock_was_set(); | ||
769 | } | ||
770 | |||
767 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) | 771 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) |
768 | { | 772 | { |
769 | #ifdef CONFIG_TIMER_STATS | 773 | #ifdef CONFIG_TIMER_STATS |
@@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, | |||
856 | debug_activate(timer); | 860 | debug_activate(timer); |
857 | 861 | ||
858 | timerqueue_add(&base->active, &timer->node); | 862 | timerqueue_add(&base->active, &timer->node); |
863 | base->cpu_base->active_bases |= 1 << base->index; | ||
859 | 864 | ||
860 | /* | 865 | /* |
861 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | 866 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the |
@@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
897 | #endif | 902 | #endif |
898 | } | 903 | } |
899 | timerqueue_del(&base->active, &timer->node); | 904 | timerqueue_del(&base->active, &timer->node); |
905 | if (!timerqueue_getnext(&base->active)) | ||
906 | base->cpu_base->active_bases &= ~(1 << base->index); | ||
900 | out: | 907 | out: |
901 | timer->state = newstate; | 908 | timer->state = newstate; |
902 | } | 909 | } |
@@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
1234 | void hrtimer_interrupt(struct clock_event_device *dev) | 1241 | void hrtimer_interrupt(struct clock_event_device *dev) |
1235 | { | 1242 | { |
1236 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1243 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
1237 | struct hrtimer_clock_base *base; | ||
1238 | ktime_t expires_next, now, entry_time, delta; | 1244 | ktime_t expires_next, now, entry_time, delta; |
1239 | int i, retries = 0; | 1245 | int i, retries = 0; |
1240 | 1246 | ||
@@ -1256,12 +1262,15 @@ retry: | |||
1256 | */ | 1262 | */ |
1257 | cpu_base->expires_next.tv64 = KTIME_MAX; | 1263 | cpu_base->expires_next.tv64 = KTIME_MAX; |
1258 | 1264 | ||
1259 | base = cpu_base->clock_base; | ||
1260 | |||
1261 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1265 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
1262 | ktime_t basenow; | 1266 | struct hrtimer_clock_base *base; |
1263 | struct timerqueue_node *node; | 1267 | struct timerqueue_node *node; |
1268 | ktime_t basenow; | ||
1269 | |||
1270 | if (!(cpu_base->active_bases & (1 << i))) | ||
1271 | continue; | ||
1264 | 1272 | ||
1273 | base = cpu_base->clock_base + i; | ||
1265 | basenow = ktime_add(now, base->offset); | 1274 | basenow = ktime_add(now, base->offset); |
1266 | 1275 | ||
1267 | while ((node = timerqueue_getnext(&base->active))) { | 1276 | while ((node = timerqueue_getnext(&base->active))) { |
@@ -1294,7 +1303,6 @@ retry: | |||
1294 | 1303 | ||
1295 | __run_hrtimer(timer, &basenow); | 1304 | __run_hrtimer(timer, &basenow); |
1296 | } | 1305 | } |
1297 | base++; | ||
1298 | } | 1306 | } |
1299 | 1307 | ||
1300 | /* | 1308 | /* |
@@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | |||
1525 | struct timespec __user *rmtp; | 1533 | struct timespec __user *rmtp; |
1526 | int ret = 0; | 1534 | int ret = 0; |
1527 | 1535 | ||
1528 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, | 1536 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, |
1529 | HRTIMER_MODE_ABS); | 1537 | HRTIMER_MODE_ABS); |
1530 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); | 1538 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); |
1531 | 1539 | ||
@@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
1577 | 1585 | ||
1578 | restart = ¤t_thread_info()->restart_block; | 1586 | restart = ¤t_thread_info()->restart_block; |
1579 | restart->fn = hrtimer_nanosleep_restart; | 1587 | restart->fn = hrtimer_nanosleep_restart; |
1580 | restart->nanosleep.index = t.timer.base->index; | 1588 | restart->nanosleep.clockid = t.timer.base->clockid; |
1581 | restart->nanosleep.rmtp = rmtp; | 1589 | restart->nanosleep.rmtp = rmtp; |
1582 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); | 1590 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); |
1583 | 1591 | ||
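
Among the hrtimer changes above, enqueue_hrtimer()/__remove_hrtimer() now maintain an active_bases bitmask so hrtimer_interrupt() can skip clock bases with no queued timers. A simplified user-space model of that skip-empty-queues technique (invented names, no locking, not the kernel implementation):

    #include <stdio.h>

    #define NR_BASES 3

    static int nr_queued[NR_BASES];     /* stand-in for each base's timer queue */
    static unsigned int active_bases;   /* bit i set => base i has timers       */

    static void enqueue(int base)
    {
            nr_queued[base]++;
            active_bases |= 1U << base;
    }

    static void dequeue(int base)
    {
            if (nr_queued[base] && --nr_queued[base] == 0)
                    active_bases &= ~(1U << base);   /* queue became empty */
    }

    static void interrupt_scan(void)
    {
            for (int i = 0; i < NR_BASES; i++) {
                    if (!(active_bases & (1U << i)))
                            continue;                /* nothing queued here, skip */
                    printf("expire timers on base %d (%d queued)\n", i, nr_queued[i]);
            }
    }

    int main(void)
    {
            enqueue(1);
            enqueue(1);
            dequeue(1);
            interrupt_scan();
            return 0;
    }
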
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 53ead174da2f..ea640120ab86 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; | |||
33 | /* | 33 | /* |
34 | * Zero means infinite timeout - no checking done: | 34 | * Zero means infinite timeout - no checking done: |
35 | */ | 35 | */ |
36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | 36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; |
37 | 37 | ||
38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | 38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; |
39 | 39 | ||
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index c574f9a12c48..d1d051b38e0b 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -48,6 +48,10 @@ config IRQ_PREFLOW_FASTEOI | |||
48 | config IRQ_EDGE_EOI_HANDLER | 48 | config IRQ_EDGE_EOI_HANDLER |
49 | bool | 49 | bool |
50 | 50 | ||
51 | # Generic configurable interrupt chip implementation | ||
52 | config GENERIC_IRQ_CHIP | ||
53 | bool | ||
54 | |||
51 | # Support forced irq threading | 55 | # Support forced irq threading |
52 | config IRQ_FORCED_THREADING | 56 | config IRQ_FORCED_THREADING |
53 | bool | 57 | bool |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 54329cd7b3ee..73290056cfb6 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -1,5 +1,6 @@ | |||
1 | 1 | ||
2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
3 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o | ||
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 4 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 5 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 6 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4af1e2b244cb..d5a3009da71a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
310 | out_unlock: | 310 | out_unlock: |
311 | raw_spin_unlock(&desc->lock); | 311 | raw_spin_unlock(&desc->lock); |
312 | } | 312 | } |
313 | EXPORT_SYMBOL_GPL(handle_simple_irq); | ||
313 | 314 | ||
314 | /** | 315 | /** |
315 | * handle_level_irq - Level type irq handler | 316 | * handle_level_irq - Level type irq handler |
@@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
573 | if (handle != handle_bad_irq && is_chained) { | 574 | if (handle != handle_bad_irq && is_chained) { |
574 | irq_settings_set_noprobe(desc); | 575 | irq_settings_set_noprobe(desc); |
575 | irq_settings_set_norequest(desc); | 576 | irq_settings_set_norequest(desc); |
577 | irq_settings_set_nothread(desc); | ||
576 | irq_startup(desc); | 578 | irq_startup(desc); |
577 | } | 579 | } |
578 | out: | 580 | out: |
@@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | |||
612 | 614 | ||
613 | irq_put_desc_unlock(desc, flags); | 615 | irq_put_desc_unlock(desc, flags); |
614 | } | 616 | } |
617 | EXPORT_SYMBOL_GPL(irq_modify_status); | ||
615 | 618 | ||
616 | /** | 619 | /** |
617 | * irq_cpu_online - Invoke all irq_cpu_online functions. | 620 | * irq_cpu_online - Invoke all irq_cpu_online functions. |
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 306cba37e9a5..97a8bfadc88a 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h | |||
@@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
27 | P(IRQ_PER_CPU); | 27 | P(IRQ_PER_CPU); |
28 | P(IRQ_NOPROBE); | 28 | P(IRQ_NOPROBE); |
29 | P(IRQ_NOREQUEST); | 29 | P(IRQ_NOREQUEST); |
30 | P(IRQ_NOTHREAD); | ||
30 | P(IRQ_NOAUTOEN); | 31 | P(IRQ_NOAUTOEN); |
31 | 32 | ||
32 | PS(IRQS_AUTODETECT); | 33 | PS(IRQS_AUTODETECT); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c new file mode 100644 index 000000000000..3a2cab407b93 --- /dev/null +++ b/kernel/irq/generic-chip.c | |||
@@ -0,0 +1,368 @@ | |||
1 | /* | ||
2 | * Library implementing the most common irq chip callback functions | ||
3 | * | ||
4 | * Copyright (C) 2011, Thomas Gleixner | ||
5 | */ | ||
6 | #include <linux/io.h> | ||
7 | #include <linux/irq.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/interrupt.h> | ||
10 | #include <linux/kernel_stat.h> | ||
11 | #include <linux/syscore_ops.h> | ||
12 | |||
13 | #include "internals.h" | ||
14 | |||
15 | static LIST_HEAD(gc_list); | ||
16 | static DEFINE_RAW_SPINLOCK(gc_lock); | ||
17 | |||
18 | static inline struct irq_chip_regs *cur_regs(struct irq_data *d) | ||
19 | { | ||
20 | return &container_of(d->chip, struct irq_chip_type, chip)->regs; | ||
21 | } | ||
22 | |||
23 | /** | ||
24 | * irq_gc_noop - NOOP function | ||
25 | * @d: irq_data | ||
26 | */ | ||
27 | void irq_gc_noop(struct irq_data *d) | ||
28 | { | ||
29 | } | ||
30 | |||
31 | /** | ||
32 | * irq_gc_mask_disable_reg - Mask chip via disable register | ||
33 | * @d: irq_data | ||
34 | * | ||
35 | * Chip has separate enable/disable registers instead of a single mask | ||
36 | * register. | ||
37 | */ | ||
38 | void irq_gc_mask_disable_reg(struct irq_data *d) | ||
39 | { | ||
40 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
41 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
42 | |||
43 | irq_gc_lock(gc); | ||
44 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); | ||
45 | gc->mask_cache &= ~mask; | ||
46 | irq_gc_unlock(gc); | ||
47 | } | ||
48 | |||
49 | /** | ||
50 | * irq_gc_mask_set_bit - Mask chip via setting bit in mask register | ||
51 | * @d: irq_data | ||
52 | * | ||
53 | * Chip has a single mask register. Values of this register are cached | ||
54 | * and protected by gc->lock | ||
55 | */ | ||
56 | void irq_gc_mask_set_bit(struct irq_data *d) | ||
57 | { | ||
58 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
59 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
60 | |||
61 | irq_gc_lock(gc); | ||
62 | gc->mask_cache |= mask; | ||
63 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
64 | irq_gc_unlock(gc); | ||
65 | } | ||
66 | |||
67 | /** | ||
68 | * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register | ||
69 | * @d: irq_data | ||
70 | * | ||
71 | * Chip has a single mask register. Values of this register are cached | ||
72 | * and protected by gc->lock | ||
73 | */ | ||
74 | void irq_gc_mask_clr_bit(struct irq_data *d) | ||
75 | { | ||
76 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
77 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
78 | |||
79 | irq_gc_lock(gc); | ||
80 | gc->mask_cache &= ~mask; | ||
81 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
82 | irq_gc_unlock(gc); | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * irq_gc_unmask_enable_reg - Unmask chip via enable register | ||
87 | * @d: irq_data | ||
88 | * | ||
89 | * Chip has separate enable/disable registers instead of a single mask | ||
90 | * register. | ||
91 | */ | ||
92 | void irq_gc_unmask_enable_reg(struct irq_data *d) | ||
93 | { | ||
94 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
95 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
96 | |||
97 | irq_gc_lock(gc); | ||
98 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); | ||
99 | gc->mask_cache |= mask; | ||
100 | irq_gc_unlock(gc); | ||
101 | } | ||
102 | |||
103 | /** | ||
104 | * irq_gc_ack_set_bit - Ack pending interrupt via setting bit | ||
105 | * @d: irq_data | ||
106 | */ | ||
107 | void irq_gc_ack_set_bit(struct irq_data *d) | ||
108 | { | ||
109 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
110 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
111 | |||
112 | irq_gc_lock(gc); | ||
113 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
114 | irq_gc_unlock(gc); | ||
115 | } | ||
116 | |||
117 | /** | ||
118 | * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit | ||
119 | * @d: irq_data | ||
120 | */ | ||
121 | void irq_gc_ack_clr_bit(struct irq_data *d) | ||
122 | { | ||
123 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
124 | u32 mask = ~(1 << (d->irq - gc->irq_base)); | ||
125 | |||
126 | irq_gc_lock(gc); | ||
127 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
128 | irq_gc_unlock(gc); | ||
129 | } | ||
130 | |||
131 | /** | ||
132 | * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt | ||
133 | * @d: irq_data | ||
134 | */ | ||
135 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | ||
136 | { | ||
137 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
138 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
139 | |||
140 | irq_gc_lock(gc); | ||
141 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); | ||
142 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
143 | irq_gc_unlock(gc); | ||
144 | } | ||
145 | |||
146 | /** | ||
147 | * irq_gc_eoi - EOI interrupt | ||
148 | * @d: irq_data | ||
149 | */ | ||
150 | void irq_gc_eoi(struct irq_data *d) | ||
151 | { | ||
152 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
153 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
154 | |||
155 | irq_gc_lock(gc); | ||
156 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); | ||
157 | irq_gc_unlock(gc); | ||
158 | } | ||
159 | |||
160 | /** | ||
161 | * irq_gc_set_wake - Set/clr wake bit for an interrupt | ||
162 | * @d: irq_data | ||
163 | * | ||
164 | * For chips where the wake from suspend functionality is not | ||
165 | * configured in a separate register and the wakeup active state is | ||
166 | * just stored in a bitmask. | ||
167 | */ | ||
168 | int irq_gc_set_wake(struct irq_data *d, unsigned int on) | ||
169 | { | ||
170 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
171 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
172 | |||
173 | if (!(mask & gc->wake_enabled)) | ||
174 | return -EINVAL; | ||
175 | |||
176 | irq_gc_lock(gc); | ||
177 | if (on) | ||
178 | gc->wake_active |= mask; | ||
179 | else | ||
180 | gc->wake_active &= ~mask; | ||
181 | irq_gc_unlock(gc); | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | /** | ||
186 | * irq_alloc_generic_chip - Allocate a generic chip and initialize it | ||
187 | * @name: Name of the irq chip | ||
188 | * @num_ct: Number of irq_chip_type instances associated with this chip | ||
189 | * @irq_base: Interrupt base nr for this chip | ||
190 | * @reg_base: Register base address (virtual) | ||
191 | * @handler: Default flow handler associated with this chip | ||
192 | * | ||
193 | * Returns an initialized irq_chip_generic structure. The chip defaults | ||
194 | * to the primary (index 0) irq_chip_type and @handler | ||
195 | */ | ||
196 | struct irq_chip_generic * | ||
197 | irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, | ||
198 | void __iomem *reg_base, irq_flow_handler_t handler) | ||
199 | { | ||
200 | struct irq_chip_generic *gc; | ||
201 | unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | ||
202 | |||
203 | gc = kzalloc(sz, GFP_KERNEL); | ||
204 | if (gc) { | ||
205 | raw_spin_lock_init(&gc->lock); | ||
206 | gc->num_ct = num_ct; | ||
207 | gc->irq_base = irq_base; | ||
208 | gc->reg_base = reg_base; | ||
209 | gc->chip_types->chip.name = name; | ||
210 | gc->chip_types->handler = handler; | ||
211 | } | ||
212 | return gc; | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Separate lockdep class for interrupt chip which can nest irq_desc | ||
217 | * lock. | ||
218 | */ | ||
219 | static struct lock_class_key irq_nested_lock_class; | ||
220 | |||
221 | /** | ||
222 | * irq_setup_generic_chip - Setup a range of interrupts with a generic chip | ||
223 | * @gc: Generic irq chip holding all data | ||
224 | * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base | ||
225 | * @flags: Flags for initialization | ||
226 | * @clr: IRQ_* bits to clear | ||
227 | * @set: IRQ_* bits to set | ||
228 | * | ||
229 | * Set up max. 32 interrupts starting from gc->irq_base. Note, this | ||
230 | * initializes all interrupts to the primary irq_chip_type and its | ||
231 | * associated handler. | ||
232 | */ | ||
233 | void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
234 | enum irq_gc_flags flags, unsigned int clr, | ||
235 | unsigned int set) | ||
236 | { | ||
237 | struct irq_chip_type *ct = gc->chip_types; | ||
238 | unsigned int i; | ||
239 | |||
240 | raw_spin_lock(&gc_lock); | ||
241 | list_add_tail(&gc->list, &gc_list); | ||
242 | raw_spin_unlock(&gc_lock); | ||
243 | |||
244 | /* Init mask cache ? */ | ||
245 | if (flags & IRQ_GC_INIT_MASK_CACHE) | ||
246 | gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); | ||
247 | |||
248 | for (i = gc->irq_base; msk; msk >>= 1, i++) { | ||
249 | if (!(msk & 0x01)) | ||
250 | continue; | ||
251 | |||
252 | if (flags & IRQ_GC_INIT_NESTED_LOCK) | ||
253 | irq_set_lockdep_class(i, &irq_nested_lock_class); | ||
254 | |||
255 | irq_set_chip_and_handler(i, &ct->chip, ct->handler); | ||
256 | irq_set_chip_data(i, gc); | ||
257 | irq_modify_status(i, clr, set); | ||
258 | } | ||
259 | gc->irq_cnt = i - gc->irq_base; | ||
260 | } | ||
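
As a rough illustration of how a platform might consume irq_alloc_generic_chip() and irq_setup_generic_chip(), the sketch below sets up a hypothetical 16-line level-triggered controller. All FOO_* names and register offsets are invented, and irq_gc_mask_set_bit()/irq_gc_mask_clr_bit() are assumed to be among the bit helpers defined earlier in this file (not shown in this excerpt).

#include <linux/init.h>
#include <linux/irq.h>
#include <linux/io.h>

#define FOO_IRQ_BASE	64		/* hypothetical Linux irq base */
#define FOO_REG_MASK	0x04		/* hypothetical register offsets */
#define FOO_REG_ACK	0x08

static void __init foo_init_irq(void __iomem *base)
{
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	gc = irq_alloc_generic_chip("foo-intc", 1, FOO_IRQ_BASE, base,
				    handle_level_irq);
	if (!gc)
		return;

	ct = gc->chip_types;			/* primary irq_chip_type */
	ct->regs.mask = FOO_REG_MASK;
	ct->regs.ack  = FOO_REG_ACK;
	ct->chip.irq_mask   = irq_gc_mask_set_bit;
	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
	ct->chip.irq_ack    = irq_gc_ack_clr_bit;

	/* wire up irqs FOO_IRQ_BASE..FOO_IRQ_BASE+15 and make them requestable */
	irq_setup_generic_chip(gc, 0xffff, IRQ_GC_INIT_MASK_CACHE,
			       IRQ_NOREQUEST, 0);
}
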
261 | |||
262 | /** | ||
263 | * irq_setup_alt_chip - Switch to alternative chip | ||
264 | * @d: irq_data for this interrupt | ||
265 | * @type: Flow type to be initialized | ||
266 | * | ||
267 | * Only to be called from chip->irq_set_type() callbacks. | ||
268 | */ | ||
269 | int irq_setup_alt_chip(struct irq_data *d, unsigned int type) | ||
270 | { | ||
271 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
272 | struct irq_chip_type *ct = gc->chip_types; | ||
273 | unsigned int i; | ||
274 | |||
275 | for (i = 0; i < gc->num_ct; i++, ct++) { | ||
276 | if (ct->type & type) { | ||
277 | d->chip = &ct->chip; | ||
278 | irq_data_to_desc(d)->handle_irq = ct->handler; | ||
279 | return 0; | ||
280 | } | ||
281 | } | ||
282 | return -EINVAL; | ||
283 | } | ||
284 | |||
285 | /** | ||
286 | * irq_remove_generic_chip - Remove a chip | ||
287 | * @gc: Generic irq chip holding all data | ||
288 | * @msk: Bitmask holding the irqs to remove relative to gc->irq_base | ||
289 | * @clr: IRQ_* bits to clear | ||
290 | * @set: IRQ_* bits to set | ||
291 | * | ||
292 | * Remove up to 32 interrupts starting from gc->irq_base. | ||
293 | */ | ||
294 | void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
295 | unsigned int clr, unsigned int set) | ||
296 | { | ||
297 | unsigned int i = gc->irq_base; | ||
298 | |||
299 | raw_spin_lock(&gc_lock); | ||
300 | list_del(&gc->list); | ||
301 | raw_spin_unlock(&gc_lock); | ||
302 | |||
303 | for (; msk; msk >>= 1, i++) { | ||
304 | if (!(msk & 0x01)) | ||
305 | continue; | ||
306 | |||
307 | /* Remove handler first. That will mask the irq line */ | ||
308 | irq_set_handler(i, NULL); | ||
309 | irq_set_chip(i, &no_irq_chip); | ||
310 | irq_set_chip_data(i, NULL); | ||
311 | irq_modify_status(i, clr, set); | ||
312 | } | ||
313 | } | ||
314 | |||
315 | #ifdef CONFIG_PM | ||
316 | static int irq_gc_suspend(void) | ||
317 | { | ||
318 | struct irq_chip_generic *gc; | ||
319 | |||
320 | list_for_each_entry(gc, &gc_list, list) { | ||
321 | struct irq_chip_type *ct = gc->chip_types; | ||
322 | |||
323 | if (ct->chip.irq_suspend) | ||
324 | ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); | ||
325 | } | ||
326 | return 0; | ||
327 | } | ||
328 | |||
329 | static void irq_gc_resume(void) | ||
330 | { | ||
331 | struct irq_chip_generic *gc; | ||
332 | |||
333 | list_for_each_entry(gc, &gc_list, list) { | ||
334 | struct irq_chip_type *ct = gc->chip_types; | ||
335 | |||
336 | if (ct->chip.irq_resume) | ||
337 | ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); | ||
338 | } | ||
339 | } | ||
340 | #else | ||
341 | #define irq_gc_suspend NULL | ||
342 | #define irq_gc_resume NULL | ||
343 | #endif | ||
344 | |||
345 | static void irq_gc_shutdown(void) | ||
346 | { | ||
347 | struct irq_chip_generic *gc; | ||
348 | |||
349 | list_for_each_entry(gc, &gc_list, list) { | ||
350 | struct irq_chip_type *ct = gc->chip_types; | ||
351 | |||
352 | if (ct->chip.irq_pm_shutdown) | ||
353 | ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); | ||
354 | } | ||
355 | } | ||
356 | |||
357 | static struct syscore_ops irq_gc_syscore_ops = { | ||
358 | .suspend = irq_gc_suspend, | ||
359 | .resume = irq_gc_resume, | ||
360 | .shutdown = irq_gc_shutdown, | ||
361 | }; | ||
362 | |||
363 | static int __init irq_gc_init_ops(void) | ||
364 | { | ||
365 | register_syscore_ops(&irq_gc_syscore_ops); | ||
366 | return 0; | ||
367 | } | ||
368 | device_initcall(irq_gc_init_ops); | ||
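
The syscore hooks above only look at the primary irq_chip_type, so a controller that needs its mask state preserved over suspend would (continuing the hypothetical sketch above) just populate the PM callbacks of that chip. The save area and register polarity here are invented for illustration.

static u32 foo_saved_mask;		/* illustrative save area */

static void foo_irq_suspend(struct irq_data *d)
{
	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);

	foo_saved_mask = gc->mask_cache;
	/* mask everything that is not armed as a wakeup source */
	irq_reg_writel(~gc->wake_active, gc->reg_base + FOO_REG_MASK);
}

static void foo_irq_resume(struct irq_data *d)
{
	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);

	irq_reg_writel(foo_saved_mask, gc->reg_base + FOO_REG_MASK);
}

/* in foo_init_irq(), next to the other ct->chip assignments:
 *	ct->chip.irq_suspend = foo_irq_suspend;
 *	ct->chip.irq_resume  = foo_irq_resume;
 */
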
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 90cb55f6d7eb..470d08c82bbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | |||
133 | switch (res) { | 133 | switch (res) { |
134 | case IRQ_WAKE_THREAD: | 134 | case IRQ_WAKE_THREAD: |
135 | /* | 135 | /* |
136 | * Set result to handled so the spurious check | ||
137 | * does not trigger. | ||
138 | */ | ||
139 | res = IRQ_HANDLED; | ||
140 | |||
141 | /* | ||
142 | * Catch drivers which return WAKE_THREAD but | 136 | * Catch drivers which return WAKE_THREAD but |
143 | * did not set up a thread function | 137 | * did not set up a thread function |
144 | */ | 138 | */ |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2c039c9b9383..4c60a50e66b2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -22,7 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | static struct lock_class_key irq_desc_lock_class; | 23 | static struct lock_class_key irq_desc_lock_class; |
24 | 24 | ||
25 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | 25 | #if defined(CONFIG_SMP) |
26 | static void __init init_irq_default_affinity(void) | 26 | static void __init init_irq_default_affinity(void) |
27 | { | 27 | { |
28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | 28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); |
@@ -257,13 +257,11 @@ int __init early_irq_init(void) | |||
257 | count = ARRAY_SIZE(irq_desc); | 257 | count = ARRAY_SIZE(irq_desc); |
258 | 258 | ||
259 | for (i = 0; i < count; i++) { | 259 | for (i = 0; i < count; i++) { |
260 | desc[i].irq_data.irq = i; | ||
261 | desc[i].irq_data.chip = &no_irq_chip; | ||
262 | desc[i].kstat_irqs = alloc_percpu(unsigned int); | 260 | desc[i].kstat_irqs = alloc_percpu(unsigned int); |
263 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); | 261 | alloc_masks(&desc[i], GFP_KERNEL, node); |
264 | alloc_masks(desc + i, GFP_KERNEL, node); | 262 | raw_spin_lock_init(&desc[i].lock); |
265 | desc_smp_init(desc + i, node); | ||
266 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 263 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
264 | desc_set_defaults(i, &desc[i], node); | ||
267 | } | 265 | } |
268 | return arch_early_irq_init(); | 266 | return arch_early_irq_init(); |
269 | } | 267 | } |
@@ -290,6 +288,22 @@ static int irq_expand_nr_irqs(unsigned int nr) | |||
290 | 288 | ||
291 | #endif /* !CONFIG_SPARSE_IRQ */ | 289 | #endif /* !CONFIG_SPARSE_IRQ */ |
292 | 290 | ||
291 | /** | ||
292 | * generic_handle_irq - Invoke the handler for a particular irq | ||
293 | * @irq: The irq number to handle | ||
294 | * | ||
295 | */ | ||
296 | int generic_handle_irq(unsigned int irq) | ||
297 | { | ||
298 | struct irq_desc *desc = irq_to_desc(irq); | ||
299 | |||
300 | if (!desc) | ||
301 | return -EINVAL; | ||
302 | generic_handle_irq_desc(irq, desc); | ||
303 | return 0; | ||
304 | } | ||
305 | EXPORT_SYMBOL_GPL(generic_handle_irq); | ||
306 | |||
293 | /* Dynamic interrupt handling */ | 307 | /* Dynamic interrupt handling */ |
294 | 308 | ||
295 | /** | 309 | /** |
@@ -311,6 +325,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) | |||
311 | bitmap_clear(allocated_irqs, from, cnt); | 325 | bitmap_clear(allocated_irqs, from, cnt); |
312 | mutex_unlock(&sparse_irq_lock); | 326 | mutex_unlock(&sparse_irq_lock); |
313 | } | 327 | } |
328 | EXPORT_SYMBOL_GPL(irq_free_descs); | ||
314 | 329 | ||
315 | /** | 330 | /** |
316 | * irq_alloc_descs - allocate and initialize a range of irq descriptors | 331 | * irq_alloc_descs - allocate and initialize a range of irq descriptors |
@@ -329,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | |||
329 | if (!cnt) | 344 | if (!cnt) |
330 | return -EINVAL; | 345 | return -EINVAL; |
331 | 346 | ||
347 | if (irq >= 0) { | ||
348 | if (from > irq) | ||
349 | return -EINVAL; | ||
350 | from = irq; | ||
351 | } | ||
352 | |||
332 | mutex_lock(&sparse_irq_lock); | 353 | mutex_lock(&sparse_irq_lock); |
333 | 354 | ||
334 | start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, | 355 | start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, |
@@ -351,6 +372,7 @@ err: | |||
351 | mutex_unlock(&sparse_irq_lock); | 372 | mutex_unlock(&sparse_irq_lock); |
352 | return ret; | 373 | return ret; |
353 | } | 374 | } |
375 | EXPORT_SYMBOL_GPL(irq_alloc_descs); | ||
354 | 376 | ||
355 | /** | 377 | /** |
356 | * irq_reserve_irqs - mark irqs allocated | 378 | * irq_reserve_irqs - mark irqs allocated |
@@ -430,7 +452,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | |||
430 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | 452 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; |
431 | } | 453 | } |
432 | 454 | ||
433 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
434 | unsigned int kstat_irqs(unsigned int irq) | 455 | unsigned int kstat_irqs(unsigned int irq) |
435 | { | 456 | { |
436 | struct irq_desc *desc = irq_to_desc(irq); | 457 | struct irq_desc *desc = irq_to_desc(irq); |
@@ -443,4 +464,3 @@ unsigned int kstat_irqs(unsigned int irq) | |||
443 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); | 464 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
444 | return sum; | 465 | return sum; |
445 | } | 466 | } |
446 | #endif /* CONFIG_GENERIC_HARDIRQS */ | ||
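
The EXPORT_SYMBOL_GPL additions let such a modular driver also carve out its own descriptor range; with the new range check, a non-negative irq asks for exactly that number while a negative one means "first free range at or above from". A sketch with invented names:

#include <linux/irq.h>

static int foo_alloc_child_irqs(unsigned int nr)
{
	int base;

	/* any free range of nr descriptors at or above irq 64, no NUMA node */
	base = irq_alloc_descs(-1, 64, nr, -1);
	if (base < 0)
		return base;

	/* ... irq_set_chip_and_handler()/irq_set_chip_data() per child ... */
	return base;
}

static void foo_free_child_irqs(unsigned int base, unsigned int nr)
{
	irq_free_descs(base, nr);
}
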
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 07c1611f3899..0a7840aeb0fb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) | |||
491 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | 491 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); |
492 | int ret = 0; | 492 | int ret = 0; |
493 | 493 | ||
494 | if (!desc) | ||
495 | return -EINVAL; | ||
496 | |||
494 | /* wakeup-capable irqs can be shared between drivers that | 497 | /* wakeup-capable irqs can be shared between drivers that |
495 | * don't need to have the same sleep mode behaviors. | 498 | * don't need to have the same sleep mode behaviors. |
496 | */ | 499 | */ |
@@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
723 | * context. So we need to disable bh here to avoid deadlocks and other | 726 | * context. So we need to disable bh here to avoid deadlocks and other |
724 | * side effects. | 727 | * side effects. |
725 | */ | 728 | */ |
726 | static void | 729 | static irqreturn_t |
727 | irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | 730 | irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) |
728 | { | 731 | { |
732 | irqreturn_t ret; | ||
733 | |||
729 | local_bh_disable(); | 734 | local_bh_disable(); |
730 | action->thread_fn(action->irq, action->dev_id); | 735 | ret = action->thread_fn(action->irq, action->dev_id); |
731 | irq_finalize_oneshot(desc, action, false); | 736 | irq_finalize_oneshot(desc, action, false); |
732 | local_bh_enable(); | 737 | local_bh_enable(); |
738 | return ret; | ||
733 | } | 739 | } |
734 | 740 | ||
735 | /* | 741 | /* |
@@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | |||
737 | * preemptible - many of them need to sleep and wait for slow busses to | 743 | * preemptible - many of them need to sleep and wait for slow busses to |
738 | * complete. | 744 | * complete. |
739 | */ | 745 | */ |
740 | static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) | 746 | static irqreturn_t irq_thread_fn(struct irq_desc *desc, |
747 | struct irqaction *action) | ||
741 | { | 748 | { |
742 | action->thread_fn(action->irq, action->dev_id); | 749 | irqreturn_t ret; |
750 | |||
751 | ret = action->thread_fn(action->irq, action->dev_id); | ||
743 | irq_finalize_oneshot(desc, action, false); | 752 | irq_finalize_oneshot(desc, action, false); |
753 | return ret; | ||
744 | } | 754 | } |
745 | 755 | ||
746 | /* | 756 | /* |
@@ -753,7 +763,8 @@ static int irq_thread(void *data) | |||
753 | }; | 763 | }; |
754 | struct irqaction *action = data; | 764 | struct irqaction *action = data; |
755 | struct irq_desc *desc = irq_to_desc(action->irq); | 765 | struct irq_desc *desc = irq_to_desc(action->irq); |
756 | void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); | 766 | irqreturn_t (*handler_fn)(struct irq_desc *desc, |
767 | struct irqaction *action); | ||
757 | int wake; | 768 | int wake; |
758 | 769 | ||
759 | if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, | 770 | if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, |
@@ -783,8 +794,12 @@ static int irq_thread(void *data) | |||
783 | desc->istate |= IRQS_PENDING; | 794 | desc->istate |= IRQS_PENDING; |
784 | raw_spin_unlock_irq(&desc->lock); | 795 | raw_spin_unlock_irq(&desc->lock); |
785 | } else { | 796 | } else { |
797 | irqreturn_t action_ret; | ||
798 | |||
786 | raw_spin_unlock_irq(&desc->lock); | 799 | raw_spin_unlock_irq(&desc->lock); |
787 | handler_fn(desc, action); | 800 | action_ret = handler_fn(desc, action); |
801 | if (!noirqdebug) | ||
802 | note_interrupt(action->irq, desc, action_ret); | ||
788 | } | 803 | } |
789 | 804 | ||
790 | wake = atomic_dec_and_test(&desc->threads_active); | 805 | wake = atomic_dec_and_test(&desc->threads_active); |
@@ -900,7 +915,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
900 | */ | 915 | */ |
901 | new->handler = irq_nested_primary_handler; | 916 | new->handler = irq_nested_primary_handler; |
902 | } else { | 917 | } else { |
903 | irq_setup_forced_threading(new); | 918 | if (irq_settings_can_thread(desc)) |
919 | irq_setup_forced_threading(new); | ||
904 | } | 920 | } |
905 | 921 | ||
906 | /* | 922 | /* |
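
With both thread handler variants now returning irqreturn_t and feeding note_interrupt(), a threaded driver is expected to report honestly from both halves; a minimal hedged sketch (device, registers and helpers are invented):

#include <linux/interrupt.h>
#include <linux/io.h>

struct foo_dev {			/* hypothetical device state */
	void __iomem *regs;
};

#define FOO_STAT	0x00
#define FOO_PENDING	0x01

static void foo_process(struct foo_dev *foo);	/* does the sleeping work */

static irqreturn_t foo_quick(int irq, void *dev_id)
{
	struct foo_dev *foo = dev_id;

	if (!(readl(foo->regs + FOO_STAT) & FOO_PENDING))
		return IRQ_NONE;	/* not ours: counts towards spurious */
	return IRQ_WAKE_THREAD;		/* real work happens in the thread */
}

static irqreturn_t foo_thread(int irq, void *dev_id)
{
	struct foo_dev *foo = dev_id;

	foo_process(foo);		/* may sleep */
	return IRQ_HANDLED;		/* now passed on to note_interrupt() */
}

static int foo_request(struct foo_dev *foo, int irq)
{
	return request_threaded_irq(irq, foo_quick, foo_thread, 0,
				    "foo", foo);
}
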
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 834899f2500f..4bd4faa6323a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; | |||
19 | 19 | ||
20 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
21 | 21 | ||
22 | static int irq_affinity_proc_show(struct seq_file *m, void *v) | 22 | static int show_irq_affinity(int type, struct seq_file *m, void *v) |
23 | { | 23 | { |
24 | struct irq_desc *desc = irq_to_desc((long)m->private); | 24 | struct irq_desc *desc = irq_to_desc((long)m->private); |
25 | const struct cpumask *mask = desc->irq_data.affinity; | 25 | const struct cpumask *mask = desc->irq_data.affinity; |
@@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) | |||
28 | if (irqd_is_setaffinity_pending(&desc->irq_data)) | 28 | if (irqd_is_setaffinity_pending(&desc->irq_data)) |
29 | mask = desc->pending_mask; | 29 | mask = desc->pending_mask; |
30 | #endif | 30 | #endif |
31 | seq_cpumask(m, mask); | 31 | if (type) |
32 | seq_cpumask_list(m, mask); | ||
33 | else | ||
34 | seq_cpumask(m, mask); | ||
32 | seq_putc(m, '\n'); | 35 | seq_putc(m, '\n'); |
33 | return 0; | 36 | return 0; |
34 | } | 37 | } |
@@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) | |||
59 | #endif | 62 | #endif |
60 | 63 | ||
61 | int no_irq_affinity; | 64 | int no_irq_affinity; |
62 | static ssize_t irq_affinity_proc_write(struct file *file, | 65 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
66 | { | ||
67 | return show_irq_affinity(0, m, v); | ||
68 | } | ||
69 | |||
70 | static int irq_affinity_list_proc_show(struct seq_file *m, void *v) | ||
71 | { | ||
72 | return show_irq_affinity(1, m, v); | ||
73 | } | ||
74 | |||
75 | |||
76 | static ssize_t write_irq_affinity(int type, struct file *file, | ||
63 | const char __user *buffer, size_t count, loff_t *pos) | 77 | const char __user *buffer, size_t count, loff_t *pos) |
64 | { | 78 | { |
65 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; | 79 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; |
@@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
72 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) | 86 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) |
73 | return -ENOMEM; | 87 | return -ENOMEM; |
74 | 88 | ||
75 | err = cpumask_parse_user(buffer, count, new_value); | 89 | if (type) |
90 | err = cpumask_parselist_user(buffer, count, new_value); | ||
91 | else | ||
92 | err = cpumask_parse_user(buffer, count, new_value); | ||
76 | if (err) | 93 | if (err) |
77 | goto free_cpumask; | 94 | goto free_cpumask; |
78 | 95 | ||
@@ -100,11 +117,28 @@ free_cpumask: | |||
100 | return err; | 117 | return err; |
101 | } | 118 | } |
102 | 119 | ||
120 | static ssize_t irq_affinity_proc_write(struct file *file, | ||
121 | const char __user *buffer, size_t count, loff_t *pos) | ||
122 | { | ||
123 | return write_irq_affinity(0, file, buffer, count, pos); | ||
124 | } | ||
125 | |||
126 | static ssize_t irq_affinity_list_proc_write(struct file *file, | ||
127 | const char __user *buffer, size_t count, loff_t *pos) | ||
128 | { | ||
129 | return write_irq_affinity(1, file, buffer, count, pos); | ||
130 | } | ||
131 | |||
103 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) | 132 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) |
104 | { | 133 | { |
105 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); | 134 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); |
106 | } | 135 | } |
107 | 136 | ||
137 | static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) | ||
138 | { | ||
139 | return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); | ||
140 | } | ||
141 | |||
108 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) | 142 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) |
109 | { | 143 | { |
110 | return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); | 144 | return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); |
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = { | |||
125 | .release = single_release, | 159 | .release = single_release, |
126 | }; | 160 | }; |
127 | 161 | ||
162 | static const struct file_operations irq_affinity_list_proc_fops = { | ||
163 | .open = irq_affinity_list_proc_open, | ||
164 | .read = seq_read, | ||
165 | .llseek = seq_lseek, | ||
166 | .release = single_release, | ||
167 | .write = irq_affinity_list_proc_write, | ||
168 | }; | ||
169 | |||
128 | static int default_affinity_show(struct seq_file *m, void *v) | 170 | static int default_affinity_show(struct seq_file *m, void *v) |
129 | { | 171 | { |
130 | seq_cpumask(m, irq_default_affinity); | 172 | seq_cpumask(m, irq_default_affinity); |
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
289 | proc_create_data("affinity_hint", 0400, desc->dir, | 331 | proc_create_data("affinity_hint", 0400, desc->dir, |
290 | &irq_affinity_hint_proc_fops, (void *)(long)irq); | 332 | &irq_affinity_hint_proc_fops, (void *)(long)irq); |
291 | 333 | ||
334 | /* create /proc/irq/<irq>/smp_affinity_list */ | ||
335 | proc_create_data("smp_affinity_list", 0600, desc->dir, | ||
336 | &irq_affinity_list_proc_fops, (void *)(long)irq); | ||
337 | |||
292 | proc_create_data("node", 0444, desc->dir, | 338 | proc_create_data("node", 0444, desc->dir, |
293 | &irq_node_proc_fops, (void *)(long)irq); | 339 | &irq_node_proc_fops, (void *)(long)irq); |
294 | #endif | 340 | #endif |
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
306 | #ifdef CONFIG_SMP | 352 | #ifdef CONFIG_SMP |
307 | remove_proc_entry("smp_affinity", desc->dir); | 353 | remove_proc_entry("smp_affinity", desc->dir); |
308 | remove_proc_entry("affinity_hint", desc->dir); | 354 | remove_proc_entry("affinity_hint", desc->dir); |
355 | remove_proc_entry("smp_affinity_list", desc->dir); | ||
309 | remove_proc_entry("node", desc->dir); | 356 | remove_proc_entry("node", desc->dir); |
310 | #endif | 357 | #endif |
311 | remove_proc_entry("spurious", desc->dir); | 358 | remove_proc_entry("spurious", desc->dir); |
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0d91730b6330..f1667833d444 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h | |||
@@ -8,6 +8,7 @@ enum { | |||
8 | _IRQ_LEVEL = IRQ_LEVEL, | 8 | _IRQ_LEVEL = IRQ_LEVEL, |
9 | _IRQ_NOPROBE = IRQ_NOPROBE, | 9 | _IRQ_NOPROBE = IRQ_NOPROBE, |
10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, | 10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, |
11 | _IRQ_NOTHREAD = IRQ_NOTHREAD, | ||
11 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, | 12 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, |
12 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, | 13 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, |
13 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | 14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, |
@@ -20,6 +21,7 @@ enum { | |||
20 | #define IRQ_LEVEL GOT_YOU_MORON | 21 | #define IRQ_LEVEL GOT_YOU_MORON |
21 | #define IRQ_NOPROBE GOT_YOU_MORON | 22 | #define IRQ_NOPROBE GOT_YOU_MORON |
22 | #define IRQ_NOREQUEST GOT_YOU_MORON | 23 | #define IRQ_NOREQUEST GOT_YOU_MORON |
24 | #define IRQ_NOTHREAD GOT_YOU_MORON | ||
23 | #define IRQ_NOAUTOEN GOT_YOU_MORON | 25 | #define IRQ_NOAUTOEN GOT_YOU_MORON |
24 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | 26 | #define IRQ_NESTED_THREAD GOT_YOU_MORON |
25 | #undef IRQF_MODIFY_MASK | 27 | #undef IRQF_MODIFY_MASK |
@@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc) | |||
94 | desc->status_use_accessors |= _IRQ_NOREQUEST; | 96 | desc->status_use_accessors |= _IRQ_NOREQUEST; |
95 | } | 97 | } |
96 | 98 | ||
99 | static inline bool irq_settings_can_thread(struct irq_desc *desc) | ||
100 | { | ||
101 | return !(desc->status_use_accessors & _IRQ_NOTHREAD); | ||
102 | } | ||
103 | |||
104 | static inline void irq_settings_clr_nothread(struct irq_desc *desc) | ||
105 | { | ||
106 | desc->status_use_accessors &= ~_IRQ_NOTHREAD; | ||
107 | } | ||
108 | |||
109 | static inline void irq_settings_set_nothread(struct irq_desc *desc) | ||
110 | { | ||
111 | desc->status_use_accessors |= _IRQ_NOTHREAD; | ||
112 | } | ||
113 | |||
97 | static inline bool irq_settings_can_probe(struct irq_desc *desc) | 114 | static inline bool irq_settings_can_probe(struct irq_desc *desc) |
98 | { | 115 | { |
99 | return !(desc->status_use_accessors & _IRQ_NOPROBE); | 116 | return !(desc->status_use_accessors & _IRQ_NOPROBE); |
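
A sketch of the intended use of the new flag: code that must never see its handler pushed into a thread (a timer or chained-demux line, say) marks the descriptor before the handler is requested. irq_set_status_flags() is the generic wrapper around irq_modify_status(); this assumes IRQ_NOTHREAD is accepted there, i.e. included in IRQF_MODIFY_MASK.

#include <linux/irq.h>

static void __init foo_mark_timer_irq(unsigned int irq)
{
	/*
	 * With _IRQ_NOTHREAD set, irq_settings_can_thread() returns false
	 * and __setup_irq() skips irq_setup_forced_threading() for this line.
	 */
	irq_set_status_flags(irq, IRQ_NOTHREAD);
}
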
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dfbd550401b2..aa57d5da18c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -167,6 +167,13 @@ out: | |||
167 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 167 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
168 | } | 168 | } |
169 | 169 | ||
170 | static inline int bad_action_ret(irqreturn_t action_ret) | ||
171 | { | ||
172 | if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) | ||
173 | return 0; | ||
174 | return 1; | ||
175 | } | ||
176 | |||
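
The check works because of how the irqreturn values combine (IRQ_NONE = 0x0, IRQ_HANDLED = 0x1, IRQ_WAKE_THREAD = 0x2 in include/linux/irqreturn.h), so 0x3 is the largest legitimate value; an illustrative restatement, not kernel code:

#include <linux/types.h>
#include <linux/irqreturn.h>

static inline bool foo_ret_is_sane(irqreturn_t ret)	/* illustrative only */
{
	return ret == IRQ_NONE || ret == IRQ_HANDLED ||
	       ret == IRQ_WAKE_THREAD ||
	       ret == (IRQ_HANDLED | IRQ_WAKE_THREAD);
}
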
170 | /* | 177 | /* |
171 | * If 99,900 of the previous 100,000 interrupts have not been handled | 178 | * If 99,900 of the previous 100,000 interrupts have not been handled |
172 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic | 179 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic |
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
182 | struct irqaction *action; | 189 | struct irqaction *action; |
183 | unsigned long flags; | 190 | unsigned long flags; |
184 | 191 | ||
185 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { | 192 | if (bad_action_ret(action_ret)) { |
186 | printk(KERN_ERR "irq event %d: bogus return value %x\n", | 193 | printk(KERN_ERR "irq event %d: bogus return value %x\n", |
187 | irq, action_ret); | 194 | irq, action_ret); |
188 | } else { | 195 | } else { |
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
201 | raw_spin_lock_irqsave(&desc->lock, flags); | 208 | raw_spin_lock_irqsave(&desc->lock, flags); |
202 | action = desc->action; | 209 | action = desc->action; |
203 | while (action) { | 210 | while (action) { |
204 | printk(KERN_ERR "[<%p>]", action->handler); | 211 | printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); |
205 | print_symbol(" (%s)", | 212 | if (action->thread_fn) |
206 | (unsigned long)action->handler); | 213 | printk(KERN_CONT " threaded [<%p>] %pf", |
207 | printk("\n"); | 214 | action->thread_fn, action->thread_fn); |
215 | printk(KERN_CONT "\n"); | ||
208 | action = action->next; | 216 | action = action->next; |
209 | } | 217 | } |
210 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 218 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
262 | if (desc->istate & IRQS_POLL_INPROGRESS) | 270 | if (desc->istate & IRQS_POLL_INPROGRESS) |
263 | return; | 271 | return; |
264 | 272 | ||
265 | if (unlikely(action_ret != IRQ_HANDLED)) { | 273 | /* we get here again via the threaded handler */ |
274 | if (action_ret == IRQ_WAKE_THREAD) | ||
275 | return; | ||
276 | |||
277 | if (bad_action_ret(action_ret)) { | ||
278 | report_bad_irq(irq, desc, action_ret); | ||
279 | return; | ||
280 | } | ||
281 | |||
282 | if (unlikely(action_ret == IRQ_NONE)) { | ||
266 | /* | 283 | /* |
267 | * If we are seeing only the odd spurious IRQ caused by | 284 | * If we are seeing only the odd spurious IRQ caused by |
268 | * bus asynchronicity then don't eventually trigger an error, | 285 | * bus asynchronicity then don't eventually trigger an error, |
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
274 | else | 291 | else |
275 | desc->irqs_unhandled++; | 292 | desc->irqs_unhandled++; |
276 | desc->last_unhandled = jiffies; | 293 | desc->last_unhandled = jiffies; |
277 | if (unlikely(action_ret != IRQ_NONE)) | ||
278 | report_bad_irq(irq, desc, action_ret); | ||
279 | } | 294 | } |
280 | 295 | ||
281 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { | 296 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3b79bd938330..a8ce45097f3d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -2,43 +2,23 @@ | |||
2 | * jump label support | 2 | * jump label support |
3 | * | 3 | * |
4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | 4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> |
5 | * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> | ||
5 | * | 6 | * |
6 | */ | 7 | */ |
7 | #include <linux/jump_label.h> | ||
8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/list.h> | 11 | #include <linux/list.h> |
12 | #include <linux/jhash.h> | ||
13 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
14 | #include <linux/sort.h> | 13 | #include <linux/sort.h> |
15 | #include <linux/err.h> | 14 | #include <linux/err.h> |
15 | #include <linux/jump_label.h> | ||
16 | 16 | ||
17 | #ifdef HAVE_JUMP_LABEL | 17 | #ifdef HAVE_JUMP_LABEL |
18 | 18 | ||
19 | #define JUMP_LABEL_HASH_BITS 6 | ||
20 | #define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) | ||
21 | static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; | ||
22 | |||
23 | /* mutex to protect coming/going of the jump_label table */ | 19 | /* mutex to protect coming/going of the jump_label table */ |
24 | static DEFINE_MUTEX(jump_label_mutex); | 20 | static DEFINE_MUTEX(jump_label_mutex); |
25 | 21 | ||
26 | struct jump_label_entry { | ||
27 | struct hlist_node hlist; | ||
28 | struct jump_entry *table; | ||
29 | int nr_entries; | ||
30 | /* hang modules off here */ | ||
31 | struct hlist_head modules; | ||
32 | unsigned long key; | ||
33 | }; | ||
34 | |||
35 | struct jump_label_module_entry { | ||
36 | struct hlist_node hlist; | ||
37 | struct jump_entry *table; | ||
38 | int nr_entries; | ||
39 | struct module *mod; | ||
40 | }; | ||
41 | |||
42 | void jump_label_lock(void) | 22 | void jump_label_lock(void) |
43 | { | 23 | { |
44 | mutex_lock(&jump_label_mutex); | 24 | mutex_lock(&jump_label_mutex); |
@@ -49,6 +29,11 @@ void jump_label_unlock(void) | |||
49 | mutex_unlock(&jump_label_mutex); | 29 | mutex_unlock(&jump_label_mutex); |
50 | } | 30 | } |
51 | 31 | ||
32 | bool jump_label_enabled(struct jump_label_key *key) | ||
33 | { | ||
34 | return !!atomic_read(&key->enabled); | ||
35 | } | ||
36 | |||
52 | static int jump_label_cmp(const void *a, const void *b) | 37 | static int jump_label_cmp(const void *a, const void *b) |
53 | { | 38 | { |
54 | const struct jump_entry *jea = a; | 39 | const struct jump_entry *jea = a; |
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b) | |||
64 | } | 49 | } |
65 | 50 | ||
66 | static void | 51 | static void |
67 | sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | 52 | jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) |
68 | { | 53 | { |
69 | unsigned long size; | 54 | unsigned long size; |
70 | 55 | ||
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | |||
73 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | 58 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); |
74 | } | 59 | } |
75 | 60 | ||
76 | static struct jump_label_entry *get_jump_label_entry(jump_label_t key) | 61 | static void jump_label_update(struct jump_label_key *key, int enable); |
77 | { | ||
78 | struct hlist_head *head; | ||
79 | struct hlist_node *node; | ||
80 | struct jump_label_entry *e; | ||
81 | u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
82 | |||
83 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
84 | hlist_for_each_entry(e, node, head, hlist) { | ||
85 | if (key == e->key) | ||
86 | return e; | ||
87 | } | ||
88 | return NULL; | ||
89 | } | ||
90 | 62 | ||
91 | static struct jump_label_entry * | 63 | void jump_label_inc(struct jump_label_key *key) |
92 | add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) | ||
93 | { | 64 | { |
94 | struct hlist_head *head; | 65 | if (atomic_inc_not_zero(&key->enabled)) |
95 | struct jump_label_entry *e; | 66 | return; |
96 | u32 hash; | ||
97 | |||
98 | e = get_jump_label_entry(key); | ||
99 | if (e) | ||
100 | return ERR_PTR(-EEXIST); | ||
101 | |||
102 | e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); | ||
103 | if (!e) | ||
104 | return ERR_PTR(-ENOMEM); | ||
105 | |||
106 | hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
107 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
108 | e->key = key; | ||
109 | e->table = table; | ||
110 | e->nr_entries = nr_entries; | ||
111 | INIT_HLIST_HEAD(&(e->modules)); | ||
112 | hlist_add_head(&e->hlist, head); | ||
113 | return e; | ||
114 | } | ||
115 | 67 | ||
116 | static int | 68 | jump_label_lock(); |
117 | build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) | 69 | if (atomic_add_return(1, &key->enabled) == 1) |
118 | { | 70 | jump_label_update(key, JUMP_LABEL_ENABLE); |
119 | struct jump_entry *iter, *iter_begin; | 71 | jump_label_unlock(); |
120 | struct jump_label_entry *entry; | ||
121 | int count; | ||
122 | |||
123 | sort_jump_label_entries(start, stop); | ||
124 | iter = start; | ||
125 | while (iter < stop) { | ||
126 | entry = get_jump_label_entry(iter->key); | ||
127 | if (!entry) { | ||
128 | iter_begin = iter; | ||
129 | count = 0; | ||
130 | while ((iter < stop) && | ||
131 | (iter->key == iter_begin->key)) { | ||
132 | iter++; | ||
133 | count++; | ||
134 | } | ||
135 | entry = add_jump_label_entry(iter_begin->key, | ||
136 | count, iter_begin); | ||
137 | if (IS_ERR(entry)) | ||
138 | return PTR_ERR(entry); | ||
139 | } else { | ||
140 | WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); | ||
141 | return -1; | ||
142 | } | ||
143 | } | ||
144 | return 0; | ||
145 | } | 72 | } |
146 | 73 | ||
147 | /*** | 74 | void jump_label_dec(struct jump_label_key *key) |
148 | * jump_label_update - update jump label text | ||
149 | * @key - key value associated with a a jump label | ||
150 | * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE | ||
151 | * | ||
152 | * Will enable/disable the jump for jump label @key, depending on the | ||
153 | * value of @type. | ||
154 | * | ||
155 | */ | ||
156 | |||
157 | void jump_label_update(unsigned long key, enum jump_label_type type) | ||
158 | { | 75 | { |
159 | struct jump_entry *iter; | 76 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) |
160 | struct jump_label_entry *entry; | 77 | return; |
161 | struct hlist_node *module_node; | ||
162 | struct jump_label_module_entry *e_module; | ||
163 | int count; | ||
164 | 78 | ||
165 | jump_label_lock(); | 79 | jump_label_update(key, JUMP_LABEL_DISABLE); |
166 | entry = get_jump_label_entry((jump_label_t)key); | ||
167 | if (entry) { | ||
168 | count = entry->nr_entries; | ||
169 | iter = entry->table; | ||
170 | while (count--) { | ||
171 | if (kernel_text_address(iter->code)) | ||
172 | arch_jump_label_transform(iter, type); | ||
173 | iter++; | ||
174 | } | ||
175 | /* eanble/disable jump labels in modules */ | ||
176 | hlist_for_each_entry(e_module, module_node, &(entry->modules), | ||
177 | hlist) { | ||
178 | count = e_module->nr_entries; | ||
179 | iter = e_module->table; | ||
180 | while (count--) { | ||
181 | if (iter->key && | ||
182 | kernel_text_address(iter->code)) | ||
183 | arch_jump_label_transform(iter, type); | ||
184 | iter++; | ||
185 | } | ||
186 | } | ||
187 | } | ||
188 | jump_label_unlock(); | 80 | jump_label_unlock(); |
189 | } | 81 | } |
190 | 82 | ||
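
A user of the reworked interface now hangs everything off a struct jump_label_key; roughly as below (a sketch that assumes the static_branch() accessor introduced alongside this series, with invented foo_* names):

#include <linux/jump_label.h>

static struct jump_label_key foo_trace_key;	/* starts disabled */

static void foo_emit_trace(void);		/* hypothetical slow path */

static inline void foo_hot_path(void)
{
	/* compiled as a nop until the enabled count goes 0 -> 1 */
	if (static_branch(&foo_trace_key))
		foo_emit_trace();
}

void foo_trace_enable(void)
{
	jump_label_inc(&foo_trace_key);		/* patches all sites on 0 -> 1 */
}

void foo_trace_disable(void)
{
	jump_label_dec(&foo_trace_key);		/* restores the nops on 1 -> 0 */
}
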
@@ -197,77 +89,36 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) | |||
197 | return 0; | 89 | return 0; |
198 | } | 90 | } |
199 | 91 | ||
200 | #ifdef CONFIG_MODULES | 92 | static int __jump_label_text_reserved(struct jump_entry *iter_start, |
201 | 93 | struct jump_entry *iter_stop, void *start, void *end) | |
202 | static int module_conflict(void *start, void *end) | ||
203 | { | ||
204 | struct hlist_head *head; | ||
205 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
206 | struct jump_label_entry *e; | ||
207 | struct jump_label_module_entry *e_module; | ||
208 | struct jump_entry *iter; | ||
209 | int i, count; | ||
210 | int conflict = 0; | ||
211 | |||
212 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
213 | head = &jump_label_table[i]; | ||
214 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
215 | hlist_for_each_entry_safe(e_module, module_node, | ||
216 | module_node_next, | ||
217 | &(e->modules), hlist) { | ||
218 | count = e_module->nr_entries; | ||
219 | iter = e_module->table; | ||
220 | while (count--) { | ||
221 | if (addr_conflict(iter, start, end)) { | ||
222 | conflict = 1; | ||
223 | goto out; | ||
224 | } | ||
225 | iter++; | ||
226 | } | ||
227 | } | ||
228 | } | ||
229 | } | ||
230 | out: | ||
231 | return conflict; | ||
232 | } | ||
233 | |||
234 | #endif | ||
235 | |||
236 | /*** | ||
237 | * jump_label_text_reserved - check if addr range is reserved | ||
238 | * @start: start text addr | ||
239 | * @end: end text addr | ||
240 | * | ||
241 | * checks if the text addr located between @start and @end | ||
242 | * overlaps with any of the jump label patch addresses. Code | ||
243 | * that wants to modify kernel text should first verify that | ||
244 | * it does not overlap with any of the jump label addresses. | ||
245 | * Caller must hold jump_label_mutex. | ||
246 | * | ||
247 | * returns 1 if there is an overlap, 0 otherwise | ||
248 | */ | ||
249 | int jump_label_text_reserved(void *start, void *end) | ||
250 | { | 94 | { |
251 | struct jump_entry *iter; | 95 | struct jump_entry *iter; |
252 | struct jump_entry *iter_start = __start___jump_table; | ||
253 | struct jump_entry *iter_stop = __start___jump_table; | ||
254 | int conflict = 0; | ||
255 | 96 | ||
256 | iter = iter_start; | 97 | iter = iter_start; |
257 | while (iter < iter_stop) { | 98 | while (iter < iter_stop) { |
258 | if (addr_conflict(iter, start, end)) { | 99 | if (addr_conflict(iter, start, end)) |
259 | conflict = 1; | 100 | return 1; |
260 | goto out; | ||
261 | } | ||
262 | iter++; | 101 | iter++; |
263 | } | 102 | } |
264 | 103 | ||
265 | /* now check modules */ | 104 | return 0; |
266 | #ifdef CONFIG_MODULES | 105 | } |
267 | conflict = module_conflict(start, end); | 106 | |
268 | #endif | 107 | static void __jump_label_update(struct jump_label_key *key, |
269 | out: | 108 | struct jump_entry *entry, |
270 | return conflict; | 109 | struct jump_entry *stop, int enable) |
110 | { | ||
111 | for (; (entry < stop) && | ||
112 | (entry->key == (jump_label_t)(unsigned long)key); | ||
113 | entry++) { | ||
114 | /* | ||
115 | * entry->code set to 0 invalidates module init text sections | ||
116 | * kernel_text_address() verifies we are not in core kernel | ||
117 | * init code, see jump_label_invalidate_module_init(). | ||
118 | */ | ||
119 | if (entry->code && kernel_text_address(entry->code)) | ||
120 | arch_jump_label_transform(entry, enable); | ||
121 | } | ||
271 | } | 122 | } |
272 | 123 | ||
273 | /* | 124 | /* |
@@ -277,145 +128,181 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr) | |||
277 | { | 128 | { |
278 | } | 129 | } |
279 | 130 | ||
280 | static __init int init_jump_label(void) | 131 | static __init int jump_label_init(void) |
281 | { | 132 | { |
282 | int ret; | ||
283 | struct jump_entry *iter_start = __start___jump_table; | 133 | struct jump_entry *iter_start = __start___jump_table; |
284 | struct jump_entry *iter_stop = __stop___jump_table; | 134 | struct jump_entry *iter_stop = __stop___jump_table; |
135 | struct jump_label_key *key = NULL; | ||
285 | struct jump_entry *iter; | 136 | struct jump_entry *iter; |
286 | 137 | ||
287 | jump_label_lock(); | 138 | jump_label_lock(); |
288 | ret = build_jump_label_hashtable(__start___jump_table, | 139 | jump_label_sort_entries(iter_start, iter_stop); |
289 | __stop___jump_table); | 140 | |
290 | iter = iter_start; | 141 | for (iter = iter_start; iter < iter_stop; iter++) { |
291 | while (iter < iter_stop) { | ||
292 | arch_jump_label_text_poke_early(iter->code); | 142 | arch_jump_label_text_poke_early(iter->code); |
293 | iter++; | 143 | if (iter->key == (jump_label_t)(unsigned long)key) |
144 | continue; | ||
145 | |||
146 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
147 | atomic_set(&key->enabled, 0); | ||
148 | key->entries = iter; | ||
149 | #ifdef CONFIG_MODULES | ||
150 | key->next = NULL; | ||
151 | #endif | ||
294 | } | 152 | } |
295 | jump_label_unlock(); | 153 | jump_label_unlock(); |
296 | return ret; | 154 | |
155 | return 0; | ||
297 | } | 156 | } |
298 | early_initcall(init_jump_label); | 157 | early_initcall(jump_label_init); |
299 | 158 | ||
300 | #ifdef CONFIG_MODULES | 159 | #ifdef CONFIG_MODULES |
301 | 160 | ||
302 | static struct jump_label_module_entry * | 161 | struct jump_label_mod { |
303 | add_jump_label_module_entry(struct jump_label_entry *entry, | 162 | struct jump_label_mod *next; |
304 | struct jump_entry *iter_begin, | 163 | struct jump_entry *entries; |
305 | int count, struct module *mod) | 164 | struct module *mod; |
165 | }; | ||
166 | |||
167 | static int __jump_label_mod_text_reserved(void *start, void *end) | ||
306 | { | 168 | { |
307 | struct jump_label_module_entry *e; | 169 | struct module *mod; |
308 | 170 | ||
309 | e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); | 171 | mod = __module_text_address((unsigned long)start); |
310 | if (!e) | 172 | if (!mod) |
311 | return ERR_PTR(-ENOMEM); | 173 | return 0; |
312 | e->mod = mod; | 174 | |
313 | e->nr_entries = count; | 175 | WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); |
314 | e->table = iter_begin; | 176 | |
315 | hlist_add_head(&e->hlist, &entry->modules); | 177 | return __jump_label_text_reserved(mod->jump_entries, |
316 | return e; | 178 | mod->jump_entries + mod->num_jump_entries, |
179 | start, end); | ||
317 | } | 180 | } |
318 | 181 | ||
319 | static int add_jump_label_module(struct module *mod) | 182 | static void __jump_label_mod_update(struct jump_label_key *key, int enable) |
320 | { | 183 | { |
321 | struct jump_entry *iter, *iter_begin; | 184 | struct jump_label_mod *mod = key->next; |
322 | struct jump_label_entry *entry; | ||
323 | struct jump_label_module_entry *module_entry; | ||
324 | int count; | ||
325 | 185 | ||
326 | /* if the module doesn't have jump label entries, just return */ | 186 | while (mod) { |
327 | if (!mod->num_jump_entries) | 187 | struct module *m = mod->mod; |
328 | return 0; | ||
329 | 188 | ||
330 | sort_jump_label_entries(mod->jump_entries, | 189 | __jump_label_update(key, mod->entries, |
331 | mod->jump_entries + mod->num_jump_entries); | 190 | m->jump_entries + m->num_jump_entries, |
332 | iter = mod->jump_entries; | 191 | enable); |
333 | while (iter < mod->jump_entries + mod->num_jump_entries) { | 192 | mod = mod->next; |
334 | entry = get_jump_label_entry(iter->key); | ||
335 | iter_begin = iter; | ||
336 | count = 0; | ||
337 | while ((iter < mod->jump_entries + mod->num_jump_entries) && | ||
338 | (iter->key == iter_begin->key)) { | ||
339 | iter++; | ||
340 | count++; | ||
341 | } | ||
342 | if (!entry) { | ||
343 | entry = add_jump_label_entry(iter_begin->key, 0, NULL); | ||
344 | if (IS_ERR(entry)) | ||
345 | return PTR_ERR(entry); | ||
346 | } | ||
347 | module_entry = add_jump_label_module_entry(entry, iter_begin, | ||
348 | count, mod); | ||
349 | if (IS_ERR(module_entry)) | ||
350 | return PTR_ERR(module_entry); | ||
351 | } | 193 | } |
352 | return 0; | ||
353 | } | 194 | } |
354 | 195 | ||
355 | static void remove_jump_label_module(struct module *mod) | 196 | /*** |
197 | * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop() | ||
198 | * @mod: module to patch | ||
199 | * | ||
200 | * Allow for run-time selection of the optimal nops. Before the module | ||
201 | * loads patch these with arch_get_jump_label_nop(), which is specified by | ||
202 | * the arch specific jump label code. | ||
203 | */ | ||
204 | void jump_label_apply_nops(struct module *mod) | ||
356 | { | 205 | { |
357 | struct hlist_head *head; | 206 | struct jump_entry *iter_start = mod->jump_entries; |
358 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | 207 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
359 | struct jump_label_entry *e; | 208 | struct jump_entry *iter; |
360 | struct jump_label_module_entry *e_module; | ||
361 | int i; | ||
362 | 209 | ||
363 | /* if the module doesn't have jump label entries, just return */ | 210 | /* if the module doesn't have jump label entries, just return */ |
364 | if (!mod->num_jump_entries) | 211 | if (iter_start == iter_stop) |
365 | return; | 212 | return; |
366 | 213 | ||
367 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | 214 | for (iter = iter_start; iter < iter_stop; iter++) |
368 | head = &jump_label_table[i]; | 215 | arch_jump_label_text_poke_early(iter->code); |
369 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | 216 | } |
370 | hlist_for_each_entry_safe(e_module, module_node, | 217 | |
371 | module_node_next, | 218 | static int jump_label_add_module(struct module *mod) |
372 | &(e->modules), hlist) { | 219 | { |
373 | if (e_module->mod == mod) { | 220 | struct jump_entry *iter_start = mod->jump_entries; |
374 | hlist_del(&e_module->hlist); | 221 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
375 | kfree(e_module); | 222 | struct jump_entry *iter; |
376 | } | 223 | struct jump_label_key *key = NULL; |
377 | } | 224 | struct jump_label_mod *jlm; |
378 | if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { | 225 | |
379 | hlist_del(&e->hlist); | 226 | /* if the module doesn't have jump label entries, just return */ |
380 | kfree(e); | 227 | if (iter_start == iter_stop) |
381 | } | 228 | return 0; |
229 | |||
230 | jump_label_sort_entries(iter_start, iter_stop); | ||
231 | |||
232 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
233 | if (iter->key == (jump_label_t)(unsigned long)key) | ||
234 | continue; | ||
235 | |||
236 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
237 | |||
238 | if (__module_address(iter->key) == mod) { | ||
239 | atomic_set(&key->enabled, 0); | ||
240 | key->entries = iter; | ||
241 | key->next = NULL; | ||
242 | continue; | ||
382 | } | 243 | } |
244 | |||
245 | jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); | ||
246 | if (!jlm) | ||
247 | return -ENOMEM; | ||
248 | |||
249 | jlm->mod = mod; | ||
250 | jlm->entries = iter; | ||
251 | jlm->next = key->next; | ||
252 | key->next = jlm; | ||
253 | |||
254 | if (jump_label_enabled(key)) | ||
255 | __jump_label_update(key, iter, iter_stop, | ||
256 | JUMP_LABEL_ENABLE); | ||
383 | } | 257 | } |
258 | |||
259 | return 0; | ||
384 | } | 260 | } |
385 | 261 | ||
386 | static void remove_jump_label_module_init(struct module *mod) | 262 | static void jump_label_del_module(struct module *mod) |
387 | { | 263 | { |
388 | struct hlist_head *head; | 264 | struct jump_entry *iter_start = mod->jump_entries; |
389 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | 265 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
390 | struct jump_label_entry *e; | ||
391 | struct jump_label_module_entry *e_module; | ||
392 | struct jump_entry *iter; | 266 | struct jump_entry *iter; |
393 | int i, count; | 267 | struct jump_label_key *key = NULL; |
268 | struct jump_label_mod *jlm, **prev; | ||
394 | 269 | ||
395 | /* if the module doesn't have jump label entries, just return */ | 270 | for (iter = iter_start; iter < iter_stop; iter++) { |
396 | if (!mod->num_jump_entries) | 271 | if (iter->key == (jump_label_t)(unsigned long)key) |
397 | return; | 272 | continue; |
273 | |||
274 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
275 | |||
276 | if (__module_address(iter->key) == mod) | ||
277 | continue; | ||
278 | |||
279 | prev = &key->next; | ||
280 | jlm = key->next; | ||
281 | |||
282 | while (jlm && jlm->mod != mod) { | ||
283 | prev = &jlm->next; | ||
284 | jlm = jlm->next; | ||
285 | } | ||
398 | 286 | ||
399 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | 287 | if (jlm) { |
400 | head = &jump_label_table[i]; | 288 | *prev = jlm->next; |
401 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | 289 | kfree(jlm); |
402 | hlist_for_each_entry_safe(e_module, module_node, | ||
403 | module_node_next, | ||
404 | &(e->modules), hlist) { | ||
405 | if (e_module->mod != mod) | ||
406 | continue; | ||
407 | count = e_module->nr_entries; | ||
408 | iter = e_module->table; | ||
409 | while (count--) { | ||
410 | if (within_module_init(iter->code, mod)) | ||
411 | iter->key = 0; | ||
412 | iter++; | ||
413 | } | ||
414 | } | ||
415 | } | 290 | } |
416 | } | 291 | } |
417 | } | 292 | } |
418 | 293 | ||
294 | static void jump_label_invalidate_module_init(struct module *mod) | ||
295 | { | ||
296 | struct jump_entry *iter_start = mod->jump_entries; | ||
297 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | ||
298 | struct jump_entry *iter; | ||
299 | |||
300 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
301 | if (within_module_init(iter->code, mod)) | ||
302 | iter->code = 0; | ||
303 | } | ||
304 | } | ||
305 | |||
419 | static int | 306 | static int |
420 | jump_label_module_notify(struct notifier_block *self, unsigned long val, | 307 | jump_label_module_notify(struct notifier_block *self, unsigned long val, |
421 | void *data) | 308 | void *data) |
@@ -426,59 +313,81 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, | |||
426 | switch (val) { | 313 | switch (val) { |
427 | case MODULE_STATE_COMING: | 314 | case MODULE_STATE_COMING: |
428 | jump_label_lock(); | 315 | jump_label_lock(); |
429 | ret = add_jump_label_module(mod); | 316 | ret = jump_label_add_module(mod); |
430 | if (ret) | 317 | if (ret) |
431 | remove_jump_label_module(mod); | 318 | jump_label_del_module(mod); |
432 | jump_label_unlock(); | 319 | jump_label_unlock(); |
433 | break; | 320 | break; |
434 | case MODULE_STATE_GOING: | 321 | case MODULE_STATE_GOING: |
435 | jump_label_lock(); | 322 | jump_label_lock(); |
436 | remove_jump_label_module(mod); | 323 | jump_label_del_module(mod); |
437 | jump_label_unlock(); | 324 | jump_label_unlock(); |
438 | break; | 325 | break; |
439 | case MODULE_STATE_LIVE: | 326 | case MODULE_STATE_LIVE: |
440 | jump_label_lock(); | 327 | jump_label_lock(); |
441 | remove_jump_label_module_init(mod); | 328 | jump_label_invalidate_module_init(mod); |
442 | jump_label_unlock(); | 329 | jump_label_unlock(); |
443 | break; | 330 | break; |
444 | } | 331 | } |
445 | return ret; | ||
446 | } | ||
447 | |||
448 | /*** | ||
449 | * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() | ||
450 | * @mod: module to patch | ||
451 | * | ||
452 | * Allow for run-time selection of the optimal nops. Before the module | ||
453 | * loads patch these with arch_get_jump_label_nop(), which is specified by | ||
454 | * the arch specific jump label code. | ||
455 | */ | ||
456 | void jump_label_apply_nops(struct module *mod) | ||
457 | { | ||
458 | struct jump_entry *iter; | ||
459 | |||
460 | /* if the module doesn't have jump label entries, just return */ | ||
461 | if (!mod->num_jump_entries) | ||
462 | return; | ||
463 | 332 | ||
464 | iter = mod->jump_entries; | 333 | return notifier_from_errno(ret); |
465 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
466 | arch_jump_label_text_poke_early(iter->code); | ||
467 | iter++; | ||
468 | } | ||
469 | } | 334 | } |
470 | 335 | ||
471 | struct notifier_block jump_label_module_nb = { | 336 | struct notifier_block jump_label_module_nb = { |
472 | .notifier_call = jump_label_module_notify, | 337 | .notifier_call = jump_label_module_notify, |
473 | .priority = 0, | 338 | .priority = 1, /* higher than tracepoints */ |
474 | }; | 339 | }; |
475 | 340 | ||
476 | static __init int init_jump_label_module(void) | 341 | static __init int jump_label_init_module(void) |
477 | { | 342 | { |
478 | return register_module_notifier(&jump_label_module_nb); | 343 | return register_module_notifier(&jump_label_module_nb); |
479 | } | 344 | } |
480 | early_initcall(init_jump_label_module); | 345 | early_initcall(jump_label_init_module); |
481 | 346 | ||
482 | #endif /* CONFIG_MODULES */ | 347 | #endif /* CONFIG_MODULES */ |
483 | 348 | ||
349 | /*** | ||
350 | * jump_label_text_reserved - check if addr range is reserved | ||
351 | * @start: start text addr | ||
352 | * @end: end text addr | ||
353 | * | ||
354 | * checks if the text addr located between @start and @end | ||
355 | * overlaps with any of the jump label patch addresses. Code | ||
356 | * that wants to modify kernel text should first verify that | ||
357 | * it does not overlap with any of the jump label addresses. | ||
358 | * Caller must hold jump_label_mutex. | ||
359 | * | ||
360 | * returns 1 if there is an overlap, 0 otherwise | ||
361 | */ | ||
362 | int jump_label_text_reserved(void *start, void *end) | ||
363 | { | ||
364 | int ret = __jump_label_text_reserved(__start___jump_table, | ||
365 | __stop___jump_table, start, end); | ||
366 | |||
367 | if (ret) | ||
368 | return ret; | ||
369 | |||
370 | #ifdef CONFIG_MODULES | ||
371 | ret = __jump_label_mod_text_reserved(start, end); | ||
372 | #endif | ||
373 | return ret; | ||
374 | } | ||
375 | |||
376 | static void jump_label_update(struct jump_label_key *key, int enable) | ||
377 | { | ||
378 | struct jump_entry *entry = key->entries, *stop = __stop___jump_table; | ||
379 | |||
380 | #ifdef CONFIG_MODULES | ||
381 | struct module *mod = __module_address((jump_label_t)key); | ||
382 | |||
383 | __jump_label_mod_update(key, enable); | ||
384 | |||
385 | if (mod) | ||
386 | stop = mod->jump_entries + mod->num_jump_entries; | ||
387 | #endif | ||
388 | /* if there are no users, entry can be NULL */ | ||
389 | if (entry) | ||
390 | __jump_label_update(key, entry, stop, enable); | ||
391 | } | ||
392 | |||
484 | #endif | 393 | #endif |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 87b77de03dd3..8d814cbc8109 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -1531,13 +1531,7 @@ int kernel_kexec(void) | |||
1531 | if (error) | 1531 | if (error) |
1532 | goto Enable_cpus; | 1532 | goto Enable_cpus; |
1533 | local_irq_disable(); | 1533 | local_irq_disable(); |
1534 | /* Suspend system devices */ | 1534 | error = syscore_suspend(); |
1535 | error = sysdev_suspend(PMSG_FREEZE); | ||
1536 | if (!error) { | ||
1537 | error = syscore_suspend(); | ||
1538 | if (error) | ||
1539 | sysdev_resume(); | ||
1540 | } | ||
1541 | if (error) | 1535 | if (error) |
1542 | goto Enable_irqs; | 1536 | goto Enable_irqs; |
1543 | } else | 1537 | } else |
@@ -1553,7 +1547,6 @@ int kernel_kexec(void) | |||
1553 | #ifdef CONFIG_KEXEC_JUMP | 1547 | #ifdef CONFIG_KEXEC_JUMP |
1554 | if (kexec_image->preserve_context) { | 1548 | if (kexec_image->preserve_context) { |
1555 | syscore_resume(); | 1549 | syscore_resume(); |
1556 | sysdev_resume(); | ||
1557 | Enable_irqs: | 1550 | Enable_irqs: |
1558 | local_irq_enable(); | 1551 | local_irq_enable(); |
1559 | Enable_cpus: | 1552 | Enable_cpus: |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591c96a2..47613dfb7b28 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/completion.h> | 27 | #include <linux/completion.h> |
28 | #include <linux/cred.h> | ||
28 | #include <linux/file.h> | 29 | #include <linux/file.h> |
29 | #include <linux/fdtable.h> | 30 | #include <linux/fdtable.h> |
30 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
@@ -43,6 +44,13 @@ extern int max_threads; | |||
43 | 44 | ||
44 | static struct workqueue_struct *khelper_wq; | 45 | static struct workqueue_struct *khelper_wq; |
45 | 46 | ||
47 | #define CAP_BSET (void *)1 | ||
48 | #define CAP_PI (void *)2 | ||
49 | |||
50 | static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; | ||
51 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; | ||
52 | static DEFINE_SPINLOCK(umh_sysctl_lock); | ||
53 | |||
46 | #ifdef CONFIG_MODULES | 54 | #ifdef CONFIG_MODULES |
47 | 55 | ||
48 | /* | 56 | /* |
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module); | |||
132 | static int ____call_usermodehelper(void *data) | 140 | static int ____call_usermodehelper(void *data) |
133 | { | 141 | { |
134 | struct subprocess_info *sub_info = data; | 142 | struct subprocess_info *sub_info = data; |
143 | struct cred *new; | ||
135 | int retval; | 144 | int retval; |
136 | 145 | ||
137 | spin_lock_irq(¤t->sighand->siglock); | 146 | spin_lock_irq(¤t->sighand->siglock); |
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data) | |||
147 | */ | 156 | */ |
148 | set_user_nice(current, 0); | 157 | set_user_nice(current, 0); |
149 | 158 | ||
159 | retval = -ENOMEM; | ||
160 | new = prepare_kernel_cred(current); | ||
161 | if (!new) | ||
162 | goto fail; | ||
163 | |||
164 | spin_lock(&umh_sysctl_lock); | ||
165 | new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); | ||
166 | new->cap_inheritable = cap_intersect(usermodehelper_inheritable, | ||
167 | new->cap_inheritable); | ||
168 | spin_unlock(&umh_sysctl_lock); | ||
169 | |||
150 | if (sub_info->init) { | 170 | if (sub_info->init) { |
151 | retval = sub_info->init(sub_info); | 171 | retval = sub_info->init(sub_info, new); |
152 | if (retval) | 172 | if (retval) { |
173 | abort_creds(new); | ||
153 | goto fail; | 174 | goto fail; |
175 | } | ||
154 | } | 176 | } |
155 | 177 | ||
178 | commit_creds(new); | ||
179 | |||
156 | retval = kernel_execve(sub_info->path, | 180 | retval = kernel_execve(sub_info->path, |
157 | (const char *const *)sub_info->argv, | 181 | (const char *const *)sub_info->argv, |
158 | (const char *const *)sub_info->envp); | 182 | (const char *const *)sub_info->envp); |
@@ -245,7 +269,6 @@ static void __call_usermodehelper(struct work_struct *work) | |||
245 | } | 269 | } |
246 | } | 270 | } |
247 | 271 | ||
248 | #ifdef CONFIG_PM_SLEEP | ||
249 | /* | 272 | /* |
250 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | 273 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY |
251 | * (used for preventing user land processes from being created after the user | 274 | * (used for preventing user land processes from being created after the user |
@@ -301,6 +324,15 @@ void usermodehelper_enable(void) | |||
301 | usermodehelper_disabled = 0; | 324 | usermodehelper_disabled = 0; |
302 | } | 325 | } |
303 | 326 | ||
327 | /** | ||
328 | * usermodehelper_is_disabled - check if new helpers are allowed to be started | ||
329 | */ | ||
330 | bool usermodehelper_is_disabled(void) | ||
331 | { | ||
332 | return usermodehelper_disabled; | ||
333 | } | ||
334 | EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); | ||
335 | |||
304 | static void helper_lock(void) | 336 | static void helper_lock(void) |
305 | { | 337 | { |
306 | atomic_inc(&running_helpers); | 338 | atomic_inc(&running_helpers); |
@@ -312,12 +344,6 @@ static void helper_unlock(void) | |||
312 | if (atomic_dec_and_test(&running_helpers)) | 344 | if (atomic_dec_and_test(&running_helpers)) |
313 | wake_up(&running_helpers_waitq); | 345 | wake_up(&running_helpers_waitq); |
314 | } | 346 | } |
315 | #else /* CONFIG_PM_SLEEP */ | ||
316 | #define usermodehelper_disabled 0 | ||
317 | |||
318 | static inline void helper_lock(void) {} | ||
319 | static inline void helper_unlock(void) {} | ||
320 | #endif /* CONFIG_PM_SLEEP */ | ||
321 | 347 | ||
322 | /** | 348 | /** |
323 | * call_usermodehelper_setup - prepare to call a usermode helper | 349 | * call_usermodehelper_setup - prepare to call a usermode helper |
@@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); | |||
364 | * context in which call_usermodehelper_exec is called. | 390 | * context in which call_usermodehelper_exec is called. |
365 | */ | 391 | */ |
366 | void call_usermodehelper_setfns(struct subprocess_info *info, | 392 | void call_usermodehelper_setfns(struct subprocess_info *info, |
367 | int (*init)(struct subprocess_info *info), | 393 | int (*init)(struct subprocess_info *info, struct cred *new), |
368 | void (*cleanup)(struct subprocess_info *info), | 394 | void (*cleanup)(struct subprocess_info *info), |
369 | void *data) | 395 | void *data) |
370 | { | 396 | { |
@@ -418,6 +444,84 @@ unlock: | |||
418 | } | 444 | } |
419 | EXPORT_SYMBOL(call_usermodehelper_exec); | 445 | EXPORT_SYMBOL(call_usermodehelper_exec); |
420 | 446 | ||
447 | static int proc_cap_handler(struct ctl_table *table, int write, | ||
448 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
449 | { | ||
450 | struct ctl_table t; | ||
451 | unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; | ||
452 | kernel_cap_t new_cap; | ||
453 | int err, i; | ||
454 | |||
455 | if (write && (!capable(CAP_SETPCAP) || | ||
456 | !capable(CAP_SYS_MODULE))) | ||
457 | return -EPERM; | ||
458 | |||
459 | /* | ||
460 | * convert from the global kernel_cap_t to the ulong array to print to | ||
461 | * userspace if this is a read. | ||
462 | */ | ||
463 | spin_lock(&umh_sysctl_lock); | ||
464 | for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { | ||
465 | if (table->data == CAP_BSET) | ||
466 | cap_array[i] = usermodehelper_bset.cap[i]; | ||
467 | else if (table->data == CAP_PI) | ||
468 | cap_array[i] = usermodehelper_inheritable.cap[i]; | ||
469 | else | ||
470 | BUG(); | ||
471 | } | ||
472 | spin_unlock(&umh_sysctl_lock); | ||
473 | |||
474 | t = *table; | ||
475 | t.data = &cap_array; | ||
476 | |||
477 | /* | ||
478 | * actually read or write an array of ulongs from userspace. Remember | ||
479 | * these are least significant 32 bits first | ||
480 | */ | ||
481 | err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); | ||
482 | if (err < 0) | ||
483 | return err; | ||
484 | |||
485 | /* | ||
486 | * convert from the sysctl array of ulongs to the kernel_cap_t | ||
487 | * internal representation | ||
488 | */ | ||
489 | for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) | ||
490 | new_cap.cap[i] = cap_array[i]; | ||
491 | |||
492 | /* | ||
493 | * Drop everything not in the new_cap (but don't add things) | ||
494 | */ | ||
495 | spin_lock(&umh_sysctl_lock); | ||
496 | if (write) { | ||
497 | if (table->data == CAP_BSET) | ||
498 | usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); | ||
499 | if (table->data == CAP_PI) | ||
500 | usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); | ||
501 | } | ||
502 | spin_unlock(&umh_sysctl_lock); | ||
503 | |||
504 | return 0; | ||
505 | } | ||
506 | |||
507 | struct ctl_table usermodehelper_table[] = { | ||
508 | { | ||
509 | .procname = "bset", | ||
510 | .data = CAP_BSET, | ||
511 | .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), | ||
512 | .mode = 0600, | ||
513 | .proc_handler = proc_cap_handler, | ||
514 | }, | ||
515 | { | ||
516 | .procname = "inheritable", | ||
517 | .data = CAP_PI, | ||
518 | .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), | ||
519 | .mode = 0600, | ||
520 | .proc_handler = proc_cap_handler, | ||
521 | }, | ||
522 | { } | ||
523 | }; | ||
524 | |||
421 | void __init usermodehelper_init(void) | 525 | void __init usermodehelper_init(void) |
422 | { | 526 | { |
423 | khelper_wq = create_singlethread_workqueue("khelper"); | 527 | khelper_wq = create_singlethread_workqueue("khelper"); |
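The kmod.c changes above apply sysctl-controlled bounding/inheritable capability masks to usermode helpers and give the init callback a second argument so it can adjust the not-yet-committed credentials. A minimal sketch of wiring such a callback up; my_helper_init, run_my_helper and the helper path are hypothetical, and the call_usermodehelper_setup()/_exec() signatures are assumed rather than taken from this diff (only the call_usermodehelper_setfns() prototype appears in the hunk above):

#include <linux/kmod.h>
#include <linux/cred.h>

/* New-style init callback: runs in the helper task after the umh masks
 * above were folded into 'new' but before commit_creds(), so tweaks made
 * here still take effect for the helper. */
static int my_helper_init(struct subprocess_info *info, struct cred *new)
{
        new->cap_inheritable = CAP_EMPTY_SET;   /* e.g. drop inheritable caps */
        return 0;
}

static int run_my_helper(void)
{
        char *argv[] = { "/sbin/myhelper", NULL };              /* hypothetical path */
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
        struct subprocess_info *info;

        info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
        if (!info)
                return -ENOMEM;
        call_usermodehelper_setfns(info, my_helper_init, NULL, NULL);
        return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}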
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0b624e791805..3b053c04dd86 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
17 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/capability.h> | ||
19 | 20 | ||
20 | #define KERNEL_ATTR_RO(_name) \ | 21 | #define KERNEL_ATTR_RO(_name) \ |
21 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | 22 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo); | |||
131 | 132 | ||
132 | #endif /* CONFIG_KEXEC */ | 133 | #endif /* CONFIG_KEXEC */ |
133 | 134 | ||
135 | /* whether file capabilities are enabled */ | ||
136 | static ssize_t fscaps_show(struct kobject *kobj, | ||
137 | struct kobj_attribute *attr, char *buf) | ||
138 | { | ||
139 | return sprintf(buf, "%d\n", file_caps_enabled); | ||
140 | } | ||
141 | KERNEL_ATTR_RO(fscaps); | ||
142 | |||
134 | /* | 143 | /* |
135 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 144 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
136 | */ | 145 | */ |
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj; | |||
158 | EXPORT_SYMBOL_GPL(kernel_kobj); | 167 | EXPORT_SYMBOL_GPL(kernel_kobj); |
159 | 168 | ||
160 | static struct attribute * kernel_attrs[] = { | 169 | static struct attribute * kernel_attrs[] = { |
170 | &fscaps_attr.attr, | ||
161 | #if defined(CONFIG_HOTPLUG) | 171 | #if defined(CONFIG_HOTPLUG) |
162 | &uevent_seqnum_attr.attr, | 172 | &uevent_seqnum_attr.attr, |
163 | &uevent_helper_attr.attr, | 173 | &uevent_helper_attr.attr, |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 3b34d2732bce..4ba7cccb4994 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) | |||
202 | return; | 202 | return; |
203 | } | 203 | } |
204 | 204 | ||
205 | p->cpus_allowed = cpumask_of_cpu(cpu); | 205 | /* It's safe because the task is inactive. */ |
206 | p->rt.nr_cpus_allowed = 1; | 206 | do_set_cpus_allowed(p, cpumask_of(cpu)); |
207 | p->flags |= PF_THREAD_BOUND; | 207 | p->flags |= PF_THREAD_BOUND; |
208 | } | 208 | } |
209 | EXPORT_SYMBOL(kthread_bind); | 209 | EXPORT_SYMBOL(kthread_bind); |
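The kthread.c hunk above replaces the open-coded cpumask fields with do_set_cpus_allowed(); binding remains legal only while the new task has never run. A short sketch of the usual create/bind/wake pattern (my_worker and start_on_cpu are hypothetical names):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_worker(void *data)
{
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);     /* placeholder work loop */
        return 0;
}

static struct task_struct *start_on_cpu(unsigned int cpu)
{
        struct task_struct *t = kthread_create(my_worker, NULL, "my_worker/%u", cpu);

        if (!IS_ERR(t)) {
                kthread_bind(t, cpu);   /* task is still inactive, so this is safe */
                wake_up_process(t);
        }
        return t;
}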
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 53a68956f131..298c9276dfdb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) | |||
490 | usage[i] = '\0'; | 490 | usage[i] = '\0'; |
491 | } | 491 | } |
492 | 492 | ||
493 | static int __print_lock_name(struct lock_class *class) | ||
494 | { | ||
495 | char str[KSYM_NAME_LEN]; | ||
496 | const char *name; | ||
497 | |||
498 | name = class->name; | ||
499 | if (!name) | ||
500 | name = __get_key_name(class->key, str); | ||
501 | |||
502 | return printk("%s", name); | ||
503 | } | ||
504 | |||
493 | static void print_lock_name(struct lock_class *class) | 505 | static void print_lock_name(struct lock_class *class) |
494 | { | 506 | { |
495 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; | 507 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; |
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth) | |||
1053 | return 0; | 1065 | return 0; |
1054 | } | 1066 | } |
1055 | 1067 | ||
1068 | static void | ||
1069 | print_circular_lock_scenario(struct held_lock *src, | ||
1070 | struct held_lock *tgt, | ||
1071 | struct lock_list *prt) | ||
1072 | { | ||
1073 | struct lock_class *source = hlock_class(src); | ||
1074 | struct lock_class *target = hlock_class(tgt); | ||
1075 | struct lock_class *parent = prt->class; | ||
1076 | |||
1077 | /* | ||
1078 | * A direct locking problem where unsafe_class lock is taken | ||
1079 | * directly by safe_class lock, then all we need to show | ||
1080 | * is the deadlock scenario, as it is obvious that the | ||
1081 | * unsafe lock is taken under the safe lock. | ||
1082 | * | ||
1083 | * But if there is a chain instead, where the safe lock takes | ||
1084 | * an intermediate lock (middle_class) where this lock is | ||
1085 | * not the same as the safe lock, then the lock chain is | ||
1086 | * used to describe the problem. Otherwise we would need | ||
1087 | * to show a different CPU case for each link in the chain | ||
1088 | * from the safe_class lock to the unsafe_class lock. | ||
1089 | */ | ||
1090 | if (parent != source) { | ||
1091 | printk("Chain exists of:\n "); | ||
1092 | __print_lock_name(source); | ||
1093 | printk(" --> "); | ||
1094 | __print_lock_name(parent); | ||
1095 | printk(" --> "); | ||
1096 | __print_lock_name(target); | ||
1097 | printk("\n\n"); | ||
1098 | } | ||
1099 | |||
1100 | printk(" Possible unsafe locking scenario:\n\n"); | ||
1101 | printk(" CPU0 CPU1\n"); | ||
1102 | printk(" ---- ----\n"); | ||
1103 | printk(" lock("); | ||
1104 | __print_lock_name(target); | ||
1105 | printk(");\n"); | ||
1106 | printk(" lock("); | ||
1107 | __print_lock_name(parent); | ||
1108 | printk(");\n"); | ||
1109 | printk(" lock("); | ||
1110 | __print_lock_name(target); | ||
1111 | printk(");\n"); | ||
1112 | printk(" lock("); | ||
1113 | __print_lock_name(source); | ||
1114 | printk(");\n"); | ||
1115 | printk("\n *** DEADLOCK ***\n\n"); | ||
1116 | } | ||
1117 | |||
1056 | /* | 1118 | /* |
1057 | * When a circular dependency is detected, print the | 1119 | * When a circular dependency is detected, print the |
1058 | * header first: | 1120 | * header first: |
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1096 | { | 1158 | { |
1097 | struct task_struct *curr = current; | 1159 | struct task_struct *curr = current; |
1098 | struct lock_list *parent; | 1160 | struct lock_list *parent; |
1161 | struct lock_list *first_parent; | ||
1099 | int depth; | 1162 | int depth; |
1100 | 1163 | ||
1101 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1164 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1109 | print_circular_bug_header(target, depth, check_src, check_tgt); | 1172 | print_circular_bug_header(target, depth, check_src, check_tgt); |
1110 | 1173 | ||
1111 | parent = get_lock_parent(target); | 1174 | parent = get_lock_parent(target); |
1175 | first_parent = parent; | ||
1112 | 1176 | ||
1113 | while (parent) { | 1177 | while (parent) { |
1114 | print_circular_bug_entry(parent, --depth); | 1178 | print_circular_bug_entry(parent, --depth); |
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1116 | } | 1180 | } |
1117 | 1181 | ||
1118 | printk("\nother info that might help us debug this:\n\n"); | 1182 | printk("\nother info that might help us debug this:\n\n"); |
1183 | print_circular_lock_scenario(check_src, check_tgt, | ||
1184 | first_parent); | ||
1185 | |||
1119 | lockdep_print_held_locks(curr); | 1186 | lockdep_print_held_locks(curr); |
1120 | 1187 | ||
1121 | printk("\nstack backtrace:\n"); | 1188 | printk("\nstack backtrace:\n"); |
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
1314 | printk("\n"); | 1381 | printk("\n"); |
1315 | 1382 | ||
1316 | if (depth == 0 && (entry != root)) { | 1383 | if (depth == 0 && (entry != root)) { |
1317 | printk("lockdep:%s bad BFS generated tree\n", __func__); | 1384 | printk("lockdep:%s bad path found in chain graph\n", __func__); |
1318 | break; | 1385 | break; |
1319 | } | 1386 | } |
1320 | 1387 | ||
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
1325 | return; | 1392 | return; |
1326 | } | 1393 | } |
1327 | 1394 | ||
1395 | static void | ||
1396 | print_irq_lock_scenario(struct lock_list *safe_entry, | ||
1397 | struct lock_list *unsafe_entry, | ||
1398 | struct lock_class *prev_class, | ||
1399 | struct lock_class *next_class) | ||
1400 | { | ||
1401 | struct lock_class *safe_class = safe_entry->class; | ||
1402 | struct lock_class *unsafe_class = unsafe_entry->class; | ||
1403 | struct lock_class *middle_class = prev_class; | ||
1404 | |||
1405 | if (middle_class == safe_class) | ||
1406 | middle_class = next_class; | ||
1407 | |||
1408 | /* | ||
1409 | * A direct locking problem where unsafe_class lock is taken | ||
1410 | * directly by safe_class lock, then all we need to show | ||
1411 | * is the deadlock scenario, as it is obvious that the | ||
1412 | * unsafe lock is taken under the safe lock. | ||
1413 | * | ||
1414 | * But if there is a chain instead, where the safe lock takes | ||
1415 | * an intermediate lock (middle_class) where this lock is | ||
1416 | * not the same as the safe lock, then the lock chain is | ||
1417 | * used to describe the problem. Otherwise we would need | ||
1418 | * to show a different CPU case for each link in the chain | ||
1419 | * from the safe_class lock to the unsafe_class lock. | ||
1420 | */ | ||
1421 | if (middle_class != unsafe_class) { | ||
1422 | printk("Chain exists of:\n "); | ||
1423 | __print_lock_name(safe_class); | ||
1424 | printk(" --> "); | ||
1425 | __print_lock_name(middle_class); | ||
1426 | printk(" --> "); | ||
1427 | __print_lock_name(unsafe_class); | ||
1428 | printk("\n\n"); | ||
1429 | } | ||
1430 | |||
1431 | printk(" Possible interrupt unsafe locking scenario:\n\n"); | ||
1432 | printk(" CPU0 CPU1\n"); | ||
1433 | printk(" ---- ----\n"); | ||
1434 | printk(" lock("); | ||
1435 | __print_lock_name(unsafe_class); | ||
1436 | printk(");\n"); | ||
1437 | printk(" local_irq_disable();\n"); | ||
1438 | printk(" lock("); | ||
1439 | __print_lock_name(safe_class); | ||
1440 | printk(");\n"); | ||
1441 | printk(" lock("); | ||
1442 | __print_lock_name(middle_class); | ||
1443 | printk(");\n"); | ||
1444 | printk(" <Interrupt>\n"); | ||
1445 | printk(" lock("); | ||
1446 | __print_lock_name(safe_class); | ||
1447 | printk(");\n"); | ||
1448 | printk("\n *** DEADLOCK ***\n\n"); | ||
1449 | } | ||
1450 | |||
1328 | static int | 1451 | static int |
1329 | print_bad_irq_dependency(struct task_struct *curr, | 1452 | print_bad_irq_dependency(struct task_struct *curr, |
1330 | struct lock_list *prev_root, | 1453 | struct lock_list *prev_root, |
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1376 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); | 1499 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); |
1377 | 1500 | ||
1378 | printk("\nother info that might help us debug this:\n\n"); | 1501 | printk("\nother info that might help us debug this:\n\n"); |
1502 | print_irq_lock_scenario(backwards_entry, forwards_entry, | ||
1503 | hlock_class(prev), hlock_class(next)); | ||
1504 | |||
1379 | lockdep_print_held_locks(curr); | 1505 | lockdep_print_held_locks(curr); |
1380 | 1506 | ||
1381 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); | 1507 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); |
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void) | |||
1539 | 1665 | ||
1540 | #endif | 1666 | #endif |
1541 | 1667 | ||
1668 | static void | ||
1669 | print_deadlock_scenario(struct held_lock *nxt, | ||
1670 | struct held_lock *prv) | ||
1671 | { | ||
1672 | struct lock_class *next = hlock_class(nxt); | ||
1673 | struct lock_class *prev = hlock_class(prv); | ||
1674 | |||
1675 | printk(" Possible unsafe locking scenario:\n\n"); | ||
1676 | printk(" CPU0\n"); | ||
1677 | printk(" ----\n"); | ||
1678 | printk(" lock("); | ||
1679 | __print_lock_name(prev); | ||
1680 | printk(");\n"); | ||
1681 | printk(" lock("); | ||
1682 | __print_lock_name(next); | ||
1683 | printk(");\n"); | ||
1684 | printk("\n *** DEADLOCK ***\n\n"); | ||
1685 | printk(" May be due to missing lock nesting notation\n\n"); | ||
1686 | } | ||
1687 | |||
1542 | static int | 1688 | static int |
1543 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | 1689 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, |
1544 | struct held_lock *next) | 1690 | struct held_lock *next) |
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1557 | print_lock(prev); | 1703 | print_lock(prev); |
1558 | 1704 | ||
1559 | printk("\nother info that might help us debug this:\n"); | 1705 | printk("\nother info that might help us debug this:\n"); |
1706 | print_deadlock_scenario(next, prev); | ||
1560 | lockdep_print_held_locks(curr); | 1707 | lockdep_print_held_locks(curr); |
1561 | 1708 | ||
1562 | printk("\nstack backtrace:\n"); | 1709 | printk("\nstack backtrace:\n"); |
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
1826 | struct list_head *hash_head = chainhashentry(chain_key); | 1973 | struct list_head *hash_head = chainhashentry(chain_key); |
1827 | struct lock_chain *chain; | 1974 | struct lock_chain *chain; |
1828 | struct held_lock *hlock_curr, *hlock_next; | 1975 | struct held_lock *hlock_curr, *hlock_next; |
1829 | int i, j, n, cn; | 1976 | int i, j; |
1830 | 1977 | ||
1831 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 1978 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
1832 | return 0; | 1979 | return 0; |
@@ -1886,15 +2033,9 @@ cache_hit: | |||
1886 | } | 2033 | } |
1887 | i++; | 2034 | i++; |
1888 | chain->depth = curr->lockdep_depth + 1 - i; | 2035 | chain->depth = curr->lockdep_depth + 1 - i; |
1889 | cn = nr_chain_hlocks; | 2036 | if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { |
1890 | while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { | 2037 | chain->base = nr_chain_hlocks; |
1891 | n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); | 2038 | nr_chain_hlocks += chain->depth; |
1892 | if (n == cn) | ||
1893 | break; | ||
1894 | cn = n; | ||
1895 | } | ||
1896 | if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { | ||
1897 | chain->base = cn; | ||
1898 | for (j = 0; j < chain->depth - 1; j++, i++) { | 2039 | for (j = 0; j < chain->depth - 1; j++, i++) { |
1899 | int lock_id = curr->held_locks[i].class_idx - 1; | 2040 | int lock_id = curr->held_locks[i].class_idx - 1; |
1900 | chain_hlocks[chain->base + j] = lock_id; | 2041 | chain_hlocks[chain->base + j] = lock_id; |
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr) | |||
2011 | #endif | 2152 | #endif |
2012 | } | 2153 | } |
2013 | 2154 | ||
2155 | static void | ||
2156 | print_usage_bug_scenario(struct held_lock *lock) | ||
2157 | { | ||
2158 | struct lock_class *class = hlock_class(lock); | ||
2159 | |||
2160 | printk(" Possible unsafe locking scenario:\n\n"); | ||
2161 | printk(" CPU0\n"); | ||
2162 | printk(" ----\n"); | ||
2163 | printk(" lock("); | ||
2164 | __print_lock_name(class); | ||
2165 | printk(");\n"); | ||
2166 | printk(" <Interrupt>\n"); | ||
2167 | printk(" lock("); | ||
2168 | __print_lock_name(class); | ||
2169 | printk(");\n"); | ||
2170 | printk("\n *** DEADLOCK ***\n\n"); | ||
2171 | } | ||
2172 | |||
2014 | static int | 2173 | static int |
2015 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | 2174 | print_usage_bug(struct task_struct *curr, struct held_lock *this, |
2016 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | 2175 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) |
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2039 | 2198 | ||
2040 | print_irqtrace_events(curr); | 2199 | print_irqtrace_events(curr); |
2041 | printk("\nother info that might help us debug this:\n"); | 2200 | printk("\nother info that might help us debug this:\n"); |
2201 | print_usage_bug_scenario(this); | ||
2202 | |||
2042 | lockdep_print_held_locks(curr); | 2203 | lockdep_print_held_locks(curr); |
2043 | 2204 | ||
2044 | printk("\nstack backtrace:\n"); | 2205 | printk("\nstack backtrace:\n"); |
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2073 | struct held_lock *this, int forwards, | 2234 | struct held_lock *this, int forwards, |
2074 | const char *irqclass) | 2235 | const char *irqclass) |
2075 | { | 2236 | { |
2237 | struct lock_list *entry = other; | ||
2238 | struct lock_list *middle = NULL; | ||
2239 | int depth; | ||
2240 | |||
2076 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2241 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
2077 | return 0; | 2242 | return 0; |
2078 | 2243 | ||
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2091 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); | 2256 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); |
2092 | 2257 | ||
2093 | printk("\nother info that might help us debug this:\n"); | 2258 | printk("\nother info that might help us debug this:\n"); |
2259 | |||
2260 | /* Find a middle lock (if one exists) */ | ||
2261 | depth = get_lock_depth(other); | ||
2262 | do { | ||
2263 | if (depth == 0 && (entry != root)) { | ||
2264 | printk("lockdep:%s bad path found in chain graph\n", __func__); | ||
2265 | break; | ||
2266 | } | ||
2267 | middle = entry; | ||
2268 | entry = get_lock_parent(entry); | ||
2269 | depth--; | ||
2270 | } while (entry && entry != root && (depth >= 0)); | ||
2271 | if (forwards) | ||
2272 | print_irq_lock_scenario(root, other, | ||
2273 | middle ? middle->class : root->class, other->class); | ||
2274 | else | ||
2275 | print_irq_lock_scenario(other, root, | ||
2276 | middle ? middle->class : other->class, root->class); | ||
2277 | |||
2094 | lockdep_print_held_locks(curr); | 2278 | lockdep_print_held_locks(curr); |
2095 | 2279 | ||
2096 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); | 2280 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); |
@@ -3242,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock) | |||
3242 | int ret = 0; | 3426 | int ret = 0; |
3243 | 3427 | ||
3244 | if (unlikely(current->lockdep_recursion)) | 3428 | if (unlikely(current->lockdep_recursion)) |
3245 | return ret; | 3429 | return 1; /* avoid false negative lockdep_assert_held() */ |
3246 | 3430 | ||
3247 | raw_local_irq_save(flags); | 3431 | raw_local_irq_save(flags); |
3248 | check_flags(flags); | 3432 | check_flags(flags); |
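The lockdep.c changes above add "possible scenario" printouts and make lock_is_held() report success while lockdep itself is recursing, so lockdep_assert_held() cannot fire a false warning from that path. A minimal sketch of the kind of assertion this protects (cache_lock and cache_update are hypothetical):

#include <linux/mutex.h>
#include <linux/lockdep.h>

static DEFINE_MUTEX(cache_lock);                /* hypothetical lock */

static void cache_update(int value)
{
        /* documents, and with lockdep enabled verifies, that the caller holds cache_lock */
        lockdep_assert_held(&cache_lock);
        /* ... touch data protected by cache_lock ... */
}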
diff --git a/kernel/module.c b/kernel/module.c index d5938a5c19c4..795bdc7f5c3f 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
58 | #include <linux/jump_label.h> | 58 | #include <linux/jump_label.h> |
59 | #include <linux/pfn.h> | 59 | #include <linux/pfn.h> |
60 | #include <linux/bsearch.h> | ||
60 | 61 | ||
61 | #define CREATE_TRACE_POINTS | 62 | #define CREATE_TRACE_POINTS |
62 | #include <trace/events/module.h> | 63 | #include <trace/events/module.h> |
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr, | |||
240 | struct module *owner, | 241 | struct module *owner, |
241 | bool (*fn)(const struct symsearch *syms, | 242 | bool (*fn)(const struct symsearch *syms, |
242 | struct module *owner, | 243 | struct module *owner, |
243 | unsigned int symnum, void *data), | 244 | void *data), |
244 | void *data) | 245 | void *data) |
245 | { | 246 | { |
246 | unsigned int i, j; | 247 | unsigned int j; |
247 | 248 | ||
248 | for (j = 0; j < arrsize; j++) { | 249 | for (j = 0; j < arrsize; j++) { |
249 | for (i = 0; i < arr[j].stop - arr[j].start; i++) | 250 | if (fn(&arr[j], owner, data)) |
250 | if (fn(&arr[j], owner, i, data)) | 251 | return true; |
251 | return true; | ||
252 | } | 252 | } |
253 | 253 | ||
254 | return false; | 254 | return false; |
255 | } | 255 | } |
256 | 256 | ||
257 | /* Returns true as soon as fn returns true, otherwise false. */ | 257 | /* Returns true as soon as fn returns true, otherwise false. */ |
258 | bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | 258 | bool each_symbol_section(bool (*fn)(const struct symsearch *arr, |
259 | unsigned int symnum, void *data), void *data) | 259 | struct module *owner, |
260 | void *data), | ||
261 | void *data) | ||
260 | { | 262 | { |
261 | struct module *mod; | 263 | struct module *mod; |
262 | static const struct symsearch arr[] = { | 264 | static const struct symsearch arr[] = { |
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | |||
309 | } | 311 | } |
310 | return false; | 312 | return false; |
311 | } | 313 | } |
312 | EXPORT_SYMBOL_GPL(each_symbol); | 314 | EXPORT_SYMBOL_GPL(each_symbol_section); |
313 | 315 | ||
314 | struct find_symbol_arg { | 316 | struct find_symbol_arg { |
315 | /* Input */ | 317 | /* Input */ |
@@ -323,15 +325,12 @@ struct find_symbol_arg { | |||
323 | const struct kernel_symbol *sym; | 325 | const struct kernel_symbol *sym; |
324 | }; | 326 | }; |
325 | 327 | ||
326 | static bool find_symbol_in_section(const struct symsearch *syms, | 328 | static bool check_symbol(const struct symsearch *syms, |
327 | struct module *owner, | 329 | struct module *owner, |
328 | unsigned int symnum, void *data) | 330 | unsigned int symnum, void *data) |
329 | { | 331 | { |
330 | struct find_symbol_arg *fsa = data; | 332 | struct find_symbol_arg *fsa = data; |
331 | 333 | ||
332 | if (strcmp(syms->start[symnum].name, fsa->name) != 0) | ||
333 | return false; | ||
334 | |||
335 | if (!fsa->gplok) { | 334 | if (!fsa->gplok) { |
336 | if (syms->licence == GPL_ONLY) | 335 | if (syms->licence == GPL_ONLY) |
337 | return false; | 336 | return false; |
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms, | |||
365 | return true; | 364 | return true; |
366 | } | 365 | } |
367 | 366 | ||
367 | static int cmp_name(const void *va, const void *vb) | ||
368 | { | ||
369 | const char *a; | ||
370 | const struct kernel_symbol *b; | ||
371 | a = va; b = vb; | ||
372 | return strcmp(a, b->name); | ||
373 | } | ||
374 | |||
375 | static bool find_symbol_in_section(const struct symsearch *syms, | ||
376 | struct module *owner, | ||
377 | void *data) | ||
378 | { | ||
379 | struct find_symbol_arg *fsa = data; | ||
380 | struct kernel_symbol *sym; | ||
381 | |||
382 | sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, | ||
383 | sizeof(struct kernel_symbol), cmp_name); | ||
384 | |||
385 | if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) | ||
386 | return true; | ||
387 | |||
388 | return false; | ||
389 | } | ||
390 | |||
368 | /* Find a symbol and return it, along with, (optional) crc and | 391 | /* Find a symbol and return it, along with, (optional) crc and |
369 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ | 392 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ |
370 | const struct kernel_symbol *find_symbol(const char *name, | 393 | const struct kernel_symbol *find_symbol(const char *name, |
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
379 | fsa.gplok = gplok; | 402 | fsa.gplok = gplok; |
380 | fsa.warn = warn; | 403 | fsa.warn = warn; |
381 | 404 | ||
382 | if (each_symbol(find_symbol_in_section, &fsa)) { | 405 | if (each_symbol_section(find_symbol_in_section, &fsa)) { |
383 | if (owner) | 406 | if (owner) |
384 | *owner = fsa.owner; | 407 | *owner = fsa.owner; |
385 | if (crc) | 408 | if (crc) |
@@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base, | |||
1607 | } | 1630 | } |
1608 | } | 1631 | } |
1609 | 1632 | ||
1610 | /* Setting memory back to RW+NX before releasing it */ | 1633 | static void unset_module_core_ro_nx(struct module *mod) |
1611 | void unset_section_ro_nx(struct module *mod, void *module_region) | ||
1612 | { | 1634 | { |
1613 | unsigned long total_pages; | 1635 | set_page_attributes(mod->module_core + mod->core_text_size, |
1614 | 1636 | mod->module_core + mod->core_size, | |
1615 | if (mod->module_core == module_region) { | 1637 | set_memory_x); |
1616 | /* Set core as NX+RW */ | 1638 | set_page_attributes(mod->module_core, |
1617 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); | 1639 | mod->module_core + mod->core_ro_size, |
1618 | set_memory_nx((unsigned long)mod->module_core, total_pages); | 1640 | set_memory_rw); |
1619 | set_memory_rw((unsigned long)mod->module_core, total_pages); | 1641 | } |
1620 | 1642 | ||
1621 | } else if (mod->module_init == module_region) { | 1643 | static void unset_module_init_ro_nx(struct module *mod) |
1622 | /* Set init as NX+RW */ | 1644 | { |
1623 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); | 1645 | set_page_attributes(mod->module_init + mod->init_text_size, |
1624 | set_memory_nx((unsigned long)mod->module_init, total_pages); | 1646 | mod->module_init + mod->init_size, |
1625 | set_memory_rw((unsigned long)mod->module_init, total_pages); | 1647 | set_memory_x); |
1626 | } | 1648 | set_page_attributes(mod->module_init, |
1649 | mod->module_init + mod->init_ro_size, | ||
1650 | set_memory_rw); | ||
1627 | } | 1651 | } |
1628 | 1652 | ||
1629 | /* Iterate through all modules and set each module's text as RW */ | 1653 | /* Iterate through all modules and set each module's text as RW */ |
1630 | void set_all_modules_text_rw() | 1654 | void set_all_modules_text_rw(void) |
1631 | { | 1655 | { |
1632 | struct module *mod; | 1656 | struct module *mod; |
1633 | 1657 | ||
@@ -1648,7 +1672,7 @@ void set_all_modules_text_rw() | |||
1648 | } | 1672 | } |
1649 | 1673 | ||
1650 | /* Iterate through all modules and set each module's text as RO */ | 1674 | /* Iterate through all modules and set each module's text as RO */ |
1651 | void set_all_modules_text_ro() | 1675 | void set_all_modules_text_ro(void) |
1652 | { | 1676 | { |
1653 | struct module *mod; | 1677 | struct module *mod; |
1654 | 1678 | ||
@@ -1669,7 +1693,8 @@ void set_all_modules_text_ro() | |||
1669 | } | 1693 | } |
1670 | #else | 1694 | #else |
1671 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } | 1695 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } |
1672 | static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } | 1696 | static void unset_module_core_ro_nx(struct module *mod) { } |
1697 | static void unset_module_init_ro_nx(struct module *mod) { } | ||
1673 | #endif | 1698 | #endif |
1674 | 1699 | ||
1675 | /* Free a module, remove from lists, etc. */ | 1700 | /* Free a module, remove from lists, etc. */ |
@@ -1696,7 +1721,7 @@ static void free_module(struct module *mod) | |||
1696 | destroy_params(mod->kp, mod->num_kp); | 1721 | destroy_params(mod->kp, mod->num_kp); |
1697 | 1722 | ||
1698 | /* This may be NULL, but that's OK */ | 1723 | /* This may be NULL, but that's OK */ |
1699 | unset_section_ro_nx(mod, mod->module_init); | 1724 | unset_module_init_ro_nx(mod); |
1700 | module_free(mod, mod->module_init); | 1725 | module_free(mod, mod->module_init); |
1701 | kfree(mod->args); | 1726 | kfree(mod->args); |
1702 | percpu_modfree(mod); | 1727 | percpu_modfree(mod); |
@@ -1705,7 +1730,7 @@ static void free_module(struct module *mod) | |||
1705 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1730 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1706 | 1731 | ||
1707 | /* Finally, free the core (containing the module structure) */ | 1732 | /* Finally, free the core (containing the module structure) */ |
1708 | unset_section_ro_nx(mod, mod->module_core); | 1733 | unset_module_core_ro_nx(mod); |
1709 | module_free(mod, mod->module_core); | 1734 | module_free(mod, mod->module_core); |
1710 | 1735 | ||
1711 | #ifdef CONFIG_MPU | 1736 | #ifdef CONFIG_MPU |
@@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
2030 | const struct kernel_symbol *start, | 2055 | const struct kernel_symbol *start, |
2031 | const struct kernel_symbol *stop) | 2056 | const struct kernel_symbol *stop) |
2032 | { | 2057 | { |
2033 | const struct kernel_symbol *ks = start; | 2058 | return bsearch(name, start, stop - start, |
2034 | for (; ks < stop; ks++) | 2059 | sizeof(struct kernel_symbol), cmp_name); |
2035 | if (strcmp(ks->name, name) == 0) | ||
2036 | return ks; | ||
2037 | return NULL; | ||
2038 | } | 2060 | } |
2039 | 2061 | ||
2040 | static int is_exported(const char *name, unsigned long value, | 2062 | static int is_exported(const char *name, unsigned long value, |
@@ -2790,7 +2812,7 @@ static struct module *load_module(void __user *umod, | |||
2790 | } | 2812 | } |
2791 | 2813 | ||
2792 | /* This has to be done once we're sure module name is unique. */ | 2814 | /* This has to be done once we're sure module name is unique. */ |
2793 | if (!mod->taints) | 2815 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) |
2794 | dynamic_debug_setup(info.debug, info.num_debug); | 2816 | dynamic_debug_setup(info.debug, info.num_debug); |
2795 | 2817 | ||
2796 | /* Find duplicate symbols */ | 2818 | /* Find duplicate symbols */ |
@@ -2827,7 +2849,7 @@ static struct module *load_module(void __user *umod, | |||
2827 | module_bug_cleanup(mod); | 2849 | module_bug_cleanup(mod); |
2828 | 2850 | ||
2829 | ddebug: | 2851 | ddebug: |
2830 | if (!mod->taints) | 2852 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) |
2831 | dynamic_debug_remove(info.debug); | 2853 | dynamic_debug_remove(info.debug); |
2832 | unlock: | 2854 | unlock: |
2833 | mutex_unlock(&module_mutex); | 2855 | mutex_unlock(&module_mutex); |
@@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2931 | mod->symtab = mod->core_symtab; | 2953 | mod->symtab = mod->core_symtab; |
2932 | mod->strtab = mod->core_strtab; | 2954 | mod->strtab = mod->core_strtab; |
2933 | #endif | 2955 | #endif |
2934 | unset_section_ro_nx(mod, mod->module_init); | 2956 | unset_module_init_ro_nx(mod); |
2935 | module_free(mod, mod->module_init); | 2957 | module_free(mod, mod->module_init); |
2936 | mod->module_init = NULL; | 2958 | mod->module_init = NULL; |
2937 | mod->init_size = 0; | 2959 | mod->init_size = 0; |
2960 | mod->init_ro_size = 0; | ||
2938 | mod->init_text_size = 0; | 2961 | mod->init_text_size = 0; |
2939 | mutex_unlock(&module_mutex); | 2962 | mutex_unlock(&module_mutex); |
2940 | 2963 | ||
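The module.c changes above replace the linear exported-symbol scan with bsearch() plus cmp_name(), which presumes the symbol tables are already sorted by name at build time. A small userspace sketch of the same key-versus-element comparator pattern (struct sym and the table contents are made up):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sym { const char *name; unsigned long addr; };

/* bsearch() passes the search key as the first argument and a table
 * element as the second, just like cmp_name() in the hunk above */
static int cmp_name(const void *key, const void *elem)
{
        return strcmp(key, ((const struct sym *)elem)->name);
}

int main(void)
{
        /* must already be sorted by name for the binary search to be valid */
        struct sym tab[] = {
                { "alpha", 0x1000 }, { "gamma", 0x2000 }, { "omega", 0x3000 },
        };
        struct sym *s = bsearch("gamma", tab, sizeof(tab) / sizeof(tab[0]),
                                sizeof(tab[0]), cmp_name);

        printf("%s -> %#lx\n", s ? s->name : "?", s ? s->addr : 0UL);
        return 0;
}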
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) | |||
75 | return; | 75 | return; |
76 | 76 | ||
77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
78 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); | 78 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
80 | mutex_clear_owner(lock); | 80 | mutex_clear_owner(lock); |
81 | } | 81 | } |
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, | |||
29 | 29 | ||
30 | static inline void mutex_set_owner(struct mutex *lock) | 30 | static inline void mutex_set_owner(struct mutex *lock) |
31 | { | 31 | { |
32 | lock->owner = current_thread_info(); | 32 | lock->owner = current; |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void mutex_clear_owner(struct mutex *lock) | 35 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa98900..d607ed5dd441 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock); | |||
131 | */ | 131 | */ |
132 | static inline int __sched | 132 | static inline int __sched |
133 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | 133 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
134 | unsigned long ip) | 134 | struct lockdep_map *nest_lock, unsigned long ip) |
135 | { | 135 | { |
136 | struct task_struct *task = current; | 136 | struct task_struct *task = current; |
137 | struct mutex_waiter waiter; | 137 | struct mutex_waiter waiter; |
138 | unsigned long flags; | 138 | unsigned long flags; |
139 | 139 | ||
140 | preempt_disable(); | 140 | preempt_disable(); |
141 | mutex_acquire(&lock->dep_map, subclass, 0, ip); | 141 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
142 | 142 | ||
143 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 143 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
144 | /* | 144 | /* |
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
160 | */ | 160 | */ |
161 | 161 | ||
162 | for (;;) { | 162 | for (;;) { |
163 | struct thread_info *owner; | 163 | struct task_struct *owner; |
164 | |||
165 | /* | ||
166 | * If we own the BKL, then don't spin. The owner of | ||
167 | * the mutex might be waiting on us to release the BKL. | ||
168 | */ | ||
169 | if (unlikely(current->lock_depth >= 0)) | ||
170 | break; | ||
171 | 164 | ||
172 | /* | 165 | /* |
173 | * If there's an owner, wait for it to either | 166 | * If there's an owner, wait for it to either |
@@ -276,16 +269,25 @@ void __sched | |||
276 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) | 269 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) |
277 | { | 270 | { |
278 | might_sleep(); | 271 | might_sleep(); |
279 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); | 272 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); |
280 | } | 273 | } |
281 | 274 | ||
282 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 275 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
283 | 276 | ||
277 | void __sched | ||
278 | _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) | ||
279 | { | ||
280 | might_sleep(); | ||
281 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); | ||
282 | } | ||
283 | |||
284 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); | ||
285 | |||
284 | int __sched | 286 | int __sched |
285 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | 287 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) |
286 | { | 288 | { |
287 | might_sleep(); | 289 | might_sleep(); |
288 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); | 290 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); |
289 | } | 291 | } |
290 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | 292 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); |
291 | 293 | ||
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
294 | { | 296 | { |
295 | might_sleep(); | 297 | might_sleep(); |
296 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, | 298 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, |
297 | subclass, _RET_IP_); | 299 | subclass, NULL, _RET_IP_); |
298 | } | 300 | } |
299 | 301 | ||
300 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 302 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
400 | { | 402 | { |
401 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 403 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
402 | 404 | ||
403 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); | 405 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); |
404 | } | 406 | } |
405 | 407 | ||
406 | static noinline int __sched | 408 | static noinline int __sched |
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) | |||
408 | { | 410 | { |
409 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 411 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
410 | 412 | ||
411 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); | 413 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); |
412 | } | 414 | } |
413 | 415 | ||
414 | static noinline int __sched | 416 | static noinline int __sched |
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count) | |||
416 | { | 418 | { |
417 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 419 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
418 | 420 | ||
419 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); | 421 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); |
420 | } | 422 | } |
421 | #endif | 423 | #endif |
422 | 424 | ||
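The mutex.c hunk above threads a nest_lock annotation through __mutex_lock_common() and exports _mutex_lock_nest_lock(); the expected consumer is a mutex_lock_nest_lock(inner, outer) wrapper (assumed to land in include/linux/mutex.h in the same series) for the case where one outer lock serializes taking many mutexes of the same class. A hedged sketch with struct filter and freeze_filters() as hypothetical names:

#include <linux/mutex.h>

struct filter { struct mutex lock; };

static void freeze_filters(struct mutex *outer, struct filter **f, int n)
{
        int i;

        mutex_lock(outer);              /* 'outer' serializes every filter lock */
        for (i = 0; i < n; i++)
                /* tell lockdep each inner lock nests under 'outer', so taking
                 * many locks of one class in a row is accepted */
                mutex_lock_nest_lock(&f[i]->lock, outer);

        /* ... work on the frozen filters ... */

        for (i = n - 1; i >= 0; i--)
                mutex_unlock(&f[i]->lock);
        mutex_unlock(outer);
}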
diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
@@ -19,7 +19,7 @@ | |||
19 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_SMP |
20 | static inline void mutex_set_owner(struct mutex *lock) | 20 | static inline void mutex_set_owner(struct mutex *lock) |
21 | { | 21 | { |
22 | lock->owner = current_thread_info(); | 22 | lock->owner = current; |
23 | } | 23 | } |
24 | 24 | ||
25 | static inline void mutex_clear_owner(struct mutex *lock) | 25 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2c98ad94ba0e..000000000000 --- a/kernel/ns_cgroup.c +++ /dev/null | |||
@@ -1,118 +0,0 @@ | |||
1 | /* | ||
2 | * ns_cgroup.c - namespace cgroup subsystem | ||
3 | * | ||
4 | * Copyright 2006, 2007 IBM Corp | ||
5 | */ | ||
6 | |||
7 | #include <linux/module.h> | ||
8 | #include <linux/cgroup.h> | ||
9 | #include <linux/fs.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/nsproxy.h> | ||
13 | |||
14 | struct ns_cgroup { | ||
15 | struct cgroup_subsys_state css; | ||
16 | }; | ||
17 | |||
18 | struct cgroup_subsys ns_subsys; | ||
19 | |||
20 | static inline struct ns_cgroup *cgroup_to_ns( | ||
21 | struct cgroup *cgroup) | ||
22 | { | ||
23 | return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), | ||
24 | struct ns_cgroup, css); | ||
25 | } | ||
26 | |||
27 | int ns_cgroup_clone(struct task_struct *task, struct pid *pid) | ||
28 | { | ||
29 | char name[PROC_NUMBUF]; | ||
30 | |||
31 | snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); | ||
32 | return cgroup_clone(task, &ns_subsys, name); | ||
33 | } | ||
34 | |||
35 | /* | ||
36 | * Rules: | ||
37 | * 1. you can only enter a cgroup which is a descendant of your current | ||
38 | * cgroup | ||
39 | * 2. you can only place another process into a cgroup if | ||
40 | * a. you have CAP_SYS_ADMIN | ||
41 | * b. your cgroup is an ancestor of task's destination cgroup | ||
42 | * (hence either you are in the same cgroup as task, or in an | ||
43 | * ancestor cgroup thereof) | ||
44 | */ | ||
45 | static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, | ||
46 | struct task_struct *task, bool threadgroup) | ||
47 | { | ||
48 | if (current != task) { | ||
49 | if (!capable(CAP_SYS_ADMIN)) | ||
50 | return -EPERM; | ||
51 | |||
52 | if (!cgroup_is_descendant(new_cgroup, current)) | ||
53 | return -EPERM; | ||
54 | } | ||
55 | |||
56 | if (!cgroup_is_descendant(new_cgroup, task)) | ||
57 | return -EPERM; | ||
58 | |||
59 | if (threadgroup) { | ||
60 | struct task_struct *c; | ||
61 | rcu_read_lock(); | ||
62 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
63 | if (!cgroup_is_descendant(new_cgroup, c)) { | ||
64 | rcu_read_unlock(); | ||
65 | return -EPERM; | ||
66 | } | ||
67 | } | ||
68 | rcu_read_unlock(); | ||
69 | } | ||
70 | |||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Rules: you can only create a cgroup if | ||
76 | * 1. you are capable(CAP_SYS_ADMIN) | ||
77 | * 2. the target cgroup is a descendant of your own cgroup | ||
78 | */ | ||
79 | static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, | ||
80 | struct cgroup *cgroup) | ||
81 | { | ||
82 | struct ns_cgroup *ns_cgroup; | ||
83 | |||
84 | if (!capable(CAP_SYS_ADMIN)) | ||
85 | return ERR_PTR(-EPERM); | ||
86 | if (!cgroup_is_descendant(cgroup, current)) | ||
87 | return ERR_PTR(-EPERM); | ||
88 | if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { | ||
89 | printk("ns_cgroup can't be created with parent " | ||
90 | "'clone_children' set.\n"); | ||
91 | return ERR_PTR(-EINVAL); | ||
92 | } | ||
93 | |||
94 | printk_once("ns_cgroup deprecated: consider using the " | ||
95 | "'clone_children' flag without the ns_cgroup.\n"); | ||
96 | |||
97 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); | ||
98 | if (!ns_cgroup) | ||
99 | return ERR_PTR(-ENOMEM); | ||
100 | return &ns_cgroup->css; | ||
101 | } | ||
102 | |||
103 | static void ns_destroy(struct cgroup_subsys *ss, | ||
104 | struct cgroup *cgroup) | ||
105 | { | ||
106 | struct ns_cgroup *ns_cgroup; | ||
107 | |||
108 | ns_cgroup = cgroup_to_ns(cgroup); | ||
109 | kfree(ns_cgroup); | ||
110 | } | ||
111 | |||
112 | struct cgroup_subsys ns_subsys = { | ||
113 | .name = "ns", | ||
114 | .can_attach = ns_can_attach, | ||
115 | .create = ns_create, | ||
116 | .destroy = ns_destroy, | ||
117 | .subsys_id = ns_subsys_id, | ||
118 | }; | ||
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a05d191ffdd9..d6a00f3de15d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -22,6 +22,9 @@ | |||
22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
23 | #include <net/net_namespace.h> | 23 | #include <net/net_namespace.h> |
24 | #include <linux/ipc_namespace.h> | 24 | #include <linux/ipc_namespace.h> |
25 | #include <linux/proc_fs.h> | ||
26 | #include <linux/file.h> | ||
27 | #include <linux/syscalls.h> | ||
25 | 28 | ||
26 | static struct kmem_cache *nsproxy_cachep; | 29 | static struct kmem_cache *nsproxy_cachep; |
27 | 30 | ||
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
198 | goto out; | 201 | goto out; |
199 | } | 202 | } |
200 | 203 | ||
201 | err = ns_cgroup_clone(current, task_pid(current)); | ||
202 | if (err) | ||
203 | put_nsproxy(*new_nsp); | ||
204 | |||
205 | out: | 204 | out: |
206 | return err; | 205 | return err; |
207 | } | 206 | } |
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p) | |||
233 | switch_task_namespaces(p, NULL); | 232 | switch_task_namespaces(p, NULL); |
234 | } | 233 | } |
235 | 234 | ||
235 | SYSCALL_DEFINE2(setns, int, fd, int, nstype) | ||
236 | { | ||
237 | const struct proc_ns_operations *ops; | ||
238 | struct task_struct *tsk = current; | ||
239 | struct nsproxy *new_nsproxy; | ||
240 | struct proc_inode *ei; | ||
241 | struct file *file; | ||
242 | int err; | ||
243 | |||
244 | if (!capable(CAP_SYS_ADMIN)) | ||
245 | return -EPERM; | ||
246 | |||
247 | file = proc_ns_fget(fd); | ||
248 | if (IS_ERR(file)) | ||
249 | return PTR_ERR(file); | ||
250 | |||
251 | err = -EINVAL; | ||
252 | ei = PROC_I(file->f_dentry->d_inode); | ||
253 | ops = ei->ns_ops; | ||
254 | if (nstype && (ops->type != nstype)) | ||
255 | goto out; | ||
256 | |||
257 | new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); | ||
258 | if (IS_ERR(new_nsproxy)) { | ||
259 | err = PTR_ERR(new_nsproxy); | ||
260 | goto out; | ||
261 | } | ||
262 | |||
263 | err = ops->install(new_nsproxy, ei->ns); | ||
264 | if (err) { | ||
265 | free_nsproxy(new_nsproxy); | ||
266 | goto out; | ||
267 | } | ||
268 | switch_task_namespaces(tsk, new_nsproxy); | ||
269 | out: | ||
270 | fput(file); | ||
271 | return err; | ||
272 | } | ||
273 | |||
236 | static int __init nsproxy_cache_init(void) | 274 | static int __init nsproxy_cache_init(void) |
237 | { | 275 | { |
238 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); | 276 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); |
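The nsproxy.c addition above introduces the setns(2) system call: given a file descriptor from /proc/<pid>/ns/*, a CAP_SYS_ADMIN task can switch itself into that namespace (nstype 0 accepts whatever type the fd refers to). A minimal userspace sketch; it uses the raw syscall number because a glibc setns() wrapper may not exist for a libc of this vintage:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s /proc/<pid>/ns/<type>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        if (syscall(__NR_setns, fd, 0) < 0) {   /* 0 = any namespace type */
                perror("setns");
                return 1;
        }

        /* the calling process now runs inside the target namespace */
        return 0;
}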
diff --git a/kernel/params.c b/kernel/params.c index 7ab388a48a2e..ed72e1330862 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp); | |||
297 | int param_set_bool(const char *val, const struct kernel_param *kp) | 297 | int param_set_bool(const char *val, const struct kernel_param *kp) |
298 | { | 298 | { |
299 | bool v; | 299 | bool v; |
300 | int ret; | ||
300 | 301 | ||
301 | /* No equals means "set"... */ | 302 | /* No equals means "set"... */ |
302 | if (!val) val = "1"; | 303 | if (!val) val = "1"; |
303 | 304 | ||
304 | /* One of =[yYnN01] */ | 305 | /* One of =[yYnN01] */ |
305 | switch (val[0]) { | 306 | ret = strtobool(val, &v); |
306 | case 'y': case 'Y': case '1': | 307 | if (ret) |
307 | v = true; | 308 | return ret; |
308 | break; | ||
309 | case 'n': case 'N': case '0': | ||
310 | v = false; | ||
311 | break; | ||
312 | default: | ||
313 | return -EINVAL; | ||
314 | } | ||
315 | 309 | ||
316 | if (kp->flags & KPARAM_ISBOOL) | 310 | if (kp->flags & KPARAM_ISBOOL) |
317 | *(bool *)kp->arg = v; | 311 | *(bool *)kp->arg = v; |
@@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr, | |||
821 | return sprintf(buf, "%s\n", vattr->version); | 815 | return sprintf(buf, "%s\n", vattr->version); |
822 | } | 816 | } |
823 | 817 | ||
824 | extern struct module_version_attribute __start___modver[], __stop___modver[]; | 818 | extern const struct module_version_attribute *__start___modver[]; |
819 | extern const struct module_version_attribute *__stop___modver[]; | ||
825 | 820 | ||
826 | static void __init version_sysfs_builtin(void) | 821 | static void __init version_sysfs_builtin(void) |
827 | { | 822 | { |
828 | const struct module_version_attribute *vattr; | 823 | const struct module_version_attribute **p; |
829 | struct module_kobject *mk; | 824 | struct module_kobject *mk; |
830 | int err; | 825 | int err; |
831 | 826 | ||
832 | for (vattr = __start___modver; vattr < __stop___modver; vattr++) { | 827 | for (p = __start___modver; p < __stop___modver; p++) { |
828 | const struct module_version_attribute *vattr = *p; | ||
829 | |||
833 | mk = locate_module_kobject(vattr->module_name); | 830 | mk = locate_module_kobject(vattr->module_name); |
834 | if (mk) { | 831 | if (mk) { |
835 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); | 832 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); |
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 0da058bff8eb..6824ca7d4d0c 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/string.h> | 40 | #include <linux/string.h> |
41 | #include <linux/platform_device.h> | 41 | #include <linux/platform_device.h> |
42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
43 | #include <linux/kernel.h> | ||
43 | 44 | ||
44 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
45 | 46 | ||
@@ -53,11 +54,17 @@ enum pm_qos_type { | |||
53 | PM_QOS_MIN /* return the smallest value */ | 54 | PM_QOS_MIN /* return the smallest value */ |
54 | }; | 55 | }; |
55 | 56 | ||
57 | /* | ||
58 | * Note: The lockless read path depends on the CPU accessing | ||
59 | * target_value atomically. Atomic access is only guaranteed on all CPU | ||
60 | * types linux supports for 32 bit quantities | ||
61 | */ | ||
56 | struct pm_qos_object { | 62 | struct pm_qos_object { |
57 | struct plist_head requests; | 63 | struct plist_head requests; |
58 | struct blocking_notifier_head *notifiers; | 64 | struct blocking_notifier_head *notifiers; |
59 | struct miscdevice pm_qos_power_miscdev; | 65 | struct miscdevice pm_qos_power_miscdev; |
60 | char *name; | 66 | char *name; |
67 | s32 target_value; /* Do not change to 64 bit */ | ||
61 | s32 default_value; | 68 | s32 default_value; |
62 | enum pm_qos_type type; | 69 | enum pm_qos_type type; |
63 | }; | 70 | }; |
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = { | |||
70 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), | 77 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), |
71 | .notifiers = &cpu_dma_lat_notifier, | 78 | .notifiers = &cpu_dma_lat_notifier, |
72 | .name = "cpu_dma_latency", | 79 | .name = "cpu_dma_latency", |
73 | .default_value = 2000 * USEC_PER_SEC, | 80 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
81 | .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | ||
74 | .type = PM_QOS_MIN, | 82 | .type = PM_QOS_MIN, |
75 | }; | 83 | }; |
76 | 84 | ||
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = { | |||
79 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), | 87 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), |
80 | .notifiers = &network_lat_notifier, | 88 | .notifiers = &network_lat_notifier, |
81 | .name = "network_latency", | 89 | .name = "network_latency", |
82 | .default_value = 2000 * USEC_PER_SEC, | 90 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
91 | .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | ||
83 | .type = PM_QOS_MIN | 92 | .type = PM_QOS_MIN |
84 | }; | 93 | }; |
85 | 94 | ||
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = { | |||
89 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), | 98 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), |
90 | .notifiers = &network_throughput_notifier, | 99 | .notifiers = &network_throughput_notifier, |
91 | .name = "network_throughput", | 100 | .name = "network_throughput", |
92 | .default_value = 0, | 101 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
102 | .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | ||
93 | .type = PM_QOS_MAX, | 103 | .type = PM_QOS_MAX, |
94 | }; | 104 | }; |
95 | 105 | ||
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
135 | } | 145 | } |
136 | } | 146 | } |
137 | 147 | ||
148 | static inline s32 pm_qos_read_value(struct pm_qos_object *o) | ||
149 | { | ||
150 | return o->target_value; | ||
151 | } | ||
152 | |||
153 | static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) | ||
154 | { | ||
155 | o->target_value = value; | ||
156 | } | ||
157 | |||
138 | static void update_target(struct pm_qos_object *o, struct plist_node *node, | 158 | static void update_target(struct pm_qos_object *o, struct plist_node *node, |
139 | int del, int value) | 159 | int del, int value) |
140 | { | 160 | { |
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node, | |||
159 | plist_add(node, &o->requests); | 179 | plist_add(node, &o->requests); |
160 | } | 180 | } |
161 | curr_value = pm_qos_get_value(o); | 181 | curr_value = pm_qos_get_value(o); |
182 | pm_qos_set_value(o, curr_value); | ||
162 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 183 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
163 | 184 | ||
164 | if (prev_value != curr_value) | 185 | if (prev_value != curr_value) |
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor) | |||
193 | * pm_qos_request - returns current system wide qos expectation | 214 | * pm_qos_request - returns current system wide qos expectation |
194 | * @pm_qos_class: identification of which qos value is requested | 215 | * @pm_qos_class: identification of which qos value is requested |
195 | * | 216 | * |
196 | * This function returns the current target value in an atomic manner. | 217 | * This function returns the current target value. |
197 | */ | 218 | */ |
198 | int pm_qos_request(int pm_qos_class) | 219 | int pm_qos_request(int pm_qos_class) |
199 | { | 220 | { |
200 | unsigned long flags; | 221 | return pm_qos_read_value(pm_qos_array[pm_qos_class]); |
201 | int value; | ||
202 | |||
203 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
204 | value = pm_qos_get_value(pm_qos_array[pm_qos_class]); | ||
205 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
206 | |||
207 | return value; | ||
208 | } | 222 | } |
209 | EXPORT_SYMBOL_GPL(pm_qos_request); | 223 | EXPORT_SYMBOL_GPL(pm_qos_request); |
210 | 224 | ||
@@ -385,7 +399,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | |||
385 | s32 value; | 399 | s32 value; |
386 | unsigned long flags; | 400 | unsigned long flags; |
387 | struct pm_qos_object *o; | 401 | struct pm_qos_object *o; |
388 | struct pm_qos_request_list *pm_qos_req = filp->private_data;; | 402 | struct pm_qos_request_list *pm_qos_req = filp->private_data; |
389 | 403 | ||
390 | if (!pm_qos_req) | 404 | if (!pm_qos_req) |
391 | return -EINVAL; | 405 | return -EINVAL; |
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
404 | size_t count, loff_t *f_pos) | 418 | size_t count, loff_t *f_pos) |
405 | { | 419 | { |
406 | s32 value; | 420 | s32 value; |
407 | int x; | ||
408 | char ascii_value[11]; | ||
409 | struct pm_qos_request_list *pm_qos_req; | 421 | struct pm_qos_request_list *pm_qos_req; |
410 | 422 | ||
411 | if (count == sizeof(s32)) { | 423 | if (count == sizeof(s32)) { |
412 | if (copy_from_user(&value, buf, sizeof(s32))) | 424 | if (copy_from_user(&value, buf, sizeof(s32))) |
413 | return -EFAULT; | 425 | return -EFAULT; |
414 | } else if (count == 11) { /* len('0x12345678/0') */ | 426 | } else if (count <= 11) { /* ASCII perhaps? */ |
415 | if (copy_from_user(ascii_value, buf, 11)) | 427 | char ascii_value[11]; |
428 | unsigned long int ulval; | ||
429 | int ret; | ||
430 | |||
431 | if (copy_from_user(ascii_value, buf, count)) | ||
416 | return -EFAULT; | 432 | return -EFAULT; |
417 | if (strlen(ascii_value) != 10) | 433 | |
418 | return -EINVAL; | 434 | if (count > 10) { |
419 | x = sscanf(ascii_value, "%x", &value); | 435 | if (ascii_value[10] == '\n') |
420 | if (x != 1) | 436 | ascii_value[10] = '\0'; |
437 | else | ||
438 | return -EINVAL; | ||
439 | } else { | ||
440 | ascii_value[count] = '\0'; | ||
441 | } | ||
442 | ret = strict_strtoul(ascii_value, 16, &ulval); | ||
443 | if (ret) { | ||
444 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); | ||
421 | return -EINVAL; | 445 | return -EINVAL; |
422 | pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); | 446 | } |
423 | } else | 447 | value = (s32)lower_32_bits(ulval); |
448 | } else { | ||
424 | return -EINVAL; | 449 | return -EINVAL; |
450 | } | ||
425 | 451 | ||
426 | pm_qos_req = filp->private_data; | 452 | pm_qos_req = filp->private_data; |
427 | pm_qos_update_request(pm_qos_req, value); | 453 | pm_qos_update_request(pm_qos_req, value); |
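For reference, the rewritten pm_qos_power_write() accepts either a raw binary s32 (a write of exactly sizeof(s32) bytes) or an ASCII string of at most 11 bytes that is parsed as hex by strict_strtoul(); a trailing newline in the 11th byte is stripped. A hedged userspace sketch exercising both forms, assuming the conventional /dev/cpu_dma_latency node registered by the misc device above (the request stays in effect for as long as the file descriptor is open):

/* Illustrative userspace sketch, not part of the patch: request a 10 us
 * cpu_dma_latency constraint first as a raw s32, then as ASCII hex,
 * matching the two branches of pm_qos_power_write(). */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/cpu_dma_latency", O_RDWR);
	if (fd < 0)
		return 1;

	int32_t val = 10;			/* binary form: count == sizeof(s32) */
	if (write(fd, &val, sizeof(val)) != sizeof(val))
		perror("binary write");

	const char *hex = "0x0000000a\n";	/* ASCII form: hex string, 11 bytes */
	if (write(fd, hex, strlen(hex)) < 0)
		perror("ascii write");

	pause();	/* the request stays active while the fd is open */
	close(fd);
	return 0;
}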
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0791b13df7bf..58f405b581e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
1514 | return -EFAULT; | 1514 | return -EFAULT; |
1515 | 1515 | ||
1516 | restart_block->fn = posix_cpu_nsleep_restart; | 1516 | restart_block->fn = posix_cpu_nsleep_restart; |
1517 | restart_block->nanosleep.index = which_clock; | 1517 | restart_block->nanosleep.clockid = which_clock; |
1518 | restart_block->nanosleep.rmtp = rmtp; | 1518 | restart_block->nanosleep.rmtp = rmtp; |
1519 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); | 1519 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); |
1520 | } | 1520 | } |
@@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
1523 | 1523 | ||
1524 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) | 1524 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
1525 | { | 1525 | { |
1526 | clockid_t which_clock = restart_block->nanosleep.index; | 1526 | clockid_t which_clock = restart_block->nanosleep.clockid; |
1527 | struct timespec t; | 1527 | struct timespec t; |
1528 | struct itimerspec it; | 1528 | struct itimerspec it; |
1529 | int error; | 1529 | int error; |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e5498d7405c3..4556182527f3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void) | |||
491 | return tmr; | 491 | return tmr; |
492 | } | 492 | } |
493 | 493 | ||
494 | static void k_itimer_rcu_free(struct rcu_head *head) | ||
495 | { | ||
496 | struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); | ||
497 | |||
498 | kmem_cache_free(posix_timers_cache, tmr); | ||
499 | } | ||
500 | |||
494 | #define IT_ID_SET 1 | 501 | #define IT_ID_SET 1 |
495 | #define IT_ID_NOT_SET 0 | 502 | #define IT_ID_NOT_SET 0 |
496 | static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | 503 | static void release_posix_timer(struct k_itimer *tmr, int it_id_set) |
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
503 | } | 510 | } |
504 | put_pid(tmr->it_pid); | 511 | put_pid(tmr->it_pid); |
505 | sigqueue_free(tmr->sigq); | 512 | sigqueue_free(tmr->sigq); |
506 | kmem_cache_free(posix_timers_cache, tmr); | 513 | call_rcu(&tmr->it.rcu, k_itimer_rcu_free); |
507 | } | 514 | } |
508 | 515 | ||
509 | static struct k_clock *clockid_to_kclock(const clockid_t id) | 516 | static struct k_clock *clockid_to_kclock(const clockid_t id) |
@@ -631,22 +638,18 @@ out: | |||
631 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | 638 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) |
632 | { | 639 | { |
633 | struct k_itimer *timr; | 640 | struct k_itimer *timr; |
634 | /* | 641 | |
635 | * Watch out here. We do a irqsave on the idr_lock and pass the | 642 | rcu_read_lock(); |
636 | * flags part over to the timer lock. Must not let interrupts in | ||
637 | * while we are moving the lock. | ||
638 | */ | ||
639 | spin_lock_irqsave(&idr_lock, *flags); | ||
640 | timr = idr_find(&posix_timers_id, (int)timer_id); | 643 | timr = idr_find(&posix_timers_id, (int)timer_id); |
641 | if (timr) { | 644 | if (timr) { |
642 | spin_lock(&timr->it_lock); | 645 | spin_lock_irqsave(&timr->it_lock, *flags); |
643 | if (timr->it_signal == current->signal) { | 646 | if (timr->it_signal == current->signal) { |
644 | spin_unlock(&idr_lock); | 647 | rcu_read_unlock(); |
645 | return timr; | 648 | return timr; |
646 | } | 649 | } |
647 | spin_unlock(&timr->it_lock); | 650 | spin_unlock_irqrestore(&timr->it_lock, *flags); |
648 | } | 651 | } |
649 | spin_unlock_irqrestore(&idr_lock, *flags); | 652 | rcu_read_unlock(); |
650 | 653 | ||
651 | return NULL; | 654 | return NULL; |
652 | } | 655 | } |
@@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
1056 | */ | 1059 | */ |
1057 | long clock_nanosleep_restart(struct restart_block *restart_block) | 1060 | long clock_nanosleep_restart(struct restart_block *restart_block) |
1058 | { | 1061 | { |
1059 | clockid_t which_clock = restart_block->nanosleep.index; | 1062 | clockid_t which_clock = restart_block->nanosleep.clockid; |
1060 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1063 | struct k_clock *kc = clockid_to_kclock(which_clock); |
1061 | 1064 | ||
1062 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) | 1065 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) |
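__lock_timer() above now finds the timer under rcu_read_lock() rather than idr_lock, which is why release_posix_timer() has to defer kmem_cache_free() through call_rcu(): a concurrent lookup may still be dereferencing the object between idr_find() and taking it_lock. A generic kernel-style sketch of that lookup-and-free pattern, using a hypothetical 'thing' object rather than the posix-timer code itself:

#include <linux/idr.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_IDR(thing_idr);

struct thing {
	spinlock_t lock;
	void *owner;
	struct rcu_head rcu;
};

/* Lookup: protect the idr_find() result with RCU, then revalidate
 * ownership under the object's own lock before handing it out. */
static struct thing *thing_lookup(int id, void *me, unsigned long *flags)
{
	struct thing *t;

	rcu_read_lock();
	t = idr_find(&thing_idr, id);
	if (t) {
		spin_lock_irqsave(&t->lock, *flags);
		if (t->owner == me) {
			rcu_read_unlock();
			return t;
		}
		spin_unlock_irqrestore(&t->lock, *flags);
	}
	rcu_read_unlock();
	return NULL;
}

static void thing_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct thing, rcu));
}

/* Release: the idr entry is removed elsewhere; freeing waits for any
 * lookups that may still hold a pointer obtained from idr_find(). */
static void thing_release(struct thing *t)
{
	call_rcu(&t->rcu, thing_rcu_free);
}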
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6de9a8fc3417..87f4d24b55b0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -125,12 +125,6 @@ config PM_DEBUG | |||
125 | code. This is helpful when debugging and reporting PM bugs, like | 125 | code. This is helpful when debugging and reporting PM bugs, like |
126 | suspend support. | 126 | suspend support. |
127 | 127 | ||
128 | config PM_VERBOSE | ||
129 | bool "Verbose Power Management debugging" | ||
130 | depends on PM_DEBUG | ||
131 | ---help--- | ||
132 | This option enables verbose messages from the Power Management code. | ||
133 | |||
134 | config PM_ADVANCED_DEBUG | 128 | config PM_ADVANCED_DEBUG |
135 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | 129 | bool "Extra PM attributes in sysfs for low-level debugging/testing" |
136 | depends on PM_DEBUG | 130 | depends on PM_DEBUG |
@@ -229,3 +223,7 @@ config PM_OPP | |||
229 | representing individual voltage domains and provides SOC | 223 | representing individual voltage domains and provides SOC |
230 | implementations a ready to use framework to manage OPPs. | 224 | implementations a ready to use framework to manage OPPs. |
231 | For more information, read <file:Documentation/power/opp.txt> | 225 | For more information, read <file:Documentation/power/opp.txt> |
226 | |||
227 | config PM_RUNTIME_CLK | ||
228 | def_bool y | ||
229 | depends on PM_RUNTIME && HAVE_CLK | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 50aae660174d..8f7b1db1ece1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -25,7 +25,6 @@ | |||
25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
26 | #include <linux/syscore_ops.h> | 26 | #include <linux/syscore_ops.h> |
27 | #include <scsi/scsi_scan.h> | 27 | #include <scsi/scsi_scan.h> |
28 | #include <asm/suspend.h> | ||
29 | 28 | ||
30 | #include "power.h" | 29 | #include "power.h" |
31 | 30 | ||
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN; | |||
55 | static const struct platform_hibernation_ops *hibernation_ops; | 54 | static const struct platform_hibernation_ops *hibernation_ops; |
56 | 55 | ||
57 | /** | 56 | /** |
58 | * hibernation_set_ops - set the global hibernate operations | 57 | * hibernation_set_ops - Set the global hibernate operations. |
59 | * @ops: the hibernation operations to use in subsequent hibernation transitions | 58 | * @ops: Hibernation operations to use in subsequent hibernation transitions. |
60 | */ | 59 | */ |
61 | |||
62 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) | 60 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) |
63 | { | 61 | { |
64 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot | 62 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot |
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; } | |||
115 | #endif /* !CONFIG_PM_DEBUG */ | 113 | #endif /* !CONFIG_PM_DEBUG */ |
116 | 114 | ||
117 | /** | 115 | /** |
118 | * platform_begin - tell the platform driver that we're starting | 116 | * platform_begin - Call platform to start hibernation. |
119 | * hibernation | 117 | * @platform_mode: Whether or not to use the platform driver. |
120 | */ | 118 | */ |
121 | |||
122 | static int platform_begin(int platform_mode) | 119 | static int platform_begin(int platform_mode) |
123 | { | 120 | { |
124 | return (platform_mode && hibernation_ops) ? | 121 | return (platform_mode && hibernation_ops) ? |
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode) | |||
126 | } | 123 | } |
127 | 124 | ||
128 | /** | 125 | /** |
129 | * platform_end - tell the platform driver that we've entered the | 126 | * platform_end - Call platform to finish transition to the working state. |
130 | * working state | 127 | * @platform_mode: Whether or not to use the platform driver. |
131 | */ | 128 | */ |
132 | |||
133 | static void platform_end(int platform_mode) | 129 | static void platform_end(int platform_mode) |
134 | { | 130 | { |
135 | if (platform_mode && hibernation_ops) | 131 | if (platform_mode && hibernation_ops) |
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode) | |||
137 | } | 133 | } |
138 | 134 | ||
139 | /** | 135 | /** |
140 | * platform_pre_snapshot - prepare the machine for hibernation using the | 136 | * platform_pre_snapshot - Call platform to prepare the machine for hibernation. |
141 | * platform driver if so configured and return an error code if it fails | 137 | * @platform_mode: Whether or not to use the platform driver. |
138 | * | ||
139 | * Use the platform driver to prepare the system for creating a hibernate image, | ||
140 | * if so configured, and return an error code if that fails. | ||
142 | */ | 141 | */ |
143 | 142 | ||
144 | static int platform_pre_snapshot(int platform_mode) | 143 | static int platform_pre_snapshot(int platform_mode) |
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode) | |||
148 | } | 147 | } |
149 | 148 | ||
150 | /** | 149 | /** |
151 | * platform_leave - prepare the machine for switching to the normal mode | 150 | * platform_leave - Call platform to prepare a transition to the working state. |
152 | * of operation using the platform driver (called with interrupts disabled) | 151 | * @platform_mode: Whether or not to use the platform driver. |
152 | * | ||
153 | * Use the platform driver to prepare the machine for switching to the | ||
154 | * normal mode of operation. | ||
155 | * | ||
156 | * This routine is called on one CPU with interrupts disabled. | ||
153 | */ | 157 | */ |
154 | |||
155 | static void platform_leave(int platform_mode) | 158 | static void platform_leave(int platform_mode) |
156 | { | 159 | { |
157 | if (platform_mode && hibernation_ops) | 160 | if (platform_mode && hibernation_ops) |
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode) | |||
159 | } | 162 | } |
160 | 163 | ||
161 | /** | 164 | /** |
162 | * platform_finish - switch the machine to the normal mode of operation | 165 | * platform_finish - Call platform to switch the system to the working state. |
163 | * using the platform driver (must be called after platform_prepare()) | 166 | * @platform_mode: Whether or not to use the platform driver. |
167 | * | ||
168 | * Use the platform driver to switch the machine to the normal mode of | ||
169 | * operation. | ||
170 | * | ||
171 | * This routine must be called after platform_prepare(). | ||
164 | */ | 172 | */ |
165 | |||
166 | static void platform_finish(int platform_mode) | 173 | static void platform_finish(int platform_mode) |
167 | { | 174 | { |
168 | if (platform_mode && hibernation_ops) | 175 | if (platform_mode && hibernation_ops) |
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode) | |||
170 | } | 177 | } |
171 | 178 | ||
172 | /** | 179 | /** |
173 | * platform_pre_restore - prepare the platform for the restoration from a | 180 | * platform_pre_restore - Prepare for hibernate image restoration. |
174 | * hibernation image. If the restore fails after this function has been | 181 | * @platform_mode: Whether or not to use the platform driver. |
175 | * called, platform_restore_cleanup() must be called. | 182 | * |
183 | * Use the platform driver to prepare the system for resume from a hibernation | ||
184 | * image. | ||
185 | * | ||
186 | * If the restore fails after this function has been called, | ||
187 | * platform_restore_cleanup() must be called. | ||
176 | */ | 188 | */ |
177 | |||
178 | static int platform_pre_restore(int platform_mode) | 189 | static int platform_pre_restore(int platform_mode) |
179 | { | 190 | { |
180 | return (platform_mode && hibernation_ops) ? | 191 | return (platform_mode && hibernation_ops) ? |
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode) | |||
182 | } | 193 | } |
183 | 194 | ||
184 | /** | 195 | /** |
185 | * platform_restore_cleanup - switch the platform to the normal mode of | 196 | * platform_restore_cleanup - Switch to the working state after failing restore. |
186 | * operation after a failing restore. If platform_pre_restore() has been | 197 | * @platform_mode: Whether or not to use the platform driver. |
187 | * called before the failing restore, this function must be called too, | 198 | * |
188 | * regardless of the result of platform_pre_restore(). | 199 | * Use the platform driver to switch the system to the normal mode of operation |
200 | * after a failing restore. | ||
201 | * | ||
202 | * If platform_pre_restore() has been called before the failing restore, this | ||
203 | * function must be called too, regardless of the result of | ||
204 | * platform_pre_restore(). | ||
189 | */ | 205 | */ |
190 | |||
191 | static void platform_restore_cleanup(int platform_mode) | 206 | static void platform_restore_cleanup(int platform_mode) |
192 | { | 207 | { |
193 | if (platform_mode && hibernation_ops) | 208 | if (platform_mode && hibernation_ops) |
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode) | |||
195 | } | 210 | } |
196 | 211 | ||
197 | /** | 212 | /** |
198 | * platform_recover - recover the platform from a failure to suspend | 213 | * platform_recover - Recover from a failure to suspend devices. |
199 | * devices. | 214 | * @platform_mode: Whether or not to use the platform driver. |
200 | */ | 215 | */ |
201 | |||
202 | static void platform_recover(int platform_mode) | 216 | static void platform_recover(int platform_mode) |
203 | { | 217 | { |
204 | if (platform_mode && hibernation_ops && hibernation_ops->recover) | 218 | if (platform_mode && hibernation_ops && hibernation_ops->recover) |
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode) | |||
206 | } | 220 | } |
207 | 221 | ||
208 | /** | 222 | /** |
209 | * swsusp_show_speed - print the time elapsed between two events. | 223 | * swsusp_show_speed - Print time elapsed between two events during hibernation. |
210 | * @start: Starting event. | 224 | * @start: Starting event. |
211 | * @stop: Final event. | 225 | * @stop: Final event. |
212 | * @nr_pages - number of pages processed between @start and @stop | 226 | * @nr_pages: Number of memory pages processed between @start and @stop. |
213 | * @msg - introductory message to print | 227 | * @msg: Additional diagnostic message to print. |
214 | */ | 228 | */ |
215 | |||
216 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | 229 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, |
217 | unsigned nr_pages, char *msg) | 230 | unsigned nr_pages, char *msg) |
218 | { | 231 | { |
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, | |||
235 | } | 248 | } |
236 | 249 | ||
237 | /** | 250 | /** |
238 | * create_image - freeze devices that need to be frozen with interrupts | 251 | * create_image - Create a hibernation image. |
239 | * off, create the hibernation image and thaw those devices. Control | 252 | * @platform_mode: Whether or not to use the platform driver. |
240 | * reappears in this routine after a restore. | 253 | * |
254 | * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image | ||
255 | * and execute the drivers' .thaw_noirq() callbacks. | ||
256 | * | ||
257 | * Control reappears in this routine after the subsequent restore. | ||
241 | */ | 258 | */ |
242 | |||
243 | static int create_image(int platform_mode) | 259 | static int create_image(int platform_mode) |
244 | { | 260 | { |
245 | int error; | 261 | int error; |
246 | 262 | ||
247 | error = arch_prepare_suspend(); | ||
248 | if (error) | ||
249 | return error; | ||
250 | |||
251 | /* At this point, dpm_suspend_start() has been called, but *not* | ||
252 | * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. | ||
253 | * Otherwise, drivers for some devices (e.g. interrupt controllers) | ||
254 | * become desynchronized with the actual state of the hardware | ||
255 | * at resume time, and evil weirdness ensues. | ||
256 | */ | ||
257 | error = dpm_suspend_noirq(PMSG_FREEZE); | 263 | error = dpm_suspend_noirq(PMSG_FREEZE); |
258 | if (error) { | 264 | if (error) { |
259 | printk(KERN_ERR "PM: Some devices failed to power down, " | 265 | printk(KERN_ERR "PM: Some devices failed to power down, " |
@@ -272,12 +278,7 @@ static int create_image(int platform_mode) | |||
272 | 278 | ||
273 | local_irq_disable(); | 279 | local_irq_disable(); |
274 | 280 | ||
275 | error = sysdev_suspend(PMSG_FREEZE); | 281 | error = syscore_suspend(); |
276 | if (!error) { | ||
277 | error = syscore_suspend(); | ||
278 | if (error) | ||
279 | sysdev_resume(); | ||
280 | } | ||
281 | if (error) { | 282 | if (error) { |
282 | printk(KERN_ERR "PM: Some system devices failed to power down, " | 283 | printk(KERN_ERR "PM: Some system devices failed to power down, " |
283 | "aborting hibernation\n"); | 284 | "aborting hibernation\n"); |
@@ -302,10 +303,6 @@ static int create_image(int platform_mode) | |||
302 | 303 | ||
303 | Power_up: | 304 | Power_up: |
304 | syscore_resume(); | 305 | syscore_resume(); |
305 | sysdev_resume(); | ||
306 | /* NOTE: dpm_resume_noirq() is just a resume() for devices | ||
307 | * that suspended with irqs off ... no overall powerup. | ||
308 | */ | ||
309 | 306 | ||
310 | Enable_irqs: | 307 | Enable_irqs: |
311 | local_irq_enable(); | 308 | local_irq_enable(); |
@@ -323,30 +320,32 @@ static int create_image(int platform_mode) | |||
323 | } | 320 | } |
324 | 321 | ||
325 | /** | 322 | /** |
326 | * hibernation_snapshot - quiesce devices and create the hibernation | 323 | * hibernation_snapshot - Quiesce devices and create a hibernation image. |
327 | * snapshot image. | 324 | * @platform_mode: If set, use platform driver to prepare for the transition. |
328 | * @platform_mode - if set, use the platform driver, if available, to | ||
329 | * prepare the platform firmware for the power transition. | ||
330 | * | 325 | * |
331 | * Must be called with pm_mutex held | 326 | * This routine must be called with pm_mutex held. |
332 | */ | 327 | */ |
333 | |||
334 | int hibernation_snapshot(int platform_mode) | 328 | int hibernation_snapshot(int platform_mode) |
335 | { | 329 | { |
330 | pm_message_t msg = PMSG_RECOVER; | ||
336 | int error; | 331 | int error; |
337 | 332 | ||
338 | error = platform_begin(platform_mode); | 333 | error = platform_begin(platform_mode); |
339 | if (error) | 334 | if (error) |
340 | goto Close; | 335 | goto Close; |
341 | 336 | ||
337 | error = dpm_prepare(PMSG_FREEZE); | ||
338 | if (error) | ||
339 | goto Complete_devices; | ||
340 | |||
342 | /* Preallocate image memory before shutting down devices. */ | 341 | /* Preallocate image memory before shutting down devices. */ |
343 | error = hibernate_preallocate_memory(); | 342 | error = hibernate_preallocate_memory(); |
344 | if (error) | 343 | if (error) |
345 | goto Close; | 344 | goto Complete_devices; |
346 | 345 | ||
347 | suspend_console(); | 346 | suspend_console(); |
348 | pm_restrict_gfp_mask(); | 347 | pm_restrict_gfp_mask(); |
349 | error = dpm_suspend_start(PMSG_FREEZE); | 348 | error = dpm_suspend(PMSG_FREEZE); |
350 | if (error) | 349 | if (error) |
351 | goto Recover_platform; | 350 | goto Recover_platform; |
352 | 351 | ||
@@ -364,13 +363,17 @@ int hibernation_snapshot(int platform_mode) | |||
364 | if (error || !in_suspend) | 363 | if (error || !in_suspend) |
365 | swsusp_free(); | 364 | swsusp_free(); |
366 | 365 | ||
367 | dpm_resume_end(in_suspend ? | 366 | msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; |
368 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 367 | dpm_resume(msg); |
369 | 368 | ||
370 | if (error || !in_suspend) | 369 | if (error || !in_suspend) |
371 | pm_restore_gfp_mask(); | 370 | pm_restore_gfp_mask(); |
372 | 371 | ||
373 | resume_console(); | 372 | resume_console(); |
373 | |||
374 | Complete_devices: | ||
375 | dpm_complete(msg); | ||
376 | |||
374 | Close: | 377 | Close: |
375 | platform_end(platform_mode); | 378 | platform_end(platform_mode); |
376 | return error; | 379 | return error; |
@@ -381,13 +384,14 @@ int hibernation_snapshot(int platform_mode) | |||
381 | } | 384 | } |
382 | 385 | ||
383 | /** | 386 | /** |
384 | * resume_target_kernel - prepare devices that need to be suspended with | 387 | * resume_target_kernel - Restore system state from a hibernation image. |
385 | * interrupts off, restore the contents of highmem that have not been | 388 | * @platform_mode: Whether or not to use the platform driver. |
386 | * restored yet from the image and run the low level code that will restore | 389 | * |
387 | * the remaining contents of memory and switch to the just restored target | 390 | * Execute device drivers' .freeze_noirq() callbacks, restore the contents of |
388 | * kernel. | 391 | * highmem that have not been restored yet from the image and run the low-level |
392 | * code that will restore the remaining contents of memory and switch to the | ||
393 | * just restored target kernel. | ||
389 | */ | 394 | */ |
390 | |||
391 | static int resume_target_kernel(bool platform_mode) | 395 | static int resume_target_kernel(bool platform_mode) |
392 | { | 396 | { |
393 | int error; | 397 | int error; |
@@ -409,40 +413,36 @@ static int resume_target_kernel(bool platform_mode) | |||
409 | 413 | ||
410 | local_irq_disable(); | 414 | local_irq_disable(); |
411 | 415 | ||
412 | error = sysdev_suspend(PMSG_QUIESCE); | 416 | error = syscore_suspend(); |
413 | if (!error) { | ||
414 | error = syscore_suspend(); | ||
415 | if (error) | ||
416 | sysdev_resume(); | ||
417 | } | ||
418 | if (error) | 417 | if (error) |
419 | goto Enable_irqs; | 418 | goto Enable_irqs; |
420 | 419 | ||
421 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | ||
422 | save_processor_state(); | 420 | save_processor_state(); |
423 | error = restore_highmem(); | 421 | error = restore_highmem(); |
424 | if (!error) { | 422 | if (!error) { |
425 | error = swsusp_arch_resume(); | 423 | error = swsusp_arch_resume(); |
426 | /* | 424 | /* |
427 | * The code below is only ever reached in case of a failure. | 425 | * The code below is only ever reached in case of a failure. |
428 | * Otherwise execution continues at place where | 426 | * Otherwise, execution continues at the place where |
429 | * swsusp_arch_suspend() was called | 427 | * swsusp_arch_suspend() was called. |
430 | */ | 428 | */ |
431 | BUG_ON(!error); | 429 | BUG_ON(!error); |
432 | /* This call to restore_highmem() undos the previous one */ | 430 | /* |
431 | * This call to restore_highmem() reverts the changes made by | ||
432 | * the previous one. | ||
433 | */ | ||
433 | restore_highmem(); | 434 | restore_highmem(); |
434 | } | 435 | } |
435 | /* | 436 | /* |
436 | * The only reason why swsusp_arch_resume() can fail is memory being | 437 | * The only reason why swsusp_arch_resume() can fail is memory being |
437 | * very tight, so we have to free it as soon as we can to avoid | 438 | * very tight, so we have to free it as soon as we can to avoid |
438 | * subsequent failures | 439 | * subsequent failures. |
439 | */ | 440 | */ |
440 | swsusp_free(); | 441 | swsusp_free(); |
441 | restore_processor_state(); | 442 | restore_processor_state(); |
442 | touch_softlockup_watchdog(); | 443 | touch_softlockup_watchdog(); |
443 | 444 | ||
444 | syscore_resume(); | 445 | syscore_resume(); |
445 | sysdev_resume(); | ||
446 | 446 | ||
447 | Enable_irqs: | 447 | Enable_irqs: |
448 | local_irq_enable(); | 448 | local_irq_enable(); |
@@ -459,14 +459,12 @@ static int resume_target_kernel(bool platform_mode) | |||
459 | } | 459 | } |
460 | 460 | ||
461 | /** | 461 | /** |
462 | * hibernation_restore - quiesce devices and restore the hibernation | 462 | * hibernation_restore - Quiesce devices and restore from a hibernation image. |
463 | * snapshot image. If successful, control returns in hibernation_snaphot() | 463 | * @platform_mode: If set, use platform driver to prepare for the transition. |
464 | * @platform_mode - if set, use the platform driver, if available, to | ||
465 | * prepare the platform firmware for the transition. | ||
466 | * | 464 | * |
467 | * Must be called with pm_mutex held | 465 | * This routine must be called with pm_mutex held. If it is successful, control |
466 | * reappears in the restored target kernel in hibernation_snapshot(). | ||
468 | */ | 467 | */ |
469 | |||
470 | int hibernation_restore(int platform_mode) | 468 | int hibernation_restore(int platform_mode) |
471 | { | 469 | { |
472 | int error; | 470 | int error; |
@@ -486,10 +484,8 @@ int hibernation_restore(int platform_mode) | |||
486 | } | 484 | } |
487 | 485 | ||
488 | /** | 486 | /** |
489 | * hibernation_platform_enter - enter the hibernation state using the | 487 | * hibernation_platform_enter - Power off the system using the platform driver. |
490 | * platform driver (if available) | ||
491 | */ | 488 | */ |
492 | |||
493 | int hibernation_platform_enter(void) | 489 | int hibernation_platform_enter(void) |
494 | { | 490 | { |
495 | int error; | 491 | int error; |
@@ -528,7 +524,6 @@ int hibernation_platform_enter(void) | |||
528 | goto Platform_finish; | 524 | goto Platform_finish; |
529 | 525 | ||
530 | local_irq_disable(); | 526 | local_irq_disable(); |
531 | sysdev_suspend(PMSG_HIBERNATE); | ||
532 | syscore_suspend(); | 527 | syscore_suspend(); |
533 | if (pm_wakeup_pending()) { | 528 | if (pm_wakeup_pending()) { |
534 | error = -EAGAIN; | 529 | error = -EAGAIN; |
@@ -541,7 +536,6 @@ int hibernation_platform_enter(void) | |||
541 | 536 | ||
542 | Power_up: | 537 | Power_up: |
543 | syscore_resume(); | 538 | syscore_resume(); |
544 | sysdev_resume(); | ||
545 | local_irq_enable(); | 539 | local_irq_enable(); |
546 | enable_nonboot_cpus(); | 540 | enable_nonboot_cpus(); |
547 | 541 | ||
@@ -562,12 +556,12 @@ int hibernation_platform_enter(void) | |||
562 | } | 556 | } |
563 | 557 | ||
564 | /** | 558 | /** |
565 | * power_down - Shut the machine down for hibernation. | 559 | * power_down - Shut the machine down for hibernation. |
566 | * | 560 | * |
567 | * Use the platform driver, if configured so; otherwise try | 561 | * Use the platform driver, if configured, to put the system into the sleep |
568 | * to power off or reboot. | 562 | * state corresponding to hibernation, or try to power it off or reboot, |
563 | * depending on the value of hibernation_mode. | ||
569 | */ | 564 | */ |
570 | |||
571 | static void power_down(void) | 565 | static void power_down(void) |
572 | { | 566 | { |
573 | switch (hibernation_mode) { | 567 | switch (hibernation_mode) { |
@@ -604,9 +598,8 @@ static int prepare_processes(void) | |||
604 | } | 598 | } |
605 | 599 | ||
606 | /** | 600 | /** |
607 | * hibernate - The granpappy of the built-in hibernation management | 601 | * hibernate - Carry out system hibernation, including saving the image. |
608 | */ | 602 | */ |
609 | |||
610 | int hibernate(void) | 603 | int hibernate(void) |
611 | { | 604 | { |
612 | int error; | 605 | int error; |
@@ -684,17 +677,20 @@ int hibernate(void) | |||
684 | 677 | ||
685 | 678 | ||
686 | /** | 679 | /** |
687 | * software_resume - Resume from a saved image. | 680 | * software_resume - Resume from a saved hibernation image. |
681 | * | ||
682 | * This routine is called as a late initcall, when all devices have been | ||
683 | * discovered and initialized already. | ||
688 | * | 684 | * |
689 | * Called as a late_initcall (so all devices are discovered and | 685 | * The image reading code is called to see if there is a hibernation image |
690 | * initialized), we call swsusp to see if we have a saved image or not. | 686 | * available for reading. If that is the case, devices are quiesced and the |
691 | If so, we quiesce devices, the restore the saved image. We will | 687 | contents of memory are restored from the saved image. |
692 | * return above (in hibernate() ) if everything goes well. | ||
693 | * Otherwise, we fail gracefully and return to the normally | ||
694 | * scheduled program. | ||
695 | * | 688 | * |
689 | * If this is successful, control reappears in the restored target kernel in | ||
690 | * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine | ||
691 | * attempts to recover gracefully and make the kernel return to the normal mode | ||
692 | * of operation. | ||
696 | */ | 693 | */ |
697 | |||
698 | static int software_resume(void) | 694 | static int software_resume(void) |
699 | { | 695 | { |
700 | int error; | 696 | int error; |
@@ -824,21 +820,17 @@ static const char * const hibernation_modes[] = { | |||
824 | [HIBERNATION_TESTPROC] = "testproc", | 820 | [HIBERNATION_TESTPROC] = "testproc", |
825 | }; | 821 | }; |
826 | 822 | ||
827 | /** | 823 | /* |
828 | * disk - Control hibernation mode | 824 | * /sys/power/disk - Control hibernation mode. |
829 | * | ||
830 | * Suspend-to-disk can be handled in several ways. We have a few options | ||
831 | * for putting the system to sleep - using the platform driver (e.g. ACPI | ||
832 | * or other hibernation_ops), powering off the system or rebooting the | ||
833 | * system (for testing) as well as the two test modes. | ||
834 | * | 825 | * |
835 | * The system can support 'platform', and that is known a priori (and | 826 | * Hibernation can be handled in several ways. There are a few different ways |
836 | * encoded by the presence of hibernation_ops). However, the user may | 827 | * to put the system into the sleep state: using the platform driver (e.g. ACPI |
837 | * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the | 828 | * or other hibernation_ops), powering it off or rebooting it (for testing |
838 | * test modes, 'test' or 'testproc'. | 829 | * mostly), or using one of the two available test modes. |
839 | * | 830 | * |
840 | * show() will display what the mode is currently set to. | 831 | * The sysfs file /sys/power/disk provides an interface for selecting the |
841 | * store() will accept one of | 832 | * hibernation mode to use. Reading from this file causes the available modes |
833 | * to be printed. There are 5 modes that can be supported: | ||
842 | * | 834 | * |
843 | * 'platform' | 835 | * 'platform' |
844 | * 'shutdown' | 836 | * 'shutdown' |
@@ -846,8 +838,14 @@ static const char * const hibernation_modes[] = { | |||
846 | * 'test' | 838 | * 'test' |
847 | * 'testproc' | 839 | * 'testproc' |
848 | * | 840 | * |
849 | * It will only change to 'platform' if the system | 841 | * If a platform hibernation driver is in use, 'platform' will be supported |
850 | * supports it (as determined by having hibernation_ops). | 842 | * and will be used by default. Otherwise, 'shutdown' will be used by default. |
843 | * The selected option (i.e. the one corresponding to the current value of | ||
844 | * hibernation_mode) is enclosed in square brackets. | ||
845 | * | ||
846 | * To select a given hibernation mode it is necessary to write the mode's | ||
847 | * string representation (as returned by reading from /sys/power/disk) back | ||
848 | * into /sys/power/disk. | ||
851 | */ | 849 | */ |
852 | 850 | ||
853 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | 851 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, |
@@ -880,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
880 | return buf-start; | 878 | return buf-start; |
881 | } | 879 | } |
882 | 880 | ||
883 | |||
884 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | 881 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, |
885 | const char *buf, size_t n) | 882 | const char *buf, size_t n) |
886 | { | 883 | { |
@@ -982,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att | |||
982 | 979 | ||
983 | power_attr(image_size); | 980 | power_attr(image_size); |
984 | 981 | ||
982 | static ssize_t reserved_size_show(struct kobject *kobj, | ||
983 | struct kobj_attribute *attr, char *buf) | ||
984 | { | ||
985 | return sprintf(buf, "%lu\n", reserved_size); | ||
986 | } | ||
987 | |||
988 | static ssize_t reserved_size_store(struct kobject *kobj, | ||
989 | struct kobj_attribute *attr, | ||
990 | const char *buf, size_t n) | ||
991 | { | ||
992 | unsigned long size; | ||
993 | |||
994 | if (sscanf(buf, "%lu", &size) == 1) { | ||
995 | reserved_size = size; | ||
996 | return n; | ||
997 | } | ||
998 | |||
999 | return -EINVAL; | ||
1000 | } | ||
1001 | |||
1002 | power_attr(reserved_size); | ||
1003 | |||
985 | static struct attribute * g[] = { | 1004 | static struct attribute * g[] = { |
986 | &disk_attr.attr, | 1005 | &disk_attr.attr, |
987 | &resume_attr.attr, | 1006 | &resume_attr.attr, |
988 | &image_size_attr.attr, | 1007 | &image_size_attr.attr, |
1008 | &reserved_size_attr.attr, | ||
989 | NULL, | 1009 | NULL, |
990 | }; | 1010 | }; |
991 | 1011 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index de9aef8742f4..2981af4ce7cb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -337,6 +337,7 @@ static int __init pm_init(void) | |||
337 | if (error) | 337 | if (error) |
338 | return error; | 338 | return error; |
339 | hibernate_image_size_init(); | 339 | hibernate_image_size_init(); |
340 | hibernate_reserved_size_init(); | ||
340 | power_kobj = kobject_create_and_add("power", NULL); | 341 | power_kobj = kobject_create_and_add("power", NULL); |
341 | if (!power_kobj) | 342 | if (!power_kobj) |
342 | return -ENOMEM; | 343 | return -ENOMEM; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 03634be55f62..9a00a0a26280 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -15,6 +15,7 @@ struct swsusp_info { | |||
15 | 15 | ||
16 | #ifdef CONFIG_HIBERNATION | 16 | #ifdef CONFIG_HIBERNATION |
17 | /* kernel/power/snapshot.c */ | 17 | /* kernel/power/snapshot.c */ |
18 | extern void __init hibernate_reserved_size_init(void); | ||
18 | extern void __init hibernate_image_size_init(void); | 19 | extern void __init hibernate_image_size_init(void); |
19 | 20 | ||
20 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER | 21 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER |
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void); | |||
55 | 56 | ||
56 | #else /* !CONFIG_HIBERNATION */ | 57 | #else /* !CONFIG_HIBERNATION */ |
57 | 58 | ||
59 | static inline void hibernate_reserved_size_init(void) {} | ||
58 | static inline void hibernate_image_size_init(void) {} | 60 | static inline void hibernate_image_size_init(void) {} |
59 | #endif /* !CONFIG_HIBERNATION */ | 61 | #endif /* !CONFIG_HIBERNATION */ |
60 | 62 | ||
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \ | |||
72 | 74 | ||
73 | /* Preferred image size in bytes (default 500 MB) */ | 75 | /* Preferred image size in bytes (default 500 MB) */ |
74 | extern unsigned long image_size; | 76 | extern unsigned long image_size; |
77 | /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ | ||
78 | extern unsigned long reserved_size; | ||
75 | extern int in_suspend; | 79 | extern int in_suspend; |
76 | extern dev_t swsusp_resume_device; | 80 | extern dev_t swsusp_resume_device; |
77 | extern sector_t swsusp_resume_block; | 81 | extern sector_t swsusp_resume_block; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ca0aacc24874..06efa54f93d6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *); | |||
41 | static void swsusp_unset_page_forbidden(struct page *); | 41 | static void swsusp_unset_page_forbidden(struct page *); |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Number of bytes to reserve for memory allocations made by device drivers | ||
45 | * from their ->freeze() and ->freeze_noirq() callbacks so that they don't | ||
46 | * cause image creation to fail (tunable via /sys/power/reserved_size). | ||
47 | */ | ||
48 | unsigned long reserved_size; | ||
49 | |||
50 | void __init hibernate_reserved_size_init(void) | ||
51 | { | ||
52 | reserved_size = SPARE_PAGES * PAGE_SIZE; | ||
53 | } | ||
54 | |||
55 | /* | ||
44 | * Preferred image size in bytes (tunable via /sys/power/image_size). | 56 | * Preferred image size in bytes (tunable via /sys/power/image_size). |
45 | * When it is set to N, the image creating code will do its best to | 57 | * When it is set to N, swsusp will do its best to ensure the image |
46 | * ensure the image size will not exceed N bytes, but if that is | 58 | * size will not exceed N bytes, but if that is impossible, it will |
47 | * impossible, it will try to create the smallest image possible. | 59 | * try to create the smallest image possible. |
48 | */ | 60 | */ |
49 | unsigned long image_size; | 61 | unsigned long image_size; |
50 | 62 | ||
51 | void __init hibernate_image_size_init(void) | 63 | void __init hibernate_image_size_init(void) |
52 | { | 64 | { |
53 | image_size = (totalram_pages / 3) * PAGE_SIZE; | 65 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; |
54 | } | 66 | } |
55 | 67 | ||
56 | /* List of PBEs needed for restoring the pages that were allocated before | 68 | /* List of PBEs needed for restoring the pages that were allocated before |
@@ -1199,7 +1211,11 @@ static void free_unnecessary_pages(void) | |||
1199 | to_free_highmem = alloc_highmem - save; | 1211 | to_free_highmem = alloc_highmem - save; |
1200 | } else { | 1212 | } else { |
1201 | to_free_highmem = 0; | 1213 | to_free_highmem = 0; |
1202 | to_free_normal -= save - alloc_highmem; | 1214 | save -= alloc_highmem; |
1215 | if (to_free_normal > save) | ||
1216 | to_free_normal -= save; | ||
1217 | else | ||
1218 | to_free_normal = 0; | ||
1203 | } | 1219 | } |
1204 | 1220 | ||
1205 | memory_bm_position_reset(©_bm); | 1221 | memory_bm_position_reset(©_bm); |
@@ -1263,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable) | |||
1263 | * frame in use. We also need a number of page frames to be free during | 1279 | * frame in use. We also need a number of page frames to be free during |
1264 | * hibernation for allocations made while saving the image and for device | 1280 | * hibernation for allocations made while saving the image and for device |
1265 | * drivers, in case they need to allocate memory from their hibernation | 1281 | * drivers, in case they need to allocate memory from their hibernation |
1266 | * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, | 1282 | * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough |
1267 | * respectively, both of which are rough estimates). To make this happen, we | 1283 | * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through |
1268 | * compute the total number of available page frames and allocate at least | 1284 | * /sys/power/reserved_size, respectively). To make this happen, we compute the |
1285 | * total number of available page frames and allocate at least | ||
1269 | * | 1286 | * |
1270 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES | 1287 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 |
1288 | * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) | ||
1271 | * | 1289 | * |
1272 | * of them, which corresponds to the maximum size of a hibernation image. | 1290 | * of them, which corresponds to the maximum size of a hibernation image. |
1273 | * | 1291 | * |
@@ -1322,7 +1340,8 @@ int hibernate_preallocate_memory(void) | |||
1322 | count -= totalreserve_pages; | 1340 | count -= totalreserve_pages; |
1323 | 1341 | ||
1324 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1342 | /* Compute the maximum number of saveable pages to leave in memory. */ |
1325 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; | 1343 | max_size = (count - (size + PAGES_FOR_IO)) / 2 |
1344 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); | ||
1326 | /* Compute the desired number of image pages specified by image_size. */ | 1345 | /* Compute the desired number of image pages specified by image_size. */ |
1327 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); | 1346 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); |
1328 | if (size > max_size) | 1347 | if (size > max_size) |
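A worked example of the adjusted bound computed in hibernate_preallocate_memory(): the numbers below are made up for illustration (2 GB of RAM with 4 KB pages, 1024 metadata pages, and the rough PAGES_FOR_IO and 1 MB reserved_size defaults assumed here):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGES_FOR_IO	1024UL		/* rough estimate, as in the comment */
#define SPARE_PAGES	((1024UL * 1024UL) / PAGE_SIZE)	/* assumed 1 MB default */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long count = 524288;		/* usable page frames (2 GB / 4 KB) */
	unsigned long size = 1024;		/* metadata pages */
	unsigned long reserved_size = SPARE_PAGES * PAGE_SIZE;
	unsigned long max_size;

	max_size = (count - (size + PAGES_FOR_IO)) / 2
			- 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
	printf("max_size = %lu pages (~%lu MB)\n",
	       max_size, max_size * PAGE_SIZE >> 20);
	return 0;
}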
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6275970b2189..1c41ba215419 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -163,19 +163,13 @@ static int suspend_enter(suspend_state_t state) | |||
163 | arch_suspend_disable_irqs(); | 163 | arch_suspend_disable_irqs(); |
164 | BUG_ON(!irqs_disabled()); | 164 | BUG_ON(!irqs_disabled()); |
165 | 165 | ||
166 | error = sysdev_suspend(PMSG_SUSPEND); | 166 | error = syscore_suspend(); |
167 | if (!error) { | ||
168 | error = syscore_suspend(); | ||
169 | if (error) | ||
170 | sysdev_resume(); | ||
171 | } | ||
172 | if (!error) { | 167 | if (!error) { |
173 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { | 168 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { |
174 | error = suspend_ops->enter(state); | 169 | error = suspend_ops->enter(state); |
175 | events_check_enabled = false; | 170 | events_check_enabled = false; |
176 | } | 171 | } |
177 | syscore_resume(); | 172 | syscore_resume(); |
178 | sysdev_resume(); | ||
179 | } | 173 | } |
180 | 174 | ||
181 | arch_suspend_enable_irqs(); | 175 | arch_suspend_enable_irqs(); |
@@ -226,7 +220,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
226 | if (suspend_test(TEST_DEVICES)) | 220 | if (suspend_test(TEST_DEVICES)) |
227 | goto Recover_platform; | 221 | goto Recover_platform; |
228 | 222 | ||
229 | suspend_enter(state); | 223 | error = suspend_enter(state); |
230 | 224 | ||
231 | Resume_devices: | 225 | Resume_devices: |
232 | suspend_test_start(); | 226 | suspend_test_start(); |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 7d02d33be699..42ddbc6f0de6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
113 | if (error) | 113 | if (error) |
114 | pm_notifier_call_chain(PM_POST_RESTORE); | 114 | pm_notifier_call_chain(PM_POST_RESTORE); |
115 | } | 115 | } |
116 | if (error) | 116 | if (error) { |
117 | free_basic_memory_bitmaps(); | ||
117 | atomic_inc(&snapshot_device_available); | 118 | atomic_inc(&snapshot_device_available); |
119 | } | ||
118 | data->frozen = 0; | 120 | data->frozen = 0; |
119 | data->ready = 0; | 121 | data->ready = 0; |
120 | data->platform_support = 0; | 122 | data->platform_support = 0; |
diff --git a/kernel/printk.c b/kernel/printk.c index da8ca817eae3..35185392173f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/smp.h> | 31 | #include <linux/smp.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/memblock.h> | ||
34 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
35 | #include <linux/kexec.h> | 36 | #include <linux/kexec.h> |
36 | #include <linux/kdb.h> | 37 | #include <linux/kdb.h> |
@@ -167,46 +168,74 @@ void log_buf_kexec_setup(void) | |||
167 | } | 168 | } |
168 | #endif | 169 | #endif |
169 | 170 | ||
171 | /* requested log_buf_len from kernel cmdline */ | ||
172 | static unsigned long __initdata new_log_buf_len; | ||
173 | |||
174 | /* save requested log_buf_len since it's too early to process it */ | ||
170 | static int __init log_buf_len_setup(char *str) | 175 | static int __init log_buf_len_setup(char *str) |
171 | { | 176 | { |
172 | unsigned size = memparse(str, &str); | 177 | unsigned size = memparse(str, &str); |
173 | unsigned long flags; | ||
174 | 178 | ||
175 | if (size) | 179 | if (size) |
176 | size = roundup_pow_of_two(size); | 180 | size = roundup_pow_of_two(size); |
177 | if (size > log_buf_len) { | 181 | if (size > log_buf_len) |
178 | unsigned start, dest_idx, offset; | 182 | new_log_buf_len = size; |
179 | char *new_log_buf; | ||
180 | 183 | ||
181 | new_log_buf = alloc_bootmem(size); | 184 | return 0; |
182 | if (!new_log_buf) { | 185 | } |
183 | printk(KERN_WARNING "log_buf_len: allocation failed\n"); | 186 | early_param("log_buf_len", log_buf_len_setup); |
184 | goto out; | ||
185 | } | ||
186 | 187 | ||
187 | spin_lock_irqsave(&logbuf_lock, flags); | 188 | void __init setup_log_buf(int early) |
188 | log_buf_len = size; | 189 | { |
189 | log_buf = new_log_buf; | 190 | unsigned long flags; |
190 | 191 | unsigned start, dest_idx, offset; | |
191 | offset = start = min(con_start, log_start); | 192 | char *new_log_buf; |
192 | dest_idx = 0; | 193 | int free; |
193 | while (start != log_end) { | 194 | |
194 | log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; | 195 | if (!new_log_buf_len) |
195 | start++; | 196 | return; |
196 | dest_idx++; | 197 | |
197 | } | 198 | if (early) { |
198 | log_start -= offset; | 199 | unsigned long mem; |
199 | con_start -= offset; | ||
200 | log_end -= offset; | ||
201 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
202 | 200 | ||
203 | printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); | 201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); |
202 | if (mem == MEMBLOCK_ERROR) | ||
203 | return; | ||
204 | new_log_buf = __va(mem); | ||
205 | } else { | ||
206 | new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); | ||
204 | } | 207 | } |
205 | out: | ||
206 | return 1; | ||
207 | } | ||
208 | 208 | ||
209 | __setup("log_buf_len=", log_buf_len_setup); | 209 | if (unlikely(!new_log_buf)) { |
210 | pr_err("log_buf_len: %ld bytes not available\n", | ||
211 | new_log_buf_len); | ||
212 | return; | ||
213 | } | ||
214 | |||
215 | spin_lock_irqsave(&logbuf_lock, flags); | ||
216 | log_buf_len = new_log_buf_len; | ||
217 | log_buf = new_log_buf; | ||
218 | new_log_buf_len = 0; | ||
219 | free = __LOG_BUF_LEN - log_end; | ||
220 | |||
221 | offset = start = min(con_start, log_start); | ||
222 | dest_idx = 0; | ||
223 | while (start != log_end) { | ||
224 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
225 | |||
226 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
227 | start++; | ||
228 | dest_idx++; | ||
229 | } | ||
230 | log_start -= offset; | ||
231 | con_start -= offset; | ||
232 | log_end -= offset; | ||
233 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
234 | |||
235 | pr_info("log_buf_len: %d\n", log_buf_len); | ||
236 | pr_info("early log buf free: %d(%d%%)\n", | ||
237 | free, (free * 100) / __LOG_BUF_LEN); | ||
238 | } | ||
210 | 239 | ||
211 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 240 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
212 | 241 | ||
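setup_log_buf() above defers the cmdline-requested resize until an allocator is available (memblock early on, bootmem otherwise), then copies the static __LOG_BUF_LEN buffer into the new allocation under logbuf_lock and rebases log_start, con_start and log_end. A minimal userspace sketch of that copy-and-rebase step (illustrative only, without the locking):

#include <stdlib.h>

struct ring {
	char *buf;
	unsigned len;		/* power of two */
	unsigned start, end;	/* monotonically increasing indices */
};

static int ring_grow(struct ring *r, unsigned new_len)
{
	char *new_buf = malloc(new_len);
	unsigned src, dst = 0, offset;

	if (!new_buf)
		return -1;

	/* Copy the live contents; the source index wraps via the mask. */
	offset = src = r->start;
	while (src != r->end)
		new_buf[dst++] = r->buf[src++ & (r->len - 1)];

	/* Rebase, mirroring log_start/con_start/log_end in the patch.
	 * The old, smaller buffer is simply abandoned, like the kernel's
	 * static __log_buf. */
	r->start -= offset;
	r->end -= offset;
	r->buf = new_buf;
	r->len = new_len;
	return 0;
}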
diff --git a/kernel/profile.c b/kernel/profile.c index 66f841b7fbd3..961b389fe52f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -126,11 +126,9 @@ int __ref profile_init(void) | |||
126 | if (prof_buffer) | 126 | if (prof_buffer) |
127 | return 0; | 127 | return 0; |
128 | 128 | ||
129 | prof_buffer = vmalloc(buffer_bytes); | 129 | prof_buffer = vzalloc(buffer_bytes); |
130 | if (prof_buffer) { | 130 | if (prof_buffer) |
131 | memset(prof_buffer, 0, buffer_bytes); | ||
132 | return 0; | 131 | return 0; |
133 | } | ||
134 | 132 | ||
135 | free_cpumask_var(prof_cpu_mask); | 133 | free_cpumask_var(prof_cpu_mask); |
136 | return -ENOMEM; | 134 | return -ENOMEM; |
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void) | |||
305 | mutex_unlock(&profile_flip_mutex); | 303 | mutex_unlock(&profile_flip_mutex); |
306 | } | 304 | } |
307 | 305 | ||
308 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | 306 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
309 | { | 307 | { |
310 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | 308 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; |
311 | int i, j, cpu; | 309 | int i, j, cpu; |
312 | struct profile_hit *hits; | 310 | struct profile_hit *hits; |
313 | 311 | ||
314 | if (prof_on != type || !prof_buffer) | ||
315 | return; | ||
316 | pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); | 312 | pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); |
317 | i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | 313 | i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; |
318 | secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | 314 | secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; |
@@ -419,16 +415,20 @@ out_free: | |||
419 | #define profile_discard_flip_buffers() do { } while (0) | 415 | #define profile_discard_flip_buffers() do { } while (0) |
420 | #define profile_cpu_callback NULL | 416 | #define profile_cpu_callback NULL |
421 | 417 | ||
422 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | 418 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
423 | { | 419 | { |
424 | unsigned long pc; | 420 | unsigned long pc; |
425 | |||
426 | if (prof_on != type || !prof_buffer) | ||
427 | return; | ||
428 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | 421 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; |
429 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 422 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
430 | } | 423 | } |
431 | #endif /* !CONFIG_SMP */ | 424 | #endif /* !CONFIG_SMP */ |
425 | |||
426 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | ||
427 | { | ||
428 | if (prof_on != type || !prof_buffer) | ||
429 | return; | ||
430 | do_profile_hits(type, __pc, nr_hits); | ||
431 | } | ||
432 | EXPORT_SYMBOL_GPL(profile_hits); | 432 | EXPORT_SYMBOL_GPL(profile_hits); |
433 | 433 | ||
434 | void profile_tick(int type) | 434 | void profile_tick(int type) |
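The profile_hits() change is a straightforward hoist: both the SMP and !SMP variants carried the same prof_on/prof_buffer guard, which now lives once in the exported wrapper while the variant-specific bodies become static do_profile_hits(). A generic sketch of that shape with hypothetical names:

/* Hypothetical example of the guard-in-wrapper split (not kernel code). */
static int stats_on;
static unsigned long *stats_buf;

static void do_record_hit(unsigned long slot)	/* variant-specific body */
{
	stats_buf[slot]++;
}

void record_hit(unsigned long slot)		/* single exported entry point */
{
	if (!stats_on || !stats_buf)		/* shared early-return guard */
		return;
	do_record_hit(slot);
}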
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dc7ab65f3b36..2df115790cd9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -38,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) | |||
38 | child->parent = new_parent; | 38 | child->parent = new_parent; |
39 | } | 39 | } |
40 | 40 | ||
41 | /* | 41 | /** |
42 | * Turn a tracing stop into a normal stop now, since with no tracer there | 42 | * __ptrace_unlink - unlink ptracee and restore its execution state |
43 | * would be no way to wake it up with SIGCONT or SIGKILL. If there was a | 43 | * @child: ptracee to be unlinked |
44 | * signal sent that would resume the child, but didn't because it was in | ||
45 | * TASK_TRACED, resume it now. | ||
46 | * Requires that irqs be disabled. | ||
47 | */ | ||
48 | static void ptrace_untrace(struct task_struct *child) | ||
49 | { | ||
50 | spin_lock(&child->sighand->siglock); | ||
51 | if (task_is_traced(child)) { | ||
52 | /* | ||
53 | * If the group stop is completed or in progress, | ||
54 | * this thread was already counted as stopped. | ||
55 | */ | ||
56 | if (child->signal->flags & SIGNAL_STOP_STOPPED || | ||
57 | child->signal->group_stop_count) | ||
58 | __set_task_state(child, TASK_STOPPED); | ||
59 | else | ||
60 | signal_wake_up(child, 1); | ||
61 | } | ||
62 | spin_unlock(&child->sighand->siglock); | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * unptrace a task: move it back to its original parent and | ||
67 | * remove it from the ptrace list. | ||
68 | * | 44 | * |
69 | * Must be called with the tasklist lock write-held. | 45 | * Remove @child from the ptrace list, move it back to the original parent, |
46 | * and restore the execution state so that it conforms to the group stop | ||
47 | * state. | ||
48 | * | ||
49 | * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer | ||
50 | * exiting. For PTRACE_DETACH, unless the ptracee has been killed between | ||
51 | * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. | ||
52 | * If the ptracer is exiting, the ptracee can be in any state. | ||
53 | * | ||
54 | * After detach, the ptracee should be in a state which conforms to the | ||
55 | * group stop. If the group is stopped or in the process of stopping, the | ||
56 | * ptracee should be put into TASK_STOPPED; otherwise, it should be woken | ||
57 | * up from TASK_TRACED. | ||
58 | * | ||
59 | * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, | ||
60 | * it goes through TRACED -> RUNNING -> STOPPED transition which is similar | ||
61 | * to but in the opposite direction of what happens while attaching to a | ||
62 | * stopped task. However, in this direction, the intermediate RUNNING | ||
63 | * state is not hidden even from the current ptracer and if it immediately | ||
64 | * re-attaches and performs a WNOHANG wait(2), it may fail. | ||
65 | * | ||
66 | * CONTEXT: | ||
67 | * write_lock_irq(tasklist_lock) | ||
70 | */ | 68 | */ |
71 | void __ptrace_unlink(struct task_struct *child) | 69 | void __ptrace_unlink(struct task_struct *child) |
72 | { | 70 | { |
@@ -76,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child) | |||
76 | child->parent = child->real_parent; | 74 | child->parent = child->real_parent; |
77 | list_del_init(&child->ptrace_entry); | 75 | list_del_init(&child->ptrace_entry); |
78 | 76 | ||
79 | if (task_is_traced(child)) | 77 | spin_lock(&child->sighand->siglock); |
80 | ptrace_untrace(child); | 78 | |
79 | /* | ||
80 | * Reinstate GROUP_STOP_PENDING if group stop is in effect and | ||
81 | * @child isn't dead. | ||
82 | */ | ||
83 | if (!(child->flags & PF_EXITING) && | ||
84 | (child->signal->flags & SIGNAL_STOP_STOPPED || | ||
85 | child->signal->group_stop_count)) | ||
86 | child->group_stop |= GROUP_STOP_PENDING; | ||
87 | |||
88 | /* | ||
89 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick | ||
90 | * @child in the butt. Note that @resume should be used iff @child | ||
91 | * is in TASK_TRACED; otherwise, we might unduly disrupt | ||
92 | * TASK_KILLABLE sleeps. | ||
93 | */ | ||
94 | if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) | ||
95 | signal_wake_up(child, task_is_traced(child)); | ||
96 | |||
97 | spin_unlock(&child->sighand->siglock); | ||
81 | } | 98 | } |
82 | 99 | ||
83 | /* | 100 | /* |
@@ -96,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
96 | */ | 113 | */ |
97 | read_lock(&tasklist_lock); | 114 | read_lock(&tasklist_lock); |
98 | if ((child->ptrace & PT_PTRACED) && child->parent == current) { | 115 | if ((child->ptrace & PT_PTRACED) && child->parent == current) { |
99 | ret = 0; | ||
100 | /* | 116 | /* |
101 | * child->sighand can't be NULL, release_task() | 117 | * child->sighand can't be NULL, release_task() |
102 | * does ptrace_unlink() before __exit_signal(). | 118 | * does ptrace_unlink() before __exit_signal(). |
103 | */ | 119 | */ |
104 | spin_lock_irq(&child->sighand->siglock); | 120 | spin_lock_irq(&child->sighand->siglock); |
105 | if (task_is_stopped(child)) | 121 | WARN_ON_ONCE(task_is_stopped(child)); |
106 | child->state = TASK_TRACED; | 122 | if (task_is_traced(child) || kill) |
107 | else if (!task_is_traced(child) && !kill) | 123 | ret = 0; |
108 | ret = -ESRCH; | ||
109 | spin_unlock_irq(&child->sighand->siglock); | 124 | spin_unlock_irq(&child->sighand->siglock); |
110 | } | 125 | } |
111 | read_unlock(&tasklist_lock); | 126 | read_unlock(&tasklist_lock); |
@@ -169,6 +184,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
169 | 184 | ||
170 | static int ptrace_attach(struct task_struct *task) | 185 | static int ptrace_attach(struct task_struct *task) |
171 | { | 186 | { |
187 | bool wait_trap = false; | ||
172 | int retval; | 188 | int retval; |
173 | 189 | ||
174 | audit_ptrace(task); | 190 | audit_ptrace(task); |
@@ -208,12 +224,42 @@ static int ptrace_attach(struct task_struct *task) | |||
208 | __ptrace_link(task, current); | 224 | __ptrace_link(task, current); |
209 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); | 225 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); |
210 | 226 | ||
227 | spin_lock(&task->sighand->siglock); | ||
228 | |||
229 | /* | ||
230 | * If the task is already STOPPED, set GROUP_STOP_PENDING and | ||
231 | * TRAPPING, and kick it so that it transits to TRACED. TRAPPING | ||
232 | * will be cleared if the child completes the transition or any | ||
233 | * event which clears the group stop states happens. We'll wait | ||
234 | * for the transition to complete before returning from this | ||
235 | * function. | ||
236 | * | ||
237 | * This hides STOPPED -> RUNNING -> TRACED transition from the | ||
238 | * attaching thread but a different thread in the same group can | ||
239 | * still observe the transient RUNNING state. IOW, if another | ||
240 | * thread's WNOHANG wait(2) on the stopped tracee races against | ||
241 | * ATTACH, the wait(2) may fail due to the transient RUNNING. | ||
242 | * | ||
243 | * The following task_is_stopped() test is safe as both transitions | ||
244 | * in and out of STOPPED are protected by siglock. | ||
245 | */ | ||
246 | if (task_is_stopped(task)) { | ||
247 | task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; | ||
248 | signal_wake_up(task, 1); | ||
249 | wait_trap = true; | ||
250 | } | ||
251 | |||
252 | spin_unlock(&task->sighand->siglock); | ||
253 | |||
211 | retval = 0; | 254 | retval = 0; |
212 | unlock_tasklist: | 255 | unlock_tasklist: |
213 | write_unlock_irq(&tasklist_lock); | 256 | write_unlock_irq(&tasklist_lock); |
214 | unlock_creds: | 257 | unlock_creds: |
215 | mutex_unlock(&task->signal->cred_guard_mutex); | 258 | mutex_unlock(&task->signal->cred_guard_mutex); |
216 | out: | 259 | out: |
260 | if (wait_trap) | ||
261 | wait_event(current->signal->wait_chldexit, | ||
262 | !(task->group_stop & GROUP_STOP_TRAPPING)); | ||
217 | return retval; | 263 | return retval; |
218 | } | 264 | } |
219 | 265 | ||
@@ -316,8 +362,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
316 | if (child->ptrace) { | 362 | if (child->ptrace) { |
317 | child->exit_code = data; | 363 | child->exit_code = data; |
318 | dead = __ptrace_detach(current, child); | 364 | dead = __ptrace_detach(current, child); |
319 | if (!child->exit_state) | ||
320 | wake_up_state(child, TASK_TRACED | TASK_STOPPED); | ||
321 | } | 365 | } |
322 | write_unlock_irq(&tasklist_lock); | 366 | write_unlock_irq(&tasklist_lock); |
323 | 367 | ||
@@ -518,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request, | |||
518 | } | 562 | } |
519 | 563 | ||
520 | child->exit_code = data; | 564 | child->exit_code = data; |
521 | wake_up_process(child); | 565 | wake_up_state(child, __TASK_TRACED); |
522 | 566 | ||
523 | return 0; | 567 | return 0; |
524 | } | 568 | } |
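The ptrace_attach() change above adds a trap handshake for tracees that are already group-stopped: the attacher marks the task with GROUP_STOP_PENDING | GROUP_STOP_TRAPPING under siglock, wakes it so it re-enters the stop path as TASK_TRACED, and only after dropping tasklist_lock and cred_guard_mutex does it sleep until the tracee clears GROUP_STOP_TRAPPING. A condensed sketch of that flow (the _sketch name is illustrative only; permission checks, __ptrace_link() and the error paths are elided):

    static int ptrace_attach_sketch(struct task_struct *task)
    {
            bool wait_trap = false;

            /* ... checks, PT_PTRACED setup, __ptrace_link(task, current) and
             * send_sig_info(SIGSTOP, SEND_SIG_FORCED, task) as in the real code ... */

            spin_lock(&task->sighand->siglock);
            if (task_is_stopped(task)) {
                    /* Ask the stopped tracee to re-trap as TASK_TRACED. */
                    task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
                    signal_wake_up(task, 1);
                    wait_trap = true;
            }
            spin_unlock(&task->sighand->siglock);

            /* ... drop tasklist_lock and cred_guard_mutex here ... */

            /* Sleep only once every lock has been released. */
            if (wait_trap)
                    wait_event(current->signal->wait_chldexit,
                               !(task->group_stop & GROUP_STOP_TRAPPING));
            return 0;
    }

This hides the STOPPED -> RUNNING -> TRACED transition from the attaching thread itself, though, as the comment in the hunk notes, other threads in the group can still observe the transient RUNNING state.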
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f3240e987928..7784bd216b6a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | |||
142 | * Ensure that queued callbacks are all executed. | 142 | * Ensure that queued callbacks are all executed. |
143 | * If we detect that we are nested in a RCU read-side critical | 143 | * If we detect that we are nested in a RCU read-side critical |
144 | * section, we should simply fail, otherwise we would deadlock. | 144 | * section, we should simply fail, otherwise we would deadlock. |
145 | * In !PREEMPT configurations, there is no way to tell if we are | ||
146 | * in a RCU read-side critical section or not, so we never | ||
147 | * attempt any fixup and just print a warning. | ||
145 | */ | 148 | */ |
149 | #ifndef CONFIG_PREEMPT | ||
150 | WARN_ON_ONCE(1); | ||
151 | return 0; | ||
152 | #endif | ||
146 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 153 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
147 | irqs_disabled()) { | 154 | irqs_disabled()) { |
148 | WARN_ON(1); | 155 | WARN_ON_ONCE(1); |
149 | return 0; | 156 | return 0; |
150 | } | 157 | } |
151 | rcu_barrier(); | 158 | rcu_barrier(); |
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
184 | * Ensure that queued callbacks are all executed. | 191 | * Ensure that queued callbacks are all executed. |
185 | * If we detect that we are nested in a RCU read-side critical | 192 | * If we detect that we are nested in a RCU read-side critical |
186 | * section, we should simply fail, otherwise we would deadlock. | 193 | * section, we should simply fail, otherwise we would deadlock. |
194 | * In !PREEMPT configurations, there is no way to tell if we are | ||
195 | * in a RCU read-side critical section or not, so we never | ||
196 | * attempt any fixup and just print a warning. | ||
187 | */ | 197 | */ |
198 | #ifndef CONFIG_PREEMPT | ||
199 | WARN_ON_ONCE(1); | ||
200 | return 0; | ||
201 | #endif | ||
188 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 202 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
189 | irqs_disabled()) { | 203 | irqs_disabled()) { |
190 | WARN_ON(1); | 204 | WARN_ON_ONCE(1); |
191 | return 0; | 205 | return 0; |
192 | } | 206 | } |
193 | rcu_barrier(); | 207 | rcu_barrier(); |
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | |||
214 | * Ensure that queued callbacks are all executed. | 228 | * Ensure that queued callbacks are all executed. |
215 | * If we detect that we are nested in a RCU read-side critical | 229 | * If we detect that we are nested in a RCU read-side critical |
216 | * section, we should simply fail, otherwise we would deadlock. | 230 | * section, we should simply fail, otherwise we would deadlock. |
217 | * Note that the machinery to reliably determine whether | 231 | * In !PREEMPT configurations, there is no way to tell if we are |
218 | * or not we are in an RCU read-side critical section | 232 | * in a RCU read-side critical section or not, so we never |
219 | * exists only in the preemptible RCU implementations | 233 | * attempt any fixup and just print a warning. |
220 | * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why | ||
221 | * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. | ||
222 | */ | 234 | */ |
235 | #ifndef CONFIG_PREEMPT | ||
236 | WARN_ON_ONCE(1); | ||
237 | return 0; | ||
238 | #endif | ||
223 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 239 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
224 | irqs_disabled()) { | 240 | irqs_disabled()) { |
225 | WARN_ON(1); | 241 | WARN_ON_ONCE(1); |
226 | return 0; | 242 | return 0; |
227 | } | 243 | } |
228 | rcu_barrier(); | 244 | rcu_barrier(); |
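All three rcuhead debug-objects fixups in the hunks above now share one shape: on !PREEMPT kernels there is no way to tell whether the caller sits inside an RCU read-side critical section, so the fixup only warns and declines; otherwise it still refuses to run from atomic context, and only then drains the queued callbacks with rcu_barrier(). A schematic of that common shape (rcuhead_fixup_common() is a hypothetical name; the real file repeats the pattern in the init, activate and free handlers):

    static int rcuhead_fixup_common(void *addr, enum debug_obj_state state)
    {
    #ifndef CONFIG_PREEMPT
            /* Cannot tell whether we are in a read-side section: warn only. */
            WARN_ON_ONCE(1);
            return 0;
    #endif
            if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
                irqs_disabled()) {
                    /* Draining callbacks from atomic context would deadlock. */
                    WARN_ON_ONCE(1);
                    return 0;
            }
            rcu_barrier();  /* All queued callbacks have now run. */
            /* ... per-state cleanup (debug_object_init()/free()) as in rcupdate.c ... */
            return 1;
    }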
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0c343b9a46d5..7bbac7d0f5ab 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -35,15 +35,16 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/time.h> | 36 | #include <linux/time.h> |
37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
38 | #include <linux/prefetch.h> | ||
38 | 39 | ||
39 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ | 40 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ |
40 | static struct task_struct *rcu_kthread_task; | 41 | static struct task_struct *rcu_kthread_task; |
41 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | 42 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); |
42 | static unsigned long have_rcu_kthread_work; | 43 | static unsigned long have_rcu_kthread_work; |
43 | static void invoke_rcu_kthread(void); | ||
44 | 44 | ||
45 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
46 | struct rcu_ctrlblk; | 46 | struct rcu_ctrlblk; |
47 | static void invoke_rcu_kthread(void); | ||
47 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 48 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
48 | static int rcu_kthread(void *arg); | 49 | static int rcu_kthread(void *arg); |
49 | static void __call_rcu(struct rcu_head *head, | 50 | static void __call_rcu(struct rcu_head *head, |
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void) | |||
79 | #endif /* #ifdef CONFIG_NO_HZ */ | 80 | #endif /* #ifdef CONFIG_NO_HZ */ |
80 | 81 | ||
81 | /* | 82 | /* |
82 | * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). | 83 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
83 | * Also disable irqs to avoid confusion due to interrupt handlers | 84 | * Also irqs are disabled to avoid confusion due to interrupt handlers |
84 | * invoking call_rcu(). | 85 | * invoking call_rcu(). |
85 | */ | 86 | */ |
86 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 87 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
87 | { | 88 | { |
88 | unsigned long flags; | ||
89 | |||
90 | local_irq_save(flags); | ||
91 | if (rcp->rcucblist != NULL && | 89 | if (rcp->rcucblist != NULL && |
92 | rcp->donetail != rcp->curtail) { | 90 | rcp->donetail != rcp->curtail) { |
93 | rcp->donetail = rcp->curtail; | 91 | rcp->donetail = rcp->curtail; |
94 | local_irq_restore(flags); | ||
95 | return 1; | 92 | return 1; |
96 | } | 93 | } |
97 | local_irq_restore(flags); | ||
98 | 94 | ||
99 | return 0; | 95 | return 0; |
100 | } | 96 | } |
101 | 97 | ||
102 | /* | 98 | /* |
99 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
100 | * or to boost readers. | ||
101 | */ | ||
102 | static void invoke_rcu_kthread(void) | ||
103 | { | ||
104 | have_rcu_kthread_work = 1; | ||
105 | wake_up(&rcu_kthread_wq); | ||
106 | } | ||
107 | |||
108 | /* | ||
103 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we | 109 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we |
104 | * are at it, given that any rcu quiescent state is also an rcu_bh | 110 | * are at it, given that any rcu quiescent state is also an rcu_bh |
105 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 111 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
106 | */ | 112 | */ |
107 | void rcu_sched_qs(int cpu) | 113 | void rcu_sched_qs(int cpu) |
108 | { | 114 | { |
115 | unsigned long flags; | ||
116 | |||
117 | local_irq_save(flags); | ||
109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 118 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 119 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
111 | invoke_rcu_kthread(); | 120 | invoke_rcu_kthread(); |
121 | local_irq_restore(flags); | ||
112 | } | 122 | } |
113 | 123 | ||
114 | /* | 124 | /* |
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu) | |||
116 | */ | 126 | */ |
117 | void rcu_bh_qs(int cpu) | 127 | void rcu_bh_qs(int cpu) |
118 | { | 128 | { |
129 | unsigned long flags; | ||
130 | |||
131 | local_irq_save(flags); | ||
119 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 132 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
120 | invoke_rcu_kthread(); | 133 | invoke_rcu_kthread(); |
134 | local_irq_restore(flags); | ||
121 | } | 135 | } |
122 | 136 | ||
123 | /* | 137 | /* |
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
167 | prefetch(next); | 181 | prefetch(next); |
168 | debug_rcu_head_unqueue(list); | 182 | debug_rcu_head_unqueue(list); |
169 | local_bh_disable(); | 183 | local_bh_disable(); |
170 | list->func(list); | 184 | __rcu_reclaim(list); |
171 | local_bh_enable(); | 185 | local_bh_enable(); |
172 | list = next; | 186 | list = next; |
173 | RCU_TRACE(cb_count++); | 187 | RCU_TRACE(cb_count++); |
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg) | |||
208 | } | 222 | } |
209 | 223 | ||
210 | /* | 224 | /* |
211 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
212 | * or to boost readers. | ||
213 | */ | ||
214 | static void invoke_rcu_kthread(void) | ||
215 | { | ||
216 | unsigned long flags; | ||
217 | |||
218 | local_irq_save(flags); | ||
219 | have_rcu_kthread_work = 1; | ||
220 | wake_up(&rcu_kthread_wq); | ||
221 | local_irq_restore(flags); | ||
222 | } | ||
223 | |||
224 | /* | ||
225 | * Wait for a grace period to elapse. But it is illegal to invoke | 225 | * Wait for a grace period to elapse. But it is illegal to invoke |
226 | * synchronize_sched() from within an RCU read-side critical section. | 226 | * synchronize_sched() from within an RCU read-side critical section. |
227 | * Therefore, any legal call to synchronize_sched() is a quiescent | 227 | * Therefore, any legal call to synchronize_sched() is a quiescent |
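In the rcutiny.c hunks above, the local_irq_save()/restore() pair moves out of rcu_qsctr_help() and into its callers, so the helper becomes a plain list-tail update and rcu_sched_qs() can advance both the sched and bh control blocks inside a single irq-off window. Restated with the diff's own identifiers:

    /* Advance ->donetail; the caller must have interrupts disabled. */
    static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
    {
            if (rcp->rcucblist != NULL &&
                rcp->donetail != rcp->curtail) {
                    rcp->donetail = rcp->curtail;
                    return 1;       /* New callbacks are now eligible to run. */
            }
            return 0;
    }

    void rcu_sched_qs(int cpu)
    {
            unsigned long flags;

            local_irq_save(flags);  /* One irq-off section covers both flavors. */
            if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
                rcu_qsctr_help(&rcu_bh_ctrlblk))
                    invoke_rcu_kthread();
            local_irq_restore(flags);
    }

rcu_bh_qs() follows the same pattern with only the rcu_bh_ctrlblk argument.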
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3cb8e362e883..f259c676195f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk { | |||
100 | u8 completed; /* Last grace period completed. */ | 100 | u8 completed; /* Last grace period completed. */ |
101 | /* If all three are equal, RCU is idle. */ | 101 | /* If all three are equal, RCU is idle. */ |
102 | #ifdef CONFIG_RCU_BOOST | 102 | #ifdef CONFIG_RCU_BOOST |
103 | s8 boosted_this_gp; /* Has boosting already happened? */ | ||
104 | unsigned long boost_time; /* When to start boosting (jiffies) */ | 103 | unsigned long boost_time; /* When to start boosting (jiffies) */ |
105 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 104 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
106 | #ifdef CONFIG_RCU_TRACE | 105 | #ifdef CONFIG_RCU_TRACE |
107 | unsigned long n_grace_periods; | 106 | unsigned long n_grace_periods; |
108 | #ifdef CONFIG_RCU_BOOST | 107 | #ifdef CONFIG_RCU_BOOST |
109 | unsigned long n_tasks_boosted; | 108 | unsigned long n_tasks_boosted; |
109 | /* Total number of tasks boosted. */ | ||
110 | unsigned long n_exp_boosts; | 110 | unsigned long n_exp_boosts; |
111 | /* Number of tasks boosted for expedited GP. */ | ||
111 | unsigned long n_normal_boosts; | 112 | unsigned long n_normal_boosts; |
112 | unsigned long n_normal_balk_blkd_tasks; | 113 | /* Number of tasks boosted for normal GP. */ |
113 | unsigned long n_normal_balk_gp_tasks; | 114 | unsigned long n_balk_blkd_tasks; |
114 | unsigned long n_normal_balk_boost_tasks; | 115 | /* Refused to boost: no blocked tasks. */ |
115 | unsigned long n_normal_balk_boosted; | 116 | unsigned long n_balk_exp_gp_tasks; |
116 | unsigned long n_normal_balk_notyet; | 117 | /* Refused to boost: nothing blocking GP. */ |
117 | unsigned long n_normal_balk_nos; | 118 | unsigned long n_balk_boost_tasks; |
118 | unsigned long n_exp_balk_blkd_tasks; | 119 | /* Refused to boost: already boosting. */ |
119 | unsigned long n_exp_balk_nos; | 120 | unsigned long n_balk_notyet; |
121 | /* Refused to boost: not yet time. */ | ||
122 | unsigned long n_balk_nos; | ||
123 | /* Refused to boost: not sure why, though. */ | ||
124 | /* This can happen due to race conditions. */ | ||
120 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 125 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
121 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 126 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
122 | }; | 127 | }; |
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t) | |||
201 | 206 | ||
202 | #ifdef CONFIG_RCU_BOOST | 207 | #ifdef CONFIG_RCU_BOOST |
203 | static void rcu_initiate_boost_trace(void); | 208 | static void rcu_initiate_boost_trace(void); |
204 | static void rcu_initiate_exp_boost_trace(void); | ||
205 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 209 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
206 | 210 | ||
207 | /* | 211 | /* |
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m) | |||
219 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | 223 | "N."[!rcu_preempt_ctrlblk.gp_tasks], |
220 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | 224 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); |
221 | #ifdef CONFIG_RCU_BOOST | 225 | #ifdef CONFIG_RCU_BOOST |
222 | seq_printf(m, " ttb=%c btg=", | 226 | seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", |
223 | "B."[!rcu_preempt_ctrlblk.boost_tasks]); | 227 | " ", |
224 | switch (rcu_preempt_ctrlblk.boosted_this_gp) { | 228 | "B."[!rcu_preempt_ctrlblk.boost_tasks], |
225 | case -1: | ||
226 | seq_puts(m, "exp"); | ||
227 | break; | ||
228 | case 0: | ||
229 | seq_puts(m, "no"); | ||
230 | break; | ||
231 | case 1: | ||
232 | seq_puts(m, "begun"); | ||
233 | break; | ||
234 | case 2: | ||
235 | seq_puts(m, "done"); | ||
236 | break; | ||
237 | default: | ||
238 | seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); | ||
239 | } | ||
240 | seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
241 | rcu_preempt_ctrlblk.n_tasks_boosted, | 229 | rcu_preempt_ctrlblk.n_tasks_boosted, |
242 | rcu_preempt_ctrlblk.n_exp_boosts, | 230 | rcu_preempt_ctrlblk.n_exp_boosts, |
243 | rcu_preempt_ctrlblk.n_normal_boosts, | 231 | rcu_preempt_ctrlblk.n_normal_boosts, |
244 | (int)(jiffies & 0xffff), | 232 | (int)(jiffies & 0xffff), |
245 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | 233 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); |
246 | seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", | 234 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", |
247 | "normal balk", | 235 | " balk", |
248 | rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, | 236 | rcu_preempt_ctrlblk.n_balk_blkd_tasks, |
249 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, | 237 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, |
250 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, | 238 | rcu_preempt_ctrlblk.n_balk_boost_tasks, |
251 | rcu_preempt_ctrlblk.n_normal_balk_boosted, | 239 | rcu_preempt_ctrlblk.n_balk_notyet, |
252 | rcu_preempt_ctrlblk.n_normal_balk_notyet, | 240 | rcu_preempt_ctrlblk.n_balk_nos); |
253 | rcu_preempt_ctrlblk.n_normal_balk_nos); | ||
254 | seq_printf(m, " exp balk: bt=%lu nos=%lu\n", | ||
255 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, | ||
256 | rcu_preempt_ctrlblk.n_exp_balk_nos); | ||
257 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 241 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
258 | } | 242 | } |
259 | 243 | ||
@@ -271,25 +255,59 @@ static int rcu_boost(void) | |||
271 | { | 255 | { |
272 | unsigned long flags; | 256 | unsigned long flags; |
273 | struct rt_mutex mtx; | 257 | struct rt_mutex mtx; |
274 | struct list_head *np; | ||
275 | struct task_struct *t; | 258 | struct task_struct *t; |
259 | struct list_head *tb; | ||
276 | 260 | ||
277 | if (rcu_preempt_ctrlblk.boost_tasks == NULL) | 261 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && |
262 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
278 | return 0; /* Nothing to boost. */ | 263 | return 0; /* Nothing to boost. */ |
264 | |||
279 | raw_local_irq_save(flags); | 265 | raw_local_irq_save(flags); |
280 | rcu_preempt_ctrlblk.boosted_this_gp++; | 266 | |
281 | t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, | 267 | /* |
282 | rcu_node_entry); | 268 | * Recheck with irqs disabled: all tasks in need of boosting |
283 | np = rcu_next_node_entry(t); | 269 | * might exit their RCU read-side critical sections on their own |
270 | * if we are preempted just before disabling irqs. | ||
271 | */ | ||
272 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
273 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | ||
274 | raw_local_irq_restore(flags); | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * Preferentially boost tasks blocking expedited grace periods. | ||
280 | * This cannot starve the normal grace periods because a second | ||
281 | * expedited grace period must boost all blocked tasks, including | ||
282 | * those blocking the pre-existing normal grace period. | ||
283 | */ | ||
284 | if (rcu_preempt_ctrlblk.exp_tasks != NULL) { | ||
285 | tb = rcu_preempt_ctrlblk.exp_tasks; | ||
286 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
287 | } else { | ||
288 | tb = rcu_preempt_ctrlblk.boost_tasks; | ||
289 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
290 | } | ||
291 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
292 | |||
293 | /* | ||
294 | * We boost task t by manufacturing an rt_mutex that appears to | ||
295 | * be held by task t. We leave a pointer to that rt_mutex where | ||
296 | * task t can find it, and task t will release the mutex when it | ||
297 | * exits its outermost RCU read-side critical section. Then | ||
298 | * simply acquiring this artificial rt_mutex will boost task | ||
299 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
300 | */ | ||
301 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
284 | rt_mutex_init_proxy_locked(&mtx, t); | 302 | rt_mutex_init_proxy_locked(&mtx, t); |
285 | t->rcu_boost_mutex = &mtx; | 303 | t->rcu_boost_mutex = &mtx; |
286 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | 304 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; |
287 | raw_local_irq_restore(flags); | 305 | raw_local_irq_restore(flags); |
288 | rt_mutex_lock(&mtx); | 306 | rt_mutex_lock(&mtx); |
289 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | 307 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
290 | rcu_preempt_ctrlblk.boosted_this_gp++; | 308 | |
291 | rt_mutex_unlock(&mtx); | 309 | return rcu_preempt_ctrlblk.boost_tasks != NULL || |
292 | return rcu_preempt_ctrlblk.boost_tasks != NULL; | 310 | rcu_preempt_ctrlblk.exp_tasks != NULL; |
293 | } | 311 | } |
294 | 312 | ||
295 | /* | 313 | /* |
@@ -304,42 +322,25 @@ static int rcu_boost(void) | |||
304 | */ | 322 | */ |
305 | static int rcu_initiate_boost(void) | 323 | static int rcu_initiate_boost(void) |
306 | { | 324 | { |
307 | if (!rcu_preempt_blocked_readers_cgp()) { | 325 | if (!rcu_preempt_blocked_readers_cgp() && |
308 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); | 326 | rcu_preempt_ctrlblk.exp_tasks == NULL) { |
327 | RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); | ||
309 | return 0; | 328 | return 0; |
310 | } | 329 | } |
311 | if (rcu_preempt_ctrlblk.gp_tasks != NULL && | 330 | if (rcu_preempt_ctrlblk.exp_tasks != NULL || |
312 | rcu_preempt_ctrlblk.boost_tasks == NULL && | 331 | (rcu_preempt_ctrlblk.gp_tasks != NULL && |
313 | rcu_preempt_ctrlblk.boosted_this_gp == 0 && | 332 | rcu_preempt_ctrlblk.boost_tasks == NULL && |
314 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { | 333 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { |
315 | rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; | 334 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) |
335 | rcu_preempt_ctrlblk.boost_tasks = | ||
336 | rcu_preempt_ctrlblk.gp_tasks; | ||
316 | invoke_rcu_kthread(); | 337 | invoke_rcu_kthread(); |
317 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
318 | } else | 338 | } else |
319 | RCU_TRACE(rcu_initiate_boost_trace()); | 339 | RCU_TRACE(rcu_initiate_boost_trace()); |
320 | return 1; | 340 | return 1; |
321 | } | 341 | } |
322 | 342 | ||
323 | /* | 343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
324 | * Initiate boosting for an expedited grace period. | ||
325 | */ | ||
326 | static void rcu_initiate_expedited_boost(void) | ||
327 | { | ||
328 | unsigned long flags; | ||
329 | |||
330 | raw_local_irq_save(flags); | ||
331 | if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { | ||
332 | rcu_preempt_ctrlblk.boost_tasks = | ||
333 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
334 | rcu_preempt_ctrlblk.boosted_this_gp = -1; | ||
335 | invoke_rcu_kthread(); | ||
336 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
337 | } else | ||
338 | RCU_TRACE(rcu_initiate_exp_boost_trace()); | ||
339 | raw_local_irq_restore(flags); | ||
340 | } | ||
341 | |||
342 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); | ||
343 | 344 | ||
344 | /* | 345 | /* |
345 | * Do priority-boost accounting for the start of a new grace period. | 346 | * Do priority-boost accounting for the start of a new grace period. |
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void) | |||
347 | static void rcu_preempt_boost_start_gp(void) | 348 | static void rcu_preempt_boost_start_gp(void) |
348 | { | 349 | { |
349 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | 350 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; |
350 | if (rcu_preempt_ctrlblk.boosted_this_gp > 0) | ||
351 | rcu_preempt_ctrlblk.boosted_this_gp = 0; | ||
352 | } | 351 | } |
353 | 352 | ||
354 | #else /* #ifdef CONFIG_RCU_BOOST */ | 353 | #else /* #ifdef CONFIG_RCU_BOOST */ |
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void) | |||
372 | } | 371 | } |
373 | 372 | ||
374 | /* | 373 | /* |
375 | * If there is no RCU priority boosting, we don't initiate expedited boosting. | ||
376 | */ | ||
377 | static void rcu_initiate_expedited_boost(void) | ||
378 | { | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * If there is no RCU priority boosting, nothing to do at grace-period start. | 374 | * If there is no RCU priority boosting, nothing to do at grace-period start. |
383 | */ | 375 | */ |
384 | static void rcu_preempt_boost_start_gp(void) | 376 | static void rcu_preempt_boost_start_gp(void) |
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void) | |||
418 | if (!rcu_preempt_gp_in_progress()) | 410 | if (!rcu_preempt_gp_in_progress()) |
419 | return; | 411 | return; |
420 | /* | 412 | /* |
421 | * Check up on boosting. If there are no readers blocking the | 413 | * Check up on boosting. If there are readers blocking the |
422 | * current grace period, leave. | 414 | * current grace period, leave. |
423 | */ | 415 | */ |
424 | if (rcu_initiate_boost()) | 416 | if (rcu_initiate_boost()) |
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
578 | empty = !rcu_preempt_blocked_readers_cgp(); | 570 | empty = !rcu_preempt_blocked_readers_cgp(); |
579 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | 571 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; |
580 | np = rcu_next_node_entry(t); | 572 | np = rcu_next_node_entry(t); |
581 | list_del(&t->rcu_node_entry); | 573 | list_del_init(&t->rcu_node_entry); |
582 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | 574 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) |
583 | rcu_preempt_ctrlblk.gp_tasks = np; | 575 | rcu_preempt_ctrlblk.gp_tasks = np; |
584 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | 576 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) |
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
587 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | 579 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) |
588 | rcu_preempt_ctrlblk.boost_tasks = np; | 580 | rcu_preempt_ctrlblk.boost_tasks = np; |
589 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 581 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
590 | INIT_LIST_HEAD(&t->rcu_node_entry); | ||
591 | 582 | ||
592 | /* | 583 | /* |
593 | * If this was the last task on the current list, and if | 584 | * If this was the last task on the current list, and if |
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void) | |||
812 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | 803 | rpcp->exp_tasks = rpcp->blkd_tasks.next; |
813 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | 804 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) |
814 | rpcp->exp_tasks = NULL; | 805 | rpcp->exp_tasks = NULL; |
815 | local_irq_restore(flags); | ||
816 | 806 | ||
817 | /* Wait for tail of ->blkd_tasks list to drain. */ | 807 | /* Wait for tail of ->blkd_tasks list to drain. */ |
818 | if (rcu_preempted_readers_exp()) | 808 | if (!rcu_preempted_readers_exp()) |
819 | rcu_initiate_expedited_boost(); | 809 | local_irq_restore(flags); |
810 | else { | ||
811 | rcu_initiate_boost(); | ||
812 | local_irq_restore(flags); | ||
820 | wait_event(sync_rcu_preempt_exp_wq, | 813 | wait_event(sync_rcu_preempt_exp_wq, |
821 | !rcu_preempted_readers_exp()); | 814 | !rcu_preempted_readers_exp()); |
815 | } | ||
822 | 816 | ||
823 | /* Clean up and exit. */ | 817 | /* Clean up and exit. */ |
824 | barrier(); /* ensure expedited GP seen before counter increment. */ | 818 | barrier(); /* ensure expedited GP seen before counter increment. */ |
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void) | |||
931 | 925 | ||
932 | static void rcu_initiate_boost_trace(void) | 926 | static void rcu_initiate_boost_trace(void) |
933 | { | 927 | { |
934 | if (rcu_preempt_ctrlblk.gp_tasks == NULL) | 928 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) |
935 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; | 929 | rcu_preempt_ctrlblk.n_balk_blkd_tasks++; |
930 | else if (rcu_preempt_ctrlblk.gp_tasks == NULL && | ||
931 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
932 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; | ||
936 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | 933 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) |
937 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; | 934 | rcu_preempt_ctrlblk.n_balk_boost_tasks++; |
938 | else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) | ||
939 | rcu_preempt_ctrlblk.n_normal_balk_boosted++; | ||
940 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | 935 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) |
941 | rcu_preempt_ctrlblk.n_normal_balk_notyet++; | 936 | rcu_preempt_ctrlblk.n_balk_notyet++; |
942 | else | ||
943 | rcu_preempt_ctrlblk.n_normal_balk_nos++; | ||
944 | } | ||
945 | |||
946 | static void rcu_initiate_exp_boost_trace(void) | ||
947 | { | ||
948 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
949 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; | ||
950 | else | 937 | else |
951 | rcu_preempt_ctrlblk.n_exp_balk_nos++; | 938 | rcu_preempt_ctrlblk.n_balk_nos++; |
952 | } | 939 | } |
953 | 940 | ||
954 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 941 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
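The rewritten rcu_boost() above first prefers readers blocking an expedited grace period, then boosts the chosen reader by manufacturing an rt_mutex that appears to be held by it: acquiring that proxy-locked mutex lends the booster's priority to the reader, which releases the mutex when it leaves its outermost RCU read-side critical section. The core of the mechanism, trimmed to the boosting step (the _sketch name is illustrative only; the irq-disabled recheck and the RCU_TRACE counters from the real function are omitted):

    static int rcu_boost_core_sketch(void)
    {
            struct rt_mutex mtx;
            struct list_head *tb;
            struct task_struct *t;
            unsigned long flags;

            raw_local_irq_save(flags);
            /* Prefer readers blocking an expedited grace period. */
            tb = rcu_preempt_ctrlblk.exp_tasks ?: rcu_preempt_ctrlblk.boost_tasks;
            if (tb == NULL) {
                    raw_local_irq_restore(flags);
                    return 0;               /* Nothing to boost. */
            }
            t = container_of(tb, struct task_struct, rcu_node_entry);

            /* Pretend @t already holds @mtx, and tell it where to find it. */
            rt_mutex_init_proxy_locked(&mtx, t);
            t->rcu_boost_mutex = &mtx;
            t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
            raw_local_irq_restore(flags);

            rt_mutex_lock(&mtx);            /* PI-boosts @t until it unlocks. */
            rt_mutex_unlock(&mtx);          /* Keep lockdep happy. */

            return rcu_preempt_ctrlblk.boost_tasks != NULL ||
                   rcu_preempt_ctrlblk.exp_tasks != NULL;
    }

The nonzero return indicates that further boosting passes may still be required.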
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c224da41890c..2e138db03382 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -131,7 +131,7 @@ struct rcu_torture { | |||
131 | 131 | ||
132 | static LIST_HEAD(rcu_torture_freelist); | 132 | static LIST_HEAD(rcu_torture_freelist); |
133 | static struct rcu_torture __rcu *rcu_torture_current; | 133 | static struct rcu_torture __rcu *rcu_torture_current; |
134 | static long rcu_torture_current_version; | 134 | static unsigned long rcu_torture_current_version; |
135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
136 | static DEFINE_SPINLOCK(rcu_torture_lock); | 136 | static DEFINE_SPINLOCK(rcu_torture_lock); |
137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | 137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror; | |||
146 | static atomic_t n_rcu_torture_error; | 146 | static atomic_t n_rcu_torture_error; |
147 | static long n_rcu_torture_boost_ktrerror; | 147 | static long n_rcu_torture_boost_ktrerror; |
148 | static long n_rcu_torture_boost_rterror; | 148 | static long n_rcu_torture_boost_rterror; |
149 | static long n_rcu_torture_boost_allocerror; | ||
150 | static long n_rcu_torture_boost_afferror; | ||
151 | static long n_rcu_torture_boost_failure; | 149 | static long n_rcu_torture_boost_failure; |
152 | static long n_rcu_torture_boosts; | 150 | static long n_rcu_torture_boosts; |
153 | static long n_rcu_torture_timers; | 151 | static long n_rcu_torture_timers; |
@@ -163,11 +161,11 @@ static int stutter_pause_test; | |||
163 | #endif | 161 | #endif |
164 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
165 | 163 | ||
166 | #ifdef CONFIG_RCU_BOOST | 164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
167 | #define rcu_can_boost() 1 | 165 | #define rcu_can_boost() 1 |
168 | #else /* #ifdef CONFIG_RCU_BOOST */ | 166 | #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
169 | #define rcu_can_boost() 0 | 167 | #define rcu_can_boost() 0 |
170 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | 168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
171 | 169 | ||
172 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
173 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg) | |||
751 | n_rcu_torture_boost_rterror++; | 749 | n_rcu_torture_boost_rterror++; |
752 | } | 750 | } |
753 | 751 | ||
752 | init_rcu_head_on_stack(&rbi.rcu); | ||
754 | /* Each pass through the following loop does one boost-test cycle. */ | 753 | /* Each pass through the following loop does one boost-test cycle. */ |
755 | do { | 754 | do { |
756 | /* Wait for the next test interval. */ | 755 | /* Wait for the next test interval. */ |
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); | |||
810 | 809 | ||
811 | /* Clean up and exit. */ | 810 | /* Clean up and exit. */ |
812 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | 811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); |
812 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
813 | rcutorture_shutdown_absorb("rcu_torture_boost"); | 813 | rcutorture_shutdown_absorb("rcu_torture_boost"); |
814 | while (!kthread_should_stop() || rbi.inflight) | 814 | while (!kthread_should_stop() || rbi.inflight) |
815 | schedule_timeout_uninterruptible(1); | 815 | schedule_timeout_uninterruptible(1); |
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg) | |||
886 | old_rp->rtort_pipe_count++; | 886 | old_rp->rtort_pipe_count++; |
887 | cur_ops->deferred_free(old_rp); | 887 | cur_ops->deferred_free(old_rp); |
888 | } | 888 | } |
889 | rcu_torture_current_version++; | 889 | rcutorture_record_progress(++rcu_torture_current_version); |
890 | oldbatch = cur_ops->completed(); | 890 | oldbatch = cur_ops->completed(); |
891 | rcu_stutter_wait("rcu_torture_writer"); | 891 | rcu_stutter_wait("rcu_torture_writer"); |
892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
@@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page) | |||
1066 | } | 1066 | } |
1067 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1067 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
1068 | cnt += sprintf(&page[cnt], | 1068 | cnt += sprintf(&page[cnt], |
1069 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 1069 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1070 | "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " | 1070 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1071 | "rtbf: %ld rtb: %ld nt: %ld", | 1071 | "rtbf: %ld rtb: %ld nt: %ld", |
1072 | rcu_torture_current, | 1072 | rcu_torture_current, |
1073 | rcu_torture_current_version, | 1073 | rcu_torture_current_version, |
@@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page) | |||
1078 | atomic_read(&n_rcu_torture_mberror), | 1078 | atomic_read(&n_rcu_torture_mberror), |
1079 | n_rcu_torture_boost_ktrerror, | 1079 | n_rcu_torture_boost_ktrerror, |
1080 | n_rcu_torture_boost_rterror, | 1080 | n_rcu_torture_boost_rterror, |
1081 | n_rcu_torture_boost_allocerror, | ||
1082 | n_rcu_torture_boost_afferror, | ||
1083 | n_rcu_torture_boost_failure, | 1081 | n_rcu_torture_boost_failure, |
1084 | n_rcu_torture_boosts, | 1082 | n_rcu_torture_boosts, |
1085 | n_rcu_torture_timers); | 1083 | n_rcu_torture_timers); |
1086 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1084 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1087 | n_rcu_torture_boost_ktrerror != 0 || | 1085 | n_rcu_torture_boost_ktrerror != 0 || |
1088 | n_rcu_torture_boost_rterror != 0 || | 1086 | n_rcu_torture_boost_rterror != 0 || |
1089 | n_rcu_torture_boost_allocerror != 0 || | ||
1090 | n_rcu_torture_boost_afferror != 0 || | ||
1091 | n_rcu_torture_boost_failure != 0) | 1087 | n_rcu_torture_boost_failure != 0) |
1092 | cnt += sprintf(&page[cnt], " !!!"); | 1088 | cnt += sprintf(&page[cnt], " !!!"); |
1093 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1089 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
@@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void) | |||
1331 | int i; | 1327 | int i; |
1332 | 1328 | ||
1333 | mutex_lock(&fullstop_mutex); | 1329 | mutex_lock(&fullstop_mutex); |
1330 | rcutorture_record_test_transition(); | ||
1334 | if (fullstop == FULLSTOP_SHUTDOWN) { | 1331 | if (fullstop == FULLSTOP_SHUTDOWN) { |
1335 | printk(KERN_WARNING /* but going down anyway, so... */ | 1332 | printk(KERN_WARNING /* but going down anyway, so... */ |
1336 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 1333 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
@@ -1486,8 +1483,6 @@ rcu_torture_init(void) | |||
1486 | atomic_set(&n_rcu_torture_error, 0); | 1483 | atomic_set(&n_rcu_torture_error, 0); |
1487 | n_rcu_torture_boost_ktrerror = 0; | 1484 | n_rcu_torture_boost_ktrerror = 0; |
1488 | n_rcu_torture_boost_rterror = 0; | 1485 | n_rcu_torture_boost_rterror = 0; |
1489 | n_rcu_torture_boost_allocerror = 0; | ||
1490 | n_rcu_torture_boost_afferror = 0; | ||
1491 | n_rcu_torture_boost_failure = 0; | 1486 | n_rcu_torture_boost_failure = 0; |
1492 | n_rcu_torture_boosts = 0; | 1487 | n_rcu_torture_boosts = 0; |
1493 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1488 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
@@ -1624,6 +1619,7 @@ rcu_torture_init(void) | |||
1624 | } | 1619 | } |
1625 | } | 1620 | } |
1626 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1621 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1622 | rcutorture_record_test_transition(); | ||
1627 | mutex_unlock(&fullstop_mutex); | 1623 | mutex_unlock(&fullstop_mutex); |
1628 | return 0; | 1624 | return 0; |
1629 | 1625 | ||
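The rcutorture hunks above wire the test into the correlation counters that this patch defines later in rcutree.c: the transition hook runs once at test start and once at teardown, so rcutorture_testseq is odd exactly while a test is in flight, and each writer pass reports the incremented rcu_torture_current_version. In caller form, the three call sites visible above look like this (context trimmed to the relevant lines):

    /* rcu_torture_init(), just before dropping fullstop_mutex: */
    register_reboot_notifier(&rcutorture_shutdown_nb);
    rcutorture_record_test_transition();    /* testseq -> odd: test running */
    mutex_unlock(&fullstop_mutex);

    /* rcu_torture_writer(), once per completed update: */
    rcutorture_record_progress(++rcu_torture_current_version);

    /* rcu_torture_cleanup(), right after taking fullstop_mutex: */
    mutex_lock(&fullstop_mutex);
    rcutorture_record_test_transition();    /* testseq -> even: test stopped */

This lets the debugfs RCU tracing output be lined up against rcutorture's own progress messages even across repeated module loads.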
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea806f8e..ba06207b1dd3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -36,7 +36,7 @@ | |||
36 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
37 | #include <linux/sched.h> | 37 | #include <linux/sched.h> |
38 | #include <linux/nmi.h> | 38 | #include <linux/nmi.h> |
39 | #include <asm/atomic.h> | 39 | #include <linux/atomic.h> |
40 | #include <linux/bitops.h> | 40 | #include <linux/bitops.h> |
41 | #include <linux/module.h> | 41 | #include <linux/module.h> |
42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
@@ -47,6 +47,9 @@ | |||
47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
48 | #include <linux/time.h> | 48 | #include <linux/time.h> |
49 | #include <linux/kernel_stat.h> | 49 | #include <linux/kernel_stat.h> |
50 | #include <linux/wait.h> | ||
51 | #include <linux/kthread.h> | ||
52 | #include <linux/prefetch.h> | ||
50 | 53 | ||
51 | #include "rcutree.h" | 54 | #include "rcutree.h" |
52 | 55 | ||
@@ -79,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
79 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 82 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); |
80 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
81 | 84 | ||
85 | static struct rcu_state *rcu_state; | ||
86 | |||
87 | /* | ||
88 | * The rcu_scheduler_active variable transitions from zero to one just | ||
89 | * before the first task is spawned. So when this variable is zero, RCU | ||
90 | * can assume that there is but one task, allowing RCU to (for example) | ||
91 | * optimized synchronize_sched() to a simple barrier(). When this variable | ||
92 | * is one, RCU must actually do all the hard work required to detect real | ||
93 | * grace periods. This variable is also used to suppress boot-time false | ||
94 | * positives from lockdep-RCU error checking. | ||
95 | */ | ||
82 | int rcu_scheduler_active __read_mostly; | 96 | int rcu_scheduler_active __read_mostly; |
83 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 97 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
84 | 98 | ||
85 | /* | 99 | /* |
100 | * The rcu_scheduler_fully_active variable transitions from zero to one | ||
101 | * during the early_initcall() processing, which is after the scheduler | ||
102 | * is capable of creating new tasks. So RCU processing (for example, | ||
103 | * creating tasks for RCU priority boosting) must be delayed until after | ||
104 | * rcu_scheduler_fully_active transitions from zero to one. We also | ||
105 | * currently delay invocation of any RCU callbacks until after this point. | ||
106 | * | ||
107 | * It might later prove better for people registering RCU callbacks during | ||
108 | * early boot to take responsibility for these callbacks, but one step at | ||
109 | * a time. | ||
110 | */ | ||
111 | static int rcu_scheduler_fully_active __read_mostly; | ||
112 | |||
113 | #ifdef CONFIG_RCU_BOOST | ||
114 | |||
115 | /* | ||
116 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
117 | * handle all flavors of RCU. | ||
118 | */ | ||
119 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
120 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
121 | DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
122 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
123 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
124 | |||
125 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
126 | |||
127 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | ||
128 | static void invoke_rcu_core(void); | ||
129 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | ||
130 | |||
131 | #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ | ||
132 | |||
133 | /* | ||
134 | * Track the rcutorture test sequence number and the update version | ||
135 | * number within a given test. The rcutorture_testseq is incremented | ||
136 | * on every rcutorture module load and unload, so has an odd value | ||
137 | * when a test is running. The rcutorture_vernum is set to zero | ||
138 | * when rcutorture starts and is incremented on each rcutorture update. | ||
139 | * These variables enable correlating rcutorture output with the | ||
140 | * RCU tracing information. | ||
141 | */ | ||
142 | unsigned long rcutorture_testseq; | ||
143 | unsigned long rcutorture_vernum; | ||
144 | |||
145 | /* | ||
86 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 146 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
87 | * permit this function to be invoked without holding the root rcu_node | 147 | * permit this function to be invoked without holding the root rcu_node |
88 | * structure's ->lock, but of course results can be subject to change. | 148 | * structure's ->lock, but of course results can be subject to change. |
@@ -124,11 +184,12 @@ void rcu_note_context_switch(int cpu) | |||
124 | rcu_sched_qs(cpu); | 184 | rcu_sched_qs(cpu); |
125 | rcu_preempt_note_context_switch(cpu); | 185 | rcu_preempt_note_context_switch(cpu); |
126 | } | 186 | } |
187 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | ||
127 | 188 | ||
128 | #ifdef CONFIG_NO_HZ | 189 | #ifdef CONFIG_NO_HZ |
129 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 190 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
130 | .dynticks_nesting = 1, | 191 | .dynticks_nesting = 1, |
131 | .dynticks = 1, | 192 | .dynticks = ATOMIC_INIT(1), |
132 | }; | 193 | }; |
133 | #endif /* #ifdef CONFIG_NO_HZ */ | 194 | #endif /* #ifdef CONFIG_NO_HZ */ |
134 | 195 | ||
@@ -140,10 +201,8 @@ module_param(blimit, int, 0); | |||
140 | module_param(qhimark, int, 0); | 201 | module_param(qhimark, int, 0); |
141 | module_param(qlowmark, int, 0); | 202 | module_param(qlowmark, int, 0); |
142 | 203 | ||
143 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 204 | int rcu_cpu_stall_suppress __read_mostly; |
144 | int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; | ||
145 | module_param(rcu_cpu_stall_suppress, int, 0644); | 205 | module_param(rcu_cpu_stall_suppress, int, 0644); |
146 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
147 | 206 | ||
148 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 207 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
149 | static int rcu_pending(int cpu); | 208 | static int rcu_pending(int cpu); |
@@ -176,6 +235,31 @@ void rcu_bh_force_quiescent_state(void) | |||
176 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 235 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
177 | 236 | ||
178 | /* | 237 | /* |
238 | * Record the number of times rcutorture tests have been initiated and | ||
239 | * terminated. This information allows the debugfs tracing stats to be | ||
240 | * correlated to the rcutorture messages, even when the rcutorture module | ||
241 | * is being repeatedly loaded and unloaded. In other words, we cannot | ||
242 | * store this state in rcutorture itself. | ||
243 | */ | ||
244 | void rcutorture_record_test_transition(void) | ||
245 | { | ||
246 | rcutorture_testseq++; | ||
247 | rcutorture_vernum = 0; | ||
248 | } | ||
249 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); | ||
250 | |||
251 | /* | ||
252 | * Record the number of writer passes through the current rcutorture test. | ||
253 | * This is also used to correlate debugfs tracing stats with the rcutorture | ||
254 | * messages. | ||
255 | */ | ||
256 | void rcutorture_record_progress(unsigned long vernum) | ||
257 | { | ||
258 | rcutorture_vernum++; | ||
259 | } | ||
260 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | ||
261 | |||
262 | /* | ||
179 | * Force a quiescent state for RCU-sched. | 263 | * Force a quiescent state for RCU-sched. |
180 | */ | 264 | */ |
181 | void rcu_sched_force_quiescent_state(void) | 265 | void rcu_sched_force_quiescent_state(void) |
@@ -234,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
234 | return 1; | 318 | return 1; |
235 | } | 319 | } |
236 | 320 | ||
237 | /* If preemptable RCU, no point in sending reschedule IPI. */ | 321 | /* If preemptible RCU, no point in sending reschedule IPI. */ |
238 | if (rdp->preemptable) | 322 | if (rdp->preemptible) |
239 | return 0; | 323 | return 0; |
240 | 324 | ||
241 | /* The CPU is online, so send it a reschedule IPI. */ | 325 | /* The CPU is online, so send it a reschedule IPI. */ |
@@ -264,13 +348,25 @@ void rcu_enter_nohz(void) | |||
264 | unsigned long flags; | 348 | unsigned long flags; |
265 | struct rcu_dynticks *rdtp; | 349 | struct rcu_dynticks *rdtp; |
266 | 350 | ||
267 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | ||
268 | local_irq_save(flags); | 351 | local_irq_save(flags); |
269 | rdtp = &__get_cpu_var(rcu_dynticks); | 352 | rdtp = &__get_cpu_var(rcu_dynticks); |
270 | rdtp->dynticks++; | 353 | if (--rdtp->dynticks_nesting) { |
271 | rdtp->dynticks_nesting--; | 354 | local_irq_restore(flags); |
272 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | 355 | return; |
356 | } | ||
357 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
358 | smp_mb__before_atomic_inc(); /* See above. */ | ||
359 | atomic_inc(&rdtp->dynticks); | ||
360 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
361 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
273 | local_irq_restore(flags); | 362 | local_irq_restore(flags); |
363 | |||
364 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
365 | if (in_irq() && | ||
366 | (__get_cpu_var(rcu_sched_data).nxtlist || | ||
367 | __get_cpu_var(rcu_bh_data).nxtlist || | ||
368 | rcu_preempt_needs_cpu(smp_processor_id()))) | ||
369 | set_need_resched(); | ||
274 | } | 370 | } |
275 | 371 | ||
276 | /* | 372 | /* |
@@ -286,11 +382,16 @@ void rcu_exit_nohz(void) | |||
286 | 382 | ||
287 | local_irq_save(flags); | 383 | local_irq_save(flags); |
288 | rdtp = &__get_cpu_var(rcu_dynticks); | 384 | rdtp = &__get_cpu_var(rcu_dynticks); |
289 | rdtp->dynticks++; | 385 | if (rdtp->dynticks_nesting++) { |
290 | rdtp->dynticks_nesting++; | 386 | local_irq_restore(flags); |
291 | WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); | 387 | return; |
388 | } | ||
389 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | ||
390 | atomic_inc(&rdtp->dynticks); | ||
391 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
392 | smp_mb__after_atomic_inc(); /* See above. */ | ||
393 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
292 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
293 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | ||
294 | } | 395 | } |
295 | 396 | ||
296 | /** | 397 | /** |
@@ -304,11 +405,15 @@ void rcu_nmi_enter(void) | |||
304 | { | 405 | { |
305 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 406 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); |
306 | 407 | ||
307 | if (rdtp->dynticks & 0x1) | 408 | if (rdtp->dynticks_nmi_nesting == 0 && |
409 | (atomic_read(&rdtp->dynticks) & 0x1)) | ||
308 | return; | 410 | return; |
309 | rdtp->dynticks_nmi++; | 411 | rdtp->dynticks_nmi_nesting++; |
310 | WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); | 412 | smp_mb__before_atomic_inc(); /* Force delay from prior write. */ |
311 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | 413 | atomic_inc(&rdtp->dynticks); |
414 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
415 | smp_mb__after_atomic_inc(); /* See above. */ | ||
416 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
312 | } | 417 | } |
313 | 418 | ||
314 | /** | 419 | /** |
@@ -322,11 +427,14 @@ void rcu_nmi_exit(void) | |||
322 | { | 427 | { |
323 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 428 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); |
324 | 429 | ||
325 | if (rdtp->dynticks & 0x1) | 430 | if (rdtp->dynticks_nmi_nesting == 0 || |
431 | --rdtp->dynticks_nmi_nesting != 0) | ||
326 | return; | 432 | return; |
327 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | 433 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
328 | rdtp->dynticks_nmi++; | 434 | smp_mb__before_atomic_inc(); /* See above. */ |
329 | WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); | 435 | atomic_inc(&rdtp->dynticks); |
436 | smp_mb__after_atomic_inc(); /* Force delay to next write. */ | ||
437 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
330 | } | 438 | } |
331 | 439 | ||
332 | /** | 440 | /** |
@@ -337,13 +445,7 @@ void rcu_nmi_exit(void) | |||
337 | */ | 445 | */ |
338 | void rcu_irq_enter(void) | 446 | void rcu_irq_enter(void) |
339 | { | 447 | { |
340 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 448 | rcu_exit_nohz(); |
341 | |||
342 | if (rdtp->dynticks_nesting++) | ||
343 | return; | ||
344 | rdtp->dynticks++; | ||
345 | WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); | ||
346 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | ||
347 | } | 449 | } |
348 | 450 | ||
349 | /** | 451 | /** |
@@ -355,18 +457,7 @@ void rcu_irq_enter(void) | |||
355 | */ | 457 | */ |
356 | void rcu_irq_exit(void) | 458 | void rcu_irq_exit(void) |
357 | { | 459 | { |
358 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 460 | rcu_enter_nohz(); |
359 | |||
360 | if (--rdtp->dynticks_nesting) | ||
361 | return; | ||
362 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | ||
363 | rdtp->dynticks++; | ||
364 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | ||
365 | |||
366 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
367 | if (__this_cpu_read(rcu_sched_data.nxtlist) || | ||
368 | __this_cpu_read(rcu_bh_data.nxtlist)) | ||
369 | set_need_resched(); | ||
370 | } | 461 | } |
371 | 462 | ||
372 | #ifdef CONFIG_SMP | 463 | #ifdef CONFIG_SMP |
@@ -378,19 +469,8 @@ void rcu_irq_exit(void) | |||
378 | */ | 469 | */ |
379 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 470 | static int dyntick_save_progress_counter(struct rcu_data *rdp) |
380 | { | 471 | { |
381 | int ret; | 472 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
382 | int snap; | 473 | return 0; |
383 | int snap_nmi; | ||
384 | |||
385 | snap = rdp->dynticks->dynticks; | ||
386 | snap_nmi = rdp->dynticks->dynticks_nmi; | ||
387 | smp_mb(); /* Order sampling of snap with end of grace period. */ | ||
388 | rdp->dynticks_snap = snap; | ||
389 | rdp->dynticks_nmi_snap = snap_nmi; | ||
390 | ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); | ||
391 | if (ret) | ||
392 | rdp->dynticks_fqs++; | ||
393 | return ret; | ||
394 | } | 474 | } |
395 | 475 | ||
396 | /* | 476 | /* |
@@ -401,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
401 | */ | 481 | */ |
402 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 482 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
403 | { | 483 | { |
404 | long curr; | 484 | unsigned long curr; |
405 | long curr_nmi; | 485 | unsigned long snap; |
406 | long snap; | ||
407 | long snap_nmi; | ||
408 | 486 | ||
409 | curr = rdp->dynticks->dynticks; | 487 | curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); |
410 | snap = rdp->dynticks_snap; | 488 | snap = (unsigned long)rdp->dynticks_snap; |
411 | curr_nmi = rdp->dynticks->dynticks_nmi; | ||
412 | snap_nmi = rdp->dynticks_nmi_snap; | ||
413 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
414 | 489 | ||
415 | /* | 490 | /* |
416 | * If the CPU passed through or entered a dynticks idle phase with | 491 | * If the CPU passed through or entered a dynticks idle phase with |
@@ -420,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
420 | * read-side critical section that started before the beginning | 495 | * read-side critical section that started before the beginning |
421 | * of the current RCU grace period. | 496 | * of the current RCU grace period. |
422 | */ | 497 | */ |
423 | if ((curr != snap || (curr & 0x1) == 0) && | 498 | if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { |
424 | (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { | ||
425 | rdp->dynticks_fqs++; | 499 | rdp->dynticks_fqs++; |
426 | return 1; | 500 | return 1; |
427 | } | 501 | } |
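With a single counter there is no separate NMI snapshot left to reconcile: dyntick_save_progress_counter() records atomic_add_return(0, ...) (a fully ordered read), and rcu_implicit_dynticks_qs() only has to compare two samples. A hedged sketch of that comparison; the helper below is illustrative, the in-tree check is the one shown in the hunk:

/*
 * curr and snap are the current and grace-period-start samples of a
 * CPU's ->dynticks counter.  That CPU is known to have been in a
 * quiescent state if it is idle right now (curr is even) or if the
 * counter advanced by at least 2, meaning it went idle and came back,
 * since every idle<->non-idle transition adds exactly 1.
 * Worked example: snap = 41 (non-idle at GP start), curr = 43 implies
 * 41 -> 42 (idle) -> 43 (non-idle), so a quiescent state occurred.
 */
static int passed_dynticks_quiescent_state(unsigned long curr,
					   unsigned long snap)
{
	return (curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2);
}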
@@ -450,8 +524,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
450 | 524 | ||
451 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 525 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
452 | 526 | ||
453 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
454 | |||
455 | int rcu_cpu_stall_suppress __read_mostly; | 527 | int rcu_cpu_stall_suppress __read_mostly; |
456 | 528 | ||
457 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 529 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
@@ -537,21 +609,24 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
537 | 609 | ||
538 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 610 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
539 | { | 611 | { |
540 | long delta; | 612 | unsigned long j; |
613 | unsigned long js; | ||
541 | struct rcu_node *rnp; | 614 | struct rcu_node *rnp; |
542 | 615 | ||
543 | if (rcu_cpu_stall_suppress) | 616 | if (rcu_cpu_stall_suppress) |
544 | return; | 617 | return; |
545 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); | 618 | j = ACCESS_ONCE(jiffies); |
619 | js = ACCESS_ONCE(rsp->jiffies_stall); | ||
546 | rnp = rdp->mynode; | 620 | rnp = rdp->mynode; |
547 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { | 621 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { |
548 | 622 | ||
549 | /* We haven't checked in, so go dump stack. */ | 623 | /* We haven't checked in, so go dump stack. */ |
550 | print_cpu_stall(rsp); | 624 | print_cpu_stall(rsp); |
551 | 625 | ||
552 | } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { | 626 | } else if (rcu_gp_in_progress(rsp) && |
627 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | ||
553 | 628 | ||
554 | /* They had two time units to dump stack, so complain. */ | 629 | /* They had a few time units to dump stack, so complain. */ |
555 | print_other_cpu_stall(rsp); | 630 | print_other_cpu_stall(rsp); |
556 | } | 631 | } |
557 | } | 632 | } |
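check_cpu_stall() now samples jiffies once and compares with ULONG_CMP_GE() instead of computing a signed delta, so the test stays correct even if the jiffies counter wraps between recording ->jiffies_stall and checking it. ULONG_CMP_GE()/ULONG_CMP_LT() come from rcupdate.h; the definitions below are the presumed ones, shown with a worked example near the wrap point:

/* Presumed definitions (rcupdate.h): modular "is a at or after b?" tests. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

/*
 * Example: js = ULONG_MAX - 5 and jiffies has since wrapped to j = 3.
 * Unsigned (j - js) is 9, which is <= ULONG_MAX / 2, so
 * ULONG_CMP_GE(j, js) is true: j is treated as being at or after js
 * even though j < js numerically, which is what the stall check needs.
 */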
@@ -587,26 +662,6 @@ static void __init check_cpu_stall_init(void) | |||
587 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | 662 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); |
588 | } | 663 | } |
589 | 664 | ||
590 | #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
591 | |||
592 | static void record_gp_stall_check_time(struct rcu_state *rsp) | ||
593 | { | ||
594 | } | ||
595 | |||
596 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | ||
597 | { | ||
598 | } | ||
599 | |||
600 | void rcu_cpu_stall_reset(void) | ||
601 | { | ||
602 | } | ||
603 | |||
604 | static void __init check_cpu_stall_init(void) | ||
605 | { | ||
606 | } | ||
607 | |||
608 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
609 | |||
610 | /* | 665 | /* |
611 | * Update CPU-local rcu_data state to record the newly noticed grace period. | 666 | * Update CPU-local rcu_data state to record the newly noticed grace period. |
612 | * This is used both when we started the grace period and when we notice | 667 | * This is used both when we started the grace period and when we notice |
@@ -809,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
809 | rnp->completed = rsp->completed; | 864 | rnp->completed = rsp->completed; |
810 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 865 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
811 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 866 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
867 | rcu_preempt_boost_start_gp(rnp); | ||
812 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 868 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
813 | return; | 869 | return; |
814 | } | 870 | } |
@@ -844,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
844 | rnp->completed = rsp->completed; | 900 | rnp->completed = rsp->completed; |
845 | if (rnp == rdp->mynode) | 901 | if (rnp == rdp->mynode) |
846 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 902 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
903 | rcu_preempt_boost_start_gp(rnp); | ||
847 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 904 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
848 | } | 905 | } |
849 | 906 | ||
@@ -864,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
864 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 921 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
865 | __releases(rcu_get_root(rsp)->lock) | 922 | __releases(rcu_get_root(rsp)->lock) |
866 | { | 923 | { |
924 | unsigned long gp_duration; | ||
925 | |||
867 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 926 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
927 | |||
928 | /* | ||
929 | * Ensure that all grace-period and pre-grace-period activity | ||
930 | * is seen before the assignment to rsp->completed. | ||
931 | */ | ||
932 | smp_mb(); /* See above block comment. */ | ||
933 | gp_duration = jiffies - rsp->gp_start; | ||
934 | if (gp_duration > rsp->gp_max) | ||
935 | rsp->gp_max = gp_duration; | ||
868 | rsp->completed = rsp->gpnum; | 936 | rsp->completed = rsp->gpnum; |
869 | rsp->signaled = RCU_GP_IDLE; | 937 | rsp->signaled = RCU_GP_IDLE; |
870 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 938 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
@@ -894,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
894 | return; | 962 | return; |
895 | } | 963 | } |
896 | rnp->qsmask &= ~mask; | 964 | rnp->qsmask &= ~mask; |
897 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 965 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
898 | 966 | ||
899 | /* Other bits still set at this level, so done. */ | 967 | /* Other bits still set at this level, so done. */ |
900 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 968 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
@@ -1037,6 +1105,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) | |||
1037 | /* | 1105 | /* |
1038 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy | 1106 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy |
1039 | * and move all callbacks from the outgoing CPU to the current one. | 1107 | * and move all callbacks from the outgoing CPU to the current one. |
1108 | * There can only be one CPU hotplug operation at a time, so no other | ||
1109 | * CPU can be attempting to update rcu_cpu_kthread_task. | ||
1040 | */ | 1110 | */ |
1041 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | 1111 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) |
1042 | { | 1112 | { |
@@ -1046,6 +1116,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1046 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1116 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1047 | struct rcu_node *rnp; | 1117 | struct rcu_node *rnp; |
1048 | 1118 | ||
1119 | rcu_stop_cpu_kthread(cpu); | ||
1120 | |||
1049 | /* Exclude any attempts to start a new grace period. */ | 1121 | /* Exclude any attempts to start a new grace period. */ |
1050 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1122 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1051 | 1123 | ||
@@ -1082,6 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1082 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1154 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1083 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1155 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1084 | rcu_report_exp_rnp(rsp, rnp); | 1156 | rcu_report_exp_rnp(rsp, rnp); |
1157 | rcu_node_kthread_setaffinity(rnp, -1); | ||
1085 | } | 1158 | } |
1086 | 1159 | ||
1087 | /* | 1160 | /* |
@@ -1143,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1143 | next = list->next; | 1216 | next = list->next; |
1144 | prefetch(next); | 1217 | prefetch(next); |
1145 | debug_rcu_head_unqueue(list); | 1218 | debug_rcu_head_unqueue(list); |
1146 | list->func(list); | 1219 | __rcu_reclaim(list); |
1147 | list = next; | 1220 | list = next; |
1148 | if (++count >= rdp->blimit) | 1221 | if (++count >= rdp->blimit) |
1149 | break; | 1222 | break; |
@@ -1179,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1179 | 1252 | ||
1180 | /* Re-raise the RCU softirq if there are callbacks remaining. */ | 1253 | /* Re-raise the RCU softirq if there are callbacks remaining. */ |
1181 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1254 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1182 | raise_softirq(RCU_SOFTIRQ); | 1255 | invoke_rcu_core(); |
1183 | } | 1256 | } |
1184 | 1257 | ||
1185 | /* | 1258 | /* |
@@ -1225,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
1225 | } | 1298 | } |
1226 | rcu_preempt_check_callbacks(cpu); | 1299 | rcu_preempt_check_callbacks(cpu); |
1227 | if (rcu_pending(cpu)) | 1300 | if (rcu_pending(cpu)) |
1228 | raise_softirq(RCU_SOFTIRQ); | 1301 | invoke_rcu_core(); |
1229 | } | 1302 | } |
1230 | 1303 | ||
1231 | #ifdef CONFIG_SMP | 1304 | #ifdef CONFIG_SMP |
@@ -1233,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user) | |||
1233 | /* | 1306 | /* |
1234 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1307 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
1235 | * have not yet encountered a quiescent state, using the function specified. | 1308 | * have not yet encountered a quiescent state, using the function specified. |
1309 | * Also initiate boosting for any threads blocked on the root rcu_node. | ||
1310 | * | ||
1236 | * The caller must have suppressed start of new grace periods. | 1311 | * The caller must have suppressed start of new grace periods. |
1237 | */ | 1312 | */ |
1238 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 1313 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) |
@@ -1251,7 +1326,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1251 | return; | 1326 | return; |
1252 | } | 1327 | } |
1253 | if (rnp->qsmask == 0) { | 1328 | if (rnp->qsmask == 0) { |
1254 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1329 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ |
1255 | continue; | 1330 | continue; |
1256 | } | 1331 | } |
1257 | cpu = rnp->grplo; | 1332 | cpu = rnp->grplo; |
@@ -1269,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1269 | } | 1344 | } |
1270 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1345 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1271 | } | 1346 | } |
1347 | rnp = rcu_get_root(rsp); | ||
1348 | if (rnp->qsmask == 0) { | ||
1349 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1350 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
1351 | } | ||
1272 | } | 1352 | } |
1273 | 1353 | ||
1274 | /* | 1354 | /* |
@@ -1383,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1383 | } | 1463 | } |
1384 | 1464 | ||
1385 | /* If there are callbacks ready, invoke them. */ | 1465 | /* If there are callbacks ready, invoke them. */ |
1386 | rcu_do_batch(rsp, rdp); | 1466 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1467 | invoke_rcu_callbacks(rsp, rdp); | ||
1387 | } | 1468 | } |
1388 | 1469 | ||
1389 | /* | 1470 | /* |
@@ -1391,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1391 | */ | 1472 | */ |
1392 | static void rcu_process_callbacks(struct softirq_action *unused) | 1473 | static void rcu_process_callbacks(struct softirq_action *unused) |
1393 | { | 1474 | { |
1394 | /* | ||
1395 | * Memory references from any prior RCU read-side critical sections | ||
1396 | * executed by the interrupted code must be seen before any RCU | ||
1397 | * grace-period manipulations below. | ||
1398 | */ | ||
1399 | smp_mb(); /* See above block comment. */ | ||
1400 | |||
1401 | __rcu_process_callbacks(&rcu_sched_state, | 1475 | __rcu_process_callbacks(&rcu_sched_state, |
1402 | &__get_cpu_var(rcu_sched_data)); | 1476 | &__get_cpu_var(rcu_sched_data)); |
1403 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1477 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); |
1404 | rcu_preempt_process_callbacks(); | 1478 | rcu_preempt_process_callbacks(); |
1405 | 1479 | ||
1406 | /* | ||
1407 | * Memory references from any later RCU read-side critical sections | ||
1408 | * executed by the interrupted code must be seen after any RCU | ||
1409 | * grace-period manipulations above. | ||
1410 | */ | ||
1411 | smp_mb(); /* See above block comment. */ | ||
1412 | |||
1413 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | 1480 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ |
1414 | rcu_needs_cpu_flush(); | 1481 | rcu_needs_cpu_flush(); |
1415 | } | 1482 | } |
1416 | 1483 | ||
1484 | /* | ||
1485 | * Wake up the current CPU's kthread. This replaces raise_softirq() | ||
1486 | * in earlier versions of RCU. Note that because we are running on | ||
1487 | * the current CPU with interrupts disabled, the rcu_cpu_kthread_task | ||
1488 | * cannot disappear out from under us. | ||
1489 | */ | ||
1490 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | ||
1491 | { | ||
1492 | if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) | ||
1493 | return; | ||
1494 | if (likely(!rsp->boost)) { | ||
1495 | rcu_do_batch(rsp, rdp); | ||
1496 | return; | ||
1497 | } | ||
1498 | invoke_rcu_callbacks_kthread(); | ||
1499 | } | ||
1500 | |||
1501 | static void invoke_rcu_core(void) | ||
1502 | { | ||
1503 | raise_softirq(RCU_SOFTIRQ); | ||
1504 | } | ||
1505 | |||
1417 | static void | 1506 | static void |
1418 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 1507 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
1419 | struct rcu_state *rsp) | 1508 | struct rcu_state *rsp) |
@@ -1439,6 +1528,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1439 | /* Add the callback to our list. */ | 1528 | /* Add the callback to our list. */ |
1440 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1529 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
1441 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1530 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1531 | rdp->qlen++; | ||
1532 | |||
1533 | /* If interrupts were disabled, don't dive into RCU core. */ | ||
1534 | if (irqs_disabled_flags(flags)) { | ||
1535 | local_irq_restore(flags); | ||
1536 | return; | ||
1537 | } | ||
1442 | 1538 | ||
1443 | /* | 1539 | /* |
1444 | * Force the grace period if too many callbacks or too long waiting. | 1540 | * Force the grace period if too many callbacks or too long waiting. |
@@ -1447,7 +1543,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1447 | * invoking force_quiescent_state() if the newly enqueued callback | 1543 | * invoking force_quiescent_state() if the newly enqueued callback |
1448 | * is the only one waiting for a grace period to complete. | 1544 | * is the only one waiting for a grace period to complete. |
1449 | */ | 1545 | */ |
1450 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 1546 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
1451 | 1547 | ||
1452 | /* Are we ignoring a completed grace period? */ | 1548 | /* Are we ignoring a completed grace period? */ |
1453 | rcu_process_gp_end(rsp, rdp); | 1549 | rcu_process_gp_end(rsp, rdp); |
@@ -1583,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1583 | * or RCU-bh, force a local reschedule. | 1679 | * or RCU-bh, force a local reschedule. |
1584 | */ | 1680 | */ |
1585 | rdp->n_rp_qs_pending++; | 1681 | rdp->n_rp_qs_pending++; |
1586 | if (!rdp->preemptable && | 1682 | if (!rdp->preemptible && |
1587 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | 1683 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, |
1588 | jiffies)) | 1684 | jiffies)) |
1589 | set_need_resched(); | 1685 | set_need_resched(); |
@@ -1760,7 +1856,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1760 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | 1856 | * that this CPU cannot possibly have any RCU callbacks in flight yet. |
1761 | */ | 1857 | */ |
1762 | static void __cpuinit | 1858 | static void __cpuinit |
1763 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | 1859 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) |
1764 | { | 1860 | { |
1765 | unsigned long flags; | 1861 | unsigned long flags; |
1766 | unsigned long mask; | 1862 | unsigned long mask; |
@@ -1772,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1772 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | 1868 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ |
1773 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | 1869 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ |
1774 | rdp->beenonline = 1; /* We have now been online. */ | 1870 | rdp->beenonline = 1; /* We have now been online. */ |
1775 | rdp->preemptable = preemptable; | 1871 | rdp->preemptible = preemptible; |
1776 | rdp->qlen_last_fqs_check = 0; | 1872 | rdp->qlen_last_fqs_check = 0; |
1777 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1873 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1778 | rdp->blimit = blimit; | 1874 | rdp->blimit = blimit; |
@@ -1806,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1806 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1902 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
1807 | } | 1903 | } |
1808 | 1904 | ||
1809 | static void __cpuinit rcu_online_cpu(int cpu) | 1905 | static void __cpuinit rcu_prepare_cpu(int cpu) |
1810 | { | 1906 | { |
1811 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); | 1907 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); |
1812 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); | 1908 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); |
@@ -1820,11 +1916,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1820 | unsigned long action, void *hcpu) | 1916 | unsigned long action, void *hcpu) |
1821 | { | 1917 | { |
1822 | long cpu = (long)hcpu; | 1918 | long cpu = (long)hcpu; |
1919 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
1920 | struct rcu_node *rnp = rdp->mynode; | ||
1823 | 1921 | ||
1824 | switch (action) { | 1922 | switch (action) { |
1825 | case CPU_UP_PREPARE: | 1923 | case CPU_UP_PREPARE: |
1826 | case CPU_UP_PREPARE_FROZEN: | 1924 | case CPU_UP_PREPARE_FROZEN: |
1827 | rcu_online_cpu(cpu); | 1925 | rcu_prepare_cpu(cpu); |
1926 | rcu_prepare_kthreads(cpu); | ||
1927 | break; | ||
1928 | case CPU_ONLINE: | ||
1929 | case CPU_DOWN_FAILED: | ||
1930 | rcu_node_kthread_setaffinity(rnp, -1); | ||
1931 | rcu_cpu_kthread_setrt(cpu, 1); | ||
1932 | break; | ||
1933 | case CPU_DOWN_PREPARE: | ||
1934 | rcu_node_kthread_setaffinity(rnp, cpu); | ||
1935 | rcu_cpu_kthread_setrt(cpu, 0); | ||
1828 | break; | 1936 | break; |
1829 | case CPU_DYING: | 1937 | case CPU_DYING: |
1830 | case CPU_DYING_FROZEN: | 1938 | case CPU_DYING_FROZEN: |
@@ -1943,10 +2051,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
1943 | j / rsp->levelspread[i - 1]; | 2051 | j / rsp->levelspread[i - 1]; |
1944 | } | 2052 | } |
1945 | rnp->level = i; | 2053 | rnp->level = i; |
1946 | INIT_LIST_HEAD(&rnp->blocked_tasks[0]); | 2054 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
1947 | INIT_LIST_HEAD(&rnp->blocked_tasks[1]); | ||
1948 | INIT_LIST_HEAD(&rnp->blocked_tasks[2]); | ||
1949 | INIT_LIST_HEAD(&rnp->blocked_tasks[3]); | ||
1950 | } | 2055 | } |
1951 | } | 2056 | } |
1952 | 2057 | ||
@@ -1968,7 +2073,7 @@ void __init rcu_init(void) | |||
1968 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 2073 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
1969 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 2074 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
1970 | __rcu_init_preempt(); | 2075 | __rcu_init_preempt(); |
1971 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 2076 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
1972 | 2077 | ||
1973 | /* | 2078 | /* |
1974 | * We don't need protection against CPU-hotplug here because | 2079 | * We don't need protection against CPU-hotplug here because |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e8f057e44e3e..01b2ccda26fb 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -84,13 +84,19 @@ | |||
84 | * Dynticks per-CPU state. | 84 | * Dynticks per-CPU state. |
85 | */ | 85 | */ |
86 | struct rcu_dynticks { | 86 | struct rcu_dynticks { |
87 | int dynticks_nesting; /* Track nesting level, sort of. */ | 87 | int dynticks_nesting; /* Track irq/process nesting level. */ |
88 | int dynticks; /* Even value for dynticks-idle, else odd. */ | 88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
89 | int dynticks_nmi; /* Even value for either dynticks-idle or */ | 89 | atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ |
90 | /* not in nmi handler, else odd. So this */ | ||
91 | /* remains even for nmi from irq handler. */ | ||
92 | }; | 90 | }; |
93 | 91 | ||
92 | /* RCU's kthread states for tracing. */ | ||
93 | #define RCU_KTHREAD_STOPPED 0 | ||
94 | #define RCU_KTHREAD_RUNNING 1 | ||
95 | #define RCU_KTHREAD_WAITING 2 | ||
96 | #define RCU_KTHREAD_OFFCPU 3 | ||
97 | #define RCU_KTHREAD_YIELDING 4 | ||
98 | #define RCU_KTHREAD_MAX 4 | ||
99 | |||
94 | /* | 100 | /* |
95 | * Definition for node within the RCU grace-period-detection hierarchy. | 101 | * Definition for node within the RCU grace-period-detection hierarchy. |
96 | */ | 102 | */ |
@@ -109,10 +115,13 @@ struct rcu_node { | |||
109 | /* an rcu_data structure, otherwise, each */ | 115 | /* an rcu_data structure, otherwise, each */ |
110 | /* bit corresponds to a child rcu_node */ | 116 | /* bit corresponds to a child rcu_node */ |
111 | /* structure. */ | 117 | /* structure. */ |
112 | unsigned long expmask; /* Groups that have ->blocked_tasks[] */ | 118 | unsigned long expmask; /* Groups that have ->blkd_tasks */ |
113 | /* elements that need to drain to allow the */ | 119 | /* elements that need to drain to allow the */ |
114 | /* current expedited grace period to */ | 120 | /* current expedited grace period to */ |
115 | /* complete (only for TREE_PREEMPT_RCU). */ | 121 | /* complete (only for TREE_PREEMPT_RCU). */ |
122 | atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ | ||
123 | /* Since this has meaning only for leaf */ | ||
124 | /* rcu_node structures, 32 bits suffices. */ | ||
116 | unsigned long qsmaskinit; | 125 | unsigned long qsmaskinit; |
117 | /* Per-GP initial value for qsmask & expmask. */ | 126 | /* Per-GP initial value for qsmask & expmask. */ |
118 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 127 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
@@ -122,11 +131,62 @@ struct rcu_node { | |||
122 | u8 grpnum; /* CPU/group number for next level up. */ | 131 | u8 grpnum; /* CPU/group number for next level up. */ |
123 | u8 level; /* root is at level 0. */ | 132 | u8 level; /* root is at level 0. */ |
124 | struct rcu_node *parent; | 133 | struct rcu_node *parent; |
125 | struct list_head blocked_tasks[4]; | 134 | struct list_head blkd_tasks; |
126 | /* Tasks blocked in RCU read-side critsect. */ | 135 | /* Tasks blocked in RCU read-side critical */ |
127 | /* Grace period number (->gpnum) x blocked */ | 136 | /* section. Tasks are placed at the head */ |
128 | /* by tasks on the (x & 0x1) element of the */ | 137 | /* of this list and age towards the tail. */ |
129 | /* blocked_tasks[] array. */ | 138 | struct list_head *gp_tasks; |
139 | /* Pointer to the first task blocking the */ | ||
140 | /* current grace period, or NULL if there */ | ||
141 | /* is no such task. */ | ||
142 | struct list_head *exp_tasks; | ||
143 | /* Pointer to the first task blocking the */ | ||
144 | /* current expedited grace period, or NULL */ | ||
145 | /* if there is no such task. If there */ | ||
146 | /* is no current expedited grace period, */ | ||
147 | /* then there cannot be any such task. */ | ||
148 | #ifdef CONFIG_RCU_BOOST | ||
149 | struct list_head *boost_tasks; | ||
150 | /* Pointer to first task that needs to be */ | ||
151 | /* priority boosted, or NULL if no priority */ | ||
152 | /* boosting is needed for this rcu_node */ | ||
153 | /* structure. If there are no tasks */ | ||
154 | /* queued on this rcu_node structure that */ | ||
155 | /* are blocking the current grace period, */ | ||
156 | /* there can be no such task. */ | ||
157 | unsigned long boost_time; | ||
158 | /* When to start boosting (jiffies). */ | ||
159 | struct task_struct *boost_kthread_task; | ||
160 | /* kthread that takes care of priority */ | ||
161 | /* boosting for this rcu_node structure. */ | ||
162 | unsigned int boost_kthread_status; | ||
163 | /* State of boost_kthread_task for tracing. */ | ||
164 | unsigned long n_tasks_boosted; | ||
165 | /* Total number of tasks boosted. */ | ||
166 | unsigned long n_exp_boosts; | ||
167 | /* Number of tasks boosted for expedited GP. */ | ||
168 | unsigned long n_normal_boosts; | ||
169 | /* Number of tasks boosted for normal GP. */ | ||
170 | unsigned long n_balk_blkd_tasks; | ||
171 | /* Refused to boost: no blocked tasks. */ | ||
172 | unsigned long n_balk_exp_gp_tasks; | ||
173 | /* Refused to boost: nothing blocking GP. */ | ||
174 | unsigned long n_balk_boost_tasks; | ||
175 | /* Refused to boost: already boosting. */ | ||
176 | unsigned long n_balk_notblocked; | ||
177 | /* Refused to boost: RCU RS CS still running. */ | ||
178 | unsigned long n_balk_notyet; | ||
179 | /* Refused to boost: not yet time. */ | ||
180 | unsigned long n_balk_nos; | ||
181 | /* Refused to boost: not sure why, though. */ | ||
182 | /* This can happen due to race conditions. */ | ||
183 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
184 | struct task_struct *node_kthread_task; | ||
185 | /* kthread that takes care of this rcu_node */ | ||
186 | /* structure, for example, awakening the */ | ||
187 | /* per-CPU kthreads as needed. */ | ||
188 | unsigned int node_kthread_status; | ||
189 | /* State of node_kthread_task for tracing. */ | ||
130 | } ____cacheline_internodealigned_in_smp; | 190 | } ____cacheline_internodealigned_in_smp; |
131 | 191 | ||
132 | /* | 192 | /* |
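The four ->blocked_tasks[] lists collapse into the single ->blkd_tasks list above, with ->gp_tasks, ->exp_tasks and (under CONFIG_RCU_BOOST) ->boost_tasks marking where the segment relevant to each kind of grace period begins. A hedged sketch of the layout plus an illustrative walker (the function is made up; it assumes rnp->lock is held, as the stall-printing code in rcutree_plugin.h does):

/*
 * blkd_tasks (head) -> T9 -> T7 -> T5 -> T3 -> T1 -> (back to head)
 *                             ^           ^
 *                             gp_tasks    exp_tasks
 *
 * New blockers are list_add()ed at the head; everything from
 * ->gp_tasks through the tail blocks the current grace period, and
 * everything from ->exp_tasks through the tail blocks the current
 * expedited grace period.  A NULL pointer means no blockers of that kind.
 */
static void walk_current_gp_blockers(struct rcu_node *rnp)
{
	struct list_head *lhp;

	for (lhp = rnp->gp_tasks;
	     lhp != NULL && lhp != &rnp->blkd_tasks;
	     lhp = lhp->next) {
		struct task_struct *t;

		t = list_entry(lhp, struct task_struct, rcu_node_entry);
		sched_show_task(t);	/* or any other per-task action */
	}
}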
@@ -175,7 +235,7 @@ struct rcu_data { | |||
175 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 235 | bool passed_quiesc; /* User-mode/idle loop etc. */ |
176 | bool qs_pending; /* Core waits for quiesc state. */ | 236 | bool qs_pending; /* Core waits for quiesc state. */ |
177 | bool beenonline; /* CPU online at least once. */ | 237 | bool beenonline; /* CPU online at least once. */ |
178 | bool preemptable; /* Preemptable RCU? */ | 238 | bool preemptible; /* Preemptible RCU? */ |
179 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 239 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
180 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 240 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
181 | 241 | ||
@@ -218,7 +278,6 @@ struct rcu_data { | |||
218 | /* 3) dynticks interface. */ | 278 | /* 3) dynticks interface. */ |
219 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ | 279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ |
220 | int dynticks_snap; /* Per-GP tracking for dynticks. */ | 280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ |
221 | int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ | ||
222 | #endif /* #ifdef CONFIG_NO_HZ */ | 281 | #endif /* #ifdef CONFIG_NO_HZ */ |
223 | 282 | ||
224 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 283 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
@@ -254,7 +313,6 @@ struct rcu_data { | |||
254 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 313 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
255 | 314 | ||
256 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 315 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
257 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
258 | 316 | ||
259 | #ifdef CONFIG_PROVE_RCU | 317 | #ifdef CONFIG_PROVE_RCU |
260 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | 318 | #define RCU_STALL_DELAY_DELTA (5 * HZ) |
@@ -272,13 +330,16 @@ struct rcu_data { | |||
272 | /* scheduling clock irq */ | 330 | /* scheduling clock irq */ |
273 | /* before ratting on them. */ | 331 | /* before ratting on them. */ |
274 | 332 | ||
275 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE | 333 | #define rcu_wait(cond) \ |
276 | #define RCU_CPU_STALL_SUPPRESS_INIT 0 | 334 | do { \ |
277 | #else | 335 | for (;;) { \ |
278 | #define RCU_CPU_STALL_SUPPRESS_INIT 1 | 336 | set_current_state(TASK_INTERRUPTIBLE); \ |
279 | #endif | 337 | if (cond) \ |
280 | 338 | break; \ | |
281 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 339 | schedule(); \ |
340 | } \ | ||
341 | __set_current_state(TASK_RUNNING); \ | ||
342 | } while (0) | ||
282 | 343 | ||
283 | /* | 344 | /* |
284 | * RCU global state, including node hierarchy. This hierarchy is | 345 | * RCU global state, including node hierarchy. This hierarchy is |
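The rcu_wait() macro above packages the usual kthread wait loop (set state, test the condition, schedule) for the per-CPU and per-node kthreads added elsewhere in this series; the waker sets the condition and then calls wake_up_process() on the thread. A hedged usage sketch; the thread function and its have_work flag are made up for illustration and assume <linux/kthread.h>:

static int example_rcu_kthread(void *arg)
{
	int *have_work = arg;		/* illustrative work flag */

	for (;;) {
		/* Sleep until the waker sets *have_work and wakes us. */
		rcu_wait(*have_work != 0 || kthread_should_stop());
		if (kthread_should_stop())
			break;
		*have_work = 0;
		/* ... invoke callbacks, boost blocked readers, etc. ... */
	}
	return 0;
}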
@@ -308,6 +369,7 @@ struct rcu_state { | |||
308 | /* period because */ | 369 | /* period because */ |
309 | /* force_quiescent_state() */ | 370 | /* force_quiescent_state() */ |
310 | /* was running. */ | 371 | /* was running. */ |
372 | u8 boost; /* Subject to priority boost. */ | ||
311 | unsigned long gpnum; /* Current gp number. */ | 373 | unsigned long gpnum; /* Current gp number. */ |
312 | unsigned long completed; /* # of last completed gp. */ | 374 | unsigned long completed; /* # of last completed gp. */ |
313 | 375 | ||
@@ -325,12 +387,12 @@ struct rcu_state { | |||
325 | /* due to lock unavailable. */ | 387 | /* due to lock unavailable. */ |
326 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ | 388 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ |
327 | /* due to no GP active. */ | 389 | /* due to no GP active. */ |
328 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
329 | unsigned long gp_start; /* Time at which GP started, */ | 390 | unsigned long gp_start; /* Time at which GP started, */ |
330 | /* but in jiffies. */ | 391 | /* but in jiffies. */ |
331 | unsigned long jiffies_stall; /* Time at which to check */ | 392 | unsigned long jiffies_stall; /* Time at which to check */ |
332 | /* for CPU stalls. */ | 393 | /* for CPU stalls. */ |
333 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 394 | unsigned long gp_max; /* Maximum GP duration in */ |
395 | /* jiffies. */ | ||
334 | char *name; /* Name of structure. */ | 396 | char *name; /* Name of structure. */ |
335 | }; | 397 | }; |
336 | 398 | ||
@@ -361,16 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | |||
361 | static void rcu_bootup_announce(void); | 423 | static void rcu_bootup_announce(void); |
362 | long rcu_batches_completed(void); | 424 | long rcu_batches_completed(void); |
363 | static void rcu_preempt_note_context_switch(int cpu); | 425 | static void rcu_preempt_note_context_switch(int cpu); |
364 | static int rcu_preempted_readers(struct rcu_node *rnp); | 426 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
365 | #ifdef CONFIG_HOTPLUG_CPU | 427 | #ifdef CONFIG_HOTPLUG_CPU |
366 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 428 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
367 | unsigned long flags); | 429 | unsigned long flags); |
430 | static void rcu_stop_cpu_kthread(int cpu); | ||
368 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 431 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
369 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
370 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 432 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
371 | static void rcu_print_task_stall(struct rcu_node *rnp); | 433 | static void rcu_print_task_stall(struct rcu_node *rnp); |
372 | static void rcu_preempt_stall_reset(void); | 434 | static void rcu_preempt_stall_reset(void); |
373 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
374 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 435 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
375 | #ifdef CONFIG_HOTPLUG_CPU | 436 | #ifdef CONFIG_HOTPLUG_CPU |
376 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | 437 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, |
@@ -390,5 +451,20 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | |||
390 | static void rcu_preempt_send_cbs_to_online(void); | 451 | static void rcu_preempt_send_cbs_to_online(void); |
391 | static void __init __rcu_init_preempt(void); | 452 | static void __init __rcu_init_preempt(void); |
392 | static void rcu_needs_cpu_flush(void); | 453 | static void rcu_needs_cpu_flush(void); |
454 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | ||
455 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | ||
456 | static void invoke_rcu_callbacks_kthread(void); | ||
457 | #ifdef CONFIG_RCU_BOOST | ||
458 | static void rcu_preempt_do_callbacks(void); | ||
459 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
460 | cpumask_var_t cm); | ||
461 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
462 | struct rcu_node *rnp, | ||
463 | int rnp_index); | ||
464 | static void invoke_rcu_node_kthread(struct rcu_node *rnp); | ||
465 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | ||
466 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
467 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | ||
468 | static void __cpuinit rcu_prepare_kthreads(int cpu); | ||
393 | 469 | ||
394 | #endif /* #ifndef RCU_TREE_NONCORE */ | 470 | #endif /* #ifndef RCU_TREE_NONCORE */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a3638710dc67..75113cb7c4fb 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) |
3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
54 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 54 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE |
55 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); | 55 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); |
56 | #endif | 56 | #endif |
57 | #ifndef CONFIG_RCU_CPU_STALL_DETECTOR | ||
58 | printk(KERN_INFO | ||
59 | "\tRCU-based detection of stalled CPUs is disabled.\n"); | ||
60 | #endif | ||
61 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | 57 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
62 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 58 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); |
63 | #endif | 59 | #endif |
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
70 | 66 | ||
71 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); | 67 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); |
72 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 68 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
69 | static struct rcu_state *rcu_state = &rcu_preempt_state; | ||
73 | 70 | ||
74 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 71 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
75 | 72 | ||
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); | |||
78 | */ | 75 | */ |
79 | static void __init rcu_bootup_announce(void) | 76 | static void __init rcu_bootup_announce(void) |
80 | { | 77 | { |
81 | printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); | 78 | printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); |
82 | rcu_bootup_announce_oddness(); | 79 | rcu_bootup_announce_oddness(); |
83 | } | 80 | } |
84 | 81 | ||
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void) | |||
111 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 108 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
112 | 109 | ||
113 | /* | 110 | /* |
114 | * Record a preemptable-RCU quiescent state for the specified CPU. Note | 111 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
115 | * that this just means that the task currently running on the CPU is | 112 | * that this just means that the task currently running on the CPU is |
116 | * not in a quiescent state. There might be any number of tasks blocked | 113 | * not in a quiescent state. There might be any number of tasks blocked |
117 | * while in an RCU read-side critical section. | 114 | * while in an RCU read-side critical section. |
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu) | |||
134 | * We have entered the scheduler, and the current task might soon be | 131 | * We have entered the scheduler, and the current task might soon be |
135 | * context-switched away from. If this task is in an RCU read-side | 132 | * context-switched away from. If this task is in an RCU read-side |
136 | * critical section, we will no longer be able to rely on the CPU to | 133 | * critical section, we will no longer be able to rely on the CPU to |
137 | * record that fact, so we enqueue the task on the appropriate entry | 134 | * record that fact, so we enqueue the task on the blkd_tasks list. |
138 | * of the blocked_tasks[] array. The task will dequeue itself when | 135 | * The task will dequeue itself when it exits the outermost enclosing |
139 | * it exits the outermost enclosing RCU read-side critical section. | 136 | * RCU read-side critical section. Therefore, the current grace period |
140 | * Therefore, the current grace period cannot be permitted to complete | 137 | * cannot be permitted to complete until the blkd_tasks list entries |
141 | * until the blocked_tasks[] entry indexed by the low-order bit of | 138 | * predating the current grace period drain, in other words, until |
142 | * rnp->gpnum empties. | 139 | * rnp->gp_tasks becomes NULL. |
143 | * | 140 | * |
144 | * Caller must disable preemption. | 141 | * Caller must disable preemption. |
145 | */ | 142 | */ |
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
147 | { | 144 | { |
148 | struct task_struct *t = current; | 145 | struct task_struct *t = current; |
149 | unsigned long flags; | 146 | unsigned long flags; |
150 | int phase; | ||
151 | struct rcu_data *rdp; | 147 | struct rcu_data *rdp; |
152 | struct rcu_node *rnp; | 148 | struct rcu_node *rnp; |
153 | 149 | ||
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
169 | * (i.e., this CPU has not yet passed through a quiescent | 165 | * (i.e., this CPU has not yet passed through a quiescent |
170 | * state for the current grace period), then as long | 166 | * state for the current grace period), then as long |
171 | * as that task remains queued, the current grace period | 167 | * as that task remains queued, the current grace period |
172 | * cannot end. | 168 | * cannot end. Note that there is some uncertainty as |
169 | * to exactly when the current grace period started. | ||
170 | * We take a conservative approach, which can result | ||
171 | * in unnecessarily waiting on tasks that started very | ||
172 | * slightly after the current grace period began. C'est | ||
173 | * la vie!!! | ||
173 | * | 174 | * |
174 | * But first, note that the current CPU must still be | 175 | * But first, note that the current CPU must still be |
175 | * on line! | 176 | * on line! |
176 | */ | 177 | */ |
177 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | 178 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); |
178 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 179 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
179 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | 180 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { |
180 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); | 181 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); |
182 | rnp->gp_tasks = &t->rcu_node_entry; | ||
183 | #ifdef CONFIG_RCU_BOOST | ||
184 | if (rnp->boost_tasks != NULL) | ||
185 | rnp->boost_tasks = rnp->gp_tasks; | ||
186 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
187 | } else { | ||
188 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
189 | if (rnp->qsmask & rdp->grpmask) | ||
190 | rnp->gp_tasks = &t->rcu_node_entry; | ||
191 | } | ||
181 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 192 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
182 | } | 193 | } |
183 | 194 | ||
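The if/else in the hunk above decides whether the newly blocked task is charged to the current grace period: if this CPU still owes a quiescent state and ->gp_tasks is non-NULL, the task is linked just ahead of ->gp_tasks and becomes the new ->gp_tasks; otherwise it is added at the head of ->blkd_tasks, and ->gp_tasks is pointed at it only when the CPU still owes a quiescent state. A worked example with made-up task names:

/*
 * Suppose ->blkd_tasks holds, head to tail, TB -> TA and
 * ->gp_tasks == TA (only TA blocks the current grace period).
 *
 * Case 1: TC blocks while this CPU still owes a quiescent state and
 *         ->gp_tasks != NULL.  TC is inserted just before TA and
 *         ->gp_tasks moves to TC:
 *             head -> TB -> TC -> TA,  gp_tasks == TC
 *         so TC and TA both hold up the current grace period.
 *
 * Case 2: TC blocks after this CPU has already reported its
 *         quiescent state.  TC is added at the head:
 *             head -> TC -> TB -> TA,  gp_tasks == TA
 *         and TC only matters to grace periods that start later.
 */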
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
196 | } | 207 | } |
197 | 208 | ||
198 | /* | 209 | /* |
199 | * Tree-preemptable RCU implementation for rcu_read_lock(). | 210 | * Tree-preemptible RCU implementation for rcu_read_lock(). |
200 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | 211 | * Just increment ->rcu_read_lock_nesting, shared state will be updated |
201 | * if we block. | 212 | * if we block. |
202 | */ | 213 | */ |
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); | |||
212 | * for the specified rcu_node structure. If the caller needs a reliable | 223 | * for the specified rcu_node structure. If the caller needs a reliable |
213 | * answer, it must hold the rcu_node's ->lock. | 224 | * answer, it must hold the rcu_node's ->lock. |
214 | */ | 225 | */ |
215 | static int rcu_preempted_readers(struct rcu_node *rnp) | 226 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
216 | { | 227 | { |
217 | int phase = rnp->gpnum & 0x1; | 228 | return rnp->gp_tasks != NULL; |
218 | |||
219 | return !list_empty(&rnp->blocked_tasks[phase]) || | ||
220 | !list_empty(&rnp->blocked_tasks[phase + 2]); | ||
221 | } | 229 | } |
222 | 230 | ||
223 | /* | 231 | /* |
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
233 | unsigned long mask; | 241 | unsigned long mask; |
234 | struct rcu_node *rnp_p; | 242 | struct rcu_node *rnp_p; |
235 | 243 | ||
236 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 244 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
237 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 245 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
238 | return; /* Still need more quiescent states! */ | 246 | return; /* Still need more quiescent states! */ |
239 | } | 247 | } |
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
257 | } | 265 | } |
258 | 266 | ||
259 | /* | 267 | /* |
268 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | ||
269 | * returning NULL if at the end of the list. | ||
270 | */ | ||
271 | static struct list_head *rcu_next_node_entry(struct task_struct *t, | ||
272 | struct rcu_node *rnp) | ||
273 | { | ||
274 | struct list_head *np; | ||
275 | |||
276 | np = t->rcu_node_entry.next; | ||
277 | if (np == &rnp->blkd_tasks) | ||
278 | np = NULL; | ||
279 | return np; | ||
280 | } | ||
281 | |||
282 | /* | ||
260 | * Handle special cases during rcu_read_unlock(), such as needing to | 283 | * Handle special cases during rcu_read_unlock(), such as needing to |
261 | * notify RCU core processing or task having blocked during the RCU | 284 | * notify RCU core processing or task having blocked during the RCU |
262 | * read-side critical section. | 285 | * read-side critical section. |
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
266 | int empty; | 289 | int empty; |
267 | int empty_exp; | 290 | int empty_exp; |
268 | unsigned long flags; | 291 | unsigned long flags; |
292 | struct list_head *np; | ||
269 | struct rcu_node *rnp; | 293 | struct rcu_node *rnp; |
270 | int special; | 294 | int special; |
271 | 295 | ||
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
306 | break; | 330 | break; |
307 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 331 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
308 | } | 332 | } |
309 | empty = !rcu_preempted_readers(rnp); | 333 | empty = !rcu_preempt_blocked_readers_cgp(rnp); |
310 | empty_exp = !rcu_preempted_readers_exp(rnp); | 334 | empty_exp = !rcu_preempted_readers_exp(rnp); |
311 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 335 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
336 | np = rcu_next_node_entry(t, rnp); | ||
312 | list_del_init(&t->rcu_node_entry); | 337 | list_del_init(&t->rcu_node_entry); |
338 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
339 | rnp->gp_tasks = np; | ||
340 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
341 | rnp->exp_tasks = np; | ||
342 | #ifdef CONFIG_RCU_BOOST | ||
343 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
344 | rnp->boost_tasks = np; | ||
345 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
313 | t->rcu_blocked_node = NULL; | 346 | t->rcu_blocked_node = NULL; |
314 | 347 | ||
315 | /* | 348 | /* |
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
322 | else | 355 | else |
323 | rcu_report_unblock_qs_rnp(rnp, flags); | 356 | rcu_report_unblock_qs_rnp(rnp, flags); |
324 | 357 | ||
358 | #ifdef CONFIG_RCU_BOOST | ||
359 | /* Unboost if we were boosted. */ | ||
360 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
361 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | ||
362 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
363 | t->rcu_boost_mutex = NULL; | ||
364 | } | ||
365 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
366 | |||
325 | /* | 367 | /* |
326 | * If this was the last task on the expedited lists, | 368 | * If this was the last task on the expedited lists, |
327 | * then we need to report up the rcu_node hierarchy. | 369 | * then we need to report up the rcu_node hierarchy. |
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
334 | } | 376 | } |
335 | 377 | ||
336 | /* | 378 | /* |
337 | * Tree-preemptable RCU implementation for rcu_read_unlock(). | 379 | * Tree-preemptible RCU implementation for rcu_read_unlock(). |
338 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | 380 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost |
339 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | 381 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then |
340 | * invoke rcu_read_unlock_special() to clean up after a context switch | 382 | * invoke rcu_read_unlock_special() to clean up after a context switch |
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void) | |||
356 | } | 398 | } |
357 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 399 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
358 | 400 | ||
359 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
360 | |||
361 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | 401 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE |
362 | 402 | ||
363 | /* | 403 | /* |
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); | |||
367 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | 407 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) |
368 | { | 408 | { |
369 | unsigned long flags; | 409 | unsigned long flags; |
370 | struct list_head *lp; | ||
371 | int phase; | ||
372 | struct task_struct *t; | 410 | struct task_struct *t; |
373 | 411 | ||
374 | if (rcu_preempted_readers(rnp)) { | 412 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
375 | raw_spin_lock_irqsave(&rnp->lock, flags); | 413 | return; |
376 | phase = rnp->gpnum & 0x1; | 414 | raw_spin_lock_irqsave(&rnp->lock, flags); |
377 | lp = &rnp->blocked_tasks[phase]; | 415 | t = list_entry(rnp->gp_tasks, |
378 | list_for_each_entry(t, lp, rcu_node_entry) | 416 | struct task_struct, rcu_node_entry); |
379 | sched_show_task(t); | 417 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
380 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 418 | sched_show_task(t); |
381 | } | 419 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
382 | } | 420 | } |
383 | 421 | ||
384 | /* | 422 | /* |
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
408 | */ | 446 | */ |
409 | static void rcu_print_task_stall(struct rcu_node *rnp) | 447 | static void rcu_print_task_stall(struct rcu_node *rnp) |
410 | { | 448 | { |
411 | struct list_head *lp; | ||
412 | int phase; | ||
413 | struct task_struct *t; | 449 | struct task_struct *t; |
414 | 450 | ||
415 | if (rcu_preempted_readers(rnp)) { | 451 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
416 | phase = rnp->gpnum & 0x1; | 452 | return; |
417 | lp = &rnp->blocked_tasks[phase]; | 453 | t = list_entry(rnp->gp_tasks, |
418 | list_for_each_entry(t, lp, rcu_node_entry) | 454 | struct task_struct, rcu_node_entry); |
419 | printk(" P%d", t->pid); | 455 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
420 | } | 456 | printk(" P%d", t->pid); |
421 | } | 457 | } |
422 | 458 | ||
423 | /* | 459 | /* |
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void) | |||
430 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | 466 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; |
431 | } | 467 | } |
432 | 468 | ||
433 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
434 | |||
435 | /* | 469 | /* |
436 | * Check that the list of blocked tasks for the newly completed grace | 470 | * Check that the list of blocked tasks for the newly completed grace |
437 | * period is in fact empty. It is a serious bug to complete a grace | 471 | * period is in fact empty. It is a serious bug to complete a grace |
438 | * period that still has RCU readers blocked! This function must be | 472 | * period that still has RCU readers blocked! This function must be |
439 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock | 473 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock |
440 | * must be held by the caller. | 474 | * must be held by the caller. |
475 | * | ||
476 | * Also, if there are blocked tasks on the list, they automatically | ||
477 | * block the newly created grace period, so set up ->gp_tasks accordingly. | ||
441 | */ | 478 | */ |
442 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | 479 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
443 | { | 480 | { |
444 | WARN_ON_ONCE(rcu_preempted_readers(rnp)); | 481 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
482 | if (!list_empty(&rnp->blkd_tasks)) | ||
483 | rnp->gp_tasks = rnp->blkd_tasks.next; | ||
445 | WARN_ON_ONCE(rnp->qsmask); | 484 | WARN_ON_ONCE(rnp->qsmask); |
446 | } | 485 | } |
447 | 486 | ||
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
465 | struct rcu_node *rnp, | 504 | struct rcu_node *rnp, |
466 | struct rcu_data *rdp) | 505 | struct rcu_data *rdp) |
467 | { | 506 | { |
468 | int i; | ||
469 | struct list_head *lp; | 507 | struct list_head *lp; |
470 | struct list_head *lp_root; | 508 | struct list_head *lp_root; |
471 | int retval = 0; | 509 | int retval = 0; |
472 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 510 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
473 | struct task_struct *tp; | 511 | struct task_struct *t; |
474 | 512 | ||
475 | if (rnp == rnp_root) { | 513 | if (rnp == rnp_root) { |
476 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | 514 | WARN_ONCE(1, "Last CPU thought to be offlined?"); |
477 | return 0; /* Shouldn't happen: at least one CPU online. */ | 515 | return 0; /* Shouldn't happen: at least one CPU online. */ |
478 | } | 516 | } |
479 | WARN_ON_ONCE(rnp != rdp->mynode && | 517 | |
480 | (!list_empty(&rnp->blocked_tasks[0]) || | 518 | /* If we are on an internal node, complain bitterly. */ |
481 | !list_empty(&rnp->blocked_tasks[1]) || | 519 | WARN_ON_ONCE(rnp != rdp->mynode); |
482 | !list_empty(&rnp->blocked_tasks[2]) || | ||
483 | !list_empty(&rnp->blocked_tasks[3]))); | ||
484 | 520 | ||
485 | /* | 521 | /* |
486 | * Move tasks up to root rcu_node. Rely on the fact that the | 522 | * Move tasks up to root rcu_node. Don't try to get fancy for |
487 | * root rcu_node can be at most one ahead of the rest of the | 523 | * this corner-case operation -- just put this node's tasks |
488 | * rcu_nodes in terms of gp_num value. This fact allows us to | 524 | * at the head of the root node's list, and update the root node's |
489 | * move the blocked_tasks[] array directly, element by element. | 525 | * ->gp_tasks and ->exp_tasks pointers to those of this node's, |
526 | * if non-NULL. This might result in waiting for more tasks than | ||
527 | * absolutely necessary, but this is a good performance/complexity | ||
528 | * tradeoff. | ||
490 | */ | 529 | */ |
491 | if (rcu_preempted_readers(rnp)) | 530 | if (rcu_preempt_blocked_readers_cgp(rnp)) |
492 | retval |= RCU_OFL_TASKS_NORM_GP; | 531 | retval |= RCU_OFL_TASKS_NORM_GP; |
493 | if (rcu_preempted_readers_exp(rnp)) | 532 | if (rcu_preempted_readers_exp(rnp)) |
494 | retval |= RCU_OFL_TASKS_EXP_GP; | 533 | retval |= RCU_OFL_TASKS_EXP_GP; |
495 | for (i = 0; i < 4; i++) { | 534 | lp = &rnp->blkd_tasks; |
496 | lp = &rnp->blocked_tasks[i]; | 535 | lp_root = &rnp_root->blkd_tasks; |
497 | lp_root = &rnp_root->blocked_tasks[i]; | 536 | while (!list_empty(lp)) { |
498 | while (!list_empty(lp)) { | 537 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); |
499 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); | 538 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
500 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 539 | list_del(&t->rcu_node_entry); |
501 | list_del(&tp->rcu_node_entry); | 540 | t->rcu_blocked_node = rnp_root; |
502 | tp->rcu_blocked_node = rnp_root; | 541 | list_add(&t->rcu_node_entry, lp_root); |
503 | list_add(&tp->rcu_node_entry, lp_root); | 542 | if (&t->rcu_node_entry == rnp->gp_tasks) |
504 | raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 543 | rnp_root->gp_tasks = rnp->gp_tasks; |
505 | } | 544 | if (&t->rcu_node_entry == rnp->exp_tasks) |
545 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
546 | #ifdef CONFIG_RCU_BOOST | ||
547 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
548 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
549 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
550 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
506 | } | 551 | } |
552 | |||
553 | #ifdef CONFIG_RCU_BOOST | ||
554 | /* In case root is being boosted and leaf is not. */ | ||
555 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
556 | if (rnp_root->boost_tasks != NULL && | ||
557 | rnp_root->boost_tasks != rnp_root->gp_tasks) | ||
558 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
559 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
560 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
561 | |||
562 | rnp->gp_tasks = NULL; | ||
563 | rnp->exp_tasks = NULL; | ||
507 | return retval; | 564 | return retval; |
508 | } | 565 | } |
509 | 566 | ||
510 | /* | 567 | /* |
511 | * Do CPU-offline processing for preemptable RCU. | 568 | * Do CPU-offline processing for preemptible RCU. |
512 | */ | 569 | */ |
513 | static void rcu_preempt_offline_cpu(int cpu) | 570 | static void rcu_preempt_offline_cpu(int cpu) |
514 | { | 571 | { |
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
537 | } | 594 | } |
538 | 595 | ||
539 | /* | 596 | /* |
540 | * Process callbacks for preemptable RCU. | 597 | * Process callbacks for preemptible RCU. |
541 | */ | 598 | */ |
542 | static void rcu_preempt_process_callbacks(void) | 599 | static void rcu_preempt_process_callbacks(void) |
543 | { | 600 | { |
@@ -545,8 +602,17 @@ static void rcu_preempt_process_callbacks(void) | |||
545 | &__get_cpu_var(rcu_preempt_data)); | 602 | &__get_cpu_var(rcu_preempt_data)); |
546 | } | 603 | } |
547 | 604 | ||
605 | #ifdef CONFIG_RCU_BOOST | ||
606 | |||
607 | static void rcu_preempt_do_callbacks(void) | ||
608 | { | ||
609 | rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); | ||
610 | } | ||
611 | |||
612 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
613 | |||
548 | /* | 614 | /* |
549 | * Queue a preemptable-RCU callback for invocation after a grace period. | 615 | * Queue a preemptible-RCU callback for invocation after a grace period. |
550 | */ | 616 | */ |
551 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 617 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
552 | { | 618 | { |
@@ -594,8 +660,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | |||
594 | */ | 660 | */ |
595 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) | 661 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) |
596 | { | 662 | { |
597 | return !list_empty(&rnp->blocked_tasks[2]) || | 663 | return rnp->exp_tasks != NULL; |
598 | !list_empty(&rnp->blocked_tasks[3]); | ||
599 | } | 664 | } |
600 | 665 | ||
601 | /* | 666 | /* |
@@ -655,13 +720,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
655 | static void | 720 | static void |
656 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 721 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) |
657 | { | 722 | { |
658 | int must_wait; | 723 | unsigned long flags; |
724 | int must_wait = 0; | ||
659 | 725 | ||
660 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 726 | raw_spin_lock_irqsave(&rnp->lock, flags); |
661 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); | 727 | if (list_empty(&rnp->blkd_tasks)) |
662 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); | 728 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
663 | must_wait = rcu_preempted_readers_exp(rnp); | 729 | else { |
664 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 730 | rnp->exp_tasks = rnp->blkd_tasks.next; |
731 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | ||
732 | must_wait = 1; | ||
733 | } | ||
665 | if (!must_wait) | 734 | if (!must_wait) |
666 | rcu_report_exp_rnp(rsp, rnp); | 735 | rcu_report_exp_rnp(rsp, rnp); |
667 | } | 736 | } |
@@ -669,9 +738,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
669 | /* | 738 | /* |
670 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | 739 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea |
671 | * is to invoke synchronize_sched_expedited() to push all the tasks to | 740 | * is to invoke synchronize_sched_expedited() to push all the tasks to |
672 | * the ->blocked_tasks[] lists, move all entries from the first set of | 741 | * the ->blkd_tasks lists and wait for this list to drain. |
673 | * ->blocked_tasks[] lists to the second set, and finally wait for this | ||
674 | * second set to drain. | ||
675 | */ | 742 | */ |
676 | void synchronize_rcu_expedited(void) | 743 | void synchronize_rcu_expedited(void) |
677 | { | 744 | { |
@@ -703,7 +770,7 @@ void synchronize_rcu_expedited(void) | |||
703 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | 770 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) |
704 | goto unlock_mb_ret; /* Others did our work for us. */ | 771 | goto unlock_mb_ret; /* Others did our work for us. */ |
705 | 772 | ||
706 | /* force all RCU readers onto blocked_tasks[]. */ | 773 | /* force all RCU readers onto ->blkd_tasks lists. */ |
707 | synchronize_sched_expedited(); | 774 | synchronize_sched_expedited(); |
708 | 775 | ||
709 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 776 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
@@ -715,7 +782,7 @@ void synchronize_rcu_expedited(void) | |||
715 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 782 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
716 | } | 783 | } |
717 | 784 | ||
718 | /* Snapshot current state of ->blocked_tasks[] lists. */ | 785 | /* Snapshot current state of ->blkd_tasks lists. */ |
719 | rcu_for_each_leaf_node(rsp, rnp) | 786 | rcu_for_each_leaf_node(rsp, rnp) |
720 | sync_rcu_preempt_exp_init(rsp, rnp); | 787 | sync_rcu_preempt_exp_init(rsp, rnp); |
721 | if (NUM_RCU_NODES > 1) | 788 | if (NUM_RCU_NODES > 1) |
@@ -723,7 +790,7 @@ void synchronize_rcu_expedited(void) | |||
723 | 790 | ||
724 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 791 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
725 | 792 | ||
726 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ | 793 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
727 | rnp = rcu_get_root(rsp); | 794 | rnp = rcu_get_root(rsp); |
728 | wait_event(sync_rcu_preempt_exp_wq, | 795 | wait_event(sync_rcu_preempt_exp_wq, |
729 | sync_rcu_preempt_exp_done(rnp)); | 796 | sync_rcu_preempt_exp_done(rnp)); |
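For reference, the wait_event() above is the classic wait-for-drain pattern: snapshot the blocked readers, then sleep until the snapshotted set empties. A minimal user-space sketch of the same pattern using pthreads (illustrative only, not part of this patch; the counter and names are made up):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int nr_blocked = 3;                      /* pretend three readers were snapshotted */

static void *reader_exit(void *arg)
{
        pthread_mutex_lock(&lock);
        if (--nr_blocked == 0)                  /* roughly rcu_report_exp_rnp() */
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t[3];
        int i;

        for (i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, reader_exit, NULL);

        /* Roughly wait_event(..., sync_rcu_preempt_exp_done(rnp)). */
        pthread_mutex_lock(&lock);
        while (nr_blocked != 0)
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
        printf("all snapshotted readers have drained\n");

        for (i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}

Build with "cc -pthread". The kernel code above differs in that the wait happens on the root rcu_node while completion reports propagate up the combining tree.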
@@ -739,7 +806,7 @@ mb_ret: | |||
739 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 806 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
740 | 807 | ||
741 | /* | 808 | /* |
742 | * Check to see if there is any immediate preemptable-RCU-related work | 809 | * Check to see if there is any immediate preemptible-RCU-related work |
743 | * to be done. | 810 | * to be done. |
744 | */ | 811 | */ |
745 | static int rcu_preempt_pending(int cpu) | 812 | static int rcu_preempt_pending(int cpu) |
@@ -749,7 +816,7 @@ static int rcu_preempt_pending(int cpu) | |||
749 | } | 816 | } |
750 | 817 | ||
751 | /* | 818 | /* |
752 | * Does preemptable RCU need the CPU to stay out of dynticks mode? | 819 | * Does preemptible RCU need the CPU to stay out of dynticks mode? |
753 | */ | 820 | */ |
754 | static int rcu_preempt_needs_cpu(int cpu) | 821 | static int rcu_preempt_needs_cpu(int cpu) |
755 | { | 822 | { |
@@ -766,7 +833,7 @@ void rcu_barrier(void) | |||
766 | EXPORT_SYMBOL_GPL(rcu_barrier); | 833 | EXPORT_SYMBOL_GPL(rcu_barrier); |
767 | 834 | ||
768 | /* | 835 | /* |
769 | * Initialize preemptable RCU's per-CPU data. | 836 | * Initialize preemptible RCU's per-CPU data. |
770 | */ | 837 | */ |
771 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 838 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
772 | { | 839 | { |
@@ -774,7 +841,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
774 | } | 841 | } |
775 | 842 | ||
776 | /* | 843 | /* |
777 | * Move preemptable RCU's callbacks from dying CPU to other online CPU. | 844 | * Move preemptible RCU's callbacks from dying CPU to other online CPU. |
778 | */ | 845 | */ |
779 | static void rcu_preempt_send_cbs_to_online(void) | 846 | static void rcu_preempt_send_cbs_to_online(void) |
780 | { | 847 | { |
@@ -782,7 +849,7 @@ static void rcu_preempt_send_cbs_to_online(void) | |||
782 | } | 849 | } |
783 | 850 | ||
784 | /* | 851 | /* |
785 | * Initialize preemptable RCU's state structures. | 852 | * Initialize preemptible RCU's state structures. |
786 | */ | 853 | */ |
787 | static void __init __rcu_init_preempt(void) | 854 | static void __init __rcu_init_preempt(void) |
788 | { | 855 | { |
@@ -790,7 +857,7 @@ static void __init __rcu_init_preempt(void) | |||
790 | } | 857 | } |
791 | 858 | ||
792 | /* | 859 | /* |
793 | * Check for a task exiting while in a preemptable-RCU read-side | 860 | * Check for a task exiting while in a preemptible-RCU read-side |
794 | * critical section, clean up if so. No need to issue warnings, | 861 | * critical section, clean up if so. No need to issue warnings, |
795 | * as debug_check_no_locks_held() already does this if lockdep | 862 | * as debug_check_no_locks_held() already does this if lockdep |
796 | * is enabled. | 863 | * is enabled. |
@@ -802,11 +869,13 @@ void exit_rcu(void) | |||
802 | if (t->rcu_read_lock_nesting == 0) | 869 | if (t->rcu_read_lock_nesting == 0) |
803 | return; | 870 | return; |
804 | t->rcu_read_lock_nesting = 1; | 871 | t->rcu_read_lock_nesting = 1; |
805 | rcu_read_unlock(); | 872 | __rcu_read_unlock(); |
806 | } | 873 | } |
807 | 874 | ||
808 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 875 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
809 | 876 | ||
877 | static struct rcu_state *rcu_state = &rcu_sched_state; | ||
878 | |||
810 | /* | 879 | /* |
811 | * Tell them what RCU they are running. | 880 | * Tell them what RCU they are running. |
812 | */ | 881 | */ |
@@ -836,7 +905,7 @@ void rcu_force_quiescent_state(void) | |||
836 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 905 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
837 | 906 | ||
838 | /* | 907 | /* |
839 | * Because preemptable RCU does not exist, we never have to check for | 908 | * Because preemptible RCU does not exist, we never have to check for |
840 | * CPUs being in quiescent states. | 909 | * CPUs being in quiescent states. |
841 | */ | 910 | */ |
842 | static void rcu_preempt_note_context_switch(int cpu) | 911 | static void rcu_preempt_note_context_switch(int cpu) |
@@ -844,10 +913,10 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
844 | } | 913 | } |
845 | 914 | ||
846 | /* | 915 | /* |
847 | * Because preemptable RCU does not exist, there are never any preempted | 916 | * Because preemptible RCU does not exist, there are never any preempted |
848 | * RCU readers. | 917 | * RCU readers. |
849 | */ | 918 | */ |
850 | static int rcu_preempted_readers(struct rcu_node *rnp) | 919 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
851 | { | 920 | { |
852 | return 0; | 921 | return 0; |
853 | } | 922 | } |
@@ -862,10 +931,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
862 | 931 | ||
863 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 932 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
864 | 933 | ||
865 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
866 | |||
867 | /* | 934 | /* |
868 | * Because preemptable RCU does not exist, we never have to check for | 935 | * Because preemptible RCU does not exist, we never have to check for |
869 | * tasks blocked within RCU read-side critical sections. | 936 | * tasks blocked within RCU read-side critical sections. |
870 | */ | 937 | */ |
871 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | 938 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) |
@@ -873,7 +940,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
873 | } | 940 | } |
874 | 941 | ||
875 | /* | 942 | /* |
876 | * Because preemptable RCU does not exist, we never have to check for | 943 | * Because preemptible RCU does not exist, we never have to check for |
877 | * tasks blocked within RCU read-side critical sections. | 944 | * tasks blocked within RCU read-side critical sections. |
878 | */ | 945 | */ |
879 | static void rcu_print_task_stall(struct rcu_node *rnp) | 946 | static void rcu_print_task_stall(struct rcu_node *rnp) |
@@ -888,10 +955,8 @@ static void rcu_preempt_stall_reset(void) | |||
888 | { | 955 | { |
889 | } | 956 | } |
890 | 957 | ||
891 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
892 | |||
893 | /* | 958 | /* |
894 | * Because there is no preemptable RCU, there can be no readers blocked, | 959 | * Because there is no preemptible RCU, there can be no readers blocked, |
895 | * so there is no need to check for blocked tasks. So check only for | 960 | * so there is no need to check for blocked tasks. So check only for |
896 | * bogus qsmask values. | 961 | * bogus qsmask values. |
897 | */ | 962 | */ |
@@ -903,7 +968,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
903 | #ifdef CONFIG_HOTPLUG_CPU | 968 | #ifdef CONFIG_HOTPLUG_CPU |
904 | 969 | ||
905 | /* | 970 | /* |
906 | * Because preemptable RCU does not exist, it never needs to migrate | 971 | * Because preemptible RCU does not exist, it never needs to migrate |
907 | * tasks that were blocked within RCU read-side critical sections, and | 972 | * tasks that were blocked within RCU read-side critical sections, and |
908 | * such non-existent tasks cannot possibly have been blocking the current | 973 | * such non-existent tasks cannot possibly have been blocking the current |
909 | * grace period. | 974 | * grace period. |
@@ -916,7 +981,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
916 | } | 981 | } |
917 | 982 | ||
918 | /* | 983 | /* |
919 | * Because preemptable RCU does not exist, it never needs CPU-offline | 984 | * Because preemptible RCU does not exist, it never needs CPU-offline |
920 | * processing. | 985 | * processing. |
921 | */ | 986 | */ |
922 | static void rcu_preempt_offline_cpu(int cpu) | 987 | static void rcu_preempt_offline_cpu(int cpu) |
@@ -926,7 +991,7 @@ static void rcu_preempt_offline_cpu(int cpu) | |||
926 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 991 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
927 | 992 | ||
928 | /* | 993 | /* |
929 | * Because preemptable RCU does not exist, it never has any callbacks | 994 | * Because preemptible RCU does not exist, it never has any callbacks |
930 | * to check. | 995 | * to check. |
931 | */ | 996 | */ |
932 | static void rcu_preempt_check_callbacks(int cpu) | 997 | static void rcu_preempt_check_callbacks(int cpu) |
@@ -934,7 +999,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
934 | } | 999 | } |
935 | 1000 | ||
936 | /* | 1001 | /* |
937 | * Because preemptable RCU does not exist, it never has any callbacks | 1002 | * Because preemptible RCU does not exist, it never has any callbacks |
938 | * to process. | 1003 | * to process. |
939 | */ | 1004 | */ |
940 | static void rcu_preempt_process_callbacks(void) | 1005 | static void rcu_preempt_process_callbacks(void) |
@@ -943,7 +1008,7 @@ static void rcu_preempt_process_callbacks(void) | |||
943 | 1008 | ||
944 | /* | 1009 | /* |
945 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 1010 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
946 | * But because preemptable RCU does not exist, map to rcu-sched. | 1011 | * But because preemptible RCU does not exist, map to rcu-sched. |
947 | */ | 1012 | */ |
948 | void synchronize_rcu_expedited(void) | 1013 | void synchronize_rcu_expedited(void) |
949 | { | 1014 | { |
@@ -954,7 +1019,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
954 | #ifdef CONFIG_HOTPLUG_CPU | 1019 | #ifdef CONFIG_HOTPLUG_CPU |
955 | 1020 | ||
956 | /* | 1021 | /* |
957 | * Because preemptable RCU does not exist, there is never any need to | 1022 | * Because preemptible RCU does not exist, there is never any need to |
958 | * report on tasks preempted in RCU read-side critical sections during | 1023 | * report on tasks preempted in RCU read-side critical sections during |
959 | * expedited RCU grace periods. | 1024 | * expedited RCU grace periods. |
960 | */ | 1025 | */ |
@@ -966,7 +1031,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
966 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1031 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
967 | 1032 | ||
968 | /* | 1033 | /* |
969 | * Because preemptable RCU does not exist, it never has any work to do. | 1034 | * Because preemptible RCU does not exist, it never has any work to do. |
970 | */ | 1035 | */ |
971 | static int rcu_preempt_pending(int cpu) | 1036 | static int rcu_preempt_pending(int cpu) |
972 | { | 1037 | { |
@@ -974,7 +1039,7 @@ static int rcu_preempt_pending(int cpu) | |||
974 | } | 1039 | } |
975 | 1040 | ||
976 | /* | 1041 | /* |
977 | * Because preemptable RCU does not exist, it never needs any CPU. | 1042 | * Because preemptible RCU does not exist, it never needs any CPU. |
978 | */ | 1043 | */ |
979 | static int rcu_preempt_needs_cpu(int cpu) | 1044 | static int rcu_preempt_needs_cpu(int cpu) |
980 | { | 1045 | { |
@@ -982,7 +1047,7 @@ static int rcu_preempt_needs_cpu(int cpu) | |||
982 | } | 1047 | } |
983 | 1048 | ||
984 | /* | 1049 | /* |
985 | * Because preemptable RCU does not exist, rcu_barrier() is just | 1050 | * Because preemptible RCU does not exist, rcu_barrier() is just |
986 | * another name for rcu_barrier_sched(). | 1051 | * another name for rcu_barrier_sched(). |
987 | */ | 1052 | */ |
988 | void rcu_barrier(void) | 1053 | void rcu_barrier(void) |
@@ -992,7 +1057,7 @@ void rcu_barrier(void) | |||
992 | EXPORT_SYMBOL_GPL(rcu_barrier); | 1057 | EXPORT_SYMBOL_GPL(rcu_barrier); |
993 | 1058 | ||
994 | /* | 1059 | /* |
995 | * Because preemptable RCU does not exist, there is no per-CPU | 1060 | * Because preemptible RCU does not exist, there is no per-CPU |
996 | * data to initialize. | 1061 | * data to initialize. |
997 | */ | 1062 | */ |
998 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 1063 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
@@ -1000,14 +1065,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
1000 | } | 1065 | } |
1001 | 1066 | ||
1002 | /* | 1067 | /* |
1003 | * Because there is no preemptable RCU, there are no callbacks to move. | 1068 | * Because there is no preemptible RCU, there are no callbacks to move. |
1004 | */ | 1069 | */ |
1005 | static void rcu_preempt_send_cbs_to_online(void) | 1070 | static void rcu_preempt_send_cbs_to_online(void) |
1006 | { | 1071 | { |
1007 | } | 1072 | } |
1008 | 1073 | ||
1009 | /* | 1074 | /* |
1010 | * Because preemptable RCU does not exist, it need not be initialized. | 1075 | * Because preemptible RCU does not exist, it need not be initialized. |
1011 | */ | 1076 | */ |
1012 | static void __init __rcu_init_preempt(void) | 1077 | static void __init __rcu_init_preempt(void) |
1013 | { | 1078 | { |
@@ -1015,6 +1080,665 @@ static void __init __rcu_init_preempt(void) | |||
1015 | 1080 | ||
1016 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1081 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1017 | 1082 | ||
1083 | #ifdef CONFIG_RCU_BOOST | ||
1084 | |||
1085 | #include "rtmutex_common.h" | ||
1086 | |||
1087 | #ifdef CONFIG_RCU_TRACE | ||
1088 | |||
1089 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1090 | { | ||
1091 | if (list_empty(&rnp->blkd_tasks)) | ||
1092 | rnp->n_balk_blkd_tasks++; | ||
1093 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | ||
1094 | rnp->n_balk_exp_gp_tasks++; | ||
1095 | else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) | ||
1096 | rnp->n_balk_boost_tasks++; | ||
1097 | else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) | ||
1098 | rnp->n_balk_notblocked++; | ||
1099 | else if (rnp->gp_tasks != NULL && | ||
1100 | ULONG_CMP_LT(jiffies, rnp->boost_time)) | ||
1101 | rnp->n_balk_notyet++; | ||
1102 | else | ||
1103 | rnp->n_balk_nos++; | ||
1104 | } | ||
1105 | |||
1106 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
1107 | |||
1108 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1109 | { | ||
1110 | } | ||
1111 | |||
1112 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
1113 | |||
1114 | /* | ||
1115 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | ||
1116 | * or ->boost_tasks, advancing the pointer to the next task in the | ||
1117 | * ->blkd_tasks list. | ||
1118 | * | ||
1119 | * Note that irqs must be enabled: boosting the task can block. | ||
1120 | * Returns 1 if there are more tasks needing to be boosted. | ||
1121 | */ | ||
1122 | static int rcu_boost(struct rcu_node *rnp) | ||
1123 | { | ||
1124 | unsigned long flags; | ||
1125 | struct rt_mutex mtx; | ||
1126 | struct task_struct *t; | ||
1127 | struct list_head *tb; | ||
1128 | |||
1129 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | ||
1130 | return 0; /* Nothing left to boost. */ | ||
1131 | |||
1132 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1133 | |||
1134 | /* | ||
1135 | * Recheck under the lock: all tasks in need of boosting | ||
1136 | * might exit their RCU read-side critical sections on their own. | ||
1137 | */ | ||
1138 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { | ||
1139 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1140 | return 0; | ||
1141 | } | ||
1142 | |||
1143 | /* | ||
1144 | * Preferentially boost tasks blocking expedited grace periods. | ||
1145 | * This cannot starve the normal grace periods because a second | ||
1146 | * expedited grace period must boost all blocked tasks, including | ||
1147 | * those blocking the pre-existing normal grace period. | ||
1148 | */ | ||
1149 | if (rnp->exp_tasks != NULL) { | ||
1150 | tb = rnp->exp_tasks; | ||
1151 | rnp->n_exp_boosts++; | ||
1152 | } else { | ||
1153 | tb = rnp->boost_tasks; | ||
1154 | rnp->n_normal_boosts++; | ||
1155 | } | ||
1156 | rnp->n_tasks_boosted++; | ||
1157 | |||
1158 | /* | ||
1159 | * We boost task t by manufacturing an rt_mutex that appears to | ||
1160 | * be held by task t. We leave a pointer to that rt_mutex where | ||
1161 | * task t can find it, and task t will release the mutex when it | ||
1162 | * exits its outermost RCU read-side critical section. Then | ||
1163 | * simply acquiring this artificial rt_mutex will boost task | ||
1164 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
1165 | * | ||
1166 | * Note that task t must acquire rnp->lock to remove itself from | ||
1167 | * the ->blkd_tasks list, which it will do from exit() if from | ||
1168 | * nowhere else. We therefore are guaranteed that task t will | ||
1169 | * stay around at least until we drop rnp->lock. Note that | ||
1170 | * rnp->lock also resolves races between our priority boosting | ||
1171 | * and task t's exiting its outermost RCU read-side critical | ||
1172 | * section. | ||
1173 | */ | ||
1174 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
1175 | rt_mutex_init_proxy_locked(&mtx, t); | ||
1176 | t->rcu_boost_mutex = &mtx; | ||
1177 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | ||
1178 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1179 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | ||
1180 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
1181 | |||
1182 | return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; | ||
1183 | } | ||
1184 | |||
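The proxy-lock trick in rcu_boost() works because rt_mutexes implement priority inheritance: once the booster blocks on the mutex that task t appears to hold, t runs at the booster's priority until it releases the mutex from its outermost rcu_read_unlock(). A minimal user-space sketch of that inheritance using a PTHREAD_PRIO_INHERIT mutex (illustrative only, not part of this patch; the thread roles and priority value are made up, and SCHED_FIFO needs CAP_SYS_NICE):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t pi_mutex;        /* PTHREAD_PRIO_INHERIT, playing the rt_mutex role */

static void *reader(void *arg)          /* low-priority "RCU reader" stand-in */
{
        pthread_mutex_lock(&pi_mutex);
        sleep(1);                       /* while a booster blocks on pi_mutex, we run at its priority */
        pthread_mutex_unlock(&pi_mutex);/* inheritance ends when the lock is released */
        return NULL;
}

static void *booster(void *arg)         /* RT-priority "rcub" kthread stand-in */
{
        struct sched_param sp = { .sched_priority = 10 };

        pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
        pthread_mutex_lock(&pi_mutex);  /* blocking here lends our priority to the reader */
        pthread_mutex_unlock(&pi_mutex);
        return NULL;
}

int main(void)
{
        pthread_mutexattr_t ma;
        pthread_t r, b;

        pthread_mutexattr_init(&ma);
        pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
        pthread_mutex_init(&pi_mutex, &ma);

        pthread_create(&r, NULL, reader, NULL);
        usleep(100000);                 /* let the reader take the lock first */
        pthread_create(&b, NULL, booster, NULL);
        pthread_join(r, NULL);
        pthread_join(b, NULL);
        printf("reader released the lock; boosting is over\n");
        return 0;
}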
1185 | /* | ||
1186 | * Timer handler to initiate waking up of boost kthreads that | ||
1187 | * have yielded the CPU due to excessive numbers of tasks to | ||
1188 | * boost. We wake up the per-rcu_node kthread, which in turn | ||
1189 | * will wake up the booster kthread. | ||
1190 | */ | ||
1191 | static void rcu_boost_kthread_timer(unsigned long arg) | ||
1192 | { | ||
1193 | invoke_rcu_node_kthread((struct rcu_node *)arg); | ||
1194 | } | ||
1195 | |||
1196 | /* | ||
1197 | * Priority-boosting kthread. One per leaf rcu_node and one for the | ||
1198 | * root rcu_node. | ||
1199 | */ | ||
1200 | static int rcu_boost_kthread(void *arg) | ||
1201 | { | ||
1202 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1203 | int spincnt = 0; | ||
1204 | int more2boost; | ||
1205 | |||
1206 | for (;;) { | ||
1207 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | ||
1208 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | ||
1209 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | ||
1210 | more2boost = rcu_boost(rnp); | ||
1211 | if (more2boost) | ||
1212 | spincnt++; | ||
1213 | else | ||
1214 | spincnt = 0; | ||
1215 | if (spincnt > 10) { | ||
1216 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | ||
1217 | spincnt = 0; | ||
1218 | } | ||
1219 | } | ||
1220 | /* NOTREACHED */ | ||
1221 | return 0; | ||
1222 | } | ||
1223 | |||
1224 | /* | ||
1225 | * Check to see if it is time to start boosting RCU readers that are | ||
1226 | * blocking the current grace period, and, if so, tell the per-rcu_node | ||
1227 | * kthread to start boosting them. If there is an expedited grace | ||
1228 | * period in progress, it is always time to boost. | ||
1229 | * | ||
1230 | * The caller must hold rnp->lock, which this function releases, | ||
1231 | * but irqs remain disabled. The ->boost_kthread_task is immortal, | ||
1232 | * so we don't need to worry about it going away. | ||
1233 | */ | ||
1234 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1235 | { | ||
1236 | struct task_struct *t; | ||
1237 | |||
1238 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | ||
1239 | rnp->n_balk_exp_gp_tasks++; | ||
1240 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1241 | return; | ||
1242 | } | ||
1243 | if (rnp->exp_tasks != NULL || | ||
1244 | (rnp->gp_tasks != NULL && | ||
1245 | rnp->boost_tasks == NULL && | ||
1246 | rnp->qsmask == 0 && | ||
1247 | ULONG_CMP_GE(jiffies, rnp->boost_time))) { | ||
1248 | if (rnp->exp_tasks == NULL) | ||
1249 | rnp->boost_tasks = rnp->gp_tasks; | ||
1250 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1251 | t = rnp->boost_kthread_task; | ||
1252 | if (t != NULL) | ||
1253 | wake_up_process(t); | ||
1254 | } else { | ||
1255 | rcu_initiate_boost_trace(rnp); | ||
1256 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1257 | } | ||
1258 | } | ||
1259 | |||
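The ULONG_CMP_GE(jiffies, rnp->boost_time) test above (and ULONG_CMP_LT in the trace helper) compares times by unsigned subtraction so the result stays correct across jiffies wraparound, where a plain >= would not. A stand-alone demonstration (illustrative only; the two macros are assumed to mirror the kernel's definitions in rcupdate.h):

#include <limits.h>
#include <stdio.h>

/* Assumed to mirror the kernel's wrap-safe comparison macros. */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long boost_time = ULONG_MAX - 5;       /* deadline set just before the counter wraps */
        unsigned long jiffies_now = 10;                 /* "jiffies" has since wrapped past zero */

        /* A naive comparison thinks the deadline is still far in the future... */
        printf("naive >=     : %d\n", jiffies_now >= boost_time);               /* 0 */
        /* ...but the unsigned-subtraction form still sees it as already passed. */
        printf("ULONG_CMP_GE : %d\n", ULONG_CMP_GE(jiffies_now, boost_time));   /* 1 */
        printf("ULONG_CMP_LT : %d\n", ULONG_CMP_LT(jiffies_now, boost_time));   /* 0 */
        return 0;
}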
1260 | /* | ||
1261 | * Wake up the per-CPU kthread to invoke RCU callbacks. | ||
1262 | */ | ||
1263 | static void invoke_rcu_callbacks_kthread(void) | ||
1264 | { | ||
1265 | unsigned long flags; | ||
1266 | |||
1267 | local_irq_save(flags); | ||
1268 | __this_cpu_write(rcu_cpu_has_work, 1); | ||
1269 | if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { | ||
1270 | local_irq_restore(flags); | ||
1271 | return; | ||
1272 | } | ||
1273 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); | ||
1274 | local_irq_restore(flags); | ||
1275 | } | ||
1276 | |||
1277 | /* | ||
1278 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | ||
1279 | * held, so no one should be messing with the existence of the boost | ||
1280 | * kthread. | ||
1281 | */ | ||
1282 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
1283 | cpumask_var_t cm) | ||
1284 | { | ||
1285 | struct task_struct *t; | ||
1286 | |||
1287 | t = rnp->boost_kthread_task; | ||
1288 | if (t != NULL) | ||
1289 | set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); | ||
1290 | } | ||
1291 | |||
1292 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
1293 | |||
1294 | /* | ||
1295 | * Do priority-boost accounting for the start of a new grace period. | ||
1296 | */ | ||
1297 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1298 | { | ||
1299 | rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
1300 | } | ||
1301 | |||
1302 | /* | ||
1303 | * Create an RCU-boost kthread for the specified node if one does not | ||
1304 | * already exist. We only create this kthread for preemptible RCU. | ||
1305 | * Returns zero if all is well, a negated errno otherwise. | ||
1306 | */ | ||
1307 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
1308 | struct rcu_node *rnp, | ||
1309 | int rnp_index) | ||
1310 | { | ||
1311 | unsigned long flags; | ||
1312 | struct sched_param sp; | ||
1313 | struct task_struct *t; | ||
1314 | |||
1315 | if (&rcu_preempt_state != rsp) | ||
1316 | return 0; | ||
1317 | rsp->boost = 1; | ||
1318 | if (rnp->boost_kthread_task != NULL) | ||
1319 | return 0; | ||
1320 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | ||
1321 | "rcub%d", rnp_index); | ||
1322 | if (IS_ERR(t)) | ||
1323 | return PTR_ERR(t); | ||
1324 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1325 | rnp->boost_kthread_task = t; | ||
1326 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1327 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1328 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1329 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
1330 | return 0; | ||
1331 | } | ||
1332 | |||
1333 | #ifdef CONFIG_HOTPLUG_CPU | ||
1334 | |||
1335 | /* | ||
1336 | * Stop the RCU's per-CPU kthread when its CPU goes offline. | ||

1337 | */ | ||
1338 | static void rcu_stop_cpu_kthread(int cpu) | ||
1339 | { | ||
1340 | struct task_struct *t; | ||
1341 | |||
1342 | /* Stop the CPU's kthread. */ | ||
1343 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1344 | if (t != NULL) { | ||
1345 | per_cpu(rcu_cpu_kthread_task, cpu) = NULL; | ||
1346 | kthread_stop(t); | ||
1347 | } | ||
1348 | } | ||
1349 | |||
1350 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1351 | |||
1352 | static void rcu_kthread_do_work(void) | ||
1353 | { | ||
1354 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | ||
1355 | rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | ||
1356 | rcu_preempt_do_callbacks(); | ||
1357 | } | ||
1358 | |||
1359 | /* | ||
1360 | * Wake up the specified per-rcu_node-structure kthread. | ||
1361 | * Because the per-rcu_node kthreads are immortal, we don't need | ||
1362 | * to do anything to keep them alive. | ||
1363 | */ | ||
1364 | static void invoke_rcu_node_kthread(struct rcu_node *rnp) | ||
1365 | { | ||
1366 | struct task_struct *t; | ||
1367 | |||
1368 | t = rnp->node_kthread_task; | ||
1369 | if (t != NULL) | ||
1370 | wake_up_process(t); | ||
1371 | } | ||
1372 | |||
1373 | /* | ||
1374 | * Set the specified CPU's kthread to run RT or not, as specified by | ||
1375 | * the to_rt argument. The CPU-hotplug locks are held, so the task | ||
1376 | * is not going away. | ||
1377 | */ | ||
1378 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
1379 | { | ||
1380 | int policy; | ||
1381 | struct sched_param sp; | ||
1382 | struct task_struct *t; | ||
1383 | |||
1384 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1385 | if (t == NULL) | ||
1386 | return; | ||
1387 | if (to_rt) { | ||
1388 | policy = SCHED_FIFO; | ||
1389 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1390 | } else { | ||
1391 | policy = SCHED_NORMAL; | ||
1392 | sp.sched_priority = 0; | ||
1393 | } | ||
1394 | sched_setscheduler_nocheck(t, policy, &sp); | ||
1395 | } | ||
1396 | |||
1397 | /* | ||
1398 | * Timer handler to initiate the waking up of per-CPU kthreads that | ||
1399 | * have yielded the CPU due to excess numbers of RCU callbacks. | ||
1400 | * We wake up the per-rcu_node kthread, which in turn will wake up | ||
1401 | * the booster kthread. | ||
1402 | */ | ||
1403 | static void rcu_cpu_kthread_timer(unsigned long arg) | ||
1404 | { | ||
1405 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); | ||
1406 | struct rcu_node *rnp = rdp->mynode; | ||
1407 | |||
1408 | atomic_or(rdp->grpmask, &rnp->wakemask); | ||
1409 | invoke_rcu_node_kthread(rnp); | ||
1410 | } | ||
1411 | |||
1412 | /* | ||
1413 | * Drop to non-real-time priority and yield, but only after posting a | ||
1414 | * timer that will cause us to regain our real-time priority if we | ||
1415 | * remain preempted. Either way, we restore our real-time priority | ||
1416 | * before returning. | ||
1417 | */ | ||
1418 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | ||
1419 | { | ||
1420 | struct sched_param sp; | ||
1421 | struct timer_list yield_timer; | ||
1422 | |||
1423 | setup_timer_on_stack(&yield_timer, f, arg); | ||
1424 | mod_timer(&yield_timer, jiffies + 2); | ||
1425 | sp.sched_priority = 0; | ||
1426 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | ||
1427 | set_user_nice(current, 19); | ||
1428 | schedule(); | ||
1429 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1430 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
1431 | del_timer(&yield_timer); | ||
1432 | } | ||
1433 | |||
1434 | /* | ||
1435 | * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. | ||
1436 | * This can happen while the corresponding CPU is either coming online | ||
1437 | * or going offline. We cannot wait until the CPU is fully online | ||
1438 | * before starting the kthread, because the various notifier functions | ||
1439 | * can wait for RCU grace periods. So we park rcu_cpu_kthread() until | ||
1440 | * the corresponding CPU is online. | ||
1441 | * | ||
1442 | * Return 1 if the kthread needs to stop, 0 otherwise. | ||
1443 | * | ||
1444 | * Caller must disable bh. This function can momentarily enable it. | ||
1445 | */ | ||
1446 | static int rcu_cpu_kthread_should_stop(int cpu) | ||
1447 | { | ||
1448 | while (cpu_is_offline(cpu) || | ||
1449 | !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || | ||
1450 | smp_processor_id() != cpu) { | ||
1451 | if (kthread_should_stop()) | ||
1452 | return 1; | ||
1453 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
1454 | per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); | ||
1455 | local_bh_enable(); | ||
1456 | schedule_timeout_uninterruptible(1); | ||
1457 | if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) | ||
1458 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
1459 | local_bh_disable(); | ||
1460 | } | ||
1461 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1462 | return 0; | ||
1463 | } | ||
1464 | |||
1465 | /* | ||
1466 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | ||
1467 | * earlier RCU softirq. | ||
1468 | */ | ||
1469 | static int rcu_cpu_kthread(void *arg) | ||
1470 | { | ||
1471 | int cpu = (int)(long)arg; | ||
1472 | unsigned long flags; | ||
1473 | int spincnt = 0; | ||
1474 | unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); | ||
1475 | char work; | ||
1476 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | ||
1477 | |||
1478 | for (;;) { | ||
1479 | *statusp = RCU_KTHREAD_WAITING; | ||
1480 | rcu_wait(*workp != 0 || kthread_should_stop()); | ||
1481 | local_bh_disable(); | ||
1482 | if (rcu_cpu_kthread_should_stop(cpu)) { | ||
1483 | local_bh_enable(); | ||
1484 | break; | ||
1485 | } | ||
1486 | *statusp = RCU_KTHREAD_RUNNING; | ||
1487 | per_cpu(rcu_cpu_kthread_loops, cpu)++; | ||
1488 | local_irq_save(flags); | ||
1489 | work = *workp; | ||
1490 | *workp = 0; | ||
1491 | local_irq_restore(flags); | ||
1492 | if (work) | ||
1493 | rcu_kthread_do_work(); | ||
1494 | local_bh_enable(); | ||
1495 | if (*workp != 0) | ||
1496 | spincnt++; | ||
1497 | else | ||
1498 | spincnt = 0; | ||
1499 | if (spincnt > 10) { | ||
1500 | *statusp = RCU_KTHREAD_YIELDING; | ||
1501 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | ||
1502 | spincnt = 0; | ||
1503 | } | ||
1504 | } | ||
1505 | *statusp = RCU_KTHREAD_STOPPED; | ||
1506 | return 0; | ||
1507 | } | ||
1508 | |||
1509 | /* | ||
1510 | * Spawn a per-CPU kthread, setting up affinity and priority. | ||
1511 | * Because the CPU hotplug lock is held, no other CPU will be attempting | ||
1512 | * to manipulate rcu_cpu_kthread_task. There might be another CPU | ||
1513 | * attempting to access it during boot, but the locking in kthread_bind() | ||
1514 | * will enforce sufficient ordering. | ||
1515 | * | ||
1516 | * Please note that we cannot simply refuse to wake up the per-CPU | ||
1517 | * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, | ||
1518 | * which can result in softlockup complaints if the task ends up being | ||
1519 | * idle for more than a couple of minutes. | ||
1520 | * | ||
1521 | * However, please note also that we cannot bind the per-CPU kthread to its | ||
1522 | * CPU until that CPU is fully online. We also cannot wait until the | ||
1523 | * CPU is fully online before we create its per-CPU kthread, as this would | ||
1524 | * deadlock the system when CPU notifiers tried waiting for grace | ||
1525 | * periods. So we bind the per-CPU kthread to its CPU only if the CPU | ||
1526 | * is online. If its CPU is not yet fully online, then the code in | ||
1527 | * rcu_cpu_kthread() will wait until it is fully online, and then do | ||
1528 | * the binding. | ||
1529 | */ | ||
1530 | static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | ||
1531 | { | ||
1532 | struct sched_param sp; | ||
1533 | struct task_struct *t; | ||
1534 | |||
1535 | if (!rcu_scheduler_fully_active || | ||
1536 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | ||
1537 | return 0; | ||
1538 | t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); | ||
1539 | if (IS_ERR(t)) | ||
1540 | return PTR_ERR(t); | ||
1541 | if (cpu_online(cpu)) | ||
1542 | kthread_bind(t, cpu); | ||
1543 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1544 | WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); | ||
1545 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1546 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1547 | per_cpu(rcu_cpu_kthread_task, cpu) = t; | ||
1548 | wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ | ||
1549 | return 0; | ||
1550 | } | ||
1551 | |||
1552 | /* | ||
1553 | * Per-rcu_node kthread, which is in charge of waking up the per-CPU | ||
1554 | * kthreads when needed. We ignore requests to wake up kthreads | ||
1555 | * for offline CPUs, which is OK because force_quiescent_state() | ||
1556 | * takes care of this case. | ||
1557 | */ | ||
1558 | static int rcu_node_kthread(void *arg) | ||
1559 | { | ||
1560 | int cpu; | ||
1561 | unsigned long flags; | ||
1562 | unsigned long mask; | ||
1563 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1564 | struct sched_param sp; | ||
1565 | struct task_struct *t; | ||
1566 | |||
1567 | for (;;) { | ||
1568 | rnp->node_kthread_status = RCU_KTHREAD_WAITING; | ||
1569 | rcu_wait(atomic_read(&rnp->wakemask) != 0); | ||
1570 | rnp->node_kthread_status = RCU_KTHREAD_RUNNING; | ||
1571 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1572 | mask = atomic_xchg(&rnp->wakemask, 0); | ||
1573 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
1574 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { | ||
1575 | if ((mask & 0x1) == 0) | ||
1576 | continue; | ||
1577 | preempt_disable(); | ||
1578 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1579 | if (!cpu_online(cpu) || t == NULL) { | ||
1580 | preempt_enable(); | ||
1581 | continue; | ||
1582 | } | ||
1583 | per_cpu(rcu_cpu_has_work, cpu) = 1; | ||
1584 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1585 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1586 | preempt_enable(); | ||
1587 | } | ||
1588 | } | ||
1589 | /* NOTREACHED */ | ||
1590 | rnp->node_kthread_status = RCU_KTHREAD_STOPPED; | ||
1591 | return 0; | ||
1592 | } | ||
1593 | |||
1594 | /* | ||
1595 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | ||
1596 | * served by the rcu_node in question. The CPU hotplug lock is still | ||
1597 | * held, so the value of rnp->qsmaskinit will be stable. | ||
1598 | * | ||
1599 | * We don't include outgoingcpu in the affinity set; use -1 if there is | ||
1600 | * no outgoing CPU. If there are no CPUs left in the affinity set, | ||
1601 | * this function allows the kthread to execute on any CPU. | ||
1602 | */ | ||
1603 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1604 | { | ||
1605 | cpumask_var_t cm; | ||
1606 | int cpu; | ||
1607 | unsigned long mask = rnp->qsmaskinit; | ||
1608 | |||
1609 | if (rnp->node_kthread_task == NULL) | ||
1610 | return; | ||
1611 | if (!alloc_cpumask_var(&cm, GFP_KERNEL)) | ||
1612 | return; | ||
1613 | cpumask_clear(cm); | ||
1614 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | ||
1615 | if ((mask & 0x1) && cpu != outgoingcpu) | ||
1616 | cpumask_set_cpu(cpu, cm); | ||
1617 | if (cpumask_weight(cm) == 0) { | ||
1618 | cpumask_setall(cm); | ||
1619 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
1620 | cpumask_clear_cpu(cpu, cm); | ||
1621 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
1622 | } | ||
1623 | set_cpus_allowed_ptr(rnp->node_kthread_task, cm); | ||
1624 | rcu_boost_kthread_setaffinity(rnp, cm); | ||
1625 | free_cpumask_var(cm); | ||
1626 | } | ||
1627 | |||
1628 | /* | ||
1629 | * Spawn a per-rcu_node kthread, setting priority and affinity. | ||
1630 | * Called during boot before online/offline can happen, or, if | ||
1631 | * during runtime, with the main CPU-hotplug locks held. So only | ||
1632 | * one of these can be executing at a time. | ||
1633 | */ | ||
1634 | static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | ||
1635 | struct rcu_node *rnp) | ||
1636 | { | ||
1637 | unsigned long flags; | ||
1638 | int rnp_index = rnp - &rsp->node[0]; | ||
1639 | struct sched_param sp; | ||
1640 | struct task_struct *t; | ||
1641 | |||
1642 | if (!rcu_scheduler_fully_active || | ||
1643 | rnp->qsmaskinit == 0) | ||
1644 | return 0; | ||
1645 | if (rnp->node_kthread_task == NULL) { | ||
1646 | t = kthread_create(rcu_node_kthread, (void *)rnp, | ||
1647 | "rcun%d", rnp_index); | ||
1648 | if (IS_ERR(t)) | ||
1649 | return PTR_ERR(t); | ||
1650 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1651 | rnp->node_kthread_task = t; | ||
1652 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1653 | sp.sched_priority = 99; | ||
1654 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1655 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
1656 | } | ||
1657 | return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); | ||
1658 | } | ||
1659 | |||
1660 | /* | ||
1661 | * Spawn all kthreads -- called as soon as the scheduler is running. | ||
1662 | */ | ||
1663 | static int __init rcu_spawn_kthreads(void) | ||
1664 | { | ||
1665 | int cpu; | ||
1666 | struct rcu_node *rnp; | ||
1667 | |||
1668 | rcu_scheduler_fully_active = 1; | ||
1669 | for_each_possible_cpu(cpu) { | ||
1670 | per_cpu(rcu_cpu_has_work, cpu) = 0; | ||
1671 | if (cpu_online(cpu)) | ||
1672 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
1673 | } | ||
1674 | rnp = rcu_get_root(rcu_state); | ||
1675 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1676 | if (NUM_RCU_NODES > 1) { | ||
1677 | rcu_for_each_leaf_node(rcu_state, rnp) | ||
1678 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1679 | } | ||
1680 | return 0; | ||
1681 | } | ||
1682 | early_initcall(rcu_spawn_kthreads); | ||
1683 | |||
1684 | static void __cpuinit rcu_prepare_kthreads(int cpu) | ||
1685 | { | ||
1686 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
1687 | struct rcu_node *rnp = rdp->mynode; | ||
1688 | |||
1689 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | ||
1690 | if (rcu_scheduler_fully_active) { | ||
1691 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
1692 | if (rnp->node_kthread_task == NULL) | ||
1693 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1694 | } | ||
1695 | } | ||
1696 | |||
1697 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
1698 | |||
1699 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1700 | { | ||
1701 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1702 | } | ||
1703 | |||
1704 | static void invoke_rcu_callbacks_kthread(void) | ||
1705 | { | ||
1706 | WARN_ON_ONCE(1); | ||
1707 | } | ||
1708 | |||
1709 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1710 | { | ||
1711 | } | ||
1712 | |||
1713 | #ifdef CONFIG_HOTPLUG_CPU | ||
1714 | |||
1715 | static void rcu_stop_cpu_kthread(int cpu) | ||
1716 | { | ||
1717 | } | ||
1718 | |||
1719 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1720 | |||
1721 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1722 | { | ||
1723 | } | ||
1724 | |||
1725 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
1726 | { | ||
1727 | } | ||
1728 | |||
1729 | static int __init rcu_scheduler_really_started(void) | ||
1730 | { | ||
1731 | rcu_scheduler_fully_active = 1; | ||
1732 | return 0; | ||
1733 | } | ||
1734 | early_initcall(rcu_scheduler_really_started); | ||
1735 | |||
1736 | static void __cpuinit rcu_prepare_kthreads(int cpu) | ||
1737 | { | ||
1738 | } | ||
1739 | |||
1740 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
1741 | |||
1018 | #ifndef CONFIG_SMP | 1742 | #ifndef CONFIG_SMP |
1019 | 1743 | ||
1020 | void synchronize_sched_expedited(void) | 1744 | void synchronize_sched_expedited(void) |
@@ -1187,14 +1911,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | |||
1187 | * | 1911 | * |
1188 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 1912 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
1189 | * disabled, we do one pass of force_quiescent_state(), then do a | 1913 | * disabled, we do one pass of force_quiescent_state(), then do a |
1190 | * raise_softirq() to cause rcu_process_callbacks() to be invoked later. | 1914 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
1191 | * The per-cpu rcu_dyntick_drain variable controls the sequencing. | 1915 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. |
1192 | */ | 1916 | */ |
1193 | int rcu_needs_cpu(int cpu) | 1917 | int rcu_needs_cpu(int cpu) |
1194 | { | 1918 | { |
1195 | int c = 0; | 1919 | int c = 0; |
1196 | int snap; | 1920 | int snap; |
1197 | int snap_nmi; | ||
1198 | int thatcpu; | 1921 | int thatcpu; |
1199 | 1922 | ||
1200 | /* Check for being in the holdoff period. */ | 1923 | /* Check for being in the holdoff period. */ |
@@ -1205,10 +1928,10 @@ int rcu_needs_cpu(int cpu) | |||
1205 | for_each_online_cpu(thatcpu) { | 1928 | for_each_online_cpu(thatcpu) { |
1206 | if (thatcpu == cpu) | 1929 | if (thatcpu == cpu) |
1207 | continue; | 1930 | continue; |
1208 | snap = per_cpu(rcu_dynticks, thatcpu).dynticks; | 1931 | snap = atomic_add_return(0, &per_cpu(rcu_dynticks, |
1209 | snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; | 1932 | thatcpu).dynticks); |
1210 | smp_mb(); /* Order sampling of snap with end of grace period. */ | 1933 | smp_mb(); /* Order sampling of snap with end of grace period. */ |
1211 | if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { | 1934 | if ((snap & 0x1) != 0) { |
1212 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 1935 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
1213 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 1936 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
1214 | return rcu_needs_cpu_quick_check(cpu); | 1937 | return rcu_needs_cpu_quick_check(cpu); |
@@ -1239,7 +1962,7 @@ int rcu_needs_cpu(int cpu) | |||
1239 | 1962 | ||
1240 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | 1963 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ |
1241 | if (c) | 1964 | if (c) |
1242 | raise_softirq(RCU_SOFTIRQ); | 1965 | invoke_rcu_core(); |
1243 | return c; | 1966 | return c; |
1244 | } | 1967 | } |
1245 | 1968 | ||
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c8e97853b970..4e144876dc68 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,6 +46,22 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | #ifdef CONFIG_RCU_BOOST | ||
50 | |||
51 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
52 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); | ||
53 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
54 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
55 | |||
56 | static char convert_kthread_status(unsigned int kthread_status) | ||
57 | { | ||
58 | if (kthread_status > RCU_KTHREAD_MAX) | ||
59 | return '?'; | ||
60 | return "SRWOY"[kthread_status]; | ||
61 | } | ||
62 | |||
63 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
64 | |||
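The convert_kthread_status() helper above and the ".N"[...] expressions below both index directly into a string literal to produce a one-character status flag. A tiny stand-alone reminder of how that idiom reads (illustrative only; the example values are arbitrary):

#include <stdio.h>

int main(void)
{
        unsigned int kthread_status = 2;        /* e.g. the third state encoded in "SRWOY" */
        int has_cb = 1;                         /* boolean flag, 0 or 1 */

        printf("%c\n", "SRWOY"[kthread_status]);        /* prints 'W' */
        printf("%c\n", ".N"[has_cb]);                   /* '.' if 0, 'N' if 1 */
        return 0;
}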
49 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | 65 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) |
50 | { | 66 | { |
51 | if (!rdp->beenonline) | 67 | if (!rdp->beenonline) |
@@ -57,14 +73,31 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
57 | rdp->passed_quiesc, rdp->passed_quiesc_completed, | 73 | rdp->passed_quiesc, rdp->passed_quiesc_completed, |
58 | rdp->qs_pending); | 74 | rdp->qs_pending); |
59 | #ifdef CONFIG_NO_HZ | 75 | #ifdef CONFIG_NO_HZ |
60 | seq_printf(m, " dt=%d/%d dn=%d df=%lu", | 76 | seq_printf(m, " dt=%d/%d/%d df=%lu", |
61 | rdp->dynticks->dynticks, | 77 | atomic_read(&rdp->dynticks->dynticks), |
62 | rdp->dynticks->dynticks_nesting, | 78 | rdp->dynticks->dynticks_nesting, |
63 | rdp->dynticks->dynticks_nmi, | 79 | rdp->dynticks->dynticks_nmi_nesting, |
64 | rdp->dynticks_fqs); | 80 | rdp->dynticks_fqs); |
65 | #endif /* #ifdef CONFIG_NO_HZ */ | 81 | #endif /* #ifdef CONFIG_NO_HZ */ |
66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 82 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
67 | seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); | 83 | seq_printf(m, " ql=%ld qs=%c%c%c%c", |
84 | rdp->qlen, | ||
85 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
86 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
87 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
88 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
89 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
90 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
91 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
92 | #ifdef CONFIG_RCU_BOOST | ||
93 | seq_printf(m, " kt=%d/%c/%d ktl=%x", | ||
94 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
95 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
96 | rdp->cpu)), | ||
97 | per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), | ||
98 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | ||
99 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
100 | seq_printf(m, " b=%ld", rdp->blimit); | ||
68 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | 101 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", |
69 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 102 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
70 | } | 103 | } |
@@ -115,13 +148,27 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
115 | rdp->qs_pending); | 148 | rdp->qs_pending); |
116 | #ifdef CONFIG_NO_HZ | 149 | #ifdef CONFIG_NO_HZ |
117 | seq_printf(m, ",%d,%d,%d,%lu", | 150 | seq_printf(m, ",%d,%d,%d,%lu", |
118 | rdp->dynticks->dynticks, | 151 | atomic_read(&rdp->dynticks->dynticks), |
119 | rdp->dynticks->dynticks_nesting, | 152 | rdp->dynticks->dynticks_nesting, |
120 | rdp->dynticks->dynticks_nmi, | 153 | rdp->dynticks->dynticks_nmi_nesting, |
121 | rdp->dynticks_fqs); | 154 | rdp->dynticks_fqs); |
122 | #endif /* #ifdef CONFIG_NO_HZ */ | 155 | #endif /* #ifdef CONFIG_NO_HZ */ |
123 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 156 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
124 | seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); | 157 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, |
158 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
159 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
160 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
161 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
162 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
163 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
164 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
165 | #ifdef CONFIG_RCU_BOOST | ||
166 | seq_printf(m, ",%d,\"%c\"", | ||
167 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
168 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
169 | rdp->cpu))); | ||
170 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
171 | seq_printf(m, ",%ld", rdp->blimit); | ||
125 | seq_printf(m, ",%lu,%lu,%lu\n", | 172 | seq_printf(m, ",%lu,%lu,%lu\n", |
126 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 173 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
127 | } | 174 | } |
@@ -130,9 +177,13 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
130 | { | 177 | { |
131 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); | 178 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); |
132 | #ifdef CONFIG_NO_HZ | 179 | #ifdef CONFIG_NO_HZ |
133 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); | 180 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
134 | #endif /* #ifdef CONFIG_NO_HZ */ | 181 | #endif /* #ifdef CONFIG_NO_HZ */ |
135 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); | 182 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); |
183 | #ifdef CONFIG_RCU_BOOST | ||
184 | seq_puts(m, "\"kt\",\"ktl\""); | ||
185 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
186 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | ||
136 | #ifdef CONFIG_TREE_PREEMPT_RCU | 187 | #ifdef CONFIG_TREE_PREEMPT_RCU |
137 | seq_puts(m, "\"rcu_preempt:\"\n"); | 188 | seq_puts(m, "\"rcu_preempt:\"\n"); |
138 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 189 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); |
@@ -157,11 +208,76 @@ static const struct file_operations rcudata_csv_fops = { | |||
157 | .release = single_release, | 208 | .release = single_release, |
158 | }; | 209 | }; |
159 | 210 | ||
211 | #ifdef CONFIG_RCU_BOOST | ||
212 | |||
213 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | ||
214 | { | ||
215 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " | ||
216 | "j=%04x bt=%04x\n", | ||
217 | rnp->grplo, rnp->grphi, | ||
218 | "T."[list_empty(&rnp->blkd_tasks)], | ||
219 | "N."[!rnp->gp_tasks], | ||
220 | "E."[!rnp->exp_tasks], | ||
221 | "B."[!rnp->boost_tasks], | ||
222 | convert_kthread_status(rnp->boost_kthread_status), | ||
223 | rnp->n_tasks_boosted, rnp->n_exp_boosts, | ||
224 | rnp->n_normal_boosts, | ||
225 | (int)(jiffies & 0xffff), | ||
226 | (int)(rnp->boost_time & 0xffff)); | ||
227 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", | ||
228 | " balk", | ||
229 | rnp->n_balk_blkd_tasks, | ||
230 | rnp->n_balk_exp_gp_tasks, | ||
231 | rnp->n_balk_boost_tasks, | ||
232 | rnp->n_balk_notblocked, | ||
233 | rnp->n_balk_notyet, | ||
234 | rnp->n_balk_nos); | ||
235 | } | ||
236 | |||
237 | static int show_rcu_node_boost(struct seq_file *m, void *unused) | ||
238 | { | ||
239 | struct rcu_node *rnp; | ||
240 | |||
241 | rcu_for_each_leaf_node(&rcu_preempt_state, rnp) | ||
242 | print_one_rcu_node_boost(m, rnp); | ||
243 | return 0; | ||
244 | } | ||
245 | |||
246 | static int rcu_node_boost_open(struct inode *inode, struct file *file) | ||
247 | { | ||
248 | return single_open(file, show_rcu_node_boost, NULL); | ||
249 | } | ||
250 | |||
251 | static const struct file_operations rcu_node_boost_fops = { | ||
252 | .owner = THIS_MODULE, | ||
253 | .open = rcu_node_boost_open, | ||
254 | .read = seq_read, | ||
255 | .llseek = seq_lseek, | ||
256 | .release = single_release, | ||
257 | }; | ||
258 | |||
259 | /* | ||
260 | * Create the rcuboost debugfs entry. Returns 0 on success, non-zero on failure. | ||
261 | */ | ||
262 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
263 | { | ||
264 | return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, | ||
265 | &rcu_node_boost_fops); | ||
266 | } | ||
267 | |||
268 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
269 | |||
270 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
271 | { | ||
272 | return 0; /* There cannot be an error if we didn't create it! */ | ||
273 | } | ||
274 | |||
275 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
276 | |||
160 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 277 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
161 | { | 278 | { |
162 | unsigned long gpnum; | 279 | unsigned long gpnum; |
163 | int level = 0; | 280 | int level = 0; |
164 | int phase; | ||
165 | struct rcu_node *rnp; | 281 | struct rcu_node *rnp; |
166 | 282 | ||
167 | gpnum = rsp->gpnum; | 283 | gpnum = rsp->gpnum; |
@@ -178,13 +294,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
178 | seq_puts(m, "\n"); | 294 | seq_puts(m, "\n"); |
179 | level = rnp->level; | 295 | level = rnp->level; |
180 | } | 296 | } |
181 | phase = gpnum & 0x1; | 297 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", |
182 | seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", | ||
183 | rnp->qsmask, rnp->qsmaskinit, | 298 | rnp->qsmask, rnp->qsmaskinit, |
184 | "T."[list_empty(&rnp->blocked_tasks[phase])], | 299 | ".G"[rnp->gp_tasks != NULL], |
185 | "E."[list_empty(&rnp->blocked_tasks[phase + 2])], | 300 | ".E"[rnp->exp_tasks != NULL], |
186 | "T."[list_empty(&rnp->blocked_tasks[!phase])], | 301 | ".T"[!list_empty(&rnp->blkd_tasks)], |
187 | "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], | ||
188 | rnp->grplo, rnp->grphi, rnp->grpnum); | 302 | rnp->grplo, rnp->grphi, rnp->grpnum); |
189 | } | 303 | } |
190 | seq_puts(m, "\n"); | 304 | seq_puts(m, "\n"); |
@@ -216,16 +330,35 @@ static const struct file_operations rcuhier_fops = { | |||
216 | .release = single_release, | 330 | .release = single_release, |
217 | }; | 331 | }; |
218 | 332 | ||
333 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | ||
334 | { | ||
335 | unsigned long flags; | ||
336 | unsigned long completed; | ||
337 | unsigned long gpnum; | ||
338 | unsigned long gpage; | ||
339 | unsigned long gpmax; | ||
340 | struct rcu_node *rnp = &rsp->node[0]; | ||
341 | |||
342 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
343 | completed = rsp->completed; | ||
344 | gpnum = rsp->gpnum; | ||
345 | if (rsp->completed == rsp->gpnum) | ||
346 | gpage = 0; | ||
347 | else | ||
348 | gpage = jiffies - rsp->gp_start; | ||
349 | gpmax = rsp->gp_max; | ||
350 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
351 | seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", | ||
352 | rsp->name, completed, gpnum, gpage, gpmax); | ||
353 | } | ||
354 | |||
219 | static int show_rcugp(struct seq_file *m, void *unused) | 355 | static int show_rcugp(struct seq_file *m, void *unused) |
220 | { | 356 | { |
221 | #ifdef CONFIG_TREE_PREEMPT_RCU | 357 | #ifdef CONFIG_TREE_PREEMPT_RCU |
222 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", | 358 | show_one_rcugp(m, &rcu_preempt_state); |
223 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); | ||
224 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 359 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
225 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", | 360 | show_one_rcugp(m, &rcu_sched_state); |
226 | rcu_sched_state.completed, rcu_sched_state.gpnum); | 361 | show_one_rcugp(m, &rcu_bh_state); |
227 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", | ||
228 | rcu_bh_state.completed, rcu_bh_state.gpnum); | ||
229 | return 0; | 362 | return 0; |
230 | } | 363 | } |
231 | 364 | ||
@@ -298,6 +431,29 @@ static const struct file_operations rcu_pending_fops = { | |||
298 | .release = single_release, | 431 | .release = single_release, |
299 | }; | 432 | }; |
300 | 433 | ||
434 | static int show_rcutorture(struct seq_file *m, void *unused) | ||
435 | { | ||
436 | seq_printf(m, "rcutorture test sequence: %lu %s\n", | ||
437 | rcutorture_testseq >> 1, | ||
438 | (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); | ||
439 | seq_printf(m, "rcutorture update version number: %lu\n", | ||
440 | rcutorture_vernum); | ||
441 | return 0; | ||
442 | } | ||
443 | |||
444 | static int rcutorture_open(struct inode *inode, struct file *file) | ||
445 | { | ||
446 | return single_open(file, show_rcutorture, NULL); | ||
447 | } | ||
448 | |||
449 | static const struct file_operations rcutorture_fops = { | ||
450 | .owner = THIS_MODULE, | ||
451 | .open = rcutorture_open, | ||
452 | .read = seq_read, | ||
453 | .llseek = seq_lseek, | ||
454 | .release = single_release, | ||
455 | }; | ||
456 | |||
301 | static struct dentry *rcudir; | 457 | static struct dentry *rcudir; |
302 | 458 | ||
303 | static int __init rcutree_trace_init(void) | 459 | static int __init rcutree_trace_init(void) |
@@ -318,6 +474,9 @@ static int __init rcutree_trace_init(void) | |||
318 | if (!retval) | 474 | if (!retval) |
319 | goto free_out; | 475 | goto free_out; |
320 | 476 | ||
477 | if (rcu_boost_trace_create_file(rcudir)) | ||
478 | goto free_out; | ||
479 | |||
321 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | 480 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); |
322 | if (!retval) | 481 | if (!retval) |
323 | goto free_out; | 482 | goto free_out; |
@@ -331,6 +490,11 @@ static int __init rcutree_trace_init(void) | |||
331 | NULL, &rcu_pending_fops); | 490 | NULL, &rcu_pending_fops); |
332 | if (!retval) | 491 | if (!retval) |
333 | goto free_out; | 492 | goto free_out; |
493 | |||
494 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | ||
495 | NULL, &rcutorture_fops); | ||
496 | if (!retval) | ||
497 | goto free_out; | ||
334 | return 0; | 498 | return 0; |
335 | free_out: | 499 | free_out: |
336 | debugfs_remove_recursive(rcudir); | 500 | debugfs_remove_recursive(rcudir); |
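The rcuboost and rcutorture entries added above both follow the stock single_open()/seq_file pattern for a read-only debugfs file. A minimal sketch of that pattern, using hypothetical names (my_show, my_fops, "my_stats") rather than anything taken from this patch:

    /* Sketch only: hypothetical debugfs entry, same single_open() pattern as above. */
    #include <linux/debugfs.h>
    #include <linux/fs.h>
    #include <linux/init.h>
    #include <linux/jiffies.h>
    #include <linux/module.h>
    #include <linux/seq_file.h>

    static int my_show(struct seq_file *m, void *unused)
    {
            seq_printf(m, "jiffies now: %lu\n", jiffies);   /* any snapshot will do */
            return 0;
    }

    static int my_open(struct inode *inode, struct file *file)
    {
            return single_open(file, my_show, NULL);
    }

    static const struct file_operations my_fops = {
            .owner   = THIS_MODULE,
            .open    = my_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

    /* 0 on success, mirroring the patch's rcu_boost_trace_create_file() helper. */
    static int __init my_trace_init(void)
    {
            return !debugfs_create_file("my_stats", 0444, NULL, NULL, &my_fops);
    }

Both the patch and this sketch rely on debugfs_create_file() returning NULL on failure, which is why the !ptr test doubles as the error flag.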
diff --git a/kernel/resource.c b/kernel/resource.c index 798e2fae2a06..3ff40178dce7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -38,6 +38,14 @@ struct resource iomem_resource = { | |||
38 | }; | 38 | }; |
39 | EXPORT_SYMBOL(iomem_resource); | 39 | EXPORT_SYMBOL(iomem_resource); |
40 | 40 | ||
41 | /* constraints to be met while allocating resources */ | ||
42 | struct resource_constraint { | ||
43 | resource_size_t min, max, align; | ||
44 | resource_size_t (*alignf)(void *, const struct resource *, | ||
45 | resource_size_t, resource_size_t); | ||
46 | void *alignf_data; | ||
47 | }; | ||
48 | |||
41 | static DEFINE_RWLOCK(resource_lock); | 49 | static DEFINE_RWLOCK(resource_lock); |
42 | 50 | ||
43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 51 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
@@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2) | |||
384 | } | 392 | } |
385 | 393 | ||
386 | /* | 394 | /* |
387 | * Find empty slot in the resource tree given range and alignment. | 395 | * Find empty slot in the resource tree with the given range and |
396 | * alignment constraints | ||
388 | */ | 397 | */ |
389 | static int find_resource(struct resource *root, struct resource *new, | 398 | static int __find_resource(struct resource *root, struct resource *old, |
390 | resource_size_t size, resource_size_t min, | 399 | struct resource *new, |
391 | resource_size_t max, resource_size_t align, | 400 | resource_size_t size, |
392 | resource_size_t (*alignf)(void *, | 401 | struct resource_constraint *constraint) |
393 | const struct resource *, | ||
394 | resource_size_t, | ||
395 | resource_size_t), | ||
396 | void *alignf_data) | ||
397 | { | 402 | { |
398 | struct resource *this = root->child; | 403 | struct resource *this = root->child; |
399 | struct resource tmp = *new, avail, alloc; | 404 | struct resource tmp = *new, avail, alloc; |
@@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new, | |||
404 | * Skip past an allocated resource that starts at 0, since the assignment | 409 | * Skip past an allocated resource that starts at 0, since the assignment |
405 | * of this->start - 1 to tmp->end below would cause an underflow. | 410 | * of this->start - 1 to tmp->end below would cause an underflow. |
406 | */ | 411 | */ |
407 | if (this && this->start == 0) { | 412 | if (this && this->start == root->start) { |
408 | tmp.start = this->end + 1; | 413 | tmp.start = (this == old) ? old->start : this->end + 1; |
409 | this = this->sibling; | 414 | this = this->sibling; |
410 | } | 415 | } |
411 | for(;;) { | 416 | for(;;) { |
412 | if (this) | 417 | if (this) |
413 | tmp.end = this->start - 1; | 418 | tmp.end = (this == old) ? this->end : this->start - 1; |
414 | else | 419 | else |
415 | tmp.end = root->end; | 420 | tmp.end = root->end; |
416 | 421 | ||
417 | resource_clip(&tmp, min, max); | 422 | resource_clip(&tmp, constraint->min, constraint->max); |
418 | arch_remove_reservations(&tmp); | 423 | arch_remove_reservations(&tmp); |
419 | 424 | ||
420 | /* Check for overflow after ALIGN() */ | 425 | /* Check for overflow after ALIGN() */ |
421 | avail = *new; | 426 | avail = *new; |
422 | avail.start = ALIGN(tmp.start, align); | 427 | avail.start = ALIGN(tmp.start, constraint->align); |
423 | avail.end = tmp.end; | 428 | avail.end = tmp.end; |
424 | if (avail.start >= tmp.start) { | 429 | if (avail.start >= tmp.start) { |
425 | alloc.start = alignf(alignf_data, &avail, size, align); | 430 | alloc.start = constraint->alignf(constraint->alignf_data, &avail, |
431 | size, constraint->align); | ||
426 | alloc.end = alloc.start + size - 1; | 432 | alloc.end = alloc.start + size - 1; |
427 | if (resource_contains(&avail, &alloc)) { | 433 | if (resource_contains(&avail, &alloc)) { |
428 | new->start = alloc.start; | 434 | new->start = alloc.start; |
@@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new, | |||
432 | } | 438 | } |
433 | if (!this) | 439 | if (!this) |
434 | break; | 440 | break; |
435 | tmp.start = this->end + 1; | 441 | if (this != old) |
442 | tmp.start = this->end + 1; | ||
436 | this = this->sibling; | 443 | this = this->sibling; |
437 | } | 444 | } |
438 | return -EBUSY; | 445 | return -EBUSY; |
439 | } | 446 | } |
440 | 447 | ||
448 | /* | ||
449 | * Find empty slot in the resource tree given range and alignment. | ||
450 | */ | ||
451 | static int find_resource(struct resource *root, struct resource *new, | ||
452 | resource_size_t size, | ||
453 | struct resource_constraint *constraint) | ||
454 | { | ||
455 | return __find_resource(root, NULL, new, size, constraint); | ||
456 | } | ||
457 | |||
441 | /** | 458 | /** |
442 | * allocate_resource - allocate empty slot in the resource tree given range & alignment | 459 | * reallocate_resource - allocate a slot in the resource tree given range & alignment. |
460 | * The resource will be relocated if the new size cannot be accommodated in the | ||
461 | * current location. | ||
462 | * | ||
463 | * @root: root resource descriptor | ||
464 | * @old: resource descriptor desired by caller | ||
465 | * @newsize: new size of the resource descriptor | ||
466 | * @constraint: the size and alignment constraints to be met. | ||
467 | */ | ||
468 | int reallocate_resource(struct resource *root, struct resource *old, | ||
469 | resource_size_t newsize, | ||
470 | struct resource_constraint *constraint) | ||
471 | { | ||
472 | int err=0; | ||
473 | struct resource new = *old; | ||
474 | struct resource *conflict; | ||
475 | |||
476 | write_lock(&resource_lock); | ||
477 | |||
478 | if ((err = __find_resource(root, old, &new, newsize, constraint))) | ||
479 | goto out; | ||
480 | |||
481 | if (resource_contains(&new, old)) { | ||
482 | old->start = new.start; | ||
483 | old->end = new.end; | ||
484 | goto out; | ||
485 | } | ||
486 | |||
487 | if (old->child) { | ||
488 | err = -EBUSY; | ||
489 | goto out; | ||
490 | } | ||
491 | |||
492 | if (resource_contains(old, &new)) { | ||
493 | old->start = new.start; | ||
494 | old->end = new.end; | ||
495 | } else { | ||
496 | __release_resource(old); | ||
497 | *old = new; | ||
498 | conflict = __request_resource(root, old); | ||
499 | BUG_ON(conflict); | ||
500 | } | ||
501 | out: | ||
502 | write_unlock(&resource_lock); | ||
503 | return err; | ||
504 | } | ||
505 | |||
506 | |||
507 | /** | ||
508 | * allocate_resource - allocate empty slot in the resource tree given range & alignment. | ||
509 | * The resource will be reallocated with a new size if it was already allocated | ||
443 | * @root: root resource descriptor | 510 | * @root: root resource descriptor |
444 | * @new: resource descriptor desired by caller | 511 | * @new: resource descriptor desired by caller |
445 | * @size: requested resource region size | 512 | * @size: requested resource region size |
@@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
459 | void *alignf_data) | 526 | void *alignf_data) |
460 | { | 527 | { |
461 | int err; | 528 | int err; |
529 | struct resource_constraint constraint; | ||
462 | 530 | ||
463 | if (!alignf) | 531 | if (!alignf) |
464 | alignf = simple_align_resource; | 532 | alignf = simple_align_resource; |
465 | 533 | ||
534 | constraint.min = min; | ||
535 | constraint.max = max; | ||
536 | constraint.align = align; | ||
537 | constraint.alignf = alignf; | ||
538 | constraint.alignf_data = alignf_data; | ||
539 | |||
540 | if ( new->parent ) { | ||
541 | /* resource is already allocated, try reallocating with | ||
542 | the new constraints */ | ||
543 | return reallocate_resource(root, new, size, &constraint); | ||
544 | } | ||
545 | |||
466 | write_lock(&resource_lock); | 546 | write_lock(&resource_lock); |
467 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | 547 | err = find_resource(root, new, size, &constraint); |
468 | if (err >= 0 && __request_resource(root, new)) | 548 | if (err >= 0 && __request_resource(root, new)) |
469 | err = -EBUSY; | 549 | err = -EBUSY; |
470 | write_unlock(&resource_lock); | 550 | write_unlock(&resource_lock); |
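With the constraint structure factored out, allocate_resource() keeps its old eight-argument signature; the only behavioural change visible to callers is that a resource which already has a parent is routed to reallocate_resource(). A hedged sketch of a caller, with a made-up resource name, range and alignment:

    /* Sketch only: hypothetical caller; real users are bus and driver code. */
    #include <linux/ioport.h>

    static struct resource my_window = {
            .name  = "my-device-window",            /* hypothetical */
            .flags = IORESOURCE_MEM,
    };

    static int my_claim_window(resource_size_t size)
    {
            /*
             * First call: my_window has no parent, so the find_resource() path
             * runs and the region is inserted under iomem_resource.  A later
             * call with a bigger size takes the new reallocate_resource() path,
             * because ->parent is set by then, and tries to grow the region in
             * place before falling back to moving it.
             */
            return allocate_resource(&iomem_resource, &my_window, size,
                                     0, (resource_size_t)-1,   /* min, max: anywhere */
                                     0x1000,                   /* 4 KiB alignment    */
                                     NULL, NULL);              /* default alignf     */
    }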
diff --git a/kernel/sched.c b/kernel/sched.c index 312f8b95c2d4..3dc716f6d8ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
231 | #endif | 231 | #endif |
232 | 232 | ||
233 | /* | 233 | /* |
234 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 234 | * sched_domains_mutex serializes calls to init_sched_domains, |
235 | * detach_destroy_domains and partition_sched_domains. | 235 | * detach_destroy_domains and partition_sched_domains. |
236 | */ | 236 | */ |
237 | static DEFINE_MUTEX(sched_domains_mutex); | 237 | static DEFINE_MUTEX(sched_domains_mutex); |
@@ -292,7 +292,7 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
292 | * (The default weight is 1024 - so there's no practical | 292 | * (The default weight is 1024 - so there's no practical |
293 | * limitation from this.) | 293 | * limitation from this.) |
294 | */ | 294 | */ |
295 | #define MIN_SHARES 2 | 295 | #define MIN_SHARES (1UL << 1) |
296 | #define MAX_SHARES (1UL << 18) | 296 | #define MAX_SHARES (1UL << 18) |
297 | 297 | ||
298 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | 298 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
@@ -312,6 +312,9 @@ struct cfs_rq { | |||
312 | 312 | ||
313 | u64 exec_clock; | 313 | u64 exec_clock; |
314 | u64 min_vruntime; | 314 | u64 min_vruntime; |
315 | #ifndef CONFIG_64BIT | ||
316 | u64 min_vruntime_copy; | ||
317 | #endif | ||
315 | 318 | ||
316 | struct rb_root tasks_timeline; | 319 | struct rb_root tasks_timeline; |
317 | struct rb_node *rb_leftmost; | 320 | struct rb_node *rb_leftmost; |
@@ -325,7 +328,9 @@ struct cfs_rq { | |||
325 | */ | 328 | */ |
326 | struct sched_entity *curr, *next, *last, *skip; | 329 | struct sched_entity *curr, *next, *last, *skip; |
327 | 330 | ||
331 | #ifdef CONFIG_SCHED_DEBUG | ||
328 | unsigned int nr_spread_over; | 332 | unsigned int nr_spread_over; |
333 | #endif | ||
329 | 334 | ||
330 | #ifdef CONFIG_FAIR_GROUP_SCHED | 335 | #ifdef CONFIG_FAIR_GROUP_SCHED |
331 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 336 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -417,6 +422,7 @@ struct rt_rq { | |||
417 | */ | 422 | */ |
418 | struct root_domain { | 423 | struct root_domain { |
419 | atomic_t refcount; | 424 | atomic_t refcount; |
425 | struct rcu_head rcu; | ||
420 | cpumask_var_t span; | 426 | cpumask_var_t span; |
421 | cpumask_var_t online; | 427 | cpumask_var_t online; |
422 | 428 | ||
@@ -460,7 +466,7 @@ struct rq { | |||
460 | u64 nohz_stamp; | 466 | u64 nohz_stamp; |
461 | unsigned char nohz_balance_kick; | 467 | unsigned char nohz_balance_kick; |
462 | #endif | 468 | #endif |
463 | unsigned int skip_clock_update; | 469 | int skip_clock_update; |
464 | 470 | ||
465 | /* capture load from *all* tasks on this cpu: */ | 471 | /* capture load from *all* tasks on this cpu: */ |
466 | struct load_weight load; | 472 | struct load_weight load; |
@@ -553,6 +559,10 @@ struct rq { | |||
553 | unsigned int ttwu_count; | 559 | unsigned int ttwu_count; |
554 | unsigned int ttwu_local; | 560 | unsigned int ttwu_local; |
555 | #endif | 561 | #endif |
562 | |||
563 | #ifdef CONFIG_SMP | ||
564 | struct task_struct *wake_list; | ||
565 | #endif | ||
556 | }; | 566 | }; |
557 | 567 | ||
558 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 568 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq) | |||
571 | 581 | ||
572 | #define rcu_dereference_check_sched_domain(p) \ | 582 | #define rcu_dereference_check_sched_domain(p) \ |
573 | rcu_dereference_check((p), \ | 583 | rcu_dereference_check((p), \ |
574 | rcu_read_lock_sched_held() || \ | 584 | rcu_read_lock_held() || \ |
575 | lockdep_is_held(&sched_domains_mutex)) | 585 | lockdep_is_held(&sched_domains_mutex)) |
576 | 586 | ||
577 | /* | 587 | /* |
@@ -595,10 +605,10 @@ static inline int cpu_of(struct rq *rq) | |||
595 | /* | 605 | /* |
596 | * Return the group to which this tasks belongs. | 606 | * Return the group to which this tasks belongs. |
597 | * | 607 | * |
598 | * We use task_subsys_state_check() and extend the RCU verification | 608 | * We use task_subsys_state_check() and extend the RCU verification with |
599 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 609 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each |
600 | * holds that lock for each task it moves into the cgroup. Therefore | 610 | * task it moves into the cgroup. Therefore by holding either of those locks, |
601 | * by holding that lock, we pin the task to the current cgroup. | 611 | * we pin the task to the current cgroup. |
602 | */ | 612 | */ |
603 | static inline struct task_group *task_group(struct task_struct *p) | 613 | static inline struct task_group *task_group(struct task_struct *p) |
604 | { | 614 | { |
@@ -606,6 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
606 | struct cgroup_subsys_state *css; | 616 | struct cgroup_subsys_state *css; |
607 | 617 | ||
608 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 618 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
619 | lockdep_is_held(&p->pi_lock) || | ||
609 | lockdep_is_held(&task_rq(p)->lock)); | 620 | lockdep_is_held(&task_rq(p)->lock)); |
610 | tg = container_of(css, struct task_group, css); | 621 | tg = container_of(css, struct task_group, css); |
611 | 622 | ||
@@ -642,7 +653,7 @@ static void update_rq_clock(struct rq *rq) | |||
642 | { | 653 | { |
643 | s64 delta; | 654 | s64 delta; |
644 | 655 | ||
645 | if (rq->skip_clock_update) | 656 | if (rq->skip_clock_update > 0) |
646 | return; | 657 | return; |
647 | 658 | ||
648 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 659 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
@@ -838,18 +849,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
838 | return rq->curr == p; | 849 | return rq->curr == p; |
839 | } | 850 | } |
840 | 851 | ||
841 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
842 | static inline int task_running(struct rq *rq, struct task_struct *p) | 852 | static inline int task_running(struct rq *rq, struct task_struct *p) |
843 | { | 853 | { |
854 | #ifdef CONFIG_SMP | ||
855 | return p->on_cpu; | ||
856 | #else | ||
844 | return task_current(rq, p); | 857 | return task_current(rq, p); |
858 | #endif | ||
845 | } | 859 | } |
846 | 860 | ||
861 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
847 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 862 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
848 | { | 863 | { |
864 | #ifdef CONFIG_SMP | ||
865 | /* | ||
866 | * We can optimise this out completely for !SMP, because the | ||
867 | * SMP rebalancing from interrupt is the only thing that cares | ||
868 | * here. | ||
869 | */ | ||
870 | next->on_cpu = 1; | ||
871 | #endif | ||
849 | } | 872 | } |
850 | 873 | ||
851 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 874 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
852 | { | 875 | { |
876 | #ifdef CONFIG_SMP | ||
877 | /* | ||
878 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
879 | * We must ensure this doesn't happen until the switch is completely | ||
880 | * finished. | ||
881 | */ | ||
882 | smp_wmb(); | ||
883 | prev->on_cpu = 0; | ||
884 | #endif | ||
853 | #ifdef CONFIG_DEBUG_SPINLOCK | 885 | #ifdef CONFIG_DEBUG_SPINLOCK |
854 | /* this is a valid case when another task releases the spinlock */ | 886 | /* this is a valid case when another task releases the spinlock */ |
855 | rq->lock.owner = current; | 887 | rq->lock.owner = current; |
@@ -865,15 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
865 | } | 897 | } |
866 | 898 | ||
867 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 899 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
868 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
869 | { | ||
870 | #ifdef CONFIG_SMP | ||
871 | return p->oncpu; | ||
872 | #else | ||
873 | return task_current(rq, p); | ||
874 | #endif | ||
875 | } | ||
876 | |||
877 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 900 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
878 | { | 901 | { |
879 | #ifdef CONFIG_SMP | 902 | #ifdef CONFIG_SMP |
@@ -882,7 +905,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
882 | * SMP rebalancing from interrupt is the only thing that cares | 905 | * SMP rebalancing from interrupt is the only thing that cares |
883 | * here. | 906 | * here. |
884 | */ | 907 | */ |
885 | next->oncpu = 1; | 908 | next->on_cpu = 1; |
886 | #endif | 909 | #endif |
887 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 910 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
888 | raw_spin_unlock_irq(&rq->lock); | 911 | raw_spin_unlock_irq(&rq->lock); |
@@ -895,12 +918,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
895 | { | 918 | { |
896 | #ifdef CONFIG_SMP | 919 | #ifdef CONFIG_SMP |
897 | /* | 920 | /* |
898 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 921 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
899 | * We must ensure this doesn't happen until the switch is completely | 922 | * We must ensure this doesn't happen until the switch is completely |
900 | * finished. | 923 | * finished. |
901 | */ | 924 | */ |
902 | smp_wmb(); | 925 | smp_wmb(); |
903 | prev->oncpu = 0; | 926 | prev->on_cpu = 0; |
904 | #endif | 927 | #endif |
905 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 928 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
906 | local_irq_enable(); | 929 | local_irq_enable(); |
@@ -909,23 +932,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
909 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 932 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
910 | 933 | ||
911 | /* | 934 | /* |
912 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 935 | * __task_rq_lock - lock the rq @p resides on. |
913 | * against ttwu(). | ||
914 | */ | ||
915 | static inline int task_is_waking(struct task_struct *p) | ||
916 | { | ||
917 | return unlikely(p->state == TASK_WAKING); | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
922 | * Must be called interrupts disabled. | ||
923 | */ | 936 | */ |
924 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 937 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
925 | __acquires(rq->lock) | 938 | __acquires(rq->lock) |
926 | { | 939 | { |
927 | struct rq *rq; | 940 | struct rq *rq; |
928 | 941 | ||
942 | lockdep_assert_held(&p->pi_lock); | ||
943 | |||
929 | for (;;) { | 944 | for (;;) { |
930 | rq = task_rq(p); | 945 | rq = task_rq(p); |
931 | raw_spin_lock(&rq->lock); | 946 | raw_spin_lock(&rq->lock); |
@@ -936,22 +951,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
936 | } | 951 | } |
937 | 952 | ||
938 | /* | 953 | /* |
939 | * task_rq_lock - lock the runqueue a given task resides on and disable | 954 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
940 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
941 | * explicitly disabling preemption. | ||
942 | */ | 955 | */ |
943 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 956 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
957 | __acquires(p->pi_lock) | ||
944 | __acquires(rq->lock) | 958 | __acquires(rq->lock) |
945 | { | 959 | { |
946 | struct rq *rq; | 960 | struct rq *rq; |
947 | 961 | ||
948 | for (;;) { | 962 | for (;;) { |
949 | local_irq_save(*flags); | 963 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
950 | rq = task_rq(p); | 964 | rq = task_rq(p); |
951 | raw_spin_lock(&rq->lock); | 965 | raw_spin_lock(&rq->lock); |
952 | if (likely(rq == task_rq(p))) | 966 | if (likely(rq == task_rq(p))) |
953 | return rq; | 967 | return rq; |
954 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 968 | raw_spin_unlock(&rq->lock); |
969 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
955 | } | 970 | } |
956 | } | 971 | } |
957 | 972 | ||
@@ -961,10 +976,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
961 | raw_spin_unlock(&rq->lock); | 976 | raw_spin_unlock(&rq->lock); |
962 | } | 977 | } |
963 | 978 | ||
964 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 979 | static inline void |
980 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
965 | __releases(rq->lock) | 981 | __releases(rq->lock) |
982 | __releases(p->pi_lock) | ||
966 | { | 983 | { |
967 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 984 | raw_spin_unlock(&rq->lock); |
985 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
968 | } | 986 | } |
969 | 987 | ||
970 | /* | 988 | /* |
@@ -1193,11 +1211,17 @@ int get_nohz_timer_target(void) | |||
1193 | int i; | 1211 | int i; |
1194 | struct sched_domain *sd; | 1212 | struct sched_domain *sd; |
1195 | 1213 | ||
1214 | rcu_read_lock(); | ||
1196 | for_each_domain(cpu, sd) { | 1215 | for_each_domain(cpu, sd) { |
1197 | for_each_cpu(i, sched_domain_span(sd)) | 1216 | for_each_cpu(i, sched_domain_span(sd)) { |
1198 | if (!idle_cpu(i)) | 1217 | if (!idle_cpu(i)) { |
1199 | return i; | 1218 | cpu = i; |
1219 | goto unlock; | ||
1220 | } | ||
1221 | } | ||
1200 | } | 1222 | } |
1223 | unlock: | ||
1224 | rcu_read_unlock(); | ||
1201 | return cpu; | 1225 | return cpu; |
1202 | } | 1226 | } |
1203 | /* | 1227 | /* |
@@ -1307,15 +1331,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1307 | { | 1331 | { |
1308 | u64 tmp; | 1332 | u64 tmp; |
1309 | 1333 | ||
1334 | /* | ||
1335 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1336 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1337 | * 2^SCHED_LOAD_RESOLUTION. | ||
1338 | */ | ||
1339 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1340 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1341 | else | ||
1342 | tmp = (u64)delta_exec; | ||
1343 | |||
1310 | if (!lw->inv_weight) { | 1344 | if (!lw->inv_weight) { |
1311 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1345 | unsigned long w = scale_load_down(lw->weight); |
1346 | |||
1347 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1312 | lw->inv_weight = 1; | 1348 | lw->inv_weight = 1; |
1349 | else if (unlikely(!w)) | ||
1350 | lw->inv_weight = WMULT_CONST; | ||
1313 | else | 1351 | else |
1314 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1352 | lw->inv_weight = WMULT_CONST / w; |
1315 | / (lw->weight+1); | ||
1316 | } | 1353 | } |
1317 | 1354 | ||
1318 | tmp = (u64)delta_exec * weight; | ||
1319 | /* | 1355 | /* |
1320 | * Check whether we'd overflow the 64-bit multiplication: | 1356 | * Check whether we'd overflow the 64-bit multiplication: |
1321 | */ | 1357 | */ |
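The reworked calc_delta_mine() still computes delta_exec * weight / lw->weight; what changes is that the divide is done once per weight, cached as a fixed-point reciprocal (inv_weight), and reused as a multiply-and-shift. A small user-space sketch of that trick; RECIP_ONE and RECIP_SHIFT are illustrative stand-ins, not the kernel's actual WMULT_CONST and WMULT_SHIFT values, and the kernel's guard against overflowing the 64-bit multiply is omitted:

    /* Sketch: (delta * weight) / total via a cached reciprocal, as in calc_delta_mine(). */
    #include <stdint.h>
    #include <stdio.h>

    #define RECIP_ONE   (1ULL << 32)        /* stand-in for WMULT_CONST */
    #define RECIP_SHIFT 32                  /* stand-in for WMULT_SHIFT */

    struct load {
            uint64_t weight;
            uint64_t inv_weight;            /* RECIP_ONE / weight, filled lazily */
    };

    static uint64_t calc_delta(uint64_t delta_exec, uint64_t weight, struct load *lw)
    {
            if (!lw->inv_weight)
                    lw->inv_weight = RECIP_ONE / lw->weight;

            /* delta_exec * weight * (RECIP_ONE / lw->weight) >> RECIP_SHIFT */
            return (delta_exec * weight * lw->inv_weight) >> RECIP_SHIFT;
    }

    int main(void)
    {
            struct load lw = { .weight = 3072, .inv_weight = 0 };

            /* 100000 * 1024 / 3072 = 33333 after truncation */
            printf("%llu\n", (unsigned long long)calc_delta(100000, 1024, &lw));
            return 0;
    }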
@@ -1755,17 +1791,20 @@ static void dec_nr_running(struct rq *rq) | |||
1755 | 1791 | ||
1756 | static void set_load_weight(struct task_struct *p) | 1792 | static void set_load_weight(struct task_struct *p) |
1757 | { | 1793 | { |
1794 | int prio = p->static_prio - MAX_RT_PRIO; | ||
1795 | struct load_weight *load = &p->se.load; | ||
1796 | |||
1758 | /* | 1797 | /* |
1759 | * SCHED_IDLE tasks get minimal weight: | 1798 | * SCHED_IDLE tasks get minimal weight: |
1760 | */ | 1799 | */ |
1761 | if (p->policy == SCHED_IDLE) { | 1800 | if (p->policy == SCHED_IDLE) { |
1762 | p->se.load.weight = WEIGHT_IDLEPRIO; | 1801 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
1763 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 1802 | load->inv_weight = WMULT_IDLEPRIO; |
1764 | return; | 1803 | return; |
1765 | } | 1804 | } |
1766 | 1805 | ||
1767 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 1806 | load->weight = scale_load(prio_to_weight[prio]); |
1768 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1807 | load->inv_weight = prio_to_wmult[prio]; |
1769 | } | 1808 | } |
1770 | 1809 | ||
1771 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1810 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1773,7 +1812,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1773 | update_rq_clock(rq); | 1812 | update_rq_clock(rq); |
1774 | sched_info_queued(p); | 1813 | sched_info_queued(p); |
1775 | p->sched_class->enqueue_task(rq, p, flags); | 1814 | p->sched_class->enqueue_task(rq, p, flags); |
1776 | p->se.on_rq = 1; | ||
1777 | } | 1815 | } |
1778 | 1816 | ||
1779 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1817 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1781,7 +1819,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1781 | update_rq_clock(rq); | 1819 | update_rq_clock(rq); |
1782 | sched_info_dequeued(p); | 1820 | sched_info_dequeued(p); |
1783 | p->sched_class->dequeue_task(rq, p, flags); | 1821 | p->sched_class->dequeue_task(rq, p, flags); |
1784 | p->se.on_rq = 0; | ||
1785 | } | 1822 | } |
1786 | 1823 | ||
1787 | /* | 1824 | /* |
@@ -2116,7 +2153,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2116 | * A queue event has occurred, and we're going to schedule. In | 2153 | * A queue event has occurred, and we're going to schedule. In |
2117 | * this case, we can save a useless back to back clock update. | 2154 | * this case, we can save a useless back to back clock update. |
2118 | */ | 2155 | */ |
2119 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | 2156 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) |
2120 | rq->skip_clock_update = 1; | 2157 | rq->skip_clock_update = 1; |
2121 | } | 2158 | } |
2122 | 2159 | ||
@@ -2162,6 +2199,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2162 | */ | 2199 | */ |
2163 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2200 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2164 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2201 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2202 | |||
2203 | #ifdef CONFIG_LOCKDEP | ||
2204 | /* | ||
2205 | * The caller should hold either p->pi_lock or rq->lock, when changing | ||
2206 | * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. | ||
2207 | * | ||
2208 | * sched_move_task() holds both and thus holding either pins the cgroup, | ||
2209 | * see set_task_rq(). | ||
2210 | * | ||
2211 | * Furthermore, all task_rq users should acquire both locks, see | ||
2212 | * task_rq_lock(). | ||
2213 | */ | ||
2214 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
2215 | lockdep_is_held(&task_rq(p)->lock))); | ||
2216 | #endif | ||
2165 | #endif | 2217 | #endif |
2166 | 2218 | ||
2167 | trace_sched_migrate_task(p, new_cpu); | 2219 | trace_sched_migrate_task(p, new_cpu); |
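The new WARN_ON_ONCE turns the locking rule into something lockdep can actually check: whoever changes a task's CPU must hold either p->pi_lock (wakeup paths) or that task's rq->lock (runnable paths). A hedged sketch of the same "either of two locks" assertion, applied to a hypothetical pair of locks rather than the scheduler's:

    /* Sketch only: hypothetical locks, same lockdep_is_held() idiom as set_task_cpu(). */
    #include <linux/debug_locks.h>
    #include <linux/kernel.h>
    #include <linux/lockdep.h>
    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(my_wakeup_lock);     /* hypothetical */
    static DEFINE_RAW_SPINLOCK(my_queue_lock);      /* hypothetical */

    static void my_move_entry(void)
    {
    #ifdef CONFIG_LOCKDEP
            /* Fires once if a caller holds neither lock while lockdep is active. */
            WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&my_wakeup_lock) ||
                                          lockdep_is_held(&my_queue_lock)));
    #endif
            /* ... touch state that may be reached through either lock ... */
    }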
@@ -2182,19 +2234,6 @@ struct migration_arg { | |||
2182 | static int migration_cpu_stop(void *data); | 2234 | static int migration_cpu_stop(void *data); |
2183 | 2235 | ||
2184 | /* | 2236 | /* |
2185 | * The task's runqueue lock must be held. | ||
2186 | * Returns true if you have to wait for migration thread. | ||
2187 | */ | ||
2188 | static bool migrate_task(struct task_struct *p, struct rq *rq) | ||
2189 | { | ||
2190 | /* | ||
2191 | * If the task is not on a runqueue (and not running), then | ||
2192 | * the next wake-up will properly place the task. | ||
2193 | */ | ||
2194 | return p->se.on_rq || task_running(rq, p); | ||
2195 | } | ||
2196 | |||
2197 | /* | ||
2198 | * wait_task_inactive - wait for a thread to unschedule. | 2237 | * wait_task_inactive - wait for a thread to unschedule. |
2199 | * | 2238 | * |
2200 | * If @match_state is nonzero, it's the @p->state value just checked and | 2239 | * If @match_state is nonzero, it's the @p->state value just checked and |
@@ -2251,11 +2290,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2251 | rq = task_rq_lock(p, &flags); | 2290 | rq = task_rq_lock(p, &flags); |
2252 | trace_sched_wait_task(p); | 2291 | trace_sched_wait_task(p); |
2253 | running = task_running(rq, p); | 2292 | running = task_running(rq, p); |
2254 | on_rq = p->se.on_rq; | 2293 | on_rq = p->on_rq; |
2255 | ncsw = 0; | 2294 | ncsw = 0; |
2256 | if (!match_state || p->state == match_state) | 2295 | if (!match_state || p->state == match_state) |
2257 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2296 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2258 | task_rq_unlock(rq, &flags); | 2297 | task_rq_unlock(rq, p, &flags); |
2259 | 2298 | ||
2260 | /* | 2299 | /* |
2261 | * If it changed from the expected state, bail out now. | 2300 | * If it changed from the expected state, bail out now. |
@@ -2330,7 +2369,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
2330 | 2369 | ||
2331 | #ifdef CONFIG_SMP | 2370 | #ifdef CONFIG_SMP |
2332 | /* | 2371 | /* |
2333 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2372 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2334 | */ | 2373 | */ |
2335 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2374 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2336 | { | 2375 | { |
@@ -2363,12 +2402,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2363 | } | 2402 | } |
2364 | 2403 | ||
2365 | /* | 2404 | /* |
2366 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2405 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2367 | */ | 2406 | */ |
2368 | static inline | 2407 | static inline |
2369 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2408 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2370 | { | 2409 | { |
2371 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2410 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2372 | 2411 | ||
2373 | /* | 2412 | /* |
2374 | * In order not to call set_task_cpu() on a blocking task we need | 2413 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -2394,27 +2433,63 @@ static void update_avg(u64 *avg, u64 sample) | |||
2394 | } | 2433 | } |
2395 | #endif | 2434 | #endif |
2396 | 2435 | ||
2397 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2436 | static void |
2398 | bool is_sync, bool is_migrate, bool is_local, | 2437 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2399 | unsigned long en_flags) | ||
2400 | { | 2438 | { |
2401 | schedstat_inc(p, se.statistics.nr_wakeups); | 2439 | #ifdef CONFIG_SCHEDSTATS |
2402 | if (is_sync) | 2440 | struct rq *rq = this_rq(); |
2403 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2441 | |
2404 | if (is_migrate) | 2442 | #ifdef CONFIG_SMP |
2405 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2443 | int this_cpu = smp_processor_id(); |
2406 | if (is_local) | 2444 | |
2445 | if (cpu == this_cpu) { | ||
2446 | schedstat_inc(rq, ttwu_local); | ||
2407 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 2447 | schedstat_inc(p, se.statistics.nr_wakeups_local); |
2408 | else | 2448 | } else { |
2449 | struct sched_domain *sd; | ||
2450 | |||
2409 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 2451 | schedstat_inc(p, se.statistics.nr_wakeups_remote); |
2452 | rcu_read_lock(); | ||
2453 | for_each_domain(this_cpu, sd) { | ||
2454 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2455 | schedstat_inc(sd, ttwu_wake_remote); | ||
2456 | break; | ||
2457 | } | ||
2458 | } | ||
2459 | rcu_read_unlock(); | ||
2460 | } | ||
2461 | |||
2462 | if (wake_flags & WF_MIGRATED) | ||
2463 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2464 | |||
2465 | #endif /* CONFIG_SMP */ | ||
2466 | |||
2467 | schedstat_inc(rq, ttwu_count); | ||
2468 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
2410 | 2469 | ||
2470 | if (wake_flags & WF_SYNC) | ||
2471 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2472 | |||
2473 | #endif /* CONFIG_SCHEDSTATS */ | ||
2474 | } | ||
2475 | |||
2476 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
2477 | { | ||
2411 | activate_task(rq, p, en_flags); | 2478 | activate_task(rq, p, en_flags); |
2479 | p->on_rq = 1; | ||
2480 | |||
2481 | /* if a worker is waking up, notify workqueue */ | ||
2482 | if (p->flags & PF_WQ_WORKER) | ||
2483 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2412 | } | 2484 | } |
2413 | 2485 | ||
2414 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2486 | /* |
2415 | int wake_flags, bool success) | 2487 | * Mark the task runnable and perform wakeup-preemption. |
2488 | */ | ||
2489 | static void | ||
2490 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2416 | { | 2491 | { |
2417 | trace_sched_wakeup(p, success); | 2492 | trace_sched_wakeup(p, true); |
2418 | check_preempt_curr(rq, p, wake_flags); | 2493 | check_preempt_curr(rq, p, wake_flags); |
2419 | 2494 | ||
2420 | p->state = TASK_RUNNING; | 2495 | p->state = TASK_RUNNING; |
@@ -2433,9 +2508,119 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2433 | rq->idle_stamp = 0; | 2508 | rq->idle_stamp = 0; |
2434 | } | 2509 | } |
2435 | #endif | 2510 | #endif |
2436 | /* if a worker is waking up, notify workqueue */ | 2511 | } |
2437 | if ((p->flags & PF_WQ_WORKER) && success) | 2512 | |
2438 | wq_worker_waking_up(p, cpu_of(rq)); | 2513 | static void |
2514 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2515 | { | ||
2516 | #ifdef CONFIG_SMP | ||
2517 | if (p->sched_contributes_to_load) | ||
2518 | rq->nr_uninterruptible--; | ||
2519 | #endif | ||
2520 | |||
2521 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
2522 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2523 | } | ||
2524 | |||
2525 | /* | ||
2526 | * Called when the task @p isn't fully descheduled from its runqueue; | ||
2527 | * in that case we must do a remote wakeup. It's a 'light' wakeup though, | ||
2528 | * since all we need to do is flip p->state to TASK_RUNNING: the task | ||
2529 | * is still ->on_rq. | ||
2530 | */ | ||
2531 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
2532 | { | ||
2533 | struct rq *rq; | ||
2534 | int ret = 0; | ||
2535 | |||
2536 | rq = __task_rq_lock(p); | ||
2537 | if (p->on_rq) { | ||
2538 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2539 | ret = 1; | ||
2540 | } | ||
2541 | __task_rq_unlock(rq); | ||
2542 | |||
2543 | return ret; | ||
2544 | } | ||
2545 | |||
2546 | #ifdef CONFIG_SMP | ||
2547 | static void sched_ttwu_pending(void) | ||
2548 | { | ||
2549 | struct rq *rq = this_rq(); | ||
2550 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2551 | |||
2552 | if (!list) | ||
2553 | return; | ||
2554 | |||
2555 | raw_spin_lock(&rq->lock); | ||
2556 | |||
2557 | while (list) { | ||
2558 | struct task_struct *p = list; | ||
2559 | list = list->wake_entry; | ||
2560 | ttwu_do_activate(rq, p, 0); | ||
2561 | } | ||
2562 | |||
2563 | raw_spin_unlock(&rq->lock); | ||
2564 | } | ||
2565 | |||
2566 | void scheduler_ipi(void) | ||
2567 | { | ||
2568 | sched_ttwu_pending(); | ||
2569 | } | ||
2570 | |||
2571 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
2572 | { | ||
2573 | struct rq *rq = cpu_rq(cpu); | ||
2574 | struct task_struct *next = rq->wake_list; | ||
2575 | |||
2576 | for (;;) { | ||
2577 | struct task_struct *old = next; | ||
2578 | |||
2579 | p->wake_entry = next; | ||
2580 | next = cmpxchg(&rq->wake_list, old, p); | ||
2581 | if (next == old) | ||
2582 | break; | ||
2583 | } | ||
2584 | |||
2585 | if (!next) | ||
2586 | smp_send_reschedule(cpu); | ||
2587 | } | ||
2588 | |||
2589 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
2590 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
2591 | { | ||
2592 | struct rq *rq; | ||
2593 | int ret = 0; | ||
2594 | |||
2595 | rq = __task_rq_lock(p); | ||
2596 | if (p->on_cpu) { | ||
2597 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
2598 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2599 | ret = 1; | ||
2600 | } | ||
2601 | __task_rq_unlock(rq); | ||
2602 | |||
2603 | return ret; | ||
2604 | |||
2605 | } | ||
2606 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
2607 | #endif /* CONFIG_SMP */ | ||
2608 | |||
2609 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
2610 | { | ||
2611 | struct rq *rq = cpu_rq(cpu); | ||
2612 | |||
2613 | #if defined(CONFIG_SMP) | ||
2614 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
2615 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | ||
2616 | ttwu_queue_remote(p, cpu); | ||
2617 | return; | ||
2618 | } | ||
2619 | #endif | ||
2620 | |||
2621 | raw_spin_lock(&rq->lock); | ||
2622 | ttwu_do_activate(rq, p, 0); | ||
2623 | raw_spin_unlock(&rq->lock); | ||
2439 | } | 2624 | } |
2440 | 2625 | ||
2441 | /** | 2626 | /** |
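ttwu_queue_remote() above is a lock-free push onto rq->wake_list: a cmpxchg() loop links the task in, and the reschedule IPI is only sent when the list was previously empty, since a non-empty list means an IPI is already on its way. sched_ttwu_pending() then detaches the whole list and activates each entry under rq->lock. The same push/drain shape, sketched with C11 atomics standing in for the kernel's cmpxchg()/xchg():

    /* Sketch: lock-free multi-producer push plus single-consumer drain. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct node {
            struct node *next;
            int payload;
    };

    static _Atomic(struct node *) pending_list;     /* plays the role of rq->wake_list */

    /* Push n; returns true if the list was empty, i.e. the "send the IPI" case. */
    static bool push_pending(struct node *n)
    {
            struct node *old = atomic_load(&pending_list);

            do {
                    n->next = old;
                    /* on failure, old is refreshed with the current head */
            } while (!atomic_compare_exchange_weak(&pending_list, &old, n));

            return old == NULL;
    }

    /* Detach and handle everything queued so far, like sched_ttwu_pending(). */
    static void drain_pending(void (*handle)(struct node *))
    {
            struct node *list = atomic_exchange(&pending_list, NULL);

            while (list) {
                    struct node *next = list->next;

                    handle(list);
                    list = next;
            }
    }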
@@ -2453,92 +2638,66 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2453 | * Returns %true if @p was woken up, %false if it was already running | 2638 | * Returns %true if @p was woken up, %false if it was already running |
2454 | * or @state didn't match @p's state. | 2639 | * or @state didn't match @p's state. |
2455 | */ | 2640 | */ |
2456 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2641 | static int |
2457 | int wake_flags) | 2642 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2458 | { | 2643 | { |
2459 | int cpu, orig_cpu, this_cpu, success = 0; | ||
2460 | unsigned long flags; | 2644 | unsigned long flags; |
2461 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2645 | int cpu, success = 0; |
2462 | struct rq *rq; | ||
2463 | |||
2464 | this_cpu = get_cpu(); | ||
2465 | 2646 | ||
2466 | smp_wmb(); | 2647 | smp_wmb(); |
2467 | rq = task_rq_lock(p, &flags); | 2648 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2468 | if (!(p->state & state)) | 2649 | if (!(p->state & state)) |
2469 | goto out; | 2650 | goto out; |
2470 | 2651 | ||
2471 | if (p->se.on_rq) | 2652 | success = 1; /* we're going to change ->state */ |
2472 | goto out_running; | ||
2473 | |||
2474 | cpu = task_cpu(p); | 2653 | cpu = task_cpu(p); |
2475 | orig_cpu = cpu; | ||
2476 | 2654 | ||
2477 | #ifdef CONFIG_SMP | 2655 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2478 | if (unlikely(task_running(rq, p))) | 2656 | goto stat; |
2479 | goto out_activate; | ||
2480 | 2657 | ||
2658 | #ifdef CONFIG_SMP | ||
2481 | /* | 2659 | /* |
2482 | * In order to handle concurrent wakeups and release the rq->lock | 2660 | * If the owning (remote) cpu is still in the middle of schedule() with |
2483 | * we put the task in TASK_WAKING state. | 2661 | * this task as prev, wait until its done referencing the task. |
2484 | * | ||
2485 | * First fix up the nr_uninterruptible count: | ||
2486 | */ | 2662 | */ |
2487 | if (task_contributes_to_load(p)) { | 2663 | while (p->on_cpu) { |
2488 | if (likely(cpu_online(orig_cpu))) | 2664 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2489 | rq->nr_uninterruptible--; | 2665 | /* |
2490 | else | 2666 | * In case the architecture enables interrupts in |
2491 | this_rq()->nr_uninterruptible--; | 2667 | * context_switch(), we cannot busy wait, since that |
2668 | * would lead to deadlocks when an interrupt hits and | ||
2669 | * tries to wake up @prev. So bail and do a complete | ||
2670 | * remote wakeup. | ||
2671 | */ | ||
2672 | if (ttwu_activate_remote(p, wake_flags)) | ||
2673 | goto stat; | ||
2674 | #else | ||
2675 | cpu_relax(); | ||
2676 | #endif | ||
2492 | } | 2677 | } |
2678 | /* | ||
2679 | * Pairs with the smp_wmb() in finish_lock_switch(). | ||
2680 | */ | ||
2681 | smp_rmb(); | ||
2682 | |||
2683 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | ||
2493 | p->state = TASK_WAKING; | 2684 | p->state = TASK_WAKING; |
2494 | 2685 | ||
2495 | if (p->sched_class->task_waking) { | 2686 | if (p->sched_class->task_waking) |
2496 | p->sched_class->task_waking(rq, p); | 2687 | p->sched_class->task_waking(p); |
2497 | en_flags |= ENQUEUE_WAKING; | ||
2498 | } | ||
2499 | 2688 | ||
2500 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | 2689 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
2501 | if (cpu != orig_cpu) | 2690 | if (task_cpu(p) != cpu) { |
2691 | wake_flags |= WF_MIGRATED; | ||
2502 | set_task_cpu(p, cpu); | 2692 | set_task_cpu(p, cpu); |
2503 | __task_rq_unlock(rq); | ||
2504 | |||
2505 | rq = cpu_rq(cpu); | ||
2506 | raw_spin_lock(&rq->lock); | ||
2507 | |||
2508 | /* | ||
2509 | * We migrated the task without holding either rq->lock, however | ||
2510 | * since the task is not on the task list itself, nobody else | ||
2511 | * will try and migrate the task, hence the rq should match the | ||
2512 | * cpu we just moved it to. | ||
2513 | */ | ||
2514 | WARN_ON(task_cpu(p) != cpu); | ||
2515 | WARN_ON(p->state != TASK_WAKING); | ||
2516 | |||
2517 | #ifdef CONFIG_SCHEDSTATS | ||
2518 | schedstat_inc(rq, ttwu_count); | ||
2519 | if (cpu == this_cpu) | ||
2520 | schedstat_inc(rq, ttwu_local); | ||
2521 | else { | ||
2522 | struct sched_domain *sd; | ||
2523 | for_each_domain(this_cpu, sd) { | ||
2524 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2525 | schedstat_inc(sd, ttwu_wake_remote); | ||
2526 | break; | ||
2527 | } | ||
2528 | } | ||
2529 | } | 2693 | } |
2530 | #endif /* CONFIG_SCHEDSTATS */ | ||
2531 | |||
2532 | out_activate: | ||
2533 | #endif /* CONFIG_SMP */ | 2694 | #endif /* CONFIG_SMP */ |
2534 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2695 | |
2535 | cpu == this_cpu, en_flags); | 2696 | ttwu_queue(p, cpu); |
2536 | success = 1; | 2697 | stat: |
2537 | out_running: | 2698 | ttwu_stat(p, cpu, wake_flags); |
2538 | ttwu_post_activation(p, rq, wake_flags, success); | ||
2539 | out: | 2699 | out: |
2540 | task_rq_unlock(rq, &flags); | 2700 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2541 | put_cpu(); | ||
2542 | 2701 | ||
2543 | return success; | 2702 | return success; |
2544 | } | 2703 | } |
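The smp_rmb() in the rewritten wakeup path pairs with the smp_wmb() before prev->on_cpu = 0 in finish_lock_switch(): once the waker has observed on_cpu == 0 it may safely read the state the previous CPU published during the context switch. A reduced user-space sketch of that pairing, with C11 fences standing in for smp_wmb()/smp_rmb() and a hypothetical saved_state field standing in for the scheduler's bookkeeping:

    /* Sketch: the finish_lock_switch() / try_to_wake_up() publish-and-spin pairing. */
    #include <stdatomic.h>
    #include <stdbool.h>

    struct task {
            atomic_bool on_cpu;
            long saved_state;       /* hypothetical data published by the old CPU */
    };

    /* Old CPU, cf. finish_lock_switch(): write the data, then clear the flag. */
    static void finish_switch(struct task *prev, long state)
    {
            prev->saved_state = state;
            atomic_thread_fence(memory_order_release);              /* ~ smp_wmb() */
            atomic_store_explicit(&prev->on_cpu, false, memory_order_relaxed);
    }

    /* Waker, cf. try_to_wake_up(): spin until the flag clears, then read the data. */
    static long wait_and_read(struct task *p)
    {
            while (atomic_load_explicit(&p->on_cpu, memory_order_relaxed))
                    ;                                               /* ~ cpu_relax() loop */
            atomic_thread_fence(memory_order_acquire);              /* ~ smp_rmb() */
            return p->saved_state;
    }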
@@ -2547,31 +2706,34 @@ out: | |||
2547 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2706 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2548 | * @p: the thread to be awakened | 2707 | * @p: the thread to be awakened |
2549 | * | 2708 | * |
2550 | * Put @p on the run-queue if it's not already there. The caller must | 2709 | * Put @p on the run-queue if it's not already there. The caller must |
2551 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2710 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2552 | * the current task. this_rq() stays locked over invocation. | 2711 | * the current task. |
2553 | */ | 2712 | */ |
2554 | static void try_to_wake_up_local(struct task_struct *p) | 2713 | static void try_to_wake_up_local(struct task_struct *p) |
2555 | { | 2714 | { |
2556 | struct rq *rq = task_rq(p); | 2715 | struct rq *rq = task_rq(p); |
2557 | bool success = false; | ||
2558 | 2716 | ||
2559 | BUG_ON(rq != this_rq()); | 2717 | BUG_ON(rq != this_rq()); |
2560 | BUG_ON(p == current); | 2718 | BUG_ON(p == current); |
2561 | lockdep_assert_held(&rq->lock); | 2719 | lockdep_assert_held(&rq->lock); |
2562 | 2720 | ||
2721 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2722 | raw_spin_unlock(&rq->lock); | ||
2723 | raw_spin_lock(&p->pi_lock); | ||
2724 | raw_spin_lock(&rq->lock); | ||
2725 | } | ||
2726 | |||
2563 | if (!(p->state & TASK_NORMAL)) | 2727 | if (!(p->state & TASK_NORMAL)) |
2564 | return; | 2728 | goto out; |
2565 | 2729 | ||
2566 | if (!p->se.on_rq) { | 2730 | if (!p->on_rq) |
2567 | if (likely(!task_running(rq, p))) { | 2731 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2568 | schedstat_inc(rq, ttwu_count); | 2732 | |
2569 | schedstat_inc(rq, ttwu_local); | 2733 | ttwu_do_wakeup(rq, p, 0); |
2570 | } | 2734 | ttwu_stat(p, smp_processor_id(), 0); |
2571 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2735 | out: |
2572 | success = true; | 2736 | raw_spin_unlock(&p->pi_lock); |
2573 | } | ||
2574 | ttwu_post_activation(p, rq, 0, success); | ||
2575 | } | 2737 | } |
2576 | 2738 | ||
2577 | /** | 2739 | /** |
@@ -2604,19 +2766,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
2604 | */ | 2766 | */ |
2605 | static void __sched_fork(struct task_struct *p) | 2767 | static void __sched_fork(struct task_struct *p) |
2606 | { | 2768 | { |
2769 | p->on_rq = 0; | ||
2770 | |||
2771 | p->se.on_rq = 0; | ||
2607 | p->se.exec_start = 0; | 2772 | p->se.exec_start = 0; |
2608 | p->se.sum_exec_runtime = 0; | 2773 | p->se.sum_exec_runtime = 0; |
2609 | p->se.prev_sum_exec_runtime = 0; | 2774 | p->se.prev_sum_exec_runtime = 0; |
2610 | p->se.nr_migrations = 0; | 2775 | p->se.nr_migrations = 0; |
2611 | p->se.vruntime = 0; | 2776 | p->se.vruntime = 0; |
2777 | INIT_LIST_HEAD(&p->se.group_node); | ||
2612 | 2778 | ||
2613 | #ifdef CONFIG_SCHEDSTATS | 2779 | #ifdef CONFIG_SCHEDSTATS |
2614 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2780 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2615 | #endif | 2781 | #endif |
2616 | 2782 | ||
2617 | INIT_LIST_HEAD(&p->rt.run_list); | 2783 | INIT_LIST_HEAD(&p->rt.run_list); |
2618 | p->se.on_rq = 0; | ||
2619 | INIT_LIST_HEAD(&p->se.group_node); | ||
2620 | 2784 | ||
2621 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2785 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2622 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2786 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2626,8 +2790,9 @@ static void __sched_fork(struct task_struct *p) | |||
2626 | /* | 2790 | /* |
2627 | * fork()/clone()-time setup: | 2791 | * fork()/clone()-time setup: |
2628 | */ | 2792 | */ |
2629 | void sched_fork(struct task_struct *p, int clone_flags) | 2793 | void sched_fork(struct task_struct *p) |
2630 | { | 2794 | { |
2795 | unsigned long flags; | ||
2631 | int cpu = get_cpu(); | 2796 | int cpu = get_cpu(); |
2632 | 2797 | ||
2633 | __sched_fork(p); | 2798 | __sched_fork(p); |
@@ -2678,16 +2843,16 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2678 | * | 2843 | * |
2679 | * Silence PROVE_RCU. | 2844 | * Silence PROVE_RCU. |
2680 | */ | 2845 | */ |
2681 | rcu_read_lock(); | 2846 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2682 | set_task_cpu(p, cpu); | 2847 | set_task_cpu(p, cpu); |
2683 | rcu_read_unlock(); | 2848 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2684 | 2849 | ||
2685 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2850 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2686 | if (likely(sched_info_on())) | 2851 | if (likely(sched_info_on())) |
2687 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2852 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2688 | #endif | 2853 | #endif |
2689 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2854 | #if defined(CONFIG_SMP) |
2690 | p->oncpu = 0; | 2855 | p->on_cpu = 0; |
2691 | #endif | 2856 | #endif |
2692 | #ifdef CONFIG_PREEMPT | 2857 | #ifdef CONFIG_PREEMPT |
2693 | /* Want to start with kernel preemption disabled. */ | 2858 | /* Want to start with kernel preemption disabled. */ |
@@ -2707,41 +2872,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2707 | * that must be done for every newly created context, then puts the task | 2872 | * that must be done for every newly created context, then puts the task |
2708 | * on the runqueue and wakes it. | 2873 | * on the runqueue and wakes it. |
2709 | */ | 2874 | */ |
2710 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2875 | void wake_up_new_task(struct task_struct *p) |
2711 | { | 2876 | { |
2712 | unsigned long flags; | 2877 | unsigned long flags; |
2713 | struct rq *rq; | 2878 | struct rq *rq; |
2714 | int cpu __maybe_unused = get_cpu(); | ||
2715 | 2879 | ||
2880 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
2716 | #ifdef CONFIG_SMP | 2881 | #ifdef CONFIG_SMP |
2717 | rq = task_rq_lock(p, &flags); | ||
2718 | p->state = TASK_WAKING; | ||
2719 | |||
2720 | /* | 2882 | /* |
2721 | * Fork balancing, do it here and not earlier because: | 2883 | * Fork balancing, do it here and not earlier because: |
2722 | * - cpus_allowed can change in the fork path | 2884 | * - cpus_allowed can change in the fork path |
2723 | * - any previously selected cpu might disappear through hotplug | 2885 | * - any previously selected cpu might disappear through hotplug |
2724 | * | ||
2725 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
2726 | * without people poking at ->cpus_allowed. | ||
2727 | */ | 2886 | */ |
2728 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2887 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2729 | set_task_cpu(p, cpu); | ||
2730 | |||
2731 | p->state = TASK_RUNNING; | ||
2732 | task_rq_unlock(rq, &flags); | ||
2733 | #endif | 2888 | #endif |
2734 | 2889 | ||
2735 | rq = task_rq_lock(p, &flags); | 2890 | rq = __task_rq_lock(p); |
2736 | activate_task(rq, p, 0); | 2891 | activate_task(rq, p, 0); |
2737 | trace_sched_wakeup_new(p, 1); | 2892 | p->on_rq = 1; |
2893 | trace_sched_wakeup_new(p, true); | ||
2738 | check_preempt_curr(rq, p, WF_FORK); | 2894 | check_preempt_curr(rq, p, WF_FORK); |
2739 | #ifdef CONFIG_SMP | 2895 | #ifdef CONFIG_SMP |
2740 | if (p->sched_class->task_woken) | 2896 | if (p->sched_class->task_woken) |
2741 | p->sched_class->task_woken(rq, p); | 2897 | p->sched_class->task_woken(rq, p); |
2742 | #endif | 2898 | #endif |
2743 | task_rq_unlock(rq, &flags); | 2899 | task_rq_unlock(rq, p, &flags); |
2744 | put_cpu(); | ||
2745 | } | 2900 | } |
2746 | 2901 | ||
2747 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2902 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
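The fork and wake-up hunks above replace the old "set TASK_WAKING, drop rq->lock, re-lock" dance with a fixed nesting: p->pi_lock is taken first and stabilises the task's placement (->cpus_allowed, task_cpu()), and rq->lock is taken inside it only when the runqueue itself is modified; task_rq_lock()/task_rq_unlock() now take the task so they can manage both locks. The sketch below is a minimal userspace model of that nesting, with pthread mutexes standing in for the raw spinlocks; the types and helpers are invented for illustration, and the real task_rq_lock() additionally re-checks task_cpu() and retries if the task migrated while the lock was being taken.

/* Minimal userspace model of the p->pi_lock -> rq->lock nesting (illustration only). */
#include <pthread.h>
#include <stdio.h>

struct rq {
    pthread_mutex_t lock;        /* stands in for rq->lock */
    int nr_running;
};

struct task {
    pthread_mutex_t pi_lock;     /* stands in for p->pi_lock */
    int cpu;                     /* stands in for task_cpu(p) */
};

static struct rq runqueues[2] = {
    { PTHREAD_MUTEX_INITIALIZER, 0 },
    { PTHREAD_MUTEX_INITIALIZER, 0 },
};

/* task_rq_lock(): pi_lock first, then the runqueue the task currently maps to. */
static struct rq *task_rq_lock(struct task *p)
{
    struct rq *rq;

    pthread_mutex_lock(&p->pi_lock);
    rq = &runqueues[p->cpu];
    pthread_mutex_lock(&rq->lock);
    return rq;
}

static void task_rq_unlock(struct rq *rq, struct task *p)
{
    pthread_mutex_unlock(&rq->lock);     /* release in reverse order */
    pthread_mutex_unlock(&p->pi_lock);
}

/* Fork-style placement: in this model p->cpu is only touched under pi_lock,
 * so no runqueue lock is needed just to pick a CPU for the child. */
static void place_new_task(struct task *p, int cpu)
{
    pthread_mutex_lock(&p->pi_lock);
    p->cpu = cpu;
    pthread_mutex_unlock(&p->pi_lock);
}

/* Wake-up-style activation: both locks, the runqueue is modified under rq->lock. */
static void wake_new_task(struct task *p)
{
    struct rq *rq = task_rq_lock(p);

    rq->nr_running++;
    task_rq_unlock(rq, p);
}

int main(void)
{
    struct task t = { PTHREAD_MUTEX_INITIALIZER, 0 };

    place_new_task(&t, 1);
    wake_new_task(&t);
    printf("cpu=%d nr_running=%d\n", t.cpu, runqueues[1].nr_running);
    return 0;
}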
@@ -3450,27 +3605,22 @@ void sched_exec(void) | |||
3450 | { | 3605 | { |
3451 | struct task_struct *p = current; | 3606 | struct task_struct *p = current; |
3452 | unsigned long flags; | 3607 | unsigned long flags; |
3453 | struct rq *rq; | ||
3454 | int dest_cpu; | 3608 | int dest_cpu; |
3455 | 3609 | ||
3456 | rq = task_rq_lock(p, &flags); | 3610 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3457 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3611 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3458 | if (dest_cpu == smp_processor_id()) | 3612 | if (dest_cpu == smp_processor_id()) |
3459 | goto unlock; | 3613 | goto unlock; |
3460 | 3614 | ||
3461 | /* | 3615 | if (likely(cpu_active(dest_cpu))) { |
3462 | * select_task_rq() can race against ->cpus_allowed | ||
3463 | */ | ||
3464 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
3465 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { | ||
3466 | struct migration_arg arg = { p, dest_cpu }; | 3616 | struct migration_arg arg = { p, dest_cpu }; |
3467 | 3617 | ||
3468 | task_rq_unlock(rq, &flags); | 3618 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3469 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3619 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3470 | return; | 3620 | return; |
3471 | } | 3621 | } |
3472 | unlock: | 3622 | unlock: |
3473 | task_rq_unlock(rq, &flags); | 3623 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3474 | } | 3624 | } |
3475 | 3625 | ||
3476 | #endif | 3626 | #endif |
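sched_exec() now pins the placement decision with p->pi_lock alone and, when a better CPU is found, hands the actual move to the stopper machinery via stop_one_cpu() with a small { task, destination } argument. The fragment below is a rough standalone model of that hand-off; stop_one_cpu() is reduced to a stub that simply runs the callback, and everything not present in the diff is invented for illustration.

/* Toy model of the "package a migration request, run it on the stopper" pattern. */
#include <stdio.h>

struct task { int cpu; };

struct migration_arg {
    struct task *task;
    int dest_cpu;
};

/* Stand-in for the kernel's migration_cpu_stop() callback. */
static int migration_cpu_stop(void *data)
{
    struct migration_arg *arg = data;

    arg->task->cpu = arg->dest_cpu;   /* the real callback re-validates and moves the task */
    return 0;
}

/* Stand-in for stop_one_cpu(): the kernel runs fn on that CPU's stopper thread;
 * here we just call it directly. */
static int stop_one_cpu(int cpu, int (*fn)(void *), void *arg)
{
    (void)cpu;
    return fn(arg);
}

static void sched_exec_like(struct task *p, int dest_cpu)
{
    if (dest_cpu == p->cpu)
        return;                       /* already in the right place */

    struct migration_arg arg = { p, dest_cpu };
    stop_one_cpu(p->cpu, migration_cpu_stop, &arg);
}

int main(void)
{
    struct task t = { .cpu = 0 };

    sched_exec_like(&t, 3);
    printf("task now on cpu %d\n", t.cpu);
    return 0;
}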
@@ -3507,7 +3657,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
3507 | 3657 | ||
3508 | rq = task_rq_lock(p, &flags); | 3658 | rq = task_rq_lock(p, &flags); |
3509 | ns = do_task_delta_exec(p, rq); | 3659 | ns = do_task_delta_exec(p, rq); |
3510 | task_rq_unlock(rq, &flags); | 3660 | task_rq_unlock(rq, p, &flags); |
3511 | 3661 | ||
3512 | return ns; | 3662 | return ns; |
3513 | } | 3663 | } |
@@ -3525,7 +3675,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3525 | 3675 | ||
3526 | rq = task_rq_lock(p, &flags); | 3676 | rq = task_rq_lock(p, &flags); |
3527 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3677 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3528 | task_rq_unlock(rq, &flags); | 3678 | task_rq_unlock(rq, p, &flags); |
3529 | 3679 | ||
3530 | return ns; | 3680 | return ns; |
3531 | } | 3681 | } |
@@ -3549,7 +3699,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
3549 | rq = task_rq_lock(p, &flags); | 3699 | rq = task_rq_lock(p, &flags); |
3550 | thread_group_cputime(p, &totals); | 3700 | thread_group_cputime(p, &totals); |
3551 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3701 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3552 | task_rq_unlock(rq, &flags); | 3702 | task_rq_unlock(rq, p, &flags); |
3553 | 3703 | ||
3554 | return ns; | 3704 | return ns; |
3555 | } | 3705 | } |
@@ -3903,9 +4053,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3903 | /* | 4053 | /* |
3904 | * This function gets called by the timer code, with HZ frequency. | 4054 | * This function gets called by the timer code, with HZ frequency. |
3905 | * We call it with interrupts disabled. | 4055 | * We call it with interrupts disabled. |
3906 | * | ||
3907 | * It also gets called by the fork code, when changing the parent's | ||
3908 | * timeslices. | ||
3909 | */ | 4056 | */ |
3910 | void scheduler_tick(void) | 4057 | void scheduler_tick(void) |
3911 | { | 4058 | { |
@@ -4025,17 +4172,11 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4025 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4172 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
4026 | 4173 | ||
4027 | schedstat_inc(this_rq(), sched_count); | 4174 | schedstat_inc(this_rq(), sched_count); |
4028 | #ifdef CONFIG_SCHEDSTATS | ||
4029 | if (unlikely(prev->lock_depth >= 0)) { | ||
4030 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); | ||
4031 | schedstat_inc(prev, sched_info.bkl_count); | ||
4032 | } | ||
4033 | #endif | ||
4034 | } | 4175 | } |
4035 | 4176 | ||
4036 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4177 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
4037 | { | 4178 | { |
4038 | if (prev->se.on_rq) | 4179 | if (prev->on_rq || rq->skip_clock_update < 0) |
4039 | update_rq_clock(rq); | 4180 | update_rq_clock(rq); |
4040 | prev->sched_class->put_prev_task(rq, prev); | 4181 | prev->sched_class->put_prev_task(rq, prev); |
4041 | } | 4182 | } |
@@ -4097,11 +4238,13 @@ need_resched: | |||
4097 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4238 | if (unlikely(signal_pending_state(prev->state, prev))) { |
4098 | prev->state = TASK_RUNNING; | 4239 | prev->state = TASK_RUNNING; |
4099 | } else { | 4240 | } else { |
4241 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4242 | prev->on_rq = 0; | ||
4243 | |||
4100 | /* | 4244 | /* |
4101 | * If a worker is going to sleep, notify and | 4245 | * If a worker went to sleep, notify and ask workqueue |
4102 | * ask workqueue whether it wants to wake up a | 4246 | * whether it wants to wake up a task to maintain |
4103 | * task to maintain concurrency. If so, wake | 4247 | * concurrency. |
4104 | * up the task. | ||
4105 | */ | 4248 | */ |
4106 | if (prev->flags & PF_WQ_WORKER) { | 4249 | if (prev->flags & PF_WQ_WORKER) { |
4107 | struct task_struct *to_wakeup; | 4250 | struct task_struct *to_wakeup; |
@@ -4110,11 +4253,10 @@ need_resched: | |||
4110 | if (to_wakeup) | 4253 | if (to_wakeup) |
4111 | try_to_wake_up_local(to_wakeup); | 4254 | try_to_wake_up_local(to_wakeup); |
4112 | } | 4255 | } |
4113 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4114 | 4256 | ||
4115 | /* | 4257 | /* |
4116 | * If we are going to sleep and we have plugged IO queued, make | 4258 | * If we are going to sleep and we have plugged IO |
4117 | * sure to submit it to avoid deadlocks. | 4259 | * queued, make sure to submit it to avoid deadlocks. |
4118 | */ | 4260 | */ |
4119 | if (blk_needs_flush_plug(prev)) { | 4261 | if (blk_needs_flush_plug(prev)) { |
4120 | raw_spin_unlock(&rq->lock); | 4262 | raw_spin_unlock(&rq->lock); |
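In the schedule() hunks above, the deactivation of a blocking task (and the clearing of p->on_rq) is moved ahead of the workqueue notification and of the plugged block-I/O flush, so both hooks already see the task as off the runqueue. A toy rendering of that ordering, with every helper stubbed out and no claim to match the kernel's signatures:

/* Ordering sketch for a task about to block; all helpers are stubs, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define PF_WQ_WORKER 0x1

struct task {
    unsigned int flags;
    bool on_rq;
    int plugged_io;                  /* stands in for the block-layer plug list */
};

static void deactivate_task(struct task *p)    { p->on_rq = false; }
static void wq_worker_sleeping(struct task *p) { printf("worker %p is sleeping\n", (void *)p); }
static void blk_flush_plug(struct task *p)     { p->plugged_io = 0; }

static void prepare_to_block(struct task *p)
{
    /* 1. Take the task off the runqueue and clear on_rq first ... */
    deactivate_task(p);

    /* 2. ... then tell the workqueue, which may wake a replacement worker ... */
    if (p->flags & PF_WQ_WORKER)
        wq_worker_sleeping(p);

    /* 3. ... and finally push out any plugged I/O to avoid deadlocks. */
    if (p->plugged_io)
        blk_flush_plug(p);
}

int main(void)
{
    struct task t = { .flags = PF_WQ_WORKER, .on_rq = true, .plugged_io = 1 };

    prepare_to_block(&t);
    printf("on_rq=%d plugged=%d\n", t.on_rq, t.plugged_io);
    return 0;
}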
@@ -4161,70 +4303,53 @@ need_resched: | |||
4161 | EXPORT_SYMBOL(schedule); | 4303 | EXPORT_SYMBOL(schedule); |
4162 | 4304 | ||
4163 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4305 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4164 | /* | ||
4165 | * Look out! "owner" is an entirely speculative pointer | ||
4166 | * access and not reliable. | ||
4167 | */ | ||
4168 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | ||
4169 | { | ||
4170 | unsigned int cpu; | ||
4171 | struct rq *rq; | ||
4172 | 4306 | ||
4173 | if (!sched_feat(OWNER_SPIN)) | 4307 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
4174 | return 0; | 4308 | { |
4309 | bool ret = false; | ||
4175 | 4310 | ||
4176 | #ifdef CONFIG_DEBUG_PAGEALLOC | 4311 | rcu_read_lock(); |
4177 | /* | 4312 | if (lock->owner != owner) |
4178 | * Need to access the cpu field knowing that | 4313 | goto fail; |
4179 | * DEBUG_PAGEALLOC could have unmapped it if | ||
4180 | * the mutex owner just released it and exited. | ||
4181 | */ | ||
4182 | if (probe_kernel_address(&owner->cpu, cpu)) | ||
4183 | return 0; | ||
4184 | #else | ||
4185 | cpu = owner->cpu; | ||
4186 | #endif | ||
4187 | 4314 | ||
4188 | /* | 4315 | /* |
4189 | * Even if the access succeeded (likely case), | 4316 | * Ensure we emit the owner->on_cpu dereference _after_ checking |
4190 | * the cpu field may no longer be valid. | 4317 | * that lock->owner still matches owner. If that fails, owner might |
4318 | * point to free()d memory; if it still matches, the rcu_read_lock() | ||
4319 | * ensures the memory stays valid. | ||
4191 | */ | 4320 | */ |
4192 | if (cpu >= nr_cpumask_bits) | 4321 | barrier(); |
4193 | return 0; | ||
4194 | 4322 | ||
4195 | /* | 4323 | ret = owner->on_cpu; |
4196 | * We need to validate that we can do a | 4324 | fail: |
4197 | * get_cpu() and that we have the percpu area. | 4325 | rcu_read_unlock(); |
4198 | */ | ||
4199 | if (!cpu_online(cpu)) | ||
4200 | return 0; | ||
4201 | 4326 | ||
4202 | rq = cpu_rq(cpu); | 4327 | return ret; |
4328 | } | ||
4203 | 4329 | ||
4204 | for (;;) { | 4330 | /* |
4205 | /* | 4331 | * Look out! "owner" is an entirely speculative pointer |
4206 | * Owner changed, break to re-assess state. | 4332 | * access and not reliable. |
4207 | */ | 4333 | */ |
4208 | if (lock->owner != owner) { | 4334 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
4209 | /* | 4335 | { |
4210 | * If the lock has switched to a different owner, | 4336 | if (!sched_feat(OWNER_SPIN)) |
4211 | * we likely have heavy contention. Return 0 to quit | 4337 | return 0; |
4212 | * optimistic spinning and not contend further: | ||
4213 | */ | ||
4214 | if (lock->owner) | ||
4215 | return 0; | ||
4216 | break; | ||
4217 | } | ||
4218 | 4338 | ||
4219 | /* | 4339 | while (owner_running(lock, owner)) { |
4220 | * Is that owner really running on that cpu? | 4340 | if (need_resched()) |
4221 | */ | ||
4222 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
4223 | return 0; | 4341 | return 0; |
4224 | 4342 | ||
4225 | arch_mutex_cpu_relax(); | 4343 | arch_mutex_cpu_relax(); |
4226 | } | 4344 | } |
4227 | 4345 | ||
4346 | /* | ||
4347 | * If the owner changed to another task there is likely | ||
4348 | * heavy contention, stop spinning. | ||
4349 | */ | ||
4350 | if (lock->owner) | ||
4351 | return 0; | ||
4352 | |||
4228 | return 1; | 4353 | return 1; |
4229 | } | 4354 | } |
4230 | #endif | 4355 | #endif |
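The rewritten mutex_spin_on_owner() above splits the loop into a small owner_running() predicate (owner check, barrier, ->on_cpu read, all under rcu_read_lock()) and a spin that only gives up on need_resched() or an owner change. Below is a compressed sketch of the resulting control flow, with the kernel primitives reduced to stubs so it stands alone; the barrier and RCU protection are only described in comments, not reproduced.

/* Sketch of the adaptive-spin shape after the rewrite (stubs, not kernel code). */
#include <stdbool.h>
#include <stdio.h>

struct task { volatile int on_cpu; };
struct mutex { struct task * volatile owner; };

static bool need_resched(void) { return false; }    /* stub */
static void cpu_relax(void)    { }                  /* stub for arch_mutex_cpu_relax() */

/* owner_running(): only trust owner->on_cpu while lock->owner still points at owner.
 * The kernel reads both under rcu_read_lock() with a barrier() between them. */
static bool owner_running(struct mutex *lock, struct task *owner)
{
    if (lock->owner != owner)
        return false;
    return owner->on_cpu != 0;
}

static int mutex_spin_on_owner_like(struct mutex *lock, struct task *owner)
{
    while (owner_running(lock, owner)) {
        if (need_resched())
            return 0;               /* someone else wants this CPU: stop spinning */
        cpu_relax();
    }

    /* Owner changed hands while we spun: heavy contention, give up. */
    if (lock->owner)
        return 0;

    return 1;                       /* lock was released: worth trying to take it */
}

int main(void)
{
    struct task owner = { .on_cpu = 0 };
    struct mutex m = { .owner = NULL };

    printf("%d\n", mutex_spin_on_owner_like(&m, &owner));
    return 0;
}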
@@ -4684,19 +4809,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4684 | */ | 4809 | */ |
4685 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4810 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4686 | { | 4811 | { |
4687 | unsigned long flags; | ||
4688 | int oldprio, on_rq, running; | 4812 | int oldprio, on_rq, running; |
4689 | struct rq *rq; | 4813 | struct rq *rq; |
4690 | const struct sched_class *prev_class; | 4814 | const struct sched_class *prev_class; |
4691 | 4815 | ||
4692 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4816 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4693 | 4817 | ||
4694 | rq = task_rq_lock(p, &flags); | 4818 | rq = __task_rq_lock(p); |
4695 | 4819 | ||
4696 | trace_sched_pi_setprio(p, prio); | 4820 | trace_sched_pi_setprio(p, prio); |
4697 | oldprio = p->prio; | 4821 | oldprio = p->prio; |
4698 | prev_class = p->sched_class; | 4822 | prev_class = p->sched_class; |
4699 | on_rq = p->se.on_rq; | 4823 | on_rq = p->on_rq; |
4700 | running = task_current(rq, p); | 4824 | running = task_current(rq, p); |
4701 | if (on_rq) | 4825 | if (on_rq) |
4702 | dequeue_task(rq, p, 0); | 4826 | dequeue_task(rq, p, 0); |
@@ -4716,7 +4840,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4716 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4840 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4717 | 4841 | ||
4718 | check_class_changed(rq, p, prev_class, oldprio); | 4842 | check_class_changed(rq, p, prev_class, oldprio); |
4719 | task_rq_unlock(rq, &flags); | 4843 | __task_rq_unlock(rq); |
4720 | } | 4844 | } |
4721 | 4845 | ||
4722 | #endif | 4846 | #endif |
@@ -4744,7 +4868,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4744 | p->static_prio = NICE_TO_PRIO(nice); | 4868 | p->static_prio = NICE_TO_PRIO(nice); |
4745 | goto out_unlock; | 4869 | goto out_unlock; |
4746 | } | 4870 | } |
4747 | on_rq = p->se.on_rq; | 4871 | on_rq = p->on_rq; |
4748 | if (on_rq) | 4872 | if (on_rq) |
4749 | dequeue_task(rq, p, 0); | 4873 | dequeue_task(rq, p, 0); |
4750 | 4874 | ||
@@ -4764,7 +4888,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4764 | resched_task(rq->curr); | 4888 | resched_task(rq->curr); |
4765 | } | 4889 | } |
4766 | out_unlock: | 4890 | out_unlock: |
4767 | task_rq_unlock(rq, &flags); | 4891 | task_rq_unlock(rq, p, &flags); |
4768 | } | 4892 | } |
4769 | EXPORT_SYMBOL(set_user_nice); | 4893 | EXPORT_SYMBOL(set_user_nice); |
4770 | 4894 | ||
@@ -4878,8 +5002,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
4878 | static void | 5002 | static void |
4879 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 5003 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4880 | { | 5004 | { |
4881 | BUG_ON(p->se.on_rq); | ||
4882 | |||
4883 | p->policy = policy; | 5005 | p->policy = policy; |
4884 | p->rt_priority = prio; | 5006 | p->rt_priority = prio; |
4885 | p->normal_prio = normal_prio(p); | 5007 | p->normal_prio = normal_prio(p); |
@@ -4994,20 +5116,17 @@ recheck: | |||
4994 | /* | 5116 | /* |
4995 | * make sure no PI-waiters arrive (or leave) while we are | 5117 | * make sure no PI-waiters arrive (or leave) while we are |
4996 | * changing the priority of the task: | 5118 | * changing the priority of the task: |
4997 | */ | 5119 | * |
4998 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
4999 | /* | ||
5000 | * To be able to change p->policy safely, the appropriate | 5120 | * To be able to change p->policy safely, the appropriate |
5001 | * runqueue lock must be held. | 5121 | * runqueue lock must be held. |
5002 | */ | 5122 | */ |
5003 | rq = __task_rq_lock(p); | 5123 | rq = task_rq_lock(p, &flags); |
5004 | 5124 | ||
5005 | /* | 5125 | /* |
5006 | * Changing the policy of the stop threads is a very bad idea | 5126 | * Changing the policy of the stop threads is a very bad idea |
5007 | */ | 5127 | */ |
5008 | if (p == rq->stop) { | 5128 | if (p == rq->stop) { |
5009 | __task_rq_unlock(rq); | 5129 | task_rq_unlock(rq, p, &flags); |
5010 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5011 | return -EINVAL; | 5130 | return -EINVAL; |
5012 | } | 5131 | } |
5013 | 5132 | ||
@@ -5031,8 +5150,7 @@ recheck: | |||
5031 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5150 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
5032 | task_group(p)->rt_bandwidth.rt_runtime == 0 && | 5151 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
5033 | !task_group_is_autogroup(task_group(p))) { | 5152 | !task_group_is_autogroup(task_group(p))) { |
5034 | __task_rq_unlock(rq); | 5153 | task_rq_unlock(rq, p, &flags); |
5035 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5036 | return -EPERM; | 5154 | return -EPERM; |
5037 | } | 5155 | } |
5038 | } | 5156 | } |
@@ -5041,11 +5159,10 @@ recheck: | |||
5041 | /* recheck policy now with rq lock held */ | 5159 | /* recheck policy now with rq lock held */ |
5042 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5160 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
5043 | policy = oldpolicy = -1; | 5161 | policy = oldpolicy = -1; |
5044 | __task_rq_unlock(rq); | 5162 | task_rq_unlock(rq, p, &flags); |
5045 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5046 | goto recheck; | 5163 | goto recheck; |
5047 | } | 5164 | } |
5048 | on_rq = p->se.on_rq; | 5165 | on_rq = p->on_rq; |
5049 | running = task_current(rq, p); | 5166 | running = task_current(rq, p); |
5050 | if (on_rq) | 5167 | if (on_rq) |
5051 | deactivate_task(rq, p, 0); | 5168 | deactivate_task(rq, p, 0); |
@@ -5064,8 +5181,7 @@ recheck: | |||
5064 | activate_task(rq, p, 0); | 5181 | activate_task(rq, p, 0); |
5065 | 5182 | ||
5066 | check_class_changed(rq, p, prev_class, oldprio); | 5183 | check_class_changed(rq, p, prev_class, oldprio); |
5067 | __task_rq_unlock(rq); | 5184 | task_rq_unlock(rq, p, &flags); |
5068 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5069 | 5185 | ||
5070 | rt_mutex_adjust_pi(p); | 5186 | rt_mutex_adjust_pi(p); |
5071 | 5187 | ||
@@ -5316,7 +5432,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5316 | { | 5432 | { |
5317 | struct task_struct *p; | 5433 | struct task_struct *p; |
5318 | unsigned long flags; | 5434 | unsigned long flags; |
5319 | struct rq *rq; | ||
5320 | int retval; | 5435 | int retval; |
5321 | 5436 | ||
5322 | get_online_cpus(); | 5437 | get_online_cpus(); |
@@ -5331,9 +5446,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5331 | if (retval) | 5446 | if (retval) |
5332 | goto out_unlock; | 5447 | goto out_unlock; |
5333 | 5448 | ||
5334 | rq = task_rq_lock(p, &flags); | 5449 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5335 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5450 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5336 | task_rq_unlock(rq, &flags); | 5451 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5337 | 5452 | ||
5338 | out_unlock: | 5453 | out_unlock: |
5339 | rcu_read_unlock(); | 5454 | rcu_read_unlock(); |
@@ -5658,7 +5773,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
5658 | 5773 | ||
5659 | rq = task_rq_lock(p, &flags); | 5774 | rq = task_rq_lock(p, &flags); |
5660 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5775 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5661 | task_rq_unlock(rq, &flags); | 5776 | task_rq_unlock(rq, p, &flags); |
5662 | 5777 | ||
5663 | rcu_read_unlock(); | 5778 | rcu_read_unlock(); |
5664 | jiffies_to_timespec(time_slice, &t); | 5779 | jiffies_to_timespec(time_slice, &t); |
@@ -5760,7 +5875,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5760 | idle->state = TASK_RUNNING; | 5875 | idle->state = TASK_RUNNING; |
5761 | idle->se.exec_start = sched_clock(); | 5876 | idle->se.exec_start = sched_clock(); |
5762 | 5877 | ||
5763 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5878 | do_set_cpus_allowed(idle, cpumask_of(cpu)); |
5764 | /* | 5879 | /* |
5765 | * We're having a chicken and egg problem, even though we are | 5880 | * We're having a chicken and egg problem, even though we are |
5766 | * holding rq->lock, the cpu isn't yet set to this cpu so the | 5881 | * holding rq->lock, the cpu isn't yet set to this cpu so the |
@@ -5776,17 +5891,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5776 | rcu_read_unlock(); | 5891 | rcu_read_unlock(); |
5777 | 5892 | ||
5778 | rq->curr = rq->idle = idle; | 5893 | rq->curr = rq->idle = idle; |
5779 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5894 | #if defined(CONFIG_SMP) |
5780 | idle->oncpu = 1; | 5895 | idle->on_cpu = 1; |
5781 | #endif | 5896 | #endif |
5782 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5897 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5783 | 5898 | ||
5784 | /* Set the preempt count _outside_ the spinlocks! */ | 5899 | /* Set the preempt count _outside_ the spinlocks! */ |
5785 | #if defined(CONFIG_PREEMPT) | ||
5786 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
5787 | #else | ||
5788 | task_thread_info(idle)->preempt_count = 0; | 5900 | task_thread_info(idle)->preempt_count = 0; |
5789 | #endif | 5901 | |
5790 | /* | 5902 | /* |
5791 | * The idle tasks have their own, simple scheduling class: | 5903 | * The idle tasks have their own, simple scheduling class: |
5792 | */ | 5904 | */ |
@@ -5851,6 +5963,16 @@ static inline void sched_init_granularity(void) | |||
5851 | } | 5963 | } |
5852 | 5964 | ||
5853 | #ifdef CONFIG_SMP | 5965 | #ifdef CONFIG_SMP |
5966 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | ||
5967 | { | ||
5968 | if (p->sched_class && p->sched_class->set_cpus_allowed) | ||
5969 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
5970 | else { | ||
5971 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
5972 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
5973 | } | ||
5974 | } | ||
5975 | |||
5854 | /* | 5976 | /* |
5855 | * This is how migration works: | 5977 | * This is how migration works: |
5856 | * | 5978 | * |
@@ -5881,52 +6003,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
5881 | unsigned int dest_cpu; | 6003 | unsigned int dest_cpu; |
5882 | int ret = 0; | 6004 | int ret = 0; |
5883 | 6005 | ||
5884 | /* | ||
5885 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
5886 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
5887 | */ | ||
5888 | again: | ||
5889 | while (task_is_waking(p)) | ||
5890 | cpu_relax(); | ||
5891 | rq = task_rq_lock(p, &flags); | 6006 | rq = task_rq_lock(p, &flags); |
5892 | if (task_is_waking(p)) { | 6007 | |
5893 | task_rq_unlock(rq, &flags); | 6008 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
5894 | goto again; | 6009 | goto out; |
5895 | } | ||
5896 | 6010 | ||
5897 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 6011 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
5898 | ret = -EINVAL; | 6012 | ret = -EINVAL; |
5899 | goto out; | 6013 | goto out; |
5900 | } | 6014 | } |
5901 | 6015 | ||
5902 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 6016 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
5903 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
5904 | ret = -EINVAL; | 6017 | ret = -EINVAL; |
5905 | goto out; | 6018 | goto out; |
5906 | } | 6019 | } |
5907 | 6020 | ||
5908 | if (p->sched_class->set_cpus_allowed) | 6021 | do_set_cpus_allowed(p, new_mask); |
5909 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
5910 | else { | ||
5911 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
5912 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
5913 | } | ||
5914 | 6022 | ||
5915 | /* Can the task run on the task's current CPU? If so, we're done */ | 6023 | /* Can the task run on the task's current CPU? If so, we're done */ |
5916 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 6024 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
5917 | goto out; | 6025 | goto out; |
5918 | 6026 | ||
5919 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 6027 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5920 | if (migrate_task(p, rq)) { | 6028 | if (p->on_rq) { |
5921 | struct migration_arg arg = { p, dest_cpu }; | 6029 | struct migration_arg arg = { p, dest_cpu }; |
5922 | /* Need help from migration thread: drop lock and wait. */ | 6030 | /* Need help from migration thread: drop lock and wait. */ |
5923 | task_rq_unlock(rq, &flags); | 6031 | task_rq_unlock(rq, p, &flags); |
5924 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 6032 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5925 | tlb_migrate_finish(p->mm); | 6033 | tlb_migrate_finish(p->mm); |
5926 | return 0; | 6034 | return 0; |
5927 | } | 6035 | } |
5928 | out: | 6036 | out: |
5929 | task_rq_unlock(rq, &flags); | 6037 | task_rq_unlock(rq, p, &flags); |
5930 | 6038 | ||
5931 | return ret; | 6039 | return ret; |
5932 | } | 6040 | } |
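do_set_cpus_allowed(), introduced a few hunks up and called here from set_cpus_allowed_ptr() (and from init_idle()), factors out the step of applying a new affinity mask: use the class hook when the scheduling class provides one, otherwise copy the mask and recompute the cached weight. The sketch below models only that dispatch; the cpumask type, the weight helper and the nr_cpus_allowed field are simplified stand-ins for the kernel's.

/* Illustrative model of the do_set_cpus_allowed() dispatch (not kernel code). */
#include <stdio.h>

typedef unsigned long cpumask_t;        /* one bit per CPU, enough for a sketch */

struct task;

struct sched_class {
    void (*set_cpus_allowed)(struct task *p, cpumask_t new_mask);
};

struct task {
    const struct sched_class *sched_class;
    cpumask_t cpus_allowed;
    int nr_cpus_allowed;                /* stands in for p->rt.nr_cpus_allowed */
};

static int cpumask_weight(cpumask_t m)
{
    int w = 0;

    for (; m; m &= m - 1)               /* count set bits */
        w++;
    return w;
}

static void do_set_cpus_allowed(struct task *p, cpumask_t new_mask)
{
    if (p->sched_class && p->sched_class->set_cpus_allowed) {
        p->sched_class->set_cpus_allowed(p, new_mask);  /* class-specific bookkeeping */
    } else {
        p->cpus_allowed = new_mask;
        p->nr_cpus_allowed = cpumask_weight(new_mask);
    }
}

int main(void)
{
    struct task t = { .sched_class = NULL };

    do_set_cpus_allowed(&t, 0xf);       /* CPUs 0-3 */
    printf("weight=%d\n", t.nr_cpus_allowed);
    return 0;
}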
@@ -5954,6 +6062,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5954 | rq_src = cpu_rq(src_cpu); | 6062 | rq_src = cpu_rq(src_cpu); |
5955 | rq_dest = cpu_rq(dest_cpu); | 6063 | rq_dest = cpu_rq(dest_cpu); |
5956 | 6064 | ||
6065 | raw_spin_lock(&p->pi_lock); | ||
5957 | double_rq_lock(rq_src, rq_dest); | 6066 | double_rq_lock(rq_src, rq_dest); |
5958 | /* Already moved. */ | 6067 | /* Already moved. */ |
5959 | if (task_cpu(p) != src_cpu) | 6068 | if (task_cpu(p) != src_cpu) |
@@ -5966,7 +6075,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5966 | * If we're not on a rq, the next wake-up will ensure we're | 6075 | * If we're not on a rq, the next wake-up will ensure we're |
5967 | * placed properly. | 6076 | * placed properly. |
5968 | */ | 6077 | */ |
5969 | if (p->se.on_rq) { | 6078 | if (p->on_rq) { |
5970 | deactivate_task(rq_src, p, 0); | 6079 | deactivate_task(rq_src, p, 0); |
5971 | set_task_cpu(p, dest_cpu); | 6080 | set_task_cpu(p, dest_cpu); |
5972 | activate_task(rq_dest, p, 0); | 6081 | activate_task(rq_dest, p, 0); |
@@ -5976,6 +6085,7 @@ done: | |||
5976 | ret = 1; | 6085 | ret = 1; |
5977 | fail: | 6086 | fail: |
5978 | double_rq_unlock(rq_src, rq_dest); | 6087 | double_rq_unlock(rq_src, rq_dest); |
6088 | raw_spin_unlock(&p->pi_lock); | ||
5979 | return ret; | 6089 | return ret; |
5980 | } | 6090 | } |
5981 | 6091 | ||
@@ -6316,6 +6426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6316 | 6426 | ||
6317 | #ifdef CONFIG_HOTPLUG_CPU | 6427 | #ifdef CONFIG_HOTPLUG_CPU |
6318 | case CPU_DYING: | 6428 | case CPU_DYING: |
6429 | sched_ttwu_pending(); | ||
6319 | /* Update our root-domain */ | 6430 | /* Update our root-domain */ |
6320 | raw_spin_lock_irqsave(&rq->lock, flags); | 6431 | raw_spin_lock_irqsave(&rq->lock, flags); |
6321 | if (rq->rd) { | 6432 | if (rq->rd) { |
@@ -6394,6 +6505,8 @@ early_initcall(migration_init); | |||
6394 | 6505 | ||
6395 | #ifdef CONFIG_SMP | 6506 | #ifdef CONFIG_SMP |
6396 | 6507 | ||
6508 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
6509 | |||
6397 | #ifdef CONFIG_SCHED_DEBUG | 6510 | #ifdef CONFIG_SCHED_DEBUG |
6398 | 6511 | ||
6399 | static __read_mostly int sched_domain_debug_enabled; | 6512 | static __read_mostly int sched_domain_debug_enabled; |
@@ -6468,7 +6581,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6468 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 6581 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
6469 | 6582 | ||
6470 | printk(KERN_CONT " %s", str); | 6583 | printk(KERN_CONT " %s", str); |
6471 | if (group->cpu_power != SCHED_LOAD_SCALE) { | 6584 | if (group->cpu_power != SCHED_POWER_SCALE) { |
6472 | printk(KERN_CONT " (cpu_power = %d)", | 6585 | printk(KERN_CONT " (cpu_power = %d)", |
6473 | group->cpu_power); | 6586 | group->cpu_power); |
6474 | } | 6587 | } |
@@ -6489,7 +6602,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6489 | 6602 | ||
6490 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6603 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6491 | { | 6604 | { |
6492 | cpumask_var_t groupmask; | ||
6493 | int level = 0; | 6605 | int level = 0; |
6494 | 6606 | ||
6495 | if (!sched_domain_debug_enabled) | 6607 | if (!sched_domain_debug_enabled) |
@@ -6502,20 +6614,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6502 | 6614 | ||
6503 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6615 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6504 | 6616 | ||
6505 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
6506 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6507 | return; | ||
6508 | } | ||
6509 | |||
6510 | for (;;) { | 6617 | for (;;) { |
6511 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6618 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6512 | break; | 6619 | break; |
6513 | level++; | 6620 | level++; |
6514 | sd = sd->parent; | 6621 | sd = sd->parent; |
6515 | if (!sd) | 6622 | if (!sd) |
6516 | break; | 6623 | break; |
6517 | } | 6624 | } |
6518 | free_cpumask_var(groupmask); | ||
6519 | } | 6625 | } |
6520 | #else /* !CONFIG_SCHED_DEBUG */ | 6626 | #else /* !CONFIG_SCHED_DEBUG */ |
6521 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6627 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6572,12 +6678,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6572 | return 1; | 6678 | return 1; |
6573 | } | 6679 | } |
6574 | 6680 | ||
6575 | static void free_rootdomain(struct root_domain *rd) | 6681 | static void free_rootdomain(struct rcu_head *rcu) |
6576 | { | 6682 | { |
6577 | synchronize_sched(); | 6683 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6578 | 6684 | ||
6579 | cpupri_cleanup(&rd->cpupri); | 6685 | cpupri_cleanup(&rd->cpupri); |
6580 | |||
6581 | free_cpumask_var(rd->rto_mask); | 6686 | free_cpumask_var(rd->rto_mask); |
6582 | free_cpumask_var(rd->online); | 6687 | free_cpumask_var(rd->online); |
6583 | free_cpumask_var(rd->span); | 6688 | free_cpumask_var(rd->span); |
@@ -6618,7 +6723,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6618 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6723 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6619 | 6724 | ||
6620 | if (old_rd) | 6725 | if (old_rd) |
6621 | free_rootdomain(old_rd); | 6726 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6622 | } | 6727 | } |
6623 | 6728 | ||
6624 | static int init_rootdomain(struct root_domain *rd) | 6729 | static int init_rootdomain(struct root_domain *rd) |
@@ -6669,6 +6774,25 @@ static struct root_domain *alloc_rootdomain(void) | |||
6669 | return rd; | 6774 | return rd; |
6670 | } | 6775 | } |
6671 | 6776 | ||
6777 | static void free_sched_domain(struct rcu_head *rcu) | ||
6778 | { | ||
6779 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6780 | if (atomic_dec_and_test(&sd->groups->ref)) | ||
6781 | kfree(sd->groups); | ||
6782 | kfree(sd); | ||
6783 | } | ||
6784 | |||
6785 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
6786 | { | ||
6787 | call_rcu(&sd->rcu, free_sched_domain); | ||
6788 | } | ||
6789 | |||
6790 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
6791 | { | ||
6792 | for (; sd; sd = sd->parent) | ||
6793 | destroy_sched_domain(sd, cpu); | ||
6794 | } | ||
6795 | |||
6672 | /* | 6796 | /* |
6673 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6797 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6674 | * hold the hotplug lock. | 6798 | * hold the hotplug lock. |
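In the two hunks above, root domains and sched domains are no longer freed behind a blocking synchronize_sched(); an rcu_head is embedded in the structure, call_rcu()/call_rcu_sched() queues the free, the callback recovers the containing object with container_of(), and destroy_sched_domains() simply walks the ->parent chain queueing one callback per level. The fragment below demonstrates the embedded-callback pattern in plain C; the call_rcu() here runs the callback immediately instead of after a grace period, purely so the example is self-contained.

/* The container_of + embedded callback pattern used for deferred freeing. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head {
    void (*func)(struct rcu_head *head);
};

/* Toy call_rcu(): the kernel defers func until a grace period has elapsed;
 * for the sketch we invoke it straight away. */
static void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
    head->func = func;
    head->func(head);
}

struct root_domain {
    int refcount;
    struct rcu_head rcu;        /* embedded, so the callback can find the object */
};

static void free_rootdomain(struct rcu_head *rcu)
{
    struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

    printf("freeing root_domain with refcount %d\n", rd->refcount);
    free(rd);
}

int main(void)
{
    struct root_domain *rd = calloc(1, sizeof(*rd));

    if (!rd)
        return 1;
    call_rcu(&rd->rcu, free_rootdomain);   /* ~ call_rcu_sched(&old_rd->rcu, free_rootdomain) */
    return 0;
}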
@@ -6679,9 +6803,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6679 | struct rq *rq = cpu_rq(cpu); | 6803 | struct rq *rq = cpu_rq(cpu); |
6680 | struct sched_domain *tmp; | 6804 | struct sched_domain *tmp; |
6681 | 6805 | ||
6682 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
6683 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
6684 | |||
6685 | /* Remove the sched domains which do not contribute to scheduling. */ | 6806 | /* Remove the sched domains which do not contribute to scheduling. */ |
6686 | for (tmp = sd; tmp; ) { | 6807 | for (tmp = sd; tmp; ) { |
6687 | struct sched_domain *parent = tmp->parent; | 6808 | struct sched_domain *parent = tmp->parent; |
@@ -6692,12 +6813,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6692 | tmp->parent = parent->parent; | 6813 | tmp->parent = parent->parent; |
6693 | if (parent->parent) | 6814 | if (parent->parent) |
6694 | parent->parent->child = tmp; | 6815 | parent->parent->child = tmp; |
6816 | destroy_sched_domain(parent, cpu); | ||
6695 | } else | 6817 | } else |
6696 | tmp = tmp->parent; | 6818 | tmp = tmp->parent; |
6697 | } | 6819 | } |
6698 | 6820 | ||
6699 | if (sd && sd_degenerate(sd)) { | 6821 | if (sd && sd_degenerate(sd)) { |
6822 | tmp = sd; | ||
6700 | sd = sd->parent; | 6823 | sd = sd->parent; |
6824 | destroy_sched_domain(tmp, cpu); | ||
6701 | if (sd) | 6825 | if (sd) |
6702 | sd->child = NULL; | 6826 | sd->child = NULL; |
6703 | } | 6827 | } |
@@ -6705,7 +6829,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6705 | sched_domain_debug(sd, cpu); | 6829 | sched_domain_debug(sd, cpu); |
6706 | 6830 | ||
6707 | rq_attach_root(rq, rd); | 6831 | rq_attach_root(rq, rd); |
6832 | tmp = rq->sd; | ||
6708 | rcu_assign_pointer(rq->sd, sd); | 6833 | rcu_assign_pointer(rq->sd, sd); |
6834 | destroy_sched_domains(tmp, cpu); | ||
6709 | } | 6835 | } |
6710 | 6836 | ||
6711 | /* cpus with isolated domains */ | 6837 | /* cpus with isolated domains */ |
@@ -6721,56 +6847,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6721 | 6847 | ||
6722 | __setup("isolcpus=", isolated_cpu_setup); | 6848 | __setup("isolcpus=", isolated_cpu_setup); |
6723 | 6849 | ||
6724 | /* | ||
6725 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
6726 | * to a function which identifies what group(along with sched group) a CPU | ||
6727 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6728 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6729 | * | ||
6730 | * init_sched_build_groups will build a circular linked list of the groups | ||
6731 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6732 | * and ->cpu_power to 0. | ||
6733 | */ | ||
6734 | static void | ||
6735 | init_sched_build_groups(const struct cpumask *span, | ||
6736 | const struct cpumask *cpu_map, | ||
6737 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
6738 | struct sched_group **sg, | ||
6739 | struct cpumask *tmpmask), | ||
6740 | struct cpumask *covered, struct cpumask *tmpmask) | ||
6741 | { | ||
6742 | struct sched_group *first = NULL, *last = NULL; | ||
6743 | int i; | ||
6744 | |||
6745 | cpumask_clear(covered); | ||
6746 | |||
6747 | for_each_cpu(i, span) { | ||
6748 | struct sched_group *sg; | ||
6749 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
6750 | int j; | ||
6751 | |||
6752 | if (cpumask_test_cpu(i, covered)) | ||
6753 | continue; | ||
6754 | |||
6755 | cpumask_clear(sched_group_cpus(sg)); | ||
6756 | sg->cpu_power = 0; | ||
6757 | |||
6758 | for_each_cpu(j, span) { | ||
6759 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
6760 | continue; | ||
6761 | |||
6762 | cpumask_set_cpu(j, covered); | ||
6763 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6764 | } | ||
6765 | if (!first) | ||
6766 | first = sg; | ||
6767 | if (last) | ||
6768 | last->next = sg; | ||
6769 | last = sg; | ||
6770 | } | ||
6771 | last->next = first; | ||
6772 | } | ||
6773 | |||
6774 | #define SD_NODES_PER_DOMAIN 16 | 6850 | #define SD_NODES_PER_DOMAIN 16 |
6775 | 6851 | ||
6776 | #ifdef CONFIG_NUMA | 6852 | #ifdef CONFIG_NUMA |
@@ -6787,7 +6863,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
6787 | */ | 6863 | */ |
6788 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 6864 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6789 | { | 6865 | { |
6790 | int i, n, val, min_val, best_node = 0; | 6866 | int i, n, val, min_val, best_node = -1; |
6791 | 6867 | ||
6792 | min_val = INT_MAX; | 6868 | min_val = INT_MAX; |
6793 | 6869 | ||
@@ -6811,7 +6887,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6811 | } | 6887 | } |
6812 | } | 6888 | } |
6813 | 6889 | ||
6814 | node_set(best_node, *used_nodes); | 6890 | if (best_node != -1) |
6891 | node_set(best_node, *used_nodes); | ||
6815 | return best_node; | 6892 | return best_node; |
6816 | } | 6893 | } |
6817 | 6894 | ||
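find_next_best_node() can now report that no unused node is left by returning -1, so node_set() is only applied to a real node and, in the next hunk, sched_domain_node_span() stops growing the span when -1 comes back. Below is a self-contained sketch of that greedy selection loop; the hard-coded distance table stands in for node_distance() and the numbers are invented.

/* Greedy nearest-node selection with an explicit "nothing left" result. */
#include <limits.h>
#include <stdio.h>

#define NR_NODES 4

/* Stand-in for node_distance(); values are made up for the example. */
static const int distance[NR_NODES][NR_NODES] = {
    { 10, 20, 30, 40 },
    { 20, 10, 20, 30 },
    { 30, 20, 10, 20 },
    { 40, 30, 20, 10 },
};

static int find_next_best_node(int node, int *used)
{
    int best = -1, min_val = INT_MAX;

    for (int n = 0; n < NR_NODES; n++) {
        if (used[n])
            continue;               /* already part of the span */
        if (distance[node][n] < min_val) {
            min_val = distance[node][n];
            best = n;
        }
    }

    if (best != -1)
        used[best] = 1;             /* only mark a node we actually found */
    return best;
}

int main(void)
{
    int used[NR_NODES] = { [0] = 1 };   /* the home node is always in the span */

    for (;;) {
        int next = find_next_best_node(0, used);

        if (next < 0)
            break;                  /* span is complete */
        printf("adding node %d\n", next);
    }
    return 0;
}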
@@ -6837,315 +6914,130 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
6837 | 6914 | ||
6838 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6915 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6839 | int next_node = find_next_best_node(node, &used_nodes); | 6916 | int next_node = find_next_best_node(node, &used_nodes); |
6840 | | 6917 | if (next_node < 0) |
6918 | break; | ||
6841 | cpumask_or(span, span, cpumask_of_node(next_node)); | 6919 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6842 | } | 6920 | } |
6843 | } | 6921 | } |
6922 | |||
6923 | static const struct cpumask *cpu_node_mask(int cpu) | ||
6924 | { | ||
6925 | lockdep_assert_held(&sched_domains_mutex); | ||
6926 | |||
6927 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
6928 | |||
6929 | return sched_domains_tmpmask; | ||
6930 | } | ||
6931 | |||
6932 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
6933 | { | ||
6934 | return cpu_possible_mask; | ||
6935 | } | ||
6844 | #endif /* CONFIG_NUMA */ | 6936 | #endif /* CONFIG_NUMA */ |
6845 | 6937 | ||
6846 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6938 | static const struct cpumask *cpu_cpu_mask(int cpu) |
6939 | { | ||
6940 | return cpumask_of_node(cpu_to_node(cpu)); | ||
6941 | } | ||
6847 | 6942 | ||
6848 | /* | 6943 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6849 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
6850 | * | ||
6851 | * ( See the the comments in include/linux/sched.h:struct sched_group | ||
6852 | * and struct sched_domain. ) | ||
6853 | */ | ||
6854 | struct static_sched_group { | ||
6855 | struct sched_group sg; | ||
6856 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
6857 | }; | ||
6858 | 6944 | ||
6859 | struct static_sched_domain { | 6945 | struct sd_data { |
6860 | struct sched_domain sd; | 6946 | struct sched_domain **__percpu sd; |
6861 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 6947 | struct sched_group **__percpu sg; |
6862 | }; | 6948 | }; |
6863 | 6949 | ||
6864 | struct s_data { | 6950 | struct s_data { |
6865 | #ifdef CONFIG_NUMA | 6951 | struct sched_domain ** __percpu sd; |
6866 | int sd_allnodes; | ||
6867 | cpumask_var_t domainspan; | ||
6868 | cpumask_var_t covered; | ||
6869 | cpumask_var_t notcovered; | ||
6870 | #endif | ||
6871 | cpumask_var_t nodemask; | ||
6872 | cpumask_var_t this_sibling_map; | ||
6873 | cpumask_var_t this_core_map; | ||
6874 | cpumask_var_t this_book_map; | ||
6875 | cpumask_var_t send_covered; | ||
6876 | cpumask_var_t tmpmask; | ||
6877 | struct sched_group **sched_group_nodes; | ||
6878 | struct root_domain *rd; | 6952 | struct root_domain *rd; |
6879 | }; | 6953 | }; |
6880 | 6954 | ||
6881 | enum s_alloc { | 6955 | enum s_alloc { |
6882 | sa_sched_groups = 0, | ||
6883 | sa_rootdomain, | 6956 | sa_rootdomain, |
6884 | sa_tmpmask, | 6957 | sa_sd, |
6885 | sa_send_covered, | 6958 | sa_sd_storage, |
6886 | sa_this_book_map, | ||
6887 | sa_this_core_map, | ||
6888 | sa_this_sibling_map, | ||
6889 | sa_nodemask, | ||
6890 | sa_sched_group_nodes, | ||
6891 | #ifdef CONFIG_NUMA | ||
6892 | sa_notcovered, | ||
6893 | sa_covered, | ||
6894 | sa_domainspan, | ||
6895 | #endif | ||
6896 | sa_none, | 6959 | sa_none, |
6897 | }; | 6960 | }; |
6898 | 6961 | ||
6899 | /* | 6962 | struct sched_domain_topology_level; |
6900 | * SMT sched-domains: | ||
6901 | */ | ||
6902 | #ifdef CONFIG_SCHED_SMT | ||
6903 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
6904 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
6905 | 6963 | ||
6906 | static int | 6964 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6907 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 6965 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6908 | struct sched_group **sg, struct cpumask *unused) | ||
6909 | { | ||
6910 | if (sg) | ||
6911 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
6912 | return cpu; | ||
6913 | } | ||
6914 | #endif /* CONFIG_SCHED_SMT */ | ||
6915 | |||
6916 | /* | ||
6917 | * multi-core sched-domains: | ||
6918 | */ | ||
6919 | #ifdef CONFIG_SCHED_MC | ||
6920 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | ||
6921 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | ||
6922 | 6966 | ||
6923 | static int | 6967 | struct sched_domain_topology_level { |
6924 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6968 | sched_domain_init_f init; |
6925 | struct sched_group **sg, struct cpumask *mask) | 6969 | sched_domain_mask_f mask; |
6926 | { | 6970 | struct sd_data data; |
6927 | int group; | 6971 | }; |
6928 | #ifdef CONFIG_SCHED_SMT | ||
6929 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6930 | group = cpumask_first(mask); | ||
6931 | #else | ||
6932 | group = cpu; | ||
6933 | #endif | ||
6934 | if (sg) | ||
6935 | *sg = &per_cpu(sched_group_core, group).sg; | ||
6936 | return group; | ||
6937 | } | ||
6938 | #endif /* CONFIG_SCHED_MC */ | ||
6939 | 6972 | ||
6940 | /* | 6973 | /* |
6941 | * book sched-domains: | 6974 | * Assumes the sched_domain tree is fully constructed |
6942 | */ | 6975 | */ |
6943 | #ifdef CONFIG_SCHED_BOOK | 6976 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6944 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
6945 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
6946 | |||
6947 | static int | ||
6948 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, | ||
6949 | struct sched_group **sg, struct cpumask *mask) | ||
6950 | { | 6977 | { |
6951 | int group = cpu; | 6978 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6952 | #ifdef CONFIG_SCHED_MC | 6979 | struct sched_domain *child = sd->child; |
6953 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6954 | group = cpumask_first(mask); | ||
6955 | #elif defined(CONFIG_SCHED_SMT) | ||
6956 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6957 | group = cpumask_first(mask); | ||
6958 | #endif | ||
6959 | if (sg) | ||
6960 | *sg = &per_cpu(sched_group_book, group).sg; | ||
6961 | return group; | ||
6962 | } | ||
6963 | #endif /* CONFIG_SCHED_BOOK */ | ||
6964 | 6980 | ||
6965 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6981 | if (child) |
6966 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6982 | cpu = cpumask_first(sched_domain_span(child)); |
6967 | 6983 | ||
6968 | static int | ||
6969 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | ||
6970 | struct sched_group **sg, struct cpumask *mask) | ||
6971 | { | ||
6972 | int group; | ||
6973 | #ifdef CONFIG_SCHED_BOOK | ||
6974 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
6975 | group = cpumask_first(mask); | ||
6976 | #elif defined(CONFIG_SCHED_MC) | ||
6977 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6978 | group = cpumask_first(mask); | ||
6979 | #elif defined(CONFIG_SCHED_SMT) | ||
6980 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6981 | group = cpumask_first(mask); | ||
6982 | #else | ||
6983 | group = cpu; | ||
6984 | #endif | ||
6985 | if (sg) | 6984 | if (sg) |
6986 | *sg = &per_cpu(sched_group_phys, group).sg; | 6985 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
6987 | return group; | 6986 | |
6987 | return cpu; | ||
6988 | } | 6988 | } |
6989 | 6989 | ||
6990 | #ifdef CONFIG_NUMA | ||
6991 | /* | 6990 | /* |
6992 | * The init_sched_build_groups can't handle what we want to do with node | 6991 | * build_sched_groups takes the cpumask we wish to span, and a pointer |
6993 | * groups, so roll our own. Now each node has its own list of groups which | 6992 | * to a function which identifies what group(along with sched group) a CPU |
6994 | * gets dynamically allocated. | 6993 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids |
6994 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6995 | * | ||
6996 | * build_sched_groups will build a circular linked list of the groups | ||
6997 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6998 | * and ->cpu_power to 0. | ||
6995 | */ | 6999 | */ |
6996 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | 7000 | static void |
6997 | static struct sched_group ***sched_group_nodes_bycpu; | 7001 | build_sched_groups(struct sched_domain *sd) |
6998 | |||
6999 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | ||
7000 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | ||
7001 | |||
7002 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | ||
7003 | struct sched_group **sg, | ||
7004 | struct cpumask *nodemask) | ||
7005 | { | ||
7006 | int group; | ||
7007 | |||
7008 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | ||
7009 | group = cpumask_first(nodemask); | ||
7010 | |||
7011 | if (sg) | ||
7012 | *sg = &per_cpu(sched_group_allnodes, group).sg; | ||
7013 | return group; | ||
7014 | } | ||
7015 | |||
7016 | static void init_numa_sched_groups_power(struct sched_group *group_head) | ||
7017 | { | ||
7018 | struct sched_group *sg = group_head; | ||
7019 | int j; | ||
7020 | |||
7021 | if (!sg) | ||
7022 | return; | ||
7023 | do { | ||
7024 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
7025 | struct sched_domain *sd; | ||
7026 | |||
7027 | sd = &per_cpu(phys_domains, j).sd; | ||
7028 | if (j != group_first_cpu(sd->groups)) { | ||
7029 | /* | ||
7030 | * Only add "power" once for each | ||
7031 | * physical package. | ||
7032 | */ | ||
7033 | continue; | ||
7034 | } | ||
7035 | |||
7036 | sg->cpu_power += sd->groups->cpu_power; | ||
7037 | } | ||
7038 | sg = sg->next; | ||
7039 | } while (sg != group_head); | ||
7040 | } | ||
7041 | |||
7042 | static int build_numa_sched_groups(struct s_data *d, | ||
7043 | const struct cpumask *cpu_map, int num) | ||
7044 | { | 7002 | { |
7045 | struct sched_domain *sd; | 7003 | struct sched_group *first = NULL, *last = NULL; |
7046 | struct sched_group *sg, *prev; | 7004 | struct sd_data *sdd = sd->private; |
7047 | int n, j; | 7005 | const struct cpumask *span = sched_domain_span(sd); |
7048 | 7006 | struct cpumask *covered; | |
7049 | cpumask_clear(d->covered); | 7007 | int i; |
7050 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
7051 | if (cpumask_empty(d->nodemask)) { | ||
7052 | d->sched_group_nodes[num] = NULL; | ||
7053 | goto out; | ||
7054 | } | ||
7055 | |||
7056 | sched_domain_node_span(num, d->domainspan); | ||
7057 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
7058 | |||
7059 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7060 | GFP_KERNEL, num); | ||
7061 | if (!sg) { | ||
7062 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
7063 | num); | ||
7064 | return -ENOMEM; | ||
7065 | } | ||
7066 | d->sched_group_nodes[num] = sg; | ||
7067 | |||
7068 | for_each_cpu(j, d->nodemask) { | ||
7069 | sd = &per_cpu(node_domains, j).sd; | ||
7070 | sd->groups = sg; | ||
7071 | } | ||
7072 | 7008 | ||
7073 | sg->cpu_power = 0; | 7009 | lockdep_assert_held(&sched_domains_mutex); |
7074 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | 7010 | covered = sched_domains_tmpmask; |
7075 | sg->next = sg; | ||
7076 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
7077 | 7011 | ||
7078 | prev = sg; | 7012 | cpumask_clear(covered); |
7079 | for (j = 0; j < nr_node_ids; j++) { | ||
7080 | n = (num + j) % nr_node_ids; | ||
7081 | cpumask_complement(d->notcovered, d->covered); | ||
7082 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
7083 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
7084 | if (cpumask_empty(d->tmpmask)) | ||
7085 | break; | ||
7086 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
7087 | if (cpumask_empty(d->tmpmask)) | ||
7088 | continue; | ||
7089 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7090 | GFP_KERNEL, num); | ||
7091 | if (!sg) { | ||
7092 | printk(KERN_WARNING | ||
7093 | "Can not alloc domain group for node %d\n", j); | ||
7094 | return -ENOMEM; | ||
7095 | } | ||
7096 | sg->cpu_power = 0; | ||
7097 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
7098 | sg->next = prev->next; | ||
7099 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
7100 | prev->next = sg; | ||
7101 | prev = sg; | ||
7102 | } | ||
7103 | out: | ||
7104 | return 0; | ||
7105 | } | ||
7106 | #endif /* CONFIG_NUMA */ | ||
7107 | |||
7108 | #ifdef CONFIG_NUMA | ||
7109 | /* Free memory allocated for various sched_group structures */ | ||
7110 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7111 | struct cpumask *nodemask) | ||
7112 | { | ||
7113 | int cpu, i; | ||
7114 | 7013 | ||
7115 | for_each_cpu(cpu, cpu_map) { | 7014 | for_each_cpu(i, span) { |
7116 | struct sched_group **sched_group_nodes | 7015 | struct sched_group *sg; |
7117 | = sched_group_nodes_bycpu[cpu]; | 7016 | int group = get_group(i, sdd, &sg); |
7017 | int j; | ||
7118 | 7018 | ||
7119 | if (!sched_group_nodes) | 7019 | if (cpumask_test_cpu(i, covered)) |
7120 | continue; | 7020 | continue; |
7121 | 7021 | ||
7122 | for (i = 0; i < nr_node_ids; i++) { | 7022 | cpumask_clear(sched_group_cpus(sg)); |
7123 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7023 | sg->cpu_power = 0; |
7124 | 7024 | ||
7125 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 7025 | for_each_cpu(j, span) { |
7126 | if (cpumask_empty(nodemask)) | 7026 | if (get_group(j, sdd, NULL) != group) |
7127 | continue; | 7027 | continue; |
7128 | 7028 | ||
7129 | if (sg == NULL) | 7029 | cpumask_set_cpu(j, covered); |
7130 | continue; | 7030 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
7131 | sg = sg->next; | ||
7132 | next_sg: | ||
7133 | oldsg = sg; | ||
7134 | sg = sg->next; | ||
7135 | kfree(oldsg); | ||
7136 | if (oldsg != sched_group_nodes[i]) | ||
7137 | goto next_sg; | ||
7138 | } | 7031 | } |
7139 | kfree(sched_group_nodes); | 7032 | |
7140 | sched_group_nodes_bycpu[cpu] = NULL; | 7033 | if (!first) |
7034 | first = sg; | ||
7035 | if (last) | ||
7036 | last->next = sg; | ||
7037 | last = sg; | ||
7141 | } | 7038 | } |
7039 | last->next = first; | ||
7142 | } | 7040 | } |
7143 | #else /* !CONFIG_NUMA */ | ||
7144 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7145 | struct cpumask *nodemask) | ||
7146 | { | ||
7147 | } | ||
7148 | #endif /* CONFIG_NUMA */ | ||
7149 | 7041 | ||
7150 | /* | 7042 | /* |
7151 | * Initialize sched groups cpu_power. | 7043 | * Initialize sched groups cpu_power. |
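The big hunk above replaces the per-level group-building special cases with one generic build_sched_groups(): get_group() maps each CPU in the domain's span to a representative CPU (the first CPU of its child domain's span, whose per-cpu sched_group from sd_data is used), and the groups are chained into a circular list while a covered mask keeps each group from being emitted twice. The standalone sketch below reproduces that shape with integer CPU ids and bitmasks instead of the kernel's per-cpu structures; the pairing of CPUs into groups is invented for the example.

/* Sketch of the generic circular group-list construction (illustration only). */
#include <stdio.h>

#define NR_CPUS 4

struct sched_group {
    unsigned long cpumask;          /* CPUs in this group */
    struct sched_group *next;       /* circular list */
};

static struct sched_group groups[NR_CPUS];  /* ~ one per representative CPU (sd_data.sg) */

/* get_group(): pick the representative CPU; here the "child span" pairs CPUs
 * {0,1} and {2,3}, so the representative is the even CPU of each pair. */
static int get_group(int cpu, struct sched_group **sg)
{
    int rep = cpu & ~1;

    if (sg)
        *sg = &groups[rep];
    return rep;
}

static struct sched_group *build_sched_groups(unsigned long span)
{
    struct sched_group *first = NULL, *last = NULL;
    unsigned long covered = 0;

    for (int i = 0; i < NR_CPUS; i++) {
        if (!(span & (1UL << i)) || (covered & (1UL << i)))
            continue;

        struct sched_group *sg;
        int group = get_group(i, &sg);

        sg->cpumask = 0;

        /* Pull every CPU of the span that maps to the same group. */
        for (int j = 0; j < NR_CPUS; j++) {
            if (!(span & (1UL << j)) || get_group(j, NULL) != group)
                continue;
            covered |= 1UL << j;
            sg->cpumask |= 1UL << j;
        }

        if (!first)
            first = sg;
        if (last)
            last->next = sg;
        last = sg;
    }
    last->next = first;             /* close the ring */
    return first;
}

int main(void)
{
    struct sched_group *sg = build_sched_groups(0xfUL), *it = sg;

    do {
        printf("group mask %#lx\n", it->cpumask);
        it = it->next;
    } while (it != sg);
    return 0;
}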
@@ -7159,11 +7051,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
7159 | */ | 7051 | */ |
7160 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7052 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
7161 | { | 7053 | { |
7162 | struct sched_domain *child; | ||
7163 | struct sched_group *group; | ||
7164 | long power; | ||
7165 | int weight; | ||
7166 | |||
7167 | WARN_ON(!sd || !sd->groups); | 7054 | WARN_ON(!sd || !sd->groups); |
7168 | 7055 | ||
7169 | if (cpu != group_first_cpu(sd->groups)) | 7056 | if (cpu != group_first_cpu(sd->groups)) |
@@ -7171,36 +7058,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7171 | 7058 | ||
7172 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | 7059 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); |
7173 | 7060 | ||
7174 | child = sd->child; | 7061 | update_group_power(sd, cpu); |
7175 | |||
7176 | sd->groups->cpu_power = 0; | ||
7177 | |||
7178 | if (!child) { | ||
7179 | power = SCHED_LOAD_SCALE; | ||
7180 | weight = cpumask_weight(sched_domain_span(sd)); | ||
7181 | /* | ||
7182 | * SMT siblings share the power of a single core. | ||
7183 | * Usually multiple threads get a better yield out of | ||
7184 | * that one core than a single thread would have, | ||
7185 | * reflect that in sd->smt_gain. | ||
7186 | */ | ||
7187 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
7188 | power *= sd->smt_gain; | ||
7189 | power /= weight; | ||
7190 | power >>= SCHED_LOAD_SHIFT; | ||
7191 | } | ||
7192 | sd->groups->cpu_power += power; | ||
7193 | return; | ||
7194 | } | ||
7195 | |||
7196 | /* | ||
7197 | * Add cpu_power of each child group to this groups cpu_power. | ||
7198 | */ | ||
7199 | group = child->groups; | ||
7200 | do { | ||
7201 | sd->groups->cpu_power += group->cpu_power; | ||
7202 | group = group->next; | ||
7203 | } while (group != child->groups); | ||
7204 | } | 7062 | } |
7205 | 7063 | ||
7206 | /* | 7064 | /* |
@@ -7214,15 +7072,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7214 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7072 | # define SD_INIT_NAME(sd, type) do { } while (0) |
7215 | #endif | 7073 | #endif |
7216 | 7074 | ||
7217 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7075 | #define SD_INIT_FUNC(type) \ |
7218 | 7076 | static noinline struct sched_domain * \ | |
7219 | #define SD_INIT_FUNC(type) \ | 7077 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
7220 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7078 | { \ |
7221 | { \ | 7079 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
7222 | memset(sd, 0, sizeof(*sd)); \ | 7080 | *sd = SD_##type##_INIT; \ |
7223 | *sd = SD_##type##_INIT; \ | 7081 | SD_INIT_NAME(sd, type); \ |
7224 | sd->level = SD_LV_##type; \ | 7082 | sd->private = &tl->data; \ |
7225 | SD_INIT_NAME(sd, type); \ | 7083 | return sd; \ |
7226 | } | 7084 | } |
7227 | 7085 | ||
7228 | SD_INIT_FUNC(CPU) | 7086 | SD_INIT_FUNC(CPU) |
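SD_INIT_FUNC() now generates per-level constructors that fill in a sched_domain taken from the level's pre-allocated per-cpu storage and record tl->data in sd->private, rather than memset-ing a static per-cpu instance; together with cpu_cpu_mask()/cpu_node_mask() earlier, each sched_domain_topology_level reduces to an { init, mask } pair and domain construction becomes a walk over a table of levels. The fragment below is a rough model of that table-driven walk; the level names, masks and the two-level topology are invented, and the kernel's init functions and parent links are collapsed into a simple loop.

/* Table-driven domain construction: one { name, mask } entry per topology level. */
#include <stdio.h>

struct sched_domain {
    const char *name;
    unsigned long span;             /* CPUs covered at this level */
    struct sched_domain *child;     /* next-lower (smaller) level */
};

struct topology_level {
    const char *name;
    unsigned long (*mask)(int cpu); /* which CPUs share this level with 'cpu' */
};

/* Made-up masks for a 4-CPU box: 2 threads per core, one package. */
static unsigned long smt_mask(int cpu) { return 3UL << (cpu & ~1); }
static unsigned long cpu_mask(int cpu) { (void)cpu; return 0xfUL; }

static const struct topology_level topology[] = {
    { "SMT", smt_mask },            /* innermost level first, like the kernel table */
    { "CPU", cpu_mask },
};

int main(void)
{
    int cpu = 2;
    struct sched_domain levels[2];
    struct sched_domain *child = NULL;

    for (unsigned int i = 0; i < sizeof(topology) / sizeof(topology[0]); i++) {
        struct sched_domain *sd = &levels[i];

        sd->name = topology[i].name;
        sd->span = topology[i].mask(cpu);
        sd->child = child;          /* link the smaller level below this one */
        child = sd;
    }

    for (struct sched_domain *sd = child; sd; sd = sd->child)
        printf("CPU%d %s span=%#lx\n", cpu, sd->name, sd->span);
    return 0;
}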
@@ -7241,13 +7099,14 @@ SD_INIT_FUNC(CPU) | |||
7241 | #endif | 7099 | #endif |
7242 | 7100 | ||
7243 | static int default_relax_domain_level = -1; | 7101 | static int default_relax_domain_level = -1; |
7102 | int sched_domain_level_max; | ||
7244 | 7103 | ||
7245 | static int __init setup_relax_domain_level(char *str) | 7104 | static int __init setup_relax_domain_level(char *str) |
7246 | { | 7105 | { |
7247 | unsigned long val; | 7106 | unsigned long val; |
7248 | 7107 | ||
7249 | val = simple_strtoul(str, NULL, 0); | 7108 | val = simple_strtoul(str, NULL, 0); |
7250 | if (val < SD_LV_MAX) | 7109 | if (val < sched_domain_level_max) |
7251 | default_relax_domain_level = val; | 7110 | default_relax_domain_level = val; |
7252 | 7111 | ||
7253 | return 1; | 7112 | return 1; |
@@ -7275,37 +7134,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
7275 | } | 7134 | } |
7276 | } | 7135 | } |
7277 | 7136 | ||
7137 | static void __sdt_free(const struct cpumask *cpu_map); | ||
7138 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
7139 | |||
7278 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7140 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
7279 | const struct cpumask *cpu_map) | 7141 | const struct cpumask *cpu_map) |
7280 | { | 7142 | { |
7281 | switch (what) { | 7143 | switch (what) { |
7282 | case sa_sched_groups: | ||
7283 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
7284 | d->sched_group_nodes = NULL; | ||
7285 | case sa_rootdomain: | 7144 | case sa_rootdomain: |
7286 | free_rootdomain(d->rd); /* fall through */ | 7145 | if (!atomic_read(&d->rd->refcount)) |
7287 | case sa_tmpmask: | 7146 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7288 | free_cpumask_var(d->tmpmask); /* fall through */ | 7147 | case sa_sd: |
7289 | case sa_send_covered: | 7148 | free_percpu(d->sd); /* fall through */ |
7290 | free_cpumask_var(d->send_covered); /* fall through */ | 7149 | case sa_sd_storage: |
7291 | case sa_this_book_map: | 7150 | __sdt_free(cpu_map); /* fall through */ |
7292 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
7293 | case sa_this_core_map: | ||
7294 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
7295 | case sa_this_sibling_map: | ||
7296 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
7297 | case sa_nodemask: | ||
7298 | free_cpumask_var(d->nodemask); /* fall through */ | ||
7299 | case sa_sched_group_nodes: | ||
7300 | #ifdef CONFIG_NUMA | ||
7301 | kfree(d->sched_group_nodes); /* fall through */ | ||
7302 | case sa_notcovered: | ||
7303 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7304 | case sa_covered: | ||
7305 | free_cpumask_var(d->covered); /* fall through */ | ||
7306 | case sa_domainspan: | ||
7307 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7308 | #endif | ||
7309 | case sa_none: | 7151 | case sa_none: |
7310 | break; | 7152 | break; |
7311 | } | 7153 | } |
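__visit_domain_allocation_hell() and __free_domain_allocs() keep the staged-teardown idiom: the allocator returns an enum naming how far it got, and the free side is a switch whose cases deliberately fall through so a failure at any stage unwinds exactly what was allocated before it, only now with three stages (the sd/sg storage, the per-cpu sd pointer array, the root domain) instead of a pile of cpumasks. A generic, self-contained illustration of the idiom, with plain malloc()/free() standing in for the kernel allocators:

/* The "return how far we got, fall through to unwind" allocation idiom. */
#include <stdio.h>
#include <stdlib.h>

enum s_alloc { sa_rootdomain, sa_sd, sa_sd_storage, sa_none };

struct s_data {
    void *storage;  /* ~ the per-level sd/sg arrays set up by __sdt_alloc() */
    void *sd;       /* ~ the per-cpu struct sched_domain * array */
    void *rd;       /* ~ the root domain */
};

static void free_domain_allocs(struct s_data *d, enum s_alloc what)
{
    switch (what) {
    case sa_rootdomain:
        free(d->rd);        /* fall through */
    case sa_sd:
        free(d->sd);        /* fall through */
    case sa_sd_storage:
        free(d->storage);   /* fall through */
    case sa_none:
        break;
    }
}

static enum s_alloc visit_domain_allocation_hell(struct s_data *d)
{
    d->storage = malloc(64);
    if (!d->storage)
        return sa_none;             /* nothing to unwind */
    d->sd = malloc(64);
    if (!d->sd)
        return sa_sd_storage;       /* unwind storage only */
    d->rd = malloc(64);
    if (!d->rd)
        return sa_sd;               /* unwind sd + storage */
    return sa_rootdomain;           /* everything allocated */
}

int main(void)
{
    struct s_data d = { 0 };
    enum s_alloc state = visit_domain_allocation_hell(&d);

    if (state != sa_rootdomain) {
        free_domain_allocs(&d, state);
        return 1;
    }
    printf("allocated, tearing down\n");
    free_domain_allocs(&d, sa_rootdomain);
    return 0;
}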
@@ -7314,308 +7156,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7314 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7156 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7315 | const struct cpumask *cpu_map) | 7157 | const struct cpumask *cpu_map) |
7316 | { | 7158 | { |
7317 | #ifdef CONFIG_NUMA | 7159 | memset(d, 0, sizeof(*d)); |
7318 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7160 | |
7319 | return sa_none; | 7161 | if (__sdt_alloc(cpu_map)) |
7320 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7162 | return sa_sd_storage; |
7321 | return sa_domainspan; | 7163 | d->sd = alloc_percpu(struct sched_domain *); |
7322 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7164 | if (!d->sd) |
7323 | return sa_covered; | 7165 | return sa_sd_storage; |
7324 | /* Allocate the per-node list of sched groups */ | ||
7325 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7326 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7327 | if (!d->sched_group_nodes) { | ||
7328 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7329 | return sa_notcovered; | ||
7330 | } | ||
7331 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7332 | #endif | ||
7333 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
7334 | return sa_sched_group_nodes; | ||
7335 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
7336 | return sa_nodemask; | ||
7337 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
7338 | return sa_this_sibling_map; | ||
7339 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) | ||
7340 | return sa_this_core_map; | ||
7341 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7342 | return sa_this_book_map; | ||
7343 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
7344 | return sa_send_covered; | ||
7345 | d->rd = alloc_rootdomain(); | 7166 | d->rd = alloc_rootdomain(); |
7346 | if (!d->rd) { | 7167 | if (!d->rd) |
7347 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7168 | return sa_sd; |
7348 | return sa_tmpmask; | ||
7349 | } | ||
7350 | return sa_rootdomain; | 7169 | return sa_rootdomain; |
7351 | } | 7170 | } |
7352 | 7171 | ||
7353 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7172 | /* |
7354 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7173 | * NULL the sd_data elements we've used to build the sched_domain and |
7174 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
7175 | * will not free the data we're using. | ||
7176 | */ | ||
7177 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
7355 | { | 7178 | { |
7356 | struct sched_domain *sd = NULL; | 7179 | struct sd_data *sdd = sd->private; |
7357 | #ifdef CONFIG_NUMA | 7180 | struct sched_group *sg = sd->groups; |
7358 | struct sched_domain *parent; | ||
7359 | |||
7360 | d->sd_allnodes = 0; | ||
7361 | if (cpumask_weight(cpu_map) > | ||
7362 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
7363 | sd = &per_cpu(allnodes_domains, i).sd; | ||
7364 | SD_INIT(sd, ALLNODES); | ||
7365 | set_domain_attribute(sd, attr); | ||
7366 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
7367 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7368 | d->sd_allnodes = 1; | ||
7369 | } | ||
7370 | parent = sd; | ||
7371 | |||
7372 | sd = &per_cpu(node_domains, i).sd; | ||
7373 | SD_INIT(sd, NODE); | ||
7374 | set_domain_attribute(sd, attr); | ||
7375 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
7376 | sd->parent = parent; | ||
7377 | if (parent) | ||
7378 | parent->child = sd; | ||
7379 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
7380 | #endif | ||
7381 | return sd; | ||
7382 | } | ||
7383 | 7181 | ||
7384 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7182 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7385 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7183 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7386 | struct sched_domain *parent, int i) | ||
7387 | { | ||
7388 | struct sched_domain *sd; | ||
7389 | sd = &per_cpu(phys_domains, i).sd; | ||
7390 | SD_INIT(sd, CPU); | ||
7391 | set_domain_attribute(sd, attr); | ||
7392 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
7393 | sd->parent = parent; | ||
7394 | if (parent) | ||
7395 | parent->child = sd; | ||
7396 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7397 | return sd; | ||
7398 | } | ||
7399 | 7184 | ||
7400 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | 7185 | if (cpu == cpumask_first(sched_group_cpus(sg))) { |
7401 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7186 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); |
7402 | struct sched_domain *parent, int i) | 7187 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7403 | { | 7188 | } |
7404 | struct sched_domain *sd = parent; | ||
7405 | #ifdef CONFIG_SCHED_BOOK | ||
7406 | sd = &per_cpu(book_domains, i).sd; | ||
7407 | SD_INIT(sd, BOOK); | ||
7408 | set_domain_attribute(sd, attr); | ||
7409 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
7410 | sd->parent = parent; | ||
7411 | parent->child = sd; | ||
7412 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7413 | #endif | ||
7414 | return sd; | ||
7415 | } | 7189 | } |
7416 | 7190 | ||
7417 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7191 | #ifdef CONFIG_SCHED_SMT |
7418 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7192 | static const struct cpumask *cpu_smt_mask(int cpu) |
7419 | struct sched_domain *parent, int i) | ||
7420 | { | 7193 | { |
7421 | struct sched_domain *sd = parent; | 7194 | return topology_thread_cpumask(cpu); |
7422 | #ifdef CONFIG_SCHED_MC | ||
7423 | sd = &per_cpu(core_domains, i).sd; | ||
7424 | SD_INIT(sd, MC); | ||
7425 | set_domain_attribute(sd, attr); | ||
7426 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
7427 | sd->parent = parent; | ||
7428 | parent->child = sd; | ||
7429 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7430 | #endif | ||
7431 | return sd; | ||
7432 | } | 7195 | } |
7433 | |||
7434 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
7435 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7436 | struct sched_domain *parent, int i) | ||
7437 | { | ||
7438 | struct sched_domain *sd = parent; | ||
7439 | #ifdef CONFIG_SCHED_SMT | ||
7440 | sd = &per_cpu(cpu_domains, i).sd; | ||
7441 | SD_INIT(sd, SIBLING); | ||
7442 | set_domain_attribute(sd, attr); | ||
7443 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
7444 | sd->parent = parent; | ||
7445 | parent->child = sd; | ||
7446 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7447 | #endif | 7196 | #endif |
7448 | return sd; | ||
7449 | } | ||
7450 | 7197 | ||
7451 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7198 | /* |
7452 | const struct cpumask *cpu_map, int cpu) | 7199 | * Topology list, bottom-up. |
7453 | { | 7200 | */ |
7454 | switch (l) { | 7201 | static struct sched_domain_topology_level default_topology[] = { |
7455 | #ifdef CONFIG_SCHED_SMT | 7202 | #ifdef CONFIG_SCHED_SMT |
7456 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7203 | { sd_init_SIBLING, cpu_smt_mask, }, |
7457 | cpumask_and(d->this_sibling_map, cpu_map, | ||
7458 | topology_thread_cpumask(cpu)); | ||
7459 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
7460 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
7461 | &cpu_to_cpu_group, | ||
7462 | d->send_covered, d->tmpmask); | ||
7463 | break; | ||
7464 | #endif | 7204 | #endif |
7465 | #ifdef CONFIG_SCHED_MC | 7205 | #ifdef CONFIG_SCHED_MC |
7466 | case SD_LV_MC: /* set up multi-core groups */ | 7206 | { sd_init_MC, cpu_coregroup_mask, }, |
7467 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
7468 | if (cpu == cpumask_first(d->this_core_map)) | ||
7469 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
7470 | &cpu_to_core_group, | ||
7471 | d->send_covered, d->tmpmask); | ||
7472 | break; | ||
7473 | #endif | 7207 | #endif |
7474 | #ifdef CONFIG_SCHED_BOOK | 7208 | #ifdef CONFIG_SCHED_BOOK |
7475 | case SD_LV_BOOK: /* set up book groups */ | 7209 | { sd_init_BOOK, cpu_book_mask, }, |
7476 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
7477 | if (cpu == cpumask_first(d->this_book_map)) | ||
7478 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
7479 | &cpu_to_book_group, | ||
7480 | d->send_covered, d->tmpmask); | ||
7481 | break; | ||
7482 | #endif | 7210 | #endif |
7483 | case SD_LV_CPU: /* set up physical groups */ | 7211 | { sd_init_CPU, cpu_cpu_mask, }, |
7484 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | ||
7485 | if (!cpumask_empty(d->nodemask)) | ||
7486 | init_sched_build_groups(d->nodemask, cpu_map, | ||
7487 | &cpu_to_phys_group, | ||
7488 | d->send_covered, d->tmpmask); | ||
7489 | break; | ||
7490 | #ifdef CONFIG_NUMA | 7212 | #ifdef CONFIG_NUMA |
7491 | case SD_LV_ALLNODES: | 7213 | { sd_init_NODE, cpu_node_mask, }, |
7492 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7214 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7493 | d->send_covered, d->tmpmask); | ||
7494 | break; | ||
7495 | #endif | 7215 | #endif |
7496 | default: | 7216 | { NULL, }, |
7497 | break; | 7217 | }; |
7218 | |||
7219 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
7220 | |||
7221 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
7222 | { | ||
7223 | struct sched_domain_topology_level *tl; | ||
7224 | int j; | ||
7225 | |||
7226 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7227 | struct sd_data *sdd = &tl->data; | ||
7228 | |||
7229 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
7230 | if (!sdd->sd) | ||
7231 | return -ENOMEM; | ||
7232 | |||
7233 | sdd->sg = alloc_percpu(struct sched_group *); | ||
7234 | if (!sdd->sg) | ||
7235 | return -ENOMEM; | ||
7236 | |||
7237 | for_each_cpu(j, cpu_map) { | ||
7238 | struct sched_domain *sd; | ||
7239 | struct sched_group *sg; | ||
7240 | |||
7241 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
7242 | GFP_KERNEL, cpu_to_node(j)); | ||
7243 | if (!sd) | ||
7244 | return -ENOMEM; | ||
7245 | |||
7246 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
7247 | |||
7248 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7249 | GFP_KERNEL, cpu_to_node(j)); | ||
7250 | if (!sg) | ||
7251 | return -ENOMEM; | ||
7252 | |||
7253 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
7254 | } | ||
7498 | } | 7255 | } |
7256 | |||
7257 | return 0; | ||
7258 | } | ||
7259 | |||
7260 | static void __sdt_free(const struct cpumask *cpu_map) | ||
7261 | { | ||
7262 | struct sched_domain_topology_level *tl; | ||
7263 | int j; | ||
7264 | |||
7265 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7266 | struct sd_data *sdd = &tl->data; | ||
7267 | |||
7268 | for_each_cpu(j, cpu_map) { | ||
7269 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
7270 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7271 | } | ||
7272 | free_percpu(sdd->sd); | ||
7273 | free_percpu(sdd->sg); | ||
7274 | } | ||
7275 | } | ||
7276 | |||
7277 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7278 | struct s_data *d, const struct cpumask *cpu_map, | ||
7279 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
7280 | int cpu) | ||
7281 | { | ||
7282 | struct sched_domain *sd = tl->init(tl, cpu); | ||
7283 | if (!sd) | ||
7284 | return child; | ||
7285 | |||
7286 | set_domain_attribute(sd, attr); | ||
7287 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
7288 | if (child) { | ||
7289 | sd->level = child->level + 1; | ||
7290 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7291 | child->parent = sd; | ||
7292 | } | ||
7293 | sd->child = child; | ||
7294 | |||
7295 | return sd; | ||
7499 | } | 7296 | } |
7500 | 7297 | ||
7501 | /* | 7298 | /* |
7502 | * Build sched domains for a given set of cpus and attach the sched domains | 7299 | * Build sched domains for a given set of cpus and attach the sched domains |
7503 | * to the individual cpus | 7300 | * to the individual cpus |
7504 | */ | 7301 | */ |
7505 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7302 | static int build_sched_domains(const struct cpumask *cpu_map, |
7506 | struct sched_domain_attr *attr) | 7303 | struct sched_domain_attr *attr) |
7507 | { | 7304 | { |
7508 | enum s_alloc alloc_state = sa_none; | 7305 | enum s_alloc alloc_state = sa_none; |
7509 | struct s_data d; | ||
7510 | struct sched_domain *sd; | 7306 | struct sched_domain *sd; |
7511 | int i; | 7307 | struct s_data d; |
7512 | #ifdef CONFIG_NUMA | 7308 | int i, ret = -ENOMEM; |
7513 | d.sd_allnodes = 0; | ||
7514 | #endif | ||
7515 | 7309 | ||
7516 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7310 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7517 | if (alloc_state != sa_rootdomain) | 7311 | if (alloc_state != sa_rootdomain) |
7518 | goto error; | 7312 | goto error; |
7519 | alloc_state = sa_sched_groups; | ||
7520 | 7313 | ||
7521 | /* | 7314 | /* Set up domains for cpus specified by the cpu_map. */ |
7522 | * Set up domains for cpus specified by the cpu_map. | ||
7523 | */ | ||
7524 | for_each_cpu(i, cpu_map) { | 7315 | for_each_cpu(i, cpu_map) { |
7525 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | 7316 | struct sched_domain_topology_level *tl; |
7526 | cpu_map); | ||
7527 | 7317 | ||
7528 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7318 | sd = NULL; |
7529 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7319 | for (tl = sched_domain_topology; tl->init; tl++) |
7530 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | 7320 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7531 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
7532 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
7533 | } | ||
7534 | 7321 | ||
7535 | for_each_cpu(i, cpu_map) { | 7322 | while (sd->child) |
7536 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7323 | sd = sd->child; |
7537 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
7538 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
7539 | } | ||
7540 | 7324 | ||
7541 | /* Set up physical groups */ | 7325 | *per_cpu_ptr(d.sd, i) = sd; |
7542 | for (i = 0; i < nr_node_ids; i++) | ||
7543 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | ||
7544 | |||
7545 | #ifdef CONFIG_NUMA | ||
7546 | /* Set up node groups */ | ||
7547 | if (d.sd_allnodes) | ||
7548 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
7549 | |||
7550 | for (i = 0; i < nr_node_ids; i++) | ||
7551 | if (build_numa_sched_groups(&d, cpu_map, i)) | ||
7552 | goto error; | ||
7553 | #endif | ||
7554 | |||
7555 | /* Calculate CPU power for physical packages and nodes */ | ||
7556 | #ifdef CONFIG_SCHED_SMT | ||
7557 | for_each_cpu(i, cpu_map) { | ||
7558 | sd = &per_cpu(cpu_domains, i).sd; | ||
7559 | init_sched_groups_power(i, sd); | ||
7560 | } | ||
7561 | #endif | ||
7562 | #ifdef CONFIG_SCHED_MC | ||
7563 | for_each_cpu(i, cpu_map) { | ||
7564 | sd = &per_cpu(core_domains, i).sd; | ||
7565 | init_sched_groups_power(i, sd); | ||
7566 | } | ||
7567 | #endif | ||
7568 | #ifdef CONFIG_SCHED_BOOK | ||
7569 | for_each_cpu(i, cpu_map) { | ||
7570 | sd = &per_cpu(book_domains, i).sd; | ||
7571 | init_sched_groups_power(i, sd); | ||
7572 | } | 7326 | } |
7573 | #endif | ||
7574 | 7327 | ||
7328 | /* Build the groups for the domains */ | ||
7575 | for_each_cpu(i, cpu_map) { | 7329 | for_each_cpu(i, cpu_map) { |
7576 | sd = &per_cpu(phys_domains, i).sd; | 7330 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7577 | init_sched_groups_power(i, sd); | 7331 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7578 | } | 7332 | get_group(i, sd->private, &sd->groups); |
7333 | atomic_inc(&sd->groups->ref); | ||
7579 | 7334 | ||
7580 | #ifdef CONFIG_NUMA | 7335 | if (i != cpumask_first(sched_domain_span(sd))) |
7581 | for (i = 0; i < nr_node_ids; i++) | 7336 | continue; |
7582 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | ||
7583 | 7337 | ||
7584 | if (d.sd_allnodes) { | 7338 | build_sched_groups(sd); |
7585 | struct sched_group *sg; | 7339 | } |
7340 | } | ||
7341 | |||
7342 | /* Calculate CPU power for physical packages and nodes */ | ||
7343 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
7344 | if (!cpumask_test_cpu(i, cpu_map)) | ||
7345 | continue; | ||
7586 | 7346 | ||
7587 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7347 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7588 | d.tmpmask); | 7348 | claim_allocations(i, sd); |
7589 | init_numa_sched_groups_power(sg); | 7349 | init_sched_groups_power(i, sd); |
7350 | } | ||
7590 | } | 7351 | } |
7591 | #endif | ||
7592 | 7352 | ||
7593 | /* Attach the domains */ | 7353 | /* Attach the domains */ |
7354 | rcu_read_lock(); | ||
7594 | for_each_cpu(i, cpu_map) { | 7355 | for_each_cpu(i, cpu_map) { |
7595 | #ifdef CONFIG_SCHED_SMT | 7356 | sd = *per_cpu_ptr(d.sd, i); |
7596 | sd = &per_cpu(cpu_domains, i).sd; | ||
7597 | #elif defined(CONFIG_SCHED_MC) | ||
7598 | sd = &per_cpu(core_domains, i).sd; | ||
7599 | #elif defined(CONFIG_SCHED_BOOK) | ||
7600 | sd = &per_cpu(book_domains, i).sd; | ||
7601 | #else | ||
7602 | sd = &per_cpu(phys_domains, i).sd; | ||
7603 | #endif | ||
7604 | cpu_attach_domain(sd, d.rd, i); | 7357 | cpu_attach_domain(sd, d.rd, i); |
7605 | } | 7358 | } |
7359 | rcu_read_unlock(); | ||
7606 | 7360 | ||
7607 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7361 | ret = 0; |
7608 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
7609 | return 0; | ||
7610 | |||
7611 | error: | 7362 | error: |
7612 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7363 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7613 | return -ENOMEM; | 7364 | return ret; |
7614 | } | ||
7615 | |||
7616 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
7617 | { | ||
7618 | return __build_sched_domains(cpu_map, NULL); | ||
7619 | } | 7365 | } |
7620 | 7366 | ||
7621 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7367 | static cpumask_var_t *doms_cur; /* current sched domains */ |
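The heart of the hunk above is that domain construction is now table-driven: build_sched_domains() walks a NULL-terminated array of topology levels bottom-up and chains one sched_domain per level through parent/child pointers, replacing the hand-rolled __build_*_sched_domain() helpers. A compressed standalone sketch of that walk, with simplified stand-ins for sched_domain_topology_level and sd_data:

/* Standalone sketch of the table-driven, bottom-up domain walk in
 * build_sched_domains(). Types and names are simplified stand-ins. */
#include <stdio.h>
#include <stdlib.h>

struct domain {
    const char *name;
    struct domain *parent, *child;
    int level;
};

struct topology_level {
    const char *name;   /* stands in for tl->init and tl->mask */
};

/* Bottom-up and NULL-terminated, like default_topology[]. */
static struct topology_level topology[] = {
    { "SIBLING" }, { "MC" }, { "CPU" }, { "NODE" }, { NULL },
};

static struct domain *build_domain(struct topology_level *tl,
                                   struct domain *child)
{
    struct domain *d = calloc(1, sizeof(*d));

    d->name = tl->name;
    d->child = child;
    if (child) {
        d->level = child->level + 1;
        child->parent = d;
    }
    return d;
}

int main(void)
{
    struct topology_level *tl;
    struct domain *sd = NULL;

    for (tl = topology; tl->name; tl++)
        sd = build_domain(tl, sd);

    /* Walk back down to the lowest level, as build_sched_domains() does
     * before storing the per-cpu base domain. */
    while (sd->child)
        sd = sd->child;

    for (; sd; sd = sd->parent)
        printf("level %d: %s\n", sd->level, sd->name);
    return 0;
}

Running the sketch prints the chain from SIBLING up to NODE, mirroring the bottom-up order of default_topology[] and the level numbering that feeds sched_domain_level_max.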
@@ -7670,7 +7416,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
7670 | * For now this just excludes isolated cpus, but could be used to | 7416 | * For now this just excludes isolated cpus, but could be used to |
7671 | * exclude other special cases in the future. | 7417 | * exclude other special cases in the future. |
7672 | */ | 7418 | */ |
7673 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7419 | static int init_sched_domains(const struct cpumask *cpu_map) |
7674 | { | 7420 | { |
7675 | int err; | 7421 | int err; |
7676 | 7422 | ||
@@ -7681,32 +7427,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
7681 | doms_cur = &fallback_doms; | 7427 | doms_cur = &fallback_doms; |
7682 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7428 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7683 | dattr_cur = NULL; | 7429 | dattr_cur = NULL; |
7684 | err = build_sched_domains(doms_cur[0]); | 7430 | err = build_sched_domains(doms_cur[0], NULL); |
7685 | register_sched_domain_sysctl(); | 7431 | register_sched_domain_sysctl(); |
7686 | 7432 | ||
7687 | return err; | 7433 | return err; |
7688 | } | 7434 | } |
7689 | 7435 | ||
7690 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
7691 | struct cpumask *tmpmask) | ||
7692 | { | ||
7693 | free_sched_groups(cpu_map, tmpmask); | ||
7694 | } | ||
7695 | |||
7696 | /* | 7436 | /* |
7697 | * Detach sched domains from a group of cpus specified in cpu_map | 7437 | * Detach sched domains from a group of cpus specified in cpu_map |
7698 | * These cpus will now be attached to the NULL domain | 7438 | * These cpus will now be attached to the NULL domain |
7699 | */ | 7439 | */ |
7700 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7440 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7701 | { | 7441 | { |
7702 | /* Save because hotplug lock held. */ | ||
7703 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
7704 | int i; | 7442 | int i; |
7705 | 7443 | ||
7444 | rcu_read_lock(); | ||
7706 | for_each_cpu(i, cpu_map) | 7445 | for_each_cpu(i, cpu_map) |
7707 | cpu_attach_domain(NULL, &def_root_domain, i); | 7446 | cpu_attach_domain(NULL, &def_root_domain, i); |
7708 | synchronize_sched(); | 7447 | rcu_read_unlock(); |
7709 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
7710 | } | 7448 | } |
7711 | 7449 | ||
7712 | /* handle null as "default" */ | 7450 | /* handle null as "default" */ |
@@ -7795,8 +7533,7 @@ match1: | |||
7795 | goto match2; | 7533 | goto match2; |
7796 | } | 7534 | } |
7797 | /* no match - add a new doms_new */ | 7535 | /* no match - add a new doms_new */ |
7798 | __build_sched_domains(doms_new[i], | 7536 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7799 | dattr_new ? dattr_new + i : NULL); | ||
7800 | match2: | 7537 | match2: |
7801 | ; | 7538 | ; |
7802 | } | 7539 | } |
@@ -7815,7 +7552,7 @@ match2: | |||
7815 | } | 7552 | } |
7816 | 7553 | ||
7817 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7554 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7818 | static void arch_reinit_sched_domains(void) | 7555 | static void reinit_sched_domains(void) |
7819 | { | 7556 | { |
7820 | get_online_cpus(); | 7557 | get_online_cpus(); |
7821 | 7558 | ||
@@ -7848,7 +7585,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7848 | else | 7585 | else |
7849 | sched_mc_power_savings = level; | 7586 | sched_mc_power_savings = level; |
7850 | 7587 | ||
7851 | arch_reinit_sched_domains(); | 7588 | reinit_sched_domains(); |
7852 | 7589 | ||
7853 | return count; | 7590 | return count; |
7854 | } | 7591 | } |
@@ -7967,14 +7704,9 @@ void __init sched_init_smp(void) | |||
7967 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7704 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7968 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7705 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7969 | 7706 | ||
7970 | #if defined(CONFIG_NUMA) | ||
7971 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7972 | GFP_KERNEL); | ||
7973 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7974 | #endif | ||
7975 | get_online_cpus(); | 7707 | get_online_cpus(); |
7976 | mutex_lock(&sched_domains_mutex); | 7708 | mutex_lock(&sched_domains_mutex); |
7977 | arch_init_sched_domains(cpu_active_mask); | 7709 | init_sched_domains(cpu_active_mask); |
7978 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7710 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7979 | if (cpumask_empty(non_isolated_cpus)) | 7711 | if (cpumask_empty(non_isolated_cpus)) |
7980 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7712 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -8025,6 +7757,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
8025 | #endif | 7757 | #endif |
8026 | #endif | 7758 | #endif |
8027 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7759 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
7760 | #ifndef CONFIG_64BIT | ||
7761 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
7762 | #endif | ||
8028 | } | 7763 | } |
8029 | 7764 | ||
8030 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 7765 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) |
@@ -8224,7 +7959,7 @@ void __init sched_init(void) | |||
8224 | #ifdef CONFIG_SMP | 7959 | #ifdef CONFIG_SMP |
8225 | rq->sd = NULL; | 7960 | rq->sd = NULL; |
8226 | rq->rd = NULL; | 7961 | rq->rd = NULL; |
8227 | rq->cpu_power = SCHED_LOAD_SCALE; | 7962 | rq->cpu_power = SCHED_POWER_SCALE; |
8228 | rq->post_schedule = 0; | 7963 | rq->post_schedule = 0; |
8229 | rq->active_balance = 0; | 7964 | rq->active_balance = 0; |
8230 | rq->next_balance = jiffies; | 7965 | rq->next_balance = jiffies; |
@@ -8281,6 +8016,7 @@ void __init sched_init(void) | |||
8281 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 8016 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
8282 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8017 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
8283 | #ifdef CONFIG_SMP | 8018 | #ifdef CONFIG_SMP |
8019 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
8284 | #ifdef CONFIG_NO_HZ | 8020 | #ifdef CONFIG_NO_HZ |
8285 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 8021 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
8286 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 8022 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
@@ -8340,7 +8076,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8340 | int old_prio = p->prio; | 8076 | int old_prio = p->prio; |
8341 | int on_rq; | 8077 | int on_rq; |
8342 | 8078 | ||
8343 | on_rq = p->se.on_rq; | 8079 | on_rq = p->on_rq; |
8344 | if (on_rq) | 8080 | if (on_rq) |
8345 | deactivate_task(rq, p, 0); | 8081 | deactivate_task(rq, p, 0); |
8346 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8082 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
@@ -8553,7 +8289,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8553 | { | 8289 | { |
8554 | struct rt_rq *rt_rq; | 8290 | struct rt_rq *rt_rq; |
8555 | struct sched_rt_entity *rt_se; | 8291 | struct sched_rt_entity *rt_se; |
8556 | struct rq *rq; | ||
8557 | int i; | 8292 | int i; |
8558 | 8293 | ||
8559 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8294 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8567,8 +8302,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8567 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8302 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8568 | 8303 | ||
8569 | for_each_possible_cpu(i) { | 8304 | for_each_possible_cpu(i) { |
8570 | rq = cpu_rq(i); | ||
8571 | |||
8572 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8305 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8573 | GFP_KERNEL, cpu_to_node(i)); | 8306 | GFP_KERNEL, cpu_to_node(i)); |
8574 | if (!rt_rq) | 8307 | if (!rt_rq) |
@@ -8683,7 +8416,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8683 | rq = task_rq_lock(tsk, &flags); | 8416 | rq = task_rq_lock(tsk, &flags); |
8684 | 8417 | ||
8685 | running = task_current(rq, tsk); | 8418 | running = task_current(rq, tsk); |
8686 | on_rq = tsk->se.on_rq; | 8419 | on_rq = tsk->on_rq; |
8687 | 8420 | ||
8688 | if (on_rq) | 8421 | if (on_rq) |
8689 | dequeue_task(rq, tsk, 0); | 8422 | dequeue_task(rq, tsk, 0); |
@@ -8702,7 +8435,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8702 | if (on_rq) | 8435 | if (on_rq) |
8703 | enqueue_task(rq, tsk, 0); | 8436 | enqueue_task(rq, tsk, 0); |
8704 | 8437 | ||
8705 | task_rq_unlock(rq, &flags); | 8438 | task_rq_unlock(rq, tsk, &flags); |
8706 | } | 8439 | } |
8707 | #endif /* CONFIG_CGROUP_SCHED */ | 8440 | #endif /* CONFIG_CGROUP_SCHED */ |
8708 | 8441 | ||
@@ -8720,10 +8453,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8720 | if (!tg->se[0]) | 8453 | if (!tg->se[0]) |
8721 | return -EINVAL; | 8454 | return -EINVAL; |
8722 | 8455 | ||
8723 | if (shares < MIN_SHARES) | 8456 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); |
8724 | shares = MIN_SHARES; | ||
8725 | else if (shares > MAX_SHARES) | ||
8726 | shares = MAX_SHARES; | ||
8727 | 8457 | ||
8728 | mutex_lock(&shares_mutex); | 8458 | mutex_lock(&shares_mutex); |
8729 | if (tg->shares == shares) | 8459 | if (tg->shares == shares) |
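The open-coded min/max bounding of shares is folded into clamp(), and the bounds are applied in the scaled representation produced by scale_load(); the cpu.shares read/write hunks further down convert in and out of that representation. A small sketch of the round trip, assuming scale_load() is a left shift by a resolution constant and using illustrative MIN_SHARES/MAX_SHARES values (both are assumptions, not read from this diff):

/* Sketch of the clamp() + scale_load() combination. The shift width and the
 * MIN/MAX values are illustrative assumptions, not taken from the tree. */
#include <stdio.h>

#define SCHED_LOAD_RESOLUTION   10      /* assumed extra resolution bits */
#define scale_load(w)           ((unsigned long)(w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w)      ((unsigned long)(w) >> SCHED_LOAD_RESOLUTION)

#define MIN_SHARES      2UL             /* illustrative */
#define MAX_SHARES      (1UL << 18)     /* illustrative */

static unsigned long clamp_ul(unsigned long v, unsigned long lo,
                              unsigned long hi)
{
    return v < lo ? lo : v > hi ? hi : v;   /* what clamp() evaluates to */
}

int main(void)
{
    unsigned long user = 1024;          /* value written via cpu.shares */
    unsigned long shares;

    /* write path: scale up, then clamp within the scaled domain */
    shares = clamp_ul(scale_load(user),
                      scale_load(MIN_SHARES), scale_load(MAX_SHARES));

    /* read path: scale back down so userspace sees the same units */
    printf("stored=%lu reported=%lu\n", shares, scale_load_down(shares));
    return 0;
}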
@@ -9073,42 +8803,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
9073 | return 0; | 8803 | return 0; |
9074 | } | 8804 | } |
9075 | 8805 | ||
9076 | static int | ||
9077 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
9078 | struct task_struct *tsk, bool threadgroup) | ||
9079 | { | ||
9080 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | ||
9081 | if (retval) | ||
9082 | return retval; | ||
9083 | if (threadgroup) { | ||
9084 | struct task_struct *c; | ||
9085 | rcu_read_lock(); | ||
9086 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
9087 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
9088 | if (retval) { | ||
9089 | rcu_read_unlock(); | ||
9090 | return retval; | ||
9091 | } | ||
9092 | } | ||
9093 | rcu_read_unlock(); | ||
9094 | } | ||
9095 | return 0; | ||
9096 | } | ||
9097 | |||
9098 | static void | 8806 | static void |
9099 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 8807 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
9100 | struct cgroup *old_cont, struct task_struct *tsk, | ||
9101 | bool threadgroup) | ||
9102 | { | 8808 | { |
9103 | sched_move_task(tsk); | 8809 | sched_move_task(tsk); |
9104 | if (threadgroup) { | ||
9105 | struct task_struct *c; | ||
9106 | rcu_read_lock(); | ||
9107 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
9108 | sched_move_task(c); | ||
9109 | } | ||
9110 | rcu_read_unlock(); | ||
9111 | } | ||
9112 | } | 8810 | } |
9113 | 8811 | ||
9114 | static void | 8812 | static void |
@@ -9130,14 +8828,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
9130 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 8828 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
9131 | u64 shareval) | 8829 | u64 shareval) |
9132 | { | 8830 | { |
9133 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 8831 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); |
9134 | } | 8832 | } |
9135 | 8833 | ||
9136 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 8834 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
9137 | { | 8835 | { |
9138 | struct task_group *tg = cgroup_tg(cgrp); | 8836 | struct task_group *tg = cgroup_tg(cgrp); |
9139 | 8837 | ||
9140 | return (u64) tg->shares; | 8838 | return (u64) scale_load_down(tg->shares); |
9141 | } | 8839 | } |
9142 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8840 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9143 | 8841 | ||
@@ -9196,8 +8894,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9196 | .name = "cpu", | 8894 | .name = "cpu", |
9197 | .create = cpu_cgroup_create, | 8895 | .create = cpu_cgroup_create, |
9198 | .destroy = cpu_cgroup_destroy, | 8896 | .destroy = cpu_cgroup_destroy, |
9199 | .can_attach = cpu_cgroup_can_attach, | 8897 | .can_attach_task = cpu_cgroup_can_attach_task, |
9200 | .attach = cpu_cgroup_attach, | 8898 | .attach_task = cpu_cgroup_attach_task, |
9201 | .exit = cpu_cgroup_exit, | 8899 | .exit = cpu_cgroup_exit, |
9202 | .populate = cpu_cgroup_populate, | 8900 | .populate = cpu_cgroup_populate, |
9203 | .subsys_id = cpu_cgroup_subsys_id, | 8901 | .subsys_id = cpu_cgroup_subsys_id, |
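The cgroup hunks above drop the subsystem-side thread-group loop: rather than one can_attach/attach callback that optionally iterates every thread itself, the cpu controller now exposes per-task can_attach_task/attach_task hooks and lets the cgroup core drive the iteration. A schematic plain-C sketch of that inversion of control (a model, not the cgroup_subsys API):

/* Schematic sketch of moving thread-group iteration out of the subsystem:
 * the core walks the tasks and invokes per-task callbacks. Plain C model,
 * not the cgroup_subsys interface. */
#include <stdio.h>

struct task { const char *comm; };

struct subsys_ops {
    int  (*can_attach_task)(struct task *t);
    void (*attach_task)(struct task *t);
};

static int cpu_can_attach_task(struct task *t)
{
    return 0;                       /* always allowed in this model */
}

static void cpu_attach_task(struct task *t)
{
    printf("moving %s\n", t->comm); /* stands in for sched_move_task() */
}

static const struct subsys_ops cpu_ops = {
    .can_attach_task = cpu_can_attach_task,
    .attach_task     = cpu_attach_task,
};

/* The "core" owns the loop over the thread group. */
static int attach_group(const struct subsys_ops *ops,
                        struct task *group, int nr)
{
    int i, ret;

    for (i = 0; i < nr; i++) {
        ret = ops->can_attach_task(&group[i]);
        if (ret)
            return ret;
    }
    for (i = 0; i < nr; i++)
        ops->attach_task(&group[i]);
    return 0;
}

int main(void)
{
    struct task group[] = { { "main" }, { "worker-0" }, { "worker-1" } };

    return attach_group(&cpu_ops, group, 3);
}

The design point is that the veto pass runs over the whole group before any task is moved, which is what the removed per-subsystem loop was doing by hand.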
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a4158..a6710a112b4f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
152 | read_lock_irqsave(&tasklist_lock, flags); | 152 | read_lock_irqsave(&tasklist_lock, flags); |
153 | 153 | ||
154 | do_each_thread(g, p) { | 154 | do_each_thread(g, p) { |
155 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | 155 | if (!p->on_rq || task_cpu(p) != rq_cpu) |
156 | continue; | 156 | continue; |
157 | 157 | ||
158 | print_task(m, rq, p); | 158 | print_task(m, rq, p); |
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
296 | P(ttwu_count); | 296 | P(ttwu_count); |
297 | P(ttwu_local); | 297 | P(ttwu_local); |
298 | 298 | ||
299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", | ||
300 | rq->rq_sched_info.bkl_count); | ||
301 | |||
302 | #undef P | 299 | #undef P |
303 | #undef P64 | 300 | #undef P64 |
304 | #endif | 301 | #endif |
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
441 | P(se.statistics.wait_count); | 438 | P(se.statistics.wait_count); |
442 | PN(se.statistics.iowait_sum); | 439 | PN(se.statistics.iowait_sum); |
443 | P(se.statistics.iowait_count); | 440 | P(se.statistics.iowait_count); |
444 | P(sched_info.bkl_count); | ||
445 | P(se.nr_migrations); | 441 | P(se.nr_migrations); |
446 | P(se.statistics.nr_migrations_cold); | 442 | P(se.statistics.nr_migrations_cold); |
447 | P(se.statistics.nr_failed_migrations_affine); | 443 | P(se.statistics.nr_failed_migrations_affine); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fa833ab2cb8..433491c2dc8f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
358 | } | 358 | } |
359 | 359 | ||
360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
361 | #ifndef CONFIG_64BIT | ||
362 | smp_wmb(); | ||
363 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
364 | #endif | ||
361 | } | 365 | } |
362 | 366 | ||
363 | /* | 367 | /* |
@@ -1072,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1072 | se->on_rq = 0; | 1076 | se->on_rq = 0; |
1073 | update_cfs_load(cfs_rq, 0); | 1077 | update_cfs_load(cfs_rq, 0); |
1074 | account_entity_dequeue(cfs_rq, se); | 1078 | account_entity_dequeue(cfs_rq, se); |
1075 | update_min_vruntime(cfs_rq); | ||
1076 | update_cfs_shares(cfs_rq); | ||
1077 | 1079 | ||
1078 | /* | 1080 | /* |
1079 | * Normalize the entity after updating the min_vruntime because the | 1081 | * Normalize the entity after updating the min_vruntime because the |
@@ -1082,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1082 | */ | 1084 | */ |
1083 | if (!(flags & DEQUEUE_SLEEP)) | 1085 | if (!(flags & DEQUEUE_SLEEP)) |
1084 | se->vruntime -= cfs_rq->min_vruntime; | 1086 | se->vruntime -= cfs_rq->min_vruntime; |
1087 | |||
1088 | update_min_vruntime(cfs_rq); | ||
1089 | update_cfs_shares(cfs_rq); | ||
1085 | } | 1090 | } |
1086 | 1091 | ||
1087 | /* | 1092 | /* |
@@ -1340,6 +1345,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1340 | hrtick_update(rq); | 1345 | hrtick_update(rq); |
1341 | } | 1346 | } |
1342 | 1347 | ||
1348 | static void set_next_buddy(struct sched_entity *se); | ||
1349 | |||
1343 | /* | 1350 | /* |
1344 | * The dequeue_task method is called before nr_running is | 1351 | * The dequeue_task method is called before nr_running is |
1345 | * decreased. We remove the task from the rbtree and | 1352 | * decreased. We remove the task from the rbtree and |
@@ -1349,14 +1356,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1349 | { | 1356 | { |
1350 | struct cfs_rq *cfs_rq; | 1357 | struct cfs_rq *cfs_rq; |
1351 | struct sched_entity *se = &p->se; | 1358 | struct sched_entity *se = &p->se; |
1359 | int task_sleep = flags & DEQUEUE_SLEEP; | ||
1352 | 1360 | ||
1353 | for_each_sched_entity(se) { | 1361 | for_each_sched_entity(se) { |
1354 | cfs_rq = cfs_rq_of(se); | 1362 | cfs_rq = cfs_rq_of(se); |
1355 | dequeue_entity(cfs_rq, se, flags); | 1363 | dequeue_entity(cfs_rq, se, flags); |
1356 | 1364 | ||
1357 | /* Don't dequeue parent if it has other entities besides us */ | 1365 | /* Don't dequeue parent if it has other entities besides us */ |
1358 | if (cfs_rq->load.weight) | 1366 | if (cfs_rq->load.weight) { |
1367 | /* | ||
1368 | * Bias pick_next to pick a task from this cfs_rq, as | ||
1369 | * p is sleeping when it is within its sched_slice. | ||
1370 | */ | ||
1371 | if (task_sleep && parent_entity(se)) | ||
1372 | set_next_buddy(parent_entity(se)); | ||
1359 | break; | 1373 | break; |
1374 | } | ||
1360 | flags |= DEQUEUE_SLEEP; | 1375 | flags |= DEQUEUE_SLEEP; |
1361 | } | 1376 | } |
1362 | 1377 | ||
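The set_next_buddy() call added here hints the next pick toward the sleeping task's parent entity, so a group that still has runnable work tends to keep the CPU for the rest of the slice it was granted. A deliberately simplified model of what such a next-buddy hint does at pick time (not pick_next_entity(); the real code weighs the hint against fairness via wakeup_preempt_entity() and clears buddies more carefully):

/* Simplified model of a "next buddy" hint: prefer the hinted entity if it
 * is still queued and not too far ahead of the leftmost one. An
 * illustration of the idea only, not pick_next_entity(). */
#include <stddef.h>

struct entity {
    long long vruntime;
    int on_rq;
};

struct entity *leftmost;    /* stand-in for the rbtree's leftmost entity */
struct entity *next_buddy;  /* stand-in for cfs_rq->next from set_next_buddy() */

/* assumed fairness bound: how much vruntime lead the buddy may be granted */
#define BUDDY_GRAN 1000000LL

struct entity *pick_next(void)
{
    struct entity *se = leftmost;

    if (next_buddy && next_buddy->on_rq &&
        next_buddy->vruntime - se->vruntime < BUDDY_GRAN)
        se = next_buddy;

    next_buddy = NULL;      /* the hint only influences one pick */
    return se;
}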
@@ -1372,12 +1387,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1372 | 1387 | ||
1373 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
1374 | 1389 | ||
1375 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1390 | static void task_waking_fair(struct task_struct *p) |
1376 | { | 1391 | { |
1377 | struct sched_entity *se = &p->se; | 1392 | struct sched_entity *se = &p->se; |
1378 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1393 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1394 | u64 min_vruntime; | ||
1379 | 1395 | ||
1380 | se->vruntime -= cfs_rq->min_vruntime; | 1396 | #ifndef CONFIG_64BIT |
1397 | u64 min_vruntime_copy; | ||
1398 | |||
1399 | do { | ||
1400 | min_vruntime_copy = cfs_rq->min_vruntime_copy; | ||
1401 | smp_rmb(); | ||
1402 | min_vruntime = cfs_rq->min_vruntime; | ||
1403 | } while (min_vruntime != min_vruntime_copy); | ||
1404 | #else | ||
1405 | min_vruntime = cfs_rq->min_vruntime; | ||
1406 | #endif | ||
1407 | |||
1408 | se->vruntime -= min_vruntime; | ||
1381 | } | 1409 | } |
1382 | 1410 | ||
1383 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1411 | #ifdef CONFIG_FAIR_GROUP_SCHED |
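On 32-bit a 64-bit min_vruntime load can tear, so the writer (update_min_vruntime(), earlier in this file's diff) publishes a shadow copy behind smp_wmb() and the lock-free reader above retries until both words agree. A standalone model of the pairing, using C11 fences in place of smp_wmb()/smp_rmb(); this is a sketch of the idea, not the scheduler code:

/* Standalone model of the min_vruntime/min_vruntime_copy pairing that keeps
 * 32-bit readers from acting on a torn 64-bit value. C11 fences stand in
 * for smp_wmb()/smp_rmb(). */
#include <stdatomic.h>
#include <stdint.h>

struct vrq {
    volatile uint64_t min_vruntime;
    volatile uint64_t min_vruntime_copy;
};

/* Writer side (cf. update_min_vruntime()): value first, then the copy,
 * separated by a release fence. */
void publish_min_vruntime(struct vrq *q, uint64_t v)
{
    q->min_vruntime = v;
    atomic_thread_fence(memory_order_release);
    q->min_vruntime_copy = v;
}

/* Reader side (cf. task_waking_fair()): retry until both words agree, so a
 * half-updated 64-bit value is never returned. */
uint64_t read_min_vruntime(struct vrq *q)
{
    uint64_t v, copy;

    do {
        copy = q->min_vruntime_copy;
        atomic_thread_fence(memory_order_acquire);
        v = q->min_vruntime;
    } while (v != copy);

    return v;
}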
@@ -1557,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1557 | } | 1585 | } |
1558 | 1586 | ||
1559 | /* Adjust by relative CPU power of the group */ | 1587 | /* Adjust by relative CPU power of the group */ |
1560 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1588 | avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; |
1561 | 1589 | ||
1562 | if (local_group) { | 1590 | if (local_group) { |
1563 | this_load = avg_load; | 1591 | this_load = avg_load; |
@@ -1622,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1622 | /* | 1650 | /* |
1623 | * Otherwise, iterate the domains and find an eligible idle cpu. | 1651 |

1624 | */ | 1652 | */ |
1653 | rcu_read_lock(); | ||
1625 | for_each_domain(target, sd) { | 1654 | for_each_domain(target, sd) { |
1626 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 1655 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1627 | break; | 1656 | break; |
@@ -1641,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1641 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 1670 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) |
1642 | break; | 1671 | break; |
1643 | } | 1672 | } |
1673 | rcu_read_unlock(); | ||
1644 | 1674 | ||
1645 | return target; | 1675 | return target; |
1646 | } | 1676 | } |
@@ -1657,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1657 | * preempt must be disabled. | 1687 | * preempt must be disabled. |
1658 | */ | 1688 | */ |
1659 | static int | 1689 | static int |
1660 | select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) | 1690 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) |
1661 | { | 1691 | { |
1662 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 1692 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
1663 | int cpu = smp_processor_id(); | 1693 | int cpu = smp_processor_id(); |
@@ -1673,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1673 | new_cpu = prev_cpu; | 1703 | new_cpu = prev_cpu; |
1674 | } | 1704 | } |
1675 | 1705 | ||
1706 | rcu_read_lock(); | ||
1676 | for_each_domain(cpu, tmp) { | 1707 | for_each_domain(cpu, tmp) { |
1677 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 1708 | if (!(tmp->flags & SD_LOAD_BALANCE)) |
1678 | continue; | 1709 | continue; |
@@ -1692,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1692 | nr_running += cpu_rq(i)->cfs.nr_running; | 1723 | nr_running += cpu_rq(i)->cfs.nr_running; |
1693 | } | 1724 | } |
1694 | 1725 | ||
1695 | capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 1726 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
1696 | 1727 | ||
1697 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1728 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
1698 | nr_running /= 2; | 1729 | nr_running /= 2; |
@@ -1723,9 +1754,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1723 | 1754 | ||
1724 | if (affine_sd) { | 1755 | if (affine_sd) { |
1725 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1756 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1726 | return select_idle_sibling(p, cpu); | 1757 | prev_cpu = cpu; |
1727 | else | 1758 | |
1728 | return select_idle_sibling(p, prev_cpu); | 1759 | new_cpu = select_idle_sibling(p, prev_cpu); |
1760 | goto unlock; | ||
1729 | } | 1761 | } |
1730 | 1762 | ||
1731 | while (sd) { | 1763 | while (sd) { |
@@ -1766,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1766 | } | 1798 | } |
1767 | /* while loop will break here if sd == NULL */ | 1799 | /* while loop will break here if sd == NULL */ |
1768 | } | 1800 | } |
1801 | unlock: | ||
1802 | rcu_read_unlock(); | ||
1769 | 1803 | ||
1770 | return new_cpu; | 1804 | return new_cpu; |
1771 | } | 1805 | } |
@@ -1789,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | |||
1789 | * This is especially important for buddies when the leftmost | 1823 | * This is especially important for buddies when the leftmost |
1790 | * task is higher priority than the buddy. | 1824 | * task is higher priority than the buddy. |
1791 | */ | 1825 | */ |
1792 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1826 | return calc_delta_fair(gran, se); |
1793 | gran = calc_delta_fair(gran, se); | ||
1794 | |||
1795 | return gran; | ||
1796 | } | 1827 | } |
1797 | 1828 | ||
1798 | /* | 1829 | /* |
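wakeup_gran() now always routes the granularity through calc_delta_fair() instead of special-casing NICE_0_LOAD, since the scaling is an identity for a nice-0 entity anyway. A worked example of the effect, assuming the conventional calc_delta_fair() formula delta * NICE_0_LOAD / weight and the usual weight-table values (assumptions stated in the code, not read from this diff):

/* Worked example of the wakeup granularity scaling. The calc_delta_fair()
 * formula and the weight values are the conventional ones and are assumed
 * here, not taken from this diff. */
#include <stdio.h>

#define NICE_0_LOAD 1024ULL

unsigned long long calc_delta_fair(unsigned long long delta,
                                   unsigned long long weight)
{
    return delta * NICE_0_LOAD / weight; /* identity when weight == NICE_0_LOAD */
}

int main(void)
{
    unsigned long long gran = 1000000ULL;   /* 1 ms base granularity, in ns */

    /* nice 0 (weight 1024): unchanged */
    printf("nice  0: %llu ns\n", calc_delta_fair(gran, 1024));
    /* nice +5 (weight ~335): gran grows to roughly 3.06 ms, so a light
     * waking task needs a bigger vruntime lead before it may preempt */
    printf("nice +5: %llu ns\n", calc_delta_fair(gran, 335));
    /* nice -5 (weight ~3121): gran shrinks to roughly 0.33 ms, so a heavy
     * waking task preempts more readily */
    printf("nice -5: %llu ns\n", calc_delta_fair(gran, 3121));
    return 0;
}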
@@ -1826,26 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
1826 | 1857 | ||
1827 | static void set_last_buddy(struct sched_entity *se) | 1858 | static void set_last_buddy(struct sched_entity *se) |
1828 | { | 1859 | { |
1829 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1860 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1830 | for_each_sched_entity(se) | 1861 | return; |
1831 | cfs_rq_of(se)->last = se; | 1862 | |
1832 | } | 1863 | for_each_sched_entity(se) |
1864 | cfs_rq_of(se)->last = se; | ||
1833 | } | 1865 | } |
1834 | 1866 | ||
1835 | static void set_next_buddy(struct sched_entity *se) | 1867 | static void set_next_buddy(struct sched_entity *se) |
1836 | { | 1868 | { |
1837 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1869 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1838 | for_each_sched_entity(se) | 1870 | return; |
1839 | cfs_rq_of(se)->next = se; | 1871 | |
1840 | } | 1872 | for_each_sched_entity(se) |
1873 | cfs_rq_of(se)->next = se; | ||
1841 | } | 1874 | } |
1842 | 1875 | ||
1843 | static void set_skip_buddy(struct sched_entity *se) | 1876 | static void set_skip_buddy(struct sched_entity *se) |
1844 | { | 1877 | { |
1845 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1878 | for_each_sched_entity(se) |
1846 | for_each_sched_entity(se) | 1879 | cfs_rq_of(se)->skip = se; |
1847 | cfs_rq_of(se)->skip = se; | ||
1848 | } | ||
1849 | } | 1880 | } |
1850 | 1881 | ||
1851 | /* | 1882 | /* |
@@ -1857,12 +1888,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1857 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1888 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1858 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1889 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1859 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1890 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
1891 | int next_buddy_marked = 0; | ||
1860 | 1892 | ||
1861 | if (unlikely(se == pse)) | 1893 | if (unlikely(se == pse)) |
1862 | return; | 1894 | return; |
1863 | 1895 | ||
1864 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) | 1896 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1865 | set_next_buddy(pse); | 1897 | set_next_buddy(pse); |
1898 | next_buddy_marked = 1; | ||
1899 | } | ||
1866 | 1900 | ||
1867 | /* | 1901 | /* |
1868 | * We can come here with TIF_NEED_RESCHED already set from new task | 1902 | * We can come here with TIF_NEED_RESCHED already set from new task |
@@ -1890,8 +1924,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1890 | update_curr(cfs_rq); | 1924 | update_curr(cfs_rq); |
1891 | find_matching_se(&se, &pse); | 1925 | find_matching_se(&se, &pse); |
1892 | BUG_ON(!pse); | 1926 | BUG_ON(!pse); |
1893 | if (wakeup_preempt_entity(se, pse) == 1) | 1927 | if (wakeup_preempt_entity(se, pse) == 1) { |
1928 | /* | ||
1929 | * Bias pick_next to pick the sched entity that is | ||
1930 | * triggering this preemption. | ||
1931 | */ | ||
1932 | if (!next_buddy_marked) | ||
1933 | set_next_buddy(pse); | ||
1894 | goto preempt; | 1934 | goto preempt; |
1935 | } | ||
1895 | 1936 | ||
1896 | return; | 1937 | return; |
1897 | 1938 | ||
@@ -2102,7 +2143,7 @@ static unsigned long | |||
2102 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2143 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2103 | unsigned long max_load_move, struct sched_domain *sd, | 2144 | unsigned long max_load_move, struct sched_domain *sd, |
2104 | enum cpu_idle_type idle, int *all_pinned, | 2145 | enum cpu_idle_type idle, int *all_pinned, |
2105 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) | 2146 | struct cfs_rq *busiest_cfs_rq) |
2106 | { | 2147 | { |
2107 | int loops = 0, pulled = 0; | 2148 | int loops = 0, pulled = 0; |
2108 | long rem_load_move = max_load_move; | 2149 | long rem_load_move = max_load_move; |
@@ -2140,9 +2181,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2140 | */ | 2181 | */ |
2141 | if (rem_load_move <= 0) | 2182 | if (rem_load_move <= 0) |
2142 | break; | 2183 | break; |
2143 | |||
2144 | if (p->prio < *this_best_prio) | ||
2145 | *this_best_prio = p->prio; | ||
2146 | } | 2184 | } |
2147 | out: | 2185 | out: |
2148 | /* | 2186 | /* |
@@ -2202,7 +2240,7 @@ static unsigned long | |||
2202 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2240 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2203 | unsigned long max_load_move, | 2241 | unsigned long max_load_move, |
2204 | struct sched_domain *sd, enum cpu_idle_type idle, | 2242 | struct sched_domain *sd, enum cpu_idle_type idle, |
2205 | int *all_pinned, int *this_best_prio) | 2243 | int *all_pinned) |
2206 | { | 2244 | { |
2207 | long rem_load_move = max_load_move; | 2245 | long rem_load_move = max_load_move; |
2208 | int busiest_cpu = cpu_of(busiest); | 2246 | int busiest_cpu = cpu_of(busiest); |
@@ -2227,7 +2265,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2227 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2265 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
2228 | 2266 | ||
2229 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 2267 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
2230 | rem_load, sd, idle, all_pinned, this_best_prio, | 2268 | rem_load, sd, idle, all_pinned, |
2231 | busiest_cfs_rq); | 2269 | busiest_cfs_rq); |
2232 | 2270 | ||
2233 | if (!moved_load) | 2271 | if (!moved_load) |
@@ -2253,11 +2291,11 @@ static unsigned long | |||
2253 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2291 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2254 | unsigned long max_load_move, | 2292 | unsigned long max_load_move, |
2255 | struct sched_domain *sd, enum cpu_idle_type idle, | 2293 | struct sched_domain *sd, enum cpu_idle_type idle, |
2256 | int *all_pinned, int *this_best_prio) | 2294 | int *all_pinned) |
2257 | { | 2295 | { |
2258 | return balance_tasks(this_rq, this_cpu, busiest, | 2296 | return balance_tasks(this_rq, this_cpu, busiest, |
2259 | max_load_move, sd, idle, all_pinned, | 2297 | max_load_move, sd, idle, all_pinned, |
2260 | this_best_prio, &busiest->cfs); | 2298 | &busiest->cfs); |
2261 | } | 2299 | } |
2262 | #endif | 2300 | #endif |
2263 | 2301 | ||
@@ -2274,12 +2312,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2274 | int *all_pinned) | 2312 | int *all_pinned) |
2275 | { | 2313 | { |
2276 | unsigned long total_load_moved = 0, load_moved; | 2314 | unsigned long total_load_moved = 0, load_moved; |
2277 | int this_best_prio = this_rq->curr->prio; | ||
2278 | 2315 | ||
2279 | do { | 2316 | do { |
2280 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 2317 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
2281 | max_load_move - total_load_moved, | 2318 | max_load_move - total_load_moved, |
2282 | sd, idle, all_pinned, &this_best_prio); | 2319 | sd, idle, all_pinned); |
2283 | 2320 | ||
2284 | total_load_moved += load_moved; | 2321 | total_load_moved += load_moved; |
2285 | 2322 | ||
@@ -2534,7 +2571,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
2534 | 2571 | ||
2535 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 2572 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
2536 | { | 2573 | { |
2537 | return SCHED_LOAD_SCALE; | 2574 | return SCHED_POWER_SCALE; |
2538 | } | 2575 | } |
2539 | 2576 | ||
2540 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | 2577 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) |
@@ -2571,10 +2608,10 @@ unsigned long scale_rt_power(int cpu) | |||
2571 | available = total - rq->rt_avg; | 2608 | available = total - rq->rt_avg; |
2572 | } | 2609 | } |
2573 | 2610 | ||
2574 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | 2611 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
2575 | total = SCHED_LOAD_SCALE; | 2612 | total = SCHED_POWER_SCALE; |
2576 | 2613 | ||
2577 | total >>= SCHED_LOAD_SHIFT; | 2614 | total >>= SCHED_POWER_SHIFT; |
2578 | 2615 | ||
2579 | return div_u64(available, total); | 2616 | return div_u64(available, total); |
2580 | } | 2617 | } |
@@ -2582,7 +2619,7 @@ unsigned long scale_rt_power(int cpu) | |||
2582 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 2619 | static void update_cpu_power(struct sched_domain *sd, int cpu) |
2583 | { | 2620 | { |
2584 | unsigned long weight = sd->span_weight; | 2621 | unsigned long weight = sd->span_weight; |
2585 | unsigned long power = SCHED_LOAD_SCALE; | 2622 | unsigned long power = SCHED_POWER_SCALE; |
2586 | struct sched_group *sdg = sd->groups; | 2623 | struct sched_group *sdg = sd->groups; |
2587 | 2624 | ||
2588 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2625 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
@@ -2591,7 +2628,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2591 | else | 2628 | else |
2592 | power *= default_scale_smt_power(sd, cpu); | 2629 | power *= default_scale_smt_power(sd, cpu); |
2593 | 2630 | ||
2594 | power >>= SCHED_LOAD_SHIFT; | 2631 | power >>= SCHED_POWER_SHIFT; |
2595 | } | 2632 | } |
2596 | 2633 | ||
2597 | sdg->cpu_power_orig = power; | 2634 | sdg->cpu_power_orig = power; |
@@ -2601,10 +2638,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2601 | else | 2638 | else |
2602 | power *= default_scale_freq_power(sd, cpu); | 2639 | power *= default_scale_freq_power(sd, cpu); |
2603 | 2640 | ||
2604 | power >>= SCHED_LOAD_SHIFT; | 2641 | power >>= SCHED_POWER_SHIFT; |
2605 | 2642 | ||
2606 | power *= scale_rt_power(cpu); | 2643 | power *= scale_rt_power(cpu); |
2607 | power >>= SCHED_LOAD_SHIFT; | 2644 | power >>= SCHED_POWER_SHIFT; |
2608 | 2645 | ||
2609 | if (!power) | 2646 | if (!power) |
2610 | power = 1; | 2647 | power = 1; |
@@ -2646,9 +2683,9 @@ static inline int | |||
2646 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 2683 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) |
2647 | { | 2684 | { |
2648 | /* | 2685 | /* |
2649 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | 2686 | * Only siblings can have significantly less than SCHED_POWER_SCALE |
2650 | */ | 2687 | */ |
2651 | if (sd->level != SD_LV_SIBLING) | 2688 | if (!(sd->flags & SD_SHARE_CPUPOWER)) |
2652 | return 0; | 2689 | return 0; |
2653 | 2690 | ||
2654 | /* | 2691 | /* |
@@ -2734,7 +2771,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2734 | } | 2771 | } |
2735 | 2772 | ||
2736 | /* Adjust by relative CPU power of the group */ | 2773 | /* Adjust by relative CPU power of the group */ |
2737 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2774 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; |
2738 | 2775 | ||
2739 | /* | 2776 | /* |
2740 | * Consider the group unbalanced when the imbalance is larger | 2777 | * Consider the group unbalanced when the imbalance is larger |
@@ -2751,7 +2788,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2751 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 2788 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) |
2752 | sgs->group_imb = 1; | 2789 | sgs->group_imb = 1; |
2753 | 2790 | ||
2754 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2791 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, |
2792 | SCHED_POWER_SCALE); | ||
2755 | if (!sgs->group_capacity) | 2793 | if (!sgs->group_capacity) |
2756 | sgs->group_capacity = fix_small_capacity(sd, group); | 2794 | sgs->group_capacity = fix_small_capacity(sd, group); |
2757 | sgs->group_weight = group->group_weight; | 2795 | sgs->group_weight = group->group_weight; |
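These load-balancer hunks move from SCHED_LOAD_SCALE to a dedicated SCHED_POWER_SCALE unit for cpu_power, so group capacity and normalized load are computed in an explicit fixed point. A short worked sketch of the two conversions that recur above, assuming SCHED_POWER_SCALE = 1024 and using illustrative power/load numbers:

/* Worked sketch of the SCHED_POWER_SCALE fixed-point arithmetic used in the
 * load-balancer hunks. SCHED_POWER_SCALE = 1024 (one full CPU of compute)
 * is assumed; the power and load values are illustrative. */
#include <stdio.h>

#define SCHED_POWER_SCALE       1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
    /* An SMT sibling pair: each thread advertises less than a full CPU. */
    unsigned long cpu_power   = 589;            /* illustrative per-CPU power */
    unsigned long group_power = 2 * cpu_power;  /* 1178 */

    /* capacity: how many nice-0 tasks this group can carry */
    unsigned long capacity =
        DIV_ROUND_CLOSEST(group_power, SCHED_POWER_SCALE);

    /* avg_load: group load normalized by the group's relative power */
    unsigned long group_load = 2048;            /* illustrative summed load */
    unsigned long avg_load =
        (group_load * SCHED_POWER_SCALE) / group_power;

    printf("capacity = %lu\n", capacity);       /* 1178/1024 rounds to 1 */
    printf("avg_load = %lu\n", avg_load);       /* 2048*1024/1178 = 1780 */
    return 0;
}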
@@ -2925,7 +2963,7 @@ static int check_asym_packing(struct sched_domain *sd, | |||
2925 | return 0; | 2963 | return 0; |
2926 | 2964 | ||
2927 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | 2965 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, |
2928 | SCHED_LOAD_SCALE); | 2966 | SCHED_POWER_SCALE); |
2929 | return 1; | 2967 | return 1; |
2930 | } | 2968 | } |
2931 | 2969 | ||
@@ -2954,7 +2992,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
2954 | cpu_avg_load_per_task(this_cpu); | 2992 | cpu_avg_load_per_task(this_cpu); |
2955 | 2993 | ||
2956 | scaled_busy_load_per_task = sds->busiest_load_per_task | 2994 | scaled_busy_load_per_task = sds->busiest_load_per_task |
2957 | * SCHED_LOAD_SCALE; | 2995 | * SCHED_POWER_SCALE; |
2958 | scaled_busy_load_per_task /= sds->busiest->cpu_power; | 2996 | scaled_busy_load_per_task /= sds->busiest->cpu_power; |
2959 | 2997 | ||
2960 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 2998 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
@@ -2973,10 +3011,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
2973 | min(sds->busiest_load_per_task, sds->max_load); | 3011 | min(sds->busiest_load_per_task, sds->max_load); |
2974 | pwr_now += sds->this->cpu_power * | 3012 | pwr_now += sds->this->cpu_power * |
2975 | min(sds->this_load_per_task, sds->this_load); | 3013 | min(sds->this_load_per_task, sds->this_load); |
2976 | pwr_now /= SCHED_LOAD_SCALE; | 3014 | pwr_now /= SCHED_POWER_SCALE; |
2977 | 3015 | ||
2978 | /* Amount of load we'd subtract */ | 3016 | /* Amount of load we'd subtract */ |
2979 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3017 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
2980 | sds->busiest->cpu_power; | 3018 | sds->busiest->cpu_power; |
2981 | if (sds->max_load > tmp) | 3019 | if (sds->max_load > tmp) |
2982 | pwr_move += sds->busiest->cpu_power * | 3020 | pwr_move += sds->busiest->cpu_power * |
@@ -2984,15 +3022,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
2984 | 3022 | ||
2985 | /* Amount of load we'd add */ | 3023 | /* Amount of load we'd add */ |
2986 | if (sds->max_load * sds->busiest->cpu_power < | 3024 | if (sds->max_load * sds->busiest->cpu_power < |
2987 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3025 | sds->busiest_load_per_task * SCHED_POWER_SCALE) |
2988 | tmp = (sds->max_load * sds->busiest->cpu_power) / | 3026 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
2989 | sds->this->cpu_power; | 3027 | sds->this->cpu_power; |
2990 | else | 3028 | else |
2991 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3029 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
2992 | sds->this->cpu_power; | 3030 | sds->this->cpu_power; |
2993 | pwr_move += sds->this->cpu_power * | 3031 | pwr_move += sds->this->cpu_power * |
2994 | min(sds->this_load_per_task, sds->this_load + tmp); | 3032 | min(sds->this_load_per_task, sds->this_load + tmp); |
2995 | pwr_move /= SCHED_LOAD_SCALE; | 3033 | pwr_move /= SCHED_POWER_SCALE; |
2996 | 3034 | ||
2997 | /* Move if we gain throughput */ | 3035 | /* Move if we gain throughput */ |
2998 | if (pwr_move > pwr_now) | 3036 | if (pwr_move > pwr_now) |
@@ -3034,7 +3072,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3034 | load_above_capacity = (sds->busiest_nr_running - | 3072 | load_above_capacity = (sds->busiest_nr_running - |
3035 | sds->busiest_group_capacity); | 3073 | sds->busiest_group_capacity); |
3036 | 3074 | ||
3037 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); | 3075 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
3038 | 3076 | ||
3039 | load_above_capacity /= sds->busiest->cpu_power; | 3077 | load_above_capacity /= sds->busiest->cpu_power; |
3040 | } | 3078 | } |
@@ -3054,7 +3092,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3054 | /* How much load to actually move to equalise the imbalance */ | 3092 | /* How much load to actually move to equalise the imbalance */ |
3055 | *imbalance = min(max_pull * sds->busiest->cpu_power, | 3093 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
3056 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | 3094 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
3057 | / SCHED_LOAD_SCALE; | 3095 | / SCHED_POWER_SCALE; |
3058 | 3096 | ||
3059 | /* | 3097 | /* |
3060 | * if *imbalance is less than the average load per runnable task | 3098 | * if *imbalance is less than the average load per runnable task |
@@ -3123,7 +3161,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3123 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3161 | if (!sds.busiest || sds.busiest_nr_running == 0) |
3124 | goto out_balanced; | 3162 | goto out_balanced; |
3125 | 3163 | ||
3126 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | 3164 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
3127 | 3165 | ||
3128 | /* | 3166 | /* |
3129 | * If the busiest group is imbalanced the below checks don't | 3167 | * If the busiest group is imbalanced the below checks don't |
@@ -3202,7 +3240,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
3202 | 3240 | ||
3203 | for_each_cpu(i, sched_group_cpus(group)) { | 3241 | for_each_cpu(i, sched_group_cpus(group)) { |
3204 | unsigned long power = power_of(i); | 3242 | unsigned long power = power_of(i); |
3205 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 3243 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
3244 | SCHED_POWER_SCALE); | ||
3206 | unsigned long wl; | 3245 | unsigned long wl; |
3207 | 3246 | ||
3208 | if (!capacity) | 3247 | if (!capacity) |
@@ -3227,7 +3266,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
3227 | * the load can be moved away from the cpu that is potentially | 3266 | * the load can be moved away from the cpu that is potentially |
3228 | * running at a lower capacity. | 3267 | * running at a lower capacity. |
3229 | */ | 3268 | */ |
3230 | wl = (wl * SCHED_LOAD_SCALE) / power; | 3269 | wl = (wl * SCHED_POWER_SCALE) / power; |
3231 | 3270 | ||
3232 | if (wl > max_load) { | 3271 | if (wl > max_load) { |
3233 | max_load = wl; | 3272 | max_load = wl; |
@@ -3465,6 +3504,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3465 | raw_spin_unlock(&this_rq->lock); | 3504 | raw_spin_unlock(&this_rq->lock); |
3466 | 3505 | ||
3467 | update_shares(this_cpu); | 3506 | update_shares(this_cpu); |
3507 | rcu_read_lock(); | ||
3468 | for_each_domain(this_cpu, sd) { | 3508 | for_each_domain(this_cpu, sd) { |
3469 | unsigned long interval; | 3509 | unsigned long interval; |
3470 | int balance = 1; | 3510 | int balance = 1; |
@@ -3486,6 +3526,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3486 | break; | 3526 | break; |
3487 | } | 3527 | } |
3488 | } | 3528 | } |
3529 | rcu_read_unlock(); | ||
3489 | 3530 | ||
3490 | raw_spin_lock(&this_rq->lock); | 3531 | raw_spin_lock(&this_rq->lock); |
3491 | 3532 | ||
@@ -3534,6 +3575,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3534 | double_lock_balance(busiest_rq, target_rq); | 3575 | double_lock_balance(busiest_rq, target_rq); |
3535 | 3576 | ||
3536 | /* Search for an sd spanning us and the target CPU. */ | 3577 | /* Search for an sd spanning us and the target CPU. */ |
3578 | rcu_read_lock(); | ||
3537 | for_each_domain(target_cpu, sd) { | 3579 | for_each_domain(target_cpu, sd) { |
3538 | if ((sd->flags & SD_LOAD_BALANCE) && | 3580 | if ((sd->flags & SD_LOAD_BALANCE) && |
3539 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | 3581 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) |
@@ -3549,6 +3591,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3549 | else | 3591 | else |
3550 | schedstat_inc(sd, alb_failed); | 3592 | schedstat_inc(sd, alb_failed); |
3551 | } | 3593 | } |
3594 | rcu_read_unlock(); | ||
3552 | double_unlock_balance(busiest_rq, target_rq); | 3595 | double_unlock_balance(busiest_rq, target_rq); |
3553 | out_unlock: | 3596 | out_unlock: |
3554 | busiest_rq->active_balance = 0; | 3597 | busiest_rq->active_balance = 0; |
@@ -3675,6 +3718,7 @@ static int find_new_ilb(int cpu) | |||
3675 | { | 3718 | { |
3676 | struct sched_domain *sd; | 3719 | struct sched_domain *sd; |
3677 | struct sched_group *ilb_group; | 3720 | struct sched_group *ilb_group; |
3721 | int ilb = nr_cpu_ids; | ||
3678 | 3722 | ||
3679 | /* | 3723 | /* |
3680 | * Have idle load balancer selection from semi-idle packages only | 3724 | * Have idle load balancer selection from semi-idle packages only |
@@ -3690,20 +3734,25 @@ static int find_new_ilb(int cpu) | |||
3690 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | 3734 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3691 | goto out_done; | 3735 | goto out_done; |
3692 | 3736 | ||
3737 | rcu_read_lock(); | ||
3693 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3738 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
3694 | ilb_group = sd->groups; | 3739 | ilb_group = sd->groups; |
3695 | 3740 | ||
3696 | do { | 3741 | do { |
3697 | if (is_semi_idle_group(ilb_group)) | 3742 | if (is_semi_idle_group(ilb_group)) { |
3698 | return cpumask_first(nohz.grp_idle_mask); | 3743 | ilb = cpumask_first(nohz.grp_idle_mask); |
3744 | goto unlock; | ||
3745 | } | ||
3699 | 3746 | ||
3700 | ilb_group = ilb_group->next; | 3747 | ilb_group = ilb_group->next; |
3701 | 3748 | ||
3702 | } while (ilb_group != sd->groups); | 3749 | } while (ilb_group != sd->groups); |
3703 | } | 3750 | } |
3751 | unlock: | ||
3752 | rcu_read_unlock(); | ||
3704 | 3753 | ||
3705 | out_done: | 3754 | out_done: |
3706 | return nr_cpu_ids; | 3755 | return ilb; |
3707 | } | 3756 | } |
3708 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3757 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3709 | static inline int find_new_ilb(int call_cpu) | 3758 | static inline int find_new_ilb(int call_cpu) |
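The find_new_ilb() rework above converts the early return inside the RCU read-side section into a jump to a common unlock label, so rcu_read_unlock() always runs. Below is a small standalone sketch of that single-exit pattern, with a pthread rwlock standing in for rcu_read_lock()/rcu_read_unlock() purely for illustration.

/*
 * Minimal userspace sketch of the single-exit pattern adopted above:
 * instead of returning from inside the read-side critical section, record
 * the result and jump to a common unlock label. A pthread rwlock stands in
 * for rcu_read_lock()/rcu_read_unlock() purely for illustration.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static const int table[] = { 3, 7, 42, 9 };

static int find_first_even(void)
{
	int result = -1;	/* "not found", like nr_cpu_ids above */
	size_t i;

	pthread_rwlock_rdlock(&lock);
	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (table[i] % 2 == 0) {
			result = table[i];
			goto unlock;	/* never leave with the lock held */
		}
	}
unlock:
	pthread_rwlock_unlock(&lock);
	return result;
}

int main(void)
{
	printf("first even: %d\n", find_first_even());
	return 0;	/* build with: cc demo.c -lpthread */
}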
@@ -3848,6 +3897,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3848 | 3897 | ||
3849 | update_shares(cpu); | 3898 | update_shares(cpu); |
3850 | 3899 | ||
3900 | rcu_read_lock(); | ||
3851 | for_each_domain(cpu, sd) { | 3901 | for_each_domain(cpu, sd) { |
3852 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3902 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3853 | continue; | 3903 | continue; |
@@ -3893,6 +3943,7 @@ out: | |||
3893 | if (!balance) | 3943 | if (!balance) |
3894 | break; | 3944 | break; |
3895 | } | 3945 | } |
3946 | rcu_read_unlock(); | ||
3896 | 3947 | ||
3897 | /* | 3948 | /* |
3898 | * next_balance will be updated only when there is a need. | 3949 | * next_balance will be updated only when there is a need. |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b9..be40f7371ee1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) | |||
64 | * Decrement CPU power based on irq activity | 64 | * Decrement CPU power based on irq activity |
65 | */ | 65 | */ |
66 | SCHED_FEAT(NONIRQ_POWER, 1) | 66 | SCHED_FEAT(NONIRQ_POWER, 1) |
67 | |||
68 | /* | ||
69 | * Queue remote wakeups on the target CPU and process them | ||
70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | ||
71 | */ | ||
72 | SCHED_FEAT(TTWU_QUEUE, 1) | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a6396427..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -7,7 +7,7 @@ | |||
7 | 7 | ||
8 | #ifdef CONFIG_SMP | 8 | #ifdef CONFIG_SMP |
9 | static int | 9 | static int |
10 | select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 10 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) |
11 | { | 11 | { |
12 | return task_cpu(p); /* IDLE tasks are never migrated */ | 12 | return task_cpu(p); /* IDLE tasks are never migrated */ |
13 | } | 13 | } |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc65f82..10d018212bab 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
184 | } | 184 | } |
185 | 185 | ||
186 | typedef struct task_group *rt_rq_iter_t; | ||
187 | |||
188 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
189 | for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ | ||
190 | (&iter->list != &task_groups) && \ | ||
191 | (rt_rq = iter->rt_rq[cpu_of(rq)]); \ | ||
192 | iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) | ||
193 | |||
186 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 194 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
187 | { | 195 | { |
188 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | 196 | list_add_rcu(&rt_rq->leaf_rt_rq_list, |
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
288 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 296 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
289 | } | 297 | } |
290 | 298 | ||
299 | typedef struct rt_rq *rt_rq_iter_t; | ||
300 | |||
301 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
302 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
303 | |||
291 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 304 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
292 | { | 305 | { |
293 | } | 306 | } |
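The two for_each_rt_rq() definitions above select between walking every task group's rt_rq and visiting only the root rq->rt. The following self-contained sketch shows the single-pass variant, where the "(void) iter" reference keeps the macro signature identical while the loop body runs exactly once; the types are simplified stand-ins, not the kernel structures.

/*
 * Standalone sketch of the single-pass iterator trick used in the
 * !RT_GROUP_SCHED variant above: the body runs exactly once and the
 * iterator is referenced only to keep the macro signature identical to
 * the group-scheduling version. Types are simplified stand-ins.
 */
#include <stdio.h>

struct rt_rq { int rt_nr_running; };
struct rq { struct rt_rq rt; };

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &(rq)->rt; rt_rq; rt_rq = NULL)

int main(void)
{
	struct rq rq = { .rt = { .rt_nr_running = 2 } };
	rt_rq_iter_t iter = NULL;
	struct rt_rq *rt_rq;

	for_each_rt_rq(rt_rq, iter, &rq)
		printf("rt_nr_running=%d\n", rt_rq->rt_nr_running);
	return 0;
}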
@@ -402,12 +415,13 @@ next: | |||
402 | static void __disable_runtime(struct rq *rq) | 415 | static void __disable_runtime(struct rq *rq) |
403 | { | 416 | { |
404 | struct root_domain *rd = rq->rd; | 417 | struct root_domain *rd = rq->rd; |
418 | rt_rq_iter_t iter; | ||
405 | struct rt_rq *rt_rq; | 419 | struct rt_rq *rt_rq; |
406 | 420 | ||
407 | if (unlikely(!scheduler_running)) | 421 | if (unlikely(!scheduler_running)) |
408 | return; | 422 | return; |
409 | 423 | ||
410 | for_each_leaf_rt_rq(rt_rq, rq) { | 424 | for_each_rt_rq(rt_rq, iter, rq) { |
411 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 425 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
412 | s64 want; | 426 | s64 want; |
413 | int i; | 427 | int i; |
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq) | |||
487 | 501 | ||
488 | static void __enable_runtime(struct rq *rq) | 502 | static void __enable_runtime(struct rq *rq) |
489 | { | 503 | { |
504 | rt_rq_iter_t iter; | ||
490 | struct rt_rq *rt_rq; | 505 | struct rt_rq *rt_rq; |
491 | 506 | ||
492 | if (unlikely(!scheduler_running)) | 507 | if (unlikely(!scheduler_running)) |
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq) | |||
495 | /* | 510 | /* |
496 | * Reset each runqueue's bandwidth settings | 511 | * Reset each runqueue's bandwidth settings |
497 | */ | 512 | */ |
498 | for_each_leaf_rt_rq(rt_rq, rq) { | 513 | for_each_rt_rq(rt_rq, iter, rq) { |
499 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 514 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
500 | 515 | ||
501 | raw_spin_lock(&rt_b->rt_runtime_lock); | 516 | raw_spin_lock(&rt_b->rt_runtime_lock); |
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
562 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | 577 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { |
563 | rt_rq->rt_throttled = 0; | 578 | rt_rq->rt_throttled = 0; |
564 | enqueue = 1; | 579 | enqueue = 1; |
580 | |||
581 | /* | ||
582 | * Force a clock update if the CPU was idle, | ||
583 | * lest wakeup -> unthrottle time accumulate. | ||
584 | */ | ||
585 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) | ||
586 | rq->skip_clock_update = -1; | ||
565 | } | 587 | } |
566 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 588 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
567 | idle = 0; | 589 | idle = 0; |
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq) | |||
977 | static int find_lowest_rq(struct task_struct *task); | 999 | static int find_lowest_rq(struct task_struct *task); |
978 | 1000 | ||
979 | static int | 1001 | static int |
980 | select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 1002 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) |
981 | { | 1003 | { |
1004 | struct task_struct *curr; | ||
1005 | struct rq *rq; | ||
1006 | int cpu; | ||
1007 | |||
982 | if (sd_flag != SD_BALANCE_WAKE) | 1008 | if (sd_flag != SD_BALANCE_WAKE) |
983 | return smp_processor_id(); | 1009 | return smp_processor_id(); |
984 | 1010 | ||
1011 | cpu = task_cpu(p); | ||
1012 | rq = cpu_rq(cpu); | ||
1013 | |||
1014 | rcu_read_lock(); | ||
1015 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
1016 | |||
985 | /* | 1017 | /* |
986 | * If the current task is an RT task, then | 1018 | * If the current task on @p's runqueue is an RT task, then |
987 | * try to see if we can wake this RT task up on another | 1019 | * try to see if we can wake this RT task up on another |
988 | * runqueue. Otherwise simply start this RT task | 1020 | * runqueue. Otherwise simply start this RT task |
989 | * on its current runqueue. | 1021 | * on its current runqueue. |
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | |||
997 | * lock? | 1029 | * lock? |
998 | * | 1030 | * |
999 | * For equal prio tasks, we just let the scheduler sort it out. | 1031 | * For equal prio tasks, we just let the scheduler sort it out. |
1032 | * | ||
1033 | * Otherwise, just let it ride on the affined RQ and the | ||
1034 | * post-schedule router will push the preempted task away | ||
1035 | * | ||
1036 | * This test is optimistic; if we get it wrong, the load-balancer | ||
1037 | * will have to sort it out. | ||
1000 | */ | 1038 | */ |
1001 | if (unlikely(rt_task(rq->curr)) && | 1039 | if (curr && unlikely(rt_task(curr)) && |
1002 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1040 | (curr->rt.nr_cpus_allowed < 2 || |
1003 | rq->curr->prio < p->prio) && | 1041 | curr->prio < p->prio) && |
1004 | (p->rt.nr_cpus_allowed > 1)) { | 1042 | (p->rt.nr_cpus_allowed > 1)) { |
1005 | int cpu = find_lowest_rq(p); | 1043 | int target = find_lowest_rq(p); |
1006 | 1044 | ||
1007 | return (cpu == -1) ? task_cpu(p) : cpu; | 1045 | if (target != -1) |
1046 | cpu = target; | ||
1008 | } | 1047 | } |
1048 | rcu_read_unlock(); | ||
1009 | 1049 | ||
1010 | /* | 1050 | return cpu; |
1011 | * Otherwise, just let it ride on the affined RQ and the | ||
1012 | * post-schedule router will push the preempted task away | ||
1013 | */ | ||
1014 | return task_cpu(p); | ||
1015 | } | 1051 | } |
1016 | 1052 | ||
1017 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1053 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
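select_task_rq_rt() now snapshots rq->curr with ACCESS_ONCE() under rcu_read_lock() instead of relying on the caller's rq lock. Here is a tiny userspace sketch of the ACCESS_ONCE() idiom itself, using the GCC-style __typeof__ definition the kernel relies on; shared_val is just an illustrative stand-in for rq->curr.

/*
 * Sketch of the ACCESS_ONCE() idiom behind the unlocked rq->curr read
 * above: the volatile cast forces exactly one load, so the snapshot cannot
 * be silently re-fetched between the subsequent checks.
 */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static int shared_val = 42;	/* stand-in for rq->curr */

int main(void)
{
	int snap = ACCESS_ONCE(shared_val);	/* one load, then use the snapshot */

	if (snap > 0 && snap < 100)
		printf("consistent snapshot: %d\n", snap);
	return 0;
}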
@@ -1060,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag | |||
1060 | * to move current somewhere else, making room for our non-migratable | 1096 | * to move current somewhere else, making room for our non-migratable |
1061 | * task. | 1097 | * task. |
1062 | */ | 1098 | */ |
1063 | if (p->prio == rq->curr->prio && !need_resched()) | 1099 | if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) |
1064 | check_preempt_equal_prio(rq, p); | 1100 | check_preempt_equal_prio(rq, p); |
1065 | #endif | 1101 | #endif |
1066 | } | 1102 | } |
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1136 | * The previous task needs to be made eligible for pushing | 1172 | * The previous task needs to be made eligible for pushing |
1137 | * if it is still active | 1173 | * if it is still active |
1138 | */ | 1174 | */ |
1139 | if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) | 1175 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) |
1140 | enqueue_pushable_task(rq, p); | 1176 | enqueue_pushable_task(rq, p); |
1141 | } | 1177 | } |
1142 | 1178 | ||
@@ -1203,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task) | |||
1203 | int this_cpu = smp_processor_id(); | 1239 | int this_cpu = smp_processor_id(); |
1204 | int cpu = task_cpu(task); | 1240 | int cpu = task_cpu(task); |
1205 | 1241 | ||
1242 | /* Make sure the mask is initialized first */ | ||
1243 | if (unlikely(!lowest_mask)) | ||
1244 | return -1; | ||
1245 | |||
1206 | if (task->rt.nr_cpus_allowed == 1) | 1246 | if (task->rt.nr_cpus_allowed == 1) |
1207 | return -1; /* No other targets possible */ | 1247 | return -1; /* No other targets possible */ |
1208 | 1248 | ||
@@ -1227,6 +1267,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1227 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) | 1267 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
1228 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ | 1268 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
1229 | 1269 | ||
1270 | rcu_read_lock(); | ||
1230 | for_each_domain(cpu, sd) { | 1271 | for_each_domain(cpu, sd) { |
1231 | if (sd->flags & SD_WAKE_AFFINE) { | 1272 | if (sd->flags & SD_WAKE_AFFINE) { |
1232 | int best_cpu; | 1273 | int best_cpu; |
@@ -1236,15 +1277,20 @@ static int find_lowest_rq(struct task_struct *task) | |||
1236 | * remote processor. | 1277 | * remote processor. |
1237 | */ | 1278 | */ |
1238 | if (this_cpu != -1 && | 1279 | if (this_cpu != -1 && |
1239 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) | 1280 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { |
1281 | rcu_read_unlock(); | ||
1240 | return this_cpu; | 1282 | return this_cpu; |
1283 | } | ||
1241 | 1284 | ||
1242 | best_cpu = cpumask_first_and(lowest_mask, | 1285 | best_cpu = cpumask_first_and(lowest_mask, |
1243 | sched_domain_span(sd)); | 1286 | sched_domain_span(sd)); |
1244 | if (best_cpu < nr_cpu_ids) | 1287 | if (best_cpu < nr_cpu_ids) { |
1288 | rcu_read_unlock(); | ||
1245 | return best_cpu; | 1289 | return best_cpu; |
1290 | } | ||
1246 | } | 1291 | } |
1247 | } | 1292 | } |
1293 | rcu_read_unlock(); | ||
1248 | 1294 | ||
1249 | /* | 1295 | /* |
1250 | * And finally, if there were no matches within the domains | 1296 | * And finally, if there were no matches within the domains |
@@ -1287,7 +1333,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1287 | !cpumask_test_cpu(lowest_rq->cpu, | 1333 | !cpumask_test_cpu(lowest_rq->cpu, |
1288 | &task->cpus_allowed) || | 1334 | &task->cpus_allowed) || |
1289 | task_running(rq, task) || | 1335 | task_running(rq, task) || |
1290 | !task->se.on_rq)) { | 1336 | !task->on_rq)) { |
1291 | 1337 | ||
1292 | raw_spin_unlock(&lowest_rq->lock); | 1338 | raw_spin_unlock(&lowest_rq->lock); |
1293 | lowest_rq = NULL; | 1339 | lowest_rq = NULL; |
@@ -1321,7 +1367,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1321 | BUG_ON(task_current(rq, p)); | 1367 | BUG_ON(task_current(rq, p)); |
1322 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1368 | BUG_ON(p->rt.nr_cpus_allowed <= 1); |
1323 | 1369 | ||
1324 | BUG_ON(!p->se.on_rq); | 1370 | BUG_ON(!p->on_rq); |
1325 | BUG_ON(!rt_task(p)); | 1371 | BUG_ON(!rt_task(p)); |
1326 | 1372 | ||
1327 | return p; | 1373 | return p; |
@@ -1467,7 +1513,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
1467 | */ | 1513 | */ |
1468 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1514 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
1469 | WARN_ON(p == src_rq->curr); | 1515 | WARN_ON(p == src_rq->curr); |
1470 | WARN_ON(!p->se.on_rq); | 1516 | WARN_ON(!p->on_rq); |
1471 | 1517 | ||
1472 | /* | 1518 | /* |
1473 | * There's a chance that p is higher in priority | 1519 | * There's a chance that p is higher in priority |
@@ -1538,7 +1584,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1538 | * Update the migration status of the RQ if we have an RT task | 1584 | * Update the migration status of the RQ if we have an RT task |
1539 | * which is running AND changing its weight value. | 1585 | * which is running AND changing its weight value. |
1540 | */ | 1586 | */ |
1541 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | 1587 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { |
1542 | struct rq *rq = task_rq(p); | 1588 | struct rq *rq = task_rq(p); |
1543 | 1589 | ||
1544 | if (!task_current(rq, p)) { | 1590 | if (!task_current(rq, p)) { |
@@ -1608,7 +1654,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1608 | * we may need to handle the pulling of RT tasks | 1654 | * we may need to handle the pulling of RT tasks |
1609 | * now. | 1655 | * now. |
1610 | */ | 1656 | */ |
1611 | if (p->se.on_rq && !rq->rt.rt_nr_running) | 1657 | if (p->on_rq && !rq->rt.rt_nr_running) |
1612 | pull_rt_task(rq); | 1658 | pull_rt_task(rq); |
1613 | } | 1659 | } |
1614 | 1660 | ||
@@ -1638,7 +1684,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1638 | * If that current running task is also an RT task | 1684 | * If that current running task is also an RT task |
1639 | * then see if we can move to another run queue. | 1685 | * then see if we can move to another run queue. |
1640 | */ | 1686 | */ |
1641 | if (p->se.on_rq && rq->curr != p) { | 1687 | if (p->on_rq && rq->curr != p) { |
1642 | #ifdef CONFIG_SMP | 1688 | #ifdef CONFIG_SMP |
1643 | if (rq->rt.overloaded && push_rt_task(rq) && | 1689 | if (rq->rt.overloaded && push_rt_task(rq) && |
1644 | /* Don't resched if we changed runqueues */ | 1690 | /* Don't resched if we changed runqueues */ |
@@ -1657,7 +1703,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1657 | static void | 1703 | static void |
1658 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1704 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
1659 | { | 1705 | { |
1660 | if (!p->se.on_rq) | 1706 | if (!p->on_rq) |
1661 | return; | 1707 | return; |
1662 | 1708 | ||
1663 | if (rq->curr == p) { | 1709 | if (rq->curr == p) { |
@@ -1796,10 +1842,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | |||
1796 | 1842 | ||
1797 | static void print_rt_stats(struct seq_file *m, int cpu) | 1843 | static void print_rt_stats(struct seq_file *m, int cpu) |
1798 | { | 1844 | { |
1845 | rt_rq_iter_t iter; | ||
1799 | struct rt_rq *rt_rq; | 1846 | struct rt_rq *rt_rq; |
1800 | 1847 | ||
1801 | rcu_read_lock(); | 1848 | rcu_read_lock(); |
1802 | for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) | 1849 | for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) |
1803 | print_rt_rq(m, cpu, rt_rq); | 1850 | print_rt_rq(m, cpu, rt_rq); |
1804 | rcu_read_unlock(); | 1851 | rcu_read_unlock(); |
1805 | } | 1852 | } |
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 48ddf431db0e..331e01bcd026 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
37 | 37 | ||
38 | #ifdef CONFIG_SMP | 38 | #ifdef CONFIG_SMP |
39 | /* domain-specific stats */ | 39 | /* domain-specific stats */ |
40 | preempt_disable(); | 40 | rcu_read_lock(); |
41 | for_each_domain(cpu, sd) { | 41 | for_each_domain(cpu, sd) { |
42 | enum cpu_idle_type itype; | 42 | enum cpu_idle_type itype; |
43 | 43 | ||
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | 64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
65 | sd->ttwu_move_balance); | 65 | sd->ttwu_move_balance); |
66 | } | 66 | } |
67 | preempt_enable(); | 67 | rcu_read_unlock(); |
68 | #endif | 68 | #endif |
69 | } | 69 | } |
70 | kfree(mask_str); | 70 | kfree(mask_str); |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fdac..6f437632afab 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -9,8 +9,7 @@ | |||
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
11 | static int | 11 | static int |
12 | select_task_rq_stop(struct rq *rq, struct task_struct *p, | 12 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) |
13 | int sd_flag, int flags) | ||
14 | { | 13 | { |
15 | return task_cpu(p); /* stop tasks never migrate */ | 14 | return task_cpu(p); /* stop tasks never migrate */ |
16 | } | 15 | } |
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
26 | { | 25 | { |
27 | struct task_struct *stop = rq->stop; | 26 | struct task_struct *stop = rq->stop; |
28 | 27 | ||
29 | if (stop && stop->se.on_rq) | 28 | if (stop && stop->on_rq) |
30 | return stop; | 29 | return stop; |
31 | 30 | ||
32 | return NULL; | 31 | return NULL; |
diff --git a/kernel/signal.c b/kernel/signal.c index 7165af5f1b11..ff7678603328 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |||
124 | 124 | ||
125 | static int recalc_sigpending_tsk(struct task_struct *t) | 125 | static int recalc_sigpending_tsk(struct task_struct *t) |
126 | { | 126 | { |
127 | if (t->signal->group_stop_count > 0 || | 127 | if ((t->group_stop & GROUP_STOP_PENDING) || |
128 | PENDING(&t->pending, &t->blocked) || | 128 | PENDING(&t->pending, &t->blocked) || |
129 | PENDING(&t->signal->shared_pending, &t->blocked)) { | 129 | PENDING(&t->signal->shared_pending, &t->blocked)) { |
130 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 130 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
@@ -223,6 +223,83 @@ static inline void print_dropped_signal(int sig) | |||
223 | current->comm, current->pid, sig); | 223 | current->comm, current->pid, sig); |
224 | } | 224 | } |
225 | 225 | ||
226 | /** | ||
227 | * task_clear_group_stop_trapping - clear group stop trapping bit | ||
228 | * @task: target task | ||
229 | * | ||
230 | * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it | ||
231 | * and wake up the ptracer. Note that we don't need any further locking. | ||
232 | * @task->siglock guarantees that @task->parent points to the ptracer. | ||
233 | * | ||
234 | * CONTEXT: | ||
235 | * Must be called with @task->sighand->siglock held. | ||
236 | */ | ||
237 | static void task_clear_group_stop_trapping(struct task_struct *task) | ||
238 | { | ||
239 | if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { | ||
240 | task->group_stop &= ~GROUP_STOP_TRAPPING; | ||
241 | __wake_up_sync_key(&task->parent->signal->wait_chldexit, | ||
242 | TASK_UNINTERRUPTIBLE, 1, task); | ||
243 | } | ||
244 | } | ||
245 | |||
246 | /** | ||
247 | * task_clear_group_stop_pending - clear pending group stop | ||
248 | * @task: target task | ||
249 | * | ||
250 | * Clear group stop states for @task. | ||
251 | * | ||
252 | * CONTEXT: | ||
253 | * Must be called with @task->sighand->siglock held. | ||
254 | */ | ||
255 | void task_clear_group_stop_pending(struct task_struct *task) | ||
256 | { | ||
257 | task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | | ||
258 | GROUP_STOP_DEQUEUED); | ||
259 | } | ||
260 | |||
261 | /** | ||
262 | * task_participate_group_stop - participate in a group stop | ||
263 | * @task: task participating in a group stop | ||
264 | * | ||
265 | * @task has GROUP_STOP_PENDING set and is participating in a group stop. | ||
266 | * Group stop states are cleared and the group stop count is consumed if | ||
267 | * %GROUP_STOP_CONSUME was set. If the consumption completes the group | ||
268 | * stop, the appropriate %SIGNAL_* flags are set. | ||
269 | * | ||
270 | * CONTEXT: | ||
271 | * Must be called with @task->sighand->siglock held. | ||
272 | * | ||
273 | * RETURNS: | ||
274 | * %true if group stop completion should be notified to the parent, %false | ||
275 | * otherwise. | ||
276 | */ | ||
277 | static bool task_participate_group_stop(struct task_struct *task) | ||
278 | { | ||
279 | struct signal_struct *sig = task->signal; | ||
280 | bool consume = task->group_stop & GROUP_STOP_CONSUME; | ||
281 | |||
282 | WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); | ||
283 | |||
284 | task_clear_group_stop_pending(task); | ||
285 | |||
286 | if (!consume) | ||
287 | return false; | ||
288 | |||
289 | if (!WARN_ON_ONCE(sig->group_stop_count == 0)) | ||
290 | sig->group_stop_count--; | ||
291 | |||
292 | /* | ||
293 | * Tell the caller to notify completion iff we are entering into a | ||
294 | * fresh group stop. Read comment in do_signal_stop() for details. | ||
295 | */ | ||
296 | if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { | ||
297 | sig->flags = SIGNAL_STOP_STOPPED; | ||
298 | return true; | ||
299 | } | ||
300 | return false; | ||
301 | } | ||
302 | |||
226 | /* | 303 | /* |
227 | * allocate a new signal queue record | 304 | * allocate a new signal queue record |
228 | * - this may be called without locks if and only if t == current, otherwise an | 305 | * - this may be called without locks if and only if t == current, otherwise an |
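The new helpers above move group-stop bookkeeping from signal_struct flags to per-task GROUP_STOP_* bits plus a shared group_stop_count. The following userspace sketch models only that counting logic; the flag values and structure layout are invented for illustration and are not the kernel's.

/*
 * Userspace model of the GROUP_STOP_* bookkeeping above. Flag values and
 * structure layout are invented for illustration; only the counting logic
 * mirrors task_participate_group_stop().
 */
#include <stdbool.h>
#include <stdio.h>

#define GROUP_STOP_PENDING	(1U << 16)	/* illustrative values */
#define GROUP_STOP_CONSUME	(1U << 17)

struct signal { unsigned int group_stop_count; bool stopped; };
struct task { unsigned int group_stop; struct signal *signal; };

/* returns true when this task's participation completes the group stop */
static bool participate(struct task *t)
{
	bool consume = t->group_stop & GROUP_STOP_CONSUME;

	t->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME);
	if (!consume)
		return false;

	if (t->signal->group_stop_count)
		t->signal->group_stop_count--;

	if (!t->signal->group_stop_count && !t->signal->stopped) {
		t->signal->stopped = true;
		return true;
	}
	return false;
}

int main(void)
{
	struct signal sig = { .group_stop_count = 2 };
	struct task a = { GROUP_STOP_PENDING | GROUP_STOP_CONSUME, &sig };
	struct task b = { GROUP_STOP_PENDING | GROUP_STOP_CONSUME, &sig };

	printf("a completes stop: %d\n", participate(&a));	/* 0: b still due */
	printf("b completes stop: %d\n", participate(&b));	/* 1: last one in */
	return 0;
}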
@@ -527,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
527 | * is to alert stop-signal processing code when another | 604 | * is to alert stop-signal processing code when another |
528 | * processor has come along and cleared the flag. | 605 | * processor has come along and cleared the flag. |
529 | */ | 606 | */ |
530 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 607 | current->group_stop |= GROUP_STOP_DEQUEUED; |
531 | } | 608 | } |
532 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { | 609 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { |
533 | /* | 610 | /* |
@@ -592,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | |||
592 | if (sigisemptyset(&m)) | 669 | if (sigisemptyset(&m)) |
593 | return 0; | 670 | return 0; |
594 | 671 | ||
595 | signandsets(&s->signal, &s->signal, mask); | 672 | sigandnsets(&s->signal, &s->signal, mask); |
596 | list_for_each_entry_safe(q, n, &s->list, list) { | 673 | list_for_each_entry_safe(q, n, &s->list, list) { |
597 | if (sigismember(mask, q->info.si_signo)) { | 674 | if (sigismember(mask, q->info.si_signo)) { |
598 | list_del_init(&q->list); | 675 | list_del_init(&q->list); |
@@ -727,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
727 | } else if (sig == SIGCONT) { | 804 | } else if (sig == SIGCONT) { |
728 | unsigned int why; | 805 | unsigned int why; |
729 | /* | 806 | /* |
730 | * Remove all stop signals from all queues, | 807 | * Remove all stop signals from all queues, wake all threads. |
731 | * and wake all threads. | ||
732 | */ | 808 | */ |
733 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); | 809 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); |
734 | t = p; | 810 | t = p; |
735 | do { | 811 | do { |
736 | unsigned int state; | 812 | task_clear_group_stop_pending(t); |
737 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | 813 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); |
738 | /* | 814 | wake_up_state(t, __TASK_STOPPED); |
739 | * If there is a handler for SIGCONT, we must make | ||
740 | * sure that no thread returns to user mode before | ||
741 | * we post the signal, in case it was the only | ||
742 | * thread eligible to run the signal handler--then | ||
743 | * it must not do anything between resuming and | ||
744 | * running the handler. With the TIF_SIGPENDING | ||
745 | * flag set, the thread will pause and acquire the | ||
746 | * siglock that we hold now and until we've queued | ||
747 | * the pending signal. | ||
748 | * | ||
749 | * Wake up the stopped thread _after_ setting | ||
750 | * TIF_SIGPENDING | ||
751 | */ | ||
752 | state = __TASK_STOPPED; | ||
753 | if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { | ||
754 | set_tsk_thread_flag(t, TIF_SIGPENDING); | ||
755 | state |= TASK_INTERRUPTIBLE; | ||
756 | } | ||
757 | wake_up_state(t, state); | ||
758 | } while_each_thread(p, t); | 815 | } while_each_thread(p, t); |
759 | 816 | ||
760 | /* | 817 | /* |
@@ -780,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
780 | signal->flags = why | SIGNAL_STOP_CONTINUED; | 837 | signal->flags = why | SIGNAL_STOP_CONTINUED; |
781 | signal->group_stop_count = 0; | 838 | signal->group_stop_count = 0; |
782 | signal->group_exit_code = 0; | 839 | signal->group_exit_code = 0; |
783 | } else { | ||
784 | /* | ||
785 | * We are not stopped, but there could be a stop | ||
786 | * signal in the middle of being processed after | ||
787 | * being removed from the queue. Clear that too. | ||
788 | */ | ||
789 | signal->flags &= ~SIGNAL_STOP_DEQUEUED; | ||
790 | } | 840 | } |
791 | } | 841 | } |
792 | 842 | ||
@@ -875,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) | |||
875 | signal->group_stop_count = 0; | 925 | signal->group_stop_count = 0; |
876 | t = p; | 926 | t = p; |
877 | do { | 927 | do { |
928 | task_clear_group_stop_pending(t); | ||
878 | sigaddset(&t->pending.signal, SIGKILL); | 929 | sigaddset(&t->pending.signal, SIGKILL); |
879 | signal_wake_up(t, 1); | 930 | signal_wake_up(t, 1); |
880 | } while_each_thread(p, t); | 931 | } while_each_thread(p, t); |
@@ -1109,6 +1160,7 @@ int zap_other_threads(struct task_struct *p) | |||
1109 | p->signal->group_stop_count = 0; | 1160 | p->signal->group_stop_count = 0; |
1110 | 1161 | ||
1111 | while_each_thread(p, t) { | 1162 | while_each_thread(p, t) { |
1163 | task_clear_group_stop_pending(t); | ||
1112 | count++; | 1164 | count++; |
1113 | 1165 | ||
1114 | /* Don't bother with already dead threads */ | 1166 | /* Don't bother with already dead threads */ |
@@ -1536,16 +1588,30 @@ int do_notify_parent(struct task_struct *tsk, int sig) | |||
1536 | return ret; | 1588 | return ret; |
1537 | } | 1589 | } |
1538 | 1590 | ||
1539 | static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | 1591 | /** |
1592 | * do_notify_parent_cldstop - notify parent of stopped/continued state change | ||
1593 | * @tsk: task reporting the state change | ||
1594 | * @for_ptracer: the notification is for ptracer | ||
1595 | * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report | ||
1596 | * | ||
1597 | * Notify @tsk's parent that the stopped/continued state has changed. If | ||
1598 | * @for_ptracer is %false, @tsk's group leader notifies its real parent. | ||
1599 | * If %true, @tsk reports to @tsk->parent which should be the ptracer. | ||
1600 | * | ||
1601 | * CONTEXT: | ||
1602 | * Must be called with tasklist_lock at least read locked. | ||
1603 | */ | ||
1604 | static void do_notify_parent_cldstop(struct task_struct *tsk, | ||
1605 | bool for_ptracer, int why) | ||
1540 | { | 1606 | { |
1541 | struct siginfo info; | 1607 | struct siginfo info; |
1542 | unsigned long flags; | 1608 | unsigned long flags; |
1543 | struct task_struct *parent; | 1609 | struct task_struct *parent; |
1544 | struct sighand_struct *sighand; | 1610 | struct sighand_struct *sighand; |
1545 | 1611 | ||
1546 | if (task_ptrace(tsk)) | 1612 | if (for_ptracer) { |
1547 | parent = tsk->parent; | 1613 | parent = tsk->parent; |
1548 | else { | 1614 | } else { |
1549 | tsk = tsk->group_leader; | 1615 | tsk = tsk->group_leader; |
1550 | parent = tsk->real_parent; | 1616 | parent = tsk->real_parent; |
1551 | } | 1617 | } |
@@ -1621,6 +1687,15 @@ static int sigkill_pending(struct task_struct *tsk) | |||
1621 | } | 1687 | } |
1622 | 1688 | ||
1623 | /* | 1689 | /* |
1690 | * Test whether the target task of the usual cldstop notification - the | ||
1691 | * real_parent of @child - is in the same group as the ptracer. | ||
1692 | */ | ||
1693 | static bool real_parent_is_ptracer(struct task_struct *child) | ||
1694 | { | ||
1695 | return same_thread_group(child->parent, child->real_parent); | ||
1696 | } | ||
1697 | |||
1698 | /* | ||
1624 | * This must be called with current->sighand->siglock held. | 1699 | * This must be called with current->sighand->siglock held. |
1625 | * | 1700 | * |
1626 | * This should be the path for all ptrace stops. | 1701 | * This should be the path for all ptrace stops. |
@@ -1631,10 +1706,12 @@ static int sigkill_pending(struct task_struct *tsk) | |||
1631 | * If we actually decide not to stop at all because the tracer | 1706 | * If we actually decide not to stop at all because the tracer |
1632 | * is gone, we keep current->exit_code unless clear_code. | 1707 | * is gone, we keep current->exit_code unless clear_code. |
1633 | */ | 1708 | */ |
1634 | static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | 1709 | static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) |
1635 | __releases(¤t->sighand->siglock) | 1710 | __releases(¤t->sighand->siglock) |
1636 | __acquires(¤t->sighand->siglock) | 1711 | __acquires(¤t->sighand->siglock) |
1637 | { | 1712 | { |
1713 | bool gstop_done = false; | ||
1714 | |||
1638 | if (arch_ptrace_stop_needed(exit_code, info)) { | 1715 | if (arch_ptrace_stop_needed(exit_code, info)) { |
1639 | /* | 1716 | /* |
1640 | * The arch code has something special to do before a | 1717 | * The arch code has something special to do before a |
@@ -1655,21 +1732,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
1655 | } | 1732 | } |
1656 | 1733 | ||
1657 | /* | 1734 | /* |
1658 | * If there is a group stop in progress, | 1735 | * If @why is CLD_STOPPED, we're trapping to participate in a group |
1659 | * we must participate in the bookkeeping. | 1736 | * stop. Do the bookkeeping. Note that if SIGCONT was delivered |
1737 | * while siglock was released for the arch hook, PENDING could be | ||
1738 | * clear now. We act as if SIGCONT is received after TASK_TRACED | ||
1739 | * is entered - ignore it. | ||
1660 | */ | 1740 | */ |
1661 | if (current->signal->group_stop_count > 0) | 1741 | if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) |
1662 | --current->signal->group_stop_count; | 1742 | gstop_done = task_participate_group_stop(current); |
1663 | 1743 | ||
1664 | current->last_siginfo = info; | 1744 | current->last_siginfo = info; |
1665 | current->exit_code = exit_code; | 1745 | current->exit_code = exit_code; |
1666 | 1746 | ||
1667 | /* Let the debugger run. */ | 1747 | /* |
1668 | __set_current_state(TASK_TRACED); | 1748 | * TRACED should be visible before TRAPPING is cleared; otherwise, |
1749 | * the tracer might fail do_wait(). | ||
1750 | */ | ||
1751 | set_current_state(TASK_TRACED); | ||
1752 | |||
1753 | /* | ||
1754 | * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and | ||
1755 | * the transition to TASK_TRACED should be atomic with respect to | ||
1756 | * siglock. This should be done after the arch hook as siglock is | ||
1757 | * released and regrabbed across it. | ||
1758 | */ | ||
1759 | task_clear_group_stop_trapping(current); | ||
1760 | |||
1669 | spin_unlock_irq(¤t->sighand->siglock); | 1761 | spin_unlock_irq(¤t->sighand->siglock); |
1670 | read_lock(&tasklist_lock); | 1762 | read_lock(&tasklist_lock); |
1671 | if (may_ptrace_stop()) { | 1763 | if (may_ptrace_stop()) { |
1672 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1764 | /* |
1765 | * Notify parents of the stop. | ||
1766 | * | ||
1767 | * While ptraced, there are two parents - the ptracer and | ||
1768 | * the real_parent of the group_leader. The ptracer should | ||
1769 | * know about every stop while the real parent is only | ||
1770 | * interested in the completion of group stop. The states | ||
1771 | * for the two don't interact with each other. Notify | ||
1772 | * separately unless they're gonna be duplicates. | ||
1773 | */ | ||
1774 | do_notify_parent_cldstop(current, true, why); | ||
1775 | if (gstop_done && !real_parent_is_ptracer(current)) | ||
1776 | do_notify_parent_cldstop(current, false, why); | ||
1777 | |||
1673 | /* | 1778 | /* |
1674 | * Don't want to allow preemption here, because | 1779 | * Don't want to allow preemption here, because |
1675 | * sys_ptrace() needs this task to be inactive. | 1780 | * sys_ptrace() needs this task to be inactive. |
@@ -1684,7 +1789,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
1684 | /* | 1789 | /* |
1685 | * By the time we got the lock, our tracer went away. | 1790 | * By the time we got the lock, our tracer went away. |
1686 | * Don't drop the lock yet, another tracer may come. | 1791 | * Don't drop the lock yet, another tracer may come. |
1792 | * | ||
1793 | * If @gstop_done, the ptracer went away between group stop | ||
1794 | * completion and here. During detach, it would have set | ||
1795 | * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED | ||
1796 | * in do_signal_stop() on return, so notifying the real | ||
1797 | * parent of the group stop completion is enough. | ||
1687 | */ | 1798 | */ |
1799 | if (gstop_done) | ||
1800 | do_notify_parent_cldstop(current, false, why); | ||
1801 | |||
1688 | __set_current_state(TASK_RUNNING); | 1802 | __set_current_state(TASK_RUNNING); |
1689 | if (clear_code) | 1803 | if (clear_code) |
1690 | current->exit_code = 0; | 1804 | current->exit_code = 0; |
@@ -1728,7 +1842,7 @@ void ptrace_notify(int exit_code) | |||
1728 | 1842 | ||
1729 | /* Let the debugger run. */ | 1843 | /* Let the debugger run. */ |
1730 | spin_lock_irq(¤t->sighand->siglock); | 1844 | spin_lock_irq(¤t->sighand->siglock); |
1731 | ptrace_stop(exit_code, 1, &info); | 1845 | ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); |
1732 | spin_unlock_irq(¤t->sighand->siglock); | 1846 | spin_unlock_irq(¤t->sighand->siglock); |
1733 | } | 1847 | } |
1734 | 1848 | ||
@@ -1741,66 +1855,115 @@ void ptrace_notify(int exit_code) | |||
1741 | static int do_signal_stop(int signr) | 1855 | static int do_signal_stop(int signr) |
1742 | { | 1856 | { |
1743 | struct signal_struct *sig = current->signal; | 1857 | struct signal_struct *sig = current->signal; |
1744 | int notify; | ||
1745 | 1858 | ||
1746 | if (!sig->group_stop_count) { | 1859 | if (!(current->group_stop & GROUP_STOP_PENDING)) { |
1860 | unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; | ||
1747 | struct task_struct *t; | 1861 | struct task_struct *t; |
1748 | 1862 | ||
1749 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || | 1863 | /* signr will be recorded in task->group_stop for retries */ |
1864 | WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); | ||
1865 | |||
1866 | if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || | ||
1750 | unlikely(signal_group_exit(sig))) | 1867 | unlikely(signal_group_exit(sig))) |
1751 | return 0; | 1868 | return 0; |
1752 | /* | 1869 | /* |
1753 | * There is no group stop already in progress. | 1870 | * There is no group stop already in progress. We must |
1754 | * We must initiate one now. | 1871 | * initiate one now. |
1872 | * | ||
1873 | * While ptraced, a task may be resumed while group stop is | ||
1874 | * still in effect and then receive a stop signal and | ||
1875 | * initiate another group stop. This deviates from the | ||
1876 | * usual behavior as two consecutive stop signals can't | ||
1877 | * cause two group stops when !ptraced. That is why we | ||
1878 | * also check !task_is_stopped(t) below. | ||
1879 | * | ||
1880 | * The condition can be distinguished by testing whether | ||
1881 | * SIGNAL_STOP_STOPPED is already set. Don't generate | ||
1882 | * group_exit_code in such case. | ||
1883 | * | ||
1884 | * This is not necessary for SIGNAL_STOP_CONTINUED because | ||
1885 | * an intervening stop signal is required to cause two | ||
1886 | * continued events regardless of ptrace. | ||
1755 | */ | 1887 | */ |
1756 | sig->group_exit_code = signr; | 1888 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) |
1889 | sig->group_exit_code = signr; | ||
1890 | else | ||
1891 | WARN_ON_ONCE(!task_ptrace(current)); | ||
1757 | 1892 | ||
1893 | current->group_stop &= ~GROUP_STOP_SIGMASK; | ||
1894 | current->group_stop |= signr | gstop; | ||
1758 | sig->group_stop_count = 1; | 1895 | sig->group_stop_count = 1; |
1759 | for (t = next_thread(current); t != current; t = next_thread(t)) | 1896 | for (t = next_thread(current); t != current; |
1897 | t = next_thread(t)) { | ||
1898 | t->group_stop &= ~GROUP_STOP_SIGMASK; | ||
1760 | /* | 1899 | /* |
1761 | * Setting state to TASK_STOPPED for a group | 1900 | * Setting state to TASK_STOPPED for a group |
1762 | * stop is always done with the siglock held, | 1901 | * stop is always done with the siglock held, |
1763 | * so this check has no races. | 1902 | * so this check has no races. |
1764 | */ | 1903 | */ |
1765 | if (!(t->flags & PF_EXITING) && | 1904 | if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { |
1766 | !task_is_stopped_or_traced(t)) { | 1905 | t->group_stop |= signr | gstop; |
1767 | sig->group_stop_count++; | 1906 | sig->group_stop_count++; |
1768 | signal_wake_up(t, 0); | 1907 | signal_wake_up(t, 0); |
1769 | } | 1908 | } |
1909 | } | ||
1770 | } | 1910 | } |
1771 | /* | 1911 | retry: |
1772 | * If there are no other threads in the group, or if there is | 1912 | if (likely(!task_ptrace(current))) { |
1773 | * a group stop in progress and we are the last to stop, report | 1913 | int notify = 0; |
1774 | * to the parent. When ptraced, every thread reports itself. | 1914 | |
1775 | */ | 1915 | /* |
1776 | notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; | 1916 | * If there are no other threads in the group, or if there |
1777 | notify = tracehook_notify_jctl(notify, CLD_STOPPED); | 1917 | * is a group stop in progress and we are the last to stop, |
1778 | /* | 1918 | * report to the parent. |
1779 | * tracehook_notify_jctl() can drop and reacquire siglock, so | 1919 | */ |
1780 | * we keep ->group_stop_count != 0 before the call. If SIGCONT | 1920 | if (task_participate_group_stop(current)) |
1781 | * or SIGKILL comes in between ->group_stop_count == 0. | 1921 | notify = CLD_STOPPED; |
1782 | */ | 1922 | |
1783 | if (sig->group_stop_count) { | ||
1784 | if (!--sig->group_stop_count) | ||
1785 | sig->flags = SIGNAL_STOP_STOPPED; | ||
1786 | current->exit_code = sig->group_exit_code; | ||
1787 | __set_current_state(TASK_STOPPED); | 1923 | __set_current_state(TASK_STOPPED); |
1924 | spin_unlock_irq(¤t->sighand->siglock); | ||
1925 | |||
1926 | /* | ||
1927 | * Notify the parent of the group stop completion. Because | ||
1928 | * we're not holding either the siglock or tasklist_lock | ||
1929 | * here, a ptracer may attach in between; however, this is for | ||
1930 | * group stop and should always be delivered to the real | ||
1931 | * parent of the group leader. The new ptracer will get | ||
1932 | * its notification when this task transitions into | ||
1933 | * TASK_TRACED. | ||
1934 | */ | ||
1935 | if (notify) { | ||
1936 | read_lock(&tasklist_lock); | ||
1937 | do_notify_parent_cldstop(current, false, notify); | ||
1938 | read_unlock(&tasklist_lock); | ||
1939 | } | ||
1940 | |||
1941 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | ||
1942 | schedule(); | ||
1943 | |||
1944 | spin_lock_irq(¤t->sighand->siglock); | ||
1945 | } else { | ||
1946 | ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, | ||
1947 | CLD_STOPPED, 0, NULL); | ||
1948 | current->exit_code = 0; | ||
1788 | } | 1949 | } |
1789 | spin_unlock_irq(¤t->sighand->siglock); | ||
1790 | 1950 | ||
1791 | if (notify) { | 1951 | /* |
1792 | read_lock(&tasklist_lock); | 1952 | * GROUP_STOP_PENDING could be set if another group stop has |
1793 | do_notify_parent_cldstop(current, notify); | 1953 | * started since being woken up or ptrace wants us to transit |
1794 | read_unlock(&tasklist_lock); | 1954 | * between TASK_STOPPED and TRACED. Retry group stop. |
1955 | */ | ||
1956 | if (current->group_stop & GROUP_STOP_PENDING) { | ||
1957 | WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); | ||
1958 | goto retry; | ||
1795 | } | 1959 | } |
1796 | 1960 | ||
1797 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 1961 | /* PTRACE_ATTACH might have raced with task killing, clear trapping */ |
1798 | do { | 1962 | task_clear_group_stop_trapping(current); |
1799 | schedule(); | 1963 | |
1800 | } while (try_to_freeze()); | 1964 | spin_unlock_irq(¤t->sighand->siglock); |
1801 | 1965 | ||
1802 | tracehook_finish_jctl(); | 1966 | tracehook_finish_jctl(); |
1803 | current->exit_code = 0; | ||
1804 | 1967 | ||
1805 | return 1; | 1968 | return 1; |
1806 | } | 1969 | } |
@@ -1814,7 +1977,7 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
1814 | ptrace_signal_deliver(regs, cookie); | 1977 | ptrace_signal_deliver(regs, cookie); |
1815 | 1978 | ||
1816 | /* Let the debugger run. */ | 1979 | /* Let the debugger run. */ |
1817 | ptrace_stop(signr, 0, info); | 1980 | ptrace_stop(signr, CLD_TRAPPED, 0, info); |
1818 | 1981 | ||
1819 | /* We're back. Did the debugger cancel the sig? */ | 1982 | /* We're back. Did the debugger cancel the sig? */ |
1820 | signr = current->exit_code; | 1983 | signr = current->exit_code; |
@@ -1869,18 +2032,36 @@ relock: | |||
1869 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. | 2032 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. |
1870 | */ | 2033 | */ |
1871 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { | 2034 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { |
1872 | int why = (signal->flags & SIGNAL_STOP_CONTINUED) | 2035 | struct task_struct *leader; |
1873 | ? CLD_CONTINUED : CLD_STOPPED; | 2036 | int why; |
2037 | |||
2038 | if (signal->flags & SIGNAL_CLD_CONTINUED) | ||
2039 | why = CLD_CONTINUED; | ||
2040 | else | ||
2041 | why = CLD_STOPPED; | ||
2042 | |||
1874 | signal->flags &= ~SIGNAL_CLD_MASK; | 2043 | signal->flags &= ~SIGNAL_CLD_MASK; |
1875 | 2044 | ||
1876 | why = tracehook_notify_jctl(why, CLD_CONTINUED); | ||
1877 | spin_unlock_irq(&sighand->siglock); | 2045 | spin_unlock_irq(&sighand->siglock); |
1878 | 2046 | ||
1879 | if (why) { | 2047 | /* |
1880 | read_lock(&tasklist_lock); | 2048 | * Notify the parent that we're continuing. This event is |
1881 | do_notify_parent_cldstop(current->group_leader, why); | 2049 | * always per-process and doesn't make a whole lot of sense |
1882 | read_unlock(&tasklist_lock); | 2050 | * for ptracers, who shouldn't consume the state via |
1883 | } | 2051 | * wait(2) either, but, for backward compatibility, notify |
2052 | * the ptracer of the group leader too unless it's gonna be | ||
2053 | * a duplicate. | ||
2054 | */ | ||
2055 | read_lock(&tasklist_lock); | ||
2056 | |||
2057 | do_notify_parent_cldstop(current, false, why); | ||
2058 | |||
2059 | leader = current->group_leader; | ||
2060 | if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) | ||
2061 | do_notify_parent_cldstop(leader, true, why); | ||
2062 | |||
2063 | read_unlock(&tasklist_lock); | ||
2064 | |||
1884 | goto relock; | 2065 | goto relock; |
1885 | } | 2066 | } |
1886 | 2067 | ||
@@ -1897,8 +2078,8 @@ relock: | |||
1897 | if (unlikely(signr != 0)) | 2078 | if (unlikely(signr != 0)) |
1898 | ka = return_ka; | 2079 | ka = return_ka; |
1899 | else { | 2080 | else { |
1900 | if (unlikely(signal->group_stop_count > 0) && | 2081 | if (unlikely(current->group_stop & |
1901 | do_signal_stop(0)) | 2082 | GROUP_STOP_PENDING) && do_signal_stop(0)) |
1902 | goto relock; | 2083 | goto relock; |
1903 | 2084 | ||
1904 | signr = dequeue_signal(current, ¤t->blocked, | 2085 | signr = dequeue_signal(current, ¤t->blocked, |
@@ -2017,10 +2198,42 @@ relock: | |||
2017 | return signr; | 2198 | return signr; |
2018 | } | 2199 | } |
2019 | 2200 | ||
2201 | /* | ||
2202 | * It could be that complete_signal() picked us to notify about the | ||
2203 | * group-wide signal. Other threads should be notified now to take | ||
2204 | * the shared signals in @which since we will not. | ||
2205 | */ | ||
2206 | static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) | ||
2207 | { | ||
2208 | sigset_t retarget; | ||
2209 | struct task_struct *t; | ||
2210 | |||
2211 | sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); | ||
2212 | if (sigisemptyset(&retarget)) | ||
2213 | return; | ||
2214 | |||
2215 | t = tsk; | ||
2216 | while_each_thread(tsk, t) { | ||
2217 | if (t->flags & PF_EXITING) | ||
2218 | continue; | ||
2219 | |||
2220 | if (!has_pending_signals(&retarget, &t->blocked)) | ||
2221 | continue; | ||
2222 | /* Remove the signals this thread can handle. */ | ||
2223 | sigandsets(&retarget, &retarget, &t->blocked); | ||
2224 | |||
2225 | if (!signal_pending(t)) | ||
2226 | signal_wake_up(t, 0); | ||
2227 | |||
2228 | if (sigisemptyset(&retarget)) | ||
2229 | break; | ||
2230 | } | ||
2231 | } | ||
2232 | |||
2020 | void exit_signals(struct task_struct *tsk) | 2233 | void exit_signals(struct task_struct *tsk) |
2021 | { | 2234 | { |
2022 | int group_stop = 0; | 2235 | int group_stop = 0; |
2023 | struct task_struct *t; | 2236 | sigset_t unblocked; |
2024 | 2237 | ||
2025 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { | 2238 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { |
2026 | tsk->flags |= PF_EXITING; | 2239 | tsk->flags |= PF_EXITING; |
@@ -2036,26 +2249,23 @@ void exit_signals(struct task_struct *tsk) | |||
2036 | if (!signal_pending(tsk)) | 2249 | if (!signal_pending(tsk)) |
2037 | goto out; | 2250 | goto out; |
2038 | 2251 | ||
2039 | /* | 2252 | unblocked = tsk->blocked; |
2040 | * It could be that __group_complete_signal() choose us to | 2253 | signotset(&unblocked); |
2041 | * notify about group-wide signal. Another thread should be | 2254 | retarget_shared_pending(tsk, &unblocked); |
2042 | * woken now to take the signal since we will not. | ||
2043 | */ | ||
2044 | for (t = tsk; (t = next_thread(t)) != tsk; ) | ||
2045 | if (!signal_pending(t) && !(t->flags & PF_EXITING)) | ||
2046 | recalc_sigpending_and_wake(t); | ||
2047 | 2255 | ||
2048 | if (unlikely(tsk->signal->group_stop_count) && | 2256 | if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && |
2049 | !--tsk->signal->group_stop_count) { | 2257 | task_participate_group_stop(tsk)) |
2050 | tsk->signal->flags = SIGNAL_STOP_STOPPED; | 2258 | group_stop = CLD_STOPPED; |
2051 | group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); | ||
2052 | } | ||
2053 | out: | 2259 | out: |
2054 | spin_unlock_irq(&tsk->sighand->siglock); | 2260 | spin_unlock_irq(&tsk->sighand->siglock); |
2055 | 2261 | ||
2262 | /* | ||
2263 | * If group stop has completed, deliver the notification. This | ||
2264 | * should always go to the real parent of the group leader. | ||
2265 | */ | ||
2056 | if (unlikely(group_stop)) { | 2266 | if (unlikely(group_stop)) { |
2057 | read_lock(&tasklist_lock); | 2267 | read_lock(&tasklist_lock); |
2058 | do_notify_parent_cldstop(tsk, group_stop); | 2268 | do_notify_parent_cldstop(tsk, false, group_stop); |
2059 | read_unlock(&tasklist_lock); | 2269 | read_unlock(&tasklist_lock); |
2060 | } | 2270 | } |
2061 | } | 2271 | } |
@@ -2089,11 +2299,33 @@ long do_no_restart_syscall(struct restart_block *param) | |||
2089 | return -EINTR; | 2299 | return -EINTR; |
2090 | } | 2300 | } |
2091 | 2301 | ||
2092 | /* | 2302 | static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) |
2093 | * We don't need to get the kernel lock - this is all local to this | 2303 | { |
2094 | * particular thread.. (and that's good, because this is _heavily_ | 2304 | if (signal_pending(tsk) && !thread_group_empty(tsk)) { |
2095 | * used by various programs) | 2305 | sigset_t newblocked; |
2306 | /* A set of now blocked but previously unblocked signals. */ | ||
2307 | sigandnsets(&newblocked, newset, ¤t->blocked); | ||
2308 | retarget_shared_pending(tsk, &newblocked); | ||
2309 | } | ||
2310 | tsk->blocked = *newset; | ||
2311 | recalc_sigpending(); | ||
2312 | } | ||
2313 | |||
2314 | /** | ||
2315 | * set_current_blocked - change current->blocked mask | ||
2316 | * @newset: new mask | ||
2317 | * | ||
2315 | * It is wrong to change ->blocked directly; this helper should be used | ||
2319 | * to ensure the process can't miss a shared signal we are going to block. | ||
2096 | */ | 2320 | */ |
2321 | void set_current_blocked(const sigset_t *newset) | ||
2322 | { | ||
2323 | struct task_struct *tsk = current; | ||
2324 | |||
2325 | spin_lock_irq(&tsk->sighand->siglock); | ||
2326 | __set_task_blocked(tsk, newset); | ||
2327 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2328 | } | ||
2097 | 2329 | ||
2098 | /* | 2330 | /* |
2099 | * This is also useful for kernel threads that want to temporarily | 2331 | * This is also useful for kernel threads that want to temporarily |
@@ -2105,73 +2337,66 @@ long do_no_restart_syscall(struct restart_block *param) | |||
2105 | */ | 2337 | */ |
2106 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | 2338 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) |
2107 | { | 2339 | { |
2108 | int error; | 2340 | struct task_struct *tsk = current; |
2341 | sigset_t newset; | ||
2109 | 2342 | ||
2110 | spin_lock_irq(¤t->sighand->siglock); | 2343 | /* Lockless, only current can change ->blocked, never from irq */ |
2111 | if (oldset) | 2344 | if (oldset) |
2112 | *oldset = current->blocked; | 2345 | *oldset = tsk->blocked; |
2113 | 2346 | ||
2114 | error = 0; | ||
2115 | switch (how) { | 2347 | switch (how) { |
2116 | case SIG_BLOCK: | 2348 | case SIG_BLOCK: |
2117 | sigorsets(¤t->blocked, ¤t->blocked, set); | 2349 | sigorsets(&newset, &tsk->blocked, set); |
2118 | break; | 2350 | break; |
2119 | case SIG_UNBLOCK: | 2351 | case SIG_UNBLOCK: |
2120 | signandsets(¤t->blocked, ¤t->blocked, set); | 2352 | sigandnsets(&newset, &tsk->blocked, set); |
2121 | break; | 2353 | break; |
2122 | case SIG_SETMASK: | 2354 | case SIG_SETMASK: |
2123 | current->blocked = *set; | 2355 | newset = *set; |
2124 | break; | 2356 | break; |
2125 | default: | 2357 | default: |
2126 | error = -EINVAL; | 2358 | return -EINVAL; |
2127 | } | 2359 | } |
2128 | recalc_sigpending(); | ||
2129 | spin_unlock_irq(¤t->sighand->siglock); | ||
2130 | 2360 | ||
2131 | return error; | 2361 | set_current_blocked(&newset); |
2362 | return 0; | ||
2132 | } | 2363 | } |
2133 | 2364 | ||
2134 | /** | 2365 | /** |
2135 | * sys_rt_sigprocmask - change the list of currently blocked signals | 2366 | * sys_rt_sigprocmask - change the list of currently blocked signals |
2136 | * @how: whether to add, remove, or set signals | 2367 | * @how: whether to add, remove, or set signals |
2137 | * @set: stores pending signals | 2368 | * @nset: stores pending signals |
2138 | * @oset: previous value of signal mask if non-null | 2369 | * @oset: previous value of signal mask if non-null |
2139 | * @sigsetsize: size of sigset_t type | 2370 | * @sigsetsize: size of sigset_t type |
2140 | */ | 2371 | */ |
2141 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, | 2372 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, |
2142 | sigset_t __user *, oset, size_t, sigsetsize) | 2373 | sigset_t __user *, oset, size_t, sigsetsize) |
2143 | { | 2374 | { |
2144 | int error = -EINVAL; | ||
2145 | sigset_t old_set, new_set; | 2375 | sigset_t old_set, new_set; |
2376 | int error; | ||
2146 | 2377 | ||
2147 | /* XXX: Don't preclude handling different sized sigset_t's. */ | 2378 | /* XXX: Don't preclude handling different sized sigset_t's. */ |
2148 | if (sigsetsize != sizeof(sigset_t)) | 2379 | if (sigsetsize != sizeof(sigset_t)) |
2149 | goto out; | 2380 | return -EINVAL; |
2150 | 2381 | ||
2151 | if (set) { | 2382 | old_set = current->blocked; |
2152 | error = -EFAULT; | 2383 | |
2153 | if (copy_from_user(&new_set, set, sizeof(*set))) | 2384 | if (nset) { |
2154 | goto out; | 2385 | if (copy_from_user(&new_set, nset, sizeof(sigset_t))) |
2386 | return -EFAULT; | ||
2155 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 2387 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
2156 | 2388 | ||
2157 | error = sigprocmask(how, &new_set, &old_set); | 2389 | error = sigprocmask(how, &new_set, NULL); |
2158 | if (error) | 2390 | if (error) |
2159 | goto out; | 2391 | return error; |
2160 | if (oset) | 2392 | } |
2161 | goto set_old; | ||
2162 | } else if (oset) { | ||
2163 | spin_lock_irq(¤t->sighand->siglock); | ||
2164 | old_set = current->blocked; | ||
2165 | spin_unlock_irq(¤t->sighand->siglock); | ||
2166 | 2393 | ||
2167 | set_old: | 2394 | if (oset) { |
2168 | error = -EFAULT; | 2395 | if (copy_to_user(oset, &old_set, sizeof(sigset_t))) |
2169 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | 2396 | return -EFAULT; |
2170 | goto out; | ||
2171 | } | 2397 | } |
2172 | error = 0; | 2398 | |
2173 | out: | 2399 | return 0; |
2174 | return error; | ||
2175 | } | 2400 | } |
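To illustrate the semantics this syscall backs, a small userspace sketch using the glibc sigprocmask(2) wrapper (illustrative only, not taken from the patch):

    #include <signal.h>
    #include <stdio.h>

    int main(void)
    {
            sigset_t set, old;

            sigemptyset(&set);
            sigaddset(&set, SIGINT);
            /* SIG_BLOCK ORs @set into the blocked mask; the previous
             * mask is returned through @old. */
            if (sigprocmask(SIG_BLOCK, &set, &old) != 0)
                    perror("sigprocmask");
            /* Attempts to block SIGKILL/SIGSTOP are silently dropped,
             * matching the sigdelsetmask() call in sys_rt_sigprocmask. */
            return 0;
    }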
2176 | 2401 | ||
2177 | long do_sigpending(void __user *set, unsigned long sigsetsize) | 2402 | long do_sigpending(void __user *set, unsigned long sigsetsize) |
@@ -2284,6 +2509,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2284 | #endif | 2509 | #endif |
2285 | 2510 | ||
2286 | /** | 2511 | /** |
2512 | * do_sigtimedwait - wait for queued signals specified in @which | ||
2513 | * @which: queued signals to wait for | ||
2514 | * @info: if non-null, the signal's siginfo is returned here | ||
2515 | * @ts: upper bound on process time suspension | ||
2516 | */ | ||
2517 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | ||
2518 | const struct timespec *ts) | ||
2519 | { | ||
2520 | struct task_struct *tsk = current; | ||
2521 | long timeout = MAX_SCHEDULE_TIMEOUT; | ||
2522 | sigset_t mask = *which; | ||
2523 | int sig; | ||
2524 | |||
2525 | if (ts) { | ||
2526 | if (!timespec_valid(ts)) | ||
2527 | return -EINVAL; | ||
2528 | timeout = timespec_to_jiffies(ts); | ||
2529 | /* | ||
2530 | * We can be close to the next tick, add another one | ||
2531 | * to ensure we will wait at least the time asked for. | ||
2532 | */ | ||
2533 | if (ts->tv_sec || ts->tv_nsec) | ||
2534 | timeout++; | ||
2535 | } | ||
2536 | |||
2537 | /* | ||
2538 | * Invert the set of allowed signals to get those we want to block. | ||
2539 | */ | ||
2540 | sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2541 | signotset(&mask); | ||
2542 | |||
2543 | spin_lock_irq(&tsk->sighand->siglock); | ||
2544 | sig = dequeue_signal(tsk, &mask, info); | ||
2545 | if (!sig && timeout) { | ||
2546 | /* | ||
2547 | * None ready, temporarily unblock those we're interested in | ||
2548 | * while we are sleeping so that we'll be awakened when | ||
2549 | * they arrive. Unblocking is always fine, we can avoid | ||
2550 | * set_current_blocked(). | ||
2551 | */ | ||
2552 | tsk->real_blocked = tsk->blocked; | ||
2553 | sigandsets(&tsk->blocked, &tsk->blocked, &mask); | ||
2554 | recalc_sigpending(); | ||
2555 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2556 | |||
2557 | timeout = schedule_timeout_interruptible(timeout); | ||
2558 | |||
2559 | spin_lock_irq(&tsk->sighand->siglock); | ||
2560 | __set_task_blocked(tsk, &tsk->real_blocked); | ||
2561 | siginitset(&tsk->real_blocked, 0); | ||
2562 | sig = dequeue_signal(tsk, &mask, info); | ||
2563 | } | ||
2564 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2565 | |||
2566 | if (sig) | ||
2567 | return sig; | ||
2568 | return timeout ? -EINTR : -EAGAIN; | ||
2569 | } | ||
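A hedged userspace sketch of the behaviour do_sigtimedwait() implements, via the sigtimedwait(2) wrapper (illustrative only):

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            sigset_t set;
            siginfo_t info;
            struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
            int sig;

            sigemptyset(&set);
            sigaddset(&set, SIGTERM);
            /* Block the signal so it stays queued instead of being
             * delivered to a handler while we wait. */
            sigprocmask(SIG_BLOCK, &set, NULL);

            sig = sigtimedwait(&set, &info, &ts);
            if (sig < 0)
                    perror("sigtimedwait"); /* EAGAIN on timeout, EINTR otherwise */
            else
                    printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
            return 0;
    }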
2570 | |||
2571 | /** | ||
2287 | * sys_rt_sigtimedwait - synchronously wait for queued signals specified | 2572 | * sys_rt_sigtimedwait - synchronously wait for queued signals specified |
2288 | * in @uthese | 2573 | * in @uthese |
2289 | * @uthese: queued signals to wait for | 2574 | * @uthese: queued signals to wait for |
@@ -2295,11 +2580,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
2295 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, | 2580 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, |
2296 | size_t, sigsetsize) | 2581 | size_t, sigsetsize) |
2297 | { | 2582 | { |
2298 | int ret, sig; | ||
2299 | sigset_t these; | 2583 | sigset_t these; |
2300 | struct timespec ts; | 2584 | struct timespec ts; |
2301 | siginfo_t info; | 2585 | siginfo_t info; |
2302 | long timeout = 0; | 2586 | int ret; |
2303 | 2587 | ||
2304 | /* XXX: Don't preclude handling different sized sigset_t's. */ | 2588 | /* XXX: Don't preclude handling different sized sigset_t's. */ |
2305 | if (sigsetsize != sizeof(sigset_t)) | 2589 | if (sigsetsize != sizeof(sigset_t)) |
@@ -2308,61 +2592,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
2308 | if (copy_from_user(&these, uthese, sizeof(these))) | 2592 | if (copy_from_user(&these, uthese, sizeof(these))) |
2309 | return -EFAULT; | 2593 | return -EFAULT; |
2310 | 2594 | ||
2311 | /* | ||
2312 | * Invert the set of allowed signals to get those we | ||
2313 | * want to block. | ||
2314 | */ | ||
2315 | sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2316 | signotset(&these); | ||
2317 | |||
2318 | if (uts) { | 2595 | if (uts) { |
2319 | if (copy_from_user(&ts, uts, sizeof(ts))) | 2596 | if (copy_from_user(&ts, uts, sizeof(ts))) |
2320 | return -EFAULT; | 2597 | return -EFAULT; |
2321 | if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 | ||
2322 | || ts.tv_sec < 0) | ||
2323 | return -EINVAL; | ||
2324 | } | 2598 | } |
2325 | 2599 | ||
2326 | spin_lock_irq(¤t->sighand->siglock); | 2600 | ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); |
2327 | sig = dequeue_signal(current, &these, &info); | ||
2328 | if (!sig) { | ||
2329 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
2330 | if (uts) | ||
2331 | timeout = (timespec_to_jiffies(&ts) | ||
2332 | + (ts.tv_sec || ts.tv_nsec)); | ||
2333 | |||
2334 | if (timeout) { | ||
2335 | /* | ||
2336 | * None ready -- temporarily unblock those we're | ||
2337 | * interested while we are sleeping in so that we'll | ||
2338 | * be awakened when they arrive. | ||
2339 | */ | ||
2340 | current->real_blocked = current->blocked; | ||
2341 | sigandsets(¤t->blocked, ¤t->blocked, &these); | ||
2342 | recalc_sigpending(); | ||
2343 | spin_unlock_irq(¤t->sighand->siglock); | ||
2344 | |||
2345 | timeout = schedule_timeout_interruptible(timeout); | ||
2346 | |||
2347 | spin_lock_irq(¤t->sighand->siglock); | ||
2348 | sig = dequeue_signal(current, &these, &info); | ||
2349 | current->blocked = current->real_blocked; | ||
2350 | siginitset(¤t->real_blocked, 0); | ||
2351 | recalc_sigpending(); | ||
2352 | } | ||
2353 | } | ||
2354 | spin_unlock_irq(¤t->sighand->siglock); | ||
2355 | 2601 | ||
2356 | if (sig) { | 2602 | if (ret > 0 && uinfo) { |
2357 | ret = sig; | 2603 | if (copy_siginfo_to_user(uinfo, &info)) |
2358 | if (uinfo) { | 2604 | ret = -EFAULT; |
2359 | if (copy_siginfo_to_user(uinfo, &info)) | ||
2360 | ret = -EFAULT; | ||
2361 | } | ||
2362 | } else { | ||
2363 | ret = -EAGAIN; | ||
2364 | if (timeout) | ||
2365 | ret = -EINTR; | ||
2366 | } | 2605 | } |
2367 | 2606 | ||
2368 | return ret; | 2607 | return ret; |
@@ -2650,60 +2889,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | |||
2650 | /** | 2889 | /** |
2651 | * sys_sigprocmask - examine and change blocked signals | 2890 | * sys_sigprocmask - examine and change blocked signals |
2652 | * @how: whether to add, remove, or set signals | 2891 | * @how: whether to add, remove, or set signals |
2653 | * @set: signals to add or remove (if non-null) | 2892 | * @nset: signals to add or remove (if non-null) |
2654 | * @oset: previous value of signal mask if non-null | 2893 | * @oset: previous value of signal mask if non-null |
2655 | * | 2894 | * |
2656 | * Some platforms have their own version with special arguments; | 2895 | * Some platforms have their own version with special arguments; |
2657 | * others support only sys_rt_sigprocmask. | 2896 | * others support only sys_rt_sigprocmask. |
2658 | */ | 2897 | */ |
2659 | 2898 | ||
2660 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, | 2899 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, |
2661 | old_sigset_t __user *, oset) | 2900 | old_sigset_t __user *, oset) |
2662 | { | 2901 | { |
2663 | int error; | ||
2664 | old_sigset_t old_set, new_set; | 2902 | old_sigset_t old_set, new_set; |
2903 | sigset_t new_blocked; | ||
2665 | 2904 | ||
2666 | if (set) { | 2905 | old_set = current->blocked.sig[0]; |
2667 | error = -EFAULT; | 2906 | |
2668 | if (copy_from_user(&new_set, set, sizeof(*set))) | 2907 | if (nset) { |
2669 | goto out; | 2908 | if (copy_from_user(&new_set, nset, sizeof(*nset))) |
2909 | return -EFAULT; | ||
2670 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | 2910 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); |
2671 | 2911 | ||
2672 | spin_lock_irq(¤t->sighand->siglock); | 2912 | new_blocked = current->blocked; |
2673 | old_set = current->blocked.sig[0]; | ||
2674 | 2913 | ||
2675 | error = 0; | ||
2676 | switch (how) { | 2914 | switch (how) { |
2677 | default: | ||
2678 | error = -EINVAL; | ||
2679 | break; | ||
2680 | case SIG_BLOCK: | 2915 | case SIG_BLOCK: |
2681 | sigaddsetmask(¤t->blocked, new_set); | 2916 | sigaddsetmask(&new_blocked, new_set); |
2682 | break; | 2917 | break; |
2683 | case SIG_UNBLOCK: | 2918 | case SIG_UNBLOCK: |
2684 | sigdelsetmask(¤t->blocked, new_set); | 2919 | sigdelsetmask(&new_blocked, new_set); |
2685 | break; | 2920 | break; |
2686 | case SIG_SETMASK: | 2921 | case SIG_SETMASK: |
2687 | current->blocked.sig[0] = new_set; | 2922 | new_blocked.sig[0] = new_set; |
2688 | break; | 2923 | break; |
2924 | default: | ||
2925 | return -EINVAL; | ||
2689 | } | 2926 | } |
2690 | 2927 | ||
2691 | recalc_sigpending(); | 2928 | set_current_blocked(&new_blocked); |
2692 | spin_unlock_irq(¤t->sighand->siglock); | 2929 | } |
2693 | if (error) | 2930 | |
2694 | goto out; | 2931 | if (oset) { |
2695 | if (oset) | ||
2696 | goto set_old; | ||
2697 | } else if (oset) { | ||
2698 | old_set = current->blocked.sig[0]; | ||
2699 | set_old: | ||
2700 | error = -EFAULT; | ||
2701 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | 2932 | if (copy_to_user(oset, &old_set, sizeof(*oset))) |
2702 | goto out; | 2933 | return -EFAULT; |
2703 | } | 2934 | } |
2704 | error = 0; | 2935 | |
2705 | out: | 2936 | return 0; |
2706 | return error; | ||
2707 | } | 2937 | } |
2708 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ | 2938 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ |
2709 | 2939 | ||
@@ -2793,8 +3023,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) | |||
2793 | 3023 | ||
2794 | SYSCALL_DEFINE0(pause) | 3024 | SYSCALL_DEFINE0(pause) |
2795 | { | 3025 | { |
2796 | current->state = TASK_INTERRUPTIBLE; | 3026 | while (!signal_pending(current)) { |
2797 | schedule(); | 3027 | current->state = TASK_INTERRUPTIBLE; |
3028 | schedule(); | ||
3029 | } | ||
2798 | return -ERESTARTNOHAND; | 3030 | return -ERESTARTNOHAND; |
2799 | } | 3031 | } |
2800 | 3032 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 73a195193558..fb67dfa8394e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { | |||
74 | .notifier_call = hotplug_cfd, | 74 | .notifier_call = hotplug_cfd, |
75 | }; | 75 | }; |
76 | 76 | ||
77 | static int __cpuinit init_call_single_data(void) | 77 | void __init call_function_init(void) |
78 | { | 78 | { |
79 | void *cpu = (void *)(long)smp_processor_id(); | 79 | void *cpu = (void *)(long)smp_processor_id(); |
80 | int i; | 80 | int i; |
@@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void) | |||
88 | 88 | ||
89 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); | 89 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); |
90 | register_cpu_notifier(&hotplug_cfd_notifier); | 90 | register_cpu_notifier(&hotplug_cfd_notifier); |
91 | |||
92 | return 0; | ||
93 | } | 91 | } |
94 | early_initcall(init_call_single_data); | ||
95 | 92 | ||
96 | /* | 93 | /* |
97 | * csd_lock/csd_unlock used to serialize access to per-cpu csd resources | 94 | * csd_lock/csd_unlock used to serialize access to per-cpu csd resources |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 174f976c2874..40cf63ddd4b3 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | |||
58 | 58 | ||
59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
61 | "TASKLET", "SCHED", "HRTIMER", "RCU" | 61 | "TASKLET", "SCHED", "HRTIMER", "RCU" |
62 | }; | 62 | }; |
63 | 63 | ||
64 | /* | 64 | /* |
diff --git a/kernel/sys.c b/kernel/sys.c index af468edf096a..e4128b278f23 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -314,8 +314,8 @@ void kernel_restart_prepare(char *cmd) | |||
314 | { | 314 | { |
315 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 315 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
316 | system_state = SYSTEM_RESTART; | 316 | system_state = SYSTEM_RESTART; |
317 | usermodehelper_disable(); | ||
317 | device_shutdown(); | 318 | device_shutdown(); |
318 | sysdev_shutdown(); | ||
319 | syscore_shutdown(); | 319 | syscore_shutdown(); |
320 | } | 320 | } |
321 | 321 | ||
@@ -344,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
344 | blocking_notifier_call_chain(&reboot_notifier_list, | 344 | blocking_notifier_call_chain(&reboot_notifier_list, |
345 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); | 345 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); |
346 | system_state = state; | 346 | system_state = state; |
347 | usermodehelper_disable(); | ||
347 | device_shutdown(); | 348 | device_shutdown(); |
348 | } | 349 | } |
349 | /** | 350 | /** |
@@ -354,7 +355,6 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
354 | void kernel_halt(void) | 355 | void kernel_halt(void) |
355 | { | 356 | { |
356 | kernel_shutdown_prepare(SYSTEM_HALT); | 357 | kernel_shutdown_prepare(SYSTEM_HALT); |
357 | sysdev_shutdown(); | ||
358 | syscore_shutdown(); | 358 | syscore_shutdown(); |
359 | printk(KERN_EMERG "System halted.\n"); | 359 | printk(KERN_EMERG "System halted.\n"); |
360 | kmsg_dump(KMSG_DUMP_HALT); | 360 | kmsg_dump(KMSG_DUMP_HALT); |
@@ -374,7 +374,6 @@ void kernel_power_off(void) | |||
374 | if (pm_power_off_prepare) | 374 | if (pm_power_off_prepare) |
375 | pm_power_off_prepare(); | 375 | pm_power_off_prepare(); |
376 | disable_nonboot_cpus(); | 376 | disable_nonboot_cpus(); |
377 | sysdev_shutdown(); | ||
378 | syscore_shutdown(); | 377 | syscore_shutdown(); |
379 | printk(KERN_EMERG "Power down.\n"); | 378 | printk(KERN_EMERG "Power down.\n"); |
380 | kmsg_dump(KMSG_DUMP_POWEROFF); | 379 | kmsg_dump(KMSG_DUMP_POWEROFF); |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 25cc41cd8f33..62cbc8877fef 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt); | |||
46 | cond_syscall(compat_sys_getsockopt); | 46 | cond_syscall(compat_sys_getsockopt); |
47 | cond_syscall(sys_shutdown); | 47 | cond_syscall(sys_shutdown); |
48 | cond_syscall(sys_sendmsg); | 48 | cond_syscall(sys_sendmsg); |
49 | cond_syscall(sys_sendmmsg); | ||
49 | cond_syscall(compat_sys_sendmsg); | 50 | cond_syscall(compat_sys_sendmsg); |
51 | cond_syscall(compat_sys_sendmmsg); | ||
50 | cond_syscall(sys_recvmsg); | 52 | cond_syscall(sys_recvmsg); |
51 | cond_syscall(sys_recvmmsg); | 53 | cond_syscall(sys_recvmmsg); |
52 | cond_syscall(compat_sys_recvmsg); | 54 | cond_syscall(compat_sys_recvmsg); |
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait); | |||
69 | cond_syscall(sys_semget); | 71 | cond_syscall(sys_semget); |
70 | cond_syscall(sys_semop); | 72 | cond_syscall(sys_semop); |
71 | cond_syscall(sys_semtimedop); | 73 | cond_syscall(sys_semtimedop); |
74 | cond_syscall(compat_sys_semtimedop); | ||
72 | cond_syscall(sys_semctl); | 75 | cond_syscall(sys_semctl); |
76 | cond_syscall(compat_sys_semctl); | ||
73 | cond_syscall(sys_msgget); | 77 | cond_syscall(sys_msgget); |
74 | cond_syscall(sys_msgsnd); | 78 | cond_syscall(sys_msgsnd); |
79 | cond_syscall(compat_sys_msgsnd); | ||
75 | cond_syscall(sys_msgrcv); | 80 | cond_syscall(sys_msgrcv); |
81 | cond_syscall(compat_sys_msgrcv); | ||
76 | cond_syscall(sys_msgctl); | 82 | cond_syscall(sys_msgctl); |
83 | cond_syscall(compat_sys_msgctl); | ||
77 | cond_syscall(sys_shmget); | 84 | cond_syscall(sys_shmget); |
78 | cond_syscall(sys_shmat); | 85 | cond_syscall(sys_shmat); |
86 | cond_syscall(compat_sys_shmat); | ||
79 | cond_syscall(sys_shmdt); | 87 | cond_syscall(sys_shmdt); |
80 | cond_syscall(sys_shmctl); | 88 | cond_syscall(sys_shmctl); |
89 | cond_syscall(compat_sys_shmctl); | ||
81 | cond_syscall(sys_mq_open); | 90 | cond_syscall(sys_mq_open); |
82 | cond_syscall(sys_mq_unlink); | 91 | cond_syscall(sys_mq_unlink); |
83 | cond_syscall(sys_mq_timedsend); | 92 | cond_syscall(sys_mq_timedsend); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b17..f175d98bd355 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/kprobes.h> | 56 | #include <linux/kprobes.h> |
57 | #include <linux/pipe_fs_i.h> | 57 | #include <linux/pipe_fs_i.h> |
58 | #include <linux/oom.h> | 58 | #include <linux/oom.h> |
59 | #include <linux/kmod.h> | ||
59 | 60 | ||
60 | #include <asm/uaccess.h> | 61 | #include <asm/uaccess.h> |
61 | #include <asm/processor.h> | 62 | #include <asm/processor.h> |
@@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = { | |||
616 | .child = random_table, | 617 | .child = random_table, |
617 | }, | 618 | }, |
618 | { | 619 | { |
620 | .procname = "usermodehelper", | ||
621 | .mode = 0555, | ||
622 | .child = usermodehelper_table, | ||
623 | }, | ||
624 | { | ||
619 | .procname = "overflowuid", | 625 | .procname = "overflowuid", |
620 | .data = &overflowuid, | 626 | .data = &overflowuid, |
621 | .maxlen = sizeof(int), | 627 | .maxlen = sizeof(int), |
@@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = { | |||
730 | .data = &watchdog_enabled, | 736 | .data = &watchdog_enabled, |
731 | .maxlen = sizeof (int), | 737 | .maxlen = sizeof (int), |
732 | .mode = 0644, | 738 | .mode = 0644, |
733 | .proc_handler = proc_dowatchdog_enabled, | 739 | .proc_handler = proc_dowatchdog, |
740 | .extra1 = &zero, | ||
741 | .extra2 = &one, | ||
734 | }, | 742 | }, |
735 | { | 743 | { |
736 | .procname = "watchdog_thresh", | 744 | .procname = "watchdog_thresh", |
737 | .data = &softlockup_thresh, | 745 | .data = &watchdog_thresh, |
738 | .maxlen = sizeof(int), | 746 | .maxlen = sizeof(int), |
739 | .mode = 0644, | 747 | .mode = 0644, |
740 | .proc_handler = proc_dowatchdog_thresh, | 748 | .proc_handler = proc_dowatchdog, |
741 | .extra1 = &neg_one, | 749 | .extra1 = &neg_one, |
742 | .extra2 = &sixty, | 750 | .extra2 = &sixty, |
743 | }, | 751 | }, |
@@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = { | |||
755 | .data = &watchdog_enabled, | 763 | .data = &watchdog_enabled, |
756 | .maxlen = sizeof (int), | 764 | .maxlen = sizeof (int), |
757 | .mode = 0644, | 765 | .mode = 0644, |
758 | .proc_handler = proc_dowatchdog_enabled, | 766 | .proc_handler = proc_dowatchdog, |
767 | .extra1 = &zero, | ||
768 | .extra2 = &one, | ||
759 | }, | 769 | }, |
760 | #endif | 770 | #endif |
761 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 771 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
@@ -928,6 +938,12 @@ static struct ctl_table kern_table[] = { | |||
928 | }, | 938 | }, |
929 | #endif | 939 | #endif |
930 | #ifdef CONFIG_PERF_EVENTS | 940 | #ifdef CONFIG_PERF_EVENTS |
941 | /* | ||
942 | * User-space scripts rely on the existence of this file | ||
943 | * as a feature check for perf_events being enabled. | ||
944 | * | ||
945 | * So it's an ABI, do not remove! | ||
946 | */ | ||
931 | { | 947 | { |
932 | .procname = "perf_event_paranoid", | 948 | .procname = "perf_event_paranoid", |
933 | .data = &sysctl_perf_event_paranoid, | 949 | .data = &sysctl_perf_event_paranoid, |
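Since the comment above promises the file's existence as an ABI, here is a sketch of the kind of userspace feature probe it refers to (the probe is illustrative, not taken from any particular tool):

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* Scripts only need the file to exist, not any particular value. */
            if (access("/proc/sys/kernel/perf_event_paranoid", F_OK) == 0)
                    printf("perf_events supported\n");
            else
                    printf("perf_events not available\n");
            return 0;
    }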
@@ -1496,7 +1512,7 @@ static struct ctl_table fs_table[] = { | |||
1496 | 1512 | ||
1497 | static struct ctl_table debug_table[] = { | 1513 | static struct ctl_table debug_table[] = { |
1498 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ | 1514 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ |
1499 | defined(CONFIG_S390) | 1515 | defined(CONFIG_S390) || defined(CONFIG_TILE) |
1500 | { | 1516 | { |
1501 | .procname = "exception-trace", | 1517 | .procname = "exception-trace", |
1502 | .data = &show_unhandled_signals, | 1518 | .data = &show_unhandled_signals, |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9ffea360a778..fc0f22005417 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -285,16 +285,18 @@ ret: | |||
285 | static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | 285 | static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) |
286 | { | 286 | { |
287 | struct listener_list *listeners; | 287 | struct listener_list *listeners; |
288 | struct listener *s, *tmp; | 288 | struct listener *s, *tmp, *s2; |
289 | unsigned int cpu; | 289 | unsigned int cpu; |
290 | 290 | ||
291 | if (!cpumask_subset(mask, cpu_possible_mask)) | 291 | if (!cpumask_subset(mask, cpu_possible_mask)) |
292 | return -EINVAL; | 292 | return -EINVAL; |
293 | 293 | ||
294 | s = NULL; | ||
294 | if (isadd == REGISTER) { | 295 | if (isadd == REGISTER) { |
295 | for_each_cpu(cpu, mask) { | 296 | for_each_cpu(cpu, mask) { |
296 | s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, | 297 | if (!s) |
297 | cpu_to_node(cpu)); | 298 | s = kmalloc_node(sizeof(struct listener), |
299 | GFP_KERNEL, cpu_to_node(cpu)); | ||
298 | if (!s) | 300 | if (!s) |
299 | goto cleanup; | 301 | goto cleanup; |
300 | s->pid = pid; | 302 | s->pid = pid; |
@@ -303,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
303 | 305 | ||
304 | listeners = &per_cpu(listener_array, cpu); | 306 | listeners = &per_cpu(listener_array, cpu); |
305 | down_write(&listeners->sem); | 307 | down_write(&listeners->sem); |
308 | list_for_each_entry_safe(s2, tmp, &listeners->list, list) { | ||
309 | if (s2->pid == pid) | ||
310 | goto next_cpu; | ||
311 | } | ||
306 | list_add(&s->list, &listeners->list); | 312 | list_add(&s->list, &listeners->list); |
313 | s = NULL; | ||
314 | next_cpu: | ||
307 | up_write(&listeners->sem); | 315 | up_write(&listeners->sem); |
308 | } | 316 | } |
317 | kfree(s); | ||
309 | return 0; | 318 | return 0; |
310 | } | 319 | } |
311 | 320 | ||
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index b0425991e9ac..e2fd74b8e8c2 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o |
2 | obj-y += timeconv.o posix-clock.o | 2 | obj-y += timeconv.o posix-clock.o alarmtimer.o |
3 | 3 | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 000000000000..59f369f98a04 --- /dev/null +++ b/kernel/time/alarmtimer.c | |||
@@ -0,0 +1,720 @@ | |||
1 | /* | ||
2 | * Alarmtimer interface | ||
3 | * | ||
4 | * This interface provides a timer which is similar to hrtimers, | ||
5 | * but triggers an RTC alarm if the box is suspended. | ||
6 | * | ||
7 | * This interface is influenced by the Android RTC Alarm timer | ||
8 | * interface. | ||
9 | * | ||
10 | * Copyright (C) 2010 IBM Corporation | ||
11 | * | ||
12 | * Author: John Stultz <john.stultz@linaro.org> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify | ||
15 | * it under the terms of the GNU General Public License version 2 as | ||
16 | * published by the Free Software Foundation. | ||
17 | */ | ||
18 | #include <linux/time.h> | ||
19 | #include <linux/hrtimer.h> | ||
20 | #include <linux/timerqueue.h> | ||
21 | #include <linux/rtc.h> | ||
22 | #include <linux/alarmtimer.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <linux/platform_device.h> | ||
25 | #include <linux/posix-timers.h> | ||
26 | #include <linux/workqueue.h> | ||
27 | #include <linux/freezer.h> | ||
28 | |||
29 | /** | ||
30 | * struct alarm_base - Alarm timer bases | ||
31 | * @lock: Lock for synchronized access to the base | ||
32 | * @timerqueue: Timerqueue head managing the list of events | ||
33 | * @timer: hrtimer used to schedule events while running | ||
34 | * @gettime: Function to read the time correlating to the base | ||
35 | * @base_clockid: clockid for the base | ||
36 | */ | ||
37 | static struct alarm_base { | ||
38 | spinlock_t lock; | ||
39 | struct timerqueue_head timerqueue; | ||
40 | struct hrtimer timer; | ||
41 | ktime_t (*gettime)(void); | ||
42 | clockid_t base_clockid; | ||
43 | } alarm_bases[ALARM_NUMTYPE]; | ||
44 | |||
45 | /* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ | ||
46 | static ktime_t freezer_delta; | ||
47 | static DEFINE_SPINLOCK(freezer_delta_lock); | ||
48 | |||
49 | #ifdef CONFIG_RTC_CLASS | ||
50 | /* rtc timer and device for setting alarm wakeups at suspend */ | ||
51 | static struct rtc_timer rtctimer; | ||
52 | static struct rtc_device *rtcdev; | ||
53 | static DEFINE_SPINLOCK(rtcdev_lock); | ||
54 | |||
55 | /** | ||
56 | * has_wakealarm - check rtc device has wakealarm ability | ||
57 | * @dev: current device | ||
58 | * @name_ptr: name to be returned | ||
59 | * | ||
60 | * This helper function checks to see if the rtc device can wake | ||
61 | * from suspend. | ||
62 | */ | ||
63 | static int has_wakealarm(struct device *dev, void *name_ptr) | ||
64 | { | ||
65 | struct rtc_device *candidate = to_rtc_device(dev); | ||
66 | |||
67 | if (!candidate->ops->set_alarm) | ||
68 | return 0; | ||
69 | if (!device_may_wakeup(candidate->dev.parent)) | ||
70 | return 0; | ||
71 | |||
72 | *(const char **)name_ptr = dev_name(dev); | ||
73 | return 1; | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * alarmtimer_get_rtcdev - Return selected rtcdevice | ||
78 | * | ||
79 | * This function returns the rtc device to use for wakealarms. | ||
80 | * If one has not already been chosen, it checks to see if a | ||
81 | * functional rtc device is available. | ||
82 | */ | ||
83 | static struct rtc_device *alarmtimer_get_rtcdev(void) | ||
84 | { | ||
85 | struct device *dev; | ||
86 | char *str; | ||
87 | unsigned long flags; | ||
88 | struct rtc_device *ret; | ||
89 | |||
90 | spin_lock_irqsave(&rtcdev_lock, flags); | ||
91 | if (!rtcdev) { | ||
92 | /* Find an rtc device and init the rtc_timer */ | ||
93 | dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); | ||
94 | /* If we have a device then str is valid. See has_wakealarm() */ | ||
95 | if (dev) { | ||
96 | rtcdev = rtc_class_open(str); | ||
97 | /* | ||
98 | * Drop the reference we got in class_find_device, | ||
99 | * rtc_open takes its own. | ||
100 | */ | ||
101 | put_device(dev); | ||
102 | rtc_timer_init(&rtctimer, NULL, NULL); | ||
103 | } | ||
104 | } | ||
105 | ret = rtcdev; | ||
106 | spin_unlock_irqrestore(&rtcdev_lock, flags); | ||
107 | |||
108 | return ret; | ||
109 | } | ||
110 | #else | ||
111 | #define alarmtimer_get_rtcdev() (0) | ||
112 | #define rtcdev (0) | ||
113 | #endif | ||
114 | |||
115 | |||
116 | /** | ||
117 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue | ||
118 | * @base: pointer to the base where the timer is being run | ||
119 | * @alarm: pointer to alarm being enqueued. | ||
120 | * | ||
121 | * Adds the alarm to an alarm_base timerqueue and if necessary sets | ||
122 | * an hrtimer to run. | ||
123 | * | ||
124 | * Must hold base->lock when calling. | ||
125 | */ | ||
126 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) | ||
127 | { | ||
128 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
129 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { | ||
130 | hrtimer_try_to_cancel(&base->timer); | ||
131 | hrtimer_start(&base->timer, alarm->node.expires, | ||
132 | HRTIMER_MODE_ABS); | ||
133 | } | ||
134 | } | ||
135 | |||
136 | /** | ||
137 | * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue | ||
138 | * @base: pointer to the base where the timer is running | ||
139 | * @alarm: pointer to alarm being removed | ||
140 | * | ||
141 | * Removes the alarm from an alarm_base timerqueue and if necessary sets | ||
142 | * a new timer to run. | ||
143 | * | ||
144 | * Must hold base->lock when calling. | ||
145 | */ | ||
146 | static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | ||
147 | { | ||
148 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); | ||
149 | |||
150 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
151 | if (next == &alarm->node) { | ||
152 | hrtimer_try_to_cancel(&base->timer); | ||
153 | next = timerqueue_getnext(&base->timerqueue); | ||
154 | if (!next) | ||
155 | return; | ||
156 | hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); | ||
157 | } | ||
158 | } | ||
159 | |||
160 | |||
161 | /** | ||
162 | * alarmtimer_fired - Handles alarm hrtimer being fired. | ||
163 | * @timer: pointer to hrtimer being run | ||
164 | * | ||
165 | * When an alarm timer fires, this runs through the timerqueue to | ||
166 | * see which alarms expired, and runs those. If there are more alarm | ||
167 | * timers queued for the future, we set the hrtimer to fire | ||
168 | * when the next future alarm timer expires. | ||
169 | */ | ||
170 | static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | ||
171 | { | ||
172 | struct alarm_base *base = container_of(timer, struct alarm_base, timer); | ||
173 | struct timerqueue_node *next; | ||
174 | unsigned long flags; | ||
175 | ktime_t now; | ||
176 | int ret = HRTIMER_NORESTART; | ||
177 | |||
178 | spin_lock_irqsave(&base->lock, flags); | ||
179 | now = base->gettime(); | ||
180 | while ((next = timerqueue_getnext(&base->timerqueue))) { | ||
181 | struct alarm *alarm; | ||
182 | ktime_t expired = next->expires; | ||
183 | |||
184 | if (expired.tv64 >= now.tv64) | ||
185 | break; | ||
186 | |||
187 | alarm = container_of(next, struct alarm, node); | ||
188 | |||
189 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
190 | alarm->enabled = 0; | ||
191 | /* Re-add periodic timers */ | ||
192 | if (alarm->period.tv64) { | ||
193 | alarm->node.expires = ktime_add(expired, alarm->period); | ||
194 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
195 | alarm->enabled = 1; | ||
196 | } | ||
197 | spin_unlock_irqrestore(&base->lock, flags); | ||
198 | if (alarm->function) | ||
199 | alarm->function(alarm); | ||
200 | spin_lock_irqsave(&base->lock, flags); | ||
201 | } | ||
202 | |||
203 | if (next) { | ||
204 | hrtimer_set_expires(&base->timer, next->expires); | ||
205 | ret = HRTIMER_RESTART; | ||
206 | } | ||
207 | spin_unlock_irqrestore(&base->lock, flags); | ||
208 | |||
209 | return ret; | ||
210 | |||
211 | } | ||
212 | |||
213 | #ifdef CONFIG_RTC_CLASS | ||
214 | /** | ||
215 | * alarmtimer_suspend - Suspend time callback | ||
216 | * @dev: unused | ||
217 | * @state: unused | ||
218 | * | ||
219 | * When we are going into suspend, we look through the bases | ||
220 | * to see which is the soonest timer to expire. We then | ||
221 | * set an rtc timer to fire that far into the future, which | ||
222 | * will wake us from suspend. | ||
223 | */ | ||
224 | static int alarmtimer_suspend(struct device *dev) | ||
225 | { | ||
226 | struct rtc_time tm; | ||
227 | ktime_t min, now; | ||
228 | unsigned long flags; | ||
229 | struct rtc_device *rtc; | ||
230 | int i; | ||
231 | |||
232 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
233 | min = freezer_delta; | ||
234 | freezer_delta = ktime_set(0, 0); | ||
235 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
236 | |||
237 | rtc = rtcdev; | ||
238 | /* If we have no rtcdev, just return */ | ||
239 | if (!rtc) | ||
240 | return 0; | ||
241 | |||
242 | /* Find the soonest timer to expire */ | ||
243 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
244 | struct alarm_base *base = &alarm_bases[i]; | ||
245 | struct timerqueue_node *next; | ||
246 | ktime_t delta; | ||
247 | |||
248 | spin_lock_irqsave(&base->lock, flags); | ||
249 | next = timerqueue_getnext(&base->timerqueue); | ||
250 | spin_unlock_irqrestore(&base->lock, flags); | ||
251 | if (!next) | ||
252 | continue; | ||
253 | delta = ktime_sub(next->expires, base->gettime()); | ||
254 | if (!min.tv64 || (delta.tv64 < min.tv64)) | ||
255 | min = delta; | ||
256 | } | ||
257 | if (min.tv64 == 0) | ||
258 | return 0; | ||
259 | |||
260 | /* XXX - Should we enforce a minimum sleep time? */ | ||
261 | WARN_ON(min.tv64 < NSEC_PER_SEC); | ||
262 | |||
263 | /* Setup an rtc timer to fire that far in the future */ | ||
264 | rtc_timer_cancel(rtc, &rtctimer); | ||
265 | rtc_read_time(rtc, &tm); | ||
266 | now = rtc_tm_to_ktime(tm); | ||
267 | now = ktime_add(now, min); | ||
268 | |||
269 | rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); | ||
270 | |||
271 | return 0; | ||
272 | } | ||
273 | #else | ||
274 | static int alarmtimer_suspend(struct device *dev) | ||
275 | { | ||
276 | return 0; | ||
277 | } | ||
278 | #endif | ||
279 | |||
280 | static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | ||
281 | { | ||
282 | ktime_t delta; | ||
283 | unsigned long flags; | ||
284 | struct alarm_base *base = &alarm_bases[type]; | ||
285 | |||
286 | delta = ktime_sub(absexp, base->gettime()); | ||
287 | |||
288 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
289 | if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) | ||
290 | freezer_delta = delta; | ||
291 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
292 | } | ||
293 | |||
294 | |||
295 | /** | ||
296 | * alarm_init - Initialize an alarm structure | ||
297 | * @alarm: ptr to alarm to be initialized | ||
298 | * @type: the type of the alarm | ||
299 | * @function: callback that is run when the alarm fires | ||
300 | */ | ||
301 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | ||
302 | void (*function)(struct alarm *)) | ||
303 | { | ||
304 | timerqueue_init(&alarm->node); | ||
305 | alarm->period = ktime_set(0, 0); | ||
306 | alarm->function = function; | ||
307 | alarm->type = type; | ||
308 | alarm->enabled = 0; | ||
309 | } | ||
310 | |||
311 | /** | ||
312 | * alarm_start - Sets an alarm to fire | ||
313 | * @alarm: ptr to alarm to set | ||
314 | * @start: time to run the alarm | ||
315 | * @period: period at which the alarm will recur | ||
316 | */ | ||
317 | void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) | ||
318 | { | ||
319 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
320 | unsigned long flags; | ||
321 | |||
322 | spin_lock_irqsave(&base->lock, flags); | ||
323 | if (alarm->enabled) | ||
324 | alarmtimer_remove(base, alarm); | ||
325 | alarm->node.expires = start; | ||
326 | alarm->period = period; | ||
327 | alarmtimer_enqueue(base, alarm); | ||
328 | alarm->enabled = 1; | ||
329 | spin_unlock_irqrestore(&base->lock, flags); | ||
330 | } | ||
331 | |||
332 | /** | ||
333 | * alarm_cancel - Tries to cancel an alarm timer | ||
334 | * @alarm: ptr to alarm to be canceled | ||
335 | */ | ||
336 | void alarm_cancel(struct alarm *alarm) | ||
337 | { | ||
338 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
339 | unsigned long flags; | ||
340 | |||
341 | spin_lock_irqsave(&base->lock, flags); | ||
342 | if (alarm->enabled) | ||
343 | alarmtimer_remove(base, alarm); | ||
344 | alarm->enabled = 0; | ||
345 | spin_unlock_irqrestore(&base->lock, flags); | ||
346 | } | ||
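A minimal, hypothetical in-kernel user of the alarm API added above (the names and the 10 second expiry are invented; headers and error handling omitted):

    static struct alarm example_alarm;

    static void example_alarm_fn(struct alarm *a)
    {
            pr_info("example alarm fired\n");
    }

    static void example_arm(void)
    {
            ktime_t expires;

            alarm_init(&example_alarm, ALARM_BOOTTIME, example_alarm_fn);
            expires = ktime_add(ktime_get_boottime(), ktime_set(10, 0));
            /* Zero period: one-shot, the alarm is not re-armed by
             * alarmtimer_fired(). */
            alarm_start(&example_alarm, expires, ktime_set(0, 0));
    }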
347 | |||
348 | |||
349 | /** | ||
350 | * clock2alarm - helper that converts from clockid to alarmtypes | ||
351 | * @clockid: clockid. | ||
352 | */ | ||
353 | static enum alarmtimer_type clock2alarm(clockid_t clockid) | ||
354 | { | ||
355 | if (clockid == CLOCK_REALTIME_ALARM) | ||
356 | return ALARM_REALTIME; | ||
357 | if (clockid == CLOCK_BOOTTIME_ALARM) | ||
358 | return ALARM_BOOTTIME; | ||
359 | return -1; | ||
360 | } | ||
361 | |||
362 | /** | ||
363 | * alarm_handle_timer - Callback for posix timers | ||
364 | * @alarm: alarm that fired | ||
365 | * | ||
366 | * Posix timer callback for expired alarm timers. | ||
367 | */ | ||
368 | static void alarm_handle_timer(struct alarm *alarm) | ||
369 | { | ||
370 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | ||
371 | it.alarmtimer); | ||
372 | if (posix_timer_event(ptr, 0) != 0) | ||
373 | ptr->it_overrun++; | ||
374 | } | ||
375 | |||
376 | /** | ||
377 | * alarm_clock_getres - posix getres interface | ||
378 | * @which_clock: clockid | ||
379 | * @tp: timespec to fill | ||
380 | * | ||
381 | * Returns the granularity of underlying alarm base clock | ||
382 | */ | ||
383 | static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | ||
384 | { | ||
385 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; | ||
386 | |||
387 | if (!alarmtimer_get_rtcdev()) | ||
388 | return -ENOTSUPP; | ||
389 | |||
390 | return hrtimer_get_res(baseid, tp); | ||
391 | } | ||
392 | |||
393 | /** | ||
394 | * alarm_clock_get - posix clock_get interface | ||
395 | * @which_clock: clockid | ||
396 | * @tp: timespec to fill. | ||
397 | * | ||
398 | * Provides the underlying alarm base time. | ||
399 | */ | ||
400 | static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | ||
401 | { | ||
402 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | ||
403 | |||
404 | if (!alarmtimer_get_rtcdev()) | ||
405 | return -ENOTSUPP; | ||
406 | |||
407 | *tp = ktime_to_timespec(base->gettime()); | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | /** | ||
412 | * alarm_timer_create - posix timer_create interface | ||
413 | * @new_timer: k_itimer pointer to manage | ||
414 | * | ||
415 | * Initializes the k_itimer structure. | ||
416 | */ | ||
417 | static int alarm_timer_create(struct k_itimer *new_timer) | ||
418 | { | ||
419 | enum alarmtimer_type type; | ||
420 | struct alarm_base *base; | ||
421 | |||
422 | if (!alarmtimer_get_rtcdev()) | ||
423 | return -ENOTSUPP; | ||
424 | |||
425 | if (!capable(CAP_WAKE_ALARM)) | ||
426 | return -EPERM; | ||
427 | |||
428 | type = clock2alarm(new_timer->it_clock); | ||
429 | base = &alarm_bases[type]; | ||
430 | alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); | ||
431 | return 0; | ||
432 | } | ||
433 | |||
434 | /** | ||
435 | * alarm_timer_get - posix timer_get interface | ||
436 | * @timr: k_itimer pointer | ||
437 | * @cur_setting: itimerspec data to fill | ||
438 | * | ||
439 | * Copies the itimerspec data out from the k_itimer | ||
440 | */ | ||
441 | static void alarm_timer_get(struct k_itimer *timr, | ||
442 | struct itimerspec *cur_setting) | ||
443 | { | ||
444 | cur_setting->it_interval = | ||
445 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
446 | cur_setting->it_value = | ||
447 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
448 | return; | ||
449 | } | ||
450 | |||
451 | /** | ||
452 | * alarm_timer_del - posix timer_del interface | ||
453 | * @timr: k_itimer pointer to be deleted | ||
454 | * | ||
455 | * Cancels any programmed alarms for the given timer. | ||
456 | */ | ||
457 | static int alarm_timer_del(struct k_itimer *timr) | ||
458 | { | ||
459 | if (!rtcdev) | ||
460 | return -ENOTSUPP; | ||
461 | |||
462 | alarm_cancel(&timr->it.alarmtimer); | ||
463 | return 0; | ||
464 | } | ||
465 | |||
466 | /** | ||
467 | * alarm_timer_set - posix timer_set interface | ||
468 | * @timr: k_itimer pointer to be set | ||
469 | * @flags: timer flags | ||
470 | * @new_setting: itimerspec to be used | ||
471 | * @old_setting: itimerspec being replaced | ||
472 | * | ||
473 | * Sets the timer to new_setting, and starts the timer. | ||
474 | */ | ||
475 | static int alarm_timer_set(struct k_itimer *timr, int flags, | ||
476 | struct itimerspec *new_setting, | ||
477 | struct itimerspec *old_setting) | ||
478 | { | ||
479 | if (!rtcdev) | ||
480 | return -ENOTSUPP; | ||
481 | |||
482 | /* Save old values */ | ||
483 | old_setting->it_interval = | ||
484 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
485 | old_setting->it_value = | ||
486 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
487 | |||
488 | /* If the timer was already set, cancel it */ | ||
489 | alarm_cancel(&timr->it.alarmtimer); | ||
490 | |||
491 | /* start the timer */ | ||
492 | alarm_start(&timr->it.alarmtimer, | ||
493 | timespec_to_ktime(new_setting->it_value), | ||
494 | timespec_to_ktime(new_setting->it_interval)); | ||
495 | return 0; | ||
496 | } | ||
497 | |||
498 | /** | ||
499 | * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep | ||
500 | * @alarm: ptr to alarm that fired | ||
501 | * | ||
502 | * Wakes up the task that set the alarmtimer | ||
503 | */ | ||
504 | static void alarmtimer_nsleep_wakeup(struct alarm *alarm) | ||
505 | { | ||
506 | struct task_struct *task = (struct task_struct *)alarm->data; | ||
507 | |||
508 | alarm->data = NULL; | ||
509 | if (task) | ||
510 | wake_up_process(task); | ||
511 | } | ||
512 | |||
513 | /** | ||
514 | * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation | ||
515 | * @alarm: ptr to alarmtimer | ||
516 | * @absexp: absolute expiration time | ||
517 | * | ||
518 | * Sets the alarm timer and sleeps until it is fired or interrupted. | ||
519 | */ | ||
520 | static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | ||
521 | { | ||
522 | alarm->data = (void *)current; | ||
523 | do { | ||
524 | set_current_state(TASK_INTERRUPTIBLE); | ||
525 | alarm_start(alarm, absexp, ktime_set(0, 0)); | ||
526 | if (likely(alarm->data)) | ||
527 | schedule(); | ||
528 | |||
529 | alarm_cancel(alarm); | ||
530 | } while (alarm->data && !signal_pending(current)); | ||
531 | |||
532 | __set_current_state(TASK_RUNNING); | ||
533 | |||
534 | return (alarm->data == NULL); | ||
535 | } | ||
536 | |||
537 | |||
538 | /** | ||
539 | * update_rmtp - Update remaining timespec value | ||
540 | * @exp: expiration time | ||
541 | * @type: timer type | ||
542 | * @rmtp: user pointer to remaining timespec value | ||
543 | * | ||
544 | * Helper function that fills in rmtp value with time between | ||
545 | * now and the exp value | ||
546 | */ | ||
547 | static int update_rmtp(ktime_t exp, enum alarmtimer_type type, | ||
548 | struct timespec __user *rmtp) | ||
549 | { | ||
550 | struct timespec rmt; | ||
551 | ktime_t rem; | ||
552 | |||
553 | rem = ktime_sub(exp, alarm_bases[type].gettime()); | ||
554 | |||
555 | if (rem.tv64 <= 0) | ||
556 | return 0; | ||
557 | rmt = ktime_to_timespec(rem); | ||
558 | |||
559 | if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) | ||
560 | return -EFAULT; | ||
561 | |||
562 | return 1; | ||
563 | |||
564 | } | ||
565 | |||
566 | /** | ||
567 | * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep | ||
568 | * @restart: ptr to restart block | ||
569 | * | ||
570 | * Handles restarted clock_nanosleep calls | ||
571 | */ | ||
572 | static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) | ||
573 | { | ||
574 | enum alarmtimer_type type = restart->nanosleep.clockid; | ||
575 | ktime_t exp; | ||
576 | struct timespec __user *rmtp; | ||
577 | struct alarm alarm; | ||
578 | int ret = 0; | ||
579 | |||
580 | exp.tv64 = restart->nanosleep.expires; | ||
581 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
582 | |||
583 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
584 | goto out; | ||
585 | |||
586 | if (freezing(current)) | ||
587 | alarmtimer_freezerset(exp, type); | ||
588 | |||
589 | rmtp = restart->nanosleep.rmtp; | ||
590 | if (rmtp) { | ||
591 | ret = update_rmtp(exp, type, rmtp); | ||
592 | if (ret <= 0) | ||
593 | goto out; | ||
594 | } | ||
595 | |||
596 | |||
597 | /* The other values in restart are already filled in */ | ||
598 | ret = -ERESTART_RESTARTBLOCK; | ||
599 | out: | ||
600 | return ret; | ||
601 | } | ||
602 | |||
603 | /** | ||
604 | * alarm_timer_nsleep - alarmtimer nanosleep | ||
605 | * @which_clock: clockid | ||
606 | * @flags: determines abstime or relative | ||
607 | * @tsreq: requested sleep time (abs or rel) | ||
608 | * @rmtp: remaining sleep time saved | ||
609 | * | ||
610 | * Handles clock_nanosleep calls against _ALARM clockids | ||
611 | */ | ||
612 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | ||
613 | struct timespec *tsreq, struct timespec __user *rmtp) | ||
614 | { | ||
615 | enum alarmtimer_type type = clock2alarm(which_clock); | ||
616 | struct alarm alarm; | ||
617 | ktime_t exp; | ||
618 | int ret = 0; | ||
619 | struct restart_block *restart; | ||
620 | |||
621 | if (!alarmtimer_get_rtcdev()) | ||
622 | return -ENOTSUPP; | ||
623 | |||
624 | if (!capable(CAP_WAKE_ALARM)) | ||
625 | return -EPERM; | ||
626 | |||
627 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
628 | |||
629 | exp = timespec_to_ktime(*tsreq); | ||
630 | /* Convert (if necessary) to absolute time */ | ||
631 | if (flags != TIMER_ABSTIME) { | ||
632 | ktime_t now = alarm_bases[type].gettime(); | ||
633 | exp = ktime_add(now, exp); | ||
634 | } | ||
635 | |||
636 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
637 | goto out; | ||
638 | |||
639 | if (freezing(current)) | ||
640 | alarmtimer_freezerset(exp, type); | ||
641 | |||
642 | /* abs timers don't set remaining time or restart */ | ||
643 | if (flags == TIMER_ABSTIME) { | ||
644 | ret = -ERESTARTNOHAND; | ||
645 | goto out; | ||
646 | } | ||
647 | |||
648 | if (rmtp) { | ||
649 | ret = update_rmtp(exp, type, rmtp); | ||
650 | if (ret <= 0) | ||
651 | goto out; | ||
652 | } | ||
653 | |||
654 | restart = ¤t_thread_info()->restart_block; | ||
655 | restart->fn = alarm_timer_nsleep_restart; | ||
656 | restart->nanosleep.clockid = type; | ||
657 | restart->nanosleep.expires = exp.tv64; | ||
658 | restart->nanosleep.rmtp = rmtp; | ||
659 | ret = -ERESTART_RESTARTBLOCK; | ||
660 | |||
661 | out: | ||
662 | return ret; | ||
663 | } | ||
664 | |||
665 | |||
666 | /* Suspend hook structures */ | ||
667 | static const struct dev_pm_ops alarmtimer_pm_ops = { | ||
668 | .suspend = alarmtimer_suspend, | ||
669 | }; | ||
670 | |||
671 | static struct platform_driver alarmtimer_driver = { | ||
672 | .driver = { | ||
673 | .name = "alarmtimer", | ||
674 | .pm = &alarmtimer_pm_ops, | ||
675 | } | ||
676 | }; | ||
677 | |||
678 | /** | ||
679 | * alarmtimer_init - Initialize alarm timer code | ||
680 | * | ||
681 | * This function initializes the alarm bases and registers | ||
682 | * the posix clock ids. | ||
683 | */ | ||
684 | static int __init alarmtimer_init(void) | ||
685 | { | ||
686 | int error = 0; | ||
687 | int i; | ||
688 | struct k_clock alarm_clock = { | ||
689 | .clock_getres = alarm_clock_getres, | ||
690 | .clock_get = alarm_clock_get, | ||
691 | .timer_create = alarm_timer_create, | ||
692 | .timer_set = alarm_timer_set, | ||
693 | .timer_del = alarm_timer_del, | ||
694 | .timer_get = alarm_timer_get, | ||
695 | .nsleep = alarm_timer_nsleep, | ||
696 | }; | ||
697 | |||
698 | posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); | ||
699 | posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); | ||
700 | |||
701 | /* Initialize alarm bases */ | ||
702 | alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; | ||
703 | alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; | ||
704 | alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; | ||
705 | alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; | ||
706 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
707 | timerqueue_init_head(&alarm_bases[i].timerqueue); | ||
708 | spin_lock_init(&alarm_bases[i].lock); | ||
709 | hrtimer_init(&alarm_bases[i].timer, | ||
710 | alarm_bases[i].base_clockid, | ||
711 | HRTIMER_MODE_ABS); | ||
712 | alarm_bases[i].timer.function = alarmtimer_fired; | ||
713 | } | ||
714 | error = platform_driver_register(&alarmtimer_driver); | ||
715 | platform_device_register_simple("alarmtimer", -1, NULL, 0); | ||
716 | |||
717 | return error; | ||
718 | } | ||
719 | device_initcall(alarmtimer_init); | ||
720 | |||
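For context, a hedged userspace sketch of arming one of the newly registered clocks through the posix timer API; it needs CAP_WAKE_ALARM, and the CLOCK_BOOTTIME_ALARM constant is assumed to match the corresponding uapi header update:

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    #ifndef CLOCK_BOOTTIME_ALARM
    #define CLOCK_BOOTTIME_ALARM 9  /* assumed to match the uapi value */
    #endif

    int main(void)
    {
            timer_t tid;
            struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
                                    .sigev_signo  = SIGALRM };
            struct itimerspec its = { .it_value = { .tv_sec = 30 } };

            /* Fires in 30 seconds, waking the box from suspend via the
             * RTC if it is asleep at that point. The default SIGALRM
             * disposition then terminates the process. */
            if (timer_create(CLOCK_BOOTTIME_ALARM, &sev, &tid) != 0) {
                    perror("timer_create");
                    return 1;
            }
            if (timer_settime(tid, 0, &its, NULL) != 0)
                    perror("timer_settime");
            pause();
            return 0;
    }

Link with -lrt on older glibc versions.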
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0d74b9ba90c8..e4c699dfa4e8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
182 | unsigned long flags; | 182 | unsigned long flags; |
183 | 183 | ||
184 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 184 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); |
185 | BUG_ON(!dev->cpumask); | 185 | if (!dev->cpumask) { |
186 | WARN_ON(num_possible_cpus() > 1); | ||
187 | dev->cpumask = cpumask_of(smp_processor_id()); | ||
188 | } | ||
186 | 189 | ||
187 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 190 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
188 | 191 | ||
@@ -194,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
194 | } | 197 | } |
195 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 198 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
196 | 199 | ||
200 | static void clockevents_config(struct clock_event_device *dev, | ||
201 | u32 freq) | ||
202 | { | ||
203 | u64 sec; | ||
204 | |||
205 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
206 | return; | ||
207 | |||
208 | /* | ||
209 | * Calculate the maximum number of seconds we can sleep. Limit | ||
210 | * to 10 minutes for hardware which can program more than | ||
211 | * 32bit ticks so we still get reasonable conversion values. | ||
212 | */ | ||
213 | sec = dev->max_delta_ticks; | ||
214 | do_div(sec, freq); | ||
215 | if (!sec) | ||
216 | sec = 1; | ||
217 | else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) | ||
218 | sec = 600; | ||
219 | |||
220 | clockevents_calc_mult_shift(dev, freq, sec); | ||
221 | dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); | ||
222 | dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); | ||
223 | } | ||
224 | |||
225 | /** | ||
226 | * clockevents_config_and_register - Configure and register a clock event device | ||
227 | * @dev: device to register | ||
228 | * @freq: The clock frequency | ||
229 | * @min_delta: The minimum clock ticks to program in oneshot mode | ||
230 | * @max_delta: The maximum clock ticks to program in oneshot mode | ||
231 | * | ||
232 | * min/max_delta can be 0 for devices which do not support oneshot mode. | ||
233 | */ | ||
234 | void clockevents_config_and_register(struct clock_event_device *dev, | ||
235 | u32 freq, unsigned long min_delta, | ||
236 | unsigned long max_delta) | ||
237 | { | ||
238 | dev->min_delta_ticks = min_delta; | ||
239 | dev->max_delta_ticks = max_delta; | ||
240 | clockevents_config(dev, freq); | ||
241 | clockevents_register_device(dev); | ||
242 | } | ||
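A hypothetical clock event driver adopting the new helper; the device name, 24 MHz frequency, programmable tick range and callback bodies are invented for illustration:

    static void example_set_mode(enum clock_event_mode mode,
                                 struct clock_event_device *evt)
    {
            /* program the hypothetical hardware for @mode */
    }

    static int example_set_next_event(unsigned long ticks,
                                      struct clock_event_device *evt)
    {
            /* program a one-shot expiry @ticks from now */
            return 0;
    }

    static struct clock_event_device example_ced = {
            .name           = "example-timer",
            .features       = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
            .rating         = 300,
            .set_mode       = example_set_mode,
            .set_next_event = example_set_next_event,
    };

    static void __init example_timer_init(void)
    {
            example_ced.cpumask = cpumask_of(0);
            /* mult/shift and min/max_delta_ns are now derived internally
             * from the frequency and the tick limits. */
            clockevents_config_and_register(&example_ced, 24000000,
                                            0xf, 0x7fffffff);
    }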
243 | |||
244 | /** | ||
245 | * clockevents_update_freq - Update frequency and reprogram a clock event device. | ||
246 | * @dev: device to modify | ||
247 | * @freq: new device frequency | ||
248 | * | ||
249 | * Reconfigure and reprogram a clock event device in oneshot | ||
250 | * mode. Must be called on the cpu for which the device delivers per | ||
251 | * cpu timer events with interrupts disabled! Returns 0 on success, | ||
252 | * -ETIME when the event is in the past. | ||
253 | */ | ||
254 | int clockevents_update_freq(struct clock_event_device *dev, u32 freq) | ||
255 | { | ||
256 | clockevents_config(dev, freq); | ||
257 | |||
258 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | ||
259 | return 0; | ||
260 | |||
261 | return clockevents_program_event(dev, dev->next_event, ktime_get()); | ||
262 | } | ||
263 | |||
197 | /* | 264 | /* |
198 | * Noop handler when we shut down an event device | 265 | * Noop handler when we shut down an event device |
199 | */ | 266 | */ |
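The two helpers added above let a clock event driver hand the raw counter limits to the core instead of precomputing mult/shift and the nanosecond deltas itself. The sketch below is not part of this patch: all foo_* names are invented, the clock_event_device fields are the usual ones of this era, and only the clockevents_config_and_register() signature introduced here is relied on.

/* Hypothetical timer driver, illustrative only. */
static int foo_set_next_event(unsigned long cycles,
			      struct clock_event_device *evt)
{
	/* program the (imaginary) hardware comparator */
	return 0;
}

static void foo_set_mode(enum clock_event_mode mode,
			 struct clock_event_device *evt)
{
	/* switch the (imaginary) hardware between periodic and oneshot */
}

static struct clock_event_device foo_clockevent = {
	.name		= "foo-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_next_event	= foo_set_next_event,
	.set_mode	= foo_set_mode,
};

static void __init foo_timer_init(u32 timer_hz)
{
	foo_clockevent.cpumask = cpumask_of(smp_processor_id());
	/*
	 * One call replaces the open-coded clockevents_calc_mult_shift()
	 * and clockevent_delta2ns() sequence; 0xf and 0x7fffffff are the
	 * assumed min/max programmable ticks of the imaginary counter.
	 */
	clockevents_config_and_register(&foo_clockevent, timer_hz,
					0xf, 0x7fffffff);
}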
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0e17c10f8a9d..e0980f0d9a0a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -185,7 +185,6 @@ static struct clocksource *watchdog; | |||
185 | static struct timer_list watchdog_timer; | 185 | static struct timer_list watchdog_timer; |
186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); | 186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); |
187 | static DEFINE_SPINLOCK(watchdog_lock); | 187 | static DEFINE_SPINLOCK(watchdog_lock); |
188 | static cycle_t watchdog_last; | ||
189 | static int watchdog_running; | 188 | static int watchdog_running; |
190 | 189 | ||
191 | static int clocksource_watchdog_kthread(void *data); | 190 | static int clocksource_watchdog_kthread(void *data); |
@@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data) | |||
254 | if (!watchdog_running) | 253 | if (!watchdog_running) |
255 | goto out; | 254 | goto out; |
256 | 255 | ||
257 | wdnow = watchdog->read(watchdog); | ||
258 | wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, | ||
259 | watchdog->mult, watchdog->shift); | ||
260 | watchdog_last = wdnow; | ||
261 | |||
262 | list_for_each_entry(cs, &watchdog_list, wd_list) { | 256 | list_for_each_entry(cs, &watchdog_list, wd_list) { |
263 | 257 | ||
264 | /* Clocksource already marked unstable? */ | 258 | /* Clocksource already marked unstable? */ |
@@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data) | |||
268 | continue; | 262 | continue; |
269 | } | 263 | } |
270 | 264 | ||
265 | local_irq_disable(); | ||
271 | csnow = cs->read(cs); | 266 | csnow = cs->read(cs); |
267 | wdnow = watchdog->read(watchdog); | ||
268 | local_irq_enable(); | ||
272 | 269 | ||
273 | /* Clocksource initialized ? */ | 270 | /* Clocksource initialized ? */ |
274 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { | 271 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { |
275 | cs->flags |= CLOCK_SOURCE_WATCHDOG; | 272 | cs->flags |= CLOCK_SOURCE_WATCHDOG; |
276 | cs->wd_last = csnow; | 273 | cs->wd_last = wdnow; |
274 | cs->cs_last = csnow; | ||
277 | continue; | 275 | continue; |
278 | } | 276 | } |
279 | 277 | ||
280 | /* Check the deviation from the watchdog clocksource. */ | 278 | wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, |
281 | cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & | 279 | watchdog->mult, watchdog->shift); |
280 | |||
281 | cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & | ||
282 | cs->mask, cs->mult, cs->shift); | 282 | cs->mask, cs->mult, cs->shift); |
283 | cs->wd_last = csnow; | 283 | cs->cs_last = csnow; |
284 | cs->wd_last = wdnow; | ||
285 | |||
286 | /* Check the deviation from the watchdog clocksource. */ | ||
284 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { | 287 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { |
285 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 288 | clocksource_unstable(cs, cs_nsec - wd_nsec); |
286 | continue; | 289 | continue; |
@@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void) | |||
318 | return; | 321 | return; |
319 | init_timer(&watchdog_timer); | 322 | init_timer(&watchdog_timer); |
320 | watchdog_timer.function = clocksource_watchdog; | 323 | watchdog_timer.function = clocksource_watchdog; |
321 | watchdog_last = watchdog->read(watchdog); | ||
322 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 324 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
323 | add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); | 325 | add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); |
324 | watchdog_running = 1; | 326 | watchdog_running = 1; |
@@ -626,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
626 | list_add(&cs->list, entry); | 628 | list_add(&cs->list, entry); |
627 | } | 629 | } |
628 | 630 | ||
629 | |||
630 | /* | ||
631 | * Maximum time we expect to go between ticks. This includes idle | ||
632 | * tickless time. It provides the trade off between selecting a | ||
633 | * mult/shift pair that is very precise but can only handle a short | ||
634 | * period of time, vs. a mult/shift pair that can handle long periods | ||
635 | * of time but isn't as precise. | ||
636 | * | ||
637 | * This is a subsystem constant, and actual hardware limitations | ||
638 | * may override it (ie: clocksources that wrap every 3 seconds). | ||
639 | */ | ||
640 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ | ||
641 | |||
642 | /** | 631 | /** |
643 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq | 632 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq |
644 | * @cs: clocksource to be registered | 633 | * @cs: clocksource to be registered |
@@ -652,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
652 | */ | 641 | */ |
653 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 642 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
654 | { | 643 | { |
644 | u64 sec; | ||
645 | |||
655 | /* | 646 | /* |
656 | * Ideally we want to use some of the limits used in | 647 | * Calc the maximum number of seconds which we can run before |
657 | * clocksource_max_deferment, to provide a more informed | 648 | * wrapping around. For clocksources which have a mask > 32bit |
658 | * MAX_UPDATE_LENGTH. But for now this just gets the | 649 | * we need to limit the max sleep time to have a good |
659 | * register interface working properly. | 650 | * conversion precision. 10 minutes is still a reasonable |
651 | * amount. That results in a shift value of 24 for a | ||
652 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
653 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
654 | * margin as we do in clocksource_max_deferment() | ||
660 | */ | 655 | */ |
656 | sec = (cs->mask - (cs->mask >> 5)); | ||
657 | do_div(sec, freq); | ||
658 | do_div(sec, scale); | ||
659 | if (!sec) | ||
660 | sec = 1; | ||
661 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
662 | sec = 600; | ||
663 | |||
661 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 664 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
662 | NSEC_PER_SEC/scale, | 665 | NSEC_PER_SEC / scale, sec * scale); |
663 | MAX_UPDATE_LENGTH*scale); | ||
664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 666 | cs->max_idle_ns = clocksource_max_deferment(cs); |
665 | } | 667 | } |
666 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 668 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
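The arithmetic above replaces the fixed MAX_UPDATE_LENGTH with a per-clocksource limit. The stand-alone program below is not from the patch; it simply reproduces that clamp in user space so the effect of the wrap margin and the 600 second cap can be checked for a given mask and frequency (UINT32_MAX standing in for the kernel's UINT_MAX).

#include <stdio.h>
#include <stdint.h>

/*
 * User-space model of the clamp in __clocksource_updatefreq_scale():
 * stay a margin below the counter wrap and never exceed 600 s for
 * counters wider than 32 bits, so the computed mult/shift stays precise.
 */
static uint64_t max_seconds(uint64_t mask, uint32_t freq, uint32_t scale)
{
	uint64_t sec = mask - (mask >> 5);	/* margin below full wrap */

	sec /= freq;
	sec /= scale;
	if (!sec)
		sec = 1;
	else if (sec > 600 && mask > UINT32_MAX)
		sec = 600;
	return sec;
}

int main(void)
{
	/* 32-bit counter at 24 MHz: limited by the wrap, not the clamp. */
	printf("32-bit @ 24 MHz : %llu s\n",
	       (unsigned long long)max_seconds(0xffffffffULL, 24000000, 1));
	/* 56-bit counter at 4 GHz: hits the 600 s cap. */
	printf("56-bit @ 4 GHz  : %llu s\n",
	       (unsigned long long)max_seconds((1ULL << 56) - 1, 4000000000U, 1));
	return 0;
}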
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 723c7637e55a..c7218d132738 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
456 | unsigned long flags; | 456 | unsigned long flags; |
457 | int cpu; | 457 | int cpu; |
458 | 458 | ||
459 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
460 | |||
461 | /* | 459 | /* |
462 | * Periodic mode does not care about the enter/exit of power | 460 | * Periodic mode does not care about the enter/exit of power |
463 | * states | 461 | * states |
464 | */ | 462 | */ |
465 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | 463 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) |
466 | goto out; | 464 | return; |
467 | 465 | ||
468 | bc = tick_broadcast_device.evtdev; | 466 | /* |
467 | * We are called with preemption disabled from the depth of the | ||
468 | * idle code, so we can't be moved away. | ||
469 | */ | ||
469 | cpu = smp_processor_id(); | 470 | cpu = smp_processor_id(); |
470 | td = &per_cpu(tick_cpu_device, cpu); | 471 | td = &per_cpu(tick_cpu_device, cpu); |
471 | dev = td->evtdev; | 472 | dev = td->evtdev; |
472 | 473 | ||
473 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 474 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
474 | goto out; | 475 | return; |
476 | |||
477 | bc = tick_broadcast_device.evtdev; | ||
475 | 478 | ||
479 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
476 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | 480 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { |
477 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { | 481 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { |
478 | cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); | 482 | cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); |
@@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
489 | tick_program_event(dev->next_event, 1); | 493 | tick_program_event(dev->next_event, 1); |
490 | } | 494 | } |
491 | } | 495 | } |
492 | |||
493 | out: | ||
494 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 496 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
495 | } | 497 | } |
496 | 498 | ||
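The reordering above moves the tick_broadcast_lock acquisition past the checks that only depend on per-cpu state, so CPUs whose device does not stop in deep C-states never touch the shared lock. A generic user-space sketch of the same lock-narrowing pattern, with invented names and a pthread mutex standing in for the raw spinlock:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t broadcast_lock = PTHREAD_MUTEX_INITIALIZER;
static int broadcast_users;

/* Per-thread state that cannot change under us, like the C3STOP flag. */
static __thread bool needs_broadcast;

static void enter_idle(void)
{
	if (!needs_broadcast)
		return;			/* fast path: no shared lock */

	pthread_mutex_lock(&broadcast_lock);
	broadcast_users++;		/* only this needs serialization */
	pthread_mutex_unlock(&broadcast_lock);
}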
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ad5d576755e..342408cf68dd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -596,6 +596,58 @@ void __init timekeeping_init(void) | |||
596 | static struct timespec timekeeping_suspend_time; | 596 | static struct timespec timekeeping_suspend_time; |
597 | 597 | ||
598 | /** | 598 | /** |
599 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | ||
600 | * @delta: pointer to a timespec delta value | ||
601 | * | ||
602 | * Takes a timespec offset measuring a suspend interval and properly | ||
603 | * adds the sleep offset to the timekeeping variables. | ||
604 | */ | ||
605 | static void __timekeeping_inject_sleeptime(struct timespec *delta) | ||
606 | { | ||
607 | xtime = timespec_add(xtime, *delta); | ||
608 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); | ||
609 | total_sleep_time = timespec_add(total_sleep_time, *delta); | ||
610 | } | ||
611 | |||
612 | |||
613 | /** | ||
614 | * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values | ||
615 | * @delta: pointer to a timespec delta value | ||
616 | * | ||
617 | * This hook is for architectures that cannot support read_persistent_clock | ||
618 | * because their RTC/persistent clock is only accessible when irqs are enabled. | ||
619 | * | ||
620 | * This function should only be called by rtc_resume(), and allows | ||
621 | * a suspend offset to be injected into the timekeeping values. | ||
622 | */ | ||
623 | void timekeeping_inject_sleeptime(struct timespec *delta) | ||
624 | { | ||
625 | unsigned long flags; | ||
626 | struct timespec ts; | ||
627 | |||
628 | /* Make sure we don't set the clock twice */ | ||
629 | read_persistent_clock(&ts); | ||
630 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | ||
631 | return; | ||
632 | |||
633 | write_seqlock_irqsave(&xtime_lock, flags); | ||
634 | timekeeping_forward_now(); | ||
635 | |||
636 | __timekeeping_inject_sleeptime(delta); | ||
637 | |||
638 | timekeeper.ntp_error = 0; | ||
639 | ntp_clear(); | ||
640 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
641 | timekeeper.mult); | ||
642 | |||
643 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
644 | |||
645 | /* signal hrtimers about time change */ | ||
646 | clock_was_set(); | ||
647 | } | ||
648 | |||
649 | |||
650 | /** | ||
599 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 651 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
600 | * | 652 | * |
601 | * This is for the generic clocksource timekeeping. | 653 | * This is for the generic clocksource timekeeping. |
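timekeeping_inject_sleeptime() above is meant to be driven from the RTC resume path on architectures without a usable read_persistent_clock(). The fragment below is only a sketch of such a caller, with invented foo_* helpers; it is not the rtc_resume() implementation.

/* Hypothetical RTC driver resume path, illustrative only. */
static struct timespec foo_rtc_suspend_time;	/* recorded at suspend */

static void foo_rtc_resume(struct rtc_device *rtc)
{
	struct timespec now, sleep_time;

	foo_rtc_read_time(rtc, &now);		/* invented helper */
	if (timespec_compare(&now, &foo_rtc_suspend_time) <= 0)
		return;				/* nothing sane to add */

	sleep_time = timespec_sub(now, foo_rtc_suspend_time);
	/* Accounts the interval in xtime, wall_to_monotonic, total_sleep_time. */
	timekeeping_inject_sleeptime(&sleep_time);
}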
@@ -615,9 +667,7 @@ static void timekeeping_resume(void) | |||
615 | 667 | ||
616 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 668 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
617 | ts = timespec_sub(ts, timekeeping_suspend_time); | 669 | ts = timespec_sub(ts, timekeeping_suspend_time); |
618 | xtime = timespec_add(xtime, ts); | 670 | __timekeeping_inject_sleeptime(&ts); |
619 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); | ||
620 | total_sleep_time = timespec_add(total_sleep_time, ts); | ||
621 | } | 671 | } |
622 | /* re-base the last cycle value */ | 672 | /* re-base the last cycle value */ |
623 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 673 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
@@ -630,7 +680,7 @@ static void timekeeping_resume(void) | |||
630 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 680 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); |
631 | 681 | ||
632 | /* Resume hrtimers */ | 682 | /* Resume hrtimers */ |
633 | hres_timers_resume(); | 683 | hrtimers_resume(); |
634 | } | 684 | } |
635 | 685 | ||
636 | static int timekeeping_suspend(void) | 686 | static int timekeeping_suspend(void) |
@@ -1049,6 +1099,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1049 | } | 1099 | } |
1050 | 1100 | ||
1051 | /** | 1101 | /** |
1102 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | ||
1103 | */ | ||
1104 | ktime_t ktime_get_monotonic_offset(void) | ||
1105 | { | ||
1106 | unsigned long seq; | ||
1107 | struct timespec wtom; | ||
1108 | |||
1109 | do { | ||
1110 | seq = read_seqbegin(&xtime_lock); | ||
1111 | wtom = wall_to_monotonic; | ||
1112 | } while (read_seqretry(&xtime_lock, seq)); | ||
1113 | return timespec_to_ktime(wtom); | ||
1114 | } | ||
1115 | |||
1116 | /** | ||
1052 | * xtime_update() - advances the timekeeping infrastructure | 1117 | * xtime_update() - advances the timekeeping infrastructure |
1053 | * @ticks: number of ticks, that have elapsed since the last call. | 1118 | * @ticks: number of ticks, that have elapsed since the last call. |
1054 | * | 1119 | * |
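ktime_get_monotonic_offset() added above copies wall_to_monotonic inside a read_seqbegin()/read_seqretry() loop so it never returns a half-updated value. A compact user-space model of that retry scheme follows; C11 atomics stand in for the kernel seqlock, memory-ordering fine points are ignored, and none of it comes from the patch.

#include <stdatomic.h>
#include <stdint.h>

struct seq_time {
	atomic_uint seq;	/* even: stable, odd: writer in progress */
	int64_t sec;
	int64_t nsec;
};

static void seq_time_read(struct seq_time *t, int64_t *sec, int64_t *nsec)
{
	unsigned int start;

	do {
		while ((start = atomic_load(&t->seq)) & 1)
			;			/* writer active, retry */
		*sec = t->sec;
		*nsec = t->nsec;
	} while (atomic_load(&t->seq) != start);	/* changed? redo copy */
}

static void seq_time_write(struct seq_time *t, int64_t sec, int64_t nsec)
{
	atomic_fetch_add(&t->seq, 1);		/* mark write in progress */
	t->sec = sec;
	t->nsec = nsec;
	atomic_fetch_add(&t->seq, 1);		/* publish */
}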
diff --git a/kernel/timer.c b/kernel/timer.c index fd6198692b57..8cff36119e4d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
749 | unsigned long expires_limit, mask; | 749 | unsigned long expires_limit, mask; |
750 | int bit; | 750 | int bit; |
751 | 751 | ||
752 | expires_limit = expires; | ||
753 | |||
754 | if (timer->slack >= 0) { | 752 | if (timer->slack >= 0) { |
755 | expires_limit = expires + timer->slack; | 753 | expires_limit = expires + timer->slack; |
756 | } else { | 754 | } else { |
757 | unsigned long now = jiffies; | 755 | long delta = expires - jiffies; |
756 | |||
757 | if (delta < 256) | ||
758 | return expires; | ||
758 | 759 | ||
759 | /* No slack, if already expired else auto slack 0.4% */ | 760 | expires_limit = expires + delta / 256; |
760 | if (time_after(expires, now)) | ||
761 | expires_limit = expires + (expires - now)/256; | ||
762 | } | 761 | } |
763 | mask = expires ^ expires_limit; | 762 | mask = expires ^ expires_limit; |
764 | if (mask == 0) | 763 | if (mask == 0) |
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
795 | */ | 794 | */ |
796 | int mod_timer(struct timer_list *timer, unsigned long expires) | 795 | int mod_timer(struct timer_list *timer, unsigned long expires) |
797 | { | 796 | { |
797 | expires = apply_slack(timer, expires); | ||
798 | |||
798 | /* | 799 | /* |
799 | * This is a common optimization triggered by the | 800 | * This is a common optimization triggered by the |
800 | * networking code - if the timer is re-modified | 801 | * networking code - if the timer is re-modified |
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
803 | if (timer_pending(timer) && timer->expires == expires) | 804 | if (timer_pending(timer) && timer->expires == expires) |
804 | return 1; | 805 | return 1; |
805 | 806 | ||
806 | expires = apply_slack(timer, expires); | ||
807 | |||
808 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); | 807 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); |
809 | } | 808 | } |
810 | EXPORT_SYMBOL(mod_timer); | 809 | EXPORT_SYMBOL(mod_timer); |
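With the hunks above, mod_timer() applies the slack before the "already pending with the same expiry" shortcut, and the automatic slack is simply delta/256 (about 0.4%) once the timer is at least 256 jiffies away. The user-space model below reproduces that arithmetic; the final bit-rounding step is only hinted at by the context lines, so treat it as an approximation of the real helper rather than a copy of it.

#include <stdio.h>

static unsigned long model_apply_slack(unsigned long expires,
				       unsigned long now)
{
	unsigned long expires_limit, mask;
	long delta = expires - now;
	int bit;

	if (delta < 256)
		return expires;		/* too close: no slack at all */

	expires_limit = expires + delta / 256;

	mask = expires ^ expires_limit;
	if (mask == 0)
		return expires;

	/* Round down below the highest bit that the slack changed. */
	for (bit = 0; mask >> (bit + 1); bit++)
		;
	return expires_limit & ~((1UL << bit) - 1);
}

int main(void)
{
	unsigned long now = 100000;

	printf("expires=101000 -> %lu\n", model_apply_slack(101000, now));
	printf("expires=200000 -> %lu\n", model_apply_slack(200000, now));
	return 0;
}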
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee24fa1935ac..908038f57440 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -39,20 +39,26 @@ | |||
39 | #include "trace_stat.h" | 39 | #include "trace_stat.h" |
40 | 40 | ||
41 | #define FTRACE_WARN_ON(cond) \ | 41 | #define FTRACE_WARN_ON(cond) \ |
42 | do { \ | 42 | ({ \ |
43 | if (WARN_ON(cond)) \ | 43 | int ___r = cond; \ |
44 | if (WARN_ON(___r)) \ | ||
44 | ftrace_kill(); \ | 45 | ftrace_kill(); \ |
45 | } while (0) | 46 | ___r; \ |
47 | }) | ||
46 | 48 | ||
47 | #define FTRACE_WARN_ON_ONCE(cond) \ | 49 | #define FTRACE_WARN_ON_ONCE(cond) \ |
48 | do { \ | 50 | ({ \ |
49 | if (WARN_ON_ONCE(cond)) \ | 51 | int ___r = cond; \ |
52 | if (WARN_ON_ONCE(___r)) \ | ||
50 | ftrace_kill(); \ | 53 | ftrace_kill(); \ |
51 | } while (0) | 54 | ___r; \ |
55 | }) | ||
52 | 56 | ||
53 | /* hash bits for specific function selection */ | 57 | /* hash bits for specific function selection */ |
54 | #define FTRACE_HASH_BITS 7 | 58 | #define FTRACE_HASH_BITS 7 |
55 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) | 59 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) |
60 | #define FTRACE_HASH_DEFAULT_BITS 10 | ||
61 | #define FTRACE_HASH_MAX_BITS 12 | ||
56 | 62 | ||
57 | /* ftrace_enabled is a method to turn ftrace on or off */ | 63 | /* ftrace_enabled is a method to turn ftrace on or off */ |
58 | int ftrace_enabled __read_mostly; | 64 | int ftrace_enabled __read_mostly; |
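FTRACE_WARN_ON() and FTRACE_WARN_ON_ONCE() are rewritten above as GNU C statement expressions so that, like WARN_ON() itself, they evaluate to the tested condition and can sit directly inside an if(). A user-space model of the same trick, with an invented CHECK_AND_REPORT name (needs gcc or clang for the ({ }) extension):

#include <stdio.h>

#define CHECK_AND_REPORT(cond)					\
({								\
	int ___r = !!(cond);					\
	if (___r)						\
		fprintf(stderr, "check failed: %s\n", #cond);	\
	___r;	/* the macro's value: usable as an expression */\
})

static int parse_value(int v)
{
	if (CHECK_AND_REPORT(v < 0))
		return -1;
	return v * 2;
}

int main(void)
{
	printf("%d\n", parse_value(21));
	printf("%d\n", parse_value(-3));
	return 0;
}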
@@ -81,28 +87,40 @@ static struct ftrace_ops ftrace_list_end __read_mostly = | |||
81 | .func = ftrace_stub, | 87 | .func = ftrace_stub, |
82 | }; | 88 | }; |
83 | 89 | ||
84 | static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; | 90 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
91 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | ||
85 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 92 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
86 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | 93 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; |
87 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 94 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
95 | static struct ftrace_ops global_ops; | ||
96 | |||
97 | static void | ||
98 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | ||
88 | 99 | ||
89 | /* | 100 | /* |
90 | * Traverse the ftrace_list, invoking all entries. The reason that we | 101 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
91 | * can use rcu_dereference_raw() is that elements removed from this list | 102 | * can use rcu_dereference_raw() is that elements removed from this list |
92 | * are simply leaked, so there is no need to interact with a grace-period | 103 | * are simply leaked, so there is no need to interact with a grace-period |
93 | * mechanism. The rcu_dereference_raw() calls are needed to handle | 104 | * mechanism. The rcu_dereference_raw() calls are needed to handle |
94 | * concurrent insertions into the ftrace_list. | 105 | * concurrent insertions into the ftrace_global_list. |
95 | * | 106 | * |
96 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 107 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
97 | */ | 108 | */ |
98 | static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) | 109 | static void ftrace_global_list_func(unsigned long ip, |
110 | unsigned long parent_ip) | ||
99 | { | 111 | { |
100 | struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ | 112 | struct ftrace_ops *op; |
101 | 113 | ||
114 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) | ||
115 | return; | ||
116 | |||
117 | trace_recursion_set(TRACE_GLOBAL_BIT); | ||
118 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ | ||
102 | while (op != &ftrace_list_end) { | 119 | while (op != &ftrace_list_end) { |
103 | op->func(ip, parent_ip); | 120 | op->func(ip, parent_ip); |
104 | op = rcu_dereference_raw(op->next); /*see above*/ | 121 | op = rcu_dereference_raw(op->next); /*see above*/ |
105 | }; | 122 | }; |
123 | trace_recursion_clear(TRACE_GLOBAL_BIT); | ||
106 | } | 124 | } |
107 | 125 | ||
108 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) | 126 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) |
@@ -147,46 +165,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) | |||
147 | } | 165 | } |
148 | #endif | 166 | #endif |
149 | 167 | ||
150 | static int __register_ftrace_function(struct ftrace_ops *ops) | 168 | static void update_global_ops(void) |
151 | { | 169 | { |
152 | ops->next = ftrace_list; | 170 | ftrace_func_t func; |
171 | |||
153 | /* | 172 | /* |
154 | * We are entering ops into the ftrace_list but another | 173 | * If there's only one function registered, then call that |
155 | * CPU might be walking that list. We need to make sure | 174 | * function directly. Otherwise, we need to iterate over the |
156 | * the ops->next pointer is valid before another CPU sees | 175 | * registered callers. |
157 | * the ops pointer included into the ftrace_list. | ||
158 | */ | 176 | */ |
159 | rcu_assign_pointer(ftrace_list, ops); | 177 | if (ftrace_global_list == &ftrace_list_end || |
178 | ftrace_global_list->next == &ftrace_list_end) | ||
179 | func = ftrace_global_list->func; | ||
180 | else | ||
181 | func = ftrace_global_list_func; | ||
160 | 182 | ||
161 | if (ftrace_enabled) { | 183 | /* If we filter on pids, update to use the pid function */ |
162 | ftrace_func_t func; | 184 | if (!list_empty(&ftrace_pids)) { |
185 | set_ftrace_pid_function(func); | ||
186 | func = ftrace_pid_func; | ||
187 | } | ||
163 | 188 | ||
164 | if (ops->next == &ftrace_list_end) | 189 | global_ops.func = func; |
165 | func = ops->func; | 190 | } |
166 | else | ||
167 | func = ftrace_list_func; | ||
168 | 191 | ||
169 | if (!list_empty(&ftrace_pids)) { | 192 | static void update_ftrace_function(void) |
170 | set_ftrace_pid_function(func); | 193 | { |
171 | func = ftrace_pid_func; | 194 | ftrace_func_t func; |
172 | } | 195 | |
196 | update_global_ops(); | ||
197 | |||
198 | /* | ||
199 | * If we are at the end of the list and this ops is | ||
200 | * not dynamic, then have the mcount trampoline call | ||
201 | * the function directly | ||
202 | */ | ||
203 | if (ftrace_ops_list == &ftrace_list_end || | ||
204 | (ftrace_ops_list->next == &ftrace_list_end && | ||
205 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) | ||
206 | func = ftrace_ops_list->func; | ||
207 | else | ||
208 | func = ftrace_ops_list_func; | ||
173 | 209 | ||
174 | /* | ||
175 | * For one func, simply call it directly. | ||
176 | * For more than one func, call the chain. | ||
177 | */ | ||
178 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 210 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
179 | ftrace_trace_function = func; | 211 | ftrace_trace_function = func; |
180 | #else | 212 | #else |
181 | __ftrace_trace_function = func; | 213 | __ftrace_trace_function = func; |
182 | ftrace_trace_function = ftrace_test_stop_func; | 214 | ftrace_trace_function = ftrace_test_stop_func; |
183 | #endif | 215 | #endif |
184 | } | 216 | } |
185 | 217 | ||
186 | return 0; | 218 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
219 | { | ||
220 | ops->next = *list; | ||
221 | /* | ||
222 | * We are entering ops into the list but another | ||
223 | * CPU might be walking that list. We need to make sure | ||
224 | * the ops->next pointer is valid before another CPU sees | ||
225 | * the ops pointer included into the list. | ||
226 | */ | ||
227 | rcu_assign_pointer(*list, ops); | ||
187 | } | 228 | } |
188 | 229 | ||
189 | static int __unregister_ftrace_function(struct ftrace_ops *ops) | 230 | static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
190 | { | 231 | { |
191 | struct ftrace_ops **p; | 232 | struct ftrace_ops **p; |
192 | 233 | ||
@@ -194,13 +235,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
194 | * If we are removing the last function, then simply point | 235 | * If we are removing the last function, then simply point |
195 | * to the ftrace_stub. | 236 | * to the ftrace_stub. |
196 | */ | 237 | */ |
197 | if (ftrace_list == ops && ops->next == &ftrace_list_end) { | 238 | if (*list == ops && ops->next == &ftrace_list_end) { |
198 | ftrace_trace_function = ftrace_stub; | 239 | *list = &ftrace_list_end; |
199 | ftrace_list = &ftrace_list_end; | ||
200 | return 0; | 240 | return 0; |
201 | } | 241 | } |
202 | 242 | ||
203 | for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) | 243 | for (p = list; *p != &ftrace_list_end; p = &(*p)->next) |
204 | if (*p == ops) | 244 | if (*p == ops) |
205 | break; | 245 | break; |
206 | 246 | ||
@@ -208,53 +248,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
208 | return -1; | 248 | return -1; |
209 | 249 | ||
210 | *p = (*p)->next; | 250 | *p = (*p)->next; |
251 | return 0; | ||
252 | } | ||
211 | 253 | ||
212 | if (ftrace_enabled) { | 254 | static int __register_ftrace_function(struct ftrace_ops *ops) |
213 | /* If we only have one func left, then call that directly */ | 255 | { |
214 | if (ftrace_list->next == &ftrace_list_end) { | 256 | if (ftrace_disabled) |
215 | ftrace_func_t func = ftrace_list->func; | 257 | return -ENODEV; |
216 | 258 | ||
217 | if (!list_empty(&ftrace_pids)) { | 259 | if (FTRACE_WARN_ON(ops == &global_ops)) |
218 | set_ftrace_pid_function(func); | 260 | return -EINVAL; |
219 | func = ftrace_pid_func; | 261 | |
220 | } | 262 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) |
221 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 263 | return -EBUSY; |
222 | ftrace_trace_function = func; | 264 | |
223 | #else | 265 | if (!core_kernel_data((unsigned long)ops)) |
224 | __ftrace_trace_function = func; | 266 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
225 | #endif | 267 | |
226 | } | 268 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
227 | } | 269 | int first = ftrace_global_list == &ftrace_list_end; |
270 | add_ftrace_ops(&ftrace_global_list, ops); | ||
271 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
272 | if (first) | ||
273 | add_ftrace_ops(&ftrace_ops_list, &global_ops); | ||
274 | } else | ||
275 | add_ftrace_ops(&ftrace_ops_list, ops); | ||
276 | |||
277 | if (ftrace_enabled) | ||
278 | update_ftrace_function(); | ||
228 | 279 | ||
229 | return 0; | 280 | return 0; |
230 | } | 281 | } |
231 | 282 | ||
232 | static void ftrace_update_pid_func(void) | 283 | static int __unregister_ftrace_function(struct ftrace_ops *ops) |
233 | { | 284 | { |
234 | ftrace_func_t func; | 285 | int ret; |
235 | 286 | ||
236 | if (ftrace_trace_function == ftrace_stub) | 287 | if (ftrace_disabled) |
237 | return; | 288 | return -ENODEV; |
238 | 289 | ||
239 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 290 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
240 | func = ftrace_trace_function; | 291 | return -EBUSY; |
241 | #else | ||
242 | func = __ftrace_trace_function; | ||
243 | #endif | ||
244 | 292 | ||
245 | if (!list_empty(&ftrace_pids)) { | 293 | if (FTRACE_WARN_ON(ops == &global_ops)) |
246 | set_ftrace_pid_function(func); | 294 | return -EINVAL; |
247 | func = ftrace_pid_func; | ||
248 | } else { | ||
249 | if (func == ftrace_pid_func) | ||
250 | func = ftrace_pid_function; | ||
251 | } | ||
252 | 295 | ||
253 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 296 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
254 | ftrace_trace_function = func; | 297 | ret = remove_ftrace_ops(&ftrace_global_list, ops); |
255 | #else | 298 | if (!ret && ftrace_global_list == &ftrace_list_end) |
256 | __ftrace_trace_function = func; | 299 | ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); |
257 | #endif | 300 | if (!ret) |
301 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
302 | } else | ||
303 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); | ||
304 | |||
305 | if (ret < 0) | ||
306 | return ret; | ||
307 | |||
308 | if (ftrace_enabled) | ||
309 | update_ftrace_function(); | ||
310 | |||
311 | /* | ||
312 | * Dynamic ops may be freed, we must make sure that all | ||
313 | * callers are done before leaving this function. | ||
314 | */ | ||
315 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
316 | synchronize_sched(); | ||
317 | |||
318 | return 0; | ||
319 | } | ||
320 | |||
321 | static void ftrace_update_pid_func(void) | ||
322 | { | ||
323 | /* Only do something if we are tracing something */ | ||
324 | if (ftrace_trace_function == ftrace_stub) | ||
325 | return; | ||
326 | |||
327 | update_ftrace_function(); | ||
258 | } | 328 | } |
259 | 329 | ||
260 | #ifdef CONFIG_FUNCTION_PROFILER | 330 | #ifdef CONFIG_FUNCTION_PROFILER |
@@ -888,8 +958,35 @@ enum { | |||
888 | FTRACE_START_FUNC_RET = (1 << 3), | 958 | FTRACE_START_FUNC_RET = (1 << 3), |
889 | FTRACE_STOP_FUNC_RET = (1 << 4), | 959 | FTRACE_STOP_FUNC_RET = (1 << 4), |
890 | }; | 960 | }; |
961 | struct ftrace_func_entry { | ||
962 | struct hlist_node hlist; | ||
963 | unsigned long ip; | ||
964 | }; | ||
965 | |||
966 | struct ftrace_hash { | ||
967 | unsigned long size_bits; | ||
968 | struct hlist_head *buckets; | ||
969 | unsigned long count; | ||
970 | struct rcu_head rcu; | ||
971 | }; | ||
972 | |||
973 | /* | ||
974 | * We make these constant because no one should touch them, | ||
975 | * but they are used as the default "empty hash", to avoid allocating | ||
976 | * it all the time. These are in a read only section such that if | ||
977 | * anyone does try to modify it, it will cause an exception. | ||
978 | */ | ||
979 | static const struct hlist_head empty_buckets[1]; | ||
980 | static const struct ftrace_hash empty_hash = { | ||
981 | .buckets = (struct hlist_head *)empty_buckets, | ||
982 | }; | ||
983 | #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) | ||
891 | 984 | ||
892 | static int ftrace_filtered; | 985 | static struct ftrace_ops global_ops = { |
986 | .func = ftrace_stub, | ||
987 | .notrace_hash = EMPTY_HASH, | ||
988 | .filter_hash = EMPTY_HASH, | ||
989 | }; | ||
893 | 990 | ||
894 | static struct dyn_ftrace *ftrace_new_addrs; | 991 | static struct dyn_ftrace *ftrace_new_addrs; |
895 | 992 | ||
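struct ftrace_hash above, together with the lookup and add helpers in the next hunk, keys function addresses into 2^size_bits chained buckets. The stand-alone model below uses invented names and a fixed multiplicative hash instead of the kernel's hash_long(); it only shows the bucket selection and chaining, with RCU and most error handling left out.

#include <stdio.h>
#include <stdlib.h>

struct func_entry {
	struct func_entry *next;
	unsigned long ip;
};

struct func_hash {
	unsigned int size_bits;
	struct func_entry **buckets;	/* 2^size_bits chains */
	unsigned long count;
};

static unsigned int bucket_of(const struct func_hash *h, unsigned long ip)
{
	if (!h->size_bits)
		return 0;
	/* golden-ratio style multiplier as a stand-in for hash_long() */
	return (unsigned int)((ip * 0x9E3779B97F4A7C15ULL) >>
			      (64 - h->size_bits));
}

static struct func_hash *hash_alloc(unsigned int size_bits)
{
	struct func_hash *h = calloc(1, sizeof(*h));

	if (!h)
		return NULL;
	h->size_bits = size_bits;
	h->buckets = calloc(1UL << size_bits, sizeof(*h->buckets));
	if (!h->buckets) {
		free(h);
		return NULL;
	}
	return h;
}

static int hash_add(struct func_hash *h, unsigned long ip)
{
	struct func_entry *e = malloc(sizeof(*e));

	if (!e)
		return -1;
	e->ip = ip;
	e->next = h->buckets[bucket_of(h, ip)];
	h->buckets[bucket_of(h, ip)] = e;	/* chain at bucket head */
	h->count++;
	return 0;
}

static int hash_lookup(const struct func_hash *h, unsigned long ip)
{
	const struct func_entry *e;

	for (e = h->buckets[bucket_of(h, ip)]; e; e = e->next)
		if (e->ip == ip)
			return 1;
	return 0;
}

int main(void)
{
	struct func_hash *h = hash_alloc(10);

	hash_add(h, 0x81000000UL);
	printf("%d %d\n", hash_lookup(h, 0x81000000UL),
	       hash_lookup(h, 0x81000004UL));
	return 0;
}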
@@ -912,6 +1009,269 @@ static struct ftrace_page *ftrace_pages; | |||
912 | 1009 | ||
913 | static struct dyn_ftrace *ftrace_free_records; | 1010 | static struct dyn_ftrace *ftrace_free_records; |
914 | 1011 | ||
1012 | static struct ftrace_func_entry * | ||
1013 | ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | ||
1014 | { | ||
1015 | unsigned long key; | ||
1016 | struct ftrace_func_entry *entry; | ||
1017 | struct hlist_head *hhd; | ||
1018 | struct hlist_node *n; | ||
1019 | |||
1020 | if (!hash->count) | ||
1021 | return NULL; | ||
1022 | |||
1023 | if (hash->size_bits > 0) | ||
1024 | key = hash_long(ip, hash->size_bits); | ||
1025 | else | ||
1026 | key = 0; | ||
1027 | |||
1028 | hhd = &hash->buckets[key]; | ||
1029 | |||
1030 | hlist_for_each_entry_rcu(entry, n, hhd, hlist) { | ||
1031 | if (entry->ip == ip) | ||
1032 | return entry; | ||
1033 | } | ||
1034 | return NULL; | ||
1035 | } | ||
1036 | |||
1037 | static void __add_hash_entry(struct ftrace_hash *hash, | ||
1038 | struct ftrace_func_entry *entry) | ||
1039 | { | ||
1040 | struct hlist_head *hhd; | ||
1041 | unsigned long key; | ||
1042 | |||
1043 | if (hash->size_bits) | ||
1044 | key = hash_long(entry->ip, hash->size_bits); | ||
1045 | else | ||
1046 | key = 0; | ||
1047 | |||
1048 | hhd = &hash->buckets[key]; | ||
1049 | hlist_add_head(&entry->hlist, hhd); | ||
1050 | hash->count++; | ||
1051 | } | ||
1052 | |||
1053 | static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) | ||
1054 | { | ||
1055 | struct ftrace_func_entry *entry; | ||
1056 | |||
1057 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
1058 | if (!entry) | ||
1059 | return -ENOMEM; | ||
1060 | |||
1061 | entry->ip = ip; | ||
1062 | __add_hash_entry(hash, entry); | ||
1063 | |||
1064 | return 0; | ||
1065 | } | ||
1066 | |||
1067 | static void | ||
1068 | free_hash_entry(struct ftrace_hash *hash, | ||
1069 | struct ftrace_func_entry *entry) | ||
1070 | { | ||
1071 | hlist_del(&entry->hlist); | ||
1072 | kfree(entry); | ||
1073 | hash->count--; | ||
1074 | } | ||
1075 | |||
1076 | static void | ||
1077 | remove_hash_entry(struct ftrace_hash *hash, | ||
1078 | struct ftrace_func_entry *entry) | ||
1079 | { | ||
1080 | hlist_del(&entry->hlist); | ||
1081 | hash->count--; | ||
1082 | } | ||
1083 | |||
1084 | static void ftrace_hash_clear(struct ftrace_hash *hash) | ||
1085 | { | ||
1086 | struct hlist_head *hhd; | ||
1087 | struct hlist_node *tp, *tn; | ||
1088 | struct ftrace_func_entry *entry; | ||
1089 | int size = 1 << hash->size_bits; | ||
1090 | int i; | ||
1091 | |||
1092 | if (!hash->count) | ||
1093 | return; | ||
1094 | |||
1095 | for (i = 0; i < size; i++) { | ||
1096 | hhd = &hash->buckets[i]; | ||
1097 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) | ||
1098 | free_hash_entry(hash, entry); | ||
1099 | } | ||
1100 | FTRACE_WARN_ON(hash->count); | ||
1101 | } | ||
1102 | |||
1103 | static void free_ftrace_hash(struct ftrace_hash *hash) | ||
1104 | { | ||
1105 | if (!hash || hash == EMPTY_HASH) | ||
1106 | return; | ||
1107 | ftrace_hash_clear(hash); | ||
1108 | kfree(hash->buckets); | ||
1109 | kfree(hash); | ||
1110 | } | ||
1111 | |||
1112 | static void __free_ftrace_hash_rcu(struct rcu_head *rcu) | ||
1113 | { | ||
1114 | struct ftrace_hash *hash; | ||
1115 | |||
1116 | hash = container_of(rcu, struct ftrace_hash, rcu); | ||
1117 | free_ftrace_hash(hash); | ||
1118 | } | ||
1119 | |||
1120 | static void free_ftrace_hash_rcu(struct ftrace_hash *hash) | ||
1121 | { | ||
1122 | if (!hash || hash == EMPTY_HASH) | ||
1123 | return; | ||
1124 | call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); | ||
1125 | } | ||
1126 | |||
1127 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | ||
1128 | { | ||
1129 | struct ftrace_hash *hash; | ||
1130 | int size; | ||
1131 | |||
1132 | hash = kzalloc(sizeof(*hash), GFP_KERNEL); | ||
1133 | if (!hash) | ||
1134 | return NULL; | ||
1135 | |||
1136 | size = 1 << size_bits; | ||
1137 | hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); | ||
1138 | |||
1139 | if (!hash->buckets) { | ||
1140 | kfree(hash); | ||
1141 | return NULL; | ||
1142 | } | ||
1143 | |||
1144 | hash->size_bits = size_bits; | ||
1145 | |||
1146 | return hash; | ||
1147 | } | ||
1148 | |||
1149 | static struct ftrace_hash * | ||
1150 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | ||
1151 | { | ||
1152 | struct ftrace_func_entry *entry; | ||
1153 | struct ftrace_hash *new_hash; | ||
1154 | struct hlist_node *tp; | ||
1155 | int size; | ||
1156 | int ret; | ||
1157 | int i; | ||
1158 | |||
1159 | new_hash = alloc_ftrace_hash(size_bits); | ||
1160 | if (!new_hash) | ||
1161 | return NULL; | ||
1162 | |||
1163 | /* Empty hash? */ | ||
1164 | if (!hash || !hash->count) | ||
1165 | return new_hash; | ||
1166 | |||
1167 | size = 1 << hash->size_bits; | ||
1168 | for (i = 0; i < size; i++) { | ||
1169 | hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { | ||
1170 | ret = add_hash_entry(new_hash, entry->ip); | ||
1171 | if (ret < 0) | ||
1172 | goto free_hash; | ||
1173 | } | ||
1174 | } | ||
1175 | |||
1176 | FTRACE_WARN_ON(new_hash->count != hash->count); | ||
1177 | |||
1178 | return new_hash; | ||
1179 | |||
1180 | free_hash: | ||
1181 | free_ftrace_hash(new_hash); | ||
1182 | return NULL; | ||
1183 | } | ||
1184 | |||
1185 | static int | ||
1186 | ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) | ||
1187 | { | ||
1188 | struct ftrace_func_entry *entry; | ||
1189 | struct hlist_node *tp, *tn; | ||
1190 | struct hlist_head *hhd; | ||
1191 | struct ftrace_hash *old_hash; | ||
1192 | struct ftrace_hash *new_hash; | ||
1193 | unsigned long key; | ||
1194 | int size = src->count; | ||
1195 | int bits = 0; | ||
1196 | int i; | ||
1197 | |||
1198 | /* | ||
1199 | * If the new source is empty, just free dst and assign it | ||
1200 | * the empty_hash. | ||
1201 | */ | ||
1202 | if (!src->count) { | ||
1203 | free_ftrace_hash_rcu(*dst); | ||
1204 | rcu_assign_pointer(*dst, EMPTY_HASH); | ||
1205 | return 0; | ||
1206 | } | ||
1207 | |||
1208 | /* | ||
1209 | * Make the hash size about 1/2 the # found | ||
1210 | */ | ||
1211 | for (size /= 2; size; size >>= 1) | ||
1212 | bits++; | ||
1213 | |||
1214 | /* Don't allocate too much */ | ||
1215 | if (bits > FTRACE_HASH_MAX_BITS) | ||
1216 | bits = FTRACE_HASH_MAX_BITS; | ||
1217 | |||
1218 | new_hash = alloc_ftrace_hash(bits); | ||
1219 | if (!new_hash) | ||
1220 | return -ENOMEM; | ||
1221 | |||
1222 | size = 1 << src->size_bits; | ||
1223 | for (i = 0; i < size; i++) { | ||
1224 | hhd = &src->buckets[i]; | ||
1225 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { | ||
1226 | if (bits > 0) | ||
1227 | key = hash_long(entry->ip, bits); | ||
1228 | else | ||
1229 | key = 0; | ||
1230 | remove_hash_entry(src, entry); | ||
1231 | __add_hash_entry(new_hash, entry); | ||
1232 | } | ||
1233 | } | ||
1234 | |||
1235 | old_hash = *dst; | ||
1236 | rcu_assign_pointer(*dst, new_hash); | ||
1237 | free_ftrace_hash_rcu(old_hash); | ||
1238 | |||
1239 | return 0; | ||
1240 | } | ||
1241 | |||
1242 | /* | ||
1243 | * Test the hashes for this ops to see if we want to call | ||
1244 | * the ops->func or not. | ||
1245 | * | ||
1246 | * It's a match if the ip is in the ops->filter_hash or | ||
1247 | * the filter_hash does not exist or is empty, | ||
1248 | * AND | ||
1249 | * the ip is not in the ops->notrace_hash. | ||
1250 | * | ||
1251 | * This needs to be called with preemption disabled as | ||
1252 | * the hashes are freed with call_rcu_sched(). | ||
1253 | */ | ||
1254 | static int | ||
1255 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
1256 | { | ||
1257 | struct ftrace_hash *filter_hash; | ||
1258 | struct ftrace_hash *notrace_hash; | ||
1259 | int ret; | ||
1260 | |||
1261 | filter_hash = rcu_dereference_raw(ops->filter_hash); | ||
1262 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); | ||
1263 | |||
1264 | if ((!filter_hash || !filter_hash->count || | ||
1265 | ftrace_lookup_ip(filter_hash, ip)) && | ||
1266 | (!notrace_hash || !notrace_hash->count || | ||
1267 | !ftrace_lookup_ip(notrace_hash, ip))) | ||
1268 | ret = 1; | ||
1269 | else | ||
1270 | ret = 0; | ||
1271 | |||
1272 | return ret; | ||
1273 | } | ||
1274 | |||
915 | /* | 1275 | /* |
916 | * This is a double for. Do not use 'break' to break out of the loop, | 1276 | * This is a double for. Do not use 'break' to break out of the loop, |
917 | * you must use a goto. | 1277 | * you must use a goto. |
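ftrace_ops_test() in the hunk above decides per ops whether an ip should be traced: trace when the filter hash is empty or contains the ip, and the notrace hash does not contain it. A user-space model of just that rule, with plain arrays instead of the RCU-protected hashes and invented names:

#include <stdbool.h>
#include <stdio.h>

struct ip_set {
	const unsigned long *ips;
	int count;			/* 0 means "empty set" */
};

static bool set_has(const struct ip_set *s, unsigned long ip)
{
	int i;

	for (i = 0; i < s->count; i++)
		if (s->ips[i] == ip)
			return true;
	return false;
}

static bool should_trace(const struct ip_set *filter,
			 const struct ip_set *notrace, unsigned long ip)
{
	/* empty filter means "trace everything not in notrace" */
	return (!filter->count || set_has(filter, ip)) &&
	       !set_has(notrace, ip);
}

int main(void)
{
	unsigned long filtered[] = { 0x1000, 0x2000 };
	unsigned long skipped[]  = { 0x2000 };
	struct ip_set filter  = { filtered, 2 };
	struct ip_set notrace = { skipped, 1 };

	printf("0x1000 -> %d\n", should_trace(&filter, &notrace, 0x1000)); /* 1 */
	printf("0x2000 -> %d\n", should_trace(&filter, &notrace, 0x2000)); /* 0 */
	printf("0x3000 -> %d\n", should_trace(&filter, &notrace, 0x3000)); /* 0 */
	return 0;
}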
@@ -926,6 +1286,105 @@ static struct dyn_ftrace *ftrace_free_records; | |||
926 | } \ | 1286 | } \ |
927 | } | 1287 | } |
928 | 1288 | ||
1289 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | ||
1290 | int filter_hash, | ||
1291 | bool inc) | ||
1292 | { | ||
1293 | struct ftrace_hash *hash; | ||
1294 | struct ftrace_hash *other_hash; | ||
1295 | struct ftrace_page *pg; | ||
1296 | struct dyn_ftrace *rec; | ||
1297 | int count = 0; | ||
1298 | int all = 0; | ||
1299 | |||
1300 | /* Only update if the ops has been registered */ | ||
1301 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | ||
1302 | return; | ||
1303 | |||
1304 | /* | ||
1305 | * In the filter_hash case: | ||
1306 | * If the count is zero, we update all records. | ||
1307 | * Otherwise we just update the items in the hash. | ||
1308 | * | ||
1309 | * In the notrace_hash case: | ||
1310 | * We enable the update in the hash. | ||
1311 | * As disabling notrace means enabling the tracing, | ||
1312 | * and enabling notrace means disabling, the inc variable | ||
1313 | * gets inversed. | ||
1314 | */ | ||
1315 | if (filter_hash) { | ||
1316 | hash = ops->filter_hash; | ||
1317 | other_hash = ops->notrace_hash; | ||
1318 | if (!hash || !hash->count) | ||
1319 | all = 1; | ||
1320 | } else { | ||
1321 | inc = !inc; | ||
1322 | hash = ops->notrace_hash; | ||
1323 | other_hash = ops->filter_hash; | ||
1324 | /* | ||
1325 | * If the notrace hash has no items, | ||
1326 | * then there's nothing to do. | ||
1327 | */ | ||
1328 | if (hash && !hash->count) | ||
1329 | return; | ||
1330 | } | ||
1331 | |||
1332 | do_for_each_ftrace_rec(pg, rec) { | ||
1333 | int in_other_hash = 0; | ||
1334 | int in_hash = 0; | ||
1335 | int match = 0; | ||
1336 | |||
1337 | if (all) { | ||
1338 | /* | ||
1339 | * Only the filter_hash affects all records. | ||
1340 | * Update if the record is not in the notrace hash. | ||
1341 | */ | ||
1342 | if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) | ||
1343 | match = 1; | ||
1344 | } else { | ||
1345 | in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); | ||
1346 | in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); | ||
1347 | |||
1348 | /* | ||
1349 | * For filter_hash: update records that are in the filter hash and not in the notrace hash. For notrace_hash (inc was inverted above): update records that are in the notrace hash and otherwise traced, i.e. also in the filter hash or the filter hash is empty. | ||
1350 | */ | ||
1351 | if (filter_hash && in_hash && !in_other_hash) | ||
1352 | match = 1; | ||
1353 | else if (!filter_hash && in_hash && | ||
1354 | (in_other_hash || !other_hash->count)) | ||
1355 | match = 1; | ||
1356 | } | ||
1357 | if (!match) | ||
1358 | continue; | ||
1359 | |||
1360 | if (inc) { | ||
1361 | rec->flags++; | ||
1362 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) | ||
1363 | return; | ||
1364 | } else { | ||
1365 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) | ||
1366 | return; | ||
1367 | rec->flags--; | ||
1368 | } | ||
1369 | count++; | ||
1370 | /* Shortcut, if we handled all records, we are done. */ | ||
1371 | if (!all && count == hash->count) | ||
1372 | return; | ||
1373 | } while_for_each_ftrace_rec(); | ||
1374 | } | ||
1375 | |||
1376 | static void ftrace_hash_rec_disable(struct ftrace_ops *ops, | ||
1377 | int filter_hash) | ||
1378 | { | ||
1379 | __ftrace_hash_rec_update(ops, filter_hash, 0); | ||
1380 | } | ||
1381 | |||
1382 | static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | ||
1383 | int filter_hash) | ||
1384 | { | ||
1385 | __ftrace_hash_rec_update(ops, filter_hash, 1); | ||
1386 | } | ||
1387 | |||
929 | static void ftrace_free_rec(struct dyn_ftrace *rec) | 1388 | static void ftrace_free_rec(struct dyn_ftrace *rec) |
930 | { | 1389 | { |
931 | rec->freelist = ftrace_free_records; | 1390 | rec->freelist = ftrace_free_records; |
@@ -1047,18 +1506,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1047 | ftrace_addr = (unsigned long)FTRACE_ADDR; | 1506 | ftrace_addr = (unsigned long)FTRACE_ADDR; |
1048 | 1507 | ||
1049 | /* | 1508 | /* |
1050 | * If this record is not to be traced or we want to disable it, | 1509 | * If we are enabling tracing: |
1051 | * then disable it. | 1510 | * |
1511 | * If the record has a ref count, then we need to enable it | ||
1512 | * because someone is using it. | ||
1052 | * | 1513 | * |
1053 | * If we want to enable it and filtering is off, then enable it. | 1514 | * Otherwise we make sure its disabled. |
1054 | * | 1515 | * |
1055 | * If we want to enable it and filtering is on, enable it only if | 1516 | * If we are disabling tracing, then disable all records that |
1056 | * it's filtered | 1517 | * are enabled. |
1057 | */ | 1518 | */ |
1058 | if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { | 1519 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) |
1059 | if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) | 1520 | flag = FTRACE_FL_ENABLED; |
1060 | flag = FTRACE_FL_ENABLED; | ||
1061 | } | ||
1062 | 1521 | ||
1063 | /* If the state of this record hasn't changed, then do nothing */ | 1522 | /* If the state of this record hasn't changed, then do nothing */ |
1064 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1523 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
@@ -1079,19 +1538,16 @@ static void ftrace_replace_code(int enable) | |||
1079 | struct ftrace_page *pg; | 1538 | struct ftrace_page *pg; |
1080 | int failed; | 1539 | int failed; |
1081 | 1540 | ||
1541 | if (unlikely(ftrace_disabled)) | ||
1542 | return; | ||
1543 | |||
1082 | do_for_each_ftrace_rec(pg, rec) { | 1544 | do_for_each_ftrace_rec(pg, rec) { |
1083 | /* | 1545 | /* Skip over free records */ |
1084 | * Skip over free records, records that have | 1546 | if (rec->flags & FTRACE_FL_FREE) |
1085 | * failed and not converted. | ||
1086 | */ | ||
1087 | if (rec->flags & FTRACE_FL_FREE || | ||
1088 | rec->flags & FTRACE_FL_FAILED || | ||
1089 | !(rec->flags & FTRACE_FL_CONVERTED)) | ||
1090 | continue; | 1547 | continue; |
1091 | 1548 | ||
1092 | failed = __ftrace_replace_code(rec, enable); | 1549 | failed = __ftrace_replace_code(rec, enable); |
1093 | if (failed) { | 1550 | if (failed) { |
1094 | rec->flags |= FTRACE_FL_FAILED; | ||
1095 | ftrace_bug(failed, rec->ip); | 1551 | ftrace_bug(failed, rec->ip); |
1096 | /* Stop processing */ | 1552 | /* Stop processing */ |
1097 | return; | 1553 | return; |
@@ -1107,10 +1563,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) | |||
1107 | 1563 | ||
1108 | ip = rec->ip; | 1564 | ip = rec->ip; |
1109 | 1565 | ||
1566 | if (unlikely(ftrace_disabled)) | ||
1567 | return 0; | ||
1568 | |||
1110 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); | 1569 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); |
1111 | if (ret) { | 1570 | if (ret) { |
1112 | ftrace_bug(ret, ip); | 1571 | ftrace_bug(ret, ip); |
1113 | rec->flags |= FTRACE_FL_FAILED; | ||
1114 | return 0; | 1572 | return 0; |
1115 | } | 1573 | } |
1116 | return 1; | 1574 | return 1; |
@@ -1171,6 +1629,7 @@ static void ftrace_run_update_code(int command) | |||
1171 | 1629 | ||
1172 | static ftrace_func_t saved_ftrace_func; | 1630 | static ftrace_func_t saved_ftrace_func; |
1173 | static int ftrace_start_up; | 1631 | static int ftrace_start_up; |
1632 | static int global_start_up; | ||
1174 | 1633 | ||
1175 | static void ftrace_startup_enable(int command) | 1634 | static void ftrace_startup_enable(int command) |
1176 | { | 1635 | { |
@@ -1185,19 +1644,38 @@ static void ftrace_startup_enable(int command) | |||
1185 | ftrace_run_update_code(command); | 1644 | ftrace_run_update_code(command); |
1186 | } | 1645 | } |
1187 | 1646 | ||
1188 | static void ftrace_startup(int command) | 1647 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
1189 | { | 1648 | { |
1649 | bool hash_enable = true; | ||
1650 | |||
1190 | if (unlikely(ftrace_disabled)) | 1651 | if (unlikely(ftrace_disabled)) |
1191 | return; | 1652 | return -ENODEV; |
1192 | 1653 | ||
1193 | ftrace_start_up++; | 1654 | ftrace_start_up++; |
1194 | command |= FTRACE_ENABLE_CALLS; | 1655 | command |= FTRACE_ENABLE_CALLS; |
1195 | 1656 | ||
1657 | /* ops marked global share the filter hashes */ | ||
1658 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
1659 | ops = &global_ops; | ||
1660 | /* Don't update hash if global is already set */ | ||
1661 | if (global_start_up) | ||
1662 | hash_enable = false; | ||
1663 | global_start_up++; | ||
1664 | } | ||
1665 | |||
1666 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
1667 | if (hash_enable) | ||
1668 | ftrace_hash_rec_enable(ops, 1); | ||
1669 | |||
1196 | ftrace_startup_enable(command); | 1670 | ftrace_startup_enable(command); |
1671 | |||
1672 | return 0; | ||
1197 | } | 1673 | } |
1198 | 1674 | ||
1199 | static void ftrace_shutdown(int command) | 1675 | static void ftrace_shutdown(struct ftrace_ops *ops, int command) |
1200 | { | 1676 | { |
1677 | bool hash_disable = true; | ||
1678 | |||
1201 | if (unlikely(ftrace_disabled)) | 1679 | if (unlikely(ftrace_disabled)) |
1202 | return; | 1680 | return; |
1203 | 1681 | ||
@@ -1209,6 +1687,23 @@ static void ftrace_shutdown(int command) | |||
1209 | */ | 1687 | */ |
1210 | WARN_ON_ONCE(ftrace_start_up < 0); | 1688 | WARN_ON_ONCE(ftrace_start_up < 0); |
1211 | 1689 | ||
1690 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
1691 | ops = &global_ops; | ||
1692 | global_start_up--; | ||
1693 | WARN_ON_ONCE(global_start_up < 0); | ||
1694 | /* Don't update hash if global still has users */ | ||
1695 | if (global_start_up) { | ||
1696 | WARN_ON_ONCE(!ftrace_start_up); | ||
1697 | hash_disable = false; | ||
1698 | } | ||
1699 | } | ||
1700 | |||
1701 | if (hash_disable) | ||
1702 | ftrace_hash_rec_disable(ops, 1); | ||
1703 | |||
1704 | if (ops != &global_ops || !global_start_up) | ||
1705 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
1706 | |||
1212 | if (!ftrace_start_up) | 1707 | if (!ftrace_start_up) |
1213 | command |= FTRACE_DISABLE_CALLS; | 1708 | command |= FTRACE_DISABLE_CALLS; |
1214 | 1709 | ||
@@ -1273,10 +1768,10 @@ static int ftrace_update_code(struct module *mod) | |||
1273 | */ | 1768 | */ |
1274 | if (!ftrace_code_disable(mod, p)) { | 1769 | if (!ftrace_code_disable(mod, p)) { |
1275 | ftrace_free_rec(p); | 1770 | ftrace_free_rec(p); |
1276 | continue; | 1771 | /* Game over */ |
1772 | break; | ||
1277 | } | 1773 | } |
1278 | 1774 | ||
1279 | p->flags |= FTRACE_FL_CONVERTED; | ||
1280 | ftrace_update_cnt++; | 1775 | ftrace_update_cnt++; |
1281 | 1776 | ||
1282 | /* | 1777 | /* |
@@ -1351,9 +1846,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) | |||
1351 | enum { | 1846 | enum { |
1352 | FTRACE_ITER_FILTER = (1 << 0), | 1847 | FTRACE_ITER_FILTER = (1 << 0), |
1353 | FTRACE_ITER_NOTRACE = (1 << 1), | 1848 | FTRACE_ITER_NOTRACE = (1 << 1), |
1354 | FTRACE_ITER_FAILURES = (1 << 2), | 1849 | FTRACE_ITER_PRINTALL = (1 << 2), |
1355 | FTRACE_ITER_PRINTALL = (1 << 3), | 1850 | FTRACE_ITER_HASH = (1 << 3), |
1356 | FTRACE_ITER_HASH = (1 << 4), | 1851 | FTRACE_ITER_ENABLED = (1 << 4), |
1357 | }; | 1852 | }; |
1358 | 1853 | ||
1359 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 1854 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
@@ -1365,6 +1860,8 @@ struct ftrace_iterator { | |||
1365 | struct dyn_ftrace *func; | 1860 | struct dyn_ftrace *func; |
1366 | struct ftrace_func_probe *probe; | 1861 | struct ftrace_func_probe *probe; |
1367 | struct trace_parser parser; | 1862 | struct trace_parser parser; |
1863 | struct ftrace_hash *hash; | ||
1864 | struct ftrace_ops *ops; | ||
1368 | int hidx; | 1865 | int hidx; |
1369 | int idx; | 1866 | int idx; |
1370 | unsigned flags; | 1867 | unsigned flags; |
@@ -1461,8 +1958,12 @@ static void * | |||
1461 | t_next(struct seq_file *m, void *v, loff_t *pos) | 1958 | t_next(struct seq_file *m, void *v, loff_t *pos) |
1462 | { | 1959 | { |
1463 | struct ftrace_iterator *iter = m->private; | 1960 | struct ftrace_iterator *iter = m->private; |
1961 | struct ftrace_ops *ops = &global_ops; | ||
1464 | struct dyn_ftrace *rec = NULL; | 1962 | struct dyn_ftrace *rec = NULL; |
1465 | 1963 | ||
1964 | if (unlikely(ftrace_disabled)) | ||
1965 | return NULL; | ||
1966 | |||
1466 | if (iter->flags & FTRACE_ITER_HASH) | 1967 | if (iter->flags & FTRACE_ITER_HASH) |
1467 | return t_hash_next(m, pos); | 1968 | return t_hash_next(m, pos); |
1468 | 1969 | ||
@@ -1483,17 +1984,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
1483 | rec = &iter->pg->records[iter->idx++]; | 1984 | rec = &iter->pg->records[iter->idx++]; |
1484 | if ((rec->flags & FTRACE_FL_FREE) || | 1985 | if ((rec->flags & FTRACE_FL_FREE) || |
1485 | 1986 | ||
1486 | (!(iter->flags & FTRACE_ITER_FAILURES) && | ||
1487 | (rec->flags & FTRACE_FL_FAILED)) || | ||
1488 | |||
1489 | ((iter->flags & FTRACE_ITER_FAILURES) && | ||
1490 | !(rec->flags & FTRACE_FL_FAILED)) || | ||
1491 | |||
1492 | ((iter->flags & FTRACE_ITER_FILTER) && | 1987 | ((iter->flags & FTRACE_ITER_FILTER) && |
1493 | !(rec->flags & FTRACE_FL_FILTER)) || | 1988 | !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || |
1494 | 1989 | ||
1495 | ((iter->flags & FTRACE_ITER_NOTRACE) && | 1990 | ((iter->flags & FTRACE_ITER_NOTRACE) && |
1496 | !(rec->flags & FTRACE_FL_NOTRACE))) { | 1991 | !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || |
1992 | |||
1993 | ((iter->flags & FTRACE_ITER_ENABLED) && | ||
1994 | !(rec->flags & ~FTRACE_FL_MASK))) { | ||
1995 | |||
1497 | rec = NULL; | 1996 | rec = NULL; |
1498 | goto retry; | 1997 | goto retry; |
1499 | } | 1998 | } |
@@ -1517,10 +2016,15 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
1517 | static void *t_start(struct seq_file *m, loff_t *pos) | 2016 | static void *t_start(struct seq_file *m, loff_t *pos) |
1518 | { | 2017 | { |
1519 | struct ftrace_iterator *iter = m->private; | 2018 | struct ftrace_iterator *iter = m->private; |
2019 | struct ftrace_ops *ops = &global_ops; | ||
1520 | void *p = NULL; | 2020 | void *p = NULL; |
1521 | loff_t l; | 2021 | loff_t l; |
1522 | 2022 | ||
1523 | mutex_lock(&ftrace_lock); | 2023 | mutex_lock(&ftrace_lock); |
2024 | |||
2025 | if (unlikely(ftrace_disabled)) | ||
2026 | return NULL; | ||
2027 | |||
1524 | /* | 2028 | /* |
1525 | * If an lseek was done, then reset and start from beginning. | 2029 | * If an lseek was done, then reset and start from beginning. |
1526 | */ | 2030 | */ |
@@ -1532,7 +2036,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1532 | * off, we can short cut and just print out that all | 2036 | * off, we can short cut and just print out that all |
1533 | * functions are enabled. | 2037 | * functions are enabled. |
1534 | */ | 2038 | */ |
1535 | if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { | 2039 | if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { |
1536 | if (*pos > 0) | 2040 | if (*pos > 0) |
1537 | return t_hash_start(m, pos); | 2041 | return t_hash_start(m, pos); |
1538 | iter->flags |= FTRACE_ITER_PRINTALL; | 2042 | iter->flags |= FTRACE_ITER_PRINTALL; |
@@ -1590,7 +2094,11 @@ static int t_show(struct seq_file *m, void *v) | |||
1590 | if (!rec) | 2094 | if (!rec) |
1591 | return 0; | 2095 | return 0; |
1592 | 2096 | ||
1593 | seq_printf(m, "%ps\n", (void *)rec->ip); | 2097 | seq_printf(m, "%ps", (void *)rec->ip); |
2098 | if (iter->flags & FTRACE_ITER_ENABLED) | ||
2099 | seq_printf(m, " (%ld)", | ||
2100 | rec->flags & ~FTRACE_FL_MASK); | ||
2101 | seq_printf(m, "\n"); | ||
1594 | 2102 | ||
1595 | return 0; | 2103 | return 0; |
1596 | } | 2104 | } |
@@ -1630,44 +2138,46 @@ ftrace_avail_open(struct inode *inode, struct file *file) | |||
1630 | } | 2138 | } |
1631 | 2139 | ||
1632 | static int | 2140 | static int |
1633 | ftrace_failures_open(struct inode *inode, struct file *file) | 2141 | ftrace_enabled_open(struct inode *inode, struct file *file) |
1634 | { | 2142 | { |
1635 | int ret; | ||
1636 | struct seq_file *m; | ||
1637 | struct ftrace_iterator *iter; | 2143 | struct ftrace_iterator *iter; |
2144 | int ret; | ||
2145 | |||
2146 | if (unlikely(ftrace_disabled)) | ||
2147 | return -ENODEV; | ||
2148 | |||
2149 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | ||
2150 | if (!iter) | ||
2151 | return -ENOMEM; | ||
1638 | 2152 | ||
1639 | ret = ftrace_avail_open(inode, file); | 2153 | iter->pg = ftrace_pages_start; |
2154 | iter->flags = FTRACE_ITER_ENABLED; | ||
2155 | |||
2156 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
1640 | if (!ret) { | 2157 | if (!ret) { |
1641 | m = file->private_data; | 2158 | struct seq_file *m = file->private_data; |
1642 | iter = m->private; | 2159 | |
1643 | iter->flags = FTRACE_ITER_FAILURES; | 2160 | m->private = iter; |
2161 | } else { | ||
2162 | kfree(iter); | ||
1644 | } | 2163 | } |
1645 | 2164 | ||
1646 | return ret; | 2165 | return ret; |
1647 | } | 2166 | } |
1648 | 2167 | ||
1649 | 2168 | static void ftrace_filter_reset(struct ftrace_hash *hash) | |
1650 | static void ftrace_filter_reset(int enable) | ||
1651 | { | 2169 | { |
1652 | struct ftrace_page *pg; | ||
1653 | struct dyn_ftrace *rec; | ||
1654 | unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
1655 | |||
1656 | mutex_lock(&ftrace_lock); | 2170 | mutex_lock(&ftrace_lock); |
1657 | if (enable) | 2171 | ftrace_hash_clear(hash); |
1658 | ftrace_filtered = 0; | ||
1659 | do_for_each_ftrace_rec(pg, rec) { | ||
1660 | if (rec->flags & FTRACE_FL_FAILED) | ||
1661 | continue; | ||
1662 | rec->flags &= ~type; | ||
1663 | } while_for_each_ftrace_rec(); | ||
1664 | mutex_unlock(&ftrace_lock); | 2172 | mutex_unlock(&ftrace_lock); |
1665 | } | 2173 | } |
1666 | 2174 | ||
1667 | static int | 2175 | static int |
1668 | ftrace_regex_open(struct inode *inode, struct file *file, int enable) | 2176 | ftrace_regex_open(struct ftrace_ops *ops, int flag, |
2177 | struct inode *inode, struct file *file) | ||
1669 | { | 2178 | { |
1670 | struct ftrace_iterator *iter; | 2179 | struct ftrace_iterator *iter; |
2180 | struct ftrace_hash *hash; | ||
1671 | int ret = 0; | 2181 | int ret = 0; |
1672 | 2182 | ||
1673 | if (unlikely(ftrace_disabled)) | 2183 | if (unlikely(ftrace_disabled)) |
@@ -1682,21 +2192,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
1682 | return -ENOMEM; | 2192 | return -ENOMEM; |
1683 | } | 2193 | } |
1684 | 2194 | ||
2195 | if (flag & FTRACE_ITER_NOTRACE) | ||
2196 | hash = ops->notrace_hash; | ||
2197 | else | ||
2198 | hash = ops->filter_hash; | ||
2199 | |||
2200 | iter->ops = ops; | ||
2201 | iter->flags = flag; | ||
2202 | |||
2203 | if (file->f_mode & FMODE_WRITE) { | ||
2204 | mutex_lock(&ftrace_lock); | ||
2205 | iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); | ||
2206 | mutex_unlock(&ftrace_lock); | ||
2207 | |||
2208 | if (!iter->hash) { | ||
2209 | trace_parser_put(&iter->parser); | ||
2210 | kfree(iter); | ||
2211 | return -ENOMEM; | ||
2212 | } | ||
2213 | } | ||
2214 | |||
1685 | mutex_lock(&ftrace_regex_lock); | 2215 | mutex_lock(&ftrace_regex_lock); |
2216 | |||
1686 | if ((file->f_mode & FMODE_WRITE) && | 2217 | if ((file->f_mode & FMODE_WRITE) && |
1687 | (file->f_flags & O_TRUNC)) | 2218 | (file->f_flags & O_TRUNC)) |
1688 | ftrace_filter_reset(enable); | 2219 | ftrace_filter_reset(iter->hash); |
1689 | 2220 | ||
1690 | if (file->f_mode & FMODE_READ) { | 2221 | if (file->f_mode & FMODE_READ) { |
1691 | iter->pg = ftrace_pages_start; | 2222 | iter->pg = ftrace_pages_start; |
1692 | iter->flags = enable ? FTRACE_ITER_FILTER : | ||
1693 | FTRACE_ITER_NOTRACE; | ||
1694 | 2223 | ||
1695 | ret = seq_open(file, &show_ftrace_seq_ops); | 2224 | ret = seq_open(file, &show_ftrace_seq_ops); |
1696 | if (!ret) { | 2225 | if (!ret) { |
1697 | struct seq_file *m = file->private_data; | 2226 | struct seq_file *m = file->private_data; |
1698 | m->private = iter; | 2227 | m->private = iter; |
1699 | } else { | 2228 | } else { |
2229 | /* Failed */ | ||
2230 | free_ftrace_hash(iter->hash); | ||
1700 | trace_parser_put(&iter->parser); | 2231 | trace_parser_put(&iter->parser); |
1701 | kfree(iter); | 2232 | kfree(iter); |
1702 | } | 2233 | } |
@@ -1710,13 +2241,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
1710 | static int | 2241 | static int |
1711 | ftrace_filter_open(struct inode *inode, struct file *file) | 2242 | ftrace_filter_open(struct inode *inode, struct file *file) |
1712 | { | 2243 | { |
1713 | return ftrace_regex_open(inode, file, 1); | 2244 | return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, |
2245 | inode, file); | ||
1714 | } | 2246 | } |
1715 | 2247 | ||
1716 | static int | 2248 | static int |
1717 | ftrace_notrace_open(struct inode *inode, struct file *file) | 2249 | ftrace_notrace_open(struct inode *inode, struct file *file) |
1718 | { | 2250 | { |
1719 | return ftrace_regex_open(inode, file, 0); | 2251 | return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, |
2252 | inode, file); | ||
1720 | } | 2253 | } |
1721 | 2254 | ||
1722 | static loff_t | 2255 | static loff_t |
@@ -1761,86 +2294,99 @@ static int ftrace_match(char *str, char *regex, int len, int type) | |||
1761 | } | 2294 | } |
1762 | 2295 | ||
1763 | static int | 2296 | static int |
1764 | ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) | 2297 | enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) |
2298 | { | ||
2299 | struct ftrace_func_entry *entry; | ||
2300 | int ret = 0; | ||
2301 | |||
2302 | entry = ftrace_lookup_ip(hash, rec->ip); | ||
2303 | if (not) { | ||
2304 | /* Do nothing if it doesn't exist */ | ||
2305 | if (!entry) | ||
2306 | return 0; | ||
2307 | |||
2308 | free_hash_entry(hash, entry); | ||
2309 | } else { | ||
2310 | /* Do nothing if it exists */ | ||
2311 | if (entry) | ||
2312 | return 0; | ||
2313 | |||
2314 | ret = add_hash_entry(hash, rec->ip); | ||
2315 | } | ||
2316 | return ret; | ||
2317 | } | ||
2318 | |||
2319 | static int | ||
2320 | ftrace_match_record(struct dyn_ftrace *rec, char *mod, | ||
2321 | char *regex, int len, int type) | ||
1765 | { | 2322 | { |
1766 | char str[KSYM_SYMBOL_LEN]; | 2323 | char str[KSYM_SYMBOL_LEN]; |
2324 | char *modname; | ||
2325 | |||
2326 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
2327 | |||
2328 | if (mod) { | ||
2329 | /* module lookup requires matching the module */ | ||
2330 | if (!modname || strcmp(modname, mod)) | ||
2331 | return 0; | ||
2332 | |||
2333 | /* blank search means to match all funcs in the mod */ | ||
2334 | if (!len) | ||
2335 | return 1; | ||
2336 | } | ||
1767 | 2337 | ||
1768 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | ||
1769 | return ftrace_match(str, regex, len, type); | 2338 | return ftrace_match(str, regex, len, type); |
1770 | } | 2339 | } |
1771 | 2340 | ||
1772 | static int ftrace_match_records(char *buff, int len, int enable) | 2341 | static int |
2342 | match_records(struct ftrace_hash *hash, char *buff, | ||
2343 | int len, char *mod, int not) | ||
1773 | { | 2344 | { |
1774 | unsigned int search_len; | 2345 | unsigned search_len = 0; |
1775 | struct ftrace_page *pg; | 2346 | struct ftrace_page *pg; |
1776 | struct dyn_ftrace *rec; | 2347 | struct dyn_ftrace *rec; |
1777 | unsigned long flag; | 2348 | int type = MATCH_FULL; |
1778 | char *search; | 2349 | char *search = buff; |
1779 | int type; | ||
1780 | int not; | ||
1781 | int found = 0; | 2350 | int found = 0; |
2351 | int ret; | ||
1782 | 2352 | ||
1783 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | 2353 | if (len) { |
1784 | type = filter_parse_regex(buff, len, &search, ¬); | 2354 | type = filter_parse_regex(buff, len, &search, ¬); |
1785 | 2355 | search_len = strlen(search); | |
1786 | search_len = strlen(search); | 2356 | } |
1787 | 2357 | ||
1788 | mutex_lock(&ftrace_lock); | 2358 | mutex_lock(&ftrace_lock); |
1789 | do_for_each_ftrace_rec(pg, rec) { | ||
1790 | 2359 | ||
1791 | if (rec->flags & FTRACE_FL_FAILED) | 2360 | if (unlikely(ftrace_disabled)) |
1792 | continue; | 2361 | goto out_unlock; |
1793 | 2362 | ||
1794 | if (ftrace_match_record(rec, search, search_len, type)) { | 2363 | do_for_each_ftrace_rec(pg, rec) { |
1795 | if (not) | 2364 | |
1796 | rec->flags &= ~flag; | 2365 | if (ftrace_match_record(rec, mod, search, search_len, type)) { |
1797 | else | 2366 | ret = enter_record(hash, rec, not); |
1798 | rec->flags |= flag; | 2367 | if (ret < 0) { |
2368 | found = ret; | ||
2369 | goto out_unlock; | ||
2370 | } | ||
1799 | found = 1; | 2371 | found = 1; |
1800 | } | 2372 | } |
1801 | /* | ||
1802 | * Only enable filtering if we have a function that | ||
1803 | * is filtered on. | ||
1804 | */ | ||
1805 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
1806 | ftrace_filtered = 1; | ||
1807 | } while_for_each_ftrace_rec(); | 2373 | } while_for_each_ftrace_rec(); |
2374 | out_unlock: | ||
1808 | mutex_unlock(&ftrace_lock); | 2375 | mutex_unlock(&ftrace_lock); |
1809 | 2376 | ||
1810 | return found; | 2377 | return found; |
1811 | } | 2378 | } |
1812 | 2379 | ||
1813 | static int | 2380 | static int |
1814 | ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, | 2381 | ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) |
1815 | char *regex, int len, int type) | ||
1816 | { | 2382 | { |
1817 | char str[KSYM_SYMBOL_LEN]; | 2383 | return match_records(hash, buff, len, NULL, 0); |
1818 | char *modname; | ||
1819 | |||
1820 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
1821 | |||
1822 | if (!modname || strcmp(modname, mod)) | ||
1823 | return 0; | ||
1824 | |||
1825 | /* blank search means to match all funcs in the mod */ | ||
1826 | if (len) | ||
1827 | return ftrace_match(str, regex, len, type); | ||
1828 | else | ||
1829 | return 1; | ||
1830 | } | 2384 | } |
1831 | 2385 | ||
1832 | static int ftrace_match_module_records(char *buff, char *mod, int enable) | 2386 | static int |
2387 | ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) | ||
1833 | { | 2388 | { |
1834 | unsigned search_len = 0; | ||
1835 | struct ftrace_page *pg; | ||
1836 | struct dyn_ftrace *rec; | ||
1837 | int type = MATCH_FULL; | ||
1838 | char *search = buff; | ||
1839 | unsigned long flag; | ||
1840 | int not = 0; | 2389 | int not = 0; |
1841 | int found = 0; | ||
1842 | |||
1843 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
1844 | 2390 | ||
1845 | /* blank or '*' mean the same */ | 2391 | /* blank or '*' mean the same */ |
1846 | if (strcmp(buff, "*") == 0) | 2392 | if (strcmp(buff, "*") == 0) |
@@ -1852,32 +2398,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
1852 | not = 1; | 2398 | not = 1; |
1853 | } | 2399 | } |
1854 | 2400 | ||
1855 | if (strlen(buff)) { | 2401 | return match_records(hash, buff, strlen(buff), mod, not); |
1856 | type = filter_parse_regex(buff, strlen(buff), &search, ¬); | ||
1857 | search_len = strlen(search); | ||
1858 | } | ||
1859 | |||
1860 | mutex_lock(&ftrace_lock); | ||
1861 | do_for_each_ftrace_rec(pg, rec) { | ||
1862 | |||
1863 | if (rec->flags & FTRACE_FL_FAILED) | ||
1864 | continue; | ||
1865 | |||
1866 | if (ftrace_match_module_record(rec, mod, | ||
1867 | search, search_len, type)) { | ||
1868 | if (not) | ||
1869 | rec->flags &= ~flag; | ||
1870 | else | ||
1871 | rec->flags |= flag; | ||
1872 | found = 1; | ||
1873 | } | ||
1874 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
1875 | ftrace_filtered = 1; | ||
1876 | |||
1877 | } while_for_each_ftrace_rec(); | ||
1878 | mutex_unlock(&ftrace_lock); | ||
1879 | |||
1880 | return found; | ||
1881 | } | 2402 | } |
1882 | 2403 | ||
1883 | /* | 2404 | /* |
@@ -1888,7 +2409,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
1888 | static int | 2409 | static int |
1889 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | 2410 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) |
1890 | { | 2411 | { |
2412 | struct ftrace_ops *ops = &global_ops; | ||
2413 | struct ftrace_hash *hash; | ||
1891 | char *mod; | 2414 | char *mod; |
2415 | int ret = -EINVAL; | ||
1892 | 2416 | ||
1893 | /* | 2417 | /* |
1894 | * cmd == 'mod' because we only registered this func | 2418 | * cmd == 'mod' because we only registered this func |
@@ -1900,15 +2424,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | |||
1900 | 2424 | ||
1901 | /* we must have a module name */ | 2425 | /* we must have a module name */ |
1902 | if (!param) | 2426 | if (!param) |
1903 | return -EINVAL; | 2427 | return ret; |
1904 | 2428 | ||
1905 | mod = strsep(¶m, ":"); | 2429 | mod = strsep(¶m, ":"); |
1906 | if (!strlen(mod)) | 2430 | if (!strlen(mod)) |
1907 | return -EINVAL; | 2431 | return ret; |
1908 | 2432 | ||
1909 | if (ftrace_match_module_records(func, mod, enable)) | 2433 | if (enable) |
1910 | return 0; | 2434 | hash = ops->filter_hash; |
1911 | return -EINVAL; | 2435 | else |
2436 | hash = ops->notrace_hash; | ||
2437 | |||
2438 | ret = ftrace_match_module_records(hash, func, mod); | ||
2439 | if (!ret) | ||
2440 | ret = -EINVAL; | ||
2441 | if (ret < 0) | ||
2442 | return ret; | ||
2443 | |||
2444 | return 0; | ||
1912 | } | 2445 | } |
1913 | 2446 | ||
1914 | static struct ftrace_func_command ftrace_mod_cmd = { | 2447 | static struct ftrace_func_command ftrace_mod_cmd = { |
@@ -1959,6 +2492,7 @@ static int ftrace_probe_registered; | |||
1959 | 2492 | ||
1960 | static void __enable_ftrace_function_probe(void) | 2493 | static void __enable_ftrace_function_probe(void) |
1961 | { | 2494 | { |
2495 | int ret; | ||
1962 | int i; | 2496 | int i; |
1963 | 2497 | ||
1964 | if (ftrace_probe_registered) | 2498 | if (ftrace_probe_registered) |
@@ -1973,13 +2507,16 @@ static void __enable_ftrace_function_probe(void) | |||
1973 | if (i == FTRACE_FUNC_HASHSIZE) | 2507 | if (i == FTRACE_FUNC_HASHSIZE) |
1974 | return; | 2508 | return; |
1975 | 2509 | ||
1976 | __register_ftrace_function(&trace_probe_ops); | 2510 | ret = __register_ftrace_function(&trace_probe_ops); |
1977 | ftrace_startup(0); | 2511 | if (!ret) |
2512 | ret = ftrace_startup(&trace_probe_ops, 0); | ||
2513 | |||
1978 | ftrace_probe_registered = 1; | 2514 | ftrace_probe_registered = 1; |
1979 | } | 2515 | } |
1980 | 2516 | ||
1981 | static void __disable_ftrace_function_probe(void) | 2517 | static void __disable_ftrace_function_probe(void) |
1982 | { | 2518 | { |
2519 | int ret; | ||
1983 | int i; | 2520 | int i; |
1984 | 2521 | ||
1985 | if (!ftrace_probe_registered) | 2522 | if (!ftrace_probe_registered) |
@@ -1992,8 +2529,10 @@ static void __disable_ftrace_function_probe(void) | |||
1992 | } | 2529 | } |
1993 | 2530 | ||
1994 | /* no more funcs left */ | 2531 | /* no more funcs left */ |
1995 | __unregister_ftrace_function(&trace_probe_ops); | 2532 | ret = __unregister_ftrace_function(&trace_probe_ops); |
1996 | ftrace_shutdown(0); | 2533 | if (!ret) |
2534 | ftrace_shutdown(&trace_probe_ops, 0); | ||
2535 | |||
1997 | ftrace_probe_registered = 0; | 2536 | ftrace_probe_registered = 0; |
1998 | } | 2537 | } |
1999 | 2538 | ||
@@ -2029,12 +2568,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
2029 | return -EINVAL; | 2568 | return -EINVAL; |
2030 | 2569 | ||
2031 | mutex_lock(&ftrace_lock); | 2570 | mutex_lock(&ftrace_lock); |
2032 | do_for_each_ftrace_rec(pg, rec) { | ||
2033 | 2571 | ||
2034 | if (rec->flags & FTRACE_FL_FAILED) | 2572 | if (unlikely(ftrace_disabled)) |
2035 | continue; | 2573 | goto out_unlock; |
2574 | |||
2575 | do_for_each_ftrace_rec(pg, rec) { | ||
2036 | 2576 | ||
2037 | if (!ftrace_match_record(rec, search, len, type)) | 2577 | if (!ftrace_match_record(rec, NULL, search, len, type)) |
2038 | continue; | 2578 | continue; |
2039 | 2579 | ||
2040 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | 2580 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); |
@@ -2195,7 +2735,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) | |||
2195 | return ret; | 2735 | return ret; |
2196 | } | 2736 | } |
2197 | 2737 | ||
2198 | static int ftrace_process_regex(char *buff, int len, int enable) | 2738 | static int ftrace_process_regex(struct ftrace_hash *hash, |
2739 | char *buff, int len, int enable) | ||
2199 | { | 2740 | { |
2200 | char *func, *command, *next = buff; | 2741 | char *func, *command, *next = buff; |
2201 | struct ftrace_func_command *p; | 2742 | struct ftrace_func_command *p; |
@@ -2204,9 +2745,12 @@ static int ftrace_process_regex(char *buff, int len, int enable) | |||
2204 | func = strsep(&next, ":"); | 2745 | func = strsep(&next, ":"); |
2205 | 2746 | ||
2206 | if (!next) { | 2747 | if (!next) { |
2207 | if (ftrace_match_records(func, len, enable)) | 2748 | ret = ftrace_match_records(hash, func, len); |
2208 | return 0; | 2749 | if (!ret) |
2209 | return ret; | 2750 | ret = -EINVAL; |
2751 | if (ret < 0) | ||
2752 | return ret; | ||
2753 | return 0; | ||
2210 | } | 2754 | } |
2211 | 2755 | ||
2212 | /* command found */ | 2756 | /* command found */ |
@@ -2239,6 +2783,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
2239 | 2783 | ||
2240 | mutex_lock(&ftrace_regex_lock); | 2784 | mutex_lock(&ftrace_regex_lock); |
2241 | 2785 | ||
2786 | ret = -ENODEV; | ||
2787 | if (unlikely(ftrace_disabled)) | ||
2788 | goto out_unlock; | ||
2789 | |||
2242 | if (file->f_mode & FMODE_READ) { | 2790 | if (file->f_mode & FMODE_READ) { |
2243 | struct seq_file *m = file->private_data; | 2791 | struct seq_file *m = file->private_data; |
2244 | iter = m->private; | 2792 | iter = m->private; |
@@ -2250,7 +2798,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
2250 | 2798 | ||
2251 | if (read >= 0 && trace_parser_loaded(parser) && | 2799 | if (read >= 0 && trace_parser_loaded(parser) && |
2252 | !trace_parser_cont(parser)) { | 2800 | !trace_parser_cont(parser)) { |
2253 | ret = ftrace_process_regex(parser->buffer, | 2801 | ret = ftrace_process_regex(iter->hash, parser->buffer, |
2254 | parser->idx, enable); | 2802 | parser->idx, enable); |
2255 | trace_parser_clear(parser); | 2803 | trace_parser_clear(parser); |
2256 | if (ret) | 2804 | if (ret) |
@@ -2278,22 +2826,83 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, | |||
2278 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); | 2826 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); |
2279 | } | 2827 | } |
2280 | 2828 | ||
2281 | static void | 2829 | static int |
2282 | ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | 2830 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, |
2831 | int reset, int enable) | ||
2283 | { | 2832 | { |
2833 | struct ftrace_hash **orig_hash; | ||
2834 | struct ftrace_hash *hash; | ||
2835 | int ret; | ||
2836 | |||
2837 | /* All global ops uses the global ops filters */ | ||
2838 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) | ||
2839 | ops = &global_ops; | ||
2840 | |||
2284 | if (unlikely(ftrace_disabled)) | 2841 | if (unlikely(ftrace_disabled)) |
2285 | return; | 2842 | return -ENODEV; |
2843 | |||
2844 | if (enable) | ||
2845 | orig_hash = &ops->filter_hash; | ||
2846 | else | ||
2847 | orig_hash = &ops->notrace_hash; | ||
2848 | |||
2849 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | ||
2850 | if (!hash) | ||
2851 | return -ENOMEM; | ||
2286 | 2852 | ||
2287 | mutex_lock(&ftrace_regex_lock); | 2853 | mutex_lock(&ftrace_regex_lock); |
2288 | if (reset) | 2854 | if (reset) |
2289 | ftrace_filter_reset(enable); | 2855 | ftrace_filter_reset(hash); |
2290 | if (buf) | 2856 | if (buf) |
2291 | ftrace_match_records(buf, len, enable); | 2857 | ftrace_match_records(hash, buf, len); |
2858 | |||
2859 | mutex_lock(&ftrace_lock); | ||
2860 | ret = ftrace_hash_move(orig_hash, hash); | ||
2861 | mutex_unlock(&ftrace_lock); | ||
2862 | |||
2292 | mutex_unlock(&ftrace_regex_lock); | 2863 | mutex_unlock(&ftrace_regex_lock); |
2864 | |||
2865 | free_ftrace_hash(hash); | ||
2866 | return ret; | ||
2867 | } | ||
2868 | |||
2869 | /** | ||
2870 | * ftrace_set_filter - set a function to filter on in ftrace | ||
2871 | * @ops - the ops to set the filter with | ||
2872 | * @buf - the string that holds the function filter text. | ||
2873 | * @len - the length of the string. | ||
2874 | * @reset - non zero to reset all filters before applying this filter. | ||
2875 | * | ||
2876 | * Filters denote which functions should be enabled when tracing is enabled. | ||
2877 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | ||
2878 | */ | ||
2879 | void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, | ||
2880 | int len, int reset) | ||
2881 | { | ||
2882 | ftrace_set_regex(ops, buf, len, reset, 1); | ||
2293 | } | 2883 | } |
2884 | EXPORT_SYMBOL_GPL(ftrace_set_filter); | ||
2294 | 2885 | ||
2295 | /** | 2886 | /** |
2887 | * ftrace_set_notrace - set a function to not trace in ftrace | ||
2888 | * @ops - the ops to set the notrace filter with | ||
2889 | * @buf - the string that holds the function notrace text. | ||
2890 | * @len - the length of the string. | ||
2891 | * @reset - non zero to reset all filters before applying this filter. | ||
2892 | * | ||
2893 | * Notrace Filters denote which functions should not be enabled when tracing | ||
2894 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | ||
2895 | * for tracing. | ||
2896 | */ | ||
2897 | void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, | ||
2898 | int len, int reset) | ||
2899 | { | ||
2900 | ftrace_set_regex(ops, buf, len, reset, 0); | ||
2901 | } | ||
2902 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | ||
2903 | /** | ||
2296 | * ftrace_set_filter - set a function to filter on in ftrace | 2904 | * ftrace_set_filter - set a function to filter on in ftrace |
2905 | * @ops - the ops to set the filter with | ||
2297 | * @buf - the string that holds the function filter text. | 2906 | * @buf - the string that holds the function filter text. |
2298 | * @len - the length of the string. | 2907 | * @len - the length of the string. |
2299 | * @reset - non zero to reset all filters before applying this filter. | 2908 | * @reset - non zero to reset all filters before applying this filter. |
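[Editor's aside] The per-ops filter API exported above replaces the old single global filter set. A minimal, hedged sketch of how a caller outside this file might use it; my_ops, my_callback and my_tracer_init are hypothetical names and error handling is trimmed:

#include <linux/ftrace.h>
#include <linux/module.h>

/* hypothetical callback: runs for every function that passes my_ops' filter hash */
static void my_callback(unsigned long ip, unsigned long parent_ip)
{
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int __init my_tracer_init(void)
{
	int ret;

	/* trace only "schedule"; reset=1 clears any previously set filter */
	ftrace_set_filter(&my_ops, (unsigned char *)"schedule", 8, 1);

	ret = register_ftrace_function(&my_ops);
	if (ret)
		pr_warn("my_tracer: register_ftrace_function failed: %d\n", ret);
	return ret;
}

Both ftrace_set_filter() and register_ftrace_function() gain EXPORT_SYMBOL_GPL in this patch, which is what makes a module-side caller like this sketch possible.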
@@ -2301,13 +2910,15 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | |||
2301 | * Filters denote which functions should be enabled when tracing is enabled. | 2910 | * Filters denote which functions should be enabled when tracing is enabled. |
2302 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | 2911 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. |
2303 | */ | 2912 | */ |
2304 | void ftrace_set_filter(unsigned char *buf, int len, int reset) | 2913 | void ftrace_set_global_filter(unsigned char *buf, int len, int reset) |
2305 | { | 2914 | { |
2306 | ftrace_set_regex(buf, len, reset, 1); | 2915 | ftrace_set_regex(&global_ops, buf, len, reset, 1); |
2307 | } | 2916 | } |
2917 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); | ||
2308 | 2918 | ||
2309 | /** | 2919 | /** |
2310 | * ftrace_set_notrace - set a function to not trace in ftrace | 2920 | * ftrace_set_notrace - set a function to not trace in ftrace |
2921 | * @ops - the ops to set the notrace filter with | ||
2311 | * @buf - the string that holds the function notrace text. | 2922 | * @buf - the string that holds the function notrace text. |
2312 | * @len - the length of the string. | 2923 | * @len - the length of the string. |
2313 | * @reset - non zero to reset all filters before applying this filter. | 2924 | * @reset - non zero to reset all filters before applying this filter. |
@@ -2316,10 +2927,11 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) | |||
2316 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | 2927 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled |
2317 | * for tracing. | 2928 | * for tracing. |
2318 | */ | 2929 | */ |
2319 | void ftrace_set_notrace(unsigned char *buf, int len, int reset) | 2930 | void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) |
2320 | { | 2931 | { |
2321 | ftrace_set_regex(buf, len, reset, 0); | 2932 | ftrace_set_regex(&global_ops, buf, len, reset, 0); |
2322 | } | 2933 | } |
2934 | EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); | ||
2323 | 2935 | ||
2324 | /* | 2936 | /* |
2325 | * command line interface to allow users to set filters on boot up. | 2937 | * command line interface to allow users to set filters on boot up. |
@@ -2370,22 +2982,23 @@ static void __init set_ftrace_early_graph(char *buf) | |||
2370 | } | 2982 | } |
2371 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 2983 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
2372 | 2984 | ||
2373 | static void __init set_ftrace_early_filter(char *buf, int enable) | 2985 | static void __init |
2986 | set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) | ||
2374 | { | 2987 | { |
2375 | char *func; | 2988 | char *func; |
2376 | 2989 | ||
2377 | while (buf) { | 2990 | while (buf) { |
2378 | func = strsep(&buf, ","); | 2991 | func = strsep(&buf, ","); |
2379 | ftrace_set_regex(func, strlen(func), 0, enable); | 2992 | ftrace_set_regex(ops, func, strlen(func), 0, enable); |
2380 | } | 2993 | } |
2381 | } | 2994 | } |
2382 | 2995 | ||
2383 | static void __init set_ftrace_early_filters(void) | 2996 | static void __init set_ftrace_early_filters(void) |
2384 | { | 2997 | { |
2385 | if (ftrace_filter_buf[0]) | 2998 | if (ftrace_filter_buf[0]) |
2386 | set_ftrace_early_filter(ftrace_filter_buf, 1); | 2999 | set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); |
2387 | if (ftrace_notrace_buf[0]) | 3000 | if (ftrace_notrace_buf[0]) |
2388 | set_ftrace_early_filter(ftrace_notrace_buf, 0); | 3001 | set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); |
2389 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3002 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
2390 | if (ftrace_graph_buf[0]) | 3003 | if (ftrace_graph_buf[0]) |
2391 | set_ftrace_early_graph(ftrace_graph_buf); | 3004 | set_ftrace_early_graph(ftrace_graph_buf); |
@@ -2393,11 +3006,14 @@ static void __init set_ftrace_early_filters(void) | |||
2393 | } | 3006 | } |
2394 | 3007 | ||
2395 | static int | 3008 | static int |
2396 | ftrace_regex_release(struct inode *inode, struct file *file, int enable) | 3009 | ftrace_regex_release(struct inode *inode, struct file *file) |
2397 | { | 3010 | { |
2398 | struct seq_file *m = (struct seq_file *)file->private_data; | 3011 | struct seq_file *m = (struct seq_file *)file->private_data; |
2399 | struct ftrace_iterator *iter; | 3012 | struct ftrace_iterator *iter; |
3013 | struct ftrace_hash **orig_hash; | ||
2400 | struct trace_parser *parser; | 3014 | struct trace_parser *parser; |
3015 | int filter_hash; | ||
3016 | int ret; | ||
2401 | 3017 | ||
2402 | mutex_lock(&ftrace_regex_lock); | 3018 | mutex_lock(&ftrace_regex_lock); |
2403 | if (file->f_mode & FMODE_READ) { | 3019 | if (file->f_mode & FMODE_READ) { |
@@ -2410,33 +3026,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) | |||
2410 | parser = &iter->parser; | 3026 | parser = &iter->parser; |
2411 | if (trace_parser_loaded(parser)) { | 3027 | if (trace_parser_loaded(parser)) { |
2412 | parser->buffer[parser->idx] = 0; | 3028 | parser->buffer[parser->idx] = 0; |
2413 | ftrace_match_records(parser->buffer, parser->idx, enable); | 3029 | ftrace_match_records(iter->hash, parser->buffer, parser->idx); |
2414 | } | 3030 | } |
2415 | 3031 | ||
2416 | mutex_lock(&ftrace_lock); | ||
2417 | if (ftrace_start_up && ftrace_enabled) | ||
2418 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
2419 | mutex_unlock(&ftrace_lock); | ||
2420 | |||
2421 | trace_parser_put(parser); | 3032 | trace_parser_put(parser); |
3033 | |||
3034 | if (file->f_mode & FMODE_WRITE) { | ||
3035 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | ||
3036 | |||
3037 | if (filter_hash) | ||
3038 | orig_hash = &iter->ops->filter_hash; | ||
3039 | else | ||
3040 | orig_hash = &iter->ops->notrace_hash; | ||
3041 | |||
3042 | mutex_lock(&ftrace_lock); | ||
3043 | /* | ||
3044 | * Remove the current set, update the hash and add | ||
3045 | * them back. | ||
3046 | */ | ||
3047 | ftrace_hash_rec_disable(iter->ops, filter_hash); | ||
3048 | ret = ftrace_hash_move(orig_hash, iter->hash); | ||
3049 | if (!ret) { | ||
3050 | ftrace_hash_rec_enable(iter->ops, filter_hash); | ||
3051 | if (iter->ops->flags & FTRACE_OPS_FL_ENABLED | ||
3052 | && ftrace_enabled) | ||
3053 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
3054 | } | ||
3055 | mutex_unlock(&ftrace_lock); | ||
3056 | } | ||
3057 | free_ftrace_hash(iter->hash); | ||
2422 | kfree(iter); | 3058 | kfree(iter); |
2423 | 3059 | ||
2424 | mutex_unlock(&ftrace_regex_lock); | 3060 | mutex_unlock(&ftrace_regex_lock); |
2425 | return 0; | 3061 | return 0; |
2426 | } | 3062 | } |
2427 | 3063 | ||
2428 | static int | ||
2429 | ftrace_filter_release(struct inode *inode, struct file *file) | ||
2430 | { | ||
2431 | return ftrace_regex_release(inode, file, 1); | ||
2432 | } | ||
2433 | |||
2434 | static int | ||
2435 | ftrace_notrace_release(struct inode *inode, struct file *file) | ||
2436 | { | ||
2437 | return ftrace_regex_release(inode, file, 0); | ||
2438 | } | ||
2439 | |||
2440 | static const struct file_operations ftrace_avail_fops = { | 3064 | static const struct file_operations ftrace_avail_fops = { |
2441 | .open = ftrace_avail_open, | 3065 | .open = ftrace_avail_open, |
2442 | .read = seq_read, | 3066 | .read = seq_read, |
@@ -2444,8 +3068,8 @@ static const struct file_operations ftrace_avail_fops = { | |||
2444 | .release = seq_release_private, | 3068 | .release = seq_release_private, |
2445 | }; | 3069 | }; |
2446 | 3070 | ||
2447 | static const struct file_operations ftrace_failures_fops = { | 3071 | static const struct file_operations ftrace_enabled_fops = { |
2448 | .open = ftrace_failures_open, | 3072 | .open = ftrace_enabled_open, |
2449 | .read = seq_read, | 3073 | .read = seq_read, |
2450 | .llseek = seq_lseek, | 3074 | .llseek = seq_lseek, |
2451 | .release = seq_release_private, | 3075 | .release = seq_release_private, |
@@ -2456,7 +3080,7 @@ static const struct file_operations ftrace_filter_fops = { | |||
2456 | .read = seq_read, | 3080 | .read = seq_read, |
2457 | .write = ftrace_filter_write, | 3081 | .write = ftrace_filter_write, |
2458 | .llseek = ftrace_regex_lseek, | 3082 | .llseek = ftrace_regex_lseek, |
2459 | .release = ftrace_filter_release, | 3083 | .release = ftrace_regex_release, |
2460 | }; | 3084 | }; |
2461 | 3085 | ||
2462 | static const struct file_operations ftrace_notrace_fops = { | 3086 | static const struct file_operations ftrace_notrace_fops = { |
@@ -2464,7 +3088,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
2464 | .read = seq_read, | 3088 | .read = seq_read, |
2465 | .write = ftrace_notrace_write, | 3089 | .write = ftrace_notrace_write, |
2466 | .llseek = ftrace_regex_lseek, | 3090 | .llseek = ftrace_regex_lseek, |
2467 | .release = ftrace_notrace_release, | 3091 | .release = ftrace_regex_release, |
2468 | }; | 3092 | }; |
2469 | 3093 | ||
2470 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3094 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
@@ -2573,9 +3197,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2573 | bool exists; | 3197 | bool exists; |
2574 | int i; | 3198 | int i; |
2575 | 3199 | ||
2576 | if (ftrace_disabled) | ||
2577 | return -ENODEV; | ||
2578 | |||
2579 | /* decode regex */ | 3200 | /* decode regex */ |
2580 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); | 3201 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); |
2581 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) | 3202 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) |
@@ -2584,12 +3205,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2584 | search_len = strlen(search); | 3205 | search_len = strlen(search); |
2585 | 3206 | ||
2586 | mutex_lock(&ftrace_lock); | 3207 | mutex_lock(&ftrace_lock); |
3208 | |||
3209 | if (unlikely(ftrace_disabled)) { | ||
3210 | mutex_unlock(&ftrace_lock); | ||
3211 | return -ENODEV; | ||
3212 | } | ||
3213 | |||
2587 | do_for_each_ftrace_rec(pg, rec) { | 3214 | do_for_each_ftrace_rec(pg, rec) { |
2588 | 3215 | ||
2589 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) | 3216 | if (rec->flags & FTRACE_FL_FREE) |
2590 | continue; | 3217 | continue; |
2591 | 3218 | ||
2592 | if (ftrace_match_record(rec, search, search_len, type)) { | 3219 | if (ftrace_match_record(rec, NULL, search, search_len, type)) { |
2593 | /* if it is in the array */ | 3220 | /* if it is in the array */ |
2594 | exists = false; | 3221 | exists = false; |
2595 | for (i = 0; i < *idx; i++) { | 3222 | for (i = 0; i < *idx; i++) { |
@@ -2679,8 +3306,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
2679 | trace_create_file("available_filter_functions", 0444, | 3306 | trace_create_file("available_filter_functions", 0444, |
2680 | d_tracer, NULL, &ftrace_avail_fops); | 3307 | d_tracer, NULL, &ftrace_avail_fops); |
2681 | 3308 | ||
2682 | trace_create_file("failures", 0444, | 3309 | trace_create_file("enabled_functions", 0444, |
2683 | d_tracer, NULL, &ftrace_failures_fops); | 3310 | d_tracer, NULL, &ftrace_enabled_fops); |
2684 | 3311 | ||
2685 | trace_create_file("set_ftrace_filter", 0644, d_tracer, | 3312 | trace_create_file("set_ftrace_filter", 0644, d_tracer, |
2686 | NULL, &ftrace_filter_fops); | 3313 | NULL, &ftrace_filter_fops); |
@@ -2720,7 +3347,10 @@ static int ftrace_process_locs(struct module *mod, | |||
2720 | ftrace_record_ip(addr); | 3347 | ftrace_record_ip(addr); |
2721 | } | 3348 | } |
2722 | 3349 | ||
2723 | /* disable interrupts to prevent kstop machine */ | 3350 | /* |
3351 | * Disable interrupts to prevent interrupts from executing | ||
3352 | * code that is being modified. | ||
3353 | */ | ||
2724 | local_irq_save(flags); | 3354 | local_irq_save(flags); |
2725 | ftrace_update_code(mod); | 3355 | ftrace_update_code(mod); |
2726 | local_irq_restore(flags); | 3356 | local_irq_restore(flags); |
@@ -2735,10 +3365,11 @@ void ftrace_release_mod(struct module *mod) | |||
2735 | struct dyn_ftrace *rec; | 3365 | struct dyn_ftrace *rec; |
2736 | struct ftrace_page *pg; | 3366 | struct ftrace_page *pg; |
2737 | 3367 | ||
3368 | mutex_lock(&ftrace_lock); | ||
3369 | |||
2738 | if (ftrace_disabled) | 3370 | if (ftrace_disabled) |
2739 | return; | 3371 | goto out_unlock; |
2740 | 3372 | ||
2741 | mutex_lock(&ftrace_lock); | ||
2742 | do_for_each_ftrace_rec(pg, rec) { | 3373 | do_for_each_ftrace_rec(pg, rec) { |
2743 | if (within_module_core(rec->ip, mod)) { | 3374 | if (within_module_core(rec->ip, mod)) { |
2744 | /* | 3375 | /* |
@@ -2749,6 +3380,7 @@ void ftrace_release_mod(struct module *mod) | |||
2749 | ftrace_free_rec(rec); | 3380 | ftrace_free_rec(rec); |
2750 | } | 3381 | } |
2751 | } while_for_each_ftrace_rec(); | 3382 | } while_for_each_ftrace_rec(); |
3383 | out_unlock: | ||
2752 | mutex_unlock(&ftrace_lock); | 3384 | mutex_unlock(&ftrace_lock); |
2753 | } | 3385 | } |
2754 | 3386 | ||
@@ -2835,6 +3467,10 @@ void __init ftrace_init(void) | |||
2835 | 3467 | ||
2836 | #else | 3468 | #else |
2837 | 3469 | ||
3470 | static struct ftrace_ops global_ops = { | ||
3471 | .func = ftrace_stub, | ||
3472 | }; | ||
3473 | |||
2838 | static int __init ftrace_nodyn_init(void) | 3474 | static int __init ftrace_nodyn_init(void) |
2839 | { | 3475 | { |
2840 | ftrace_enabled = 1; | 3476 | ftrace_enabled = 1; |
@@ -2845,12 +3481,47 @@ device_initcall(ftrace_nodyn_init); | |||
2845 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 3481 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
2846 | static inline void ftrace_startup_enable(int command) { } | 3482 | static inline void ftrace_startup_enable(int command) { } |
2847 | /* Keep as macros so we do not need to define the commands */ | 3483 | /* Keep as macros so we do not need to define the commands */ |
2848 | # define ftrace_startup(command) do { } while (0) | 3484 | # define ftrace_startup(ops, command) \ |
2849 | # define ftrace_shutdown(command) do { } while (0) | 3485 | ({ \ |
3486 | (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ | ||
3487 | 0; \ | ||
3488 | }) | ||
3489 | # define ftrace_shutdown(ops, command) do { } while (0) | ||
2850 | # define ftrace_startup_sysctl() do { } while (0) | 3490 | # define ftrace_startup_sysctl() do { } while (0) |
2851 | # define ftrace_shutdown_sysctl() do { } while (0) | 3491 | # define ftrace_shutdown_sysctl() do { } while (0) |
3492 | |||
3493 | static inline int | ||
3494 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
3495 | { | ||
3496 | return 1; | ||
3497 | } | ||
3498 | |||
2852 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 3499 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
2853 | 3500 | ||
3501 | static void | ||
3502 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | ||
3503 | { | ||
3504 | struct ftrace_ops *op; | ||
3505 | |||
3506 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) | ||
3507 | return; | ||
3508 | |||
3509 | trace_recursion_set(TRACE_INTERNAL_BIT); | ||
3510 | /* | ||
3511 | * Some of the ops may be dynamically allocated, | ||
3512 | * they must be freed after a synchronize_sched(). | ||
3513 | */ | ||
3514 | preempt_disable_notrace(); | ||
3515 | op = rcu_dereference_raw(ftrace_ops_list); | ||
3516 | while (op != &ftrace_list_end) { | ||
3517 | if (ftrace_ops_test(op, ip)) | ||
3518 | op->func(ip, parent_ip); | ||
3519 | op = rcu_dereference_raw(op->next); | ||
3520 | }; | ||
3521 | preempt_enable_notrace(); | ||
3522 | trace_recursion_clear(TRACE_INTERNAL_BIT); | ||
3523 | } | ||
3524 | |||
2854 | static void clear_ftrace_swapper(void) | 3525 | static void clear_ftrace_swapper(void) |
2855 | { | 3526 | { |
2856 | struct task_struct *p; | 3527 | struct task_struct *p; |
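[Editor's aside] ftrace_ops_list_func() above is the new multiplexer used when several ftrace_ops are registered at once: it walks an RCU-published list with preemption disabled, which is what allows dynamically allocated ops to be freed only after synchronize_sched(), and it latches TRACE_INTERNAL_BIT so the walk cannot recurse into itself. The list-walk pattern in isolation, as a hedged sketch with hypothetical names (my_cb, my_cb_list, my_dispatch):

struct my_cb {
	struct my_cb *next;
	void (*func)(unsigned long ip, unsigned long parent_ip);
};

static struct my_cb *my_cb_list;	/* writers publish with rcu_assign_pointer() */

static void my_dispatch(unsigned long ip, unsigned long parent_ip)
{
	struct my_cb *cb;

	/* disabling preemption pins an RCU-sched grace period */
	preempt_disable_notrace();
	for (cb = rcu_dereference_raw(my_cb_list); cb;
	     cb = rcu_dereference_raw(cb->next))
		cb->func(ip, parent_ip);
	preempt_enable_notrace();
}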
@@ -3143,19 +3814,23 @@ void ftrace_kill(void) | |||
3143 | */ | 3814 | */ |
3144 | int register_ftrace_function(struct ftrace_ops *ops) | 3815 | int register_ftrace_function(struct ftrace_ops *ops) |
3145 | { | 3816 | { |
3146 | int ret; | 3817 | int ret = -1; |
3147 | |||
3148 | if (unlikely(ftrace_disabled)) | ||
3149 | return -1; | ||
3150 | 3818 | ||
3151 | mutex_lock(&ftrace_lock); | 3819 | mutex_lock(&ftrace_lock); |
3152 | 3820 | ||
3821 | if (unlikely(ftrace_disabled)) | ||
3822 | goto out_unlock; | ||
3823 | |||
3153 | ret = __register_ftrace_function(ops); | 3824 | ret = __register_ftrace_function(ops); |
3154 | ftrace_startup(0); | 3825 | if (!ret) |
3826 | ret = ftrace_startup(ops, 0); | ||
3155 | 3827 | ||
3828 | |||
3829 | out_unlock: | ||
3156 | mutex_unlock(&ftrace_lock); | 3830 | mutex_unlock(&ftrace_lock); |
3157 | return ret; | 3831 | return ret; |
3158 | } | 3832 | } |
3833 | EXPORT_SYMBOL_GPL(register_ftrace_function); | ||
3159 | 3834 | ||
3160 | /** | 3835 | /** |
3161 | * unregister_ftrace_function - unregister a function for profiling. | 3836 | * unregister_ftrace_function - unregister a function for profiling. |
@@ -3169,25 +3844,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops) | |||
3169 | 3844 | ||
3170 | mutex_lock(&ftrace_lock); | 3845 | mutex_lock(&ftrace_lock); |
3171 | ret = __unregister_ftrace_function(ops); | 3846 | ret = __unregister_ftrace_function(ops); |
3172 | ftrace_shutdown(0); | 3847 | if (!ret) |
3848 | ftrace_shutdown(ops, 0); | ||
3173 | mutex_unlock(&ftrace_lock); | 3849 | mutex_unlock(&ftrace_lock); |
3174 | 3850 | ||
3175 | return ret; | 3851 | return ret; |
3176 | } | 3852 | } |
3853 | EXPORT_SYMBOL_GPL(unregister_ftrace_function); | ||
3177 | 3854 | ||
3178 | int | 3855 | int |
3179 | ftrace_enable_sysctl(struct ctl_table *table, int write, | 3856 | ftrace_enable_sysctl(struct ctl_table *table, int write, |
3180 | void __user *buffer, size_t *lenp, | 3857 | void __user *buffer, size_t *lenp, |
3181 | loff_t *ppos) | 3858 | loff_t *ppos) |
3182 | { | 3859 | { |
3183 | int ret; | 3860 | int ret = -ENODEV; |
3184 | |||
3185 | if (unlikely(ftrace_disabled)) | ||
3186 | return -ENODEV; | ||
3187 | 3861 | ||
3188 | mutex_lock(&ftrace_lock); | 3862 | mutex_lock(&ftrace_lock); |
3189 | 3863 | ||
3190 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 3864 | if (unlikely(ftrace_disabled)) |
3865 | goto out; | ||
3866 | |||
3867 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
3191 | 3868 | ||
3192 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) | 3869 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) |
3193 | goto out; | 3870 | goto out; |
@@ -3199,11 +3876,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
3199 | ftrace_startup_sysctl(); | 3876 | ftrace_startup_sysctl(); |
3200 | 3877 | ||
3201 | /* we are starting ftrace again */ | 3878 | /* we are starting ftrace again */ |
3202 | if (ftrace_list != &ftrace_list_end) { | 3879 | if (ftrace_ops_list != &ftrace_list_end) { |
3203 | if (ftrace_list->next == &ftrace_list_end) | 3880 | if (ftrace_ops_list->next == &ftrace_list_end) |
3204 | ftrace_trace_function = ftrace_list->func; | 3881 | ftrace_trace_function = ftrace_ops_list->func; |
3205 | else | 3882 | else |
3206 | ftrace_trace_function = ftrace_list_func; | 3883 | ftrace_trace_function = ftrace_ops_list_func; |
3207 | } | 3884 | } |
3208 | 3885 | ||
3209 | } else { | 3886 | } else { |
@@ -3392,7 +4069,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
3392 | ftrace_graph_return = retfunc; | 4069 | ftrace_graph_return = retfunc; |
3393 | ftrace_graph_entry = entryfunc; | 4070 | ftrace_graph_entry = entryfunc; |
3394 | 4071 | ||
3395 | ftrace_startup(FTRACE_START_FUNC_RET); | 4072 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); |
3396 | 4073 | ||
3397 | out: | 4074 | out: |
3398 | mutex_unlock(&ftrace_lock); | 4075 | mutex_unlock(&ftrace_lock); |
@@ -3409,7 +4086,7 @@ void unregister_ftrace_graph(void) | |||
3409 | ftrace_graph_active--; | 4086 | ftrace_graph_active--; |
3410 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 4087 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
3411 | ftrace_graph_entry = ftrace_graph_entry_stub; | 4088 | ftrace_graph_entry = ftrace_graph_entry_stub; |
3412 | ftrace_shutdown(FTRACE_STOP_FUNC_RET); | 4089 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); |
3413 | unregister_pm_notifier(&ftrace_suspend_notifier); | 4090 | unregister_pm_notifier(&ftrace_suspend_notifier); |
3414 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 4091 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
3415 | 4092 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0ef7b4b2a1f7..b0c7aa407943 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void) | |||
2216 | 2216 | ||
2217 | printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" | 2217 | printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" |
2218 | "HC[%lu]:SC[%lu]:NMI[%lu]\n", | 2218 | "HC[%lu]:SC[%lu]:NMI[%lu]\n", |
2219 | current->trace_recursion, | 2219 | trace_recursion_buffer(), |
2220 | hardirq_count() >> HARDIRQ_SHIFT, | 2220 | hardirq_count() >> HARDIRQ_SHIFT, |
2221 | softirq_count() >> SOFTIRQ_SHIFT, | 2221 | softirq_count() >> SOFTIRQ_SHIFT, |
2222 | in_nmi()); | 2222 | in_nmi()); |
@@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void) | |||
2226 | 2226 | ||
2227 | static inline int trace_recursive_lock(void) | 2227 | static inline int trace_recursive_lock(void) |
2228 | { | 2228 | { |
2229 | current->trace_recursion++; | 2229 | trace_recursion_inc(); |
2230 | 2230 | ||
2231 | if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) | 2231 | if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) |
2232 | return 0; | 2232 | return 0; |
2233 | 2233 | ||
2234 | trace_recursive_fail(); | 2234 | trace_recursive_fail(); |
@@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void) | |||
2238 | 2238 | ||
2239 | static inline void trace_recursive_unlock(void) | 2239 | static inline void trace_recursive_unlock(void) |
2240 | { | 2240 | { |
2241 | WARN_ON_ONCE(!current->trace_recursion); | 2241 | WARN_ON_ONCE(!trace_recursion_buffer()); |
2242 | 2242 | ||
2243 | current->trace_recursion--; | 2243 | trace_recursion_dec(); |
2244 | } | 2244 | } |
2245 | 2245 | ||
2246 | #else | 2246 | #else |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1cb49be7c7fb..ee9c921d7f21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -2014,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
2014 | { | 2014 | { |
2015 | enum print_line_t ret; | 2015 | enum print_line_t ret; |
2016 | 2016 | ||
2017 | if (iter->lost_events) | 2017 | if (iter->lost_events && |
2018 | trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", | 2018 | !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", |
2019 | iter->cpu, iter->lost_events); | 2019 | iter->cpu, iter->lost_events)) |
2020 | return TRACE_TYPE_PARTIAL_LINE; | ||
2020 | 2021 | ||
2021 | if (iter->trace && iter->trace->print_line) { | 2022 | if (iter->trace && iter->trace->print_line) { |
2022 | ret = iter->trace->print_line(iter); | 2023 | ret = iter->trace->print_line(iter); |
@@ -3230,6 +3231,14 @@ waitagain: | |||
3230 | 3231 | ||
3231 | if (iter->seq.len >= cnt) | 3232 | if (iter->seq.len >= cnt) |
3232 | break; | 3233 | break; |
3234 | |||
3235 | /* | ||
3236 | * Setting the full flag means we reached the trace_seq buffer | ||
3237 | * size and we should leave by partial output condition above. | ||
3238 | * One of the trace_seq_* functions is not used properly. | ||
3239 | */ | ||
3240 | WARN_ONCE(iter->seq.full, "full flag set for trace type %d", | ||
3241 | iter->ent->type); | ||
3233 | } | 3242 | } |
3234 | trace_access_unlock(iter->cpu_file); | 3243 | trace_access_unlock(iter->cpu_file); |
3235 | trace_event_read_unlock(); | 3244 | trace_event_read_unlock(); |
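[Editor's aside] Both trace.c hunks tighten trace_seq discipline: once the seq buffer fills, trace_seq_printf() returns 0 and the caller must report TRACE_TYPE_PARTIAL_LINE, and the new WARN_ONCE flags output callbacks that keep writing after the full flag is set. A minimal, hedged example of the expected pattern in a print-line handler (my_print_line is hypothetical):

static enum print_line_t my_print_line(struct trace_iterator *iter)
{
	/* trace_seq_printf() returns 0 once the seq buffer is full */
	if (!trace_seq_printf(&iter->seq, "CPU:%d\n", iter->cpu))
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}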
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5e9dfc6286dd..229f8591f61d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]); | |||
419 | extern unsigned long ftrace_update_tot_cnt; | 419 | extern unsigned long ftrace_update_tot_cnt; |
420 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func | 420 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func |
421 | extern int DYN_FTRACE_TEST_NAME(void); | 421 | extern int DYN_FTRACE_TEST_NAME(void); |
422 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 | ||
423 | extern int DYN_FTRACE_TEST_NAME2(void); | ||
422 | #endif | 424 | #endif |
423 | 425 | ||
424 | extern int ring_buffer_expanded; | 426 | extern int ring_buffer_expanded; |
@@ -782,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
782 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 784 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) |
783 | #include "trace_entries.h" | 785 | #include "trace_entries.h" |
784 | 786 | ||
787 | /* Only current can touch trace_recursion */ | ||
788 | #define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) | ||
789 | #define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) | ||
790 | |||
791 | /* Ring buffer has the 10 LSB bits to count */ | ||
792 | #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) | ||
793 | |||
794 | /* for function tracing recursion */ | ||
795 | #define TRACE_INTERNAL_BIT (1<<11) | ||
796 | #define TRACE_GLOBAL_BIT (1<<12) | ||
797 | |||
798 | #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) | ||
799 | #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) | ||
800 | #define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) | ||
801 | |||
785 | #endif /* _LINUX_KERNEL_TRACE_H */ | 802 | #endif /* _LINUX_KERNEL_TRACE_H */ |
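[Editor's aside] The helpers added to trace.h split the per-task trace_recursion word: the 10 low bits remain a ring-buffer depth counter (trace_recursion_buffer()), while TRACE_INTERNAL_BIT and TRACE_GLOBAL_BIT act as latches that keep the function tracers from recursing on the same task. A hedged sketch of the guard, mirroring how ftrace_ops_list_func() uses it (my_trace_func is a hypothetical callback):

static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* bail out if this task is already inside the tracer */
	if (trace_recursion_test(TRACE_INTERNAL_BIT))
		return;

	trace_recursion_set(TRACE_INTERNAL_BIT);
	/* ... the actual tracing work goes here ... */
	trace_recursion_clear(TRACE_INTERNAL_BIT);
}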
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2fe110341359..686ec399f2a8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata = | |||
1657 | 1657 | ||
1658 | static __init void event_trace_self_test_with_function(void) | 1658 | static __init void event_trace_self_test_with_function(void) |
1659 | { | 1659 | { |
1660 | register_ftrace_function(&trace_ops); | 1660 | int ret; |
1661 | ret = register_ftrace_function(&trace_ops); | ||
1662 | if (WARN_ON(ret < 0)) { | ||
1663 | pr_info("Failed to enable function tracer for event tests\n"); | ||
1664 | return; | ||
1665 | } | ||
1661 | pr_info("Running tests again, along with the function tracer\n"); | 1666 | pr_info("Running tests again, along with the function tracer\n"); |
1662 | event_trace_self_tests(); | 1667 | event_trace_self_tests(); |
1663 | unregister_ftrace_function(&trace_ops); | 1668 | unregister_ftrace_function(&trace_ops); |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d44e8f..8d0e1cc4e974 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
149 | static struct ftrace_ops trace_ops __read_mostly = | 149 | static struct ftrace_ops trace_ops __read_mostly = |
150 | { | 150 | { |
151 | .func = function_trace_call, | 151 | .func = function_trace_call, |
152 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
152 | }; | 153 | }; |
153 | 154 | ||
154 | static struct ftrace_ops trace_stack_ops __read_mostly = | 155 | static struct ftrace_ops trace_stack_ops __read_mostly = |
155 | { | 156 | { |
156 | .func = function_stack_trace_call, | 157 | .func = function_stack_trace_call, |
158 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
157 | }; | 159 | }; |
158 | 160 | ||
159 | /* Our two options */ | 161 | /* Our two options */ |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a4969b47afc1..c77424be284d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
153 | static struct ftrace_ops trace_ops __read_mostly = | 153 | static struct ftrace_ops trace_ops __read_mostly = |
154 | { | 154 | { |
155 | .func = irqsoff_tracer_call, | 155 | .func = irqsoff_tracer_call, |
156 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
156 | }; | 157 | }; |
157 | #endif /* CONFIG_FUNCTION_TRACER */ | 158 | #endif /* CONFIG_FUNCTION_TRACER */ |
158 | 159 | ||
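[Editor's aside] The function and irqsoff tracers above (and the wakeup tracer below) now mark their ops FTRACE_OPS_FL_GLOBAL; per the ftrace_set_regex() hunk earlier, global ops are redirected to global_ops, so these tracers keep sharing set_ftrace_filter/set_ftrace_notrace exactly as they did before the split into per-ops hashes. A hedged sketch of such a declaration (my_stub_func is hypothetical):

static void my_stub_func(unsigned long ip, unsigned long parent_ip)
{
}

static struct ftrace_ops my_ops __read_mostly = {
	.func	= my_stub_func,
	.flags	= FTRACE_OPS_FL_GLOBAL,	/* share the global filter hashes */
};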
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35d55a386145..27d13b36b8be 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = { | |||
53 | "common_preempt_count", | 53 | "common_preempt_count", |
54 | "common_pid", | 54 | "common_pid", |
55 | "common_tgid", | 55 | "common_tgid", |
56 | "common_lock_depth", | ||
57 | FIELD_STRING_IP, | 56 | FIELD_STRING_IP, |
58 | FIELD_STRING_RETIP, | 57 | FIELD_STRING_RETIP, |
59 | FIELD_STRING_FUNC, | 58 | FIELD_STRING_FUNC, |
@@ -1871,8 +1870,12 @@ fs_initcall(init_kprobe_trace); | |||
1871 | 1870 | ||
1872 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1871 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
1873 | 1872 | ||
1874 | static int kprobe_trace_selftest_target(int a1, int a2, int a3, | 1873 | /* |
1875 | int a4, int a5, int a6) | 1874 | * The "__used" keeps gcc from removing the function symbol |
1875 | * from the kallsyms table. | ||
1876 | */ | ||
1877 | static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, | ||
1878 | int a4, int a5, int a6) | ||
1876 | { | 1879 | { |
1877 | return a1 + a2 + a3 + a4 + a5 + a6; | 1880 | return a1 + a2 + a3 + a4 + a5 + a6; |
1878 | } | 1881 | } |
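[Editor's aside] The __used annotation keeps gcc from discarding kprobe_trace_selftest_target, whose only reference is the by-name kallsyms lookup performed by the self-test. The same idiom in isolation (my_probe_target is a hypothetical example):

/* Without __used, gcc may elide this static function entirely and a
 * later kallsyms/kprobe lookup by name would fail. */
static __used int my_probe_target(int x)
{
	return x + 1;
}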
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 456be9063c2d..e37de492a9e1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
353 | } | 353 | } |
354 | EXPORT_SYMBOL(ftrace_print_symbols_seq); | 354 | EXPORT_SYMBOL(ftrace_print_symbols_seq); |
355 | 355 | ||
356 | #if BITS_PER_LONG == 32 | ||
357 | const char * | ||
358 | ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, | ||
359 | const struct trace_print_flags_u64 *symbol_array) | ||
360 | { | ||
361 | int i; | ||
362 | const char *ret = p->buffer + p->len; | ||
363 | |||
364 | for (i = 0; symbol_array[i].name; i++) { | ||
365 | |||
366 | if (val != symbol_array[i].mask) | ||
367 | continue; | ||
368 | |||
369 | trace_seq_puts(p, symbol_array[i].name); | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | if (!p->len) | ||
374 | trace_seq_printf(p, "0x%llx", val); | ||
375 | |||
376 | trace_seq_putc(p, 0); | ||
377 | |||
378 | return ret; | ||
379 | } | ||
380 | EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); | ||
381 | #endif | ||
382 | |||
356 | const char * | 383 | const char * |
357 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | 384 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) |
358 | { | 385 | { |
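[Editor's aside] ftrace_print_symbols_seq_u64() above is compiled only on 32-bit kernels, where unsigned long cannot carry a 64-bit trace field; it prints the matching symbolic name and falls back to hex when nothing in the table matches. A hedged usage sketch; the symbol table is illustrative, and the field layout of struct trace_print_flags_u64 is assumed to mirror struct trace_print_flags ({ mask, name }):

/* hypothetical 64-bit symbol table; a NULL name terminates the walk */
static const struct trace_print_flags_u64 my_symbols[] = {
	{ 0x100000000ULL,	"FIRST_EVENT"  },
	{ 0x200000000ULL,	"SECOND_EVENT" },
	{ -1,			NULL           },
};

/* inside an event's output callback:
 *	ftrace_print_symbols_seq_u64(p, val, my_symbols);
 */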
@@ -830,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); | |||
830 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, | 857 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, |
831 | struct trace_event *event) | 858 | struct trace_event *event) |
832 | { | 859 | { |
860 | if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) | ||
861 | return TRACE_TYPE_PARTIAL_LINE; | ||
862 | |||
833 | return TRACE_TYPE_HANDLED; | 863 | return TRACE_TYPE_HANDLED; |
834 | } | 864 | } |
835 | 865 | ||
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf0..1f06468a10d7 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex); | |||
32 | 32 | ||
33 | struct trace_bprintk_fmt { | 33 | struct trace_bprintk_fmt { |
34 | struct list_head list; | 34 | struct list_head list; |
35 | char fmt[0]; | 35 | const char *fmt; |
36 | }; | 36 | }; |
37 | 37 | ||
38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) | 38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) |
@@ -49,6 +49,7 @@ static | |||
49 | void hold_module_trace_bprintk_format(const char **start, const char **end) | 49 | void hold_module_trace_bprintk_format(const char **start, const char **end) |
50 | { | 50 | { |
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | ||
52 | 53 | ||
53 | mutex_lock(&btrace_mutex); | 54 | mutex_lock(&btrace_mutex); |
54 | for (iter = start; iter < end; iter++) { | 55 | for (iter = start; iter < end; iter++) { |
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
58 | continue; | 59 | continue; |
59 | } | 60 | } |
60 | 61 | ||
61 | tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) | 62 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); |
62 | + strlen(*iter) + 1, GFP_KERNEL); | 63 | if (tb_fmt) |
63 | if (tb_fmt) { | 64 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); |
65 | if (tb_fmt && fmt) { | ||
64 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); | 66 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); |
65 | strcpy(tb_fmt->fmt, *iter); | 67 | strcpy(fmt, *iter); |
68 | tb_fmt->fmt = fmt; | ||
66 | *iter = tb_fmt->fmt; | 69 | *iter = tb_fmt->fmt; |
67 | } else | 70 | } else { |
71 | kfree(tb_fmt); | ||
68 | *iter = NULL; | 72 | *iter = NULL; |
73 | } | ||
69 | } | 74 | } |
70 | mutex_unlock(&btrace_mutex); | 75 | mutex_unlock(&btrace_mutex); |
71 | } | 76 | } |
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, | |||
84 | return 0; | 89 | return 0; |
85 | } | 90 | } |
86 | 91 | ||
92 | /* | ||
93 | * The debugfs/tracing/printk_formats file maps the addresses with | ||
94 | * the ASCII formats that are used in the bprintk events in the | ||
95 | * buffer. For userspace tools to be able to decode the events from | ||
96 | * the buffer, they need to be able to map the address with the format. | ||
97 | * | ||
98 | * The addresses of the bprintk formats are in their own section | ||
99 | * __trace_printk_fmt. But for modules we copy them into a link list. | ||
100 | * The code to print the formats and their addresses passes around the | ||
101 | * address of the fmt string. If the fmt address passed into the seq | ||
102 | * functions is within the kernel core __trace_printk_fmt section, then | ||
103 | * it simply uses the next pointer in the list. | ||
104 | * | ||
105 | * When the fmt pointer is outside the kernel core __trace_printk_fmt | ||
106 | * section, then we need to read the link list pointers. The trick is | ||
107 | * we pass the address of the string to the seq function just like | ||
108 | * we do for the kernel core formats. To get back the structure that | ||
109 | * holds the format, we simply use containerof() and then go to the | ||
110 | * next format in the list. | ||
111 | */ | ||
112 | static const char ** | ||
113 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
114 | { | ||
115 | struct trace_bprintk_fmt *mod_fmt; | ||
116 | |||
117 | if (list_empty(&trace_bprintk_fmt_list)) | ||
118 | return NULL; | ||
119 | |||
120 | /* | ||
121 | * v will point to the address of the fmt record from t_next | ||
122 | * v will be NULL from t_start. | ||
123 | * If this is the first pointer or called from start | ||
124 | * then we need to walk the list. | ||
125 | */ | ||
126 | if (!v || start_index == *pos) { | ||
127 | struct trace_bprintk_fmt *p; | ||
128 | |||
129 | /* search the module list */ | ||
130 | list_for_each_entry(p, &trace_bprintk_fmt_list, list) { | ||
131 | if (start_index == *pos) | ||
132 | return &p->fmt; | ||
133 | start_index++; | ||
134 | } | ||
135 | /* pos > index */ | ||
136 | return NULL; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * v points to the address of the fmt field in the mod list | ||
141 | * structure that holds the module print format. | ||
142 | */ | ||
143 | mod_fmt = container_of(v, typeof(*mod_fmt), fmt); | ||
144 | if (mod_fmt->list.next == &trace_bprintk_fmt_list) | ||
145 | return NULL; | ||
146 | |||
147 | mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); | ||
148 | |||
149 | return &mod_fmt->fmt; | ||
150 | } | ||
151 | |||
152 | static void format_mod_start(void) | ||
153 | { | ||
154 | mutex_lock(&btrace_mutex); | ||
155 | } | ||
156 | |||
157 | static void format_mod_stop(void) | ||
158 | { | ||
159 | mutex_unlock(&btrace_mutex); | ||
160 | } | ||
161 | |||
87 | #else /* !CONFIG_MODULES */ | 162 | #else /* !CONFIG_MODULES */ |
88 | __init static int | 163 | __init static int |
89 | module_trace_bprintk_format_notify(struct notifier_block *self, | 164 | module_trace_bprintk_format_notify(struct notifier_block *self, |
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self, | |||
91 | { | 166 | { |
92 | return 0; | 167 | return 0; |
93 | } | 168 | } |
169 | static inline const char ** | ||
170 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
171 | { | ||
172 | return NULL; | ||
173 | } | ||
174 | static inline void format_mod_start(void) { } | ||
175 | static inline void format_mod_stop(void) { } | ||
94 | #endif /* CONFIG_MODULES */ | 176 | #endif /* CONFIG_MODULES */ |
95 | 177 | ||
96 | 178 | ||
@@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) | |||
153 | } | 235 | } |
154 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); | 236 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); |
155 | 237 | ||
238 | static const char **find_next(void *v, loff_t *pos) | ||
239 | { | ||
240 | const char **fmt = v; | ||
241 | int start_index; | ||
242 | |||
243 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | ||
244 | |||
245 | if (*pos < start_index) | ||
246 | return __start___trace_bprintk_fmt + *pos; | ||
247 | |||
248 | return find_next_mod_format(start_index, v, fmt, pos); | ||
249 | } | ||
250 | |||
156 | static void * | 251 | static void * |
157 | t_start(struct seq_file *m, loff_t *pos) | 252 | t_start(struct seq_file *m, loff_t *pos) |
158 | { | 253 | { |
159 | const char **fmt = __start___trace_bprintk_fmt + *pos; | 254 | format_mod_start(); |
160 | 255 | return find_next(NULL, pos); | |
161 | if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) | ||
162 | return NULL; | ||
163 | return fmt; | ||
164 | } | 256 | } |
165 | 257 | ||
166 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) | 258 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) |
167 | { | 259 | { |
168 | (*pos)++; | 260 | (*pos)++; |
169 | return t_start(m, pos); | 261 | return find_next(v, pos); |
170 | } | 262 | } |
171 | 263 | ||
172 | static int t_show(struct seq_file *m, void *v) | 264 | static int t_show(struct seq_file *m, void *v) |
@@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v) | |||
205 | 297 | ||
206 | static void t_stop(struct seq_file *m, void *p) | 298 | static void t_stop(struct seq_file *m, void *p) |
207 | { | 299 | { |
300 | format_mod_stop(); | ||
208 | } | 301 | } |
209 | 302 | ||
210 | static const struct seq_operations show_format_seq_ops = { | 303 | static const struct seq_operations show_format_seq_ops = { |
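The iterator above now spans two regions: a position below the size of the core __trace_printk_fmt section indexes straight into that array, while a larger position walks the module list, where container_of() recovers the list node from the &fmt pointer handed out on the previous step. A compact userspace sketch of that two-region find_next() logic, using simplified stand-in types rather than the kernel's:

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct mod_fmt {
	struct mod_fmt *next;
	const char *fmt;
};

static const char *core_fmts[] = { "core fmt %d\n", "core fmt %s\n" };
static struct mod_fmt *mod_list;		/* formats copied from modules */

static const char **find_next(const char **v, long *pos)
{
	long ncore = sizeof(core_fmts) / sizeof(core_fmts[0]);
	struct mod_fmt *m;
	long i;

	if (*pos < ncore)			/* still inside the core array */
		return &core_fmts[*pos];

	if (!v || *pos == ncore) {		/* restarted, or just left the core array */
		m = mod_list;
		for (i = ncore; m && i < *pos; i++)
			m = m->next;
	} else {				/* continue from the previous list node */
		m = container_of(v, struct mod_fmt, fmt)->next;
	}
	return m ? &m->fmt : NULL;
}

On each step the caller bumps *pos and passes back the pointer it was just given, which is exactly how t_next() feeds find_next() above.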
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7319559ed59f..f029dd4fd2ca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
129 | static struct ftrace_ops trace_ops __read_mostly = | 129 | static struct ftrace_ops trace_ops __read_mostly = |
130 | { | 130 | { |
131 | .func = wakeup_tracer_call, | 131 | .func = wakeup_tracer_call, |
132 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
132 | }; | 133 | }; |
133 | #endif /* CONFIG_FUNCTION_TRACER */ | 134 | #endif /* CONFIG_FUNCTION_TRACER */ |
134 | 135 | ||
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 659732eba07c..288541f977fb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) | |||
101 | 101 | ||
102 | #ifdef CONFIG_DYNAMIC_FTRACE | 102 | #ifdef CONFIG_DYNAMIC_FTRACE |
103 | 103 | ||
104 | static int trace_selftest_test_probe1_cnt; | ||
105 | static void trace_selftest_test_probe1_func(unsigned long ip, | ||
106 | unsigned long pip) | ||
107 | { | ||
108 | trace_selftest_test_probe1_cnt++; | ||
109 | } | ||
110 | |||
111 | static int trace_selftest_test_probe2_cnt; | ||
112 | static void trace_selftest_test_probe2_func(unsigned long ip, | ||
113 | unsigned long pip) | ||
114 | { | ||
115 | trace_selftest_test_probe2_cnt++; | ||
116 | } | ||
117 | |||
118 | static int trace_selftest_test_probe3_cnt; | ||
119 | static void trace_selftest_test_probe3_func(unsigned long ip, | ||
120 | unsigned long pip) | ||
121 | { | ||
122 | trace_selftest_test_probe3_cnt++; | ||
123 | } | ||
124 | |||
125 | static int trace_selftest_test_global_cnt; | ||
126 | static void trace_selftest_test_global_func(unsigned long ip, | ||
127 | unsigned long pip) | ||
128 | { | ||
129 | trace_selftest_test_global_cnt++; | ||
130 | } | ||
131 | |||
132 | static int trace_selftest_test_dyn_cnt; | ||
133 | static void trace_selftest_test_dyn_func(unsigned long ip, | ||
134 | unsigned long pip) | ||
135 | { | ||
136 | trace_selftest_test_dyn_cnt++; | ||
137 | } | ||
138 | |||
139 | static struct ftrace_ops test_probe1 = { | ||
140 | .func = trace_selftest_test_probe1_func, | ||
141 | }; | ||
142 | |||
143 | static struct ftrace_ops test_probe2 = { | ||
144 | .func = trace_selftest_test_probe2_func, | ||
145 | }; | ||
146 | |||
147 | static struct ftrace_ops test_probe3 = { | ||
148 | .func = trace_selftest_test_probe3_func, | ||
149 | }; | ||
150 | |||
151 | static struct ftrace_ops test_global = { | ||
152 | .func = trace_selftest_test_global_func, | ||
153 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
154 | }; | ||
155 | |||
156 | static void print_counts(void) | ||
157 | { | ||
158 | printk("(%d %d %d %d %d) ", | ||
159 | trace_selftest_test_probe1_cnt, | ||
160 | trace_selftest_test_probe2_cnt, | ||
161 | trace_selftest_test_probe3_cnt, | ||
162 | trace_selftest_test_global_cnt, | ||
163 | trace_selftest_test_dyn_cnt); | ||
164 | } | ||
165 | |||
166 | static void reset_counts(void) | ||
167 | { | ||
168 | trace_selftest_test_probe1_cnt = 0; | ||
169 | trace_selftest_test_probe2_cnt = 0; | ||
170 | trace_selftest_test_probe3_cnt = 0; | ||
171 | trace_selftest_test_global_cnt = 0; | ||
172 | trace_selftest_test_dyn_cnt = 0; | ||
173 | } | ||
174 | |||
175 | static int trace_selftest_ops(int cnt) | ||
176 | { | ||
177 | int save_ftrace_enabled = ftrace_enabled; | ||
178 | struct ftrace_ops *dyn_ops; | ||
179 | char *func1_name; | ||
180 | char *func2_name; | ||
181 | int len1; | ||
182 | int len2; | ||
183 | int ret = -1; | ||
184 | |||
185 | printk(KERN_CONT "PASSED\n"); | ||
186 | pr_info("Testing dynamic ftrace ops #%d: ", cnt); | ||
187 | |||
188 | ftrace_enabled = 1; | ||
189 | reset_counts(); | ||
190 | |||
191 | /* Handle PPC64 '.' name */ | ||
192 | func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
193 | func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); | ||
194 | len1 = strlen(func1_name); | ||
195 | len2 = strlen(func2_name); | ||
196 | |||
197 | /* | ||
198 | * Probe 1 will trace function 1. | ||
199 | * Probe 2 will trace function 2. | ||
200 | * Probe 3 will trace functions 1 and 2. | ||
201 | */ | ||
202 | ftrace_set_filter(&test_probe1, func1_name, len1, 1); | ||
203 | ftrace_set_filter(&test_probe2, func2_name, len2, 1); | ||
204 | ftrace_set_filter(&test_probe3, func1_name, len1, 1); | ||
205 | ftrace_set_filter(&test_probe3, func2_name, len2, 0); | ||
206 | |||
207 | register_ftrace_function(&test_probe1); | ||
208 | register_ftrace_function(&test_probe2); | ||
209 | register_ftrace_function(&test_probe3); | ||
210 | register_ftrace_function(&test_global); | ||
211 | |||
212 | DYN_FTRACE_TEST_NAME(); | ||
213 | |||
214 | print_counts(); | ||
215 | |||
216 | if (trace_selftest_test_probe1_cnt != 1) | ||
217 | goto out; | ||
218 | if (trace_selftest_test_probe2_cnt != 0) | ||
219 | goto out; | ||
220 | if (trace_selftest_test_probe3_cnt != 1) | ||
221 | goto out; | ||
222 | if (trace_selftest_test_global_cnt == 0) | ||
223 | goto out; | ||
224 | |||
225 | DYN_FTRACE_TEST_NAME2(); | ||
226 | |||
227 | print_counts(); | ||
228 | |||
229 | if (trace_selftest_test_probe1_cnt != 1) | ||
230 | goto out; | ||
231 | if (trace_selftest_test_probe2_cnt != 1) | ||
232 | goto out; | ||
233 | if (trace_selftest_test_probe3_cnt != 2) | ||
234 | goto out; | ||
235 | |||
236 | /* Add a dynamic probe */ | ||
237 | dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); | ||
238 | if (!dyn_ops) { | ||
239 | printk("MEMORY ERROR "); | ||
240 | goto out; | ||
241 | } | ||
242 | |||
243 | dyn_ops->func = trace_selftest_test_dyn_func; | ||
244 | |||
245 | register_ftrace_function(dyn_ops); | ||
246 | |||
247 | trace_selftest_test_global_cnt = 0; | ||
248 | |||
249 | DYN_FTRACE_TEST_NAME(); | ||
250 | |||
251 | print_counts(); | ||
252 | |||
253 | if (trace_selftest_test_probe1_cnt != 2) | ||
254 | goto out_free; | ||
255 | if (trace_selftest_test_probe2_cnt != 1) | ||
256 | goto out_free; | ||
257 | if (trace_selftest_test_probe3_cnt != 3) | ||
258 | goto out_free; | ||
259 | if (trace_selftest_test_global_cnt == 0) | ||
260 | goto out; | ||
261 | if (trace_selftest_test_dyn_cnt == 0) | ||
262 | goto out_free; | ||
263 | |||
264 | DYN_FTRACE_TEST_NAME2(); | ||
265 | |||
266 | print_counts(); | ||
267 | |||
268 | if (trace_selftest_test_probe1_cnt != 2) | ||
269 | goto out_free; | ||
270 | if (trace_selftest_test_probe2_cnt != 2) | ||
271 | goto out_free; | ||
272 | if (trace_selftest_test_probe3_cnt != 4) | ||
273 | goto out_free; | ||
274 | |||
275 | ret = 0; | ||
276 | out_free: | ||
277 | unregister_ftrace_function(dyn_ops); | ||
278 | kfree(dyn_ops); | ||
279 | |||
280 | out: | ||
281 | /* Purposely unregister in the same order */ | ||
282 | unregister_ftrace_function(&test_probe1); | ||
283 | unregister_ftrace_function(&test_probe2); | ||
284 | unregister_ftrace_function(&test_probe3); | ||
285 | unregister_ftrace_function(&test_global); | ||
286 | |||
287 | /* Make sure everything is off */ | ||
288 | reset_counts(); | ||
289 | DYN_FTRACE_TEST_NAME(); | ||
290 | DYN_FTRACE_TEST_NAME(); | ||
291 | |||
292 | if (trace_selftest_test_probe1_cnt || | ||
293 | trace_selftest_test_probe2_cnt || | ||
294 | trace_selftest_test_probe3_cnt || | ||
295 | trace_selftest_test_global_cnt || | ||
296 | trace_selftest_test_dyn_cnt) | ||
297 | ret = -1; | ||
298 | |||
299 | ftrace_enabled = save_ftrace_enabled; | ||
300 | |||
301 | return ret; | ||
302 | } | ||
303 | |||
104 | /* Test dynamic code modification and ftrace filters */ | 304 | /* Test dynamic code modification and ftrace filters */ |
105 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | 305 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, |
106 | struct trace_array *tr, | 306 | struct trace_array *tr, |
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
131 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 331 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
132 | 332 | ||
133 | /* filter only on our function */ | 333 | /* filter only on our function */ |
134 | ftrace_set_filter(func_name, strlen(func_name), 1); | 334 | ftrace_set_global_filter(func_name, strlen(func_name), 1); |
135 | 335 | ||
136 | /* enable tracing */ | 336 | /* enable tracing */ |
137 | ret = tracer_init(trace, tr); | 337 | ret = tracer_init(trace, tr); |
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
166 | 366 | ||
167 | /* check the trace buffer */ | 367 | /* check the trace buffer */ |
168 | ret = trace_test_buffer(tr, &count); | 368 | ret = trace_test_buffer(tr, &count); |
169 | trace->reset(tr); | ||
170 | tracing_start(); | 369 | tracing_start(); |
171 | 370 | ||
172 | /* we should only have one item */ | 371 | /* we should only have one item */ |
173 | if (!ret && count != 1) { | 372 | if (!ret && count != 1) { |
373 | trace->reset(tr); | ||
174 | printk(KERN_CONT ".. filter failed count=%ld ..", count); | 374 | printk(KERN_CONT ".. filter failed count=%ld ..", count); |
175 | ret = -1; | 375 | ret = -1; |
176 | goto out; | 376 | goto out; |
177 | } | 377 | } |
178 | 378 | ||
379 | /* Test the ops with global tracing running */ | ||
380 | ret = trace_selftest_ops(1); | ||
381 | trace->reset(tr); | ||
382 | |||
179 | out: | 383 | out: |
180 | ftrace_enabled = save_ftrace_enabled; | 384 | ftrace_enabled = save_ftrace_enabled; |
181 | tracer_enabled = save_tracer_enabled; | 385 | tracer_enabled = save_tracer_enabled; |
182 | 386 | ||
183 | /* Enable tracing on all functions again */ | 387 | /* Enable tracing on all functions again */ |
184 | ftrace_set_filter(NULL, 0, 1); | 388 | ftrace_set_global_filter(NULL, 0, 1); |
389 | |||
390 | /* Test the ops with global tracing off */ | ||
391 | if (!ret) | ||
392 | ret = trace_selftest_ops(2); | ||
185 | 393 | ||
186 | return ret; | 394 | return ret; |
187 | } | 395 | } |
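The new selftest exercises per-ops filtering: test_probe1 and test_probe2 each carry a private filter for one function, test_probe3 filters on both, and test_global opts into the shared filter via FTRACE_OPS_FL_GLOBAL, with the old global helpers renamed to ftrace_set_global_filter(). A condensed, hedged sketch of the registration pattern the selftest relies on; it mirrors only calls visible in this diff and is a fragment, not a complete buildable module:

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/string.h>

static void my_probe(unsigned long ip, unsigned long parent_ip)
{
	/* runs for every function matched by this ops' own filter */
}

static struct ftrace_ops my_ops = {
	.func	= my_probe,
	/* no FTRACE_OPS_FL_GLOBAL: this ops keeps a private filter */
};

static int __init my_probe_init(void)
{
	char *name = "kfree";	/* hypothetical target function */

	ftrace_set_filter(&my_ops, name, strlen(name), 1);
	return register_ftrace_function(&my_ops);
}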
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77cce5bf..b4c475a0a48b 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c | |||
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void) | |||
5 | /* used to call mcount */ | 5 | /* used to call mcount */ |
6 | return 0; | 6 | return 0; |
7 | } | 7 | } |
8 | |||
9 | int DYN_FTRACE_TEST_NAME2(void) | ||
10 | { | ||
11 | /* used to call mcount */ | ||
12 | return 0; | ||
13 | } | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4c5dead0c239..b0b53b8e4c25 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
134 | { | 134 | { |
135 | .func = stack_trace_call, | 135 | .func = stack_trace_call, |
136 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
136 | }; | 137 | }; |
137 | 138 | ||
138 | static ssize_t | 139 | static ssize_t |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 68187af4889e..b219f1449c54 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
251 | { | 251 | { |
252 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); | 252 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); |
253 | 253 | ||
254 | if (elem->regfunc && !elem->state && active) | 254 | if (elem->regfunc && !jump_label_enabled(&elem->key) && active) |
255 | elem->regfunc(); | 255 | elem->regfunc(); |
256 | else if (elem->unregfunc && elem->state && !active) | 256 | else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) |
257 | elem->unregfunc(); | 257 | elem->unregfunc(); |
258 | 258 | ||
259 | /* | 259 | /* |
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
264 | * is used. | 264 | * is used. |
265 | */ | 265 | */ |
266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
267 | if (!elem->state && active) { | 267 | if (active && !jump_label_enabled(&elem->key)) |
268 | jump_label_enable(&elem->state); | 268 | jump_label_inc(&elem->key); |
269 | elem->state = active; | 269 | else if (!active && jump_label_enabled(&elem->key)) |
270 | } else if (elem->state && !active) { | 270 | jump_label_dec(&elem->key); |
271 | jump_label_disable(&elem->state); | ||
272 | elem->state = active; | ||
273 | } | ||
274 | } | 271 | } |
275 | 272 | ||
276 | /* | 273 | /* |
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
281 | */ | 278 | */ |
282 | static void disable_tracepoint(struct tracepoint *elem) | 279 | static void disable_tracepoint(struct tracepoint *elem) |
283 | { | 280 | { |
284 | if (elem->unregfunc && elem->state) | 281 | if (elem->unregfunc && jump_label_enabled(&elem->key)) |
285 | elem->unregfunc(); | 282 | elem->unregfunc(); |
286 | 283 | ||
287 | if (elem->state) { | 284 | if (jump_label_enabled(&elem->key)) |
288 | jump_label_disable(&elem->state); | 285 | jump_label_dec(&elem->key); |
289 | elem->state = 0; | ||
290 | } | ||
291 | rcu_assign_pointer(elem->funcs, NULL); | 286 | rcu_assign_pointer(elem->funcs, NULL); |
292 | } | 287 | } |
293 | 288 | ||
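The tracepoint code stops tracking its own on/off state field and instead drives a reference-counted jump label: jump_label_inc() on a real activation, jump_label_dec() on a real deactivation, and jump_label_enabled() for queries. A small userspace analogue of that guarded, counted transition (a plain counter stands in for the real key, which patches branch sites in the kernel):

#include <stdio.h>

/* Stand-in for the kernel's jump-label key: just a counter here. */
struct fake_key {
	int refcount;
};

static int key_enabled(struct fake_key *k)  { return k->refcount > 0; }
static void key_inc(struct fake_key *k)     { k->refcount++; }
static void key_dec(struct fake_key *k)     { if (k->refcount) k->refcount--; }

static void set_active(struct fake_key *k, int active)
{
	/* mirrors set_tracepoint(): only change the count on a real transition */
	if (active && !key_enabled(k))
		key_inc(k);
	else if (!active && key_enabled(k))
		key_dec(k);
}

int main(void)
{
	struct fake_key key = { 0 };

	set_active(&key, 1);
	set_active(&key, 1);				/* no double increment */
	printf("enabled=%d\n", key_enabled(&key));	/* enabled=1 */
	set_active(&key, 0);
	printf("enabled=%d\n", key_enabled(&key));	/* enabled=0 */
	return 0;
}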
diff --git a/kernel/utsname.c b/kernel/utsname.c index 44646179eaba..bff131b9510a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/user_namespace.h> | 17 | #include <linux/user_namespace.h> |
18 | #include <linux/proc_fs.h> | ||
18 | 19 | ||
19 | static struct uts_namespace *create_uts_ns(void) | 20 | static struct uts_namespace *create_uts_ns(void) |
20 | { | 21 | { |
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref) | |||
79 | put_user_ns(ns->user_ns); | 80 | put_user_ns(ns->user_ns); |
80 | kfree(ns); | 81 | kfree(ns); |
81 | } | 82 | } |
83 | |||
84 | static void *utsns_get(struct task_struct *task) | ||
85 | { | ||
86 | struct uts_namespace *ns = NULL; | ||
87 | struct nsproxy *nsproxy; | ||
88 | |||
89 | rcu_read_lock(); | ||
90 | nsproxy = task_nsproxy(task); | ||
91 | if (nsproxy) { | ||
92 | ns = nsproxy->uts_ns; | ||
93 | get_uts_ns(ns); | ||
94 | } | ||
95 | rcu_read_unlock(); | ||
96 | |||
97 | return ns; | ||
98 | } | ||
99 | |||
100 | static void utsns_put(void *ns) | ||
101 | { | ||
102 | put_uts_ns(ns); | ||
103 | } | ||
104 | |||
105 | static int utsns_install(struct nsproxy *nsproxy, void *ns) | ||
106 | { | ||
107 | get_uts_ns(ns); | ||
108 | put_uts_ns(nsproxy->uts_ns); | ||
109 | nsproxy->uts_ns = ns; | ||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | const struct proc_ns_operations utsns_operations = { | ||
114 | .name = "uts", | ||
115 | .type = CLONE_NEWUTS, | ||
116 | .get = utsns_get, | ||
117 | .put = utsns_put, | ||
118 | .install = utsns_install, | ||
119 | }; | ||
120 | |||
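These proc_ns_operations hook the uts namespace up to the /proc/<pid>/ns/ files that the new setns(2) path consumes: .get pins the target task's namespace, .install swaps it into the caller's nsproxy, and .put drops the reference. A minimal userspace example of attaching to another process's UTS namespace through that interface; it assumes a kernel and glibc recent enough to provide setns(2), which was added in the same release, and requires CAP_SYS_ADMIN:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/utsname.h>

int main(int argc, char **argv)
{
	char path[64];
	struct utsname uts;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/%s/ns/uts", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* setns() ends up calling the uts namespace's get + install ops */
	if (setns(fd, CLONE_NEWUTS) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);

	uname(&uts);
	printf("now in uts namespace of pid %s, hostname=%s\n",
	       argv[1], uts.nodename);
	return 0;
}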
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14733d4d156b..3d0c56ad4792 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
29 | 29 | ||
30 | int watchdog_enabled = 1; | 30 | int watchdog_enabled = 1; |
31 | int __read_mostly softlockup_thresh = 60; | 31 | int __read_mostly watchdog_thresh = 10; |
32 | 32 | ||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str) | |||
91 | __setup("nosoftlockup", nosoftlockup_setup); | 91 | __setup("nosoftlockup", nosoftlockup_setup); |
92 | /* */ | 92 | /* */ |
93 | 93 | ||
94 | /* | ||
95 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | ||
96 | * lockups can have false positives under extreme conditions. So we generally | ||
97 | * want a higher threshold for soft lockups than for hard lockups. We couple | ||
98 | * the two thresholds with a factor: the soft-lockup threshold is twice the | ||
99 | * hard-lockup threshold. | ||
100 | */ | ||
101 | static int get_softlockup_thresh(void) | ||
102 | { | ||
103 | return watchdog_thresh * 2; | ||
104 | } | ||
94 | 105 | ||
95 | /* | 106 | /* |
96 | * Returns seconds, approximately. We don't need nanosecond | 107 | * Returns seconds, approximately. We don't need nanosecond |
@@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu) | |||
105 | static unsigned long get_sample_period(void) | 116 | static unsigned long get_sample_period(void) |
106 | { | 117 | { |
107 | /* | 118 | /* |
108 | * convert softlockup_thresh from seconds to ns | 119 | * convert watchdog_thresh from seconds to ns |
109 | * the divide by 5 is to give hrtimer 5 chances to | 120 | * the divide by 5 is to give hrtimer 5 chances to |
110 | * increment before the hardlockup detector generates | 121 | * increment before the hardlockup detector generates |
111 | * a warning | 122 | * a warning |
112 | */ | 123 | */ |
113 | return softlockup_thresh / 5 * NSEC_PER_SEC; | 124 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); |
114 | } | 125 | } |
115 | 126 | ||
116 | /* Commands for resetting the watchdog */ | 127 | /* Commands for resetting the watchdog */ |
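With the single watchdog_thresh knob, the soft-lockup threshold is derived as twice the hard-lockup threshold, and the sample period gives the hrtimer five chances to fire inside that window. A quick userspace arithmetic check of the defaults (not kernel code): with watchdog_thresh = 10 the soft threshold is 20 s and the timer period is 4 s.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	int watchdog_thresh = 10;			/* default hard-lockup threshold, seconds */
	int soft_thresh = watchdog_thresh * 2;		/* get_softlockup_thresh() */
	unsigned long long period = soft_thresh * (NSEC_PER_SEC / 5);

	printf("soft threshold: %d s\n", soft_thresh);
	printf("sample period : %llu ns (%.1f s)\n", period, period / 1e9);
	return 0;
}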
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts) | |||
182 | unsigned long now = get_timestamp(smp_processor_id()); | 193 | unsigned long now = get_timestamp(smp_processor_id()); |
183 | 194 | ||
184 | /* Warn about unreasonable delays: */ | 195 | /* Warn about unreasonable delays: */ |
185 | if (time_after(now, touch_ts + softlockup_thresh)) | 196 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
186 | return now - touch_ts; | 197 | return now - touch_ts; |
187 | 198 | ||
188 | return 0; | 199 | return 0; |
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu) | |||
359 | 370 | ||
360 | /* Try to register using hardware perf events */ | 371 | /* Try to register using hardware perf events */ |
361 | wd_attr = &wd_hw_attr; | 372 | wd_attr = &wd_hw_attr; |
362 | wd_attr->sample_period = hw_nmi_get_sample_period(); | 373 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); |
363 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); | 374 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); |
364 | if (!IS_ERR(event)) { | 375 | if (!IS_ERR(event)) { |
365 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 376 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |
@@ -404,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; } | |||
404 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 415 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
405 | 416 | ||
406 | /* prepare/enable/disable routines */ | 417 | /* prepare/enable/disable routines */ |
407 | static int watchdog_prepare_cpu(int cpu) | 418 | static void watchdog_prepare_cpu(int cpu) |
408 | { | 419 | { |
409 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | 420 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); |
410 | 421 | ||
411 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | 422 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); |
412 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 423 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
413 | hrtimer->function = watchdog_timer_fn; | 424 | hrtimer->function = watchdog_timer_fn; |
414 | |||
415 | return 0; | ||
416 | } | 425 | } |
417 | 426 | ||
418 | static int watchdog_enable(int cpu) | 427 | static int watchdog_enable(int cpu) |
@@ -501,28 +510,25 @@ static void watchdog_disable_all_cpus(void) | |||
501 | /* sysctl functions */ | 510 | /* sysctl functions */ |
502 | #ifdef CONFIG_SYSCTL | 511 | #ifdef CONFIG_SYSCTL |
503 | /* | 512 | /* |
504 | * proc handler for /proc/sys/kernel/nmi_watchdog | 513 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh |
505 | */ | 514 | */ |
506 | 515 | ||
507 | int proc_dowatchdog_enabled(struct ctl_table *table, int write, | 516 | int proc_dowatchdog(struct ctl_table *table, int write, |
508 | void __user *buffer, size_t *length, loff_t *ppos) | 517 | void __user *buffer, size_t *lenp, loff_t *ppos) |
509 | { | 518 | { |
510 | proc_dointvec(table, write, buffer, length, ppos); | 519 | int ret; |
511 | 520 | ||
512 | if (write) { | 521 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
513 | if (watchdog_enabled) | 522 | if (ret || !write) |
514 | watchdog_enable_all_cpus(); | 523 | goto out; |
515 | else | ||
516 | watchdog_disable_all_cpus(); | ||
517 | } | ||
518 | return 0; | ||
519 | } | ||
520 | 524 | ||
521 | int proc_dowatchdog_thresh(struct ctl_table *table, int write, | 525 | if (watchdog_enabled && watchdog_thresh) |
522 | void __user *buffer, | 526 | watchdog_enable_all_cpus(); |
523 | size_t *lenp, loff_t *ppos) | 527 | else |
524 | { | 528 | watchdog_disable_all_cpus(); |
525 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 529 | |
530 | out: | ||
531 | return ret; | ||
526 | } | 532 | } |
527 | #endif /* CONFIG_SYSCTL */ | 533 | #endif /* CONFIG_SYSCTL */ |
528 | 534 | ||
@@ -534,17 +540,16 @@ static int __cpuinit | |||
534 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 540 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
535 | { | 541 | { |
536 | int hotcpu = (unsigned long)hcpu; | 542 | int hotcpu = (unsigned long)hcpu; |
537 | int err = 0; | ||
538 | 543 | ||
539 | switch (action) { | 544 | switch (action) { |
540 | case CPU_UP_PREPARE: | 545 | case CPU_UP_PREPARE: |
541 | case CPU_UP_PREPARE_FROZEN: | 546 | case CPU_UP_PREPARE_FROZEN: |
542 | err = watchdog_prepare_cpu(hotcpu); | 547 | watchdog_prepare_cpu(hotcpu); |
543 | break; | 548 | break; |
544 | case CPU_ONLINE: | 549 | case CPU_ONLINE: |
545 | case CPU_ONLINE_FROZEN: | 550 | case CPU_ONLINE_FROZEN: |
546 | if (watchdog_enabled) | 551 | if (watchdog_enabled) |
547 | err = watchdog_enable(hotcpu); | 552 | watchdog_enable(hotcpu); |
548 | break; | 553 | break; |
549 | #ifdef CONFIG_HOTPLUG_CPU | 554 | #ifdef CONFIG_HOTPLUG_CPU |
550 | case CPU_UP_CANCELED: | 555 | case CPU_UP_CANCELED: |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e3378e8d3a5c..0400553f0d04 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -2866,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) | |||
2866 | } | 2866 | } |
2867 | } | 2867 | } |
2868 | 2868 | ||
2869 | /* just in case, make sure it's actually aligned | 2869 | /* just in case, make sure it's actually aligned */ |
2870 | * - this is affected by PERCPU() alignment in vmlinux.lds.S | ||
2871 | */ | ||
2872 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); | 2870 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); |
2873 | return wq->cpu_wq.v ? 0 : -ENOMEM; | 2871 | return wq->cpu_wq.v ? 0 : -ENOMEM; |
2874 | } | 2872 | } |