diff options
Diffstat (limited to 'kernel')
85 files changed, 5474 insertions, 2721 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cb41b9547c9f..6c07f30fa9b7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | |||
43 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 43 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
44 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 44 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
45 | obj-$(CONFIG_SMP) += smp.o | 45 | obj-$(CONFIG_SMP) += smp.o |
46 | obj-$(CONFIG_SMP) += smpboot.o | ||
46 | ifneq ($(CONFIG_SMP),y) | 47 | ifneq ($(CONFIG_SMP),y) |
47 | obj-y += up.o | 48 | obj-y += up.o |
48 | endif | 49 | endif |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index af1de0f34eae..4b96415527b8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -67,6 +67,7 @@ | |||
67 | #include <linux/syscalls.h> | 67 | #include <linux/syscalls.h> |
68 | #include <linux/capability.h> | 68 | #include <linux/capability.h> |
69 | #include <linux/fs_struct.h> | 69 | #include <linux/fs_struct.h> |
70 | #include <linux/compat.h> | ||
70 | 71 | ||
71 | #include "audit.h" | 72 | #include "audit.h" |
72 | 73 | ||
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr) | |||
2710 | audit_log_end(ab); | 2711 | audit_log_end(ab); |
2711 | } | 2712 | } |
2712 | 2713 | ||
2713 | void __audit_seccomp(unsigned long syscall) | 2714 | void __audit_seccomp(unsigned long syscall, long signr, int code) |
2714 | { | 2715 | { |
2715 | struct audit_buffer *ab; | 2716 | struct audit_buffer *ab; |
2716 | 2717 | ||
2717 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2718 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2718 | audit_log_abend(ab, "seccomp", SIGKILL); | 2719 | audit_log_abend(ab, "seccomp", signr); |
2719 | audit_log_format(ab, " syscall=%ld", syscall); | 2720 | audit_log_format(ab, " syscall=%ld", syscall); |
2721 | audit_log_format(ab, " compat=%d", is_compat_task()); | ||
2722 | audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); | ||
2723 | audit_log_format(ab, " code=0x%x", code); | ||
2720 | audit_log_end(ab); | 2724 | audit_log_end(ab); |
2721 | } | 2725 | } |
2722 | 2726 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c8329b0c2576..a0c6af34d500 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -60,9 +60,13 @@ | |||
60 | #include <linux/eventfd.h> | 60 | #include <linux/eventfd.h> |
61 | #include <linux/poll.h> | 61 | #include <linux/poll.h> |
62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | 62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ |
63 | #include <linux/kthread.h> | ||
63 | 64 | ||
64 | #include <linux/atomic.h> | 65 | #include <linux/atomic.h> |
65 | 66 | ||
67 | /* css deactivation bias, makes css->refcnt negative to deny new trygets */ | ||
68 | #define CSS_DEACT_BIAS INT_MIN | ||
69 | |||
66 | /* | 70 | /* |
67 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 71 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
68 | * hierarchy must be performed while holding it. | 72 | * hierarchy must be performed while holding it. |
@@ -127,6 +131,9 @@ struct cgroupfs_root { | |||
127 | /* A list running through the active hierarchies */ | 131 | /* A list running through the active hierarchies */ |
128 | struct list_head root_list; | 132 | struct list_head root_list; |
129 | 133 | ||
134 | /* All cgroups on this root, cgroup_mutex protected */ | ||
135 | struct list_head allcg_list; | ||
136 | |||
130 | /* Hierarchy-specific flags */ | 137 | /* Hierarchy-specific flags */ |
131 | unsigned long flags; | 138 | unsigned long flags; |
132 | 139 | ||
@@ -145,6 +152,15 @@ struct cgroupfs_root { | |||
145 | static struct cgroupfs_root rootnode; | 152 | static struct cgroupfs_root rootnode; |
146 | 153 | ||
147 | /* | 154 | /* |
155 | * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. | ||
156 | */ | ||
157 | struct cfent { | ||
158 | struct list_head node; | ||
159 | struct dentry *dentry; | ||
160 | struct cftype *type; | ||
161 | }; | ||
162 | |||
163 | /* | ||
148 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | 164 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when |
149 | * cgroup_subsys->use_id != 0. | 165 | * cgroup_subsys->use_id != 0. |
150 | */ | 166 | */ |
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void) | |||
239 | 255 | ||
240 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
241 | 257 | ||
258 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | ||
259 | static int css_refcnt(struct cgroup_subsys_state *css) | ||
260 | { | ||
261 | int v = atomic_read(&css->refcnt); | ||
262 | |||
263 | return v >= 0 ? v : v - CSS_DEACT_BIAS; | ||
264 | } | ||
265 | |||
242 | /* convenient tests for these bits */ | 266 | /* convenient tests for these bits */ |
243 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 267 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
244 | { | 268 | { |
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) | |||
279 | #define for_each_active_root(_root) \ | 303 | #define for_each_active_root(_root) \ |
280 | list_for_each_entry(_root, &roots, root_list) | 304 | list_for_each_entry(_root, &roots, root_list) |
281 | 305 | ||
306 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
307 | { | ||
308 | return dentry->d_fsdata; | ||
309 | } | ||
310 | |||
311 | static inline struct cfent *__d_cfe(struct dentry *dentry) | ||
312 | { | ||
313 | return dentry->d_fsdata; | ||
314 | } | ||
315 | |||
316 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
317 | { | ||
318 | return __d_cfe(dentry)->type; | ||
319 | } | ||
320 | |||
282 | /* the list of cgroups eligible for automatic release. Protected by | 321 | /* the list of cgroups eligible for automatic release. Protected by |
283 | * release_list_lock */ | 322 | * release_list_lock */ |
284 | static LIST_HEAD(release_list); | 323 | static LIST_HEAD(release_list); |
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
816 | struct cgroup_subsys *ss; | 855 | struct cgroup_subsys *ss; |
817 | int ret = 0; | 856 | int ret = 0; |
818 | 857 | ||
819 | for_each_subsys(cgrp->root, ss) | 858 | for_each_subsys(cgrp->root, ss) { |
820 | if (ss->pre_destroy) { | 859 | if (!ss->pre_destroy) |
821 | ret = ss->pre_destroy(cgrp); | 860 | continue; |
822 | if (ret) | 861 | |
823 | break; | 862 | ret = ss->pre_destroy(cgrp); |
863 | if (ret) { | ||
864 | /* ->pre_destroy() failure is being deprecated */ | ||
865 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
866 | break; | ||
824 | } | 867 | } |
868 | } | ||
825 | 869 | ||
826 | return ret; | 870 | return ret; |
827 | } | 871 | } |
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
864 | BUG_ON(!list_empty(&cgrp->pidlists)); | 908 | BUG_ON(!list_empty(&cgrp->pidlists)); |
865 | 909 | ||
866 | kfree_rcu(cgrp, rcu_head); | 910 | kfree_rcu(cgrp, rcu_head); |
911 | } else { | ||
912 | struct cfent *cfe = __d_cfe(dentry); | ||
913 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | ||
914 | |||
915 | WARN_ONCE(!list_empty(&cfe->node) && | ||
916 | cgrp != &cgrp->root->top_cgroup, | ||
917 | "cfe still linked for %s\n", cfe->type->name); | ||
918 | kfree(cfe); | ||
867 | } | 919 | } |
868 | iput(inode); | 920 | iput(inode); |
869 | } | 921 | } |
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d) | |||
882 | dput(parent); | 934 | dput(parent); |
883 | } | 935 | } |
884 | 936 | ||
885 | static void cgroup_clear_directory(struct dentry *dentry) | 937 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
886 | { | 938 | { |
887 | struct list_head *node; | 939 | struct cfent *cfe; |
888 | 940 | ||
889 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 941 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
890 | spin_lock(&dentry->d_lock); | 942 | lockdep_assert_held(&cgroup_mutex); |
891 | node = dentry->d_subdirs.next; | 943 | |
892 | while (node != &dentry->d_subdirs) { | 944 | list_for_each_entry(cfe, &cgrp->files, node) { |
893 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 945 | struct dentry *d = cfe->dentry; |
894 | 946 | ||
895 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | 947 | if (cft && cfe->type != cft) |
896 | list_del_init(node); | 948 | continue; |
897 | if (d->d_inode) { | 949 | |
898 | /* This should never be called on a cgroup | 950 | dget(d); |
899 | * directory with child cgroups */ | 951 | d_delete(d); |
900 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 952 | simple_unlink(d->d_inode, d); |
901 | dget_dlock(d); | 953 | list_del_init(&cfe->node); |
902 | spin_unlock(&d->d_lock); | 954 | dput(d); |
903 | spin_unlock(&dentry->d_lock); | 955 | |
904 | d_delete(d); | 956 | return 0; |
905 | simple_unlink(dentry->d_inode, d); | ||
906 | dput(d); | ||
907 | spin_lock(&dentry->d_lock); | ||
908 | } else | ||
909 | spin_unlock(&d->d_lock); | ||
910 | node = dentry->d_subdirs.next; | ||
911 | } | 957 | } |
912 | spin_unlock(&dentry->d_lock); | 958 | return -ENOENT; |
959 | } | ||
960 | |||
961 | static void cgroup_clear_directory(struct dentry *dir) | ||
962 | { | ||
963 | struct cgroup *cgrp = __d_cgrp(dir); | ||
964 | |||
965 | while (!list_empty(&cgrp->files)) | ||
966 | cgroup_rm_file(cgrp, NULL); | ||
913 | } | 967 | } |
914 | 968 | ||
915 | /* | 969 | /* |
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1294 | if (ret) | 1348 | if (ret) |
1295 | goto out_unlock; | 1349 | goto out_unlock; |
1296 | 1350 | ||
1351 | /* See feature-removal-schedule.txt */ | ||
1352 | if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) | ||
1353 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | ||
1354 | task_tgid_nr(current), current->comm); | ||
1355 | |||
1297 | /* Don't allow flags or name to change at remount */ | 1356 | /* Don't allow flags or name to change at remount */ |
1298 | if (opts.flags != root->flags || | 1357 | if (opts.flags != root->flags || |
1299 | (opts.name && strcmp(opts.name, root->name))) { | 1358 | (opts.name && strcmp(opts.name, root->name))) { |
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1308 | goto out_unlock; | 1367 | goto out_unlock; |
1309 | } | 1368 | } |
1310 | 1369 | ||
1311 | /* (re)populate subsystem files */ | 1370 | /* clear out any existing files and repopulate subsystem files */ |
1371 | cgroup_clear_directory(cgrp->dentry); | ||
1312 | cgroup_populate_dir(cgrp); | 1372 | cgroup_populate_dir(cgrp); |
1313 | 1373 | ||
1314 | if (opts.release_agent) | 1374 | if (opts.release_agent) |
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1333 | { | 1393 | { |
1334 | INIT_LIST_HEAD(&cgrp->sibling); | 1394 | INIT_LIST_HEAD(&cgrp->sibling); |
1335 | INIT_LIST_HEAD(&cgrp->children); | 1395 | INIT_LIST_HEAD(&cgrp->children); |
1396 | INIT_LIST_HEAD(&cgrp->files); | ||
1336 | INIT_LIST_HEAD(&cgrp->css_sets); | 1397 | INIT_LIST_HEAD(&cgrp->css_sets); |
1337 | INIT_LIST_HEAD(&cgrp->release_list); | 1398 | INIT_LIST_HEAD(&cgrp->release_list); |
1338 | INIT_LIST_HEAD(&cgrp->pidlists); | 1399 | INIT_LIST_HEAD(&cgrp->pidlists); |
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1344 | static void init_cgroup_root(struct cgroupfs_root *root) | 1405 | static void init_cgroup_root(struct cgroupfs_root *root) |
1345 | { | 1406 | { |
1346 | struct cgroup *cgrp = &root->top_cgroup; | 1407 | struct cgroup *cgrp = &root->top_cgroup; |
1408 | |||
1347 | INIT_LIST_HEAD(&root->subsys_list); | 1409 | INIT_LIST_HEAD(&root->subsys_list); |
1348 | INIT_LIST_HEAD(&root->root_list); | 1410 | INIT_LIST_HEAD(&root->root_list); |
1411 | INIT_LIST_HEAD(&root->allcg_list); | ||
1349 | root->number_of_cgroups = 1; | 1412 | root->number_of_cgroups = 1; |
1350 | cgrp->root = root; | 1413 | cgrp->root = root; |
1351 | cgrp->top_cgroup = cgrp; | 1414 | cgrp->top_cgroup = cgrp; |
1415 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1352 | init_cgroup_housekeeping(cgrp); | 1416 | init_cgroup_housekeeping(cgrp); |
1353 | } | 1417 | } |
1354 | 1418 | ||
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = { | |||
1692 | 1756 | ||
1693 | static struct kobject *cgroup_kobj; | 1757 | static struct kobject *cgroup_kobj; |
1694 | 1758 | ||
1695 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
1696 | { | ||
1697 | return dentry->d_fsdata; | ||
1698 | } | ||
1699 | |||
1700 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
1701 | { | ||
1702 | return dentry->d_fsdata; | ||
1703 | } | ||
1704 | |||
1705 | /** | 1759 | /** |
1706 | * cgroup_path - generate the path of a cgroup | 1760 | * cgroup_path - generate the path of a cgroup |
1707 | * @cgrp: the cgroup in question | 1761 | * @cgrp: the cgroup in question |
@@ -2172,6 +2226,18 @@ retry_find_task: | |||
2172 | 2226 | ||
2173 | if (threadgroup) | 2227 | if (threadgroup) |
2174 | tsk = tsk->group_leader; | 2228 | tsk = tsk->group_leader; |
2229 | |||
2230 | /* | ||
2231 | * Workqueue threads may acquire PF_THREAD_BOUND and become | ||
2232 | * trapped in a cpuset, or RT worker may be born in a cgroup | ||
2233 | * with no rt_runtime allocated. Just say no. | ||
2234 | */ | ||
2235 | if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { | ||
2236 | ret = -EINVAL; | ||
2237 | rcu_read_unlock(); | ||
2238 | goto out_unlock_cgroup; | ||
2239 | } | ||
2240 | |||
2175 | get_task_struct(tsk); | 2241 | get_task_struct(tsk); |
2176 | rcu_read_unlock(); | 2242 | rcu_read_unlock(); |
2177 | 2243 | ||
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2603 | return mode; | 2669 | return mode; |
2604 | } | 2670 | } |
2605 | 2671 | ||
2606 | int cgroup_add_file(struct cgroup *cgrp, | 2672 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2607 | struct cgroup_subsys *subsys, | 2673 | const struct cftype *cft) |
2608 | const struct cftype *cft) | ||
2609 | { | 2674 | { |
2610 | struct dentry *dir = cgrp->dentry; | 2675 | struct dentry *dir = cgrp->dentry; |
2676 | struct cgroup *parent = __d_cgrp(dir); | ||
2611 | struct dentry *dentry; | 2677 | struct dentry *dentry; |
2678 | struct cfent *cfe; | ||
2612 | int error; | 2679 | int error; |
2613 | umode_t mode; | 2680 | umode_t mode; |
2614 | |||
2615 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2681 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2682 | |||
2683 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2684 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2685 | return 0; | ||
2686 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2687 | return 0; | ||
2688 | |||
2616 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2689 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2617 | strcpy(name, subsys->name); | 2690 | strcpy(name, subsys->name); |
2618 | strcat(name, "."); | 2691 | strcat(name, "."); |
2619 | } | 2692 | } |
2620 | strcat(name, cft->name); | 2693 | strcat(name, cft->name); |
2694 | |||
2621 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); | 2695 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); |
2696 | |||
2697 | cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); | ||
2698 | if (!cfe) | ||
2699 | return -ENOMEM; | ||
2700 | |||
2622 | dentry = lookup_one_len(name, dir, strlen(name)); | 2701 | dentry = lookup_one_len(name, dir, strlen(name)); |
2623 | if (!IS_ERR(dentry)) { | 2702 | if (IS_ERR(dentry)) { |
2624 | mode = cgroup_file_mode(cft); | ||
2625 | error = cgroup_create_file(dentry, mode | S_IFREG, | ||
2626 | cgrp->root->sb); | ||
2627 | if (!error) | ||
2628 | dentry->d_fsdata = (void *)cft; | ||
2629 | dput(dentry); | ||
2630 | } else | ||
2631 | error = PTR_ERR(dentry); | 2703 | error = PTR_ERR(dentry); |
2704 | goto out; | ||
2705 | } | ||
2706 | |||
2707 | mode = cgroup_file_mode(cft); | ||
2708 | error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); | ||
2709 | if (!error) { | ||
2710 | cfe->type = (void *)cft; | ||
2711 | cfe->dentry = dentry; | ||
2712 | dentry->d_fsdata = cfe; | ||
2713 | list_add_tail(&cfe->node, &parent->files); | ||
2714 | cfe = NULL; | ||
2715 | } | ||
2716 | dput(dentry); | ||
2717 | out: | ||
2718 | kfree(cfe); | ||
2632 | return error; | 2719 | return error; |
2633 | } | 2720 | } |
2634 | EXPORT_SYMBOL_GPL(cgroup_add_file); | ||
2635 | 2721 | ||
2636 | int cgroup_add_files(struct cgroup *cgrp, | 2722 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2637 | struct cgroup_subsys *subsys, | 2723 | const struct cftype cfts[], bool is_add) |
2638 | const struct cftype cft[], | ||
2639 | int count) | ||
2640 | { | 2724 | { |
2641 | int i, err; | 2725 | const struct cftype *cft; |
2642 | for (i = 0; i < count; i++) { | 2726 | int err, ret = 0; |
2643 | err = cgroup_add_file(cgrp, subsys, &cft[i]); | 2727 | |
2644 | if (err) | 2728 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2645 | return err; | 2729 | if (is_add) |
2730 | err = cgroup_add_file(cgrp, subsys, cft); | ||
2731 | else | ||
2732 | err = cgroup_rm_file(cgrp, cft); | ||
2733 | if (err) { | ||
2734 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
2735 | is_add ? "add" : "remove", cft->name, err); | ||
2736 | ret = err; | ||
2737 | } | ||
2738 | } | ||
2739 | return ret; | ||
2740 | } | ||
2741 | |||
2742 | static DEFINE_MUTEX(cgroup_cft_mutex); | ||
2743 | |||
2744 | static void cgroup_cfts_prepare(void) | ||
2745 | __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) | ||
2746 | { | ||
2747 | /* | ||
2748 | * Thanks to the entanglement with vfs inode locking, we can't walk | ||
2749 | * the existing cgroups under cgroup_mutex and create files. | ||
2750 | * Instead, we increment reference on all cgroups and build list of | ||
2751 | * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure | ||
2752 | * exclusive access to the field. | ||
2753 | */ | ||
2754 | mutex_lock(&cgroup_cft_mutex); | ||
2755 | mutex_lock(&cgroup_mutex); | ||
2756 | } | ||
2757 | |||
2758 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | ||
2759 | const struct cftype *cfts, bool is_add) | ||
2760 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | ||
2761 | { | ||
2762 | LIST_HEAD(pending); | ||
2763 | struct cgroup *cgrp, *n; | ||
2764 | |||
2765 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | ||
2766 | if (cfts && ss->root != &rootnode) { | ||
2767 | list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { | ||
2768 | dget(cgrp->dentry); | ||
2769 | list_add_tail(&cgrp->cft_q_node, &pending); | ||
2770 | } | ||
2771 | } | ||
2772 | |||
2773 | mutex_unlock(&cgroup_mutex); | ||
2774 | |||
2775 | /* | ||
2776 | * All new cgroups will see @cfts update on @ss->cftsets. Add/rm | ||
2777 | * files for all cgroups which were created before. | ||
2778 | */ | ||
2779 | list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { | ||
2780 | struct inode *inode = cgrp->dentry->d_inode; | ||
2781 | |||
2782 | mutex_lock(&inode->i_mutex); | ||
2783 | mutex_lock(&cgroup_mutex); | ||
2784 | if (!cgroup_is_removed(cgrp)) | ||
2785 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | ||
2786 | mutex_unlock(&cgroup_mutex); | ||
2787 | mutex_unlock(&inode->i_mutex); | ||
2788 | |||
2789 | list_del_init(&cgrp->cft_q_node); | ||
2790 | dput(cgrp->dentry); | ||
2646 | } | 2791 | } |
2792 | |||
2793 | mutex_unlock(&cgroup_cft_mutex); | ||
2794 | } | ||
2795 | |||
2796 | /** | ||
2797 | * cgroup_add_cftypes - add an array of cftypes to a subsystem | ||
2798 | * @ss: target cgroup subsystem | ||
2799 | * @cfts: zero-length name terminated array of cftypes | ||
2800 | * | ||
2801 | * Register @cfts to @ss. Files described by @cfts are created for all | ||
2802 | * existing cgroups to which @ss is attached and all future cgroups will | ||
2803 | * have them too. This function can be called anytime whether @ss is | ||
2804 | * attached or not. | ||
2805 | * | ||
2806 | * Returns 0 on successful registration, -errno on failure. Note that this | ||
2807 | * function currently returns 0 as long as @cfts registration is successful | ||
2808 | * even if some file creation attempts on existing cgroups fail. | ||
2809 | */ | ||
2810 | int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2811 | { | ||
2812 | struct cftype_set *set; | ||
2813 | |||
2814 | set = kzalloc(sizeof(*set), GFP_KERNEL); | ||
2815 | if (!set) | ||
2816 | return -ENOMEM; | ||
2817 | |||
2818 | cgroup_cfts_prepare(); | ||
2819 | set->cfts = cfts; | ||
2820 | list_add_tail(&set->node, &ss->cftsets); | ||
2821 | cgroup_cfts_commit(ss, cfts, true); | ||
2822 | |||
2647 | return 0; | 2823 | return 0; |
2648 | } | 2824 | } |
2649 | EXPORT_SYMBOL_GPL(cgroup_add_files); | 2825 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
2826 | |||
2827 | /** | ||
2828 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | ||
2829 | * @ss: target cgroup subsystem | ||
2830 | * @cfts: zero-length name terminated array of cftypes | ||
2831 | * | ||
2832 | * Unregister @cfts from @ss. Files described by @cfts are removed from | ||
2833 | * all existing cgroups to which @ss is attached and all future cgroups | ||
2834 | * won't have them either. This function can be called anytime whether @ss | ||
2835 | * is attached or not. | ||
2836 | * | ||
2837 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | ||
2838 | * registered with @ss. | ||
2839 | */ | ||
2840 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2841 | { | ||
2842 | struct cftype_set *set; | ||
2843 | |||
2844 | cgroup_cfts_prepare(); | ||
2845 | |||
2846 | list_for_each_entry(set, &ss->cftsets, node) { | ||
2847 | if (set->cfts == cfts) { | ||
2848 | list_del_init(&set->node); | ||
2849 | cgroup_cfts_commit(ss, cfts, false); | ||
2850 | return 0; | ||
2851 | } | ||
2852 | } | ||
2853 | |||
2854 | cgroup_cfts_commit(ss, NULL, false); | ||
2855 | return -ENOENT; | ||
2856 | } | ||
2650 | 2857 | ||
2651 | /** | 2858 | /** |
2652 | * cgroup_task_count - count the number of tasks in a cgroup. | 2859 | * cgroup_task_count - count the number of tasks in a cgroup. |
@@ -3625,13 +3832,14 @@ static struct cftype files[] = { | |||
3625 | .read_u64 = cgroup_clone_children_read, | 3832 | .read_u64 = cgroup_clone_children_read, |
3626 | .write_u64 = cgroup_clone_children_write, | 3833 | .write_u64 = cgroup_clone_children_write, |
3627 | }, | 3834 | }, |
3628 | }; | 3835 | { |
3629 | 3836 | .name = "release_agent", | |
3630 | static struct cftype cft_release_agent = { | 3837 | .flags = CFTYPE_ONLY_ON_ROOT, |
3631 | .name = "release_agent", | 3838 | .read_seq_string = cgroup_release_agent_show, |
3632 | .read_seq_string = cgroup_release_agent_show, | 3839 | .write_string = cgroup_release_agent_write, |
3633 | .write_string = cgroup_release_agent_write, | 3840 | .max_write_len = PATH_MAX, |
3634 | .max_write_len = PATH_MAX, | 3841 | }, |
3842 | { } /* terminate */ | ||
3635 | }; | 3843 | }; |
3636 | 3844 | ||
3637 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3845 | static int cgroup_populate_dir(struct cgroup *cgrp) |
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3639 | int err; | 3847 | int err; |
3640 | struct cgroup_subsys *ss; | 3848 | struct cgroup_subsys *ss; |
3641 | 3849 | ||
3642 | /* First clear out any existing files */ | 3850 | err = cgroup_addrm_files(cgrp, NULL, files, true); |
3643 | cgroup_clear_directory(cgrp->dentry); | ||
3644 | |||
3645 | err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); | ||
3646 | if (err < 0) | 3851 | if (err < 0) |
3647 | return err; | 3852 | return err; |
3648 | 3853 | ||
3649 | if (cgrp == cgrp->top_cgroup) { | 3854 | /* process cftsets of each subsystem */ |
3650 | if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) | ||
3651 | return err; | ||
3652 | } | ||
3653 | |||
3654 | for_each_subsys(cgrp->root, ss) { | 3855 | for_each_subsys(cgrp->root, ss) { |
3655 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) | 3856 | struct cftype_set *set; |
3656 | return err; | 3857 | |
3858 | list_for_each_entry(set, &ss->cftsets, node) | ||
3859 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | ||
3657 | } | 3860 | } |
3861 | |||
3658 | /* This cgroup is ready now */ | 3862 | /* This cgroup is ready now */ |
3659 | for_each_subsys(cgrp->root, ss) { | 3863 | for_each_subsys(cgrp->root, ss) { |
3660 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3864 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3670 | return 0; | 3874 | return 0; |
3671 | } | 3875 | } |
3672 | 3876 | ||
3877 | static void css_dput_fn(struct work_struct *work) | ||
3878 | { | ||
3879 | struct cgroup_subsys_state *css = | ||
3880 | container_of(work, struct cgroup_subsys_state, dput_work); | ||
3881 | |||
3882 | dput(css->cgroup->dentry); | ||
3883 | } | ||
3884 | |||
3673 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3885 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
3674 | struct cgroup_subsys *ss, | 3886 | struct cgroup_subsys *ss, |
3675 | struct cgroup *cgrp) | 3887 | struct cgroup *cgrp) |
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
3682 | set_bit(CSS_ROOT, &css->flags); | 3894 | set_bit(CSS_ROOT, &css->flags); |
3683 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 3895 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
3684 | cgrp->subsys[ss->subsys_id] = css; | 3896 | cgrp->subsys[ss->subsys_id] = css; |
3897 | |||
3898 | /* | ||
3899 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | ||
3900 | * which is put on the last css_put(). dput() requires process | ||
3901 | * context, which css_put() may be called without. @css->dput_work | ||
3902 | * will be used to invoke dput() asynchronously from css_put(). | ||
3903 | */ | ||
3904 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
3905 | if (ss->__DEPRECATED_clear_css_refs) | ||
3906 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | ||
3685 | } | 3907 | } |
3686 | 3908 | ||
3687 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | 3909 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) |
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3784 | if (err < 0) | 4006 | if (err < 0) |
3785 | goto err_remove; | 4007 | goto err_remove; |
3786 | 4008 | ||
4009 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4010 | for_each_subsys(root, ss) | ||
4011 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4012 | dget(dentry); | ||
4013 | |||
3787 | /* The cgroup directory was pre-locked for us */ | 4014 | /* The cgroup directory was pre-locked for us */ |
3788 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4015 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
3789 | 4016 | ||
4017 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4018 | |||
3790 | err = cgroup_populate_dir(cgrp); | 4019 | err = cgroup_populate_dir(cgrp); |
3791 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4020 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
3792 | 4021 | ||
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
3826 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4055 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
3827 | } | 4056 | } |
3828 | 4057 | ||
4058 | /* | ||
4059 | * Check the reference count on each subsystem. Since we already | ||
4060 | * established that there are no tasks in the cgroup, if the css refcount | ||
4061 | * is also 1, then there should be no outstanding references, so the | ||
4062 | * subsystem is safe to destroy. We scan across all subsystems rather than | ||
4063 | * using the per-hierarchy linked list of mounted subsystems since we can | ||
4064 | * be called via check_for_release() with no synchronization other than | ||
4065 | * RCU, and the subsystem linked list isn't RCU-safe. | ||
4066 | */ | ||
3829 | static int cgroup_has_css_refs(struct cgroup *cgrp) | 4067 | static int cgroup_has_css_refs(struct cgroup *cgrp) |
3830 | { | 4068 | { |
3831 | /* Check the reference count on each subsystem. Since we | ||
3832 | * already established that there are no tasks in the | ||
3833 | * cgroup, if the css refcount is also 1, then there should | ||
3834 | * be no outstanding references, so the subsystem is safe to | ||
3835 | * destroy. We scan across all subsystems rather than using | ||
3836 | * the per-hierarchy linked list of mounted subsystems since | ||
3837 | * we can be called via check_for_release() with no | ||
3838 | * synchronization other than RCU, and the subsystem linked | ||
3839 | * list isn't RCU-safe */ | ||
3840 | int i; | 4069 | int i; |
4070 | |||
3841 | /* | 4071 | /* |
3842 | * We won't need to lock the subsys array, because the subsystems | 4072 | * We won't need to lock the subsys array, because the subsystems |
3843 | * we're concerned about aren't going anywhere since our cgroup root | 4073 | * we're concerned about aren't going anywhere since our cgroup root |
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4076 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3847 | struct cgroup_subsys *ss = subsys[i]; | 4077 | struct cgroup_subsys *ss = subsys[i]; |
3848 | struct cgroup_subsys_state *css; | 4078 | struct cgroup_subsys_state *css; |
4079 | |||
3849 | /* Skip subsystems not present or not in this hierarchy */ | 4080 | /* Skip subsystems not present or not in this hierarchy */ |
3850 | if (ss == NULL || ss->root != cgrp->root) | 4081 | if (ss == NULL || ss->root != cgrp->root) |
3851 | continue; | 4082 | continue; |
4083 | |||
3852 | css = cgrp->subsys[ss->subsys_id]; | 4084 | css = cgrp->subsys[ss->subsys_id]; |
3853 | /* When called from check_for_release() it's possible | 4085 | /* |
4086 | * When called from check_for_release() it's possible | ||
3854 | * that by this point the cgroup has been removed | 4087 | * that by this point the cgroup has been removed |
3855 | * and the css deleted. But a false-positive doesn't | 4088 | * and the css deleted. But a false-positive doesn't |
3856 | * matter, since it can only happen if the cgroup | 4089 | * matter, since it can only happen if the cgroup |
3857 | * has been deleted and hence no longer needs the | 4090 | * has been deleted and hence no longer needs the |
3858 | * release agent to be called anyway. */ | 4091 | * release agent to be called anyway. |
3859 | if (css && (atomic_read(&css->refcnt) > 1)) | 4092 | */ |
4093 | if (css && css_refcnt(css) > 1) | ||
3860 | return 1; | 4094 | return 1; |
3861 | } | 4095 | } |
3862 | return 0; | 4096 | return 0; |
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3866 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4100 | * Atomically mark all (or else none) of the cgroup's CSS objects as |
3867 | * CSS_REMOVED. Return true on success, or false if the cgroup has | 4101 | * CSS_REMOVED. Return true on success, or false if the cgroup has |
3868 | * busy subsystems. Call with cgroup_mutex held | 4102 | * busy subsystems. Call with cgroup_mutex held |
4103 | * | ||
4104 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4105 | * not, cgroup removal behaves differently. | ||
4106 | * | ||
4107 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4108 | * cgroup removal can be committed. This is implemented by | ||
4109 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4110 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4111 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4112 | * removed as soon as the existing user (memcg) is updated. | ||
4113 | * | ||
4114 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4115 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4116 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4117 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4118 | * is put so that dentry destruction happens only after all css's are | ||
4119 | * released. | ||
3869 | */ | 4120 | */ |
3870 | |||
3871 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | 4121 | static int cgroup_clear_css_refs(struct cgroup *cgrp) |
3872 | { | 4122 | { |
3873 | struct cgroup_subsys *ss; | 4123 | struct cgroup_subsys *ss; |
3874 | unsigned long flags; | 4124 | unsigned long flags; |
3875 | bool failed = false; | 4125 | bool failed = false; |
4126 | |||
3876 | local_irq_save(flags); | 4127 | local_irq_save(flags); |
4128 | |||
4129 | /* | ||
4130 | * Block new css_tryget() by deactivating refcnt. If all refcnts | ||
4131 | * for subsystems w/ clear_css_refs set were 1 at the moment of | ||
4132 | * deactivation, we succeeded. | ||
4133 | */ | ||
3877 | for_each_subsys(cgrp->root, ss) { | 4134 | for_each_subsys(cgrp->root, ss) { |
3878 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4135 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3879 | int refcnt; | 4136 | |
3880 | while (1) { | 4137 | WARN_ON(atomic_read(&css->refcnt) < 0); |
3881 | /* We can only remove a CSS with a refcnt==1 */ | 4138 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
3882 | refcnt = atomic_read(&css->refcnt); | 4139 | |
3883 | if (refcnt > 1) { | 4140 | if (ss->__DEPRECATED_clear_css_refs) |
3884 | failed = true; | 4141 | failed |= css_refcnt(css) != 1; |
3885 | goto done; | ||
3886 | } | ||
3887 | BUG_ON(!refcnt); | ||
3888 | /* | ||
3889 | * Drop the refcnt to 0 while we check other | ||
3890 | * subsystems. This will cause any racing | ||
3891 | * css_tryget() to spin until we set the | ||
3892 | * CSS_REMOVED bits or abort | ||
3893 | */ | ||
3894 | if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) | ||
3895 | break; | ||
3896 | cpu_relax(); | ||
3897 | } | ||
3898 | } | 4142 | } |
3899 | done: | 4143 | |
4144 | /* | ||
4145 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4146 | * restore refcnts to positive values. Either way, all in-progress | ||
4147 | * css_tryget() will be released. | ||
4148 | */ | ||
3900 | for_each_subsys(cgrp->root, ss) { | 4149 | for_each_subsys(cgrp->root, ss) { |
3901 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4150 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3902 | if (failed) { | 4151 | |
3903 | /* | 4152 | if (!failed) { |
3904 | * Restore old refcnt if we previously managed | ||
3905 | * to clear it from 1 to 0 | ||
3906 | */ | ||
3907 | if (!atomic_read(&css->refcnt)) | ||
3908 | atomic_set(&css->refcnt, 1); | ||
3909 | } else { | ||
3910 | /* Commit the fact that the CSS is removed */ | ||
3911 | set_bit(CSS_REMOVED, &css->flags); | 4153 | set_bit(CSS_REMOVED, &css->flags); |
4154 | css_put(css); | ||
4155 | } else { | ||
4156 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
3912 | } | 4157 | } |
3913 | } | 4158 | } |
4159 | |||
3914 | local_irq_restore(flags); | 4160 | local_irq_restore(flags); |
3915 | return !failed; | 4161 | return !failed; |
3916 | } | 4162 | } |
@@ -3995,6 +4241,8 @@ again: | |||
3995 | list_del_init(&cgrp->sibling); | 4241 | list_del_init(&cgrp->sibling); |
3996 | cgroup_unlock_hierarchy(cgrp->root); | 4242 | cgroup_unlock_hierarchy(cgrp->root); |
3997 | 4243 | ||
4244 | list_del_init(&cgrp->allcg_node); | ||
4245 | |||
3998 | d = dget(cgrp->dentry); | 4246 | d = dget(cgrp->dentry); |
3999 | 4247 | ||
4000 | cgroup_d_remove_dir(d); | 4248 | cgroup_d_remove_dir(d); |
@@ -4021,12 +4269,29 @@ again: | |||
4021 | return 0; | 4269 | return 0; |
4022 | } | 4270 | } |
4023 | 4271 | ||
4272 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | ||
4273 | { | ||
4274 | INIT_LIST_HEAD(&ss->cftsets); | ||
4275 | |||
4276 | /* | ||
4277 | * base_cftset is embedded in subsys itself, no need to worry about | ||
4278 | * deregistration. | ||
4279 | */ | ||
4280 | if (ss->base_cftypes) { | ||
4281 | ss->base_cftset.cfts = ss->base_cftypes; | ||
4282 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | ||
4283 | } | ||
4284 | } | ||
4285 | |||
4024 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4286 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) |
4025 | { | 4287 | { |
4026 | struct cgroup_subsys_state *css; | 4288 | struct cgroup_subsys_state *css; |
4027 | 4289 | ||
4028 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4290 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4029 | 4291 | ||
4292 | /* init base cftset */ | ||
4293 | cgroup_init_cftsets(ss); | ||
4294 | |||
4030 | /* Create the top cgroup state for this subsystem */ | 4295 | /* Create the top cgroup state for this subsystem */ |
4031 | list_add(&ss->sibling, &rootnode.subsys_list); | 4296 | list_add(&ss->sibling, &rootnode.subsys_list); |
4032 | ss->root = &rootnode; | 4297 | ss->root = &rootnode; |
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4096 | return 0; | 4361 | return 0; |
4097 | } | 4362 | } |
4098 | 4363 | ||
4364 | /* init base cftset */ | ||
4365 | cgroup_init_cftsets(ss); | ||
4366 | |||
4099 | /* | 4367 | /* |
4100 | * need to register a subsys id before anything else - for example, | 4368 | * need to register a subsys id before anything else - for example, |
4101 | * init_cgroup_css needs it. | 4369 | * init_cgroup_css needs it. |
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp) | |||
4685 | } | 4953 | } |
4686 | 4954 | ||
4687 | /* Caller must verify that the css is not for root cgroup */ | 4955 | /* Caller must verify that the css is not for root cgroup */ |
4688 | void __css_put(struct cgroup_subsys_state *css, int count) | 4956 | bool __css_tryget(struct cgroup_subsys_state *css) |
4957 | { | ||
4958 | do { | ||
4959 | int v = css_refcnt(css); | ||
4960 | |||
4961 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | ||
4962 | return true; | ||
4963 | cpu_relax(); | ||
4964 | } while (!test_bit(CSS_REMOVED, &css->flags)); | ||
4965 | |||
4966 | return false; | ||
4967 | } | ||
4968 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
4969 | |||
4970 | /* Caller must verify that the css is not for root cgroup */ | ||
4971 | void __css_put(struct cgroup_subsys_state *css) | ||
4689 | { | 4972 | { |
4690 | struct cgroup *cgrp = css->cgroup; | 4973 | struct cgroup *cgrp = css->cgroup; |
4691 | int val; | 4974 | |
4692 | rcu_read_lock(); | 4975 | rcu_read_lock(); |
4693 | val = atomic_sub_return(count, &css->refcnt); | 4976 | atomic_dec(&css->refcnt); |
4694 | if (val == 1) { | 4977 | switch (css_refcnt(css)) { |
4978 | case 1: | ||
4695 | if (notify_on_release(cgrp)) { | 4979 | if (notify_on_release(cgrp)) { |
4696 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4980 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
4697 | check_for_release(cgrp); | 4981 | check_for_release(cgrp); |
4698 | } | 4982 | } |
4699 | cgroup_wakeup_rmdir_waiter(cgrp); | 4983 | cgroup_wakeup_rmdir_waiter(cgrp); |
4984 | break; | ||
4985 | case 0: | ||
4986 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | ||
4987 | schedule_work(&css->dput_work); | ||
4988 | break; | ||
4700 | } | 4989 | } |
4701 | rcu_read_unlock(); | 4990 | rcu_read_unlock(); |
4702 | WARN_ON_ONCE(val < 1); | ||
4703 | } | 4991 | } |
4704 | EXPORT_SYMBOL_GPL(__css_put); | 4992 | EXPORT_SYMBOL_GPL(__css_put); |
4705 | 4993 | ||
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
4818 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5106 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
4819 | * it's unchanged until freed. | 5107 | * it's unchanged until freed. |
4820 | */ | 5108 | */ |
4821 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5109 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4822 | 5110 | ||
4823 | if (cssid) | 5111 | if (cssid) |
4824 | return cssid->id; | 5112 | return cssid->id; |
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
4830 | { | 5118 | { |
4831 | struct css_id *cssid; | 5119 | struct css_id *cssid; |
4832 | 5120 | ||
4833 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5121 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4834 | 5122 | ||
4835 | if (cssid) | 5123 | if (cssid) |
4836 | return cssid->depth; | 5124 | return cssid->depth; |
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = { | |||
5211 | .name = "releasable", | 5499 | .name = "releasable", |
5212 | .read_u64 = releasable_read, | 5500 | .read_u64 = releasable_read, |
5213 | }, | 5501 | }, |
5214 | }; | ||
5215 | 5502 | ||
5216 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 5503 | { } /* terminate */ |
5217 | { | 5504 | }; |
5218 | return cgroup_add_files(cont, ss, debug_files, | ||
5219 | ARRAY_SIZE(debug_files)); | ||
5220 | } | ||
5221 | 5505 | ||
5222 | struct cgroup_subsys debug_subsys = { | 5506 | struct cgroup_subsys debug_subsys = { |
5223 | .name = "debug", | 5507 | .name = "debug", |
5224 | .create = debug_create, | 5508 | .create = debug_create, |
5225 | .destroy = debug_destroy, | 5509 | .destroy = debug_destroy, |
5226 | .populate = debug_populate, | ||
5227 | .subsys_id = debug_subsys_id, | 5510 | .subsys_id = debug_subsys_id, |
5511 | .base_cftypes = debug_files, | ||
5228 | }; | 5512 | }; |
5229 | #endif /* CONFIG_CGROUP_DEBUG */ | 5513 | #endif /* CONFIG_CGROUP_DEBUG */ |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f86e93920b62..3649fc6b3eaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup, | |||
358 | static struct cftype files[] = { | 358 | static struct cftype files[] = { |
359 | { | 359 | { |
360 | .name = "state", | 360 | .name = "state", |
361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
361 | .read_seq_string = freezer_read, | 362 | .read_seq_string = freezer_read, |
362 | .write_string = freezer_write, | 363 | .write_string = freezer_write, |
363 | }, | 364 | }, |
365 | { } /* terminate */ | ||
364 | }; | 366 | }; |
365 | 367 | ||
366 | static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) | ||
367 | { | ||
368 | if (!cgroup->parent) | ||
369 | return 0; | ||
370 | return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); | ||
371 | } | ||
372 | |||
373 | struct cgroup_subsys freezer_subsys = { | 368 | struct cgroup_subsys freezer_subsys = { |
374 | .name = "freezer", | 369 | .name = "freezer", |
375 | .create = freezer_create, | 370 | .create = freezer_create, |
376 | .destroy = freezer_destroy, | 371 | .destroy = freezer_destroy, |
377 | .populate = freezer_populate, | ||
378 | .subsys_id = freezer_subsys_id, | 372 | .subsys_id = freezer_subsys_id, |
379 | .can_attach = freezer_can_attach, | 373 | .can_attach = freezer_can_attach, |
380 | .fork = freezer_fork, | 374 | .fork = freezer_fork, |
375 | .base_cftypes = files, | ||
381 | }; | 376 | }; |
diff --git a/kernel/compat.c b/kernel/compat.c index 74ff8498809a..d2c67aa49ae6 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) | |||
372 | 372 | ||
373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
374 | 374 | ||
375 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | 375 | /* |
376 | compat_old_sigset_t __user *oset) | 376 | * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the |
377 | * blocked set of signals to the supplied signal set | ||
378 | */ | ||
379 | static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) | ||
377 | { | 380 | { |
378 | old_sigset_t s; | 381 | memcpy(blocked->sig, &set, sizeof(set)); |
379 | long ret; | 382 | } |
380 | mm_segment_t old_fs; | ||
381 | 383 | ||
382 | if (set && get_user(s, set)) | 384 | asmlinkage long compat_sys_sigprocmask(int how, |
383 | return -EFAULT; | 385 | compat_old_sigset_t __user *nset, |
384 | old_fs = get_fs(); | 386 | compat_old_sigset_t __user *oset) |
385 | set_fs(KERNEL_DS); | 387 | { |
386 | ret = sys_sigprocmask(how, | 388 | old_sigset_t old_set, new_set; |
387 | set ? (old_sigset_t __user *) &s : NULL, | 389 | sigset_t new_blocked; |
388 | oset ? (old_sigset_t __user *) &s : NULL); | 390 | |
389 | set_fs(old_fs); | 391 | old_set = current->blocked.sig[0]; |
390 | if (ret == 0) | 392 | |
391 | if (oset) | 393 | if (nset) { |
392 | ret = put_user(s, oset); | 394 | if (get_user(new_set, nset)) |
393 | return ret; | 395 | return -EFAULT; |
396 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
397 | |||
398 | new_blocked = current->blocked; | ||
399 | |||
400 | switch (how) { | ||
401 | case SIG_BLOCK: | ||
402 | sigaddsetmask(&new_blocked, new_set); | ||
403 | break; | ||
404 | case SIG_UNBLOCK: | ||
405 | sigdelsetmask(&new_blocked, new_set); | ||
406 | break; | ||
407 | case SIG_SETMASK: | ||
408 | compat_sig_setmask(&new_blocked, new_set); | ||
409 | break; | ||
410 | default: | ||
411 | return -EINVAL; | ||
412 | } | ||
413 | |||
414 | set_current_blocked(&new_blocked); | ||
415 | } | ||
416 | |||
417 | if (oset) { | ||
418 | if (put_user(old_set, oset)) | ||
419 | return -EFAULT; | ||
420 | } | ||
421 | |||
422 | return 0; | ||
394 | } | 423 | } |
395 | 424 | ||
396 | #endif | 425 | #endif |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 2060c6e57027..0e6353cf147a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/gfp.h> | 17 | #include <linux/gfp.h> |
18 | #include <linux/suspend.h> | 18 | #include <linux/suspend.h> |
19 | 19 | ||
20 | #include "smpboot.h" | ||
21 | |||
20 | #ifdef CONFIG_SMP | 22 | #ifdef CONFIG_SMP |
21 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 23 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
22 | static DEFINE_MUTEX(cpu_add_remove_lock); | 24 | static DEFINE_MUTEX(cpu_add_remove_lock); |
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
295 | int ret, nr_calls = 0; | 297 | int ret, nr_calls = 0; |
296 | void *hcpu = (void *)(long)cpu; | 298 | void *hcpu = (void *)(long)cpu; |
297 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 299 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
300 | struct task_struct *idle; | ||
298 | 301 | ||
299 | if (cpu_online(cpu) || !cpu_present(cpu)) | 302 | if (cpu_online(cpu) || !cpu_present(cpu)) |
300 | return -EINVAL; | 303 | return -EINVAL; |
301 | 304 | ||
302 | cpu_hotplug_begin(); | 305 | cpu_hotplug_begin(); |
306 | |||
307 | idle = idle_thread_get(cpu); | ||
308 | if (IS_ERR(idle)) { | ||
309 | ret = PTR_ERR(idle); | ||
310 | goto out; | ||
311 | } | ||
312 | |||
303 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 313 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
304 | if (ret) { | 314 | if (ret) { |
305 | nr_calls--; | 315 | nr_calls--; |
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
309 | } | 319 | } |
310 | 320 | ||
311 | /* Arch-specific enabling code. */ | 321 | /* Arch-specific enabling code. */ |
312 | ret = __cpu_up(cpu); | 322 | ret = __cpu_up(cpu, idle); |
313 | if (ret != 0) | 323 | if (ret != 0) |
314 | goto out_notify; | 324 | goto out_notify; |
315 | BUG_ON(!cpu_online(cpu)); | 325 | BUG_ON(!cpu_online(cpu)); |
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
320 | out_notify: | 330 | out_notify: |
321 | if (ret != 0) | 331 | if (ret != 0) |
322 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 332 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
333 | out: | ||
323 | cpu_hotplug_done(); | 334 | cpu_hotplug_done(); |
324 | 335 | ||
325 | return ret; | 336 | return ret; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b96ad75b7e64..8c8bd652dd12 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = { | |||
270 | * are online. If none are online, walk up the cpuset hierarchy | 270 | * are online. If none are online, walk up the cpuset hierarchy |
271 | * until we find one that does have some online cpus. If we get | 271 | * until we find one that does have some online cpus. If we get |
272 | * all the way to the top and still haven't found any online cpus, | 272 | * all the way to the top and still haven't found any online cpus, |
273 | * return cpu_online_map. Or if passed a NULL cs from an exit'ing | 273 | * return cpu_online_mask. Or if passed a NULL cs from an exit'ing |
274 | * task, return cpu_online_map. | 274 | * task, return cpu_online_mask. |
275 | * | 275 | * |
276 | * One way or another, we guarantee to return some non-empty subset | 276 | * One way or another, we guarantee to return some non-empty subset |
277 | * of cpu_online_map. | 277 | * of cpu_online_mask. |
278 | * | 278 | * |
279 | * Call with callback_mutex held. | 279 | * Call with callback_mutex held. |
280 | */ | 280 | */ |
@@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
867 | int retval; | 867 | int retval; |
868 | int is_load_balanced; | 868 | int is_load_balanced; |
869 | 869 | ||
870 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | 870 | /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ |
871 | if (cs == &top_cpuset) | 871 | if (cs == &top_cpuset) |
872 | return -EACCES; | 872 | return -EACCES; |
873 | 873 | ||
@@ -1765,28 +1765,17 @@ static struct cftype files[] = { | |||
1765 | .write_u64 = cpuset_write_u64, | 1765 | .write_u64 = cpuset_write_u64, |
1766 | .private = FILE_SPREAD_SLAB, | 1766 | .private = FILE_SPREAD_SLAB, |
1767 | }, | 1767 | }, |
1768 | }; | ||
1769 | |||
1770 | static struct cftype cft_memory_pressure_enabled = { | ||
1771 | .name = "memory_pressure_enabled", | ||
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }; | ||
1776 | 1768 | ||
1777 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 1769 | { |
1778 | { | 1770 | .name = "memory_pressure_enabled", |
1779 | int err; | 1771 | .flags = CFTYPE_ONLY_ON_ROOT, |
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }, | ||
1780 | 1776 | ||
1781 | err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 1777 | { } /* terminate */ |
1782 | if (err) | 1778 | }; |
1783 | return err; | ||
1784 | /* memory_pressure_enabled is in root cpuset only */ | ||
1785 | if (!cont->parent) | ||
1786 | err = cgroup_add_file(cont, ss, | ||
1787 | &cft_memory_pressure_enabled); | ||
1788 | return err; | ||
1789 | } | ||
1790 | 1779 | ||
1791 | /* | 1780 | /* |
1792 | * post_clone() is called during cgroup_create() when the | 1781 | * post_clone() is called during cgroup_create() when the |
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
1887 | .destroy = cpuset_destroy, | 1876 | .destroy = cpuset_destroy, |
1888 | .can_attach = cpuset_can_attach, | 1877 | .can_attach = cpuset_can_attach, |
1889 | .attach = cpuset_attach, | 1878 | .attach = cpuset_attach, |
1890 | .populate = cpuset_populate, | ||
1891 | .post_clone = cpuset_post_clone, | 1879 | .post_clone = cpuset_post_clone, |
1892 | .subsys_id = cpuset_subsys_id, | 1880 | .subsys_id = cpuset_subsys_id, |
1881 | .base_cftypes = files, | ||
1893 | .early_init = 1, | 1882 | .early_init = 1, |
1894 | }; | 1883 | }; |
1895 | 1884 | ||
@@ -2149,7 +2138,7 @@ void __init cpuset_init_smp(void) | |||
2149 | * | 2138 | * |
2150 | * Description: Returns the cpumask_var_t cpus_allowed of the cpuset | 2139 | * Description: Returns the cpumask_var_t cpus_allowed of the cpuset |
2151 | * attached to the specified @tsk. Guaranteed to return some non-empty | 2140 | * attached to the specified @tsk. Guaranteed to return some non-empty |
2152 | * subset of cpu_online_map, even if this means going outside the | 2141 | * subset of cpu_online_mask, even if this means going outside the |
2153 | * tasks cpuset. | 2142 | * tasks cpuset. |
2154 | **/ | 2143 | **/ |
2155 | 2144 | ||
diff --git a/kernel/cred.c b/kernel/cred.c index eddc5e2e9587..430557ea488f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -396,6 +396,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
396 | struct cred *new; | 396 | struct cred *new; |
397 | int ret; | 397 | int ret; |
398 | 398 | ||
399 | p->replacement_session_keyring = NULL; | ||
400 | |||
399 | if ( | 401 | if ( |
400 | #ifdef CONFIG_KEYS | 402 | #ifdef CONFIG_KEYS |
401 | !p->cred->thread_keyring && | 403 | !p->cred->thread_keyring && |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1dc53bae56e1..0557f24c6bca 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -160,37 +160,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup); | |||
160 | * Weak aliases for breakpoint management, | 160 | * Weak aliases for breakpoint management, |
161 | * can be overriden by architectures when needed: | 161 | * can be overriden by architectures when needed: |
162 | */ | 162 | */ |
163 | int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) | 163 | int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) |
164 | { | 164 | { |
165 | int err; | 165 | int err; |
166 | 166 | ||
167 | err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); | 167 | err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, |
168 | BREAK_INSTR_SIZE); | ||
168 | if (err) | 169 | if (err) |
169 | return err; | 170 | return err; |
170 | 171 | err = probe_kernel_write((char *)bpt->bpt_addr, | |
171 | return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, | 172 | arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); |
172 | BREAK_INSTR_SIZE); | 173 | return err; |
173 | } | 174 | } |
174 | 175 | ||
175 | int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) | 176 | int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) |
176 | { | 177 | { |
177 | return probe_kernel_write((char *)addr, | 178 | return probe_kernel_write((char *)bpt->bpt_addr, |
178 | (char *)bundle, BREAK_INSTR_SIZE); | 179 | (char *)bpt->saved_instr, BREAK_INSTR_SIZE); |
179 | } | 180 | } |
180 | 181 | ||
181 | int __weak kgdb_validate_break_address(unsigned long addr) | 182 | int __weak kgdb_validate_break_address(unsigned long addr) |
182 | { | 183 | { |
183 | char tmp_variable[BREAK_INSTR_SIZE]; | 184 | struct kgdb_bkpt tmp; |
184 | int err; | 185 | int err; |
185 | /* Validate setting the breakpoint and then removing it. In the | 186 | /* Validate setting the breakpoint and then removing it. If the |
186 | * remove fails, the kernel needs to emit a bad message because we | 187 | * remove fails, the kernel needs to emit a bad message because we |
187 | * are deep trouble not being able to put things back the way we | 188 | * are deep trouble not being able to put things back the way we |
188 | * found them. | 189 | * found them. |
189 | */ | 190 | */ |
190 | err = kgdb_arch_set_breakpoint(addr, tmp_variable); | 191 | tmp.bpt_addr = addr; |
192 | err = kgdb_arch_set_breakpoint(&tmp); | ||
191 | if (err) | 193 | if (err) |
192 | return err; | 194 | return err; |
193 | err = kgdb_arch_remove_breakpoint(addr, tmp_variable); | 195 | err = kgdb_arch_remove_breakpoint(&tmp); |
194 | if (err) | 196 | if (err) |
195 | printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " | 197 | printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " |
196 | "memory destroyed at: %lx", addr); | 198 | "memory destroyed at: %lx", addr); |
@@ -234,7 +236,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) | |||
234 | */ | 236 | */ |
235 | int dbg_activate_sw_breakpoints(void) | 237 | int dbg_activate_sw_breakpoints(void) |
236 | { | 238 | { |
237 | unsigned long addr; | ||
238 | int error; | 239 | int error; |
239 | int ret = 0; | 240 | int ret = 0; |
240 | int i; | 241 | int i; |
@@ -243,16 +244,15 @@ int dbg_activate_sw_breakpoints(void) | |||
243 | if (kgdb_break[i].state != BP_SET) | 244 | if (kgdb_break[i].state != BP_SET) |
244 | continue; | 245 | continue; |
245 | 246 | ||
246 | addr = kgdb_break[i].bpt_addr; | 247 | error = kgdb_arch_set_breakpoint(&kgdb_break[i]); |
247 | error = kgdb_arch_set_breakpoint(addr, | ||
248 | kgdb_break[i].saved_instr); | ||
249 | if (error) { | 248 | if (error) { |
250 | ret = error; | 249 | ret = error; |
251 | printk(KERN_INFO "KGDB: BP install failed: %lx", addr); | 250 | printk(KERN_INFO "KGDB: BP install failed: %lx", |
251 | kgdb_break[i].bpt_addr); | ||
252 | continue; | 252 | continue; |
253 | } | 253 | } |
254 | 254 | ||
255 | kgdb_flush_swbreak_addr(addr); | 255 | kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); |
256 | kgdb_break[i].state = BP_ACTIVE; | 256 | kgdb_break[i].state = BP_ACTIVE; |
257 | } | 257 | } |
258 | return ret; | 258 | return ret; |
@@ -301,7 +301,6 @@ int dbg_set_sw_break(unsigned long addr) | |||
301 | 301 | ||
302 | int dbg_deactivate_sw_breakpoints(void) | 302 | int dbg_deactivate_sw_breakpoints(void) |
303 | { | 303 | { |
304 | unsigned long addr; | ||
305 | int error; | 304 | int error; |
306 | int ret = 0; | 305 | int ret = 0; |
307 | int i; | 306 | int i; |
@@ -309,15 +308,14 @@ int dbg_deactivate_sw_breakpoints(void) | |||
309 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | 308 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { |
310 | if (kgdb_break[i].state != BP_ACTIVE) | 309 | if (kgdb_break[i].state != BP_ACTIVE) |
311 | continue; | 310 | continue; |
312 | addr = kgdb_break[i].bpt_addr; | 311 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
313 | error = kgdb_arch_remove_breakpoint(addr, | ||
314 | kgdb_break[i].saved_instr); | ||
315 | if (error) { | 312 | if (error) { |
316 | printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); | 313 | printk(KERN_INFO "KGDB: BP remove failed: %lx\n", |
314 | kgdb_break[i].bpt_addr); | ||
317 | ret = error; | 315 | ret = error; |
318 | } | 316 | } |
319 | 317 | ||
320 | kgdb_flush_swbreak_addr(addr); | 318 | kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); |
321 | kgdb_break[i].state = BP_SET; | 319 | kgdb_break[i].state = BP_SET; |
322 | } | 320 | } |
323 | return ret; | 321 | return ret; |
@@ -351,7 +349,6 @@ int kgdb_isremovedbreak(unsigned long addr) | |||
351 | 349 | ||
352 | int dbg_remove_all_break(void) | 350 | int dbg_remove_all_break(void) |
353 | { | 351 | { |
354 | unsigned long addr; | ||
355 | int error; | 352 | int error; |
356 | int i; | 353 | int i; |
357 | 354 | ||
@@ -359,12 +356,10 @@ int dbg_remove_all_break(void) | |||
359 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { | 356 | for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { |
360 | if (kgdb_break[i].state != BP_ACTIVE) | 357 | if (kgdb_break[i].state != BP_ACTIVE) |
361 | goto setundefined; | 358 | goto setundefined; |
362 | addr = kgdb_break[i].bpt_addr; | 359 | error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); |
363 | error = kgdb_arch_remove_breakpoint(addr, | ||
364 | kgdb_break[i].saved_instr); | ||
365 | if (error) | 360 | if (error) |
366 | printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", | 361 | printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", |
367 | addr); | 362 | kgdb_break[i].bpt_addr); |
368 | setundefined: | 363 | setundefined: |
369 | kgdb_break[i].state = BP_UNDEFINED; | 364 | kgdb_break[i].state = BP_UNDEFINED; |
370 | } | 365 | } |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b5f17da1c56..bb9520f0f6ff 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -743,7 +743,7 @@ kdb_printit: | |||
743 | kdb_input_flush(); | 743 | kdb_input_flush(); |
744 | c = console_drivers; | 744 | c = console_drivers; |
745 | 745 | ||
746 | if (!dbg_io_ops->is_console) { | 746 | if (dbg_io_ops && !dbg_io_ops->is_console) { |
747 | len = strlen(moreprompt); | 747 | len = strlen(moreprompt); |
748 | cp = moreprompt; | 748 | cp = moreprompt; |
749 | while (len--) { | 749 | while (len--) { |
diff --git a/kernel/events/core.c b/kernel/events/core.c index a6a9ec4cd8f5..5b06cbbf6931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -3183,7 +3183,7 @@ static void perf_event_for_each(struct perf_event *event, | |||
3183 | perf_event_for_each_child(event, func); | 3183 | perf_event_for_each_child(event, func); |
3184 | func(event); | 3184 | func(event); |
3185 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 3185 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
3186 | perf_event_for_each_child(event, func); | 3186 | perf_event_for_each_child(sibling, func); |
3187 | mutex_unlock(&ctx->mutex); | 3187 | mutex_unlock(&ctx->mutex); |
3188 | } | 3188 | } |
3189 | 3189 | ||
@@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | |||
4957 | if (rctx < 0) | 4957 | if (rctx < 0) |
4958 | return; | 4958 | return; |
4959 | 4959 | ||
4960 | perf_sample_data_init(&data, addr); | 4960 | perf_sample_data_init(&data, addr, 0); |
4961 | 4961 | ||
4962 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 4962 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
4963 | 4963 | ||
@@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5215 | .data = record, | 5215 | .data = record, |
5216 | }; | 5216 | }; |
5217 | 5217 | ||
5218 | perf_sample_data_init(&data, addr); | 5218 | perf_sample_data_init(&data, addr, 0); |
5219 | data.raw = &raw; | 5219 | data.raw = &raw; |
5220 | 5220 | ||
5221 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5221 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
@@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
5318 | struct perf_sample_data sample; | 5318 | struct perf_sample_data sample; |
5319 | struct pt_regs *regs = data; | 5319 | struct pt_regs *regs = data; |
5320 | 5320 | ||
5321 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5321 | perf_sample_data_init(&sample, bp->attr.bp_addr, 0); |
5322 | 5322 | ||
5323 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | 5323 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) |
5324 | perf_swevent_event(bp, 1, &sample, regs); | 5324 | perf_swevent_event(bp, 1, &sample, regs); |
@@ -5344,13 +5344,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5344 | 5344 | ||
5345 | event->pmu->read(event); | 5345 | event->pmu->read(event); |
5346 | 5346 | ||
5347 | perf_sample_data_init(&data, 0); | 5347 | perf_sample_data_init(&data, 0, event->hw.last_period); |
5348 | data.period = event->hw.last_period; | ||
5349 | regs = get_irq_regs(); | 5348 | regs = get_irq_regs(); |
5350 | 5349 | ||
5351 | if (regs && !perf_exclude_event(event, regs)) { | 5350 | if (regs && !perf_exclude_event(event, regs)) { |
5352 | if (!(event->attr.exclude_idle && is_idle_task(current))) | 5351 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5353 | if (perf_event_overflow(event, &data, regs)) | 5352 | if (__perf_event_overflow(event, 1, &data, regs)) |
5354 | ret = HRTIMER_NORESTART; | 5353 | ret = HRTIMER_NORESTART; |
5355 | } | 5354 | } |
5356 | 5355 | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 5339705b8241..fe35a634bf76 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex); | |||
35 | extern struct exception_table_entry __start___ex_table[]; | 35 | extern struct exception_table_entry __start___ex_table[]; |
36 | extern struct exception_table_entry __stop___ex_table[]; | 36 | extern struct exception_table_entry __stop___ex_table[]; |
37 | 37 | ||
38 | /* Cleared by build time tools if the table is already sorted. */ | ||
39 | u32 __initdata main_extable_sort_needed = 1; | ||
40 | |||
38 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
39 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
40 | { | 43 | { |
41 | sort_extable(__start___ex_table, __stop___ex_table); | 44 | if (main_extable_sort_needed) |
45 | sort_extable(__start___ex_table, __stop___ex_table); | ||
46 | else | ||
47 | pr_notice("__ex_table already sorted, skipping sort\n"); | ||
42 | } | 48 | } |
43 | 49 | ||
44 | /* Given an address, look for it in the exception tables. */ | 50 | /* Given an address, look for it in the exception tables. */ |
diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..05c813dc9ecc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/cgroup.h> | 34 | #include <linux/cgroup.h> |
35 | #include <linux/security.h> | 35 | #include <linux/security.h> |
36 | #include <linux/hugetlb.h> | 36 | #include <linux/hugetlb.h> |
37 | #include <linux/seccomp.h> | ||
37 | #include <linux/swap.h> | 38 | #include <linux/swap.h> |
38 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
39 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include <linux/audit.h> | 48 | #include <linux/audit.h> |
48 | #include <linux/memcontrol.h> | 49 | #include <linux/memcontrol.h> |
49 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/proc_fs.h> | ||
50 | #include <linux/profile.h> | 52 | #include <linux/profile.h> |
51 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
52 | #include <linux/ksm.h> | 54 | #include <linux/ksm.h> |
@@ -111,32 +113,67 @@ int nr_processes(void) | |||
111 | return total; | 113 | return total; |
112 | } | 114 | } |
113 | 115 | ||
114 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 116 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
115 | # define alloc_task_struct_node(node) \ | ||
116 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) | ||
117 | # define free_task_struct(tsk) \ | ||
118 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
119 | static struct kmem_cache *task_struct_cachep; | 117 | static struct kmem_cache *task_struct_cachep; |
118 | |||
119 | static inline struct task_struct *alloc_task_struct_node(int node) | ||
120 | { | ||
121 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); | ||
122 | } | ||
123 | |||
124 | void __weak arch_release_task_struct(struct task_struct *tsk) { } | ||
125 | |||
126 | static inline void free_task_struct(struct task_struct *tsk) | ||
127 | { | ||
128 | arch_release_task_struct(tsk); | ||
129 | kmem_cache_free(task_struct_cachep, tsk); | ||
130 | } | ||
120 | #endif | 131 | #endif |
121 | 132 | ||
122 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 133 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR |
134 | void __weak arch_release_thread_info(struct thread_info *ti) { } | ||
135 | |||
136 | /* | ||
137 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | ||
138 | * kmemcache based allocator. | ||
139 | */ | ||
140 | # if THREAD_SIZE >= PAGE_SIZE | ||
123 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 141 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
124 | int node) | 142 | int node) |
125 | { | 143 | { |
126 | #ifdef CONFIG_DEBUG_STACK_USAGE | 144 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
127 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 145 | THREAD_SIZE_ORDER); |
128 | #else | ||
129 | gfp_t mask = GFP_KERNEL; | ||
130 | #endif | ||
131 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); | ||
132 | 146 | ||
133 | return page ? page_address(page) : NULL; | 147 | return page ? page_address(page) : NULL; |
134 | } | 148 | } |
135 | 149 | ||
136 | static inline void free_thread_info(struct thread_info *ti) | 150 | static inline void free_thread_info(struct thread_info *ti) |
137 | { | 151 | { |
152 | arch_release_thread_info(ti); | ||
138 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 153 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
139 | } | 154 | } |
155 | # else | ||
156 | static struct kmem_cache *thread_info_cache; | ||
157 | |||
158 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | ||
159 | int node) | ||
160 | { | ||
161 | return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); | ||
162 | } | ||
163 | |||
164 | static void free_thread_info(struct thread_info *ti) | ||
165 | { | ||
166 | arch_release_thread_info(ti); | ||
167 | kmem_cache_free(thread_info_cache, ti); | ||
168 | } | ||
169 | |||
170 | void thread_info_cache_init(void) | ||
171 | { | ||
172 | thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, | ||
173 | THREAD_SIZE, 0, NULL); | ||
174 | BUG_ON(thread_info_cache == NULL); | ||
175 | } | ||
176 | # endif | ||
140 | #endif | 177 | #endif |
141 | 178 | ||
142 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 179 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
@@ -170,6 +207,7 @@ void free_task(struct task_struct *tsk) | |||
170 | free_thread_info(tsk->stack); | 207 | free_thread_info(tsk->stack); |
171 | rt_mutex_debug_task_free(tsk); | 208 | rt_mutex_debug_task_free(tsk); |
172 | ftrace_graph_exit_task(tsk); | 209 | ftrace_graph_exit_task(tsk); |
210 | put_seccomp_filter(tsk); | ||
173 | free_task_struct(tsk); | 211 | free_task_struct(tsk); |
174 | } | 212 | } |
175 | EXPORT_SYMBOL(free_task); | 213 | EXPORT_SYMBOL(free_task); |
@@ -203,17 +241,11 @@ void __put_task_struct(struct task_struct *tsk) | |||
203 | } | 241 | } |
204 | EXPORT_SYMBOL_GPL(__put_task_struct); | 242 | EXPORT_SYMBOL_GPL(__put_task_struct); |
205 | 243 | ||
206 | /* | 244 | void __init __weak arch_task_cache_init(void) { } |
207 | * macro override instead of weak attribute alias, to workaround | ||
208 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
209 | */ | ||
210 | #ifndef arch_task_cache_init | ||
211 | #define arch_task_cache_init() | ||
212 | #endif | ||
213 | 245 | ||
214 | void __init fork_init(unsigned long mempages) | 246 | void __init fork_init(unsigned long mempages) |
215 | { | 247 | { |
216 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 248 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
217 | #ifndef ARCH_MIN_TASKALIGN | 249 | #ifndef ARCH_MIN_TASKALIGN |
218 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 250 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
219 | #endif | 251 | #endif |
@@ -260,8 +292,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
260 | int node = tsk_fork_get_node(orig); | 292 | int node = tsk_fork_get_node(orig); |
261 | int err; | 293 | int err; |
262 | 294 | ||
263 | prepare_to_copy(orig); | ||
264 | |||
265 | tsk = alloc_task_struct_node(node); | 295 | tsk = alloc_task_struct_node(node); |
266 | if (!tsk) | 296 | if (!tsk) |
267 | return NULL; | 297 | return NULL; |
@@ -1162,6 +1192,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1162 | goto fork_out; | 1192 | goto fork_out; |
1163 | 1193 | ||
1164 | ftrace_graph_init_task(p); | 1194 | ftrace_graph_init_task(p); |
1195 | get_seccomp_filter(p); | ||
1165 | 1196 | ||
1166 | rt_mutex_init_task(p); | 1197 | rt_mutex_init_task(p); |
1167 | 1198 | ||
@@ -1464,6 +1495,8 @@ bad_fork_cleanup_io: | |||
1464 | if (p->io_context) | 1495 | if (p->io_context) |
1465 | exit_io_context(p); | 1496 | exit_io_context(p); |
1466 | bad_fork_cleanup_namespaces: | 1497 | bad_fork_cleanup_namespaces: |
1498 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1499 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1467 | exit_task_namespaces(p); | 1500 | exit_task_namespaces(p); |
1468 | bad_fork_cleanup_mm: | 1501 | bad_fork_cleanup_mm: |
1469 | if (p->mm) | 1502 | if (p->mm) |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2a..6df614912b9d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
108 | 108 | ||
109 | touch_nmi_watchdog(); | 109 | touch_nmi_watchdog(); |
110 | 110 | ||
111 | if (sysctl_hung_task_panic) | 111 | if (sysctl_hung_task_panic) { |
112 | trigger_all_cpu_backtrace(); | ||
112 | panic("hung_task: blocked tasks"); | 113 | panic("hung_task: blocked tasks"); |
114 | } | ||
113 | } | 115 | } |
114 | 116 | ||
115 | /* | 117 | /* |
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index cf1a4a68ce44..d1a758bc972a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -62,7 +62,7 @@ config IRQ_DOMAIN_DEBUG | |||
62 | help | 62 | help |
63 | This option will show the mapping relationship between hardware irq | 63 | This option will show the mapping relationship between hardware irq |
64 | numbers and Linux irq numbers. The mapping is exposed via debugfs | 64 | numbers and Linux irq numbers. The mapping is exposed via debugfs |
65 | in the file "virq_mapping". | 65 | in the file "irq_domain_mapping". |
66 | 66 | ||
67 | If you don't know what this means you don't need it. | 67 | If you don't know what this means you don't need it. |
68 | 68 | ||
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6080f6bc8c33..fc275e4f629b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
379 | * If its disabled or no action available | 379 | * If its disabled or no action available |
380 | * keep it masked and get out of here | 380 | * keep it masked and get out of here |
381 | */ | 381 | */ |
382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
383 | desc->istate |= IRQS_PENDING; | ||
383 | goto out_unlock; | 384 | goto out_unlock; |
385 | } | ||
384 | 386 | ||
385 | handle_irq_event(desc); | 387 | handle_irq_event(desc); |
386 | 388 | ||
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
518 | out_unlock: | 520 | out_unlock: |
519 | raw_spin_unlock(&desc->lock); | 521 | raw_spin_unlock(&desc->lock); |
520 | } | 522 | } |
523 | EXPORT_SYMBOL(handle_edge_irq); | ||
521 | 524 | ||
522 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | 525 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER |
523 | /** | 526 | /** |
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 97a8bfadc88a..e75e29e4434a 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h | |||
@@ -4,10 +4,10 @@ | |||
4 | 4 | ||
5 | #include <linux/kallsyms.h> | 5 | #include <linux/kallsyms.h> |
6 | 6 | ||
7 | #define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) | 7 | #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) |
8 | #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) | 8 | #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) |
9 | /* FIXME */ | 9 | /* FIXME */ |
10 | #define PD(f) do { } while (0) | 10 | #define ___PD(f) do { } while (0) |
11 | 11 | ||
12 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | 12 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) |
13 | { | 13 | { |
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
23 | print_symbol("%s\n", (unsigned long)desc->action->handler); | 23 | print_symbol("%s\n", (unsigned long)desc->action->handler); |
24 | } | 24 | } |
25 | 25 | ||
26 | P(IRQ_LEVEL); | 26 | ___P(IRQ_LEVEL); |
27 | P(IRQ_PER_CPU); | 27 | ___P(IRQ_PER_CPU); |
28 | P(IRQ_NOPROBE); | 28 | ___P(IRQ_NOPROBE); |
29 | P(IRQ_NOREQUEST); | 29 | ___P(IRQ_NOREQUEST); |
30 | P(IRQ_NOTHREAD); | 30 | ___P(IRQ_NOTHREAD); |
31 | P(IRQ_NOAUTOEN); | 31 | ___P(IRQ_NOAUTOEN); |
32 | 32 | ||
33 | PS(IRQS_AUTODETECT); | 33 | ___PS(IRQS_AUTODETECT); |
34 | PS(IRQS_REPLAY); | 34 | ___PS(IRQS_REPLAY); |
35 | PS(IRQS_WAITING); | 35 | ___PS(IRQS_WAITING); |
36 | PS(IRQS_PENDING); | 36 | ___PS(IRQS_PENDING); |
37 | 37 | ||
38 | PD(IRQS_INPROGRESS); | 38 | ___PD(IRQS_INPROGRESS); |
39 | PD(IRQS_DISABLED); | 39 | ___PD(IRQS_DISABLED); |
40 | PD(IRQS_MASKED); | 40 | ___PD(IRQS_MASKED); |
41 | } | 41 | } |
42 | 42 | ||
43 | #undef P | 43 | #undef ___P |
44 | #undef PS | 44 | #undef ___PS |
45 | #undef PD | 45 | #undef ___PD |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d86e254b95eb..192a302d6cfd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
112 | { | 112 | { |
113 | return radix_tree_lookup(&irq_desc_tree, irq); | 113 | return radix_tree_lookup(&irq_desc_tree, irq); |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(irq_to_desc); | ||
115 | 116 | ||
116 | static void delete_irq_desc(unsigned int irq) | 117 | static void delete_irq_desc(unsigned int irq) |
117 | { | 118 | { |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3601f3fbf67c..0e0ba5f840b2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -23,7 +23,6 @@ static LIST_HEAD(irq_domain_list); | |||
23 | static DEFINE_MUTEX(irq_domain_mutex); | 23 | static DEFINE_MUTEX(irq_domain_mutex); |
24 | 24 | ||
25 | static DEFINE_MUTEX(revmap_trees_mutex); | 25 | static DEFINE_MUTEX(revmap_trees_mutex); |
26 | static unsigned int irq_virq_count = NR_IRQS; | ||
27 | static struct irq_domain *irq_default_domain; | 26 | static struct irq_domain *irq_default_domain; |
28 | 27 | ||
29 | /** | 28 | /** |
@@ -184,13 +183,16 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, | |||
184 | } | 183 | } |
185 | 184 | ||
186 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | 185 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, |
186 | unsigned int max_irq, | ||
187 | const struct irq_domain_ops *ops, | 187 | const struct irq_domain_ops *ops, |
188 | void *host_data) | 188 | void *host_data) |
189 | { | 189 | { |
190 | struct irq_domain *domain = irq_domain_alloc(of_node, | 190 | struct irq_domain *domain = irq_domain_alloc(of_node, |
191 | IRQ_DOMAIN_MAP_NOMAP, ops, host_data); | 191 | IRQ_DOMAIN_MAP_NOMAP, ops, host_data); |
192 | if (domain) | 192 | if (domain) { |
193 | domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; | ||
193 | irq_domain_add(domain); | 194 | irq_domain_add(domain); |
195 | } | ||
194 | return domain; | 196 | return domain; |
195 | } | 197 | } |
196 | 198 | ||
@@ -262,22 +264,6 @@ void irq_set_default_host(struct irq_domain *domain) | |||
262 | irq_default_domain = domain; | 264 | irq_default_domain = domain; |
263 | } | 265 | } |
264 | 266 | ||
265 | /** | ||
266 | * irq_set_virq_count() - Set the maximum number of linux irqs | ||
267 | * @count: number of linux irqs, capped with NR_IRQS | ||
268 | * | ||
269 | * This is mainly for use by platforms like iSeries who want to program | ||
270 | * the virtual irq number in the controller to avoid the reverse mapping | ||
271 | */ | ||
272 | void irq_set_virq_count(unsigned int count) | ||
273 | { | ||
274 | pr_debug("irq: Trying to set virq count to %d\n", count); | ||
275 | |||
276 | BUG_ON(count < NUM_ISA_INTERRUPTS); | ||
277 | if (count < NR_IRQS) | ||
278 | irq_virq_count = count; | ||
279 | } | ||
280 | |||
281 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | 267 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, |
282 | irq_hw_number_t hwirq) | 268 | irq_hw_number_t hwirq) |
283 | { | 269 | { |
@@ -320,13 +306,12 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
320 | pr_debug("irq: create_direct virq allocation failed\n"); | 306 | pr_debug("irq: create_direct virq allocation failed\n"); |
321 | return 0; | 307 | return 0; |
322 | } | 308 | } |
323 | if (virq >= irq_virq_count) { | 309 | if (virq >= domain->revmap_data.nomap.max_irq) { |
324 | pr_err("ERROR: no free irqs available below %i maximum\n", | 310 | pr_err("ERROR: no free irqs available below %i maximum\n", |
325 | irq_virq_count); | 311 | domain->revmap_data.nomap.max_irq); |
326 | irq_free_desc(virq); | 312 | irq_free_desc(virq); |
327 | return 0; | 313 | return 0; |
328 | } | 314 | } |
329 | |||
330 | pr_debug("irq: create_direct obtained virq %d\n", virq); | 315 | pr_debug("irq: create_direct obtained virq %d\n", virq); |
331 | 316 | ||
332 | if (irq_setup_virq(domain, virq, virq)) { | 317 | if (irq_setup_virq(domain, virq, virq)) { |
@@ -350,7 +335,8 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
350 | unsigned int irq_create_mapping(struct irq_domain *domain, | 335 | unsigned int irq_create_mapping(struct irq_domain *domain, |
351 | irq_hw_number_t hwirq) | 336 | irq_hw_number_t hwirq) |
352 | { | 337 | { |
353 | unsigned int virq, hint; | 338 | unsigned int hint; |
339 | int virq; | ||
354 | 340 | ||
355 | pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | 341 | pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); |
356 | 342 | ||
@@ -377,13 +363,13 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
377 | return irq_domain_legacy_revmap(domain, hwirq); | 363 | return irq_domain_legacy_revmap(domain, hwirq); |
378 | 364 | ||
379 | /* Allocate a virtual interrupt number */ | 365 | /* Allocate a virtual interrupt number */ |
380 | hint = hwirq % irq_virq_count; | 366 | hint = hwirq % nr_irqs; |
381 | if (hint == 0) | 367 | if (hint == 0) |
382 | hint++; | 368 | hint++; |
383 | virq = irq_alloc_desc_from(hint, 0); | 369 | virq = irq_alloc_desc_from(hint, 0); |
384 | if (!virq) | 370 | if (virq <= 0) |
385 | virq = irq_alloc_desc_from(1, 0); | 371 | virq = irq_alloc_desc_from(1, 0); |
386 | if (!virq) { | 372 | if (virq <= 0) { |
387 | pr_debug("irq: -> virq allocation failed\n"); | 373 | pr_debug("irq: -> virq allocation failed\n"); |
388 | return 0; | 374 | return 0; |
389 | } | 375 | } |
@@ -515,7 +501,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, | |||
515 | irq_hw_number_t hwirq) | 501 | irq_hw_number_t hwirq) |
516 | { | 502 | { |
517 | unsigned int i; | 503 | unsigned int i; |
518 | unsigned int hint = hwirq % irq_virq_count; | 504 | unsigned int hint = hwirq % nr_irqs; |
519 | 505 | ||
520 | /* Look for default domain if nececssary */ | 506 | /* Look for default domain if nececssary */ |
521 | if (domain == NULL) | 507 | if (domain == NULL) |
@@ -536,7 +522,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, | |||
536 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) | 522 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) |
537 | return i; | 523 | return i; |
538 | i++; | 524 | i++; |
539 | if (i >= irq_virq_count) | 525 | if (i >= nr_irqs) |
540 | i = 1; | 526 | i = 1; |
541 | } while(i != hint); | 527 | } while(i != hint); |
542 | return 0; | 528 | return 0; |
@@ -642,8 +628,9 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
642 | void *data; | 628 | void *data; |
643 | int i; | 629 | int i; |
644 | 630 | ||
645 | seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", | 631 | seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", |
646 | "chip name", "chip data", "domain name"); | 632 | "chip name", (int)(2 * sizeof(void *) + 2), "chip data", |
633 | "domain name"); | ||
647 | 634 | ||
648 | for (i = 1; i < nr_irqs; i++) { | 635 | for (i = 1; i < nr_irqs; i++) { |
649 | desc = irq_to_desc(i); | 636 | desc = irq_to_desc(i); |
@@ -666,7 +653,7 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
666 | seq_printf(m, "%-15s ", p); | 653 | seq_printf(m, "%-15s ", p); |
667 | 654 | ||
668 | data = irq_desc_get_chip_data(desc); | 655 | data = irq_desc_get_chip_data(desc); |
669 | seq_printf(m, "0x%16p ", data); | 656 | seq_printf(m, data ? "0x%p " : " %p ", data); |
670 | 657 | ||
671 | if (desc->irq_data.domain && desc->irq_data.domain->of_node) | 658 | if (desc->irq_data.domain && desc->irq_data.domain->of_node) |
672 | p = desc->irq_data.domain->of_node->full_name; | 659 | p = desc->irq_data.domain->of_node->full_name; |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569b..bb32326afe87 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
565 | * IRQF_TRIGGER_* but the PIC does not support multiple | 565 | * IRQF_TRIGGER_* but the PIC does not support multiple |
566 | * flow-types? | 566 | * flow-types? |
567 | */ | 567 | */ |
568 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, | 568 | pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, |
569 | chip ? (chip->name ? : "unknown") : "unknown"); | 569 | chip ? (chip->name ? : "unknown") : "unknown"); |
570 | return 0; | 570 | return 0; |
571 | } | 571 | } |
572 | 572 | ||
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
600 | ret = 0; | 600 | ret = 0; |
601 | break; | 601 | break; |
602 | default: | 602 | default: |
603 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 603 | pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", |
604 | flags, irq, chip->irq_set_type); | 604 | flags, irq, chip->irq_set_type); |
605 | } | 605 | } |
606 | if (unmask) | 606 | if (unmask) |
@@ -837,8 +837,7 @@ void exit_irq_thread(void) | |||
837 | 837 | ||
838 | action = kthread_data(tsk); | 838 | action = kthread_data(tsk); |
839 | 839 | ||
840 | printk(KERN_ERR | 840 | pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
841 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
842 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | 841 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); |
843 | 842 | ||
844 | desc = irq_to_desc(action->irq); | 843 | desc = irq_to_desc(action->irq); |
@@ -878,7 +877,6 @@ static int | |||
878 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 877 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
879 | { | 878 | { |
880 | struct irqaction *old, **old_ptr; | 879 | struct irqaction *old, **old_ptr; |
881 | const char *old_name = NULL; | ||
882 | unsigned long flags, thread_mask = 0; | 880 | unsigned long flags, thread_mask = 0; |
883 | int ret, nested, shared = 0; | 881 | int ret, nested, shared = 0; |
884 | cpumask_var_t mask; | 882 | cpumask_var_t mask; |
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
972 | */ | 970 | */ |
973 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 971 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
974 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || | 972 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
975 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | 973 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) |
976 | old_name = old->name; | ||
977 | goto mismatch; | 974 | goto mismatch; |
978 | } | ||
979 | 975 | ||
980 | /* All handlers must agree on per-cpuness */ | 976 | /* All handlers must agree on per-cpuness */ |
981 | if ((old->flags & IRQF_PERCPU) != | 977 | if ((old->flags & IRQF_PERCPU) != |
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1031 | * all existing action->thread_mask bits. | 1027 | * all existing action->thread_mask bits. |
1032 | */ | 1028 | */ |
1033 | new->thread_mask = 1 << ffz(thread_mask); | 1029 | new->thread_mask = 1 << ffz(thread_mask); |
1030 | |||
1031 | } else if (new->handler == irq_default_primary_handler) { | ||
1032 | /* | ||
1033 | * The interrupt was requested with handler = NULL, so | ||
1034 | * we use the default primary handler for it. But it | ||
1035 | * does not have the oneshot flag set. In combination | ||
1036 | * with level interrupts this is deadly, because the | ||
1037 | * default primary handler just wakes the thread, then | ||
1038 | * the irq lines is reenabled, but the device still | ||
1039 | * has the level irq asserted. Rinse and repeat.... | ||
1040 | * | ||
1041 | * While this works for edge type interrupts, we play | ||
1042 | * it safe and reject unconditionally because we can't | ||
1043 | * say for sure which type this interrupt really | ||
1044 | * has. The type flags are unreliable as the | ||
1045 | * underlying chip implementation can override them. | ||
1046 | */ | ||
1047 | pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", | ||
1048 | irq); | ||
1049 | ret = -EINVAL; | ||
1050 | goto out_mask; | ||
1034 | } | 1051 | } |
1035 | 1052 | ||
1036 | if (!shared) { | 1053 | if (!shared) { |
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1078 | 1095 | ||
1079 | if (nmsk != omsk) | 1096 | if (nmsk != omsk) |
1080 | /* hope the handler works with current trigger mode */ | 1097 | /* hope the handler works with current trigger mode */ |
1081 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", | 1098 | pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", |
1082 | irq, nmsk, omsk); | 1099 | irq, nmsk, omsk); |
1083 | } | 1100 | } |
1084 | 1101 | ||
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1115 | return 0; | 1132 | return 0; |
1116 | 1133 | ||
1117 | mismatch: | 1134 | mismatch: |
1118 | #ifdef CONFIG_DEBUG_SHIRQ | ||
1119 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 1135 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
1120 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 1136 | pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", |
1121 | if (old_name) | 1137 | irq, new->flags, new->name, old->flags, old->name); |
1122 | printk(KERN_ERR "current handler: %s\n", old_name); | 1138 | #ifdef CONFIG_DEBUG_SHIRQ |
1123 | dump_stack(); | 1139 | dump_stack(); |
1124 | } | ||
1125 | #endif | 1140 | #endif |
1141 | } | ||
1126 | ret = -EBUSY; | 1142 | ret = -EBUSY; |
1127 | 1143 | ||
1128 | out_mask: | 1144 | out_mask: |
@@ -1204,12 +1220,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1204 | /* Found it - now remove it from the list of entries: */ | 1220 | /* Found it - now remove it from the list of entries: */ |
1205 | *action_ptr = action->next; | 1221 | *action_ptr = action->next; |
1206 | 1222 | ||
1207 | /* Currently used only by UML, might disappear one day: */ | ||
1208 | #ifdef CONFIG_IRQ_RELEASE_METHOD | ||
1209 | if (desc->irq_data.chip->release) | ||
1210 | desc->irq_data.chip->release(irq, dev_id); | ||
1211 | #endif | ||
1212 | |||
1213 | /* If this was the last handler, shut down the IRQ line: */ | 1223 | /* If this was the last handler, shut down the IRQ line: */ |
1214 | if (!desc->action) | 1224 | if (!desc->action) |
1215 | irq_shutdown(desc); | 1225 | irq_shutdown(desc); |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a6..cb228bf21760 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void) | |||
103 | int irq; | 103 | int irq; |
104 | 104 | ||
105 | for_each_irq_desc(irq, desc) { | 105 | for_each_irq_desc(irq, desc) { |
106 | /* | ||
107 | * Only interrupts which are marked as wakeup source | ||
108 | * and have not been disabled before the suspend check | ||
109 | * can abort suspend. | ||
110 | */ | ||
106 | if (irqd_is_wakeup_set(&desc->irq_data)) { | 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
107 | if (desc->istate & IRQS_PENDING) | 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) |
108 | return -EBUSY; | 113 | return -EBUSY; |
109 | continue; | 114 | continue; |
110 | } | 115 | } |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c9..6454db7b6a4d 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
58 | /* | 58 | /* |
59 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
60 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
61 | * active. | 61 | * active. Clear the pending bit so suspend/resume does not |
62 | * get confused. | ||
62 | */ | 63 | */ |
63 | if (irq_settings_is_level(desc)) | 64 | if (irq_settings_is_level(desc)) { |
65 | desc->istate &= ~IRQS_PENDING; | ||
64 | return; | 66 | return; |
67 | } | ||
65 | if (desc->istate & IRQS_REPLAY) | 68 | if (desc->istate & IRQS_REPLAY) |
66 | return; | 69 | return; |
67 | if (desc->istate & IRQS_PENDING) { | 70 | if (desc->istate & IRQS_PENDING) { |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c3c46c72046e..1588e3b2871b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -5,11 +5,13 @@ | |||
5 | * context. The enqueueing is NMI-safe. | 5 | * context. The enqueueing is NMI-safe. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/bug.h> | ||
8 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
9 | #include <linux/export.h> | 10 | #include <linux/export.h> |
10 | #include <linux/irq_work.h> | 11 | #include <linux/irq_work.h> |
11 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
12 | #include <linux/hardirq.h> | 13 | #include <linux/hardirq.h> |
14 | #include <linux/irqflags.h> | ||
13 | #include <asm/processor.h> | 15 | #include <asm/processor.h> |
14 | 16 | ||
15 | /* | 17 | /* |
diff --git a/kernel/itimer.c b/kernel/itimer.c index 22000c3db0dd..8d262b467573 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -284,8 +284,12 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, | |||
284 | if (value) { | 284 | if (value) { |
285 | if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) | 285 | if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) |
286 | return -EFAULT; | 286 | return -EFAULT; |
287 | } else | 287 | } else { |
288 | memset((char *) &set_buffer, 0, sizeof(set_buffer)); | 288 | memset(&set_buffer, 0, sizeof(set_buffer)); |
289 | printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." | ||
290 | " Misfeature support will be removed\n", | ||
291 | current->comm); | ||
292 | } | ||
289 | 293 | ||
290 | error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); | 294 | error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); |
291 | if (error || !ovalue) | 295 | if (error || !ovalue) |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 957a7aab8ebc..05698a7415fe 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -322,7 +322,7 @@ static void __call_usermodehelper(struct work_struct *work) | |||
322 | * land has been frozen during a system-wide hibernation or suspend operation). | 322 | * land has been frozen during a system-wide hibernation or suspend operation). |
323 | * Should always be manipulated under umhelper_sem acquired for write. | 323 | * Should always be manipulated under umhelper_sem acquired for write. |
324 | */ | 324 | */ |
325 | static int usermodehelper_disabled = 1; | 325 | static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; |
326 | 326 | ||
327 | /* Number of helpers running */ | 327 | /* Number of helpers running */ |
328 | static atomic_t running_helpers = ATOMIC_INIT(0); | 328 | static atomic_t running_helpers = ATOMIC_INIT(0); |
@@ -334,32 +334,110 @@ static atomic_t running_helpers = ATOMIC_INIT(0); | |||
334 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); | 334 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); |
335 | 335 | ||
336 | /* | 336 | /* |
337 | * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled | ||
338 | * to become 'false'. | ||
339 | */ | ||
340 | static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); | ||
341 | |||
342 | /* | ||
337 | * Time to wait for running_helpers to become zero before the setting of | 343 | * Time to wait for running_helpers to become zero before the setting of |
338 | * usermodehelper_disabled in usermodehelper_disable() fails | 344 | * usermodehelper_disabled in usermodehelper_disable() fails |
339 | */ | 345 | */ |
340 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) | 346 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) |
341 | 347 | ||
342 | void read_lock_usermodehelper(void) | 348 | int usermodehelper_read_trylock(void) |
343 | { | 349 | { |
350 | DEFINE_WAIT(wait); | ||
351 | int ret = 0; | ||
352 | |||
344 | down_read(&umhelper_sem); | 353 | down_read(&umhelper_sem); |
354 | for (;;) { | ||
355 | prepare_to_wait(&usermodehelper_disabled_waitq, &wait, | ||
356 | TASK_INTERRUPTIBLE); | ||
357 | if (!usermodehelper_disabled) | ||
358 | break; | ||
359 | |||
360 | if (usermodehelper_disabled == UMH_DISABLED) | ||
361 | ret = -EAGAIN; | ||
362 | |||
363 | up_read(&umhelper_sem); | ||
364 | |||
365 | if (ret) | ||
366 | break; | ||
367 | |||
368 | schedule(); | ||
369 | try_to_freeze(); | ||
370 | |||
371 | down_read(&umhelper_sem); | ||
372 | } | ||
373 | finish_wait(&usermodehelper_disabled_waitq, &wait); | ||
374 | return ret; | ||
375 | } | ||
376 | EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); | ||
377 | |||
378 | long usermodehelper_read_lock_wait(long timeout) | ||
379 | { | ||
380 | DEFINE_WAIT(wait); | ||
381 | |||
382 | if (timeout < 0) | ||
383 | return -EINVAL; | ||
384 | |||
385 | down_read(&umhelper_sem); | ||
386 | for (;;) { | ||
387 | prepare_to_wait(&usermodehelper_disabled_waitq, &wait, | ||
388 | TASK_UNINTERRUPTIBLE); | ||
389 | if (!usermodehelper_disabled) | ||
390 | break; | ||
391 | |||
392 | up_read(&umhelper_sem); | ||
393 | |||
394 | timeout = schedule_timeout(timeout); | ||
395 | if (!timeout) | ||
396 | break; | ||
397 | |||
398 | down_read(&umhelper_sem); | ||
399 | } | ||
400 | finish_wait(&usermodehelper_disabled_waitq, &wait); | ||
401 | return timeout; | ||
345 | } | 402 | } |
346 | EXPORT_SYMBOL_GPL(read_lock_usermodehelper); | 403 | EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); |
347 | 404 | ||
348 | void read_unlock_usermodehelper(void) | 405 | void usermodehelper_read_unlock(void) |
349 | { | 406 | { |
350 | up_read(&umhelper_sem); | 407 | up_read(&umhelper_sem); |
351 | } | 408 | } |
352 | EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); | 409 | EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); |
353 | 410 | ||
354 | /** | 411 | /** |
355 | * usermodehelper_disable - prevent new helpers from being started | 412 | * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. |
413 | * @depth: New value to assign to usermodehelper_disabled. | ||
414 | * | ||
415 | * Change the value of usermodehelper_disabled (under umhelper_sem locked for | ||
416 | * writing) and wakeup tasks waiting for it to change. | ||
356 | */ | 417 | */ |
357 | int usermodehelper_disable(void) | 418 | void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) |
419 | { | ||
420 | down_write(&umhelper_sem); | ||
421 | usermodehelper_disabled = depth; | ||
422 | wake_up(&usermodehelper_disabled_waitq); | ||
423 | up_write(&umhelper_sem); | ||
424 | } | ||
425 | |||
426 | /** | ||
427 | * __usermodehelper_disable - Prevent new helpers from being started. | ||
428 | * @depth: New value to assign to usermodehelper_disabled. | ||
429 | * | ||
430 | * Set usermodehelper_disabled to @depth and wait for running helpers to exit. | ||
431 | */ | ||
432 | int __usermodehelper_disable(enum umh_disable_depth depth) | ||
358 | { | 433 | { |
359 | long retval; | 434 | long retval; |
360 | 435 | ||
436 | if (!depth) | ||
437 | return -EINVAL; | ||
438 | |||
361 | down_write(&umhelper_sem); | 439 | down_write(&umhelper_sem); |
362 | usermodehelper_disabled = 1; | 440 | usermodehelper_disabled = depth; |
363 | up_write(&umhelper_sem); | 441 | up_write(&umhelper_sem); |
364 | 442 | ||
365 | /* | 443 | /* |
@@ -374,31 +452,10 @@ int usermodehelper_disable(void) | |||
374 | if (retval) | 452 | if (retval) |
375 | return 0; | 453 | return 0; |
376 | 454 | ||
377 | down_write(&umhelper_sem); | 455 | __usermodehelper_set_disable_depth(UMH_ENABLED); |
378 | usermodehelper_disabled = 0; | ||
379 | up_write(&umhelper_sem); | ||
380 | return -EAGAIN; | 456 | return -EAGAIN; |
381 | } | 457 | } |
382 | 458 | ||
383 | /** | ||
384 | * usermodehelper_enable - allow new helpers to be started again | ||
385 | */ | ||
386 | void usermodehelper_enable(void) | ||
387 | { | ||
388 | down_write(&umhelper_sem); | ||
389 | usermodehelper_disabled = 0; | ||
390 | up_write(&umhelper_sem); | ||
391 | } | ||
392 | |||
393 | /** | ||
394 | * usermodehelper_is_disabled - check if new helpers are allowed to be started | ||
395 | */ | ||
396 | bool usermodehelper_is_disabled(void) | ||
397 | { | ||
398 | return usermodehelper_disabled; | ||
399 | } | ||
400 | EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); | ||
401 | |||
402 | static void helper_lock(void) | 459 | static void helper_lock(void) |
403 | { | 460 | { |
404 | atomic_inc(&running_helpers); | 461 | atomic_inc(&running_helpers); |
diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e425..4edbd9c11aca 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info, | |||
2429 | goto free_hdr; | 2429 | goto free_hdr; |
2430 | } | 2430 | } |
2431 | 2431 | ||
2432 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { | 2432 | if (hdr->e_shoff >= len || |
2433 | hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { | ||
2433 | err = -ENOEXEC; | 2434 | err = -ENOEXEC; |
2434 | goto free_hdr; | 2435 | goto free_hdr; |
2435 | } | 2436 | } |
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod, | |||
2953 | 2954 | ||
2954 | /* Module is ready to execute: parsing args may do that. */ | 2955 | /* Module is ready to execute: parsing args may do that. */ |
2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 2956 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
2956 | -32768, 32767, NULL); | 2957 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
2957 | if (err < 0) | 2958 | if (err < 0) |
2958 | goto unlink; | 2959 | goto unlink; |
2959 | 2960 | ||
diff --git a/kernel/padata.c b/kernel/padata.c index 6f10eb285ece..89fe3d1b9efb 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -1,6 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * padata.c - generic interface to process data streams in parallel | 2 | * padata.c - generic interface to process data streams in parallel |
3 | * | 3 | * |
4 | * See Documentation/padata.txt for the API documentation. | ||
5 | * | ||
4 | * Copyright (C) 2008, 2009 secunet Security Networks AG | 6 | * Copyright (C) 2008, 2009 secunet Security Networks AG |
5 | * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> | 7 | * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> |
6 | * | 8 | * |
@@ -354,13 +356,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd, | |||
354 | if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) | 356 | if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) |
355 | return -ENOMEM; | 357 | return -ENOMEM; |
356 | 358 | ||
357 | cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); | 359 | cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); |
358 | if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { | 360 | if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { |
359 | free_cpumask_var(pd->cpumask.cbcpu); | 361 | free_cpumask_var(pd->cpumask.cbcpu); |
360 | return -ENOMEM; | 362 | return -ENOMEM; |
361 | } | 363 | } |
362 | 364 | ||
363 | cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); | 365 | cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); |
364 | return 0; | 366 | return 0; |
365 | } | 367 | } |
366 | 368 | ||
@@ -564,7 +566,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier); | |||
564 | static bool padata_validate_cpumask(struct padata_instance *pinst, | 566 | static bool padata_validate_cpumask(struct padata_instance *pinst, |
565 | const struct cpumask *cpumask) | 567 | const struct cpumask *cpumask) |
566 | { | 568 | { |
567 | if (!cpumask_intersects(cpumask, cpu_active_mask)) { | 569 | if (!cpumask_intersects(cpumask, cpu_online_mask)) { |
568 | pinst->flags |= PADATA_INVALID; | 570 | pinst->flags |= PADATA_INVALID; |
569 | return false; | 571 | return false; |
570 | } | 572 | } |
@@ -678,7 +680,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) | |||
678 | { | 680 | { |
679 | struct parallel_data *pd; | 681 | struct parallel_data *pd; |
680 | 682 | ||
681 | if (cpumask_test_cpu(cpu, cpu_active_mask)) { | 683 | if (cpumask_test_cpu(cpu, cpu_online_mask)) { |
682 | pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, | 684 | pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, |
683 | pinst->cpumask.cbcpu); | 685 | pinst->cpumask.cbcpu); |
684 | if (!pd) | 686 | if (!pd) |
@@ -746,6 +748,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | |||
746 | return -ENOMEM; | 748 | return -ENOMEM; |
747 | 749 | ||
748 | padata_replace(pinst, pd); | 750 | padata_replace(pinst, pd); |
751 | |||
752 | cpumask_clear_cpu(cpu, pd->cpumask.cbcpu); | ||
753 | cpumask_clear_cpu(cpu, pd->cpumask.pcpu); | ||
749 | } | 754 | } |
750 | 755 | ||
751 | return 0; | 756 | return 0; |
diff --git a/kernel/panic.c b/kernel/panic.c index 80aed44e345a..8ed89a175d79 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -97,7 +97,7 @@ void panic(const char *fmt, ...) | |||
97 | /* | 97 | /* |
98 | * Avoid nested stack-dumping if a panic occurs during oops processing | 98 | * Avoid nested stack-dumping if a panic occurs during oops processing |
99 | */ | 99 | */ |
100 | if (!oops_in_progress) | 100 | if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) |
101 | dump_stack(); | 101 | dump_stack(); |
102 | #endif | 102 | #endif |
103 | 103 | ||
diff --git a/kernel/params.c b/kernel/params.c index f37d82631347..ed35345be536 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) | |||
85 | 85 | ||
86 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
87 | char *val, | 87 | char *val, |
88 | const char *doing, | ||
88 | const struct kernel_param *params, | 89 | const struct kernel_param *params, |
89 | unsigned num_params, | 90 | unsigned num_params, |
90 | s16 min_level, | 91 | s16 min_level, |
91 | s16 max_level, | 92 | s16 max_level, |
92 | int (*handle_unknown)(char *param, char *val)) | 93 | int (*handle_unknown)(char *param, char *val, |
94 | const char *doing)) | ||
93 | { | 95 | { |
94 | unsigned int i; | 96 | unsigned int i; |
95 | int err; | 97 | int err; |
@@ -104,8 +106,8 @@ static int parse_one(char *param, | |||
104 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && params[i].ops->set != param_set_bool |
105 | && params[i].ops->set != param_set_bint) | 107 | && params[i].ops->set != param_set_bint) |
106 | return -EINVAL; | 108 | return -EINVAL; |
107 | pr_debug("They are equal! Calling %p\n", | 109 | pr_debug("handling %s with %p\n", param, |
108 | params[i].ops->set); | 110 | params[i].ops->set); |
109 | mutex_lock(&param_lock); | 111 | mutex_lock(&param_lock); |
110 | err = params[i].ops->set(val, &params[i]); | 112 | err = params[i].ops->set(val, &params[i]); |
111 | mutex_unlock(&param_lock); | 113 | mutex_unlock(&param_lock); |
@@ -114,11 +116,11 @@ static int parse_one(char *param, | |||
114 | } | 116 | } |
115 | 117 | ||
116 | if (handle_unknown) { | 118 | if (handle_unknown) { |
117 | pr_debug("Unknown argument: calling %p\n", handle_unknown); | 119 | pr_debug("doing %s: %s='%s'\n", doing, param, val); |
118 | return handle_unknown(param, val); | 120 | return handle_unknown(param, val, doing); |
119 | } | 121 | } |
120 | 122 | ||
121 | pr_debug("Unknown argument `%s'\n", param); | 123 | pr_debug("Unknown argument '%s'\n", param); |
122 | return -ENOENT; | 124 | return -ENOENT; |
123 | } | 125 | } |
124 | 126 | ||
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) | |||
175 | } | 177 | } |
176 | 178 | ||
177 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
178 | int parse_args(const char *name, | 180 | int parse_args(const char *doing, |
179 | char *args, | 181 | char *args, |
180 | const struct kernel_param *params, | 182 | const struct kernel_param *params, |
181 | unsigned num, | 183 | unsigned num, |
182 | s16 min_level, | 184 | s16 min_level, |
183 | s16 max_level, | 185 | s16 max_level, |
184 | int (*unknown)(char *param, char *val)) | 186 | int (*unknown)(char *param, char *val, const char *doing)) |
185 | { | 187 | { |
186 | char *param, *val; | 188 | char *param, *val; |
187 | 189 | ||
188 | pr_debug("Parsing ARGS: %s\n", args); | ||
189 | |||
190 | /* Chew leading spaces */ | 190 | /* Chew leading spaces */ |
191 | args = skip_spaces(args); | 191 | args = skip_spaces(args); |
192 | 192 | ||
193 | if (*args) | ||
194 | pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); | ||
195 | |||
193 | while (*args) { | 196 | while (*args) { |
194 | int ret; | 197 | int ret; |
195 | int irq_was_disabled; | 198 | int irq_was_disabled; |
196 | 199 | ||
197 | args = next_arg(args, &param, &val); | 200 | args = next_arg(args, &param, &val); |
198 | irq_was_disabled = irqs_disabled(); | 201 | irq_was_disabled = irqs_disabled(); |
199 | ret = parse_one(param, val, params, num, | 202 | ret = parse_one(param, val, doing, params, num, |
200 | min_level, max_level, unknown); | 203 | min_level, max_level, unknown); |
201 | if (irq_was_disabled && !irqs_disabled()) { | 204 | if (irq_was_disabled && !irqs_disabled()) |
202 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | 205 | pr_warn("%s: option '%s' enabled irq's!\n", |
203 | "irq's!\n", param); | 206 | doing, param); |
204 | } | 207 | |
205 | switch (ret) { | 208 | switch (ret) { |
206 | case -ENOENT: | 209 | case -ENOENT: |
207 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | 210 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
208 | name, param); | ||
209 | return ret; | 211 | return ret; |
210 | case -ENOSPC: | 212 | case -ENOSPC: |
211 | printk(KERN_ERR | 213 | pr_err("%s: `%s' too large for parameter `%s'\n", |
212 | "%s: `%s' too large for parameter `%s'\n", | 214 | doing, val ?: "", param); |
213 | name, val ?: "", param); | ||
214 | return ret; | 215 | return ret; |
215 | case 0: | 216 | case 0: |
216 | break; | 217 | break; |
217 | default: | 218 | default: |
218 | printk(KERN_ERR | 219 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
219 | "%s: `%s' invalid for parameter `%s'\n", | 220 | doing, val ?: "", param); |
220 | name, val ?: "", param); | ||
221 | return ret; | 221 | return ret; |
222 | } | 222 | } |
223 | } | 223 | } |
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | |||
263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 263 | int param_set_charp(const char *val, const struct kernel_param *kp) |
264 | { | 264 | { |
265 | if (strlen(val) > 1024) { | 265 | if (strlen(val) > 1024) { |
266 | printk(KERN_ERR "%s: string parameter too long\n", | 266 | pr_err("%s: string parameter too long\n", kp->name); |
267 | kp->name); | ||
268 | return -ENOSPC; | 267 | return -ENOSPC; |
269 | } | 268 | } |
270 | 269 | ||
@@ -400,8 +399,7 @@ static int param_array(const char *name, | |||
400 | int len; | 399 | int len; |
401 | 400 | ||
402 | if (*num == max) { | 401 | if (*num == max) { |
403 | printk(KERN_ERR "%s: can only take %i arguments\n", | 402 | pr_err("%s: can only take %i arguments\n", name, max); |
404 | name, max); | ||
405 | return -EINVAL; | 403 | return -EINVAL; |
406 | } | 404 | } |
407 | len = strcspn(val, ","); | 405 | len = strcspn(val, ","); |
@@ -420,8 +418,7 @@ static int param_array(const char *name, | |||
420 | } while (save == ','); | 418 | } while (save == ','); |
421 | 419 | ||
422 | if (*num < min) { | 420 | if (*num < min) { |
423 | printk(KERN_ERR "%s: needs at least %i arguments\n", | 421 | pr_err("%s: needs at least %i arguments\n", name, min); |
424 | name, min); | ||
425 | return -EINVAL; | 422 | return -EINVAL; |
426 | } | 423 | } |
427 | return 0; | 424 | return 0; |
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) | |||
480 | const struct kparam_string *kps = kp->str; | 477 | const struct kparam_string *kps = kp->str; |
481 | 478 | ||
482 | if (strlen(val)+1 > kps->maxlen) { | 479 | if (strlen(val)+1 > kps->maxlen) { |
483 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 480 | pr_err("%s: string doesn't fit in %u chars.\n", |
484 | kp->name, kps->maxlen-1); | 481 | kp->name, kps->maxlen-1); |
485 | return -ENOSPC; | 482 | return -ENOSPC; |
486 | } | 483 | } |
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
750 | #endif | 747 | #endif |
751 | if (err) { | 748 | if (err) { |
752 | kobject_put(&mk->kobj); | 749 | kobject_put(&mk->kobj); |
753 | printk(KERN_ERR | 750 | pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", |
754 | "Module '%s' failed add to sysfs, error number %d\n", | ||
755 | name, err); | 751 | name, err); |
756 | printk(KERN_ERR | ||
757 | "The system will be unstable now.\n"); | ||
758 | return NULL; | 752 | return NULL; |
759 | } | 753 | } |
760 | 754 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e3216..8f9b4eb974e0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP | |||
103 | select HOTPLUG | 103 | select HOTPLUG |
104 | select HOTPLUG_CPU | 104 | select HOTPLUG_CPU |
105 | 105 | ||
106 | config PM_AUTOSLEEP | ||
107 | bool "Opportunistic sleep" | ||
108 | depends on PM_SLEEP | ||
109 | default n | ||
110 | ---help--- | ||
111 | Allow the kernel to trigger a system transition into a global sleep | ||
112 | state automatically whenever there are no active wakeup sources. | ||
113 | |||
114 | config PM_WAKELOCKS | ||
115 | bool "User space wakeup sources interface" | ||
116 | depends on PM_SLEEP | ||
117 | default n | ||
118 | ---help--- | ||
119 | Allow user space to create, activate and deactivate wakeup source | ||
120 | objects with the help of a sysfs-based interface. | ||
121 | |||
122 | config PM_WAKELOCKS_LIMIT | ||
123 | int "Maximum number of user space wakeup sources (0 = no limit)" | ||
124 | range 0 100000 | ||
125 | default 100 | ||
126 | depends on PM_WAKELOCKS | ||
127 | |||
128 | config PM_WAKELOCKS_GC | ||
129 | bool "Garbage collector for user space wakeup sources" | ||
130 | depends on PM_WAKELOCKS | ||
131 | default y | ||
132 | |||
106 | config PM_RUNTIME | 133 | config PM_RUNTIME |
107 | bool "Run-time PM core functionality" | 134 | bool "Run-time PM core functionality" |
108 | depends on !IA64_HP_SIM | 135 | depends on !IA64_HP_SIM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec5252..29472bff11ef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
11 | block_io.o | 11 | block_io.o |
12 | obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o | ||
13 | obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o | ||
12 | 14 | ||
13 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 000000000000..ca304046d9e2 --- /dev/null +++ b/kernel/power/autosleep.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * kernel/power/autosleep.c | ||
3 | * | ||
4 | * Opportunistic sleep support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | */ | ||
8 | |||
9 | #include <linux/device.h> | ||
10 | #include <linux/mutex.h> | ||
11 | #include <linux/pm_wakeup.h> | ||
12 | |||
13 | #include "power.h" | ||
14 | |||
15 | static suspend_state_t autosleep_state; | ||
16 | static struct workqueue_struct *autosleep_wq; | ||
17 | /* | ||
18 | * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source | ||
19 | * is active, otherwise a deadlock with try_to_suspend() is possible. | ||
20 | * Alternatively mutex_lock_interruptible() can be used. This will then fail | ||
21 | * if an auto_sleep cycle tries to freeze processes. | ||
22 | */ | ||
23 | static DEFINE_MUTEX(autosleep_lock); | ||
24 | static struct wakeup_source *autosleep_ws; | ||
25 | |||
26 | static void try_to_suspend(struct work_struct *work) | ||
27 | { | ||
28 | unsigned int initial_count, final_count; | ||
29 | |||
30 | if (!pm_get_wakeup_count(&initial_count, true)) | ||
31 | goto out; | ||
32 | |||
33 | mutex_lock(&autosleep_lock); | ||
34 | |||
35 | if (!pm_save_wakeup_count(initial_count)) { | ||
36 | mutex_unlock(&autosleep_lock); | ||
37 | goto out; | ||
38 | } | ||
39 | |||
40 | if (autosleep_state == PM_SUSPEND_ON) { | ||
41 | mutex_unlock(&autosleep_lock); | ||
42 | return; | ||
43 | } | ||
44 | if (autosleep_state >= PM_SUSPEND_MAX) | ||
45 | hibernate(); | ||
46 | else | ||
47 | pm_suspend(autosleep_state); | ||
48 | |||
49 | mutex_unlock(&autosleep_lock); | ||
50 | |||
51 | if (!pm_get_wakeup_count(&final_count, false)) | ||
52 | goto out; | ||
53 | |||
54 | /* | ||
55 | * If the wakeup occurred for an unknown reason, wait to prevent the | ||
56 | * system from trying to suspend and waking up in a tight loop. | ||
57 | */ | ||
58 | if (final_count == initial_count) | ||
59 | schedule_timeout_uninterruptible(HZ / 2); | ||
60 | |||
61 | out: | ||
62 | queue_up_suspend_work(); | ||
63 | } | ||
64 | |||
65 | static DECLARE_WORK(suspend_work, try_to_suspend); | ||
66 | |||
67 | void queue_up_suspend_work(void) | ||
68 | { | ||
69 | if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) | ||
70 | queue_work(autosleep_wq, &suspend_work); | ||
71 | } | ||
72 | |||
73 | suspend_state_t pm_autosleep_state(void) | ||
74 | { | ||
75 | return autosleep_state; | ||
76 | } | ||
77 | |||
78 | int pm_autosleep_lock(void) | ||
79 | { | ||
80 | return mutex_lock_interruptible(&autosleep_lock); | ||
81 | } | ||
82 | |||
83 | void pm_autosleep_unlock(void) | ||
84 | { | ||
85 | mutex_unlock(&autosleep_lock); | ||
86 | } | ||
87 | |||
88 | int pm_autosleep_set_state(suspend_state_t state) | ||
89 | { | ||
90 | |||
91 | #ifndef CONFIG_HIBERNATION | ||
92 | if (state >= PM_SUSPEND_MAX) | ||
93 | return -EINVAL; | ||
94 | #endif | ||
95 | |||
96 | __pm_stay_awake(autosleep_ws); | ||
97 | |||
98 | mutex_lock(&autosleep_lock); | ||
99 | |||
100 | autosleep_state = state; | ||
101 | |||
102 | __pm_relax(autosleep_ws); | ||
103 | |||
104 | if (state > PM_SUSPEND_ON) { | ||
105 | pm_wakep_autosleep_enabled(true); | ||
106 | queue_up_suspend_work(); | ||
107 | } else { | ||
108 | pm_wakep_autosleep_enabled(false); | ||
109 | } | ||
110 | |||
111 | mutex_unlock(&autosleep_lock); | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | int __init pm_autosleep_init(void) | ||
116 | { | ||
117 | autosleep_ws = wakeup_source_register("autosleep"); | ||
118 | if (!autosleep_ws) | ||
119 | return -ENOMEM; | ||
120 | |||
121 | autosleep_wq = alloc_ordered_workqueue("autosleep", 0); | ||
122 | if (autosleep_wq) | ||
123 | return 0; | ||
124 | |||
125 | wakeup_source_unregister(autosleep_ws); | ||
126 | return -ENOMEM; | ||
127 | } | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0a186cfde788..8b53db38a279 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/string.h> | 16 | #include <linux/string.h> |
17 | #include <linux/device.h> | 17 | #include <linux/device.h> |
18 | #include <linux/async.h> | 18 | #include <linux/async.h> |
19 | #include <linux/kmod.h> | ||
20 | #include <linux/delay.h> | 19 | #include <linux/delay.h> |
21 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
22 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
@@ -26,6 +25,8 @@ | |||
26 | #include <linux/freezer.h> | 25 | #include <linux/freezer.h> |
27 | #include <linux/gfp.h> | 26 | #include <linux/gfp.h> |
28 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
28 | #include <linux/ctype.h> | ||
29 | #include <linux/genhd.h> | ||
29 | #include <scsi/scsi_scan.h> | 30 | #include <scsi/scsi_scan.h> |
30 | 31 | ||
31 | #include "power.h" | 32 | #include "power.h" |
@@ -611,14 +612,10 @@ int hibernate(void) | |||
611 | if (error) | 612 | if (error) |
612 | goto Exit; | 613 | goto Exit; |
613 | 614 | ||
614 | error = usermodehelper_disable(); | ||
615 | if (error) | ||
616 | goto Exit; | ||
617 | |||
618 | /* Allocate memory management structures */ | 615 | /* Allocate memory management structures */ |
619 | error = create_basic_memory_bitmaps(); | 616 | error = create_basic_memory_bitmaps(); |
620 | if (error) | 617 | if (error) |
621 | goto Enable_umh; | 618 | goto Exit; |
622 | 619 | ||
623 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 620 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
624 | sys_sync(); | 621 | sys_sync(); |
@@ -661,8 +658,6 @@ int hibernate(void) | |||
661 | 658 | ||
662 | Free_bitmaps: | 659 | Free_bitmaps: |
663 | free_basic_memory_bitmaps(); | 660 | free_basic_memory_bitmaps(); |
664 | Enable_umh: | ||
665 | usermodehelper_enable(); | ||
666 | Exit: | 661 | Exit: |
667 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 662 | pm_notifier_call_chain(PM_POST_HIBERNATION); |
668 | pm_restore_console(); | 663 | pm_restore_console(); |
@@ -729,6 +724,17 @@ static int software_resume(void) | |||
729 | 724 | ||
730 | /* Check if the device is there */ | 725 | /* Check if the device is there */ |
731 | swsusp_resume_device = name_to_dev_t(resume_file); | 726 | swsusp_resume_device = name_to_dev_t(resume_file); |
727 | |||
728 | /* | ||
729 | * name_to_dev_t is ineffective to verify partition if resume_file is in | ||
730 | * integer format. (e.g. major:minor) | ||
731 | */ | ||
732 | if (isdigit(resume_file[0]) && resume_wait) { | ||
733 | int partno; | ||
734 | while (!get_gendisk(swsusp_resume_device, &partno)) | ||
735 | msleep(10); | ||
736 | } | ||
737 | |||
732 | if (!swsusp_resume_device) { | 738 | if (!swsusp_resume_device) { |
733 | /* | 739 | /* |
734 | * Some device discovery might still be in progress; we need | 740 | * Some device discovery might still be in progress; we need |
@@ -777,15 +783,9 @@ static int software_resume(void) | |||
777 | if (error) | 783 | if (error) |
778 | goto close_finish; | 784 | goto close_finish; |
779 | 785 | ||
780 | error = usermodehelper_disable(); | ||
781 | if (error) | ||
782 | goto close_finish; | ||
783 | |||
784 | error = create_basic_memory_bitmaps(); | 786 | error = create_basic_memory_bitmaps(); |
785 | if (error) { | 787 | if (error) |
786 | usermodehelper_enable(); | ||
787 | goto close_finish; | 788 | goto close_finish; |
788 | } | ||
789 | 789 | ||
790 | pr_debug("PM: Preparing processes for restore.\n"); | 790 | pr_debug("PM: Preparing processes for restore.\n"); |
791 | error = freeze_processes(); | 791 | error = freeze_processes(); |
@@ -806,7 +806,6 @@ static int software_resume(void) | |||
806 | thaw_processes(); | 806 | thaw_processes(); |
807 | Done: | 807 | Done: |
808 | free_basic_memory_bitmaps(); | 808 | free_basic_memory_bitmaps(); |
809 | usermodehelper_enable(); | ||
810 | Finish: | 809 | Finish: |
811 | pm_notifier_call_chain(PM_POST_RESTORE); | 810 | pm_notifier_call_chain(PM_POST_RESTORE); |
812 | pm_restore_console(); | 811 | pm_restore_console(); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c62..428f8a034e96 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
269 | return (s - buf); | 269 | return (s - buf); |
270 | } | 270 | } |
271 | 271 | ||
272 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | 272 | static suspend_state_t decode_state(const char *buf, size_t n) |
273 | const char *buf, size_t n) | ||
274 | { | 273 | { |
275 | #ifdef CONFIG_SUSPEND | 274 | #ifdef CONFIG_SUSPEND |
276 | suspend_state_t state = PM_SUSPEND_STANDBY; | 275 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
278 | #endif | 277 | #endif |
279 | char *p; | 278 | char *p; |
280 | int len; | 279 | int len; |
281 | int error = -EINVAL; | ||
282 | 280 | ||
283 | p = memchr(buf, '\n', n); | 281 | p = memchr(buf, '\n', n); |
284 | len = p ? p - buf : n; | 282 | len = p ? p - buf : n; |
285 | 283 | ||
286 | /* First, check if we are requested to hibernate */ | 284 | /* Check hibernation first. */ |
287 | if (len == 4 && !strncmp(buf, "disk", len)) { | 285 | if (len == 4 && !strncmp(buf, "disk", len)) |
288 | error = hibernate(); | 286 | return PM_SUSPEND_MAX; |
289 | goto Exit; | ||
290 | } | ||
291 | 287 | ||
292 | #ifdef CONFIG_SUSPEND | 288 | #ifdef CONFIG_SUSPEND |
293 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | 289 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) |
294 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { | 290 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) |
295 | error = pm_suspend(state); | 291 | return state; |
296 | break; | ||
297 | } | ||
298 | } | ||
299 | #endif | 292 | #endif |
300 | 293 | ||
301 | Exit: | 294 | return PM_SUSPEND_ON; |
295 | } | ||
296 | |||
297 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
298 | const char *buf, size_t n) | ||
299 | { | ||
300 | suspend_state_t state; | ||
301 | int error; | ||
302 | |||
303 | error = pm_autosleep_lock(); | ||
304 | if (error) | ||
305 | return error; | ||
306 | |||
307 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
308 | error = -EBUSY; | ||
309 | goto out; | ||
310 | } | ||
311 | |||
312 | state = decode_state(buf, n); | ||
313 | if (state < PM_SUSPEND_MAX) | ||
314 | error = pm_suspend(state); | ||
315 | else if (state == PM_SUSPEND_MAX) | ||
316 | error = hibernate(); | ||
317 | else | ||
318 | error = -EINVAL; | ||
319 | |||
320 | out: | ||
321 | pm_autosleep_unlock(); | ||
302 | return error ? error : n; | 322 | return error ? error : n; |
303 | } | 323 | } |
304 | 324 | ||
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, | |||
339 | { | 359 | { |
340 | unsigned int val; | 360 | unsigned int val; |
341 | 361 | ||
342 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; | 362 | return pm_get_wakeup_count(&val, true) ? |
363 | sprintf(buf, "%u\n", val) : -EINTR; | ||
343 | } | 364 | } |
344 | 365 | ||
345 | static ssize_t wakeup_count_store(struct kobject *kobj, | 366 | static ssize_t wakeup_count_store(struct kobject *kobj, |
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, | |||
347 | const char *buf, size_t n) | 368 | const char *buf, size_t n) |
348 | { | 369 | { |
349 | unsigned int val; | 370 | unsigned int val; |
371 | int error; | ||
372 | |||
373 | error = pm_autosleep_lock(); | ||
374 | if (error) | ||
375 | return error; | ||
376 | |||
377 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
378 | error = -EBUSY; | ||
379 | goto out; | ||
380 | } | ||
350 | 381 | ||
382 | error = -EINVAL; | ||
351 | if (sscanf(buf, "%u", &val) == 1) { | 383 | if (sscanf(buf, "%u", &val) == 1) { |
352 | if (pm_save_wakeup_count(val)) | 384 | if (pm_save_wakeup_count(val)) |
353 | return n; | 385 | error = n; |
354 | } | 386 | } |
355 | return -EINVAL; | 387 | |
388 | out: | ||
389 | pm_autosleep_unlock(); | ||
390 | return error; | ||
356 | } | 391 | } |
357 | 392 | ||
358 | power_attr(wakeup_count); | 393 | power_attr(wakeup_count); |
394 | |||
395 | #ifdef CONFIG_PM_AUTOSLEEP | ||
396 | static ssize_t autosleep_show(struct kobject *kobj, | ||
397 | struct kobj_attribute *attr, | ||
398 | char *buf) | ||
399 | { | ||
400 | suspend_state_t state = pm_autosleep_state(); | ||
401 | |||
402 | if (state == PM_SUSPEND_ON) | ||
403 | return sprintf(buf, "off\n"); | ||
404 | |||
405 | #ifdef CONFIG_SUSPEND | ||
406 | if (state < PM_SUSPEND_MAX) | ||
407 | return sprintf(buf, "%s\n", valid_state(state) ? | ||
408 | pm_states[state] : "error"); | ||
409 | #endif | ||
410 | #ifdef CONFIG_HIBERNATION | ||
411 | return sprintf(buf, "disk\n"); | ||
412 | #else | ||
413 | return sprintf(buf, "error"); | ||
414 | #endif | ||
415 | } | ||
416 | |||
417 | static ssize_t autosleep_store(struct kobject *kobj, | ||
418 | struct kobj_attribute *attr, | ||
419 | const char *buf, size_t n) | ||
420 | { | ||
421 | suspend_state_t state = decode_state(buf, n); | ||
422 | int error; | ||
423 | |||
424 | if (state == PM_SUSPEND_ON | ||
425 | && strcmp(buf, "off") && strcmp(buf, "off\n")) | ||
426 | return -EINVAL; | ||
427 | |||
428 | error = pm_autosleep_set_state(state); | ||
429 | return error ? error : n; | ||
430 | } | ||
431 | |||
432 | power_attr(autosleep); | ||
433 | #endif /* CONFIG_PM_AUTOSLEEP */ | ||
434 | |||
435 | #ifdef CONFIG_PM_WAKELOCKS | ||
436 | static ssize_t wake_lock_show(struct kobject *kobj, | ||
437 | struct kobj_attribute *attr, | ||
438 | char *buf) | ||
439 | { | ||
440 | return pm_show_wakelocks(buf, true); | ||
441 | } | ||
442 | |||
443 | static ssize_t wake_lock_store(struct kobject *kobj, | ||
444 | struct kobj_attribute *attr, | ||
445 | const char *buf, size_t n) | ||
446 | { | ||
447 | int error = pm_wake_lock(buf); | ||
448 | return error ? error : n; | ||
449 | } | ||
450 | |||
451 | power_attr(wake_lock); | ||
452 | |||
453 | static ssize_t wake_unlock_show(struct kobject *kobj, | ||
454 | struct kobj_attribute *attr, | ||
455 | char *buf) | ||
456 | { | ||
457 | return pm_show_wakelocks(buf, false); | ||
458 | } | ||
459 | |||
460 | static ssize_t wake_unlock_store(struct kobject *kobj, | ||
461 | struct kobj_attribute *attr, | ||
462 | const char *buf, size_t n) | ||
463 | { | ||
464 | int error = pm_wake_unlock(buf); | ||
465 | return error ? error : n; | ||
466 | } | ||
467 | |||
468 | power_attr(wake_unlock); | ||
469 | |||
470 | #endif /* CONFIG_PM_WAKELOCKS */ | ||
359 | #endif /* CONFIG_PM_SLEEP */ | 471 | #endif /* CONFIG_PM_SLEEP */ |
360 | 472 | ||
361 | #ifdef CONFIG_PM_TRACE | 473 | #ifdef CONFIG_PM_TRACE |
@@ -409,6 +521,13 @@ static struct attribute * g[] = { | |||
409 | #ifdef CONFIG_PM_SLEEP | 521 | #ifdef CONFIG_PM_SLEEP |
410 | &pm_async_attr.attr, | 522 | &pm_async_attr.attr, |
411 | &wakeup_count_attr.attr, | 523 | &wakeup_count_attr.attr, |
524 | #ifdef CONFIG_PM_AUTOSLEEP | ||
525 | &autosleep_attr.attr, | ||
526 | #endif | ||
527 | #ifdef CONFIG_PM_WAKELOCKS | ||
528 | &wake_lock_attr.attr, | ||
529 | &wake_unlock_attr.attr, | ||
530 | #endif | ||
412 | #ifdef CONFIG_PM_DEBUG | 531 | #ifdef CONFIG_PM_DEBUG |
413 | &pm_test_attr.attr, | 532 | &pm_test_attr.attr, |
414 | #endif | 533 | #endif |
@@ -444,7 +563,10 @@ static int __init pm_init(void) | |||
444 | power_kobj = kobject_create_and_add("power", NULL); | 563 | power_kobj = kobject_create_and_add("power", NULL); |
445 | if (!power_kobj) | 564 | if (!power_kobj) |
446 | return -ENOMEM; | 565 | return -ENOMEM; |
447 | return sysfs_create_group(power_kobj, &attr_group); | 566 | error = sysfs_create_group(power_kobj, &attr_group); |
567 | if (error) | ||
568 | return error; | ||
569 | return pm_autosleep_init(); | ||
448 | } | 570 | } |
449 | 571 | ||
450 | core_initcall(pm_init); | 572 | core_initcall(pm_init); |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d7407..b0bd4beaebfe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) | |||
264 | { | 264 | { |
265 | } | 265 | } |
266 | #endif | 266 | #endif |
267 | |||
268 | #ifdef CONFIG_PM_AUTOSLEEP | ||
269 | |||
270 | /* kernel/power/autosleep.c */ | ||
271 | extern int pm_autosleep_init(void); | ||
272 | extern int pm_autosleep_lock(void); | ||
273 | extern void pm_autosleep_unlock(void); | ||
274 | extern suspend_state_t pm_autosleep_state(void); | ||
275 | extern int pm_autosleep_set_state(suspend_state_t state); | ||
276 | |||
277 | #else /* !CONFIG_PM_AUTOSLEEP */ | ||
278 | |||
279 | static inline int pm_autosleep_init(void) { return 0; } | ||
280 | static inline int pm_autosleep_lock(void) { return 0; } | ||
281 | static inline void pm_autosleep_unlock(void) {} | ||
282 | static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } | ||
283 | |||
284 | #endif /* !CONFIG_PM_AUTOSLEEP */ | ||
285 | |||
286 | #ifdef CONFIG_PM_WAKELOCKS | ||
287 | |||
288 | /* kernel/power/wakelock.c */ | ||
289 | extern ssize_t pm_show_wakelocks(char *buf, bool show_active); | ||
290 | extern int pm_wake_lock(const char *buf); | ||
291 | extern int pm_wake_unlock(const char *buf); | ||
292 | |||
293 | #endif /* CONFIG_PM_WAKELOCKS */ | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0d2aeb226108..19db29f67558 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/freezer.h> | 16 | #include <linux/freezer.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/workqueue.h> | 18 | #include <linux/workqueue.h> |
19 | #include <linux/kmod.h> | ||
19 | 20 | ||
20 | /* | 21 | /* |
21 | * Timeout for stopping processes | 22 | * Timeout for stopping processes |
@@ -122,6 +123,10 @@ int freeze_processes(void) | |||
122 | { | 123 | { |
123 | int error; | 124 | int error; |
124 | 125 | ||
126 | error = __usermodehelper_disable(UMH_FREEZING); | ||
127 | if (error) | ||
128 | return error; | ||
129 | |||
125 | if (!pm_freezing) | 130 | if (!pm_freezing) |
126 | atomic_inc(&system_freezing_cnt); | 131 | atomic_inc(&system_freezing_cnt); |
127 | 132 | ||
@@ -130,6 +135,7 @@ int freeze_processes(void) | |||
130 | error = try_to_freeze_tasks(true); | 135 | error = try_to_freeze_tasks(true); |
131 | if (!error) { | 136 | if (!error) { |
132 | printk("done."); | 137 | printk("done."); |
138 | __usermodehelper_set_disable_depth(UMH_DISABLED); | ||
133 | oom_killer_disable(); | 139 | oom_killer_disable(); |
134 | } | 140 | } |
135 | printk("\n"); | 141 | printk("\n"); |
@@ -187,6 +193,8 @@ void thaw_processes(void) | |||
187 | } while_each_thread(g, p); | 193 | } while_each_thread(g, p); |
188 | read_unlock(&tasklist_lock); | 194 | read_unlock(&tasklist_lock); |
189 | 195 | ||
196 | usermodehelper_enable(); | ||
197 | |||
190 | schedule(); | 198 | schedule(); |
191 | printk("done.\n"); | 199 | printk("done.\n"); |
192 | } | 200 | } |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index d6d6dbd1ecc0..6a031e684026 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -230,6 +230,21 @@ int pm_qos_request_active(struct pm_qos_request *req) | |||
230 | EXPORT_SYMBOL_GPL(pm_qos_request_active); | 230 | EXPORT_SYMBOL_GPL(pm_qos_request_active); |
231 | 231 | ||
232 | /** | 232 | /** |
233 | * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout | ||
234 | * @work: work struct for the delayed work (timeout) | ||
235 | * | ||
236 | * This cancels the timeout request by falling back to the default at timeout. | ||
237 | */ | ||
238 | static void pm_qos_work_fn(struct work_struct *work) | ||
239 | { | ||
240 | struct pm_qos_request *req = container_of(to_delayed_work(work), | ||
241 | struct pm_qos_request, | ||
242 | work); | ||
243 | |||
244 | pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); | ||
245 | } | ||
246 | |||
247 | /** | ||
233 | * pm_qos_add_request - inserts new qos request into the list | 248 | * pm_qos_add_request - inserts new qos request into the list |
234 | * @req: pointer to a preallocated handle | 249 | * @req: pointer to a preallocated handle |
235 | * @pm_qos_class: identifies which list of qos request to use | 250 | * @pm_qos_class: identifies which list of qos request to use |
@@ -253,6 +268,7 @@ void pm_qos_add_request(struct pm_qos_request *req, | |||
253 | return; | 268 | return; |
254 | } | 269 | } |
255 | req->pm_qos_class = pm_qos_class; | 270 | req->pm_qos_class = pm_qos_class; |
271 | INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); | ||
256 | pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, | 272 | pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, |
257 | &req->node, PM_QOS_ADD_REQ, value); | 273 | &req->node, PM_QOS_ADD_REQ, value); |
258 | } | 274 | } |
@@ -279,6 +295,9 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
279 | return; | 295 | return; |
280 | } | 296 | } |
281 | 297 | ||
298 | if (delayed_work_pending(&req->work)) | ||
299 | cancel_delayed_work_sync(&req->work); | ||
300 | |||
282 | if (new_value != req->node.prio) | 301 | if (new_value != req->node.prio) |
283 | pm_qos_update_target( | 302 | pm_qos_update_target( |
284 | pm_qos_array[req->pm_qos_class]->constraints, | 303 | pm_qos_array[req->pm_qos_class]->constraints, |
@@ -287,6 +306,34 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
287 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 306 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
288 | 307 | ||
289 | /** | 308 | /** |
309 | * pm_qos_update_request_timeout - modifies an existing qos request temporarily. | ||
310 | * @req : handle to list element holding a pm_qos request to use | ||
311 | * @new_value: defines the temporary qos request | ||
312 | * @timeout_us: the effective duration of this qos request in usecs. | ||
313 | * | ||
314 | * After timeout_us, this qos request is cancelled automatically. | ||
315 | */ | ||
316 | void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, | ||
317 | unsigned long timeout_us) | ||
318 | { | ||
319 | if (!req) | ||
320 | return; | ||
321 | if (WARN(!pm_qos_request_active(req), | ||
322 | "%s called for unknown object.", __func__)) | ||
323 | return; | ||
324 | |||
325 | if (delayed_work_pending(&req->work)) | ||
326 | cancel_delayed_work_sync(&req->work); | ||
327 | |||
328 | if (new_value != req->node.prio) | ||
329 | pm_qos_update_target( | ||
330 | pm_qos_array[req->pm_qos_class]->constraints, | ||
331 | &req->node, PM_QOS_UPDATE_REQ, new_value); | ||
332 | |||
333 | schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); | ||
334 | } | ||
335 | |||
336 | /** | ||
290 | * pm_qos_remove_request - removes an existing qos request | 337 | * pm_qos_remove_request - removes an existing qos request |
291 | * @req: handle to request list element | 338 | * @req: handle to request list element |
292 | * | 339 | * |
@@ -305,6 +352,9 @@ void pm_qos_remove_request(struct pm_qos_request *req) | |||
305 | return; | 352 | return; |
306 | } | 353 | } |
307 | 354 | ||
355 | if (delayed_work_pending(&req->work)) | ||
356 | cancel_delayed_work_sync(&req->work); | ||
357 | |||
308 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, | 358 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, |
309 | &req->node, PM_QOS_REMOVE_REQ, | 359 | &req->node, PM_QOS_REMOVE_REQ, |
310 | PM_QOS_DEFAULT_VALUE); | 360 | PM_QOS_DEFAULT_VALUE); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 88e5c967370d..396d262b8fd0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -12,7 +12,6 @@ | |||
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/kmod.h> | ||
16 | #include <linux/console.h> | 15 | #include <linux/console.h> |
17 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
18 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
@@ -102,17 +101,12 @@ static int suspend_prepare(void) | |||
102 | if (error) | 101 | if (error) |
103 | goto Finish; | 102 | goto Finish; |
104 | 103 | ||
105 | error = usermodehelper_disable(); | ||
106 | if (error) | ||
107 | goto Finish; | ||
108 | |||
109 | error = suspend_freeze_processes(); | 104 | error = suspend_freeze_processes(); |
110 | if (!error) | 105 | if (!error) |
111 | return 0; | 106 | return 0; |
112 | 107 | ||
113 | suspend_stats.failed_freeze++; | 108 | suspend_stats.failed_freeze++; |
114 | dpm_save_failed_step(SUSPEND_FREEZE); | 109 | dpm_save_failed_step(SUSPEND_FREEZE); |
115 | usermodehelper_enable(); | ||
116 | Finish: | 110 | Finish: |
117 | pm_notifier_call_chain(PM_POST_SUSPEND); | 111 | pm_notifier_call_chain(PM_POST_SUSPEND); |
118 | pm_restore_console(); | 112 | pm_restore_console(); |
@@ -259,7 +253,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
259 | static void suspend_finish(void) | 253 | static void suspend_finish(void) |
260 | { | 254 | { |
261 | suspend_thaw_processes(); | 255 | suspend_thaw_processes(); |
262 | usermodehelper_enable(); | ||
263 | pm_notifier_call_chain(PM_POST_SUSPEND); | 256 | pm_notifier_call_chain(PM_POST_SUSPEND); |
264 | pm_restore_console(); | 257 | pm_restore_console(); |
265 | } | 258 | } |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8742fd013a94..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | 9 | * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> |
10 | * | 10 | * |
11 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
12 | * | 12 | * |
@@ -51,6 +51,23 @@ | |||
51 | 51 | ||
52 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) | 52 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) |
53 | 53 | ||
54 | /* | ||
55 | * Number of free pages that are not high. | ||
56 | */ | ||
57 | static inline unsigned long low_free_pages(void) | ||
58 | { | ||
59 | return nr_free_pages() - nr_free_highpages(); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * Number of pages required to be kept free while writing the image. Always | ||
64 | * half of all available low pages before the writing starts. | ||
65 | */ | ||
66 | static inline unsigned long reqd_free_pages(void) | ||
67 | { | ||
68 | return low_free_pages() / 2; | ||
69 | } | ||
70 | |||
54 | struct swap_map_page { | 71 | struct swap_map_page { |
55 | sector_t entries[MAP_PAGE_ENTRIES]; | 72 | sector_t entries[MAP_PAGE_ENTRIES]; |
56 | sector_t next_swap; | 73 | sector_t next_swap; |
@@ -72,7 +89,7 @@ struct swap_map_handle { | |||
72 | sector_t cur_swap; | 89 | sector_t cur_swap; |
73 | sector_t first_sector; | 90 | sector_t first_sector; |
74 | unsigned int k; | 91 | unsigned int k; |
75 | unsigned long nr_free_pages, written; | 92 | unsigned long reqd_free_pages; |
76 | u32 crc32; | 93 | u32 crc32; |
77 | }; | 94 | }; |
78 | 95 | ||
@@ -265,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
265 | return -ENOSPC; | 282 | return -ENOSPC; |
266 | 283 | ||
267 | if (bio_chain) { | 284 | if (bio_chain) { |
268 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | |
286 | __GFP_NORETRY); | ||
269 | if (src) { | 287 | if (src) { |
270 | copy_page(src, buf); | 288 | copy_page(src, buf); |
271 | } else { | 289 | } else { |
272 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ | 290 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ |
273 | if (ret) | 291 | if (ret) |
274 | return ret; | 292 | return ret; |
275 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 293 | src = (void *)__get_free_page(__GFP_WAIT | |
294 | __GFP_NOWARN | | ||
295 | __GFP_NORETRY); | ||
276 | if (src) { | 296 | if (src) { |
277 | copy_page(src, buf); | 297 | copy_page(src, buf); |
278 | } else { | 298 | } else { |
@@ -316,8 +336,7 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
316 | goto err_rel; | 336 | goto err_rel; |
317 | } | 337 | } |
318 | handle->k = 0; | 338 | handle->k = 0; |
319 | handle->nr_free_pages = nr_free_pages() >> 1; | 339 | handle->reqd_free_pages = reqd_free_pages(); |
320 | handle->written = 0; | ||
321 | handle->first_sector = handle->cur_swap; | 340 | handle->first_sector = handle->cur_swap; |
322 | return 0; | 341 | return 0; |
323 | err_rel: | 342 | err_rel: |
@@ -351,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
351 | clear_page(handle->cur); | 370 | clear_page(handle->cur); |
352 | handle->cur_swap = offset; | 371 | handle->cur_swap = offset; |
353 | handle->k = 0; | 372 | handle->k = 0; |
354 | } | 373 | |
355 | if (bio_chain && ++handle->written > handle->nr_free_pages) { | 374 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { |
356 | error = hib_wait_on_bio_chain(bio_chain); | 375 | error = hib_wait_on_bio_chain(bio_chain); |
357 | if (error) | 376 | if (error) |
358 | goto out; | 377 | goto out; |
359 | handle->written = 0; | 378 | /* |
379 | * Recalculate the number of required free pages, to | ||
380 | * make sure we never take more than half. | ||
381 | */ | ||
382 | handle->reqd_free_pages = reqd_free_pages(); | ||
383 | } | ||
360 | } | 384 | } |
361 | out: | 385 | out: |
362 | return error; | 386 | return error; |
@@ -403,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
403 | /* Maximum number of threads for compression/decompression. */ | 427 | /* Maximum number of threads for compression/decompression. */ |
404 | #define LZO_THREADS 3 | 428 | #define LZO_THREADS 3 |
405 | 429 | ||
406 | /* Maximum number of pages for read buffering. */ | 430 | /* Minimum/maximum number of pages for read buffering. */ |
407 | #define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) | 431 | #define LZO_MIN_RD_PAGES 1024 |
432 | #define LZO_MAX_RD_PAGES 8192 | ||
408 | 433 | ||
409 | 434 | ||
410 | /** | 435 | /** |
@@ -615,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
615 | } | 640 | } |
616 | 641 | ||
617 | /* | 642 | /* |
618 | * Adjust number of free pages after all allocations have been done. | ||
619 | * We don't want to run out of pages when writing. | ||
620 | */ | ||
621 | handle->nr_free_pages = nr_free_pages() >> 1; | ||
622 | |||
623 | /* | ||
624 | * Start the CRC32 thread. | 643 | * Start the CRC32 thread. |
625 | */ | 644 | */ |
626 | init_waitqueue_head(&crc->go); | 645 | init_waitqueue_head(&crc->go); |
@@ -641,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
641 | goto out_clean; | 660 | goto out_clean; |
642 | } | 661 | } |
643 | 662 | ||
663 | /* | ||
664 | * Adjust the number of required free pages after all allocations have | ||
665 | * been done. We don't want to run out of pages when writing. | ||
666 | */ | ||
667 | handle->reqd_free_pages = reqd_free_pages(); | ||
668 | |||
644 | printk(KERN_INFO | 669 | printk(KERN_INFO |
645 | "PM: Using %u thread(s) for compression.\n" | 670 | "PM: Using %u thread(s) for compression.\n" |
646 | "PM: Compressing and saving image data (%u pages) ... ", | 671 | "PM: Compressing and saving image data (%u pages) ... ", |
@@ -1051,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1051 | unsigned i, thr, run_threads, nr_threads; | 1076 | unsigned i, thr, run_threads, nr_threads; |
1052 | unsigned ring = 0, pg = 0, ring_size = 0, | 1077 | unsigned ring = 0, pg = 0, ring_size = 0, |
1053 | have = 0, want, need, asked = 0; | 1078 | have = 0, want, need, asked = 0; |
1054 | unsigned long read_pages; | 1079 | unsigned long read_pages = 0; |
1055 | unsigned char **page = NULL; | 1080 | unsigned char **page = NULL; |
1056 | struct dec_data *data = NULL; | 1081 | struct dec_data *data = NULL; |
1057 | struct crc_data *crc = NULL; | 1082 | struct crc_data *crc = NULL; |
@@ -1063,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1063 | nr_threads = num_online_cpus() - 1; | 1088 | nr_threads = num_online_cpus() - 1; |
1064 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | 1089 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); |
1065 | 1090 | ||
1066 | page = vmalloc(sizeof(*page) * LZO_READ_PAGES); | 1091 | page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); |
1067 | if (!page) { | 1092 | if (!page) { |
1068 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 1093 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
1069 | ret = -ENOMEM; | 1094 | ret = -ENOMEM; |
@@ -1128,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1128 | } | 1153 | } |
1129 | 1154 | ||
1130 | /* | 1155 | /* |
1131 | * Adjust number of pages for read buffering, in case we are short. | 1156 | * Set the number of pages for read buffering. |
1157 | * This is complete guesswork, because we'll only know the real | ||
1158 | * picture once prepare_image() is called, which is much later on | ||
1159 | * during the image load phase. We'll assume the worst case and | ||
1160 | * say that none of the image pages are from high memory. | ||
1132 | */ | 1161 | */ |
1133 | read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; | 1162 | if (low_free_pages() > snapshot_get_image_size()) |
1134 | read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); | 1163 | read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; |
1164 | read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); | ||
1135 | 1165 | ||
1136 | for (i = 0; i < read_pages; i++) { | 1166 | for (i = 0; i < read_pages; i++) { |
1137 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | 1167 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? |
1138 | __GFP_WAIT | __GFP_HIGH : | 1168 | __GFP_WAIT | __GFP_HIGH : |
1139 | __GFP_WAIT); | 1169 | __GFP_WAIT | __GFP_NOWARN | |
1170 | __GFP_NORETRY); | ||
1171 | |||
1140 | if (!page[i]) { | 1172 | if (!page[i]) { |
1141 | if (i < LZO_CMP_PAGES) { | 1173 | if (i < LZO_CMP_PAGES) { |
1142 | ring_size = i; | 1174 | ring_size = i; |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 33c4329205af..91b0fd021a95 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -12,7 +12,6 @@ | |||
12 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
13 | #include <linux/syscalls.h> | 13 | #include <linux/syscalls.h> |
14 | #include <linux/reboot.h> | 14 | #include <linux/reboot.h> |
15 | #include <linux/kmod.h> | ||
16 | #include <linux/string.h> | 15 | #include <linux/string.h> |
17 | #include <linux/device.h> | 16 | #include <linux/device.h> |
18 | #include <linux/miscdevice.h> | 17 | #include <linux/miscdevice.h> |
@@ -222,14 +221,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
222 | sys_sync(); | 221 | sys_sync(); |
223 | printk("done.\n"); | 222 | printk("done.\n"); |
224 | 223 | ||
225 | error = usermodehelper_disable(); | ||
226 | if (error) | ||
227 | break; | ||
228 | |||
229 | error = freeze_processes(); | 224 | error = freeze_processes(); |
230 | if (error) | 225 | if (!error) |
231 | usermodehelper_enable(); | ||
232 | else | ||
233 | data->frozen = 1; | 226 | data->frozen = 1; |
234 | break; | 227 | break; |
235 | 228 | ||
@@ -238,7 +231,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
238 | break; | 231 | break; |
239 | pm_restore_gfp_mask(); | 232 | pm_restore_gfp_mask(); |
240 | thaw_processes(); | 233 | thaw_processes(); |
241 | usermodehelper_enable(); | ||
242 | data->frozen = 0; | 234 | data->frozen = 0; |
243 | break; | 235 | break; |
244 | 236 | ||
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 000000000000..c8fba3380076 --- /dev/null +++ b/kernel/power/wakelock.c | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * kernel/power/wakelock.c | ||
3 | * | ||
4 | * User space wakeup sources support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | * | ||
8 | * This code is based on the analogous interface allowing user space to | ||
9 | * manipulate wakelocks on Android. | ||
10 | */ | ||
11 | |||
12 | #include <linux/ctype.h> | ||
13 | #include <linux/device.h> | ||
14 | #include <linux/err.h> | ||
15 | #include <linux/hrtimer.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/rbtree.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | static DEFINE_MUTEX(wakelocks_lock); | ||
21 | |||
22 | struct wakelock { | ||
23 | char *name; | ||
24 | struct rb_node node; | ||
25 | struct wakeup_source ws; | ||
26 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
27 | struct list_head lru; | ||
28 | #endif | ||
29 | }; | ||
30 | |||
31 | static struct rb_root wakelocks_tree = RB_ROOT; | ||
32 | |||
33 | ssize_t pm_show_wakelocks(char *buf, bool show_active) | ||
34 | { | ||
35 | struct rb_node *node; | ||
36 | struct wakelock *wl; | ||
37 | char *str = buf; | ||
38 | char *end = buf + PAGE_SIZE; | ||
39 | |||
40 | mutex_lock(&wakelocks_lock); | ||
41 | |||
42 | for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { | ||
43 | wl = rb_entry(node, struct wakelock, node); | ||
44 | if (wl->ws.active == show_active) | ||
45 | str += scnprintf(str, end - str, "%s ", wl->name); | ||
46 | } | ||
47 | if (str > buf) | ||
48 | str--; | ||
49 | |||
50 | str += scnprintf(str, end - str, "\n"); | ||
51 | |||
52 | mutex_unlock(&wakelocks_lock); | ||
53 | return (str - buf); | ||
54 | } | ||
55 | |||
56 | #if CONFIG_PM_WAKELOCKS_LIMIT > 0 | ||
57 | static unsigned int number_of_wakelocks; | ||
58 | |||
59 | static inline bool wakelocks_limit_exceeded(void) | ||
60 | { | ||
61 | return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; | ||
62 | } | ||
63 | |||
64 | static inline void increment_wakelocks_number(void) | ||
65 | { | ||
66 | number_of_wakelocks++; | ||
67 | } | ||
68 | |||
69 | static inline void decrement_wakelocks_number(void) | ||
70 | { | ||
71 | number_of_wakelocks--; | ||
72 | } | ||
73 | #else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ | ||
74 | static inline bool wakelocks_limit_exceeded(void) { return false; } | ||
75 | static inline void increment_wakelocks_number(void) {} | ||
76 | static inline void decrement_wakelocks_number(void) {} | ||
77 | #endif /* CONFIG_PM_WAKELOCKS_LIMIT */ | ||
78 | |||
79 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
80 | #define WL_GC_COUNT_MAX 100 | ||
81 | #define WL_GC_TIME_SEC 300 | ||
82 | |||
83 | static LIST_HEAD(wakelocks_lru_list); | ||
84 | static unsigned int wakelocks_gc_count; | ||
85 | |||
86 | static inline void wakelocks_lru_add(struct wakelock *wl) | ||
87 | { | ||
88 | list_add(&wl->lru, &wakelocks_lru_list); | ||
89 | } | ||
90 | |||
91 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) | ||
92 | { | ||
93 | list_move(&wl->lru, &wakelocks_lru_list); | ||
94 | } | ||
95 | |||
96 | static void wakelocks_gc(void) | ||
97 | { | ||
98 | struct wakelock *wl, *aux; | ||
99 | ktime_t now; | ||
100 | |||
101 | if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) | ||
102 | return; | ||
103 | |||
104 | now = ktime_get(); | ||
105 | list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { | ||
106 | u64 idle_time_ns; | ||
107 | bool active; | ||
108 | |||
109 | spin_lock_irq(&wl->ws.lock); | ||
110 | idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); | ||
111 | active = wl->ws.active; | ||
112 | spin_unlock_irq(&wl->ws.lock); | ||
113 | |||
114 | if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) | ||
115 | break; | ||
116 | |||
117 | if (!active) { | ||
118 | wakeup_source_remove(&wl->ws); | ||
119 | rb_erase(&wl->node, &wakelocks_tree); | ||
120 | list_del(&wl->lru); | ||
121 | kfree(wl->name); | ||
122 | kfree(wl); | ||
123 | decrement_wakelocks_number(); | ||
124 | } | ||
125 | } | ||
126 | wakelocks_gc_count = 0; | ||
127 | } | ||
128 | #else /* !CONFIG_PM_WAKELOCKS_GC */ | ||
129 | static inline void wakelocks_lru_add(struct wakelock *wl) {} | ||
130 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} | ||
131 | static inline void wakelocks_gc(void) {} | ||
132 | #endif /* !CONFIG_PM_WAKELOCKS_GC */ | ||
133 | |||
134 | static struct wakelock *wakelock_lookup_add(const char *name, size_t len, | ||
135 | bool add_if_not_found) | ||
136 | { | ||
137 | struct rb_node **node = &wakelocks_tree.rb_node; | ||
138 | struct rb_node *parent = *node; | ||
139 | struct wakelock *wl; | ||
140 | |||
141 | while (*node) { | ||
142 | int diff; | ||
143 | |||
144 | parent = *node; | ||
145 | wl = rb_entry(*node, struct wakelock, node); | ||
146 | diff = strncmp(name, wl->name, len); | ||
147 | if (diff == 0) { | ||
148 | if (wl->name[len]) | ||
149 | diff = -1; | ||
150 | else | ||
151 | return wl; | ||
152 | } | ||
153 | if (diff < 0) | ||
154 | node = &(*node)->rb_left; | ||
155 | else | ||
156 | node = &(*node)->rb_right; | ||
157 | } | ||
158 | if (!add_if_not_found) | ||
159 | return ERR_PTR(-EINVAL); | ||
160 | |||
161 | if (wakelocks_limit_exceeded()) | ||
162 | return ERR_PTR(-ENOSPC); | ||
163 | |||
164 | /* Not found, we have to add a new one. */ | ||
165 | wl = kzalloc(sizeof(*wl), GFP_KERNEL); | ||
166 | if (!wl) | ||
167 | return ERR_PTR(-ENOMEM); | ||
168 | |||
169 | wl->name = kstrndup(name, len, GFP_KERNEL); | ||
170 | if (!wl->name) { | ||
171 | kfree(wl); | ||
172 | return ERR_PTR(-ENOMEM); | ||
173 | } | ||
174 | wl->ws.name = wl->name; | ||
175 | wakeup_source_add(&wl->ws); | ||
176 | rb_link_node(&wl->node, parent, node); | ||
177 | rb_insert_color(&wl->node, &wakelocks_tree); | ||
178 | wakelocks_lru_add(wl); | ||
179 | increment_wakelocks_number(); | ||
180 | return wl; | ||
181 | } | ||
182 | |||
183 | int pm_wake_lock(const char *buf) | ||
184 | { | ||
185 | const char *str = buf; | ||
186 | struct wakelock *wl; | ||
187 | u64 timeout_ns = 0; | ||
188 | size_t len; | ||
189 | int ret = 0; | ||
190 | |||
191 | while (*str && !isspace(*str)) | ||
192 | str++; | ||
193 | |||
194 | len = str - buf; | ||
195 | if (!len) | ||
196 | return -EINVAL; | ||
197 | |||
198 | if (*str && *str != '\n') { | ||
199 | /* Find out if there's a valid timeout string appended. */ | ||
200 | ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); | ||
201 | if (ret) | ||
202 | return -EINVAL; | ||
203 | } | ||
204 | |||
205 | mutex_lock(&wakelocks_lock); | ||
206 | |||
207 | wl = wakelock_lookup_add(buf, len, true); | ||
208 | if (IS_ERR(wl)) { | ||
209 | ret = PTR_ERR(wl); | ||
210 | goto out; | ||
211 | } | ||
212 | if (timeout_ns) { | ||
213 | u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; | ||
214 | |||
215 | do_div(timeout_ms, NSEC_PER_MSEC); | ||
216 | __pm_wakeup_event(&wl->ws, timeout_ms); | ||
217 | } else { | ||
218 | __pm_stay_awake(&wl->ws); | ||
219 | } | ||
220 | |||
221 | wakelocks_lru_most_recent(wl); | ||
222 | |||
223 | out: | ||
224 | mutex_unlock(&wakelocks_lock); | ||
225 | return ret; | ||
226 | } | ||
227 | |||
228 | int pm_wake_unlock(const char *buf) | ||
229 | { | ||
230 | struct wakelock *wl; | ||
231 | size_t len; | ||
232 | int ret = 0; | ||
233 | |||
234 | len = strlen(buf); | ||
235 | if (!len) | ||
236 | return -EINVAL; | ||
237 | |||
238 | if (buf[len-1] == '\n') | ||
239 | len--; | ||
240 | |||
241 | if (!len) | ||
242 | return -EINVAL; | ||
243 | |||
244 | mutex_lock(&wakelocks_lock); | ||
245 | |||
246 | wl = wakelock_lookup_add(buf, len, false); | ||
247 | if (IS_ERR(wl)) { | ||
248 | ret = PTR_ERR(wl); | ||
249 | goto out; | ||
250 | } | ||
251 | __pm_relax(&wl->ws); | ||
252 | |||
253 | wakelocks_lru_most_recent(wl); | ||
254 | wakelocks_gc(); | ||
255 | |||
256 | out: | ||
257 | mutex_unlock(&wakelocks_lock); | ||
258 | return ret; | ||
259 | } | ||
diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d39..32462d2b364a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/rculist.h> | 43 | #include <linux/rculist.h> |
44 | #include <linux/poll.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | 47 | ||
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
54 | { | 55 | { |
55 | } | 56 | } |
56 | 57 | ||
57 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
58 | |||
59 | /* printk's without a loglevel use this.. */ | 58 | /* printk's without a loglevel use this.. */ |
60 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 59 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
61 | 60 | ||
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); | |||
99 | static int console_locked, console_suspended; | 98 | static int console_locked, console_suspended; |
100 | 99 | ||
101 | /* | 100 | /* |
102 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | ||
103 | * It is also used in interesting ways to provide interlocking in | ||
104 | * console_unlock();. | ||
105 | */ | ||
106 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
107 | |||
108 | #define LOG_BUF_MASK (log_buf_len-1) | ||
109 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | ||
110 | |||
111 | /* | ||
112 | * The indices into log_buf are not constrained to log_buf_len - they | ||
113 | * must be masked before subscripting | ||
114 | */ | ||
115 | static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ | ||
116 | static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ | ||
117 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | ||
118 | |||
119 | /* | ||
120 | * If exclusive_console is non-NULL then only this console is to be printed to. | 101 | * If exclusive_console is non-NULL then only this console is to be printed to. |
121 | */ | 102 | */ |
122 | static struct console *exclusive_console; | 103 | static struct console *exclusive_console; |
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline); | |||
145 | /* Flag: console code may call schedule() */ | 126 | /* Flag: console code may call schedule() */ |
146 | static int console_may_schedule; | 127 | static int console_may_schedule; |
147 | 128 | ||
129 | /* | ||
130 | * The printk log buffer consists of a chain of concatenated variable | ||
131 | * length records. Every record starts with a record header, containing | ||
132 | * the overall length of the record. | ||
133 | * | ||
134 | * The heads to the first and last entry in the buffer, as well as the | ||
135 | * sequence numbers of both of these entries, are maintained when messages | ||
136 | * are stored. | ||
137 | * | ||
138 | * If the heads indicate available messages, the length in the header | ||
139 | * tells the start of the next message. A length == 0 for the next message | ||
140 | * indicates a wrap-around to the beginning of the buffer. | ||
141 | * | ||
142 | * Every record carries the monotonic timestamp in nanoseconds, as well as | ||
143 | * the standard userspace syslog level and syslog facility. The usual | ||
144 | * kernel messages use LOG_KERN; userspace-injected messages always carry | ||
145 | * a matching syslog facility, by default LOG_USER. The origin of every | ||
146 | * message can be reliably determined that way. | ||
147 | * | ||
148 | * The human readable log message directly follows the message header. The | ||
149 | * length of the message text is stored in the header, the stored message | ||
150 | * is not terminated. | ||
151 | * | ||
152 | * Optionally, a message can carry a dictionary of properties (key/value pairs), | ||
153 | * to provide userspace with a machine-readable message context. | ||
154 | * | ||
155 | * Examples for well-defined, commonly used property names are: | ||
156 | * DEVICE=b12:8 device identifier | ||
157 | * b12:8 block dev_t | ||
158 | * c127:3 char dev_t | ||
159 | * n8 netdev ifindex | ||
160 | * +sound:card0 subsystem:devname | ||
161 | * SUBSYSTEM=pci driver-core subsystem name | ||
162 | * | ||
163 | * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value | ||
164 | * follows directly after a '=' character. Every property is terminated by | ||
165 | * a '\0' character. The last property is not terminated. | ||
166 | * | ||
167 | * Example of a message structure: | ||
168 | * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec | ||
169 | * 0008 34 00 record is 52 bytes long | ||
170 | * 000a 0b 00 text is 11 bytes long | ||
171 | * 000c 16 00 dictionary is 22 bytes long | ||
172 | * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) | ||
173 | * 0010 69 74 27 73 20 61 20 6c "it's a l" | ||
174 | * 69 6e 65 "ine" | ||
175 | * 001b 44 45 56 49 43 "DEVIC" | ||
176 | * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" | ||
177 | * 52 49 56 45 52 3d 62 75 "RIVER=bu" | ||
178 | * 67 "g" | ||
179 | * 0031 00 00 00 padding to next message header | ||
180 | * | ||
181 | * The 'struct log' buffer header must never be directly exported to | ||
182 | * userspace, it is a kernel-private implementation detail that might | ||
183 | * need to be changed in the future, when the requirements change. | ||
184 | * | ||
185 | * /dev/kmsg exports the structured data in the following line format: | ||
186 | * "level,sequnum,timestamp;<message text>\n" | ||
187 | * | ||
188 | * The optional key/value pairs are attached as continuation lines starting | ||
189 | * with a space character and terminated by a newline. All possible | ||
190 | * non-printable characters are escaped in the "\xff" notation. | ||
191 | * | ||
192 | * Users of the export format should ignore possible additional values | ||
193 | * separated by ',', and find the message after the ';' character. | ||
194 | */ | ||
195 | |||
196 | struct log { | ||
197 | u64 ts_nsec; /* timestamp in nanoseconds */ | ||
198 | u16 len; /* length of entire record */ | ||
199 | u16 text_len; /* length of text buffer */ | ||
200 | u16 dict_len; /* length of dictionary buffer */ | ||
201 | u16 level; /* syslog level + facility */ | ||
202 | }; | ||
203 | |||
204 | /* | ||
205 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | ||
206 | * used in interesting ways to provide interlocking in console_unlock(); | ||
207 | */ | ||
208 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
209 | |||
210 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | ||
211 | static u64 syslog_seq; | ||
212 | static u32 syslog_idx; | ||
213 | |||
214 | /* index and sequence number of the first record stored in the buffer */ | ||
215 | static u64 log_first_seq; | ||
216 | static u32 log_first_idx; | ||
217 | |||
218 | /* index and sequence number of the next record to store in the buffer */ | ||
219 | static u64 log_next_seq; | ||
148 | #ifdef CONFIG_PRINTK | 220 | #ifdef CONFIG_PRINTK |
221 | static u32 log_next_idx; | ||
222 | |||
223 | /* the next printk record to read after the last 'clear' command */ | ||
224 | static u64 clear_seq; | ||
225 | static u32 clear_idx; | ||
226 | |||
227 | #define LOG_LINE_MAX 1024 | ||
149 | 228 | ||
150 | static char __log_buf[__LOG_BUF_LEN]; | 229 | /* record buffer */ |
230 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
231 | #define LOG_ALIGN 4 | ||
232 | #else | ||
233 | #define LOG_ALIGN 8 | ||
234 | #endif | ||
235 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
236 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | ||
151 | static char *log_buf = __log_buf; | 237 | static char *log_buf = __log_buf; |
152 | static int log_buf_len = __LOG_BUF_LEN; | 238 | static u32 log_buf_len = __LOG_BUF_LEN; |
153 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 239 | |
154 | static int saved_console_loglevel = -1; | 240 | /* cpu currently holding logbuf_lock */ |
241 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
242 | |||
243 | /* human readable text of the record */ | ||
244 | static char *log_text(const struct log *msg) | ||
245 | { | ||
246 | return (char *)msg + sizeof(struct log); | ||
247 | } | ||
248 | |||
249 | /* optional key/value pair dictionary attached to the record */ | ||
250 | static char *log_dict(const struct log *msg) | ||
251 | { | ||
252 | return (char *)msg + sizeof(struct log) + msg->text_len; | ||
253 | } | ||
254 | |||
255 | /* get record by index; idx must point to valid msg */ | ||
256 | static struct log *log_from_idx(u32 idx) | ||
257 | { | ||
258 | struct log *msg = (struct log *)(log_buf + idx); | ||
259 | |||
260 | /* | ||
261 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
262 | * read the message at the start of the buffer. | ||
263 | */ | ||
264 | if (!msg->len) | ||
265 | return (struct log *)log_buf; | ||
266 | return msg; | ||
267 | } | ||
268 | |||
269 | /* get next record; idx must point to valid msg */ | ||
270 | static u32 log_next(u32 idx) | ||
271 | { | ||
272 | struct log *msg = (struct log *)(log_buf + idx); | ||
273 | |||
274 | /* length == 0 indicates the end of the buffer; wrap */ | ||
275 | /* | ||
276 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
277 | * read the message at the start of the buffer as *this* one, and | ||
278 | * return the one after that. | ||
279 | */ | ||
280 | if (!msg->len) { | ||
281 | msg = (struct log *)log_buf; | ||
282 | return msg->len; | ||
283 | } | ||
284 | return idx + msg->len; | ||
285 | } | ||
286 | |||
287 | /* insert record into the buffer, discard old ones, update heads */ | ||
288 | static void log_store(int facility, int level, | ||
289 | const char *dict, u16 dict_len, | ||
290 | const char *text, u16 text_len) | ||
291 | { | ||
292 | struct log *msg; | ||
293 | u32 size, pad_len; | ||
294 | |||
295 | /* number of '\0' padding bytes to next message */ | ||
296 | size = sizeof(struct log) + text_len + dict_len; | ||
297 | pad_len = (-size) & (LOG_ALIGN - 1); | ||
298 | size += pad_len; | ||
299 | |||
300 | while (log_first_seq < log_next_seq) { | ||
301 | u32 free; | ||
302 | |||
303 | if (log_next_idx > log_first_idx) | ||
304 | free = max(log_buf_len - log_next_idx, log_first_idx); | ||
305 | else | ||
306 | free = log_first_idx - log_next_idx; | ||
307 | |||
308 | if (free > size + sizeof(struct log)) | ||
309 | break; | ||
310 | |||
311 | /* drop old messages until we have enough contiuous space */ | ||
312 | log_first_idx = log_next(log_first_idx); | ||
313 | log_first_seq++; | ||
314 | } | ||
315 | |||
316 | if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { | ||
317 | /* | ||
318 | * This message + an additional empty header does not fit | ||
319 | * at the end of the buffer. Add an empty header with len == 0 | ||
320 | * to signify a wrap around. | ||
321 | */ | ||
322 | memset(log_buf + log_next_idx, 0, sizeof(struct log)); | ||
323 | log_next_idx = 0; | ||
324 | } | ||
325 | |||
326 | /* fill message */ | ||
327 | msg = (struct log *)(log_buf + log_next_idx); | ||
328 | memcpy(log_text(msg), text, text_len); | ||
329 | msg->text_len = text_len; | ||
330 | memcpy(log_dict(msg), dict, dict_len); | ||
331 | msg->dict_len = dict_len; | ||
332 | msg->level = (facility << 3) | (level & 7); | ||
333 | msg->ts_nsec = local_clock(); | ||
334 | memset(log_dict(msg) + dict_len, 0, pad_len); | ||
335 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | ||
336 | |||
337 | /* insert message */ | ||
338 | log_next_idx += msg->len; | ||
339 | log_next_seq++; | ||
340 | } | ||
341 | |||
342 | /* /dev/kmsg - userspace message inject/listen interface */ | ||
343 | struct devkmsg_user { | ||
344 | u64 seq; | ||
345 | u32 idx; | ||
346 | struct mutex lock; | ||
347 | char buf[8192]; | ||
348 | }; | ||
349 | |||
350 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | ||
351 | unsigned long count, loff_t pos) | ||
352 | { | ||
353 | char *buf, *line; | ||
354 | int i; | ||
355 | int level = default_message_loglevel; | ||
356 | int facility = 1; /* LOG_USER */ | ||
357 | size_t len = iov_length(iv, count); | ||
358 | ssize_t ret = len; | ||
359 | |||
360 | if (len > LOG_LINE_MAX) | ||
361 | return -EINVAL; | ||
362 | buf = kmalloc(len+1, GFP_KERNEL); | ||
363 | if (buf == NULL) | ||
364 | return -ENOMEM; | ||
365 | |||
366 | line = buf; | ||
367 | for (i = 0; i < count; i++) { | ||
368 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) | ||
369 | goto out; | ||
370 | line += iv[i].iov_len; | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace | ||
375 | * the decimal value represents 32bit, the lower 3 bit are the log | ||
376 | * level, the rest are the log facility. | ||
377 | * | ||
378 | * If no prefix or no userspace facility is specified, we | ||
379 | * enforce LOG_USER, to be able to reliably distinguish | ||
380 | * kernel-generated messages from userspace-injected ones. | ||
381 | */ | ||
382 | line = buf; | ||
383 | if (line[0] == '<') { | ||
384 | char *endp = NULL; | ||
385 | |||
386 | i = simple_strtoul(line+1, &endp, 10); | ||
387 | if (endp && endp[0] == '>') { | ||
388 | level = i & 7; | ||
389 | if (i >> 3) | ||
390 | facility = i >> 3; | ||
391 | endp++; | ||
392 | len -= endp - line; | ||
393 | line = endp; | ||
394 | } | ||
395 | } | ||
396 | line[len] = '\0'; | ||
397 | |||
398 | printk_emit(facility, level, NULL, 0, "%s", line); | ||
399 | out: | ||
400 | kfree(buf); | ||
401 | return ret; | ||
402 | } | ||
403 | |||
404 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | ||
405 | size_t count, loff_t *ppos) | ||
406 | { | ||
407 | struct devkmsg_user *user = file->private_data; | ||
408 | struct log *msg; | ||
409 | u64 ts_usec; | ||
410 | size_t i; | ||
411 | size_t len; | ||
412 | ssize_t ret; | ||
413 | |||
414 | if (!user) | ||
415 | return -EBADF; | ||
416 | |||
417 | mutex_lock(&user->lock); | ||
418 | raw_spin_lock(&logbuf_lock); | ||
419 | while (user->seq == log_next_seq) { | ||
420 | if (file->f_flags & O_NONBLOCK) { | ||
421 | ret = -EAGAIN; | ||
422 | raw_spin_unlock(&logbuf_lock); | ||
423 | goto out; | ||
424 | } | ||
425 | |||
426 | raw_spin_unlock(&logbuf_lock); | ||
427 | ret = wait_event_interruptible(log_wait, | ||
428 | user->seq != log_next_seq); | ||
429 | if (ret) | ||
430 | goto out; | ||
431 | raw_spin_lock(&logbuf_lock); | ||
432 | } | ||
433 | |||
434 | if (user->seq < log_first_seq) { | ||
435 | /* our last seen message is gone, return error and reset */ | ||
436 | user->idx = log_first_idx; | ||
437 | user->seq = log_first_seq; | ||
438 | ret = -EPIPE; | ||
439 | raw_spin_unlock(&logbuf_lock); | ||
440 | goto out; | ||
441 | } | ||
442 | |||
443 | msg = log_from_idx(user->idx); | ||
444 | ts_usec = msg->ts_nsec; | ||
445 | do_div(ts_usec, 1000); | ||
446 | len = sprintf(user->buf, "%u,%llu,%llu;", | ||
447 | msg->level, user->seq, ts_usec); | ||
448 | |||
449 | /* escape non-printable characters */ | ||
450 | for (i = 0; i < msg->text_len; i++) { | ||
451 | unsigned char c = log_text(msg)[i]; | ||
452 | |||
453 | if (c < ' ' || c >= 128) | ||
454 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
455 | else | ||
456 | user->buf[len++] = c; | ||
457 | } | ||
458 | user->buf[len++] = '\n'; | ||
459 | |||
460 | if (msg->dict_len) { | ||
461 | bool line = true; | ||
462 | |||
463 | for (i = 0; i < msg->dict_len; i++) { | ||
464 | unsigned char c = log_dict(msg)[i]; | ||
465 | |||
466 | if (line) { | ||
467 | user->buf[len++] = ' '; | ||
468 | line = false; | ||
469 | } | ||
470 | |||
471 | if (c == '\0') { | ||
472 | user->buf[len++] = '\n'; | ||
473 | line = true; | ||
474 | continue; | ||
475 | } | ||
476 | |||
477 | if (c < ' ' || c >= 128) { | ||
478 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
479 | continue; | ||
480 | } | ||
481 | |||
482 | user->buf[len++] = c; | ||
483 | } | ||
484 | user->buf[len++] = '\n'; | ||
485 | } | ||
486 | |||
487 | user->idx = log_next(user->idx); | ||
488 | user->seq++; | ||
489 | raw_spin_unlock(&logbuf_lock); | ||
490 | |||
491 | if (len > count) { | ||
492 | ret = -EINVAL; | ||
493 | goto out; | ||
494 | } | ||
495 | |||
496 | if (copy_to_user(buf, user->buf, len)) { | ||
497 | ret = -EFAULT; | ||
498 | goto out; | ||
499 | } | ||
500 | ret = len; | ||
501 | out: | ||
502 | mutex_unlock(&user->lock); | ||
503 | return ret; | ||
504 | } | ||
505 | |||
506 | static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | ||
507 | { | ||
508 | struct devkmsg_user *user = file->private_data; | ||
509 | loff_t ret = 0; | ||
510 | |||
511 | if (!user) | ||
512 | return -EBADF; | ||
513 | if (offset) | ||
514 | return -ESPIPE; | ||
515 | |||
516 | raw_spin_lock(&logbuf_lock); | ||
517 | switch (whence) { | ||
518 | case SEEK_SET: | ||
519 | /* the first record */ | ||
520 | user->idx = log_first_idx; | ||
521 | user->seq = log_first_seq; | ||
522 | break; | ||
523 | case SEEK_DATA: | ||
524 | /* | ||
525 | * The first record after the last SYSLOG_ACTION_CLEAR, | ||
526 | * like issued by 'dmesg -c'. Reading /dev/kmsg itself | ||
527 | * changes no global state, and does not clear anything. | ||
528 | */ | ||
529 | user->idx = clear_idx; | ||
530 | user->seq = clear_seq; | ||
531 | break; | ||
532 | case SEEK_END: | ||
533 | /* after the last record */ | ||
534 | user->idx = log_next_idx; | ||
535 | user->seq = log_next_seq; | ||
536 | break; | ||
537 | default: | ||
538 | ret = -EINVAL; | ||
539 | } | ||
540 | raw_spin_unlock(&logbuf_lock); | ||
541 | return ret; | ||
542 | } | ||
543 | |||
544 | static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | ||
545 | { | ||
546 | struct devkmsg_user *user = file->private_data; | ||
547 | int ret = 0; | ||
548 | |||
549 | if (!user) | ||
550 | return POLLERR|POLLNVAL; | ||
551 | |||
552 | poll_wait(file, &log_wait, wait); | ||
553 | |||
554 | raw_spin_lock(&logbuf_lock); | ||
555 | if (user->seq < log_next_seq) { | ||
556 | /* return error when data has vanished underneath us */ | ||
557 | if (user->seq < log_first_seq) | ||
558 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | ||
559 | ret = POLLIN|POLLRDNORM; | ||
560 | } | ||
561 | raw_spin_unlock(&logbuf_lock); | ||
562 | |||
563 | return ret; | ||
564 | } | ||
565 | |||
566 | static int devkmsg_open(struct inode *inode, struct file *file) | ||
567 | { | ||
568 | struct devkmsg_user *user; | ||
569 | int err; | ||
570 | |||
571 | /* write-only does not need any file context */ | ||
572 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) | ||
573 | return 0; | ||
574 | |||
575 | err = security_syslog(SYSLOG_ACTION_READ_ALL); | ||
576 | if (err) | ||
577 | return err; | ||
578 | |||
579 | user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); | ||
580 | if (!user) | ||
581 | return -ENOMEM; | ||
582 | |||
583 | mutex_init(&user->lock); | ||
584 | |||
585 | raw_spin_lock(&logbuf_lock); | ||
586 | user->idx = log_first_idx; | ||
587 | user->seq = log_first_seq; | ||
588 | raw_spin_unlock(&logbuf_lock); | ||
589 | |||
590 | file->private_data = user; | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | static int devkmsg_release(struct inode *inode, struct file *file) | ||
595 | { | ||
596 | struct devkmsg_user *user = file->private_data; | ||
597 | |||
598 | if (!user) | ||
599 | return 0; | ||
600 | |||
601 | mutex_destroy(&user->lock); | ||
602 | kfree(user); | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | const struct file_operations kmsg_fops = { | ||
607 | .open = devkmsg_open, | ||
608 | .read = devkmsg_read, | ||
609 | .aio_write = devkmsg_writev, | ||
610 | .llseek = devkmsg_llseek, | ||
611 | .poll = devkmsg_poll, | ||
612 | .release = devkmsg_release, | ||
613 | }; | ||
155 | 614 | ||
156 | #ifdef CONFIG_KEXEC | 615 | #ifdef CONFIG_KEXEC |
157 | /* | 616 | /* |
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1; | |||
165 | void log_buf_kexec_setup(void) | 624 | void log_buf_kexec_setup(void) |
166 | { | 625 | { |
167 | VMCOREINFO_SYMBOL(log_buf); | 626 | VMCOREINFO_SYMBOL(log_buf); |
168 | VMCOREINFO_SYMBOL(log_end); | ||
169 | VMCOREINFO_SYMBOL(log_buf_len); | 627 | VMCOREINFO_SYMBOL(log_buf_len); |
170 | VMCOREINFO_SYMBOL(logged_chars); | 628 | VMCOREINFO_SYMBOL(log_first_idx); |
629 | VMCOREINFO_SYMBOL(log_next_idx); | ||
171 | } | 630 | } |
172 | #endif | 631 | #endif |
173 | 632 | ||
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup); | |||
191 | void __init setup_log_buf(int early) | 650 | void __init setup_log_buf(int early) |
192 | { | 651 | { |
193 | unsigned long flags; | 652 | unsigned long flags; |
194 | unsigned start, dest_idx, offset; | ||
195 | char *new_log_buf; | 653 | char *new_log_buf; |
196 | int free; | 654 | int free; |
197 | 655 | ||
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early) | |||
219 | log_buf_len = new_log_buf_len; | 677 | log_buf_len = new_log_buf_len; |
220 | log_buf = new_log_buf; | 678 | log_buf = new_log_buf; |
221 | new_log_buf_len = 0; | 679 | new_log_buf_len = 0; |
222 | free = __LOG_BUF_LEN - log_end; | 680 | free = __LOG_BUF_LEN - log_next_idx; |
223 | 681 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | |
224 | offset = start = min(con_start, log_start); | ||
225 | dest_idx = 0; | ||
226 | while (start != log_end) { | ||
227 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
228 | |||
229 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
230 | start++; | ||
231 | dest_idx++; | ||
232 | } | ||
233 | log_start -= offset; | ||
234 | con_start -= offset; | ||
235 | log_end -= offset; | ||
236 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 682 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
237 | 683 | ||
238 | pr_info("log_buf_len: %d\n", log_buf_len); | 684 | pr_info("log_buf_len: %d\n", log_buf_len); |
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file) | |||
332 | return 0; | 778 | return 0; |
333 | } | 779 | } |
334 | 780 | ||
781 | #if defined(CONFIG_PRINTK_TIME) | ||
782 | static bool printk_time = 1; | ||
783 | #else | ||
784 | static bool printk_time; | ||
785 | #endif | ||
786 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
787 | |||
788 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | ||
789 | { | ||
790 | size_t len = 0; | ||
791 | |||
792 | if (syslog) { | ||
793 | if (buf) { | ||
794 | len += sprintf(buf, "<%u>", msg->level); | ||
795 | } else { | ||
796 | len += 3; | ||
797 | if (msg->level > 9) | ||
798 | len++; | ||
799 | if (msg->level > 99) | ||
800 | len++; | ||
801 | } | ||
802 | } | ||
803 | |||
804 | if (printk_time) { | ||
805 | if (buf) { | ||
806 | unsigned long long ts = msg->ts_nsec; | ||
807 | unsigned long rem_nsec = do_div(ts, 1000000000); | ||
808 | |||
809 | len += sprintf(buf + len, "[%5lu.%06lu] ", | ||
810 | (unsigned long) ts, rem_nsec / 1000); | ||
811 | } else { | ||
812 | len += 15; | ||
813 | } | ||
814 | } | ||
815 | |||
816 | return len; | ||
817 | } | ||
818 | |||
819 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
820 | char *buf, size_t size) | ||
821 | { | ||
822 | const char *text = log_text(msg); | ||
823 | size_t text_size = msg->text_len; | ||
824 | size_t len = 0; | ||
825 | |||
826 | do { | ||
827 | const char *next = memchr(text, '\n', text_size); | ||
828 | size_t text_len; | ||
829 | |||
830 | if (next) { | ||
831 | text_len = next - text; | ||
832 | next++; | ||
833 | text_size -= next - text; | ||
834 | } else { | ||
835 | text_len = text_size; | ||
836 | } | ||
837 | |||
838 | if (buf) { | ||
839 | if (print_prefix(msg, syslog, NULL) + | ||
840 | text_len + 1>= size - len) | ||
841 | break; | ||
842 | |||
843 | len += print_prefix(msg, syslog, buf + len); | ||
844 | memcpy(buf + len, text, text_len); | ||
845 | len += text_len; | ||
846 | buf[len++] = '\n'; | ||
847 | } else { | ||
848 | /* SYSLOG_ACTION_* buffer size only calculation */ | ||
849 | len += print_prefix(msg, syslog, NULL); | ||
850 | len += text_len + 1; | ||
851 | } | ||
852 | |||
853 | text = next; | ||
854 | } while (text); | ||
855 | |||
856 | return len; | ||
857 | } | ||
858 | |||
859 | static int syslog_print(char __user *buf, int size) | ||
860 | { | ||
861 | char *text; | ||
862 | struct log *msg; | ||
863 | int len; | ||
864 | |||
865 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
866 | if (!text) | ||
867 | return -ENOMEM; | ||
868 | |||
869 | raw_spin_lock_irq(&logbuf_lock); | ||
870 | if (syslog_seq < log_first_seq) { | ||
871 | /* messages are gone, move to first one */ | ||
872 | syslog_seq = log_first_seq; | ||
873 | syslog_idx = log_first_idx; | ||
874 | } | ||
875 | msg = log_from_idx(syslog_idx); | ||
876 | len = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
877 | syslog_idx = log_next(syslog_idx); | ||
878 | syslog_seq++; | ||
879 | raw_spin_unlock_irq(&logbuf_lock); | ||
880 | |||
881 | if (len > 0 && copy_to_user(buf, text, len)) | ||
882 | len = -EFAULT; | ||
883 | |||
884 | kfree(text); | ||
885 | return len; | ||
886 | } | ||
887 | |||
888 | static int syslog_print_all(char __user *buf, int size, bool clear) | ||
889 | { | ||
890 | char *text; | ||
891 | int len = 0; | ||
892 | |||
893 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
894 | if (!text) | ||
895 | return -ENOMEM; | ||
896 | |||
897 | raw_spin_lock_irq(&logbuf_lock); | ||
898 | if (buf) { | ||
899 | u64 next_seq; | ||
900 | u64 seq; | ||
901 | u32 idx; | ||
902 | |||
903 | if (clear_seq < log_first_seq) { | ||
904 | /* messages are gone, move to first available one */ | ||
905 | clear_seq = log_first_seq; | ||
906 | clear_idx = log_first_idx; | ||
907 | } | ||
908 | |||
909 | /* | ||
910 | * Find first record that fits, including all following records, | ||
911 | * into the user-provided buffer for this dump. | ||
912 | */ | ||
913 | seq = clear_seq; | ||
914 | idx = clear_idx; | ||
915 | while (seq < log_next_seq) { | ||
916 | struct log *msg = log_from_idx(idx); | ||
917 | |||
918 | len += msg_print_text(msg, true, NULL, 0); | ||
919 | idx = log_next(idx); | ||
920 | seq++; | ||
921 | } | ||
922 | seq = clear_seq; | ||
923 | idx = clear_idx; | ||
924 | while (len > size && seq < log_next_seq) { | ||
925 | struct log *msg = log_from_idx(idx); | ||
926 | |||
927 | len -= msg_print_text(msg, true, NULL, 0); | ||
928 | idx = log_next(idx); | ||
929 | seq++; | ||
930 | } | ||
931 | |||
932 | /* last message in this dump */ | ||
933 | next_seq = log_next_seq; | ||
934 | |||
935 | len = 0; | ||
936 | while (len >= 0 && seq < next_seq) { | ||
937 | struct log *msg = log_from_idx(idx); | ||
938 | int textlen; | ||
939 | |||
940 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
941 | if (textlen < 0) { | ||
942 | len = textlen; | ||
943 | break; | ||
944 | } | ||
945 | idx = log_next(idx); | ||
946 | seq++; | ||
947 | |||
948 | raw_spin_unlock_irq(&logbuf_lock); | ||
949 | if (copy_to_user(buf + len, text, textlen)) | ||
950 | len = -EFAULT; | ||
951 | else | ||
952 | len += textlen; | ||
953 | raw_spin_lock_irq(&logbuf_lock); | ||
954 | |||
955 | if (seq < log_first_seq) { | ||
956 | /* messages are gone, move to next one */ | ||
957 | seq = log_first_seq; | ||
958 | idx = log_first_idx; | ||
959 | } | ||
960 | } | ||
961 | } | ||
962 | |||
963 | if (clear) { | ||
964 | clear_seq = log_next_seq; | ||
965 | clear_idx = log_next_idx; | ||
966 | } | ||
967 | raw_spin_unlock_irq(&logbuf_lock); | ||
968 | |||
969 | kfree(text); | ||
970 | return len; | ||
971 | } | ||
972 | |||
335 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 973 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
336 | { | 974 | { |
337 | unsigned i, j, limit, count; | 975 | bool clear = false; |
338 | int do_clear = 0; | 976 | static int saved_console_loglevel = -1; |
339 | char c; | ||
340 | int error; | 977 | int error; |
341 | 978 | ||
342 | error = check_syslog_permissions(type, from_file); | 979 | error = check_syslog_permissions(type, from_file); |
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
364 | goto out; | 1001 | goto out; |
365 | } | 1002 | } |
366 | error = wait_event_interruptible(log_wait, | 1003 | error = wait_event_interruptible(log_wait, |
367 | (log_start - log_end)); | 1004 | syslog_seq != log_next_seq); |
368 | if (error) | 1005 | if (error) |
369 | goto out; | 1006 | goto out; |
370 | i = 0; | 1007 | error = syslog_print(buf, len); |
371 | raw_spin_lock_irq(&logbuf_lock); | ||
372 | while (!error && (log_start != log_end) && i < len) { | ||
373 | c = LOG_BUF(log_start); | ||
374 | log_start++; | ||
375 | raw_spin_unlock_irq(&logbuf_lock); | ||
376 | error = __put_user(c,buf); | ||
377 | buf++; | ||
378 | i++; | ||
379 | cond_resched(); | ||
380 | raw_spin_lock_irq(&logbuf_lock); | ||
381 | } | ||
382 | raw_spin_unlock_irq(&logbuf_lock); | ||
383 | if (!error) | ||
384 | error = i; | ||
385 | break; | 1008 | break; |
386 | /* Read/clear last kernel messages */ | 1009 | /* Read/clear last kernel messages */ |
387 | case SYSLOG_ACTION_READ_CLEAR: | 1010 | case SYSLOG_ACTION_READ_CLEAR: |
388 | do_clear = 1; | 1011 | clear = true; |
389 | /* FALL THRU */ | 1012 | /* FALL THRU */ |
390 | /* Read last kernel messages */ | 1013 | /* Read last kernel messages */ |
391 | case SYSLOG_ACTION_READ_ALL: | 1014 | case SYSLOG_ACTION_READ_ALL: |
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | error = -EFAULT; | 1022 | error = -EFAULT; |
400 | goto out; | 1023 | goto out; |
401 | } | 1024 | } |
402 | count = len; | 1025 | error = syslog_print_all(buf, len, clear); |
403 | if (count > log_buf_len) | ||
404 | count = log_buf_len; | ||
405 | raw_spin_lock_irq(&logbuf_lock); | ||
406 | if (count > logged_chars) | ||
407 | count = logged_chars; | ||
408 | if (do_clear) | ||
409 | logged_chars = 0; | ||
410 | limit = log_end; | ||
411 | /* | ||
412 | * __put_user() could sleep, and while we sleep | ||
413 | * printk() could overwrite the messages | ||
414 | * we try to copy to user space. Therefore | ||
415 | * the messages are copied in reverse. <manfreds> | ||
416 | */ | ||
417 | for (i = 0; i < count && !error; i++) { | ||
418 | j = limit-1-i; | ||
419 | if (j + log_buf_len < log_end) | ||
420 | break; | ||
421 | c = LOG_BUF(j); | ||
422 | raw_spin_unlock_irq(&logbuf_lock); | ||
423 | error = __put_user(c,&buf[count-1-i]); | ||
424 | cond_resched(); | ||
425 | raw_spin_lock_irq(&logbuf_lock); | ||
426 | } | ||
427 | raw_spin_unlock_irq(&logbuf_lock); | ||
428 | if (error) | ||
429 | break; | ||
430 | error = i; | ||
431 | if (i != count) { | ||
432 | int offset = count-error; | ||
433 | /* buffer overflow during copy, correct user buffer. */ | ||
434 | for (i = 0; i < error; i++) { | ||
435 | if (__get_user(c,&buf[i+offset]) || | ||
436 | __put_user(c,&buf[i])) { | ||
437 | error = -EFAULT; | ||
438 | break; | ||
439 | } | ||
440 | cond_resched(); | ||
441 | } | ||
442 | } | ||
443 | break; | 1026 | break; |
444 | /* Clear ring buffer */ | 1027 | /* Clear ring buffer */ |
445 | case SYSLOG_ACTION_CLEAR: | 1028 | case SYSLOG_ACTION_CLEAR: |
446 | logged_chars = 0; | 1029 | syslog_print_all(NULL, 0, true); |
447 | break; | ||
448 | /* Disable logging to console */ | 1030 | /* Disable logging to console */ |
449 | case SYSLOG_ACTION_CONSOLE_OFF: | 1031 | case SYSLOG_ACTION_CONSOLE_OFF: |
450 | if (saved_console_loglevel == -1) | 1032 | if (saved_console_loglevel == -1) |
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
472 | break; | 1054 | break; |
473 | /* Number of chars in the log buffer */ | 1055 | /* Number of chars in the log buffer */ |
474 | case SYSLOG_ACTION_SIZE_UNREAD: | 1056 | case SYSLOG_ACTION_SIZE_UNREAD: |
475 | error = log_end - log_start; | 1057 | raw_spin_lock_irq(&logbuf_lock); |
1058 | if (syslog_seq < log_first_seq) { | ||
1059 | /* messages are gone, move to first one */ | ||
1060 | syslog_seq = log_first_seq; | ||
1061 | syslog_idx = log_first_idx; | ||
1062 | } | ||
1063 | if (from_file) { | ||
1064 | /* | ||
1065 | * Short-cut for poll("/proc/kmsg") which simply checks | ||
1066 | * for pending data, not the size; return the count of | ||
1067 | * records, not the length. | ||
1068 | */ | ||
1069 | error = log_next_idx - syslog_idx; | ||
1070 | } else { | ||
1071 | u64 seq; | ||
1072 | u32 idx; | ||
1073 | |||
1074 | error = 0; | ||
1075 | seq = syslog_seq; | ||
1076 | idx = syslog_idx; | ||
1077 | while (seq < log_next_seq) { | ||
1078 | struct log *msg = log_from_idx(idx); | ||
1079 | |||
1080 | error += msg_print_text(msg, true, NULL, 0); | ||
1081 | idx = log_next(idx); | ||
1082 | seq++; | ||
1083 | } | ||
1084 | } | ||
1085 | raw_spin_unlock_irq(&logbuf_lock); | ||
476 | break; | 1086 | break; |
477 | /* Size of the log buffer */ | 1087 | /* Size of the log buffer */ |
478 | case SYSLOG_ACTION_SIZE_BUFFER: | 1088 | case SYSLOG_ACTION_SIZE_BUFFER: |
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4]) | |||
501 | { | 1111 | { |
502 | syslog_data[0] = log_buf; | 1112 | syslog_data[0] = log_buf; |
503 | syslog_data[1] = log_buf + log_buf_len; | 1113 | syslog_data[1] = log_buf + log_buf_len; |
504 | syslog_data[2] = log_buf + log_end - | 1114 | syslog_data[2] = log_buf + log_first_idx; |
505 | (logged_chars < log_buf_len ? logged_chars : log_buf_len); | 1115 | syslog_data[3] = log_buf + log_next_idx; |
506 | syslog_data[3] = log_buf + log_end; | ||
507 | } | 1116 | } |
508 | #endif /* CONFIG_KGDB_KDB */ | 1117 | #endif /* CONFIG_KGDB_KDB */ |
509 | 1118 | ||
510 | /* | ||
511 | * Call the console drivers on a range of log_buf | ||
512 | */ | ||
513 | static void __call_console_drivers(unsigned start, unsigned end) | ||
514 | { | ||
515 | struct console *con; | ||
516 | |||
517 | for_each_console(con) { | ||
518 | if (exclusive_console && con != exclusive_console) | ||
519 | continue; | ||
520 | if ((con->flags & CON_ENABLED) && con->write && | ||
521 | (cpu_online(smp_processor_id()) || | ||
522 | (con->flags & CON_ANYTIME))) | ||
523 | con->write(con, &LOG_BUF(start), end - start); | ||
524 | } | ||
525 | } | ||
526 | |||
527 | static bool __read_mostly ignore_loglevel; | 1119 | static bool __read_mostly ignore_loglevel; |
528 | 1120 | ||
529 | static int __init ignore_loglevel_setup(char *str) | 1121 | static int __init ignore_loglevel_setup(char *str) |
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | |||
540 | "print all kernel messages to the console."); | 1132 | "print all kernel messages to the console."); |
541 | 1133 | ||
542 | /* | 1134 | /* |
543 | * Write out chars from start to end - 1 inclusive | ||
544 | */ | ||
545 | static void _call_console_drivers(unsigned start, | ||
546 | unsigned end, int msg_log_level) | ||
547 | { | ||
548 | trace_console(&LOG_BUF(0), start, end, log_buf_len); | ||
549 | |||
550 | if ((msg_log_level < console_loglevel || ignore_loglevel) && | ||
551 | console_drivers && start != end) { | ||
552 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | ||
553 | /* wrapped write */ | ||
554 | __call_console_drivers(start & LOG_BUF_MASK, | ||
555 | log_buf_len); | ||
556 | __call_console_drivers(0, end & LOG_BUF_MASK); | ||
557 | } else { | ||
558 | __call_console_drivers(start, end); | ||
559 | } | ||
560 | } | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the | ||
565 | * lower 3 bit are the log level, the rest are the log facility. In case | ||
566 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
567 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
568 | * to extract the correct log level for in-kernel processing, and not mangle | ||
569 | * the original value. | ||
570 | * | ||
571 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
572 | * passed, it will be filled in with the log level without a possible facility | ||
573 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
574 | * and returned. If no valid header is found, 0 is returned and the passed | ||
575 | * variables are not touched. | ||
576 | */ | ||
577 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
578 | { | ||
579 | unsigned int lev = 0; | ||
580 | char sp = '\0'; | ||
581 | size_t len; | ||
582 | |||
583 | if (p[0] != '<' || !p[1]) | ||
584 | return 0; | ||
585 | if (p[2] == '>') { | ||
586 | /* usual single digit level number or special char */ | ||
587 | switch (p[1]) { | ||
588 | case '0' ... '7': | ||
589 | lev = p[1] - '0'; | ||
590 | break; | ||
591 | case 'c': /* KERN_CONT */ | ||
592 | case 'd': /* KERN_DEFAULT */ | ||
593 | sp = p[1]; | ||
594 | break; | ||
595 | default: | ||
596 | return 0; | ||
597 | } | ||
598 | len = 3; | ||
599 | } else { | ||
600 | /* multi digit including the level and facility number */ | ||
601 | char *endp = NULL; | ||
602 | |||
603 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
604 | if (endp == NULL || endp[0] != '>') | ||
605 | return 0; | ||
606 | len = (endp + 1) - p; | ||
607 | } | ||
608 | |||
609 | /* do not accept special char if not asked for */ | ||
610 | if (sp && !special) | ||
611 | return 0; | ||
612 | |||
613 | if (special) { | ||
614 | *special = sp; | ||
615 | /* return special char, do not touch level */ | ||
616 | if (sp) | ||
617 | return len; | ||
618 | } | ||
619 | |||
620 | if (level) | ||
621 | *level = lev; | ||
622 | return len; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * Call the console drivers, asking them to write out | 1135 | * Call the console drivers, asking them to write out |
627 | * log_buf[start] to log_buf[end - 1]. | 1136 | * log_buf[start] to log_buf[end - 1]. |
628 | * The console_lock must be held. | 1137 | * The console_lock must be held. |
629 | */ | 1138 | */ |
630 | static void call_console_drivers(unsigned start, unsigned end) | 1139 | static void call_console_drivers(int level, const char *text, size_t len) |
631 | { | 1140 | { |
632 | unsigned cur_index, start_print; | 1141 | struct console *con; |
633 | static int msg_level = -1; | ||
634 | 1142 | ||
635 | BUG_ON(((int)(start - end)) > 0); | 1143 | trace_console(text, 0, len, len); |
636 | 1144 | ||
637 | cur_index = start; | 1145 | if (level >= console_loglevel && !ignore_loglevel) |
638 | start_print = start; | 1146 | return; |
639 | while (cur_index != end) { | 1147 | if (!console_drivers) |
640 | if (msg_level < 0 && ((end - cur_index) > 2)) { | 1148 | return; |
641 | /* strip log prefix */ | ||
642 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); | ||
643 | start_print = cur_index; | ||
644 | } | ||
645 | while (cur_index != end) { | ||
646 | char c = LOG_BUF(cur_index); | ||
647 | |||
648 | cur_index++; | ||
649 | if (c == '\n') { | ||
650 | if (msg_level < 0) { | ||
651 | /* | ||
652 | * printk() has already given us loglevel tags in | ||
653 | * the buffer. This code is here in case the | ||
654 | * log buffer has wrapped right round and scribbled | ||
655 | * on those tags | ||
656 | */ | ||
657 | msg_level = default_message_loglevel; | ||
658 | } | ||
659 | _call_console_drivers(start_print, cur_index, msg_level); | ||
660 | msg_level = -1; | ||
661 | start_print = cur_index; | ||
662 | break; | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | _call_console_drivers(start_print, end, msg_level); | ||
667 | } | ||
668 | 1149 | ||
669 | static void emit_log_char(char c) | 1150 | for_each_console(con) { |
670 | { | 1151 | if (exclusive_console && con != exclusive_console) |
671 | LOG_BUF(log_end) = c; | 1152 | continue; |
672 | log_end++; | 1153 | if (!(con->flags & CON_ENABLED)) |
673 | if (log_end - log_start > log_buf_len) | 1154 | continue; |
674 | log_start = log_end - log_buf_len; | 1155 | if (!con->write) |
675 | if (log_end - con_start > log_buf_len) | 1156 | continue; |
676 | con_start = log_end - log_buf_len; | 1157 | if (!cpu_online(smp_processor_id()) && |
677 | if (logged_chars < log_buf_len) | 1158 | !(con->flags & CON_ANYTIME)) |
678 | logged_chars++; | 1159 | continue; |
1160 | con->write(con, text, len); | ||
1161 | } | ||
679 | } | 1162 | } |
680 | 1163 | ||
681 | /* | 1164 | /* |
@@ -700,16 +1183,6 @@ static void zap_locks(void) | |||
700 | sema_init(&console_sem, 1); | 1183 | sema_init(&console_sem, 1); |
701 | } | 1184 | } |
702 | 1185 | ||
703 | #if defined(CONFIG_PRINTK_TIME) | ||
704 | static bool printk_time = 1; | ||
705 | #else | ||
706 | static bool printk_time = 0; | ||
707 | #endif | ||
708 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
709 | |||
710 | static bool always_kmsg_dump; | ||
711 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
712 | |||
713 | /* Check if we have any console registered that can be called early in boot. */ | 1186 | /* Check if we have any console registered that can be called early in boot. */ |
714 | static int have_callable_console(void) | 1187 | static int have_callable_console(void) |
715 | { | 1188 | { |
@@ -722,51 +1195,6 @@ static int have_callable_console(void) | |||
722 | return 0; | 1195 | return 0; |
723 | } | 1196 | } |
724 | 1197 | ||
725 | /** | ||
726 | * printk - print a kernel message | ||
727 | * @fmt: format string | ||
728 | * | ||
729 | * This is printk(). It can be called from any context. We want it to work. | ||
730 | * | ||
731 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and | ||
732 | * call the console drivers. If we fail to get the semaphore we place the output | ||
733 | * into the log buffer and return. The current holder of the console_sem will | ||
734 | * notice the new output in console_unlock(); and will send it to the | ||
735 | * consoles before releasing the lock. | ||
736 | * | ||
737 | * One effect of this deferred printing is that code which calls printk() and | ||
738 | * then changes console_loglevel may break. This is because console_loglevel | ||
739 | * is inspected when the actual printing occurs. | ||
740 | * | ||
741 | * See also: | ||
742 | * printf(3) | ||
743 | * | ||
744 | * See the vsnprintf() documentation for format string extensions over C99. | ||
745 | */ | ||
746 | |||
747 | asmlinkage int printk(const char *fmt, ...) | ||
748 | { | ||
749 | va_list args; | ||
750 | int r; | ||
751 | |||
752 | #ifdef CONFIG_KGDB_KDB | ||
753 | if (unlikely(kdb_trap_printk)) { | ||
754 | va_start(args, fmt); | ||
755 | r = vkdb_printf(fmt, args); | ||
756 | va_end(args); | ||
757 | return r; | ||
758 | } | ||
759 | #endif | ||
760 | va_start(args, fmt); | ||
761 | r = vprintk(fmt, args); | ||
762 | va_end(args); | ||
763 | |||
764 | return r; | ||
765 | } | ||
766 | |||
767 | /* cpu currently holding logbuf_lock */ | ||
768 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
769 | |||
770 | /* | 1198 | /* |
771 | * Can we actually use the console at this time on this cpu? | 1199 | * Can we actually use the console at this time on this cpu? |
772 | * | 1200 | * |
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
810 | retval = 0; | 1238 | retval = 0; |
811 | } | 1239 | } |
812 | } | 1240 | } |
813 | printk_cpu = UINT_MAX; | 1241 | logbuf_cpu = UINT_MAX; |
814 | if (wake) | 1242 | if (wake) |
815 | up(&console_sem); | 1243 | up(&console_sem); |
816 | raw_spin_unlock(&logbuf_lock); | 1244 | raw_spin_unlock(&logbuf_lock); |
817 | return retval; | 1245 | return retval; |
818 | } | 1246 | } |
819 | static const char recursion_bug_msg [] = | ||
820 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
821 | static int recursion_bug; | ||
822 | static int new_text_line = 1; | ||
823 | static char printk_buf[1024]; | ||
824 | 1247 | ||
825 | int printk_delay_msec __read_mostly; | 1248 | int printk_delay_msec __read_mostly; |
826 | 1249 | ||
@@ -836,15 +1259,23 @@ static inline void printk_delay(void) | |||
836 | } | 1259 | } |
837 | } | 1260 | } |
838 | 1261 | ||
839 | asmlinkage int vprintk(const char *fmt, va_list args) | 1262 | asmlinkage int vprintk_emit(int facility, int level, |
1263 | const char *dict, size_t dictlen, | ||
1264 | const char *fmt, va_list args) | ||
840 | { | 1265 | { |
841 | int printed_len = 0; | 1266 | static int recursion_bug; |
842 | int current_log_level = default_message_loglevel; | 1267 | static char cont_buf[LOG_LINE_MAX]; |
1268 | static size_t cont_len; | ||
1269 | static int cont_level; | ||
1270 | static struct task_struct *cont_task; | ||
1271 | static char textbuf[LOG_LINE_MAX]; | ||
1272 | char *text = textbuf; | ||
1273 | size_t text_len; | ||
843 | unsigned long flags; | 1274 | unsigned long flags; |
844 | int this_cpu; | 1275 | int this_cpu; |
845 | char *p; | 1276 | bool newline = false; |
846 | size_t plen; | 1277 | bool prefix = false; |
847 | char special; | 1278 | int printed_len = 0; |
848 | 1279 | ||
849 | boot_delay_msec(); | 1280 | boot_delay_msec(); |
850 | printk_delay(); | 1281 | printk_delay(); |
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | /* | 1287 | /* |
857 | * Ouch, printk recursed into itself! | 1288 | * Ouch, printk recursed into itself! |
858 | */ | 1289 | */ |
859 | if (unlikely(printk_cpu == this_cpu)) { | 1290 | if (unlikely(logbuf_cpu == this_cpu)) { |
860 | /* | 1291 | /* |
861 | * If a crash is occurring during printk() on this CPU, | 1292 | * If a crash is occurring during printk() on this CPU, |
862 | * then try to get the crash message out but make sure | 1293 | * then try to get the crash message out but make sure |
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
873 | 1304 | ||
874 | lockdep_off(); | 1305 | lockdep_off(); |
875 | raw_spin_lock(&logbuf_lock); | 1306 | raw_spin_lock(&logbuf_lock); |
876 | printk_cpu = this_cpu; | 1307 | logbuf_cpu = this_cpu; |
877 | 1308 | ||
878 | if (recursion_bug) { | 1309 | if (recursion_bug) { |
1310 | static const char recursion_msg[] = | ||
1311 | "BUG: recent printk recursion!"; | ||
1312 | |||
879 | recursion_bug = 0; | 1313 | recursion_bug = 0; |
880 | strcpy(printk_buf, recursion_bug_msg); | 1314 | printed_len += strlen(recursion_msg); |
881 | printed_len = strlen(recursion_bug_msg); | 1315 | /* emit KERN_CRIT message */ |
1316 | log_store(0, 2, NULL, 0, recursion_msg, printed_len); | ||
882 | } | 1317 | } |
883 | /* Emit the output into the temporary buffer */ | ||
884 | printed_len += vscnprintf(printk_buf + printed_len, | ||
885 | sizeof(printk_buf) - printed_len, fmt, args); | ||
886 | 1318 | ||
887 | p = printk_buf; | 1319 | /* |
1320 | * The printf needs to come first; we need the syslog | ||
1321 | * prefix which might be passed-in as a parameter. | ||
1322 | */ | ||
1323 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | ||
888 | 1324 | ||
889 | /* Read log level and handle special printk prefix */ | 1325 | /* mark and strip a trailing newline */ |
890 | plen = log_prefix(p, &current_log_level, &special); | 1326 | if (text_len && text[text_len-1] == '\n') { |
891 | if (plen) { | 1327 | text_len--; |
892 | p += plen; | 1328 | newline = true; |
1329 | } | ||
893 | 1330 | ||
894 | switch (special) { | 1331 | /* strip syslog prefix and extract log level or control flags */ |
895 | case 'c': /* Strip <c> KERN_CONT, continue line */ | 1332 | if (text[0] == '<' && text[1] && text[2] == '>') { |
896 | plen = 0; | 1333 | switch (text[1]) { |
897 | break; | 1334 | case '0' ... '7': |
898 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ | 1335 | if (level == -1) |
899 | plen = 0; | 1336 | level = text[1] - '0'; |
900 | default: | 1337 | case 'd': /* KERN_DEFAULT */ |
901 | if (!new_text_line) { | 1338 | prefix = true; |
902 | emit_log_char('\n'); | 1339 | case 'c': /* KERN_CONT */ |
903 | new_text_line = 1; | 1340 | text += 3; |
904 | } | 1341 | text_len -= 3; |
905 | } | 1342 | } |
906 | } | 1343 | } |
907 | 1344 | ||
908 | /* | 1345 | if (level == -1) |
909 | * Copy the output into log_buf. If the caller didn't provide | 1346 | level = default_message_loglevel; |
910 | * the appropriate log prefix, we insert them here | ||
911 | */ | ||
912 | for (; *p; p++) { | ||
913 | if (new_text_line) { | ||
914 | new_text_line = 0; | ||
915 | |||
916 | if (plen) { | ||
917 | /* Copy original log prefix */ | ||
918 | int i; | ||
919 | |||
920 | for (i = 0; i < plen; i++) | ||
921 | emit_log_char(printk_buf[i]); | ||
922 | printed_len += plen; | ||
923 | } else { | ||
924 | /* Add log prefix */ | ||
925 | emit_log_char('<'); | ||
926 | emit_log_char(current_log_level + '0'); | ||
927 | emit_log_char('>'); | ||
928 | printed_len += 3; | ||
929 | } | ||
930 | 1347 | ||
931 | if (printk_time) { | 1348 | if (dict) { |
932 | /* Add the current time stamp */ | 1349 | prefix = true; |
933 | char tbuf[50], *tp; | 1350 | newline = true; |
934 | unsigned tlen; | 1351 | } |
935 | unsigned long long t; | ||
936 | unsigned long nanosec_rem; | ||
937 | |||
938 | t = cpu_clock(printk_cpu); | ||
939 | nanosec_rem = do_div(t, 1000000000); | ||
940 | tlen = sprintf(tbuf, "[%5lu.%06lu] ", | ||
941 | (unsigned long) t, | ||
942 | nanosec_rem / 1000); | ||
943 | |||
944 | for (tp = tbuf; tp < tbuf + tlen; tp++) | ||
945 | emit_log_char(*tp); | ||
946 | printed_len += tlen; | ||
947 | } | ||
948 | 1352 | ||
949 | if (!*p) | 1353 | if (!newline) { |
950 | break; | 1354 | if (cont_len && (prefix || cont_task != current)) { |
1355 | /* | ||
1356 | * Flush earlier buffer, which is either from a | ||
1357 | * different thread, or when we got a new prefix. | ||
1358 | */ | ||
1359 | log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); | ||
1360 | cont_len = 0; | ||
951 | } | 1361 | } |
952 | 1362 | ||
953 | emit_log_char(*p); | 1363 | if (!cont_len) { |
954 | if (*p == '\n') | 1364 | cont_level = level; |
955 | new_text_line = 1; | 1365 | cont_task = current; |
1366 | } | ||
1367 | |||
1368 | /* buffer or append to earlier buffer from the same thread */ | ||
1369 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1370 | text_len = sizeof(cont_buf) - cont_len; | ||
1371 | memcpy(cont_buf + cont_len, text, text_len); | ||
1372 | cont_len += text_len; | ||
1373 | } else { | ||
1374 | if (cont_len && cont_task == current) { | ||
1375 | if (prefix) { | ||
1376 | /* | ||
1377 | * New prefix from the same thread; flush. We | ||
1378 | * either got no earlier newline, or we race | ||
1379 | * with an interrupt. | ||
1380 | */ | ||
1381 | log_store(facility, cont_level, | ||
1382 | NULL, 0, cont_buf, cont_len); | ||
1383 | cont_len = 0; | ||
1384 | } | ||
1385 | |||
1386 | /* append to the earlier buffer and flush */ | ||
1387 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1388 | text_len = sizeof(cont_buf) - cont_len; | ||
1389 | memcpy(cont_buf + cont_len, text, text_len); | ||
1390 | cont_len += text_len; | ||
1391 | log_store(facility, cont_level, | ||
1392 | NULL, 0, cont_buf, cont_len); | ||
1393 | cont_len = 0; | ||
1394 | cont_task = NULL; | ||
1395 | printed_len = cont_len; | ||
1396 | } else { | ||
1397 | /* ordinary single and terminated line */ | ||
1398 | log_store(facility, level, | ||
1399 | dict, dictlen, text, text_len); | ||
1400 | printed_len = text_len; | ||
1401 | } | ||
956 | } | 1402 | } |
957 | 1403 | ||
958 | /* | 1404 | /* |
959 | * Try to acquire and then immediately release the | 1405 | * Try to acquire and then immediately release the console semaphore. |
960 | * console semaphore. The release will do all the | 1406 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
961 | * actual magic (print out buffers, wake up klogd, | 1407 | * users. |
962 | * etc). | ||
963 | * | 1408 | * |
964 | * The console_trylock_for_printk() function | 1409 | * The console_trylock_for_printk() function will release 'logbuf_lock' |
965 | * will release 'logbuf_lock' regardless of whether it | 1410 | * regardless of whether it actually gets the console semaphore or not. |
966 | * actually gets the semaphore or not. | ||
967 | */ | 1411 | */ |
968 | if (console_trylock_for_printk(this_cpu)) | 1412 | if (console_trylock_for_printk(this_cpu)) |
969 | console_unlock(); | 1413 | console_unlock(); |
@@ -974,16 +1418,81 @@ out_restore_irqs: | |||
974 | 1418 | ||
975 | return printed_len; | 1419 | return printed_len; |
976 | } | 1420 | } |
977 | EXPORT_SYMBOL(printk); | 1421 | EXPORT_SYMBOL(vprintk_emit); |
978 | EXPORT_SYMBOL(vprintk); | ||
979 | 1422 | ||
980 | #else | 1423 | asmlinkage int vprintk(const char *fmt, va_list args) |
1424 | { | ||
1425 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1426 | } | ||
1427 | EXPORT_SYMBOL(vprintk); | ||
981 | 1428 | ||
982 | static void call_console_drivers(unsigned start, unsigned end) | 1429 | asmlinkage int printk_emit(int facility, int level, |
1430 | const char *dict, size_t dictlen, | ||
1431 | const char *fmt, ...) | ||
983 | { | 1432 | { |
1433 | va_list args; | ||
1434 | int r; | ||
1435 | |||
1436 | va_start(args, fmt); | ||
1437 | r = vprintk_emit(facility, level, dict, dictlen, fmt, args); | ||
1438 | va_end(args); | ||
1439 | |||
1440 | return r; | ||
984 | } | 1441 | } |
1442 | EXPORT_SYMBOL(printk_emit); | ||
985 | 1443 | ||
1444 | /** | ||
1445 | * printk - print a kernel message | ||
1446 | * @fmt: format string | ||
1447 | * | ||
1448 | * This is printk(). It can be called from any context. We want it to work. | ||
1449 | * | ||
1450 | * We try to grab the console_lock. If we succeed, it's easy - we log the | ||
1451 | * output and call the console drivers. If we fail to get the semaphore, we | ||
1452 | * place the output into the log buffer and return. The current holder of | ||
1453 | * the console_sem will notice the new output in console_unlock(); and will | ||
1454 | * send it to the consoles before releasing the lock. | ||
1455 | * | ||
1456 | * One effect of this deferred printing is that code which calls printk() and | ||
1457 | * then changes console_loglevel may break. This is because console_loglevel | ||
1458 | * is inspected when the actual printing occurs. | ||
1459 | * | ||
1460 | * See also: | ||
1461 | * printf(3) | ||
1462 | * | ||
1463 | * See the vsnprintf() documentation for format string extensions over C99. | ||
1464 | */ | ||
1465 | asmlinkage int printk(const char *fmt, ...) | ||
1466 | { | ||
1467 | va_list args; | ||
1468 | int r; | ||
1469 | |||
1470 | #ifdef CONFIG_KGDB_KDB | ||
1471 | if (unlikely(kdb_trap_printk)) { | ||
1472 | va_start(args, fmt); | ||
1473 | r = vkdb_printf(fmt, args); | ||
1474 | va_end(args); | ||
1475 | return r; | ||
1476 | } | ||
986 | #endif | 1477 | #endif |
1478 | va_start(args, fmt); | ||
1479 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1480 | va_end(args); | ||
1481 | |||
1482 | return r; | ||
1483 | } | ||
1484 | EXPORT_SYMBOL(printk); | ||
1485 | |||
1486 | #else | ||
1487 | |||
1488 | #define LOG_LINE_MAX 0 | ||
1489 | static struct log *log_from_idx(u32 idx) { return NULL; } | ||
1490 | static u32 log_next(u32 idx) { return 0; } | ||
1491 | static void call_console_drivers(int level, const char *text, size_t len) {} | ||
1492 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
1493 | char *buf, size_t size) { return 0; } | ||
1494 | |||
1495 | #endif /* CONFIG_PRINTK */ | ||
987 | 1496 | ||
988 | static int __add_preferred_console(char *name, int idx, char *options, | 1497 | static int __add_preferred_console(char *name, int idx, char *options, |
989 | char *brl_options) | 1498 | char *brl_options) |
@@ -1217,7 +1726,7 @@ int is_console_locked(void) | |||
1217 | } | 1726 | } |
1218 | 1727 | ||
1219 | /* | 1728 | /* |
1220 | * Delayed printk facility, for scheduler-internal messages: | 1729 | * Delayed printk version, for scheduler-internal messages: |
1221 | */ | 1730 | */ |
1222 | #define PRINTK_BUF_SIZE 512 | 1731 | #define PRINTK_BUF_SIZE 512 |
1223 | 1732 | ||
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void) | |||
1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1762 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1254 | } | 1763 | } |
1255 | 1764 | ||
1765 | /* the next printk record to write to the console */ | ||
1766 | static u64 console_seq; | ||
1767 | static u32 console_idx; | ||
1768 | |||
1256 | /** | 1769 | /** |
1257 | * console_unlock - unlock the console system | 1770 | * console_unlock - unlock the console system |
1258 | * | 1771 | * |
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void) | |||
1263 | * by printk(). If this is the case, console_unlock(); emits | 1776 | * by printk(). If this is the case, console_unlock(); emits |
1264 | * the output prior to releasing the lock. | 1777 | * the output prior to releasing the lock. |
1265 | * | 1778 | * |
1266 | * If there is output waiting for klogd, we wake it up. | 1779 | * If there is output waiting, we wake /dev/kmsg and syslog() users. |
1267 | * | 1780 | * |
1268 | * console_unlock(); may be called from any context. | 1781 | * console_unlock(); may be called from any context. |
1269 | */ | 1782 | */ |
1270 | void console_unlock(void) | 1783 | void console_unlock(void) |
1271 | { | 1784 | { |
1785 | static u64 seen_seq; | ||
1272 | unsigned long flags; | 1786 | unsigned long flags; |
1273 | unsigned _con_start, _log_end; | 1787 | bool wake_klogd = false; |
1274 | unsigned wake_klogd = 0, retry = 0; | 1788 | bool retry; |
1275 | 1789 | ||
1276 | if (console_suspended) { | 1790 | if (console_suspended) { |
1277 | up(&console_sem); | 1791 | up(&console_sem); |
@@ -1281,17 +1795,38 @@ void console_unlock(void) | |||
1281 | console_may_schedule = 0; | 1795 | console_may_schedule = 0; |
1282 | 1796 | ||
1283 | again: | 1797 | again: |
1284 | for ( ; ; ) { | 1798 | for (;;) { |
1799 | struct log *msg; | ||
1800 | static char text[LOG_LINE_MAX]; | ||
1801 | size_t len; | ||
1802 | int level; | ||
1803 | |||
1285 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 1804 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1286 | wake_klogd |= log_start - log_end; | 1805 | if (seen_seq != log_next_seq) { |
1287 | if (con_start == log_end) | 1806 | wake_klogd = true; |
1288 | break; /* Nothing to print */ | 1807 | seen_seq = log_next_seq; |
1289 | _con_start = con_start; | 1808 | } |
1290 | _log_end = log_end; | 1809 | |
1291 | con_start = log_end; /* Flush */ | 1810 | if (console_seq < log_first_seq) { |
1811 | /* messages are gone, move to first one */ | ||
1812 | console_seq = log_first_seq; | ||
1813 | console_idx = log_first_idx; | ||
1814 | } | ||
1815 | |||
1816 | if (console_seq == log_next_seq) | ||
1817 | break; | ||
1818 | |||
1819 | msg = log_from_idx(console_idx); | ||
1820 | level = msg->level & 7; | ||
1821 | |||
1822 | len = msg_print_text(msg, false, text, sizeof(text)); | ||
1823 | |||
1824 | console_idx = log_next(console_idx); | ||
1825 | console_seq++; | ||
1292 | raw_spin_unlock(&logbuf_lock); | 1826 | raw_spin_unlock(&logbuf_lock); |
1827 | |||
1293 | stop_critical_timings(); /* don't trace print latency */ | 1828 | stop_critical_timings(); /* don't trace print latency */ |
1294 | call_console_drivers(_con_start, _log_end); | 1829 | call_console_drivers(level, text, len); |
1295 | start_critical_timings(); | 1830 | start_critical_timings(); |
1296 | local_irq_restore(flags); | 1831 | local_irq_restore(flags); |
1297 | } | 1832 | } |
@@ -1312,8 +1847,7 @@ again: | |||
1312 | * flush, no worries. | 1847 | * flush, no worries. |
1313 | */ | 1848 | */ |
1314 | raw_spin_lock(&logbuf_lock); | 1849 | raw_spin_lock(&logbuf_lock); |
1315 | if (con_start != log_end) | 1850 | retry = console_seq != log_next_seq; |
1316 | retry = 1; | ||
1317 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 1851 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1318 | 1852 | ||
1319 | if (retry && console_trylock()) | 1853 | if (retry && console_trylock()) |
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon) | |||
1549 | * for us. | 2083 | * for us. |
1550 | */ | 2084 | */ |
1551 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2085 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1552 | con_start = log_start; | 2086 | console_seq = syslog_seq; |
2087 | console_idx = syslog_idx; | ||
1553 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2088 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1554 | /* | 2089 | /* |
1555 | * We're about to replay the log buffer. Only do this to the | 2090 | * We're about to replay the log buffer. Only do this to the |
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1758 | } | 2293 | } |
1759 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 2294 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1760 | 2295 | ||
2296 | static bool always_kmsg_dump; | ||
2297 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
2298 | |||
1761 | /** | 2299 | /** |
1762 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2300 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1763 | * @reason: the reason (oops, panic etc) for dumping | 2301 | * @reason: the reason (oops, panic etc) for dumping |
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | |||
1767 | */ | 2305 | */ |
1768 | void kmsg_dump(enum kmsg_dump_reason reason) | 2306 | void kmsg_dump(enum kmsg_dump_reason reason) |
1769 | { | 2307 | { |
1770 | unsigned long end; | 2308 | u64 idx; |
1771 | unsigned chars; | ||
1772 | struct kmsg_dumper *dumper; | 2309 | struct kmsg_dumper *dumper; |
1773 | const char *s1, *s2; | 2310 | const char *s1, *s2; |
1774 | unsigned long l1, l2; | 2311 | unsigned long l1, l2; |
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1780 | /* Theoretically, the log could move on after we do this, but | 2317 | /* Theoretically, the log could move on after we do this, but |
1781 | there's not a lot we can do about that. The new messages | 2318 | there's not a lot we can do about that. The new messages |
1782 | will overwrite the start of what we dump. */ | 2319 | will overwrite the start of what we dump. */ |
2320 | |||
1783 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2321 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1784 | end = log_end & LOG_BUF_MASK; | 2322 | if (syslog_seq < log_first_seq) |
1785 | chars = logged_chars; | 2323 | idx = syslog_idx; |
1786 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2324 | else |
2325 | idx = log_first_idx; | ||
1787 | 2326 | ||
1788 | if (chars > end) { | 2327 | if (idx > log_next_idx) { |
1789 | s1 = log_buf + log_buf_len - chars + end; | 2328 | s1 = log_buf; |
1790 | l1 = chars - end; | 2329 | l1 = log_next_idx; |
1791 | 2330 | ||
1792 | s2 = log_buf; | 2331 | s2 = log_buf + idx; |
1793 | l2 = end; | 2332 | l2 = log_buf_len - idx; |
1794 | } else { | 2333 | } else { |
1795 | s1 = ""; | 2334 | s1 = ""; |
1796 | l1 = 0; | 2335 | l1 = 0; |
1797 | 2336 | ||
1798 | s2 = log_buf + end - chars; | 2337 | s2 = log_buf + idx; |
1799 | l2 = chars; | 2338 | l2 = log_next_idx - idx; |
1800 | } | 2339 | } |
2340 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1801 | 2341 | ||
1802 | rcu_read_lock(); | 2342 | rcu_read_lock(); |
1803 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2343 | list_for_each_entry_rcu(dumper, &dump_list, list) |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc27..95cba41ce1e9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -51,6 +51,34 @@ | |||
51 | 51 | ||
52 | #include "rcu.h" | 52 | #include "rcu.h" |
53 | 53 | ||
54 | #ifdef CONFIG_PREEMPT_RCU | ||
55 | |||
56 | /* | ||
57 | * Check for a task exiting while in a preemptible-RCU read-side | ||
58 | * critical section, clean up if so. No need to issue warnings, | ||
59 | * as debug_check_no_locks_held() already does this if lockdep | ||
60 | * is enabled. | ||
61 | */ | ||
62 | void exit_rcu(void) | ||
63 | { | ||
64 | struct task_struct *t = current; | ||
65 | |||
66 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
67 | return; | ||
68 | t->rcu_read_lock_nesting = 1; | ||
69 | barrier(); | ||
70 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
71 | __rcu_read_unlock(); | ||
72 | } | ||
73 | |||
74 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
75 | |||
/* Stub used when CONFIG_PREEMPT_RCU is not set: intentionally empty. */
void exit_rcu(void)
{
}
79 | |||
80 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
81 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 82 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | static struct lock_class_key rcu_lock_key; | 83 | static struct lock_class_key rcu_lock_key; |
56 | struct lockdep_map rcu_lock_map = | 84 | struct lockdep_map rcu_lock_map = |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb62..fc31a2d65100 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) | |||
851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | ||
855 | * Check for a task exiting while in a preemptible -RCU read-side | ||
856 | * critical section, clean up if so. No need to issue warnings, | ||
857 | * as debug_check_no_locks_held() already does this if lockdep | ||
858 | * is enabled. | ||
859 | */ | ||
860 | void exit_rcu(void) | ||
861 | { | ||
862 | struct task_struct *t = current; | ||
863 | |||
864 | if (t->rcu_read_lock_nesting == 0) | ||
865 | return; | ||
866 | t->rcu_read_lock_nesting = 1; | ||
867 | __rcu_read_unlock(); | ||
868 | } | ||
869 | |||
870 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 854 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
871 | 855 | ||
872 | #ifdef CONFIG_RCU_TRACE | 856 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6e..e66b34ab7555 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | 68 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ |
68 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | 69 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ |
69 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | 70 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ |
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); | |||
96 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 97 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
97 | module_param(fqs_stutter, int, 0444); | 98 | module_param(fqs_stutter, int, 0444); |
98 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 99 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
100 | module_param(n_barrier_cbs, int, 0444); | ||
101 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | ||
99 | module_param(onoff_interval, int, 0444); | 102 | module_param(onoff_interval, int, 0444); |
100 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 103 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); |
101 | module_param(onoff_holdoff, int, 0444); | 104 | module_param(onoff_holdoff, int, 0444); |
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; | |||
139 | static struct task_struct *onoff_task; | 142 | static struct task_struct *onoff_task; |
140 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 143 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
141 | static struct task_struct *stall_task; | 144 | static struct task_struct *stall_task; |
145 | static struct task_struct **barrier_cbs_tasks; | ||
146 | static struct task_struct *barrier_task; | ||
142 | 147 | ||
143 | #define RCU_TORTURE_PIPE_LEN 10 | 148 | #define RCU_TORTURE_PIPE_LEN 10 |
144 | 149 | ||
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
164 | static atomic_t n_rcu_torture_free; | 169 | static atomic_t n_rcu_torture_free; |
165 | static atomic_t n_rcu_torture_mberror; | 170 | static atomic_t n_rcu_torture_mberror; |
166 | static atomic_t n_rcu_torture_error; | 171 | static atomic_t n_rcu_torture_error; |
172 | static long n_rcu_torture_barrier_error; | ||
167 | static long n_rcu_torture_boost_ktrerror; | 173 | static long n_rcu_torture_boost_ktrerror; |
168 | static long n_rcu_torture_boost_rterror; | 174 | static long n_rcu_torture_boost_rterror; |
169 | static long n_rcu_torture_boost_failure; | 175 | static long n_rcu_torture_boost_failure; |
@@ -173,6 +179,8 @@ static long n_offline_attempts; | |||
173 | static long n_offline_successes; | 179 | static long n_offline_successes; |
174 | static long n_online_attempts; | 180 | static long n_online_attempts; |
175 | static long n_online_successes; | 181 | static long n_online_successes; |
182 | static long n_barrier_attempts; | ||
183 | static long n_barrier_successes; | ||
176 | static struct list_head rcu_torture_removed; | 184 | static struct list_head rcu_torture_removed; |
177 | static cpumask_var_t shuffle_tmp_mask; | 185 | static cpumask_var_t shuffle_tmp_mask; |
178 | 186 | ||
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ | |||
197 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 205 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
198 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
199 | /* and boost task create/destroy. */ | 207 | /* and boost task create/destroy. */ |
208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | ||
209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | ||
210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | ||
211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | ||
200 | 212 | ||
201 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 213 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
202 | 214 | ||
@@ -327,6 +339,7 @@ struct rcu_torture_ops { | |||
327 | int (*completed)(void); | 339 | int (*completed)(void); |
328 | void (*deferred_free)(struct rcu_torture *p); | 340 | void (*deferred_free)(struct rcu_torture *p); |
329 | void (*sync)(void); | 341 | void (*sync)(void); |
342 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
330 | void (*cb_barrier)(void); | 343 | void (*cb_barrier)(void); |
331 | void (*fqs)(void); | 344 | void (*fqs)(void); |
332 | int (*stats)(char *page); | 345 | int (*stats)(char *page); |
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
417 | .completed = rcu_torture_completed, | 430 | .completed = rcu_torture_completed, |
418 | .deferred_free = rcu_torture_deferred_free, | 431 | .deferred_free = rcu_torture_deferred_free, |
419 | .sync = synchronize_rcu, | 432 | .sync = synchronize_rcu, |
433 | .call = call_rcu, | ||
420 | .cb_barrier = rcu_barrier, | 434 | .cb_barrier = rcu_barrier, |
421 | .fqs = rcu_force_quiescent_state, | 435 | .fqs = rcu_force_quiescent_state, |
422 | .stats = NULL, | 436 | .stats = NULL, |
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
460 | .completed = rcu_torture_completed, | 474 | .completed = rcu_torture_completed, |
461 | .deferred_free = rcu_sync_torture_deferred_free, | 475 | .deferred_free = rcu_sync_torture_deferred_free, |
462 | .sync = synchronize_rcu, | 476 | .sync = synchronize_rcu, |
477 | .call = NULL, | ||
463 | .cb_barrier = NULL, | 478 | .cb_barrier = NULL, |
464 | .fqs = rcu_force_quiescent_state, | 479 | .fqs = rcu_force_quiescent_state, |
465 | .stats = NULL, | 480 | .stats = NULL, |
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
477 | .completed = rcu_no_completed, | 492 | .completed = rcu_no_completed, |
478 | .deferred_free = rcu_sync_torture_deferred_free, | 493 | .deferred_free = rcu_sync_torture_deferred_free, |
479 | .sync = synchronize_rcu_expedited, | 494 | .sync = synchronize_rcu_expedited, |
495 | .call = NULL, | ||
480 | .cb_barrier = NULL, | 496 | .cb_barrier = NULL, |
481 | .fqs = rcu_force_quiescent_state, | 497 | .fqs = rcu_force_quiescent_state, |
482 | .stats = NULL, | 498 | .stats = NULL, |
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
519 | .completed = rcu_bh_torture_completed, | 535 | .completed = rcu_bh_torture_completed, |
520 | .deferred_free = rcu_bh_torture_deferred_free, | 536 | .deferred_free = rcu_bh_torture_deferred_free, |
521 | .sync = synchronize_rcu_bh, | 537 | .sync = synchronize_rcu_bh, |
538 | .call = call_rcu_bh, | ||
522 | .cb_barrier = rcu_barrier_bh, | 539 | .cb_barrier = rcu_barrier_bh, |
523 | .fqs = rcu_bh_force_quiescent_state, | 540 | .fqs = rcu_bh_force_quiescent_state, |
524 | .stats = NULL, | 541 | .stats = NULL, |
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
535 | .completed = rcu_bh_torture_completed, | 552 | .completed = rcu_bh_torture_completed, |
536 | .deferred_free = rcu_sync_torture_deferred_free, | 553 | .deferred_free = rcu_sync_torture_deferred_free, |
537 | .sync = synchronize_rcu_bh, | 554 | .sync = synchronize_rcu_bh, |
555 | .call = NULL, | ||
538 | .cb_barrier = NULL, | 556 | .cb_barrier = NULL, |
539 | .fqs = rcu_bh_force_quiescent_state, | 557 | .fqs = rcu_bh_force_quiescent_state, |
540 | .stats = NULL, | 558 | .stats = NULL, |
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
551 | .completed = rcu_bh_torture_completed, | 569 | .completed = rcu_bh_torture_completed, |
552 | .deferred_free = rcu_sync_torture_deferred_free, | 570 | .deferred_free = rcu_sync_torture_deferred_free, |
553 | .sync = synchronize_rcu_bh_expedited, | 571 | .sync = synchronize_rcu_bh_expedited, |
572 | .call = NULL, | ||
554 | .cb_barrier = NULL, | 573 | .cb_barrier = NULL, |
555 | .fqs = rcu_bh_force_quiescent_state, | 574 | .fqs = rcu_bh_force_quiescent_state, |
556 | .stats = NULL, | 575 | .stats = NULL, |
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void) | |||
606 | return srcu_batches_completed(&srcu_ctl); | 625 | return srcu_batches_completed(&srcu_ctl); |
607 | } | 626 | } |
608 | 627 | ||
628 | static void srcu_torture_deferred_free(struct rcu_torture *rp) | ||
629 | { | ||
630 | call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); | ||
631 | } | ||
632 | |||
609 | static void srcu_torture_synchronize(void) | 633 | static void srcu_torture_synchronize(void) |
610 | { | 634 | { |
611 | synchronize_srcu(&srcu_ctl); | 635 | synchronize_srcu(&srcu_ctl); |
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page) | |||
620 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 644 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", |
621 | torture_type, TORTURE_FLAG, idx); | 645 | torture_type, TORTURE_FLAG, idx); |
622 | for_each_possible_cpu(cpu) { | 646 | for_each_possible_cpu(cpu) { |
623 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, | 647 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, |
624 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 648 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
625 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 649 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
626 | } | 650 | } |
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = { | |||
635 | .read_delay = srcu_read_delay, | 659 | .read_delay = srcu_read_delay, |
636 | .readunlock = srcu_torture_read_unlock, | 660 | .readunlock = srcu_torture_read_unlock, |
637 | .completed = srcu_torture_completed, | 661 | .completed = srcu_torture_completed, |
638 | .deferred_free = rcu_sync_torture_deferred_free, | 662 | .deferred_free = srcu_torture_deferred_free, |
639 | .sync = srcu_torture_synchronize, | 663 | .sync = srcu_torture_synchronize, |
664 | .call = NULL, | ||
640 | .cb_barrier = NULL, | 665 | .cb_barrier = NULL, |
641 | .stats = srcu_torture_stats, | 666 | .stats = srcu_torture_stats, |
642 | .name = "srcu" | 667 | .name = "srcu" |
643 | }; | 668 | }; |
644 | 669 | ||
670 | static struct rcu_torture_ops srcu_sync_ops = { | ||
671 | .init = srcu_torture_init, | ||
672 | .cleanup = srcu_torture_cleanup, | ||
673 | .readlock = srcu_torture_read_lock, | ||
674 | .read_delay = srcu_read_delay, | ||
675 | .readunlock = srcu_torture_read_unlock, | ||
676 | .completed = srcu_torture_completed, | ||
677 | .deferred_free = rcu_sync_torture_deferred_free, | ||
678 | .sync = srcu_torture_synchronize, | ||
679 | .call = NULL, | ||
680 | .cb_barrier = NULL, | ||
681 | .stats = srcu_torture_stats, | ||
682 | .name = "srcu_sync" | ||
683 | }; | ||
684 | |||
645 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | 685 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) |
646 | { | 686 | { |
647 | return srcu_read_lock_raw(&srcu_ctl); | 687 | return srcu_read_lock_raw(&srcu_ctl); |
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
659 | .read_delay = srcu_read_delay, | 699 | .read_delay = srcu_read_delay, |
660 | .readunlock = srcu_torture_read_unlock_raw, | 700 | .readunlock = srcu_torture_read_unlock_raw, |
661 | .completed = srcu_torture_completed, | 701 | .completed = srcu_torture_completed, |
662 | .deferred_free = rcu_sync_torture_deferred_free, | 702 | .deferred_free = srcu_torture_deferred_free, |
663 | .sync = srcu_torture_synchronize, | 703 | .sync = srcu_torture_synchronize, |
704 | .call = NULL, | ||
664 | .cb_barrier = NULL, | 705 | .cb_barrier = NULL, |
665 | .stats = srcu_torture_stats, | 706 | .stats = srcu_torture_stats, |
666 | .name = "srcu_raw" | 707 | .name = "srcu_raw" |
667 | }; | 708 | }; |
668 | 709 | ||
710 | static struct rcu_torture_ops srcu_raw_sync_ops = { | ||
711 | .init = srcu_torture_init, | ||
712 | .cleanup = srcu_torture_cleanup, | ||
713 | .readlock = srcu_torture_read_lock_raw, | ||
714 | .read_delay = srcu_read_delay, | ||
715 | .readunlock = srcu_torture_read_unlock_raw, | ||
716 | .completed = srcu_torture_completed, | ||
717 | .deferred_free = rcu_sync_torture_deferred_free, | ||
718 | .sync = srcu_torture_synchronize, | ||
719 | .call = NULL, | ||
720 | .cb_barrier = NULL, | ||
721 | .stats = srcu_torture_stats, | ||
722 | .name = "srcu_raw_sync" | ||
723 | }; | ||
724 | |||
669 | static void srcu_torture_synchronize_expedited(void) | 725 | static void srcu_torture_synchronize_expedited(void) |
670 | { | 726 | { |
671 | synchronize_srcu_expedited(&srcu_ctl); | 727 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { | |||
680 | .completed = srcu_torture_completed, | 736 | .completed = srcu_torture_completed, |
681 | .deferred_free = rcu_sync_torture_deferred_free, | 737 | .deferred_free = rcu_sync_torture_deferred_free, |
682 | .sync = srcu_torture_synchronize_expedited, | 738 | .sync = srcu_torture_synchronize_expedited, |
739 | .call = NULL, | ||
683 | .cb_barrier = NULL, | 740 | .cb_barrier = NULL, |
684 | .stats = srcu_torture_stats, | 741 | .stats = srcu_torture_stats, |
685 | .name = "srcu_expedited" | 742 | .name = "srcu_expedited" |
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page) | |||
1129 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1130 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1131 | "rtbf: %ld rtb: %ld nt: %ld " | 1188 | "rtbf: %ld rtb: %ld nt: %ld " |
1132 | "onoff: %ld/%ld:%ld/%ld", | 1189 | "onoff: %ld/%ld:%ld/%ld " |
1190 | "barrier: %ld/%ld:%ld", | ||
1133 | rcu_torture_current, | 1191 | rcu_torture_current, |
1134 | rcu_torture_current_version, | 1192 | rcu_torture_current_version, |
1135 | list_empty(&rcu_torture_freelist), | 1193 | list_empty(&rcu_torture_freelist), |
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page) | |||
1145 | n_online_successes, | 1203 | n_online_successes, |
1146 | n_online_attempts, | 1204 | n_online_attempts, |
1147 | n_offline_successes, | 1205 | n_offline_successes, |
1148 | n_offline_attempts); | 1206 | n_offline_attempts, |
1207 | n_barrier_successes, | ||
1208 | n_barrier_attempts, | ||
1209 | n_rcu_torture_barrier_error); | ||
1210 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1149 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1211 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1212 | n_rcu_torture_barrier_error != 0 || | ||
1150 | n_rcu_torture_boost_ktrerror != 0 || | 1213 | n_rcu_torture_boost_ktrerror != 0 || |
1151 | n_rcu_torture_boost_rterror != 0 || | 1214 | n_rcu_torture_boost_rterror != 0 || |
1152 | n_rcu_torture_boost_failure != 0) | 1215 | n_rcu_torture_boost_failure != 0 || |
1153 | cnt += sprintf(&page[cnt], " !!!"); | 1216 | i > 1) { |
1154 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1155 | if (i > 1) { | ||
1156 | cnt += sprintf(&page[cnt], "!!! "); | 1217 | cnt += sprintf(&page[cnt], "!!! "); |
1157 | atomic_inc(&n_rcu_torture_error); | 1218 | atomic_inc(&n_rcu_torture_error); |
1158 | WARN_ON_ONCE(1); | 1219 | WARN_ON_ONCE(1); |
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu) | |||
1337 | 1398 | ||
1338 | /* This must be outside of the mutex, otherwise deadlock! */ | 1399 | /* This must be outside of the mutex, otherwise deadlock! */ |
1339 | kthread_stop(t); | 1400 | kthread_stop(t); |
1401 | boost_tasks[cpu] = NULL; | ||
1340 | } | 1402 | } |
1341 | 1403 | ||
1342 | static int rcutorture_booster_init(int cpu) | 1404 | static int rcutorture_booster_init(int cpu) |
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void) | |||
1484 | return; | 1546 | return; |
1485 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | 1547 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); |
1486 | kthread_stop(onoff_task); | 1548 | kthread_stop(onoff_task); |
1549 | onoff_task = NULL; | ||
1487 | } | 1550 | } |
1488 | 1551 | ||
1489 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1552 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1490 | 1553 | ||
1491 | static void | 1554 | static int |
1492 | rcu_torture_onoff_init(void) | 1555 | rcu_torture_onoff_init(void) |
1493 | { | 1556 | { |
1557 | return 0; | ||
1494 | } | 1558 | } |
1495 | 1559 | ||
1496 | static void rcu_torture_onoff_cleanup(void) | 1560 | static void rcu_torture_onoff_cleanup(void) |
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void) | |||
1554 | return; | 1618 | return; |
1555 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | 1619 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); |
1556 | kthread_stop(stall_task); | 1620 | kthread_stop(stall_task); |
1621 | stall_task = NULL; | ||
1622 | } | ||
1623 | |||
1624 | /* Callback function for RCU barrier testing. */ | ||
1625 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | ||
1626 | { | ||
1627 | atomic_inc(&barrier_cbs_invoked); | ||
1628 | } | ||
1629 | |||
1630 | /* kthread function to register callbacks used to test RCU barriers. */ | ||
1631 | static int rcu_torture_barrier_cbs(void *arg) | ||
1632 | { | ||
1633 | long myid = (long)arg; | ||
1634 | struct rcu_head rcu; | ||
1635 | |||
1636 | init_rcu_head_on_stack(&rcu); | ||
1637 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); | ||
1638 | set_user_nice(current, 19); | ||
1639 | do { | ||
1640 | wait_event(barrier_cbs_wq[myid], | ||
1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | ||
1642 | kthread_should_stop() || | ||
1643 | fullstop != FULLSTOP_DONTSTOP); | ||
1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1645 | break; | ||
1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | ||
1647 | if (atomic_dec_and_test(&barrier_cbs_count)) | ||
1648 | wake_up(&barrier_wq); | ||
1649 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1650 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); | ||
1651 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1652 | while (!kthread_should_stop()) | ||
1653 | schedule_timeout_interruptible(1); | ||
1654 | cur_ops->cb_barrier(); | ||
1655 | destroy_rcu_head_on_stack(&rcu); | ||
1656 | return 0; | ||
1657 | } | ||
1658 | |||
1659 | /* kthread function to drive and coordinate RCU barrier testing. */ | ||
1660 | static int rcu_torture_barrier(void *arg) | ||
1661 | { | ||
1662 | int i; | ||
1663 | |||
1664 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); | ||
1665 | do { | ||
1666 | atomic_set(&barrier_cbs_invoked, 0); | ||
1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | ||
1668 | /* wake_up() path contains the required barriers. */ | ||
1669 | for (i = 0; i < n_barrier_cbs; i++) | ||
1670 | wake_up(&barrier_cbs_wq[i]); | ||
1671 | wait_event(barrier_wq, | ||
1672 | atomic_read(&barrier_cbs_count) == 0 || | ||
1673 | kthread_should_stop() || | ||
1674 | fullstop != FULLSTOP_DONTSTOP); | ||
1675 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1676 | break; | ||
1677 | n_barrier_attempts++; | ||
1678 | cur_ops->cb_barrier(); | ||
1679 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | ||
1680 | n_rcu_torture_barrier_error++; | ||
1681 | WARN_ON_ONCE(1); | ||
1682 | } | ||
1683 | n_barrier_successes++; | ||
1684 | schedule_timeout_interruptible(HZ / 10); | ||
1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | ||
1687 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1688 | while (!kthread_should_stop()) | ||
1689 | schedule_timeout_interruptible(1); | ||
1690 | return 0; | ||
1691 | } | ||
1692 | |||
1693 | /* Initialize RCU barrier testing. */ | ||
1694 | static int rcu_torture_barrier_init(void) | ||
1695 | { | ||
1696 | int i; | ||
1697 | int ret; | ||
1698 | |||
1699 | if (n_barrier_cbs == 0) | ||
1700 | return 0; | ||
1701 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | ||
1702 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1703 | " Call or barrier ops missing for %s,\n", | ||
1704 | torture_type, cur_ops->name); | ||
1705 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1706 | " RCU barrier testing omitted from run.\n", | ||
1707 | torture_type); | ||
1708 | return 0; | ||
1709 | } | ||
1710 | atomic_set(&barrier_cbs_count, 0); | ||
1711 | atomic_set(&barrier_cbs_invoked, 0); | ||
1712 | barrier_cbs_tasks = | ||
1713 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | ||
1714 | GFP_KERNEL); | ||
1715 | barrier_cbs_wq = | ||
1716 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | ||
1717 | GFP_KERNEL); | ||
1718 | if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) | ||
1719 | return -ENOMEM; | ||
1720 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1721 | init_waitqueue_head(&barrier_cbs_wq[i]); | ||
1722 | barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, | ||
1723 | (void *)(long)i, | ||
1724 | "rcu_torture_barrier_cbs"); | ||
1725 | if (IS_ERR(barrier_cbs_tasks[i])) { | ||
1726 | ret = PTR_ERR(barrier_cbs_tasks[i]); | ||
1727 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); | ||
1728 | barrier_cbs_tasks[i] = NULL; | ||
1729 | return ret; | ||
1730 | } | ||
1731 | } | ||
1732 | barrier_task = kthread_run(rcu_torture_barrier, NULL, | ||
1733 | "rcu_torture_barrier"); | ||
1734 | if (IS_ERR(barrier_task)) { | ||
1735 | ret = PTR_ERR(barrier_task); | ||
1736 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); | ||
1737 | barrier_task = NULL; | ||
1738 | } | ||
1739 | return 0; | ||
1740 | } | ||
1741 | |||
1742 | /* Clean up after RCU barrier testing. */ | ||
1743 | static void rcu_torture_barrier_cleanup(void) | ||
1744 | { | ||
1745 | int i; | ||
1746 | |||
1747 | if (barrier_task != NULL) { | ||
1748 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); | ||
1749 | kthread_stop(barrier_task); | ||
1750 | barrier_task = NULL; | ||
1751 | } | ||
1752 | if (barrier_cbs_tasks != NULL) { | ||
1753 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1754 | if (barrier_cbs_tasks[i] != NULL) { | ||
1755 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); | ||
1756 | kthread_stop(barrier_cbs_tasks[i]); | ||
1757 | barrier_cbs_tasks[i] = NULL; | ||
1758 | } | ||
1759 | } | ||
1760 | kfree(barrier_cbs_tasks); | ||
1761 | barrier_cbs_tasks = NULL; | ||
1762 | } | ||
1763 | if (barrier_cbs_wq != NULL) { | ||
1764 | kfree(barrier_cbs_wq); | ||
1765 | barrier_cbs_wq = NULL; | ||
1766 | } | ||
1557 | } | 1767 | } |
1558 | 1768 | ||
1559 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1769 | static int rcutorture_cpu_notify(struct notifier_block *self, |
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void) | |||
1598 | fullstop = FULLSTOP_RMMOD; | 1808 | fullstop = FULLSTOP_RMMOD; |
1599 | mutex_unlock(&fullstop_mutex); | 1809 | mutex_unlock(&fullstop_mutex); |
1600 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1810 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1811 | rcu_torture_barrier_cleanup(); | ||
1601 | rcu_torture_stall_cleanup(); | 1812 | rcu_torture_stall_cleanup(); |
1602 | if (stutter_task) { | 1813 | if (stutter_task) { |
1603 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1814 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void) | |||
1665 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | 1876 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); |
1666 | kthread_stop(shutdown_task); | 1877 | kthread_stop(shutdown_task); |
1667 | } | 1878 | } |
1879 | shutdown_task = NULL; | ||
1668 | rcu_torture_onoff_cleanup(); | 1880 | rcu_torture_onoff_cleanup(); |
1669 | 1881 | ||
1670 | /* Wait for all RCU callbacks to fire. */ | 1882 | /* Wait for all RCU callbacks to fire. */ |
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void) | |||
1676 | 1888 | ||
1677 | if (cur_ops->cleanup) | 1889 | if (cur_ops->cleanup) |
1678 | cur_ops->cleanup(); | 1890 | cur_ops->cleanup(); |
1679 | if (atomic_read(&n_rcu_torture_error)) | 1891 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1680 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1892 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1681 | else if (n_online_successes != n_online_attempts || | 1893 | else if (n_online_successes != n_online_attempts || |
1682 | n_offline_successes != n_offline_attempts) | 1894 | n_offline_successes != n_offline_attempts) |
@@ -1692,10 +1904,12 @@ rcu_torture_init(void) | |||
1692 | int i; | 1904 | int i; |
1693 | int cpu; | 1905 | int cpu; |
1694 | int firsterr = 0; | 1906 | int firsterr = 0; |
1907 | int retval; | ||
1695 | static struct rcu_torture_ops *torture_ops[] = | 1908 | static struct rcu_torture_ops *torture_ops[] = |
1696 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1697 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1698 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, | 1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, |
1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | ||
1699 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1700 | 1914 | ||
1701 | mutex_lock(&fullstop_mutex); | 1915 | mutex_lock(&fullstop_mutex); |
@@ -1749,6 +1963,7 @@ rcu_torture_init(void) | |||
1749 | atomic_set(&n_rcu_torture_free, 0); | 1963 | atomic_set(&n_rcu_torture_free, 0); |
1750 | atomic_set(&n_rcu_torture_mberror, 0); | 1964 | atomic_set(&n_rcu_torture_mberror, 0); |
1751 | atomic_set(&n_rcu_torture_error, 0); | 1965 | atomic_set(&n_rcu_torture_error, 0); |
1966 | n_rcu_torture_barrier_error = 0; | ||
1752 | n_rcu_torture_boost_ktrerror = 0; | 1967 | n_rcu_torture_boost_ktrerror = 0; |
1753 | n_rcu_torture_boost_rterror = 0; | 1968 | n_rcu_torture_boost_rterror = 0; |
1754 | n_rcu_torture_boost_failure = 0; | 1969 | n_rcu_torture_boost_failure = 0; |
@@ -1872,7 +2087,6 @@ rcu_torture_init(void) | |||
1872 | test_boost_duration = 2; | 2087 | test_boost_duration = 2; |
1873 | if ((test_boost == 1 && cur_ops->can_boost) || | 2088 | if ((test_boost == 1 && cur_ops->can_boost) || |
1874 | test_boost == 2) { | 2089 | test_boost == 2) { |
1875 | int retval; | ||
1876 | 2090 | ||
1877 | boost_starttime = jiffies + test_boost_interval * HZ; | 2091 | boost_starttime = jiffies + test_boost_interval * HZ; |
1878 | register_cpu_notifier(&rcutorture_cpu_nb); | 2092 | register_cpu_notifier(&rcutorture_cpu_nb); |
@@ -1897,9 +2111,22 @@ rcu_torture_init(void) | |||
1897 | goto unwind; | 2111 | goto unwind; |
1898 | } | 2112 | } |
1899 | } | 2113 | } |
1900 | rcu_torture_onoff_init(); | 2114 | i = rcu_torture_onoff_init(); |
2115 | if (i != 0) { | ||
2116 | firsterr = i; | ||
2117 | goto unwind; | ||
2118 | } | ||
1901 | register_reboot_notifier(&rcutorture_shutdown_nb); | 2119 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1902 | rcu_torture_stall_init(); | 2120 | i = rcu_torture_stall_init(); |
2121 | if (i != 0) { | ||
2122 | firsterr = i; | ||
2123 | goto unwind; | ||
2124 | } | ||
2125 | retval = rcu_torture_barrier_init(); | ||
2126 | if (retval != 0) { | ||
2127 | firsterr = retval; | ||
2128 | goto unwind; | ||
2129 | } | ||
1903 | rcutorture_record_test_transition(); | 2130 | rcutorture_record_test_transition(); |
1904 | mutex_unlock(&fullstop_mutex); | 2131 | mutex_unlock(&fullstop_mutex); |
1905 | return 0; | 2132 | return 0; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1050d6d3922c..0da7b88d92d0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
75 | .gpnum = -300, \ | 75 | .gpnum = -300, \ |
76 | .completed = -300, \ | 76 | .completed = -300, \ |
77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | ||
79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | ||
78 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
79 | .n_force_qs = 0, \ | 81 | .n_force_qs = 0, \ |
80 | .n_force_qs_ngp = 0, \ | 82 | .n_force_qs_ngp = 0, \ |
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
145 | unsigned long rcutorture_testseq; | 147 | unsigned long rcutorture_testseq; |
146 | unsigned long rcutorture_vernum; | 148 | unsigned long rcutorture_vernum; |
147 | 149 | ||
150 | /* State information for rcu_barrier() and friends. */ | ||
151 | |||
152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
153 | static atomic_t rcu_barrier_cpu_count; | ||
154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
155 | static struct completion rcu_barrier_completion; | ||
156 | |||
148 | /* | 157 | /* |
149 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
150 | * permit this function to be invoked without holding the root rcu_node | 159 | * permit this function to be invoked without holding the root rcu_node |
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu) | |||
192 | { | 201 | { |
193 | trace_rcu_utilization("Start context switch"); | 202 | trace_rcu_utilization("Start context switch"); |
194 | rcu_sched_qs(cpu); | 203 | rcu_sched_qs(cpu); |
195 | rcu_preempt_note_context_switch(cpu); | ||
196 | trace_rcu_utilization("End context switch"); | 204 | trace_rcu_utilization("End context switch"); |
197 | } | 205 | } |
198 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1311 | #ifdef CONFIG_HOTPLUG_CPU | 1319 | #ifdef CONFIG_HOTPLUG_CPU |
1312 | 1320 | ||
1313 | /* | 1321 | /* |
1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1322 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1315 | * Also record a quiescent state for this CPU for the current grace period. | 1323 | * specified CPU must be offline, and the caller must hold the |
1316 | * Synchronization and interrupt disabling are not required because | 1324 | * ->onofflock. |
1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
1318 | * operations that might block must be done later from the CPU_DEAD | ||
1319 | * notifier. | ||
1320 | * | ||
1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
1323 | * destination from the bits set in that mask. | ||
1324 | */ | 1325 | */ |
1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1326 | static void |
1327 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
1328 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
1326 | { | 1329 | { |
1327 | int i; | 1330 | int i; |
1328 | unsigned long mask; | ||
1329 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
1333 | 1331 | ||
1334 | /* First, adjust the counts. */ | 1332 | /* |
1333 | * Orphan the callbacks. First adjust the counts. This is safe | ||
1334 | * because ->onofflock excludes _rcu_barrier()'s adoption of | ||
1335 | * the callbacks, thus no memory barrier is required. | ||
1336 | */ | ||
1335 | if (rdp->nxtlist != NULL) { | 1337 | if (rdp->nxtlist != NULL) { |
1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | 1338 | rsp->qlen_lazy += rdp->qlen_lazy; |
1337 | receive_rdp->qlen += rdp->qlen; | 1339 | rsp->qlen += rdp->qlen; |
1340 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1338 | rdp->qlen_lazy = 0; | 1341 | rdp->qlen_lazy = 0; |
1339 | rdp->qlen = 0; | 1342 | rdp->qlen = 0; |
1340 | } | 1343 | } |
1341 | 1344 | ||
1342 | /* | 1345 | /* |
1343 | * Next, move ready-to-invoke callbacks to be invoked on some | 1346 | * Next, move those callbacks still needing a grace period to |
1344 | * other CPU. These will not be required to pass through another | 1347 | * the orphanage, where some other CPU will pick them up. |
1345 | * grace period: They are done, regardless of CPU. | 1348 | * Some of the callbacks might have gone partway through a grace |
1349 | * period, but that is too bad. They get to start over because we | ||
1350 | * cannot assume that grace periods are synchronized across CPUs. | ||
1351 | * We don't bother updating the ->nxttail[] array yet, instead | ||
1352 | * we just reset the whole thing later on. | ||
1346 | */ | 1353 | */ |
1347 | if (rdp->nxtlist != NULL && | 1354 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { |
1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | 1355 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; |
1349 | struct rcu_head *oldhead; | 1356 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; |
1350 | struct rcu_head **oldtail; | 1357 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1351 | struct rcu_head **newtail; | ||
1352 | |||
1353 | oldhead = rdp->nxtlist; | ||
1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
1361 | receive_rdp->nxttail[i] = newtail; | ||
1362 | if (rdp->nxttail[i] == newtail) | ||
1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1364 | } | ||
1365 | } | 1358 | } |
1366 | 1359 | ||
1367 | /* | 1360 | /* |
1368 | * Finally, put the rest of the callbacks at the end of the list. | 1361 | * Then move the ready-to-invoke callbacks to the orphanage, |
1369 | * The ones that made it partway through get to start over: We | 1362 | * where some other CPU will pick them up. These will not be |
1370 | * cannot assume that grace periods are synchronized across CPUs. | 1363 | * required to pass through another grace period: They are done. |
1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but | ||
1372 | * this does not seem compelling. Not yet, anyway.) | ||
1373 | */ | 1364 | */ |
1374 | if (rdp->nxtlist != NULL) { | 1365 | if (rdp->nxtlist != NULL) { |
1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1366 | *rsp->orphan_donetail = rdp->nxtlist; |
1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | 1367 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1380 | |||
1381 | rdp->nxtlist = NULL; | ||
1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1384 | } | 1368 | } |
1385 | 1369 | ||
1370 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
1371 | rdp->nxtlist = NULL; | ||
1372 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1373 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
1378 | * orphanage. The caller must hold the ->onofflock. | ||
1379 | */ | ||
1380 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1381 | { | ||
1382 | int i; | ||
1383 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
1384 | |||
1386 | /* | 1385 | /* |
1387 | * Record a quiescent state for the dying CPU. This is safe | 1386 | * If there is an rcu_barrier() operation in progress, then |
1388 | * only because we have already cleared out the callbacks. | 1387 | * only the task doing that operation is permitted to adopt |
1389 | * (Otherwise, the RCU core might try to schedule the invocation | 1388 | * callbacks. To do otherwise breaks rcu_barrier() and friends |
1390 | * of callbacks on this now-offline CPU, which would be bad.) | 1389 | * by causing them to fail to wait for the callbacks in the |
1390 | * orphanage. | ||
1391 | */ | 1391 | */ |
1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1392 | if (rsp->rcu_barrier_in_progress && |
1393 | rsp->rcu_barrier_in_progress != current) | ||
1394 | return; | ||
1395 | |||
1396 | /* Do the accounting first. */ | ||
1397 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
1398 | rdp->qlen += rsp->qlen; | ||
1399 | rdp->n_cbs_adopted += rsp->qlen; | ||
1400 | rsp->qlen_lazy = 0; | ||
1401 | rsp->qlen = 0; | ||
1402 | |||
1403 | /* | ||
1404 | * We do not need a memory barrier here because the only way we | ||
1405 | * can get here if there is an rcu_barrier() in flight is if | ||
1406 | * we are the task doing the rcu_barrier(). | ||
1407 | */ | ||
1408 | |||
1409 | /* First adopt the ready-to-invoke callbacks. */ | ||
1410 | if (rsp->orphan_donelist != NULL) { | ||
1411 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1412 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
1413 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
1414 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
1415 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
1416 | rsp->orphan_donelist = NULL; | ||
1417 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1418 | } | ||
1419 | |||
1420 | /* And then adopt the callbacks that still need a grace period. */ | ||
1421 | if (rsp->orphan_nxtlist != NULL) { | ||
1422 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
1423 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
1424 | rsp->orphan_nxtlist = NULL; | ||
1425 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1426 | } | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * Trace the fact that this CPU is going offline. | ||
1431 | */ | ||
1432 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
1433 | { | ||
1434 | RCU_TRACE(unsigned long mask); | ||
1435 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
1436 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
1437 | |||
1438 | RCU_TRACE(mask = rdp->grpmask); | ||
1393 | trace_rcu_grace_period(rsp->name, | 1439 | trace_rcu_grace_period(rsp->name, |
1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1440 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1395 | "cpuofl"); | 1441 | "cpuofl"); |
1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
1398 | } | 1442 | } |
1399 | 1443 | ||
1400 | /* | 1444 | /* |
1401 | * The CPU has been completely removed, and some other CPU is reporting | 1445 | * The CPU has been completely removed, and some other CPU is reporting |
1402 | * this fact from process context. Do the remainder of the cleanup. | 1446 | * this fact from process context. Do the remainder of the cleanup, |
1447 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
1448 | * adopting them, if there is no _rcu_barrier() instance running. | ||
1403 | * There can only be one CPU hotplug operation at a time, so no other | 1449 | * There can only be one CPU hotplug operation at a time, so no other |
1404 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1450 | * CPU can be attempting to update rcu_cpu_kthread_task. |
1405 | */ | 1451 | */ |
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1409 | unsigned long mask; | 1455 | unsigned long mask; |
1410 | int need_report = 0; | 1456 | int need_report = 0; |
1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1457 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ | 1458 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
1413 | 1459 | ||
1414 | /* Adjust any no-longer-needed kthreads. */ | 1460 | /* Adjust any no-longer-needed kthreads. */ |
1415 | rcu_stop_cpu_kthread(cpu); | 1461 | rcu_stop_cpu_kthread(cpu); |
1416 | rcu_node_kthread_setaffinity(rnp, -1); | 1462 | rcu_node_kthread_setaffinity(rnp, -1); |
1417 | 1463 | ||
1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | 1464 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1419 | 1465 | ||
1420 | /* Exclude any attempts to start a new grace period. */ | 1466 | /* Exclude any attempts to start a new grace period. */ |
1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1467 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1422 | 1468 | ||
1469 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
1470 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
1471 | rcu_adopt_orphan_cbs(rsp); | ||
1472 | |||
1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1473 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1474 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1425 | do { | 1475 | do { |
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1456 | 1506 | ||
1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1507 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1458 | 1508 | ||
1509 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1510 | { | ||
1511 | } | ||
1512 | |||
1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1513 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1460 | { | 1514 | { |
1461 | } | 1515 | } |
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1524 | rcu_is_callbacks_kthread()); | 1578 | rcu_is_callbacks_kthread()); |
1525 | 1579 | ||
1526 | /* Update count, and requeue any remaining callbacks. */ | 1580 | /* Update count, and requeue any remaining callbacks. */ |
1527 | rdp->qlen_lazy -= count_lazy; | ||
1528 | rdp->qlen -= count; | ||
1529 | rdp->n_cbs_invoked += count; | ||
1530 | if (list != NULL) { | 1581 | if (list != NULL) { |
1531 | *tail = rdp->nxtlist; | 1582 | *tail = rdp->nxtlist; |
1532 | rdp->nxtlist = list; | 1583 | rdp->nxtlist = list; |
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1536 | else | 1587 | else |
1537 | break; | 1588 | break; |
1538 | } | 1589 | } |
1590 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
1591 | rdp->qlen_lazy -= count_lazy; | ||
1592 | rdp->qlen -= count; | ||
1593 | rdp->n_cbs_invoked += count; | ||
1539 | 1594 | ||
1540 | /* Reinstate batch limit if we have worked down the excess. */ | 1595 | /* Reinstate batch limit if we have worked down the excess. */ |
1541 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 1596 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) |
@@ -1820,15 +1875,17 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1820 | * a quiescent state betweentimes. | 1875 | * a quiescent state betweentimes. |
1821 | */ | 1876 | */ |
1822 | local_irq_save(flags); | 1877 | local_irq_save(flags); |
1823 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | ||
1824 | rdp = this_cpu_ptr(rsp->rda); | 1878 | rdp = this_cpu_ptr(rsp->rda); |
1825 | 1879 | ||
1826 | /* Add the callback to our list. */ | 1880 | /* Add the callback to our list. */ |
1827 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1828 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1829 | rdp->qlen++; | 1881 | rdp->qlen++; |
1830 | if (lazy) | 1882 | if (lazy) |
1831 | rdp->qlen_lazy++; | 1883 | rdp->qlen_lazy++; |
1884 | else | ||
1885 | rcu_idle_count_callbacks_posted(); | ||
1886 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
1887 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1888 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1832 | 1889 | ||
1833 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1890 | if (__is_kfree_rcu_offset((unsigned long)func)) |
1834 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1891 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
@@ -1894,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
1894 | } | 1951 | } |
1895 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 1952 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1896 | 1953 | ||
1954 | /* | ||
1955 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | ||
1956 | * any blocking grace-period wait automatically implies a grace period | ||
1957 | * if there is only one CPU online at any point in time during execution | ||
1958 | * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to | ||
1959 | * occasionally incorrectly indicate that there are multiple CPUs online | ||
1960 | * when there was in fact only one the whole time, as this just adds | ||
1961 | * some overhead: RCU still operates correctly. | ||
1962 | * | ||
1963 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
1964 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
1965 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
1966 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
1967 | * all times, but num_online_cpus() might well return one (or even zero). | ||
1968 | * | ||
1969 | * However, all such demonic sequences require at least one CPU-offline | ||
1970 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
1971 | * is only a problem if there is an RCU read-side critical section executing | ||
1972 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
1973 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
1974 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
1975 | * that there is only one CPU when in fact there was more than one throughout | ||
1976 | * is when there were no RCU readers in the system. If there are no | ||
1977 | * RCU readers, the grace period by definition can be of zero length, | ||
1978 | * regardless of the number of online CPUs. | ||
1979 | */ | ||
1980 | static inline int rcu_blocking_is_gp(void) | ||
1981 | { | ||
1982 | might_sleep(); /* Check for RCU read-side critical section. */ | ||
1983 | return num_online_cpus() <= 1; | ||
1984 | } | ||
1985 | |||
1897 | /** | 1986 | /** |
1898 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. | 1987 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. |
1899 | * | 1988 | * |
@@ -2167,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
2167 | rcu_preempt_cpu_has_callbacks(cpu); | 2256 | rcu_preempt_cpu_has_callbacks(cpu); |
2168 | } | 2257 | } |
2169 | 2258 | ||
2170 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2259 | /* |
2171 | static atomic_t rcu_barrier_cpu_count; | 2260 | * RCU callback function for _rcu_barrier(). If we are last, wake |
2172 | static DEFINE_MUTEX(rcu_barrier_mutex); | 2261 | * up the task executing _rcu_barrier(). |
2173 | static struct completion rcu_barrier_completion; | 2262 | */ |
2174 | |||
2175 | static void rcu_barrier_callback(struct rcu_head *notused) | 2263 | static void rcu_barrier_callback(struct rcu_head *notused) |
2176 | { | 2264 | { |
2177 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2265 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -2201,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2201 | void (*call_rcu_func)(struct rcu_head *head, | 2289 | void (*call_rcu_func)(struct rcu_head *head, |
2202 | void (*func)(struct rcu_head *head))) | 2290 | void (*func)(struct rcu_head *head))) |
2203 | { | 2291 | { |
2204 | BUG_ON(in_interrupt()); | 2292 | int cpu; |
2293 | unsigned long flags; | ||
2294 | struct rcu_data *rdp; | ||
2295 | struct rcu_head rh; | ||
2296 | |||
2297 | init_rcu_head_on_stack(&rh); | ||
2298 | |||
2205 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2299 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
2206 | mutex_lock(&rcu_barrier_mutex); | 2300 | mutex_lock(&rcu_barrier_mutex); |
2207 | init_completion(&rcu_barrier_completion); | 2301 | |
2302 | smp_mb(); /* Prevent any prior operations from leaking in. */ | ||
2303 | |||
2208 | /* | 2304 | /* |
2209 | * Initialize rcu_barrier_cpu_count to 1, then invoke | 2305 | * Initialize the count to one rather than to zero in order to |
2210 | * rcu_barrier_func() on each CPU, so that each CPU also has | 2306 | * avoid a too-soon return to zero in case of a short grace period |
2211 | * incremented rcu_barrier_cpu_count. Only then is it safe to | 2307 | * (or preemption of this task). Also flag this task as doing |
2212 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 2308 | * an rcu_barrier(). This will prevent anyone else from adopting |
2213 | * might complete its grace period before all of the other CPUs | 2309 | * orphaned callbacks, which could cause otherwise failure if a |
2214 | * did their increment, causing this function to return too | 2310 | * CPU went offline and quickly came back online. To see this, |
2215 | * early. Note that on_each_cpu() disables irqs, which prevents | 2311 | * consider the following sequence of events: |
2216 | * any CPUs from coming online or going offline until each online | 2312 | * |
2217 | * CPU has queued its RCU-barrier callback. | 2313 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. |
2314 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
2315 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
2316 | * 4. CPU 1 comes back online. | ||
2317 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
2318 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
2319 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
2218 | */ | 2320 | */ |
2321 | init_completion(&rcu_barrier_completion); | ||
2219 | atomic_set(&rcu_barrier_cpu_count, 1); | 2322 | atomic_set(&rcu_barrier_cpu_count, 1); |
2220 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 2323 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
2324 | rsp->rcu_barrier_in_progress = current; | ||
2325 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2326 | |||
2327 | /* | ||
2328 | * Force every CPU with callbacks to register a new callback | ||
2329 | * that will tell us when all the preceding callbacks have | ||
2330 | * been invoked. If an offline CPU has callbacks, wait for | ||
2331 | * it to either come back online or to finish orphaning those | ||
2332 | * callbacks. | ||
2333 | */ | ||
2334 | for_each_possible_cpu(cpu) { | ||
2335 | preempt_disable(); | ||
2336 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2337 | if (cpu_is_offline(cpu)) { | ||
2338 | preempt_enable(); | ||
2339 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
2340 | schedule_timeout_interruptible(1); | ||
2341 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2342 | smp_call_function_single(cpu, rcu_barrier_func, | ||
2343 | (void *)call_rcu_func, 1); | ||
2344 | preempt_enable(); | ||
2345 | } else { | ||
2346 | preempt_enable(); | ||
2347 | } | ||
2348 | } | ||
2349 | |||
2350 | /* | ||
2351 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
2352 | * posted, we can adopt all of the orphaned callbacks and place | ||
2353 | * an rcu_barrier_callback() callback after them. When that is done, | ||
2354 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
2355 | * following every callback that could possibly have been | ||
2356 | * registered before _rcu_barrier() was called. | ||
2357 | */ | ||
2358 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
2359 | rcu_adopt_orphan_cbs(rsp); | ||
2360 | rsp->rcu_barrier_in_progress = NULL; | ||
2361 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2362 | atomic_inc(&rcu_barrier_cpu_count); | ||
2363 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
2364 | call_rcu_func(&rh, rcu_barrier_callback); | ||
2365 | |||
2366 | /* | ||
2367 | * Now that we have an rcu_barrier_callback() callback on each | ||
2368 | * CPU, and thus each counted, remove the initial count. | ||
2369 | */ | ||
2221 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2370 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
2222 | complete(&rcu_barrier_completion); | 2371 | complete(&rcu_barrier_completion); |
2372 | |||
2373 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
2223 | wait_for_completion(&rcu_barrier_completion); | 2374 | wait_for_completion(&rcu_barrier_completion); |
2375 | |||
2376 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
2224 | mutex_unlock(&rcu_barrier_mutex); | 2377 | mutex_unlock(&rcu_barrier_mutex); |
2378 | |||
2379 | destroy_rcu_head_on_stack(&rh); | ||
2225 | } | 2380 | } |
2226 | 2381 | ||
2227 | /** | 2382 | /** |
@@ -2418,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2418 | 2573 | ||
2419 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2574 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
2420 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2575 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
2421 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | 2576 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; |
2422 | } | 2577 | } |
2423 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2578 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
2424 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2579 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a4072..7f5d138dedf5 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -29,18 +29,14 @@ | |||
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
33 | * CONFIG_RCU_FANOUT_LEAF. | ||
33 | * In theory, it should be possible to add more levels straightforwardly. | 34 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this did work well going from three levels to four. | 35 | * In practice, this did work well going from three levels to four. |
35 | * Of course, your mileage may vary. | 36 | * Of course, your mileage may vary. |
36 | */ | 37 | */ |
37 | #define MAX_RCU_LVLS 4 | 38 | #define MAX_RCU_LVLS 4 |
38 | #if CONFIG_RCU_FANOUT > 16 | 39 | #define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) |
39 | #define RCU_FANOUT_LEAF 16 | ||
40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ | ||
41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) | ||
42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | ||
43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | 40 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) |
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | 41 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) |
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
@@ -371,6 +367,17 @@ struct rcu_state { | |||
371 | 367 | ||
372 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 368 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
373 | /* starting new GP. */ | 369 | /* starting new GP. */ |
370 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
371 | /* need a grace period. */ | ||
372 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
373 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
374 | /* are ready to invoke. */ | ||
375 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
376 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
377 | long qlen; /* Total number of callbacks. */ | ||
378 | struct task_struct *rcu_barrier_in_progress; | ||
379 | /* Task doing rcu_barrier(), */ | ||
380 | /* or NULL if no barrier. */ | ||
374 | raw_spinlock_t fqslock; /* Only one task forcing */ | 381 | raw_spinlock_t fqslock; /* Only one task forcing */ |
375 | /* quiescent states. */ | 382 | /* quiescent states. */ |
376 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 383 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
423 | /* Forward declarations for rcutree_plugin.h */ | 430 | /* Forward declarations for rcutree_plugin.h */ |
424 | static void rcu_bootup_announce(void); | 431 | static void rcu_bootup_announce(void); |
425 | long rcu_batches_completed(void); | 432 | long rcu_batches_completed(void); |
426 | static void rcu_preempt_note_context_switch(int cpu); | ||
427 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 433 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
428 | #ifdef CONFIG_HOTPLUG_CPU | 434 | #ifdef CONFIG_HOTPLUG_CPU |
429 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 435 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); | |||
471 | static void rcu_prepare_for_idle_init(int cpu); | 477 | static void rcu_prepare_for_idle_init(int cpu); |
472 | static void rcu_cleanup_after_idle(int cpu); | 478 | static void rcu_cleanup_after_idle(int cpu); |
473 | static void rcu_prepare_for_idle(int cpu); | 479 | static void rcu_prepare_for_idle(int cpu); |
480 | static void rcu_idle_count_callbacks_posted(void); | ||
474 | static void print_cpu_stall_info_begin(void); | 481 | static void print_cpu_stall_info_begin(void); |
475 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 482 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
476 | static void print_cpu_stall_info_end(void); | 483 | static void print_cpu_stall_info_end(void); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816be..2411000d9869 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) | |||
153 | * | 153 | * |
154 | * Caller must disable preemption. | 154 | * Caller must disable preemption. |
155 | */ | 155 | */ |
156 | static void rcu_preempt_note_context_switch(int cpu) | 156 | void rcu_preempt_note_context_switch(void) |
157 | { | 157 | { |
158 | struct task_struct *t = current; | 158 | struct task_struct *t = current; |
159 | unsigned long flags; | 159 | unsigned long flags; |
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
165 | 165 | ||
166 | /* Possibly blocking in an RCU read-side critical section. */ | 166 | /* Possibly blocking in an RCU read-side critical section. */ |
167 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); |
168 | rnp = rdp->mynode; | 168 | rnp = rdp->mynode; |
169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 169 | raw_spin_lock_irqsave(&rnp->lock, flags); |
170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
228 | * means that we continue to block the current grace period. | 228 | * means that we continue to block the current grace period. |
229 | */ | 229 | */ |
230 | local_irq_save(flags); | 230 | local_irq_save(flags); |
231 | rcu_preempt_qs(cpu); | 231 | rcu_preempt_qs(smp_processor_id()); |
232 | local_irq_restore(flags); | 232 | local_irq_restore(flags); |
233 | } | 233 | } |
234 | 234 | ||
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) | |||
969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | 969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
970 | } | 970 | } |
971 | 971 | ||
972 | /* | ||
973 | * Check for a task exiting while in a preemptible-RCU read-side | ||
974 | * critical section, clean up if so. No need to issue warnings, | ||
975 | * as debug_check_no_locks_held() already does this if lockdep | ||
976 | * is enabled. | ||
977 | */ | ||
978 | void exit_rcu(void) | ||
979 | { | ||
980 | struct task_struct *t = current; | ||
981 | |||
982 | if (t->rcu_read_lock_nesting == 0) | ||
983 | return; | ||
984 | t->rcu_read_lock_nesting = 1; | ||
985 | __rcu_read_unlock(); | ||
986 | } | ||
987 | |||
988 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 972 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
989 | 973 | ||
990 | static struct rcu_state *rcu_state = &rcu_sched_state; | 974 | static struct rcu_state *rcu_state = &rcu_sched_state; |
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void) | |||
1018 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
1019 | 1003 | ||
1020 | /* | 1004 | /* |
1021 | * Because preemptible RCU does not exist, we never have to check for | ||
1022 | * CPUs being in quiescent states. | ||
1023 | */ | ||
1024 | static void rcu_preempt_note_context_switch(int cpu) | ||
1025 | { | ||
1026 | } | ||
1027 | |||
1028 | /* | ||
1029 | * Because preemptible RCU does not exist, there are never any preempted | 1005 | * Because preemptible RCU does not exist, there are never any preempted |
1030 | * RCU readers. | 1006 | * RCU readers. |
1031 | */ | 1007 | */ |
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu) | |||
1938 | { | 1914 | { |
1939 | } | 1915 | } |
1940 | 1916 | ||
1917 | /* | ||
1918 | * Don't bother keeping a running count of the number of RCU callbacks | ||
1919 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
1920 | */ | ||
1921 | static void rcu_idle_count_callbacks_posted(void) | ||
1922 | { | ||
1923 | } | ||
1924 | |||
1941 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1925 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1942 | 1926 | ||
1943 | /* | 1927 | /* |
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu) | |||
1978 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1962 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1979 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1963 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1980 | 1964 | ||
1965 | /* Loop counter for rcu_prepare_for_idle(). */ | ||
1981 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 1966 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
1967 | /* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ | ||
1982 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 1968 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
1983 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | 1969 | /* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ |
1984 | static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ | 1970 | static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); |
1985 | static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | 1971 | /* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ |
1972 | static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); | ||
1973 | /* Enable special processing on first attempt to enter dyntick-idle mode. */ | ||
1974 | static DEFINE_PER_CPU(bool, rcu_idle_first_pass); | ||
1975 | /* Running count of non-lazy callbacks posted, never decremented. */ | ||
1976 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); | ||
1977 | /* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ | ||
1978 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); | ||
1986 | 1979 | ||
1987 | /* | 1980 | /* |
1988 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | 1981 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | |||
1995 | */ | 1988 | */ |
1996 | int rcu_needs_cpu(int cpu) | 1989 | int rcu_needs_cpu(int cpu) |
1997 | { | 1990 | { |
1991 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
1992 | per_cpu(rcu_idle_first_pass, cpu) = 1; | ||
1998 | /* If no callbacks, RCU doesn't need the CPU. */ | 1993 | /* If no callbacks, RCU doesn't need the CPU. */ |
1999 | if (!rcu_cpu_has_callbacks(cpu)) | 1994 | if (!rcu_cpu_has_callbacks(cpu)) |
2000 | return 0; | 1995 | return 0; |
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2045 | } | 2040 | } |
2046 | 2041 | ||
2047 | /* | 2042 | /* |
2043 | * Handler for smp_call_function_single(). The only point of this | ||
2044 | * handler is to wake the CPU up, so the handler does only tracing. | ||
2045 | */ | ||
2046 | void rcu_idle_demigrate(void *unused) | ||
2047 | { | ||
2048 | trace_rcu_prep_idle("Demigrate"); | ||
2049 | } | ||
2050 | |||
2051 | /* | ||
2048 | * Timer handler used to force CPU to start pushing its remaining RCU | 2052 | * Timer handler used to force CPU to start pushing its remaining RCU |
2049 | * callbacks in the case where it entered dyntick-idle mode with callbacks | 2053 | * callbacks in the case where it entered dyntick-idle mode with callbacks |
2050 | * pending. The hander doesn't really need to do anything because the | 2054 | * pending. The hander doesn't really need to do anything because the |
2051 | * real work is done upon re-entry to idle, or by the next scheduling-clock | 2055 | * real work is done upon re-entry to idle, or by the next scheduling-clock |
2052 | * interrupt should idle not be re-entered. | 2056 | * interrupt should idle not be re-entered. |
2057 | * | ||
2058 | * One special case: the timer gets migrated without awakening the CPU | ||
2059 | * on which the timer was scheduled on. In this case, we must wake up | ||
2060 | * that CPU. We do so with smp_call_function_single(). | ||
2053 | */ | 2061 | */ |
2054 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | 2062 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) |
2055 | { | 2063 | { |
2064 | int cpu = (int)cpu_in; | ||
2065 | |||
2056 | trace_rcu_prep_idle("Timer"); | 2066 | trace_rcu_prep_idle("Timer"); |
2057 | return HRTIMER_NORESTART; | 2067 | if (cpu != smp_processor_id()) |
2068 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
2069 | else | ||
2070 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
2058 | } | 2071 | } |
2059 | 2072 | ||
2060 | /* | 2073 | /* |
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | |||
2062 | */ | 2075 | */ |
2063 | static void rcu_prepare_for_idle_init(int cpu) | 2076 | static void rcu_prepare_for_idle_init(int cpu) |
2064 | { | 2077 | { |
2065 | static int firsttime = 1; | 2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
2066 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), |
2067 | 2080 | rcu_idle_gp_timer_func, cpu); | |
2068 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; |
2069 | hrtp->function = rcu_idle_gp_timer_func; | 2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; |
2070 | if (firsttime) { | ||
2071 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | ||
2072 | |||
2073 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2074 | upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); | ||
2075 | rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2076 | firsttime = 0; | ||
2077 | } | ||
2078 | } | 2083 | } |
2079 | 2084 | ||
2080 | /* | 2085 | /* |
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu) | |||
2084 | */ | 2089 | */ |
2085 | static void rcu_cleanup_after_idle(int cpu) | 2090 | static void rcu_cleanup_after_idle(int cpu) |
2086 | { | 2091 | { |
2087 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | 2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); |
2093 | trace_rcu_prep_idle("Cleanup after idle"); | ||
2088 | } | 2094 | } |
2089 | 2095 | ||
2090 | /* | 2096 | /* |
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2108 | */ | 2114 | */ |
2109 | static void rcu_prepare_for_idle(int cpu) | 2115 | static void rcu_prepare_for_idle(int cpu) |
2110 | { | 2116 | { |
2117 | struct timer_list *tp; | ||
2118 | |||
2119 | /* | ||
2120 | * If this is an idle re-entry, for example, due to use of | ||
2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | ||
2122 | * loop, then don't take any state-machine actions, unless the | ||
2123 | * momentary exit from idle queued additional non-lazy callbacks. | ||
2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | ||
2125 | * pending. | ||
2126 | */ | ||
2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | ||
2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | ||
2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
2130 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2133 | } | ||
2134 | return; | ||
2135 | } | ||
2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | ||
2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
2139 | |||
2111 | /* | 2140 | /* |
2112 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2113 | * Also reset state to avoid prejudicing later attempts. | 2142 | * Also reset state to avoid prejudicing later attempts. |
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu) | |||
2140 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
2141 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2142 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) |
2143 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2144 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | 2173 | jiffies + RCU_IDLE_GP_DELAY; |
2145 | else | 2174 | else |
2146 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2147 | rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); | 2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2180 | per_cpu(rcu_nonlazy_posted, cpu); | ||
2148 | return; /* Nothing more to do immediately. */ | 2181 | return; /* Nothing more to do immediately. */ |
2149 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2150 | /* We have hit the limit, so time to give up. */ | 2183 | /* We have hit the limit, so time to give up. */ |
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu) | |||
2184 | trace_rcu_prep_idle("Callbacks drained"); | 2217 | trace_rcu_prep_idle("Callbacks drained"); |
2185 | } | 2218 | } |
2186 | 2219 | ||
2220 | /* | ||
2221 | * Keep a running count of the number of non-lazy callbacks posted | ||
2222 | * on this CPU. This running counter (which is never decremented) allows | ||
2223 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
2224 | * posts a callback, even if an equal number of callbacks are invoked. | ||
2225 | * Of course, callbacks should only be posted from within a trace event | ||
2226 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
2227 | */ | ||
2228 | static void rcu_idle_count_callbacks_posted(void) | ||
2229 | { | ||
2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | ||
2231 | } | ||
2232 | |||
2187 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2188 | 2234 | ||
2189 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 2235 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu) | |||
2192 | 2238 | ||
2193 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2194 | { | 2240 | { |
2195 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); |
2196 | 2242 | ||
2197 | sprintf(cp, "drain=%d %c timer=%lld", | 2243 | sprintf(cp, "drain=%d %c timer=%lu", |
2198 | per_cpu(rcu_dyntick_drain, cpu), | 2244 | per_cpu(rcu_dyntick_drain, cpu), |
2199 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', |
2200 | hrtimer_active(hrtp) | 2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2201 | ? ktime_to_us(hrtimer_get_remaining(hrtp)) | ||
2202 | : -1); | ||
2203 | } | 2247 | } |
2204 | 2248 | ||
2205 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2249 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff43..d4bc16ddd1d4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
271 | 271 | ||
272 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
275 | rsp->completed, gpnum, rsp->fqs_state, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
276 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
277 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs_lh); | 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
282 | if (rnp->level != level) { | 282 | if (rnp->level != level) { |
283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b3..bebe2b170d49 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) | |||
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | 25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val, |
26 | bool force) | ||
26 | { | 27 | { |
28 | int ret = 0; | ||
29 | |||
27 | if (counter->usage + val > counter->limit) { | 30 | if (counter->usage + val > counter->limit) { |
28 | counter->failcnt++; | 31 | counter->failcnt++; |
29 | return -ENOMEM; | 32 | ret = -ENOMEM; |
33 | if (!force) | ||
34 | return ret; | ||
30 | } | 35 | } |
31 | 36 | ||
32 | counter->usage += val; | 37 | counter->usage += val; |
33 | if (counter->usage > counter->max_usage) | 38 | if (counter->usage > counter->max_usage) |
34 | counter->max_usage = counter->usage; | 39 | counter->max_usage = counter->usage; |
35 | return 0; | 40 | return ret; |
36 | } | 41 | } |
37 | 42 | ||
38 | int res_counter_charge(struct res_counter *counter, unsigned long val, | 43 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, |
39 | struct res_counter **limit_fail_at) | 44 | struct res_counter **limit_fail_at, bool force) |
40 | { | 45 | { |
41 | int ret; | 46 | int ret, r; |
42 | unsigned long flags; | 47 | unsigned long flags; |
43 | struct res_counter *c, *u; | 48 | struct res_counter *c, *u; |
44 | 49 | ||
50 | r = ret = 0; | ||
45 | *limit_fail_at = NULL; | 51 | *limit_fail_at = NULL; |
46 | local_irq_save(flags); | 52 | local_irq_save(flags); |
47 | for (c = counter; c != NULL; c = c->parent) { | 53 | for (c = counter; c != NULL; c = c->parent) { |
48 | spin_lock(&c->lock); | 54 | spin_lock(&c->lock); |
49 | ret = res_counter_charge_locked(c, val); | 55 | r = res_counter_charge_locked(c, val, force); |
50 | spin_unlock(&c->lock); | 56 | spin_unlock(&c->lock); |
51 | if (ret < 0) { | 57 | if (r < 0 && !ret) { |
58 | ret = r; | ||
52 | *limit_fail_at = c; | 59 | *limit_fail_at = c; |
53 | goto undo; | 60 | if (!force) |
61 | break; | ||
54 | } | 62 | } |
55 | } | 63 | } |
56 | ret = 0; | 64 | |
57 | goto done; | 65 | if (ret < 0 && !force) { |
58 | undo: | 66 | for (u = counter; u != c; u = u->parent) { |
59 | for (u = counter; u != c; u = u->parent) { | 67 | spin_lock(&u->lock); |
60 | spin_lock(&u->lock); | 68 | res_counter_uncharge_locked(u, val); |
61 | res_counter_uncharge_locked(u, val); | 69 | spin_unlock(&u->lock); |
62 | spin_unlock(&u->lock); | 70 | } |
63 | } | 71 | } |
64 | done: | ||
65 | local_irq_restore(flags); | 72 | local_irq_restore(flags); |
73 | |||
66 | return ret; | 74 | return ret; |
67 | } | 75 | } |
68 | 76 | ||
77 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
78 | struct res_counter **limit_fail_at) | ||
79 | { | ||
80 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
81 | } | ||
82 | |||
69 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | 83 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, |
70 | struct res_counter **limit_fail_at) | 84 | struct res_counter **limit_fail_at) |
71 | { | 85 | { |
72 | int ret, r; | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
73 | unsigned long flags; | ||
74 | struct res_counter *c; | ||
75 | |||
76 | r = ret = 0; | ||
77 | *limit_fail_at = NULL; | ||
78 | local_irq_save(flags); | ||
79 | for (c = counter; c != NULL; c = c->parent) { | ||
80 | spin_lock(&c->lock); | ||
81 | r = res_counter_charge_locked(c, val); | ||
82 | if (r) | ||
83 | c->usage += val; | ||
84 | spin_unlock(&c->lock); | ||
85 | if (r < 0 && ret == 0) { | ||
86 | *limit_fail_at = c; | ||
87 | ret = r; | ||
88 | } | ||
89 | } | ||
90 | local_irq_restore(flags); | ||
91 | |||
92 | return ret; | ||
93 | } | 87 | } |
88 | |||
94 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
95 | { | 90 | { |
96 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a3..173ea52f3af0 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | |||
20 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b189fecaef90..39eb6011bc38 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -692,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data) | |||
692 | } | 693 | } |
693 | #endif | 694 | #endif |
694 | 695 | ||
695 | void update_cpu_load(struct rq *this_rq); | ||
696 | |||
697 | static void set_load_weight(struct task_struct *p) | 696 | static void set_load_weight(struct task_struct *p) |
698 | { | 697 | { |
699 | int prio = p->static_prio - MAX_RT_PRIO; | 698 | int prio = p->static_prio - MAX_RT_PRIO; |
@@ -2083,6 +2082,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2083 | #endif | 2082 | #endif |
2084 | 2083 | ||
2085 | /* Here we just switch the register state and the stack. */ | 2084 | /* Here we just switch the register state and the stack. */ |
2085 | rcu_switch_from(prev); | ||
2086 | switch_to(prev, next, prev); | 2086 | switch_to(prev, next, prev); |
2087 | 2087 | ||
2088 | barrier(); | 2088 | barrier(); |
@@ -2486,22 +2486,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
2487 | * every tick. We fix it up based on jiffies. | 2487 | * every tick. We fix it up based on jiffies. |
2488 | */ | 2488 | */ |
2489 | void update_cpu_load(struct rq *this_rq) | 2489 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
2490 | unsigned long pending_updates) | ||
2490 | { | 2491 | { |
2491 | unsigned long this_load = this_rq->load.weight; | ||
2492 | unsigned long curr_jiffies = jiffies; | ||
2493 | unsigned long pending_updates; | ||
2494 | int i, scale; | 2492 | int i, scale; |
2495 | 2493 | ||
2496 | this_rq->nr_load_updates++; | 2494 | this_rq->nr_load_updates++; |
2497 | 2495 | ||
2498 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
2499 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2500 | return; | ||
2501 | |||
2502 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2503 | this_rq->last_load_update_tick = curr_jiffies; | ||
2504 | |||
2505 | /* Update our load: */ | 2496 | /* Update our load: */ |
2506 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 2497 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
2507 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2498 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2526,9 +2517,45 @@ void update_cpu_load(struct rq *this_rq) | |||
2526 | sched_avg_update(this_rq); | 2517 | sched_avg_update(this_rq); |
2527 | } | 2518 | } |
2528 | 2519 | ||
2520 | /* | ||
2521 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2522 | * idle balance. | ||
2523 | */ | ||
2524 | void update_idle_cpu_load(struct rq *this_rq) | ||
2525 | { | ||
2526 | unsigned long curr_jiffies = jiffies; | ||
2527 | unsigned long load = this_rq->load.weight; | ||
2528 | unsigned long pending_updates; | ||
2529 | |||
2530 | /* | ||
2531 | * Bloody broken means of dealing with nohz, but better than nothing.. | ||
2532 | * jiffies is updated by one cpu, another cpu can drift wrt the jiffy | ||
2533 | * update and see 0 difference the one time and 2 the next, even though | ||
2534 | * we ticked at roughtly the same rate. | ||
2535 | * | ||
2536 | * Hence we only use this from nohz_idle_balance() and skip this | ||
2537 | * nonsense when called from the scheduler_tick() since that's | ||
2538 | * guaranteed a stable rate. | ||
2539 | */ | ||
2540 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2541 | return; | ||
2542 | |||
2543 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2544 | this_rq->last_load_update_tick = curr_jiffies; | ||
2545 | |||
2546 | __update_cpu_load(this_rq, load, pending_updates); | ||
2547 | } | ||
2548 | |||
2549 | /* | ||
2550 | * Called from scheduler_tick() | ||
2551 | */ | ||
2529 | static void update_cpu_load_active(struct rq *this_rq) | 2552 | static void update_cpu_load_active(struct rq *this_rq) |
2530 | { | 2553 | { |
2531 | update_cpu_load(this_rq); | 2554 | /* |
2555 | * See the mess in update_idle_cpu_load(). | ||
2556 | */ | ||
2557 | this_rq->last_load_update_tick = jiffies; | ||
2558 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2532 | 2559 | ||
2533 | calc_load_account_active(this_rq); | 2560 | calc_load_account_active(this_rq); |
2534 | } | 2561 | } |
@@ -3113,6 +3140,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3113 | if (irqs_disabled()) | 3140 | if (irqs_disabled()) |
3114 | print_irqtrace_events(prev); | 3141 | print_irqtrace_events(prev); |
3115 | dump_stack(); | 3142 | dump_stack(); |
3143 | add_taint(TAINT_WARN); | ||
3116 | } | 3144 | } |
3117 | 3145 | ||
3118 | /* | 3146 | /* |
@@ -5557,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5557 | break; | 5585 | break; |
5558 | } | 5586 | } |
5559 | 5587 | ||
5560 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 5588 | if (!(sd->flags & SD_OVERLAP) && |
5589 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5561 | printk(KERN_CONT "\n"); | 5590 | printk(KERN_CONT "\n"); |
5562 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5591 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5563 | break; | 5592 | break; |
@@ -5895,99 +5924,11 @@ static int __init isolated_cpu_setup(char *str) | |||
5895 | 5924 | ||
5896 | __setup("isolcpus=", isolated_cpu_setup); | 5925 | __setup("isolcpus=", isolated_cpu_setup); |
5897 | 5926 | ||
5898 | #ifdef CONFIG_NUMA | ||
5899 | |||
5900 | /** | ||
5901 | * find_next_best_node - find the next node to include in a sched_domain | ||
5902 | * @node: node whose sched_domain we're building | ||
5903 | * @used_nodes: nodes already in the sched_domain | ||
5904 | * | ||
5905 | * Find the next node to include in a given scheduling domain. Simply | ||
5906 | * finds the closest node not already in the @used_nodes map. | ||
5907 | * | ||
5908 | * Should use nodemask_t. | ||
5909 | */ | ||
5910 | static int find_next_best_node(int node, nodemask_t *used_nodes) | ||
5911 | { | ||
5912 | int i, n, val, min_val, best_node = -1; | ||
5913 | |||
5914 | min_val = INT_MAX; | ||
5915 | |||
5916 | for (i = 0; i < nr_node_ids; i++) { | ||
5917 | /* Start at @node */ | ||
5918 | n = (node + i) % nr_node_ids; | ||
5919 | |||
5920 | if (!nr_cpus_node(n)) | ||
5921 | continue; | ||
5922 | |||
5923 | /* Skip already used nodes */ | ||
5924 | if (node_isset(n, *used_nodes)) | ||
5925 | continue; | ||
5926 | |||
5927 | /* Simple min distance search */ | ||
5928 | val = node_distance(node, n); | ||
5929 | |||
5930 | if (val < min_val) { | ||
5931 | min_val = val; | ||
5932 | best_node = n; | ||
5933 | } | ||
5934 | } | ||
5935 | |||
5936 | if (best_node != -1) | ||
5937 | node_set(best_node, *used_nodes); | ||
5938 | return best_node; | ||
5939 | } | ||
5940 | |||
5941 | /** | ||
5942 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5943 | * @node: node whose cpumask we're constructing | ||
5944 | * @span: resulting cpumask | ||
5945 | * | ||
5946 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5947 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5948 | * out optimally. | ||
5949 | */ | ||
5950 | static void sched_domain_node_span(int node, struct cpumask *span) | ||
5951 | { | ||
5952 | nodemask_t used_nodes; | ||
5953 | int i; | ||
5954 | |||
5955 | cpumask_clear(span); | ||
5956 | nodes_clear(used_nodes); | ||
5957 | |||
5958 | cpumask_or(span, span, cpumask_of_node(node)); | ||
5959 | node_set(node, used_nodes); | ||
5960 | |||
5961 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5962 | int next_node = find_next_best_node(node, &used_nodes); | ||
5963 | if (next_node < 0) | ||
5964 | break; | ||
5965 | cpumask_or(span, span, cpumask_of_node(next_node)); | ||
5966 | } | ||
5967 | } | ||
5968 | |||
5969 | static const struct cpumask *cpu_node_mask(int cpu) | ||
5970 | { | ||
5971 | lockdep_assert_held(&sched_domains_mutex); | ||
5972 | |||
5973 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
5974 | |||
5975 | return sched_domains_tmpmask; | ||
5976 | } | ||
5977 | |||
5978 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
5979 | { | ||
5980 | return cpu_possible_mask; | ||
5981 | } | ||
5982 | #endif /* CONFIG_NUMA */ | ||
5983 | |||
5984 | static const struct cpumask *cpu_cpu_mask(int cpu) | 5927 | static const struct cpumask *cpu_cpu_mask(int cpu) |
5985 | { | 5928 | { |
5986 | return cpumask_of_node(cpu_to_node(cpu)); | 5929 | return cpumask_of_node(cpu_to_node(cpu)); |
5987 | } | 5930 | } |
5988 | 5931 | ||
5989 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5990 | |||
5991 | struct sd_data { | 5932 | struct sd_data { |
5992 | struct sched_domain **__percpu sd; | 5933 | struct sched_domain **__percpu sd; |
5993 | struct sched_group **__percpu sg; | 5934 | struct sched_group **__percpu sg; |
@@ -6017,6 +5958,7 @@ struct sched_domain_topology_level { | |||
6017 | sched_domain_init_f init; | 5958 | sched_domain_init_f init; |
6018 | sched_domain_mask_f mask; | 5959 | sched_domain_mask_f mask; |
6019 | int flags; | 5960 | int flags; |
5961 | int numa_level; | ||
6020 | struct sd_data data; | 5962 | struct sd_data data; |
6021 | }; | 5963 | }; |
6022 | 5964 | ||
@@ -6208,10 +6150,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | |||
6208 | } | 6150 | } |
6209 | 6151 | ||
6210 | SD_INIT_FUNC(CPU) | 6152 | SD_INIT_FUNC(CPU) |
6211 | #ifdef CONFIG_NUMA | ||
6212 | SD_INIT_FUNC(ALLNODES) | ||
6213 | SD_INIT_FUNC(NODE) | ||
6214 | #endif | ||
6215 | #ifdef CONFIG_SCHED_SMT | 6153 | #ifdef CONFIG_SCHED_SMT |
6216 | SD_INIT_FUNC(SIBLING) | 6154 | SD_INIT_FUNC(SIBLING) |
6217 | #endif | 6155 | #endif |
@@ -6333,15 +6271,184 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6333 | { sd_init_BOOK, cpu_book_mask, }, | 6271 | { sd_init_BOOK, cpu_book_mask, }, |
6334 | #endif | 6272 | #endif |
6335 | { sd_init_CPU, cpu_cpu_mask, }, | 6273 | { sd_init_CPU, cpu_cpu_mask, }, |
6336 | #ifdef CONFIG_NUMA | ||
6337 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, | ||
6338 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | ||
6339 | #endif | ||
6340 | { NULL, }, | 6274 | { NULL, }, |
6341 | }; | 6275 | }; |
6342 | 6276 | ||
6343 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6277 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6344 | 6278 | ||
6279 | #ifdef CONFIG_NUMA | ||
6280 | |||
6281 | static int sched_domains_numa_levels; | ||
6282 | static int sched_domains_numa_scale; | ||
6283 | static int *sched_domains_numa_distance; | ||
6284 | static struct cpumask ***sched_domains_numa_masks; | ||
6285 | static int sched_domains_curr_level; | ||
6286 | |||
6287 | static inline int sd_local_flags(int level) | ||
6288 | { | ||
6289 | if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) | ||
6290 | return 0; | ||
6291 | |||
6292 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | ||
6293 | } | ||
6294 | |||
6295 | static struct sched_domain * | ||
6296 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | ||
6297 | { | ||
6298 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | ||
6299 | int level = tl->numa_level; | ||
6300 | int sd_weight = cpumask_weight( | ||
6301 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | ||
6302 | |||
6303 | *sd = (struct sched_domain){ | ||
6304 | .min_interval = sd_weight, | ||
6305 | .max_interval = 2*sd_weight, | ||
6306 | .busy_factor = 32, | ||
6307 | .imbalance_pct = 125, | ||
6308 | .cache_nice_tries = 2, | ||
6309 | .busy_idx = 3, | ||
6310 | .idle_idx = 2, | ||
6311 | .newidle_idx = 0, | ||
6312 | .wake_idx = 0, | ||
6313 | .forkexec_idx = 0, | ||
6314 | |||
6315 | .flags = 1*SD_LOAD_BALANCE | ||
6316 | | 1*SD_BALANCE_NEWIDLE | ||
6317 | | 0*SD_BALANCE_EXEC | ||
6318 | | 0*SD_BALANCE_FORK | ||
6319 | | 0*SD_BALANCE_WAKE | ||
6320 | | 0*SD_WAKE_AFFINE | ||
6321 | | 0*SD_PREFER_LOCAL | ||
6322 | | 0*SD_SHARE_CPUPOWER | ||
6323 | | 0*SD_SHARE_PKG_RESOURCES | ||
6324 | | 1*SD_SERIALIZE | ||
6325 | | 0*SD_PREFER_SIBLING | ||
6326 | | sd_local_flags(level) | ||
6327 | , | ||
6328 | .last_balance = jiffies, | ||
6329 | .balance_interval = sd_weight, | ||
6330 | }; | ||
6331 | SD_INIT_NAME(sd, NUMA); | ||
6332 | sd->private = &tl->data; | ||
6333 | |||
6334 | /* | ||
6335 | * Ugly hack to pass state to sd_numa_mask()... | ||
6336 | */ | ||
6337 | sched_domains_curr_level = tl->numa_level; | ||
6338 | |||
6339 | return sd; | ||
6340 | } | ||
6341 | |||
6342 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6343 | { | ||
6344 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6345 | } | ||
6346 | |||
6347 | static void sched_init_numa(void) | ||
6348 | { | ||
6349 | int next_distance, curr_distance = node_distance(0, 0); | ||
6350 | struct sched_domain_topology_level *tl; | ||
6351 | int level = 0; | ||
6352 | int i, j, k; | ||
6353 | |||
6354 | sched_domains_numa_scale = curr_distance; | ||
6355 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6356 | if (!sched_domains_numa_distance) | ||
6357 | return; | ||
6358 | |||
6359 | /* | ||
6360 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6361 | * unique distances in the node_distance() table. | ||
6362 | * | ||
6363 | * Assumes node_distance(0,j) includes all distances in | ||
6364 | * node_distance(i,j) in order to avoid cubic time. | ||
6365 | * | ||
6366 | * XXX: could be optimized to O(n log n) by using sort() | ||
6367 | */ | ||
6368 | next_distance = curr_distance; | ||
6369 | for (i = 0; i < nr_node_ids; i++) { | ||
6370 | for (j = 0; j < nr_node_ids; j++) { | ||
6371 | int distance = node_distance(0, j); | ||
6372 | if (distance > curr_distance && | ||
6373 | (distance < next_distance || | ||
6374 | next_distance == curr_distance)) | ||
6375 | next_distance = distance; | ||
6376 | } | ||
6377 | if (next_distance != curr_distance) { | ||
6378 | sched_domains_numa_distance[level++] = next_distance; | ||
6379 | sched_domains_numa_levels = level; | ||
6380 | curr_distance = next_distance; | ||
6381 | } else break; | ||
6382 | } | ||
6383 | /* | ||
6384 | * 'level' contains the number of unique distances, excluding the | ||
6385 | * identity distance node_distance(i,i). | ||
6386 | * | ||
6387 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6388 | * numbers. | ||
6389 | */ | ||
6390 | |||
6391 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6392 | if (!sched_domains_numa_masks) | ||
6393 | return; | ||
6394 | |||
6395 | /* | ||
6396 | * Now for each level, construct a mask per node which contains all | ||
6397 | * cpus of nodes that are that many hops away from us. | ||
6398 | */ | ||
6399 | for (i = 0; i < level; i++) { | ||
6400 | sched_domains_numa_masks[i] = | ||
6401 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6402 | if (!sched_domains_numa_masks[i]) | ||
6403 | return; | ||
6404 | |||
6405 | for (j = 0; j < nr_node_ids; j++) { | ||
6406 | struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); | ||
6407 | if (!mask) | ||
6408 | return; | ||
6409 | |||
6410 | sched_domains_numa_masks[i][j] = mask; | ||
6411 | |||
6412 | for (k = 0; k < nr_node_ids; k++) { | ||
6413 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6414 | continue; | ||
6415 | |||
6416 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6417 | } | ||
6418 | } | ||
6419 | } | ||
6420 | |||
6421 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | ||
6422 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6423 | if (!tl) | ||
6424 | return; | ||
6425 | |||
6426 | /* | ||
6427 | * Copy the default topology bits.. | ||
6428 | */ | ||
6429 | for (i = 0; default_topology[i].init; i++) | ||
6430 | tl[i] = default_topology[i]; | ||
6431 | |||
6432 | /* | ||
6433 | * .. and append 'j' levels of NUMA goodness. | ||
6434 | */ | ||
6435 | for (j = 0; j < level; i++, j++) { | ||
6436 | tl[i] = (struct sched_domain_topology_level){ | ||
6437 | .init = sd_numa_init, | ||
6438 | .mask = sd_numa_mask, | ||
6439 | .flags = SDTL_OVERLAP, | ||
6440 | .numa_level = j, | ||
6441 | }; | ||
6442 | } | ||
6443 | |||
6444 | sched_domain_topology = tl; | ||
6445 | } | ||
6446 | #else | ||
6447 | static inline void sched_init_numa(void) | ||
6448 | { | ||
6449 | } | ||
6450 | #endif /* CONFIG_NUMA */ | ||
6451 | |||
6345 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6452 | static int __sdt_alloc(const struct cpumask *cpu_map) |
6346 | { | 6453 | { |
6347 | struct sched_domain_topology_level *tl; | 6454 | struct sched_domain_topology_level *tl; |
@@ -6379,6 +6486,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6379 | if (!sg) | 6486 | if (!sg) |
6380 | return -ENOMEM; | 6487 | return -ENOMEM; |
6381 | 6488 | ||
6489 | sg->next = sg; | ||
6490 | |||
6382 | *per_cpu_ptr(sdd->sg, j) = sg; | 6491 | *per_cpu_ptr(sdd->sg, j) = sg; |
6383 | 6492 | ||
6384 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6493 | sgp = kzalloc_node(sizeof(struct sched_group_power), |
@@ -6402,16 +6511,26 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6402 | struct sd_data *sdd = &tl->data; | 6511 | struct sd_data *sdd = &tl->data; |
6403 | 6512 | ||
6404 | for_each_cpu(j, cpu_map) { | 6513 | for_each_cpu(j, cpu_map) { |
6405 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | 6514 | struct sched_domain *sd; |
6406 | if (sd && (sd->flags & SD_OVERLAP)) | 6515 | |
6407 | free_sched_groups(sd->groups, 0); | 6516 | if (sdd->sd) { |
6408 | kfree(*per_cpu_ptr(sdd->sd, j)); | 6517 | sd = *per_cpu_ptr(sdd->sd, j); |
6409 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6518 | if (sd && (sd->flags & SD_OVERLAP)) |
6410 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 6519 | free_sched_groups(sd->groups, 0); |
6520 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
6521 | } | ||
6522 | |||
6523 | if (sdd->sg) | ||
6524 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
6525 | if (sdd->sgp) | ||
6526 | kfree(*per_cpu_ptr(sdd->sgp, j)); | ||
6411 | } | 6527 | } |
6412 | free_percpu(sdd->sd); | 6528 | free_percpu(sdd->sd); |
6529 | sdd->sd = NULL; | ||
6413 | free_percpu(sdd->sg); | 6530 | free_percpu(sdd->sg); |
6531 | sdd->sg = NULL; | ||
6414 | free_percpu(sdd->sgp); | 6532 | free_percpu(sdd->sgp); |
6533 | sdd->sgp = NULL; | ||
6415 | } | 6534 | } |
6416 | } | 6535 | } |
6417 | 6536 | ||
@@ -6697,97 +6816,6 @@ match2: | |||
6697 | mutex_unlock(&sched_domains_mutex); | 6816 | mutex_unlock(&sched_domains_mutex); |
6698 | } | 6817 | } |
6699 | 6818 | ||
6700 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6701 | static void reinit_sched_domains(void) | ||
6702 | { | ||
6703 | get_online_cpus(); | ||
6704 | |||
6705 | /* Destroy domains first to force the rebuild */ | ||
6706 | partition_sched_domains(0, NULL, NULL); | ||
6707 | |||
6708 | rebuild_sched_domains(); | ||
6709 | put_online_cpus(); | ||
6710 | } | ||
6711 | |||
6712 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6713 | { | ||
6714 | unsigned int level = 0; | ||
6715 | |||
6716 | if (sscanf(buf, "%u", &level) != 1) | ||
6717 | return -EINVAL; | ||
6718 | |||
6719 | /* | ||
6720 | * level is always positive so don't check for | ||
6721 | * level < POWERSAVINGS_BALANCE_NONE which is 0 | ||
6722 | * What happens on 0 or 1 byte write, | ||
6723 | * need to check for count as well? | ||
6724 | */ | ||
6725 | |||
6726 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | ||
6727 | return -EINVAL; | ||
6728 | |||
6729 | if (smt) | ||
6730 | sched_smt_power_savings = level; | ||
6731 | else | ||
6732 | sched_mc_power_savings = level; | ||
6733 | |||
6734 | reinit_sched_domains(); | ||
6735 | |||
6736 | return count; | ||
6737 | } | ||
6738 | |||
6739 | #ifdef CONFIG_SCHED_MC | ||
6740 | static ssize_t sched_mc_power_savings_show(struct device *dev, | ||
6741 | struct device_attribute *attr, | ||
6742 | char *buf) | ||
6743 | { | ||
6744 | return sprintf(buf, "%u\n", sched_mc_power_savings); | ||
6745 | } | ||
6746 | static ssize_t sched_mc_power_savings_store(struct device *dev, | ||
6747 | struct device_attribute *attr, | ||
6748 | const char *buf, size_t count) | ||
6749 | { | ||
6750 | return sched_power_savings_store(buf, count, 0); | ||
6751 | } | ||
6752 | static DEVICE_ATTR(sched_mc_power_savings, 0644, | ||
6753 | sched_mc_power_savings_show, | ||
6754 | sched_mc_power_savings_store); | ||
6755 | #endif | ||
6756 | |||
6757 | #ifdef CONFIG_SCHED_SMT | ||
6758 | static ssize_t sched_smt_power_savings_show(struct device *dev, | ||
6759 | struct device_attribute *attr, | ||
6760 | char *buf) | ||
6761 | { | ||
6762 | return sprintf(buf, "%u\n", sched_smt_power_savings); | ||
6763 | } | ||
6764 | static ssize_t sched_smt_power_savings_store(struct device *dev, | ||
6765 | struct device_attribute *attr, | ||
6766 | const char *buf, size_t count) | ||
6767 | { | ||
6768 | return sched_power_savings_store(buf, count, 1); | ||
6769 | } | ||
6770 | static DEVICE_ATTR(sched_smt_power_savings, 0644, | ||
6771 | sched_smt_power_savings_show, | ||
6772 | sched_smt_power_savings_store); | ||
6773 | #endif | ||
6774 | |||
6775 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) | ||
6776 | { | ||
6777 | int err = 0; | ||
6778 | |||
6779 | #ifdef CONFIG_SCHED_SMT | ||
6780 | if (smt_capable()) | ||
6781 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); | ||
6782 | #endif | ||
6783 | #ifdef CONFIG_SCHED_MC | ||
6784 | if (!err && mc_capable()) | ||
6785 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); | ||
6786 | #endif | ||
6787 | return err; | ||
6788 | } | ||
6789 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
6790 | |||
6791 | /* | 6819 | /* |
6792 | * Update cpusets according to cpu_active mask. If cpusets are | 6820 | * Update cpusets according to cpu_active mask. If cpusets are |
6793 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 6821 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
@@ -6825,6 +6853,8 @@ void __init sched_init_smp(void) | |||
6825 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 6853 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6826 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 6854 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6827 | 6855 | ||
6856 | sched_init_numa(); | ||
6857 | |||
6828 | get_online_cpus(); | 6858 | get_online_cpus(); |
6829 | mutex_lock(&sched_domains_mutex); | 6859 | mutex_lock(&sched_domains_mutex); |
6830 | init_sched_domains(cpu_active_mask); | 6860 | init_sched_domains(cpu_active_mask); |
@@ -7046,6 +7076,7 @@ void __init sched_init(void) | |||
7046 | /* May be allocated at isolcpus cmdline parse time */ | 7076 | /* May be allocated at isolcpus cmdline parse time */ |
7047 | if (cpu_isolated_map == NULL) | 7077 | if (cpu_isolated_map == NULL) |
7048 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7078 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7079 | idle_thread_set_boot_cpu(); | ||
7049 | #endif | 7080 | #endif |
7050 | init_sched_fair_class(); | 7081 | init_sched_fair_class(); |
7051 | 7082 | ||
@@ -7967,13 +7998,9 @@ static struct cftype cpu_files[] = { | |||
7967 | .write_u64 = cpu_rt_period_write_uint, | 7998 | .write_u64 = cpu_rt_period_write_uint, |
7968 | }, | 7999 | }, |
7969 | #endif | 8000 | #endif |
8001 | { } /* terminate */ | ||
7970 | }; | 8002 | }; |
7971 | 8003 | ||
7972 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7973 | { | ||
7974 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7975 | } | ||
7976 | |||
7977 | struct cgroup_subsys cpu_cgroup_subsys = { | 8004 | struct cgroup_subsys cpu_cgroup_subsys = { |
7978 | .name = "cpu", | 8005 | .name = "cpu", |
7979 | .create = cpu_cgroup_create, | 8006 | .create = cpu_cgroup_create, |
@@ -7981,8 +8008,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7981 | .can_attach = cpu_cgroup_can_attach, | 8008 | .can_attach = cpu_cgroup_can_attach, |
7982 | .attach = cpu_cgroup_attach, | 8009 | .attach = cpu_cgroup_attach, |
7983 | .exit = cpu_cgroup_exit, | 8010 | .exit = cpu_cgroup_exit, |
7984 | .populate = cpu_cgroup_populate, | ||
7985 | .subsys_id = cpu_cgroup_subsys_id, | 8011 | .subsys_id = cpu_cgroup_subsys_id, |
8012 | .base_cftypes = cpu_files, | ||
7986 | .early_init = 1, | 8013 | .early_init = 1, |
7987 | }; | 8014 | }; |
7988 | 8015 | ||
@@ -8167,13 +8194,9 @@ static struct cftype files[] = { | |||
8167 | .name = "stat", | 8194 | .name = "stat", |
8168 | .read_map = cpuacct_stats_show, | 8195 | .read_map = cpuacct_stats_show, |
8169 | }, | 8196 | }, |
8197 | { } /* terminate */ | ||
8170 | }; | 8198 | }; |
8171 | 8199 | ||
8172 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8173 | { | ||
8174 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8175 | } | ||
8176 | |||
8177 | /* | 8200 | /* |
8178 | * charge this task's execution time to its accounting group. | 8201 | * charge this task's execution time to its accounting group. |
8179 | * | 8202 | * |
@@ -8205,7 +8228,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8205 | .name = "cpuacct", | 8228 | .name = "cpuacct", |
8206 | .create = cpuacct_create, | 8229 | .create = cpuacct_create, |
8207 | .destroy = cpuacct_destroy, | 8230 | .destroy = cpuacct_destroy, |
8208 | .populate = cpuacct_populate, | ||
8209 | .subsys_id = cpuacct_subsys_id, | 8231 | .subsys_id = cpuacct_subsys_id, |
8232 | .base_cftypes = files, | ||
8210 | }; | 8233 | }; |
8211 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8234 | #endif /* CONFIG_CGROUP_CPUACCT */ |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161d..6f79596e0ea9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | SPLIT_NS(spread0)); | 202 | SPLIT_NS(spread0)); |
203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
204 | cfs_rq->nr_spread_over); | 204 | cfs_rq->nr_spread_over); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 207 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
260 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 260 | SEQ_printf(m, "\ncpu#%d\n", cpu); |
261 | #endif | 261 | #endif |
262 | 262 | ||
263 | #define P(x) \ | 263 | #define P(x) \ |
264 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 264 | do { \ |
265 | if (sizeof(rq->x) == 4) \ | ||
266 | SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ | ||
267 | else \ | ||
268 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ | ||
269 | } while (0) | ||
270 | |||
265 | #define PN(x) \ | 271 | #define PN(x) \ |
266 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 272 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
267 | 273 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d97ebdc58f0..940e6d17cf96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
785 | #ifdef CONFIG_SMP | 785 | #ifdef CONFIG_SMP |
786 | if (entity_is_task(se)) | 786 | if (entity_is_task(se)) |
787 | list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 787 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); |
788 | #endif | 788 | #endif |
789 | cfs_rq->nr_running++; | 789 | cfs_rq->nr_running++; |
790 | } | 790 | } |
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2721 | * If power savings logic is enabled for a domain, see if we | 2721 | * If power savings logic is enabled for a domain, see if we |
2722 | * are not overloaded, if so, don't balance wider. | 2722 | * are not overloaded, if so, don't balance wider. |
2723 | */ | 2723 | */ |
2724 | if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { | 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { |
2725 | unsigned long power = 0; | 2725 | unsigned long power = 0; |
2726 | unsigned long nr_running = 0; | 2726 | unsigned long nr_running = 0; |
2727 | unsigned long capacity; | 2727 | unsigned long capacity; |
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2734 | 2734 | ||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
2736 | 2736 | ||
2737 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2738 | nr_running /= 2; | ||
2739 | |||
2740 | if (nr_running < capacity) | 2737 | if (nr_running < capacity) |
2741 | want_sd = 0; | 2738 | want_sd = 0; |
2742 | } | 2739 | } |
@@ -3082,7 +3079,7 @@ struct lb_env { | |||
3082 | struct rq *dst_rq; | 3079 | struct rq *dst_rq; |
3083 | 3080 | ||
3084 | enum cpu_idle_type idle; | 3081 | enum cpu_idle_type idle; |
3085 | long load_move; | 3082 | long imbalance; |
3086 | unsigned int flags; | 3083 | unsigned int flags; |
3087 | 3084 | ||
3088 | unsigned int loop; | 3085 | unsigned int loop; |
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env) | |||
3215 | 3212 | ||
3216 | static unsigned long task_h_load(struct task_struct *p); | 3213 | static unsigned long task_h_load(struct task_struct *p); |
3217 | 3214 | ||
3215 | static const unsigned int sched_nr_migrate_break = 32; | ||
3216 | |||
3218 | /* | 3217 | /* |
3219 | * move_tasks tries to move up to load_move weighted load from busiest to | 3218 | * move_tasks tries to move up to imbalance weighted load from busiest to |
3220 | * this_rq, as part of a balancing operation within domain "sd". | 3219 | * this_rq, as part of a balancing operation within domain "sd". |
3221 | * Returns 1 if successful and 0 otherwise. | 3220 | * Returns 1 if successful and 0 otherwise. |
3222 | * | 3221 | * |
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env) | |||
3229 | unsigned long load; | 3228 | unsigned long load; |
3230 | int pulled = 0; | 3229 | int pulled = 0; |
3231 | 3230 | ||
3232 | if (env->load_move <= 0) | 3231 | if (env->imbalance <= 0) |
3233 | return 0; | 3232 | return 0; |
3234 | 3233 | ||
3235 | while (!list_empty(tasks)) { | 3234 | while (!list_empty(tasks)) { |
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env) | |||
3242 | 3241 | ||
3243 | /* take a breather every nr_migrate tasks */ | 3242 | /* take a breather every nr_migrate tasks */ |
3244 | if (env->loop > env->loop_break) { | 3243 | if (env->loop > env->loop_break) { |
3245 | env->loop_break += sysctl_sched_nr_migrate; | 3244 | env->loop_break += sched_nr_migrate_break; |
3246 | env->flags |= LBF_NEED_BREAK; | 3245 | env->flags |= LBF_NEED_BREAK; |
3247 | break; | 3246 | break; |
3248 | } | 3247 | } |
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env) | |||
3252 | 3251 | ||
3253 | load = task_h_load(p); | 3252 | load = task_h_load(p); |
3254 | 3253 | ||
3255 | if (load < 16 && !env->sd->nr_balance_failed) | 3254 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) |
3256 | goto next; | 3255 | goto next; |
3257 | 3256 | ||
3258 | if ((load / 2) > env->load_move) | 3257 | if ((load / 2) > env->imbalance) |
3259 | goto next; | 3258 | goto next; |
3260 | 3259 | ||
3261 | if (!can_migrate_task(p, env)) | 3260 | if (!can_migrate_task(p, env)) |
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env) | |||
3263 | 3262 | ||
3264 | move_task(p, env); | 3263 | move_task(p, env); |
3265 | pulled++; | 3264 | pulled++; |
3266 | env->load_move -= load; | 3265 | env->imbalance -= load; |
3267 | 3266 | ||
3268 | #ifdef CONFIG_PREEMPT | 3267 | #ifdef CONFIG_PREEMPT |
3269 | /* | 3268 | /* |
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env) | |||
3279 | * We only want to steal up to the prescribed amount of | 3278 | * We only want to steal up to the prescribed amount of |
3280 | * weighted load. | 3279 | * weighted load. |
3281 | */ | 3280 | */ |
3282 | if (env->load_move <= 0) | 3281 | if (env->imbalance <= 0) |
3283 | break; | 3282 | break; |
3284 | 3283 | ||
3285 | continue; | 3284 | continue; |
@@ -3433,14 +3432,6 @@ struct sd_lb_stats { | |||
3433 | unsigned int busiest_group_weight; | 3432 | unsigned int busiest_group_weight; |
3434 | 3433 | ||
3435 | int group_imb; /* Is there imbalance in this sd */ | 3434 | int group_imb; /* Is there imbalance in this sd */ |
3436 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3437 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3438 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3439 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3440 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3441 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3442 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3443 | #endif | ||
3444 | }; | 3435 | }; |
3445 | 3436 | ||
3446 | /* | 3437 | /* |
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
3484 | return load_idx; | 3475 | return load_idx; |
3485 | } | 3476 | } |
3486 | 3477 | ||
3487 | |||
3488 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3489 | /** | ||
3490 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3491 | * the given sched_domain, during load balancing. | ||
3492 | * | ||
3493 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3494 | * @sds: Variable containing the statistics for sd. | ||
3495 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3496 | */ | ||
3497 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3498 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3499 | { | ||
3500 | /* | ||
3501 | * Busy processors will not participate in power savings | ||
3502 | * balance. | ||
3503 | */ | ||
3504 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3505 | sds->power_savings_balance = 0; | ||
3506 | else { | ||
3507 | sds->power_savings_balance = 1; | ||
3508 | sds->min_nr_running = ULONG_MAX; | ||
3509 | sds->leader_nr_running = 0; | ||
3510 | } | ||
3511 | } | ||
3512 | |||
3513 | /** | ||
3514 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3515 | * sched_domain while performing load balancing. | ||
3516 | * | ||
3517 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3518 | * @sds: Variable containing the statistics of the sched_domain | ||
3519 | * @local_group: Does group contain the CPU for which we're performing | ||
3520 | * load balancing ? | ||
3521 | * @sgs: Variable containing the statistics of the group. | ||
3522 | */ | ||
3523 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3524 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3525 | { | ||
3526 | |||
3527 | if (!sds->power_savings_balance) | ||
3528 | return; | ||
3529 | |||
3530 | /* | ||
3531 | * If the local group is idle or completely loaded | ||
3532 | * no need to do power savings balance at this domain | ||
3533 | */ | ||
3534 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3535 | !sds->this_nr_running)) | ||
3536 | sds->power_savings_balance = 0; | ||
3537 | |||
3538 | /* | ||
3539 | * If a group is already running at full capacity or idle, | ||
3540 | * don't include that group in power savings calculations | ||
3541 | */ | ||
3542 | if (!sds->power_savings_balance || | ||
3543 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3544 | !sgs->sum_nr_running) | ||
3545 | return; | ||
3546 | |||
3547 | /* | ||
3548 | * Calculate the group which has the least non-idle load. | ||
3549 | * This is the group from where we need to pick up the load | ||
3550 | * for saving power | ||
3551 | */ | ||
3552 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3553 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3554 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3555 | sds->group_min = group; | ||
3556 | sds->min_nr_running = sgs->sum_nr_running; | ||
3557 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3558 | sgs->sum_nr_running; | ||
3559 | } | ||
3560 | |||
3561 | /* | ||
3562 | * Calculate the group which is almost near its | ||
3563 | * capacity but still has some space to pick up some load | ||
3564 | * from other group and save more power | ||
3565 | */ | ||
3566 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3567 | return; | ||
3568 | |||
3569 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3570 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3571 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3572 | sds->group_leader = group; | ||
3573 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3574 | } | ||
3575 | } | ||
3576 | |||
3577 | /** | ||
3578 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3579 | * @sds: Variable containing the statistics of the sched_domain | ||
3580 | * under consideration. | ||
3581 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3582 | * @imbalance: Variable to store the imbalance. | ||
3583 | * | ||
3584 | * Description: | ||
3585 | * Check if we have potential to perform some power-savings balance. | ||
3586 | * If yes, set the busiest group to be the least loaded group in the | ||
3587 | * sched_domain, so that it's CPUs can be put to idle. | ||
3588 | * | ||
3589 | * Returns 1 if there is potential to perform power-savings balance. | ||
3590 | * Else returns 0. | ||
3591 | */ | ||
3592 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3593 | int this_cpu, unsigned long *imbalance) | ||
3594 | { | ||
3595 | if (!sds->power_savings_balance) | ||
3596 | return 0; | ||
3597 | |||
3598 | if (sds->this != sds->group_leader || | ||
3599 | sds->group_leader == sds->group_min) | ||
3600 | return 0; | ||
3601 | |||
3602 | *imbalance = sds->min_load_per_task; | ||
3603 | sds->busiest = sds->group_min; | ||
3604 | |||
3605 | return 1; | ||
3606 | |||
3607 | } | ||
3608 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3609 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3610 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3611 | { | ||
3612 | return; | ||
3613 | } | ||
3614 | |||
3615 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3616 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3617 | { | ||
3618 | return; | ||
3619 | } | ||
3620 | |||
3621 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3622 | int this_cpu, unsigned long *imbalance) | ||
3623 | { | ||
3624 | return 0; | ||
3625 | } | ||
3626 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3627 | |||
3628 | |||
3629 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 3478 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
3630 | { | 3479 | { |
3631 | return SCHED_POWER_SCALE; | 3480 | return SCHED_POWER_SCALE; |
@@ -3763,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3763 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3612 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3764 | * @sd: The sched_domain whose statistics are to be updated. | 3613 | * @sd: The sched_domain whose statistics are to be updated. |
3765 | * @group: sched_group whose statistics are to be updated. | 3614 | * @group: sched_group whose statistics are to be updated. |
3766 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3767 | * @idle: Idle status of this_cpu | ||
3768 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3615 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3769 | * @local_group: Does group contain this_cpu. | 3616 | * @local_group: Does group contain this_cpu. |
3770 | * @cpus: Set of cpus considered for load balancing. | 3617 | * @cpus: Set of cpus considered for load balancing. |
3771 | * @balance: Should we balance. | 3618 | * @balance: Should we balance. |
3772 | * @sgs: variable to hold the statistics for this group. | 3619 | * @sgs: variable to hold the statistics for this group. |
3773 | */ | 3620 | */ |
3774 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 3621 | static inline void update_sg_lb_stats(struct lb_env *env, |
3775 | struct sched_group *group, int this_cpu, | 3622 | struct sched_group *group, int load_idx, |
3776 | enum cpu_idle_type idle, int load_idx, | ||
3777 | int local_group, const struct cpumask *cpus, | 3623 | int local_group, const struct cpumask *cpus, |
3778 | int *balance, struct sg_lb_stats *sgs) | 3624 | int *balance, struct sg_lb_stats *sgs) |
3779 | { | 3625 | { |
3780 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; | 3626 | unsigned long nr_running, max_nr_running, min_nr_running; |
3781 | int i; | 3627 | unsigned long load, max_cpu_load, min_cpu_load; |
3782 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3628 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
3783 | unsigned long avg_load_per_task = 0; | 3629 | unsigned long avg_load_per_task = 0; |
3630 | int i; | ||
3784 | 3631 | ||
3785 | if (local_group) | 3632 | if (local_group) |
3786 | balance_cpu = group_first_cpu(group); | 3633 | balance_cpu = group_first_cpu(group); |
@@ -3789,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3789 | max_cpu_load = 0; | 3636 | max_cpu_load = 0; |
3790 | min_cpu_load = ~0UL; | 3637 | min_cpu_load = ~0UL; |
3791 | max_nr_running = 0; | 3638 | max_nr_running = 0; |
3639 | min_nr_running = ~0UL; | ||
3792 | 3640 | ||
3793 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3641 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
3794 | struct rq *rq = cpu_rq(i); | 3642 | struct rq *rq = cpu_rq(i); |
3795 | 3643 | ||
3644 | nr_running = rq->nr_running; | ||
3645 | |||
3796 | /* Bias balancing toward cpus of our domain */ | 3646 | /* Bias balancing toward cpus of our domain */ |
3797 | if (local_group) { | 3647 | if (local_group) { |
3798 | if (idle_cpu(i) && !first_idle_cpu) { | 3648 | if (idle_cpu(i) && !first_idle_cpu) { |
@@ -3803,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3803 | load = target_load(i, load_idx); | 3653 | load = target_load(i, load_idx); |
3804 | } else { | 3654 | } else { |
3805 | load = source_load(i, load_idx); | 3655 | load = source_load(i, load_idx); |
3806 | if (load > max_cpu_load) { | 3656 | if (load > max_cpu_load) |
3807 | max_cpu_load = load; | 3657 | max_cpu_load = load; |
3808 | max_nr_running = rq->nr_running; | ||
3809 | } | ||
3810 | if (min_cpu_load > load) | 3658 | if (min_cpu_load > load) |
3811 | min_cpu_load = load; | 3659 | min_cpu_load = load; |
3660 | |||
3661 | if (nr_running > max_nr_running) | ||
3662 | max_nr_running = nr_running; | ||
3663 | if (min_nr_running > nr_running) | ||
3664 | min_nr_running = nr_running; | ||
3812 | } | 3665 | } |
3813 | 3666 | ||
3814 | sgs->group_load += load; | 3667 | sgs->group_load += load; |
3815 | sgs->sum_nr_running += rq->nr_running; | 3668 | sgs->sum_nr_running += nr_running; |
3816 | sgs->sum_weighted_load += weighted_cpuload(i); | 3669 | sgs->sum_weighted_load += weighted_cpuload(i); |
3817 | if (idle_cpu(i)) | 3670 | if (idle_cpu(i)) |
3818 | sgs->idle_cpus++; | 3671 | sgs->idle_cpus++; |
@@ -3825,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3825 | * to do the newly idle load balance. | 3678 | * to do the newly idle load balance. |
3826 | */ | 3679 | */ |
3827 | if (local_group) { | 3680 | if (local_group) { |
3828 | if (idle != CPU_NEWLY_IDLE) { | 3681 | if (env->idle != CPU_NEWLY_IDLE) { |
3829 | if (balance_cpu != this_cpu) { | 3682 | if (balance_cpu != env->dst_cpu) { |
3830 | *balance = 0; | 3683 | *balance = 0; |
3831 | return; | 3684 | return; |
3832 | } | 3685 | } |
3833 | update_group_power(sd, this_cpu); | 3686 | update_group_power(env->sd, env->dst_cpu); |
3834 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | 3687 | } else if (time_after_eq(jiffies, group->sgp->next_update)) |
3835 | update_group_power(sd, this_cpu); | 3688 | update_group_power(env->sd, env->dst_cpu); |
3836 | } | 3689 | } |
3837 | 3690 | ||
3838 | /* Adjust by relative CPU power of the group */ | 3691 | /* Adjust by relative CPU power of the group */ |
@@ -3850,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3850 | if (sgs->sum_nr_running) | 3703 | if (sgs->sum_nr_running) |
3851 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 3704 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
3852 | 3705 | ||
3853 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 3706 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && |
3707 | (max_nr_running - min_nr_running) > 1) | ||
3854 | sgs->group_imb = 1; | 3708 | sgs->group_imb = 1; |
3855 | 3709 | ||
3856 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | 3710 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
3857 | SCHED_POWER_SCALE); | 3711 | SCHED_POWER_SCALE); |
3858 | if (!sgs->group_capacity) | 3712 | if (!sgs->group_capacity) |
3859 | sgs->group_capacity = fix_small_capacity(sd, group); | 3713 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
3860 | sgs->group_weight = group->group_weight; | 3714 | sgs->group_weight = group->group_weight; |
3861 | 3715 | ||
3862 | if (sgs->group_capacity > sgs->sum_nr_running) | 3716 | if (sgs->group_capacity > sgs->sum_nr_running) |
@@ -3874,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3874 | * Determine if @sg is a busier group than the previously selected | 3728 | * Determine if @sg is a busier group than the previously selected |
3875 | * busiest group. | 3729 | * busiest group. |
3876 | */ | 3730 | */ |
3877 | static bool update_sd_pick_busiest(struct sched_domain *sd, | 3731 | static bool update_sd_pick_busiest(struct lb_env *env, |
3878 | struct sd_lb_stats *sds, | 3732 | struct sd_lb_stats *sds, |
3879 | struct sched_group *sg, | 3733 | struct sched_group *sg, |
3880 | struct sg_lb_stats *sgs, | 3734 | struct sg_lb_stats *sgs) |
3881 | int this_cpu) | ||
3882 | { | 3735 | { |
3883 | if (sgs->avg_load <= sds->max_load) | 3736 | if (sgs->avg_load <= sds->max_load) |
3884 | return false; | 3737 | return false; |
@@ -3894,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3894 | * numbered CPUs in the group, therefore mark all groups | 3747 | * numbered CPUs in the group, therefore mark all groups |
3895 | * higher than ourself as busy. | 3748 | * higher than ourself as busy. |
3896 | */ | 3749 | */ |
3897 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 3750 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && |
3898 | this_cpu < group_first_cpu(sg)) { | 3751 | env->dst_cpu < group_first_cpu(sg)) { |
3899 | if (!sds->busiest) | 3752 | if (!sds->busiest) |
3900 | return true; | 3753 | return true; |
3901 | 3754 | ||
@@ -3915,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3915 | * @balance: Should we balance. | 3768 | * @balance: Should we balance. |
3916 | * @sds: variable to hold the statistics for this sched_domain. | 3769 | * @sds: variable to hold the statistics for this sched_domain. |
3917 | */ | 3770 | */ |
3918 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 3771 | static inline void update_sd_lb_stats(struct lb_env *env, |
3919 | enum cpu_idle_type idle, const struct cpumask *cpus, | 3772 | const struct cpumask *cpus, |
3920 | int *balance, struct sd_lb_stats *sds) | 3773 | int *balance, struct sd_lb_stats *sds) |
3921 | { | 3774 | { |
3922 | struct sched_domain *child = sd->child; | 3775 | struct sched_domain *child = env->sd->child; |
3923 | struct sched_group *sg = sd->groups; | 3776 | struct sched_group *sg = env->sd->groups; |
3924 | struct sg_lb_stats sgs; | 3777 | struct sg_lb_stats sgs; |
3925 | int load_idx, prefer_sibling = 0; | 3778 | int load_idx, prefer_sibling = 0; |
3926 | 3779 | ||
3927 | if (child && child->flags & SD_PREFER_SIBLING) | 3780 | if (child && child->flags & SD_PREFER_SIBLING) |
3928 | prefer_sibling = 1; | 3781 | prefer_sibling = 1; |
3929 | 3782 | ||
3930 | init_sd_power_savings_stats(sd, sds, idle); | 3783 | load_idx = get_sd_load_idx(env->sd, env->idle); |
3931 | load_idx = get_sd_load_idx(sd, idle); | ||
3932 | 3784 | ||
3933 | do { | 3785 | do { |
3934 | int local_group; | 3786 | int local_group; |
3935 | 3787 | ||
3936 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 3788 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3937 | memset(&sgs, 0, sizeof(sgs)); | 3789 | memset(&sgs, 0, sizeof(sgs)); |
3938 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, | 3790 | update_sg_lb_stats(env, sg, load_idx, local_group, |
3939 | local_group, cpus, balance, &sgs); | 3791 | cpus, balance, &sgs); |
3940 | 3792 | ||
3941 | if (local_group && !(*balance)) | 3793 | if (local_group && !(*balance)) |
3942 | return; | 3794 | return; |
@@ -3964,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3964 | sds->this_load_per_task = sgs.sum_weighted_load; | 3816 | sds->this_load_per_task = sgs.sum_weighted_load; |
3965 | sds->this_has_capacity = sgs.group_has_capacity; | 3817 | sds->this_has_capacity = sgs.group_has_capacity; |
3966 | sds->this_idle_cpus = sgs.idle_cpus; | 3818 | sds->this_idle_cpus = sgs.idle_cpus; |
3967 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 3819 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { |
3968 | sds->max_load = sgs.avg_load; | 3820 | sds->max_load = sgs.avg_load; |
3969 | sds->busiest = sg; | 3821 | sds->busiest = sg; |
3970 | sds->busiest_nr_running = sgs.sum_nr_running; | 3822 | sds->busiest_nr_running = sgs.sum_nr_running; |
@@ -3976,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3976 | sds->group_imb = sgs.group_imb; | 3828 | sds->group_imb = sgs.group_imb; |
3977 | } | 3829 | } |
3978 | 3830 | ||
3979 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); | ||
3980 | sg = sg->next; | 3831 | sg = sg->next; |
3981 | } while (sg != sd->groups); | 3832 | } while (sg != env->sd->groups); |
3982 | } | 3833 | } |
3983 | 3834 | ||
3984 | /** | 3835 | /** |
@@ -4006,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
4006 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | 3857 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. |
4007 | * @imbalance: returns amount of imbalanced due to packing. | 3858 | * @imbalance: returns amount of imbalanced due to packing. |
4008 | */ | 3859 | */ |
4009 | static int check_asym_packing(struct sched_domain *sd, | 3860 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
4010 | struct sd_lb_stats *sds, | ||
4011 | int this_cpu, unsigned long *imbalance) | ||
4012 | { | 3861 | { |
4013 | int busiest_cpu; | 3862 | int busiest_cpu; |
4014 | 3863 | ||
4015 | if (!(sd->flags & SD_ASYM_PACKING)) | 3864 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
4016 | return 0; | 3865 | return 0; |
4017 | 3866 | ||
4018 | if (!sds->busiest) | 3867 | if (!sds->busiest) |
4019 | return 0; | 3868 | return 0; |
4020 | 3869 | ||
4021 | busiest_cpu = group_first_cpu(sds->busiest); | 3870 | busiest_cpu = group_first_cpu(sds->busiest); |
4022 | if (this_cpu > busiest_cpu) | 3871 | if (env->dst_cpu > busiest_cpu) |
4023 | return 0; | 3872 | return 0; |
4024 | 3873 | ||
4025 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, | 3874 | env->imbalance = DIV_ROUND_CLOSEST( |
4026 | SCHED_POWER_SCALE); | 3875 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); |
3876 | |||
4027 | return 1; | 3877 | return 1; |
4028 | } | 3878 | } |
4029 | 3879 | ||
@@ -4035,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd, | |||
4035 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | 3885 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. |
4036 | * @imbalance: Variable to store the imbalance. | 3886 | * @imbalance: Variable to store the imbalance. |
4037 | */ | 3887 | */ |
4038 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | 3888 | static inline |
4039 | int this_cpu, unsigned long *imbalance) | 3889 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4040 | { | 3890 | { |
4041 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 3891 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4042 | unsigned int imbn = 2; | 3892 | unsigned int imbn = 2; |
@@ -4047,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4047 | if (sds->busiest_load_per_task > | 3897 | if (sds->busiest_load_per_task > |
4048 | sds->this_load_per_task) | 3898 | sds->this_load_per_task) |
4049 | imbn = 1; | 3899 | imbn = 1; |
4050 | } else | 3900 | } else { |
4051 | sds->this_load_per_task = | 3901 | sds->this_load_per_task = |
4052 | cpu_avg_load_per_task(this_cpu); | 3902 | cpu_avg_load_per_task(env->dst_cpu); |
3903 | } | ||
4053 | 3904 | ||
4054 | scaled_busy_load_per_task = sds->busiest_load_per_task | 3905 | scaled_busy_load_per_task = sds->busiest_load_per_task |
4055 | * SCHED_POWER_SCALE; | 3906 | * SCHED_POWER_SCALE; |
@@ -4057,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4057 | 3908 | ||
4058 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3909 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
4059 | (scaled_busy_load_per_task * imbn)) { | 3910 | (scaled_busy_load_per_task * imbn)) { |
4060 | *imbalance = sds->busiest_load_per_task; | 3911 | env->imbalance = sds->busiest_load_per_task; |
4061 | return; | 3912 | return; |
4062 | } | 3913 | } |
4063 | 3914 | ||
@@ -4094,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4094 | 3945 | ||
4095 | /* Move if we gain throughput */ | 3946 | /* Move if we gain throughput */ |
4096 | if (pwr_move > pwr_now) | 3947 | if (pwr_move > pwr_now) |
4097 | *imbalance = sds->busiest_load_per_task; | 3948 | env->imbalance = sds->busiest_load_per_task; |
4098 | } | 3949 | } |
4099 | 3950 | ||
4100 | /** | 3951 | /** |
4101 | * calculate_imbalance - Calculate the amount of imbalance present within the | 3952 | * calculate_imbalance - Calculate the amount of imbalance present within the |
4102 | * groups of a given sched_domain during load balance. | 3953 | * groups of a given sched_domain during load balance. |
3954 | * @env: load balance environment | ||
4103 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | 3955 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. |
4104 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
4105 | * @imbalance: The variable to store the imbalance. | ||
4106 | */ | 3956 | */ |
4107 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | 3957 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4108 | unsigned long *imbalance) | ||
4109 | { | 3958 | { |
4110 | unsigned long max_pull, load_above_capacity = ~0UL; | 3959 | unsigned long max_pull, load_above_capacity = ~0UL; |
4111 | 3960 | ||
@@ -4121,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4121 | * its cpu_power, while calculating max_load..) | 3970 | * its cpu_power, while calculating max_load..) |
4122 | */ | 3971 | */ |
4123 | if (sds->max_load < sds->avg_load) { | 3972 | if (sds->max_load < sds->avg_load) { |
4124 | *imbalance = 0; | 3973 | env->imbalance = 0; |
4125 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3974 | return fix_small_imbalance(env, sds); |
4126 | } | 3975 | } |
4127 | 3976 | ||
4128 | if (!sds->group_imb) { | 3977 | if (!sds->group_imb) { |
@@ -4150,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4150 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 3999 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
4151 | 4000 | ||
4152 | /* How much load to actually move to equalise the imbalance */ | 4001 | /* How much load to actually move to equalise the imbalance */ |
4153 | *imbalance = min(max_pull * sds->busiest->sgp->power, | 4002 | env->imbalance = min(max_pull * sds->busiest->sgp->power, |
4154 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4003 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
4155 | / SCHED_POWER_SCALE; | 4004 | / SCHED_POWER_SCALE; |
4156 | 4005 | ||
@@ -4160,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4160 | * a think about bumping its value to force at least one task to be | 4009 | * a think about bumping its value to force at least one task to be |
4161 | * moved | 4010 | * moved |
4162 | */ | 4011 | */ |
4163 | if (*imbalance < sds->busiest_load_per_task) | 4012 | if (env->imbalance < sds->busiest_load_per_task) |
4164 | return fix_small_imbalance(sds, this_cpu, imbalance); | 4013 | return fix_small_imbalance(env, sds); |
4165 | 4014 | ||
4166 | } | 4015 | } |
4167 | 4016 | ||
@@ -4192,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4192 | * put to idle by rebalancing its tasks onto our group. | 4041 | * put to idle by rebalancing its tasks onto our group. |
4193 | */ | 4042 | */ |
4194 | static struct sched_group * | 4043 | static struct sched_group * |
4195 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 4044 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) |
4196 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4197 | const struct cpumask *cpus, int *balance) | ||
4198 | { | 4045 | { |
4199 | struct sd_lb_stats sds; | 4046 | struct sd_lb_stats sds; |
4200 | 4047 | ||
@@ -4204,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4204 | * Compute the various statistics relavent for load balancing at | 4051 | * Compute the various statistics relavent for load balancing at |
4205 | * this level. | 4052 | * this level. |
4206 | */ | 4053 | */ |
4207 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); | 4054 | update_sd_lb_stats(env, cpus, balance, &sds); |
4208 | 4055 | ||
4209 | /* | 4056 | /* |
4210 | * this_cpu is not the appropriate cpu to perform load balancing at | 4057 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4213,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4213 | if (!(*balance)) | 4060 | if (!(*balance)) |
4214 | goto ret; | 4061 | goto ret; |
4215 | 4062 | ||
4216 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | 4063 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4217 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 4064 | check_asym_packing(env, &sds)) |
4218 | return sds.busiest; | 4065 | return sds.busiest; |
4219 | 4066 | ||
4220 | /* There is no busy sibling group to pull tasks from */ | 4067 | /* There is no busy sibling group to pull tasks from */ |
@@ -4232,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4232 | goto force_balance; | 4079 | goto force_balance; |
4233 | 4080 | ||
4234 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4081 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4235 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4082 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
4236 | !sds.busiest_has_capacity) | 4083 | !sds.busiest_has_capacity) |
4237 | goto force_balance; | 4084 | goto force_balance; |
4238 | 4085 | ||
@@ -4250,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4250 | if (sds.this_load >= sds.avg_load) | 4097 | if (sds.this_load >= sds.avg_load) |
4251 | goto out_balanced; | 4098 | goto out_balanced; |
4252 | 4099 | ||
4253 | if (idle == CPU_IDLE) { | 4100 | if (env->idle == CPU_IDLE) { |
4254 | /* | 4101 | /* |
4255 | * This cpu is idle. If the busiest group load doesn't | 4102 | * This cpu is idle. If the busiest group load doesn't |
4256 | * have more tasks than the number of available cpu's and | 4103 | * have more tasks than the number of available cpu's and |
@@ -4265,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4265 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 4112 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4266 | * imbalance_pct to be conservative. | 4113 | * imbalance_pct to be conservative. |
4267 | */ | 4114 | */ |
4268 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 4115 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) |
4269 | goto out_balanced; | 4116 | goto out_balanced; |
4270 | } | 4117 | } |
4271 | 4118 | ||
4272 | force_balance: | 4119 | force_balance: |
4273 | /* Looks like there is an imbalance. Compute it */ | 4120 | /* Looks like there is an imbalance. Compute it */ |
4274 | calculate_imbalance(&sds, this_cpu, imbalance); | 4121 | calculate_imbalance(env, &sds); |
4275 | return sds.busiest; | 4122 | return sds.busiest; |
4276 | 4123 | ||
4277 | out_balanced: | 4124 | out_balanced: |
4278 | /* | ||
4279 | * There is no obvious imbalance. But check if we can do some balancing | ||
4280 | * to save power. | ||
4281 | */ | ||
4282 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4283 | return sds.busiest; | ||
4284 | ret: | 4125 | ret: |
4285 | *imbalance = 0; | 4126 | env->imbalance = 0; |
4286 | return NULL; | 4127 | return NULL; |
4287 | } | 4128 | } |
4288 | 4129 | ||
4289 | /* | 4130 | /* |
4290 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4131 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4291 | */ | 4132 | */ |
4292 | static struct rq * | 4133 | static struct rq *find_busiest_queue(struct lb_env *env, |
4293 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | 4134 | struct sched_group *group, |
4294 | enum cpu_idle_type idle, unsigned long imbalance, | 4135 | const struct cpumask *cpus) |
4295 | const struct cpumask *cpus) | ||
4296 | { | 4136 | { |
4297 | struct rq *busiest = NULL, *rq; | 4137 | struct rq *busiest = NULL, *rq; |
4298 | unsigned long max_load = 0; | 4138 | unsigned long max_load = 0; |
@@ -4305,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4305 | unsigned long wl; | 4145 | unsigned long wl; |
4306 | 4146 | ||
4307 | if (!capacity) | 4147 | if (!capacity) |
4308 | capacity = fix_small_capacity(sd, group); | 4148 | capacity = fix_small_capacity(env->sd, group); |
4309 | 4149 | ||
4310 | if (!cpumask_test_cpu(i, cpus)) | 4150 | if (!cpumask_test_cpu(i, cpus)) |
4311 | continue; | 4151 | continue; |
@@ -4317,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4317 | * When comparing with imbalance, use weighted_cpuload() | 4157 | * When comparing with imbalance, use weighted_cpuload() |
4318 | * which is not scaled with the cpu power. | 4158 | * which is not scaled with the cpu power. |
4319 | */ | 4159 | */ |
4320 | if (capacity && rq->nr_running == 1 && wl > imbalance) | 4160 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) |
4321 | continue; | 4161 | continue; |
4322 | 4162 | ||
4323 | /* | 4163 | /* |
@@ -4346,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4346 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4186 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4347 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4187 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
4348 | 4188 | ||
4349 | static int need_active_balance(struct sched_domain *sd, int idle, | 4189 | static int need_active_balance(struct lb_env *env) |
4350 | int busiest_cpu, int this_cpu) | ||
4351 | { | 4190 | { |
4352 | if (idle == CPU_NEWLY_IDLE) { | 4191 | struct sched_domain *sd = env->sd; |
4192 | |||
4193 | if (env->idle == CPU_NEWLY_IDLE) { | ||
4353 | 4194 | ||
4354 | /* | 4195 | /* |
4355 | * ASYM_PACKING needs to force migrate tasks from busy but | 4196 | * ASYM_PACKING needs to force migrate tasks from busy but |
4356 | * higher numbered CPUs in order to pack all tasks in the | 4197 | * higher numbered CPUs in order to pack all tasks in the |
4357 | * lowest numbered CPUs. | 4198 | * lowest numbered CPUs. |
4358 | */ | 4199 | */ |
4359 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | 4200 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) |
4360 | return 1; | 4201 | return 1; |
4361 | |||
4362 | /* | ||
4363 | * The only task running in a non-idle cpu can be moved to this | ||
4364 | * cpu in an attempt to completely freeup the other CPU | ||
4365 | * package. | ||
4366 | * | ||
4367 | * The package power saving logic comes from | ||
4368 | * find_busiest_group(). If there are no imbalance, then | ||
4369 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4370 | * f_b_g() will select a group from which a running task may be | ||
4371 | * pulled to this cpu in order to make the other package idle. | ||
4372 | * If there is no opportunity to make a package idle and if | ||
4373 | * there are no imbalance, then f_b_g() will return NULL and no | ||
4374 | * action will be taken in load_balance_newidle(). | ||
4375 | * | ||
4376 | * Under normal task pull operation due to imbalance, there | ||
4377 | * will be more than one task in the source run queue and | ||
4378 | * move_tasks() will succeed. ld_moved will be true and this | ||
4379 | * active balance code will not be triggered. | ||
4380 | */ | ||
4381 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4382 | return 0; | ||
4383 | } | 4202 | } |
4384 | 4203 | ||
4385 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 4204 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
@@ -4397,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4397 | { | 4216 | { |
4398 | int ld_moved, active_balance = 0; | 4217 | int ld_moved, active_balance = 0; |
4399 | struct sched_group *group; | 4218 | struct sched_group *group; |
4400 | unsigned long imbalance; | ||
4401 | struct rq *busiest; | 4219 | struct rq *busiest; |
4402 | unsigned long flags; | 4220 | unsigned long flags; |
4403 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4221 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
@@ -4407,7 +4225,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4407 | .dst_cpu = this_cpu, | 4225 | .dst_cpu = this_cpu, |
4408 | .dst_rq = this_rq, | 4226 | .dst_rq = this_rq, |
4409 | .idle = idle, | 4227 | .idle = idle, |
4410 | .loop_break = sysctl_sched_nr_migrate, | 4228 | .loop_break = sched_nr_migrate_break, |
4411 | }; | 4229 | }; |
4412 | 4230 | ||
4413 | cpumask_copy(cpus, cpu_active_mask); | 4231 | cpumask_copy(cpus, cpu_active_mask); |
@@ -4415,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4415 | schedstat_inc(sd, lb_count[idle]); | 4233 | schedstat_inc(sd, lb_count[idle]); |
4416 | 4234 | ||
4417 | redo: | 4235 | redo: |
4418 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, | 4236 | group = find_busiest_group(&env, cpus, balance); |
4419 | cpus, balance); | ||
4420 | 4237 | ||
4421 | if (*balance == 0) | 4238 | if (*balance == 0) |
4422 | goto out_balanced; | 4239 | goto out_balanced; |
@@ -4426,7 +4243,7 @@ redo: | |||
4426 | goto out_balanced; | 4243 | goto out_balanced; |
4427 | } | 4244 | } |
4428 | 4245 | ||
4429 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); | 4246 | busiest = find_busiest_queue(&env, group, cpus); |
4430 | if (!busiest) { | 4247 | if (!busiest) { |
4431 | schedstat_inc(sd, lb_nobusyq[idle]); | 4248 | schedstat_inc(sd, lb_nobusyq[idle]); |
4432 | goto out_balanced; | 4249 | goto out_balanced; |
@@ -4434,7 +4251,7 @@ redo: | |||
4434 | 4251 | ||
4435 | BUG_ON(busiest == this_rq); | 4252 | BUG_ON(busiest == this_rq); |
4436 | 4253 | ||
4437 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 4254 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4438 | 4255 | ||
4439 | ld_moved = 0; | 4256 | ld_moved = 0; |
4440 | if (busiest->nr_running > 1) { | 4257 | if (busiest->nr_running > 1) { |
@@ -4445,10 +4262,9 @@ redo: | |||
4445 | * correctly treated as an imbalance. | 4262 | * correctly treated as an imbalance. |
4446 | */ | 4263 | */ |
4447 | env.flags |= LBF_ALL_PINNED; | 4264 | env.flags |= LBF_ALL_PINNED; |
4448 | env.load_move = imbalance; | 4265 | env.src_cpu = busiest->cpu; |
4449 | env.src_cpu = busiest->cpu; | 4266 | env.src_rq = busiest; |
4450 | env.src_rq = busiest; | 4267 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4451 | env.loop_max = busiest->nr_running; | ||
4452 | 4268 | ||
4453 | more_balance: | 4269 | more_balance: |
4454 | local_irq_save(flags); | 4270 | local_irq_save(flags); |
@@ -4490,7 +4306,7 @@ more_balance: | |||
4490 | if (idle != CPU_NEWLY_IDLE) | 4306 | if (idle != CPU_NEWLY_IDLE) |
4491 | sd->nr_balance_failed++; | 4307 | sd->nr_balance_failed++; |
4492 | 4308 | ||
4493 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { | 4309 | if (need_active_balance(&env)) { |
4494 | raw_spin_lock_irqsave(&busiest->lock, flags); | 4310 | raw_spin_lock_irqsave(&busiest->lock, flags); |
4495 | 4311 | ||
4496 | /* don't kick the active_load_balance_cpu_stop, | 4312 | /* don't kick the active_load_balance_cpu_stop, |
@@ -4517,10 +4333,11 @@ more_balance: | |||
4517 | } | 4333 | } |
4518 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 4334 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
4519 | 4335 | ||
4520 | if (active_balance) | 4336 | if (active_balance) { |
4521 | stop_one_cpu_nowait(cpu_of(busiest), | 4337 | stop_one_cpu_nowait(cpu_of(busiest), |
4522 | active_load_balance_cpu_stop, busiest, | 4338 | active_load_balance_cpu_stop, busiest, |
4523 | &busiest->active_balance_work); | 4339 | &busiest->active_balance_work); |
4340 | } | ||
4524 | 4341 | ||
4525 | /* | 4342 | /* |
4526 | * We've kicked active balancing, reset the failure | 4343 | * We've kicked active balancing, reset the failure |
@@ -4701,104 +4518,15 @@ static struct { | |||
4701 | unsigned long next_balance; /* in jiffy units */ | 4518 | unsigned long next_balance; /* in jiffy units */ |
4702 | } nohz ____cacheline_aligned; | 4519 | } nohz ____cacheline_aligned; |
4703 | 4520 | ||
4704 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4521 | static inline int find_new_ilb(int call_cpu) |
4705 | /** | ||
4706 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4707 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4708 | * be returned. | ||
4709 | * @flag: The flag to check for the lowest sched_domain | ||
4710 | * for the given cpu. | ||
4711 | * | ||
4712 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4713 | */ | ||
4714 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4715 | { | ||
4716 | struct sched_domain *sd; | ||
4717 | |||
4718 | for_each_domain(cpu, sd) | ||
4719 | if (sd->flags & flag) | ||
4720 | break; | ||
4721 | |||
4722 | return sd; | ||
4723 | } | ||
4724 | |||
4725 | /** | ||
4726 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4727 | * @cpu: The cpu whose domains we're iterating over. | ||
4728 | * @sd: variable holding the value of the power_savings_sd | ||
4729 | * for cpu. | ||
4730 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4731 | * | ||
4732 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4733 | * set, starting from the lowest sched_domain to the highest. | ||
4734 | */ | ||
4735 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4736 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4737 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4738 | |||
4739 | /** | ||
4740 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4741 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4742 | * | ||
4743 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4744 | * Else, returns >= nr_cpu_ids. | ||
4745 | * | ||
4746 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4747 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4748 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4749 | * when there are other idle cpu's which are better suited for that job. | ||
4750 | */ | ||
4751 | static int find_new_ilb(int cpu) | ||
4752 | { | 4522 | { |
4753 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 4523 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
4754 | struct sched_group *ilbg; | ||
4755 | struct sched_domain *sd; | ||
4756 | |||
4757 | /* | ||
4758 | * Have idle load balancer selection from semi-idle packages only | ||
4759 | * when power-aware load balancing is enabled | ||
4760 | */ | ||
4761 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4762 | goto out_done; | ||
4763 | |||
4764 | /* | ||
4765 | * Optimize for the case when we have no idle CPUs or only one | ||
4766 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4767 | */ | ||
4768 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | ||
4769 | goto out_done; | ||
4770 | |||
4771 | rcu_read_lock(); | ||
4772 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4773 | ilbg = sd->groups; | ||
4774 | |||
4775 | do { | ||
4776 | if (ilbg->group_weight != | ||
4777 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { | ||
4778 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4779 | sched_group_cpus(ilbg)); | ||
4780 | goto unlock; | ||
4781 | } | ||
4782 | |||
4783 | ilbg = ilbg->next; | ||
4784 | |||
4785 | } while (ilbg != sd->groups); | ||
4786 | } | ||
4787 | unlock: | ||
4788 | rcu_read_unlock(); | ||
4789 | 4524 | ||
4790 | out_done: | ||
4791 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 4525 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4792 | return ilb; | 4526 | return ilb; |
4793 | 4527 | ||
4794 | return nr_cpu_ids; | 4528 | return nr_cpu_ids; |
4795 | } | 4529 | } |
4796 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4797 | static inline int find_new_ilb(int call_cpu) | ||
4798 | { | ||
4799 | return nr_cpu_ids; | ||
4800 | } | ||
4801 | #endif | ||
4802 | 4530 | ||
4803 | /* | 4531 | /* |
4804 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 4532 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the |
@@ -5021,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
5021 | 4749 | ||
5022 | raw_spin_lock_irq(&this_rq->lock); | 4750 | raw_spin_lock_irq(&this_rq->lock); |
5023 | update_rq_clock(this_rq); | 4751 | update_rq_clock(this_rq); |
5024 | update_cpu_load(this_rq); | 4752 | update_idle_cpu_load(this_rq); |
5025 | raw_spin_unlock_irq(&this_rq->lock); | 4753 | raw_spin_unlock_irq(&this_rq->lock); |
5026 | 4754 | ||
5027 | rebalance_domains(balance_cpu, CPU_IDLE); | 4755 | rebalance_domains(balance_cpu, CPU_IDLE); |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73913d0..de00a486c5c6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
68 | 68 | ||
69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
71 | SCHED_FEAT(LB_MIN, false) | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f289..b44d604b35d1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
5 | * | 5 | * |
6 | * (NOTE: these are not related to SCHED_IDLE tasks which are | 6 | * (NOTE: these are not related to SCHED_IDLE tasks which are |
7 | * handled in sched_fair.c) | 7 | * handled in sched/fair.c) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d0..c5565c3c515f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1803 | static void set_cpus_allowed_rt(struct task_struct *p, | 1803 | static void set_cpus_allowed_rt(struct task_struct *p, |
1804 | const struct cpumask *new_mask) | 1804 | const struct cpumask *new_mask) |
1805 | { | 1805 | { |
1806 | int weight = cpumask_weight(new_mask); | 1806 | struct rq *rq; |
1807 | int weight; | ||
1807 | 1808 | ||
1808 | BUG_ON(!rt_task(p)); | 1809 | BUG_ON(!rt_task(p)); |
1809 | 1810 | ||
1810 | /* | 1811 | if (!p->on_rq) |
1811 | * Update the migration status of the RQ if we have an RT task | 1812 | return; |
1812 | * which is running AND changing its weight value. | ||
1813 | */ | ||
1814 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
1815 | struct rq *rq = task_rq(p); | ||
1816 | |||
1817 | if (!task_current(rq, p)) { | ||
1818 | /* | ||
1819 | * Make sure we dequeue this task from the pushable list | ||
1820 | * before going further. It will either remain off of | ||
1821 | * the list because we are no longer pushable, or it | ||
1822 | * will be requeued. | ||
1823 | */ | ||
1824 | if (p->rt.nr_cpus_allowed > 1) | ||
1825 | dequeue_pushable_task(rq, p); | ||
1826 | 1813 | ||
1827 | /* | 1814 | weight = cpumask_weight(new_mask); |
1828 | * Requeue if our weight is changing and still > 1 | ||
1829 | */ | ||
1830 | if (weight > 1) | ||
1831 | enqueue_pushable_task(rq, p); | ||
1832 | 1815 | ||
1833 | } | 1816 | /* |
1817 | * Only update if the process switches between being able to | |
1818 | * migrate and not being able to migrate. | |
1819 | */ | ||
1820 | if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) | ||
1821 | return; | ||
1834 | 1822 | ||
1835 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | 1823 | rq = task_rq(p); |
1836 | rq->rt.rt_nr_migratory++; | ||
1837 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
1838 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1839 | rq->rt.rt_nr_migratory--; | ||
1840 | } | ||
1841 | 1824 | ||
1842 | update_rt_migration(&rq->rt); | 1825 | /* |
1826 | * The process used to be able to migrate OR it can now migrate | ||
1827 | */ | ||
1828 | if (weight <= 1) { | ||
1829 | if (!task_current(rq, p)) | ||
1830 | dequeue_pushable_task(rq, p); | ||
1831 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1832 | rq->rt.rt_nr_migratory--; | ||
1833 | } else { | ||
1834 | if (!task_current(rq, p)) | ||
1835 | enqueue_pushable_task(rq, p); | ||
1836 | rq->rt.rt_nr_migratory++; | ||
1843 | } | 1837 | } |
1838 | |||
1839 | update_rt_migration(&rq->rt); | ||
1844 | } | 1840 | } |
1845 | 1841 | ||
1846 | /* Assumes rq->lock is held */ | 1842 | /* Assumes rq->lock is held */ |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52e..ba9dccfd24ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -201,7 +201,7 @@ struct cfs_bandwidth { }; | |||
201 | /* CFS-related fields in a runqueue */ | 201 | /* CFS-related fields in a runqueue */ |
202 | struct cfs_rq { | 202 | struct cfs_rq { |
203 | struct load_weight load; | 203 | struct load_weight load; |
204 | unsigned long nr_running, h_nr_running; | 204 | unsigned int nr_running, h_nr_running; |
205 | 205 | ||
206 | u64 exec_clock; | 206 | u64 exec_clock; |
207 | u64 min_vruntime; | 207 | u64 min_vruntime; |
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void) | |||
279 | /* Real-Time classes' related field in a runqueue: */ | 279 | /* Real-Time classes' related field in a runqueue: */ |
280 | struct rt_rq { | 280 | struct rt_rq { |
281 | struct rt_prio_array active; | 281 | struct rt_prio_array active; |
282 | unsigned long rt_nr_running; | 282 | unsigned int rt_nr_running; |
283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
284 | struct { | 284 | struct { |
285 | int curr; /* highest queued rt task prio */ | 285 | int curr; /* highest queued rt task prio */ |
@@ -353,7 +353,7 @@ struct rq { | |||
353 | * nr_running and cpu_load should be in the same cacheline because | 353 | * nr_running and cpu_load should be in the same cacheline because |
354 | * remote CPUs use both these fields when doing load calculation. | 354 | * remote CPUs use both these fields when doing load calculation. |
355 | */ | 355 | */ |
356 | unsigned long nr_running; | 356 | unsigned int nr_running; |
357 | #define CPU_LOAD_IDX_MAX 5 | 357 | #define CPU_LOAD_IDX_MAX 5 |
358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
359 | unsigned long last_load_update_tick; | 359 | unsigned long last_load_update_tick; |
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu); | |||
876 | extern struct rt_bandwidth def_rt_bandwidth; | 876 | extern struct rt_bandwidth def_rt_bandwidth; |
877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
878 | 878 | ||
879 | extern void update_cpu_load(struct rq *this_rq); | 879 | extern void update_idle_cpu_load(struct rq *this_rq); |
880 | 880 | ||
881 | #ifdef CONFIG_CGROUP_CPUACCT | 881 | #ifdef CONFIG_CGROUP_CPUACCT |
882 | #include <linux/cgroup.h> | 882 | #include <linux/cgroup.h> |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895ea..ee376beedaf9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -3,16 +3,357 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | 4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> |
5 | * | 5 | * |
6 | * This defines a simple but solid secure-computing mode. | 6 | * Copyright (C) 2012 Google, Inc. |
7 | * Will Drewry <wad@chromium.org> | ||
8 | * | ||
9 | * This defines a simple but solid secure-computing facility. | ||
10 | * | ||
11 | * Mode 1 uses a fixed list of allowed system calls. | ||
12 | * Mode 2 allows user-defined system call filters in the form | ||
13 | * of Berkeley Packet Filters/Linux Socket Filters. | ||
7 | */ | 14 | */ |
8 | 15 | ||
16 | #include <linux/atomic.h> | ||
9 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
10 | #include <linux/seccomp.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/sched.h> | ||
20 | #include <linux/seccomp.h> | ||
13 | 21 | ||
14 | /* #define SECCOMP_DEBUG 1 */ | 22 | /* #define SECCOMP_DEBUG 1 */ |
15 | #define NR_SECCOMP_MODES 1 | 23 | |
24 | #ifdef CONFIG_SECCOMP_FILTER | ||
25 | #include <asm/syscall.h> | ||
26 | #include <linux/filter.h> | ||
27 | #include <linux/ptrace.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/tracehook.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | |||
33 | /** | ||
34 | * struct seccomp_filter - container for seccomp BPF programs | ||
35 | * | ||
36 | * @usage: reference count to manage the object lifetime. | ||
37 | * get/put helpers should be used when accessing an instance | ||
38 | * outside of a lifetime-guarded section. In general, this | ||
39 | * is only needed for handling filters shared across tasks. | ||
40 | * @prev: points to a previously installed, or inherited, filter | ||
41 | * @len: the number of instructions in the program | ||
42 | * @insns: the BPF program instructions to evaluate | ||
43 | * | ||
44 | * seccomp_filter objects are organized in a tree linked via the @prev | ||
45 | * pointer. For any task, it appears to be a singly-linked list starting | ||
46 | * with current->seccomp.filter, the most recently attached or inherited filter. | ||
47 | * However, multiple filters may share a @prev node, by way of fork(), which | ||
48 | * results in a unidirectional tree existing in memory. This is similar to | ||
49 | * how namespaces work. | ||
50 | * | ||
51 | * seccomp_filter objects should never be modified after being attached | ||
52 | * to a task_struct (other than @usage). | ||
53 | */ | ||
54 | struct seccomp_filter { | ||
55 | atomic_t usage; | ||
56 | struct seccomp_filter *prev; | ||
57 | unsigned short len; /* Instruction count */ | ||
58 | struct sock_filter insns[]; | ||
59 | }; | ||
60 | |||
61 | /* Limit any path through the tree to 256KB worth of instructions. */ | ||
62 | #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) | ||
63 | |||
64 | /** | ||
65 | * get_u32 - returns a u32 offset into data | ||
66 | * @data: an unsigned 64-bit value | |
67 | * @index: 0 or 1 to return the first or second 32-bits | ||
68 | * | ||
69 | * This inline exists to hide the length of unsigned long. If a 32-bit | ||
70 | * unsigned long is passed in, it will be extended and the top 32-bits will be | ||
71 | * 0. If it is a 64-bit unsigned long, then whatever data is resident will be | ||
72 | * properly returned. | ||
73 | * | ||
74 | * Endianness is explicitly ignored and left for BPF program authors to manage | ||
75 | * as per the specific architecture. | ||
76 | */ | ||
77 | static inline u32 get_u32(u64 data, int index) | ||
78 | { | ||
79 | return ((u32 *)&data)[index]; | ||
80 | } | ||
81 | |||
82 | /* Helper for bpf_load below. */ | ||
83 | #define BPF_DATA(_name) offsetof(struct seccomp_data, _name) | ||
84 | /** | ||
85 | * seccomp_bpf_load - returns the 32-bit value at the requested offset | |
86 | * @off: offset into struct seccomp_data to load from | ||
87 | * | ||
88 | * Returns the requested 32-bits of data. | ||
89 | * seccomp_check_filter() should assure that @off is 32-bit aligned | ||
90 | * and not out of bounds. Failure to do so is a BUG. | ||
91 | */ | ||
92 | u32 seccomp_bpf_load(int off) | ||
93 | { | ||
94 | struct pt_regs *regs = task_pt_regs(current); | ||
95 | if (off == BPF_DATA(nr)) | ||
96 | return syscall_get_nr(current, regs); | ||
97 | if (off == BPF_DATA(arch)) | ||
98 | return syscall_get_arch(current, regs); | ||
99 | if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { | ||
100 | unsigned long value; | ||
101 | int arg = (off - BPF_DATA(args[0])) / sizeof(u64); | ||
102 | int index = !!(off % sizeof(u64)); | ||
103 | syscall_get_arguments(current, regs, arg, 1, &value); | ||
104 | return get_u32(value, index); | ||
105 | } | ||
106 | if (off == BPF_DATA(instruction_pointer)) | ||
107 | return get_u32(KSTK_EIP(current), 0); | ||
108 | if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) | ||
109 | return get_u32(KSTK_EIP(current), 1); | ||
110 | /* seccomp_check_filter should make this impossible. */ | ||
111 | BUG(); | ||
112 | } | ||
113 | |||
114 | /** | ||
115 | * seccomp_check_filter - verify seccomp filter code | ||
116 | * @filter: filter to verify | ||
117 | * @flen: length of filter | ||
118 | * | ||
119 | * Takes a previously checked filter (by sk_chk_filter) and | ||
120 | * redirects all filter code that loads struct sk_buff data | ||
121 | * and related data through seccomp_bpf_load. It also | ||
122 | * enforces length and alignment checking of those loads. | ||
123 | * | ||
124 | * Returns 0 if the rule set is legal or -EINVAL if not. | ||
125 | */ | ||
126 | static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | ||
127 | { | ||
128 | int pc; | ||
129 | for (pc = 0; pc < flen; pc++) { | ||
130 | struct sock_filter *ftest = &filter[pc]; | ||
131 | u16 code = ftest->code; | ||
132 | u32 k = ftest->k; | ||
133 | |||
134 | switch (code) { | ||
135 | case BPF_S_LD_W_ABS: | ||
136 | ftest->code = BPF_S_ANC_SECCOMP_LD_W; | ||
137 | /* 32-bit aligned and not out of bounds. */ | ||
138 | if (k >= sizeof(struct seccomp_data) || k & 3) | ||
139 | return -EINVAL; | ||
140 | continue; | ||
141 | case BPF_S_LD_W_LEN: | ||
142 | ftest->code = BPF_S_LD_IMM; | ||
143 | ftest->k = sizeof(struct seccomp_data); | ||
144 | continue; | ||
145 | case BPF_S_LDX_W_LEN: | ||
146 | ftest->code = BPF_S_LDX_IMM; | ||
147 | ftest->k = sizeof(struct seccomp_data); | ||
148 | continue; | ||
149 | /* Explicitly include allowed calls. */ | ||
150 | case BPF_S_RET_K: | ||
151 | case BPF_S_RET_A: | ||
152 | case BPF_S_ALU_ADD_K: | ||
153 | case BPF_S_ALU_ADD_X: | ||
154 | case BPF_S_ALU_SUB_K: | ||
155 | case BPF_S_ALU_SUB_X: | ||
156 | case BPF_S_ALU_MUL_K: | ||
157 | case BPF_S_ALU_MUL_X: | ||
158 | case BPF_S_ALU_DIV_X: | ||
159 | case BPF_S_ALU_AND_K: | ||
160 | case BPF_S_ALU_AND_X: | ||
161 | case BPF_S_ALU_OR_K: | ||
162 | case BPF_S_ALU_OR_X: | ||
163 | case BPF_S_ALU_LSH_K: | ||
164 | case BPF_S_ALU_LSH_X: | ||
165 | case BPF_S_ALU_RSH_K: | ||
166 | case BPF_S_ALU_RSH_X: | ||
167 | case BPF_S_ALU_NEG: | ||
168 | case BPF_S_LD_IMM: | ||
169 | case BPF_S_LDX_IMM: | ||
170 | case BPF_S_MISC_TAX: | ||
171 | case BPF_S_MISC_TXA: | ||
172 | case BPF_S_ALU_DIV_K: | ||
173 | case BPF_S_LD_MEM: | ||
174 | case BPF_S_LDX_MEM: | ||
175 | case BPF_S_ST: | ||
176 | case BPF_S_STX: | ||
177 | case BPF_S_JMP_JA: | ||
178 | case BPF_S_JMP_JEQ_K: | ||
179 | case BPF_S_JMP_JEQ_X: | ||
180 | case BPF_S_JMP_JGE_K: | ||
181 | case BPF_S_JMP_JGE_X: | ||
182 | case BPF_S_JMP_JGT_K: | ||
183 | case BPF_S_JMP_JGT_X: | ||
184 | case BPF_S_JMP_JSET_K: | ||
185 | case BPF_S_JMP_JSET_X: | ||
186 | continue; | ||
187 | default: | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * seccomp_run_filters - evaluates all seccomp filters against @syscall | ||
196 | * @syscall: number of the current system call | ||
197 | * | ||
198 | * Returns valid seccomp BPF response codes. | ||
199 | */ | ||
200 | static u32 seccomp_run_filters(int syscall) | ||
201 | { | ||
202 | struct seccomp_filter *f; | ||
203 | u32 ret = SECCOMP_RET_ALLOW; | ||
204 | |||
205 | /* Ensure unexpected behavior doesn't result in failing open. */ | ||
206 | if (WARN_ON(current->seccomp.filter == NULL)) | ||
207 | return SECCOMP_RET_KILL; | ||
208 | |||
209 | /* | ||
210 | * All filters in the list are evaluated and the lowest BPF return | ||
211 | * value always takes priority (ignoring the DATA). | ||
212 | */ | ||
213 | for (f = current->seccomp.filter; f; f = f->prev) { | ||
214 | u32 cur_ret = sk_run_filter(NULL, f->insns); | ||
215 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | ||
216 | ret = cur_ret; | ||
217 | } | ||
218 | return ret; | ||
219 | } | ||
220 | |||
221 | /** | ||
222 | * seccomp_attach_filter: Attaches a seccomp filter to current. | ||
223 | * @fprog: BPF program to install | ||
224 | * | ||
225 | * Returns 0 on success or an errno on failure. | ||
226 | */ | ||
227 | static long seccomp_attach_filter(struct sock_fprog *fprog) | ||
228 | { | ||
229 | struct seccomp_filter *filter; | ||
230 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | ||
231 | unsigned long total_insns = fprog->len; | ||
232 | long ret; | ||
233 | |||
234 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | ||
235 | return -EINVAL; | ||
236 | |||
237 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | ||
238 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | ||
239 | if (total_insns > MAX_INSNS_PER_PATH) | ||
240 | return -ENOMEM; | ||
241 | |||
242 | /* | ||
243 | * Installing a seccomp filter requires that the task have | ||
244 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | ||
245 | * This avoids scenarios where unprivileged tasks can affect the | ||
246 | * behavior of privileged children. | ||
247 | */ | ||
248 | if (!current->no_new_privs && | ||
249 | security_capable_noaudit(current_cred(), current_user_ns(), | ||
250 | CAP_SYS_ADMIN) != 0) | ||
251 | return -EACCES; | ||
252 | |||
253 | /* Allocate a new seccomp_filter */ | ||
254 | filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, | ||
255 | GFP_KERNEL|__GFP_NOWARN); | ||
256 | if (!filter) | ||
257 | return -ENOMEM; | ||
258 | atomic_set(&filter->usage, 1); | ||
259 | filter->len = fprog->len; | ||
260 | |||
261 | /* Copy the instructions from fprog. */ | ||
262 | ret = -EFAULT; | ||
263 | if (copy_from_user(filter->insns, fprog->filter, fp_size)) | ||
264 | goto fail; | ||
265 | |||
266 | /* Check and rewrite the fprog via the skb checker */ | ||
267 | ret = sk_chk_filter(filter->insns, filter->len); | ||
268 | if (ret) | ||
269 | goto fail; | ||
270 | |||
271 | /* Check and rewrite the fprog for seccomp use */ | ||
272 | ret = seccomp_check_filter(filter->insns, filter->len); | ||
273 | if (ret) | ||
274 | goto fail; | ||
275 | |||
276 | /* | ||
277 | * If there is an existing filter, make it the prev and don't drop its | ||
278 | * task reference. | ||
279 | */ | ||
280 | filter->prev = current->seccomp.filter; | ||
281 | current->seccomp.filter = filter; | ||
282 | return 0; | ||
283 | fail: | ||
284 | kfree(filter); | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | ||
290 | * @user_filter: pointer to the user data containing a sock_fprog. | ||
291 | * | ||
292 | * Returns 0 on success and non-zero otherwise. | ||
293 | */ | ||
294 | long seccomp_attach_user_filter(char __user *user_filter) | ||
295 | { | ||
296 | struct sock_fprog fprog; | ||
297 | long ret = -EFAULT; | ||
298 | |||
299 | #ifdef CONFIG_COMPAT | ||
300 | if (is_compat_task()) { | ||
301 | struct compat_sock_fprog fprog32; | ||
302 | if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) | ||
303 | goto out; | ||
304 | fprog.len = fprog32.len; | ||
305 | fprog.filter = compat_ptr(fprog32.filter); | ||
306 | } else /* falls through to the if below. */ | ||
307 | #endif | ||
308 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | ||
309 | goto out; | ||
310 | ret = seccomp_attach_filter(&fprog); | ||
311 | out: | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | ||
316 | void get_seccomp_filter(struct task_struct *tsk) | ||
317 | { | ||
318 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
319 | if (!orig) | ||
320 | return; | ||
321 | /* Reference count is bounded by the number of total processes. */ | ||
322 | atomic_inc(&orig->usage); | ||
323 | } | ||
324 | |||
325 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | ||
326 | void put_seccomp_filter(struct task_struct *tsk) | ||
327 | { | ||
328 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
329 | /* Clean up single-reference branches iteratively. */ | ||
330 | while (orig && atomic_dec_and_test(&orig->usage)) { | ||
331 | struct seccomp_filter *freeme = orig; | ||
332 | orig = orig->prev; | ||
333 | kfree(freeme); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * seccomp_send_sigsys - signals the task to allow in-process syscall emulation | ||
339 | * @syscall: syscall number to send to userland | ||
340 | * @reason: filter-supplied reason code to send to userland (via si_errno) | ||
341 | * | ||
342 | * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. | ||
343 | */ | ||
344 | static void seccomp_send_sigsys(int syscall, int reason) | ||
345 | { | ||
346 | struct siginfo info; | ||
347 | memset(&info, 0, sizeof(info)); | ||
348 | info.si_signo = SIGSYS; | ||
349 | info.si_code = SYS_SECCOMP; | ||
350 | info.si_call_addr = (void __user *)KSTK_EIP(current); | ||
351 | info.si_errno = reason; | ||
352 | info.si_arch = syscall_get_arch(current, task_pt_regs(current)); | ||
353 | info.si_syscall = syscall; | ||
354 | force_sig_info(SIGSYS, &info, current); | ||
355 | } | ||
356 | #endif /* CONFIG_SECCOMP_FILTER */ | ||
16 | 357 | ||
17 | /* | 358 | /* |
18 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 359 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { | |||
31 | }; | 372 | }; |
32 | #endif | 373 | #endif |
33 | 374 | ||
34 | void __secure_computing(int this_syscall) | 375 | int __secure_computing(int this_syscall) |
35 | { | 376 | { |
36 | int mode = current->seccomp.mode; | 377 | int mode = current->seccomp.mode; |
37 | int * syscall; | 378 | int exit_sig = 0; |
379 | int *syscall; | ||
380 | u32 ret; | ||
38 | 381 | ||
39 | switch (mode) { | 382 | switch (mode) { |
40 | case 1: | 383 | case SECCOMP_MODE_STRICT: |
41 | syscall = mode1_syscalls; | 384 | syscall = mode1_syscalls; |
42 | #ifdef CONFIG_COMPAT | 385 | #ifdef CONFIG_COMPAT |
43 | if (is_compat_task()) | 386 | if (is_compat_task()) |
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) | |||
45 | #endif | 388 | #endif |
46 | do { | 389 | do { |
47 | if (*syscall == this_syscall) | 390 | if (*syscall == this_syscall) |
48 | return; | 391 | return 0; |
49 | } while (*++syscall); | 392 | } while (*++syscall); |
393 | exit_sig = SIGKILL; | ||
394 | ret = SECCOMP_RET_KILL; | ||
395 | break; | ||
396 | #ifdef CONFIG_SECCOMP_FILTER | ||
397 | case SECCOMP_MODE_FILTER: { | ||
398 | int data; | ||
399 | ret = seccomp_run_filters(this_syscall); | ||
400 | data = ret & SECCOMP_RET_DATA; | ||
401 | ret &= SECCOMP_RET_ACTION; | ||
402 | switch (ret) { | ||
403 | case SECCOMP_RET_ERRNO: | ||
404 | /* Set the low-order 16 bits as an errno. */ | |
405 | syscall_set_return_value(current, task_pt_regs(current), | ||
406 | -data, 0); | ||
407 | goto skip; | ||
408 | case SECCOMP_RET_TRAP: | ||
409 | /* Show the handler the original registers. */ | ||
410 | syscall_rollback(current, task_pt_regs(current)); | ||
411 | /* Let the filter pass back 16 bits of data. */ | ||
412 | seccomp_send_sigsys(this_syscall, data); | ||
413 | goto skip; | ||
414 | case SECCOMP_RET_TRACE: | ||
415 | /* Skip these calls if there is no tracer. */ | ||
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | ||
417 | goto skip; | ||
418 | /* Allow the BPF to provide the event message */ | ||
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
420 | /* | ||
421 | * The delivery of a fatal signal during event | ||
422 | * notification may silently skip tracer notification. | ||
423 | * Terminating the task now avoids executing a system | ||
424 | * call that may not be intended. | ||
425 | */ | ||
426 | if (fatal_signal_pending(current)) | ||
427 | break; | ||
428 | return 0; | ||
429 | case SECCOMP_RET_ALLOW: | ||
430 | return 0; | ||
431 | case SECCOMP_RET_KILL: | ||
432 | default: | ||
433 | break; | ||
434 | } | ||
435 | exit_sig = SIGSYS; | ||
50 | break; | 436 | break; |
437 | } | ||
438 | #endif | ||
51 | default: | 439 | default: |
52 | BUG(); | 440 | BUG(); |
53 | } | 441 | } |
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) | |||
55 | #ifdef SECCOMP_DEBUG | 443 | #ifdef SECCOMP_DEBUG |
56 | dump_stack(); | 444 | dump_stack(); |
57 | #endif | 445 | #endif |
58 | audit_seccomp(this_syscall); | 446 | audit_seccomp(this_syscall, exit_sig, ret); |
59 | do_exit(SIGKILL); | 447 | do_exit(exit_sig); |
448 | #ifdef CONFIG_SECCOMP_FILTER | ||
449 | skip: | ||
450 | audit_seccomp(this_syscall, exit_sig, ret); | ||
451 | #endif | ||
452 | return -1; | ||
60 | } | 453 | } |
61 | 454 | ||
62 | long prctl_get_seccomp(void) | 455 | long prctl_get_seccomp(void) |
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void) | |||
64 | return current->seccomp.mode; | 457 | return current->seccomp.mode; |
65 | } | 458 | } |
66 | 459 | ||
67 | long prctl_set_seccomp(unsigned long seccomp_mode) | 460 | /** |
461 | * prctl_set_seccomp: configures current->seccomp.mode | ||
462 | * @seccomp_mode: requested mode to use | ||
463 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
464 | * | ||
465 | * This function may be called repeatedly with a @seccomp_mode of | ||
466 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | ||
467 | * successfully installed will be evaluated (in reverse order) for each system | ||
468 | * call the task makes. | ||
469 | * | ||
470 | * Once current->seccomp.mode is non-zero, it may not be changed. | ||
471 | * | ||
472 | * Returns 0 on success or a negative errno (e.g. -EINVAL) on failure. | |
473 | */ | ||
474 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | ||
68 | { | 475 | { |
69 | long ret; | 476 | long ret = -EINVAL; |
70 | 477 | ||
71 | /* can set it only once to be even more secure */ | 478 | if (current->seccomp.mode && |
72 | ret = -EPERM; | 479 | current->seccomp.mode != seccomp_mode) |
73 | if (unlikely(current->seccomp.mode)) | ||
74 | goto out; | 480 | goto out; |
75 | 481 | ||
76 | ret = -EINVAL; | 482 | switch (seccomp_mode) { |
77 | if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { | 483 | case SECCOMP_MODE_STRICT: |
78 | current->seccomp.mode = seccomp_mode; | 484 | ret = 0; |
79 | set_thread_flag(TIF_SECCOMP); | ||
80 | #ifdef TIF_NOTSC | 485 | #ifdef TIF_NOTSC |
81 | disable_TSC(); | 486 | disable_TSC(); |
82 | #endif | 487 | #endif |
83 | ret = 0; | 488 | break; |
489 | #ifdef CONFIG_SECCOMP_FILTER | ||
490 | case SECCOMP_MODE_FILTER: | ||
491 | ret = seccomp_attach_user_filter(filter); | ||
492 | if (ret) | ||
493 | goto out; | ||
494 | break; | ||
495 | #endif | ||
496 | default: | ||
497 | goto out; | ||
84 | } | 498 | } |
85 | 499 | ||
86 | out: | 500 | current->seccomp.mode = seccomp_mode; |
501 | set_thread_flag(TIF_SECCOMP); | ||
502 | out: | ||
87 | return ret; | 503 | return ret; |
88 | } | 504 | } |
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c3..4567fc020fe3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable); | |||
118 | * down_trylock - try to acquire the semaphore, without waiting | 118 | * down_trylock - try to acquire the semaphore, without waiting |
119 | * @sem: the semaphore to be acquired | 119 | * @sem: the semaphore to be acquired |
120 | * | 120 | * |
121 | * Try to acquire the semaphore atomically. Returns 0 if the mutex has | 121 | * Try to acquire the semaphore atomically. Returns 0 if the semaphore has |
122 | * been acquired successfully or 1 if it cannot be acquired. | 122 | * been acquired successfully or 1 if it cannot be acquired. |
123 | * | 123 | * |
124 | * NOTE: This return value is inverted from both spin_trylock and | 124 | * NOTE: This return value is inverted from both spin_trylock and |
diff --git a/kernel/signal.c b/kernel/signal.c index 833ea5166855..21ebe75ff85f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -160,7 +160,7 @@ void recalc_sigpending(void) | |||
160 | 160 | ||
161 | #define SYNCHRONOUS_MASK \ | 161 | #define SYNCHRONOUS_MASK \ |
162 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | 162 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ |
163 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | 163 | sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) |
164 | 164 | ||
165 | int next_signal(struct sigpending *pending, sigset_t *mask) | 165 | int next_signal(struct sigpending *pending, sigset_t *mask) |
166 | { | 166 | { |
@@ -2695,6 +2695,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2695 | err |= __put_user(from->si_uid, &to->si_uid); | 2695 | err |= __put_user(from->si_uid, &to->si_uid); |
2696 | err |= __put_user(from->si_ptr, &to->si_ptr); | 2696 | err |= __put_user(from->si_ptr, &to->si_ptr); |
2697 | break; | 2697 | break; |
2698 | #ifdef __ARCH_SIGSYS | ||
2699 | case __SI_SYS: | ||
2700 | err |= __put_user(from->si_call_addr, &to->si_call_addr); | ||
2701 | err |= __put_user(from->si_syscall, &to->si_syscall); | ||
2702 | err |= __put_user(from->si_arch, &to->si_arch); | ||
2703 | break; | ||
2704 | #endif | ||
2698 | default: /* this is just in case for now ... */ | 2705 | default: /* this is just in case for now ... */ |
2699 | err |= __put_user(from->si_pid, &to->si_pid); | 2706 | err |= __put_user(from->si_pid, &to->si_pid); |
2700 | err |= __put_user(from->si_uid, &to->si_uid); | 2707 | err |= __put_user(from->si_uid, &to->si_uid); |
diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf759..d0ae5b24875e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,8 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #include "smpboot.h" | ||
17 | |||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
17 | static struct { | 19 | static struct { |
18 | struct list_head queue; | 20 | struct list_head queue; |
@@ -669,6 +671,8 @@ void __init smp_init(void) | |||
669 | { | 671 | { |
670 | unsigned int cpu; | 672 | unsigned int cpu; |
671 | 673 | ||
674 | idle_threads_init(); | ||
675 | |||
672 | /* FIXME: This should be done in userspace --RR */ | 676 | /* FIXME: This should be done in userspace --RR */ |
673 | for_each_present_cpu(cpu) { | 677 | for_each_present_cpu(cpu) { |
674 | if (num_online_cpus() >= setup_max_cpus) | 678 | if (num_online_cpus() >= setup_max_cpus) |
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
791 | } | 795 | } |
792 | } | 796 | } |
793 | EXPORT_SYMBOL(on_each_cpu_cond); | 797 | EXPORT_SYMBOL(on_each_cpu_cond); |
798 | |||
799 | static void do_nothing(void *unused) | ||
800 | { | ||
801 | } | ||
802 | |||
803 | /** | ||
804 | * kick_all_cpus_sync - Force all cpus out of idle | ||
805 | * | ||
806 | * Used to synchronize the update of pm_idle function pointer. It's | ||
807 | * called after the pointer is updated and returns after the dummy | ||
808 | * callback function has been executed on all cpus. The execution of | ||
809 | * the function can only happen on the remote cpus after they have | ||
810 | * left the idle function which had been called via pm_idle function | ||
811 | * pointer. So it's guaranteed that nothing uses the previous pointer | ||
812 | * anymore. | ||
813 | */ | ||
814 | void kick_all_cpus_sync(void) | ||
815 | { | ||
816 | /* Make sure the change is visible before we kick the cpus */ | ||
817 | smp_mb(); | ||
818 | smp_call_function(do_nothing, NULL, 1); | ||
819 | } | ||
820 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | ||
diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 000000000000..e1a797e028a3 --- /dev/null +++ b/kernel/smpboot.c | |||
@@ -0,0 +1,62 @@ | |||
1 | /* | ||
2 | * Common SMP CPU bringup/teardown functions | ||
3 | */ | ||
4 | #include <linux/err.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/percpu.h> | ||
9 | |||
10 | #include "smpboot.h" | ||
11 | |||
12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
13 | /* | ||
14 | * For the hotplug case we keep the task structs around and reuse | ||
15 | * them. | ||
16 | */ | ||
17 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); | ||
18 | |||
19 | struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) | ||
20 | { | ||
21 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
22 | |||
23 | if (!tsk) | ||
24 | return ERR_PTR(-ENOMEM); | ||
25 | init_idle(tsk, cpu); | ||
26 | return tsk; | ||
27 | } | ||
28 | |||
29 | void __init idle_thread_set_boot_cpu(void) | ||
30 | { | ||
31 | per_cpu(idle_threads, smp_processor_id()) = current; | ||
32 | } | ||
33 | |||
34 | static inline void idle_init(unsigned int cpu) | ||
35 | { | ||
36 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
37 | |||
38 | if (!tsk) { | ||
39 | tsk = fork_idle(cpu); | ||
40 | if (IS_ERR(tsk)) | ||
41 | pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); | ||
42 | else | ||
43 | per_cpu(idle_threads, cpu) = tsk; | ||
44 | } | ||
45 | } | ||
46 | |||
47 | /** | ||
48 | * idle_thread_init - Initialize the idle thread for a cpu | ||
49 | * @cpu: The cpu for which the idle thread should be initialized | ||
50 | * | ||
51 | * Creates the thread if it does not exist. | ||
52 | */ | ||
53 | void __init idle_threads_init(void) | ||
54 | { | ||
55 | unsigned int cpu; | ||
56 | |||
57 | for_each_possible_cpu(cpu) { | ||
58 | if (cpu != smp_processor_id()) | ||
59 | idle_init(cpu); | ||
60 | } | ||
61 | } | ||
62 | #endif | ||
diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 000000000000..80c0acfb8472 --- /dev/null +++ b/kernel/smpboot.h | |||
@@ -0,0 +1,18 @@ | |||
1 | #ifndef SMPBOOT_H | ||
2 | #define SMPBOOT_H | ||
3 | |||
4 | struct task_struct; | ||
5 | |||
6 | int smpboot_prepare(unsigned int cpu); | ||
7 | |||
8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
9 | struct task_struct *idle_thread_get(unsigned int cpu); | ||
10 | void idle_thread_set_boot_cpu(void); | ||
11 | void idle_threads_init(void); | ||
12 | #else | ||
13 | static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } | ||
14 | static inline void idle_thread_set_boot_cpu(void) { } | ||
15 | static inline void idle_threads_init(void) { } | ||
16 | #endif | ||
17 | |||
18 | #endif | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f4..2095be3318d5 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -34,10 +34,77 @@ | |||
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | /* | ||
38 | * Initialize an rcu_batch structure to empty. | ||
39 | */ | ||
40 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
41 | { | ||
42 | b->head = NULL; | ||
43 | b->tail = &b->head; | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
48 | */ | ||
49 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
50 | { | ||
51 | *b->tail = head; | ||
52 | b->tail = &head->next; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Is the specified rcu_batch structure empty? | ||
57 | */ | ||
58 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
59 | { | ||
60 | return b->tail == &b->head; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Remove the callback at the head of the specified rcu_batch structure | ||
65 | * and return a pointer to it, or return NULL if the structure is empty. | ||
66 | */ | ||
67 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
68 | { | ||
69 | struct rcu_head *head; | ||
70 | |||
71 | if (rcu_batch_empty(b)) | ||
72 | return NULL; | ||
73 | |||
74 | head = b->head; | ||
75 | b->head = head->next; | ||
76 | if (b->tail == &head->next) | ||
77 | rcu_batch_init(b); | ||
78 | |||
79 | return head; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
84 | * the structure specified by "to". | ||
85 | */ | ||
86 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
87 | { | ||
88 | if (!rcu_batch_empty(from)) { | ||
89 | *to->tail = from->head; | ||
90 | to->tail = from->tail; | ||
91 | rcu_batch_init(from); | ||
92 | } | ||
93 | } | ||
94 | |||
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 98 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | 99 | { |
39 | sp->completed = 0; | 100 | sp->completed = 0; |
40 | mutex_init(&sp->mutex); | 101 | spin_lock_init(&sp->queue_lock); |
102 | sp->running = false; | ||
103 | rcu_batch_init(&sp->batch_queue); | ||
104 | rcu_batch_init(&sp->batch_check0); | ||
105 | rcu_batch_init(&sp->batch_check1); | ||
106 | rcu_batch_init(&sp->batch_done); | ||
107 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | 108 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | 109 | return sp->per_cpu_ref ? 0 : -ENOMEM; |
43 | } | 110 | } |
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
73 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 140 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
74 | 141 | ||
75 | /* | 142 | /* |
76 | * srcu_readers_active_idx -- returns approximate number of readers | 143 | * Returns approximate total of the readers' ->seq[] values for the |
77 | * active on the specified rank of per-CPU counters. | 144 | * rank of per-CPU counters specified by idx. |
78 | */ | 145 | */ |
146 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
147 | { | ||
148 | int cpu; | ||
149 | unsigned long sum = 0; | ||
150 | unsigned long t; | ||
79 | 151 | ||
80 | static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | 152 | for_each_possible_cpu(cpu) { |
153 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
154 | sum += t; | ||
155 | } | ||
156 | return sum; | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Returns approximate number of readers active on the specified rank | ||
161 | * of the per-CPU ->c[] counters. | ||
162 | */ | ||
163 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
81 | { | 164 | { |
82 | int cpu; | 165 | int cpu; |
83 | int sum; | 166 | unsigned long sum = 0; |
167 | unsigned long t; | ||
84 | 168 | ||
85 | sum = 0; | 169 | for_each_possible_cpu(cpu) { |
86 | for_each_possible_cpu(cpu) | 170 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); |
87 | sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; | 171 | sum += t; |
172 | } | ||
88 | return sum; | 173 | return sum; |
89 | } | 174 | } |
90 | 175 | ||
176 | /* | ||
177 | * Return true if the number of pre-existing readers is determined to | ||
178 | * be stably zero. An example unstable zero can occur if the call | ||
179 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
180 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
181 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
182 | * time to sum the array, and might in fact be interrupted or preempted | ||
183 | * partway through the summation. | ||
184 | */ | ||
185 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
186 | { | ||
187 | unsigned long seq; | ||
188 | |||
189 | seq = srcu_readers_seq_idx(sp, idx); | ||
190 | |||
191 | /* | ||
192 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
193 | * __srcu_read_lock(). This pairing ensures that if an | ||
194 | * __srcu_read_lock() increments its counter after the summation | ||
195 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
196 | * critical section will see any changes made prior to the start | ||
197 | * of the current SRCU grace period. | ||
198 | * | ||
199 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
200 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
201 | * must see the increment of ->c[]. | ||
202 | */ | ||
203 | smp_mb(); /* A */ | ||
204 | |||
205 | /* | ||
206 | * Note that srcu_readers_active_idx() can incorrectly return | ||
207 | * zero even though there is a pre-existing reader throughout. | ||
208 | * To see this, suppose that task A is in a very long SRCU | ||
209 | * read-side critical section that started on CPU 0, and that | ||
210 | * no other reader exists, so that the sum of the counters | ||
211 | * is equal to one. Then suppose that task B starts executing | ||
212 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
213 | * task C starts reading on CPU 0, so that its increment is not | ||
214 | * summed, but finishes reading on CPU 2, so that its decrement | ||
215 | * -is- summed. Then when task B completes its sum, it will | ||
216 | * incorrectly get zero, despite the fact that task A has been | ||
217 | * in its SRCU read-side critical section the whole time. | ||
218 | * | ||
219 | * We therefore do a validation step should srcu_readers_active_idx() | ||
220 | * return zero. | ||
221 | */ | ||
222 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * The remainder of this function is the validation step. | ||
227 | * The following smp_mb() D pairs with the smp_mb() C in | ||
228 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
229 | * by srcu_readers_active_idx() above, then any destructive | ||
230 | * operation performed after the grace period will happen after | ||
231 | * the corresponding SRCU read-side critical section. | ||
232 | * | ||
233 | * Note that there can be at most NR_CPUS worth of readers using | ||
234 | * the old index, which is not enough to overflow even a 32-bit | ||
235 | * integer. (Yes, this does mean that systems having more than | ||
236 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
237 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
238 | * Therefore, the only way that the return values of the two | ||
239 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
240 | * no increments of the corresponding rank of ->seq[] counts | ||
241 | * in the interim. But the missed-increment scenario laid out | ||
242 | * above includes an increment of the ->seq[] counter by | ||
243 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
244 | * scenario occurs, the return values from the two calls to | ||
245 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
246 | * step below suffices. | ||
247 | */ | ||
248 | smp_mb(); /* D */ | ||
249 | |||
250 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
251 | } | ||
252 | |||
91 | /** | 253 | /** |
92 | * srcu_readers_active - returns approximate number of readers. | 254 | * srcu_readers_active - returns approximate number of readers. |
93 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 255 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | |||
98 | */ | 260 | */ |
99 | static int srcu_readers_active(struct srcu_struct *sp) | 261 | static int srcu_readers_active(struct srcu_struct *sp) |
100 | { | 262 | { |
101 | return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); | 263 | int cpu; |
264 | unsigned long sum = 0; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
268 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
269 | } | ||
270 | return sum; | ||
102 | } | 271 | } |
103 | 272 | ||
104 | /** | 273 | /** |
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
131 | int idx; | 300 | int idx; |
132 | 301 | ||
133 | preempt_disable(); | 302 | preempt_disable(); |
134 | idx = sp->completed & 0x1; | 303 | idx = rcu_dereference_index_check(sp->completed, |
135 | barrier(); /* ensure compiler looks -once- at sp->completed. */ | 304 | rcu_read_lock_sched_held()) & 0x1; |
136 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
137 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 306 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
307 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
138 | preempt_enable(); | 308 | preempt_enable(); |
139 | return idx; | 309 | return idx; |
140 | } | 310 | } |
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
149 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 319 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
150 | { | 320 | { |
151 | preempt_disable(); | 321 | preempt_disable(); |
152 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 322 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
153 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 323 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
154 | preempt_enable(); | 324 | preempt_enable(); |
155 | } | 325 | } |
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 326 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
163 | * we repeatedly block for 1-millisecond time periods. This approach | 333 | * we repeatedly block for 1-millisecond time periods. This approach |
164 | * has done well in testing, so there is no need for a config parameter. | 334 | * has done well in testing, so there is no need for a config parameter. |
165 | */ | 335 | */ |
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | 336 | #define SRCU_RETRY_CHECK_DELAY 5 |
337 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
338 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
167 | 339 | ||
168 | /* | 340 | /* |
169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 341 | * @@@ Wait until all pre-existing readers complete. Such readers |
342 | * will have used the index specified by "idx". | ||
343 | * the caller should ensures the ->completed is not changed while checking | ||
344 | * and idx = (->completed & 1) ^ 1 | ||
170 | */ | 345 | */ |
171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 346 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) |
172 | { | 347 | { |
173 | int idx; | 348 | for (;;) { |
174 | 349 | if (srcu_readers_active_idx_check(sp, idx)) | |
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 350 | return true; |
176 | !lock_is_held(&rcu_bh_lock_map) && | 351 | if (--trycount <= 0) |
177 | !lock_is_held(&rcu_lock_map) && | 352 | return false; |
178 | !lock_is_held(&rcu_sched_lock_map), | 353 | udelay(SRCU_RETRY_CHECK_DELAY); |
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 354 | } |
180 | 355 | } | |
181 | idx = sp->completed; | ||
182 | mutex_lock(&sp->mutex); | ||
183 | 356 | ||
184 | /* | 357 | /* |
185 | * Check to see if someone else did the work for us while we were | 358 | * Increment the ->completed counter so that future SRCU readers will |
186 | * waiting to acquire the lock. We need -two- advances of | 359 | * use the other rank of the ->c[] and ->seq[] arrays. This allows |
187 | * the counter, not just one. If there was but one, we might have | 360 | * us to wait for pre-existing readers in a starvation-free manner. |
188 | * shown up -after- our helper's first synchronize_sched(), thus | 361 | */ |
189 | * having failed to prevent CPU-reordering races with concurrent | 362 | static void srcu_flip(struct srcu_struct *sp) |
190 | * srcu_read_unlock()s on other CPUs (see comment below). So we | 363 | { |
191 | * either (1) wait for two or (2) supply the second ourselves. | 364 | sp->completed++; |
192 | */ | 365 | } |
193 | 366 | ||
194 | if ((sp->completed - idx) >= 2) { | 367 | /* |
195 | mutex_unlock(&sp->mutex); | 368 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
196 | return; | 369 | * initiating grace-period processing if it is not already running. |
370 | */ | ||
371 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
372 | void (*func)(struct rcu_head *head)) | ||
373 | { | ||
374 | unsigned long flags; | ||
375 | |||
376 | head->next = NULL; | ||
377 | head->func = func; | ||
378 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
379 | rcu_batch_queue(&sp->batch_queue, head); | ||
380 | if (!sp->running) { | ||
381 | sp->running = true; | ||
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | ||
197 | } | 383 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
385 | } | ||
386 | EXPORT_SYMBOL_GPL(call_srcu); | ||
198 | 387 | ||
199 | sync_func(); /* Force memory barrier on all CPUs. */ | 388 | struct rcu_synchronize { |
389 | struct rcu_head head; | ||
390 | struct completion completion; | ||
391 | }; | ||
200 | 392 | ||
201 | /* | 393 | /* |
202 | * The preceding synchronize_sched() ensures that any CPU that | 394 | * Awaken the corresponding synchronize_srcu() instance now that a |
203 | * sees the new value of sp->completed will also see any preceding | 395 | * grace period has elapsed. |
204 | * changes to data structures made by this CPU. This prevents | 396 | */ |
205 | * some other CPU from reordering the accesses in its SRCU | 397 | static void wakeme_after_rcu(struct rcu_head *head) |
206 | * read-side critical section to precede the corresponding | 398 | { |
207 | * srcu_read_lock() -- ensuring that such references will in | 399 | struct rcu_synchronize *rcu; |
208 | * fact be protected. | ||
209 | * | ||
210 | * So it is now safe to do the flip. | ||
211 | */ | ||
212 | 400 | ||
213 | idx = sp->completed & 0x1; | 401 | rcu = container_of(head, struct rcu_synchronize, head); |
214 | sp->completed++; | 402 | complete(&rcu->completion); |
403 | } | ||
215 | 404 | ||
216 | sync_func(); /* Force memory barrier on all CPUs. */ | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
406 | static void srcu_reschedule(struct srcu_struct *sp); | ||
217 | 407 | ||
218 | /* | 408 | /* |
219 | * At this point, because of the preceding synchronize_sched(), | 409 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
220 | * all srcu_read_lock() calls using the old counters have completed. | 410 | */ |
221 | * Their corresponding critical sections might well be still | 411 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
222 | * executing, but the srcu_read_lock() primitives themselves | 412 | { |
223 | * will have finished executing. We initially give readers | 413 | struct rcu_synchronize rcu; |
224 | * an arbitrarily chosen 10 microseconds to get out of their | 414 | struct rcu_head *head = &rcu.head; |
225 | * SRCU read-side critical sections, then loop waiting 1/HZ | 415 | bool done = false; |
226 | * seconds per iteration. The 10-microsecond value has done | ||
227 | * very well in testing. | ||
228 | */ | ||
229 | |||
230 | if (srcu_readers_active_idx(sp, idx)) | ||
231 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
232 | while (srcu_readers_active_idx(sp, idx)) | ||
233 | schedule_timeout_interruptible(1); | ||
234 | 416 | ||
235 | sync_func(); /* Force memory barrier on all CPUs. */ | 417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
418 | !lock_is_held(&rcu_bh_lock_map) && | ||
419 | !lock_is_held(&rcu_lock_map) && | ||
420 | !lock_is_held(&rcu_sched_lock_map), | ||
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
236 | 422 | ||
237 | /* | 423 | init_completion(&rcu.completion); |
238 | * The preceding synchronize_sched() forces all srcu_read_unlock() | 424 | |
239 | * primitives that were executing concurrently with the preceding | 425 | head->next = NULL; |
240 | * for_each_possible_cpu() loop to have completed by this point. | 426 | head->func = wakeme_after_rcu; |
241 | * More importantly, it also forces the corresponding SRCU read-side | 427 | spin_lock_irq(&sp->queue_lock); |
242 | * critical sections to have also completed, and the corresponding | 428 | if (!sp->running) { |
243 | * references to SRCU-protected data items to be dropped. | 429 | /* steal the processing owner */ |
244 | * | 430 | sp->running = true; |
245 | * Note: | 431 | rcu_batch_queue(&sp->batch_check0, head); |
246 | * | 432 | spin_unlock_irq(&sp->queue_lock); |
247 | * Despite what you might think at first glance, the | 433 | |
248 | * preceding synchronize_sched() -must- be within the | 434 | srcu_advance_batches(sp, trycount); |
249 | * critical section ended by the following mutex_unlock(). | 435 | if (!rcu_batch_empty(&sp->batch_done)) { |
250 | * Otherwise, a task taking the early exit can race | 436 | BUG_ON(sp->batch_done.head != head); |
251 | * with a srcu_read_unlock(), which might have executed | 437 | rcu_batch_dequeue(&sp->batch_done); |
252 | * just before the preceding srcu_readers_active() check, | 438 | done = true; |
253 | * and whose CPU might have reordered the srcu_read_unlock() | 439 | } |
254 | * with the preceding critical section. In this case, there | 440 | /* give the processing owner to work_struct */ |
255 | * is nothing preventing the synchronize_sched() task that is | 441 | srcu_reschedule(sp); |
256 | * taking the early exit from freeing a data structure that | 442 | } else { |
257 | * is still being referenced (out of order) by the task | 443 | rcu_batch_queue(&sp->batch_queue, head); |
258 | * doing the srcu_read_unlock(). | 444 | spin_unlock_irq(&sp->queue_lock); |
259 | * | 445 | } |
260 | * Alternatively, the comparison with "2" on the early exit | ||
261 | * could be changed to "3", but this increases synchronize_srcu() | ||
262 | * latency for bulk loads. So the current code is preferred. | ||
263 | */ | ||
264 | 446 | ||
265 | mutex_unlock(&sp->mutex); | 447 | if (!done) |
448 | wait_for_completion(&rcu.completion); | ||
266 | } | 449 | } |
267 | 450 | ||
268 | /** | 451 | /** |
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
281 | */ | 464 | */ |
282 | void synchronize_srcu(struct srcu_struct *sp) | 465 | void synchronize_srcu(struct srcu_struct *sp) |
283 | { | 466 | { |
284 | __synchronize_srcu(sp, synchronize_sched); | 467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
285 | } | 468 | } |
286 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 469 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
287 | 470 | ||
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
289 | * synchronize_srcu_expedited - Brute-force SRCU grace period | 472 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
290 | * @sp: srcu_struct with which to synchronize. | 473 | * @sp: srcu_struct with which to synchronize. |
291 | * | 474 | * |
292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" | 475 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
293 | * approach to force the grace period to end quickly. This consumes | 476 | * spinning rather than blocking when waiting. |
294 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
295 | * so is thus not recommended for any sort of common-case code. In fact, | ||
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
299 | * | 477 | * |
300 | * Note that it is illegal to call this function while holding any lock | 478 | * Note that it is illegal to call this function while holding any lock |
301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | 479 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
302 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
303 | * these restriction will result in deadlock. It is also illegal to call | ||
304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 480 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | 481 | * critical section; doing so will result in deadlock. However, it is |
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 482 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct |
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
309 | */ | 485 | */ |
310 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
311 | { | 487 | { |
312 | __synchronize_srcu(sp, synchronize_sched_expedited); | 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
313 | } | 489 | } |
314 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); |
315 | 491 | ||
316 | /** | 492 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
494 | */ | ||
495 | void srcu_barrier(struct srcu_struct *sp) | ||
496 | { | ||
497 | synchronize_srcu(sp); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
500 | |||
501 | /** | ||
317 | * srcu_batches_completed - return batches completed. | 502 | * srcu_batches_completed - return batches completed. |
318 | * @sp: srcu_struct on which to report batch completion. | 503 | * @sp: srcu_struct on which to report batch completion. |
319 | * | 504 | * |
320 | * Report the number of batches, correlated with, but not necessarily | 505 | * Report the number of batches, correlated with, but not necessarily |
321 | * precisely the same as, the number of grace periods that have elapsed. | 506 | * precisely the same as, the number of grace periods that have elapsed. |
322 | */ | 507 | */ |
323 | |||
324 | long srcu_batches_completed(struct srcu_struct *sp) | 508 | long srcu_batches_completed(struct srcu_struct *sp) |
325 | { | 509 | { |
326 | return sp->completed; | 510 | return sp->completed; |
327 | } | 511 | } |
328 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |
513 | |||
514 | #define SRCU_CALLBACK_BATCH 10 | ||
515 | #define SRCU_INTERVAL 1 | ||
516 | |||
517 | /* | ||
518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
519 | * period pipeline. | ||
520 | */ | ||
521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
522 | { | ||
523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
524 | spin_lock_irq(&sp->queue_lock); | ||
525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
526 | spin_unlock_irq(&sp->queue_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
533 | */ | ||
534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
535 | { | ||
536 | int idx = 1 ^ (sp->completed & 1); | ||
537 | |||
538 | /* | ||
539 | * Because readers might be delayed for an extended period after | ||
540 | * fetching ->completed for their index, at any point in time there | ||
541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
542 | * need to wait for readers to clear from both index values before | ||
543 | * invoking a callback. | ||
544 | */ | ||
545 | |||
546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
547 | rcu_batch_empty(&sp->batch_check1)) | ||
548 | return; /* no callbacks need to be advanced */ | ||
549 | |||
550 | if (!try_check_zero(sp, idx, trycount)) | ||
551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
552 | |||
553 | /* | ||
554 | * The callbacks in ->batch_check1 have already done with their | ||
555 | * first zero check and flip back when they were enqueued on | ||
556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
557 | * (Presumably try_check_zero() returned false during that | ||
558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
560 | */ | ||
561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
562 | |||
563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
564 | return; /* no callbacks need to be advanced */ | ||
565 | srcu_flip(sp); | ||
566 | |||
567 | /* | ||
568 | * The callbacks in ->batch_check0 just finished their | ||
569 | * first check zero and flip, so move them to ->batch_check1 | ||
570 | * for future checking on the other idx. | ||
571 | */ | ||
572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
573 | |||
574 | /* | ||
575 | * SRCU read-side critical sections are normally short, so check | ||
576 | * at least twice in quick succession after a flip. | ||
577 | */ | ||
578 | trycount = trycount < 2 ? 2 : trycount; | ||
579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
581 | |||
582 | /* | ||
583 | * The callbacks in ->batch_check1 have now waited for all | ||
584 | * pre-existing readers using both idx values. They are therefore | ||
585 | * ready to invoke, so move them to ->batch_done. | ||
586 | */ | ||
587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
592 | * their grace period. If there are more to do, SRCU will reschedule | ||
593 | * the workqueue. | ||
594 | */ | ||
595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
596 | { | ||
597 | int i; | ||
598 | struct rcu_head *head; | ||
599 | |||
600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
602 | if (!head) | ||
603 | break; | ||
604 | local_bh_disable(); | ||
605 | head->func(head); | ||
606 | local_bh_enable(); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Finished one round of SRCU grace period. Start another if there are | ||
612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
613 | */ | ||
614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
615 | { | ||
616 | bool pending = true; | ||
617 | |||
618 | if (rcu_batch_empty(&sp->batch_done) && | ||
619 | rcu_batch_empty(&sp->batch_check1) && | ||
620 | rcu_batch_empty(&sp->batch_check0) && | ||
621 | rcu_batch_empty(&sp->batch_queue)) { | ||
622 | spin_lock_irq(&sp->queue_lock); | ||
623 | if (rcu_batch_empty(&sp->batch_done) && | ||
624 | rcu_batch_empty(&sp->batch_check1) && | ||
625 | rcu_batch_empty(&sp->batch_check0) && | ||
626 | rcu_batch_empty(&sp->batch_queue)) { | ||
627 | sp->running = false; | ||
628 | pending = false; | ||
629 | } | ||
630 | spin_unlock_irq(&sp->queue_lock); | ||
631 | } | ||
632 | |||
633 | if (pending) | ||
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * This is the work-queue function that handles SRCU grace periods. | ||
639 | */ | ||
640 | static void process_srcu(struct work_struct *work) | ||
641 | { | ||
642 | struct srcu_struct *sp; | ||
643 | |||
644 | sp = container_of(work, struct srcu_struct, work.work); | ||
645 | |||
646 | srcu_collect_new(sp); | ||
647 | srcu_advance_batches(sp, 1); | ||
648 | srcu_invoke_callbacks(sp); | ||
649 | srcu_reschedule(sp); | ||
650 | } | ||
diff --git a/kernel/sys.c b/kernel/sys.c index f484077b6b14..6df42624e454 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1990,7 +1990,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1990 | error = prctl_get_seccomp(); | 1990 | error = prctl_get_seccomp(); |
1991 | break; | 1991 | break; |
1992 | case PR_SET_SECCOMP: | 1992 | case PR_SET_SECCOMP: |
1993 | error = prctl_set_seccomp(arg2); | 1993 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
1994 | break; | 1994 | break; |
1995 | case PR_GET_TSC: | 1995 | case PR_GET_TSC: |
1996 | error = GET_TSC_CTL(arg2); | 1996 | error = GET_TSC_CTL(arg2); |
@@ -2061,6 +2061,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2061 | error = put_user(me->signal->is_child_subreaper, | 2061 | error = put_user(me->signal->is_child_subreaper, |
2062 | (int __user *) arg2); | 2062 | (int __user *) arg2); |
2063 | break; | 2063 | break; |
2064 | case PR_SET_NO_NEW_PRIVS: | ||
2065 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
2066 | return -EINVAL; | ||
2067 | |||
2068 | current->no_new_privs = 1; | ||
2069 | break; | ||
2070 | case PR_GET_NO_NEW_PRIVS: | ||
2071 | if (arg2 || arg3 || arg4 || arg5) | ||
2072 | return -EINVAL; | ||
2073 | return current->no_new_privs ? 1 : 0; | ||
2064 | default: | 2074 | default: |
2065 | error = -EINVAL; | 2075 | error = -EINVAL; |
2066 | break; | 2076 | break; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 52b3a06a02f8..4ab11879aeb4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -170,7 +170,7 @@ static int proc_taint(struct ctl_table *table, int write, | |||
170 | #endif | 170 | #endif |
171 | 171 | ||
172 | #ifdef CONFIG_PRINTK | 172 | #ifdef CONFIG_PRINTK |
173 | static int proc_dmesg_restrict(struct ctl_table *table, int write, | 173 | static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, |
174 | void __user *buffer, size_t *lenp, loff_t *ppos); | 174 | void __user *buffer, size_t *lenp, loff_t *ppos); |
175 | #endif | 175 | #endif |
176 | 176 | ||
@@ -703,7 +703,7 @@ static struct ctl_table kern_table[] = { | |||
703 | .data = &dmesg_restrict, | 703 | .data = &dmesg_restrict, |
704 | .maxlen = sizeof(int), | 704 | .maxlen = sizeof(int), |
705 | .mode = 0644, | 705 | .mode = 0644, |
706 | .proc_handler = proc_dointvec_minmax, | 706 | .proc_handler = proc_dointvec_minmax_sysadmin, |
707 | .extra1 = &zero, | 707 | .extra1 = &zero, |
708 | .extra2 = &one, | 708 | .extra2 = &one, |
709 | }, | 709 | }, |
@@ -712,7 +712,7 @@ static struct ctl_table kern_table[] = { | |||
712 | .data = &kptr_restrict, | 712 | .data = &kptr_restrict, |
713 | .maxlen = sizeof(int), | 713 | .maxlen = sizeof(int), |
714 | .mode = 0644, | 714 | .mode = 0644, |
715 | .proc_handler = proc_dmesg_restrict, | 715 | .proc_handler = proc_dointvec_minmax_sysadmin, |
716 | .extra1 = &zero, | 716 | .extra1 = &zero, |
717 | .extra2 = &two, | 717 | .extra2 = &two, |
718 | }, | 718 | }, |
@@ -1943,7 +1943,7 @@ static int proc_taint(struct ctl_table *table, int write, | |||
1943 | } | 1943 | } |
1944 | 1944 | ||
1945 | #ifdef CONFIG_PRINTK | 1945 | #ifdef CONFIG_PRINTK |
1946 | static int proc_dmesg_restrict(struct ctl_table *table, int write, | 1946 | static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, |
1947 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1947 | void __user *buffer, size_t *lenp, loff_t *ppos) |
1948 | { | 1948 | { |
1949 | if (write && !capable(CAP_SYS_ADMIN)) | 1949 | if (write && !capable(CAP_SYS_ADMIN)) |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2cf9cc7aa103..a20dc8a3c949 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -1,6 +1,10 @@ | |||
1 | # | 1 | # |
2 | # Timer subsystem related configuration options | 2 | # Timer subsystem related configuration options |
3 | # | 3 | # |
4 | |||
5 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is | ||
6 | # only related to the tick functionality. Oneshot clockevent devices | ||
7 | # are supported independ of this. | ||
4 | config TICK_ONESHOT | 8 | config TICK_ONESHOT |
5 | bool | 9 | bool |
6 | 10 | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7b..aa27d391bfc8 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); | |||
59 | * If one has not already been chosen, it checks to see if a | 59 | * If one has not already been chosen, it checks to see if a |
60 | * functional rtc device is available. | 60 | * functional rtc device is available. |
61 | */ | 61 | */ |
62 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | struct rtc_device *alarmtimer_get_rtcdev(void) |
63 | { | 63 | { |
64 | unsigned long flags; | 64 | unsigned long flags; |
65 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) | |||
115 | class_interface_unregister(&alarmtimer_rtc_interface); | 115 | class_interface_unregister(&alarmtimer_rtc_interface); |
116 | } | 116 | } |
117 | #else | 117 | #else |
118 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) | 118 | struct rtc_device *alarmtimer_get_rtcdev(void) |
119 | { | 119 | { |
120 | return NULL; | 120 | return NULL; |
121 | } | 121 | } |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e883f57a3cd3..f113755695e2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -346,7 +346,8 @@ int tick_resume_broadcast(void) | |||
346 | tick_get_broadcast_mask()); | 346 | tick_get_broadcast_mask()); |
347 | break; | 347 | break; |
348 | case TICKDEV_MODE_ONESHOT: | 348 | case TICKDEV_MODE_ONESHOT: |
349 | broadcast = tick_resume_broadcast_oneshot(bc); | 349 | if (!cpumask_empty(tick_get_broadcast_mask())) |
350 | broadcast = tick_resume_broadcast_oneshot(bc); | ||
350 | break; | 351 | break; |
351 | } | 352 | } |
352 | } | 353 | } |
@@ -373,6 +374,9 @@ static int tick_broadcast_set_event(ktime_t expires, int force) | |||
373 | { | 374 | { |
374 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 375 | struct clock_event_device *bc = tick_broadcast_device.evtdev; |
375 | 376 | ||
377 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | ||
378 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
379 | |||
376 | return clockevents_program_event(bc, expires, force); | 380 | return clockevents_program_event(bc, expires, force); |
377 | } | 381 | } |
378 | 382 | ||
@@ -531,7 +535,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
531 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 535 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; |
532 | 536 | ||
533 | bc->event_handler = tick_handle_oneshot_broadcast; | 537 | bc->event_handler = tick_handle_oneshot_broadcast; |
534 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
535 | 538 | ||
536 | /* Take the do_timer update */ | 539 | /* Take the do_timer update */ |
537 | tick_do_timer_cpu = cpu; | 540 | tick_do_timer_cpu = cpu; |
@@ -549,6 +552,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
549 | to_cpumask(tmpmask)); | 552 | to_cpumask(tmpmask)); |
550 | 553 | ||
551 | if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { | 554 | if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { |
555 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | ||
552 | tick_broadcast_init_next_event(to_cpumask(tmpmask), | 556 | tick_broadcast_init_next_event(to_cpumask(tmpmask), |
553 | tick_next_period); | 557 | tick_next_period); |
554 | tick_broadcast_set_event(tick_next_period, 1); | 558 | tick_broadcast_set_event(tick_next_period, 1); |
@@ -575,15 +579,12 @@ void tick_broadcast_switch_to_oneshot(void) | |||
575 | unsigned long flags; | 579 | unsigned long flags; |
576 | 580 | ||
577 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 581 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
578 | if (cpumask_empty(tick_get_broadcast_mask())) | ||
579 | goto end; | ||
580 | 582 | ||
581 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; | 583 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; |
582 | bc = tick_broadcast_device.evtdev; | 584 | bc = tick_broadcast_device.evtdev; |
583 | if (bc) | 585 | if (bc) |
584 | tick_broadcast_setup_oneshot(bc); | 586 | tick_broadcast_setup_oneshot(bc); |
585 | 587 | ||
586 | end: | ||
587 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 588 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
588 | } | 589 | } |
589 | 590 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3526038f2836..6a3a5b9ff561 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -534,9 +534,9 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
534 | hrtimer_get_expires(&ts->sched_timer), 0)) | 534 | hrtimer_get_expires(&ts->sched_timer), 0)) |
535 | break; | 535 | break; |
536 | } | 536 | } |
537 | /* Update jiffies and reread time */ | 537 | /* Reread time and update jiffies */ |
538 | tick_do_update_jiffies64(now); | ||
539 | now = ktime_get(); | 538 | now = ktime_get(); |
539 | tick_do_update_jiffies64(now); | ||
540 | } | 540 | } |
541 | } | 541 | } |
542 | 542 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index 67316cb6a777..6ec7e7e0db43 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); | |||
861 | * | 861 | * |
862 | * mod_timer_pinned() is a way to update the expire field of an | 862 | * mod_timer_pinned() is a way to update the expire field of an |
863 | * active timer (if the timer is inactive it will be activated) | 863 | * active timer (if the timer is inactive it will be activated) |
864 | * and not allow the timer to be migrated to a different CPU. | 864 | * and to ensure that the timer is scheduled on the current CPU. |
865 | * | ||
866 | * Note that this does not prevent the timer from being migrated | ||
867 | * when the current CPU goes offline. If this is a problem for | ||
868 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
869 | * example, cancelling the timer when the corresponding CPU goes | ||
870 | * offline. | ||
865 | * | 871 | * |
866 | * mod_timer_pinned(timer, expires) is equivalent to: | 872 | * mod_timer_pinned(timer, expires) is equivalent to: |
867 | * | 873 | * |
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1102 | * warnings as well as problems when looking into | 1108 | * warnings as well as problems when looking into |
1103 | * timer->lockdep_map, make a copy and use that here. | 1109 | * timer->lockdep_map, make a copy and use that here. |
1104 | */ | 1110 | */ |
1105 | struct lockdep_map lockdep_map = timer->lockdep_map; | 1111 | struct lockdep_map lockdep_map; |
1112 | |||
1113 | lockdep_copy_map(&lockdep_map, &timer->lockdep_map); | ||
1106 | #endif | 1114 | #endif |
1107 | /* | 1115 | /* |
1108 | * Couple the lock chain with the lock chain at | 1116 | * Couple the lock chain with the lock chain at |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..f347ac91292d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,6 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE | ||
145 | select KALLSYMS | 144 | select KALLSYMS |
146 | select GENERIC_TRACER | 145 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 146 | select CONTEXT_SWITCH_TRACER |
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
272 | bool "Trace likely/unlikely profiler" | 271 | bool "Trace likely/unlikely profiler" |
273 | select TRACE_BRANCH_PROFILING | 272 | select TRACE_BRANCH_PROFILING |
274 | help | 273 | help |
275 | This tracer profiles all the the likely and unlikely macros | 274 | This tracer profiles all likely and unlikely macros |
276 | in the kernel. It will display the results in: | 275 | in the kernel. It will display the results in: |
277 | 276 | ||
278 | /sys/kernel/debug/tracing/trace_stat/branch_annotated | 277 | /sys/kernel/debug/tracing/trace_stat/branch_annotated |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5ea..b3afe0e76f79 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o | |||
41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | ||
45 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 44 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
46 | ifeq ($(CONFIG_BLOCK),y) | 45 | ifeq ($(CONFIG_BLOCK),y) |
47 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o | 46 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cdea7b56b0c9..c0bd0308741c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q) | |||
311 | } | 311 | } |
312 | EXPORT_SYMBOL_GPL(blk_trace_remove); | 312 | EXPORT_SYMBOL_GPL(blk_trace_remove); |
313 | 313 | ||
314 | static int blk_dropped_open(struct inode *inode, struct file *filp) | ||
315 | { | ||
316 | filp->private_data = inode->i_private; | ||
317 | |||
318 | return 0; | ||
319 | } | ||
320 | |||
321 | static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, | 314 | static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, |
322 | size_t count, loff_t *ppos) | 315 | size_t count, loff_t *ppos) |
323 | { | 316 | { |
@@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, | |||
331 | 324 | ||
332 | static const struct file_operations blk_dropped_fops = { | 325 | static const struct file_operations blk_dropped_fops = { |
333 | .owner = THIS_MODULE, | 326 | .owner = THIS_MODULE, |
334 | .open = blk_dropped_open, | 327 | .open = simple_open, |
335 | .read = blk_dropped_read, | 328 | .read = blk_dropped_read, |
336 | .llseek = default_llseek, | 329 | .llseek = default_llseek, |
337 | }; | 330 | }; |
338 | 331 | ||
339 | static int blk_msg_open(struct inode *inode, struct file *filp) | ||
340 | { | ||
341 | filp->private_data = inode->i_private; | ||
342 | |||
343 | return 0; | ||
344 | } | ||
345 | |||
346 | static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, | 332 | static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, |
347 | size_t count, loff_t *ppos) | 333 | size_t count, loff_t *ppos) |
348 | { | 334 | { |
@@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, | |||
371 | 357 | ||
372 | static const struct file_operations blk_msg_fops = { | 358 | static const struct file_operations blk_msg_fops = { |
373 | .owner = THIS_MODULE, | 359 | .owner = THIS_MODULE, |
374 | .open = blk_msg_open, | 360 | .open = simple_open, |
375 | .write = blk_msg_write, | 361 | .write = blk_msg_write, |
376 | .llseek = noop_llseek, | 362 | .llseek = noop_llseek, |
377 | }; | 363 | }; |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c92..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1383 | 1383 | ||
1384 | static int ftrace_cmp_recs(const void *a, const void *b) | 1384 | static int ftrace_cmp_recs(const void *a, const void *b) |
1385 | { | 1385 | { |
1386 | const struct dyn_ftrace *reca = a; | 1386 | const struct dyn_ftrace *key = a; |
1387 | const struct dyn_ftrace *recb = b; | 1387 | const struct dyn_ftrace *rec = b; |
1388 | 1388 | ||
1389 | if (reca->ip > recb->ip) | 1389 | if (key->flags < rec->ip) |
1390 | return 1; | ||
1391 | if (reca->ip < recb->ip) | ||
1392 | return -1; | 1390 | return -1; |
1391 | if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) | ||
1392 | return 1; | ||
1393 | return 0; | 1393 | return 0; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | /** | 1396 | static unsigned long ftrace_location_range(unsigned long start, unsigned long end) |
1397 | * ftrace_location - return true if the ip giving is a traced location | ||
1398 | * @ip: the instruction pointer to check | ||
1399 | * | ||
1400 | * Returns 1 if @ip given is a pointer to a ftrace location. | ||
1401 | * That is, the instruction that is either a NOP or call to | ||
1402 | * the function tracer. It checks the ftrace internal tables to | ||
1403 | * determine if the address belongs or not. | ||
1404 | */ | ||
1405 | int ftrace_location(unsigned long ip) | ||
1406 | { | 1397 | { |
1407 | struct ftrace_page *pg; | 1398 | struct ftrace_page *pg; |
1408 | struct dyn_ftrace *rec; | 1399 | struct dyn_ftrace *rec; |
1409 | struct dyn_ftrace key; | 1400 | struct dyn_ftrace key; |
1410 | 1401 | ||
1411 | key.ip = ip; | 1402 | key.ip = start; |
1403 | key.flags = end; /* overload flags, as it is unsigned long */ | ||
1412 | 1404 | ||
1413 | for (pg = ftrace_pages_start; pg; pg = pg->next) { | 1405 | for (pg = ftrace_pages_start; pg; pg = pg->next) { |
1406 | if (end < pg->records[0].ip || | ||
1407 | start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) | ||
1408 | continue; | ||
1414 | rec = bsearch(&key, pg->records, pg->index, | 1409 | rec = bsearch(&key, pg->records, pg->index, |
1415 | sizeof(struct dyn_ftrace), | 1410 | sizeof(struct dyn_ftrace), |
1416 | ftrace_cmp_recs); | 1411 | ftrace_cmp_recs); |
1417 | if (rec) | 1412 | if (rec) |
1418 | return 1; | 1413 | return rec->ip; |
1419 | } | 1414 | } |
1420 | 1415 | ||
1421 | return 0; | 1416 | return 0; |
1422 | } | 1417 | } |
1423 | 1418 | ||
1419 | /** | ||
1420 | * ftrace_location - return true if the ip giving is a traced location | ||
1421 | * @ip: the instruction pointer to check | ||
1422 | * | ||
1423 | * Returns rec->ip if @ip given is a pointer to a ftrace location. | ||
1424 | * That is, the instruction that is either a NOP or call to | ||
1425 | * the function tracer. It checks the ftrace internal tables to | ||
1426 | * determine if the address belongs or not. | ||
1427 | */ | ||
1428 | unsigned long ftrace_location(unsigned long ip) | ||
1429 | { | ||
1430 | return ftrace_location_range(ip, ip); | ||
1431 | } | ||
1432 | |||
1433 | /** | ||
1434 | * ftrace_text_reserved - return true if range contains an ftrace location | ||
1435 | * @start: start of range to search | ||
1436 | * @end: end of range to search (inclusive). @end points to the last byte to check. | ||
1437 | * | ||
1438 | * Returns 1 if @start and @end contains a ftrace location. | ||
1439 | * That is, the instruction that is either a NOP or call to | ||
1440 | * the function tracer. It checks the ftrace internal tables to | ||
1441 | * determine if the address belongs or not. | ||
1442 | */ | ||
1443 | int ftrace_text_reserved(void *start, void *end) | ||
1444 | { | ||
1445 | unsigned long ret; | ||
1446 | |||
1447 | ret = ftrace_location_range((unsigned long)start, | ||
1448 | (unsigned long)end); | ||
1449 | |||
1450 | return (int)!!ret; | ||
1451 | } | ||
1452 | |||
1424 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1453 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1425 | int filter_hash, | 1454 | int filter_hash, |
1426 | bool inc) | 1455 | bool inc) |
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
1520 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1549 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
1521 | } | 1550 | } |
1522 | 1551 | ||
1523 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) | ||
1524 | { | ||
1525 | if (ftrace_pages->index == ftrace_pages->size) { | ||
1526 | /* We should have allocated enough */ | ||
1527 | if (WARN_ON(!ftrace_pages->next)) | ||
1528 | return NULL; | ||
1529 | ftrace_pages = ftrace_pages->next; | ||
1530 | } | ||
1531 | |||
1532 | return &ftrace_pages->records[ftrace_pages->index++]; | ||
1533 | } | ||
1534 | |||
1535 | static struct dyn_ftrace * | ||
1536 | ftrace_record_ip(unsigned long ip) | ||
1537 | { | ||
1538 | struct dyn_ftrace *rec; | ||
1539 | |||
1540 | if (ftrace_disabled) | ||
1541 | return NULL; | ||
1542 | |||
1543 | rec = ftrace_alloc_dyn_node(ip); | ||
1544 | if (!rec) | ||
1545 | return NULL; | ||
1546 | |||
1547 | rec->ip = ip; | ||
1548 | |||
1549 | return rec; | ||
1550 | } | ||
1551 | |||
1552 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1552 | static void print_ip_ins(const char *fmt, unsigned char *p) |
1553 | { | 1553 | { |
1554 | int i; | 1554 | int i; |
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) | |||
1598 | } | 1598 | } |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | |||
1602 | /* Return 1 if the address range is reserved for ftrace */ | ||
1603 | int ftrace_text_reserved(void *start, void *end) | ||
1604 | { | ||
1605 | struct dyn_ftrace *rec; | ||
1606 | struct ftrace_page *pg; | ||
1607 | |||
1608 | do_for_each_ftrace_rec(pg, rec) { | ||
1609 | if (rec->ip <= (unsigned long)end && | ||
1610 | rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) | ||
1611 | return 1; | ||
1612 | } while_for_each_ftrace_rec(); | ||
1613 | return 0; | ||
1614 | } | ||
1615 | |||
1616 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1601 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
1617 | { | 1602 | { |
1618 | unsigned long flag = 0UL; | 1603 | unsigned long flag = 0UL; |
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1698 | return -1; /* unknow ftrace bug */ | 1683 | return -1; /* unknow ftrace bug */ |
1699 | } | 1684 | } |
1700 | 1685 | ||
1701 | static void ftrace_replace_code(int update) | 1686 | void __weak ftrace_replace_code(int enable) |
1702 | { | 1687 | { |
1703 | struct dyn_ftrace *rec; | 1688 | struct dyn_ftrace *rec; |
1704 | struct ftrace_page *pg; | 1689 | struct ftrace_page *pg; |
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) | |||
1708 | return; | 1693 | return; |
1709 | 1694 | ||
1710 | do_for_each_ftrace_rec(pg, rec) { | 1695 | do_for_each_ftrace_rec(pg, rec) { |
1711 | failed = __ftrace_replace_code(rec, update); | 1696 | failed = __ftrace_replace_code(rec, enable); |
1712 | if (failed) { | 1697 | if (failed) { |
1713 | ftrace_bug(failed, rec->ip); | 1698 | ftrace_bug(failed, rec->ip); |
1714 | /* Stop processing */ | 1699 | /* Stop processing */ |
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
1826 | return 0; | 1811 | return 0; |
1827 | } | 1812 | } |
1828 | 1813 | ||
1829 | static int __ftrace_modify_code(void *data) | 1814 | void ftrace_modify_all_code(int command) |
1830 | { | 1815 | { |
1831 | int *command = data; | 1816 | if (command & FTRACE_UPDATE_CALLS) |
1832 | |||
1833 | if (*command & FTRACE_UPDATE_CALLS) | ||
1834 | ftrace_replace_code(1); | 1817 | ftrace_replace_code(1); |
1835 | else if (*command & FTRACE_DISABLE_CALLS) | 1818 | else if (command & FTRACE_DISABLE_CALLS) |
1836 | ftrace_replace_code(0); | 1819 | ftrace_replace_code(0); |
1837 | 1820 | ||
1838 | if (*command & FTRACE_UPDATE_TRACE_FUNC) | 1821 | if (command & FTRACE_UPDATE_TRACE_FUNC) |
1839 | ftrace_update_ftrace_func(ftrace_trace_function); | 1822 | ftrace_update_ftrace_func(ftrace_trace_function); |
1840 | 1823 | ||
1841 | if (*command & FTRACE_START_FUNC_RET) | 1824 | if (command & FTRACE_START_FUNC_RET) |
1842 | ftrace_enable_ftrace_graph_caller(); | 1825 | ftrace_enable_ftrace_graph_caller(); |
1843 | else if (*command & FTRACE_STOP_FUNC_RET) | 1826 | else if (command & FTRACE_STOP_FUNC_RET) |
1844 | ftrace_disable_ftrace_graph_caller(); | 1827 | ftrace_disable_ftrace_graph_caller(); |
1828 | } | ||
1829 | |||
1830 | static int __ftrace_modify_code(void *data) | ||
1831 | { | ||
1832 | int *command = data; | ||
1833 | |||
1834 | ftrace_modify_all_code(*command); | ||
1845 | 1835 | ||
1846 | return 0; | 1836 | return 0; |
1847 | } | 1837 | } |
@@ -2469,57 +2459,35 @@ static int | |||
2469 | ftrace_avail_open(struct inode *inode, struct file *file) | 2459 | ftrace_avail_open(struct inode *inode, struct file *file) |
2470 | { | 2460 | { |
2471 | struct ftrace_iterator *iter; | 2461 | struct ftrace_iterator *iter; |
2472 | int ret; | ||
2473 | 2462 | ||
2474 | if (unlikely(ftrace_disabled)) | 2463 | if (unlikely(ftrace_disabled)) |
2475 | return -ENODEV; | 2464 | return -ENODEV; |
2476 | 2465 | ||
2477 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2466 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2478 | if (!iter) | 2467 | if (iter) { |
2479 | return -ENOMEM; | 2468 | iter->pg = ftrace_pages_start; |
2480 | 2469 | iter->ops = &global_ops; | |
2481 | iter->pg = ftrace_pages_start; | ||
2482 | iter->ops = &global_ops; | ||
2483 | |||
2484 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2485 | if (!ret) { | ||
2486 | struct seq_file *m = file->private_data; | ||
2487 | |||
2488 | m->private = iter; | ||
2489 | } else { | ||
2490 | kfree(iter); | ||
2491 | } | 2470 | } |
2492 | 2471 | ||
2493 | return ret; | 2472 | return iter ? 0 : -ENOMEM; |
2494 | } | 2473 | } |
2495 | 2474 | ||
2496 | static int | 2475 | static int |
2497 | ftrace_enabled_open(struct inode *inode, struct file *file) | 2476 | ftrace_enabled_open(struct inode *inode, struct file *file) |
2498 | { | 2477 | { |
2499 | struct ftrace_iterator *iter; | 2478 | struct ftrace_iterator *iter; |
2500 | int ret; | ||
2501 | 2479 | ||
2502 | if (unlikely(ftrace_disabled)) | 2480 | if (unlikely(ftrace_disabled)) |
2503 | return -ENODEV; | 2481 | return -ENODEV; |
2504 | 2482 | ||
2505 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2483 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2506 | if (!iter) | 2484 | if (iter) { |
2507 | return -ENOMEM; | 2485 | iter->pg = ftrace_pages_start; |
2508 | 2486 | iter->flags = FTRACE_ITER_ENABLED; | |
2509 | iter->pg = ftrace_pages_start; | 2487 | iter->ops = &global_ops; |
2510 | iter->flags = FTRACE_ITER_ENABLED; | ||
2511 | iter->ops = &global_ops; | ||
2512 | |||
2513 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2514 | if (!ret) { | ||
2515 | struct seq_file *m = file->private_data; | ||
2516 | |||
2517 | m->private = iter; | ||
2518 | } else { | ||
2519 | kfree(iter); | ||
2520 | } | 2488 | } |
2521 | 2489 | ||
2522 | return ret; | 2490 | return iter ? 0 : -ENOMEM; |
2523 | } | 2491 | } |
2524 | 2492 | ||
2525 | static void ftrace_filter_reset(struct ftrace_hash *hash) | 2493 | static void ftrace_filter_reset(struct ftrace_hash *hash) |
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3688 | return 0; | 3656 | return 0; |
3689 | } | 3657 | } |
3690 | 3658 | ||
3691 | static void ftrace_swap_recs(void *a, void *b, int size) | 3659 | static int ftrace_cmp_ips(const void *a, const void *b) |
3660 | { | ||
3661 | const unsigned long *ipa = a; | ||
3662 | const unsigned long *ipb = b; | ||
3663 | |||
3664 | if (*ipa > *ipb) | ||
3665 | return 1; | ||
3666 | if (*ipa < *ipb) | ||
3667 | return -1; | ||
3668 | return 0; | ||
3669 | } | ||
3670 | |||
3671 | static void ftrace_swap_ips(void *a, void *b, int size) | ||
3692 | { | 3672 | { |
3693 | struct dyn_ftrace *reca = a; | 3673 | unsigned long *ipa = a; |
3694 | struct dyn_ftrace *recb = b; | 3674 | unsigned long *ipb = b; |
3695 | struct dyn_ftrace t; | 3675 | unsigned long t; |
3696 | 3676 | ||
3697 | t = *reca; | 3677 | t = *ipa; |
3698 | *reca = *recb; | 3678 | *ipa = *ipb; |
3699 | *recb = t; | 3679 | *ipb = t; |
3700 | } | 3680 | } |
3701 | 3681 | ||
3702 | static int ftrace_process_locs(struct module *mod, | 3682 | static int ftrace_process_locs(struct module *mod, |
3703 | unsigned long *start, | 3683 | unsigned long *start, |
3704 | unsigned long *end) | 3684 | unsigned long *end) |
3705 | { | 3685 | { |
3686 | struct ftrace_page *start_pg; | ||
3706 | struct ftrace_page *pg; | 3687 | struct ftrace_page *pg; |
3688 | struct dyn_ftrace *rec; | ||
3707 | unsigned long count; | 3689 | unsigned long count; |
3708 | unsigned long *p; | 3690 | unsigned long *p; |
3709 | unsigned long addr; | 3691 | unsigned long addr; |
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3715 | if (!count) | 3697 | if (!count) |
3716 | return 0; | 3698 | return 0; |
3717 | 3699 | ||
3718 | pg = ftrace_allocate_pages(count); | 3700 | sort(start, count, sizeof(*start), |
3719 | if (!pg) | 3701 | ftrace_cmp_ips, ftrace_swap_ips); |
3702 | |||
3703 | start_pg = ftrace_allocate_pages(count); | ||
3704 | if (!start_pg) | ||
3720 | return -ENOMEM; | 3705 | return -ENOMEM; |
3721 | 3706 | ||
3722 | mutex_lock(&ftrace_lock); | 3707 | mutex_lock(&ftrace_lock); |
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, | |||
3729 | if (!mod) { | 3714 | if (!mod) { |
3730 | WARN_ON(ftrace_pages || ftrace_pages_start); | 3715 | WARN_ON(ftrace_pages || ftrace_pages_start); |
3731 | /* First initialization */ | 3716 | /* First initialization */ |
3732 | ftrace_pages = ftrace_pages_start = pg; | 3717 | ftrace_pages = ftrace_pages_start = start_pg; |
3733 | } else { | 3718 | } else { |
3734 | if (!ftrace_pages) | 3719 | if (!ftrace_pages) |
3735 | goto out; | 3720 | goto out; |
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3740 | ftrace_pages = ftrace_pages->next; | 3725 | ftrace_pages = ftrace_pages->next; |
3741 | } | 3726 | } |
3742 | 3727 | ||
3743 | ftrace_pages->next = pg; | 3728 | ftrace_pages->next = start_pg; |
3744 | ftrace_pages = pg; | ||
3745 | } | 3729 | } |
3746 | 3730 | ||
3747 | p = start; | 3731 | p = start; |
3732 | pg = start_pg; | ||
3748 | while (p < end) { | 3733 | while (p < end) { |
3749 | addr = ftrace_call_adjust(*p++); | 3734 | addr = ftrace_call_adjust(*p++); |
3750 | /* | 3735 | /* |
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, | |||
3755 | */ | 3740 | */ |
3756 | if (!addr) | 3741 | if (!addr) |
3757 | continue; | 3742 | continue; |
3758 | if (!ftrace_record_ip(addr)) | 3743 | |
3759 | break; | 3744 | if (pg->index == pg->size) { |
3745 | /* We should have allocated enough */ | ||
3746 | if (WARN_ON(!pg->next)) | ||
3747 | break; | ||
3748 | pg = pg->next; | ||
3749 | } | ||
3750 | |||
3751 | rec = &pg->records[pg->index++]; | ||
3752 | rec->ip = addr; | ||
3760 | } | 3753 | } |
3761 | 3754 | ||
3762 | /* These new locations need to be initialized */ | 3755 | /* We should have used all pages */ |
3763 | ftrace_new_pgs = pg; | 3756 | WARN_ON(pg->next); |
3757 | |||
3758 | /* Assign the last page to ftrace_pages */ | ||
3759 | ftrace_pages = pg; | ||
3764 | 3760 | ||
3765 | /* Make each individual set of pages sorted by ips */ | 3761 | /* These new locations need to be initialized */ |
3766 | for (; pg; pg = pg->next) | 3762 | ftrace_new_pgs = start_pg; |
3767 | sort(pg->records, pg->index, sizeof(struct dyn_ftrace), | ||
3768 | ftrace_cmp_recs, ftrace_swap_recs); | ||
3769 | 3763 | ||
3770 | /* | 3764 | /* |
3771 | * We only need to disable interrupts on start up | 3765 | * We only need to disable interrupts on start up |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91efd..6420cda62336 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -23,6 +23,8 @@ | |||
23 | #include <asm/local.h> | 23 | #include <asm/local.h> |
24 | #include "trace.h" | 24 | #include "trace.h" |
25 | 25 | ||
26 | static void update_pages_handler(struct work_struct *work); | ||
27 | |||
26 | /* | 28 | /* |
27 | * The ring buffer header is special. We must manually up keep it. | 29 | * The ring buffer header is special. We must manually up keep it. |
28 | */ | 30 | */ |
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu { | |||
449 | raw_spinlock_t reader_lock; /* serialize readers */ | 451 | raw_spinlock_t reader_lock; /* serialize readers */ |
450 | arch_spinlock_t lock; | 452 | arch_spinlock_t lock; |
451 | struct lock_class_key lock_key; | 453 | struct lock_class_key lock_key; |
454 | unsigned int nr_pages; | ||
452 | struct list_head *pages; | 455 | struct list_head *pages; |
453 | struct buffer_page *head_page; /* read from head */ | 456 | struct buffer_page *head_page; /* read from head */ |
454 | struct buffer_page *tail_page; /* write to tail */ | 457 | struct buffer_page *tail_page; /* write to tail */ |
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu { | |||
466 | unsigned long read_bytes; | 469 | unsigned long read_bytes; |
467 | u64 write_stamp; | 470 | u64 write_stamp; |
468 | u64 read_stamp; | 471 | u64 read_stamp; |
472 | /* ring buffer pages to update, > 0 to add, < 0 to remove */ | ||
473 | int nr_pages_to_update; | ||
474 | struct list_head new_pages; /* new pages to add */ | ||
475 | struct work_struct update_pages_work; | ||
476 | struct completion update_done; | ||
469 | }; | 477 | }; |
470 | 478 | ||
471 | struct ring_buffer { | 479 | struct ring_buffer { |
472 | unsigned pages; | ||
473 | unsigned flags; | 480 | unsigned flags; |
474 | int cpus; | 481 | int cpus; |
475 | atomic_t record_disabled; | 482 | atomic_t record_disabled; |
483 | atomic_t resize_disabled; | ||
476 | cpumask_var_t cpumask; | 484 | cpumask_var_t cpumask; |
477 | 485 | ||
478 | struct lock_class_key *reader_lock_key; | 486 | struct lock_class_key *reader_lock_key; |
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
937 | struct list_head *head = cpu_buffer->pages; | 945 | struct list_head *head = cpu_buffer->pages; |
938 | struct buffer_page *bpage, *tmp; | 946 | struct buffer_page *bpage, *tmp; |
939 | 947 | ||
948 | /* Reset the head page if it exists */ | ||
949 | if (cpu_buffer->head_page) | ||
950 | rb_set_head_page(cpu_buffer); | ||
951 | |||
940 | rb_head_page_deactivate(cpu_buffer); | 952 | rb_head_page_deactivate(cpu_buffer); |
941 | 953 | ||
942 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) | 954 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) |
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
963 | return 0; | 975 | return 0; |
964 | } | 976 | } |
965 | 977 | ||
966 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | 978 | static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) |
967 | unsigned nr_pages) | ||
968 | { | 979 | { |
980 | int i; | ||
969 | struct buffer_page *bpage, *tmp; | 981 | struct buffer_page *bpage, *tmp; |
970 | LIST_HEAD(pages); | ||
971 | unsigned i; | ||
972 | |||
973 | WARN_ON(!nr_pages); | ||
974 | 982 | ||
975 | for (i = 0; i < nr_pages; i++) { | 983 | for (i = 0; i < nr_pages; i++) { |
976 | struct page *page; | 984 | struct page *page; |
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
981 | */ | 989 | */ |
982 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 990 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
983 | GFP_KERNEL | __GFP_NORETRY, | 991 | GFP_KERNEL | __GFP_NORETRY, |
984 | cpu_to_node(cpu_buffer->cpu)); | 992 | cpu_to_node(cpu)); |
985 | if (!bpage) | 993 | if (!bpage) |
986 | goto free_pages; | 994 | goto free_pages; |
987 | 995 | ||
988 | rb_check_bpage(cpu_buffer, bpage); | 996 | list_add(&bpage->list, pages); |
989 | 997 | ||
990 | list_add(&bpage->list, &pages); | 998 | page = alloc_pages_node(cpu_to_node(cpu), |
991 | |||
992 | page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), | ||
993 | GFP_KERNEL | __GFP_NORETRY, 0); | 999 | GFP_KERNEL | __GFP_NORETRY, 0); |
994 | if (!page) | 1000 | if (!page) |
995 | goto free_pages; | 1001 | goto free_pages; |
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
997 | rb_init_page(bpage->page); | 1003 | rb_init_page(bpage->page); |
998 | } | 1004 | } |
999 | 1005 | ||
1006 | return 0; | ||
1007 | |||
1008 | free_pages: | ||
1009 | list_for_each_entry_safe(bpage, tmp, pages, list) { | ||
1010 | list_del_init(&bpage->list); | ||
1011 | free_buffer_page(bpage); | ||
1012 | } | ||
1013 | |||
1014 | return -ENOMEM; | ||
1015 | } | ||
1016 | |||
1017 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | ||
1018 | unsigned nr_pages) | ||
1019 | { | ||
1020 | LIST_HEAD(pages); | ||
1021 | |||
1022 | WARN_ON(!nr_pages); | ||
1023 | |||
1024 | if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1000 | /* | 1027 | /* |
1001 | * The ring buffer page list is a circular list that does not | 1028 | * The ring buffer page list is a circular list that does not |
1002 | * start and end with a list head. All page list items point to | 1029 | * start and end with a list head. All page list items point to |
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1005 | cpu_buffer->pages = pages.next; | 1032 | cpu_buffer->pages = pages.next; |
1006 | list_del(&pages); | 1033 | list_del(&pages); |
1007 | 1034 | ||
1035 | cpu_buffer->nr_pages = nr_pages; | ||
1036 | |||
1008 | rb_check_pages(cpu_buffer); | 1037 | rb_check_pages(cpu_buffer); |
1009 | 1038 | ||
1010 | return 0; | 1039 | return 0; |
1011 | |||
1012 | free_pages: | ||
1013 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | ||
1014 | list_del_init(&bpage->list); | ||
1015 | free_buffer_page(bpage); | ||
1016 | } | ||
1017 | return -ENOMEM; | ||
1018 | } | 1040 | } |
1019 | 1041 | ||
1020 | static struct ring_buffer_per_cpu * | 1042 | static struct ring_buffer_per_cpu * |
1021 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | 1043 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) |
1022 | { | 1044 | { |
1023 | struct ring_buffer_per_cpu *cpu_buffer; | 1045 | struct ring_buffer_per_cpu *cpu_buffer; |
1024 | struct buffer_page *bpage; | 1046 | struct buffer_page *bpage; |
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1035 | raw_spin_lock_init(&cpu_buffer->reader_lock); | 1057 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1036 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1058 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1037 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1059 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1060 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); | ||
1061 | init_completion(&cpu_buffer->update_done); | ||
1038 | 1062 | ||
1039 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1063 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1040 | GFP_KERNEL, cpu_to_node(cpu)); | 1064 | GFP_KERNEL, cpu_to_node(cpu)); |
@@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1052 | 1076 | ||
1053 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
1054 | 1078 | ||
1055 | ret = rb_allocate_pages(cpu_buffer, buffer->pages); | 1079 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
1056 | if (ret < 0) | 1080 | if (ret < 0) |
1057 | goto fail_free_reader; | 1081 | goto fail_free_reader; |
1058 | 1082 | ||
@@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1113 | { | 1137 | { |
1114 | struct ring_buffer *buffer; | 1138 | struct ring_buffer *buffer; |
1115 | int bsize; | 1139 | int bsize; |
1116 | int cpu; | 1140 | int cpu, nr_pages; |
1117 | 1141 | ||
1118 | /* keep it in its own cache line */ | 1142 | /* keep it in its own cache line */ |
1119 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), | 1143 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), |
@@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1124 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) | 1148 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) |
1125 | goto fail_free_buffer; | 1149 | goto fail_free_buffer; |
1126 | 1150 | ||
1127 | buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1151 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1128 | buffer->flags = flags; | 1152 | buffer->flags = flags; |
1129 | buffer->clock = trace_clock_local; | 1153 | buffer->clock = trace_clock_local; |
1130 | buffer->reader_lock_key = key; | 1154 | buffer->reader_lock_key = key; |
1131 | 1155 | ||
1132 | /* need at least two pages */ | 1156 | /* need at least two pages */ |
1133 | if (buffer->pages < 2) | 1157 | if (nr_pages < 2) |
1134 | buffer->pages = 2; | 1158 | nr_pages = 2; |
1135 | 1159 | ||
1136 | /* | 1160 | /* |
1137 | * In case of non-hotplug cpu, if the ring-buffer is allocated | 1161 | * In case of non-hotplug cpu, if the ring-buffer is allocated |
@@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1154 | 1178 | ||
1155 | for_each_buffer_cpu(buffer, cpu) { | 1179 | for_each_buffer_cpu(buffer, cpu) { |
1156 | buffer->buffers[cpu] = | 1180 | buffer->buffers[cpu] = |
1157 | rb_allocate_cpu_buffer(buffer, cpu); | 1181 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
1158 | if (!buffer->buffers[cpu]) | 1182 | if (!buffer->buffers[cpu]) |
1159 | goto fail_free_buffers; | 1183 | goto fail_free_buffers; |
1160 | } | 1184 | } |
@@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, | |||
1222 | 1246 | ||
1223 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); | 1247 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); |
1224 | 1248 | ||
1225 | static void | 1249 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) |
1226 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | ||
1227 | { | 1250 | { |
1228 | struct buffer_page *bpage; | 1251 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1229 | struct list_head *p; | 1252 | } |
1230 | unsigned i; | 1253 | |
1254 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1255 | { | ||
1256 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1257 | } | ||
1258 | |||
1259 | static int | ||
1260 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | ||
1261 | { | ||
1262 | struct list_head *tail_page, *to_remove, *next_page; | ||
1263 | struct buffer_page *to_remove_page, *tmp_iter_page; | ||
1264 | struct buffer_page *last_page, *first_page; | ||
1265 | unsigned int nr_removed; | ||
1266 | unsigned long head_bit; | ||
1267 | int page_entries; | ||
1268 | |||
1269 | head_bit = 0; | ||
1231 | 1270 | ||
1232 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1271 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1233 | rb_head_page_deactivate(cpu_buffer); | 1272 | atomic_inc(&cpu_buffer->record_disabled); |
1273 | /* | ||
1274 | * We don't race with the readers since we have acquired the reader | ||
1275 | * lock. We also don't race with writers after disabling recording. | ||
1276 | * This makes it easy to figure out the first and the last page to be | ||
1277 | * removed from the list. We unlink all the pages in between including | ||
1278 | * the first and last pages. This is done in a busy loop so that we | ||
1279 | * lose the least number of traces. | ||
1280 | * The pages are freed after we restart recording and unlock readers. | ||
1281 | */ | ||
1282 | tail_page = &cpu_buffer->tail_page->list; | ||
1234 | 1283 | ||
1235 | for (i = 0; i < nr_pages; i++) { | 1284 | /* |
1236 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1285 | * tail page might be on reader page, we remove the next page |
1237 | goto out; | 1286 | * from the ring buffer |
1238 | p = cpu_buffer->pages->next; | 1287 | */ |
1239 | bpage = list_entry(p, struct buffer_page, list); | 1288 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) |
1240 | list_del_init(&bpage->list); | 1289 | tail_page = rb_list_head(tail_page->next); |
1241 | free_buffer_page(bpage); | 1290 | to_remove = tail_page; |
1291 | |||
1292 | /* start of pages to remove */ | ||
1293 | first_page = list_entry(rb_list_head(to_remove->next), | ||
1294 | struct buffer_page, list); | ||
1295 | |||
1296 | for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { | ||
1297 | to_remove = rb_list_head(to_remove)->next; | ||
1298 | head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; | ||
1242 | } | 1299 | } |
1243 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | ||
1244 | goto out; | ||
1245 | 1300 | ||
1246 | rb_reset_cpu(cpu_buffer); | 1301 | next_page = rb_list_head(to_remove)->next; |
1247 | rb_check_pages(cpu_buffer); | ||
1248 | 1302 | ||
1249 | out: | 1303 | /* |
1304 | * Now we remove all pages between tail_page and next_page. | ||
1305 | * Make sure that we have head_bit value preserved for the | ||
1306 | * next page | ||
1307 | */ | ||
1308 | tail_page->next = (struct list_head *)((unsigned long)next_page | | ||
1309 | head_bit); | ||
1310 | next_page = rb_list_head(next_page); | ||
1311 | next_page->prev = tail_page; | ||
1312 | |||
1313 | /* make sure pages points to a valid page in the ring buffer */ | ||
1314 | cpu_buffer->pages = next_page; | ||
1315 | |||
1316 | /* update head page */ | ||
1317 | if (head_bit) | ||
1318 | cpu_buffer->head_page = list_entry(next_page, | ||
1319 | struct buffer_page, list); | ||
1320 | |||
1321 | /* | ||
1322 | * change read pointer to make sure any read iterators reset | ||
1323 | * themselves | ||
1324 | */ | ||
1325 | cpu_buffer->read = 0; | ||
1326 | |||
1327 | /* pages are removed, resume tracing and then free the pages */ | ||
1328 | atomic_dec(&cpu_buffer->record_disabled); | ||
1250 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1329 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1330 | |||
1331 | RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); | ||
1332 | |||
1333 | /* last buffer page to remove */ | ||
1334 | last_page = list_entry(rb_list_head(to_remove), struct buffer_page, | ||
1335 | list); | ||
1336 | tmp_iter_page = first_page; | ||
1337 | |||
1338 | do { | ||
1339 | to_remove_page = tmp_iter_page; | ||
1340 | rb_inc_page(cpu_buffer, &tmp_iter_page); | ||
1341 | |||
1342 | /* update the counters */ | ||
1343 | page_entries = rb_page_entries(to_remove_page); | ||
1344 | if (page_entries) { | ||
1345 | /* | ||
1346 | * If something was added to this page, it was full | ||
1347 | * since it is not the tail page. So we deduct the | ||
1348 | * bytes consumed in ring buffer from here. | ||
1349 | * No need to update overruns, since this page is | ||
1350 | * deleted from ring buffer and its entries are | ||
1351 | * already accounted for. | ||
1352 | */ | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * We have already removed references to this list item, just | ||
1358 | * free up the buffer_page and its page | ||
1359 | */ | ||
1360 | free_buffer_page(to_remove_page); | ||
1361 | nr_removed--; | ||
1362 | |||
1363 | } while (to_remove_page != last_page); | ||
1364 | |||
1365 | RB_WARN_ON(cpu_buffer, nr_removed); | ||
1366 | |||
1367 | return nr_removed == 0; | ||
1251 | } | 1368 | } |
1252 | 1369 | ||
1253 | static void | 1370 | static int |
1254 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | 1371 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) |
1255 | struct list_head *pages, unsigned nr_pages) | ||
1256 | { | 1372 | { |
1257 | struct buffer_page *bpage; | 1373 | struct list_head *pages = &cpu_buffer->new_pages; |
1258 | struct list_head *p; | 1374 | int retries, success; |
1259 | unsigned i; | ||
1260 | 1375 | ||
1261 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1376 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1262 | rb_head_page_deactivate(cpu_buffer); | 1377 | /* |
1378 | * We are holding the reader lock, so the reader page won't be swapped | ||
1379 | * in the ring buffer. Now we are racing with the writer trying to | ||
1380 | * move head page and the tail page. | ||
1381 | * We are going to adapt the reader page update process where: | ||
1382 | * 1. We first splice the start and end of list of new pages between | ||
1383 | * the head page and its previous page. | ||
1384 | * 2. We cmpxchg the prev_page->next to point from head page to the | ||
1385 | * start of new pages list. | ||
1386 | * 3. Finally, we update the head->prev to the end of new list. | ||
1387 | * | ||
1388 | * We will try this process 10 times, to make sure that we don't keep | ||
1389 | * spinning. | ||
1390 | */ | ||
1391 | retries = 10; | ||
1392 | success = 0; | ||
1393 | while (retries--) { | ||
1394 | struct list_head *head_page, *prev_page, *r; | ||
1395 | struct list_head *last_page, *first_page; | ||
1396 | struct list_head *head_page_with_bit; | ||
1263 | 1397 | ||
1264 | for (i = 0; i < nr_pages; i++) { | 1398 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1265 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1399 | prev_page = head_page->prev; |
1266 | goto out; | 1400 | |
1267 | p = pages->next; | 1401 | first_page = pages->next; |
1268 | bpage = list_entry(p, struct buffer_page, list); | 1402 | last_page = pages->prev; |
1269 | list_del_init(&bpage->list); | 1403 | |
1270 | list_add_tail(&bpage->list, cpu_buffer->pages); | 1404 | head_page_with_bit = (struct list_head *) |
1405 | ((unsigned long)head_page | RB_PAGE_HEAD); | ||
1406 | |||
1407 | last_page->next = head_page_with_bit; | ||
1408 | first_page->prev = prev_page; | ||
1409 | |||
1410 | r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); | ||
1411 | |||
1412 | if (r == head_page_with_bit) { | ||
1413 | /* | ||
1414 | * yay, we replaced the page pointer to our new list, | ||
1415 | * now, we just have to update to head page's prev | ||
1416 | * pointer to point to end of list | ||
1417 | */ | ||
1418 | head_page->prev = last_page; | ||
1419 | success = 1; | ||
1420 | break; | ||
1421 | } | ||
1271 | } | 1422 | } |
1272 | rb_reset_cpu(cpu_buffer); | ||
1273 | rb_check_pages(cpu_buffer); | ||
1274 | 1423 | ||
1275 | out: | 1424 | if (success) |
1425 | INIT_LIST_HEAD(pages); | ||
1426 | /* | ||
1427 | * If we weren't successful in adding in new pages, warn and stop | ||
1428 | * tracing | ||
1429 | */ | ||
1430 | RB_WARN_ON(cpu_buffer, !success); | ||
1276 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1431 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1432 | |||
1433 | /* free pages if they weren't inserted */ | ||
1434 | if (!success) { | ||
1435 | struct buffer_page *bpage, *tmp; | ||
1436 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, | ||
1437 | list) { | ||
1438 | list_del_init(&bpage->list); | ||
1439 | free_buffer_page(bpage); | ||
1440 | } | ||
1441 | } | ||
1442 | return success; | ||
1443 | } | ||
1444 | |||
1445 | static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) | ||
1446 | { | ||
1447 | int success; | ||
1448 | |||
1449 | if (cpu_buffer->nr_pages_to_update > 0) | ||
1450 | success = rb_insert_pages(cpu_buffer); | ||
1451 | else | ||
1452 | success = rb_remove_pages(cpu_buffer, | ||
1453 | -cpu_buffer->nr_pages_to_update); | ||
1454 | |||
1455 | if (success) | ||
1456 | cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; | ||
1457 | } | ||
1458 | |||
1459 | static void update_pages_handler(struct work_struct *work) | ||
1460 | { | ||
1461 | struct ring_buffer_per_cpu *cpu_buffer = container_of(work, | ||
1462 | struct ring_buffer_per_cpu, update_pages_work); | ||
1463 | rb_update_pages(cpu_buffer); | ||
1464 | complete(&cpu_buffer->update_done); | ||
1277 | } | 1465 | } |
1278 | 1466 | ||
1279 | /** | 1467 | /** |
@@ -1283,16 +1471,14 @@ out: | |||
1283 | * | 1471 | * |
1284 | * Minimum size is 2 * BUF_PAGE_SIZE. | 1472 | * Minimum size is 2 * BUF_PAGE_SIZE. |
1285 | * | 1473 | * |
1286 | * Returns -1 on failure. | 1474 | * Returns 0 on success and < 0 on failure. |
1287 | */ | 1475 | */ |
1288 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | 1476 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, |
1477 | int cpu_id) | ||
1289 | { | 1478 | { |
1290 | struct ring_buffer_per_cpu *cpu_buffer; | 1479 | struct ring_buffer_per_cpu *cpu_buffer; |
1291 | unsigned nr_pages, rm_pages, new_pages; | 1480 | unsigned nr_pages; |
1292 | struct buffer_page *bpage, *tmp; | 1481 | int cpu, err = 0; |
1293 | unsigned long buffer_size; | ||
1294 | LIST_HEAD(pages); | ||
1295 | int i, cpu; | ||
1296 | 1482 | ||
1297 | /* | 1483 | /* |
1298 | * Always succeed at resizing a non-existent buffer: | 1484 | * Always succeed at resizing a non-existent buffer: |
@@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1302 | 1488 | ||
1303 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1489 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1304 | size *= BUF_PAGE_SIZE; | 1490 | size *= BUF_PAGE_SIZE; |
1305 | buffer_size = buffer->pages * BUF_PAGE_SIZE; | ||
1306 | 1491 | ||
1307 | /* we need a minimum of two pages */ | 1492 | /* we need a minimum of two pages */ |
1308 | if (size < BUF_PAGE_SIZE * 2) | 1493 | if (size < BUF_PAGE_SIZE * 2) |
1309 | size = BUF_PAGE_SIZE * 2; | 1494 | size = BUF_PAGE_SIZE * 2; |
1310 | 1495 | ||
1311 | if (size == buffer_size) | 1496 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1312 | return size; | ||
1313 | |||
1314 | atomic_inc(&buffer->record_disabled); | ||
1315 | 1497 | ||
1316 | /* Make sure all writers are done with this buffer. */ | 1498 | /* |
1317 | synchronize_sched(); | 1499 | * Don't succeed if resizing is disabled, as a reader might be |
1500 | * manipulating the ring buffer and is expecting a sane state while | ||
1501 | * this is true. | ||
1502 | */ | ||
1503 | if (atomic_read(&buffer->resize_disabled)) | ||
1504 | return -EBUSY; | ||
1318 | 1505 | ||
1506 | /* prevent another thread from changing buffer sizes */ | ||
1319 | mutex_lock(&buffer->mutex); | 1507 | mutex_lock(&buffer->mutex); |
1320 | get_online_cpus(); | ||
1321 | |||
1322 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | ||
1323 | 1508 | ||
1324 | if (size < buffer_size) { | 1509 | if (cpu_id == RING_BUFFER_ALL_CPUS) { |
1510 | /* calculate the pages to update */ | ||
1511 | for_each_buffer_cpu(buffer, cpu) { | ||
1512 | cpu_buffer = buffer->buffers[cpu]; | ||
1325 | 1513 | ||
1326 | /* easy case, just free pages */ | 1514 | cpu_buffer->nr_pages_to_update = nr_pages - |
1327 | if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) | 1515 | cpu_buffer->nr_pages; |
1328 | goto out_fail; | 1516 | /* |
1517 | * nothing more to do for removing pages or no update | ||
1518 | */ | ||
1519 | if (cpu_buffer->nr_pages_to_update <= 0) | ||
1520 | continue; | ||
1521 | /* | ||
1522 | * to add pages, make sure all new pages can be | ||
1523 | * allocated without receiving ENOMEM | ||
1524 | */ | ||
1525 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1526 | if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, | ||
1527 | &cpu_buffer->new_pages, cpu)) { | ||
1528 | /* not enough memory for new pages */ | ||
1529 | err = -ENOMEM; | ||
1530 | goto out_err; | ||
1531 | } | ||
1532 | } | ||
1329 | 1533 | ||
1330 | rm_pages = buffer->pages - nr_pages; | 1534 | get_online_cpus(); |
1535 | /* | ||
1536 | * Fire off all the required work handlers | ||
1537 | * We can't schedule on offline CPUs, but it's not necessary | ||
1538 | * since we can change their buffer sizes without any race. | ||
1539 | */ | ||
1540 | for_each_buffer_cpu(buffer, cpu) { | ||
1541 | cpu_buffer = buffer->buffers[cpu]; | ||
1542 | if (!cpu_buffer->nr_pages_to_update) | ||
1543 | continue; | ||
1544 | |||
1545 | if (cpu_online(cpu)) | ||
1546 | schedule_work_on(cpu, | ||
1547 | &cpu_buffer->update_pages_work); | ||
1548 | else | ||
1549 | rb_update_pages(cpu_buffer); | ||
1550 | } | ||
1331 | 1551 | ||
1552 | /* wait for all the updates to complete */ | ||
1332 | for_each_buffer_cpu(buffer, cpu) { | 1553 | for_each_buffer_cpu(buffer, cpu) { |
1333 | cpu_buffer = buffer->buffers[cpu]; | 1554 | cpu_buffer = buffer->buffers[cpu]; |
1334 | rb_remove_pages(cpu_buffer, rm_pages); | 1555 | if (!cpu_buffer->nr_pages_to_update) |
1556 | continue; | ||
1557 | |||
1558 | if (cpu_online(cpu)) | ||
1559 | wait_for_completion(&cpu_buffer->update_done); | ||
1560 | cpu_buffer->nr_pages_to_update = 0; | ||
1335 | } | 1561 | } |
1336 | goto out; | ||
1337 | } | ||
1338 | 1562 | ||
1339 | /* | 1563 | put_online_cpus(); |
1340 | * This is a bit more difficult. We only want to add pages | 1564 | } else { |
1341 | * when we can allocate enough for all CPUs. We do this | 1565 | cpu_buffer = buffer->buffers[cpu_id]; |
1342 | * by allocating all the pages and storing them on a local | ||
1343 | * link list. If we succeed in our allocation, then we | ||
1344 | * add these pages to the cpu_buffers. Otherwise we just free | ||
1345 | * them all and return -ENOMEM; | ||
1346 | */ | ||
1347 | if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) | ||
1348 | goto out_fail; | ||
1349 | 1566 | ||
1350 | new_pages = nr_pages - buffer->pages; | 1567 | if (nr_pages == cpu_buffer->nr_pages) |
1568 | goto out; | ||
1351 | 1569 | ||
1352 | for_each_buffer_cpu(buffer, cpu) { | 1570 | cpu_buffer->nr_pages_to_update = nr_pages - |
1353 | for (i = 0; i < new_pages; i++) { | 1571 | cpu_buffer->nr_pages; |
1354 | struct page *page; | 1572 | |
1355 | /* | 1573 | INIT_LIST_HEAD(&cpu_buffer->new_pages); |
1356 | * __GFP_NORETRY flag makes sure that the allocation | 1574 | if (cpu_buffer->nr_pages_to_update > 0 && |
1357 | * fails gracefully without invoking oom-killer and | 1575 | __rb_allocate_pages(cpu_buffer->nr_pages_to_update, |
1358 | * the system is not destabilized. | 1576 | &cpu_buffer->new_pages, cpu_id)) { |
1359 | */ | 1577 | err = -ENOMEM; |
1360 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), | 1578 | goto out_err; |
1361 | cache_line_size()), | ||
1362 | GFP_KERNEL | __GFP_NORETRY, | ||
1363 | cpu_to_node(cpu)); | ||
1364 | if (!bpage) | ||
1365 | goto free_pages; | ||
1366 | list_add(&bpage->list, &pages); | ||
1367 | page = alloc_pages_node(cpu_to_node(cpu), | ||
1368 | GFP_KERNEL | __GFP_NORETRY, 0); | ||
1369 | if (!page) | ||
1370 | goto free_pages; | ||
1371 | bpage->page = page_address(page); | ||
1372 | rb_init_page(bpage->page); | ||
1373 | } | 1579 | } |
1374 | } | ||
1375 | 1580 | ||
1376 | for_each_buffer_cpu(buffer, cpu) { | 1581 | get_online_cpus(); |
1377 | cpu_buffer = buffer->buffers[cpu]; | ||
1378 | rb_insert_pages(cpu_buffer, &pages, new_pages); | ||
1379 | } | ||
1380 | 1582 | ||
1381 | if (RB_WARN_ON(buffer, !list_empty(&pages))) | 1583 | if (cpu_online(cpu_id)) { |
1382 | goto out_fail; | 1584 | schedule_work_on(cpu_id, |
1585 | &cpu_buffer->update_pages_work); | ||
1586 | wait_for_completion(&cpu_buffer->update_done); | ||
1587 | } else | ||
1588 | rb_update_pages(cpu_buffer); | ||
1589 | |||
1590 | cpu_buffer->nr_pages_to_update = 0; | ||
1591 | put_online_cpus(); | ||
1592 | } | ||
1383 | 1593 | ||
1384 | out: | 1594 | out: |
1385 | buffer->pages = nr_pages; | 1595 | /* |
1386 | put_online_cpus(); | 1596 | * The ring buffer resize can happen with the ring buffer |
1597 | * enabled, so that the update disturbs the tracing as little | ||
1598 | * as possible. But if the buffer is disabled, we do not need | ||
1599 | * to worry about that, and we can take the time to verify | ||
1600 | * that the buffer is not corrupt. | ||
1601 | */ | ||
1602 | if (atomic_read(&buffer->record_disabled)) { | ||
1603 | atomic_inc(&buffer->record_disabled); | ||
1604 | /* | ||
1605 | * Even though the buffer was disabled, we must make sure | ||
1606 | * that it is truly disabled before calling rb_check_pages. | ||
1607 | * There could have been a race between checking | ||
1608 | * record_disable and incrementing it. | ||
1609 | */ | ||
1610 | synchronize_sched(); | ||
1611 | for_each_buffer_cpu(buffer, cpu) { | ||
1612 | cpu_buffer = buffer->buffers[cpu]; | ||
1613 | rb_check_pages(cpu_buffer); | ||
1614 | } | ||
1615 | atomic_dec(&buffer->record_disabled); | ||
1616 | } | ||
1617 | |||
1387 | mutex_unlock(&buffer->mutex); | 1618 | mutex_unlock(&buffer->mutex); |
1619 | return size; | ||
1388 | 1620 | ||
1389 | atomic_dec(&buffer->record_disabled); | 1621 | out_err: |
1622 | for_each_buffer_cpu(buffer, cpu) { | ||
1623 | struct buffer_page *bpage, *tmp; | ||
1390 | 1624 | ||
1391 | return size; | 1625 | cpu_buffer = buffer->buffers[cpu]; |
1626 | cpu_buffer->nr_pages_to_update = 0; | ||
1392 | 1627 | ||
1393 | free_pages: | 1628 | if (list_empty(&cpu_buffer->new_pages)) |
1394 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | 1629 | continue; |
1395 | list_del_init(&bpage->list); | ||
1396 | free_buffer_page(bpage); | ||
1397 | } | ||
1398 | put_online_cpus(); | ||
1399 | mutex_unlock(&buffer->mutex); | ||
1400 | atomic_dec(&buffer->record_disabled); | ||
1401 | return -ENOMEM; | ||
1402 | 1630 | ||
1403 | /* | 1631 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, |
1404 | * Something went totally wrong, and we are too paranoid | 1632 | list) { |
1405 | * to even clean up the mess. | 1633 | list_del_init(&bpage->list); |
1406 | */ | 1634 | free_buffer_page(bpage); |
1407 | out_fail: | 1635 | } |
1408 | put_online_cpus(); | 1636 | } |
1409 | mutex_unlock(&buffer->mutex); | 1637 | mutex_unlock(&buffer->mutex); |
1410 | atomic_dec(&buffer->record_disabled); | 1638 | return err; |
1411 | return -1; | ||
1412 | } | 1639 | } |
1413 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1640 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1414 | 1641 | ||
@@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) | |||
1447 | return __rb_page_index(iter->head_page, iter->head); | 1674 | return __rb_page_index(iter->head_page, iter->head); |
1448 | } | 1675 | } |
1449 | 1676 | ||
1450 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1451 | { | ||
1452 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1453 | } | ||
1454 | |||
1455 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | 1677 | static inline unsigned rb_page_commit(struct buffer_page *bpage) |
1456 | { | 1678 | { |
1457 | return local_read(&bpage->page->commit); | 1679 | return local_read(&bpage->page->commit); |
1458 | } | 1680 | } |
1459 | 1681 | ||
1460 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) | ||
1461 | { | ||
1462 | return local_read(&bpage->entries) & RB_WRITE_MASK; | ||
1463 | } | ||
1464 | |||
1465 | /* Size is determined by what has been committed */ | 1682 | /* Size is determined by what has been committed */ |
1466 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1683 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1467 | { | 1684 | { |
@@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | |||
1510 | * assign the commit to the tail. | 1727 | * assign the commit to the tail. |
1511 | */ | 1728 | */ |
1512 | again: | 1729 | again: |
1513 | max_count = cpu_buffer->buffer->pages * 100; | 1730 | max_count = cpu_buffer->nr_pages * 100; |
1514 | 1731 | ||
1515 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | 1732 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { |
1516 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) | 1733 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) |
@@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) | |||
3486 | 3703 | ||
3487 | iter->cpu_buffer = cpu_buffer; | 3704 | iter->cpu_buffer = cpu_buffer; |
3488 | 3705 | ||
3706 | atomic_inc(&buffer->resize_disabled); | ||
3489 | atomic_inc(&cpu_buffer->record_disabled); | 3707 | atomic_inc(&cpu_buffer->record_disabled); |
3490 | 3708 | ||
3491 | return iter; | 3709 | return iter; |
@@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) | |||
3548 | { | 3766 | { |
3549 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3767 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3550 | 3768 | ||
3769 | /* | ||
3770 | * Ring buffer is disabled from recording, here's a good place | ||
3771 | * to check the integrity of the ring buffer. | ||
3772 | */ | ||
3773 | rb_check_pages(cpu_buffer); | ||
3774 | |||
3551 | atomic_dec(&cpu_buffer->record_disabled); | 3775 | atomic_dec(&cpu_buffer->record_disabled); |
3776 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | ||
3552 | kfree(iter); | 3777 | kfree(iter); |
3553 | } | 3778 | } |
3554 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); | 3779 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); |
@@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); | |||
3588 | * ring_buffer_size - return the size of the ring buffer (in bytes) | 3813 | * ring_buffer_size - return the size of the ring buffer (in bytes) |
3589 | * @buffer: The ring buffer. | 3814 | * @buffer: The ring buffer. |
3590 | */ | 3815 | */ |
3591 | unsigned long ring_buffer_size(struct ring_buffer *buffer) | 3816 | unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) |
3592 | { | 3817 | { |
3593 | return BUF_PAGE_SIZE * buffer->pages; | 3818 | /* |
3819 | * Earlier, this method returned | ||
3820 | * BUF_PAGE_SIZE * buffer->nr_pages | ||
3821 | * Since the nr_pages field is now removed, we have converted this to | ||
3822 | * return the per cpu buffer value. | ||
3823 | */ | ||
3824 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3825 | return 0; | ||
3826 | |||
3827 | return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; | ||
3594 | } | 3828 | } |
3595 | EXPORT_SYMBOL_GPL(ring_buffer_size); | 3829 | EXPORT_SYMBOL_GPL(ring_buffer_size); |
3596 | 3830 | ||
@@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3611 | cpu_buffer->commit_page = cpu_buffer->head_page; | 3845 | cpu_buffer->commit_page = cpu_buffer->head_page; |
3612 | 3846 | ||
3613 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 3847 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
3848 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
3614 | local_set(&cpu_buffer->reader_page->write, 0); | 3849 | local_set(&cpu_buffer->reader_page->write, 0); |
3615 | local_set(&cpu_buffer->reader_page->entries, 0); | 3850 | local_set(&cpu_buffer->reader_page->entries, 0); |
3616 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3851 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
@@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3647 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 3882 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
3648 | return; | 3883 | return; |
3649 | 3884 | ||
3885 | atomic_inc(&buffer->resize_disabled); | ||
3650 | atomic_inc(&cpu_buffer->record_disabled); | 3886 | atomic_inc(&cpu_buffer->record_disabled); |
3651 | 3887 | ||
3888 | /* Make sure all commits have finished */ | ||
3889 | synchronize_sched(); | ||
3890 | |||
3652 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3891 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3653 | 3892 | ||
3654 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3893 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
@@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3664 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3903 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3665 | 3904 | ||
3666 | atomic_dec(&cpu_buffer->record_disabled); | 3905 | atomic_dec(&cpu_buffer->record_disabled); |
3906 | atomic_dec(&buffer->resize_disabled); | ||
3667 | } | 3907 | } |
3668 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); | 3908 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); |
3669 | 3909 | ||
@@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3765 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) | 4005 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) |
3766 | goto out; | 4006 | goto out; |
3767 | 4007 | ||
4008 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
4009 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
4010 | |||
3768 | /* At least make sure the two buffers are somewhat the same */ | 4011 | /* At least make sure the two buffers are somewhat the same */ |
3769 | if (buffer_a->pages != buffer_b->pages) | 4012 | if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) |
3770 | goto out; | 4013 | goto out; |
3771 | 4014 | ||
3772 | ret = -EAGAIN; | 4015 | ret = -EAGAIN; |
@@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3780 | if (atomic_read(&buffer_b->record_disabled)) | 4023 | if (atomic_read(&buffer_b->record_disabled)) |
3781 | goto out; | 4024 | goto out; |
3782 | 4025 | ||
3783 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
3784 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
3785 | |||
3786 | if (atomic_read(&cpu_buffer_a->record_disabled)) | 4026 | if (atomic_read(&cpu_buffer_a->record_disabled)) |
3787 | goto out; | 4027 | goto out; |
3788 | 4028 | ||
@@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4071 | struct ring_buffer *buffer = | 4311 | struct ring_buffer *buffer = |
4072 | container_of(self, struct ring_buffer, cpu_notify); | 4312 | container_of(self, struct ring_buffer, cpu_notify); |
4073 | long cpu = (long)hcpu; | 4313 | long cpu = (long)hcpu; |
4314 | int cpu_i, nr_pages_same; | ||
4315 | unsigned int nr_pages; | ||
4074 | 4316 | ||
4075 | switch (action) { | 4317 | switch (action) { |
4076 | case CPU_UP_PREPARE: | 4318 | case CPU_UP_PREPARE: |
@@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4078 | if (cpumask_test_cpu(cpu, buffer->cpumask)) | 4320 | if (cpumask_test_cpu(cpu, buffer->cpumask)) |
4079 | return NOTIFY_OK; | 4321 | return NOTIFY_OK; |
4080 | 4322 | ||
4323 | nr_pages = 0; | ||
4324 | nr_pages_same = 1; | ||
4325 | /* check if all cpu sizes are same */ | ||
4326 | for_each_buffer_cpu(buffer, cpu_i) { | ||
4327 | /* fill in the size from first enabled cpu */ | ||
4328 | if (nr_pages == 0) | ||
4329 | nr_pages = buffer->buffers[cpu_i]->nr_pages; | ||
4330 | if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { | ||
4331 | nr_pages_same = 0; | ||
4332 | break; | ||
4333 | } | ||
4334 | } | ||
4335 | /* allocate minimum pages, user can later expand it */ | ||
4336 | if (!nr_pages_same) | ||
4337 | nr_pages = 2; | ||
4081 | buffer->buffers[cpu] = | 4338 | buffer->buffers[cpu] = |
4082 | rb_allocate_cpu_buffer(buffer, cpu); | 4339 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
4083 | if (!buffer->buffers[cpu]) { | 4340 | if (!buffer->buffers[cpu]) { |
4084 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", | 4341 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", |
4085 | cpu); | 4342 | cpu); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed7b5d1e12f4..68032c6177db 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -87,18 +87,6 @@ static int tracing_disabled = 1; | |||
87 | 87 | ||
88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); | 88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); |
89 | 89 | ||
90 | static inline void ftrace_disable_cpu(void) | ||
91 | { | ||
92 | preempt_disable(); | ||
93 | __this_cpu_inc(ftrace_cpu_disabled); | ||
94 | } | ||
95 | |||
96 | static inline void ftrace_enable_cpu(void) | ||
97 | { | ||
98 | __this_cpu_dec(ftrace_cpu_disabled); | ||
99 | preempt_enable(); | ||
100 | } | ||
101 | |||
102 | cpumask_var_t __read_mostly tracing_buffer_mask; | 90 | cpumask_var_t __read_mostly tracing_buffer_mask; |
103 | 91 | ||
104 | /* | 92 | /* |
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) | |||
629 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 617 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
630 | { | 618 | { |
631 | int len; | 619 | int len; |
632 | void *ret; | ||
633 | 620 | ||
634 | if (s->len <= s->readpos) | 621 | if (s->len <= s->readpos) |
635 | return -EBUSY; | 622 | return -EBUSY; |
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
637 | len = s->len - s->readpos; | 624 | len = s->len - s->readpos; |
638 | if (cnt > len) | 625 | if (cnt > len) |
639 | cnt = len; | 626 | cnt = len; |
640 | ret = memcpy(buf, s->buffer + s->readpos, cnt); | 627 | memcpy(buf, s->buffer + s->readpos, cnt); |
641 | if (!ret) | ||
642 | return -EFAULT; | ||
643 | 628 | ||
644 | s->readpos += cnt; | 629 | s->readpos += cnt; |
645 | return cnt; | 630 | return cnt; |
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
751 | 736 | ||
752 | arch_spin_lock(&ftrace_max_lock); | 737 | arch_spin_lock(&ftrace_max_lock); |
753 | 738 | ||
754 | ftrace_disable_cpu(); | ||
755 | |||
756 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); | 739 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); |
757 | 740 | ||
758 | if (ret == -EBUSY) { | 741 | if (ret == -EBUSY) { |
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
766 | "Failed to swap buffers due to commit in progress\n"); | 749 | "Failed to swap buffers due to commit in progress\n"); |
767 | } | 750 | } |
768 | 751 | ||
769 | ftrace_enable_cpu(); | ||
770 | |||
771 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); | 752 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); |
772 | 753 | ||
773 | __update_max_tr(tr, tsk, cpu); | 754 | __update_max_tr(tr, tsk, cpu); |
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
782 | * Register a new plugin tracer. | 763 | * Register a new plugin tracer. |
783 | */ | 764 | */ |
784 | int register_tracer(struct tracer *type) | 765 | int register_tracer(struct tracer *type) |
785 | __releases(kernel_lock) | ||
786 | __acquires(kernel_lock) | ||
787 | { | 766 | { |
788 | struct tracer *t; | 767 | struct tracer *t; |
789 | int ret = 0; | 768 | int ret = 0; |
@@ -841,7 +820,8 @@ __acquires(kernel_lock) | |||
841 | 820 | ||
842 | /* If we expanded the buffers, make sure the max is expanded too */ | 821 | /* If we expanded the buffers, make sure the max is expanded too */ |
843 | if (ring_buffer_expanded && type->use_max_tr) | 822 | if (ring_buffer_expanded && type->use_max_tr) |
844 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | 823 | ring_buffer_resize(max_tr.buffer, trace_buf_size, |
824 | RING_BUFFER_ALL_CPUS); | ||
845 | 825 | ||
846 | /* the test is responsible for initializing and enabling */ | 826 | /* the test is responsible for initializing and enabling */ |
847 | pr_info("Testing tracer %s: ", type->name); | 827 | pr_info("Testing tracer %s: ", type->name); |
@@ -857,7 +837,8 @@ __acquires(kernel_lock) | |||
857 | 837 | ||
858 | /* Shrink the max buffer again */ | 838 | /* Shrink the max buffer again */ |
859 | if (ring_buffer_expanded && type->use_max_tr) | 839 | if (ring_buffer_expanded && type->use_max_tr) |
860 | ring_buffer_resize(max_tr.buffer, 1); | 840 | ring_buffer_resize(max_tr.buffer, 1, |
841 | RING_BUFFER_ALL_CPUS); | ||
861 | 842 | ||
862 | printk(KERN_CONT "PASSED\n"); | 843 | printk(KERN_CONT "PASSED\n"); |
863 | } | 844 | } |
@@ -917,13 +898,6 @@ out: | |||
917 | mutex_unlock(&trace_types_lock); | 898 | mutex_unlock(&trace_types_lock); |
918 | } | 899 | } |
919 | 900 | ||
920 | static void __tracing_reset(struct ring_buffer *buffer, int cpu) | ||
921 | { | ||
922 | ftrace_disable_cpu(); | ||
923 | ring_buffer_reset_cpu(buffer, cpu); | ||
924 | ftrace_enable_cpu(); | ||
925 | } | ||
926 | |||
927 | void tracing_reset(struct trace_array *tr, int cpu) | 901 | void tracing_reset(struct trace_array *tr, int cpu) |
928 | { | 902 | { |
929 | struct ring_buffer *buffer = tr->buffer; | 903 | struct ring_buffer *buffer = tr->buffer; |
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
932 | 906 | ||
933 | /* Make sure all commits have finished */ | 907 | /* Make sure all commits have finished */ |
934 | synchronize_sched(); | 908 | synchronize_sched(); |
935 | __tracing_reset(buffer, cpu); | 909 | ring_buffer_reset_cpu(buffer, cpu); |
936 | 910 | ||
937 | ring_buffer_record_enable(buffer); | 911 | ring_buffer_record_enable(buffer); |
938 | } | 912 | } |
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
950 | tr->time_start = ftrace_now(tr->cpu); | 924 | tr->time_start = ftrace_now(tr->cpu); |
951 | 925 | ||
952 | for_each_online_cpu(cpu) | 926 | for_each_online_cpu(cpu) |
953 | __tracing_reset(buffer, cpu); | 927 | ring_buffer_reset_cpu(buffer, cpu); |
954 | 928 | ||
955 | ring_buffer_record_enable(buffer); | 929 | ring_buffer_record_enable(buffer); |
956 | } | 930 | } |
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
1498 | 1472 | ||
1499 | #endif /* CONFIG_STACKTRACE */ | 1473 | #endif /* CONFIG_STACKTRACE */ |
1500 | 1474 | ||
1475 | /* created for use with alloc_percpu */ | ||
1476 | struct trace_buffer_struct { | ||
1477 | char buffer[TRACE_BUF_SIZE]; | ||
1478 | }; | ||
1479 | |||
1480 | static struct trace_buffer_struct *trace_percpu_buffer; | ||
1481 | static struct trace_buffer_struct *trace_percpu_sirq_buffer; | ||
1482 | static struct trace_buffer_struct *trace_percpu_irq_buffer; | ||
1483 | static struct trace_buffer_struct *trace_percpu_nmi_buffer; | ||
1484 | |||
1485 | /* | ||
1486 | * The buffer used is dependent on the context. There is a per cpu | ||
1487 | * buffer for normal context, softirq context, hard irq context and | ||
1488 | * for NMI context. This allows for lockless recording. | ||
1489 | * | ||
1490 | * Note, if the buffers failed to be allocated, then this returns NULL | ||
1491 | */ | ||
1492 | static char *get_trace_buf(void) | ||
1493 | { | ||
1494 | struct trace_buffer_struct *percpu_buffer; | ||
1495 | struct trace_buffer_struct *buffer; | ||
1496 | |||
1497 | /* | ||
1498 | * If we have allocated per cpu buffers, then we do not | ||
1499 | * need to do any locking. | ||
1500 | */ | ||
1501 | if (in_nmi()) | ||
1502 | percpu_buffer = trace_percpu_nmi_buffer; | ||
1503 | else if (in_irq()) | ||
1504 | percpu_buffer = trace_percpu_irq_buffer; | ||
1505 | else if (in_softirq()) | ||
1506 | percpu_buffer = trace_percpu_sirq_buffer; | ||
1507 | else | ||
1508 | percpu_buffer = trace_percpu_buffer; | ||
1509 | |||
1510 | if (!percpu_buffer) | ||
1511 | return NULL; | ||
1512 | |||
1513 | buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); | ||
1514 | |||
1515 | return buffer->buffer; | ||
1516 | } | ||
1517 | |||
1518 | static int alloc_percpu_trace_buffer(void) | ||
1519 | { | ||
1520 | struct trace_buffer_struct *buffers; | ||
1521 | struct trace_buffer_struct *sirq_buffers; | ||
1522 | struct trace_buffer_struct *irq_buffers; | ||
1523 | struct trace_buffer_struct *nmi_buffers; | ||
1524 | |||
1525 | buffers = alloc_percpu(struct trace_buffer_struct); | ||
1526 | if (!buffers) | ||
1527 | goto err_warn; | ||
1528 | |||
1529 | sirq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1530 | if (!sirq_buffers) | ||
1531 | goto err_sirq; | ||
1532 | |||
1533 | irq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1534 | if (!irq_buffers) | ||
1535 | goto err_irq; | ||
1536 | |||
1537 | nmi_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1538 | if (!nmi_buffers) | ||
1539 | goto err_nmi; | ||
1540 | |||
1541 | trace_percpu_buffer = buffers; | ||
1542 | trace_percpu_sirq_buffer = sirq_buffers; | ||
1543 | trace_percpu_irq_buffer = irq_buffers; | ||
1544 | trace_percpu_nmi_buffer = nmi_buffers; | ||
1545 | |||
1546 | return 0; | ||
1547 | |||
1548 | err_nmi: | ||
1549 | free_percpu(irq_buffers); | ||
1550 | err_irq: | ||
1551 | free_percpu(sirq_buffers); | ||
1552 | err_sirq: | ||
1553 | free_percpu(buffers); | ||
1554 | err_warn: | ||
1555 | WARN(1, "Could not allocate percpu trace_printk buffer"); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | |||
1559 | void trace_printk_init_buffers(void) | ||
1560 | { | ||
1561 | static int buffers_allocated; | ||
1562 | |||
1563 | if (buffers_allocated) | ||
1564 | return; | ||
1565 | |||
1566 | if (alloc_percpu_trace_buffer()) | ||
1567 | return; | ||
1568 | |||
1569 | pr_info("ftrace: Allocated trace_printk buffers\n"); | ||
1570 | |||
1571 | buffers_allocated = 1; | ||
1572 | } | ||
1573 | |||
1501 | /** | 1574 | /** |
1502 | * trace_vbprintk - write binary msg to tracing buffer | 1575 | * trace_vbprintk - write binary msg to tracing buffer |
1503 | * | 1576 | * |
1504 | */ | 1577 | */ |
1505 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | 1578 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) |
1506 | { | 1579 | { |
1507 | static arch_spinlock_t trace_buf_lock = | ||
1508 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
1509 | static u32 trace_buf[TRACE_BUF_SIZE]; | ||
1510 | |||
1511 | struct ftrace_event_call *call = &event_bprint; | 1580 | struct ftrace_event_call *call = &event_bprint; |
1512 | struct ring_buffer_event *event; | 1581 | struct ring_buffer_event *event; |
1513 | struct ring_buffer *buffer; | 1582 | struct ring_buffer *buffer; |
1514 | struct trace_array *tr = &global_trace; | 1583 | struct trace_array *tr = &global_trace; |
1515 | struct trace_array_cpu *data; | ||
1516 | struct bprint_entry *entry; | 1584 | struct bprint_entry *entry; |
1517 | unsigned long flags; | 1585 | unsigned long flags; |
1518 | int disable; | 1586 | char *tbuffer; |
1519 | int cpu, len = 0, size, pc; | 1587 | int len = 0, size, pc; |
1520 | 1588 | ||
1521 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1589 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
1522 | return 0; | 1590 | return 0; |
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1526 | 1594 | ||
1527 | pc = preempt_count(); | 1595 | pc = preempt_count(); |
1528 | preempt_disable_notrace(); | 1596 | preempt_disable_notrace(); |
1529 | cpu = raw_smp_processor_id(); | ||
1530 | data = tr->data[cpu]; | ||
1531 | 1597 | ||
1532 | disable = atomic_inc_return(&data->disabled); | 1598 | tbuffer = get_trace_buf(); |
1533 | if (unlikely(disable != 1)) | 1599 | if (!tbuffer) { |
1600 | len = 0; | ||
1534 | goto out; | 1601 | goto out; |
1602 | } | ||
1535 | 1603 | ||
1536 | /* Lockdep uses trace_printk for lock tracing */ | 1604 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); |
1537 | local_irq_save(flags); | ||
1538 | arch_spin_lock(&trace_buf_lock); | ||
1539 | len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1540 | 1605 | ||
1541 | if (len > TRACE_BUF_SIZE || len < 0) | 1606 | if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) |
1542 | goto out_unlock; | 1607 | goto out; |
1543 | 1608 | ||
1609 | local_save_flags(flags); | ||
1544 | size = sizeof(*entry) + sizeof(u32) * len; | 1610 | size = sizeof(*entry) + sizeof(u32) * len; |
1545 | buffer = tr->buffer; | 1611 | buffer = tr->buffer; |
1546 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, | 1612 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, |
1547 | flags, pc); | 1613 | flags, pc); |
1548 | if (!event) | 1614 | if (!event) |
1549 | goto out_unlock; | 1615 | goto out; |
1550 | entry = ring_buffer_event_data(event); | 1616 | entry = ring_buffer_event_data(event); |
1551 | entry->ip = ip; | 1617 | entry->ip = ip; |
1552 | entry->fmt = fmt; | 1618 | entry->fmt = fmt; |
1553 | 1619 | ||
1554 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1620 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1555 | if (!filter_check_discard(call, entry, buffer, event)) { | 1621 | if (!filter_check_discard(call, entry, buffer, event)) { |
1556 | ring_buffer_unlock_commit(buffer, event); | 1622 | ring_buffer_unlock_commit(buffer, event); |
1557 | ftrace_trace_stack(buffer, flags, 6, pc); | 1623 | ftrace_trace_stack(buffer, flags, 6, pc); |
1558 | } | 1624 | } |
1559 | 1625 | ||
1560 | out_unlock: | ||
1561 | arch_spin_unlock(&trace_buf_lock); | ||
1562 | local_irq_restore(flags); | ||
1563 | |||
1564 | out: | 1626 | out: |
1565 | atomic_dec_return(&data->disabled); | ||
1566 | preempt_enable_notrace(); | 1627 | preempt_enable_notrace(); |
1567 | unpause_graph_tracing(); | 1628 | unpause_graph_tracing(); |
1568 | 1629 | ||
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr, | |||
1588 | int trace_array_vprintk(struct trace_array *tr, | 1649 | int trace_array_vprintk(struct trace_array *tr, |
1589 | unsigned long ip, const char *fmt, va_list args) | 1650 | unsigned long ip, const char *fmt, va_list args) |
1590 | { | 1651 | { |
1591 | static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; | ||
1592 | static char trace_buf[TRACE_BUF_SIZE]; | ||
1593 | |||
1594 | struct ftrace_event_call *call = &event_print; | 1652 | struct ftrace_event_call *call = &event_print; |
1595 | struct ring_buffer_event *event; | 1653 | struct ring_buffer_event *event; |
1596 | struct ring_buffer *buffer; | 1654 | struct ring_buffer *buffer; |
1597 | struct trace_array_cpu *data; | 1655 | int len = 0, size, pc; |
1598 | int cpu, len = 0, size, pc; | ||
1599 | struct print_entry *entry; | 1656 | struct print_entry *entry; |
1600 | unsigned long irq_flags; | 1657 | unsigned long flags; |
1601 | int disable; | 1658 | char *tbuffer; |
1602 | 1659 | ||
1603 | if (tracing_disabled || tracing_selftest_running) | 1660 | if (tracing_disabled || tracing_selftest_running) |
1604 | return 0; | 1661 | return 0; |
1605 | 1662 | ||
1663 | /* Don't pollute graph traces with trace_vprintk internals */ | ||
1664 | pause_graph_tracing(); | ||
1665 | |||
1606 | pc = preempt_count(); | 1666 | pc = preempt_count(); |
1607 | preempt_disable_notrace(); | 1667 | preempt_disable_notrace(); |
1608 | cpu = raw_smp_processor_id(); | ||
1609 | data = tr->data[cpu]; | ||
1610 | 1668 | ||
1611 | disable = atomic_inc_return(&data->disabled); | 1669 | |
1612 | if (unlikely(disable != 1)) | 1670 | tbuffer = get_trace_buf(); |
1671 | if (!tbuffer) { | ||
1672 | len = 0; | ||
1613 | goto out; | 1673 | goto out; |
1674 | } | ||
1614 | 1675 | ||
1615 | pause_graph_tracing(); | 1676 | len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
1616 | raw_local_irq_save(irq_flags); | 1677 | if (len > TRACE_BUF_SIZE) |
1617 | arch_spin_lock(&trace_buf_lock); | 1678 | goto out; |
1618 | len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1619 | 1679 | ||
1680 | local_save_flags(flags); | ||
1620 | size = sizeof(*entry) + len + 1; | 1681 | size = sizeof(*entry) + len + 1; |
1621 | buffer = tr->buffer; | 1682 | buffer = tr->buffer; |
1622 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 1683 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
1623 | irq_flags, pc); | 1684 | flags, pc); |
1624 | if (!event) | 1685 | if (!event) |
1625 | goto out_unlock; | 1686 | goto out; |
1626 | entry = ring_buffer_event_data(event); | 1687 | entry = ring_buffer_event_data(event); |
1627 | entry->ip = ip; | 1688 | entry->ip = ip; |
1628 | 1689 | ||
1629 | memcpy(&entry->buf, trace_buf, len); | 1690 | memcpy(&entry->buf, tbuffer, len); |
1630 | entry->buf[len] = '\0'; | 1691 | entry->buf[len] = '\0'; |
1631 | if (!filter_check_discard(call, entry, buffer, event)) { | 1692 | if (!filter_check_discard(call, entry, buffer, event)) { |
1632 | ring_buffer_unlock_commit(buffer, event); | 1693 | ring_buffer_unlock_commit(buffer, event); |
1633 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | 1694 | ftrace_trace_stack(buffer, flags, 6, pc); |
1634 | } | 1695 | } |
1635 | |||
1636 | out_unlock: | ||
1637 | arch_spin_unlock(&trace_buf_lock); | ||
1638 | raw_local_irq_restore(irq_flags); | ||
1639 | unpause_graph_tracing(); | ||
1640 | out: | 1696 | out: |
1641 | atomic_dec_return(&data->disabled); | ||
1642 | preempt_enable_notrace(); | 1697 | preempt_enable_notrace(); |
1698 | unpause_graph_tracing(); | ||
1643 | 1699 | ||
1644 | return len; | 1700 | return len; |
1645 | } | 1701 | } |
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk); | |||
1652 | 1708 | ||
1653 | static void trace_iterator_increment(struct trace_iterator *iter) | 1709 | static void trace_iterator_increment(struct trace_iterator *iter) |
1654 | { | 1710 | { |
1655 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1656 | ftrace_disable_cpu(); | ||
1657 | |||
1658 | iter->idx++; | 1711 | iter->idx++; |
1659 | if (iter->buffer_iter[iter->cpu]) | 1712 | if (iter->buffer_iter[iter->cpu]) |
1660 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); | 1713 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); |
1661 | |||
1662 | ftrace_enable_cpu(); | ||
1663 | } | 1714 | } |
1664 | 1715 | ||
1665 | static struct trace_entry * | 1716 | static struct trace_entry * |
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1669 | struct ring_buffer_event *event; | 1720 | struct ring_buffer_event *event; |
1670 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; | 1721 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; |
1671 | 1722 | ||
1672 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1673 | ftrace_disable_cpu(); | ||
1674 | |||
1675 | if (buf_iter) | 1723 | if (buf_iter) |
1676 | event = ring_buffer_iter_peek(buf_iter, ts); | 1724 | event = ring_buffer_iter_peek(buf_iter, ts); |
1677 | else | 1725 | else |
1678 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, | 1726 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, |
1679 | lost_events); | 1727 | lost_events); |
1680 | 1728 | ||
1681 | ftrace_enable_cpu(); | ||
1682 | |||
1683 | if (event) { | 1729 | if (event) { |
1684 | iter->ent_size = ring_buffer_event_length(event); | 1730 | iter->ent_size = ring_buffer_event_length(event); |
1685 | return ring_buffer_event_data(event); | 1731 | return ring_buffer_event_data(event); |
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) | |||
1769 | 1815 | ||
1770 | static void trace_consume(struct trace_iterator *iter) | 1816 | static void trace_consume(struct trace_iterator *iter) |
1771 | { | 1817 | { |
1772 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1773 | ftrace_disable_cpu(); | ||
1774 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, | 1818 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, |
1775 | &iter->lost_events); | 1819 | &iter->lost_events); |
1776 | ftrace_enable_cpu(); | ||
1777 | } | 1820 | } |
1778 | 1821 | ||
1779 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) | 1822 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) |
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1862 | iter->cpu = 0; | 1905 | iter->cpu = 0; |
1863 | iter->idx = -1; | 1906 | iter->idx = -1; |
1864 | 1907 | ||
1865 | ftrace_disable_cpu(); | ||
1866 | |||
1867 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | 1908 | if (cpu_file == TRACE_PIPE_ALL_CPU) { |
1868 | for_each_tracing_cpu(cpu) | 1909 | for_each_tracing_cpu(cpu) |
1869 | tracing_iter_reset(iter, cpu); | 1910 | tracing_iter_reset(iter, cpu); |
1870 | } else | 1911 | } else |
1871 | tracing_iter_reset(iter, cpu_file); | 1912 | tracing_iter_reset(iter, cpu_file); |
1872 | 1913 | ||
1873 | ftrace_enable_cpu(); | ||
1874 | |||
1875 | iter->leftover = 0; | 1914 | iter->leftover = 0; |
1876 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) | 1915 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) |
1877 | ; | 1916 | ; |
@@ -2332,15 +2371,13 @@ static struct trace_iterator * | |||
2332 | __tracing_open(struct inode *inode, struct file *file) | 2371 | __tracing_open(struct inode *inode, struct file *file) |
2333 | { | 2372 | { |
2334 | long cpu_file = (long) inode->i_private; | 2373 | long cpu_file = (long) inode->i_private; |
2335 | void *fail_ret = ERR_PTR(-ENOMEM); | ||
2336 | struct trace_iterator *iter; | 2374 | struct trace_iterator *iter; |
2337 | struct seq_file *m; | 2375 | int cpu; |
2338 | int cpu, ret; | ||
2339 | 2376 | ||
2340 | if (tracing_disabled) | 2377 | if (tracing_disabled) |
2341 | return ERR_PTR(-ENODEV); | 2378 | return ERR_PTR(-ENODEV); |
2342 | 2379 | ||
2343 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2380 | iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); |
2344 | if (!iter) | 2381 | if (!iter) |
2345 | return ERR_PTR(-ENOMEM); | 2382 | return ERR_PTR(-ENOMEM); |
2346 | 2383 | ||
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2397 | tracing_iter_reset(iter, cpu); | 2434 | tracing_iter_reset(iter, cpu); |
2398 | } | 2435 | } |
2399 | 2436 | ||
2400 | ret = seq_open(file, &tracer_seq_ops); | ||
2401 | if (ret < 0) { | ||
2402 | fail_ret = ERR_PTR(ret); | ||
2403 | goto fail_buffer; | ||
2404 | } | ||
2405 | |||
2406 | m = file->private_data; | ||
2407 | m->private = iter; | ||
2408 | |||
2409 | mutex_unlock(&trace_types_lock); | 2437 | mutex_unlock(&trace_types_lock); |
2410 | 2438 | ||
2411 | return iter; | 2439 | return iter; |
2412 | 2440 | ||
2413 | fail_buffer: | ||
2414 | for_each_tracing_cpu(cpu) { | ||
2415 | if (iter->buffer_iter[cpu]) | ||
2416 | ring_buffer_read_finish(iter->buffer_iter[cpu]); | ||
2417 | } | ||
2418 | free_cpumask_var(iter->started); | ||
2419 | tracing_start(); | ||
2420 | fail: | 2441 | fail: |
2421 | mutex_unlock(&trace_types_lock); | 2442 | mutex_unlock(&trace_types_lock); |
2422 | kfree(iter->trace); | 2443 | kfree(iter->trace); |
2423 | kfree(iter); | 2444 | seq_release_private(inode, file); |
2424 | 2445 | return ERR_PTR(-ENOMEM); | |
2425 | return fail_ret; | ||
2426 | } | 2446 | } |
2427 | 2447 | ||
2428 | int tracing_open_generic(struct inode *inode, struct file *filp) | 2448 | int tracing_open_generic(struct inode *inode, struct file *filp) |
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2458 | tracing_start(); | 2478 | tracing_start(); |
2459 | mutex_unlock(&trace_types_lock); | 2479 | mutex_unlock(&trace_types_lock); |
2460 | 2480 | ||
2461 | seq_release(inode, file); | ||
2462 | mutex_destroy(&iter->mutex); | 2481 | mutex_destroy(&iter->mutex); |
2463 | free_cpumask_var(iter->started); | 2482 | free_cpumask_var(iter->started); |
2464 | kfree(iter->trace); | 2483 | kfree(iter->trace); |
2465 | kfree(iter); | 2484 | seq_release_private(inode, file); |
2466 | return 0; | 2485 | return 0; |
2467 | } | 2486 | } |
2468 | 2487 | ||
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
2648 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 2667 | if (cpumask_test_cpu(cpu, tracing_cpumask) && |
2649 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2668 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2650 | atomic_inc(&global_trace.data[cpu]->disabled); | 2669 | atomic_inc(&global_trace.data[cpu]->disabled); |
2670 | ring_buffer_record_disable_cpu(global_trace.buffer, cpu); | ||
2651 | } | 2671 | } |
2652 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 2672 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && |
2653 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2673 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2654 | atomic_dec(&global_trace.data[cpu]->disabled); | 2674 | atomic_dec(&global_trace.data[cpu]->disabled); |
2675 | ring_buffer_record_enable_cpu(global_trace.buffer, cpu); | ||
2655 | } | 2676 | } |
2656 | } | 2677 | } |
2657 | arch_spin_unlock(&ftrace_max_lock); | 2678 | arch_spin_unlock(&ftrace_max_lock); |
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr) | |||
2974 | return t->init(tr); | 2995 | return t->init(tr); |
2975 | } | 2996 | } |
2976 | 2997 | ||
2977 | static int __tracing_resize_ring_buffer(unsigned long size) | 2998 | static void set_buffer_entries(struct trace_array *tr, unsigned long val) |
2999 | { | ||
3000 | int cpu; | ||
3001 | for_each_tracing_cpu(cpu) | ||
3002 | tr->data[cpu]->entries = val; | ||
3003 | } | ||
3004 | |||
3005 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | ||
2978 | { | 3006 | { |
2979 | int ret; | 3007 | int ret; |
2980 | 3008 | ||
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
2985 | */ | 3013 | */ |
2986 | ring_buffer_expanded = 1; | 3014 | ring_buffer_expanded = 1; |
2987 | 3015 | ||
2988 | ret = ring_buffer_resize(global_trace.buffer, size); | 3016 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
2989 | if (ret < 0) | 3017 | if (ret < 0) |
2990 | return ret; | 3018 | return ret; |
2991 | 3019 | ||
2992 | if (!current_trace->use_max_tr) | 3020 | if (!current_trace->use_max_tr) |
2993 | goto out; | 3021 | goto out; |
2994 | 3022 | ||
2995 | ret = ring_buffer_resize(max_tr.buffer, size); | 3023 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); |
2996 | if (ret < 0) { | 3024 | if (ret < 0) { |
2997 | int r; | 3025 | int r = 0; |
3026 | |||
3027 | if (cpu == RING_BUFFER_ALL_CPUS) { | ||
3028 | int i; | ||
3029 | for_each_tracing_cpu(i) { | ||
3030 | r = ring_buffer_resize(global_trace.buffer, | ||
3031 | global_trace.data[i]->entries, | ||
3032 | i); | ||
3033 | if (r < 0) | ||
3034 | break; | ||
3035 | } | ||
3036 | } else { | ||
3037 | r = ring_buffer_resize(global_trace.buffer, | ||
3038 | global_trace.data[cpu]->entries, | ||
3039 | cpu); | ||
3040 | } | ||
2998 | 3041 | ||
2999 | r = ring_buffer_resize(global_trace.buffer, | ||
3000 | global_trace.entries); | ||
3001 | if (r < 0) { | 3042 | if (r < 0) { |
3002 | /* | 3043 | /* |
3003 | * AARGH! We are left with different | 3044 | * AARGH! We are left with different |
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
3019 | return ret; | 3060 | return ret; |
3020 | } | 3061 | } |
3021 | 3062 | ||
3022 | max_tr.entries = size; | 3063 | if (cpu == RING_BUFFER_ALL_CPUS) |
3064 | set_buffer_entries(&max_tr, size); | ||
3065 | else | ||
3066 | max_tr.data[cpu]->entries = size; | ||
3067 | |||
3023 | out: | 3068 | out: |
3024 | global_trace.entries = size; | 3069 | if (cpu == RING_BUFFER_ALL_CPUS) |
3070 | set_buffer_entries(&global_trace, size); | ||
3071 | else | ||
3072 | global_trace.data[cpu]->entries = size; | ||
3025 | 3073 | ||
3026 | return ret; | 3074 | return ret; |
3027 | } | 3075 | } |
3028 | 3076 | ||
3029 | static ssize_t tracing_resize_ring_buffer(unsigned long size) | 3077 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) |
3030 | { | 3078 | { |
3031 | int cpu, ret = size; | 3079 | int ret = size; |
3032 | 3080 | ||
3033 | mutex_lock(&trace_types_lock); | 3081 | mutex_lock(&trace_types_lock); |
3034 | 3082 | ||
3035 | tracing_stop(); | 3083 | if (cpu_id != RING_BUFFER_ALL_CPUS) { |
3036 | 3084 | /* make sure, this cpu is enabled in the mask */ | |
3037 | /* disable all cpu buffers */ | 3085 | if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { |
3038 | for_each_tracing_cpu(cpu) { | 3086 | ret = -EINVAL; |
3039 | if (global_trace.data[cpu]) | 3087 | goto out; |
3040 | atomic_inc(&global_trace.data[cpu]->disabled); | 3088 | } |
3041 | if (max_tr.data[cpu]) | ||
3042 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
3043 | } | 3089 | } |
3044 | 3090 | ||
3045 | if (size != global_trace.entries) | 3091 | ret = __tracing_resize_ring_buffer(size, cpu_id); |
3046 | ret = __tracing_resize_ring_buffer(size); | ||
3047 | |||
3048 | if (ret < 0) | 3092 | if (ret < 0) |
3049 | ret = -ENOMEM; | 3093 | ret = -ENOMEM; |
3050 | 3094 | ||
3051 | for_each_tracing_cpu(cpu) { | 3095 | out: |
3052 | if (global_trace.data[cpu]) | ||
3053 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
3054 | if (max_tr.data[cpu]) | ||
3055 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
3056 | } | ||
3057 | |||
3058 | tracing_start(); | ||
3059 | mutex_unlock(&trace_types_lock); | 3096 | mutex_unlock(&trace_types_lock); |
3060 | 3097 | ||
3061 | return ret; | 3098 | return ret; |
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void) | |||
3078 | 3115 | ||
3079 | mutex_lock(&trace_types_lock); | 3116 | mutex_lock(&trace_types_lock); |
3080 | if (!ring_buffer_expanded) | 3117 | if (!ring_buffer_expanded) |
3081 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3118 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3119 | RING_BUFFER_ALL_CPUS); | ||
3082 | mutex_unlock(&trace_types_lock); | 3120 | mutex_unlock(&trace_types_lock); |
3083 | 3121 | ||
3084 | return ret; | 3122 | return ret; |
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf) | |||
3102 | mutex_lock(&trace_types_lock); | 3140 | mutex_lock(&trace_types_lock); |
3103 | 3141 | ||
3104 | if (!ring_buffer_expanded) { | 3142 | if (!ring_buffer_expanded) { |
3105 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3143 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3144 | RING_BUFFER_ALL_CPUS); | ||
3106 | if (ret < 0) | 3145 | if (ret < 0) |
3107 | goto out; | 3146 | goto out; |
3108 | ret = 0; | 3147 | ret = 0; |
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf) | |||
3128 | * The max_tr ring buffer has some state (e.g. ring->clock) and | 3167 | * The max_tr ring buffer has some state (e.g. ring->clock) and |
3129 | * we want preserve it. | 3168 | * we want preserve it. |
3130 | */ | 3169 | */ |
3131 | ring_buffer_resize(max_tr.buffer, 1); | 3170 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); |
3132 | max_tr.entries = 1; | 3171 | set_buffer_entries(&max_tr, 1); |
3133 | } | 3172 | } |
3134 | destroy_trace_option_files(topts); | 3173 | destroy_trace_option_files(topts); |
3135 | 3174 | ||
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf) | |||
3137 | 3176 | ||
3138 | topts = create_trace_option_files(current_trace); | 3177 | topts = create_trace_option_files(current_trace); |
3139 | if (current_trace->use_max_tr) { | 3178 | if (current_trace->use_max_tr) { |
3140 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | 3179 | int cpu; |
3141 | if (ret < 0) | 3180 | /* we need to make per cpu buffer sizes equivalent */ |
3142 | goto out; | 3181 | for_each_tracing_cpu(cpu) { |
3143 | max_tr.entries = global_trace.entries; | 3182 | ret = ring_buffer_resize(max_tr.buffer, |
3183 | global_trace.data[cpu]->entries, | ||
3184 | cpu); | ||
3185 | if (ret < 0) | ||
3186 | goto out; | ||
3187 | max_tr.data[cpu]->entries = | ||
3188 | global_trace.data[cpu]->entries; | ||
3189 | } | ||
3144 | } | 3190 | } |
3145 | 3191 | ||
3146 | if (t->init) { | 3192 | if (t->init) { |
@@ -3642,30 +3688,82 @@ out_err: | |||
3642 | goto out; | 3688 | goto out; |
3643 | } | 3689 | } |
3644 | 3690 | ||
3691 | struct ftrace_entries_info { | ||
3692 | struct trace_array *tr; | ||
3693 | int cpu; | ||
3694 | }; | ||
3695 | |||
3696 | static int tracing_entries_open(struct inode *inode, struct file *filp) | ||
3697 | { | ||
3698 | struct ftrace_entries_info *info; | ||
3699 | |||
3700 | if (tracing_disabled) | ||
3701 | return -ENODEV; | ||
3702 | |||
3703 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
3704 | if (!info) | ||
3705 | return -ENOMEM; | ||
3706 | |||
3707 | info->tr = &global_trace; | ||
3708 | info->cpu = (unsigned long)inode->i_private; | ||
3709 | |||
3710 | filp->private_data = info; | ||
3711 | |||
3712 | return 0; | ||
3713 | } | ||
3714 | |||
3645 | static ssize_t | 3715 | static ssize_t |
3646 | tracing_entries_read(struct file *filp, char __user *ubuf, | 3716 | tracing_entries_read(struct file *filp, char __user *ubuf, |
3647 | size_t cnt, loff_t *ppos) | 3717 | size_t cnt, loff_t *ppos) |
3648 | { | 3718 | { |
3649 | struct trace_array *tr = filp->private_data; | 3719 | struct ftrace_entries_info *info = filp->private_data; |
3650 | char buf[96]; | 3720 | struct trace_array *tr = info->tr; |
3651 | int r; | 3721 | char buf[64]; |
3722 | int r = 0; | ||
3723 | ssize_t ret; | ||
3652 | 3724 | ||
3653 | mutex_lock(&trace_types_lock); | 3725 | mutex_lock(&trace_types_lock); |
3654 | if (!ring_buffer_expanded) | 3726 | |
3655 | r = sprintf(buf, "%lu (expanded: %lu)\n", | 3727 | if (info->cpu == RING_BUFFER_ALL_CPUS) { |
3656 | tr->entries >> 10, | 3728 | int cpu, buf_size_same; |
3657 | trace_buf_size >> 10); | 3729 | unsigned long size; |
3658 | else | 3730 | |
3659 | r = sprintf(buf, "%lu\n", tr->entries >> 10); | 3731 | size = 0; |
3732 | buf_size_same = 1; | ||
3733 | /* check if all cpu sizes are same */ | ||
3734 | for_each_tracing_cpu(cpu) { | ||
3735 | /* fill in the size from first enabled cpu */ | ||
3736 | if (size == 0) | ||
3737 | size = tr->data[cpu]->entries; | ||
3738 | if (size != tr->data[cpu]->entries) { | ||
3739 | buf_size_same = 0; | ||
3740 | break; | ||
3741 | } | ||
3742 | } | ||
3743 | |||
3744 | if (buf_size_same) { | ||
3745 | if (!ring_buffer_expanded) | ||
3746 | r = sprintf(buf, "%lu (expanded: %lu)\n", | ||
3747 | size >> 10, | ||
3748 | trace_buf_size >> 10); | ||
3749 | else | ||
3750 | r = sprintf(buf, "%lu\n", size >> 10); | ||
3751 | } else | ||
3752 | r = sprintf(buf, "X\n"); | ||
3753 | } else | ||
3754 | r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); | ||
3755 | |||
3660 | mutex_unlock(&trace_types_lock); | 3756 | mutex_unlock(&trace_types_lock); |
3661 | 3757 | ||
3662 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 3758 | ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
3759 | return ret; | ||
3663 | } | 3760 | } |
3664 | 3761 | ||
3665 | static ssize_t | 3762 | static ssize_t |
3666 | tracing_entries_write(struct file *filp, const char __user *ubuf, | 3763 | tracing_entries_write(struct file *filp, const char __user *ubuf, |
3667 | size_t cnt, loff_t *ppos) | 3764 | size_t cnt, loff_t *ppos) |
3668 | { | 3765 | { |
3766 | struct ftrace_entries_info *info = filp->private_data; | ||
3669 | unsigned long val; | 3767 | unsigned long val; |
3670 | int ret; | 3768 | int ret; |
3671 | 3769 | ||
@@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3680 | /* value is in KB */ | 3778 | /* value is in KB */ |
3681 | val <<= 10; | 3779 | val <<= 10; |
3682 | 3780 | ||
3683 | ret = tracing_resize_ring_buffer(val); | 3781 | ret = tracing_resize_ring_buffer(val, info->cpu); |
3684 | if (ret < 0) | 3782 | if (ret < 0) |
3685 | return ret; | 3783 | return ret; |
3686 | 3784 | ||
@@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3689 | return cnt; | 3787 | return cnt; |
3690 | } | 3788 | } |
3691 | 3789 | ||
3790 | static int | ||
3791 | tracing_entries_release(struct inode *inode, struct file *filp) | ||
3792 | { | ||
3793 | struct ftrace_entries_info *info = filp->private_data; | ||
3794 | |||
3795 | kfree(info); | ||
3796 | |||
3797 | return 0; | ||
3798 | } | ||
3799 | |||
3692 | static ssize_t | 3800 | static ssize_t |
3693 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | 3801 | tracing_total_entries_read(struct file *filp, char __user *ubuf, |
3694 | size_t cnt, loff_t *ppos) | 3802 | size_t cnt, loff_t *ppos) |
@@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, | |||
3700 | 3808 | ||
3701 | mutex_lock(&trace_types_lock); | 3809 | mutex_lock(&trace_types_lock); |
3702 | for_each_tracing_cpu(cpu) { | 3810 | for_each_tracing_cpu(cpu) { |
3703 | size += tr->entries >> 10; | 3811 | size += tr->data[cpu]->entries >> 10; |
3704 | if (!ring_buffer_expanded) | 3812 | if (!ring_buffer_expanded) |
3705 | expanded_size += trace_buf_size >> 10; | 3813 | expanded_size += trace_buf_size >> 10; |
3706 | } | 3814 | } |
@@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
3734 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | 3842 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) |
3735 | tracing_off(); | 3843 | tracing_off(); |
3736 | /* resize the ring buffer to 0 */ | 3844 | /* resize the ring buffer to 0 */ |
3737 | tracing_resize_ring_buffer(0); | 3845 | tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); |
3738 | 3846 | ||
3739 | return 0; | 3847 | return 0; |
3740 | } | 3848 | } |
@@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3749 | struct print_entry *entry; | 3857 | struct print_entry *entry; |
3750 | unsigned long irq_flags; | 3858 | unsigned long irq_flags; |
3751 | struct page *pages[2]; | 3859 | struct page *pages[2]; |
3860 | void *map_page[2]; | ||
3752 | int nr_pages = 1; | 3861 | int nr_pages = 1; |
3753 | ssize_t written; | 3862 | ssize_t written; |
3754 | void *page1; | ||
3755 | void *page2; | ||
3756 | int offset; | 3863 | int offset; |
3757 | int size; | 3864 | int size; |
3758 | int len; | 3865 | int len; |
3759 | int ret; | 3866 | int ret; |
3867 | int i; | ||
3760 | 3868 | ||
3761 | if (tracing_disabled) | 3869 | if (tracing_disabled) |
3762 | return -EINVAL; | 3870 | return -EINVAL; |
@@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3795 | goto out; | 3903 | goto out; |
3796 | } | 3904 | } |
3797 | 3905 | ||
3798 | page1 = kmap_atomic(pages[0]); | 3906 | for (i = 0; i < nr_pages; i++) |
3799 | if (nr_pages == 2) | 3907 | map_page[i] = kmap_atomic(pages[i]); |
3800 | page2 = kmap_atomic(pages[1]); | ||
3801 | 3908 | ||
3802 | local_save_flags(irq_flags); | 3909 | local_save_flags(irq_flags); |
3803 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 3910 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
@@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3815 | 3922 | ||
3816 | if (nr_pages == 2) { | 3923 | if (nr_pages == 2) { |
3817 | len = PAGE_SIZE - offset; | 3924 | len = PAGE_SIZE - offset; |
3818 | memcpy(&entry->buf, page1 + offset, len); | 3925 | memcpy(&entry->buf, map_page[0] + offset, len); |
3819 | memcpy(&entry->buf[len], page2, cnt - len); | 3926 | memcpy(&entry->buf[len], map_page[1], cnt - len); |
3820 | } else | 3927 | } else |
3821 | memcpy(&entry->buf, page1 + offset, cnt); | 3928 | memcpy(&entry->buf, map_page[0] + offset, cnt); |
3822 | 3929 | ||
3823 | if (entry->buf[cnt - 1] != '\n') { | 3930 | if (entry->buf[cnt - 1] != '\n') { |
3824 | entry->buf[cnt] = '\n'; | 3931 | entry->buf[cnt] = '\n'; |
@@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3833 | *fpos += written; | 3940 | *fpos += written; |
3834 | 3941 | ||
3835 | out_unlock: | 3942 | out_unlock: |
3836 | if (nr_pages == 2) | 3943 | for (i = 0; i < nr_pages; i++){ |
3837 | kunmap_atomic(page2); | 3944 | kunmap_atomic(map_page[i]); |
3838 | kunmap_atomic(page1); | 3945 | put_page(pages[i]); |
3839 | while (nr_pages > 0) | 3946 | } |
3840 | put_page(pages[--nr_pages]); | ||
3841 | out: | 3947 | out: |
3842 | return written; | 3948 | return written; |
3843 | } | 3949 | } |
@@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = { | |||
3933 | }; | 4039 | }; |
3934 | 4040 | ||
3935 | static const struct file_operations tracing_entries_fops = { | 4041 | static const struct file_operations tracing_entries_fops = { |
3936 | .open = tracing_open_generic, | 4042 | .open = tracing_entries_open, |
3937 | .read = tracing_entries_read, | 4043 | .read = tracing_entries_read, |
3938 | .write = tracing_entries_write, | 4044 | .write = tracing_entries_write, |
4045 | .release = tracing_entries_release, | ||
3939 | .llseek = generic_file_llseek, | 4046 | .llseek = generic_file_llseek, |
3940 | }; | 4047 | }; |
3941 | 4048 | ||
@@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4367 | struct dentry *d_cpu; | 4474 | struct dentry *d_cpu; |
4368 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 4475 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4369 | 4476 | ||
4477 | if (!d_percpu) | ||
4478 | return; | ||
4479 | |||
4370 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 4480 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4371 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4481 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4372 | if (!d_cpu) { | 4482 | if (!d_cpu) { |
@@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4387 | 4497 | ||
4388 | trace_create_file("stats", 0444, d_cpu, | 4498 | trace_create_file("stats", 0444, d_cpu, |
4389 | (void *) cpu, &tracing_stats_fops); | 4499 | (void *) cpu, &tracing_stats_fops); |
4500 | |||
4501 | trace_create_file("buffer_size_kb", 0444, d_cpu, | ||
4502 | (void *) cpu, &tracing_entries_fops); | ||
4390 | } | 4503 | } |
4391 | 4504 | ||
4392 | #ifdef CONFIG_FTRACE_SELFTEST | 4505 | #ifdef CONFIG_FTRACE_SELFTEST |
@@ -4629,7 +4742,8 @@ static ssize_t | |||
4629 | rb_simple_read(struct file *filp, char __user *ubuf, | 4742 | rb_simple_read(struct file *filp, char __user *ubuf, |
4630 | size_t cnt, loff_t *ppos) | 4743 | size_t cnt, loff_t *ppos) |
4631 | { | 4744 | { |
4632 | struct ring_buffer *buffer = filp->private_data; | 4745 | struct trace_array *tr = filp->private_data; |
4746 | struct ring_buffer *buffer = tr->buffer; | ||
4633 | char buf[64]; | 4747 | char buf[64]; |
4634 | int r; | 4748 | int r; |
4635 | 4749 | ||
@@ -4647,7 +4761,8 @@ static ssize_t | |||
4647 | rb_simple_write(struct file *filp, const char __user *ubuf, | 4761 | rb_simple_write(struct file *filp, const char __user *ubuf, |
4648 | size_t cnt, loff_t *ppos) | 4762 | size_t cnt, loff_t *ppos) |
4649 | { | 4763 | { |
4650 | struct ring_buffer *buffer = filp->private_data; | 4764 | struct trace_array *tr = filp->private_data; |
4765 | struct ring_buffer *buffer = tr->buffer; | ||
4651 | unsigned long val; | 4766 | unsigned long val; |
4652 | int ret; | 4767 | int ret; |
4653 | 4768 | ||
@@ -4716,7 +4831,7 @@ static __init int tracer_init_debugfs(void) | |||
4716 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); | 4831 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); |
4717 | 4832 | ||
4718 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4833 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4719 | &global_trace, &tracing_entries_fops); | 4834 | (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); |
4720 | 4835 | ||
4721 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | 4836 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, |
4722 | &global_trace, &tracing_total_entries_fops); | 4837 | &global_trace, &tracing_total_entries_fops); |
@@ -4734,7 +4849,7 @@ static __init int tracer_init_debugfs(void) | |||
4734 | &trace_clock_fops); | 4849 | &trace_clock_fops); |
4735 | 4850 | ||
4736 | trace_create_file("tracing_on", 0644, d_tracer, | 4851 | trace_create_file("tracing_on", 0644, d_tracer, |
4737 | global_trace.buffer, &rb_simple_fops); | 4852 | &global_trace, &rb_simple_fops); |
4738 | 4853 | ||
4739 | #ifdef CONFIG_DYNAMIC_FTRACE | 4854 | #ifdef CONFIG_DYNAMIC_FTRACE |
4740 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 4855 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
@@ -4955,6 +5070,10 @@ __init static int tracer_alloc_buffers(void) | |||
4955 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 5070 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
4956 | goto out_free_buffer_mask; | 5071 | goto out_free_buffer_mask; |
4957 | 5072 | ||
5073 | /* Only allocate trace_printk buffers if a trace_printk exists */ | ||
5074 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | ||
5075 | trace_printk_init_buffers(); | ||
5076 | |||
4958 | /* To save memory, keep the ring buffer size to its minimum */ | 5077 | /* To save memory, keep the ring buffer size to its minimum */ |
4959 | if (ring_buffer_expanded) | 5078 | if (ring_buffer_expanded) |
4960 | ring_buf_size = trace_buf_size; | 5079 | ring_buf_size = trace_buf_size; |
@@ -4973,7 +5092,6 @@ __init static int tracer_alloc_buffers(void) | |||
4973 | WARN_ON(1); | 5092 | WARN_ON(1); |
4974 | goto out_free_cpumask; | 5093 | goto out_free_cpumask; |
4975 | } | 5094 | } |
4976 | global_trace.entries = ring_buffer_size(global_trace.buffer); | ||
4977 | if (global_trace.buffer_disabled) | 5095 | if (global_trace.buffer_disabled) |
4978 | tracing_off(); | 5096 | tracing_off(); |
4979 | 5097 | ||
@@ -4986,7 +5104,6 @@ __init static int tracer_alloc_buffers(void) | |||
4986 | ring_buffer_free(global_trace.buffer); | 5104 | ring_buffer_free(global_trace.buffer); |
4987 | goto out_free_cpumask; | 5105 | goto out_free_cpumask; |
4988 | } | 5106 | } |
4989 | max_tr.entries = 1; | ||
4990 | #endif | 5107 | #endif |
4991 | 5108 | ||
4992 | /* Allocate the first page for all buffers */ | 5109 | /* Allocate the first page for all buffers */ |
@@ -4995,6 +5112,12 @@ __init static int tracer_alloc_buffers(void) | |||
4995 | max_tr.data[i] = &per_cpu(max_tr_data, i); | 5112 | max_tr.data[i] = &per_cpu(max_tr_data, i); |
4996 | } | 5113 | } |
4997 | 5114 | ||
5115 | set_buffer_entries(&global_trace, | ||
5116 | ring_buffer_size(global_trace.buffer, 0)); | ||
5117 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5118 | set_buffer_entries(&max_tr, 1); | ||
5119 | #endif | ||
5120 | |||
4998 | trace_init_cmdlines(); | 5121 | trace_init_cmdlines(); |
4999 | 5122 | ||
5000 | register_tracer(&nop_trace); | 5123 | register_tracer(&nop_trace); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 95059f091a24..6c6f7933eede 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -131,6 +131,7 @@ struct trace_array_cpu { | |||
131 | atomic_t disabled; | 131 | atomic_t disabled; |
132 | void *buffer_page; /* ring buffer spare */ | 132 | void *buffer_page; /* ring buffer spare */ |
133 | 133 | ||
134 | unsigned long entries; | ||
134 | unsigned long saved_latency; | 135 | unsigned long saved_latency; |
135 | unsigned long critical_start; | 136 | unsigned long critical_start; |
136 | unsigned long critical_end; | 137 | unsigned long critical_end; |
@@ -152,7 +153,6 @@ struct trace_array_cpu { | |||
152 | */ | 153 | */ |
153 | struct trace_array { | 154 | struct trace_array { |
154 | struct ring_buffer *buffer; | 155 | struct ring_buffer *buffer; |
155 | unsigned long entries; | ||
156 | int cpu; | 156 | int cpu; |
157 | int buffer_disabled; | 157 | int buffer_disabled; |
158 | cycle_t time_start; | 158 | cycle_t time_start; |
@@ -826,6 +826,8 @@ extern struct list_head ftrace_events; | |||
826 | extern const char *__start___trace_bprintk_fmt[]; | 826 | extern const char *__start___trace_bprintk_fmt[]; |
827 | extern const char *__stop___trace_bprintk_fmt[]; | 827 | extern const char *__stop___trace_bprintk_fmt[]; |
828 | 828 | ||
829 | void trace_printk_init_buffers(void); | ||
830 | |||
829 | #undef FTRACE_ENTRY | 831 | #undef FTRACE_ENTRY |
830 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 832 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
831 | extern struct ftrace_event_call \ | 833 | extern struct ftrace_event_call \ |
@@ -836,11 +838,11 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
836 | filter) | 838 | filter) |
837 | #include "trace_entries.h" | 839 | #include "trace_entries.h" |
838 | 840 | ||
839 | #ifdef CONFIG_FUNCTION_TRACER | 841 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) |
840 | int perf_ftrace_event_register(struct ftrace_event_call *call, | 842 | int perf_ftrace_event_register(struct ftrace_event_call *call, |
841 | enum trace_reg type, void *data); | 843 | enum trace_reg type, void *data); |
842 | #else | 844 | #else |
843 | #define perf_ftrace_event_register NULL | 845 | #define perf_ftrace_event_register NULL |
844 | #endif /* CONFIG_FUNCTION_TRACER */ | 846 | #endif |
845 | 847 | ||
846 | #endif /* _LINUX_KERNEL_TRACE_H */ | 848 | #endif /* _LINUX_KERNEL_TRACE_H */ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93ae8a9d..29111da1d100 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
294 | if (!call->name || !call->class || !call->class->reg) | 294 | if (!call->name || !call->class || !call->class->reg) |
295 | continue; | 295 | continue; |
296 | 296 | ||
297 | if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) | ||
298 | continue; | ||
299 | |||
297 | if (match && | 300 | if (match && |
298 | strcmp(match, call->name) != 0 && | 301 | strcmp(match, call->name) != 0 && |
299 | strcmp(match, call->class->system) != 0) | 302 | strcmp(match, call->class->system) != 0) |
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1164 | return -1; | 1167 | return -1; |
1165 | } | 1168 | } |
1166 | 1169 | ||
1167 | if (call->class->reg) | 1170 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1168 | trace_create_file("enable", 0644, call->dir, call, | 1171 | trace_create_file("enable", 0644, call->dir, call, |
1169 | enable); | 1172 | enable); |
1170 | 1173 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8bc856..e039906b037d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
180 | .event.type = etype, \ | 180 | .event.type = etype, \ |
181 | .class = &event_class_ftrace_##call, \ | 181 | .class = &event_class_ftrace_##call, \ |
182 | .print_fmt = print, \ | 182 | .print_fmt = print, \ |
183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | ||
183 | }; \ | 184 | }; \ |
184 | struct ftrace_event_call __used \ | 185 | struct ftrace_event_call __used \ |
185 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 859fae6b1825..df611a0e76c5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -652,6 +652,8 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
652 | { | 652 | { |
653 | u64 next_ts; | 653 | u64 next_ts; |
654 | int ret; | 654 | int ret; |
655 | /* trace_find_next_entry will reset ent_size */ | ||
656 | int ent_size = iter->ent_size; | ||
655 | struct trace_seq *s = &iter->seq; | 657 | struct trace_seq *s = &iter->seq; |
656 | struct trace_entry *entry = iter->ent, | 658 | struct trace_entry *entry = iter->ent, |
657 | *next_entry = trace_find_next_entry(iter, NULL, | 659 | *next_entry = trace_find_next_entry(iter, NULL, |
@@ -660,6 +662,9 @@ int trace_print_lat_context(struct trace_iterator *iter) | |||
660 | unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); | 662 | unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); |
661 | unsigned long rel_usecs; | 663 | unsigned long rel_usecs; |
662 | 664 | ||
665 | /* Restore the original ent_size */ | ||
666 | iter->ent_size = ent_size; | ||
667 | |||
663 | if (!next_entry) | 668 | if (!next_entry) |
664 | next_ts = iter->ts; | 669 | next_ts = iter->ts; |
665 | rel_usecs = ns2usecs(next_ts - iter->ts); | 670 | rel_usecs = ns2usecs(next_ts - iter->ts); |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f9..a9077c1b4ad3 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | 52 | char *fmt; |
53 | 53 | ||
54 | /* allocate the trace_printk per cpu buffers */ | ||
55 | if (start != end) | ||
56 | trace_printk_init_buffers(); | ||
57 | |||
54 | mutex_lock(&btrace_mutex); | 58 | mutex_lock(&btrace_mutex); |
55 | for (iter = start; iter < end; iter++) { | 59 | for (iter = start; iter < end; iter++) { |
56 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); | 60 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a4721..000000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null | |||
@@ -1,300 +0,0 @@ | |||
1 | /* | ||
2 | * Workqueue statistical tracer. | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | |||
9 | #include <trace/events/workqueue.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/percpu.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/kref.h> | ||
14 | #include "trace_stat.h" | ||
15 | #include "trace.h" | ||
16 | |||
17 | |||
18 | /* A cpu workqueue thread */ | ||
19 | struct cpu_workqueue_stats { | ||
20 | struct list_head list; | ||
21 | struct kref kref; | ||
22 | int cpu; | ||
23 | pid_t pid; | ||
24 | /* Can be inserted from interrupt or user context, need to be atomic */ | ||
25 | atomic_t inserted; | ||
26 | /* | ||
27 | * Don't need to be atomic, works are serialized in a single workqueue thread | ||
28 | * on a single CPU. | ||
29 | */ | ||
30 | unsigned int executed; | ||
31 | }; | ||
32 | |||
33 | /* List of workqueue threads on one cpu */ | ||
34 | struct workqueue_global_stats { | ||
35 | struct list_head list; | ||
36 | spinlock_t lock; | ||
37 | }; | ||
38 | |||
39 | /* Don't need a global lock because allocated before the workqueues, and | ||
40 | * never freed. | ||
41 | */ | ||
42 | static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); | ||
43 | #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) | ||
44 | |||
45 | static void cpu_workqueue_stat_free(struct kref *kref) | ||
46 | { | ||
47 | kfree(container_of(kref, struct cpu_workqueue_stats, kref)); | ||
48 | } | ||
49 | |||
50 | /* Insertion of a work */ | ||
51 | static void | ||
52 | probe_workqueue_insertion(void *ignore, | ||
53 | struct task_struct *wq_thread, | ||
54 | struct work_struct *work) | ||
55 | { | ||
56 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
57 | struct cpu_workqueue_stats *node; | ||
58 | unsigned long flags; | ||
59 | |||
60 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
61 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
62 | if (node->pid == wq_thread->pid) { | ||
63 | atomic_inc(&node->inserted); | ||
64 | goto found; | ||
65 | } | ||
66 | } | ||
67 | pr_debug("trace_workqueue: entry not found\n"); | ||
68 | found: | ||
69 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
70 | } | ||
71 | |||
72 | /* Execution of a work */ | ||
73 | static void | ||
74 | probe_workqueue_execution(void *ignore, | ||
75 | struct task_struct *wq_thread, | ||
76 | struct work_struct *work) | ||
77 | { | ||
78 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
79 | struct cpu_workqueue_stats *node; | ||
80 | unsigned long flags; | ||
81 | |||
82 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
83 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
84 | if (node->pid == wq_thread->pid) { | ||
85 | node->executed++; | ||
86 | goto found; | ||
87 | } | ||
88 | } | ||
89 | pr_debug("trace_workqueue: entry not found\n"); | ||
90 | found: | ||
91 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
92 | } | ||
93 | |||
94 | /* Creation of a cpu workqueue thread */ | ||
95 | static void probe_workqueue_creation(void *ignore, | ||
96 | struct task_struct *wq_thread, int cpu) | ||
97 | { | ||
98 | struct cpu_workqueue_stats *cws; | ||
99 | unsigned long flags; | ||
100 | |||
101 | WARN_ON(cpu < 0); | ||
102 | |||
103 | /* Workqueues are sometimes created in atomic context */ | ||
104 | cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); | ||
105 | if (!cws) { | ||
106 | pr_warning("trace_workqueue: not enough memory\n"); | ||
107 | return; | ||
108 | } | ||
109 | INIT_LIST_HEAD(&cws->list); | ||
110 | kref_init(&cws->kref); | ||
111 | cws->cpu = cpu; | ||
112 | cws->pid = wq_thread->pid; | ||
113 | |||
114 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
115 | list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); | ||
116 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
117 | } | ||
118 | |||
119 | /* Destruction of a cpu workqueue thread */ | ||
120 | static void | ||
121 | probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) | ||
122 | { | ||
123 | /* Workqueue only execute on one cpu */ | ||
124 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
125 | struct cpu_workqueue_stats *node, *next; | ||
126 | unsigned long flags; | ||
127 | |||
128 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
129 | list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, | ||
130 | list) { | ||
131 | if (node->pid == wq_thread->pid) { | ||
132 | list_del(&node->list); | ||
133 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
134 | goto found; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | pr_debug("trace_workqueue: don't find workqueue to destroy\n"); | ||
139 | found: | ||
140 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
141 | |||
142 | } | ||
143 | |||
144 | static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) | ||
145 | { | ||
146 | unsigned long flags; | ||
147 | struct cpu_workqueue_stats *ret = NULL; | ||
148 | |||
149 | |||
150 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
151 | |||
152 | if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { | ||
153 | ret = list_entry(workqueue_cpu_stat(cpu)->list.next, | ||
154 | struct cpu_workqueue_stats, list); | ||
155 | kref_get(&ret->kref); | ||
156 | } | ||
157 | |||
158 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
159 | |||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | static void *workqueue_stat_start(struct tracer_stat *trace) | ||
164 | { | ||
165 | int cpu; | ||
166 | void *ret = NULL; | ||
167 | |||
168 | for_each_possible_cpu(cpu) { | ||
169 | ret = workqueue_stat_start_cpu(cpu); | ||
170 | if (ret) | ||
171 | return ret; | ||
172 | } | ||
173 | return NULL; | ||
174 | } | ||
175 | |||
176 | static void *workqueue_stat_next(void *prev, int idx) | ||
177 | { | ||
178 | struct cpu_workqueue_stats *prev_cws = prev; | ||
179 | struct cpu_workqueue_stats *ret; | ||
180 | int cpu = prev_cws->cpu; | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
184 | if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { | ||
185 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
186 | do { | ||
187 | cpu = cpumask_next(cpu, cpu_possible_mask); | ||
188 | if (cpu >= nr_cpu_ids) | ||
189 | return NULL; | ||
190 | } while (!(ret = workqueue_stat_start_cpu(cpu))); | ||
191 | return ret; | ||
192 | } else { | ||
193 | ret = list_entry(prev_cws->list.next, | ||
194 | struct cpu_workqueue_stats, list); | ||
195 | kref_get(&ret->kref); | ||
196 | } | ||
197 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static int workqueue_stat_show(struct seq_file *s, void *p) | ||
203 | { | ||
204 | struct cpu_workqueue_stats *cws = p; | ||
205 | struct pid *pid; | ||
206 | struct task_struct *tsk; | ||
207 | |||
208 | pid = find_get_pid(cws->pid); | ||
209 | if (pid) { | ||
210 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
211 | if (tsk) { | ||
212 | seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, | ||
213 | atomic_read(&cws->inserted), cws->executed, | ||
214 | tsk->comm); | ||
215 | put_task_struct(tsk); | ||
216 | } | ||
217 | put_pid(pid); | ||
218 | } | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static void workqueue_stat_release(void *stat) | ||
224 | { | ||
225 | struct cpu_workqueue_stats *node = stat; | ||
226 | |||
227 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
228 | } | ||
229 | |||
230 | static int workqueue_stat_headers(struct seq_file *s) | ||
231 | { | ||
232 | seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); | ||
233 | seq_printf(s, "# | | | |\n"); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct tracer_stat workqueue_stats __read_mostly = { | ||
238 | .name = "workqueues", | ||
239 | .stat_start = workqueue_stat_start, | ||
240 | .stat_next = workqueue_stat_next, | ||
241 | .stat_show = workqueue_stat_show, | ||
242 | .stat_release = workqueue_stat_release, | ||
243 | .stat_headers = workqueue_stat_headers | ||
244 | }; | ||
245 | |||
246 | |||
247 | int __init stat_workqueue_init(void) | ||
248 | { | ||
249 | if (register_stat_tracer(&workqueue_stats)) { | ||
250 | pr_warning("Unable to register workqueue stat tracer\n"); | ||
251 | return 1; | ||
252 | } | ||
253 | |||
254 | return 0; | ||
255 | } | ||
256 | fs_initcall(stat_workqueue_init); | ||
257 | |||
258 | /* | ||
259 | * Workqueues are created very early, just after pre-smp initcalls. | ||
260 | * So we must register our tracepoints at this stage. | ||
261 | */ | ||
262 | int __init trace_workqueue_early_init(void) | ||
263 | { | ||
264 | int ret, cpu; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
272 | if (ret) | ||
273 | goto out; | ||
274 | |||
275 | ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
276 | if (ret) | ||
277 | goto no_insertion; | ||
278 | |||
279 | ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
280 | if (ret) | ||
281 | goto no_execution; | ||
282 | |||
283 | ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); | ||
284 | if (ret) | ||
285 | goto no_creation; | ||
286 | |||
287 | return 0; | ||
288 | |||
289 | no_creation: | ||
290 | unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
291 | no_execution: | ||
292 | unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
293 | no_insertion: | ||
294 | unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
295 | out: | ||
296 | pr_warning("trace_workqueue: unable to trace workqueues\n"); | ||
297 | |||
298 | return 1; | ||
299 | } | ||
300 | early_initcall(trace_workqueue_early_init); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c08..9a3128dc67df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1032 | cwq = get_cwq(gcwq->cpu, wq); | 1032 | cwq = get_cwq(gcwq->cpu, wq); |
1033 | trace_workqueue_queue_work(cpu, cwq, work); | 1033 | trace_workqueue_queue_work(cpu, cwq, work); |
1034 | 1034 | ||
1035 | BUG_ON(!list_empty(&work->entry)); | 1035 | if (WARN_ON(!list_empty(&work->entry))) { |
1036 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
1037 | return; | ||
1038 | } | ||
1036 | 1039 | ||
1037 | cwq->nr_in_flight[cwq->work_color]++; | 1040 | cwq->nr_in_flight[cwq->work_color]++; |
1038 | work_flags = work_color_to_flags(cwq->work_color); | 1041 | work_flags = work_color_to_flags(cwq->work_color); |
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) | |||
1210 | } else | 1213 | } else |
1211 | wake_up_all(&gcwq->trustee_wait); | 1214 | wake_up_all(&gcwq->trustee_wait); |
1212 | 1215 | ||
1213 | /* sanity check nr_running */ | 1216 | /* |
1214 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | 1217 | * Sanity check nr_running. Because trustee releases gcwq->lock |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | ||
1219 | * warning may trigger spuriously. Check iff trustee is idle. | ||
1220 | */ | ||
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | ||
1222 | gcwq->nr_workers == gcwq->nr_idle && | ||
1215 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); |
1216 | } | 1224 | } |
1217 | 1225 | ||
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) | |||
1810 | * lock freed" warnings as well as problems when looking into | 1818 | * lock freed" warnings as well as problems when looking into |
1811 | * work->lockdep_map, make a copy and use that here. | 1819 | * work->lockdep_map, make a copy and use that here. |
1812 | */ | 1820 | */ |
1813 | struct lockdep_map lockdep_map = work->lockdep_map; | 1821 | struct lockdep_map lockdep_map; |
1822 | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | ||
1814 | #endif | 1824 | #endif |
1815 | /* | 1825 | /* |
1816 | * A single work shouldn't be executed concurrently by | 1826 | * A single work shouldn't be executed concurrently by |
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) | |||
2506 | { | 2516 | { |
2507 | struct wq_barrier barr; | 2517 | struct wq_barrier barr; |
2508 | 2518 | ||
2519 | lock_map_acquire(&work->lockdep_map); | ||
2520 | lock_map_release(&work->lockdep_map); | ||
2521 | |||
2509 | if (start_flush_work(work, &barr, true)) { | 2522 | if (start_flush_work(work, &barr, true)) { |
2510 | wait_for_completion(&barr.done); | 2523 | wait_for_completion(&barr.done); |
2511 | destroy_work_on_stack(&barr.work); | 2524 | destroy_work_on_stack(&barr.work); |