Diffstat (limited to 'kernel')
48 files changed, 4177 insertions, 1870 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..6c07f30fa9b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | |||
43 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 43 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
44 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 44 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
45 | obj-$(CONFIG_SMP) += smp.o | 45 | obj-$(CONFIG_SMP) += smp.o |
46 | obj-$(CONFIG_SMP) += smpboot.o | ||
46 | ifneq ($(CONFIG_SMP),y) | 47 | ifneq ($(CONFIG_SMP),y) |
47 | obj-y += up.o | 48 | obj-y += up.o |
48 | endif | 49 | endif |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@ | |||
67 | #include <linux/syscalls.h> | 67 | #include <linux/syscalls.h> |
68 | #include <linux/capability.h> | 68 | #include <linux/capability.h> |
69 | #include <linux/fs_struct.h> | 69 | #include <linux/fs_struct.h> |
70 | #include <linux/compat.h> | ||
70 | 71 | ||
71 | #include "audit.h" | 72 | #include "audit.h" |
72 | 73 | ||
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr) | |||
2710 | audit_log_end(ab); | 2711 | audit_log_end(ab); |
2711 | } | 2712 | } |
2712 | 2713 | ||
2713 | void __audit_seccomp(unsigned long syscall) | 2714 | void __audit_seccomp(unsigned long syscall, long signr, int code) |
2714 | { | 2715 | { |
2715 | struct audit_buffer *ab; | 2716 | struct audit_buffer *ab; |
2716 | 2717 | ||
2717 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2718 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2718 | audit_log_abend(ab, "seccomp", SIGKILL); | 2719 | audit_log_abend(ab, "seccomp", signr); |
2719 | audit_log_format(ab, " syscall=%ld", syscall); | 2720 | audit_log_format(ab, " syscall=%ld", syscall); |
2721 | audit_log_format(ab, " compat=%d", is_compat_task()); | ||
2722 | audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); | ||
2723 | audit_log_format(ab, " code=0x%x", code); | ||
2720 | audit_log_end(ab); | 2724 | audit_log_end(ab); |
2721 | } | 2725 | } |
2722 | 2726 | ||
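The reworked __audit_seccomp() lets its caller report which fatal signal is being delivered and which seccomp return value triggered it, alongside the syscall number, compat flag and instruction pointer. A hedged sketch of a call site using the new signature; the helper name, the surrounding seccomp logic and the choice of SIGKILL/SECCOMP_RET_KILL are illustrative assumptions, not part of this hunk:

/* Illustrative only: how a seccomp kill path might report the event. */
static void demo_report_seccomp_kill(unsigned long this_syscall)
{
	/* syscall number, fatal signal, seccomp action code */
	__audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
}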
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..ad8eae5bb801 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@ | |||
60 | #include <linux/eventfd.h> | 60 | #include <linux/eventfd.h> |
61 | #include <linux/poll.h> | 61 | #include <linux/poll.h> |
62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | 62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ |
63 | #include <linux/kthread.h> | ||
63 | 64 | ||
64 | #include <linux/atomic.h> | 65 | #include <linux/atomic.h> |
65 | 66 | ||
67 | /* css deactivation bias, makes css->refcnt negative to deny new trygets */ | ||
68 | #define CSS_DEACT_BIAS INT_MIN | ||
69 | |||
66 | /* | 70 | /* |
67 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 71 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
68 | * hierarchy must be performed while holding it. | 72 | * hierarchy must be performed while holding it. |
@@ -127,6 +131,9 @@ struct cgroupfs_root { | |||
127 | /* A list running through the active hierarchies */ | 131 | /* A list running through the active hierarchies */ |
128 | struct list_head root_list; | 132 | struct list_head root_list; |
129 | 133 | ||
134 | /* All cgroups on this root, cgroup_mutex protected */ | ||
135 | struct list_head allcg_list; | ||
136 | |||
130 | /* Hierarchy-specific flags */ | 137 | /* Hierarchy-specific flags */ |
131 | unsigned long flags; | 138 | unsigned long flags; |
132 | 139 | ||
@@ -145,6 +152,15 @@ struct cgroupfs_root { | |||
145 | static struct cgroupfs_root rootnode; | 152 | static struct cgroupfs_root rootnode; |
146 | 153 | ||
147 | /* | 154 | /* |
155 | * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. | ||
156 | */ | ||
157 | struct cfent { | ||
158 | struct list_head node; | ||
159 | struct dentry *dentry; | ||
160 | struct cftype *type; | ||
161 | }; | ||
162 | |||
163 | /* | ||
148 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | 164 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when |
149 | * cgroup_subsys->use_id != 0. | 165 | * cgroup_subsys->use_id != 0. |
150 | */ | 166 | */ |
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void) | |||
239 | 255 | ||
240 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
241 | 257 | ||
258 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | ||
259 | static int css_refcnt(struct cgroup_subsys_state *css) | ||
260 | { | ||
261 | int v = atomic_read(&css->refcnt); | ||
262 | |||
263 | return v >= 0 ? v : v - CSS_DEACT_BIAS; | ||
264 | } | ||
265 | |||
242 | /* convenient tests for these bits */ | 266 | /* convenient tests for these bits */ |
243 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 267 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
244 | { | 268 | { |
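The CSS_DEACT_BIAS scheme above keeps the refcount and the "deactivated" state in a single atomic word: adding INT_MIN makes the stored value negative, which new trygets can detect from the sign alone, while css_refcnt() recovers the true count by subtracting the bias (a count of 3 becomes INT_MIN + 3 when deactivated, and (INT_MIN + 3) - INT_MIN = 3 again). A minimal userspace sketch of the same arithmetic, using C11 atomics rather than the kernel's atomic_t:

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

#define CSS_DEACT_BIAS	INT_MIN

static atomic_int refcnt = 3;		/* live object holding three refs */

/* Same recovery as css_refcnt(): strip the bias if it has been applied. */
static int true_refcnt(void)
{
	int v = atomic_load(&refcnt);

	return v >= 0 ? v : v - CSS_DEACT_BIAS;
}

int main(void)
{
	printf("before deactivation: %d\n", true_refcnt());	/* 3 */

	/* Deactivate: stored value becomes INT_MIN + 3, i.e. negative. */
	atomic_fetch_add(&refcnt, CSS_DEACT_BIAS);
	printf("stored value negative: %d\n", atomic_load(&refcnt) < 0);
	printf("after deactivation: %d\n", true_refcnt());	/* still 3 */

	/* Abort the removal: restore the positive value. */
	atomic_fetch_sub(&refcnt, CSS_DEACT_BIAS);
	printf("after restore: %d\n", true_refcnt());		/* 3 again */
	return 0;
}

Keeping both pieces of information in one word is what lets the tryget path stay lock-free: a single cmpxchg fails either because the count changed or because deactivation set in.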
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) | |||
279 | #define for_each_active_root(_root) \ | 303 | #define for_each_active_root(_root) \ |
280 | list_for_each_entry(_root, &roots, root_list) | 304 | list_for_each_entry(_root, &roots, root_list) |
281 | 305 | ||
306 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
307 | { | ||
308 | return dentry->d_fsdata; | ||
309 | } | ||
310 | |||
311 | static inline struct cfent *__d_cfe(struct dentry *dentry) | ||
312 | { | ||
313 | return dentry->d_fsdata; | ||
314 | } | ||
315 | |||
316 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
317 | { | ||
318 | return __d_cfe(dentry)->type; | ||
319 | } | ||
320 | |||
282 | /* the list of cgroups eligible for automatic release. Protected by | 321 | /* the list of cgroups eligible for automatic release. Protected by |
283 | * release_list_lock */ | 322 | * release_list_lock */ |
284 | static LIST_HEAD(release_list); | 323 | static LIST_HEAD(release_list); |
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
816 | struct cgroup_subsys *ss; | 855 | struct cgroup_subsys *ss; |
817 | int ret = 0; | 856 | int ret = 0; |
818 | 857 | ||
819 | for_each_subsys(cgrp->root, ss) | 858 | for_each_subsys(cgrp->root, ss) { |
820 | if (ss->pre_destroy) { | 859 | if (!ss->pre_destroy) |
821 | ret = ss->pre_destroy(cgrp); | 860 | continue; |
822 | if (ret) | 861 | |
823 | break; | 862 | ret = ss->pre_destroy(cgrp); |
863 | if (ret) { | ||
864 | /* ->pre_destroy() failure is being deprecated */ | ||
865 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
866 | break; | ||
824 | } | 867 | } |
868 | } | ||
825 | 869 | ||
826 | return ret; | 870 | return ret; |
827 | } | 871 | } |
@@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
864 | BUG_ON(!list_empty(&cgrp->pidlists)); | 908 | BUG_ON(!list_empty(&cgrp->pidlists)); |
865 | 909 | ||
866 | kfree_rcu(cgrp, rcu_head); | 910 | kfree_rcu(cgrp, rcu_head); |
911 | } else { | ||
912 | struct cfent *cfe = __d_cfe(dentry); | ||
913 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | ||
914 | |||
915 | WARN_ONCE(!list_empty(&cfe->node) && | ||
916 | cgrp != &cgrp->root->top_cgroup, | ||
917 | "cfe still linked for %s\n", cfe->type->name); | ||
918 | kfree(cfe); | ||
867 | } | 919 | } |
868 | iput(inode); | 920 | iput(inode); |
869 | } | 921 | } |
@@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d) | |||
882 | dput(parent); | 934 | dput(parent); |
883 | } | 935 | } |
884 | 936 | ||
885 | static void cgroup_clear_directory(struct dentry *dentry) | 937 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
886 | { | 938 | { |
887 | struct list_head *node; | 939 | struct cfent *cfe; |
888 | 940 | ||
889 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 941 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
890 | spin_lock(&dentry->d_lock); | 942 | lockdep_assert_held(&cgroup_mutex); |
891 | node = dentry->d_subdirs.next; | 943 | |
892 | while (node != &dentry->d_subdirs) { | 944 | list_for_each_entry(cfe, &cgrp->files, node) { |
893 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 945 | struct dentry *d = cfe->dentry; |
894 | 946 | ||
895 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | 947 | if (cft && cfe->type != cft) |
896 | list_del_init(node); | 948 | continue; |
897 | if (d->d_inode) { | 949 | |
898 | /* This should never be called on a cgroup | 950 | dget(d); |
899 | * directory with child cgroups */ | 951 | d_delete(d); |
900 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 952 | simple_unlink(d->d_inode, d); |
901 | dget_dlock(d); | 953 | list_del_init(&cfe->node); |
902 | spin_unlock(&d->d_lock); | 954 | dput(d); |
903 | spin_unlock(&dentry->d_lock); | 955 | |
904 | d_delete(d); | 956 | return 0; |
905 | simple_unlink(dentry->d_inode, d); | ||
906 | dput(d); | ||
907 | spin_lock(&dentry->d_lock); | ||
908 | } else | ||
909 | spin_unlock(&d->d_lock); | ||
910 | node = dentry->d_subdirs.next; | ||
911 | } | 957 | } |
912 | spin_unlock(&dentry->d_lock); | 958 | return -ENOENT; |
959 | } | ||
960 | |||
961 | static void cgroup_clear_directory(struct dentry *dir) | ||
962 | { | ||
963 | struct cgroup *cgrp = __d_cgrp(dir); | ||
964 | |||
965 | while (!list_empty(&cgrp->files)) | ||
966 | cgroup_rm_file(cgrp, NULL); | ||
913 | } | 967 | } |
914 | 968 | ||
915 | /* | 969 | /* |
@@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1294 | if (ret) | 1348 | if (ret) |
1295 | goto out_unlock; | 1349 | goto out_unlock; |
1296 | 1350 | ||
1351 | /* See feature-removal-schedule.txt */ | ||
1352 | if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) | ||
1353 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | ||
1354 | task_tgid_nr(current), current->comm); | ||
1355 | |||
1297 | /* Don't allow flags or name to change at remount */ | 1356 | /* Don't allow flags or name to change at remount */ |
1298 | if (opts.flags != root->flags || | 1357 | if (opts.flags != root->flags || |
1299 | (opts.name && strcmp(opts.name, root->name))) { | 1358 | (opts.name && strcmp(opts.name, root->name))) { |
@@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1308 | goto out_unlock; | 1367 | goto out_unlock; |
1309 | } | 1368 | } |
1310 | 1369 | ||
1311 | /* (re)populate subsystem files */ | 1370 | /* clear out any existing files and repopulate subsystem files */ |
1371 | cgroup_clear_directory(cgrp->dentry); | ||
1312 | cgroup_populate_dir(cgrp); | 1372 | cgroup_populate_dir(cgrp); |
1313 | 1373 | ||
1314 | if (opts.release_agent) | 1374 | if (opts.release_agent) |
@@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1333 | { | 1393 | { |
1334 | INIT_LIST_HEAD(&cgrp->sibling); | 1394 | INIT_LIST_HEAD(&cgrp->sibling); |
1335 | INIT_LIST_HEAD(&cgrp->children); | 1395 | INIT_LIST_HEAD(&cgrp->children); |
1396 | INIT_LIST_HEAD(&cgrp->files); | ||
1336 | INIT_LIST_HEAD(&cgrp->css_sets); | 1397 | INIT_LIST_HEAD(&cgrp->css_sets); |
1337 | INIT_LIST_HEAD(&cgrp->release_list); | 1398 | INIT_LIST_HEAD(&cgrp->release_list); |
1338 | INIT_LIST_HEAD(&cgrp->pidlists); | 1399 | INIT_LIST_HEAD(&cgrp->pidlists); |
@@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1344 | static void init_cgroup_root(struct cgroupfs_root *root) | 1405 | static void init_cgroup_root(struct cgroupfs_root *root) |
1345 | { | 1406 | { |
1346 | struct cgroup *cgrp = &root->top_cgroup; | 1407 | struct cgroup *cgrp = &root->top_cgroup; |
1408 | |||
1347 | INIT_LIST_HEAD(&root->subsys_list); | 1409 | INIT_LIST_HEAD(&root->subsys_list); |
1348 | INIT_LIST_HEAD(&root->root_list); | 1410 | INIT_LIST_HEAD(&root->root_list); |
1411 | INIT_LIST_HEAD(&root->allcg_list); | ||
1349 | root->number_of_cgroups = 1; | 1412 | root->number_of_cgroups = 1; |
1350 | cgrp->root = root; | 1413 | cgrp->root = root; |
1351 | cgrp->top_cgroup = cgrp; | 1414 | cgrp->top_cgroup = cgrp; |
1415 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1352 | init_cgroup_housekeeping(cgrp); | 1416 | init_cgroup_housekeeping(cgrp); |
1353 | } | 1417 | } |
1354 | 1418 | ||
@@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = { | |||
1692 | 1756 | ||
1693 | static struct kobject *cgroup_kobj; | 1757 | static struct kobject *cgroup_kobj; |
1694 | 1758 | ||
1695 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
1696 | { | ||
1697 | return dentry->d_fsdata; | ||
1698 | } | ||
1699 | |||
1700 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
1701 | { | ||
1702 | return dentry->d_fsdata; | ||
1703 | } | ||
1704 | |||
1705 | /** | 1759 | /** |
1706 | * cgroup_path - generate the path of a cgroup | 1760 | * cgroup_path - generate the path of a cgroup |
1707 | * @cgrp: the cgroup in question | 1761 | * @cgrp: the cgroup in question |
@@ -2172,6 +2226,18 @@ retry_find_task: | |||
2172 | 2226 | ||
2173 | if (threadgroup) | 2227 | if (threadgroup) |
2174 | tsk = tsk->group_leader; | 2228 | tsk = tsk->group_leader; |
2229 | |||
2230 | /* | ||
2231 | * Workqueue threads may acquire PF_THREAD_BOUND and become | ||
2232 | * trapped in a cpuset, or RT worker may be born in a cgroup | ||
2233 | * with no rt_runtime allocated. Just say no. | ||
2234 | */ | ||
2235 | if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { | ||
2236 | ret = -EINVAL; | ||
2237 | rcu_read_unlock(); | ||
2238 | goto out_unlock_cgroup; | ||
2239 | } | ||
2240 | |||
2175 | get_task_struct(tsk); | 2241 | get_task_struct(tsk); |
2176 | rcu_read_unlock(); | 2242 | rcu_read_unlock(); |
2177 | 2243 | ||
@@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2603 | return mode; | 2669 | return mode; |
2604 | } | 2670 | } |
2605 | 2671 | ||
2606 | int cgroup_add_file(struct cgroup *cgrp, | 2672 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2607 | struct cgroup_subsys *subsys, | 2673 | const struct cftype *cft) |
2608 | const struct cftype *cft) | ||
2609 | { | 2674 | { |
2610 | struct dentry *dir = cgrp->dentry; | 2675 | struct dentry *dir = cgrp->dentry; |
2676 | struct cgroup *parent = __d_cgrp(dir); | ||
2611 | struct dentry *dentry; | 2677 | struct dentry *dentry; |
2678 | struct cfent *cfe; | ||
2612 | int error; | 2679 | int error; |
2613 | umode_t mode; | 2680 | umode_t mode; |
2614 | |||
2615 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2681 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2682 | |||
2683 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2684 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2685 | return 0; | ||
2686 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2687 | return 0; | ||
2688 | |||
2616 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2689 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2617 | strcpy(name, subsys->name); | 2690 | strcpy(name, subsys->name); |
2618 | strcat(name, "."); | 2691 | strcat(name, "."); |
2619 | } | 2692 | } |
2620 | strcat(name, cft->name); | 2693 | strcat(name, cft->name); |
2694 | |||
2621 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); | 2695 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); |
2696 | |||
2697 | cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); | ||
2698 | if (!cfe) | ||
2699 | return -ENOMEM; | ||
2700 | |||
2622 | dentry = lookup_one_len(name, dir, strlen(name)); | 2701 | dentry = lookup_one_len(name, dir, strlen(name)); |
2623 | if (!IS_ERR(dentry)) { | 2702 | if (IS_ERR(dentry)) { |
2624 | mode = cgroup_file_mode(cft); | ||
2625 | error = cgroup_create_file(dentry, mode | S_IFREG, | ||
2626 | cgrp->root->sb); | ||
2627 | if (!error) | ||
2628 | dentry->d_fsdata = (void *)cft; | ||
2629 | dput(dentry); | ||
2630 | } else | ||
2631 | error = PTR_ERR(dentry); | 2703 | error = PTR_ERR(dentry); |
2704 | goto out; | ||
2705 | } | ||
2706 | |||
2707 | mode = cgroup_file_mode(cft); | ||
2708 | error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); | ||
2709 | if (!error) { | ||
2710 | cfe->type = (void *)cft; | ||
2711 | cfe->dentry = dentry; | ||
2712 | dentry->d_fsdata = cfe; | ||
2713 | list_add_tail(&cfe->node, &parent->files); | ||
2714 | cfe = NULL; | ||
2715 | } | ||
2716 | dput(dentry); | ||
2717 | out: | ||
2718 | kfree(cfe); | ||
2632 | return error; | 2719 | return error; |
2633 | } | 2720 | } |
2634 | EXPORT_SYMBOL_GPL(cgroup_add_file); | ||
2635 | 2721 | ||
2636 | int cgroup_add_files(struct cgroup *cgrp, | 2722 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2637 | struct cgroup_subsys *subsys, | 2723 | const struct cftype cfts[], bool is_add) |
2638 | const struct cftype cft[], | ||
2639 | int count) | ||
2640 | { | 2724 | { |
2641 | int i, err; | 2725 | const struct cftype *cft; |
2642 | for (i = 0; i < count; i++) { | 2726 | int err, ret = 0; |
2643 | err = cgroup_add_file(cgrp, subsys, &cft[i]); | 2727 | |
2644 | if (err) | 2728 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2645 | return err; | 2729 | if (is_add) |
2730 | err = cgroup_add_file(cgrp, subsys, cft); | ||
2731 | else | ||
2732 | err = cgroup_rm_file(cgrp, cft); | ||
2733 | if (err) { | ||
2734 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
2735 | is_add ? "add" : "remove", cft->name, err); | ||
2736 | ret = err; | ||
2737 | } | ||
2738 | } | ||
2739 | return ret; | ||
2740 | } | ||
2741 | |||
2742 | static DEFINE_MUTEX(cgroup_cft_mutex); | ||
2743 | |||
2744 | static void cgroup_cfts_prepare(void) | ||
2745 | __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) | ||
2746 | { | ||
2747 | /* | ||
2748 | * Thanks to the entanglement with vfs inode locking, we can't walk | ||
2749 | * the existing cgroups under cgroup_mutex and create files. | ||
2750 | * Instead, we increment reference on all cgroups and build list of | ||
2751 | * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure | ||
2752 | * exclusive access to the field. | ||
2753 | */ | ||
2754 | mutex_lock(&cgroup_cft_mutex); | ||
2755 | mutex_lock(&cgroup_mutex); | ||
2756 | } | ||
2757 | |||
2758 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | ||
2759 | const struct cftype *cfts, bool is_add) | ||
2760 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | ||
2761 | { | ||
2762 | LIST_HEAD(pending); | ||
2763 | struct cgroup *cgrp, *n; | ||
2764 | |||
2765 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | ||
2766 | if (cfts && ss->root != &rootnode) { | ||
2767 | list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { | ||
2768 | dget(cgrp->dentry); | ||
2769 | list_add_tail(&cgrp->cft_q_node, &pending); | ||
2770 | } | ||
2771 | } | ||
2772 | |||
2773 | mutex_unlock(&cgroup_mutex); | ||
2774 | |||
2775 | /* | ||
2776 | * All new cgroups will see @cfts update on @ss->cftsets. Add/rm | ||
2777 | * files for all cgroups which were created before. | ||
2778 | */ | ||
2779 | list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { | ||
2780 | struct inode *inode = cgrp->dentry->d_inode; | ||
2781 | |||
2782 | mutex_lock(&inode->i_mutex); | ||
2783 | mutex_lock(&cgroup_mutex); | ||
2784 | if (!cgroup_is_removed(cgrp)) | ||
2785 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | ||
2786 | mutex_unlock(&cgroup_mutex); | ||
2787 | mutex_unlock(&inode->i_mutex); | ||
2788 | |||
2789 | list_del_init(&cgrp->cft_q_node); | ||
2790 | dput(cgrp->dentry); | ||
2646 | } | 2791 | } |
2792 | |||
2793 | mutex_unlock(&cgroup_cft_mutex); | ||
2794 | } | ||
2795 | |||
2796 | /** | ||
2797 | * cgroup_add_cftypes - add an array of cftypes to a subsystem | ||
2798 | * @ss: target cgroup subsystem | ||
2799 | * @cfts: zero-length name terminated array of cftypes | ||
2800 | * | ||
2801 | * Register @cfts to @ss. Files described by @cfts are created for all | ||
2802 | * existing cgroups to which @ss is attached and all future cgroups will | ||
2803 | * have them too. This function can be called anytime whether @ss is | ||
2804 | * attached or not. | ||
2805 | * | ||
2806 | * Returns 0 on successful registration, -errno on failure. Note that this | ||
2807 | * function currently returns 0 as long as @cfts registration is successful | ||
2808 | * even if some file creation attempts on existing cgroups fail. | ||
2809 | */ | ||
2810 | int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2811 | { | ||
2812 | struct cftype_set *set; | ||
2813 | |||
2814 | set = kzalloc(sizeof(*set), GFP_KERNEL); | ||
2815 | if (!set) | ||
2816 | return -ENOMEM; | ||
2817 | |||
2818 | cgroup_cfts_prepare(); | ||
2819 | set->cfts = cfts; | ||
2820 | list_add_tail(&set->node, &ss->cftsets); | ||
2821 | cgroup_cfts_commit(ss, cfts, true); | ||
2822 | |||
2647 | return 0; | 2823 | return 0; |
2648 | } | 2824 | } |
2649 | EXPORT_SYMBOL_GPL(cgroup_add_files); | 2825 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
2826 | |||
2827 | /** | ||
2828 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | ||
2829 | * @ss: target cgroup subsystem | ||
2830 | * @cfts: zero-length name terminated array of cftypes | ||
2831 | * | ||
2832 | * Unregister @cfts from @ss. Files described by @cfts are removed from | ||
2833 | * all existing cgroups to which @ss is attached and all future cgroups | ||
2834 | * won't have them either. This function can be called anytime whether @ss | ||
2835 | * is attached or not. | ||
2836 | * | ||
2837 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | ||
2838 | * registered with @ss. | ||
2839 | */ | ||
2840 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2841 | { | ||
2842 | struct cftype_set *set; | ||
2843 | |||
2844 | cgroup_cfts_prepare(); | ||
2845 | |||
2846 | list_for_each_entry(set, &ss->cftsets, node) { | ||
2847 | if (set->cfts == cfts) { | ||
2848 | list_del_init(&set->node); | ||
2849 | cgroup_cfts_commit(ss, cfts, false); | ||
2850 | return 0; | ||
2851 | } | ||
2852 | } | ||
2853 | |||
2854 | cgroup_cfts_commit(ss, NULL, false); | ||
2855 | return -ENOENT; | ||
2856 | } | ||
2650 | 2857 | ||
2651 | /** | 2858 | /** |
2652 | * cgroup_task_count - count the number of tasks in a cgroup. | 2859 | * cgroup_task_count - count the number of tasks in a cgroup. |
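Taken together, the new cgroup_add_cftypes()/cgroup_rm_cftypes() pair replaces the old pattern of each controller adding files one by one from its ->populate() callback. A hedged sketch of how a controller might use it; the demo_* names are hypothetical, and only cgroup_add_cftypes(), the cftype fields and the empty-entry terminator come from the code above:

/* Hypothetical controller-side usage; demo_subsys is assumed declared
 * elsewhere and is not a real subsystem. */
static u64 demo_value_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 42;	/* placeholder value */
}

static struct cftype demo_files[] = {
	{
		.name = "demo.value",
		.read_u64 = demo_value_read,
	},
	{ }	/* terminate */
};

static int __init demo_register_files(void)
{
	/* files appear in every existing and future cgroup of the hierarchy */
	return cgroup_add_cftypes(&demo_subsys, demo_files);
}

The zero-length-name terminator is what cgroup_addrm_files() iterates on, so no element count needs to be passed any more.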
@@ -3625,13 +3832,14 @@ static struct cftype files[] = { | |||
3625 | .read_u64 = cgroup_clone_children_read, | 3832 | .read_u64 = cgroup_clone_children_read, |
3626 | .write_u64 = cgroup_clone_children_write, | 3833 | .write_u64 = cgroup_clone_children_write, |
3627 | }, | 3834 | }, |
3628 | }; | 3835 | { |
3629 | 3836 | .name = "release_agent", | |
3630 | static struct cftype cft_release_agent = { | 3837 | .flags = CFTYPE_ONLY_ON_ROOT, |
3631 | .name = "release_agent", | 3838 | .read_seq_string = cgroup_release_agent_show, |
3632 | .read_seq_string = cgroup_release_agent_show, | 3839 | .write_string = cgroup_release_agent_write, |
3633 | .write_string = cgroup_release_agent_write, | 3840 | .max_write_len = PATH_MAX, |
3634 | .max_write_len = PATH_MAX, | 3841 | }, |
3842 | { } /* terminate */ | ||
3635 | }; | 3843 | }; |
3636 | 3844 | ||
3637 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3845 | static int cgroup_populate_dir(struct cgroup *cgrp) |
@@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3639 | int err; | 3847 | int err; |
3640 | struct cgroup_subsys *ss; | 3848 | struct cgroup_subsys *ss; |
3641 | 3849 | ||
3642 | /* First clear out any existing files */ | 3850 | err = cgroup_addrm_files(cgrp, NULL, files, true); |
3643 | cgroup_clear_directory(cgrp->dentry); | ||
3644 | |||
3645 | err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); | ||
3646 | if (err < 0) | 3851 | if (err < 0) |
3647 | return err; | 3852 | return err; |
3648 | 3853 | ||
3649 | if (cgrp == cgrp->top_cgroup) { | 3854 | /* process cftsets of each subsystem */ |
3650 | if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) | ||
3651 | return err; | ||
3652 | } | ||
3653 | |||
3654 | for_each_subsys(cgrp->root, ss) { | 3855 | for_each_subsys(cgrp->root, ss) { |
3655 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) | 3856 | struct cftype_set *set; |
3656 | return err; | 3857 | |
3858 | list_for_each_entry(set, &ss->cftsets, node) | ||
3859 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | ||
3657 | } | 3860 | } |
3861 | |||
3658 | /* This cgroup is ready now */ | 3862 | /* This cgroup is ready now */ |
3659 | for_each_subsys(cgrp->root, ss) { | 3863 | for_each_subsys(cgrp->root, ss) { |
3660 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3864 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
@@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3670 | return 0; | 3874 | return 0; |
3671 | } | 3875 | } |
3672 | 3876 | ||
3877 | static void css_dput_fn(struct work_struct *work) | ||
3878 | { | ||
3879 | struct cgroup_subsys_state *css = | ||
3880 | container_of(work, struct cgroup_subsys_state, dput_work); | ||
3881 | |||
3882 | dput(css->cgroup->dentry); | ||
3883 | } | ||
3884 | |||
3673 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3885 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
3674 | struct cgroup_subsys *ss, | 3886 | struct cgroup_subsys *ss, |
3675 | struct cgroup *cgrp) | 3887 | struct cgroup *cgrp) |
@@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
3682 | set_bit(CSS_ROOT, &css->flags); | 3894 | set_bit(CSS_ROOT, &css->flags); |
3683 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 3895 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
3684 | cgrp->subsys[ss->subsys_id] = css; | 3896 | cgrp->subsys[ss->subsys_id] = css; |
3897 | |||
3898 | /* | ||
3899 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | ||
3900 | * which is put on the last css_put(). dput() requires process | ||
3901 | * context, which css_put() may be called without. @css->dput_work | ||
3902 | * will be used to invoke dput() asynchronously from css_put(). | ||
3903 | */ | ||
3904 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
3905 | if (ss->__DEPRECATED_clear_css_refs) | ||
3906 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | ||
3685 | } | 3907 | } |
3686 | 3908 | ||
3687 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | 3909 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) |
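The dput_work added here is an instance of a common kernel pattern: the final css_put() may happen in a context where dput() is not allowed (it can sleep), so the dentry release is bounced to a workqueue. A self-contained sketch of the pattern with a hypothetical object type; only INIT_WORK(), schedule_work(), container_of() and the dget()/dput() pairing are the real APIs involved:

#include <linux/dcache.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

/* Hypothetical object that pins a dentry for its whole lifetime. */
struct demo_obj {
	struct dentry *dentry;
	struct work_struct dput_work;
};

static void demo_dput_fn(struct work_struct *work)
{
	struct demo_obj *obj = container_of(work, struct demo_obj, dput_work);

	dput(obj->dentry);		/* runs in process context */
}

static void demo_obj_init(struct demo_obj *obj, struct dentry *dentry)
{
	obj->dentry = dget(dentry);
	INIT_WORK(&obj->dput_work, demo_dput_fn);
}

/* Safe to call from atomic context; the actual dput() is deferred. */
static void demo_obj_release(struct demo_obj *obj)
{
	schedule_work(&obj->dput_work);
}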
@@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3784 | if (err < 0) | 4006 | if (err < 0) |
3785 | goto err_remove; | 4007 | goto err_remove; |
3786 | 4008 | ||
4009 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4010 | for_each_subsys(root, ss) | ||
4011 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4012 | dget(dentry); | ||
4013 | |||
3787 | /* The cgroup directory was pre-locked for us */ | 4014 | /* The cgroup directory was pre-locked for us */ |
3788 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4015 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
3789 | 4016 | ||
4017 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4018 | |||
3790 | err = cgroup_populate_dir(cgrp); | 4019 | err = cgroup_populate_dir(cgrp); |
3791 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4020 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
3792 | 4021 | ||
@@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
3826 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4055 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
3827 | } | 4056 | } |
3828 | 4057 | ||
4058 | /* | ||
4059 | * Check the reference count on each subsystem. Since we already | ||
4060 | * established that there are no tasks in the cgroup, if the css refcount | ||
4061 | * is also 1, then there should be no outstanding references, so the | ||
4062 | * subsystem is safe to destroy. We scan across all subsystems rather than | ||
4063 | * using the per-hierarchy linked list of mounted subsystems since we can | ||
4064 | * be called via check_for_release() with no synchronization other than | ||
4065 | * RCU, and the subsystem linked list isn't RCU-safe. | ||
4066 | */ | ||
3829 | static int cgroup_has_css_refs(struct cgroup *cgrp) | 4067 | static int cgroup_has_css_refs(struct cgroup *cgrp) |
3830 | { | 4068 | { |
3831 | /* Check the reference count on each subsystem. Since we | ||
3832 | * already established that there are no tasks in the | ||
3833 | * cgroup, if the css refcount is also 1, then there should | ||
3834 | * be no outstanding references, so the subsystem is safe to | ||
3835 | * destroy. We scan across all subsystems rather than using | ||
3836 | * the per-hierarchy linked list of mounted subsystems since | ||
3837 | * we can be called via check_for_release() with no | ||
3838 | * synchronization other than RCU, and the subsystem linked | ||
3839 | * list isn't RCU-safe */ | ||
3840 | int i; | 4069 | int i; |
4070 | |||
3841 | /* | 4071 | /* |
3842 | * We won't need to lock the subsys array, because the subsystems | 4072 | * We won't need to lock the subsys array, because the subsystems |
3843 | * we're concerned about aren't going anywhere since our cgroup root | 4073 | * we're concerned about aren't going anywhere since our cgroup root |
@@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4076 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3847 | struct cgroup_subsys *ss = subsys[i]; | 4077 | struct cgroup_subsys *ss = subsys[i]; |
3848 | struct cgroup_subsys_state *css; | 4078 | struct cgroup_subsys_state *css; |
4079 | |||
3849 | /* Skip subsystems not present or not in this hierarchy */ | 4080 | /* Skip subsystems not present or not in this hierarchy */ |
3850 | if (ss == NULL || ss->root != cgrp->root) | 4081 | if (ss == NULL || ss->root != cgrp->root) |
3851 | continue; | 4082 | continue; |
4083 | |||
3852 | css = cgrp->subsys[ss->subsys_id]; | 4084 | css = cgrp->subsys[ss->subsys_id]; |
3853 | /* When called from check_for_release() it's possible | 4085 | /* |
4086 | * When called from check_for_release() it's possible | ||
3854 | * that by this point the cgroup has been removed | 4087 | * that by this point the cgroup has been removed |
3855 | * and the css deleted. But a false-positive doesn't | 4088 | * and the css deleted. But a false-positive doesn't |
3856 | * matter, since it can only happen if the cgroup | 4089 | * matter, since it can only happen if the cgroup |
3857 | * has been deleted and hence no longer needs the | 4090 | * has been deleted and hence no longer needs the |
3858 | * release agent to be called anyway. */ | 4091 | * release agent to be called anyway. |
3859 | if (css && (atomic_read(&css->refcnt) > 1)) | 4092 | */ |
4093 | if (css && css_refcnt(css) > 1) | ||
3860 | return 1; | 4094 | return 1; |
3861 | } | 4095 | } |
3862 | return 0; | 4096 | return 0; |
@@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3866 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4100 | * Atomically mark all (or else none) of the cgroup's CSS objects as |
3867 | * CSS_REMOVED. Return true on success, or false if the cgroup has | 4101 | * CSS_REMOVED. Return true on success, or false if the cgroup has |
3868 | * busy subsystems. Call with cgroup_mutex held | 4102 | * busy subsystems. Call with cgroup_mutex held |
4103 | * | ||
4104 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4105 | * not, cgroup removal behaves differently. | ||
4106 | * | ||
4107 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4108 | * cgroup removal can be committed. This is implemented by | ||
4109 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4110 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4111 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4112 | * removed as soon as the existing user (memcg) is updated. | ||
4113 | * | ||
4114 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4115 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4116 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4117 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4118 | * is put so that dentry destruction happens only after all css's are | ||
4119 | * released. | ||
3869 | */ | 4120 | */ |
3870 | |||
3871 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | 4121 | static int cgroup_clear_css_refs(struct cgroup *cgrp) |
3872 | { | 4122 | { |
3873 | struct cgroup_subsys *ss; | 4123 | struct cgroup_subsys *ss; |
3874 | unsigned long flags; | 4124 | unsigned long flags; |
3875 | bool failed = false; | 4125 | bool failed = false; |
4126 | |||
3876 | local_irq_save(flags); | 4127 | local_irq_save(flags); |
4128 | |||
4129 | /* | ||
4130 | * Block new css_tryget() by deactivating refcnt. If all refcnts | ||
4131 | * for subsystems w/ clear_css_refs set were 1 at the moment of | ||
4132 | * deactivation, we succeeded. | ||
4133 | */ | ||
3877 | for_each_subsys(cgrp->root, ss) { | 4134 | for_each_subsys(cgrp->root, ss) { |
3878 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4135 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3879 | int refcnt; | 4136 | |
3880 | while (1) { | 4137 | WARN_ON(atomic_read(&css->refcnt) < 0); |
3881 | /* We can only remove a CSS with a refcnt==1 */ | 4138 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
3882 | refcnt = atomic_read(&css->refcnt); | 4139 | |
3883 | if (refcnt > 1) { | 4140 | if (ss->__DEPRECATED_clear_css_refs) |
3884 | failed = true; | 4141 | failed |= css_refcnt(css) != 1; |
3885 | goto done; | ||
3886 | } | ||
3887 | BUG_ON(!refcnt); | ||
3888 | /* | ||
3889 | * Drop the refcnt to 0 while we check other | ||
3890 | * subsystems. This will cause any racing | ||
3891 | * css_tryget() to spin until we set the | ||
3892 | * CSS_REMOVED bits or abort | ||
3893 | */ | ||
3894 | if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) | ||
3895 | break; | ||
3896 | cpu_relax(); | ||
3897 | } | ||
3898 | } | 4142 | } |
3899 | done: | 4143 | |
4144 | /* | ||
4145 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4146 | * restore refcnts to positive values. Either way, all in-progress | ||
4147 | * css_tryget() will be released. | ||
4148 | */ | ||
3900 | for_each_subsys(cgrp->root, ss) { | 4149 | for_each_subsys(cgrp->root, ss) { |
3901 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4150 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3902 | if (failed) { | 4151 | |
3903 | /* | 4152 | if (!failed) { |
3904 | * Restore old refcnt if we previously managed | ||
3905 | * to clear it from 1 to 0 | ||
3906 | */ | ||
3907 | if (!atomic_read(&css->refcnt)) | ||
3908 | atomic_set(&css->refcnt, 1); | ||
3909 | } else { | ||
3910 | /* Commit the fact that the CSS is removed */ | ||
3911 | set_bit(CSS_REMOVED, &css->flags); | 4153 | set_bit(CSS_REMOVED, &css->flags); |
4154 | css_put(css); | ||
4155 | } else { | ||
4156 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
3912 | } | 4157 | } |
3913 | } | 4158 | } |
4159 | |||
3914 | local_irq_restore(flags); | 4160 | local_irq_restore(flags); |
3915 | return !failed; | 4161 | return !failed; |
3916 | } | 4162 | } |
@@ -3995,6 +4241,8 @@ again: | |||
3995 | list_del_init(&cgrp->sibling); | 4241 | list_del_init(&cgrp->sibling); |
3996 | cgroup_unlock_hierarchy(cgrp->root); | 4242 | cgroup_unlock_hierarchy(cgrp->root); |
3997 | 4243 | ||
4244 | list_del_init(&cgrp->allcg_node); | ||
4245 | |||
3998 | d = dget(cgrp->dentry); | 4246 | d = dget(cgrp->dentry); |
3999 | 4247 | ||
4000 | cgroup_d_remove_dir(d); | 4248 | cgroup_d_remove_dir(d); |
@@ -4021,12 +4269,29 @@ again: | |||
4021 | return 0; | 4269 | return 0; |
4022 | } | 4270 | } |
4023 | 4271 | ||
4272 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | ||
4273 | { | ||
4274 | INIT_LIST_HEAD(&ss->cftsets); | ||
4275 | |||
4276 | /* | ||
4277 | * base_cftset is embedded in subsys itself, no need to worry about | ||
4278 | * deregistration. | ||
4279 | */ | ||
4280 | if (ss->base_cftypes) { | ||
4281 | ss->base_cftset.cfts = ss->base_cftypes; | ||
4282 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | ||
4283 | } | ||
4284 | } | ||
4285 | |||
4024 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4286 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) |
4025 | { | 4287 | { |
4026 | struct cgroup_subsys_state *css; | 4288 | struct cgroup_subsys_state *css; |
4027 | 4289 | ||
4028 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4290 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4029 | 4291 | ||
4292 | /* init base cftset */ | ||
4293 | cgroup_init_cftsets(ss); | ||
4294 | |||
4030 | /* Create the top cgroup state for this subsystem */ | 4295 | /* Create the top cgroup state for this subsystem */ |
4031 | list_add(&ss->sibling, &rootnode.subsys_list); | 4296 | list_add(&ss->sibling, &rootnode.subsys_list); |
4032 | ss->root = &rootnode; | 4297 | ss->root = &rootnode; |
@@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4096 | return 0; | 4361 | return 0; |
4097 | } | 4362 | } |
4098 | 4363 | ||
4364 | /* init base cftset */ | ||
4365 | cgroup_init_cftsets(ss); | ||
4366 | |||
4099 | /* | 4367 | /* |
4100 | * need to register a subsys id before anything else - for example, | 4368 | * need to register a subsys id before anything else - for example, |
4101 | * init_cgroup_css needs it. | 4369 | * init_cgroup_css needs it. |
@@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp) | |||
4685 | } | 4953 | } |
4686 | 4954 | ||
4687 | /* Caller must verify that the css is not for root cgroup */ | 4955 | /* Caller must verify that the css is not for root cgroup */ |
4688 | void __css_put(struct cgroup_subsys_state *css, int count) | 4956 | bool __css_tryget(struct cgroup_subsys_state *css) |
4957 | { | ||
4958 | do { | ||
4959 | int v = css_refcnt(css); | ||
4960 | |||
4961 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | ||
4962 | return true; | ||
4963 | cpu_relax(); | ||
4964 | } while (!test_bit(CSS_REMOVED, &css->flags)); | ||
4965 | |||
4966 | return false; | ||
4967 | } | ||
4968 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
4969 | |||
4970 | /* Caller must verify that the css is not for root cgroup */ | ||
4971 | void __css_put(struct cgroup_subsys_state *css) | ||
4689 | { | 4972 | { |
4690 | struct cgroup *cgrp = css->cgroup; | 4973 | struct cgroup *cgrp = css->cgroup; |
4691 | int val; | 4974 | |
4692 | rcu_read_lock(); | 4975 | rcu_read_lock(); |
4693 | val = atomic_sub_return(count, &css->refcnt); | 4976 | atomic_dec(&css->refcnt); |
4694 | if (val == 1) { | 4977 | switch (css_refcnt(css)) { |
4978 | case 1: | ||
4695 | if (notify_on_release(cgrp)) { | 4979 | if (notify_on_release(cgrp)) { |
4696 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4980 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
4697 | check_for_release(cgrp); | 4981 | check_for_release(cgrp); |
4698 | } | 4982 | } |
4699 | cgroup_wakeup_rmdir_waiter(cgrp); | 4983 | cgroup_wakeup_rmdir_waiter(cgrp); |
4984 | break; | ||
4985 | case 0: | ||
4986 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | ||
4987 | schedule_work(&css->dput_work); | ||
4988 | break; | ||
4700 | } | 4989 | } |
4701 | rcu_read_unlock(); | 4990 | rcu_read_unlock(); |
4702 | WARN_ON_ONCE(val < 1); | ||
4703 | } | 4991 | } |
4704 | EXPORT_SYMBOL_GPL(__css_put); | 4992 | EXPORT_SYMBOL_GPL(__css_put); |
4705 | 4993 | ||
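With the refcount biased on deactivation, __css_tryget() can only succeed while the stored value is still non-negative: the cmpxchg compares against the unbiased count, so once CSS_DEACT_BIAS has been applied it keeps failing until either CSS_REMOVED is set (tryget fails for good) or the removal is rolled back. A hedged sketch of the typical caller-side pattern; demo_find_css() is hypothetical, while css_tryget()/css_put() are the usual inline wrappers that end up in the __css_tryget()/__css_put() paths above for non-root groups:

static struct cgroup_subsys_state *demo_find_css(struct cgroup *cgrp,
						 int subsys_id)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgrp->subsys[subsys_id];
	if (css && !css_tryget(css))	/* removal already in progress */
		css = NULL;
	rcu_read_unlock();

	return css;	/* caller drops the reference with css_put() */
}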
@@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
4818 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5106 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
4819 | * it's unchanged until freed. | 5107 | * it's unchanged until freed. |
4820 | */ | 5108 | */ |
4821 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5109 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4822 | 5110 | ||
4823 | if (cssid) | 5111 | if (cssid) |
4824 | return cssid->id; | 5112 | return cssid->id; |
@@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
4830 | { | 5118 | { |
4831 | struct css_id *cssid; | 5119 | struct css_id *cssid; |
4832 | 5120 | ||
4833 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5121 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4834 | 5122 | ||
4835 | if (cssid) | 5123 | if (cssid) |
4836 | return cssid->depth; | 5124 | return cssid->depth; |
@@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = { | |||
5211 | .name = "releasable", | 5499 | .name = "releasable", |
5212 | .read_u64 = releasable_read, | 5500 | .read_u64 = releasable_read, |
5213 | }, | 5501 | }, |
5214 | }; | ||
5215 | 5502 | ||
5216 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 5503 | { } /* terminate */ |
5217 | { | 5504 | }; |
5218 | return cgroup_add_files(cont, ss, debug_files, | ||
5219 | ARRAY_SIZE(debug_files)); | ||
5220 | } | ||
5221 | 5505 | ||
5222 | struct cgroup_subsys debug_subsys = { | 5506 | struct cgroup_subsys debug_subsys = { |
5223 | .name = "debug", | 5507 | .name = "debug", |
5224 | .create = debug_create, | 5508 | .create = debug_create, |
5225 | .destroy = debug_destroy, | 5509 | .destroy = debug_destroy, |
5226 | .populate = debug_populate, | ||
5227 | .subsys_id = debug_subsys_id, | 5510 | .subsys_id = debug_subsys_id, |
5511 | .base_cftypes = debug_files, | ||
5228 | }; | 5512 | }; |
5229 | #endif /* CONFIG_CGROUP_DEBUG */ | 5513 | #endif /* CONFIG_CGROUP_DEBUG */ |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup, | |||
358 | static struct cftype files[] = { | 358 | static struct cftype files[] = { |
359 | { | 359 | { |
360 | .name = "state", | 360 | .name = "state", |
361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
361 | .read_seq_string = freezer_read, | 362 | .read_seq_string = freezer_read, |
362 | .write_string = freezer_write, | 363 | .write_string = freezer_write, |
363 | }, | 364 | }, |
365 | { } /* terminate */ | ||
364 | }; | 366 | }; |
365 | 367 | ||
366 | static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) | ||
367 | { | ||
368 | if (!cgroup->parent) | ||
369 | return 0; | ||
370 | return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); | ||
371 | } | ||
372 | |||
373 | struct cgroup_subsys freezer_subsys = { | 368 | struct cgroup_subsys freezer_subsys = { |
374 | .name = "freezer", | 369 | .name = "freezer", |
375 | .create = freezer_create, | 370 | .create = freezer_create, |
376 | .destroy = freezer_destroy, | 371 | .destroy = freezer_destroy, |
377 | .populate = freezer_populate, | ||
378 | .subsys_id = freezer_subsys_id, | 372 | .subsys_id = freezer_subsys_id, |
379 | .can_attach = freezer_can_attach, | 373 | .can_attach = freezer_can_attach, |
380 | .fork = freezer_fork, | 374 | .fork = freezer_fork, |
375 | .base_cftypes = files, | ||
381 | }; | 376 | }; |
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..d2c67aa49ae6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) | |||
372 | 372 | ||
373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
374 | 374 | ||
375 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | 375 | /* |
376 | compat_old_sigset_t __user *oset) | 376 | * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the |
377 | * blocked set of signals to the supplied signal set | ||
378 | */ | ||
379 | static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) | ||
377 | { | 380 | { |
378 | old_sigset_t s; | 381 | memcpy(blocked->sig, &set, sizeof(set)); |
379 | long ret; | 382 | } |
380 | mm_segment_t old_fs; | ||
381 | 383 | ||
382 | if (set && get_user(s, set)) | 384 | asmlinkage long compat_sys_sigprocmask(int how, |
383 | return -EFAULT; | 385 | compat_old_sigset_t __user *nset, |
384 | old_fs = get_fs(); | 386 | compat_old_sigset_t __user *oset) |
385 | set_fs(KERNEL_DS); | 387 | { |
386 | ret = sys_sigprocmask(how, | 388 | old_sigset_t old_set, new_set; |
387 | set ? (old_sigset_t __user *) &s : NULL, | 389 | sigset_t new_blocked; |
388 | oset ? (old_sigset_t __user *) &s : NULL); | 390 | |
389 | set_fs(old_fs); | 391 | old_set = current->blocked.sig[0]; |
390 | if (ret == 0) | 392 | |
391 | if (oset) | 393 | if (nset) { |
392 | ret = put_user(s, oset); | 394 | if (get_user(new_set, nset)) |
393 | return ret; | 395 | return -EFAULT; |
396 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
397 | |||
398 | new_blocked = current->blocked; | ||
399 | |||
400 | switch (how) { | ||
401 | case SIG_BLOCK: | ||
402 | sigaddsetmask(&new_blocked, new_set); | ||
403 | break; | ||
404 | case SIG_UNBLOCK: | ||
405 | sigdelsetmask(&new_blocked, new_set); | ||
406 | break; | ||
407 | case SIG_SETMASK: | ||
408 | compat_sig_setmask(&new_blocked, new_set); | ||
409 | break; | ||
410 | default: | ||
411 | return -EINVAL; | ||
412 | } | ||
413 | |||
414 | set_current_blocked(&new_blocked); | ||
415 | } | ||
416 | |||
417 | if (oset) { | ||
418 | if (put_user(old_set, oset)) | ||
419 | return -EFAULT; | ||
420 | } | ||
421 | |||
422 | return 0; | ||
394 | } | 423 | } |
395 | 424 | ||
396 | #endif | 425 | #endif |
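The rewrite drops the old set_fs(KERNEL_DS) round-trip through sys_sigprocmask() and manipulates the blocked set directly, with set_current_blocked() doing the actual update. The three cases reduce to simple bit operations on the first (compat-sized) word of the blocked set, as this small userspace sketch illustrates (SIGKILL/SIGSTOP filtering and the multi-word sigset handling are omitted; the MASK() macro is just shorthand for the old_sigset_t bit layout):

#include <signal.h>
#include <stdio.h>

#define MASK(sig)	(1UL << ((sig) - 1))	/* old_sigset_t bit layout */

int main(void)
{
	unsigned long blocked = MASK(SIGINT);	/* currently blocked signals  */
	unsigned long new_set = MASK(SIGTERM);	/* mask supplied by the caller */

	printf("SIG_BLOCK:   %#lx\n", blocked | new_set);	/* add bits      */
	printf("SIG_UNBLOCK: %#lx\n", blocked & ~new_set);	/* clear bits    */
	printf("SIG_SETMASK: %#lx\n", new_set);			/* replace word  */
	return 0;
}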
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..0e6353cf147a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/gfp.h> | 17 | #include <linux/gfp.h> |
18 | #include <linux/suspend.h> | 18 | #include <linux/suspend.h> |
19 | 19 | ||
20 | #include "smpboot.h" | ||
21 | |||
20 | #ifdef CONFIG_SMP | 22 | #ifdef CONFIG_SMP |
21 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 23 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
22 | static DEFINE_MUTEX(cpu_add_remove_lock); | 24 | static DEFINE_MUTEX(cpu_add_remove_lock); |
@@ -295,11 +297,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
295 | int ret, nr_calls = 0; | 297 | int ret, nr_calls = 0; |
296 | void *hcpu = (void *)(long)cpu; | 298 | void *hcpu = (void *)(long)cpu; |
297 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 299 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
300 | struct task_struct *idle; | ||
298 | 301 | ||
299 | if (cpu_online(cpu) || !cpu_present(cpu)) | 302 | if (cpu_online(cpu) || !cpu_present(cpu)) |
300 | return -EINVAL; | 303 | return -EINVAL; |
301 | 304 | ||
302 | cpu_hotplug_begin(); | 305 | cpu_hotplug_begin(); |
306 | |||
307 | idle = idle_thread_get(cpu); | ||
308 | if (IS_ERR(idle)) { | ||
309 | ret = PTR_ERR(idle); | ||
310 | goto out; | ||
311 | } | ||
312 | |||
303 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 313 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
304 | if (ret) { | 314 | if (ret) { |
305 | nr_calls--; | 315 | nr_calls--; |
@@ -309,7 +319,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
309 | } | 319 | } |
310 | 320 | ||
311 | /* Arch-specific enabling code. */ | 321 | /* Arch-specific enabling code. */ |
312 | ret = __cpu_up(cpu); | 322 | ret = __cpu_up(cpu, idle); |
313 | if (ret != 0) | 323 | if (ret != 0) |
314 | goto out_notify; | 324 | goto out_notify; |
315 | BUG_ON(!cpu_online(cpu)); | 325 | BUG_ON(!cpu_online(cpu)); |
@@ -320,6 +330,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
320 | out_notify: | 330 | out_notify: |
321 | if (ret != 0) | 331 | if (ret != 0) |
322 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 332 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
333 | out: | ||
323 | cpu_hotplug_done(); | 334 | cpu_hotplug_done(); |
324 | 335 | ||
325 | return ret; | 336 | return ret; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070b4ba2..8c8bd652dd12 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = { | |||
1765 | .write_u64 = cpuset_write_u64, | 1765 | .write_u64 = cpuset_write_u64, |
1766 | .private = FILE_SPREAD_SLAB, | 1766 | .private = FILE_SPREAD_SLAB, |
1767 | }, | 1767 | }, |
1768 | }; | ||
1769 | |||
1770 | static struct cftype cft_memory_pressure_enabled = { | ||
1771 | .name = "memory_pressure_enabled", | ||
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }; | ||
1776 | 1768 | ||
1777 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 1769 | { |
1778 | { | 1770 | .name = "memory_pressure_enabled", |
1779 | int err; | 1771 | .flags = CFTYPE_ONLY_ON_ROOT, |
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }, | ||
1780 | 1776 | ||
1781 | err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 1777 | { } /* terminate */ |
1782 | if (err) | 1778 | }; |
1783 | return err; | ||
1784 | /* memory_pressure_enabled is in root cpuset only */ | ||
1785 | if (!cont->parent) | ||
1786 | err = cgroup_add_file(cont, ss, | ||
1787 | &cft_memory_pressure_enabled); | ||
1788 | return err; | ||
1789 | } | ||
1790 | 1779 | ||
1791 | /* | 1780 | /* |
1792 | * post_clone() is called during cgroup_create() when the | 1781 | * post_clone() is called during cgroup_create() when the |
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
1887 | .destroy = cpuset_destroy, | 1876 | .destroy = cpuset_destroy, |
1888 | .can_attach = cpuset_can_attach, | 1877 | .can_attach = cpuset_can_attach, |
1889 | .attach = cpuset_attach, | 1878 | .attach = cpuset_attach, |
1890 | .populate = cpuset_populate, | ||
1891 | .post_clone = cpuset_post_clone, | 1879 | .post_clone = cpuset_post_clone, |
1892 | .subsys_id = cpuset_subsys_id, | 1880 | .subsys_id = cpuset_subsys_id, |
1881 | .base_cftypes = files, | ||
1893 | .early_init = 1, | 1882 | .early_init = 1, |
1894 | }; | 1883 | }; |
1895 | 1884 | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fd126f82b57c..91a445925855 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2039,8 +2039,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2039 | * accessing the event control register. If a NMI hits, then it will | 2039 | * accessing the event control register. If a NMI hits, then it will |
2040 | * not restart the event. | 2040 | * not restart the event. |
2041 | */ | 2041 | */ |
2042 | void __perf_event_task_sched_out(struct task_struct *task, | 2042 | static void __perf_event_task_sched_out(struct task_struct *task, |
2043 | struct task_struct *next) | 2043 | struct task_struct *next) |
2044 | { | 2044 | { |
2045 | int ctxn; | 2045 | int ctxn; |
2046 | 2046 | ||
@@ -2279,8 +2279,8 @@ static void perf_branch_stack_sched_in(struct task_struct *prev, | |||
2279 | * accessing the event control register. If a NMI hits, then it will | 2279 | * accessing the event control register. If a NMI hits, then it will |
2280 | * keep the event running. | 2280 | * keep the event running. |
2281 | */ | 2281 | */ |
2282 | void __perf_event_task_sched_in(struct task_struct *prev, | 2282 | static void __perf_event_task_sched_in(struct task_struct *prev, |
2283 | struct task_struct *task) | 2283 | struct task_struct *task) |
2284 | { | 2284 | { |
2285 | struct perf_event_context *ctx; | 2285 | struct perf_event_context *ctx; |
2286 | int ctxn; | 2286 | int ctxn; |
@@ -2305,6 +2305,12 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
2305 | perf_branch_stack_sched_in(prev, task); | 2305 | perf_branch_stack_sched_in(prev, task); |
2306 | } | 2306 | } |
2307 | 2307 | ||
2308 | void __perf_event_task_sched(struct task_struct *prev, struct task_struct *next) | ||
2309 | { | ||
2310 | __perf_event_task_sched_out(prev, next); | ||
2311 | __perf_event_task_sched_in(prev, next); | ||
2312 | } | ||
2313 | |||
2308 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2314 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
2309 | { | 2315 | { |
2310 | u64 frequency = event->attr.sample_freq; | 2316 | u64 frequency = event->attr.sample_freq; |
@@ -4957,7 +4963,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | |||
4957 | if (rctx < 0) | 4963 | if (rctx < 0) |
4958 | return; | 4964 | return; |
4959 | 4965 | ||
4960 | perf_sample_data_init(&data, addr); | 4966 | perf_sample_data_init(&data, addr, 0); |
4961 | 4967 | ||
4962 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 4968 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
4963 | 4969 | ||
@@ -5215,7 +5221,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5215 | .data = record, | 5221 | .data = record, |
5216 | }; | 5222 | }; |
5217 | 5223 | ||
5218 | perf_sample_data_init(&data, addr); | 5224 | perf_sample_data_init(&data, addr, 0); |
5219 | data.raw = &raw; | 5225 | data.raw = &raw; |
5220 | 5226 | ||
5221 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5227 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
@@ -5318,7 +5324,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
5318 | struct perf_sample_data sample; | 5324 | struct perf_sample_data sample; |
5319 | struct pt_regs *regs = data; | 5325 | struct pt_regs *regs = data; |
5320 | 5326 | ||
5321 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5327 | perf_sample_data_init(&sample, bp->attr.bp_addr, 0); |
5322 | 5328 | ||
5323 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | 5329 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) |
5324 | perf_swevent_event(bp, 1, &sample, regs); | 5330 | perf_swevent_event(bp, 1, &sample, regs); |
@@ -5344,13 +5350,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5344 | 5350 | ||
5345 | event->pmu->read(event); | 5351 | event->pmu->read(event); |
5346 | 5352 | ||
5347 | perf_sample_data_init(&data, 0); | 5353 | perf_sample_data_init(&data, 0, event->hw.last_period); |
5348 | data.period = event->hw.last_period; | ||
5349 | regs = get_irq_regs(); | 5354 | regs = get_irq_regs(); |
5350 | 5355 | ||
5351 | if (regs && !perf_exclude_event(event, regs)) { | 5356 | if (regs && !perf_exclude_event(event, regs)) { |
5352 | if (!(event->attr.exclude_idle && is_idle_task(current))) | 5357 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5353 | if (perf_event_overflow(event, &data, regs)) | 5358 | if (__perf_event_overflow(event, 1, &data, regs)) |
5354 | ret = HRTIMER_NORESTART; | 5359 | ret = HRTIMER_NORESTART; |
5355 | } | 5360 | } |
5356 | 5361 | ||
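All of the perf_sample_data_init() call sites now pass the sampling period as a third argument instead of assigning data.period by hand afterwards. The helper itself lives in include/linux/perf_event.h and is not part of this kernel/ diff; reconstructed from how the callers use it, it presumably looks roughly like this (other fields omitted):

static inline void perf_sample_data_init(struct perf_sample_data *data,
					 u64 addr, u64 period)
{
	/* the remaining fields are filled in by perf_prepare_sample() */
	data->addr   = addr;
	data->raw    = NULL;
	data->period = period;
}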
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..ad54c833116a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/cgroup.h> | 34 | #include <linux/cgroup.h> |
35 | #include <linux/security.h> | 35 | #include <linux/security.h> |
36 | #include <linux/hugetlb.h> | 36 | #include <linux/hugetlb.h> |
37 | #include <linux/seccomp.h> | ||
37 | #include <linux/swap.h> | 38 | #include <linux/swap.h> |
38 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
39 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include <linux/audit.h> | 48 | #include <linux/audit.h> |
48 | #include <linux/memcontrol.h> | 49 | #include <linux/memcontrol.h> |
49 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/proc_fs.h> | ||
50 | #include <linux/profile.h> | 52 | #include <linux/profile.h> |
51 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
52 | #include <linux/ksm.h> | 54 | #include <linux/ksm.h> |
@@ -111,32 +113,67 @@ int nr_processes(void) | |||
111 | return total; | 113 | return total; |
112 | } | 114 | } |
113 | 115 | ||
114 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 116 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
115 | # define alloc_task_struct_node(node) \ | ||
116 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) | ||
117 | # define free_task_struct(tsk) \ | ||
118 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
119 | static struct kmem_cache *task_struct_cachep; | 117 | static struct kmem_cache *task_struct_cachep; |
118 | |||
119 | static inline struct task_struct *alloc_task_struct_node(int node) | ||
120 | { | ||
121 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); | ||
122 | } | ||
123 | |||
124 | void __weak arch_release_task_struct(struct task_struct *tsk) { } | ||
125 | |||
126 | static inline void free_task_struct(struct task_struct *tsk) | ||
127 | { | ||
128 | arch_release_task_struct(tsk); | ||
129 | kmem_cache_free(task_struct_cachep, tsk); | ||
130 | } | ||
120 | #endif | 131 | #endif |
121 | 132 | ||
122 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 133 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR |
134 | void __weak arch_release_thread_info(struct thread_info *ti) { } | ||
135 | |||
136 | /* | ||
137 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | ||
138 | * kmemcache based allocator. | ||
139 | */ | ||
140 | # if THREAD_SIZE >= PAGE_SIZE | ||
123 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 141 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
124 | int node) | 142 | int node) |
125 | { | 143 | { |
126 | #ifdef CONFIG_DEBUG_STACK_USAGE | 144 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
127 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 145 | THREAD_SIZE_ORDER); |
128 | #else | ||
129 | gfp_t mask = GFP_KERNEL; | ||
130 | #endif | ||
131 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); | ||
132 | 146 | ||
133 | return page ? page_address(page) : NULL; | 147 | return page ? page_address(page) : NULL; |
134 | } | 148 | } |
135 | 149 | ||
136 | static inline void free_thread_info(struct thread_info *ti) | 150 | static inline void free_thread_info(struct thread_info *ti) |
137 | { | 151 | { |
152 | arch_release_thread_info(ti); | ||
138 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 153 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
139 | } | 154 | } |
155 | # else | ||
156 | static struct kmem_cache *thread_info_cache; | ||
157 | |||
158 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | ||
159 | int node) | ||
160 | { | ||
161 | return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); | ||
162 | } | ||
163 | |||
164 | static void free_thread_info(struct thread_info *ti) | ||
165 | { | ||
166 | arch_release_thread_info(ti); | ||
167 | kmem_cache_free(thread_info_cache, ti); | ||
168 | } | ||
169 | |||
170 | void thread_info_cache_init(void) | ||
171 | { | ||
172 | thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, | ||
173 | THREAD_SIZE, 0, NULL); | ||
174 | BUG_ON(thread_info_cache == NULL); | ||
175 | } | ||
176 | # endif | ||
140 | #endif | 177 | #endif |
141 | 178 | ||
142 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 179 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
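The rewritten allocators above switch from the __HAVE_ARCH_* defines to CONFIG_ARCH_*_ALLOCATOR symbols and add __weak arch_release_task_struct()/arch_release_thread_info() hooks, so an architecture can run teardown code without replacing the whole allocator. A purely illustrative override, not taken from any architecture in this series:

/* arch/<arch>/kernel/process.c - illustrative override of the weak stub */
#include <linux/printk.h>
#include <linux/sched.h>

void arch_release_thread_info(struct thread_info *ti)
{
	/* tear down arch-private state tied to this kernel stack */
	pr_debug("releasing thread_info at %p\n", ti);
}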
@@ -170,6 +207,7 @@ void free_task(struct task_struct *tsk) | |||
170 | free_thread_info(tsk->stack); | 207 | free_thread_info(tsk->stack); |
171 | rt_mutex_debug_task_free(tsk); | 208 | rt_mutex_debug_task_free(tsk); |
172 | ftrace_graph_exit_task(tsk); | 209 | ftrace_graph_exit_task(tsk); |
210 | put_seccomp_filter(tsk); | ||
173 | free_task_struct(tsk); | 211 | free_task_struct(tsk); |
174 | } | 212 | } |
175 | EXPORT_SYMBOL(free_task); | 213 | EXPORT_SYMBOL(free_task); |
@@ -203,17 +241,11 @@ void __put_task_struct(struct task_struct *tsk) | |||
203 | } | 241 | } |
204 | EXPORT_SYMBOL_GPL(__put_task_struct); | 242 | EXPORT_SYMBOL_GPL(__put_task_struct); |
205 | 243 | ||
206 | /* | 244 | void __init __weak arch_task_cache_init(void) { } |
207 | * macro override instead of weak attribute alias, to workaround | ||
208 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
209 | */ | ||
210 | #ifndef arch_task_cache_init | ||
211 | #define arch_task_cache_init() | ||
212 | #endif | ||
213 | 245 | ||
214 | void __init fork_init(unsigned long mempages) | 246 | void __init fork_init(unsigned long mempages) |
215 | { | 247 | { |
216 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 248 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
217 | #ifndef ARCH_MIN_TASKALIGN | 249 | #ifndef ARCH_MIN_TASKALIGN |
218 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 250 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
219 | #endif | 251 | #endif |
@@ -1162,6 +1194,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1162 | goto fork_out; | 1194 | goto fork_out; |
1163 | 1195 | ||
1164 | ftrace_graph_init_task(p); | 1196 | ftrace_graph_init_task(p); |
1197 | get_seccomp_filter(p); | ||
1165 | 1198 | ||
1166 | rt_mutex_init_task(p); | 1199 | rt_mutex_init_task(p); |
1167 | 1200 | ||
@@ -1464,6 +1497,8 @@ bad_fork_cleanup_io: | |||
1464 | if (p->io_context) | 1497 | if (p->io_context) |
1465 | exit_io_context(p); | 1498 | exit_io_context(p); |
1466 | bad_fork_cleanup_namespaces: | 1499 | bad_fork_cleanup_namespaces: |
1500 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1501 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1467 | exit_task_namespaces(p); | 1502 | exit_task_namespaces(p); |
1468 | bad_fork_cleanup_mm: | 1503 | bad_fork_cleanup_mm: |
1469 | if (p->mm) | 1504 | if (p->mm) |
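With the two fork.c hunks above, copy_process() takes a reference on the parent's seccomp filter and free_task() drops it. The helpers themselves are added in kernel/seccomp.c, which is not part of this section; the sketch below shows the refcount pairing they are assumed to implement, with a filter chain carrying an atomic usage count shared across tasks:

/* sketch only - the real helpers live in kernel/seccomp.c */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;

	if (!orig)
		return;
	/* the child created in copy_process() shares the parent's chain */
	atomic_inc(&orig->usage);
}

void put_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;

	/* free filters once the last task referencing them has exited */
	while (orig && atomic_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;

		orig = orig->prev;
		kfree(freeme);
	}
}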
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2a..6df614912b9d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
108 | 108 | ||
109 | touch_nmi_watchdog(); | 109 | touch_nmi_watchdog(); |
110 | 110 | ||
111 | if (sysctl_hung_task_panic) | 111 | if (sysctl_hung_task_panic) { |
112 | trigger_all_cpu_backtrace(); | ||
112 | panic("hung_task: blocked tasks"); | 113 | panic("hung_task: blocked tasks"); |
114 | } | ||
113 | } | 115 | } |
114 | 116 | ||
115 | /* | 117 | /* |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6080f6bc8c33..fc275e4f629b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
379 | * If it's disabled or no action available | 379 | * If it's disabled or no action available |
380 | * keep it masked and get out of here | 380 | * keep it masked and get out of here |
381 | */ | 381 | */ |
382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
383 | desc->istate |= IRQS_PENDING; | ||
383 | goto out_unlock; | 384 | goto out_unlock; |
385 | } | ||
384 | 386 | ||
385 | handle_irq_event(desc); | 387 | handle_irq_event(desc); |
386 | 388 | ||
@@ -518,6 +520,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
518 | out_unlock: | 520 | out_unlock: |
519 | raw_spin_unlock(&desc->lock); | 521 | raw_spin_unlock(&desc->lock); |
520 | } | 522 | } |
523 | EXPORT_SYMBOL(handle_edge_irq); | ||
521 | 524 | ||
522 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | 525 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER |
523 | /** | 526 | /** |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d86e254b95eb..192a302d6cfd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
112 | { | 112 | { |
113 | return radix_tree_lookup(&irq_desc_tree, irq); | 113 | return radix_tree_lookup(&irq_desc_tree, irq); |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(irq_to_desc); | ||
115 | 116 | ||
116 | static void delete_irq_desc(unsigned int irq) | 117 | static void delete_irq_desc(unsigned int irq) |
117 | { | 118 | { |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569b..585f6381f8e4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
565 | * IRQF_TRIGGER_* but the PIC does not support multiple | 565 | * IRQF_TRIGGER_* but the PIC does not support multiple |
566 | * flow-types? | 566 | * flow-types? |
567 | */ | 567 | */ |
568 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, | 568 | pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, |
569 | chip ? (chip->name ? : "unknown") : "unknown"); | 569 | chip ? (chip->name ? : "unknown") : "unknown"); |
570 | return 0; | 570 | return 0; |
571 | } | 571 | } |
572 | 572 | ||
@@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
600 | ret = 0; | 600 | ret = 0; |
601 | break; | 601 | break; |
602 | default: | 602 | default: |
603 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 603 | pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", |
604 | flags, irq, chip->irq_set_type); | 604 | flags, irq, chip->irq_set_type); |
605 | } | 605 | } |
606 | if (unmask) | 606 | if (unmask) |
@@ -837,8 +837,7 @@ void exit_irq_thread(void) | |||
837 | 837 | ||
838 | action = kthread_data(tsk); | 838 | action = kthread_data(tsk); |
839 | 839 | ||
840 | printk(KERN_ERR | 840 | pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
841 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
842 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | 841 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); |
843 | 842 | ||
844 | desc = irq_to_desc(action->irq); | 843 | desc = irq_to_desc(action->irq); |
@@ -878,7 +877,6 @@ static int | |||
878 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 877 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
879 | { | 878 | { |
880 | struct irqaction *old, **old_ptr; | 879 | struct irqaction *old, **old_ptr; |
881 | const char *old_name = NULL; | ||
882 | unsigned long flags, thread_mask = 0; | 880 | unsigned long flags, thread_mask = 0; |
883 | int ret, nested, shared = 0; | 881 | int ret, nested, shared = 0; |
884 | cpumask_var_t mask; | 882 | cpumask_var_t mask; |
@@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
972 | */ | 970 | */ |
973 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 971 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
974 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || | 972 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
975 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | 973 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) |
976 | old_name = old->name; | ||
977 | goto mismatch; | 974 | goto mismatch; |
978 | } | ||
979 | 975 | ||
980 | /* All handlers must agree on per-cpuness */ | 976 | /* All handlers must agree on per-cpuness */ |
981 | if ((old->flags & IRQF_PERCPU) != | 977 | if ((old->flags & IRQF_PERCPU) != |
@@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1031 | * all existing action->thread_mask bits. | 1027 | * all existing action->thread_mask bits. |
1032 | */ | 1028 | */ |
1033 | new->thread_mask = 1 << ffz(thread_mask); | 1029 | new->thread_mask = 1 << ffz(thread_mask); |
1030 | |||
1031 | } else if (new->handler == irq_default_primary_handler) { | ||
1032 | /* | ||
1033 | * The interrupt was requested with handler = NULL, so | ||
1034 | * we use the default primary handler for it. But it | ||
1035 | * does not have the oneshot flag set. In combination | ||
1036 | * with level interrupts this is deadly, because the | ||
1037 | * default primary handler just wakes the thread, then | ||
1038 | * the irq line is re-enabled, but the device still | ||
1039 | * has the level irq asserted. Rinse and repeat.... | ||
1040 | * | ||
1041 | * While this works for edge type interrupts, we play | ||
1042 | * it safe and reject unconditionally because we can't | ||
1043 | * say for sure which type this interrupt really | ||
1044 | * has. The type flags are unreliable as the | ||
1045 | * underlying chip implementation can override them. | ||
1046 | */ | ||
1047 | pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", | ||
1048 | irq); | ||
1049 | ret = -EINVAL; | ||
1050 | goto out_mask; | ||
1034 | } | 1051 | } |
1035 | 1052 | ||
1036 | if (!shared) { | 1053 | if (!shared) { |
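The new check above means a driver that requests a threaded handler with handler == NULL must also pass IRQF_ONESHOT, otherwise __setup_irq() now fails with -EINVAL instead of risking the level-triggered interrupt storm described in the comment. Illustrative caller; the names my_thread_fn/my_dev and the trigger flag are placeholders, not from this patch:

#include <linux/interrupt.h>

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* sleepable, slow-path handling runs here */
	return IRQ_HANDLED;
}

static int my_request_irq(int irq, void *my_dev)
{
	/* NULL primary handler => IRQF_ONESHOT is now mandatory */
	return request_threaded_irq(irq, NULL, my_thread_fn,
				    IRQF_ONESHOT | IRQF_TRIGGER_LOW,
				    "my_dev", my_dev);
}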
@@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1078 | 1095 | ||
1079 | if (nmsk != omsk) | 1096 | if (nmsk != omsk) |
1080 | /* hope the handler works with current trigger mode */ | 1097 | /* hope the handler works with current trigger mode */ |
1081 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", | 1098 | pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", |
1082 | irq, nmsk, omsk); | 1099 | irq, nmsk, omsk); |
1083 | } | 1100 | } |
1084 | 1101 | ||
@@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1115 | return 0; | 1132 | return 0; |
1116 | 1133 | ||
1117 | mismatch: | 1134 | mismatch: |
1118 | #ifdef CONFIG_DEBUG_SHIRQ | ||
1119 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 1135 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
1120 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 1136 | pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", |
1121 | if (old_name) | 1137 | irq, new->flags, new->name, old->flags, old->name); |
1122 | printk(KERN_ERR "current handler: %s\n", old_name); | 1138 | #ifdef CONFIG_DEBUG_SHIRQ |
1123 | dump_stack(); | 1139 | dump_stack(); |
1124 | } | ||
1125 | #endif | 1140 | #endif |
1141 | } | ||
1126 | ret = -EBUSY; | 1142 | ret = -EBUSY; |
1127 | 1143 | ||
1128 | out_mask: | 1144 | out_mask: |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a6..cb228bf21760 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void) | |||
103 | int irq; | 103 | int irq; |
104 | 104 | ||
105 | for_each_irq_desc(irq, desc) { | 105 | for_each_irq_desc(irq, desc) { |
106 | /* | ||
107 | * Only interrupts which are marked as wakeup source | ||
108 | * and have not been disabled before the suspend check | ||
109 | * can abort suspend. | ||
110 | */ | ||
106 | if (irqd_is_wakeup_set(&desc->irq_data)) { | 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
107 | if (desc->istate & IRQS_PENDING) | 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) |
108 | return -EBUSY; | 113 | return -EBUSY; |
109 | continue; | 114 | continue; |
110 | } | 115 | } |
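Together with the handle_level_irq() change earlier in this section, which now records IRQS_PENDING when a masked line fires, the depth == 1 check above lets a wakeup interrupt that triggered after the suspend path disabled it abort the suspend. Nothing changes on the driver side; a wakeup source is still declared the usual way (illustrative snippet, not part of this patch):

#include <linux/interrupt.h>

static int my_prepare_for_suspend(int irq)
{
	/*
	 * Mark the line as a wakeup source; if it fires after the suspend
	 * path disabled it, IRQS_PENDING is set and check_wakeup_irqs()
	 * above returns -EBUSY, aborting the suspend.
	 */
	return enable_irq_wake(irq);
}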
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c9..6454db7b6a4d 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
58 | /* | 58 | /* |
59 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
60 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
61 | * active. | 61 | * active. Clear the pending bit so suspend/resume does not |
62 | * get confused. | ||
62 | */ | 63 | */ |
63 | if (irq_settings_is_level(desc)) | 64 | if (irq_settings_is_level(desc)) { |
65 | desc->istate &= ~IRQS_PENDING; | ||
64 | return; | 66 | return; |
67 | } | ||
65 | if (desc->istate & IRQS_REPLAY) | 68 | if (desc->istate & IRQS_REPLAY) |
66 | return; | 69 | return; |
67 | if (desc->istate & IRQS_PENDING) { | 70 | if (desc->istate & IRQS_PENDING) { |
diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e425..a4e60973ca73 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2953,7 +2953,7 @@ static struct module *load_module(void __user *umod, | |||
2953 | 2953 | ||
2954 | /* Module is ready to execute: parsing args may do that. */ | 2954 | /* Module is ready to execute: parsing args may do that. */ |
2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
2956 | -32768, 32767, NULL); | 2956 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
2957 | if (err < 0) | 2957 | if (err < 0) |
2958 | goto unlink; | 2958 | goto unlink; |
2959 | 2959 | ||
diff --git a/kernel/params.c b/kernel/params.c index f37d82631347..ed35345be536 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) | |||
85 | 85 | ||
86 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
87 | char *val, | 87 | char *val, |
88 | const char *doing, | ||
88 | const struct kernel_param *params, | 89 | const struct kernel_param *params, |
89 | unsigned num_params, | 90 | unsigned num_params, |
90 | s16 min_level, | 91 | s16 min_level, |
91 | s16 max_level, | 92 | s16 max_level, |
92 | int (*handle_unknown)(char *param, char *val)) | 93 | int (*handle_unknown)(char *param, char *val, |
94 | const char *doing)) | ||
93 | { | 95 | { |
94 | unsigned int i; | 96 | unsigned int i; |
95 | int err; | 97 | int err; |
@@ -104,8 +106,8 @@ static int parse_one(char *param, | |||
104 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && params[i].ops->set != param_set_bool |
105 | && params[i].ops->set != param_set_bint) | 107 | && params[i].ops->set != param_set_bint) |
106 | return -EINVAL; | 108 | return -EINVAL; |
107 | pr_debug("They are equal! Calling %p\n", | 109 | pr_debug("handling %s with %p\n", param, |
108 | params[i].ops->set); | 110 | params[i].ops->set); |
109 | mutex_lock(&param_lock); | 111 | mutex_lock(&param_lock); |
110 | err = params[i].ops->set(val, &params[i]); | 112 | err = params[i].ops->set(val, &params[i]); |
111 | mutex_unlock(&param_lock); | 113 | mutex_unlock(&param_lock); |
@@ -114,11 +116,11 @@ static int parse_one(char *param, | |||
114 | } | 116 | } |
115 | 117 | ||
116 | if (handle_unknown) { | 118 | if (handle_unknown) { |
117 | pr_debug("Unknown argument: calling %p\n", handle_unknown); | 119 | pr_debug("doing %s: %s='%s'\n", doing, param, val); |
118 | return handle_unknown(param, val); | 120 | return handle_unknown(param, val, doing); |
119 | } | 121 | } |
120 | 122 | ||
121 | pr_debug("Unknown argument `%s'\n", param); | 123 | pr_debug("Unknown argument '%s'\n", param); |
122 | return -ENOENT; | 124 | return -ENOENT; |
123 | } | 125 | } |
124 | 126 | ||
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) | |||
175 | } | 177 | } |
176 | 178 | ||
177 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
178 | int parse_args(const char *name, | 180 | int parse_args(const char *doing, |
179 | char *args, | 181 | char *args, |
180 | const struct kernel_param *params, | 182 | const struct kernel_param *params, |
181 | unsigned num, | 183 | unsigned num, |
182 | s16 min_level, | 184 | s16 min_level, |
183 | s16 max_level, | 185 | s16 max_level, |
184 | int (*unknown)(char *param, char *val)) | 186 | int (*unknown)(char *param, char *val, const char *doing)) |
185 | { | 187 | { |
186 | char *param, *val; | 188 | char *param, *val; |
187 | 189 | ||
188 | pr_debug("Parsing ARGS: %s\n", args); | ||
189 | |||
190 | /* Chew leading spaces */ | 190 | /* Chew leading spaces */ |
191 | args = skip_spaces(args); | 191 | args = skip_spaces(args); |
192 | 192 | ||
193 | if (*args) | ||
194 | pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); | ||
195 | |||
193 | while (*args) { | 196 | while (*args) { |
194 | int ret; | 197 | int ret; |
195 | int irq_was_disabled; | 198 | int irq_was_disabled; |
196 | 199 | ||
197 | args = next_arg(args, &param, &val); | 200 | args = next_arg(args, &param, &val); |
198 | irq_was_disabled = irqs_disabled(); | 201 | irq_was_disabled = irqs_disabled(); |
199 | ret = parse_one(param, val, params, num, | 202 | ret = parse_one(param, val, doing, params, num, |
200 | min_level, max_level, unknown); | 203 | min_level, max_level, unknown); |
201 | if (irq_was_disabled && !irqs_disabled()) { | 204 | if (irq_was_disabled && !irqs_disabled()) |
202 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | 205 | pr_warn("%s: option '%s' enabled irq's!\n", |
203 | "irq's!\n", param); | 206 | doing, param); |
204 | } | 207 | |
205 | switch (ret) { | 208 | switch (ret) { |
206 | case -ENOENT: | 209 | case -ENOENT: |
207 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | 210 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
208 | name, param); | ||
209 | return ret; | 211 | return ret; |
210 | case -ENOSPC: | 212 | case -ENOSPC: |
211 | printk(KERN_ERR | 213 | pr_err("%s: `%s' too large for parameter `%s'\n", |
212 | "%s: `%s' too large for parameter `%s'\n", | 214 | doing, val ?: "", param); |
213 | name, val ?: "", param); | ||
214 | return ret; | 215 | return ret; |
215 | case 0: | 216 | case 0: |
216 | break; | 217 | break; |
217 | default: | 218 | default: |
218 | printk(KERN_ERR | 219 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
219 | "%s: `%s' invalid for parameter `%s'\n", | 220 | doing, val ?: "", param); |
220 | name, val ?: "", param); | ||
221 | return ret; | 221 | return ret; |
222 | } | 222 | } |
223 | } | 223 | } |
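Every handle_unknown callback wired into parse_args() now receives the extra doing string (the module or context name), which is also what the kernel/module.c hunk above passes through for dynamic debug. A generic sketch of a conforming callback, with an invented name:

#include <linux/printk.h>

/* illustrative only: unknown-parameter handler with the new signature */
static int example_unknown_param(char *param, char *val, const char *doing)
{
	pr_debug("%s: ignoring unknown option '%s'='%s'\n",
		 doing, param, val ?: "");
	return 0;	/* 0 lets parse_args() continue with the next option */
}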
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | |||
263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 263 | int param_set_charp(const char *val, const struct kernel_param *kp) |
264 | { | 264 | { |
265 | if (strlen(val) > 1024) { | 265 | if (strlen(val) > 1024) { |
266 | printk(KERN_ERR "%s: string parameter too long\n", | 266 | pr_err("%s: string parameter too long\n", kp->name); |
267 | kp->name); | ||
268 | return -ENOSPC; | 267 | return -ENOSPC; |
269 | } | 268 | } |
270 | 269 | ||
@@ -400,8 +399,7 @@ static int param_array(const char *name, | |||
400 | int len; | 399 | int len; |
401 | 400 | ||
402 | if (*num == max) { | 401 | if (*num == max) { |
403 | printk(KERN_ERR "%s: can only take %i arguments\n", | 402 | pr_err("%s: can only take %i arguments\n", name, max); |
404 | name, max); | ||
405 | return -EINVAL; | 403 | return -EINVAL; |
406 | } | 404 | } |
407 | len = strcspn(val, ","); | 405 | len = strcspn(val, ","); |
@@ -420,8 +418,7 @@ static int param_array(const char *name, | |||
420 | } while (save == ','); | 418 | } while (save == ','); |
421 | 419 | ||
422 | if (*num < min) { | 420 | if (*num < min) { |
423 | printk(KERN_ERR "%s: needs at least %i arguments\n", | 421 | pr_err("%s: needs at least %i arguments\n", name, min); |
424 | name, min); | ||
425 | return -EINVAL; | 422 | return -EINVAL; |
426 | } | 423 | } |
427 | return 0; | 424 | return 0; |
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) | |||
480 | const struct kparam_string *kps = kp->str; | 477 | const struct kparam_string *kps = kp->str; |
481 | 478 | ||
482 | if (strlen(val)+1 > kps->maxlen) { | 479 | if (strlen(val)+1 > kps->maxlen) { |
483 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 480 | pr_err("%s: string doesn't fit in %u chars.\n", |
484 | kp->name, kps->maxlen-1); | 481 | kp->name, kps->maxlen-1); |
485 | return -ENOSPC; | 482 | return -ENOSPC; |
486 | } | 483 | } |
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
750 | #endif | 747 | #endif |
751 | if (err) { | 748 | if (err) { |
752 | kobject_put(&mk->kobj); | 749 | kobject_put(&mk->kobj); |
753 | printk(KERN_ERR | 750 | pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", |
754 | "Module '%s' failed add to sysfs, error number %d\n", | ||
755 | name, err); | 751 | name, err); |
756 | printk(KERN_ERR | ||
757 | "The system will be unstable now.\n"); | ||
758 | return NULL; | 752 | return NULL; |
759 | } | 753 | } |
760 | 754 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d39..32462d2b364a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/rculist.h> | 43 | #include <linux/rculist.h> |
44 | #include <linux/poll.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | 47 | ||
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
54 | { | 55 | { |
55 | } | 56 | } |
56 | 57 | ||
57 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
58 | |||
59 | /* printk's without a loglevel use this.. */ | 58 | /* printk's without a loglevel use this.. */ |
60 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 59 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
61 | 60 | ||
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); | |||
99 | static int console_locked, console_suspended; | 98 | static int console_locked, console_suspended; |
100 | 99 | ||
101 | /* | 100 | /* |
102 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | ||
103 | * It is also used in interesting ways to provide interlocking in | ||
104 | * console_unlock();. | ||
105 | */ | ||
106 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
107 | |||
108 | #define LOG_BUF_MASK (log_buf_len-1) | ||
109 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | ||
110 | |||
111 | /* | ||
112 | * The indices into log_buf are not constrained to log_buf_len - they | ||
113 | * must be masked before subscripting | ||
114 | */ | ||
115 | static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ | ||
116 | static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ | ||
117 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | ||
118 | |||
119 | /* | ||
120 | * If exclusive_console is non-NULL then only this console is to be printed to. | 101 | * If exclusive_console is non-NULL then only this console is to be printed to. |
121 | */ | 102 | */ |
122 | static struct console *exclusive_console; | 103 | static struct console *exclusive_console; |
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline); | |||
145 | /* Flag: console code may call schedule() */ | 126 | /* Flag: console code may call schedule() */ |
146 | static int console_may_schedule; | 127 | static int console_may_schedule; |
147 | 128 | ||
129 | /* | ||
130 | * The printk log buffer consists of a chain of concatenated variable | ||
131 | * length records. Every record starts with a record header, containing | ||
132 | * the overall length of the record. | ||
133 | * | ||
134 | * The heads of the first and last entry in the buffer, as well as the | ||
135 | * sequence numbers of both entries, are maintained when messages | ||
136 | * are stored. | ||
137 | * | ||
138 | * If the heads indicate available messages, the length in the header | ||
139 | * tells where the next message starts. A length == 0 for the next message | ||
140 | * indicates a wrap-around to the beginning of the buffer. | ||
141 | * | ||
142 | * Every record carries a monotonic timestamp in nanoseconds, as well as | ||
143 | * the standard userspace syslog level and syslog facility. The usual | ||
144 | * kernel messages use LOG_KERN; userspace-injected messages always carry | ||
145 | * a matching syslog facility, by default LOG_USER. The origin of every | ||
146 | * message can be reliably determined that way. | ||
147 | * | ||
148 | * The human readable log message directly follows the message header. The | ||
149 | * length of the message text is stored in the header; the stored message | ||
150 | * is not terminated. | ||
151 | * | ||
152 | * Optionally, a message can carry a dictionary of properties (key/value pairs), | ||
153 | * to provide userspace with a machine-readable message context. | ||
154 | * | ||
155 | * Examples for well-defined, commonly used property names are: | ||
156 | * DEVICE=b12:8 device identifier | ||
157 | * b12:8 block dev_t | ||
158 | * c127:3 char dev_t | ||
159 | * n8 netdev ifindex | ||
160 | * +sound:card0 subsystem:devname | ||
161 | * SUBSYSTEM=pci driver-core subsystem name | ||
162 | * | ||
163 | * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value | ||
164 | * follows directly after a '=' character. Every property is terminated by | ||
165 | * a '\0' character. The last property is not terminated. | ||
166 | * | ||
167 | * Example of a message structure: | ||
168 | * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec | ||
169 | * 0008 34 00 record is 52 bytes long | ||
170 | * 000a 0b 00 text is 11 bytes long | ||
171 | * 000c 1f 00 dictionary is 23 bytes long | ||
172 | * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) | ||
173 | * 0010 69 74 27 73 20 61 20 6c "it's a l" | ||
174 | * 69 6e 65 "ine" | ||
175 | * 001b 44 45 56 49 43 "DEVIC" | ||
176 | * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" | ||
177 | * 52 49 56 45 52 3d 62 75 "RIVER=bu" | ||
178 | * 67 "g" | ||
179 | * 0032 00 00 00 padding to next message header | ||
180 | * | ||
181 | * The 'struct log' buffer header must never be directly exported to | ||
182 | * userspace; it is a kernel-private implementation detail that might | ||
183 | * need to be changed in the future, when the requirements change. | ||
184 | * | ||
185 | * /dev/kmsg exports the structured data in the following line format: | ||
186 | * "level,seqnum,timestamp;<message text>\n" | ||
187 | * | ||
188 | * The optional key/value pairs are attached as continuation lines starting | ||
189 | * with a space character and terminated by a newline. All possible | ||
190 | * non-printable characters are escaped in the "\xff" notation. | ||
191 | * | ||
192 | * Users of the export format should ignore possible additional values | ||
193 | * separated by ',', and find the message after the ';' character. | ||
194 | */ | ||
195 | |||
196 | struct log { | ||
197 | u64 ts_nsec; /* timestamp in nanoseconds */ | ||
198 | u16 len; /* length of entire record */ | ||
199 | u16 text_len; /* length of text buffer */ | ||
200 | u16 dict_len; /* length of dictionary buffer */ | ||
201 | u16 level; /* syslog level + facility */ | ||
202 | }; | ||
203 | |||
204 | /* | ||
205 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | ||
206 | * used in interesting ways to provide interlocking in console_unlock(); | ||
207 | */ | ||
208 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
209 | |||
210 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | ||
211 | static u64 syslog_seq; | ||
212 | static u32 syslog_idx; | ||
213 | |||
214 | /* index and sequence number of the first record stored in the buffer */ | ||
215 | static u64 log_first_seq; | ||
216 | static u32 log_first_idx; | ||
217 | |||
218 | /* index and sequence number of the next record to store in the buffer */ | ||
219 | static u64 log_next_seq; | ||
148 | #ifdef CONFIG_PRINTK | 220 | #ifdef CONFIG_PRINTK |
221 | static u32 log_next_idx; | ||
222 | |||
223 | /* the next printk record to read after the last 'clear' command */ | ||
224 | static u64 clear_seq; | ||
225 | static u32 clear_idx; | ||
226 | |||
227 | #define LOG_LINE_MAX 1024 | ||
149 | 228 | ||
150 | static char __log_buf[__LOG_BUF_LEN]; | 229 | /* record buffer */ |
230 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
231 | #define LOG_ALIGN 4 | ||
232 | #else | ||
233 | #define LOG_ALIGN 8 | ||
234 | #endif | ||
235 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
236 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | ||
151 | static char *log_buf = __log_buf; | 237 | static char *log_buf = __log_buf; |
152 | static int log_buf_len = __LOG_BUF_LEN; | 238 | static u32 log_buf_len = __LOG_BUF_LEN; |
153 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 239 | |
154 | static int saved_console_loglevel = -1; | 240 | /* cpu currently holding logbuf_lock */ |
241 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
242 | |||
243 | /* human readable text of the record */ | ||
244 | static char *log_text(const struct log *msg) | ||
245 | { | ||
246 | return (char *)msg + sizeof(struct log); | ||
247 | } | ||
248 | |||
249 | /* optional key/value pair dictionary attached to the record */ | ||
250 | static char *log_dict(const struct log *msg) | ||
251 | { | ||
252 | return (char *)msg + sizeof(struct log) + msg->text_len; | ||
253 | } | ||
254 | |||
255 | /* get record by index; idx must point to valid msg */ | ||
256 | static struct log *log_from_idx(u32 idx) | ||
257 | { | ||
258 | struct log *msg = (struct log *)(log_buf + idx); | ||
259 | |||
260 | /* | ||
261 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
262 | * read the message at the start of the buffer. | ||
263 | */ | ||
264 | if (!msg->len) | ||
265 | return (struct log *)log_buf; | ||
266 | return msg; | ||
267 | } | ||
268 | |||
269 | /* get next record; idx must point to valid msg */ | ||
270 | static u32 log_next(u32 idx) | ||
271 | { | ||
272 | struct log *msg = (struct log *)(log_buf + idx); | ||
273 | |||
274 | /* length == 0 indicates the end of the buffer; wrap */ | ||
275 | /* | ||
276 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
277 | * read the message at the start of the buffer as *this* one, and | ||
278 | * return the one after that. | ||
279 | */ | ||
280 | if (!msg->len) { | ||
281 | msg = (struct log *)log_buf; | ||
282 | return msg->len; | ||
283 | } | ||
284 | return idx + msg->len; | ||
285 | } | ||
286 | |||
287 | /* insert record into the buffer, discard old ones, update heads */ | ||
288 | static void log_store(int facility, int level, | ||
289 | const char *dict, u16 dict_len, | ||
290 | const char *text, u16 text_len) | ||
291 | { | ||
292 | struct log *msg; | ||
293 | u32 size, pad_len; | ||
294 | |||
295 | /* number of '\0' padding bytes to next message */ | ||
296 | size = sizeof(struct log) + text_len + dict_len; | ||
297 | pad_len = (-size) & (LOG_ALIGN - 1); | ||
298 | size += pad_len; | ||
299 | |||
300 | while (log_first_seq < log_next_seq) { | ||
301 | u32 free; | ||
302 | |||
303 | if (log_next_idx > log_first_idx) | ||
304 | free = max(log_buf_len - log_next_idx, log_first_idx); | ||
305 | else | ||
306 | free = log_first_idx - log_next_idx; | ||
307 | |||
308 | if (free > size + sizeof(struct log)) | ||
309 | break; | ||
310 | |||
311 | /* drop old messages until we have enough contiguous space */ | ||
312 | log_first_idx = log_next(log_first_idx); | ||
313 | log_first_seq++; | ||
314 | } | ||
315 | |||
316 | if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { | ||
317 | /* | ||
318 | * This message + an additional empty header does not fit | ||
319 | * at the end of the buffer. Add an empty header with len == 0 | ||
320 | * to signify a wrap around. | ||
321 | */ | ||
322 | memset(log_buf + log_next_idx, 0, sizeof(struct log)); | ||
323 | log_next_idx = 0; | ||
324 | } | ||
325 | |||
326 | /* fill message */ | ||
327 | msg = (struct log *)(log_buf + log_next_idx); | ||
328 | memcpy(log_text(msg), text, text_len); | ||
329 | msg->text_len = text_len; | ||
330 | memcpy(log_dict(msg), dict, dict_len); | ||
331 | msg->dict_len = dict_len; | ||
332 | msg->level = (facility << 3) | (level & 7); | ||
333 | msg->ts_nsec = local_clock(); | ||
334 | memset(log_dict(msg) + dict_len, 0, pad_len); | ||
335 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | ||
336 | |||
337 | /* insert message */ | ||
338 | log_next_idx += msg->len; | ||
339 | log_next_seq++; | ||
340 | } | ||
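Plugging the example record from the header comment into log_store()'s size computation (assuming LOG_ALIGN == 4 and a naturally aligned u64, so sizeof(struct log) == 16):

/*
 * size     = 16 (header) + 11 (text) + 23 (dict) = 50
 * pad_len  = (-50) & (LOG_ALIGN - 1)             = 2
 * msg->len = 50 + 2                              = 52 (0x34)
 * which matches the "34 00" length field at offset 0008 in the dump.
 */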
341 | |||
342 | /* /dev/kmsg - userspace message inject/listen interface */ | ||
343 | struct devkmsg_user { | ||
344 | u64 seq; | ||
345 | u32 idx; | ||
346 | struct mutex lock; | ||
347 | char buf[8192]; | ||
348 | }; | ||
349 | |||
350 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | ||
351 | unsigned long count, loff_t pos) | ||
352 | { | ||
353 | char *buf, *line; | ||
354 | int i; | ||
355 | int level = default_message_loglevel; | ||
356 | int facility = 1; /* LOG_USER */ | ||
357 | size_t len = iov_length(iv, count); | ||
358 | ssize_t ret = len; | ||
359 | |||
360 | if (len > LOG_LINE_MAX) | ||
361 | return -EINVAL; | ||
362 | buf = kmalloc(len+1, GFP_KERNEL); | ||
363 | if (buf == NULL) | ||
364 | return -ENOMEM; | ||
365 | |||
366 | line = buf; | ||
367 | for (i = 0; i < count; i++) { | ||
368 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) | ||
369 | goto out; | ||
370 | line += iv[i].iov_len; | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace | ||
375 | * the decimal value represents 32bit, the lower 3 bit are the log | ||
376 | * level, the rest are the log facility. | ||
377 | * | ||
378 | * If no prefix or no userspace facility is specified, we | ||
379 | * enforce LOG_USER, to be able to reliably distinguish | ||
380 | * kernel-generated messages from userspace-injected ones. | ||
381 | */ | ||
382 | line = buf; | ||
383 | if (line[0] == '<') { | ||
384 | char *endp = NULL; | ||
385 | |||
386 | i = simple_strtoul(line+1, &endp, 10); | ||
387 | if (endp && endp[0] == '>') { | ||
388 | level = i & 7; | ||
389 | if (i >> 3) | ||
390 | facility = i >> 3; | ||
391 | endp++; | ||
392 | len -= endp - line; | ||
393 | line = endp; | ||
394 | } | ||
395 | } | ||
396 | line[len] = '\0'; | ||
397 | |||
398 | printk_emit(facility, level, NULL, 0, "%s", line); | ||
399 | out: | ||
400 | kfree(buf); | ||
401 | return ret; | ||
402 | } | ||
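devkmsg_writev() backs writes to /dev/kmsg. A minimal userspace sketch of the inject path described in the prefix-parsing comment above; the chosen prefix and message text are only an example:

/* userspace: inject a LOG_USER (facility 1), warning-level (4) record */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *line = "<12>hello from userspace\n"; /* 12 = 1 << 3 | 4 */
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, line, strlen(line)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}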
403 | |||
404 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | ||
405 | size_t count, loff_t *ppos) | ||
406 | { | ||
407 | struct devkmsg_user *user = file->private_data; | ||
408 | struct log *msg; | ||
409 | u64 ts_usec; | ||
410 | size_t i; | ||
411 | size_t len; | ||
412 | ssize_t ret; | ||
413 | |||
414 | if (!user) | ||
415 | return -EBADF; | ||
416 | |||
417 | mutex_lock(&user->lock); | ||
418 | raw_spin_lock(&logbuf_lock); | ||
419 | while (user->seq == log_next_seq) { | ||
420 | if (file->f_flags & O_NONBLOCK) { | ||
421 | ret = -EAGAIN; | ||
422 | raw_spin_unlock(&logbuf_lock); | ||
423 | goto out; | ||
424 | } | ||
425 | |||
426 | raw_spin_unlock(&logbuf_lock); | ||
427 | ret = wait_event_interruptible(log_wait, | ||
428 | user->seq != log_next_seq); | ||
429 | if (ret) | ||
430 | goto out; | ||
431 | raw_spin_lock(&logbuf_lock); | ||
432 | } | ||
433 | |||
434 | if (user->seq < log_first_seq) { | ||
435 | /* our last seen message is gone, return error and reset */ | ||
436 | user->idx = log_first_idx; | ||
437 | user->seq = log_first_seq; | ||
438 | ret = -EPIPE; | ||
439 | raw_spin_unlock(&logbuf_lock); | ||
440 | goto out; | ||
441 | } | ||
442 | |||
443 | msg = log_from_idx(user->idx); | ||
444 | ts_usec = msg->ts_nsec; | ||
445 | do_div(ts_usec, 1000); | ||
446 | len = sprintf(user->buf, "%u,%llu,%llu;", | ||
447 | msg->level, user->seq, ts_usec); | ||
448 | |||
449 | /* escape non-printable characters */ | ||
450 | for (i = 0; i < msg->text_len; i++) { | ||
451 | unsigned char c = log_text(msg)[i]; | ||
452 | |||
453 | if (c < ' ' || c >= 128) | ||
454 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
455 | else | ||
456 | user->buf[len++] = c; | ||
457 | } | ||
458 | user->buf[len++] = '\n'; | ||
459 | |||
460 | if (msg->dict_len) { | ||
461 | bool line = true; | ||
462 | |||
463 | for (i = 0; i < msg->dict_len; i++) { | ||
464 | unsigned char c = log_dict(msg)[i]; | ||
465 | |||
466 | if (line) { | ||
467 | user->buf[len++] = ' '; | ||
468 | line = false; | ||
469 | } | ||
470 | |||
471 | if (c == '\0') { | ||
472 | user->buf[len++] = '\n'; | ||
473 | line = true; | ||
474 | continue; | ||
475 | } | ||
476 | |||
477 | if (c < ' ' || c >= 128) { | ||
478 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
479 | continue; | ||
480 | } | ||
481 | |||
482 | user->buf[len++] = c; | ||
483 | } | ||
484 | user->buf[len++] = '\n'; | ||
485 | } | ||
486 | |||
487 | user->idx = log_next(user->idx); | ||
488 | user->seq++; | ||
489 | raw_spin_unlock(&logbuf_lock); | ||
490 | |||
491 | if (len > count) { | ||
492 | ret = -EINVAL; | ||
493 | goto out; | ||
494 | } | ||
495 | |||
496 | if (copy_to_user(buf, user->buf, len)) { | ||
497 | ret = -EFAULT; | ||
498 | goto out; | ||
499 | } | ||
500 | ret = len; | ||
501 | out: | ||
502 | mutex_unlock(&user->lock); | ||
503 | return ret; | ||
504 | } | ||
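For reference, one read() from /dev/kmsg as implemented above yields a line of the form "level,seqnum,timestamp_usec;<escaped text>\n", optionally followed by " KEY=value" continuation lines for the dictionary. Using the record from the header-comment example, with sequence number and timestamp invented for illustration:

/*
 *   3,15,1464393;it's a line
 *    DEVICE=b8:2
 *    DRIVER=bug
 */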
505 | |||
506 | static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | ||
507 | { | ||
508 | struct devkmsg_user *user = file->private_data; | ||
509 | loff_t ret = 0; | ||
510 | |||
511 | if (!user) | ||
512 | return -EBADF; | ||
513 | if (offset) | ||
514 | return -ESPIPE; | ||
515 | |||
516 | raw_spin_lock(&logbuf_lock); | ||
517 | switch (whence) { | ||
518 | case SEEK_SET: | ||
519 | /* the first record */ | ||
520 | user->idx = log_first_idx; | ||
521 | user->seq = log_first_seq; | ||
522 | break; | ||
523 | case SEEK_DATA: | ||
524 | /* | ||
525 | * The first record after the last SYSLOG_ACTION_CLEAR, | ||
526 | * like issued by 'dmesg -c'. Reading /dev/kmsg itself | ||
527 | * changes no global state, and does not clear anything. | ||
528 | */ | ||
529 | user->idx = clear_idx; | ||
530 | user->seq = clear_seq; | ||
531 | break; | ||
532 | case SEEK_END: | ||
533 | /* after the last record */ | ||
534 | user->idx = log_next_idx; | ||
535 | user->seq = log_next_seq; | ||
536 | break; | ||
537 | default: | ||
538 | ret = -EINVAL; | ||
539 | } | ||
540 | raw_spin_unlock(&logbuf_lock); | ||
541 | return ret; | ||
542 | } | ||
543 | |||
544 | static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | ||
545 | { | ||
546 | struct devkmsg_user *user = file->private_data; | ||
547 | int ret = 0; | ||
548 | |||
549 | if (!user) | ||
550 | return POLLERR|POLLNVAL; | ||
551 | |||
552 | poll_wait(file, &log_wait, wait); | ||
553 | |||
554 | raw_spin_lock(&logbuf_lock); | ||
555 | if (user->seq < log_next_seq) { | ||
556 | ret = POLLIN|POLLRDNORM; | ||
557 | /* also flag an error when data has vanished underneath us */ | ||
558 | if (user->seq < log_first_seq) | ||
559 | ret |= POLLERR|POLLPRI; | ||
560 | } | ||
561 | raw_spin_unlock(&logbuf_lock); | ||
562 | |||
563 | return ret; | ||
564 | } | ||
565 | |||
566 | static int devkmsg_open(struct inode *inode, struct file *file) | ||
567 | { | ||
568 | struct devkmsg_user *user; | ||
569 | int err; | ||
570 | |||
571 | /* write-only does not need any file context */ | ||
572 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) | ||
573 | return 0; | ||
574 | |||
575 | err = security_syslog(SYSLOG_ACTION_READ_ALL); | ||
576 | if (err) | ||
577 | return err; | ||
578 | |||
579 | user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); | ||
580 | if (!user) | ||
581 | return -ENOMEM; | ||
582 | |||
583 | mutex_init(&user->lock); | ||
584 | |||
585 | raw_spin_lock(&logbuf_lock); | ||
586 | user->idx = log_first_idx; | ||
587 | user->seq = log_first_seq; | ||
588 | raw_spin_unlock(&logbuf_lock); | ||
589 | |||
590 | file->private_data = user; | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | static int devkmsg_release(struct inode *inode, struct file *file) | ||
595 | { | ||
596 | struct devkmsg_user *user = file->private_data; | ||
597 | |||
598 | if (!user) | ||
599 | return 0; | ||
600 | |||
601 | mutex_destroy(&user->lock); | ||
602 | kfree(user); | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | const struct file_operations kmsg_fops = { | ||
607 | .open = devkmsg_open, | ||
608 | .read = devkmsg_read, | ||
609 | .aio_write = devkmsg_writev, | ||
610 | .llseek = devkmsg_llseek, | ||
611 | .poll = devkmsg_poll, | ||
612 | .release = devkmsg_release, | ||
613 | }; | ||
155 | 614 | ||
156 | #ifdef CONFIG_KEXEC | 615 | #ifdef CONFIG_KEXEC |
157 | /* | 616 | /* |
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1; | |||
165 | void log_buf_kexec_setup(void) | 624 | void log_buf_kexec_setup(void) |
166 | { | 625 | { |
167 | VMCOREINFO_SYMBOL(log_buf); | 626 | VMCOREINFO_SYMBOL(log_buf); |
168 | VMCOREINFO_SYMBOL(log_end); | ||
169 | VMCOREINFO_SYMBOL(log_buf_len); | 627 | VMCOREINFO_SYMBOL(log_buf_len); |
170 | VMCOREINFO_SYMBOL(logged_chars); | 628 | VMCOREINFO_SYMBOL(log_first_idx); |
629 | VMCOREINFO_SYMBOL(log_next_idx); | ||
171 | } | 630 | } |
172 | #endif | 631 | #endif |
173 | 632 | ||
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup); | |||
191 | void __init setup_log_buf(int early) | 650 | void __init setup_log_buf(int early) |
192 | { | 651 | { |
193 | unsigned long flags; | 652 | unsigned long flags; |
194 | unsigned start, dest_idx, offset; | ||
195 | char *new_log_buf; | 653 | char *new_log_buf; |
196 | int free; | 654 | int free; |
197 | 655 | ||
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early) | |||
219 | log_buf_len = new_log_buf_len; | 677 | log_buf_len = new_log_buf_len; |
220 | log_buf = new_log_buf; | 678 | log_buf = new_log_buf; |
221 | new_log_buf_len = 0; | 679 | new_log_buf_len = 0; |
222 | free = __LOG_BUF_LEN - log_end; | 680 | free = __LOG_BUF_LEN - log_next_idx; |
223 | 681 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | |
224 | offset = start = min(con_start, log_start); | ||
225 | dest_idx = 0; | ||
226 | while (start != log_end) { | ||
227 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
228 | |||
229 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
230 | start++; | ||
231 | dest_idx++; | ||
232 | } | ||
233 | log_start -= offset; | ||
234 | con_start -= offset; | ||
235 | log_end -= offset; | ||
236 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 682 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
237 | 683 | ||
238 | pr_info("log_buf_len: %d\n", log_buf_len); | 684 | pr_info("log_buf_len: %d\n", log_buf_len); |
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file) | |||
332 | return 0; | 778 | return 0; |
333 | } | 779 | } |
334 | 780 | ||
781 | #if defined(CONFIG_PRINTK_TIME) | ||
782 | static bool printk_time = 1; | ||
783 | #else | ||
784 | static bool printk_time; | ||
785 | #endif | ||
786 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
787 | |||
788 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | ||
789 | { | ||
790 | size_t len = 0; | ||
791 | |||
792 | if (syslog) { | ||
793 | if (buf) { | ||
794 | len += sprintf(buf, "<%u>", msg->level); | ||
795 | } else { | ||
796 | len += 3; | ||
797 | if (msg->level > 9) | ||
798 | len++; | ||
799 | if (msg->level > 99) | ||
800 | len++; | ||
801 | } | ||
802 | } | ||
803 | |||
804 | if (printk_time) { | ||
805 | if (buf) { | ||
806 | unsigned long long ts = msg->ts_nsec; | ||
807 | unsigned long rem_nsec = do_div(ts, 1000000000); | ||
808 | |||
809 | len += sprintf(buf + len, "[%5lu.%06lu] ", | ||
810 | (unsigned long) ts, rem_nsec / 1000); | ||
811 | } else { | ||
812 | len += 15; | ||
813 | } | ||
814 | } | ||
815 | |||
816 | return len; | ||
817 | } | ||
818 | |||
819 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
820 | char *buf, size_t size) | ||
821 | { | ||
822 | const char *text = log_text(msg); | ||
823 | size_t text_size = msg->text_len; | ||
824 | size_t len = 0; | ||
825 | |||
826 | do { | ||
827 | const char *next = memchr(text, '\n', text_size); | ||
828 | size_t text_len; | ||
829 | |||
830 | if (next) { | ||
831 | text_len = next - text; | ||
832 | next++; | ||
833 | text_size -= next - text; | ||
834 | } else { | ||
835 | text_len = text_size; | ||
836 | } | ||
837 | |||
838 | if (buf) { | ||
839 | if (print_prefix(msg, syslog, NULL) + | ||
840 | text_len + 1 >= size - len) | ||
841 | break; | ||
842 | |||
843 | len += print_prefix(msg, syslog, buf + len); | ||
844 | memcpy(buf + len, text, text_len); | ||
845 | len += text_len; | ||
846 | buf[len++] = '\n'; | ||
847 | } else { | ||
848 | /* SYSLOG_ACTION_* buffer size only calculation */ | ||
849 | len += print_prefix(msg, syslog, NULL); | ||
850 | len += text_len + 1; | ||
851 | } | ||
852 | |||
853 | text = next; | ||
854 | } while (text); | ||
855 | |||
856 | return len; | ||
857 | } | ||
858 | |||
859 | static int syslog_print(char __user *buf, int size) | ||
860 | { | ||
861 | char *text; | ||
862 | struct log *msg; | ||
863 | int len; | ||
864 | |||
865 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
866 | if (!text) | ||
867 | return -ENOMEM; | ||
868 | |||
869 | raw_spin_lock_irq(&logbuf_lock); | ||
870 | if (syslog_seq < log_first_seq) { | ||
871 | /* messages are gone, move to first one */ | ||
872 | syslog_seq = log_first_seq; | ||
873 | syslog_idx = log_first_idx; | ||
874 | } | ||
875 | msg = log_from_idx(syslog_idx); | ||
876 | len = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
877 | syslog_idx = log_next(syslog_idx); | ||
878 | syslog_seq++; | ||
879 | raw_spin_unlock_irq(&logbuf_lock); | ||
880 | |||
881 | if (len > 0 && copy_to_user(buf, text, len)) | ||
882 | len = -EFAULT; | ||
883 | |||
884 | kfree(text); | ||
885 | return len; | ||
886 | } | ||
887 | |||
888 | static int syslog_print_all(char __user *buf, int size, bool clear) | ||
889 | { | ||
890 | char *text; | ||
891 | int len = 0; | ||
892 | |||
893 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
894 | if (!text) | ||
895 | return -ENOMEM; | ||
896 | |||
897 | raw_spin_lock_irq(&logbuf_lock); | ||
898 | if (buf) { | ||
899 | u64 next_seq; | ||
900 | u64 seq; | ||
901 | u32 idx; | ||
902 | |||
903 | if (clear_seq < log_first_seq) { | ||
904 | /* messages are gone, move to first available one */ | ||
905 | clear_seq = log_first_seq; | ||
906 | clear_idx = log_first_idx; | ||
907 | } | ||
908 | |||
909 | /* | ||
910 | * Find first record that fits, including all following records, | ||
911 | * into the user-provided buffer for this dump. | ||
912 | */ | ||
913 | seq = clear_seq; | ||
914 | idx = clear_idx; | ||
915 | while (seq < log_next_seq) { | ||
916 | struct log *msg = log_from_idx(idx); | ||
917 | |||
918 | len += msg_print_text(msg, true, NULL, 0); | ||
919 | idx = log_next(idx); | ||
920 | seq++; | ||
921 | } | ||
922 | seq = clear_seq; | ||
923 | idx = clear_idx; | ||
924 | while (len > size && seq < log_next_seq) { | ||
925 | struct log *msg = log_from_idx(idx); | ||
926 | |||
927 | len -= msg_print_text(msg, true, NULL, 0); | ||
928 | idx = log_next(idx); | ||
929 | seq++; | ||
930 | } | ||
931 | |||
932 | /* last message in this dump */ | ||
933 | next_seq = log_next_seq; | ||
934 | |||
935 | len = 0; | ||
936 | while (len >= 0 && seq < next_seq) { | ||
937 | struct log *msg = log_from_idx(idx); | ||
938 | int textlen; | ||
939 | |||
940 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
941 | if (textlen < 0) { | ||
942 | len = textlen; | ||
943 | break; | ||
944 | } | ||
945 | idx = log_next(idx); | ||
946 | seq++; | ||
947 | |||
948 | raw_spin_unlock_irq(&logbuf_lock); | ||
949 | if (copy_to_user(buf + len, text, textlen)) | ||
950 | len = -EFAULT; | ||
951 | else | ||
952 | len += textlen; | ||
953 | raw_spin_lock_irq(&logbuf_lock); | ||
954 | |||
955 | if (seq < log_first_seq) { | ||
956 | /* messages are gone, move to next one */ | ||
957 | seq = log_first_seq; | ||
958 | idx = log_first_idx; | ||
959 | } | ||
960 | } | ||
961 | } | ||
962 | |||
963 | if (clear) { | ||
964 | clear_seq = log_next_seq; | ||
965 | clear_idx = log_next_idx; | ||
966 | } | ||
967 | raw_spin_unlock_irq(&logbuf_lock); | ||
968 | |||
969 | kfree(text); | ||
970 | return len; | ||
971 | } | ||
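syslog_print_all() is where SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_READ_CLEAR end up. From userspace the same path is reachable through klogctl(3), as dmesg does; the buffer size below is an arbitrary choice for the example:

#include <stdio.h>
#include <sys/klog.h>

int main(void)
{
	static char buf[1 << 17];		/* arbitrary example size */
	int n = klogctl(3 /* SYSLOG_ACTION_READ_ALL */, buf, sizeof(buf));

	if (n > 0)
		fwrite(buf, 1, n, stdout);
	return n < 0;
}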
972 | |||
335 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 973 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
336 | { | 974 | { |
337 | unsigned i, j, limit, count; | 975 | bool clear = false; |
338 | int do_clear = 0; | 976 | static int saved_console_loglevel = -1; |
339 | char c; | ||
340 | int error; | 977 | int error; |
341 | 978 | ||
342 | error = check_syslog_permissions(type, from_file); | 979 | error = check_syslog_permissions(type, from_file); |
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
364 | goto out; | 1001 | goto out; |
365 | } | 1002 | } |
366 | error = wait_event_interruptible(log_wait, | 1003 | error = wait_event_interruptible(log_wait, |
367 | (log_start - log_end)); | 1004 | syslog_seq != log_next_seq); |
368 | if (error) | 1005 | if (error) |
369 | goto out; | 1006 | goto out; |
370 | i = 0; | 1007 | error = syslog_print(buf, len); |
371 | raw_spin_lock_irq(&logbuf_lock); | ||
372 | while (!error && (log_start != log_end) && i < len) { | ||
373 | c = LOG_BUF(log_start); | ||
374 | log_start++; | ||
375 | raw_spin_unlock_irq(&logbuf_lock); | ||
376 | error = __put_user(c,buf); | ||
377 | buf++; | ||
378 | i++; | ||
379 | cond_resched(); | ||
380 | raw_spin_lock_irq(&logbuf_lock); | ||
381 | } | ||
382 | raw_spin_unlock_irq(&logbuf_lock); | ||
383 | if (!error) | ||
384 | error = i; | ||
385 | break; | 1008 | break; |
386 | /* Read/clear last kernel messages */ | 1009 | /* Read/clear last kernel messages */ |
387 | case SYSLOG_ACTION_READ_CLEAR: | 1010 | case SYSLOG_ACTION_READ_CLEAR: |
388 | do_clear = 1; | 1011 | clear = true; |
389 | /* FALL THRU */ | 1012 | /* FALL THRU */ |
390 | /* Read last kernel messages */ | 1013 | /* Read last kernel messages */ |
391 | case SYSLOG_ACTION_READ_ALL: | 1014 | case SYSLOG_ACTION_READ_ALL: |
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | error = -EFAULT; | 1022 | error = -EFAULT; |
400 | goto out; | 1023 | goto out; |
401 | } | 1024 | } |
402 | count = len; | 1025 | error = syslog_print_all(buf, len, clear); |
403 | if (count > log_buf_len) | ||
404 | count = log_buf_len; | ||
405 | raw_spin_lock_irq(&logbuf_lock); | ||
406 | if (count > logged_chars) | ||
407 | count = logged_chars; | ||
408 | if (do_clear) | ||
409 | logged_chars = 0; | ||
410 | limit = log_end; | ||
411 | /* | ||
412 | * __put_user() could sleep, and while we sleep | ||
413 | * printk() could overwrite the messages | ||
414 | * we try to copy to user space. Therefore | ||
415 | * the messages are copied in reverse. <manfreds> | ||
416 | */ | ||
417 | for (i = 0; i < count && !error; i++) { | ||
418 | j = limit-1-i; | ||
419 | if (j + log_buf_len < log_end) | ||
420 | break; | ||
421 | c = LOG_BUF(j); | ||
422 | raw_spin_unlock_irq(&logbuf_lock); | ||
423 | error = __put_user(c,&buf[count-1-i]); | ||
424 | cond_resched(); | ||
425 | raw_spin_lock_irq(&logbuf_lock); | ||
426 | } | ||
427 | raw_spin_unlock_irq(&logbuf_lock); | ||
428 | if (error) | ||
429 | break; | ||
430 | error = i; | ||
431 | if (i != count) { | ||
432 | int offset = count-error; | ||
433 | /* buffer overflow during copy, correct user buffer. */ | ||
434 | for (i = 0; i < error; i++) { | ||
435 | if (__get_user(c,&buf[i+offset]) || | ||
436 | __put_user(c,&buf[i])) { | ||
437 | error = -EFAULT; | ||
438 | break; | ||
439 | } | ||
440 | cond_resched(); | ||
441 | } | ||
442 | } | ||
443 | break; | 1026 | break; |
444 | /* Clear ring buffer */ | 1027 | /* Clear ring buffer */ |
445 | case SYSLOG_ACTION_CLEAR: | 1028 | case SYSLOG_ACTION_CLEAR: |
446 | logged_chars = 0; | 1029 | syslog_print_all(NULL, 0, true); |
447 | break; | ||
448 | /* Disable logging to console */ | 1030 | /* Disable logging to console */ |
449 | case SYSLOG_ACTION_CONSOLE_OFF: | 1031 | case SYSLOG_ACTION_CONSOLE_OFF: |
450 | if (saved_console_loglevel == -1) | 1032 | if (saved_console_loglevel == -1) |
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
472 | break; | 1054 | break; |
473 | /* Number of chars in the log buffer */ | 1055 | /* Number of chars in the log buffer */ |
474 | case SYSLOG_ACTION_SIZE_UNREAD: | 1056 | case SYSLOG_ACTION_SIZE_UNREAD: |
475 | error = log_end - log_start; | 1057 | raw_spin_lock_irq(&logbuf_lock); |
1058 | if (syslog_seq < log_first_seq) { | ||
1059 | /* messages are gone, move to first one */ | ||
1060 | syslog_seq = log_first_seq; | ||
1061 | syslog_idx = log_first_idx; | ||
1062 | } | ||
1063 | if (from_file) { | ||
1064 | /* | ||
1065 | * Short-cut for poll() on /proc/kmsg, which simply checks | ||
1066 | * for pending data, not the size; return the count of | ||
1067 | * records, not the length. | ||
1068 | */ | ||
1069 | error = log_next_idx - syslog_idx; | ||
1070 | } else { | ||
1071 | u64 seq; | ||
1072 | u32 idx; | ||
1073 | |||
1074 | error = 0; | ||
1075 | seq = syslog_seq; | ||
1076 | idx = syslog_idx; | ||
1077 | while (seq < log_next_seq) { | ||
1078 | struct log *msg = log_from_idx(idx); | ||
1079 | |||
1080 | error += msg_print_text(msg, true, NULL, 0); | ||
1081 | idx = log_next(idx); | ||
1082 | seq++; | ||
1083 | } | ||
1084 | } | ||
1085 | raw_spin_unlock_irq(&logbuf_lock); | ||
476 | break; | 1086 | break; |
477 | /* Size of the log buffer */ | 1087 | /* Size of the log buffer */ |
478 | case SYSLOG_ACTION_SIZE_BUFFER: | 1088 | case SYSLOG_ACTION_SIZE_BUFFER: |
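For reference (not part of the patch): the actions handled above are reachable from userspace through klogctl(3), the glibc wrapper around the syslog(2) system call that do_syslog() serves. A minimal sketch, assuming the numeric action values documented in syslog(2) and a process with CAP_SYSLOG; with the record-based buffer, SIZE_UNREAD now reflects the formatted length of the records the caller has not read yet, as computed by the msg_print_text() loop above.

    /* Userspace sketch, not kernel code: query and drain the kernel log. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/klog.h>

    #define ACTION_READ_ALL    3    /* SYSLOG_ACTION_READ_ALL, per syslog(2)    */
    #define ACTION_SIZE_UNREAD 9    /* SYSLOG_ACTION_SIZE_UNREAD, per syslog(2) */
    #define ACTION_SIZE_BUFFER 10   /* SYSLOG_ACTION_SIZE_BUFFER, per syslog(2) */

    int main(void)
    {
        int unread = klogctl(ACTION_SIZE_UNREAD, NULL, 0);  /* needs CAP_SYSLOG */
        int bufsz = klogctl(ACTION_SIZE_BUFFER, NULL, 0);
        char *buf;
        int n;

        if (unread < 0 || bufsz <= 0) {
            perror("klogctl");
            return 1;
        }
        printf("unread bytes: %d, ring buffer size: %d\n", unread, bufsz);

        buf = malloc(bufsz);
        if (!buf)
            return 1;
        n = klogctl(ACTION_READ_ALL, buf, bufsz);   /* non-destructive read */
        if (n > 0)
            fwrite(buf, 1, n, stdout);
        free(buf);
        return 0;
    }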
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4]) | |||
501 | { | 1111 | { |
502 | syslog_data[0] = log_buf; | 1112 | syslog_data[0] = log_buf; |
503 | syslog_data[1] = log_buf + log_buf_len; | 1113 | syslog_data[1] = log_buf + log_buf_len; |
504 | syslog_data[2] = log_buf + log_end - | 1114 | syslog_data[2] = log_buf + log_first_idx; |
505 | (logged_chars < log_buf_len ? logged_chars : log_buf_len); | 1115 | syslog_data[3] = log_buf + log_next_idx; |
506 | syslog_data[3] = log_buf + log_end; | ||
507 | } | 1116 | } |
508 | #endif /* CONFIG_KGDB_KDB */ | 1117 | #endif /* CONFIG_KGDB_KDB */ |
509 | 1118 | ||
510 | /* | ||
511 | * Call the console drivers on a range of log_buf | ||
512 | */ | ||
513 | static void __call_console_drivers(unsigned start, unsigned end) | ||
514 | { | ||
515 | struct console *con; | ||
516 | |||
517 | for_each_console(con) { | ||
518 | if (exclusive_console && con != exclusive_console) | ||
519 | continue; | ||
520 | if ((con->flags & CON_ENABLED) && con->write && | ||
521 | (cpu_online(smp_processor_id()) || | ||
522 | (con->flags & CON_ANYTIME))) | ||
523 | con->write(con, &LOG_BUF(start), end - start); | ||
524 | } | ||
525 | } | ||
526 | |||
527 | static bool __read_mostly ignore_loglevel; | 1119 | static bool __read_mostly ignore_loglevel; |
528 | 1120 | ||
529 | static int __init ignore_loglevel_setup(char *str) | 1121 | static int __init ignore_loglevel_setup(char *str) |
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | |||
540 | "print all kernel messages to the console."); | 1132 | "print all kernel messages to the console."); |
541 | 1133 | ||
542 | /* | 1134 | /* |
543 | * Write out chars from start to end - 1 inclusive | ||
544 | */ | ||
545 | static void _call_console_drivers(unsigned start, | ||
546 | unsigned end, int msg_log_level) | ||
547 | { | ||
548 | trace_console(&LOG_BUF(0), start, end, log_buf_len); | ||
549 | |||
550 | if ((msg_log_level < console_loglevel || ignore_loglevel) && | ||
551 | console_drivers && start != end) { | ||
552 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | ||
553 | /* wrapped write */ | ||
554 | __call_console_drivers(start & LOG_BUF_MASK, | ||
555 | log_buf_len); | ||
556 | __call_console_drivers(0, end & LOG_BUF_MASK); | ||
557 | } else { | ||
558 | __call_console_drivers(start, end); | ||
559 | } | ||
560 | } | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the | ||
565 | * lower 3 bit are the log level, the rest are the log facility. In case | ||
566 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
567 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
568 | * to extract the correct log level for in-kernel processing, and not mangle | ||
569 | * the original value. | ||
570 | * | ||
571 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
572 | * passed, it will be filled in with the log level without a possible facility | ||
573 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
574 | * and returned. If no valid header is found, 0 is returned and the passed | ||
575 | * variables are not touched. | ||
576 | */ | ||
577 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
578 | { | ||
579 | unsigned int lev = 0; | ||
580 | char sp = '\0'; | ||
581 | size_t len; | ||
582 | |||
583 | if (p[0] != '<' || !p[1]) | ||
584 | return 0; | ||
585 | if (p[2] == '>') { | ||
586 | /* usual single digit level number or special char */ | ||
587 | switch (p[1]) { | ||
588 | case '0' ... '7': | ||
589 | lev = p[1] - '0'; | ||
590 | break; | ||
591 | case 'c': /* KERN_CONT */ | ||
592 | case 'd': /* KERN_DEFAULT */ | ||
593 | sp = p[1]; | ||
594 | break; | ||
595 | default: | ||
596 | return 0; | ||
597 | } | ||
598 | len = 3; | ||
599 | } else { | ||
600 | /* multi digit including the level and facility number */ | ||
601 | char *endp = NULL; | ||
602 | |||
603 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
604 | if (endp == NULL || endp[0] != '>') | ||
605 | return 0; | ||
606 | len = (endp + 1) - p; | ||
607 | } | ||
608 | |||
609 | /* do not accept special char if not asked for */ | ||
610 | if (sp && !special) | ||
611 | return 0; | ||
612 | |||
613 | if (special) { | ||
614 | *special = sp; | ||
615 | /* return special char, do not touch level */ | ||
616 | if (sp) | ||
617 | return len; | ||
618 | } | ||
619 | |||
620 | if (level) | ||
621 | *level = lev; | ||
622 | return len; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * Call the console drivers, asking them to write out | 1135 | * Call the console drivers, asking them to write out |
627 | * log_buf[start] to log_buf[end - 1]. | 1136 | * log_buf[start] to log_buf[end - 1]. |
628 | * The console_lock must be held. | 1137 | * The console_lock must be held. |
629 | */ | 1138 | */ |
630 | static void call_console_drivers(unsigned start, unsigned end) | 1139 | static void call_console_drivers(int level, const char *text, size_t len) |
631 | { | 1140 | { |
632 | unsigned cur_index, start_print; | 1141 | struct console *con; |
633 | static int msg_level = -1; | ||
634 | 1142 | ||
635 | BUG_ON(((int)(start - end)) > 0); | 1143 | trace_console(text, 0, len, len); |
636 | 1144 | ||
637 | cur_index = start; | 1145 | if (level >= console_loglevel && !ignore_loglevel) |
638 | start_print = start; | 1146 | return; |
639 | while (cur_index != end) { | 1147 | if (!console_drivers) |
640 | if (msg_level < 0 && ((end - cur_index) > 2)) { | 1148 | return; |
641 | /* strip log prefix */ | ||
642 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); | ||
643 | start_print = cur_index; | ||
644 | } | ||
645 | while (cur_index != end) { | ||
646 | char c = LOG_BUF(cur_index); | ||
647 | |||
648 | cur_index++; | ||
649 | if (c == '\n') { | ||
650 | if (msg_level < 0) { | ||
651 | /* | ||
652 | * printk() has already given us loglevel tags in | ||
653 | * the buffer. This code is here in case the | ||
654 | * log buffer has wrapped right round and scribbled | ||
655 | * on those tags | ||
656 | */ | ||
657 | msg_level = default_message_loglevel; | ||
658 | } | ||
659 | _call_console_drivers(start_print, cur_index, msg_level); | ||
660 | msg_level = -1; | ||
661 | start_print = cur_index; | ||
662 | break; | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | _call_console_drivers(start_print, end, msg_level); | ||
667 | } | ||
668 | 1149 | ||
669 | static void emit_log_char(char c) | 1150 | for_each_console(con) { |
670 | { | 1151 | if (exclusive_console && con != exclusive_console) |
671 | LOG_BUF(log_end) = c; | 1152 | continue; |
672 | log_end++; | 1153 | if (!(con->flags & CON_ENABLED)) |
673 | if (log_end - log_start > log_buf_len) | 1154 | continue; |
674 | log_start = log_end - log_buf_len; | 1155 | if (!con->write) |
675 | if (log_end - con_start > log_buf_len) | 1156 | continue; |
676 | con_start = log_end - log_buf_len; | 1157 | if (!cpu_online(smp_processor_id()) && |
677 | if (logged_chars < log_buf_len) | 1158 | !(con->flags & CON_ANYTIME)) |
678 | logged_chars++; | 1159 | continue; |
1160 | con->write(con, text, len); | ||
1161 | } | ||
679 | } | 1162 | } |
680 | 1163 | ||
681 | /* | 1164 | /* |
@@ -700,16 +1183,6 @@ static void zap_locks(void) | |||
700 | sema_init(&console_sem, 1); | 1183 | sema_init(&console_sem, 1); |
701 | } | 1184 | } |
702 | 1185 | ||
703 | #if defined(CONFIG_PRINTK_TIME) | ||
704 | static bool printk_time = 1; | ||
705 | #else | ||
706 | static bool printk_time = 0; | ||
707 | #endif | ||
708 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
709 | |||
710 | static bool always_kmsg_dump; | ||
711 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
712 | |||
713 | /* Check if we have any console registered that can be called early in boot. */ | 1186 | /* Check if we have any console registered that can be called early in boot. */ |
714 | static int have_callable_console(void) | 1187 | static int have_callable_console(void) |
715 | { | 1188 | { |
@@ -722,51 +1195,6 @@ static int have_callable_console(void) | |||
722 | return 0; | 1195 | return 0; |
723 | } | 1196 | } |
724 | 1197 | ||
725 | /** | ||
726 | * printk - print a kernel message | ||
727 | * @fmt: format string | ||
728 | * | ||
729 | * This is printk(). It can be called from any context. We want it to work. | ||
730 | * | ||
731 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and | ||
732 | * call the console drivers. If we fail to get the semaphore we place the output | ||
733 | * into the log buffer and return. The current holder of the console_sem will | ||
734 | * notice the new output in console_unlock(); and will send it to the | ||
735 | * consoles before releasing the lock. | ||
736 | * | ||
737 | * One effect of this deferred printing is that code which calls printk() and | ||
738 | * then changes console_loglevel may break. This is because console_loglevel | ||
739 | * is inspected when the actual printing occurs. | ||
740 | * | ||
741 | * See also: | ||
742 | * printf(3) | ||
743 | * | ||
744 | * See the vsnprintf() documentation for format string extensions over C99. | ||
745 | */ | ||
746 | |||
747 | asmlinkage int printk(const char *fmt, ...) | ||
748 | { | ||
749 | va_list args; | ||
750 | int r; | ||
751 | |||
752 | #ifdef CONFIG_KGDB_KDB | ||
753 | if (unlikely(kdb_trap_printk)) { | ||
754 | va_start(args, fmt); | ||
755 | r = vkdb_printf(fmt, args); | ||
756 | va_end(args); | ||
757 | return r; | ||
758 | } | ||
759 | #endif | ||
760 | va_start(args, fmt); | ||
761 | r = vprintk(fmt, args); | ||
762 | va_end(args); | ||
763 | |||
764 | return r; | ||
765 | } | ||
766 | |||
767 | /* cpu currently holding logbuf_lock */ | ||
768 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
769 | |||
770 | /* | 1198 | /* |
771 | * Can we actually use the console at this time on this cpu? | 1199 | * Can we actually use the console at this time on this cpu? |
772 | * | 1200 | * |
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
810 | retval = 0; | 1238 | retval = 0; |
811 | } | 1239 | } |
812 | } | 1240 | } |
813 | printk_cpu = UINT_MAX; | 1241 | logbuf_cpu = UINT_MAX; |
814 | if (wake) | 1242 | if (wake) |
815 | up(&console_sem); | 1243 | up(&console_sem); |
816 | raw_spin_unlock(&logbuf_lock); | 1244 | raw_spin_unlock(&logbuf_lock); |
817 | return retval; | 1245 | return retval; |
818 | } | 1246 | } |
819 | static const char recursion_bug_msg [] = | ||
820 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
821 | static int recursion_bug; | ||
822 | static int new_text_line = 1; | ||
823 | static char printk_buf[1024]; | ||
824 | 1247 | ||
825 | int printk_delay_msec __read_mostly; | 1248 | int printk_delay_msec __read_mostly; |
826 | 1249 | ||
@@ -836,15 +1259,23 @@ static inline void printk_delay(void) | |||
836 | } | 1259 | } |
837 | } | 1260 | } |
838 | 1261 | ||
839 | asmlinkage int vprintk(const char *fmt, va_list args) | 1262 | asmlinkage int vprintk_emit(int facility, int level, |
1263 | const char *dict, size_t dictlen, | ||
1264 | const char *fmt, va_list args) | ||
840 | { | 1265 | { |
841 | int printed_len = 0; | 1266 | static int recursion_bug; |
842 | int current_log_level = default_message_loglevel; | 1267 | static char cont_buf[LOG_LINE_MAX]; |
1268 | static size_t cont_len; | ||
1269 | static int cont_level; | ||
1270 | static struct task_struct *cont_task; | ||
1271 | static char textbuf[LOG_LINE_MAX]; | ||
1272 | char *text = textbuf; | ||
1273 | size_t text_len; | ||
843 | unsigned long flags; | 1274 | unsigned long flags; |
844 | int this_cpu; | 1275 | int this_cpu; |
845 | char *p; | 1276 | bool newline = false; |
846 | size_t plen; | 1277 | bool prefix = false; |
847 | char special; | 1278 | int printed_len = 0; |
848 | 1279 | ||
849 | boot_delay_msec(); | 1280 | boot_delay_msec(); |
850 | printk_delay(); | 1281 | printk_delay(); |
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | /* | 1287 | /* |
857 | * Ouch, printk recursed into itself! | 1288 | * Ouch, printk recursed into itself! |
858 | */ | 1289 | */ |
859 | if (unlikely(printk_cpu == this_cpu)) { | 1290 | if (unlikely(logbuf_cpu == this_cpu)) { |
860 | /* | 1291 | /* |
861 | * If a crash is occurring during printk() on this CPU, | 1292 | * If a crash is occurring during printk() on this CPU, |
862 | * then try to get the crash message out but make sure | 1293 | * then try to get the crash message out but make sure |
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
873 | 1304 | ||
874 | lockdep_off(); | 1305 | lockdep_off(); |
875 | raw_spin_lock(&logbuf_lock); | 1306 | raw_spin_lock(&logbuf_lock); |
876 | printk_cpu = this_cpu; | 1307 | logbuf_cpu = this_cpu; |
877 | 1308 | ||
878 | if (recursion_bug) { | 1309 | if (recursion_bug) { |
1310 | static const char recursion_msg[] = | ||
1311 | "BUG: recent printk recursion!"; | ||
1312 | |||
879 | recursion_bug = 0; | 1313 | recursion_bug = 0; |
880 | strcpy(printk_buf, recursion_bug_msg); | 1314 | printed_len += strlen(recursion_msg); |
881 | printed_len = strlen(recursion_bug_msg); | 1315 | /* emit KERN_CRIT message */ |
1316 | log_store(0, 2, NULL, 0, recursion_msg, printed_len); | ||
882 | } | 1317 | } |
883 | /* Emit the output into the temporary buffer */ | ||
884 | printed_len += vscnprintf(printk_buf + printed_len, | ||
885 | sizeof(printk_buf) - printed_len, fmt, args); | ||
886 | 1318 | ||
887 | p = printk_buf; | 1319 | /* |
1320 | * The printf needs to come first; we need the syslog | ||
1321 | * prefix which might be passed-in as a parameter. | ||
1322 | */ | ||
1323 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | ||
888 | 1324 | ||
889 | /* Read log level and handle special printk prefix */ | 1325 | /* mark and strip a trailing newline */ |
890 | plen = log_prefix(p, ¤t_log_level, &special); | 1326 | if (text_len && text[text_len-1] == '\n') { |
891 | if (plen) { | 1327 | text_len--; |
892 | p += plen; | 1328 | newline = true; |
1329 | } | ||
893 | 1330 | ||
894 | switch (special) { | 1331 | /* strip syslog prefix and extract log level or control flags */ |
895 | case 'c': /* Strip <c> KERN_CONT, continue line */ | 1332 | if (text[0] == '<' && text[1] && text[2] == '>') { |
896 | plen = 0; | 1333 | switch (text[1]) { |
897 | break; | 1334 | case '0' ... '7': |
898 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ | 1335 | if (level == -1) |
899 | plen = 0; | 1336 | level = text[1] - '0'; |
900 | default: | 1337 | case 'd': /* KERN_DEFAULT */ |
901 | if (!new_text_line) { | 1338 | prefix = true; |
902 | emit_log_char('\n'); | 1339 | case 'c': /* KERN_CONT */ |
903 | new_text_line = 1; | 1340 | text += 3; |
904 | } | 1341 | text_len -= 3; |
905 | } | 1342 | } |
906 | } | 1343 | } |
907 | 1344 | ||
908 | /* | 1345 | if (level == -1) |
909 | * Copy the output into log_buf. If the caller didn't provide | 1346 | level = default_message_loglevel; |
910 | * the appropriate log prefix, we insert them here | ||
911 | */ | ||
912 | for (; *p; p++) { | ||
913 | if (new_text_line) { | ||
914 | new_text_line = 0; | ||
915 | |||
916 | if (plen) { | ||
917 | /* Copy original log prefix */ | ||
918 | int i; | ||
919 | |||
920 | for (i = 0; i < plen; i++) | ||
921 | emit_log_char(printk_buf[i]); | ||
922 | printed_len += plen; | ||
923 | } else { | ||
924 | /* Add log prefix */ | ||
925 | emit_log_char('<'); | ||
926 | emit_log_char(current_log_level + '0'); | ||
927 | emit_log_char('>'); | ||
928 | printed_len += 3; | ||
929 | } | ||
930 | 1347 | ||
931 | if (printk_time) { | 1348 | if (dict) { |
932 | /* Add the current time stamp */ | 1349 | prefix = true; |
933 | char tbuf[50], *tp; | 1350 | newline = true; |
934 | unsigned tlen; | 1351 | } |
935 | unsigned long long t; | ||
936 | unsigned long nanosec_rem; | ||
937 | |||
938 | t = cpu_clock(printk_cpu); | ||
939 | nanosec_rem = do_div(t, 1000000000); | ||
940 | tlen = sprintf(tbuf, "[%5lu.%06lu] ", | ||
941 | (unsigned long) t, | ||
942 | nanosec_rem / 1000); | ||
943 | |||
944 | for (tp = tbuf; tp < tbuf + tlen; tp++) | ||
945 | emit_log_char(*tp); | ||
946 | printed_len += tlen; | ||
947 | } | ||
948 | 1352 | ||
949 | if (!*p) | 1353 | if (!newline) { |
950 | break; | 1354 | if (cont_len && (prefix || cont_task != current)) { |
1355 | /* | ||
1356 | * Flush earlier buffer, which is either from a | ||
1357 | * different thread, or when we got a new prefix. | ||
1358 | */ | ||
1359 | log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); | ||
1360 | cont_len = 0; | ||
951 | } | 1361 | } |
952 | 1362 | ||
953 | emit_log_char(*p); | 1363 | if (!cont_len) { |
954 | if (*p == '\n') | 1364 | cont_level = level; |
955 | new_text_line = 1; | 1365 | cont_task = current; |
1366 | } | ||
1367 | |||
1368 | /* buffer or append to earlier buffer from the same thread */ | ||
1369 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1370 | text_len = sizeof(cont_buf) - cont_len; | ||
1371 | memcpy(cont_buf + cont_len, text, text_len); | ||
1372 | cont_len += text_len; | ||
1373 | } else { | ||
1374 | if (cont_len && cont_task == current) { | ||
1375 | if (prefix) { | ||
1376 | /* | ||
1377 | * New prefix from the same thread; flush. We | ||
1378 | * either got no earlier newline, or we race | ||
1379 | * with an interrupt. | ||
1380 | */ | ||
1381 | log_store(facility, cont_level, | ||
1382 | NULL, 0, cont_buf, cont_len); | ||
1383 | cont_len = 0; | ||
1384 | } | ||
1385 | |||
1386 | /* append to the earlier buffer and flush */ | ||
1387 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1388 | text_len = sizeof(cont_buf) - cont_len; | ||
1389 | memcpy(cont_buf + cont_len, text, text_len); | ||
1390 | cont_len += text_len; | ||
1391 | log_store(facility, cont_level, | ||
1392 | NULL, 0, cont_buf, cont_len); | ||
1393 | cont_len = 0; | ||
1394 | cont_task = NULL; | ||
1395 | printed_len = cont_len; | ||
1396 | } else { | ||
1397 | /* ordinary single and terminated line */ | ||
1398 | log_store(facility, level, | ||
1399 | dict, dictlen, text, text_len); | ||
1400 | printed_len = text_len; | ||
1401 | } | ||
956 | } | 1402 | } |
957 | 1403 | ||
958 | /* | 1404 | /* |
959 | * Try to acquire and then immediately release the | 1405 | * Try to acquire and then immediately release the console semaphore. |
960 | * console semaphore. The release will do all the | 1406 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
961 | * actual magic (print out buffers, wake up klogd, | 1407 | * users. |
962 | * etc). | ||
963 | * | 1408 | * |
964 | * The console_trylock_for_printk() function | 1409 | * The console_trylock_for_printk() function will release 'logbuf_lock' |
965 | * will release 'logbuf_lock' regardless of whether it | 1410 | * regardless of whether it actually gets the console semaphore or not. |
966 | * actually gets the semaphore or not. | ||
967 | */ | 1411 | */ |
968 | if (console_trylock_for_printk(this_cpu)) | 1412 | if (console_trylock_for_printk(this_cpu)) |
969 | console_unlock(); | 1413 | console_unlock(); |
@@ -974,16 +1418,81 @@ out_restore_irqs: | |||
974 | 1418 | ||
975 | return printed_len; | 1419 | return printed_len; |
976 | } | 1420 | } |
977 | EXPORT_SYMBOL(printk); | 1421 | EXPORT_SYMBOL(vprintk_emit); |
978 | EXPORT_SYMBOL(vprintk); | ||
979 | 1422 | ||
980 | #else | 1423 | asmlinkage int vprintk(const char *fmt, va_list args) |
1424 | { | ||
1425 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1426 | } | ||
1427 | EXPORT_SYMBOL(vprintk); | ||
981 | 1428 | ||
982 | static void call_console_drivers(unsigned start, unsigned end) | 1429 | asmlinkage int printk_emit(int facility, int level, |
1430 | const char *dict, size_t dictlen, | ||
1431 | const char *fmt, ...) | ||
983 | { | 1432 | { |
1433 | va_list args; | ||
1434 | int r; | ||
1435 | |||
1436 | va_start(args, fmt); | ||
1437 | r = vprintk_emit(facility, level, dict, dictlen, fmt, args); | ||
1438 | va_end(args); | ||
1439 | |||
1440 | return r; | ||
984 | } | 1441 | } |
1442 | EXPORT_SYMBOL(printk_emit); | ||
985 | 1443 | ||
1444 | /** | ||
1445 | * printk - print a kernel message | ||
1446 | * @fmt: format string | ||
1447 | * | ||
1448 | * This is printk(). It can be called from any context. We want it to work. | ||
1449 | * | ||
1450 | * We try to grab the console_lock. If we succeed, it's easy - we log the | ||
1451 | * output and call the console drivers. If we fail to get the semaphore, we | ||
1452 | * place the output into the log buffer and return. The current holder of | ||
1453 | * the console_sem will notice the new output in console_unlock(); and will | ||
1454 | * send it to the consoles before releasing the lock. | ||
1455 | * | ||
1456 | * One effect of this deferred printing is that code which calls printk() and | ||
1457 | * then changes console_loglevel may break. This is because console_loglevel | ||
1458 | * is inspected when the actual printing occurs. | ||
1459 | * | ||
1460 | * See also: | ||
1461 | * printf(3) | ||
1462 | * | ||
1463 | * See the vsnprintf() documentation for format string extensions over C99. | ||
1464 | */ | ||
1465 | asmlinkage int printk(const char *fmt, ...) | ||
1466 | { | ||
1467 | va_list args; | ||
1468 | int r; | ||
1469 | |||
1470 | #ifdef CONFIG_KGDB_KDB | ||
1471 | if (unlikely(kdb_trap_printk)) { | ||
1472 | va_start(args, fmt); | ||
1473 | r = vkdb_printf(fmt, args); | ||
1474 | va_end(args); | ||
1475 | return r; | ||
1476 | } | ||
986 | #endif | 1477 | #endif |
1478 | va_start(args, fmt); | ||
1479 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1480 | va_end(args); | ||
1481 | |||
1482 | return r; | ||
1483 | } | ||
1484 | EXPORT_SYMBOL(printk); | ||
1485 | |||
1486 | #else | ||
1487 | |||
1488 | #define LOG_LINE_MAX 0 | ||
1489 | static struct log *log_from_idx(u32 idx) { return NULL; } | ||
1490 | static u32 log_next(u32 idx) { return 0; } | ||
1491 | static void call_console_drivers(int level, const char *text, size_t len) {} | ||
1492 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
1493 | char *buf, size_t size) { return 0; } | ||
1494 | |||
1495 | #endif /* CONFIG_PRINTK */ | ||
987 | 1496 | ||
988 | static int __add_preferred_console(char *name, int idx, char *options, | 1497 | static int __add_preferred_console(char *name, int idx, char *options, |
989 | char *brl_options) | 1498 | char *brl_options) |
@@ -1217,7 +1726,7 @@ int is_console_locked(void) | |||
1217 | } | 1726 | } |
1218 | 1727 | ||
1219 | /* | 1728 | /* |
1220 | * Delayed printk facility, for scheduler-internal messages: | 1729 | * Delayed printk version, for scheduler-internal messages: |
1221 | */ | 1730 | */ |
1222 | #define PRINTK_BUF_SIZE 512 | 1731 | #define PRINTK_BUF_SIZE 512 |
1223 | 1732 | ||
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void) | |||
1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1762 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1254 | } | 1763 | } |
1255 | 1764 | ||
1765 | /* the next printk record to write to the console */ | ||
1766 | static u64 console_seq; | ||
1767 | static u32 console_idx; | ||
1768 | |||
1256 | /** | 1769 | /** |
1257 | * console_unlock - unlock the console system | 1770 | * console_unlock - unlock the console system |
1258 | * | 1771 | * |
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void) | |||
1263 | * by printk(). If this is the case, console_unlock(); emits | 1776 | * by printk(). If this is the case, console_unlock(); emits |
1264 | * the output prior to releasing the lock. | 1777 | * the output prior to releasing the lock. |
1265 | * | 1778 | * |
1266 | * If there is output waiting for klogd, we wake it up. | 1779 | * If there is output waiting, we wake /dev/kmsg and syslog() users. |
1267 | * | 1780 | * |
1268 | * console_unlock(); may be called from any context. | 1781 | * console_unlock(); may be called from any context. |
1269 | */ | 1782 | */ |
1270 | void console_unlock(void) | 1783 | void console_unlock(void) |
1271 | { | 1784 | { |
1785 | static u64 seen_seq; | ||
1272 | unsigned long flags; | 1786 | unsigned long flags; |
1273 | unsigned _con_start, _log_end; | 1787 | bool wake_klogd = false; |
1274 | unsigned wake_klogd = 0, retry = 0; | 1788 | bool retry; |
1275 | 1789 | ||
1276 | if (console_suspended) { | 1790 | if (console_suspended) { |
1277 | up(&console_sem); | 1791 | up(&console_sem); |
@@ -1281,17 +1795,38 @@ void console_unlock(void) | |||
1281 | console_may_schedule = 0; | 1795 | console_may_schedule = 0; |
1282 | 1796 | ||
1283 | again: | 1797 | again: |
1284 | for ( ; ; ) { | 1798 | for (;;) { |
1799 | struct log *msg; | ||
1800 | static char text[LOG_LINE_MAX]; | ||
1801 | size_t len; | ||
1802 | int level; | ||
1803 | |||
1285 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 1804 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1286 | wake_klogd |= log_start - log_end; | 1805 | if (seen_seq != log_next_seq) { |
1287 | if (con_start == log_end) | 1806 | wake_klogd = true; |
1288 | break; /* Nothing to print */ | 1807 | seen_seq = log_next_seq; |
1289 | _con_start = con_start; | 1808 | } |
1290 | _log_end = log_end; | 1809 | |
1291 | con_start = log_end; /* Flush */ | 1810 | if (console_seq < log_first_seq) { |
1811 | /* messages are gone, move to first one */ | ||
1812 | console_seq = log_first_seq; | ||
1813 | console_idx = log_first_idx; | ||
1814 | } | ||
1815 | |||
1816 | if (console_seq == log_next_seq) | ||
1817 | break; | ||
1818 | |||
1819 | msg = log_from_idx(console_idx); | ||
1820 | level = msg->level & 7; | ||
1821 | |||
1822 | len = msg_print_text(msg, false, text, sizeof(text)); | ||
1823 | |||
1824 | console_idx = log_next(console_idx); | ||
1825 | console_seq++; | ||
1292 | raw_spin_unlock(&logbuf_lock); | 1826 | raw_spin_unlock(&logbuf_lock); |
1827 | |||
1293 | stop_critical_timings(); /* don't trace print latency */ | 1828 | stop_critical_timings(); /* don't trace print latency */ |
1294 | call_console_drivers(_con_start, _log_end); | 1829 | call_console_drivers(level, text, len); |
1295 | start_critical_timings(); | 1830 | start_critical_timings(); |
1296 | local_irq_restore(flags); | 1831 | local_irq_restore(flags); |
1297 | } | 1832 | } |
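In the loop above, console_unlock() no longer flushes a byte range between con_start and log_end; it advances its own (console_seq, console_idx) cursor one record at a time and, when the writer has meanwhile recycled the records it wanted, jumps to the oldest record still in the buffer. A self-contained toy model of that cursor logic, using a fixed-slot ring of strings instead of variable-length records (names and sizes here are invented for illustration):

    #include <stdio.h>

    #define NRECS 4                         /* pretend the ring keeps 4 records */

    static const char *ring[NRECS];         /* record text, slot = seq % NRECS  */
    static unsigned long long first_seq;    /* oldest record still stored       */
    static unsigned long long next_seq;     /* sequence of the next write       */

    static void log_write(const char *text)
    {
        ring[next_seq % NRECS] = text;
        next_seq++;
        if (next_seq - first_seq > NRECS)
            first_seq++;                    /* oldest record was overwritten */
    }

    static void console_flush(unsigned long long *seq)
    {
        if (*seq < first_seq)
            *seq = first_seq;               /* messages are gone, move to first one */
        while (*seq < next_seq) {
            printf("seq %llu: %s\n", *seq, ring[*seq % NRECS]);
            (*seq)++;
        }
    }

    int main(void)
    {
        unsigned long long console_pos = 0; /* console_seq analogue */

        log_write("A"); log_write("B");
        console_flush(&console_pos);        /* prints A, B */

        /* five more writes recycle seq 2, which the console has not seen yet */
        log_write("C"); log_write("D"); log_write("E");
        log_write("F"); log_write("G");
        console_flush(&console_pos);        /* skips ahead: prints D, E, F, G */
        return 0;
    }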
@@ -1312,8 +1847,7 @@ again: | |||
1312 | * flush, no worries. | 1847 | * flush, no worries. |
1313 | */ | 1848 | */ |
1314 | raw_spin_lock(&logbuf_lock); | 1849 | raw_spin_lock(&logbuf_lock); |
1315 | if (con_start != log_end) | 1850 | retry = console_seq != log_next_seq; |
1316 | retry = 1; | ||
1317 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 1851 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1318 | 1852 | ||
1319 | if (retry && console_trylock()) | 1853 | if (retry && console_trylock()) |
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon) | |||
1549 | * for us. | 2083 | * for us. |
1550 | */ | 2084 | */ |
1551 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2085 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1552 | con_start = log_start; | 2086 | console_seq = syslog_seq; |
2087 | console_idx = syslog_idx; | ||
1553 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2088 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1554 | /* | 2089 | /* |
1555 | * We're about to replay the log buffer. Only do this to the | 2090 | * We're about to replay the log buffer. Only do this to the |
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1758 | } | 2293 | } |
1759 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 2294 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1760 | 2295 | ||
2296 | static bool always_kmsg_dump; | ||
2297 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
2298 | |||
1761 | /** | 2299 | /** |
1762 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2300 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1763 | * @reason: the reason (oops, panic etc) for dumping | 2301 | * @reason: the reason (oops, panic etc) for dumping |
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | |||
1767 | */ | 2305 | */ |
1768 | void kmsg_dump(enum kmsg_dump_reason reason) | 2306 | void kmsg_dump(enum kmsg_dump_reason reason) |
1769 | { | 2307 | { |
1770 | unsigned long end; | 2308 | u64 idx; |
1771 | unsigned chars; | ||
1772 | struct kmsg_dumper *dumper; | 2309 | struct kmsg_dumper *dumper; |
1773 | const char *s1, *s2; | 2310 | const char *s1, *s2; |
1774 | unsigned long l1, l2; | 2311 | unsigned long l1, l2; |
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1780 | /* Theoretically, the log could move on after we do this, but | 2317 | /* Theoretically, the log could move on after we do this, but |
1781 | there's not a lot we can do about that. The new messages | 2318 | there's not a lot we can do about that. The new messages |
1782 | will overwrite the start of what we dump. */ | 2319 | will overwrite the start of what we dump. */ |
2320 | |||
1783 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2321 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1784 | end = log_end & LOG_BUF_MASK; | 2322 | if (syslog_seq < log_first_seq) |
1785 | chars = logged_chars; | 2323 | idx = syslog_idx; |
1786 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2324 | else |
2325 | idx = log_first_idx; | ||
1787 | 2326 | ||
1788 | if (chars > end) { | 2327 | if (idx > log_next_idx) { |
1789 | s1 = log_buf + log_buf_len - chars + end; | 2328 | s1 = log_buf; |
1790 | l1 = chars - end; | 2329 | l1 = log_next_idx; |
1791 | 2330 | ||
1792 | s2 = log_buf; | 2331 | s2 = log_buf + idx; |
1793 | l2 = end; | 2332 | l2 = log_buf_len - idx; |
1794 | } else { | 2333 | } else { |
1795 | s1 = ""; | 2334 | s1 = ""; |
1796 | l1 = 0; | 2335 | l1 = 0; |
1797 | 2336 | ||
1798 | s2 = log_buf + end - chars; | 2337 | s2 = log_buf + idx; |
1799 | l2 = chars; | 2338 | l2 = log_next_idx - idx; |
1800 | } | 2339 | } |
2340 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1801 | 2341 | ||
1802 | rcu_read_lock(); | 2342 | rcu_read_lock(); |
1803 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2343 | list_for_each_entry_rcu(dumper, &dump_list, list) |
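kmsg_dump() above now hands dumpers the bytes between the oldest and the next record index, split into two chunks when that region wraps around the end of log_buf. A small standalone sketch of the split, mirroring the if/else above (the function and struct names are illustrative, not kernel API):

    #include <stdio.h>
    #include <stddef.h>

    struct chunk { const char *ptr; size_t len; };

    /* mirrors the s1/l1, s2/l2 assignment in kmsg_dump() above */
    static void split_dump(const char *buf, size_t buf_len,
                           size_t first_idx, size_t next_idx,
                           struct chunk *c1, struct chunk *c2)
    {
        if (first_idx > next_idx) {
            /* wrapped: [buf, next_idx) plus [first_idx, buf_len) */
            c1->ptr = buf;
            c1->len = next_idx;
            c2->ptr = buf + first_idx;
            c2->len = buf_len - first_idx;
        } else {
            /* contiguous: a single chunk, the first one stays empty */
            c1->ptr = "";
            c1->len = 0;
            c2->ptr = buf + first_idx;
            c2->len = next_idx - first_idx;
        }
    }

    int main(void)
    {
        char buf[16];                       /* stand-in for log_buf */
        struct chunk c1, c2;

        split_dump(buf, sizeof(buf), 12, 5, &c1, &c2);  /* wrapped case    */
        printf("chunk1: %zu bytes, chunk2: %zu bytes\n", c1.len, c2.len);

        split_dump(buf, sizeof(buf), 3, 9, &c1, &c2);   /* contiguous case */
        printf("chunk1: %zu bytes, chunk2: %zu bytes\n", c1.len, c2.len);
        return 0;
    }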
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc27..95cba41ce1e9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -51,6 +51,34 @@ | |||
51 | 51 | ||
52 | #include "rcu.h" | 52 | #include "rcu.h" |
53 | 53 | ||
54 | #ifdef CONFIG_PREEMPT_RCU | ||
55 | |||
56 | /* | ||
57 | * Check for a task exiting while in a preemptible-RCU read-side | ||
58 | * critical section, clean up if so. No need to issue warnings, | ||
59 | * as debug_check_no_locks_held() already does this if lockdep | ||
60 | * is enabled. | ||
61 | */ | ||
62 | void exit_rcu(void) | ||
63 | { | ||
64 | struct task_struct *t = current; | ||
65 | |||
66 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
67 | return; | ||
68 | t->rcu_read_lock_nesting = 1; | ||
69 | barrier(); | ||
70 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
71 | __rcu_read_unlock(); | ||
72 | } | ||
73 | |||
74 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
75 | |||
76 | void exit_rcu(void) | ||
77 | { | ||
78 | } | ||
79 | |||
80 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
81 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 82 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | static struct lock_class_key rcu_lock_key; | 83 | static struct lock_class_key rcu_lock_key; |
56 | struct lockdep_map rcu_lock_map = | 84 | struct lockdep_map rcu_lock_map = |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb62..fc31a2d65100 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) | |||
851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | ||
855 | * Check for a task exiting while in a preemptible -RCU read-side | ||
856 | * critical section, clean up if so. No need to issue warnings, | ||
857 | * as debug_check_no_locks_held() already does this if lockdep | ||
858 | * is enabled. | ||
859 | */ | ||
860 | void exit_rcu(void) | ||
861 | { | ||
862 | struct task_struct *t = current; | ||
863 | |||
864 | if (t->rcu_read_lock_nesting == 0) | ||
865 | return; | ||
866 | t->rcu_read_lock_nesting = 1; | ||
867 | __rcu_read_unlock(); | ||
868 | } | ||
869 | |||
870 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 854 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
871 | 855 | ||
872 | #ifdef CONFIG_RCU_TRACE | 856 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6e..e66b34ab7555 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | 68 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ |
68 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | 69 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ |
69 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | 70 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ |
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); | |||
96 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 97 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
97 | module_param(fqs_stutter, int, 0444); | 98 | module_param(fqs_stutter, int, 0444); |
98 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 99 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
100 | module_param(n_barrier_cbs, int, 0444); | ||
101 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | ||
99 | module_param(onoff_interval, int, 0444); | 102 | module_param(onoff_interval, int, 0444); |
100 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 103 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); |
101 | module_param(onoff_holdoff, int, 0444); | 104 | module_param(onoff_holdoff, int, 0444); |
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; | |||
139 | static struct task_struct *onoff_task; | 142 | static struct task_struct *onoff_task; |
140 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 143 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
141 | static struct task_struct *stall_task; | 144 | static struct task_struct *stall_task; |
145 | static struct task_struct **barrier_cbs_tasks; | ||
146 | static struct task_struct *barrier_task; | ||
142 | 147 | ||
143 | #define RCU_TORTURE_PIPE_LEN 10 | 148 | #define RCU_TORTURE_PIPE_LEN 10 |
144 | 149 | ||
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
164 | static atomic_t n_rcu_torture_free; | 169 | static atomic_t n_rcu_torture_free; |
165 | static atomic_t n_rcu_torture_mberror; | 170 | static atomic_t n_rcu_torture_mberror; |
166 | static atomic_t n_rcu_torture_error; | 171 | static atomic_t n_rcu_torture_error; |
172 | static long n_rcu_torture_barrier_error; | ||
167 | static long n_rcu_torture_boost_ktrerror; | 173 | static long n_rcu_torture_boost_ktrerror; |
168 | static long n_rcu_torture_boost_rterror; | 174 | static long n_rcu_torture_boost_rterror; |
169 | static long n_rcu_torture_boost_failure; | 175 | static long n_rcu_torture_boost_failure; |
@@ -173,6 +179,8 @@ static long n_offline_attempts; | |||
173 | static long n_offline_successes; | 179 | static long n_offline_successes; |
174 | static long n_online_attempts; | 180 | static long n_online_attempts; |
175 | static long n_online_successes; | 181 | static long n_online_successes; |
182 | static long n_barrier_attempts; | ||
183 | static long n_barrier_successes; | ||
176 | static struct list_head rcu_torture_removed; | 184 | static struct list_head rcu_torture_removed; |
177 | static cpumask_var_t shuffle_tmp_mask; | 185 | static cpumask_var_t shuffle_tmp_mask; |
178 | 186 | ||
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ | |||
197 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 205 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
198 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
199 | /* and boost task create/destroy. */ | 207 | /* and boost task create/destroy. */ |
208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | ||
209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | ||
210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | ||
211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | ||
200 | 212 | ||
201 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 213 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
202 | 214 | ||
@@ -327,6 +339,7 @@ struct rcu_torture_ops { | |||
327 | int (*completed)(void); | 339 | int (*completed)(void); |
328 | void (*deferred_free)(struct rcu_torture *p); | 340 | void (*deferred_free)(struct rcu_torture *p); |
329 | void (*sync)(void); | 341 | void (*sync)(void); |
342 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
330 | void (*cb_barrier)(void); | 343 | void (*cb_barrier)(void); |
331 | void (*fqs)(void); | 344 | void (*fqs)(void); |
332 | int (*stats)(char *page); | 345 | int (*stats)(char *page); |
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
417 | .completed = rcu_torture_completed, | 430 | .completed = rcu_torture_completed, |
418 | .deferred_free = rcu_torture_deferred_free, | 431 | .deferred_free = rcu_torture_deferred_free, |
419 | .sync = synchronize_rcu, | 432 | .sync = synchronize_rcu, |
433 | .call = call_rcu, | ||
420 | .cb_barrier = rcu_barrier, | 434 | .cb_barrier = rcu_barrier, |
421 | .fqs = rcu_force_quiescent_state, | 435 | .fqs = rcu_force_quiescent_state, |
422 | .stats = NULL, | 436 | .stats = NULL, |
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
460 | .completed = rcu_torture_completed, | 474 | .completed = rcu_torture_completed, |
461 | .deferred_free = rcu_sync_torture_deferred_free, | 475 | .deferred_free = rcu_sync_torture_deferred_free, |
462 | .sync = synchronize_rcu, | 476 | .sync = synchronize_rcu, |
477 | .call = NULL, | ||
463 | .cb_barrier = NULL, | 478 | .cb_barrier = NULL, |
464 | .fqs = rcu_force_quiescent_state, | 479 | .fqs = rcu_force_quiescent_state, |
465 | .stats = NULL, | 480 | .stats = NULL, |
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
477 | .completed = rcu_no_completed, | 492 | .completed = rcu_no_completed, |
478 | .deferred_free = rcu_sync_torture_deferred_free, | 493 | .deferred_free = rcu_sync_torture_deferred_free, |
479 | .sync = synchronize_rcu_expedited, | 494 | .sync = synchronize_rcu_expedited, |
495 | .call = NULL, | ||
480 | .cb_barrier = NULL, | 496 | .cb_barrier = NULL, |
481 | .fqs = rcu_force_quiescent_state, | 497 | .fqs = rcu_force_quiescent_state, |
482 | .stats = NULL, | 498 | .stats = NULL, |
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
519 | .completed = rcu_bh_torture_completed, | 535 | .completed = rcu_bh_torture_completed, |
520 | .deferred_free = rcu_bh_torture_deferred_free, | 536 | .deferred_free = rcu_bh_torture_deferred_free, |
521 | .sync = synchronize_rcu_bh, | 537 | .sync = synchronize_rcu_bh, |
538 | .call = call_rcu_bh, | ||
522 | .cb_barrier = rcu_barrier_bh, | 539 | .cb_barrier = rcu_barrier_bh, |
523 | .fqs = rcu_bh_force_quiescent_state, | 540 | .fqs = rcu_bh_force_quiescent_state, |
524 | .stats = NULL, | 541 | .stats = NULL, |
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
535 | .completed = rcu_bh_torture_completed, | 552 | .completed = rcu_bh_torture_completed, |
536 | .deferred_free = rcu_sync_torture_deferred_free, | 553 | .deferred_free = rcu_sync_torture_deferred_free, |
537 | .sync = synchronize_rcu_bh, | 554 | .sync = synchronize_rcu_bh, |
555 | .call = NULL, | ||
538 | .cb_barrier = NULL, | 556 | .cb_barrier = NULL, |
539 | .fqs = rcu_bh_force_quiescent_state, | 557 | .fqs = rcu_bh_force_quiescent_state, |
540 | .stats = NULL, | 558 | .stats = NULL, |
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
551 | .completed = rcu_bh_torture_completed, | 569 | .completed = rcu_bh_torture_completed, |
552 | .deferred_free = rcu_sync_torture_deferred_free, | 570 | .deferred_free = rcu_sync_torture_deferred_free, |
553 | .sync = synchronize_rcu_bh_expedited, | 571 | .sync = synchronize_rcu_bh_expedited, |
572 | .call = NULL, | ||
554 | .cb_barrier = NULL, | 573 | .cb_barrier = NULL, |
555 | .fqs = rcu_bh_force_quiescent_state, | 574 | .fqs = rcu_bh_force_quiescent_state, |
556 | .stats = NULL, | 575 | .stats = NULL, |
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void) | |||
606 | return srcu_batches_completed(&srcu_ctl); | 625 | return srcu_batches_completed(&srcu_ctl); |
607 | } | 626 | } |
608 | 627 | ||
628 | static void srcu_torture_deferred_free(struct rcu_torture *rp) | ||
629 | { | ||
630 | call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); | ||
631 | } | ||
632 | |||
609 | static void srcu_torture_synchronize(void) | 633 | static void srcu_torture_synchronize(void) |
610 | { | 634 | { |
611 | synchronize_srcu(&srcu_ctl); | 635 | synchronize_srcu(&srcu_ctl); |
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page) | |||
620 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 644 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", |
621 | torture_type, TORTURE_FLAG, idx); | 645 | torture_type, TORTURE_FLAG, idx); |
622 | for_each_possible_cpu(cpu) { | 646 | for_each_possible_cpu(cpu) { |
623 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, | 647 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, |
624 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 648 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
625 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 649 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
626 | } | 650 | } |
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = { | |||
635 | .read_delay = srcu_read_delay, | 659 | .read_delay = srcu_read_delay, |
636 | .readunlock = srcu_torture_read_unlock, | 660 | .readunlock = srcu_torture_read_unlock, |
637 | .completed = srcu_torture_completed, | 661 | .completed = srcu_torture_completed, |
638 | .deferred_free = rcu_sync_torture_deferred_free, | 662 | .deferred_free = srcu_torture_deferred_free, |
639 | .sync = srcu_torture_synchronize, | 663 | .sync = srcu_torture_synchronize, |
664 | .call = NULL, | ||
640 | .cb_barrier = NULL, | 665 | .cb_barrier = NULL, |
641 | .stats = srcu_torture_stats, | 666 | .stats = srcu_torture_stats, |
642 | .name = "srcu" | 667 | .name = "srcu" |
643 | }; | 668 | }; |
644 | 669 | ||
670 | static struct rcu_torture_ops srcu_sync_ops = { | ||
671 | .init = srcu_torture_init, | ||
672 | .cleanup = srcu_torture_cleanup, | ||
673 | .readlock = srcu_torture_read_lock, | ||
674 | .read_delay = srcu_read_delay, | ||
675 | .readunlock = srcu_torture_read_unlock, | ||
676 | .completed = srcu_torture_completed, | ||
677 | .deferred_free = rcu_sync_torture_deferred_free, | ||
678 | .sync = srcu_torture_synchronize, | ||
679 | .call = NULL, | ||
680 | .cb_barrier = NULL, | ||
681 | .stats = srcu_torture_stats, | ||
682 | .name = "srcu_sync" | ||
683 | }; | ||
684 | |||
645 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | 685 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) |
646 | { | 686 | { |
647 | return srcu_read_lock_raw(&srcu_ctl); | 687 | return srcu_read_lock_raw(&srcu_ctl); |
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
659 | .read_delay = srcu_read_delay, | 699 | .read_delay = srcu_read_delay, |
660 | .readunlock = srcu_torture_read_unlock_raw, | 700 | .readunlock = srcu_torture_read_unlock_raw, |
661 | .completed = srcu_torture_completed, | 701 | .completed = srcu_torture_completed, |
662 | .deferred_free = rcu_sync_torture_deferred_free, | 702 | .deferred_free = srcu_torture_deferred_free, |
663 | .sync = srcu_torture_synchronize, | 703 | .sync = srcu_torture_synchronize, |
704 | .call = NULL, | ||
664 | .cb_barrier = NULL, | 705 | .cb_barrier = NULL, |
665 | .stats = srcu_torture_stats, | 706 | .stats = srcu_torture_stats, |
666 | .name = "srcu_raw" | 707 | .name = "srcu_raw" |
667 | }; | 708 | }; |
668 | 709 | ||
710 | static struct rcu_torture_ops srcu_raw_sync_ops = { | ||
711 | .init = srcu_torture_init, | ||
712 | .cleanup = srcu_torture_cleanup, | ||
713 | .readlock = srcu_torture_read_lock_raw, | ||
714 | .read_delay = srcu_read_delay, | ||
715 | .readunlock = srcu_torture_read_unlock_raw, | ||
716 | .completed = srcu_torture_completed, | ||
717 | .deferred_free = rcu_sync_torture_deferred_free, | ||
718 | .sync = srcu_torture_synchronize, | ||
719 | .call = NULL, | ||
720 | .cb_barrier = NULL, | ||
721 | .stats = srcu_torture_stats, | ||
722 | .name = "srcu_raw_sync" | ||
723 | }; | ||
724 | |||
669 | static void srcu_torture_synchronize_expedited(void) | 725 | static void srcu_torture_synchronize_expedited(void) |
670 | { | 726 | { |
671 | synchronize_srcu_expedited(&srcu_ctl); | 727 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { | |||
680 | .completed = srcu_torture_completed, | 736 | .completed = srcu_torture_completed, |
681 | .deferred_free = rcu_sync_torture_deferred_free, | 737 | .deferred_free = rcu_sync_torture_deferred_free, |
682 | .sync = srcu_torture_synchronize_expedited, | 738 | .sync = srcu_torture_synchronize_expedited, |
739 | .call = NULL, | ||
683 | .cb_barrier = NULL, | 740 | .cb_barrier = NULL, |
684 | .stats = srcu_torture_stats, | 741 | .stats = srcu_torture_stats, |
685 | .name = "srcu_expedited" | 742 | .name = "srcu_expedited" |
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page) | |||
1129 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1130 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1131 | "rtbf: %ld rtb: %ld nt: %ld " | 1188 | "rtbf: %ld rtb: %ld nt: %ld " |
1132 | "onoff: %ld/%ld:%ld/%ld", | 1189 | "onoff: %ld/%ld:%ld/%ld " |
1190 | "barrier: %ld/%ld:%ld", | ||
1133 | rcu_torture_current, | 1191 | rcu_torture_current, |
1134 | rcu_torture_current_version, | 1192 | rcu_torture_current_version, |
1135 | list_empty(&rcu_torture_freelist), | 1193 | list_empty(&rcu_torture_freelist), |
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page) | |||
1145 | n_online_successes, | 1203 | n_online_successes, |
1146 | n_online_attempts, | 1204 | n_online_attempts, |
1147 | n_offline_successes, | 1205 | n_offline_successes, |
1148 | n_offline_attempts); | 1206 | n_offline_attempts, |
1207 | n_barrier_successes, | ||
1208 | n_barrier_attempts, | ||
1209 | n_rcu_torture_barrier_error); | ||
1210 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1149 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1211 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1212 | n_rcu_torture_barrier_error != 0 || | ||
1150 | n_rcu_torture_boost_ktrerror != 0 || | 1213 | n_rcu_torture_boost_ktrerror != 0 || |
1151 | n_rcu_torture_boost_rterror != 0 || | 1214 | n_rcu_torture_boost_rterror != 0 || |
1152 | n_rcu_torture_boost_failure != 0) | 1215 | n_rcu_torture_boost_failure != 0 || |
1153 | cnt += sprintf(&page[cnt], " !!!"); | 1216 | i > 1) { |
1154 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1155 | if (i > 1) { | ||
1156 | cnt += sprintf(&page[cnt], "!!! "); | 1217 | cnt += sprintf(&page[cnt], "!!! "); |
1157 | atomic_inc(&n_rcu_torture_error); | 1218 | atomic_inc(&n_rcu_torture_error); |
1158 | WARN_ON_ONCE(1); | 1219 | WARN_ON_ONCE(1); |
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu) | |||
1337 | 1398 | ||
1338 | /* This must be outside of the mutex, otherwise deadlock! */ | 1399 | /* This must be outside of the mutex, otherwise deadlock! */ |
1339 | kthread_stop(t); | 1400 | kthread_stop(t); |
1401 | boost_tasks[cpu] = NULL; | ||
1340 | } | 1402 | } |
1341 | 1403 | ||
1342 | static int rcutorture_booster_init(int cpu) | 1404 | static int rcutorture_booster_init(int cpu) |
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void) | |||
1484 | return; | 1546 | return; |
1485 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | 1547 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); |
1486 | kthread_stop(onoff_task); | 1548 | kthread_stop(onoff_task); |
1549 | onoff_task = NULL; | ||
1487 | } | 1550 | } |
1488 | 1551 | ||
1489 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1552 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1490 | 1553 | ||
1491 | static void | 1554 | static int |
1492 | rcu_torture_onoff_init(void) | 1555 | rcu_torture_onoff_init(void) |
1493 | { | 1556 | { |
1557 | return 0; | ||
1494 | } | 1558 | } |
1495 | 1559 | ||
1496 | static void rcu_torture_onoff_cleanup(void) | 1560 | static void rcu_torture_onoff_cleanup(void) |
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void) | |||
1554 | return; | 1618 | return; |
1555 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | 1619 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); |
1556 | kthread_stop(stall_task); | 1620 | kthread_stop(stall_task); |
1621 | stall_task = NULL; | ||
1622 | } | ||
1623 | |||
1624 | /* Callback function for RCU barrier testing. */ | ||
1625 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | ||
1626 | { | ||
1627 | atomic_inc(&barrier_cbs_invoked); | ||
1628 | } | ||
1629 | |||
1630 | /* kthread function to register callbacks used to test RCU barriers. */ | ||
1631 | static int rcu_torture_barrier_cbs(void *arg) | ||
1632 | { | ||
1633 | long myid = (long)arg; | ||
1634 | struct rcu_head rcu; | ||
1635 | |||
1636 | init_rcu_head_on_stack(&rcu); | ||
1637 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); | ||
1638 | set_user_nice(current, 19); | ||
1639 | do { | ||
1640 | wait_event(barrier_cbs_wq[myid], | ||
1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | ||
1642 | kthread_should_stop() || | ||
1643 | fullstop != FULLSTOP_DONTSTOP); | ||
1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1645 | break; | ||
1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | ||
1647 | if (atomic_dec_and_test(&barrier_cbs_count)) | ||
1648 | wake_up(&barrier_wq); | ||
1649 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1650 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); | ||
1651 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1652 | while (!kthread_should_stop()) | ||
1653 | schedule_timeout_interruptible(1); | ||
1654 | cur_ops->cb_barrier(); | ||
1655 | destroy_rcu_head_on_stack(&rcu); | ||
1656 | return 0; | ||
1657 | } | ||
1658 | |||
1659 | /* kthread function to drive and coordinate RCU barrier testing. */ | ||
1660 | static int rcu_torture_barrier(void *arg) | ||
1661 | { | ||
1662 | int i; | ||
1663 | |||
1664 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); | ||
1665 | do { | ||
1666 | atomic_set(&barrier_cbs_invoked, 0); | ||
1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | ||
1668 | /* wake_up() path contains the required barriers. */ | ||
1669 | for (i = 0; i < n_barrier_cbs; i++) | ||
1670 | wake_up(&barrier_cbs_wq[i]); | ||
1671 | wait_event(barrier_wq, | ||
1672 | atomic_read(&barrier_cbs_count) == 0 || | ||
1673 | kthread_should_stop() || | ||
1674 | fullstop != FULLSTOP_DONTSTOP); | ||
1675 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1676 | break; | ||
1677 | n_barrier_attempts++; | ||
1678 | cur_ops->cb_barrier(); | ||
1679 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | ||
1680 | n_rcu_torture_barrier_error++; | ||
1681 | WARN_ON_ONCE(1); | ||
1682 | } | ||
1683 | n_barrier_successes++; | ||
1684 | schedule_timeout_interruptible(HZ / 10); | ||
1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | ||
1687 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1688 | while (!kthread_should_stop()) | ||
1689 | schedule_timeout_interruptible(1); | ||
1690 | return 0; | ||
1691 | } | ||
1692 | |||
1693 | /* Initialize RCU barrier testing. */ | ||
1694 | static int rcu_torture_barrier_init(void) | ||
1695 | { | ||
1696 | int i; | ||
1697 | int ret; | ||
1698 | |||
1699 | if (n_barrier_cbs == 0) | ||
1700 | return 0; | ||
1701 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | ||
1702 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1703 | " Call or barrier ops missing for %s,\n", | ||
1704 | torture_type, cur_ops->name); | ||
1705 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1706 | " RCU barrier testing omitted from run.\n", | ||
1707 | torture_type); | ||
1708 | return 0; | ||
1709 | } | ||
1710 | atomic_set(&barrier_cbs_count, 0); | ||
1711 | atomic_set(&barrier_cbs_invoked, 0); | ||
1712 | barrier_cbs_tasks = | ||
1713 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | ||
1714 | GFP_KERNEL); | ||
1715 | barrier_cbs_wq = | ||
1716 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | ||
1717 | GFP_KERNEL); | ||
1718 | if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) | ||
1719 | return -ENOMEM; | ||
1720 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1721 | init_waitqueue_head(&barrier_cbs_wq[i]); | ||
1722 | barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, | ||
1723 | (void *)(long)i, | ||
1724 | "rcu_torture_barrier_cbs"); | ||
1725 | if (IS_ERR(barrier_cbs_tasks[i])) { | ||
1726 | ret = PTR_ERR(barrier_cbs_tasks[i]); | ||
1727 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); | ||
1728 | barrier_cbs_tasks[i] = NULL; | ||
1729 | return ret; | ||
1730 | } | ||
1731 | } | ||
1732 | barrier_task = kthread_run(rcu_torture_barrier, NULL, | ||
1733 | "rcu_torture_barrier"); | ||
1734 | if (IS_ERR(barrier_task)) { | ||
1735 | ret = PTR_ERR(barrier_task); | ||
1736 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); | ||
1737 | barrier_task = NULL; | ||
1738 | } | ||
1739 | return 0; | ||
1740 | } | ||
1741 | |||
1742 | /* Clean up after RCU barrier testing. */ | ||
1743 | static void rcu_torture_barrier_cleanup(void) | ||
1744 | { | ||
1745 | int i; | ||
1746 | |||
1747 | if (barrier_task != NULL) { | ||
1748 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); | ||
1749 | kthread_stop(barrier_task); | ||
1750 | barrier_task = NULL; | ||
1751 | } | ||
1752 | if (barrier_cbs_tasks != NULL) { | ||
1753 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1754 | if (barrier_cbs_tasks[i] != NULL) { | ||
1755 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); | ||
1756 | kthread_stop(barrier_cbs_tasks[i]); | ||
1757 | barrier_cbs_tasks[i] = NULL; | ||
1758 | } | ||
1759 | } | ||
1760 | kfree(barrier_cbs_tasks); | ||
1761 | barrier_cbs_tasks = NULL; | ||
1762 | } | ||
1763 | if (barrier_cbs_wq != NULL) { | ||
1764 | kfree(barrier_cbs_wq); | ||
1765 | barrier_cbs_wq = NULL; | ||
1766 | } | ||
1557 | } | 1767 | } |
1558 | 1768 | ||
1559 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1769 | static int rcutorture_cpu_notify(struct notifier_block *self, |
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void) | |||
1598 | fullstop = FULLSTOP_RMMOD; | 1808 | fullstop = FULLSTOP_RMMOD; |
1599 | mutex_unlock(&fullstop_mutex); | 1809 | mutex_unlock(&fullstop_mutex); |
1600 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1810 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1811 | rcu_torture_barrier_cleanup(); | ||
1601 | rcu_torture_stall_cleanup(); | 1812 | rcu_torture_stall_cleanup(); |
1602 | if (stutter_task) { | 1813 | if (stutter_task) { |
1603 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1814 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void) | |||
1665 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | 1876 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); |
1666 | kthread_stop(shutdown_task); | 1877 | kthread_stop(shutdown_task); |
1667 | } | 1878 | } |
1879 | shutdown_task = NULL; | ||
1668 | rcu_torture_onoff_cleanup(); | 1880 | rcu_torture_onoff_cleanup(); |
1669 | 1881 | ||
1670 | /* Wait for all RCU callbacks to fire. */ | 1882 | /* Wait for all RCU callbacks to fire. */ |
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void) | |||
1676 | 1888 | ||
1677 | if (cur_ops->cleanup) | 1889 | if (cur_ops->cleanup) |
1678 | cur_ops->cleanup(); | 1890 | cur_ops->cleanup(); |
1679 | if (atomic_read(&n_rcu_torture_error)) | 1891 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1680 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1892 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1681 | else if (n_online_successes != n_online_attempts || | 1893 | else if (n_online_successes != n_online_attempts || |
1682 | n_offline_successes != n_offline_attempts) | 1894 | n_offline_successes != n_offline_attempts) |
@@ -1692,10 +1904,12 @@ rcu_torture_init(void) | |||
1692 | int i; | 1904 | int i; |
1693 | int cpu; | 1905 | int cpu; |
1694 | int firsterr = 0; | 1906 | int firsterr = 0; |
1907 | int retval; | ||
1695 | static struct rcu_torture_ops *torture_ops[] = | 1908 | static struct rcu_torture_ops *torture_ops[] = |
1696 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1697 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1698 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, | 1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, |
1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | ||
1699 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1700 | 1914 | ||
1701 | mutex_lock(&fullstop_mutex); | 1915 | mutex_lock(&fullstop_mutex); |
@@ -1749,6 +1963,7 @@ rcu_torture_init(void) | |||
1749 | atomic_set(&n_rcu_torture_free, 0); | 1963 | atomic_set(&n_rcu_torture_free, 0); |
1750 | atomic_set(&n_rcu_torture_mberror, 0); | 1964 | atomic_set(&n_rcu_torture_mberror, 0); |
1751 | atomic_set(&n_rcu_torture_error, 0); | 1965 | atomic_set(&n_rcu_torture_error, 0); |
1966 | n_rcu_torture_barrier_error = 0; | ||
1752 | n_rcu_torture_boost_ktrerror = 0; | 1967 | n_rcu_torture_boost_ktrerror = 0; |
1753 | n_rcu_torture_boost_rterror = 0; | 1968 | n_rcu_torture_boost_rterror = 0; |
1754 | n_rcu_torture_boost_failure = 0; | 1969 | n_rcu_torture_boost_failure = 0; |
@@ -1872,7 +2087,6 @@ rcu_torture_init(void) | |||
1872 | test_boost_duration = 2; | 2087 | test_boost_duration = 2; |
1873 | if ((test_boost == 1 && cur_ops->can_boost) || | 2088 | if ((test_boost == 1 && cur_ops->can_boost) || |
1874 | test_boost == 2) { | 2089 | test_boost == 2) { |
1875 | int retval; | ||
1876 | 2090 | ||
1877 | boost_starttime = jiffies + test_boost_interval * HZ; | 2091 | boost_starttime = jiffies + test_boost_interval * HZ; |
1878 | register_cpu_notifier(&rcutorture_cpu_nb); | 2092 | register_cpu_notifier(&rcutorture_cpu_nb); |
@@ -1897,9 +2111,22 @@ rcu_torture_init(void) | |||
1897 | goto unwind; | 2111 | goto unwind; |
1898 | } | 2112 | } |
1899 | } | 2113 | } |
1900 | rcu_torture_onoff_init(); | 2114 | i = rcu_torture_onoff_init(); |
2115 | if (i != 0) { | ||
2116 | firsterr = i; | ||
2117 | goto unwind; | ||
2118 | } | ||
1901 | register_reboot_notifier(&rcutorture_shutdown_nb); | 2119 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1902 | rcu_torture_stall_init(); | 2120 | i = rcu_torture_stall_init(); |
2121 | if (i != 0) { | ||
2122 | firsterr = i; | ||
2123 | goto unwind; | ||
2124 | } | ||
2125 | retval = rcu_torture_barrier_init(); | ||
2126 | if (retval != 0) { | ||
2127 | firsterr = retval; | ||
2128 | goto unwind; | ||
2129 | } | ||
1903 | rcutorture_record_test_transition(); | 2130 | rcutorture_record_test_transition(); |
1904 | mutex_unlock(&fullstop_mutex); | 2131 | mutex_unlock(&fullstop_mutex); |
1905 | return 0; | 2132 | return 0; |
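
For illustration only (not part of the patch), a minimal user-space sketch of the check rcu_torture_barrier() performs above: post one callback per poster, flush them with a barrier-style operation, then verify that the number invoked matches the number posted. The fake_call() and fake_barrier() helpers are invented stand-ins for cur_ops->call and cur_ops->cb_barrier.

/* Sketch of the "posted vs. invoked after a barrier" check. */
#include <stdatomic.h>
#include <stdio.h>

#define N_CBS 4

static atomic_int cbs_invoked;

typedef void (*cb_fn)(void);
static cb_fn pending[N_CBS];
static int n_pending;

static void count_cb(void) { atomic_fetch_add(&cbs_invoked, 1); }

/* Queue a callback, loosely analogous to call_rcu(). */
static void fake_call(cb_fn fn) { pending[n_pending++] = fn; }

/* Invoke everything queued so far, as cur_ops->cb_barrier() guarantees. */
static void fake_barrier(void)
{
	for (int i = 0; i < n_pending; i++)
		pending[i]();
	n_pending = 0;
}

int main(void)
{
	atomic_store(&cbs_invoked, 0);
	for (int i = 0; i < N_CBS; i++)
		fake_call(count_cb);
	fake_barrier();
	if (atomic_load(&cbs_invoked) != N_CBS)
		printf("barrier error: %d of %d callbacks invoked\n",
		       atomic_load(&cbs_invoked), N_CBS);
	else
		printf("barrier OK: all %d callbacks invoked\n", N_CBS);
	return 0;
}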
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0c5baf1ab18..0da7b88d92d0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
75 | .gpnum = -300, \ | 75 | .gpnum = -300, \ |
76 | .completed = -300, \ | 76 | .completed = -300, \ |
77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | ||
79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | ||
78 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
79 | .n_force_qs = 0, \ | 81 | .n_force_qs = 0, \ |
80 | .n_force_qs_ngp = 0, \ | 82 | .n_force_qs_ngp = 0, \ |
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
145 | unsigned long rcutorture_testseq; | 147 | unsigned long rcutorture_testseq; |
146 | unsigned long rcutorture_vernum; | 148 | unsigned long rcutorture_vernum; |
147 | 149 | ||
150 | /* State information for rcu_barrier() and friends. */ | ||
151 | |||
152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
153 | static atomic_t rcu_barrier_cpu_count; | ||
154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
155 | static struct completion rcu_barrier_completion; | ||
156 | |||
148 | /* | 157 | /* |
149 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
150 | * permit this function to be invoked without holding the root rcu_node | 159 | * permit this function to be invoked without holding the root rcu_node |
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu) | |||
192 | { | 201 | { |
193 | trace_rcu_utilization("Start context switch"); | 202 | trace_rcu_utilization("Start context switch"); |
194 | rcu_sched_qs(cpu); | 203 | rcu_sched_qs(cpu); |
195 | rcu_preempt_note_context_switch(cpu); | ||
196 | trace_rcu_utilization("End context switch"); | 204 | trace_rcu_utilization("End context switch"); |
197 | } | 205 | } |
198 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1311 | #ifdef CONFIG_HOTPLUG_CPU | 1319 | #ifdef CONFIG_HOTPLUG_CPU |
1312 | 1320 | ||
1313 | /* | 1321 | /* |
1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1322 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1315 | * Also record a quiescent state for this CPU for the current grace period. | 1323 | * specified CPU must be offline, and the caller must hold the |
1316 | * Synchronization and interrupt disabling are not required because | 1324 | * ->onofflock. |
1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
1318 | * operations that might block must be done later from the CPU_DEAD | ||
1319 | * notifier. | ||
1320 | * | ||
1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
1323 | * destination from the bits set in that mask. | ||
1324 | */ | 1325 | */ |
1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1326 | static void |
1327 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
1328 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
1326 | { | 1329 | { |
1327 | int i; | 1330 | int i; |
1328 | unsigned long mask; | ||
1329 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
1333 | 1331 | ||
1334 | /* First, adjust the counts. */ | 1332 | /* |
1333 | * Orphan the callbacks. First adjust the counts. This is safe | ||
1334 | * because ->onofflock excludes _rcu_barrier()'s adoption of | ||
1335 | * the callbacks, thus no memory barrier is required. | ||
1336 | */ | ||
1335 | if (rdp->nxtlist != NULL) { | 1337 | if (rdp->nxtlist != NULL) { |
1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | 1338 | rsp->qlen_lazy += rdp->qlen_lazy; |
1337 | receive_rdp->qlen += rdp->qlen; | 1339 | rsp->qlen += rdp->qlen; |
1340 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1338 | rdp->qlen_lazy = 0; | 1341 | rdp->qlen_lazy = 0; |
1339 | rdp->qlen = 0; | 1342 | rdp->qlen = 0; |
1340 | } | 1343 | } |
1341 | 1344 | ||
1342 | /* | 1345 | /* |
1343 | * Next, move ready-to-invoke callbacks to be invoked on some | 1346 | * Next, move those callbacks still needing a grace period to |
1344 | * other CPU. These will not be required to pass through another | 1347 | * the orphanage, where some other CPU will pick them up. |
1345 | * grace period: They are done, regardless of CPU. | 1348 | * Some of the callbacks might have gone partway through a grace |
1349 | * period, but that is too bad. They get to start over because we | ||
1350 | * cannot assume that grace periods are synchronized across CPUs. | ||
1351 | * We don't bother updating the ->nxttail[] array yet, instead | ||
1352 | * we just reset the whole thing later on. | ||
1346 | */ | 1353 | */ |
1347 | if (rdp->nxtlist != NULL && | 1354 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { |
1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | 1355 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; |
1349 | struct rcu_head *oldhead; | 1356 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; |
1350 | struct rcu_head **oldtail; | 1357 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1351 | struct rcu_head **newtail; | ||
1352 | |||
1353 | oldhead = rdp->nxtlist; | ||
1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
1361 | receive_rdp->nxttail[i] = newtail; | ||
1362 | if (rdp->nxttail[i] == newtail) | ||
1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1364 | } | ||
1365 | } | 1358 | } |
1366 | 1359 | ||
1367 | /* | 1360 | /* |
1368 | * Finally, put the rest of the callbacks at the end of the list. | 1361 | * Then move the ready-to-invoke callbacks to the orphanage, |
1369 | * The ones that made it partway through get to start over: We | 1362 | * where some other CPU will pick them up. These will not be |
1370 | * cannot assume that grace periods are synchronized across CPUs. | 1363 | * required to pass through another grace period: They are done. |
1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but | ||
1372 | * this does not seem compelling. Not yet, anyway.) | ||
1373 | */ | 1364 | */ |
1374 | if (rdp->nxtlist != NULL) { | 1365 | if (rdp->nxtlist != NULL) { |
1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1366 | *rsp->orphan_donetail = rdp->nxtlist; |
1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | 1367 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1380 | |||
1381 | rdp->nxtlist = NULL; | ||
1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1384 | } | 1368 | } |
1385 | 1369 | ||
1370 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
1371 | rdp->nxtlist = NULL; | ||
1372 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1373 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
1378 | * orphanage. The caller must hold the ->onofflock. | ||
1379 | */ | ||
1380 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1381 | { | ||
1382 | int i; | ||
1383 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
1384 | |||
1386 | /* | 1385 | /* |
1387 | * Record a quiescent state for the dying CPU. This is safe | 1386 | * If there is an rcu_barrier() operation in progress, then |
1388 | * only because we have already cleared out the callbacks. | 1387 | * only the task doing that operation is permitted to adopt |
1389 | * (Otherwise, the RCU core might try to schedule the invocation | 1388 | * callbacks. To do otherwise breaks rcu_barrier() and friends |
1390 | * of callbacks on this now-offline CPU, which would be bad.) | 1389 | * by causing them to fail to wait for the callbacks in the |
1390 | * orphanage. | ||
1391 | */ | 1391 | */ |
1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1392 | if (rsp->rcu_barrier_in_progress && |
1393 | rsp->rcu_barrier_in_progress != current) | ||
1394 | return; | ||
1395 | |||
1396 | /* Do the accounting first. */ | ||
1397 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
1398 | rdp->qlen += rsp->qlen; | ||
1399 | rdp->n_cbs_adopted += rsp->qlen; | ||
1400 | rsp->qlen_lazy = 0; | ||
1401 | rsp->qlen = 0; | ||
1402 | |||
1403 | /* | ||
1404 | * We do not need a memory barrier here because the only way we | ||
1405 | * can get here if there is an rcu_barrier() in flight is if | ||
1406 | * we are the task doing the rcu_barrier(). | ||
1407 | */ | ||
1408 | |||
1409 | /* First adopt the ready-to-invoke callbacks. */ | ||
1410 | if (rsp->orphan_donelist != NULL) { | ||
1411 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1412 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
1413 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
1414 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
1415 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
1416 | rsp->orphan_donelist = NULL; | ||
1417 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1418 | } | ||
1419 | |||
1420 | /* And then adopt the callbacks that still need a grace period. */ | ||
1421 | if (rsp->orphan_nxtlist != NULL) { | ||
1422 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
1423 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
1424 | rsp->orphan_nxtlist = NULL; | ||
1425 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1426 | } | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * Trace the fact that this CPU is going offline. | ||
1431 | */ | ||
1432 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
1433 | { | ||
1434 | RCU_TRACE(unsigned long mask); | ||
1435 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
1436 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
1437 | |||
1438 | RCU_TRACE(mask = rdp->grpmask); | ||
1393 | trace_rcu_grace_period(rsp->name, | 1439 | trace_rcu_grace_period(rsp->name, |
1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1440 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1395 | "cpuofl"); | 1441 | "cpuofl"); |
1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
1398 | } | 1442 | } |
1399 | 1443 | ||
1400 | /* | 1444 | /* |
1401 | * The CPU has been completely removed, and some other CPU is reporting | 1445 | * The CPU has been completely removed, and some other CPU is reporting |
1402 | * this fact from process context. Do the remainder of the cleanup. | 1446 | * this fact from process context. Do the remainder of the cleanup, |
1447 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
1448 | * adopting them, if there is no _rcu_barrier() instance running. | ||
1403 | * There can only be one CPU hotplug operation at a time, so no other | 1449 | * There can only be one CPU hotplug operation at a time, so no other |
1404 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1450 | * CPU can be attempting to update rcu_cpu_kthread_task. |
1405 | */ | 1451 | */ |
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1409 | unsigned long mask; | 1455 | unsigned long mask; |
1410 | int need_report = 0; | 1456 | int need_report = 0; |
1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1457 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ | 1458 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
1413 | 1459 | ||
1414 | /* Adjust any no-longer-needed kthreads. */ | 1460 | /* Adjust any no-longer-needed kthreads. */ |
1415 | rcu_stop_cpu_kthread(cpu); | 1461 | rcu_stop_cpu_kthread(cpu); |
1416 | rcu_node_kthread_setaffinity(rnp, -1); | 1462 | rcu_node_kthread_setaffinity(rnp, -1); |
1417 | 1463 | ||
1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | 1464 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1419 | 1465 | ||
1420 | /* Exclude any attempts to start a new grace period. */ | 1466 | /* Exclude any attempts to start a new grace period. */ |
1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1467 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1422 | 1468 | ||
1469 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
1470 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
1471 | rcu_adopt_orphan_cbs(rsp); | ||
1472 | |||
1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1473 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1474 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1425 | do { | 1475 | do { |
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1456 | 1506 | ||
1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1507 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1458 | 1508 | ||
1509 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1510 | { | ||
1511 | } | ||
1512 | |||
1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1513 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1460 | { | 1514 | { |
1461 | } | 1515 | } |
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1524 | rcu_is_callbacks_kthread()); | 1578 | rcu_is_callbacks_kthread()); |
1525 | 1579 | ||
1526 | /* Update count, and requeue any remaining callbacks. */ | 1580 | /* Update count, and requeue any remaining callbacks. */ |
1527 | rdp->qlen_lazy -= count_lazy; | ||
1528 | rdp->qlen -= count; | ||
1529 | rdp->n_cbs_invoked += count; | ||
1530 | if (list != NULL) { | 1581 | if (list != NULL) { |
1531 | *tail = rdp->nxtlist; | 1582 | *tail = rdp->nxtlist; |
1532 | rdp->nxtlist = list; | 1583 | rdp->nxtlist = list; |
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1536 | else | 1587 | else |
1537 | break; | 1588 | break; |
1538 | } | 1589 | } |
1590 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
1591 | rdp->qlen_lazy -= count_lazy; | ||
1592 | rdp->qlen -= count; | ||
1593 | rdp->n_cbs_invoked += count; | ||
1539 | 1594 | ||
1540 | /* Reinstate batch limit if we have worked down the excess. */ | 1595 | /* Reinstate batch limit if we have worked down the excess. */ |
1541 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 1596 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) |
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1823 | rdp = this_cpu_ptr(rsp->rda); | 1878 | rdp = this_cpu_ptr(rsp->rda); |
1824 | 1879 | ||
1825 | /* Add the callback to our list. */ | 1880 | /* Add the callback to our list. */ |
1826 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1827 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1828 | rdp->qlen++; | 1881 | rdp->qlen++; |
1829 | if (lazy) | 1882 | if (lazy) |
1830 | rdp->qlen_lazy++; | 1883 | rdp->qlen_lazy++; |
1884 | else | ||
1885 | rcu_idle_count_callbacks_posted(); | ||
1886 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
1887 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1888 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1831 | 1889 | ||
1832 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1890 | if (__is_kfree_rcu_offset((unsigned long)func)) |
1833 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1891 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
1893 | } | 1951 | } |
1894 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 1952 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1895 | 1953 | ||
1954 | /* | ||
1955 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | ||
1956 | * any blocking grace-period wait automatically implies a grace period | ||
1957 | * if there is only one CPU online at any point in time during execution | ||
1958 | * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to | ||
1959 | * occasionally incorrectly indicate that there are multiple CPUs online | ||
1960 | * when there was in fact only one the whole time, as this just adds | ||
1961 | * some overhead: RCU still operates correctly. | ||
1962 | * | ||
1963 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
1964 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
1965 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
1966 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
1967 | * all times, but num_online_cpus() might well return one (or even zero). | ||
1968 | * | ||
1969 | * However, all such demonic sequences require at least one CPU-offline | ||
1970 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
1971 | * is only a problem if there is an RCU read-side critical section executing | ||
1972 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
1973 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
1974 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
1975 | * that there is only one CPU when in fact there was more than one throughout | ||
1976 | * is when there were no RCU readers in the system. If there are no | ||
1977 | * RCU readers, the grace period by definition can be of zero length, | ||
1978 | * regardless of the number of online CPUs. | ||
1979 | */ | ||
1980 | static inline int rcu_blocking_is_gp(void) | ||
1981 | { | ||
1982 | might_sleep(); /* Check for RCU read-side critical section. */ | ||
1983 | return num_online_cpus() <= 1; | ||
1984 | } | ||
1985 | |||
1896 | /** | 1986 | /** |
1897 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. | 1987 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. |
1898 | * | 1988 | * |
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
2166 | rcu_preempt_cpu_has_callbacks(cpu); | 2256 | rcu_preempt_cpu_has_callbacks(cpu); |
2167 | } | 2257 | } |
2168 | 2258 | ||
2169 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2259 | /* |
2170 | static atomic_t rcu_barrier_cpu_count; | 2260 | * RCU callback function for _rcu_barrier(). If we are last, wake |
2171 | static DEFINE_MUTEX(rcu_barrier_mutex); | 2261 | * up the task executing _rcu_barrier(). |
2172 | static struct completion rcu_barrier_completion; | 2262 | */ |
2173 | |||
2174 | static void rcu_barrier_callback(struct rcu_head *notused) | 2263 | static void rcu_barrier_callback(struct rcu_head *notused) |
2175 | { | 2264 | { |
2176 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2265 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2200 | void (*call_rcu_func)(struct rcu_head *head, | 2289 | void (*call_rcu_func)(struct rcu_head *head, |
2201 | void (*func)(struct rcu_head *head))) | 2290 | void (*func)(struct rcu_head *head))) |
2202 | { | 2291 | { |
2203 | BUG_ON(in_interrupt()); | 2292 | int cpu; |
2293 | unsigned long flags; | ||
2294 | struct rcu_data *rdp; | ||
2295 | struct rcu_head rh; | ||
2296 | |||
2297 | init_rcu_head_on_stack(&rh); | ||
2298 | |||
2204 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2299 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
2205 | mutex_lock(&rcu_barrier_mutex); | 2300 | mutex_lock(&rcu_barrier_mutex); |
2206 | init_completion(&rcu_barrier_completion); | 2301 | |
2302 | smp_mb(); /* Prevent any prior operations from leaking in. */ | ||
2303 | |||
2207 | /* | 2304 | /* |
2208 | * Initialize rcu_barrier_cpu_count to 1, then invoke | 2305 | * Initialize the count to one rather than to zero in order to |
2209 | * rcu_barrier_func() on each CPU, so that each CPU also has | 2306 | * avoid a too-soon return to zero in case of a short grace period |
2210 | * incremented rcu_barrier_cpu_count. Only then is it safe to | 2307 | * (or preemption of this task). Also flag this task as doing |
2211 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 2308 | * an rcu_barrier(). This will prevent anyone else from adopting |
2212 | * might complete its grace period before all of the other CPUs | 2309 | * orphaned callbacks, which could otherwise cause failure if a |
2213 | * did their increment, causing this function to return too | 2310 | * CPU went offline and quickly came back online. To see this, |
2214 | * early. Note that on_each_cpu() disables irqs, which prevents | 2311 | * consider the following sequence of events: |
2215 | * any CPUs from coming online or going offline until each online | 2312 | * |
2216 | * CPU has queued its RCU-barrier callback. | 2313 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. |
2314 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
2315 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
2316 | * 4. CPU 1 comes back online. | ||
2317 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
2318 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
2319 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
2217 | */ | 2320 | */ |
2321 | init_completion(&rcu_barrier_completion); | ||
2218 | atomic_set(&rcu_barrier_cpu_count, 1); | 2322 | atomic_set(&rcu_barrier_cpu_count, 1); |
2219 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 2323 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
2324 | rsp->rcu_barrier_in_progress = current; | ||
2325 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2326 | |||
2327 | /* | ||
2328 | * Force every CPU with callbacks to register a new callback | ||
2329 | * that will tell us when all the preceding callbacks have | ||
2330 | * been invoked. If an offline CPU has callbacks, wait for | ||
2331 | * it to either come back online or to finish orphaning those | ||
2332 | * callbacks. | ||
2333 | */ | ||
2334 | for_each_possible_cpu(cpu) { | ||
2335 | preempt_disable(); | ||
2336 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2337 | if (cpu_is_offline(cpu)) { | ||
2338 | preempt_enable(); | ||
2339 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
2340 | schedule_timeout_interruptible(1); | ||
2341 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2342 | smp_call_function_single(cpu, rcu_barrier_func, | ||
2343 | (void *)call_rcu_func, 1); | ||
2344 | preempt_enable(); | ||
2345 | } else { | ||
2346 | preempt_enable(); | ||
2347 | } | ||
2348 | } | ||
2349 | |||
2350 | /* | ||
2351 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
2352 | * posted, we can adopt all of the orphaned callbacks and place | ||
2353 | * an rcu_barrier_callback() callback after them. When that is done, | ||
2354 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
2355 | * following every callback that could possibly have been | ||
2356 | * registered before _rcu_barrier() was called. | ||
2357 | */ | ||
2358 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
2359 | rcu_adopt_orphan_cbs(rsp); | ||
2360 | rsp->rcu_barrier_in_progress = NULL; | ||
2361 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2362 | atomic_inc(&rcu_barrier_cpu_count); | ||
2363 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
2364 | call_rcu_func(&rh, rcu_barrier_callback); | ||
2365 | |||
2366 | /* | ||
2367 | * Now that we have an rcu_barrier_callback() callback on each | ||
2368 | * CPU, and thus each counted, remove the initial count. | ||
2369 | */ | ||
2220 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2370 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
2221 | complete(&rcu_barrier_completion); | 2371 | complete(&rcu_barrier_completion); |
2372 | |||
2373 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
2222 | wait_for_completion(&rcu_barrier_completion); | 2374 | wait_for_completion(&rcu_barrier_completion); |
2375 | |||
2376 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
2223 | mutex_unlock(&rcu_barrier_mutex); | 2377 | mutex_unlock(&rcu_barrier_mutex); |
2378 | |||
2379 | destroy_rcu_head_on_stack(&rh); | ||
2224 | } | 2380 | } |
2225 | 2381 | ||
2226 | /** | 2382 | /** |
@@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2417 | 2573 | ||
2418 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2574 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
2419 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2575 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
2420 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | 2576 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; |
2421 | } | 2577 | } |
2422 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2578 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
2423 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2579 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
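
The orphanage handling added to rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() earlier in this file splices singly linked callback lists through tail pointer-to-pointer fields. A stand-alone sketch of that splice (illustration only, simplified types) follows; the whole donated list moves in O(1) by updating only head and tail fields.

/* Sketch of the tail-pointer list splice behind the orphanage. */
#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at head or at the last ->next */
};

static void cblist_init(struct cblist *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
	c->next = NULL;
	*l->tail = c;
	l->tail = &c->next;
}

/* Splice all of @from onto the end of @to, leaving @from empty. */
static void cblist_splice(struct cblist *to, struct cblist *from)
{
	if (from->head == NULL)
		return;
	*to->tail = from->head;
	to->tail = from->tail;
	cblist_init(from);
}

int main(void)
{
	struct cb cbs[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	struct cblist cpu1, orphanage;

	cblist_init(&cpu1);
	cblist_init(&orphanage);
	for (int i = 0; i < 3; i++)
		cblist_enqueue(&cpu1, &cbs[i]);

	cblist_splice(&orphanage, &cpu1);	/* CPU 1 goes offline */
	for (struct cb *c = orphanage.head; c; c = c->next)
		printf("orphaned callback %d\n", c->id);
	return 0;
}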
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a4072..7f5d138dedf5 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -29,18 +29,14 @@ | |||
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
33 | * CONFIG_RCU_FANOUT_LEAF. | ||
33 | * In theory, it should be possible to add more levels straightforwardly. | 34 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this did work well going from three levels to four. | 35 | * In practice, this did work well going from three levels to four. |
35 | * Of course, your mileage may vary. | 36 | * Of course, your mileage may vary. |
36 | */ | 37 | */ |
37 | #define MAX_RCU_LVLS 4 | 38 | #define MAX_RCU_LVLS 4 |
38 | #if CONFIG_RCU_FANOUT > 16 | 39 | #define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) |
39 | #define RCU_FANOUT_LEAF 16 | ||
40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ | ||
41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) | ||
42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | ||
43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | 40 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) |
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | 41 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) |
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
@@ -371,6 +367,17 @@ struct rcu_state { | |||
371 | 367 | ||
372 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 368 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
373 | /* starting new GP. */ | 369 | /* starting new GP. */ |
370 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
371 | /* need a grace period. */ | ||
372 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
373 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
374 | /* are ready to invoke. */ | ||
375 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
376 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
377 | long qlen; /* Total number of callbacks. */ | ||
378 | struct task_struct *rcu_barrier_in_progress; | ||
379 | /* Task doing rcu_barrier(), */ | ||
380 | /* or NULL if no barrier. */ | ||
374 | raw_spinlock_t fqslock; /* Only one task forcing */ | 381 | raw_spinlock_t fqslock; /* Only one task forcing */ |
375 | /* quiescent states. */ | 382 | /* quiescent states. */ |
376 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 383 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
423 | /* Forward declarations for rcutree_plugin.h */ | 430 | /* Forward declarations for rcutree_plugin.h */ |
424 | static void rcu_bootup_announce(void); | 431 | static void rcu_bootup_announce(void); |
425 | long rcu_batches_completed(void); | 432 | long rcu_batches_completed(void); |
426 | static void rcu_preempt_note_context_switch(int cpu); | ||
427 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 433 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
428 | #ifdef CONFIG_HOTPLUG_CPU | 434 | #ifdef CONFIG_HOTPLUG_CPU |
429 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 435 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); | |||
471 | static void rcu_prepare_for_idle_init(int cpu); | 477 | static void rcu_prepare_for_idle_init(int cpu); |
472 | static void rcu_cleanup_after_idle(int cpu); | 478 | static void rcu_cleanup_after_idle(int cpu); |
473 | static void rcu_prepare_for_idle(int cpu); | 479 | static void rcu_prepare_for_idle(int cpu); |
480 | static void rcu_idle_count_callbacks_posted(void); | ||
474 | static void print_cpu_stall_info_begin(void); | 481 | static void print_cpu_stall_info_begin(void); |
475 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 482 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
476 | static void print_cpu_stall_info_end(void); | 483 | static void print_cpu_stall_info_end(void); |
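
A quick worked example (illustration only, with assumed Kconfig values) of the per-level capacities defined by the RCU_FANOUT_* macros above, here for CONFIG_RCU_FANOUT_LEAF=16 and CONFIG_RCU_FANOUT=64:

#include <stdio.h>

#define CONFIG_RCU_FANOUT	64	/* assumed value */
#define CONFIG_RCU_FANOUT_LEAF	16	/* assumed value */

#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)

int main(void)
{
	/* 16, 1024, 65536, 4194304 CPUs for 1-4 levels respectively. */
	printf("%d %d %d %d\n",
	       RCU_FANOUT_1, RCU_FANOUT_2, RCU_FANOUT_3, RCU_FANOUT_4);
	return 0;
}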
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816be..2411000d9869 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) | |||
153 | * | 153 | * |
154 | * Caller must disable preemption. | 154 | * Caller must disable preemption. |
155 | */ | 155 | */ |
156 | static void rcu_preempt_note_context_switch(int cpu) | 156 | void rcu_preempt_note_context_switch(void) |
157 | { | 157 | { |
158 | struct task_struct *t = current; | 158 | struct task_struct *t = current; |
159 | unsigned long flags; | 159 | unsigned long flags; |
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
165 | 165 | ||
166 | /* Possibly blocking in an RCU read-side critical section. */ | 166 | /* Possibly blocking in an RCU read-side critical section. */ |
167 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); |
168 | rnp = rdp->mynode; | 168 | rnp = rdp->mynode; |
169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 169 | raw_spin_lock_irqsave(&rnp->lock, flags); |
170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
228 | * means that we continue to block the current grace period. | 228 | * means that we continue to block the current grace period. |
229 | */ | 229 | */ |
230 | local_irq_save(flags); | 230 | local_irq_save(flags); |
231 | rcu_preempt_qs(cpu); | 231 | rcu_preempt_qs(smp_processor_id()); |
232 | local_irq_restore(flags); | 232 | local_irq_restore(flags); |
233 | } | 233 | } |
234 | 234 | ||
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) | |||
969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | 969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
970 | } | 970 | } |
971 | 971 | ||
972 | /* | ||
973 | * Check for a task exiting while in a preemptible-RCU read-side | ||
974 | * critical section, clean up if so. No need to issue warnings, | ||
975 | * as debug_check_no_locks_held() already does this if lockdep | ||
976 | * is enabled. | ||
977 | */ | ||
978 | void exit_rcu(void) | ||
979 | { | ||
980 | struct task_struct *t = current; | ||
981 | |||
982 | if (t->rcu_read_lock_nesting == 0) | ||
983 | return; | ||
984 | t->rcu_read_lock_nesting = 1; | ||
985 | __rcu_read_unlock(); | ||
986 | } | ||
987 | |||
988 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 972 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
989 | 973 | ||
990 | static struct rcu_state *rcu_state = &rcu_sched_state; | 974 | static struct rcu_state *rcu_state = &rcu_sched_state; |
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void) | |||
1018 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
1019 | 1003 | ||
1020 | /* | 1004 | /* |
1021 | * Because preemptible RCU does not exist, we never have to check for | ||
1022 | * CPUs being in quiescent states. | ||
1023 | */ | ||
1024 | static void rcu_preempt_note_context_switch(int cpu) | ||
1025 | { | ||
1026 | } | ||
1027 | |||
1028 | /* | ||
1029 | * Because preemptible RCU does not exist, there are never any preempted | 1005 | * Because preemptible RCU does not exist, there are never any preempted |
1030 | * RCU readers. | 1006 | * RCU readers. |
1031 | */ | 1007 | */ |
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu) | |||
1938 | { | 1914 | { |
1939 | } | 1915 | } |
1940 | 1916 | ||
1917 | /* | ||
1918 | * Don't bother keeping a running count of the number of RCU callbacks | ||
1919 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
1920 | */ | ||
1921 | static void rcu_idle_count_callbacks_posted(void) | ||
1922 | { | ||
1923 | } | ||
1924 | |||
1941 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1925 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1942 | 1926 | ||
1943 | /* | 1927 | /* |
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu) | |||
1978 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1962 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1979 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1963 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1980 | 1964 | ||
1965 | /* Loop counter for rcu_prepare_for_idle(). */ | ||
1981 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 1966 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
1967 | /* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ | ||
1982 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 1968 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
1983 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | 1969 | /* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ |
1984 | static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ | 1970 | static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); |
1985 | static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | 1971 | /* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ |
1972 | static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); | ||
1973 | /* Enable special processing on first attempt to enter dyntick-idle mode. */ | ||
1974 | static DEFINE_PER_CPU(bool, rcu_idle_first_pass); | ||
1975 | /* Running count of non-lazy callbacks posted, never decremented. */ | ||
1976 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); | ||
1977 | /* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ | ||
1978 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); | ||
1986 | 1979 | ||
1987 | /* | 1980 | /* |
1988 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | 1981 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | |||
1995 | */ | 1988 | */ |
1996 | int rcu_needs_cpu(int cpu) | 1989 | int rcu_needs_cpu(int cpu) |
1997 | { | 1990 | { |
1991 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
1992 | per_cpu(rcu_idle_first_pass, cpu) = 1; | ||
1998 | /* If no callbacks, RCU doesn't need the CPU. */ | 1993 | /* If no callbacks, RCU doesn't need the CPU. */ |
1999 | if (!rcu_cpu_has_callbacks(cpu)) | 1994 | if (!rcu_cpu_has_callbacks(cpu)) |
2000 | return 0; | 1995 | return 0; |
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2045 | } | 2040 | } |
2046 | 2041 | ||
2047 | /* | 2042 | /* |
2043 | * Handler for smp_call_function_single(). The only point of this | ||
2044 | * handler is to wake the CPU up, so the handler does only tracing. | ||
2045 | */ | ||
2046 | void rcu_idle_demigrate(void *unused) | ||
2047 | { | ||
2048 | trace_rcu_prep_idle("Demigrate"); | ||
2049 | } | ||
2050 | |||
2051 | /* | ||
2048 | * Timer handler used to force CPU to start pushing its remaining RCU | 2052 | * Timer handler used to force CPU to start pushing its remaining RCU |
2049 | * callbacks in the case where it entered dyntick-idle mode with callbacks | 2053 | * callbacks in the case where it entered dyntick-idle mode with callbacks |
2050 | * pending. The hander doesn't really need to do anything because the | 2054 | * pending. The hander doesn't really need to do anything because the |
2051 | * real work is done upon re-entry to idle, or by the next scheduling-clock | 2055 | * real work is done upon re-entry to idle, or by the next scheduling-clock |
2052 | * interrupt should idle not be re-entered. | 2056 | * interrupt should idle not be re-entered. |
2057 | * | ||
2058 | * One special case: the timer gets migrated without awakening the CPU | ||
2059 | * on which the timer was scheduled. In this case, we must wake up | ||
2060 | * that CPU. We do so with smp_call_function_single(). | ||
2053 | */ | 2061 | */ |
2054 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | 2062 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) |
2055 | { | 2063 | { |
2064 | int cpu = (int)cpu_in; | ||
2065 | |||
2056 | trace_rcu_prep_idle("Timer"); | 2066 | trace_rcu_prep_idle("Timer"); |
2057 | return HRTIMER_NORESTART; | 2067 | if (cpu != smp_processor_id()) |
2068 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
2069 | else | ||
2070 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
2058 | } | 2071 | } |
2059 | 2072 | ||
2060 | /* | 2073 | /* |
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | |||
2062 | */ | 2075 | */ |
2063 | static void rcu_prepare_for_idle_init(int cpu) | 2076 | static void rcu_prepare_for_idle_init(int cpu) |
2064 | { | 2077 | { |
2065 | static int firsttime = 1; | 2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
2066 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), |
2067 | 2080 | rcu_idle_gp_timer_func, cpu); | |
2068 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; |
2069 | hrtp->function = rcu_idle_gp_timer_func; | 2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; |
2070 | if (firsttime) { | ||
2071 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | ||
2072 | |||
2073 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2074 | upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); | ||
2075 | rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2076 | firsttime = 0; | ||
2077 | } | ||
2078 | } | 2083 | } |
2079 | 2084 | ||
2080 | /* | 2085 | /* |
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu) | |||
2084 | */ | 2089 | */ |
2085 | static void rcu_cleanup_after_idle(int cpu) | 2090 | static void rcu_cleanup_after_idle(int cpu) |
2086 | { | 2091 | { |
2087 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | 2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); |
2093 | trace_rcu_prep_idle("Cleanup after idle"); | ||
2088 | } | 2094 | } |
2089 | 2095 | ||
2090 | /* | 2096 | /* |
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2108 | */ | 2114 | */ |
2109 | static void rcu_prepare_for_idle(int cpu) | 2115 | static void rcu_prepare_for_idle(int cpu) |
2110 | { | 2116 | { |
2117 | struct timer_list *tp; | ||
2118 | |||
2119 | /* | ||
2120 | * If this is an idle re-entry, for example, due to use of | ||
2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | ||
2122 | * loop, then don't take any state-machine actions, unless the | ||
2123 | * momentary exit from idle queued additional non-lazy callbacks. | ||
2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | ||
2125 | * pending. | ||
2126 | */ | ||
2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | ||
2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | ||
2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
2130 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2133 | } | ||
2134 | return; | ||
2135 | } | ||
2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | ||
2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
2139 | |||
2111 | /* | 2140 | /* |
2112 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2113 | * Also reset state to avoid prejudicing later attempts. | 2142 | * Also reset state to avoid prejudicing later attempts. |
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu) | |||
2140 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
2141 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2142 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) |
2143 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2144 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | 2173 | jiffies + RCU_IDLE_GP_DELAY; |
2145 | else | 2174 | else |
2146 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2147 | rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); | 2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2180 | per_cpu(rcu_nonlazy_posted, cpu); | ||
2148 | return; /* Nothing more to do immediately. */ | 2181 | return; /* Nothing more to do immediately. */ |
2149 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2150 | /* We have hit the limit, so time to give up. */ | 2183 | /* We have hit the limit, so time to give up. */ |
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu) | |||
2184 | trace_rcu_prep_idle("Callbacks drained"); | 2217 | trace_rcu_prep_idle("Callbacks drained"); |
2185 | } | 2218 | } |
2186 | 2219 | ||
2220 | /* | ||
2221 | * Keep a running count of the number of non-lazy callbacks posted | ||
2222 | * on this CPU. This running counter (which is never decremented) allows | ||
2223 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
2224 | * posts a callback, even if an equal number of callbacks are invoked. | ||
2225 | * Of course, callbacks should only be posted from within a trace event | ||
2226 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
2227 | */ | ||
2228 | static void rcu_idle_count_callbacks_posted(void) | ||
2229 | { | ||
2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | ||
2231 | } | ||
2232 | |||
2187 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2188 | 2234 | ||
2189 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 2235 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu) | |||
2192 | 2238 | ||
2193 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2194 | { | 2240 | { |
2195 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); |
2196 | 2242 | ||
2197 | sprintf(cp, "drain=%d %c timer=%lld", | 2243 | sprintf(cp, "drain=%d %c timer=%lu", |
2198 | per_cpu(rcu_dyntick_drain, cpu), | 2244 | per_cpu(rcu_dyntick_drain, cpu), |
2199 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', |
2200 | hrtimer_active(hrtp) | 2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2201 | ? ktime_to_us(hrtimer_get_remaining(hrtp)) | ||
2202 | : -1); | ||
2203 | } | 2247 | } |
2204 | 2248 | ||
2205 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2249 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
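
The rcu_nonlazy_posted / rcu_nonlazy_posted_snap pair above uses a never-decremented event counter plus a snapshot to detect callbacks posted during a momentary exit from idle, even if an equal number of callbacks was invoked in the meantime. A minimal sketch of the idiom (illustration only):

#include <stdbool.h>
#include <stdio.h>

static unsigned long posted;	  /* running count, never decremented */
static unsigned long posted_snap; /* snapshot taken at idle entry */

static void post_callback(void) { posted++; }

static void enter_idle(void) { posted_snap = posted; }

static bool new_work_since_idle_entry(void)
{
	return posted != posted_snap;
}

int main(void)
{
	enter_idle();
	printf("new work? %d\n", new_work_since_idle_entry()); /* 0 */
	post_callback();	/* e.g. posted from within RCU_NONIDLE() */
	printf("new work? %d\n", new_work_since_idle_entry()); /* 1 */
	return 0;
}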
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff43..d4bc16ddd1d4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
271 | 271 | ||
272 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
275 | rsp->completed, gpnum, rsp->fqs_state, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
276 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
277 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs_lh); | 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
282 | if (rnp->level != level) { | 282 | if (rnp->level != level) { |
283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b3..bebe2b170d49 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) | |||
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | 25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val, |
26 | bool force) | ||
26 | { | 27 | { |
28 | int ret = 0; | ||
29 | |||
27 | if (counter->usage + val > counter->limit) { | 30 | if (counter->usage + val > counter->limit) { |
28 | counter->failcnt++; | 31 | counter->failcnt++; |
29 | return -ENOMEM; | 32 | ret = -ENOMEM; |
33 | if (!force) | ||
34 | return ret; | ||
30 | } | 35 | } |
31 | 36 | ||
32 | counter->usage += val; | 37 | counter->usage += val; |
33 | if (counter->usage > counter->max_usage) | 38 | if (counter->usage > counter->max_usage) |
34 | counter->max_usage = counter->usage; | 39 | counter->max_usage = counter->usage; |
35 | return 0; | 40 | return ret; |
36 | } | 41 | } |
37 | 42 | ||
38 | int res_counter_charge(struct res_counter *counter, unsigned long val, | 43 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, |
39 | struct res_counter **limit_fail_at) | 44 | struct res_counter **limit_fail_at, bool force) |
40 | { | 45 | { |
41 | int ret; | 46 | int ret, r; |
42 | unsigned long flags; | 47 | unsigned long flags; |
43 | struct res_counter *c, *u; | 48 | struct res_counter *c, *u; |
44 | 49 | ||
50 | r = ret = 0; | ||
45 | *limit_fail_at = NULL; | 51 | *limit_fail_at = NULL; |
46 | local_irq_save(flags); | 52 | local_irq_save(flags); |
47 | for (c = counter; c != NULL; c = c->parent) { | 53 | for (c = counter; c != NULL; c = c->parent) { |
48 | spin_lock(&c->lock); | 54 | spin_lock(&c->lock); |
49 | ret = res_counter_charge_locked(c, val); | 55 | r = res_counter_charge_locked(c, val, force); |
50 | spin_unlock(&c->lock); | 56 | spin_unlock(&c->lock); |
51 | if (ret < 0) { | 57 | if (r < 0 && !ret) { |
58 | ret = r; | ||
52 | *limit_fail_at = c; | 59 | *limit_fail_at = c; |
53 | goto undo; | 60 | if (!force) |
61 | break; | ||
54 | } | 62 | } |
55 | } | 63 | } |
56 | ret = 0; | 64 | |
57 | goto done; | 65 | if (ret < 0 && !force) { |
58 | undo: | 66 | for (u = counter; u != c; u = u->parent) { |
59 | for (u = counter; u != c; u = u->parent) { | 67 | spin_lock(&u->lock); |
60 | spin_lock(&u->lock); | 68 | res_counter_uncharge_locked(u, val); |
61 | res_counter_uncharge_locked(u, val); | 69 | spin_unlock(&u->lock); |
62 | spin_unlock(&u->lock); | 70 | } |
63 | } | 71 | } |
64 | done: | ||
65 | local_irq_restore(flags); | 72 | local_irq_restore(flags); |
73 | |||
66 | return ret; | 74 | return ret; |
67 | } | 75 | } |
68 | 76 | ||
77 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
78 | struct res_counter **limit_fail_at) | ||
79 | { | ||
80 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
81 | } | ||
82 | |||
69 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | 83 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, |
70 | struct res_counter **limit_fail_at) | 84 | struct res_counter **limit_fail_at) |
71 | { | 85 | { |
72 | int ret, r; | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
73 | unsigned long flags; | ||
74 | struct res_counter *c; | ||
75 | |||
76 | r = ret = 0; | ||
77 | *limit_fail_at = NULL; | ||
78 | local_irq_save(flags); | ||
79 | for (c = counter; c != NULL; c = c->parent) { | ||
80 | spin_lock(&c->lock); | ||
81 | r = res_counter_charge_locked(c, val); | ||
82 | if (r) | ||
83 | c->usage += val; | ||
84 | spin_unlock(&c->lock); | ||
85 | if (r < 0 && ret == 0) { | ||
86 | *limit_fail_at = c; | ||
87 | ret = r; | ||
88 | } | ||
89 | } | ||
90 | local_irq_restore(flags); | ||
91 | |||
92 | return ret; | ||
93 | } | 87 | } |
88 | |||
94 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
95 | { | 90 | { |
96 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
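With the force flag folded into res_counter_charge_locked(), both public entry points above share a single walk of the counter hierarchy; only the nofail variant keeps charging past a failed limit, and the locked helper now commits that charge itself instead of the caller bumping ->usage by hand. A minimal caller sketch, hypothetical controller code that is not part of this patch (my_group and my_reclaim are illustrative names):

    static int my_try_charge(struct my_group *grp, unsigned long nr_bytes)
    {
            struct res_counter *fail_at;
            int ret;

            /* Normal path: the charge is rolled back and -ENOMEM returned if
             * any ancestor is over its limit; fail_at names that ancestor. */
            ret = res_counter_charge(&grp->res, nr_bytes, &fail_at);
            if (ret == -ENOMEM) {
                    my_reclaim(fail_at, nr_bytes);          /* hypothetical */
                    ret = res_counter_charge(&grp->res, nr_bytes, &fail_at);
            }
            return ret;
    }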
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a3..173ea52f3af0 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | |||
20 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ab9745f7e115..d833cc94eedc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -1911,7 +1912,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
1911 | struct task_struct *next) | 1912 | struct task_struct *next) |
1912 | { | 1913 | { |
1913 | sched_info_switch(prev, next); | 1914 | sched_info_switch(prev, next); |
1914 | perf_event_task_sched_out(prev, next); | 1915 | perf_event_task_sched(prev, next); |
1915 | fire_sched_out_preempt_notifiers(prev, next); | 1916 | fire_sched_out_preempt_notifiers(prev, next); |
1916 | prepare_lock_switch(rq, next); | 1917 | prepare_lock_switch(rq, next); |
1917 | prepare_arch_switch(next); | 1918 | prepare_arch_switch(next); |
@@ -1954,13 +1955,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1954 | */ | 1955 | */ |
1955 | prev_state = prev->state; | 1956 | prev_state = prev->state; |
1956 | finish_arch_switch(prev); | 1957 | finish_arch_switch(prev); |
1957 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1958 | local_irq_disable(); | ||
1959 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1960 | perf_event_task_sched_in(prev, current); | ||
1961 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1962 | local_irq_enable(); | ||
1963 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1964 | finish_lock_switch(rq, prev); | 1958 | finish_lock_switch(rq, prev); |
1965 | finish_arch_post_lock_switch(); | 1959 | finish_arch_post_lock_switch(); |
1966 | 1960 | ||
@@ -2081,6 +2075,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2081 | #endif | 2075 | #endif |
2082 | 2076 | ||
2083 | /* Here we just switch the register state and the stack. */ | 2077 | /* Here we just switch the register state and the stack. */ |
2078 | rcu_switch_from(prev); | ||
2084 | switch_to(prev, next, prev); | 2079 | switch_to(prev, next, prev); |
2085 | 2080 | ||
2086 | barrier(); | 2081 | barrier(); |
@@ -7077,6 +7072,7 @@ void __init sched_init(void) | |||
7077 | /* May be allocated at isolcpus cmdline parse time */ | 7072 | /* May be allocated at isolcpus cmdline parse time */ |
7078 | if (cpu_isolated_map == NULL) | 7073 | if (cpu_isolated_map == NULL) |
7079 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7074 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7075 | idle_thread_set_boot_cpu(); | ||
7080 | #endif | 7076 | #endif |
7081 | init_sched_fair_class(); | 7077 | init_sched_fair_class(); |
7082 | 7078 | ||
@@ -7998,13 +7994,9 @@ static struct cftype cpu_files[] = { | |||
7998 | .write_u64 = cpu_rt_period_write_uint, | 7994 | .write_u64 = cpu_rt_period_write_uint, |
7999 | }, | 7995 | }, |
8000 | #endif | 7996 | #endif |
7997 | { } /* terminate */ | ||
8001 | }; | 7998 | }; |
8002 | 7999 | ||
8003 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
8004 | { | ||
8005 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
8006 | } | ||
8007 | |||
8008 | struct cgroup_subsys cpu_cgroup_subsys = { | 8000 | struct cgroup_subsys cpu_cgroup_subsys = { |
8009 | .name = "cpu", | 8001 | .name = "cpu", |
8010 | .create = cpu_cgroup_create, | 8002 | .create = cpu_cgroup_create, |
@@ -8012,8 +8004,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8012 | .can_attach = cpu_cgroup_can_attach, | 8004 | .can_attach = cpu_cgroup_can_attach, |
8013 | .attach = cpu_cgroup_attach, | 8005 | .attach = cpu_cgroup_attach, |
8014 | .exit = cpu_cgroup_exit, | 8006 | .exit = cpu_cgroup_exit, |
8015 | .populate = cpu_cgroup_populate, | ||
8016 | .subsys_id = cpu_cgroup_subsys_id, | 8007 | .subsys_id = cpu_cgroup_subsys_id, |
8008 | .base_cftypes = cpu_files, | ||
8017 | .early_init = 1, | 8009 | .early_init = 1, |
8018 | }; | 8010 | }; |
8019 | 8011 | ||
@@ -8198,13 +8190,9 @@ static struct cftype files[] = { | |||
8198 | .name = "stat", | 8190 | .name = "stat", |
8199 | .read_map = cpuacct_stats_show, | 8191 | .read_map = cpuacct_stats_show, |
8200 | }, | 8192 | }, |
8193 | { } /* terminate */ | ||
8201 | }; | 8194 | }; |
8202 | 8195 | ||
8203 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8204 | { | ||
8205 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8206 | } | ||
8207 | |||
8208 | /* | 8196 | /* |
8209 | * charge this task's execution time to its accounting group. | 8197 | * charge this task's execution time to its accounting group. |
8210 | * | 8198 | * |
@@ -8236,7 +8224,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8236 | .name = "cpuacct", | 8224 | .name = "cpuacct", |
8237 | .create = cpuacct_create, | 8225 | .create = cpuacct_create, |
8238 | .destroy = cpuacct_destroy, | 8226 | .destroy = cpuacct_destroy, |
8239 | .populate = cpuacct_populate, | ||
8240 | .subsys_id = cpuacct_subsys_id, | 8227 | .subsys_id = cpuacct_subsys_id, |
8228 | .base_cftypes = files, | ||
8241 | }; | 8229 | }; |
8242 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8230 | #endif /* CONFIG_CGROUP_CPUACCT */ |
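The same conversion applies to any controller: terminate the cftype array with an empty entry and hand it to the cgroup core via .base_cftypes instead of registering the files from a .populate callback. A hedged sketch for a hypothetical controller (the my_* names and handlers are illustrative only; the field layout is the one used in the hunks above):

    static struct cftype my_files[] = {
            {
                    .name = "stat",
                    .read_map = my_stats_show,      /* hypothetical handler */
            },
            { }     /* terminate */
    };

    struct cgroup_subsys my_subsys = {
            .name = "my",
            .create = my_create,                    /* hypothetical */
            .destroy = my_destroy,                  /* hypothetical */
            .subsys_id = my_subsys_id,              /* hypothetical */
            .base_cftypes = my_files,               /* replaces .populate */
    };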
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895ea..ee376beedaf9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -3,16 +3,357 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | 4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> |
5 | * | 5 | * |
6 | * This defines a simple but solid secure-computing mode. | 6 | * Copyright (C) 2012 Google, Inc. |
7 | * Will Drewry <wad@chromium.org> | ||
8 | * | ||
9 | * This defines a simple but solid secure-computing facility. | ||
10 | * | ||
11 | * Mode 1 uses a fixed list of allowed system calls. | ||
12 | * Mode 2 allows user-defined system call filters in the form | ||
13 | * of Berkeley Packet Filters/Linux Socket Filters. | ||
7 | */ | 14 | */ |
8 | 15 | ||
16 | #include <linux/atomic.h> | ||
9 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
10 | #include <linux/seccomp.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/sched.h> | ||
20 | #include <linux/seccomp.h> | ||
13 | 21 | ||
14 | /* #define SECCOMP_DEBUG 1 */ | 22 | /* #define SECCOMP_DEBUG 1 */ |
15 | #define NR_SECCOMP_MODES 1 | 23 | |
24 | #ifdef CONFIG_SECCOMP_FILTER | ||
25 | #include <asm/syscall.h> | ||
26 | #include <linux/filter.h> | ||
27 | #include <linux/ptrace.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/tracehook.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | |||
33 | /** | ||
34 | * struct seccomp_filter - container for seccomp BPF programs | ||
35 | * | ||
36 | * @usage: reference count to manage the object lifetime. | ||
37 | * get/put helpers should be used when accessing an instance | ||
38 | * outside of a lifetime-guarded section. In general, this | ||
39 | * is only needed for handling filters shared across tasks. | ||
40 | * @prev: points to a previously installed, or inherited, filter | ||
41 | * @len: the number of instructions in the program | ||
42 | * @insns: the BPF program instructions to evaluate | ||
43 | * | ||
44 | * seccomp_filter objects are organized in a tree linked via the @prev | ||
45 | * pointer. For any task, it appears to be a singly-linked list starting | ||
46 | * with current->seccomp.filter, the most recently attached or inherited filter. | ||
47 | * However, multiple filters may share a @prev node, by way of fork(), which | ||
48 | * results in a unidirectional tree existing in memory. This is similar to | ||
49 | * how namespaces work. | ||
50 | * | ||
51 | * seccomp_filter objects should never be modified after being attached | ||
52 | * to a task_struct (other than @usage). | ||
53 | */ | ||
54 | struct seccomp_filter { | ||
55 | atomic_t usage; | ||
56 | struct seccomp_filter *prev; | ||
57 | unsigned short len; /* Instruction count */ | ||
58 | struct sock_filter insns[]; | ||
59 | }; | ||
60 | |||
61 | /* Limit any path through the tree to 256KB worth of instructions. */ | ||
62 | #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) | ||
63 | |||
64 | /** | ||
65 | * get_u32 - returns one of the two 32-bit halves of @data | ||
66 | * @data: an unsigned 64-bit value | ||
67 | * @index: 0 or 1 to return the first or second 32-bits | ||
68 | * | ||
69 | * This inline exists to hide the length of unsigned long. If a 32-bit | ||
70 | * unsigned long is passed in, it will be extended and the top 32-bits will be | ||
71 | * 0. If it is a 64-bit unsigned long, then whatever data is resident will be | ||
72 | * properly returned. | ||
73 | * | ||
74 | * Endianness is explicitly ignored and left for BPF program authors to manage | ||
75 | * as per the specific architecture. | ||
76 | */ | ||
77 | static inline u32 get_u32(u64 data, int index) | ||
78 | { | ||
79 | return ((u32 *)&data)[index]; | ||
80 | } | ||
81 | |||
82 | /* Helper for bpf_load below. */ | ||
83 | #define BPF_DATA(_name) offsetof(struct seccomp_data, _name) | ||
84 | /** | ||
85 | * seccomp_bpf_load - checks and returns the 32 bits of seccomp data at @off | ||
86 | * @off: offset into struct seccomp_data to load from | ||
87 | * | ||
88 | * Returns the requested 32-bits of data. | ||
89 | * seccomp_check_filter() should assure that @off is 32-bit aligned | ||
90 | * and not out of bounds. Failure to do so is a BUG. | ||
91 | */ | ||
92 | u32 seccomp_bpf_load(int off) | ||
93 | { | ||
94 | struct pt_regs *regs = task_pt_regs(current); | ||
95 | if (off == BPF_DATA(nr)) | ||
96 | return syscall_get_nr(current, regs); | ||
97 | if (off == BPF_DATA(arch)) | ||
98 | return syscall_get_arch(current, regs); | ||
99 | if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { | ||
100 | unsigned long value; | ||
101 | int arg = (off - BPF_DATA(args[0])) / sizeof(u64); | ||
102 | int index = !!(off % sizeof(u64)); | ||
103 | syscall_get_arguments(current, regs, arg, 1, &value); | ||
104 | return get_u32(value, index); | ||
105 | } | ||
106 | if (off == BPF_DATA(instruction_pointer)) | ||
107 | return get_u32(KSTK_EIP(current), 0); | ||
108 | if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) | ||
109 | return get_u32(KSTK_EIP(current), 1); | ||
110 | /* seccomp_check_filter should make this impossible. */ | ||
111 | BUG(); | ||
112 | } | ||
113 | |||
114 | /** | ||
115 | * seccomp_check_filter - verify seccomp filter code | ||
116 | * @filter: filter to verify | ||
117 | * @flen: length of filter | ||
118 | * | ||
119 | * Takes a previously checked filter (by sk_chk_filter) and | ||
120 | * redirects all filter code that loads struct sk_buff data | ||
121 | * and related data through seccomp_bpf_load. It also | ||
122 | * enforces length and alignment checking of those loads. | ||
123 | * | ||
124 | * Returns 0 if the rule set is legal or -EINVAL if not. | ||
125 | */ | ||
126 | static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | ||
127 | { | ||
128 | int pc; | ||
129 | for (pc = 0; pc < flen; pc++) { | ||
130 | struct sock_filter *ftest = &filter[pc]; | ||
131 | u16 code = ftest->code; | ||
132 | u32 k = ftest->k; | ||
133 | |||
134 | switch (code) { | ||
135 | case BPF_S_LD_W_ABS: | ||
136 | ftest->code = BPF_S_ANC_SECCOMP_LD_W; | ||
137 | /* 32-bit aligned and not out of bounds. */ | ||
138 | if (k >= sizeof(struct seccomp_data) || k & 3) | ||
139 | return -EINVAL; | ||
140 | continue; | ||
141 | case BPF_S_LD_W_LEN: | ||
142 | ftest->code = BPF_S_LD_IMM; | ||
143 | ftest->k = sizeof(struct seccomp_data); | ||
144 | continue; | ||
145 | case BPF_S_LDX_W_LEN: | ||
146 | ftest->code = BPF_S_LDX_IMM; | ||
147 | ftest->k = sizeof(struct seccomp_data); | ||
148 | continue; | ||
149 | /* Explicitly include allowed calls. */ | ||
150 | case BPF_S_RET_K: | ||
151 | case BPF_S_RET_A: | ||
152 | case BPF_S_ALU_ADD_K: | ||
153 | case BPF_S_ALU_ADD_X: | ||
154 | case BPF_S_ALU_SUB_K: | ||
155 | case BPF_S_ALU_SUB_X: | ||
156 | case BPF_S_ALU_MUL_K: | ||
157 | case BPF_S_ALU_MUL_X: | ||
158 | case BPF_S_ALU_DIV_X: | ||
159 | case BPF_S_ALU_AND_K: | ||
160 | case BPF_S_ALU_AND_X: | ||
161 | case BPF_S_ALU_OR_K: | ||
162 | case BPF_S_ALU_OR_X: | ||
163 | case BPF_S_ALU_LSH_K: | ||
164 | case BPF_S_ALU_LSH_X: | ||
165 | case BPF_S_ALU_RSH_K: | ||
166 | case BPF_S_ALU_RSH_X: | ||
167 | case BPF_S_ALU_NEG: | ||
168 | case BPF_S_LD_IMM: | ||
169 | case BPF_S_LDX_IMM: | ||
170 | case BPF_S_MISC_TAX: | ||
171 | case BPF_S_MISC_TXA: | ||
172 | case BPF_S_ALU_DIV_K: | ||
173 | case BPF_S_LD_MEM: | ||
174 | case BPF_S_LDX_MEM: | ||
175 | case BPF_S_ST: | ||
176 | case BPF_S_STX: | ||
177 | case BPF_S_JMP_JA: | ||
178 | case BPF_S_JMP_JEQ_K: | ||
179 | case BPF_S_JMP_JEQ_X: | ||
180 | case BPF_S_JMP_JGE_K: | ||
181 | case BPF_S_JMP_JGE_X: | ||
182 | case BPF_S_JMP_JGT_K: | ||
183 | case BPF_S_JMP_JGT_X: | ||
184 | case BPF_S_JMP_JSET_K: | ||
185 | case BPF_S_JMP_JSET_X: | ||
186 | continue; | ||
187 | default: | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * seccomp_run_filters - evaluates all seccomp filters against @syscall | ||
196 | * @syscall: number of the current system call | ||
197 | * | ||
198 | * Returns valid seccomp BPF response codes. | ||
199 | */ | ||
200 | static u32 seccomp_run_filters(int syscall) | ||
201 | { | ||
202 | struct seccomp_filter *f; | ||
203 | u32 ret = SECCOMP_RET_ALLOW; | ||
204 | |||
205 | /* Ensure unexpected behavior doesn't result in failing open. */ | ||
206 | if (WARN_ON(current->seccomp.filter == NULL)) | ||
207 | return SECCOMP_RET_KILL; | ||
208 | |||
209 | /* | ||
210 | * All filters in the list are evaluated and the lowest BPF return | ||
211 | * value always takes priority (ignoring the DATA). | ||
212 | */ | ||
213 | for (f = current->seccomp.filter; f; f = f->prev) { | ||
214 | u32 cur_ret = sk_run_filter(NULL, f->insns); | ||
215 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | ||
216 | ret = cur_ret; | ||
217 | } | ||
218 | return ret; | ||
219 | } | ||
220 | |||
221 | /** | ||
222 | * seccomp_attach_filter: Attaches a seccomp filter to current. | ||
223 | * @fprog: BPF program to install | ||
224 | * | ||
225 | * Returns 0 on success or an errno on failure. | ||
226 | */ | ||
227 | static long seccomp_attach_filter(struct sock_fprog *fprog) | ||
228 | { | ||
229 | struct seccomp_filter *filter; | ||
230 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | ||
231 | unsigned long total_insns = fprog->len; | ||
232 | long ret; | ||
233 | |||
234 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | ||
235 | return -EINVAL; | ||
236 | |||
237 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | ||
238 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | ||
239 | if (total_insns > MAX_INSNS_PER_PATH) | ||
240 | return -ENOMEM; | ||
241 | |||
242 | /* | ||
243 | * Installing a seccomp filter requires that the task have | ||
244 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | ||
245 | * This avoids scenarios where unprivileged tasks can affect the | ||
246 | * behavior of privileged children. | ||
247 | */ | ||
248 | if (!current->no_new_privs && | ||
249 | security_capable_noaudit(current_cred(), current_user_ns(), | ||
250 | CAP_SYS_ADMIN) != 0) | ||
251 | return -EACCES; | ||
252 | |||
253 | /* Allocate a new seccomp_filter */ | ||
254 | filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, | ||
255 | GFP_KERNEL|__GFP_NOWARN); | ||
256 | if (!filter) | ||
257 | return -ENOMEM; | ||
258 | atomic_set(&filter->usage, 1); | ||
259 | filter->len = fprog->len; | ||
260 | |||
261 | /* Copy the instructions from fprog. */ | ||
262 | ret = -EFAULT; | ||
263 | if (copy_from_user(filter->insns, fprog->filter, fp_size)) | ||
264 | goto fail; | ||
265 | |||
266 | /* Check and rewrite the fprog via the skb checker */ | ||
267 | ret = sk_chk_filter(filter->insns, filter->len); | ||
268 | if (ret) | ||
269 | goto fail; | ||
270 | |||
271 | /* Check and rewrite the fprog for seccomp use */ | ||
272 | ret = seccomp_check_filter(filter->insns, filter->len); | ||
273 | if (ret) | ||
274 | goto fail; | ||
275 | |||
276 | /* | ||
277 | * If there is an existing filter, make it the prev and don't drop its | ||
278 | * task reference. | ||
279 | */ | ||
280 | filter->prev = current->seccomp.filter; | ||
281 | current->seccomp.filter = filter; | ||
282 | return 0; | ||
283 | fail: | ||
284 | kfree(filter); | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | ||
290 | * @user_filter: pointer to the user data containing a sock_fprog. | ||
291 | * | ||
292 | * Returns 0 on success and non-zero otherwise. | ||
293 | */ | ||
294 | long seccomp_attach_user_filter(char __user *user_filter) | ||
295 | { | ||
296 | struct sock_fprog fprog; | ||
297 | long ret = -EFAULT; | ||
298 | |||
299 | #ifdef CONFIG_COMPAT | ||
300 | if (is_compat_task()) { | ||
301 | struct compat_sock_fprog fprog32; | ||
302 | if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) | ||
303 | goto out; | ||
304 | fprog.len = fprog32.len; | ||
305 | fprog.filter = compat_ptr(fprog32.filter); | ||
306 | } else /* falls through to the if below. */ | ||
307 | #endif | ||
308 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | ||
309 | goto out; | ||
310 | ret = seccomp_attach_filter(&fprog); | ||
311 | out: | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | ||
316 | void get_seccomp_filter(struct task_struct *tsk) | ||
317 | { | ||
318 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
319 | if (!orig) | ||
320 | return; | ||
321 | /* Reference count is bounded by the number of total processes. */ | ||
322 | atomic_inc(&orig->usage); | ||
323 | } | ||
324 | |||
325 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | ||
326 | void put_seccomp_filter(struct task_struct *tsk) | ||
327 | { | ||
328 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
329 | /* Clean up single-reference branches iteratively. */ | ||
330 | while (orig && atomic_dec_and_test(&orig->usage)) { | ||
331 | struct seccomp_filter *freeme = orig; | ||
332 | orig = orig->prev; | ||
333 | kfree(freeme); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * seccomp_send_sigsys - signals the task to allow in-process syscall emulation | ||
339 | * @syscall: syscall number to send to userland | ||
340 | * @reason: filter-supplied reason code to send to userland (via si_errno) | ||
341 | * | ||
342 | * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. | ||
343 | */ | ||
344 | static void seccomp_send_sigsys(int syscall, int reason) | ||
345 | { | ||
346 | struct siginfo info; | ||
347 | memset(&info, 0, sizeof(info)); | ||
348 | info.si_signo = SIGSYS; | ||
349 | info.si_code = SYS_SECCOMP; | ||
350 | info.si_call_addr = (void __user *)KSTK_EIP(current); | ||
351 | info.si_errno = reason; | ||
352 | info.si_arch = syscall_get_arch(current, task_pt_regs(current)); | ||
353 | info.si_syscall = syscall; | ||
354 | force_sig_info(SIGSYS, &info, current); | ||
355 | } | ||
356 | #endif /* CONFIG_SECCOMP_FILTER */ | ||
16 | 357 | ||
17 | /* | 358 | /* |
18 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 359 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { | |||
31 | }; | 372 | }; |
32 | #endif | 373 | #endif |
33 | 374 | ||
34 | void __secure_computing(int this_syscall) | 375 | int __secure_computing(int this_syscall) |
35 | { | 376 | { |
36 | int mode = current->seccomp.mode; | 377 | int mode = current->seccomp.mode; |
37 | int * syscall; | 378 | int exit_sig = 0; |
379 | int *syscall; | ||
380 | u32 ret; | ||
38 | 381 | ||
39 | switch (mode) { | 382 | switch (mode) { |
40 | case 1: | 383 | case SECCOMP_MODE_STRICT: |
41 | syscall = mode1_syscalls; | 384 | syscall = mode1_syscalls; |
42 | #ifdef CONFIG_COMPAT | 385 | #ifdef CONFIG_COMPAT |
43 | if (is_compat_task()) | 386 | if (is_compat_task()) |
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) | |||
45 | #endif | 388 | #endif |
46 | do { | 389 | do { |
47 | if (*syscall == this_syscall) | 390 | if (*syscall == this_syscall) |
48 | return; | 391 | return 0; |
49 | } while (*++syscall); | 392 | } while (*++syscall); |
393 | exit_sig = SIGKILL; | ||
394 | ret = SECCOMP_RET_KILL; | ||
395 | break; | ||
396 | #ifdef CONFIG_SECCOMP_FILTER | ||
397 | case SECCOMP_MODE_FILTER: { | ||
398 | int data; | ||
399 | ret = seccomp_run_filters(this_syscall); | ||
400 | data = ret & SECCOMP_RET_DATA; | ||
401 | ret &= SECCOMP_RET_ACTION; | ||
402 | switch (ret) { | ||
403 | case SECCOMP_RET_ERRNO: | ||
404 | /* Set the low-order 16 bits as an errno. */ | ||
405 | syscall_set_return_value(current, task_pt_regs(current), | ||
406 | -data, 0); | ||
407 | goto skip; | ||
408 | case SECCOMP_RET_TRAP: | ||
409 | /* Show the handler the original registers. */ | ||
410 | syscall_rollback(current, task_pt_regs(current)); | ||
411 | /* Let the filter pass back 16 bits of data. */ | ||
412 | seccomp_send_sigsys(this_syscall, data); | ||
413 | goto skip; | ||
414 | case SECCOMP_RET_TRACE: | ||
415 | /* Skip these calls if there is no tracer. */ | ||
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | ||
417 | goto skip; | ||
418 | /* Allow the BPF to provide the event message */ | ||
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
420 | /* | ||
421 | * The delivery of a fatal signal during event | ||
422 | * notification may silently skip tracer notification. | ||
423 | * Terminating the task now avoids executing a system | ||
424 | * call that may not be intended. | ||
425 | */ | ||
426 | if (fatal_signal_pending(current)) | ||
427 | break; | ||
428 | return 0; | ||
429 | case SECCOMP_RET_ALLOW: | ||
430 | return 0; | ||
431 | case SECCOMP_RET_KILL: | ||
432 | default: | ||
433 | break; | ||
434 | } | ||
435 | exit_sig = SIGSYS; | ||
50 | break; | 436 | break; |
437 | } | ||
438 | #endif | ||
51 | default: | 439 | default: |
52 | BUG(); | 440 | BUG(); |
53 | } | 441 | } |
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) | |||
55 | #ifdef SECCOMP_DEBUG | 443 | #ifdef SECCOMP_DEBUG |
56 | dump_stack(); | 444 | dump_stack(); |
57 | #endif | 445 | #endif |
58 | audit_seccomp(this_syscall); | 446 | audit_seccomp(this_syscall, exit_sig, ret); |
59 | do_exit(SIGKILL); | 447 | do_exit(exit_sig); |
448 | #ifdef CONFIG_SECCOMP_FILTER | ||
449 | skip: | ||
450 | audit_seccomp(this_syscall, exit_sig, ret); | ||
451 | #endif | ||
452 | return -1; | ||
60 | } | 453 | } |
61 | 454 | ||
62 | long prctl_get_seccomp(void) | 455 | long prctl_get_seccomp(void) |
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void) | |||
64 | return current->seccomp.mode; | 457 | return current->seccomp.mode; |
65 | } | 458 | } |
66 | 459 | ||
67 | long prctl_set_seccomp(unsigned long seccomp_mode) | 460 | /** |
461 | * prctl_set_seccomp: configures current->seccomp.mode | ||
462 | * @seccomp_mode: requested mode to use | ||
463 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
464 | * | ||
465 | * This function may be called repeatedly with a @seccomp_mode of | ||
466 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | ||
467 | * successfully installed will be evaluated (in reverse order) for each system | ||
468 | * call the task makes. | ||
469 | * | ||
470 | * Once current->seccomp.mode is non-zero, it may not be changed. | ||
471 | * | ||
472 | * Returns 0 on success or -EINVAL on failure. | ||
473 | */ | ||
474 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | ||
68 | { | 475 | { |
69 | long ret; | 476 | long ret = -EINVAL; |
70 | 477 | ||
71 | /* can set it only once to be even more secure */ | 478 | if (current->seccomp.mode && |
72 | ret = -EPERM; | 479 | current->seccomp.mode != seccomp_mode) |
73 | if (unlikely(current->seccomp.mode)) | ||
74 | goto out; | 480 | goto out; |
75 | 481 | ||
76 | ret = -EINVAL; | 482 | switch (seccomp_mode) { |
77 | if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { | 483 | case SECCOMP_MODE_STRICT: |
78 | current->seccomp.mode = seccomp_mode; | 484 | ret = 0; |
79 | set_thread_flag(TIF_SECCOMP); | ||
80 | #ifdef TIF_NOTSC | 485 | #ifdef TIF_NOTSC |
81 | disable_TSC(); | 486 | disable_TSC(); |
82 | #endif | 487 | #endif |
83 | ret = 0; | 488 | break; |
489 | #ifdef CONFIG_SECCOMP_FILTER | ||
490 | case SECCOMP_MODE_FILTER: | ||
491 | ret = seccomp_attach_user_filter(filter); | ||
492 | if (ret) | ||
493 | goto out; | ||
494 | break; | ||
495 | #endif | ||
496 | default: | ||
497 | goto out; | ||
84 | } | 498 | } |
85 | 499 | ||
86 | out: | 500 | current->seccomp.mode = seccomp_mode; |
501 | set_thread_flag(TIF_SECCOMP); | ||
502 | out: | ||
87 | return ret; | 503 | return ret; |
88 | } | 504 | } |
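From userland, the filter mode added above is presumably reached through prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog), with no_new_privs set first when the caller lacks CAP_SYS_ADMIN, as seccomp_attach_filter() requires. A minimal x86-64 sketch, assuming the userspace headers installed by this series export struct seccomp_data and the SECCOMP_* constants (PR_SET_NO_NEW_PRIVS is defined locally because older libc headers may lack it):

    #include <errno.h>
    #include <stddef.h>
    #include <unistd.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <linux/audit.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    #ifndef PR_SET_NO_NEW_PRIVS
    #define PR_SET_NO_NEW_PRIVS 38
    #endif

    int main(void)
    {
            struct sock_filter insns[] = {
                    /* Kill outright if the syscall ABI is not the one this
                     * filter was written against. */
                    BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                             offsetof(struct seccomp_data, arch)),
                    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
                    /* Fail getpid() with EPERM, allow everything else. */
                    BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                             offsetof(struct seccomp_data, nr)),
                    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
            };
            struct sock_fprog prog = {
                    .len = sizeof(insns) / sizeof(insns[0]),
                    .filter = insns,
            };

            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                    return 1;
            if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
                    return 1;
            if (syscall(__NR_getpid) == -1 && errno == EPERM)
                    write(1, "getpid blocked\n", 15);
            return 0;
    }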
diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d0..1a006b5d9d9d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -160,7 +160,7 @@ void recalc_sigpending(void) | |||
160 | 160 | ||
161 | #define SYNCHRONOUS_MASK \ | 161 | #define SYNCHRONOUS_MASK \ |
162 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | 162 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ |
163 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | 163 | sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) |
164 | 164 | ||
165 | int next_signal(struct sigpending *pending, sigset_t *mask) | 165 | int next_signal(struct sigpending *pending, sigset_t *mask) |
166 | { | 166 | { |
@@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2706 | err |= __put_user(from->si_uid, &to->si_uid); | 2706 | err |= __put_user(from->si_uid, &to->si_uid); |
2707 | err |= __put_user(from->si_ptr, &to->si_ptr); | 2707 | err |= __put_user(from->si_ptr, &to->si_ptr); |
2708 | break; | 2708 | break; |
2709 | #ifdef __ARCH_SIGSYS | ||
2710 | case __SI_SYS: | ||
2711 | err |= __put_user(from->si_call_addr, &to->si_call_addr); | ||
2712 | err |= __put_user(from->si_syscall, &to->si_syscall); | ||
2713 | err |= __put_user(from->si_arch, &to->si_arch); | ||
2714 | break; | ||
2715 | #endif | ||
2709 | default: /* this is just in case for now ... */ | 2716 | default: /* this is just in case for now ... */ |
2710 | err |= __put_user(from->si_pid, &to->si_pid); | 2717 | err |= __put_user(from->si_pid, &to->si_pid); |
2711 | err |= __put_user(from->si_uid, &to->si_uid); | 2718 | err |= __put_user(from->si_uid, &to->si_uid); |
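When a filter returns SECCOMP_RET_TRAP, the task receives the SIGSYS built in seccomp_send_sigsys(), and the three fields copied above become visible to an SA_SIGINFO handler. A hedged sketch follows; whether your libc's siginfo_t already exposes si_syscall/si_call_addr/si_arch under these names is an assumption, and SYS_SECCOMP is defined locally in case the headers predate this series:

    #include <signal.h>
    #include <string.h>

    #ifndef SYS_SECCOMP
    #define SYS_SECCOMP 1   /* si_code set by seccomp_send_sigsys() */
    #endif

    static volatile sig_atomic_t my_trapped_nr = -1;

    /* Only async-signal-safe work here: record which syscall was trapped. */
    static void my_sigsys_handler(int sig, siginfo_t *info, void *ucontext)
    {
            if (info->si_code == SYS_SECCOMP)
                    my_trapped_nr = info->si_syscall;
    }

    static void my_install_sigsys_handler(void)
    {
            struct sigaction act;

            memset(&act, 0, sizeof(act));
            act.sa_sigaction = my_sigsys_handler;
            act.sa_flags = SA_SIGINFO;
            sigaction(SIGSYS, &act, NULL);
    }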
diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf759..d0ae5b24875e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,8 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #include "smpboot.h" | ||
17 | |||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
17 | static struct { | 19 | static struct { |
18 | struct list_head queue; | 20 | struct list_head queue; |
@@ -669,6 +671,8 @@ void __init smp_init(void) | |||
669 | { | 671 | { |
670 | unsigned int cpu; | 672 | unsigned int cpu; |
671 | 673 | ||
674 | idle_threads_init(); | ||
675 | |||
672 | /* FIXME: This should be done in userspace --RR */ | 676 | /* FIXME: This should be done in userspace --RR */ |
673 | for_each_present_cpu(cpu) { | 677 | for_each_present_cpu(cpu) { |
674 | if (num_online_cpus() >= setup_max_cpus) | 678 | if (num_online_cpus() >= setup_max_cpus) |
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
791 | } | 795 | } |
792 | } | 796 | } |
793 | EXPORT_SYMBOL(on_each_cpu_cond); | 797 | EXPORT_SYMBOL(on_each_cpu_cond); |
798 | |||
799 | static void do_nothing(void *unused) | ||
800 | { | ||
801 | } | ||
802 | |||
803 | /** | ||
804 | * kick_all_cpus_sync - Force all cpus out of idle | ||
805 | * | ||
806 | * Used to synchronize the update of the pm_idle function pointer. It's | ||
807 | * called after the pointer is updated and returns after the dummy | ||
808 | * callback function has been executed on all cpus. The execution of | ||
809 | * the function can only happen on the remote cpus after they have | ||
810 | * left the idle function which had been called via the pm_idle | ||
811 | * function pointer. So it is guaranteed that nothing uses the previous | ||
812 | * pointer anymore. | ||
813 | */ | ||
814 | void kick_all_cpus_sync(void) | ||
815 | { | ||
816 | /* Make sure the change is visible before we kick the cpus */ | ||
817 | smp_mb(); | ||
818 | smp_call_function(do_nothing, NULL, 1); | ||
819 | } | ||
820 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | ||
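The comment spells out the intended pattern; a hedged kernel-side sketch (hypothetical my_* names, not part of this diff; the extern declaration of pm_idle is assumed to be visible via <linux/pm.h> at this point in time):

    extern void (*pm_idle)(void);       /* assumed declaration */

    void my_switch_idle_routine(void (*my_new_idle)(void))
    {
            pm_idle = my_new_idle;      /* publish the new routine */
            /* Implies the needed smp_mb() and returns only after every cpu
             * has run the dummy IPI, i.e. has left the old idle routine. */
            kick_all_cpus_sync();
            /* The old idle implementation may now be freed or unloaded. */
    }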
diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 000000000000..e1a797e028a3 --- /dev/null +++ b/kernel/smpboot.c | |||
@@ -0,0 +1,62 @@ | |||
1 | /* | ||
2 | * Common SMP CPU bringup/teardown functions | ||
3 | */ | ||
4 | #include <linux/err.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/percpu.h> | ||
9 | |||
10 | #include "smpboot.h" | ||
11 | |||
12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
13 | /* | ||
14 | * For the hotplug case we keep the task structs around and reuse | ||
15 | * them. | ||
16 | */ | ||
17 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); | ||
18 | |||
19 | struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) | ||
20 | { | ||
21 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
22 | |||
23 | if (!tsk) | ||
24 | return ERR_PTR(-ENOMEM); | ||
25 | init_idle(tsk, cpu); | ||
26 | return tsk; | ||
27 | } | ||
28 | |||
29 | void __init idle_thread_set_boot_cpu(void) | ||
30 | { | ||
31 | per_cpu(idle_threads, smp_processor_id()) = current; | ||
32 | } | ||
33 | |||
34 | static inline void idle_init(unsigned int cpu) | ||
35 | { | ||
36 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
37 | |||
38 | if (!tsk) { | ||
39 | tsk = fork_idle(cpu); | ||
40 | if (IS_ERR(tsk)) | ||
41 | pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); | ||
42 | else | ||
43 | per_cpu(idle_threads, cpu) = tsk; | ||
44 | } | ||
45 | } | ||
46 | |||
47 | /** | ||
48 | * idle_threads_init - Initialize idle threads for all cpus other | ||
49 | * than the boot cpu | ||
50 | * | ||
51 | * Creates each idle thread if it does not already exist. | ||
52 | */ | ||
53 | void __init idle_threads_init(void) | ||
54 | { | ||
55 | unsigned int cpu; | ||
56 | |||
57 | for_each_possible_cpu(cpu) { | ||
58 | if (cpu != smp_processor_id()) | ||
59 | idle_init(cpu); | ||
60 | } | ||
61 | } | ||
62 | #endif | ||
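A hedged sketch of a consumer (hypothetical caller, not from this diff): a hotplug path that reuses the cached idle task instead of forking a fresh one on every bringup. The two-argument __cpu_up() is assumed to be the arch hook this series converts to.

    static int my_bringup_cpu(unsigned int cpu)
    {
            struct task_struct *idle = idle_thread_get(cpu);

            if (IS_ERR(idle))
                    return PTR_ERR(idle);   /* idle_init() failed earlier */
            return __cpu_up(cpu, idle);     /* assumed arch hook signature */
    }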
diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 000000000000..80c0acfb8472 --- /dev/null +++ b/kernel/smpboot.h | |||
@@ -0,0 +1,18 @@ | |||
1 | #ifndef SMPBOOT_H | ||
2 | #define SMPBOOT_H | ||
3 | |||
4 | struct task_struct; | ||
5 | |||
6 | int smpboot_prepare(unsigned int cpu); | ||
7 | |||
8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
9 | struct task_struct *idle_thread_get(unsigned int cpu); | ||
10 | void idle_thread_set_boot_cpu(void); | ||
11 | void idle_threads_init(void); | ||
12 | #else | ||
13 | static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } | ||
14 | static inline void idle_thread_set_boot_cpu(void) { } | ||
15 | static inline void idle_threads_init(void) { } | ||
16 | #endif | ||
17 | |||
18 | #endif | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f4..2095be3318d5 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -34,10 +34,77 @@ | |||
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | /* | ||
38 | * Initialize an rcu_batch structure to empty. | ||
39 | */ | ||
40 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
41 | { | ||
42 | b->head = NULL; | ||
43 | b->tail = &b->head; | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
48 | */ | ||
49 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
50 | { | ||
51 | *b->tail = head; | ||
52 | b->tail = &head->next; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Is the specified rcu_batch structure empty? | ||
57 | */ | ||
58 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
59 | { | ||
60 | return b->tail == &b->head; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Remove the callback at the head of the specified rcu_batch structure | ||
65 | * and return a pointer to it, or return NULL if the structure is empty. | ||
66 | */ | ||
67 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
68 | { | ||
69 | struct rcu_head *head; | ||
70 | |||
71 | if (rcu_batch_empty(b)) | ||
72 | return NULL; | ||
73 | |||
74 | head = b->head; | ||
75 | b->head = head->next; | ||
76 | if (b->tail == &head->next) | ||
77 | rcu_batch_init(b); | ||
78 | |||
79 | return head; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
84 | * the structure specified by "to". | ||
85 | */ | ||
86 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
87 | { | ||
88 | if (!rcu_batch_empty(from)) { | ||
89 | *to->tail = from->head; | ||
90 | to->tail = from->tail; | ||
91 | rcu_batch_init(from); | ||
92 | } | ||
93 | } | ||
94 | |||
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 98 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | 99 | { |
39 | sp->completed = 0; | 100 | sp->completed = 0; |
40 | mutex_init(&sp->mutex); | 101 | spin_lock_init(&sp->queue_lock); |
102 | sp->running = false; | ||
103 | rcu_batch_init(&sp->batch_queue); | ||
104 | rcu_batch_init(&sp->batch_check0); | ||
105 | rcu_batch_init(&sp->batch_check1); | ||
106 | rcu_batch_init(&sp->batch_done); | ||
107 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | 108 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | 109 | return sp->per_cpu_ref ? 0 : -ENOMEM; |
43 | } | 110 | } |
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
73 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 140 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
74 | 141 | ||
75 | /* | 142 | /* |
76 | * srcu_readers_active_idx -- returns approximate number of readers | 143 | * Returns approximate total of the readers' ->seq[] values for the |
77 | * active on the specified rank of per-CPU counters. | 144 | * rank of per-CPU counters specified by idx. |
78 | */ | 145 | */ |
146 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
147 | { | ||
148 | int cpu; | ||
149 | unsigned long sum = 0; | ||
150 | unsigned long t; | ||
79 | 151 | ||
80 | static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | 152 | for_each_possible_cpu(cpu) { |
153 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
154 | sum += t; | ||
155 | } | ||
156 | return sum; | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Returns approximate number of readers active on the specified rank | ||
161 | * of the per-CPU ->c[] counters. | ||
162 | */ | ||
163 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
81 | { | 164 | { |
82 | int cpu; | 165 | int cpu; |
83 | int sum; | 166 | unsigned long sum = 0; |
167 | unsigned long t; | ||
84 | 168 | ||
85 | sum = 0; | 169 | for_each_possible_cpu(cpu) { |
86 | for_each_possible_cpu(cpu) | 170 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); |
87 | sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; | 171 | sum += t; |
172 | } | ||
88 | return sum; | 173 | return sum; |
89 | } | 174 | } |
90 | 175 | ||
176 | /* | ||
177 | * Return true if the number of pre-existing readers is determined to | ||
178 | * be stably zero. An example unstable zero can occur if the call | ||
179 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
180 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
181 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
182 | * time to sum the array, and might in fact be interrupted or preempted | ||
183 | * partway through the summation. | ||
184 | */ | ||
185 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
186 | { | ||
187 | unsigned long seq; | ||
188 | |||
189 | seq = srcu_readers_seq_idx(sp, idx); | ||
190 | |||
191 | /* | ||
192 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
193 | * __srcu_read_lock(). This pairing ensures that if an | ||
194 | * __srcu_read_lock() increments its counter after the summation | ||
195 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
196 | * critical section will see any changes made prior to the start | ||
197 | * of the current SRCU grace period. | ||
198 | * | ||
199 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
200 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
201 | * must see the increment of ->c[]. | ||
202 | */ | ||
203 | smp_mb(); /* A */ | ||
204 | |||
205 | /* | ||
206 | * Note that srcu_readers_active_idx() can incorrectly return | ||
207 | * zero even though there is a pre-existing reader throughout. | ||
208 | * To see this, suppose that task A is in a very long SRCU | ||
209 | * read-side critical section that started on CPU 0, and that | ||
210 | * no other reader exists, so that the sum of the counters | ||
211 | * is equal to one. Then suppose that task B starts executing | ||
212 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
213 | * task C starts reading on CPU 0, so that its increment is not | ||
214 | * summed, but finishes reading on CPU 2, so that its decrement | ||
215 | * -is- summed. Then when task B completes its sum, it will | ||
216 | * incorrectly get zero, despite the fact that task A has been | ||
217 | * in its SRCU read-side critical section the whole time. | ||
218 | * | ||
219 | * We therefore do a validation step should srcu_readers_active_idx() | ||
220 | * return zero. | ||
221 | */ | ||
222 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * The remainder of this function is the validation step. | ||
227 | * The following smp_mb() D pairs with the smp_mb() C in | ||
228 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
229 | * by srcu_readers_active_idx() above, then any destructive | ||
230 | * operation performed after the grace period will happen after | ||
231 | * the corresponding SRCU read-side critical section. | ||
232 | * | ||
233 | * Note that there can be at most NR_CPUS worth of readers using | ||
234 | * the old index, which is not enough to overflow even a 32-bit | ||
235 | * integer. (Yes, this does mean that systems having more than | ||
236 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
237 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
238 | * Therefore, the only way that the return values of the two | ||
239 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
240 | * no increments of the corresponding rank of ->seq[] counts | ||
241 | * in the interim. But the missed-increment scenario laid out | ||
242 | * above includes an increment of the ->seq[] counter by | ||
243 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
244 | * scenario occurs, the return values from the two calls to | ||
245 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
246 | * step below suffices. | ||
247 | */ | ||
248 | smp_mb(); /* D */ | ||
249 | |||
250 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
251 | } | ||
252 | |||
91 | /** | 253 | /** |
92 | * srcu_readers_active - returns approximate number of readers. | 254 | * srcu_readers_active - returns approximate number of readers. |
93 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 255 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | |||
98 | */ | 260 | */ |
99 | static int srcu_readers_active(struct srcu_struct *sp) | 261 | static int srcu_readers_active(struct srcu_struct *sp) |
100 | { | 262 | { |
101 | return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); | 263 | int cpu; |
264 | unsigned long sum = 0; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
268 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
269 | } | ||
270 | return sum; | ||
102 | } | 271 | } |
103 | 272 | ||
104 | /** | 273 | /** |
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
131 | int idx; | 300 | int idx; |
132 | 301 | ||
133 | preempt_disable(); | 302 | preempt_disable(); |
134 | idx = sp->completed & 0x1; | 303 | idx = rcu_dereference_index_check(sp->completed, |
135 | barrier(); /* ensure compiler looks -once- at sp->completed. */ | 304 | rcu_read_lock_sched_held()) & 0x1; |
136 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
137 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 306 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
307 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
138 | preempt_enable(); | 308 | preempt_enable(); |
139 | return idx; | 309 | return idx; |
140 | } | 310 | } |
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
149 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 319 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
150 | { | 320 | { |
151 | preempt_disable(); | 321 | preempt_disable(); |
152 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 322 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
153 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 323 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
154 | preempt_enable(); | 324 | preempt_enable(); |
155 | } | 325 | } |
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 326 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
163 | * we repeatedly block for 1-millisecond time periods. This approach | 333 | * we repeatedly block for 1-millisecond time periods. This approach |
164 | * has done well in testing, so there is no need for a config parameter. | 334 | * has done well in testing, so there is no need for a config parameter. |
165 | */ | 335 | */ |
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | 336 | #define SRCU_RETRY_CHECK_DELAY 5 |
337 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
338 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
167 | 339 | ||
168 | /* | 340 | /* |
169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 341 | * Wait until all pre-existing readers complete. Such readers |
342 | * will have used the index specified by "idx". | ||
343 | * The caller should ensure that ->completed is not changed while | ||
344 | * checking, and that idx = (->completed & 1) ^ 1. | ||
170 | */ | 345 | */ |
171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 346 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) |
172 | { | 347 | { |
173 | int idx; | 348 | for (;;) { |
174 | 349 | if (srcu_readers_active_idx_check(sp, idx)) | |
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 350 | return true; |
176 | !lock_is_held(&rcu_bh_lock_map) && | 351 | if (--trycount <= 0) |
177 | !lock_is_held(&rcu_lock_map) && | 352 | return false; |
178 | !lock_is_held(&rcu_sched_lock_map), | 353 | udelay(SRCU_RETRY_CHECK_DELAY); |
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 354 | } |
180 | 355 | } | |
181 | idx = sp->completed; | ||
182 | mutex_lock(&sp->mutex); | ||
183 | 356 | ||
184 | /* | 357 | /* |
185 | * Check to see if someone else did the work for us while we were | 358 | * Increment the ->completed counter so that future SRCU readers will |
186 | * waiting to acquire the lock. We need -two- advances of | 359 | * use the other rank of the ->c[] and ->seq[] arrays. This allows |
187 | * the counter, not just one. If there was but one, we might have | 360 | * us to wait for pre-existing readers in a starvation-free manner. |
188 | * shown up -after- our helper's first synchronize_sched(), thus | 361 | */ |
189 | * having failed to prevent CPU-reordering races with concurrent | 362 | static void srcu_flip(struct srcu_struct *sp) |
190 | * srcu_read_unlock()s on other CPUs (see comment below). So we | 363 | { |
191 | * either (1) wait for two or (2) supply the second ourselves. | 364 | sp->completed++; |
192 | */ | 365 | } |
193 | 366 | ||
194 | if ((sp->completed - idx) >= 2) { | 367 | /* |
195 | mutex_unlock(&sp->mutex); | 368 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
196 | return; | 369 | * initiating grace-period processing if it is not already running. |
370 | */ | ||
371 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
372 | void (*func)(struct rcu_head *head)) | ||
373 | { | ||
374 | unsigned long flags; | ||
375 | |||
376 | head->next = NULL; | ||
377 | head->func = func; | ||
378 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
379 | rcu_batch_queue(&sp->batch_queue, head); | ||
380 | if (!sp->running) { | ||
381 | sp->running = true; | ||
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | ||
197 | } | 383 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
385 | } | ||
386 | EXPORT_SYMBOL_GPL(call_srcu); | ||
198 | 387 | ||
199 | sync_func(); /* Force memory barrier on all CPUs. */ | 388 | struct rcu_synchronize { |
389 | struct rcu_head head; | ||
390 | struct completion completion; | ||
391 | }; | ||
200 | 392 | ||
201 | /* | 393 | /* |
202 | * The preceding synchronize_sched() ensures that any CPU that | 394 | * Awaken the corresponding synchronize_srcu() instance now that a |
203 | * sees the new value of sp->completed will also see any preceding | 395 | * grace period has elapsed. |
204 | * changes to data structures made by this CPU. This prevents | 396 | */ |
205 | * some other CPU from reordering the accesses in its SRCU | 397 | static void wakeme_after_rcu(struct rcu_head *head) |
206 | * read-side critical section to precede the corresponding | 398 | { |
207 | * srcu_read_lock() -- ensuring that such references will in | 399 | struct rcu_synchronize *rcu; |
208 | * fact be protected. | ||
209 | * | ||
210 | * So it is now safe to do the flip. | ||
211 | */ | ||
212 | 400 | ||
213 | idx = sp->completed & 0x1; | 401 | rcu = container_of(head, struct rcu_synchronize, head); |
214 | sp->completed++; | 402 | complete(&rcu->completion); |
403 | } | ||
215 | 404 | ||
216 | sync_func(); /* Force memory barrier on all CPUs. */ | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
406 | static void srcu_reschedule(struct srcu_struct *sp); | ||
217 | 407 | ||
218 | /* | 408 | /* |
219 | * At this point, because of the preceding synchronize_sched(), | 409 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
220 | * all srcu_read_lock() calls using the old counters have completed. | 410 | */ |
221 | * Their corresponding critical sections might well be still | 411 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
222 | * executing, but the srcu_read_lock() primitives themselves | 412 | { |
223 | * will have finished executing. We initially give readers | 413 | struct rcu_synchronize rcu; |
224 | * an arbitrarily chosen 10 microseconds to get out of their | 414 | struct rcu_head *head = &rcu.head; |
225 | * SRCU read-side critical sections, then loop waiting 1/HZ | 415 | bool done = false; |
226 | * seconds per iteration. The 10-microsecond value has done | ||
227 | * very well in testing. | ||
228 | */ | ||
229 | |||
230 | if (srcu_readers_active_idx(sp, idx)) | ||
231 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
232 | while (srcu_readers_active_idx(sp, idx)) | ||
233 | schedule_timeout_interruptible(1); | ||
234 | 416 | ||
235 | sync_func(); /* Force memory barrier on all CPUs. */ | 417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
418 | !lock_is_held(&rcu_bh_lock_map) && | ||
419 | !lock_is_held(&rcu_lock_map) && | ||
420 | !lock_is_held(&rcu_sched_lock_map), | ||
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
236 | 422 | ||
237 | /* | 423 | init_completion(&rcu.completion); |
238 | * The preceding synchronize_sched() forces all srcu_read_unlock() | 424 | |
239 | * primitives that were executing concurrently with the preceding | 425 | head->next = NULL; |
240 | * for_each_possible_cpu() loop to have completed by this point. | 426 | head->func = wakeme_after_rcu; |
241 | * More importantly, it also forces the corresponding SRCU read-side | 427 | spin_lock_irq(&sp->queue_lock); |
242 | * critical sections to have also completed, and the corresponding | 428 | if (!sp->running) { |
243 | * references to SRCU-protected data items to be dropped. | 429 | /* steal the processing owner */ |
244 | * | 430 | sp->running = true; |
245 | * Note: | 431 | rcu_batch_queue(&sp->batch_check0, head); |
246 | * | 432 | spin_unlock_irq(&sp->queue_lock); |
247 | * Despite what you might think at first glance, the | 433 | |
248 | * preceding synchronize_sched() -must- be within the | 434 | srcu_advance_batches(sp, trycount); |
249 | * critical section ended by the following mutex_unlock(). | 435 | if (!rcu_batch_empty(&sp->batch_done)) { |
250 | * Otherwise, a task taking the early exit can race | 436 | BUG_ON(sp->batch_done.head != head); |
251 | * with a srcu_read_unlock(), which might have executed | 437 | rcu_batch_dequeue(&sp->batch_done); |
252 | * just before the preceding srcu_readers_active() check, | 438 | done = true; |
253 | * and whose CPU might have reordered the srcu_read_unlock() | 439 | } |
254 | * with the preceding critical section. In this case, there | 440 | /* give the processing owner to work_struct */ |
255 | * is nothing preventing the synchronize_sched() task that is | 441 | srcu_reschedule(sp); |
256 | * taking the early exit from freeing a data structure that | 442 | } else { |
257 | * is still being referenced (out of order) by the task | 443 | rcu_batch_queue(&sp->batch_queue, head); |
258 | * doing the srcu_read_unlock(). | 444 | spin_unlock_irq(&sp->queue_lock); |
259 | * | 445 | } |
260 | * Alternatively, the comparison with "2" on the early exit | ||
261 | * could be changed to "3", but this increases synchronize_srcu() | ||
262 | * latency for bulk loads. So the current code is preferred. | ||
263 | */ | ||
264 | 446 | ||
265 | mutex_unlock(&sp->mutex); | 447 | if (!done) |
448 | wait_for_completion(&rcu.completion); | ||
266 | } | 449 | } |
267 | 450 | ||
268 | /** | 451 | /** |
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
281 | */ | 464 | */ |
282 | void synchronize_srcu(struct srcu_struct *sp) | 465 | void synchronize_srcu(struct srcu_struct *sp) |
283 | { | 466 | { |
284 | __synchronize_srcu(sp, synchronize_sched); | 467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
285 | } | 468 | } |
286 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 469 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
287 | 470 | ||
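call_srcu() gives SRCU its first asynchronous interface, with synchronize_srcu() above now built on the same callback machinery. A hedged usage sketch pairing it with the existing read-side primitives (hypothetical my_* names, kernel-side, not from this diff; the update side is assumed to hold its own lock):

    struct my_data {
            int val;
            struct rcu_head my_head;
    };

    static struct srcu_struct my_srcu;          /* init_srcu_struct() at setup */
    static struct my_data __rcu *my_ptr;

    static void my_reclaim(struct rcu_head *head)
    {
            kfree(container_of(head, struct my_data, my_head));
    }

    static int my_read(void)
    {
            int idx, val;

            idx = srcu_read_lock(&my_srcu);
            val = srcu_dereference(my_ptr, &my_srcu)->val;
            srcu_read_unlock(&my_srcu, idx);
            return val;
    }

    static void my_publish(struct my_data *newp)
    {
            struct my_data *old;

            old = rcu_dereference_protected(my_ptr, 1); /* update lock assumed */
            rcu_assign_pointer(my_ptr, newp);
            if (old)
                    call_srcu(&my_srcu, &old->my_head, my_reclaim); /* never blocks */
    }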
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
289 | * synchronize_srcu_expedited - Brute-force SRCU grace period | 472 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
290 | * @sp: srcu_struct with which to synchronize. | 473 | * @sp: srcu_struct with which to synchronize. |
291 | * | 474 | * |
292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" | 475 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
293 | * approach to force the grace period to end quickly. This consumes | 476 | * spinning rather than blocking when waiting. |
294 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
295 | * so is thus not recommended for any sort of common-case code. In fact, | ||
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
299 | * | 477 | * |
300 | * Note that it is illegal to call this function while holding any lock | 478 | * Note that it is illegal to call this function while holding any lock |
301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | 479 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
302 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
303 | * these restriction will result in deadlock. It is also illegal to call | ||
304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 480 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | 481 | * critical section; doing so will result in deadlock. However, it is |
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 482 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct |
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
309 | */ | 485 | */ |
310 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
311 | { | 487 | { |
312 | __synchronize_srcu(sp, synchronize_sched_expedited); | 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
313 | } | 489 | } |
314 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); |
315 | 491 | ||
316 | /** | 492 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
494 | */ | ||
495 | void srcu_barrier(struct srcu_struct *sp) | ||
496 | { | ||
497 | synchronize_srcu(sp); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
500 | |||
501 | /** | ||
317 | * srcu_batches_completed - return batches completed. | 502 | * srcu_batches_completed - return batches completed. |
318 | * @sp: srcu_struct on which to report batch completion. | 503 | * @sp: srcu_struct on which to report batch completion. |
319 | * | 504 | * |
320 | * Report the number of batches, correlated with, but not necessarily | 505 | * Report the number of batches, correlated with, but not necessarily |
321 | * precisely the same as, the number of grace periods that have elapsed. | 506 | * precisely the same as, the number of grace periods that have elapsed. |
322 | */ | 507 | */ |
323 | |||
324 | long srcu_batches_completed(struct srcu_struct *sp) | 508 | long srcu_batches_completed(struct srcu_struct *sp) |
325 | { | 509 | { |
326 | return sp->completed; | 510 | return sp->completed; |
327 | } | 511 | } |
328 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |
513 | |||
514 | #define SRCU_CALLBACK_BATCH 10 | ||
515 | #define SRCU_INTERVAL 1 | ||
516 | |||
517 | /* | ||
518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
519 | * period pipeline. | ||
520 | */ | ||
521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
522 | { | ||
523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
524 | spin_lock_irq(&sp->queue_lock); | ||
525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
526 | spin_unlock_irq(&sp->queue_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
533 | */ | ||
534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
535 | { | ||
536 | int idx = 1 ^ (sp->completed & 1); | ||
537 | |||
538 | /* | ||
539 | * Because readers might be delayed for an extended period after | ||
540 | * fetching ->completed for their index, at any point in time there | ||
541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
542 | * need to wait for readers to clear from both index values before | ||
543 | * invoking a callback. | ||
544 | */ | ||
545 | |||
546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
547 | rcu_batch_empty(&sp->batch_check1)) | ||
548 | return; /* no callbacks need to be advanced */ | ||
549 | |||
550 | if (!try_check_zero(sp, idx, trycount)) | ||
551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
552 | |||
553 | /* | ||
554 | * The callbacks in ->batch_check1 already completed their first zero | ||
555 | * check and flip, back when they were enqueued on | ||
556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
557 | * (Presumably try_check_zero() returned false during that | ||
558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
560 | */ | ||
561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
562 | |||
563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
564 | return; /* no callbacks need to be advanced */ | ||
565 | srcu_flip(sp); | ||
566 | |||
567 | /* | ||
568 | * The callbacks in ->batch_check0 just finished their | ||
569 | * first zero check and flip, so move them to ->batch_check1 | ||
570 | * for future checking on the other idx. | ||
571 | */ | ||
572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
573 | |||
574 | /* | ||
575 | * SRCU read-side critical sections are normally short, so check | ||
576 | * at least twice in quick succession after a flip. | ||
577 | */ | ||
578 | trycount = trycount < 2 ? 2 : trycount; | ||
579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
581 | |||
582 | /* | ||
583 | * The callbacks in ->batch_check1 have now waited for all | ||
584 | * pre-existing readers using both idx values. They are therefore | ||
585 | * ready to invoke, so move them to ->batch_done. | ||
586 | */ | ||
587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
588 | } | ||
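For intuition about the two-pass check above, here is a minimal reader-side sketch; it is not part of the patch, and the names my_srcu, my_data and gp are invented. The index a reader samples from ->completed can already be one flip stale by the time its per-CPU counter is incremented, which is why srcu_advance_batches() must see both idx values drain before callbacks reach ->batch_done.

    #include <linux/srcu.h>

    struct my_data { int x; };
    static struct my_data __rcu *gp;        /* hypothetical protected pointer */
    static struct srcu_struct my_srcu;      /* init_srcu_struct() at init time */

    static int read_side(void)
    {
            struct my_data *p;
            int idx, val = -1;

            idx = srcu_read_lock(&my_srcu);     /* samples ->completed & 0x1 */
            p = srcu_dereference(gp, &my_srcu); /* may run long after a flip */
            if (p)
                    val = p->x;
            srcu_read_unlock(&my_srcu, idx);    /* decrements the sampled idx */
            return val;
    }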
589 | |||
590 | /* | ||
591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
592 | * their grace period. If there are more to do, SRCU will reschedule | ||
593 | * the workqueue. | ||
594 | */ | ||
595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
596 | { | ||
597 | int i; | ||
598 | struct rcu_head *head; | ||
599 | |||
600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
602 | if (!head) | ||
603 | break; | ||
604 | local_bh_disable(); | ||
605 | head->func(head); | ||
606 | local_bh_enable(); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Finished one round of SRCU grace period. Start another if there are | ||
612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
613 | */ | ||
614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
615 | { | ||
616 | bool pending = true; | ||
617 | |||
618 | if (rcu_batch_empty(&sp->batch_done) && | ||
619 | rcu_batch_empty(&sp->batch_check1) && | ||
620 | rcu_batch_empty(&sp->batch_check0) && | ||
621 | rcu_batch_empty(&sp->batch_queue)) { | ||
622 | spin_lock_irq(&sp->queue_lock); | ||
623 | if (rcu_batch_empty(&sp->batch_done) && | ||
624 | rcu_batch_empty(&sp->batch_check1) && | ||
625 | rcu_batch_empty(&sp->batch_check0) && | ||
626 | rcu_batch_empty(&sp->batch_queue)) { | ||
627 | sp->running = false; | ||
628 | pending = false; | ||
629 | } | ||
630 | spin_unlock_irq(&sp->queue_lock); | ||
631 | } | ||
632 | |||
633 | if (pending) | ||
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * This is the work-queue function that handles SRCU grace periods. | ||
639 | */ | ||
640 | static void process_srcu(struct work_struct *work) | ||
641 | { | ||
642 | struct srcu_struct *sp; | ||
643 | |||
644 | sp = container_of(work, struct srcu_struct, work.work); | ||
645 | |||
646 | srcu_collect_new(sp); | ||
647 | srcu_advance_batches(sp, 1); | ||
648 | srcu_invoke_callbacks(sp); | ||
649 | srcu_reschedule(sp); | ||
650 | } | ||
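Taken together, the functions above form the workqueue-driven pipeline behind call_srcu(). A hedged usage sketch of that asynchronous interface, with made-up names (my_obj, my_srcu, my_obj_free), might look like this:

    #include <linux/srcu.h>
    #include <linux/slab.h>

    struct my_obj {
            struct rcu_head rh;
            int payload;
    };

    static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) elsewhere */

    static void my_obj_free(struct rcu_head *rh)
    {
            kfree(container_of(rh, struct my_obj, rh));
    }

    static void my_obj_retire(struct my_obj *obj)
    {
            /* Enqueues onto ->batch_queue; process_srcu() above does the rest. */
            call_srcu(&my_srcu, &obj->rh, my_obj_free);
    }

    static void my_teardown(void)
    {
            /* Flush queued callbacks before tearing the srcu_struct down. */
            srcu_barrier(&my_srcu);
            cleanup_srcu_struct(&my_srcu);
    }

Note that srcu_barrier() in this version is simply implemented in terms of synchronize_srcu(), as the hunk above shows.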
diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e4..ba0ae8eea6fb 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1908 | error = prctl_get_seccomp(); | 1908 | error = prctl_get_seccomp(); |
1909 | break; | 1909 | break; |
1910 | case PR_SET_SECCOMP: | 1910 | case PR_SET_SECCOMP: |
1911 | error = prctl_set_seccomp(arg2); | 1911 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
1912 | break; | 1912 | break; |
1913 | case PR_GET_TSC: | 1913 | case PR_GET_TSC: |
1914 | error = GET_TSC_CTL(arg2); | 1914 | error = GET_TSC_CTL(arg2); |
@@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1979 | error = put_user(me->signal->is_child_subreaper, | 1979 | error = put_user(me->signal->is_child_subreaper, |
1980 | (int __user *) arg2); | 1980 | (int __user *) arg2); |
1981 | break; | 1981 | break; |
1982 | case PR_SET_NO_NEW_PRIVS: | ||
1983 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
1984 | return -EINVAL; | ||
1985 | |||
1986 | current->no_new_privs = 1; | ||
1987 | break; | ||
1988 | case PR_GET_NO_NEW_PRIVS: | ||
1989 | if (arg2 || arg3 || arg4 || arg5) | ||
1990 | return -EINVAL; | ||
1991 | return current->no_new_privs ? 1 : 0; | ||
1982 | default: | 1992 | default: |
1983 | error = -EINVAL; | 1993 | error = -EINVAL; |
1984 | break; | 1994 | break; |
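For reference, a user-space caller of the two new prctl options looks like the sketch below; it is not taken from the patch, and the fallback option values (38/39) are the ones the interface ended up with upstream, supplied only in case <sys/prctl.h> predates them.

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_NO_NEW_PRIVS
    #define PR_SET_NO_NEW_PRIVS 38
    #endif
    #ifndef PR_GET_NO_NEW_PRIVS
    #define PR_GET_NO_NEW_PRIVS 39
    #endif

    int main(void)
    {
            /* All unused arguments must be zero, or the kernel returns -EINVAL. */
            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                    perror("PR_SET_NO_NEW_PRIVS");

            /* The flag is one-way: there is no prctl to clear it again. */
            printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
            return 0;
    }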
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7b..aa27d391bfc8 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); | |||
59 | * If one has not already been chosen, it checks to see if a | 59 | * If one has not already been chosen, it checks to see if a |
60 | * functional rtc device is available. | 60 | * functional rtc device is available. |
61 | */ | 61 | */ |
62 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | struct rtc_device *alarmtimer_get_rtcdev(void) |
63 | { | 63 | { |
64 | unsigned long flags; | 64 | unsigned long flags; |
65 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) | |||
115 | class_interface_unregister(&alarmtimer_rtc_interface); | 115 | class_interface_unregister(&alarmtimer_rtc_interface); |
116 | } | 116 | } |
117 | #else | 117 | #else |
118 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) | 118 | struct rtc_device *alarmtimer_get_rtcdev(void) |
119 | { | 119 | { |
120 | return NULL; | 120 | return NULL; |
121 | } | 121 | } |
diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888e..09de9a941cd7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); | |||
861 | * | 861 | * |
862 | * mod_timer_pinned() is a way to update the expire field of an | 862 | * mod_timer_pinned() is a way to update the expire field of an |
863 | * active timer (if the timer is inactive it will be activated) | 863 | * active timer (if the timer is inactive it will be activated) |
864 | * and not allow the timer to be migrated to a different CPU. | 864 | * and to ensure that the timer is scheduled on the current CPU. |
865 | * | ||
866 | * Note that this does not prevent the timer from being migrated | ||
867 | * when the current CPU goes offline. If this is a problem for | ||
868 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
869 | * example, cancelling the timer when the corresponding CPU goes | ||
870 | * offline. | ||
865 | * | 871 | * |
866 | * mod_timer_pinned(timer, expires) is equivalent to: | 872 | * mod_timer_pinned(timer, expires) is equivalent to: |
867 | * | 873 | * |
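A hedged sketch of the workaround the new comment suggests, using the CPU-hotplug notifier API of this era; my_timer and the notifier names are invented, and the snippet assumes the pinned timer is per-CPU state owned by the caller.

    #include <linux/cpu.h>
    #include <linux/timer.h>

    static DEFINE_PER_CPU(struct timer_list, my_timer);   /* hypothetical per-CPU timer */

    static int my_timer_cpu_notify(struct notifier_block *nb,
                                   unsigned long action, void *hcpu)
    {
            long cpu = (long)hcpu;

            switch (action & ~CPU_TASKS_FROZEN) {
            case CPU_DOWN_PREPARE:
                    /* Cancel the pinned timer before the CPU goes offline. */
                    del_timer_sync(&per_cpu(my_timer, cpu));
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block my_timer_cpu_nb = {
            .notifier_call = my_timer_cpu_notify,
    };
    /* register_cpu_notifier(&my_timer_cpu_nb) at init time. */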
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1102 | * warnings as well as problems when looking into | 1108 | * warnings as well as problems when looking into |
1103 | * timer->lockdep_map, make a copy and use that here. | 1109 | * timer->lockdep_map, make a copy and use that here. |
1104 | */ | 1110 | */ |
1105 | struct lockdep_map lockdep_map = timer->lockdep_map; | 1111 | struct lockdep_map lockdep_map; |
1112 | |||
1113 | lockdep_copy_map(&lockdep_map, &timer->lockdep_map); | ||
1106 | #endif | 1114 | #endif |
1107 | /* | 1115 | /* |
1108 | * Couple the lock chain with the lock chain at | 1116 | * Couple the lock chain with the lock chain at |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..d81a1a532994 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,6 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE | ||
145 | select KALLSYMS | 144 | select KALLSYMS |
146 | select GENERIC_TRACER | 145 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 146 | select CONTEXT_SWITCH_TRACER |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5ea..b3afe0e76f79 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o | |||
41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | ||
45 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 44 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
46 | ifeq ($(CONFIG_BLOCK),y) | 45 | ifeq ($(CONFIG_BLOCK),y) |
47 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o | 46 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c92..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1383 | 1383 | ||
1384 | static int ftrace_cmp_recs(const void *a, const void *b) | 1384 | static int ftrace_cmp_recs(const void *a, const void *b) |
1385 | { | 1385 | { |
1386 | const struct dyn_ftrace *reca = a; | 1386 | const struct dyn_ftrace *key = a; |
1387 | const struct dyn_ftrace *recb = b; | 1387 | const struct dyn_ftrace *rec = b; |
1388 | 1388 | ||
1389 | if (reca->ip > recb->ip) | 1389 | if (key->flags < rec->ip) |
1390 | return 1; | ||
1391 | if (reca->ip < recb->ip) | ||
1392 | return -1; | 1390 | return -1; |
1391 | if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) | ||
1392 | return 1; | ||
1393 | return 0; | 1393 | return 0; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | /** | 1396 | static unsigned long ftrace_location_range(unsigned long start, unsigned long end) |
1397 | * ftrace_location - return true if the ip giving is a traced location | ||
1398 | * @ip: the instruction pointer to check | ||
1399 | * | ||
1400 | * Returns 1 if @ip given is a pointer to a ftrace location. | ||
1401 | * That is, the instruction that is either a NOP or call to | ||
1402 | * the function tracer. It checks the ftrace internal tables to | ||
1403 | * determine if the address belongs or not. | ||
1404 | */ | ||
1405 | int ftrace_location(unsigned long ip) | ||
1406 | { | 1397 | { |
1407 | struct ftrace_page *pg; | 1398 | struct ftrace_page *pg; |
1408 | struct dyn_ftrace *rec; | 1399 | struct dyn_ftrace *rec; |
1409 | struct dyn_ftrace key; | 1400 | struct dyn_ftrace key; |
1410 | 1401 | ||
1411 | key.ip = ip; | 1402 | key.ip = start; |
1403 | key.flags = end; /* overload flags, as it is unsigned long */ | ||
1412 | 1404 | ||
1413 | for (pg = ftrace_pages_start; pg; pg = pg->next) { | 1405 | for (pg = ftrace_pages_start; pg; pg = pg->next) { |
1406 | if (end < pg->records[0].ip || | ||
1407 | start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) | ||
1408 | continue; | ||
1414 | rec = bsearch(&key, pg->records, pg->index, | 1409 | rec = bsearch(&key, pg->records, pg->index, |
1415 | sizeof(struct dyn_ftrace), | 1410 | sizeof(struct dyn_ftrace), |
1416 | ftrace_cmp_recs); | 1411 | ftrace_cmp_recs); |
1417 | if (rec) | 1412 | if (rec) |
1418 | return 1; | 1413 | return rec->ip; |
1419 | } | 1414 | } |
1420 | 1415 | ||
1421 | return 0; | 1416 | return 0; |
1422 | } | 1417 | } |
1423 | 1418 | ||
1419 | /** | ||
1420 | * ftrace_location - return the ftrace address if the given ip is a traced location | ||
1421 | * @ip: the instruction pointer to check | ||
1422 | * | ||
1423 | * Returns rec->ip if the given @ip is a pointer to a ftrace location. | ||
1424 | * That is, the instruction that is either a NOP or call to | ||
1425 | * the function tracer. It checks the ftrace internal tables to | ||
1426 | * determine if the address belongs or not. | ||
1427 | */ | ||
1428 | unsigned long ftrace_location(unsigned long ip) | ||
1429 | { | ||
1430 | return ftrace_location_range(ip, ip); | ||
1431 | } | ||
1432 | |||
1433 | /** | ||
1434 | * ftrace_text_reserved - return true if range contains an ftrace location | ||
1435 | * @start: start of range to search | ||
1436 | * @end: end of range to search (inclusive). @end points to the last byte to check. | ||
1437 | * | ||
1438 | * Returns 1 if the range from @start to @end contains a ftrace location. | ||
1439 | * That is, the instruction that is either a NOP or call to | ||
1440 | * the function tracer. It checks the ftrace internal tables to | ||
1441 | * determine if the address belongs or not. | ||
1442 | */ | ||
1443 | int ftrace_text_reserved(void *start, void *end) | ||
1444 | { | ||
1445 | unsigned long ret; | ||
1446 | |||
1447 | ret = ftrace_location_range((unsigned long)start, | ||
1448 | (unsigned long)end); | ||
1449 | |||
1450 | return (int)!!ret; | ||
1451 | } | ||
1452 | |||
1424 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1453 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1425 | int filter_hash, | 1454 | int filter_hash, |
1426 | bool inc) | 1455 | bool inc) |
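The new ftrace_cmp_recs() deliberately overloads the key's flags field so a single bsearch() can answer a range query against records sorted by ip. A standalone user-space illustration of that comparator trick (INSN_SIZE and the sample values below are made up):

    #include <stdio.h>
    #include <stdlib.h>

    #define INSN_SIZE 5                       /* stand-in for MCOUNT_INSN_SIZE */

    struct rec { unsigned long ip; unsigned long flags; };

    /* Key trick: key->ip is the range start, key->flags the range end. */
    static int cmp_recs(const void *a, const void *b)
    {
            const struct rec *key = a;
            const struct rec *rec = b;

            if (key->flags < rec->ip)               /* range ends before rec */
                    return -1;
            if (key->ip >= rec->ip + INSN_SIZE)     /* range starts after rec */
                    return 1;
            return 0;                               /* ranges overlap: match */
    }

    int main(void)
    {
            struct rec recs[] = { { 100 }, { 200 }, { 300 } };  /* sorted by ip */
            struct rec key = { .ip = 202, .flags = 250 };       /* search [202, 250] */
            struct rec *hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_recs);

            printf("hit ip = %lu\n", hit ? hit->ip : 0);        /* prints 200 */
            return 0;
    }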
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
1520 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1549 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
1521 | } | 1550 | } |
1522 | 1551 | ||
1523 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) | ||
1524 | { | ||
1525 | if (ftrace_pages->index == ftrace_pages->size) { | ||
1526 | /* We should have allocated enough */ | ||
1527 | if (WARN_ON(!ftrace_pages->next)) | ||
1528 | return NULL; | ||
1529 | ftrace_pages = ftrace_pages->next; | ||
1530 | } | ||
1531 | |||
1532 | return &ftrace_pages->records[ftrace_pages->index++]; | ||
1533 | } | ||
1534 | |||
1535 | static struct dyn_ftrace * | ||
1536 | ftrace_record_ip(unsigned long ip) | ||
1537 | { | ||
1538 | struct dyn_ftrace *rec; | ||
1539 | |||
1540 | if (ftrace_disabled) | ||
1541 | return NULL; | ||
1542 | |||
1543 | rec = ftrace_alloc_dyn_node(ip); | ||
1544 | if (!rec) | ||
1545 | return NULL; | ||
1546 | |||
1547 | rec->ip = ip; | ||
1548 | |||
1549 | return rec; | ||
1550 | } | ||
1551 | |||
1552 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1552 | static void print_ip_ins(const char *fmt, unsigned char *p) |
1553 | { | 1553 | { |
1554 | int i; | 1554 | int i; |
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) | |||
1598 | } | 1598 | } |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | |||
1602 | /* Return 1 if the address range is reserved for ftrace */ | ||
1603 | int ftrace_text_reserved(void *start, void *end) | ||
1604 | { | ||
1605 | struct dyn_ftrace *rec; | ||
1606 | struct ftrace_page *pg; | ||
1607 | |||
1608 | do_for_each_ftrace_rec(pg, rec) { | ||
1609 | if (rec->ip <= (unsigned long)end && | ||
1610 | rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) | ||
1611 | return 1; | ||
1612 | } while_for_each_ftrace_rec(); | ||
1613 | return 0; | ||
1614 | } | ||
1615 | |||
1616 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1601 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
1617 | { | 1602 | { |
1618 | unsigned long flag = 0UL; | 1603 | unsigned long flag = 0UL; |
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1698 | return -1; /* unknow ftrace bug */ | 1683 | return -1; /* unknow ftrace bug */ |
1699 | } | 1684 | } |
1700 | 1685 | ||
1701 | static void ftrace_replace_code(int update) | 1686 | void __weak ftrace_replace_code(int enable) |
1702 | { | 1687 | { |
1703 | struct dyn_ftrace *rec; | 1688 | struct dyn_ftrace *rec; |
1704 | struct ftrace_page *pg; | 1689 | struct ftrace_page *pg; |
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) | |||
1708 | return; | 1693 | return; |
1709 | 1694 | ||
1710 | do_for_each_ftrace_rec(pg, rec) { | 1695 | do_for_each_ftrace_rec(pg, rec) { |
1711 | failed = __ftrace_replace_code(rec, update); | 1696 | failed = __ftrace_replace_code(rec, enable); |
1712 | if (failed) { | 1697 | if (failed) { |
1713 | ftrace_bug(failed, rec->ip); | 1698 | ftrace_bug(failed, rec->ip); |
1714 | /* Stop processing */ | 1699 | /* Stop processing */ |
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
1826 | return 0; | 1811 | return 0; |
1827 | } | 1812 | } |
1828 | 1813 | ||
1829 | static int __ftrace_modify_code(void *data) | 1814 | void ftrace_modify_all_code(int command) |
1830 | { | 1815 | { |
1831 | int *command = data; | 1816 | if (command & FTRACE_UPDATE_CALLS) |
1832 | |||
1833 | if (*command & FTRACE_UPDATE_CALLS) | ||
1834 | ftrace_replace_code(1); | 1817 | ftrace_replace_code(1); |
1835 | else if (*command & FTRACE_DISABLE_CALLS) | 1818 | else if (command & FTRACE_DISABLE_CALLS) |
1836 | ftrace_replace_code(0); | 1819 | ftrace_replace_code(0); |
1837 | 1820 | ||
1838 | if (*command & FTRACE_UPDATE_TRACE_FUNC) | 1821 | if (command & FTRACE_UPDATE_TRACE_FUNC) |
1839 | ftrace_update_ftrace_func(ftrace_trace_function); | 1822 | ftrace_update_ftrace_func(ftrace_trace_function); |
1840 | 1823 | ||
1841 | if (*command & FTRACE_START_FUNC_RET) | 1824 | if (command & FTRACE_START_FUNC_RET) |
1842 | ftrace_enable_ftrace_graph_caller(); | 1825 | ftrace_enable_ftrace_graph_caller(); |
1843 | else if (*command & FTRACE_STOP_FUNC_RET) | 1826 | else if (command & FTRACE_STOP_FUNC_RET) |
1844 | ftrace_disable_ftrace_graph_caller(); | 1827 | ftrace_disable_ftrace_graph_caller(); |
1828 | } | ||
1829 | |||
1830 | static int __ftrace_modify_code(void *data) | ||
1831 | { | ||
1832 | int *command = data; | ||
1833 | |||
1834 | ftrace_modify_all_code(*command); | ||
1845 | 1835 | ||
1846 | return 0; | 1836 | return 0; |
1847 | } | 1837 | } |
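Exposing ftrace_modify_all_code() lets an architecture that patches text through its own mechanism, rather than via stop_machine() and __ftrace_modify_code(), reuse the generic command dispatch. A purely hypothetical sketch, with my_arch_text_poke_begin()/end() standing in for whatever helpers the architecture provides:

    /* Hypothetical arch-side caller (not from the patch). */
    static void my_arch_update_code(int command)
    {
            my_arch_text_poke_begin();        /* made-up arch helper */
            ftrace_modify_all_code(command);  /* generic dispatch added above */
            my_arch_text_poke_end();          /* made-up arch helper */
    }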
@@ -2469,57 +2459,35 @@ static int | |||
2469 | ftrace_avail_open(struct inode *inode, struct file *file) | 2459 | ftrace_avail_open(struct inode *inode, struct file *file) |
2470 | { | 2460 | { |
2471 | struct ftrace_iterator *iter; | 2461 | struct ftrace_iterator *iter; |
2472 | int ret; | ||
2473 | 2462 | ||
2474 | if (unlikely(ftrace_disabled)) | 2463 | if (unlikely(ftrace_disabled)) |
2475 | return -ENODEV; | 2464 | return -ENODEV; |
2476 | 2465 | ||
2477 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2466 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2478 | if (!iter) | 2467 | if (iter) { |
2479 | return -ENOMEM; | 2468 | iter->pg = ftrace_pages_start; |
2480 | 2469 | iter->ops = &global_ops; | |
2481 | iter->pg = ftrace_pages_start; | ||
2482 | iter->ops = &global_ops; | ||
2483 | |||
2484 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2485 | if (!ret) { | ||
2486 | struct seq_file *m = file->private_data; | ||
2487 | |||
2488 | m->private = iter; | ||
2489 | } else { | ||
2490 | kfree(iter); | ||
2491 | } | 2470 | } |
2492 | 2471 | ||
2493 | return ret; | 2472 | return iter ? 0 : -ENOMEM; |
2494 | } | 2473 | } |
2495 | 2474 | ||
2496 | static int | 2475 | static int |
2497 | ftrace_enabled_open(struct inode *inode, struct file *file) | 2476 | ftrace_enabled_open(struct inode *inode, struct file *file) |
2498 | { | 2477 | { |
2499 | struct ftrace_iterator *iter; | 2478 | struct ftrace_iterator *iter; |
2500 | int ret; | ||
2501 | 2479 | ||
2502 | if (unlikely(ftrace_disabled)) | 2480 | if (unlikely(ftrace_disabled)) |
2503 | return -ENODEV; | 2481 | return -ENODEV; |
2504 | 2482 | ||
2505 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2483 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2506 | if (!iter) | 2484 | if (iter) { |
2507 | return -ENOMEM; | 2485 | iter->pg = ftrace_pages_start; |
2508 | 2486 | iter->flags = FTRACE_ITER_ENABLED; | |
2509 | iter->pg = ftrace_pages_start; | 2487 | iter->ops = &global_ops; |
2510 | iter->flags = FTRACE_ITER_ENABLED; | ||
2511 | iter->ops = &global_ops; | ||
2512 | |||
2513 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2514 | if (!ret) { | ||
2515 | struct seq_file *m = file->private_data; | ||
2516 | |||
2517 | m->private = iter; | ||
2518 | } else { | ||
2519 | kfree(iter); | ||
2520 | } | 2488 | } |
2521 | 2489 | ||
2522 | return ret; | 2490 | return iter ? 0 : -ENOMEM; |
2523 | } | 2491 | } |
2524 | 2492 | ||
2525 | static void ftrace_filter_reset(struct ftrace_hash *hash) | 2493 | static void ftrace_filter_reset(struct ftrace_hash *hash) |
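__seq_open_private() bundles the iterator allocation with seq_open() and stores it in m->private, which is what shrinks both open routines above. The general pattern, shown with invented my_* names, pairs it with seq_release_private():

    #include <linux/seq_file.h>
    #include <linux/fs.h>

    struct my_iter { int pos; };                 /* hypothetical iterator state */
    extern const struct seq_operations my_seq_ops;

    static int my_open(struct inode *inode, struct file *file)
    {
            struct my_iter *iter;

            /* Allocates and zeroes the iterator, then stores it in m->private. */
            iter = __seq_open_private(file, &my_seq_ops, sizeof(*iter));
            if (!iter)
                    return -ENOMEM;

            iter->pos = 0;                       /* extra per-open initialization */
            return 0;
    }

    static const struct file_operations my_fops = {
            .open    = my_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = seq_release_private,      /* frees what __seq_open_private allocated */
    };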
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3688 | return 0; | 3656 | return 0; |
3689 | } | 3657 | } |
3690 | 3658 | ||
3691 | static void ftrace_swap_recs(void *a, void *b, int size) | 3659 | static int ftrace_cmp_ips(const void *a, const void *b) |
3660 | { | ||
3661 | const unsigned long *ipa = a; | ||
3662 | const unsigned long *ipb = b; | ||
3663 | |||
3664 | if (*ipa > *ipb) | ||
3665 | return 1; | ||
3666 | if (*ipa < *ipb) | ||
3667 | return -1; | ||
3668 | return 0; | ||
3669 | } | ||
3670 | |||
3671 | static void ftrace_swap_ips(void *a, void *b, int size) | ||
3692 | { | 3672 | { |
3693 | struct dyn_ftrace *reca = a; | 3673 | unsigned long *ipa = a; |
3694 | struct dyn_ftrace *recb = b; | 3674 | unsigned long *ipb = b; |
3695 | struct dyn_ftrace t; | 3675 | unsigned long t; |
3696 | 3676 | ||
3697 | t = *reca; | 3677 | t = *ipa; |
3698 | *reca = *recb; | 3678 | *ipa = *ipb; |
3699 | *recb = t; | 3679 | *ipb = t; |
3700 | } | 3680 | } |
3701 | 3681 | ||
3702 | static int ftrace_process_locs(struct module *mod, | 3682 | static int ftrace_process_locs(struct module *mod, |
3703 | unsigned long *start, | 3683 | unsigned long *start, |
3704 | unsigned long *end) | 3684 | unsigned long *end) |
3705 | { | 3685 | { |
3686 | struct ftrace_page *start_pg; | ||
3706 | struct ftrace_page *pg; | 3687 | struct ftrace_page *pg; |
3688 | struct dyn_ftrace *rec; | ||
3707 | unsigned long count; | 3689 | unsigned long count; |
3708 | unsigned long *p; | 3690 | unsigned long *p; |
3709 | unsigned long addr; | 3691 | unsigned long addr; |
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3715 | if (!count) | 3697 | if (!count) |
3716 | return 0; | 3698 | return 0; |
3717 | 3699 | ||
3718 | pg = ftrace_allocate_pages(count); | 3700 | sort(start, count, sizeof(*start), |
3719 | if (!pg) | 3701 | ftrace_cmp_ips, ftrace_swap_ips); |
3702 | |||
3703 | start_pg = ftrace_allocate_pages(count); | ||
3704 | if (!start_pg) | ||
3720 | return -ENOMEM; | 3705 | return -ENOMEM; |
3721 | 3706 | ||
3722 | mutex_lock(&ftrace_lock); | 3707 | mutex_lock(&ftrace_lock); |
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, | |||
3729 | if (!mod) { | 3714 | if (!mod) { |
3730 | WARN_ON(ftrace_pages || ftrace_pages_start); | 3715 | WARN_ON(ftrace_pages || ftrace_pages_start); |
3731 | /* First initialization */ | 3716 | /* First initialization */ |
3732 | ftrace_pages = ftrace_pages_start = pg; | 3717 | ftrace_pages = ftrace_pages_start = start_pg; |
3733 | } else { | 3718 | } else { |
3734 | if (!ftrace_pages) | 3719 | if (!ftrace_pages) |
3735 | goto out; | 3720 | goto out; |
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3740 | ftrace_pages = ftrace_pages->next; | 3725 | ftrace_pages = ftrace_pages->next; |
3741 | } | 3726 | } |
3742 | 3727 | ||
3743 | ftrace_pages->next = pg; | 3728 | ftrace_pages->next = start_pg; |
3744 | ftrace_pages = pg; | ||
3745 | } | 3729 | } |
3746 | 3730 | ||
3747 | p = start; | 3731 | p = start; |
3732 | pg = start_pg; | ||
3748 | while (p < end) { | 3733 | while (p < end) { |
3749 | addr = ftrace_call_adjust(*p++); | 3734 | addr = ftrace_call_adjust(*p++); |
3750 | /* | 3735 | /* |
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, | |||
3755 | */ | 3740 | */ |
3756 | if (!addr) | 3741 | if (!addr) |
3757 | continue; | 3742 | continue; |
3758 | if (!ftrace_record_ip(addr)) | 3743 | |
3759 | break; | 3744 | if (pg->index == pg->size) { |
3745 | /* We should have allocated enough */ | ||
3746 | if (WARN_ON(!pg->next)) | ||
3747 | break; | ||
3748 | pg = pg->next; | ||
3749 | } | ||
3750 | |||
3751 | rec = &pg->records[pg->index++]; | ||
3752 | rec->ip = addr; | ||
3760 | } | 3753 | } |
3761 | 3754 | ||
3762 | /* These new locations need to be initialized */ | 3755 | /* We should have used all pages */ |
3763 | ftrace_new_pgs = pg; | 3756 | WARN_ON(pg->next); |
3757 | |||
3758 | /* Assign the last page to ftrace_pages */ | ||
3759 | ftrace_pages = pg; | ||
3764 | 3760 | ||
3765 | /* Make each individual set of pages sorted by ips */ | 3761 | /* These new locations need to be initialized */ |
3766 | for (; pg; pg = pg->next) | 3762 | ftrace_new_pgs = start_pg; |
3767 | sort(pg->records, pg->index, sizeof(struct dyn_ftrace), | ||
3768 | ftrace_cmp_recs, ftrace_swap_recs); | ||
3769 | 3763 | ||
3770 | /* | 3764 | /* |
3771 | * We only need to disable interrupts on start up | 3765 | * We only need to disable interrupts on start up |
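The mcount location table is now sorted once up front with lib/sort, which is what lets ftrace_location_range() skip whole pages by comparing against their first and last records. The sort() interface being used, shown in isolation (cmp_ulong/swap_ulong mirror ftrace_cmp_ips/ftrace_swap_ips):

    #include <linux/sort.h>
    #include <linux/kernel.h>

    static int cmp_ulong(const void *a, const void *b)
    {
            const unsigned long *x = a, *y = b;

            if (*x > *y)
                    return 1;
            if (*x < *y)
                    return -1;
            return 0;
    }

    static void swap_ulong(void *a, void *b, int size)
    {
            unsigned long t = *(unsigned long *)a;

            *(unsigned long *)a = *(unsigned long *)b;
            *(unsigned long *)b = t;
    }

    /* Sort an array of addresses in place, as ftrace_process_locs() now does
     * with the mcount location table before carving it into pages. */
    static void sort_addrs(unsigned long *addrs, unsigned long count)
    {
            sort(addrs, count, sizeof(*addrs), cmp_ulong, swap_ulong);
    }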
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91efd..6420cda62336 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -23,6 +23,8 @@ | |||
23 | #include <asm/local.h> | 23 | #include <asm/local.h> |
24 | #include "trace.h" | 24 | #include "trace.h" |
25 | 25 | ||
26 | static void update_pages_handler(struct work_struct *work); | ||
27 | |||
26 | /* | 28 | /* |
27 | * The ring buffer header is special. We must manually up keep it. | 29 | * The ring buffer header is special. We must manually up keep it. |
28 | */ | 30 | */ |
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu { | |||
449 | raw_spinlock_t reader_lock; /* serialize readers */ | 451 | raw_spinlock_t reader_lock; /* serialize readers */ |
450 | arch_spinlock_t lock; | 452 | arch_spinlock_t lock; |
451 | struct lock_class_key lock_key; | 453 | struct lock_class_key lock_key; |
454 | unsigned int nr_pages; | ||
452 | struct list_head *pages; | 455 | struct list_head *pages; |
453 | struct buffer_page *head_page; /* read from head */ | 456 | struct buffer_page *head_page; /* read from head */ |
454 | struct buffer_page *tail_page; /* write to tail */ | 457 | struct buffer_page *tail_page; /* write to tail */ |
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu { | |||
466 | unsigned long read_bytes; | 469 | unsigned long read_bytes; |
467 | u64 write_stamp; | 470 | u64 write_stamp; |
468 | u64 read_stamp; | 471 | u64 read_stamp; |
472 | /* ring buffer pages to update, > 0 to add, < 0 to remove */ | ||
473 | int nr_pages_to_update; | ||
474 | struct list_head new_pages; /* new pages to add */ | ||
475 | struct work_struct update_pages_work; | ||
476 | struct completion update_done; | ||
469 | }; | 477 | }; |
470 | 478 | ||
471 | struct ring_buffer { | 479 | struct ring_buffer { |
472 | unsigned pages; | ||
473 | unsigned flags; | 480 | unsigned flags; |
474 | int cpus; | 481 | int cpus; |
475 | atomic_t record_disabled; | 482 | atomic_t record_disabled; |
483 | atomic_t resize_disabled; | ||
476 | cpumask_var_t cpumask; | 484 | cpumask_var_t cpumask; |
477 | 485 | ||
478 | struct lock_class_key *reader_lock_key; | 486 | struct lock_class_key *reader_lock_key; |
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
937 | struct list_head *head = cpu_buffer->pages; | 945 | struct list_head *head = cpu_buffer->pages; |
938 | struct buffer_page *bpage, *tmp; | 946 | struct buffer_page *bpage, *tmp; |
939 | 947 | ||
948 | /* Reset the head page if it exists */ | ||
949 | if (cpu_buffer->head_page) | ||
950 | rb_set_head_page(cpu_buffer); | ||
951 | |||
940 | rb_head_page_deactivate(cpu_buffer); | 952 | rb_head_page_deactivate(cpu_buffer); |
941 | 953 | ||
942 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) | 954 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) |
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
963 | return 0; | 975 | return 0; |
964 | } | 976 | } |
965 | 977 | ||
966 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | 978 | static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) |
967 | unsigned nr_pages) | ||
968 | { | 979 | { |
980 | int i; | ||
969 | struct buffer_page *bpage, *tmp; | 981 | struct buffer_page *bpage, *tmp; |
970 | LIST_HEAD(pages); | ||
971 | unsigned i; | ||
972 | |||
973 | WARN_ON(!nr_pages); | ||
974 | 982 | ||
975 | for (i = 0; i < nr_pages; i++) { | 983 | for (i = 0; i < nr_pages; i++) { |
976 | struct page *page; | 984 | struct page *page; |
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
981 | */ | 989 | */ |
982 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 990 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
983 | GFP_KERNEL | __GFP_NORETRY, | 991 | GFP_KERNEL | __GFP_NORETRY, |
984 | cpu_to_node(cpu_buffer->cpu)); | 992 | cpu_to_node(cpu)); |
985 | if (!bpage) | 993 | if (!bpage) |
986 | goto free_pages; | 994 | goto free_pages; |
987 | 995 | ||
988 | rb_check_bpage(cpu_buffer, bpage); | 996 | list_add(&bpage->list, pages); |
989 | 997 | ||
990 | list_add(&bpage->list, &pages); | 998 | page = alloc_pages_node(cpu_to_node(cpu), |
991 | |||
992 | page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), | ||
993 | GFP_KERNEL | __GFP_NORETRY, 0); | 999 | GFP_KERNEL | __GFP_NORETRY, 0); |
994 | if (!page) | 1000 | if (!page) |
995 | goto free_pages; | 1001 | goto free_pages; |
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
997 | rb_init_page(bpage->page); | 1003 | rb_init_page(bpage->page); |
998 | } | 1004 | } |
999 | 1005 | ||
1006 | return 0; | ||
1007 | |||
1008 | free_pages: | ||
1009 | list_for_each_entry_safe(bpage, tmp, pages, list) { | ||
1010 | list_del_init(&bpage->list); | ||
1011 | free_buffer_page(bpage); | ||
1012 | } | ||
1013 | |||
1014 | return -ENOMEM; | ||
1015 | } | ||
1016 | |||
1017 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | ||
1018 | unsigned nr_pages) | ||
1019 | { | ||
1020 | LIST_HEAD(pages); | ||
1021 | |||
1022 | WARN_ON(!nr_pages); | ||
1023 | |||
1024 | if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1000 | /* | 1027 | /* |
1001 | * The ring buffer page list is a circular list that does not | 1028 | * The ring buffer page list is a circular list that does not |
1002 | * start and end with a list head. All page list items point to | 1029 | * start and end with a list head. All page list items point to |
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1005 | cpu_buffer->pages = pages.next; | 1032 | cpu_buffer->pages = pages.next; |
1006 | list_del(&pages); | 1033 | list_del(&pages); |
1007 | 1034 | ||
1035 | cpu_buffer->nr_pages = nr_pages; | ||
1036 | |||
1008 | rb_check_pages(cpu_buffer); | 1037 | rb_check_pages(cpu_buffer); |
1009 | 1038 | ||
1010 | return 0; | 1039 | return 0; |
1011 | |||
1012 | free_pages: | ||
1013 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | ||
1014 | list_del_init(&bpage->list); | ||
1015 | free_buffer_page(bpage); | ||
1016 | } | ||
1017 | return -ENOMEM; | ||
1018 | } | 1040 | } |
1019 | 1041 | ||
1020 | static struct ring_buffer_per_cpu * | 1042 | static struct ring_buffer_per_cpu * |
1021 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | 1043 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) |
1022 | { | 1044 | { |
1023 | struct ring_buffer_per_cpu *cpu_buffer; | 1045 | struct ring_buffer_per_cpu *cpu_buffer; |
1024 | struct buffer_page *bpage; | 1046 | struct buffer_page *bpage; |
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1035 | raw_spin_lock_init(&cpu_buffer->reader_lock); | 1057 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1036 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1058 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1037 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1059 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1060 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); | ||
1061 | init_completion(&cpu_buffer->update_done); | ||
1038 | 1062 | ||
1039 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1063 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1040 | GFP_KERNEL, cpu_to_node(cpu)); | 1064 | GFP_KERNEL, cpu_to_node(cpu)); |
@@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1052 | 1076 | ||
1053 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
1054 | 1078 | ||
1055 | ret = rb_allocate_pages(cpu_buffer, buffer->pages); | 1079 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
1056 | if (ret < 0) | 1080 | if (ret < 0) |
1057 | goto fail_free_reader; | 1081 | goto fail_free_reader; |
1058 | 1082 | ||
@@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1113 | { | 1137 | { |
1114 | struct ring_buffer *buffer; | 1138 | struct ring_buffer *buffer; |
1115 | int bsize; | 1139 | int bsize; |
1116 | int cpu; | 1140 | int cpu, nr_pages; |
1117 | 1141 | ||
1118 | /* keep it in its own cache line */ | 1142 | /* keep it in its own cache line */ |
1119 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), | 1143 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), |
@@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1124 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) | 1148 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) |
1125 | goto fail_free_buffer; | 1149 | goto fail_free_buffer; |
1126 | 1150 | ||
1127 | buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1151 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1128 | buffer->flags = flags; | 1152 | buffer->flags = flags; |
1129 | buffer->clock = trace_clock_local; | 1153 | buffer->clock = trace_clock_local; |
1130 | buffer->reader_lock_key = key; | 1154 | buffer->reader_lock_key = key; |
1131 | 1155 | ||
1132 | /* need at least two pages */ | 1156 | /* need at least two pages */ |
1133 | if (buffer->pages < 2) | 1157 | if (nr_pages < 2) |
1134 | buffer->pages = 2; | 1158 | nr_pages = 2; |
1135 | 1159 | ||
1136 | /* | 1160 | /* |
1137 | * In case of non-hotplug cpu, if the ring-buffer is allocated | 1161 | * In case of non-hotplug cpu, if the ring-buffer is allocated |
@@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1154 | 1178 | ||
1155 | for_each_buffer_cpu(buffer, cpu) { | 1179 | for_each_buffer_cpu(buffer, cpu) { |
1156 | buffer->buffers[cpu] = | 1180 | buffer->buffers[cpu] = |
1157 | rb_allocate_cpu_buffer(buffer, cpu); | 1181 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
1158 | if (!buffer->buffers[cpu]) | 1182 | if (!buffer->buffers[cpu]) |
1159 | goto fail_free_buffers; | 1183 | goto fail_free_buffers; |
1160 | } | 1184 | } |
@@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, | |||
1222 | 1246 | ||
1223 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); | 1247 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); |
1224 | 1248 | ||
1225 | static void | 1249 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) |
1226 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | ||
1227 | { | 1250 | { |
1228 | struct buffer_page *bpage; | 1251 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1229 | struct list_head *p; | 1252 | } |
1230 | unsigned i; | 1253 | |
1254 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1255 | { | ||
1256 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1257 | } | ||
1258 | |||
1259 | static int | ||
1260 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | ||
1261 | { | ||
1262 | struct list_head *tail_page, *to_remove, *next_page; | ||
1263 | struct buffer_page *to_remove_page, *tmp_iter_page; | ||
1264 | struct buffer_page *last_page, *first_page; | ||
1265 | unsigned int nr_removed; | ||
1266 | unsigned long head_bit; | ||
1267 | int page_entries; | ||
1268 | |||
1269 | head_bit = 0; | ||
1231 | 1270 | ||
1232 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1271 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1233 | rb_head_page_deactivate(cpu_buffer); | 1272 | atomic_inc(&cpu_buffer->record_disabled); |
1273 | /* | ||
1274 | * We don't race with the readers since we have acquired the reader | ||
1275 | * lock. We also don't race with writers after disabling recording. | ||
1276 | * This makes it easy to figure out the first and the last page to be | ||
1277 | * removed from the list. We unlink all the pages in between including | ||
1278 | * the first and last pages. This is done in a busy loop so that we | ||
1279 | * lose the least number of traces. | ||
1280 | * The pages are freed after we restart recording and unlock readers. | ||
1281 | */ | ||
1282 | tail_page = &cpu_buffer->tail_page->list; | ||
1234 | 1283 | ||
1235 | for (i = 0; i < nr_pages; i++) { | 1284 | /* |
1236 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1285 | * the tail page might be on the reader page; in that case, remove |
1237 | goto out; | 1286 | * the next page from the ring buffer |
1238 | p = cpu_buffer->pages->next; | 1287 | */ |
1239 | bpage = list_entry(p, struct buffer_page, list); | 1288 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) |
1240 | list_del_init(&bpage->list); | 1289 | tail_page = rb_list_head(tail_page->next); |
1241 | free_buffer_page(bpage); | 1290 | to_remove = tail_page; |
1291 | |||
1292 | /* start of pages to remove */ | ||
1293 | first_page = list_entry(rb_list_head(to_remove->next), | ||
1294 | struct buffer_page, list); | ||
1295 | |||
1296 | for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { | ||
1297 | to_remove = rb_list_head(to_remove)->next; | ||
1298 | head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; | ||
1242 | } | 1299 | } |
1243 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | ||
1244 | goto out; | ||
1245 | 1300 | ||
1246 | rb_reset_cpu(cpu_buffer); | 1301 | next_page = rb_list_head(to_remove)->next; |
1247 | rb_check_pages(cpu_buffer); | ||
1248 | 1302 | ||
1249 | out: | 1303 | /* |
1304 | * Now we remove all pages between tail_page and next_page. | ||
1305 | * Make sure that we have head_bit value preserved for the | ||
1306 | * next page | ||
1307 | */ | ||
1308 | tail_page->next = (struct list_head *)((unsigned long)next_page | | ||
1309 | head_bit); | ||
1310 | next_page = rb_list_head(next_page); | ||
1311 | next_page->prev = tail_page; | ||
1312 | |||
1313 | /* make sure pages points to a valid page in the ring buffer */ | ||
1314 | cpu_buffer->pages = next_page; | ||
1315 | |||
1316 | /* update head page */ | ||
1317 | if (head_bit) | ||
1318 | cpu_buffer->head_page = list_entry(next_page, | ||
1319 | struct buffer_page, list); | ||
1320 | |||
1321 | /* | ||
1322 | * change read pointer to make sure any read iterators reset | ||
1323 | * themselves | ||
1324 | */ | ||
1325 | cpu_buffer->read = 0; | ||
1326 | |||
1327 | /* pages are removed, resume tracing and then free the pages */ | ||
1328 | atomic_dec(&cpu_buffer->record_disabled); | ||
1250 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1329 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1330 | |||
1331 | RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); | ||
1332 | |||
1333 | /* last buffer page to remove */ | ||
1334 | last_page = list_entry(rb_list_head(to_remove), struct buffer_page, | ||
1335 | list); | ||
1336 | tmp_iter_page = first_page; | ||
1337 | |||
1338 | do { | ||
1339 | to_remove_page = tmp_iter_page; | ||
1340 | rb_inc_page(cpu_buffer, &tmp_iter_page); | ||
1341 | |||
1342 | /* update the counters */ | ||
1343 | page_entries = rb_page_entries(to_remove_page); | ||
1344 | if (page_entries) { | ||
1345 | /* | ||
1346 | * If something was added to this page, it was full | ||
1347 | * since it is not the tail page. So we deduct the | ||
1348 | * bytes consumed in ring buffer from here. | ||
1349 | * No need to update overruns, since this page is | ||
1350 | * deleted from ring buffer and its entries are | ||
1351 | * already accounted for. | ||
1352 | */ | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * We have already removed references to this list item, just | ||
1358 | * free up the buffer_page and its page | ||
1359 | */ | ||
1360 | free_buffer_page(to_remove_page); | ||
1361 | nr_removed--; | ||
1362 | |||
1363 | } while (to_remove_page != last_page); | ||
1364 | |||
1365 | RB_WARN_ON(cpu_buffer, nr_removed); | ||
1366 | |||
1367 | return nr_removed == 0; | ||
1251 | } | 1368 | } |
1252 | 1369 | ||
1253 | static void | 1370 | static int |
1254 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | 1371 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) |
1255 | struct list_head *pages, unsigned nr_pages) | ||
1256 | { | 1372 | { |
1257 | struct buffer_page *bpage; | 1373 | struct list_head *pages = &cpu_buffer->new_pages; |
1258 | struct list_head *p; | 1374 | int retries, success; |
1259 | unsigned i; | ||
1260 | 1375 | ||
1261 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1376 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1262 | rb_head_page_deactivate(cpu_buffer); | 1377 | /* |
1378 | * We are holding the reader lock, so the reader page won't be swapped | ||
1379 | * in the ring buffer. Now we are racing with the writer trying to | ||
1380 | * move head page and the tail page. | ||
1381 | * We are going to adapt the reader page update process where: | ||
1382 | * 1. We first splice the start and end of list of new pages between | ||
1383 | * the head page and its previous page. | ||
1384 | * 2. We cmpxchg the prev_page->next to point from head page to the | ||
1385 | * start of new pages list. | ||
1386 | * 3. Finally, we update the head->prev to the end of new list. | ||
1387 | * | ||
1388 | * We will try this process 10 times, to make sure that we don't keep | ||
1389 | * spinning. | ||
1390 | */ | ||
1391 | retries = 10; | ||
1392 | success = 0; | ||
1393 | while (retries--) { | ||
1394 | struct list_head *head_page, *prev_page, *r; | ||
1395 | struct list_head *last_page, *first_page; | ||
1396 | struct list_head *head_page_with_bit; | ||
1263 | 1397 | ||
1264 | for (i = 0; i < nr_pages; i++) { | 1398 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1265 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1399 | prev_page = head_page->prev; |
1266 | goto out; | 1400 | |
1267 | p = pages->next; | 1401 | first_page = pages->next; |
1268 | bpage = list_entry(p, struct buffer_page, list); | 1402 | last_page = pages->prev; |
1269 | list_del_init(&bpage->list); | 1403 | |
1270 | list_add_tail(&bpage->list, cpu_buffer->pages); | 1404 | head_page_with_bit = (struct list_head *) |
1405 | ((unsigned long)head_page | RB_PAGE_HEAD); | ||
1406 | |||
1407 | last_page->next = head_page_with_bit; | ||
1408 | first_page->prev = prev_page; | ||
1409 | |||
1410 | r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); | ||
1411 | |||
1412 | if (r == head_page_with_bit) { | ||
1413 | /* | ||
1414 | * yay, we replaced the page pointer to our new list, | ||
1415 | * now, we just have to update to head page's prev | ||
1416 | * pointer to point to end of list | ||
1417 | */ | ||
1418 | head_page->prev = last_page; | ||
1419 | success = 1; | ||
1420 | break; | ||
1421 | } | ||
1271 | } | 1422 | } |
1272 | rb_reset_cpu(cpu_buffer); | ||
1273 | rb_check_pages(cpu_buffer); | ||
1274 | 1423 | ||
1275 | out: | 1424 | if (success) |
1425 | INIT_LIST_HEAD(pages); | ||
1426 | /* | ||
1427 | * If we weren't successful in adding the new pages, warn and stop | ||
1428 | * tracing | ||
1429 | */ | ||
1430 | RB_WARN_ON(cpu_buffer, !success); | ||
1276 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1431 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1432 | |||
1433 | /* free pages if they weren't inserted */ | ||
1434 | if (!success) { | ||
1435 | struct buffer_page *bpage, *tmp; | ||
1436 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, | ||
1437 | list) { | ||
1438 | list_del_init(&bpage->list); | ||
1439 | free_buffer_page(bpage); | ||
1440 | } | ||
1441 | } | ||
1442 | return success; | ||
1443 | } | ||
1444 | |||
1445 | static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) | ||
1446 | { | ||
1447 | int success; | ||
1448 | |||
1449 | if (cpu_buffer->nr_pages_to_update > 0) | ||
1450 | success = rb_insert_pages(cpu_buffer); | ||
1451 | else | ||
1452 | success = rb_remove_pages(cpu_buffer, | ||
1453 | -cpu_buffer->nr_pages_to_update); | ||
1454 | |||
1455 | if (success) | ||
1456 | cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; | ||
1457 | } | ||
1458 | |||
1459 | static void update_pages_handler(struct work_struct *work) | ||
1460 | { | ||
1461 | struct ring_buffer_per_cpu *cpu_buffer = container_of(work, | ||
1462 | struct ring_buffer_per_cpu, update_pages_work); | ||
1463 | rb_update_pages(cpu_buffer); | ||
1464 | complete(&cpu_buffer->update_done); | ||
1277 | } | 1465 | } |
1278 | 1466 | ||
1279 | /** | 1467 | /** |
@@ -1283,16 +1471,14 @@ out: | |||
1283 | * | 1471 | * |
1284 | * Minimum size is 2 * BUF_PAGE_SIZE. | 1472 | * Minimum size is 2 * BUF_PAGE_SIZE. |
1285 | * | 1473 | * |
1286 | * Returns -1 on failure. | 1474 | * Returns 0 on success and < 0 on failure. |
1287 | */ | 1475 | */ |
1288 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | 1476 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, |
1477 | int cpu_id) | ||
1289 | { | 1478 | { |
1290 | struct ring_buffer_per_cpu *cpu_buffer; | 1479 | struct ring_buffer_per_cpu *cpu_buffer; |
1291 | unsigned nr_pages, rm_pages, new_pages; | 1480 | unsigned nr_pages; |
1292 | struct buffer_page *bpage, *tmp; | 1481 | int cpu, err = 0; |
1293 | unsigned long buffer_size; | ||
1294 | LIST_HEAD(pages); | ||
1295 | int i, cpu; | ||
1296 | 1482 | ||
1297 | /* | 1483 | /* |
1298 | * Always succeed at resizing a non-existent buffer: | 1484 | * Always succeed at resizing a non-existent buffer: |
@@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1302 | 1488 | ||
1303 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1489 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1304 | size *= BUF_PAGE_SIZE; | 1490 | size *= BUF_PAGE_SIZE; |
1305 | buffer_size = buffer->pages * BUF_PAGE_SIZE; | ||
1306 | 1491 | ||
1307 | /* we need a minimum of two pages */ | 1492 | /* we need a minimum of two pages */ |
1308 | if (size < BUF_PAGE_SIZE * 2) | 1493 | if (size < BUF_PAGE_SIZE * 2) |
1309 | size = BUF_PAGE_SIZE * 2; | 1494 | size = BUF_PAGE_SIZE * 2; |
1310 | 1495 | ||
1311 | if (size == buffer_size) | 1496 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1312 | return size; | ||
1313 | |||
1314 | atomic_inc(&buffer->record_disabled); | ||
1315 | 1497 | ||
1316 | /* Make sure all writers are done with this buffer. */ | 1498 | /* |
1317 | synchronize_sched(); | 1499 | * Don't succeed if resizing is disabled, as a reader might be |
1500 | * manipulating the ring buffer and is expecting a sane state while | ||
1501 | * this is true. | ||
1502 | */ | ||
1503 | if (atomic_read(&buffer->resize_disabled)) | ||
1504 | return -EBUSY; | ||
1318 | 1505 | ||
1506 | /* prevent another thread from changing buffer sizes */ | ||
1319 | mutex_lock(&buffer->mutex); | 1507 | mutex_lock(&buffer->mutex); |
1320 | get_online_cpus(); | ||
1321 | |||
1322 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | ||
1323 | 1508 | ||
1324 | if (size < buffer_size) { | 1509 | if (cpu_id == RING_BUFFER_ALL_CPUS) { |
1510 | /* calculate the pages to update */ | ||
1511 | for_each_buffer_cpu(buffer, cpu) { | ||
1512 | cpu_buffer = buffer->buffers[cpu]; | ||
1325 | 1513 | ||
1326 | /* easy case, just free pages */ | 1514 | cpu_buffer->nr_pages_to_update = nr_pages - |
1327 | if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) | 1515 | cpu_buffer->nr_pages; |
1328 | goto out_fail; | 1516 | /* |
1517 | * nothing more to do if we are removing pages or there is no update | ||
1518 | */ | ||
1519 | if (cpu_buffer->nr_pages_to_update <= 0) | ||
1520 | continue; | ||
1521 | /* | ||
1522 | * to add pages, make sure all new pages can be | ||
1523 | * allocated without receiving ENOMEM | ||
1524 | */ | ||
1525 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1526 | if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, | ||
1527 | &cpu_buffer->new_pages, cpu)) { | ||
1528 | /* not enough memory for new pages */ | ||
1529 | err = -ENOMEM; | ||
1530 | goto out_err; | ||
1531 | } | ||
1532 | } | ||
1329 | 1533 | ||
1330 | rm_pages = buffer->pages - nr_pages; | 1534 | get_online_cpus(); |
1535 | /* | ||
1536 | * Fire off all the required work handlers | ||
1537 | * We can't schedule on offline CPUs, but it's not necessary | ||
1538 | * since we can change their buffer sizes without any race. | ||
1539 | */ | ||
1540 | for_each_buffer_cpu(buffer, cpu) { | ||
1541 | cpu_buffer = buffer->buffers[cpu]; | ||
1542 | if (!cpu_buffer->nr_pages_to_update) | ||
1543 | continue; | ||
1544 | |||
1545 | if (cpu_online(cpu)) | ||
1546 | schedule_work_on(cpu, | ||
1547 | &cpu_buffer->update_pages_work); | ||
1548 | else | ||
1549 | rb_update_pages(cpu_buffer); | ||
1550 | } | ||
1331 | 1551 | ||
1552 | /* wait for all the updates to complete */ | ||
1332 | for_each_buffer_cpu(buffer, cpu) { | 1553 | for_each_buffer_cpu(buffer, cpu) { |
1333 | cpu_buffer = buffer->buffers[cpu]; | 1554 | cpu_buffer = buffer->buffers[cpu]; |
1334 | rb_remove_pages(cpu_buffer, rm_pages); | 1555 | if (!cpu_buffer->nr_pages_to_update) |
1556 | continue; | ||
1557 | |||
1558 | if (cpu_online(cpu)) | ||
1559 | wait_for_completion(&cpu_buffer->update_done); | ||
1560 | cpu_buffer->nr_pages_to_update = 0; | ||
1335 | } | 1561 | } |
1336 | goto out; | ||
1337 | } | ||
1338 | 1562 | ||
1339 | /* | 1563 | put_online_cpus(); |
1340 | * This is a bit more difficult. We only want to add pages | 1564 | } else { |
1341 | * when we can allocate enough for all CPUs. We do this | 1565 | cpu_buffer = buffer->buffers[cpu_id]; |
1342 | * by allocating all the pages and storing them on a local | ||
1343 | * link list. If we succeed in our allocation, then we | ||
1344 | * add these pages to the cpu_buffers. Otherwise we just free | ||
1345 | * them all and return -ENOMEM; | ||
1346 | */ | ||
1347 | if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) | ||
1348 | goto out_fail; | ||
1349 | 1566 | ||
1350 | new_pages = nr_pages - buffer->pages; | 1567 | if (nr_pages == cpu_buffer->nr_pages) |
1568 | goto out; | ||
1351 | 1569 | ||
1352 | for_each_buffer_cpu(buffer, cpu) { | 1570 | cpu_buffer->nr_pages_to_update = nr_pages - |
1353 | for (i = 0; i < new_pages; i++) { | 1571 | cpu_buffer->nr_pages; |
1354 | struct page *page; | 1572 | |
1355 | /* | 1573 | INIT_LIST_HEAD(&cpu_buffer->new_pages); |
1356 | * __GFP_NORETRY flag makes sure that the allocation | 1574 | if (cpu_buffer->nr_pages_to_update > 0 && |
1357 | * fails gracefully without invoking oom-killer and | 1575 | __rb_allocate_pages(cpu_buffer->nr_pages_to_update, |
1358 | * the system is not destabilized. | 1576 | &cpu_buffer->new_pages, cpu_id)) { |
1359 | */ | 1577 | err = -ENOMEM; |
1360 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), | 1578 | goto out_err; |
1361 | cache_line_size()), | ||
1362 | GFP_KERNEL | __GFP_NORETRY, | ||
1363 | cpu_to_node(cpu)); | ||
1364 | if (!bpage) | ||
1365 | goto free_pages; | ||
1366 | list_add(&bpage->list, &pages); | ||
1367 | page = alloc_pages_node(cpu_to_node(cpu), | ||
1368 | GFP_KERNEL | __GFP_NORETRY, 0); | ||
1369 | if (!page) | ||
1370 | goto free_pages; | ||
1371 | bpage->page = page_address(page); | ||
1372 | rb_init_page(bpage->page); | ||
1373 | } | 1579 | } |
1374 | } | ||
1375 | 1580 | ||
1376 | for_each_buffer_cpu(buffer, cpu) { | 1581 | get_online_cpus(); |
1377 | cpu_buffer = buffer->buffers[cpu]; | ||
1378 | rb_insert_pages(cpu_buffer, &pages, new_pages); | ||
1379 | } | ||
1380 | 1582 | ||
1381 | if (RB_WARN_ON(buffer, !list_empty(&pages))) | 1583 | if (cpu_online(cpu_id)) { |
1382 | goto out_fail; | 1584 | schedule_work_on(cpu_id, |
1585 | &cpu_buffer->update_pages_work); | ||
1586 | wait_for_completion(&cpu_buffer->update_done); | ||
1587 | } else | ||
1588 | rb_update_pages(cpu_buffer); | ||
1589 | |||
1590 | cpu_buffer->nr_pages_to_update = 0; | ||
1591 | put_online_cpus(); | ||
1592 | } | ||
1383 | 1593 | ||
1384 | out: | 1594 | out: |
1385 | buffer->pages = nr_pages; | 1595 | /* |
1386 | put_online_cpus(); | 1596 | * The ring buffer resize can happen with the ring buffer |
1597 | * enabled, so that the update disturbs the tracing as little | ||
1598 | * as possible. But if the buffer is disabled, we do not need | ||
1599 | * to worry about that, and we can take the time to verify | ||
1600 | * that the buffer is not corrupt. | ||
1601 | */ | ||
1602 | if (atomic_read(&buffer->record_disabled)) { | ||
1603 | atomic_inc(&buffer->record_disabled); | ||
1604 | /* | ||
1605 | * Even though the buffer was disabled, we must make sure | ||
1606 | * that it is truly disabled before calling rb_check_pages. | ||
1607 | * There could have been a race between checking | ||
1608 | * record_disable and incrementing it. | ||
1609 | */ | ||
1610 | synchronize_sched(); | ||
1611 | for_each_buffer_cpu(buffer, cpu) { | ||
1612 | cpu_buffer = buffer->buffers[cpu]; | ||
1613 | rb_check_pages(cpu_buffer); | ||
1614 | } | ||
1615 | atomic_dec(&buffer->record_disabled); | ||
1616 | } | ||
1617 | |||
1387 | mutex_unlock(&buffer->mutex); | 1618 | mutex_unlock(&buffer->mutex); |
1619 | return size; | ||
1388 | 1620 | ||
1389 | atomic_dec(&buffer->record_disabled); | 1621 | out_err: |
1622 | for_each_buffer_cpu(buffer, cpu) { | ||
1623 | struct buffer_page *bpage, *tmp; | ||
1390 | 1624 | ||
1391 | return size; | 1625 | cpu_buffer = buffer->buffers[cpu]; |
1626 | cpu_buffer->nr_pages_to_update = 0; | ||
1392 | 1627 | ||
1393 | free_pages: | 1628 | if (list_empty(&cpu_buffer->new_pages)) |
1394 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | 1629 | continue; |
1395 | list_del_init(&bpage->list); | ||
1396 | free_buffer_page(bpage); | ||
1397 | } | ||
1398 | put_online_cpus(); | ||
1399 | mutex_unlock(&buffer->mutex); | ||
1400 | atomic_dec(&buffer->record_disabled); | ||
1401 | return -ENOMEM; | ||
1402 | 1630 | ||
1403 | /* | 1631 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, |
1404 | * Something went totally wrong, and we are too paranoid | 1632 | list) { |
1405 | * to even clean up the mess. | 1633 | list_del_init(&bpage->list); |
1406 | */ | 1634 | free_buffer_page(bpage); |
1407 | out_fail: | 1635 | } |
1408 | put_online_cpus(); | 1636 | } |
1409 | mutex_unlock(&buffer->mutex); | 1637 | mutex_unlock(&buffer->mutex); |
1410 | atomic_dec(&buffer->record_disabled); | 1638 | return err; |
1411 | return -1; | ||
1412 | } | 1639 | } |
1413 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1640 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
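For reference, a minimal caller-side sketch of the reworked API (the helper name and arguments are illustrative, not part of this patch): ring_buffer_resize() now takes a third argument selecting either one CPU or RING_BUFFER_ALL_CPUS, with the size still given in bytes as in the existing callers.

	/* Sketch: resize every per-cpu buffer, or just one of them. */
	static int resize_sketch(struct ring_buffer *buffer, unsigned long size, int cpu)
	{
		int ret;

		/* cpu == RING_BUFFER_ALL_CPUS touches all per-cpu buffers,
		 * otherwise only buffer->buffers[cpu] is resized. */
		ret = ring_buffer_resize(buffer, size, cpu);
		if (ret < 0)
			return ret;	/* -ENOMEM: the previous sizes are kept */

		return 0;
	}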
1414 | 1641 | ||
@@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) | |||
1447 | return __rb_page_index(iter->head_page, iter->head); | 1674 | return __rb_page_index(iter->head_page, iter->head); |
1448 | } | 1675 | } |
1449 | 1676 | ||
1450 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1451 | { | ||
1452 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1453 | } | ||
1454 | |||
1455 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | 1677 | static inline unsigned rb_page_commit(struct buffer_page *bpage) |
1456 | { | 1678 | { |
1457 | return local_read(&bpage->page->commit); | 1679 | return local_read(&bpage->page->commit); |
1458 | } | 1680 | } |
1459 | 1681 | ||
1460 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) | ||
1461 | { | ||
1462 | return local_read(&bpage->entries) & RB_WRITE_MASK; | ||
1463 | } | ||
1464 | |||
1465 | /* Size is determined by what has been committed */ | 1682 | /* Size is determined by what has been committed */ |
1466 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1683 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1467 | { | 1684 | { |
@@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | |||
1510 | * assign the commit to the tail. | 1727 | * assign the commit to the tail. |
1511 | */ | 1728 | */ |
1512 | again: | 1729 | again: |
1513 | max_count = cpu_buffer->buffer->pages * 100; | 1730 | max_count = cpu_buffer->nr_pages * 100; |
1514 | 1731 | ||
1515 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | 1732 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { |
1516 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) | 1733 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) |
@@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) | |||
3486 | 3703 | ||
3487 | iter->cpu_buffer = cpu_buffer; | 3704 | iter->cpu_buffer = cpu_buffer; |
3488 | 3705 | ||
3706 | atomic_inc(&buffer->resize_disabled); | ||
3489 | atomic_inc(&cpu_buffer->record_disabled); | 3707 | atomic_inc(&cpu_buffer->record_disabled); |
3490 | 3708 | ||
3491 | return iter; | 3709 | return iter; |
@@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) | |||
3548 | { | 3766 | { |
3549 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3767 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3550 | 3768 | ||
3769 | /* | ||
3770 | * Ring buffer is disabled from recording, here's a good place | ||
3771 | * to check the integrity of the ring buffer. | ||
3772 | */ | ||
3773 | rb_check_pages(cpu_buffer); | ||
3774 | |||
3551 | atomic_dec(&cpu_buffer->record_disabled); | 3775 | atomic_dec(&cpu_buffer->record_disabled); |
3776 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | ||
3552 | kfree(iter); | 3777 | kfree(iter); |
3553 | } | 3778 | } |
3554 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); | 3779 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); |
@@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); | |||
3588 | * ring_buffer_size - return the size of the ring buffer (in bytes) | 3813 | * ring_buffer_size - return the size of the ring buffer (in bytes) |
3589 | * @buffer: The ring buffer. | 3814 | * @buffer: The ring buffer. |
3590 | */ | 3815 | */ |
3591 | unsigned long ring_buffer_size(struct ring_buffer *buffer) | 3816 | unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) |
3592 | { | 3817 | { |
3593 | return BUF_PAGE_SIZE * buffer->pages; | 3818 | /* |
3819 | * Earlier, this method returned | ||
3820 | * BUF_PAGE_SIZE * buffer->nr_pages | ||
3821 | * Since the nr_pages field is now removed, we have converted this to | ||
3822 | * return the per cpu buffer value. | ||
3823 | */ | ||
3824 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3825 | return 0; | ||
3826 | |||
3827 | return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; | ||
3594 | } | 3828 | } |
3595 | EXPORT_SYMBOL_GPL(ring_buffer_size); | 3829 | EXPORT_SYMBOL_GPL(ring_buffer_size); |
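Because the size is now tracked per CPU, an aggregate figure has to be summed up by the caller. A minimal sketch, using the file's own for_each_buffer_cpu() iterator (so it would have to live inside ring_buffer.c; the helper name is invented here):

	/* Sketch: total footprint across all per-cpu buffers, in bytes. */
	static unsigned long ring_buffer_total_size(struct ring_buffer *buffer)
	{
		unsigned long total = 0;
		int cpu;

		for_each_buffer_cpu(buffer, cpu)
			total += ring_buffer_size(buffer, cpu);

		return total;
	}

This mirrors what trace.c does further down in tracing_total_entries_read(), which sums tr->data[cpu]->entries instead of calling back into the ring buffer.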
3596 | 3830 | ||
@@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3611 | cpu_buffer->commit_page = cpu_buffer->head_page; | 3845 | cpu_buffer->commit_page = cpu_buffer->head_page; |
3612 | 3846 | ||
3613 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 3847 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
3848 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
3614 | local_set(&cpu_buffer->reader_page->write, 0); | 3849 | local_set(&cpu_buffer->reader_page->write, 0); |
3615 | local_set(&cpu_buffer->reader_page->entries, 0); | 3850 | local_set(&cpu_buffer->reader_page->entries, 0); |
3616 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3851 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
@@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3647 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 3882 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
3648 | return; | 3883 | return; |
3649 | 3884 | ||
3885 | atomic_inc(&buffer->resize_disabled); | ||
3650 | atomic_inc(&cpu_buffer->record_disabled); | 3886 | atomic_inc(&cpu_buffer->record_disabled); |
3651 | 3887 | ||
3888 | /* Make sure all commits have finished */ | ||
3889 | synchronize_sched(); | ||
3890 | |||
3652 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3891 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3653 | 3892 | ||
3654 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3893 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
@@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3664 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3903 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3665 | 3904 | ||
3666 | atomic_dec(&cpu_buffer->record_disabled); | 3905 | atomic_dec(&cpu_buffer->record_disabled); |
3906 | atomic_dec(&buffer->resize_disabled); | ||
3667 | } | 3907 | } |
3668 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); | 3908 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); |
3669 | 3909 | ||
@@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3765 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) | 4005 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) |
3766 | goto out; | 4006 | goto out; |
3767 | 4007 | ||
4008 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
4009 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
4010 | |||
3768 | /* At least make sure the two buffers are somewhat the same */ | 4011 | /* At least make sure the two buffers are somewhat the same */ |
3769 | if (buffer_a->pages != buffer_b->pages) | 4012 | if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) |
3770 | goto out; | 4013 | goto out; |
3771 | 4014 | ||
3772 | ret = -EAGAIN; | 4015 | ret = -EAGAIN; |
@@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3780 | if (atomic_read(&buffer_b->record_disabled)) | 4023 | if (atomic_read(&buffer_b->record_disabled)) |
3781 | goto out; | 4024 | goto out; |
3782 | 4025 | ||
3783 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
3784 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
3785 | |||
3786 | if (atomic_read(&cpu_buffer_a->record_disabled)) | 4026 | if (atomic_read(&cpu_buffer_a->record_disabled)) |
3787 | goto out; | 4027 | goto out; |
3788 | 4028 | ||
@@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4071 | struct ring_buffer *buffer = | 4311 | struct ring_buffer *buffer = |
4072 | container_of(self, struct ring_buffer, cpu_notify); | 4312 | container_of(self, struct ring_buffer, cpu_notify); |
4073 | long cpu = (long)hcpu; | 4313 | long cpu = (long)hcpu; |
4314 | int cpu_i, nr_pages_same; | ||
4315 | unsigned int nr_pages; | ||
4074 | 4316 | ||
4075 | switch (action) { | 4317 | switch (action) { |
4076 | case CPU_UP_PREPARE: | 4318 | case CPU_UP_PREPARE: |
@@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4078 | if (cpumask_test_cpu(cpu, buffer->cpumask)) | 4320 | if (cpumask_test_cpu(cpu, buffer->cpumask)) |
4079 | return NOTIFY_OK; | 4321 | return NOTIFY_OK; |
4080 | 4322 | ||
4323 | nr_pages = 0; | ||
4324 | nr_pages_same = 1; | ||
4325 | /* check if all cpu sizes are the same */ | ||

4326 | for_each_buffer_cpu(buffer, cpu_i) { | ||
4327 | /* fill in the size from first enabled cpu */ | ||
4328 | if (nr_pages == 0) | ||
4329 | nr_pages = buffer->buffers[cpu_i]->nr_pages; | ||
4330 | if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { | ||
4331 | nr_pages_same = 0; | ||
4332 | break; | ||
4333 | } | ||
4334 | } | ||
4335 | /* allocate minimum pages, user can later expand it */ | ||
4336 | if (!nr_pages_same) | ||
4337 | nr_pages = 2; | ||
4081 | buffer->buffers[cpu] = | 4338 | buffer->buffers[cpu] = |
4082 | rb_allocate_cpu_buffer(buffer, cpu); | 4339 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
4083 | if (!buffer->buffers[cpu]) { | 4340 | if (!buffer->buffers[cpu]) { |
4084 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", | 4341 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", |
4085 | cpu); | 4342 | cpu); |
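The sizing rule applied when a CPU is hot-added can be read in isolation; a condensed sketch of the same decision as a standalone helper (illustrative only, it would sit next to rb_cpu_notify() since it uses the file's for_each_buffer_cpu() iterator):

	/* Sketch: pick nr_pages for a newly onlined CPU. */
	static unsigned int nr_pages_for_new_cpu(struct ring_buffer *buffer)
	{
		unsigned int nr_pages = 0;
		int cpu;

		for_each_buffer_cpu(buffer, cpu) {
			if (nr_pages == 0)
				nr_pages = buffer->buffers[cpu]->nr_pages;
			if (nr_pages != buffer->buffers[cpu]->nr_pages)
				return 2;	/* per-cpu sizes differ: start minimal */
		}
		return nr_pages ? nr_pages : 2;
	}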
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a22255c1010..68032c6177db 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -87,18 +87,6 @@ static int tracing_disabled = 1; | |||
87 | 87 | ||
88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); | 88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); |
89 | 89 | ||
90 | static inline void ftrace_disable_cpu(void) | ||
91 | { | ||
92 | preempt_disable(); | ||
93 | __this_cpu_inc(ftrace_cpu_disabled); | ||
94 | } | ||
95 | |||
96 | static inline void ftrace_enable_cpu(void) | ||
97 | { | ||
98 | __this_cpu_dec(ftrace_cpu_disabled); | ||
99 | preempt_enable(); | ||
100 | } | ||
101 | |||
102 | cpumask_var_t __read_mostly tracing_buffer_mask; | 90 | cpumask_var_t __read_mostly tracing_buffer_mask; |
103 | 91 | ||
104 | /* | 92 | /* |
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) | |||
629 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 617 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
630 | { | 618 | { |
631 | int len; | 619 | int len; |
632 | void *ret; | ||
633 | 620 | ||
634 | if (s->len <= s->readpos) | 621 | if (s->len <= s->readpos) |
635 | return -EBUSY; | 622 | return -EBUSY; |
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
637 | len = s->len - s->readpos; | 624 | len = s->len - s->readpos; |
638 | if (cnt > len) | 625 | if (cnt > len) |
639 | cnt = len; | 626 | cnt = len; |
640 | ret = memcpy(buf, s->buffer + s->readpos, cnt); | 627 | memcpy(buf, s->buffer + s->readpos, cnt); |
641 | if (!ret) | ||
642 | return -EFAULT; | ||
643 | 628 | ||
644 | s->readpos += cnt; | 629 | s->readpos += cnt; |
645 | return cnt; | 630 | return cnt; |
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
751 | 736 | ||
752 | arch_spin_lock(&ftrace_max_lock); | 737 | arch_spin_lock(&ftrace_max_lock); |
753 | 738 | ||
754 | ftrace_disable_cpu(); | ||
755 | |||
756 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); | 739 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); |
757 | 740 | ||
758 | if (ret == -EBUSY) { | 741 | if (ret == -EBUSY) { |
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
766 | "Failed to swap buffers due to commit in progress\n"); | 749 | "Failed to swap buffers due to commit in progress\n"); |
767 | } | 750 | } |
768 | 751 | ||
769 | ftrace_enable_cpu(); | ||
770 | |||
771 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); | 752 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); |
772 | 753 | ||
773 | __update_max_tr(tr, tsk, cpu); | 754 | __update_max_tr(tr, tsk, cpu); |
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
782 | * Register a new plugin tracer. | 763 | * Register a new plugin tracer. |
783 | */ | 764 | */ |
784 | int register_tracer(struct tracer *type) | 765 | int register_tracer(struct tracer *type) |
785 | __releases(kernel_lock) | ||
786 | __acquires(kernel_lock) | ||
787 | { | 766 | { |
788 | struct tracer *t; | 767 | struct tracer *t; |
789 | int ret = 0; | 768 | int ret = 0; |
@@ -841,7 +820,8 @@ __acquires(kernel_lock) | |||
841 | 820 | ||
842 | /* If we expanded the buffers, make sure the max is expanded too */ | 821 | /* If we expanded the buffers, make sure the max is expanded too */ |
843 | if (ring_buffer_expanded && type->use_max_tr) | 822 | if (ring_buffer_expanded && type->use_max_tr) |
844 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | 823 | ring_buffer_resize(max_tr.buffer, trace_buf_size, |
824 | RING_BUFFER_ALL_CPUS); | ||
845 | 825 | ||
846 | /* the test is responsible for initializing and enabling */ | 826 | /* the test is responsible for initializing and enabling */ |
847 | pr_info("Testing tracer %s: ", type->name); | 827 | pr_info("Testing tracer %s: ", type->name); |
@@ -857,7 +837,8 @@ __acquires(kernel_lock) | |||
857 | 837 | ||
858 | /* Shrink the max buffer again */ | 838 | /* Shrink the max buffer again */ |
859 | if (ring_buffer_expanded && type->use_max_tr) | 839 | if (ring_buffer_expanded && type->use_max_tr) |
860 | ring_buffer_resize(max_tr.buffer, 1); | 840 | ring_buffer_resize(max_tr.buffer, 1, |
841 | RING_BUFFER_ALL_CPUS); | ||
861 | 842 | ||
862 | printk(KERN_CONT "PASSED\n"); | 843 | printk(KERN_CONT "PASSED\n"); |
863 | } | 844 | } |
@@ -917,13 +898,6 @@ out: | |||
917 | mutex_unlock(&trace_types_lock); | 898 | mutex_unlock(&trace_types_lock); |
918 | } | 899 | } |
919 | 900 | ||
920 | static void __tracing_reset(struct ring_buffer *buffer, int cpu) | ||
921 | { | ||
922 | ftrace_disable_cpu(); | ||
923 | ring_buffer_reset_cpu(buffer, cpu); | ||
924 | ftrace_enable_cpu(); | ||
925 | } | ||
926 | |||
927 | void tracing_reset(struct trace_array *tr, int cpu) | 901 | void tracing_reset(struct trace_array *tr, int cpu) |
928 | { | 902 | { |
929 | struct ring_buffer *buffer = tr->buffer; | 903 | struct ring_buffer *buffer = tr->buffer; |
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
932 | 906 | ||
933 | /* Make sure all commits have finished */ | 907 | /* Make sure all commits have finished */ |
934 | synchronize_sched(); | 908 | synchronize_sched(); |
935 | __tracing_reset(buffer, cpu); | 909 | ring_buffer_reset_cpu(buffer, cpu); |
936 | 910 | ||
937 | ring_buffer_record_enable(buffer); | 911 | ring_buffer_record_enable(buffer); |
938 | } | 912 | } |
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
950 | tr->time_start = ftrace_now(tr->cpu); | 924 | tr->time_start = ftrace_now(tr->cpu); |
951 | 925 | ||
952 | for_each_online_cpu(cpu) | 926 | for_each_online_cpu(cpu) |
953 | __tracing_reset(buffer, cpu); | 927 | ring_buffer_reset_cpu(buffer, cpu); |
954 | 928 | ||
955 | ring_buffer_record_enable(buffer); | 929 | ring_buffer_record_enable(buffer); |
956 | } | 930 | } |
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
1498 | 1472 | ||
1499 | #endif /* CONFIG_STACKTRACE */ | 1473 | #endif /* CONFIG_STACKTRACE */ |
1500 | 1474 | ||
1475 | /* created for use with alloc_percpu */ | ||
1476 | struct trace_buffer_struct { | ||
1477 | char buffer[TRACE_BUF_SIZE]; | ||
1478 | }; | ||
1479 | |||
1480 | static struct trace_buffer_struct *trace_percpu_buffer; | ||
1481 | static struct trace_buffer_struct *trace_percpu_sirq_buffer; | ||
1482 | static struct trace_buffer_struct *trace_percpu_irq_buffer; | ||
1483 | static struct trace_buffer_struct *trace_percpu_nmi_buffer; | ||
1484 | |||
1485 | /* | ||
1486 | * The buffer used is dependent on the context. There is a per cpu | ||
1487 | * buffer for normal context, softirq context, hard irq context and | ||
1488 | * for NMI context. This allows for lockless recording. | ||
1489 | * | ||
1490 | * Note, if the buffers failed to be allocated, then this returns NULL | ||
1491 | */ | ||
1492 | static char *get_trace_buf(void) | ||
1493 | { | ||
1494 | struct trace_buffer_struct *percpu_buffer; | ||
1495 | struct trace_buffer_struct *buffer; | ||
1496 | |||
1497 | /* | ||
1498 | * If we have allocated per cpu buffers, then we do not | ||
1499 | * need to do any locking. | ||
1500 | */ | ||
1501 | if (in_nmi()) | ||
1502 | percpu_buffer = trace_percpu_nmi_buffer; | ||
1503 | else if (in_irq()) | ||
1504 | percpu_buffer = trace_percpu_irq_buffer; | ||
1505 | else if (in_softirq()) | ||
1506 | percpu_buffer = trace_percpu_sirq_buffer; | ||
1507 | else | ||
1508 | percpu_buffer = trace_percpu_buffer; | ||
1509 | |||
1510 | if (!percpu_buffer) | ||
1511 | return NULL; | ||
1512 | |||
1513 | buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); | ||
1514 | |||
1515 | return buffer->buffer; | ||
1516 | } | ||
1517 | |||
1518 | static int alloc_percpu_trace_buffer(void) | ||
1519 | { | ||
1520 | struct trace_buffer_struct *buffers; | ||
1521 | struct trace_buffer_struct *sirq_buffers; | ||
1522 | struct trace_buffer_struct *irq_buffers; | ||
1523 | struct trace_buffer_struct *nmi_buffers; | ||
1524 | |||
1525 | buffers = alloc_percpu(struct trace_buffer_struct); | ||
1526 | if (!buffers) | ||
1527 | goto err_warn; | ||
1528 | |||
1529 | sirq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1530 | if (!sirq_buffers) | ||
1531 | goto err_sirq; | ||
1532 | |||
1533 | irq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1534 | if (!irq_buffers) | ||
1535 | goto err_irq; | ||
1536 | |||
1537 | nmi_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1538 | if (!nmi_buffers) | ||
1539 | goto err_nmi; | ||
1540 | |||
1541 | trace_percpu_buffer = buffers; | ||
1542 | trace_percpu_sirq_buffer = sirq_buffers; | ||
1543 | trace_percpu_irq_buffer = irq_buffers; | ||
1544 | trace_percpu_nmi_buffer = nmi_buffers; | ||
1545 | |||
1546 | return 0; | ||
1547 | |||
1548 | err_nmi: | ||
1549 | free_percpu(irq_buffers); | ||
1550 | err_irq: | ||
1551 | free_percpu(sirq_buffers); | ||
1552 | err_sirq: | ||
1553 | free_percpu(buffers); | ||
1554 | err_warn: | ||
1555 | WARN(1, "Could not allocate percpu trace_printk buffer"); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | |||
1559 | void trace_printk_init_buffers(void) | ||
1560 | { | ||
1561 | static int buffers_allocated; | ||
1562 | |||
1563 | if (buffers_allocated) | ||
1564 | return; | ||
1565 | |||
1566 | if (alloc_percpu_trace_buffer()) | ||
1567 | return; | ||
1568 | |||
1569 | pr_info("ftrace: Allocated trace_printk buffers\n"); | ||
1570 | |||
1571 | buffers_allocated = 1; | ||
1572 | } | ||
1573 | |||
1501 | /** | 1574 | /** |
1502 | * trace_vbprintk - write binary msg to tracing buffer | 1575 | * trace_vbprintk - write binary msg to tracing buffer |
1503 | * | 1576 | * |
1504 | */ | 1577 | */ |
1505 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | 1578 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) |
1506 | { | 1579 | { |
1507 | static arch_spinlock_t trace_buf_lock = | ||
1508 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
1509 | static u32 trace_buf[TRACE_BUF_SIZE]; | ||
1510 | |||
1511 | struct ftrace_event_call *call = &event_bprint; | 1580 | struct ftrace_event_call *call = &event_bprint; |
1512 | struct ring_buffer_event *event; | 1581 | struct ring_buffer_event *event; |
1513 | struct ring_buffer *buffer; | 1582 | struct ring_buffer *buffer; |
1514 | struct trace_array *tr = &global_trace; | 1583 | struct trace_array *tr = &global_trace; |
1515 | struct trace_array_cpu *data; | ||
1516 | struct bprint_entry *entry; | 1584 | struct bprint_entry *entry; |
1517 | unsigned long flags; | 1585 | unsigned long flags; |
1518 | int disable; | 1586 | char *tbuffer; |
1519 | int cpu, len = 0, size, pc; | 1587 | int len = 0, size, pc; |
1520 | 1588 | ||
1521 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1589 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
1522 | return 0; | 1590 | return 0; |
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1526 | 1594 | ||
1527 | pc = preempt_count(); | 1595 | pc = preempt_count(); |
1528 | preempt_disable_notrace(); | 1596 | preempt_disable_notrace(); |
1529 | cpu = raw_smp_processor_id(); | ||
1530 | data = tr->data[cpu]; | ||
1531 | 1597 | ||
1532 | disable = atomic_inc_return(&data->disabled); | 1598 | tbuffer = get_trace_buf(); |
1533 | if (unlikely(disable != 1)) | 1599 | if (!tbuffer) { |
1600 | len = 0; | ||
1534 | goto out; | 1601 | goto out; |
1602 | } | ||
1535 | 1603 | ||
1536 | /* Lockdep uses trace_printk for lock tracing */ | 1604 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); |
1537 | local_irq_save(flags); | ||
1538 | arch_spin_lock(&trace_buf_lock); | ||
1539 | len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1540 | 1605 | ||
1541 | if (len > TRACE_BUF_SIZE || len < 0) | 1606 | if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) |
1542 | goto out_unlock; | 1607 | goto out; |
1543 | 1608 | ||
1609 | local_save_flags(flags); | ||
1544 | size = sizeof(*entry) + sizeof(u32) * len; | 1610 | size = sizeof(*entry) + sizeof(u32) * len; |
1545 | buffer = tr->buffer; | 1611 | buffer = tr->buffer; |
1546 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, | 1612 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, |
1547 | flags, pc); | 1613 | flags, pc); |
1548 | if (!event) | 1614 | if (!event) |
1549 | goto out_unlock; | 1615 | goto out; |
1550 | entry = ring_buffer_event_data(event); | 1616 | entry = ring_buffer_event_data(event); |
1551 | entry->ip = ip; | 1617 | entry->ip = ip; |
1552 | entry->fmt = fmt; | 1618 | entry->fmt = fmt; |
1553 | 1619 | ||
1554 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1620 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1555 | if (!filter_check_discard(call, entry, buffer, event)) { | 1621 | if (!filter_check_discard(call, entry, buffer, event)) { |
1556 | ring_buffer_unlock_commit(buffer, event); | 1622 | ring_buffer_unlock_commit(buffer, event); |
1557 | ftrace_trace_stack(buffer, flags, 6, pc); | 1623 | ftrace_trace_stack(buffer, flags, 6, pc); |
1558 | } | 1624 | } |
1559 | 1625 | ||
1560 | out_unlock: | ||
1561 | arch_spin_unlock(&trace_buf_lock); | ||
1562 | local_irq_restore(flags); | ||
1563 | |||
1564 | out: | 1626 | out: |
1565 | atomic_dec_return(&data->disabled); | ||
1566 | preempt_enable_notrace(); | 1627 | preempt_enable_notrace(); |
1567 | unpause_graph_tracing(); | 1628 | unpause_graph_tracing(); |
1568 | 1629 | ||
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr, | |||
1588 | int trace_array_vprintk(struct trace_array *tr, | 1649 | int trace_array_vprintk(struct trace_array *tr, |
1589 | unsigned long ip, const char *fmt, va_list args) | 1650 | unsigned long ip, const char *fmt, va_list args) |
1590 | { | 1651 | { |
1591 | static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; | ||
1592 | static char trace_buf[TRACE_BUF_SIZE]; | ||
1593 | |||
1594 | struct ftrace_event_call *call = &event_print; | 1652 | struct ftrace_event_call *call = &event_print; |
1595 | struct ring_buffer_event *event; | 1653 | struct ring_buffer_event *event; |
1596 | struct ring_buffer *buffer; | 1654 | struct ring_buffer *buffer; |
1597 | struct trace_array_cpu *data; | 1655 | int len = 0, size, pc; |
1598 | int cpu, len = 0, size, pc; | ||
1599 | struct print_entry *entry; | 1656 | struct print_entry *entry; |
1600 | unsigned long irq_flags; | 1657 | unsigned long flags; |
1601 | int disable; | 1658 | char *tbuffer; |
1602 | 1659 | ||
1603 | if (tracing_disabled || tracing_selftest_running) | 1660 | if (tracing_disabled || tracing_selftest_running) |
1604 | return 0; | 1661 | return 0; |
1605 | 1662 | ||
1663 | /* Don't pollute graph traces with trace_vprintk internals */ | ||
1664 | pause_graph_tracing(); | ||
1665 | |||
1606 | pc = preempt_count(); | 1666 | pc = preempt_count(); |
1607 | preempt_disable_notrace(); | 1667 | preempt_disable_notrace(); |
1608 | cpu = raw_smp_processor_id(); | ||
1609 | data = tr->data[cpu]; | ||
1610 | 1668 | ||
1611 | disable = atomic_inc_return(&data->disabled); | 1669 | |
1612 | if (unlikely(disable != 1)) | 1670 | tbuffer = get_trace_buf(); |
1671 | if (!tbuffer) { | ||
1672 | len = 0; | ||
1613 | goto out; | 1673 | goto out; |
1674 | } | ||
1614 | 1675 | ||
1615 | pause_graph_tracing(); | 1676 | len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
1616 | raw_local_irq_save(irq_flags); | 1677 | if (len > TRACE_BUF_SIZE) |
1617 | arch_spin_lock(&trace_buf_lock); | 1678 | goto out; |
1618 | len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1619 | 1679 | ||
1680 | local_save_flags(flags); | ||
1620 | size = sizeof(*entry) + len + 1; | 1681 | size = sizeof(*entry) + len + 1; |
1621 | buffer = tr->buffer; | 1682 | buffer = tr->buffer; |
1622 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 1683 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
1623 | irq_flags, pc); | 1684 | flags, pc); |
1624 | if (!event) | 1685 | if (!event) |
1625 | goto out_unlock; | 1686 | goto out; |
1626 | entry = ring_buffer_event_data(event); | 1687 | entry = ring_buffer_event_data(event); |
1627 | entry->ip = ip; | 1688 | entry->ip = ip; |
1628 | 1689 | ||
1629 | memcpy(&entry->buf, trace_buf, len); | 1690 | memcpy(&entry->buf, tbuffer, len); |
1630 | entry->buf[len] = '\0'; | 1691 | entry->buf[len] = '\0'; |
1631 | if (!filter_check_discard(call, entry, buffer, event)) { | 1692 | if (!filter_check_discard(call, entry, buffer, event)) { |
1632 | ring_buffer_unlock_commit(buffer, event); | 1693 | ring_buffer_unlock_commit(buffer, event); |
1633 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | 1694 | ftrace_trace_stack(buffer, flags, 6, pc); |
1634 | } | 1695 | } |
1635 | |||
1636 | out_unlock: | ||
1637 | arch_spin_unlock(&trace_buf_lock); | ||
1638 | raw_local_irq_restore(irq_flags); | ||
1639 | unpause_graph_tracing(); | ||
1640 | out: | 1696 | out: |
1641 | atomic_dec_return(&data->disabled); | ||
1642 | preempt_enable_notrace(); | 1697 | preempt_enable_notrace(); |
1698 | unpause_graph_tracing(); | ||
1643 | 1699 | ||
1644 | return len; | 1700 | return len; |
1645 | } | 1701 | } |
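Both printk paths above now share the same lockless shape: pick the per-cpu, per-context scratch buffer, format into it, then reserve and commit a ring buffer event. A condensed sketch of that shape (locals as declared in those functions; the commit step is elided):

	/* Sketch: common pattern of trace_vbprintk()/trace_array_vprintk(). */
	pc = preempt_count();
	preempt_disable_notrace();

	tbuffer = get_trace_buf();	/* per-cpu, per-context: no spinlock */
	if (tbuffer) {
		len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
		if (len <= TRACE_BUF_SIZE) {
			/* reserve event, memcpy() from tbuffer, commit */
		}
	}

	preempt_enable_notrace();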
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk); | |||
1652 | 1708 | ||
1653 | static void trace_iterator_increment(struct trace_iterator *iter) | 1709 | static void trace_iterator_increment(struct trace_iterator *iter) |
1654 | { | 1710 | { |
1655 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1656 | ftrace_disable_cpu(); | ||
1657 | |||
1658 | iter->idx++; | 1711 | iter->idx++; |
1659 | if (iter->buffer_iter[iter->cpu]) | 1712 | if (iter->buffer_iter[iter->cpu]) |
1660 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); | 1713 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); |
1661 | |||
1662 | ftrace_enable_cpu(); | ||
1663 | } | 1714 | } |
1664 | 1715 | ||
1665 | static struct trace_entry * | 1716 | static struct trace_entry * |
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1669 | struct ring_buffer_event *event; | 1720 | struct ring_buffer_event *event; |
1670 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; | 1721 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; |
1671 | 1722 | ||
1672 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1673 | ftrace_disable_cpu(); | ||
1674 | |||
1675 | if (buf_iter) | 1723 | if (buf_iter) |
1676 | event = ring_buffer_iter_peek(buf_iter, ts); | 1724 | event = ring_buffer_iter_peek(buf_iter, ts); |
1677 | else | 1725 | else |
1678 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, | 1726 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, |
1679 | lost_events); | 1727 | lost_events); |
1680 | 1728 | ||
1681 | ftrace_enable_cpu(); | ||
1682 | |||
1683 | if (event) { | 1729 | if (event) { |
1684 | iter->ent_size = ring_buffer_event_length(event); | 1730 | iter->ent_size = ring_buffer_event_length(event); |
1685 | return ring_buffer_event_data(event); | 1731 | return ring_buffer_event_data(event); |
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) | |||
1769 | 1815 | ||
1770 | static void trace_consume(struct trace_iterator *iter) | 1816 | static void trace_consume(struct trace_iterator *iter) |
1771 | { | 1817 | { |
1772 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1773 | ftrace_disable_cpu(); | ||
1774 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, | 1818 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, |
1775 | &iter->lost_events); | 1819 | &iter->lost_events); |
1776 | ftrace_enable_cpu(); | ||
1777 | } | 1820 | } |
1778 | 1821 | ||
1779 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) | 1822 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) |
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1862 | iter->cpu = 0; | 1905 | iter->cpu = 0; |
1863 | iter->idx = -1; | 1906 | iter->idx = -1; |
1864 | 1907 | ||
1865 | ftrace_disable_cpu(); | ||
1866 | |||
1867 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | 1908 | if (cpu_file == TRACE_PIPE_ALL_CPU) { |
1868 | for_each_tracing_cpu(cpu) | 1909 | for_each_tracing_cpu(cpu) |
1869 | tracing_iter_reset(iter, cpu); | 1910 | tracing_iter_reset(iter, cpu); |
1870 | } else | 1911 | } else |
1871 | tracing_iter_reset(iter, cpu_file); | 1912 | tracing_iter_reset(iter, cpu_file); |
1872 | 1913 | ||
1873 | ftrace_enable_cpu(); | ||
1874 | |||
1875 | iter->leftover = 0; | 1914 | iter->leftover = 0; |
1876 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) | 1915 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) |
1877 | ; | 1916 | ; |
@@ -2332,15 +2371,13 @@ static struct trace_iterator * | |||
2332 | __tracing_open(struct inode *inode, struct file *file) | 2371 | __tracing_open(struct inode *inode, struct file *file) |
2333 | { | 2372 | { |
2334 | long cpu_file = (long) inode->i_private; | 2373 | long cpu_file = (long) inode->i_private; |
2335 | void *fail_ret = ERR_PTR(-ENOMEM); | ||
2336 | struct trace_iterator *iter; | 2374 | struct trace_iterator *iter; |
2337 | struct seq_file *m; | 2375 | int cpu; |
2338 | int cpu, ret; | ||
2339 | 2376 | ||
2340 | if (tracing_disabled) | 2377 | if (tracing_disabled) |
2341 | return ERR_PTR(-ENODEV); | 2378 | return ERR_PTR(-ENODEV); |
2342 | 2379 | ||
2343 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2380 | iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); |
2344 | if (!iter) | 2381 | if (!iter) |
2345 | return ERR_PTR(-ENOMEM); | 2382 | return ERR_PTR(-ENOMEM); |
2346 | 2383 | ||
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2397 | tracing_iter_reset(iter, cpu); | 2434 | tracing_iter_reset(iter, cpu); |
2398 | } | 2435 | } |
2399 | 2436 | ||
2400 | ret = seq_open(file, &tracer_seq_ops); | ||
2401 | if (ret < 0) { | ||
2402 | fail_ret = ERR_PTR(ret); | ||
2403 | goto fail_buffer; | ||
2404 | } | ||
2405 | |||
2406 | m = file->private_data; | ||
2407 | m->private = iter; | ||
2408 | |||
2409 | mutex_unlock(&trace_types_lock); | 2437 | mutex_unlock(&trace_types_lock); |
2410 | 2438 | ||
2411 | return iter; | 2439 | return iter; |
2412 | 2440 | ||
2413 | fail_buffer: | ||
2414 | for_each_tracing_cpu(cpu) { | ||
2415 | if (iter->buffer_iter[cpu]) | ||
2416 | ring_buffer_read_finish(iter->buffer_iter[cpu]); | ||
2417 | } | ||
2418 | free_cpumask_var(iter->started); | ||
2419 | tracing_start(); | ||
2420 | fail: | 2441 | fail: |
2421 | mutex_unlock(&trace_types_lock); | 2442 | mutex_unlock(&trace_types_lock); |
2422 | kfree(iter->trace); | 2443 | kfree(iter->trace); |
2423 | kfree(iter); | 2444 | seq_release_private(inode, file); |
2424 | 2445 | return ERR_PTR(-ENOMEM); | |
2425 | return fail_ret; | ||
2426 | } | 2446 | } |
2427 | 2447 | ||
2428 | int tracing_open_generic(struct inode *inode, struct file *filp) | 2448 | int tracing_open_generic(struct inode *inode, struct file *filp) |
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2458 | tracing_start(); | 2478 | tracing_start(); |
2459 | mutex_unlock(&trace_types_lock); | 2479 | mutex_unlock(&trace_types_lock); |
2460 | 2480 | ||
2461 | seq_release(inode, file); | ||
2462 | mutex_destroy(&iter->mutex); | 2481 | mutex_destroy(&iter->mutex); |
2463 | free_cpumask_var(iter->started); | 2482 | free_cpumask_var(iter->started); |
2464 | kfree(iter->trace); | 2483 | kfree(iter->trace); |
2465 | kfree(iter); | 2484 | seq_release_private(inode, file); |
2466 | return 0; | 2485 | return 0; |
2467 | } | 2486 | } |
2468 | 2487 | ||
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
2648 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 2667 | if (cpumask_test_cpu(cpu, tracing_cpumask) && |
2649 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2668 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2650 | atomic_inc(&global_trace.data[cpu]->disabled); | 2669 | atomic_inc(&global_trace.data[cpu]->disabled); |
2670 | ring_buffer_record_disable_cpu(global_trace.buffer, cpu); | ||
2651 | } | 2671 | } |
2652 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 2672 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && |
2653 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2673 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2654 | atomic_dec(&global_trace.data[cpu]->disabled); | 2674 | atomic_dec(&global_trace.data[cpu]->disabled); |
2675 | ring_buffer_record_enable_cpu(global_trace.buffer, cpu); | ||
2655 | } | 2676 | } |
2656 | } | 2677 | } |
2657 | arch_spin_unlock(&ftrace_max_lock); | 2678 | arch_spin_unlock(&ftrace_max_lock); |
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr) | |||
2974 | return t->init(tr); | 2995 | return t->init(tr); |
2975 | } | 2996 | } |
2976 | 2997 | ||
2977 | static int __tracing_resize_ring_buffer(unsigned long size) | 2998 | static void set_buffer_entries(struct trace_array *tr, unsigned long val) |
2999 | { | ||
3000 | int cpu; | ||
3001 | for_each_tracing_cpu(cpu) | ||
3002 | tr->data[cpu]->entries = val; | ||
3003 | } | ||
3004 | |||
3005 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | ||
2978 | { | 3006 | { |
2979 | int ret; | 3007 | int ret; |
2980 | 3008 | ||
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
2985 | */ | 3013 | */ |
2986 | ring_buffer_expanded = 1; | 3014 | ring_buffer_expanded = 1; |
2987 | 3015 | ||
2988 | ret = ring_buffer_resize(global_trace.buffer, size); | 3016 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
2989 | if (ret < 0) | 3017 | if (ret < 0) |
2990 | return ret; | 3018 | return ret; |
2991 | 3019 | ||
2992 | if (!current_trace->use_max_tr) | 3020 | if (!current_trace->use_max_tr) |
2993 | goto out; | 3021 | goto out; |
2994 | 3022 | ||
2995 | ret = ring_buffer_resize(max_tr.buffer, size); | 3023 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); |
2996 | if (ret < 0) { | 3024 | if (ret < 0) { |
2997 | int r; | 3025 | int r = 0; |
3026 | |||
3027 | if (cpu == RING_BUFFER_ALL_CPUS) { | ||
3028 | int i; | ||
3029 | for_each_tracing_cpu(i) { | ||
3030 | r = ring_buffer_resize(global_trace.buffer, | ||
3031 | global_trace.data[i]->entries, | ||
3032 | i); | ||
3033 | if (r < 0) | ||
3034 | break; | ||
3035 | } | ||
3036 | } else { | ||
3037 | r = ring_buffer_resize(global_trace.buffer, | ||
3038 | global_trace.data[cpu]->entries, | ||
3039 | cpu); | ||
3040 | } | ||
2998 | 3041 | ||
2999 | r = ring_buffer_resize(global_trace.buffer, | ||
3000 | global_trace.entries); | ||
3001 | if (r < 0) { | 3042 | if (r < 0) { |
3002 | /* | 3043 | /* |
3003 | * AARGH! We are left with different | 3044 | * AARGH! We are left with different |
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
3019 | return ret; | 3060 | return ret; |
3020 | } | 3061 | } |
3021 | 3062 | ||
3022 | max_tr.entries = size; | 3063 | if (cpu == RING_BUFFER_ALL_CPUS) |
3064 | set_buffer_entries(&max_tr, size); | ||
3065 | else | ||
3066 | max_tr.data[cpu]->entries = size; | ||
3067 | |||
3023 | out: | 3068 | out: |
3024 | global_trace.entries = size; | 3069 | if (cpu == RING_BUFFER_ALL_CPUS) |
3070 | set_buffer_entries(&global_trace, size); | ||
3071 | else | ||
3072 | global_trace.data[cpu]->entries = size; | ||
3025 | 3073 | ||
3026 | return ret; | 3074 | return ret; |
3027 | } | 3075 | } |
3028 | 3076 | ||
3029 | static ssize_t tracing_resize_ring_buffer(unsigned long size) | 3077 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) |
3030 | { | 3078 | { |
3031 | int cpu, ret = size; | 3079 | int ret = size; |
3032 | 3080 | ||
3033 | mutex_lock(&trace_types_lock); | 3081 | mutex_lock(&trace_types_lock); |
3034 | 3082 | ||
3035 | tracing_stop(); | 3083 | if (cpu_id != RING_BUFFER_ALL_CPUS) { |
3036 | 3084 | /* make sure this cpu is enabled in the mask */ | |
3037 | /* disable all cpu buffers */ | 3085 | if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { |
3038 | for_each_tracing_cpu(cpu) { | 3086 | ret = -EINVAL; |
3039 | if (global_trace.data[cpu]) | 3087 | goto out; |
3040 | atomic_inc(&global_trace.data[cpu]->disabled); | 3088 | } |
3041 | if (max_tr.data[cpu]) | ||
3042 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
3043 | } | 3089 | } |
3044 | 3090 | ||
3045 | if (size != global_trace.entries) | 3091 | ret = __tracing_resize_ring_buffer(size, cpu_id); |
3046 | ret = __tracing_resize_ring_buffer(size); | ||
3047 | |||
3048 | if (ret < 0) | 3092 | if (ret < 0) |
3049 | ret = -ENOMEM; | 3093 | ret = -ENOMEM; |
3050 | 3094 | ||
3051 | for_each_tracing_cpu(cpu) { | 3095 | out: |
3052 | if (global_trace.data[cpu]) | ||
3053 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
3054 | if (max_tr.data[cpu]) | ||
3055 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
3056 | } | ||
3057 | |||
3058 | tracing_start(); | ||
3059 | mutex_unlock(&trace_types_lock); | 3096 | mutex_unlock(&trace_types_lock); |
3060 | 3097 | ||
3061 | return ret; | 3098 | return ret; |
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void) | |||
3078 | 3115 | ||
3079 | mutex_lock(&trace_types_lock); | 3116 | mutex_lock(&trace_types_lock); |
3080 | if (!ring_buffer_expanded) | 3117 | if (!ring_buffer_expanded) |
3081 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3118 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3119 | RING_BUFFER_ALL_CPUS); | ||
3082 | mutex_unlock(&trace_types_lock); | 3120 | mutex_unlock(&trace_types_lock); |
3083 | 3121 | ||
3084 | return ret; | 3122 | return ret; |
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf) | |||
3102 | mutex_lock(&trace_types_lock); | 3140 | mutex_lock(&trace_types_lock); |
3103 | 3141 | ||
3104 | if (!ring_buffer_expanded) { | 3142 | if (!ring_buffer_expanded) { |
3105 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3143 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3144 | RING_BUFFER_ALL_CPUS); | ||
3106 | if (ret < 0) | 3145 | if (ret < 0) |
3107 | goto out; | 3146 | goto out; |
3108 | ret = 0; | 3147 | ret = 0; |
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf) | |||
3128 | * The max_tr ring buffer has some state (e.g. ring->clock) and | 3167 | * The max_tr ring buffer has some state (e.g. ring->clock) and |
3129 | * we want to preserve it. | 3168 | * we want to preserve it. |
3130 | */ | 3169 | */ |
3131 | ring_buffer_resize(max_tr.buffer, 1); | 3170 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); |
3132 | max_tr.entries = 1; | 3171 | set_buffer_entries(&max_tr, 1); |
3133 | } | 3172 | } |
3134 | destroy_trace_option_files(topts); | 3173 | destroy_trace_option_files(topts); |
3135 | 3174 | ||
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf) | |||
3137 | 3176 | ||
3138 | topts = create_trace_option_files(current_trace); | 3177 | topts = create_trace_option_files(current_trace); |
3139 | if (current_trace->use_max_tr) { | 3178 | if (current_trace->use_max_tr) { |
3140 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | 3179 | int cpu; |
3141 | if (ret < 0) | 3180 | /* we need to make per cpu buffer sizes equivalent */ |
3142 | goto out; | 3181 | for_each_tracing_cpu(cpu) { |
3143 | max_tr.entries = global_trace.entries; | 3182 | ret = ring_buffer_resize(max_tr.buffer, |
3183 | global_trace.data[cpu]->entries, | ||
3184 | cpu); | ||
3185 | if (ret < 0) | ||
3186 | goto out; | ||
3187 | max_tr.data[cpu]->entries = | ||
3188 | global_trace.data[cpu]->entries; | ||
3189 | } | ||
3144 | } | 3190 | } |
3145 | 3191 | ||
3146 | if (t->init) { | 3192 | if (t->init) { |
@@ -3642,30 +3688,82 @@ out_err: | |||
3642 | goto out; | 3688 | goto out; |
3643 | } | 3689 | } |
3644 | 3690 | ||
3691 | struct ftrace_entries_info { | ||
3692 | struct trace_array *tr; | ||
3693 | int cpu; | ||
3694 | }; | ||
3695 | |||
3696 | static int tracing_entries_open(struct inode *inode, struct file *filp) | ||
3697 | { | ||
3698 | struct ftrace_entries_info *info; | ||
3699 | |||
3700 | if (tracing_disabled) | ||
3701 | return -ENODEV; | ||
3702 | |||
3703 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
3704 | if (!info) | ||
3705 | return -ENOMEM; | ||
3706 | |||
3707 | info->tr = &global_trace; | ||
3708 | info->cpu = (unsigned long)inode->i_private; | ||
3709 | |||
3710 | filp->private_data = info; | ||
3711 | |||
3712 | return 0; | ||
3713 | } | ||
3714 | |||
3645 | static ssize_t | 3715 | static ssize_t |
3646 | tracing_entries_read(struct file *filp, char __user *ubuf, | 3716 | tracing_entries_read(struct file *filp, char __user *ubuf, |
3647 | size_t cnt, loff_t *ppos) | 3717 | size_t cnt, loff_t *ppos) |
3648 | { | 3718 | { |
3649 | struct trace_array *tr = filp->private_data; | 3719 | struct ftrace_entries_info *info = filp->private_data; |
3650 | char buf[96]; | 3720 | struct trace_array *tr = info->tr; |
3651 | int r; | 3721 | char buf[64]; |
3722 | int r = 0; | ||
3723 | ssize_t ret; | ||
3652 | 3724 | ||
3653 | mutex_lock(&trace_types_lock); | 3725 | mutex_lock(&trace_types_lock); |
3654 | if (!ring_buffer_expanded) | 3726 | |
3655 | r = sprintf(buf, "%lu (expanded: %lu)\n", | 3727 | if (info->cpu == RING_BUFFER_ALL_CPUS) { |
3656 | tr->entries >> 10, | 3728 | int cpu, buf_size_same; |
3657 | trace_buf_size >> 10); | 3729 | unsigned long size; |
3658 | else | 3730 | |
3659 | r = sprintf(buf, "%lu\n", tr->entries >> 10); | 3731 | size = 0; |
3732 | buf_size_same = 1; | ||
3733 | /* check if all cpu sizes are same */ | ||
3734 | for_each_tracing_cpu(cpu) { | ||
3735 | /* fill in the size from first enabled cpu */ | ||
3736 | if (size == 0) | ||
3737 | size = tr->data[cpu]->entries; | ||
3738 | if (size != tr->data[cpu]->entries) { | ||
3739 | buf_size_same = 0; | ||
3740 | break; | ||
3741 | } | ||
3742 | } | ||
3743 | |||
3744 | if (buf_size_same) { | ||
3745 | if (!ring_buffer_expanded) | ||
3746 | r = sprintf(buf, "%lu (expanded: %lu)\n", | ||
3747 | size >> 10, | ||
3748 | trace_buf_size >> 10); | ||
3749 | else | ||
3750 | r = sprintf(buf, "%lu\n", size >> 10); | ||
3751 | } else | ||
3752 | r = sprintf(buf, "X\n"); | ||
3753 | } else | ||
3754 | r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); | ||
3755 | |||
3660 | mutex_unlock(&trace_types_lock); | 3756 | mutex_unlock(&trace_types_lock); |
3661 | 3757 | ||
3662 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 3758 | ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
3759 | return ret; | ||
3663 | } | 3760 | } |
3664 | 3761 | ||
3665 | static ssize_t | 3762 | static ssize_t |
3666 | tracing_entries_write(struct file *filp, const char __user *ubuf, | 3763 | tracing_entries_write(struct file *filp, const char __user *ubuf, |
3667 | size_t cnt, loff_t *ppos) | 3764 | size_t cnt, loff_t *ppos) |
3668 | { | 3765 | { |
3766 | struct ftrace_entries_info *info = filp->private_data; | ||
3669 | unsigned long val; | 3767 | unsigned long val; |
3670 | int ret; | 3768 | int ret; |
3671 | 3769 | ||
@@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3680 | /* value is in KB */ | 3778 | /* value is in KB */ |
3681 | val <<= 10; | 3779 | val <<= 10; |
3682 | 3780 | ||
3683 | ret = tracing_resize_ring_buffer(val); | 3781 | ret = tracing_resize_ring_buffer(val, info->cpu); |
3684 | if (ret < 0) | 3782 | if (ret < 0) |
3685 | return ret; | 3783 | return ret; |
3686 | 3784 | ||
@@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3689 | return cnt; | 3787 | return cnt; |
3690 | } | 3788 | } |
3691 | 3789 | ||
3790 | static int | ||
3791 | tracing_entries_release(struct inode *inode, struct file *filp) | ||
3792 | { | ||
3793 | struct ftrace_entries_info *info = filp->private_data; | ||
3794 | |||
3795 | kfree(info); | ||
3796 | |||
3797 | return 0; | ||
3798 | } | ||
3799 | |||
3692 | static ssize_t | 3800 | static ssize_t |
3693 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | 3801 | tracing_total_entries_read(struct file *filp, char __user *ubuf, |
3694 | size_t cnt, loff_t *ppos) | 3802 | size_t cnt, loff_t *ppos) |
@@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, | |||
3700 | 3808 | ||
3701 | mutex_lock(&trace_types_lock); | 3809 | mutex_lock(&trace_types_lock); |
3702 | for_each_tracing_cpu(cpu) { | 3810 | for_each_tracing_cpu(cpu) { |
3703 | size += tr->entries >> 10; | 3811 | size += tr->data[cpu]->entries >> 10; |
3704 | if (!ring_buffer_expanded) | 3812 | if (!ring_buffer_expanded) |
3705 | expanded_size += trace_buf_size >> 10; | 3813 | expanded_size += trace_buf_size >> 10; |
3706 | } | 3814 | } |
@@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
3734 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | 3842 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) |
3735 | tracing_off(); | 3843 | tracing_off(); |
3736 | /* resize the ring buffer to 0 */ | 3844 | /* resize the ring buffer to 0 */ |
3737 | tracing_resize_ring_buffer(0); | 3845 | tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); |
3738 | 3846 | ||
3739 | return 0; | 3847 | return 0; |
3740 | } | 3848 | } |
@@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3749 | struct print_entry *entry; | 3857 | struct print_entry *entry; |
3750 | unsigned long irq_flags; | 3858 | unsigned long irq_flags; |
3751 | struct page *pages[2]; | 3859 | struct page *pages[2]; |
3860 | void *map_page[2]; | ||
3752 | int nr_pages = 1; | 3861 | int nr_pages = 1; |
3753 | ssize_t written; | 3862 | ssize_t written; |
3754 | void *page1; | ||
3755 | void *page2; | ||
3756 | int offset; | 3863 | int offset; |
3757 | int size; | 3864 | int size; |
3758 | int len; | 3865 | int len; |
3759 | int ret; | 3866 | int ret; |
3867 | int i; | ||
3760 | 3868 | ||
3761 | if (tracing_disabled) | 3869 | if (tracing_disabled) |
3762 | return -EINVAL; | 3870 | return -EINVAL; |
@@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3795 | goto out; | 3903 | goto out; |
3796 | } | 3904 | } |
3797 | 3905 | ||
3798 | page1 = kmap_atomic(pages[0]); | 3906 | for (i = 0; i < nr_pages; i++) |
3799 | if (nr_pages == 2) | 3907 | map_page[i] = kmap_atomic(pages[i]); |
3800 | page2 = kmap_atomic(pages[1]); | ||
3801 | 3908 | ||
3802 | local_save_flags(irq_flags); | 3909 | local_save_flags(irq_flags); |
3803 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 3910 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
@@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3815 | 3922 | ||
3816 | if (nr_pages == 2) { | 3923 | if (nr_pages == 2) { |
3817 | len = PAGE_SIZE - offset; | 3924 | len = PAGE_SIZE - offset; |
3818 | memcpy(&entry->buf, page1 + offset, len); | 3925 | memcpy(&entry->buf, map_page[0] + offset, len); |
3819 | memcpy(&entry->buf[len], page2, cnt - len); | 3926 | memcpy(&entry->buf[len], map_page[1], cnt - len); |
3820 | } else | 3927 | } else |
3821 | memcpy(&entry->buf, page1 + offset, cnt); | 3928 | memcpy(&entry->buf, map_page[0] + offset, cnt); |
3822 | 3929 | ||
3823 | if (entry->buf[cnt - 1] != '\n') { | 3930 | if (entry->buf[cnt - 1] != '\n') { |
3824 | entry->buf[cnt] = '\n'; | 3931 | entry->buf[cnt] = '\n'; |
@@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3833 | *fpos += written; | 3940 | *fpos += written; |
3834 | 3941 | ||
3835 | out_unlock: | 3942 | out_unlock: |
3836 | if (nr_pages == 2) | 3943 | for (i = 0; i < nr_pages; i++){ |
3837 | kunmap_atomic(page2); | 3944 | kunmap_atomic(map_page[i]); |
3838 | kunmap_atomic(page1); | 3945 | put_page(pages[i]); |
3839 | while (nr_pages > 0) | 3946 | } |
3840 | put_page(pages[--nr_pages]); | ||
3841 | out: | 3947 | out: |
3842 | return written; | 3948 | return written; |
3843 | } | 3949 | } |
@@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = { | |||
3933 | }; | 4039 | }; |
3934 | 4040 | ||
3935 | static const struct file_operations tracing_entries_fops = { | 4041 | static const struct file_operations tracing_entries_fops = { |
3936 | .open = tracing_open_generic, | 4042 | .open = tracing_entries_open, |
3937 | .read = tracing_entries_read, | 4043 | .read = tracing_entries_read, |
3938 | .write = tracing_entries_write, | 4044 | .write = tracing_entries_write, |
4045 | .release = tracing_entries_release, | ||
3939 | .llseek = generic_file_llseek, | 4046 | .llseek = generic_file_llseek, |
3940 | }; | 4047 | }; |
3941 | 4048 | ||
@@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4367 | struct dentry *d_cpu; | 4474 | struct dentry *d_cpu; |
4368 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 4475 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4369 | 4476 | ||
4477 | if (!d_percpu) | ||
4478 | return; | ||
4479 | |||
4370 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 4480 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4371 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4481 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4372 | if (!d_cpu) { | 4482 | if (!d_cpu) { |
@@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4387 | 4497 | ||
4388 | trace_create_file("stats", 0444, d_cpu, | 4498 | trace_create_file("stats", 0444, d_cpu, |
4389 | (void *) cpu, &tracing_stats_fops); | 4499 | (void *) cpu, &tracing_stats_fops); |
4500 | |||
4501 | trace_create_file("buffer_size_kb", 0444, d_cpu, | ||
4502 | (void *) cpu, &tracing_entries_fops); | ||
4390 | } | 4503 | } |
4391 | 4504 | ||
4392 | #ifdef CONFIG_FTRACE_SELFTEST | 4505 | #ifdef CONFIG_FTRACE_SELFTEST |
@@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void) | |||
4718 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); | 4831 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); |
4719 | 4832 | ||
4720 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4833 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4721 | &global_trace, &tracing_entries_fops); | 4834 | (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); |
4722 | 4835 | ||
4723 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | 4836 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, |
4724 | &global_trace, &tracing_total_entries_fops); | 4837 | &global_trace, &tracing_total_entries_fops); |
@@ -4957,6 +5070,10 @@ __init static int tracer_alloc_buffers(void) | |||
4957 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 5070 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
4958 | goto out_free_buffer_mask; | 5071 | goto out_free_buffer_mask; |
4959 | 5072 | ||
5073 | /* Only allocate trace_printk buffers if a trace_printk exists */ | ||
5074 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | ||
5075 | trace_printk_init_buffers(); | ||
5076 | |||
4960 | /* To save memory, keep the ring buffer size to its minimum */ | 5077 | /* To save memory, keep the ring buffer size to its minimum */ |
4961 | if (ring_buffer_expanded) | 5078 | if (ring_buffer_expanded) |
4962 | ring_buf_size = trace_buf_size; | 5079 | ring_buf_size = trace_buf_size; |
@@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void) | |||
4975 | WARN_ON(1); | 5092 | WARN_ON(1); |
4976 | goto out_free_cpumask; | 5093 | goto out_free_cpumask; |
4977 | } | 5094 | } |
4978 | global_trace.entries = ring_buffer_size(global_trace.buffer); | ||
4979 | if (global_trace.buffer_disabled) | 5095 | if (global_trace.buffer_disabled) |
4980 | tracing_off(); | 5096 | tracing_off(); |
4981 | 5097 | ||
@@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void) | |||
4988 | ring_buffer_free(global_trace.buffer); | 5104 | ring_buffer_free(global_trace.buffer); |
4989 | goto out_free_cpumask; | 5105 | goto out_free_cpumask; |
4990 | } | 5106 | } |
4991 | max_tr.entries = 1; | ||
4992 | #endif | 5107 | #endif |
4993 | 5108 | ||
4994 | /* Allocate the first page for all buffers */ | 5109 | /* Allocate the first page for all buffers */ |
@@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void) | |||
4997 | max_tr.data[i] = &per_cpu(max_tr_data, i); | 5112 | max_tr.data[i] = &per_cpu(max_tr_data, i); |
4998 | } | 5113 | } |
4999 | 5114 | ||
5115 | set_buffer_entries(&global_trace, | ||
5116 | ring_buffer_size(global_trace.buffer, 0)); | ||
5117 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5118 | set_buffer_entries(&max_tr, 1); | ||
5119 | #endif | ||
5120 | |||
5000 | trace_init_cmdlines(); | 5121 | trace_init_cmdlines(); |
5001 | 5122 | ||
5002 | register_tracer(&nop_trace); | 5123 | register_tracer(&nop_trace); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f95d65da6db8..6c6f7933eede 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -131,6 +131,7 @@ struct trace_array_cpu { | |||
131 | atomic_t disabled; | 131 | atomic_t disabled; |
132 | void *buffer_page; /* ring buffer spare */ | 132 | void *buffer_page; /* ring buffer spare */ |
133 | 133 | ||
134 | unsigned long entries; | ||
134 | unsigned long saved_latency; | 135 | unsigned long saved_latency; |
135 | unsigned long critical_start; | 136 | unsigned long critical_start; |
136 | unsigned long critical_end; | 137 | unsigned long critical_end; |
@@ -152,7 +153,6 @@ struct trace_array_cpu { | |||
152 | */ | 153 | */ |
153 | struct trace_array { | 154 | struct trace_array { |
154 | struct ring_buffer *buffer; | 155 | struct ring_buffer *buffer; |
155 | unsigned long entries; | ||
156 | int cpu; | 156 | int cpu; |
157 | int buffer_disabled; | 157 | int buffer_disabled; |
158 | cycle_t time_start; | 158 | cycle_t time_start; |
@@ -826,6 +826,8 @@ extern struct list_head ftrace_events; | |||
826 | extern const char *__start___trace_bprintk_fmt[]; | 826 | extern const char *__start___trace_bprintk_fmt[]; |
827 | extern const char *__stop___trace_bprintk_fmt[]; | 827 | extern const char *__stop___trace_bprintk_fmt[]; |
828 | 828 | ||
829 | void trace_printk_init_buffers(void); | ||
830 | |||
829 | #undef FTRACE_ENTRY | 831 | #undef FTRACE_ENTRY |
830 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 832 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
831 | extern struct ftrace_event_call \ | 833 | extern struct ftrace_event_call \ |
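Moving `entries` from struct trace_array into struct trace_array_cpu turns the recorded buffer size into per-CPU state, which is what lets each CPU's ring buffer be sized independently. A rough sketch of that layout change, with illustrative names only:

```c
#define NR_DEMO_CPUS 8			/* illustrative bound only */

/* Per-CPU slot: the buffer size now lives here... */
struct demo_cpu_data {
	unsigned long entries;
};

/* ...instead of a single global count on the array itself. */
struct demo_trace_array {
	struct demo_cpu_data *data[NR_DEMO_CPUS];
};

/* Resize one CPU's buffer without touching the others. */
static void set_cpu_entries(struct demo_trace_array *tr, int cpu,
			    unsigned long val)
{
	tr->data[cpu]->entries = val;
}

int main(void)
{
	struct demo_cpu_data cpu0 = { 0 };
	struct demo_trace_array tr = { .data = { [0] = &cpu0 } };

	set_cpu_entries(&tr, 0, 1024);
	return (int)(tr.data[0]->entries != 1024);
}
```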
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93ae8a9d..29111da1d100 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
294 | if (!call->name || !call->class || !call->class->reg) | 294 | if (!call->name || !call->class || !call->class->reg) |
295 | continue; | 295 | continue; |
296 | 296 | ||
297 | if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) | ||
298 | continue; | ||
299 | |||
297 | if (match && | 300 | if (match && |
298 | strcmp(match, call->name) != 0 && | 301 | strcmp(match, call->name) != 0 && |
299 | strcmp(match, call->class->system) != 0) | 302 | strcmp(match, call->class->system) != 0) |
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1164 | return -1; | 1167 | return -1; |
1165 | } | 1168 | } |
1166 | 1169 | ||
1167 | if (call->class->reg) | 1170 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1168 | trace_create_file("enable", 0644, call->dir, call, | 1171 | trace_create_file("enable", 0644, call->dir, call, |
1169 | enable); | 1172 | enable); |
1170 | 1173 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8bc856..e039906b037d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
180 | .event.type = etype, \ | 180 | .event.type = etype, \ |
181 | .class = &event_class_ftrace_##call, \ | 181 | .class = &event_class_ftrace_##call, \ |
182 | .print_fmt = print, \ | 182 | .print_fmt = print, \ |
183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | ||
183 | }; \ | 184 | }; \ |
184 | struct ftrace_event_call __used \ | 185 | struct ftrace_event_call __used \ |
185 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
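Together, the trace_events.c and trace_export.c hunks introduce events that must not be toggled from the generic enable path: the ftrace-internal events get TRACE_EVENT_FL_IGNORE_ENABLE at definition time, and both the set/clear loop and the "enable" file creation skip anything carrying that flag. A small standalone sketch of the flag-check pattern (names invented for the example):

```c
#include <stdio.h>

/* Hypothetical flag values for this sketch only. */
enum {
	DEMO_FL_IGNORE_ENABLE = 1 << 0,
};

struct demo_event {
	const char *name;
	unsigned int flags;
};

static void set_clr_event(struct demo_event *ev, int set)
{
	/* Mirrors the added check: events carrying the ignore flag are
	 * never toggled by the generic enable path. */
	if (ev->flags & DEMO_FL_IGNORE_ENABLE) {
		printf("%s: skipped (ignore-enable)\n", ev->name);
		return;
	}
	printf("%s: %s\n", ev->name, set ? "enabled" : "disabled");
}

int main(void)
{
	struct demo_event internal = { "ftrace_internal", DEMO_FL_IGNORE_ENABLE };
	struct demo_event normal   = { "sched_switch", 0 };

	set_clr_event(&internal, 1);
	set_clr_event(&normal, 1);
	return 0;
}
```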
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f9..a9077c1b4ad3 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | 52 | char *fmt; |
53 | 53 | ||
54 | /* allocate the trace_printk per cpu buffers */ | ||
55 | if (start != end) | ||
56 | trace_printk_init_buffers(); | ||
57 | |||
54 | mutex_lock(&btrace_mutex); | 58 | mutex_lock(&btrace_mutex); |
55 | for (iter = start; iter < end; iter++) { | 59 | for (iter = start; iter < end; iter++) { |
56 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); | 60 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
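The module path takes the same lazy approach as the boot path: hold_module_trace_bprintk_format() triggers buffer allocation only when the module actually ships format strings (start != end). A minimal sketch of that one-shot, pay-on-first-use initialisation pattern, with invented names:

```c
#include <stdbool.h>
#include <stdio.h>

/* The first caller that actually has formats to register pays the
 * allocation cost; later callers see buffers_ready and do nothing. */
static bool buffers_ready;

static void init_buffers_once(void)
{
	if (buffers_ready)
		return;
	buffers_ready = true;
	puts("allocating per-cpu trace_printk buffers");
}

static void hold_formats(const char **start, const char **end)
{
	if (start != end)		/* only if any formats are shipped */
		init_buffers_once();
	/* ... register each format between start and end ... */
}

int main(void)
{
	const char *fmts[] = { "evt %d\n" };

	hold_formats(fmts, fmts + 1);	/* has formats: triggers init */
	hold_formats(fmts, fmts);	/* empty range: no allocation */
	return 0;
}
```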
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a4721..000000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null | |||
@@ -1,300 +0,0 @@ | |||
1 | /* | ||
2 | * Workqueue statistical tracer. | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | |||
9 | #include <trace/events/workqueue.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/percpu.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/kref.h> | ||
14 | #include "trace_stat.h" | ||
15 | #include "trace.h" | ||
16 | |||
17 | |||
18 | /* A cpu workqueue thread */ | ||
19 | struct cpu_workqueue_stats { | ||
20 | struct list_head list; | ||
21 | struct kref kref; | ||
22 | int cpu; | ||
23 | pid_t pid; | ||
24 | /* Can be inserted from interrupt or user context, needs to be atomic */ | ||
25 | atomic_t inserted; | ||
26 | /* | ||
27 | * Doesn't need to be atomic: works are serialized by a single workqueue thread | ||
28 | * on a single CPU. | ||
29 | */ | ||
30 | unsigned int executed; | ||
31 | }; | ||
32 | |||
33 | /* List of workqueue threads on one cpu */ | ||
34 | struct workqueue_global_stats { | ||
35 | struct list_head list; | ||
36 | spinlock_t lock; | ||
37 | }; | ||
38 | |||
39 | /* No global lock is needed: these are allocated before the workqueues and | ||
40 | * never freed. | ||
41 | */ | ||
42 | static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); | ||
43 | #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) | ||
44 | |||
45 | static void cpu_workqueue_stat_free(struct kref *kref) | ||
46 | { | ||
47 | kfree(container_of(kref, struct cpu_workqueue_stats, kref)); | ||
48 | } | ||
49 | |||
50 | /* Insertion of a work */ | ||
51 | static void | ||
52 | probe_workqueue_insertion(void *ignore, | ||
53 | struct task_struct *wq_thread, | ||
54 | struct work_struct *work) | ||
55 | { | ||
56 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
57 | struct cpu_workqueue_stats *node; | ||
58 | unsigned long flags; | ||
59 | |||
60 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
61 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
62 | if (node->pid == wq_thread->pid) { | ||
63 | atomic_inc(&node->inserted); | ||
64 | goto found; | ||
65 | } | ||
66 | } | ||
67 | pr_debug("trace_workqueue: entry not found\n"); | ||
68 | found: | ||
69 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
70 | } | ||
71 | |||
72 | /* Execution of a work */ | ||
73 | static void | ||
74 | probe_workqueue_execution(void *ignore, | ||
75 | struct task_struct *wq_thread, | ||
76 | struct work_struct *work) | ||
77 | { | ||
78 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
79 | struct cpu_workqueue_stats *node; | ||
80 | unsigned long flags; | ||
81 | |||
82 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
83 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
84 | if (node->pid == wq_thread->pid) { | ||
85 | node->executed++; | ||
86 | goto found; | ||
87 | } | ||
88 | } | ||
89 | pr_debug("trace_workqueue: entry not found\n"); | ||
90 | found: | ||
91 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
92 | } | ||
93 | |||
94 | /* Creation of a cpu workqueue thread */ | ||
95 | static void probe_workqueue_creation(void *ignore, | ||
96 | struct task_struct *wq_thread, int cpu) | ||
97 | { | ||
98 | struct cpu_workqueue_stats *cws; | ||
99 | unsigned long flags; | ||
100 | |||
101 | WARN_ON(cpu < 0); | ||
102 | |||
103 | /* Workqueues are sometimes created in atomic context */ | ||
104 | cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); | ||
105 | if (!cws) { | ||
106 | pr_warning("trace_workqueue: not enough memory\n"); | ||
107 | return; | ||
108 | } | ||
109 | INIT_LIST_HEAD(&cws->list); | ||
110 | kref_init(&cws->kref); | ||
111 | cws->cpu = cpu; | ||
112 | cws->pid = wq_thread->pid; | ||
113 | |||
114 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
115 | list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); | ||
116 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
117 | } | ||
118 | |||
119 | /* Destruction of a cpu workqueue thread */ | ||
120 | static void | ||
121 | probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) | ||
122 | { | ||
123 | /* A workqueue thread only executes on one CPU */ | ||
124 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
125 | struct cpu_workqueue_stats *node, *next; | ||
126 | unsigned long flags; | ||
127 | |||
128 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
129 | list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, | ||
130 | list) { | ||
131 | if (node->pid == wq_thread->pid) { | ||
132 | list_del(&node->list); | ||
133 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
134 | goto found; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | pr_debug("trace_workqueue: don't find workqueue to destroy\n"); | ||
139 | found: | ||
140 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
141 | |||
142 | } | ||
143 | |||
144 | static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) | ||
145 | { | ||
146 | unsigned long flags; | ||
147 | struct cpu_workqueue_stats *ret = NULL; | ||
148 | |||
149 | |||
150 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
151 | |||
152 | if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { | ||
153 | ret = list_entry(workqueue_cpu_stat(cpu)->list.next, | ||
154 | struct cpu_workqueue_stats, list); | ||
155 | kref_get(&ret->kref); | ||
156 | } | ||
157 | |||
158 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
159 | |||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | static void *workqueue_stat_start(struct tracer_stat *trace) | ||
164 | { | ||
165 | int cpu; | ||
166 | void *ret = NULL; | ||
167 | |||
168 | for_each_possible_cpu(cpu) { | ||
169 | ret = workqueue_stat_start_cpu(cpu); | ||
170 | if (ret) | ||
171 | return ret; | ||
172 | } | ||
173 | return NULL; | ||
174 | } | ||
175 | |||
176 | static void *workqueue_stat_next(void *prev, int idx) | ||
177 | { | ||
178 | struct cpu_workqueue_stats *prev_cws = prev; | ||
179 | struct cpu_workqueue_stats *ret; | ||
180 | int cpu = prev_cws->cpu; | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
184 | if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { | ||
185 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
186 | do { | ||
187 | cpu = cpumask_next(cpu, cpu_possible_mask); | ||
188 | if (cpu >= nr_cpu_ids) | ||
189 | return NULL; | ||
190 | } while (!(ret = workqueue_stat_start_cpu(cpu))); | ||
191 | return ret; | ||
192 | } else { | ||
193 | ret = list_entry(prev_cws->list.next, | ||
194 | struct cpu_workqueue_stats, list); | ||
195 | kref_get(&ret->kref); | ||
196 | } | ||
197 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static int workqueue_stat_show(struct seq_file *s, void *p) | ||
203 | { | ||
204 | struct cpu_workqueue_stats *cws = p; | ||
205 | struct pid *pid; | ||
206 | struct task_struct *tsk; | ||
207 | |||
208 | pid = find_get_pid(cws->pid); | ||
209 | if (pid) { | ||
210 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
211 | if (tsk) { | ||
212 | seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, | ||
213 | atomic_read(&cws->inserted), cws->executed, | ||
214 | tsk->comm); | ||
215 | put_task_struct(tsk); | ||
216 | } | ||
217 | put_pid(pid); | ||
218 | } | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static void workqueue_stat_release(void *stat) | ||
224 | { | ||
225 | struct cpu_workqueue_stats *node = stat; | ||
226 | |||
227 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
228 | } | ||
229 | |||
230 | static int workqueue_stat_headers(struct seq_file *s) | ||
231 | { | ||
232 | seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); | ||
233 | seq_printf(s, "# | | | |\n"); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct tracer_stat workqueue_stats __read_mostly = { | ||
238 | .name = "workqueues", | ||
239 | .stat_start = workqueue_stat_start, | ||
240 | .stat_next = workqueue_stat_next, | ||
241 | .stat_show = workqueue_stat_show, | ||
242 | .stat_release = workqueue_stat_release, | ||
243 | .stat_headers = workqueue_stat_headers | ||
244 | }; | ||
245 | |||
246 | |||
247 | int __init stat_workqueue_init(void) | ||
248 | { | ||
249 | if (register_stat_tracer(&workqueue_stats)) { | ||
250 | pr_warning("Unable to register workqueue stat tracer\n"); | ||
251 | return 1; | ||
252 | } | ||
253 | |||
254 | return 0; | ||
255 | } | ||
256 | fs_initcall(stat_workqueue_init); | ||
257 | |||
258 | /* | ||
259 | * Workqueues are created very early, just after pre-smp initcalls. | ||
260 | * So we must register our tracepoints at this stage. | ||
261 | */ | ||
262 | int __init trace_workqueue_early_init(void) | ||
263 | { | ||
264 | int ret, cpu; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
272 | if (ret) | ||
273 | goto out; | ||
274 | |||
275 | ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
276 | if (ret) | ||
277 | goto no_insertion; | ||
278 | |||
279 | ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
280 | if (ret) | ||
281 | goto no_execution; | ||
282 | |||
283 | ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); | ||
284 | if (ret) | ||
285 | goto no_creation; | ||
286 | |||
287 | return 0; | ||
288 | |||
289 | no_creation: | ||
290 | unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
291 | no_execution: | ||
292 | unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
293 | no_insertion: | ||
294 | unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
295 | out: | ||
296 | pr_warning("trace_workqueue: unable to trace workqueues\n"); | ||
297 | |||
298 | return 1; | ||
299 | } | ||
300 | early_initcall(trace_workqueue_early_init); | ||
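The removed tracer registered four tracepoint probes from an early initcall and, when a later registration failed, unwound the probes already registered in reverse order. That register-then-unwind ladder is a common kernel idiom; a small userspace analogue, with illustrative names and a simulated failure:

```c
#include <stdio.h>

static int hook(const char *name, int fail)
{
	if (fail) {
		fprintf(stderr, "failed to register %s\n", name);
		return -1;
	}
	printf("registered %s\n", name);
	return 0;
}

static void unhook(const char *name)
{
	printf("unregistered %s\n", name);
}

static int early_init(void)
{
	if (hook("insertion", 0))
		goto out;
	if (hook("execution", 0))
		goto no_insertion;
	if (hook("creation", 1))	/* simulate a failure here */
		goto no_execution;
	return 0;

	/* Unwind in reverse order of registration. */
no_execution:
	unhook("execution");
no_insertion:
	unhook("insertion");
out:
	fprintf(stderr, "tracer disabled\n");
	return 1;
}

int main(void)
{
	return early_init();
}
```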
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c08..9a3128dc67df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1032 | cwq = get_cwq(gcwq->cpu, wq); | 1032 | cwq = get_cwq(gcwq->cpu, wq); |
1033 | trace_workqueue_queue_work(cpu, cwq, work); | 1033 | trace_workqueue_queue_work(cpu, cwq, work); |
1034 | 1034 | ||
1035 | BUG_ON(!list_empty(&work->entry)); | 1035 | if (WARN_ON(!list_empty(&work->entry))) { |
1036 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
1037 | return; | ||
1038 | } | ||
1036 | 1039 | ||
1037 | cwq->nr_in_flight[cwq->work_color]++; | 1040 | cwq->nr_in_flight[cwq->work_color]++; |
1038 | work_flags = work_color_to_flags(cwq->work_color); | 1041 | work_flags = work_color_to_flags(cwq->work_color); |
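The __queue_work() hunk downgrades a double-queue from BUG_ON() to a warning plus early return, taking care to drop gcwq->lock before bailing out. A userspace sketch of that warn-and-bail-with-balanced-locking pattern, with a pthread mutex standing in for the gcwq lock:

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

static int queue_item(int already_queued)
{
	pthread_mutex_lock(&queue_lock);

	if (already_queued) {			/* corresponds to !list_empty() */
		fprintf(stderr, "WARN: item already queued, ignoring\n");
		pthread_mutex_unlock(&queue_lock);	/* keep the lock balanced */
		return -1;
	}

	/* ... actually enqueue under the lock ... */
	pthread_mutex_unlock(&queue_lock);
	return 0;
}

int main(void)
{
	queue_item(0);
	queue_item(1);
	return 0;
}
```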
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) | |||
1210 | } else | 1213 | } else |
1211 | wake_up_all(&gcwq->trustee_wait); | 1214 | wake_up_all(&gcwq->trustee_wait); |
1212 | 1215 | ||
1213 | /* sanity check nr_running */ | 1216 | /* |
1214 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | 1217 | * Sanity check nr_running. Because trustee releases gcwq->lock |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | ||
1219 | * warning may trigger spuriously. Check iff trustee is idle. | ||
1220 | */ | ||
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | ||
1222 | gcwq->nr_workers == gcwq->nr_idle && | ||
1215 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); |
1216 | } | 1224 | } |
1217 | 1225 | ||
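The worker_enter_idle() hunk keeps the nr_running consistency check but gates it on trustee state so it cannot fire during the transient window the comment describes. A toy sketch of gating a fire-once warning on an extra state condition (all names invented):

```c
#include <stdio.h>

enum trustee_state { TRUSTEE_BUSY, TRUSTEE_DONE };

struct pool {
	enum trustee_state trustee_state;
	int nr_workers, nr_idle, nr_running;
};

/* Fire-once warning, loosely modelled on WARN_ON_ONCE(). */
#define WARN_ON_ONCE_DEMO(cond) do {			\
	static int warned;				\
	if ((cond) && !warned) {			\
		warned = 1;				\
		fprintf(stderr, "WARN: %s\n", #cond);	\
	}						\
} while (0)

static void enter_idle_check(struct pool *p)
{
	/* Only meaningful when the trustee is done; otherwise nr_running
	 * may legitimately be out of sync for a moment. */
	WARN_ON_ONCE_DEMO(p->trustee_state == TRUSTEE_DONE &&
			  p->nr_workers == p->nr_idle &&
			  p->nr_running != 0);
}

int main(void)
{
	struct pool busy = { TRUSTEE_BUSY, 4, 4, 1 };	/* transient: no warning */
	struct pool done = { TRUSTEE_DONE, 4, 4, 1 };	/* inconsistent: warns   */

	enter_idle_check(&busy);
	enter_idle_check(&done);
	return 0;
}
```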
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) | |||
1810 | * lock freed" warnings as well as problems when looking into | 1818 | * lock freed" warnings as well as problems when looking into |
1811 | * work->lockdep_map, make a copy and use that here. | 1819 | * work->lockdep_map, make a copy and use that here. |
1812 | */ | 1820 | */ |
1813 | struct lockdep_map lockdep_map = work->lockdep_map; | 1821 | struct lockdep_map lockdep_map; |
1822 | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | ||
1814 | #endif | 1824 | #endif |
1815 | /* | 1825 | /* |
1816 | * A single work shouldn't be executed concurrently by | 1826 | * A single work shouldn't be executed concurrently by |
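process_one_work() now uses lockdep_copy_map() rather than a plain struct assignment for its private lockdep_map copy; in the kernel that helper also resets the cached lock-class pointers in the copy. A conceptual userspace sketch of copying a tracking descriptor while clearing per-instance cached state (field names are illustrative, not the lockdep API):

```c
#include <stdio.h>

struct dep_map {
	const char *name;
	void *class_cache;	/* per-instance cache; must not be carried over */
};

static void dep_map_copy(struct dep_map *dst, const struct dep_map *src)
{
	*dst = *src;
	dst->class_cache = NULL;	/* force a fresh lookup for the copy */
}

int main(void)
{
	struct dep_map work_map = { "demo_work", (void *)0x1 };
	struct dep_map local;

	dep_map_copy(&local, &work_map);
	/* work_map's owner may now be freed; `local` stays usable. */
	printf("%s cache=%p\n", local.name, local.class_cache);
	return 0;
}
```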
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) | |||
2506 | { | 2516 | { |
2507 | struct wq_barrier barr; | 2517 | struct wq_barrier barr; |
2508 | 2518 | ||
2519 | lock_map_acquire(&work->lockdep_map); | ||
2520 | lock_map_release(&work->lockdep_map); | ||
2521 | |||
2509 | if (start_flush_work(work, &barr, true)) { | 2522 | if (start_flush_work(work, &barr, true)) { |
2510 | wait_for_completion(&barr.done); | 2523 | wait_for_completion(&barr.done); |
2511 | destroy_work_on_stack(&barr.work); | 2524 | destroy_work_on_stack(&barr.work); |
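flush_work() now acquires and immediately releases the work's lockdep_map before doing anything else, so lockdep records the "caller may wait on this work" dependency even when the flush takes a fast path. A conceptual sketch of that annotate-then-maybe-wait shape (none of these names are the kernel API):

```c
#include <stdio.h>

struct dep_map_demo { const char *what; };

static void dep_acquire(struct dep_map_demo *d) { printf("dep: %s\n", d->what); }
static void dep_release(struct dep_map_demo *d) { (void)d; }

struct work_demo {
	struct dep_map_demo dep;
	int pending;
};

static int flush_work_demo(struct work_demo *w)
{
	dep_acquire(&w->dep);	/* annotation only ...           */
	dep_release(&w->dep);	/* ... dropped again immediately */

	if (!w->pending)	/* fast path: nothing to wait for */
		return 0;
	/* ... otherwise queue a barrier and wait for it ... */
	return 1;
}

int main(void)
{
	struct work_demo w = { { "demo_work" }, 0 };

	return flush_work_demo(&w);
}
```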