commit 959d851caa48829eb85cb85aa949fd6b4c5d5bc6
tree 3ba9c94ec346275fb44c4f0d1cd2537cdff8d811 /kernel
parent a5567932fc926739e29e98487128080f40c61710
parent 48ddbe194623ae089cc0576e60363f2d2e85662a
author Tejun Heo <tj@kernel.org> 2012-04-01 15:30:01 -0400
committer Tejun Heo <tj@kernel.org> 2012-04-01 15:55:00 -0400
Merge branch 'for-3.5' of ../cgroup into block/for-3.5/core-merged
cgroup/for-3.5 contains the following changes which blk-cgroup needs
to proceed with the on-going cleanup.
* Dynamic addition and removal of cftypes to make config/stat file
  handling modular for policies (see the usage sketch after this list).
* cgroup removal update to not wait for css references to drain to fix
blkcg removal hang caused by cfq caching cfqgs.
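As a usage sketch of the first item: the new interface takes a
zero-length-name terminated array of cftypes and registers it with
cgroup_add_cftypes() / unregisters it with cgroup_rm_cftypes(), both added
in the kernel/cgroup.c hunks below. The policy name, file name and callback
here are purely illustrative placeholders, not code from this merge:

    #include <linux/cgroup.h>
    #include <linux/init.h>

    /* hypothetical per-policy stat file, registered dynamically */
    static u64 example_weight_read(struct cgroup *cgrp, struct cftype *cft)
    {
            return 0;       /* placeholder; a real policy returns per-cgroup state */
    }

    static struct cftype example_files[] = {
            {
                    .name = "example.weight",
                    .flags = CFTYPE_NOT_ON_ROOT,    /* skip the root cgroup */
                    .read_u64 = example_weight_read,
            },
            { }     /* zero-length name terminates the array */
    };

    static int __init example_policy_init(void)
    {
            /* files are created in all existing and future cgroups */
            return cgroup_add_cftypes(&blkio_subsys, example_files);
    }

    static void __exit example_policy_exit(void)
    {
            cgroup_rm_cftypes(&blkio_subsys, example_files);
    }

This is what lets blk-cgroup policies add and remove their config/stat
files at (un)registration time instead of wiring everything through a
single static populate table.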
Pull in cgroup/for-3.5 into block/for-3.5/core. This causes the
following conflicts in block/blk-cgroup.c.
* 761b3ef50e "cgroup: remove cgroup_subsys argument from callbacks"
  conflicts with blkiocg_pre_destroy() addition and blkiocg_attach()
  removal. Resolved by removing @subsys from all subsys methods
  (illustrated in the sketch following this list).
* 676f7c8f84 "cgroup: relocate cftype and cgroup_subsys definitions in
controllers" conflicts with ->pre_destroy() and ->attach() updates
and removal of modular config. Resolved by dropping forward
declarations of the methods and applying updates to the relocated
blkio_subsys.
* 4baf6e3325 "cgroup: convert all non-memcg controllers to the new
cftype interface" builds upon the previous item. Resolved by adding
->base_cftypes to the relocated blkio_subsys.
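To make the first conflict concrete, the callback change drops the
cgroup_subsys argument from every subsys method. A before/after declaration
sketch (blk-cgroup's actual declarations live in block/blk-cgroup.c,
outside this kernel/-limited diff, so the prototypes here are only
illustrative):

    /* before 761b3ef50e: every method carried the subsystem pointer */
    static struct cgroup_subsys_state *
    blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup);
    static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
                                   struct cgroup *cgroup);

    /* after: @subsys is gone, matching the core's ss->create(cgrp),
     * ss->pre_destroy(cgrp) and ss->attach(cgrp, &tset) calls below */
    static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup);
    static int blkiocg_pre_destroy(struct cgroup *cgroup);

With 4baf6e3325 on top, the resolution additionally points the relocated
blkio_subsys's ->base_cftypes at the subsystem's cftype array so that the
cgroup core creates those files through the new cftype machinery.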
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel')
106 files changed, 5181 insertions, 3197 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75f..2251882daf53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE
 	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
 		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
 
-config INLINE_SPIN_UNLOCK
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+config UNINLINE_SPIN_UNLOCK
+	bool
 
 config INLINE_SPIN_UNLOCK_BH
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26a..3f9c97419f02 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	select PREEMPT_COUNT
+	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0a..1c7f2c61416b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct path *path)
+		      const struct path *path)
 {
 	char *p, *pathname;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..2905977e0f33 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -63,6 +63,9 @@ | |||
63 | 63 | ||
64 | #include <linux/atomic.h> | 64 | #include <linux/atomic.h> |
65 | 65 | ||
66 | /* css deactivation bias, makes css->refcnt negative to deny new trygets */ | ||
67 | #define CSS_DEACT_BIAS INT_MIN | ||
68 | |||
66 | /* | 69 | /* |
67 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 70 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
68 | * hierarchy must be performed while holding it. | 71 | * hierarchy must be performed while holding it. |
@@ -127,6 +130,9 @@ struct cgroupfs_root { | |||
127 | /* A list running through the active hierarchies */ | 130 | /* A list running through the active hierarchies */ |
128 | struct list_head root_list; | 131 | struct list_head root_list; |
129 | 132 | ||
133 | /* All cgroups on this root, cgroup_mutex protected */ | ||
134 | struct list_head allcg_list; | ||
135 | |||
130 | /* Hierarchy-specific flags */ | 136 | /* Hierarchy-specific flags */ |
131 | unsigned long flags; | 137 | unsigned long flags; |
132 | 138 | ||
@@ -145,6 +151,15 @@ struct cgroupfs_root { | |||
145 | static struct cgroupfs_root rootnode; | 151 | static struct cgroupfs_root rootnode; |
146 | 152 | ||
147 | /* | 153 | /* |
154 | * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. | ||
155 | */ | ||
156 | struct cfent { | ||
157 | struct list_head node; | ||
158 | struct dentry *dentry; | ||
159 | struct cftype *type; | ||
160 | }; | ||
161 | |||
162 | /* | ||
148 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | 163 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when |
149 | * cgroup_subsys->use_id != 0. | 164 | * cgroup_subsys->use_id != 0. |
150 | */ | 165 | */ |
@@ -239,6 +254,14 @@ int cgroup_lock_is_held(void) | |||
239 | 254 | ||
240 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 255 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
241 | 256 | ||
257 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | ||
258 | static int css_refcnt(struct cgroup_subsys_state *css) | ||
259 | { | ||
260 | int v = atomic_read(&css->refcnt); | ||
261 | |||
262 | return v >= 0 ? v : v - CSS_DEACT_BIAS; | ||
263 | } | ||
264 | |||
242 | /* convenient tests for these bits */ | 265 | /* convenient tests for these bits */ |
243 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 266 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
244 | { | 267 | { |
@@ -279,6 +302,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) | |||
279 | #define for_each_active_root(_root) \ | 302 | #define for_each_active_root(_root) \ |
280 | list_for_each_entry(_root, &roots, root_list) | 303 | list_for_each_entry(_root, &roots, root_list) |
281 | 304 | ||
305 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
306 | { | ||
307 | return dentry->d_fsdata; | ||
308 | } | ||
309 | |||
310 | static inline struct cfent *__d_cfe(struct dentry *dentry) | ||
311 | { | ||
312 | return dentry->d_fsdata; | ||
313 | } | ||
314 | |||
315 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
316 | { | ||
317 | return __d_cfe(dentry)->type; | ||
318 | } | ||
319 | |||
282 | /* the list of cgroups eligible for automatic release. Protected by | 320 | /* the list of cgroups eligible for automatic release. Protected by |
283 | * release_list_lock */ | 321 | * release_list_lock */ |
284 | static LIST_HEAD(release_list); | 322 | static LIST_HEAD(release_list); |
@@ -816,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
816 | struct cgroup_subsys *ss; | 854 | struct cgroup_subsys *ss; |
817 | int ret = 0; | 855 | int ret = 0; |
818 | 856 | ||
819 | for_each_subsys(cgrp->root, ss) | 857 | for_each_subsys(cgrp->root, ss) { |
820 | if (ss->pre_destroy) { | 858 | if (!ss->pre_destroy) |
821 | ret = ss->pre_destroy(ss, cgrp); | 859 | continue; |
822 | if (ret) | 860 | |
823 | break; | 861 | ret = ss->pre_destroy(cgrp); |
862 | if (ret) { | ||
863 | /* ->pre_destroy() failure is being deprecated */ | ||
864 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
865 | break; | ||
824 | } | 866 | } |
867 | } | ||
825 | 868 | ||
826 | return ret; | 869 | return ret; |
827 | } | 870 | } |
@@ -846,7 +889,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
846 | * Release the subsystem state objects. | 889 | * Release the subsystem state objects. |
847 | */ | 890 | */ |
848 | for_each_subsys(cgrp->root, ss) | 891 | for_each_subsys(cgrp->root, ss) |
849 | ss->destroy(ss, cgrp); | 892 | ss->destroy(cgrp); |
850 | 893 | ||
851 | cgrp->root->number_of_cgroups--; | 894 | cgrp->root->number_of_cgroups--; |
852 | mutex_unlock(&cgroup_mutex); | 895 | mutex_unlock(&cgroup_mutex); |
@@ -864,6 +907,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
864 | BUG_ON(!list_empty(&cgrp->pidlists)); | 907 | BUG_ON(!list_empty(&cgrp->pidlists)); |
865 | 908 | ||
866 | kfree_rcu(cgrp, rcu_head); | 909 | kfree_rcu(cgrp, rcu_head); |
910 | } else { | ||
911 | struct cfent *cfe = __d_cfe(dentry); | ||
912 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | ||
913 | |||
914 | WARN_ONCE(!list_empty(&cfe->node) && | ||
915 | cgrp != &cgrp->root->top_cgroup, | ||
916 | "cfe still linked for %s\n", cfe->type->name); | ||
917 | kfree(cfe); | ||
867 | } | 918 | } |
868 | iput(inode); | 919 | iput(inode); |
869 | } | 920 | } |
@@ -882,34 +933,36 @@ static void remove_dir(struct dentry *d) | |||
882 | dput(parent); | 933 | dput(parent); |
883 | } | 934 | } |
884 | 935 | ||
885 | static void cgroup_clear_directory(struct dentry *dentry) | 936 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
886 | { | 937 | { |
887 | struct list_head *node; | 938 | struct cfent *cfe; |
888 | 939 | ||
889 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 940 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
890 | spin_lock(&dentry->d_lock); | 941 | lockdep_assert_held(&cgroup_mutex); |
891 | node = dentry->d_subdirs.next; | 942 | |
892 | while (node != &dentry->d_subdirs) { | 943 | list_for_each_entry(cfe, &cgrp->files, node) { |
893 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 944 | struct dentry *d = cfe->dentry; |
894 | 945 | ||
895 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | 946 | if (cft && cfe->type != cft) |
896 | list_del_init(node); | 947 | continue; |
897 | if (d->d_inode) { | 948 | |
898 | /* This should never be called on a cgroup | 949 | dget(d); |
899 | * directory with child cgroups */ | 950 | d_delete(d); |
900 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 951 | simple_unlink(d->d_inode, d); |
901 | dget_dlock(d); | 952 | list_del_init(&cfe->node); |
902 | spin_unlock(&d->d_lock); | 953 | dput(d); |
903 | spin_unlock(&dentry->d_lock); | 954 | |
904 | d_delete(d); | 955 | return 0; |
905 | simple_unlink(dentry->d_inode, d); | ||
906 | dput(d); | ||
907 | spin_lock(&dentry->d_lock); | ||
908 | } else | ||
909 | spin_unlock(&d->d_lock); | ||
910 | node = dentry->d_subdirs.next; | ||
911 | } | 956 | } |
912 | spin_unlock(&dentry->d_lock); | 957 | return -ENOENT; |
958 | } | ||
959 | |||
960 | static void cgroup_clear_directory(struct dentry *dir) | ||
961 | { | ||
962 | struct cgroup *cgrp = __d_cgrp(dir); | ||
963 | |||
964 | while (!list_empty(&cgrp->files)) | ||
965 | cgroup_rm_file(cgrp, NULL); | ||
913 | } | 966 | } |
914 | 967 | ||
915 | /* | 968 | /* |
@@ -1015,7 +1068,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1015 | list_move(&ss->sibling, &root->subsys_list); | 1068 | list_move(&ss->sibling, &root->subsys_list); |
1016 | ss->root = root; | 1069 | ss->root = root; |
1017 | if (ss->bind) | 1070 | if (ss->bind) |
1018 | ss->bind(ss, cgrp); | 1071 | ss->bind(cgrp); |
1019 | mutex_unlock(&ss->hierarchy_mutex); | 1072 | mutex_unlock(&ss->hierarchy_mutex); |
1020 | /* refcount was already taken, and we're keeping it */ | 1073 | /* refcount was already taken, and we're keeping it */ |
1021 | } else if (bit & removed_bits) { | 1074 | } else if (bit & removed_bits) { |
@@ -1025,7 +1078,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1025 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 1078 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); |
1026 | mutex_lock(&ss->hierarchy_mutex); | 1079 | mutex_lock(&ss->hierarchy_mutex); |
1027 | if (ss->bind) | 1080 | if (ss->bind) |
1028 | ss->bind(ss, dummytop); | 1081 | ss->bind(dummytop); |
1029 | dummytop->subsys[i]->cgroup = dummytop; | 1082 | dummytop->subsys[i]->cgroup = dummytop; |
1030 | cgrp->subsys[i] = NULL; | 1083 | cgrp->subsys[i] = NULL; |
1031 | subsys[i]->root = &rootnode; | 1084 | subsys[i]->root = &rootnode; |
@@ -1294,6 +1347,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1294 | if (ret) | 1347 | if (ret) |
1295 | goto out_unlock; | 1348 | goto out_unlock; |
1296 | 1349 | ||
1350 | /* See feature-removal-schedule.txt */ | ||
1351 | if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) | ||
1352 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | ||
1353 | task_tgid_nr(current), current->comm); | ||
1354 | |||
1297 | /* Don't allow flags or name to change at remount */ | 1355 | /* Don't allow flags or name to change at remount */ |
1298 | if (opts.flags != root->flags || | 1356 | if (opts.flags != root->flags || |
1299 | (opts.name && strcmp(opts.name, root->name))) { | 1357 | (opts.name && strcmp(opts.name, root->name))) { |
@@ -1308,7 +1366,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1308 | goto out_unlock; | 1366 | goto out_unlock; |
1309 | } | 1367 | } |
1310 | 1368 | ||
1311 | /* (re)populate subsystem files */ | 1369 | /* clear out any existing files and repopulate subsystem files */ |
1370 | cgroup_clear_directory(cgrp->dentry); | ||
1312 | cgroup_populate_dir(cgrp); | 1371 | cgroup_populate_dir(cgrp); |
1313 | 1372 | ||
1314 | if (opts.release_agent) | 1373 | if (opts.release_agent) |
@@ -1333,6 +1392,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1333 | { | 1392 | { |
1334 | INIT_LIST_HEAD(&cgrp->sibling); | 1393 | INIT_LIST_HEAD(&cgrp->sibling); |
1335 | INIT_LIST_HEAD(&cgrp->children); | 1394 | INIT_LIST_HEAD(&cgrp->children); |
1395 | INIT_LIST_HEAD(&cgrp->files); | ||
1336 | INIT_LIST_HEAD(&cgrp->css_sets); | 1396 | INIT_LIST_HEAD(&cgrp->css_sets); |
1337 | INIT_LIST_HEAD(&cgrp->release_list); | 1397 | INIT_LIST_HEAD(&cgrp->release_list); |
1338 | INIT_LIST_HEAD(&cgrp->pidlists); | 1398 | INIT_LIST_HEAD(&cgrp->pidlists); |
@@ -1344,11 +1404,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1344 | static void init_cgroup_root(struct cgroupfs_root *root) | 1404 | static void init_cgroup_root(struct cgroupfs_root *root) |
1345 | { | 1405 | { |
1346 | struct cgroup *cgrp = &root->top_cgroup; | 1406 | struct cgroup *cgrp = &root->top_cgroup; |
1407 | |||
1347 | INIT_LIST_HEAD(&root->subsys_list); | 1408 | INIT_LIST_HEAD(&root->subsys_list); |
1348 | INIT_LIST_HEAD(&root->root_list); | 1409 | INIT_LIST_HEAD(&root->root_list); |
1410 | INIT_LIST_HEAD(&root->allcg_list); | ||
1349 | root->number_of_cgroups = 1; | 1411 | root->number_of_cgroups = 1; |
1350 | cgrp->root = root; | 1412 | cgrp->root = root; |
1351 | cgrp->top_cgroup = cgrp; | 1413 | cgrp->top_cgroup = cgrp; |
1414 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1352 | init_cgroup_housekeeping(cgrp); | 1415 | init_cgroup_housekeeping(cgrp); |
1353 | } | 1416 | } |
1354 | 1417 | ||
@@ -1472,7 +1535,6 @@ static int cgroup_get_rootdir(struct super_block *sb) | |||
1472 | 1535 | ||
1473 | struct inode *inode = | 1536 | struct inode *inode = |
1474 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); | 1537 | cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); |
1475 | struct dentry *dentry; | ||
1476 | 1538 | ||
1477 | if (!inode) | 1539 | if (!inode) |
1478 | return -ENOMEM; | 1540 | return -ENOMEM; |
@@ -1481,12 +1543,9 @@ static int cgroup_get_rootdir(struct super_block *sb) | |||
1481 | inode->i_op = &cgroup_dir_inode_operations; | 1543 | inode->i_op = &cgroup_dir_inode_operations; |
1482 | /* directories start off with i_nlink == 2 (for "." entry) */ | 1544 | /* directories start off with i_nlink == 2 (for "." entry) */ |
1483 | inc_nlink(inode); | 1545 | inc_nlink(inode); |
1484 | dentry = d_alloc_root(inode); | 1546 | sb->s_root = d_make_root(inode); |
1485 | if (!dentry) { | 1547 | if (!sb->s_root) |
1486 | iput(inode); | ||
1487 | return -ENOMEM; | 1548 | return -ENOMEM; |
1488 | } | ||
1489 | sb->s_root = dentry; | ||
1490 | /* for everything else we want ->d_op set */ | 1549 | /* for everything else we want ->d_op set */ |
1491 | sb->s_d_op = &cgroup_dops; | 1550 | sb->s_d_op = &cgroup_dops; |
1492 | return 0; | 1551 | return 0; |
@@ -1696,16 +1755,6 @@ static struct file_system_type cgroup_fs_type = { | |||
1696 | 1755 | ||
1697 | static struct kobject *cgroup_kobj; | 1756 | static struct kobject *cgroup_kobj; |
1698 | 1757 | ||
1699 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
1700 | { | ||
1701 | return dentry->d_fsdata; | ||
1702 | } | ||
1703 | |||
1704 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
1705 | { | ||
1706 | return dentry->d_fsdata; | ||
1707 | } | ||
1708 | |||
1709 | /** | 1758 | /** |
1710 | * cgroup_path - generate the path of a cgroup | 1759 | * cgroup_path - generate the path of a cgroup |
1711 | * @cgrp: the cgroup in question | 1760 | * @cgrp: the cgroup in question |
@@ -1763,6 +1812,7 @@ EXPORT_SYMBOL_GPL(cgroup_path); | |||
1763 | struct task_and_cgroup { | 1812 | struct task_and_cgroup { |
1764 | struct task_struct *task; | 1813 | struct task_struct *task; |
1765 | struct cgroup *cgrp; | 1814 | struct cgroup *cgrp; |
1815 | struct css_set *cg; | ||
1766 | }; | 1816 | }; |
1767 | 1817 | ||
1768 | struct cgroup_taskset { | 1818 | struct cgroup_taskset { |
@@ -1843,11 +1893,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1843 | * will already exist. If not set, this function might sleep, and can fail with | 1893 | * will already exist. If not set, this function might sleep, and can fail with |
1844 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. | 1894 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. |
1845 | */ | 1895 | */ |
1846 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1896 | static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1847 | struct task_struct *tsk, bool guarantee) | 1897 | struct task_struct *tsk, struct css_set *newcg) |
1848 | { | 1898 | { |
1849 | struct css_set *oldcg; | 1899 | struct css_set *oldcg; |
1850 | struct css_set *newcg; | ||
1851 | 1900 | ||
1852 | /* | 1901 | /* |
1853 | * We are synchronized through threadgroup_lock() against PF_EXITING | 1902 | * We are synchronized through threadgroup_lock() against PF_EXITING |
@@ -1857,23 +1906,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1857 | WARN_ON_ONCE(tsk->flags & PF_EXITING); | 1906 | WARN_ON_ONCE(tsk->flags & PF_EXITING); |
1858 | oldcg = tsk->cgroups; | 1907 | oldcg = tsk->cgroups; |
1859 | 1908 | ||
1860 | /* locate or allocate a new css_set for this task. */ | ||
1861 | if (guarantee) { | ||
1862 | /* we know the css_set we want already exists. */ | ||
1863 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | ||
1864 | read_lock(&css_set_lock); | ||
1865 | newcg = find_existing_css_set(oldcg, cgrp, template); | ||
1866 | BUG_ON(!newcg); | ||
1867 | get_css_set(newcg); | ||
1868 | read_unlock(&css_set_lock); | ||
1869 | } else { | ||
1870 | might_sleep(); | ||
1871 | /* find_css_set will give us newcg already referenced. */ | ||
1872 | newcg = find_css_set(oldcg, cgrp); | ||
1873 | if (!newcg) | ||
1874 | return -ENOMEM; | ||
1875 | } | ||
1876 | |||
1877 | task_lock(tsk); | 1909 | task_lock(tsk); |
1878 | rcu_assign_pointer(tsk->cgroups, newcg); | 1910 | rcu_assign_pointer(tsk->cgroups, newcg); |
1879 | task_unlock(tsk); | 1911 | task_unlock(tsk); |
@@ -1892,7 +1924,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1892 | put_css_set(oldcg); | 1924 | put_css_set(oldcg); |
1893 | 1925 | ||
1894 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1926 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); |
1895 | return 0; | ||
1896 | } | 1927 | } |
1897 | 1928 | ||
1898 | /** | 1929 | /** |
@@ -1905,11 +1936,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1905 | */ | 1936 | */ |
1906 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1937 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1907 | { | 1938 | { |
1908 | int retval; | 1939 | int retval = 0; |
1909 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1940 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1910 | struct cgroup *oldcgrp; | 1941 | struct cgroup *oldcgrp; |
1911 | struct cgroupfs_root *root = cgrp->root; | 1942 | struct cgroupfs_root *root = cgrp->root; |
1912 | struct cgroup_taskset tset = { }; | 1943 | struct cgroup_taskset tset = { }; |
1944 | struct css_set *newcg; | ||
1913 | 1945 | ||
1914 | /* @tsk either already exited or can't exit until the end */ | 1946 | /* @tsk either already exited or can't exit until the end */ |
1915 | if (tsk->flags & PF_EXITING) | 1947 | if (tsk->flags & PF_EXITING) |
@@ -1925,7 +1957,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1925 | 1957 | ||
1926 | for_each_subsys(root, ss) { | 1958 | for_each_subsys(root, ss) { |
1927 | if (ss->can_attach) { | 1959 | if (ss->can_attach) { |
1928 | retval = ss->can_attach(ss, cgrp, &tset); | 1960 | retval = ss->can_attach(cgrp, &tset); |
1929 | if (retval) { | 1961 | if (retval) { |
1930 | /* | 1962 | /* |
1931 | * Remember on which subsystem the can_attach() | 1963 | * Remember on which subsystem the can_attach() |
@@ -1939,13 +1971,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1939 | } | 1971 | } |
1940 | } | 1972 | } |
1941 | 1973 | ||
1942 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); | 1974 | newcg = find_css_set(tsk->cgroups, cgrp); |
1943 | if (retval) | 1975 | if (!newcg) { |
1976 | retval = -ENOMEM; | ||
1944 | goto out; | 1977 | goto out; |
1978 | } | ||
1979 | |||
1980 | cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); | ||
1945 | 1981 | ||
1946 | for_each_subsys(root, ss) { | 1982 | for_each_subsys(root, ss) { |
1947 | if (ss->attach) | 1983 | if (ss->attach) |
1948 | ss->attach(ss, cgrp, &tset); | 1984 | ss->attach(cgrp, &tset); |
1949 | } | 1985 | } |
1950 | 1986 | ||
1951 | synchronize_rcu(); | 1987 | synchronize_rcu(); |
@@ -1967,7 +2003,7 @@ out: | |||
1967 | */ | 2003 | */ |
1968 | break; | 2004 | break; |
1969 | if (ss->cancel_attach) | 2005 | if (ss->cancel_attach) |
1970 | ss->cancel_attach(ss, cgrp, &tset); | 2006 | ss->cancel_attach(cgrp, &tset); |
1971 | } | 2007 | } |
1972 | } | 2008 | } |
1973 | return retval; | 2009 | return retval; |
@@ -1997,66 +2033,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
1997 | } | 2033 | } |
1998 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2034 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
1999 | 2035 | ||
2000 | /* | ||
2001 | * cgroup_attach_proc works in two stages, the first of which prefetches all | ||
2002 | * new css_sets needed (to make sure we have enough memory before committing | ||
2003 | * to the move) and stores them in a list of entries of the following type. | ||
2004 | * TODO: possible optimization: use css_set->rcu_head for chaining instead | ||
2005 | */ | ||
2006 | struct cg_list_entry { | ||
2007 | struct css_set *cg; | ||
2008 | struct list_head links; | ||
2009 | }; | ||
2010 | |||
2011 | static bool css_set_check_fetched(struct cgroup *cgrp, | ||
2012 | struct task_struct *tsk, struct css_set *cg, | ||
2013 | struct list_head *newcg_list) | ||
2014 | { | ||
2015 | struct css_set *newcg; | ||
2016 | struct cg_list_entry *cg_entry; | ||
2017 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | ||
2018 | |||
2019 | read_lock(&css_set_lock); | ||
2020 | newcg = find_existing_css_set(cg, cgrp, template); | ||
2021 | read_unlock(&css_set_lock); | ||
2022 | |||
2023 | /* doesn't exist at all? */ | ||
2024 | if (!newcg) | ||
2025 | return false; | ||
2026 | /* see if it's already in the list */ | ||
2027 | list_for_each_entry(cg_entry, newcg_list, links) | ||
2028 | if (cg_entry->cg == newcg) | ||
2029 | return true; | ||
2030 | |||
2031 | /* not found */ | ||
2032 | return false; | ||
2033 | } | ||
2034 | |||
2035 | /* | ||
2036 | * Find the new css_set and store it in the list in preparation for moving the | ||
2037 | * given task to the given cgroup. Returns 0 or -ENOMEM. | ||
2038 | */ | ||
2039 | static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, | ||
2040 | struct list_head *newcg_list) | ||
2041 | { | ||
2042 | struct css_set *newcg; | ||
2043 | struct cg_list_entry *cg_entry; | ||
2044 | |||
2045 | /* ensure a new css_set will exist for this thread */ | ||
2046 | newcg = find_css_set(cg, cgrp); | ||
2047 | if (!newcg) | ||
2048 | return -ENOMEM; | ||
2049 | /* add it to the list */ | ||
2050 | cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); | ||
2051 | if (!cg_entry) { | ||
2052 | put_css_set(newcg); | ||
2053 | return -ENOMEM; | ||
2054 | } | ||
2055 | cg_entry->cg = newcg; | ||
2056 | list_add(&cg_entry->links, newcg_list); | ||
2057 | return 0; | ||
2058 | } | ||
2059 | |||
2060 | /** | 2036 | /** |
2061 | * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup | 2037 | * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup |
2062 | * @cgrp: the cgroup to attach to | 2038 | * @cgrp: the cgroup to attach to |
@@ -2070,20 +2046,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2070 | int retval, i, group_size; | 2046 | int retval, i, group_size; |
2071 | struct cgroup_subsys *ss, *failed_ss = NULL; | 2047 | struct cgroup_subsys *ss, *failed_ss = NULL; |
2072 | /* guaranteed to be initialized later, but the compiler needs this */ | 2048 | /* guaranteed to be initialized later, but the compiler needs this */ |
2073 | struct css_set *oldcg; | ||
2074 | struct cgroupfs_root *root = cgrp->root; | 2049 | struct cgroupfs_root *root = cgrp->root; |
2075 | /* threadgroup list cursor and array */ | 2050 | /* threadgroup list cursor and array */ |
2076 | struct task_struct *tsk; | 2051 | struct task_struct *tsk; |
2077 | struct task_and_cgroup *tc; | 2052 | struct task_and_cgroup *tc; |
2078 | struct flex_array *group; | 2053 | struct flex_array *group; |
2079 | struct cgroup_taskset tset = { }; | 2054 | struct cgroup_taskset tset = { }; |
2080 | /* | ||
2081 | * we need to make sure we have css_sets for all the tasks we're | ||
2082 | * going to move -before- we actually start moving them, so that in | ||
2083 | * case we get an ENOMEM we can bail out before making any changes. | ||
2084 | */ | ||
2085 | struct list_head newcg_list; | ||
2086 | struct cg_list_entry *cg_entry, *temp_nobe; | ||
2087 | 2055 | ||
2088 | /* | 2056 | /* |
2089 | * step 0: in order to do expensive, possibly blocking operations for | 2057 | * step 0: in order to do expensive, possibly blocking operations for |
@@ -2102,23 +2070,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2102 | if (retval) | 2070 | if (retval) |
2103 | goto out_free_group_list; | 2071 | goto out_free_group_list; |
2104 | 2072 | ||
2105 | /* prevent changes to the threadgroup list while we take a snapshot. */ | ||
2106 | read_lock(&tasklist_lock); | ||
2107 | if (!thread_group_leader(leader)) { | ||
2108 | /* | ||
2109 | * a race with de_thread from another thread's exec() may strip | ||
2110 | * us of our leadership, making while_each_thread unsafe to use | ||
2111 | * on this task. if this happens, there is no choice but to | ||
2112 | * throw this task away and try again (from cgroup_procs_write); | ||
2113 | * this is "double-double-toil-and-trouble-check locking". | ||
2114 | */ | ||
2115 | read_unlock(&tasklist_lock); | ||
2116 | retval = -EAGAIN; | ||
2117 | goto out_free_group_list; | ||
2118 | } | ||
2119 | |||
2120 | tsk = leader; | 2073 | tsk = leader; |
2121 | i = 0; | 2074 | i = 0; |
2075 | /* | ||
2076 | * Prevent freeing of tasks while we take a snapshot. Tasks that are | ||
2077 | * already PF_EXITING could be freed from underneath us unless we | ||
2078 | * take an rcu_read_lock. | ||
2079 | */ | ||
2080 | rcu_read_lock(); | ||
2122 | do { | 2081 | do { |
2123 | struct task_and_cgroup ent; | 2082 | struct task_and_cgroup ent; |
2124 | 2083 | ||
@@ -2128,24 +2087,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2128 | 2087 | ||
2129 | /* as per above, nr_threads may decrease, but not increase. */ | 2088 | /* as per above, nr_threads may decrease, but not increase. */ |
2130 | BUG_ON(i >= group_size); | 2089 | BUG_ON(i >= group_size); |
2131 | /* | ||
2132 | * saying GFP_ATOMIC has no effect here because we did prealloc | ||
2133 | * earlier, but it's good form to communicate our expectations. | ||
2134 | */ | ||
2135 | ent.task = tsk; | 2090 | ent.task = tsk; |
2136 | ent.cgrp = task_cgroup_from_root(tsk, root); | 2091 | ent.cgrp = task_cgroup_from_root(tsk, root); |
2137 | /* nothing to do if this task is already in the cgroup */ | 2092 | /* nothing to do if this task is already in the cgroup */ |
2138 | if (ent.cgrp == cgrp) | 2093 | if (ent.cgrp == cgrp) |
2139 | continue; | 2094 | continue; |
2095 | /* | ||
2096 | * saying GFP_ATOMIC has no effect here because we did prealloc | ||
2097 | * earlier, but it's good form to communicate our expectations. | ||
2098 | */ | ||
2140 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); | 2099 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); |
2141 | BUG_ON(retval != 0); | 2100 | BUG_ON(retval != 0); |
2142 | i++; | 2101 | i++; |
2143 | } while_each_thread(leader, tsk); | 2102 | } while_each_thread(leader, tsk); |
2103 | rcu_read_unlock(); | ||
2144 | /* remember the number of threads in the array for later. */ | 2104 | /* remember the number of threads in the array for later. */ |
2145 | group_size = i; | 2105 | group_size = i; |
2146 | tset.tc_array = group; | 2106 | tset.tc_array = group; |
2147 | tset.tc_array_len = group_size; | 2107 | tset.tc_array_len = group_size; |
2148 | read_unlock(&tasklist_lock); | ||
2149 | 2108 | ||
2150 | /* methods shouldn't be called if no task is actually migrating */ | 2109 | /* methods shouldn't be called if no task is actually migrating */ |
2151 | retval = 0; | 2110 | retval = 0; |
@@ -2157,7 +2116,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2157 | */ | 2116 | */ |
2158 | for_each_subsys(root, ss) { | 2117 | for_each_subsys(root, ss) { |
2159 | if (ss->can_attach) { | 2118 | if (ss->can_attach) { |
2160 | retval = ss->can_attach(ss, cgrp, &tset); | 2119 | retval = ss->can_attach(cgrp, &tset); |
2161 | if (retval) { | 2120 | if (retval) { |
2162 | failed_ss = ss; | 2121 | failed_ss = ss; |
2163 | goto out_cancel_attach; | 2122 | goto out_cancel_attach; |
@@ -2169,17 +2128,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2169 | * step 2: make sure css_sets exist for all threads to be migrated. | 2128 | * step 2: make sure css_sets exist for all threads to be migrated. |
2170 | * we use find_css_set, which allocates a new one if necessary. | 2129 | * we use find_css_set, which allocates a new one if necessary. |
2171 | */ | 2130 | */ |
2172 | INIT_LIST_HEAD(&newcg_list); | ||
2173 | for (i = 0; i < group_size; i++) { | 2131 | for (i = 0; i < group_size; i++) { |
2174 | tc = flex_array_get(group, i); | 2132 | tc = flex_array_get(group, i); |
2175 | oldcg = tc->task->cgroups; | 2133 | tc->cg = find_css_set(tc->task->cgroups, cgrp); |
2176 | 2134 | if (!tc->cg) { | |
2177 | /* if we don't already have it in the list get a new one */ | 2135 | retval = -ENOMEM; |
2178 | if (!css_set_check_fetched(cgrp, tc->task, oldcg, | 2136 | goto out_put_css_set_refs; |
2179 | &newcg_list)) { | ||
2180 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); | ||
2181 | if (retval) | ||
2182 | goto out_list_teardown; | ||
2183 | } | 2137 | } |
2184 | } | 2138 | } |
2185 | 2139 | ||
@@ -2190,8 +2144,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2190 | */ | 2144 | */ |
2191 | for (i = 0; i < group_size; i++) { | 2145 | for (i = 0; i < group_size; i++) { |
2192 | tc = flex_array_get(group, i); | 2146 | tc = flex_array_get(group, i); |
2193 | retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); | 2147 | cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); |
2194 | BUG_ON(retval); | ||
2195 | } | 2148 | } |
2196 | /* nothing is sensitive to fork() after this point. */ | 2149 | /* nothing is sensitive to fork() after this point. */ |
2197 | 2150 | ||
@@ -2200,7 +2153,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2200 | */ | 2153 | */ |
2201 | for_each_subsys(root, ss) { | 2154 | for_each_subsys(root, ss) { |
2202 | if (ss->attach) | 2155 | if (ss->attach) |
2203 | ss->attach(ss, cgrp, &tset); | 2156 | ss->attach(cgrp, &tset); |
2204 | } | 2157 | } |
2205 | 2158 | ||
2206 | /* | 2159 | /* |
@@ -2209,21 +2162,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2209 | synchronize_rcu(); | 2162 | synchronize_rcu(); |
2210 | cgroup_wakeup_rmdir_waiter(cgrp); | 2163 | cgroup_wakeup_rmdir_waiter(cgrp); |
2211 | retval = 0; | 2164 | retval = 0; |
2212 | out_list_teardown: | 2165 | out_put_css_set_refs: |
2213 | /* clean up the list of prefetched css_sets. */ | 2166 | if (retval) { |
2214 | list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { | 2167 | for (i = 0; i < group_size; i++) { |
2215 | list_del(&cg_entry->links); | 2168 | tc = flex_array_get(group, i); |
2216 | put_css_set(cg_entry->cg); | 2169 | if (!tc->cg) |
2217 | kfree(cg_entry); | 2170 | break; |
2171 | put_css_set(tc->cg); | ||
2172 | } | ||
2218 | } | 2173 | } |
2219 | out_cancel_attach: | 2174 | out_cancel_attach: |
2220 | /* same deal as in cgroup_attach_task */ | ||
2221 | if (retval) { | 2175 | if (retval) { |
2222 | for_each_subsys(root, ss) { | 2176 | for_each_subsys(root, ss) { |
2223 | if (ss == failed_ss) | 2177 | if (ss == failed_ss) |
2224 | break; | 2178 | break; |
2225 | if (ss->cancel_attach) | 2179 | if (ss->cancel_attach) |
2226 | ss->cancel_attach(ss, cgrp, &tset); | 2180 | ss->cancel_attach(cgrp, &tset); |
2227 | } | 2181 | } |
2228 | } | 2182 | } |
2229 | out_free_group_list: | 2183 | out_free_group_list: |
@@ -2245,22 +2199,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | |||
2245 | if (!cgroup_lock_live_group(cgrp)) | 2199 | if (!cgroup_lock_live_group(cgrp)) |
2246 | return -ENODEV; | 2200 | return -ENODEV; |
2247 | 2201 | ||
2202 | retry_find_task: | ||
2203 | rcu_read_lock(); | ||
2248 | if (pid) { | 2204 | if (pid) { |
2249 | rcu_read_lock(); | ||
2250 | tsk = find_task_by_vpid(pid); | 2205 | tsk = find_task_by_vpid(pid); |
2251 | if (!tsk) { | 2206 | if (!tsk) { |
2252 | rcu_read_unlock(); | 2207 | rcu_read_unlock(); |
2253 | cgroup_unlock(); | 2208 | ret= -ESRCH; |
2254 | return -ESRCH; | 2209 | goto out_unlock_cgroup; |
2255 | } | ||
2256 | if (threadgroup) { | ||
2257 | /* | ||
2258 | * RCU protects this access, since tsk was found in the | ||
2259 | * tid map. a race with de_thread may cause group_leader | ||
2260 | * to stop being the leader, but cgroup_attach_proc will | ||
2261 | * detect it later. | ||
2262 | */ | ||
2263 | tsk = tsk->group_leader; | ||
2264 | } | 2210 | } |
2265 | /* | 2211 | /* |
2266 | * even if we're attaching all tasks in the thread group, we | 2212 | * even if we're attaching all tasks in the thread group, we |
@@ -2271,29 +2217,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | |||
2271 | cred->euid != tcred->uid && | 2217 | cred->euid != tcred->uid && |
2272 | cred->euid != tcred->suid) { | 2218 | cred->euid != tcred->suid) { |
2273 | rcu_read_unlock(); | 2219 | rcu_read_unlock(); |
2274 | cgroup_unlock(); | 2220 | ret = -EACCES; |
2275 | return -EACCES; | 2221 | goto out_unlock_cgroup; |
2276 | } | 2222 | } |
2277 | get_task_struct(tsk); | 2223 | } else |
2278 | rcu_read_unlock(); | 2224 | tsk = current; |
2279 | } else { | ||
2280 | if (threadgroup) | ||
2281 | tsk = current->group_leader; | ||
2282 | else | ||
2283 | tsk = current; | ||
2284 | get_task_struct(tsk); | ||
2285 | } | ||
2286 | |||
2287 | threadgroup_lock(tsk); | ||
2288 | 2225 | ||
2289 | if (threadgroup) | 2226 | if (threadgroup) |
2227 | tsk = tsk->group_leader; | ||
2228 | get_task_struct(tsk); | ||
2229 | rcu_read_unlock(); | ||
2230 | |||
2231 | threadgroup_lock(tsk); | ||
2232 | if (threadgroup) { | ||
2233 | if (!thread_group_leader(tsk)) { | ||
2234 | /* | ||
2235 | * a race with de_thread from another thread's exec() | ||
2236 | * may strip us of our leadership, if this happens, | ||
2237 | * there is no choice but to throw this task away and | ||
2238 | * try again; this is | ||
2239 | * "double-double-toil-and-trouble-check locking". | ||
2240 | */ | ||
2241 | threadgroup_unlock(tsk); | ||
2242 | put_task_struct(tsk); | ||
2243 | goto retry_find_task; | ||
2244 | } | ||
2290 | ret = cgroup_attach_proc(cgrp, tsk); | 2245 | ret = cgroup_attach_proc(cgrp, tsk); |
2291 | else | 2246 | } else |
2292 | ret = cgroup_attach_task(cgrp, tsk); | 2247 | ret = cgroup_attach_task(cgrp, tsk); |
2293 | |||
2294 | threadgroup_unlock(tsk); | 2248 | threadgroup_unlock(tsk); |
2295 | 2249 | ||
2296 | put_task_struct(tsk); | 2250 | put_task_struct(tsk); |
2251 | out_unlock_cgroup: | ||
2297 | cgroup_unlock(); | 2252 | cgroup_unlock(); |
2298 | return ret; | 2253 | return ret; |
2299 | } | 2254 | } |
@@ -2305,16 +2260,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | |||
2305 | 2260 | ||
2306 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | 2261 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) |
2307 | { | 2262 | { |
2308 | int ret; | 2263 | return attach_task_by_pid(cgrp, tgid, true); |
2309 | do { | ||
2310 | /* | ||
2311 | * attach_proc fails with -EAGAIN if threadgroup leadership | ||
2312 | * changes in the middle of the operation, in which case we need | ||
2313 | * to find the task_struct for the new leader and start over. | ||
2314 | */ | ||
2315 | ret = attach_task_by_pid(cgrp, tgid, true); | ||
2316 | } while (ret == -EAGAIN); | ||
2317 | return ret; | ||
2318 | } | 2264 | } |
2319 | 2265 | ||
2320 | /** | 2266 | /** |
@@ -2710,50 +2656,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2710 | return mode; | 2656 | return mode; |
2711 | } | 2657 | } |
2712 | 2658 | ||
2713 | int cgroup_add_file(struct cgroup *cgrp, | 2659 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2714 | struct cgroup_subsys *subsys, | 2660 | const struct cftype *cft) |
2715 | const struct cftype *cft) | ||
2716 | { | 2661 | { |
2717 | struct dentry *dir = cgrp->dentry; | 2662 | struct dentry *dir = cgrp->dentry; |
2663 | struct cgroup *parent = __d_cgrp(dir); | ||
2718 | struct dentry *dentry; | 2664 | struct dentry *dentry; |
2665 | struct cfent *cfe; | ||
2719 | int error; | 2666 | int error; |
2720 | umode_t mode; | 2667 | umode_t mode; |
2721 | |||
2722 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2668 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2669 | |||
2670 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2671 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2672 | return 0; | ||
2673 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2674 | return 0; | ||
2675 | |||
2723 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2676 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2724 | strcpy(name, subsys->name); | 2677 | strcpy(name, subsys->name); |
2725 | strcat(name, "."); | 2678 | strcat(name, "."); |
2726 | } | 2679 | } |
2727 | strcat(name, cft->name); | 2680 | strcat(name, cft->name); |
2681 | |||
2728 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); | 2682 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); |
2683 | |||
2684 | cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); | ||
2685 | if (!cfe) | ||
2686 | return -ENOMEM; | ||
2687 | |||
2729 | dentry = lookup_one_len(name, dir, strlen(name)); | 2688 | dentry = lookup_one_len(name, dir, strlen(name)); |
2730 | if (!IS_ERR(dentry)) { | 2689 | if (IS_ERR(dentry)) { |
2731 | mode = cgroup_file_mode(cft); | ||
2732 | error = cgroup_create_file(dentry, mode | S_IFREG, | ||
2733 | cgrp->root->sb); | ||
2734 | if (!error) | ||
2735 | dentry->d_fsdata = (void *)cft; | ||
2736 | dput(dentry); | ||
2737 | } else | ||
2738 | error = PTR_ERR(dentry); | 2690 | error = PTR_ERR(dentry); |
2691 | goto out; | ||
2692 | } | ||
2693 | |||
2694 | mode = cgroup_file_mode(cft); | ||
2695 | error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); | ||
2696 | if (!error) { | ||
2697 | cfe->type = (void *)cft; | ||
2698 | cfe->dentry = dentry; | ||
2699 | dentry->d_fsdata = cfe; | ||
2700 | list_add_tail(&cfe->node, &parent->files); | ||
2701 | cfe = NULL; | ||
2702 | } | ||
2703 | dput(dentry); | ||
2704 | out: | ||
2705 | kfree(cfe); | ||
2739 | return error; | 2706 | return error; |
2740 | } | 2707 | } |
2741 | EXPORT_SYMBOL_GPL(cgroup_add_file); | ||
2742 | 2708 | ||
2743 | int cgroup_add_files(struct cgroup *cgrp, | 2709 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2744 | struct cgroup_subsys *subsys, | 2710 | const struct cftype cfts[], bool is_add) |
2745 | const struct cftype cft[], | ||
2746 | int count) | ||
2747 | { | 2711 | { |
2748 | int i, err; | 2712 | const struct cftype *cft; |
2749 | for (i = 0; i < count; i++) { | 2713 | int err, ret = 0; |
2750 | err = cgroup_add_file(cgrp, subsys, &cft[i]); | 2714 | |
2751 | if (err) | 2715 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2752 | return err; | 2716 | if (is_add) |
2717 | err = cgroup_add_file(cgrp, subsys, cft); | ||
2718 | else | ||
2719 | err = cgroup_rm_file(cgrp, cft); | ||
2720 | if (err) { | ||
2721 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
2722 | is_add ? "add" : "remove", cft->name, err); | ||
2723 | ret = err; | ||
2724 | } | ||
2725 | } | ||
2726 | return ret; | ||
2727 | } | ||
2728 | |||
2729 | static DEFINE_MUTEX(cgroup_cft_mutex); | ||
2730 | |||
2731 | static void cgroup_cfts_prepare(void) | ||
2732 | __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) | ||
2733 | { | ||
2734 | /* | ||
2735 | * Thanks to the entanglement with vfs inode locking, we can't walk | ||
2736 | * the existing cgroups under cgroup_mutex and create files. | ||
2737 | * Instead, we increment reference on all cgroups and build list of | ||
2738 | * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure | ||
2739 | * exclusive access to the field. | ||
2740 | */ | ||
2741 | mutex_lock(&cgroup_cft_mutex); | ||
2742 | mutex_lock(&cgroup_mutex); | ||
2743 | } | ||
2744 | |||
2745 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | ||
2746 | const struct cftype *cfts, bool is_add) | ||
2747 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | ||
2748 | { | ||
2749 | LIST_HEAD(pending); | ||
2750 | struct cgroup *cgrp, *n; | ||
2751 | |||
2752 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | ||
2753 | if (cfts && ss->root != &rootnode) { | ||
2754 | list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { | ||
2755 | dget(cgrp->dentry); | ||
2756 | list_add_tail(&cgrp->cft_q_node, &pending); | ||
2757 | } | ||
2758 | } | ||
2759 | |||
2760 | mutex_unlock(&cgroup_mutex); | ||
2761 | |||
2762 | /* | ||
2763 | * All new cgroups will see @cfts update on @ss->cftsets. Add/rm | ||
2764 | * files for all cgroups which were created before. | ||
2765 | */ | ||
2766 | list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { | ||
2767 | struct inode *inode = cgrp->dentry->d_inode; | ||
2768 | |||
2769 | mutex_lock(&inode->i_mutex); | ||
2770 | mutex_lock(&cgroup_mutex); | ||
2771 | if (!cgroup_is_removed(cgrp)) | ||
2772 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | ||
2773 | mutex_unlock(&cgroup_mutex); | ||
2774 | mutex_unlock(&inode->i_mutex); | ||
2775 | |||
2776 | list_del_init(&cgrp->cft_q_node); | ||
2777 | dput(cgrp->dentry); | ||
2753 | } | 2778 | } |
2779 | |||
2780 | mutex_unlock(&cgroup_cft_mutex); | ||
2781 | } | ||
2782 | |||
2783 | /** | ||
2784 | * cgroup_add_cftypes - add an array of cftypes to a subsystem | ||
2785 | * @ss: target cgroup subsystem | ||
2786 | * @cfts: zero-length name terminated array of cftypes | ||
2787 | * | ||
2788 | * Register @cfts to @ss. Files described by @cfts are created for all | ||
2789 | * existing cgroups to which @ss is attached and all future cgroups will | ||
2790 | * have them too. This function can be called anytime whether @ss is | ||
2791 | * attached or not. | ||
2792 | * | ||
2793 | * Returns 0 on successful registration, -errno on failure. Note that this | ||
2794 | * function currently returns 0 as long as @cfts registration is successful | ||
2795 | * even if some file creation attempts on existing cgroups fail. | ||
2796 | */ | ||
2797 | int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2798 | { | ||
2799 | struct cftype_set *set; | ||
2800 | |||
2801 | set = kzalloc(sizeof(*set), GFP_KERNEL); | ||
2802 | if (!set) | ||
2803 | return -ENOMEM; | ||
2804 | |||
2805 | cgroup_cfts_prepare(); | ||
2806 | set->cfts = cfts; | ||
2807 | list_add_tail(&set->node, &ss->cftsets); | ||
2808 | cgroup_cfts_commit(ss, cfts, true); | ||
2809 | |||
2754 | return 0; | 2810 | return 0; |
2755 | } | 2811 | } |
2756 | EXPORT_SYMBOL_GPL(cgroup_add_files); | 2812 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
2813 | |||
2814 | /** | ||
2815 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | ||
2816 | * @ss: target cgroup subsystem | ||
2817 | * @cfts: zero-length name terminated array of cftypes | ||
2818 | * | ||
2819 | * Unregister @cfts from @ss. Files described by @cfts are removed from | ||
2820 | * all existing cgroups to which @ss is attached and all future cgroups | ||
2821 | * won't have them either. This function can be called anytime whether @ss | ||
2822 | * is attached or not. | ||
2823 | * | ||
2824 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | ||
2825 | * registered with @ss. | ||
2826 | */ | ||
2827 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2828 | { | ||
2829 | struct cftype_set *set; | ||
2830 | |||
2831 | cgroup_cfts_prepare(); | ||
2832 | |||
2833 | list_for_each_entry(set, &ss->cftsets, node) { | ||
2834 | if (set->cfts == cfts) { | ||
2835 | list_del_init(&set->node); | ||
2836 | cgroup_cfts_commit(ss, cfts, false); | ||
2837 | return 0; | ||
2838 | } | ||
2839 | } | ||
2840 | |||
2841 | cgroup_cfts_commit(ss, NULL, false); | ||
2842 | return -ENOENT; | ||
2843 | } | ||
2757 | 2844 | ||
2758 | /** | 2845 | /** |
2759 | * cgroup_task_count - count the number of tasks in a cgroup. | 2846 | * cgroup_task_count - count the number of tasks in a cgroup. |
@@ -2804,15 +2891,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp, | |||
2804 | * using their cgroups capability, we don't maintain the lists running | 2891 | * using their cgroups capability, we don't maintain the lists running |
2805 | * through each css_set to its tasks until we see the list actually | 2892 | * through each css_set to its tasks until we see the list actually |
2806 | * used - in other words after the first call to cgroup_iter_start(). | 2893 | * used - in other words after the first call to cgroup_iter_start(). |
2807 | * | ||
2808 | * The tasklist_lock is not held here, as do_each_thread() and | ||
2809 | * while_each_thread() are protected by RCU. | ||
2810 | */ | 2894 | */ |
2811 | static void cgroup_enable_task_cg_lists(void) | 2895 | static void cgroup_enable_task_cg_lists(void) |
2812 | { | 2896 | { |
2813 | struct task_struct *p, *g; | 2897 | struct task_struct *p, *g; |
2814 | write_lock(&css_set_lock); | 2898 | write_lock(&css_set_lock); |
2815 | use_task_css_set_links = 1; | 2899 | use_task_css_set_links = 1; |
2900 | /* | ||
2901 | * We need tasklist_lock because RCU is not safe against | ||
2902 | * while_each_thread(). Besides, a forking task that has passed | ||
2903 | * cgroup_post_fork() without seeing use_task_css_set_links = 1 | ||
2904 | * is not guaranteed to have its child immediately visible in the | ||
2905 | * tasklist if we walk through it with RCU. | ||
2906 | */ | ||
2907 | read_lock(&tasklist_lock); | ||
2816 | do_each_thread(g, p) { | 2908 | do_each_thread(g, p) { |
2817 | task_lock(p); | 2909 | task_lock(p); |
2818 | /* | 2910 | /* |
@@ -2824,6 +2916,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
2824 | list_add(&p->cg_list, &p->cgroups->tasks); | 2916 | list_add(&p->cg_list, &p->cgroups->tasks); |
2825 | task_unlock(p); | 2917 | task_unlock(p); |
2826 | } while_each_thread(g, p); | 2918 | } while_each_thread(g, p); |
2919 | read_unlock(&tasklist_lock); | ||
2827 | write_unlock(&css_set_lock); | 2920 | write_unlock(&css_set_lock); |
2828 | } | 2921 | } |
2829 | 2922 | ||
@@ -3043,6 +3136,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3043 | * | 3136 | * |
3044 | */ | 3137 | */ |
3045 | 3138 | ||
3139 | /* which pidlist file are we talking about? */ | ||
3140 | enum cgroup_filetype { | ||
3141 | CGROUP_FILE_PROCS, | ||
3142 | CGROUP_FILE_TASKS, | ||
3143 | }; | ||
3144 | |||
3145 | /* | ||
3146 | * A pidlist is a list of pids that virtually represents the contents of one | ||
3147 | * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, | ||
3148 | * a pair (one each for procs, tasks) for each pid namespace that's relevant | ||
3149 | * to the cgroup. | ||
3150 | */ | ||
3151 | struct cgroup_pidlist { | ||
3152 | /* | ||
3153 | * used to find which pidlist is wanted. doesn't change as long as | ||
3154 | * this particular list stays in the list. | ||
3155 | */ | ||
3156 | struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; | ||
3157 | /* array of xids */ | ||
3158 | pid_t *list; | ||
3159 | /* how many elements the above list has */ | ||
3160 | int length; | ||
3161 | /* how many files are using the current array */ | ||
3162 | int use_count; | ||
3163 | /* each of these stored in a list by its cgroup */ | ||
3164 | struct list_head links; | ||
3165 | /* pointer to the cgroup we belong to, for list removal purposes */ | ||
3166 | struct cgroup *owner; | ||
3167 | /* protects the other fields */ | ||
3168 | struct rw_semaphore mutex; | ||
3169 | }; | ||
3170 | |||
3046 | /* | 3171 | /* |
3047 | * The following two functions "fix" the issue where there are more pids | 3172 | * The following two functions "fix" the issue where there are more pids |
3048 | * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. | 3173 | * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. |
@@ -3694,13 +3819,14 @@ static struct cftype files[] = { | |||
3694 | .read_u64 = cgroup_clone_children_read, | 3819 | .read_u64 = cgroup_clone_children_read, |
3695 | .write_u64 = cgroup_clone_children_write, | 3820 | .write_u64 = cgroup_clone_children_write, |
3696 | }, | 3821 | }, |
3697 | }; | 3822 | { |
3698 | 3823 | .name = "release_agent", | |
3699 | static struct cftype cft_release_agent = { | 3824 | .flags = CFTYPE_ONLY_ON_ROOT, |
3700 | .name = "release_agent", | 3825 | .read_seq_string = cgroup_release_agent_show, |
3701 | .read_seq_string = cgroup_release_agent_show, | 3826 | .write_string = cgroup_release_agent_write, |
3702 | .write_string = cgroup_release_agent_write, | 3827 | .max_write_len = PATH_MAX, |
3703 | .max_write_len = PATH_MAX, | 3828 | }, |
3829 | { } /* terminate */ | ||
3704 | }; | 3830 | }; |
3705 | 3831 | ||
3706 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3832 | static int cgroup_populate_dir(struct cgroup *cgrp) |
@@ -3708,22 +3834,21 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3708 | int err; | 3834 | int err; |
3709 | struct cgroup_subsys *ss; | 3835 | struct cgroup_subsys *ss; |
3710 | 3836 | ||
3711 | /* First clear out any existing files */ | 3837 | err = cgroup_addrm_files(cgrp, NULL, files, true); |
3712 | cgroup_clear_directory(cgrp->dentry); | ||
3713 | |||
3714 | err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); | ||
3715 | if (err < 0) | 3838 | if (err < 0) |
3716 | return err; | 3839 | return err; |
3717 | 3840 | ||
3718 | if (cgrp == cgrp->top_cgroup) { | 3841 | /* process cftsets of each subsystem */ |
3719 | if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) | ||
3720 | return err; | ||
3721 | } | ||
3722 | |||
3723 | for_each_subsys(cgrp->root, ss) { | 3842 | for_each_subsys(cgrp->root, ss) { |
3843 | struct cftype_set *set; | ||
3844 | |||
3724 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) | 3845 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) |
3725 | return err; | 3846 | return err; |
3847 | |||
3848 | list_for_each_entry(set, &ss->cftsets, node) | ||
3849 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | ||
3726 | } | 3850 | } |
3851 | |||
3727 | /* This cgroup is ready now */ | 3852 | /* This cgroup is ready now */ |
3728 | for_each_subsys(cgrp->root, ss) { | 3853 | for_each_subsys(cgrp->root, ss) { |
3729 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3854 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
@@ -3739,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3739 | return 0; | 3864 | return 0; |
3740 | } | 3865 | } |
3741 | 3866 | ||
3867 | static void css_dput_fn(struct work_struct *work) | ||
3868 | { | ||
3869 | struct cgroup_subsys_state *css = | ||
3870 | container_of(work, struct cgroup_subsys_state, dput_work); | ||
3871 | |||
3872 | dput(css->cgroup->dentry); | ||
3873 | } | ||
3874 | |||
3742 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3875 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
3743 | struct cgroup_subsys *ss, | 3876 | struct cgroup_subsys *ss, |
3744 | struct cgroup *cgrp) | 3877 | struct cgroup *cgrp) |
@@ -3751,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
3751 | set_bit(CSS_ROOT, &css->flags); | 3884 | set_bit(CSS_ROOT, &css->flags); |
3752 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 3885 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
3753 | cgrp->subsys[ss->subsys_id] = css; | 3886 | cgrp->subsys[ss->subsys_id] = css; |
3887 | |||
3888 | /* | ||
3889 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | ||
3890 | * which is put on the last css_put(). dput() requires process | ||
3891 | * context, which css_put() may be called without. @css->dput_work | ||
3892 | * will be used to invoke dput() asynchronously from css_put(). | ||
3893 | */ | ||
3894 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
3895 | if (ss->__DEPRECATED_clear_css_refs) | ||
3896 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | ||
3754 | } | 3897 | } |
3755 | 3898 | ||
3756 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | 3899 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) |
@@ -3827,7 +3970,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3827 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 3970 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); |
3828 | 3971 | ||
3829 | for_each_subsys(root, ss) { | 3972 | for_each_subsys(root, ss) { |
3830 | struct cgroup_subsys_state *css = ss->create(ss, cgrp); | 3973 | struct cgroup_subsys_state *css = ss->create(cgrp); |
3831 | 3974 | ||
3832 | if (IS_ERR(css)) { | 3975 | if (IS_ERR(css)) { |
3833 | err = PTR_ERR(css); | 3976 | err = PTR_ERR(css); |
@@ -3841,7 +3984,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3841 | } | 3984 | } |
3842 | /* At error, ->destroy() callback has to free assigned ID. */ | 3985 | /* At error, ->destroy() callback has to free assigned ID. */ |
3843 | if (clone_children(parent) && ss->post_clone) | 3986 | if (clone_children(parent) && ss->post_clone) |
3844 | ss->post_clone(ss, cgrp); | 3987 | ss->post_clone(cgrp); |
3845 | } | 3988 | } |
3846 | 3989 | ||
3847 | cgroup_lock_hierarchy(root); | 3990 | cgroup_lock_hierarchy(root); |
@@ -3853,9 +3996,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3853 | if (err < 0) | 3996 | if (err < 0) |
3854 | goto err_remove; | 3997 | goto err_remove; |
3855 | 3998 | ||
3999 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4000 | for_each_subsys(root, ss) | ||
4001 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4002 | dget(dentry); | ||
4003 | |||
3856 | /* The cgroup directory was pre-locked for us */ | 4004 | /* The cgroup directory was pre-locked for us */ |
3857 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4005 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
3858 | 4006 | ||
4007 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4008 | |||
3859 | err = cgroup_populate_dir(cgrp); | 4009 | err = cgroup_populate_dir(cgrp); |
3860 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4010 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
3861 | 4011 | ||
@@ -3875,7 +4025,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3875 | 4025 | ||
3876 | for_each_subsys(root, ss) { | 4026 | for_each_subsys(root, ss) { |
3877 | if (cgrp->subsys[ss->subsys_id]) | 4027 | if (cgrp->subsys[ss->subsys_id]) |
3878 | ss->destroy(ss, cgrp); | 4028 | ss->destroy(cgrp); |
3879 | } | 4029 | } |
3880 | 4030 | ||
3881 | mutex_unlock(&cgroup_mutex); | 4031 | mutex_unlock(&cgroup_mutex); |
@@ -3895,18 +4045,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
3895 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4045 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
3896 | } | 4046 | } |
3897 | 4047 | ||
4048 | /* | ||
4049 | * Check the reference count on each subsystem. Since we already | ||
4050 | * established that there are no tasks in the cgroup, if the css refcount | ||
4051 | * is also 1, then there should be no outstanding references, so the | ||
4052 | * subsystem is safe to destroy. We scan across all subsystems rather than | ||
4053 | * using the per-hierarchy linked list of mounted subsystems since we can | ||
4054 | * be called via check_for_release() with no synchronization other than | ||
4055 | * RCU, and the subsystem linked list isn't RCU-safe. | ||
4056 | */ | ||
3898 | static int cgroup_has_css_refs(struct cgroup *cgrp) | 4057 | static int cgroup_has_css_refs(struct cgroup *cgrp) |
3899 | { | 4058 | { |
3900 | /* Check the reference count on each subsystem. Since we | ||
3901 | * already established that there are no tasks in the | ||
3902 | * cgroup, if the css refcount is also 1, then there should | ||
3903 | * be no outstanding references, so the subsystem is safe to | ||
3904 | * destroy. We scan across all subsystems rather than using | ||
3905 | * the per-hierarchy linked list of mounted subsystems since | ||
3906 | * we can be called via check_for_release() with no | ||
3907 | * synchronization other than RCU, and the subsystem linked | ||
3908 | * list isn't RCU-safe */ | ||
3909 | int i; | 4059 | int i; |
4060 | |||
3910 | /* | 4061 | /* |
3911 | * We won't need to lock the subsys array, because the subsystems | 4062 | * We won't need to lock the subsys array, because the subsystems |
3912 | * we're concerned about aren't going anywhere since our cgroup root | 4063 | * we're concerned about aren't going anywhere since our cgroup root |
@@ -3915,17 +4066,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3915 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4066 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3916 | struct cgroup_subsys *ss = subsys[i]; | 4067 | struct cgroup_subsys *ss = subsys[i]; |
3917 | struct cgroup_subsys_state *css; | 4068 | struct cgroup_subsys_state *css; |
4069 | |||
3918 | /* Skip subsystems not present or not in this hierarchy */ | 4070 | /* Skip subsystems not present or not in this hierarchy */ |
3919 | if (ss == NULL || ss->root != cgrp->root) | 4071 | if (ss == NULL || ss->root != cgrp->root) |
3920 | continue; | 4072 | continue; |
4073 | |||
3921 | css = cgrp->subsys[ss->subsys_id]; | 4074 | css = cgrp->subsys[ss->subsys_id]; |
3922 | /* When called from check_for_release() it's possible | 4075 | /* |
4076 | * When called from check_for_release() it's possible | ||
3923 | * that by this point the cgroup has been removed | 4077 | * that by this point the cgroup has been removed |
3924 | * and the css deleted. But a false-positive doesn't | 4078 | * and the css deleted. But a false-positive doesn't |
3925 | * matter, since it can only happen if the cgroup | 4079 | * matter, since it can only happen if the cgroup |
3926 | * has been deleted and hence no longer needs the | 4080 | * has been deleted and hence no longer needs the |
3927 | * release agent to be called anyway. */ | 4081 | * release agent to be called anyway. |
3928 | if (css && (atomic_read(&css->refcnt) > 1)) | 4082 | */ |
4083 | if (css && css_refcnt(css) > 1) | ||
3929 | return 1; | 4084 | return 1; |
3930 | } | 4085 | } |
3931 | return 0; | 4086 | return 0; |
@@ -3935,51 +4090,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3935 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4090 | * Atomically mark all (or else none) of the cgroup's CSS objects as |
3936 | * CSS_REMOVED. Return true on success, or false if the cgroup has | 4091 | * CSS_REMOVED. Return true on success, or false if the cgroup has |
3937 | * busy subsystems. Call with cgroup_mutex held | 4092 | * busy subsystems. Call with cgroup_mutex held |
4093 | * | ||
4094 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4095 | * not, cgroup removal behaves differently. | ||
4096 | * | ||
4097 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4098 | * cgroup removal can be committed. This is implemented by | ||
4099 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4100 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4101 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4102 | * removed as soon as the existing user (memcg) is updated. | ||
4103 | * | ||
4104 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4105 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4106 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4107 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4108 | * is put so that dentry destruction happens only after all css's are | ||
4109 | * released. | ||
3938 | */ | 4110 | */ |
3939 | |||
3940 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | 4111 | static int cgroup_clear_css_refs(struct cgroup *cgrp) |
3941 | { | 4112 | { |
3942 | struct cgroup_subsys *ss; | 4113 | struct cgroup_subsys *ss; |
3943 | unsigned long flags; | 4114 | unsigned long flags; |
3944 | bool failed = false; | 4115 | bool failed = false; |
4116 | |||
3945 | local_irq_save(flags); | 4117 | local_irq_save(flags); |
4118 | |||
4119 | /* | ||
4120 | * Block new css_tryget() by deactivating refcnt. If all refcnts | ||
4121 | * for subsystems w/ clear_css_refs set were 1 at the moment of | ||
4122 | * deactivation, we succeeded. | ||
4123 | */ | ||
3946 | for_each_subsys(cgrp->root, ss) { | 4124 | for_each_subsys(cgrp->root, ss) { |
3947 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4125 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3948 | int refcnt; | 4126 | |
3949 | while (1) { | 4127 | WARN_ON(atomic_read(&css->refcnt) < 0); |
3950 | /* We can only remove a CSS with a refcnt==1 */ | 4128 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
3951 | refcnt = atomic_read(&css->refcnt); | 4129 | |
3952 | if (refcnt > 1) { | 4130 | if (ss->__DEPRECATED_clear_css_refs) |
3953 | failed = true; | 4131 | failed |= css_refcnt(css) != 1; |
3954 | goto done; | ||
3955 | } | ||
3956 | BUG_ON(!refcnt); | ||
3957 | /* | ||
3958 | * Drop the refcnt to 0 while we check other | ||
3959 | * subsystems. This will cause any racing | ||
3960 | * css_tryget() to spin until we set the | ||
3961 | * CSS_REMOVED bits or abort | ||
3962 | */ | ||
3963 | if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) | ||
3964 | break; | ||
3965 | cpu_relax(); | ||
3966 | } | ||
3967 | } | 4132 | } |
3968 | done: | 4133 | |
4134 | /* | ||
4135 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4136 | * restore refcnts to positive values. Either way, all in-progress | ||
4137 | * css_tryget() will be released. | ||
4138 | */ | ||
3969 | for_each_subsys(cgrp->root, ss) { | 4139 | for_each_subsys(cgrp->root, ss) { |
3970 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4140 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3971 | if (failed) { | 4141 | |
3972 | /* | 4142 | if (!failed) { |
3973 | * Restore old refcnt if we previously managed | ||
3974 | * to clear it from 1 to 0 | ||
3975 | */ | ||
3976 | if (!atomic_read(&css->refcnt)) | ||
3977 | atomic_set(&css->refcnt, 1); | ||
3978 | } else { | ||
3979 | /* Commit the fact that the CSS is removed */ | ||
3980 | set_bit(CSS_REMOVED, &css->flags); | 4143 | set_bit(CSS_REMOVED, &css->flags); |
4144 | css_put(css); | ||
4145 | } else { | ||
4146 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
3981 | } | 4147 | } |
3982 | } | 4148 | } |
4149 | |||
3983 | local_irq_restore(flags); | 4150 | local_irq_restore(flags); |
3984 | return !failed; | 4151 | return !failed; |
3985 | } | 4152 | } |
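
The rewritten cgroup_clear_css_refs() above blocks new css_tryget() callers by adding a large negative bias to the refcount instead of spinning it down to zero. css_refcnt(), used throughout this hunk, masks that bias back out; its definition is not part of this excerpt, but the idea is roughly the sketch below (the constant value and helper body are assumptions, not quoted from the patch).

/* Illustrative sketch of a biased refcount; the names mirror the patch, the bodies are assumed. */
#include <linux/kernel.h>
#include <linux/atomic.h>

#define CSS_DEACT_BIAS          INT_MIN         /* assumed value */

static int css_refcnt_sketch(atomic_t *refcnt)
{
        int v = atomic_read(refcnt);

        /* strip the bias so callers always see the logical reference count */
        return v >= 0 ? v : v - CSS_DEACT_BIAS;
}

With a bias like INT_MIN, a deactivated css has a negative raw counter, which is presumably what the WARN_ON() before the atomic_add() guards against: applying the bias to an already-deactivated counter.
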
@@ -4064,6 +4231,8 @@ again: | |||
4064 | list_del_init(&cgrp->sibling); | 4231 | list_del_init(&cgrp->sibling); |
4065 | cgroup_unlock_hierarchy(cgrp->root); | 4232 | cgroup_unlock_hierarchy(cgrp->root); |
4066 | 4233 | ||
4234 | list_del_init(&cgrp->allcg_node); | ||
4235 | |||
4067 | d = dget(cgrp->dentry); | 4236 | d = dget(cgrp->dentry); |
4068 | 4237 | ||
4069 | cgroup_d_remove_dir(d); | 4238 | cgroup_d_remove_dir(d); |
@@ -4090,16 +4259,33 @@ again: | |||
4090 | return 0; | 4259 | return 0; |
4091 | } | 4260 | } |
4092 | 4261 | ||
4262 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | ||
4263 | { | ||
4264 | INIT_LIST_HEAD(&ss->cftsets); | ||
4265 | |||
4266 | /* | ||
4267 | * base_cftset is embedded in subsys itself, no need to worry about | ||
4268 | * deregistration. | ||
4269 | */ | ||
4270 | if (ss->base_cftypes) { | ||
4271 | ss->base_cftset.cfts = ss->base_cftypes; | ||
4272 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | ||
4273 | } | ||
4274 | } | ||
4275 | |||
4093 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4276 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) |
4094 | { | 4277 | { |
4095 | struct cgroup_subsys_state *css; | 4278 | struct cgroup_subsys_state *css; |
4096 | 4279 | ||
4097 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4280 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4098 | 4281 | ||
4282 | /* init base cftset */ | ||
4283 | cgroup_init_cftsets(ss); | ||
4284 | |||
4099 | /* Create the top cgroup state for this subsystem */ | 4285 | /* Create the top cgroup state for this subsystem */ |
4100 | list_add(&ss->sibling, &rootnode.subsys_list); | 4286 | list_add(&ss->sibling, &rootnode.subsys_list); |
4101 | ss->root = &rootnode; | 4287 | ss->root = &rootnode; |
4102 | css = ss->create(ss, dummytop); | 4288 | css = ss->create(dummytop); |
4103 | /* We don't handle early failures gracefully */ | 4289 | /* We don't handle early failures gracefully */ |
4104 | BUG_ON(IS_ERR(css)); | 4290 | BUG_ON(IS_ERR(css)); |
4105 | init_cgroup_css(css, ss, dummytop); | 4291 | init_cgroup_css(css, ss, dummytop); |
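
cgroup_init_cftsets() above links the subsystem's embedded base_cftset onto ss->cftsets, which is the list cgroup_populate_dir() now walks. The set type itself is declared elsewhere in the patch; it is assumed to look roughly like this:

/* Assumed shape of the per-subsystem cftype set (not quoted from the patch). */
#include <linux/list.h>

struct cftype;                                  /* declared in include/linux/cgroup.h */

struct cftype_set {
        struct list_head        node;           /* chained on ss->cftsets */
        struct cftype           *cfts;          /* array terminated by an empty entry */
};
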
@@ -4165,6 +4351,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4165 | return 0; | 4351 | return 0; |
4166 | } | 4352 | } |
4167 | 4353 | ||
4354 | /* init base cftset */ | ||
4355 | cgroup_init_cftsets(ss); | ||
4356 | |||
4168 | /* | 4357 | /* |
4169 | * need to register a subsys id before anything else - for example, | 4358 | * need to register a subsys id before anything else - for example, |
4170 | * init_cgroup_css needs it. | 4359 | * init_cgroup_css needs it. |
@@ -4188,7 +4377,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4188 | * no ss->create seems to need anything important in the ss struct, so | 4377 | * no ss->create seems to need anything important in the ss struct, so |
4189 | * this can happen first (i.e. before the rootnode attachment). | 4378 | * this can happen first (i.e. before the rootnode attachment). |
4190 | */ | 4379 | */ |
4191 | css = ss->create(ss, dummytop); | 4380 | css = ss->create(dummytop); |
4192 | if (IS_ERR(css)) { | 4381 | if (IS_ERR(css)) { |
4193 | /* failure case - need to deassign the subsys[] slot. */ | 4382 | /* failure case - need to deassign the subsys[] slot. */ |
4194 | subsys[i] = NULL; | 4383 | subsys[i] = NULL; |
@@ -4206,7 +4395,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4206 | int ret = cgroup_init_idr(ss, css); | 4395 | int ret = cgroup_init_idr(ss, css); |
4207 | if (ret) { | 4396 | if (ret) { |
4208 | dummytop->subsys[ss->subsys_id] = NULL; | 4397 | dummytop->subsys[ss->subsys_id] = NULL; |
4209 | ss->destroy(ss, dummytop); | 4398 | ss->destroy(dummytop); |
4210 | subsys[i] = NULL; | 4399 | subsys[i] = NULL; |
4211 | mutex_unlock(&cgroup_mutex); | 4400 | mutex_unlock(&cgroup_mutex); |
4212 | return ret; | 4401 | return ret; |
@@ -4304,7 +4493,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4304 | * pointer to find their state. note that this also takes care of | 4493 | * pointer to find their state. note that this also takes care of |
4305 | * freeing the css_id. | 4494 | * freeing the css_id. |
4306 | */ | 4495 | */ |
4307 | ss->destroy(ss, dummytop); | 4496 | ss->destroy(dummytop); |
4308 | dummytop->subsys[ss->subsys_id] = NULL; | 4497 | dummytop->subsys[ss->subsys_id] = NULL; |
4309 | 4498 | ||
4310 | mutex_unlock(&cgroup_mutex); | 4499 | mutex_unlock(&cgroup_mutex); |
@@ -4580,7 +4769,7 @@ void cgroup_fork_callbacks(struct task_struct *child) | |||
4580 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 4769 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { |
4581 | struct cgroup_subsys *ss = subsys[i]; | 4770 | struct cgroup_subsys *ss = subsys[i]; |
4582 | if (ss->fork) | 4771 | if (ss->fork) |
4583 | ss->fork(ss, child); | 4772 | ss->fork(child); |
4584 | } | 4773 | } |
4585 | } | 4774 | } |
4586 | } | 4775 | } |
@@ -4596,6 +4785,17 @@ void cgroup_fork_callbacks(struct task_struct *child) | |||
4596 | */ | 4785 | */ |
4597 | void cgroup_post_fork(struct task_struct *child) | 4786 | void cgroup_post_fork(struct task_struct *child) |
4598 | { | 4787 | { |
4788 | /* | ||
4789 | * use_task_css_set_links is set to 1 before we walk the tasklist | ||
4790 | * under the tasklist_lock and we read it here after we added the child | ||
4791 | * to the tasklist under the tasklist_lock as well. If the child wasn't | ||
4792 | * yet in the tasklist when we walked through it from | ||
4793 | * cgroup_enable_task_cg_lists(), then use_task_css_set_links value | ||
4794 | * should be visible now due to the paired locking and barriers implied | ||
4795 | * by LOCK/UNLOCK: it is written before the tasklist_lock unlock | ||
4796 | * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock | ||
4797 | * lock on fork. | ||
4798 | */ | ||
4599 | if (use_task_css_set_links) { | 4799 | if (use_task_css_set_links) { |
4600 | write_lock(&css_set_lock); | 4800 | write_lock(&css_set_lock); |
4601 | if (list_empty(&child->cg_list)) { | 4801 | if (list_empty(&child->cg_list)) { |
@@ -4682,7 +4882,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4682 | struct cgroup *old_cgrp = | 4882 | struct cgroup *old_cgrp = |
4683 | rcu_dereference_raw(cg->subsys[i])->cgroup; | 4883 | rcu_dereference_raw(cg->subsys[i])->cgroup; |
4684 | struct cgroup *cgrp = task_cgroup(tsk, i); | 4884 | struct cgroup *cgrp = task_cgroup(tsk, i); |
4685 | ss->exit(ss, cgrp, old_cgrp, tsk); | 4885 | ss->exit(cgrp, old_cgrp, tsk); |
4686 | } | 4886 | } |
4687 | } | 4887 | } |
4688 | } | 4888 | } |
@@ -4743,21 +4943,41 @@ static void check_for_release(struct cgroup *cgrp) | |||
4743 | } | 4943 | } |
4744 | 4944 | ||
4745 | /* Caller must verify that the css is not for root cgroup */ | 4945 | /* Caller must verify that the css is not for root cgroup */ |
4746 | void __css_put(struct cgroup_subsys_state *css, int count) | 4946 | bool __css_tryget(struct cgroup_subsys_state *css) |
4947 | { | ||
4948 | do { | ||
4949 | int v = css_refcnt(css); | ||
4950 | |||
4951 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | ||
4952 | return true; | ||
4953 | cpu_relax(); | ||
4954 | } while (!test_bit(CSS_REMOVED, &css->flags)); | ||
4955 | |||
4956 | return false; | ||
4957 | } | ||
4958 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
4959 | |||
4960 | /* Caller must verify that the css is not for root cgroup */ | ||
4961 | void __css_put(struct cgroup_subsys_state *css) | ||
4747 | { | 4962 | { |
4748 | struct cgroup *cgrp = css->cgroup; | 4963 | struct cgroup *cgrp = css->cgroup; |
4749 | int val; | 4964 | |
4750 | rcu_read_lock(); | 4965 | rcu_read_lock(); |
4751 | val = atomic_sub_return(count, &css->refcnt); | 4966 | atomic_dec(&css->refcnt); |
4752 | if (val == 1) { | 4967 | switch (css_refcnt(css)) { |
4968 | case 1: | ||
4753 | if (notify_on_release(cgrp)) { | 4969 | if (notify_on_release(cgrp)) { |
4754 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4970 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
4755 | check_for_release(cgrp); | 4971 | check_for_release(cgrp); |
4756 | } | 4972 | } |
4757 | cgroup_wakeup_rmdir_waiter(cgrp); | 4973 | cgroup_wakeup_rmdir_waiter(cgrp); |
4974 | break; | ||
4975 | case 0: | ||
4976 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | ||
4977 | schedule_work(&css->dput_work); | ||
4978 | break; | ||
4758 | } | 4979 | } |
4759 | rcu_read_unlock(); | 4980 | rcu_read_unlock(); |
4760 | WARN_ON_ONCE(val < 1); | ||
4761 | } | 4981 | } |
4762 | EXPORT_SYMBOL_GPL(__css_put); | 4982 | EXPORT_SYMBOL_GPL(__css_put); |
4763 | 4983 | ||
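
__css_tryget() above keeps retrying the cmpxchg until it either bumps the count or observes CSS_REMOVED, and __css_put() now defers the dentry release through dput_work once the count reaches zero. From a caller's point of view the usual pinning pattern is a sketch like the following (do_something() is a placeholder, not a function from the patch):

/* Caller-side sketch around the css_tryget()/css_put() wrappers. */
static void use_css_sketch(struct cgroup_subsys_state *css)
{
        if (css_tryget(css)) {
                /* css, and hence its cgroup's dentry, stays pinned in this window */
                do_something(css);              /* hypothetical work */
                css_put(css);
        }
}
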
@@ -4876,7 +5096,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
4876 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5096 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
4877 | * it's unchanged until freed. | 5097 | * it's unchanged until freed. |
4878 | */ | 5098 | */ |
4879 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5099 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4880 | 5100 | ||
4881 | if (cssid) | 5101 | if (cssid) |
4882 | return cssid->id; | 5102 | return cssid->id; |
@@ -4888,7 +5108,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
4888 | { | 5108 | { |
4889 | struct css_id *cssid; | 5109 | struct css_id *cssid; |
4890 | 5110 | ||
4891 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5111 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4892 | 5112 | ||
4893 | if (cssid) | 5113 | if (cssid) |
4894 | return cssid->depth; | 5114 | return cssid->depth; |
@@ -4939,9 +5159,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
4939 | 5159 | ||
4940 | rcu_assign_pointer(id->css, NULL); | 5160 | rcu_assign_pointer(id->css, NULL); |
4941 | rcu_assign_pointer(css->id, NULL); | 5161 | rcu_assign_pointer(css->id, NULL); |
4942 | write_lock(&ss->id_lock); | 5162 | spin_lock(&ss->id_lock); |
4943 | idr_remove(&ss->idr, id->id); | 5163 | idr_remove(&ss->idr, id->id); |
4944 | write_unlock(&ss->id_lock); | 5164 | spin_unlock(&ss->id_lock); |
4945 | kfree_rcu(id, rcu_head); | 5165 | kfree_rcu(id, rcu_head); |
4946 | } | 5166 | } |
4947 | EXPORT_SYMBOL_GPL(free_css_id); | 5167 | EXPORT_SYMBOL_GPL(free_css_id); |
@@ -4967,10 +5187,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
4967 | error = -ENOMEM; | 5187 | error = -ENOMEM; |
4968 | goto err_out; | 5188 | goto err_out; |
4969 | } | 5189 | } |
4970 | write_lock(&ss->id_lock); | 5190 | spin_lock(&ss->id_lock); |
4971 | /* Don't use 0. allocates an ID of 1-65535 */ | 5191 | /* Don't use 0. allocates an ID of 1-65535 */ |
4972 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | 5192 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); |
4973 | write_unlock(&ss->id_lock); | 5193 | spin_unlock(&ss->id_lock); |
4974 | 5194 | ||
4975 | /* Returns error when there are no free spaces for new ID.*/ | 5195 | /* Returns error when there are no free spaces for new ID.*/ |
4976 | if (error) { | 5196 | if (error) { |
@@ -4985,9 +5205,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
4985 | return newid; | 5205 | return newid; |
4986 | remove_idr: | 5206 | remove_idr: |
4987 | error = -ENOSPC; | 5207 | error = -ENOSPC; |
4988 | write_lock(&ss->id_lock); | 5208 | spin_lock(&ss->id_lock); |
4989 | idr_remove(&ss->idr, myid); | 5209 | idr_remove(&ss->idr, myid); |
4990 | write_unlock(&ss->id_lock); | 5210 | spin_unlock(&ss->id_lock); |
4991 | err_out: | 5211 | err_out: |
4992 | kfree(newid); | 5212 | kfree(newid); |
4993 | return ERR_PTR(error); | 5213 | return ERR_PTR(error); |
@@ -4999,7 +5219,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
4999 | { | 5219 | { |
5000 | struct css_id *newid; | 5220 | struct css_id *newid; |
5001 | 5221 | ||
5002 | rwlock_init(&ss->id_lock); | 5222 | spin_lock_init(&ss->id_lock); |
5003 | idr_init(&ss->idr); | 5223 | idr_init(&ss->idr); |
5004 | 5224 | ||
5005 | newid = get_new_cssid(ss, 0); | 5225 | newid = get_new_cssid(ss, 0); |
@@ -5087,6 +5307,8 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
5087 | return NULL; | 5307 | return NULL; |
5088 | 5308 | ||
5089 | BUG_ON(!ss->use_id); | 5309 | BUG_ON(!ss->use_id); |
5310 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
5311 | |||
5090 | /* fill start point for scan */ | 5312 | /* fill start point for scan */ |
5091 | tmpid = id; | 5313 | tmpid = id; |
5092 | while (1) { | 5314 | while (1) { |
@@ -5094,10 +5316,7 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
5094 | * scan next entry from bitmap(tree), tmpid is updated after | 5316 | * scan next entry from bitmap(tree), tmpid is updated after |
5095 | * idr_get_next(). | 5317 | * idr_get_next(). |
5096 | */ | 5318 | */ |
5097 | read_lock(&ss->id_lock); | ||
5098 | tmp = idr_get_next(&ss->idr, &tmpid); | 5319 | tmp = idr_get_next(&ss->idr, &tmpid); |
5099 | read_unlock(&ss->id_lock); | ||
5100 | |||
5101 | if (!tmp) | 5320 | if (!tmp) |
5102 | break; | 5321 | break; |
5103 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { | 5322 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { |
@@ -5137,8 +5356,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5137 | } | 5356 | } |
5138 | 5357 | ||
5139 | #ifdef CONFIG_CGROUP_DEBUG | 5358 | #ifdef CONFIG_CGROUP_DEBUG |
5140 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, | 5359 | static struct cgroup_subsys_state *debug_create(struct cgroup *cont) |
5141 | struct cgroup *cont) | ||
5142 | { | 5360 | { |
5143 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5361 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5144 | 5362 | ||
@@ -5148,7 +5366,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, | |||
5148 | return css; | 5366 | return css; |
5149 | } | 5367 | } |
5150 | 5368 | ||
5151 | static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 5369 | static void debug_destroy(struct cgroup *cont) |
5152 | { | 5370 | { |
5153 | kfree(cont->subsys[debug_subsys_id]); | 5371 | kfree(cont->subsys[debug_subsys_id]); |
5154 | } | 5372 | } |
@@ -5271,19 +5489,15 @@ static struct cftype debug_files[] = { | |||
5271 | .name = "releasable", | 5489 | .name = "releasable", |
5272 | .read_u64 = releasable_read, | 5490 | .read_u64 = releasable_read, |
5273 | }, | 5491 | }, |
5274 | }; | ||
5275 | 5492 | ||
5276 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 5493 | { } /* terminate */ |
5277 | { | 5494 | }; |
5278 | return cgroup_add_files(cont, ss, debug_files, | ||
5279 | ARRAY_SIZE(debug_files)); | ||
5280 | } | ||
5281 | 5495 | ||
5282 | struct cgroup_subsys debug_subsys = { | 5496 | struct cgroup_subsys debug_subsys = { |
5283 | .name = "debug", | 5497 | .name = "debug", |
5284 | .create = debug_create, | 5498 | .create = debug_create, |
5285 | .destroy = debug_destroy, | 5499 | .destroy = debug_destroy, |
5286 | .populate = debug_populate, | ||
5287 | .subsys_id = debug_subsys_id, | 5500 | .subsys_id = debug_subsys_id, |
5501 | .base_cftypes = debug_files, | ||
5288 | }; | 5502 | }; |
5289 | #endif /* CONFIG_CGROUP_DEBUG */ | 5503 | #endif /* CONFIG_CGROUP_DEBUG */ |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..3649fc6b3eaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys; | |||
128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) | 128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) |
129 | * sighand->siglock | 129 | * sighand->siglock |
130 | */ | 130 | */ |
131 | static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, | 131 | static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) |
132 | struct cgroup *cgroup) | ||
133 | { | 132 | { |
134 | struct freezer *freezer; | 133 | struct freezer *freezer; |
135 | 134 | ||
@@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, | |||
142 | return &freezer->css; | 141 | return &freezer->css; |
143 | } | 142 | } |
144 | 143 | ||
145 | static void freezer_destroy(struct cgroup_subsys *ss, | 144 | static void freezer_destroy(struct cgroup *cgroup) |
146 | struct cgroup *cgroup) | ||
147 | { | 145 | { |
148 | struct freezer *freezer = cgroup_freezer(cgroup); | 146 | struct freezer *freezer = cgroup_freezer(cgroup); |
149 | 147 | ||
@@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task) | |||
164 | * a write to that file racing against an attach, and hence the | 162 | * a write to that file racing against an attach, and hence the |
165 | * can_attach() result will remain valid until the attach completes. | 163 | * can_attach() result will remain valid until the attach completes. |
166 | */ | 164 | */ |
167 | static int freezer_can_attach(struct cgroup_subsys *ss, | 165 | static int freezer_can_attach(struct cgroup *new_cgroup, |
168 | struct cgroup *new_cgroup, | ||
169 | struct cgroup_taskset *tset) | 166 | struct cgroup_taskset *tset) |
170 | { | 167 | { |
171 | struct freezer *freezer; | 168 | struct freezer *freezer; |
@@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, | |||
185 | return 0; | 182 | return 0; |
186 | } | 183 | } |
187 | 184 | ||
188 | static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | 185 | static void freezer_fork(struct task_struct *task) |
189 | { | 186 | { |
190 | struct freezer *freezer; | 187 | struct freezer *freezer; |
191 | 188 | ||
@@ -361,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup, | |||
361 | static struct cftype files[] = { | 358 | static struct cftype files[] = { |
362 | { | 359 | { |
363 | .name = "state", | 360 | .name = "state", |
361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
364 | .read_seq_string = freezer_read, | 362 | .read_seq_string = freezer_read, |
365 | .write_string = freezer_write, | 363 | .write_string = freezer_write, |
366 | }, | 364 | }, |
365 | { } /* terminate */ | ||
367 | }; | 366 | }; |
368 | 367 | ||
369 | static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) | ||
370 | { | ||
371 | if (!cgroup->parent) | ||
372 | return 0; | ||
373 | return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); | ||
374 | } | ||
375 | |||
376 | struct cgroup_subsys freezer_subsys = { | 368 | struct cgroup_subsys freezer_subsys = { |
377 | .name = "freezer", | 369 | .name = "freezer", |
378 | .create = freezer_create, | 370 | .create = freezer_create, |
379 | .destroy = freezer_destroy, | 371 | .destroy = freezer_destroy, |
380 | .populate = freezer_populate, | ||
381 | .subsys_id = freezer_subsys_id, | 372 | .subsys_id = freezer_subsys_id, |
382 | .can_attach = freezer_can_attach, | 373 | .can_attach = freezer_can_attach, |
383 | .fork = freezer_fork, | 374 | .fork = freezer_fork, |
375 | .base_cftypes = files, | ||
384 | }; | 376 | }; |
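
The freezer conversion above shows the shape every controller takes under the new interface: terminate the cftype array with an empty entry, drop ->populate(), and point ->base_cftypes at the array; CFTYPE_NOT_ON_ROOT replaces the old open-coded !cgroup->parent check. A minimal sketch for a hypothetical "foo" controller (handlers and foo_subsys_id assumed to exist):

/* Hypothetical controller converted to the new cftype interface. */
static struct cftype foo_files[] = {
        {
                .name = "state",
                .flags = CFTYPE_NOT_ON_ROOT,    /* not created in the root cgroup */
                .read_seq_string = foo_read,    /* hypothetical handlers */
                .write_string = foo_write,
        },
        { }     /* terminate */
};

struct cgroup_subsys foo_subsys = {
        .name = "foo",
        .create = foo_create,
        .destroy = foo_destroy,
        .subsys_id = foo_subsys_id,
        .base_cftypes = foo_files,
};
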
diff --git a/kernel/compat.c b/kernel/compat.c index f346cedfe24d..74ff8498809a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -31,11 +31,10 @@ | |||
31 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * Note that the native side is already converted to a timespec, because | 34 | * Get/set struct timeval with struct timespec on the native side |
35 | * that's what we want anyway. | ||
36 | */ | 35 | */ |
37 | static int compat_get_timeval(struct timespec *o, | 36 | static int compat_get_timeval_convert(struct timespec *o, |
38 | struct compat_timeval __user *i) | 37 | struct compat_timeval __user *i) |
39 | { | 38 | { |
40 | long usec; | 39 | long usec; |
41 | 40 | ||
@@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o, | |||
46 | return 0; | 45 | return 0; |
47 | } | 46 | } |
48 | 47 | ||
49 | static int compat_put_timeval(struct compat_timeval __user *o, | 48 | static int compat_put_timeval_convert(struct compat_timeval __user *o, |
50 | struct timeval *i) | 49 | struct timeval *i) |
51 | { | 50 | { |
52 | return (put_user(i->tv_sec, &o->tv_sec) || | 51 | return (put_user(i->tv_sec, &o->tv_sec) || |
53 | put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; | 52 | put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; |
@@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, | |||
117 | if (tv) { | 116 | if (tv) { |
118 | struct timeval ktv; | 117 | struct timeval ktv; |
119 | do_gettimeofday(&ktv); | 118 | do_gettimeofday(&ktv); |
120 | if (compat_put_timeval(tv, &ktv)) | 119 | if (compat_put_timeval_convert(tv, &ktv)) |
121 | return -EFAULT; | 120 | return -EFAULT; |
122 | } | 121 | } |
123 | if (tz) { | 122 | if (tz) { |
@@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, | |||
135 | struct timezone ktz; | 134 | struct timezone ktz; |
136 | 135 | ||
137 | if (tv) { | 136 | if (tv) { |
138 | if (compat_get_timeval(&kts, tv)) | 137 | if (compat_get_timeval_convert(&kts, tv)) |
139 | return -EFAULT; | 138 | return -EFAULT; |
140 | } | 139 | } |
141 | if (tz) { | 140 | if (tz) { |
@@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, | |||
146 | return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); | 145 | return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); |
147 | } | 146 | } |
148 | 147 | ||
148 | int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) | ||
149 | { | ||
150 | return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || | ||
151 | __get_user(tv->tv_sec, &ctv->tv_sec) || | ||
152 | __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; | ||
153 | } | ||
154 | EXPORT_SYMBOL_GPL(get_compat_timeval); | ||
155 | |||
156 | int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) | ||
157 | { | ||
158 | return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || | ||
159 | __put_user(tv->tv_sec, &ctv->tv_sec) || | ||
160 | __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; | ||
161 | } | ||
162 | EXPORT_SYMBOL_GPL(put_compat_timeval); | ||
163 | |||
149 | int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) | 164 | int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) |
150 | { | 165 | { |
151 | return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || | 166 | return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || |
152 | __get_user(ts->tv_sec, &cts->tv_sec) || | 167 | __get_user(ts->tv_sec, &cts->tv_sec) || |
153 | __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; | 168 | __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; |
154 | } | 169 | } |
170 | EXPORT_SYMBOL_GPL(get_compat_timespec); | ||
155 | 171 | ||
156 | int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) | 172 | int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) |
157 | { | 173 | { |
@@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user | |||
161 | } | 177 | } |
162 | EXPORT_SYMBOL_GPL(put_compat_timespec); | 178 | EXPORT_SYMBOL_GPL(put_compat_timespec); |
163 | 179 | ||
180 | int compat_get_timeval(struct timeval *tv, const void __user *utv) | ||
181 | { | ||
182 | if (COMPAT_USE_64BIT_TIME) | ||
183 | return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; | ||
184 | else | ||
185 | return get_compat_timeval(tv, utv); | ||
186 | } | ||
187 | EXPORT_SYMBOL_GPL(compat_get_timeval); | ||
188 | |||
189 | int compat_put_timeval(const struct timeval *tv, void __user *utv) | ||
190 | { | ||
191 | if (COMPAT_USE_64BIT_TIME) | ||
192 | return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; | ||
193 | else | ||
194 | return put_compat_timeval(tv, utv); | ||
195 | } | ||
196 | EXPORT_SYMBOL_GPL(compat_put_timeval); | ||
197 | |||
198 | int compat_get_timespec(struct timespec *ts, const void __user *uts) | ||
199 | { | ||
200 | if (COMPAT_USE_64BIT_TIME) | ||
201 | return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; | ||
202 | else | ||
203 | return get_compat_timespec(ts, uts); | ||
204 | } | ||
205 | EXPORT_SYMBOL_GPL(compat_get_timespec); | ||
206 | |||
207 | int compat_put_timespec(const struct timespec *ts, void __user *uts) | ||
208 | { | ||
209 | if (COMPAT_USE_64BIT_TIME) | ||
210 | return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; | ||
211 | else | ||
212 | return put_compat_timespec(ts, uts); | ||
213 | } | ||
214 | EXPORT_SYMBOL_GPL(compat_put_timespec); | ||
215 | |||
164 | static long compat_nanosleep_restart(struct restart_block *restart) | 216 | static long compat_nanosleep_restart(struct restart_block *restart) |
165 | { | 217 | { |
166 | struct compat_timespec __user *rmtp; | 218 | struct compat_timespec __user *rmtp; |
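
The new compat_get_timeval()/compat_put_timeval() and compat_get_timespec()/compat_put_timespec() exports take an opaque user pointer and choose between the 64-bit-time layout and the classic compat layout internally, so callers no longer test COMPAT_USE_64BIT_TIME themselves. A hedged sketch of a caller (the ioctl handler and foo_set_deadline() are hypothetical):

/* Hypothetical compat ioctl path using the new helper. */
#include <linux/compat.h>
#include <linux/time.h>

extern long foo_set_deadline(const struct timespec *ts);        /* hypothetical consumer */

static long foo_compat_set_deadline(void __user *arg)
{
        struct timespec ts;

        if (compat_get_timespec(&ts, arg))      /* -EFAULT on bad user memory */
                return -EFAULT;

        return foo_set_deadline(&ts);
}
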
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..2382683617a3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
964 | { | 964 | { |
965 | bool need_loop; | 965 | bool need_loop; |
966 | 966 | ||
967 | repeat: | ||
968 | /* | 967 | /* |
969 | * Allow tasks that have access to memory reserves because they have | 968 | * Allow tasks that have access to memory reserves because they have |
970 | * been OOM killed to get memory anywhere. | 969 | * been OOM killed to get memory anywhere. |
@@ -983,45 +982,19 @@ repeat: | |||
983 | */ | 982 | */ |
984 | need_loop = task_has_mempolicy(tsk) || | 983 | need_loop = task_has_mempolicy(tsk) || |
985 | !nodes_intersects(*newmems, tsk->mems_allowed); | 984 | !nodes_intersects(*newmems, tsk->mems_allowed); |
986 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | ||
987 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); | ||
988 | 985 | ||
989 | /* | 986 | if (need_loop) |
990 | * ensure checking ->mems_allowed_change_disable after setting all new | 987 | write_seqcount_begin(&tsk->mems_allowed_seq); |
991 | * allowed nodes. | ||
992 | * | ||
993 | * the read-side task can see an nodemask with new allowed nodes and | ||
994 | * old allowed nodes. and if it allocates page when cpuset clears newly | ||
995 | * disallowed ones continuous, it can see the new allowed bits. | ||
996 | * | ||
997 | * And if setting all new allowed nodes is after the checking, setting | ||
998 | * all new allowed nodes and clearing newly disallowed ones will be done | ||
999 | * continuous, and the read-side task may find no node to alloc page. | ||
1000 | */ | ||
1001 | smp_mb(); | ||
1002 | |||
1003 | /* | ||
1004 | * Allocation of memory is very fast, we needn't sleep when waiting | ||
1005 | * for the read-side. | ||
1006 | */ | ||
1007 | while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { | ||
1008 | task_unlock(tsk); | ||
1009 | if (!task_curr(tsk)) | ||
1010 | yield(); | ||
1011 | goto repeat; | ||
1012 | } | ||
1013 | 988 | ||
1014 | /* | 989 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
1015 | * ensure checking ->mems_allowed_change_disable before clearing all new | 990 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); |
1016 | * disallowed nodes. | ||
1017 | * | ||
1018 | * if clearing newly disallowed bits before the checking, the read-side | ||
1019 | * task may find no node to alloc page. | ||
1020 | */ | ||
1021 | smp_mb(); | ||
1022 | 991 | ||
1023 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); | 992 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); |
1024 | tsk->mems_allowed = *newmems; | 993 | tsk->mems_allowed = *newmems; |
994 | |||
995 | if (need_loop) | ||
996 | write_seqcount_end(&tsk->mems_allowed_seq); | ||
997 | |||
1025 | task_unlock(tsk); | 998 | task_unlock(tsk); |
1026 | } | 999 | } |
1027 | 1000 | ||
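
The write_seqcount_begin()/write_seqcount_end() pair above replaces the old yield-and-retry handshake: the writer publishes the nodemask update under tsk->mems_allowed_seq (a field added elsewhere in this series), and readers retry if they raced with a rebind. The read side lives in the allocator paths in the full patch; its shape is roughly the sketch below, not quoted code.

/* Read-side sketch pairing with the seqcount write above. */
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/seqlock.h>

static void snapshot_mems_allowed(struct task_struct *tsk, nodemask_t *out)
{
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tsk->mems_allowed_seq);
                *out = tsk->mems_allowed;       /* consistent snapshot of the mask */
        } while (read_seqcount_retry(&tsk->mems_allowed_seq, seq));
}
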
@@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from; | |||
1399 | static nodemask_t cpuset_attach_nodemask_to; | 1372 | static nodemask_t cpuset_attach_nodemask_to; |
1400 | 1373 | ||
1401 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | 1374 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ |
1402 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 1375 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1403 | struct cgroup_taskset *tset) | ||
1404 | { | 1376 | { |
1405 | struct cpuset *cs = cgroup_cs(cgrp); | 1377 | struct cpuset *cs = cgroup_cs(cgrp); |
1406 | struct task_struct *task; | 1378 | struct task_struct *task; |
@@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
1436 | return 0; | 1408 | return 0; |
1437 | } | 1409 | } |
1438 | 1410 | ||
1439 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 1411 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1440 | struct cgroup_taskset *tset) | ||
1441 | { | 1412 | { |
1442 | struct mm_struct *mm; | 1413 | struct mm_struct *mm; |
1443 | struct task_struct *task; | 1414 | struct task_struct *task; |
@@ -1794,28 +1765,17 @@ static struct cftype files[] = { | |||
1794 | .write_u64 = cpuset_write_u64, | 1765 | .write_u64 = cpuset_write_u64, |
1795 | .private = FILE_SPREAD_SLAB, | 1766 | .private = FILE_SPREAD_SLAB, |
1796 | }, | 1767 | }, |
1797 | }; | ||
1798 | 1768 | ||
1799 | static struct cftype cft_memory_pressure_enabled = { | 1769 | { |
1800 | .name = "memory_pressure_enabled", | 1770 | .name = "memory_pressure_enabled", |
1801 | .read_u64 = cpuset_read_u64, | 1771 | .flags = CFTYPE_ONLY_ON_ROOT, |
1802 | .write_u64 = cpuset_write_u64, | 1772 | .read_u64 = cpuset_read_u64, |
1803 | .private = FILE_MEMORY_PRESSURE_ENABLED, | 1773 | .write_u64 = cpuset_write_u64, |
1804 | }; | 1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, |
1805 | 1775 | }, | |
1806 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
1807 | { | ||
1808 | int err; | ||
1809 | 1776 | ||
1810 | err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 1777 | { } /* terminate */ |
1811 | if (err) | 1778 | }; |
1812 | return err; | ||
1813 | /* memory_pressure_enabled is in root cpuset only */ | ||
1814 | if (!cont->parent) | ||
1815 | err = cgroup_add_file(cont, ss, | ||
1816 | &cft_memory_pressure_enabled); | ||
1817 | return err; | ||
1818 | } | ||
1819 | 1779 | ||
1820 | /* | 1780 | /* |
1821 | * post_clone() is called during cgroup_create() when the | 1781 | * post_clone() is called during cgroup_create() when the |
@@ -1833,8 +1793,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1833 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex | 1793 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex |
1834 | * held. | 1794 | * held. |
1835 | */ | 1795 | */ |
1836 | static void cpuset_post_clone(struct cgroup_subsys *ss, | 1796 | static void cpuset_post_clone(struct cgroup *cgroup) |
1837 | struct cgroup *cgroup) | ||
1838 | { | 1797 | { |
1839 | struct cgroup *parent, *child; | 1798 | struct cgroup *parent, *child; |
1840 | struct cpuset *cs, *parent_cs; | 1799 | struct cpuset *cs, *parent_cs; |
@@ -1857,13 +1816,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, | |||
1857 | 1816 | ||
1858 | /* | 1817 | /* |
1859 | * cpuset_create - create a cpuset | 1818 | * cpuset_create - create a cpuset |
1860 | * ss: cpuset cgroup subsystem | ||
1861 | * cont: control group that the new cpuset will be part of | 1819 | * cont: control group that the new cpuset will be part of |
1862 | */ | 1820 | */ |
1863 | 1821 | ||
1864 | static struct cgroup_subsys_state *cpuset_create( | 1822 | static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) |
1865 | struct cgroup_subsys *ss, | ||
1866 | struct cgroup *cont) | ||
1867 | { | 1823 | { |
1868 | struct cpuset *cs; | 1824 | struct cpuset *cs; |
1869 | struct cpuset *parent; | 1825 | struct cpuset *parent; |
@@ -1902,7 +1858,7 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1902 | * will call async_rebuild_sched_domains(). | 1858 | * will call async_rebuild_sched_domains(). |
1903 | */ | 1859 | */ |
1904 | 1860 | ||
1905 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1861 | static void cpuset_destroy(struct cgroup *cont) |
1906 | { | 1862 | { |
1907 | struct cpuset *cs = cgroup_cs(cont); | 1863 | struct cpuset *cs = cgroup_cs(cont); |
1908 | 1864 | ||
@@ -1920,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
1920 | .destroy = cpuset_destroy, | 1876 | .destroy = cpuset_destroy, |
1921 | .can_attach = cpuset_can_attach, | 1877 | .can_attach = cpuset_can_attach, |
1922 | .attach = cpuset_attach, | 1878 | .attach = cpuset_attach, |
1923 | .populate = cpuset_populate, | ||
1924 | .post_clone = cpuset_post_clone, | 1879 | .post_clone = cpuset_post_clone, |
1925 | .subsys_id = cpuset_subsys_id, | 1880 | .subsys_id = cpuset_subsys_id, |
1881 | .base_cftypes = files, | ||
1926 | .early_init = 1, | 1882 | .early_init = 1, |
1927 | }; | 1883 | }; |
1928 | 1884 | ||
@@ -2195,10 +2151,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | |||
2195 | mutex_unlock(&callback_mutex); | 2151 | mutex_unlock(&callback_mutex); |
2196 | } | 2152 | } |
2197 | 2153 | ||
2198 | int cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2154 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
2199 | { | 2155 | { |
2200 | const struct cpuset *cs; | 2156 | const struct cpuset *cs; |
2201 | int cpu; | ||
2202 | 2157 | ||
2203 | rcu_read_lock(); | 2158 | rcu_read_lock(); |
2204 | cs = task_cs(tsk); | 2159 | cs = task_cs(tsk); |
@@ -2219,22 +2174,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) | |||
2219 | * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily | 2174 | * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily |
2220 | * set any mask even if it is not right from task_cs() pov, | 2175 | * set any mask even if it is not right from task_cs() pov, |
2221 | * the pending set_cpus_allowed_ptr() will fix things. | 2176 | * the pending set_cpus_allowed_ptr() will fix things. |
2177 | * | ||
2178 | * select_fallback_rq() will fix things up and set cpu_possible_mask | ||
2179 | * if required. | ||
2222 | */ | 2180 | */ |
2223 | |||
2224 | cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); | ||
2225 | if (cpu >= nr_cpu_ids) { | ||
2226 | /* | ||
2227 | * Either tsk->cpus_allowed is wrong (see above) or it | ||
2228 | * is actually empty. The latter case is only possible | ||
2229 | * if we are racing with remove_tasks_in_empty_cpuset(). | ||
2230 | * Like above we can temporary set any mask and rely on | ||
2231 | * set_cpus_allowed_ptr() as synchronization point. | ||
2232 | */ | ||
2233 | do_set_cpus_allowed(tsk, cpu_possible_mask); | ||
2234 | cpu = cpumask_any(cpu_active_mask); | ||
2235 | } | ||
2236 | |||
2237 | return cpu; | ||
2238 | } | 2181 | } |
2239 | 2182 | ||
2240 | void cpuset_init_current_mems_allowed(void) | 2183 | void cpuset_init_current_mems_allowed(void) |
diff --git a/kernel/cred.c b/kernel/cred.c index 5791612a4045..97b36eeca4c9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/keyctl.h> | 16 | #include <linux/keyctl.h> |
17 | #include <linux/init_task.h> | 17 | #include <linux/init_task.h> |
18 | #include <linux/security.h> | 18 | #include <linux/security.h> |
19 | #include <linux/binfmts.h> | ||
19 | #include <linux/cn_proc.h> | 20 | #include <linux/cn_proc.h> |
20 | 21 | ||
21 | #if 0 | 22 | #if 0 |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0d7c08784efb..1dc53bae56e1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/delay.h> | 41 | #include <linux/delay.h> |
42 | #include <linux/sched.h> | 42 | #include <linux/sched.h> |
43 | #include <linux/sysrq.h> | 43 | #include <linux/sysrq.h> |
44 | #include <linux/reboot.h> | ||
44 | #include <linux/init.h> | 45 | #include <linux/init.h> |
45 | #include <linux/kgdb.h> | 46 | #include <linux/kgdb.h> |
46 | #include <linux/kdb.h> | 47 | #include <linux/kdb.h> |
@@ -52,7 +53,6 @@ | |||
52 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
53 | #include <asm/byteorder.h> | 54 | #include <asm/byteorder.h> |
54 | #include <linux/atomic.h> | 55 | #include <linux/atomic.h> |
55 | #include <asm/system.h> | ||
56 | 56 | ||
57 | #include "debug_core.h" | 57 | #include "debug_core.h" |
58 | 58 | ||
@@ -75,6 +75,8 @@ static int exception_level; | |||
75 | struct kgdb_io *dbg_io_ops; | 75 | struct kgdb_io *dbg_io_ops; |
76 | static DEFINE_SPINLOCK(kgdb_registration_lock); | 76 | static DEFINE_SPINLOCK(kgdb_registration_lock); |
77 | 77 | ||
78 | /* Action for the reboot notifier, a global so kdb can change it */ | ||
79 | static int kgdbreboot; | ||
78 | /* kgdb console driver is loaded */ | 80 | /* kgdb console driver is loaded */ |
79 | static int kgdb_con_registered; | 81 | static int kgdb_con_registered; |
80 | /* determine if kgdb console output should be used */ | 82 | /* determine if kgdb console output should be used */ |
@@ -96,6 +98,7 @@ static int __init opt_kgdb_con(char *str) | |||
96 | early_param("kgdbcon", opt_kgdb_con); | 98 | early_param("kgdbcon", opt_kgdb_con); |
97 | 99 | ||
98 | module_param(kgdb_use_con, int, 0644); | 100 | module_param(kgdb_use_con, int, 0644); |
101 | module_param(kgdbreboot, int, 0644); | ||
99 | 102 | ||
100 | /* | 103 | /* |
101 | * Holds information about breakpoints in a kernel. These breakpoints are | 104 | * Holds information about breakpoints in a kernel. These breakpoints are |
@@ -784,6 +787,33 @@ void __init dbg_late_init(void) | |||
784 | kdb_init(KDB_INIT_FULL); | 787 | kdb_init(KDB_INIT_FULL); |
785 | } | 788 | } |
786 | 789 | ||
790 | static int | ||
791 | dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) | ||
792 | { | ||
793 | /* | ||
794 | * Take the following action on reboot notify depending on value: | ||
795 | * 1 == Enter debugger | ||
796 | * 0 == [the default] detach debug client | ||
797 | * -1 == Do nothing... and use this until the board resets | ||
798 | */ | ||
799 | switch (kgdbreboot) { | ||
800 | case 1: | ||
801 | kgdb_breakpoint(); | ||
802 | case -1: | ||
803 | goto done; | ||
804 | } | ||
805 | if (!dbg_kdb_mode) | ||
806 | gdbstub_exit(code); | ||
807 | done: | ||
808 | return NOTIFY_DONE; | ||
809 | } | ||
810 | |||
811 | static struct notifier_block dbg_reboot_notifier = { | ||
812 | .notifier_call = dbg_notify_reboot, | ||
813 | .next = NULL, | ||
814 | .priority = INT_MAX, | ||
815 | }; | ||
816 | |||
787 | static void kgdb_register_callbacks(void) | 817 | static void kgdb_register_callbacks(void) |
788 | { | 818 | { |
789 | if (!kgdb_io_module_registered) { | 819 | if (!kgdb_io_module_registered) { |
@@ -791,6 +821,7 @@ static void kgdb_register_callbacks(void) | |||
791 | kgdb_arch_init(); | 821 | kgdb_arch_init(); |
792 | if (!dbg_is_early) | 822 | if (!dbg_is_early) |
793 | kgdb_arch_late(); | 823 | kgdb_arch_late(); |
824 | register_reboot_notifier(&dbg_reboot_notifier); | ||
794 | atomic_notifier_chain_register(&panic_notifier_list, | 825 | atomic_notifier_chain_register(&panic_notifier_list, |
795 | &kgdb_panic_event_nb); | 826 | &kgdb_panic_event_nb); |
796 | #ifdef CONFIG_MAGIC_SYSRQ | 827 | #ifdef CONFIG_MAGIC_SYSRQ |
@@ -812,6 +843,7 @@ static void kgdb_unregister_callbacks(void) | |||
812 | */ | 843 | */ |
813 | if (kgdb_io_module_registered) { | 844 | if (kgdb_io_module_registered) { |
814 | kgdb_io_module_registered = 0; | 845 | kgdb_io_module_registered = 0; |
846 | unregister_reboot_notifier(&dbg_reboot_notifier); | ||
815 | atomic_notifier_chain_unregister(&panic_notifier_list, | 847 | atomic_notifier_chain_unregister(&panic_notifier_list, |
816 | &kgdb_panic_event_nb); | 848 | &kgdb_panic_event_nb); |
817 | kgdb_arch_exit(); | 849 | kgdb_arch_exit(); |
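
dbg_notify_reboot() above is a plain reboot notifier; the new kgdbreboot module parameter selects whether the debugger breaks in, detaches, or does nothing when the machine reboots. Stripped of the kgdb specifics, the notifier boilerplate it relies on looks like this (names here are illustrative):

/* Generic reboot-notifier sketch. */
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_cb(struct notifier_block *nb, unsigned long code, void *unused)
{
        /* code is e.g. SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call = my_reboot_cb,
};

/* register_reboot_notifier(&my_reboot_nb) at init,
 * unregister_reboot_notifier(&my_reboot_nb) on teardown. */
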
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index c22d8c28ad84..ce615e064482 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status) | |||
1111 | unsigned char checksum, ch, buffer[3]; | 1111 | unsigned char checksum, ch, buffer[3]; |
1112 | int loop; | 1112 | int loop; |
1113 | 1113 | ||
1114 | if (!kgdb_connected) | ||
1115 | return; | ||
1116 | kgdb_connected = 0; | ||
1117 | |||
1118 | if (!dbg_io_ops || dbg_kdb_mode) | ||
1119 | return; | ||
1120 | |||
1114 | buffer[0] = 'W'; | 1121 | buffer[0] = 'W'; |
1115 | buffer[1] = hex_asc_hi(status); | 1122 | buffer[1] = hex_asc_hi(status); |
1116 | buffer[2] = hex_asc_lo(status); | 1123 | buffer[2] = hex_asc_lo(status); |
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status) | |||
1129 | dbg_io_ops->write_char(hex_asc_lo(checksum)); | 1136 | dbg_io_ops->write_char(hex_asc_lo(checksum)); |
1130 | 1137 | ||
1131 | /* make sure the output is flushed, lest the bootloader clobber it */ | 1138 | /* make sure the output is flushed, lest the bootloader clobber it */ |
1132 | dbg_io_ops->flush(); | 1139 | if (dbg_io_ops->flush) |
1140 | dbg_io_ops->flush(); | ||
1133 | } | 1141 | } |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 20059ef4459a..8418c2f8ec5d 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) | |||
153 | } else { | 153 | } else { |
154 | kdb_printf("%s: failed to set breakpoint at 0x%lx\n", | 154 | kdb_printf("%s: failed to set breakpoint at 0x%lx\n", |
155 | __func__, bp->bp_addr); | 155 | __func__, bp->bp_addr); |
156 | #ifdef CONFIG_DEBUG_RODATA | ||
157 | if (!bp->bp_type) { | ||
158 | kdb_printf("Software breakpoints are unavailable.\n" | ||
159 | " Change the kernel CONFIG_DEBUG_RODATA=n\n" | ||
160 | " OR use hw breaks: help bph\n"); | ||
161 | } | ||
162 | #endif | ||
156 | return 1; | 163 | return 1; |
157 | } | 164 | } |
158 | return 0; | 165 | return 0; |
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7179eac7b41c..07c9bbb94a0b 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/kdb.h> | 16 | #include <linux/kdb.h> |
17 | #include <linux/nmi.h> | 17 | #include <linux/nmi.h> |
18 | #include <asm/system.h> | ||
19 | #include "kdb_private.h" | 18 | #include "kdb_private.h" |
20 | 19 | ||
21 | 20 | ||
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 4802eb5840e1..9b5f17da1c56 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -689,7 +689,7 @@ kdb_printit: | |||
689 | if (!dbg_kdb_mode && kgdb_connected) { | 689 | if (!dbg_kdb_mode && kgdb_connected) { |
690 | gdbstub_msg_write(kdb_buffer, retlen); | 690 | gdbstub_msg_write(kdb_buffer, retlen); |
691 | } else { | 691 | } else { |
692 | if (!dbg_io_ops->is_console) { | 692 | if (dbg_io_ops && !dbg_io_ops->is_console) { |
693 | len = strlen(kdb_buffer); | 693 | len = strlen(kdb_buffer); |
694 | cp = kdb_buffer; | 694 | cp = kdb_buffer; |
695 | while (len--) { | 695 | while (len--) { |
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 4bca634975c0..118527aa60ea 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ | 25 | #define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ |
26 | 26 | ||
27 | static int kbd_exists; | 27 | static int kbd_exists; |
28 | static int kbd_last_ret; | ||
28 | 29 | ||
29 | /* | 30 | /* |
30 | * Check if the keyboard controller has a keypress for us. | 31 | * Check if the keyboard controller has a keypress for us. |
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void) | |||
90 | return -1; | 91 | return -1; |
91 | } | 92 | } |
92 | 93 | ||
93 | if ((scancode & 0x80) != 0) | 94 | if ((scancode & 0x80) != 0) { |
95 | if (scancode == 0x9c) | ||
96 | kbd_last_ret = 0; | ||
94 | return -1; | 97 | return -1; |
98 | } | ||
95 | 99 | ||
96 | scancode &= 0x7f; | 100 | scancode &= 0x7f; |
97 | 101 | ||
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void) | |||
178 | return -1; /* ignore unprintables */ | 182 | return -1; /* ignore unprintables */ |
179 | } | 183 | } |
180 | 184 | ||
181 | if ((scancode & 0x7f) == 0x1c) { | 185 | if (scancode == 0x1c) { |
182 | /* | 186 | kbd_last_ret = 1; |
183 | * enter key. All done. Absorb the release scancode. | 187 | return 13; |
184 | */ | 188 | } |
189 | |||
190 | return keychar & 0xff; | ||
191 | } | ||
192 | EXPORT_SYMBOL_GPL(kdb_get_kbd_char); | ||
193 | |||
194 | /* | ||
195 | * Best effort cleanup of ENTER break codes on leaving KDB. Called on | ||
196 | * exiting KDB, when we know we processed an ENTER or KP ENTER scan | ||
197 | * code. | ||
198 | */ | ||
199 | void kdb_kbd_cleanup_state(void) | ||
200 | { | ||
201 | int scancode, scanstatus; | ||
202 | |||
203 | /* | ||
204 | * Nothing to clean up, since either | ||
205 | * ENTER was never pressed, or has already | ||
206 | * gotten cleaned up. | ||
207 | */ | ||
208 | if (!kbd_last_ret) | ||
209 | return; | ||
210 | |||
211 | kbd_last_ret = 0; | ||
212 | /* | ||
213 | * Enter key. Need to absorb the break code here, lest it gets | ||
214 | * leaked out if we exit KDB as the result of processing 'g'. | ||
215 | * | ||
216 | * This has several interesting implications: | ||
217 | * + Need to handle KP ENTER, which has break code 0xe0 0x9c. | ||
218 | * + Need to handle repeat ENTER and repeat KP ENTER. Repeats | ||
219 | * only get a break code at the end of the repeated | ||
220 | * sequence. This means we can't propagate the repeated key | ||
221 | * press, and must swallow it away. | ||
222 | * + Need to handle possible PS/2 mouse input. | ||
223 | * + Need to handle mashed keys. | ||
224 | */ | ||
225 | |||
226 | while (1) { | ||
185 | while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) | 227 | while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) |
186 | ; | 228 | cpu_relax(); |
187 | 229 | ||
188 | /* | 230 | /* |
189 | * Fetch the scancode | 231 | * Fetch the scancode. |
190 | */ | 232 | */ |
191 | scancode = inb(KBD_DATA_REG); | 233 | scancode = inb(KBD_DATA_REG); |
192 | scanstatus = inb(KBD_STATUS_REG); | 234 | scanstatus = inb(KBD_STATUS_REG); |
193 | 235 | ||
194 | while (scanstatus & KBD_STAT_MOUSE_OBF) { | 236 | /* |
195 | scancode = inb(KBD_DATA_REG); | 237 | * Skip mouse input. |
196 | scanstatus = inb(KBD_STATUS_REG); | 238 | */ |
197 | } | 239 | if (scanstatus & KBD_STAT_MOUSE_OBF) |
240 | continue; | ||
198 | 241 | ||
199 | if (scancode != 0x9c) { | 242 | /* |
200 | /* | 243 | * If we see 0xe0, this is either a break code for KP |
201 | * Wasn't an enter-release, why not? | 244 | * ENTER, or a repeat make for KP ENTER. Either way, |
202 | */ | 245 | * since the second byte is equivalent to an ENTER, |
203 | kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", | 246 | * skip the 0xe0 and try again. |
204 | scancode, scanstatus); | 247 | * |
205 | } | 248 | * If we see 0x1c, this must be a repeat ENTER or KP |
249 | * ENTER (and we swallowed 0xe0 before). Try again. | ||
250 | * | ||
251 | * We can also see make and break codes for other keys | ||
252 | * mashed before or after pressing ENTER. Thus, if we | ||
253 | * see anything other than 0x9c, we have to try again. | ||
254 | * | ||
255 | * Note, if you held some key as ENTER was depressed, | ||
256 | * that break code would get leaked out. | ||
257 | */ | ||
258 | if (scancode != 0x9c) | ||
259 | continue; | ||
206 | 260 | ||
207 | return 13; | 261 | return; |
208 | } | 262 | } |
209 | |||
210 | return keychar & 0xff; | ||
211 | } | 263 | } |
212 | EXPORT_SYMBOL_GPL(kdb_get_kbd_char); | ||
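[Editor's note] The kdb_keyboard.c change above drains the ENTER break code (0x9c, or 0xe0 0x9c for KP ENTER) before leaving KDB so it does not leak into the restarted console. Below is a minimal, hedged userspace model of that drain loop; read_scancode() and the scripted byte stream are stand-ins for the real inb(KBD_DATA_REG)/inb(KBD_STATUS_REG) polling and are not part of any kernel API.

/*
 * Minimal userspace model of the drain loop above: keep reading scancodes
 * and ignore everything (0xe0 prefixes, key repeats, mashed keys) until
 * the ENTER break code 0x9c shows up.  read_scancode() is a stand-in for
 * the port I/O polling done in the real code.
 */
#include <stdio.h>

static const unsigned char stream[] = { 0xe0, 0x1c, 0x2a, 0xaa, 0xe0, 0x9c };
static unsigned int pos;

static int read_scancode(void)
{
	return pos < sizeof(stream) ? stream[pos++] : -1;
}

static void drain_enter_break(void)
{
	int scancode;

	while ((scancode = read_scancode()) >= 0) {
		if (scancode != 0x9c)
			continue;	/* prefix, repeat or unrelated make/break code */
		printf("absorbed ENTER break code (0x9c)\n");
		return;
	}
}

int main(void)
{
	drain_enter_break();
	return 0;
}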
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e2ae7349437f..67b847dfa2bb 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, | |||
1400 | if (KDB_STATE(DOING_SS)) | 1400 | if (KDB_STATE(DOING_SS)) |
1401 | KDB_STATE_CLEAR(SSBPT); | 1401 | KDB_STATE_CLEAR(SSBPT); |
1402 | 1402 | ||
1403 | /* Clean up any keyboard devices before leaving */ | ||
1404 | kdb_kbd_cleanup_state(); | ||
1405 | |||
1403 | return result; | 1406 | return result; |
1404 | } | 1407 | } |
1405 | 1408 | ||
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index e381d105b40b..47c4e56e513b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -246,6 +246,13 @@ extern void debug_kusage(void); | |||
246 | 246 | ||
247 | extern void kdb_set_current_task(struct task_struct *); | 247 | extern void kdb_set_current_task(struct task_struct *); |
248 | extern struct task_struct *kdb_current_task; | 248 | extern struct task_struct *kdb_current_task; |
249 | |||
250 | #ifdef CONFIG_KDB_KEYBOARD | ||
251 | extern void kdb_kbd_cleanup_state(void); | ||
252 | #else /* ! CONFIG_KDB_KEYBOARD */ | ||
253 | #define kdb_kbd_cleanup_state() | ||
254 | #endif /* ! CONFIG_KDB_KEYBOARD */ | ||
255 | |||
249 | #ifdef CONFIG_MODULES | 256 | #ifdef CONFIG_MODULES |
250 | extern struct list_head *kdb_modules; | 257 | extern struct list_head *kdb_modules; |
251 | #endif /* CONFIG_MODULES */ | 258 | #endif /* CONFIG_MODULES */ |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..d35cc2d3a4cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size) | |||
384 | if (!pfn_valid(pfn)) | 384 | if (!pfn_valid(pfn)) |
385 | return 1; | 385 | return 1; |
386 | page = pfn_to_page(pfn); | 386 | page = pfn_to_page(pfn); |
387 | vaddr = kmap_atomic(page, KM_KDB); | 387 | vaddr = kmap_atomic(page); |
388 | memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); | 388 | memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); |
389 | kunmap_atomic(vaddr, KM_KDB); | 389 | kunmap_atomic(vaddr); |
390 | 390 | ||
391 | return 0; | 391 | return 0; |
392 | } | 392 | } |
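[Editor's note] The kdb_support.c hunk is part of the tree-wide removal of the KM_type argument from kmap_atomic(). A kernel-side sketch of the updated calling convention follows; it only builds inside a kernel tree, and copy_from_pfn() is a hypothetical helper used for illustration, not an existing interface.

/*
 * Sketch of the post-KM_type kmap_atomic() idiom used above: the mapping
 * slot is managed implicitly, so callers only pass the page.
 */
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

static int copy_from_pfn(void *dst, unsigned long pfn, unsigned long off,
			 size_t len)
{
	struct page *page;
	char *vaddr;

	if (!pfn_valid(pfn))
		return -EFAULT;

	page = pfn_to_page(pfn);
	vaddr = kmap_atomic(page);	/* was: kmap_atomic(page, KM_KDB) */
	memcpy(dst, vaddr + off, len);
	kunmap_atomic(vaddr);		/* was: kunmap_atomic(vaddr, KM_KDB) */
	return 0;
}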
diff --git a/kernel/dma.c b/kernel/dma.c index 68a2306522c8..6c6262f86c17 100644 --- a/kernel/dma.c +++ b/kernel/dma.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_fs.h> |
19 | #include <linux/init.h> | 19 | #include <linux/init.h> |
20 | #include <asm/dma.h> | 20 | #include <asm/dma.h> |
21 | #include <asm/system.h> | ||
22 | 21 | ||
23 | 22 | ||
24 | 23 | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c index 1b5c081d8b9f..a6a9ec4cd8f5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | |||
118 | PERF_FLAG_FD_OUTPUT |\ | 118 | PERF_FLAG_FD_OUTPUT |\ |
119 | PERF_FLAG_PID_CGROUP) | 119 | PERF_FLAG_PID_CGROUP) |
120 | 120 | ||
121 | /* | ||
122 | * branch priv levels that need permission checks | ||
123 | */ | ||
124 | #define PERF_SAMPLE_BRANCH_PERM_PLM \ | ||
125 | (PERF_SAMPLE_BRANCH_KERNEL |\ | ||
126 | PERF_SAMPLE_BRANCH_HV) | ||
127 | |||
121 | enum event_type_t { | 128 | enum event_type_t { |
122 | EVENT_FLEXIBLE = 0x1, | 129 | EVENT_FLEXIBLE = 0x1, |
123 | EVENT_PINNED = 0x2, | 130 | EVENT_PINNED = 0x2, |
@@ -128,8 +135,9 @@ enum event_type_t { | |||
128 | * perf_sched_events : >0 events exist | 135 | * perf_sched_events : >0 events exist |
129 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | 136 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu |
130 | */ | 137 | */ |
131 | struct jump_label_key_deferred perf_sched_events __read_mostly; | 138 | struct static_key_deferred perf_sched_events __read_mostly; |
132 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 139 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
140 | static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | ||
133 | 141 | ||
134 | static atomic_t nr_mmap_events __read_mostly; | 142 | static atomic_t nr_mmap_events __read_mostly; |
135 | static atomic_t nr_comm_events __read_mostly; | 143 | static atomic_t nr_comm_events __read_mostly; |
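[Editor's note] Several hunks in this file are the mechanical jump_label to static_key rename: struct jump_label_key becomes struct static_key, and jump_label_inc()/jump_label_dec() become static_key_slow_inc()/static_key_slow_dec(). A kernel-side sketch of the renamed API as it is used here follows; the key name and the functions around it are illustrative only.

/*
 * Sketch of the renamed static-key API used throughout this file: a
 * false-initialized key keeps the fast path at near-zero cost until the
 * first user bumps it.
 */
#include <linux/jump_label.h>

static struct static_key my_feature_enabled = STATIC_KEY_INIT_FALSE;

static void do_slow_accounting(void)
{
	/* hypothetical slow path, patched in only when the key is enabled */
}

static inline void my_fast_path(void)
{
	if (static_key_false(&my_feature_enabled))
		do_slow_accounting();
}

static void my_feature_get(void)
{
	static_key_slow_inc(&my_feature_enabled);	/* was jump_label_inc() */
}

static void my_feature_put(void)
{
	static_key_slow_dec(&my_feature_enabled);	/* was jump_label_dec() */
}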
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
881 | if (is_cgroup_event(event)) | 889 | if (is_cgroup_event(event)) |
882 | ctx->nr_cgroups++; | 890 | ctx->nr_cgroups++; |
883 | 891 | ||
892 | if (has_branch_stack(event)) | ||
893 | ctx->nr_branch_stack++; | ||
894 | |||
884 | list_add_rcu(&event->event_entry, &ctx->event_list); | 895 | list_add_rcu(&event->event_entry, &ctx->event_list); |
885 | if (!ctx->nr_events) | 896 | if (!ctx->nr_events) |
886 | perf_pmu_rotate_start(ctx->pmu); | 897 | perf_pmu_rotate_start(ctx->pmu); |
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1020 | cpuctx->cgrp = NULL; | 1031 | cpuctx->cgrp = NULL; |
1021 | } | 1032 | } |
1022 | 1033 | ||
1034 | if (has_branch_stack(event)) | ||
1035 | ctx->nr_branch_stack--; | ||
1036 | |||
1023 | ctx->nr_events--; | 1037 | ctx->nr_events--; |
1024 | if (event->attr.inherit_stat) | 1038 | if (event->attr.inherit_stat) |
1025 | ctx->nr_stat--; | 1039 | ctx->nr_stat--; |
@@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2195 | } | 2209 | } |
2196 | 2210 | ||
2197 | /* | 2211 | /* |
2212 | * When sampling the branch stack in system-wide mode, it may be necessary | ||
2213 | * to flush the stack on context switch. This happens when the branch | ||
2214 | * stack does not tag its entries with the pid of the current task. | ||
2215 | * Otherwise it becomes impossible to associate a branch entry with a | ||
2216 | * task. This ambiguity is more likely to appear when the branch stack | ||
2217 | * supports priv level filtering and the user sets it to monitor only | ||
2218 | * at the user level (which could be a useful measurement in system-wide | ||
2219 | * mode). In that case, the risk is high of having a branch stack with | ||
2220 | * branch from multiple tasks. Flushing may mean dropping the existing | ||
2221 | * entries or stashing them somewhere in the PMU specific code layer. | ||
2222 | * | ||
2223 | * This function provides the context switch callback to the lower code | ||
2224 | * layer. It is invoked ONLY when there is at least one system-wide context | ||
2225 | * with at least one active event using taken branch sampling. | ||
2226 | */ | ||
2227 | static void perf_branch_stack_sched_in(struct task_struct *prev, | ||
2228 | struct task_struct *task) | ||
2229 | { | ||
2230 | struct perf_cpu_context *cpuctx; | ||
2231 | struct pmu *pmu; | ||
2232 | unsigned long flags; | ||
2233 | |||
2234 | /* no need to flush branch stack if not changing task */ | ||
2235 | if (prev == task) | ||
2236 | return; | ||
2237 | |||
2238 | local_irq_save(flags); | ||
2239 | |||
2240 | rcu_read_lock(); | ||
2241 | |||
2242 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
2243 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
2244 | |||
2245 | /* | ||
2246 | * check if the context has at least one | ||
2247 | * event using PERF_SAMPLE_BRANCH_STACK | ||
2248 | */ | ||
2249 | if (cpuctx->ctx.nr_branch_stack > 0 | ||
2250 | && pmu->flush_branch_stack) { | ||
2251 | |||
2252 | pmu = cpuctx->ctx.pmu; | ||
2253 | |||
2254 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
2255 | |||
2256 | perf_pmu_disable(pmu); | ||
2257 | |||
2258 | pmu->flush_branch_stack(); | ||
2259 | |||
2260 | perf_pmu_enable(pmu); | ||
2261 | |||
2262 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2263 | } | ||
2264 | } | ||
2265 | |||
2266 | rcu_read_unlock(); | ||
2267 | |||
2268 | local_irq_restore(flags); | ||
2269 | } | ||
2270 | |||
2271 | /* | ||
2198 | * Called from scheduler to add the events of the current task | 2272 | * Called from scheduler to add the events of the current task |
2199 | * with interrupts disabled. | 2273 | * with interrupts disabled. |
2200 | * | 2274 | * |
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
2225 | */ | 2299 | */ |
2226 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | 2300 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) |
2227 | perf_cgroup_sched_in(prev, task); | 2301 | perf_cgroup_sched_in(prev, task); |
2302 | |||
2303 | /* check for system-wide branch_stack events */ | ||
2304 | if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) | ||
2305 | perf_branch_stack_sched_in(prev, task); | ||
2228 | } | 2306 | } |
2229 | 2307 | ||
2230 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2308 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event) | |||
2778 | 2856 | ||
2779 | if (!event->parent) { | 2857 | if (!event->parent) { |
2780 | if (event->attach_state & PERF_ATTACH_TASK) | 2858 | if (event->attach_state & PERF_ATTACH_TASK) |
2781 | jump_label_dec_deferred(&perf_sched_events); | 2859 | static_key_slow_dec_deferred(&perf_sched_events); |
2782 | if (event->attr.mmap || event->attr.mmap_data) | 2860 | if (event->attr.mmap || event->attr.mmap_data) |
2783 | atomic_dec(&nr_mmap_events); | 2861 | atomic_dec(&nr_mmap_events); |
2784 | if (event->attr.comm) | 2862 | if (event->attr.comm) |
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event) | |||
2789 | put_callchain_buffers(); | 2867 | put_callchain_buffers(); |
2790 | if (is_cgroup_event(event)) { | 2868 | if (is_cgroup_event(event)) { |
2791 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | 2869 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); |
2792 | jump_label_dec_deferred(&perf_sched_events); | 2870 | static_key_slow_dec_deferred(&perf_sched_events); |
2871 | } | ||
2872 | |||
2873 | if (has_branch_stack(event)) { | ||
2874 | static_key_slow_dec_deferred(&perf_sched_events); | ||
2875 | /* is system-wide event */ | ||
2876 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
2877 | atomic_dec(&per_cpu(perf_branch_stack_events, | ||
2878 | event->cpu)); | ||
2793 | } | 2879 | } |
2794 | } | 2880 | } |
2795 | 2881 | ||
@@ -3238,10 +3324,6 @@ int perf_event_task_disable(void) | |||
3238 | return 0; | 3324 | return 0; |
3239 | } | 3325 | } |
3240 | 3326 | ||
3241 | #ifndef PERF_EVENT_INDEX_OFFSET | ||
3242 | # define PERF_EVENT_INDEX_OFFSET 0 | ||
3243 | #endif | ||
3244 | |||
3245 | static int perf_event_index(struct perf_event *event) | 3327 | static int perf_event_index(struct perf_event *event) |
3246 | { | 3328 | { |
3247 | if (event->hw.state & PERF_HES_STOPPED) | 3329 | if (event->hw.state & PERF_HES_STOPPED) |
@@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event) | |||
3250 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 3332 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
3251 | return 0; | 3333 | return 0; |
3252 | 3334 | ||
3253 | return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; | 3335 | return event->pmu->event_idx(event); |
3254 | } | 3336 | } |
3255 | 3337 | ||
3256 | static void calc_timer_values(struct perf_event *event, | 3338 | static void calc_timer_values(struct perf_event *event, |
3339 | u64 *now, | ||
3257 | u64 *enabled, | 3340 | u64 *enabled, |
3258 | u64 *running) | 3341 | u64 *running) |
3259 | { | 3342 | { |
3260 | u64 now, ctx_time; | 3343 | u64 ctx_time; |
3261 | 3344 | ||
3262 | now = perf_clock(); | 3345 | *now = perf_clock(); |
3263 | ctx_time = event->shadow_ctx_time + now; | 3346 | ctx_time = event->shadow_ctx_time + *now; |
3264 | *enabled = ctx_time - event->tstamp_enabled; | 3347 | *enabled = ctx_time - event->tstamp_enabled; |
3265 | *running = ctx_time - event->tstamp_running; | 3348 | *running = ctx_time - event->tstamp_running; |
3266 | } | 3349 | } |
3267 | 3350 | ||
3351 | void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) | ||
3352 | { | ||
3353 | } | ||
3354 | |||
3268 | /* | 3355 | /* |
3269 | * Callers need to ensure there can be no nesting of this function, otherwise | 3356 | * Callers need to ensure there can be no nesting of this function, otherwise |
3270 | * the seqlock logic goes bad. We can not serialize this because the arch | 3357 | * the seqlock logic goes bad. We can not serialize this because the arch |
@@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3274 | { | 3361 | { |
3275 | struct perf_event_mmap_page *userpg; | 3362 | struct perf_event_mmap_page *userpg; |
3276 | struct ring_buffer *rb; | 3363 | struct ring_buffer *rb; |
3277 | u64 enabled, running; | 3364 | u64 enabled, running, now; |
3278 | 3365 | ||
3279 | rcu_read_lock(); | 3366 | rcu_read_lock(); |
3280 | /* | 3367 | /* |
@@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3286 | * because of locking issue as we can be called in | 3373 | * because of locking issue as we can be called in |
3287 | * NMI context | 3374 | * NMI context |
3288 | */ | 3375 | */ |
3289 | calc_timer_values(event, &enabled, &running); | 3376 | calc_timer_values(event, &now, &enabled, &running); |
3290 | rb = rcu_dereference(event->rb); | 3377 | rb = rcu_dereference(event->rb); |
3291 | if (!rb) | 3378 | if (!rb) |
3292 | goto unlock; | 3379 | goto unlock; |
@@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3302 | barrier(); | 3389 | barrier(); |
3303 | userpg->index = perf_event_index(event); | 3390 | userpg->index = perf_event_index(event); |
3304 | userpg->offset = perf_event_count(event); | 3391 | userpg->offset = perf_event_count(event); |
3305 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 3392 | if (userpg->index) |
3306 | userpg->offset -= local64_read(&event->hw.prev_count); | 3393 | userpg->offset -= local64_read(&event->hw.prev_count); |
3307 | 3394 | ||
3308 | userpg->time_enabled = enabled + | 3395 | userpg->time_enabled = enabled + |
@@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3311 | userpg->time_running = running + | 3398 | userpg->time_running = running + |
3312 | atomic64_read(&event->child_total_time_running); | 3399 | atomic64_read(&event->child_total_time_running); |
3313 | 3400 | ||
3401 | arch_perf_update_userpage(userpg, now); | ||
3402 | |||
3314 | barrier(); | 3403 | barrier(); |
3315 | ++userpg->lock; | 3404 | ++userpg->lock; |
3316 | preempt_enable(); | 3405 | preempt_enable(); |
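[Editor's note] The event_idx callback and arch_perf_update_userpage() hook above exist so userpg->index can drive self-monitoring reads of the mmap'ed counter page. The sketch below is the documented seqlock-style read pattern from userspace, hedged as follows: it is x86-only (rdpmc via inline asm), assumes the caller already mmap'ed the event fd, and mmap_read_self() is an illustrative helper, not a library function.

/*
 * Lock-free self-read of a counter via the mmap'ed perf_event_mmap_page.
 * index == 0 means the value is not readable with rdpmc and pc->offset
 * already holds the full count.
 */
#include <linux/perf_event.h>
#include <stdint.h>

static uint64_t mmap_read_self(volatile struct perf_event_mmap_page *pc)
{
	uint32_t seq, idx;
	int64_t count;

	do {
		seq = pc->lock;
		__sync_synchronize();		/* pairs with the kernel's barrier() */
		idx = pc->index;
		count = pc->offset;
		if (idx) {
			uint32_t lo, hi;
			__asm__ __volatile__("rdpmc"
					     : "=a"(lo), "=d"(hi)
					     : "c"(idx - 1));
			count += ((uint64_t)hi << 32) | lo;
		}
		__sync_synchronize();
	} while (pc->lock != seq);

	return (uint64_t)count;
}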
@@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3568 | event->mmap_user = get_current_user(); | 3657 | event->mmap_user = get_current_user(); |
3569 | vma->vm_mm->pinned_vm += event->mmap_locked; | 3658 | vma->vm_mm->pinned_vm += event->mmap_locked; |
3570 | 3659 | ||
3660 | perf_event_update_userpage(event); | ||
3661 | |||
3571 | unlock: | 3662 | unlock: |
3572 | if (!ret) | 3663 | if (!ret) |
3573 | atomic_inc(&event->mmap_count); | 3664 | atomic_inc(&event->mmap_count); |
@@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3799 | static void perf_output_read(struct perf_output_handle *handle, | 3890 | static void perf_output_read(struct perf_output_handle *handle, |
3800 | struct perf_event *event) | 3891 | struct perf_event *event) |
3801 | { | 3892 | { |
3802 | u64 enabled = 0, running = 0; | 3893 | u64 enabled = 0, running = 0, now; |
3803 | u64 read_format = event->attr.read_format; | 3894 | u64 read_format = event->attr.read_format; |
3804 | 3895 | ||
3805 | /* | 3896 | /* |
@@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle, | |||
3812 | * NMI context | 3903 | * NMI context |
3813 | */ | 3904 | */ |
3814 | if (read_format & PERF_FORMAT_TOTAL_TIMES) | 3905 | if (read_format & PERF_FORMAT_TOTAL_TIMES) |
3815 | calc_timer_values(event, &enabled, &running); | 3906 | calc_timer_values(event, &now, &enabled, &running); |
3816 | 3907 | ||
3817 | if (event->attr.read_format & PERF_FORMAT_GROUP) | 3908 | if (event->attr.read_format & PERF_FORMAT_GROUP) |
3818 | perf_output_read_group(handle, event, enabled, running); | 3909 | perf_output_read_group(handle, event, enabled, running); |
@@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
3902 | } | 3993 | } |
3903 | } | 3994 | } |
3904 | } | 3995 | } |
3996 | |||
3997 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | ||
3998 | if (data->br_stack) { | ||
3999 | size_t size; | ||
4000 | |||
4001 | size = data->br_stack->nr | ||
4002 | * sizeof(struct perf_branch_entry); | ||
4003 | |||
4004 | perf_output_put(handle, data->br_stack->nr); | ||
4005 | perf_output_copy(handle, data->br_stack->entries, size); | ||
4006 | } else { | ||
4007 | /* | ||
4008 | * we always store at least the value of nr | ||
4009 | */ | ||
4010 | u64 nr = 0; | ||
4011 | perf_output_put(handle, nr); | ||
4012 | } | ||
4013 | } | ||
3905 | } | 4014 | } |
3906 | 4015 | ||
3907 | void perf_prepare_sample(struct perf_event_header *header, | 4016 | void perf_prepare_sample(struct perf_event_header *header, |
@@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
3944 | WARN_ON_ONCE(size & (sizeof(u64)-1)); | 4053 | WARN_ON_ONCE(size & (sizeof(u64)-1)); |
3945 | header->size += size; | 4054 | header->size += size; |
3946 | } | 4055 | } |
4056 | |||
4057 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | ||
4058 | int size = sizeof(u64); /* nr */ | ||
4059 | if (data->br_stack) { | ||
4060 | size += data->br_stack->nr | ||
4061 | * sizeof(struct perf_branch_entry); | ||
4062 | } | ||
4063 | header->size += size; | ||
4064 | } | ||
3947 | } | 4065 | } |
3948 | 4066 | ||
3949 | static void perf_event_output(struct perf_event *event, | 4067 | static void perf_event_output(struct perf_event *event, |
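[Editor's note] perf_output_sample() and perf_prepare_sample() above lay the PERF_SAMPLE_BRANCH_STACK payload out as a u64 entry count followed by that many struct perf_branch_entry records (count 0 when no data was captured). A hedged consumer-side sketch follows; dump_branch_stack() is illustrative and assumes p already points at the branch-stack region of a parsed PERF_RECORD_SAMPLE.

/*
 * Walk the PERF_SAMPLE_BRANCH_STACK portion of a sample: a u64 count
 * followed by that many struct perf_branch_entry records.  Error and
 * bounds handling are omitted for brevity.
 */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void dump_branch_stack(const unsigned char *p)
{
	uint64_t nr, i;
	struct perf_branch_entry be;

	memcpy(&nr, p, sizeof(nr));
	p += sizeof(nr);

	for (i = 0; i < nr; i++) {
		memcpy(&be, p, sizeof(be));
		p += sizeof(be);
		printf("branch %llu: %#llx -> %#llx%s\n",
		       (unsigned long long)i,
		       (unsigned long long)be.from,
		       (unsigned long long)be.to,
		       be.mispred ? " (mispredicted)" : "");
	}
}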
@@ -4986,7 +5104,7 @@ fail: | |||
4986 | return err; | 5104 | return err; |
4987 | } | 5105 | } |
4988 | 5106 | ||
4989 | struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 5107 | struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
4990 | 5108 | ||
4991 | static void sw_perf_event_destroy(struct perf_event *event) | 5109 | static void sw_perf_event_destroy(struct perf_event *event) |
4992 | { | 5110 | { |
@@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event) | |||
4994 | 5112 | ||
4995 | WARN_ON(event->parent); | 5113 | WARN_ON(event->parent); |
4996 | 5114 | ||
4997 | jump_label_dec(&perf_swevent_enabled[event_id]); | 5115 | static_key_slow_dec(&perf_swevent_enabled[event_id]); |
4998 | swevent_hlist_put(event); | 5116 | swevent_hlist_put(event); |
4999 | } | 5117 | } |
5000 | 5118 | ||
@@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event) | |||
5005 | if (event->attr.type != PERF_TYPE_SOFTWARE) | 5123 | if (event->attr.type != PERF_TYPE_SOFTWARE) |
5006 | return -ENOENT; | 5124 | return -ENOENT; |
5007 | 5125 | ||
5126 | /* | ||
5127 | * no branch sampling for software events | ||
5128 | */ | ||
5129 | if (has_branch_stack(event)) | ||
5130 | return -EOPNOTSUPP; | ||
5131 | |||
5008 | switch (event_id) { | 5132 | switch (event_id) { |
5009 | case PERF_COUNT_SW_CPU_CLOCK: | 5133 | case PERF_COUNT_SW_CPU_CLOCK: |
5010 | case PERF_COUNT_SW_TASK_CLOCK: | 5134 | case PERF_COUNT_SW_TASK_CLOCK: |
@@ -5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event) | |||
5024 | if (err) | 5148 | if (err) |
5025 | return err; | 5149 | return err; |
5026 | 5150 | ||
5027 | jump_label_inc(&perf_swevent_enabled[event_id]); | 5151 | static_key_slow_inc(&perf_swevent_enabled[event_id]); |
5028 | event->destroy = sw_perf_event_destroy; | 5152 | event->destroy = sw_perf_event_destroy; |
5029 | } | 5153 | } |
5030 | 5154 | ||
5031 | return 0; | 5155 | return 0; |
5032 | } | 5156 | } |
5033 | 5157 | ||
5158 | static int perf_swevent_event_idx(struct perf_event *event) | ||
5159 | { | ||
5160 | return 0; | ||
5161 | } | ||
5162 | |||
5034 | static struct pmu perf_swevent = { | 5163 | static struct pmu perf_swevent = { |
5035 | .task_ctx_nr = perf_sw_context, | 5164 | .task_ctx_nr = perf_sw_context, |
5036 | 5165 | ||
@@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = { | |||
5040 | .start = perf_swevent_start, | 5169 | .start = perf_swevent_start, |
5041 | .stop = perf_swevent_stop, | 5170 | .stop = perf_swevent_stop, |
5042 | .read = perf_swevent_read, | 5171 | .read = perf_swevent_read, |
5172 | |||
5173 | .event_idx = perf_swevent_event_idx, | ||
5043 | }; | 5174 | }; |
5044 | 5175 | ||
5045 | #ifdef CONFIG_EVENT_TRACING | 5176 | #ifdef CONFIG_EVENT_TRACING |
@@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event) | |||
5108 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 5239 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
5109 | return -ENOENT; | 5240 | return -ENOENT; |
5110 | 5241 | ||
5242 | /* | ||
5243 | * no branch sampling for tracepoint events | ||
5244 | */ | ||
5245 | if (has_branch_stack(event)) | ||
5246 | return -EOPNOTSUPP; | ||
5247 | |||
5111 | err = perf_trace_init(event); | 5248 | err = perf_trace_init(event); |
5112 | if (err) | 5249 | if (err) |
5113 | return err; | 5250 | return err; |
@@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = { | |||
5126 | .start = perf_swevent_start, | 5263 | .start = perf_swevent_start, |
5127 | .stop = perf_swevent_stop, | 5264 | .stop = perf_swevent_stop, |
5128 | .read = perf_swevent_read, | 5265 | .read = perf_swevent_read, |
5266 | |||
5267 | .event_idx = perf_swevent_event_idx, | ||
5129 | }; | 5268 | }; |
5130 | 5269 | ||
5131 | static inline void perf_tp_register(void) | 5270 | static inline void perf_tp_register(void) |
@@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
5331 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | 5470 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) |
5332 | return -ENOENT; | 5471 | return -ENOENT; |
5333 | 5472 | ||
5473 | /* | ||
5474 | * no branch sampling for software events | ||
5475 | */ | ||
5476 | if (has_branch_stack(event)) | ||
5477 | return -EOPNOTSUPP; | ||
5478 | |||
5334 | perf_swevent_init_hrtimer(event); | 5479 | perf_swevent_init_hrtimer(event); |
5335 | 5480 | ||
5336 | return 0; | 5481 | return 0; |
@@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = { | |||
5345 | .start = cpu_clock_event_start, | 5490 | .start = cpu_clock_event_start, |
5346 | .stop = cpu_clock_event_stop, | 5491 | .stop = cpu_clock_event_stop, |
5347 | .read = cpu_clock_event_read, | 5492 | .read = cpu_clock_event_read, |
5493 | |||
5494 | .event_idx = perf_swevent_event_idx, | ||
5348 | }; | 5495 | }; |
5349 | 5496 | ||
5350 | /* | 5497 | /* |
@@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event) | |||
5403 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | 5550 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) |
5404 | return -ENOENT; | 5551 | return -ENOENT; |
5405 | 5552 | ||
5553 | /* | ||
5554 | * no branch sampling for software events | ||
5555 | */ | ||
5556 | if (has_branch_stack(event)) | ||
5557 | return -EOPNOTSUPP; | ||
5558 | |||
5406 | perf_swevent_init_hrtimer(event); | 5559 | perf_swevent_init_hrtimer(event); |
5407 | 5560 | ||
5408 | return 0; | 5561 | return 0; |
@@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = { | |||
5417 | .start = task_clock_event_start, | 5570 | .start = task_clock_event_start, |
5418 | .stop = task_clock_event_stop, | 5571 | .stop = task_clock_event_stop, |
5419 | .read = task_clock_event_read, | 5572 | .read = task_clock_event_read, |
5573 | |||
5574 | .event_idx = perf_swevent_event_idx, | ||
5420 | }; | 5575 | }; |
5421 | 5576 | ||
5422 | static void perf_pmu_nop_void(struct pmu *pmu) | 5577 | static void perf_pmu_nop_void(struct pmu *pmu) |
@@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) | |||
5444 | perf_pmu_enable(pmu); | 5599 | perf_pmu_enable(pmu); |
5445 | } | 5600 | } |
5446 | 5601 | ||
5602 | static int perf_event_idx_default(struct perf_event *event) | ||
5603 | { | ||
5604 | return event->hw.idx + 1; | ||
5605 | } | ||
5606 | |||
5447 | /* | 5607 | /* |
5448 | * Ensures all contexts with the same task_ctx_nr have the same | 5608 | * Ensures all contexts with the same task_ctx_nr have the same |
5449 | * pmu_cpu_context too. | 5609 | * pmu_cpu_context too. |
@@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu) | |||
5530 | if (!pmu->dev) | 5690 | if (!pmu->dev) |
5531 | goto out; | 5691 | goto out; |
5532 | 5692 | ||
5693 | pmu->dev->groups = pmu->attr_groups; | ||
5533 | device_initialize(pmu->dev); | 5694 | device_initialize(pmu->dev); |
5534 | ret = dev_set_name(pmu->dev, "%s", pmu->name); | 5695 | ret = dev_set_name(pmu->dev, "%s", pmu->name); |
5535 | if (ret) | 5696 | if (ret) |
@@ -5633,6 +5794,9 @@ got_cpu_context: | |||
5633 | pmu->pmu_disable = perf_pmu_nop_void; | 5794 | pmu->pmu_disable = perf_pmu_nop_void; |
5634 | } | 5795 | } |
5635 | 5796 | ||
5797 | if (!pmu->event_idx) | ||
5798 | pmu->event_idx = perf_event_idx_default; | ||
5799 | |||
5636 | list_add_rcu(&pmu->entry, &pmus); | 5800 | list_add_rcu(&pmu->entry, &pmus); |
5637 | ret = 0; | 5801 | ret = 0; |
5638 | unlock: | 5802 | unlock: |
@@ -5825,7 +5989,7 @@ done: | |||
5825 | 5989 | ||
5826 | if (!event->parent) { | 5990 | if (!event->parent) { |
5827 | if (event->attach_state & PERF_ATTACH_TASK) | 5991 | if (event->attach_state & PERF_ATTACH_TASK) |
5828 | jump_label_inc(&perf_sched_events.key); | 5992 | static_key_slow_inc(&perf_sched_events.key); |
5829 | if (event->attr.mmap || event->attr.mmap_data) | 5993 | if (event->attr.mmap || event->attr.mmap_data) |
5830 | atomic_inc(&nr_mmap_events); | 5994 | atomic_inc(&nr_mmap_events); |
5831 | if (event->attr.comm) | 5995 | if (event->attr.comm) |
@@ -5839,6 +6003,12 @@ done: | |||
5839 | return ERR_PTR(err); | 6003 | return ERR_PTR(err); |
5840 | } | 6004 | } |
5841 | } | 6005 | } |
6006 | if (has_branch_stack(event)) { | ||
6007 | static_key_slow_inc(&perf_sched_events.key); | ||
6008 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
6009 | atomic_inc(&per_cpu(perf_branch_stack_events, | ||
6010 | event->cpu)); | ||
6011 | } | ||
5842 | } | 6012 | } |
5843 | 6013 | ||
5844 | return event; | 6014 | return event; |
@@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
5908 | if (attr->read_format & ~(PERF_FORMAT_MAX-1)) | 6078 | if (attr->read_format & ~(PERF_FORMAT_MAX-1)) |
5909 | return -EINVAL; | 6079 | return -EINVAL; |
5910 | 6080 | ||
6081 | if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { | ||
6082 | u64 mask = attr->branch_sample_type; | ||
6083 | |||
6084 | /* only using defined bits */ | ||
6085 | if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) | ||
6086 | return -EINVAL; | ||
6087 | |||
6088 | /* at least one branch bit must be set */ | ||
6089 | if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) | ||
6090 | return -EINVAL; | ||
6091 | |||
6092 | /* kernel level capture: check permissions */ | ||
6093 | if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) | ||
6094 | && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
6095 | return -EACCES; | ||
6096 | |||
6097 | /* propagate priv level, when not set for branch */ | ||
6098 | if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { | ||
6099 | |||
6100 | /* exclude_kernel checked on syscall entry */ | ||
6101 | if (!attr->exclude_kernel) | ||
6102 | mask |= PERF_SAMPLE_BRANCH_KERNEL; | ||
6103 | |||
6104 | if (!attr->exclude_user) | ||
6105 | mask |= PERF_SAMPLE_BRANCH_USER; | ||
6106 | |||
6107 | if (!attr->exclude_hv) | ||
6108 | mask |= PERF_SAMPLE_BRANCH_HV; | ||
6109 | /* | ||
6110 | * adjust user setting (for HW filter setup) | ||
6111 | */ | ||
6112 | attr->branch_sample_type = mask; | ||
6113 | } | ||
6114 | } | ||
5911 | out: | 6115 | out: |
5912 | return ret; | 6116 | return ret; |
5913 | 6117 | ||
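[Editor's note] The perf_copy_attr() checks above validate attr->branch_sample_type, require CAP_SYS_ADMIN for kernel/HV capture under perf_event_paranoid, and propagate the event's exclude_* settings when the caller sets no privilege-level bits. A hedged sketch of a matching perf_event_open() call follows; there is no glibc wrapper, so the raw syscall is used, and error handling is trimmed. Requesting PERF_SAMPLE_BRANCH_USER explicitly is shown here; leaving all PLM bits out would instead let the kernel derive them from exclude_user/exclude_kernel/exclude_hv.

/*
 * Open a cycles event that samples taken branches at the user level.
 * Requires headers new enough to carry branch_sample_type and the
 * PERF_SAMPLE_BRANCH_* constants.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_branch_sampling_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
				  PERF_SAMPLE_BRANCH_USER;
	attr.exclude_kernel = 1;

	return syscall(__NR_perf_event_open, &attr, pid,
		       -1 /* any cpu */, -1 /* no group */, 0 /* flags */);
}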
@@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6063 | * - that may need work on context switch | 6267 | * - that may need work on context switch |
6064 | */ | 6268 | */ |
6065 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | 6269 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); |
6066 | jump_label_inc(&perf_sched_events.key); | 6270 | static_key_slow_inc(&perf_sched_events.key); |
6067 | } | 6271 | } |
6068 | 6272 | ||
6069 | /* | 6273 | /* |
@@ -6912,6 +7116,13 @@ void __init perf_event_init(void) | |||
6912 | 7116 | ||
6913 | /* do not patch jump label more than once per second */ | 7117 | /* do not patch jump label more than once per second */ |
6914 | jump_label_rate_limit(&perf_sched_events, HZ); | 7118 | jump_label_rate_limit(&perf_sched_events, HZ); |
7119 | |||
7120 | /* | ||
7121 | * Build time assertion that we keep the data_head at the intended | ||
7122 | * location. IOW, validation we got the __reserved[] size right. | ||
7123 | */ | ||
7124 | BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) | ||
7125 | != 1024); | ||
6915 | } | 7126 | } |
6916 | 7127 | ||
6917 | static int __init perf_event_sysfs_init(void) | 7128 | static int __init perf_event_sysfs_init(void) |
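[Editor's note] The BUILD_BUG_ON added to perf_event_init() pins the ABI offset of data_head at compile time, so a miscounted __reserved[] pad breaks the build rather than userspace. A tiny kernel-side sketch of the same pattern follows; struct my_mmap_abi is made up for illustration.

/*
 * Compile-time layout check, same idea as the BUILD_BUG_ON above: if the
 * reserved pad ever changes size, compilation fails instead of the ABI.
 */
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/types.h>

struct my_mmap_abi {
	__u32	version;
	__u32	flags;
	__u64	__reserved[127];
	__u64	data_head;		/* must stay at offset 1024 */
};

static void my_abi_checks(void)
{
	BUILD_BUG_ON(offsetof(struct my_mmap_abi, data_head) != 1024);
}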
@@ -6943,8 +7154,7 @@ unlock: | |||
6943 | device_initcall(perf_event_sysfs_init); | 7154 | device_initcall(perf_event_sysfs_init); |
6944 | 7155 | ||
6945 | #ifdef CONFIG_CGROUP_PERF | 7156 | #ifdef CONFIG_CGROUP_PERF |
6946 | static struct cgroup_subsys_state *perf_cgroup_create( | 7157 | static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) |
6947 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
6948 | { | 7158 | { |
6949 | struct perf_cgroup *jc; | 7159 | struct perf_cgroup *jc; |
6950 | 7160 | ||
@@ -6961,8 +7171,7 @@ static struct cgroup_subsys_state *perf_cgroup_create( | |||
6961 | return &jc->css; | 7171 | return &jc->css; |
6962 | } | 7172 | } |
6963 | 7173 | ||
6964 | static void perf_cgroup_destroy(struct cgroup_subsys *ss, | 7174 | static void perf_cgroup_destroy(struct cgroup *cont) |
6965 | struct cgroup *cont) | ||
6966 | { | 7175 | { |
6967 | struct perf_cgroup *jc; | 7176 | struct perf_cgroup *jc; |
6968 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7177 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), |
@@ -6978,8 +7187,7 @@ static int __perf_cgroup_move(void *info) | |||
6978 | return 0; | 7187 | return 0; |
6979 | } | 7188 | } |
6980 | 7189 | ||
6981 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7190 | static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
6982 | struct cgroup_taskset *tset) | ||
6983 | { | 7191 | { |
6984 | struct task_struct *task; | 7192 | struct task_struct *task; |
6985 | 7193 | ||
@@ -6987,8 +7195,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
6987 | task_function_call(task, __perf_cgroup_move, task); | 7195 | task_function_call(task, __perf_cgroup_move, task); |
6988 | } | 7196 | } |
6989 | 7197 | ||
6990 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7198 | static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, |
6991 | struct cgroup *old_cgrp, struct task_struct *task) | 7199 | struct task_struct *task) |
6992 | { | 7200 | { |
6993 | /* | 7201 | /* |
6994 | * cgroup_exit() is called in the copy_process() failure path. | 7202 | * cgroup_exit() is called in the copy_process() failure path. |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..bb38c4d3ee12 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp) | |||
581 | if (bp->attr.type != PERF_TYPE_BREAKPOINT) | 581 | if (bp->attr.type != PERF_TYPE_BREAKPOINT) |
582 | return -ENOENT; | 582 | return -ENOENT; |
583 | 583 | ||
584 | /* | ||
585 | * no branch sampling for breakpoint events | ||
586 | */ | ||
587 | if (has_branch_stack(bp)) | ||
588 | return -EOPNOTSUPP; | ||
589 | |||
584 | err = register_perf_hw_breakpoint(bp); | 590 | err = register_perf_hw_breakpoint(bp); |
585 | if (err) | 591 | if (err) |
586 | return err; | 592 | return err; |
@@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) | |||
613 | bp->hw.state = PERF_HES_STOPPED; | 619 | bp->hw.state = PERF_HES_STOPPED; |
614 | } | 620 | } |
615 | 621 | ||
622 | static int hw_breakpoint_event_idx(struct perf_event *bp) | ||
623 | { | ||
624 | return 0; | ||
625 | } | ||
626 | |||
616 | static struct pmu perf_breakpoint = { | 627 | static struct pmu perf_breakpoint = { |
617 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ | 628 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ |
618 | 629 | ||
@@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = { | |||
622 | .start = hw_breakpoint_start, | 633 | .start = hw_breakpoint_start, |
623 | .stop = hw_breakpoint_stop, | 634 | .stop = hw_breakpoint_stop, |
624 | .read = hw_breakpoint_pmu_read, | 635 | .read = hw_breakpoint_pmu_read, |
636 | |||
637 | .event_idx = hw_breakpoint_event_idx, | ||
625 | }; | 638 | }; |
626 | 639 | ||
627 | int __init init_hw_breakpoint(void) | 640 | int __init init_hw_breakpoint(void) |
@@ -651,10 +664,10 @@ int __init init_hw_breakpoint(void) | |||
651 | 664 | ||
652 | err_alloc: | 665 | err_alloc: |
653 | for_each_possible_cpu(err_cpu) { | 666 | for_each_possible_cpu(err_cpu) { |
654 | if (err_cpu == cpu) | ||
655 | break; | ||
656 | for (i = 0; i < TYPE_MAX; i++) | 667 | for (i = 0; i < TYPE_MAX; i++) |
657 | kfree(per_cpu(nr_task_bp_pinned[i], cpu)); | 668 | kfree(per_cpu(nr_task_bp_pinned[i], cpu)); |
669 | if (err_cpu == cpu) | ||
670 | break; | ||
658 | } | 671 | } |
659 | 672 | ||
660 | return -ENOMEM; | 673 | return -ENOMEM; |
diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..d8bd3b425fa7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | 53 | #include <linux/oom.h> |
54 | #include <linux/writeback.h> | 54 | #include <linux/writeback.h> |
55 | #include <linux/shm.h> | ||
55 | 56 | ||
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
57 | #include <asm/unistd.h> | 58 | #include <asm/unistd.h> |
@@ -424,7 +425,7 @@ void daemonize(const char *name, ...) | |||
424 | */ | 425 | */ |
425 | exit_mm(current); | 426 | exit_mm(current); |
426 | /* | 427 | /* |
427 | * We don't want to have TIF_FREEZE set if the system-wide hibernation | 428 | * We don't want to get frozen, in case system-wide hibernation |
428 | * or suspend transition begins right now. | 429 | * or suspend transition begins right now. |
429 | */ | 430 | */ |
430 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); | 431 | current->flags |= (PF_NOFREEZE | PF_KTHREAD); |
@@ -473,7 +474,7 @@ static void close_files(struct files_struct * files) | |||
473 | i = j * __NFDBITS; | 474 | i = j * __NFDBITS; |
474 | if (i >= fdt->max_fds) | 475 | if (i >= fdt->max_fds) |
475 | break; | 476 | break; |
476 | set = fdt->open_fds->fds_bits[j++]; | 477 | set = fdt->open_fds[j++]; |
477 | while (set) { | 478 | while (set) { |
478 | if (set & 1) { | 479 | if (set & 1) { |
479 | struct file * file = xchg(&fdt->fd[i], NULL); | 480 | struct file * file = xchg(&fdt->fd[i], NULL); |
@@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk) | |||
686 | } | 687 | } |
687 | 688 | ||
688 | /* | 689 | /* |
689 | * When we die, we re-parent all our children. | 690 | * When we die, we re-parent all our children, and try to: |
690 | * Try to give them to another thread in our thread | 691 | * 1. give them to another thread in our thread group, if such a member exists |
691 | * group, and if no such member exists, give it to | 692 | * 2. give it to the first ancestor process which prctl'd itself as a |
692 | * the child reaper process (ie "init") in our pid | 693 | * child_subreaper for its children (like a service manager) |
693 | * space. | 694 | * 3. give it to the init process (PID 1) in our pid namespace |
694 | */ | 695 | */ |
695 | static struct task_struct *find_new_reaper(struct task_struct *father) | 696 | static struct task_struct *find_new_reaper(struct task_struct *father) |
696 | __releases(&tasklist_lock) | 697 | __releases(&tasklist_lock) |
@@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
710 | 711 | ||
711 | if (unlikely(pid_ns->child_reaper == father)) { | 712 | if (unlikely(pid_ns->child_reaper == father)) { |
712 | write_unlock_irq(&tasklist_lock); | 713 | write_unlock_irq(&tasklist_lock); |
713 | if (unlikely(pid_ns == &init_pid_ns)) | 714 | if (unlikely(pid_ns == &init_pid_ns)) { |
714 | panic("Attempted to kill init!"); | 715 | panic("Attempted to kill init! exitcode=0x%08x\n", |
716 | father->signal->group_exit_code ?: | ||
717 | father->exit_code); | ||
718 | } | ||
715 | 719 | ||
716 | zap_pid_ns_processes(pid_ns); | 720 | zap_pid_ns_processes(pid_ns); |
717 | write_lock_irq(&tasklist_lock); | 721 | write_lock_irq(&tasklist_lock); |
@@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
721 | * forget_original_parent() must move them somewhere. | 725 | * forget_original_parent() must move them somewhere. |
722 | */ | 726 | */ |
723 | pid_ns->child_reaper = init_pid_ns.child_reaper; | 727 | pid_ns->child_reaper = init_pid_ns.child_reaper; |
728 | } else if (father->signal->has_child_subreaper) { | ||
729 | struct task_struct *reaper; | ||
730 | |||
731 | /* | ||
732 | * Find the first ancestor marked as child_subreaper. | ||
733 | * Note that the code below checks same_thread_group(reaper, | ||
734 | * pid_ns->child_reaper). This is what we need to DTRT in a | ||
735 | * PID namespace. However we still need the check above, see | ||
736 | * http://marc.info/?l=linux-kernel&m=131385460420380 | ||
737 | */ | ||
738 | for (reaper = father->real_parent; | ||
739 | reaper != &init_task; | ||
740 | reaper = reaper->real_parent) { | ||
741 | if (same_thread_group(reaper, pid_ns->child_reaper)) | ||
742 | break; | ||
743 | if (!reaper->signal->is_child_subreaper) | ||
744 | continue; | ||
745 | thread = reaper; | ||
746 | do { | ||
747 | if (!(thread->flags & PF_EXITING)) | ||
748 | return reaper; | ||
749 | } while_each_thread(reaper, thread); | ||
750 | } | ||
724 | } | 751 | } |
725 | 752 | ||
726 | return pid_ns->child_reaper; | 753 | return pid_ns->child_reaper; |
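[Editor's note] The new find_new_reaper() branch above re-parents orphans to the nearest ancestor that declared itself a child sub-reaper instead of handing them straight to init. A hedged userspace sketch of how a service manager opts in follows; the prctl constant is new with this series, so a fallback define is included in case older headers lack it.

/*
 * Mark the calling process as a "child subreaper": orphaned descendants
 * are re-parented to it instead of to init, so it can wait() on them.
 */
#include <sys/prctl.h>
#include <stdio.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36
#endif

int main(void)
{
	if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0) {
		perror("prctl(PR_SET_CHILD_SUBREAPER)");
		return 1;
	}
	/* From here on, double-forked daemons started by our children come
	 * back to us as SIGCHLD instead of disappearing to init. */
	printf("now acting as child subreaper\n");
	return 0;
}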
@@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
818 | if (group_dead) | 845 | if (group_dead) |
819 | kill_orphaned_pgrp(tsk->group_leader, NULL); | 846 | kill_orphaned_pgrp(tsk->group_leader, NULL); |
820 | 847 | ||
821 | /* Let father know we died | ||
822 | * | ||
823 | * Thread signals are configurable, but you aren't going to use | ||
824 | * that to send signals to arbitrary processes. | ||
825 | * That stops right now. | ||
826 | * | ||
827 | * If the parent exec id doesn't match the exec id we saved | ||
828 | * when we started then we know the parent has changed security | ||
829 | * domain. | ||
830 | * | ||
831 | * If our self_exec id doesn't match our parent_exec_id then | ||
832 | * we have changed execution domain as these two values started | ||
833 | * the same after a fork. | ||
834 | */ | ||
835 | if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && | ||
836 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || | ||
837 | tsk->self_exec_id != tsk->parent_exec_id)) | ||
838 | tsk->exit_signal = SIGCHLD; | ||
839 | |||
840 | if (unlikely(tsk->ptrace)) { | 848 | if (unlikely(tsk->ptrace)) { |
841 | int sig = thread_group_leader(tsk) && | 849 | int sig = thread_group_leader(tsk) && |
842 | thread_group_empty(tsk) && | 850 | thread_group_empty(tsk) && |
@@ -935,8 +943,6 @@ void do_exit(long code) | |||
935 | schedule(); | 943 | schedule(); |
936 | } | 944 | } |
937 | 945 | ||
938 | exit_irq_thread(); | ||
939 | |||
940 | exit_signals(tsk); /* sets PF_EXITING */ | 946 | exit_signals(tsk); /* sets PF_EXITING */ |
941 | /* | 947 | /* |
942 | * tsk->flags are checked in the futex code to protect against | 948 | * tsk->flags are checked in the futex code to protect against |
@@ -945,6 +951,8 @@ void do_exit(long code) | |||
945 | smp_mb(); | 951 | smp_mb(); |
946 | raw_spin_unlock_wait(&tsk->pi_lock); | 952 | raw_spin_unlock_wait(&tsk->pi_lock); |
947 | 953 | ||
954 | exit_irq_thread(); | ||
955 | |||
948 | if (unlikely(in_atomic())) | 956 | if (unlikely(in_atomic())) |
949 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 957 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
950 | current->comm, task_pid_nr(current), | 958 | current->comm, task_pid_nr(current), |
@@ -953,7 +961,7 @@ void do_exit(long code) | |||
953 | acct_update_integrals(tsk); | 961 | acct_update_integrals(tsk); |
954 | /* sync mm's RSS info before statistics gathering */ | 962 | /* sync mm's RSS info before statistics gathering */ |
955 | if (tsk->mm) | 963 | if (tsk->mm) |
956 | sync_mm_rss(tsk, tsk->mm); | 964 | sync_mm_rss(tsk->mm); |
957 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 965 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
958 | if (group_dead) { | 966 | if (group_dead) { |
959 | hrtimer_cancel(&tsk->signal->real_timer); | 967 | hrtimer_cancel(&tsk->signal->real_timer); |
diff --git a/kernel/fork.c b/kernel/fork.c index a1b632713e43..08eb8584e2a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -66,6 +66,7 @@ | |||
66 | #include <linux/user-return-notifier.h> | 66 | #include <linux/user-return-notifier.h> |
67 | #include <linux/oom.h> | 67 | #include <linux/oom.h> |
68 | #include <linux/khugepaged.h> | 68 | #include <linux/khugepaged.h> |
69 | #include <linux/signalfd.h> | ||
69 | 70 | ||
70 | #include <asm/pgtable.h> | 71 | #include <asm/pgtable.h> |
71 | #include <asm/pgalloc.h> | 72 | #include <asm/pgalloc.h> |
@@ -192,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk) | |||
192 | WARN_ON(atomic_read(&tsk->usage)); | 193 | WARN_ON(atomic_read(&tsk->usage)); |
193 | WARN_ON(tsk == current); | 194 | WARN_ON(tsk == current); |
194 | 195 | ||
196 | security_task_free(tsk); | ||
195 | exit_creds(tsk); | 197 | exit_creds(tsk); |
196 | delayacct_tsk_free(tsk); | 198 | delayacct_tsk_free(tsk); |
197 | put_signal_struct(tsk->signal); | 199 | put_signal_struct(tsk->signal); |
@@ -354,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
354 | charge = 0; | 356 | charge = 0; |
355 | if (mpnt->vm_flags & VM_ACCOUNT) { | 357 | if (mpnt->vm_flags & VM_ACCOUNT) { |
356 | unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | 358 | unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; |
357 | if (security_vm_enough_memory(len)) | 359 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ |
358 | goto fail_nomem; | 360 | goto fail_nomem; |
359 | charge = len; | 361 | charge = len; |
360 | } | 362 | } |
@@ -510,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
510 | return NULL; | 512 | return NULL; |
511 | } | 513 | } |
512 | 514 | ||
515 | static void check_mm(struct mm_struct *mm) | ||
516 | { | ||
517 | int i; | ||
518 | |||
519 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
520 | long x = atomic_long_read(&mm->rss_stat.count[i]); | ||
521 | |||
522 | if (unlikely(x)) | ||
523 | printk(KERN_ALERT "BUG: Bad rss-counter state " | ||
524 | "mm:%p idx:%d val:%ld\n", mm, i, x); | ||
525 | } | ||
526 | |||
527 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
528 | VM_BUG_ON(mm->pmd_huge_pte); | ||
529 | #endif | ||
530 | } | ||
531 | |||
513 | /* | 532 | /* |
514 | * Allocate and initialize an mm_struct. | 533 | * Allocate and initialize an mm_struct. |
515 | */ | 534 | */ |
@@ -537,9 +556,7 @@ void __mmdrop(struct mm_struct *mm) | |||
537 | mm_free_pgd(mm); | 556 | mm_free_pgd(mm); |
538 | destroy_context(mm); | 557 | destroy_context(mm); |
539 | mmu_notifier_mm_destroy(mm); | 558 | mmu_notifier_mm_destroy(mm); |
540 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 559 | check_mm(mm); |
541 | VM_BUG_ON(mm->pmd_huge_pte); | ||
542 | #endif | ||
543 | free_mm(mm); | 560 | free_mm(mm); |
544 | } | 561 | } |
545 | EXPORT_SYMBOL_GPL(__mmdrop); | 562 | EXPORT_SYMBOL_GPL(__mmdrop); |
@@ -667,6 +684,38 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) | |||
667 | return mm; | 684 | return mm; |
668 | } | 685 | } |
669 | 686 | ||
687 | static void complete_vfork_done(struct task_struct *tsk) | ||
688 | { | ||
689 | struct completion *vfork; | ||
690 | |||
691 | task_lock(tsk); | ||
692 | vfork = tsk->vfork_done; | ||
693 | if (likely(vfork)) { | ||
694 | tsk->vfork_done = NULL; | ||
695 | complete(vfork); | ||
696 | } | ||
697 | task_unlock(tsk); | ||
698 | } | ||
699 | |||
700 | static int wait_for_vfork_done(struct task_struct *child, | ||
701 | struct completion *vfork) | ||
702 | { | ||
703 | int killed; | ||
704 | |||
705 | freezer_do_not_count(); | ||
706 | killed = wait_for_completion_killable(vfork); | ||
707 | freezer_count(); | ||
708 | |||
709 | if (killed) { | ||
710 | task_lock(child); | ||
711 | child->vfork_done = NULL; | ||
712 | task_unlock(child); | ||
713 | } | ||
714 | |||
715 | put_task_struct(child); | ||
716 | return killed; | ||
717 | } | ||
718 | |||
670 | /* Please note the differences between mmput and mm_release. | 719 | /* Please note the differences between mmput and mm_release. |
671 | * mmput is called whenever we stop holding onto a mm_struct, | 720 | * mmput is called whenever we stop holding onto a mm_struct, |
672 | * error success whatever. | 721 | * error success whatever. |
@@ -682,8 +731,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) | |||
682 | */ | 731 | */ |
683 | void mm_release(struct task_struct *tsk, struct mm_struct *mm) | 732 | void mm_release(struct task_struct *tsk, struct mm_struct *mm) |
684 | { | 733 | { |
685 | struct completion *vfork_done = tsk->vfork_done; | ||
686 | |||
687 | /* Get rid of any futexes when releasing the mm */ | 734 | /* Get rid of any futexes when releasing the mm */ |
688 | #ifdef CONFIG_FUTEX | 735 | #ifdef CONFIG_FUTEX |
689 | if (unlikely(tsk->robust_list)) { | 736 | if (unlikely(tsk->robust_list)) { |
@@ -703,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
703 | /* Get rid of any cached register state */ | 750 | /* Get rid of any cached register state */ |
704 | deactivate_mm(tsk, mm); | 751 | deactivate_mm(tsk, mm); |
705 | 752 | ||
706 | /* notify parent sleeping on vfork() */ | 753 | if (tsk->vfork_done) |
707 | if (vfork_done) { | 754 | complete_vfork_done(tsk); |
708 | tsk->vfork_done = NULL; | ||
709 | complete(vfork_done); | ||
710 | } | ||
711 | 755 | ||
712 | /* | 756 | /* |
713 | * If we're exiting normally, clear a user-space tid field if | 757 | * If we're exiting normally, clear a user-space tid field if |
714 | * requested. We leave this alone when dying by signal, to leave | 758 | * requested. We leave this alone when dying by signal, to leave |
715 | * the value intact in a core dump, and to save the unnecessary | 759 | * the value intact in a core dump, and to save the unnecessary |
716 | * trouble otherwise. Userland only wants this done for a sys_exit. | 760 | * trouble, say, a killed vfork parent shouldn't touch this mm. |
761 | * Userland only wants this done for a sys_exit. | ||
717 | */ | 762 | */ |
718 | if (tsk->clear_child_tid) { | 763 | if (tsk->clear_child_tid) { |
719 | if (!(tsk->flags & PF_SIGNALED) && | 764 | if (!(tsk->flags & PF_SIGNALED) && |
@@ -934,8 +979,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | |||
934 | 979 | ||
935 | void __cleanup_sighand(struct sighand_struct *sighand) | 980 | void __cleanup_sighand(struct sighand_struct *sighand) |
936 | { | 981 | { |
937 | if (atomic_dec_and_test(&sighand->count)) | 982 | if (atomic_dec_and_test(&sighand->count)) { |
983 | signalfd_cleanup(sighand); | ||
938 | kmem_cache_free(sighand_cachep, sighand); | 984 | kmem_cache_free(sighand_cachep, sighand); |
985 | } | ||
939 | } | 986 | } |
940 | 987 | ||
941 | 988 | ||
@@ -1003,6 +1050,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1003 | sig->oom_score_adj = current->signal->oom_score_adj; | 1050 | sig->oom_score_adj = current->signal->oom_score_adj; |
1004 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 1051 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
1005 | 1052 | ||
1053 | sig->has_child_subreaper = current->signal->has_child_subreaper || | ||
1054 | current->signal->is_child_subreaper; | ||
1055 | |||
1006 | mutex_init(&sig->cred_guard_mutex); | 1056 | mutex_init(&sig->cred_guard_mutex); |
1007 | 1057 | ||
1008 | return 0; | 1058 | return 0; |
@@ -1014,7 +1064,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
1014 | 1064 | ||
1015 | new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); | 1065 | new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); |
1016 | new_flags |= PF_FORKNOEXEC; | 1066 | new_flags |= PF_FORKNOEXEC; |
1017 | new_flags |= PF_STARTING; | ||
1018 | p->flags = new_flags; | 1067 | p->flags = new_flags; |
1019 | } | 1068 | } |
1020 | 1069 | ||
@@ -1191,6 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1191 | #ifdef CONFIG_CPUSETS | 1240 | #ifdef CONFIG_CPUSETS |
1192 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; | 1241 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; |
1193 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; | 1242 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; |
1243 | seqcount_init(&p->mems_allowed_seq); | ||
1194 | #endif | 1244 | #endif |
1195 | #ifdef CONFIG_TRACE_IRQFLAGS | 1245 | #ifdef CONFIG_TRACE_IRQFLAGS |
1196 | p->irq_events = 0; | 1246 | p->irq_events = 0; |
@@ -1309,7 +1359,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1309 | clear_all_latency_tracing(p); | 1359 | clear_all_latency_tracing(p); |
1310 | 1360 | ||
1311 | /* ok, now we should be set up.. */ | 1361 | /* ok, now we should be set up.. */ |
1312 | p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); | 1362 | if (clone_flags & CLONE_THREAD) |
1363 | p->exit_signal = -1; | ||
1364 | else if (clone_flags & CLONE_PARENT) | ||
1365 | p->exit_signal = current->group_leader->exit_signal; | ||
1366 | else | ||
1367 | p->exit_signal = (clone_flags & CSIGNAL); | ||
1368 | |||
1313 | p->pdeath_signal = 0; | 1369 | p->pdeath_signal = 0; |
1314 | p->exit_state = 0; | 1370 | p->exit_state = 0; |
1315 | 1371 | ||
@@ -1544,16 +1600,9 @@ long do_fork(unsigned long clone_flags, | |||
1544 | if (clone_flags & CLONE_VFORK) { | 1600 | if (clone_flags & CLONE_VFORK) { |
1545 | p->vfork_done = &vfork; | 1601 | p->vfork_done = &vfork; |
1546 | init_completion(&vfork); | 1602 | init_completion(&vfork); |
1603 | get_task_struct(p); | ||
1547 | } | 1604 | } |
1548 | 1605 | ||
1549 | /* | ||
1550 | * We set PF_STARTING at creation in case tracing wants to | ||
1551 | * use this to distinguish a fully live task from one that | ||
1552 | * hasn't finished SIGSTOP raising yet. Now we clear it | ||
1553 | * and set the child going. | ||
1554 | */ | ||
1555 | p->flags &= ~PF_STARTING; | ||
1556 | |||
1557 | wake_up_new_task(p); | 1606 | wake_up_new_task(p); |
1558 | 1607 | ||
1559 | /* forking complete and child started to run, tell ptracer */ | 1608 | /* forking complete and child started to run, tell ptracer */ |
@@ -1561,10 +1610,8 @@ long do_fork(unsigned long clone_flags, | |||
1561 | ptrace_event(trace, nr); | 1610 | ptrace_event(trace, nr); |
1562 | 1611 | ||
1563 | if (clone_flags & CLONE_VFORK) { | 1612 | if (clone_flags & CLONE_VFORK) { |
1564 | freezer_do_not_count(); | 1613 | if (!wait_for_vfork_done(p, &vfork)) |
1565 | wait_for_completion(&vfork); | 1614 | ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); |
1566 | freezer_count(); | ||
1567 | ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); | ||
1568 | } | 1615 | } |
1569 | } else { | 1616 | } else { |
1570 | nr = PTR_ERR(p); | 1617 | nr = PTR_ERR(p); |
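[Editor's note] In the do_fork() hunk above, the parent now pins the child with get_task_struct() and blocks in wait_for_vfork_done(), which is killable and freezer-aware, while complete_vfork_done() in mm_release() wakes it. The userspace semantics being preserved are the classic vfork() contract; a minimal hedged reminder follows ("true" is just a convenient no-op program).

/*
 * Minimal vfork() usage: the parent is suspended until the child calls
 * execve() or _exit(); the kernel side of that wait is the
 * wait_for_vfork_done()/complete_vfork_done() pair above.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = vfork();

	if (pid == 0) {
		/* child: shares the parent's mm until exec */
		execlp("true", "true", (char *)NULL);
		_exit(127);			/* exec failed */
	}

	/* only reached once the child has exec'd or exited */
	printf("vfork child %d released us\n", (int)pid);
	waitpid(pid, NULL, 0);
	return 0;
}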
diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed5..11f82a4d4eae 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p) | |||
99 | * freeze_task - send a freeze request to given task | 99 | * freeze_task - send a freeze request to given task |
100 | * @p: task to send the request to | 100 | * @p: task to send the request to |
101 | * | 101 | * |
102 | * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE | 102 | * If @p is freezing, the freeze request is sent either by sending a fake |
103 | * flag and either sending a fake signal to it or waking it up, depending | 103 | * signal (if it's not a kernel thread) or waking it up (if it's a kernel |
104 | * on whether it has %PF_FREEZER_NOSIG set. | 104 | * thread). |
105 | * | 105 | * |
106 | * RETURNS: | 106 | * RETURNS: |
107 | * %false, if @p is not freezing or already frozen; %true, otherwise | 107 | * %false, if @p is not freezing or already frozen; %true, otherwise |
diff --git a/kernel/futex.c b/kernel/futex.c index 1614be20173d..e2b0fb9a0b3b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <linux/magic.h> | 59 | #include <linux/magic.h> |
60 | #include <linux/pid.h> | 60 | #include <linux/pid.h> |
61 | #include <linux/nsproxy.h> | 61 | #include <linux/nsproxy.h> |
62 | #include <linux/ptrace.h> | ||
62 | 63 | ||
63 | #include <asm/futex.h> | 64 | #include <asm/futex.h> |
64 | 65 | ||
@@ -2443,40 +2444,31 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, | |||
2443 | { | 2444 | { |
2444 | struct robust_list_head __user *head; | 2445 | struct robust_list_head __user *head; |
2445 | unsigned long ret; | 2446 | unsigned long ret; |
2446 | const struct cred *cred = current_cred(), *pcred; | 2447 | struct task_struct *p; |
2447 | 2448 | ||
2448 | if (!futex_cmpxchg_enabled) | 2449 | if (!futex_cmpxchg_enabled) |
2449 | return -ENOSYS; | 2450 | return -ENOSYS; |
2450 | 2451 | ||
2452 | WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); | ||
2453 | |||
2454 | rcu_read_lock(); | ||
2455 | |||
2456 | ret = -ESRCH; | ||
2451 | if (!pid) | 2457 | if (!pid) |
2452 | head = current->robust_list; | 2458 | p = current; |
2453 | else { | 2459 | else { |
2454 | struct task_struct *p; | ||
2455 | |||
2456 | ret = -ESRCH; | ||
2457 | rcu_read_lock(); | ||
2458 | p = find_task_by_vpid(pid); | 2460 | p = find_task_by_vpid(pid); |
2459 | if (!p) | 2461 | if (!p) |
2460 | goto err_unlock; | 2462 | goto err_unlock; |
2461 | ret = -EPERM; | ||
2462 | pcred = __task_cred(p); | ||
2463 | /* If victim is in different user_ns, then uids are not | ||
2464 | comparable, so we must have CAP_SYS_PTRACE */ | ||
2465 | if (cred->user->user_ns != pcred->user->user_ns) { | ||
2466 | if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
2467 | goto err_unlock; | ||
2468 | goto ok; | ||
2469 | } | ||
2470 | /* If victim is in same user_ns, then uids are comparable */ | ||
2471 | if (cred->euid != pcred->euid && | ||
2472 | cred->euid != pcred->uid && | ||
2473 | !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
2474 | goto err_unlock; | ||
2475 | ok: | ||
2476 | head = p->robust_list; | ||
2477 | rcu_read_unlock(); | ||
2478 | } | 2463 | } |
2479 | 2464 | ||
2465 | ret = -EPERM; | ||
2466 | if (!ptrace_may_access(p, PTRACE_MODE_READ)) | ||
2467 | goto err_unlock; | ||
2468 | |||
2469 | head = p->robust_list; | ||
2470 | rcu_read_unlock(); | ||
2471 | |||
2480 | if (put_user(sizeof(*head), len_ptr)) | 2472 | if (put_user(sizeof(*head), len_ptr)) |
2481 | return -EFAULT; | 2473 | return -EFAULT; |
2482 | return put_user(head, head_ptr); | 2474 | return put_user(head, head_ptr); |
@@ -2628,7 +2620,7 @@ void exit_robust_list(struct task_struct *curr) | |||
2628 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | 2620 | long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, |
2629 | u32 __user *uaddr2, u32 val2, u32 val3) | 2621 | u32 __user *uaddr2, u32 val2, u32 val3) |
2630 | { | 2622 | { |
2631 | int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; | 2623 | int cmd = op & FUTEX_CMD_MASK; |
2632 | unsigned int flags = 0; | 2624 | unsigned int flags = 0; |
2633 | 2625 | ||
2634 | if (!(op & FUTEX_PRIVATE_FLAG)) | 2626 | if (!(op & FUTEX_PRIVATE_FLAG)) |
@@ -2641,49 +2633,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
2641 | } | 2633 | } |
2642 | 2634 | ||
2643 | switch (cmd) { | 2635 | switch (cmd) { |
2636 | case FUTEX_LOCK_PI: | ||
2637 | case FUTEX_UNLOCK_PI: | ||
2638 | case FUTEX_TRYLOCK_PI: | ||
2639 | case FUTEX_WAIT_REQUEUE_PI: | ||
2640 | case FUTEX_CMP_REQUEUE_PI: | ||
2641 | if (!futex_cmpxchg_enabled) | ||
2642 | return -ENOSYS; | ||
2643 | } | ||
2644 | |||
2645 | switch (cmd) { | ||
2644 | case FUTEX_WAIT: | 2646 | case FUTEX_WAIT: |
2645 | val3 = FUTEX_BITSET_MATCH_ANY; | 2647 | val3 = FUTEX_BITSET_MATCH_ANY; |
2646 | case FUTEX_WAIT_BITSET: | 2648 | case FUTEX_WAIT_BITSET: |
2647 | ret = futex_wait(uaddr, flags, val, timeout, val3); | 2649 | return futex_wait(uaddr, flags, val, timeout, val3); |
2648 | break; | ||
2649 | case FUTEX_WAKE: | 2650 | case FUTEX_WAKE: |
2650 | val3 = FUTEX_BITSET_MATCH_ANY; | 2651 | val3 = FUTEX_BITSET_MATCH_ANY; |
2651 | case FUTEX_WAKE_BITSET: | 2652 | case FUTEX_WAKE_BITSET: |
2652 | ret = futex_wake(uaddr, flags, val, val3); | 2653 | return futex_wake(uaddr, flags, val, val3); |
2653 | break; | ||
2654 | case FUTEX_REQUEUE: | 2654 | case FUTEX_REQUEUE: |
2655 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); | 2655 | return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); |
2656 | break; | ||
2657 | case FUTEX_CMP_REQUEUE: | 2656 | case FUTEX_CMP_REQUEUE: |
2658 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); | 2657 | return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); |
2659 | break; | ||
2660 | case FUTEX_WAKE_OP: | 2658 | case FUTEX_WAKE_OP: |
2661 | ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); | 2659 | return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
2662 | break; | ||
2663 | case FUTEX_LOCK_PI: | 2660 | case FUTEX_LOCK_PI: |
2664 | if (futex_cmpxchg_enabled) | 2661 | return futex_lock_pi(uaddr, flags, val, timeout, 0); |
2665 | ret = futex_lock_pi(uaddr, flags, val, timeout, 0); | ||
2666 | break; | ||
2667 | case FUTEX_UNLOCK_PI: | 2662 | case FUTEX_UNLOCK_PI: |
2668 | if (futex_cmpxchg_enabled) | 2663 | return futex_unlock_pi(uaddr, flags); |
2669 | ret = futex_unlock_pi(uaddr, flags); | ||
2670 | break; | ||
2671 | case FUTEX_TRYLOCK_PI: | 2664 | case FUTEX_TRYLOCK_PI: |
2672 | if (futex_cmpxchg_enabled) | 2665 | return futex_lock_pi(uaddr, flags, 0, timeout, 1); |
2673 | ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); | ||
2674 | break; | ||
2675 | case FUTEX_WAIT_REQUEUE_PI: | 2666 | case FUTEX_WAIT_REQUEUE_PI: |
2676 | val3 = FUTEX_BITSET_MATCH_ANY; | 2667 | val3 = FUTEX_BITSET_MATCH_ANY; |
2677 | ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, | 2668 | return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
2678 | uaddr2); | 2669 | uaddr2); |
2679 | break; | ||
2680 | case FUTEX_CMP_REQUEUE_PI: | 2670 | case FUTEX_CMP_REQUEUE_PI: |
2681 | ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); | 2671 | return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); |
2682 | break; | ||
2683 | default: | ||
2684 | ret = -ENOSYS; | ||
2685 | } | 2672 | } |
2686 | return ret; | 2673 | return -ENOSYS; |
2687 | } | 2674 | } |
2688 | 2675 | ||
2689 | 2676 | ||
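For reference, the interface these futex hunks harden is get_robust_list(2); glibc ships no wrapper, so userspace goes through syscall(2). A minimal, illustrative caller (names and error handling trimmed); after this change, passing a nonzero pid is gated by the kernel's ptrace_may_access(PTRACE_MODE_READ) check instead of the old open-coded euid/user_ns comparison. Note that the WARN_ONCE added above also flags the syscall as scheduled for removal, so new code should not grow a dependency on it.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>	/* struct robust_list_head */

	int main(void)
	{
		struct robust_list_head *head;
		size_t len;

		/* pid 0 means the calling task; a nonzero pid is subject to
		 * the PTRACE_MODE_READ access check added above */
		if (syscall(SYS_get_robust_list, 0, &head, &len) == 0)
			printf("robust list head=%p len=%zu\n", (void *)head, len);
		return 0;
	}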
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 5f9e689dc8f0..83e368b005fc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/compat.h> | 10 | #include <linux/compat.h> |
11 | #include <linux/nsproxy.h> | 11 | #include <linux/nsproxy.h> |
12 | #include <linux/futex.h> | 12 | #include <linux/futex.h> |
13 | #include <linux/ptrace.h> | ||
13 | 14 | ||
14 | #include <asm/uaccess.h> | 15 | #include <asm/uaccess.h> |
15 | 16 | ||
@@ -136,40 +137,31 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
136 | { | 137 | { |
137 | struct compat_robust_list_head __user *head; | 138 | struct compat_robust_list_head __user *head; |
138 | unsigned long ret; | 139 | unsigned long ret; |
139 | const struct cred *cred = current_cred(), *pcred; | 140 | struct task_struct *p; |
140 | 141 | ||
141 | if (!futex_cmpxchg_enabled) | 142 | if (!futex_cmpxchg_enabled) |
142 | return -ENOSYS; | 143 | return -ENOSYS; |
143 | 144 | ||
145 | WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); | ||
146 | |||
147 | rcu_read_lock(); | ||
148 | |||
149 | ret = -ESRCH; | ||
144 | if (!pid) | 150 | if (!pid) |
145 | head = current->compat_robust_list; | 151 | p = current; |
146 | else { | 152 | else { |
147 | struct task_struct *p; | ||
148 | |||
149 | ret = -ESRCH; | ||
150 | rcu_read_lock(); | ||
151 | p = find_task_by_vpid(pid); | 153 | p = find_task_by_vpid(pid); |
152 | if (!p) | 154 | if (!p) |
153 | goto err_unlock; | 155 | goto err_unlock; |
154 | ret = -EPERM; | ||
155 | pcred = __task_cred(p); | ||
156 | /* If victim is in different user_ns, then uids are not | ||
157 | comparable, so we must have CAP_SYS_PTRACE */ | ||
158 | if (cred->user->user_ns != pcred->user->user_ns) { | ||
159 | if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
160 | goto err_unlock; | ||
161 | goto ok; | ||
162 | } | ||
163 | /* If victim is in same user_ns, then uids are comparable */ | ||
164 | if (cred->euid != pcred->euid && | ||
165 | cred->euid != pcred->uid && | ||
166 | !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) | ||
167 | goto err_unlock; | ||
168 | ok: | ||
169 | head = p->compat_robust_list; | ||
170 | rcu_read_unlock(); | ||
171 | } | 156 | } |
172 | 157 | ||
158 | ret = -EPERM; | ||
159 | if (!ptrace_may_access(p, PTRACE_MODE_READ)) | ||
160 | goto err_unlock; | ||
161 | |||
162 | head = p->compat_robust_list; | ||
163 | rcu_read_unlock(); | ||
164 | |||
173 | if (put_user(sizeof(*head), len_ptr)) | 165 | if (put_user(sizeof(*head), len_ptr)) |
174 | return -EFAULT; | 166 | return -EFAULT; |
175 | return put_user(ptr_to_compat(head), head_ptr); | 167 | return put_user(ptr_to_compat(head), head_ptr); |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2e48ec0c2e91..c21449f85a2a 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -119,15 +119,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
119 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order | 119 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order |
120 | * to exit the grace period. For classic RCU, a reschedule is required. | 120 | * to exit the grace period. For classic RCU, a reschedule is required. |
121 | */ | 121 | */ |
122 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) | 122 | static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) |
123 | { | 123 | { |
124 | bool can_cont; | ||
125 | |||
124 | get_task_struct(g); | 126 | get_task_struct(g); |
125 | get_task_struct(t); | 127 | get_task_struct(t); |
126 | rcu_read_unlock(); | 128 | rcu_read_unlock(); |
127 | cond_resched(); | 129 | cond_resched(); |
128 | rcu_read_lock(); | 130 | rcu_read_lock(); |
131 | can_cont = pid_alive(g) && pid_alive(t); | ||
129 | put_task_struct(t); | 132 | put_task_struct(t); |
130 | put_task_struct(g); | 133 | put_task_struct(g); |
134 | |||
135 | return can_cont; | ||
131 | } | 136 | } |
132 | 137 | ||
133 | /* | 138 | /* |
@@ -154,9 +159,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
154 | goto unlock; | 159 | goto unlock; |
155 | if (!--batch_count) { | 160 | if (!--batch_count) { |
156 | batch_count = HUNG_TASK_BATCHING; | 161 | batch_count = HUNG_TASK_BATCHING; |
157 | rcu_lock_break(g, t); | 162 | if (!rcu_lock_break(g, t)) |
158 | /* Exit if t or g was unhashed during refresh. */ | ||
159 | if (t->state == TASK_DEAD || g->state == TASK_DEAD) | ||
160 | goto unlock; | 163 | goto unlock; |
161 | } | 164 | } |
162 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | 165 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ |
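The caller side of this hunk (partially visible above) follows the usual pin, drop RCU, reschedule, re-check idiom: pin both task_structs, leave the RCU read section, then use pid_alive() to decide whether the walk may continue. A condensed sketch of that loop with the per-task inspection elided:

	rcu_read_lock();
	do_each_thread(g, t) {
		if (!--batch_count) {
			batch_count = HUNG_TASK_BATCHING;
			/* drops and re-takes the RCU read lock; false means
			 * g or t exited while we slept, so stop the walk */
			if (!rcu_lock_break(g, t))
				goto unlock;
		}
		/* ... check_hung_task(t, timeout) ... */
	} while_each_thread(g, t);
	unlock:
	rcu_read_unlock();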
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5a38bf4de641..cf1a4a68ce44 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS | |||
13 | # Options selectable by the architecture code | 13 | # Options selectable by the architecture code |
14 | 14 | ||
15 | # Make sparse irq Kconfig switch below available | 15 | # Make sparse irq Kconfig switch below available |
16 | config HAVE_SPARSE_IRQ | 16 | config MAY_HAVE_SPARSE_IRQ |
17 | bool | 17 | bool |
18 | 18 | ||
19 | # Enable the generic irq autoprobe mechanism | 19 | # Enable the generic irq autoprobe mechanism |
@@ -56,13 +56,22 @@ config GENERIC_IRQ_CHIP | |||
56 | config IRQ_DOMAIN | 56 | config IRQ_DOMAIN |
57 | bool | 57 | bool |
58 | 58 | ||
59 | config IRQ_DOMAIN_DEBUG | ||
60 | bool "Expose hardware/virtual IRQ mapping via debugfs" | ||
61 | depends on IRQ_DOMAIN && DEBUG_FS | ||
62 | help | ||
63 | This option will show the mapping relationship between hardware irq | ||
64 | numbers and Linux irq numbers. The mapping is exposed via debugfs | ||
65 | in the file "virq_mapping". | ||
66 | |||
67 | If you don't know what this means you don't need it. | ||
68 | |||
59 | # Support forced irq threading | 69 | # Support forced irq threading |
60 | config IRQ_FORCED_THREADING | 70 | config IRQ_FORCED_THREADING |
61 | bool | 71 | bool |
62 | 72 | ||
63 | config SPARSE_IRQ | 73 | config SPARSE_IRQ |
64 | bool "Support sparse irq numbering" | 74 | bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ |
65 | depends on HAVE_SPARSE_IRQ | ||
66 | ---help--- | 75 | ---help--- |
67 | 76 | ||
68 | Sparse irq numbering is useful for distro kernels that want | 77 | Sparse irq numbering is useful for distro kernels that want |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 342d8f44e401..0119b9d467ae 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) | |||
53 | if (desc->irq_data.chip->irq_set_type) | 53 | if (desc->irq_data.chip->irq_set_type) |
54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, | 54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
55 | IRQ_TYPE_PROBE); | 55 | IRQ_TYPE_PROBE); |
56 | irq_startup(desc); | 56 | irq_startup(desc, false); |
57 | } | 57 | } |
58 | raw_spin_unlock_irq(&desc->lock); | 58 | raw_spin_unlock_irq(&desc->lock); |
59 | } | 59 | } |
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) | |||
70 | raw_spin_lock_irq(&desc->lock); | 70 | raw_spin_lock_irq(&desc->lock); |
71 | if (!desc->action && irq_settings_can_probe(desc)) { | 71 | if (!desc->action && irq_settings_can_probe(desc)) { |
72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; | 72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; |
73 | if (irq_startup(desc)) | 73 | if (irq_startup(desc, false)) |
74 | desc->istate |= IRQS_PENDING; | 74 | desc->istate |= IRQS_PENDING; |
75 | } | 75 | } |
76 | raw_spin_unlock_irq(&desc->lock); | 76 | raw_spin_unlock_irq(&desc->lock); |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..6080f6bc8c33 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
18 | 18 | ||
19 | #include <trace/events/irq.h> | ||
20 | |||
19 | #include "internals.h" | 21 | #include "internals.h" |
20 | 22 | ||
21 | /** | 23 | /** |
@@ -61,8 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) | |||
61 | return -EINVAL; | 63 | return -EINVAL; |
62 | 64 | ||
63 | type &= IRQ_TYPE_SENSE_MASK; | 65 | type &= IRQ_TYPE_SENSE_MASK; |
64 | if (type != IRQ_TYPE_NONE) | 66 | ret = __irq_set_trigger(desc, irq, type); |
65 | ret = __irq_set_trigger(desc, irq, type); | ||
66 | irq_put_desc_busunlock(desc, flags); | 67 | irq_put_desc_busunlock(desc, flags); |
67 | return ret; | 68 | return ret; |
68 | } | 69 | } |
@@ -157,19 +158,22 @@ static void irq_state_set_masked(struct irq_desc *desc) | |||
157 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | 158 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); |
158 | } | 159 | } |
159 | 160 | ||
160 | int irq_startup(struct irq_desc *desc) | 161 | int irq_startup(struct irq_desc *desc, bool resend) |
161 | { | 162 | { |
163 | int ret = 0; | ||
164 | |||
162 | irq_state_clr_disabled(desc); | 165 | irq_state_clr_disabled(desc); |
163 | desc->depth = 0; | 166 | desc->depth = 0; |
164 | 167 | ||
165 | if (desc->irq_data.chip->irq_startup) { | 168 | if (desc->irq_data.chip->irq_startup) { |
166 | int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); | 169 | ret = desc->irq_data.chip->irq_startup(&desc->irq_data); |
167 | irq_state_clr_masked(desc); | 170 | irq_state_clr_masked(desc); |
168 | return ret; | 171 | } else { |
172 | irq_enable(desc); | ||
169 | } | 173 | } |
170 | 174 | if (resend) | |
171 | irq_enable(desc); | 175 | check_irq_resend(desc, desc->irq_data.irq); |
172 | return 0; | 176 | return ret; |
173 | } | 177 | } |
174 | 178 | ||
175 | void irq_shutdown(struct irq_desc *desc) | 179 | void irq_shutdown(struct irq_desc *desc) |
@@ -330,6 +334,24 @@ out_unlock: | |||
330 | } | 334 | } |
331 | EXPORT_SYMBOL_GPL(handle_simple_irq); | 335 | EXPORT_SYMBOL_GPL(handle_simple_irq); |
332 | 336 | ||
337 | /* | ||
338 | * Called unconditionally from handle_level_irq() and only for oneshot | ||
339 | * interrupts from handle_fasteoi_irq() | ||
340 | */ | ||
341 | static void cond_unmask_irq(struct irq_desc *desc) | ||
342 | { | ||
343 | /* | ||
344 | * We need to unmask in the following cases: | ||
345 | * - Standard level irq (IRQF_ONESHOT is not set) | ||
346 | * - Oneshot irq which did not wake the thread (caused by a | ||
347 | * spurious interrupt or a primary handler handling it | ||
348 | * completely). | ||
349 | */ | ||
350 | if (!irqd_irq_disabled(&desc->irq_data) && | ||
351 | irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) | ||
352 | unmask_irq(desc); | ||
353 | } | ||
354 | |||
333 | /** | 355 | /** |
334 | * handle_level_irq - Level type irq handler | 356 | * handle_level_irq - Level type irq handler |
335 | * @irq: the interrupt number | 357 | * @irq: the interrupt number |
@@ -362,8 +384,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
362 | 384 | ||
363 | handle_irq_event(desc); | 385 | handle_irq_event(desc); |
364 | 386 | ||
365 | if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) | 387 | cond_unmask_irq(desc); |
366 | unmask_irq(desc); | 388 | |
367 | out_unlock: | 389 | out_unlock: |
368 | raw_spin_unlock(&desc->lock); | 390 | raw_spin_unlock(&desc->lock); |
369 | } | 391 | } |
@@ -417,6 +439,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) | |||
417 | preflow_handler(desc); | 439 | preflow_handler(desc); |
418 | handle_irq_event(desc); | 440 | handle_irq_event(desc); |
419 | 441 | ||
442 | if (desc->istate & IRQS_ONESHOT) | ||
443 | cond_unmask_irq(desc); | ||
444 | |||
420 | out_eoi: | 445 | out_eoi: |
421 | desc->irq_data.chip->irq_eoi(&desc->irq_data); | 446 | desc->irq_data.chip->irq_eoi(&desc->irq_data); |
422 | out_unlock: | 447 | out_unlock: |
@@ -625,7 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
625 | irq_settings_set_noprobe(desc); | 650 | irq_settings_set_noprobe(desc); |
626 | irq_settings_set_norequest(desc); | 651 | irq_settings_set_norequest(desc); |
627 | irq_settings_set_nothread(desc); | 652 | irq_settings_set_nothread(desc); |
628 | irq_startup(desc); | 653 | irq_startup(desc, true); |
629 | } | 654 | } |
630 | out: | 655 | out: |
631 | irq_put_desc_busunlock(desc, flags); | 656 | irq_put_desc_busunlock(desc, flags); |
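cond_unmask_irq() above only matters to drivers that register oneshot threaded handlers. A hedged driver-side sketch of the case it must not unmask early (my_quick_check, my_thread_fn, MY_IRQ and dev are hypothetical names; request_threaded_irq() and the IRQ* constants are the existing kernel API): with IRQF_ONESHOT the line stays masked from the hard handler until the thread function returns, which is when threads_oneshot is cleared and the unmask may finally happen.

	static irqreturn_t my_quick_check(int irq, void *dev)
	{
		/* acknowledge quickly, defer the real work to the thread */
		return IRQ_WAKE_THREAD;
	}

	static irqreturn_t my_thread_fn(int irq, void *dev)
	{
		/* ... service the (level-triggered) device ... */
		return IRQ_HANDLED;	/* core unmasks the line afterwards */
	}

	/* in the probe path */
	ret = request_threaded_irq(MY_IRQ, my_quick_check, my_thread_fn,
				   IRQF_ONESHOT, "my-dev", dev);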
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 470d08c82bbe..bdb180325551 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -54,14 +54,18 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) | |||
54 | static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) | 54 | static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) |
55 | { | 55 | { |
56 | /* | 56 | /* |
57 | * Wake up the handler thread for this action. In case the | 57 | * In case the thread crashed and was killed we just pretend that |
58 | * thread crashed and was killed we just pretend that we | 58 | * we handled the interrupt. The hardirq handler has disabled the |
59 | * handled the interrupt. The hardirq handler has disabled the | 59 | * device interrupt, so no irq storm is lurking. |
60 | * device interrupt, so no irq storm is lurking. If the | 60 | */ |
61 | if (action->thread->flags & PF_EXITING) | ||
62 | return; | ||
63 | |||
64 | /* | ||
65 | * Wake up the handler thread for this action. If the | ||
61 | * RUNTHREAD bit is already set, nothing to do. | 66 | * RUNTHREAD bit is already set, nothing to do. |
62 | */ | 67 | */ |
63 | if (test_bit(IRQTF_DIED, &action->thread_flags) || | 68 | if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) |
64 | test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
65 | return; | 69 | return; |
66 | 70 | ||
67 | /* | 71 | /* |
@@ -110,6 +114,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) | |||
110 | * threads_oneshot untouched and runs the thread another time. | 114 | * threads_oneshot untouched and runs the thread another time. |
111 | */ | 115 | */ |
112 | desc->threads_oneshot |= action->thread_mask; | 116 | desc->threads_oneshot |= action->thread_mask; |
117 | |||
118 | /* | ||
119 | * We increment the threads_active counter in case we wake up | ||
120 | * the irq thread. The irq thread decrements the counter when | ||
121 | * it returns from the handler or in the exit path and wakes | ||
122 | * up waiters which are stuck in synchronize_irq() when the | ||
123 | * active count becomes zero. synchronize_irq() is serialized | ||
124 | * against this code (hard irq handler) via IRQS_INPROGRESS | ||
125 | * like the finalize_oneshot() code. See comment above. | ||
126 | */ | ||
127 | atomic_inc(&desc->threads_active); | ||
128 | |||
113 | wake_up_process(action->thread); | 129 | wake_up_process(action->thread); |
114 | } | 130 | } |
115 | 131 | ||
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b7952316016a..8e5c56b3b7d9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -20,14 +20,12 @@ extern bool noirqdebug; | |||
20 | /* | 20 | /* |
21 | * Bits used by threaded handlers: | 21 | * Bits used by threaded handlers: |
22 | * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run | 22 | * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run |
23 | * IRQTF_DIED - handler thread died | ||
24 | * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed | 23 | * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed |
25 | * IRQTF_AFFINITY - irq thread is requested to adjust affinity | 24 | * IRQTF_AFFINITY - irq thread is requested to adjust affinity |
26 | * IRQTF_FORCED_THREAD - irq action is force threaded | 25 | * IRQTF_FORCED_THREAD - irq action is force threaded |
27 | */ | 26 | */ |
28 | enum { | 27 | enum { |
29 | IRQTF_RUNTHREAD, | 28 | IRQTF_RUNTHREAD, |
30 | IRQTF_DIED, | ||
31 | IRQTF_WARNED, | 29 | IRQTF_WARNED, |
32 | IRQTF_AFFINITY, | 30 | IRQTF_AFFINITY, |
33 | IRQTF_FORCED_THREAD, | 31 | IRQTF_FORCED_THREAD, |
@@ -67,7 +65,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
67 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); | 65 | extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); |
68 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); | 66 | extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); |
69 | 67 | ||
70 | extern int irq_startup(struct irq_desc *desc); | 68 | extern int irq_startup(struct irq_desc *desc, bool resend); |
71 | extern void irq_shutdown(struct irq_desc *desc); | 69 | extern void irq_shutdown(struct irq_desc *desc); |
72 | extern void irq_enable(struct irq_desc *desc); | 70 | extern void irq_enable(struct irq_desc *desc); |
73 | extern void irq_disable(struct irq_desc *desc); | 71 | extern void irq_disable(struct irq_desc *desc); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f9e26526b69..3601f3fbf67c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -1,189 +1,793 @@ | |||
1 | #include <linux/debugfs.h> | ||
2 | #include <linux/hardirq.h> | ||
3 | #include <linux/interrupt.h> | ||
1 | #include <linux/irq.h> | 4 | #include <linux/irq.h> |
5 | #include <linux/irqdesc.h> | ||
2 | #include <linux/irqdomain.h> | 6 | #include <linux/irqdomain.h> |
3 | #include <linux/module.h> | 7 | #include <linux/module.h> |
4 | #include <linux/mutex.h> | 8 | #include <linux/mutex.h> |
5 | #include <linux/of.h> | 9 | #include <linux/of.h> |
6 | #include <linux/of_address.h> | 10 | #include <linux/of_address.h> |
11 | #include <linux/seq_file.h> | ||
7 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/smp.h> | ||
14 | #include <linux/fs.h> | ||
15 | |||
16 | #define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. | ||
17 | * ie. legacy 8259, gets irqs 1..15 */ | ||
18 | #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ | ||
19 | #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ | ||
20 | #define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ | ||
8 | 21 | ||
9 | static LIST_HEAD(irq_domain_list); | 22 | static LIST_HEAD(irq_domain_list); |
10 | static DEFINE_MUTEX(irq_domain_mutex); | 23 | static DEFINE_MUTEX(irq_domain_mutex); |
11 | 24 | ||
25 | static DEFINE_MUTEX(revmap_trees_mutex); | ||
26 | static unsigned int irq_virq_count = NR_IRQS; | ||
27 | static struct irq_domain *irq_default_domain; | ||
28 | |||
12 | /** | 29 | /** |
13 | * irq_domain_add() - Register an irq_domain | 30 | * irq_domain_alloc() - Allocate a new irq_domain data structure |
14 | * @domain: ptr to initialized irq_domain structure | 31 | * @of_node: optional device-tree node of the interrupt controller |
32 | * @revmap_type: type of reverse mapping to use | ||
33 | * @ops: map/unmap domain callbacks | ||
34 | * @host_data: Controller private data pointer | ||
15 | * | 35 | * |
16 | * Registers an irq_domain structure. The irq_domain must at a minimum be | 36 | * Allocates and initializes an irq_domain structure. Caller is expected to |
17 | * initialized with an ops structure pointer, and either a ->to_irq hook or | 37 | * register allocated irq_domain with irq_domain_register(). Returns pointer |
18 | * a valid irq_base value. Everything else is optional. | 38 | * to IRQ domain, or NULL on failure. |
19 | */ | 39 | */ |
20 | void irq_domain_add(struct irq_domain *domain) | 40 | static struct irq_domain *irq_domain_alloc(struct device_node *of_node, |
41 | unsigned int revmap_type, | ||
42 | const struct irq_domain_ops *ops, | ||
43 | void *host_data) | ||
21 | { | 44 | { |
22 | struct irq_data *d; | 45 | struct irq_domain *domain; |
23 | int hwirq, irq; | ||
24 | 46 | ||
25 | /* | 47 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); |
26 | * This assumes that the irq_domain owner has already allocated | 48 | if (WARN_ON(!domain)) |
27 | * the irq_descs. This block will be removed when support for dynamic | 49 | return NULL; |
28 | * allocation of irq_descs is added to irq_domain. | 50 | |
29 | */ | 51 | /* Fill structure */ |
30 | irq_domain_for_each_irq(domain, hwirq, irq) { | 52 | domain->revmap_type = revmap_type; |
31 | d = irq_get_irq_data(irq); | 53 | domain->ops = ops; |
32 | if (!d) { | 54 | domain->host_data = host_data; |
33 | WARN(1, "error: assigning domain to non existant irq_desc"); | 55 | domain->of_node = of_node_get(of_node); |
34 | return; | 56 | |
35 | } | 57 | return domain; |
36 | if (d->domain) { | 58 | } |
37 | /* things are broken; just report, don't clean up */ | 59 | |
38 | WARN(1, "error: irq_desc already assigned to a domain"); | 60 | static void irq_domain_add(struct irq_domain *domain) |
39 | return; | 61 | { |
62 | mutex_lock(&irq_domain_mutex); | ||
63 | list_add(&domain->link, &irq_domain_list); | ||
64 | mutex_unlock(&irq_domain_mutex); | ||
65 | pr_debug("irq: Allocated domain of type %d @0x%p\n", | ||
66 | domain->revmap_type, domain); | ||
67 | } | ||
68 | |||
69 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | ||
70 | irq_hw_number_t hwirq) | ||
71 | { | ||
72 | irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; | ||
73 | int size = domain->revmap_data.legacy.size; | ||
74 | |||
75 | if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) | ||
76 | return 0; | ||
77 | return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; | ||
78 | } | ||
79 | |||
80 | /** | ||
81 | * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. | ||
82 | * @of_node: pointer to interrupt controller's device tree node. | ||
83 | * @size: total number of irqs in legacy mapping | ||
84 | * @first_irq: first number of irq block assigned to the domain | ||
85 | * @first_hwirq: first hwirq number to use for the translation. Should normally | ||
86 | * be '0', but a positive integer can be used if the effective | ||
87 | * hwirqs numbering does not begin at zero. | ||
88 | * @ops: map/unmap domain callbacks | ||
89 | * @host_data: Controller private data pointer | ||
90 | * | ||
91 | * Note: the map() callback will be called before this function returns | ||
92 | * for all legacy interrupts except 0 (which is always the invalid irq for | ||
93 | * a legacy controller). | ||
94 | */ | ||
95 | struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | ||
96 | unsigned int size, | ||
97 | unsigned int first_irq, | ||
98 | irq_hw_number_t first_hwirq, | ||
99 | const struct irq_domain_ops *ops, | ||
100 | void *host_data) | ||
101 | { | ||
102 | struct irq_domain *domain; | ||
103 | unsigned int i; | ||
104 | |||
105 | domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); | ||
106 | if (!domain) | ||
107 | return NULL; | ||
108 | |||
109 | domain->revmap_data.legacy.first_irq = first_irq; | ||
110 | domain->revmap_data.legacy.first_hwirq = first_hwirq; | ||
111 | domain->revmap_data.legacy.size = size; | ||
112 | |||
113 | mutex_lock(&irq_domain_mutex); | ||
114 | /* Verify that all the irqs are available */ | ||
115 | for (i = 0; i < size; i++) { | ||
116 | int irq = first_irq + i; | ||
117 | struct irq_data *irq_data = irq_get_irq_data(irq); | ||
118 | |||
119 | if (WARN_ON(!irq_data || irq_data->domain)) { | ||
120 | mutex_unlock(&irq_domain_mutex); | ||
121 | of_node_put(domain->of_node); | ||
122 | kfree(domain); | ||
123 | return NULL; | ||
40 | } | 124 | } |
41 | d->domain = domain; | ||
42 | d->hwirq = hwirq; | ||
43 | } | 125 | } |
44 | 126 | ||
45 | mutex_lock(&irq_domain_mutex); | 127 | /* Claim all of the irqs before registering a legacy domain */ |
46 | list_add(&domain->list, &irq_domain_list); | 128 | for (i = 0; i < size; i++) { |
129 | struct irq_data *irq_data = irq_get_irq_data(first_irq + i); | ||
130 | irq_data->hwirq = first_hwirq + i; | ||
131 | irq_data->domain = domain; | ||
132 | } | ||
47 | mutex_unlock(&irq_domain_mutex); | 133 | mutex_unlock(&irq_domain_mutex); |
134 | |||
135 | for (i = 0; i < size; i++) { | ||
136 | int irq = first_irq + i; | ||
137 | int hwirq = first_hwirq + i; | ||
138 | |||
139 | /* IRQ0 gets ignored */ | ||
140 | if (!irq) | ||
141 | continue; | ||
142 | |||
143 | /* Legacy flags are left to default at this point, | ||
144 | * one can then use irq_create_mapping() to | ||
145 | * explicitly change them | ||
146 | */ | ||
147 | ops->map(domain, irq, hwirq); | ||
148 | |||
149 | /* Clear norequest flags */ | ||
150 | irq_clear_status_flags(irq, IRQ_NOREQUEST); | ||
151 | } | ||
152 | |||
153 | irq_domain_add(domain); | ||
154 | return domain; | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. | ||
159 | * @of_node: pointer to interrupt controller's device tree node. | ||
160 | * @ops: map/unmap domain callbacks | ||
161 | * @host_data: Controller private data pointer | ||
162 | */ | ||
163 | struct irq_domain *irq_domain_add_linear(struct device_node *of_node, | ||
164 | unsigned int size, | ||
165 | const struct irq_domain_ops *ops, | ||
166 | void *host_data) | ||
167 | { | ||
168 | struct irq_domain *domain; | ||
169 | unsigned int *revmap; | ||
170 | |||
171 | revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); | ||
172 | if (WARN_ON(!revmap)) | ||
173 | return NULL; | ||
174 | |||
175 | domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); | ||
176 | if (!domain) { | ||
177 | kfree(revmap); | ||
178 | return NULL; | ||
179 | } | ||
180 | domain->revmap_data.linear.size = size; | ||
181 | domain->revmap_data.linear.revmap = revmap; | ||
182 | irq_domain_add(domain); | ||
183 | return domain; | ||
184 | } | ||
185 | |||
186 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | ||
187 | const struct irq_domain_ops *ops, | ||
188 | void *host_data) | ||
189 | { | ||
190 | struct irq_domain *domain = irq_domain_alloc(of_node, | ||
191 | IRQ_DOMAIN_MAP_NOMAP, ops, host_data); | ||
192 | if (domain) | ||
193 | irq_domain_add(domain); | ||
194 | return domain; | ||
195 | } | ||
196 | |||
197 | /** | ||
198 | * irq_domain_add_tree() | ||
199 | * @of_node: pointer to interrupt controller's device tree node. | ||
200 | * @ops: map/unmap domain callbacks | ||
201 | * | ||
202 | * Note: The radix tree will be allocated later during boot automatically | ||
203 | * (the reverse mapping will use the slow path until that happens). | ||
204 | */ | ||
205 | struct irq_domain *irq_domain_add_tree(struct device_node *of_node, | ||
206 | const struct irq_domain_ops *ops, | ||
207 | void *host_data) | ||
208 | { | ||
209 | struct irq_domain *domain = irq_domain_alloc(of_node, | ||
210 | IRQ_DOMAIN_MAP_TREE, ops, host_data); | ||
211 | if (domain) { | ||
212 | INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); | ||
213 | irq_domain_add(domain); | ||
214 | } | ||
215 | return domain; | ||
48 | } | 216 | } |
49 | 217 | ||
50 | /** | 218 | /** |
51 | * irq_domain_del() - Unregister an irq_domain | 219 | * irq_find_host() - Locates a domain for a given device node |
52 | * @domain: ptr to registered irq_domain. | 220 | * @node: device-tree node of the interrupt controller |
53 | */ | 221 | */ |
54 | void irq_domain_del(struct irq_domain *domain) | 222 | struct irq_domain *irq_find_host(struct device_node *node) |
55 | { | 223 | { |
56 | struct irq_data *d; | 224 | struct irq_domain *h, *found = NULL; |
57 | int hwirq, irq; | 225 | int rc; |
58 | 226 | ||
227 | /* We might want to match the legacy controller last since | ||
228 | * it might potentially be set to match all interrupts in | ||
229 | * the absence of a device node. This isn't a problem so far | ||
230 | * yet though... | ||
231 | */ | ||
59 | mutex_lock(&irq_domain_mutex); | 232 | mutex_lock(&irq_domain_mutex); |
60 | list_del(&domain->list); | 233 | list_for_each_entry(h, &irq_domain_list, link) { |
234 | if (h->ops->match) | ||
235 | rc = h->ops->match(h, node); | ||
236 | else | ||
237 | rc = (h->of_node != NULL) && (h->of_node == node); | ||
238 | |||
239 | if (rc) { | ||
240 | found = h; | ||
241 | break; | ||
242 | } | ||
243 | } | ||
61 | mutex_unlock(&irq_domain_mutex); | 244 | mutex_unlock(&irq_domain_mutex); |
245 | return found; | ||
246 | } | ||
247 | EXPORT_SYMBOL_GPL(irq_find_host); | ||
248 | |||
249 | /** | ||
250 | * irq_set_default_host() - Set a "default" irq domain | ||
251 | * @domain: default domain pointer | ||
252 | * | ||
253 | * For convenience, it's possible to set a "default" domain that will be used | ||
254 | * whenever NULL is passed to irq_create_mapping(). It makes life easier for | ||
255 | * platforms that want to manipulate a few hard coded interrupt numbers that | ||
256 | * aren't properly represented in the device-tree. | ||
257 | */ | ||
258 | void irq_set_default_host(struct irq_domain *domain) | ||
259 | { | ||
260 | pr_debug("irq: Default domain set to @0x%p\n", domain); | ||
261 | |||
262 | irq_default_domain = domain; | ||
263 | } | ||
264 | |||
265 | /** | ||
266 | * irq_set_virq_count() - Set the maximum number of linux irqs | ||
267 | * @count: number of linux irqs, capped with NR_IRQS | ||
268 | * | ||
269 | * This is mainly for use by platforms like iSeries who want to program | ||
270 | * the virtual irq number in the controller to avoid the reverse mapping | ||
271 | */ | ||
272 | void irq_set_virq_count(unsigned int count) | ||
273 | { | ||
274 | pr_debug("irq: Trying to set virq count to %d\n", count); | ||
62 | 275 | ||
63 | /* Clear the irq_domain assignments */ | 276 | BUG_ON(count < NUM_ISA_INTERRUPTS); |
64 | irq_domain_for_each_irq(domain, hwirq, irq) { | 277 | if (count < NR_IRQS) |
65 | d = irq_get_irq_data(irq); | 278 | irq_virq_count = count; |
66 | d->domain = NULL; | 279 | } |
280 | |||
281 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | ||
282 | irq_hw_number_t hwirq) | ||
283 | { | ||
284 | struct irq_data *irq_data = irq_get_irq_data(virq); | ||
285 | |||
286 | irq_data->hwirq = hwirq; | ||
287 | irq_data->domain = domain; | ||
288 | if (domain->ops->map(domain, virq, hwirq)) { | ||
289 | pr_debug("irq: -> mapping failed, freeing\n"); | ||
290 | irq_data->domain = NULL; | ||
291 | irq_data->hwirq = 0; | ||
292 | return -1; | ||
67 | } | 293 | } |
294 | |||
295 | irq_clear_status_flags(virq, IRQ_NOREQUEST); | ||
296 | |||
297 | return 0; | ||
68 | } | 298 | } |
69 | 299 | ||
70 | #if defined(CONFIG_OF_IRQ) | ||
71 | /** | 300 | /** |
72 | * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec | 301 | * irq_create_direct_mapping() - Allocate an irq for direct mapping |
302 | * @domain: domain to allocate the irq for or NULL for default domain | ||
73 | * | 303 | * |
74 | * Used by the device tree interrupt mapping code to translate a device tree | 304 | * This routine is used for irq controllers which can choose the hardware |
75 | * interrupt specifier to a valid linux irq number. Returns either a valid | 305 | * interrupt numbers they generate. In such a case it's simplest to use |
76 | * linux IRQ number or 0. | 306 | * the linux irq as the hardware interrupt number. |
307 | */ | ||
308 | unsigned int irq_create_direct_mapping(struct irq_domain *domain) | ||
309 | { | ||
310 | unsigned int virq; | ||
311 | |||
312 | if (domain == NULL) | ||
313 | domain = irq_default_domain; | ||
314 | |||
315 | BUG_ON(domain == NULL); | ||
316 | WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); | ||
317 | |||
318 | virq = irq_alloc_desc_from(1, 0); | ||
319 | if (!virq) { | ||
320 | pr_debug("irq: create_direct virq allocation failed\n"); | ||
321 | return 0; | ||
322 | } | ||
323 | if (virq >= irq_virq_count) { | ||
324 | pr_err("ERROR: no free irqs available below %i maximum\n", | ||
325 | irq_virq_count); | ||
326 | irq_free_desc(virq); | ||
327 | return 0; | ||
328 | } | ||
329 | |||
330 | pr_debug("irq: create_direct obtained virq %d\n", virq); | ||
331 | |||
332 | if (irq_setup_virq(domain, virq, virq)) { | ||
333 | irq_free_desc(virq); | ||
334 | return 0; | ||
335 | } | ||
336 | |||
337 | return virq; | ||
338 | } | ||
339 | |||
340 | /** | ||
341 | * irq_create_mapping() - Map a hardware interrupt into linux irq space | ||
342 | * @domain: domain owning this hardware interrupt or NULL for default domain | ||
343 | * @hwirq: hardware irq number in that domain space | ||
77 | * | 344 | * |
78 | * When the caller no longer need the irq number returned by this function it | 345 | * Only one mapping per hardware interrupt is permitted. Returns a linux |
79 | * should arrange to call irq_dispose_mapping(). | 346 | * irq number. |
347 | * If the sense/trigger is to be specified, set_irq_type() should be called | ||
348 | * on the number returned from that call. | ||
80 | */ | 349 | */ |
350 | unsigned int irq_create_mapping(struct irq_domain *domain, | ||
351 | irq_hw_number_t hwirq) | ||
352 | { | ||
353 | unsigned int virq, hint; | ||
354 | |||
355 | pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | ||
356 | |||
357 | /* Look for default domain if necessary */ | ||
358 | if (domain == NULL) | ||
359 | domain = irq_default_domain; | ||
360 | if (domain == NULL) { | ||
361 | printk(KERN_WARNING "irq_create_mapping called for" | ||
362 | " NULL domain, hwirq=%lx\n", hwirq); | ||
363 | WARN_ON(1); | ||
364 | return 0; | ||
365 | } | ||
366 | pr_debug("irq: -> using domain @%p\n", domain); | ||
367 | |||
368 | /* Check if mapping already exists */ | ||
369 | virq = irq_find_mapping(domain, hwirq); | ||
370 | if (virq) { | ||
371 | pr_debug("irq: -> existing mapping on virq %d\n", virq); | ||
372 | return virq; | ||
373 | } | ||
374 | |||
375 | /* Get a virtual interrupt number */ | ||
376 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) | ||
377 | return irq_domain_legacy_revmap(domain, hwirq); | ||
378 | |||
379 | /* Allocate a virtual interrupt number */ | ||
380 | hint = hwirq % irq_virq_count; | ||
381 | if (hint == 0) | ||
382 | hint++; | ||
383 | virq = irq_alloc_desc_from(hint, 0); | ||
384 | if (!virq) | ||
385 | virq = irq_alloc_desc_from(1, 0); | ||
386 | if (!virq) { | ||
387 | pr_debug("irq: -> virq allocation failed\n"); | ||
388 | return 0; | ||
389 | } | ||
390 | |||
391 | if (irq_setup_virq(domain, virq, hwirq)) { | ||
392 | if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) | ||
393 | irq_free_desc(virq); | ||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", | ||
398 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); | ||
399 | |||
400 | return virq; | ||
401 | } | ||
402 | EXPORT_SYMBOL_GPL(irq_create_mapping); | ||
403 | |||
81 | unsigned int irq_create_of_mapping(struct device_node *controller, | 404 | unsigned int irq_create_of_mapping(struct device_node *controller, |
82 | const u32 *intspec, unsigned int intsize) | 405 | const u32 *intspec, unsigned int intsize) |
83 | { | 406 | { |
84 | struct irq_domain *domain; | 407 | struct irq_domain *domain; |
85 | unsigned long hwirq; | 408 | irq_hw_number_t hwirq; |
86 | unsigned int irq, type; | 409 | unsigned int type = IRQ_TYPE_NONE; |
87 | int rc = -EINVAL; | 410 | unsigned int virq; |
88 | 411 | ||
89 | /* Find a domain which can translate the irq spec */ | 412 | domain = controller ? irq_find_host(controller) : irq_default_domain; |
90 | mutex_lock(&irq_domain_mutex); | 413 | if (!domain) { |
91 | list_for_each_entry(domain, &irq_domain_list, list) { | 414 | #ifdef CONFIG_MIPS |
92 | if (!domain->ops->dt_translate) | 415 | /* |
93 | continue; | 416 | * Workaround to avoid breaking interrupt controller drivers |
94 | rc = domain->ops->dt_translate(domain, controller, | 417 | * that don't yet register an irq_domain. This is temporary |
95 | intspec, intsize, &hwirq, &type); | 418 | * code. ~~~gcl, Feb 24, 2012 |
96 | if (rc == 0) | 419 | * |
97 | break; | 420 | * Scheduled for removal in Linux v3.6. That should be enough |
421 | * time. | ||
422 | */ | ||
423 | if (intsize > 0) | ||
424 | return intspec[0]; | ||
425 | #endif | ||
426 | printk(KERN_WARNING "irq: no irq domain found for %s !\n", | ||
427 | controller->full_name); | ||
428 | return 0; | ||
98 | } | 429 | } |
99 | mutex_unlock(&irq_domain_mutex); | ||
100 | 430 | ||
101 | if (rc != 0) | 431 | /* If domain has no translation, then we assume interrupt line */ |
102 | return 0; | 432 | if (domain->ops->xlate == NULL) |
433 | hwirq = intspec[0]; | ||
434 | else { | ||
435 | if (domain->ops->xlate(domain, controller, intspec, intsize, | ||
436 | &hwirq, &type)) | ||
437 | return 0; | ||
438 | } | ||
439 | |||
440 | /* Create mapping */ | ||
441 | virq = irq_create_mapping(domain, hwirq); | ||
442 | if (!virq) | ||
443 | return virq; | ||
103 | 444 | ||
104 | irq = irq_domain_to_irq(domain, hwirq); | 445 | /* Set type if specified and different than the current one */ |
105 | if (type != IRQ_TYPE_NONE) | 446 | if (type != IRQ_TYPE_NONE && |
106 | irq_set_irq_type(irq, type); | 447 | type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) |
107 | pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", | 448 | irq_set_irq_type(virq, type); |
108 | controller->full_name, (int)hwirq, irq, type); | 449 | return virq; |
109 | return irq; | ||
110 | } | 450 | } |
111 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); | 451 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); |
112 | 452 | ||
113 | /** | 453 | /** |
114 | * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() | 454 | * irq_dispose_mapping() - Unmap an interrupt |
115 | * @irq: linux irq number to be discarded | 455 | * @virq: linux irq number of the interrupt to unmap |
456 | */ | ||
457 | void irq_dispose_mapping(unsigned int virq) | ||
458 | { | ||
459 | struct irq_data *irq_data = irq_get_irq_data(virq); | ||
460 | struct irq_domain *domain; | ||
461 | irq_hw_number_t hwirq; | ||
462 | |||
463 | if (!virq || !irq_data) | ||
464 | return; | ||
465 | |||
466 | domain = irq_data->domain; | ||
467 | if (WARN_ON(domain == NULL)) | ||
468 | return; | ||
469 | |||
470 | /* Never unmap legacy interrupts */ | ||
471 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) | ||
472 | return; | ||
473 | |||
474 | irq_set_status_flags(virq, IRQ_NOREQUEST); | ||
475 | |||
476 | /* remove chip and handler */ | ||
477 | irq_set_chip_and_handler(virq, NULL, NULL); | ||
478 | |||
479 | /* Make sure it's completed */ | ||
480 | synchronize_irq(virq); | ||
481 | |||
482 | /* Tell the PIC about it */ | ||
483 | if (domain->ops->unmap) | ||
484 | domain->ops->unmap(domain, virq); | ||
485 | smp_mb(); | ||
486 | |||
487 | /* Clear reverse map */ | ||
488 | hwirq = irq_data->hwirq; | ||
489 | switch(domain->revmap_type) { | ||
490 | case IRQ_DOMAIN_MAP_LINEAR: | ||
491 | if (hwirq < domain->revmap_data.linear.size) | ||
492 | domain->revmap_data.linear.revmap[hwirq] = 0; | ||
493 | break; | ||
494 | case IRQ_DOMAIN_MAP_TREE: | ||
495 | mutex_lock(&revmap_trees_mutex); | ||
496 | radix_tree_delete(&domain->revmap_data.tree, hwirq); | ||
497 | mutex_unlock(&revmap_trees_mutex); | ||
498 | break; | ||
499 | } | ||
500 | |||
501 | irq_free_desc(virq); | ||
502 | } | ||
503 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | ||
504 | |||
505 | /** | ||
506 | * irq_find_mapping() - Find a linux irq from an hw irq number. | ||
507 | * @domain: domain owning this hardware interrupt | ||
508 | * @hwirq: hardware irq number in that domain space | ||
509 | * | ||
510 | * This is a slow path, for use by generic code. It's expected that an | ||
511 | * irq controller implementation directly calls the appropriate low level | ||
512 | * mapping function. | ||
513 | */ | ||
514 | unsigned int irq_find_mapping(struct irq_domain *domain, | ||
515 | irq_hw_number_t hwirq) | ||
516 | { | ||
517 | unsigned int i; | ||
518 | unsigned int hint = hwirq % irq_virq_count; | ||
519 | |||
520 | /* Look for default domain if necessary */ | ||
521 | if (domain == NULL) | ||
522 | domain = irq_default_domain; | ||
523 | if (domain == NULL) | ||
524 | return 0; | ||
525 | |||
526 | /* legacy -> bail early */ | ||
527 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) | ||
528 | return irq_domain_legacy_revmap(domain, hwirq); | ||
529 | |||
530 | /* Slow path does a linear search of the map */ | ||
531 | if (hint == 0) | ||
532 | hint = 1; | ||
533 | i = hint; | ||
534 | do { | ||
535 | struct irq_data *data = irq_get_irq_data(i); | ||
536 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) | ||
537 | return i; | ||
538 | i++; | ||
539 | if (i >= irq_virq_count) | ||
540 | i = 1; | ||
541 | } while(i != hint); | ||
542 | return 0; | ||
543 | } | ||
544 | EXPORT_SYMBOL_GPL(irq_find_mapping); | ||
545 | |||
546 | /** | ||
547 | * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. | ||
548 | * @domain: domain owning this hardware interrupt | ||
549 | * @hwirq: hardware irq number in that domain space | ||
116 | * | 550 | * |
117 | * Calling this function indicates the caller no longer needs a reference to | 551 | * This is a fast path, for use by irq controller code that uses radix tree |
118 | * the linux irq number returned by a prior call to irq_create_of_mapping(). | 552 | * revmaps |
119 | */ | 553 | */ |
120 | void irq_dispose_mapping(unsigned int irq) | 554 | unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, |
555 | irq_hw_number_t hwirq) | ||
121 | { | 556 | { |
557 | struct irq_data *irq_data; | ||
558 | |||
559 | if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) | ||
560 | return irq_find_mapping(domain, hwirq); | ||
561 | |||
562 | /* | ||
563 | * Freeing an irq can delete nodes along the path to | ||
564 | * do the lookup via call_rcu. | ||
565 | */ | ||
566 | rcu_read_lock(); | ||
567 | irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); | ||
568 | rcu_read_unlock(); | ||
569 | |||
122 | /* | 570 | /* |
123 | * nothing yet; will be filled when support for dynamic allocation of | 571 | * If found in radix tree, then fine. |
124 | * irq_descs is added to irq_domain | 572 | * Else fallback to linear lookup - this should not happen in practice |
573 | * as it means that we failed to insert the node in the radix tree. | ||
125 | */ | 574 | */ |
575 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); | ||
126 | } | 576 | } |
127 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | ||
128 | 577 | ||
129 | int irq_domain_simple_dt_translate(struct irq_domain *d, | 578 | /** |
130 | struct device_node *controller, | 579 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. |
131 | const u32 *intspec, unsigned int intsize, | 580 | * @domain: domain owning this hardware interrupt |
132 | unsigned long *out_hwirq, unsigned int *out_type) | 581 | * @virq: linux irq number |
582 | * @hwirq: hardware irq number in that domain space | ||
583 | * | ||
584 | * This is for use by irq controllers that use a radix tree reverse | ||
585 | * mapping for fast lookup. | ||
586 | */ | ||
587 | void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, | ||
588 | irq_hw_number_t hwirq) | ||
133 | { | 589 | { |
134 | if (d->of_node != controller) | 590 | struct irq_data *irq_data = irq_get_irq_data(virq); |
135 | return -EINVAL; | 591 | |
136 | if (intsize < 1) | 592 | if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) |
137 | return -EINVAL; | 593 | return; |
138 | if (d->nr_irq && ((intspec[0] < d->hwirq_base) || | 594 | |
139 | (intspec[0] >= d->hwirq_base + d->nr_irq))) | 595 | if (virq) { |
140 | return -EINVAL; | 596 | mutex_lock(&revmap_trees_mutex); |
597 | radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); | ||
598 | mutex_unlock(&revmap_trees_mutex); | ||
599 | } | ||
600 | } | ||
601 | |||
602 | /** | ||
603 | * irq_linear_revmap() - Find a linux irq from a hw irq number. | ||
604 | * @domain: domain owning this hardware interrupt | ||
605 | * @hwirq: hardware irq number in that domain space | ||
606 | * | ||
607 | * This is a fast path, for use by irq controller code that uses linear | ||
608 | * revmaps. It does fallback to the slow path if the revmap doesn't exist | ||
609 | * yet and will create the revmap entry with appropriate locking | ||
610 | */ | ||
611 | unsigned int irq_linear_revmap(struct irq_domain *domain, | ||
612 | irq_hw_number_t hwirq) | ||
613 | { | ||
614 | unsigned int *revmap; | ||
615 | |||
616 | if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) | ||
617 | return irq_find_mapping(domain, hwirq); | ||
618 | |||
619 | /* Check revmap bounds */ | ||
620 | if (unlikely(hwirq >= domain->revmap_data.linear.size)) | ||
621 | return irq_find_mapping(domain, hwirq); | ||
622 | |||
623 | /* Check if revmap was allocated */ | ||
624 | revmap = domain->revmap_data.linear.revmap; | ||
625 | if (unlikely(revmap == NULL)) | ||
626 | return irq_find_mapping(domain, hwirq); | ||
627 | |||
628 | /* Fill up revmap with slow path if no mapping found */ | ||
629 | if (unlikely(!revmap[hwirq])) | ||
630 | revmap[hwirq] = irq_find_mapping(domain, hwirq); | ||
631 | |||
632 | return revmap[hwirq]; | ||
633 | } | ||
634 | |||
635 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG | ||
636 | static int virq_debug_show(struct seq_file *m, void *private) | ||
637 | { | ||
638 | unsigned long flags; | ||
639 | struct irq_desc *desc; | ||
640 | const char *p; | ||
641 | static const char none[] = "none"; | ||
642 | void *data; | ||
643 | int i; | ||
644 | |||
645 | seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", | ||
646 | "chip name", "chip data", "domain name"); | ||
647 | |||
648 | for (i = 1; i < nr_irqs; i++) { | ||
649 | desc = irq_to_desc(i); | ||
650 | if (!desc) | ||
651 | continue; | ||
652 | |||
653 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
654 | |||
655 | if (desc->action && desc->action->handler) { | ||
656 | struct irq_chip *chip; | ||
657 | |||
658 | seq_printf(m, "%5d ", i); | ||
659 | seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); | ||
660 | |||
661 | chip = irq_desc_get_chip(desc); | ||
662 | if (chip && chip->name) | ||
663 | p = chip->name; | ||
664 | else | ||
665 | p = none; | ||
666 | seq_printf(m, "%-15s ", p); | ||
667 | |||
668 | data = irq_desc_get_chip_data(desc); | ||
669 | seq_printf(m, "0x%16p ", data); | ||
670 | |||
671 | if (desc->irq_data.domain && desc->irq_data.domain->of_node) | ||
672 | p = desc->irq_data.domain->of_node->full_name; | ||
673 | else | ||
674 | p = none; | ||
675 | seq_printf(m, "%s\n", p); | ||
676 | } | ||
677 | |||
678 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
679 | } | ||
680 | |||
681 | return 0; | ||
682 | } | ||
141 | 683 | ||
684 | static int virq_debug_open(struct inode *inode, struct file *file) | ||
685 | { | ||
686 | return single_open(file, virq_debug_show, inode->i_private); | ||
687 | } | ||
688 | |||
689 | static const struct file_operations virq_debug_fops = { | ||
690 | .open = virq_debug_open, | ||
691 | .read = seq_read, | ||
692 | .llseek = seq_lseek, | ||
693 | .release = single_release, | ||
694 | }; | ||
695 | |||
696 | static int __init irq_debugfs_init(void) | ||
697 | { | ||
698 | if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL, | ||
699 | NULL, &virq_debug_fops) == NULL) | ||
700 | return -ENOMEM; | ||
701 | |||
702 | return 0; | ||
703 | } | ||
704 | __initcall(irq_debugfs_init); | ||
705 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ | ||
706 | |||
707 | int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, | ||
708 | irq_hw_number_t hwirq) | ||
709 | { | ||
710 | return 0; | ||
711 | } | ||
712 | |||
713 | /** | ||
714 | * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings | ||
715 | * | ||
716 | * Device Tree IRQ specifier translation function which works with one cell | ||
717 | * bindings where the cell value maps directly to the hwirq number. | ||
718 | */ | ||
719 | int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, | ||
720 | const u32 *intspec, unsigned int intsize, | ||
721 | unsigned long *out_hwirq, unsigned int *out_type) | ||
722 | { | ||
723 | if (WARN_ON(intsize < 1)) | ||
724 | return -EINVAL; | ||
142 | *out_hwirq = intspec[0]; | 725 | *out_hwirq = intspec[0]; |
143 | *out_type = IRQ_TYPE_NONE; | 726 | *out_type = IRQ_TYPE_NONE; |
144 | if (intsize > 1) | ||
145 | *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; | ||
146 | return 0; | 727 | return 0; |
147 | } | 728 | } |
729 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); | ||
148 | 730 | ||
149 | /** | 731 | /** |
150 | * irq_domain_create_simple() - Set up a 'simple' translation range | 732 | * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings |
733 | * | ||
734 | * Device Tree IRQ specifier translation function which works with two cell | ||
735 | * bindings where the cell values map directly to the hwirq number | ||
736 | * and linux irq flags. | ||
151 | */ | 737 | */ |
152 | void irq_domain_add_simple(struct device_node *controller, int irq_base) | 738 | int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, |
739 | const u32 *intspec, unsigned int intsize, | ||
740 | irq_hw_number_t *out_hwirq, unsigned int *out_type) | ||
153 | { | 741 | { |
154 | struct irq_domain *domain; | 742 | if (WARN_ON(intsize < 2)) |
155 | 743 | return -EINVAL; | |
156 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); | 744 | *out_hwirq = intspec[0]; |
157 | if (!domain) { | 745 | *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; |
158 | WARN_ON(1); | 746 | return 0; |
159 | return; | 747 | } |
160 | } | 748 | EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); |
161 | 749 | ||
162 | domain->irq_base = irq_base; | 750 | /** |
163 | domain->of_node = of_node_get(controller); | 751 | * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings |
164 | domain->ops = &irq_domain_simple_ops; | 752 | * |
165 | irq_domain_add(domain); | 753 | * Device Tree IRQ specifier translation function which works with either one |
754 | * or two cell bindings where the cell values map directly to the hwirq number | ||
755 | * and linux irq flags. | ||
756 | * | ||
757 | * Note: don't use this function unless your interrupt controller explicitly | ||
758 | * supports both one and two cell bindings. For the majority of controllers | ||
759 | * the _onecell() or _twocell() variants above should be used. | ||
760 | */ | ||
761 | int irq_domain_xlate_onetwocell(struct irq_domain *d, | ||
762 | struct device_node *ctrlr, | ||
763 | const u32 *intspec, unsigned int intsize, | ||
764 | unsigned long *out_hwirq, unsigned int *out_type) | ||
765 | { | ||
766 | if (WARN_ON(intsize < 1)) | ||
767 | return -EINVAL; | ||
768 | *out_hwirq = intspec[0]; | ||
769 | *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; | ||
770 | return 0; | ||
166 | } | 771 | } |
167 | EXPORT_SYMBOL_GPL(irq_domain_add_simple); | 772 | EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); |
168 | 773 | ||
774 | const struct irq_domain_ops irq_domain_simple_ops = { | ||
775 | .map = irq_domain_simple_map, | ||
776 | .xlate = irq_domain_xlate_onetwocell, | ||
777 | }; | ||
778 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
779 | |||
780 | #ifdef CONFIG_OF_IRQ | ||
169 | void irq_domain_generate_simple(const struct of_device_id *match, | 781 | void irq_domain_generate_simple(const struct of_device_id *match, |
170 | u64 phys_base, unsigned int irq_start) | 782 | u64 phys_base, unsigned int irq_start) |
171 | { | 783 | { |
172 | struct device_node *node; | 784 | struct device_node *node; |
173 | pr_info("looking for phys_base=%llx, irq_start=%i\n", | 785 | pr_debug("looking for phys_base=%llx, irq_start=%i\n", |
174 | (unsigned long long) phys_base, (int) irq_start); | 786 | (unsigned long long) phys_base, (int) irq_start); |
175 | node = of_find_matching_node_by_address(NULL, match, phys_base); | 787 | node = of_find_matching_node_by_address(NULL, match, phys_base); |
176 | if (node) | 788 | if (node) |
177 | irq_domain_add_simple(node, irq_start); | 789 | irq_domain_add_legacy(node, 32, irq_start, 0, |
178 | else | 790 | &irq_domain_simple_ops, NULL); |
179 | pr_info("no node found\n"); | ||
180 | } | 791 | } |
181 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); | 792 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); |
182 | #endif /* CONFIG_OF_IRQ */ | 793 | #endif |
183 | |||
184 | struct irq_domain_ops irq_domain_simple_ops = { | ||
185 | #ifdef CONFIG_OF_IRQ | ||
186 | .dt_translate = irq_domain_simple_dt_translate, | ||
187 | #endif /* CONFIG_OF_IRQ */ | ||
188 | }; | ||
189 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
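For context, a minimal sketch of how an interrupt controller driver would consume the generic xlate helpers exported above, assuming the 3.4-era irq_domain API; my_intc_chip, irq_base and node stand in for driver-provided values and are not part of this patch:

static int my_intc_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	/* my_intc_chip is an assumed struct irq_chip for this controller */
	irq_set_chip_and_handler(virq, &my_intc_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops my_intc_ops = {
	.map	= my_intc_map,
	.xlate	= irq_domain_xlate_twocell,	/* "interrupts = <hwirq flags>;" */
};

/* 32 hwirqs mapped onto pre-allocated linux irqs starting at irq_base */
domain = irq_domain_add_legacy(node, 32, irq_base, 0, &my_intc_ops, NULL);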
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a9a9dbe49fea..89a3ea82569b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -282,7 +282,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
282 | { | 282 | { |
283 | struct irq_chip *chip = irq_desc_get_chip(desc); | 283 | struct irq_chip *chip = irq_desc_get_chip(desc); |
284 | struct cpumask *set = irq_default_affinity; | 284 | struct cpumask *set = irq_default_affinity; |
285 | int ret; | 285 | int ret, node = desc->irq_data.node; |
286 | 286 | ||
287 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | 287 | /* Excludes PER_CPU and NO_BALANCE interrupts */ |
288 | if (!irq_can_set_affinity(irq)) | 288 | if (!irq_can_set_affinity(irq)) |
@@ -301,6 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
301 | } | 301 | } |
302 | 302 | ||
303 | cpumask_and(mask, cpu_online_mask, set); | 303 | cpumask_and(mask, cpu_online_mask, set); |
304 | if (node != NUMA_NO_NODE) { | ||
305 | const struct cpumask *nodemask = cpumask_of_node(node); | ||
306 | |||
307 | /* make sure at least one of the cpus in nodemask is online */ | ||
308 | if (cpumask_intersects(mask, nodemask)) | ||
309 | cpumask_and(mask, mask, nodemask); | ||
310 | } | ||
304 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); | 311 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); |
305 | switch (ret) { | 312 | switch (ret) { |
306 | case IRQ_SET_MASK_OK: | 313 | case IRQ_SET_MASK_OK: |
@@ -645,7 +652,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
645 | * is marked MASKED. | 652 | * is marked MASKED. |
646 | */ | 653 | */ |
647 | static void irq_finalize_oneshot(struct irq_desc *desc, | 654 | static void irq_finalize_oneshot(struct irq_desc *desc, |
648 | struct irqaction *action, bool force) | 655 | struct irqaction *action) |
649 | { | 656 | { |
650 | if (!(desc->istate & IRQS_ONESHOT)) | 657 | if (!(desc->istate & IRQS_ONESHOT)) |
651 | return; | 658 | return; |
@@ -679,7 +686,7 @@ again: | |||
679 | * we would clear the threads_oneshot bit of this thread which | 686 | * we would clear the threads_oneshot bit of this thread which |
680 | * was just set. | 687 | * was just set. |
681 | */ | 688 | */ |
682 | if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | 689 | if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) |
683 | goto out_unlock; | 690 | goto out_unlock; |
684 | 691 | ||
685 | desc->threads_oneshot &= ~action->thread_mask; | 692 | desc->threads_oneshot &= ~action->thread_mask; |
@@ -739,7 +746,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | |||
739 | 746 | ||
740 | local_bh_disable(); | 747 | local_bh_disable(); |
741 | ret = action->thread_fn(action->irq, action->dev_id); | 748 | ret = action->thread_fn(action->irq, action->dev_id); |
742 | irq_finalize_oneshot(desc, action, false); | 749 | irq_finalize_oneshot(desc, action); |
743 | local_bh_enable(); | 750 | local_bh_enable(); |
744 | return ret; | 751 | return ret; |
745 | } | 752 | } |
@@ -755,10 +762,17 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, | |||
755 | irqreturn_t ret; | 762 | irqreturn_t ret; |
756 | 763 | ||
757 | ret = action->thread_fn(action->irq, action->dev_id); | 764 | ret = action->thread_fn(action->irq, action->dev_id); |
758 | irq_finalize_oneshot(desc, action, false); | 765 | irq_finalize_oneshot(desc, action); |
759 | return ret; | 766 | return ret; |
760 | } | 767 | } |
761 | 768 | ||
769 | static void wake_threads_waitq(struct irq_desc *desc) | ||
770 | { | ||
771 | if (atomic_dec_and_test(&desc->threads_active) && | ||
772 | waitqueue_active(&desc->wait_for_threads)) | ||
773 | wake_up(&desc->wait_for_threads); | ||
774 | } | ||
775 | |||
762 | /* | 776 | /* |
763 | * Interrupt handler thread | 777 | * Interrupt handler thread |
764 | */ | 778 | */ |
@@ -771,57 +785,41 @@ static int irq_thread(void *data) | |||
771 | struct irq_desc *desc = irq_to_desc(action->irq); | 785 | struct irq_desc *desc = irq_to_desc(action->irq); |
772 | irqreturn_t (*handler_fn)(struct irq_desc *desc, | 786 | irqreturn_t (*handler_fn)(struct irq_desc *desc, |
773 | struct irqaction *action); | 787 | struct irqaction *action); |
774 | int wake; | ||
775 | 788 | ||
776 | if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, | 789 | if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, |
777 | &action->thread_flags)) | 790 | &action->thread_flags)) |
778 | handler_fn = irq_forced_thread_fn; | 791 | handler_fn = irq_forced_thread_fn; |
779 | else | 792 | else |
780 | handler_fn = irq_thread_fn; | 793 | handler_fn = irq_thread_fn; |
781 | 794 | ||
782 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 795 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
783 | current->irqaction = action; | 796 | current->irq_thread = 1; |
784 | 797 | ||
785 | while (!irq_wait_for_interrupt(action)) { | 798 | while (!irq_wait_for_interrupt(action)) { |
799 | irqreturn_t action_ret; | ||
786 | 800 | ||
787 | irq_thread_check_affinity(desc, action); | 801 | irq_thread_check_affinity(desc, action); |
788 | 802 | ||
789 | atomic_inc(&desc->threads_active); | 803 | action_ret = handler_fn(desc, action); |
804 | if (!noirqdebug) | ||
805 | note_interrupt(action->irq, desc, action_ret); | ||
790 | 806 | ||
791 | raw_spin_lock_irq(&desc->lock); | 807 | wake_threads_waitq(desc); |
792 | if (unlikely(irqd_irq_disabled(&desc->irq_data))) { | ||
793 | /* | ||
794 | * CHECKME: We might need a dedicated | ||
795 | * IRQ_THREAD_PENDING flag here, which | ||
796 | * retriggers the thread in check_irq_resend() | ||
797 | * but AFAICT IRQS_PENDING should be fine as it | ||
798 | * retriggers the interrupt itself --- tglx | ||
799 | */ | ||
800 | desc->istate |= IRQS_PENDING; | ||
801 | raw_spin_unlock_irq(&desc->lock); | ||
802 | } else { | ||
803 | irqreturn_t action_ret; | ||
804 | |||
805 | raw_spin_unlock_irq(&desc->lock); | ||
806 | action_ret = handler_fn(desc, action); | ||
807 | if (!noirqdebug) | ||
808 | note_interrupt(action->irq, desc, action_ret); | ||
809 | } | ||
810 | |||
811 | wake = atomic_dec_and_test(&desc->threads_active); | ||
812 | |||
813 | if (wake && waitqueue_active(&desc->wait_for_threads)) | ||
814 | wake_up(&desc->wait_for_threads); | ||
815 | } | 808 | } |
816 | 809 | ||
817 | /* Prevent a stale desc->threads_oneshot */ | ||
818 | irq_finalize_oneshot(desc, action, true); | ||
819 | |||
820 | /* | 810 | /* |
821 | * Clear irqaction. Otherwise exit_irq_thread() would make | 811 | * This is the regular exit path. __free_irq() is stopping the |
812 | * thread via kthread_stop() after calling | ||
813 | * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the | ||
814 | * oneshot mask bit can be set. We cannot verify that as we | ||
815 | * cannot touch the oneshot mask at this point anymore as | ||
816 | * __setup_irq() might have given out current's thread_mask | ||
817 | * again. | ||
818 | * | ||
819 | * Clear irq_thread. Otherwise exit_irq_thread() would make | ||
822 | * fuzz about an active irq thread going into nirvana. | 820 | * fuzz about an active irq thread going into nirvana. |
823 | */ | 821 | */ |
824 | current->irqaction = NULL; | 822 | current->irq_thread = 0; |
825 | return 0; | 823 | return 0; |
826 | } | 824 | } |
827 | 825 | ||
@@ -832,27 +830,28 @@ void exit_irq_thread(void) | |||
832 | { | 830 | { |
833 | struct task_struct *tsk = current; | 831 | struct task_struct *tsk = current; |
834 | struct irq_desc *desc; | 832 | struct irq_desc *desc; |
833 | struct irqaction *action; | ||
835 | 834 | ||
836 | if (!tsk->irqaction) | 835 | if (!tsk->irq_thread) |
837 | return; | 836 | return; |
838 | 837 | ||
838 | action = kthread_data(tsk); | ||
839 | |||
839 | printk(KERN_ERR | 840 | printk(KERN_ERR |
840 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | 841 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", |
841 | tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); | 842 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); |
842 | 843 | ||
843 | desc = irq_to_desc(tsk->irqaction->irq); | 844 | desc = irq_to_desc(action->irq); |
844 | 845 | ||
845 | /* | 846 | /* |
846 | * Prevent a stale desc->threads_oneshot. Must be called | 847 | * If IRQTF_RUNTHREAD is set, we need to decrement |
847 | * before setting the IRQTF_DIED flag. | 848 | * desc->threads_active and wake possible waiters. |
848 | */ | 849 | */ |
849 | irq_finalize_oneshot(desc, tsk->irqaction, true); | 850 | if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) |
851 | wake_threads_waitq(desc); | ||
850 | 852 | ||
851 | /* | 853 | /* Prevent a stale desc->threads_oneshot */ |
852 | * Set the THREAD DIED flag to prevent further wakeups of the | 854 | irq_finalize_oneshot(desc, action); |
853 | * soon to be gone threaded handler. | ||
854 | */ | ||
855 | set_bit(IRQTF_DIED, &tsk->irqaction->flags); | ||
856 | } | 855 | } |
857 | 856 | ||
858 | static void irq_setup_forced_threading(struct irqaction *new) | 857 | static void irq_setup_forced_threading(struct irqaction *new) |
@@ -985,6 +984,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
985 | 984 | ||
986 | /* add new interrupt at end of irq queue */ | 985 | /* add new interrupt at end of irq queue */ |
987 | do { | 986 | do { |
987 | /* | ||
988 | * Or all existing action->thread_mask bits, | ||
989 | * so we can find the next zero bit for this | ||
990 | * new action. | ||
991 | */ | ||
988 | thread_mask |= old->thread_mask; | 992 | thread_mask |= old->thread_mask; |
989 | old_ptr = &old->next; | 993 | old_ptr = &old->next; |
990 | old = *old_ptr; | 994 | old = *old_ptr; |
@@ -993,14 +997,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
993 | } | 997 | } |
994 | 998 | ||
995 | /* | 999 | /* |
996 | * Setup the thread mask for this irqaction. Unlikely to have | 1000 | * Setup the thread mask for this irqaction for ONESHOT. For |
997 | * 32 resp 64 irqs sharing one line, but who knows. | 1001 | * !ONESHOT irqs the thread mask is 0 so we can avoid a |
1002 | * conditional in irq_wake_thread(). | ||
998 | */ | 1003 | */ |
999 | if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { | 1004 | if (new->flags & IRQF_ONESHOT) { |
1000 | ret = -EBUSY; | 1005 | /* |
1001 | goto out_mask; | 1006 | * Unlikely to have 32 resp 64 irqs sharing one line, |
1007 | * but who knows. | ||
1008 | */ | ||
1009 | if (thread_mask == ~0UL) { | ||
1010 | ret = -EBUSY; | ||
1011 | goto out_mask; | ||
1012 | } | ||
1013 | /* | ||
1014 | * The thread_mask for the action is or'ed to | ||
1015 | * desc->thread_active to indicate that the | ||
1016 | * IRQF_ONESHOT thread handler has been woken, but not | ||
1017 | * yet finished. The bit is cleared when a thread | ||
1018 | * completes. When all threads of a shared interrupt | ||
1019 | * line have completed desc->threads_active becomes | ||
1020 | * zero and the interrupt line is unmasked. See | ||
1021 | * handle.c:irq_wake_thread() for further information. | ||
1022 | * | ||
1023 | * If no thread is woken by primary (hard irq context) | ||
1024 | * interrupt handlers, then desc->threads_active is | ||
1025 | * also checked for zero to unmask the irq line in the | ||
1026 | * affected hard irq flow handlers | ||
1027 | * (handle_[fasteoi|level]_irq). | ||
1028 | * | ||
1029 | * The new action gets the first zero bit of | ||
1030 | * thread_mask assigned. See the loop above which or's | ||
1031 | * all existing action->thread_mask bits. | ||
1032 | */ | ||
1033 | new->thread_mask = 1 << ffz(thread_mask); | ||
1002 | } | 1034 | } |
1003 | new->thread_mask = 1 << ffz(thread_mask); | ||
1004 | 1035 | ||
1005 | if (!shared) { | 1036 | if (!shared) { |
1006 | init_waitqueue_head(&desc->wait_for_threads); | 1037 | init_waitqueue_head(&desc->wait_for_threads); |
@@ -1027,7 +1058,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1027 | desc->istate |= IRQS_ONESHOT; | 1058 | desc->istate |= IRQS_ONESHOT; |
1028 | 1059 | ||
1029 | if (irq_settings_can_autoenable(desc)) | 1060 | if (irq_settings_can_autoenable(desc)) |
1030 | irq_startup(desc); | 1061 | irq_startup(desc, true); |
1031 | else | 1062 | else |
1032 | /* Undo nested disables: */ | 1063 | /* Undo nested disables: */ |
1033 | desc->depth = 1; | 1064 | desc->depth = 1; |
@@ -1103,8 +1134,7 @@ out_thread: | |||
1103 | struct task_struct *t = new->thread; | 1134 | struct task_struct *t = new->thread; |
1104 | 1135 | ||
1105 | new->thread = NULL; | 1136 | new->thread = NULL; |
1106 | if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) | 1137 | kthread_stop(t); |
1107 | kthread_stop(t); | ||
1108 | put_task_struct(t); | 1138 | put_task_struct(t); |
1109 | } | 1139 | } |
1110 | out_mput: | 1140 | out_mput: |
@@ -1214,8 +1244,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1214 | #endif | 1244 | #endif |
1215 | 1245 | ||
1216 | if (action->thread) { | 1246 | if (action->thread) { |
1217 | if (!test_bit(IRQTF_DIED, &action->thread_flags)) | 1247 | kthread_stop(action->thread); |
1218 | kthread_stop(action->thread); | ||
1219 | put_task_struct(action->thread); | 1248 | put_task_struct(action->thread); |
1220 | } | 1249 | } |
1221 | 1250 | ||
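A driver-side sketch of the threaded-IRQ machinery reworked above, using the stock request_threaded_irq() interface; my_quick_check, my_slow_work, "my-dev" and dev are placeholders for this example:

static irqreturn_t my_quick_check(int irq, void *dev_id)
{
	/* hard irq context; with IRQF_ONESHOT the line stays masked
	 * until the thread below has finished */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t my_slow_work(int irq, void *dev_id)
{
	/* irq thread; on return irq_finalize_oneshot() clears this
	 * action's bit in desc->threads_oneshot, and the line is
	 * unmasked once all threads sharing it are done */
	return IRQ_HANDLED;
}

ret = request_threaded_irq(irq, my_quick_check, my_slow_work,
			   IRQF_SHARED | IRQF_ONESHOT, "my-dev", dev);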
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 47420908fba0..c3c89751b327 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -43,12 +43,16 @@ void irq_move_masked_irq(struct irq_data *idata) | |||
43 | * masking the irqs. | 43 | * masking the irqs. |
44 | */ | 44 | */ |
45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) |
46 | < nr_cpu_ids)) | 46 | < nr_cpu_ids)) { |
47 | if (!chip->irq_set_affinity(&desc->irq_data, | 47 | int ret = chip->irq_set_affinity(&desc->irq_data, |
48 | desc->pending_mask, false)) { | 48 | desc->pending_mask, false); |
49 | switch (ret) { | ||
50 | case IRQ_SET_MASK_OK: | ||
49 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | 51 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); |
52 | case IRQ_SET_MASK_OK_NOCOPY: | ||
50 | irq_set_thread_affinity(desc); | 53 | irq_set_thread_affinity(desc); |
51 | } | 54 | } |
55 | } | ||
52 | 56 | ||
53 | cpumask_clear(desc->pending_mask); | 57 | cpumask_clear(desc->pending_mask); |
54 | } | 58 | } |
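The switch above only copies pending_mask into irq_data.affinity for IRQ_SET_MASK_OK; a chip that narrows the mask itself returns IRQ_SET_MASK_OK_NOCOPY. A hedged sketch of such a callback, with my_chip_route_irq() standing in for whatever register write the hardware actually needs:

static int my_chip_set_affinity(struct irq_data *d,
				const struct cpumask *mask, bool force)
{
	/* hardware can only target a single CPU */
	unsigned int cpu = cpumask_first_and(mask, cpu_online_mask);

	my_chip_route_irq(d->hwirq, cpu);		/* assumed hw hook */
	cpumask_copy(d->affinity, cpumask_of(cpu));	/* record the narrowed mask */
	return IRQ_SET_MASK_OK_NOCOPY;			/* core must not copy pending_mask */
}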
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01d3b70fc98a..43049192b5ec 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/sort.h> | 13 | #include <linux/sort.h> |
14 | #include <linux/err.h> | 14 | #include <linux/err.h> |
15 | #include <linux/jump_label.h> | 15 | #include <linux/static_key.h> |
16 | 16 | ||
17 | #ifdef HAVE_JUMP_LABEL | 17 | #ifdef HAVE_JUMP_LABEL |
18 | 18 | ||
@@ -29,11 +29,6 @@ void jump_label_unlock(void) | |||
29 | mutex_unlock(&jump_label_mutex); | 29 | mutex_unlock(&jump_label_mutex); |
30 | } | 30 | } |
31 | 31 | ||
32 | bool jump_label_enabled(struct jump_label_key *key) | ||
33 | { | ||
34 | return !!atomic_read(&key->enabled); | ||
35 | } | ||
36 | |||
37 | static int jump_label_cmp(const void *a, const void *b) | 32 | static int jump_label_cmp(const void *a, const void *b) |
38 | { | 33 | { |
39 | const struct jump_entry *jea = a; | 34 | const struct jump_entry *jea = a; |
@@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) | |||
58 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | 53 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); |
59 | } | 54 | } |
60 | 55 | ||
61 | static void jump_label_update(struct jump_label_key *key, int enable); | 56 | static void jump_label_update(struct static_key *key, int enable); |
62 | 57 | ||
63 | void jump_label_inc(struct jump_label_key *key) | 58 | void static_key_slow_inc(struct static_key *key) |
64 | { | 59 | { |
65 | if (atomic_inc_not_zero(&key->enabled)) | 60 | if (atomic_inc_not_zero(&key->enabled)) |
66 | return; | 61 | return; |
67 | 62 | ||
68 | jump_label_lock(); | 63 | jump_label_lock(); |
69 | if (atomic_read(&key->enabled) == 0) | 64 | if (atomic_read(&key->enabled) == 0) { |
70 | jump_label_update(key, JUMP_LABEL_ENABLE); | 65 | if (!jump_label_get_branch_default(key)) |
66 | jump_label_update(key, JUMP_LABEL_ENABLE); | ||
67 | else | ||
68 | jump_label_update(key, JUMP_LABEL_DISABLE); | ||
69 | } | ||
71 | atomic_inc(&key->enabled); | 70 | atomic_inc(&key->enabled); |
72 | jump_label_unlock(); | 71 | jump_label_unlock(); |
73 | } | 72 | } |
74 | EXPORT_SYMBOL_GPL(jump_label_inc); | 73 | EXPORT_SYMBOL_GPL(static_key_slow_inc); |
75 | 74 | ||
76 | static void __jump_label_dec(struct jump_label_key *key, | 75 | static void __static_key_slow_dec(struct static_key *key, |
77 | unsigned long rate_limit, struct delayed_work *work) | 76 | unsigned long rate_limit, struct delayed_work *work) |
78 | { | 77 | { |
79 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) | 78 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { |
79 | WARN(atomic_read(&key->enabled) < 0, | ||
80 | "jump label: negative count!\n"); | ||
80 | return; | 81 | return; |
82 | } | ||
81 | 83 | ||
82 | if (rate_limit) { | 84 | if (rate_limit) { |
83 | atomic_inc(&key->enabled); | 85 | atomic_inc(&key->enabled); |
84 | schedule_delayed_work(work, rate_limit); | 86 | schedule_delayed_work(work, rate_limit); |
85 | } else | 87 | } else { |
86 | jump_label_update(key, JUMP_LABEL_DISABLE); | 88 | if (!jump_label_get_branch_default(key)) |
87 | 89 | jump_label_update(key, JUMP_LABEL_DISABLE); | |
90 | else | ||
91 | jump_label_update(key, JUMP_LABEL_ENABLE); | ||
92 | } | ||
88 | jump_label_unlock(); | 93 | jump_label_unlock(); |
89 | } | 94 | } |
90 | EXPORT_SYMBOL_GPL(jump_label_dec); | ||
91 | 95 | ||
92 | static void jump_label_update_timeout(struct work_struct *work) | 96 | static void jump_label_update_timeout(struct work_struct *work) |
93 | { | 97 | { |
94 | struct jump_label_key_deferred *key = | 98 | struct static_key_deferred *key = |
95 | container_of(work, struct jump_label_key_deferred, work.work); | 99 | container_of(work, struct static_key_deferred, work.work); |
96 | __jump_label_dec(&key->key, 0, NULL); | 100 | __static_key_slow_dec(&key->key, 0, NULL); |
97 | } | 101 | } |
98 | 102 | ||
99 | void jump_label_dec(struct jump_label_key *key) | 103 | void static_key_slow_dec(struct static_key *key) |
100 | { | 104 | { |
101 | __jump_label_dec(key, 0, NULL); | 105 | __static_key_slow_dec(key, 0, NULL); |
102 | } | 106 | } |
107 | EXPORT_SYMBOL_GPL(static_key_slow_dec); | ||
103 | 108 | ||
104 | void jump_label_dec_deferred(struct jump_label_key_deferred *key) | 109 | void static_key_slow_dec_deferred(struct static_key_deferred *key) |
105 | { | 110 | { |
106 | __jump_label_dec(&key->key, key->timeout, &key->work); | 111 | __static_key_slow_dec(&key->key, key->timeout, &key->work); |
107 | } | 112 | } |
113 | EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); | ||
108 | 114 | ||
109 | 115 | void jump_label_rate_limit(struct static_key_deferred *key, | |
110 | void jump_label_rate_limit(struct jump_label_key_deferred *key, | ||
111 | unsigned long rl) | 116 | unsigned long rl) |
112 | { | 117 | { |
113 | key->timeout = rl; | 118 | key->timeout = rl; |
@@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry | |||
150 | arch_jump_label_transform(entry, type); | 155 | arch_jump_label_transform(entry, type); |
151 | } | 156 | } |
152 | 157 | ||
153 | static void __jump_label_update(struct jump_label_key *key, | 158 | static void __jump_label_update(struct static_key *key, |
154 | struct jump_entry *entry, | 159 | struct jump_entry *entry, |
155 | struct jump_entry *stop, int enable) | 160 | struct jump_entry *stop, int enable) |
156 | { | 161 | { |
@@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key, | |||
167 | } | 172 | } |
168 | } | 173 | } |
169 | 174 | ||
175 | static enum jump_label_type jump_label_type(struct static_key *key) | ||
176 | { | ||
177 | bool true_branch = jump_label_get_branch_default(key); | ||
178 | bool state = static_key_enabled(key); | ||
179 | |||
180 | if ((!true_branch && state) || (true_branch && !state)) | ||
181 | return JUMP_LABEL_ENABLE; | ||
182 | |||
183 | return JUMP_LABEL_DISABLE; | ||
184 | } | ||
185 | |||
170 | void __init jump_label_init(void) | 186 | void __init jump_label_init(void) |
171 | { | 187 | { |
172 | struct jump_entry *iter_start = __start___jump_table; | 188 | struct jump_entry *iter_start = __start___jump_table; |
173 | struct jump_entry *iter_stop = __stop___jump_table; | 189 | struct jump_entry *iter_stop = __stop___jump_table; |
174 | struct jump_label_key *key = NULL; | 190 | struct static_key *key = NULL; |
175 | struct jump_entry *iter; | 191 | struct jump_entry *iter; |
176 | 192 | ||
177 | jump_label_lock(); | 193 | jump_label_lock(); |
178 | jump_label_sort_entries(iter_start, iter_stop); | 194 | jump_label_sort_entries(iter_start, iter_stop); |
179 | 195 | ||
180 | for (iter = iter_start; iter < iter_stop; iter++) { | 196 | for (iter = iter_start; iter < iter_stop; iter++) { |
181 | struct jump_label_key *iterk; | 197 | struct static_key *iterk; |
182 | 198 | ||
183 | iterk = (struct jump_label_key *)(unsigned long)iter->key; | 199 | iterk = (struct static_key *)(unsigned long)iter->key; |
184 | arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? | 200 | arch_jump_label_transform_static(iter, jump_label_type(iterk)); |
185 | JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); | ||
186 | if (iterk == key) | 201 | if (iterk == key) |
187 | continue; | 202 | continue; |
188 | 203 | ||
189 | key = iterk; | 204 | key = iterk; |
190 | key->entries = iter; | 205 | /* |
206 | * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. | ||
207 | */ | ||
208 | *((unsigned long *)&key->entries) += (unsigned long)iter; | ||
191 | #ifdef CONFIG_MODULES | 209 | #ifdef CONFIG_MODULES |
192 | key->next = NULL; | 210 | key->next = NULL; |
193 | #endif | 211 | #endif |
@@ -197,8 +215,8 @@ void __init jump_label_init(void) | |||
197 | 215 | ||
198 | #ifdef CONFIG_MODULES | 216 | #ifdef CONFIG_MODULES |
199 | 217 | ||
200 | struct jump_label_mod { | 218 | struct static_key_mod { |
201 | struct jump_label_mod *next; | 219 | struct static_key_mod *next; |
202 | struct jump_entry *entries; | 220 | struct jump_entry *entries; |
203 | struct module *mod; | 221 | struct module *mod; |
204 | }; | 222 | }; |
@@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) | |||
218 | start, end); | 236 | start, end); |
219 | } | 237 | } |
220 | 238 | ||
221 | static void __jump_label_mod_update(struct jump_label_key *key, int enable) | 239 | static void __jump_label_mod_update(struct static_key *key, int enable) |
222 | { | 240 | { |
223 | struct jump_label_mod *mod = key->next; | 241 | struct static_key_mod *mod = key->next; |
224 | 242 | ||
225 | while (mod) { | 243 | while (mod) { |
226 | struct module *m = mod->mod; | 244 | struct module *m = mod->mod; |
@@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod) | |||
251 | return; | 269 | return; |
252 | 270 | ||
253 | for (iter = iter_start; iter < iter_stop; iter++) { | 271 | for (iter = iter_start; iter < iter_stop; iter++) { |
254 | struct jump_label_key *iterk; | 272 | arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); |
255 | |||
256 | iterk = (struct jump_label_key *)(unsigned long)iter->key; | ||
257 | arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? | ||
258 | JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); | ||
259 | } | 273 | } |
260 | } | 274 | } |
261 | 275 | ||
@@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod) | |||
264 | struct jump_entry *iter_start = mod->jump_entries; | 278 | struct jump_entry *iter_start = mod->jump_entries; |
265 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | 279 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
266 | struct jump_entry *iter; | 280 | struct jump_entry *iter; |
267 | struct jump_label_key *key = NULL; | 281 | struct static_key *key = NULL; |
268 | struct jump_label_mod *jlm; | 282 | struct static_key_mod *jlm; |
269 | 283 | ||
270 | /* if the module doesn't have jump label entries, just return */ | 284 | /* if the module doesn't have jump label entries, just return */ |
271 | if (iter_start == iter_stop) | 285 | if (iter_start == iter_stop) |
@@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod) | |||
274 | jump_label_sort_entries(iter_start, iter_stop); | 288 | jump_label_sort_entries(iter_start, iter_stop); |
275 | 289 | ||
276 | for (iter = iter_start; iter < iter_stop; iter++) { | 290 | for (iter = iter_start; iter < iter_stop; iter++) { |
277 | if (iter->key == (jump_label_t)(unsigned long)key) | 291 | struct static_key *iterk; |
278 | continue; | ||
279 | 292 | ||
280 | key = (struct jump_label_key *)(unsigned long)iter->key; | 293 | iterk = (struct static_key *)(unsigned long)iter->key; |
294 | if (iterk == key) | ||
295 | continue; | ||
281 | 296 | ||
297 | key = iterk; | ||
282 | if (__module_address(iter->key) == mod) { | 298 | if (__module_address(iter->key) == mod) { |
283 | atomic_set(&key->enabled, 0); | 299 | /* |
284 | key->entries = iter; | 300 | * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. |
301 | */ | ||
302 | *((unsigned long *)&key->entries) += (unsigned long)iter; | ||
285 | key->next = NULL; | 303 | key->next = NULL; |
286 | continue; | 304 | continue; |
287 | } | 305 | } |
288 | 306 | jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); | |
289 | jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); | ||
290 | if (!jlm) | 307 | if (!jlm) |
291 | return -ENOMEM; | 308 | return -ENOMEM; |
292 | |||
293 | jlm->mod = mod; | 309 | jlm->mod = mod; |
294 | jlm->entries = iter; | 310 | jlm->entries = iter; |
295 | jlm->next = key->next; | 311 | jlm->next = key->next; |
296 | key->next = jlm; | 312 | key->next = jlm; |
297 | 313 | ||
298 | if (jump_label_enabled(key)) | 314 | if (jump_label_type(key) == JUMP_LABEL_ENABLE) |
299 | __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); | 315 | __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); |
300 | } | 316 | } |
301 | 317 | ||
@@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod) | |||
307 | struct jump_entry *iter_start = mod->jump_entries; | 323 | struct jump_entry *iter_start = mod->jump_entries; |
308 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | 324 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
309 | struct jump_entry *iter; | 325 | struct jump_entry *iter; |
310 | struct jump_label_key *key = NULL; | 326 | struct static_key *key = NULL; |
311 | struct jump_label_mod *jlm, **prev; | 327 | struct static_key_mod *jlm, **prev; |
312 | 328 | ||
313 | for (iter = iter_start; iter < iter_stop; iter++) { | 329 | for (iter = iter_start; iter < iter_stop; iter++) { |
314 | if (iter->key == (jump_label_t)(unsigned long)key) | 330 | if (iter->key == (jump_label_t)(unsigned long)key) |
315 | continue; | 331 | continue; |
316 | 332 | ||
317 | key = (struct jump_label_key *)(unsigned long)iter->key; | 333 | key = (struct static_key *)(unsigned long)iter->key; |
318 | 334 | ||
319 | if (__module_address(iter->key) == mod) | 335 | if (__module_address(iter->key) == mod) |
320 | continue; | 336 | continue; |
@@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end) | |||
416 | return ret; | 432 | return ret; |
417 | } | 433 | } |
418 | 434 | ||
419 | static void jump_label_update(struct jump_label_key *key, int enable) | 435 | static void jump_label_update(struct static_key *key, int enable) |
420 | { | 436 | { |
421 | struct jump_entry *entry = key->entries, *stop = __stop___jump_table; | 437 | struct jump_entry *stop = __stop___jump_table; |
438 | struct jump_entry *entry = jump_label_get_entries(key); | ||
422 | 439 | ||
423 | #ifdef CONFIG_MODULES | 440 | #ifdef CONFIG_MODULES |
424 | struct module *mod = __module_address((jump_label_t)key); | 441 | struct module *mod = __module_address((unsigned long)key); |
425 | 442 | ||
426 | __jump_label_mod_update(key, enable); | 443 | __jump_label_mod_update(key, enable); |
427 | 444 | ||
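Consumer view of the renamed interface, assuming the static key API this series introduces (<linux/static_key.h>); my_feature_key and do_rare_instrumentation() are made up for the example:

static struct static_key my_feature_key = STATIC_KEY_INIT_FALSE;

void my_hot_path(void)
{
	/* compiles to a nop by default; becomes a jump once the key is enabled */
	if (static_key_false(&my_feature_key))
		do_rare_instrumentation();
}

/* slow path: patches every jump site registered for the key */
static_key_slow_inc(&my_feature_key);		/* enable */
static_key_slow_dec(&my_feature_key);		/* disable when the count hits zero */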
diff --git a/kernel/kexec.c b/kernel/kexec.c index 7b0886786701..4e2e472f6aeb 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -37,7 +37,6 @@ | |||
37 | #include <asm/page.h> | 37 | #include <asm/page.h> |
38 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
39 | #include <asm/io.h> | 39 | #include <asm/io.h> |
40 | #include <asm/system.h> | ||
41 | #include <asm/sections.h> | 40 | #include <asm/sections.h> |
42 | 41 | ||
43 | /* Per cpu memory for storing cpu states in case of system crash. */ | 42 | /* Per cpu memory for storing cpu states in case of system crash. */ |
@@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline, | |||
1359 | 1358 | ||
1360 | if (*cur == '@') | 1359 | if (*cur == '@') |
1361 | *crash_base = memparse(cur+1, &cur); | 1360 | *crash_base = memparse(cur+1, &cur); |
1361 | else if (*cur != ' ' && *cur != '\0') { | ||
1362 | pr_warning("crashkernel: unrecognized char\n"); | ||
1363 | return -EINVAL; | ||
1364 | } | ||
1362 | 1365 | ||
1363 | return 0; | 1366 | return 0; |
1364 | } | 1367 | } |
@@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1462 | 1465 | ||
1463 | VMCOREINFO_SYMBOL(init_uts_ns); | 1466 | VMCOREINFO_SYMBOL(init_uts_ns); |
1464 | VMCOREINFO_SYMBOL(node_online_map); | 1467 | VMCOREINFO_SYMBOL(node_online_map); |
1468 | #ifdef CONFIG_MMU | ||
1465 | VMCOREINFO_SYMBOL(swapper_pg_dir); | 1469 | VMCOREINFO_SYMBOL(swapper_pg_dir); |
1470 | #endif | ||
1466 | VMCOREINFO_SYMBOL(_stext); | 1471 | VMCOREINFO_SYMBOL(_stext); |
1467 | VMCOREINFO_SYMBOL(vmlist); | 1472 | VMCOREINFO_SYMBOL(vmlist); |
1468 | 1473 | ||
@@ -1546,13 +1551,13 @@ int kernel_kexec(void) | |||
1546 | if (error) | 1551 | if (error) |
1547 | goto Resume_console; | 1552 | goto Resume_console; |
1548 | /* At this point, dpm_suspend_start() has been called, | 1553 | /* At this point, dpm_suspend_start() has been called, |
1549 | * but *not* dpm_suspend_noirq(). We *must* call | 1554 | * but *not* dpm_suspend_end(). We *must* call |
1550 | * dpm_suspend_noirq() now. Otherwise, drivers for | 1555 | * dpm_suspend_end() now. Otherwise, drivers for |
1551 | * some devices (e.g. interrupt controllers) become | 1556 | * some devices (e.g. interrupt controllers) become |
1552 | * desynchronized with the actual state of the | 1557 | * desynchronized with the actual state of the |
1553 | * hardware at resume time, and evil weirdness ensues. | 1558 | * hardware at resume time, and evil weirdness ensues. |
1554 | */ | 1559 | */ |
1555 | error = dpm_suspend_noirq(PMSG_FREEZE); | 1560 | error = dpm_suspend_end(PMSG_FREEZE); |
1556 | if (error) | 1561 | if (error) |
1557 | goto Resume_devices; | 1562 | goto Resume_devices; |
1558 | error = disable_nonboot_cpus(); | 1563 | error = disable_nonboot_cpus(); |
@@ -1579,7 +1584,7 @@ int kernel_kexec(void) | |||
1579 | local_irq_enable(); | 1584 | local_irq_enable(); |
1580 | Enable_cpus: | 1585 | Enable_cpus: |
1581 | enable_nonboot_cpus(); | 1586 | enable_nonboot_cpus(); |
1582 | dpm_resume_noirq(PMSG_RESTORE); | 1587 | dpm_resume_start(PMSG_RESTORE); |
1583 | Resume_devices: | 1588 | Resume_devices: |
1584 | dpm_resume_end(PMSG_RESTORE); | 1589 | dpm_resume_end(PMSG_RESTORE); |
1585 | Resume_console: | 1590 | Resume_console: |
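The effect of the new check in parse_crashkernel_simple(), shown with hypothetical command lines; sizes go through memparse(), so the usual K/M/G suffixes apply:

/*
 *   crashkernel=64M         size = 64M, base chosen by the kernel
 *   crashkernel=64M@16M     size = 64M, base = 16M
 *   crashkernel=64Mjunk     now rejected with -EINVAL instead of the
 *                           trailing characters being silently ignored
 */
if (*cur == '@')
	*crash_base = memparse(cur + 1, &cur);
else if (*cur != ' ' && *cur != '\0') {
	pr_warning("crashkernel: unrecognized char\n");
	return -EINVAL;
}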
diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934e..957a7aab8ebc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem); | |||
60 | */ | 60 | */ |
61 | char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; | 61 | char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; |
62 | 62 | ||
63 | static void free_modprobe_argv(struct subprocess_info *info) | ||
64 | { | ||
65 | kfree(info->argv[3]); /* check call_modprobe() */ | ||
66 | kfree(info->argv); | ||
67 | } | ||
68 | |||
69 | static int call_modprobe(char *module_name, int wait) | ||
70 | { | ||
71 | static char *envp[] = { | ||
72 | "HOME=/", | ||
73 | "TERM=linux", | ||
74 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
75 | NULL | ||
76 | }; | ||
77 | |||
78 | char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); | ||
79 | if (!argv) | ||
80 | goto out; | ||
81 | |||
82 | module_name = kstrdup(module_name, GFP_KERNEL); | ||
83 | if (!module_name) | ||
84 | goto free_argv; | ||
85 | |||
86 | argv[0] = modprobe_path; | ||
87 | argv[1] = "-q"; | ||
88 | argv[2] = "--"; | ||
89 | argv[3] = module_name; /* check free_modprobe_argv() */ | ||
90 | argv[4] = NULL; | ||
91 | |||
92 | return call_usermodehelper_fns(modprobe_path, argv, envp, | ||
93 | wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); | ||
94 | free_argv: | ||
95 | kfree(argv); | ||
96 | out: | ||
97 | return -ENOMEM; | ||
98 | } | ||
99 | |||
63 | /** | 100 | /** |
64 | * __request_module - try to load a kernel module | 101 | * __request_module - try to load a kernel module |
65 | * @wait: wait (or not) for the operation to complete | 102 | * @wait: wait (or not) for the operation to complete |
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...) | |||
81 | char module_name[MODULE_NAME_LEN]; | 118 | char module_name[MODULE_NAME_LEN]; |
82 | unsigned int max_modprobes; | 119 | unsigned int max_modprobes; |
83 | int ret; | 120 | int ret; |
84 | char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; | ||
85 | static char *envp[] = { "HOME=/", | ||
86 | "TERM=linux", | ||
87 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
88 | NULL }; | ||
89 | static atomic_t kmod_concurrent = ATOMIC_INIT(0); | 121 | static atomic_t kmod_concurrent = ATOMIC_INIT(0); |
90 | #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ | 122 | #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ |
91 | static int kmod_loop_msg; | 123 | static int kmod_loop_msg; |
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...) | |||
128 | 160 | ||
129 | trace_module_request(module_name, wait, _RET_IP_); | 161 | trace_module_request(module_name, wait, _RET_IP_); |
130 | 162 | ||
131 | ret = call_usermodehelper_fns(modprobe_path, argv, envp, | 163 | ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); |
132 | wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, | ||
133 | NULL, NULL, NULL); | ||
134 | 164 | ||
135 | atomic_dec(&kmod_concurrent); | 165 | atomic_dec(&kmod_concurrent); |
136 | return ret; | 166 | return ret; |
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data) | |||
188 | /* Exec failed? */ | 218 | /* Exec failed? */ |
189 | fail: | 219 | fail: |
190 | sub_info->retval = retval; | 220 | sub_info->retval = retval; |
191 | do_exit(0); | 221 | return 0; |
192 | } | 222 | } |
193 | 223 | ||
194 | void call_usermodehelper_freeinfo(struct subprocess_info *info) | 224 | void call_usermodehelper_freeinfo(struct subprocess_info *info) |
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) | |||
199 | } | 229 | } |
200 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); | 230 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); |
201 | 231 | ||
232 | static void umh_complete(struct subprocess_info *sub_info) | ||
233 | { | ||
234 | struct completion *comp = xchg(&sub_info->complete, NULL); | ||
235 | /* | ||
236 | * See call_usermodehelper_exec(). If xchg() returns NULL | ||
237 | * we own sub_info, the UMH_KILLABLE caller has gone away. | ||
238 | */ | ||
239 | if (comp) | ||
240 | complete(comp); | ||
241 | else | ||
242 | call_usermodehelper_freeinfo(sub_info); | ||
243 | } | ||
244 | |||
202 | /* Keventd can't block, but this (a child) can. */ | 245 | /* Keventd can't block, but this (a child) can. */ |
203 | static int wait_for_helper(void *data) | 246 | static int wait_for_helper(void *data) |
204 | { | 247 | { |
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data) | |||
235 | sub_info->retval = ret; | 278 | sub_info->retval = ret; |
236 | } | 279 | } |
237 | 280 | ||
238 | complete(sub_info->complete); | 281 | umh_complete(sub_info); |
239 | return 0; | 282 | return 0; |
240 | } | 283 | } |
241 | 284 | ||
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work) | |||
244 | { | 287 | { |
245 | struct subprocess_info *sub_info = | 288 | struct subprocess_info *sub_info = |
246 | container_of(work, struct subprocess_info, work); | 289 | container_of(work, struct subprocess_info, work); |
247 | enum umh_wait wait = sub_info->wait; | 290 | int wait = sub_info->wait & ~UMH_KILLABLE; |
248 | pid_t pid; | 291 | pid_t pid; |
249 | 292 | ||
250 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 293 | /* CLONE_VFORK: wait until the usermode helper has execve'd |
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work) | |||
269 | case UMH_WAIT_EXEC: | 312 | case UMH_WAIT_EXEC: |
270 | if (pid < 0) | 313 | if (pid < 0) |
271 | sub_info->retval = pid; | 314 | sub_info->retval = pid; |
272 | complete(sub_info->complete); | 315 | umh_complete(sub_info); |
273 | } | 316 | } |
274 | } | 317 | } |
275 | 318 | ||
@@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); | |||
435 | * asynchronously if wait is not set, and runs as a child of keventd. | 478 | * asynchronously if wait is not set, and runs as a child of keventd. |
436 | * (ie. it runs with full root capabilities). | 479 | * (ie. it runs with full root capabilities). |
437 | */ | 480 | */ |
438 | int call_usermodehelper_exec(struct subprocess_info *sub_info, | 481 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) |
439 | enum umh_wait wait) | ||
440 | { | 482 | { |
441 | DECLARE_COMPLETION_ONSTACK(done); | 483 | DECLARE_COMPLETION_ONSTACK(done); |
442 | int retval = 0; | 484 | int retval = 0; |
@@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, | |||
456 | queue_work(khelper_wq, &sub_info->work); | 498 | queue_work(khelper_wq, &sub_info->work); |
457 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ | 499 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ |
458 | goto unlock; | 500 | goto unlock; |
501 | |||
502 | if (wait & UMH_KILLABLE) { | ||
503 | retval = wait_for_completion_killable(&done); | ||
504 | if (!retval) | ||
505 | goto wait_done; | ||
506 | |||
507 | /* umh_complete() will see NULL and free sub_info */ | ||
508 | if (xchg(&sub_info->complete, NULL)) | ||
509 | goto unlock; | ||
510 | /* fallthrough, umh_complete() was already called */ | ||
511 | } | ||
512 | |||
459 | wait_for_completion(&done); | 513 | wait_for_completion(&done); |
514 | wait_done: | ||
460 | retval = sub_info->retval; | 515 | retval = sub_info->retval; |
461 | |||
462 | out: | 516 | out: |
463 | call_usermodehelper_freeinfo(sub_info); | 517 | call_usermodehelper_freeinfo(sub_info); |
464 | unlock: | 518 | unlock: |
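The UMH_KILLABLE ownership handshake added above, condensed into one place for readability; the names are the real ones from this file, only the surrounding code is elided. Both sides race to xchg() sub_info->complete to NULL, and the side that reads back NULL knows the other one got there first:

/* helper side (khelper workqueue / wait_for_helper) */
static void umh_complete(struct subprocess_info *sub_info)
{
	struct completion *comp = xchg(&sub_info->complete, NULL);

	if (comp)
		complete(comp);		/* waiter still there: it will free sub_info */
	else
		call_usermodehelper_freeinfo(sub_info);	/* waiter gave up: we free */
}

/* waiter side in call_usermodehelper_exec(), after a fatal signal */
retval = wait_for_completion_killable(&done);
if (retval) {
	if (xchg(&sub_info->complete, NULL))
		goto unlock;	/* helper not done yet; it will see NULL and free */
	/* helper already completed: reap sub_info->retval and free as usual */
}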
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9788c0ec6f43..c62b8546cc90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -1334,8 +1334,10 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1334 | if (!kernel_text_address((unsigned long) p->addr) || | 1334 | if (!kernel_text_address((unsigned long) p->addr) || |
1335 | in_kprobes_functions((unsigned long) p->addr) || | 1335 | in_kprobes_functions((unsigned long) p->addr) || |
1336 | ftrace_text_reserved(p->addr, p->addr) || | 1336 | ftrace_text_reserved(p->addr, p->addr) || |
1337 | jump_label_text_reserved(p->addr, p->addr)) | 1337 | jump_label_text_reserved(p->addr, p->addr)) { |
1338 | goto fail_with_jump_label; | 1338 | ret = -EINVAL; |
1339 | goto cannot_probe; | ||
1340 | } | ||
1339 | 1341 | ||
1340 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ | 1342 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ |
1341 | p->flags &= KPROBE_FLAG_DISABLED; | 1343 | p->flags &= KPROBE_FLAG_DISABLED; |
@@ -1352,7 +1354,7 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1352 | * its code to prohibit unexpected unloading. | 1354 | * its code to prohibit unexpected unloading. |
1353 | */ | 1355 | */ |
1354 | if (unlikely(!try_module_get(probed_mod))) | 1356 | if (unlikely(!try_module_get(probed_mod))) |
1355 | goto fail_with_jump_label; | 1357 | goto cannot_probe; |
1356 | 1358 | ||
1357 | /* | 1359 | /* |
1358 | * If the module freed .init.text, we couldn't insert | 1360 | * If the module freed .init.text, we couldn't insert |
@@ -1361,7 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1361 | if (within_module_init((unsigned long)p->addr, probed_mod) && | 1363 | if (within_module_init((unsigned long)p->addr, probed_mod) && |
1362 | probed_mod->state != MODULE_STATE_COMING) { | 1364 | probed_mod->state != MODULE_STATE_COMING) { |
1363 | module_put(probed_mod); | 1365 | module_put(probed_mod); |
1364 | goto fail_with_jump_label; | 1366 | goto cannot_probe; |
1365 | } | 1367 | } |
1366 | /* ret will be updated by following code */ | 1368 | /* ret will be updated by following code */ |
1367 | } | 1369 | } |
@@ -1409,7 +1411,7 @@ out: | |||
1409 | 1411 | ||
1410 | return ret; | 1412 | return ret; |
1411 | 1413 | ||
1412 | fail_with_jump_label: | 1414 | cannot_probe: |
1413 | preempt_enable(); | 1415 | preempt_enable(); |
1414 | jump_label_unlock(); | 1416 | jump_label_unlock(); |
1415 | return ret; | 1417 | return ret; |
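Caller's view of the change: register_kprobe() now sets ret to -EINVAL before bailing out on ftrace- or jump-label-reserved text, so callers see an error rather than a bogus success. A small sketch with a placeholder symbol and handler:

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* let the probed instruction run */
}

static struct kprobe my_kp = {
	.symbol_name	= "do_sys_open",	/* placeholder target */
	.pre_handler	= my_pre,
};

ret = register_kprobe(&my_kp);
if (ret)
	pr_err("register_kprobe failed: %d\n", ret);
else
	unregister_kprobe(&my_kp);	/* later, on teardown */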
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8889f7dd7c46..ea9ee4518c35 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4176 | printk("-------------------------------\n"); | 4176 | printk("-------------------------------\n"); |
4177 | printk("%s:%d %s!\n", file, line, s); | 4177 | printk("%s:%d %s!\n", file, line, s); |
4178 | printk("\nother info that might help us debug this:\n\n"); | 4178 | printk("\nother info that might help us debug this:\n\n"); |
4179 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); | 4179 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
4180 | !rcu_lockdep_current_cpu_online() | ||
4181 | ? "RCU used illegally from offline CPU!\n" | ||
4182 | : rcu_is_cpu_idle() | ||
4183 | ? "RCU used illegally from idle CPU!\n" | ||
4184 | : "", | ||
4185 | rcu_scheduler_active, debug_locks); | ||
4180 | 4186 | ||
4181 | /* | 4187 | /* |
4182 | * If a CPU is in the RCU-free window in idle (ie: in the section | 4188 | * If a CPU is in the RCU-free window in idle (ie: in the section |
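A hypothetical snippet that would provoke the (now more detailed) lockdep_rcu_suspicious() report; struct cfg, global_cfg and use() are invented for the example:

struct cfg *c;

c = rcu_dereference(global_cfg);	/* no rcu_read_lock() held: splat */
use(c);

/* correct version */
rcu_read_lock();
c = rcu_dereference(global_cfg);
use(c);
rcu_read_unlock();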
diff --git a/kernel/module.c b/kernel/module.c index 2c932760fd33..78ac6ec1e425 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ | |||
105 | 105 | ||
106 | /* Block module loading/unloading? */ | 106 | /* Block module loading/unloading? */ |
107 | int modules_disabled = 0; | 107 | int modules_disabled = 0; |
108 | core_param(nomodule, modules_disabled, bint, 0); | ||
108 | 109 | ||
109 | /* Waiting for a module to finish initializing? */ | 110 | /* Waiting for a module to finish initializing? */ |
110 | static DECLARE_WAIT_QUEUE_HEAD(module_wq); | 111 | static DECLARE_WAIT_QUEUE_HEAD(module_wq); |
@@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr, | |||
903 | static struct module_attribute modinfo_refcnt = | 904 | static struct module_attribute modinfo_refcnt = |
904 | __ATTR(refcnt, 0444, show_refcnt, NULL); | 905 | __ATTR(refcnt, 0444, show_refcnt, NULL); |
905 | 906 | ||
907 | void __module_get(struct module *module) | ||
908 | { | ||
909 | if (module) { | ||
910 | preempt_disable(); | ||
911 | __this_cpu_inc(module->refptr->incs); | ||
912 | trace_module_get(module, _RET_IP_); | ||
913 | preempt_enable(); | ||
914 | } | ||
915 | } | ||
916 | EXPORT_SYMBOL(__module_get); | ||
917 | |||
918 | bool try_module_get(struct module *module) | ||
919 | { | ||
920 | bool ret = true; | ||
921 | |||
922 | if (module) { | ||
923 | preempt_disable(); | ||
924 | |||
925 | if (likely(module_is_live(module))) { | ||
926 | __this_cpu_inc(module->refptr->incs); | ||
927 | trace_module_get(module, _RET_IP_); | ||
928 | } else | ||
929 | ret = false; | ||
930 | |||
931 | preempt_enable(); | ||
932 | } | ||
933 | return ret; | ||
934 | } | ||
935 | EXPORT_SYMBOL(try_module_get); | ||
936 | |||
906 | void module_put(struct module *module) | 937 | void module_put(struct module *module) |
907 | { | 938 | { |
908 | if (module) { | 939 | if (module) { |
@@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info, | |||
2380 | return -ENOEXEC; | 2411 | return -ENOEXEC; |
2381 | 2412 | ||
2382 | /* Suck in entire file: we'll want most of it. */ | 2413 | /* Suck in entire file: we'll want most of it. */ |
2383 | /* vmalloc barfs on "unusual" numbers. Check here */ | 2414 | if ((hdr = vmalloc(len)) == NULL) |
2384 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) | ||
2385 | return -ENOMEM; | 2415 | return -ENOMEM; |
2386 | 2416 | ||
2387 | if (copy_from_user(hdr, umod, len) != 0) { | 2417 | if (copy_from_user(hdr, umod, len) != 0) { |
@@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod, | |||
2922 | mutex_unlock(&module_mutex); | 2952 | mutex_unlock(&module_mutex); |
2923 | 2953 | ||
2924 | /* Module is ready to execute: parsing args may do that. */ | 2954 | /* Module is ready to execute: parsing args may do that. */ |
2925 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); | 2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
2956 | -32768, 32767, NULL); | ||
2926 | if (err < 0) | 2957 | if (err < 0) |
2927 | goto unlink; | 2958 | goto unlink; |
2928 | 2959 | ||
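The refcount helpers moved out of line above keep their usual calling convention; a sketch of the caller side, where owner is assumed to be the struct module providing the code being called into:

if (!try_module_get(owner))
	return -ENODEV;		/* owner is on its way out, don't call into it */

/* ... safely use symbols/data belonging to owner ... */

module_put(owner);

The new core_param() line also means that booting with "nomodule" on the kernel command line sets modules_disabled and blocks all further module loading and unloading.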
diff --git a/kernel/mutex.c b/kernel/mutex.c index 89096dd8786f..a307cc9c9526 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
240 | 240 | ||
241 | /* didn't get the lock, go to sleep: */ | 241 | /* didn't get the lock, go to sleep: */ |
242 | spin_unlock_mutex(&lock->wait_lock, flags); | 242 | spin_unlock_mutex(&lock->wait_lock, flags); |
243 | preempt_enable_no_resched(); | 243 | schedule_preempt_disabled(); |
244 | schedule(); | ||
245 | preempt_disable(); | ||
246 | spin_lock_mutex(&lock->wait_lock, flags); | 244 | spin_lock_mutex(&lock->wait_lock, flags); |
247 | } | 245 | } |
248 | 246 | ||
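schedule_preempt_disabled() simply folds the three removed lines into one helper; a sketch of its effect, assuming the definition in the scheduler core matches what this call site used to open-code:

void schedule_preempt_disabled(void)
{
	preempt_enable_no_resched();	/* allow the switch without rescheduling here */
	schedule();			/* sleep until woken */
	preempt_disable();		/* hand control back with preemption off again */
}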
diff --git a/kernel/padata.c b/kernel/padata.c index b45259931512..6f10eb285ece 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/sysfs.h> | 29 | #include <linux/sysfs.h> |
30 | #include <linux/rcupdate.h> | 30 | #include <linux/rcupdate.h> |
31 | 31 | ||
32 | #define MAX_SEQ_NR (INT_MAX - NR_CPUS) | ||
33 | #define MAX_OBJ_NUM 1000 | 32 | #define MAX_OBJ_NUM 1000 |
34 | 33 | ||
35 | static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) | 34 | static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) |
@@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) | |||
43 | return target_cpu; | 42 | return target_cpu; |
44 | } | 43 | } |
45 | 44 | ||
46 | static int padata_cpu_hash(struct padata_priv *padata) | 45 | static int padata_cpu_hash(struct parallel_data *pd) |
47 | { | 46 | { |
48 | int cpu_index; | 47 | int cpu_index; |
49 | struct parallel_data *pd; | ||
50 | |||
51 | pd = padata->pd; | ||
52 | 48 | ||
53 | /* | 49 | /* |
54 | * Hash the sequence numbers to the cpus by taking | 50 | * Hash the sequence numbers to the cpus by taking |
55 | * seq_nr mod. number of cpus in use. | 51 | * seq_nr mod. number of cpus in use. |
56 | */ | 52 | */ |
57 | cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); | 53 | |
54 | spin_lock(&pd->seq_lock); | ||
55 | cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); | ||
56 | pd->seq_nr++; | ||
57 | spin_unlock(&pd->seq_lock); | ||
58 | 58 | ||
59 | return padata_index_to_cpu(pd, cpu_index); | 59 | return padata_index_to_cpu(pd, cpu_index); |
60 | } | 60 | } |
@@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst, | |||
132 | padata->pd = pd; | 132 | padata->pd = pd; |
133 | padata->cb_cpu = cb_cpu; | 133 | padata->cb_cpu = cb_cpu; |
134 | 134 | ||
135 | if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) | 135 | target_cpu = padata_cpu_hash(pd); |
136 | atomic_set(&pd->seq_nr, -1); | ||
137 | |||
138 | padata->seq_nr = atomic_inc_return(&pd->seq_nr); | ||
139 | |||
140 | target_cpu = padata_cpu_hash(padata); | ||
141 | queue = per_cpu_ptr(pd->pqueue, target_cpu); | 136 | queue = per_cpu_ptr(pd->pqueue, target_cpu); |
142 | 137 | ||
143 | spin_lock(&queue->parallel.lock); | 138 | spin_lock(&queue->parallel.lock); |
@@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel); | |||
173 | static struct padata_priv *padata_get_next(struct parallel_data *pd) | 168 | static struct padata_priv *padata_get_next(struct parallel_data *pd) |
174 | { | 169 | { |
175 | int cpu, num_cpus; | 170 | int cpu, num_cpus; |
176 | int next_nr, next_index; | 171 | unsigned int next_nr, next_index; |
177 | struct padata_parallel_queue *queue, *next_queue; | 172 | struct padata_parallel_queue *queue, *next_queue; |
178 | struct padata_priv *padata; | 173 | struct padata_priv *padata; |
179 | struct padata_list *reorder; | 174 | struct padata_list *reorder; |
@@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
189 | cpu = padata_index_to_cpu(pd, next_index); | 184 | cpu = padata_index_to_cpu(pd, next_index); |
190 | next_queue = per_cpu_ptr(pd->pqueue, cpu); | 185 | next_queue = per_cpu_ptr(pd->pqueue, cpu); |
191 | 186 | ||
192 | if (unlikely(next_nr > pd->max_seq_nr)) { | ||
193 | next_nr = next_nr - pd->max_seq_nr - 1; | ||
194 | next_index = next_nr % num_cpus; | ||
195 | cpu = padata_index_to_cpu(pd, next_index); | ||
196 | next_queue = per_cpu_ptr(pd->pqueue, cpu); | ||
197 | pd->processed = 0; | ||
198 | } | ||
199 | |||
200 | padata = NULL; | 187 | padata = NULL; |
201 | 188 | ||
202 | reorder = &next_queue->reorder; | 189 | reorder = &next_queue->reorder; |
@@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) | |||
205 | padata = list_entry(reorder->list.next, | 192 | padata = list_entry(reorder->list.next, |
206 | struct padata_priv, list); | 193 | struct padata_priv, list); |
207 | 194 | ||
208 | BUG_ON(next_nr != padata->seq_nr); | ||
209 | |||
210 | spin_lock(&reorder->lock); | 195 | spin_lock(&reorder->lock); |
211 | list_del_init(&padata->list); | 196 | list_del_init(&padata->list); |
212 | atomic_dec(&pd->reorder_objects); | 197 | atomic_dec(&pd->reorder_objects); |
@@ -230,6 +215,7 @@ out: | |||
230 | 215 | ||
231 | static void padata_reorder(struct parallel_data *pd) | 216 | static void padata_reorder(struct parallel_data *pd) |
232 | { | 217 | { |
218 | int cb_cpu; | ||
233 | struct padata_priv *padata; | 219 | struct padata_priv *padata; |
234 | struct padata_serial_queue *squeue; | 220 | struct padata_serial_queue *squeue; |
235 | struct padata_instance *pinst = pd->pinst; | 221 | struct padata_instance *pinst = pd->pinst; |
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd) | |||
270 | return; | 256 | return; |
271 | } | 257 | } |
272 | 258 | ||
273 | squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); | 259 | cb_cpu = padata->cb_cpu; |
260 | squeue = per_cpu_ptr(pd->squeue, cb_cpu); | ||
274 | 261 | ||
275 | spin_lock(&squeue->serial.lock); | 262 | spin_lock(&squeue->serial.lock); |
276 | list_add_tail(&padata->list, &squeue->serial.list); | 263 | list_add_tail(&padata->list, &squeue->serial.list); |
277 | spin_unlock(&squeue->serial.lock); | 264 | spin_unlock(&squeue->serial.lock); |
278 | 265 | ||
279 | queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); | 266 | queue_work_on(cb_cpu, pinst->wq, &squeue->work); |
280 | } | 267 | } |
281 | 268 | ||
282 | spin_unlock_bh(&pd->lock); | 269 | spin_unlock_bh(&pd->lock); |
@@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd) | |||
400 | /* Initialize all percpu queues used by parallel workers */ | 387 | /* Initialize all percpu queues used by parallel workers */ |
401 | static void padata_init_pqueues(struct parallel_data *pd) | 388 | static void padata_init_pqueues(struct parallel_data *pd) |
402 | { | 389 | { |
403 | int cpu_index, num_cpus, cpu; | 390 | int cpu_index, cpu; |
404 | struct padata_parallel_queue *pqueue; | 391 | struct padata_parallel_queue *pqueue; |
405 | 392 | ||
406 | cpu_index = 0; | 393 | cpu_index = 0; |
@@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd) | |||
415 | INIT_WORK(&pqueue->work, padata_parallel_worker); | 402 | INIT_WORK(&pqueue->work, padata_parallel_worker); |
416 | atomic_set(&pqueue->num_obj, 0); | 403 | atomic_set(&pqueue->num_obj, 0); |
417 | } | 404 | } |
418 | |||
419 | num_cpus = cpumask_weight(pd->cpumask.pcpu); | ||
420 | pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; | ||
421 | } | 405 | } |
422 | 406 | ||
423 | /* Allocate and initialize the internal cpumask dependend resources. */ | 407 | /* Allocate and initialize the internal cpumask dependend resources. */ |
@@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, | |||
444 | padata_init_pqueues(pd); | 428 | padata_init_pqueues(pd); |
445 | padata_init_squeues(pd); | 429 | padata_init_squeues(pd); |
446 | setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); | 430 | setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); |
447 | atomic_set(&pd->seq_nr, -1); | 431 | pd->seq_nr = 0; |
448 | atomic_set(&pd->reorder_objects, 0); | 432 | atomic_set(&pd->reorder_objects, 0); |
449 | atomic_set(&pd->refcnt, 0); | 433 | atomic_set(&pd->refcnt, 0); |
450 | pd->pinst = pinst; | 434 | pd->pinst = pinst; |
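The sequence-number rework above in isolation: a plain counter under pd->seq_lock replaces the per-object atomic and the MAX_SEQ_NR reset, and the callback cpu is still picked round-robin. Condensed from padata_cpu_hash() and padata_do_parallel() above:

spin_lock(&pd->seq_lock);
cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
pd->seq_nr++;			/* no MAX_SEQ_NR clamp needed any more */
spin_unlock(&pd->seq_lock);

target_cpu = padata_index_to_cpu(pd, cpu_index);
queue = per_cpu_ptr(pd->pqueue, target_cpu);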
diff --git a/kernel/params.c b/kernel/params.c index 4bc965d8a1fe..f37d82631347 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -15,7 +15,6 @@ | |||
15 | along with this program; if not, write to the Free Software | 15 | along with this program; if not, write to the Free Software |
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 | */ | 17 | */ |
18 | #include <linux/module.h> | ||
19 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
20 | #include <linux/string.h> | 19 | #include <linux/string.h> |
21 | #include <linux/errno.h> | 20 | #include <linux/errno.h> |
@@ -88,6 +87,8 @@ static int parse_one(char *param, | |||
88 | char *val, | 87 | char *val, |
89 | const struct kernel_param *params, | 88 | const struct kernel_param *params, |
90 | unsigned num_params, | 89 | unsigned num_params, |
90 | s16 min_level, | ||
91 | s16 max_level, | ||
91 | int (*handle_unknown)(char *param, char *val)) | 92 | int (*handle_unknown)(char *param, char *val)) |
92 | { | 93 | { |
93 | unsigned int i; | 94 | unsigned int i; |
@@ -96,6 +97,9 @@ static int parse_one(char *param, | |||
96 | /* Find parameter */ | 97 | /* Find parameter */ |
97 | for (i = 0; i < num_params; i++) { | 98 | for (i = 0; i < num_params; i++) { |
98 | if (parameq(param, params[i].name)) { | 99 | if (parameq(param, params[i].name)) { |
100 | if (params[i].level < min_level | ||
101 | || params[i].level > max_level) | ||
102 | return 0; | ||
99 | /* No one handled NULL, so do it here. */ | 103 | /* No one handled NULL, so do it here. */ |
100 | if (!val && params[i].ops->set != param_set_bool | 104 | if (!val && params[i].ops->set != param_set_bool |
101 | && params[i].ops->set != param_set_bint) | 105 | && params[i].ops->set != param_set_bint) |
@@ -175,6 +179,8 @@ int parse_args(const char *name, | |||
175 | char *args, | 179 | char *args, |
176 | const struct kernel_param *params, | 180 | const struct kernel_param *params, |
177 | unsigned num, | 181 | unsigned num, |
182 | s16 min_level, | ||
183 | s16 max_level, | ||
178 | int (*unknown)(char *param, char *val)) | 184 | int (*unknown)(char *param, char *val)) |
179 | { | 185 | { |
180 | char *param, *val; | 186 | char *param, *val; |
@@ -190,7 +196,8 @@ int parse_args(const char *name, | |||
190 | 196 | ||
191 | args = next_arg(args, ¶m, &val); | 197 | args = next_arg(args, ¶m, &val); |
192 | irq_was_disabled = irqs_disabled(); | 198 | irq_was_disabled = irqs_disabled(); |
193 | ret = parse_one(param, val, params, num, unknown); | 199 | ret = parse_one(param, val, params, num, |
200 | min_level, max_level, unknown); | ||
194 | if (irq_was_disabled && !irqs_disabled()) { | 201 | if (irq_was_disabled && !irqs_disabled()) { |
195 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | 202 | printk(KERN_WARNING "parse_args(): option '%s' enabled " |
196 | "irq's!\n", param); | 203 | "irq's!\n", param); |
@@ -298,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp); | |||
298 | /* Actually could be a bool or an int, for historical reasons. */ | 305 | /* Actually could be a bool or an int, for historical reasons. */ |
299 | int param_set_bool(const char *val, const struct kernel_param *kp) | 306 | int param_set_bool(const char *val, const struct kernel_param *kp) |
300 | { | 307 | { |
301 | bool v; | ||
302 | int ret; | ||
303 | |||
304 | /* No equals means "set"... */ | 308 | /* No equals means "set"... */ |
305 | if (!val) val = "1"; | 309 | if (!val) val = "1"; |
306 | 310 | ||
307 | /* One of =[yYnN01] */ | 311 | /* One of =[yYnN01] */ |
308 | ret = strtobool(val, &v); | 312 | return strtobool(val, kp->arg); |
309 | if (ret) | ||
310 | return ret; | ||
311 | |||
312 | if (kp->flags & KPARAM_ISBOOL) | ||
313 | *(bool *)kp->arg = v; | ||
314 | else | ||
315 | *(int *)kp->arg = v; | ||
316 | return 0; | ||
317 | } | 313 | } |
318 | EXPORT_SYMBOL(param_set_bool); | 314 | EXPORT_SYMBOL(param_set_bool); |
319 | 315 | ||
320 | int param_get_bool(char *buffer, const struct kernel_param *kp) | 316 | int param_get_bool(char *buffer, const struct kernel_param *kp) |
321 | { | 317 | { |
322 | bool val; | ||
323 | if (kp->flags & KPARAM_ISBOOL) | ||
324 | val = *(bool *)kp->arg; | ||
325 | else | ||
326 | val = *(int *)kp->arg; | ||
327 | |||
328 | /* Y and N chosen as being relatively non-coder friendly */ | 318 | /* Y and N chosen as being relatively non-coder friendly */ |
329 | return sprintf(buffer, "%c", val ? 'Y' : 'N'); | 319 | return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); |
330 | } | 320 | } |
331 | EXPORT_SYMBOL(param_get_bool); | 321 | EXPORT_SYMBOL(param_get_bool); |
332 | 322 | ||
@@ -344,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp) | |||
344 | struct kernel_param dummy; | 334 | struct kernel_param dummy; |
345 | 335 | ||
346 | dummy.arg = &boolval; | 336 | dummy.arg = &boolval; |
347 | dummy.flags = KPARAM_ISBOOL; | ||
348 | ret = param_set_bool(val, &dummy); | 337 | ret = param_set_bool(val, &dummy); |
349 | if (ret == 0) | 338 | if (ret == 0) |
350 | *(bool *)kp->arg = !boolval; | 339 | *(bool *)kp->arg = !boolval; |
@@ -373,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp) | |||
373 | /* Match bool exactly, by re-using it. */ | 362 | /* Match bool exactly, by re-using it. */ |
374 | boolkp = *kp; | 363 | boolkp = *kp; |
375 | boolkp.arg = &v; | 364 | boolkp.arg = &v; |
376 | boolkp.flags |= KPARAM_ISBOOL; | ||
377 | 365 | ||
378 | ret = param_set_bool(val, &boolkp); | 366 | ret = param_set_bool(val, &boolkp); |
379 | if (ret == 0) | 367 | if (ret == 0) |
@@ -394,7 +382,7 @@ static int param_array(const char *name, | |||
394 | unsigned int min, unsigned int max, | 382 | unsigned int min, unsigned int max, |
395 | void *elem, int elemsize, | 383 | void *elem, int elemsize, |
396 | int (*set)(const char *, const struct kernel_param *kp), | 384 | int (*set)(const char *, const struct kernel_param *kp), |
397 | u16 flags, | 385 | s16 level, |
398 | unsigned int *num) | 386 | unsigned int *num) |
399 | { | 387 | { |
400 | int ret; | 388 | int ret; |
@@ -404,7 +392,7 @@ static int param_array(const char *name, | |||
404 | /* Get the name right for errors. */ | 392 | /* Get the name right for errors. */ |
405 | kp.name = name; | 393 | kp.name = name; |
406 | kp.arg = elem; | 394 | kp.arg = elem; |
407 | kp.flags = flags; | 395 | kp.level = level; |
408 | 396 | ||
409 | *num = 0; | 397 | *num = 0; |
410 | /* We expect a comma-separated list of values. */ | 398 | /* We expect a comma-separated list of values. */ |
@@ -445,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp) | |||
445 | unsigned int temp_num; | 433 | unsigned int temp_num; |
446 | 434 | ||
447 | return param_array(kp->name, val, 1, arr->max, arr->elem, | 435 | return param_array(kp->name, val, 1, arr->max, arr->elem, |
448 | arr->elemsize, arr->ops->set, kp->flags, | 436 | arr->elemsize, arr->ops->set, kp->level, |
449 | arr->num ?: &temp_num); | 437 | arr->num ?: &temp_num); |
450 | } | 438 | } |
451 | 439 | ||
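The params.c changes give parse_one() and parse_args() min_level/max_level bounds and skip any parameter whose level falls outside them, while kp->flags/KPARAM_ISBOOL is replaced by kp->level. The following self-contained sketch shows just that filtering step; struct kparam and apply_param are invented for the example and only mimic the shape of the kernel structures.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative cut-down parameter descriptor: name plus a level used
     * to decide at which boot stage the parameter may be applied. */
    struct kparam {
        const char *name;
        short level;
    };

    /* Return 0 if the parameter is unknown or outside the level window,
     * mirroring how parse_one() now skips out-of-range entries. */
    static int apply_param(const char *name, const struct kparam *params,
                           int nparams, short min_level, short max_level)
    {
        for (int i = 0; i < nparams; i++) {
            if (strcmp(name, params[i].name) != 0)
                continue;
            if (params[i].level < min_level || params[i].level > max_level)
                return 0;   /* known, but not handled at this stage */
            printf("handling '%s' (level %d)\n", name, params[i].level);
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        const struct kparam params[] = {
            { "early_opt", 0 },
            { "late_opt",  2 },
        };

        /* Only levels 0..1 are processed in this pass. */
        apply_param("early_opt", params, 2, 0, 1);  /* handled */
        apply_param("late_opt",  params, 2, 0, 1);  /* silently skipped */
        return 0;
    }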
diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deaccb..9f08dfabaf13 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) | |||
543 | */ | 543 | */ |
544 | void __init pidhash_init(void) | 544 | void __init pidhash_init(void) |
545 | { | 545 | { |
546 | int i, pidhash_size; | 546 | unsigned int i, pidhash_size; |
547 | 547 | ||
548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
549 | HASH_EARLY | HASH_SMALL, | 549 | HASH_EARLY | HASH_SMALL, |
550 | &pidhash_shift, NULL, 4096); | 550 | &pidhash_shift, NULL, 4096); |
551 | pidhash_size = 1 << pidhash_shift; | 551 | pidhash_size = 1U << pidhash_shift; |
552 | 552 | ||
553 | for (i = 0; i < pidhash_size; i++) | 553 | for (i = 0; i < pidhash_size; i++) |
554 | INIT_HLIST_HEAD(&pid_hash[i]); | 554 | INIT_HLIST_HEAD(&pid_hash[i]); |
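The pidhash_init() change switches i and pidhash_size to unsigned and shifts 1U instead of 1. The small sketch below shows why the unsigned literal matters once a shift count can approach the width of int; the value 31 is just an illustration, not the actual pidhash_shift.

    #include <stdio.h>

    int main(void)
    {
        unsigned int shift = 31;

        /* 1 << 31 overflows a signed int (undefined behaviour in C);
         * 1U << 31 is a well-defined unsigned value. */
        unsigned int size = 1U << shift;

        printf("table size for shift %u: %u buckets\n", shift, size);
        return 0;
    }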
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046d..57bc1fd35b3c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/acct.h> | 15 | #include <linux/acct.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/proc_fs.h> | 17 | #include <linux/proc_fs.h> |
18 | #include <linux/reboot.h> | ||
18 | 19 | ||
19 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 20 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
20 | 21 | ||
@@ -168,13 +169,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
168 | while (nr > 0) { | 169 | while (nr > 0) { |
169 | rcu_read_lock(); | 170 | rcu_read_lock(); |
170 | 171 | ||
171 | /* | ||
172 | * Any nested-container's init processes won't ignore the | ||
173 | * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). | ||
174 | */ | ||
175 | task = pid_task(find_vpid(nr), PIDTYPE_PID); | 172 | task = pid_task(find_vpid(nr), PIDTYPE_PID); |
176 | if (task) | 173 | if (task && !__fatal_signal_pending(task)) |
177 | send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); | 174 | send_sig_info(SIGKILL, SEND_SIG_FORCED, task); |
178 | 175 | ||
179 | rcu_read_unlock(); | 176 | rcu_read_unlock(); |
180 | 177 | ||
@@ -187,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
187 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 184 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
188 | } while (rc != -ECHILD); | 185 | } while (rc != -ECHILD); |
189 | 186 | ||
187 | if (pid_ns->reboot) | ||
188 | current->signal->group_exit_code = pid_ns->reboot; | ||
189 | |||
190 | acct_exit_ns(pid_ns); | 190 | acct_exit_ns(pid_ns); |
191 | return; | 191 | return; |
192 | } | 192 | } |
@@ -221,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = { | |||
221 | 221 | ||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | 222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; |
223 | 223 | ||
224 | int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | ||
225 | { | ||
226 | if (pid_ns == &init_pid_ns) | ||
227 | return 0; | ||
228 | |||
229 | switch (cmd) { | ||
230 | case LINUX_REBOOT_CMD_RESTART2: | ||
231 | case LINUX_REBOOT_CMD_RESTART: | ||
232 | pid_ns->reboot = SIGHUP; | ||
233 | break; | ||
234 | |||
235 | case LINUX_REBOOT_CMD_POWER_OFF: | ||
236 | case LINUX_REBOOT_CMD_HALT: | ||
237 | pid_ns->reboot = SIGINT; | ||
238 | break; | ||
239 | default: | ||
240 | return -EINVAL; | ||
241 | } | ||
242 | |||
243 | read_lock(&tasklist_lock); | ||
244 | force_sig(SIGKILL, pid_ns->child_reaper); | ||
245 | read_unlock(&tasklist_lock); | ||
246 | |||
247 | do_exit(0); | ||
248 | |||
249 | /* Not reached */ | ||
250 | return 0; | ||
251 | } | ||
252 | |||
224 | static __init int pid_namespaces_init(void) | 253 | static __init int pid_namespaces_init(void) |
225 | { | 254 | { |
226 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 255 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
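The new reboot_pid_ns() translates the reboot command issued inside a PID namespace into a signal code stored in pid_ns->reboot, then kills the namespace init and exits the caller. A hedged user-space sketch of just the command-to-signal mapping is below; LINUX_REBOOT_CMD_* and the signal numbers come from the standard Linux headers, the helper name is illustrative.

    #include <stdio.h>
    #include <signal.h>
    #include <linux/reboot.h>   /* LINUX_REBOOT_CMD_* constants */

    /* Map a reboot command to the exit signal recorded for the namespace,
     * mirroring the switch in reboot_pid_ns(); returns -1 for commands the
     * namespace does not virtualize. */
    static int reboot_cmd_to_signal(unsigned int cmd)
    {
        switch (cmd) {
        case LINUX_REBOOT_CMD_RESTART:
        case LINUX_REBOOT_CMD_RESTART2:
            return SIGHUP;
        case LINUX_REBOOT_CMD_POWER_OFF:
        case LINUX_REBOOT_CMD_HALT:
            return SIGINT;
        default:
            return -1;
        }
    }

    int main(void)
    {
        printf("restart  -> signal %d\n",
               reboot_cmd_to_signal(LINUX_REBOOT_CMD_RESTART));
        printf("poweroff -> signal %d\n",
               reboot_cmd_to_signal(LINUX_REBOOT_CMD_POWER_OFF));
        return 0;
    }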
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 07e0e28ffba7..66d808ec5252 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -1,7 +1,8 @@ | |||
1 | 1 | ||
2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG |
3 | 3 | ||
4 | obj-$(CONFIG_PM) += main.o qos.o | 4 | obj-y += qos.o |
5 | obj-$(CONFIG_PM) += main.o | ||
5 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o | 6 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o |
6 | obj-$(CONFIG_FREEZER) += process.o | 7 | obj-$(CONFIG_FREEZER) += process.o |
7 | obj-$(CONFIG_SUSPEND) += suspend.o | 8 | obj-$(CONFIG_SUSPEND) += suspend.o |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..0a186cfde788 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, | |||
245 | * create_image - Create a hibernation image. | 245 | * create_image - Create a hibernation image. |
246 | * @platform_mode: Whether or not to use the platform driver. | 246 | * @platform_mode: Whether or not to use the platform driver. |
247 | * | 247 | * |
248 | * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image | 248 | * Execute device drivers' "late" and "noirq" freeze callbacks, create a |
249 | * and execute the drivers' .thaw_noirq() callbacks. | 249 | * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. |
250 | * | 250 | * |
251 | * Control reappears in this routine after the subsequent restore. | 251 | * Control reappears in this routine after the subsequent restore. |
252 | */ | 252 | */ |
@@ -254,7 +254,7 @@ static int create_image(int platform_mode) | |||
254 | { | 254 | { |
255 | int error; | 255 | int error; |
256 | 256 | ||
257 | error = dpm_suspend_noirq(PMSG_FREEZE); | 257 | error = dpm_suspend_end(PMSG_FREEZE); |
258 | if (error) { | 258 | if (error) { |
259 | printk(KERN_ERR "PM: Some devices failed to power down, " | 259 | printk(KERN_ERR "PM: Some devices failed to power down, " |
260 | "aborting hibernation\n"); | 260 | "aborting hibernation\n"); |
@@ -306,7 +306,7 @@ static int create_image(int platform_mode) | |||
306 | Platform_finish: | 306 | Platform_finish: |
307 | platform_finish(platform_mode); | 307 | platform_finish(platform_mode); |
308 | 308 | ||
309 | dpm_resume_noirq(in_suspend ? | 309 | dpm_resume_start(in_suspend ? |
310 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 310 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
311 | 311 | ||
312 | return error; | 312 | return error; |
@@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode) | |||
343 | * successful freezer test. | 343 | * successful freezer test. |
344 | */ | 344 | */ |
345 | freezer_test_done = true; | 345 | freezer_test_done = true; |
346 | goto Cleanup; | 346 | goto Thaw; |
347 | } | 347 | } |
348 | 348 | ||
349 | error = dpm_prepare(PMSG_FREEZE); | 349 | error = dpm_prepare(PMSG_FREEZE); |
350 | if (error) { | 350 | if (error) { |
351 | dpm_complete(PMSG_RECOVER); | 351 | dpm_complete(PMSG_RECOVER); |
352 | goto Cleanup; | 352 | goto Thaw; |
353 | } | 353 | } |
354 | 354 | ||
355 | suspend_console(); | 355 | suspend_console(); |
@@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode) | |||
385 | platform_end(platform_mode); | 385 | platform_end(platform_mode); |
386 | return error; | 386 | return error; |
387 | 387 | ||
388 | Thaw: | ||
389 | thaw_kernel_threads(); | ||
388 | Cleanup: | 390 | Cleanup: |
389 | swsusp_free(); | 391 | swsusp_free(); |
390 | goto Close; | 392 | goto Close; |
@@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode) | |||
394 | * resume_target_kernel - Restore system state from a hibernation image. | 396 | * resume_target_kernel - Restore system state from a hibernation image. |
395 | * @platform_mode: Whether or not to use the platform driver. | 397 | * @platform_mode: Whether or not to use the platform driver. |
396 | * | 398 | * |
397 | * Execute device drivers' .freeze_noirq() callbacks, restore the contents of | 399 | * Execute device drivers' "noirq" and "late" freeze callbacks, restore the |
398 | * highmem that have not been restored yet from the image and run the low-level | 400 | * contents of highmem that have not been restored yet from the image and run |
399 | * code that will restore the remaining contents of memory and switch to the | 401 | * the low-level code that will restore the remaining contents of memory and |
400 | * just restored target kernel. | 402 | * switch to the just restored target kernel. |
401 | */ | 403 | */ |
402 | static int resume_target_kernel(bool platform_mode) | 404 | static int resume_target_kernel(bool platform_mode) |
403 | { | 405 | { |
404 | int error; | 406 | int error; |
405 | 407 | ||
406 | error = dpm_suspend_noirq(PMSG_QUIESCE); | 408 | error = dpm_suspend_end(PMSG_QUIESCE); |
407 | if (error) { | 409 | if (error) { |
408 | printk(KERN_ERR "PM: Some devices failed to power down, " | 410 | printk(KERN_ERR "PM: Some devices failed to power down, " |
409 | "aborting resume\n"); | 411 | "aborting resume\n"); |
@@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode) | |||
460 | Cleanup: | 462 | Cleanup: |
461 | platform_restore_cleanup(platform_mode); | 463 | platform_restore_cleanup(platform_mode); |
462 | 464 | ||
463 | dpm_resume_noirq(PMSG_RECOVER); | 465 | dpm_resume_start(PMSG_RECOVER); |
464 | 466 | ||
465 | return error; | 467 | return error; |
466 | } | 468 | } |
@@ -518,7 +520,7 @@ int hibernation_platform_enter(void) | |||
518 | goto Resume_devices; | 520 | goto Resume_devices; |
519 | } | 521 | } |
520 | 522 | ||
521 | error = dpm_suspend_noirq(PMSG_HIBERNATE); | 523 | error = dpm_suspend_end(PMSG_HIBERNATE); |
522 | if (error) | 524 | if (error) |
523 | goto Resume_devices; | 525 | goto Resume_devices; |
524 | 526 | ||
@@ -549,7 +551,7 @@ int hibernation_platform_enter(void) | |||
549 | Platform_finish: | 551 | Platform_finish: |
550 | hibernation_ops->finish(); | 552 | hibernation_ops->finish(); |
551 | 553 | ||
552 | dpm_resume_noirq(PMSG_RESTORE); | 554 | dpm_resume_start(PMSG_RESTORE); |
553 | 555 | ||
554 | Resume_devices: | 556 | Resume_devices: |
555 | entering_platform_hibernation = false; | 557 | entering_platform_hibernation = false; |
@@ -616,7 +618,7 @@ int hibernate(void) | |||
616 | /* Allocate memory management structures */ | 618 | /* Allocate memory management structures */ |
617 | error = create_basic_memory_bitmaps(); | 619 | error = create_basic_memory_bitmaps(); |
618 | if (error) | 620 | if (error) |
619 | goto Exit; | 621 | goto Enable_umh; |
620 | 622 | ||
621 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 623 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
622 | sys_sync(); | 624 | sys_sync(); |
@@ -624,15 +626,11 @@ int hibernate(void) | |||
624 | 626 | ||
625 | error = freeze_processes(); | 627 | error = freeze_processes(); |
626 | if (error) | 628 | if (error) |
627 | goto Finish; | 629 | goto Free_bitmaps; |
628 | 630 | ||
629 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); | 631 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); |
630 | if (error) | 632 | if (error || freezer_test_done) |
631 | goto Thaw; | ||
632 | if (freezer_test_done) { | ||
633 | freezer_test_done = false; | ||
634 | goto Thaw; | 633 | goto Thaw; |
635 | } | ||
636 | 634 | ||
637 | if (in_suspend) { | 635 | if (in_suspend) { |
638 | unsigned int flags = 0; | 636 | unsigned int flags = 0; |
@@ -657,8 +655,13 @@ int hibernate(void) | |||
657 | 655 | ||
658 | Thaw: | 656 | Thaw: |
659 | thaw_processes(); | 657 | thaw_processes(); |
660 | Finish: | 658 | |
659 | /* Don't bother checking whether freezer_test_done is true */ | ||
660 | freezer_test_done = false; | ||
661 | |||
662 | Free_bitmaps: | ||
661 | free_basic_memory_bitmaps(); | 663 | free_basic_memory_bitmaps(); |
664 | Enable_umh: | ||
662 | usermodehelper_enable(); | 665 | usermodehelper_enable(); |
663 | Exit: | 666 | Exit: |
664 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 667 | pm_notifier_call_chain(PM_POST_HIBERNATION); |
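The hibernate() hunk reworks the error path so each failure jumps to the label that undoes exactly the setup completed so far (Thaw, Free_bitmaps, Enable_umh, Exit). The sketch below shows the same staged-unwind pattern with invented setup_a/setup_b helpers; it illustrates the pattern only, not the hibernation code itself.

    #include <stdio.h>

    /* Stubbed setup steps; a nonzero return simulates failure. */
    static int setup_a(void) { puts("setup_a"); return 0; }
    static int setup_b(void) { puts("setup_b"); return -1; }
    static void undo_a(void) { puts("undo_a"); }
    static void undo_b(void) { puts("undo_b"); }

    /* Each label undoes only what was successfully set up before the
     * failure, mirroring the Thaw/Free_bitmaps/Enable_umh chain. */
    static int do_transition(void)
    {
        int error;

        error = setup_a();
        if (error)
            goto out;

        error = setup_b();
        if (error)
            goto undo_a;

        puts("transition body");
        undo_b();
    undo_a:
        undo_a();
    out:
        return error;
    }

    int main(void)
    {
        printf("result: %d\n", do_transition());
        return 0;
    }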
diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..1c12581f1c62 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) | |||
165 | last_errno %= REC_FAILED_NUM; | 165 | last_errno %= REC_FAILED_NUM; |
166 | last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; | 166 | last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; |
167 | last_step %= REC_FAILED_NUM; | 167 | last_step %= REC_FAILED_NUM; |
168 | seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" | 168 | seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" |
169 | "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", | 169 | "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", |
170 | "success", suspend_stats.success, | 170 | "success", suspend_stats.success, |
171 | "fail", suspend_stats.fail, | 171 | "fail", suspend_stats.fail, |
172 | "failed_freeze", suspend_stats.failed_freeze, | 172 | "failed_freeze", suspend_stats.failed_freeze, |
173 | "failed_prepare", suspend_stats.failed_prepare, | 173 | "failed_prepare", suspend_stats.failed_prepare, |
174 | "failed_suspend", suspend_stats.failed_suspend, | 174 | "failed_suspend", suspend_stats.failed_suspend, |
175 | "failed_suspend_late", | ||
176 | suspend_stats.failed_suspend_late, | ||
175 | "failed_suspend_noirq", | 177 | "failed_suspend_noirq", |
176 | suspend_stats.failed_suspend_noirq, | 178 | suspend_stats.failed_suspend_noirq, |
177 | "failed_resume", suspend_stats.failed_resume, | 179 | "failed_resume", suspend_stats.failed_resume, |
180 | "failed_resume_early", | ||
181 | suspend_stats.failed_resume_early, | ||
178 | "failed_resume_noirq", | 182 | "failed_resume_noirq", |
179 | suspend_stats.failed_resume_noirq); | 183 | suspend_stats.failed_resume_noirq); |
180 | seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", | 184 | seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", |
@@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
287 | 291 | ||
288 | #ifdef CONFIG_SUSPEND | 292 | #ifdef CONFIG_SUSPEND |
289 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | 293 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { |
290 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) | 294 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { |
295 | error = pm_suspend(state); | ||
291 | break; | 296 | break; |
292 | } | 297 | } |
293 | if (state < PM_SUSPEND_MAX && *s) { | ||
294 | error = enter_state(state); | ||
295 | if (error) { | ||
296 | suspend_stats.fail++; | ||
297 | dpm_save_failed_errno(error); | ||
298 | } else | ||
299 | suspend_stats.success++; | ||
300 | } | 298 | } |
301 | #endif | 299 | #endif |
302 | 300 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index 21724eee5206..98f3622d7407 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -177,13 +177,11 @@ extern const char *const pm_states[]; | |||
177 | 177 | ||
178 | extern bool valid_state(suspend_state_t state); | 178 | extern bool valid_state(suspend_state_t state); |
179 | extern int suspend_devices_and_enter(suspend_state_t state); | 179 | extern int suspend_devices_and_enter(suspend_state_t state); |
180 | extern int enter_state(suspend_state_t state); | ||
181 | #else /* !CONFIG_SUSPEND */ | 180 | #else /* !CONFIG_SUSPEND */ |
182 | static inline int suspend_devices_and_enter(suspend_state_t state) | 181 | static inline int suspend_devices_and_enter(suspend_state_t state) |
183 | { | 182 | { |
184 | return -ENOSYS; | 183 | return -ENOSYS; |
185 | } | 184 | } |
186 | static inline int enter_state(suspend_state_t state) { return -ENOSYS; } | ||
187 | static inline bool valid_state(suspend_state_t state) { return false; } | 185 | static inline bool valid_state(suspend_state_t state) { return false; } |
188 | #endif /* !CONFIG_SUSPEND */ | 186 | #endif /* !CONFIG_SUSPEND */ |
189 | 187 | ||
@@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void) | |||
234 | int error; | 232 | int error; |
235 | 233 | ||
236 | error = freeze_processes(); | 234 | error = freeze_processes(); |
237 | |||
238 | /* | 235 | /* |
239 | * freeze_processes() automatically thaws every task if freezing | 236 | * freeze_processes() automatically thaws every task if freezing |
240 | * fails. So we need not do anything extra upon error. | 237 | * fails. So we need not do anything extra upon error. |
241 | */ | 238 | */ |
242 | if (error) | 239 | if (error) |
243 | goto Finish; | 240 | return error; |
244 | 241 | ||
245 | error = freeze_kernel_threads(); | 242 | error = freeze_kernel_threads(); |
246 | |||
247 | /* | 243 | /* |
248 | * freeze_kernel_threads() thaws only kernel threads upon freezing | 244 | * freeze_kernel_threads() thaws only kernel threads upon freezing |
249 | * failure. So we have to thaw the userspace tasks ourselves. | 245 | * failure. So we have to thaw the userspace tasks ourselves. |
@@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void) | |||
251 | if (error) | 247 | if (error) |
252 | thaw_processes(); | 248 | thaw_processes(); |
253 | 249 | ||
254 | Finish: | ||
255 | return error; | 250 | return error; |
256 | } | 251 | } |
257 | 252 | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index 7e426459e60a..0d2aeb226108 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only) | |||
53 | * It is "frozen enough". If the task does wake | 53 | * It is "frozen enough". If the task does wake |
54 | * up, it will immediately call try_to_freeze. | 54 | * up, it will immediately call try_to_freeze. |
55 | * | 55 | * |
56 | * Because freeze_task() goes through p's | 56 | * Because freeze_task() goes through p's scheduler lock, it's |
57 | * scheduler lock after setting TIF_FREEZE, it's | 57 | * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING |
58 | * guaranteed that either we see TASK_RUNNING or | 58 | * transition can't race with task state testing here. |
59 | * try_to_stop() after schedule() in ptrace/signal | ||
60 | * stop sees TIF_FREEZE. | ||
61 | */ | 59 | */ |
62 | if (!task_is_stopped_or_traced(p) && | 60 | if (!task_is_stopped_or_traced(p) && |
63 | !freezer_should_skip(p)) | 61 | !freezer_should_skip(p)) |
@@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only) | |||
98 | elapsed_csecs / 100, elapsed_csecs % 100, | 96 | elapsed_csecs / 100, elapsed_csecs % 100, |
99 | todo - wq_busy, wq_busy); | 97 | todo - wq_busy, wq_busy); |
100 | 98 | ||
101 | read_lock(&tasklist_lock); | 99 | if (!wakeup) { |
102 | do_each_thread(g, p) { | 100 | read_lock(&tasklist_lock); |
103 | if (!wakeup && !freezer_should_skip(p) && | 101 | do_each_thread(g, p) { |
104 | p != current && freezing(p) && !frozen(p)) | 102 | if (p != current && !freezer_should_skip(p) |
105 | sched_show_task(p); | 103 | && freezing(p) && !frozen(p)) |
106 | } while_each_thread(g, p); | 104 | sched_show_task(p); |
107 | read_unlock(&tasklist_lock); | 105 | } while_each_thread(g, p); |
106 | read_unlock(&tasklist_lock); | ||
107 | } | ||
108 | } else { | 108 | } else { |
109 | printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, | 109 | printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, |
110 | elapsed_csecs % 100); | 110 | elapsed_csecs % 100); |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 995e3bd3417b..d6d6dbd1ecc0 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
469 | static int __init pm_qos_power_init(void) | 469 | static int __init pm_qos_power_init(void) |
470 | { | 470 | { |
471 | int ret = 0; | 471 | int ret = 0; |
472 | int i; | ||
472 | 473 | ||
473 | ret = register_pm_qos_misc(&cpu_dma_pm_qos); | 474 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); |
474 | if (ret < 0) { | 475 | |
475 | printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); | 476 | for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { |
476 | return ret; | 477 | ret = register_pm_qos_misc(pm_qos_array[i]); |
477 | } | 478 | if (ret < 0) { |
478 | ret = register_pm_qos_misc(&network_lat_pm_qos); | 479 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", |
479 | if (ret < 0) { | 480 | pm_qos_array[i]->name); |
480 | printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); | 481 | return ret; |
481 | return ret; | 482 | } |
482 | } | 483 | } |
483 | ret = register_pm_qos_misc(&network_throughput_pm_qos); | ||
484 | if (ret < 0) | ||
485 | printk(KERN_ERR | ||
486 | "pm_qos_param: network_throughput setup failed\n"); | ||
487 | 484 | ||
488 | return ret; | 485 | return ret; |
489 | } | 486 | } |
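pm_qos_power_init() now walks pm_qos_array in a loop instead of registering each class by hand, with a BUILD_BUG_ON guarding the array size against the class count. A minimal sketch of the same loop-plus-error-report shape follows, using an invented register_one() and a runtime assert in place of the compile-time check.

    #include <assert.h>
    #include <stdio.h>

    struct qos_class {
        const char *name;
    };

    #define NUM_CLASSES 4   /* index 0 is a reserved slot, as in pm_qos */

    static struct qos_class classes[NUM_CLASSES] = {
        { "reserved" }, { "cpu_dma_latency" },
        { "network_latency" }, { "network_throughput" },
    };

    /* Stand-in for register_pm_qos_misc(): fail on a NULL name. */
    static int register_one(const struct qos_class *c)
    {
        return c->name ? 0 : -1;
    }

    int main(void)
    {
        /* Compile-time check in the kernel (BUILD_BUG_ON); runtime here. */
        assert(sizeof(classes) / sizeof(classes[0]) == NUM_CLASSES);

        for (int i = 1; i < NUM_CLASSES; i++) {
            if (register_one(&classes[i]) < 0) {
                fprintf(stderr, "%s setup failed\n", classes[i].name);
                return 1;
            }
            printf("registered %s\n", classes[i].name);
        }
        return 0;
    }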
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..0de28576807d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
711 | list_for_each_entry(region, &nosave_regions, list) { | 711 | list_for_each_entry(region, &nosave_regions, list) { |
712 | unsigned long pfn; | 712 | unsigned long pfn; |
713 | 713 | ||
714 | pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", | 714 | pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", |
715 | region->start_pfn << PAGE_SHIFT, | 715 | (unsigned long long) region->start_pfn << PAGE_SHIFT, |
716 | region->end_pfn << PAGE_SHIFT); | 716 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) |
717 | - 1); | ||
717 | 718 | ||
718 | for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) | 719 | for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) |
719 | if (pfn_valid(pfn)) { | 720 | if (pfn_valid(pfn)) { |
@@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | |||
1000 | s_page = pfn_to_page(src_pfn); | 1001 | s_page = pfn_to_page(src_pfn); |
1001 | d_page = pfn_to_page(dst_pfn); | 1002 | d_page = pfn_to_page(dst_pfn); |
1002 | if (PageHighMem(s_page)) { | 1003 | if (PageHighMem(s_page)) { |
1003 | src = kmap_atomic(s_page, KM_USER0); | 1004 | src = kmap_atomic(s_page); |
1004 | dst = kmap_atomic(d_page, KM_USER1); | 1005 | dst = kmap_atomic(d_page); |
1005 | do_copy_page(dst, src); | 1006 | do_copy_page(dst, src); |
1006 | kunmap_atomic(dst, KM_USER1); | 1007 | kunmap_atomic(dst); |
1007 | kunmap_atomic(src, KM_USER0); | 1008 | kunmap_atomic(src); |
1008 | } else { | 1009 | } else { |
1009 | if (PageHighMem(d_page)) { | 1010 | if (PageHighMem(d_page)) { |
1010 | /* Page pointed to by src may contain some kernel | 1011 | /* Page pointed to by src may contain some kernel |
1011 | * data modified by kmap_atomic() | 1012 | * data modified by kmap_atomic() |
1012 | */ | 1013 | */ |
1013 | safe_copy_page(buffer, s_page); | 1014 | safe_copy_page(buffer, s_page); |
1014 | dst = kmap_atomic(d_page, KM_USER0); | 1015 | dst = kmap_atomic(d_page); |
1015 | copy_page(dst, buffer); | 1016 | copy_page(dst, buffer); |
1016 | kunmap_atomic(dst, KM_USER0); | 1017 | kunmap_atomic(dst); |
1017 | } else { | 1018 | } else { |
1018 | safe_copy_page(page_address(d_page), s_page); | 1019 | safe_copy_page(page_address(d_page), s_page); |
1019 | } | 1020 | } |
@@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle) | |||
1728 | */ | 1729 | */ |
1729 | void *kaddr; | 1730 | void *kaddr; |
1730 | 1731 | ||
1731 | kaddr = kmap_atomic(page, KM_USER0); | 1732 | kaddr = kmap_atomic(page); |
1732 | copy_page(buffer, kaddr); | 1733 | copy_page(buffer, kaddr); |
1733 | kunmap_atomic(kaddr, KM_USER0); | 1734 | kunmap_atomic(kaddr); |
1734 | handle->buffer = buffer; | 1735 | handle->buffer = buffer; |
1735 | } else { | 1736 | } else { |
1736 | handle->buffer = page_address(page); | 1737 | handle->buffer = page_address(page); |
@@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void) | |||
2014 | if (last_highmem_page) { | 2015 | if (last_highmem_page) { |
2015 | void *dst; | 2016 | void *dst; |
2016 | 2017 | ||
2017 | dst = kmap_atomic(last_highmem_page, KM_USER0); | 2018 | dst = kmap_atomic(last_highmem_page); |
2018 | copy_page(dst, buffer); | 2019 | copy_page(dst, buffer); |
2019 | kunmap_atomic(dst, KM_USER0); | 2020 | kunmap_atomic(dst); |
2020 | last_highmem_page = NULL; | 2021 | last_highmem_page = NULL; |
2021 | } | 2022 | } |
2022 | } | 2023 | } |
@@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | |||
2309 | { | 2310 | { |
2310 | void *kaddr1, *kaddr2; | 2311 | void *kaddr1, *kaddr2; |
2311 | 2312 | ||
2312 | kaddr1 = kmap_atomic(p1, KM_USER0); | 2313 | kaddr1 = kmap_atomic(p1); |
2313 | kaddr2 = kmap_atomic(p2, KM_USER1); | 2314 | kaddr2 = kmap_atomic(p2); |
2314 | copy_page(buf, kaddr1); | 2315 | copy_page(buf, kaddr1); |
2315 | copy_page(kaddr1, kaddr2); | 2316 | copy_page(kaddr1, kaddr2); |
2316 | copy_page(kaddr2, buf); | 2317 | copy_page(kaddr2, buf); |
2317 | kunmap_atomic(kaddr2, KM_USER1); | 2318 | kunmap_atomic(kaddr2); |
2318 | kunmap_atomic(kaddr1, KM_USER0); | 2319 | kunmap_atomic(kaddr1); |
2319 | } | 2320 | } |
2320 | 2321 | ||
2321 | /** | 2322 | /** |
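The snapshot.c hunks drop the KM_USER* slot argument, since kmap_atomic()/kunmap_atomic() now take only the page (or address). The mapping API itself is kernel-only; below is a plain-memory sketch of the three-copy swap that swap_two_pages_data() performs through a bounce buffer, to make the copy order explicit.

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 16   /* toy "page" size for the example */

    /* Swap the contents of p1 and p2 through a caller-provided buffer,
     * using the same copy order as swap_two_pages_data(). */
    static void swap_pages(char *p1, char *p2, char *buf)
    {
        memcpy(buf, p1, PAGE_SIZE);   /* buf <- p1 */
        memcpy(p1, p2, PAGE_SIZE);    /* p1  <- p2 */
        memcpy(p2, buf, PAGE_SIZE);   /* p2  <- old p1 */
    }

    int main(void)
    {
        char a[PAGE_SIZE] = "AAAAAAAAAAAAAAA";
        char b[PAGE_SIZE] = "BBBBBBBBBBBBBBB";
        char tmp[PAGE_SIZE];

        swap_pages(a, b, tmp);
        printf("a=%s b=%s\n", a, b);
        return 0;
    }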
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..88e5c967370d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = { | |||
37 | static const struct platform_suspend_ops *suspend_ops; | 37 | static const struct platform_suspend_ops *suspend_ops; |
38 | 38 | ||
39 | /** | 39 | /** |
40 | * suspend_set_ops - Set the global suspend method table. | 40 | * suspend_set_ops - Set the global suspend method table. |
41 | * @ops: Pointer to ops structure. | 41 | * @ops: Suspend operations to use. |
42 | */ | 42 | */ |
43 | void suspend_set_ops(const struct platform_suspend_ops *ops) | 43 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
44 | { | 44 | { |
@@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state) | |||
58 | } | 58 | } |
59 | 59 | ||
60 | /** | 60 | /** |
61 | * suspend_valid_only_mem - generic memory-only valid callback | 61 | * suspend_valid_only_mem - Generic memory-only valid callback. |
62 | * | 62 | * |
63 | * Platform drivers that implement mem suspend only and only need | 63 | * Platform drivers that implement mem suspend only and only need to check for |
64 | * to check for that in their .valid callback can use this instead | 64 | * that in their .valid() callback can use this instead of rolling their own |
65 | * of rolling their own .valid callback. | 65 | * .valid() callback. |
66 | */ | 66 | */ |
67 | int suspend_valid_only_mem(suspend_state_t state) | 67 | int suspend_valid_only_mem(suspend_state_t state) |
68 | { | 68 | { |
@@ -83,10 +83,11 @@ static int suspend_test(int level) | |||
83 | } | 83 | } |
84 | 84 | ||
85 | /** | 85 | /** |
86 | * suspend_prepare - Do prep work before entering low-power state. | 86 | * suspend_prepare - Prepare for entering system sleep state. |
87 | * | 87 | * |
88 | * This is common code that is called for each state that we're entering. | 88 | * Common code run for every system sleep state that can be entered (except for |
89 | * Run suspend notifiers, allocate a console and stop all processes. | 89 | * hibernation). Run suspend notifiers, allocate the "suspend" console and |
90 | * freeze processes. | ||
90 | */ | 91 | */ |
91 | static int suspend_prepare(void) | 92 | static int suspend_prepare(void) |
92 | { | 93 | { |
@@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) | |||
131 | } | 132 | } |
132 | 133 | ||
133 | /** | 134 | /** |
134 | * suspend_enter - enter the desired system sleep state. | 135 | * suspend_enter - Make the system enter the given sleep state. |
135 | * @state: State to enter | 136 | * @state: System sleep state to enter. |
136 | * @wakeup: Returns information that suspend should not be entered again. | 137 | * @wakeup: Returns information that the sleep state should not be re-entered. |
137 | * | 138 | * |
138 | * This function should be called after devices have been suspended. | 139 | * This function should be called after devices have been suspended. |
139 | */ | 140 | */ |
@@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
147 | goto Platform_finish; | 148 | goto Platform_finish; |
148 | } | 149 | } |
149 | 150 | ||
150 | error = dpm_suspend_noirq(PMSG_SUSPEND); | 151 | error = dpm_suspend_end(PMSG_SUSPEND); |
151 | if (error) { | 152 | if (error) { |
152 | printk(KERN_ERR "PM: Some devices failed to power down\n"); | 153 | printk(KERN_ERR "PM: Some devices failed to power down\n"); |
153 | goto Platform_finish; | 154 | goto Platform_finish; |
@@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
189 | if (suspend_ops->wake) | 190 | if (suspend_ops->wake) |
190 | suspend_ops->wake(); | 191 | suspend_ops->wake(); |
191 | 192 | ||
192 | dpm_resume_noirq(PMSG_RESUME); | 193 | dpm_resume_start(PMSG_RESUME); |
193 | 194 | ||
194 | Platform_finish: | 195 | Platform_finish: |
195 | if (suspend_ops->finish) | 196 | if (suspend_ops->finish) |
@@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
199 | } | 200 | } |
200 | 201 | ||
201 | /** | 202 | /** |
202 | * suspend_devices_and_enter - suspend devices and enter the desired system | 203 | * suspend_devices_and_enter - Suspend devices and enter system sleep state. |
203 | * sleep state. | 204 | * @state: System sleep state to enter. |
204 | * @state: state to enter | ||
205 | */ | 205 | */ |
206 | int suspend_devices_and_enter(suspend_state_t state) | 206 | int suspend_devices_and_enter(suspend_state_t state) |
207 | { | 207 | { |
@@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
251 | } | 251 | } |
252 | 252 | ||
253 | /** | 253 | /** |
254 | * suspend_finish - Do final work before exiting suspend sequence. | 254 | * suspend_finish - Clean up before finishing the suspend sequence. |
255 | * | 255 | * |
256 | * Call platform code to clean up, restart processes, and free the | 256 | * Call platform code to clean up, restart processes, and free the console that |
257 | * console that we've allocated. This is not called for suspend-to-disk. | 257 | * we've allocated. This routine is not called for hibernation. |
258 | */ | 258 | */ |
259 | static void suspend_finish(void) | 259 | static void suspend_finish(void) |
260 | { | 260 | { |
@@ -265,16 +265,14 @@ static void suspend_finish(void) | |||
265 | } | 265 | } |
266 | 266 | ||
267 | /** | 267 | /** |
268 | * enter_state - Do common work of entering low-power state. | 268 | * enter_state - Do common work needed to enter system sleep state. |
269 | * @state: pm_state structure for state we're entering. | 269 | * @state: System sleep state to enter. |
270 | * | 270 | * |
271 | * Make sure we're the only ones trying to enter a sleep state. Fail | 271 | * Make sure that no one else is trying to put the system into a sleep state. |
272 | * if someone has beat us to it, since we don't want anything weird to | 272 | * Fail if that's not the case. Otherwise, prepare for system suspend, make the |
273 | * happen when we wake up. | 273 | * system enter the given sleep state and clean up after wakeup. |
274 | * Then, do the setup for suspend, enter the state, and cleaup (after | ||
275 | * we've woken up). | ||
276 | */ | 274 | */ |
277 | int enter_state(suspend_state_t state) | 275 | static int enter_state(suspend_state_t state) |
278 | { | 276 | { |
279 | int error; | 277 | int error; |
280 | 278 | ||
@@ -310,24 +308,26 @@ int enter_state(suspend_state_t state) | |||
310 | } | 308 | } |
311 | 309 | ||
312 | /** | 310 | /** |
313 | * pm_suspend - Externally visible function for suspending system. | 311 | * pm_suspend - Externally visible function for suspending the system. |
314 | * @state: Enumerated value of state to enter. | 312 | * @state: System sleep state to enter. |
315 | * | 313 | * |
316 | * Determine whether or not value is within range, get state | 314 | * Check if the value of @state represents one of the supported states, |
317 | * structure, and enter (above). | 315 | * execute enter_state() and update system suspend statistics. |
318 | */ | 316 | */ |
319 | int pm_suspend(suspend_state_t state) | 317 | int pm_suspend(suspend_state_t state) |
320 | { | 318 | { |
321 | int ret; | 319 | int error; |
322 | if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { | 320 | |
323 | ret = enter_state(state); | 321 | if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) |
324 | if (ret) { | 322 | return -EINVAL; |
325 | suspend_stats.fail++; | 323 | |
326 | dpm_save_failed_errno(ret); | 324 | error = enter_state(state); |
327 | } else | 325 | if (error) { |
328 | suspend_stats.success++; | 326 | suspend_stats.fail++; |
329 | return ret; | 327 | dpm_save_failed_errno(error); |
328 | } else { | ||
329 | suspend_stats.success++; | ||
330 | } | 330 | } |
331 | return -EINVAL; | 331 | return error; |
332 | } | 332 | } |
333 | EXPORT_SYMBOL(pm_suspend); | 333 | EXPORT_SYMBOL(pm_suspend); |
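pm_suspend() is rewritten to reject out-of-range states up front and to fold the success/failure statistics update into one place, which is why the state_store() hunk earlier now simply calls it. A user-space sketch of that validate-then-account shape, with an invented enter_state() stub and a local stats struct:

    #include <stdio.h>
    #include <errno.h>

    enum sleep_state { STATE_ON, STATE_STANDBY, STATE_MEM, STATE_MAX };

    static struct { int success, fail; } stats;

    /* Stub for the real state transition; fails for STATE_STANDBY here
     * purely so both branches of the accounting are exercised. */
    static int enter_state(enum sleep_state state)
    {
        return state == STATE_STANDBY ? -EBUSY : 0;
    }

    /* Validate first, then do the work, then account, as in pm_suspend(). */
    static int do_suspend(enum sleep_state state)
    {
        int error;

        if (state <= STATE_ON || state >= STATE_MAX)
            return -EINVAL;

        error = enter_state(state);
        if (error)
            stats.fail++;
        else
            stats.success++;
        return error;
    }

    int main(void)
    {
        do_suspend(STATE_MEM);
        do_suspend(STATE_STANDBY);
        do_suspend(STATE_ON);      /* rejected before any work */
        printf("success=%d fail=%d\n", stats.success, stats.fail);
        return 0;
    }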
diff --git a/kernel/power/user.c b/kernel/power/user.c index 3e100075b13c..33c4329205af 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -249,16 +249,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
249 | } | 249 | } |
250 | pm_restore_gfp_mask(); | 250 | pm_restore_gfp_mask(); |
251 | error = hibernation_snapshot(data->platform_support); | 251 | error = hibernation_snapshot(data->platform_support); |
252 | if (error) { | 252 | if (!error) { |
253 | thaw_kernel_threads(); | ||
254 | } else { | ||
255 | error = put_user(in_suspend, (int __user *)arg); | 253 | error = put_user(in_suspend, (int __user *)arg); |
256 | if (!error && !freezer_test_done) | 254 | data->ready = !freezer_test_done && !error; |
257 | data->ready = 1; | 255 | freezer_test_done = false; |
258 | if (freezer_test_done) { | ||
259 | freezer_test_done = false; | ||
260 | thaw_kernel_threads(); | ||
261 | } | ||
262 | } | 256 | } |
263 | break; | 257 | break; |
264 | 258 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..b663c2c95d39 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -44,6 +44,9 @@ | |||
44 | 44 | ||
45 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
46 | 46 | ||
47 | #define CREATE_TRACE_POINTS | ||
48 | #include <trace/events/printk.h> | ||
49 | |||
47 | /* | 50 | /* |
48 | * Architectures can override it: | 51 | * Architectures can override it: |
49 | */ | 52 | */ |
@@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | |||
542 | static void _call_console_drivers(unsigned start, | 545 | static void _call_console_drivers(unsigned start, |
543 | unsigned end, int msg_log_level) | 546 | unsigned end, int msg_log_level) |
544 | { | 547 | { |
548 | trace_console(&LOG_BUF(0), start, end, log_buf_len); | ||
549 | |||
545 | if ((msg_log_level < console_loglevel || ignore_loglevel) && | 550 | if ((msg_log_level < console_loglevel || ignore_loglevel) && |
546 | console_drivers && start != end) { | 551 | console_drivers && start != end) { |
547 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | 552 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { |
@@ -702,6 +707,9 @@ static bool printk_time = 0; | |||
702 | #endif | 707 | #endif |
703 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 708 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
704 | 709 | ||
710 | static bool always_kmsg_dump; | ||
711 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
712 | |||
705 | /* Check if we have any console registered that can be called early in boot. */ | 713 | /* Check if we have any console registered that can be called early in boot. */ |
706 | static int have_callable_console(void) | 714 | static int have_callable_console(void) |
707 | { | 715 | { |
@@ -1208,13 +1216,27 @@ int is_console_locked(void) | |||
1208 | return console_locked; | 1216 | return console_locked; |
1209 | } | 1217 | } |
1210 | 1218 | ||
1219 | /* | ||
1220 | * Delayed printk facility, for scheduler-internal messages: | ||
1221 | */ | ||
1222 | #define PRINTK_BUF_SIZE 512 | ||
1223 | |||
1224 | #define PRINTK_PENDING_WAKEUP 0x01 | ||
1225 | #define PRINTK_PENDING_SCHED 0x02 | ||
1226 | |||
1211 | static DEFINE_PER_CPU(int, printk_pending); | 1227 | static DEFINE_PER_CPU(int, printk_pending); |
1228 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | ||
1212 | 1229 | ||
1213 | void printk_tick(void) | 1230 | void printk_tick(void) |
1214 | { | 1231 | { |
1215 | if (__this_cpu_read(printk_pending)) { | 1232 | if (__this_cpu_read(printk_pending)) { |
1216 | __this_cpu_write(printk_pending, 0); | 1233 | int pending = __this_cpu_xchg(printk_pending, 0); |
1217 | wake_up_interruptible(&log_wait); | 1234 | if (pending & PRINTK_PENDING_SCHED) { |
1235 | char *buf = __get_cpu_var(printk_sched_buf); | ||
1236 | printk(KERN_WARNING "[sched_delayed] %s", buf); | ||
1237 | } | ||
1238 | if (pending & PRINTK_PENDING_WAKEUP) | ||
1239 | wake_up_interruptible(&log_wait); | ||
1218 | } | 1240 | } |
1219 | } | 1241 | } |
1220 | 1242 | ||
@@ -1228,7 +1250,7 @@ int printk_needs_cpu(int cpu) | |||
1228 | void wake_up_klogd(void) | 1250 | void wake_up_klogd(void) |
1229 | { | 1251 | { |
1230 | if (waitqueue_active(&log_wait)) | 1252 | if (waitqueue_active(&log_wait)) |
1231 | this_cpu_write(printk_pending, 1); | 1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1232 | } | 1254 | } |
1233 | 1255 | ||
1234 | /** | 1256 | /** |
@@ -1621,6 +1643,26 @@ late_initcall(printk_late_init); | |||
1621 | 1643 | ||
1622 | #if defined CONFIG_PRINTK | 1644 | #if defined CONFIG_PRINTK |
1623 | 1645 | ||
1646 | int printk_sched(const char *fmt, ...) | ||
1647 | { | ||
1648 | unsigned long flags; | ||
1649 | va_list args; | ||
1650 | char *buf; | ||
1651 | int r; | ||
1652 | |||
1653 | local_irq_save(flags); | ||
1654 | buf = __get_cpu_var(printk_sched_buf); | ||
1655 | |||
1656 | va_start(args, fmt); | ||
1657 | r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); | ||
1658 | va_end(args); | ||
1659 | |||
1660 | __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); | ||
1661 | local_irq_restore(flags); | ||
1662 | |||
1663 | return r; | ||
1664 | } | ||
1665 | |||
1624 | /* | 1666 | /* |
1625 | * printk rate limiting, lifted from the networking subsystem. | 1667 | * printk rate limiting, lifted from the networking subsystem. |
1626 | * | 1668 | * |
@@ -1732,6 +1774,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1732 | unsigned long l1, l2; | 1774 | unsigned long l1, l2; |
1733 | unsigned long flags; | 1775 | unsigned long flags; |
1734 | 1776 | ||
1777 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) | ||
1778 | return; | ||
1779 | |||
1735 | /* Theoretically, the log could move on after we do this, but | 1780 | /* Theoretically, the log could move on after we do this, but |
1736 | there's not a lot we can do about that. The new messages | 1781 | there's not a lot we can do about that. The new messages |
1737 | will overwrite the start of what we dump. */ | 1782 | will overwrite the start of what we dump. */ |
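The printk.c hunk adds printk_sched(): scheduler paths format their message into a per-CPU buffer and set a PRINTK_PENDING_SCHED bit, and printk_tick() later emits it together with any pending klogd wakeup. A single-threaded sketch of that defer-and-flush pattern follows; the flags and buffer here are local stand-ins, not the kernel's per-CPU variables.

    #include <stdarg.h>
    #include <stdio.h>

    #define BUF_SIZE        512
    #define PENDING_WAKEUP  0x01
    #define PENDING_SCHED   0x02

    static int  pending;
    static char sched_buf[BUF_SIZE];

    /* Defer a message: format it into the buffer and mark it pending,
     * instead of printing from a context where that would be unsafe. */
    static int sched_printf(const char *fmt, ...)
    {
        va_list args;
        int r;

        va_start(args, fmt);
        r = vsnprintf(sched_buf, BUF_SIZE, fmt, args);
        va_end(args);

        pending |= PENDING_SCHED;
        return r;
    }

    /* Flush point, analogous to printk_tick(): emit whatever was queued. */
    static void tick(void)
    {
        int p = pending;

        pending = 0;
        if (p & PENDING_SCHED)
            printf("[sched_delayed] %s\n", sched_buf);
        if (p & PENDING_WAKEUP)
            printf("(wake up log readers)\n");
    }

    int main(void)
    {
        sched_printf("cpu %d runqueue imbalance", 3);
        tick();   /* message appears only here, at a safe point */
        return 0;
    }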
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed11..ee8d49b9c309 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
231 | } | 231 | } |
232 | 232 | ||
233 | static int ptrace_attach(struct task_struct *task, long request, | 233 | static int ptrace_attach(struct task_struct *task, long request, |
234 | unsigned long addr, | ||
234 | unsigned long flags) | 235 | unsigned long flags) |
235 | { | 236 | { |
236 | bool seize = (request == PTRACE_SEIZE); | 237 | bool seize = (request == PTRACE_SEIZE); |
237 | int retval; | 238 | int retval; |
238 | 239 | ||
239 | /* | ||
240 | * SEIZE will enable new ptrace behaviors which will be implemented | ||
241 | * gradually. SEIZE_DEVEL is used to prevent applications | ||
242 | * expecting full SEIZE behaviors trapping on kernel commits which | ||
243 | * are still in the process of implementing them. | ||
244 | * | ||
245 | * Only test programs for new ptrace behaviors being implemented | ||
246 | * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. | ||
247 | * | ||
248 | * Once SEIZE behaviors are completely implemented, this flag and | ||
249 | * the following test will be removed. | ||
250 | */ | ||
251 | retval = -EIO; | 240 | retval = -EIO; |
252 | if (seize && !(flags & PTRACE_SEIZE_DEVEL)) | 241 | if (seize) { |
253 | goto out; | 242 | if (addr != 0) |
243 | goto out; | ||
244 | if (flags & ~(unsigned long)PTRACE_O_MASK) | ||
245 | goto out; | ||
246 | flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); | ||
247 | } else { | ||
248 | flags = PT_PTRACED; | ||
249 | } | ||
254 | 250 | ||
255 | audit_ptrace(task); | 251 | audit_ptrace(task); |
256 | 252 | ||
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request, | |||
262 | 258 | ||
263 | /* | 259 | /* |
264 | * Protect exec's credential calculations against our interference; | 260 | * Protect exec's credential calculations against our interference; |
265 | * interference; SUID, SGID and LSM creds get determined differently | 261 | * SUID, SGID and LSM creds get determined differently |
266 | * under ptrace. | 262 | * under ptrace. |
267 | */ | 263 | */ |
268 | retval = -ERESTARTNOINTR; | 264 | retval = -ERESTARTNOINTR; |
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request, | |||
282 | if (task->ptrace) | 278 | if (task->ptrace) |
283 | goto unlock_tasklist; | 279 | goto unlock_tasklist; |
284 | 280 | ||
285 | task->ptrace = PT_PTRACED; | ||
286 | if (seize) | 281 | if (seize) |
287 | task->ptrace |= PT_SEIZED; | 282 | flags |= PT_SEIZED; |
288 | if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) | 283 | if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) |
289 | task->ptrace |= PT_PTRACE_CAP; | 284 | flags |= PT_PTRACE_CAP; |
285 | task->ptrace = flags; | ||
290 | 286 | ||
291 | __ptrace_link(task, current); | 287 | __ptrace_link(task, current); |
292 | 288 | ||
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds | |||
528 | 524 | ||
529 | static int ptrace_setoptions(struct task_struct *child, unsigned long data) | 525 | static int ptrace_setoptions(struct task_struct *child, unsigned long data) |
530 | { | 526 | { |
531 | child->ptrace &= ~PT_TRACE_MASK; | 527 | unsigned flags; |
532 | 528 | ||
533 | if (data & PTRACE_O_TRACESYSGOOD) | 529 | if (data & ~(unsigned long)PTRACE_O_MASK) |
534 | child->ptrace |= PT_TRACESYSGOOD; | 530 | return -EINVAL; |
535 | |||
536 | if (data & PTRACE_O_TRACEFORK) | ||
537 | child->ptrace |= PT_TRACE_FORK; | ||
538 | |||
539 | if (data & PTRACE_O_TRACEVFORK) | ||
540 | child->ptrace |= PT_TRACE_VFORK; | ||
541 | |||
542 | if (data & PTRACE_O_TRACECLONE) | ||
543 | child->ptrace |= PT_TRACE_CLONE; | ||
544 | |||
545 | if (data & PTRACE_O_TRACEEXEC) | ||
546 | child->ptrace |= PT_TRACE_EXEC; | ||
547 | |||
548 | if (data & PTRACE_O_TRACEVFORKDONE) | ||
549 | child->ptrace |= PT_TRACE_VFORK_DONE; | ||
550 | 531 | ||
551 | if (data & PTRACE_O_TRACEEXIT) | 532 | /* Avoid intermediate state when all opts are cleared */ |
552 | child->ptrace |= PT_TRACE_EXIT; | 533 | flags = child->ptrace; |
534 | flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); | ||
535 | flags |= (data << PT_OPT_FLAG_SHIFT); | ||
536 | child->ptrace = flags; | ||
553 | 537 | ||
554 | return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; | 538 | return 0; |
555 | } | 539 | } |
556 | 540 | ||
557 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) | 541 | static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) |
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, | |||
891 | } | 875 | } |
892 | 876 | ||
893 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { | 877 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { |
894 | ret = ptrace_attach(child, request, data); | 878 | ret = ptrace_attach(child, request, addr, data); |
895 | /* | 879 | /* |
896 | * Some architectures need to do book-keeping after | 880 | * Some architectures need to do book-keeping after |
897 | * a ptrace attach. | 881 | * a ptrace attach. |
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
1034 | } | 1018 | } |
1035 | 1019 | ||
1036 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { | 1020 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { |
1037 | ret = ptrace_attach(child, request, data); | 1021 | ret = ptrace_attach(child, request, addr, data); |
1038 | /* | 1022 | /* |
1039 | * Some architectures need to do book-keeping after | 1023 | * Some architectures need to do book-keeping after |
1040 | * a ptrace attach. | 1024 | * a ptrace attach. |
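ptrace_setoptions() now validates data against PTRACE_O_MASK and stores the options as a shifted bitfield inside task->ptrace (PT_OPT_FLAG_SHIFT), writing the combined value in a single assignment to avoid a window with all options cleared. Below is a small sketch of packing option bits into a wider flags word that way; the shift and mask values are illustrative, not the kernel's.

    #include <stdio.h>

    #define OPT_MASK        0x7f   /* illustrative option bits */
    #define OPT_FLAG_SHIFT  3      /* options live above the base flags */

    /* Replace the option field of 'flags' with 'opts', leaving the low
     * base flags untouched; computed in a local and stored in one go. */
    static int set_options(unsigned int *flags, unsigned long opts)
    {
        unsigned int tmp;

        if (opts & ~(unsigned long)OPT_MASK)
            return -1;             /* unknown option bit */

        tmp = *flags;
        tmp &= ~(OPT_MASK << OPT_FLAG_SHIFT);
        tmp |= (unsigned int)opts << OPT_FLAG_SHIFT;
        *flags = tmp;              /* single store, no cleared window */
        return 0;
    }

    int main(void)
    {
        unsigned int flags = 0x1;  /* pretend "traced" base flag */

        set_options(&flags, 0x05);
        printf("flags = %#x\n", flags);
        set_options(&flags, 0x02); /* old options replaced, base kept */
        printf("flags = %#x\n", flags);
        return 0;
    }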
diff --git a/kernel/rcu.h b/kernel/rcu.h index aa88baab5f78..8ba99cdc6515 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -33,8 +33,27 @@ | |||
33 | * Process-level increment to ->dynticks_nesting field. This allows for | 33 | * Process-level increment to ->dynticks_nesting field. This allows for |
34 | * architectures that use half-interrupts and half-exceptions from | 34 | * architectures that use half-interrupts and half-exceptions from |
35 | * process context. | 35 | * process context. |
36 | * | ||
37 | * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH | ||
38 | * that counts the number of process-based reasons why RCU cannot | ||
39 | * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE | ||
40 | * is the value used to increment or decrement this field. | ||
41 | * | ||
42 | * The rest of the bits could in principle be used to count interrupts, | ||
43 | * but this would mean that a negative-one value in the interrupt | ||
44 | * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. | ||
45 | * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK | ||
46 | * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. | ||
47 | * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon | ||
48 | * initial exit from idle. | ||
36 | */ | 49 | */ |
37 | #define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) | 50 | #define DYNTICK_TASK_NEST_WIDTH 7 |
51 | #define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) | ||
52 | #define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) | ||
53 | #define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) | ||
54 | #define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) | ||
55 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ | ||
56 | DYNTICK_TASK_FLAG) | ||
38 | 57 | ||
39 | /* | 58 | /* |
40 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | 59 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally |
@@ -50,7 +69,6 @@ extern struct debug_obj_descr rcuhead_debug_descr; | |||
50 | 69 | ||
51 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 70 | static inline void debug_rcu_head_queue(struct rcu_head *head) |
52 | { | 71 | { |
53 | WARN_ON_ONCE((unsigned long)head & 0x3); | ||
54 | debug_object_activate(head, &rcuhead_debug_descr); | 72 | debug_object_activate(head, &rcuhead_debug_descr); |
55 | debug_object_active_state(head, &rcuhead_debug_descr, | 73 | debug_object_active_state(head, &rcuhead_debug_descr, |
56 | STATE_RCU_HEAD_READY, | 74 | STATE_RCU_HEAD_READY, |
@@ -76,16 +94,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
76 | 94 | ||
77 | extern void kfree(const void *); | 95 | extern void kfree(const void *); |
78 | 96 | ||
79 | static inline void __rcu_reclaim(char *rn, struct rcu_head *head) | 97 | static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) |
80 | { | 98 | { |
81 | unsigned long offset = (unsigned long)head->func; | 99 | unsigned long offset = (unsigned long)head->func; |
82 | 100 | ||
83 | if (__is_kfree_rcu_offset(offset)) { | 101 | if (__is_kfree_rcu_offset(offset)) { |
84 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | 102 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); |
85 | kfree((void *)head - offset); | 103 | kfree((void *)head - offset); |
104 | return 1; | ||
86 | } else { | 105 | } else { |
87 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | 106 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); |
88 | head->func(head); | 107 | head->func(head); |
108 | return 0; | ||
89 | } | 109 | } |
90 | } | 110 | } |
91 | 111 | ||
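The new comment in rcu.h describes a layout where the top DYNTICK_TASK_NEST_WIDTH bits of ->dynticks_nesting count process-level non-idle reasons and a two-bit guard field below them protects against interrupt miscounting. The snippet below evaluates the same macro expressions and prints the resulting bit patterns so the layout can be checked by eye; it is a verification aid, not kernel code.

    #include <limits.h>
    #include <stdio.h>

    /* Same expressions as kernel/rcu.h after this patch. */
    #define DYNTICK_TASK_NEST_WIDTH 7
    #define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
    #define DYNTICK_TASK_NEST_MASK  (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
    #define DYNTICK_TASK_FLAG       ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
    #define DYNTICK_TASK_MASK       ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
    #define DYNTICK_TASK_EXIT_IDLE  (DYNTICK_TASK_NEST_VALUE + DYNTICK_TASK_FLAG)

    int main(void)
    {
        /* With 64-bit long long these print:
         * NEST_VALUE 0x0100000000000000  (one count in the 7-bit nest field)
         * NEST_MASK  0x7f00000000000000  (the nest field itself)
         * FLAG       0x0040000000000000  (guard bit set on first exit from idle)
         * MASK       0x0060000000000000  (two-bit guard field)
         * EXIT_IDLE  0x0140000000000000  (nest count 1 plus guard flag)
         */
        printf("NEST_VALUE %#018llx\n", (unsigned long long)DYNTICK_TASK_NEST_VALUE);
        printf("NEST_MASK  %#018llx\n", (unsigned long long)DYNTICK_TASK_NEST_MASK);
        printf("FLAG       %#018llx\n", (unsigned long long)DYNTICK_TASK_FLAG);
        printf("MASK       %#018llx\n", (unsigned long long)DYNTICK_TASK_MASK);
        printf("EXIT_IDLE  %#018llx\n", (unsigned long long)DYNTICK_TASK_EXIT_IDLE);

        /* One more task-level non-idle reason moves only the nest field: */
        long long nesting = DYNTICK_TASK_EXIT_IDLE;
        nesting += DYNTICK_TASK_NEST_VALUE;
        printf("nested     %#018llx\n", (unsigned long long)nesting);
        return 0;
    }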
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2bc4e135ff23..a86f1741cc27 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | |||
88 | * section. | 88 | * section. |
89 | * | 89 | * |
90 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. | 90 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. |
91 | * | ||
92 | * Note that rcu_read_lock() is disallowed if the CPU is either idle or | ||
93 | * offline from an RCU perspective, so check for those as well. | ||
91 | */ | 94 | */ |
92 | int rcu_read_lock_bh_held(void) | 95 | int rcu_read_lock_bh_held(void) |
93 | { | 96 | { |
@@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void) | |||
95 | return 1; | 98 | return 1; |
96 | if (rcu_is_cpu_idle()) | 99 | if (rcu_is_cpu_idle()) |
97 | return 0; | 100 | return 0; |
101 | if (!rcu_lockdep_current_cpu_online()) | ||
102 | return 0; | ||
98 | return in_softirq() || irqs_disabled(); | 103 | return in_softirq() || irqs_disabled(); |
99 | } | 104 | } |
100 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 105 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 977296dca0a4..37a5444204d2 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head, | |||
53 | 53 | ||
54 | #include "rcutiny_plugin.h" | 54 | #include "rcutiny_plugin.h" |
55 | 55 | ||
56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; | 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
57 | 57 | ||
58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
59 | static void rcu_idle_enter_common(long long oldval) | 59 | static void rcu_idle_enter_common(long long oldval) |
@@ -88,10 +88,16 @@ void rcu_idle_enter(void) | |||
88 | 88 | ||
89 | local_irq_save(flags); | 89 | local_irq_save(flags); |
90 | oldval = rcu_dynticks_nesting; | 90 | oldval = rcu_dynticks_nesting; |
91 | rcu_dynticks_nesting = 0; | 91 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); |
92 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == | ||
93 | DYNTICK_TASK_NEST_VALUE) | ||
94 | rcu_dynticks_nesting = 0; | ||
95 | else | ||
96 | rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
92 | rcu_idle_enter_common(oldval); | 97 | rcu_idle_enter_common(oldval); |
93 | local_irq_restore(flags); | 98 | local_irq_restore(flags); |
94 | } | 99 | } |
100 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | ||
95 | 101 | ||
96 | /* | 102 | /* |
97 | * Exit an interrupt handler towards idle. | 103 | * Exit an interrupt handler towards idle. |
@@ -140,11 +146,15 @@ void rcu_idle_exit(void) | |||
140 | 146 | ||
141 | local_irq_save(flags); | 147 | local_irq_save(flags); |
142 | oldval = rcu_dynticks_nesting; | 148 | oldval = rcu_dynticks_nesting; |
143 | WARN_ON_ONCE(oldval != 0); | 149 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); |
144 | rcu_dynticks_nesting = DYNTICK_TASK_NESTING; | 150 | if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) |
151 | rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
152 | else | ||
153 | rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
145 | rcu_idle_exit_common(oldval); | 154 | rcu_idle_exit_common(oldval); |
146 | local_irq_restore(flags); | 155 | local_irq_restore(flags); |
147 | } | 156 | } |
157 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | ||
148 | 158 | ||
149 | /* | 159 | /* |
150 | * Enter an interrupt handler, moving away from idle. | 160 | * Enter an interrupt handler, moving away from idle. |
@@ -258,7 +268,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
258 | 268 | ||
259 | /* If no RCU callbacks ready to invoke, just return. */ | 269 | /* If no RCU callbacks ready to invoke, just return. */ |
260 | if (&rcp->rcucblist == rcp->donetail) { | 270 | if (&rcp->rcucblist == rcp->donetail) { |
261 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | 271 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); |
262 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | 272 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, |
263 | ACCESS_ONCE(rcp->rcucblist), | 273 | ACCESS_ONCE(rcp->rcucblist), |
264 | need_resched(), | 274 | need_resched(), |
@@ -269,7 +279,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
269 | 279 | ||
270 | /* Move the ready-to-invoke callbacks to a local list. */ | 280 | /* Move the ready-to-invoke callbacks to a local list. */ |
271 | local_irq_save(flags); | 281 | local_irq_save(flags); |
272 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | 282 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); |
273 | list = rcp->rcucblist; | 283 | list = rcp->rcucblist; |
274 | rcp->rcucblist = *rcp->donetail; | 284 | rcp->rcucblist = *rcp->donetail; |
275 | *rcp->donetail = NULL; | 285 | *rcp->donetail = NULL; |
@@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
319 | */ | 329 | */ |
320 | void synchronize_sched(void) | 330 | void synchronize_sched(void) |
321 | { | 331 | { |
332 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
333 | !lock_is_held(&rcu_lock_map) && | ||
334 | !lock_is_held(&rcu_sched_lock_map), | ||
335 | "Illegal synchronize_sched() in RCU read-side critical section"); | ||
322 | cond_resched(); | 336 | cond_resched(); |
323 | } | 337 | } |
324 | EXPORT_SYMBOL_GPL(synchronize_sched); | 338 | EXPORT_SYMBOL_GPL(synchronize_sched); |
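The reworked rcu_idle_enter()/rcu_idle_exit() pair (and the rcutree.c versions later in this diff) treat the upper bits of the dynticks nesting counter as a nest count, so that idle entry and exit may themselves nest instead of the counter having to be exactly zero or a single magic value. The arithmetic in isolation, using illustrative constants rather than the real DYNTICK_TASK_* definitions:

	#include <stdio.h>

	/* Illustrative stand-ins: low bits count irq nesting, high bits count
	 * task-level "not idle" nesting (cf. DYNTICK_TASK_NEST_VALUE/_MASK). */
	#define NEST_VALUE	(1LL << 32)
	#define NEST_MASK	(~(NEST_VALUE - 1))
	#define EXIT_IDLE	NEST_VALUE		/* one task-level nest, no irqs */

	static long long nesting = EXIT_IDLE;

	static void idle_enter(void)
	{
		if ((nesting & NEST_MASK) == NEST_VALUE)
			nesting = 0;			/* outermost: really go idle */
		else
			nesting -= NEST_VALUE;		/* just pop one nest level   */
	}

	static void idle_exit(void)
	{
		if (nesting & NEST_MASK)
			nesting += NEST_VALUE;		/* already non-idle: push a level */
		else
			nesting = EXIT_IDLE;		/* genuinely leaving idle         */
	}

	int main(void)
	{
		idle_exit();				/* nested exit...            */
		idle_enter();				/* ...and its matching enter */
		idle_enter();				/* outermost enter           */
		printf("counter=%lld (0 means idle)\n", nesting);
		return 0;
	}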
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 9cb1ae4aabdd..22ecea0dfb62 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | |||
132 | RCU_TRACE(.rcb.name = "rcu_preempt") | 132 | RCU_TRACE(.rcb.name = "rcu_preempt") |
133 | }; | 133 | }; |
134 | 134 | ||
135 | static void rcu_read_unlock_special(struct task_struct *t); | ||
135 | static int rcu_preempted_readers_exp(void); | 136 | static int rcu_preempted_readers_exp(void); |
136 | static void rcu_report_exp_done(void); | 137 | static void rcu_report_exp_done(void); |
137 | 138 | ||
@@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void) | |||
146 | /* | 147 | /* |
147 | * Check for a running RCU reader. Because there is only one CPU, | 148 | * Check for a running RCU reader. Because there is only one CPU, |
148 | * there can be but one running RCU reader at a time. ;-) | 149 | * there can be but one running RCU reader at a time. ;-) |
150 | * | ||
151 | * Returns zero if there are no running readers. Returns a positive | ||
152 | * number if there is at least one reader within its RCU read-side | ||
153 | * critical section. Returns a negative number if an outermost reader | ||
154 | * is in the midst of exiting from its RCU read-side critical section. | ||
149 | */ | 160 | */ |
150 | static int rcu_preempt_running_reader(void) | 161 | static int rcu_preempt_running_reader(void) |
151 | { | 162 | { |
@@ -307,7 +318,6 @@ static int rcu_boost(void) | |||
307 | t = container_of(tb, struct task_struct, rcu_node_entry); | 318 | t = container_of(tb, struct task_struct, rcu_node_entry); |
308 | rt_mutex_init_proxy_locked(&mtx, t); | 319 | rt_mutex_init_proxy_locked(&mtx, t); |
309 | t->rcu_boost_mutex = &mtx; | 320 | t->rcu_boost_mutex = &mtx; |
310 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | ||
311 | raw_local_irq_restore(flags); | 321 | raw_local_irq_restore(flags); |
312 | rt_mutex_lock(&mtx); | 322 | rt_mutex_lock(&mtx); |
313 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 323 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
@@ -475,7 +485,7 @@ void rcu_preempt_note_context_switch(void) | |||
475 | unsigned long flags; | 485 | unsigned long flags; |
476 | 486 | ||
477 | local_irq_save(flags); /* must exclude scheduler_tick(). */ | 487 | local_irq_save(flags); /* must exclude scheduler_tick(). */ |
478 | if (rcu_preempt_running_reader() && | 488 | if (rcu_preempt_running_reader() > 0 && |
479 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 489 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
480 | 490 | ||
481 | /* Possibly blocking in an RCU read-side critical section. */ | 491 | /* Possibly blocking in an RCU read-side critical section. */ |
@@ -494,6 +504,13 @@ void rcu_preempt_note_context_switch(void) | |||
494 | list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); | 504 | list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); |
495 | if (rcu_cpu_blocking_cur_gp()) | 505 | if (rcu_cpu_blocking_cur_gp()) |
496 | rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; | 506 | rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; |
507 | } else if (rcu_preempt_running_reader() < 0 && | ||
508 | t->rcu_read_unlock_special) { | ||
509 | /* | ||
510 | * Complete exit from RCU read-side critical section on | ||
511 | * behalf of preempted instance of __rcu_read_unlock(). | ||
512 | */ | ||
513 | rcu_read_unlock_special(t); | ||
497 | } | 514 | } |
498 | 515 | ||
499 | /* | 516 | /* |
@@ -526,12 +543,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); | |||
526 | * notify RCU core processing or task having blocked during the RCU | 543 | * notify RCU core processing or task having blocked during the RCU |
527 | * read-side critical section. | 544 | * read-side critical section. |
528 | */ | 545 | */ |
529 | static void rcu_read_unlock_special(struct task_struct *t) | 546 | static noinline void rcu_read_unlock_special(struct task_struct *t) |
530 | { | 547 | { |
531 | int empty; | 548 | int empty; |
532 | int empty_exp; | 549 | int empty_exp; |
533 | unsigned long flags; | 550 | unsigned long flags; |
534 | struct list_head *np; | 551 | struct list_head *np; |
552 | #ifdef CONFIG_RCU_BOOST | ||
553 | struct rt_mutex *rbmp = NULL; | ||
554 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
535 | int special; | 555 | int special; |
536 | 556 | ||
537 | /* | 557 | /* |
@@ -552,7 +572,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
552 | rcu_preempt_cpu_qs(); | 572 | rcu_preempt_cpu_qs(); |
553 | 573 | ||
554 | /* Hardware IRQ handlers cannot block. */ | 574 | /* Hardware IRQ handlers cannot block. */ |
555 | if (in_irq()) { | 575 | if (in_irq() || in_serving_softirq()) { |
556 | local_irq_restore(flags); | 576 | local_irq_restore(flags); |
557 | return; | 577 | return; |
558 | } | 578 | } |
@@ -597,10 +617,10 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
597 | } | 617 | } |
598 | #ifdef CONFIG_RCU_BOOST | 618 | #ifdef CONFIG_RCU_BOOST |
599 | /* Unboost self if was boosted. */ | 619 | /* Unboost self if was boosted. */ |
600 | if (special & RCU_READ_UNLOCK_BOOSTED) { | 620 | if (t->rcu_boost_mutex != NULL) { |
601 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | 621 | rbmp = t->rcu_boost_mutex; |
602 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
603 | t->rcu_boost_mutex = NULL; | 622 | t->rcu_boost_mutex = NULL; |
623 | rt_mutex_unlock(rbmp); | ||
604 | } | 624 | } |
605 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 625 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
606 | local_irq_restore(flags); | 626 | local_irq_restore(flags); |
@@ -618,13 +638,22 @@ void __rcu_read_unlock(void) | |||
618 | struct task_struct *t = current; | 638 | struct task_struct *t = current; |
619 | 639 | ||
620 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ | 640 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ |
621 | --t->rcu_read_lock_nesting; | 641 | if (t->rcu_read_lock_nesting != 1) |
622 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | 642 | --t->rcu_read_lock_nesting; |
623 | if (t->rcu_read_lock_nesting == 0 && | 643 | else { |
624 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 644 | t->rcu_read_lock_nesting = INT_MIN; |
625 | rcu_read_unlock_special(t); | 645 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
646 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
647 | rcu_read_unlock_special(t); | ||
648 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
649 | t->rcu_read_lock_nesting = 0; | ||
650 | } | ||
626 | #ifdef CONFIG_PROVE_LOCKING | 651 | #ifdef CONFIG_PROVE_LOCKING |
627 | WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); | 652 | { |
653 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
654 | |||
655 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
656 | } | ||
628 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | 657 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ |
629 | } | 658 | } |
630 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 659 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
@@ -649,7 +678,7 @@ static void rcu_preempt_check_callbacks(void) | |||
649 | invoke_rcu_callbacks(); | 678 | invoke_rcu_callbacks(); |
650 | if (rcu_preempt_gp_in_progress() && | 679 | if (rcu_preempt_gp_in_progress() && |
651 | rcu_cpu_blocking_cur_gp() && | 680 | rcu_cpu_blocking_cur_gp() && |
652 | rcu_preempt_running_reader()) | 681 | rcu_preempt_running_reader() > 0) |
653 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | 682 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; |
654 | } | 683 | } |
655 | 684 | ||
@@ -706,6 +735,11 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
706 | */ | 735 | */ |
707 | void synchronize_rcu(void) | 736 | void synchronize_rcu(void) |
708 | { | 737 | { |
738 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
739 | !lock_is_held(&rcu_lock_map) && | ||
740 | !lock_is_held(&rcu_sched_lock_map), | ||
741 | "Illegal synchronize_rcu() in RCU read-side critical section"); | ||
742 | |||
709 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 743 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
710 | if (!rcu_scheduler_active) | 744 | if (!rcu_scheduler_active) |
711 | return; | 745 | return; |
@@ -882,7 +916,8 @@ static void rcu_preempt_process_callbacks(void) | |||
882 | static void invoke_rcu_callbacks(void) | 916 | static void invoke_rcu_callbacks(void) |
883 | { | 917 | { |
884 | have_rcu_kthread_work = 1; | 918 | have_rcu_kthread_work = 1; |
885 | wake_up(&rcu_kthread_wq); | 919 | if (rcu_kthread_task != NULL) |
920 | wake_up(&rcu_kthread_wq); | ||
886 | } | 921 | } |
887 | 922 | ||
888 | #ifdef CONFIG_RCU_TRACE | 923 | #ifdef CONFIG_RCU_TRACE |
@@ -943,12 +978,16 @@ early_initcall(rcu_spawn_kthreads); | |||
943 | 978 | ||
944 | #else /* #ifdef CONFIG_RCU_BOOST */ | 979 | #else /* #ifdef CONFIG_RCU_BOOST */ |
945 | 980 | ||
981 | /* Hold off callback invocation until early_initcall() time. */ | ||
982 | static int rcu_scheduler_fully_active __read_mostly; | ||
983 | |||
946 | /* | 984 | /* |
947 | * Start up softirq processing of callbacks. | 985 | * Start up softirq processing of callbacks. |
948 | */ | 986 | */ |
949 | void invoke_rcu_callbacks(void) | 987 | void invoke_rcu_callbacks(void) |
950 | { | 988 | { |
951 | raise_softirq(RCU_SOFTIRQ); | 989 | if (rcu_scheduler_fully_active) |
990 | raise_softirq(RCU_SOFTIRQ); | ||
952 | } | 991 | } |
953 | 992 | ||
954 | #ifdef CONFIG_RCU_TRACE | 993 | #ifdef CONFIG_RCU_TRACE |
@@ -963,10 +1002,14 @@ static bool rcu_is_callbacks_kthread(void) | |||
963 | 1002 | ||
964 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 1003 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
965 | 1004 | ||
966 | void rcu_init(void) | 1005 | static int __init rcu_scheduler_really_started(void) |
967 | { | 1006 | { |
1007 | rcu_scheduler_fully_active = 1; | ||
968 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 1008 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
1009 | raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */ | ||
1010 | return 0; | ||
969 | } | 1011 | } |
1012 | early_initcall(rcu_scheduler_really_started); | ||
970 | 1013 | ||
971 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | 1014 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ |
972 | 1015 | ||
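The __rcu_read_unlock() change above is subtle: before the outermost unlock runs its special processing, ->rcu_read_lock_nesting is parked at INT_MIN, so an interrupt or context switch that re-enters the unlock path sees a large negative nesting value rather than zero and stays out of the way; the counter returns to zero only after the deferred work is done. This is also why the PROVE_LOCKING check now tolerates values near INT_MIN. A stripped-down user-space model of the control flow follows; every name is a placeholder, and the kernel's barrier() calls, which make the ordering stick, are omitted.

	#include <limits.h>
	#include <stdio.h>

	static int nesting;		/* stand-in for ->rcu_read_lock_nesting */
	static int special;		/* deferred unlock-time work pending?   */

	static void read_lock(void)
	{
		++nesting;
	}

	static void unlock_special(void)
	{
		printf("deferred work runs with nesting=%d\n", nesting);
		special = 0;
	}

	static void read_unlock(void)
	{
		if (nesting != 1) {
			--nesting;		/* inner unlock: nothing special     */
		} else {
			nesting = INT_MIN;	/* fence off re-entrant unlock paths */
			if (special)
				unlock_special();
			nesting = 0;		/* now really outside the reader     */
		}
	}

	int main(void)
	{
		read_lock();
		read_lock();
		special = 1;
		read_unlock();			/* inner: just decrements            */
		read_unlock();			/* outermost: runs the deferred work */
		printf("final nesting=%d\n", nesting);
		return 0;
	}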
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a58ac285fc69..a89b381a8c6e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -65,7 +65,10 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | |||
65 | static int fqs_holdoff; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | 67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ |
68 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | ||
68 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | 69 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ |
70 | static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ | ||
71 | static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ | ||
69 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | 72 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ |
70 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | 73 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ |
71 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | 74 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ |
@@ -95,8 +98,14 @@ module_param(fqs_stutter, int, 0444); | |||
95 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 98 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
96 | module_param(onoff_interval, int, 0444); | 99 | module_param(onoff_interval, int, 0444); |
97 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 100 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); |
101 | module_param(onoff_holdoff, int, 0444); | ||
102 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); | ||
98 | module_param(shutdown_secs, int, 0444); | 103 | module_param(shutdown_secs, int, 0444); |
99 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | 104 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); |
105 | module_param(stall_cpu, int, 0444); | ||
106 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); | ||
107 | module_param(stall_cpu_holdoff, int, 0444); | ||
108 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); | ||
100 | module_param(test_boost, int, 0444); | 109 | module_param(test_boost, int, 0444); |
101 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 110 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
102 | module_param(test_boost_interval, int, 0444); | 111 | module_param(test_boost_interval, int, 0444); |
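Taken together, the new knobs let a tester hold off hotplug torture until the system has settled and provoke a stall warning on demand; for example (hypothetical values), loading rcutorture with onoff_interval=3, onoff_holdoff=30, stall_cpu=30 and stall_cpu_holdoff=60 would begin CPU hotplug operations about 30 seconds after boot and pin a CPU inside an RCU reader for 30 seconds roughly a minute after the module starts.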
@@ -129,6 +138,7 @@ static struct task_struct *shutdown_task; | |||
129 | #ifdef CONFIG_HOTPLUG_CPU | 138 | #ifdef CONFIG_HOTPLUG_CPU |
130 | static struct task_struct *onoff_task; | 139 | static struct task_struct *onoff_task; |
131 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 140 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
141 | static struct task_struct *stall_task; | ||
132 | 142 | ||
133 | #define RCU_TORTURE_PIPE_LEN 10 | 143 | #define RCU_TORTURE_PIPE_LEN 10 |
134 | 144 | ||
@@ -990,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused) | |||
990 | rcu_read_lock_bh_held() || | 1000 | rcu_read_lock_bh_held() || |
991 | rcu_read_lock_sched_held() || | 1001 | rcu_read_lock_sched_held() || |
992 | srcu_read_lock_held(&srcu_ctl)); | 1002 | srcu_read_lock_held(&srcu_ctl)); |
993 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
994 | if (p == NULL) { | 1003 | if (p == NULL) { |
995 | /* Leave because rcu_torture_writer is not yet underway */ | 1004 | /* Leave because rcu_torture_writer is not yet underway */ |
996 | cur_ops->readunlock(idx); | 1005 | cur_ops->readunlock(idx); |
997 | return; | 1006 | return; |
998 | } | 1007 | } |
1008 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
999 | if (p->rtort_mbtest == 0) | 1009 | if (p->rtort_mbtest == 0) |
1000 | atomic_inc(&n_rcu_torture_mberror); | 1010 | atomic_inc(&n_rcu_torture_mberror); |
1001 | spin_lock(&rand_lock); | 1011 | spin_lock(&rand_lock); |
@@ -1053,13 +1063,13 @@ rcu_torture_reader(void *arg) | |||
1053 | rcu_read_lock_bh_held() || | 1063 | rcu_read_lock_bh_held() || |
1054 | rcu_read_lock_sched_held() || | 1064 | rcu_read_lock_sched_held() || |
1055 | srcu_read_lock_held(&srcu_ctl)); | 1065 | srcu_read_lock_held(&srcu_ctl)); |
1056 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
1057 | if (p == NULL) { | 1066 | if (p == NULL) { |
1058 | /* Wait for rcu_torture_writer to get underway */ | 1067 | /* Wait for rcu_torture_writer to get underway */ |
1059 | cur_ops->readunlock(idx); | 1068 | cur_ops->readunlock(idx); |
1060 | schedule_timeout_interruptible(HZ); | 1069 | schedule_timeout_interruptible(HZ); |
1061 | continue; | 1070 | continue; |
1062 | } | 1071 | } |
1072 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
1063 | if (p->rtort_mbtest == 0) | 1073 | if (p->rtort_mbtest == 0) |
1064 | atomic_inc(&n_rcu_torture_mberror); | 1074 | atomic_inc(&n_rcu_torture_mberror); |
1065 | cur_ops->read_delay(&rand); | 1075 | cur_ops->read_delay(&rand); |
@@ -1300,13 +1310,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | |||
1300 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1310 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1301 | "test_boost=%d/%d test_boost_interval=%d " | 1311 | "test_boost=%d/%d test_boost_interval=%d " |
1302 | "test_boost_duration=%d shutdown_secs=%d " | 1312 | "test_boost_duration=%d shutdown_secs=%d " |
1303 | "onoff_interval=%d\n", | 1313 | "onoff_interval=%d onoff_holdoff=%d\n", |
1304 | torture_type, tag, nrealreaders, nfakewriters, | 1314 | torture_type, tag, nrealreaders, nfakewriters, |
1305 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1315 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1306 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1316 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1307 | test_boost, cur_ops->can_boost, | 1317 | test_boost, cur_ops->can_boost, |
1308 | test_boost_interval, test_boost_duration, shutdown_secs, | 1318 | test_boost_interval, test_boost_duration, shutdown_secs, |
1309 | onoff_interval); | 1319 | onoff_interval, onoff_holdoff); |
1310 | } | 1320 | } |
1311 | 1321 | ||
1312 | static struct notifier_block rcutorture_shutdown_nb = { | 1322 | static struct notifier_block rcutorture_shutdown_nb = { |
@@ -1410,6 +1420,11 @@ rcu_torture_onoff(void *arg) | |||
1410 | for_each_online_cpu(cpu) | 1420 | for_each_online_cpu(cpu) |
1411 | maxcpu = cpu; | 1421 | maxcpu = cpu; |
1412 | WARN_ON(maxcpu < 0); | 1422 | WARN_ON(maxcpu < 0); |
1423 | if (onoff_holdoff > 0) { | ||
1424 | VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); | ||
1425 | schedule_timeout_interruptible(onoff_holdoff * HZ); | ||
1426 | VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); | ||
1427 | } | ||
1413 | while (!kthread_should_stop()) { | 1428 | while (!kthread_should_stop()) { |
1414 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); | 1429 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); |
1415 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | 1430 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { |
@@ -1450,12 +1465,15 @@ rcu_torture_onoff(void *arg) | |||
1450 | static int __cpuinit | 1465 | static int __cpuinit |
1451 | rcu_torture_onoff_init(void) | 1466 | rcu_torture_onoff_init(void) |
1452 | { | 1467 | { |
1468 | int ret; | ||
1469 | |||
1453 | if (onoff_interval <= 0) | 1470 | if (onoff_interval <= 0) |
1454 | return 0; | 1471 | return 0; |
1455 | onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); | 1472 | onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); |
1456 | if (IS_ERR(onoff_task)) { | 1473 | if (IS_ERR(onoff_task)) { |
1474 | ret = PTR_ERR(onoff_task); | ||
1457 | onoff_task = NULL; | 1475 | onoff_task = NULL; |
1458 | return PTR_ERR(onoff_task); | 1476 | return ret; |
1459 | } | 1477 | } |
1460 | return 0; | 1478 | return 0; |
1461 | } | 1479 | } |
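The two-line fix to rcu_torture_onoff_init() above is worth spelling out: the old code set onoff_task to NULL first and only then applied PTR_ERR() to it, so the error path silently returned 0. The corrected order, capture the errno before clearing the pointer, is shown in the following kernel-module-style sketch; the thread function, task pointer and names are placeholders, not rcutorture internals.

	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/sched.h>

	static struct task_struct *my_task;

	static int my_thread_fn(void *unused)
	{
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	static int my_thread_init(void)
	{
		int ret;

		my_task = kthread_run(my_thread_fn, NULL, "my_thread");
		if (IS_ERR(my_task)) {
			ret = PTR_ERR(my_task);	/* capture the errno first...      */
			my_task = NULL;		/* ...then clear the stale pointer */
			return ret;
		}
		return 0;
	}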
@@ -1481,6 +1499,63 @@ static void rcu_torture_onoff_cleanup(void) | |||
1481 | 1499 | ||
1482 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | 1500 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ |
1483 | 1501 | ||
1502 | /* | ||
1503 | * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then | ||
1504 | * induces a CPU stall for the time specified by stall_cpu. | ||
1505 | */ | ||
1506 | static int __cpuinit rcu_torture_stall(void *args) | ||
1507 | { | ||
1508 | unsigned long stop_at; | ||
1509 | |||
1510 | VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); | ||
1511 | if (stall_cpu_holdoff > 0) { | ||
1512 | VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); | ||
1513 | schedule_timeout_interruptible(stall_cpu_holdoff * HZ); | ||
1514 | VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); | ||
1515 | } | ||
1516 | if (!kthread_should_stop()) { | ||
1517 | stop_at = get_seconds() + stall_cpu; | ||
1518 | /* RCU CPU stall is expected behavior in following code. */ | ||
1519 | printk(KERN_ALERT "rcu_torture_stall start.\n"); | ||
1520 | rcu_read_lock(); | ||
1521 | preempt_disable(); | ||
1522 | while (ULONG_CMP_LT(get_seconds(), stop_at)) | ||
1523 | continue; /* Induce RCU CPU stall warning. */ | ||
1524 | preempt_enable(); | ||
1525 | rcu_read_unlock(); | ||
1526 | printk(KERN_ALERT "rcu_torture_stall end.\n"); | ||
1527 | } | ||
1528 | rcutorture_shutdown_absorb("rcu_torture_stall"); | ||
1529 | while (!kthread_should_stop()) | ||
1530 | schedule_timeout_interruptible(10 * HZ); | ||
1531 | return 0; | ||
1532 | } | ||
1533 | |||
1534 | /* Spawn CPU-stall kthread, if stall_cpu specified. */ | ||
1535 | static int __init rcu_torture_stall_init(void) | ||
1536 | { | ||
1537 | int ret; | ||
1538 | |||
1539 | if (stall_cpu <= 0) | ||
1540 | return 0; | ||
1541 | stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); | ||
1542 | if (IS_ERR(stall_task)) { | ||
1543 | ret = PTR_ERR(stall_task); | ||
1544 | stall_task = NULL; | ||
1545 | return ret; | ||
1546 | } | ||
1547 | return 0; | ||
1548 | } | ||
1549 | |||
1550 | /* Clean up after the CPU-stall kthread, if one was spawned. */ | ||
1551 | static void rcu_torture_stall_cleanup(void) | ||
1552 | { | ||
1553 | if (stall_task == NULL) | ||
1554 | return; | ||
1555 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | ||
1556 | kthread_stop(stall_task); | ||
1557 | } | ||
1558 | |||
1484 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1559 | static int rcutorture_cpu_notify(struct notifier_block *self, |
1485 | unsigned long action, void *hcpu) | 1560 | unsigned long action, void *hcpu) |
1486 | { | 1561 | { |
@@ -1523,6 +1598,7 @@ rcu_torture_cleanup(void) | |||
1523 | fullstop = FULLSTOP_RMMOD; | 1598 | fullstop = FULLSTOP_RMMOD; |
1524 | mutex_unlock(&fullstop_mutex); | 1599 | mutex_unlock(&fullstop_mutex); |
1525 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1600 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1601 | rcu_torture_stall_cleanup(); | ||
1526 | if (stutter_task) { | 1602 | if (stutter_task) { |
1527 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1603 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
1528 | kthread_stop(stutter_task); | 1604 | kthread_stop(stutter_task); |
@@ -1602,6 +1678,10 @@ rcu_torture_cleanup(void) | |||
1602 | cur_ops->cleanup(); | 1678 | cur_ops->cleanup(); |
1603 | if (atomic_read(&n_rcu_torture_error)) | 1679 | if (atomic_read(&n_rcu_torture_error)) |
1604 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1680 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1681 | else if (n_online_successes != n_online_attempts || | ||
1682 | n_offline_successes != n_offline_attempts) | ||
1683 | rcu_torture_print_module_parms(cur_ops, | ||
1684 | "End of test: RCU_HOTPLUG"); | ||
1605 | else | 1685 | else |
1606 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1686 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1607 | } | 1687 | } |
@@ -1819,6 +1899,7 @@ rcu_torture_init(void) | |||
1819 | } | 1899 | } |
1820 | rcu_torture_onoff_init(); | 1900 | rcu_torture_onoff_init(); |
1821 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1901 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1902 | rcu_torture_stall_init(); | ||
1822 | rcutorture_record_test_transition(); | 1903 | rcutorture_record_test_transition(); |
1823 | mutex_unlock(&fullstop_mutex); | 1904 | mutex_unlock(&fullstop_mutex); |
1824 | return 0; | 1905 | return 0; |
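The torture kthreads added or extended in this file share a common shape: an optional holdoff, a main phase bounded by kthread_should_stop(), then cooperation with shutdown or module unload before parking. A condensed sketch of that skeleton is below; my_holdoff, do_work() and absorb_shutdown() are placeholders (the last standing in for rcutorture_shutdown_absorb()), and the main phase may loop or run once depending on the kthread.

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static int my_holdoff = 10;		/* placeholder module parameter (s) */

	static void do_work(void) { }		/* placeholder torture payload       */
	static void absorb_shutdown(void) { }	/* placeholder shutdown cooperation  */

	static int my_torture_thread(void *unused)
	{
		/* Optional holdoff before doing anything disruptive. */
		if (my_holdoff > 0)
			schedule_timeout_interruptible(my_holdoff * HZ);

		/* Main phase; the stall kthread does this once, onoff loops. */
		if (!kthread_should_stop())
			do_work();

		/* Cooperate with shutdown/rmmod, then park until kthread_stop(). */
		absorb_shutdown();
		while (!kthread_should_stop())
			schedule_timeout_interruptible(10 * HZ);
		return 0;
	}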
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6c4a6722abfd..1050d6d3922c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -50,6 +50,8 @@ | |||
50 | #include <linux/wait.h> | 50 | #include <linux/wait.h> |
51 | #include <linux/kthread.h> | 51 | #include <linux/kthread.h> |
52 | #include <linux/prefetch.h> | 52 | #include <linux/prefetch.h> |
53 | #include <linux/delay.h> | ||
54 | #include <linux/stop_machine.h> | ||
53 | 55 | ||
54 | #include "rcutree.h" | 56 | #include "rcutree.h" |
55 | #include <trace/events/rcu.h> | 57 | #include <trace/events/rcu.h> |
@@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu) | |||
196 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 198 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
197 | 199 | ||
198 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 200 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
199 | .dynticks_nesting = DYNTICK_TASK_NESTING, | 201 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
200 | .dynticks = ATOMIC_INIT(1), | 202 | .dynticks = ATOMIC_INIT(1), |
201 | }; | 203 | }; |
202 | 204 | ||
@@ -208,8 +210,11 @@ module_param(blimit, int, 0); | |||
208 | module_param(qhimark, int, 0); | 210 | module_param(qhimark, int, 0); |
209 | module_param(qlowmark, int, 0); | 211 | module_param(qlowmark, int, 0); |
210 | 212 | ||
211 | int rcu_cpu_stall_suppress __read_mostly; | 213 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
214 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | ||
215 | |||
212 | module_param(rcu_cpu_stall_suppress, int, 0644); | 216 | module_param(rcu_cpu_stall_suppress, int, 0644); |
217 | module_param(rcu_cpu_stall_timeout, int, 0644); | ||
213 | 218 | ||
214 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 219 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
215 | static int rcu_pending(int cpu); | 220 | static int rcu_pending(int cpu); |
@@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
301 | return &rsp->node[0]; | 306 | return &rsp->node[0]; |
302 | } | 307 | } |
303 | 308 | ||
304 | #ifdef CONFIG_SMP | ||
305 | |||
306 | /* | 309 | /* |
307 | * If the specified CPU is offline, tell the caller that it is in | 310 | * If the specified CPU is offline, tell the caller that it is in |
308 | * a quiescent state. Otherwise, whack it with a reschedule IPI. | 311 | * a quiescent state. Otherwise, whack it with a reschedule IPI. |
@@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
317 | static int rcu_implicit_offline_qs(struct rcu_data *rdp) | 320 | static int rcu_implicit_offline_qs(struct rcu_data *rdp) |
318 | { | 321 | { |
319 | /* | 322 | /* |
320 | * If the CPU is offline, it is in a quiescent state. We can | 323 | * If the CPU is offline for more than a jiffy, it is in a quiescent |
321 | * trust its state not to change because interrupts are disabled. | 324 | * state. We can trust its state not to change because interrupts |
325 | * are disabled. The reason for the jiffy's worth of slack is to | ||
326 | * handle CPUs initializing on the way up and finding their way | ||
327 | * to the idle loop on the way down. | ||
322 | */ | 328 | */ |
323 | if (cpu_is_offline(rdp->cpu)) { | 329 | if (cpu_is_offline(rdp->cpu) && |
330 | ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { | ||
324 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | 331 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); |
325 | rdp->offline_fqs++; | 332 | rdp->offline_fqs++; |
326 | return 1; | 333 | return 1; |
327 | } | 334 | } |
328 | |||
329 | /* | ||
330 | * The CPU is online, so send it a reschedule IPI. This forces | ||
331 | * it through the scheduler, and (inefficiently) also handles cases | ||
332 | * where idle loops fail to inform RCU about the CPU being idle. | ||
333 | */ | ||
334 | if (rdp->cpu != smp_processor_id()) | ||
335 | smp_send_reschedule(rdp->cpu); | ||
336 | else | ||
337 | set_need_resched(); | ||
338 | rdp->resched_ipi++; | ||
339 | return 0; | 335 | return 0; |
340 | } | 336 | } |
341 | 337 | ||
342 | #endif /* #ifdef CONFIG_SMP */ | ||
343 | |||
344 | /* | 338 | /* |
345 | * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle | 339 | * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle |
346 | * | 340 | * |
@@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
366 | atomic_inc(&rdtp->dynticks); | 360 | atomic_inc(&rdtp->dynticks); |
367 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | 361 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ |
368 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 362 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
363 | |||
364 | /* | ||
365 | * The idle task is not permitted to enter the idle loop while | ||
366 | * in an RCU read-side critical section. | ||
367 | */ | ||
368 | rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), | ||
369 | "Illegal idle entry in RCU read-side critical section."); | ||
370 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), | ||
371 | "Illegal idle entry in RCU-bh read-side critical section."); | ||
372 | rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), | ||
373 | "Illegal idle entry in RCU-sched read-side critical section."); | ||
369 | } | 374 | } |
370 | 375 | ||
371 | /** | 376 | /** |
@@ -389,10 +394,15 @@ void rcu_idle_enter(void) | |||
389 | local_irq_save(flags); | 394 | local_irq_save(flags); |
390 | rdtp = &__get_cpu_var(rcu_dynticks); | 395 | rdtp = &__get_cpu_var(rcu_dynticks); |
391 | oldval = rdtp->dynticks_nesting; | 396 | oldval = rdtp->dynticks_nesting; |
392 | rdtp->dynticks_nesting = 0; | 397 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); |
398 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | ||
399 | rdtp->dynticks_nesting = 0; | ||
400 | else | ||
401 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
393 | rcu_idle_enter_common(rdtp, oldval); | 402 | rcu_idle_enter_common(rdtp, oldval); |
394 | local_irq_restore(flags); | 403 | local_irq_restore(flags); |
395 | } | 404 | } |
405 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | ||
396 | 406 | ||
397 | /** | 407 | /** |
398 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle | 408 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle |
@@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
462 | * Exit idle mode, in other words, -enter- the mode in which RCU | 472 | * Exit idle mode, in other words, -enter- the mode in which RCU |
463 | * read-side critical sections can occur. | 473 | * read-side critical sections can occur. |
464 | * | 474 | * |
465 | * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to | 475 | * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to |
466 | * allow for the possibility of usermode upcalls messing up our count | 476 | * allow for the possibility of usermode upcalls messing up our count |
467 | * of interrupt nesting level during the busy period that is just | 477 | * of interrupt nesting level during the busy period that is just |
468 | * now starting. | 478 | * now starting. |
@@ -476,11 +486,15 @@ void rcu_idle_exit(void) | |||
476 | local_irq_save(flags); | 486 | local_irq_save(flags); |
477 | rdtp = &__get_cpu_var(rcu_dynticks); | 487 | rdtp = &__get_cpu_var(rcu_dynticks); |
478 | oldval = rdtp->dynticks_nesting; | 488 | oldval = rdtp->dynticks_nesting; |
479 | WARN_ON_ONCE(oldval != 0); | 489 | WARN_ON_ONCE(oldval < 0); |
480 | rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; | 490 | if (oldval & DYNTICK_TASK_NEST_MASK) |
491 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
492 | else | ||
493 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
481 | rcu_idle_exit_common(rdtp, oldval); | 494 | rcu_idle_exit_common(rdtp, oldval); |
482 | local_irq_restore(flags); | 495 | local_irq_restore(flags); |
483 | } | 496 | } |
497 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | ||
484 | 498 | ||
485 | /** | 499 | /** |
486 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | 500 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle |
@@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void) | |||
581 | } | 595 | } |
582 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 596 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
583 | 597 | ||
598 | #ifdef CONFIG_HOTPLUG_CPU | ||
599 | |||
600 | /* | ||
601 | * Is the current CPU online? Disable preemption to avoid false positives | ||
602 | * that could otherwise happen due to the current CPU number being sampled, | ||
603 | * this task being preempted, its old CPU being taken offline, resuming | ||
604 | * on some other CPU, then determining that its old CPU is now offline. | ||
605 | * It is OK to use RCU on an offline processor during initial boot, hence | ||
606 | * the check for rcu_scheduler_fully_active. Note also that it is OK | ||
607 | * for a CPU coming online to use RCU for one jiffy prior to marking itself | ||
608 | * online in the cpu_online_mask. Similarly, it is OK for a CPU going | ||
609 | * offline to continue to use RCU for one jiffy after marking itself | ||
610 | * offline in the cpu_online_mask. This leniency is necessary given the | ||
611 | * non-atomic nature of the online and offline processing, for example, | ||
612 | * the fact that a CPU enters the scheduler after completing the CPU_DYING | ||
613 | * notifiers. | ||
614 | * | ||
615 | * This is also why RCU internally marks CPUs online during the | ||
616 | * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. | ||
617 | * | ||
618 | * Disable checking if in an NMI handler because we cannot safely report | ||
619 | * errors from NMI handlers anyway. | ||
620 | */ | ||
621 | bool rcu_lockdep_current_cpu_online(void) | ||
622 | { | ||
623 | struct rcu_data *rdp; | ||
624 | struct rcu_node *rnp; | ||
625 | bool ret; | ||
626 | |||
627 | if (in_nmi()) | ||
628 | return 1; | ||
629 | preempt_disable(); | ||
630 | rdp = &__get_cpu_var(rcu_sched_data); | ||
631 | rnp = rdp->mynode; | ||
632 | ret = (rdp->grpmask & rnp->qsmaskinit) || | ||
633 | !rcu_scheduler_fully_active; | ||
634 | preempt_enable(); | ||
635 | return ret; | ||
636 | } | ||
637 | EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | ||
638 | |||
639 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
640 | |||
584 | #endif /* #ifdef CONFIG_PROVE_RCU */ | 641 | #endif /* #ifdef CONFIG_PROVE_RCU */ |
585 | 642 | ||
586 | /** | 643 | /** |
@@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void) | |||
595 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; | 652 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; |
596 | } | 653 | } |
597 | 654 | ||
598 | #ifdef CONFIG_SMP | ||
599 | |||
600 | /* | 655 | /* |
601 | * Snapshot the specified CPU's dynticks counter so that we can later | 656 | * Snapshot the specified CPU's dynticks counter so that we can later |
602 | * credit them with an implicit quiescent state. Return 1 if this CPU | 657 | * credit them with an implicit quiescent state. Return 1 if this CPU |
@@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
640 | return rcu_implicit_offline_qs(rdp); | 695 | return rcu_implicit_offline_qs(rdp); |
641 | } | 696 | } |
642 | 697 | ||
643 | #endif /* #ifdef CONFIG_SMP */ | 698 | static int jiffies_till_stall_check(void) |
699 | { | ||
700 | int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); | ||
701 | |||
702 | /* | ||
703 | * Limit check must be consistent with the Kconfig limits | ||
704 | * for CONFIG_RCU_CPU_STALL_TIMEOUT. | ||
705 | */ | ||
706 | if (till_stall_check < 3) { | ||
707 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; | ||
708 | till_stall_check = 3; | ||
709 | } else if (till_stall_check > 300) { | ||
710 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; | ||
711 | till_stall_check = 300; | ||
712 | } | ||
713 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | ||
714 | } | ||
644 | 715 | ||
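With the hard-coded RCU_SECONDS_TILL_STALL_CHECK gone, the stall deadline is now derived from the run-time rcu_cpu_stall_timeout knob, clamped to the same 3 to 300 second range the Kconfig option allows. A quick user-space check of the arithmetic; the HZ value is illustrative and the RCU_STALL_DELAY_DELTA slack is omitted.

	#include <stdio.h>

	#define HZ 1000				/* illustrative tick rate */

	/* Mirror of the clamp in jiffies_till_stall_check(), minus the slack. */
	static int stall_check_jiffies(int timeout_s)
	{
		if (timeout_s < 3)
			timeout_s = 3;
		else if (timeout_s > 300)
			timeout_s = 300;
		return timeout_s * HZ;
	}

	int main(void)
	{
		printf("%d\n", stall_check_jiffies(60));	/* 60000 jiffies   */
		printf("%d\n", stall_check_jiffies(0));		/* clamped: 3000   */
		printf("%d\n", stall_check_jiffies(1000));	/* clamped: 300000 */
		return 0;
	}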
645 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 716 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
646 | { | 717 | { |
647 | rsp->gp_start = jiffies; | 718 | rsp->gp_start = jiffies; |
648 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; | 719 | rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); |
649 | } | 720 | } |
650 | 721 | ||
651 | static void print_other_cpu_stall(struct rcu_state *rsp) | 722 | static void print_other_cpu_stall(struct rcu_state *rsp) |
@@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
664 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 735 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
665 | return; | 736 | return; |
666 | } | 737 | } |
667 | rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 738 | rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; |
668 | |||
669 | /* | ||
670 | * Now rat on any tasks that got kicked up to the root rcu_node | ||
671 | * due to CPU offlining. | ||
672 | */ | ||
673 | ndetected = rcu_print_task_stall(rnp); | ||
674 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 739 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
675 | 740 | ||
676 | /* | 741 | /* |
@@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
678 | * See Documentation/RCU/stallwarn.txt for info on how to debug | 743 | * See Documentation/RCU/stallwarn.txt for info on how to debug |
679 | * RCU CPU stall warnings. | 744 | * RCU CPU stall warnings. |
680 | */ | 745 | */ |
681 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", | 746 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", |
682 | rsp->name); | 747 | rsp->name); |
748 | print_cpu_stall_info_begin(); | ||
683 | rcu_for_each_leaf_node(rsp, rnp) { | 749 | rcu_for_each_leaf_node(rsp, rnp) { |
684 | raw_spin_lock_irqsave(&rnp->lock, flags); | 750 | raw_spin_lock_irqsave(&rnp->lock, flags); |
685 | ndetected += rcu_print_task_stall(rnp); | 751 | ndetected += rcu_print_task_stall(rnp); |
@@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
688 | continue; | 754 | continue; |
689 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 755 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
690 | if (rnp->qsmask & (1UL << cpu)) { | 756 | if (rnp->qsmask & (1UL << cpu)) { |
691 | printk(" %d", rnp->grplo + cpu); | 757 | print_cpu_stall_info(rsp, rnp->grplo + cpu); |
692 | ndetected++; | 758 | ndetected++; |
693 | } | 759 | } |
694 | } | 760 | } |
695 | printk("} (detected by %d, t=%ld jiffies)\n", | 761 | |
762 | /* | ||
763 | * Now rat on any tasks that got kicked up to the root rcu_node | ||
764 | * due to CPU offlining. | ||
765 | */ | ||
766 | rnp = rcu_get_root(rsp); | ||
767 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
768 | ndetected = rcu_print_task_stall(rnp); | ||
769 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
770 | |||
771 | print_cpu_stall_info_end(); | ||
772 | printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", | ||
696 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 773 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); |
697 | if (ndetected == 0) | 774 | if (ndetected == 0) |
698 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); | 775 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); |
@@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
716 | * See Documentation/RCU/stallwarn.txt for info on how to debug | 793 | * See Documentation/RCU/stallwarn.txt for info on how to debug |
717 | * RCU CPU stall warnings. | 794 | * RCU CPU stall warnings. |
718 | */ | 795 | */ |
719 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", | 796 | printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); |
720 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); | 797 | print_cpu_stall_info_begin(); |
798 | print_cpu_stall_info(rsp, smp_processor_id()); | ||
799 | print_cpu_stall_info_end(); | ||
800 | printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); | ||
721 | if (!trigger_all_cpu_backtrace()) | 801 | if (!trigger_all_cpu_backtrace()) |
722 | dump_stack(); | 802 | dump_stack(); |
723 | 803 | ||
724 | raw_spin_lock_irqsave(&rnp->lock, flags); | 804 | raw_spin_lock_irqsave(&rnp->lock, flags); |
725 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) | 805 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) |
726 | rsp->jiffies_stall = | 806 | rsp->jiffies_stall = jiffies + |
727 | jiffies + RCU_SECONDS_TILL_STALL_RECHECK; | 807 | 3 * jiffies_till_stall_check() + 3; |
728 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 808 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
729 | 809 | ||
730 | set_need_resched(); /* kick ourselves to get things going. */ | 810 | set_need_resched(); /* kick ourselves to get things going. */ |
@@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct | |||
807 | rdp->passed_quiesce = 0; | 887 | rdp->passed_quiesce = 0; |
808 | } else | 888 | } else |
809 | rdp->qs_pending = 0; | 889 | rdp->qs_pending = 0; |
890 | zero_cpu_stall_ticks(rdp); | ||
810 | } | 891 | } |
811 | } | 892 | } |
812 | 893 | ||
@@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
943 | * in preparation for detecting the next grace period. The caller must hold | 1024 | * in preparation for detecting the next grace period. The caller must hold |
944 | * the root node's ->lock, which is released before return. Hard irqs must | 1025 | * the root node's ->lock, which is released before return. Hard irqs must |
945 | * be disabled. | 1026 | * be disabled. |
1027 | * | ||
1028 | * Note that it is legal for a dying CPU (which is marked as offline) to | ||
1029 | * invoke this function. This can happen when the dying CPU reports its | ||
1030 | * quiescent state. | ||
946 | */ | 1031 | */ |
947 | static void | 1032 | static void |
948 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | 1033 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) |
@@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
980 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | 1065 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ |
981 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 1066 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
982 | record_gp_stall_check_time(rsp); | 1067 | record_gp_stall_check_time(rsp); |
983 | |||
984 | /* Special-case the common single-level case. */ | ||
985 | if (NUM_RCU_NODES == 1) { | ||
986 | rcu_preempt_check_blocked_tasks(rnp); | ||
987 | rnp->qsmask = rnp->qsmaskinit; | ||
988 | rnp->gpnum = rsp->gpnum; | ||
989 | rnp->completed = rsp->completed; | ||
990 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ | ||
991 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | ||
992 | rcu_preempt_boost_start_gp(rnp); | ||
993 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
994 | rnp->level, rnp->grplo, | ||
995 | rnp->grphi, rnp->qsmask); | ||
996 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
997 | return; | ||
998 | } | ||
999 | |||
1000 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ | 1068 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ |
1001 | 1069 | ||
1002 | |||
1003 | /* Exclude any concurrent CPU-hotplug operations. */ | 1070 | /* Exclude any concurrent CPU-hotplug operations. */ |
1004 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1071 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ |
1005 | 1072 | ||
@@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1245 | 1312 | ||
1246 | /* | 1313 | /* |
1247 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. |
1248 | * Synchronization is not required because this function executes | 1315 | * Also record a quiescent state for this CPU for the current grace period. |
1249 | * in stop_machine() context. | 1316 | * Synchronization and interrupt disabling are not required because |
1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
1318 | * operations that might block must be done later from the CPU_DEAD | ||
1319 | * notifier. | ||
1320 | * | ||
1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
1323 | * destination from the bits set in that mask. | ||
1250 | */ | 1324 | */ |
1251 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) | 1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1252 | { | 1326 | { |
1253 | int i; | 1327 | int i; |
1254 | /* current DYING CPU is cleared in the cpu_online_mask */ | 1328 | unsigned long mask; |
1255 | int receive_cpu = cpumask_any(cpu_online_mask); | 1329 | int receive_cpu = cpumask_any(cpu_online_mask); |
1256 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
1257 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | 1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); |
1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
1333 | |||
1334 | /* First, adjust the counts. */ | ||
1335 | if (rdp->nxtlist != NULL) { | ||
1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | ||
1337 | receive_rdp->qlen += rdp->qlen; | ||
1338 | rdp->qlen_lazy = 0; | ||
1339 | rdp->qlen = 0; | ||
1340 | } | ||
1258 | 1341 | ||
1259 | if (rdp->nxtlist == NULL) | 1342 | /* |
1260 | return; /* irqs disabled, so comparison is stable. */ | 1343 | * Next, move ready-to-invoke callbacks to be invoked on some |
1344 | * other CPU. These will not be required to pass through another | ||
1345 | * grace period: They are done, regardless of CPU. | ||
1346 | */ | ||
1347 | if (rdp->nxtlist != NULL && | ||
1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | ||
1349 | struct rcu_head *oldhead; | ||
1350 | struct rcu_head **oldtail; | ||
1351 | struct rcu_head **newtail; | ||
1352 | |||
1353 | oldhead = rdp->nxtlist; | ||
1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
1361 | receive_rdp->nxttail[i] = newtail; | ||
1362 | if (rdp->nxttail[i] == newtail) | ||
1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1364 | } | ||
1365 | } | ||
1261 | 1366 | ||
1262 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1367 | /* |
1263 | receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | 1368 | * Finally, put the rest of the callbacks at the end of the list. |
1264 | receive_rdp->qlen += rdp->qlen; | 1369 | * The ones that made it partway through get to start over: We |
1265 | receive_rdp->n_cbs_adopted += rdp->qlen; | 1370 | * cannot assume that grace periods are synchronized across CPUs. |
1266 | rdp->n_cbs_orphaned += rdp->qlen; | 1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but |
1372 | * this does not seem compelling. Not yet, anyway.) | ||
1373 | */ | ||
1374 | if (rdp->nxtlist != NULL) { | ||
1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | ||
1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | ||
1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1380 | |||
1381 | rdp->nxtlist = NULL; | ||
1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1384 | } | ||
1267 | 1385 | ||
1268 | rdp->nxtlist = NULL; | 1386 | /* |
1269 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1387 | * Record a quiescent state for the dying CPU. This is safe |
1270 | rdp->nxttail[i] = &rdp->nxtlist; | 1388 | * only because we have already cleared out the callbacks. |
1271 | rdp->qlen = 0; | 1389 | * (Otherwise, the RCU core might try to schedule the invocation |
1390 | * of callbacks on this now-offline CPU, which would be bad.) | ||
1391 | */ | ||
1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | ||
1393 | trace_rcu_grace_period(rsp->name, | ||
1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | ||
1395 | "cpuofl"); | ||
1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
1272 | } | 1398 | } |
1273 | 1399 | ||
1274 | /* | 1400 | /* |
1275 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy | 1401 | * The CPU has been completely removed, and some other CPU is reporting |
1276 | * and move all callbacks from the outgoing CPU to the current one. | 1402 | * this fact from process context. Do the remainder of the cleanup. |
1277 | * There can only be one CPU hotplug operation at a time, so no other | 1403 | * There can only be one CPU hotplug operation at a time, so no other |
1278 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1404 | * CPU can be attempting to update rcu_cpu_kthread_task. |
1279 | */ | 1405 | */ |
1280 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | 1406 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
1281 | { | 1407 | { |
1282 | unsigned long flags; | 1408 | unsigned long flags; |
1283 | unsigned long mask; | 1409 | unsigned long mask; |
1284 | int need_report = 0; | 1410 | int need_report = 0; |
1285 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1286 | struct rcu_node *rnp; | 1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ |
1287 | 1413 | ||
1414 | /* Adjust any no-longer-needed kthreads. */ | ||
1288 | rcu_stop_cpu_kthread(cpu); | 1415 | rcu_stop_cpu_kthread(cpu); |
1416 | rcu_node_kthread_setaffinity(rnp, -1); | ||
1417 | |||
1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | ||
1289 | 1419 | ||
1290 | /* Exclude any attempts to start a new grace period. */ | 1420 | /* Exclude any attempts to start a new grace period. */ |
1291 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1292 | 1422 | ||
1293 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1294 | rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ | ||
1295 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1296 | do { | 1425 | do { |
1297 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1426 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
@@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1299 | if (rnp->qsmaskinit != 0) { | 1428 | if (rnp->qsmaskinit != 0) { |
1300 | if (rnp != rdp->mynode) | 1429 | if (rnp != rdp->mynode) |
1301 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1430 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1302 | else | ||
1303 | trace_rcu_grace_period(rsp->name, | ||
1304 | rnp->gpnum + 1 - | ||
1305 | !!(rnp->qsmask & mask), | ||
1306 | "cpuofl"); | ||
1307 | break; | 1431 | break; |
1308 | } | 1432 | } |
1309 | if (rnp == rdp->mynode) { | 1433 | if (rnp == rdp->mynode) |
1310 | trace_rcu_grace_period(rsp->name, | ||
1311 | rnp->gpnum + 1 - | ||
1312 | !!(rnp->qsmask & mask), | ||
1313 | "cpuofl"); | ||
1314 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | 1434 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); |
1315 | } else | 1435 | else |
1316 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1436 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1317 | mask = rnp->grpmask; | 1437 | mask = rnp->grpmask; |
1318 | rnp = rnp->parent; | 1438 | rnp = rnp->parent; |
@@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1332 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1452 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1333 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1453 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1334 | rcu_report_exp_rnp(rsp, rnp, true); | 1454 | rcu_report_exp_rnp(rsp, rnp, true); |
1335 | rcu_node_kthread_setaffinity(rnp, -1); | ||
1336 | } | ||
1337 | |||
1338 | /* | ||
1339 | * Remove the specified CPU from the RCU hierarchy and move any pending | ||
1340 | * callbacks that it might have to the current CPU. This code assumes | ||
1341 | * that at least one CPU in the system will remain running at all times. | ||
1342 | * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. | ||
1343 | */ | ||
1344 | static void rcu_offline_cpu(int cpu) | ||
1345 | { | ||
1346 | __rcu_offline_cpu(cpu, &rcu_sched_state); | ||
1347 | __rcu_offline_cpu(cpu, &rcu_bh_state); | ||
1348 | rcu_preempt_offline_cpu(cpu); | ||
1349 | } | 1455 | } |
1350 | 1456 | ||
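The do/while walk in rcu_cleanup_dead_cpu() above clears the dead CPU's bit in its leaf rcu_node's qsmaskinit and keeps propagating upward only while a node becomes completely empty. The following standalone toy model shows just that upward walk; the struct and names are invented for illustration and omit the locking and the blocked-task handling the real code performs at rdp->mynode.

#include <stdio.h>

/* Toy model of the leaf-to-root bitmask walk in rcu_cleanup_dead_cpu(). */
struct toy_rcu_node {
	unsigned long qsmaskinit;      /* CPUs/groups still present below */
	unsigned long grpmask;         /* this node's bit in its parent */
	struct toy_rcu_node *parent;
};

static void toy_clear_dead_cpu(struct toy_rcu_node *rnp, unsigned long mask)
{
	do {
		rnp->qsmaskinit &= ~mask;
		if (rnp->qsmaskinit != 0)
			break;                 /* siblings remain: stop here */
		mask = rnp->grpmask;           /* node now empty: clear it above */
		rnp = rnp->parent;
	} while (rnp != NULL);
}

int main(void)
{
	struct toy_rcu_node root = { .qsmaskinit = 0x3 };
	struct toy_rcu_node leaf = { .qsmaskinit = 0x1, .grpmask = 0x1,
				     .parent = &root };

	toy_clear_dead_cpu(&leaf, 0x1);        /* last CPU on this leaf goes away */
	printf("leaf=%lx root=%lx\n", leaf.qsmaskinit, root.qsmaskinit);
	return 0;
}

The early break is what keeps offlining cheap: as soon as some sibling CPU or group is still present, nothing above that node needs to change.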
1351 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1352 | 1458 | ||
1353 | static void rcu_send_cbs_to_online(struct rcu_state *rsp) | 1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1354 | { | 1460 | { |
1355 | } | 1461 | } |
1356 | 1462 | ||
1357 | static void rcu_offline_cpu(int cpu) | 1463 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
1358 | { | 1464 | { |
1359 | } | 1465 | } |
1360 | 1466 | ||
@@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1368 | { | 1474 | { |
1369 | unsigned long flags; | 1475 | unsigned long flags; |
1370 | struct rcu_head *next, *list, **tail; | 1476 | struct rcu_head *next, *list, **tail; |
1371 | int bl, count; | 1477 | int bl, count, count_lazy; |
1372 | 1478 | ||
1373 | /* If no callbacks are ready, just return. */ | 1479 | /* If no callbacks are ready, just return. */ |
1374 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1480 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
1375 | trace_rcu_batch_start(rsp->name, 0, 0); | 1481 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); |
1376 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), | 1482 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), |
1377 | need_resched(), is_idle_task(current), | 1483 | need_resched(), is_idle_task(current), |
1378 | rcu_is_callbacks_kthread()); | 1484 | rcu_is_callbacks_kthread()); |
@@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1384 | * races with call_rcu() from interrupt handlers. | 1490 | * races with call_rcu() from interrupt handlers. |
1385 | */ | 1491 | */ |
1386 | local_irq_save(flags); | 1492 | local_irq_save(flags); |
1493 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | ||
1387 | bl = rdp->blimit; | 1494 | bl = rdp->blimit; |
1388 | trace_rcu_batch_start(rsp->name, rdp->qlen, bl); | 1495 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); |
1389 | list = rdp->nxtlist; | 1496 | list = rdp->nxtlist; |
1390 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1497 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
1391 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1498 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
@@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1396 | local_irq_restore(flags); | 1503 | local_irq_restore(flags); |
1397 | 1504 | ||
1398 | /* Invoke callbacks. */ | 1505 | /* Invoke callbacks. */ |
1399 | count = 0; | 1506 | count = count_lazy = 0; |
1400 | while (list) { | 1507 | while (list) { |
1401 | next = list->next; | 1508 | next = list->next; |
1402 | prefetch(next); | 1509 | prefetch(next); |
1403 | debug_rcu_head_unqueue(list); | 1510 | debug_rcu_head_unqueue(list); |
1404 | __rcu_reclaim(rsp->name, list); | 1511 | if (__rcu_reclaim(rsp->name, list)) |
1512 | count_lazy++; | ||
1405 | list = next; | 1513 | list = next; |
1406 | /* Stop only if limit reached and CPU has something to do. */ | 1514 | /* Stop only if limit reached and CPU has something to do. */ |
1407 | if (++count >= bl && | 1515 | if (++count >= bl && |
@@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1416 | rcu_is_callbacks_kthread()); | 1524 | rcu_is_callbacks_kthread()); |
1417 | 1525 | ||
1418 | /* Update count, and requeue any remaining callbacks. */ | 1526 | /* Update count, and requeue any remaining callbacks. */ |
1527 | rdp->qlen_lazy -= count_lazy; | ||
1419 | rdp->qlen -= count; | 1528 | rdp->qlen -= count; |
1420 | rdp->n_cbs_invoked += count; | 1529 | rdp->n_cbs_invoked += count; |
1421 | if (list != NULL) { | 1530 | if (list != NULL) { |
@@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1458 | void rcu_check_callbacks(int cpu, int user) | 1567 | void rcu_check_callbacks(int cpu, int user) |
1459 | { | 1568 | { |
1460 | trace_rcu_utilization("Start scheduler-tick"); | 1569 | trace_rcu_utilization("Start scheduler-tick"); |
1570 | increment_cpu_stall_ticks(); | ||
1461 | if (user || rcu_is_cpu_rrupt_from_idle()) { | 1571 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
1462 | 1572 | ||
1463 | /* | 1573 | /* |
@@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user) | |||
1492 | trace_rcu_utilization("End scheduler-tick"); | 1602 | trace_rcu_utilization("End scheduler-tick"); |
1493 | } | 1603 | } |
1494 | 1604 | ||
1495 | #ifdef CONFIG_SMP | ||
1496 | |||
1497 | /* | 1605 | /* |
1498 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1606 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
1499 | * have not yet encountered a quiescent state, using the function specified. | 1607 | * have not yet encountered a quiescent state, using the function specified. |
@@ -1616,15 +1724,6 @@ unlock_fqs_ret: | |||
1616 | trace_rcu_utilization("End fqs"); | 1724 | trace_rcu_utilization("End fqs"); |
1617 | } | 1725 | } |
1618 | 1726 | ||
1619 | #else /* #ifdef CONFIG_SMP */ | ||
1620 | |||
1621 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | ||
1622 | { | ||
1623 | set_need_resched(); | ||
1624 | } | ||
1625 | |||
1626 | #endif /* #else #ifdef CONFIG_SMP */ | ||
1627 | |||
1628 | /* | 1727 | /* |
1629 | * This does the RCU core processing work for the specified rcu_state | 1728 | * This does the RCU core processing work for the specified rcu_state |
1630 | * and rcu_data structures. This may be called only from the CPU to | 1729 | * and rcu_data structures. This may be called only from the CPU to |
@@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void) | |||
1702 | 1801 | ||
1703 | static void | 1802 | static void |
1704 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 1803 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
1705 | struct rcu_state *rsp) | 1804 | struct rcu_state *rsp, bool lazy) |
1706 | { | 1805 | { |
1707 | unsigned long flags; | 1806 | unsigned long flags; |
1708 | struct rcu_data *rdp; | 1807 | struct rcu_data *rdp; |
1709 | 1808 | ||
1809 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ | ||
1710 | debug_rcu_head_queue(head); | 1810 | debug_rcu_head_queue(head); |
1711 | head->func = func; | 1811 | head->func = func; |
1712 | head->next = NULL; | 1812 | head->next = NULL; |
@@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1720 | * a quiescent state betweentimes. | 1820 | * a quiescent state betweentimes. |
1721 | */ | 1821 | */ |
1722 | local_irq_save(flags); | 1822 | local_irq_save(flags); |
1823 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | ||
1723 | rdp = this_cpu_ptr(rsp->rda); | 1824 | rdp = this_cpu_ptr(rsp->rda); |
1724 | 1825 | ||
1725 | /* Add the callback to our list. */ | 1826 | /* Add the callback to our list. */ |
1726 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1827 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
1727 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1828 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1728 | rdp->qlen++; | 1829 | rdp->qlen++; |
1830 | if (lazy) | ||
1831 | rdp->qlen_lazy++; | ||
1729 | 1832 | ||
1730 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1833 | if (__is_kfree_rcu_offset((unsigned long)func)) |
1731 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1834 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
1732 | rdp->qlen); | 1835 | rdp->qlen_lazy, rdp->qlen); |
1733 | else | 1836 | else |
1734 | trace_rcu_callback(rsp->name, head, rdp->qlen); | 1837 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); |
1735 | 1838 | ||
1736 | /* If interrupts were disabled, don't dive into RCU core. */ | 1839 | /* If interrupts were disabled, don't dive into RCU core. */ |
1737 | if (irqs_disabled_flags(flags)) { | 1840 | if (irqs_disabled_flags(flags)) { |
@@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1778 | */ | 1881 | */ |
1779 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 1882 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
1780 | { | 1883 | { |
1781 | __call_rcu(head, func, &rcu_sched_state); | 1884 | __call_rcu(head, func, &rcu_sched_state, 0); |
1782 | } | 1885 | } |
1783 | EXPORT_SYMBOL_GPL(call_rcu_sched); | 1886 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
1784 | 1887 | ||
1785 | /* | 1888 | /* |
1786 | * Queue an RCU for invocation after a quicker grace period. | 1889 | * Queue an RCU callback for invocation after a quicker grace period. |
1787 | */ | 1890 | */ |
1788 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 1891 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
1789 | { | 1892 | { |
1790 | __call_rcu(head, func, &rcu_bh_state); | 1893 | __call_rcu(head, func, &rcu_bh_state, 0); |
1791 | } | 1894 | } |
1792 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 1895 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1793 | 1896 | ||
@@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
1816 | */ | 1919 | */ |
1817 | void synchronize_sched(void) | 1920 | void synchronize_sched(void) |
1818 | { | 1921 | { |
1922 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
1923 | !lock_is_held(&rcu_lock_map) && | ||
1924 | !lock_is_held(&rcu_sched_lock_map), | ||
1925 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | ||
1819 | if (rcu_blocking_is_gp()) | 1926 | if (rcu_blocking_is_gp()) |
1820 | return; | 1927 | return; |
1821 | wait_rcu_gp(call_rcu_sched); | 1928 | wait_rcu_gp(call_rcu_sched); |
@@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
1833 | */ | 1940 | */ |
1834 | void synchronize_rcu_bh(void) | 1941 | void synchronize_rcu_bh(void) |
1835 | { | 1942 | { |
1943 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
1944 | !lock_is_held(&rcu_lock_map) && | ||
1945 | !lock_is_held(&rcu_sched_lock_map), | ||
1946 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | ||
1836 | if (rcu_blocking_is_gp()) | 1947 | if (rcu_blocking_is_gp()) |
1837 | return; | 1948 | return; |
1838 | wait_rcu_gp(call_rcu_bh); | 1949 | wait_rcu_gp(call_rcu_bh); |
1839 | } | 1950 | } |
1840 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | 1951 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); |
1841 | 1952 | ||
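The rcu_lockdep_assert() calls added to synchronize_sched() and synchronize_rcu_bh() above (and to synchronize_rcu() in rcutree_plugin.h) turn a silent hang into a lockdep report when CONFIG_PROVE_RCU=y: waiting for a grace period from inside a read-side critical section can never complete, because the grace period cannot end until that reader exits. The deliberately broken fragment below only illustrates the misuse being caught; it is not code from this patch.

#include <linux/rcupdate.h>

/*
 * BROKEN on purpose: sleeps waiting for a grace period that cannot end
 * until this very reader leaves its critical section.  With the asserts
 * above and CONFIG_PROVE_RCU=y this is flagged instead of deadlocking.
 */
static void broken_updater(void)
{
	rcu_read_lock();
	synchronize_rcu();	/* illegal inside rcu_read_lock() */
	rcu_read_unlock();
}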
1953 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
1954 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
1955 | |||
1956 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
1957 | { | ||
1958 | /* | ||
1959 | * There must be a full memory barrier on each affected CPU | ||
1960 | * between the time that try_stop_cpus() is called and the | ||
1961 | * time that it returns. | ||
1962 | * | ||
1963 | * In the current initial implementation of cpu_stop, the | ||
1964 | * above condition is already met when the control reaches | ||
1965 | * this point and the following smp_mb() is not strictly | ||
1966 | * necessary. Do smp_mb() anyway for documentation and | ||
1967 | * robustness against future implementation changes. | ||
1968 | */ | ||
1969 | smp_mb(); /* See above comment block. */ | ||
1970 | return 0; | ||
1971 | } | ||
1972 | |||
1973 | /** | ||
1974 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | ||
1975 | * | ||
1976 | * Wait for an RCU-sched grace period to elapse, but use a "big hammer" | ||
1977 | * approach to force the grace period to end quickly. This consumes | ||
1978 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
1979 | * and is thus not recommended for any sort of common-case code. In fact, | ||
1980 | * if you are using synchronize_sched_expedited() in a loop, please | ||
1981 | * restructure your code to batch your updates, and then use a single | ||
1982 | * synchronize_sched() instead. | ||
1983 | * | ||
1984 | * Note that it is illegal to call this function while holding any lock | ||
1985 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
1986 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
1987 | * these restrictions will result in deadlock. | ||
1988 | * | ||
1989 | * This implementation can be thought of as an application of ticket | ||
1990 | * locking to RCU, with sync_sched_expedited_started and | ||
1991 | * sync_sched_expedited_done taking on the roles of the halves | ||
1992 | * of the ticket-lock word. Each task atomically increments | ||
1993 | * sync_sched_expedited_started upon entry, snapshotting the old value, | ||
1994 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
1995 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
1996 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
1997 | * update sync_sched_expedited_done to match our snapshot -- but | ||
1998 | * only if someone else has not already advanced past our snapshot. | ||
1999 | * | ||
2000 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
2001 | * of sync_sched_expedited_done. If it has advanced past our | ||
2002 | * initial snapshot, then someone else must have forced a grace period | ||
2003 | * some time after we took our snapshot. In this case, our work is | ||
2004 | * done for us, and we can simply return. Otherwise, we try again, | ||
2005 | * but keep our initial snapshot for purposes of checking for someone | ||
2006 | * doing our work for us. | ||
2007 | * | ||
2008 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
2009 | */ | ||
2010 | void synchronize_sched_expedited(void) | ||
2011 | { | ||
2012 | int firstsnap, s, snap, trycount = 0; | ||
2013 | |||
2014 | /* Note that atomic_inc_return() implies full memory barrier. */ | ||
2015 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | ||
2016 | get_online_cpus(); | ||
2017 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | ||
2018 | |||
2019 | /* | ||
2020 | * Each pass through the following loop attempts to force a | ||
2021 | * context switch on each CPU. | ||
2022 | */ | ||
2023 | while (try_stop_cpus(cpu_online_mask, | ||
2024 | synchronize_sched_expedited_cpu_stop, | ||
2025 | NULL) == -EAGAIN) { | ||
2026 | put_online_cpus(); | ||
2027 | |||
2028 | /* No joy, try again later. Or just synchronize_sched(). */ | ||
2029 | if (trycount++ < 10) | ||
2030 | udelay(trycount * num_online_cpus()); | ||
2031 | else { | ||
2032 | synchronize_sched(); | ||
2033 | return; | ||
2034 | } | ||
2035 | |||
2036 | /* Check to see if someone else did our work for us. */ | ||
2037 | s = atomic_read(&sync_sched_expedited_done); | ||
2038 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | ||
2039 | smp_mb(); /* ensure test happens before caller kfree */ | ||
2040 | return; | ||
2041 | } | ||
2042 | |||
2043 | /* | ||
2044 | * Refetching sync_sched_expedited_started allows later | ||
2045 | * callers to piggyback on our grace period. We subtract | ||
2046 | * 1 to get the same token that the last incrementer got. | ||
2047 | * We retry after they started, so our grace period works | ||
2048 | * for them, and they started after our first try, so their | ||
2049 | * grace period works for us. | ||
2050 | */ | ||
2051 | get_online_cpus(); | ||
2052 | snap = atomic_read(&sync_sched_expedited_started); | ||
2053 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
2054 | } | ||
2055 | |||
2056 | /* | ||
2057 | * Everyone up to our most recent fetch is covered by our grace | ||
2058 | * period. Update the counter, but only if our work is still | ||
2059 | * relevant -- which it won't be if someone who started later | ||
2060 | * than we did beat us to the punch. | ||
2061 | */ | ||
2062 | do { | ||
2063 | s = atomic_read(&sync_sched_expedited_done); | ||
2064 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | ||
2065 | smp_mb(); /* ensure test happens before caller kfree */ | ||
2066 | break; | ||
2067 | } | ||
2068 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | ||
2069 | |||
2070 | put_online_cpus(); | ||
2071 | } | ||
2072 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
2073 | |||
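The ticket-lock protocol described in the block comment can be modelled in a few lines: the "started" counter hands out a ticket on entry, the "done" counter records the newest ticket known to be covered by a completed grace period, and a caller whose ticket is already covered returns without doing any work. The code below is a standalone toy model of the counter protocol only, with invented names; it has no try_stop_cpus(), no memory-barrier placement, and none of the wraparound handling (UINT_CMP_GE) the kernel uses.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int started = ATOMIC_VAR_INIT(0);  /* models ..._expedited_started */
static atomic_int done    = ATOMIC_VAR_INIT(0);  /* models ..._expedited_done */

/* Has a grace period already completed that covers ticket @snap? */
static bool covered(int snap)
{
	/* The kernel uses a wraparound-tolerant comparison here. */
	return atomic_load(&done) >= snap;
}

static void toy_expedited(void)
{
	int snap = atomic_fetch_add(&started, 1) + 1;  /* take a ticket */

	if (covered(snap)) {
		puts("someone else's grace period covered us");
		return;
	}
	/* ...here the kernel forces a context switch on every CPU... */

	/* Publish completion, but never move 'done' backwards. */
	int s = atomic_load(&done);
	while (s < snap && !atomic_compare_exchange_weak(&done, &s, snap))
		;	/* retry with the refreshed value of s */
	puts("forced a grace period and advanced done");
}

int main(void)
{
	toy_expedited();
	toy_expedited();
	return 0;
}

As the comment also notes, callers tempted to use the expedited primitive in a loop are better off batching their updates behind one ordinary synchronize_sched().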
1842 | /* | 2074 | /* |
1843 | * Check to see if there is any immediate RCU-related work to be done | 2075 | * Check to see if there is any immediate RCU-related work to be done |
1844 | * by the current CPU, for the specified type of RCU, returning 1 if so. | 2076 | * by the current CPU, for the specified type of RCU, returning 1 if so. |
@@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
1932 | /* RCU callbacks either ready or pending? */ | 2164 | /* RCU callbacks either ready or pending? */ |
1933 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 2165 | return per_cpu(rcu_sched_data, cpu).nxtlist || |
1934 | per_cpu(rcu_bh_data, cpu).nxtlist || | 2166 | per_cpu(rcu_bh_data, cpu).nxtlist || |
1935 | rcu_preempt_needs_cpu(cpu); | 2167 | rcu_preempt_cpu_has_callbacks(cpu); |
1936 | } | 2168 | } |
1937 | 2169 | ||
1938 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2170 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
@@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
2027 | rdp->nxtlist = NULL; | 2259 | rdp->nxtlist = NULL; |
2028 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 2260 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
2029 | rdp->nxttail[i] = &rdp->nxtlist; | 2261 | rdp->nxttail[i] = &rdp->nxtlist; |
2262 | rdp->qlen_lazy = 0; | ||
2030 | rdp->qlen = 0; | 2263 | rdp->qlen = 0; |
2031 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2264 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
2032 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); | 2265 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
2033 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 2266 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
2034 | rdp->cpu = cpu; | 2267 | rdp->cpu = cpu; |
2035 | rdp->rsp = rsp; | 2268 | rdp->rsp = rsp; |
@@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2057 | rdp->qlen_last_fqs_check = 0; | 2290 | rdp->qlen_last_fqs_check = 0; |
2058 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2291 | rdp->n_force_qs_snap = rsp->n_force_qs; |
2059 | rdp->blimit = blimit; | 2292 | rdp->blimit = blimit; |
2060 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; | 2293 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
2061 | atomic_set(&rdp->dynticks->dynticks, | 2294 | atomic_set(&rdp->dynticks->dynticks, |
2062 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 2295 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
2063 | rcu_prepare_for_idle_init(cpu); | 2296 | rcu_prepare_for_idle_init(cpu); |
@@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2139 | * touch any data without introducing corruption. We send the | 2372 | * touch any data without introducing corruption. We send the |
2140 | * dying CPU's callbacks to an arbitrarily chosen online CPU. | 2373 | * dying CPU's callbacks to an arbitrarily chosen online CPU. |
2141 | */ | 2374 | */ |
2142 | rcu_send_cbs_to_online(&rcu_bh_state); | 2375 | rcu_cleanup_dying_cpu(&rcu_bh_state); |
2143 | rcu_send_cbs_to_online(&rcu_sched_state); | 2376 | rcu_cleanup_dying_cpu(&rcu_sched_state); |
2144 | rcu_preempt_send_cbs_to_online(); | 2377 | rcu_preempt_cleanup_dying_cpu(); |
2145 | rcu_cleanup_after_idle(cpu); | 2378 | rcu_cleanup_after_idle(cpu); |
2146 | break; | 2379 | break; |
2147 | case CPU_DEAD: | 2380 | case CPU_DEAD: |
2148 | case CPU_DEAD_FROZEN: | 2381 | case CPU_DEAD_FROZEN: |
2149 | case CPU_UP_CANCELED: | 2382 | case CPU_UP_CANCELED: |
2150 | case CPU_UP_CANCELED_FROZEN: | 2383 | case CPU_UP_CANCELED_FROZEN: |
2151 | rcu_offline_cpu(cpu); | 2384 | rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); |
2385 | rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); | ||
2386 | rcu_preempt_cleanup_dead_cpu(cpu); | ||
2152 | break; | 2387 | break; |
2153 | default: | 2388 | default: |
2154 | break; | 2389 | break; |
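The notifier now splits offline handling into two phases per RCU flavor: while the CPU is going down, its callbacks are handed to an online CPU (rcu_cleanup_dying_cpu(), invoked from the leg whose comment above mentions the dying CPU), and once it is dead its bookkeeping is pruned from the rcu_node hierarchy (rcu_cleanup_dead_cpu()). The toy dispatcher below restates only that two-phase structure; the event names and helpers are invented and are not the kernel notifier API.

#include <stdio.h>

enum toy_hotplug_event { TOY_CPU_DYING, TOY_CPU_DEAD };

static void toy_cleanup_dying(const char *flavor)
{
	/* Runs on the outgoing CPU: hand its callbacks to an online CPU. */
	printf("dying: %s callbacks migrated\n", flavor);
}

static void toy_cleanup_dead(const char *flavor)
{
	/* Runs later on a survivor: remove the dead CPU from the masks. */
	printf("dead: %s bookkeeping pruned\n", flavor);
}

static void toy_rcu_cpu_notify(enum toy_hotplug_event ev)
{
	static const char *flavors[] = { "rcu_bh", "rcu_sched", "rcu_preempt" };

	for (unsigned int i = 0; i < sizeof(flavors) / sizeof(flavors[0]); i++) {
		if (ev == TOY_CPU_DYING)
			toy_cleanup_dying(flavors[i]);
		else
			toy_cleanup_dead(flavors[i]);
	}
}

int main(void)
{
	toy_rcu_cpu_notify(TOY_CPU_DYING);
	toy_rcu_cpu_notify(TOY_CPU_DEAD);
	return 0;
}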
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index fddff92d6676..cdd1be0a4072 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -239,6 +239,12 @@ struct rcu_data { | |||
239 | bool preemptible; /* Preemptible RCU? */ | 239 | bool preemptible; /* Preemptible RCU? */ |
240 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 240 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
241 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 241 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
242 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
243 | unsigned long ticks_this_gp; /* The number of scheduling-clock */ | ||
244 | /* ticks this CPU has handled */ | ||
245 | /* during and after the last grace */ | ||
246 | /* period it is aware of. */ | ||
247 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
242 | 248 | ||
243 | /* 2) batch handling */ | 249 | /* 2) batch handling */ |
244 | /* | 250 | /* |
@@ -265,7 +271,8 @@ struct rcu_data { | |||
265 | */ | 271 | */ |
266 | struct rcu_head *nxtlist; | 272 | struct rcu_head *nxtlist; |
267 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; | 273 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; |
268 | long qlen; /* # of queued callbacks */ | 274 | long qlen_lazy; /* # of lazy queued callbacks */ |
275 | long qlen; /* # of queued callbacks, incl lazy */ | ||
269 | long qlen_last_fqs_check; | 276 | long qlen_last_fqs_check; |
270 | /* qlen at last check for QS forcing */ | 277 | /* qlen at last check for QS forcing */ |
271 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 278 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
@@ -282,7 +289,6 @@ struct rcu_data { | |||
282 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 289 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
283 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ | 290 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ |
284 | unsigned long offline_fqs; /* Kicked due to being offline. */ | 291 | unsigned long offline_fqs; /* Kicked due to being offline. */ |
285 | unsigned long resched_ipi; /* Sent a resched IPI. */ | ||
286 | 292 | ||
287 | /* 5) __rcu_pending() statistics. */ | 293 | /* 5) __rcu_pending() statistics. */ |
288 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ | 294 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ |
@@ -313,12 +319,6 @@ struct rcu_data { | |||
313 | #else | 319 | #else |
314 | #define RCU_STALL_DELAY_DELTA 0 | 320 | #define RCU_STALL_DELAY_DELTA 0 |
315 | #endif | 321 | #endif |
316 | |||
317 | #define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ | ||
318 | RCU_STALL_DELAY_DELTA) | ||
319 | /* for rsp->jiffies_stall */ | ||
320 | #define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) | ||
321 | /* for rsp->jiffies_stall */ | ||
322 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 322 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ |
323 | /* to take at least one */ | 323 | /* to take at least one */ |
324 | /* scheduling clock irq */ | 324 | /* scheduling clock irq */ |
@@ -438,8 +438,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | |||
438 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | 438 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, |
439 | struct rcu_node *rnp, | 439 | struct rcu_node *rnp, |
440 | struct rcu_data *rdp); | 440 | struct rcu_data *rdp); |
441 | static void rcu_preempt_offline_cpu(int cpu); | ||
442 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 441 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
442 | static void rcu_preempt_cleanup_dead_cpu(int cpu); | ||
443 | static void rcu_preempt_check_callbacks(int cpu); | 443 | static void rcu_preempt_check_callbacks(int cpu); |
444 | static void rcu_preempt_process_callbacks(void); | 444 | static void rcu_preempt_process_callbacks(void); |
445 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 445 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
@@ -448,9 +448,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
448 | bool wake); | 448 | bool wake); |
449 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 449 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ |
450 | static int rcu_preempt_pending(int cpu); | 450 | static int rcu_preempt_pending(int cpu); |
451 | static int rcu_preempt_needs_cpu(int cpu); | 451 | static int rcu_preempt_cpu_has_callbacks(int cpu); |
452 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 452 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
453 | static void rcu_preempt_send_cbs_to_online(void); | 453 | static void rcu_preempt_cleanup_dying_cpu(void); |
454 | static void __init __rcu_init_preempt(void); | 454 | static void __init __rcu_init_preempt(void); |
455 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 455 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
456 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 456 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
@@ -471,5 +471,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); | |||
471 | static void rcu_prepare_for_idle_init(int cpu); | 471 | static void rcu_prepare_for_idle_init(int cpu); |
472 | static void rcu_cleanup_after_idle(int cpu); | 472 | static void rcu_cleanup_after_idle(int cpu); |
473 | static void rcu_prepare_for_idle(int cpu); | 473 | static void rcu_prepare_for_idle(int cpu); |
474 | static void print_cpu_stall_info_begin(void); | ||
475 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | ||
476 | static void print_cpu_stall_info_end(void); | ||
477 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | ||
478 | static void increment_cpu_stall_ticks(void); | ||
474 | 479 | ||
475 | #endif /* #ifndef RCU_TREE_NONCORE */ | 480 | #endif /* #ifndef RCU_TREE_NONCORE */ |
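The new ticks_this_gp field and the increment/zero helpers declared above give the CONFIG_RCU_CPU_STALL_INFO code a per-CPU activity measure: rcu_check_callbacks() (see the increment_cpu_stall_ticks() call added earlier) bumps it on every scheduling-clock tick, and it is reset when the CPU becomes aware of a new grace period. The helper bodies are not part of the hunks shown here, so the following is only a plausible standalone sketch of that pattern, not the kernel implementation.

#include <stdio.h>

#define TOY_NR_CPUS 4

/* Per-CPU scheduling-clock ticks seen during the current grace period. */
static unsigned long toy_ticks_this_gp[TOY_NR_CPUS];

/* Called from each CPU's scheduling-clock interrupt path. */
static void toy_increment_cpu_stall_ticks(int cpu)
{
	toy_ticks_this_gp[cpu]++;
}

/* Called when a CPU notices a new grace period. */
static void toy_zero_cpu_stall_ticks(int cpu)
{
	toy_ticks_this_gp[cpu] = 0;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		toy_increment_cpu_stall_ticks(0);
	printf("cpu0 handled %lu ticks this GP\n", toy_ticks_this_gp[0]);
	toy_zero_cpu_stall_ticks(0);	/* new grace period observed */
	printf("cpu0 after new GP: %lu\n", toy_ticks_this_gp[0]);
	return 0;
}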
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8bb35d73e1f9..c023464816be 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -25,7 +25,6 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/stop_machine.h> | ||
29 | 28 | ||
30 | #define RCU_KTHREAD_PRIO 1 | 29 | #define RCU_KTHREAD_PRIO 1 |
31 | 30 | ||
@@ -63,7 +62,10 @@ static void __init rcu_bootup_announce_oddness(void) | |||
63 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); | 62 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); |
64 | #endif | 63 | #endif |
65 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | 64 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
66 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 65 | printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); |
66 | #endif | ||
67 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | ||
68 | printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); | ||
67 | #endif | 69 | #endif |
68 | #if NUM_RCU_LVL_4 != 0 | 70 | #if NUM_RCU_LVL_4 != 0 |
69 | printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); | 71 | printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); |
@@ -490,6 +492,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
490 | 492 | ||
491 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | 493 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ |
492 | 494 | ||
495 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
496 | |||
497 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
498 | { | ||
499 | printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", | ||
500 | rnp->level, rnp->grplo, rnp->grphi); | ||
501 | } | ||
502 | |||
503 | static void rcu_print_task_stall_end(void) | ||
504 | { | ||
505 | printk(KERN_CONT "\n"); | ||
506 | } | ||
507 | |||
508 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
509 | |||
510 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
511 | { | ||
512 | } | ||
513 | |||
514 | static void rcu_print_task_stall_end(void) | ||
515 | { | ||
516 | } | ||
517 | |||
518 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
519 | |||
493 | /* | 520 | /* |
494 | * Scan the current list of tasks blocked within RCU read-side critical | 521 | * Scan the current list of tasks blocked within RCU read-side critical |
495 | * sections, printing out the tid of each. | 522 | * sections, printing out the tid of each. |
@@ -501,12 +528,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
501 | 528 | ||
502 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | 529 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
503 | return 0; | 530 | return 0; |
531 | rcu_print_task_stall_begin(rnp); | ||
504 | t = list_entry(rnp->gp_tasks, | 532 | t = list_entry(rnp->gp_tasks, |
505 | struct task_struct, rcu_node_entry); | 533 | struct task_struct, rcu_node_entry); |
506 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | 534 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { |
507 | printk(" P%d", t->pid); | 535 | printk(KERN_CONT " P%d", t->pid); |
508 | ndetected++; | 536 | ndetected++; |
509 | } | 537 | } |
538 | rcu_print_task_stall_end(); | ||
510 | return ndetected; | 539 | return ndetected; |
511 | } | 540 | } |
512 | 541 | ||
@@ -581,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
581 | * absolutely necessary, but this is a good performance/complexity | 610 | * absolutely necessary, but this is a good performance/complexity |
582 | * tradeoff. | 611 | * tradeoff. |
583 | */ | 612 | */ |
584 | if (rcu_preempt_blocked_readers_cgp(rnp)) | 613 | if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) |
585 | retval |= RCU_OFL_TASKS_NORM_GP; | 614 | retval |= RCU_OFL_TASKS_NORM_GP; |
586 | if (rcu_preempted_readers_exp(rnp)) | 615 | if (rcu_preempted_readers_exp(rnp)) |
587 | retval |= RCU_OFL_TASKS_EXP_GP; | 616 | retval |= RCU_OFL_TASKS_EXP_GP; |
@@ -618,16 +647,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
618 | return retval; | 647 | return retval; |
619 | } | 648 | } |
620 | 649 | ||
650 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
651 | |||
621 | /* | 652 | /* |
622 | * Do CPU-offline processing for preemptible RCU. | 653 | * Do CPU-offline processing for preemptible RCU. |
623 | */ | 654 | */ |
624 | static void rcu_preempt_offline_cpu(int cpu) | 655 | static void rcu_preempt_cleanup_dead_cpu(int cpu) |
625 | { | 656 | { |
626 | __rcu_offline_cpu(cpu, &rcu_preempt_state); | 657 | rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); |
627 | } | 658 | } |
628 | 659 | ||
629 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
630 | |||
631 | /* | 660 | /* |
632 | * Check for a quiescent state from the current CPU. When a task blocks, | 661 | * Check for a quiescent state from the current CPU. When a task blocks, |
633 | * the task is recorded in the corresponding CPU's rcu_node structure, | 662 | * the task is recorded in the corresponding CPU's rcu_node structure, |
@@ -671,10 +700,24 @@ static void rcu_preempt_do_callbacks(void) | |||
671 | */ | 700 | */ |
672 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 701 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
673 | { | 702 | { |
674 | __call_rcu(head, func, &rcu_preempt_state); | 703 | __call_rcu(head, func, &rcu_preempt_state, 0); |
675 | } | 704 | } |
676 | EXPORT_SYMBOL_GPL(call_rcu); | 705 | EXPORT_SYMBOL_GPL(call_rcu); |
677 | 706 | ||
707 | /* | ||
708 | * Queue an RCU callback for lazy invocation after a grace period. | ||
709 | * This will likely be later named something like "call_rcu_lazy()", | ||
710 | * but this change will require some way of tagging the lazy RCU | ||
711 | * callbacks in the list of pending callbacks. Until then, this | ||
712 | * function may only be called from __kfree_rcu(). | ||
713 | */ | ||
714 | void kfree_call_rcu(struct rcu_head *head, | ||
715 | void (*func)(struct rcu_head *rcu)) | ||
716 | { | ||
717 | __call_rcu(head, func, &rcu_preempt_state, 1); | ||
718 | } | ||
719 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
720 | |||
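kfree_call_rcu() is the backend behind the kfree_rcu() convenience macro: the "function" it queues is really the byte offset of the rcu_head inside the enclosing object, which is what the __is_kfree_rcu_offset() test in __call_rcu() recognizes when it emits the kfree trace event. Typical caller-side usage looks like the fragment below; struct foo and foo_update() are made up for illustration, and a real caller would pass a proper lockdep condition to rcu_dereference_protected() rather than 1.

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical RCU-protected object. */
struct foo {
	int value;
	struct rcu_head rcu;	/* embedded so kfree_rcu() can be used */
};

static struct foo __rcu *global_foo;

/* Publish a new version and lazily free the old one. */
static void foo_update(int value)
{
	struct foo *new = kmalloc(sizeof(*new), GFP_KERNEL);
	struct foo *old;

	if (!new)
		return;
	new->value = value;
	old = rcu_dereference_protected(global_foo, 1);	/* update side only */
	rcu_assign_pointer(global_foo, new);
	if (old)
		kfree_rcu(old, rcu);	/* ends up in kfree_call_rcu() */
}

Because such callbacks only free memory, they are the ones the idle-entry code may safely defer for seconds, which is exactly why they are flagged as lazy.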
678 | /** | 721 | /** |
679 | * synchronize_rcu - wait until a grace period has elapsed. | 722 | * synchronize_rcu - wait until a grace period has elapsed. |
680 | * | 723 | * |
@@ -688,6 +731,10 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
688 | */ | 731 | */ |
689 | void synchronize_rcu(void) | 732 | void synchronize_rcu(void) |
690 | { | 733 | { |
734 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
735 | !lock_is_held(&rcu_lock_map) && | ||
736 | !lock_is_held(&rcu_sched_lock_map), | ||
737 | "Illegal synchronize_rcu() in RCU read-side critical section"); | ||
691 | if (!rcu_scheduler_active) | 738 | if (!rcu_scheduler_active) |
692 | return; | 739 | return; |
693 | wait_rcu_gp(call_rcu); | 740 | wait_rcu_gp(call_rcu); |
@@ -788,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
788 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ | 835 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ |
789 | } | 836 | } |
790 | 837 | ||
791 | /* | 838 | /** |
792 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | 839 | * synchronize_rcu_expedited - Brute-force RCU grace period |
793 | * is to invoke synchronize_sched_expedited() to push all the tasks to | 840 | * |
794 | * the ->blkd_tasks lists and wait for this list to drain. | 841 | * Wait for an RCU-preempt grace period, but expedite it. The basic |
842 | * idea is to invoke synchronize_sched_expedited() to push all the tasks to | ||
843 | * the ->blkd_tasks lists and wait for this list to drain. This consumes | ||
844 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
845 | * and is thus not recommended for any sort of common-case code. | ||
846 | * In fact, if you are using synchronize_rcu_expedited() in a loop, | ||
847 | * please restructure your code to batch your updates, and then use a | ||
848 | * single synchronize_rcu() instead. | ||
849 | * | ||
850 | * Note that it is illegal to call this function while holding any lock | ||
851 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
852 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
853 | * these restrictions will result in deadlock. | ||
795 | */ | 854 | */ |
796 | void synchronize_rcu_expedited(void) | 855 | void synchronize_rcu_expedited(void) |
797 | { | 856 | { |
@@ -869,9 +928,9 @@ static int rcu_preempt_pending(int cpu) | |||
869 | } | 928 | } |
870 | 929 | ||
871 | /* | 930 | /* |
872 | * Does preemptible RCU need the CPU to stay out of dynticks mode? | 931 | * Does preemptible RCU have callbacks on this CPU? |
873 | */ | 932 | */ |
874 | static int rcu_preempt_needs_cpu(int cpu) | 933 | static int rcu_preempt_cpu_has_callbacks(int cpu) |
875 | { | 934 | { |
876 | return !!per_cpu(rcu_preempt_data, cpu).nxtlist; | 935 | return !!per_cpu(rcu_preempt_data, cpu).nxtlist; |
877 | } | 936 | } |
@@ -894,11 +953,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
894 | } | 953 | } |
895 | 954 | ||
896 | /* | 955 | /* |
897 | * Move preemptible RCU's callbacks from dying CPU to other online CPU. | 956 | * Move preemptible RCU's callbacks from dying CPU to other online CPU |
957 | * and record a quiescent state. | ||
898 | */ | 958 | */ |
899 | static void rcu_preempt_send_cbs_to_online(void) | 959 | static void rcu_preempt_cleanup_dying_cpu(void) |
900 | { | 960 | { |
901 | rcu_send_cbs_to_online(&rcu_preempt_state); | 961 | rcu_cleanup_dying_cpu(&rcu_preempt_state); |
902 | } | 962 | } |
903 | 963 | ||
904 | /* | 964 | /* |
@@ -1034,16 +1094,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
1034 | return 0; | 1094 | return 0; |
1035 | } | 1095 | } |
1036 | 1096 | ||
1097 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1098 | |||
1037 | /* | 1099 | /* |
1038 | * Because preemptible RCU does not exist, it never needs CPU-offline | 1100 | * Because preemptible RCU does not exist, it never needs CPU-offline |
1039 | * processing. | 1101 | * processing. |
1040 | */ | 1102 | */ |
1041 | static void rcu_preempt_offline_cpu(int cpu) | 1103 | static void rcu_preempt_cleanup_dead_cpu(int cpu) |
1042 | { | 1104 | { |
1043 | } | 1105 | } |
1044 | 1106 | ||
1045 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1046 | |||
1047 | /* | 1107 | /* |
1048 | * Because preemptible RCU does not exist, it never has any callbacks | 1108 | * Because preemptible RCU does not exist, it never has any callbacks |
1049 | * to check. | 1109 | * to check. |
@@ -1061,6 +1121,22 @@ static void rcu_preempt_process_callbacks(void) | |||
1061 | } | 1121 | } |
1062 | 1122 | ||
1063 | /* | 1123 | /* |
1124 | * Queue an RCU callback for lazy invocation after a grace period. | ||
1125 | * This will likely be later named something like "call_rcu_lazy()", | ||
1126 | * but this change will require some way of tagging the lazy RCU | ||
1127 | * callbacks in the list of pending callbacks. Until then, this | ||
1128 | * function may only be called from __kfree_rcu(). | ||
1129 | * | ||
1130 | * Because there is no preemptible RCU, we use RCU-sched instead. | ||
1131 | */ | ||
1132 | void kfree_call_rcu(struct rcu_head *head, | ||
1133 | void (*func)(struct rcu_head *rcu)) | ||
1134 | { | ||
1135 | __call_rcu(head, func, &rcu_sched_state, 1); | ||
1136 | } | ||
1137 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
1138 | |||
1139 | /* | ||
1064 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 1140 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
1065 | * But because preemptible RCU does not exist, map to rcu-sched. | 1141 | * But because preemptible RCU does not exist, map to rcu-sched. |
1066 | */ | 1142 | */ |
@@ -1093,9 +1169,9 @@ static int rcu_preempt_pending(int cpu) | |||
1093 | } | 1169 | } |
1094 | 1170 | ||
1095 | /* | 1171 | /* |
1096 | * Because preemptible RCU does not exist, it never needs any CPU. | 1172 | * Because preemptible RCU does not exist, it never has callbacks |
1097 | */ | 1173 | */ |
1098 | static int rcu_preempt_needs_cpu(int cpu) | 1174 | static int rcu_preempt_cpu_has_callbacks(int cpu) |
1099 | { | 1175 | { |
1100 | return 0; | 1176 | return 0; |
1101 | } | 1177 | } |
@@ -1119,9 +1195,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
1119 | } | 1195 | } |
1120 | 1196 | ||
1121 | /* | 1197 | /* |
1122 | * Because there is no preemptible RCU, there are no callbacks to move. | 1198 | * Because there is no preemptible RCU, there is no cleanup to do. |
1123 | */ | 1199 | */ |
1124 | static void rcu_preempt_send_cbs_to_online(void) | 1200 | static void rcu_preempt_cleanup_dying_cpu(void) |
1125 | { | 1201 | { |
1126 | } | 1202 | } |
1127 | 1203 | ||
@@ -1823,132 +1899,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1823 | 1899 | ||
1824 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | 1900 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ |
1825 | 1901 | ||
1826 | #ifndef CONFIG_SMP | ||
1827 | |||
1828 | void synchronize_sched_expedited(void) | ||
1829 | { | ||
1830 | cond_resched(); | ||
1831 | } | ||
1832 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
1833 | |||
1834 | #else /* #ifndef CONFIG_SMP */ | ||
1835 | |||
1836 | static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | ||
1837 | static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | ||
1838 | |||
1839 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
1840 | { | ||
1841 | /* | ||
1842 | * There must be a full memory barrier on each affected CPU | ||
1843 | * between the time that try_stop_cpus() is called and the | ||
1844 | * time that it returns. | ||
1845 | * | ||
1846 | * In the current initial implementation of cpu_stop, the | ||
1847 | * above condition is already met when the control reaches | ||
1848 | * this point and the following smp_mb() is not strictly | ||
1849 | * necessary. Do smp_mb() anyway for documentation and | ||
1850 | * robustness against future implementation changes. | ||
1851 | */ | ||
1852 | smp_mb(); /* See above comment block. */ | ||
1853 | return 0; | ||
1854 | } | ||
1855 | |||
1856 | /* | ||
1857 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
1858 | * approach to force grace period to end quickly. This consumes | ||
1859 | * significant time on all CPUs, and is thus not recommended for | ||
1860 | * any sort of common-case code. | ||
1861 | * | ||
1862 | * Note that it is illegal to call this function while holding any | ||
1863 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
1864 | * observe this restriction will result in deadlock. | ||
1865 | * | ||
1866 | * This implementation can be thought of as an application of ticket | ||
1867 | * locking to RCU, with sync_sched_expedited_started and | ||
1868 | * sync_sched_expedited_done taking on the roles of the halves | ||
1869 | * of the ticket-lock word. Each task atomically increments | ||
1870 | * sync_sched_expedited_started upon entry, snapshotting the old value, | ||
1871 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
1872 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
1873 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
1874 | * update sync_sched_expedited_done to match our snapshot -- but | ||
1875 | * only if someone else has not already advanced past our snapshot. | ||
1876 | * | ||
1877 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
1878 | * of sync_sched_expedited_done. If it has advanced past our | ||
1879 | * initial snapshot, then someone else must have forced a grace period | ||
1880 | * some time after we took our snapshot. In this case, our work is | ||
1881 | * done for us, and we can simply return. Otherwise, we try again, | ||
1882 | * but keep our initial snapshot for purposes of checking for someone | ||
1883 | * doing our work for us. | ||
1884 | * | ||
1885 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
1886 | */ | ||
1887 | void synchronize_sched_expedited(void) | ||
1888 | { | ||
1889 | int firstsnap, s, snap, trycount = 0; | ||
1890 | |||
1891 | /* Note that atomic_inc_return() implies full memory barrier. */ | ||
1892 | firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | ||
1893 | get_online_cpus(); | ||
1894 | |||
1895 | /* | ||
1896 | * Each pass through the following loop attempts to force a | ||
1897 | * context switch on each CPU. | ||
1898 | */ | ||
1899 | while (try_stop_cpus(cpu_online_mask, | ||
1900 | synchronize_sched_expedited_cpu_stop, | ||
1901 | NULL) == -EAGAIN) { | ||
1902 | put_online_cpus(); | ||
1903 | |||
1904 | /* No joy, try again later. Or just synchronize_sched(). */ | ||
1905 | if (trycount++ < 10) | ||
1906 | udelay(trycount * num_online_cpus()); | ||
1907 | else { | ||
1908 | synchronize_sched(); | ||
1909 | return; | ||
1910 | } | ||
1911 | |||
1912 | /* Check to see if someone else did our work for us. */ | ||
1913 | s = atomic_read(&sync_sched_expedited_done); | ||
1914 | if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | ||
1915 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1916 | return; | ||
1917 | } | ||
1918 | |||
1919 | /* | ||
1920 | * Refetching sync_sched_expedited_started allows later | ||
1921 | * callers to piggyback on our grace period. We subtract | ||
1922 | * 1 to get the same token that the last incrementer got. | ||
1923 | * We retry after they started, so our grace period works | ||
1924 | * for them, and they started after our first try, so their | ||
1925 | * grace period works for us. | ||
1926 | */ | ||
1927 | get_online_cpus(); | ||
1928 | snap = atomic_read(&sync_sched_expedited_started); | ||
1929 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
1930 | } | ||
1931 | |||
1932 | /* | ||
1933 | * Everyone up to our most recent fetch is covered by our grace | ||
1934 | * period. Update the counter, but only if our work is still | ||
1935 | * relevant -- which it won't be if someone who started later | ||
1936 | * than we did beat us to the punch. | ||
1937 | */ | ||
1938 | do { | ||
1939 | s = atomic_read(&sync_sched_expedited_done); | ||
1940 | if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | ||
1941 | smp_mb(); /* ensure test happens before caller kfree */ | ||
1942 | break; | ||
1943 | } | ||
1944 | } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | ||
1945 | |||
1946 | put_online_cpus(); | ||
1947 | } | ||
1948 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
1949 | |||
1950 | #endif /* #else #ifndef CONFIG_SMP */ | ||
1951 | |||
1952 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | 1902 | #if !defined(CONFIG_RCU_FAST_NO_HZ) |
1953 | 1903 | ||
1954 | /* | 1904 | /* |
@@ -1981,7 +1931,7 @@ static void rcu_cleanup_after_idle(int cpu) | |||
1981 | } | 1931 | } |
1982 | 1932 | ||
1983 | /* | 1933 | /* |
1984 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, | 1934 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, |
1985 | * is nothing. | 1935 | * is nothing. |
1986 | */ | 1936 | */ |
1987 | static void rcu_prepare_for_idle(int cpu) | 1937 | static void rcu_prepare_for_idle(int cpu) |
@@ -2015,6 +1965,9 @@ static void rcu_prepare_for_idle(int cpu) | |||
2015 | * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your | 1965 | * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your |
2016 | * system. And if you are -that- concerned about energy efficiency, | 1966 | * system. And if you are -that- concerned about energy efficiency, |
2017 | * just power the system down and be done with it! | 1967 | * just power the system down and be done with it! |
1968 | * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is | ||
1969 | * permitted to sleep in dyntick-idle mode with only lazy RCU | ||
1970 | * callbacks pending. Setting this too high can OOM your system. | ||
2018 | * | 1971 | * |
2019 | * The values below work well in practice. If future workloads require | 1972 | * The values below work well in practice. If future workloads require |
2020 | * adjustment, they can be converted into kernel config parameters, though | 1973 | * adjustment, they can be converted into kernel config parameters, though |
@@ -2023,11 +1976,13 @@ static void rcu_prepare_for_idle(int cpu) | |||
2023 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | 1976 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ |
2024 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | 1977 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ |
2025 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1978 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1979 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | ||
2026 | 1980 | ||
2027 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 1981 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
2028 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 1982 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
2029 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | 1983 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); |
2030 | static ktime_t rcu_idle_gp_wait; | 1984 | static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ |
1985 | static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | ||
2031 | 1986 | ||
2032 | /* | 1987 | /* |
2033 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | 1988 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
@@ -2048,6 +2003,48 @@ int rcu_needs_cpu(int cpu) | |||
2048 | } | 2003 | } |
2049 | 2004 | ||
2050 | /* | 2005 | /* |
2006 | * Does the specified flavor of RCU have non-lazy callbacks pending on | ||
2007 | * the specified CPU? Both RCU flavor and CPU are specified by the | ||
2008 | * rcu_data structure. | ||
2009 | */ | ||
2010 | static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) | ||
2011 | { | ||
2012 | return rdp->qlen != rdp->qlen_lazy; | ||
2013 | } | ||
2014 | |||
2015 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
2016 | |||
2017 | /* | ||
2018 | * Are there non-lazy RCU-preempt callbacks? (There cannot be if there | ||
2019 | * is no RCU-preempt in the kernel.) | ||
2020 | */ | ||
2021 | static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) | ||
2022 | { | ||
2023 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | ||
2024 | |||
2025 | return __rcu_cpu_has_nonlazy_callbacks(rdp); | ||
2026 | } | ||
2027 | |||
2028 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
2029 | |||
2030 | static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) | ||
2031 | { | ||
2032 | return 0; | ||
2033 | } | ||
2034 | |||
2035 | #endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
2036 | |||
2037 | /* | ||
2038 | * Does any flavor of RCU have non-lazy callbacks on the specified CPU? | ||
2039 | */ | ||
2040 | static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | ||
2041 | { | ||
2042 | return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || | ||
2043 | __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || | ||
2044 | rcu_preempt_cpu_has_nonlazy_callbacks(cpu); | ||
2045 | } | ||
2046 | |||
2047 | /* | ||
2051 | * Timer handler used to force CPU to start pushing its remaining RCU | 2048 | * Timer handler used to force CPU to start pushing its remaining RCU |
2052 | * callbacks in the case where it entered dyntick-idle mode with callbacks | 2049 | * callbacks in the case where it entered dyntick-idle mode with callbacks |
2053 | * pending. The handler doesn't really need to do anything because the | 2050 | * pending. The handler doesn't really need to do anything because the |
@@ -2074,6 +2071,8 @@ static void rcu_prepare_for_idle_init(int cpu) | |||
2074 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | 2071 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); |
2075 | 2072 | ||
2076 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | 2073 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); |
2074 | upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); | ||
2075 | rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2077 | firsttime = 0; | 2076 | firsttime = 0; |
2078 | } | 2077 | } |
2079 | } | 2078 | } |
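The initialization above converts both delays from jiffies to a ktime once, on first use. With HZ=1000 (assumed here purely for illustration), RCU_IDLE_GP_DELAY = 6 jiffies becomes 6,000 µs, while RCU_IDLE_LAZY_GP_DELAY = 6 * HZ = 6000 jiffies becomes 6,000,000 µs, i.e. a six-second timer. A standalone restatement of that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define TOY_HZ 1000U					/* assumed tick rate */
#define TOY_RCU_IDLE_GP_DELAY 6U			/* jiffies: ~one grace period */
#define TOY_RCU_IDLE_LAZY_GP_DELAY (6U * TOY_HZ)	/* jiffies: ~six seconds */

static uint64_t toy_jiffies_to_ns(unsigned int j)
{
	/* jiffies -> microseconds -> nanoseconds, as the init code does */
	return (uint64_t)j * (1000000U / TOY_HZ) * 1000U;
}

int main(void)
{
	printf("non-lazy wait: %llu ns\n",
	       (unsigned long long)toy_jiffies_to_ns(TOY_RCU_IDLE_GP_DELAY));
	printf("lazy-only wait: %llu ns\n",
	       (unsigned long long)toy_jiffies_to_ns(TOY_RCU_IDLE_LAZY_GP_DELAY));
	return 0;
}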
@@ -2109,10 +2108,6 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2109 | */ | 2108 | */ |
2110 | static void rcu_prepare_for_idle(int cpu) | 2109 | static void rcu_prepare_for_idle(int cpu) |
2111 | { | 2110 | { |
2112 | unsigned long flags; | ||
2113 | |||
2114 | local_irq_save(flags); | ||
2115 | |||
2116 | /* | 2111 | /* |
2117 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2112 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2118 | * Also reset state to avoid prejudicing later attempts. | 2113 | * Also reset state to avoid prejudicing later attempts. |
@@ -2120,7 +2115,6 @@ static void rcu_prepare_for_idle(int cpu) | |||
2120 | if (!rcu_cpu_has_callbacks(cpu)) { | 2115 | if (!rcu_cpu_has_callbacks(cpu)) { |
2121 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2116 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
2122 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2117 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
2123 | local_irq_restore(flags); | ||
2124 | trace_rcu_prep_idle("No callbacks"); | 2118 | trace_rcu_prep_idle("No callbacks"); |
2125 | return; | 2119 | return; |
2126 | } | 2120 | } |
@@ -2130,7 +2124,6 @@ static void rcu_prepare_for_idle(int cpu) | |||
2130 | * refrained from disabling the scheduling-clock tick. | 2124 | * refrained from disabling the scheduling-clock tick. |
2131 | */ | 2125 | */ |
2132 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | 2126 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { |
2133 | local_irq_restore(flags); | ||
2134 | trace_rcu_prep_idle("In holdoff"); | 2127 | trace_rcu_prep_idle("In holdoff"); |
2135 | return; | 2128 | return; |
2136 | } | 2129 | } |
@@ -2140,18 +2133,22 @@ static void rcu_prepare_for_idle(int cpu) | |||
2140 | /* First time through, initialize the counter. */ | 2133 | /* First time through, initialize the counter. */ |
2141 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; | 2134 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; |
2142 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | 2135 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && |
2143 | !rcu_pending(cpu)) { | 2136 | !rcu_pending(cpu) && |
2137 | !local_softirq_pending()) { | ||
2144 | /* Can we go dyntick-idle despite still having callbacks? */ | 2138 | /* Can we go dyntick-idle despite still having callbacks? */ |
2145 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2139 | trace_rcu_prep_idle("Dyntick with callbacks"); |
2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2140 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
2147 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2141 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2148 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2142 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) |
2149 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | 2143 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), |
2144 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | ||
2145 | else | ||
2146 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | ||
2147 | rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); | ||
2150 | return; /* Nothing more to do immediately. */ | 2148 | return; /* Nothing more to do immediately. */ |
2151 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2149 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2152 | /* We have hit the limit, so time to give up. */ | 2150 | /* We have hit the limit, so time to give up. */ |
2153 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2151 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2154 | local_irq_restore(flags); | ||
2155 | trace_rcu_prep_idle("Begin holdoff"); | 2152 | trace_rcu_prep_idle("Begin holdoff"); |
2156 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | 2153 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ |
2157 | return; | 2154 | return; |
@@ -2163,23 +2160,17 @@ static void rcu_prepare_for_idle(int cpu) | |||
2163 | */ | 2160 | */ |
2164 | #ifdef CONFIG_TREE_PREEMPT_RCU | 2161 | #ifdef CONFIG_TREE_PREEMPT_RCU |
2165 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | 2162 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { |
2166 | local_irq_restore(flags); | ||
2167 | rcu_preempt_qs(cpu); | 2163 | rcu_preempt_qs(cpu); |
2168 | force_quiescent_state(&rcu_preempt_state, 0); | 2164 | force_quiescent_state(&rcu_preempt_state, 0); |
2169 | local_irq_save(flags); | ||
2170 | } | 2165 | } |
2171 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 2166 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
2172 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | 2167 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { |
2173 | local_irq_restore(flags); | ||
2174 | rcu_sched_qs(cpu); | 2168 | rcu_sched_qs(cpu); |
2175 | force_quiescent_state(&rcu_sched_state, 0); | 2169 | force_quiescent_state(&rcu_sched_state, 0); |
2176 | local_irq_save(flags); | ||
2177 | } | 2170 | } |
2178 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | 2171 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { |
2179 | local_irq_restore(flags); | ||
2180 | rcu_bh_qs(cpu); | 2172 | rcu_bh_qs(cpu); |
2181 | force_quiescent_state(&rcu_bh_state, 0); | 2173 | force_quiescent_state(&rcu_bh_state, 0); |
2182 | local_irq_save(flags); | ||
2183 | } | 2174 | } |
2184 | 2175 | ||
2185 | /* | 2176 | /* |
@@ -2187,13 +2178,124 @@ static void rcu_prepare_for_idle(int cpu) | |||
2187 | * So try forcing the callbacks through the grace period. | 2178 | * So try forcing the callbacks through the grace period. |
2188 | */ | 2179 | */ |
2189 | if (rcu_cpu_has_callbacks(cpu)) { | 2180 | if (rcu_cpu_has_callbacks(cpu)) { |
2190 | local_irq_restore(flags); | ||
2191 | trace_rcu_prep_idle("More callbacks"); | 2181 | trace_rcu_prep_idle("More callbacks"); |
2192 | invoke_rcu_core(); | 2182 | invoke_rcu_core(); |
2193 | } else { | 2183 | } else |
2194 | local_irq_restore(flags); | ||
2195 | trace_rcu_prep_idle("Callbacks drained"); | 2184 | trace_rcu_prep_idle("Callbacks drained"); |
2196 | } | ||
2197 | } | 2185 | } |
2198 | 2186 | ||
2199 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2187 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2188 | |||
2189 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
2190 | |||
2191 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
2192 | |||
2193 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
2194 | { | ||
2195 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2196 | |||
2197 | sprintf(cp, "drain=%d %c timer=%lld", | ||
2198 | per_cpu(rcu_dyntick_drain, cpu), | ||
2199 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | ||
2200 | hrtimer_active(hrtp) | ||
2201 | ? ktime_to_us(hrtimer_get_remaining(hrtp)) | ||
2202 | : -1); | ||
2203 | } | ||
2204 | |||
2205 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
2206 | |||
2207 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
2208 | { | ||
2209 | } | ||
2210 | |||
2211 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
2212 | |||
2213 | /* Initiate the stall-info list. */ | ||
2214 | static void print_cpu_stall_info_begin(void) | ||
2215 | { | ||
2216 | printk(KERN_CONT "\n"); | ||
2217 | } | ||
2218 | |||
2219 | /* | ||
2220 | * Print out diagnostic information for the specified stalled CPU. | ||
2221 | * | ||
2222 | * If the specified CPU is aware of the current RCU grace period | ||
2223 | * (flavor specified by rsp), then print the number of scheduling | ||
2224 | * clock interrupts the CPU has taken during the time that it has | ||
2225 | * been aware. Otherwise, print the number of RCU grace periods | ||
2226 | * that this CPU is ignorant of, for example, "1" if the CPU was | ||
2227 | * aware of the previous grace period. | ||
2228 | * | ||
2229 | * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. | ||
2230 | */ | ||
2231 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | ||
2232 | { | ||
2233 | char fast_no_hz[72]; | ||
2234 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2235 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
2236 | char *ticks_title; | ||
2237 | unsigned long ticks_value; | ||
2238 | |||
2239 | if (rsp->gpnum == rdp->gpnum) { | ||
2240 | ticks_title = "ticks this GP"; | ||
2241 | ticks_value = rdp->ticks_this_gp; | ||
2242 | } else { | ||
2243 | ticks_title = "GPs behind"; | ||
2244 | ticks_value = rsp->gpnum - rdp->gpnum; | ||
2245 | } | ||
2246 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | ||
2247 | printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", | ||
2248 | cpu, ticks_value, ticks_title, | ||
2249 | atomic_read(&rdtp->dynticks) & 0xfff, | ||
2250 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | ||
2251 | fast_no_hz); | ||
2252 | } | ||
2253 | |||
2254 | /* Terminate the stall-info list. */ | ||
2255 | static void print_cpu_stall_info_end(void) | ||
2256 | { | ||
2257 | printk(KERN_ERR "\t"); | ||
2258 | } | ||
2259 | |||
2260 | /* Zero ->ticks_this_gp for all flavors of RCU. */ | ||
2261 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
2262 | { | ||
2263 | rdp->ticks_this_gp = 0; | ||
2264 | } | ||
2265 | |||
2266 | /* Increment ->ticks_this_gp for all flavors of RCU. */ | ||
2267 | static void increment_cpu_stall_ticks(void) | ||
2268 | { | ||
2269 | __get_cpu_var(rcu_sched_data).ticks_this_gp++; | ||
2270 | __get_cpu_var(rcu_bh_data).ticks_this_gp++; | ||
2271 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
2272 | __get_cpu_var(rcu_preempt_data).ticks_this_gp++; | ||
2273 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
2274 | } | ||
2275 | |||
2276 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
2277 | |||
2278 | static void print_cpu_stall_info_begin(void) | ||
2279 | { | ||
2280 | printk(KERN_CONT " {"); | ||
2281 | } | ||
2282 | |||
2283 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | ||
2284 | { | ||
2285 | printk(KERN_CONT " %d", cpu); | ||
2286 | } | ||
2287 | |||
2288 | static void print_cpu_stall_info_end(void) | ||
2289 | { | ||
2290 | printk(KERN_CONT "} "); | ||
2291 | } | ||
2292 | |||
2293 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
2294 | { | ||
2295 | } | ||
2296 | |||
2297 | static void increment_cpu_stall_ticks(void) | ||
2298 | { | ||
2299 | } | ||
2300 | |||
2301 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
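The CONFIG_RCU_CPU_STALL_INFO helpers above emit one detail line per stalled CPU between a begin and an end marker, with the CONFIG_RCU_FAST_NO_HZ variant appending dyntick-idle timer state. The stand-alone userspace sketch below only models that report format: struct fake_cpu, print_fast_no_hz() and every value in them are invented stand-ins for the per-CPU state the kernel actually reads, and only the "ticks this GP" title variant is shown.

    /* Userspace model of the stall-info report format; all values invented. */
    #include <stdio.h>

    struct fake_cpu {
        int cpu;
        unsigned long ticks_this_gp;
        unsigned int dynticks;           /* low bits of the dyntick counter */
        unsigned long long nesting;
        int nmi_nesting;
        int drain;                       /* rcu_dyntick_drain */
        int in_holdoff;                  /* holdoff began this jiffy? */
        long long timer_us;              /* hrtimer time left, -1 if inactive */
    };

    static void print_fast_no_hz(char *cp, const struct fake_cpu *c)
    {
        /* mirrors print_cpu_stall_fast_no_hz(): "drain=%d %c timer=%lld" */
        sprintf(cp, "drain=%d %c timer=%lld",
                c->drain, c->in_holdoff ? 'H' : '.', c->timer_us);
    }

    int main(void)
    {
        struct fake_cpu cpus[] = {
            { 1, 6042, 0x141, 1, 0, 0, 0, 3999987 },
            { 3,   17, 0x3a0, 0, 1, 5, 1,      -1 },
        };
        char fast_no_hz[72];
        unsigned int i;

        printf("\n");                            /* print_cpu_stall_info_begin() */
        for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
            print_fast_no_hz(fast_no_hz, &cpus[i]);
            /* mirrors print_cpu_stall_info(): "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n" */
            printf("\t%d: (%lu ticks this GP) idle=%03x/%llx/%d %s\n",
                   cpus[i].cpu, cpus[i].ticks_this_gp,
                   cpus[i].dynticks & 0xfff, cpus[i].nesting,
                   cpus[i].nmi_nesting, fast_no_hz);
        }
        printf("\t\n");                          /* print_cpu_stall_info_end() */
        return 0;
    }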
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 654cfe67f0d1..ed459edeff43 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -72,9 +72,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
72 | rdp->dynticks->dynticks_nesting, | 72 | rdp->dynticks->dynticks_nesting, |
73 | rdp->dynticks->dynticks_nmi_nesting, | 73 | rdp->dynticks->dynticks_nmi_nesting, |
74 | rdp->dynticks_fqs); | 74 | rdp->dynticks_fqs); |
75 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 75 | seq_printf(m, " of=%lu", rdp->offline_fqs); |
76 | seq_printf(m, " ql=%ld qs=%c%c%c%c", | 76 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", |
77 | rdp->qlen, | 77 | rdp->qlen_lazy, rdp->qlen, |
78 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 78 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
79 | rdp->nxttail[RCU_NEXT_TAIL]], | 79 | rdp->nxttail[RCU_NEXT_TAIL]], |
80 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 80 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != |
@@ -144,8 +144,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
144 | rdp->dynticks->dynticks_nesting, | 144 | rdp->dynticks->dynticks_nesting, |
145 | rdp->dynticks->dynticks_nmi_nesting, | 145 | rdp->dynticks->dynticks_nmi_nesting, |
146 | rdp->dynticks_fqs); | 146 | rdp->dynticks_fqs); |
147 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 147 | seq_printf(m, ",%lu", rdp->offline_fqs); |
148 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, | 148 | seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, |
149 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 149 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
150 | rdp->nxttail[RCU_NEXT_TAIL]], | 150 | rdp->nxttail[RCU_NEXT_TAIL]], |
151 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 151 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != |
@@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
168 | { | 168 | { |
169 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); | 169 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); |
170 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 170 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
171 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); | 171 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); |
172 | #ifdef CONFIG_RCU_BOOST | 172 | #ifdef CONFIG_RCU_BOOST |
173 | seq_puts(m, "\"kt\",\"ktl\""); | 173 | seq_puts(m, "\"kt\",\"ktl\""); |
174 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 174 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
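The per-CPU trace line now carries `ql=<lazy>/<total>` instead of a single count, and the `qs=` flags are produced with the `".N"[cond]` idiom: indexing a two-character string literal with a 0-or-1 comparison result picks either '.' or the flag letter. A minimal stand-alone illustration; the counts, the tail-pointer comparisons and the last two flag letters are made up for the example:

    #include <stdio.h>

    int main(void)
    {
        long qlen_lazy = 5, qlen = 12;                /* made-up callback counts */
        int next = 1, wait = 0, ready = 1, done = 1;  /* fake tail-pointer tests */

        /* ".N"[0] is '.', ".N"[1] is 'N'; the other flags work the same way */
        printf(" ql=%ld/%ld qs=%c%c%c%c\n",
               qlen_lazy, qlen,
               ".N"[next], ".R"[wait], ".W"[ready], ".D"[done]);
        return 0;
    }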
diff --git a/kernel/resource.c b/kernel/resource.c index 7640b3a947d0..7e8ea66a8c01 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t | |||
749 | write_unlock(&resource_lock); | 749 | write_unlock(&resource_lock); |
750 | return result; | 750 | return result; |
751 | } | 751 | } |
752 | EXPORT_SYMBOL(adjust_resource); | ||
752 | 753 | ||
753 | static void __init __reserve_region_with_split(struct resource *root, | 754 | static void __init __reserve_region_with_split(struct resource *root, |
754 | resource_size_t start, resource_size_t end, | 755 | resource_size_t start, resource_size_t end, |
@@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root, | |||
792 | write_unlock(&resource_lock); | 793 | write_unlock(&resource_lock); |
793 | } | 794 | } |
794 | 795 | ||
795 | EXPORT_SYMBOL(adjust_resource); | ||
796 | |||
797 | /** | 796 | /** |
798 | * resource_alignment - calculate resource's alignment | 797 | * resource_alignment - calculate resource's alignment |
799 | * @res: resource pointer | 798 | * @res: resource pointer |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b152f74f02de..6850f53e02d8 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/export.h> | 10 | #include <linux/export.h> |
11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
12 | 12 | ||
13 | #include <asm/system.h> | ||
14 | #include <linux/atomic.h> | 13 | #include <linux/atomic.h> |
15 | 14 | ||
16 | /* | 15 | /* |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e7..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
@@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); | |||
195 | 195 | ||
196 | #ifdef CONFIG_PROC_FS | 196 | #ifdef CONFIG_PROC_FS |
197 | 197 | ||
198 | int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | 198 | int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) |
199 | { | 199 | { |
200 | static unsigned long next = INITIAL_JIFFIES; | 200 | static unsigned long next = INITIAL_JIFFIES; |
201 | struct autogroup *ag; | 201 | struct autogroup *ag; |
202 | int err; | 202 | int err; |
203 | 203 | ||
204 | if (*nice < -20 || *nice > 19) | 204 | if (nice < -20 || nice > 19) |
205 | return -EINVAL; | 205 | return -EINVAL; |
206 | 206 | ||
207 | err = security_task_setnice(current, *nice); | 207 | err = security_task_setnice(current, nice); |
208 | if (err) | 208 | if (err) |
209 | return err; | 209 | return err; |
210 | 210 | ||
211 | if (*nice < 0 && !can_nice(current, *nice)) | 211 | if (nice < 0 && !can_nice(current, nice)) |
212 | return -EPERM; | 212 | return -EPERM; |
213 | 213 | ||
214 | /* this is a heavy operation taking global locks.. */ | 214 | /* this is a heavy operation taking global locks.. */ |
@@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) | |||
219 | ag = autogroup_task_get(p); | 219 | ag = autogroup_task_get(p); |
220 | 220 | ||
221 | down_write(&ag->lock); | 221 | down_write(&ag->lock); |
222 | err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); | 222 | err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); |
223 | if (!err) | 223 | if (!err) |
224 | ag->nice = *nice; | 224 | ag->nice = nice; |
225 | up_write(&ag->lock); | 225 | up_write(&ag->lock); |
226 | 226 | ||
227 | autogroup_kref_put(ag); | 227 | autogroup_kref_put(ag); |
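With the pointer indirection gone, the nice value is range-checked directly and used to index the 40-entry weight table at nice + 20. A stand-alone sketch of that mapping: demo_weight[] and demo_set_autogroup_nice() are invented names, only a few table entries are filled in for illustration, and -1 stands in for -EINVAL.

    #include <stdio.h>

    /* Illustrative subset of the 40-entry weight table; index = nice + 20. */
    static const unsigned long demo_weight[40] = {
        [0]  = 88761,        /* nice -20, heaviest */
        [20] = 1024,         /* nice   0, the reference weight */
        [39] = 15,           /* nice  19, lightest */
    };

    static int demo_set_autogroup_nice(int nice)
    {
        if (nice < -20 || nice > 19)
            return -1;                       /* the kernel returns -EINVAL */
        printf("nice %d -> weight %lu\n", nice, demo_weight[nice + 20]);
        return 0;
    }

    int main(void)
    {
        demo_set_autogroup_nice(0);
        demo_set_autogroup_nice(-20);
        demo_set_autogroup_nice(25);         /* rejected: out of range */
        return 0;
    }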
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..afc6d7e71557 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -71,7 +71,9 @@ | |||
71 | #include <linux/ftrace.h> | 71 | #include <linux/ftrace.h> |
72 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
73 | #include <linux/init_task.h> | 73 | #include <linux/init_task.h> |
74 | #include <linux/binfmts.h> | ||
74 | 75 | ||
76 | #include <asm/switch_to.h> | ||
75 | #include <asm/tlb.h> | 77 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 78 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | 79 | #include <asm/mutex.h> |
@@ -162,13 +164,13 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
162 | 164 | ||
163 | #ifdef HAVE_JUMP_LABEL | 165 | #ifdef HAVE_JUMP_LABEL |
164 | 166 | ||
165 | #define jump_label_key__true jump_label_key_enabled | 167 | #define jump_label_key__true STATIC_KEY_INIT_TRUE |
166 | #define jump_label_key__false jump_label_key_disabled | 168 | #define jump_label_key__false STATIC_KEY_INIT_FALSE |
167 | 169 | ||
168 | #define SCHED_FEAT(name, enabled) \ | 170 | #define SCHED_FEAT(name, enabled) \ |
169 | jump_label_key__##enabled , | 171 | jump_label_key__##enabled , |
170 | 172 | ||
171 | struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { | 173 | struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { |
172 | #include "features.h" | 174 | #include "features.h" |
173 | }; | 175 | }; |
174 | 176 | ||
@@ -176,14 +178,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { | |||
176 | 178 | ||
177 | static void sched_feat_disable(int i) | 179 | static void sched_feat_disable(int i) |
178 | { | 180 | { |
179 | if (jump_label_enabled(&sched_feat_keys[i])) | 181 | if (static_key_enabled(&sched_feat_keys[i])) |
180 | jump_label_dec(&sched_feat_keys[i]); | 182 | static_key_slow_dec(&sched_feat_keys[i]); |
181 | } | 183 | } |
182 | 184 | ||
183 | static void sched_feat_enable(int i) | 185 | static void sched_feat_enable(int i) |
184 | { | 186 | { |
185 | if (!jump_label_enabled(&sched_feat_keys[i])) | 187 | if (!static_key_enabled(&sched_feat_keys[i])) |
186 | jump_label_inc(&sched_feat_keys[i]); | 188 | static_key_slow_inc(&sched_feat_keys[i]); |
187 | } | 189 | } |
188 | #else | 190 | #else |
189 | static void sched_feat_disable(int i) { }; | 191 | static void sched_feat_disable(int i) { }; |
@@ -894,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
894 | delta -= irq_delta; | 896 | delta -= irq_delta; |
895 | #endif | 897 | #endif |
896 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | 898 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING |
897 | if (static_branch((¶virt_steal_rq_enabled))) { | 899 | if (static_key_false((¶virt_steal_rq_enabled))) { |
898 | u64 st; | 900 | u64 st; |
899 | 901 | ||
900 | steal = paravirt_steal_clock(cpu_of(rq)); | 902 | steal = paravirt_steal_clock(cpu_of(rq)); |
@@ -1263,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1263 | */ | 1265 | */ |
1264 | static int select_fallback_rq(int cpu, struct task_struct *p) | 1266 | static int select_fallback_rq(int cpu, struct task_struct *p) |
1265 | { | 1267 | { |
1266 | int dest_cpu; | ||
1267 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); | 1268 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); |
1269 | enum { cpuset, possible, fail } state = cpuset; | ||
1270 | int dest_cpu; | ||
1268 | 1271 | ||
1269 | /* Look for allowed, online CPU in same node. */ | 1272 | /* Look for allowed, online CPU in same node. */ |
1270 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 1273 | for_each_cpu(dest_cpu, nodemask) { |
1274 | if (!cpu_online(dest_cpu)) | ||
1275 | continue; | ||
1276 | if (!cpu_active(dest_cpu)) | ||
1277 | continue; | ||
1271 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 1278 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
1272 | return dest_cpu; | 1279 | return dest_cpu; |
1280 | } | ||
1273 | 1281 | ||
1274 | /* Any allowed, online CPU? */ | 1282 | for (;;) { |
1275 | dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); | 1283 | /* Any allowed, online CPU? */ |
1276 | if (dest_cpu < nr_cpu_ids) | 1284 | for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { |
1277 | return dest_cpu; | 1285 | if (!cpu_online(dest_cpu)) |
1286 | continue; | ||
1287 | if (!cpu_active(dest_cpu)) | ||
1288 | continue; | ||
1289 | goto out; | ||
1290 | } | ||
1278 | 1291 | ||
1279 | /* No more Mr. Nice Guy. */ | 1292 | switch (state) { |
1280 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 1293 | case cpuset: |
1281 | /* | 1294 | /* No more Mr. Nice Guy. */ |
1282 | * Don't tell them about moving exiting tasks or | 1295 | cpuset_cpus_allowed_fallback(p); |
1283 | * kernel threads (both mm NULL), since they never | 1296 | state = possible; |
1284 | * leave kernel. | 1297 | break; |
1285 | */ | 1298 | |
1286 | if (p->mm && printk_ratelimit()) { | 1299 | case possible: |
1287 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", | 1300 | do_set_cpus_allowed(p, cpu_possible_mask); |
1288 | task_pid_nr(p), p->comm, cpu); | 1301 | state = fail; |
1302 | break; | ||
1303 | |||
1304 | case fail: | ||
1305 | BUG(); | ||
1306 | break; | ||
1307 | } | ||
1308 | } | ||
1309 | |||
1310 | out: | ||
1311 | if (state != cpuset) { | ||
1312 | /* | ||
1313 | * Don't tell them about moving exiting tasks or | ||
1314 | * kernel threads (both mm NULL), since they never | ||
1315 | * leave kernel. | ||
1316 | */ | ||
1317 | if (p->mm && printk_ratelimit()) { | ||
1318 | printk_sched("process %d (%s) no longer affine to cpu%d\n", | ||
1319 | task_pid_nr(p), p->comm, cpu); | ||
1320 | } | ||
1289 | } | 1321 | } |
1290 | 1322 | ||
1291 | return dest_cpu; | 1323 | return dest_cpu; |
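The rewritten fallback is a small escalation machine: scan the task's affinity for an online, active CPU; if that fails, let the cpuset fallback widen the mask; if that still fails, allow every possible CPU; only then BUG(). A simplified userspace model of that loop -- the bitmask "cpumasks", pick_cpu() and the specific widening values are invented for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    enum fb_state { FB_CPUSET, FB_POSSIBLE, FB_FAIL };

    static unsigned int online  = 0x0f;   /* CPUs 0-3 online and active */
    static unsigned int allowed = 0x30;   /* task's mask only covers CPUs 4-5 */

    static int pick_cpu(unsigned int mask)
    {
        int cpu;

        for (cpu = 0; cpu < 8; cpu++)
            if ((mask & online) & (1u << cpu))
                return cpu;
        return -1;
    }

    int main(void)
    {
        enum fb_state state = FB_CPUSET;
        int cpu;

        for (;;) {
            cpu = pick_cpu(allowed);
            if (cpu >= 0)
                break;

            switch (state) {
            case FB_CPUSET:         /* "No more Mr. Nice Guy." */
                allowed = 0x03;     /* pretend the cpuset fallback gave CPUs 0-1 */
                state = FB_POSSIBLE;
                break;
            case FB_POSSIBLE:
                allowed = 0xff;     /* do_set_cpus_allowed(p, cpu_possible_mask) */
                state = FB_FAIL;
                break;
            case FB_FAIL:
                abort();            /* the kernel BUG()s: nowhere left to run */
            }
        }

        printf("fallback CPU: %d%s\n", cpu,
               state != FB_CPUSET ? " (affinity was widened)" : "");
        return 0;
    }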
@@ -1507,7 +1539,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | |||
1507 | } | 1539 | } |
1508 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1540 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
1509 | 1541 | ||
1510 | static inline int ttwu_share_cache(int this_cpu, int that_cpu) | 1542 | bool cpus_share_cache(int this_cpu, int that_cpu) |
1511 | { | 1543 | { |
1512 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1544 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
1513 | } | 1545 | } |
@@ -1518,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
1518 | struct rq *rq = cpu_rq(cpu); | 1550 | struct rq *rq = cpu_rq(cpu); |
1519 | 1551 | ||
1520 | #if defined(CONFIG_SMP) | 1552 | #if defined(CONFIG_SMP) |
1521 | if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { | 1553 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { |
1522 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1554 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ |
1523 | ttwu_queue_remote(p, cpu); | 1555 | ttwu_queue_remote(p, cpu); |
1524 | return; | 1556 | return; |
@@ -1932,7 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1932 | local_irq_enable(); | 1964 | local_irq_enable(); |
1933 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1965 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
1934 | finish_lock_switch(rq, prev); | 1966 | finish_lock_switch(rq, prev); |
1935 | trace_sched_stat_sleeptime(current, rq->clock); | 1967 | finish_arch_post_lock_switch(); |
1936 | 1968 | ||
1937 | fire_sched_in_preempt_notifiers(current); | 1969 | fire_sched_in_preempt_notifiers(current); |
1938 | if (mm) | 1970 | if (mm) |
@@ -2267,13 +2299,10 @@ calc_load_n(unsigned long load, unsigned long exp, | |||
2267 | * Once we've updated the global active value, we need to apply the exponential | 2299 | * Once we've updated the global active value, we need to apply the exponential |
2268 | * weights adjusted to the number of cycles missed. | 2300 | * weights adjusted to the number of cycles missed. |
2269 | */ | 2301 | */ |
2270 | static void calc_global_nohz(unsigned long ticks) | 2302 | static void calc_global_nohz(void) |
2271 | { | 2303 | { |
2272 | long delta, active, n; | 2304 | long delta, active, n; |
2273 | 2305 | ||
2274 | if (time_before(jiffies, calc_load_update)) | ||
2275 | return; | ||
2276 | |||
2277 | /* | 2306 | /* |
2278 | * If we crossed a calc_load_update boundary, make sure to fold | 2307 | * If we crossed a calc_load_update boundary, make sure to fold |
2279 | * any pending idle changes, the respective CPUs might have | 2308 | * any pending idle changes, the respective CPUs might have |
@@ -2285,31 +2314,25 @@ static void calc_global_nohz(unsigned long ticks) | |||
2285 | atomic_long_add(delta, &calc_load_tasks); | 2314 | atomic_long_add(delta, &calc_load_tasks); |
2286 | 2315 | ||
2287 | /* | 2316 | /* |
2288 | * If we were idle for multiple load cycles, apply them. | 2317 | * It could be the one fold was all it took, we done! |
2289 | */ | 2318 | */ |
2290 | if (ticks >= LOAD_FREQ) { | 2319 | if (time_before(jiffies, calc_load_update + 10)) |
2291 | n = ticks / LOAD_FREQ; | 2320 | return; |
2292 | 2321 | ||
2293 | active = atomic_long_read(&calc_load_tasks); | 2322 | /* |
2294 | active = active > 0 ? active * FIXED_1 : 0; | 2323 | * Catch-up, fold however many we are behind still |
2324 | */ | ||
2325 | delta = jiffies - calc_load_update - 10; | ||
2326 | n = 1 + (delta / LOAD_FREQ); | ||
2295 | 2327 | ||
2296 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 2328 | active = atomic_long_read(&calc_load_tasks); |
2297 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 2329 | active = active > 0 ? active * FIXED_1 : 0; |
2298 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2299 | 2330 | ||
2300 | calc_load_update += n * LOAD_FREQ; | 2331 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
2301 | } | 2332 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
2333 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2302 | 2334 | ||
2303 | /* | 2335 | calc_load_update += n * LOAD_FREQ; |
2304 | * Its possible the remainder of the above division also crosses | ||
2305 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
2306 | * which comes after this will take care of that. | ||
2307 | * | ||
2308 | * Consider us being 11 ticks before a cycle completion, and us | ||
2309 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
2310 | * age us 4 cycles, and the test in calc_global_load() will | ||
2311 | * pick up the final one. | ||
2312 | */ | ||
2313 | } | 2336 | } |
2314 | #else | 2337 | #else |
2315 | void calc_load_account_idle(struct rq *this_rq) | 2338 | void calc_load_account_idle(struct rq *this_rq) |
@@ -2321,7 +2344,7 @@ static inline long calc_load_fold_idle(void) | |||
2321 | return 0; | 2344 | return 0; |
2322 | } | 2345 | } |
2323 | 2346 | ||
2324 | static void calc_global_nohz(unsigned long ticks) | 2347 | static void calc_global_nohz(void) |
2325 | { | 2348 | { |
2326 | } | 2349 | } |
2327 | #endif | 2350 | #endif |
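The catch-up step is straightforward fixed-point arithmetic: measure how far jiffies has run past calc_load_update, derive the number of missed sample periods as n = 1 + delta/LOAD_FREQ, and age all three averages by n periods (calc_load_n() does this without a loop via fixed-point exponentiation). The stand-alone sketch below uses the usual FSHIFT/EXP_* constants but made-up jiffies values and simplified rounding:

    #include <stdio.h>

    #define FSHIFT    11
    #define FIXED_1   (1UL << FSHIFT)
    #define EXP_1     1884          /* 1/exp(5sec/1min) in fixed point */
    #define EXP_5     2014
    #define EXP_15    2037
    #define HZ        1000
    #define LOAD_FREQ (5*HZ + 1)

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
        /* avenrun update in 11-bit fixed point (rounding details omitted) */
        return (load * exp + active * (FIXED_1 - exp)) >> FSHIFT;
    }

    int main(void)
    {
        unsigned long avenrun[3] = { 2 * FIXED_1, 1 * FIXED_1, 1 * FIXED_1 };
        unsigned long jiffies = 100000, calc_load_update = 60000;
        unsigned long active = 0;               /* the machine slept fully idle */
        unsigned long delta, n, i;

        /* calc_global_nohz(): catch up on the periods slept through */
        delta = jiffies - calc_load_update - 10;
        n = 1 + delta / LOAD_FREQ;

        for (i = 0; i < n; i++) {               /* stands in for calc_load_n() */
            avenrun[0] = calc_load(avenrun[0], EXP_1,  active);
            avenrun[1] = calc_load(avenrun[1], EXP_5,  active);
            avenrun[2] = calc_load(avenrun[2], EXP_15, active);
        }
        calc_load_update += n * LOAD_FREQ;

        printf("missed %lu periods; loadavg %lu.%02lu %lu.%02lu %lu.%02lu\n", n,
               avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
               avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
               avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
    }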
@@ -2349,8 +2372,6 @@ void calc_global_load(unsigned long ticks) | |||
2349 | { | 2372 | { |
2350 | long active; | 2373 | long active; |
2351 | 2374 | ||
2352 | calc_global_nohz(ticks); | ||
2353 | |||
2354 | if (time_before(jiffies, calc_load_update + 10)) | 2375 | if (time_before(jiffies, calc_load_update + 10)) |
2355 | return; | 2376 | return; |
2356 | 2377 | ||
@@ -2362,6 +2383,16 @@ void calc_global_load(unsigned long ticks) | |||
2362 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | 2383 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); |
2363 | 2384 | ||
2364 | calc_load_update += LOAD_FREQ; | 2385 | calc_load_update += LOAD_FREQ; |
2386 | |||
2387 | /* | ||
2388 | * Account one period with whatever state we found before | ||
2389 | * folding in the nohz state and ageing the entire idle period. | ||
2390 | * | ||
2391 | * This avoids losing a sample when we go idle between | ||
2392 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
2393 | * under-accounting. | ||
2394 | */ | ||
2395 | calc_global_nohz(); | ||
2365 | } | 2396 | } |
2366 | 2397 | ||
2367 | /* | 2398 | /* |
@@ -2756,7 +2787,7 @@ void account_idle_time(cputime_t cputime) | |||
2756 | static __always_inline bool steal_account_process_tick(void) | 2787 | static __always_inline bool steal_account_process_tick(void) |
2757 | { | 2788 | { |
2758 | #ifdef CONFIG_PARAVIRT | 2789 | #ifdef CONFIG_PARAVIRT |
2759 | if (static_branch(¶virt_steal_enabled)) { | 2790 | if (static_key_false(¶virt_steal_enabled)) { |
2760 | u64 steal, st = 0; | 2791 | u64 steal, st = 0; |
2761 | 2792 | ||
2762 | steal = paravirt_steal_clock(smp_processor_id()); | 2793 | steal = paravirt_steal_clock(smp_processor_id()); |
@@ -3071,8 +3102,6 @@ EXPORT_SYMBOL(sub_preempt_count); | |||
3071 | */ | 3102 | */ |
3072 | static noinline void __schedule_bug(struct task_struct *prev) | 3103 | static noinline void __schedule_bug(struct task_struct *prev) |
3073 | { | 3104 | { |
3074 | struct pt_regs *regs = get_irq_regs(); | ||
3075 | |||
3076 | if (oops_in_progress) | 3105 | if (oops_in_progress) |
3077 | return; | 3106 | return; |
3078 | 3107 | ||
@@ -3083,11 +3112,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3083 | print_modules(); | 3112 | print_modules(); |
3084 | if (irqs_disabled()) | 3113 | if (irqs_disabled()) |
3085 | print_irqtrace_events(prev); | 3114 | print_irqtrace_events(prev); |
3086 | 3115 | dump_stack(); | |
3087 | if (regs) | ||
3088 | show_regs(regs); | ||
3089 | else | ||
3090 | dump_stack(); | ||
3091 | } | 3116 | } |
3092 | 3117 | ||
3093 | /* | 3118 | /* |
@@ -3221,14 +3246,14 @@ need_resched: | |||
3221 | 3246 | ||
3222 | post_schedule(rq); | 3247 | post_schedule(rq); |
3223 | 3248 | ||
3224 | preempt_enable_no_resched(); | 3249 | sched_preempt_enable_no_resched(); |
3225 | if (need_resched()) | 3250 | if (need_resched()) |
3226 | goto need_resched; | 3251 | goto need_resched; |
3227 | } | 3252 | } |
3228 | 3253 | ||
3229 | static inline void sched_submit_work(struct task_struct *tsk) | 3254 | static inline void sched_submit_work(struct task_struct *tsk) |
3230 | { | 3255 | { |
3231 | if (!tsk->state) | 3256 | if (!tsk->state || tsk_is_pi_blocked(tsk)) |
3232 | return; | 3257 | return; |
3233 | /* | 3258 | /* |
3234 | * If we are going to sleep and we have plugged IO queued, | 3259 | * If we are going to sleep and we have plugged IO queued, |
@@ -3247,6 +3272,18 @@ asmlinkage void __sched schedule(void) | |||
3247 | } | 3272 | } |
3248 | EXPORT_SYMBOL(schedule); | 3273 | EXPORT_SYMBOL(schedule); |
3249 | 3274 | ||
3275 | /** | ||
3276 | * schedule_preempt_disabled - called with preemption disabled | ||
3277 | * | ||
3278 | * Returns with preemption disabled. Note: preempt_count must be 1 | ||
3279 | */ | ||
3280 | void __sched schedule_preempt_disabled(void) | ||
3281 | { | ||
3282 | sched_preempt_enable_no_resched(); | ||
3283 | schedule(); | ||
3284 | preempt_disable(); | ||
3285 | } | ||
3286 | |||
3250 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 3287 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
3251 | 3288 | ||
3252 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | 3289 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
@@ -3407,9 +3444,9 @@ EXPORT_SYMBOL(__wake_up); | |||
3407 | /* | 3444 | /* |
3408 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 3445 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
3409 | */ | 3446 | */ |
3410 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | 3447 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) |
3411 | { | 3448 | { |
3412 | __wake_up_common(q, mode, 1, 0, NULL); | 3449 | __wake_up_common(q, mode, nr, 0, NULL); |
3413 | } | 3450 | } |
3414 | EXPORT_SYMBOL_GPL(__wake_up_locked); | 3451 | EXPORT_SYMBOL_GPL(__wake_up_locked); |
3415 | 3452 | ||
@@ -3768,6 +3805,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3768 | 3805 | ||
3769 | rq = __task_rq_lock(p); | 3806 | rq = __task_rq_lock(p); |
3770 | 3807 | ||
3808 | /* | ||
3809 | * Idle task boosting is a nono in general. There is one | ||
3810 | * exception, when PREEMPT_RT and NOHZ is active: | ||
3811 | * | ||
3812 | * The idle task calls get_next_timer_interrupt() and holds | ||
3813 | * the timer wheel base->lock on the CPU and another CPU wants | ||
3814 | * to access the timer (probably to cancel it). We can safely | ||
3815 | * ignore the boosting request, as the idle CPU runs this code | ||
3816 | * with interrupts disabled and will complete the lock | ||
3817 | * protected section without being interrupted. So there is no | ||
3818 | * real need to boost. | ||
3819 | */ | ||
3820 | if (unlikely(p == rq->idle)) { | ||
3821 | WARN_ON(p != rq->curr); | ||
3822 | WARN_ON(p->pi_blocked_on); | ||
3823 | goto out_unlock; | ||
3824 | } | ||
3825 | |||
3771 | trace_sched_pi_setprio(p, prio); | 3826 | trace_sched_pi_setprio(p, prio); |
3772 | oldprio = p->prio; | 3827 | oldprio = p->prio; |
3773 | prev_class = p->sched_class; | 3828 | prev_class = p->sched_class; |
@@ -3791,11 +3846,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3791 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 3846 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
3792 | 3847 | ||
3793 | check_class_changed(rq, p, prev_class, oldprio); | 3848 | check_class_changed(rq, p, prev_class, oldprio); |
3849 | out_unlock: | ||
3794 | __task_rq_unlock(rq); | 3850 | __task_rq_unlock(rq); |
3795 | } | 3851 | } |
3796 | |||
3797 | #endif | 3852 | #endif |
3798 | |||
3799 | void set_user_nice(struct task_struct *p, long nice) | 3853 | void set_user_nice(struct task_struct *p, long nice) |
3800 | { | 3854 | { |
3801 | int old_prio, delta, on_rq; | 3855 | int old_prio, delta, on_rq; |
@@ -4475,7 +4529,7 @@ SYSCALL_DEFINE0(sched_yield) | |||
4475 | __release(rq->lock); | 4529 | __release(rq->lock); |
4476 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 4530 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
4477 | do_raw_spin_unlock(&rq->lock); | 4531 | do_raw_spin_unlock(&rq->lock); |
4478 | preempt_enable_no_resched(); | 4532 | sched_preempt_enable_no_resched(); |
4479 | 4533 | ||
4480 | schedule(); | 4534 | schedule(); |
4481 | 4535 | ||
@@ -4549,8 +4603,24 @@ EXPORT_SYMBOL(__cond_resched_softirq); | |||
4549 | /** | 4603 | /** |
4550 | * yield - yield the current processor to other threads. | 4604 | * yield - yield the current processor to other threads. |
4551 | * | 4605 | * |
4552 | * This is a shortcut for kernel-space yielding - it marks the | 4606 | * Do not ever use this function, there's a 99% chance you're doing it wrong. |
4553 | * thread runnable and calls sys_sched_yield(). | 4607 | * |
4608 | * The scheduler is at all times free to pick the calling task as the most | ||
4609 | * eligible task to run, if removing the yield() call from your code breaks | ||
4610 | * it, it's already broken. | ||
4611 | * | ||
4612 | * Typical broken usage is: | ||
4613 | * | ||
4614 | * while (!event) | ||
4615 | * yield(); | ||
4616 | * | ||
4617 | * where one assumes that yield() will let 'the other' process run that will | ||
4618 | * make event true. If the current task is a SCHED_FIFO task that will never | ||
4619 | * happen. Never use yield() as a progress guarantee!! | ||
4620 | * | ||
4621 | * If you want to use yield() to wait for something, use wait_event(). | ||
4622 | * If you want to use yield() to be 'nice' for others, use cond_resched(). | ||
4623 | * If you still want to use yield(), do not! | ||
4554 | */ | 4624 | */ |
4555 | void __sched yield(void) | 4625 | void __sched yield(void) |
4556 | { | 4626 | { |
@@ -5382,7 +5452,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | |||
5382 | unsigned long action, void *hcpu) | 5452 | unsigned long action, void *hcpu) |
5383 | { | 5453 | { |
5384 | switch (action & ~CPU_TASKS_FROZEN) { | 5454 | switch (action & ~CPU_TASKS_FROZEN) { |
5385 | case CPU_ONLINE: | 5455 | case CPU_STARTING: |
5386 | case CPU_DOWN_FAILED: | 5456 | case CPU_DOWN_FAILED: |
5387 | set_cpu_active((long)hcpu, true); | 5457 | set_cpu_active((long)hcpu, true); |
5388 | return NOTIFY_OK; | 5458 | return NOTIFY_OK; |
@@ -5754,7 +5824,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5754 | * | 5824 | * |
5755 | * Also keep a unique ID per domain (we use the first cpu number in | 5825 | * Also keep a unique ID per domain (we use the first cpu number in |
5756 | * the cpumask of the domain), this allows us to quickly tell if | 5826 | * the cpumask of the domain), this allows us to quickly tell if |
5757 | * two cpus are in the same cache domain, see ttwu_share_cache(). | 5827 | * two cpus are in the same cache domain, see cpus_share_cache(). |
5758 | */ | 5828 | */ |
5759 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5829 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5760 | DEFINE_PER_CPU(int, sd_llc_id); | 5830 | DEFINE_PER_CPU(int, sd_llc_id); |
@@ -6931,6 +7001,9 @@ void __init sched_init(void) | |||
6931 | rq->online = 0; | 7001 | rq->online = 0; |
6932 | rq->idle_stamp = 0; | 7002 | rq->idle_stamp = 0; |
6933 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 7003 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
7004 | |||
7005 | INIT_LIST_HEAD(&rq->cfs_tasks); | ||
7006 | |||
6934 | rq_attach_root(rq, &def_root_domain); | 7007 | rq_attach_root(rq, &def_root_domain); |
6935 | #ifdef CONFIG_NO_HZ | 7008 | #ifdef CONFIG_NO_HZ |
6936 | rq->nohz_flags = 0; | 7009 | rq->nohz_flags = 0; |
@@ -7525,8 +7598,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | |||
7525 | struct task_group, css); | 7598 | struct task_group, css); |
7526 | } | 7599 | } |
7527 | 7600 | ||
7528 | static struct cgroup_subsys_state * | 7601 | static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) |
7529 | cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
7530 | { | 7602 | { |
7531 | struct task_group *tg, *parent; | 7603 | struct task_group *tg, *parent; |
7532 | 7604 | ||
@@ -7543,15 +7615,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
7543 | return &tg->css; | 7615 | return &tg->css; |
7544 | } | 7616 | } |
7545 | 7617 | ||
7546 | static void | 7618 | static void cpu_cgroup_destroy(struct cgroup *cgrp) |
7547 | cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
7548 | { | 7619 | { |
7549 | struct task_group *tg = cgroup_tg(cgrp); | 7620 | struct task_group *tg = cgroup_tg(cgrp); |
7550 | 7621 | ||
7551 | sched_destroy_group(tg); | 7622 | sched_destroy_group(tg); |
7552 | } | 7623 | } |
7553 | 7624 | ||
7554 | static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7625 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, |
7555 | struct cgroup_taskset *tset) | 7626 | struct cgroup_taskset *tset) |
7556 | { | 7627 | { |
7557 | struct task_struct *task; | 7628 | struct task_struct *task; |
@@ -7569,7 +7640,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7569 | return 0; | 7640 | return 0; |
7570 | } | 7641 | } |
7571 | 7642 | ||
7572 | static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7643 | static void cpu_cgroup_attach(struct cgroup *cgrp, |
7573 | struct cgroup_taskset *tset) | 7644 | struct cgroup_taskset *tset) |
7574 | { | 7645 | { |
7575 | struct task_struct *task; | 7646 | struct task_struct *task; |
@@ -7579,8 +7650,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7579 | } | 7650 | } |
7580 | 7651 | ||
7581 | static void | 7652 | static void |
7582 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7653 | cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, |
7583 | struct cgroup *old_cgrp, struct task_struct *task) | 7654 | struct task_struct *task) |
7584 | { | 7655 | { |
7585 | /* | 7656 | /* |
7586 | * cgroup_exit() is called in the copy_process() failure path. | 7657 | * cgroup_exit() is called in the copy_process() failure path. |
@@ -7899,13 +7970,9 @@ static struct cftype cpu_files[] = { | |||
7899 | .write_u64 = cpu_rt_period_write_uint, | 7970 | .write_u64 = cpu_rt_period_write_uint, |
7900 | }, | 7971 | }, |
7901 | #endif | 7972 | #endif |
7973 | { } /* terminate */ | ||
7902 | }; | 7974 | }; |
7903 | 7975 | ||
7904 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7905 | { | ||
7906 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7907 | } | ||
7908 | |||
7909 | struct cgroup_subsys cpu_cgroup_subsys = { | 7976 | struct cgroup_subsys cpu_cgroup_subsys = { |
7910 | .name = "cpu", | 7977 | .name = "cpu", |
7911 | .create = cpu_cgroup_create, | 7978 | .create = cpu_cgroup_create, |
@@ -7913,8 +7980,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7913 | .can_attach = cpu_cgroup_can_attach, | 7980 | .can_attach = cpu_cgroup_can_attach, |
7914 | .attach = cpu_cgroup_attach, | 7981 | .attach = cpu_cgroup_attach, |
7915 | .exit = cpu_cgroup_exit, | 7982 | .exit = cpu_cgroup_exit, |
7916 | .populate = cpu_cgroup_populate, | ||
7917 | .subsys_id = cpu_cgroup_subsys_id, | 7983 | .subsys_id = cpu_cgroup_subsys_id, |
7984 | .base_cftypes = cpu_files, | ||
7918 | .early_init = 1, | 7985 | .early_init = 1, |
7919 | }; | 7986 | }; |
7920 | 7987 | ||
@@ -7930,8 +7997,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7930 | */ | 7997 | */ |
7931 | 7998 | ||
7932 | /* create a new cpu accounting group */ | 7999 | /* create a new cpu accounting group */ |
7933 | static struct cgroup_subsys_state *cpuacct_create( | 8000 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) |
7934 | struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
7935 | { | 8001 | { |
7936 | struct cpuacct *ca; | 8002 | struct cpuacct *ca; |
7937 | 8003 | ||
@@ -7961,8 +8027,7 @@ out: | |||
7961 | } | 8027 | } |
7962 | 8028 | ||
7963 | /* destroy an existing cpu accounting group */ | 8029 | /* destroy an existing cpu accounting group */ |
7964 | static void | 8030 | static void cpuacct_destroy(struct cgroup *cgrp) |
7965 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
7966 | { | 8031 | { |
7967 | struct cpuacct *ca = cgroup_ca(cgrp); | 8032 | struct cpuacct *ca = cgroup_ca(cgrp); |
7968 | 8033 | ||
@@ -8101,13 +8166,9 @@ static struct cftype files[] = { | |||
8101 | .name = "stat", | 8166 | .name = "stat", |
8102 | .read_map = cpuacct_stats_show, | 8167 | .read_map = cpuacct_stats_show, |
8103 | }, | 8168 | }, |
8169 | { } /* terminate */ | ||
8104 | }; | 8170 | }; |
8105 | 8171 | ||
8106 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8107 | { | ||
8108 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8109 | } | ||
8110 | |||
8111 | /* | 8172 | /* |
8112 | * charge this task's execution time to its accounting group. | 8173 | * charge this task's execution time to its accounting group. |
8113 | * | 8174 | * |
@@ -8139,7 +8200,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8139 | .name = "cpuacct", | 8200 | .name = "cpuacct", |
8140 | .create = cpuacct_create, | 8201 | .create = cpuacct_create, |
8141 | .destroy = cpuacct_destroy, | 8202 | .destroy = cpuacct_destroy, |
8142 | .populate = cpuacct_populate, | ||
8143 | .subsys_id = cpuacct_subsys_id, | 8203 | .subsys_id = cpuacct_subsys_id, |
8204 | .base_cftypes = files, | ||
8144 | }; | 8205 | }; |
8145 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8206 | #endif /* CONFIG_CGROUP_CPUACCT */ |
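Both controllers end up with the same shape: the cftype array gains an empty terminating entry and is attached through .base_cftypes, so no ->populate() callback is needed. A minimal stand-alone model of that pattern; demo_cftype, demo_subsys and register_base_files() are stand-ins for the cgroup core, and the file names are an illustrative subset:

    #include <stdio.h>

    struct demo_cftype {
        const char *name;                   /* empty/unset entry terminates */
    };

    struct demo_subsys {
        const char *name;
        const struct demo_cftype *base_cftypes;
    };

    static const struct demo_cftype cpu_files[] = {
        { .name = "shares" },               /* illustrative subset of files */
        { .name = "rt_runtime_us" },
        { .name = "rt_period_us" },
        { }                                 /* terminate */
    };

    static void register_base_files(const struct demo_subsys *ss)
    {
        const struct demo_cftype *cft;

        for (cft = ss->base_cftypes; cft->name && cft->name[0]; cft++)
            printf("%s.%s registered\n", ss->name, cft->name);
    }

    int main(void)
    {
        struct demo_subsys cpu_subsys = {
            .name = "cpu",
            .base_cftypes = cpu_files,
        };

        register_base_files(&cpu_subsys);
        return 0;
    }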
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a075e10004b..09acaa15161d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
288 | 288 | ||
289 | P(yld_count); | 289 | P(yld_count); |
290 | 290 | ||
291 | P(sched_switch); | ||
292 | P(sched_count); | 291 | P(sched_count); |
293 | P(sched_goidle); | 292 | P(sched_goidle); |
294 | #ifdef CONFIG_SMP | 293 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..0d97ebdc58f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
416 | 416 | ||
417 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 417 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
418 | 418 | ||
419 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 419 | static __always_inline |
420 | unsigned long delta_exec); | 420 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); |
421 | 421 | ||
422 | /************************************************************** | 422 | /************************************************************** |
423 | * Scheduling class tree data structure manipulation methods: | 423 | * Scheduling class tree data structure manipulation methods: |
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
776 | * Scheduling class queueing methods: | 776 | * Scheduling class queueing methods: |
777 | */ | 777 | */ |
778 | 778 | ||
779 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
780 | static void | ||
781 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
782 | { | ||
783 | cfs_rq->task_weight += weight; | ||
784 | } | ||
785 | #else | ||
786 | static inline void | ||
787 | add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) | ||
788 | { | ||
789 | } | ||
790 | #endif | ||
791 | |||
792 | static void | 779 | static void |
793 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 780 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
794 | { | 781 | { |
795 | update_load_add(&cfs_rq->load, se->load.weight); | 782 | update_load_add(&cfs_rq->load, se->load.weight); |
796 | if (!parent_entity(se)) | 783 | if (!parent_entity(se)) |
797 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 784 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
798 | if (entity_is_task(se)) { | 785 | #ifdef CONFIG_SMP |
799 | add_cfs_task_weight(cfs_rq, se->load.weight); | 786 | if (entity_is_task(se)) |
800 | list_add(&se->group_node, &cfs_rq->tasks); | 787 | list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); |
801 | } | 788 | #endif |
802 | cfs_rq->nr_running++; | 789 | cfs_rq->nr_running++; |
803 | } | 790 | } |
804 | 791 | ||
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
808 | update_load_sub(&cfs_rq->load, se->load.weight); | 795 | update_load_sub(&cfs_rq->load, se->load.weight); |
809 | if (!parent_entity(se)) | 796 | if (!parent_entity(se)) |
810 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 797 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
811 | if (entity_is_task(se)) { | 798 | if (entity_is_task(se)) |
812 | add_cfs_task_weight(cfs_rq, -se->load.weight); | ||
813 | list_del_init(&se->group_node); | 799 | list_del_init(&se->group_node); |
814 | } | ||
815 | cfs_rq->nr_running--; | 800 | cfs_rq->nr_running--; |
816 | } | 801 | } |
817 | 802 | ||
@@ -1003,6 +988,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1003 | if (unlikely(delta > se->statistics.sleep_max)) | 988 | if (unlikely(delta > se->statistics.sleep_max)) |
1004 | se->statistics.sleep_max = delta; | 989 | se->statistics.sleep_max = delta; |
1005 | 990 | ||
991 | se->statistics.sleep_start = 0; | ||
1006 | se->statistics.sum_sleep_runtime += delta; | 992 | se->statistics.sum_sleep_runtime += delta; |
1007 | 993 | ||
1008 | if (tsk) { | 994 | if (tsk) { |
@@ -1019,6 +1005,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1019 | if (unlikely(delta > se->statistics.block_max)) | 1005 | if (unlikely(delta > se->statistics.block_max)) |
1020 | se->statistics.block_max = delta; | 1006 | se->statistics.block_max = delta; |
1021 | 1007 | ||
1008 | se->statistics.block_start = 0; | ||
1022 | se->statistics.sum_sleep_runtime += delta; | 1009 | se->statistics.sum_sleep_runtime += delta; |
1023 | 1010 | ||
1024 | if (tsk) { | 1011 | if (tsk) { |
@@ -1175,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1175 | __clear_buddies_skip(se); | 1162 | __clear_buddies_skip(se); |
1176 | } | 1163 | } |
1177 | 1164 | ||
1178 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); | 1165 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); |
1179 | 1166 | ||
1180 | static void | 1167 | static void |
1181 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 1168 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
@@ -1399,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1399 | #ifdef CONFIG_CFS_BANDWIDTH | 1386 | #ifdef CONFIG_CFS_BANDWIDTH |
1400 | 1387 | ||
1401 | #ifdef HAVE_JUMP_LABEL | 1388 | #ifdef HAVE_JUMP_LABEL |
1402 | static struct jump_label_key __cfs_bandwidth_used; | 1389 | static struct static_key __cfs_bandwidth_used; |
1403 | 1390 | ||
1404 | static inline bool cfs_bandwidth_used(void) | 1391 | static inline bool cfs_bandwidth_used(void) |
1405 | { | 1392 | { |
1406 | return static_branch(&__cfs_bandwidth_used); | 1393 | return static_key_false(&__cfs_bandwidth_used); |
1407 | } | 1394 | } |
1408 | 1395 | ||
1409 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | 1396 | void account_cfs_bandwidth_used(int enabled, int was_enabled) |
1410 | { | 1397 | { |
1411 | /* only need to count groups transitioning between enabled/!enabled */ | 1398 | /* only need to count groups transitioning between enabled/!enabled */ |
1412 | if (enabled && !was_enabled) | 1399 | if (enabled && !was_enabled) |
1413 | jump_label_inc(&__cfs_bandwidth_used); | 1400 | static_key_slow_inc(&__cfs_bandwidth_used); |
1414 | else if (!enabled && was_enabled) | 1401 | else if (!enabled && was_enabled) |
1415 | jump_label_dec(&__cfs_bandwidth_used); | 1402 | static_key_slow_dec(&__cfs_bandwidth_used); |
1416 | } | 1403 | } |
1417 | #else /* HAVE_JUMP_LABEL */ | 1404 | #else /* HAVE_JUMP_LABEL */ |
1418 | static bool cfs_bandwidth_used(void) | 1405 | static bool cfs_bandwidth_used(void) |
@@ -1559,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
1559 | resched_task(rq_of(cfs_rq)->curr); | 1546 | resched_task(rq_of(cfs_rq)->curr); |
1560 | } | 1547 | } |
1561 | 1548 | ||
1562 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 1549 | static __always_inline |
1563 | unsigned long delta_exec) | 1550 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) |
1564 | { | 1551 | { |
1565 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) | 1552 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) |
1566 | return; | 1553 | return; |
@@ -2086,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) | |||
2086 | } | 2073 | } |
2087 | 2074 | ||
2088 | #else /* CONFIG_CFS_BANDWIDTH */ | 2075 | #else /* CONFIG_CFS_BANDWIDTH */ |
2089 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 2076 | static __always_inline |
2090 | unsigned long delta_exec) {} | 2077 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} |
2091 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2078 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
2092 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 2079 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
2093 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2080 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
2094 | 2081 | ||
2095 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 2082 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) |
2096 | { | 2083 | { |
@@ -2670,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2670 | /* | 2657 | /* |
2671 | * Otherwise, iterate the domains and find an eligible idle cpu. | 2658 | * Otherwise, iterate the domains and find an eligible idle cpu. |
2672 | */ | 2659 | */ |
2673 | rcu_read_lock(); | ||
2674 | |||
2675 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 2660 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
2676 | for_each_lower_domain(sd) { | 2661 | for_each_lower_domain(sd) { |
2677 | sg = sd->groups; | 2662 | sg = sd->groups; |
@@ -2693,8 +2678,6 @@ next: | |||
2693 | } while (sg != sd->groups); | 2678 | } while (sg != sd->groups); |
2694 | } | 2679 | } |
2695 | done: | 2680 | done: |
2696 | rcu_read_unlock(); | ||
2697 | |||
2698 | return target; | 2681 | return target; |
2699 | } | 2682 | } |
2700 | 2683 | ||
@@ -2920,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
2920 | return; | 2903 | return; |
2921 | 2904 | ||
2922 | /* | 2905 | /* |
2923 | * This is possible from callers such as pull_task(), in which we | 2906 | * This is possible from callers such as move_task(), in which we |
2924 | * unconditionally check_preempt_curr() after an enqueue (which may have | 2907 | * unconditionally check_preempt_curr() after an enqueue (which may have |
2925 | * led to a throttle). This both saves work and prevents false | 2908 | * led to a throttle). This both saves work and prevents false |
2926 | * next-buddy nomination below. | 2909 | * next-buddy nomination below. |
@@ -3084,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
3084 | * Fair scheduling class load-balancing methods: | 3067 | * Fair scheduling class load-balancing methods: |
3085 | */ | 3068 | */ |
3086 | 3069 | ||
3070 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
3071 | |||
3072 | #define LBF_ALL_PINNED 0x01 | ||
3073 | #define LBF_NEED_BREAK 0x02 | ||
3074 | |||
3075 | struct lb_env { | ||
3076 | struct sched_domain *sd; | ||
3077 | |||
3078 | int src_cpu; | ||
3079 | struct rq *src_rq; | ||
3080 | |||
3081 | int dst_cpu; | ||
3082 | struct rq *dst_rq; | ||
3083 | |||
3084 | enum cpu_idle_type idle; | ||
3085 | long load_move; | ||
3086 | unsigned int flags; | ||
3087 | |||
3088 | unsigned int loop; | ||
3089 | unsigned int loop_break; | ||
3090 | unsigned int loop_max; | ||
3091 | }; | ||
3092 | |||
3087 | /* | 3093 | /* |
3088 | * pull_task - move a task from a remote runqueue to the local runqueue. | 3094 | * move_task - move a task from one runqueue to another runqueue. |
3089 | * Both runqueues must be locked. | 3095 | * Both runqueues must be locked. |
3090 | */ | 3096 | */ |
3091 | static void pull_task(struct rq *src_rq, struct task_struct *p, | 3097 | static void move_task(struct task_struct *p, struct lb_env *env) |
3092 | struct rq *this_rq, int this_cpu) | ||
3093 | { | 3098 | { |
3094 | deactivate_task(src_rq, p, 0); | 3099 | deactivate_task(env->src_rq, p, 0); |
3095 | set_task_cpu(p, this_cpu); | 3100 | set_task_cpu(p, env->dst_cpu); |
3096 | activate_task(this_rq, p, 0); | 3101 | activate_task(env->dst_rq, p, 0); |
3097 | check_preempt_curr(this_rq, p, 0); | 3102 | check_preempt_curr(env->dst_rq, p, 0); |
3098 | } | 3103 | } |
3099 | 3104 | ||
3100 | /* | 3105 | /* |
@@ -3129,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
3129 | return delta < (s64)sysctl_sched_migration_cost; | 3134 | return delta < (s64)sysctl_sched_migration_cost; |
3130 | } | 3135 | } |
3131 | 3136 | ||
3132 | #define LBF_ALL_PINNED 0x01 | ||
3133 | #define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ | ||
3134 | #define LBF_HAD_BREAK 0x04 | ||
3135 | #define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ | ||
3136 | #define LBF_ABORT 0x10 | ||
3137 | |||
3138 | /* | 3137 | /* |
3139 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 3138 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
3140 | */ | 3139 | */ |
3141 | static | 3140 | static |
3142 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 3141 | int can_migrate_task(struct task_struct *p, struct lb_env *env) |
3143 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3144 | int *lb_flags) | ||
3145 | { | 3142 | { |
3146 | int tsk_cache_hot = 0; | 3143 | int tsk_cache_hot = 0; |
3147 | /* | 3144 | /* |
@@ -3150,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
3150 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 3147 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
3151 | * 3) are cache-hot on their current CPU. | 3148 | * 3) are cache-hot on their current CPU. |
3152 | */ | 3149 | */ |
3153 | if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { | 3150 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
3154 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3151 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3155 | return 0; | 3152 | return 0; |
3156 | } | 3153 | } |
3157 | *lb_flags &= ~LBF_ALL_PINNED; | 3154 | env->flags &= ~LBF_ALL_PINNED; |
3158 | 3155 | ||
3159 | if (task_running(rq, p)) { | 3156 | if (task_running(env->src_rq, p)) { |
3160 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 3157 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); |
3161 | return 0; | 3158 | return 0; |
3162 | } | 3159 | } |
@@ -3167,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
3167 | * 2) too many balance attempts have failed. | 3164 | * 2) too many balance attempts have failed. |
3168 | */ | 3165 | */ |
3169 | 3166 | ||
3170 | tsk_cache_hot = task_hot(p, rq->clock_task, sd); | 3167 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); |
3171 | if (!tsk_cache_hot || | 3168 | if (!tsk_cache_hot || |
3172 | sd->nr_balance_failed > sd->cache_nice_tries) { | 3169 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3173 | #ifdef CONFIG_SCHEDSTATS | 3170 | #ifdef CONFIG_SCHEDSTATS |
3174 | if (tsk_cache_hot) { | 3171 | if (tsk_cache_hot) { |
3175 | schedstat_inc(sd, lb_hot_gained[idle]); | 3172 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
3176 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 3173 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
3177 | } | 3174 | } |
3178 | #endif | 3175 | #endif |
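The load-balancing refactor threads one environment structure through move_one_task(), move_tasks() and can_migrate_task() instead of the old five- and six-argument signatures. A trimmed stand-alone sketch of that shape; demo_lb_env, demo_rq and the body of demo_can_migrate() are simplified stand-ins, and the numbers are arbitrary:

    #include <stdio.h>

    struct demo_rq { int cpu; };

    struct demo_lb_env {                    /* trimmed mirror of struct lb_env */
        int src_cpu, dst_cpu;
        struct demo_rq *src_rq, *dst_rq;
        long load_move;                     /* weighted load still to move */
        unsigned int flags;
        unsigned int loop, loop_break, loop_max;
    };

    static int demo_can_migrate(const struct demo_lb_env *env)
    {
        /* the real check also consults cpus_allowed and cache hotness */
        return env->src_cpu != env->dst_cpu;
    }

    int main(void)
    {
        struct demo_rq src = { .cpu = 2 }, dst = { .cpu = 0 };
        struct demo_lb_env env = {
            .src_cpu = src.cpu, .dst_cpu = dst.cpu,
            .src_rq  = &src,    .dst_rq  = &dst,
            .load_move  = 1024,
            .loop_break = 32,               /* "take a breather" threshold */
            .loop_max   = 128,
        };

        printf("balance cpu%d -> cpu%d, up to %ld weight, can_migrate=%d\n",
               env.src_cpu, env.dst_cpu, env.load_move, demo_can_migrate(&env));
        return 0;
    }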
@@ -3193,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
3193 | * | 3190 | * |
3194 | * Called with both runqueues locked. | 3191 | * Called with both runqueues locked. |
3195 | */ | 3192 | */ |
3196 | static int | 3193 | static int move_one_task(struct lb_env *env) |
3197 | move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3198 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
3199 | { | 3194 | { |
3200 | struct task_struct *p, *n; | 3195 | struct task_struct *p, *n; |
3201 | struct cfs_rq *cfs_rq; | ||
3202 | int pinned = 0; | ||
3203 | 3196 | ||
3204 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | 3197 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
3205 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | 3198 | if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) |
3206 | if (throttled_lb_pair(task_group(p), | 3199 | continue; |
3207 | busiest->cpu, this_cpu)) | ||
3208 | break; | ||
3209 | 3200 | ||
3210 | if (!can_migrate_task(p, busiest, this_cpu, | 3201 | if (!can_migrate_task(p, env)) |
3211 | sd, idle, &pinned)) | 3202 | continue; |
3212 | continue; | ||
3213 | 3203 | ||
3214 | pull_task(busiest, p, this_rq, this_cpu); | 3204 | move_task(p, env); |
3215 | /* | 3205 | /* |
3216 | * Right now, this is only the second place pull_task() | 3206 | * Right now, this is only the second place move_task() |
3217 | * is called, so we can safely collect pull_task() | 3207 | * is called, so we can safely collect move_task() |
3218 | * stats here rather than inside pull_task(). | 3208 | * stats here rather than inside move_task(). |
3219 | */ | 3209 | */ |
3220 | schedstat_inc(sd, lb_gained[idle]); | 3210 | schedstat_inc(env->sd, lb_gained[env->idle]); |
3221 | return 1; | 3211 | return 1; |
3222 | } | ||
3223 | } | 3212 | } |
3224 | |||
3225 | return 0; | 3213 | return 0; |
3226 | } | 3214 | } |
3227 | 3215 | ||
3228 | static unsigned long | 3216 | static unsigned long task_h_load(struct task_struct *p); |
3229 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3217 | |
3230 | unsigned long max_load_move, struct sched_domain *sd, | 3218 | /* |
3231 | enum cpu_idle_type idle, int *lb_flags, | 3219 | * move_tasks tries to move up to load_move weighted load from busiest to |
3232 | struct cfs_rq *busiest_cfs_rq) | 3220 | * this_rq, as part of a balancing operation within domain "sd". |
3221 | * Returns 1 if successful and 0 otherwise. | ||
3222 | * | ||
3223 | * Called with both runqueues locked. | ||
3224 | */ | ||
3225 | static int move_tasks(struct lb_env *env) | ||
3233 | { | 3226 | { |
3234 | int loops = 0, pulled = 0; | 3227 | struct list_head *tasks = &env->src_rq->cfs_tasks; |
3235 | long rem_load_move = max_load_move; | 3228 | struct task_struct *p; |
3236 | struct task_struct *p, *n; | 3229 | unsigned long load; |
3230 | int pulled = 0; | ||
3231 | |||
3232 | if (env->load_move <= 0) | ||
3233 | return 0; | ||
3237 | 3234 | ||
3238 | if (max_load_move == 0) | 3235 | while (!list_empty(tasks)) { |
3239 | goto out; | 3236 | p = list_first_entry(tasks, struct task_struct, se.group_node); |
3240 | 3237 | ||
3241 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 3238 | env->loop++; |
3242 | if (loops++ > sysctl_sched_nr_migrate) { | 3239 | /* We've more or less seen every task there is, call it quits */ |
3243 | *lb_flags |= LBF_NEED_BREAK; | 3240 | if (env->loop > env->loop_max) |
3241 | break; | ||
3242 | |||
3243 | /* take a breather every nr_migrate tasks */ | ||
3244 | if (env->loop > env->loop_break) { | ||
3245 | env->loop_break += sysctl_sched_nr_migrate; | ||
3246 | env->flags |= LBF_NEED_BREAK; | ||
3244 | break; | 3247 | break; |
3245 | } | 3248 | } |
3246 | 3249 | ||
3247 | if ((p->se.load.weight >> 1) > rem_load_move || | 3250 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) |
3248 | !can_migrate_task(p, busiest, this_cpu, sd, idle, | 3251 | goto next; |
3249 | lb_flags)) | 3252 | |
3250 | continue; | 3253 | load = task_h_load(p); |
3254 | |||
3255 | if (load < 16 && !env->sd->nr_balance_failed) | ||
3256 | goto next; | ||
3257 | |||
3258 | if ((load / 2) > env->load_move) | ||
3259 | goto next; | ||
3251 | 3260 | ||
3252 | pull_task(busiest, p, this_rq, this_cpu); | 3261 | if (!can_migrate_task(p, env)) |
3262 | goto next; | ||
3263 | |||
3264 | move_task(p, env); | ||
3253 | pulled++; | 3265 | pulled++; |
3254 | rem_load_move -= p->se.load.weight; | 3266 | env->load_move -= load; |
3255 | 3267 | ||
3256 | #ifdef CONFIG_PREEMPT | 3268 | #ifdef CONFIG_PREEMPT |
3257 | /* | 3269 | /* |
@@ -3259,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3259 | * kernels will stop after the first task is pulled to minimize | 3271 | * kernels will stop after the first task is pulled to minimize |
3260 | * the critical section. | 3272 | * the critical section. |
3261 | */ | 3273 | */ |
3262 | if (idle == CPU_NEWLY_IDLE) { | 3274 | if (env->idle == CPU_NEWLY_IDLE) |
3263 | *lb_flags |= LBF_ABORT; | ||
3264 | break; | 3275 | break; |
3265 | } | ||
3266 | #endif | 3276 | #endif |
3267 | 3277 | ||
3268 | /* | 3278 | /* |
3269 | * We only want to steal up to the prescribed amount of | 3279 | * We only want to steal up to the prescribed amount of |
3270 | * weighted load. | 3280 | * weighted load. |
3271 | */ | 3281 | */ |
3272 | if (rem_load_move <= 0) | 3282 | if (env->load_move <= 0) |
3273 | break; | 3283 | break; |
3284 | |||
3285 | continue; | ||
3286 | next: | ||
3287 | list_move_tail(&p->se.group_node, tasks); | ||
3274 | } | 3288 | } |
3275 | out: | 3289 | |
3276 | /* | 3290 | /* |
3277 | * Right now, this is one of only two places pull_task() is called, | 3291 | * Right now, this is one of only two places move_task() is called, |
3278 | * so we can safely collect pull_task() stats here rather than | 3292 | * so we can safely collect move_task() stats here rather than |
3279 | * inside pull_task(). | 3293 | * inside move_task(). |
3280 | */ | 3294 | */ |
3281 | schedstat_add(sd, lb_gained[idle], pulled); | 3295 | schedstat_add(env->sd, lb_gained[env->idle], pulled); |
3282 | 3296 | ||
3283 | return max_load_move - rem_load_move; | 3297 | return pulled; |
3284 | } | 3298 | } |
3285 | 3299 | ||
3286 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3300 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -3360,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
3360 | 3374 | ||
3361 | static void update_h_load(long cpu) | 3375 | static void update_h_load(long cpu) |
3362 | { | 3376 | { |
3377 | rcu_read_lock(); | ||
3363 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 3378 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
3379 | rcu_read_unlock(); | ||
3364 | } | 3380 | } |
3365 | 3381 | ||
3366 | static unsigned long | 3382 | static unsigned long task_h_load(struct task_struct *p) |
3367 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3368 | unsigned long max_load_move, | ||
3369 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3370 | int *lb_flags) | ||
3371 | { | 3383 | { |
3372 | long rem_load_move = max_load_move; | 3384 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
3373 | struct cfs_rq *busiest_cfs_rq; | 3385 | unsigned long load; |
3374 | |||
3375 | rcu_read_lock(); | ||
3376 | update_h_load(cpu_of(busiest)); | ||
3377 | |||
3378 | for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { | ||
3379 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | ||
3380 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | ||
3381 | u64 rem_load, moved_load; | ||
3382 | |||
3383 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3384 | break; | ||
3385 | |||
3386 | /* | ||
3387 | * empty group or part of a throttled hierarchy | ||
3388 | */ | ||
3389 | if (!busiest_cfs_rq->task_weight || | ||
3390 | throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) | ||
3391 | continue; | ||
3392 | |||
3393 | rem_load = (u64)rem_load_move * busiest_weight; | ||
3394 | rem_load = div_u64(rem_load, busiest_h_load + 1); | ||
3395 | |||
3396 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | ||
3397 | rem_load, sd, idle, lb_flags, | ||
3398 | busiest_cfs_rq); | ||
3399 | |||
3400 | if (!moved_load) | ||
3401 | continue; | ||
3402 | 3386 | ||
3403 | moved_load *= busiest_h_load; | 3387 | load = p->se.load.weight; |
3404 | moved_load = div_u64(moved_load, busiest_weight + 1); | 3388 | load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); |
3405 | 3389 | ||
3406 | rem_load_move -= moved_load; | 3390 | return load; |
3407 | if (rem_load_move < 0) | ||
3408 | break; | ||
3409 | } | ||
3410 | rcu_read_unlock(); | ||
3411 | |||
3412 | return max_load_move - rem_load_move; | ||
3413 | } | 3391 | } |
3414 | #else | 3392 | #else |
3415 | static inline void update_shares(int cpu) | 3393 | static inline void update_shares(int cpu) |
3416 | { | 3394 | { |
3417 | } | 3395 | } |
3418 | 3396 | ||
3419 | static unsigned long | 3397 | static inline void update_h_load(long cpu) |
3420 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3421 | unsigned long max_load_move, | ||
3422 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3423 | int *lb_flags) | ||
3424 | { | 3398 | { |
3425 | return balance_tasks(this_rq, this_cpu, busiest, | ||
3426 | max_load_move, sd, idle, lb_flags, | ||
3427 | &busiest->cfs); | ||
3428 | } | 3399 | } |
3429 | #endif | ||
3430 | 3400 | ||
3431 | /* | 3401 | static unsigned long task_h_load(struct task_struct *p) |
3432 | * move_tasks tries to move up to max_load_move weighted load from busiest to | ||
3433 | * this_rq, as part of a balancing operation within domain "sd". | ||
3434 | * Returns 1 if successful and 0 otherwise. | ||
3435 | * | ||
3436 | * Called with both runqueues locked. | ||
3437 | */ | ||
3438 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
3439 | unsigned long max_load_move, | ||
3440 | struct sched_domain *sd, enum cpu_idle_type idle, | ||
3441 | int *lb_flags) | ||
3442 | { | 3402 | { |
3443 | unsigned long total_load_moved = 0, load_moved; | 3403 | return p->se.load.weight; |
3444 | |||
3445 | do { | ||
3446 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | ||
3447 | max_load_move - total_load_moved, | ||
3448 | sd, idle, lb_flags); | ||
3449 | |||
3450 | total_load_moved += load_moved; | ||
3451 | |||
3452 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3453 | break; | ||
3454 | |||
3455 | #ifdef CONFIG_PREEMPT | ||
3456 | /* | ||
3457 | * NEWIDLE balancing is a source of latency, so preemptible | ||
3458 | * kernels will stop after the first task is pulled to minimize | ||
3459 | * the critical section. | ||
3460 | */ | ||
3461 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { | ||
3462 | *lb_flags |= LBF_ABORT; | ||
3463 | break; | ||
3464 | } | ||
3465 | #endif | ||
3466 | } while (load_moved && max_load_move > total_load_moved); | ||
3467 | |||
3468 | return total_load_moved > 0; | ||
3469 | } | 3404 | } |
3405 | #endif | ||
3470 | 3406 | ||
3471 | /********** Helpers for find_busiest_group ************************/ | 3407 | /********** Helpers for find_busiest_group ************************/ |
3472 | /* | 3408 | /* |
@@ -3776,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
3776 | struct sched_domain *child = sd->child; | 3712 | struct sched_domain *child = sd->child; |
3777 | struct sched_group *group, *sdg = sd->groups; | 3713 | struct sched_group *group, *sdg = sd->groups; |
3778 | unsigned long power; | 3714 | unsigned long power; |
3715 | unsigned long interval; | ||
3716 | |||
3717 | interval = msecs_to_jiffies(sd->balance_interval); | ||
3718 | interval = clamp(interval, 1UL, max_load_balance_interval); | ||
3719 | sdg->sgp->next_update = jiffies + interval; | ||
3779 | 3720 | ||
3780 | if (!child) { | 3721 | if (!child) { |
3781 | update_cpu_power(sd, cpu); | 3722 | update_cpu_power(sd, cpu); |
@@ -3883,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3883 | * domains. In the newly idle case, we will allow all the cpu's | 3824 | * domains. In the newly idle case, we will allow all the cpu's |
3884 | * to do the newly idle load balance. | 3825 | * to do the newly idle load balance. |
3885 | */ | 3826 | */ |
3886 | if (idle != CPU_NEWLY_IDLE && local_group) { | 3827 | if (local_group) { |
3887 | if (balance_cpu != this_cpu) { | 3828 | if (idle != CPU_NEWLY_IDLE) { |
3888 | *balance = 0; | 3829 | if (balance_cpu != this_cpu) { |
3889 | return; | 3830 | *balance = 0; |
3890 | } | 3831 | return; |
3891 | update_group_power(sd, this_cpu); | 3832 | } |
3833 | update_group_power(sd, this_cpu); | ||
3834 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
3835 | update_group_power(sd, this_cpu); | ||
3892 | } | 3836 | } |
3893 | 3837 | ||
3894 | /* Adjust by relative CPU power of the group */ | 3838 | /* Adjust by relative CPU power of the group */ |
@@ -4451,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4451 | struct sched_domain *sd, enum cpu_idle_type idle, | 4395 | struct sched_domain *sd, enum cpu_idle_type idle, |
4452 | int *balance) | 4396 | int *balance) |
4453 | { | 4397 | { |
4454 | int ld_moved, lb_flags = 0, active_balance = 0; | 4398 | int ld_moved, active_balance = 0; |
4455 | struct sched_group *group; | 4399 | struct sched_group *group; |
4456 | unsigned long imbalance; | 4400 | unsigned long imbalance; |
4457 | struct rq *busiest; | 4401 | struct rq *busiest; |
4458 | unsigned long flags; | 4402 | unsigned long flags; |
4459 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4403 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
4460 | 4404 | ||
4405 | struct lb_env env = { | ||
4406 | .sd = sd, | ||
4407 | .dst_cpu = this_cpu, | ||
4408 | .dst_rq = this_rq, | ||
4409 | .idle = idle, | ||
4410 | .loop_break = sysctl_sched_nr_migrate, | ||
4411 | }; | ||
4412 | |||
4461 | cpumask_copy(cpus, cpu_active_mask); | 4413 | cpumask_copy(cpus, cpu_active_mask); |
4462 | 4414 | ||
4463 | schedstat_inc(sd, lb_count[idle]); | 4415 | schedstat_inc(sd, lb_count[idle]); |
@@ -4492,32 +4444,34 @@ redo: | |||
4492 | * still unbalanced. ld_moved simply stays zero, so it is | 4444 | * still unbalanced. ld_moved simply stays zero, so it is |
4493 | * correctly treated as an imbalance. | 4445 | * correctly treated as an imbalance. |
4494 | */ | 4446 | */ |
4495 | lb_flags |= LBF_ALL_PINNED; | 4447 | env.flags |= LBF_ALL_PINNED; |
4448 | env.load_move = imbalance; | ||
4449 | env.src_cpu = busiest->cpu; | ||
4450 | env.src_rq = busiest; | ||
4451 | env.loop_max = busiest->nr_running; | ||
4452 | |||
4453 | more_balance: | ||
4496 | local_irq_save(flags); | 4454 | local_irq_save(flags); |
4497 | double_rq_lock(this_rq, busiest); | 4455 | double_rq_lock(this_rq, busiest); |
4498 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 4456 | if (!env.loop) |
4499 | imbalance, sd, idle, &lb_flags); | 4457 | update_h_load(env.src_cpu); |
4458 | ld_moved += move_tasks(&env); | ||
4500 | double_rq_unlock(this_rq, busiest); | 4459 | double_rq_unlock(this_rq, busiest); |
4501 | local_irq_restore(flags); | 4460 | local_irq_restore(flags); |
4502 | 4461 | ||
4462 | if (env.flags & LBF_NEED_BREAK) { | ||
4463 | env.flags &= ~LBF_NEED_BREAK; | ||
4464 | goto more_balance; | ||
4465 | } | ||
4466 | |||
4503 | /* | 4467 | /* |
4504 | * some other cpu did the load balance for us. | 4468 | * some other cpu did the load balance for us. |
4505 | */ | 4469 | */ |
4506 | if (ld_moved && this_cpu != smp_processor_id()) | 4470 | if (ld_moved && this_cpu != smp_processor_id()) |
4507 | resched_cpu(this_cpu); | 4471 | resched_cpu(this_cpu); |
4508 | 4472 | ||
4509 | if (lb_flags & LBF_ABORT) | ||
4510 | goto out_balanced; | ||
4511 | |||
4512 | if (lb_flags & LBF_NEED_BREAK) { | ||
4513 | lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; | ||
4514 | if (lb_flags & LBF_ABORT) | ||
4515 | goto out_balanced; | ||
4516 | goto redo; | ||
4517 | } | ||
4518 | |||
4519 | /* All tasks on this runqueue were pinned by CPU affinity */ | 4473 | /* All tasks on this runqueue were pinned by CPU affinity */ |
4520 | if (unlikely(lb_flags & LBF_ALL_PINNED)) { | 4474 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
4521 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 4475 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
4522 | if (!cpumask_empty(cpus)) | 4476 | if (!cpumask_empty(cpus)) |
4523 | goto redo; | 4477 | goto redo; |
@@ -4547,7 +4501,7 @@ redo: | |||
4547 | tsk_cpus_allowed(busiest->curr))) { | 4501 | tsk_cpus_allowed(busiest->curr))) { |
4548 | raw_spin_unlock_irqrestore(&busiest->lock, | 4502 | raw_spin_unlock_irqrestore(&busiest->lock, |
4549 | flags); | 4503 | flags); |
4550 | lb_flags |= LBF_ALL_PINNED; | 4504 | env.flags |= LBF_ALL_PINNED; |
4551 | goto out_one_pinned; | 4505 | goto out_one_pinned; |
4552 | } | 4506 | } |
4553 | 4507 | ||
@@ -4600,7 +4554,7 @@ out_balanced: | |||
4600 | 4554 | ||
4601 | out_one_pinned: | 4555 | out_one_pinned: |
4602 | /* tune up the balancing interval */ | 4556 | /* tune up the balancing interval */ |
4603 | if (((lb_flags & LBF_ALL_PINNED) && | 4557 | if (((env.flags & LBF_ALL_PINNED) && |
4604 | sd->balance_interval < MAX_PINNED_INTERVAL) || | 4558 | sd->balance_interval < MAX_PINNED_INTERVAL) || |
4605 | (sd->balance_interval < sd->max_interval)) | 4559 | (sd->balance_interval < sd->max_interval)) |
4606 | sd->balance_interval *= 2; | 4560 | sd->balance_interval *= 2; |
@@ -4710,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data) | |||
4710 | } | 4664 | } |
4711 | 4665 | ||
4712 | if (likely(sd)) { | 4666 | if (likely(sd)) { |
4667 | struct lb_env env = { | ||
4668 | .sd = sd, | ||
4669 | .dst_cpu = target_cpu, | ||
4670 | .dst_rq = target_rq, | ||
4671 | .src_cpu = busiest_rq->cpu, | ||
4672 | .src_rq = busiest_rq, | ||
4673 | .idle = CPU_IDLE, | ||
4674 | }; | ||
4675 | |||
4713 | schedstat_inc(sd, alb_count); | 4676 | schedstat_inc(sd, alb_count); |
4714 | 4677 | ||
4715 | if (move_one_task(target_rq, target_cpu, busiest_rq, | 4678 | if (move_one_task(&env)) |
4716 | sd, CPU_IDLE)) | ||
4717 | schedstat_inc(sd, alb_pushed); | 4679 | schedstat_inc(sd, alb_pushed); |
4718 | else | 4680 | else |
4719 | schedstat_inc(sd, alb_failed); | 4681 | schedstat_inc(sd, alb_failed); |
@@ -4945,8 +4907,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | |||
4945 | 4907 | ||
4946 | static DEFINE_SPINLOCK(balancing); | 4908 | static DEFINE_SPINLOCK(balancing); |
4947 | 4909 | ||
4948 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
4949 | |||
4950 | /* | 4910 | /* |
4951 | * Scale the max load_balance interval with the number of CPUs in the system. | 4911 | * Scale the max load_balance interval with the number of CPUs in the system. |
4952 | * This trades load-balance latency on larger machines for less cross talk. | 4912 | * This trades load-balance latency on larger machines for less cross talk. |
@@ -5340,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq) | |||
5340 | void init_cfs_rq(struct cfs_rq *cfs_rq) | 5300 | void init_cfs_rq(struct cfs_rq *cfs_rq) |
5341 | { | 5301 | { |
5342 | cfs_rq->tasks_timeline = RB_ROOT; | 5302 | cfs_rq->tasks_timeline = RB_ROOT; |
5343 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
5344 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 5303 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
5345 | #ifndef CONFIG_64BIT | 5304 | #ifndef CONFIG_64BIT |
5346 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5305 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
@@ -5612,6 +5571,7 @@ __init void init_sched_fair_class(void) | |||
5612 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | 5571 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |
5613 | 5572 | ||
5614 | #ifdef CONFIG_NO_HZ | 5573 | #ifdef CONFIG_NO_HZ |
5574 | nohz.next_balance = jiffies; | ||
5615 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 5575 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
5616 | cpu_notifier(sched_ilb_notifier, 0); | 5576 | cpu_notifier(sched_ilb_notifier, 0); |
5617 | #endif | 5577 | #endif |
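The fair.c hunks above collapse the long move_tasks()/move_one_task() argument lists into a single struct lb_env that load_balance() fills in once. The struct itself is defined earlier in fair.c and does not appear in this diff; the sketch below is only a reconstruction from the fields the hunks dereference (env->sd, env->src_rq, env->load_move, env->loop_break, ...), with guessed field types.

struct lb_env {			/* reconstruction for orientation, not the kernel's exact definition */
	struct sched_domain	*sd;

	struct rq		*src_rq;
	int			src_cpu;

	int			dst_cpu;
	struct rq		*dst_rq;

	enum cpu_idle_type	idle;
	long			load_move;	/* weighted load still to pull */
	unsigned int		flags;		/* LBF_ALL_PINNED, LBF_NEED_BREAK, ... */

	unsigned int		loop;		/* tasks inspected so far */
	unsigned int		loop_break;	/* take a breather after this many */
	unsigned int		loop_max;	/* never scan more than this */
};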
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f42ae7fb5ec5..44af55e6d5d0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq) | |||
778 | 778 | ||
779 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | 779 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) |
780 | { | 780 | { |
781 | int i, idle = 1; | 781 | int i, idle = 1, throttled = 0; |
782 | const struct cpumask *span; | 782 | const struct cpumask *span; |
783 | 783 | ||
784 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
785 | return 1; | ||
786 | |||
787 | span = sched_rt_period_mask(); | 784 | span = sched_rt_period_mask(); |
788 | for_each_cpu(i, span) { | 785 | for_each_cpu(i, span) { |
789 | int enqueue = 0; | 786 | int enqueue = 0; |
@@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
818 | if (!rt_rq_throttled(rt_rq)) | 815 | if (!rt_rq_throttled(rt_rq)) |
819 | enqueue = 1; | 816 | enqueue = 1; |
820 | } | 817 | } |
818 | if (rt_rq->rt_throttled) | ||
819 | throttled = 1; | ||
821 | 820 | ||
822 | if (enqueue) | 821 | if (enqueue) |
823 | sched_rt_rq_enqueue(rt_rq); | 822 | sched_rt_rq_enqueue(rt_rq); |
824 | raw_spin_unlock(&rq->lock); | 823 | raw_spin_unlock(&rq->lock); |
825 | } | 824 | } |
826 | 825 | ||
826 | if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) | ||
827 | return 1; | ||
828 | |||
827 | return idle; | 829 | return idle; |
828 | } | 830 | } |
829 | 831 | ||
@@ -855,8 +857,30 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
855 | return 0; | 857 | return 0; |
856 | 858 | ||
857 | if (rt_rq->rt_time > runtime) { | 859 | if (rt_rq->rt_time > runtime) { |
858 | rt_rq->rt_throttled = 1; | 860 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
859 | printk_once(KERN_WARNING "sched: RT throttling activated\n"); | 861 | |
862 | /* | ||
863 | * Don't actually throttle groups that have no runtime assigned | ||
864 | * but accrue some time due to boosting. | ||
865 | */ | ||
866 | if (likely(rt_b->rt_runtime)) { | ||
867 | static bool once = false; | ||
868 | |||
869 | rt_rq->rt_throttled = 1; | ||
870 | |||
871 | if (!once) { | ||
872 | once = true; | ||
873 | printk_sched("sched: RT throttling activated\n"); | ||
874 | } | ||
875 | } else { | ||
876 | /* | ||
877 | * In case we did anyway, make it go away, | ||
878 | * replenishment is a joke, since it will replenish us | ||
879 | * with exactly 0 ns. | ||
880 | */ | ||
881 | rt_rq->rt_time = 0; | ||
882 | } | ||
883 | |||
860 | if (rt_rq_throttled(rt_rq)) { | 884 | if (rt_rq_throttled(rt_rq)) { |
861 | sched_rt_rq_dequeue(rt_rq); | 885 | sched_rt_rq_dequeue(rt_rq); |
862 | return 1; | 886 | return 1; |
@@ -884,7 +908,8 @@ static void update_curr_rt(struct rq *rq) | |||
884 | if (unlikely((s64)delta_exec < 0)) | 908 | if (unlikely((s64)delta_exec < 0)) |
885 | delta_exec = 0; | 909 | delta_exec = 0; |
886 | 910 | ||
887 | schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); | 911 | schedstat_set(curr->se.statistics.exec_max, |
912 | max(curr->se.statistics.exec_max, delta_exec)); | ||
888 | 913 | ||
889 | curr->se.sum_exec_runtime += delta_exec; | 914 | curr->se.sum_exec_runtime += delta_exec; |
890 | account_group_exec_runtime(curr, delta_exec); | 915 | account_group_exec_runtime(curr, delta_exec); |
@@ -1403,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | |||
1403 | next_idx: | 1428 | next_idx: |
1404 | if (idx >= MAX_RT_PRIO) | 1429 | if (idx >= MAX_RT_PRIO) |
1405 | continue; | 1430 | continue; |
1406 | if (next && next->prio < idx) | 1431 | if (next && next->prio <= idx) |
1407 | continue; | 1432 | continue; |
1408 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | 1433 | list_for_each_entry(rt_se, array->queue + idx, run_list) { |
1409 | struct task_struct *p; | 1434 | struct task_struct *p; |
@@ -1972,7 +1997,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
1972 | if (--p->rt.time_slice) | 1997 | if (--p->rt.time_slice) |
1973 | return; | 1998 | return; |
1974 | 1999 | ||
1975 | p->rt.time_slice = DEF_TIMESLICE; | 2000 | p->rt.time_slice = RR_TIMESLICE; |
1976 | 2001 | ||
1977 | /* | 2002 | /* |
1978 | * Requeue to the end of queue if we are not the only element | 2003 | * Requeue to the end of queue if we are not the only element |
@@ -2000,7 +2025,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | |||
2000 | * Time slice is 0 for SCHED_FIFO tasks | 2025 | * Time slice is 0 for SCHED_FIFO tasks |
2001 | */ | 2026 | */ |
2002 | if (task->policy == SCHED_RR) | 2027 | if (task->policy == SCHED_RR) |
2003 | return DEF_TIMESLICE; | 2028 | return RR_TIMESLICE; |
2004 | else | 2029 | else |
2005 | return 0; | 2030 | return 0; |
2006 | } | 2031 | } |
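The sched_rt_runtime_exceeded() hunk above stops throttling RT groups whose own runtime budget is zero and whose rt_time only accrued through priority boosting. A standalone distillation of that decision, using made-up types so it compiles outside the kernel:

#include <stdbool.h>
#include <stdio.h>

struct fake_rt_rq {
	unsigned long long rt_time;	/* time consumed this period */
	int rt_throttled;
};

/* Mirrors the branch added above: throttle only when a real budget exists;
 * otherwise drop the boosted time, since replenishment would add exactly 0 ns. */
static bool rt_runtime_exceeded(struct fake_rt_rq *rt_rq,
				unsigned long long assigned_runtime)
{
	if (assigned_runtime) {
		rt_rq->rt_throttled = 1;
		return true;
	}
	rt_rq->rt_time = 0;
	return false;
}

int main(void)
{
	struct fake_rt_rq boosted_only = { .rt_time = 500000 };

	printf("throttled=%d\n", rt_runtime_exceeded(&boosted_only, 0)); /* prints 0 */
	return 0;
}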
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..fb3acba4d52e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running; | |||
36 | 36 | ||
37 | /* | 37 | /* |
38 | * These are the 'tuning knobs' of the scheduler: | 38 | * These are the 'tuning knobs' of the scheduler: |
39 | * | ||
40 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
41 | * Timeslices get refilled after they expire. | ||
42 | */ | 39 | */ |
43 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
44 | 40 | ||
45 | /* | 41 | /* |
46 | * single value that denotes runtime == period, ie unlimited time. | 42 | * single value that denotes runtime == period, ie unlimited time. |
@@ -216,9 +212,6 @@ struct cfs_rq { | |||
216 | struct rb_root tasks_timeline; | 212 | struct rb_root tasks_timeline; |
217 | struct rb_node *rb_leftmost; | 213 | struct rb_node *rb_leftmost; |
218 | 214 | ||
219 | struct list_head tasks; | ||
220 | struct list_head *balance_iterator; | ||
221 | |||
222 | /* | 215 | /* |
223 | * 'curr' points to currently running entity on this cfs_rq. | 216 | * 'curr' points to currently running entity on this cfs_rq. |
224 | * It is set to NULL otherwise (i.e when none are currently running). | 217 | * It is set to NULL otherwise (i.e when none are currently running). |
@@ -246,11 +239,6 @@ struct cfs_rq { | |||
246 | 239 | ||
247 | #ifdef CONFIG_SMP | 240 | #ifdef CONFIG_SMP |
248 | /* | 241 | /* |
249 | * the part of load.weight contributed by tasks | ||
250 | */ | ||
251 | unsigned long task_weight; | ||
252 | |||
253 | /* | ||
254 | * h_load = weight * f(tg) | 242 | * h_load = weight * f(tg) |
255 | * | 243 | * |
256 | * Where f(tg) is the recursive weight fraction assigned to | 244 | * Where f(tg) is the recursive weight fraction assigned to |
@@ -424,6 +412,8 @@ struct rq { | |||
424 | int cpu; | 412 | int cpu; |
425 | int online; | 413 | int online; |
426 | 414 | ||
415 | struct list_head cfs_tasks; | ||
416 | |||
427 | u64 rt_avg; | 417 | u64 rt_avg; |
428 | u64 age_stamp; | 418 | u64 age_stamp; |
429 | u64 idle_stamp; | 419 | u64 idle_stamp; |
@@ -462,7 +452,6 @@ struct rq { | |||
462 | unsigned int yld_count; | 452 | unsigned int yld_count; |
463 | 453 | ||
464 | /* schedule() stats */ | 454 | /* schedule() stats */ |
465 | unsigned int sched_switch; | ||
466 | unsigned int sched_count; | 455 | unsigned int sched_count; |
467 | unsigned int sched_goidle; | 456 | unsigned int sched_goidle; |
468 | 457 | ||
@@ -611,7 +600,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
611 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 600 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
612 | */ | 601 | */ |
613 | #ifdef CONFIG_SCHED_DEBUG | 602 | #ifdef CONFIG_SCHED_DEBUG |
614 | # include <linux/jump_label.h> | 603 | # include <linux/static_key.h> |
615 | # define const_debug __read_mostly | 604 | # define const_debug __read_mostly |
616 | #else | 605 | #else |
617 | # define const_debug const | 606 | # define const_debug const |
@@ -630,18 +619,18 @@ enum { | |||
630 | #undef SCHED_FEAT | 619 | #undef SCHED_FEAT |
631 | 620 | ||
632 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | 621 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) |
633 | static __always_inline bool static_branch__true(struct jump_label_key *key) | 622 | static __always_inline bool static_branch__true(struct static_key *key) |
634 | { | 623 | { |
635 | return likely(static_branch(key)); /* Not out of line branch. */ | 624 | return static_key_true(key); /* Not out of line branch. */ |
636 | } | 625 | } |
637 | 626 | ||
638 | static __always_inline bool static_branch__false(struct jump_label_key *key) | 627 | static __always_inline bool static_branch__false(struct static_key *key) |
639 | { | 628 | { |
640 | return unlikely(static_branch(key)); /* Out of line branch. */ | 629 | return static_key_false(key); /* Out of line branch. */ |
641 | } | 630 | } |
642 | 631 | ||
643 | #define SCHED_FEAT(name, enabled) \ | 632 | #define SCHED_FEAT(name, enabled) \ |
644 | static __always_inline bool static_branch_##name(struct jump_label_key *key) \ | 633 | static __always_inline bool static_branch_##name(struct static_key *key) \ |
645 | { \ | 634 | { \ |
646 | return static_branch__##enabled(key); \ | 635 | return static_branch__##enabled(key); \ |
647 | } | 636 | } |
@@ -650,7 +639,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ | |||
650 | 639 | ||
651 | #undef SCHED_FEAT | 640 | #undef SCHED_FEAT |
652 | 641 | ||
653 | extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; | 642 | extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; |
654 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) | 643 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) |
655 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ | 644 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ |
656 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 645 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
@@ -692,6 +681,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
692 | #ifndef finish_arch_switch | 681 | #ifndef finish_arch_switch |
693 | # define finish_arch_switch(prev) do { } while (0) | 682 | # define finish_arch_switch(prev) do { } while (0) |
694 | #endif | 683 | #endif |
684 | #ifndef finish_arch_post_lock_switch | ||
685 | # define finish_arch_post_lock_switch() do { } while (0) | ||
686 | #endif | ||
695 | 687 | ||
696 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 688 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
697 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 689 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
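The sched.h hunk follows the jump-label rename: struct jump_label_key becomes struct static_key and static_branch() splits into static_key_true()/static_key_false(). A minimal sketch of the renamed API from a caller's point of view; the key and the functions around it are invented for illustration:

#include <linux/printk.h>
#include <linux/static_key.h>

static struct static_key my_fast_path = STATIC_KEY_INIT_FALSE;

static void maybe_take_fast_path(void)
{
	/* Default-false key: the branch body stays out of line until enabled. */
	if (static_key_false(&my_fast_path))
		pr_info("fast path taken\n");
}

static void enable_fast_path(void)
{
	static_key_slow_inc(&my_fast_path);	/* patches the branch site in */
}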
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 2a581ba8e190..903ffa9e8872 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
@@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
32 | 32 | ||
33 | /* runqueue-specific stats */ | 33 | /* runqueue-specific stats */ |
34 | seq_printf(seq, | 34 | seq_printf(seq, |
35 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | 35 | "cpu%d %u 0 %u %u %u %u %llu %llu %lu", |
36 | cpu, rq->yld_count, | 36 | cpu, rq->yld_count, |
37 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | 37 | rq->sched_count, rq->sched_goidle, |
38 | rq->ttwu_count, rq->ttwu_local, | 38 | rq->ttwu_count, rq->ttwu_local, |
39 | rq->rq_cpu_time, | 39 | rq->rq_cpu_time, |
40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | 40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); |
diff --git a/kernel/signal.c b/kernel/signal.c index c73c4284160e..17afcaf582d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
37 | #include <asm/unistd.h> | 37 | #include <asm/unistd.h> |
38 | #include <asm/siginfo.h> | 38 | #include <asm/siginfo.h> |
39 | #include <asm/cacheflush.h> | ||
39 | #include "audit.h" /* audit_signal_info() */ | 40 | #include "audit.h" /* audit_signal_info() */ |
40 | 41 | ||
41 | /* | 42 | /* |
@@ -58,21 +59,20 @@ static int sig_handler_ignored(void __user *handler, int sig) | |||
58 | (handler == SIG_DFL && sig_kernel_ignore(sig)); | 59 | (handler == SIG_DFL && sig_kernel_ignore(sig)); |
59 | } | 60 | } |
60 | 61 | ||
61 | static int sig_task_ignored(struct task_struct *t, int sig, | 62 | static int sig_task_ignored(struct task_struct *t, int sig, bool force) |
62 | int from_ancestor_ns) | ||
63 | { | 63 | { |
64 | void __user *handler; | 64 | void __user *handler; |
65 | 65 | ||
66 | handler = sig_handler(t, sig); | 66 | handler = sig_handler(t, sig); |
67 | 67 | ||
68 | if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && | 68 | if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && |
69 | handler == SIG_DFL && !from_ancestor_ns) | 69 | handler == SIG_DFL && !force) |
70 | return 1; | 70 | return 1; |
71 | 71 | ||
72 | return sig_handler_ignored(handler, sig); | 72 | return sig_handler_ignored(handler, sig); |
73 | } | 73 | } |
74 | 74 | ||
75 | static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) | 75 | static int sig_ignored(struct task_struct *t, int sig, bool force) |
76 | { | 76 | { |
77 | /* | 77 | /* |
78 | * Blocked signals are never ignored, since the | 78 | * Blocked signals are never ignored, since the |
@@ -82,7 +82,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) | |||
82 | if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) | 82 | if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) |
83 | return 0; | 83 | return 0; |
84 | 84 | ||
85 | if (!sig_task_ignored(t, sig, from_ancestor_ns)) | 85 | if (!sig_task_ignored(t, sig, force)) |
86 | return 0; | 86 | return 0; |
87 | 87 | ||
88 | /* | 88 | /* |
@@ -855,7 +855,7 @@ static void ptrace_trap_notify(struct task_struct *t) | |||
855 | * Returns true if the signal should be actually delivered, otherwise | 855 | * Returns true if the signal should be actually delivered, otherwise |
856 | * it should be dropped. | 856 | * it should be dropped. |
857 | */ | 857 | */ |
858 | static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | 858 | static int prepare_signal(int sig, struct task_struct *p, bool force) |
859 | { | 859 | { |
860 | struct signal_struct *signal = p->signal; | 860 | struct signal_struct *signal = p->signal; |
861 | struct task_struct *t; | 861 | struct task_struct *t; |
@@ -915,7 +915,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
915 | } | 915 | } |
916 | } | 916 | } |
917 | 917 | ||
918 | return !sig_ignored(p, sig, from_ancestor_ns); | 918 | return !sig_ignored(p, sig, force); |
919 | } | 919 | } |
920 | 920 | ||
921 | /* | 921 | /* |
@@ -1054,13 +1054,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1054 | struct sigpending *pending; | 1054 | struct sigpending *pending; |
1055 | struct sigqueue *q; | 1055 | struct sigqueue *q; |
1056 | int override_rlimit; | 1056 | int override_rlimit; |
1057 | 1057 | int ret = 0, result; | |
1058 | trace_signal_generate(sig, info, t); | ||
1059 | 1058 | ||
1060 | assert_spin_locked(&t->sighand->siglock); | 1059 | assert_spin_locked(&t->sighand->siglock); |
1061 | 1060 | ||
1062 | if (!prepare_signal(sig, t, from_ancestor_ns)) | 1061 | result = TRACE_SIGNAL_IGNORED; |
1063 | return 0; | 1062 | if (!prepare_signal(sig, t, |
1063 | from_ancestor_ns || (info == SEND_SIG_FORCED))) | ||
1064 | goto ret; | ||
1064 | 1065 | ||
1065 | pending = group ? &t->signal->shared_pending : &t->pending; | 1066 | pending = group ? &t->signal->shared_pending : &t->pending; |
1066 | /* | 1067 | /* |
@@ -1068,8 +1069,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1068 | * exactly one non-rt signal, so that we can get more | 1069 | * exactly one non-rt signal, so that we can get more |
1069 | * detailed information about the cause of the signal. | 1070 | * detailed information about the cause of the signal. |
1070 | */ | 1071 | */ |
1072 | result = TRACE_SIGNAL_ALREADY_PENDING; | ||
1071 | if (legacy_queue(pending, sig)) | 1073 | if (legacy_queue(pending, sig)) |
1072 | return 0; | 1074 | goto ret; |
1075 | |||
1076 | result = TRACE_SIGNAL_DELIVERED; | ||
1073 | /* | 1077 | /* |
1074 | * fast-pathed signals for kernel-internal things like SIGSTOP | 1078 | * fast-pathed signals for kernel-internal things like SIGSTOP |
1075 | * or SIGKILL. | 1079 | * or SIGKILL. |
@@ -1127,14 +1131,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1127 | * signal was rt and sent by user using something | 1131 | * signal was rt and sent by user using something |
1128 | * other than kill(). | 1132 | * other than kill(). |
1129 | */ | 1133 | */ |
1130 | trace_signal_overflow_fail(sig, group, info); | 1134 | result = TRACE_SIGNAL_OVERFLOW_FAIL; |
1131 | return -EAGAIN; | 1135 | ret = -EAGAIN; |
1136 | goto ret; | ||
1132 | } else { | 1137 | } else { |
1133 | /* | 1138 | /* |
1134 | * This is a silent loss of information. We still | 1139 | * This is a silent loss of information. We still |
1135 | * send the signal, but the *info bits are lost. | 1140 | * send the signal, but the *info bits are lost. |
1136 | */ | 1141 | */ |
1137 | trace_signal_lose_info(sig, group, info); | 1142 | result = TRACE_SIGNAL_LOSE_INFO; |
1138 | } | 1143 | } |
1139 | } | 1144 | } |
1140 | 1145 | ||
@@ -1142,7 +1147,9 @@ out_set: | |||
1142 | signalfd_notify(t, sig); | 1147 | signalfd_notify(t, sig); |
1143 | sigaddset(&pending->signal, sig); | 1148 | sigaddset(&pending->signal, sig); |
1144 | complete_signal(sig, t, group); | 1149 | complete_signal(sig, t, group); |
1145 | return 0; | 1150 | ret: |
1151 | trace_signal_generate(sig, info, t, group, result); | ||
1152 | return ret; | ||
1146 | } | 1153 | } |
1147 | 1154 | ||
1148 | static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | 1155 | static int send_signal(int sig, struct siginfo *info, struct task_struct *t, |
@@ -1585,7 +1592,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) | |||
1585 | int sig = q->info.si_signo; | 1592 | int sig = q->info.si_signo; |
1586 | struct sigpending *pending; | 1593 | struct sigpending *pending; |
1587 | unsigned long flags; | 1594 | unsigned long flags; |
1588 | int ret; | 1595 | int ret, result; |
1589 | 1596 | ||
1590 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1597 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1591 | 1598 | ||
@@ -1594,7 +1601,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) | |||
1594 | goto ret; | 1601 | goto ret; |
1595 | 1602 | ||
1596 | ret = 1; /* the signal is ignored */ | 1603 | ret = 1; /* the signal is ignored */ |
1597 | if (!prepare_signal(sig, t, 0)) | 1604 | result = TRACE_SIGNAL_IGNORED; |
1605 | if (!prepare_signal(sig, t, false)) | ||
1598 | goto out; | 1606 | goto out; |
1599 | 1607 | ||
1600 | ret = 0; | 1608 | ret = 0; |
@@ -1605,6 +1613,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) | |||
1605 | */ | 1613 | */ |
1606 | BUG_ON(q->info.si_code != SI_TIMER); | 1614 | BUG_ON(q->info.si_code != SI_TIMER); |
1607 | q->info.si_overrun++; | 1615 | q->info.si_overrun++; |
1616 | result = TRACE_SIGNAL_ALREADY_PENDING; | ||
1608 | goto out; | 1617 | goto out; |
1609 | } | 1618 | } |
1610 | q->info.si_overrun = 0; | 1619 | q->info.si_overrun = 0; |
@@ -1614,7 +1623,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) | |||
1614 | list_add_tail(&q->list, &pending->list); | 1623 | list_add_tail(&q->list, &pending->list); |
1615 | sigaddset(&pending->signal, sig); | 1624 | sigaddset(&pending->signal, sig); |
1616 | complete_signal(sig, t, group); | 1625 | complete_signal(sig, t, group); |
1626 | result = TRACE_SIGNAL_DELIVERED; | ||
1617 | out: | 1627 | out: |
1628 | trace_signal_generate(sig, &q->info, t, group, result); | ||
1618 | unlock_task_sighand(t, &flags); | 1629 | unlock_task_sighand(t, &flags); |
1619 | ret: | 1630 | ret: |
1620 | return ret; | 1631 | return ret; |
@@ -1642,6 +1653,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1642 | BUG_ON(!tsk->ptrace && | 1653 | BUG_ON(!tsk->ptrace && |
1643 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); | 1654 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); |
1644 | 1655 | ||
1656 | if (sig != SIGCHLD) { | ||
1657 | /* | ||
1658 | * This is only possible if parent == real_parent. | ||
1659 | * Check if it has changed security domain. | ||
1660 | */ | ||
1661 | if (tsk->parent_exec_id != tsk->parent->self_exec_id) | ||
1662 | sig = SIGCHLD; | ||
1663 | } | ||
1664 | |||
1645 | info.si_signo = sig; | 1665 | info.si_signo = sig; |
1646 | info.si_errno = 0; | 1666 | info.si_errno = 0; |
1647 | /* | 1667 | /* |
diff --git a/kernel/smp.c b/kernel/smp.c index db197d60489b..2f8b10ecf759 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) | |||
701 | return ret; | 701 | return ret; |
702 | } | 702 | } |
703 | EXPORT_SYMBOL(on_each_cpu); | 703 | EXPORT_SYMBOL(on_each_cpu); |
704 | |||
705 | /** | ||
706 | * on_each_cpu_mask(): Run a function on processors specified by | ||
707 | * cpumask, which may include the local processor. | ||
708 | * @mask: The set of cpus to run on (only runs on online subset). | ||
709 | * @func: The function to run. This must be fast and non-blocking. | ||
710 | * @info: An arbitrary pointer to pass to the function. | ||
711 | * @wait: If true, wait (atomically) until function has completed | ||
712 | * on other CPUs. | ||
713 | * | ||
714 | * If @wait is true, then returns once @func has returned. | ||
715 | * | ||
716 | * You must not call this function with disabled interrupts or | ||
717 | * from a hardware interrupt handler or from a bottom half handler. | ||
718 | */ | ||
719 | void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, | ||
720 | void *info, bool wait) | ||
721 | { | ||
722 | int cpu = get_cpu(); | ||
723 | |||
724 | smp_call_function_many(mask, func, info, wait); | ||
725 | if (cpumask_test_cpu(cpu, mask)) { | ||
726 | local_irq_disable(); | ||
727 | func(info); | ||
728 | local_irq_enable(); | ||
729 | } | ||
730 | put_cpu(); | ||
731 | } | ||
732 | EXPORT_SYMBOL(on_each_cpu_mask); | ||
733 | |||
734 | /* | ||
735 | * on_each_cpu_cond(): Call a function on each processor for which | ||
736 | * the supplied function cond_func returns true, optionally waiting | ||
737 | * for all the required CPUs to finish. This may include the local | ||
738 | * processor. | ||
739 | * @cond_func: A callback function that is passed a cpu id and | ||
740 | * the info parameter. The function is called | ||
741 | * with preemption disabled. The function should | ||
742 | * return a boolean value indicating whether to IPI | ||
743 | * the specified CPU. | ||
744 | * @func: The function to run on all applicable CPUs. | ||
745 | * This must be fast and non-blocking. | ||
746 | * @info: An arbitrary pointer to pass to both functions. | ||
747 | * @wait: If true, wait (atomically) until function has | ||
748 | * completed on other CPUs. | ||
749 | * @gfp_flags: GFP flags to use when allocating the cpumask | ||
750 | * used internally by the function. | ||
751 | * | ||
752 | * The function might sleep if the GFP flags indicate a non | ||
753 | * atomic allocation is allowed. | ||
754 | * | ||
755 | * Preemption is disabled to protect against CPUs going offline but not online. | ||
756 | * CPUs going online during the call will not be seen or sent an IPI. | ||
757 | * | ||
758 | * You must not call this function with disabled interrupts or | ||
759 | * from a hardware interrupt handler or from a bottom half handler. | ||
760 | */ | ||
761 | void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | ||
762 | smp_call_func_t func, void *info, bool wait, | ||
763 | gfp_t gfp_flags) | ||
764 | { | ||
765 | cpumask_var_t cpus; | ||
766 | int cpu, ret; | ||
767 | |||
768 | might_sleep_if(gfp_flags & __GFP_WAIT); | ||
769 | |||
770 | if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { | ||
771 | preempt_disable(); | ||
772 | for_each_online_cpu(cpu) | ||
773 | if (cond_func(cpu, info)) | ||
774 | cpumask_set_cpu(cpu, cpus); | ||
775 | on_each_cpu_mask(cpus, func, info, wait); | ||
776 | preempt_enable(); | ||
777 | free_cpumask_var(cpus); | ||
778 | } else { | ||
779 | /* | ||
780 | * No free cpumask, bother. No matter, we'll | ||
781 | * just have to IPI them one by one. | ||
782 | */ | ||
783 | preempt_disable(); | ||
784 | for_each_online_cpu(cpu) | ||
785 | if (cond_func(cpu, info)) { | ||
786 | ret = smp_call_function_single(cpu, func, | ||
787 | info, wait); | ||
788 | WARN_ON_ONCE(!ret); | ||
789 | } | ||
790 | preempt_enable(); | ||
791 | } | ||
792 | } | ||
793 | EXPORT_SYMBOL(on_each_cpu_cond); | ||
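The smp.c hunk adds on_each_cpu_mask() and on_each_cpu_cond(). A hedged usage sketch of the conditional variant; cache_is_dirty() and flush_local_cache() are invented placeholders for some per-CPU resource:

#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/types.h>

static void flush_local_cache(void *info)
{
	/* Runs on every selected CPU with interrupts disabled. */
}

static bool cache_is_dirty(int cpu, void *info)
{
	/* Called with preemption disabled; return true to IPI @cpu. */
	return true;
}

static void flush_all_dirty_caches(void)
{
	/* May sleep: GFP_KERNEL lets the internal cpumask allocation block. */
	on_each_cpu_cond(cache_is_dirty, flush_local_cache, NULL, true, GFP_KERNEL);
}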
diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..671f9594e368 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -297,7 +297,7 @@ void irq_enter(void) | |||
297 | int cpu = smp_processor_id(); | 297 | int cpu = smp_processor_id(); |
298 | 298 | ||
299 | rcu_irq_enter(); | 299 | rcu_irq_enter(); |
300 | if (idle_cpu(cpu) && !in_interrupt()) { | 300 | if (is_idle_task(current) && !in_interrupt()) { |
301 | /* | 301 | /* |
302 | * Prevent raise_softirq from needlessly waking up ksoftirqd | 302 | * Prevent raise_softirq from needlessly waking up ksoftirqd |
303 | * here, as softirq will be serviced on return from interrupt. | 303 | * here, as softirq will be serviced on return from interrupt. |
@@ -310,31 +310,21 @@ void irq_enter(void) | |||
310 | __irq_enter(); | 310 | __irq_enter(); |
311 | } | 311 | } |
312 | 312 | ||
313 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | ||
314 | static inline void invoke_softirq(void) | 313 | static inline void invoke_softirq(void) |
315 | { | 314 | { |
316 | if (!force_irqthreads) | 315 | if (!force_irqthreads) { |
316 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | ||
317 | __do_softirq(); | 317 | __do_softirq(); |
318 | else { | ||
319 | __local_bh_disable((unsigned long)__builtin_return_address(0), | ||
320 | SOFTIRQ_OFFSET); | ||
321 | wakeup_softirqd(); | ||
322 | __local_bh_enable(SOFTIRQ_OFFSET); | ||
323 | } | ||
324 | } | ||
325 | #else | 318 | #else |
326 | static inline void invoke_softirq(void) | ||
327 | { | ||
328 | if (!force_irqthreads) | ||
329 | do_softirq(); | 319 | do_softirq(); |
330 | else { | 320 | #endif |
321 | } else { | ||
331 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 322 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
332 | SOFTIRQ_OFFSET); | 323 | SOFTIRQ_OFFSET); |
333 | wakeup_softirqd(); | 324 | wakeup_softirqd(); |
334 | __local_bh_enable(SOFTIRQ_OFFSET); | 325 | __local_bh_enable(SOFTIRQ_OFFSET); |
335 | } | 326 | } |
336 | } | 327 | } |
337 | #endif | ||
338 | 328 | ||
339 | /* | 329 | /* |
340 | * Exit an interrupt context. Process softirqs if needed and possible: | 330 | * Exit an interrupt context. Process softirqs if needed and possible: |
@@ -353,7 +343,7 @@ void irq_exit(void) | |||
353 | tick_nohz_irq_exit(); | 343 | tick_nohz_irq_exit(); |
354 | #endif | 344 | #endif |
355 | rcu_irq_exit(); | 345 | rcu_irq_exit(); |
356 | preempt_enable_no_resched(); | 346 | sched_preempt_enable_no_resched(); |
357 | } | 347 | } |
358 | 348 | ||
359 | /* | 349 | /* |
@@ -385,6 +375,12 @@ void raise_softirq(unsigned int nr) | |||
385 | local_irq_restore(flags); | 375 | local_irq_restore(flags); |
386 | } | 376 | } |
387 | 377 | ||
378 | void __raise_softirq_irqoff(unsigned int nr) | ||
379 | { | ||
380 | trace_softirq_raise(nr); | ||
381 | or_softirq_pending(1UL << nr); | ||
382 | } | ||
383 | |||
388 | void open_softirq(int nr, void (*action)(struct softirq_action *)) | 384 | void open_softirq(int nr, void (*action)(struct softirq_action *)) |
389 | { | 385 | { |
390 | softirq_vec[nr].action = action; | 386 | softirq_vec[nr].action = action; |
@@ -744,9 +740,7 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
744 | while (!kthread_should_stop()) { | 740 | while (!kthread_should_stop()) { |
745 | preempt_disable(); | 741 | preempt_disable(); |
746 | if (!local_softirq_pending()) { | 742 | if (!local_softirq_pending()) { |
747 | preempt_enable_no_resched(); | 743 | schedule_preempt_disabled(); |
748 | schedule(); | ||
749 | preempt_disable(); | ||
750 | } | 744 | } |
751 | 745 | ||
752 | __set_current_state(TASK_RUNNING); | 746 | __set_current_state(TASK_RUNNING); |
@@ -761,7 +755,7 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
761 | if (local_softirq_pending()) | 755 | if (local_softirq_pending()) |
762 | __do_softirq(); | 756 | __do_softirq(); |
763 | local_irq_enable(); | 757 | local_irq_enable(); |
764 | preempt_enable_no_resched(); | 758 | sched_preempt_enable_no_resched(); |
765 | cond_resched(); | 759 | cond_resched(); |
766 | preempt_disable(); | 760 | preempt_disable(); |
767 | rcu_note_context_switch((long)__bind_cpu); | 761 | rcu_note_context_switch((long)__bind_cpu); |
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 84c7d96918bf..5cdd8065a3ce 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) | |||
163 | EXPORT_SYMBOL(_raw_spin_lock_bh); | 163 | EXPORT_SYMBOL(_raw_spin_lock_bh); |
164 | #endif | 164 | #endif |
165 | 165 | ||
166 | #ifndef CONFIG_INLINE_SPIN_UNLOCK | 166 | #ifdef CONFIG_UNINLINE_SPIN_UNLOCK |
167 | void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) | 167 | void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) |
168 | { | 168 | { |
169 | __raw_spin_unlock(lock); | 169 | __raw_spin_unlock(lock); |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 0febf61e1aa3..ba35f3a4a1f4 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
172 | { | 172 | { |
173 | int idx; | 173 | int idx; |
174 | 174 | ||
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | ||
176 | !lock_is_held(&rcu_bh_lock_map) && | ||
177 | !lock_is_held(&rcu_lock_map) && | ||
178 | !lock_is_held(&rcu_sched_lock_map), | ||
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
180 | |||
175 | idx = sp->completed; | 181 | idx = sp->completed; |
176 | mutex_lock(&sp->mutex); | 182 | mutex_lock(&sp->mutex); |
177 | 183 | ||
@@ -280,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp) | |||
280 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 286 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
281 | 287 | ||
282 | /** | 288 | /** |
283 | * synchronize_srcu_expedited - like synchronize_srcu, but less patient | 289 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
284 | * @sp: srcu_struct with which to synchronize. | 290 | * @sp: srcu_struct with which to synchronize. |
285 | * | 291 | * |
286 | * Flip the completed counter, and wait for the old count to drain to zero. | 292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" |
287 | * As with classic RCU, the updater must use some separate means of | 293 | * approach to force the grace period to end quickly. This consumes |
288 | * synchronizing concurrent updates. Can block; must be called from | 294 | * significant time on all CPUs and is unfriendly to real-time workloads, |
289 | * process context. | 295 | * so is thus not recommended for any sort of common-case code. In fact, |
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
290 | * | 299 | * |
291 | * Note that it is illegal to call synchronize_srcu_expedited() | 300 | * Note that it is illegal to call this function while holding any lock |
292 | * from the corresponding SRCU read-side critical section; doing so | 301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal |
293 | * will result in deadlock. However, it is perfectly legal to call | 302 | * to call this function from a CPU-hotplug notifier. Failing to observe |
294 | * synchronize_srcu_expedited() on one srcu_struct from some other | 303 | * these restrictions will result in deadlock. It is also illegal to call |
295 | * srcu_struct's read-side critical section. | 304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | ||
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | ||
307 | * from some other srcu_struct's read-side critical section, as long as | ||
308 | * the resulting graph of srcu_structs is acyclic. | ||
296 | */ | 309 | */ |
297 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 310 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
298 | { | 311 | { |
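The rewritten kernel-doc above recommends batching updates behind one synchronize_srcu() instead of looping over synchronize_srcu_expedited(). A hedged sketch of that shape; my_srcu, cur_table and struct my_table are invented for the example:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_table {
	int nr;
	int vals[];
};

static struct my_table __rcu *cur_table;
static struct srcu_struct my_srcu;

/* Swap the whole table once, wait for one grace period, free once --
 * rather than paying an expedited grace period per element. */
static void replace_table(struct my_table *new_tbl)
{
	struct my_table *old;

	old = rcu_dereference_protected(cur_table, 1);
	rcu_assign_pointer(cur_table, new_tbl);
	synchronize_srcu(&my_srcu);	/* every SRCU reader of @old has finished */
	kfree(old);
}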
diff --git a/kernel/sys.c b/kernel/sys.c index 40701538fbd1..e7006eb6c1e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | |||
444 | magic2 != LINUX_REBOOT_MAGIC2C)) | 444 | magic2 != LINUX_REBOOT_MAGIC2C)) |
445 | return -EINVAL; | 445 | return -EINVAL; |
446 | 446 | ||
447 | /* | ||
448 | * If pid namespaces are enabled and the current task is in a child | ||
449 | * pid_namespace, the command is handled by reboot_pid_ns() which will | ||
450 | * call do_exit(). | ||
451 | */ | ||
452 | ret = reboot_pid_ns(task_active_pid_ns(current), cmd); | ||
453 | if (ret) | ||
454 | return ret; | ||
455 | |||
447 | /* Instead of trying to make the power_off code look like | 456 | /* Instead of trying to make the power_off code look like |
448 | * halt when pm_power_off is not set do it the easy way. | 457 | * halt when pm_power_off is not set do it the easy way. |
449 | */ | 458 | */ |
@@ -1706,7 +1715,7 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1706 | if (arg4 | arg5) | 1715 | if (arg4 | arg5) |
1707 | return -EINVAL; | 1716 | return -EINVAL; |
1708 | 1717 | ||
1709 | if (!capable(CAP_SYS_ADMIN)) | 1718 | if (!capable(CAP_SYS_RESOURCE)) |
1710 | return -EPERM; | 1719 | return -EPERM; |
1711 | 1720 | ||
1712 | if (addr >= TASK_SIZE) | 1721 | if (addr >= TASK_SIZE) |
@@ -1962,6 +1971,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1962 | case PR_SET_MM: | 1971 | case PR_SET_MM: |
1963 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | 1972 | error = prctl_set_mm(arg2, arg3, arg4, arg5); |
1964 | break; | 1973 | break; |
1974 | case PR_SET_CHILD_SUBREAPER: | ||
1975 | me->signal->is_child_subreaper = !!arg2; | ||
1976 | error = 0; | ||
1977 | break; | ||
1978 | case PR_GET_CHILD_SUBREAPER: | ||
1979 | error = put_user(me->signal->is_child_subreaper, | ||
1980 | (int __user *) arg2); | ||
1981 | break; | ||
1965 | default: | 1982 | default: |
1966 | error = -EINVAL; | 1983 | error = -EINVAL; |
1967 | break; | 1984 | break; |
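The prctl() hunk above wires up PR_SET_CHILD_SUBREAPER and PR_GET_CHILD_SUBREAPER. A hedged userspace sketch of how a session or service manager might use them; the fallback #defines assume the values (36/37) introduced alongside this change, since older sys/prctl.h headers do not provide them:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER	36
#define PR_GET_CHILD_SUBREAPER	37
#endif

int main(void)
{
	int flag = 0;

	/* Orphaned descendants will now be reparented to this process
	 * instead of to init, so it can wait() on them. */
	if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0))
		perror("PR_SET_CHILD_SUBREAPER");

	prctl(PR_GET_CHILD_SUBREAPER, &flag, 0, 0, 0);
	printf("child-subreaper flag: %d\n", flag);
	return 0;
}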
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..52b3a06a02f8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
26 | #include <linux/bitmap.h> | ||
26 | #include <linux/signal.h> | 27 | #include <linux/signal.h> |
27 | #include <linux/printk.h> | 28 | #include <linux/printk.h> |
28 | #include <linux/proc_fs.h> | 29 | #include <linux/proc_fs.h> |
@@ -58,6 +59,7 @@ | |||
58 | #include <linux/oom.h> | 59 | #include <linux/oom.h> |
59 | #include <linux/kmod.h> | 60 | #include <linux/kmod.h> |
60 | #include <linux/capability.h> | 61 | #include <linux/capability.h> |
62 | #include <linux/binfmts.h> | ||
61 | 63 | ||
62 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
63 | #include <asm/processor.h> | 65 | #include <asm/processor.h> |
@@ -67,6 +69,9 @@ | |||
67 | #include <asm/stacktrace.h> | 69 | #include <asm/stacktrace.h> |
68 | #include <asm/io.h> | 70 | #include <asm/io.h> |
69 | #endif | 71 | #endif |
72 | #ifdef CONFIG_SPARC | ||
73 | #include <asm/setup.h> | ||
74 | #endif | ||
70 | #ifdef CONFIG_BSD_PROCESS_ACCT | 75 | #ifdef CONFIG_BSD_PROCESS_ACCT |
71 | #include <linux/acct.h> | 76 | #include <linux/acct.h> |
72 | #endif | 77 | #endif |
@@ -141,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP; | |||
141 | #include <linux/inotify.h> | 146 | #include <linux/inotify.h> |
142 | #endif | 147 | #endif |
143 | #ifdef CONFIG_SPARC | 148 | #ifdef CONFIG_SPARC |
144 | #include <asm/system.h> | ||
145 | #endif | 149 | #endif |
146 | 150 | ||
147 | #ifdef CONFIG_SPARC64 | 151 | #ifdef CONFIG_SPARC64 |
@@ -192,20 +196,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, | |||
192 | 196 | ||
193 | #endif | 197 | #endif |
194 | 198 | ||
195 | static struct ctl_table root_table[]; | ||
196 | static struct ctl_table_root sysctl_table_root; | ||
197 | static struct ctl_table_header root_table_header = { | ||
198 | {{.count = 1, | ||
199 | .ctl_table = root_table, | ||
200 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, | ||
201 | .root = &sysctl_table_root, | ||
202 | .set = &sysctl_table_root.default_set, | ||
203 | }; | ||
204 | static struct ctl_table_root sysctl_table_root = { | ||
205 | .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), | ||
206 | .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), | ||
207 | }; | ||
208 | |||
209 | static struct ctl_table kern_table[]; | 199 | static struct ctl_table kern_table[]; |
210 | static struct ctl_table vm_table[]; | 200 | static struct ctl_table vm_table[]; |
211 | static struct ctl_table fs_table[]; | 201 | static struct ctl_table fs_table[]; |
@@ -222,7 +212,7 @@ int sysctl_legacy_va_layout; | |||
222 | 212 | ||
223 | /* The default sysctl tables: */ | 213 | /* The default sysctl tables: */ |
224 | 214 | ||
225 | static struct ctl_table root_table[] = { | 215 | static struct ctl_table sysctl_base_table[] = { |
226 | { | 216 | { |
227 | .procname = "kernel", | 217 | .procname = "kernel", |
228 | .mode = 0555, | 218 | .mode = 0555, |
@@ -1559,490 +1549,12 @@ static struct ctl_table dev_table[] = { | |||
1559 | { } | 1549 | { } |
1560 | }; | 1550 | }; |
1561 | 1551 | ||
1562 | static DEFINE_SPINLOCK(sysctl_lock); | 1552 | int __init sysctl_init(void) |
1563 | |||
1564 | /* called under sysctl_lock */ | ||
1565 | static int use_table(struct ctl_table_header *p) | ||
1566 | { | 1553 | { |
1567 | if (unlikely(p->unregistering)) | 1554 | register_sysctl_table(sysctl_base_table); |
1568 | return 0; | ||
1569 | p->used++; | ||
1570 | return 1; | ||
1571 | } | ||
1572 | |||
1573 | /* called under sysctl_lock */ | ||
1574 | static void unuse_table(struct ctl_table_header *p) | ||
1575 | { | ||
1576 | if (!--p->used) | ||
1577 | if (unlikely(p->unregistering)) | ||
1578 | complete(p->unregistering); | ||
1579 | } | ||
1580 | |||
1581 | /* called under sysctl_lock, will reacquire if has to wait */ | ||
1582 | static void start_unregistering(struct ctl_table_header *p) | ||
1583 | { | ||
1584 | /* | ||
1585 | * if p->used is 0, nobody will ever touch that entry again; | ||
1586 | * we'll eliminate all paths to it before dropping sysctl_lock | ||
1587 | */ | ||
1588 | if (unlikely(p->used)) { | ||
1589 | struct completion wait; | ||
1590 | init_completion(&wait); | ||
1591 | p->unregistering = &wait; | ||
1592 | spin_unlock(&sysctl_lock); | ||
1593 | wait_for_completion(&wait); | ||
1594 | spin_lock(&sysctl_lock); | ||
1595 | } else { | ||
1596 | /* anything non-NULL; we'll never dereference it */ | ||
1597 | p->unregistering = ERR_PTR(-EINVAL); | ||
1598 | } | ||
1599 | /* | ||
1600 | * do not remove from the list until nobody holds it; walking the | ||
1601 | * list in do_sysctl() relies on that. | ||
1602 | */ | ||
1603 | list_del_init(&p->ctl_entry); | ||
1604 | } | ||
1605 | |||
1606 | void sysctl_head_get(struct ctl_table_header *head) | ||
1607 | { | ||
1608 | spin_lock(&sysctl_lock); | ||
1609 | head->count++; | ||
1610 | spin_unlock(&sysctl_lock); | ||
1611 | } | ||
1612 | |||
1613 | void sysctl_head_put(struct ctl_table_header *head) | ||
1614 | { | ||
1615 | spin_lock(&sysctl_lock); | ||
1616 | if (!--head->count) | ||
1617 | kfree_rcu(head, rcu); | ||
1618 | spin_unlock(&sysctl_lock); | ||
1619 | } | ||
1620 | |||
1621 | struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) | ||
1622 | { | ||
1623 | if (!head) | ||
1624 | BUG(); | ||
1625 | spin_lock(&sysctl_lock); | ||
1626 | if (!use_table(head)) | ||
1627 | head = ERR_PTR(-ENOENT); | ||
1628 | spin_unlock(&sysctl_lock); | ||
1629 | return head; | ||
1630 | } | ||
1631 | |||
1632 | void sysctl_head_finish(struct ctl_table_header *head) | ||
1633 | { | ||
1634 | if (!head) | ||
1635 | return; | ||
1636 | spin_lock(&sysctl_lock); | ||
1637 | unuse_table(head); | ||
1638 | spin_unlock(&sysctl_lock); | ||
1639 | } | ||
1640 | |||
1641 | static struct ctl_table_set * | ||
1642 | lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) | ||
1643 | { | ||
1644 | struct ctl_table_set *set = &root->default_set; | ||
1645 | if (root->lookup) | ||
1646 | set = root->lookup(root, namespaces); | ||
1647 | return set; | ||
1648 | } | ||
1649 | |||
1650 | static struct list_head * | ||
1651 | lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) | ||
1652 | { | ||
1653 | struct ctl_table_set *set = lookup_header_set(root, namespaces); | ||
1654 | return &set->list; | ||
1655 | } | ||
1656 | |||
1657 | struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, | ||
1658 | struct ctl_table_header *prev) | ||
1659 | { | ||
1660 | struct ctl_table_root *root; | ||
1661 | struct list_head *header_list; | ||
1662 | struct ctl_table_header *head; | ||
1663 | struct list_head *tmp; | ||
1664 | |||
1665 | spin_lock(&sysctl_lock); | ||
1666 | if (prev) { | ||
1667 | head = prev; | ||
1668 | tmp = &prev->ctl_entry; | ||
1669 | unuse_table(prev); | ||
1670 | goto next; | ||
1671 | } | ||
1672 | tmp = &root_table_header.ctl_entry; | ||
1673 | for (;;) { | ||
1674 | head = list_entry(tmp, struct ctl_table_header, ctl_entry); | ||
1675 | |||
1676 | if (!use_table(head)) | ||
1677 | goto next; | ||
1678 | spin_unlock(&sysctl_lock); | ||
1679 | return head; | ||
1680 | next: | ||
1681 | root = head->root; | ||
1682 | tmp = tmp->next; | ||
1683 | header_list = lookup_header_list(root, namespaces); | ||
1684 | if (tmp != header_list) | ||
1685 | continue; | ||
1686 | |||
1687 | do { | ||
1688 | root = list_entry(root->root_list.next, | ||
1689 | struct ctl_table_root, root_list); | ||
1690 | if (root == &sysctl_table_root) | ||
1691 | goto out; | ||
1692 | header_list = lookup_header_list(root, namespaces); | ||
1693 | } while (list_empty(header_list)); | ||
1694 | tmp = header_list->next; | ||
1695 | } | ||
1696 | out: | ||
1697 | spin_unlock(&sysctl_lock); | ||
1698 | return NULL; | ||
1699 | } | ||
1700 | |||
1701 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | ||
1702 | { | ||
1703 | return __sysctl_head_next(current->nsproxy, prev); | ||
1704 | } | ||
1705 | |||
1706 | void register_sysctl_root(struct ctl_table_root *root) | ||
1707 | { | ||
1708 | spin_lock(&sysctl_lock); | ||
1709 | list_add_tail(&root->root_list, &sysctl_table_root.root_list); | ||
1710 | spin_unlock(&sysctl_lock); | ||
1711 | } | ||
1712 | |||
1713 | /* | ||
1714 | * sysctl_perm does NOT grant the superuser all rights automatically, because | ||
1715 | * some sysctl variables are readonly even to root. | ||
1716 | */ | ||
1717 | |||
1718 | static int test_perm(int mode, int op) | ||
1719 | { | ||
1720 | if (!current_euid()) | ||
1721 | mode >>= 6; | ||
1722 | else if (in_egroup_p(0)) | ||
1723 | mode >>= 3; | ||
1724 | if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) | ||
1725 | return 0; | ||
1726 | return -EACCES; | ||
1727 | } | ||
1728 | |||
1729 | int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) | ||
1730 | { | ||
1731 | int mode; | ||
1732 | |||
1733 | if (root->permissions) | ||
1734 | mode = root->permissions(root, current->nsproxy, table); | ||
1735 | else | ||
1736 | mode = table->mode; | ||
1737 | |||
1738 | return test_perm(mode, op); | ||
1739 | } | ||
1740 | |||
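A worked example of the check above: with .mode = 0644 and op = MAY_READ|MAY_WRITE, a caller with euid 0 shifts the mode right by six bits and tests against 06, so both read and write pass; a caller whose effective group is 0 shifts by three and is left with only the read bit; any other caller likewise sees 04 and gets read-only access. The superuser is deliberately not special-cased beyond that owner shift, which is the point of the comment above.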
1741 | static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) | ||
1742 | { | ||
1743 | for (; table->procname; table++) { | ||
1744 | table->parent = parent; | ||
1745 | if (table->child) | ||
1746 | sysctl_set_parent(table, table->child); | ||
1747 | } | ||
1748 | } | ||
1749 | |||
1750 | static __init int sysctl_init(void) | ||
1751 | { | ||
1752 | sysctl_set_parent(NULL, root_table); | ||
1753 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK | ||
1754 | sysctl_check_table(current->nsproxy, root_table); | ||
1755 | #endif | ||
1756 | return 0; | 1555 | return 0; |
1757 | } | 1556 | } |
1758 | 1557 | ||
1759 | core_initcall(sysctl_init); | ||
1760 | |||
1761 | static struct ctl_table *is_branch_in(struct ctl_table *branch, | ||
1762 | struct ctl_table *table) | ||
1763 | { | ||
1764 | struct ctl_table *p; | ||
1765 | const char *s = branch->procname; | ||
1766 | |||
1767 | /* branch should have named subdirectory as its first element */ | ||
1768 | if (!s || !branch->child) | ||
1769 | return NULL; | ||
1770 | |||
1771 | /* ... and nothing else */ | ||
1772 | if (branch[1].procname) | ||
1773 | return NULL; | ||
1774 | |||
1775 | /* table should contain subdirectory with the same name */ | ||
1776 | for (p = table; p->procname; p++) { | ||
1777 | if (!p->child) | ||
1778 | continue; | ||
1779 | if (p->procname && strcmp(p->procname, s) == 0) | ||
1780 | return p; | ||
1781 | } | ||
1782 | return NULL; | ||
1783 | } | ||
1784 | |||
1785 | /* see if attaching q to p would be an improvement */ | ||
1786 | static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) | ||
1787 | { | ||
1788 | struct ctl_table *to = p->ctl_table, *by = q->ctl_table; | ||
1789 | struct ctl_table *next; | ||
1790 | int is_better = 0; | ||
1791 | int not_in_parent = !p->attached_by; | ||
1792 | |||
1793 | while ((next = is_branch_in(by, to)) != NULL) { | ||
1794 | if (by == q->attached_by) | ||
1795 | is_better = 1; | ||
1796 | if (to == p->attached_by) | ||
1797 | not_in_parent = 1; | ||
1798 | by = by->child; | ||
1799 | to = next->child; | ||
1800 | } | ||
1801 | |||
1802 | if (is_better && not_in_parent) { | ||
1803 | q->attached_by = by; | ||
1804 | q->attached_to = to; | ||
1805 | q->parent = p; | ||
1806 | } | ||
1807 | } | ||
1808 | |||
1809 | /** | ||
1810 | * __register_sysctl_paths - register a sysctl hierarchy | ||
1811 | * @root: List of sysctl headers to register on | ||
1812 | * @namespaces: Data to compute which lists of sysctl entries are visible | ||
1813 | * @path: The path to the directory the sysctl table is in. | ||
1814 | * @table: the top-level table structure | ||
1815 | * | ||
1816 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1817 | * array. A completely 0 filled entry terminates the table. | ||
1818 | * | ||
1819 | * The members of the &struct ctl_table structure are used as follows: | ||
1820 | * | ||
1821 | * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not | ||
1822 | * enter a sysctl file | ||
1823 | * | ||
1824 | * data - a pointer to data for use by proc_handler | ||
1825 | * | ||
1826 | * maxlen - the maximum size in bytes of the data | ||
1827 | * | ||
1828 | * mode - the file permissions for the /proc/sys file, and for sysctl(2) | ||
1829 | * | ||
1830 | * child - a pointer to the child sysctl table if this entry is a directory, or | ||
1831 | * %NULL. | ||
1832 | * | ||
1833 | * proc_handler - the text handler routine (described below) | ||
1834 | * | ||
1835 | * de - for internal use by the sysctl routines | ||
1836 | * | ||
1837 | * extra1, extra2 - extra pointers usable by the proc handler routines | ||
1838 | * | ||
1839 | * Leaf nodes in the sysctl tree will be represented by a single file | ||
1840 | * under /proc; non-leaf nodes will be represented by directories. | ||
1841 | * | ||
1842 | * sysctl(2) can automatically manage read and write requests through | ||
1843 | * the sysctl table. The data and maxlen fields of the ctl_table | ||
1844 | * struct enable minimal validation of the values being written to be | ||
1845 | * performed, and the mode field allows minimal authentication. | ||
1846 | * | ||
1847 | * There must be a proc_handler routine for any terminal nodes | ||
1848 | * mirrored under /proc/sys (non-terminals are handled by a built-in | ||
1849 | * directory handler). Several default handlers are available to | ||
1850 | * cover common cases - | ||
1851 | * | ||
1852 | * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), | ||
1853 | * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), | ||
1854 | * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() | ||
1855 | * | ||
1856 | * It is the handler's job to read the input buffer from user memory | ||
1857 | * and process it. The handler should return 0 on success. | ||
1858 | * | ||
1859 | * This routine returns %NULL on a failure to register, and a pointer | ||
1860 | * to the table header on success. | ||
1861 | */ | ||
1862 | struct ctl_table_header *__register_sysctl_paths( | ||
1863 | struct ctl_table_root *root, | ||
1864 | struct nsproxy *namespaces, | ||
1865 | const struct ctl_path *path, struct ctl_table *table) | ||
1866 | { | ||
1867 | struct ctl_table_header *header; | ||
1868 | struct ctl_table *new, **prevp; | ||
1869 | unsigned int n, npath; | ||
1870 | struct ctl_table_set *set; | ||
1871 | |||
1872 | /* Count the path components */ | ||
1873 | for (npath = 0; path[npath].procname; ++npath) | ||
1874 | ; | ||
1875 | |||
1876 | /* | ||
1877 | * For each path component, allocate a 2-element ctl_table array. | ||
1878 | * The first array element will be filled with the sysctl entry | ||
1879 | * for this, the second will be the sentinel (procname == 0). | ||
1880 | * | ||
1881 | * We allocate everything in one go so that we don't have to | ||
1882 | * worry about freeing additional memory in unregister_sysctl_table. | ||
1883 | */ | ||
1884 | header = kzalloc(sizeof(struct ctl_table_header) + | ||
1885 | (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); | ||
1886 | if (!header) | ||
1887 | return NULL; | ||
1888 | |||
1889 | new = (struct ctl_table *) (header + 1); | ||
1890 | |||
1891 | /* Now connect the dots */ | ||
1892 | prevp = &header->ctl_table; | ||
1893 | for (n = 0; n < npath; ++n, ++path) { | ||
1894 | /* Copy the procname */ | ||
1895 | new->procname = path->procname; | ||
1896 | new->mode = 0555; | ||
1897 | |||
1898 | *prevp = new; | ||
1899 | prevp = &new->child; | ||
1900 | |||
1901 | new += 2; | ||
1902 | } | ||
1903 | *prevp = table; | ||
1904 | header->ctl_table_arg = table; | ||
1905 | |||
1906 | INIT_LIST_HEAD(&header->ctl_entry); | ||
1907 | header->used = 0; | ||
1908 | header->unregistering = NULL; | ||
1909 | header->root = root; | ||
1910 | sysctl_set_parent(NULL, header->ctl_table); | ||
1911 | header->count = 1; | ||
1912 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK | ||
1913 | if (sysctl_check_table(namespaces, header->ctl_table)) { | ||
1914 | kfree(header); | ||
1915 | return NULL; | ||
1916 | } | ||
1917 | #endif | ||
1918 | spin_lock(&sysctl_lock); | ||
1919 | header->set = lookup_header_set(root, namespaces); | ||
1920 | header->attached_by = header->ctl_table; | ||
1921 | header->attached_to = root_table; | ||
1922 | header->parent = &root_table_header; | ||
1923 | for (set = header->set; set; set = set->parent) { | ||
1924 | struct ctl_table_header *p; | ||
1925 | list_for_each_entry(p, &set->list, ctl_entry) { | ||
1926 | if (p->unregistering) | ||
1927 | continue; | ||
1928 | try_attach(p, header); | ||
1929 | } | ||
1930 | } | ||
1931 | header->parent->count++; | ||
1932 | list_add_tail(&header->ctl_entry, &header->set->list); | ||
1933 | spin_unlock(&sysctl_lock); | ||
1934 | |||
1935 | return header; | ||
1936 | } | ||
1937 | |||
1938 | /** | ||
1939 | * register_sysctl_table_path - register a sysctl table hierarchy | ||
1940 | * @path: The path to the directory the sysctl table is in. | ||
1941 | * @table: the top-level table structure | ||
1942 | * | ||
1943 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1944 | * array. A completely 0 filled entry terminates the table. | ||
1945 | * | ||
1946 | * See __register_sysctl_paths for more details. | ||
1947 | */ | ||
1948 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
1949 | struct ctl_table *table) | ||
1950 | { | ||
1951 | return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, | ||
1952 | path, table); | ||
1953 | } | ||
1954 | |||
1955 | /** | ||
1956 | * register_sysctl_table - register a sysctl table hierarchy | ||
1957 | * @table: the top-level table structure | ||
1958 | * | ||
1959 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1960 | * array. A completely 0 filled entry terminates the table. | ||
1961 | * | ||
1962 | * See register_sysctl_paths for more details. | ||
1963 | */ | ||
1964 | struct ctl_table_header *register_sysctl_table(struct ctl_table *table) | ||
1965 | { | ||
1966 | static const struct ctl_path null_path[] = { {} }; | ||
1967 | |||
1968 | return register_sysctl_paths(null_path, table); | ||
1969 | } | ||
1970 | |||
1971 | /** | ||
1972 | * unregister_sysctl_table - unregister a sysctl table hierarchy | ||
1973 | * @header: the header returned from register_sysctl_table | ||
1974 | * | ||
1975 | * Unregisters the sysctl table and all children. proc entries may not | ||
1976 | * actually be removed until they are no longer used by anyone. | ||
1977 | */ | ||
1978 | void unregister_sysctl_table(struct ctl_table_header * header) | ||
1979 | { | ||
1980 | might_sleep(); | ||
1981 | |||
1982 | if (header == NULL) | ||
1983 | return; | ||
1984 | |||
1985 | spin_lock(&sysctl_lock); | ||
1986 | start_unregistering(header); | ||
1987 | if (!--header->parent->count) { | ||
1988 | WARN_ON(1); | ||
1989 | kfree_rcu(header->parent, rcu); | ||
1990 | } | ||
1991 | if (!--header->count) | ||
1992 | kfree_rcu(header, rcu); | ||
1993 | spin_unlock(&sysctl_lock); | ||
1994 | } | ||
1995 | |||
1996 | int sysctl_is_seen(struct ctl_table_header *p) | ||
1997 | { | ||
1998 | struct ctl_table_set *set = p->set; | ||
1999 | int res; | ||
2000 | spin_lock(&sysctl_lock); | ||
2001 | if (p->unregistering) | ||
2002 | res = 0; | ||
2003 | else if (!set->is_seen) | ||
2004 | res = 1; | ||
2005 | else | ||
2006 | res = set->is_seen(set); | ||
2007 | spin_unlock(&sysctl_lock); | ||
2008 | return res; | ||
2009 | } | ||
2010 | |||
2011 | void setup_sysctl_set(struct ctl_table_set *p, | ||
2012 | struct ctl_table_set *parent, | ||
2013 | int (*is_seen)(struct ctl_table_set *)) | ||
2014 | { | ||
2015 | INIT_LIST_HEAD(&p->list); | ||
2016 | p->parent = parent ? parent : &sysctl_table_root.default_set; | ||
2017 | p->is_seen = is_seen; | ||
2018 | } | ||
2019 | |||
2020 | #else /* !CONFIG_SYSCTL */ | ||
2021 | struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | ||
2022 | { | ||
2023 | return NULL; | ||
2024 | } | ||
2025 | |||
2026 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
2027 | struct ctl_table *table) | ||
2028 | { | ||
2029 | return NULL; | ||
2030 | } | ||
2031 | |||
2032 | void unregister_sysctl_table(struct ctl_table_header * table) | ||
2033 | { | ||
2034 | } | ||
2035 | |||
2036 | void setup_sysctl_set(struct ctl_table_set *p, | ||
2037 | struct ctl_table_set *parent, | ||
2038 | int (*is_seen)(struct ctl_table_set *)) | ||
2039 | { | ||
2040 | } | ||
2041 | |||
2042 | void sysctl_head_put(struct ctl_table_header *head) | ||
2043 | { | ||
2044 | } | ||
2045 | |||
2046 | #endif /* CONFIG_SYSCTL */ | 1558 | #endif /* CONFIG_SYSCTL */ |
2047 | 1559 | ||
2048 | /* | 1560 | /* |
@@ -2884,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
2884 | } | 2396 | } |
2885 | } | 2397 | } |
2886 | 2398 | ||
2887 | while (val_a <= val_b) | 2399 | bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); |
2888 | set_bit(val_a++, tmp_bitmap); | ||
2889 | |||
2890 | first = 0; | 2400 | first = 0; |
2891 | proc_skip_char(&kbuf, &left, '\n'); | 2401 | proc_skip_char(&kbuf, &left, '\n'); |
2892 | } | 2402 | } |
@@ -2929,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
2929 | if (*ppos) | 2439 | if (*ppos) |
2930 | bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); | 2440 | bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); |
2931 | else | 2441 | else |
2932 | memcpy(bitmap, tmp_bitmap, | 2442 | bitmap_copy(bitmap, tmp_bitmap, bitmap_len); |
2933 | BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); | ||
2934 | } | 2443 | } |
2935 | kfree(tmp_bitmap); | 2444 | kfree(tmp_bitmap); |
2936 | *lenp -= left; | 2445 | *lenp -= left; |
@@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); | |||
3008 | EXPORT_SYMBOL(proc_dostring); | 2517 | EXPORT_SYMBOL(proc_dostring); |
3009 | EXPORT_SYMBOL(proc_doulongvec_minmax); | 2518 | EXPORT_SYMBOL(proc_doulongvec_minmax); |
3010 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); | 2519 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); |
3011 | EXPORT_SYMBOL(register_sysctl_table); | ||
3012 | EXPORT_SYMBOL(register_sysctl_paths); | ||
3013 | EXPORT_SYMBOL(unregister_sysctl_table); | ||
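As a usage reference for the interface documented in the kernel-doc removed above, here is a minimal sketch of a caller registering a single leaf under /proc/sys. The "example" directory name, the variable and its bounds are illustrative, not taken from this patch:

/* Illustrative only: one integer sysctl at /proc/sys/example/value,
 * registered through the interface described in the removed comment. */
#include <linux/sysctl.h>

static int example_value;
static int example_min;			/* 0 */
static int example_max = 100;

static struct ctl_table example_table[] = {
	{
		.procname	= "value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,
		.extra2		= &example_max,
	},
	{ }	/* completely zero-filled entry terminates the table */
};

static const struct ctl_path example_path[] = {
	{ .procname = "example" },	/* directory component */
	{ }
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl_paths(example_path, example_table);
	return example_header ? 0 : -ENOMEM;
}

Teardown is the matching unregister_sysctl_table(example_header); as the hunk above shows, only the thin register_sysctl_table(sysctl_base_table) call remains in this file.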
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c deleted file mode 100644 index 362da653813d..000000000000 --- a/kernel/sysctl_check.c +++ /dev/null | |||
@@ -1,160 +0,0 @@ | |||
1 | #include <linux/stat.h> | ||
2 | #include <linux/sysctl.h> | ||
3 | #include "../fs/xfs/xfs_sysctl.h" | ||
4 | #include <linux/sunrpc/debug.h> | ||
5 | #include <linux/string.h> | ||
6 | #include <net/ip_vs.h> | ||
7 | |||
8 | |||
9 | static int sysctl_depth(struct ctl_table *table) | ||
10 | { | ||
11 | struct ctl_table *tmp; | ||
12 | int depth; | ||
13 | |||
14 | depth = 0; | ||
15 | for (tmp = table; tmp->parent; tmp = tmp->parent) | ||
16 | depth++; | ||
17 | |||
18 | return depth; | ||
19 | } | ||
20 | |||
21 | static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) | ||
22 | { | ||
23 | int i; | ||
24 | |||
25 | for (i = 0; table && i < n; i++) | ||
26 | table = table->parent; | ||
27 | |||
28 | return table; | ||
29 | } | ||
30 | |||
31 | |||
32 | static void sysctl_print_path(struct ctl_table *table) | ||
33 | { | ||
34 | struct ctl_table *tmp; | ||
35 | int depth, i; | ||
36 | depth = sysctl_depth(table); | ||
37 | if (table->procname) { | ||
38 | for (i = depth; i >= 0; i--) { | ||
39 | tmp = sysctl_parent(table, i); | ||
40 | printk("/%s", tmp->procname?tmp->procname:""); | ||
41 | } | ||
42 | } | ||
43 | printk(" "); | ||
44 | } | ||
45 | |||
46 | static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, | ||
47 | struct ctl_table *table) | ||
48 | { | ||
49 | struct ctl_table_header *head; | ||
50 | struct ctl_table *ref, *test; | ||
51 | int depth, cur_depth; | ||
52 | |||
53 | depth = sysctl_depth(table); | ||
54 | |||
55 | for (head = __sysctl_head_next(namespaces, NULL); head; | ||
56 | head = __sysctl_head_next(namespaces, head)) { | ||
57 | cur_depth = depth; | ||
58 | ref = head->ctl_table; | ||
59 | repeat: | ||
60 | test = sysctl_parent(table, cur_depth); | ||
61 | for (; ref->procname; ref++) { | ||
62 | int match = 0; | ||
63 | if (cur_depth && !ref->child) | ||
64 | continue; | ||
65 | |||
66 | if (test->procname && ref->procname && | ||
67 | (strcmp(test->procname, ref->procname) == 0)) | ||
68 | match++; | ||
69 | |||
70 | if (match) { | ||
71 | if (cur_depth != 0) { | ||
72 | cur_depth--; | ||
73 | ref = ref->child; | ||
74 | goto repeat; | ||
75 | } | ||
76 | goto out; | ||
77 | } | ||
78 | } | ||
79 | } | ||
80 | ref = NULL; | ||
81 | out: | ||
82 | sysctl_head_finish(head); | ||
83 | return ref; | ||
84 | } | ||
85 | |||
86 | static void set_fail(const char **fail, struct ctl_table *table, const char *str) | ||
87 | { | ||
88 | if (*fail) { | ||
89 | printk(KERN_ERR "sysctl table check failed: "); | ||
90 | sysctl_print_path(table); | ||
91 | printk(" %s\n", *fail); | ||
92 | dump_stack(); | ||
93 | } | ||
94 | *fail = str; | ||
95 | } | ||
96 | |||
97 | static void sysctl_check_leaf(struct nsproxy *namespaces, | ||
98 | struct ctl_table *table, const char **fail) | ||
99 | { | ||
100 | struct ctl_table *ref; | ||
101 | |||
102 | ref = sysctl_check_lookup(namespaces, table); | ||
103 | if (ref && (ref != table)) | ||
104 | set_fail(fail, table, "Sysctl already exists"); | ||
105 | } | ||
106 | |||
107 | int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | ||
108 | { | ||
109 | int error = 0; | ||
110 | for (; table->procname; table++) { | ||
111 | const char *fail = NULL; | ||
112 | |||
113 | if (table->parent) { | ||
114 | if (!table->parent->procname) | ||
115 | set_fail(&fail, table, "Parent without procname"); | ||
116 | } | ||
117 | if (table->child) { | ||
118 | if (table->data) | ||
119 | set_fail(&fail, table, "Directory with data?"); | ||
120 | if (table->maxlen) | ||
121 | set_fail(&fail, table, "Directory with maxlen?"); | ||
122 | if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) | ||
123 | set_fail(&fail, table, "Writable sysctl directory"); | ||
124 | if (table->proc_handler) | ||
125 | set_fail(&fail, table, "Directory with proc_handler"); | ||
126 | if (table->extra1) | ||
127 | set_fail(&fail, table, "Directory with extra1"); | ||
128 | if (table->extra2) | ||
129 | set_fail(&fail, table, "Directory with extra2"); | ||
130 | } else { | ||
131 | if ((table->proc_handler == proc_dostring) || | ||
132 | (table->proc_handler == proc_dointvec) || | ||
133 | (table->proc_handler == proc_dointvec_minmax) || | ||
134 | (table->proc_handler == proc_dointvec_jiffies) || | ||
135 | (table->proc_handler == proc_dointvec_userhz_jiffies) || | ||
136 | (table->proc_handler == proc_dointvec_ms_jiffies) || | ||
137 | (table->proc_handler == proc_doulongvec_minmax) || | ||
138 | (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { | ||
139 | if (!table->data) | ||
140 | set_fail(&fail, table, "No data"); | ||
141 | if (!table->maxlen) | ||
142 | set_fail(&fail, table, "No maxlen"); | ||
143 | } | ||
144 | #ifdef CONFIG_PROC_SYSCTL | ||
145 | if (!table->proc_handler) | ||
146 | set_fail(&fail, table, "No proc_handler"); | ||
147 | #endif | ||
148 | sysctl_check_leaf(namespaces, table, &fail); | ||
149 | } | ||
150 | if (table->mode > 0777) | ||
151 | set_fail(&fail, table, "bogus .mode"); | ||
152 | if (fail) { | ||
153 | set_fail(&fail, table, NULL); | ||
154 | error = -EINVAL; | ||
155 | } | ||
156 | if (table->child) | ||
157 | error |= sysctl_check_table(namespaces, table->child); | ||
158 | } | ||
159 | return error; | ||
160 | } | ||
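For orientation, the per-entry rules the deleted checker enforced can be condensed roughly as below. This is a simplified sketch: the original also walked every registered hierarchy to flag duplicate leaves, and only insisted on data/maxlen for the stock proc_* handlers.

/* Rough restatement of the structural checks removed above; illustrative. */
#include <linux/stat.h>
#include <linux/sysctl.h>

static const char *example_check_entry(const struct ctl_table *t)
{
	if (t->mode > 0777)
		return "bogus .mode";
	if (t->child) {
		/* directories: no payload, no handler, read/execute only */
		if (t->data || t->maxlen || t->proc_handler ||
		    t->extra1 || t->extra2)
			return "directory with leaf-only fields";
		if ((t->mode & (S_IRUGO | S_IXUGO)) != t->mode)
			return "writable sysctl directory";
	} else {
		/* leaves must have a handler to be usable via /proc/sys */
		if (!t->proc_handler)
			return "no proc_handler";
	}
	return NULL;	/* entry looks sane */
}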
diff --git a/kernel/time.c b/kernel/time.c index 73e416db0a1e..ba744cf80696 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) | |||
163 | return error; | 163 | return error; |
164 | 164 | ||
165 | if (tz) { | 165 | if (tz) { |
166 | /* SMP safe, global irq locking makes it work. */ | ||
167 | sys_tz = *tz; | 166 | sys_tz = *tz; |
168 | update_vsyscall_tz(); | 167 | update_vsyscall_tz(); |
169 | if (firsttime) { | 168 | if (firsttime) { |
@@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) | |||
173 | } | 172 | } |
174 | } | 173 | } |
175 | if (tv) | 174 | if (tv) |
176 | { | ||
177 | /* SMP safe, again the code in arch/foo/time.c should | ||
178 | * globally block out interrupts when it runs. | ||
179 | */ | ||
180 | return do_settimeofday(tv); | 175 | return do_settimeofday(tv); |
181 | } | ||
182 | return 0; | 176 | return 0; |
183 | } | 177 | } |
184 | 178 | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a46f5d64504..8a538c55fc7b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev, | |||
96 | return 0; | 96 | return 0; |
97 | } | 97 | } |
98 | 98 | ||
99 | static inline void alarmtimer_rtc_timer_init(void) | ||
100 | { | ||
101 | rtc_timer_init(&rtctimer, NULL, NULL); | ||
102 | } | ||
103 | |||
99 | static struct class_interface alarmtimer_rtc_interface = { | 104 | static struct class_interface alarmtimer_rtc_interface = { |
100 | .add_dev = &alarmtimer_rtc_add_device, | 105 | .add_dev = &alarmtimer_rtc_add_device, |
101 | }; | 106 | }; |
@@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void) | |||
117 | #define rtcdev (NULL) | 122 | #define rtcdev (NULL) |
118 | static inline int alarmtimer_rtc_interface_setup(void) { return 0; } | 123 | static inline int alarmtimer_rtc_interface_setup(void) { return 0; } |
119 | static inline void alarmtimer_rtc_interface_remove(void) { } | 124 | static inline void alarmtimer_rtc_interface_remove(void) { } |
125 | static inline void alarmtimer_rtc_timer_init(void) { } | ||
120 | #endif | 126 | #endif |
121 | 127 | ||
122 | /** | 128 | /** |
@@ -783,6 +789,8 @@ static int __init alarmtimer_init(void) | |||
783 | .nsleep = alarm_timer_nsleep, | 789 | .nsleep = alarm_timer_nsleep, |
784 | }; | 790 | }; |
785 | 791 | ||
792 | alarmtimer_rtc_timer_init(); | ||
793 | |||
786 | posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); | 794 | posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); |
787 | posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); | 795 | posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); |
788 | 796 | ||
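The pair of definitions added above is the usual config-stub idiom: a real alarmtimer_rtc_timer_init() when an RTC device backs the alarm timers, an empty inline otherwise, so alarmtimer_init() can call it unconditionally. Generically (CONFIG_FEATURE and feature_init() are placeholders, not symbols from this patch):

#ifdef CONFIG_FEATURE
static void feature_init(void)
{
	/* real setup work, only compiled when the feature is configured */
}
#else
static inline void feature_init(void) { }	/* callers stay #ifdef-free */
#endif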
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a45ca167ab24..c9583382141a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
500 | { | 500 | { |
501 | u64 ret; | 501 | u64 ret; |
502 | /* | 502 | /* |
503 | * We won't try to correct for more then 11% adjustments (110,000 ppm), | 503 | * We won't try to correct for more than 11% adjustments (110,000 ppm), |
504 | */ | 504 | */ |
505 | ret = (u64)cs->mult * 11; | 505 | ret = (u64)cs->mult * 11; |
506 | do_div(ret,100); | 506 | do_div(ret,100); |
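The corrected comment above caps NTP-driven adjustments at 11% of the clocksource multiplier. As a worked example with an illustrative value (not one taken from this patch): for cs->mult = 16777216, clocksource_max_adjustment() returns (16777216 * 11) / 100 = 1845493, i.e. the 110,000 ppm mentioned in the comment.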
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f6117a4c7cb8..f03fd83b170b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -22,17 +22,18 @@ | |||
22 | * NTP timekeeping variables: | 22 | * NTP timekeeping variables: |
23 | */ | 23 | */ |
24 | 24 | ||
25 | DEFINE_SPINLOCK(ntp_lock); | ||
26 | |||
27 | |||
25 | /* USER_HZ period (usecs): */ | 28 | /* USER_HZ period (usecs): */ |
26 | unsigned long tick_usec = TICK_USEC; | 29 | unsigned long tick_usec = TICK_USEC; |
27 | 30 | ||
28 | /* ACTHZ period (nsecs): */ | 31 | /* ACTHZ period (nsecs): */ |
29 | unsigned long tick_nsec; | 32 | unsigned long tick_nsec; |
30 | 33 | ||
31 | u64 tick_length; | 34 | static u64 tick_length; |
32 | static u64 tick_length_base; | 35 | static u64 tick_length_base; |
33 | 36 | ||
34 | static struct hrtimer leap_timer; | ||
35 | |||
36 | #define MAX_TICKADJ 500LL /* usecs */ | 37 | #define MAX_TICKADJ 500LL /* usecs */ |
37 | #define MAX_TICKADJ_SCALED \ | 38 | #define MAX_TICKADJ_SCALED \ |
38 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) | 39 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) |
@@ -49,7 +50,7 @@ static struct hrtimer leap_timer; | |||
49 | static int time_state = TIME_OK; | 50 | static int time_state = TIME_OK; |
50 | 51 | ||
51 | /* clock status bits: */ | 52 | /* clock status bits: */ |
52 | int time_status = STA_UNSYNC; | 53 | static int time_status = STA_UNSYNC; |
53 | 54 | ||
54 | /* TAI offset (secs): */ | 55 | /* TAI offset (secs): */ |
55 | static long time_tai; | 56 | static long time_tai; |
@@ -133,7 +134,7 @@ static inline void pps_reset_freq_interval(void) | |||
133 | /** | 134 | /** |
134 | * pps_clear - Clears the PPS state variables | 135 | * pps_clear - Clears the PPS state variables |
135 | * | 136 | * |
136 | * Must be called while holding a write on the xtime_lock | 137 | * Must be called while holding a write on the ntp_lock |
137 | */ | 138 | */ |
138 | static inline void pps_clear(void) | 139 | static inline void pps_clear(void) |
139 | { | 140 | { |
@@ -149,7 +150,7 @@ static inline void pps_clear(void) | |||
149 | * the last PPS signal. When it reaches 0, indicate that PPS signal is | 150 | * the last PPS signal. When it reaches 0, indicate that PPS signal is |
150 | * missing. | 151 | * missing. |
151 | * | 152 | * |
152 | * Must be called while holding a write on the xtime_lock | 153 | * Must be called while holding a write on the ntp_lock |
153 | */ | 154 | */ |
154 | static inline void pps_dec_valid(void) | 155 | static inline void pps_dec_valid(void) |
155 | { | 156 | { |
@@ -233,6 +234,17 @@ static inline void pps_fill_timex(struct timex *txc) | |||
233 | 234 | ||
234 | #endif /* CONFIG_NTP_PPS */ | 235 | #endif /* CONFIG_NTP_PPS */ |
235 | 236 | ||
237 | |||
238 | /** | ||
239 | * ntp_synced - Returns 1 if the NTP status is not UNSYNC | ||
240 | * | ||
241 | */ | ||
242 | static inline int ntp_synced(void) | ||
243 | { | ||
244 | return !(time_status & STA_UNSYNC); | ||
245 | } | ||
246 | |||
247 | |||
236 | /* | 248 | /* |
237 | * NTP methods: | 249 | * NTP methods: |
238 | */ | 250 | */ |
@@ -275,7 +287,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs) | |||
275 | 287 | ||
276 | time_status |= STA_MODE; | 288 | time_status |= STA_MODE; |
277 | 289 | ||
278 | return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); | 290 | return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); |
279 | } | 291 | } |
280 | 292 | ||
281 | static void ntp_update_offset(long offset) | 293 | static void ntp_update_offset(long offset) |
@@ -330,11 +342,13 @@ static void ntp_update_offset(long offset) | |||
330 | 342 | ||
331 | /** | 343 | /** |
332 | * ntp_clear - Clears the NTP state variables | 344 | * ntp_clear - Clears the NTP state variables |
333 | * | ||
334 | * Must be called while holding a write on the xtime_lock | ||
335 | */ | 345 | */ |
336 | void ntp_clear(void) | 346 | void ntp_clear(void) |
337 | { | 347 | { |
348 | unsigned long flags; | ||
349 | |||
350 | spin_lock_irqsave(&ntp_lock, flags); | ||
351 | |||
338 | time_adjust = 0; /* stop active adjtime() */ | 352 | time_adjust = 0; /* stop active adjtime() */ |
339 | time_status |= STA_UNSYNC; | 353 | time_status |= STA_UNSYNC; |
340 | time_maxerror = NTP_PHASE_LIMIT; | 354 | time_maxerror = NTP_PHASE_LIMIT; |
@@ -347,63 +361,81 @@ void ntp_clear(void) | |||
347 | 361 | ||
348 | /* Clear PPS state variables */ | 362 | /* Clear PPS state variables */ |
349 | pps_clear(); | 363 | pps_clear(); |
364 | spin_unlock_irqrestore(&ntp_lock, flags); | ||
365 | |||
366 | } | ||
367 | |||
368 | |||
369 | u64 ntp_tick_length(void) | ||
370 | { | ||
371 | unsigned long flags; | ||
372 | s64 ret; | ||
373 | |||
374 | spin_lock_irqsave(&ntp_lock, flags); | ||
375 | ret = tick_length; | ||
376 | spin_unlock_irqrestore(&ntp_lock, flags); | ||
377 | return ret; | ||
350 | } | 378 | } |
351 | 379 | ||
380 | |||
352 | /* | 381 | /* |
353 | * Leap second processing. If in leap-insert state at the end of the | 382 | * this routine handles the overflow of the microsecond field |
354 | * day, the system clock is set back one second; if in leap-delete | 383 | * |
355 | * state, the system clock is set ahead one second. | 384 | * The tricky bits of code to handle the accurate clock support |
385 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
386 | * They were originally developed for SUN and DEC kernels. | ||
387 | * All the kudos should go to Dave for this stuff. | ||
388 | * | ||
389 | * Also handles leap second processing, and returns leap offset | ||
356 | */ | 390 | */ |
357 | static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) | 391 | int second_overflow(unsigned long secs) |
358 | { | 392 | { |
359 | enum hrtimer_restart res = HRTIMER_NORESTART; | 393 | s64 delta; |
394 | int leap = 0; | ||
395 | unsigned long flags; | ||
360 | 396 | ||
361 | write_seqlock(&xtime_lock); | 397 | spin_lock_irqsave(&ntp_lock, flags); |
362 | 398 | ||
399 | /* | ||
400 | * Leap second processing. If in leap-insert state at the end of the | ||
401 | * day, the system clock is set back one second; if in leap-delete | ||
402 | * state, the system clock is set ahead one second. | ||
403 | */ | ||
363 | switch (time_state) { | 404 | switch (time_state) { |
364 | case TIME_OK: | 405 | case TIME_OK: |
406 | if (time_status & STA_INS) | ||
407 | time_state = TIME_INS; | ||
408 | else if (time_status & STA_DEL) | ||
409 | time_state = TIME_DEL; | ||
365 | break; | 410 | break; |
366 | case TIME_INS: | 411 | case TIME_INS: |
367 | timekeeping_leap_insert(-1); | 412 | if (secs % 86400 == 0) { |
368 | time_state = TIME_OOP; | 413 | leap = -1; |
369 | printk(KERN_NOTICE | 414 | time_state = TIME_OOP; |
370 | "Clock: inserting leap second 23:59:60 UTC\n"); | 415 | printk(KERN_NOTICE |
371 | hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); | 416 | "Clock: inserting leap second 23:59:60 UTC\n"); |
372 | res = HRTIMER_RESTART; | 417 | } |
373 | break; | 418 | break; |
374 | case TIME_DEL: | 419 | case TIME_DEL: |
375 | timekeeping_leap_insert(1); | 420 | if ((secs + 1) % 86400 == 0) { |
376 | time_tai--; | 421 | leap = 1; |
377 | time_state = TIME_WAIT; | 422 | time_tai--; |
378 | printk(KERN_NOTICE | 423 | time_state = TIME_WAIT; |
379 | "Clock: deleting leap second 23:59:59 UTC\n"); | 424 | printk(KERN_NOTICE |
425 | "Clock: deleting leap second 23:59:59 UTC\n"); | ||
426 | } | ||
380 | break; | 427 | break; |
381 | case TIME_OOP: | 428 | case TIME_OOP: |
382 | time_tai++; | 429 | time_tai++; |
383 | time_state = TIME_WAIT; | 430 | time_state = TIME_WAIT; |
384 | /* fall through */ | 431 | break; |
432 | |||
385 | case TIME_WAIT: | 433 | case TIME_WAIT: |
386 | if (!(time_status & (STA_INS | STA_DEL))) | 434 | if (!(time_status & (STA_INS | STA_DEL))) |
387 | time_state = TIME_OK; | 435 | time_state = TIME_OK; |
388 | break; | 436 | break; |
389 | } | 437 | } |
390 | 438 | ||
391 | write_sequnlock(&xtime_lock); | ||
392 | |||
393 | return res; | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * this routine handles the overflow of the microsecond field | ||
398 | * | ||
399 | * The tricky bits of code to handle the accurate clock support | ||
400 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
401 | * They were originally developed for SUN and DEC kernels. | ||
402 | * All the kudos should go to Dave for this stuff. | ||
403 | */ | ||
404 | void second_overflow(void) | ||
405 | { | ||
406 | s64 delta; | ||
407 | 439 | ||
408 | /* Bump the maxerror field */ | 440 | /* Bump the maxerror field */ |
409 | time_maxerror += MAXFREQ / NSEC_PER_USEC; | 441 | time_maxerror += MAXFREQ / NSEC_PER_USEC; |
@@ -423,30 +455,34 @@ void second_overflow(void) | |||
423 | pps_dec_valid(); | 455 | pps_dec_valid(); |
424 | 456 | ||
425 | if (!time_adjust) | 457 | if (!time_adjust) |
426 | return; | 458 | goto out; |
427 | 459 | ||
428 | if (time_adjust > MAX_TICKADJ) { | 460 | if (time_adjust > MAX_TICKADJ) { |
429 | time_adjust -= MAX_TICKADJ; | 461 | time_adjust -= MAX_TICKADJ; |
430 | tick_length += MAX_TICKADJ_SCALED; | 462 | tick_length += MAX_TICKADJ_SCALED; |
431 | return; | 463 | goto out; |
432 | } | 464 | } |
433 | 465 | ||
434 | if (time_adjust < -MAX_TICKADJ) { | 466 | if (time_adjust < -MAX_TICKADJ) { |
435 | time_adjust += MAX_TICKADJ; | 467 | time_adjust += MAX_TICKADJ; |
436 | tick_length -= MAX_TICKADJ_SCALED; | 468 | tick_length -= MAX_TICKADJ_SCALED; |
437 | return; | 469 | goto out; |
438 | } | 470 | } |
439 | 471 | ||
440 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) | 472 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) |
441 | << NTP_SCALE_SHIFT; | 473 | << NTP_SCALE_SHIFT; |
442 | time_adjust = 0; | 474 | time_adjust = 0; |
475 | |||
476 | |||
477 | |||
478 | out: | ||
479 | spin_unlock_irqrestore(&ntp_lock, flags); | ||
480 | |||
481 | return leap; | ||
443 | } | 482 | } |
444 | 483 | ||
445 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 484 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
446 | 485 | ||
447 | /* Disable the cmos update - used by virtualization and embedded */ | ||
448 | int no_sync_cmos_clock __read_mostly; | ||
449 | |||
450 | static void sync_cmos_clock(struct work_struct *work); | 486 | static void sync_cmos_clock(struct work_struct *work); |
451 | 487 | ||
452 | static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); | 488 | static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); |
@@ -493,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work) | |||
493 | 529 | ||
494 | static void notify_cmos_timer(void) | 530 | static void notify_cmos_timer(void) |
495 | { | 531 | { |
496 | if (!no_sync_cmos_clock) | 532 | schedule_delayed_work(&sync_cmos_work, 0); |
497 | schedule_delayed_work(&sync_cmos_work, 0); | ||
498 | } | 533 | } |
499 | 534 | ||
500 | #else | 535 | #else |
501 | static inline void notify_cmos_timer(void) { } | 536 | static inline void notify_cmos_timer(void) { } |
502 | #endif | 537 | #endif |
503 | 538 | ||
504 | /* | ||
505 | * Start the leap seconds timer: | ||
506 | */ | ||
507 | static inline void ntp_start_leap_timer(struct timespec *ts) | ||
508 | { | ||
509 | long now = ts->tv_sec; | ||
510 | |||
511 | if (time_status & STA_INS) { | ||
512 | time_state = TIME_INS; | ||
513 | now += 86400 - now % 86400; | ||
514 | hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); | ||
515 | |||
516 | return; | ||
517 | } | ||
518 | |||
519 | if (time_status & STA_DEL) { | ||
520 | time_state = TIME_DEL; | ||
521 | now += 86400 - (now + 1) % 86400; | ||
522 | hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); | ||
523 | } | ||
524 | } | ||
525 | 539 | ||
526 | /* | 540 | /* |
527 | * Propagate a new txc->status value into the NTP state: | 541 | * Propagate a new txc->status value into the NTP state: |
@@ -546,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
546 | time_status &= STA_RONLY; | 560 | time_status &= STA_RONLY; |
547 | time_status |= txc->status & ~STA_RONLY; | 561 | time_status |= txc->status & ~STA_RONLY; |
548 | 562 | ||
549 | switch (time_state) { | ||
550 | case TIME_OK: | ||
551 | ntp_start_leap_timer(ts); | ||
552 | break; | ||
553 | case TIME_INS: | ||
554 | case TIME_DEL: | ||
555 | time_state = TIME_OK; | ||
556 | ntp_start_leap_timer(ts); | ||
557 | case TIME_WAIT: | ||
558 | if (!(time_status & (STA_INS | STA_DEL))) | ||
559 | time_state = TIME_OK; | ||
560 | break; | ||
561 | case TIME_OOP: | ||
562 | hrtimer_restart(&leap_timer); | ||
563 | break; | ||
564 | } | ||
565 | } | 563 | } |
566 | /* | 564 | /* |
567 | * Called with the xtime lock held, so we can access and modify | 565 | * Called with the xtime lock held, so we can access and modify |
@@ -643,9 +641,6 @@ int do_adjtimex(struct timex *txc) | |||
643 | (txc->tick < 900000/USER_HZ || | 641 | (txc->tick < 900000/USER_HZ || |
644 | txc->tick > 1100000/USER_HZ)) | 642 | txc->tick > 1100000/USER_HZ)) |
645 | return -EINVAL; | 643 | return -EINVAL; |
646 | |||
647 | if (txc->modes & ADJ_STATUS && time_state != TIME_OK) | ||
648 | hrtimer_cancel(&leap_timer); | ||
649 | } | 644 | } |
650 | 645 | ||
651 | if (txc->modes & ADJ_SETOFFSET) { | 646 | if (txc->modes & ADJ_SETOFFSET) { |
@@ -663,7 +658,7 @@ int do_adjtimex(struct timex *txc) | |||
663 | 658 | ||
664 | getnstimeofday(&ts); | 659 | getnstimeofday(&ts); |
665 | 660 | ||
666 | write_seqlock_irq(&xtime_lock); | 661 | spin_lock_irq(&ntp_lock); |
667 | 662 | ||
668 | if (txc->modes & ADJ_ADJTIME) { | 663 | if (txc->modes & ADJ_ADJTIME) { |
669 | long save_adjust = time_adjust; | 664 | long save_adjust = time_adjust; |
@@ -705,7 +700,7 @@ int do_adjtimex(struct timex *txc) | |||
705 | /* fill PPS status fields */ | 700 | /* fill PPS status fields */ |
706 | pps_fill_timex(txc); | 701 | pps_fill_timex(txc); |
707 | 702 | ||
708 | write_sequnlock_irq(&xtime_lock); | 703 | spin_unlock_irq(&ntp_lock); |
709 | 704 | ||
710 | txc->time.tv_sec = ts.tv_sec; | 705 | txc->time.tv_sec = ts.tv_sec; |
711 | txc->time.tv_usec = ts.tv_nsec; | 706 | txc->time.tv_usec = ts.tv_nsec; |
@@ -903,7 +898,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
903 | 898 | ||
904 | pts_norm = pps_normalize_ts(*phase_ts); | 899 | pts_norm = pps_normalize_ts(*phase_ts); |
905 | 900 | ||
906 | write_seqlock_irqsave(&xtime_lock, flags); | 901 | spin_lock_irqsave(&ntp_lock, flags); |
907 | 902 | ||
908 | /* clear the error bits, they will be set again if needed */ | 903 | /* clear the error bits, they will be set again if needed */ |
909 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | 904 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); |
@@ -916,7 +911,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
916 | * just start the frequency interval */ | 911 | * just start the frequency interval */ |
917 | if (unlikely(pps_fbase.tv_sec == 0)) { | 912 | if (unlikely(pps_fbase.tv_sec == 0)) { |
918 | pps_fbase = *raw_ts; | 913 | pps_fbase = *raw_ts; |
919 | write_sequnlock_irqrestore(&xtime_lock, flags); | 914 | spin_unlock_irqrestore(&ntp_lock, flags); |
920 | return; | 915 | return; |
921 | } | 916 | } |
922 | 917 | ||
@@ -931,7 +926,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
931 | time_status |= STA_PPSJITTER; | 926 | time_status |= STA_PPSJITTER; |
932 | /* restart the frequency calibration interval */ | 927 | /* restart the frequency calibration interval */ |
933 | pps_fbase = *raw_ts; | 928 | pps_fbase = *raw_ts; |
934 | write_sequnlock_irqrestore(&xtime_lock, flags); | 929 | spin_unlock_irqrestore(&ntp_lock, flags); |
935 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | 930 | pr_err("hardpps: PPSJITTER: bad pulse\n"); |
936 | return; | 931 | return; |
937 | } | 932 | } |
@@ -948,7 +943,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
948 | 943 | ||
949 | hardpps_update_phase(pts_norm.nsec); | 944 | hardpps_update_phase(pts_norm.nsec); |
950 | 945 | ||
951 | write_sequnlock_irqrestore(&xtime_lock, flags); | 946 | spin_unlock_irqrestore(&ntp_lock, flags); |
952 | } | 947 | } |
953 | EXPORT_SYMBOL(hardpps); | 948 | EXPORT_SYMBOL(hardpps); |
954 | 949 | ||
@@ -967,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); | |||
967 | void __init ntp_init(void) | 962 | void __init ntp_init(void) |
968 | { | 963 | { |
969 | ntp_clear(); | 964 | ntp_clear(); |
970 | hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); | ||
971 | leap_timer.function = ntp_leap_second; | ||
972 | } | 965 | } |
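With the leap_timer gone, second_overflow() now both drives the leap-second state machine and reports the offset to apply. A sketch of how a timekeeping caller might consume that return value, modelled on the removed timekeeping_leap_insert(); the actual accumulation loop is outside this section, so the wrapper below is illustrative:

/* Illustrative consumer of the new second_overflow() return value;
 * vsyscall/update bookkeeping omitted for brevity. */
static void example_apply_leap(unsigned long secs)
{
	int leap = second_overflow(secs);	/* -1, 0 or +1 */

	timekeeper.xtime.tv_sec += leap;
	timekeeper.wall_to_monotonic.tv_sec -= leap;
}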
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index fd4a7b1625a2..e883f57a3cd3 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void) | |||
575 | unsigned long flags; | 575 | unsigned long flags; |
576 | 576 | ||
577 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 577 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
578 | if (cpumask_empty(tick_get_broadcast_mask())) | ||
579 | goto end; | ||
578 | 580 | ||
579 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; | 581 | tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; |
580 | bc = tick_broadcast_device.evtdev; | 582 | bc = tick_broadcast_device.evtdev; |
581 | if (bc) | 583 | if (bc) |
582 | tick_broadcast_setup_oneshot(bc); | 584 | tick_broadcast_setup_oneshot(bc); |
585 | |||
586 | end: | ||
583 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 587 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
584 | } | 588 | } |
585 | 589 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7656642e4b8e..3526038f2836 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) | |||
182 | 182 | ||
183 | static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) | 183 | static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) |
184 | { | 184 | { |
185 | ktime_t now; | 185 | ktime_t now = ktime_get(); |
186 | |||
187 | now = ktime_get(); | ||
188 | |||
189 | update_ts_time_stats(cpu, ts, now, NULL); | ||
190 | 186 | ||
191 | ts->idle_entrytime = now; | 187 | ts->idle_entrytime = now; |
192 | ts->idle_active = 1; | 188 | ts->idle_active = 1; |
@@ -562,20 +558,21 @@ void tick_nohz_idle_exit(void) | |||
562 | 558 | ||
563 | local_irq_disable(); | 559 | local_irq_disable(); |
564 | 560 | ||
565 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) | 561 | WARN_ON_ONCE(!ts->inidle); |
562 | |||
563 | ts->inidle = 0; | ||
564 | |||
565 | if (ts->idle_active || ts->tick_stopped) | ||
566 | now = ktime_get(); | 566 | now = ktime_get(); |
567 | 567 | ||
568 | if (ts->idle_active) | 568 | if (ts->idle_active) |
569 | tick_nohz_stop_idle(cpu, now); | 569 | tick_nohz_stop_idle(cpu, now); |
570 | 570 | ||
571 | if (!ts->inidle || !ts->tick_stopped) { | 571 | if (!ts->tick_stopped) { |
572 | ts->inidle = 0; | ||
573 | local_irq_enable(); | 572 | local_irq_enable(); |
574 | return; | 573 | return; |
575 | } | 574 | } |
576 | 575 | ||
577 | ts->inidle = 0; | ||
578 | |||
579 | /* Update jiffies first */ | 576 | /* Update jiffies first */ |
580 | select_nohz_load_balancer(0); | 577 | select_nohz_load_balancer(0); |
581 | tick_do_update_jiffies64(now); | 578 | tick_do_update_jiffies64(now); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0c6358186401..d66b21308f7c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -25,6 +25,8 @@ | |||
25 | struct timekeeper { | 25 | struct timekeeper { |
26 | /* Current clocksource used for timekeeping. */ | 26 | /* Current clocksource used for timekeeping. */ |
27 | struct clocksource *clock; | 27 | struct clocksource *clock; |
28 | /* NTP adjusted clock multiplier */ | ||
29 | u32 mult; | ||
28 | /* The shift value of the current clocksource. */ | 30 | /* The shift value of the current clocksource. */ |
29 | int shift; | 31 | int shift; |
30 | 32 | ||
@@ -45,12 +47,47 @@ struct timekeeper { | |||
45 | /* Shift conversion between clock shifted nano seconds and | 47 | /* Shift conversion between clock shifted nano seconds and |
46 | * ntp shifted nano seconds. */ | 48 | * ntp shifted nano seconds. */ |
47 | int ntp_error_shift; | 49 | int ntp_error_shift; |
48 | /* NTP adjusted clock multiplier */ | 50 | |
49 | u32 mult; | 51 | /* The current time */ |
52 | struct timespec xtime; | ||
53 | /* | ||
54 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | ||
55 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | ||
56 | * at zero at system boot time, so wall_to_monotonic will be negative, | ||
57 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | ||
58 | * the usual normalization. | ||
59 | * | ||
60 | * wall_to_monotonic is moved after resume from suspend for the | ||
61 | * monotonic time not to jump. We need to add total_sleep_time to | ||
62 | * wall_to_monotonic to get the real boot based time offset. | ||
63 | * | ||
64 | * - wall_to_monotonic is no longer the boot time, getboottime must be | ||
65 | * used instead. | ||
66 | */ | ||
67 | struct timespec wall_to_monotonic; | ||
68 | /* time spent in suspend */ | ||
69 | struct timespec total_sleep_time; | ||
70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | ||
71 | struct timespec raw_time; | ||
72 | |||
73 | /* Seqlock for all timekeeper values */ | ||
74 | seqlock_t lock; | ||
50 | }; | 75 | }; |
51 | 76 | ||
52 | static struct timekeeper timekeeper; | 77 | static struct timekeeper timekeeper; |
53 | 78 | ||
79 | /* | ||
80 | * This read-write spinlock protects us from races in SMP while | ||
81 | * playing with xtime. | ||
82 | */ | ||
83 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
84 | |||
85 | |||
86 | /* flag for if timekeeping is suspended */ | ||
87 | int __read_mostly timekeeping_suspended; | ||
88 | |||
89 | |||
90 | |||
54 | /** | 91 | /** |
55 | * timekeeper_setup_internals - Set up internals to use clocksource clock. | 92 | * timekeeper_setup_internals - Set up internals to use clocksource clock. |
56 | * | 93 | * |
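The relocated comment above explains how xtime, wall_to_monotonic and total_sleep_time fit together, and the struct now carries its own seqlock. A small read-side sketch of the consolidated fields (sub-nanosecond accumulation and arch_gettimeoffset() omitted; assumes in-file access to the static timekeeper, as the accessors later in this patch have):

/* Illustrative: derive a coarse monotonic timestamp from the new
 * consolidated timekeeper, retrying if a writer raced with us. */
static struct timespec example_get_coarse_monotonic(void)
{
	struct timespec xtim, tomono;
	unsigned long seq;

	do {
		seq = read_seqbegin(&timekeeper.lock);
		xtim = timekeeper.xtime;
		tomono = timekeeper.wall_to_monotonic;
	} while (read_seqretry(&timekeeper.lock, seq));

	return timespec_add(xtim, tomono);	/* normalizes tv_nsec */
}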
@@ -135,49 +172,18 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 172 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
136 | } | 173 | } |
137 | 174 | ||
138 | /* | 175 | /* must hold write on timekeeper.lock */ |
139 | * This read-write spinlock protects us from races in SMP while | 176 | static void timekeeping_update(bool clearntp) |
140 | * playing with xtime. | ||
141 | */ | ||
142 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | ||
143 | |||
144 | |||
145 | /* | ||
146 | * The current time | ||
147 | * wall_to_monotonic is what we need to add to xtime (or xtime corrected | ||
148 | * for sub jiffie times) to get to monotonic time. Monotonic is pegged | ||
149 | * at zero at system boot time, so wall_to_monotonic will be negative, | ||
150 | * however, we will ALWAYS keep the tv_nsec part positive so we can use | ||
151 | * the usual normalization. | ||
152 | * | ||
153 | * wall_to_monotonic is moved after resume from suspend for the monotonic | ||
154 | * time not to jump. We need to add total_sleep_time to wall_to_monotonic | ||
155 | * to get the real boot based time offset. | ||
156 | * | ||
157 | * - wall_to_monotonic is no longer the boot time, getboottime must be | ||
158 | * used instead. | ||
159 | */ | ||
160 | static struct timespec xtime __attribute__ ((aligned (16))); | ||
161 | static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | ||
162 | static struct timespec total_sleep_time; | ||
163 | |||
164 | /* | ||
165 | * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. | ||
166 | */ | ||
167 | static struct timespec raw_time; | ||
168 | |||
169 | /* flag for if timekeeping is suspended */ | ||
170 | int __read_mostly timekeeping_suspended; | ||
171 | |||
172 | /* must hold xtime_lock */ | ||
173 | void timekeeping_leap_insert(int leapsecond) | ||
174 | { | 177 | { |
175 | xtime.tv_sec += leapsecond; | 178 | if (clearntp) { |
176 | wall_to_monotonic.tv_sec -= leapsecond; | 179 | timekeeper.ntp_error = 0; |
177 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | 180 | ntp_clear(); |
178 | timekeeper.mult); | 181 | } |
182 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, | ||
183 | timekeeper.clock, timekeeper.mult); | ||
179 | } | 184 | } |
180 | 185 | ||
186 | |||
181 | /** | 187 | /** |
182 | * timekeeping_forward_now - update clock to the current time | 188 | * timekeeping_forward_now - update clock to the current time |
183 | * | 189 | * |
@@ -202,10 +208,10 @@ static void timekeeping_forward_now(void) | |||
202 | /* If arch requires, add in gettimeoffset() */ | 208 | /* If arch requires, add in gettimeoffset() */ |
203 | nsec += arch_gettimeoffset(); | 209 | nsec += arch_gettimeoffset(); |
204 | 210 | ||
205 | timespec_add_ns(&xtime, nsec); | 211 | timespec_add_ns(&timekeeper.xtime, nsec); |
206 | 212 | ||
207 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 213 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
208 | timespec_add_ns(&raw_time, nsec); | 214 | timespec_add_ns(&timekeeper.raw_time, nsec); |
209 | } | 215 | } |
210 | 216 | ||
211 | /** | 217 | /** |
@@ -222,15 +228,15 @@ void getnstimeofday(struct timespec *ts) | |||
222 | WARN_ON(timekeeping_suspended); | 228 | WARN_ON(timekeeping_suspended); |
223 | 229 | ||
224 | do { | 230 | do { |
225 | seq = read_seqbegin(&xtime_lock); | 231 | seq = read_seqbegin(&timekeeper.lock); |
226 | 232 | ||
227 | *ts = xtime; | 233 | *ts = timekeeper.xtime; |
228 | nsecs = timekeeping_get_ns(); | 234 | nsecs = timekeeping_get_ns(); |
229 | 235 | ||
230 | /* If arch requires, add in gettimeoffset() */ | 236 | /* If arch requires, add in gettimeoffset() */ |
231 | nsecs += arch_gettimeoffset(); | 237 | nsecs += arch_gettimeoffset(); |
232 | 238 | ||
233 | } while (read_seqretry(&xtime_lock, seq)); | 239 | } while (read_seqretry(&timekeeper.lock, seq)); |
234 | 240 | ||
235 | timespec_add_ns(ts, nsecs); | 241 | timespec_add_ns(ts, nsecs); |
236 | } | 242 | } |
@@ -245,14 +251,16 @@ ktime_t ktime_get(void) | |||
245 | WARN_ON(timekeeping_suspended); | 251 | WARN_ON(timekeeping_suspended); |
246 | 252 | ||
247 | do { | 253 | do { |
248 | seq = read_seqbegin(&xtime_lock); | 254 | seq = read_seqbegin(&timekeeper.lock); |
249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; | 255 | secs = timekeeper.xtime.tv_sec + |
250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; | 256 | timekeeper.wall_to_monotonic.tv_sec; |
257 | nsecs = timekeeper.xtime.tv_nsec + | ||
258 | timekeeper.wall_to_monotonic.tv_nsec; | ||
251 | nsecs += timekeeping_get_ns(); | 259 | nsecs += timekeeping_get_ns(); |
252 | /* If arch requires, add in gettimeoffset() */ | 260 | /* If arch requires, add in gettimeoffset() */ |
253 | nsecs += arch_gettimeoffset(); | 261 | nsecs += arch_gettimeoffset(); |
254 | 262 | ||
255 | } while (read_seqretry(&xtime_lock, seq)); | 263 | } while (read_seqretry(&timekeeper.lock, seq)); |
256 | /* | 264 | /* |
257 | * Use ktime_set/ktime_add_ns to create a proper ktime on | 265 | * Use ktime_set/ktime_add_ns to create a proper ktime on |
258 | * 32-bit architectures without CONFIG_KTIME_SCALAR. | 266 | * 32-bit architectures without CONFIG_KTIME_SCALAR. |
@@ -278,14 +286,14 @@ void ktime_get_ts(struct timespec *ts) | |||
278 | WARN_ON(timekeeping_suspended); | 286 | WARN_ON(timekeeping_suspended); |
279 | 287 | ||
280 | do { | 288 | do { |
281 | seq = read_seqbegin(&xtime_lock); | 289 | seq = read_seqbegin(&timekeeper.lock); |
282 | *ts = xtime; | 290 | *ts = timekeeper.xtime; |
283 | tomono = wall_to_monotonic; | 291 | tomono = timekeeper.wall_to_monotonic; |
284 | nsecs = timekeeping_get_ns(); | 292 | nsecs = timekeeping_get_ns(); |
285 | /* If arch requires, add in gettimeoffset() */ | 293 | /* If arch requires, add in gettimeoffset() */ |
286 | nsecs += arch_gettimeoffset(); | 294 | nsecs += arch_gettimeoffset(); |
287 | 295 | ||
288 | } while (read_seqretry(&xtime_lock, seq)); | 296 | } while (read_seqretry(&timekeeper.lock, seq)); |
289 | 297 | ||
290 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | 298 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, |
291 | ts->tv_nsec + tomono.tv_nsec + nsecs); | 299 | ts->tv_nsec + tomono.tv_nsec + nsecs); |
@@ -313,10 +321,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
313 | do { | 321 | do { |
314 | u32 arch_offset; | 322 | u32 arch_offset; |
315 | 323 | ||
316 | seq = read_seqbegin(&xtime_lock); | 324 | seq = read_seqbegin(&timekeeper.lock); |
317 | 325 | ||
318 | *ts_raw = raw_time; | 326 | *ts_raw = timekeeper.raw_time; |
319 | *ts_real = xtime; | 327 | *ts_real = timekeeper.xtime; |
320 | 328 | ||
321 | nsecs_raw = timekeeping_get_ns_raw(); | 329 | nsecs_raw = timekeeping_get_ns_raw(); |
322 | nsecs_real = timekeeping_get_ns(); | 330 | nsecs_real = timekeeping_get_ns(); |
@@ -326,7 +334,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
326 | nsecs_raw += arch_offset; | 334 | nsecs_raw += arch_offset; |
327 | nsecs_real += arch_offset; | 335 | nsecs_real += arch_offset; |
328 | 336 | ||
329 | } while (read_seqretry(&xtime_lock, seq)); | 337 | } while (read_seqretry(&timekeeper.lock, seq)); |
330 | 338 | ||
331 | timespec_add_ns(ts_raw, nsecs_raw); | 339 | timespec_add_ns(ts_raw, nsecs_raw); |
332 | timespec_add_ns(ts_real, nsecs_real); | 340 | timespec_add_ns(ts_real, nsecs_real); |
@@ -365,23 +373,19 @@ int do_settimeofday(const struct timespec *tv) | |||
365 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | 373 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) |
366 | return -EINVAL; | 374 | return -EINVAL; |
367 | 375 | ||
368 | write_seqlock_irqsave(&xtime_lock, flags); | 376 | write_seqlock_irqsave(&timekeeper.lock, flags); |
369 | 377 | ||
370 | timekeeping_forward_now(); | 378 | timekeeping_forward_now(); |
371 | 379 | ||
372 | ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; | 380 | ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; |
373 | ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; | 381 | ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; |
374 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); | 382 | timekeeper.wall_to_monotonic = |
383 | timespec_sub(timekeeper.wall_to_monotonic, ts_delta); | ||
375 | 384 | ||
376 | xtime = *tv; | 385 | timekeeper.xtime = *tv; |
377 | 386 | timekeeping_update(true); | |
378 | timekeeper.ntp_error = 0; | ||
379 | ntp_clear(); | ||
380 | 387 | ||
381 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | 388 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
382 | timekeeper.mult); | ||
383 | |||
384 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
385 | 389 | ||
386 | /* signal hrtimers about time change */ | 390 | /* signal hrtimers about time change */ |
387 | clock_was_set(); | 391 | clock_was_set(); |
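The tail that used to sit here (zeroing ntp_error, ntp_clear(), update_vsyscall()) is folded into a single timekeeping_update(true) call. The helper itself is outside this hunk; judging only from the lines it replaces, it presumably amounts to something like this reconstruction, not the verbatim source:

/* Reconstruction: re-sync NTP state and the vsyscall data after the
 * timekeeper has been changed under timekeeper.lock.
 */
static void timekeeping_update(bool clearntp)
{
	if (clearntp) {
		timekeeper.ntp_error = 0;
		ntp_clear();
	}
	update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
			timekeeper.clock, timekeeper.mult);
}

Passing false, as update_wall_time() does further down, would skip the NTP reset and only refresh the vsyscall copy.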
@@ -405,20 +409,17 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
405 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | 409 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) |
406 | return -EINVAL; | 410 | return -EINVAL; |
407 | 411 | ||
408 | write_seqlock_irqsave(&xtime_lock, flags); | 412 | write_seqlock_irqsave(&timekeeper.lock, flags); |
409 | 413 | ||
410 | timekeeping_forward_now(); | 414 | timekeeping_forward_now(); |
411 | 415 | ||
412 | xtime = timespec_add(xtime, *ts); | 416 | timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); |
413 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); | 417 | timekeeper.wall_to_monotonic = |
414 | 418 | timespec_sub(timekeeper.wall_to_monotonic, *ts); | |
415 | timekeeper.ntp_error = 0; | ||
416 | ntp_clear(); | ||
417 | 419 | ||
418 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | 420 | timekeeping_update(true); |
419 | timekeeper.mult); | ||
420 | 421 | ||
421 | write_sequnlock_irqrestore(&xtime_lock, flags); | 422 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
422 | 423 | ||
423 | /* signal hrtimers about time change */ | 424 | /* signal hrtimers about time change */ |
424 | clock_was_set(); | 425 | clock_was_set(); |
@@ -435,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset); | |||
435 | static int change_clocksource(void *data) | 436 | static int change_clocksource(void *data) |
436 | { | 437 | { |
437 | struct clocksource *new, *old; | 438 | struct clocksource *new, *old; |
439 | unsigned long flags; | ||
438 | 440 | ||
439 | new = (struct clocksource *) data; | 441 | new = (struct clocksource *) data; |
440 | 442 | ||
443 | write_seqlock_irqsave(&timekeeper.lock, flags); | ||
444 | |||
441 | timekeeping_forward_now(); | 445 | timekeeping_forward_now(); |
442 | if (!new->enable || new->enable(new) == 0) { | 446 | if (!new->enable || new->enable(new) == 0) { |
443 | old = timekeeper.clock; | 447 | old = timekeeper.clock; |
@@ -445,6 +449,10 @@ static int change_clocksource(void *data) | |||
445 | if (old->disable) | 449 | if (old->disable) |
446 | old->disable(old); | 450 | old->disable(old); |
447 | } | 451 | } |
452 | timekeeping_update(true); | ||
453 | |||
454 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | ||
455 | |||
448 | return 0; | 456 | return 0; |
449 | } | 457 | } |
450 | 458 | ||
@@ -490,11 +498,11 @@ void getrawmonotonic(struct timespec *ts) | |||
490 | s64 nsecs; | 498 | s64 nsecs; |
491 | 499 | ||
492 | do { | 500 | do { |
493 | seq = read_seqbegin(&xtime_lock); | 501 | seq = read_seqbegin(&timekeeper.lock); |
494 | nsecs = timekeeping_get_ns_raw(); | 502 | nsecs = timekeeping_get_ns_raw(); |
495 | *ts = raw_time; | 503 | *ts = timekeeper.raw_time; |
496 | 504 | ||
497 | } while (read_seqretry(&xtime_lock, seq)); | 505 | } while (read_seqretry(&timekeeper.lock, seq)); |
498 | 506 | ||
499 | timespec_add_ns(ts, nsecs); | 507 | timespec_add_ns(ts, nsecs); |
500 | } | 508 | } |
@@ -510,24 +518,30 @@ int timekeeping_valid_for_hres(void) | |||
510 | int ret; | 518 | int ret; |
511 | 519 | ||
512 | do { | 520 | do { |
513 | seq = read_seqbegin(&xtime_lock); | 521 | seq = read_seqbegin(&timekeeper.lock); |
514 | 522 | ||
515 | ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 523 | ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
516 | 524 | ||
517 | } while (read_seqretry(&xtime_lock, seq)); | 525 | } while (read_seqretry(&timekeeper.lock, seq)); |
518 | 526 | ||
519 | return ret; | 527 | return ret; |
520 | } | 528 | } |
521 | 529 | ||
522 | /** | 530 | /** |
523 | * timekeeping_max_deferment - Returns max time the clocksource can be deferred | 531 | * timekeeping_max_deferment - Returns max time the clocksource can be deferred |
524 | * | ||
525 | * Caller must observe xtime_lock via read_seqbegin/read_seqretry to | ||
526 | * ensure that the clocksource does not change! | ||
527 | */ | 532 | */ |
528 | u64 timekeeping_max_deferment(void) | 533 | u64 timekeeping_max_deferment(void) |
529 | { | 534 | { |
530 | return timekeeper.clock->max_idle_ns; | 535 | unsigned long seq; |
536 | u64 ret; | ||
537 | do { | ||
538 | seq = read_seqbegin(&timekeeper.lock); | ||
539 | |||
540 | ret = timekeeper.clock->max_idle_ns; | ||
541 | |||
542 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
543 | |||
544 | return ret; | ||
531 | } | 545 | } |
532 | 546 | ||
533 | /** | 547 | /** |
@@ -572,28 +586,29 @@ void __init timekeeping_init(void) | |||
572 | read_persistent_clock(&now); | 586 | read_persistent_clock(&now); |
573 | read_boot_clock(&boot); | 587 | read_boot_clock(&boot); |
574 | 588 | ||
575 | write_seqlock_irqsave(&xtime_lock, flags); | 589 | seqlock_init(&timekeeper.lock); |
576 | 590 | ||
577 | ntp_init(); | 591 | ntp_init(); |
578 | 592 | ||
593 | write_seqlock_irqsave(&timekeeper.lock, flags); | ||
579 | clock = clocksource_default_clock(); | 594 | clock = clocksource_default_clock(); |
580 | if (clock->enable) | 595 | if (clock->enable) |
581 | clock->enable(clock); | 596 | clock->enable(clock); |
582 | timekeeper_setup_internals(clock); | 597 | timekeeper_setup_internals(clock); |
583 | 598 | ||
584 | xtime.tv_sec = now.tv_sec; | 599 | timekeeper.xtime.tv_sec = now.tv_sec; |
585 | xtime.tv_nsec = now.tv_nsec; | 600 | timekeeper.xtime.tv_nsec = now.tv_nsec; |
586 | raw_time.tv_sec = 0; | 601 | timekeeper.raw_time.tv_sec = 0; |
587 | raw_time.tv_nsec = 0; | 602 | timekeeper.raw_time.tv_nsec = 0; |
588 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) { | 603 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) { |
589 | boot.tv_sec = xtime.tv_sec; | 604 | boot.tv_sec = timekeeper.xtime.tv_sec; |
590 | boot.tv_nsec = xtime.tv_nsec; | 605 | boot.tv_nsec = timekeeper.xtime.tv_nsec; |
591 | } | 606 | } |
592 | set_normalized_timespec(&wall_to_monotonic, | 607 | set_normalized_timespec(&timekeeper.wall_to_monotonic, |
593 | -boot.tv_sec, -boot.tv_nsec); | 608 | -boot.tv_sec, -boot.tv_nsec); |
594 | total_sleep_time.tv_sec = 0; | 609 | timekeeper.total_sleep_time.tv_sec = 0; |
595 | total_sleep_time.tv_nsec = 0; | 610 | timekeeper.total_sleep_time.tv_nsec = 0; |
596 | write_sequnlock_irqrestore(&xtime_lock, flags); | 611 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
597 | } | 612 | } |
598 | 613 | ||
599 | /* time in seconds when suspend began */ | 614 | /* time in seconds when suspend began */ |
@@ -614,9 +629,11 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) | |||
614 | return; | 629 | return; |
615 | } | 630 | } |
616 | 631 | ||
617 | xtime = timespec_add(xtime, *delta); | 632 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); |
618 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); | 633 | timekeeper.wall_to_monotonic = |
619 | total_sleep_time = timespec_add(total_sleep_time, *delta); | 634 | timespec_sub(timekeeper.wall_to_monotonic, *delta); |
635 | timekeeper.total_sleep_time = timespec_add( | ||
636 | timekeeper.total_sleep_time, *delta); | ||
620 | } | 637 | } |
621 | 638 | ||
622 | 639 | ||
@@ -640,17 +657,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
640 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | 657 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) |
641 | return; | 658 | return; |
642 | 659 | ||
643 | write_seqlock_irqsave(&xtime_lock, flags); | 660 | write_seqlock_irqsave(&timekeeper.lock, flags); |
661 | |||
644 | timekeeping_forward_now(); | 662 | timekeeping_forward_now(); |
645 | 663 | ||
646 | __timekeeping_inject_sleeptime(delta); | 664 | __timekeeping_inject_sleeptime(delta); |
647 | 665 | ||
648 | timekeeper.ntp_error = 0; | 666 | timekeeping_update(true); |
649 | ntp_clear(); | ||
650 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
651 | timekeeper.mult); | ||
652 | 667 | ||
653 | write_sequnlock_irqrestore(&xtime_lock, flags); | 668 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
654 | 669 | ||
655 | /* signal hrtimers about time change */ | 670 | /* signal hrtimers about time change */ |
656 | clock_was_set(); | 671 | clock_was_set(); |
@@ -673,7 +688,7 @@ static void timekeeping_resume(void) | |||
673 | 688 | ||
674 | clocksource_resume(); | 689 | clocksource_resume(); |
675 | 690 | ||
676 | write_seqlock_irqsave(&xtime_lock, flags); | 691 | write_seqlock_irqsave(&timekeeper.lock, flags); |
677 | 692 | ||
678 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 693 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
679 | ts = timespec_sub(ts, timekeeping_suspend_time); | 694 | ts = timespec_sub(ts, timekeeping_suspend_time); |
@@ -683,7 +698,7 @@ static void timekeeping_resume(void) | |||
683 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 698 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
684 | timekeeper.ntp_error = 0; | 699 | timekeeper.ntp_error = 0; |
685 | timekeeping_suspended = 0; | 700 | timekeeping_suspended = 0; |
686 | write_sequnlock_irqrestore(&xtime_lock, flags); | 701 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
687 | 702 | ||
688 | touch_softlockup_watchdog(); | 703 | touch_softlockup_watchdog(); |
689 | 704 | ||
@@ -701,7 +716,7 @@ static int timekeeping_suspend(void) | |||
701 | 716 | ||
702 | read_persistent_clock(&timekeeping_suspend_time); | 717 | read_persistent_clock(&timekeeping_suspend_time); |
703 | 718 | ||
704 | write_seqlock_irqsave(&xtime_lock, flags); | 719 | write_seqlock_irqsave(&timekeeper.lock, flags); |
705 | timekeeping_forward_now(); | 720 | timekeeping_forward_now(); |
706 | timekeeping_suspended = 1; | 721 | timekeeping_suspended = 1; |
707 | 722 | ||
@@ -711,7 +726,7 @@ static int timekeeping_suspend(void) | |||
711 | * try to compensate so the difference in system time | 726 | * try to compensate so the difference in system time |
712 | * and persistent_clock time stays close to constant. | 727 | * and persistent_clock time stays close to constant. |
713 | */ | 728 | */ |
714 | delta = timespec_sub(xtime, timekeeping_suspend_time); | 729 | delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); |
715 | delta_delta = timespec_sub(delta, old_delta); | 730 | delta_delta = timespec_sub(delta, old_delta); |
716 | if (abs(delta_delta.tv_sec) >= 2) { | 731 | if (abs(delta_delta.tv_sec) >= 2) { |
717 | /* | 732 | /* |
@@ -724,7 +739,7 @@ static int timekeeping_suspend(void) | |||
724 | timekeeping_suspend_time = | 739 | timekeeping_suspend_time = |
725 | timespec_add(timekeeping_suspend_time, delta_delta); | 740 | timespec_add(timekeeping_suspend_time, delta_delta); |
726 | } | 741 | } |
727 | write_sequnlock_irqrestore(&xtime_lock, flags); | 742 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
728 | 743 | ||
729 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 744 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
730 | clocksource_suspend(); | 745 | clocksource_suspend(); |
@@ -775,7 +790,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, | |||
775 | * Now calculate the error in (1 << look_ahead) ticks, but first | 790 | * Now calculate the error in (1 << look_ahead) ticks, but first |
776 | * remove the single look ahead already included in the error. | 791 | * remove the single look ahead already included in the error. |
777 | */ | 792 | */ |
778 | tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); | 793 | tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); |
779 | tick_error -= timekeeper.xtime_interval >> 1; | 794 | tick_error -= timekeeper.xtime_interval >> 1; |
780 | error = ((error - tick_error) >> look_ahead) + tick_error; | 795 | error = ((error - tick_error) >> look_ahead) + tick_error; |
781 | 796 | ||
@@ -807,7 +822,7 @@ static void timekeeping_adjust(s64 offset) | |||
807 | int adj; | 822 | int adj; |
808 | 823 | ||
809 | /* | 824 | /* |
810 | * The point of this is to check if the error is greater then half | 825 | * The point of this is to check if the error is greater than half |
811 | * an interval. | 826 | * an interval. |
812 | * | 827 | * |
813 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | 828 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. |
@@ -815,7 +830,7 @@ static void timekeeping_adjust(s64 offset) | |||
815 | * Note we subtract one in the shift, so that error is really error*2. | 830 | * Note we subtract one in the shift, so that error is really error*2. |
816 | * This "saves" dividing(shifting) interval twice, but keeps the | 831 | * This "saves" dividing(shifting) interval twice, but keeps the |
817 | * (error > interval) comparison as still measuring if error is | 832 | * (error > interval) comparison as still measuring if error is |
818 | * larger then half an interval. | 833 | * larger than half an interval. |
819 | * | 834 | * |
820 | * Note: It does not "save" on aggravation when reading the code. | 835 | * Note: It does not "save" on aggravation when reading the code. |
821 | */ | 836 | */ |
@@ -823,7 +838,7 @@ static void timekeeping_adjust(s64 offset) | |||
823 | if (error > interval) { | 838 | if (error > interval) { |
824 | /* | 839 | /* |
825 | * We now divide error by 4(via shift), which checks if | 840 | * We now divide error by 4(via shift), which checks if |
826 | * the error is greater then twice the interval. | 841 | * the error is greater than twice the interval. |
827 | * If it is greater, we need a bigadjust, if its smaller, | 842 | * If it is greater, we need a bigadjust, if its smaller, |
828 | * we can adjust by 1. | 843 | * we can adjust by 1. |
829 | */ | 844 | */ |
@@ -854,13 +869,15 @@ static void timekeeping_adjust(s64 offset) | |||
854 | } else /* No adjustment needed */ | 869 | } else /* No adjustment needed */ |
855 | return; | 870 | return; |
856 | 871 | ||
857 | WARN_ONCE(timekeeper.clock->maxadj && | 872 | if (unlikely(timekeeper.clock->maxadj && |
858 | (timekeeper.mult + adj > timekeeper.clock->mult + | 873 | (timekeeper.mult + adj > |
859 | timekeeper.clock->maxadj), | 874 | timekeeper.clock->mult + timekeeper.clock->maxadj))) { |
860 | "Adjusting %s more then 11%% (%ld vs %ld)\n", | 875 | printk_once(KERN_WARNING |
876 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | ||
861 | timekeeper.clock->name, (long)timekeeper.mult + adj, | 877 | timekeeper.clock->name, (long)timekeeper.mult + adj, |
862 | (long)timekeeper.clock->mult + | 878 | (long)timekeeper.clock->mult + |
863 | timekeeper.clock->maxadj); | 879 | timekeeper.clock->maxadj); |
880 | } | ||
864 | /* | 881 | /* |
865 | * So the following can be confusing. | 882 | * So the following can be confusing. |
866 | * | 883 | * |
@@ -932,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
932 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; | 949 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; |
933 | u64 raw_nsecs; | 950 | u64 raw_nsecs; |
934 | 951 | ||
935 | /* If the offset is smaller then a shifted interval, do nothing */ | 952 | /* If the offset is smaller than a shifted interval, do nothing */ |
936 | if (offset < timekeeper.cycle_interval<<shift) | 953 | if (offset < timekeeper.cycle_interval<<shift) |
937 | return offset; | 954 | return offset; |
938 | 955 | ||
@@ -942,23 +959,25 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
942 | 959 | ||
943 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; | 960 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; |
944 | while (timekeeper.xtime_nsec >= nsecps) { | 961 | while (timekeeper.xtime_nsec >= nsecps) { |
962 | int leap; | ||
945 | timekeeper.xtime_nsec -= nsecps; | 963 | timekeeper.xtime_nsec -= nsecps; |
946 | xtime.tv_sec++; | 964 | timekeeper.xtime.tv_sec++; |
947 | second_overflow(); | 965 | leap = second_overflow(timekeeper.xtime.tv_sec); |
966 | timekeeper.xtime.tv_sec += leap; | ||
948 | } | 967 | } |
949 | 968 | ||
950 | /* Accumulate raw time */ | 969 | /* Accumulate raw time */ |
951 | raw_nsecs = timekeeper.raw_interval << shift; | 970 | raw_nsecs = timekeeper.raw_interval << shift; |
952 | raw_nsecs += raw_time.tv_nsec; | 971 | raw_nsecs += timekeeper.raw_time.tv_nsec; |
953 | if (raw_nsecs >= NSEC_PER_SEC) { | 972 | if (raw_nsecs >= NSEC_PER_SEC) { |
954 | u64 raw_secs = raw_nsecs; | 973 | u64 raw_secs = raw_nsecs; |
955 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); | 974 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); |
956 | raw_time.tv_sec += raw_secs; | 975 | timekeeper.raw_time.tv_sec += raw_secs; |
957 | } | 976 | } |
958 | raw_time.tv_nsec = raw_nsecs; | 977 | timekeeper.raw_time.tv_nsec = raw_nsecs; |
959 | 978 | ||
960 | /* Accumulate error between NTP and clock interval */ | 979 | /* Accumulate error between NTP and clock interval */ |
961 | timekeeper.ntp_error += tick_length << shift; | 980 | timekeeper.ntp_error += ntp_tick_length() << shift; |
962 | timekeeper.ntp_error -= | 981 | timekeeper.ntp_error -= |
963 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << | 982 | (timekeeper.xtime_interval + timekeeper.xtime_remainder) << |
964 | (timekeeper.ntp_error_shift + shift); | 983 | (timekeeper.ntp_error_shift + shift); |
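second_overflow() no longer adjusts xtime behind the timekeeper's back: it takes the current second and returns the leap adjustment, which the accumulation loop applies while still holding timekeeper.lock (the same pattern reappears in update_wall_time() below). The shape of the loop in isolation, with demo_* names standing in for the real timekeeper and the ntp.c helper:

#include <linux/time.h>
#include <linux/types.h>

struct demo_tk {
	u64		xtime_nsec;	/* nanoseconds, left-shifted by the clock shift */
	struct timespec	xtime;
};

/* Stand-in for the ntp.c helper; the real one returns -1, 0 or +1. */
static int demo_second_overflow(time_t secs)
{
	return 0;
}

/* Roll whole seconds out of the shifted-nanosecond accumulator and let
 * the leap hook adjust the seconds count in the same locked region. */
static void demo_accumulate_secs(struct demo_tk *tk, u64 nsecps)
{
	while (tk->xtime_nsec >= nsecps) {
		int leap;

		tk->xtime_nsec -= nsecps;
		tk->xtime.tv_sec++;
		leap = demo_second_overflow(tk->xtime.tv_sec);
		tk->xtime.tv_sec += leap;
	}
}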
@@ -970,17 +989,19 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
970 | /** | 989 | /** |
971 | * update_wall_time - Uses the current clocksource to increment the wall time | 990 | * update_wall_time - Uses the current clocksource to increment the wall time |
972 | * | 991 | * |
973 | * Called from the timer interrupt, must hold a write on xtime_lock. | ||
974 | */ | 992 | */ |
975 | static void update_wall_time(void) | 993 | static void update_wall_time(void) |
976 | { | 994 | { |
977 | struct clocksource *clock; | 995 | struct clocksource *clock; |
978 | cycle_t offset; | 996 | cycle_t offset; |
979 | int shift = 0, maxshift; | 997 | int shift = 0, maxshift; |
998 | unsigned long flags; | ||
999 | |||
1000 | write_seqlock_irqsave(&timekeeper.lock, flags); | ||
980 | 1001 | ||
981 | /* Make sure we're fully resumed: */ | 1002 | /* Make sure we're fully resumed: */ |
982 | if (unlikely(timekeeping_suspended)) | 1003 | if (unlikely(timekeeping_suspended)) |
983 | return; | 1004 | goto out; |
984 | 1005 | ||
985 | clock = timekeeper.clock; | 1006 | clock = timekeeper.clock; |
986 | 1007 | ||
@@ -989,20 +1010,21 @@ static void update_wall_time(void) | |||
989 | #else | 1010 | #else |
990 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 1011 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; |
991 | #endif | 1012 | #endif |
992 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; | 1013 | timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << |
1014 | timekeeper.shift; | ||
993 | 1015 | ||
994 | /* | 1016 | /* |
995 | * With NO_HZ we may have to accumulate many cycle_intervals | 1017 | * With NO_HZ we may have to accumulate many cycle_intervals |
996 | * (think "ticks") worth of time at once. To do this efficiently, | 1018 | * (think "ticks") worth of time at once. To do this efficiently, |
997 | * we calculate the largest doubling multiple of cycle_intervals | 1019 | * we calculate the largest doubling multiple of cycle_intervals |
998 | * that is smaller then the offset. We then accumulate that | 1020 | * that is smaller than the offset. We then accumulate that |
999 | * chunk in one go, and then try to consume the next smaller | 1021 | * chunk in one go, and then try to consume the next smaller |
1000 | * doubled multiple. | 1022 | * doubled multiple. |
1001 | */ | 1023 | */ |
1002 | shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); | 1024 | shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); |
1003 | shift = max(0, shift); | 1025 | shift = max(0, shift); |
1004 | /* Bound shift to one less then what overflows tick_length */ | 1026 | /* Bound shift to one less than what overflows tick_length */ |
1005 | maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; | 1027 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; |
1006 | shift = min(shift, maxshift); | 1028 | shift = min(shift, maxshift); |
1007 | while (offset >= timekeeper.cycle_interval) { | 1029 | while (offset >= timekeeper.cycle_interval) { |
1008 | offset = logarithmic_accumulation(offset, shift); | 1030 | offset = logarithmic_accumulation(offset, shift); |
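A quick sanity check on the new bound also shows why the open-coded 8*sizeof(tick_length) could become a plain 64: ntp_tick_length() returns a u64. The numbers below are illustrative, assuming HZ=1000:

/* Illustrative arithmetic only (HZ = 1000):
 *   ntp_tick_length()  ~ 1,000,000 ns << 32  ~ 2^52
 *   ilog2(~2^52)        = 51
 *   maxshift            = (64 - (51 + 1)) - 1 = 11
 * so one accumulation step folds in at most 2^11 tick intervals
 * (about two seconds) without ntp_tick_length() << shift overflowing
 * a 64-bit value, with one bit of headroom to spare.
 */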
@@ -1040,24 +1062,30 @@ static void update_wall_time(void) | |||
1040 | * Store full nanoseconds into xtime after rounding it up and | 1062 | * Store full nanoseconds into xtime after rounding it up and |
1041 | * add the remainder to the error difference. | 1063 | * add the remainder to the error difference. |
1042 | */ | 1064 | */ |
1043 | xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; | 1065 | timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> |
1044 | timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; | 1066 | timekeeper.shift) + 1; |
1067 | timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << | ||
1068 | timekeeper.shift; | ||
1045 | timekeeper.ntp_error += timekeeper.xtime_nsec << | 1069 | timekeeper.ntp_error += timekeeper.xtime_nsec << |
1046 | timekeeper.ntp_error_shift; | 1070 | timekeeper.ntp_error_shift; |
1047 | 1071 | ||
1048 | /* | 1072 | /* |
1049 | * Finally, make sure that after the rounding | 1073 | * Finally, make sure that after the rounding |
1050 | * xtime.tv_nsec isn't larger then NSEC_PER_SEC | 1074 | * xtime.tv_nsec isn't larger than NSEC_PER_SEC |
1051 | */ | 1075 | */ |
1052 | if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { | 1076 | if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { |
1053 | xtime.tv_nsec -= NSEC_PER_SEC; | 1077 | int leap; |
1054 | xtime.tv_sec++; | 1078 | timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; |
1055 | second_overflow(); | 1079 | timekeeper.xtime.tv_sec++; |
1080 | leap = second_overflow(timekeeper.xtime.tv_sec); | ||
1081 | timekeeper.xtime.tv_sec += leap; | ||
1056 | } | 1082 | } |
1057 | 1083 | ||
1058 | /* check to see if there is a new clocksource to use */ | 1084 | timekeeping_update(false); |
1059 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | 1085 | |
1060 | timekeeper.mult); | 1086 | out: |
1087 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | ||
1088 | |||
1061 | } | 1089 | } |
1062 | 1090 | ||
1063 | /** | 1091 | /** |
@@ -1074,8 +1102,10 @@ static void update_wall_time(void) | |||
1074 | void getboottime(struct timespec *ts) | 1102 | void getboottime(struct timespec *ts) |
1075 | { | 1103 | { |
1076 | struct timespec boottime = { | 1104 | struct timespec boottime = { |
1077 | .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, | 1105 | .tv_sec = timekeeper.wall_to_monotonic.tv_sec + |
1078 | .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec | 1106 | timekeeper.total_sleep_time.tv_sec, |
1107 | .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + | ||
1108 | timekeeper.total_sleep_time.tv_nsec | ||
1079 | }; | 1109 | }; |
1080 | 1110 | ||
1081 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); | 1111 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); |
@@ -1101,13 +1131,13 @@ void get_monotonic_boottime(struct timespec *ts) | |||
1101 | WARN_ON(timekeeping_suspended); | 1131 | WARN_ON(timekeeping_suspended); |
1102 | 1132 | ||
1103 | do { | 1133 | do { |
1104 | seq = read_seqbegin(&xtime_lock); | 1134 | seq = read_seqbegin(&timekeeper.lock); |
1105 | *ts = xtime; | 1135 | *ts = timekeeper.xtime; |
1106 | tomono = wall_to_monotonic; | 1136 | tomono = timekeeper.wall_to_monotonic; |
1107 | sleep = total_sleep_time; | 1137 | sleep = timekeeper.total_sleep_time; |
1108 | nsecs = timekeeping_get_ns(); | 1138 | nsecs = timekeeping_get_ns(); |
1109 | 1139 | ||
1110 | } while (read_seqretry(&xtime_lock, seq)); | 1140 | } while (read_seqretry(&timekeeper.lock, seq)); |
1111 | 1141 | ||
1112 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, | 1142 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, |
1113 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); | 1143 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); |
@@ -1137,19 +1167,19 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); | |||
1137 | */ | 1167 | */ |
1138 | void monotonic_to_bootbased(struct timespec *ts) | 1168 | void monotonic_to_bootbased(struct timespec *ts) |
1139 | { | 1169 | { |
1140 | *ts = timespec_add(*ts, total_sleep_time); | 1170 | *ts = timespec_add(*ts, timekeeper.total_sleep_time); |
1141 | } | 1171 | } |
1142 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | 1172 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); |
1143 | 1173 | ||
1144 | unsigned long get_seconds(void) | 1174 | unsigned long get_seconds(void) |
1145 | { | 1175 | { |
1146 | return xtime.tv_sec; | 1176 | return timekeeper.xtime.tv_sec; |
1147 | } | 1177 | } |
1148 | EXPORT_SYMBOL(get_seconds); | 1178 | EXPORT_SYMBOL(get_seconds); |
1149 | 1179 | ||
1150 | struct timespec __current_kernel_time(void) | 1180 | struct timespec __current_kernel_time(void) |
1151 | { | 1181 | { |
1152 | return xtime; | 1182 | return timekeeper.xtime; |
1153 | } | 1183 | } |
1154 | 1184 | ||
1155 | struct timespec current_kernel_time(void) | 1185 | struct timespec current_kernel_time(void) |
@@ -1158,10 +1188,10 @@ struct timespec current_kernel_time(void) | |||
1158 | unsigned long seq; | 1188 | unsigned long seq; |
1159 | 1189 | ||
1160 | do { | 1190 | do { |
1161 | seq = read_seqbegin(&xtime_lock); | 1191 | seq = read_seqbegin(&timekeeper.lock); |
1162 | 1192 | ||
1163 | now = xtime; | 1193 | now = timekeeper.xtime; |
1164 | } while (read_seqretry(&xtime_lock, seq)); | 1194 | } while (read_seqretry(&timekeeper.lock, seq)); |
1165 | 1195 | ||
1166 | return now; | 1196 | return now; |
1167 | } | 1197 | } |
@@ -1173,11 +1203,11 @@ struct timespec get_monotonic_coarse(void) | |||
1173 | unsigned long seq; | 1203 | unsigned long seq; |
1174 | 1204 | ||
1175 | do { | 1205 | do { |
1176 | seq = read_seqbegin(&xtime_lock); | 1206 | seq = read_seqbegin(&timekeeper.lock); |
1177 | 1207 | ||
1178 | now = xtime; | 1208 | now = timekeeper.xtime; |
1179 | mono = wall_to_monotonic; | 1209 | mono = timekeeper.wall_to_monotonic; |
1180 | } while (read_seqretry(&xtime_lock, seq)); | 1210 | } while (read_seqretry(&timekeeper.lock, seq)); |
1181 | 1211 | ||
1182 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, | 1212 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, |
1183 | now.tv_nsec + mono.tv_nsec); | 1213 | now.tv_nsec + mono.tv_nsec); |
@@ -1209,11 +1239,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1209 | unsigned long seq; | 1239 | unsigned long seq; |
1210 | 1240 | ||
1211 | do { | 1241 | do { |
1212 | seq = read_seqbegin(&xtime_lock); | 1242 | seq = read_seqbegin(&timekeeper.lock); |
1213 | *xtim = xtime; | 1243 | *xtim = timekeeper.xtime; |
1214 | *wtom = wall_to_monotonic; | 1244 | *wtom = timekeeper.wall_to_monotonic; |
1215 | *sleep = total_sleep_time; | 1245 | *sleep = timekeeper.total_sleep_time; |
1216 | } while (read_seqretry(&xtime_lock, seq)); | 1246 | } while (read_seqretry(&timekeeper.lock, seq)); |
1217 | } | 1247 | } |
1218 | 1248 | ||
1219 | /** | 1249 | /** |
@@ -1225,11 +1255,14 @@ ktime_t ktime_get_monotonic_offset(void) | |||
1225 | struct timespec wtom; | 1255 | struct timespec wtom; |
1226 | 1256 | ||
1227 | do { | 1257 | do { |
1228 | seq = read_seqbegin(&xtime_lock); | 1258 | seq = read_seqbegin(&timekeeper.lock); |
1229 | wtom = wall_to_monotonic; | 1259 | wtom = timekeeper.wall_to_monotonic; |
1230 | } while (read_seqretry(&xtime_lock, seq)); | 1260 | } while (read_seqretry(&timekeeper.lock, seq)); |
1261 | |||
1231 | return timespec_to_ktime(wtom); | 1262 | return timespec_to_ktime(wtom); |
1232 | } | 1263 | } |
1264 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | ||
1265 | |||
1233 | 1266 | ||
1234 | /** | 1267 | /** |
1235 | * xtime_update() - advances the timekeeping infrastructure | 1268 | * xtime_update() - advances the timekeeping infrastructure |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cd3134510f3d..a1d2849f2473 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,7 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE | 144 | select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE |
145 | select KALLSYMS | 145 | select KALLSYMS |
146 | select GENERIC_TRACER | 146 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 147 | select CONTEXT_SWITCH_TRACER |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 683d559a0eef..0fa92f677c92 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -62,6 +62,8 @@ | |||
62 | #define FTRACE_HASH_DEFAULT_BITS 10 | 62 | #define FTRACE_HASH_DEFAULT_BITS 10 |
63 | #define FTRACE_HASH_MAX_BITS 12 | 63 | #define FTRACE_HASH_MAX_BITS 12 |
64 | 64 | ||
65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) | ||
66 | |||
65 | /* ftrace_enabled is a method to turn ftrace on or off */ | 67 | /* ftrace_enabled is a method to turn ftrace on or off */ |
66 | int ftrace_enabled __read_mostly; | 68 | int ftrace_enabled __read_mostly; |
67 | static int last_ftrace_enabled; | 69 | static int last_ftrace_enabled; |
@@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { | |||
89 | }; | 91 | }; |
90 | 92 | ||
91 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; | 93 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
94 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; | ||
92 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 95 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; |
93 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 96 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
94 | static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; | 97 | static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; |
95 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | 98 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; |
96 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 99 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
97 | static struct ftrace_ops global_ops; | 100 | static struct ftrace_ops global_ops; |
101 | static struct ftrace_ops control_ops; | ||
98 | 102 | ||
99 | static void | 103 | static void |
100 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | 104 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); |
@@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) | |||
168 | } | 172 | } |
169 | #endif | 173 | #endif |
170 | 174 | ||
175 | static void control_ops_disable_all(struct ftrace_ops *ops) | ||
176 | { | ||
177 | int cpu; | ||
178 | |||
179 | for_each_possible_cpu(cpu) | ||
180 | *per_cpu_ptr(ops->disabled, cpu) = 1; | ||
181 | } | ||
182 | |||
183 | static int control_ops_alloc(struct ftrace_ops *ops) | ||
184 | { | ||
185 | int __percpu *disabled; | ||
186 | |||
187 | disabled = alloc_percpu(int); | ||
188 | if (!disabled) | ||
189 | return -ENOMEM; | ||
190 | |||
191 | ops->disabled = disabled; | ||
192 | control_ops_disable_all(ops); | ||
193 | return 0; | ||
194 | } | ||
195 | |||
196 | static void control_ops_free(struct ftrace_ops *ops) | ||
197 | { | ||
198 | free_percpu(ops->disabled); | ||
199 | } | ||
200 | |||
171 | static void update_global_ops(void) | 201 | static void update_global_ops(void) |
172 | { | 202 | { |
173 | ftrace_func_t func; | 203 | ftrace_func_t func; |
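control_ops_alloc() above gives each control ops its own per-cpu disabled flag, pre-set to 1 on every CPU so a freshly registered ops stays quiet until its owner enables it per CPU. The same per-cpu pattern in isolation, with illustrative my_* names (not ftrace types):

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

struct my_ops {
	int __percpu *disabled;
};

static int my_ops_init(struct my_ops *ops)
{
	int cpu;

	ops->disabled = alloc_percpu(int);	/* one int per possible CPU */
	if (!ops->disabled)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(ops->disabled, cpu) = 1;	/* start disabled */

	return 0;
}

/* Fast-path check: only the local CPU's copy is consulted; callers run
 * with preemption disabled, as the ftrace callback does. */
static int my_ops_local_disabled(struct my_ops *ops)
{
	return *this_cpu_ptr(ops->disabled);
}

static void my_ops_exit(struct my_ops *ops)
{
	free_percpu(ops->disabled);
}

The read side is what ftrace_ops_control_func() relies on further down through ftrace_function_local_disabled().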
@@ -219,7 +249,8 @@ static void update_ftrace_function(void) | |||
219 | #else | 249 | #else |
220 | __ftrace_trace_function = func; | 250 | __ftrace_trace_function = func; |
221 | #endif | 251 | #endif |
222 | ftrace_trace_function = ftrace_test_stop_func; | 252 | ftrace_trace_function = |
253 | (func == ftrace_stub) ? func : ftrace_test_stop_func; | ||
223 | #endif | 254 | #endif |
224 | } | 255 | } |
225 | 256 | ||
@@ -259,6 +290,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | |||
259 | return 0; | 290 | return 0; |
260 | } | 291 | } |
261 | 292 | ||
293 | static void add_ftrace_list_ops(struct ftrace_ops **list, | ||
294 | struct ftrace_ops *main_ops, | ||
295 | struct ftrace_ops *ops) | ||
296 | { | ||
297 | int first = *list == &ftrace_list_end; | ||
298 | add_ftrace_ops(list, ops); | ||
299 | if (first) | ||
300 | add_ftrace_ops(&ftrace_ops_list, main_ops); | ||
301 | } | ||
302 | |||
303 | static int remove_ftrace_list_ops(struct ftrace_ops **list, | ||
304 | struct ftrace_ops *main_ops, | ||
305 | struct ftrace_ops *ops) | ||
306 | { | ||
307 | int ret = remove_ftrace_ops(list, ops); | ||
308 | if (!ret && *list == &ftrace_list_end) | ||
309 | ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); | ||
310 | return ret; | ||
311 | } | ||
312 | |||
262 | static int __register_ftrace_function(struct ftrace_ops *ops) | 313 | static int __register_ftrace_function(struct ftrace_ops *ops) |
263 | { | 314 | { |
264 | if (ftrace_disabled) | 315 | if (ftrace_disabled) |
@@ -270,15 +321,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
270 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) | 321 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) |
271 | return -EBUSY; | 322 | return -EBUSY; |
272 | 323 | ||
324 | /* We don't support both control and global flags set. */ | ||
325 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) | ||
326 | return -EINVAL; | ||
327 | |||
273 | if (!core_kernel_data((unsigned long)ops)) | 328 | if (!core_kernel_data((unsigned long)ops)) |
274 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; | 329 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
275 | 330 | ||
276 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 331 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
277 | int first = ftrace_global_list == &ftrace_list_end; | 332 | add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); |
278 | add_ftrace_ops(&ftrace_global_list, ops); | ||
279 | ops->flags |= FTRACE_OPS_FL_ENABLED; | 333 | ops->flags |= FTRACE_OPS_FL_ENABLED; |
280 | if (first) | 334 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { |
281 | add_ftrace_ops(&ftrace_ops_list, &global_ops); | 335 | if (control_ops_alloc(ops)) |
336 | return -ENOMEM; | ||
337 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); | ||
282 | } else | 338 | } else |
283 | add_ftrace_ops(&ftrace_ops_list, ops); | 339 | add_ftrace_ops(&ftrace_ops_list, ops); |
284 | 340 | ||
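From a client's point of view the new FTRACE_OPS_FL_CONTROL flag is just another bit in ftrace_ops.flags; perf is the intended user later in this series. A sketch of what a registration might look like; the my_* names are illustrative and not taken from the patch:

#include <linux/ftrace.h>

static void my_callback(unsigned long ip, unsigned long parent_ip)
{
	/* runs for every traced function while locally enabled */
}

static struct ftrace_ops my_control_ops = {
	.func	= my_callback,
	.flags	= FTRACE_OPS_FL_CONTROL,
};

static int my_tracer_start(void)
{
	/* register_ftrace_function() lands in __register_ftrace_function()
	 * above: the per-cpu state is allocated and the ops is queued
	 * behind the shared control_ops trampoline. */
	return register_ftrace_function(&my_control_ops);
}

Because every CPU starts out disabled (see control_ops_disable_all()), the callback stays silent until the owner flips the per-cpu flag.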
@@ -302,11 +358,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
302 | return -EINVAL; | 358 | return -EINVAL; |
303 | 359 | ||
304 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 360 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
305 | ret = remove_ftrace_ops(&ftrace_global_list, ops); | 361 | ret = remove_ftrace_list_ops(&ftrace_global_list, |
306 | if (!ret && ftrace_global_list == &ftrace_list_end) | 362 | &global_ops, ops); |
307 | ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); | ||
308 | if (!ret) | 363 | if (!ret) |
309 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | 364 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
365 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { | ||
366 | ret = remove_ftrace_list_ops(&ftrace_control_list, | ||
367 | &control_ops, ops); | ||
368 | if (!ret) { | ||
369 | /* | ||
370 | * The ftrace_ops is now removed from the list, | ||
371 | * so there'll be no new users. We must ensure | ||
372 | * all current users are done before we free | ||
373 | * the control data. | ||
374 | */ | ||
375 | synchronize_sched(); | ||
376 | control_ops_free(ops); | ||
377 | } | ||
310 | } else | 378 | } else |
311 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); | 379 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); |
312 | 380 | ||
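The control branch of the unregister path cannot free the per-cpu data right away: another CPU may still be inside the callback, which walks the list with preemption disabled. synchronize_sched() returns only after every CPU has passed through a quiescent state, so no reader can still hold a reference. The pattern in general form, reusing the illustrative my_ops shape from the earlier sketch and a hypothetical unpublish step:

#include <linux/rcupdate.h>
#include <linux/percpu.h>

struct my_ops {
	int __percpu *disabled;
};

/* Placeholder for dropping the ops from the list walked by the callback;
 * in the patch this is remove_ftrace_list_ops(). */
static void my_ops_unpublish(struct my_ops *ops)
{
}

/* Unpublish, wait out in-flight preempt-disabled readers, then free. */
static void my_ops_teardown(struct my_ops *ops)
{
	my_ops_unpublish(ops);		/* no new reader can find it */
	synchronize_sched();		/* every current reader has finished */
	free_percpu(ops->disabled);	/* now safe to release the data */
}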
@@ -1119,6 +1187,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) | |||
1119 | call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); | 1187 | call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); |
1120 | } | 1188 | } |
1121 | 1189 | ||
1190 | void ftrace_free_filter(struct ftrace_ops *ops) | ||
1191 | { | ||
1192 | free_ftrace_hash(ops->filter_hash); | ||
1193 | free_ftrace_hash(ops->notrace_hash); | ||
1194 | } | ||
1195 | |||
1122 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | 1196 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) |
1123 | { | 1197 | { |
1124 | struct ftrace_hash *hash; | 1198 | struct ftrace_hash *hash; |
@@ -1129,7 +1203,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | |||
1129 | return NULL; | 1203 | return NULL; |
1130 | 1204 | ||
1131 | size = 1 << size_bits; | 1205 | size = 1 << size_bits; |
1132 | hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); | 1206 | hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); |
1133 | 1207 | ||
1134 | if (!hash->buckets) { | 1208 | if (!hash->buckets) { |
1135 | kfree(hash); | 1209 | kfree(hash); |
@@ -3146,8 +3220,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3146 | mutex_lock(&ftrace_regex_lock); | 3220 | mutex_lock(&ftrace_regex_lock); |
3147 | if (reset) | 3221 | if (reset) |
3148 | ftrace_filter_reset(hash); | 3222 | ftrace_filter_reset(hash); |
3149 | if (buf) | 3223 | if (buf && !ftrace_match_records(hash, buf, len)) { |
3150 | ftrace_match_records(hash, buf, len); | 3224 | ret = -EINVAL; |
3225 | goto out_regex_unlock; | ||
3226 | } | ||
3151 | 3227 | ||
3152 | mutex_lock(&ftrace_lock); | 3228 | mutex_lock(&ftrace_lock); |
3153 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 3229 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
@@ -3157,6 +3233,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3157 | 3233 | ||
3158 | mutex_unlock(&ftrace_lock); | 3234 | mutex_unlock(&ftrace_lock); |
3159 | 3235 | ||
3236 | out_regex_unlock: | ||
3160 | mutex_unlock(&ftrace_regex_lock); | 3237 | mutex_unlock(&ftrace_regex_lock); |
3161 | 3238 | ||
3162 | free_ftrace_hash(hash); | 3239 | free_ftrace_hash(hash); |
@@ -3173,10 +3250,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3173 | * Filters denote which functions should be enabled when tracing is enabled. | 3250 | * Filters denote which functions should be enabled when tracing is enabled. |
3174 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | 3251 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. |
3175 | */ | 3252 | */ |
3176 | void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, | 3253 | int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, |
3177 | int len, int reset) | 3254 | int len, int reset) |
3178 | { | 3255 | { |
3179 | ftrace_set_regex(ops, buf, len, reset, 1); | 3256 | return ftrace_set_regex(ops, buf, len, reset, 1); |
3180 | } | 3257 | } |
3181 | EXPORT_SYMBOL_GPL(ftrace_set_filter); | 3258 | EXPORT_SYMBOL_GPL(ftrace_set_filter); |
3182 | 3259 | ||
@@ -3191,10 +3268,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); | |||
3191 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | 3268 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled |
3192 | * for tracing. | 3269 | * for tracing. |
3193 | */ | 3270 | */ |
3194 | void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, | 3271 | int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, |
3195 | int len, int reset) | 3272 | int len, int reset) |
3196 | { | 3273 | { |
3197 | ftrace_set_regex(ops, buf, len, reset, 0); | 3274 | return ftrace_set_regex(ops, buf, len, reset, 0); |
3198 | } | 3275 | } |
3199 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | 3276 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); |
3200 | /** | 3277 | /** |
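ftrace_set_filter() and ftrace_set_notrace() now report failure instead of silently accepting a pattern that matches nothing (the new -EINVAL path in ftrace_set_regex() above). A sketch of a caller that checks the result; the my_* names and the "vfs_*" pattern are illustrative:

#include <linux/ftrace.h>

static void my_callback(unsigned long ip, unsigned long parent_ip)
{
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int my_tracer_init(void)
{
	unsigned char pattern[] = "vfs_*";
	int ret;

	/* reset=1 wipes any previous filter; a pattern that matches no
	 * function now fails here instead of quietly tracing everything. */
	ret = ftrace_set_filter(&my_ops, pattern, sizeof(pattern) - 1, 1);
	if (ret)
		return ret;

	return register_ftrace_function(&my_ops);
}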
@@ -3871,6 +3948,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
3871 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 3948 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
3872 | 3949 | ||
3873 | static void | 3950 | static void |
3951 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) | ||
3952 | { | ||
3953 | struct ftrace_ops *op; | ||
3954 | |||
3955 | if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) | ||
3956 | return; | ||
3957 | |||
3958 | /* | ||
3959 | * Some of the ops may be dynamically allocated, | ||
3960 | * they must be freed after a synchronize_sched(). | ||
3961 | */ | ||
3962 | preempt_disable_notrace(); | ||
3963 | trace_recursion_set(TRACE_CONTROL_BIT); | ||
3964 | op = rcu_dereference_raw(ftrace_control_list); | ||
3965 | while (op != &ftrace_list_end) { | ||
3966 | if (!ftrace_function_local_disabled(op) && | ||
3967 | ftrace_ops_test(op, ip)) | ||
3968 | op->func(ip, parent_ip); | ||
3969 | |||
3970 | op = rcu_dereference_raw(op->next); | ||
3971 | 	} | ||
3972 | trace_recursion_clear(TRACE_CONTROL_BIT); | ||
3973 | preempt_enable_notrace(); | ||
3974 | } | ||
3975 | |||
3976 | static struct ftrace_ops control_ops = { | ||
3977 | .func = ftrace_ops_control_func, | ||
3978 | }; | ||
3979 | |||
3980 | static void | ||
3874 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | 3981 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) |
3875 | { | 3982 | { |
3876 | struct ftrace_ops *op; | 3983 | struct ftrace_ops *op; |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5b7b5c1195b..cf8d11e91efd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -154,33 +154,10 @@ enum { | |||
154 | 154 | ||
155 | static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; | 155 | static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; |
156 | 156 | ||
157 | #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) | 157 | /* Used for individual buffers (after the counter) */ |
158 | 158 | #define RB_BUFFER_OFF (1 << 20) | |
159 | /** | ||
160 | * tracing_on - enable all tracing buffers | ||
161 | * | ||
162 | * This function enables all tracing buffers that may have been | ||
163 | * disabled with tracing_off. | ||
164 | */ | ||
165 | void tracing_on(void) | ||
166 | { | ||
167 | set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); | ||
168 | } | ||
169 | EXPORT_SYMBOL_GPL(tracing_on); | ||
170 | 159 | ||
171 | /** | 160 | #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) |
172 | * tracing_off - turn off all tracing buffers | ||
173 | * | ||
174 | * This function stops all tracing buffers from recording data. | ||
175 | * It does not disable any overhead the tracers themselves may | ||
176 | * be causing. This function simply causes all recording to | ||
177 | * the ring buffers to fail. | ||
178 | */ | ||
179 | void tracing_off(void) | ||
180 | { | ||
181 | clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); | ||
182 | } | ||
183 | EXPORT_SYMBOL_GPL(tracing_off); | ||
184 | 161 | ||
185 | /** | 162 | /** |
186 | * tracing_off_permanent - permanently disable ring buffers | 163 | * tracing_off_permanent - permanently disable ring buffers |
@@ -193,15 +170,6 @@ void tracing_off_permanent(void) | |||
193 | set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); | 170 | set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); |
194 | } | 171 | } |
195 | 172 | ||
196 | /** | ||
197 | * tracing_is_on - show state of ring buffers enabled | ||
198 | */ | ||
199 | int tracing_is_on(void) | ||
200 | { | ||
201 | return ring_buffer_flags == RB_BUFFERS_ON; | ||
202 | } | ||
203 | EXPORT_SYMBOL_GPL(tracing_is_on); | ||
204 | |||
205 | #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) | 173 | #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) |
206 | #define RB_ALIGNMENT 4U | 174 | #define RB_ALIGNMENT 4U |
207 | #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) | 175 | #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) |
@@ -2619,6 +2587,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer) | |||
2619 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable); | 2587 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable); |
2620 | 2588 | ||
2621 | /** | 2589 | /** |
2590 | * ring_buffer_record_off - stop all writes into the buffer | ||
2591 | * @buffer: The ring buffer to stop writes to. | ||
2592 | * | ||
2593 | * This prevents all writes to the buffer. Any attempt to write | ||
2594 | * to the buffer after this will fail and return NULL. | ||
2595 | * | ||
2596 | * This is different than ring_buffer_record_disable() as | ||
2597 | * it works like an on/off switch, where as the disable() verison | ||
2598 | * must be paired with a enable(). | ||
2599 | */ | ||
2600 | void ring_buffer_record_off(struct ring_buffer *buffer) | ||
2601 | { | ||
2602 | unsigned int rd; | ||
2603 | unsigned int new_rd; | ||
2604 | |||
2605 | do { | ||
2606 | rd = atomic_read(&buffer->record_disabled); | ||
2607 | new_rd = rd | RB_BUFFER_OFF; | ||
2608 | } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); | ||
2609 | } | ||
2610 | EXPORT_SYMBOL_GPL(ring_buffer_record_off); | ||
2611 | |||
2612 | /** | ||
2613 | * ring_buffer_record_on - restart writes into the buffer | ||
2614 | * @buffer: The ring buffer to start writes to. | ||
2615 | * | ||
2616 | * This enables all writes to the buffer that was disabled by | ||
2617 | * ring_buffer_record_off(). | ||
2618 | * | ||
2619 | * This is different than ring_buffer_record_enable() as | ||
2620 | * it works like an on/off switch, whereas the enable() version | ||
2621 | * must be paired with a disable(). | ||
2622 | */ | ||
2623 | void ring_buffer_record_on(struct ring_buffer *buffer) | ||
2624 | { | ||
2625 | unsigned int rd; | ||
2626 | unsigned int new_rd; | ||
2627 | |||
2628 | do { | ||
2629 | rd = atomic_read(&buffer->record_disabled); | ||
2630 | new_rd = rd & ~RB_BUFFER_OFF; | ||
2631 | } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); | ||
2632 | } | ||
2633 | EXPORT_SYMBOL_GPL(ring_buffer_record_on); | ||
2634 | |||
2635 | /** | ||
2636 | * ring_buffer_record_is_on - return true if the ring buffer can write | ||
2637 | * @buffer: The ring buffer to see if write is enabled | ||
2638 | * | ||
2639 | * Returns true if the ring buffer is in a state that it accepts writes. | ||
2640 | */ | ||
2641 | int ring_buffer_record_is_on(struct ring_buffer *buffer) | ||
2642 | { | ||
2643 | return !atomic_read(&buffer->record_disabled); | ||
2644 | } | ||
2645 | |||
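ring_buffer_record_off()/on() cannot simply atomic_set() the word: record_disabled also serves as the nesting counter for ring_buffer_record_disable(), and only the dedicated RB_BUFFER_OFF bit may change. Hence the compare-and-swap retry loop. The same pattern on an illustrative flag word (DEMO_OFF_BIT mirrors the role of RB_BUFFER_OFF):

#include <linux/atomic.h>

#define DEMO_OFF_BIT	(1 << 20)

/* Set DEMO_OFF_BIT without disturbing the counter in the low bits. */
static void demo_flag_off(atomic_t *word)
{
	unsigned int old, new;

	do {
		old = atomic_read(word);
		new = old | DEMO_OFF_BIT;
	} while (atomic_cmpxchg(word, old, new) != old);
}

Clearing the bit is the mirror image with old & ~DEMO_OFF_BIT, exactly what ring_buffer_record_on() does above.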
2646 | /** | ||
2622 | * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer | 2647 | * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer |
2623 | * @buffer: The ring buffer to stop writes to. | 2648 | * @buffer: The ring buffer to stop writes to. |
2624 | * @cpu: The CPU buffer to stop | 2649 | * @cpu: The CPU buffer to stop |
@@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
4039 | } | 4064 | } |
4040 | EXPORT_SYMBOL_GPL(ring_buffer_read_page); | 4065 | EXPORT_SYMBOL_GPL(ring_buffer_read_page); |
4041 | 4066 | ||
4042 | #ifdef CONFIG_TRACING | ||
4043 | static ssize_t | ||
4044 | rb_simple_read(struct file *filp, char __user *ubuf, | ||
4045 | size_t cnt, loff_t *ppos) | ||
4046 | { | ||
4047 | unsigned long *p = filp->private_data; | ||
4048 | char buf[64]; | ||
4049 | int r; | ||
4050 | |||
4051 | if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) | ||
4052 | r = sprintf(buf, "permanently disabled\n"); | ||
4053 | else | ||
4054 | r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); | ||
4055 | |||
4056 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
4057 | } | ||
4058 | |||
4059 | static ssize_t | ||
4060 | rb_simple_write(struct file *filp, const char __user *ubuf, | ||
4061 | size_t cnt, loff_t *ppos) | ||
4062 | { | ||
4063 | unsigned long *p = filp->private_data; | ||
4064 | unsigned long val; | ||
4065 | int ret; | ||
4066 | |||
4067 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | ||
4068 | if (ret) | ||
4069 | return ret; | ||
4070 | |||
4071 | if (val) | ||
4072 | set_bit(RB_BUFFERS_ON_BIT, p); | ||
4073 | else | ||
4074 | clear_bit(RB_BUFFERS_ON_BIT, p); | ||
4075 | |||
4076 | (*ppos)++; | ||
4077 | |||
4078 | return cnt; | ||
4079 | } | ||
4080 | |||
4081 | static const struct file_operations rb_simple_fops = { | ||
4082 | .open = tracing_open_generic, | ||
4083 | .read = rb_simple_read, | ||
4084 | .write = rb_simple_write, | ||
4085 | .llseek = default_llseek, | ||
4086 | }; | ||
4087 | |||
4088 | |||
4089 | static __init int rb_init_debugfs(void) | ||
4090 | { | ||
4091 | struct dentry *d_tracer; | ||
4092 | |||
4093 | d_tracer = tracing_init_dentry(); | ||
4094 | |||
4095 | trace_create_file("tracing_on", 0644, d_tracer, | ||
4096 | &ring_buffer_flags, &rb_simple_fops); | ||
4097 | |||
4098 | return 0; | ||
4099 | } | ||
4100 | |||
4101 | fs_initcall(rb_init_debugfs); | ||
4102 | #endif | ||
4103 | |||
4104 | #ifdef CONFIG_HOTPLUG_CPU | 4067 | #ifdef CONFIG_HOTPLUG_CPU |
4105 | static int rb_cpu_notify(struct notifier_block *self, | 4068 | static int rb_cpu_notify(struct notifier_block *self, |
4106 | unsigned long action, void *hcpu) | 4069 | unsigned long action, void *hcpu) |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f1bc5d2a00..ed7b5d1e12f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/ctype.h> | 36 | #include <linux/ctype.h> |
37 | #include <linux/init.h> | 37 | #include <linux/init.h> |
38 | #include <linux/poll.h> | 38 | #include <linux/poll.h> |
39 | #include <linux/nmi.h> | ||
39 | #include <linux/fs.h> | 40 | #include <linux/fs.h> |
40 | 41 | ||
41 | #include "trace.h" | 42 | #include "trace.h" |
@@ -352,6 +353,59 @@ static void wakeup_work_handler(struct work_struct *work) | |||
352 | static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); | 353 | static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); |
353 | 354 | ||
354 | /** | 355 | /** |
356 | * tracing_on - enable tracing buffers | ||
357 | * | ||
358 | * This function enables tracing buffers that may have been | ||
359 | * disabled with tracing_off. | ||
360 | */ | ||
361 | void tracing_on(void) | ||
362 | { | ||
363 | if (global_trace.buffer) | ||
364 | ring_buffer_record_on(global_trace.buffer); | ||
365 | /* | ||
366 | * This flag is only looked at when buffers haven't been | ||
367 | * allocated yet. We don't really care about the race | ||
368 | * between setting this flag and actually turning | ||
369 | * on the buffer. | ||
370 | */ | ||
371 | global_trace.buffer_disabled = 0; | ||
372 | } | ||
373 | EXPORT_SYMBOL_GPL(tracing_on); | ||
374 | |||
375 | /** | ||
376 | * tracing_off - turn off tracing buffers | ||
377 | * | ||
378 | * This function stops the tracing buffers from recording data. | ||
379 | * It does not disable any overhead the tracers themselves may | ||
380 | * be causing. This function simply causes all recording to | ||
381 | * the ring buffers to fail. | ||
382 | */ | ||
383 | void tracing_off(void) | ||
384 | { | ||
385 | if (global_trace.buffer) | ||
386 | ring_buffer_record_off(global_trace.buffer); | ||
387 | /* | ||
388 | * This flag is only looked at when buffers haven't been | ||
389 | * allocated yet. We don't really care about the race | ||
390 | * between setting this flag and actually turning | ||
391 | * off the buffer. | ||
392 | */ | ||
393 | global_trace.buffer_disabled = 1; | ||
394 | } | ||
395 | EXPORT_SYMBOL_GPL(tracing_off); | ||
396 | |||
397 | /** | ||
398 | * tracing_is_on - show state of ring buffers enabled | ||
399 | */ | ||
400 | int tracing_is_on(void) | ||
401 | { | ||
402 | if (global_trace.buffer) | ||
403 | return ring_buffer_record_is_on(global_trace.buffer); | ||
404 | return !global_trace.buffer_disabled; | ||
405 | } | ||
406 | EXPORT_SYMBOL_GPL(tracing_is_on); | ||
407 | |||
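tracing_on()/tracing_off()/tracing_is_on() now act on global_trace.buffer (falling back to buffer_disabled before the buffer exists) rather than on the flag word removed from ring_buffer.c. For a kernel developer the usual debugging trick of freezing the trace at the moment a problem is spotted still works; a sketch, with the failure check being illustrative:

#include <linux/kernel.h>	/* tracing_off(), tracing_is_on(), trace_printk() */

static void my_check(int looks_wrong)
{
	if (looks_wrong && tracing_is_on()) {
		trace_printk("state looks wrong, freezing trace\n");
		tracing_off();	/* stop recording so the evidence survives */
	}
}

The same switch is reachable from userspace through the relocated debugfs file, for example echo 0 > /sys/kernel/debug/tracing/tracing_on.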
408 | /** | ||
355 | * trace_wake_up - wake up tasks waiting for trace input | 409 | * trace_wake_up - wake up tasks waiting for trace input |
356 | * | 410 | * |
357 | * Schedules a delayed work to wake up any task that is blocked on the | 411 | * Schedules a delayed work to wake up any task that is blocked on the |
@@ -1644,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, | |||
1644 | int cpu_file = iter->cpu_file; | 1698 | int cpu_file = iter->cpu_file; |
1645 | u64 next_ts = 0, ts; | 1699 | u64 next_ts = 0, ts; |
1646 | int next_cpu = -1; | 1700 | int next_cpu = -1; |
1701 | int next_size = 0; | ||
1647 | int cpu; | 1702 | int cpu; |
1648 | 1703 | ||
1649 | /* | 1704 | /* |
@@ -1675,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, | |||
1675 | next_cpu = cpu; | 1730 | next_cpu = cpu; |
1676 | next_ts = ts; | 1731 | next_ts = ts; |
1677 | next_lost = lost_events; | 1732 | next_lost = lost_events; |
1733 | next_size = iter->ent_size; | ||
1678 | } | 1734 | } |
1679 | } | 1735 | } |
1680 | 1736 | ||
1737 | iter->ent_size = next_size; | ||
1738 | |||
1681 | if (ent_cpu) | 1739 | if (ent_cpu) |
1682 | *ent_cpu = next_cpu; | 1740 | *ent_cpu = next_cpu; |
1683 | 1741 | ||
@@ -2764,12 +2822,12 @@ static const char readme_msg[] = | |||
2764 | "tracing mini-HOWTO:\n\n" | 2822 | "tracing mini-HOWTO:\n\n" |
2765 | "# mount -t debugfs nodev /sys/kernel/debug\n\n" | 2823 | "# mount -t debugfs nodev /sys/kernel/debug\n\n" |
2766 | "# cat /sys/kernel/debug/tracing/available_tracers\n" | 2824 | "# cat /sys/kernel/debug/tracing/available_tracers\n" |
2767 | "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" | 2825 | "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" |
2768 | "# cat /sys/kernel/debug/tracing/current_tracer\n" | 2826 | "# cat /sys/kernel/debug/tracing/current_tracer\n" |
2769 | "nop\n" | 2827 | "nop\n" |
2770 | "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" | 2828 | "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" |
2771 | "# cat /sys/kernel/debug/tracing/current_tracer\n" | 2829 | "# cat /sys/kernel/debug/tracing/current_tracer\n" |
2772 | "sched_switch\n" | 2830 | "wakeup\n" |
2773 | "# cat /sys/kernel/debug/tracing/trace_options\n" | 2831 | "# cat /sys/kernel/debug/tracing/trace_options\n" |
2774 | "noprint-parent nosym-offset nosym-addr noverbose\n" | 2832 | "noprint-parent nosym-offset nosym-addr noverbose\n" |
2775 | "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" | 2833 | "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" |
@@ -4567,6 +4625,55 @@ static __init void create_trace_options_dir(void) | |||
4567 | create_trace_option_core_file(trace_options[i], i); | 4625 | create_trace_option_core_file(trace_options[i], i); |
4568 | } | 4626 | } |
4569 | 4627 | ||
4628 | static ssize_t | ||
4629 | rb_simple_read(struct file *filp, char __user *ubuf, | ||
4630 | size_t cnt, loff_t *ppos) | ||
4631 | { | ||
4632 | struct ring_buffer *buffer = filp->private_data; | ||
4633 | char buf[64]; | ||
4634 | int r; | ||
4635 | |||
4636 | if (buffer) | ||
4637 | r = ring_buffer_record_is_on(buffer); | ||
4638 | else | ||
4639 | r = 0; | ||
4640 | |||
4641 | r = sprintf(buf, "%d\n", r); | ||
4642 | |||
4643 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
4644 | } | ||
4645 | |||
4646 | static ssize_t | ||
4647 | rb_simple_write(struct file *filp, const char __user *ubuf, | ||
4648 | size_t cnt, loff_t *ppos) | ||
4649 | { | ||
4650 | struct ring_buffer *buffer = filp->private_data; | ||
4651 | unsigned long val; | ||
4652 | int ret; | ||
4653 | |||
4654 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | ||
4655 | if (ret) | ||
4656 | return ret; | ||
4657 | |||
4658 | if (buffer) { | ||
4659 | if (val) | ||
4660 | ring_buffer_record_on(buffer); | ||
4661 | else | ||
4662 | ring_buffer_record_off(buffer); | ||
4663 | } | ||
4664 | |||
4665 | (*ppos)++; | ||
4666 | |||
4667 | return cnt; | ||
4668 | } | ||
4669 | |||
4670 | static const struct file_operations rb_simple_fops = { | ||
4671 | .open = tracing_open_generic, | ||
4672 | .read = rb_simple_read, | ||
4673 | .write = rb_simple_write, | ||
4674 | .llseek = default_llseek, | ||
4675 | }; | ||
4676 | |||
4570 | static __init int tracer_init_debugfs(void) | 4677 | static __init int tracer_init_debugfs(void) |
4571 | { | 4678 | { |
4572 | struct dentry *d_tracer; | 4679 | struct dentry *d_tracer; |
@@ -4626,6 +4733,9 @@ static __init int tracer_init_debugfs(void) | |||
4626 | trace_create_file("trace_clock", 0644, d_tracer, NULL, | 4733 | trace_create_file("trace_clock", 0644, d_tracer, NULL, |
4627 | &trace_clock_fops); | 4734 | &trace_clock_fops); |
4628 | 4735 | ||
4736 | trace_create_file("tracing_on", 0644, d_tracer, | ||
4737 | global_trace.buffer, &rb_simple_fops); | ||
4738 | |||
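The rb_simple_read()/rb_simple_write() handlers from the previous hunk back this new tracing_on control file. Illustrative usage, in the same style as the mini-HOWTO earlier in this file (paths assume the standard debugfs mount):

    # cat /sys/kernel/debug/tracing/tracing_on
    1
    # echo 0 > /sys/kernel/debug/tracing/tracing_on
    # echo 1 > /sys/kernel/debug/tracing/tracing_on

Writes end up in ring_buffer_record_on()/ring_buffer_record_off(), so the file only toggles recording into the ring buffer; it does not start or stop the tracers themselves.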
4629 | #ifdef CONFIG_DYNAMIC_FTRACE | 4739 | #ifdef CONFIG_DYNAMIC_FTRACE |
4630 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 4740 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
4631 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 4741 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
@@ -4798,6 +4908,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4798 | if (ret != TRACE_TYPE_NO_CONSUME) | 4908 | if (ret != TRACE_TYPE_NO_CONSUME) |
4799 | trace_consume(&iter); | 4909 | trace_consume(&iter); |
4800 | } | 4910 | } |
4911 | touch_nmi_watchdog(); | ||
4801 | 4912 | ||
4802 | trace_printk_seq(&iter.seq); | 4913 | trace_printk_seq(&iter.seq); |
4803 | } | 4914 | } |
@@ -4863,6 +4974,8 @@ __init static int tracer_alloc_buffers(void) | |||
4863 | goto out_free_cpumask; | 4974 | goto out_free_cpumask; |
4864 | } | 4975 | } |
4865 | global_trace.entries = ring_buffer_size(global_trace.buffer); | 4976 | global_trace.entries = ring_buffer_size(global_trace.buffer); |
4977 | if (global_trace.buffer_disabled) | ||
4978 | tracing_off(); | ||
4866 | 4979 | ||
4867 | 4980 | ||
4868 | #ifdef CONFIG_TRACER_MAX_TRACE | 4981 | #ifdef CONFIG_TRACER_MAX_TRACE |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecbadad6d..95059f091a24 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -56,17 +56,23 @@ enum trace_type { | |||
56 | #define F_STRUCT(args...) args | 56 | #define F_STRUCT(args...) args |
57 | 57 | ||
58 | #undef FTRACE_ENTRY | 58 | #undef FTRACE_ENTRY |
59 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | 59 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ |
60 | struct struct_name { \ | 60 | struct struct_name { \ |
61 | struct trace_entry ent; \ | 61 | struct trace_entry ent; \ |
62 | tstruct \ | 62 | tstruct \ |
63 | } | 63 | } |
64 | 64 | ||
65 | #undef TP_ARGS | 65 | #undef TP_ARGS |
66 | #define TP_ARGS(args...) args | 66 | #define TP_ARGS(args...) args |
67 | 67 | ||
68 | #undef FTRACE_ENTRY_DUP | 68 | #undef FTRACE_ENTRY_DUP |
69 | #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) | 69 | #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) |
70 | |||
71 | #undef FTRACE_ENTRY_REG | ||
72 | #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ | ||
73 | filter, regfn) \ | ||
74 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | ||
75 | filter) | ||
70 | 76 | ||
71 | #include "trace_entries.h" | 77 | #include "trace_entries.h" |
72 | 78 | ||
@@ -148,6 +154,7 @@ struct trace_array { | |||
148 | struct ring_buffer *buffer; | 154 | struct ring_buffer *buffer; |
149 | unsigned long entries; | 155 | unsigned long entries; |
150 | int cpu; | 156 | int cpu; |
157 | int buffer_disabled; | ||
151 | cycle_t time_start; | 158 | cycle_t time_start; |
152 | struct task_struct *waiter; | 159 | struct task_struct *waiter; |
153 | struct trace_array_cpu *data[NR_CPUS]; | 160 | struct trace_array_cpu *data[NR_CPUS]; |
@@ -288,6 +295,8 @@ struct tracer { | |||
288 | /* for function tracing recursion */ | 295 | /* for function tracing recursion */ |
289 | #define TRACE_INTERNAL_BIT (1<<11) | 296 | #define TRACE_INTERNAL_BIT (1<<11) |
290 | #define TRACE_GLOBAL_BIT (1<<12) | 297 | #define TRACE_GLOBAL_BIT (1<<12) |
298 | #define TRACE_CONTROL_BIT (1<<13) | ||
299 | |||
291 | /* | 300 | /* |
292 | * Abuse of the trace_recursion. | 301 | * Abuse of the trace_recursion. |
293 | * As we need a way to maintain state if we are tracing the function | 302 | * As we need a way to maintain state if we are tracing the function |
@@ -589,6 +598,8 @@ static inline int ftrace_trace_task(struct task_struct *task) | |||
589 | static inline int ftrace_is_dead(void) { return 0; } | 598 | static inline int ftrace_is_dead(void) { return 0; } |
590 | #endif | 599 | #endif |
591 | 600 | ||
601 | int ftrace_event_is_function(struct ftrace_event_call *call); | ||
602 | |||
592 | /* | 603 | /* |
593 | * struct trace_parser - serves for reading the user input separated by spaces | 604 | * struct trace_parser - serves for reading the user input separated by spaces |
594 | * @cont: set if the input is not complete - no final space char was found | 605 | * @cont: set if the input is not complete - no final space char was found |
@@ -766,9 +777,7 @@ struct filter_pred { | |||
766 | u64 val; | 777 | u64 val; |
767 | struct regex regex; | 778 | struct regex regex; |
768 | unsigned short *ops; | 779 | unsigned short *ops; |
769 | #ifdef CONFIG_FTRACE_STARTUP_TEST | ||
770 | struct ftrace_event_field *field; | 780 | struct ftrace_event_field *field; |
771 | #endif | ||
772 | int offset; | 781 | int offset; |
773 | int not; | 782 | int not; |
774 | int op; | 783 | int op; |
@@ -818,12 +827,20 @@ extern const char *__start___trace_bprintk_fmt[]; | |||
818 | extern const char *__stop___trace_bprintk_fmt[]; | 827 | extern const char *__stop___trace_bprintk_fmt[]; |
819 | 828 | ||
820 | #undef FTRACE_ENTRY | 829 | #undef FTRACE_ENTRY |
821 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ | 830 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
822 | extern struct ftrace_event_call \ | 831 | extern struct ftrace_event_call \ |
823 | __attribute__((__aligned__(4))) event_##call; | 832 | __attribute__((__aligned__(4))) event_##call; |
824 | #undef FTRACE_ENTRY_DUP | 833 | #undef FTRACE_ENTRY_DUP |
825 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ | 834 | #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ |
826 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 835 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ |
836 | filter) | ||
827 | #include "trace_entries.h" | 837 | #include "trace_entries.h" |
828 | 838 | ||
839 | #ifdef CONFIG_FUNCTION_TRACER | ||
840 | int perf_ftrace_event_register(struct ftrace_event_call *call, | ||
841 | enum trace_reg type, void *data); | ||
842 | #else | ||
843 | #define perf_ftrace_event_register NULL | ||
844 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
845 | |||
829 | #endif /* _LINUX_KERNEL_TRACE_H */ | 846 | #endif /* _LINUX_KERNEL_TRACE_H */ |
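Even with the extra filter and regfn arguments, FTRACE_ENTRY()/FTRACE_ENTRY_REG() in this header still expand to a plain struct as before. Written out by hand for the function event declared in trace_entries.h, the generated type is (a sketch of the expansion, not new code in the patch):

    struct ftrace_entry {
            struct trace_entry      ent;
            unsigned long           ip;
            unsigned long           parent_ip;
    };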
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..4108e1250ca2 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -55,7 +55,7 @@ | |||
55 | /* | 55 | /* |
56 | * Function trace entry - function address and parent function address: | 56 | * Function trace entry - function address and parent function address: |
57 | */ | 57 | */ |
58 | FTRACE_ENTRY(function, ftrace_entry, | 58 | FTRACE_ENTRY_REG(function, ftrace_entry, |
59 | 59 | ||
60 | TRACE_FN, | 60 | TRACE_FN, |
61 | 61 | ||
@@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry, | |||
64 | __field( unsigned long, parent_ip ) | 64 | __field( unsigned long, parent_ip ) |
65 | ), | 65 | ), |
66 | 66 | ||
67 | F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) | 67 | F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), |
68 | |||
69 | FILTER_TRACE_FN, | ||
70 | |||
71 | perf_ftrace_event_register | ||
68 | ); | 72 | ); |
69 | 73 | ||
70 | /* Function call entry */ | 74 | /* Function call entry */ |
@@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, | |||
78 | __field_desc( int, graph_ent, depth ) | 82 | __field_desc( int, graph_ent, depth ) |
79 | ), | 83 | ), |
80 | 84 | ||
81 | F_printk("--> %lx (%d)", __entry->func, __entry->depth) | 85 | F_printk("--> %lx (%d)", __entry->func, __entry->depth), |
86 | |||
87 | FILTER_OTHER | ||
82 | ); | 88 | ); |
83 | 89 | ||
84 | /* Function return entry */ | 90 | /* Function return entry */ |
@@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, | |||
98 | F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", | 104 | F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", |
99 | __entry->func, __entry->depth, | 105 | __entry->func, __entry->depth, |
100 | __entry->calltime, __entry->rettime, | 106 | __entry->calltime, __entry->rettime, |
101 | __entry->depth) | 107 | __entry->depth), |
108 | |||
109 | FILTER_OTHER | ||
102 | ); | 110 | ); |
103 | 111 | ||
104 | /* | 112 | /* |
@@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, | |||
127 | F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", | 135 | F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", |
128 | __entry->prev_pid, __entry->prev_prio, __entry->prev_state, | 136 | __entry->prev_pid, __entry->prev_prio, __entry->prev_state, |
129 | __entry->next_pid, __entry->next_prio, __entry->next_state, | 137 | __entry->next_pid, __entry->next_prio, __entry->next_state, |
130 | __entry->next_cpu | 138 | __entry->next_cpu), |
131 | ) | 139 | |
140 | FILTER_OTHER | ||
132 | ); | 141 | ); |
133 | 142 | ||
134 | /* | 143 | /* |
@@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, | |||
146 | F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", | 155 | F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", |
147 | __entry->prev_pid, __entry->prev_prio, __entry->prev_state, | 156 | __entry->prev_pid, __entry->prev_prio, __entry->prev_state, |
148 | __entry->next_pid, __entry->next_prio, __entry->next_state, | 157 | __entry->next_pid, __entry->next_prio, __entry->next_state, |
149 | __entry->next_cpu | 158 | __entry->next_cpu), |
150 | ) | 159 | |
160 | FILTER_OTHER | ||
151 | ); | 161 | ); |
152 | 162 | ||
153 | /* | 163 | /* |
@@ -156,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, | |||
156 | 166 | ||
157 | #define FTRACE_STACK_ENTRIES 8 | 167 | #define FTRACE_STACK_ENTRIES 8 |
158 | 168 | ||
169 | #ifndef CONFIG_64BIT | ||
170 | # define IP_FMT "%08lx" | ||
171 | #else | ||
172 | # define IP_FMT "%016lx" | ||
173 | #endif | ||
174 | |||
159 | FTRACE_ENTRY(kernel_stack, stack_entry, | 175 | FTRACE_ENTRY(kernel_stack, stack_entry, |
160 | 176 | ||
161 | TRACE_STACK, | 177 | TRACE_STACK, |
@@ -165,11 +181,14 @@ FTRACE_ENTRY(kernel_stack, stack_entry, | |||
165 | __dynamic_array(unsigned long, caller ) | 181 | __dynamic_array(unsigned long, caller ) |
166 | ), | 182 | ), |
167 | 183 | ||
168 | F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" | 184 | F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" |
169 | "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", | 185 | "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" |
186 | "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", | ||
170 | __entry->caller[0], __entry->caller[1], __entry->caller[2], | 187 | __entry->caller[0], __entry->caller[1], __entry->caller[2], |
171 | __entry->caller[3], __entry->caller[4], __entry->caller[5], | 188 | __entry->caller[3], __entry->caller[4], __entry->caller[5], |
172 | __entry->caller[6], __entry->caller[7]) | 189 | __entry->caller[6], __entry->caller[7]), |
190 | |||
191 | FILTER_OTHER | ||
173 | ); | 192 | ); |
174 | 193 | ||
175 | FTRACE_ENTRY(user_stack, userstack_entry, | 194 | FTRACE_ENTRY(user_stack, userstack_entry, |
@@ -181,11 +200,14 @@ FTRACE_ENTRY(user_stack, userstack_entry, | |||
181 | __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) | 200 | __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) |
182 | ), | 201 | ), |
183 | 202 | ||
184 | F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" | 203 | F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" |
185 | "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", | 204 | "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" |
205 | "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", | ||
186 | __entry->caller[0], __entry->caller[1], __entry->caller[2], | 206 | __entry->caller[0], __entry->caller[1], __entry->caller[2], |
187 | __entry->caller[3], __entry->caller[4], __entry->caller[5], | 207 | __entry->caller[3], __entry->caller[4], __entry->caller[5], |
188 | __entry->caller[6], __entry->caller[7]) | 208 | __entry->caller[6], __entry->caller[7]), |
209 | |||
210 | FILTER_OTHER | ||
189 | ); | 211 | ); |
190 | 212 | ||
191 | /* | 213 | /* |
@@ -202,7 +224,9 @@ FTRACE_ENTRY(bprint, bprint_entry, | |||
202 | ), | 224 | ), |
203 | 225 | ||
204 | F_printk("%08lx fmt:%p", | 226 | F_printk("%08lx fmt:%p", |
205 | __entry->ip, __entry->fmt) | 227 | __entry->ip, __entry->fmt), |
228 | |||
229 | FILTER_OTHER | ||
206 | ); | 230 | ); |
207 | 231 | ||
208 | FTRACE_ENTRY(print, print_entry, | 232 | FTRACE_ENTRY(print, print_entry, |
@@ -215,7 +239,9 @@ FTRACE_ENTRY(print, print_entry, | |||
215 | ), | 239 | ), |
216 | 240 | ||
217 | F_printk("%08lx %s", | 241 | F_printk("%08lx %s", |
218 | __entry->ip, __entry->buf) | 242 | __entry->ip, __entry->buf), |
243 | |||
244 | FILTER_OTHER | ||
219 | ); | 245 | ); |
220 | 246 | ||
221 | FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, | 247 | FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, |
@@ -234,7 +260,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, | |||
234 | 260 | ||
235 | F_printk("%lx %lx %lx %d %x %x", | 261 | F_printk("%lx %lx %lx %d %x %x", |
236 | (unsigned long)__entry->phys, __entry->value, __entry->pc, | 262 | (unsigned long)__entry->phys, __entry->value, __entry->pc, |
237 | __entry->map_id, __entry->opcode, __entry->width) | 263 | __entry->map_id, __entry->opcode, __entry->width), |
264 | |||
265 | FILTER_OTHER | ||
238 | ); | 266 | ); |
239 | 267 | ||
240 | FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, | 268 | FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, |
@@ -252,7 +280,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, | |||
252 | 280 | ||
253 | F_printk("%lx %lx %lx %d %x", | 281 | F_printk("%lx %lx %lx %d %x", |
254 | (unsigned long)__entry->phys, __entry->virt, __entry->len, | 282 | (unsigned long)__entry->phys, __entry->virt, __entry->len, |
255 | __entry->map_id, __entry->opcode) | 283 | __entry->map_id, __entry->opcode), |
284 | |||
285 | FILTER_OTHER | ||
256 | ); | 286 | ); |
257 | 287 | ||
258 | 288 | ||
@@ -272,6 +302,8 @@ FTRACE_ENTRY(branch, trace_branch, | |||
272 | 302 | ||
273 | F_printk("%u:%s:%s (%u)", | 303 | F_printk("%u:%s:%s (%u)", |
274 | __entry->line, | 304 | __entry->line, |
275 | __entry->func, __entry->file, __entry->correct) | 305 | __entry->func, __entry->file, __entry->correct), |
306 | |||
307 | FILTER_OTHER | ||
276 | ); | 308 | ); |
277 | 309 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d5e6d5..fee3752ae8f6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -24,6 +24,11 @@ static int total_ref_count; | |||
24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | 24 | static int perf_trace_event_perm(struct ftrace_event_call *tp_event, |
25 | struct perf_event *p_event) | 25 | struct perf_event *p_event) |
26 | { | 26 | { |
27 | /* The ftrace function trace is allowed only for root. */ | ||
28 | if (ftrace_event_is_function(tp_event) && | ||
29 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
30 | return -EPERM; | ||
31 | |||
27 | /* No tracing, just counting, so no obvious leak */ | 32 | /* No tracing, just counting, so no obvious leak */ |
28 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) | 33 | if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) |
29 | return 0; | 34 | return 0; |
@@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | |||
44 | return 0; | 49 | return 0; |
45 | } | 50 | } |
46 | 51 | ||
47 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 52 | static int perf_trace_event_reg(struct ftrace_event_call *tp_event, |
48 | struct perf_event *p_event) | 53 | struct perf_event *p_event) |
49 | { | 54 | { |
50 | struct hlist_head __percpu *list; | 55 | struct hlist_head __percpu *list; |
51 | int ret; | 56 | int ret = -ENOMEM; |
52 | int cpu; | 57 | int cpu; |
53 | 58 | ||
54 | ret = perf_trace_event_perm(tp_event, p_event); | ||
55 | if (ret) | ||
56 | return ret; | ||
57 | |||
58 | p_event->tp_event = tp_event; | 59 | p_event->tp_event = tp_event; |
59 | if (tp_event->perf_refcount++ > 0) | 60 | if (tp_event->perf_refcount++ > 0) |
60 | return 0; | 61 | return 0; |
61 | 62 | ||
62 | ret = -ENOMEM; | ||
63 | |||
64 | list = alloc_percpu(struct hlist_head); | 63 | list = alloc_percpu(struct hlist_head); |
65 | if (!list) | 64 | if (!list) |
66 | goto fail; | 65 | goto fail; |
@@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
83 | } | 82 | } |
84 | } | 83 | } |
85 | 84 | ||
86 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); | 85 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); |
87 | if (ret) | 86 | if (ret) |
88 | goto fail; | 87 | goto fail; |
89 | 88 | ||
@@ -108,6 +107,69 @@ fail: | |||
108 | return ret; | 107 | return ret; |
109 | } | 108 | } |
110 | 109 | ||
110 | static void perf_trace_event_unreg(struct perf_event *p_event) | ||
111 | { | ||
112 | struct ftrace_event_call *tp_event = p_event->tp_event; | ||
113 | int i; | ||
114 | |||
115 | if (--tp_event->perf_refcount > 0) | ||
116 | goto out; | ||
117 | |||
118 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); | ||
119 | |||
120 | /* | ||
121 | * Ensure our callback won't be called anymore. The buffers | ||
122 | * will be freed after that. | ||
123 | */ | ||
124 | tracepoint_synchronize_unregister(); | ||
125 | |||
126 | free_percpu(tp_event->perf_events); | ||
127 | tp_event->perf_events = NULL; | ||
128 | |||
129 | if (!--total_ref_count) { | ||
130 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { | ||
131 | free_percpu(perf_trace_buf[i]); | ||
132 | perf_trace_buf[i] = NULL; | ||
133 | } | ||
134 | } | ||
135 | out: | ||
136 | module_put(tp_event->mod); | ||
137 | } | ||
138 | |||
139 | static int perf_trace_event_open(struct perf_event *p_event) | ||
140 | { | ||
141 | struct ftrace_event_call *tp_event = p_event->tp_event; | ||
142 | return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); | ||
143 | } | ||
144 | |||
145 | static void perf_trace_event_close(struct perf_event *p_event) | ||
146 | { | ||
147 | struct ftrace_event_call *tp_event = p_event->tp_event; | ||
148 | tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); | ||
149 | } | ||
150 | |||
151 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | ||
152 | struct perf_event *p_event) | ||
153 | { | ||
154 | int ret; | ||
155 | |||
156 | ret = perf_trace_event_perm(tp_event, p_event); | ||
157 | if (ret) | ||
158 | return ret; | ||
159 | |||
160 | ret = perf_trace_event_reg(tp_event, p_event); | ||
161 | if (ret) | ||
162 | return ret; | ||
163 | |||
164 | ret = perf_trace_event_open(p_event); | ||
165 | if (ret) { | ||
166 | perf_trace_event_unreg(p_event); | ||
167 | return ret; | ||
168 | } | ||
169 | |||
170 | return 0; | ||
171 | } | ||
172 | |||
111 | int perf_trace_init(struct perf_event *p_event) | 173 | int perf_trace_init(struct perf_event *p_event) |
112 | { | 174 | { |
113 | struct ftrace_event_call *tp_event; | 175 | struct ftrace_event_call *tp_event; |
@@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event) | |||
130 | return ret; | 192 | return ret; |
131 | } | 193 | } |
132 | 194 | ||
195 | void perf_trace_destroy(struct perf_event *p_event) | ||
196 | { | ||
197 | mutex_lock(&event_mutex); | ||
198 | perf_trace_event_close(p_event); | ||
199 | perf_trace_event_unreg(p_event); | ||
200 | mutex_unlock(&event_mutex); | ||
201 | } | ||
202 | |||
133 | int perf_trace_add(struct perf_event *p_event, int flags) | 203 | int perf_trace_add(struct perf_event *p_event, int flags) |
134 | { | 204 | { |
135 | struct ftrace_event_call *tp_event = p_event->tp_event; | 205 | struct ftrace_event_call *tp_event = p_event->tp_event; |
@@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags) | |||
146 | list = this_cpu_ptr(pcpu_list); | 216 | list = this_cpu_ptr(pcpu_list); |
147 | hlist_add_head_rcu(&p_event->hlist_entry, list); | 217 | hlist_add_head_rcu(&p_event->hlist_entry, list); |
148 | 218 | ||
149 | return 0; | 219 | return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); |
150 | } | 220 | } |
151 | 221 | ||
152 | void perf_trace_del(struct perf_event *p_event, int flags) | 222 | void perf_trace_del(struct perf_event *p_event, int flags) |
153 | { | 223 | { |
154 | hlist_del_rcu(&p_event->hlist_entry); | ||
155 | } | ||
156 | |||
157 | void perf_trace_destroy(struct perf_event *p_event) | ||
158 | { | ||
159 | struct ftrace_event_call *tp_event = p_event->tp_event; | 224 | struct ftrace_event_call *tp_event = p_event->tp_event; |
160 | int i; | 225 | hlist_del_rcu(&p_event->hlist_entry); |
161 | 226 | tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); | |
162 | mutex_lock(&event_mutex); | ||
163 | if (--tp_event->perf_refcount > 0) | ||
164 | goto out; | ||
165 | |||
166 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); | ||
167 | |||
168 | /* | ||
169 | * Ensure our callback won't be called anymore. The buffers | ||
170 | * will be freed after that. | ||
171 | */ | ||
172 | tracepoint_synchronize_unregister(); | ||
173 | |||
174 | free_percpu(tp_event->perf_events); | ||
175 | tp_event->perf_events = NULL; | ||
176 | |||
177 | if (!--total_ref_count) { | ||
178 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { | ||
179 | free_percpu(perf_trace_buf[i]); | ||
180 | perf_trace_buf[i] = NULL; | ||
181 | } | ||
182 | } | ||
183 | out: | ||
184 | module_put(tp_event->mod); | ||
185 | mutex_unlock(&event_mutex); | ||
186 | } | 227 | } |
187 | 228 | ||
188 | __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, | 229 | __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, |
@@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, | |||
214 | return raw_data; | 255 | return raw_data; |
215 | } | 256 | } |
216 | EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); | 257 | EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); |
258 | |||
259 | #ifdef CONFIG_FUNCTION_TRACER | ||
260 | static void | ||
261 | perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) | ||
262 | { | ||
263 | struct ftrace_entry *entry; | ||
264 | struct hlist_head *head; | ||
265 | struct pt_regs regs; | ||
266 | int rctx; | ||
267 | |||
268 | #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ | ||
269 | sizeof(u64)) - sizeof(u32)) | ||
270 | |||
271 | BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); | ||
272 | |||
273 | perf_fetch_caller_regs(®s); | ||
274 | |||
275 | entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); | ||
276 | if (!entry) | ||
277 | return; | ||
278 | |||
279 | entry->ip = ip; | ||
280 | entry->parent_ip = parent_ip; | ||
281 | |||
282 | head = this_cpu_ptr(event_function.perf_events); | ||
283 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, | ||
284 | 1, ®s, head); | ||
285 | |||
286 | #undef ENTRY_SIZE | ||
287 | } | ||
288 | |||
289 | static int perf_ftrace_function_register(struct perf_event *event) | ||
290 | { | ||
291 | struct ftrace_ops *ops = &event->ftrace_ops; | ||
292 | |||
293 | ops->flags |= FTRACE_OPS_FL_CONTROL; | ||
294 | ops->func = perf_ftrace_function_call; | ||
295 | return register_ftrace_function(ops); | ||
296 | } | ||
297 | |||
298 | static int perf_ftrace_function_unregister(struct perf_event *event) | ||
299 | { | ||
300 | struct ftrace_ops *ops = &event->ftrace_ops; | ||
301 | int ret = unregister_ftrace_function(ops); | ||
302 | ftrace_free_filter(ops); | ||
303 | return ret; | ||
304 | } | ||
305 | |||
306 | static void perf_ftrace_function_enable(struct perf_event *event) | ||
307 | { | ||
308 | ftrace_function_local_enable(&event->ftrace_ops); | ||
309 | } | ||
310 | |||
311 | static void perf_ftrace_function_disable(struct perf_event *event) | ||
312 | { | ||
313 | ftrace_function_local_disable(&event->ftrace_ops); | ||
314 | } | ||
315 | |||
316 | int perf_ftrace_event_register(struct ftrace_event_call *call, | ||
317 | enum trace_reg type, void *data) | ||
318 | { | ||
319 | switch (type) { | ||
320 | case TRACE_REG_REGISTER: | ||
321 | case TRACE_REG_UNREGISTER: | ||
322 | break; | ||
323 | case TRACE_REG_PERF_REGISTER: | ||
324 | case TRACE_REG_PERF_UNREGISTER: | ||
325 | return 0; | ||
326 | case TRACE_REG_PERF_OPEN: | ||
327 | return perf_ftrace_function_register(data); | ||
328 | case TRACE_REG_PERF_CLOSE: | ||
329 | return perf_ftrace_function_unregister(data); | ||
330 | case TRACE_REG_PERF_ADD: | ||
331 | perf_ftrace_function_enable(data); | ||
332 | return 0; | ||
333 | case TRACE_REG_PERF_DEL: | ||
334 | perf_ftrace_function_disable(data); | ||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | return -EINVAL; | ||
339 | } | ||
340 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
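Taken together, the class->reg() callback is now driven at every stage of a perf event's life, which is what lets the function tracer hook into perf without a tracepoint. A sketch of the sequence for a perf ftrace:function event, derived from the code above (perf_trace_init/add/del/destroy are the entry points called by the perf core):

    /*
     * perf_trace_init()
     *   perf_trace_event_init()
     *     perf_trace_event_reg()    reg(TRACE_REG_PERF_REGISTER)    -> 0 (no-op)
     *     perf_trace_event_open()   reg(TRACE_REG_PERF_OPEN)        -> register_ftrace_function()
     * perf_trace_add()              reg(TRACE_REG_PERF_ADD)         -> ftrace_function_local_enable()
     * perf_trace_del()              reg(TRACE_REG_PERF_DEL)         -> ftrace_function_local_disable()
     * perf_trace_destroy()
     *   perf_trace_event_close()    reg(TRACE_REG_PERF_CLOSE)       -> unregister_ftrace_function()
     *   perf_trace_event_unreg()    reg(TRACE_REG_PERF_UNREGISTER)  -> 0 (no-op)
     */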
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f934ec..079a93ae8a9d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call) | |||
147 | } | 147 | } |
148 | EXPORT_SYMBOL_GPL(trace_event_raw_init); | 148 | EXPORT_SYMBOL_GPL(trace_event_raw_init); |
149 | 149 | ||
150 | int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) | 150 | int ftrace_event_reg(struct ftrace_event_call *call, |
151 | enum trace_reg type, void *data) | ||
151 | { | 152 | { |
152 | switch (type) { | 153 | switch (type) { |
153 | case TRACE_REG_REGISTER: | 154 | case TRACE_REG_REGISTER: |
@@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) | |||
170 | call->class->perf_probe, | 171 | call->class->perf_probe, |
171 | call); | 172 | call); |
172 | return 0; | 173 | return 0; |
174 | case TRACE_REG_PERF_OPEN: | ||
175 | case TRACE_REG_PERF_CLOSE: | ||
176 | case TRACE_REG_PERF_ADD: | ||
177 | case TRACE_REG_PERF_DEL: | ||
178 | return 0; | ||
173 | #endif | 179 | #endif |
174 | } | 180 | } |
175 | return 0; | 181 | return 0; |
@@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, | |||
209 | tracing_stop_cmdline_record(); | 215 | tracing_stop_cmdline_record(); |
210 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; | 216 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; |
211 | } | 217 | } |
212 | call->class->reg(call, TRACE_REG_UNREGISTER); | 218 | call->class->reg(call, TRACE_REG_UNREGISTER, NULL); |
213 | } | 219 | } |
214 | break; | 220 | break; |
215 | case 1: | 221 | case 1: |
@@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, | |||
218 | tracing_start_cmdline_record(); | 224 | tracing_start_cmdline_record(); |
219 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; | 225 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; |
220 | } | 226 | } |
221 | ret = call->class->reg(call, TRACE_REG_REGISTER); | 227 | ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); |
222 | if (ret) { | 228 | if (ret) { |
223 | tracing_stop_cmdline_record(); | 229 | tracing_stop_cmdline_record(); |
224 | pr_info("event trace: Could not enable event " | 230 | pr_info("event trace: Could not enable event " |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 24aee7127451..431dba8b7542 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -81,6 +81,7 @@ enum { | |||
81 | FILT_ERR_TOO_MANY_PREDS, | 81 | FILT_ERR_TOO_MANY_PREDS, |
82 | FILT_ERR_MISSING_FIELD, | 82 | FILT_ERR_MISSING_FIELD, |
83 | FILT_ERR_INVALID_FILTER, | 83 | FILT_ERR_INVALID_FILTER, |
84 | FILT_ERR_IP_FIELD_ONLY, | ||
84 | }; | 85 | }; |
85 | 86 | ||
86 | static char *err_text[] = { | 87 | static char *err_text[] = { |
@@ -96,6 +97,7 @@ static char *err_text[] = { | |||
96 | "Too many terms in predicate expression", | 97 | "Too many terms in predicate expression", |
97 | "Missing field name and/or value", | 98 | "Missing field name and/or value", |
98 | "Meaningless filter expression", | 99 | "Meaningless filter expression", |
100 | "Only 'ip' field is supported for function trace", | ||
99 | }; | 101 | }; |
100 | 102 | ||
101 | struct opstack_op { | 103 | struct opstack_op { |
@@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name) | |||
685 | 687 | ||
686 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) | 688 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
687 | { | 689 | { |
688 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); | 690 | stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); |
689 | if (!stack->preds) | 691 | if (!stack->preds) |
690 | return -ENOMEM; | 692 | return -ENOMEM; |
691 | stack->index = n_preds; | 693 | stack->index = n_preds; |
@@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) | |||
826 | if (filter->preds) | 828 | if (filter->preds) |
827 | __free_preds(filter); | 829 | __free_preds(filter); |
828 | 830 | ||
829 | filter->preds = | 831 | filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); |
830 | kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); | ||
831 | 832 | ||
832 | if (!filter->preds) | 833 | if (!filter->preds) |
833 | return -ENOMEM; | 834 | return -ENOMEM; |
@@ -900,6 +901,11 @@ int filter_assign_type(const char *type) | |||
900 | return FILTER_OTHER; | 901 | return FILTER_OTHER; |
901 | } | 902 | } |
902 | 903 | ||
904 | static bool is_function_field(struct ftrace_event_field *field) | ||
905 | { | ||
906 | return field->filter_type == FILTER_TRACE_FN; | ||
907 | } | ||
908 | |||
903 | static bool is_string_field(struct ftrace_event_field *field) | 909 | static bool is_string_field(struct ftrace_event_field *field) |
904 | { | 910 | { |
905 | return field->filter_type == FILTER_DYN_STRING || | 911 | return field->filter_type == FILTER_DYN_STRING || |
@@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps, | |||
987 | fn = filter_pred_strloc; | 993 | fn = filter_pred_strloc; |
988 | else | 994 | else |
989 | fn = filter_pred_pchar; | 995 | fn = filter_pred_pchar; |
996 | } else if (is_function_field(field)) { | ||
997 | if (strcmp(field->name, "ip")) { | ||
998 | parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); | ||
999 | return -EINVAL; | ||
1000 | } | ||
990 | } else { | 1001 | } else { |
991 | if (field->is_signed) | 1002 | if (field->is_signed) |
992 | ret = strict_strtoll(pred->regex.pattern, 0, &val); | 1003 | ret = strict_strtoll(pred->regex.pattern, 0, &val); |
@@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, | |||
1334 | 1345 | ||
1335 | strcpy(pred.regex.pattern, operand2); | 1346 | strcpy(pred.regex.pattern, operand2); |
1336 | pred.regex.len = strlen(pred.regex.pattern); | 1347 | pred.regex.len = strlen(pred.regex.pattern); |
1337 | |||
1338 | #ifdef CONFIG_FTRACE_STARTUP_TEST | ||
1339 | pred.field = field; | 1348 | pred.field = field; |
1340 | #endif | ||
1341 | return init_pred(ps, field, &pred) ? NULL : &pred; | 1349 | return init_pred(ps, field, &pred) ? NULL : &pred; |
1342 | } | 1350 | } |
1343 | 1351 | ||
@@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | |||
1486 | children = count_leafs(preds, &preds[root->left]); | 1494 | children = count_leafs(preds, &preds[root->left]); |
1487 | children += count_leafs(preds, &preds[root->right]); | 1495 | children += count_leafs(preds, &preds[root->right]); |
1488 | 1496 | ||
1489 | root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); | 1497 | root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); |
1490 | if (!root->ops) | 1498 | if (!root->ops) |
1491 | return -ENOMEM; | 1499 | return -ENOMEM; |
1492 | 1500 | ||
@@ -1950,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event) | |||
1950 | __free_filter(filter); | 1958 | __free_filter(filter); |
1951 | } | 1959 | } |
1952 | 1960 | ||
1961 | struct function_filter_data { | ||
1962 | struct ftrace_ops *ops; | ||
1963 | int first_filter; | ||
1964 | int first_notrace; | ||
1965 | }; | ||
1966 | |||
1967 | #ifdef CONFIG_FUNCTION_TRACER | ||
1968 | static char ** | ||
1969 | ftrace_function_filter_re(char *buf, int len, int *count) | ||
1970 | { | ||
1971 | char *str, *sep, **re; | ||
1972 | |||
1973 | str = kstrndup(buf, len, GFP_KERNEL); | ||
1974 | if (!str) | ||
1975 | return NULL; | ||
1976 | |||
1977 | /* | ||
1978 | * The argv_split function takes white space | ||
1979 | * as a separator, so convert ',' into spaces. | ||
1980 | */ | ||
1981 | while ((sep = strchr(str, ','))) | ||
1982 | *sep = ' '; | ||
1983 | |||
1984 | re = argv_split(GFP_KERNEL, str, count); | ||
1985 | kfree(str); | ||
1986 | return re; | ||
1987 | } | ||
1988 | |||
1989 | static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, | ||
1990 | int reset, char *re, int len) | ||
1991 | { | ||
1992 | int ret; | ||
1993 | |||
1994 | if (filter) | ||
1995 | ret = ftrace_set_filter(ops, re, len, reset); | ||
1996 | else | ||
1997 | ret = ftrace_set_notrace(ops, re, len, reset); | ||
1998 | |||
1999 | return ret; | ||
2000 | } | ||
2001 | |||
2002 | static int __ftrace_function_set_filter(int filter, char *buf, int len, | ||
2003 | struct function_filter_data *data) | ||
2004 | { | ||
2005 | int i, re_cnt, ret; | ||
2006 | int *reset; | ||
2007 | char **re; | ||
2008 | |||
2009 | reset = filter ? &data->first_filter : &data->first_notrace; | ||
2010 | |||
2011 | /* | ||
2012 | * The 'ip' field could have multiple filters set, separated | ||
2013 | * either by space or comma. We first cut the filter and apply | ||
2014 | * all pieces separately. | ||
2015 | */ | ||
2016 | re = ftrace_function_filter_re(buf, len, &re_cnt); | ||
2017 | if (!re) | ||
2018 | return -EINVAL; | ||
2019 | |||
2020 | for (i = 0; i < re_cnt; i++) { | ||
2021 | ret = ftrace_function_set_regexp(data->ops, filter, *reset, | ||
2022 | re[i], strlen(re[i])); | ||
2023 | if (ret) | ||
2024 | break; | ||
2025 | |||
2026 | if (*reset) | ||
2027 | *reset = 0; | ||
2028 | } | ||
2029 | |||
2030 | argv_free(re); | ||
2031 | return ret; | ||
2032 | } | ||
2033 | |||
2034 | static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) | ||
2035 | { | ||
2036 | struct ftrace_event_field *field = pred->field; | ||
2037 | |||
2038 | if (leaf) { | ||
2039 | /* | ||
2040 | * Check the leaf predicate for function trace, verify: | ||
2041 | * - only '==' and '!=' are used | ||
2042 | * - the 'ip' field is used | ||
2043 | */ | ||
2044 | if ((pred->op != OP_EQ) && (pred->op != OP_NE)) | ||
2045 | return -EINVAL; | ||
2046 | |||
2047 | if (strcmp(field->name, "ip")) | ||
2048 | return -EINVAL; | ||
2049 | } else { | ||
2050 | /* | ||
2051 | * Check the non leaf predicate for function trace, verify: | ||
2052 | * - only '||' is used | ||
2053 | */ | ||
2054 | if (pred->op != OP_OR) | ||
2055 | return -EINVAL; | ||
2056 | } | ||
2057 | |||
2058 | return 0; | ||
2059 | } | ||
2060 | |||
2061 | static int ftrace_function_set_filter_cb(enum move_type move, | ||
2062 | struct filter_pred *pred, | ||
2063 | int *err, void *data) | ||
2064 | { | ||
2065 | /* Checking the node is valid for function trace. */ | ||
2066 | if ((move != MOVE_DOWN) || | ||
2067 | (pred->left != FILTER_PRED_INVALID)) { | ||
2068 | *err = ftrace_function_check_pred(pred, 0); | ||
2069 | } else { | ||
2070 | *err = ftrace_function_check_pred(pred, 1); | ||
2071 | if (*err) | ||
2072 | return WALK_PRED_ABORT; | ||
2073 | |||
2074 | *err = __ftrace_function_set_filter(pred->op == OP_EQ, | ||
2075 | pred->regex.pattern, | ||
2076 | pred->regex.len, | ||
2077 | data); | ||
2078 | } | ||
2079 | |||
2080 | return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; | ||
2081 | } | ||
2082 | |||
2083 | static int ftrace_function_set_filter(struct perf_event *event, | ||
2084 | struct event_filter *filter) | ||
2085 | { | ||
2086 | struct function_filter_data data = { | ||
2087 | .first_filter = 1, | ||
2088 | .first_notrace = 1, | ||
2089 | .ops = &event->ftrace_ops, | ||
2090 | }; | ||
2091 | |||
2092 | return walk_pred_tree(filter->preds, filter->root, | ||
2093 | ftrace_function_set_filter_cb, &data); | ||
2094 | } | ||
2095 | #else | ||
2096 | static int ftrace_function_set_filter(struct perf_event *event, | ||
2097 | struct event_filter *filter) | ||
2098 | { | ||
2099 | return -ENODEV; | ||
2100 | } | ||
2101 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
2102 | |||
1953 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, | 2103 | int ftrace_profile_set_filter(struct perf_event *event, int event_id, |
1954 | char *filter_str) | 2104 | char *filter_str) |
1955 | { | 2105 | { |
@@ -1970,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1970 | goto out_unlock; | 2120 | goto out_unlock; |
1971 | 2121 | ||
1972 | err = create_filter(call, filter_str, false, &filter); | 2122 | err = create_filter(call, filter_str, false, &filter); |
1973 | if (!err) | 2123 | if (err) |
1974 | event->filter = filter; | 2124 | goto free_filter; |
2125 | |||
2126 | if (ftrace_event_is_function(call)) | ||
2127 | err = ftrace_function_set_filter(event, filter); | ||
1975 | else | 2128 | else |
2129 | event->filter = filter; | ||
2130 | |||
2131 | free_filter: | ||
2132 | if (err || ftrace_event_is_function(call)) | ||
1976 | __free_filter(filter); | 2133 | __free_filter(filter); |
1977 | 2134 | ||
1978 | out_unlock: | 2135 | out_unlock: |
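The net effect of the checks above: a filter on ftrace:function may only test the ip field, only with '==' or '!=', and only join predicates with '||'; '==' patterns are fed to ftrace_set_filter() and '!=' patterns to ftrace_set_notrace(), and a single operand may carry several patterns separated by spaces or commas. Purely as an illustration (the function names are arbitrary), filters of this shape pass the checks:

    ip == vfs_read || ip == vfs_write
    ip != schedule*

while a filter using another field is rejected with the new "Only 'ip' field is supported for function trace" error, and '&&' or relational operators fail the predicate walk when the filter is applied.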
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..3dd15e8bc856 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -18,6 +18,16 @@ | |||
18 | #undef TRACE_SYSTEM | 18 | #undef TRACE_SYSTEM |
19 | #define TRACE_SYSTEM ftrace | 19 | #define TRACE_SYSTEM ftrace |
20 | 20 | ||
21 | /* | ||
22 | * The FTRACE_ENTRY_REG macro allows an ftrace entry to define a register | ||
23 | * function and thus become accessible via perf. | ||
24 | */ | ||
25 | #undef FTRACE_ENTRY_REG | ||
26 | #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ | ||
27 | filter, regfn) \ | ||
28 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ | ||
29 | filter) | ||
30 | |||
21 | /* not needed for this file */ | 31 | /* not needed for this file */ |
22 | #undef __field_struct | 32 | #undef __field_struct |
23 | #define __field_struct(type, item) | 33 | #define __field_struct(type, item) |
@@ -44,21 +54,22 @@ | |||
44 | #define F_printk(fmt, args...) fmt, args | 54 | #define F_printk(fmt, args...) fmt, args |
45 | 55 | ||
46 | #undef FTRACE_ENTRY | 56 | #undef FTRACE_ENTRY |
47 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | 57 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ |
48 | struct ____ftrace_##name { \ | 58 | struct ____ftrace_##name { \ |
49 | tstruct \ | 59 | tstruct \ |
50 | }; \ | 60 | }; \ |
51 | static void __always_unused ____ftrace_check_##name(void) \ | 61 | static void __always_unused ____ftrace_check_##name(void) \ |
52 | { \ | 62 | { \ |
53 | struct ____ftrace_##name *__entry = NULL; \ | 63 | struct ____ftrace_##name *__entry = NULL; \ |
54 | \ | 64 | \ |
55 | /* force compile-time check on F_printk() */ \ | 65 | /* force compile-time check on F_printk() */ \ |
56 | printk(print); \ | 66 | printk(print); \ |
57 | } | 67 | } |
58 | 68 | ||
59 | #undef FTRACE_ENTRY_DUP | 69 | #undef FTRACE_ENTRY_DUP |
60 | #define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ | 70 | #define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ |
61 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 71 | FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ |
72 | filter) | ||
62 | 73 | ||
63 | #include "trace_entries.h" | 74 | #include "trace_entries.h" |
64 | 75 | ||
@@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
67 | ret = trace_define_field(event_call, #type, #item, \ | 78 | ret = trace_define_field(event_call, #type, #item, \ |
68 | offsetof(typeof(field), item), \ | 79 | offsetof(typeof(field), item), \ |
69 | sizeof(field.item), \ | 80 | sizeof(field.item), \ |
70 | is_signed_type(type), FILTER_OTHER); \ | 81 | is_signed_type(type), filter_type); \ |
71 | if (ret) \ | 82 | if (ret) \ |
72 | return ret; | 83 | return ret; |
73 | 84 | ||
@@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
77 | offsetof(typeof(field), \ | 88 | offsetof(typeof(field), \ |
78 | container.item), \ | 89 | container.item), \ |
79 | sizeof(field.container.item), \ | 90 | sizeof(field.container.item), \ |
80 | is_signed_type(type), FILTER_OTHER); \ | 91 | is_signed_type(type), filter_type); \ |
81 | if (ret) \ | 92 | if (ret) \ |
82 | return ret; | 93 | return ret; |
83 | 94 | ||
@@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
91 | ret = trace_define_field(event_call, event_storage, #item, \ | 102 | ret = trace_define_field(event_call, event_storage, #item, \ |
92 | offsetof(typeof(field), item), \ | 103 | offsetof(typeof(field), item), \ |
93 | sizeof(field.item), \ | 104 | sizeof(field.item), \ |
94 | is_signed_type(type), FILTER_OTHER); \ | 105 | is_signed_type(type), filter_type); \ |
95 | mutex_unlock(&event_storage_mutex); \ | 106 | mutex_unlock(&event_storage_mutex); \ |
96 | if (ret) \ | 107 | if (ret) \ |
97 | return ret; \ | 108 | return ret; \ |
@@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
104 | offsetof(typeof(field), \ | 115 | offsetof(typeof(field), \ |
105 | container.item), \ | 116 | container.item), \ |
106 | sizeof(field.container.item), \ | 117 | sizeof(field.container.item), \ |
107 | is_signed_type(type), FILTER_OTHER); \ | 118 | is_signed_type(type), filter_type); \ |
108 | if (ret) \ | 119 | if (ret) \ |
109 | return ret; | 120 | return ret; |
110 | 121 | ||
@@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \ | |||
112 | #define __dynamic_array(type, item) \ | 123 | #define __dynamic_array(type, item) \ |
113 | ret = trace_define_field(event_call, #type, #item, \ | 124 | ret = trace_define_field(event_call, #type, #item, \ |
114 | offsetof(typeof(field), item), \ | 125 | offsetof(typeof(field), item), \ |
115 | 0, is_signed_type(type), FILTER_OTHER);\ | 126 | 0, is_signed_type(type), filter_type);\ |
116 | if (ret) \ | 127 | if (ret) \ |
117 | return ret; | 128 | return ret; |
118 | 129 | ||
119 | #undef FTRACE_ENTRY | 130 | #undef FTRACE_ENTRY |
120 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ | 131 | #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ |
121 | int \ | 132 | int \ |
122 | ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | 133 | ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ |
123 | { \ | 134 | { \ |
124 | struct struct_name field; \ | 135 | struct struct_name field; \ |
125 | int ret; \ | 136 | int ret; \ |
137 | int filter_type = filter; \ | ||
126 | \ | 138 | \ |
127 | tstruct; \ | 139 | tstruct; \ |
128 | \ | 140 | \ |
@@ -150,15 +162,17 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | |||
150 | #define __dynamic_array(type, item) | 162 | #define __dynamic_array(type, item) |
151 | 163 | ||
152 | #undef F_printk | 164 | #undef F_printk |
153 | #define F_printk(fmt, args...) #fmt ", " __stringify(args) | 165 | #define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) |
154 | 166 | ||
155 | #undef FTRACE_ENTRY | 167 | #undef FTRACE_ENTRY_REG |
156 | #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ | 168 | #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ |
169 | regfn) \ | ||
157 | \ | 170 | \ |
158 | struct ftrace_event_class event_class_ftrace_##call = { \ | 171 | struct ftrace_event_class event_class_ftrace_##call = { \ |
159 | .system = __stringify(TRACE_SYSTEM), \ | 172 | .system = __stringify(TRACE_SYSTEM), \ |
160 | .define_fields = ftrace_define_fields_##call, \ | 173 | .define_fields = ftrace_define_fields_##call, \ |
161 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ | 174 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
175 | .reg = regfn, \ | ||
162 | }; \ | 176 | }; \ |
163 | \ | 177 | \ |
164 | struct ftrace_event_call __used event_##call = { \ | 178 | struct ftrace_event_call __used event_##call = { \ |
@@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = { \ | |||
170 | struct ftrace_event_call __used \ | 184 | struct ftrace_event_call __used \ |
171 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 185 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
172 | 186 | ||
187 | #undef FTRACE_ENTRY | ||
188 | #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ | ||
189 | FTRACE_ENTRY_REG(call, struct_name, etype, \ | ||
190 | PARAMS(tstruct), PARAMS(print), filter, NULL) | ||
191 | |||
192 | int ftrace_event_is_function(struct ftrace_event_call *call) | ||
193 | { | ||
194 | return call == &event_function; | ||
195 | } | ||
196 | |||
173 | #include "trace_entries.h" | 197 | #include "trace_entries.h" |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 00d527c945a4..580a05ec926b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1892 | #endif /* CONFIG_PERF_EVENTS */ | 1892 | #endif /* CONFIG_PERF_EVENTS */ |
1893 | 1893 | ||
1894 | static __kprobes | 1894 | static __kprobes |
1895 | int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) | 1895 | int kprobe_register(struct ftrace_event_call *event, |
1896 | enum trace_reg type, void *data) | ||
1896 | { | 1897 | { |
1897 | struct trace_probe *tp = (struct trace_probe *)event->data; | 1898 | struct trace_probe *tp = (struct trace_probe *)event->data; |
1898 | 1899 | ||
@@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) | |||
1909 | case TRACE_REG_PERF_UNREGISTER: | 1910 | case TRACE_REG_PERF_UNREGISTER: |
1910 | disable_trace_probe(tp, TP_FLAG_PROFILE); | 1911 | disable_trace_probe(tp, TP_FLAG_PROFILE); |
1911 | return 0; | 1912 | return 0; |
1913 | case TRACE_REG_PERF_OPEN: | ||
1914 | case TRACE_REG_PERF_CLOSE: | ||
1915 | case TRACE_REG_PERF_ADD: | ||
1916 | case TRACE_REG_PERF_DEL: | ||
1917 | return 0; | ||
1912 | #endif | 1918 | #endif |
1913 | } | 1919 | } |
1914 | return 0; | 1920 | return 0; |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..859fae6b1825 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) | |||
264 | return ret; | 264 | return ret; |
265 | } | 265 | } |
266 | 266 | ||
267 | int trace_seq_path(struct trace_seq *s, struct path *path) | 267 | int trace_seq_path(struct trace_seq *s, const struct path *path) |
268 | { | 268 | { |
269 | unsigned char *p; | 269 | unsigned char *p; |
270 | 270 | ||
@@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, | |||
300 | unsigned long mask; | 300 | unsigned long mask; |
301 | const char *str; | 301 | const char *str; |
302 | const char *ret = p->buffer + p->len; | 302 | const char *ret = p->buffer + p->len; |
303 | int i; | 303 | int i, first = 1; |
304 | 304 | ||
305 | for (i = 0; flag_array[i].name && flags; i++) { | 305 | for (i = 0; flag_array[i].name && flags; i++) { |
306 | 306 | ||
@@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, | |||
310 | 310 | ||
311 | str = flag_array[i].name; | 311 | str = flag_array[i].name; |
312 | flags &= ~mask; | 312 | flags &= ~mask; |
313 | if (p->len && delim) | 313 | if (!first && delim) |
314 | trace_seq_puts(p, delim); | 314 | trace_seq_puts(p, delim); |
315 | else | ||
316 | first = 0; | ||
315 | trace_seq_puts(p, str); | 317 | trace_seq_puts(p, str); |
316 | } | 318 | } |
317 | 319 | ||
318 | /* check for left over flags */ | 320 | /* check for left over flags */ |
319 | if (flags) { | 321 | if (flags) { |
320 | if (p->len && delim) | 322 | if (!first && delim) |
321 | trace_seq_puts(p, delim); | 323 | trace_seq_puts(p, delim); |
322 | trace_seq_printf(p, "0x%lx", flags); | 324 | trace_seq_printf(p, "0x%lx", flags); |
323 | } | 325 | } |
@@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
344 | break; | 346 | break; |
345 | } | 347 | } |
346 | 348 | ||
347 | if (!p->len) | 349 | if (ret == (const char *)(p->buffer + p->len)) |
348 | trace_seq_printf(p, "0x%lx", val); | 350 | trace_seq_printf(p, "0x%lx", val); |
349 | 351 | ||
350 | trace_seq_putc(p, 0); | 352 | trace_seq_putc(p, 0); |
@@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, | |||
370 | break; | 372 | break; |
371 | } | 373 | } |
372 | 374 | ||
373 | if (!p->len) | 375 | if (ret == (const char *)(p->buffer + p->len)) |
374 | trace_seq_printf(p, "0x%llx", val); | 376 | trace_seq_printf(p, "0x%llx", val); |
375 | 377 | ||
376 | trace_seq_putc(p, 0); | 378 | trace_seq_putc(p, 0); |
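The first/ret checks above matter because one TP_printk() can call __print_flags() or __print_symbolic() several times against the same temporary trace_seq: by the second call p->len is already non-zero, so the old tests prepended a stray delimiter to the first flag and the 0x%lx fallback for an unmatched value could never fire. A hypothetical event printing two flag sets shows the difference (flag tables abbreviated):

    TP_printk("gfp=%s mode=%s",
              __print_flags(__entry->gfp,  "|", { 1, "WAIT" }, { 2, "IO" }),
              __print_flags(__entry->mode, "|", { 1, "A" },    { 2, "B" }))

    before the fix:  gfp=WAIT|IO mode=|A|B    (stray leading '|')
    after the fix:   gfp=WAIT|IO mode=A|B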
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb654542c1a1..96fc73369099 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); | |||
17 | static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); | 17 | static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); |
18 | 18 | ||
19 | static int syscall_enter_register(struct ftrace_event_call *event, | 19 | static int syscall_enter_register(struct ftrace_event_call *event, |
20 | enum trace_reg type); | 20 | enum trace_reg type, void *data); |
21 | static int syscall_exit_register(struct ftrace_event_call *event, | 21 | static int syscall_exit_register(struct ftrace_event_call *event, |
22 | enum trace_reg type); | 22 | enum trace_reg type, void *data); |
23 | 23 | ||
24 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
25 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 25 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
@@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void) | |||
468 | unsigned long addr; | 468 | unsigned long addr; |
469 | int i; | 469 | int i; |
470 | 470 | ||
471 | syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * | 471 | syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), |
472 | NR_syscalls, GFP_KERNEL); | 472 | GFP_KERNEL); |
473 | if (!syscalls_metadata) { | 473 | if (!syscalls_metadata) { |
474 | WARN_ON(1); | 474 | WARN_ON(1); |
475 | return -ENOMEM; | 475 | return -ENOMEM; |
@@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call) | |||
649 | #endif /* CONFIG_PERF_EVENTS */ | 649 | #endif /* CONFIG_PERF_EVENTS */ |
650 | 650 | ||
651 | static int syscall_enter_register(struct ftrace_event_call *event, | 651 | static int syscall_enter_register(struct ftrace_event_call *event, |
652 | enum trace_reg type) | 652 | enum trace_reg type, void *data) |
653 | { | 653 | { |
654 | switch (type) { | 654 | switch (type) { |
655 | case TRACE_REG_REGISTER: | 655 | case TRACE_REG_REGISTER: |
@@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event, | |||
664 | case TRACE_REG_PERF_UNREGISTER: | 664 | case TRACE_REG_PERF_UNREGISTER: |
665 | perf_sysenter_disable(event); | 665 | perf_sysenter_disable(event); |
666 | return 0; | 666 | return 0; |
667 | case TRACE_REG_PERF_OPEN: | ||
668 | case TRACE_REG_PERF_CLOSE: | ||
669 | case TRACE_REG_PERF_ADD: | ||
670 | case TRACE_REG_PERF_DEL: | ||
671 | return 0; | ||
667 | #endif | 672 | #endif |
668 | } | 673 | } |
669 | return 0; | 674 | return 0; |
670 | } | 675 | } |
671 | 676 | ||
672 | static int syscall_exit_register(struct ftrace_event_call *event, | 677 | static int syscall_exit_register(struct ftrace_event_call *event, |
673 | enum trace_reg type) | 678 | enum trace_reg type, void *data) |
674 | { | 679 | { |
675 | switch (type) { | 680 | switch (type) { |
676 | case TRACE_REG_REGISTER: | 681 | case TRACE_REG_REGISTER: |
@@ -685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
685 | case TRACE_REG_PERF_UNREGISTER: | 690 | case TRACE_REG_PERF_UNREGISTER: |
686 | perf_sysexit_disable(event); | 691 | perf_sysexit_disable(event); |
687 | return 0; | 692 | return 0; |
693 | case TRACE_REG_PERF_OPEN: | ||
694 | case TRACE_REG_PERF_CLOSE: | ||
695 | case TRACE_REG_PERF_ADD: | ||
696 | case TRACE_REG_PERF_DEL: | ||
697 | return 0; | ||
688 | #endif | 698 | #endif |
689 | } | 699 | } |
690 | return 0; | 700 | return 0; |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99d..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/jump_label.h> | 28 | #include <linux/static_key.h> |
29 | 29 | ||
30 | extern struct tracepoint * const __start___tracepoints_ptrs[]; | 30 | extern struct tracepoint * const __start___tracepoints_ptrs[]; |
31 | extern struct tracepoint * const __stop___tracepoints_ptrs[]; | 31 | extern struct tracepoint * const __stop___tracepoints_ptrs[]; |
@@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
256 | { | 256 | { |
257 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); | 257 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); |
258 | 258 | ||
259 | if (elem->regfunc && !jump_label_enabled(&elem->key) && active) | 259 | if (elem->regfunc && !static_key_enabled(&elem->key) && active) |
260 | elem->regfunc(); | 260 | elem->regfunc(); |
261 | else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) | 261 | else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) |
262 | elem->unregfunc(); | 262 | elem->unregfunc(); |
263 | 263 | ||
264 | /* | 264 | /* |
@@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
269 | * is used. | 269 | * is used. |
270 | */ | 270 | */ |
271 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 271 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
272 | if (active && !jump_label_enabled(&elem->key)) | 272 | if (active && !static_key_enabled(&elem->key)) |
273 | jump_label_inc(&elem->key); | 273 | static_key_slow_inc(&elem->key); |
274 | else if (!active && jump_label_enabled(&elem->key)) | 274 | else if (!active && static_key_enabled(&elem->key)) |
275 | jump_label_dec(&elem->key); | 275 | static_key_slow_dec(&elem->key); |
276 | } | 276 | } |
277 | 277 | ||
278 | /* | 278 | /* |
@@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
283 | */ | 283 | */ |
284 | static void disable_tracepoint(struct tracepoint *elem) | 284 | static void disable_tracepoint(struct tracepoint *elem) |
285 | { | 285 | { |
286 | if (elem->unregfunc && jump_label_enabled(&elem->key)) | 286 | if (elem->unregfunc && static_key_enabled(&elem->key)) |
287 | elem->unregfunc(); | 287 | elem->unregfunc(); |
288 | 288 | ||
289 | if (jump_label_enabled(&elem->key)) | 289 | if (static_key_enabled(&elem->key)) |
290 | jump_label_dec(&elem->key); | 290 | static_key_slow_dec(&elem->key); |
291 | rcu_assign_pointer(elem->funcs, NULL); | 291 | rcu_assign_pointer(elem->funcs, NULL); |
292 | } | 292 | } |
293 | 293 | ||
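The tracepoint.c conversion above is an API rename: jump_label_enabled()/jump_label_inc()/jump_label_dec() become static_key_enabled()/static_key_slow_inc()/static_key_slow_dec(), with the header moving from <linux/jump_label.h> to <linux/static_key.h>. A minimal usage sketch of the static_key API; the key and the helper below are hypothetical and not part of this patch:

#include <linux/static_key.h>

static void my_slow_extra_work(void);	/* hypothetical helper */

static struct static_key my_key = STATIC_KEY_INIT_FALSE;	/* branch off by default */

static void fast_path(void)
{
	if (static_key_false(&my_key))	/* patched into a jump when the key is on */
		my_slow_extra_work();
}

static void turn_on(void)
{
	if (!static_key_enabled(&my_key))
		static_key_slow_inc(&my_key);	/* reference-counted enable */
}

static void turn_off(void)
{
	if (static_key_enabled(&my_key))
		static_key_slow_dec(&my_key);	/* count drops to zero -> branch off */
}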
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d117262deba3..df30ee08bdd4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -3,15 +3,14 @@ | |||
3 | * | 3 | * |
4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | 4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. |
5 | * | 5 | * |
6 | * this code detects hard lockups: incidents in where on a CPU | 6 | * Note: Most of this code is borrowed heavily from the original softlockup |
7 | * the kernel does not respond to anything except NMI. | 7 | * detector, so thanks to Ingo for the initial implementation. |
8 | * | 8 | * Some chunks also taken from the old x86-specific nmi watchdog code, thanks |
9 | * Note: Most of this code is borrowed heavily from softlockup.c, | ||
10 | * so thanks to Ingo for the initial implementation. | ||
11 | * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks | ||
12 | * to those contributors as well. | 9 | * to those contributors as well. |
13 | */ | 10 | */ |
14 | 11 | ||
12 | #define pr_fmt(fmt) "NMI watchdog: " fmt | ||
13 | |||
15 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
16 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
17 | #include <linux/nmi.h> | 16 | #include <linux/nmi.h> |
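The pr_fmt() definition added above is what lets every later message in this file drop its hand-written "NMI watchdog: " prefix: pr_info()/pr_warning()/pr_err() expand to printk() with pr_fmt() applied to the format string. A minimal sketch of the mechanism in a hypothetical file, assuming the standard printk.h behaviour:

/* Must be defined before <linux/printk.h> is pulled in (here via kernel.h). */
#define pr_fmt(fmt) "NMI watchdog: " fmt

#include <linux/kernel.h>

static void example(void)
{
	/* emits: "NMI watchdog: enabled, takes one hw-pmu counter." */
	pr_info("enabled, takes one hw-pmu counter.\n");
}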
@@ -117,9 +116,10 @@ static unsigned long get_sample_period(void) | |||
117 | { | 116 | { |
118 | /* | 117 | /* |
119 | * convert watchdog_thresh from seconds to ns | 118 | * convert watchdog_thresh from seconds to ns |
120 | * the divide by 5 is to give hrtimer 5 chances to | 119 | * the divide by 5 is to give hrtimer several chances (two |
121 | * increment before the hardlockup detector generates | 120 | * or three with the current relation between the soft |
122 | * a warning | 121 | * and hard thresholds) to increment before the |
122 | * hardlockup detector generates a warning | ||
123 | */ | 123 | */ |
124 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); | 124 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); |
125 | } | 125 | } |
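For a concrete reading of the corrected comment above (assuming the usual default of watchdog_thresh = 10 seconds and get_softlockup_thresh() returning twice that, which this hunk does not show): 20 * (NSEC_PER_SEC / 5) = 4 * NSEC_PER_SEC, i.e. a 4-second sample period, matching the "4 seconds by default" figure in the watchdog() comment later in this diff. Within the 10-second hard-lockup window that leaves the hrtimer only two or three chances to increment, which is why "5 chances" was wrong.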
@@ -321,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
321 | */ | 321 | */ |
322 | static int watchdog(void *unused) | 322 | static int watchdog(void *unused) |
323 | { | 323 | { |
324 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 324 | struct sched_param param = { .sched_priority = 0 }; |
325 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 325 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
326 | 326 | ||
327 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
328 | |||
329 | /* initialize timestamp */ | 327 | /* initialize timestamp */ |
330 | __touch_watchdog(); | 328 | __touch_watchdog(); |
331 | 329 | ||
@@ -336,9 +334,11 @@ static int watchdog(void *unused) | |||
336 | 334 | ||
337 | set_current_state(TASK_INTERRUPTIBLE); | 335 | set_current_state(TASK_INTERRUPTIBLE); |
338 | /* | 336 | /* |
339 | * Run briefly once per second to reset the softlockup timestamp. | 337 | * Run briefly (kicked by the hrtimer callback function) once every |
340 | * If this gets delayed for more than 60 seconds then the | 338 | * get_sample_period() seconds (4 seconds by default) to reset the |
341 | * debug-printout triggers in watchdog_timer_fn(). | 339 | * softlockup timestamp. If this gets delayed for more than |
340 | * 2*watchdog_thresh seconds then the debug-printout triggers in | ||
341 | * watchdog_timer_fn(). | ||
342 | */ | 342 | */ |
343 | while (!kthread_should_stop()) { | 343 | while (!kthread_should_stop()) { |
344 | __touch_watchdog(); | 344 | __touch_watchdog(); |
@@ -349,8 +349,11 @@ static int watchdog(void *unused) | |||
349 | 349 | ||
350 | set_current_state(TASK_INTERRUPTIBLE); | 350 | set_current_state(TASK_INTERRUPTIBLE); |
351 | } | 351 | } |
352 | /* | ||
353 | * Drop the policy/priority elevation during thread exit to avoid a | ||
354 | * scheduling latency spike. | ||
355 | */ | ||
352 | __set_current_state(TASK_RUNNING); | 356 | __set_current_state(TASK_RUNNING); |
353 | param.sched_priority = 0; | ||
354 | sched_setscheduler(current, SCHED_NORMAL, ¶m); | 357 | sched_setscheduler(current, SCHED_NORMAL, ¶m); |
355 | return 0; | 358 | return 0; |
356 | } | 359 | } |
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu) | |||
376 | /* Try to register using hardware perf events */ | 379 | /* Try to register using hardware perf events */ |
377 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 380 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
378 | if (!IS_ERR(event)) { | 381 | if (!IS_ERR(event)) { |
379 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 382 | pr_info("enabled, takes one hw-pmu counter.\n"); |
380 | goto out_save; | 383 | goto out_save; |
381 | } | 384 | } |
382 | 385 | ||
383 | 386 | ||
384 | /* vary the KERN level based on the returned errno */ | 387 | /* vary the KERN level based on the returned errno */ |
385 | if (PTR_ERR(event) == -EOPNOTSUPP) | 388 | if (PTR_ERR(event) == -EOPNOTSUPP) |
386 | printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | 389 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); |
387 | else if (PTR_ERR(event) == -ENOENT) | 390 | else if (PTR_ERR(event) == -ENOENT) |
388 | printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); | 391 | pr_warning("disabled (cpu%i): hardware events not enabled\n", |
392 | cpu); | ||
389 | else | 393 | else |
390 | printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); | 394 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", |
395 | cpu, PTR_ERR(event)); | ||
391 | return PTR_ERR(event); | 396 | return PTR_ERR(event); |
392 | 397 | ||
393 | /* success path */ | 398 | /* success path */ |
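The error handling above is the standard ERR_PTR idiom: perf_event_create_kernel_counter() returns either a valid pointer or an encoded negative errno, and the log level is chosen from PTR_ERR(). A generic sketch of the idiom, with a hypothetical constructor and type:

#include <linux/err.h>

static long my_setup(void)
{
	struct thing *t = my_create();		/* hypothetical constructor */

	if (IS_ERR(t)) {
		long err = PTR_ERR(t);		/* negative errno, e.g. -EOPNOTSUPP */

		if (err == -EOPNOTSUPP)
			pr_info("not supported\n");
		else
			pr_err("failed: %ld\n", err);
		return err;
	}
	/* ... use t ... */
	return 0;
}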
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu) | |||
439 | 444 | ||
440 | /* create the watchdog thread */ | 445 | /* create the watchdog thread */ |
441 | if (!p) { | 446 | if (!p) { |
447 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
442 | p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); | 448 | p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); |
443 | if (IS_ERR(p)) { | 449 | if (IS_ERR(p)) { |
444 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 450 | pr_err("softlockup watchdog for %i failed\n", cpu); |
445 | if (!err) { | 451 | if (!err) { |
446 | /* if hardlockup hasn't already set this */ | 452 | /* if hardlockup hasn't already set this */ |
447 | err = PTR_ERR(p); | 453 | err = PTR_ERR(p); |
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu) | |||
450 | } | 456 | } |
451 | goto out; | 457 | goto out; |
452 | } | 458 | } |
459 | sched_setscheduler(p, SCHED_FIFO, ¶m); | ||
453 | kthread_bind(p, cpu); | 460 | kthread_bind(p, cpu); |
454 | per_cpu(watchdog_touch_ts, cpu) = 0; | 461 | per_cpu(watchdog_touch_ts, cpu) = 0; |
455 | per_cpu(softlockup_watchdog, cpu) = p; | 462 | per_cpu(softlockup_watchdog, cpu) = p; |
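Together with the watchdog() hunks earlier in this file, the change above moves the SCHED_FIFO elevation out of the thread's own body and into its creator: the priority is applied with sched_setscheduler() on the freshly created kthread before it is bound and woken. A hedged sketch of that create-then-elevate pattern, with hypothetical thread and name strings:

static int my_start_per_cpu_thread(int cpu)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *p;

	p = kthread_create_on_node(my_thread_fn, NULL, cpu_to_node(cpu),
				   "my_kthread/%d", cpu);
	if (IS_ERR(p))
		return PTR_ERR(p);

	sched_setscheduler(p, SCHED_FIFO, &param);	/* elevate before first run */
	kthread_bind(p, cpu);				/* pin to its CPU */
	wake_up_process(p);
	return 0;
}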
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void) | |||
496 | watchdog_enabled = 1; | 503 | watchdog_enabled = 1; |
497 | 504 | ||
498 | if (!watchdog_enabled) | 505 | if (!watchdog_enabled) |
499 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | 506 | pr_err("failed to be enabled on some cpus\n"); |
500 | 507 | ||
501 | } | 508 | } |
502 | 509 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f2c5638bb5ab..5abf42f63c08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -476,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, | |||
476 | struct workqueue_struct *wq) | 476 | struct workqueue_struct *wq) |
477 | { | 477 | { |
478 | if (!(wq->flags & WQ_UNBOUND)) { | 478 | if (!(wq->flags & WQ_UNBOUND)) { |
479 | if (likely(cpu < nr_cpu_ids)) { | 479 | if (likely(cpu < nr_cpu_ids)) |
480 | #ifdef CONFIG_SMP | ||
481 | return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); | 480 | return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); |
482 | #else | ||
483 | return wq->cpu_wq.single; | ||
484 | #endif | ||
485 | } | ||
486 | } else if (likely(cpu == WORK_CPU_UNBOUND)) | 481 | } else if (likely(cpu == WORK_CPU_UNBOUND)) |
487 | return wq->cpu_wq.single; | 482 | return wq->cpu_wq.single; |
488 | return NULL; | 483 | return NULL; |
@@ -2899,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq) | |||
2899 | const size_t size = sizeof(struct cpu_workqueue_struct); | 2894 | const size_t size = sizeof(struct cpu_workqueue_struct); |
2900 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, | 2895 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, |
2901 | __alignof__(unsigned long long)); | 2896 | __alignof__(unsigned long long)); |
2902 | #ifdef CONFIG_SMP | ||
2903 | bool percpu = !(wq->flags & WQ_UNBOUND); | ||
2904 | #else | ||
2905 | bool percpu = false; | ||
2906 | #endif | ||
2907 | 2897 | ||
2908 | if (percpu) | 2898 | if (!(wq->flags & WQ_UNBOUND)) |
2909 | wq->cpu_wq.pcpu = __alloc_percpu(size, align); | 2899 | wq->cpu_wq.pcpu = __alloc_percpu(size, align); |
2910 | else { | 2900 | else { |
2911 | void *ptr; | 2901 | void *ptr; |
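The workqueue.c hunks here (get_cwq and alloc_cwqs above, with free_cwqs following the same pattern below) all remove the same CONFIG_SMP special-casing: per_cpu_ptr(), alloc_percpu() and free_percpu() now behave sensibly on UP builds as well, so bound workqueues take the per-cpu path unconditionally and only WQ_UNBOUND keeps the single cwq. A simplified sketch of the resulting shape, using hypothetical type names rather than the real cpu_workqueue_struct layout and ignoring the alignment handling in the actual code:

struct my_cwq { int stat; };			/* hypothetical per-cpu payload */

struct my_wq {
	unsigned int flags;			/* may contain WQ_UNBOUND */
	union {
		struct my_cwq __percpu *pcpu;	/* bound: one cwq per CPU */
		struct my_cwq *single;		/* unbound: a single cwq */
	} cpu_wq;
};

static int my_alloc_cwqs(struct my_wq *wq)
{
	if (!(wq->flags & WQ_UNBOUND)) {
		wq->cpu_wq.pcpu = alloc_percpu(struct my_cwq);
		return wq->cpu_wq.pcpu ? 0 : -ENOMEM;
	}
	wq->cpu_wq.single = kzalloc(sizeof(struct my_cwq), GFP_KERNEL);
	return wq->cpu_wq.single ? 0 : -ENOMEM;
}

static void my_free_cwqs(struct my_wq *wq)
{
	if (!(wq->flags & WQ_UNBOUND))
		free_percpu(wq->cpu_wq.pcpu);
	else
		kfree(wq->cpu_wq.single);
}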
@@ -2929,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) | |||
2929 | 2919 | ||
2930 | static void free_cwqs(struct workqueue_struct *wq) | 2920 | static void free_cwqs(struct workqueue_struct *wq) |
2931 | { | 2921 | { |
2932 | #ifdef CONFIG_SMP | 2922 | if (!(wq->flags & WQ_UNBOUND)) |
2933 | bool percpu = !(wq->flags & WQ_UNBOUND); | ||
2934 | #else | ||
2935 | bool percpu = false; | ||
2936 | #endif | ||
2937 | |||
2938 | if (percpu) | ||
2939 | free_percpu(wq->cpu_wq.pcpu); | 2923 | free_percpu(wq->cpu_wq.pcpu); |
2940 | else if (wq->cpu_wq.single) { | 2924 | else if (wq->cpu_wq.single) { |
2941 | /* the pointer to free is stored right after the cwq */ | 2925 | /* the pointer to free is stored right after the cwq */ |