author		Tejun Heo <tj@kernel.org>	2012-04-01 15:30:01 -0400
committer	Tejun Heo <tj@kernel.org>	2012-04-01 15:55:00 -0400
commit		959d851caa48829eb85cb85aa949fd6b4c5d5bc6 (patch)
tree		3ba9c94ec346275fb44c4f0d1cd2537cdff8d811 /kernel
parent		a5567932fc926739e29e98487128080f40c61710 (diff)
parent		48ddbe194623ae089cc0576e60363f2d2e85662a (diff)
Merge branch 'for-3.5' of ../cgroup into block/for-3.5/core-merged
cgroup/for-3.5 contains the following changes which blk-cgroup needs to
proceed with the on-going cleanup.

* Dynamic addition and removal of cftypes to make config/stat file
  handling modular for policies.

* cgroup removal update to not wait for css references to drain to fix
  blkcg removal hang caused by cfq caching cfqgs.

Pull in cgroup/for-3.5 into block/for-3.5/core.  This causes the
following conflicts in block/blk-cgroup.c.

* 761b3ef50e "cgroup: remove cgroup_subsys argument from callbacks"
  conflicts with blkiocg_pre_destroy() addition and blkiocg_attach()
  removal.  Resolved by removing @subsys from all subsys methods.

* 676f7c8f84 "cgroup: relocate cftype and cgroup_subsys definitions in
  controllers" conflicts with ->pre_destroy() and ->attach() updates
  and removal of modular config.  Resolved by dropping forward
  declarations of the methods and applying updates to the relocated
  blkio_subsys.

* 4baf6e3325 "cgroup: convert all non-memcg controllers to the new
  cftype interface" builds upon the previous item.  Resolved by adding
  ->base_cftypes to the relocated blkio_subsys.

Signed-off-by: Tejun Heo <tj@kernel.org>
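As context for the first conflict item above: commit 761b3ef50e drops the
subsystem argument from every cgroup_subsys callback, which is why the
resolution removes @subsys from all methods (the cgroup.c hunks below show
ss->pre_destroy(ss, cgrp) becoming ss->pre_destroy(cgrp), for instance).
The following is only a minimal, self-contained C sketch of that signature
change; the struct and function names are simplified stand-ins, not the
kernel's actual definitions.

#include <stdio.h>

struct cgroup;	/* opaque stand-in for the kernel's struct cgroup */

/* Old callback style: every method also received the owning subsystem. */
struct old_subsys_ops {
	int (*pre_destroy)(struct old_subsys_ops *ss, struct cgroup *cgrp);
};

/* New callback style after "remove cgroup_subsys argument from callbacks":
 * the method only receives the cgroup it operates on. */
struct new_subsys_ops {
	int (*pre_destroy)(struct cgroup *cgrp);
};

static int example_pre_destroy(struct cgroup *cgrp)
{
	(void)cgrp;	/* illustrative only; nothing to tear down here */
	printf("pre_destroy invoked without a subsys argument\n");
	return 0;
}

int main(void)
{
	struct new_subsys_ops ops = { .pre_destroy = example_pre_destroy };

	/* Callers now pass just the cgroup, mirroring ss->pre_destroy(cgrp). */
	return ops.pre_destroy(NULL);
}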
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Kconfig.preempt | 1
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/cgroup.c | 904
-rw-r--r--  kernel/cgroup_freezer.c | 22
-rw-r--r--  kernel/compat.c | 68
-rw-r--r--  kernel/cpuset.c | 111
-rw-r--r--  kernel/cred.c | 1
-rw-r--r--  kernel/debug/debug_core.c | 34
-rw-r--r--  kernel/debug/gdbstub.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 7
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 95
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 7
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/dma.c | 1
-rw-r--r--  kernel/events/core.c | 266
-rw-r--r--  kernel/events/hw_breakpoint.c | 17
-rw-r--r--  kernel/exit.c | 70
-rw-r--r--  kernel/fork.c | 101
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 89
-rw-r--r--  kernel/futex_compat.c | 38
-rw-r--r--  kernel/hung_task.c | 11
-rw-r--r--  kernel/irq/Kconfig | 15
-rw-r--r--  kernel/irq/autoprobe.c | 4
-rw-r--r--  kernel/irq/chip.c | 47
-rw-r--r--  kernel/irq/handle.c | 28
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdomain.c | 828
-rw-r--r--  kernel/irq/manage.c | 149
-rw-r--r--  kernel/irq/migration.c | 10
-rw-r--r--  kernel/jump_label.c | 135
-rw-r--r--  kernel/kexec.c | 15
-rw-r--r--  kernel/kmod.c | 84
-rw-r--r--  kernel/kprobes.c | 12
-rw-r--r--  kernel/lockdep.c | 8
-rw-r--r--  kernel/module.c | 37
-rw-r--r--  kernel/mutex.c | 4
-rw-r--r--  kernel/padata.c | 44
-rw-r--r--  kernel/params.c | 40
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/pid_namespace.c | 41
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/hibernate.c | 47
-rw-r--r--  kernel/power/main.c | 20
-rw-r--r--  kernel/power/power.h | 7
-rw-r--r--  kernel/power/process.c | 24
-rw-r--r--  kernel/power/qos.c | 23
-rw-r--r--  kernel/power/snapshot.c | 35
-rw-r--r--  kernel/power/suspend.c | 84
-rw-r--r--  kernel/power/user.c | 12
-rw-r--r--  kernel/printk.c | 51
-rw-r--r--  kernel/ptrace.c | 66
-rw-r--r--  kernel/rcu.h | 26
-rw-r--r--  kernel/rcupdate.c | 5
-rw-r--r--  kernel/rcutiny.c | 26
-rw-r--r--  kernel/rcutiny_plugin.h | 77
-rw-r--r--  kernel/rcutorture.c | 91
-rw-r--r--  kernel/rcutree.c | 507
-rw-r--r--  kernel/rcutree.h | 27
-rw-r--r--  kernel/rcutree_plugin.h | 450
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 3
-rw-r--r--  kernel/rwsem.c | 1
-rw-r--r--  kernel/sched/auto_group.c | 12
-rw-r--r--  kernel/sched/core.c | 255
-rw-r--r--  kernel/sched/debug.c | 1
-rw-r--r--  kernel/sched/fair.c | 418
-rw-r--r--  kernel/sched/rt.c | 45
-rw-r--r--  kernel/sched/sched.h | 32
-rw-r--r--  kernel/sched/stats.c | 4
-rw-r--r--  kernel/signal.c | 56
-rw-r--r--  kernel/smp.c | 90
-rw-r--r--  kernel/softirq.c | 34
-rw-r--r--  kernel/spinlock.c | 2
-rw-r--r--  kernel/srcu.c | 33
-rw-r--r--  kernel/sys.c | 19
-rw-r--r--  kernel/sysctl.c | 514
-rw-r--r--  kernel/sysctl_check.c | 160
-rw-r--r--  kernel/time.c | 6
-rw-r--r--  kernel/time/alarmtimer.c | 8
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/ntp.c | 191
-rw-r--r--  kernel/time/tick-broadcast.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 17
-rw-r--r--  kernel/time/timekeeping.c | 373
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/ftrace.c | 137
-rw-r--r--  kernel/trace/ring_buffer.c | 157
-rw-r--r--  kernel/trace/trace.c | 119
-rw-r--r--  kernel/trace/trace.h | 37
-rw-r--r--  kernel/trace/trace_entries.h | 70
-rw-r--r--  kernel/trace/trace_event_perf.c | 208
-rw-r--r--  kernel/trace/trace_events.c | 12
-rw-r--r--  kernel/trace/trace_events_filter.c | 175
-rw-r--r--  kernel/trace/trace_export.c | 66
-rw-r--r--  kernel/trace/trace_kprobe.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_syscalls.c | 22
-rw-r--r--  kernel/tracepoint.c | 20
-rw-r--r--  kernel/watchdog.c | 51
-rw-r--r--  kernel/workqueue.c | 22
106 files changed, 5181 insertions, 3197 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75f..2251882daf53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE
 	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
 		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
 
-config INLINE_SPIN_UNLOCK
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+config UNINLINE_SPIN_UNLOCK
+	bool
 
 config INLINE_SPIN_UNLOCK_BH
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26a..3f9c97419f02 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	select PREEMPT_COUNT
+	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0a..1c7f2c61416b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct path *path)
+		      const struct path *path)
 {
 	char *p, *pathname;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f77..2905977e0f33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,6 +63,9 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/* css deactivation bias, makes css->refcnt negative to deny new trygets */
67#define CSS_DEACT_BIAS INT_MIN
68
66/* 69/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its 70 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it. 71 * hierarchy must be performed while holding it.
@@ -127,6 +130,9 @@ struct cgroupfs_root {
127 /* A list running through the active hierarchies */ 130 /* A list running through the active hierarchies */
128 struct list_head root_list; 131 struct list_head root_list;
129 132
133 /* All cgroups on this root, cgroup_mutex protected */
134 struct list_head allcg_list;
135
130 /* Hierarchy-specific flags */ 136 /* Hierarchy-specific flags */
131 unsigned long flags; 137 unsigned long flags;
132 138
@@ -145,6 +151,15 @@ struct cgroupfs_root {
145static struct cgroupfs_root rootnode; 151static struct cgroupfs_root rootnode;
146 152
147/* 153/*
154 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
155 */
156struct cfent {
157 struct list_head node;
158 struct dentry *dentry;
159 struct cftype *type;
160};
161
162/*
148 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 163 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
149 * cgroup_subsys->use_id != 0. 164 * cgroup_subsys->use_id != 0.
150 */ 165 */
@@ -239,6 +254,14 @@ int cgroup_lock_is_held(void)
239 254
240EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 255EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
241 256
257/* the current nr of refs, always >= 0 whether @css is deactivated or not */
258static int css_refcnt(struct cgroup_subsys_state *css)
259{
260 int v = atomic_read(&css->refcnt);
261
262 return v >= 0 ? v : v - CSS_DEACT_BIAS;
263}
264
242/* convenient tests for these bits */ 265/* convenient tests for these bits */
243inline int cgroup_is_removed(const struct cgroup *cgrp) 266inline int cgroup_is_removed(const struct cgroup *cgrp)
244{ 267{
@@ -279,6 +302,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
279#define for_each_active_root(_root) \ 302#define for_each_active_root(_root) \
280list_for_each_entry(_root, &roots, root_list) 303list_for_each_entry(_root, &roots, root_list)
281 304
305static inline struct cgroup *__d_cgrp(struct dentry *dentry)
306{
307 return dentry->d_fsdata;
308}
309
310static inline struct cfent *__d_cfe(struct dentry *dentry)
311{
312 return dentry->d_fsdata;
313}
314
315static inline struct cftype *__d_cft(struct dentry *dentry)
316{
317 return __d_cfe(dentry)->type;
318}
319
282/* the list of cgroups eligible for automatic release. Protected by 320/* the list of cgroups eligible for automatic release. Protected by
283 * release_list_lock */ 321 * release_list_lock */
284static LIST_HEAD(release_list); 322static LIST_HEAD(release_list);
@@ -816,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
816 struct cgroup_subsys *ss; 854 struct cgroup_subsys *ss;
817 int ret = 0; 855 int ret = 0;
818 856
819 for_each_subsys(cgrp->root, ss) 857 for_each_subsys(cgrp->root, ss) {
820 if (ss->pre_destroy) { 858 if (!ss->pre_destroy)
821 ret = ss->pre_destroy(ss, cgrp); 859 continue;
822 if (ret) 860
823 break; 861 ret = ss->pre_destroy(cgrp);
862 if (ret) {
863 /* ->pre_destroy() failure is being deprecated */
864 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
865 break;
824 } 866 }
867 }
825 868
826 return ret; 869 return ret;
827} 870}
@@ -846,7 +889,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
846 * Release the subsystem state objects. 889 * Release the subsystem state objects.
847 */ 890 */
848 for_each_subsys(cgrp->root, ss) 891 for_each_subsys(cgrp->root, ss)
849 ss->destroy(ss, cgrp); 892 ss->destroy(cgrp);
850 893
851 cgrp->root->number_of_cgroups--; 894 cgrp->root->number_of_cgroups--;
852 mutex_unlock(&cgroup_mutex); 895 mutex_unlock(&cgroup_mutex);
@@ -864,6 +907,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
864 BUG_ON(!list_empty(&cgrp->pidlists)); 907 BUG_ON(!list_empty(&cgrp->pidlists));
865 908
866 kfree_rcu(cgrp, rcu_head); 909 kfree_rcu(cgrp, rcu_head);
910 } else {
911 struct cfent *cfe = __d_cfe(dentry);
912 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
913
914 WARN_ONCE(!list_empty(&cfe->node) &&
915 cgrp != &cgrp->root->top_cgroup,
916 "cfe still linked for %s\n", cfe->type->name);
917 kfree(cfe);
867 } 918 }
868 iput(inode); 919 iput(inode);
869} 920}
@@ -882,34 +933,36 @@ static void remove_dir(struct dentry *d)
882 dput(parent); 933 dput(parent);
883} 934}
884 935
885static void cgroup_clear_directory(struct dentry *dentry) 936static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
886{ 937{
887 struct list_head *node; 938 struct cfent *cfe;
888 939
889 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 940 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
890 spin_lock(&dentry->d_lock); 941 lockdep_assert_held(&cgroup_mutex);
891 node = dentry->d_subdirs.next; 942
892 while (node != &dentry->d_subdirs) { 943 list_for_each_entry(cfe, &cgrp->files, node) {
893 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 944 struct dentry *d = cfe->dentry;
894 945
895 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 946 if (cft && cfe->type != cft)
896 list_del_init(node); 947 continue;
897 if (d->d_inode) { 948
898 /* This should never be called on a cgroup 949 dget(d);
899 * directory with child cgroups */ 950 d_delete(d);
900 BUG_ON(d->d_inode->i_mode & S_IFDIR); 951 simple_unlink(d->d_inode, d);
901 dget_dlock(d); 952 list_del_init(&cfe->node);
902 spin_unlock(&d->d_lock); 953 dput(d);
903 spin_unlock(&dentry->d_lock); 954
904 d_delete(d); 955 return 0;
905 simple_unlink(dentry->d_inode, d);
906 dput(d);
907 spin_lock(&dentry->d_lock);
908 } else
909 spin_unlock(&d->d_lock);
910 node = dentry->d_subdirs.next;
911 } 956 }
912 spin_unlock(&dentry->d_lock); 957 return -ENOENT;
958}
959
960static void cgroup_clear_directory(struct dentry *dir)
961{
962 struct cgroup *cgrp = __d_cgrp(dir);
963
964 while (!list_empty(&cgrp->files))
965 cgroup_rm_file(cgrp, NULL);
913} 966}
914 967
915/* 968/*
@@ -1015,7 +1068,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1015 list_move(&ss->sibling, &root->subsys_list); 1068 list_move(&ss->sibling, &root->subsys_list);
1016 ss->root = root; 1069 ss->root = root;
1017 if (ss->bind) 1070 if (ss->bind)
1018 ss->bind(ss, cgrp); 1071 ss->bind(cgrp);
1019 mutex_unlock(&ss->hierarchy_mutex); 1072 mutex_unlock(&ss->hierarchy_mutex);
1020 /* refcount was already taken, and we're keeping it */ 1073 /* refcount was already taken, and we're keeping it */
1021 } else if (bit & removed_bits) { 1074 } else if (bit & removed_bits) {
@@ -1025,7 +1078,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1078 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1026 mutex_lock(&ss->hierarchy_mutex); 1079 mutex_lock(&ss->hierarchy_mutex);
1027 if (ss->bind) 1080 if (ss->bind)
1028 ss->bind(ss, dummytop); 1081 ss->bind(dummytop);
1029 dummytop->subsys[i]->cgroup = dummytop; 1082 dummytop->subsys[i]->cgroup = dummytop;
1030 cgrp->subsys[i] = NULL; 1083 cgrp->subsys[i] = NULL;
1031 subsys[i]->root = &rootnode; 1084 subsys[i]->root = &rootnode;
@@ -1294,6 +1347,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1294 if (ret) 1347 if (ret)
1295 goto out_unlock; 1348 goto out_unlock;
1296 1349
1350 /* See feature-removal-schedule.txt */
1351 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
1352 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1353 task_tgid_nr(current), current->comm);
1354
1297 /* Don't allow flags or name to change at remount */ 1355 /* Don't allow flags or name to change at remount */
1298 if (opts.flags != root->flags || 1356 if (opts.flags != root->flags ||
1299 (opts.name && strcmp(opts.name, root->name))) { 1357 (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1366,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1308 goto out_unlock; 1366 goto out_unlock;
1309 } 1367 }
1310 1368
1311 /* (re)populate subsystem files */ 1369 /* clear out any existing files and repopulate subsystem files */
1370 cgroup_clear_directory(cgrp->dentry);
1312 cgroup_populate_dir(cgrp); 1371 cgroup_populate_dir(cgrp);
1313 1372
1314 if (opts.release_agent) 1373 if (opts.release_agent)
@@ -1333,6 +1392,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1333{ 1392{
1334 INIT_LIST_HEAD(&cgrp->sibling); 1393 INIT_LIST_HEAD(&cgrp->sibling);
1335 INIT_LIST_HEAD(&cgrp->children); 1394 INIT_LIST_HEAD(&cgrp->children);
1395 INIT_LIST_HEAD(&cgrp->files);
1336 INIT_LIST_HEAD(&cgrp->css_sets); 1396 INIT_LIST_HEAD(&cgrp->css_sets);
1337 INIT_LIST_HEAD(&cgrp->release_list); 1397 INIT_LIST_HEAD(&cgrp->release_list);
1338 INIT_LIST_HEAD(&cgrp->pidlists); 1398 INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1404,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1344static void init_cgroup_root(struct cgroupfs_root *root) 1404static void init_cgroup_root(struct cgroupfs_root *root)
1345{ 1405{
1346 struct cgroup *cgrp = &root->top_cgroup; 1406 struct cgroup *cgrp = &root->top_cgroup;
1407
1347 INIT_LIST_HEAD(&root->subsys_list); 1408 INIT_LIST_HEAD(&root->subsys_list);
1348 INIT_LIST_HEAD(&root->root_list); 1409 INIT_LIST_HEAD(&root->root_list);
1410 INIT_LIST_HEAD(&root->allcg_list);
1349 root->number_of_cgroups = 1; 1411 root->number_of_cgroups = 1;
1350 cgrp->root = root; 1412 cgrp->root = root;
1351 cgrp->top_cgroup = cgrp; 1413 cgrp->top_cgroup = cgrp;
1414 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1352 init_cgroup_housekeeping(cgrp); 1415 init_cgroup_housekeeping(cgrp);
1353} 1416}
1354 1417
@@ -1472,7 +1535,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1472 1535
1473 struct inode *inode = 1536 struct inode *inode =
1474 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1537 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1475 struct dentry *dentry;
1476 1538
1477 if (!inode) 1539 if (!inode)
1478 return -ENOMEM; 1540 return -ENOMEM;
@@ -1481,12 +1543,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1481 inode->i_op = &cgroup_dir_inode_operations; 1543 inode->i_op = &cgroup_dir_inode_operations;
1482 /* directories start off with i_nlink == 2 (for "." entry) */ 1544 /* directories start off with i_nlink == 2 (for "." entry) */
1483 inc_nlink(inode); 1545 inc_nlink(inode);
1484 dentry = d_alloc_root(inode); 1546 sb->s_root = d_make_root(inode);
1485 if (!dentry) { 1547 if (!sb->s_root)
1486 iput(inode);
1487 return -ENOMEM; 1548 return -ENOMEM;
1488 }
1489 sb->s_root = dentry;
1490 /* for everything else we want ->d_op set */ 1549 /* for everything else we want ->d_op set */
1491 sb->s_d_op = &cgroup_dops; 1550 sb->s_d_op = &cgroup_dops;
1492 return 0; 1551 return 0;
@@ -1696,16 +1755,6 @@ static struct file_system_type cgroup_fs_type = {
1696 1755
1697static struct kobject *cgroup_kobj; 1756static struct kobject *cgroup_kobj;
1698 1757
1699static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1700{
1701 return dentry->d_fsdata;
1702}
1703
1704static inline struct cftype *__d_cft(struct dentry *dentry)
1705{
1706 return dentry->d_fsdata;
1707}
1708
1709/** 1758/**
1710 * cgroup_path - generate the path of a cgroup 1759 * cgroup_path - generate the path of a cgroup
1711 * @cgrp: the cgroup in question 1760 * @cgrp: the cgroup in question
@@ -1763,6 +1812,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1763struct task_and_cgroup { 1812struct task_and_cgroup {
1764 struct task_struct *task; 1813 struct task_struct *task;
1765 struct cgroup *cgrp; 1814 struct cgroup *cgrp;
1815 struct css_set *cg;
1766}; 1816};
1767 1817
1768struct cgroup_taskset { 1818struct cgroup_taskset {
@@ -1843,11 +1893,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1843 * will already exist. If not set, this function might sleep, and can fail with 1893 * will already exist. If not set, this function might sleep, and can fail with
1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. 1894 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1845 */ 1895 */
1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1896static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1847 struct task_struct *tsk, bool guarantee) 1897 struct task_struct *tsk, struct css_set *newcg)
1848{ 1898{
1849 struct css_set *oldcg; 1899 struct css_set *oldcg;
1850 struct css_set *newcg;
1851 1900
1852 /* 1901 /*
1853 * We are synchronized through threadgroup_lock() against PF_EXITING 1902 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1857,23 +1906,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1857 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1906 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1858 oldcg = tsk->cgroups; 1907 oldcg = tsk->cgroups;
1859 1908
1860 /* locate or allocate a new css_set for this task. */
1861 if (guarantee) {
1862 /* we know the css_set we want already exists. */
1863 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1864 read_lock(&css_set_lock);
1865 newcg = find_existing_css_set(oldcg, cgrp, template);
1866 BUG_ON(!newcg);
1867 get_css_set(newcg);
1868 read_unlock(&css_set_lock);
1869 } else {
1870 might_sleep();
1871 /* find_css_set will give us newcg already referenced. */
1872 newcg = find_css_set(oldcg, cgrp);
1873 if (!newcg)
1874 return -ENOMEM;
1875 }
1876
1877 task_lock(tsk); 1909 task_lock(tsk);
1878 rcu_assign_pointer(tsk->cgroups, newcg); 1910 rcu_assign_pointer(tsk->cgroups, newcg);
1879 task_unlock(tsk); 1911 task_unlock(tsk);
@@ -1892,7 +1924,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1892 put_css_set(oldcg); 1924 put_css_set(oldcg);
1893 1925
1894 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1926 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1895 return 0;
1896} 1927}
1897 1928
1898/** 1929/**
@@ -1905,11 +1936,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1905 */ 1936 */
1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1937int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1907{ 1938{
1908 int retval; 1939 int retval = 0;
1909 struct cgroup_subsys *ss, *failed_ss = NULL; 1940 struct cgroup_subsys *ss, *failed_ss = NULL;
1910 struct cgroup *oldcgrp; 1941 struct cgroup *oldcgrp;
1911 struct cgroupfs_root *root = cgrp->root; 1942 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { }; 1943 struct cgroup_taskset tset = { };
1944 struct css_set *newcg;
1913 1945
1914 /* @tsk either already exited or can't exit until the end */ 1946 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING) 1947 if (tsk->flags & PF_EXITING)
@@ -1925,7 +1957,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1925 1957
1926 for_each_subsys(root, ss) { 1958 for_each_subsys(root, ss) {
1927 if (ss->can_attach) { 1959 if (ss->can_attach) {
1928 retval = ss->can_attach(ss, cgrp, &tset); 1960 retval = ss->can_attach(cgrp, &tset);
1929 if (retval) { 1961 if (retval) {
1930 /* 1962 /*
1931 * Remember on which subsystem the can_attach() 1963 * Remember on which subsystem the can_attach()
@@ -1939,13 +1971,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1939 } 1971 }
1940 } 1972 }
1941 1973
1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1974 newcg = find_css_set(tsk->cgroups, cgrp);
1943 if (retval) 1975 if (!newcg) {
1976 retval = -ENOMEM;
1944 goto out; 1977 goto out;
1978 }
1979
1980 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1945 1981
1946 for_each_subsys(root, ss) { 1982 for_each_subsys(root, ss) {
1947 if (ss->attach) 1983 if (ss->attach)
1948 ss->attach(ss, cgrp, &tset); 1984 ss->attach(cgrp, &tset);
1949 } 1985 }
1950 1986
1951 synchronize_rcu(); 1987 synchronize_rcu();
@@ -1967,7 +2003,7 @@ out:
1967 */ 2003 */
1968 break; 2004 break;
1969 if (ss->cancel_attach) 2005 if (ss->cancel_attach)
1970 ss->cancel_attach(ss, cgrp, &tset); 2006 ss->cancel_attach(cgrp, &tset);
1971 } 2007 }
1972 } 2008 }
1973 return retval; 2009 return retval;
@@ -1997,66 +2033,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1997} 2033}
1998EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2034EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1999 2035
2000/*
2001 * cgroup_attach_proc works in two stages, the first of which prefetches all
2002 * new css_sets needed (to make sure we have enough memory before committing
2003 * to the move) and stores them in a list of entries of the following type.
2004 * TODO: possible optimization: use css_set->rcu_head for chaining instead
2005 */
2006struct cg_list_entry {
2007 struct css_set *cg;
2008 struct list_head links;
2009};
2010
2011static bool css_set_check_fetched(struct cgroup *cgrp,
2012 struct task_struct *tsk, struct css_set *cg,
2013 struct list_head *newcg_list)
2014{
2015 struct css_set *newcg;
2016 struct cg_list_entry *cg_entry;
2017 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
2018
2019 read_lock(&css_set_lock);
2020 newcg = find_existing_css_set(cg, cgrp, template);
2021 read_unlock(&css_set_lock);
2022
2023 /* doesn't exist at all? */
2024 if (!newcg)
2025 return false;
2026 /* see if it's already in the list */
2027 list_for_each_entry(cg_entry, newcg_list, links)
2028 if (cg_entry->cg == newcg)
2029 return true;
2030
2031 /* not found */
2032 return false;
2033}
2034
2035/*
2036 * Find the new css_set and store it in the list in preparation for moving the
2037 * given task to the given cgroup. Returns 0 or -ENOMEM.
2038 */
2039static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
2040 struct list_head *newcg_list)
2041{
2042 struct css_set *newcg;
2043 struct cg_list_entry *cg_entry;
2044
2045 /* ensure a new css_set will exist for this thread */
2046 newcg = find_css_set(cg, cgrp);
2047 if (!newcg)
2048 return -ENOMEM;
2049 /* add it to the list */
2050 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
2051 if (!cg_entry) {
2052 put_css_set(newcg);
2053 return -ENOMEM;
2054 }
2055 cg_entry->cg = newcg;
2056 list_add(&cg_entry->links, newcg_list);
2057 return 0;
2058}
2059
2060/** 2036/**
2061 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 2037 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2062 * @cgrp: the cgroup to attach to 2038 * @cgrp: the cgroup to attach to
@@ -2070,20 +2046,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2070 int retval, i, group_size; 2046 int retval, i, group_size;
2071 struct cgroup_subsys *ss, *failed_ss = NULL; 2047 struct cgroup_subsys *ss, *failed_ss = NULL;
2072 /* guaranteed to be initialized later, but the compiler needs this */ 2048 /* guaranteed to be initialized later, but the compiler needs this */
2073 struct css_set *oldcg;
2074 struct cgroupfs_root *root = cgrp->root; 2049 struct cgroupfs_root *root = cgrp->root;
2075 /* threadgroup list cursor and array */ 2050 /* threadgroup list cursor and array */
2076 struct task_struct *tsk; 2051 struct task_struct *tsk;
2077 struct task_and_cgroup *tc; 2052 struct task_and_cgroup *tc;
2078 struct flex_array *group; 2053 struct flex_array *group;
2079 struct cgroup_taskset tset = { }; 2054 struct cgroup_taskset tset = { };
2080 /*
2081 * we need to make sure we have css_sets for all the tasks we're
2082 * going to move -before- we actually start moving them, so that in
2083 * case we get an ENOMEM we can bail out before making any changes.
2084 */
2085 struct list_head newcg_list;
2086 struct cg_list_entry *cg_entry, *temp_nobe;
2087 2055
2088 /* 2056 /*
2089 * step 0: in order to do expensive, possibly blocking operations for 2057 * step 0: in order to do expensive, possibly blocking operations for
@@ -2102,23 +2070,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2102 if (retval) 2070 if (retval)
2103 goto out_free_group_list; 2071 goto out_free_group_list;
2104 2072
2105 /* prevent changes to the threadgroup list while we take a snapshot. */
2106 read_lock(&tasklist_lock);
2107 if (!thread_group_leader(leader)) {
2108 /*
2109 * a race with de_thread from another thread's exec() may strip
2110 * us of our leadership, making while_each_thread unsafe to use
2111 * on this task. if this happens, there is no choice but to
2112 * throw this task away and try again (from cgroup_procs_write);
2113 * this is "double-double-toil-and-trouble-check locking".
2114 */
2115 read_unlock(&tasklist_lock);
2116 retval = -EAGAIN;
2117 goto out_free_group_list;
2118 }
2119
2120 tsk = leader; 2073 tsk = leader;
2121 i = 0; 2074 i = 0;
2075 /*
2076 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2077 * already PF_EXITING could be freed from underneath us unless we
2078 * take an rcu_read_lock.
2079 */
2080 rcu_read_lock();
2122 do { 2081 do {
2123 struct task_and_cgroup ent; 2082 struct task_and_cgroup ent;
2124 2083
@@ -2128,24 +2087,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2128 2087
2129 /* as per above, nr_threads may decrease, but not increase. */ 2088 /* as per above, nr_threads may decrease, but not increase. */
2130 BUG_ON(i >= group_size); 2089 BUG_ON(i >= group_size);
2131 /*
2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2133 * earlier, but it's good form to communicate our expectations.
2134 */
2135 ent.task = tsk; 2090 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root); 2091 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */ 2092 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp) 2093 if (ent.cgrp == cgrp)
2139 continue; 2094 continue;
2095 /*
2096 * saying GFP_ATOMIC has no effect here because we did prealloc
2097 * earlier, but it's good form to communicate our expectations.
2098 */
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2099 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2141 BUG_ON(retval != 0); 2100 BUG_ON(retval != 0);
2142 i++; 2101 i++;
2143 } while_each_thread(leader, tsk); 2102 } while_each_thread(leader, tsk);
2103 rcu_read_unlock();
2144 /* remember the number of threads in the array for later. */ 2104 /* remember the number of threads in the array for later. */
2145 group_size = i; 2105 group_size = i;
2146 tset.tc_array = group; 2106 tset.tc_array = group;
2147 tset.tc_array_len = group_size; 2107 tset.tc_array_len = group_size;
2148 read_unlock(&tasklist_lock);
2149 2108
2150 /* methods shouldn't be called if no task is actually migrating */ 2109 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0; 2110 retval = 0;
@@ -2157,7 +2116,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2157 */ 2116 */
2158 for_each_subsys(root, ss) { 2117 for_each_subsys(root, ss) {
2159 if (ss->can_attach) { 2118 if (ss->can_attach) {
2160 retval = ss->can_attach(ss, cgrp, &tset); 2119 retval = ss->can_attach(cgrp, &tset);
2161 if (retval) { 2120 if (retval) {
2162 failed_ss = ss; 2121 failed_ss = ss;
2163 goto out_cancel_attach; 2122 goto out_cancel_attach;
@@ -2169,17 +2128,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2169 * step 2: make sure css_sets exist for all threads to be migrated. 2128 * step 2: make sure css_sets exist for all threads to be migrated.
2170 * we use find_css_set, which allocates a new one if necessary. 2129 * we use find_css_set, which allocates a new one if necessary.
2171 */ 2130 */
2172 INIT_LIST_HEAD(&newcg_list);
2173 for (i = 0; i < group_size; i++) { 2131 for (i = 0; i < group_size; i++) {
2174 tc = flex_array_get(group, i); 2132 tc = flex_array_get(group, i);
2175 oldcg = tc->task->cgroups; 2133 tc->cg = find_css_set(tc->task->cgroups, cgrp);
2176 2134 if (!tc->cg) {
2177 /* if we don't already have it in the list get a new one */ 2135 retval = -ENOMEM;
2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg, 2136 goto out_put_css_set_refs;
2179 &newcg_list)) {
2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2181 if (retval)
2182 goto out_list_teardown;
2183 } 2137 }
2184 } 2138 }
2185 2139
@@ -2190,8 +2144,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2190 */ 2144 */
2191 for (i = 0; i < group_size; i++) { 2145 for (i = 0; i < group_size; i++) {
2192 tc = flex_array_get(group, i); 2146 tc = flex_array_get(group, i);
2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); 2147 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
2194 BUG_ON(retval);
2195 } 2148 }
2196 /* nothing is sensitive to fork() after this point. */ 2149 /* nothing is sensitive to fork() after this point. */
2197 2150
@@ -2200,7 +2153,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 */ 2153 */
2201 for_each_subsys(root, ss) { 2154 for_each_subsys(root, ss) {
2202 if (ss->attach) 2155 if (ss->attach)
2203 ss->attach(ss, cgrp, &tset); 2156 ss->attach(cgrp, &tset);
2204 } 2157 }
2205 2158
2206 /* 2159 /*
@@ -2209,21 +2162,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2209 synchronize_rcu(); 2162 synchronize_rcu();
2210 cgroup_wakeup_rmdir_waiter(cgrp); 2163 cgroup_wakeup_rmdir_waiter(cgrp);
2211 retval = 0; 2164 retval = 0;
2212out_list_teardown: 2165out_put_css_set_refs:
2213 /* clean up the list of prefetched css_sets. */ 2166 if (retval) {
2214 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { 2167 for (i = 0; i < group_size; i++) {
2215 list_del(&cg_entry->links); 2168 tc = flex_array_get(group, i);
2216 put_css_set(cg_entry->cg); 2169 if (!tc->cg)
2217 kfree(cg_entry); 2170 break;
2171 put_css_set(tc->cg);
2172 }
2218 } 2173 }
2219out_cancel_attach: 2174out_cancel_attach:
2220 /* same deal as in cgroup_attach_task */
2221 if (retval) { 2175 if (retval) {
2222 for_each_subsys(root, ss) { 2176 for_each_subsys(root, ss) {
2223 if (ss == failed_ss) 2177 if (ss == failed_ss)
2224 break; 2178 break;
2225 if (ss->cancel_attach) 2179 if (ss->cancel_attach)
2226 ss->cancel_attach(ss, cgrp, &tset); 2180 ss->cancel_attach(cgrp, &tset);
2227 } 2181 }
2228 } 2182 }
2229out_free_group_list: 2183out_free_group_list:
@@ -2245,22 +2199,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2245 if (!cgroup_lock_live_group(cgrp)) 2199 if (!cgroup_lock_live_group(cgrp))
2246 return -ENODEV; 2200 return -ENODEV;
2247 2201
2202retry_find_task:
2203 rcu_read_lock();
2248 if (pid) { 2204 if (pid) {
2249 rcu_read_lock();
2250 tsk = find_task_by_vpid(pid); 2205 tsk = find_task_by_vpid(pid);
2251 if (!tsk) { 2206 if (!tsk) {
2252 rcu_read_unlock(); 2207 rcu_read_unlock();
2253 cgroup_unlock(); 2208 ret= -ESRCH;
2254 return -ESRCH; 2209 goto out_unlock_cgroup;
2255 }
2256 if (threadgroup) {
2257 /*
2258 * RCU protects this access, since tsk was found in the
2259 * tid map. a race with de_thread may cause group_leader
2260 * to stop being the leader, but cgroup_attach_proc will
2261 * detect it later.
2262 */
2263 tsk = tsk->group_leader;
2264 } 2210 }
2265 /* 2211 /*
2266 * even if we're attaching all tasks in the thread group, we 2212 * even if we're attaching all tasks in the thread group, we
@@ -2271,29 +2217,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2271 cred->euid != tcred->uid && 2217 cred->euid != tcred->uid &&
2272 cred->euid != tcred->suid) { 2218 cred->euid != tcred->suid) {
2273 rcu_read_unlock(); 2219 rcu_read_unlock();
2274 cgroup_unlock(); 2220 ret = -EACCES;
2275 return -EACCES; 2221 goto out_unlock_cgroup;
2276 } 2222 }
2277 get_task_struct(tsk); 2223 } else
2278 rcu_read_unlock(); 2224 tsk = current;
2279 } else {
2280 if (threadgroup)
2281 tsk = current->group_leader;
2282 else
2283 tsk = current;
2284 get_task_struct(tsk);
2285 }
2286
2287 threadgroup_lock(tsk);
2288 2225
2289 if (threadgroup) 2226 if (threadgroup)
2227 tsk = tsk->group_leader;
2228 get_task_struct(tsk);
2229 rcu_read_unlock();
2230
2231 threadgroup_lock(tsk);
2232 if (threadgroup) {
2233 if (!thread_group_leader(tsk)) {
2234 /*
2235 * a race with de_thread from another thread's exec()
2236 * may strip us of our leadership, if this happens,
2237 * there is no choice but to throw this task away and
2238 * try again; this is
2239 * "double-double-toil-and-trouble-check locking".
2240 */
2241 threadgroup_unlock(tsk);
2242 put_task_struct(tsk);
2243 goto retry_find_task;
2244 }
2290 ret = cgroup_attach_proc(cgrp, tsk); 2245 ret = cgroup_attach_proc(cgrp, tsk);
2291 else 2246 } else
2292 ret = cgroup_attach_task(cgrp, tsk); 2247 ret = cgroup_attach_task(cgrp, tsk);
2293
2294 threadgroup_unlock(tsk); 2248 threadgroup_unlock(tsk);
2295 2249
2296 put_task_struct(tsk); 2250 put_task_struct(tsk);
2251out_unlock_cgroup:
2297 cgroup_unlock(); 2252 cgroup_unlock();
2298 return ret; 2253 return ret;
2299} 2254}
@@ -2305,16 +2260,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2305 2260
2306static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2261static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2307{ 2262{
2308 int ret; 2263 return attach_task_by_pid(cgrp, tgid, true);
2309 do {
2310 /*
2311 * attach_proc fails with -EAGAIN if threadgroup leadership
2312 * changes in the middle of the operation, in which case we need
2313 * to find the task_struct for the new leader and start over.
2314 */
2315 ret = attach_task_by_pid(cgrp, tgid, true);
2316 } while (ret == -EAGAIN);
2317 return ret;
2318} 2264}
2319 2265
2320/** 2266/**
@@ -2710,50 +2656,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2710 return mode; 2656 return mode;
2711} 2657}
2712 2658
2713int cgroup_add_file(struct cgroup *cgrp, 2659static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2714 struct cgroup_subsys *subsys, 2660 const struct cftype *cft)
2715 const struct cftype *cft)
2716{ 2661{
2717 struct dentry *dir = cgrp->dentry; 2662 struct dentry *dir = cgrp->dentry;
2663 struct cgroup *parent = __d_cgrp(dir);
2718 struct dentry *dentry; 2664 struct dentry *dentry;
2665 struct cfent *cfe;
2719 int error; 2666 int error;
2720 umode_t mode; 2667 umode_t mode;
2721
2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2668 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2669
2670 /* does @cft->flags tell us to skip creation on @cgrp? */
2671 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2672 return 0;
2673 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2674 return 0;
2675
2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2676 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2724 strcpy(name, subsys->name); 2677 strcpy(name, subsys->name);
2725 strcat(name, "."); 2678 strcat(name, ".");
2726 } 2679 }
2727 strcat(name, cft->name); 2680 strcat(name, cft->name);
2681
2728 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2682 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2683
2684 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2685 if (!cfe)
2686 return -ENOMEM;
2687
2729 dentry = lookup_one_len(name, dir, strlen(name)); 2688 dentry = lookup_one_len(name, dir, strlen(name));
2730 if (!IS_ERR(dentry)) { 2689 if (IS_ERR(dentry)) {
2731 mode = cgroup_file_mode(cft);
2732 error = cgroup_create_file(dentry, mode | S_IFREG,
2733 cgrp->root->sb);
2734 if (!error)
2735 dentry->d_fsdata = (void *)cft;
2736 dput(dentry);
2737 } else
2738 error = PTR_ERR(dentry); 2690 error = PTR_ERR(dentry);
2691 goto out;
2692 }
2693
2694 mode = cgroup_file_mode(cft);
2695 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2696 if (!error) {
2697 cfe->type = (void *)cft;
2698 cfe->dentry = dentry;
2699 dentry->d_fsdata = cfe;
2700 list_add_tail(&cfe->node, &parent->files);
2701 cfe = NULL;
2702 }
2703 dput(dentry);
2704out:
2705 kfree(cfe);
2739 return error; 2706 return error;
2740} 2707}
2741EXPORT_SYMBOL_GPL(cgroup_add_file);
2742 2708
2743int cgroup_add_files(struct cgroup *cgrp, 2709static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2744 struct cgroup_subsys *subsys, 2710 const struct cftype cfts[], bool is_add)
2745 const struct cftype cft[],
2746 int count)
2747{ 2711{
2748 int i, err; 2712 const struct cftype *cft;
2749 for (i = 0; i < count; i++) { 2713 int err, ret = 0;
2750 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2714
2751 if (err) 2715 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2752 return err; 2716 if (is_add)
2717 err = cgroup_add_file(cgrp, subsys, cft);
2718 else
2719 err = cgroup_rm_file(cgrp, cft);
2720 if (err) {
2721 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2722 is_add ? "add" : "remove", cft->name, err);
2723 ret = err;
2724 }
2725 }
2726 return ret;
2727}
2728
2729static DEFINE_MUTEX(cgroup_cft_mutex);
2730
2731static void cgroup_cfts_prepare(void)
2732 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2733{
2734 /*
2735 * Thanks to the entanglement with vfs inode locking, we can't walk
2736 * the existing cgroups under cgroup_mutex and create files.
2737 * Instead, we increment reference on all cgroups and build list of
2738 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
2739 * exclusive access to the field.
2740 */
2741 mutex_lock(&cgroup_cft_mutex);
2742 mutex_lock(&cgroup_mutex);
2743}
2744
2745static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2746 const struct cftype *cfts, bool is_add)
2747 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2748{
2749 LIST_HEAD(pending);
2750 struct cgroup *cgrp, *n;
2751
2752 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2753 if (cfts && ss->root != &rootnode) {
2754 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2755 dget(cgrp->dentry);
2756 list_add_tail(&cgrp->cft_q_node, &pending);
2757 }
2758 }
2759
2760 mutex_unlock(&cgroup_mutex);
2761
2762 /*
2763 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2764 * files for all cgroups which were created before.
2765 */
2766 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2767 struct inode *inode = cgrp->dentry->d_inode;
2768
2769 mutex_lock(&inode->i_mutex);
2770 mutex_lock(&cgroup_mutex);
2771 if (!cgroup_is_removed(cgrp))
2772 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2773 mutex_unlock(&cgroup_mutex);
2774 mutex_unlock(&inode->i_mutex);
2775
2776 list_del_init(&cgrp->cft_q_node);
2777 dput(cgrp->dentry);
2753 } 2778 }
2779
2780 mutex_unlock(&cgroup_cft_mutex);
2781}
2782
2783/**
2784 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2785 * @ss: target cgroup subsystem
2786 * @cfts: zero-length name terminated array of cftypes
2787 *
2788 * Register @cfts to @ss. Files described by @cfts are created for all
2789 * existing cgroups to which @ss is attached and all future cgroups will
2790 * have them too. This function can be called anytime whether @ss is
2791 * attached or not.
2792 *
2793 * Returns 0 on successful registration, -errno on failure. Note that this
2794 * function currently returns 0 as long as @cfts registration is successful
2795 * even if some file creation attempts on existing cgroups fail.
2796 */
2797int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2798{
2799 struct cftype_set *set;
2800
2801 set = kzalloc(sizeof(*set), GFP_KERNEL);
2802 if (!set)
2803 return -ENOMEM;
2804
2805 cgroup_cfts_prepare();
2806 set->cfts = cfts;
2807 list_add_tail(&set->node, &ss->cftsets);
2808 cgroup_cfts_commit(ss, cfts, true);
2809
2754 return 0; 2810 return 0;
2755} 2811}
2756EXPORT_SYMBOL_GPL(cgroup_add_files); 2812EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2813
2814/**
2815 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2816 * @ss: target cgroup subsystem
2817 * @cfts: zero-length name terminated array of cftypes
2818 *
2819 * Unregister @cfts from @ss. Files described by @cfts are removed from
2820 * all existing cgroups to which @ss is attached and all future cgroups
2821 * won't have them either. This function can be called anytime whether @ss
2822 * is attached or not.
2823 *
2824 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2825 * registered with @ss.
2826 */
2827int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2828{
2829 struct cftype_set *set;
2830
2831 cgroup_cfts_prepare();
2832
2833 list_for_each_entry(set, &ss->cftsets, node) {
2834 if (set->cfts == cfts) {
2835 list_del_init(&set->node);
2836 cgroup_cfts_commit(ss, cfts, false);
2837 return 0;
2838 }
2839 }
2840
2841 cgroup_cfts_commit(ss, NULL, false);
2842 return -ENOENT;
2843}
2757 2844
2758/** 2845/**
2759 * cgroup_task_count - count the number of tasks in a cgroup. 2846 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2804,15 +2891,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
2804 * using their cgroups capability, we don't maintain the lists running 2891 * using their cgroups capability, we don't maintain the lists running
2805 * through each css_set to its tasks until we see the list actually 2892 * through each css_set to its tasks until we see the list actually
2806 * used - in other words after the first call to cgroup_iter_start(). 2893 * used - in other words after the first call to cgroup_iter_start().
2807 *
2808 * The tasklist_lock is not held here, as do_each_thread() and
2809 * while_each_thread() are protected by RCU.
2810 */ 2894 */
2811static void cgroup_enable_task_cg_lists(void) 2895static void cgroup_enable_task_cg_lists(void)
2812{ 2896{
2813 struct task_struct *p, *g; 2897 struct task_struct *p, *g;
2814 write_lock(&css_set_lock); 2898 write_lock(&css_set_lock);
2815 use_task_css_set_links = 1; 2899 use_task_css_set_links = 1;
2900 /*
2901 * We need tasklist_lock because RCU is not safe against
2902 * while_each_thread(). Besides, a forking task that has passed
2903 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2904 * is not guaranteed to have its child immediately visible in the
2905 * tasklist if we walk through it with RCU.
2906 */
2907 read_lock(&tasklist_lock);
2816 do_each_thread(g, p) { 2908 do_each_thread(g, p) {
2817 task_lock(p); 2909 task_lock(p);
2818 /* 2910 /*
@@ -2824,6 +2916,7 @@ static void cgroup_enable_task_cg_lists(void)
2824 list_add(&p->cg_list, &p->cgroups->tasks); 2916 list_add(&p->cg_list, &p->cgroups->tasks);
2825 task_unlock(p); 2917 task_unlock(p);
2826 } while_each_thread(g, p); 2918 } while_each_thread(g, p);
2919 read_unlock(&tasklist_lock);
2827 write_unlock(&css_set_lock); 2920 write_unlock(&css_set_lock);
2828} 2921}
2829 2922
@@ -3043,6 +3136,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3043 * 3136 *
3044 */ 3137 */
3045 3138
3139/* which pidlist file are we talking about? */
3140enum cgroup_filetype {
3141 CGROUP_FILE_PROCS,
3142 CGROUP_FILE_TASKS,
3143};
3144
3145/*
3146 * A pidlist is a list of pids that virtually represents the contents of one
3147 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3148 * a pair (one each for procs, tasks) for each pid namespace that's relevant
3149 * to the cgroup.
3150 */
3151struct cgroup_pidlist {
3152 /*
3153 * used to find which pidlist is wanted. doesn't change as long as
3154 * this particular list stays in the list.
3155 */
3156 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3157 /* array of xids */
3158 pid_t *list;
3159 /* how many elements the above list has */
3160 int length;
3161 /* how many files are using the current array */
3162 int use_count;
3163 /* each of these stored in a list by its cgroup */
3164 struct list_head links;
3165 /* pointer to the cgroup we belong to, for list removal purposes */
3166 struct cgroup *owner;
3167 /* protects the other fields */
3168 struct rw_semaphore mutex;
3169};
3170
3046/* 3171/*
3047 * The following two functions "fix" the issue where there are more pids 3172 * The following two functions "fix" the issue where there are more pids
3048 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3173 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3694,13 +3819,14 @@ static struct cftype files[] = {
3694 .read_u64 = cgroup_clone_children_read, 3819 .read_u64 = cgroup_clone_children_read,
3695 .write_u64 = cgroup_clone_children_write, 3820 .write_u64 = cgroup_clone_children_write,
3696 }, 3821 },
3697}; 3822 {
3698 3823 .name = "release_agent",
3699static struct cftype cft_release_agent = { 3824 .flags = CFTYPE_ONLY_ON_ROOT,
3700 .name = "release_agent", 3825 .read_seq_string = cgroup_release_agent_show,
3701 .read_seq_string = cgroup_release_agent_show, 3826 .write_string = cgroup_release_agent_write,
3702 .write_string = cgroup_release_agent_write, 3827 .max_write_len = PATH_MAX,
3703 .max_write_len = PATH_MAX, 3828 },
3829 { } /* terminate */
3704}; 3830};
3705 3831
3706static int cgroup_populate_dir(struct cgroup *cgrp) 3832static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3708,22 +3834,21 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3708 int err; 3834 int err;
3709 struct cgroup_subsys *ss; 3835 struct cgroup_subsys *ss;
3710 3836
3711 /* First clear out any existing files */ 3837 err = cgroup_addrm_files(cgrp, NULL, files, true);
3712 cgroup_clear_directory(cgrp->dentry);
3713
3714 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3715 if (err < 0) 3838 if (err < 0)
3716 return err; 3839 return err;
3717 3840
3718 if (cgrp == cgrp->top_cgroup) { 3841 /* process cftsets of each subsystem */
3719 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3720 return err;
3721 }
3722
3723 for_each_subsys(cgrp->root, ss) { 3842 for_each_subsys(cgrp->root, ss) {
3843 struct cftype_set *set;
3844
3724 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3845 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3725 return err; 3846 return err;
3847
3848 list_for_each_entry(set, &ss->cftsets, node)
3849 cgroup_addrm_files(cgrp, ss, set->cfts, true);
3726 } 3850 }
3851
3727 /* This cgroup is ready now */ 3852 /* This cgroup is ready now */
3728 for_each_subsys(cgrp->root, ss) { 3853 for_each_subsys(cgrp->root, ss) {
3729 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3854 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3739,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3739 return 0; 3864 return 0;
3740} 3865}
3741 3866
3867static void css_dput_fn(struct work_struct *work)
3868{
3869 struct cgroup_subsys_state *css =
3870 container_of(work, struct cgroup_subsys_state, dput_work);
3871
3872 dput(css->cgroup->dentry);
3873}
3874
3742static void init_cgroup_css(struct cgroup_subsys_state *css, 3875static void init_cgroup_css(struct cgroup_subsys_state *css,
3743 struct cgroup_subsys *ss, 3876 struct cgroup_subsys *ss,
3744 struct cgroup *cgrp) 3877 struct cgroup *cgrp)
@@ -3751,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3751 set_bit(CSS_ROOT, &css->flags); 3884 set_bit(CSS_ROOT, &css->flags);
3752 BUG_ON(cgrp->subsys[ss->subsys_id]); 3885 BUG_ON(cgrp->subsys[ss->subsys_id]);
3753 cgrp->subsys[ss->subsys_id] = css; 3886 cgrp->subsys[ss->subsys_id] = css;
3887
3888 /*
3889 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
3890 * which is put on the last css_put(). dput() requires process
3891 * context, which css_put() may be called without. @css->dput_work
3892 * will be used to invoke dput() asynchronously from css_put().
3893 */
3894 INIT_WORK(&css->dput_work, css_dput_fn);
3895 if (ss->__DEPRECATED_clear_css_refs)
3896 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3754} 3897}
3755 3898
3756static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3899static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3827,7 +3970,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3827 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3970 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3828 3971
3829 for_each_subsys(root, ss) { 3972 for_each_subsys(root, ss) {
3830 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3973 struct cgroup_subsys_state *css = ss->create(cgrp);
3831 3974
3832 if (IS_ERR(css)) { 3975 if (IS_ERR(css)) {
3833 err = PTR_ERR(css); 3976 err = PTR_ERR(css);
@@ -3841,7 +3984,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3841 } 3984 }
3842 /* At error, ->destroy() callback has to free assigned ID. */ 3985 /* At error, ->destroy() callback has to free assigned ID. */
3843 if (clone_children(parent) && ss->post_clone) 3986 if (clone_children(parent) && ss->post_clone)
3844 ss->post_clone(ss, cgrp); 3987 ss->post_clone(cgrp);
3845 } 3988 }
3846 3989
3847 cgroup_lock_hierarchy(root); 3990 cgroup_lock_hierarchy(root);
@@ -3853,9 +3996,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3853 if (err < 0) 3996 if (err < 0)
3854 goto err_remove; 3997 goto err_remove;
3855 3998
3999 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4000 for_each_subsys(root, ss)
4001 if (!ss->__DEPRECATED_clear_css_refs)
4002 dget(dentry);
4003
3856 /* The cgroup directory was pre-locked for us */ 4004 /* The cgroup directory was pre-locked for us */
3857 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4005 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3858 4006
4007 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4008
3859 err = cgroup_populate_dir(cgrp); 4009 err = cgroup_populate_dir(cgrp);
3860 /* If err < 0, we have a half-filled directory - oh well ;) */ 4010 /* If err < 0, we have a half-filled directory - oh well ;) */
3861 4011
@@ -3875,7 +4025,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3875 4025
3876 for_each_subsys(root, ss) { 4026 for_each_subsys(root, ss) {
3877 if (cgrp->subsys[ss->subsys_id]) 4027 if (cgrp->subsys[ss->subsys_id])
3878 ss->destroy(ss, cgrp); 4028 ss->destroy(cgrp);
3879 } 4029 }
3880 4030
3881 mutex_unlock(&cgroup_mutex); 4031 mutex_unlock(&cgroup_mutex);
@@ -3895,18 +4045,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3895 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4045 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3896} 4046}
3897 4047
4048/*
4049 * Check the reference count on each subsystem. Since we already
4050 * established that there are no tasks in the cgroup, if the css refcount
4051 * is also 1, then there should be no outstanding references, so the
4052 * subsystem is safe to destroy. We scan across all subsystems rather than
4053 * using the per-hierarchy linked list of mounted subsystems since we can
4054 * be called via check_for_release() with no synchronization other than
4055 * RCU, and the subsystem linked list isn't RCU-safe.
4056 */
3898static int cgroup_has_css_refs(struct cgroup *cgrp) 4057static int cgroup_has_css_refs(struct cgroup *cgrp)
3899{ 4058{
3900 /* Check the reference count on each subsystem. Since we
3901 * already established that there are no tasks in the
3902 * cgroup, if the css refcount is also 1, then there should
3903 * be no outstanding references, so the subsystem is safe to
3904 * destroy. We scan across all subsystems rather than using
3905 * the per-hierarchy linked list of mounted subsystems since
3906 * we can be called via check_for_release() with no
3907 * synchronization other than RCU, and the subsystem linked
3908 * list isn't RCU-safe */
3909 int i; 4059 int i;
4060
3910 /* 4061 /*
3911 * We won't need to lock the subsys array, because the subsystems 4062 * We won't need to lock the subsys array, because the subsystems
3912 * we're concerned about aren't going anywhere since our cgroup root 4063 * we're concerned about aren't going anywhere since our cgroup root
@@ -3915,17 +4066,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3915 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4066 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3916 struct cgroup_subsys *ss = subsys[i]; 4067 struct cgroup_subsys *ss = subsys[i];
3917 struct cgroup_subsys_state *css; 4068 struct cgroup_subsys_state *css;
4069
3918 /* Skip subsystems not present or not in this hierarchy */ 4070 /* Skip subsystems not present or not in this hierarchy */
3919 if (ss == NULL || ss->root != cgrp->root) 4071 if (ss == NULL || ss->root != cgrp->root)
3920 continue; 4072 continue;
4073
3921 css = cgrp->subsys[ss->subsys_id]; 4074 css = cgrp->subsys[ss->subsys_id];
3922 /* When called from check_for_release() it's possible 4075 /*
4076 * When called from check_for_release() it's possible
3923 * that by this point the cgroup has been removed 4077 * that by this point the cgroup has been removed
3924 * and the css deleted. But a false-positive doesn't 4078 * and the css deleted. But a false-positive doesn't
3925 * matter, since it can only happen if the cgroup 4079 * matter, since it can only happen if the cgroup
3926 * has been deleted and hence no longer needs the 4080 * has been deleted and hence no longer needs the
3927 * release agent to be called anyway. */ 4081 * release agent to be called anyway.
3928 if (css && (atomic_read(&css->refcnt) > 1)) 4082 */
4083 if (css && css_refcnt(css) > 1)
3929 return 1; 4084 return 1;
3930 } 4085 }
3931 return 0; 4086 return 0;
@@ -3935,51 +4090,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3935 * Atomically mark all (or else none) of the cgroup's CSS objects as 4090 * Atomically mark all (or else none) of the cgroup's CSS objects as
3936 * CSS_REMOVED. Return true on success, or false if the cgroup has 4091 * CSS_REMOVED. Return true on success, or false if the cgroup has
3937 * busy subsystems. Call with cgroup_mutex held 4092 * busy subsystems. Call with cgroup_mutex held
4093 *
4094 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4095 * not, cgroup removal behaves differently.
4096 *
4097 * If clear is set, css refcnt for the subsystem should be zero before
4098 * cgroup removal can be committed. This is implemented by
4099 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4100 * called multiple times until all css refcnts reach zero and is allowed to
4101 * veto removal on any invocation. This behavior is deprecated and will be
4102 * removed as soon as the existing user (memcg) is updated.
4103 *
4104 * If clear is not set, each css holds an extra reference to the cgroup's
4105 * dentry and cgroup removal proceeds regardless of css refs.
4106 * ->pre_destroy() will be called at least once and is not allowed to fail.
4107 * On the last put of each css, whenever that may be, the extra dentry ref
4108 * is put so that dentry destruction happens only after all css's are
4109 * released.
3938 */ 4110 */
3939
3940static int cgroup_clear_css_refs(struct cgroup *cgrp) 4111static int cgroup_clear_css_refs(struct cgroup *cgrp)
3941{ 4112{
3942 struct cgroup_subsys *ss; 4113 struct cgroup_subsys *ss;
3943 unsigned long flags; 4114 unsigned long flags;
3944 bool failed = false; 4115 bool failed = false;
4116
3945 local_irq_save(flags); 4117 local_irq_save(flags);
4118
4119 /*
4120 * Block new css_tryget() by deactivating refcnt. If all refcnts
4121 * for subsystems w/ clear_css_refs set were 1 at the moment of
4122 * deactivation, we succeeded.
4123 */
3946 for_each_subsys(cgrp->root, ss) { 4124 for_each_subsys(cgrp->root, ss) {
3947 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4125 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3948 int refcnt; 4126
3949 while (1) { 4127 WARN_ON(atomic_read(&css->refcnt) < 0);
3950 /* We can only remove a CSS with a refcnt==1 */ 4128 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
3951 refcnt = atomic_read(&css->refcnt); 4129
3952 if (refcnt > 1) { 4130 if (ss->__DEPRECATED_clear_css_refs)
3953 failed = true; 4131 failed |= css_refcnt(css) != 1;
3954 goto done;
3955 }
3956 BUG_ON(!refcnt);
3957 /*
3958 * Drop the refcnt to 0 while we check other
3959 * subsystems. This will cause any racing
3960 * css_tryget() to spin until we set the
3961 * CSS_REMOVED bits or abort
3962 */
3963 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3964 break;
3965 cpu_relax();
3966 }
3967 } 4132 }
3968 done: 4133
4134 /*
4135 * If succeeded, set REMOVED and put all the base refs; otherwise,
4136 * restore refcnts to positive values. Either way, all in-progress
4137 * css_tryget() will be released.
4138 */
3969 for_each_subsys(cgrp->root, ss) { 4139 for_each_subsys(cgrp->root, ss) {
3970 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4140 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3971 if (failed) { 4141
3972 /* 4142 if (!failed) {
3973 * Restore old refcnt if we previously managed
3974 * to clear it from 1 to 0
3975 */
3976 if (!atomic_read(&css->refcnt))
3977 atomic_set(&css->refcnt, 1);
3978 } else {
3979 /* Commit the fact that the CSS is removed */
3980 set_bit(CSS_REMOVED, &css->flags); 4143 set_bit(CSS_REMOVED, &css->flags);
4144 css_put(css);
4145 } else {
4146 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
3981 } 4147 }
3982 } 4148 }
4149
3983 local_irq_restore(flags); 4150 local_irq_restore(flags);
3984 return !failed; 4151 return !failed;
3985} 4152}
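
A rough userspace analogue of the deactivation-bias scheme introduced above (not the kernel code; DEACT_BIAS, obj_refcnt and try_deactivate are invented names): adding a large negative bias atomically blocks further gets while the pre-bias count can still be read back, and rolling back is a plain subtraction instead of the old cmpxchg-to-zero loop.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEACT_BIAS (INT_MIN / 2)    /* marks the counter as deactivated */

static atomic_int refcnt = 1;       /* base reference, as for a live css */

/* Effective count with any deactivation bias masked off. */
static int obj_refcnt(void)
{
    int v = atomic_load(&refcnt);

    return v >= 0 ? v : v - DEACT_BIAS;
}

/* Commit removal only if the base reference was the sole reference. */
static bool try_deactivate(void)
{
    atomic_fetch_add(&refcnt, DEACT_BIAS);    /* block new trygets */
    if (obj_refcnt() == 1)
        return true;                          /* caller marks REMOVED, drops base ref */
    atomic_fetch_sub(&refcnt, DEACT_BIAS);    /* roll the bias back */
    return false;
}

int main(void)
{
    printf("removable: %d\n", try_deactivate());
    return 0;
}
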
@@ -4064,6 +4231,8 @@ again:
4064 list_del_init(&cgrp->sibling); 4231 list_del_init(&cgrp->sibling);
4065 cgroup_unlock_hierarchy(cgrp->root); 4232 cgroup_unlock_hierarchy(cgrp->root);
4066 4233
4234 list_del_init(&cgrp->allcg_node);
4235
4067 d = dget(cgrp->dentry); 4236 d = dget(cgrp->dentry);
4068 4237
4069 cgroup_d_remove_dir(d); 4238 cgroup_d_remove_dir(d);
@@ -4090,16 +4259,33 @@ again:
4090 return 0; 4259 return 0;
4091} 4260}
4092 4261
4262static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4263{
4264 INIT_LIST_HEAD(&ss->cftsets);
4265
4266 /*
4267 * base_cftset is embedded in subsys itself, no need to worry about
4268 * deregistration.
4269 */
4270 if (ss->base_cftypes) {
4271 ss->base_cftset.cfts = ss->base_cftypes;
4272 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4273 }
4274}
4275
4093static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4276static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4094{ 4277{
4095 struct cgroup_subsys_state *css; 4278 struct cgroup_subsys_state *css;
4096 4279
4097 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4280 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4098 4281
4282 /* init base cftset */
4283 cgroup_init_cftsets(ss);
4284
4099 /* Create the top cgroup state for this subsystem */ 4285 /* Create the top cgroup state for this subsystem */
4100 list_add(&ss->sibling, &rootnode.subsys_list); 4286 list_add(&ss->sibling, &rootnode.subsys_list);
4101 ss->root = &rootnode; 4287 ss->root = &rootnode;
4102 css = ss->create(ss, dummytop); 4288 css = ss->create(dummytop);
4103 /* We don't handle early failures gracefully */ 4289 /* We don't handle early failures gracefully */
4104 BUG_ON(IS_ERR(css)); 4290 BUG_ON(IS_ERR(css));
4105 init_cgroup_css(css, ss, dummytop); 4291 init_cgroup_css(css, ss, dummytop);
@@ -4165,6 +4351,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4165 return 0; 4351 return 0;
4166 } 4352 }
4167 4353
4354 /* init base cftset */
4355 cgroup_init_cftsets(ss);
4356
4168 /* 4357 /*
4169 * need to register a subsys id before anything else - for example, 4358 * need to register a subsys id before anything else - for example,
4170 * init_cgroup_css needs it. 4359 * init_cgroup_css needs it.
@@ -4188,7 +4377,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4188 * no ss->create seems to need anything important in the ss struct, so 4377 * no ss->create seems to need anything important in the ss struct, so
4189 * this can happen first (i.e. before the rootnode attachment). 4378 * this can happen first (i.e. before the rootnode attachment).
4190 */ 4379 */
4191 css = ss->create(ss, dummytop); 4380 css = ss->create(dummytop);
4192 if (IS_ERR(css)) { 4381 if (IS_ERR(css)) {
4193 /* failure case - need to deassign the subsys[] slot. */ 4382 /* failure case - need to deassign the subsys[] slot. */
4194 subsys[i] = NULL; 4383 subsys[i] = NULL;
@@ -4206,7 +4395,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4206 int ret = cgroup_init_idr(ss, css); 4395 int ret = cgroup_init_idr(ss, css);
4207 if (ret) { 4396 if (ret) {
4208 dummytop->subsys[ss->subsys_id] = NULL; 4397 dummytop->subsys[ss->subsys_id] = NULL;
4209 ss->destroy(ss, dummytop); 4398 ss->destroy(dummytop);
4210 subsys[i] = NULL; 4399 subsys[i] = NULL;
4211 mutex_unlock(&cgroup_mutex); 4400 mutex_unlock(&cgroup_mutex);
4212 return ret; 4401 return ret;
@@ -4304,7 +4493,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4304 * pointer to find their state. note that this also takes care of 4493 * pointer to find their state. note that this also takes care of
4305 * freeing the css_id. 4494 * freeing the css_id.
4306 */ 4495 */
4307 ss->destroy(ss, dummytop); 4496 ss->destroy(dummytop);
4308 dummytop->subsys[ss->subsys_id] = NULL; 4497 dummytop->subsys[ss->subsys_id] = NULL;
4309 4498
4310 mutex_unlock(&cgroup_mutex); 4499 mutex_unlock(&cgroup_mutex);
@@ -4580,7 +4769,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
4580 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4769 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4581 struct cgroup_subsys *ss = subsys[i]; 4770 struct cgroup_subsys *ss = subsys[i];
4582 if (ss->fork) 4771 if (ss->fork)
4583 ss->fork(ss, child); 4772 ss->fork(child);
4584 } 4773 }
4585 } 4774 }
4586} 4775}
@@ -4596,6 +4785,17 @@ void cgroup_fork_callbacks(struct task_struct *child)
4596 */ 4785 */
4597void cgroup_post_fork(struct task_struct *child) 4786void cgroup_post_fork(struct task_struct *child)
4598{ 4787{
4788 /*
4789 * use_task_css_set_links is set to 1 before we walk the tasklist
4790 * under the tasklist_lock and we read it here after we added the child
4791 * to the tasklist under the tasklist_lock as well. If the child wasn't
4792 * yet in the tasklist when we walked through it from
4793 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
4794 * should be visible now due to the paired locking and barriers implied
4795 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
4796 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
4797 * lock on fork.
4798 */
4599 if (use_task_css_set_links) { 4799 if (use_task_css_set_links) {
4600 write_lock(&css_set_lock); 4800 write_lock(&css_set_lock);
4601 if (list_empty(&child->cg_list)) { 4801 if (list_empty(&child->cg_list)) {
@@ -4682,7 +4882,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4682 struct cgroup *old_cgrp = 4882 struct cgroup *old_cgrp =
4683 rcu_dereference_raw(cg->subsys[i])->cgroup; 4883 rcu_dereference_raw(cg->subsys[i])->cgroup;
4684 struct cgroup *cgrp = task_cgroup(tsk, i); 4884 struct cgroup *cgrp = task_cgroup(tsk, i);
4685 ss->exit(ss, cgrp, old_cgrp, tsk); 4885 ss->exit(cgrp, old_cgrp, tsk);
4686 } 4886 }
4687 } 4887 }
4688 } 4888 }
@@ -4743,21 +4943,41 @@ static void check_for_release(struct cgroup *cgrp)
4743} 4943}
4744 4944
4745/* Caller must verify that the css is not for root cgroup */ 4945/* Caller must verify that the css is not for root cgroup */
4746void __css_put(struct cgroup_subsys_state *css, int count) 4946bool __css_tryget(struct cgroup_subsys_state *css)
4947{
4948 do {
4949 int v = css_refcnt(css);
4950
4951 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
4952 return true;
4953 cpu_relax();
4954 } while (!test_bit(CSS_REMOVED, &css->flags));
4955
4956 return false;
4957}
4958EXPORT_SYMBOL_GPL(__css_tryget);
4959
4960/* Caller must verify that the css is not for root cgroup */
4961void __css_put(struct cgroup_subsys_state *css)
4747{ 4962{
4748 struct cgroup *cgrp = css->cgroup; 4963 struct cgroup *cgrp = css->cgroup;
4749 int val; 4964
4750 rcu_read_lock(); 4965 rcu_read_lock();
4751 val = atomic_sub_return(count, &css->refcnt); 4966 atomic_dec(&css->refcnt);
4752 if (val == 1) { 4967 switch (css_refcnt(css)) {
4968 case 1:
4753 if (notify_on_release(cgrp)) { 4969 if (notify_on_release(cgrp)) {
4754 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4970 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4755 check_for_release(cgrp); 4971 check_for_release(cgrp);
4756 } 4972 }
4757 cgroup_wakeup_rmdir_waiter(cgrp); 4973 cgroup_wakeup_rmdir_waiter(cgrp);
4974 break;
4975 case 0:
4976 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
4977 schedule_work(&css->dput_work);
4978 break;
4758 } 4979 }
4759 rcu_read_unlock(); 4980 rcu_read_unlock();
4760 WARN_ON_ONCE(val < 1);
4761} 4981}
4762EXPORT_SYMBOL_GPL(__css_put); 4982EXPORT_SYMBOL_GPL(__css_put);
4763 4983
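
A small sketch of the tryget pattern that __css_tryget() above implements, again in userspace C with made-up names: the expected value is always the bias-masked count, so while the counter is deactivated the compare-and-swap can never succeed and the loop spins until the removal flag appears.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

#define DEACT_BIAS (INT_MIN / 2)

static atomic_int refcnt = 1;
static atomic_bool removed;

static int effective(int raw)
{
    return raw >= 0 ? raw : raw - DEACT_BIAS;
}

/* Take a reference only while the object is still live. */
static bool obj_tryget(void)
{
    do {
        int v = effective(atomic_load(&refcnt));

        if (atomic_compare_exchange_strong(&refcnt, &v, v + 1))
            return true;
        /* lost a race, or the count is deactivated: retry */
    } while (!atomic_load(&removed));

    return false;
}

int main(void)
{
    return obj_tryget() ? 0 : 1;
}
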
@@ -4876,7 +5096,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4876 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5096 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4877 * it's unchanged until freed. 5097 * it's unchanged until freed.
4878 */ 5098 */
4879 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5099 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4880 5100
4881 if (cssid) 5101 if (cssid)
4882 return cssid->id; 5102 return cssid->id;
@@ -4888,7 +5108,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4888{ 5108{
4889 struct css_id *cssid; 5109 struct css_id *cssid;
4890 5110
4891 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5111 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4892 5112
4893 if (cssid) 5113 if (cssid)
4894 return cssid->depth; 5114 return cssid->depth;
@@ -4939,9 +5159,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4939 5159
4940 rcu_assign_pointer(id->css, NULL); 5160 rcu_assign_pointer(id->css, NULL);
4941 rcu_assign_pointer(css->id, NULL); 5161 rcu_assign_pointer(css->id, NULL);
4942 write_lock(&ss->id_lock); 5162 spin_lock(&ss->id_lock);
4943 idr_remove(&ss->idr, id->id); 5163 idr_remove(&ss->idr, id->id);
4944 write_unlock(&ss->id_lock); 5164 spin_unlock(&ss->id_lock);
4945 kfree_rcu(id, rcu_head); 5165 kfree_rcu(id, rcu_head);
4946} 5166}
4947EXPORT_SYMBOL_GPL(free_css_id); 5167EXPORT_SYMBOL_GPL(free_css_id);
@@ -4967,10 +5187,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4967 error = -ENOMEM; 5187 error = -ENOMEM;
4968 goto err_out; 5188 goto err_out;
4969 } 5189 }
4970 write_lock(&ss->id_lock); 5190 spin_lock(&ss->id_lock);
4971 /* Don't use 0. allocates an ID of 1-65535 */ 5191 /* Don't use 0. allocates an ID of 1-65535 */
4972 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 5192 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4973 write_unlock(&ss->id_lock); 5193 spin_unlock(&ss->id_lock);
4974 5194
4975 /* Returns error when there are no free spaces for new ID.*/ 5195 /* Returns error when there are no free spaces for new ID.*/
4976 if (error) { 5196 if (error) {
@@ -4985,9 +5205,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4985 return newid; 5205 return newid;
4986remove_idr: 5206remove_idr:
4987 error = -ENOSPC; 5207 error = -ENOSPC;
4988 write_lock(&ss->id_lock); 5208 spin_lock(&ss->id_lock);
4989 idr_remove(&ss->idr, myid); 5209 idr_remove(&ss->idr, myid);
4990 write_unlock(&ss->id_lock); 5210 spin_unlock(&ss->id_lock);
4991err_out: 5211err_out:
4992 kfree(newid); 5212 kfree(newid);
4993 return ERR_PTR(error); 5213 return ERR_PTR(error);
@@ -4999,7 +5219,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4999{ 5219{
5000 struct css_id *newid; 5220 struct css_id *newid;
5001 5221
5002 rwlock_init(&ss->id_lock); 5222 spin_lock_init(&ss->id_lock);
5003 idr_init(&ss->idr); 5223 idr_init(&ss->idr);
5004 5224
5005 newid = get_new_cssid(ss, 0); 5225 newid = get_new_cssid(ss, 0);
@@ -5087,6 +5307,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
5087 return NULL; 5307 return NULL;
5088 5308
5089 BUG_ON(!ss->use_id); 5309 BUG_ON(!ss->use_id);
5310 WARN_ON_ONCE(!rcu_read_lock_held());
5311
5090 /* fill start point for scan */ 5312 /* fill start point for scan */
5091 tmpid = id; 5313 tmpid = id;
5092 while (1) { 5314 while (1) {
@@ -5094,10 +5316,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
5094 * scan next entry from bitmap(tree), tmpid is updated after 5316 * scan next entry from bitmap(tree), tmpid is updated after
5095 * idr_get_next(). 5317 * idr_get_next().
5096 */ 5318 */
5097 read_lock(&ss->id_lock);
5098 tmp = idr_get_next(&ss->idr, &tmpid); 5319 tmp = idr_get_next(&ss->idr, &tmpid);
5099 read_unlock(&ss->id_lock);
5100
5101 if (!tmp) 5320 if (!tmp)
5102 break; 5321 break;
5103 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5322 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
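
The id_lock changes above follow a common split: the lock now only serializes updaters, while lookups run under RCU with no lock at all. A loose userspace analogue (a pthread mutex and C11 atomics stand in for the spinlock and RCU; publish, lookup and slots are invented names, and object reclamation, i.e. the grace-period side of RCU, is ignored):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

#define NSLOTS 64

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(void *) slots[NSLOTS];

/* Updaters serialize against each other and publish with a release store. */
static int publish(int id, void *obj)
{
    if (id < 0 || id >= NSLOTS)
        return -1;
    pthread_mutex_lock(&update_lock);
    atomic_store_explicit(&slots[id], obj, memory_order_release);
    pthread_mutex_unlock(&update_lock);
    return 0;
}

/* Lookups take no lock; the acquire load pairs with the release store. */
static void *lookup(int id)
{
    if (id < 0 || id >= NSLOTS)
        return NULL;
    return atomic_load_explicit(&slots[id], memory_order_acquire);
}

int main(void)
{
    static int answer = 42;

    publish(3, &answer);
    return lookup(3) == &answer ? 0 : 1;
}
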
@@ -5137,8 +5356,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5137} 5356}
5138 5357
5139#ifdef CONFIG_CGROUP_DEBUG 5358#ifdef CONFIG_CGROUP_DEBUG
5140static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5359static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5141 struct cgroup *cont)
5142{ 5360{
5143 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5361 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5144 5362
@@ -5148,7 +5366,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5148 return css; 5366 return css;
5149} 5367}
5150 5368
5151static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 5369static void debug_destroy(struct cgroup *cont)
5152{ 5370{
5153 kfree(cont->subsys[debug_subsys_id]); 5371 kfree(cont->subsys[debug_subsys_id]);
5154} 5372}
@@ -5271,19 +5489,15 @@ static struct cftype debug_files[] = {
5271 .name = "releasable", 5489 .name = "releasable",
5272 .read_u64 = releasable_read, 5490 .read_u64 = releasable_read,
5273 }, 5491 },
5274};
5275 5492
5276static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5493 { } /* terminate */
5277{ 5494};
5278 return cgroup_add_files(cont, ss, debug_files,
5279 ARRAY_SIZE(debug_files));
5280}
5281 5495
5282struct cgroup_subsys debug_subsys = { 5496struct cgroup_subsys debug_subsys = {
5283 .name = "debug", 5497 .name = "debug",
5284 .create = debug_create, 5498 .create = debug_create,
5285 .destroy = debug_destroy, 5499 .destroy = debug_destroy,
5286 .populate = debug_populate,
5287 .subsys_id = debug_subsys_id, 5500 .subsys_id = debug_subsys_id,
5501 .base_cftypes = debug_files,
5288}; 5502};
5289#endif /* CONFIG_CGROUP_DEBUG */ 5503#endif /* CONFIG_CGROUP_DEBUG */
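
The conversion above drops the ->populate() callback and the explicit ARRAY_SIZE() in favour of a sentinel-terminated cftype array hung off ->base_cftypes. A minimal userspace sketch of that registration idiom (struct ftype, register_files and the entries are illustrative only; the kernel detects the terminator by an empty name):

#include <stdio.h>

struct ftype {
    const char *name;    /* NULL name terminates the array */
    int flags;
};

static const struct ftype debug_like_files[] = {
    { .name = "taskcount" },
    { .name = "releasable" },
    { NULL }             /* terminator, like the { } cftype entry */
};

/* Walk until the sentinel instead of passing an element count around. */
static void register_files(const struct ftype *cft)
{
    for (; cft->name; cft++)
        printf("registering %s\n", cft->name);
}

int main(void)
{
    register_files(debug_like_files);
    return 0;
}
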
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b78a64..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys;
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock 129 * sighand->siglock
130 */ 130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132 struct cgroup *cgroup)
133{ 132{
134 struct freezer *freezer; 133 struct freezer *freezer;
135 134
@@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
142 return &freezer->css; 141 return &freezer->css;
143} 142}
144 143
145static void freezer_destroy(struct cgroup_subsys *ss, 144static void freezer_destroy(struct cgroup *cgroup)
146 struct cgroup *cgroup)
147{ 145{
148 struct freezer *freezer = cgroup_freezer(cgroup); 146 struct freezer *freezer = cgroup_freezer(cgroup);
149 147
@@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
164 * a write to that file racing against an attach, and hence the 162 * a write to that file racing against an attach, and hence the
165 * can_attach() result will remain valid until the attach completes. 163 * can_attach() result will remain valid until the attach completes.
166 */ 164 */
167static int freezer_can_attach(struct cgroup_subsys *ss, 165static int freezer_can_attach(struct cgroup *new_cgroup,
168 struct cgroup *new_cgroup,
169 struct cgroup_taskset *tset) 166 struct cgroup_taskset *tset)
170{ 167{
171 struct freezer *freezer; 168 struct freezer *freezer;
@@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
185 return 0; 182 return 0;
186} 183}
187 184
188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 185static void freezer_fork(struct task_struct *task)
189{ 186{
190 struct freezer *freezer; 187 struct freezer *freezer;
191 188
@@ -361,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
361static struct cftype files[] = { 358static struct cftype files[] = {
362 { 359 {
363 .name = "state", 360 .name = "state",
361 .flags = CFTYPE_NOT_ON_ROOT,
364 .read_seq_string = freezer_read, 362 .read_seq_string = freezer_read,
365 .write_string = freezer_write, 363 .write_string = freezer_write,
366 }, 364 },
365 { } /* terminate */
367}; 366};
368 367
369static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
370{
371 if (!cgroup->parent)
372 return 0;
373 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
374}
375
376struct cgroup_subsys freezer_subsys = { 368struct cgroup_subsys freezer_subsys = {
377 .name = "freezer", 369 .name = "freezer",
378 .create = freezer_create, 370 .create = freezer_create,
379 .destroy = freezer_destroy, 371 .destroy = freezer_destroy,
380 .populate = freezer_populate,
381 .subsys_id = freezer_subsys_id, 372 .subsys_id = freezer_subsys_id,
382 .can_attach = freezer_can_attach, 373 .can_attach = freezer_can_attach,
383 .fork = freezer_fork, 374 .fork = freezer_fork,
375 .base_cftypes = files,
384}; 376};
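
Across this merge every subsystem callback loses its struct cgroup_subsys * argument, since each callee can recover its own state from the cgroup it is given. A small sketch of that ops-table shape (struct box, box_ops and the callback names are invented for illustration):

#include <stdio.h>

struct box {                 /* stands in for struct cgroup */
    int id;
};

struct box_ops {             /* callbacks take only the object they act on */
    int  (*create)(struct box *b);
    void (*destroy)(struct box *b);
};

static int noisy_create(struct box *b)
{
    printf("create box %d\n", b->id);
    return 0;
}

static void noisy_destroy(struct box *b)
{
    printf("destroy box %d\n", b->id);
}

static const struct box_ops freezer_like_ops = {
    .create  = noisy_create,
    .destroy = noisy_destroy,
};

int main(void)
{
    struct box b = { .id = 1 };

    freezer_like_ops.create(&b);
    freezer_like_ops.destroy(&b);
    return 0;
}
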
diff --git a/kernel/compat.c b/kernel/compat.c
index f346cedfe24d..74ff8498809a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -31,11 +31,10 @@
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32 32
33/* 33/*
34 * Note that the native side is already converted to a timespec, because 34 * Get/set struct timeval with struct timespec on the native side
35 * that's what we want anyway.
36 */ 35 */
37static int compat_get_timeval(struct timespec *o, 36static int compat_get_timeval_convert(struct timespec *o,
38 struct compat_timeval __user *i) 37 struct compat_timeval __user *i)
39{ 38{
40 long usec; 39 long usec;
41 40
@@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o,
46 return 0; 45 return 0;
47} 46}
48 47
49static int compat_put_timeval(struct compat_timeval __user *o, 48static int compat_put_timeval_convert(struct compat_timeval __user *o,
50 struct timeval *i) 49 struct timeval *i)
51{ 50{
52 return (put_user(i->tv_sec, &o->tv_sec) || 51 return (put_user(i->tv_sec, &o->tv_sec) ||
53 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
@@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
117 if (tv) { 116 if (tv) {
118 struct timeval ktv; 117 struct timeval ktv;
119 do_gettimeofday(&ktv); 118 do_gettimeofday(&ktv);
120 if (compat_put_timeval(tv, &ktv)) 119 if (compat_put_timeval_convert(tv, &ktv))
121 return -EFAULT; 120 return -EFAULT;
122 } 121 }
123 if (tz) { 122 if (tz) {
@@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
135 struct timezone ktz; 134 struct timezone ktz;
136 135
137 if (tv) { 136 if (tv) {
138 if (compat_get_timeval(&kts, tv)) 137 if (compat_get_timeval_convert(&kts, tv))
139 return -EFAULT; 138 return -EFAULT;
140 } 139 }
141 if (tz) { 140 if (tz) {
@@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
146 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); 145 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
147} 146}
148 147
148int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
149{
150 return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
151 __get_user(tv->tv_sec, &ctv->tv_sec) ||
152 __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
153}
154EXPORT_SYMBOL_GPL(get_compat_timeval);
155
156int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
157{
158 return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
159 __put_user(tv->tv_sec, &ctv->tv_sec) ||
160 __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
161}
162EXPORT_SYMBOL_GPL(put_compat_timeval);
163
149int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 164int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
150{ 165{
151 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 166 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
152 __get_user(ts->tv_sec, &cts->tv_sec) || 167 __get_user(ts->tv_sec, &cts->tv_sec) ||
153 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 168 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
154} 169}
170EXPORT_SYMBOL_GPL(get_compat_timespec);
155 171
156int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) 172int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
157{ 173{
@@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
161} 177}
162EXPORT_SYMBOL_GPL(put_compat_timespec); 178EXPORT_SYMBOL_GPL(put_compat_timespec);
163 179
180int compat_get_timeval(struct timeval *tv, const void __user *utv)
181{
182 if (COMPAT_USE_64BIT_TIME)
183 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
184 else
185 return get_compat_timeval(tv, utv);
186}
187EXPORT_SYMBOL_GPL(compat_get_timeval);
188
189int compat_put_timeval(const struct timeval *tv, void __user *utv)
190{
191 if (COMPAT_USE_64BIT_TIME)
192 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
193 else
194 return put_compat_timeval(tv, utv);
195}
196EXPORT_SYMBOL_GPL(compat_put_timeval);
197
198int compat_get_timespec(struct timespec *ts, const void __user *uts)
199{
200 if (COMPAT_USE_64BIT_TIME)
201 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
202 else
203 return get_compat_timespec(ts, uts);
204}
205EXPORT_SYMBOL_GPL(compat_get_timespec);
206
207int compat_put_timespec(const struct timespec *ts, void __user *uts)
208{
209 if (COMPAT_USE_64BIT_TIME)
210 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
211 else
212 return put_compat_timespec(ts, uts);
213}
214EXPORT_SYMBOL_GPL(compat_put_timespec);
215
164static long compat_nanosleep_restart(struct restart_block *restart) 216static long compat_nanosleep_restart(struct restart_block *restart)
165{ 217{
166 struct compat_timespec __user *rmtp; 218 struct compat_timespec __user *rmtp;
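
The new compat_get/put_timeval() helpers above either copy the structure verbatim (when compat tasks already use 64-bit time) or convert field by field. A hedged userspace sketch of the same split, with struct timeval32, get_timeval_compat and use_64bit_time as stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

struct timeval32 {               /* 32-bit layout used by compat callers */
    int32_t tv_sec;
    int32_t tv_usec;
};

/* Either the layouts match and we copy verbatim, or we widen each field. */
static int get_timeval_compat(struct timeval *tv, const void *user,
                              bool use_64bit_time)
{
    if (use_64bit_time) {
        memcpy(tv, user, sizeof(*tv));
    } else {
        const struct timeval32 *tv32 = user;

        tv->tv_sec  = tv32->tv_sec;
        tv->tv_usec = tv32->tv_usec;
    }
    return 0;
}

int main(void)
{
    struct timeval32 in = { .tv_sec = 1, .tv_usec = 500000 };
    struct timeval out;

    return get_timeval_compat(&out, &in, false) == 0 && out.tv_sec == 1 ? 0 : 1;
}
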
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a09ac2b9a661..2382683617a3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
964{ 964{
965 bool need_loop; 965 bool need_loop;
966 966
967repeat:
968 /* 967 /*
969 * Allow tasks that have access to memory reserves because they have 968 * Allow tasks that have access to memory reserves because they have
970 * been OOM killed to get memory anywhere. 969 * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
983 */ 982 */
984 need_loop = task_has_mempolicy(tsk) || 983 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed); 984 !nodes_intersects(*newmems, tsk->mems_allowed);
986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
988 985
989 /* 986 if (need_loop)
990 * ensure checking ->mems_allowed_change_disable after setting all new 987 write_seqcount_begin(&tsk->mems_allowed_seq);
991 * allowed nodes.
992 *
993 * the read-side task can see an nodemask with new allowed nodes and
994 * old allowed nodes. and if it allocates page when cpuset clears newly
995 * disallowed ones continuous, it can see the new allowed bits.
996 *
997 * And if setting all new allowed nodes is after the checking, setting
998 * all new allowed nodes and clearing newly disallowed ones will be done
999 * continuous, and the read-side task may find no node to alloc page.
1000 */
1001 smp_mb();
1002
1003 /*
1004 * Allocation of memory is very fast, we needn't sleep when waiting
1005 * for the read-side.
1006 */
1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
1008 task_unlock(tsk);
1009 if (!task_curr(tsk))
1010 yield();
1011 goto repeat;
1012 }
1013 988
1014 /* 989 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1015 * ensure checking ->mems_allowed_change_disable before clearing all new 990 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1016 * disallowed nodes.
1017 *
1018 * if clearing newly disallowed bits before the checking, the read-side
1019 * task may find no node to alloc page.
1020 */
1021 smp_mb();
1022 991
1023 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 992 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1024 tsk->mems_allowed = *newmems; 993 tsk->mems_allowed = *newmems;
994
995 if (need_loop)
996 write_seqcount_end(&tsk->mems_allowed_seq);
997
1025 task_unlock(tsk); 998 task_unlock(tsk);
1026} 999}
1027 1000
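
The cpuset hunk above replaces the old yield-and-retry handshake with a seqcount: the writer brackets the nodemask update with write_seqcount_begin/end, and readers retry whenever they observe an odd or changed sequence. A simplified userspace sketch of both sides (seq, mems, write_mems and read_mems are illustrative; a production seqlock also needs the memory barriers this sketch glosses over):

#include <stdatomic.h>

static atomic_uint seq;         /* even = stable, odd = writer active */
static unsigned long mems;      /* the data being published */

static void write_mems(unsigned long newmems)
{
    atomic_fetch_add(&seq, 1);  /* begin: sequence becomes odd */
    mems = newmems;
    atomic_fetch_add(&seq, 1);  /* end: sequence becomes even again */
}

/* Readers never block the writer; they simply retry on a torn read. */
static unsigned long read_mems(void)
{
    unsigned int s;
    unsigned long val;

    do {
        while ((s = atomic_load(&seq)) & 1)
            ;                   /* writer in progress, spin */
        val = mems;
    } while (atomic_load(&seq) != s);   /* raced with a writer: retry */

    return val;
}

int main(void)
{
    write_mems(0x3);
    return read_mems() == 0x3 ? 0 : 1;
}
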
@@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from;
1399static nodemask_t cpuset_attach_nodemask_to; 1372static nodemask_t cpuset_attach_nodemask_to;
1400 1373
1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1374/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1375static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1403 struct cgroup_taskset *tset)
1404{ 1376{
1405 struct cpuset *cs = cgroup_cs(cgrp); 1377 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task; 1378 struct task_struct *task;
@@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1436 return 0; 1408 return 0;
1437} 1409}
1438 1410
1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1411static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1440 struct cgroup_taskset *tset)
1441{ 1412{
1442 struct mm_struct *mm; 1413 struct mm_struct *mm;
1443 struct task_struct *task; 1414 struct task_struct *task;
@@ -1794,28 +1765,17 @@ static struct cftype files[] = {
1794 .write_u64 = cpuset_write_u64, 1765 .write_u64 = cpuset_write_u64,
1795 .private = FILE_SPREAD_SLAB, 1766 .private = FILE_SPREAD_SLAB,
1796 }, 1767 },
1797};
1798 1768
1799static struct cftype cft_memory_pressure_enabled = { 1769 {
1800 .name = "memory_pressure_enabled", 1770 .name = "memory_pressure_enabled",
1801 .read_u64 = cpuset_read_u64, 1771 .flags = CFTYPE_ONLY_ON_ROOT,
1802 .write_u64 = cpuset_write_u64, 1772 .read_u64 = cpuset_read_u64,
1803 .private = FILE_MEMORY_PRESSURE_ENABLED, 1773 .write_u64 = cpuset_write_u64,
1804}; 1774 .private = FILE_MEMORY_PRESSURE_ENABLED,
1805 1775 },
1806static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1807{
1808 int err;
1809 1776
1810 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 1777 { } /* terminate */
1811 if (err) 1778};
1812 return err;
1813 /* memory_pressure_enabled is in root cpuset only */
1814 if (!cont->parent)
1815 err = cgroup_add_file(cont, ss,
1816 &cft_memory_pressure_enabled);
1817 return err;
1818}
1819 1779
1820/* 1780/*
1821 * post_clone() is called during cgroup_create() when the 1781 * post_clone() is called during cgroup_create() when the
@@ -1833,8 +1793,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1833 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex 1793 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1834 * held. 1794 * held.
1835 */ 1795 */
1836static void cpuset_post_clone(struct cgroup_subsys *ss, 1796static void cpuset_post_clone(struct cgroup *cgroup)
1837 struct cgroup *cgroup)
1838{ 1797{
1839 struct cgroup *parent, *child; 1798 struct cgroup *parent, *child;
1840 struct cpuset *cs, *parent_cs; 1799 struct cpuset *cs, *parent_cs;
@@ -1857,13 +1816,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1857 1816
1858/* 1817/*
1859 * cpuset_create - create a cpuset 1818 * cpuset_create - create a cpuset
1860 * ss: cpuset cgroup subsystem
1861 * cont: control group that the new cpuset will be part of 1819 * cont: control group that the new cpuset will be part of
1862 */ 1820 */
1863 1821
1864static struct cgroup_subsys_state *cpuset_create( 1822static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1865 struct cgroup_subsys *ss,
1866 struct cgroup *cont)
1867{ 1823{
1868 struct cpuset *cs; 1824 struct cpuset *cs;
1869 struct cpuset *parent; 1825 struct cpuset *parent;
@@ -1902,7 +1858,7 @@ static struct cgroup_subsys_state *cpuset_create(
1902 * will call async_rebuild_sched_domains(). 1858 * will call async_rebuild_sched_domains().
1903 */ 1859 */
1904 1860
1905static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1861static void cpuset_destroy(struct cgroup *cont)
1906{ 1862{
1907 struct cpuset *cs = cgroup_cs(cont); 1863 struct cpuset *cs = cgroup_cs(cont);
1908 1864
@@ -1920,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
1920 .destroy = cpuset_destroy, 1876 .destroy = cpuset_destroy,
1921 .can_attach = cpuset_can_attach, 1877 .can_attach = cpuset_can_attach,
1922 .attach = cpuset_attach, 1878 .attach = cpuset_attach,
1923 .populate = cpuset_populate,
1924 .post_clone = cpuset_post_clone, 1879 .post_clone = cpuset_post_clone,
1925 .subsys_id = cpuset_subsys_id, 1880 .subsys_id = cpuset_subsys_id,
1881 .base_cftypes = files,
1926 .early_init = 1, 1882 .early_init = 1,
1927}; 1883};
1928 1884
@@ -2195,10 +2151,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2195 mutex_unlock(&callback_mutex); 2151 mutex_unlock(&callback_mutex);
2196} 2152}
2197 2153
2198int cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2154void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2199{ 2155{
2200 const struct cpuset *cs; 2156 const struct cpuset *cs;
2201 int cpu;
2202 2157
2203 rcu_read_lock(); 2158 rcu_read_lock();
2204 cs = task_cs(tsk); 2159 cs = task_cs(tsk);
@@ -2219,22 +2174,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2219 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 2174 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2220 * set any mask even if it is not right from task_cs() pov, 2175 * set any mask even if it is not right from task_cs() pov,
2221 * the pending set_cpus_allowed_ptr() will fix things. 2176 * the pending set_cpus_allowed_ptr() will fix things.
2177 *
2178 * select_fallback_rq() will fix things up and set cpu_possible_mask
2179 * if required.
2222 */ 2180 */
2223
2224 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2225 if (cpu >= nr_cpu_ids) {
2226 /*
2227 * Either tsk->cpus_allowed is wrong (see above) or it
2228 * is actually empty. The latter case is only possible
2229 * if we are racing with remove_tasks_in_empty_cpuset().
2230 * Like above we can temporary set any mask and rely on
2231 * set_cpus_allowed_ptr() as synchronization point.
2232 */
2233 do_set_cpus_allowed(tsk, cpu_possible_mask);
2234 cpu = cpumask_any(cpu_active_mask);
2235 }
2236
2237 return cpu;
2238} 2181}
2239 2182
2240void cpuset_init_current_mems_allowed(void) 2183void cpuset_init_current_mems_allowed(void)
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a4045..97b36eeca4c9 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
16#include <linux/keyctl.h> 16#include <linux/keyctl.h>
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/binfmts.h>
19#include <linux/cn_proc.h> 20#include <linux/cn_proc.h>
20 21
21#if 0 22#if 0
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784efb..1dc53bae56e1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
41#include <linux/delay.h> 41#include <linux/delay.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/sysrq.h> 43#include <linux/sysrq.h>
44#include <linux/reboot.h>
44#include <linux/init.h> 45#include <linux/init.h>
45#include <linux/kgdb.h> 46#include <linux/kgdb.h>
46#include <linux/kdb.h> 47#include <linux/kdb.h>
@@ -52,7 +53,6 @@
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 54#include <asm/byteorder.h>
54#include <linux/atomic.h> 55#include <linux/atomic.h>
55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
58 58
@@ -75,6 +75,8 @@ static int exception_level;
75struct kgdb_io *dbg_io_ops; 75struct kgdb_io *dbg_io_ops;
76static DEFINE_SPINLOCK(kgdb_registration_lock); 76static DEFINE_SPINLOCK(kgdb_registration_lock);
77 77
78/* Action for the reboot notifier, a global so kdb can change it */
79static int kgdbreboot;
78/* kgdb console driver is loaded */ 80/* kgdb console driver is loaded */
79static int kgdb_con_registered; 81static int kgdb_con_registered;
80/* determine if kgdb console output should be used */ 82/* determine if kgdb console output should be used */
@@ -96,6 +98,7 @@ static int __init opt_kgdb_con(char *str)
96early_param("kgdbcon", opt_kgdb_con); 98early_param("kgdbcon", opt_kgdb_con);
97 99
98module_param(kgdb_use_con, int, 0644); 100module_param(kgdb_use_con, int, 0644);
101module_param(kgdbreboot, int, 0644);
99 102
100/* 103/*
101 * Holds information about breakpoints in a kernel. These breakpoints are 104 * Holds information about breakpoints in a kernel. These breakpoints are
@@ -784,6 +787,33 @@ void __init dbg_late_init(void)
784 kdb_init(KDB_INIT_FULL); 787 kdb_init(KDB_INIT_FULL);
785} 788}
786 789
790static int
791dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
792{
793 /*
794 * Take the following action on reboot notify depending on value:
795 * 1 == Enter debugger
796 * 0 == [the default] detach the debug client
797 * -1 == Do nothing... and use this until the board resets
798 */
799 switch (kgdbreboot) {
800 case 1:
801 kgdb_breakpoint();
802 case -1:
803 goto done;
804 }
805 if (!dbg_kdb_mode)
806 gdbstub_exit(code);
807done:
808 return NOTIFY_DONE;
809}
810
811static struct notifier_block dbg_reboot_notifier = {
812 .notifier_call = dbg_notify_reboot,
813 .next = NULL,
814 .priority = INT_MAX,
815};
816
787static void kgdb_register_callbacks(void) 817static void kgdb_register_callbacks(void)
788{ 818{
789 if (!kgdb_io_module_registered) { 819 if (!kgdb_io_module_registered) {
@@ -791,6 +821,7 @@ static void kgdb_register_callbacks(void)
791 kgdb_arch_init(); 821 kgdb_arch_init();
792 if (!dbg_is_early) 822 if (!dbg_is_early)
793 kgdb_arch_late(); 823 kgdb_arch_late();
824 register_reboot_notifier(&dbg_reboot_notifier);
794 atomic_notifier_chain_register(&panic_notifier_list, 825 atomic_notifier_chain_register(&panic_notifier_list,
795 &kgdb_panic_event_nb); 826 &kgdb_panic_event_nb);
796#ifdef CONFIG_MAGIC_SYSRQ 827#ifdef CONFIG_MAGIC_SYSRQ
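
register_reboot_notifier() above just adds dbg_reboot_notifier to a priority-ordered callback chain that the reboot path walks. A stripped-down userspace model of that mechanism (struct notifier, chain_register and chain_call are invented names; the real chain also honours NOTIFY_* return codes and locking):

#include <stdio.h>

struct notifier {
    int (*call)(struct notifier *nb, unsigned long code);
    struct notifier *next;
    int priority;               /* higher priority runs first */
};

static struct notifier *chain;

static void chain_register(struct notifier *nb)
{
    struct notifier **pos = &chain;

    while (*pos && (*pos)->priority >= nb->priority)
        pos = &(*pos)->next;
    nb->next = *pos;
    *pos = nb;
}

static void chain_call(unsigned long code)
{
    struct notifier *nb;

    for (nb = chain; nb; nb = nb->next)
        nb->call(nb, code);
}

static int dbg_like_notify(struct notifier *nb, unsigned long code)
{
    printf("reboot notify, code %lu\n", code);
    return 0;
}

static struct notifier dbg_like_nb = {
    .call = dbg_like_notify,
    .priority = 100,            /* analogous to the INT_MAX priority above */
};

int main(void)
{
    chain_register(&dbg_like_nb);
    chain_call(1);
    return 0;
}
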
@@ -812,6 +843,7 @@ static void kgdb_unregister_callbacks(void)
812 */ 843 */
813 if (kgdb_io_module_registered) { 844 if (kgdb_io_module_registered) {
814 kgdb_io_module_registered = 0; 845 kgdb_io_module_registered = 0;
846 unregister_reboot_notifier(&dbg_reboot_notifier);
815 atomic_notifier_chain_unregister(&panic_notifier_list, 847 atomic_notifier_chain_unregister(&panic_notifier_list,
816 &kgdb_panic_event_nb); 848 &kgdb_panic_event_nb);
817 kgdb_arch_exit(); 849 kgdb_arch_exit();
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad84..ce615e064482 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
1111 unsigned char checksum, ch, buffer[3]; 1111 unsigned char checksum, ch, buffer[3];
1112 int loop; 1112 int loop;
1113 1113
1114 if (!kgdb_connected)
1115 return;
1116 kgdb_connected = 0;
1117
1118 if (!dbg_io_ops || dbg_kdb_mode)
1119 return;
1120
1114 buffer[0] = 'W'; 1121 buffer[0] = 'W';
1115 buffer[1] = hex_asc_hi(status); 1122 buffer[1] = hex_asc_hi(status);
1116 buffer[2] = hex_asc_lo(status); 1123 buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)
1129 dbg_io_ops->write_char(hex_asc_lo(checksum)); 1136 dbg_io_ops->write_char(hex_asc_lo(checksum));
1130 1137
1131 /* make sure the output is flushed, lest the bootloader clobber it */ 1138 /* make sure the output is flushed, lest the bootloader clobber it */
1132 dbg_io_ops->flush(); 1139 if (dbg_io_ops->flush)
1140 dbg_io_ops->flush();
1133} 1141}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459a..8418c2f8ec5d 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
153 } else { 153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n", 154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr); 155 __func__, bp->bp_addr);
156#ifdef CONFIG_DEBUG_RODATA
157 if (!bp->bp_type) {
158 kdb_printf("Software breakpoints are unavailable.\n"
159 " Change the kernel CONFIG_DEBUG_RODATA=n\n"
160 " OR use hw breaks: help bph\n");
161 }
162#endif
156 return 1; 163 return 1;
157 } 164 }
158 return 0; 165 return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41c..07c9bbb94a0b 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,7 +15,6 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kdb.h> 16#include <linux/kdb.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h" 18#include "kdb_private.h"
20 19
21 20
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e1..9b5f17da1c56 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
689 if (!dbg_kdb_mode && kgdb_connected) { 689 if (!dbg_kdb_mode && kgdb_connected) {
690 gdbstub_msg_write(kdb_buffer, retlen); 690 gdbstub_msg_write(kdb_buffer, retlen);
691 } else { 691 } else {
692 if (!dbg_io_ops->is_console) { 692 if (dbg_io_ops && !dbg_io_ops->is_console) {
693 len = strlen(kdb_buffer); 693 len = strlen(kdb_buffer);
694 cp = kdb_buffer; 694 cp = kdb_buffer;
695 while (len--) { 695 while (len--) {
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c0..118527aa60ea 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ 25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26 26
27static int kbd_exists; 27static int kbd_exists;
28static int kbd_last_ret;
28 29
29/* 30/*
30 * Check if the keyboard controller has a keypress for us. 31 * Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
90 return -1; 91 return -1;
91 } 92 }
92 93
93 if ((scancode & 0x80) != 0) 94 if ((scancode & 0x80) != 0) {
95 if (scancode == 0x9c)
96 kbd_last_ret = 0;
94 return -1; 97 return -1;
98 }
95 99
96 scancode &= 0x7f; 100 scancode &= 0x7f;
97 101
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
178 return -1; /* ignore unprintables */ 182 return -1; /* ignore unprintables */
179 } 183 }
180 184
181 if ((scancode & 0x7f) == 0x1c) { 185 if (scancode == 0x1c) {
182 /* 186 kbd_last_ret = 1;
183 * enter key. All done. Absorb the release scancode. 187 return 13;
184 */ 188 }
189
190 return keychar & 0xff;
191}
192EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
193
194/*
195 * Best effort cleanup of ENTER break codes on leaving KDB. Called on
196 * exiting KDB, when we know we processed an ENTER or KP ENTER scan
197 * code.
198 */
199void kdb_kbd_cleanup_state(void)
200{
201 int scancode, scanstatus;
202
203 /*
204 * Nothing to clean up, since either
205 * ENTER was never pressed, or has already
206 * gotten cleaned up.
207 */
208 if (!kbd_last_ret)
209 return;
210
211 kbd_last_ret = 0;
212 /*
213 * Enter key. Need to absorb the break code here, lest it gets
214 * leaked out if we exit KDB as the result of processing 'g'.
215 *
216 * This has several interesting implications:
217 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
218 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
219 * only get a break code at the end of the repeated
220 * sequence. This means we can't propagate the repeated key
221 * press, and must swallow it away.
222 * + Need to handle possible PS/2 mouse input.
223 * + Need to handle mashed keys.
224 */
225
226 while (1) {
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) 227 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ; 228 cpu_relax();
187 229
188 /* 230 /*
189 * Fetch the scancode 231 * Fetch the scancode.
190 */ 232 */
191 scancode = inb(KBD_DATA_REG); 233 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG); 234 scanstatus = inb(KBD_STATUS_REG);
193 235
194 while (scanstatus & KBD_STAT_MOUSE_OBF) { 236 /*
195 scancode = inb(KBD_DATA_REG); 237 * Skip mouse input.
196 scanstatus = inb(KBD_STATUS_REG); 238 */
197 } 239 if (scanstatus & KBD_STAT_MOUSE_OBF)
240 continue;
198 241
199 if (scancode != 0x9c) { 242 /*
200 /* 243 * If we see 0xe0, this is either a break code for KP
201 * Wasn't an enter-release, why not? 244 * ENTER, or a repeat make for KP ENTER. Either way,
202 */ 245 * since the second byte is equivalent to an ENTER,
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", 246 * skip the 0xe0 and try again.
204 scancode, scanstatus); 247 *
205 } 248 * If we see 0x1c, this must be a repeat ENTER or KP
249 * ENTER (and we swallowed 0xe0 before). Try again.
250 *
251 * We can also see make and break codes for other keys
252 * mashed before or after pressing ENTER. Thus, if we
253 * see anything other than 0x9c, we have to try again.
254 *
255 * Note, if you held some key as ENTER was depressed,
256 * that break code would get leaked out.
257 */
258 if (scancode != 0x9c)
259 continue;
206 260
207 return 13; 261 return;
208 } 262 }
209
210 return keychar & 0xff;
211} 263}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
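
Since kdb_kbd_cleanup_state() above needs port I/O, here is a hedged userspace model of the same drain logic, fed from an array instead of inb(): skip 0xe0 prefixes, ignore repeats and mashed keys, and stop at the ENTER break code 0x9c (the mouse-status check is omitted because there is no status register here).

#include <stddef.h>
#include <stdio.h>

/* A captured stream: repeat ENTER (0x1c), a KP ENTER prefix (0xe0),
 * a mashed key break (0x9e for 'a'), then the ENTER break code 0x9c. */
static const unsigned char stream[] = { 0x1c, 0xe0, 0x9e, 0x9c, 0x2a };

/* Drain input up to and including the ENTER break code. */
static size_t drain_until_enter_break(const unsigned char *s, size_t len)
{
    size_t i;

    for (i = 0; i < len; i++) {
        if (s[i] == 0xe0)
            continue;           /* KP ENTER prefix; the next byte decides */
        if (s[i] == 0x9c)
            return i + 1;       /* ENTER break code: done */
        /* repeat makes, mashed keys, anything else: keep draining */
    }
    return i;
}

int main(void)
{
    printf("consumed %zu bytes\n",
           drain_until_enter_break(stream, sizeof(stream)));
    return 0;
}
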
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437f..67b847dfa2bb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1400 if (KDB_STATE(DOING_SS)) 1400 if (KDB_STATE(DOING_SS))
1401 KDB_STATE_CLEAR(SSBPT); 1401 KDB_STATE_CLEAR(SSBPT);
1402 1402
1403 /* Clean up any keyboard devices before leaving */
1404 kdb_kbd_cleanup_state();
1405
1403 return result; 1406 return result;
1404} 1407}
1405 1408
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40b..47c4e56e513b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
246 246
247extern void kdb_set_current_task(struct task_struct *); 247extern void kdb_set_current_task(struct task_struct *);
248extern struct task_struct *kdb_current_task; 248extern struct task_struct *kdb_current_task;
249
250#ifdef CONFIG_KDB_KEYBOARD
251extern void kdb_kbd_cleanup_state(void);
252#else /* ! CONFIG_KDB_KEYBOARD */
253#define kdb_kbd_cleanup_state()
254#endif /* ! CONFIG_KDB_KEYBOARD */
255
249#ifdef CONFIG_MODULES 256#ifdef CONFIG_MODULES
250extern struct list_head *kdb_modules; 257extern struct list_head *kdb_modules;
251#endif /* CONFIG_MODULES */ 258#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d2188..d35cc2d3a4cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
384 if (!pfn_valid(pfn)) 384 if (!pfn_valid(pfn))
385 return 1; 385 return 1;
386 page = pfn_to_page(pfn); 386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB); 387 vaddr = kmap_atomic(page);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); 388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB); 389 kunmap_atomic(vaddr);
390 390
391 return 0; 391 return 0;
392} 392}
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c8..6c6262f86c17 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h>
22 21
23 22
24 23
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9f..a6a9ec4cd8f5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
118 PERF_FLAG_FD_OUTPUT |\ 118 PERF_FLAG_FD_OUTPUT |\
119 PERF_FLAG_PID_CGROUP) 119 PERF_FLAG_PID_CGROUP)
120 120
121/*
122 * branch priv levels that need permission checks
123 */
124#define PERF_SAMPLE_BRANCH_PERM_PLM \
125 (PERF_SAMPLE_BRANCH_KERNEL |\
126 PERF_SAMPLE_BRANCH_HV)
127
121enum event_type_t { 128enum event_type_t {
122 EVENT_FLEXIBLE = 0x1, 129 EVENT_FLEXIBLE = 0x1,
123 EVENT_PINNED = 0x2, 130 EVENT_PINNED = 0x2,
@@ -128,8 +135,9 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 135 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 136 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 137 */
131struct jump_label_key_deferred perf_sched_events __read_mostly; 138struct static_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 139static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
140static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
133 141
134static atomic_t nr_mmap_events __read_mostly; 142static atomic_t nr_mmap_events __read_mostly;
135static atomic_t nr_comm_events __read_mostly; 143static atomic_t nr_comm_events __read_mostly;
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
881 if (is_cgroup_event(event)) 889 if (is_cgroup_event(event))
882 ctx->nr_cgroups++; 890 ctx->nr_cgroups++;
883 891
892 if (has_branch_stack(event))
893 ctx->nr_branch_stack++;
894
884 list_add_rcu(&event->event_entry, &ctx->event_list); 895 list_add_rcu(&event->event_entry, &ctx->event_list);
885 if (!ctx->nr_events) 896 if (!ctx->nr_events)
886 perf_pmu_rotate_start(ctx->pmu); 897 perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1020 cpuctx->cgrp = NULL; 1031 cpuctx->cgrp = NULL;
1021 } 1032 }
1022 1033
1034 if (has_branch_stack(event))
1035 ctx->nr_branch_stack--;
1036
1023 ctx->nr_events--; 1037 ctx->nr_events--;
1024 if (event->attr.inherit_stat) 1038 if (event->attr.inherit_stat)
1025 ctx->nr_stat--; 1039 ctx->nr_stat--;
@@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2195} 2209}
2196 2210
2197/* 2211/*
2212 * When sampling the branch stack in system-wide mode, it may be necessary
2213 * to flush the stack on context switch. This happens when the branch
2214 * stack does not tag its entries with the pid of the current task.
2215 * Otherwise it becomes impossible to associate a branch entry with a
2216 * task. This ambiguity is more likely to appear when the branch stack
2217 * supports priv level filtering and the user sets it to monitor only
2218 * at the user level (which could be a useful measurement in system-wide
2219 * mode). In that case, the risk is high of having a branch stack with
2220 * branches from multiple tasks. Flushing may mean dropping the existing
2221 * entries or stashing them somewhere in the PMU specific code layer.
2222 *
2223 * This function provides the context switch callback to the lower code
2224 * layer. It is invoked ONLY when there is at least one system-wide context
2225 * with at least one active event using taken branch sampling.
2226 */
2227static void perf_branch_stack_sched_in(struct task_struct *prev,
2228 struct task_struct *task)
2229{
2230 struct perf_cpu_context *cpuctx;
2231 struct pmu *pmu;
2232 unsigned long flags;
2233
2234 /* no need to flush branch stack if not changing task */
2235 if (prev == task)
2236 return;
2237
2238 local_irq_save(flags);
2239
2240 rcu_read_lock();
2241
2242 list_for_each_entry_rcu(pmu, &pmus, entry) {
2243 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2244
2245 /*
2246 * check if the context has at least one
2247 * event using PERF_SAMPLE_BRANCH_STACK
2248 */
2249 if (cpuctx->ctx.nr_branch_stack > 0
2250 && pmu->flush_branch_stack) {
2251
2252 pmu = cpuctx->ctx.pmu;
2253
2254 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2255
2256 perf_pmu_disable(pmu);
2257
2258 pmu->flush_branch_stack();
2259
2260 perf_pmu_enable(pmu);
2261
2262 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2263 }
2264 }
2265
2266 rcu_read_unlock();
2267
2268 local_irq_restore(flags);
2269}
2270
2271/*
2198 * Called from scheduler to add the events of the current task 2272 * Called from scheduler to add the events of the current task
2199 * with interrupts disabled. 2273 * with interrupts disabled.
2200 * 2274 *
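
A compact userspace model of the flush walk in perf_branch_stack_sched_in() above (struct pmu_desc, flush_all and the sample entries are invented; the real code additionally takes the context locks and disables the PMU around the callback): walk every registered PMU and invoke its optional flush hook only when that context has branch-stack users.

#include <stddef.h>
#include <stdio.h>

struct pmu_desc {
    const char *name;
    int nr_branch_stack;                 /* events using branch sampling */
    void (*flush_branch_stack)(void);    /* optional hook */
};

static void x86_like_flush(void)
{
    printf("flushing LBR-style branch buffer\n");
}

static struct pmu_desc pmus[] = {
    { "cpu",      1, x86_like_flush },
    { "software", 0, NULL },
};

/* Called on context switch when system-wide branch-stack events exist. */
static void flush_all(void)
{
    for (size_t i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
        struct pmu_desc *p = &pmus[i];

        if (p->nr_branch_stack > 0 && p->flush_branch_stack)
            p->flush_branch_stack();
    }
}

int main(void)
{
    flush_all();
    return 0;
}
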
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2225 */ 2299 */
2226 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2300 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2227 perf_cgroup_sched_in(prev, task); 2301 perf_cgroup_sched_in(prev, task);
2302
2303 /* check for system-wide branch_stack events */
2304 if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2305 perf_branch_stack_sched_in(prev, task);
2228} 2306}
2229 2307
2230static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2308static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)
2778 2856
2779 if (!event->parent) { 2857 if (!event->parent) {
2780 if (event->attach_state & PERF_ATTACH_TASK) 2858 if (event->attach_state & PERF_ATTACH_TASK)
2781 jump_label_dec_deferred(&perf_sched_events); 2859 static_key_slow_dec_deferred(&perf_sched_events);
2782 if (event->attr.mmap || event->attr.mmap_data) 2860 if (event->attr.mmap || event->attr.mmap_data)
2783 atomic_dec(&nr_mmap_events); 2861 atomic_dec(&nr_mmap_events);
2784 if (event->attr.comm) 2862 if (event->attr.comm)
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)
2789 put_callchain_buffers(); 2867 put_callchain_buffers();
2790 if (is_cgroup_event(event)) { 2868 if (is_cgroup_event(event)) {
2791 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2869 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2792 jump_label_dec_deferred(&perf_sched_events); 2870 static_key_slow_dec_deferred(&perf_sched_events);
2871 }
2872
2873 if (has_branch_stack(event)) {
2874 static_key_slow_dec_deferred(&perf_sched_events);
2875 /* is system-wide event */
2876 if (!(event->attach_state & PERF_ATTACH_TASK))
2877 atomic_dec(&per_cpu(perf_branch_stack_events,
2878 event->cpu));
2793 } 2879 }
2794 } 2880 }
2795 2881
@@ -3238,10 +3324,6 @@ int perf_event_task_disable(void)
3238 return 0; 3324 return 0;
3239} 3325}
3240 3326
3241#ifndef PERF_EVENT_INDEX_OFFSET
3242# define PERF_EVENT_INDEX_OFFSET 0
3243#endif
3244
3245static int perf_event_index(struct perf_event *event) 3327static int perf_event_index(struct perf_event *event)
3246{ 3328{
3247 if (event->hw.state & PERF_HES_STOPPED) 3329 if (event->hw.state & PERF_HES_STOPPED)
@@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event)
3250 if (event->state != PERF_EVENT_STATE_ACTIVE) 3332 if (event->state != PERF_EVENT_STATE_ACTIVE)
3251 return 0; 3333 return 0;
3252 3334
3253 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3335 return event->pmu->event_idx(event);
3254} 3336}
3255 3337
3256static void calc_timer_values(struct perf_event *event, 3338static void calc_timer_values(struct perf_event *event,
3339 u64 *now,
3257 u64 *enabled, 3340 u64 *enabled,
3258 u64 *running) 3341 u64 *running)
3259{ 3342{
3260 u64 now, ctx_time; 3343 u64 ctx_time;
3261 3344
3262 now = perf_clock(); 3345 *now = perf_clock();
3263 ctx_time = event->shadow_ctx_time + now; 3346 ctx_time = event->shadow_ctx_time + *now;
3264 *enabled = ctx_time - event->tstamp_enabled; 3347 *enabled = ctx_time - event->tstamp_enabled;
3265 *running = ctx_time - event->tstamp_running; 3348 *running = ctx_time - event->tstamp_running;
3266} 3349}
3267 3350
3351void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3352{
3353}
3354
3268/* 3355/*
3269 * Callers need to ensure there can be no nesting of this function, otherwise 3356 * Callers need to ensure there can be no nesting of this function, otherwise
3270 * the seqlock logic goes bad. We can not serialize this because the arch 3357 * the seqlock logic goes bad. We can not serialize this because the arch
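arch_perf_update_userpage() above is declared __weak with an empty body: architectures that can expose extra data in the mmap'ed page override it, everyone else silently gets the no-op. The same weak-symbol default pattern in standalone C (GCC/Clang attribute; the hook name is made up for illustration):

#include <stdio.h>

/* weak fallback: used whenever no strong definition is linked in */
void __attribute__((weak)) arch_hook(void)
{
	/* default: do nothing */
}

int main(void)
{
	arch_hook();		/* runs the override if one exists, else the no-op */
	puts("done");
	return 0;
}

/* in a separate, optionally linked file:
 *	void arch_hook(void) { puts("arch-specific userpage update"); }
 * linking this strong definition replaces the weak one above. */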
@@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event)
3274{ 3361{
3275 struct perf_event_mmap_page *userpg; 3362 struct perf_event_mmap_page *userpg;
3276 struct ring_buffer *rb; 3363 struct ring_buffer *rb;
3277 u64 enabled, running; 3364 u64 enabled, running, now;
3278 3365
3279 rcu_read_lock(); 3366 rcu_read_lock();
3280 /* 3367 /*
@@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event)
3286 * because of locking issue as we can be called in 3373 * because of locking issue as we can be called in
3287 * NMI context 3374 * NMI context
3288 */ 3375 */
3289 calc_timer_values(event, &enabled, &running); 3376 calc_timer_values(event, &now, &enabled, &running);
3290 rb = rcu_dereference(event->rb); 3377 rb = rcu_dereference(event->rb);
3291 if (!rb) 3378 if (!rb)
3292 goto unlock; 3379 goto unlock;
@@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event)
3302 barrier(); 3389 barrier();
3303 userpg->index = perf_event_index(event); 3390 userpg->index = perf_event_index(event);
3304 userpg->offset = perf_event_count(event); 3391 userpg->offset = perf_event_count(event);
3305 if (event->state == PERF_EVENT_STATE_ACTIVE) 3392 if (userpg->index)
3306 userpg->offset -= local64_read(&event->hw.prev_count); 3393 userpg->offset -= local64_read(&event->hw.prev_count);
3307 3394
3308 userpg->time_enabled = enabled + 3395 userpg->time_enabled = enabled +
@@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event)
3311 userpg->time_running = running + 3398 userpg->time_running = running +
3312 atomic64_read(&event->child_total_time_running); 3399 atomic64_read(&event->child_total_time_running);
3313 3400
3401 arch_perf_update_userpage(userpg, now);
3402
3314 barrier(); 3403 barrier();
3315 ++userpg->lock; 3404 ++userpg->lock;
3316 preempt_enable(); 3405 preempt_enable();
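The update above is bracketed by barrier()/++userpg->lock, so user space must treat userpg->lock as a sequence counter and retry if it changed during the read. A hedged reader sketch, assuming the monitoring process has already mmap'ed the event's first page as struct perf_event_mmap_page (linux/perf_event.h):

#include <linux/perf_event.h>
#include <stdint.h>

/* take a consistent snapshot of the self-monitoring fields */
static void read_userpage(volatile struct perf_event_mmap_page *pc,
			  uint64_t *enabled, uint64_t *running, uint32_t *idx)
{
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();	/* full barrier; the kernel side uses barrier() */
		*idx     = pc->index;
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		__sync_synchronize();
	} while (pc->lock != seq);	/* the writer was active: retry */
}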
@@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3568 event->mmap_user = get_current_user(); 3657 event->mmap_user = get_current_user();
3569 vma->vm_mm->pinned_vm += event->mmap_locked; 3658 vma->vm_mm->pinned_vm += event->mmap_locked;
3570 3659
3660 perf_event_update_userpage(event);
3661
3571unlock: 3662unlock:
3572 if (!ret) 3663 if (!ret)
3573 atomic_inc(&event->mmap_count); 3664 atomic_inc(&event->mmap_count);
@@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3799static void perf_output_read(struct perf_output_handle *handle, 3890static void perf_output_read(struct perf_output_handle *handle,
3800 struct perf_event *event) 3891 struct perf_event *event)
3801{ 3892{
3802 u64 enabled = 0, running = 0; 3893 u64 enabled = 0, running = 0, now;
3803 u64 read_format = event->attr.read_format; 3894 u64 read_format = event->attr.read_format;
3804 3895
3805 /* 3896 /*
@@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle,
3812 * NMI context 3903 * NMI context
3813 */ 3904 */
3814 if (read_format & PERF_FORMAT_TOTAL_TIMES) 3905 if (read_format & PERF_FORMAT_TOTAL_TIMES)
3815 calc_timer_values(event, &enabled, &running); 3906 calc_timer_values(event, &now, &enabled, &running);
3816 3907
3817 if (event->attr.read_format & PERF_FORMAT_GROUP) 3908 if (event->attr.read_format & PERF_FORMAT_GROUP)
3818 perf_output_read_group(handle, event, enabled, running); 3909 perf_output_read_group(handle, event, enabled, running);
@@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
3902 } 3993 }
3903 } 3994 }
3904 } 3995 }
3996
3997 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
3998 if (data->br_stack) {
3999 size_t size;
4000
4001 size = data->br_stack->nr
4002 * sizeof(struct perf_branch_entry);
4003
4004 perf_output_put(handle, data->br_stack->nr);
4005 perf_output_copy(handle, data->br_stack->entries, size);
4006 } else {
4007 /*
4008 * we always store at least the value of nr
4009 */
4010 u64 nr = 0;
4011 perf_output_put(handle, nr);
4012 }
4013 }
3905} 4014}
3906 4015
3907void perf_prepare_sample(struct perf_event_header *header, 4016void perf_prepare_sample(struct perf_event_header *header,
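With PERF_SAMPLE_BRANCH_STACK the sample payload is a u64 count followed by that many branch entries, and the count is emitted even when no stack was captured, so parsers never have to guess. A hedged consumer sketch; the entry layout below is a simplified stand-in for struct perf_branch_entry (three u64-sized words in the uapi header):

#include <stdint.h>
#include <stdio.h>

struct branch_entry {		/* simplified stand-in for perf_branch_entry */
	uint64_t from;
	uint64_t to;
	uint64_t flags;
};

/* walk one PERF_SAMPLE_BRANCH_STACK payload and return the advanced cursor */
static const uint64_t *parse_branch_stack(const uint64_t *p)
{
	uint64_t i, nr = *p++;				/* always present, may be 0 */
	const struct branch_entry *ent = (const void *)p;

	for (i = 0; i < nr; i++)
		printf("branch: %#llx -> %#llx\n",
		       (unsigned long long)ent[i].from,
		       (unsigned long long)ent[i].to);

	return p + nr * (sizeof(*ent) / sizeof(uint64_t));
}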
@@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
3944 WARN_ON_ONCE(size & (sizeof(u64)-1)); 4053 WARN_ON_ONCE(size & (sizeof(u64)-1));
3945 header->size += size; 4054 header->size += size;
3946 } 4055 }
4056
4057 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4058 int size = sizeof(u64); /* nr */
4059 if (data->br_stack) {
4060 size += data->br_stack->nr
4061 * sizeof(struct perf_branch_entry);
4062 }
4063 header->size += size;
4064 }
3947} 4065}
3948 4066
3949static void perf_event_output(struct perf_event *event, 4067static void perf_event_output(struct perf_event *event,
@@ -4986,7 +5104,7 @@ fail:
4986 return err; 5104 return err;
4987} 5105}
4988 5106
4989struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5107struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
4990 5108
4991static void sw_perf_event_destroy(struct perf_event *event) 5109static void sw_perf_event_destroy(struct perf_event *event)
4992{ 5110{
@@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4994 5112
4995 WARN_ON(event->parent); 5113 WARN_ON(event->parent);
4996 5114
4997 jump_label_dec(&perf_swevent_enabled[event_id]); 5115 static_key_slow_dec(&perf_swevent_enabled[event_id]);
4998 swevent_hlist_put(event); 5116 swevent_hlist_put(event);
4999} 5117}
5000 5118
@@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
5005 if (event->attr.type != PERF_TYPE_SOFTWARE) 5123 if (event->attr.type != PERF_TYPE_SOFTWARE)
5006 return -ENOENT; 5124 return -ENOENT;
5007 5125
5126 /*
5127 * no branch sampling for software events
5128 */
5129 if (has_branch_stack(event))
5130 return -EOPNOTSUPP;
5131
5008 switch (event_id) { 5132 switch (event_id) {
5009 case PERF_COUNT_SW_CPU_CLOCK: 5133 case PERF_COUNT_SW_CPU_CLOCK:
5010 case PERF_COUNT_SW_TASK_CLOCK: 5134 case PERF_COUNT_SW_TASK_CLOCK:
@@ -5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event)
5024 if (err) 5148 if (err)
5025 return err; 5149 return err;
5026 5150
5027 jump_label_inc(&perf_swevent_enabled[event_id]); 5151 static_key_slow_inc(&perf_swevent_enabled[event_id]);
5028 event->destroy = sw_perf_event_destroy; 5152 event->destroy = sw_perf_event_destroy;
5029 } 5153 }
5030 5154
5031 return 0; 5155 return 0;
5032} 5156}
5033 5157
5158static int perf_swevent_event_idx(struct perf_event *event)
5159{
5160 return 0;
5161}
5162
5034static struct pmu perf_swevent = { 5163static struct pmu perf_swevent = {
5035 .task_ctx_nr = perf_sw_context, 5164 .task_ctx_nr = perf_sw_context,
5036 5165
@@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = {
5040 .start = perf_swevent_start, 5169 .start = perf_swevent_start,
5041 .stop = perf_swevent_stop, 5170 .stop = perf_swevent_stop,
5042 .read = perf_swevent_read, 5171 .read = perf_swevent_read,
5172
5173 .event_idx = perf_swevent_event_idx,
5043}; 5174};
5044 5175
5045#ifdef CONFIG_EVENT_TRACING 5176#ifdef CONFIG_EVENT_TRACING
@@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
5108 if (event->attr.type != PERF_TYPE_TRACEPOINT) 5239 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5109 return -ENOENT; 5240 return -ENOENT;
5110 5241
5242 /*
5243 * no branch sampling for tracepoint events
5244 */
5245 if (has_branch_stack(event))
5246 return -EOPNOTSUPP;
5247
5111 err = perf_trace_init(event); 5248 err = perf_trace_init(event);
5112 if (err) 5249 if (err)
5113 return err; 5250 return err;
@@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = {
5126 .start = perf_swevent_start, 5263 .start = perf_swevent_start,
5127 .stop = perf_swevent_stop, 5264 .stop = perf_swevent_stop,
5128 .read = perf_swevent_read, 5265 .read = perf_swevent_read,
5266
5267 .event_idx = perf_swevent_event_idx,
5129}; 5268};
5130 5269
5131static inline void perf_tp_register(void) 5270static inline void perf_tp_register(void)
@@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
5331 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5470 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5332 return -ENOENT; 5471 return -ENOENT;
5333 5472
5473 /*
5474 * no branch sampling for software events
5475 */
5476 if (has_branch_stack(event))
5477 return -EOPNOTSUPP;
5478
5334 perf_swevent_init_hrtimer(event); 5479 perf_swevent_init_hrtimer(event);
5335 5480
5336 return 0; 5481 return 0;
@@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = {
5345 .start = cpu_clock_event_start, 5490 .start = cpu_clock_event_start,
5346 .stop = cpu_clock_event_stop, 5491 .stop = cpu_clock_event_stop,
5347 .read = cpu_clock_event_read, 5492 .read = cpu_clock_event_read,
5493
5494 .event_idx = perf_swevent_event_idx,
5348}; 5495};
5349 5496
5350/* 5497/*
@@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
5403 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5550 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5404 return -ENOENT; 5551 return -ENOENT;
5405 5552
5553 /*
5554 * no branch sampling for software events
5555 */
5556 if (has_branch_stack(event))
5557 return -EOPNOTSUPP;
5558
5406 perf_swevent_init_hrtimer(event); 5559 perf_swevent_init_hrtimer(event);
5407 5560
5408 return 0; 5561 return 0;
@@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = {
5417 .start = task_clock_event_start, 5570 .start = task_clock_event_start,
5418 .stop = task_clock_event_stop, 5571 .stop = task_clock_event_stop,
5419 .read = task_clock_event_read, 5572 .read = task_clock_event_read,
5573
5574 .event_idx = perf_swevent_event_idx,
5420}; 5575};
5421 5576
5422static void perf_pmu_nop_void(struct pmu *pmu) 5577static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
5444 perf_pmu_enable(pmu); 5599 perf_pmu_enable(pmu);
5445} 5600}
5446 5601
5602static int perf_event_idx_default(struct perf_event *event)
5603{
5604 return event->hw.idx + 1;
5605}
5606
5447/* 5607/*
5448 * Ensures all contexts with the same task_ctx_nr have the same 5608 * Ensures all contexts with the same task_ctx_nr have the same
5449 * pmu_cpu_context too. 5609 * pmu_cpu_context too.
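perf_event_idx_default() is what lands in userpg->index for PMUs that don't override ->event_idx: hw.idx + 1, so that index 0 can mean "no directly readable counter" -- which is exactly what the software, tracepoint and breakpoint PMUs return elsewhere in this diff. User space only needs to test for zero before attempting a direct counter read (e.g. rdpmc with index - 1 on x86); a hedged check, reusing the mmap'ed page from the earlier sketch:

#include <linux/perf_event.h>

/* 1: the counter can be read directly from user space,
 * 0: fall back to read() on the event file descriptor */
static int counter_is_user_readable(const volatile struct perf_event_mmap_page *pc)
{
	return pc->index != 0;
}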
@@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
5530 if (!pmu->dev) 5690 if (!pmu->dev)
5531 goto out; 5691 goto out;
5532 5692
5693 pmu->dev->groups = pmu->attr_groups;
5533 device_initialize(pmu->dev); 5694 device_initialize(pmu->dev);
5534 ret = dev_set_name(pmu->dev, "%s", pmu->name); 5695 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5535 if (ret) 5696 if (ret)
@@ -5633,6 +5794,9 @@ got_cpu_context:
5633 pmu->pmu_disable = perf_pmu_nop_void; 5794 pmu->pmu_disable = perf_pmu_nop_void;
5634 } 5795 }
5635 5796
5797 if (!pmu->event_idx)
5798 pmu->event_idx = perf_event_idx_default;
5799
5636 list_add_rcu(&pmu->entry, &pmus); 5800 list_add_rcu(&pmu->entry, &pmus);
5637 ret = 0; 5801 ret = 0;
5638unlock: 5802unlock:
@@ -5825,7 +5989,7 @@ done:
5825 5989
5826 if (!event->parent) { 5990 if (!event->parent) {
5827 if (event->attach_state & PERF_ATTACH_TASK) 5991 if (event->attach_state & PERF_ATTACH_TASK)
5828 jump_label_inc(&perf_sched_events.key); 5992 static_key_slow_inc(&perf_sched_events.key);
5829 if (event->attr.mmap || event->attr.mmap_data) 5993 if (event->attr.mmap || event->attr.mmap_data)
5830 atomic_inc(&nr_mmap_events); 5994 atomic_inc(&nr_mmap_events);
5831 if (event->attr.comm) 5995 if (event->attr.comm)
@@ -5839,6 +6003,12 @@ done:
5839 return ERR_PTR(err); 6003 return ERR_PTR(err);
5840 } 6004 }
5841 } 6005 }
6006 if (has_branch_stack(event)) {
6007 static_key_slow_inc(&perf_sched_events.key);
6008 if (!(event->attach_state & PERF_ATTACH_TASK))
6009 atomic_inc(&per_cpu(perf_branch_stack_events,
6010 event->cpu));
6011 }
5842 } 6012 }
5843 6013
5844 return event; 6014 return event;
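The accounting added here is the mirror image of the teardown in free_event() earlier in this diff: every branch-stack event takes a perf_sched_events reference, and system-wide ones additionally bump the per-CPU counter consulted on context switch. A simplified user-space model of that paired bookkeeping (plain atomics in place of static keys and per-CPU variables; names are illustrative):

#include <stdatomic.h>

static atomic_int sched_events_key;		/* models perf_sched_events */
static atomic_int branch_stack_events;		/* models the per-cpu counter */

struct fake_event { int per_task; };

static void account_branch_event(const struct fake_event *ev)
{
	atomic_fetch_add(&sched_events_key, 1);
	if (!ev->per_task)			/* system-wide event */
		atomic_fetch_add(&branch_stack_events, 1);
}

/* must undo exactly what account_branch_event() did */
static void unaccount_branch_event(const struct fake_event *ev)
{
	atomic_fetch_sub(&sched_events_key, 1);
	if (!ev->per_task)
		atomic_fetch_sub(&branch_stack_events, 1);
}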
@@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
5908 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 6078 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
5909 return -EINVAL; 6079 return -EINVAL;
5910 6080
6081 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6082 u64 mask = attr->branch_sample_type;
6083
6084 /* only using defined bits */
6085 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6086 return -EINVAL;
6087
6088 /* at least one branch bit must be set */
6089 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6090 return -EINVAL;
6091
6092 /* kernel level capture: check permissions */
6093 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6094 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6095 return -EACCES;
6096
6097 /* propagate priv level, when not set for branch */
6098 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6099
6100 /* exclude_kernel checked on syscall entry */
6101 if (!attr->exclude_kernel)
6102 mask |= PERF_SAMPLE_BRANCH_KERNEL;
6103
6104 if (!attr->exclude_user)
6105 mask |= PERF_SAMPLE_BRANCH_USER;
6106
6107 if (!attr->exclude_hv)
6108 mask |= PERF_SAMPLE_BRANCH_HV;
6109 /*
6110 * adjust user setting (for HW filter setup)
6111 */
6112 attr->branch_sample_type = mask;
6113 }
6114 }
5911out: 6115out:
5912 return ret; 6116 return ret;
5913 6117
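The validation above rejects undefined branch_sample_type bits, insists on at least one non-privilege bit, and, when the caller left the privilege bits clear, copies them from exclude_user/exclude_kernel/exclude_hv so the hardware branch filter matches the event's own scope. A hedged user-space sketch of opening such an event (constants come from the uapi perf_event.h added by this series; error handling trimmed):

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_branch_sampling_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/* no privilege bits set: the kernel propagates them from exclude_* */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
	attr.exclude_kernel = 1;	/* so PERF_SAMPLE_BRANCH_KERNEL is not added */

	/* pid 0 = this task, cpu -1 = any cpu, no group, no flags */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}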
@@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
6063 * - that may need work on context switch 6267 * - that may need work on context switch
6064 */ 6268 */
6065 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6269 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6066 jump_label_inc(&perf_sched_events.key); 6270 static_key_slow_inc(&perf_sched_events.key);
6067 } 6271 }
6068 6272
6069 /* 6273 /*
@@ -6912,6 +7116,13 @@ void __init perf_event_init(void)
6912 7116
6913 /* do not patch jump label more than once per second */ 7117 /* do not patch jump label more than once per second */
6914 jump_label_rate_limit(&perf_sched_events, HZ); 7118 jump_label_rate_limit(&perf_sched_events, HZ);
7119
7120 /*
7121 * Build time assertion that we keep the data_head at the intended
7122 * location. IOW, validation we got the __reserved[] size right.
7123 */
7124 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
7125 != 1024);
6915} 7126}
6916 7127
6917static int __init perf_event_sysfs_init(void) 7128static int __init perf_event_sysfs_init(void)
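The BUILD_BUG_ON pins data_head at byte offset 1024 of struct perf_event_mmap_page, so a miscounted __reserved[] array breaks the build instead of silently shifting the ABI. The same idea in standalone C11, with a hypothetical layout and _Static_assert standing in for BUILD_BUG_ON:

#include <stddef.h>
#include <stdint.h>

struct mmap_page_model {			/* hypothetical layout, for illustration */
	uint32_t version;
	uint32_t compat_version;
	uint8_t  reserved[1024 - 8];		/* padding that keeps data_head at 1024 */
	uint64_t data_head;
};

/* compilation fails if anyone resizes reserved[] incorrectly */
_Static_assert(offsetof(struct mmap_page_model, data_head) == 1024,
	       "data_head must stay at byte offset 1024");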
@@ -6943,8 +7154,7 @@ unlock:
6943device_initcall(perf_event_sysfs_init); 7154device_initcall(perf_event_sysfs_init);
6944 7155
6945#ifdef CONFIG_CGROUP_PERF 7156#ifdef CONFIG_CGROUP_PERF
6946static struct cgroup_subsys_state *perf_cgroup_create( 7157static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
6947 struct cgroup_subsys *ss, struct cgroup *cont)
6948{ 7158{
6949 struct perf_cgroup *jc; 7159 struct perf_cgroup *jc;
6950 7160
@@ -6961,8 +7171,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
6961 return &jc->css; 7171 return &jc->css;
6962} 7172}
6963 7173
6964static void perf_cgroup_destroy(struct cgroup_subsys *ss, 7174static void perf_cgroup_destroy(struct cgroup *cont)
6965 struct cgroup *cont)
6966{ 7175{
6967 struct perf_cgroup *jc; 7176 struct perf_cgroup *jc;
6968 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7177 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -6978,8 +7187,7 @@ static int __perf_cgroup_move(void *info)
6978 return 0; 7187 return 0;
6979} 7188}
6980 7189
6981static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7190static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
6982 struct cgroup_taskset *tset)
6983{ 7191{
6984 struct task_struct *task; 7192 struct task_struct *task;
6985 7193
@@ -6987,8 +7195,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
6987 task_function_call(task, __perf_cgroup_move, task); 7195 task_function_call(task, __perf_cgroup_move, task);
6988} 7196}
6989 7197
6990static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7198static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
6991 struct cgroup *old_cgrp, struct task_struct *task) 7199 struct task_struct *task)
6992{ 7200{
6993 /* 7201 /*
6994 * cgroup_exit() is called in the copy_process() failure path. 7202 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38bf..bb38c4d3ee12 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
581 if (bp->attr.type != PERF_TYPE_BREAKPOINT) 581 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
582 return -ENOENT; 582 return -ENOENT;
583 583
584 /*
585 * no branch sampling for breakpoint events
586 */
587 if (has_branch_stack(bp))
588 return -EOPNOTSUPP;
589
584 err = register_perf_hw_breakpoint(bp); 590 err = register_perf_hw_breakpoint(bp);
585 if (err) 591 if (err)
586 return err; 592 return err;
@@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
613 bp->hw.state = PERF_HES_STOPPED; 619 bp->hw.state = PERF_HES_STOPPED;
614} 620}
615 621
622static int hw_breakpoint_event_idx(struct perf_event *bp)
623{
624 return 0;
625}
626
616static struct pmu perf_breakpoint = { 627static struct pmu perf_breakpoint = {
617 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 628 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
618 629
@@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = {
622 .start = hw_breakpoint_start, 633 .start = hw_breakpoint_start,
623 .stop = hw_breakpoint_stop, 634 .stop = hw_breakpoint_stop,
624 .read = hw_breakpoint_pmu_read, 635 .read = hw_breakpoint_pmu_read,
636
637 .event_idx = hw_breakpoint_event_idx,
625}; 638};
626 639
627int __init init_hw_breakpoint(void) 640int __init init_hw_breakpoint(void)
@@ -651,10 +664,10 @@ int __init init_hw_breakpoint(void)
651 664
652 err_alloc: 665 err_alloc:
653 for_each_possible_cpu(err_cpu) { 666 for_each_possible_cpu(err_cpu) {
654 if (err_cpu == cpu)
655 break;
656 for (i = 0; i < TYPE_MAX; i++) 667 for (i = 0; i < TYPE_MAX; i++)
657 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 668 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
669 if (err_cpu == cpu)
670 break;
658 } 671 }
659 672
660 return -ENOMEM; 673 return -ENOMEM;
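The err_alloc reordering above makes the cleanup loop run its kfree pass before testing for the CPU that failed, so allocations made before the failure point are released rather than leaked. The general rollback shape, as a self-contained sketch (slots that were never allocated are assumed to be NULL, so free() on them is a no-op):

#include <stdlib.h>

#define NTYPES 2

/* undo a partially completed per-cpu allocation: free everything up to and
 * including the cpu that failed, then stop */
static void rollback(void *bufs[][NTYPES], int ncpu, int failed_cpu)
{
	int cpu, i;

	for (cpu = 0; cpu < ncpu; cpu++) {
		for (i = 0; i < NTYPES; i++)
			free(bufs[cpu][i]);
		if (cpu == failed_cpu)		/* free first, then stop */
			break;
	}
}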
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b4042f9bc6a..d8bd3b425fa7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,7 @@
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h> 54#include <linux/writeback.h>
55#include <linux/shm.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/unistd.h> 58#include <asm/unistd.h>
@@ -424,7 +425,7 @@ void daemonize(const char *name, ...)
424 */ 425 */
425 exit_mm(current); 426 exit_mm(current);
426 /* 427 /*
427 * We don't want to have TIF_FREEZE set if the system-wide hibernation 428 * We don't want to get frozen, in case system-wide hibernation
428 * or suspend transition begins right now. 429 * or suspend transition begins right now.
429 */ 430 */
430 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 431 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
@@ -473,7 +474,7 @@ static void close_files(struct files_struct * files)
473 i = j * __NFDBITS; 474 i = j * __NFDBITS;
474 if (i >= fdt->max_fds) 475 if (i >= fdt->max_fds)
475 break; 476 break;
476 set = fdt->open_fds->fds_bits[j++]; 477 set = fdt->open_fds[j++];
477 while (set) { 478 while (set) {
478 if (set & 1) { 479 if (set & 1) {
479 struct file * file = xchg(&fdt->fd[i], NULL); 480 struct file * file = xchg(&fdt->fd[i], NULL);
@@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
686} 687}
687 688
688/* 689/*
689 * When we die, we re-parent all our children. 690 * When we die, we re-parent all our children, and try to:
690 * Try to give them to another thread in our thread 691 * 1. give them to another thread in our thread group, if such a member exists
691 * group, and if no such member exists, give it to 692 * 2. give it to the first ancestor process which prctl'd itself as a
692 * the child reaper process (ie "init") in our pid 693 * child_subreaper for its children (like a service manager)
693 * space. 694 * 3. give it to the init process (PID 1) in our pid namespace
694 */ 695 */
695static struct task_struct *find_new_reaper(struct task_struct *father) 696static struct task_struct *find_new_reaper(struct task_struct *father)
696 __releases(&tasklist_lock) 697 __releases(&tasklist_lock)
@@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
710 711
711 if (unlikely(pid_ns->child_reaper == father)) { 712 if (unlikely(pid_ns->child_reaper == father)) {
712 write_unlock_irq(&tasklist_lock); 713 write_unlock_irq(&tasklist_lock);
713 if (unlikely(pid_ns == &init_pid_ns)) 714 if (unlikely(pid_ns == &init_pid_ns)) {
714 panic("Attempted to kill init!"); 715 panic("Attempted to kill init! exitcode=0x%08x\n",
716 father->signal->group_exit_code ?:
717 father->exit_code);
718 }
715 719
716 zap_pid_ns_processes(pid_ns); 720 zap_pid_ns_processes(pid_ns);
717 write_lock_irq(&tasklist_lock); 721 write_lock_irq(&tasklist_lock);
@@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
721 * forget_original_parent() must move them somewhere. 725 * forget_original_parent() must move them somewhere.
722 */ 726 */
723 pid_ns->child_reaper = init_pid_ns.child_reaper; 727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper;
730
731 /*
732 * Find the first ancestor marked as child_subreaper.
733 * Note that the code below checks same_thread_group(reaper,
734 * pid_ns->child_reaper). This is what we need to DTRT in a
735 * PID namespace. However we still need the check above, see
736 * http://marc.info/?l=linux-kernel&m=131385460420380
737 */
738 for (reaper = father->real_parent;
739 reaper != &init_task;
740 reaper = reaper->real_parent) {
741 if (same_thread_group(reaper, pid_ns->child_reaper))
742 break;
743 if (!reaper->signal->is_child_subreaper)
744 continue;
745 thread = reaper;
746 do {
747 if (!(thread->flags & PF_EXITING))
748 return reaper;
749 } while_each_thread(reaper, thread);
750 }
724 } 751 }
725 752
726 return pid_ns->child_reaper; 753 return pid_ns->child_reaper;
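The new branch walks up real_parent looking for the nearest live ancestor that marked itself as a child subreaper, so a service manager can adopt its double-forked descendants instead of handing them to init. A hedged user-space use of the new prctl (the constant comes from the linux/prctl.h change in this series, hence the fallback define):

#include <stdio.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER	36	/* from this series' include/linux/prctl.h */
#endif

int main(void)
{
	int reaped = 0;

	prctl(PR_SET_CHILD_SUBREAPER, 1);	/* adopt orphaned descendants */

	if (fork() == 0) {			/* intermediate child */
		if (fork() == 0) {		/* grandchild */
			sleep(1);
			_exit(0);		/* by now reparented to us, not to init */
		}
		_exit(0);			/* orphans the grandchild */
	}

	while (wait(NULL) > 0)			/* collects the child, then the grandchild */
		reaped++;
	printf("reaped %d descendants\n", reaped);	/* 2 with the prctl, 1 without */
	return 0;
}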
@@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
818 if (group_dead) 845 if (group_dead)
819 kill_orphaned_pgrp(tsk->group_leader, NULL); 846 kill_orphaned_pgrp(tsk->group_leader, NULL);
820 847
821 /* Let father know we died
822 *
823 * Thread signals are configurable, but you aren't going to use
824 * that to send signals to arbitrary processes.
825 * That stops right now.
826 *
827 * If the parent exec id doesn't match the exec id we saved
828 * when we started then we know the parent has changed security
829 * domain.
830 *
831 * If our self_exec id doesn't match our parent_exec_id then
832 * we have changed execution domain as these two values started
833 * the same after a fork.
834 */
835 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
836 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
837 tsk->self_exec_id != tsk->parent_exec_id))
838 tsk->exit_signal = SIGCHLD;
839
840 if (unlikely(tsk->ptrace)) { 848 if (unlikely(tsk->ptrace)) {
841 int sig = thread_group_leader(tsk) && 849 int sig = thread_group_leader(tsk) &&
842 thread_group_empty(tsk) && 850 thread_group_empty(tsk) &&
@@ -935,8 +943,6 @@ void do_exit(long code)
935 schedule(); 943 schedule();
936 } 944 }
937 945
938 exit_irq_thread();
939
940 exit_signals(tsk); /* sets PF_EXITING */ 946 exit_signals(tsk); /* sets PF_EXITING */
941 /* 947 /*
942 * tsk->flags are checked in the futex code to protect against 948 * tsk->flags are checked in the futex code to protect against
@@ -945,6 +951,8 @@ void do_exit(long code)
945 smp_mb(); 951 smp_mb();
946 raw_spin_unlock_wait(&tsk->pi_lock); 952 raw_spin_unlock_wait(&tsk->pi_lock);
947 953
954 exit_irq_thread();
955
948 if (unlikely(in_atomic())) 956 if (unlikely(in_atomic()))
949 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 957 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
950 current->comm, task_pid_nr(current), 958 current->comm, task_pid_nr(current),
@@ -953,7 +961,7 @@ void do_exit(long code)
953 acct_update_integrals(tsk); 961 acct_update_integrals(tsk);
954 /* sync mm's RSS info before statistics gathering */ 962 /* sync mm's RSS info before statistics gathering */
955 if (tsk->mm) 963 if (tsk->mm)
956 sync_mm_rss(tsk, tsk->mm); 964 sync_mm_rss(tsk->mm);
957 group_dead = atomic_dec_and_test(&tsk->signal->live); 965 group_dead = atomic_dec_and_test(&tsk->signal->live);
958 if (group_dead) { 966 if (group_dead) {
959 hrtimer_cancel(&tsk->signal->real_timer); 967 hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index a1b632713e43..08eb8584e2a8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/user-return-notifier.h> 66#include <linux/user-return-notifier.h>
67#include <linux/oom.h> 67#include <linux/oom.h>
68#include <linux/khugepaged.h> 68#include <linux/khugepaged.h>
69#include <linux/signalfd.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -192,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
192 WARN_ON(atomic_read(&tsk->usage)); 193 WARN_ON(atomic_read(&tsk->usage));
193 WARN_ON(tsk == current); 194 WARN_ON(tsk == current);
194 195
196 security_task_free(tsk);
195 exit_creds(tsk); 197 exit_creds(tsk);
196 delayacct_tsk_free(tsk); 198 delayacct_tsk_free(tsk);
197 put_signal_struct(tsk->signal); 199 put_signal_struct(tsk->signal);
@@ -354,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
354 charge = 0; 356 charge = 0;
355 if (mpnt->vm_flags & VM_ACCOUNT) { 357 if (mpnt->vm_flags & VM_ACCOUNT) {
356 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 358 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
357 if (security_vm_enough_memory(len)) 359 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
358 goto fail_nomem; 360 goto fail_nomem;
359 charge = len; 361 charge = len;
360 } 362 }
@@ -510,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
510 return NULL; 512 return NULL;
511} 513}
512 514
515static void check_mm(struct mm_struct *mm)
516{
517 int i;
518
519 for (i = 0; i < NR_MM_COUNTERS; i++) {
520 long x = atomic_long_read(&mm->rss_stat.count[i]);
521
522 if (unlikely(x))
523 printk(KERN_ALERT "BUG: Bad rss-counter state "
524 "mm:%p idx:%d val:%ld\n", mm, i, x);
525 }
526
527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
528 VM_BUG_ON(mm->pmd_huge_pte);
529#endif
530}
531
513/* 532/*
514 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
515 */ 534 */
@@ -537,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
537 mm_free_pgd(mm); 556 mm_free_pgd(mm);
538 destroy_context(mm); 557 destroy_context(mm);
539 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
540#ifdef CONFIG_TRANSPARENT_HUGEPAGE 559 check_mm(mm);
541 VM_BUG_ON(mm->pmd_huge_pte);
542#endif
543 free_mm(mm); 560 free_mm(mm);
544} 561}
545EXPORT_SYMBOL_GPL(__mmdrop); 562EXPORT_SYMBOL_GPL(__mmdrop);
@@ -667,6 +684,38 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
667 return mm; 684 return mm;
668} 685}
669 686
687static void complete_vfork_done(struct task_struct *tsk)
688{
689 struct completion *vfork;
690
691 task_lock(tsk);
692 vfork = tsk->vfork_done;
693 if (likely(vfork)) {
694 tsk->vfork_done = NULL;
695 complete(vfork);
696 }
697 task_unlock(tsk);
698}
699
700static int wait_for_vfork_done(struct task_struct *child,
701 struct completion *vfork)
702{
703 int killed;
704
705 freezer_do_not_count();
706 killed = wait_for_completion_killable(vfork);
707 freezer_count();
708
709 if (killed) {
710 task_lock(child);
711 child->vfork_done = NULL;
712 task_unlock(child);
713 }
714
715 put_task_struct(child);
716 return killed;
717}
718
670/* Please note the differences between mmput and mm_release. 719/* Please note the differences between mmput and mm_release.
671 * mmput is called whenever we stop holding onto a mm_struct, 720 * mmput is called whenever we stop holding onto a mm_struct,
672 * error success whatever. 721 * error success whatever.
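complete_vfork_done()/wait_for_vfork_done() replace the open-coded completion so that a fatal signal can abort the parent's wait and detach vfork_done under task_lock() before the child ever completes it; the PTRACE_EVENT_VFORK_DONE notification is skipped in that case (see the do_fork() hunk below). The user-visible contract being preserved is the classic one (error handling trimmed):

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = vfork();

	if (pid == 0) {
		/* child: borrows the parent's mm, so it may only exec or _exit */
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}

	/* parent: was blocked until the child exec'd or exited (vfork_done) */
	waitpid(pid, NULL, 0);
	puts("child released the address space");
	return 0;
}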
@@ -682,8 +731,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
682 */ 731 */
683void mm_release(struct task_struct *tsk, struct mm_struct *mm) 732void mm_release(struct task_struct *tsk, struct mm_struct *mm)
684{ 733{
685 struct completion *vfork_done = tsk->vfork_done;
686
687 /* Get rid of any futexes when releasing the mm */ 734 /* Get rid of any futexes when releasing the mm */
688#ifdef CONFIG_FUTEX 735#ifdef CONFIG_FUTEX
689 if (unlikely(tsk->robust_list)) { 736 if (unlikely(tsk->robust_list)) {
@@ -703,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
703 /* Get rid of any cached register state */ 750 /* Get rid of any cached register state */
704 deactivate_mm(tsk, mm); 751 deactivate_mm(tsk, mm);
705 752
706 /* notify parent sleeping on vfork() */ 753 if (tsk->vfork_done)
707 if (vfork_done) { 754 complete_vfork_done(tsk);
708 tsk->vfork_done = NULL;
709 complete(vfork_done);
710 }
711 755
712 /* 756 /*
713 * If we're exiting normally, clear a user-space tid field if 757 * If we're exiting normally, clear a user-space tid field if
714 * requested. We leave this alone when dying by signal, to leave 758 * requested. We leave this alone when dying by signal, to leave
715 * the value intact in a core dump, and to save the unnecessary 759 * the value intact in a core dump, and to save the unnecessary
716 * trouble otherwise. Userland only wants this done for a sys_exit. 760 * trouble, say, a killed vfork parent shouldn't touch this mm.
761 * Userland only wants this done for a sys_exit.
717 */ 762 */
718 if (tsk->clear_child_tid) { 763 if (tsk->clear_child_tid) {
719 if (!(tsk->flags & PF_SIGNALED) && 764 if (!(tsk->flags & PF_SIGNALED) &&
@@ -934,8 +979,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
934 979
935void __cleanup_sighand(struct sighand_struct *sighand) 980void __cleanup_sighand(struct sighand_struct *sighand)
936{ 981{
937 if (atomic_dec_and_test(&sighand->count)) 982 if (atomic_dec_and_test(&sighand->count)) {
983 signalfd_cleanup(sighand);
938 kmem_cache_free(sighand_cachep, sighand); 984 kmem_cache_free(sighand_cachep, sighand);
985 }
939} 986}
940 987
941 988
@@ -1003,6 +1050,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1003 sig->oom_score_adj = current->signal->oom_score_adj; 1050 sig->oom_score_adj = current->signal->oom_score_adj;
1004 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1051 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1005 1052
1053 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1054 current->signal->is_child_subreaper;
1055
1006 mutex_init(&sig->cred_guard_mutex); 1056 mutex_init(&sig->cred_guard_mutex);
1007 1057
1008 return 0; 1058 return 0;
@@ -1014,7 +1064,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1014 1064
1015 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1065 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1016 new_flags |= PF_FORKNOEXEC; 1066 new_flags |= PF_FORKNOEXEC;
1017 new_flags |= PF_STARTING;
1018 p->flags = new_flags; 1067 p->flags = new_flags;
1019} 1068}
1020 1069
@@ -1191,6 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1191#ifdef CONFIG_CPUSETS 1240#ifdef CONFIG_CPUSETS
1192 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1241 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1193 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1242 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1243 seqcount_init(&p->mems_allowed_seq);
1194#endif 1244#endif
1195#ifdef CONFIG_TRACE_IRQFLAGS 1245#ifdef CONFIG_TRACE_IRQFLAGS
1196 p->irq_events = 0; 1246 p->irq_events = 0;
@@ -1309,7 +1359,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1309 clear_all_latency_tracing(p); 1359 clear_all_latency_tracing(p);
1310 1360
1311 /* ok, now we should be set up.. */ 1361 /* ok, now we should be set up.. */
1312 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1362 if (clone_flags & CLONE_THREAD)
1363 p->exit_signal = -1;
1364 else if (clone_flags & CLONE_PARENT)
1365 p->exit_signal = current->group_leader->exit_signal;
1366 else
1367 p->exit_signal = (clone_flags & CSIGNAL);
1368
1313 p->pdeath_signal = 0; 1369 p->pdeath_signal = 0;
1314 p->exit_state = 0; 1370 p->exit_state = 0;
1315 1371
@@ -1544,16 +1600,9 @@ long do_fork(unsigned long clone_flags,
1544 if (clone_flags & CLONE_VFORK) { 1600 if (clone_flags & CLONE_VFORK) {
1545 p->vfork_done = &vfork; 1601 p->vfork_done = &vfork;
1546 init_completion(&vfork); 1602 init_completion(&vfork);
1603 get_task_struct(p);
1547 } 1604 }
1548 1605
1549 /*
1550 * We set PF_STARTING at creation in case tracing wants to
1551 * use this to distinguish a fully live task from one that
1552 * hasn't finished SIGSTOP raising yet. Now we clear it
1553 * and set the child going.
1554 */
1555 p->flags &= ~PF_STARTING;
1556
1557 wake_up_new_task(p); 1606 wake_up_new_task(p);
1558 1607
1559 /* forking complete and child started to run, tell ptracer */ 1608 /* forking complete and child started to run, tell ptracer */
@@ -1561,10 +1610,8 @@ long do_fork(unsigned long clone_flags,
1561 ptrace_event(trace, nr); 1610 ptrace_event(trace, nr);
1562 1611
1563 if (clone_flags & CLONE_VFORK) { 1612 if (clone_flags & CLONE_VFORK) {
1564 freezer_do_not_count(); 1613 if (!wait_for_vfork_done(p, &vfork))
1565 wait_for_completion(&vfork); 1614 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1566 freezer_count();
1567 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1568 } 1615 }
1569 } else { 1616 } else {
1570 nr = PTR_ERR(p); 1617 nr = PTR_ERR(p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed5..11f82a4d4eae 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)
99 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
100 * @p: task to send the request to 100 * @p: task to send the request to
101 * 101 *
102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE 102 * If @p is freezing, the freeze request is sent either by sending a fake
103 * flag and either sending a fake signal to it or waking it up, depending 103 * signal (if it's not a kernel thread) or waking it up (if it's a kernel
104 * on whether it has %PF_FREEZER_NOSIG set. 104 * thread).
105 * 105 *
106 * RETURNS: 106 * RETURNS:
107 * %false, if @p is not freezing or already frozen; %true, otherwise 107 * %false, if @p is not freezing or already frozen; %true, otherwise
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173d..e2b0fb9a0b3b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,6 +59,7 @@
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h>
62 63
63#include <asm/futex.h> 64#include <asm/futex.h>
64 65
@@ -2443,40 +2444,31 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2443{ 2444{
2444 struct robust_list_head __user *head; 2445 struct robust_list_head __user *head;
2445 unsigned long ret; 2446 unsigned long ret;
2446 const struct cred *cred = current_cred(), *pcred; 2447 struct task_struct *p;
2447 2448
2448 if (!futex_cmpxchg_enabled) 2449 if (!futex_cmpxchg_enabled)
2449 return -ENOSYS; 2450 return -ENOSYS;
2450 2451
2452 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2453
2454 rcu_read_lock();
2455
2456 ret = -ESRCH;
2451 if (!pid) 2457 if (!pid)
2452 head = current->robust_list; 2458 p = current;
2453 else { 2459 else {
2454 struct task_struct *p;
2455
2456 ret = -ESRCH;
2457 rcu_read_lock();
2458 p = find_task_by_vpid(pid); 2460 p = find_task_by_vpid(pid);
2459 if (!p) 2461 if (!p)
2460 goto err_unlock; 2462 goto err_unlock;
2461 ret = -EPERM;
2462 pcred = __task_cred(p);
2463 /* If victim is in different user_ns, then uids are not
2464 comparable, so we must have CAP_SYS_PTRACE */
2465 if (cred->user->user_ns != pcred->user->user_ns) {
2466 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2467 goto err_unlock;
2468 goto ok;
2469 }
2470 /* If victim is in same user_ns, then uids are comparable */
2471 if (cred->euid != pcred->euid &&
2472 cred->euid != pcred->uid &&
2473 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2474 goto err_unlock;
2475ok:
2476 head = p->robust_list;
2477 rcu_read_unlock();
2478 } 2463 }
2479 2464
2465 ret = -EPERM;
2466 if (!ptrace_may_access(p, PTRACE_MODE_READ))
2467 goto err_unlock;
2468
2469 head = p->robust_list;
2470 rcu_read_unlock();
2471
2480 if (put_user(sizeof(*head), len_ptr)) 2472 if (put_user(sizeof(*head), len_ptr))
2481 return -EFAULT; 2473 return -EFAULT;
2482 return put_user(head, head_ptr); 2474 return put_user(head, head_ptr);
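Replacing the open-coded euid/user_ns comparison with ptrace_may_access(p, PTRACE_MODE_READ) makes sys_get_robust_list apply the same policy as the other "read another task's state" interfaces, and the WARN_ONCE flags the call as deprecated. From user space the visible effect is simply EPERM when the target is not ptrace-accessible; a hedged probe, assuming the toolchain headers expose SYS_get_robust_list:

#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* 0 if we may read @pid's robust futex list head, -1 otherwise */
static int probe_robust_list(pid_t pid)
{
	void *head;
	size_t len;

	if (syscall(SYS_get_robust_list, pid, &head, &len) == 0)
		return 0;
	if (errno == EPERM)
		fprintf(stderr, "pid %d: not ptrace-accessible\n", (int)pid);
	return -1;
}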
@@ -2628,7 +2620,7 @@ void exit_robust_list(struct task_struct *curr)
2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2620long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2629 u32 __user *uaddr2, u32 val2, u32 val3) 2621 u32 __user *uaddr2, u32 val2, u32 val3)
2630{ 2622{
2631 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; 2623 int cmd = op & FUTEX_CMD_MASK;
2632 unsigned int flags = 0; 2624 unsigned int flags = 0;
2633 2625
2634 if (!(op & FUTEX_PRIVATE_FLAG)) 2626 if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2633,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2641 } 2633 }
2642 2634
2643 switch (cmd) { 2635 switch (cmd) {
2636 case FUTEX_LOCK_PI:
2637 case FUTEX_UNLOCK_PI:
2638 case FUTEX_TRYLOCK_PI:
2639 case FUTEX_WAIT_REQUEUE_PI:
2640 case FUTEX_CMP_REQUEUE_PI:
2641 if (!futex_cmpxchg_enabled)
2642 return -ENOSYS;
2643 }
2644
2645 switch (cmd) {
2644 case FUTEX_WAIT: 2646 case FUTEX_WAIT:
2645 val3 = FUTEX_BITSET_MATCH_ANY; 2647 val3 = FUTEX_BITSET_MATCH_ANY;
2646 case FUTEX_WAIT_BITSET: 2648 case FUTEX_WAIT_BITSET:
2647 ret = futex_wait(uaddr, flags, val, timeout, val3); 2649 return futex_wait(uaddr, flags, val, timeout, val3);
2648 break;
2649 case FUTEX_WAKE: 2650 case FUTEX_WAKE:
2650 val3 = FUTEX_BITSET_MATCH_ANY; 2651 val3 = FUTEX_BITSET_MATCH_ANY;
2651 case FUTEX_WAKE_BITSET: 2652 case FUTEX_WAKE_BITSET:
2652 ret = futex_wake(uaddr, flags, val, val3); 2653 return futex_wake(uaddr, flags, val, val3);
2653 break;
2654 case FUTEX_REQUEUE: 2654 case FUTEX_REQUEUE:
2655 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 2655 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656 break;
2657 case FUTEX_CMP_REQUEUE: 2656 case FUTEX_CMP_REQUEUE:
2658 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 2657 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2659 break;
2660 case FUTEX_WAKE_OP: 2658 case FUTEX_WAKE_OP:
2661 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2659 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2662 break;
2663 case FUTEX_LOCK_PI: 2660 case FUTEX_LOCK_PI:
2664 if (futex_cmpxchg_enabled) 2661 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2665 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2666 break;
2667 case FUTEX_UNLOCK_PI: 2662 case FUTEX_UNLOCK_PI:
2668 if (futex_cmpxchg_enabled) 2663 return futex_unlock_pi(uaddr, flags);
2669 ret = futex_unlock_pi(uaddr, flags);
2670 break;
2671 case FUTEX_TRYLOCK_PI: 2664 case FUTEX_TRYLOCK_PI:
2672 if (futex_cmpxchg_enabled) 2665 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2673 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2674 break;
2675 case FUTEX_WAIT_REQUEUE_PI: 2666 case FUTEX_WAIT_REQUEUE_PI:
2676 val3 = FUTEX_BITSET_MATCH_ANY; 2667 val3 = FUTEX_BITSET_MATCH_ANY;
2677 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2668 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2678 uaddr2); 2669 uaddr2);
2679 break;
2680 case FUTEX_CMP_REQUEUE_PI: 2670 case FUTEX_CMP_REQUEUE_PI:
2681 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 2671 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2682 break;
2683 default:
2684 ret = -ENOSYS;
2685 } 2672 }
2686 return ret; 2673 return -ENOSYS;
2687} 2674}
2688 2675
2689 2676
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 5f9e689dc8f0..83e368b005fc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,6 +10,7 @@
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h>
13 14
14#include <asm/uaccess.h> 15#include <asm/uaccess.h>
15 16
@@ -136,40 +137,31 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
136{ 137{
137 struct compat_robust_list_head __user *head; 138 struct compat_robust_list_head __user *head;
138 unsigned long ret; 139 unsigned long ret;
139 const struct cred *cred = current_cred(), *pcred; 140 struct task_struct *p;
140 141
141 if (!futex_cmpxchg_enabled) 142 if (!futex_cmpxchg_enabled)
142 return -ENOSYS; 143 return -ENOSYS;
143 144
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock();
148
149 ret = -ESRCH;
144 if (!pid) 150 if (!pid)
145 head = current->compat_robust_list; 151 p = current;
146 else { 152 else {
147 struct task_struct *p;
148
149 ret = -ESRCH;
150 rcu_read_lock();
151 p = find_task_by_vpid(pid); 153 p = find_task_by_vpid(pid);
152 if (!p) 154 if (!p)
153 goto err_unlock; 155 goto err_unlock;
154 ret = -EPERM;
155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
164 if (cred->euid != pcred->euid &&
165 cred->euid != pcred->uid &&
166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
167 goto err_unlock;
168ok:
169 head = p->compat_robust_list;
170 rcu_read_unlock();
171 } 156 }
172 157
158 ret = -EPERM;
159 if (!ptrace_may_access(p, PTRACE_MODE_READ))
160 goto err_unlock;
161
162 head = p->compat_robust_list;
163 rcu_read_unlock();
164
173 if (put_user(sizeof(*head), len_ptr)) 165 if (put_user(sizeof(*head), len_ptr))
174 return -EFAULT; 166 return -EFAULT;
175 return put_user(ptr_to_compat(head), head_ptr); 167 return put_user(ptr_to_compat(head), head_ptr);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 2e48ec0c2e91..c21449f85a2a 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -119,15 +119,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
120 * to exit the grace period. For classic RCU, a reschedule is required. 120 * to exit the grace period. For classic RCU, a reschedule is required.
121 */ 121 */
122static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 122static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
123{ 123{
124 bool can_cont;
125
124 get_task_struct(g); 126 get_task_struct(g);
125 get_task_struct(t); 127 get_task_struct(t);
126 rcu_read_unlock(); 128 rcu_read_unlock();
127 cond_resched(); 129 cond_resched();
128 rcu_read_lock(); 130 rcu_read_lock();
131 can_cont = pid_alive(g) && pid_alive(t);
129 put_task_struct(t); 132 put_task_struct(t);
130 put_task_struct(g); 133 put_task_struct(g);
134
135 return can_cont;
131} 136}
132 137
133/* 138/*
@@ -154,9 +159,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
154 goto unlock; 159 goto unlock;
155 if (!--batch_count) { 160 if (!--batch_count) {
156 batch_count = HUNG_TASK_BATCHING; 161 batch_count = HUNG_TASK_BATCHING;
157 rcu_lock_break(g, t); 162 if (!rcu_lock_break(g, t))
158 /* Exit if t or g was unhashed during refresh. */
159 if (t->state == TASK_DEAD || g->state == TASK_DEAD)
160 goto unlock; 163 goto unlock;
161 } 164 }
162 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ 165 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
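rcu_lock_break() now tells its caller whether both tasks survived the rcu_read_unlock()/rcu_read_lock() window, using pid_alive() instead of the weaker TASK_DEAD state check at the call site. The general "drop the lock, revalidate before continuing" shape, sketched with a mutex standing in for RCU:

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

/* briefly release @lock so others can make progress, then report whether the
 * objects the caller was iterating are still valid */
static bool lock_break(pthread_mutex_t *lock, bool (*still_valid)(void))
{
	pthread_mutex_unlock(lock);
	sched_yield();			/* rough analogue of cond_resched() */
	pthread_mutex_lock(lock);
	return still_valid();		/* caller bails out if this returns false */
}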
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5a38bf4de641..cf1a4a68ce44 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS
13# Options selectable by the architecture code 13# Options selectable by the architecture code
14 14
15# Make sparse irq Kconfig switch below available 15# Make sparse irq Kconfig switch below available
16config HAVE_SPARSE_IRQ 16config MAY_HAVE_SPARSE_IRQ
17 bool 17 bool
18 18
19# Enable the generic irq autoprobe mechanism 19# Enable the generic irq autoprobe mechanism
@@ -56,13 +56,22 @@ config GENERIC_IRQ_CHIP
56config IRQ_DOMAIN 56config IRQ_DOMAIN
57 bool 57 bool
58 58
59config IRQ_DOMAIN_DEBUG
60 bool "Expose hardware/virtual IRQ mapping via debugfs"
61 depends on IRQ_DOMAIN && DEBUG_FS
62 help
63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "virq_mapping".
66
67 If you don't know what this means you don't need it.
68
59# Support forced irq threading 69# Support forced irq threading
60config IRQ_FORCED_THREADING 70config IRQ_FORCED_THREADING
61 bool 71 bool
62 72
63config SPARSE_IRQ 73config SPARSE_IRQ
64 bool "Support sparse irq numbering" 74 bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
65 depends on HAVE_SPARSE_IRQ
66 ---help--- 75 ---help---
67 76
68 Sparse irq numbering is useful for distro kernels that want 77 Sparse irq numbering is useful for distro kernels that want
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 342d8f44e401..0119b9d467ae 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
53 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
54 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
55 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
56 irq_startup(desc); 56 irq_startup(desc, false);
57 } 57 }
58 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
59 } 59 }
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
70 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
71 if (!desc->action && irq_settings_can_probe(desc)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
73 if (irq_startup(desc)) 73 if (irq_startup(desc, false))
74 desc->istate |= IRQS_PENDING; 74 desc->istate |= IRQS_PENDING;
75 } 75 }
76 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f7c543a801d9..6080f6bc8c33 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -16,6 +16,8 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18 18
19#include <trace/events/irq.h>
20
19#include "internals.h" 21#include "internals.h"
20 22
21/** 23/**
@@ -61,8 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
61 return -EINVAL; 63 return -EINVAL;
62 64
63 type &= IRQ_TYPE_SENSE_MASK; 65 type &= IRQ_TYPE_SENSE_MASK;
64 if (type != IRQ_TYPE_NONE) 66 ret = __irq_set_trigger(desc, irq, type);
65 ret = __irq_set_trigger(desc, irq, type);
66 irq_put_desc_busunlock(desc, flags); 67 irq_put_desc_busunlock(desc, flags);
67 return ret; 68 return ret;
68} 69}
@@ -157,19 +158,22 @@ static void irq_state_set_masked(struct irq_desc *desc)
157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); 158 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
158} 159}
159 160
160int irq_startup(struct irq_desc *desc) 161int irq_startup(struct irq_desc *desc, bool resend)
161{ 162{
163 int ret = 0;
164
162 irq_state_clr_disabled(desc); 165 irq_state_clr_disabled(desc);
163 desc->depth = 0; 166 desc->depth = 0;
164 167
165 if (desc->irq_data.chip->irq_startup) { 168 if (desc->irq_data.chip->irq_startup) {
166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 169 ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
167 irq_state_clr_masked(desc); 170 irq_state_clr_masked(desc);
168 return ret; 171 } else {
172 irq_enable(desc);
169 } 173 }
170 174 if (resend)
171 irq_enable(desc); 175 check_irq_resend(desc, desc->irq_data.irq);
172 return 0; 176 return ret;
173} 177}
174 178
175void irq_shutdown(struct irq_desc *desc) 179void irq_shutdown(struct irq_desc *desc)
@@ -330,6 +334,24 @@ out_unlock:
330} 334}
331EXPORT_SYMBOL_GPL(handle_simple_irq); 335EXPORT_SYMBOL_GPL(handle_simple_irq);
332 336
337/*
338 * Called unconditionally from handle_level_irq() and only for oneshot
339 * interrupts from handle_fasteoi_irq()
340 */
341static void cond_unmask_irq(struct irq_desc *desc)
342{
343 /*
344 * We need to unmask in the following cases:
345 * - Standard level irq (IRQF_ONESHOT is not set)
346 * - Oneshot irq which did not wake the thread (caused by a
347 * spurious interrupt or a primary handler handling it
348 * completely).
349 */
350 if (!irqd_irq_disabled(&desc->irq_data) &&
351 irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
352 unmask_irq(desc);
353}
354
333/** 355/**
334 * handle_level_irq - Level type irq handler 356 * handle_level_irq - Level type irq handler
335 * @irq: the interrupt number 357 * @irq: the interrupt number
@@ -362,8 +384,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
362 384
363 handle_irq_event(desc); 385 handle_irq_event(desc);
364 386
365 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) 387 cond_unmask_irq(desc);
366 unmask_irq(desc); 388
367out_unlock: 389out_unlock:
368 raw_spin_unlock(&desc->lock); 390 raw_spin_unlock(&desc->lock);
369} 391}
@@ -417,6 +439,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
417 preflow_handler(desc); 439 preflow_handler(desc);
418 handle_irq_event(desc); 440 handle_irq_event(desc);
419 441
442 if (desc->istate & IRQS_ONESHOT)
443 cond_unmask_irq(desc);
444
420out_eoi: 445out_eoi:
421 desc->irq_data.chip->irq_eoi(&desc->irq_data); 446 desc->irq_data.chip->irq_eoi(&desc->irq_data);
422out_unlock: 447out_unlock:
@@ -625,7 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
625 irq_settings_set_noprobe(desc); 650 irq_settings_set_noprobe(desc);
626 irq_settings_set_norequest(desc); 651 irq_settings_set_norequest(desc);
627 irq_settings_set_nothread(desc); 652 irq_settings_set_nothread(desc);
628 irq_startup(desc); 653 irq_startup(desc, true);
629 } 654 }
630out: 655out:
631 irq_put_desc_busunlock(desc, flags); 656 irq_put_desc_busunlock(desc, flags);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 470d08c82bbe..bdb180325551 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,14 +54,18 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55{ 55{
56 /* 56 /*
57 * Wake up the handler thread for this action. In case the 57 * In case the thread crashed and was killed we just pretend that
58 * thread crashed and was killed we just pretend that we 58 * we handled the interrupt. The hardirq handler has disabled the
59 * handled the interrupt. The hardirq handler has disabled the 59 * device interrupt, so no irq storm is lurking.
60 * device interrupt, so no irq storm is lurking. If the 60 */
61 if (action->thread->flags & PF_EXITING)
62 return;
63
64 /*
65 * Wake up the handler thread for this action. If the
61 * RUNTHREAD bit is already set, nothing to do. 66 * RUNTHREAD bit is already set, nothing to do.
62 */ 67 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) || 68 if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return; 69 return;
66 70
67 /* 71 /*
@@ -110,6 +114,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
110 * threads_oneshot untouched and runs the thread another time. 114 * threads_oneshot untouched and runs the thread another time.
111 */ 115 */
112 desc->threads_oneshot |= action->thread_mask; 116 desc->threads_oneshot |= action->thread_mask;
117
118 /*
119 * We increment the threads_active counter in case we wake up
120 * the irq thread. The irq thread decrements the counter when
121 * it returns from the handler or in the exit path and wakes
122 * up waiters which are stuck in synchronize_irq() when the
123 * active count becomes zero. synchronize_irq() is serialized
124 * against this code (hard irq handler) via IRQS_INPROGRESS
125 * like the finalize_oneshot() code. See comment above.
126 */
127 atomic_inc(&desc->threads_active);
128
113 wake_up_process(action->thread); 129 wake_up_process(action->thread);
114} 130}
115 131
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b7952316016a..8e5c56b3b7d9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,14 +20,12 @@ extern bool noirqdebug;
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed 23 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity 24 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded 25 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */ 26 */
28enum { 27enum {
29 IRQTF_RUNTHREAD, 28 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED, 29 IRQTF_WARNED,
32 IRQTF_AFFINITY, 30 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD, 31 IRQTF_FORCED_THREAD,
@@ -67,7 +65,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 65extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 66extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
69 67
70extern int irq_startup(struct irq_desc *desc); 68extern int irq_startup(struct irq_desc *desc, bool resend);
71extern void irq_shutdown(struct irq_desc *desc); 69extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 70extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 71extern void irq_disable(struct irq_desc *desc);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f9e26526b69..3601f3fbf67c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,189 +1,793 @@
1#include <linux/debugfs.h>
2#include <linux/hardirq.h>
3#include <linux/interrupt.h>
1#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/irqdesc.h>
2#include <linux/irqdomain.h> 6#include <linux/irqdomain.h>
3#include <linux/module.h> 7#include <linux/module.h>
4#include <linux/mutex.h> 8#include <linux/mutex.h>
5#include <linux/of.h> 9#include <linux/of.h>
6#include <linux/of_address.h> 10#include <linux/of_address.h>
11#include <linux/seq_file.h>
7#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
15
16#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
17 * ie. legacy 8259, gets irqs 1..15 */
18#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
19#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
20#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
8 21
9static LIST_HEAD(irq_domain_list); 22static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex); 23static DEFINE_MUTEX(irq_domain_mutex);
11 24
25static DEFINE_MUTEX(revmap_trees_mutex);
26static unsigned int irq_virq_count = NR_IRQS;
27static struct irq_domain *irq_default_domain;
28
12/** 29/**
13 * irq_domain_add() - Register an irq_domain 30 * irq_domain_alloc() - Allocate a new irq_domain data structure
14 * @domain: ptr to initialized irq_domain structure 31 * @of_node: optional device-tree node of the interrupt controller
32 * @revmap_type: type of reverse mapping to use
33 * @ops: map/unmap domain callbacks
34 * @host_data: Controller private data pointer
15 * 35 *
 16 * Registers an irq_domain structure. The irq_domain must at a minimum be 36 * Allocates and initializes an irq_domain structure. Caller is expected to
17 * initialized with an ops structure pointer, and either a ->to_irq hook or 37 * register allocated irq_domain with irq_domain_register(). Returns pointer
18 * a valid irq_base value. Everything else is optional. 38 * to IRQ domain, or NULL on failure.
19 */ 39 */
20void irq_domain_add(struct irq_domain *domain) 40static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
41 unsigned int revmap_type,
42 const struct irq_domain_ops *ops,
43 void *host_data)
21{ 44{
22 struct irq_data *d; 45 struct irq_domain *domain;
23 int hwirq, irq;
24 46
25 /* 47 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
26 * This assumes that the irq_domain owner has already allocated 48 if (WARN_ON(!domain))
27 * the irq_descs. This block will be removed when support for dynamic 49 return NULL;
28 * allocation of irq_descs is added to irq_domain. 50
29 */ 51 /* Fill structure */
30 irq_domain_for_each_irq(domain, hwirq, irq) { 52 domain->revmap_type = revmap_type;
31 d = irq_get_irq_data(irq); 53 domain->ops = ops;
32 if (!d) { 54 domain->host_data = host_data;
33 WARN(1, "error: assigning domain to non existant irq_desc"); 55 domain->of_node = of_node_get(of_node);
34 return; 56
35 } 57 return domain;
36 if (d->domain) { 58}
37 /* things are broken; just report, don't clean up */ 59
38 WARN(1, "error: irq_desc already assigned to a domain"); 60static void irq_domain_add(struct irq_domain *domain)
39 return; 61{
62 mutex_lock(&irq_domain_mutex);
63 list_add(&domain->link, &irq_domain_list);
64 mutex_unlock(&irq_domain_mutex);
65 pr_debug("irq: Allocated domain of type %d @0x%p\n",
66 domain->revmap_type, domain);
67}
68
69static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
70 irq_hw_number_t hwirq)
71{
72 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
73 int size = domain->revmap_data.legacy.size;
74
75 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
76 return 0;
77 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
78}
79
80/**
81 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
82 * @of_node: pointer to interrupt controller's device tree node.
83 * @size: total number of irqs in legacy mapping
 84 * @first_irq: first linux irq number of the block assigned to the domain
85 * @first_hwirq: first hwirq number to use for the translation. Should normally
86 * be '0', but a positive integer can be used if the effective
 87 * hwirq numbering does not begin at zero.
88 * @ops: map/unmap domain callbacks
89 * @host_data: Controller private data pointer
90 *
91 * Note: the map() callback will be called before this function returns
92 * for all legacy interrupts except 0 (which is always the invalid irq for
93 * a legacy controller).
94 */
95struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
96 unsigned int size,
97 unsigned int first_irq,
98 irq_hw_number_t first_hwirq,
99 const struct irq_domain_ops *ops,
100 void *host_data)
101{
102 struct irq_domain *domain;
103 unsigned int i;
104
105 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
106 if (!domain)
107 return NULL;
108
109 domain->revmap_data.legacy.first_irq = first_irq;
110 domain->revmap_data.legacy.first_hwirq = first_hwirq;
111 domain->revmap_data.legacy.size = size;
112
113 mutex_lock(&irq_domain_mutex);
114 /* Verify that all the irqs are available */
115 for (i = 0; i < size; i++) {
116 int irq = first_irq + i;
117 struct irq_data *irq_data = irq_get_irq_data(irq);
118
119 if (WARN_ON(!irq_data || irq_data->domain)) {
120 mutex_unlock(&irq_domain_mutex);
121 of_node_put(domain->of_node);
122 kfree(domain);
123 return NULL;
40 } 124 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 } 125 }
44 126
45 mutex_lock(&irq_domain_mutex); 127 /* Claim all of the irqs before registering a legacy domain */
46 list_add(&domain->list, &irq_domain_list); 128 for (i = 0; i < size; i++) {
129 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
130 irq_data->hwirq = first_hwirq + i;
131 irq_data->domain = domain;
132 }
47 mutex_unlock(&irq_domain_mutex); 133 mutex_unlock(&irq_domain_mutex);
134
135 for (i = 0; i < size; i++) {
136 int irq = first_irq + i;
137 int hwirq = first_hwirq + i;
138
139 /* IRQ0 gets ignored */
140 if (!irq)
141 continue;
142
143 /* Legacy flags are left to default at this point,
144 * one can then use irq_create_mapping() to
145 * explicitly change them
146 */
147 ops->map(domain, irq, hwirq);
148
149 /* Clear norequest flags */
150 irq_clear_status_flags(irq, IRQ_NOREQUEST);
151 }
152
153 irq_domain_add(domain);
154 return domain;
155}
156
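The legacy registration path above is new in this merge; a hypothetical i8259-style driver adopting it might look like the sketch below. All my_* identifiers are invented; only irq_domain_add_legacy(), irq_domain_xlate_onecell() and the ops layout come from this patch.

#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int my_pic_map(struct irq_domain *d, unsigned int virq,
                      irq_hw_number_t hwirq)
{
        /* a real driver would set its chip and flow handler for virq here */
        return 0;
}

static const struct irq_domain_ops my_pic_ops = {
        .map   = my_pic_map,
        .xlate = irq_domain_xlate_onecell,
};

static struct irq_domain *my_pic_domain;

static void __init my_pic_init(struct device_node *np)
{
        /* 16 pre-allocated descs, linux irqs 0..15 <-> hwirqs 0..15;
         * irq 0 is skipped by the core, as the kerneldoc above notes. */
        my_pic_domain = irq_domain_add_legacy(np, 16, 0, 0,
                                              &my_pic_ops, NULL);
        WARN_ON(!my_pic_domain);
}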
157/**
 158 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
159 * @of_node: pointer to interrupt controller's device tree node.
160 * @ops: map/unmap domain callbacks
161 * @host_data: Controller private data pointer
162 */
163struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
164 unsigned int size,
165 const struct irq_domain_ops *ops,
166 void *host_data)
167{
168 struct irq_domain *domain;
169 unsigned int *revmap;
170
171 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL);
172 if (WARN_ON(!revmap))
173 return NULL;
174
175 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
176 if (!domain) {
177 kfree(revmap);
178 return NULL;
179 }
180 domain->revmap_data.linear.size = size;
181 domain->revmap_data.linear.revmap = revmap;
182 irq_domain_add(domain);
183 return domain;
184}
185
186struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
187 const struct irq_domain_ops *ops,
188 void *host_data)
189{
190 struct irq_domain *domain = irq_domain_alloc(of_node,
191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
192 if (domain)
193 irq_domain_add(domain);
194 return domain;
195}
196
197/**
 198 * irq_domain_add_tree() - Allocate and register a radix tree revmap irq_domain.
199 * @of_node: pointer to interrupt controller's device tree node.
200 * @ops: map/unmap domain callbacks
201 *
202 * Note: The radix tree will be allocated later during boot automatically
203 * (the reverse mapping will use the slow path until that happens).
204 */
205struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
206 const struct irq_domain_ops *ops,
207 void *host_data)
208{
209 struct irq_domain *domain = irq_domain_alloc(of_node,
210 IRQ_DOMAIN_MAP_TREE, ops, host_data);
211 if (domain) {
212 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
213 irq_domain_add(domain);
214 }
215 return domain;
48} 216}
49 217
50/** 218/**
51 * irq_domain_del() - Unregister an irq_domain 219 * irq_find_host() - Locates a domain for a given device node
52 * @domain: ptr to registered irq_domain. 220 * @node: device-tree node of the interrupt controller
53 */ 221 */
54void irq_domain_del(struct irq_domain *domain) 222struct irq_domain *irq_find_host(struct device_node *node)
55{ 223{
56 struct irq_data *d; 224 struct irq_domain *h, *found = NULL;
57 int hwirq, irq; 225 int rc;
58 226
227 /* We might want to match the legacy controller last since
228 * it might potentially be set to match all interrupts in
 229	 * the absence of a device node. This isn't a problem so far,
 230	 * though...
231 */
59 mutex_lock(&irq_domain_mutex); 232 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 233 list_for_each_entry(h, &irq_domain_list, link) {
234 if (h->ops->match)
235 rc = h->ops->match(h, node);
236 else
237 rc = (h->of_node != NULL) && (h->of_node == node);
238
239 if (rc) {
240 found = h;
241 break;
242 }
243 }
61 mutex_unlock(&irq_domain_mutex); 244 mutex_unlock(&irq_domain_mutex);
245 return found;
246}
247EXPORT_SYMBOL_GPL(irq_find_host);
248
249/**
250 * irq_set_default_host() - Set a "default" irq domain
251 * @domain: default domain pointer
252 *
253 * For convenience, it's possible to set a "default" domain that will be used
254 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
255 * platforms that want to manipulate a few hard coded interrupt numbers that
256 * aren't properly represented in the device-tree.
257 */
258void irq_set_default_host(struct irq_domain *domain)
259{
260 pr_debug("irq: Default domain set to @0x%p\n", domain);
261
262 irq_default_domain = domain;
263}
264
265/**
266 * irq_set_virq_count() - Set the maximum number of linux irqs
267 * @count: number of linux irqs, capped with NR_IRQS
268 *
 269 * This is mainly for use by platforms like iSeries that want to program
270 * the virtual irq number in the controller to avoid the reverse mapping
271 */
272void irq_set_virq_count(unsigned int count)
273{
274 pr_debug("irq: Trying to set virq count to %d\n", count);
62 275
63 /* Clear the irq_domain assignments */ 276 BUG_ON(count < NUM_ISA_INTERRUPTS);
64 irq_domain_for_each_irq(domain, hwirq, irq) { 277 if (count < NR_IRQS)
65 d = irq_get_irq_data(irq); 278 irq_virq_count = count;
66 d->domain = NULL; 279}
280
281static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
282 irq_hw_number_t hwirq)
283{
284 struct irq_data *irq_data = irq_get_irq_data(virq);
285
286 irq_data->hwirq = hwirq;
287 irq_data->domain = domain;
288 if (domain->ops->map(domain, virq, hwirq)) {
289 pr_debug("irq: -> mapping failed, freeing\n");
290 irq_data->domain = NULL;
291 irq_data->hwirq = 0;
292 return -1;
67 } 293 }
294
295 irq_clear_status_flags(virq, IRQ_NOREQUEST);
296
297 return 0;
68} 298}
69 299
70#if defined(CONFIG_OF_IRQ)
71/** 300/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec 301 * irq_create_direct_mapping() - Allocate an irq for direct mapping
302 * @domain: domain to allocate the irq for or NULL for default domain
73 * 303 *
74 * Used by the device tree interrupt mapping code to translate a device tree 304 * This routine is used for irq controllers which can choose the hardware
75 * interrupt specifier to a valid linux irq number. Returns either a valid 305 * interrupt numbers they generate. In such a case it's simplest to use
76 * linux IRQ number or 0. 306 * the linux irq as the hardware interrupt number.
307 */
308unsigned int irq_create_direct_mapping(struct irq_domain *domain)
309{
310 unsigned int virq;
311
312 if (domain == NULL)
313 domain = irq_default_domain;
314
315 BUG_ON(domain == NULL);
316 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP);
317
318 virq = irq_alloc_desc_from(1, 0);
319 if (!virq) {
320 pr_debug("irq: create_direct virq allocation failed\n");
321 return 0;
322 }
323 if (virq >= irq_virq_count) {
324 pr_err("ERROR: no free irqs available below %i maximum\n",
325 irq_virq_count);
326 irq_free_desc(virq);
327 return 0;
328 }
329
330 pr_debug("irq: create_direct obtained virq %d\n", virq);
331
332 if (irq_setup_virq(domain, virq, virq)) {
333 irq_free_desc(virq);
334 return 0;
335 }
336
337 return virq;
338}
339
340/**
341 * irq_create_mapping() - Map a hardware interrupt into linux irq space
342 * @domain: domain owning this hardware interrupt or NULL for default domain
343 * @hwirq: hardware irq number in that domain space
77 * 344 *
78 * When the caller no longer need the irq number returned by this function it 345 * Only one mapping per hardware interrupt is permitted. Returns a linux
79 * should arrange to call irq_dispose_mapping(). 346 * irq number.
 347 * If the sense/trigger is to be specified, irq_set_irq_type() should be called
348 * on the number returned from that call.
80 */ 349 */
350unsigned int irq_create_mapping(struct irq_domain *domain,
351 irq_hw_number_t hwirq)
352{
353 unsigned int virq, hint;
354
355 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
356
 357	/* Look for default domain if necessary */
358 if (domain == NULL)
359 domain = irq_default_domain;
360 if (domain == NULL) {
361 printk(KERN_WARNING "irq_create_mapping called for"
362 " NULL domain, hwirq=%lx\n", hwirq);
363 WARN_ON(1);
364 return 0;
365 }
366 pr_debug("irq: -> using domain @%p\n", domain);
367
368 /* Check if mapping already exists */
369 virq = irq_find_mapping(domain, hwirq);
370 if (virq) {
371 pr_debug("irq: -> existing mapping on virq %d\n", virq);
372 return virq;
373 }
374
375 /* Get a virtual interrupt number */
376 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
377 return irq_domain_legacy_revmap(domain, hwirq);
378
379 /* Allocate a virtual interrupt number */
380 hint = hwirq % irq_virq_count;
381 if (hint == 0)
382 hint++;
383 virq = irq_alloc_desc_from(hint, 0);
384 if (!virq)
385 virq = irq_alloc_desc_from(1, 0);
386 if (!virq) {
387 pr_debug("irq: -> virq allocation failed\n");
388 return 0;
389 }
390
391 if (irq_setup_virq(domain, virq, hwirq)) {
392 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY)
393 irq_free_desc(virq);
394 return 0;
395 }
396
397 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n",
398 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
399
400 return virq;
401}
402EXPORT_SYMBOL_GPL(irq_create_mapping);
403
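irq_create_mapping() is the consumer-facing half of the API above. A hedged sketch of a child device driver turning a hwirq from its controller's domain into a requestable linux irq; the my_* names are invented:

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>

static irqreturn_t my_handler(int irq, void *dev_id)
{
        return IRQ_HANDLED;
}

static int my_dev_setup_irq(struct irq_domain *domain,
                            irq_hw_number_t hwirq, void *my_dev)
{
        unsigned int virq = irq_create_mapping(domain, hwirq);

        if (!virq)
                return -ENXIO;

        /* trigger type is set separately, as the kerneldoc above notes */
        irq_set_irq_type(virq, IRQ_TYPE_LEVEL_HIGH);

        return request_irq(virq, my_handler, 0, "my_dev", my_dev);
}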
81unsigned int irq_create_of_mapping(struct device_node *controller, 404unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize) 405 const u32 *intspec, unsigned int intsize)
83{ 406{
84 struct irq_domain *domain; 407 struct irq_domain *domain;
85 unsigned long hwirq; 408 irq_hw_number_t hwirq;
86 unsigned int irq, type; 409 unsigned int type = IRQ_TYPE_NONE;
87 int rc = -EINVAL; 410 unsigned int virq;
88 411
89 /* Find a domain which can translate the irq spec */ 412 domain = controller ? irq_find_host(controller) : irq_default_domain;
90 mutex_lock(&irq_domain_mutex); 413 if (!domain) {
91 list_for_each_entry(domain, &irq_domain_list, list) { 414#ifdef CONFIG_MIPS
92 if (!domain->ops->dt_translate) 415 /*
93 continue; 416 * Workaround to avoid breaking interrupt controller drivers
94 rc = domain->ops->dt_translate(domain, controller, 417 * that don't yet register an irq_domain. This is temporary
95 intspec, intsize, &hwirq, &type); 418 * code. ~~~gcl, Feb 24, 2012
96 if (rc == 0) 419 *
97 break; 420 * Scheduled for removal in Linux v3.6. That should be enough
421 * time.
422 */
423 if (intsize > 0)
424 return intspec[0];
425#endif
426 printk(KERN_WARNING "irq: no irq domain found for %s !\n",
427 controller->full_name);
428 return 0;
98 } 429 }
99 mutex_unlock(&irq_domain_mutex);
100 430
101 if (rc != 0) 431 /* If domain has no translation, then we assume interrupt line */
102 return 0; 432 if (domain->ops->xlate == NULL)
433 hwirq = intspec[0];
434 else {
435 if (domain->ops->xlate(domain, controller, intspec, intsize,
436 &hwirq, &type))
437 return 0;
438 }
439
440 /* Create mapping */
441 virq = irq_create_mapping(domain, hwirq);
442 if (!virq)
443 return virq;
103 444
104 irq = irq_domain_to_irq(domain, hwirq); 445 /* Set type if specified and different than the current one */
105 if (type != IRQ_TYPE_NONE) 446 if (type != IRQ_TYPE_NONE &&
106 irq_set_irq_type(irq, type); 447 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", 448 irq_set_irq_type(virq, type);
108 controller->full_name, (int)hwirq, irq, type); 449 return virq;
109 return irq;
110} 450}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping); 451EXPORT_SYMBOL_GPL(irq_create_of_mapping);
112 452
113/** 453/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() 454 * irq_dispose_mapping() - Unmap an interrupt
115 * @irq: linux irq number to be discarded 455 * @virq: linux irq number of the interrupt to unmap
456 */
457void irq_dispose_mapping(unsigned int virq)
458{
459 struct irq_data *irq_data = irq_get_irq_data(virq);
460 struct irq_domain *domain;
461 irq_hw_number_t hwirq;
462
463 if (!virq || !irq_data)
464 return;
465
466 domain = irq_data->domain;
467 if (WARN_ON(domain == NULL))
468 return;
469
470 /* Never unmap legacy interrupts */
471 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
472 return;
473
474 irq_set_status_flags(virq, IRQ_NOREQUEST);
475
476 /* remove chip and handler */
477 irq_set_chip_and_handler(virq, NULL, NULL);
478
479 /* Make sure it's completed */
480 synchronize_irq(virq);
481
482 /* Tell the PIC about it */
483 if (domain->ops->unmap)
484 domain->ops->unmap(domain, virq);
485 smp_mb();
486
487 /* Clear reverse map */
488 hwirq = irq_data->hwirq;
489 switch(domain->revmap_type) {
490 case IRQ_DOMAIN_MAP_LINEAR:
491 if (hwirq < domain->revmap_data.linear.size)
492 domain->revmap_data.linear.revmap[hwirq] = 0;
493 break;
494 case IRQ_DOMAIN_MAP_TREE:
495 mutex_lock(&revmap_trees_mutex);
496 radix_tree_delete(&domain->revmap_data.tree, hwirq);
497 mutex_unlock(&revmap_trees_mutex);
498 break;
499 }
500
501 irq_free_desc(virq);
502}
503EXPORT_SYMBOL_GPL(irq_dispose_mapping);
504
505/**
506 * irq_find_mapping() - Find a linux irq from an hw irq number.
507 * @domain: domain owning this hardware interrupt
508 * @hwirq: hardware irq number in that domain space
509 *
510 * This is a slow path, for use by generic code. It's expected that an
511 * irq controller implementation directly calls the appropriate low level
512 * mapping function.
513 */
514unsigned int irq_find_mapping(struct irq_domain *domain,
515 irq_hw_number_t hwirq)
516{
517 unsigned int i;
518 unsigned int hint = hwirq % irq_virq_count;
519
 520	/* Look for default domain if necessary */
521 if (domain == NULL)
522 domain = irq_default_domain;
523 if (domain == NULL)
524 return 0;
525
526 /* legacy -> bail early */
527 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
528 return irq_domain_legacy_revmap(domain, hwirq);
529
530 /* Slow path does a linear search of the map */
531 if (hint == 0)
532 hint = 1;
533 i = hint;
534 do {
535 struct irq_data *data = irq_get_irq_data(i);
536 if (data && (data->domain == domain) && (data->hwirq == hwirq))
537 return i;
538 i++;
539 if (i >= irq_virq_count)
540 i = 1;
541 } while(i != hint);
542 return 0;
543}
544EXPORT_SYMBOL_GPL(irq_find_mapping);
545
546/**
547 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
548 * @domain: domain owning this hardware interrupt
549 * @hwirq: hardware irq number in that domain space
116 * 550 *
117 * Calling this function indicates the caller no longer needs a reference to 551 * This is a fast path, for use by irq controller code that uses radix tree
118 * the linux irq number returned by a prior call to irq_create_of_mapping(). 552 * revmaps
119 */ 553 */
120void irq_dispose_mapping(unsigned int irq) 554unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
555 irq_hw_number_t hwirq)
121{ 556{
557 struct irq_data *irq_data;
558
559 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
560 return irq_find_mapping(domain, hwirq);
561
562 /*
563 * Freeing an irq can delete nodes along the path to
564 * do the lookup via call_rcu.
565 */
566 rcu_read_lock();
567 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
568 rcu_read_unlock();
569
122 /* 570 /*
123 * nothing yet; will be filled when support for dynamic allocation of 571 * If found in radix tree, then fine.
124 * irq_descs is added to irq_domain 572 * Else fallback to linear lookup - this should not happen in practice
573 * as it means that we failed to insert the node in the radix tree.
125 */ 574 */
575 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
126} 576}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128 577
129int irq_domain_simple_dt_translate(struct irq_domain *d, 578/**
130 struct device_node *controller, 579 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
131 const u32 *intspec, unsigned int intsize, 580 * @domain: domain owning this hardware interrupt
132 unsigned long *out_hwirq, unsigned int *out_type) 581 * @virq: linux irq number
582 * @hwirq: hardware irq number in that domain space
583 *
584 * This is for use by irq controllers that use a radix tree reverse
585 * mapping for fast lookup.
586 */
587void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
588 irq_hw_number_t hwirq)
133{ 589{
134 if (d->of_node != controller) 590 struct irq_data *irq_data = irq_get_irq_data(virq);
135 return -EINVAL; 591
136 if (intsize < 1) 592 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
137 return -EINVAL; 593 return;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) || 594
139 (intspec[0] >= d->hwirq_base + d->nr_irq))) 595 if (virq) {
140 return -EINVAL; 596 mutex_lock(&revmap_trees_mutex);
597 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
598 mutex_unlock(&revmap_trees_mutex);
599 }
600}
601
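The two radix-tree helpers above are meant to be paired by controller code: insert from the domain's ->map() callback, look up from the demultiplexing flow handler. A hypothetical sketch (my_* names are invented; irq_set_chip_and_handler(), handle_level_irq and generic_handle_irq() are existing core helpers):

#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

static struct irq_chip my_chip;         /* placeholder chip */

static int my_tree_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        irq_set_chip_and_handler(virq, &my_chip, handle_level_irq);
        irq_radix_revmap_insert(d, virq, hwirq);
        return 0;
}

/* called from the controller's demultiplexing flow handler */
static void my_demux_one(struct irq_domain *domain, irq_hw_number_t hwirq)
{
        unsigned int virq = irq_radix_revmap_lookup(domain, hwirq);

        if (virq)
                generic_handle_irq(virq);
}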
602/**
603 * irq_linear_revmap() - Find a linux irq from a hw irq number.
604 * @domain: domain owning this hardware interrupt
605 * @hwirq: hardware irq number in that domain space
606 *
607 * This is a fast path, for use by irq controller code that uses linear
 608 * revmaps. It falls back to the slow path if the revmap doesn't exist
609 * yet and will create the revmap entry with appropriate locking
610 */
611unsigned int irq_linear_revmap(struct irq_domain *domain,
612 irq_hw_number_t hwirq)
613{
614 unsigned int *revmap;
615
616 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
617 return irq_find_mapping(domain, hwirq);
618
619 /* Check revmap bounds */
620 if (unlikely(hwirq >= domain->revmap_data.linear.size))
621 return irq_find_mapping(domain, hwirq);
622
623 /* Check if revmap was allocated */
624 revmap = domain->revmap_data.linear.revmap;
625 if (unlikely(revmap == NULL))
626 return irq_find_mapping(domain, hwirq);
627
628 /* Fill up revmap with slow path if no mapping found */
629 if (unlikely(!revmap[hwirq]))
630 revmap[hwirq] = irq_find_mapping(domain, hwirq);
631
632 return revmap[hwirq];
633}
634
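irq_linear_revmap() is the hot-path counterpart to irq_domain_add_linear() earlier in this file. A hedged sketch of the usual cascaded-controller pattern; the my_* names and the pending-register read are invented, and irq_domain_simple_ops is assumed to be declared in <linux/irqdomain.h>:

#include <linux/bitops.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_domain *my_linear_domain;

static void my_controller_init(struct device_node *np)
{
        my_linear_domain = irq_domain_add_linear(np, 32,
                                                 &irq_domain_simple_ops,
                                                 NULL);
}

static unsigned long my_read_pending(void)
{
        return 0;       /* a real driver would read its status register */
}

static void my_cascade(void)
{
        unsigned long pending = my_read_pending();
        int hwirq;

        for_each_set_bit(hwirq, &pending, 32) {
                unsigned int virq = irq_linear_revmap(my_linear_domain, hwirq);

                if (virq)
                        generic_handle_irq(virq);
        }
}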
635#ifdef CONFIG_IRQ_DOMAIN_DEBUG
636static int virq_debug_show(struct seq_file *m, void *private)
637{
638 unsigned long flags;
639 struct irq_desc *desc;
640 const char *p;
641 static const char none[] = "none";
642 void *data;
643 int i;
644
645 seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq",
646 "chip name", "chip data", "domain name");
647
648 for (i = 1; i < nr_irqs; i++) {
649 desc = irq_to_desc(i);
650 if (!desc)
651 continue;
652
653 raw_spin_lock_irqsave(&desc->lock, flags);
654
655 if (desc->action && desc->action->handler) {
656 struct irq_chip *chip;
657
658 seq_printf(m, "%5d ", i);
659 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
660
661 chip = irq_desc_get_chip(desc);
662 if (chip && chip->name)
663 p = chip->name;
664 else
665 p = none;
666 seq_printf(m, "%-15s ", p);
667
668 data = irq_desc_get_chip_data(desc);
669 seq_printf(m, "0x%16p ", data);
670
671 if (desc->irq_data.domain && desc->irq_data.domain->of_node)
672 p = desc->irq_data.domain->of_node->full_name;
673 else
674 p = none;
675 seq_printf(m, "%s\n", p);
676 }
677
678 raw_spin_unlock_irqrestore(&desc->lock, flags);
679 }
680
681 return 0;
682}
141 683
684static int virq_debug_open(struct inode *inode, struct file *file)
685{
686 return single_open(file, virq_debug_show, inode->i_private);
687}
688
689static const struct file_operations virq_debug_fops = {
690 .open = virq_debug_open,
691 .read = seq_read,
692 .llseek = seq_lseek,
693 .release = single_release,
694};
695
696static int __init irq_debugfs_init(void)
697{
698 if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
699 NULL, &virq_debug_fops) == NULL)
700 return -ENOMEM;
701
702 return 0;
703}
704__initcall(irq_debugfs_init);
705#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
706
707int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
708 irq_hw_number_t hwirq)
709{
710 return 0;
711}
712
713/**
714 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
715 *
716 * Device Tree IRQ specifier translation function which works with one cell
717 * bindings where the cell value maps directly to the hwirq number.
718 */
719int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
720 const u32 *intspec, unsigned int intsize,
721 unsigned long *out_hwirq, unsigned int *out_type)
722{
723 if (WARN_ON(intsize < 1))
724 return -EINVAL;
142 *out_hwirq = intspec[0]; 725 *out_hwirq = intspec[0];
143 *out_type = IRQ_TYPE_NONE; 726 *out_type = IRQ_TYPE_NONE;
144 if (intsize > 1)
145 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
146 return 0; 727 return 0;
147} 728}
729EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
148 730
149/** 731/**
150 * irq_domain_create_simple() - Set up a 'simple' translation range 732 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
733 *
734 * Device Tree IRQ specifier translation function which works with two cell
735 * bindings where the cell values map directly to the hwirq number
736 * and linux irq flags.
151 */ 737 */
152void irq_domain_add_simple(struct device_node *controller, int irq_base) 738int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
739 const u32 *intspec, unsigned int intsize,
740 irq_hw_number_t *out_hwirq, unsigned int *out_type)
153{ 741{
154 struct irq_domain *domain; 742 if (WARN_ON(intsize < 2))
155 743 return -EINVAL;
156 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 744 *out_hwirq = intspec[0];
157 if (!domain) { 745 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
158 WARN_ON(1); 746 return 0;
159 return; 747}
160 } 748EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
161 749
162 domain->irq_base = irq_base; 750/**
163 domain->of_node = of_node_get(controller); 751 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
164 domain->ops = &irq_domain_simple_ops; 752 *
165 irq_domain_add(domain); 753 * Device Tree IRQ specifier translation function which works with either one
754 * or two cell bindings where the cell values map directly to the hwirq number
755 * and linux irq flags.
756 *
757 * Note: don't use this function unless your interrupt controller explicitly
758 * supports both one and two cell bindings. For the majority of controllers
759 * the _onecell() or _twocell() variants above should be used.
760 */
761int irq_domain_xlate_onetwocell(struct irq_domain *d,
762 struct device_node *ctrlr,
763 const u32 *intspec, unsigned int intsize,
764 unsigned long *out_hwirq, unsigned int *out_type)
765{
766 if (WARN_ON(intsize < 1))
767 return -EINVAL;
768 *out_hwirq = intspec[0];
769 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
770 return 0;
166} 771}
167EXPORT_SYMBOL_GPL(irq_domain_add_simple); 772EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
168 773
774const struct irq_domain_ops irq_domain_simple_ops = {
775 .map = irq_domain_simple_map,
776 .xlate = irq_domain_xlate_onetwocell,
777};
778EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
779
780#ifdef CONFIG_OF_IRQ
169void irq_domain_generate_simple(const struct of_device_id *match, 781void irq_domain_generate_simple(const struct of_device_id *match,
170 u64 phys_base, unsigned int irq_start) 782 u64 phys_base, unsigned int irq_start)
171{ 783{
172 struct device_node *node; 784 struct device_node *node;
173 pr_info("looking for phys_base=%llx, irq_start=%i\n", 785 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
174 (unsigned long long) phys_base, (int) irq_start); 786 (unsigned long long) phys_base, (int) irq_start);
175 node = of_find_matching_node_by_address(NULL, match, phys_base); 787 node = of_find_matching_node_by_address(NULL, match, phys_base);
176 if (node) 788 if (node)
177 irq_domain_add_simple(node, irq_start); 789 irq_domain_add_legacy(node, 32, irq_start, 0,
178 else 790 &irq_domain_simple_ops, NULL);
179 pr_info("no node found\n");
180} 791}
181EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 792EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
182#endif /* CONFIG_OF_IRQ */ 793#endif
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
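For reference, a controller whose devicetree binding uses the conventional two cells (hwirq number plus trigger flags) can simply plug the generic translator added above into its ops; my_map() is a hypothetical per-irq setup callback:

#include <linux/irqdomain.h>

static int my_map(struct irq_domain *d, unsigned int virq,
                  irq_hw_number_t hwirq)
{
        return 0;
}

static const struct irq_domain_ops my_twocell_ops = {
        .map   = my_map,
        .xlate = irq_domain_xlate_twocell,
};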
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a9a9dbe49fea..89a3ea82569b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -282,7 +282,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
282{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc); 283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity; 284 struct cpumask *set = irq_default_affinity;
285 int ret; 285 int ret, node = desc->irq_data.node;
286 286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */ 287 /* Excludes PER_CPU and NO_BALANCE interrupts */
288 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
@@ -301,6 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
301 } 301 }
302 302
303 cpumask_and(mask, cpu_online_mask, set); 303 cpumask_and(mask, cpu_online_mask, set);
304 if (node != NUMA_NO_NODE) {
305 const struct cpumask *nodemask = cpumask_of_node(node);
306
307 /* make sure at least one of the cpus in nodemask is online */
308 if (cpumask_intersects(mask, nodemask))
309 cpumask_and(mask, mask, nodemask);
310 }
304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false); 311 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
305 switch (ret) { 312 switch (ret) {
306 case IRQ_SET_MASK_OK: 313 case IRQ_SET_MASK_OK:
@@ -645,7 +652,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
645 * is marked MASKED. 652 * is marked MASKED.
646 */ 653 */
647static void irq_finalize_oneshot(struct irq_desc *desc, 654static void irq_finalize_oneshot(struct irq_desc *desc,
648 struct irqaction *action, bool force) 655 struct irqaction *action)
649{ 656{
650 if (!(desc->istate & IRQS_ONESHOT)) 657 if (!(desc->istate & IRQS_ONESHOT))
651 return; 658 return;
@@ -679,7 +686,7 @@ again:
679 * we would clear the threads_oneshot bit of this thread which 686 * we would clear the threads_oneshot bit of this thread which
680 * was just set. 687 * was just set.
681 */ 688 */
682 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 689 if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
683 goto out_unlock; 690 goto out_unlock;
684 691
685 desc->threads_oneshot &= ~action->thread_mask; 692 desc->threads_oneshot &= ~action->thread_mask;
@@ -739,7 +746,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
739 746
740 local_bh_disable(); 747 local_bh_disable();
741 ret = action->thread_fn(action->irq, action->dev_id); 748 ret = action->thread_fn(action->irq, action->dev_id);
742 irq_finalize_oneshot(desc, action, false); 749 irq_finalize_oneshot(desc, action);
743 local_bh_enable(); 750 local_bh_enable();
744 return ret; 751 return ret;
745} 752}
@@ -755,10 +762,17 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
755 irqreturn_t ret; 762 irqreturn_t ret;
756 763
757 ret = action->thread_fn(action->irq, action->dev_id); 764 ret = action->thread_fn(action->irq, action->dev_id);
758 irq_finalize_oneshot(desc, action, false); 765 irq_finalize_oneshot(desc, action);
759 return ret; 766 return ret;
760} 767}
761 768
769static void wake_threads_waitq(struct irq_desc *desc)
770{
771 if (atomic_dec_and_test(&desc->threads_active) &&
772 waitqueue_active(&desc->wait_for_threads))
773 wake_up(&desc->wait_for_threads);
774}
775
762/* 776/*
763 * Interrupt handler thread 777 * Interrupt handler thread
764 */ 778 */
@@ -771,57 +785,41 @@ static int irq_thread(void *data)
771 struct irq_desc *desc = irq_to_desc(action->irq); 785 struct irq_desc *desc = irq_to_desc(action->irq);
772 irqreturn_t (*handler_fn)(struct irq_desc *desc, 786 irqreturn_t (*handler_fn)(struct irq_desc *desc,
773 struct irqaction *action); 787 struct irqaction *action);
774 int wake;
775 788
776 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, 789 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
777 &action->thread_flags)) 790 &action->thread_flags))
778 handler_fn = irq_forced_thread_fn; 791 handler_fn = irq_forced_thread_fn;
779 else 792 else
780 handler_fn = irq_thread_fn; 793 handler_fn = irq_thread_fn;
781 794
782 sched_setscheduler(current, SCHED_FIFO, &param); 795 sched_setscheduler(current, SCHED_FIFO, &param);
783 current->irqaction = action; 796 current->irq_thread = 1;
784 797
785 while (!irq_wait_for_interrupt(action)) { 798 while (!irq_wait_for_interrupt(action)) {
799 irqreturn_t action_ret;
786 800
787 irq_thread_check_affinity(desc, action); 801 irq_thread_check_affinity(desc, action);
788 802
789 atomic_inc(&desc->threads_active); 803 action_ret = handler_fn(desc, action);
804 if (!noirqdebug)
805 note_interrupt(action->irq, desc, action_ret);
790 806
791 raw_spin_lock_irq(&desc->lock); 807 wake_threads_waitq(desc);
792 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
793 /*
794 * CHECKME: We might need a dedicated
795 * IRQ_THREAD_PENDING flag here, which
796 * retriggers the thread in check_irq_resend()
797 * but AFAICT IRQS_PENDING should be fine as it
798 * retriggers the interrupt itself --- tglx
799 */
800 desc->istate |= IRQS_PENDING;
801 raw_spin_unlock_irq(&desc->lock);
802 } else {
803 irqreturn_t action_ret;
804
805 raw_spin_unlock_irq(&desc->lock);
806 action_ret = handler_fn(desc, action);
807 if (!noirqdebug)
808 note_interrupt(action->irq, desc, action_ret);
809 }
810
811 wake = atomic_dec_and_test(&desc->threads_active);
812
813 if (wake && waitqueue_active(&desc->wait_for_threads))
814 wake_up(&desc->wait_for_threads);
815 } 808 }
816 809
817 /* Prevent a stale desc->threads_oneshot */
818 irq_finalize_oneshot(desc, action, true);
819
820 /* 810 /*
821 * Clear irqaction. Otherwise exit_irq_thread() would make 811 * This is the regular exit path. __free_irq() is stopping the
812 * thread via kthread_stop() after calling
813 * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
814 * oneshot mask bit can be set. We cannot verify that as we
815 * cannot touch the oneshot mask at this point anymore as
816 * __setup_irq() might have given out currents thread_mask
817 * again.
818 *
819 * Clear irq_thread. Otherwise exit_irq_thread() would make
822 * fuzz about an active irq thread going into nirvana. 820 * fuzz about an active irq thread going into nirvana.
823 */ 821 */
824 current->irqaction = NULL; 822 current->irq_thread = 0;
825 return 0; 823 return 0;
826} 824}
827 825
@@ -832,27 +830,28 @@ void exit_irq_thread(void)
832{ 830{
833 struct task_struct *tsk = current; 831 struct task_struct *tsk = current;
834 struct irq_desc *desc; 832 struct irq_desc *desc;
833 struct irqaction *action;
835 834
836 if (!tsk->irqaction) 835 if (!tsk->irq_thread)
837 return; 836 return;
838 837
838 action = kthread_data(tsk);
839
839 printk(KERN_ERR 840 printk(KERN_ERR
840 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 841 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
841 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 842 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
842 843
843 desc = irq_to_desc(tsk->irqaction->irq); 844 desc = irq_to_desc(action->irq);
844 845
845 /* 846 /*
846 * Prevent a stale desc->threads_oneshot. Must be called 847 * If IRQTF_RUNTHREAD is set, we need to decrement
847 * before setting the IRQTF_DIED flag. 848 * desc->threads_active and wake possible waiters.
848 */ 849 */
849 irq_finalize_oneshot(desc, tsk->irqaction, true); 850 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
851 wake_threads_waitq(desc);
850 852
851 /* 853 /* Prevent a stale desc->threads_oneshot */
852 * Set the THREAD DIED flag to prevent further wakeups of the 854 irq_finalize_oneshot(desc, action);
853 * soon to be gone threaded handler.
854 */
855 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
856} 855}
857 856
858static void irq_setup_forced_threading(struct irqaction *new) 857static void irq_setup_forced_threading(struct irqaction *new)
@@ -985,6 +984,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
985 984
986 /* add new interrupt at end of irq queue */ 985 /* add new interrupt at end of irq queue */
987 do { 986 do {
987 /*
988 * Or all existing action->thread_mask bits,
989 * so we can find the next zero bit for this
990 * new action.
991 */
988 thread_mask |= old->thread_mask; 992 thread_mask |= old->thread_mask;
989 old_ptr = &old->next; 993 old_ptr = &old->next;
990 old = *old_ptr; 994 old = *old_ptr;
@@ -993,14 +997,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
993 } 997 }
994 998
995 /* 999 /*
996 * Setup the thread mask for this irqaction. Unlikely to have 1000 * Setup the thread mask for this irqaction for ONESHOT. For
997 * 32 resp 64 irqs sharing one line, but who knows. 1001 * !ONESHOT irqs the thread mask is 0 so we can avoid a
1002 * conditional in irq_wake_thread().
998 */ 1003 */
999 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { 1004 if (new->flags & IRQF_ONESHOT) {
1000 ret = -EBUSY; 1005 /*
1001 goto out_mask; 1006 * Unlikely to have 32 resp 64 irqs sharing one line,
1007 * but who knows.
1008 */
1009 if (thread_mask == ~0UL) {
1010 ret = -EBUSY;
1011 goto out_mask;
1012 }
1013 /*
1014 * The thread_mask for the action is or'ed to
 1015		 * desc->threads_active to indicate that the
1016 * IRQF_ONESHOT thread handler has been woken, but not
1017 * yet finished. The bit is cleared when a thread
1018 * completes. When all threads of a shared interrupt
1019 * line have completed desc->threads_active becomes
1020 * zero and the interrupt line is unmasked. See
1021 * handle.c:irq_wake_thread() for further information.
1022 *
1023 * If no thread is woken by primary (hard irq context)
1024 * interrupt handlers, then desc->threads_active is
1025 * also checked for zero to unmask the irq line in the
1026 * affected hard irq flow handlers
1027 * (handle_[fasteoi|level]_irq).
1028 *
1029 * The new action gets the first zero bit of
1030 * thread_mask assigned. See the loop above which or's
1031 * all existing action->thread_mask bits.
1032 */
1033 new->thread_mask = 1 << ffz(thread_mask);
1002 } 1034 }
1003 new->thread_mask = 1 << ffz(thread_mask);
1004 1035
1005 if (!shared) { 1036 if (!shared) {
1006 init_waitqueue_head(&desc->wait_for_threads); 1037 init_waitqueue_head(&desc->wait_for_threads);
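The comment block above describes the ONESHOT thread_mask bookkeeping. The kind of request that exercises it is a shared, oneshot threaded handler, sketched below with invented my_* names:

#include <linux/interrupt.h>

static irqreturn_t my_quick_check(int irq, void *dev_id)
{
        /* hard irq context: acknowledge the device and defer the work */
        return IRQ_WAKE_THREAD;
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
        /* may sleep; the line stays masked until this returns */
        return IRQ_HANDLED;
}

static int my_request(unsigned int irq, void *dev_id)
{
        return request_threaded_irq(irq, my_quick_check, my_thread_fn,
                                    IRQF_SHARED | IRQF_ONESHOT,
                                    "my_dev", dev_id);
}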
@@ -1027,7 +1058,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1027 desc->istate |= IRQS_ONESHOT; 1058 desc->istate |= IRQS_ONESHOT;
1028 1059
1029 if (irq_settings_can_autoenable(desc)) 1060 if (irq_settings_can_autoenable(desc))
1030 irq_startup(desc); 1061 irq_startup(desc, true);
1031 else 1062 else
1032 /* Undo nested disables: */ 1063 /* Undo nested disables: */
1033 desc->depth = 1; 1064 desc->depth = 1;
@@ -1103,8 +1134,7 @@ out_thread:
1103 struct task_struct *t = new->thread; 1134 struct task_struct *t = new->thread;
1104 1135
1105 new->thread = NULL; 1136 new->thread = NULL;
1106 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) 1137 kthread_stop(t);
1107 kthread_stop(t);
1108 put_task_struct(t); 1138 put_task_struct(t);
1109 } 1139 }
1110out_mput: 1140out_mput:
@@ -1214,8 +1244,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1214#endif 1244#endif
1215 1245
1216 if (action->thread) { 1246 if (action->thread) {
1217 if (!test_bit(IRQTF_DIED, &action->thread_flags)) 1247 kthread_stop(action->thread);
1218 kthread_stop(action->thread);
1219 put_task_struct(action->thread); 1248 put_task_struct(action->thread);
1220 } 1249 }
1221 1250
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 47420908fba0..c3c89751b327 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -43,12 +43,16 @@ void irq_move_masked_irq(struct irq_data *idata)
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
46 < nr_cpu_ids)) 46 < nr_cpu_ids)) {
47 if (!chip->irq_set_affinity(&desc->irq_data, 47 int ret = chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false)) { 48 desc->pending_mask, false);
49 switch (ret) {
50 case IRQ_SET_MASK_OK:
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask); 51 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
52 case IRQ_SET_MASK_OK_NOCOPY:
50 irq_set_thread_affinity(desc); 53 irq_set_thread_affinity(desc);
51 } 54 }
55 }
52 56
53 cpumask_clear(desc->pending_mask); 57 cpumask_clear(desc->pending_mask);
54} 58}
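The fixed switch above makes the chip's return value meaningful during a masked migration: IRQ_SET_MASK_OK_NOCOPY means the chip already updated irq_data->affinity itself, so the core skips its own copy but still propagates the thread affinity. A hypothetical chip callback of that kind (my_route() is invented):

#include <linux/cpumask.h>
#include <linux/irq.h>

static const struct cpumask *my_route(struct irq_data *data,
                                      const struct cpumask *mask)
{
        return mask;    /* hardware-specific narrowing in a real chip */
}

static int my_chip_set_affinity(struct irq_data *data,
                                const struct cpumask *mask, bool force)
{
        cpumask_copy(data->affinity, my_route(data, mask));
        return IRQ_SET_MASK_OK_NOCOPY;
}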
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 01d3b70fc98a..43049192b5ec 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h> 15#include <linux/static_key.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
@@ -29,11 +29,6 @@ void jump_label_unlock(void)
29 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
30} 30}
31 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
37static int jump_label_cmp(const void *a, const void *b) 32static int jump_label_cmp(const void *a, const void *b)
38{ 33{
39 const struct jump_entry *jea = a; 34 const struct jump_entry *jea = a;
@@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 53 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
59} 54}
60 55
61static void jump_label_update(struct jump_label_key *key, int enable); 56static void jump_label_update(struct static_key *key, int enable);
62 57
63void jump_label_inc(struct jump_label_key *key) 58void static_key_slow_inc(struct static_key *key)
64{ 59{
65 if (atomic_inc_not_zero(&key->enabled)) 60 if (atomic_inc_not_zero(&key->enabled))
66 return; 61 return;
67 62
68 jump_label_lock(); 63 jump_label_lock();
69 if (atomic_read(&key->enabled) == 0) 64 if (atomic_read(&key->enabled) == 0) {
70 jump_label_update(key, JUMP_LABEL_ENABLE); 65 if (!jump_label_get_branch_default(key))
66 jump_label_update(key, JUMP_LABEL_ENABLE);
67 else
68 jump_label_update(key, JUMP_LABEL_DISABLE);
69 }
71 atomic_inc(&key->enabled); 70 atomic_inc(&key->enabled);
72 jump_label_unlock(); 71 jump_label_unlock();
73} 72}
74EXPORT_SYMBOL_GPL(jump_label_inc); 73EXPORT_SYMBOL_GPL(static_key_slow_inc);
75 74
76static void __jump_label_dec(struct jump_label_key *key, 75static void __static_key_slow_dec(struct static_key *key,
77 unsigned long rate_limit, struct delayed_work *work) 76 unsigned long rate_limit, struct delayed_work *work)
78{ 77{
79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
79 WARN(atomic_read(&key->enabled) < 0,
80 "jump label: negative count!\n");
80 return; 81 return;
82 }
81 83
82 if (rate_limit) { 84 if (rate_limit) {
83 atomic_inc(&key->enabled); 85 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit); 86 schedule_delayed_work(work, rate_limit);
85 } else 87 } else {
86 jump_label_update(key, JUMP_LABEL_DISABLE); 88 if (!jump_label_get_branch_default(key))
87 89 jump_label_update(key, JUMP_LABEL_DISABLE);
90 else
91 jump_label_update(key, JUMP_LABEL_ENABLE);
92 }
88 jump_label_unlock(); 93 jump_label_unlock();
89} 94}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91 95
92static void jump_label_update_timeout(struct work_struct *work) 96static void jump_label_update_timeout(struct work_struct *work)
93{ 97{
94 struct jump_label_key_deferred *key = 98 struct static_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work); 99 container_of(work, struct static_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL); 100 __static_key_slow_dec(&key->key, 0, NULL);
97} 101}
98 102
99void jump_label_dec(struct jump_label_key *key) 103void static_key_slow_dec(struct static_key *key)
100{ 104{
101 __jump_label_dec(key, 0, NULL); 105 __static_key_slow_dec(key, 0, NULL);
102} 106}
107EXPORT_SYMBOL_GPL(static_key_slow_dec);
103 108
104void jump_label_dec_deferred(struct jump_label_key_deferred *key) 109void static_key_slow_dec_deferred(struct static_key_deferred *key)
105{ 110{
106 __jump_label_dec(&key->key, key->timeout, &key->work); 111 __static_key_slow_dec(&key->key, key->timeout, &key->work);
107} 112}
113EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
108 114
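For context, the renamed API above is used roughly as sketched below; STATIC_KEY_INIT_FALSE and static_key_false() are assumed from the <linux/static_key.h> header this series introduces, and the my_* names are invented:

#include <linux/static_key.h>

static struct static_key my_feature = STATIC_KEY_INIT_FALSE;

void my_feature_enable(void)
{
        static_key_slow_inc(&my_feature);       /* patches the branches in */
}

void my_feature_disable(void)
{
        static_key_slow_dec(&my_feature);
}

void my_hot_path(void)
{
        if (static_key_false(&my_feature)) {
                /* rarely-taken work, nop'ed out while the key is off */
        }
}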
109 115void jump_label_rate_limit(struct static_key_deferred *key,
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl) 116 unsigned long rl)
112{ 117{
113 key->timeout = rl; 118 key->timeout = rl;
@@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
150 arch_jump_label_transform(entry, type); 155 arch_jump_label_transform(entry, type);
151} 156}
152 157
153static void __jump_label_update(struct jump_label_key *key, 158static void __jump_label_update(struct static_key *key,
154 struct jump_entry *entry, 159 struct jump_entry *entry,
155 struct jump_entry *stop, int enable) 160 struct jump_entry *stop, int enable)
156{ 161{
@@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key,
167 } 172 }
168} 173}
169 174
175static enum jump_label_type jump_label_type(struct static_key *key)
176{
177 bool true_branch = jump_label_get_branch_default(key);
178 bool state = static_key_enabled(key);
179
180 if ((!true_branch && state) || (true_branch && !state))
181 return JUMP_LABEL_ENABLE;
182
183 return JUMP_LABEL_DISABLE;
184}
185
170void __init jump_label_init(void) 186void __init jump_label_init(void)
171{ 187{
172 struct jump_entry *iter_start = __start___jump_table; 188 struct jump_entry *iter_start = __start___jump_table;
173 struct jump_entry *iter_stop = __stop___jump_table; 189 struct jump_entry *iter_stop = __stop___jump_table;
174 struct jump_label_key *key = NULL; 190 struct static_key *key = NULL;
175 struct jump_entry *iter; 191 struct jump_entry *iter;
176 192
177 jump_label_lock(); 193 jump_label_lock();
178 jump_label_sort_entries(iter_start, iter_stop); 194 jump_label_sort_entries(iter_start, iter_stop);
179 195
180 for (iter = iter_start; iter < iter_stop; iter++) { 196 for (iter = iter_start; iter < iter_stop; iter++) {
181 struct jump_label_key *iterk; 197 struct static_key *iterk;
182 198
183 iterk = (struct jump_label_key *)(unsigned long)iter->key; 199 iterk = (struct static_key *)(unsigned long)iter->key;
184 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 200 arch_jump_label_transform_static(iter, jump_label_type(iterk));
185 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
186 if (iterk == key) 201 if (iterk == key)
187 continue; 202 continue;
188 203
189 key = iterk; 204 key = iterk;
190 key->entries = iter; 205 /*
206 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
207 */
208 *((unsigned long *)&key->entries) += (unsigned long)iter;
191#ifdef CONFIG_MODULES 209#ifdef CONFIG_MODULES
192 key->next = NULL; 210 key->next = NULL;
193#endif 211#endif
@@ -197,8 +215,8 @@ void __init jump_label_init(void)
197 215
198#ifdef CONFIG_MODULES 216#ifdef CONFIG_MODULES
199 217
200struct jump_label_mod { 218struct static_key_mod {
201 struct jump_label_mod *next; 219 struct static_key_mod *next;
202 struct jump_entry *entries; 220 struct jump_entry *entries;
203 struct module *mod; 221 struct module *mod;
204}; 222};
@@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
218 start, end); 236 start, end);
219} 237}
220 238
221static void __jump_label_mod_update(struct jump_label_key *key, int enable) 239static void __jump_label_mod_update(struct static_key *key, int enable)
222{ 240{
223 struct jump_label_mod *mod = key->next; 241 struct static_key_mod *mod = key->next;
224 242
225 while (mod) { 243 while (mod) {
226 struct module *m = mod->mod; 244 struct module *m = mod->mod;
@@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod)
251 return; 269 return;
252 270
253 for (iter = iter_start; iter < iter_stop; iter++) { 271 for (iter = iter_start; iter < iter_stop; iter++) {
254 struct jump_label_key *iterk; 272 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 } 273 }
260} 274}
261 275
@@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod)
264 struct jump_entry *iter_start = mod->jump_entries; 278 struct jump_entry *iter_start = mod->jump_entries;
265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 279 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
266 struct jump_entry *iter; 280 struct jump_entry *iter;
267 struct jump_label_key *key = NULL; 281 struct static_key *key = NULL;
268 struct jump_label_mod *jlm; 282 struct static_key_mod *jlm;
269 283
270 /* if the module doesn't have jump label entries, just return */ 284 /* if the module doesn't have jump label entries, just return */
271 if (iter_start == iter_stop) 285 if (iter_start == iter_stop)
@@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod)
274 jump_label_sort_entries(iter_start, iter_stop); 288 jump_label_sort_entries(iter_start, iter_stop);
275 289
276 for (iter = iter_start; iter < iter_stop; iter++) { 290 for (iter = iter_start; iter < iter_stop; iter++) {
277 if (iter->key == (jump_label_t)(unsigned long)key) 291 struct static_key *iterk;
278 continue;
279 292
280 key = (struct jump_label_key *)(unsigned long)iter->key; 293 iterk = (struct static_key *)(unsigned long)iter->key;
294 if (iterk == key)
295 continue;
281 296
297 key = iterk;
282 if (__module_address(iter->key) == mod) { 298 if (__module_address(iter->key) == mod) {
283 atomic_set(&key->enabled, 0); 299 /*
284 key->entries = iter; 300 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
301 */
302 *((unsigned long *)&key->entries) += (unsigned long)iter;
285 key->next = NULL; 303 key->next = NULL;
286 continue; 304 continue;
287 } 305 }
288 306 jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
289 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
290 if (!jlm) 307 if (!jlm)
291 return -ENOMEM; 308 return -ENOMEM;
292
293 jlm->mod = mod; 309 jlm->mod = mod;
294 jlm->entries = iter; 310 jlm->entries = iter;
295 jlm->next = key->next; 311 jlm->next = key->next;
296 key->next = jlm; 312 key->next = jlm;
297 313
298 if (jump_label_enabled(key)) 314 if (jump_label_type(key) == JUMP_LABEL_ENABLE)
299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 315 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
300 } 316 }
301 317
@@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod)
307 struct jump_entry *iter_start = mod->jump_entries; 323 struct jump_entry *iter_start = mod->jump_entries;
308 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 324 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
309 struct jump_entry *iter; 325 struct jump_entry *iter;
310 struct jump_label_key *key = NULL; 326 struct static_key *key = NULL;
311 struct jump_label_mod *jlm, **prev; 327 struct static_key_mod *jlm, **prev;
312 328
313 for (iter = iter_start; iter < iter_stop; iter++) { 329 for (iter = iter_start; iter < iter_stop; iter++) {
314 if (iter->key == (jump_label_t)(unsigned long)key) 330 if (iter->key == (jump_label_t)(unsigned long)key)
315 continue; 331 continue;
316 332
317 key = (struct jump_label_key *)(unsigned long)iter->key; 333 key = (struct static_key *)(unsigned long)iter->key;
318 334
319 if (__module_address(iter->key) == mod) 335 if (__module_address(iter->key) == mod)
320 continue; 336 continue;
@@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end)
416 return ret; 432 return ret;
417} 433}
418 434
419static void jump_label_update(struct jump_label_key *key, int enable) 435static void jump_label_update(struct static_key *key, int enable)
420{ 436{
421 struct jump_entry *entry = key->entries, *stop = __stop___jump_table; 437 struct jump_entry *stop = __stop___jump_table;
438 struct jump_entry *entry = jump_label_get_entries(key);
422 439
423#ifdef CONFIG_MODULES 440#ifdef CONFIG_MODULES
424 struct module *mod = __module_address((jump_label_t)key); 441 struct module *mod = __module_address((unsigned long)key);
425 442
426 __jump_label_mod_update(key, enable); 443 __jump_label_mod_update(key, enable);
427 444
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..4e2e472f6aeb 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -37,7 +37,6 @@
37#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39#include <asm/io.h> 39#include <asm/io.h>
40#include <asm/system.h>
41#include <asm/sections.h> 40#include <asm/sections.h>
42 41
43/* Per cpu memory for storing cpu states in case of system crash. */ 42/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline,
1359 1358
1360 if (*cur == '@') 1359 if (*cur == '@')
1361 *crash_base = memparse(cur+1, &cur); 1360 *crash_base = memparse(cur+1, &cur);
1361 else if (*cur != ' ' && *cur != '\0') {
1362 pr_warning("crashkernel: unrecognized char\n");
1363 return -EINVAL;
1364 }
1362 1365
1363 return 0; 1366 return 0;
1364} 1367}
@@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1462 1465
1463 VMCOREINFO_SYMBOL(init_uts_ns); 1466 VMCOREINFO_SYMBOL(init_uts_ns);
1464 VMCOREINFO_SYMBOL(node_online_map); 1467 VMCOREINFO_SYMBOL(node_online_map);
1468#ifdef CONFIG_MMU
1465 VMCOREINFO_SYMBOL(swapper_pg_dir); 1469 VMCOREINFO_SYMBOL(swapper_pg_dir);
1470#endif
1466 VMCOREINFO_SYMBOL(_stext); 1471 VMCOREINFO_SYMBOL(_stext);
1467 VMCOREINFO_SYMBOL(vmlist); 1472 VMCOREINFO_SYMBOL(vmlist);
1468 1473
@@ -1546,13 +1551,13 @@ int kernel_kexec(void)
1546 if (error) 1551 if (error)
1547 goto Resume_console; 1552 goto Resume_console;
1548 /* At this point, dpm_suspend_start() has been called, 1553 /* At this point, dpm_suspend_start() has been called,
1549 * but *not* dpm_suspend_noirq(). We *must* call 1554 * but *not* dpm_suspend_end(). We *must* call
1550 * dpm_suspend_noirq() now. Otherwise, drivers for 1555 * dpm_suspend_end() now. Otherwise, drivers for
1551 * some devices (e.g. interrupt controllers) become 1556 * some devices (e.g. interrupt controllers) become
1552 * desynchronized with the actual state of the 1557 * desynchronized with the actual state of the
1553 * hardware at resume time, and evil weirdness ensues. 1558 * hardware at resume time, and evil weirdness ensues.
1554 */ 1559 */
1555 error = dpm_suspend_noirq(PMSG_FREEZE); 1560 error = dpm_suspend_end(PMSG_FREEZE);
1556 if (error) 1561 if (error)
1557 goto Resume_devices; 1562 goto Resume_devices;
1558 error = disable_nonboot_cpus(); 1563 error = disable_nonboot_cpus();
@@ -1579,7 +1584,7 @@ int kernel_kexec(void)
1579 local_irq_enable(); 1584 local_irq_enable();
1580 Enable_cpus: 1585 Enable_cpus:
1581 enable_nonboot_cpus(); 1586 enable_nonboot_cpus();
1582 dpm_resume_noirq(PMSG_RESTORE); 1587 dpm_resume_start(PMSG_RESTORE);
1583 Resume_devices: 1588 Resume_devices:
1584 dpm_resume_end(PMSG_RESTORE); 1589 dpm_resume_end(PMSG_RESTORE);
1585 Resume_console: 1590 Resume_console:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934e..957a7aab8ebc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);
60*/ 60*/
61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
62 62
63static void free_modprobe_argv(struct subprocess_info *info)
64{
65 kfree(info->argv[3]); /* check call_modprobe() */
66 kfree(info->argv);
67}
68
69static int call_modprobe(char *module_name, int wait)
70{
71 static char *envp[] = {
72 "HOME=/",
73 "TERM=linux",
74 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
75 NULL
76 };
77
78 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
79 if (!argv)
80 goto out;
81
82 module_name = kstrdup(module_name, GFP_KERNEL);
83 if (!module_name)
84 goto free_argv;
85
86 argv[0] = modprobe_path;
87 argv[1] = "-q";
88 argv[2] = "--";
89 argv[3] = module_name; /* check free_modprobe_argv() */
90 argv[4] = NULL;
91
92 return call_usermodehelper_fns(modprobe_path, argv, envp,
93 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
94free_argv:
95 kfree(argv);
96out:
97 return -ENOMEM;
98}
99
63/** 100/**
64 * __request_module - try to load a kernel module 101 * __request_module - try to load a kernel module
65 * @wait: wait (or not) for the operation to complete 102 * @wait: wait (or not) for the operation to complete
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...)
81 char module_name[MODULE_NAME_LEN]; 118 char module_name[MODULE_NAME_LEN];
82 unsigned int max_modprobes; 119 unsigned int max_modprobes;
83 int ret; 120 int ret;
84 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
85 static char *envp[] = { "HOME=/",
86 "TERM=linux",
87 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
88 NULL };
89 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 121 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
90#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 122#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
91 static int kmod_loop_msg; 123 static int kmod_loop_msg;
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)
128 160
129 trace_module_request(module_name, wait, _RET_IP_); 161 trace_module_request(module_name, wait, _RET_IP_);
130 162
131 ret = call_usermodehelper_fns(modprobe_path, argv, envp, 163 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
132 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
133 NULL, NULL, NULL);
134 164
135 atomic_dec(&kmod_concurrent); 165 atomic_dec(&kmod_concurrent);
136 return ret; 166 return ret;
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)
188 /* Exec failed? */ 218 /* Exec failed? */
189fail: 219fail:
190 sub_info->retval = retval; 220 sub_info->retval = retval;
191 do_exit(0); 221 return 0;
192} 222}
193 223
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 224void call_usermodehelper_freeinfo(struct subprocess_info *info)
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
199} 229}
200EXPORT_SYMBOL(call_usermodehelper_freeinfo); 230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
201 231
232static void umh_complete(struct subprocess_info *sub_info)
233{
234 struct completion *comp = xchg(&sub_info->complete, NULL);
235 /*
236 * See call_usermodehelper_exec(). If xchg() returns NULL
237 * we own sub_info, the UMH_KILLABLE caller has gone away.
238 */
239 if (comp)
240 complete(comp);
241 else
242 call_usermodehelper_freeinfo(sub_info);
243}
244
202/* Keventd can't block, but this (a child) can. */ 245/* Keventd can't block, but this (a child) can. */
203static int wait_for_helper(void *data) 246static int wait_for_helper(void *data)
204{ 247{
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data)
235 sub_info->retval = ret; 278 sub_info->retval = ret;
236 } 279 }
237 280
238 complete(sub_info->complete); 281 umh_complete(sub_info);
239 return 0; 282 return 0;
240} 283}
241 284
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)
244{ 287{
245 struct subprocess_info *sub_info = 288 struct subprocess_info *sub_info =
246 container_of(work, struct subprocess_info, work); 289 container_of(work, struct subprocess_info, work);
247 enum umh_wait wait = sub_info->wait; 290 int wait = sub_info->wait & ~UMH_KILLABLE;
248 pid_t pid; 291 pid_t pid;
249 292
250 /* CLONE_VFORK: wait until the usermode helper has execve'd 293 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)
269 case UMH_WAIT_EXEC: 312 case UMH_WAIT_EXEC:
270 if (pid < 0) 313 if (pid < 0)
271 sub_info->retval = pid; 314 sub_info->retval = pid;
272 complete(sub_info->complete); 315 umh_complete(sub_info);
273 } 316 }
274} 317}
275 318
@@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
435 * asynchronously if wait is not set, and runs as a child of keventd. 478 * asynchronously if wait is not set, and runs as a child of keventd.
436 * (ie. it runs with full root capabilities). 479 * (ie. it runs with full root capabilities).
437 */ 480 */
438int call_usermodehelper_exec(struct subprocess_info *sub_info, 481int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
439 enum umh_wait wait)
440{ 482{
441 DECLARE_COMPLETION_ONSTACK(done); 483 DECLARE_COMPLETION_ONSTACK(done);
442 int retval = 0; 484 int retval = 0;
@@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
456 queue_work(khelper_wq, &sub_info->work); 498 queue_work(khelper_wq, &sub_info->work);
457 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 499 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
458 goto unlock; 500 goto unlock;
501
502 if (wait & UMH_KILLABLE) {
503 retval = wait_for_completion_killable(&done);
504 if (!retval)
505 goto wait_done;
506
507 /* umh_complete() will see NULL and free sub_info */
508 if (xchg(&sub_info->complete, NULL))
509 goto unlock;
510 /* fallthrough, umh_complete() was already called */
511 }
512
459 wait_for_completion(&done); 513 wait_for_completion(&done);
514wait_done:
460 retval = sub_info->retval; 515 retval = sub_info->retval;
461
462out: 516out:
463 call_usermodehelper_freeinfo(sub_info); 517 call_usermodehelper_freeinfo(sub_info);
464unlock: 518unlock:
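
The UMH_KILLABLE support above hinges on a single atomic exchange of sub_info->complete: whichever side (the helper or the killed waiter) still finds the pointer non-NULL owns the final step, so the request is either completed or freed, never leaked and never completed twice. A minimal user-space sketch of that handoff with C11 atomics; all names are illustrative and none of this is kernel API:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
        _Atomic(int *) done_flag;       /* plays the role of sub_info->complete */
        char *storage;                  /* plays the role of sub_info itself    */
};

static void helper_side(struct request *req)
{
        int *flag = atomic_exchange(&req->done_flag, NULL);

        if (flag)
                *flag = 1;              /* waiter still there: "complete()" it  */
        else
                free(req->storage);     /* waiter was killed: we own cleanup    */
}

static void waiter_killed(struct request *req)
{
        if (atomic_exchange(&req->done_flag, NULL))
                return;                 /* helper not done yet; it will see
                                           NULL later and free the request     */
        /* else the helper already claimed the flag: fall through and collect
           the result, as the kernel code does after the xchg()                 */
}

int main(void)
{
        int done = 0;
        struct request req;

        req.storage = malloc(16);
        atomic_init(&req.done_flag, &done);

        waiter_killed(&req);            /* a fatal signal arrives first         */
        helper_side(&req);              /* helper then sees NULL and frees      */
        printf("done=%d\n", done);      /* 0: completion skipped, nothing leaked */
        return 0;
}
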
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9788c0ec6f43..c62b8546cc90 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1334,8 +1334,10 @@ int __kprobes register_kprobe(struct kprobe *p)
1334 if (!kernel_text_address((unsigned long) p->addr) || 1334 if (!kernel_text_address((unsigned long) p->addr) ||
1335 in_kprobes_functions((unsigned long) p->addr) || 1335 in_kprobes_functions((unsigned long) p->addr) ||
1336 ftrace_text_reserved(p->addr, p->addr) || 1336 ftrace_text_reserved(p->addr, p->addr) ||
1337 jump_label_text_reserved(p->addr, p->addr)) 1337 jump_label_text_reserved(p->addr, p->addr)) {
1338 goto fail_with_jump_label; 1338 ret = -EINVAL;
1339 goto cannot_probe;
1340 }
1339 1341
1340 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1342 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1341 p->flags &= KPROBE_FLAG_DISABLED; 1343 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1352,7 +1354,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1352 * its code to prohibit unexpected unloading. 1354 * its code to prohibit unexpected unloading.
1353 */ 1355 */
1354 if (unlikely(!try_module_get(probed_mod))) 1356 if (unlikely(!try_module_get(probed_mod)))
1355 goto fail_with_jump_label; 1357 goto cannot_probe;
1356 1358
1357 /* 1359 /*
1358 * If the module freed .init.text, we couldn't insert 1360 * If the module freed .init.text, we couldn't insert
@@ -1361,7 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1361 if (within_module_init((unsigned long)p->addr, probed_mod) && 1363 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1362 probed_mod->state != MODULE_STATE_COMING) { 1364 probed_mod->state != MODULE_STATE_COMING) {
1363 module_put(probed_mod); 1365 module_put(probed_mod);
1364 goto fail_with_jump_label; 1366 goto cannot_probe;
1365 } 1367 }
1366 /* ret will be updated by following code */ 1368 /* ret will be updated by following code */
1367 } 1369 }
@@ -1409,7 +1411,7 @@ out:
1409 1411
1410 return ret; 1412 return ret;
1411 1413
1412fail_with_jump_label: 1414cannot_probe:
1413 preempt_enable(); 1415 preempt_enable();
1414 jump_label_unlock(); 1416 jump_label_unlock();
1415 return ret; 1417 return ret;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8889f7dd7c46..ea9ee4518c35 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4176 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4177 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4178 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4180 !rcu_lockdep_current_cpu_online()
4181 ? "RCU used illegally from offline CPU!\n"
4182 : rcu_is_cpu_idle()
4183 ? "RCU used illegally from idle CPU!\n"
4184 : "",
4185 rcu_scheduler_active, debug_locks);
4180 4186
4181 /* 4187 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section 4188 * If a CPU is in the RCU-free window in idle (ie: in the section
diff --git a/kernel/module.c b/kernel/module.c
index 2c932760fd33..78ac6ec1e425 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
105 105
106/* Block module loading/unloading? */ 106/* Block module loading/unloading? */
107int modules_disabled = 0; 107int modules_disabled = 0;
108core_param(nomodule, modules_disabled, bint, 0);
108 109
109/* Waiting for a module to finish initializing? */ 110/* Waiting for a module to finish initializing? */
110static DECLARE_WAIT_QUEUE_HEAD(module_wq); 111static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
903static struct module_attribute modinfo_refcnt = 904static struct module_attribute modinfo_refcnt =
904 __ATTR(refcnt, 0444, show_refcnt, NULL); 905 __ATTR(refcnt, 0444, show_refcnt, NULL);
905 906
907void __module_get(struct module *module)
908{
909 if (module) {
910 preempt_disable();
911 __this_cpu_inc(module->refptr->incs);
912 trace_module_get(module, _RET_IP_);
913 preempt_enable();
914 }
915}
916EXPORT_SYMBOL(__module_get);
917
918bool try_module_get(struct module *module)
919{
920 bool ret = true;
921
922 if (module) {
923 preempt_disable();
924
925 if (likely(module_is_live(module))) {
926 __this_cpu_inc(module->refptr->incs);
927 trace_module_get(module, _RET_IP_);
928 } else
929 ret = false;
930
931 preempt_enable();
932 }
933 return ret;
934}
935EXPORT_SYMBOL(try_module_get);
936
906void module_put(struct module *module) 937void module_put(struct module *module)
907{ 938{
908 if (module) { 939 if (module) {
@@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info,
2380 return -ENOEXEC; 2411 return -ENOEXEC;
2381 2412
2382 /* Suck in entire file: we'll want most of it. */ 2413 /* Suck in entire file: we'll want most of it. */
2383 /* vmalloc barfs on "unusual" numbers. Check here */ 2414 if ((hdr = vmalloc(len)) == NULL)
2384 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2385 return -ENOMEM; 2415 return -ENOMEM;
2386 2416
2387 if (copy_from_user(hdr, umod, len) != 0) { 2417 if (copy_from_user(hdr, umod, len) != 0) {
@@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod,
2922 mutex_unlock(&module_mutex); 2952 mutex_unlock(&module_mutex);
2923 2953
2924 /* Module is ready to execute: parsing args may do that. */ 2954 /* Module is ready to execute: parsing args may do that. */
2925 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL);
2926 if (err < 0) 2957 if (err < 0)
2927 goto unlink; 2958 goto unlink;
2928 2959
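
try_module_get() as reworked above only grants a reference while the module is still live; the kernel closes the check/increment race with preempt_disable() and its per-CPU reference counters. A simplified user-space sketch of the liveness-gated grab, with plain atomics standing in for those mechanisms (illustrative names only, and without the kernel's race-closing guarantees):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct object {
        atomic_bool live;       /* module_is_live() analogue            */
        atomic_int  refs;       /* stands in for the per-CPU counters   */
};

static bool try_get(struct object *obj)
{
        if (!atomic_load(&obj->live))
                return false;                   /* teardown already started */
        atomic_fetch_add(&obj->refs, 1);
        return true;
}

static void put(struct object *obj)
{
        atomic_fetch_sub(&obj->refs, 1);
}

int main(void)
{
        struct object obj;

        atomic_init(&obj.live, true);
        atomic_init(&obj.refs, 0);

        if (try_get(&obj)) {
                printf("got ref, refs=%d\n", atomic_load(&obj.refs));
                put(&obj);
        }

        atomic_store(&obj.live, false);         /* "module unload" begins */
        printf("after teardown starts: %s\n",
               try_get(&obj) ? "granted" : "refused");
        return 0;
}
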
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 89096dd8786f..a307cc9c9526 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
240 240
241 /* didn't get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
242 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
243 preempt_enable_no_resched(); 243 schedule_preempt_disabled();
244 schedule();
245 preempt_disable();
246 spin_lock_mutex(&lock->wait_lock, flags); 244 spin_lock_mutex(&lock->wait_lock, flags);
247 } 245 }
248 246
diff --git a/kernel/padata.c b/kernel/padata.c
index b45259931512..6f10eb285ece 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,6 @@
29#include <linux/sysfs.h> 29#include <linux/sysfs.h>
30#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
31 31
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
33#define MAX_OBJ_NUM 1000 32#define MAX_OBJ_NUM 1000
34 33
35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
43 return target_cpu; 42 return target_cpu;
44} 43}
45 44
46static int padata_cpu_hash(struct padata_priv *padata) 45static int padata_cpu_hash(struct parallel_data *pd)
47{ 46{
48 int cpu_index; 47 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
52 48
53 /* 49 /*
54 * Hash the sequence numbers to the cpus by taking 50 * Hash the sequence numbers to the cpus by taking
55 * seq_nr mod. number of cpus in use. 51 * seq_nr mod. number of cpus in use.
56 */ 52 */
57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); 53
54 spin_lock(&pd->seq_lock);
55 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 pd->seq_nr++;
57 spin_unlock(&pd->seq_lock);
58 58
59 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
60} 60}
@@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,
132 padata->pd = pd; 132 padata->pd = pd;
133 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
134 134
135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) 135 target_cpu = padata_cpu_hash(pd);
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
141 queue = per_cpu_ptr(pd->pqueue, target_cpu); 136 queue = per_cpu_ptr(pd->pqueue, target_cpu);
142 137
143 spin_lock(&queue->parallel.lock); 138 spin_lock(&queue->parallel.lock);
@@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel);
173static struct padata_priv *padata_get_next(struct parallel_data *pd) 168static struct padata_priv *padata_get_next(struct parallel_data *pd)
174{ 169{
175 int cpu, num_cpus; 170 int cpu, num_cpus;
176 int next_nr, next_index; 171 unsigned int next_nr, next_index;
177 struct padata_parallel_queue *queue, *next_queue; 172 struct padata_parallel_queue *queue, *next_queue;
178 struct padata_priv *padata; 173 struct padata_priv *padata;
179 struct padata_list *reorder; 174 struct padata_list *reorder;
@@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
189 cpu = padata_index_to_cpu(pd, next_index); 184 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu); 185 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191 186
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
200 padata = NULL; 187 padata = NULL;
201 188
202 reorder = &next_queue->reorder; 189 reorder = &next_queue->reorder;
@@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
205 padata = list_entry(reorder->list.next, 192 padata = list_entry(reorder->list.next,
206 struct padata_priv, list); 193 struct padata_priv, list);
207 194
208 BUG_ON(next_nr != padata->seq_nr);
209
210 spin_lock(&reorder->lock); 195 spin_lock(&reorder->lock);
211 list_del_init(&padata->list); 196 list_del_init(&padata->list);
212 atomic_dec(&pd->reorder_objects); 197 atomic_dec(&pd->reorder_objects);
@@ -230,6 +215,7 @@ out:
230 215
231static void padata_reorder(struct parallel_data *pd) 216static void padata_reorder(struct parallel_data *pd)
232{ 217{
218 int cb_cpu;
233 struct padata_priv *padata; 219 struct padata_priv *padata;
234 struct padata_serial_queue *squeue; 220 struct padata_serial_queue *squeue;
235 struct padata_instance *pinst = pd->pinst; 221 struct padata_instance *pinst = pd->pinst;
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd)
270 return; 256 return;
271 } 257 }
272 258
273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); 259 cb_cpu = padata->cb_cpu;
260 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
274 261
275 spin_lock(&squeue->serial.lock); 262 spin_lock(&squeue->serial.lock);
276 list_add_tail(&padata->list, &squeue->serial.list); 263 list_add_tail(&padata->list, &squeue->serial.list);
277 spin_unlock(&squeue->serial.lock); 264 spin_unlock(&squeue->serial.lock);
278 265
279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); 266 queue_work_on(cb_cpu, pinst->wq, &squeue->work);
280 } 267 }
281 268
282 spin_unlock_bh(&pd->lock); 269 spin_unlock_bh(&pd->lock);
@@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd)
400/* Initialize all percpu queues used by parallel workers */ 387/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd) 388static void padata_init_pqueues(struct parallel_data *pd)
402{ 389{
403 int cpu_index, num_cpus, cpu; 390 int cpu_index, cpu;
404 struct padata_parallel_queue *pqueue; 391 struct padata_parallel_queue *pqueue;
405 392
406 cpu_index = 0; 393 cpu_index = 0;
@@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
415 INIT_WORK(&pqueue->work, padata_parallel_worker); 402 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0); 403 atomic_set(&pqueue->num_obj, 0);
417 } 404 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421} 405}
422 406
423/* Allocate and initialize the internal cpumask dependend resources. */ 407/* Allocate and initialize the internal cpumask dependend resources. */
@@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
444 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
445 padata_init_squeues(pd); 429 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
447 atomic_set(&pd->seq_nr, -1); 431 pd->seq_nr = 0;
448 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
449 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
450 pd->pinst = pinst; 434 pd->pinst = pinst;
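
The padata rework above moves the sequence number into parallel_data and bumps it under pd->seq_lock, letting natural unsigned wrap-around replace the old MAX_SEQ_NR bookkeeping. A small pthread-based sketch of the same round-robin hashing (illustrative types, not the padata structures):

#include <pthread.h>
#include <stdio.h>

struct parallel_state {
        pthread_mutex_t seq_lock;
        unsigned int    seq_nr;
        unsigned int    nr_cpus;
};

static unsigned int cpu_hash(struct parallel_state *pd)
{
        unsigned int index;

        pthread_mutex_lock(&pd->seq_lock);
        index = pd->seq_nr % pd->nr_cpus;       /* seq_nr mod. cpus in use */
        pd->seq_nr++;                           /* unsigned: wraps safely  */
        pthread_mutex_unlock(&pd->seq_lock);

        return index;
}

int main(void)
{
        struct parallel_state pd = {
                .seq_lock = PTHREAD_MUTEX_INITIALIZER,
                .seq_nr   = 0,
                .nr_cpus  = 4,
        };

        for (int i = 0; i < 6; i++)
                printf("object %d -> cpu index %u\n", i, cpu_hash(&pd));
        return 0;
}
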
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1fe..f37d82631347 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/module.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/string.h> 19#include <linux/string.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
@@ -88,6 +87,8 @@ static int parse_one(char *param,
88 char *val, 87 char *val,
89 const struct kernel_param *params, 88 const struct kernel_param *params,
90 unsigned num_params, 89 unsigned num_params,
90 s16 min_level,
91 s16 max_level,
91 int (*handle_unknown)(char *param, char *val)) 92 int (*handle_unknown)(char *param, char *val))
92{ 93{
93 unsigned int i; 94 unsigned int i;
@@ -96,6 +97,9 @@ static int parse_one(char *param,
96 /* Find parameter */ 97 /* Find parameter */
97 for (i = 0; i < num_params; i++) { 98 for (i = 0; i < num_params; i++) {
98 if (parameq(param, params[i].name)) { 99 if (parameq(param, params[i].name)) {
100 if (params[i].level < min_level
101 || params[i].level > max_level)
102 return 0;
99 /* No one handled NULL, so do it here. */ 103 /* No one handled NULL, so do it here. */
100 if (!val && params[i].ops->set != param_set_bool 104 if (!val && params[i].ops->set != param_set_bool
101 && params[i].ops->set != param_set_bint) 105 && params[i].ops->set != param_set_bint)
@@ -175,6 +179,8 @@ int parse_args(const char *name,
175 char *args, 179 char *args,
176 const struct kernel_param *params, 180 const struct kernel_param *params,
177 unsigned num, 181 unsigned num,
182 s16 min_level,
183 s16 max_level,
178 int (*unknown)(char *param, char *val)) 184 int (*unknown)(char *param, char *val))
179{ 185{
180 char *param, *val; 186 char *param, *val;
@@ -190,7 +196,8 @@ int parse_args(const char *name,
190 196
191 args = next_arg(args, &param, &val); 197 args = next_arg(args, &param, &val);
192 irq_was_disabled = irqs_disabled(); 198 irq_was_disabled = irqs_disabled();
193 ret = parse_one(param, val, params, num, unknown); 199 ret = parse_one(param, val, params, num,
200 min_level, max_level, unknown);
194 if (irq_was_disabled && !irqs_disabled()) { 201 if (irq_was_disabled && !irqs_disabled()) {
195 printk(KERN_WARNING "parse_args(): option '%s' enabled " 202 printk(KERN_WARNING "parse_args(): option '%s' enabled "
196 "irq's!\n", param); 203 "irq's!\n", param);
@@ -298,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp);
298/* Actually could be a bool or an int, for historical reasons. */ 305/* Actually could be a bool or an int, for historical reasons. */
299int param_set_bool(const char *val, const struct kernel_param *kp) 306int param_set_bool(const char *val, const struct kernel_param *kp)
300{ 307{
301 bool v;
302 int ret;
303
304 /* No equals means "set"... */ 308 /* No equals means "set"... */
305 if (!val) val = "1"; 309 if (!val) val = "1";
306 310
307 /* One of =[yYnN01] */ 311 /* One of =[yYnN01] */
308 ret = strtobool(val, &v); 312 return strtobool(val, kp->arg);
309 if (ret)
310 return ret;
311
312 if (kp->flags & KPARAM_ISBOOL)
313 *(bool *)kp->arg = v;
314 else
315 *(int *)kp->arg = v;
316 return 0;
317} 313}
318EXPORT_SYMBOL(param_set_bool); 314EXPORT_SYMBOL(param_set_bool);
319 315
320int param_get_bool(char *buffer, const struct kernel_param *kp) 316int param_get_bool(char *buffer, const struct kernel_param *kp)
321{ 317{
322 bool val;
323 if (kp->flags & KPARAM_ISBOOL)
324 val = *(bool *)kp->arg;
325 else
326 val = *(int *)kp->arg;
327
328 /* Y and N chosen as being relatively non-coder friendly */ 318 /* Y and N chosen as being relatively non-coder friendly */
329 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 319 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
330} 320}
331EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
332 322
@@ -344,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
344 struct kernel_param dummy; 334 struct kernel_param dummy;
345 335
346 dummy.arg = &boolval; 336 dummy.arg = &boolval;
347 dummy.flags = KPARAM_ISBOOL;
348 ret = param_set_bool(val, &dummy); 337 ret = param_set_bool(val, &dummy);
349 if (ret == 0) 338 if (ret == 0)
350 *(bool *)kp->arg = !boolval; 339 *(bool *)kp->arg = !boolval;
@@ -373,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
373 /* Match bool exactly, by re-using it. */ 362 /* Match bool exactly, by re-using it. */
374 boolkp = *kp; 363 boolkp = *kp;
375 boolkp.arg = &v; 364 boolkp.arg = &v;
376 boolkp.flags |= KPARAM_ISBOOL;
377 365
378 ret = param_set_bool(val, &boolkp); 366 ret = param_set_bool(val, &boolkp);
379 if (ret == 0) 367 if (ret == 0)
@@ -394,7 +382,7 @@ static int param_array(const char *name,
394 unsigned int min, unsigned int max, 382 unsigned int min, unsigned int max,
395 void *elem, int elemsize, 383 void *elem, int elemsize,
396 int (*set)(const char *, const struct kernel_param *kp), 384 int (*set)(const char *, const struct kernel_param *kp),
397 u16 flags, 385 s16 level,
398 unsigned int *num) 386 unsigned int *num)
399{ 387{
400 int ret; 388 int ret;
@@ -404,7 +392,7 @@ static int param_array(const char *name,
404 /* Get the name right for errors. */ 392 /* Get the name right for errors. */
405 kp.name = name; 393 kp.name = name;
406 kp.arg = elem; 394 kp.arg = elem;
407 kp.flags = flags; 395 kp.level = level;
408 396
409 *num = 0; 397 *num = 0;
410 /* We expect a comma-separated list of values. */ 398 /* We expect a comma-separated list of values. */
@@ -445,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
445 unsigned int temp_num; 433 unsigned int temp_num;
446 434
447 return param_array(kp->name, val, 1, arr->max, arr->elem, 435 return param_array(kp->name, val, 1, arr->max, arr->elem,
448 arr->elemsize, arr->ops->set, kp->flags, 436 arr->elemsize, arr->ops->set, kp->level,
449 arr->num ?: &temp_num); 437 arr->num ?: &temp_num);
450} 438}
451 439
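
parse_one() now skips any known parameter whose level falls outside the caller's [min_level, max_level] window, which is how different parameter-handling passes are kept apart. A compact user-space sketch of that filtering (hypothetical types, not struct kernel_param):

#include <stdio.h>
#include <string.h>

struct demo_param {
        const char *name;
        short       level;
        int        *value;
};

static int parse_one(const char *name, int val,
                     const struct demo_param *params, unsigned int n,
                     short min_level, short max_level)
{
        for (unsigned int i = 0; i < n; i++) {
                if (strcmp(name, params[i].name))
                        continue;
                if (params[i].level < min_level ||
                    params[i].level > max_level)
                        return 0;       /* known, but not for this pass */
                *params[i].value = val;
                return 0;
        }
        return -1;                      /* unknown parameter */
}

int main(void)
{
        int early = 0, late = 0;
        struct demo_param params[] = {
                { "early_opt", -1, &early },
                { "late_opt",   1, &late  },
        };

        /* An "early" pass that only applies parameters with level <= 0: */
        parse_one("early_opt", 7, params, 2, -32768, 0);
        parse_one("late_opt",  9, params, 2, -32768, 0);
        printf("early=%d late=%d\n", early, late);      /* early=7 late=0 */
        return 0;
}
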
diff --git a/kernel/pid.c b/kernel/pid.c
index ce8e00deaccb..9f08dfabaf13 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
543 */ 543 */
544void __init pidhash_init(void) 544void __init pidhash_init(void)
545{ 545{
546 int i, pidhash_size; 546 unsigned int i, pidhash_size;
547 547
548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
549 HASH_EARLY | HASH_SMALL, 549 HASH_EARLY | HASH_SMALL,
550 &pidhash_shift, NULL, 4096); 550 &pidhash_shift, NULL, 4096);
551 pidhash_size = 1 << pidhash_shift; 551 pidhash_size = 1U << pidhash_shift;
552 552
553 for (i = 0; i < pidhash_size; i++) 553 for (i = 0; i < pidhash_size; i++)
554 INIT_HLIST_HEAD(&pid_hash[i]); 554 INIT_HLIST_HEAD(&pid_hash[i]);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046d..57bc1fd35b3c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,6 +15,7 @@
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/reboot.h>
18 19
19#define BITS_PER_PAGE (PAGE_SIZE*8) 20#define BITS_PER_PAGE (PAGE_SIZE*8)
20 21
@@ -168,13 +169,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
168 while (nr > 0) { 169 while (nr > 0) {
169 rcu_read_lock(); 170 rcu_read_lock();
170 171
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
175 task = pid_task(find_vpid(nr), PIDTYPE_PID); 172 task = pid_task(find_vpid(nr), PIDTYPE_PID);
176 if (task) 173 if (task && !__fatal_signal_pending(task))
177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); 174 send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
178 175
179 rcu_read_unlock(); 176 rcu_read_unlock();
180 177
@@ -187,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
187 rc = sys_wait4(-1, NULL, __WALL, NULL); 184 rc = sys_wait4(-1, NULL, __WALL, NULL);
188 } while (rc != -ECHILD); 185 } while (rc != -ECHILD);
189 186
187 if (pid_ns->reboot)
188 current->signal->group_exit_code = pid_ns->reboot;
189
190 acct_exit_ns(pid_ns); 190 acct_exit_ns(pid_ns);
191 return; 191 return;
192} 192}
@@ -221,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = {
221 221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223 223
224int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
225{
226 if (pid_ns == &init_pid_ns)
227 return 0;
228
229 switch (cmd) {
230 case LINUX_REBOOT_CMD_RESTART2:
231 case LINUX_REBOOT_CMD_RESTART:
232 pid_ns->reboot = SIGHUP;
233 break;
234
235 case LINUX_REBOOT_CMD_POWER_OFF:
236 case LINUX_REBOOT_CMD_HALT:
237 pid_ns->reboot = SIGINT;
238 break;
239 default:
240 return -EINVAL;
241 }
242
243 read_lock(&tasklist_lock);
244 force_sig(SIGKILL, pid_ns->child_reaper);
245 read_unlock(&tasklist_lock);
246
247 do_exit(0);
248
249 /* Not reached */
250 return 0;
251}
252
224static __init int pid_namespaces_init(void) 253static __init int pid_namespaces_init(void)
225{ 254{
226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 255 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
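
reboot_pid_ns() above records the request as SIGHUP (restart) or SIGINT (halt/poweroff) and kills the namespace init, and zap_pid_ns_processes() then propagates that value through group_exit_code. Assuming that convention, a container manager in the parent namespace could tell the two cases apart from the init's wait status; a hypothetical manager-side sketch, not part of the patch:

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static void handle_container_exit(pid_t init_pid)
{
        int status;

        if (waitpid(init_pid, &status, 0) < 0)
                return;

        if (WIFSIGNALED(status) && WTERMSIG(status) == SIGHUP)
                printf("container requested reboot -> restart it\n");
        else if (WIFSIGNALED(status) && WTERMSIG(status) == SIGINT)
                printf("container requested halt/poweroff -> leave it down\n");
        else
                printf("container init exited normally or crashed\n");
}

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {                 /* stand-in for the namespace init */
                raise(SIGHUP);          /* as if reboot had been requested */
                _exit(0);
        }
        handle_container_exit(pid);
        return 0;
}
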
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07e0e28ffba7..66d808ec5252 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o qos.o 4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o
5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 7obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 8obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..0a186cfde788 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
245 * create_image - Create a hibernation image. 245 * create_image - Create a hibernation image.
246 * @platform_mode: Whether or not to use the platform driver. 246 * @platform_mode: Whether or not to use the platform driver.
247 * 247 *
248 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image 248 * Execute device drivers' "late" and "noirq" freeze callbacks, create a
249 * and execute the drivers' .thaw_noirq() callbacks. 249 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
250 * 250 *
251 * Control reappears in this routine after the subsequent restore. 251 * Control reappears in this routine after the subsequent restore.
252 */ 252 */
@@ -254,7 +254,7 @@ static int create_image(int platform_mode)
254{ 254{
255 int error; 255 int error;
256 256
257 error = dpm_suspend_noirq(PMSG_FREEZE); 257 error = dpm_suspend_end(PMSG_FREEZE);
258 if (error) { 258 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 259 printk(KERN_ERR "PM: Some devices failed to power down, "
260 "aborting hibernation\n"); 260 "aborting hibernation\n");
@@ -306,7 +306,7 @@ static int create_image(int platform_mode)
306 Platform_finish: 306 Platform_finish:
307 platform_finish(platform_mode); 307 platform_finish(platform_mode);
308 308
309 dpm_resume_noirq(in_suspend ? 309 dpm_resume_start(in_suspend ?
310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
311 311
312 return error; 312 return error;
@@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode)
343 * successful freezer test. 343 * successful freezer test.
344 */ 344 */
345 freezer_test_done = true; 345 freezer_test_done = true;
346 goto Cleanup; 346 goto Thaw;
347 } 347 }
348 348
349 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
350 if (error) { 350 if (error) {
351 dpm_complete(PMSG_RECOVER); 351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup; 352 goto Thaw;
353 } 353 }
354 354
355 suspend_console(); 355 suspend_console();
@@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode)
385 platform_end(platform_mode); 385 platform_end(platform_mode);
386 return error; 386 return error;
387 387
388 Thaw:
389 thaw_kernel_threads();
388 Cleanup: 390 Cleanup:
389 swsusp_free(); 391 swsusp_free();
390 goto Close; 392 goto Close;
@@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode)
394 * resume_target_kernel - Restore system state from a hibernation image. 396 * resume_target_kernel - Restore system state from a hibernation image.
395 * @platform_mode: Whether or not to use the platform driver. 397 * @platform_mode: Whether or not to use the platform driver.
396 * 398 *
397 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of 399 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
398 * highmem that have not been restored yet from the image and run the low-level 400 * contents of highmem that have not been restored yet from the image and run
399 * code that will restore the remaining contents of memory and switch to the 401 * the low-level code that will restore the remaining contents of memory and
400 * just restored target kernel. 402 * switch to the just restored target kernel.
401 */ 403 */
402static int resume_target_kernel(bool platform_mode) 404static int resume_target_kernel(bool platform_mode)
403{ 405{
404 int error; 406 int error;
405 407
406 error = dpm_suspend_noirq(PMSG_QUIESCE); 408 error = dpm_suspend_end(PMSG_QUIESCE);
407 if (error) { 409 if (error) {
408 printk(KERN_ERR "PM: Some devices failed to power down, " 410 printk(KERN_ERR "PM: Some devices failed to power down, "
409 "aborting resume\n"); 411 "aborting resume\n");
@@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode)
460 Cleanup: 462 Cleanup:
461 platform_restore_cleanup(platform_mode); 463 platform_restore_cleanup(platform_mode);
462 464
463 dpm_resume_noirq(PMSG_RECOVER); 465 dpm_resume_start(PMSG_RECOVER);
464 466
465 return error; 467 return error;
466} 468}
@@ -518,7 +520,7 @@ int hibernation_platform_enter(void)
518 goto Resume_devices; 520 goto Resume_devices;
519 } 521 }
520 522
521 error = dpm_suspend_noirq(PMSG_HIBERNATE); 523 error = dpm_suspend_end(PMSG_HIBERNATE);
522 if (error) 524 if (error)
523 goto Resume_devices; 525 goto Resume_devices;
524 526
@@ -549,7 +551,7 @@ int hibernation_platform_enter(void)
549 Platform_finish: 551 Platform_finish:
550 hibernation_ops->finish(); 552 hibernation_ops->finish();
551 553
552 dpm_resume_noirq(PMSG_RESTORE); 554 dpm_resume_start(PMSG_RESTORE);
553 555
554 Resume_devices: 556 Resume_devices:
555 entering_platform_hibernation = false; 557 entering_platform_hibernation = false;
@@ -616,7 +618,7 @@ int hibernate(void)
616 /* Allocate memory management structures */ 618 /* Allocate memory management structures */
617 error = create_basic_memory_bitmaps(); 619 error = create_basic_memory_bitmaps();
618 if (error) 620 if (error)
619 goto Exit; 621 goto Enable_umh;
620 622
621 printk(KERN_INFO "PM: Syncing filesystems ... "); 623 printk(KERN_INFO "PM: Syncing filesystems ... ");
622 sys_sync(); 624 sys_sync();
@@ -624,15 +626,11 @@ int hibernate(void)
624 626
625 error = freeze_processes(); 627 error = freeze_processes();
626 if (error) 628 if (error)
627 goto Finish; 629 goto Free_bitmaps;
628 630
629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 631 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
630 if (error) 632 if (error || freezer_test_done)
631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw; 633 goto Thaw;
635 }
636 634
637 if (in_suspend) { 635 if (in_suspend) {
638 unsigned int flags = 0; 636 unsigned int flags = 0;
@@ -657,8 +655,13 @@ int hibernate(void)
657 655
658 Thaw: 656 Thaw:
659 thaw_processes(); 657 thaw_processes();
660 Finish: 658
659 /* Don't bother checking whether freezer_test_done is true */
660 freezer_test_done = false;
661
662 Free_bitmaps:
661 free_basic_memory_bitmaps(); 663 free_basic_memory_bitmaps();
664 Enable_umh:
662 usermodehelper_enable(); 665 usermodehelper_enable();
663 Exit: 666 Exit:
664 pm_notifier_call_chain(PM_POST_HIBERNATION); 667 pm_notifier_call_chain(PM_POST_HIBERNATION);
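
The relabelled error paths above (Thaw, Free_bitmaps, Enable_umh) follow the usual goto-ladder shape: each failing step jumps to a label that undoes only what was already set up, in reverse order. A generic sketch of the pattern (not the hibernate code itself):

#include <stdio.h>
#include <stdlib.h>

static int do_transaction(void)
{
        int err = -1;
        char *bitmaps, *image;

        bitmaps = malloc(64);                   /* step 1 */
        if (!bitmaps)
                goto out;                       /* nothing to undo yet */

        image = malloc(256);                    /* step 2 */
        if (!image)
                goto free_bitmaps;              /* undo step 1 only */

        /* ... the real work would happen here ... */
        err = 0;

        free(image);
free_bitmaps:
        free(bitmaps);
out:
        return err;
}

int main(void)
{
        printf("do_transaction() = %d\n", do_transaction());
        return 0;
}
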
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a18..1c12581f1c62 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
165 last_errno %= REC_FAILED_NUM; 165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; 166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM; 167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" 168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", 169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success, 170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail, 171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze, 172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare, 173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend, 174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
175 "failed_suspend_noirq", 177 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq, 178 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume, 179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
178 "failed_resume_noirq", 182 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq); 183 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", 184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
@@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
287 291
288#ifdef CONFIG_SUSPEND 292#ifdef CONFIG_SUSPEND
289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
295 error = pm_suspend(state);
291 break; 296 break;
292 } 297 }
293 if (state < PM_SUSPEND_MAX && *s) {
294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 } 298 }
301#endif 299#endif
302 300
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 21724eee5206..98f3622d7407 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -177,13 +177,11 @@ extern const char *const pm_states[];
177 177
178extern bool valid_state(suspend_state_t state); 178extern bool valid_state(suspend_state_t state);
179extern int suspend_devices_and_enter(suspend_state_t state); 179extern int suspend_devices_and_enter(suspend_state_t state);
180extern int enter_state(suspend_state_t state);
181#else /* !CONFIG_SUSPEND */ 180#else /* !CONFIG_SUSPEND */
182static inline int suspend_devices_and_enter(suspend_state_t state) 181static inline int suspend_devices_and_enter(suspend_state_t state)
183{ 182{
184 return -ENOSYS; 183 return -ENOSYS;
185} 184}
186static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
187static inline bool valid_state(suspend_state_t state) { return false; } 185static inline bool valid_state(suspend_state_t state) { return false; }
188#endif /* !CONFIG_SUSPEND */ 186#endif /* !CONFIG_SUSPEND */
189 187
@@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void)
234 int error; 232 int error;
235 233
236 error = freeze_processes(); 234 error = freeze_processes();
237
238 /* 235 /*
239 * freeze_processes() automatically thaws every task if freezing 236 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error. 237 * fails. So we need not do anything extra upon error.
241 */ 238 */
242 if (error) 239 if (error)
243 goto Finish; 240 return error;
244 241
245 error = freeze_kernel_threads(); 242 error = freeze_kernel_threads();
246
247 /* 243 /*
248 * freeze_kernel_threads() thaws only kernel threads upon freezing 244 * freeze_kernel_threads() thaws only kernel threads upon freezing
249 * failure. So we have to thaw the userspace tasks ourselves. 245 * failure. So we have to thaw the userspace tasks ourselves.
@@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void)
251 if (error) 247 if (error)
252 thaw_processes(); 248 thaw_processes();
253 249
254 Finish:
255 return error; 250 return error;
256} 251}
257 252
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7e426459e60a..0d2aeb226108 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only)
53 * It is "frozen enough". If the task does wake 53 * It is "frozen enough". If the task does wake
54 * up, it will immediately call try_to_freeze. 54 * up, it will immediately call try_to_freeze.
55 * 55 *
56 * Because freeze_task() goes through p's 56 * Because freeze_task() goes through p's scheduler lock, it's
57 * scheduler lock after setting TIF_FREEZE, it's 57 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
58 * guaranteed that either we see TASK_RUNNING or 58 * transition can't race with task state testing here.
59 * try_to_stop() after schedule() in ptrace/signal
60 * stop sees TIF_FREEZE.
61 */ 59 */
62 if (!task_is_stopped_or_traced(p) && 60 if (!task_is_stopped_or_traced(p) &&
63 !freezer_should_skip(p)) 61 !freezer_should_skip(p))
@@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only)
98 elapsed_csecs / 100, elapsed_csecs % 100, 96 elapsed_csecs / 100, elapsed_csecs % 100,
99 todo - wq_busy, wq_busy); 97 todo - wq_busy, wq_busy);
100 98
101 read_lock(&tasklist_lock); 99 if (!wakeup) {
102 do_each_thread(g, p) { 100 read_lock(&tasklist_lock);
103 if (!wakeup && !freezer_should_skip(p) && 101 do_each_thread(g, p) {
104 p != current && freezing(p) && !frozen(p)) 102 if (p != current && !freezer_should_skip(p)
105 sched_show_task(p); 103 && freezing(p) && !frozen(p))
106 } while_each_thread(g, p); 104 sched_show_task(p);
107 read_unlock(&tasklist_lock); 105 } while_each_thread(g, p);
106 read_unlock(&tasklist_lock);
107 }
108 } else { 108 } else {
109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
110 elapsed_csecs % 100); 110 elapsed_csecs % 100);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417b..d6d6dbd1ecc0 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
469static int __init pm_qos_power_init(void) 469static int __init pm_qos_power_init(void)
470{ 470{
471 int ret = 0; 471 int ret = 0;
472 int i;
472 473
473 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 474 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
474 if (ret < 0) { 475
475 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 476 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
476 return ret; 477 ret = register_pm_qos_misc(pm_qos_array[i]);
477 } 478 if (ret < 0) {
478 ret = register_pm_qos_misc(&network_lat_pm_qos); 479 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
479 if (ret < 0) { 480 pm_qos_array[i]->name);
480 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 481 return ret;
481 return ret; 482 }
482 } 483 }
483 ret = register_pm_qos_misc(&network_throughput_pm_qos);
484 if (ret < 0)
485 printk(KERN_ERR
486 "pm_qos_param: network_throughput setup failed\n");
487 484
488 return ret; 485 return ret;
489} 486}
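
pm_qos_power_init() now walks pm_qos_array[] instead of open-coding one register call per class, with BUILD_BUG_ON() keeping the table and the enum in sync. A user-space sketch of the same table-driven registration, with _Static_assert standing in for BUILD_BUG_ON (illustrative types; only the class names come from the original code):

#include <stdio.h>

enum qos_class {
        QOS_RESERVED,
        QOS_CPU_DMA,
        QOS_NET_LAT,
        QOS_NET_TPUT,
        QOS_NUM_CLASSES
};

struct qos_desc {
        const char *name;
};

static const struct qos_desc qos_table[] = {
        [QOS_RESERVED] = { NULL },
        [QOS_CPU_DMA]  = { "cpu_dma_latency" },
        [QOS_NET_LAT]  = { "network_latency" },
        [QOS_NET_TPUT] = { "network_throughput" },
};

static int register_one(const struct qos_desc *d)
{
        printf("registered %s\n", d->name);
        return 0;
}

int main(void)
{
        _Static_assert(sizeof(qos_table) / sizeof(qos_table[0]) ==
                       QOS_NUM_CLASSES, "table out of sync with enum");

        for (int i = 1; i < QOS_NUM_CLASSES; i++) {
                int ret = register_one(&qos_table[i]);

                if (ret < 0) {
                        fprintf(stderr, "%s setup failed\n",
                                qos_table[i].name);
                        return ret;
                }
        }
        return 0;
}
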
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6a768e537001..0de28576807d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", 714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
715 region->start_pfn << PAGE_SHIFT, 715 (unsigned long long) region->start_pfn << PAGE_SHIFT,
716 region->end_pfn << PAGE_SHIFT); 716 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
717 - 1);
717 718
718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
719 if (pfn_valid(pfn)) { 720 if (pfn_valid(pfn)) {
@@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1000 s_page = pfn_to_page(src_pfn); 1001 s_page = pfn_to_page(src_pfn);
1001 d_page = pfn_to_page(dst_pfn); 1002 d_page = pfn_to_page(dst_pfn);
1002 if (PageHighMem(s_page)) { 1003 if (PageHighMem(s_page)) {
1003 src = kmap_atomic(s_page, KM_USER0); 1004 src = kmap_atomic(s_page);
1004 dst = kmap_atomic(d_page, KM_USER1); 1005 dst = kmap_atomic(d_page);
1005 do_copy_page(dst, src); 1006 do_copy_page(dst, src);
1006 kunmap_atomic(dst, KM_USER1); 1007 kunmap_atomic(dst);
1007 kunmap_atomic(src, KM_USER0); 1008 kunmap_atomic(src);
1008 } else { 1009 } else {
1009 if (PageHighMem(d_page)) { 1010 if (PageHighMem(d_page)) {
1010 /* Page pointed to by src may contain some kernel 1011 /* Page pointed to by src may contain some kernel
1011 * data modified by kmap_atomic() 1012 * data modified by kmap_atomic()
1012 */ 1013 */
1013 safe_copy_page(buffer, s_page); 1014 safe_copy_page(buffer, s_page);
1014 dst = kmap_atomic(d_page, KM_USER0); 1015 dst = kmap_atomic(d_page);
1015 copy_page(dst, buffer); 1016 copy_page(dst, buffer);
1016 kunmap_atomic(dst, KM_USER0); 1017 kunmap_atomic(dst);
1017 } else { 1018 } else {
1018 safe_copy_page(page_address(d_page), s_page); 1019 safe_copy_page(page_address(d_page), s_page);
1019 } 1020 }
@@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1728 */ 1729 */
1729 void *kaddr; 1730 void *kaddr;
1730 1731
1731 kaddr = kmap_atomic(page, KM_USER0); 1732 kaddr = kmap_atomic(page);
1732 copy_page(buffer, kaddr); 1733 copy_page(buffer, kaddr);
1733 kunmap_atomic(kaddr, KM_USER0); 1734 kunmap_atomic(kaddr);
1734 handle->buffer = buffer; 1735 handle->buffer = buffer;
1735 } else { 1736 } else {
1736 handle->buffer = page_address(page); 1737 handle->buffer = page_address(page);
@@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void)
2014 if (last_highmem_page) { 2015 if (last_highmem_page) {
2015 void *dst; 2016 void *dst;
2016 2017
2017 dst = kmap_atomic(last_highmem_page, KM_USER0); 2018 dst = kmap_atomic(last_highmem_page);
2018 copy_page(dst, buffer); 2019 copy_page(dst, buffer);
2019 kunmap_atomic(dst, KM_USER0); 2020 kunmap_atomic(dst);
2020 last_highmem_page = NULL; 2021 last_highmem_page = NULL;
2021 } 2022 }
2022} 2023}
@@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2309{ 2310{
2310 void *kaddr1, *kaddr2; 2311 void *kaddr1, *kaddr2;
2311 2312
2312 kaddr1 = kmap_atomic(p1, KM_USER0); 2313 kaddr1 = kmap_atomic(p1);
2313 kaddr2 = kmap_atomic(p2, KM_USER1); 2314 kaddr2 = kmap_atomic(p2);
2314 copy_page(buf, kaddr1); 2315 copy_page(buf, kaddr1);
2315 copy_page(kaddr1, kaddr2); 2316 copy_page(kaddr1, kaddr2);
2316 copy_page(kaddr2, buf); 2317 copy_page(kaddr2, buf);
2317 kunmap_atomic(kaddr2, KM_USER1); 2318 kunmap_atomic(kaddr2);
2318 kunmap_atomic(kaddr1, KM_USER0); 2319 kunmap_atomic(kaddr1);
2319} 2320}
2320 2321
2321/** 2322/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed879..88e5c967370d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38 38
39/** 39/**
40 * suspend_set_ops - Set the global suspend method table. 40 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Pointer to ops structure. 41 * @ops: Suspend operations to use.
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
@@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state)
58} 58}
59 59
60/** 60/**
61 * suspend_valid_only_mem - generic memory-only valid callback 61 * suspend_valid_only_mem - Generic memory-only valid callback.
62 * 62 *
63 * Platform drivers that implement mem suspend only and only need 63 * Platform drivers that implement mem suspend only and only need to check for
64 * to check for that in their .valid callback can use this instead 64 * that in their .valid() callback can use this instead of rolling their own
65 * of rolling their own .valid callback. 65 * .valid() callback.
66 */ 66 */
67int suspend_valid_only_mem(suspend_state_t state) 67int suspend_valid_only_mem(suspend_state_t state)
68{ 68{
@@ -83,10 +83,11 @@ static int suspend_test(int level)
83} 83}
84 84
85/** 85/**
86 * suspend_prepare - Do prep work before entering low-power state. 86 * suspend_prepare - Prepare for entering system sleep state.
87 * 87 *
88 * This is common code that is called for each state that we're entering. 88 * Common code run for every system sleep state that can be entered (except for
89 * Run suspend notifiers, allocate a console and stop all processes. 89 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes.
90 */ 91 */
91static int suspend_prepare(void) 92static int suspend_prepare(void)
92{ 93{
@@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
131} 132}
132 133
133/** 134/**
134 * suspend_enter - enter the desired system sleep state. 135 * suspend_enter - Make the system enter the given sleep state.
135 * @state: State to enter 136 * @state: System sleep state to enter.
136 * @wakeup: Returns information that suspend should not be entered again. 137 * @wakeup: Returns information that the sleep state should not be re-entered.
137 * 138 *
138 * This function should be called after devices have been suspended. 139 * This function should be called after devices have been suspended.
139 */ 140 */
@@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
147 goto Platform_finish; 148 goto Platform_finish;
148 } 149 }
149 150
150 error = dpm_suspend_noirq(PMSG_SUSPEND); 151 error = dpm_suspend_end(PMSG_SUSPEND);
151 if (error) { 152 if (error) {
152 printk(KERN_ERR "PM: Some devices failed to power down\n"); 153 printk(KERN_ERR "PM: Some devices failed to power down\n");
153 goto Platform_finish; 154 goto Platform_finish;
@@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
189 if (suspend_ops->wake) 190 if (suspend_ops->wake)
190 suspend_ops->wake(); 191 suspend_ops->wake();
191 192
192 dpm_resume_noirq(PMSG_RESUME); 193 dpm_resume_start(PMSG_RESUME);
193 194
194 Platform_finish: 195 Platform_finish:
195 if (suspend_ops->finish) 196 if (suspend_ops->finish)
@@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
199} 200}
200 201
201/** 202/**
202 * suspend_devices_and_enter - suspend devices and enter the desired system 203 * suspend_devices_and_enter - Suspend devices and enter system sleep state.
203 * sleep state. 204 * @state: System sleep state to enter.
204 * @state: state to enter
205 */ 205 */
206int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
207{ 207{
@@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state)
251} 251}
252 252
253/** 253/**
254 * suspend_finish - Do final work before exiting suspend sequence. 254 * suspend_finish - Clean up before finishing the suspend sequence.
255 * 255 *
256 * Call platform code to clean up, restart processes, and free the 256 * Call platform code to clean up, restart processes, and free the console that
257 * console that we've allocated. This is not called for suspend-to-disk. 257 * we've allocated. This routine is not called for hibernation.
258 */ 258 */
259static void suspend_finish(void) 259static void suspend_finish(void)
260{ 260{
@@ -265,16 +265,14 @@ static void suspend_finish(void)
265} 265}
266 266
267/** 267/**
268 * enter_state - Do common work of entering low-power state. 268 * enter_state - Do common work needed to enter system sleep state.
269 * @state: pm_state structure for state we're entering. 269 * @state: System sleep state to enter.
270 * 270 *
271 * Make sure we're the only ones trying to enter a sleep state. Fail 271 * Make sure that no one else is trying to put the system into a sleep state.
272 * if someone has beat us to it, since we don't want anything weird to 272 * Fail if that's not the case. Otherwise, prepare for system suspend, make the
273 * happen when we wake up. 273 * system enter the given sleep state and clean up after wakeup.
274 * Then, do the setup for suspend, enter the state, and cleaup (after
275 * we've woken up).
276 */ 274 */
277int enter_state(suspend_state_t state) 275static int enter_state(suspend_state_t state)
278{ 276{
279 int error; 277 int error;
280 278
@@ -310,24 +308,26 @@ int enter_state(suspend_state_t state)
310} 308}
311 309
312/** 310/**
313 * pm_suspend - Externally visible function for suspending system. 311 * pm_suspend - Externally visible function for suspending the system.
314 * @state: Enumerated value of state to enter. 312 * @state: System sleep state to enter.
315 * 313 *
316 * Determine whether or not value is within range, get state 314 * Check if the value of @state represents one of the supported states,
317 * structure, and enter (above). 315 * execute enter_state() and update system suspend statistics.
318 */ 316 */
319int pm_suspend(suspend_state_t state) 317int pm_suspend(suspend_state_t state)
320{ 318{
321 int ret; 319 int error;
322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { 320
323 ret = enter_state(state); 321 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
324 if (ret) { 322 return -EINVAL;
325 suspend_stats.fail++; 323
326 dpm_save_failed_errno(ret); 324 error = enter_state(state);
327 } else 325 if (error) {
328 suspend_stats.success++; 326 suspend_stats.fail++;
329 return ret; 327 dpm_save_failed_errno(error);
328 } else {
329 suspend_stats.success++;
330 } 330 }
331 return -EINVAL; 331 return error;
332} 332}
333EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3e100075b13c..33c4329205af 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -249,16 +249,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
249 } 249 }
250 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
251 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
252 if (error) { 252 if (!error) {
253 thaw_kernel_threads();
254 } else {
255 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
256 if (!error && !freezer_test_done) 254 data->ready = !freezer_test_done && !error;
257 data->ready = 1; 255 freezer_test_done = false;
258 if (freezer_test_done) {
259 freezer_test_done = false;
260 thaw_kernel_threads();
261 }
262 } 256 }
263 break; 257 break;
264 258
diff --git a/kernel/printk.c b/kernel/printk.c
index 13c0a1143f49..b663c2c95d39 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -44,6 +44,9 @@
44 44
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46 46
47#define CREATE_TRACE_POINTS
48#include <trace/events/printk.h>
49
47/* 50/*
48 * Architectures can override it: 51 * Architectures can override it:
49 */ 52 */
@@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
542static void _call_console_drivers(unsigned start, 545static void _call_console_drivers(unsigned start,
543 unsigned end, int msg_log_level) 546 unsigned end, int msg_log_level)
544{ 547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
545 if ((msg_log_level < console_loglevel || ignore_loglevel) && 550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
546 console_drivers && start != end) { 551 console_drivers && start != end) {
547 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
@@ -702,6 +707,9 @@ static bool printk_time = 0;
702#endif 707#endif
703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 708module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
704 709
710static bool always_kmsg_dump;
711module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
712
705/* Check if we have any console registered that can be called early in boot. */ 713/* Check if we have any console registered that can be called early in boot. */
706static int have_callable_console(void) 714static int have_callable_console(void)
707{ 715{
@@ -1208,13 +1216,27 @@ int is_console_locked(void)
1208 return console_locked; 1216 return console_locked;
1209} 1217}
1210 1218
1219/*
1220 * Delayed printk facility, for scheduler-internal messages:
1221 */
1222#define PRINTK_BUF_SIZE 512
1223
1224#define PRINTK_PENDING_WAKEUP 0x01
1225#define PRINTK_PENDING_SCHED 0x02
1226
1211static DEFINE_PER_CPU(int, printk_pending); 1227static DEFINE_PER_CPU(int, printk_pending);
1228static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1212 1229
1213void printk_tick(void) 1230void printk_tick(void)
1214{ 1231{
1215 if (__this_cpu_read(printk_pending)) { 1232 if (__this_cpu_read(printk_pending)) {
1216 __this_cpu_write(printk_pending, 0); 1233 int pending = __this_cpu_xchg(printk_pending, 0);
1217 wake_up_interruptible(&log_wait); 1234 if (pending & PRINTK_PENDING_SCHED) {
1235 char *buf = __get_cpu_var(printk_sched_buf);
1236 printk(KERN_WARNING "[sched_delayed] %s", buf);
1237 }
1238 if (pending & PRINTK_PENDING_WAKEUP)
1239 wake_up_interruptible(&log_wait);
1218 } 1240 }
1219} 1241}
1220 1242
@@ -1228,7 +1250,7 @@ int printk_needs_cpu(int cpu)
1228void wake_up_klogd(void) 1250void wake_up_klogd(void)
1229{ 1251{
1230 if (waitqueue_active(&log_wait)) 1252 if (waitqueue_active(&log_wait))
1231 this_cpu_write(printk_pending, 1); 1253 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1232} 1254}
1233 1255
1234/** 1256/**
@@ -1621,6 +1643,26 @@ late_initcall(printk_late_init);
1621 1643
1622#if defined CONFIG_PRINTK 1644#if defined CONFIG_PRINTK
1623 1645
1646int printk_sched(const char *fmt, ...)
1647{
1648 unsigned long flags;
1649 va_list args;
1650 char *buf;
1651 int r;
1652
1653 local_irq_save(flags);
1654 buf = __get_cpu_var(printk_sched_buf);
1655
1656 va_start(args, fmt);
1657 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
1658 va_end(args);
1659
1660 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
1661 local_irq_restore(flags);
1662
1663 return r;
1664}
1665
1624/* 1666/*
1625 * printk rate limiting, lifted from the networking subsystem. 1667 * printk rate limiting, lifted from the networking subsystem.
1626 * 1668 *
@@ -1732,6 +1774,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1732 unsigned long l1, l2; 1774 unsigned long l1, l2;
1733 unsigned long flags; 1775 unsigned long flags;
1734 1776
1777 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
1778 return;
1779
1735 /* Theoretically, the log could move on after we do this, but 1780 /* Theoretically, the log could move on after we do this, but
1736 there's not a lot we can do about that. The new messages 1781 there's not a lot we can do about that. The new messages
1737 will overwrite the start of what we dump. */ 1782 will overwrite the start of what we dump. */
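
The printk.c hunks add three independent pieces: a console tracepoint, the always_kmsg_dump parameter (without it, kmsg_dump() now returns early for anything milder than an oops), and a deferred-printk path for scheduler code. On that path, printk_sched() formats into the per-CPU printk_sched_buf and sets PRINTK_PENDING_SCHED; printk_tick() then emits the text on the next tick with a KERN_WARNING "[sched_delayed] " prefix. A hedged usage sketch; the caller and message below are illustrative, not part of this patch, and assume the matching printk_sched() declaration from the same series:

        /* In a path that must not wake the console directly (e.g. under rq lock): */
        static void report_rt_throttling(unsigned long long wasted_ns)
        {
                /*
                 * Copied into this CPU's 512-byte printk_sched_buf; only the
                 * most recent message per CPU survives until printk_tick().
                 */
                printk_sched("RT throttling activated (%llu ns)\n", wasted_ns);
        }
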
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed11..ee8d49b9c309 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
231} 231}
232 232
233static int ptrace_attach(struct task_struct *task, long request, 233static int ptrace_attach(struct task_struct *task, long request,
234 unsigned long addr,
234 unsigned long flags) 235 unsigned long flags)
235{ 236{
236 bool seize = (request == PTRACE_SEIZE); 237 bool seize = (request == PTRACE_SEIZE);
237 int retval; 238 int retval;
238 239
239 /*
240 * SEIZE will enable new ptrace behaviors which will be implemented
241 * gradually. SEIZE_DEVEL is used to prevent applications
242 * expecting full SEIZE behaviors trapping on kernel commits which
243 * are still in the process of implementing them.
244 *
245 * Only test programs for new ptrace behaviors being implemented
246 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
247 *
248 * Once SEIZE behaviors are completely implemented, this flag and
249 * the following test will be removed.
250 */
251 retval = -EIO; 240 retval = -EIO;
252 if (seize && !(flags & PTRACE_SEIZE_DEVEL)) 241 if (seize) {
253 goto out; 242 if (addr != 0)
243 goto out;
244 if (flags & ~(unsigned long)PTRACE_O_MASK)
245 goto out;
246 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
247 } else {
248 flags = PT_PTRACED;
249 }
254 250
255 audit_ptrace(task); 251 audit_ptrace(task);
256 252
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 258
263 /* 259 /*
264 * Protect exec's credential calculations against our interference; 260 * Protect exec's credential calculations against our interference;
265 * interference; SUID, SGID and LSM creds get determined differently 261 * SUID, SGID and LSM creds get determined differently
266 * under ptrace. 262 * under ptrace.
267 */ 263 */
268 retval = -ERESTARTNOINTR; 264 retval = -ERESTARTNOINTR;
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 278 if (task->ptrace)
283 goto unlock_tasklist; 279 goto unlock_tasklist;
284 280
285 task->ptrace = PT_PTRACED;
286 if (seize) 281 if (seize)
287 task->ptrace |= PT_SEIZED; 282 flags |= PT_SEIZED;
288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
289 task->ptrace |= PT_PTRACE_CAP; 284 flags |= PT_PTRACE_CAP;
285 task->ptrace = flags;
290 286
291 __ptrace_link(task, current); 287 __ptrace_link(task, current);
292 288
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
528 524
529static int ptrace_setoptions(struct task_struct *child, unsigned long data) 525static int ptrace_setoptions(struct task_struct *child, unsigned long data)
530{ 526{
531 child->ptrace &= ~PT_TRACE_MASK; 527 unsigned flags;
532 528
533 if (data & PTRACE_O_TRACESYSGOOD) 529 if (data & ~(unsigned long)PTRACE_O_MASK)
534 child->ptrace |= PT_TRACESYSGOOD; 530 return -EINVAL;
535
536 if (data & PTRACE_O_TRACEFORK)
537 child->ptrace |= PT_TRACE_FORK;
538
539 if (data & PTRACE_O_TRACEVFORK)
540 child->ptrace |= PT_TRACE_VFORK;
541
542 if (data & PTRACE_O_TRACECLONE)
543 child->ptrace |= PT_TRACE_CLONE;
544
545 if (data & PTRACE_O_TRACEEXEC)
546 child->ptrace |= PT_TRACE_EXEC;
547
548 if (data & PTRACE_O_TRACEVFORKDONE)
549 child->ptrace |= PT_TRACE_VFORK_DONE;
550 531
551 if (data & PTRACE_O_TRACEEXIT) 532 /* Avoid intermediate state when all opts are cleared */
552 child->ptrace |= PT_TRACE_EXIT; 533 flags = child->ptrace;
534 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
535 flags |= (data << PT_OPT_FLAG_SHIFT);
536 child->ptrace = flags;
553 537
554 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 538 return 0;
555} 539}
556 540
557static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 541static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 } 875 }
892 876
893 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
894 ret = ptrace_attach(child, request, data); 878 ret = ptrace_attach(child, request, addr, data);
895 /* 879 /*
896 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
897 * a ptrace attach. 881 * a ptrace attach.
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1034 } 1018 }
1035 1019
1036 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1037 ret = ptrace_attach(child, request, data); 1021 ret = ptrace_attach(child, request, addr, data);
1038 /* 1022 /*
1039 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
1040 * a ptrace attach. 1024 * a ptrace attach.
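
The ptrace changes fold the per-option if chain into a single shift: both ptrace_attach() (for PTRACE_SEIZE) and ptrace_setoptions() validate the user-supplied word against PTRACE_O_MASK and then install it as (data << PT_OPT_FLAG_SHIFT), which works because each PT_* event flag in task->ptrace is the corresponding PTRACE_O_* bit shifted by PT_OPT_FLAG_SHIFT. A hypothetical helper, not part of the patch, just to spell the invariant out:

        /* Assumes the caller has already rejected bits outside PTRACE_O_MASK. */
        static inline unsigned long pt_flags_from_options(unsigned long data)
        {
                /* PT_TRACESYSGOOD == PTRACE_O_TRACESYSGOOD << PT_OPT_FLAG_SHIFT, etc. */
                return PT_PTRACED | (data << PT_OPT_FLAG_SHIFT);
        }

ptrace_setoptions() additionally rebuilds the word in a local variable and assigns it once, so a tracee never observes a transient state with all options cleared.
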
diff --git a/kernel/rcu.h b/kernel/rcu.h
index aa88baab5f78..8ba99cdc6515 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -33,8 +33,27 @@
33 * Process-level increment to ->dynticks_nesting field. This allows for 33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from 34 * architectures that use half-interrupts and half-exceptions from
35 * process context. 35 * process context.
36 *
37 * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
38 * that counts the number of process-based reasons why RCU cannot
39 * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
40 * is the value used to increment or decrement this field.
41 *
42 * The rest of the bits could in principle be used to count interrupts,
43 * but this would mean that a negative-one value in the interrupt
44 * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
45 * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
46 * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
47 * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
48 * initial exit from idle.
36 */ 49 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) 50#define DYNTICK_TASK_NEST_WIDTH 7
51#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
52#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
53#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
54#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
55#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
56 DYNTICK_TASK_FLAG)
38 57
39/* 58/*
40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 59 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
@@ -50,7 +69,6 @@ extern struct debug_obj_descr rcuhead_debug_descr;
50 69
51static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline void debug_rcu_head_queue(struct rcu_head *head)
52{ 71{
53 WARN_ON_ONCE((unsigned long)head & 0x3);
54 debug_object_activate(head, &rcuhead_debug_descr); 72 debug_object_activate(head, &rcuhead_debug_descr);
55 debug_object_active_state(head, &rcuhead_debug_descr, 73 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_READY, 74 STATE_RCU_HEAD_READY,
@@ -76,16 +94,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
76 94
77extern void kfree(const void *); 95extern void kfree(const void *);
78 96
79static inline void __rcu_reclaim(char *rn, struct rcu_head *head) 97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
80{ 98{
81 unsigned long offset = (unsigned long)head->func; 99 unsigned long offset = (unsigned long)head->func;
82 100
83 if (__is_kfree_rcu_offset(offset)) { 101 if (__is_kfree_rcu_offset(offset)) {
84 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 102 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
85 kfree((void *)head - offset); 103 kfree((void *)head - offset);
104 return 1;
86 } else { 105 } else {
87 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 106 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
88 head->func(head); 107 head->func(head);
108 return 0;
89 } 109 }
90} 110}
91 111
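
Worked out for the common case of a 64-bit long long (the values follow directly from the macro definitions above; only the word size is an assumption):

        /*
         * LLONG_MAX == 2^63 - 1 and DYNTICK_TASK_NEST_WIDTH == 7, so:
         *
         *   DYNTICK_TASK_NEST_VALUE == 1LL << 56               (increment unit)
         *   DYNTICK_TASK_NEST_MASK  == bits 62..56             (task-nesting count)
         *   DYNTICK_TASK_FLAG       == 1LL << 54
         *   DYNTICK_TASK_MASK       == bits 54..53             (two-bit guard field)
         *   DYNTICK_TASK_EXIT_IDLE  == (1LL << 56) + (1LL << 54)
         *
         * The process-level nesting count therefore occupies the top seven
         * value bits, the guard field sits two bits below it, and the
         * remaining low-order bits are left for the interrupt nesting count.
         */
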
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2bc4e135ff23..a86f1741cc27 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
88 * section. 88 * section.
89 * 89 *
90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
91 *
92 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
93 * offline from an RCU perspective, so check for those as well.
91 */ 94 */
92int rcu_read_lock_bh_held(void) 95int rcu_read_lock_bh_held(void)
93{ 96{
@@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void)
95 return 1; 98 return 1;
96 if (rcu_is_cpu_idle()) 99 if (rcu_is_cpu_idle())
97 return 0; 100 return 0;
101 if (!rcu_lockdep_current_cpu_online())
102 return 0;
98 return in_softirq() || irqs_disabled(); 103 return in_softirq() || irqs_disabled();
99} 104}
100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 105EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 977296dca0a4..37a5444204d2 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval) 59static void rcu_idle_enter_common(long long oldval)
@@ -88,10 +88,16 @@ void rcu_idle_enter(void)
88 88
89 local_irq_save(flags); 89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting; 90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0; 91 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
92 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
93 DYNTICK_TASK_NEST_VALUE)
94 rcu_dynticks_nesting = 0;
95 else
96 rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
92 rcu_idle_enter_common(oldval); 97 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags); 98 local_irq_restore(flags);
94} 99}
100EXPORT_SYMBOL_GPL(rcu_idle_enter);
95 101
96/* 102/*
97 * Exit an interrupt handler towards idle. 103 * Exit an interrupt handler towards idle.
@@ -140,11 +146,15 @@ void rcu_idle_exit(void)
140 146
141 local_irq_save(flags); 147 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting; 148 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0); 149 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 150 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
151 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
152 else
153 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
145 rcu_idle_exit_common(oldval); 154 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags); 155 local_irq_restore(flags);
147} 156}
157EXPORT_SYMBOL_GPL(rcu_idle_exit);
148 158
149/* 159/*
150 * Enter an interrupt handler, moving away from idle. 160 * Enter an interrupt handler, moving away from idle.
@@ -258,7 +268,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
258 268
259 /* If no RCU callbacks ready to invoke, just return. */ 269 /* If no RCU callbacks ready to invoke, just return. */
260 if (&rcp->rcucblist == rcp->donetail) { 270 if (&rcp->rcucblist == rcp->donetail) {
261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 271 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 272 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist), 273 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(), 274 need_resched(),
@@ -269,7 +279,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
269 279
270 /* Move the ready-to-invoke callbacks to a local list. */ 280 /* Move the ready-to-invoke callbacks to a local list. */
271 local_irq_save(flags); 281 local_irq_save(flags);
272 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 282 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
273 list = rcp->rcucblist; 283 list = rcp->rcucblist;
274 rcp->rcucblist = *rcp->donetail; 284 rcp->rcucblist = *rcp->donetail;
275 *rcp->donetail = NULL; 285 *rcp->donetail = NULL;
@@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
319 */ 329 */
320void synchronize_sched(void) 330void synchronize_sched(void)
321{ 331{
332 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
333 !lock_is_held(&rcu_lock_map) &&
334 !lock_is_held(&rcu_sched_lock_map),
335 "Illegal synchronize_sched() in RCU read-side critical section");
322 cond_resched(); 336 cond_resched();
323} 337}
324EXPORT_SYMBOL_GPL(synchronize_sched); 338EXPORT_SYMBOL_GPL(synchronize_sched);
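
The rcutiny idle-entry/exit hunks use those fields to allow nested process-level "non-idle" reasons instead of insisting on a bare 0 <-> DYNTICK_TASK_NESTING transition. An illustrative sequence, again assuming a 64-bit long long so that DYNTICK_TASK_NEST_VALUE is 1LL << 56:

        /*
         *   boot/default            rcu_dynticks_nesting == DYNTICK_TASK_EXIT_IDLE
         *   rcu_idle_exit()   ->    += DYNTICK_TASK_NEST_VALUE   (second reason)
         *   rcu_idle_enter()  ->    -= DYNTICK_TASK_NEST_VALUE   (back to one reason)
         *   rcu_idle_enter()  ->    == 0                         (really idle to RCU)
         *
         * The WARN_ON_ONCE() in rcu_idle_enter() fires if there is no
         * process-level nesting left to remove; the one in rcu_idle_exit()
         * fires if the counter is negative on exit.
         */
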
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 9cb1ae4aabdd..22ecea0dfb62 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
135static int rcu_preempted_readers_exp(void); 136static int rcu_preempted_readers_exp(void);
136static void rcu_report_exp_done(void); 137static void rcu_report_exp_done(void);
137 138
@@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void)
146/* 147/*
147 * Check for a running RCU reader. Because there is only one CPU, 148 * Check for a running RCU reader. Because there is only one CPU,
148 * there can be but one running RCU reader at a time. ;-) 149 * there can be but one running RCU reader at a time. ;-)
150 *
151 * Returns zero if there are no running readers. Returns a positive
152 * number if there is at least one reader within its RCU read-side
153 * critical section. Returns a negative number if an outermost reader
 154 * is in the midst of exiting from its RCU read-side critical section.
 154 * is in the midst of exiting from its RCU read-side critical section.
149 */ 160 */
150static int rcu_preempt_running_reader(void) 161static int rcu_preempt_running_reader(void)
151{ 162{
@@ -307,7 +318,6 @@ static int rcu_boost(void)
307 t = container_of(tb, struct task_struct, rcu_node_entry); 318 t = container_of(tb, struct task_struct, rcu_node_entry);
308 rt_mutex_init_proxy_locked(&mtx, t); 319 rt_mutex_init_proxy_locked(&mtx, t);
309 t->rcu_boost_mutex = &mtx; 320 t->rcu_boost_mutex = &mtx;
310 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
311 raw_local_irq_restore(flags); 321 raw_local_irq_restore(flags);
312 rt_mutex_lock(&mtx); 322 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 323 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -475,7 +485,7 @@ void rcu_preempt_note_context_switch(void)
475 unsigned long flags; 485 unsigned long flags;
476 486
477 local_irq_save(flags); /* must exclude scheduler_tick(). */ 487 local_irq_save(flags); /* must exclude scheduler_tick(). */
478 if (rcu_preempt_running_reader() && 488 if (rcu_preempt_running_reader() > 0 &&
479 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 489 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
480 490
481 /* Possibly blocking in an RCU read-side critical section. */ 491 /* Possibly blocking in an RCU read-side critical section. */
@@ -494,6 +504,13 @@ void rcu_preempt_note_context_switch(void)
494 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); 504 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
495 if (rcu_cpu_blocking_cur_gp()) 505 if (rcu_cpu_blocking_cur_gp())
496 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; 506 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
507 } else if (rcu_preempt_running_reader() < 0 &&
508 t->rcu_read_unlock_special) {
509 /*
510 * Complete exit from RCU read-side critical section on
511 * behalf of preempted instance of __rcu_read_unlock().
512 */
513 rcu_read_unlock_special(t);
497 } 514 }
498 515
499 /* 516 /*
@@ -526,12 +543,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
526 * notify RCU core processing or task having blocked during the RCU 543 * notify RCU core processing or task having blocked during the RCU
527 * read-side critical section. 544 * read-side critical section.
528 */ 545 */
529static void rcu_read_unlock_special(struct task_struct *t) 546static noinline void rcu_read_unlock_special(struct task_struct *t)
530{ 547{
531 int empty; 548 int empty;
532 int empty_exp; 549 int empty_exp;
533 unsigned long flags; 550 unsigned long flags;
534 struct list_head *np; 551 struct list_head *np;
552#ifdef CONFIG_RCU_BOOST
553 struct rt_mutex *rbmp = NULL;
554#endif /* #ifdef CONFIG_RCU_BOOST */
535 int special; 555 int special;
536 556
537 /* 557 /*
@@ -552,7 +572,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
552 rcu_preempt_cpu_qs(); 572 rcu_preempt_cpu_qs();
553 573
554 /* Hardware IRQ handlers cannot block. */ 574 /* Hardware IRQ handlers cannot block. */
555 if (in_irq()) { 575 if (in_irq() || in_serving_softirq()) {
556 local_irq_restore(flags); 576 local_irq_restore(flags);
557 return; 577 return;
558 } 578 }
@@ -597,10 +617,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
597 } 617 }
598#ifdef CONFIG_RCU_BOOST 618#ifdef CONFIG_RCU_BOOST
599 /* Unboost self if was boosted. */ 619 /* Unboost self if was boosted. */
600 if (special & RCU_READ_UNLOCK_BOOSTED) { 620 if (t->rcu_boost_mutex != NULL) {
601 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; 621 rbmp = t->rcu_boost_mutex;
602 rt_mutex_unlock(t->rcu_boost_mutex);
603 t->rcu_boost_mutex = NULL; 622 t->rcu_boost_mutex = NULL;
623 rt_mutex_unlock(rbmp);
604 } 624 }
605#endif /* #ifdef CONFIG_RCU_BOOST */ 625#endif /* #ifdef CONFIG_RCU_BOOST */
606 local_irq_restore(flags); 626 local_irq_restore(flags);
@@ -618,13 +638,22 @@ void __rcu_read_unlock(void)
618 struct task_struct *t = current; 638 struct task_struct *t = current;
619 639
620 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ 640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
621 --t->rcu_read_lock_nesting; 641 if (t->rcu_read_lock_nesting != 1)
622 barrier(); /* decrement before load of ->rcu_read_unlock_special */ 642 --t->rcu_read_lock_nesting;
623 if (t->rcu_read_lock_nesting == 0 && 643 else {
624 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 644 t->rcu_read_lock_nesting = INT_MIN;
625 rcu_read_unlock_special(t); 645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
626#ifdef CONFIG_PROVE_LOCKING 651#ifdef CONFIG_PROVE_LOCKING
627 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); 652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
628#endif /* #ifdef CONFIG_PROVE_LOCKING */ 657#endif /* #ifdef CONFIG_PROVE_LOCKING */
629} 658}
630EXPORT_SYMBOL_GPL(__rcu_read_unlock); 659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -649,7 +678,7 @@ static void rcu_preempt_check_callbacks(void)
649 invoke_rcu_callbacks(); 678 invoke_rcu_callbacks();
650 if (rcu_preempt_gp_in_progress() && 679 if (rcu_preempt_gp_in_progress() &&
651 rcu_cpu_blocking_cur_gp() && 680 rcu_cpu_blocking_cur_gp() &&
652 rcu_preempt_running_reader()) 681 rcu_preempt_running_reader() > 0)
653 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 682 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
654} 683}
655 684
@@ -706,6 +735,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
706 */ 735 */
707void synchronize_rcu(void) 736void synchronize_rcu(void)
708{ 737{
738 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
739 !lock_is_held(&rcu_lock_map) &&
740 !lock_is_held(&rcu_sched_lock_map),
741 "Illegal synchronize_rcu() in RCU read-side critical section");
742
709#ifdef CONFIG_DEBUG_LOCK_ALLOC 743#ifdef CONFIG_DEBUG_LOCK_ALLOC
710 if (!rcu_scheduler_active) 744 if (!rcu_scheduler_active)
711 return; 745 return;
@@ -882,7 +916,8 @@ static void rcu_preempt_process_callbacks(void)
882static void invoke_rcu_callbacks(void) 916static void invoke_rcu_callbacks(void)
883{ 917{
884 have_rcu_kthread_work = 1; 918 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq); 919 if (rcu_kthread_task != NULL)
920 wake_up(&rcu_kthread_wq);
886} 921}
887 922
888#ifdef CONFIG_RCU_TRACE 923#ifdef CONFIG_RCU_TRACE
@@ -943,12 +978,16 @@ early_initcall(rcu_spawn_kthreads);
943 978
944#else /* #ifdef CONFIG_RCU_BOOST */ 979#else /* #ifdef CONFIG_RCU_BOOST */
945 980
981/* Hold off callback invocation until early_initcall() time. */
982static int rcu_scheduler_fully_active __read_mostly;
983
946/* 984/*
947 * Start up softirq processing of callbacks. 985 * Start up softirq processing of callbacks.
948 */ 986 */
949void invoke_rcu_callbacks(void) 987void invoke_rcu_callbacks(void)
950{ 988{
951 raise_softirq(RCU_SOFTIRQ); 989 if (rcu_scheduler_fully_active)
990 raise_softirq(RCU_SOFTIRQ);
952} 991}
953 992
954#ifdef CONFIG_RCU_TRACE 993#ifdef CONFIG_RCU_TRACE
@@ -963,10 +1002,14 @@ static bool rcu_is_callbacks_kthread(void)
963 1002
964#endif /* #ifdef CONFIG_RCU_TRACE */ 1003#endif /* #ifdef CONFIG_RCU_TRACE */
965 1004
966void rcu_init(void) 1005static int __init rcu_scheduler_really_started(void)
967{ 1006{
1007 rcu_scheduler_fully_active = 1;
968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1008 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1009 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1010 return 0;
969} 1011}
1012early_initcall(rcu_scheduler_really_started);
970 1013
971#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1014#endif /* #else #ifdef CONFIG_RCU_BOOST */
972 1015
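
The __rcu_read_unlock() change parks ->rcu_read_lock_nesting at INT_MIN while the outermost unlock handles ->rcu_read_unlock_special, so a handler that interrupts this window and does its own rcu_read_lock()/rcu_read_unlock() pair nests harmlessly (INT_MIN + 1 back to INT_MIN) instead of re-entering the special-case path; rcu_preempt_running_reader() reports that window as a negative value, and the deferred special-case work is picked up at the next context switch. Condensed from the right-hand column, the new outermost path reads:

        if (t->rcu_read_lock_nesting != 1) {
                --t->rcu_read_lock_nesting;            /* not the outermost unlock */
        } else {
                t->rcu_read_lock_nesting = INT_MIN;    /* park: negative => unlocking */
                barrier();
                if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
                        rcu_read_unlock_special(t);
                barrier();
                t->rcu_read_lock_nesting = 0;          /* fully outside the reader */
        }

The relaxed WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2) under CONFIG_PROVE_LOCKING still catches ordinary underflow while tolerating the parked near-INT_MIN values.
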
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a58ac285fc69..a89b381a8c6e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -65,7 +65,10 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
70static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
71static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 72static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 73static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 74static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -95,8 +98,14 @@ module_param(fqs_stutter, int, 0444);
95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444); 99module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444);
102MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
98module_param(shutdown_secs, int, 0444); 103module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 104MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
105module_param(stall_cpu, int, 0444);
106MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
107module_param(stall_cpu_holdoff, int, 0444);
108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
100module_param(test_boost, int, 0444); 109module_param(test_boost, int, 0444);
101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 110MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
102module_param(test_boost_interval, int, 0444); 111module_param(test_boost_interval, int, 0444);
@@ -129,6 +138,7 @@ static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU 138#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task; 139static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 140#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task;
132 142
133#define RCU_TORTURE_PIPE_LEN 10 143#define RCU_TORTURE_PIPE_LEN 10
134 144
@@ -990,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused)
990 rcu_read_lock_bh_held() || 1000 rcu_read_lock_bh_held() ||
991 rcu_read_lock_sched_held() || 1001 rcu_read_lock_sched_held() ||
992 srcu_read_lock_held(&srcu_ctl)); 1002 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
994 if (p == NULL) { 1003 if (p == NULL) {
995 /* Leave because rcu_torture_writer is not yet underway */ 1004 /* Leave because rcu_torture_writer is not yet underway */
996 cur_ops->readunlock(idx); 1005 cur_ops->readunlock(idx);
997 return; 1006 return;
998 } 1007 }
1008 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
999 if (p->rtort_mbtest == 0) 1009 if (p->rtort_mbtest == 0)
1000 atomic_inc(&n_rcu_torture_mberror); 1010 atomic_inc(&n_rcu_torture_mberror);
1001 spin_lock(&rand_lock); 1011 spin_lock(&rand_lock);
@@ -1053,13 +1063,13 @@ rcu_torture_reader(void *arg)
1053 rcu_read_lock_bh_held() || 1063 rcu_read_lock_bh_held() ||
1054 rcu_read_lock_sched_held() || 1064 rcu_read_lock_sched_held() ||
1055 srcu_read_lock_held(&srcu_ctl)); 1065 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1057 if (p == NULL) { 1066 if (p == NULL) {
1058 /* Wait for rcu_torture_writer to get underway */ 1067 /* Wait for rcu_torture_writer to get underway */
1059 cur_ops->readunlock(idx); 1068 cur_ops->readunlock(idx);
1060 schedule_timeout_interruptible(HZ); 1069 schedule_timeout_interruptible(HZ);
1061 continue; 1070 continue;
1062 } 1071 }
1072 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1063 if (p->rtort_mbtest == 0) 1073 if (p->rtort_mbtest == 0)
1064 atomic_inc(&n_rcu_torture_mberror); 1074 atomic_inc(&n_rcu_torture_mberror);
1065 cur_ops->read_delay(&rand); 1075 cur_ops->read_delay(&rand);
@@ -1300,13 +1310,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1310 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1301 "test_boost=%d/%d test_boost_interval=%d " 1311 "test_boost=%d/%d test_boost_interval=%d "
1302 "test_boost_duration=%d shutdown_secs=%d " 1312 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n", 1313 "onoff_interval=%d onoff_holdoff=%d\n",
1304 torture_type, tag, nrealreaders, nfakewriters, 1314 torture_type, tag, nrealreaders, nfakewriters,
1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1315 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1316 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1307 test_boost, cur_ops->can_boost, 1317 test_boost, cur_ops->can_boost,
1308 test_boost_interval, test_boost_duration, shutdown_secs, 1318 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval); 1319 onoff_interval, onoff_holdoff);
1310} 1320}
1311 1321
1312static struct notifier_block rcutorture_shutdown_nb = { 1322static struct notifier_block rcutorture_shutdown_nb = {
@@ -1410,6 +1420,11 @@ rcu_torture_onoff(void *arg)
1410 for_each_online_cpu(cpu) 1420 for_each_online_cpu(cpu)
1411 maxcpu = cpu; 1421 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0); 1422 WARN_ON(maxcpu < 0);
1423 if (onoff_holdoff > 0) {
1424 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1425 schedule_timeout_interruptible(onoff_holdoff * HZ);
1426 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1427 }
1413 while (!kthread_should_stop()) { 1428 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); 1429 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 1430 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
@@ -1450,12 +1465,15 @@ rcu_torture_onoff(void *arg)
1450static int __cpuinit 1465static int __cpuinit
1451rcu_torture_onoff_init(void) 1466rcu_torture_onoff_init(void)
1452{ 1467{
1468 int ret;
1469
1453 if (onoff_interval <= 0) 1470 if (onoff_interval <= 0)
1454 return 0; 1471 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); 1472 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) { 1473 if (IS_ERR(onoff_task)) {
1474 ret = PTR_ERR(onoff_task);
1457 onoff_task = NULL; 1475 onoff_task = NULL;
1458 return PTR_ERR(onoff_task); 1476 return ret;
1459 } 1477 }
1460 return 0; 1478 return 0;
1461} 1479}
@@ -1481,6 +1499,63 @@ static void rcu_torture_onoff_cleanup(void)
1481 1499
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ 1500#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483 1501
1502/*
1503 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1504 * induces a CPU stall for the time specified by stall_cpu.
1505 */
1506static int __cpuinit rcu_torture_stall(void *args)
1507{
1508 unsigned long stop_at;
1509
1510 VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
1511 if (stall_cpu_holdoff > 0) {
1512 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
1513 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1514 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
1515 }
1516 if (!kthread_should_stop()) {
1517 stop_at = get_seconds() + stall_cpu;
1518 /* RCU CPU stall is expected behavior in following code. */
1519 printk(KERN_ALERT "rcu_torture_stall start.\n");
1520 rcu_read_lock();
1521 preempt_disable();
1522 while (ULONG_CMP_LT(get_seconds(), stop_at))
1523 continue; /* Induce RCU CPU stall warning. */
1524 preempt_enable();
1525 rcu_read_unlock();
1526 printk(KERN_ALERT "rcu_torture_stall end.\n");
1527 }
1528 rcutorture_shutdown_absorb("rcu_torture_stall");
1529 while (!kthread_should_stop())
1530 schedule_timeout_interruptible(10 * HZ);
1531 return 0;
1532}
1533
1534/* Spawn CPU-stall kthread, if stall_cpu specified. */
1535static int __init rcu_torture_stall_init(void)
1536{
1537 int ret;
1538
1539 if (stall_cpu <= 0)
1540 return 0;
1541 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
1542 if (IS_ERR(stall_task)) {
1543 ret = PTR_ERR(stall_task);
1544 stall_task = NULL;
1545 return ret;
1546 }
1547 return 0;
1548}
1549
1550/* Clean up after the CPU-stall kthread, if one was spawned. */
1551static void rcu_torture_stall_cleanup(void)
1552{
1553 if (stall_task == NULL)
1554 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task);
1557}
1558
1484static int rcutorture_cpu_notify(struct notifier_block *self, 1559static int rcutorture_cpu_notify(struct notifier_block *self,
1485 unsigned long action, void *hcpu) 1560 unsigned long action, void *hcpu)
1486{ 1561{
@@ -1523,6 +1598,7 @@ rcu_torture_cleanup(void)
1523 fullstop = FULLSTOP_RMMOD; 1598 fullstop = FULLSTOP_RMMOD;
1524 mutex_unlock(&fullstop_mutex); 1599 mutex_unlock(&fullstop_mutex);
1525 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1600 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1601 rcu_torture_stall_cleanup();
1526 if (stutter_task) { 1602 if (stutter_task) {
1527 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1528 kthread_stop(stutter_task); 1604 kthread_stop(stutter_task);
@@ -1602,6 +1678,10 @@ rcu_torture_cleanup(void)
1602 cur_ops->cleanup(); 1678 cur_ops->cleanup();
1603 if (atomic_read(&n_rcu_torture_error)) 1679 if (atomic_read(&n_rcu_torture_error))
1604 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts)
1683 rcu_torture_print_module_parms(cur_ops,
1684 "End of test: RCU_HOTPLUG");
1605 else 1685 else
1606 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1686 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1607} 1687}
@@ -1819,6 +1899,7 @@ rcu_torture_init(void)
1819 } 1899 }
1820 rcu_torture_onoff_init(); 1900 rcu_torture_onoff_init();
1821 register_reboot_notifier(&rcutorture_shutdown_nb); 1901 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init();
1822 rcutorture_record_test_transition(); 1903 rcutorture_record_test_transition();
1823 mutex_unlock(&fullstop_mutex); 1904 mutex_unlock(&fullstop_mutex);
1824 return 0; 1905 return 0;
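
All of the new torture knobs are ordinary module parameters, so a stall test that waits out a holdoff after load and then spins inside an RCU read-side critical section can be combined freely with the hotplug options; one possible invocation (the values here are arbitrary, not defaults):

        modprobe rcutorture stall_cpu=30 stall_cpu_holdoff=60 onoff_interval=3 onoff_holdoff=30

With stall_cpu and onoff_interval left at 0, the new code paths stay disabled and the previous behaviour is unchanged.
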
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6c4a6722abfd..1050d6d3922c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -50,6 +50,8 @@
50#include <linux/wait.h> 50#include <linux/wait.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h>
54#include <linux/stop_machine.h>
53 55
54#include "rcutree.h" 56#include "rcutree.h"
55#include <trace/events/rcu.h> 57#include <trace/events/rcu.h>
@@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu)
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 198EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 199
198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 200DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
199 .dynticks_nesting = DYNTICK_TASK_NESTING, 201 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
200 .dynticks = ATOMIC_INIT(1), 202 .dynticks = ATOMIC_INIT(1),
201}; 203};
202 204
@@ -208,8 +210,11 @@ module_param(blimit, int, 0);
208module_param(qhimark, int, 0); 210module_param(qhimark, int, 0);
209module_param(qlowmark, int, 0); 211module_param(qlowmark, int, 0);
210 212
211int rcu_cpu_stall_suppress __read_mostly; 213int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
214int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
215
212module_param(rcu_cpu_stall_suppress, int, 0644); 216module_param(rcu_cpu_stall_suppress, int, 0644);
217module_param(rcu_cpu_stall_timeout, int, 0644);
213 218
214static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 219static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
215static int rcu_pending(int cpu); 220static int rcu_pending(int cpu);
@@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
301 return &rsp->node[0]; 306 return &rsp->node[0];
302} 307}
303 308
304#ifdef CONFIG_SMP
305
306/* 309/*
307 * If the specified CPU is offline, tell the caller that it is in 310 * If the specified CPU is offline, tell the caller that it is in
308 * a quiescent state. Otherwise, whack it with a reschedule IPI. 311 * a quiescent state. Otherwise, whack it with a reschedule IPI.
@@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
317static int rcu_implicit_offline_qs(struct rcu_data *rdp) 320static int rcu_implicit_offline_qs(struct rcu_data *rdp)
318{ 321{
319 /* 322 /*
320 * If the CPU is offline, it is in a quiescent state. We can 323 * If the CPU is offline for more than a jiffy, it is in a quiescent
321 * trust its state not to change because interrupts are disabled. 324 * state. We can trust its state not to change because interrupts
325 * are disabled. The reason for the jiffy's worth of slack is to
326 * handle CPUs initializing on the way up and finding their way
327 * to the idle loop on the way down.
322 */ 328 */
323 if (cpu_is_offline(rdp->cpu)) { 329 if (cpu_is_offline(rdp->cpu) &&
330 ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 331 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
325 rdp->offline_fqs++; 332 rdp->offline_fqs++;
326 return 1; 333 return 1;
327 } 334 }
328
329 /*
330 * The CPU is online, so send it a reschedule IPI. This forces
331 * it through the scheduler, and (inefficiently) also handles cases
332 * where idle loops fail to inform RCU about the CPU being idle.
333 */
334 if (rdp->cpu != smp_processor_id())
335 smp_send_reschedule(rdp->cpu);
336 else
337 set_need_resched();
338 rdp->resched_ipi++;
339 return 0; 335 return 0;
340} 336}
341 337
342#endif /* #ifdef CONFIG_SMP */
343
344/* 338/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle 339 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 * 340 *
@@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
366 atomic_inc(&rdtp->dynticks); 360 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 361 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 362 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
363
364 /*
365 * The idle task is not permitted to enter the idle loop while
366 * in an RCU read-side critical section.
367 */
368 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
369 "Illegal idle entry in RCU read-side critical section.");
370 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
371 "Illegal idle entry in RCU-bh read-side critical section.");
372 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
373 "Illegal idle entry in RCU-sched read-side critical section.");
369} 374}
370 375
371/** 376/**
@@ -389,10 +394,15 @@ void rcu_idle_enter(void)
389 local_irq_save(flags); 394 local_irq_save(flags);
390 rdtp = &__get_cpu_var(rcu_dynticks); 395 rdtp = &__get_cpu_var(rcu_dynticks);
391 oldval = rdtp->dynticks_nesting; 396 oldval = rdtp->dynticks_nesting;
392 rdtp->dynticks_nesting = 0; 397 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
398 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
399 rdtp->dynticks_nesting = 0;
400 else
401 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
393 rcu_idle_enter_common(rdtp, oldval); 402 rcu_idle_enter_common(rdtp, oldval);
394 local_irq_restore(flags); 403 local_irq_restore(flags);
395} 404}
405EXPORT_SYMBOL_GPL(rcu_idle_enter);
396 406
397/** 407/**
398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 408 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
462 * Exit idle mode, in other words, -enter- the mode in which RCU 472 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur. 473 * read-side critical sections can occur.
464 * 474 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to 475 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
466 * allow for the possibility of usermode upcalls messing up our count 476 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just 477 * of interrupt nesting level during the busy period that is just
468 * now starting. 478 * now starting.
@@ -476,11 +486,15 @@ void rcu_idle_exit(void)
476 local_irq_save(flags); 486 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks); 487 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting; 488 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0); 489 WARN_ON_ONCE(oldval < 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; 490 if (oldval & DYNTICK_TASK_NEST_MASK)
491 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
492 else
493 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
481 rcu_idle_exit_common(rdtp, oldval); 494 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags); 495 local_irq_restore(flags);
483} 496}
497EXPORT_SYMBOL_GPL(rcu_idle_exit);
484 498
485/** 499/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 500 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void)
581} 595}
582EXPORT_SYMBOL(rcu_is_cpu_idle); 596EXPORT_SYMBOL(rcu_is_cpu_idle);
583 597
598#ifdef CONFIG_HOTPLUG_CPU
599
600/*
601 * Is the current CPU online? Disable preemption to avoid false positives
602 * that could otherwise happen due to the current CPU number being sampled,
603 * this task being preempted, its old CPU being taken offline, resuming
604 * on some other CPU, then determining that its old CPU is now offline.
605 * It is OK to use RCU on an offline processor during initial boot, hence
606 * the check for rcu_scheduler_fully_active. Note also that it is OK
607 * for a CPU coming online to use RCU for one jiffy prior to marking itself
608 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
609 * offline to continue to use RCU for one jiffy after marking itself
610 * offline in the cpu_online_mask. This leniency is necessary given the
611 * non-atomic nature of the online and offline processing, for example,
612 * the fact that a CPU enters the scheduler after completing the CPU_DYING
613 * notifiers.
614 *
615 * This is also why RCU internally marks CPUs online during the
616 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
617 *
618 * Disable checking if in an NMI handler because we cannot safely report
619 * errors from NMI handlers anyway.
620 */
621bool rcu_lockdep_current_cpu_online(void)
622{
623 struct rcu_data *rdp;
624 struct rcu_node *rnp;
625 bool ret;
626
627 if (in_nmi())
628 return 1;
629 preempt_disable();
630 rdp = &__get_cpu_var(rcu_sched_data);
631 rnp = rdp->mynode;
632 ret = (rdp->grpmask & rnp->qsmaskinit) ||
633 !rcu_scheduler_fully_active;
634 preempt_enable();
635 return ret;
636}
637EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
638
639#endif /* #ifdef CONFIG_HOTPLUG_CPU */
640
584#endif /* #ifdef CONFIG_PROVE_RCU */ 641#endif /* #ifdef CONFIG_PROVE_RCU */
585 642
586/** 643/**
@@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void)
595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 652 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
596} 653}
597 654
598#ifdef CONFIG_SMP
599
600/* 655/*
601 * Snapshot the specified CPU's dynticks counter so that we can later 656 * Snapshot the specified CPU's dynticks counter so that we can later
602 * credit them with an implicit quiescent state. Return 1 if this CPU 657 * credit them with an implicit quiescent state. Return 1 if this CPU
@@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
640 return rcu_implicit_offline_qs(rdp); 695 return rcu_implicit_offline_qs(rdp);
641} 696}
642 697
643#endif /* #ifdef CONFIG_SMP */ 698static int jiffies_till_stall_check(void)
699{
700 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
701
702 /*
703 * Limit check must be consistent with the Kconfig limits
704 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
705 */
706 if (till_stall_check < 3) {
707 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
708 till_stall_check = 3;
709 } else if (till_stall_check > 300) {
710 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
711 till_stall_check = 300;
712 }
713 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
714}
644 715
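
As a quick check of jiffies_till_stall_check() just above (RCU_STALL_DELAY_DELTA is defined elsewhere in the RCU headers):

        /*
         *   rcu_cpu_stall_timeout = 21    ->  21 * HZ + RCU_STALL_DELAY_DELTA jiffies
         *   rcu_cpu_stall_timeout = 1     ->  clamped to 3, and 3 written back
         *   rcu_cpu_stall_timeout = 1000  ->  clamped to 300, and 300 written back
         *
         * The write-back goes through the same variable that the new
         * rcu_cpu_stall_timeout module parameter exposes, so an out-of-range
         * setting becomes visible as its clamped value.
         */
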
645static void record_gp_stall_check_time(struct rcu_state *rsp) 716static void record_gp_stall_check_time(struct rcu_state *rsp)
646{ 717{
647 rsp->gp_start = jiffies; 718 rsp->gp_start = jiffies;
648 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; 719 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
649} 720}
650 721
651static void print_other_cpu_stall(struct rcu_state *rsp) 722static void print_other_cpu_stall(struct rcu_state *rsp)
@@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
664 raw_spin_unlock_irqrestore(&rnp->lock, flags); 735 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 736 return;
666 } 737 }
667 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 738 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
668
669 /*
670 * Now rat on any tasks that got kicked up to the root rcu_node
671 * due to CPU offlining.
672 */
673 ndetected = rcu_print_task_stall(rnp);
674 raw_spin_unlock_irqrestore(&rnp->lock, flags); 739 raw_spin_unlock_irqrestore(&rnp->lock, flags);
675 740
676 /* 741 /*
@@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
678 * See Documentation/RCU/stallwarn.txt for info on how to debug 743 * See Documentation/RCU/stallwarn.txt for info on how to debug
679 * RCU CPU stall warnings. 744 * RCU CPU stall warnings.
680 */ 745 */
681 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 746 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
682 rsp->name); 747 rsp->name);
748 print_cpu_stall_info_begin();
683 rcu_for_each_leaf_node(rsp, rnp) { 749 rcu_for_each_leaf_node(rsp, rnp) {
684 raw_spin_lock_irqsave(&rnp->lock, flags); 750 raw_spin_lock_irqsave(&rnp->lock, flags);
685 ndetected += rcu_print_task_stall(rnp); 751 ndetected += rcu_print_task_stall(rnp);
@@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
688 continue; 754 continue;
689 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 755 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
690 if (rnp->qsmask & (1UL << cpu)) { 756 if (rnp->qsmask & (1UL << cpu)) {
691 printk(" %d", rnp->grplo + cpu); 757 print_cpu_stall_info(rsp, rnp->grplo + cpu);
692 ndetected++; 758 ndetected++;
693 } 759 }
694 } 760 }
695 printk("} (detected by %d, t=%ld jiffies)\n", 761
762 /*
763 * Now rat on any tasks that got kicked up to the root rcu_node
764 * due to CPU offlining.
765 */
766 rnp = rcu_get_root(rsp);
767 raw_spin_lock_irqsave(&rnp->lock, flags);
768 ndetected = rcu_print_task_stall(rnp);
769 raw_spin_unlock_irqrestore(&rnp->lock, flags);
770
771 print_cpu_stall_info_end();
772 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",
696 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 773 smp_processor_id(), (long)(jiffies - rsp->gp_start));
697 if (ndetected == 0) 774 if (ndetected == 0)
698 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 775 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
@@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp)
716 * See Documentation/RCU/stallwarn.txt for info on how to debug 793 * See Documentation/RCU/stallwarn.txt for info on how to debug
717 * RCU CPU stall warnings. 794 * RCU CPU stall warnings.
718 */ 795 */
719 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 796 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
720 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 797 print_cpu_stall_info_begin();
798 print_cpu_stall_info(rsp, smp_processor_id());
799 print_cpu_stall_info_end();
800 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);
721 if (!trigger_all_cpu_backtrace()) 801 if (!trigger_all_cpu_backtrace())
722 dump_stack(); 802 dump_stack();
723 803
724 raw_spin_lock_irqsave(&rnp->lock, flags); 804 raw_spin_lock_irqsave(&rnp->lock, flags);
725 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 805 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
726 rsp->jiffies_stall = 806 rsp->jiffies_stall = jiffies +
727 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 807 3 * jiffies_till_stall_check() + 3;
728 raw_spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
729 809
730 set_need_resched(); /* kick ourselves to get things going. */ 810 set_need_resched(); /* kick ourselves to get things going. */
@@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
807 rdp->passed_quiesce = 0; 887 rdp->passed_quiesce = 0;
808 } else 888 } else
809 rdp->qs_pending = 0; 889 rdp->qs_pending = 0;
890 zero_cpu_stall_ticks(rdp);
810 } 891 }
811} 892}
812 893
@@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
943 * in preparation for detecting the next grace period. The caller must hold 1024 * in preparation for detecting the next grace period. The caller must hold
944 * the root node's ->lock, which is released before return. Hard irqs must 1025 * the root node's ->lock, which is released before return. Hard irqs must
945 * be disabled. 1026 * be disabled.
1027 *
1028 * Note that it is legal for a dying CPU (which is marked as offline) to
1029 * invoke this function. This can happen when the dying CPU reports its
1030 * quiescent state.
946 */ 1031 */
947static void 1032static void
948rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1033rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
@@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 1065 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1066 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
982 record_gp_stall_check_time(rsp); 1067 record_gp_stall_check_time(rsp);
983
984 /* Special-case the common single-level case. */
985 if (NUM_RCU_NODES == 1) {
986 rcu_preempt_check_blocked_tasks(rnp);
987 rnp->qsmask = rnp->qsmaskinit;
988 rnp->gpnum = rsp->gpnum;
989 rnp->completed = rsp->completed;
990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
992 rcu_preempt_boost_start_gp(rnp);
993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
994 rnp->level, rnp->grplo,
995 rnp->grphi, rnp->qsmask);
996 raw_spin_unlock_irqrestore(&rnp->lock, flags);
997 return;
998 }
999
1000 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1068 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
1001 1069
1002
1003 /* Exclude any concurrent CPU-hotplug operations. */ 1070 /* Exclude any concurrent CPU-hotplug operations. */
1004 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1071 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1005 1072
@@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1245 1312
1246/* 1313/*
1247 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1314 * Move a dying CPU's RCU callbacks to online CPU's callback list.
1248 * Synchronization is not required because this function executes 1315 * Also record a quiescent state for this CPU for the current grace period.
1249 * in stop_machine() context. 1316 * Synchronization and interrupt disabling are not required because
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1250 */ 1324 */
1251static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1252{ 1326{
1253 int i; 1327 int i;
1254 /* current DYING CPU is cleared in the cpu_online_mask */ 1328 unsigned long mask;
1255 int receive_cpu = cpumask_any(cpu_online_mask); 1329 int receive_cpu = cpumask_any(cpu_online_mask);
1256 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1257 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); 1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333
1334 /* First, adjust the counts. */
1335 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen;
1338 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0;
1340 }
1258 1341
1259 if (rdp->nxtlist == NULL) 1342 /*
1260 return; /* irqs disabled, so comparison is stable. */ 1343 * Next, move ready-to-invoke callbacks to be invoked on some
1344 * other CPU. These will not be required to pass through another
1345 * grace period: They are done, regardless of CPU.
1346 */
1347 if (rdp->nxtlist != NULL &&
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
1349 struct rcu_head *oldhead;
1350 struct rcu_head **oldtail;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 }
1261 1366
1262 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1367 /*
1263 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1368 * Finally, put the rest of the callbacks at the end of the list.
1264 receive_rdp->qlen += rdp->qlen; 1369 * The ones that made it partway through get to start over: We
1265 receive_rdp->n_cbs_adopted += rdp->qlen; 1370 * cannot assume that grace periods are synchronized across CPUs.
1266 rdp->n_cbs_orphaned += rdp->qlen; 1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */
1374 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] =
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 }
1267 1385
1268 rdp->nxtlist = NULL; 1386 /*
1269 for (i = 0; i < RCU_NEXT_SIZE; i++) 1387 * Record a quiescent state for the dying CPU. This is safe
1270 rdp->nxttail[i] = &rdp->nxtlist; 1388 * only because we have already cleared out the callbacks.
1271 rdp->qlen = 0; 1389 * (Otherwise, the RCU core might try to schedule the invocation
1390 * of callbacks on this now-offline CPU, which would be bad.)
1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */
1393 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1272} 1398}
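The splice above works on RCU's segmented callback list: one singly linked nxtlist plus an array of tail pointers marking where each segment ends. A stand-alone, user-space sketch of that splice with only two segments — the names and layout are simplified stand-ins, not the kernel's:

#include <stdio.h>

/* Hypothetical two-segment callback list: [ DONE | NEXT ], a toy model
 * of RCU's nxtlist/nxttail[] layout. */
struct cb {
        struct cb *next;
        int id;
};

enum { SEG_DONE, SEG_NEXT, NSEG };

struct cblist {
        struct cb *head;
        struct cb **tail[NSEG];         /* tail[i] = &last->next of segment i */
};

static void cblist_init(struct cblist *l)
{
        int i;

        l->head = NULL;
        for (i = 0; i < NSEG; i++)
                l->tail[i] = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
        c->next = NULL;
        *l->tail[SEG_NEXT] = c;
        l->tail[SEG_NEXT] = &c->next;
}

/* Move src's DONE segment into dst's DONE segment, fixing up both tails. */
static void cblist_splice_done(struct cblist *dst, struct cblist *src)
{
        struct cb *donehead = src->head;
        struct cb **donetail = src->tail[SEG_DONE];

        if (donetail == &src->head)
                return;                         /* nothing is done yet */

        /* Detach the done segment from src. */
        src->head = *donetail;
        if (src->tail[SEG_NEXT] == donetail)
                src->tail[SEG_NEXT] = &src->head;
        src->tail[SEG_DONE] = &src->head;

        /* Splice it in right after dst's existing done callbacks. */
        *donetail = *dst->tail[SEG_DONE];
        *dst->tail[SEG_DONE] = donehead;
        if (dst->tail[SEG_NEXT] == dst->tail[SEG_DONE])
                dst->tail[SEG_NEXT] = donetail;
        dst->tail[SEG_DONE] = donetail;
}

static void cblist_print(const char *name, const struct cblist *l)
{
        const struct cb *p;

        printf("%s:", name);
        for (p = l->head; p != NULL; p = p->next)
                printf(" %d", p->id);
        printf("\n");
}

int main(void)
{
        struct cblist src, dst;
        struct cb c[5];
        int i;

        cblist_init(&src);
        cblist_init(&dst);
        for (i = 0; i < 5; i++)
                c[i].id = i;

        for (i = 0; i < 3; i++)                 /* src: 0 1 2, with 0-1 done */
                cblist_enqueue(&src, &c[i]);
        src.tail[SEG_DONE] = &c[1].next;

        cblist_enqueue(&dst, &c[3]);            /* dst: 3 4, with 3 done */
        cblist_enqueue(&dst, &c[4]);
        dst.tail[SEG_DONE] = &c[3].next;

        cblist_splice_done(&dst, &src);
        cblist_print("dst", &dst);              /* dst: 3 0 1 4 */
        cblist_print("src", &src);              /* src: 2 */
        return 0;
}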
1273 1399
1274/* 1400/*
1275 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1401 * The CPU has been completely removed, and some other CPU is reporting
1276 * and move all callbacks from the outgoing CPU to the current one. 1402 * this fact from process context. Do the remainder of the cleanup.
1277 * There can only be one CPU hotplug operation at a time, so no other 1403 * There can only be one CPU hotplug operation at a time, so no other
1278 * CPU can be attempting to update rcu_cpu_kthread_task. 1404 * CPU can be attempting to update rcu_cpu_kthread_task.
1279 */ 1405 */
1280static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1406static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1281{ 1407{
1282 unsigned long flags; 1408 unsigned long flags;
1283 unsigned long mask; 1409 unsigned long mask;
1284 int need_report = 0; 1410 int need_report = 0;
1285 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1286 struct rcu_node *rnp; 1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */
1287 1413
1414 /* Adjust any no-longer-needed kthreads. */
1288 rcu_stop_cpu_kthread(cpu); 1415 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1);
1417
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
1289 1419
1290 /* Exclude any attempts to start a new grace period. */ 1420 /* Exclude any attempts to start a new grace period. */
1291 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1421 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1292 1422
1293 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1294 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
1295 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1424 mask = rdp->grpmask; /* rnp->grplo is constant. */
1296 do { 1425 do {
1297 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1426 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1299 if (rnp->qsmaskinit != 0) { 1428 if (rnp->qsmaskinit != 0) {
1300 if (rnp != rdp->mynode) 1429 if (rnp != rdp->mynode)
1301 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1430 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1302 else
1303 trace_rcu_grace_period(rsp->name,
1304 rnp->gpnum + 1 -
1305 !!(rnp->qsmask & mask),
1306 "cpuofl");
1307 break; 1431 break;
1308 } 1432 }
1309 if (rnp == rdp->mynode) { 1433 if (rnp == rdp->mynode)
1310 trace_rcu_grace_period(rsp->name,
1311 rnp->gpnum + 1 -
1312 !!(rnp->qsmask & mask),
1313 "cpuofl");
1314 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1434 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1315 } else 1435 else
1316 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1436 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1317 mask = rnp->grpmask; 1437 mask = rnp->grpmask;
1318 rnp = rnp->parent; 1438 rnp = rnp->parent;
@@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1332 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1452 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1333 if (need_report & RCU_OFL_TASKS_EXP_GP) 1453 if (need_report & RCU_OFL_TASKS_EXP_GP)
1334 rcu_report_exp_rnp(rsp, rnp, true); 1454 rcu_report_exp_rnp(rsp, rnp, true);
1335 rcu_node_kthread_setaffinity(rnp, -1);
1336}
1337
1338/*
1339 * Remove the specified CPU from the RCU hierarchy and move any pending
1340 * callbacks that it might have to the current CPU. This code assumes
1341 * that at least one CPU in the system will remain running at all times.
1342 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
1343 */
1344static void rcu_offline_cpu(int cpu)
1345{
1346 __rcu_offline_cpu(cpu, &rcu_sched_state);
1347 __rcu_offline_cpu(cpu, &rcu_bh_state);
1348 rcu_preempt_offline_cpu(cpu);
1349} 1455}
1350 1456
1351#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1457#else /* #ifdef CONFIG_HOTPLUG_CPU */
1352 1458
1353static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1354{ 1460{
1355} 1461}
1356 1462
1357static void rcu_offline_cpu(int cpu) 1463static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1358{ 1464{
1359} 1465}
1360 1466
@@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1368{ 1474{
1369 unsigned long flags; 1475 unsigned long flags;
1370 struct rcu_head *next, *list, **tail; 1476 struct rcu_head *next, *list, **tail;
1371 int bl, count; 1477 int bl, count, count_lazy;
1372 1478
1373 /* If no callbacks are ready, just return.*/ 1479 /* If no callbacks are ready, just return.*/
1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1480 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1375 trace_rcu_batch_start(rsp->name, 0, 0); 1481 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1482 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current), 1483 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread()); 1484 rcu_is_callbacks_kthread());
@@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1384 * races with call_rcu() from interrupt handlers. 1490 * races with call_rcu() from interrupt handlers.
1385 */ 1491 */
1386 local_irq_save(flags); 1492 local_irq_save(flags);
1493 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1387 bl = rdp->blimit; 1494 bl = rdp->blimit;
1388 trace_rcu_batch_start(rsp->name, rdp->qlen, bl); 1495 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
1389 list = rdp->nxtlist; 1496 list = rdp->nxtlist;
1390 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1497 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1391 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1498 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1396 local_irq_restore(flags); 1503 local_irq_restore(flags);
1397 1504
1398 /* Invoke callbacks. */ 1505 /* Invoke callbacks. */
1399 count = 0; 1506 count = count_lazy = 0;
1400 while (list) { 1507 while (list) {
1401 next = list->next; 1508 next = list->next;
1402 prefetch(next); 1509 prefetch(next);
1403 debug_rcu_head_unqueue(list); 1510 debug_rcu_head_unqueue(list);
1404 __rcu_reclaim(rsp->name, list); 1511 if (__rcu_reclaim(rsp->name, list))
1512 count_lazy++;
1405 list = next; 1513 list = next;
1406 /* Stop only if limit reached and CPU has something to do. */ 1514 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl && 1515 if (++count >= bl &&
@@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1416 rcu_is_callbacks_kthread()); 1524 rcu_is_callbacks_kthread());
1417 1525
1418 /* Update count, and requeue any remaining callbacks. */ 1526 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1419 rdp->qlen -= count; 1528 rdp->qlen -= count;
1420 rdp->n_cbs_invoked += count; 1529 rdp->n_cbs_invoked += count;
1421 if (list != NULL) { 1530 if (list != NULL) {
@@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1458void rcu_check_callbacks(int cpu, int user) 1567void rcu_check_callbacks(int cpu, int user)
1459{ 1568{
1460 trace_rcu_utilization("Start scheduler-tick"); 1569 trace_rcu_utilization("Start scheduler-tick");
1570 increment_cpu_stall_ticks();
1461 if (user || rcu_is_cpu_rrupt_from_idle()) { 1571 if (user || rcu_is_cpu_rrupt_from_idle()) {
1462 1572
1463 /* 1573 /*
@@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user)
1492 trace_rcu_utilization("End scheduler-tick"); 1602 trace_rcu_utilization("End scheduler-tick");
1493} 1603}
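The rcu_do_batch() hunks above thread a second counter (count_lazy / qlen_lazy) through the existing batch loop so that callbacks that merely free memory are accounted for separately, while the blimit batch limit and the requeue of leftovers work as before. A user-space sketch of that counting-plus-batching pattern, with made-up names:

#include <stdbool.h>
#include <stdio.h>

/* Toy callback: the lazy flag stands in for the kfree-offset encoding
 * that __rcu_reclaim() recognizes in the kernel. */
struct cb {
        struct cb *next;
        bool lazy;
};

struct queue {
        struct cb *head;
        struct cb **tail;
        long qlen;              /* all queued callbacks */
        long qlen_lazy;         /* lazy subset */
};

static void enqueue(struct queue *q, struct cb *c, bool lazy)
{
        c->next = NULL;
        c->lazy = lazy;
        *q->tail = c;
        q->tail = &c->next;
        q->qlen++;
        if (lazy)
                q->qlen_lazy++;
}

/* Invoke at most blimit callbacks, keep separate total/lazy counts,
 * and requeue whatever the limit left over. */
static void do_batch(struct queue *q, long blimit)
{
        struct cb *list = q->head, *next;
        long count = 0, count_lazy = 0;

        q->head = NULL;
        q->tail = &q->head;

        while (list != NULL) {
                next = list->next;
                if (list->lazy)         /* "invoking" a lazy cb just frees memory */
                        count_lazy++;
                list = next;
                if (++count >= blimit)
                        break;
        }

        if (list != NULL) {             /* requeue the remainder */
                q->head = list;
                for (next = list; next->next != NULL; next = next->next)
                        ;
                q->tail = &next->next;
        }

        q->qlen -= count;
        q->qlen_lazy -= count_lazy;
        printf("invoked %ld (%ld lazy), %ld left (%ld lazy)\n",
               count, count_lazy, q->qlen, q->qlen_lazy);
}

int main(void)
{
        struct queue q = { .head = NULL };
        struct cb c[5];
        int i;

        q.tail = &q.head;
        for (i = 0; i < 5; i++)
                enqueue(&q, &c[i], i == 1 || i == 4);   /* two lazy callbacks */

        do_batch(&q, 3);        /* invoked 3 (1 lazy), 2 left (1 lazy) */
        do_batch(&q, 3);        /* invoked 2 (1 lazy), 0 left (0 lazy) */
        return 0;
}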
1494 1604
1495#ifdef CONFIG_SMP
1496
1497/* 1605/*
1498 * Scan the leaf rcu_node structures, processing dyntick state for any that 1606 * Scan the leaf rcu_node structures, processing dyntick state for any that
1499 * have not yet encountered a quiescent state, using the function specified. 1607 * have not yet encountered a quiescent state, using the function specified.
@@ -1616,15 +1724,6 @@ unlock_fqs_ret:
1616 trace_rcu_utilization("End fqs"); 1724 trace_rcu_utilization("End fqs");
1617} 1725}
1618 1726
1619#else /* #ifdef CONFIG_SMP */
1620
1621static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1622{
1623 set_need_resched();
1624}
1625
1626#endif /* #else #ifdef CONFIG_SMP */
1627
1628/* 1727/*
1629 * This does the RCU core processing work for the specified rcu_state 1728 * This does the RCU core processing work for the specified rcu_state
1630 * and rcu_data structures. This may be called only from the CPU to 1729 * and rcu_data structures. This may be called only from the CPU to
@@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void)
1702 1801
1703static void 1802static void
1704__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1803__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1705 struct rcu_state *rsp) 1804 struct rcu_state *rsp, bool lazy)
1706{ 1805{
1707 unsigned long flags; 1806 unsigned long flags;
1708 struct rcu_data *rdp; 1807 struct rcu_data *rdp;
1709 1808
1809 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
1710 debug_rcu_head_queue(head); 1810 debug_rcu_head_queue(head);
1711 head->func = func; 1811 head->func = func;
1712 head->next = NULL; 1812 head->next = NULL;
@@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1720 * a quiescent state betweentimes. 1820 * a quiescent state betweentimes.
1721 */ 1821 */
1722 local_irq_save(flags); 1822 local_irq_save(flags);
1823 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1723 rdp = this_cpu_ptr(rsp->rda); 1824 rdp = this_cpu_ptr(rsp->rda);
1724 1825
1725 /* Add the callback to our list. */ 1826 /* Add the callback to our list. */
1726 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1827 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1727 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1828 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1728 rdp->qlen++; 1829 rdp->qlen++;
1830 if (lazy)
1831 rdp->qlen_lazy++;
1729 1832
1730 if (__is_kfree_rcu_offset((unsigned long)func)) 1833 if (__is_kfree_rcu_offset((unsigned long)func))
1731 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1834 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1732 rdp->qlen); 1835 rdp->qlen_lazy, rdp->qlen);
1733 else 1836 else
1734 trace_rcu_callback(rsp->name, head, rdp->qlen); 1837 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1735 1838
1736 /* If interrupts were disabled, don't dive into RCU core. */ 1839 /* If interrupts were disabled, don't dive into RCU core. */
1737 if (irqs_disabled_flags(flags)) { 1840 if (irqs_disabled_flags(flags)) {
@@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1778 */ 1881 */
1779void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1882void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1780{ 1883{
1781 __call_rcu(head, func, &rcu_sched_state); 1884 __call_rcu(head, func, &rcu_sched_state, 0);
1782} 1885}
1783EXPORT_SYMBOL_GPL(call_rcu_sched); 1886EXPORT_SYMBOL_GPL(call_rcu_sched);
1784 1887
1785/* 1888/*
1786 * Queue an RCU for invocation after a quicker grace period. 1889 * Queue an RCU callback for invocation after a quicker grace period.
1787 */ 1890 */
1788void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1891void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1789{ 1892{
1790 __call_rcu(head, func, &rcu_bh_state); 1893 __call_rcu(head, func, &rcu_bh_state, 0);
1791} 1894}
1792EXPORT_SYMBOL_GPL(call_rcu_bh); 1895EXPORT_SYMBOL_GPL(call_rcu_bh);
1793 1896
@@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1816 */ 1919 */
1817void synchronize_sched(void) 1920void synchronize_sched(void)
1818{ 1921{
1922 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1923 !lock_is_held(&rcu_lock_map) &&
1924 !lock_is_held(&rcu_sched_lock_map),
1925 "Illegal synchronize_sched() in RCU-sched read-side critical section");
1819 if (rcu_blocking_is_gp()) 1926 if (rcu_blocking_is_gp())
1820 return; 1927 return;
1821 wait_rcu_gp(call_rcu_sched); 1928 wait_rcu_gp(call_rcu_sched);
@@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1833 */ 1940 */
1834void synchronize_rcu_bh(void) 1941void synchronize_rcu_bh(void)
1835{ 1942{
1943 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1944 !lock_is_held(&rcu_lock_map) &&
1945 !lock_is_held(&rcu_sched_lock_map),
1946 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
1836 if (rcu_blocking_is_gp()) 1947 if (rcu_blocking_is_gp())
1837 return; 1948 return;
1838 wait_rcu_gp(call_rcu_bh); 1949 wait_rcu_gp(call_rcu_bh);
1839} 1950}
1840EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1951EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1841 1952
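The rcu_lockdep_assert() calls added to synchronize_sched() and synchronize_rcu_bh() use lockdep's lock maps to complain when a blocking grace-period wait is issued from inside the matching read-side critical section, which could never complete. A rough user-space analogue of that sanity check — a per-thread nesting counter instead of lockdep, purely illustrative:

#include <assert.h>
#include <stdio.h>

/* Per-thread read-side nesting depth (GCC __thread), standing in for
 * what lockdep's rcu_lock_map tracks in the kernel. */
static __thread int read_nesting;

static void toy_read_lock(void)   { read_nesting++; }
static void toy_read_unlock(void) { read_nesting--; }

/* A blocking grace-period wait inside a reader would wait on itself. */
static void toy_synchronize(void)
{
        assert(read_nesting == 0 &&
               "illegal toy_synchronize() in read-side critical section");
        /* ... wait here for all pre-existing readers to finish ... */
}

int main(void)
{
        toy_read_lock();
        /* toy_synchronize();     would trip the assertion: self-deadlock */
        toy_read_unlock();

        toy_synchronize();        /* legal outside any reader */
        printf("ok\n");
        return 0;
}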
1953static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1954static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1955
1956static int synchronize_sched_expedited_cpu_stop(void *data)
1957{
1958 /*
1959 * There must be a full memory barrier on each affected CPU
1960 * between the time that try_stop_cpus() is called and the
1961 * time that it returns.
1962 *
1963 * In the current initial implementation of cpu_stop, the
1964 * above condition is already met when the control reaches
1965 * this point and the following smp_mb() is not strictly
1966 * necessary. Do smp_mb() anyway for documentation and
1967 * robustness against future implementation changes.
1968 */
1969 smp_mb(); /* See above comment block. */
1970 return 0;
1971}
1972
1973/**
1974 * synchronize_sched_expedited - Brute-force RCU-sched grace period
1975 *
1976 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
1977 * approach to force the grace period to end quickly. This consumes
1978 * significant time on all CPUs and is unfriendly to real-time workloads,
1979 * so is thus not recommended for any sort of common-case code. In fact,
1980 * if you are using synchronize_sched_expedited() in a loop, please
1981 * restructure your code to batch your updates, and then use a single
1982 * synchronize_sched() instead.
1983 *
1984 * Note that it is illegal to call this function while holding any lock
1985 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
1986 * to call this function from a CPU-hotplug notifier. Failing to observe
1987 * these restrictions will result in deadlock.
1988 *
1989 * This implementation can be thought of as an application of ticket
1990 * locking to RCU, with sync_sched_expedited_started and
1991 * sync_sched_expedited_done taking on the roles of the halves
1992 * of the ticket-lock word. Each task atomically increments
1993 * sync_sched_expedited_started upon entry, snapshotting the old value,
1994 * then attempts to stop all the CPUs. If this succeeds, then each
1995 * CPU will have executed a context switch, resulting in an RCU-sched
1996 * grace period. We are then done, so we use atomic_cmpxchg() to
1997 * update sync_sched_expedited_done to match our snapshot -- but
1998 * only if someone else has not already advanced past our snapshot.
1999 *
2000 * On the other hand, if try_stop_cpus() fails, we check the value
2001 * of sync_sched_expedited_done. If it has advanced past our
2002 * initial snapshot, then someone else must have forced a grace period
2003 * some time after we took our snapshot. In this case, our work is
2004 * done for us, and we can simply return. Otherwise, we try again,
2005 * but keep our initial snapshot for purposes of checking for someone
2006 * doing our work for us.
2007 *
2008 * If we fail too many times in a row, we fall back to synchronize_sched().
2009 */
2010void synchronize_sched_expedited(void)
2011{
2012 int firstsnap, s, snap, trycount = 0;
2013
2014 /* Note that atomic_inc_return() implies full memory barrier. */
2015 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
2016 get_online_cpus();
2017 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2018
2019 /*
2020 * Each pass through the following loop attempts to force a
2021 * context switch on each CPU.
2022 */
2023 while (try_stop_cpus(cpu_online_mask,
2024 synchronize_sched_expedited_cpu_stop,
2025 NULL) == -EAGAIN) {
2026 put_online_cpus();
2027
2028 /* No joy, try again later. Or just synchronize_sched(). */
2029 if (trycount++ < 10)
2030 udelay(trycount * num_online_cpus());
2031 else {
2032 synchronize_sched();
2033 return;
2034 }
2035
2036 /* Check to see if someone else did our work for us. */
2037 s = atomic_read(&sync_sched_expedited_done);
2038 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
2039 smp_mb(); /* ensure test happens before caller kfree */
2040 return;
2041 }
2042
2043 /*
2044 * Refetching sync_sched_expedited_started allows later
2045 * callers to piggyback on our grace period. We subtract
2046 * 1 to get the same token that the last incrementer got.
2047 * We retry after they started, so our grace period works
2048 * for them, and they started after our first try, so their
2049 * grace period works for us.
2050 */
2051 get_online_cpus();
2052 snap = atomic_read(&sync_sched_expedited_started);
2053 smp_mb(); /* ensure read is before try_stop_cpus(). */
2054 }
2055
2056 /*
2057 * Everyone up to our most recent fetch is covered by our grace
2058 * period. Update the counter, but only if our work is still
2059 * relevant -- which it won't be if someone who started later
2060 * than we did beat us to the punch.
2061 */
2062 do {
2063 s = atomic_read(&sync_sched_expedited_done);
2064 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
2065 smp_mb(); /* ensure test happens before caller kfree */
2066 break;
2067 }
2068 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
2069
2070 put_online_cpus();
2071}
2072EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
2073
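The started/done atomics above act like the two halves of a ticket lock: each caller snapshots "started" on entry, and if "done" later reaches or passes that snapshot, a concurrent caller's grace period already covers it. A compressed, single-threaded sketch of that bookkeeping, including the wraparound-safe comparison that UINT_CMP_GE() provides in the kernel:

#include <stdbool.h>
#include <stdio.h>

static unsigned started;        /* toy sync_sched_expedited_started */
static unsigned done;           /* toy sync_sched_expedited_done */

/* Wraparound-safe "a >= b", in the spirit of UINT_CMP_GE(). */
static bool cmp_ge(unsigned a, unsigned b)
{
        return (int)(a - b) >= 0;
}

int main(void)
{
        /* A slow caller takes its ticket, but its try_stop_cpus() "fails". */
        unsigned firstsnap = ++started;

        /* Meanwhile a fast caller completes a full expedited grace period
         * and publishes it, never moving 'done' backwards. */
        unsigned fast_snap = ++started;
        if (!cmp_ge(done, fast_snap))
                done = fast_snap;

        /* On retry the slow caller notices its work was done for it. */
        if (cmp_ge(done, firstsnap))
                printf("piggybacked: done=%u >= firstsnap=%u\n",
                       done, firstsnap);
        return 0;
}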
1842/* 2074/*
1843 * Check to see if there is any immediate RCU-related work to be done 2075 * Check to see if there is any immediate RCU-related work to be done
1844 * by the current CPU, for the specified type of RCU, returning 1 if so. 2076 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu)
1932 /* RCU callbacks either ready or pending? */ 2164 /* RCU callbacks either ready or pending? */
1933 return per_cpu(rcu_sched_data, cpu).nxtlist || 2165 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1934 per_cpu(rcu_bh_data, cpu).nxtlist || 2166 per_cpu(rcu_bh_data, cpu).nxtlist ||
1935 rcu_preempt_needs_cpu(cpu); 2167 rcu_preempt_cpu_has_callbacks(cpu);
1936} 2168}
1937 2169
1938static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2170static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2027 rdp->nxtlist = NULL; 2259 rdp->nxtlist = NULL;
2028 for (i = 0; i < RCU_NEXT_SIZE; i++) 2260 for (i = 0; i < RCU_NEXT_SIZE; i++)
2029 rdp->nxttail[i] = &rdp->nxtlist; 2261 rdp->nxttail[i] = &rdp->nxtlist;
2262 rdp->qlen_lazy = 0;
2030 rdp->qlen = 0; 2263 rdp->qlen = 0;
2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2264 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); 2265 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2266 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2034 rdp->cpu = cpu; 2267 rdp->cpu = cpu;
2035 rdp->rsp = rsp; 2268 rdp->rsp = rsp;
@@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2057 rdp->qlen_last_fqs_check = 0; 2290 rdp->qlen_last_fqs_check = 0;
2058 rdp->n_force_qs_snap = rsp->n_force_qs; 2291 rdp->n_force_qs_snap = rsp->n_force_qs;
2059 rdp->blimit = blimit; 2292 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; 2293 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2061 atomic_set(&rdp->dynticks->dynticks, 2294 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2295 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu); 2296 rcu_prepare_for_idle_init(cpu);
@@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2139 * touch any data without introducing corruption. We send the 2372 * touch any data without introducing corruption. We send the
2140 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2373 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2141 */ 2374 */
2142 rcu_send_cbs_to_online(&rcu_bh_state); 2375 rcu_cleanup_dying_cpu(&rcu_bh_state);
2143 rcu_send_cbs_to_online(&rcu_sched_state); 2376 rcu_cleanup_dying_cpu(&rcu_sched_state);
2144 rcu_preempt_send_cbs_to_online(); 2377 rcu_preempt_cleanup_dying_cpu();
2145 rcu_cleanup_after_idle(cpu); 2378 rcu_cleanup_after_idle(cpu);
2146 break; 2379 break;
2147 case CPU_DEAD: 2380 case CPU_DEAD:
2148 case CPU_DEAD_FROZEN: 2381 case CPU_DEAD_FROZEN:
2149 case CPU_UP_CANCELED: 2382 case CPU_UP_CANCELED:
2150 case CPU_UP_CANCELED_FROZEN: 2383 case CPU_UP_CANCELED_FROZEN:
2151 rcu_offline_cpu(cpu); 2384 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
2385 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
2386 rcu_preempt_cleanup_dead_cpu(cpu);
2152 break; 2387 break;
2153 default: 2388 default:
2154 break; 2389 break;
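The notifier now splits CPU-offline work in two: rcu_cleanup_dying_cpu() runs at CPU_DYING time, in stop_machine() context where blocking is forbidden, while rcu_cleanup_dead_cpu() runs at CPU_DEAD time from process context where the kthread and rcu_node cleanup may take locks and block. A toy dispatch showing that split (all names invented):

#include <stdio.h>

enum toy_action { TOY_CPU_DYING, TOY_CPU_DEAD };

/* stop_machine()-like context: quick, non-blocking bookkeeping only. */
static void toy_cleanup_dying_cpu(int cpu)
{
        printf("cpu%d dying: splice callbacks, record quiescent state\n", cpu);
}

/* Process context, possibly much later: blocking cleanup is fine here. */
static void toy_cleanup_dead_cpu(int cpu)
{
        printf("cpu%d dead: stop kthreads, clear rcu_node bits\n", cpu);
}

static void toy_cpu_notify(enum toy_action action, int cpu)
{
        switch (action) {
        case TOY_CPU_DYING:
                toy_cleanup_dying_cpu(cpu);
                break;
        case TOY_CPU_DEAD:
                toy_cleanup_dead_cpu(cpu);
                break;
        }
}

int main(void)
{
        toy_cpu_notify(TOY_CPU_DYING, 3);
        toy_cpu_notify(TOY_CPU_DEAD, 3);
        return 0;
}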
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index fddff92d6676..cdd1be0a4072 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -239,6 +239,12 @@ struct rcu_data {
239 bool preemptible; /* Preemptible RCU? */ 239 bool preemptible; /* Preemptible RCU? */
240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
242#ifdef CONFIG_RCU_CPU_STALL_INFO
243 unsigned long ticks_this_gp; /* The number of scheduling-clock */
244 /* ticks this CPU has handled */
245 /* during and after the last grace */
246 /* period it is aware of. */
247#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
242 248
243 /* 2) batch handling */ 249 /* 2) batch handling */
244 /* 250 /*
@@ -265,7 +271,8 @@ struct rcu_data {
265 */ 271 */
266 struct rcu_head *nxtlist; 272 struct rcu_head *nxtlist;
267 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 273 struct rcu_head **nxttail[RCU_NEXT_SIZE];
268 long qlen; /* # of queued callbacks */ 274 long qlen_lazy; /* # of lazy queued callbacks */
275 long qlen; /* # of queued callbacks, incl lazy */
269 long qlen_last_fqs_check; 276 long qlen_last_fqs_check;
270 /* qlen at last check for QS forcing */ 277 /* qlen at last check for QS forcing */
271 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 278 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -282,7 +289,6 @@ struct rcu_data {
282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 289 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 290 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
284 unsigned long offline_fqs; /* Kicked due to being offline. */ 291 unsigned long offline_fqs; /* Kicked due to being offline. */
285 unsigned long resched_ipi; /* Sent a resched IPI. */
286 292
287 /* 5) __rcu_pending() statistics. */ 293 /* 5) __rcu_pending() statistics. */
288 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 294 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -313,12 +319,6 @@ struct rcu_data {
313#else 319#else
314#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
315#endif 321#endif
316
317#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
318 RCU_STALL_DELAY_DELTA)
319 /* for rsp->jiffies_stall */
320#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
321 /* for rsp->jiffies_stall */
322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
323 /* to take at least one */ 323 /* to take at least one */
324 /* scheduling clock irq */ 324 /* scheduling clock irq */
@@ -438,8 +438,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
438static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 438static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
439 struct rcu_node *rnp, 439 struct rcu_node *rnp,
440 struct rcu_data *rdp); 440 struct rcu_data *rdp);
441static void rcu_preempt_offline_cpu(int cpu);
442#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 441#endif /* #ifdef CONFIG_HOTPLUG_CPU */
442static void rcu_preempt_cleanup_dead_cpu(int cpu);
443static void rcu_preempt_check_callbacks(int cpu); 443static void rcu_preempt_check_callbacks(int cpu);
444static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
@@ -448,9 +448,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake); 448 bool wake);
449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
450static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
451static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_cpu_has_callbacks(int cpu);
452static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 452static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
453static void rcu_preempt_send_cbs_to_online(void); 453static void rcu_preempt_cleanup_dying_cpu(void);
454static void __init __rcu_init_preempt(void); 454static void __init __rcu_init_preempt(void);
455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -471,5 +471,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 473static void rcu_prepare_for_idle(int cpu);
474static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void);
477static void zero_cpu_stall_ticks(struct rcu_data *rdp);
478static void increment_cpu_stall_ticks(void);
474 479
475#endif /* #ifndef RCU_TREE_NONCORE */ 480#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8bb35d73e1f9..c023464816be 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,7 +25,6 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
29 28
30#define RCU_KTHREAD_PRIO 1 29#define RCU_KTHREAD_PRIO 1
31 30
@@ -63,7 +62,10 @@ static void __init rcu_bootup_announce_oddness(void)
63 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 62 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
64#endif 63#endif
65#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 64#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
66 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 65 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n");
66#endif
67#if defined(CONFIG_RCU_CPU_STALL_INFO)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
67#endif 69#endif
68#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
69 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
@@ -490,6 +492,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 492
491#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 493#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
492 494
495#ifdef CONFIG_RCU_CPU_STALL_INFO
496
497static void rcu_print_task_stall_begin(struct rcu_node *rnp)
498{
499 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
500 rnp->level, rnp->grplo, rnp->grphi);
501}
502
503static void rcu_print_task_stall_end(void)
504{
505 printk(KERN_CONT "\n");
506}
507
508#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
509
510static void rcu_print_task_stall_begin(struct rcu_node *rnp)
511{
512}
513
514static void rcu_print_task_stall_end(void)
515{
516}
517
518#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
519
493/* 520/*
494 * Scan the current list of tasks blocked within RCU read-side critical 521 * Scan the current list of tasks blocked within RCU read-side critical
495 * sections, printing out the tid of each. 522 * sections, printing out the tid of each.
@@ -501,12 +528,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
501 528
502 if (!rcu_preempt_blocked_readers_cgp(rnp)) 529 if (!rcu_preempt_blocked_readers_cgp(rnp))
503 return 0; 530 return 0;
531 rcu_print_task_stall_begin(rnp);
504 t = list_entry(rnp->gp_tasks, 532 t = list_entry(rnp->gp_tasks,
505 struct task_struct, rcu_node_entry); 533 struct task_struct, rcu_node_entry);
506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 534 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
507 printk(" P%d", t->pid); 535 printk(KERN_CONT " P%d", t->pid);
508 ndetected++; 536 ndetected++;
509 } 537 }
538 rcu_print_task_stall_end();
510 return ndetected; 539 return ndetected;
511} 540}
512 541
@@ -581,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
581 * absolutely necessary, but this is a good performance/complexity 610 * absolutely necessary, but this is a good performance/complexity
582 * tradeoff. 611 * tradeoff.
583 */ 612 */
584 if (rcu_preempt_blocked_readers_cgp(rnp)) 613 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
585 retval |= RCU_OFL_TASKS_NORM_GP; 614 retval |= RCU_OFL_TASKS_NORM_GP;
586 if (rcu_preempted_readers_exp(rnp)) 615 if (rcu_preempted_readers_exp(rnp))
587 retval |= RCU_OFL_TASKS_EXP_GP; 616 retval |= RCU_OFL_TASKS_EXP_GP;
@@ -618,16 +647,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
618 return retval; 647 return retval;
619} 648}
620 649
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651
621/* 652/*
622 * Do CPU-offline processing for preemptible RCU. 653 * Do CPU-offline processing for preemptible RCU.
623 */ 654 */
624static void rcu_preempt_offline_cpu(int cpu) 655static void rcu_preempt_cleanup_dead_cpu(int cpu)
625{ 656{
626 __rcu_offline_cpu(cpu, &rcu_preempt_state); 657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
627} 658}
628 659
629#endif /* #ifdef CONFIG_HOTPLUG_CPU */
630
631/* 660/*
632 * Check for a quiescent state from the current CPU. When a task blocks, 661 * Check for a quiescent state from the current CPU. When a task blocks,
633 * the task is recorded in the corresponding CPU's rcu_node structure, 662 * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -671,10 +700,24 @@ static void rcu_preempt_do_callbacks(void)
671 */ 700 */
672void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 701void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
673{ 702{
674 __call_rcu(head, func, &rcu_preempt_state); 703 __call_rcu(head, func, &rcu_preempt_state, 0);
675} 704}
676EXPORT_SYMBOL_GPL(call_rcu); 705EXPORT_SYMBOL_GPL(call_rcu);
677 706
707/*
708 * Queue an RCU callback for lazy invocation after a grace period.
709 * This will likely be later named something like "call_rcu_lazy()",
710 * but this change will require some way of tagging the lazy RCU
711 * callbacks in the list of pending callbacks. Until then, this
712 * function may only be called from __kfree_rcu().
713 */
714void kfree_call_rcu(struct rcu_head *head,
715 void (*func)(struct rcu_head *rcu))
716{
717 __call_rcu(head, func, &rcu_preempt_state, 1);
718}
719EXPORT_SYMBOL_GPL(kfree_call_rcu);
720
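kfree_call_rcu() passes lazy=1 because its callbacks do nothing but free the enclosing object: kfree_rcu() stores the offset of the rcu_head within the structure in place of a real function pointer, and __is_kfree_rcu_offset() (also checked in __call_rcu() above for tracing) recognizes such small values. A stand-alone sketch of that encoding trick — toy names, and the integer/function-pointer casts are the usual implementation-defined kernel idiom:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct toy_rcu_head {
        struct toy_rcu_head *next;
        void (*func)(struct toy_rcu_head *);
};

/* An object whose only post-grace-period work is freeing itself. */
struct widget {
        int payload;
        struct toy_rcu_head rh;
};

/* Small values cannot be valid function pointers, so they can double as
 * "offset of the rcu_head inside the enclosing object" markers. */
static int is_offset(unsigned long f)
{
        return f < 4096;
}

static void toy_reclaim(struct toy_rcu_head *rh)
{
        unsigned long f = (unsigned long)rh->func;

        if (is_offset(f)) {
                free((char *)rh - f);   /* lazy: just free the object */
                printf("freed enclosing object (offset %lu)\n", f);
        } else {
                rh->func(rh);           /* ordinary callback */
        }
}

static void report(struct toy_rcu_head *rh)
{
        (void)rh;
        printf("ordinary callback ran\n");
}

int main(void)
{
        struct widget *w = malloc(sizeof(*w));
        struct toy_rcu_head ordinary;

        if (w == NULL)
                return 1;

        /* "kfree_rcu(w, rh)": record the offset of rh within *w. */
        w->rh.func = (void (*)(struct toy_rcu_head *))
                     (unsigned long)offsetof(struct widget, rh);
        toy_reclaim(&w->rh);

        ordinary.func = report;
        toy_reclaim(&ordinary);
        return 0;
}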
678/** 721/**
679 * synchronize_rcu - wait until a grace period has elapsed. 722 * synchronize_rcu - wait until a grace period has elapsed.
680 * 723 *
@@ -688,6 +731,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
688 */ 731 */
689void synchronize_rcu(void) 732void synchronize_rcu(void)
690{ 733{
734 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
735 !lock_is_held(&rcu_lock_map) &&
736 !lock_is_held(&rcu_sched_lock_map),
737 "Illegal synchronize_rcu() in RCU read-side critical section");
691 if (!rcu_scheduler_active) 738 if (!rcu_scheduler_active)
692 return; 739 return;
693 wait_rcu_gp(call_rcu); 740 wait_rcu_gp(call_rcu);
@@ -788,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 835 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
789} 836}
790 837
791/* 838/**
792 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 839 * synchronize_rcu_expedited - Brute-force RCU grace period
793 * is to invoke synchronize_sched_expedited() to push all the tasks to 840 *
794 * the ->blkd_tasks lists and wait for this list to drain. 841 * Wait for an RCU-preempt grace period, but expedite it. The basic
842 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
843 * the ->blkd_tasks lists and wait for this list to drain. This consumes
844 * significant time on all CPUs and is unfriendly to real-time workloads,
845 * so is thus not recommended for any sort of common-case code.
846 * In fact, if you are using synchronize_rcu_expedited() in a loop,
847 * please restructure your code to batch your updates, and then use a
848 * single synchronize_rcu() instead.
849 *
850 * Note that it is illegal to call this function while holding any lock
851 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
852 * to call this function from a CPU-hotplug notifier. Failing to observe
853 * these restrictions will result in deadlock.
795 */ 854 */
796void synchronize_rcu_expedited(void) 855void synchronize_rcu_expedited(void)
797{ 856{
@@ -869,9 +928,9 @@ static int rcu_preempt_pending(int cpu)
869} 928}
870 929
871/* 930/*
872 * Does preemptible RCU need the CPU to stay out of dynticks mode? 931 * Does preemptible RCU have callbacks on this CPU?
873 */ 932 */
874static int rcu_preempt_needs_cpu(int cpu) 933static int rcu_preempt_cpu_has_callbacks(int cpu)
875{ 934{
876 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
877} 936}
@@ -894,11 +953,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
894} 953}
895 954
896/* 955/*
897 * Move preemptible RCU's callbacks from dying CPU to other online CPU. 956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
898 */ 958 */
899static void rcu_preempt_send_cbs_to_online(void) 959static void rcu_preempt_cleanup_dying_cpu(void)
900{ 960{
901 rcu_send_cbs_to_online(&rcu_preempt_state); 961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
902} 962}
903 963
904/* 964/*
@@ -1034,16 +1094,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1034 return 0; 1094 return 0;
1035} 1095}
1036 1096
1097#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1098
1037/* 1099/*
1038 * Because preemptible RCU does not exist, it never needs CPU-offline 1100 * Because preemptible RCU does not exist, it never needs CPU-offline
1039 * processing. 1101 * processing.
1040 */ 1102 */
1041static void rcu_preempt_offline_cpu(int cpu) 1103static void rcu_preempt_cleanup_dead_cpu(int cpu)
1042{ 1104{
1043} 1105}
1044 1106
1045#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1046
1047/* 1107/*
1048 * Because preemptible RCU does not exist, it never has any callbacks 1108 * Because preemptible RCU does not exist, it never has any callbacks
1049 * to check. 1109 * to check.
@@ -1061,6 +1121,22 @@ static void rcu_preempt_process_callbacks(void)
1061} 1121}
1062 1122
1063/* 1123/*
1124 * Queue an RCU callback for lazy invocation after a grace period.
1125 * This will likely be later named something like "call_rcu_lazy()",
1126 * but this change will require some way of tagging the lazy RCU
1127 * callbacks in the list of pending callbacks. Until then, this
1128 * function may only be called from __kfree_rcu().
1129 *
1130 * Because there is no preemptible RCU, we use RCU-sched instead.
1131 */
1132void kfree_call_rcu(struct rcu_head *head,
1133 void (*func)(struct rcu_head *rcu))
1134{
1135 __call_rcu(head, func, &rcu_sched_state, 1);
1136}
1137EXPORT_SYMBOL_GPL(kfree_call_rcu);
1138
1139/*
1064 * Wait for an rcu-preempt grace period, but make it happen quickly. 1140 * Wait for an rcu-preempt grace period, but make it happen quickly.
1065 * But because preemptible RCU does not exist, map to rcu-sched. 1141 * But because preemptible RCU does not exist, map to rcu-sched.
1066 */ 1142 */
@@ -1093,9 +1169,9 @@ static int rcu_preempt_pending(int cpu)
1093} 1169}
1094 1170
1095/* 1171/*
1096 * Because preemptible RCU does not exist, it never needs any CPU. 1172 * Because preemptible RCU does not exist, it never has callbacks.
1097 */ 1173 */
1098static int rcu_preempt_needs_cpu(int cpu) 1174static int rcu_preempt_cpu_has_callbacks(int cpu)
1099{ 1175{
1100 return 0; 1176 return 0;
1101} 1177}
@@ -1119,9 +1195,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1119} 1195}
1120 1196
1121/* 1197/*
1122 * Because there is no preemptible RCU, there are no callbacks to move. 1198 * Because there is no preemptible RCU, there is no cleanup to do.
1123 */ 1199 */
1124static void rcu_preempt_send_cbs_to_online(void) 1200static void rcu_preempt_cleanup_dying_cpu(void)
1125{ 1201{
1126} 1202}
1127 1203
@@ -1823,132 +1899,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1823 1899
1824#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1900#endif /* #else #ifdef CONFIG_RCU_BOOST */
1825 1901
1826#ifndef CONFIG_SMP
1827
1828void synchronize_sched_expedited(void)
1829{
1830 cond_resched();
1831}
1832EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1833
1834#else /* #ifndef CONFIG_SMP */
1835
1836static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1837static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1838
1839static int synchronize_sched_expedited_cpu_stop(void *data)
1840{
1841 /*
1842 * There must be a full memory barrier on each affected CPU
1843 * between the time that try_stop_cpus() is called and the
1844 * time that it returns.
1845 *
1846 * In the current initial implementation of cpu_stop, the
1847 * above condition is already met when the control reaches
1848 * this point and the following smp_mb() is not strictly
1849 * necessary. Do smp_mb() anyway for documentation and
1850 * robustness against future implementation changes.
1851 */
1852 smp_mb(); /* See above comment block. */
1853 return 0;
1854}
1855
1856/*
1857 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1858 * approach to force grace period to end quickly. This consumes
1859 * significant time on all CPUs, and is thus not recommended for
1860 * any sort of common-case code.
1861 *
1862 * Note that it is illegal to call this function while holding any
1863 * lock that is acquired by a CPU-hotplug notifier. Failing to
1864 * observe this restriction will result in deadlock.
1865 *
1866 * This implementation can be thought of as an application of ticket
1867 * locking to RCU, with sync_sched_expedited_started and
1868 * sync_sched_expedited_done taking on the roles of the halves
1869 * of the ticket-lock word. Each task atomically increments
1870 * sync_sched_expedited_started upon entry, snapshotting the old value,
1871 * then attempts to stop all the CPUs. If this succeeds, then each
1872 * CPU will have executed a context switch, resulting in an RCU-sched
1873 * grace period. We are then done, so we use atomic_cmpxchg() to
1874 * update sync_sched_expedited_done to match our snapshot -- but
1875 * only if someone else has not already advanced past our snapshot.
1876 *
1877 * On the other hand, if try_stop_cpus() fails, we check the value
1878 * of sync_sched_expedited_done. If it has advanced past our
1879 * initial snapshot, then someone else must have forced a grace period
1880 * some time after we took our snapshot. In this case, our work is
1881 * done for us, and we can simply return. Otherwise, we try again,
1882 * but keep our initial snapshot for purposes of checking for someone
1883 * doing our work for us.
1884 *
1885 * If we fail too many times in a row, we fall back to synchronize_sched().
1886 */
1887void synchronize_sched_expedited(void)
1888{
1889 int firstsnap, s, snap, trycount = 0;
1890
1891 /* Note that atomic_inc_return() implies full memory barrier. */
1892 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1893 get_online_cpus();
1894
1895 /*
1896 * Each pass through the following loop attempts to force a
1897 * context switch on each CPU.
1898 */
1899 while (try_stop_cpus(cpu_online_mask,
1900 synchronize_sched_expedited_cpu_stop,
1901 NULL) == -EAGAIN) {
1902 put_online_cpus();
1903
1904 /* No joy, try again later. Or just synchronize_sched(). */
1905 if (trycount++ < 10)
1906 udelay(trycount * num_online_cpus());
1907 else {
1908 synchronize_sched();
1909 return;
1910 }
1911
1912 /* Check to see if someone else did our work for us. */
1913 s = atomic_read(&sync_sched_expedited_done);
1914 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1915 smp_mb(); /* ensure test happens before caller kfree */
1916 return;
1917 }
1918
1919 /*
1920 * Refetching sync_sched_expedited_started allows later
1921 * callers to piggyback on our grace period. We subtract
1922 * 1 to get the same token that the last incrementer got.
1923 * We retry after they started, so our grace period works
1924 * for them, and they started after our first try, so their
1925 * grace period works for us.
1926 */
1927 get_online_cpus();
1928 snap = atomic_read(&sync_sched_expedited_started);
1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1930 }
1931
1932 /*
1933 * Everyone up to our most recent fetch is covered by our grace
1934 * period. Update the counter, but only if our work is still
1935 * relevant -- which it won't be if someone who started later
1936 * than we did beat us to the punch.
1937 */
1938 do {
1939 s = atomic_read(&sync_sched_expedited_done);
1940 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1941 smp_mb(); /* ensure test happens before caller kfree */
1942 break;
1943 }
1944 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1945
1946 put_online_cpus();
1947}
1948EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1949
1950#endif /* #else #ifndef CONFIG_SMP */
1951
1952#if !defined(CONFIG_RCU_FAST_NO_HZ) 1902#if !defined(CONFIG_RCU_FAST_NO_HZ)
1953 1903
1954/* 1904/*
@@ -1981,7 +1931,7 @@ static void rcu_cleanup_after_idle(int cpu)
1981} 1931}
1982 1932
1983/* 1933/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, 1934 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing. 1935 * is nothing.
1986 */ 1936 */
1987static void rcu_prepare_for_idle(int cpu) 1937static void rcu_prepare_for_idle(int cpu)
@@ -2015,6 +1965,9 @@ static void rcu_prepare_for_idle(int cpu)
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your 1965 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency, 1966 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it! 1967 * just power the system down and be done with it!
1968 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1969 * permitted to sleep in dyntick-idle mode with only lazy RCU
1970 * callbacks pending. Setting this too high can OOM your system.
2018 * 1971 *
2019 * The values below work well in practice. If future workloads require 1972 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though 1973 * adjustment, they can be converted into kernel config parameters, though
@@ -2023,11 +1976,13 @@ static void rcu_prepare_for_idle(int cpu)
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1976#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1977#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
2026 1980
2027static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1981static DEFINE_PER_CPU(int, rcu_dyntick_drain);
2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait; 1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
2031 1986
2032/* 1987/*
2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -2048,6 +2003,48 @@ int rcu_needs_cpu(int cpu)
2048} 2003}
2049 2004
2050/* 2005/*
2006 * Does the specified flavor of RCU have non-lazy callbacks pending on
2007 * the specified CPU? Both RCU flavor and CPU are specified by the
2008 * rcu_data structure.
2009 */
2010static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
2011{
2012 return rdp->qlen != rdp->qlen_lazy;
2013}
2014
2015#ifdef CONFIG_TREE_PREEMPT_RCU
2016
2017/*
2018 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
2019 * is no RCU-preempt in the kernel.)
2020 */
2021static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2022{
2023 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
2024
2025 return __rcu_cpu_has_nonlazy_callbacks(rdp);
2026}
2027
2028#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2029
2030static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2031{
2032 return 0;
2033}
2034
2035#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
2036
2037/*
2038 * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
2039 */
2040static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2041{
2042 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
2043 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
2044 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
2045}
2046
2047/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU 2048 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks 2049 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the 2050
@@ -2074,6 +2071,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); 2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075 2072
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); 2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0; 2076 firsttime = 0;
2078 } 2077 }
2079} 2078}
@@ -2109,10 +2108,6 @@ static void rcu_cleanup_after_idle(int cpu)
2109 */ 2108 */
2110static void rcu_prepare_for_idle(int cpu) 2109static void rcu_prepare_for_idle(int cpu)
2111{ 2110{
2112 unsigned long flags;
2113
2114 local_irq_save(flags);
2115
2116 /* 2111 /*
2117 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2112 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2118 * Also reset state to avoid prejudicing later attempts. 2113 * Also reset state to avoid prejudicing later attempts.
@@ -2120,7 +2115,6 @@ static void rcu_prepare_for_idle(int cpu)
2120 if (!rcu_cpu_has_callbacks(cpu)) { 2115 if (!rcu_cpu_has_callbacks(cpu)) {
2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2116 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2122 per_cpu(rcu_dyntick_drain, cpu) = 0; 2117 per_cpu(rcu_dyntick_drain, cpu) = 0;
2123 local_irq_restore(flags);
2124 trace_rcu_prep_idle("No callbacks"); 2118 trace_rcu_prep_idle("No callbacks");
2125 return; 2119 return;
2126 } 2120 }
@@ -2130,7 +2124,6 @@ static void rcu_prepare_for_idle(int cpu)
2130 * refrained from disabling the scheduling-clock tick. 2124 * refrained from disabling the scheduling-clock tick.
2131 */ 2125 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2126 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff"); 2127 trace_rcu_prep_idle("In holdoff");
2135 return; 2128 return;
2136 } 2129 }
@@ -2140,18 +2133,22 @@ static void rcu_prepare_for_idle(int cpu)
2140 /* First time through, initialize the counter. */ 2133 /* First time through, initialize the counter. */
2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2134 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2135 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) { 2136 !rcu_pending(cpu) &&
2137 !local_softirq_pending()) {
2144 /* Can we go dyntick-idle despite still having callbacks? */ 2138 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks"); 2139 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2140 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2142 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2145 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */ 2148 return; /* Nothing more to do immediately. */
2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2152 /* We have hit the limit, so time to give up. */ 2150 /* We have hit the limit, so time to give up. */
2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2151 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff"); 2152 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2153 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return; 2154 return;
@@ -2163,23 +2160,17 @@ static void rcu_prepare_for_idle(int cpu)
2163 */ 2160 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU 2161#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 2162 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu); 2163 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0); 2164 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 } 2165 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 2166#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2167 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2174 rcu_sched_qs(cpu); 2168 rcu_sched_qs(cpu);
2175 force_quiescent_state(&rcu_sched_state, 0); 2169 force_quiescent_state(&rcu_sched_state, 0);
2176 local_irq_save(flags);
2177 } 2170 }
2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2171 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2180 rcu_bh_qs(cpu); 2172 rcu_bh_qs(cpu);
2181 force_quiescent_state(&rcu_bh_state, 0); 2173 force_quiescent_state(&rcu_bh_state, 0);
2182 local_irq_save(flags);
2183 } 2174 }
2184 2175
2185 /* 2176 /*
@@ -2187,13 +2178,124 @@ static void rcu_prepare_for_idle(int cpu)
2187 * So try forcing the callbacks through the grace period. 2178 * So try forcing the callbacks through the grace period.
2188 */ 2179 */
2189 if (rcu_cpu_has_callbacks(cpu)) { 2180 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks"); 2181 trace_rcu_prep_idle("More callbacks");
2192 invoke_rcu_core(); 2182 invoke_rcu_core();
2193 } else { 2183 } else
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained"); 2184 trace_rcu_prep_idle("Callbacks drained");
2196 }
2197} 2185}
2198 2186
2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188
2189#ifdef CONFIG_RCU_CPU_STALL_INFO
2190
2191#ifdef CONFIG_RCU_FAST_NO_HZ
2192
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2196
2197 sprintf(cp, "drain=%d %c timer=%lld",
2198 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp)
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203}
2204
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2206
2207static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2208{
2209}
2210
2211#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
2212
2213/* Initiate the stall-info list. */
2214static void print_cpu_stall_info_begin(void)
2215{
2216 printk(KERN_CONT "\n");
2217}
2218
2219/*
2220 * Print out diagnostic information for the specified stalled CPU.
2221 *
2222 * If the specified CPU is aware of the current RCU grace period
2223 * (flavor specified by rsp), then print the number of scheduling
2224 * clock interrupts the CPU has taken during the time that it has
2225 * been aware. Otherwise, print the number of RCU grace periods
2226 * that this CPU is ignorant of, for example, "1" if the CPU was
2227 * aware of the previous grace period.
2228 *
2229 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
2230 */
2231static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2232{
2233 char fast_no_hz[72];
2234 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2235 struct rcu_dynticks *rdtp = rdp->dynticks;
2236 char *ticks_title;
2237 unsigned long ticks_value;
2238
2239 if (rsp->gpnum == rdp->gpnum) {
2240 ticks_title = "ticks this GP";
2241 ticks_value = rdp->ticks_this_gp;
2242 } else {
2243 ticks_title = "GPs behind";
2244 ticks_value = rsp->gpnum - rdp->gpnum;
2245 }
2246 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2247 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
2248 cpu, ticks_value, ticks_title,
2249 atomic_read(&rdtp->dynticks) & 0xfff,
2250 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2251 fast_no_hz);
2252}
2253
2254/* Terminate the stall-info list. */
2255static void print_cpu_stall_info_end(void)
2256{
2257 printk(KERN_ERR "\t");
2258}
2259
2260/* Zero ->ticks_this_gp for all flavors of RCU. */
2261static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2262{
2263 rdp->ticks_this_gp = 0;
2264}
2265
2266/* Increment ->ticks_this_gp for all flavors of RCU. */
2267static void increment_cpu_stall_ticks(void)
2268{
2269 __get_cpu_var(rcu_sched_data).ticks_this_gp++;
2270 __get_cpu_var(rcu_bh_data).ticks_this_gp++;
2271#ifdef CONFIG_TREE_PREEMPT_RCU
2272 __get_cpu_var(rcu_preempt_data).ticks_this_gp++;
2273#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2274}
2275
2276#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
2277
2278static void print_cpu_stall_info_begin(void)
2279{
2280 printk(KERN_CONT " {");
2281}
2282
2283static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2284{
2285 printk(KERN_CONT " %d", cpu);
2286}
2287
2288static void print_cpu_stall_info_end(void)
2289{
2290 printk(KERN_CONT "} ");
2291}
2292
2293static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2294{
2295}
2296
2297static void increment_cpu_stall_ticks(void)
2298{
2299}
2300
2301#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
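
The CONFIG_RCU_CPU_STALL_INFO block added above changes what a stall report looks like: with the option enabled each stalled CPU gets a detailed per-CPU line, while the fallback versions only print the CPU numbers inside braces. A small standalone sketch that mimics the two output shapes (plain printf, made-up field values, printk level prefixes omitted; illustrative only, not part of this patch):

#include <stdio.h>

int main(void)
{
        int cpus[] = { 1, 3 };
        int i;

        /* CONFIG_RCU_CPU_STALL_INFO=y: one detailed line per stalled CPU */
        for (i = 0; i < 2; i++)
                printf("\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
                       cpus[i], 42UL, "ticks this GP",
                       0x144, 0x1ULL, 0, "drain=2 . timer=-1");

        /* CONFIG_RCU_CPU_STALL_INFO=n: just a brace-enclosed CPU list */
        printf(" {");
        for (i = 0; i < 2; i++)
                printf(" %d", cpus[i]);
        printf("} \n");
        return 0;
}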
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 654cfe67f0d1..ed459edeff43 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -72,9 +72,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
72 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
73 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
74 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu", rdp->offline_fqs);
76 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
77 rdp->qlen, 77 rdp->qlen_lazy, rdp->qlen,
78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
79 rdp->nxttail[RCU_NEXT_TAIL]], 79 rdp->nxttail[RCU_NEXT_TAIL]],
80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -144,8 +144,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
145 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
146 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu", rdp->offline_fqs);
148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
150 rdp->nxttail[RCU_NEXT_TAIL]], 150 rdp->nxttail[RCU_NEXT_TAIL]],
151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 168{
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
172#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
173 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 174#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/resource.c b/kernel/resource.c
index 7640b3a947d0..7e8ea66a8c01 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
749 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
750 return result; 750 return result;
751} 751}
752EXPORT_SYMBOL(adjust_resource);
752 753
753static void __init __reserve_region_with_split(struct resource *root, 754static void __init __reserve_region_with_split(struct resource *root,
754 resource_size_t start, resource_size_t end, 755 resource_size_t start, resource_size_t end,
@@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root,
792 write_unlock(&resource_lock); 793 write_unlock(&resource_lock);
793} 794}
794 795
795EXPORT_SYMBOL(adjust_resource);
796
797/** 796/**
798 * resource_alignment - calculate resource's alignment 797 * resource_alignment - calculate resource's alignment
799 * @res: resource pointer 798 * @res: resource pointer
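
The resource.c hunks above are purely positional: EXPORT_SYMBOL(adjust_resource) moves from its old spot near reserve_region_with_split() to directly below the function it exports, which is the usual kernel convention. A minimal kernel-style sketch of that layout (hypothetical function, not from this patch):

#include <linux/export.h>

int example_adjust(int delta)
{
        return delta + 1;
}
EXPORT_SYMBOL(example_adjust);  /* the export tag immediately follows the definition */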
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b152f74f02de..6850f53e02d8 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -10,7 +10,6 @@
10#include <linux/export.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h>
14#include <linux/atomic.h> 13#include <linux/atomic.h>
15 14
16/* 15/*
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e8a1f83ee0e7..0984a21076a3 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup);
195 195
196#ifdef CONFIG_PROC_FS 196#ifdef CONFIG_PROC_FS
197 197
198int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) 198int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
199{ 199{
200 static unsigned long next = INITIAL_JIFFIES; 200 static unsigned long next = INITIAL_JIFFIES;
201 struct autogroup *ag; 201 struct autogroup *ag;
202 int err; 202 int err;
203 203
204 if (*nice < -20 || *nice > 19) 204 if (nice < -20 || nice > 19)
205 return -EINVAL; 205 return -EINVAL;
206 206
207 err = security_task_setnice(current, *nice); 207 err = security_task_setnice(current, nice);
208 if (err) 208 if (err)
209 return err; 209 return err;
210 210
211 if (*nice < 0 && !can_nice(current, *nice)) 211 if (nice < 0 && !can_nice(current, nice))
212 return -EPERM; 212 return -EPERM;
213 213
214 /* this is a heavy operation taking global locks.. */ 214 /* this is a heavy operation taking global locks.. */
@@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
219 ag = autogroup_task_get(p); 219 ag = autogroup_task_get(p);
220 220
221 down_write(&ag->lock); 221 down_write(&ag->lock);
222 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); 222 err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
223 if (!err) 223 if (!err)
224 ag->nice = *nice; 224 ag->nice = nice;
225 up_write(&ag->lock); 225 up_write(&ag->lock);
226 226
227 autogroup_kref_put(ag); 227 autogroup_kref_put(ag);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5255c9d2e053..afc6d7e71557 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,7 +71,9 @@
71#include <linux/ftrace.h> 71#include <linux/ftrace.h>
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h>
74 75
76#include <asm/switch_to.h>
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
77#include <asm/mutex.h> 79#include <asm/mutex.h>
@@ -162,13 +164,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
162 164
163#ifdef HAVE_JUMP_LABEL 165#ifdef HAVE_JUMP_LABEL
164 166
165#define jump_label_key__true jump_label_key_enabled 167#define jump_label_key__true STATIC_KEY_INIT_TRUE
166#define jump_label_key__false jump_label_key_disabled 168#define jump_label_key__false STATIC_KEY_INIT_FALSE
167 169
168#define SCHED_FEAT(name, enabled) \ 170#define SCHED_FEAT(name, enabled) \
169 jump_label_key__##enabled , 171 jump_label_key__##enabled ,
170 172
171struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { 173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
172#include "features.h" 174#include "features.h"
173}; 175};
174 176
@@ -176,14 +178,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
176 178
177static void sched_feat_disable(int i) 179static void sched_feat_disable(int i)
178{ 180{
179 if (jump_label_enabled(&sched_feat_keys[i])) 181 if (static_key_enabled(&sched_feat_keys[i]))
180 jump_label_dec(&sched_feat_keys[i]); 182 static_key_slow_dec(&sched_feat_keys[i]);
181} 183}
182 184
183static void sched_feat_enable(int i) 185static void sched_feat_enable(int i)
184{ 186{
185 if (!jump_label_enabled(&sched_feat_keys[i])) 187 if (!static_key_enabled(&sched_feat_keys[i]))
186 jump_label_inc(&sched_feat_keys[i]); 188 static_key_slow_inc(&sched_feat_keys[i]);
187} 189}
188#else 190#else
189static void sched_feat_disable(int i) { }; 191static void sched_feat_disable(int i) { };
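
These kernel/sched/core.c hunks (here and in the hunks that follow) track the jump-label API rename: struct jump_label_key becomes struct static_key, jump_label_inc()/jump_label_dec() become static_key_slow_inc()/static_key_slow_dec(), and the branch test static_branch() becomes static_key_false() for keys that default to off. A minimal kernel-style sketch of the new spelling (hypothetical key name; assumes <linux/jump_label.h>; illustrative only):

#include <linux/jump_label.h>
#include <linux/types.h>

static struct static_key example_key = STATIC_KEY_INIT_FALSE;
static unsigned long example_hits;

static void example_fast_path(void)
{
        /* compiled as a patchable jump; the branch body is off by default */
        if (static_key_false(&example_key))
                example_hits++;
}

static void example_set(bool on)
{
        if (on && !static_key_enabled(&example_key))
                static_key_slow_inc(&example_key);
        else if (!on && static_key_enabled(&example_key))
                static_key_slow_dec(&example_key);
}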
@@ -894,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
894 delta -= irq_delta; 896 delta -= irq_delta;
895#endif 897#endif
896#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 898#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
897 if (static_branch((&paravirt_steal_rq_enabled))) { 899 if (static_key_false((&paravirt_steal_rq_enabled))) {
898 u64 st; 900 u64 st;
899 901
900 steal = paravirt_steal_clock(cpu_of(rq)); 902 steal = paravirt_steal_clock(cpu_of(rq));
@@ -1263,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
1263 */ 1265 */
1264static int select_fallback_rq(int cpu, struct task_struct *p) 1266static int select_fallback_rq(int cpu, struct task_struct *p)
1265{ 1267{
1266 int dest_cpu;
1267 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1268 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1269 enum { cpuset, possible, fail } state = cpuset;
1270 int dest_cpu;
1268 1271
1269 /* Look for allowed, online CPU in same node. */ 1272 /* Look for allowed, online CPU in same node. */
1270 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1273 for_each_cpu(dest_cpu, nodemask) {
1274 if (!cpu_online(dest_cpu))
1275 continue;
1276 if (!cpu_active(dest_cpu))
1277 continue;
1271 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1278 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1272 return dest_cpu; 1279 return dest_cpu;
1280 }
1273 1281
1274 /* Any allowed, online CPU? */ 1282 for (;;) {
1275 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); 1283 /* Any allowed, online CPU? */
1276 if (dest_cpu < nr_cpu_ids) 1284 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1277 return dest_cpu; 1285 if (!cpu_online(dest_cpu))
1286 continue;
1287 if (!cpu_active(dest_cpu))
1288 continue;
1289 goto out;
1290 }
1278 1291
1279 /* No more Mr. Nice Guy. */ 1292 switch (state) {
1280 dest_cpu = cpuset_cpus_allowed_fallback(p); 1293 case cpuset:
1281 /* 1294 /* No more Mr. Nice Guy. */
1282 * Don't tell them about moving exiting tasks or 1295 cpuset_cpus_allowed_fallback(p);
1283 * kernel threads (both mm NULL), since they never 1296 state = possible;
1284 * leave kernel. 1297 break;
1285 */ 1298
1286 if (p->mm && printk_ratelimit()) { 1299 case possible:
1287 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 1300 do_set_cpus_allowed(p, cpu_possible_mask);
1288 task_pid_nr(p), p->comm, cpu); 1301 state = fail;
1302 break;
1303
1304 case fail:
1305 BUG();
1306 break;
1307 }
1308 }
1309
1310out:
1311 if (state != cpuset) {
1312 /*
1313 * Don't tell them about moving exiting tasks or
1314 * kernel threads (both mm NULL), since they never
1315 * leave kernel.
1316 */
1317 if (p->mm && printk_ratelimit()) {
1318 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1319 task_pid_nr(p), p->comm, cpu);
1320 }
1289 } 1321 }
1290 1322
1291 return dest_cpu; 1323 return dest_cpu;
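
The rewritten select_fallback_rq() above escalates through three states: first look for an allowed and online CPU under the current (cpuset-constrained) mask, then fall back to the cpuset's wider mask, then to every possible CPU, and finally BUG() if even that fails. A standalone model of that escalation (plain C, illustrative only; the real code distinguishes the cpuset fallback from cpu_possible_mask, while this sketch simply widens to all CPUs at each step):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NCPU 4

enum fb_state { FB_CPUSET, FB_POSSIBLE, FB_FAIL };

static int pick_fallback(bool *allowed, const bool *online)
{
        enum fb_state state = FB_CPUSET;
        int cpu;

        for (;;) {
                /* any allowed, online CPU? */
                for (cpu = 0; cpu < NCPU; cpu++)
                        if (allowed[cpu] && online[cpu])
                                return cpu;

                switch (state) {
                case FB_CPUSET:         /* widen to the cpuset fallback ... */
                case FB_POSSIBLE:       /* ... then to every possible CPU   */
                        for (cpu = 0; cpu < NCPU; cpu++)
                                allowed[cpu] = true;
                        state++;
                        break;
                case FB_FAIL:
                        abort();        /* mirrors the BUG() in the kernel  */
                }
        }
}

int main(void)
{
        bool allowed[NCPU] = { false, true, false, false }; /* only CPU1 allowed   */
        bool online[NCPU]  = { true, false, true,  false }; /* but CPU1 is offline */

        printf("fallback CPU: %d\n", pick_fallback(allowed, online));
        return 0;
}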
@@ -1507,7 +1539,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1507} 1539}
1508#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1540#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1509 1541
1510static inline int ttwu_share_cache(int this_cpu, int that_cpu) 1542bool cpus_share_cache(int this_cpu, int that_cpu)
1511{ 1543{
1512 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1544 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1513} 1545}
@@ -1518,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1518 struct rq *rq = cpu_rq(cpu); 1550 struct rq *rq = cpu_rq(cpu);
1519 1551
1520#if defined(CONFIG_SMP) 1552#if defined(CONFIG_SMP)
1521 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { 1553 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1522 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1554 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1523 ttwu_queue_remote(p, cpu); 1555 ttwu_queue_remote(p, cpu);
1524 return; 1556 return;
@@ -1932,7 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1932 local_irq_enable(); 1964 local_irq_enable();
1933#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1965#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1934 finish_lock_switch(rq, prev); 1966 finish_lock_switch(rq, prev);
1935 trace_sched_stat_sleeptime(current, rq->clock); 1967 finish_arch_post_lock_switch();
1936 1968
1937 fire_sched_in_preempt_notifiers(current); 1969 fire_sched_in_preempt_notifiers(current);
1938 if (mm) 1970 if (mm)
@@ -2267,13 +2299,10 @@ calc_load_n(unsigned long load, unsigned long exp,
2267 * Once we've updated the global active value, we need to apply the exponential 2299 * Once we've updated the global active value, we need to apply the exponential
2268 * weights adjusted to the number of cycles missed. 2300 * weights adjusted to the number of cycles missed.
2269 */ 2301 */
2270static void calc_global_nohz(unsigned long ticks) 2302static void calc_global_nohz(void)
2271{ 2303{
2272 long delta, active, n; 2304 long delta, active, n;
2273 2305
2274 if (time_before(jiffies, calc_load_update))
2275 return;
2276
2277 /* 2306 /*
2278 * If we crossed a calc_load_update boundary, make sure to fold 2307 * If we crossed a calc_load_update boundary, make sure to fold
2279 * any pending idle changes, the respective CPUs might have 2308 * any pending idle changes, the respective CPUs might have
@@ -2285,31 +2314,25 @@ static void calc_global_nohz(unsigned long ticks)
2285 atomic_long_add(delta, &calc_load_tasks); 2314 atomic_long_add(delta, &calc_load_tasks);
2286 2315
2287 /* 2316 /*
2288 * If we were idle for multiple load cycles, apply them. 2317 * It could be the one fold was all it took, we're done!
2289 */ 2318 */
2290 if (ticks >= LOAD_FREQ) { 2319 if (time_before(jiffies, calc_load_update + 10))
2291 n = ticks / LOAD_FREQ; 2320 return;
2292 2321
2293 active = atomic_long_read(&calc_load_tasks); 2322 /*
2294 active = active > 0 ? active * FIXED_1 : 0; 2323 * Catch-up, fold however many we are behind still
2324 */
2325 delta = jiffies - calc_load_update - 10;
2326 n = 1 + (delta / LOAD_FREQ);
2295 2327
2296 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2328 active = atomic_long_read(&calc_load_tasks);
2297 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2329 active = active > 0 ? active * FIXED_1 : 0;
2298 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2299 2330
2300 calc_load_update += n * LOAD_FREQ; 2331 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2301 } 2332 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2333 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2302 2334
2303 /* 2335 calc_load_update += n * LOAD_FREQ;
2304 * Its possible the remainder of the above division also crosses
2305 * a LOAD_FREQ period, the regular check in calc_global_load()
2306 * which comes after this will take care of that.
2307 *
2308 * Consider us being 11 ticks before a cycle completion, and us
2309 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
2310 * age us 4 cycles, and the test in calc_global_load() will
2311 * pick up the final one.
2312 */
2313} 2336}
2314#else 2337#else
2315void calc_load_account_idle(struct rq *this_rq) 2338void calc_load_account_idle(struct rq *this_rq)
@@ -2321,7 +2344,7 @@ static inline long calc_load_fold_idle(void)
2321 return 0; 2344 return 0;
2322} 2345}
2323 2346
2324static void calc_global_nohz(unsigned long ticks) 2347static void calc_global_nohz(void)
2325{ 2348{
2326} 2349}
2327#endif 2350#endif
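
The reworked calc_global_nohz() above no longer receives a tick count; it derives how many LOAD_FREQ periods were missed from jiffies (n = 1 + delta/LOAD_FREQ) and ages the three load averages by that many periods in one go via calc_load_n(). A standalone sketch of the underlying fixed-point arithmetic (FIXED_1 = 2048 and EXP_1 = 1884 are the values the kernel uses for the 1-minute average, to the best of my reading; aging n periods is equivalent to applying the single-period decay n times):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)  /* 2048: represents 1.0 in fixed point    */
#define EXP_1   1884             /* ~exp(-5s/1min) in the same fixed point */

/* One LOAD_FREQ period of decay toward 'active' (mirrors calc_load()). */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun = 4 * FIXED_1;  /* 1-min load average of 4.00 */
        unsigned long n = 12;                 /* periods missed while idle  */
        unsigned long i;

        /* calc_load_n(load, exp, 0, n) boils down to n single steps */
        for (i = 0; i < n; i++)
                avenrun = calc_load(avenrun, EXP_1, 0);

        printf("1-min load after %lu idle periods: %lu.%02lu\n", n,
               avenrun >> FSHIFT,
               ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}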
@@ -2349,8 +2372,6 @@ void calc_global_load(unsigned long ticks)
2349{ 2372{
2350 long active; 2373 long active;
2351 2374
2352 calc_global_nohz(ticks);
2353
2354 if (time_before(jiffies, calc_load_update + 10)) 2375 if (time_before(jiffies, calc_load_update + 10))
2355 return; 2376 return;
2356 2377
@@ -2362,6 +2383,16 @@ void calc_global_load(unsigned long ticks)
2362 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2383 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2363 2384
2364 calc_load_update += LOAD_FREQ; 2385 calc_load_update += LOAD_FREQ;
2386
2387 /*
2388 * Account one period with whatever state we found before
2389 * folding in the nohz state and ageing the entire idle period.
2390 *
 2391 * This avoids losing a sample when we go idle between
2392 * calc_load_account_active() (10 ticks ago) and now and thus
2393 * under-accounting.
2394 */
2395 calc_global_nohz();
2365} 2396}
2366 2397
2367/* 2398/*
@@ -2756,7 +2787,7 @@ void account_idle_time(cputime_t cputime)
2756static __always_inline bool steal_account_process_tick(void) 2787static __always_inline bool steal_account_process_tick(void)
2757{ 2788{
2758#ifdef CONFIG_PARAVIRT 2789#ifdef CONFIG_PARAVIRT
2759 if (static_branch(&paravirt_steal_enabled)) { 2790 if (static_key_false(&paravirt_steal_enabled)) {
2760 u64 steal, st = 0; 2791 u64 steal, st = 0;
2761 2792
2762 steal = paravirt_steal_clock(smp_processor_id()); 2793 steal = paravirt_steal_clock(smp_processor_id());
@@ -3071,8 +3102,6 @@ EXPORT_SYMBOL(sub_preempt_count);
3071 */ 3102 */
3072static noinline void __schedule_bug(struct task_struct *prev) 3103static noinline void __schedule_bug(struct task_struct *prev)
3073{ 3104{
3074 struct pt_regs *regs = get_irq_regs();
3075
3076 if (oops_in_progress) 3105 if (oops_in_progress)
3077 return; 3106 return;
3078 3107
@@ -3083,11 +3112,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3083 print_modules(); 3112 print_modules();
3084 if (irqs_disabled()) 3113 if (irqs_disabled())
3085 print_irqtrace_events(prev); 3114 print_irqtrace_events(prev);
3086 3115 dump_stack();
3087 if (regs)
3088 show_regs(regs);
3089 else
3090 dump_stack();
3091} 3116}
3092 3117
3093/* 3118/*
@@ -3221,14 +3246,14 @@ need_resched:
3221 3246
3222 post_schedule(rq); 3247 post_schedule(rq);
3223 3248
3224 preempt_enable_no_resched(); 3249 sched_preempt_enable_no_resched();
3225 if (need_resched()) 3250 if (need_resched())
3226 goto need_resched; 3251 goto need_resched;
3227} 3252}
3228 3253
3229static inline void sched_submit_work(struct task_struct *tsk) 3254static inline void sched_submit_work(struct task_struct *tsk)
3230{ 3255{
3231 if (!tsk->state) 3256 if (!tsk->state || tsk_is_pi_blocked(tsk))
3232 return; 3257 return;
3233 /* 3258 /*
3234 * If we are going to sleep and we have plugged IO queued, 3259 * If we are going to sleep and we have plugged IO queued,
@@ -3247,6 +3272,18 @@ asmlinkage void __sched schedule(void)
3247} 3272}
3248EXPORT_SYMBOL(schedule); 3273EXPORT_SYMBOL(schedule);
3249 3274
3275/**
3276 * schedule_preempt_disabled - called with preemption disabled
3277 *
3278 * Returns with preemption disabled. Note: preempt_count must be 1
3279 */
3280void __sched schedule_preempt_disabled(void)
3281{
3282 sched_preempt_enable_no_resched();
3283 schedule();
3284 preempt_disable();
3285}
3286
3250#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3287#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3251 3288
3252static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 3289static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
@@ -3407,9 +3444,9 @@ EXPORT_SYMBOL(__wake_up);
3407/* 3444/*
3408 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3445 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3409 */ 3446 */
3410void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3447void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3411{ 3448{
3412 __wake_up_common(q, mode, 1, 0, NULL); 3449 __wake_up_common(q, mode, nr, 0, NULL);
3413} 3450}
3414EXPORT_SYMBOL_GPL(__wake_up_locked); 3451EXPORT_SYMBOL_GPL(__wake_up_locked);
3415 3452
@@ -3768,6 +3805,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3768 3805
3769 rq = __task_rq_lock(p); 3806 rq = __task_rq_lock(p);
3770 3807
3808 /*
 3809 * Idle task boosting is a no-no in general. There is one
3810 * exception, when PREEMPT_RT and NOHZ is active:
3811 *
3812 * The idle task calls get_next_timer_interrupt() and holds
3813 * the timer wheel base->lock on the CPU and another CPU wants
3814 * to access the timer (probably to cancel it). We can safely
3815 * ignore the boosting request, as the idle CPU runs this code
3816 * with interrupts disabled and will complete the lock
3817 * protected section without being interrupted. So there is no
3818 * real need to boost.
3819 */
3820 if (unlikely(p == rq->idle)) {
3821 WARN_ON(p != rq->curr);
3822 WARN_ON(p->pi_blocked_on);
3823 goto out_unlock;
3824 }
3825
3771 trace_sched_pi_setprio(p, prio); 3826 trace_sched_pi_setprio(p, prio);
3772 oldprio = p->prio; 3827 oldprio = p->prio;
3773 prev_class = p->sched_class; 3828 prev_class = p->sched_class;
@@ -3791,11 +3846,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3791 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 3846 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3792 3847
3793 check_class_changed(rq, p, prev_class, oldprio); 3848 check_class_changed(rq, p, prev_class, oldprio);
3849out_unlock:
3794 __task_rq_unlock(rq); 3850 __task_rq_unlock(rq);
3795} 3851}
3796
3797#endif 3852#endif
3798
3799void set_user_nice(struct task_struct *p, long nice) 3853void set_user_nice(struct task_struct *p, long nice)
3800{ 3854{
3801 int old_prio, delta, on_rq; 3855 int old_prio, delta, on_rq;
@@ -4475,7 +4529,7 @@ SYSCALL_DEFINE0(sched_yield)
4475 __release(rq->lock); 4529 __release(rq->lock);
4476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4530 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4477 do_raw_spin_unlock(&rq->lock); 4531 do_raw_spin_unlock(&rq->lock);
4478 preempt_enable_no_resched(); 4532 sched_preempt_enable_no_resched();
4479 4533
4480 schedule(); 4534 schedule();
4481 4535
@@ -4549,8 +4603,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
4549/** 4603/**
4550 * yield - yield the current processor to other threads. 4604 * yield - yield the current processor to other threads.
4551 * 4605 *
4552 * This is a shortcut for kernel-space yielding - it marks the 4606 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4553 * thread runnable and calls sys_sched_yield(). 4607 *
4608 * The scheduler is at all times free to pick the calling task as the most
4609 * eligible task to run, if removing the yield() call from your code breaks
 4610 * it, it's already broken.
4611 *
4612 * Typical broken usage is:
4613 *
4614 * while (!event)
4615 * yield();
4616 *
4617 * where one assumes that yield() will let 'the other' process run that will
4618 * make event true. If the current task is a SCHED_FIFO task that will never
4619 * happen. Never use yield() as a progress guarantee!!
4620 *
4621 * If you want to use yield() to wait for something, use wait_event().
4622 * If you want to use yield() to be 'nice' for others, use cond_resched().
4623 * If you still want to use yield(), do not!
4554 */ 4624 */
4555void __sched yield(void) 4625void __sched yield(void)
4556{ 4626{
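
The yield() documentation added in the hunk above warns against busy-waiting on a condition with yield() and points at wait_event() and cond_resched() instead. A minimal kernel-style sketch of the waitqueue pairing the comment recommends (hypothetical names; assumes <linux/wait.h>; illustrative only, not part of this patch):

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_event;

static void example_waiter(void)
{
        /* sleeps until the condition holds instead of spinning on yield() */
        wait_event(example_wq, example_event != 0);
}

static void example_producer(void)
{
        example_event = 1;
        wake_up(&example_wq);   /* wakes the sleeper, which re-checks the condition */
}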
@@ -5382,7 +5452,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5382 unsigned long action, void *hcpu) 5452 unsigned long action, void *hcpu)
5383{ 5453{
5384 switch (action & ~CPU_TASKS_FROZEN) { 5454 switch (action & ~CPU_TASKS_FROZEN) {
5385 case CPU_ONLINE: 5455 case CPU_STARTING:
5386 case CPU_DOWN_FAILED: 5456 case CPU_DOWN_FAILED:
5387 set_cpu_active((long)hcpu, true); 5457 set_cpu_active((long)hcpu, true);
5388 return NOTIFY_OK; 5458 return NOTIFY_OK;
@@ -5754,7 +5824,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5754 * 5824 *
5755 * Also keep a unique ID per domain (we use the first cpu number in 5825 * Also keep a unique ID per domain (we use the first cpu number in
5756 * the cpumask of the domain), this allows us to quickly tell if 5826 * the cpumask of the domain), this allows us to quickly tell if
5757 * two cpus are in the same cache domain, see ttwu_share_cache(). 5827 * two cpus are in the same cache domain, see cpus_share_cache().
5758 */ 5828 */
5759DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5829DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5760DEFINE_PER_CPU(int, sd_llc_id); 5830DEFINE_PER_CPU(int, sd_llc_id);
@@ -6931,6 +7001,9 @@ void __init sched_init(void)
6931 rq->online = 0; 7001 rq->online = 0;
6932 rq->idle_stamp = 0; 7002 rq->idle_stamp = 0;
6933 rq->avg_idle = 2*sysctl_sched_migration_cost; 7003 rq->avg_idle = 2*sysctl_sched_migration_cost;
7004
7005 INIT_LIST_HEAD(&rq->cfs_tasks);
7006
6934 rq_attach_root(rq, &def_root_domain); 7007 rq_attach_root(rq, &def_root_domain);
6935#ifdef CONFIG_NO_HZ 7008#ifdef CONFIG_NO_HZ
6936 rq->nohz_flags = 0; 7009 rq->nohz_flags = 0;
@@ -7525,8 +7598,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7525 struct task_group, css); 7598 struct task_group, css);
7526} 7599}
7527 7600
7528static struct cgroup_subsys_state * 7601static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7529cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7530{ 7602{
7531 struct task_group *tg, *parent; 7603 struct task_group *tg, *parent;
7532 7604
@@ -7543,15 +7615,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7543 return &tg->css; 7615 return &tg->css;
7544} 7616}
7545 7617
7546static void 7618static void cpu_cgroup_destroy(struct cgroup *cgrp)
7547cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7548{ 7619{
7549 struct task_group *tg = cgroup_tg(cgrp); 7620 struct task_group *tg = cgroup_tg(cgrp);
7550 7621
7551 sched_destroy_group(tg); 7622 sched_destroy_group(tg);
7552} 7623}
7553 7624
7554static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7625static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7555 struct cgroup_taskset *tset) 7626 struct cgroup_taskset *tset)
7556{ 7627{
7557 struct task_struct *task; 7628 struct task_struct *task;
@@ -7569,7 +7640,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7569 return 0; 7640 return 0;
7570} 7641}
7571 7642
7572static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7643static void cpu_cgroup_attach(struct cgroup *cgrp,
7573 struct cgroup_taskset *tset) 7644 struct cgroup_taskset *tset)
7574{ 7645{
7575 struct task_struct *task; 7646 struct task_struct *task;
@@ -7579,8 +7650,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7579} 7650}
7580 7651
7581static void 7652static void
7582cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7653cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7583 struct cgroup *old_cgrp, struct task_struct *task) 7654 struct task_struct *task)
7584{ 7655{
7585 /* 7656 /*
7586 * cgroup_exit() is called in the copy_process() failure path. 7657 * cgroup_exit() is called in the copy_process() failure path.
@@ -7899,13 +7970,9 @@ static struct cftype cpu_files[] = {
7899 .write_u64 = cpu_rt_period_write_uint, 7970 .write_u64 = cpu_rt_period_write_uint,
7900 }, 7971 },
7901#endif 7972#endif
7973 { } /* terminate */
7902}; 7974};
7903 7975
7904static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7905{
7906 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7907}
7908
7909struct cgroup_subsys cpu_cgroup_subsys = { 7976struct cgroup_subsys cpu_cgroup_subsys = {
7910 .name = "cpu", 7977 .name = "cpu",
7911 .create = cpu_cgroup_create, 7978 .create = cpu_cgroup_create,
@@ -7913,8 +7980,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7913 .can_attach = cpu_cgroup_can_attach, 7980 .can_attach = cpu_cgroup_can_attach,
7914 .attach = cpu_cgroup_attach, 7981 .attach = cpu_cgroup_attach,
7915 .exit = cpu_cgroup_exit, 7982 .exit = cpu_cgroup_exit,
7916 .populate = cpu_cgroup_populate,
7917 .subsys_id = cpu_cgroup_subsys_id, 7983 .subsys_id = cpu_cgroup_subsys_id,
7984 .base_cftypes = cpu_files,
7918 .early_init = 1, 7985 .early_init = 1,
7919}; 7986};
7920 7987
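
The cpu hunks above and the cpuacct hunks below drop the ->populate() callbacks: the cftype array is now terminated by an empty sentinel entry and handed to the cgroup core through ->base_cftypes, which registers the control files itself. A minimal kernel-style sketch of the new pattern (hypothetical controller; assumes the cgroup interface as it appears in this merge; other mandatory fields such as .create/.destroy/.subsys_id are omitted):

#include <linux/cgroup.h>

static u64 example_value_read(struct cgroup *cgrp, struct cftype *cft)
{
        return 0;
}

static struct cftype example_files[] = {
        {
                .name = "value",
                .read_u64 = example_value_read,
        },
        { }     /* terminating entry replaces ARRAY_SIZE()-based ->populate() */
};

struct cgroup_subsys example_subsys = {
        .name = "example",
        .base_cftypes = example_files,  /* files registered by the cgroup core */
};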
@@ -7930,8 +7997,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7930 */ 7997 */
7931 7998
7932/* create a new cpu accounting group */ 7999/* create a new cpu accounting group */
7933static struct cgroup_subsys_state *cpuacct_create( 8000static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
7934 struct cgroup_subsys *ss, struct cgroup *cgrp)
7935{ 8001{
7936 struct cpuacct *ca; 8002 struct cpuacct *ca;
7937 8003
@@ -7961,8 +8027,7 @@ out:
7961} 8027}
7962 8028
7963/* destroy an existing cpu accounting group */ 8029/* destroy an existing cpu accounting group */
7964static void 8030static void cpuacct_destroy(struct cgroup *cgrp)
7965cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7966{ 8031{
7967 struct cpuacct *ca = cgroup_ca(cgrp); 8032 struct cpuacct *ca = cgroup_ca(cgrp);
7968 8033
@@ -8101,13 +8166,9 @@ static struct cftype files[] = {
8101 .name = "stat", 8166 .name = "stat",
8102 .read_map = cpuacct_stats_show, 8167 .read_map = cpuacct_stats_show,
8103 }, 8168 },
8169 { } /* terminate */
8104}; 8170};
8105 8171
8106static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8107{
8108 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8109}
8110
8111/* 8172/*
8112 * charge this task's execution time to its accounting group. 8173 * charge this task's execution time to its accounting group.
8113 * 8174 *
@@ -8139,7 +8200,7 @@ struct cgroup_subsys cpuacct_subsys = {
8139 .name = "cpuacct", 8200 .name = "cpuacct",
8140 .create = cpuacct_create, 8201 .create = cpuacct_create,
8141 .destroy = cpuacct_destroy, 8202 .destroy = cpuacct_destroy,
8142 .populate = cpuacct_populate,
8143 .subsys_id = cpuacct_subsys_id, 8203 .subsys_id = cpuacct_subsys_id,
8204 .base_cftypes = files,
8144}; 8205};
8145#endif /* CONFIG_CGROUP_CPUACCT */ 8206#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a075e10004b..09acaa15161d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu)
288 288
289 P(yld_count); 289 P(yld_count);
290 290
291 P(sched_switch);
292 P(sched_count); 291 P(sched_count);
293 P(sched_goidle); 292 P(sched_goidle);
294#ifdef CONFIG_SMP 293#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669d..0d97ebdc58f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
416 416
417#endif /* CONFIG_FAIR_GROUP_SCHED */ 417#endif /* CONFIG_FAIR_GROUP_SCHED */
418 418
419static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 419static __always_inline
420 unsigned long delta_exec); 420void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
421 421
422/************************************************************** 422/**************************************************************
423 * Scheduling class tree data structure manipulation methods: 423 * Scheduling class tree data structure manipulation methods:
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 776 * Scheduling class queueing methods:
777 */ 777 */
778 778
779#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
780static void
781add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
782{
783 cfs_rq->task_weight += weight;
784}
785#else
786static inline void
787add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
788{
789}
790#endif
791
792static void 779static void
793account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
794{ 781{
795 update_load_add(&cfs_rq->load, se->load.weight); 782 update_load_add(&cfs_rq->load, se->load.weight);
796 if (!parent_entity(se)) 783 if (!parent_entity(se))
797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
798 if (entity_is_task(se)) { 785#ifdef CONFIG_SMP
799 add_cfs_task_weight(cfs_rq, se->load.weight); 786 if (entity_is_task(se))
800 list_add(&se->group_node, &cfs_rq->tasks); 787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
801 } 788#endif
802 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
803} 790}
804 791
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
808 update_load_sub(&cfs_rq->load, se->load.weight); 795 update_load_sub(&cfs_rq->load, se->load.weight);
809 if (!parent_entity(se)) 796 if (!parent_entity(se))
810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 797 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
811 if (entity_is_task(se)) { 798 if (entity_is_task(se))
812 add_cfs_task_weight(cfs_rq, -se->load.weight);
813 list_del_init(&se->group_node); 799 list_del_init(&se->group_node);
814 }
815 cfs_rq->nr_running--; 800 cfs_rq->nr_running--;
816} 801}
817 802
@@ -1003,6 +988,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1003 if (unlikely(delta > se->statistics.sleep_max)) 988 if (unlikely(delta > se->statistics.sleep_max))
1004 se->statistics.sleep_max = delta; 989 se->statistics.sleep_max = delta;
1005 990
991 se->statistics.sleep_start = 0;
1006 se->statistics.sum_sleep_runtime += delta; 992 se->statistics.sum_sleep_runtime += delta;
1007 993
1008 if (tsk) { 994 if (tsk) {
@@ -1019,6 +1005,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1019 if (unlikely(delta > se->statistics.block_max)) 1005 if (unlikely(delta > se->statistics.block_max))
1020 se->statistics.block_max = delta; 1006 se->statistics.block_max = delta;
1021 1007
1008 se->statistics.block_start = 0;
1022 se->statistics.sum_sleep_runtime += delta; 1009 se->statistics.sum_sleep_runtime += delta;
1023 1010
1024 if (tsk) { 1011 if (tsk) {
@@ -1175,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1175 __clear_buddies_skip(se); 1162 __clear_buddies_skip(se);
1176} 1163}
1177 1164
1178static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 1165static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1179 1166
1180static void 1167static void
1181dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1399,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1399#ifdef CONFIG_CFS_BANDWIDTH 1386#ifdef CONFIG_CFS_BANDWIDTH
1400 1387
1401#ifdef HAVE_JUMP_LABEL 1388#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used; 1389static struct static_key __cfs_bandwidth_used;
1403 1390
1404static inline bool cfs_bandwidth_used(void) 1391static inline bool cfs_bandwidth_used(void)
1405{ 1392{
1406 return static_branch(&__cfs_bandwidth_used); 1393 return static_key_false(&__cfs_bandwidth_used);
1407} 1394}
1408 1395
1409void account_cfs_bandwidth_used(int enabled, int was_enabled) 1396void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{ 1397{
1411 /* only need to count groups transitioning between enabled/!enabled */ 1398 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled) 1399 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used); 1400 static_key_slow_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled) 1401 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used); 1402 static_key_slow_dec(&__cfs_bandwidth_used);
1416} 1403}
1417#else /* HAVE_JUMP_LABEL */ 1404#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void) 1405static bool cfs_bandwidth_used(void)
@@ -1559,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1559 resched_task(rq_of(cfs_rq)->curr); 1546 resched_task(rq_of(cfs_rq)->curr);
1560} 1547}
1561 1548
1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1549static __always_inline
1563 unsigned long delta_exec) 1550void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1564{ 1551{
1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1566 return; 1553 return;
@@ -2086,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
2086} 2073}
2087 2074
2088#else /* CONFIG_CFS_BANDWIDTH */ 2075#else /* CONFIG_CFS_BANDWIDTH */
2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2076static __always_inline
2090 unsigned long delta_exec) {} 2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2092static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2093static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2094 2081
2095static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2096{ 2083{
@@ -2670,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2670 /* 2657 /*
2671 * Otherwise, iterate the domains and find an eligible idle cpu. 2658 * Otherwise, iterate the domains and find an eligible idle cpu.
2672 */ 2659 */
2673 rcu_read_lock();
2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target)); 2660 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) { 2661 for_each_lower_domain(sd) {
2677 sg = sd->groups; 2662 sg = sd->groups;
@@ -2693,8 +2678,6 @@ next:
2693 } while (sg != sd->groups); 2678 } while (sg != sd->groups);
2694 } 2679 }
2695done: 2680done:
2696 rcu_read_unlock();
2697
2698 return target; 2681 return target;
2699} 2682}
2700 2683
@@ -2920,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2920 return; 2903 return;
2921 2904
2922 /* 2905 /*
2923 * This is possible from callers such as pull_task(), in which we 2906 * This is possible from callers such as move_task(), in which we
2924 * unconditionally check_preempt_curr() after an enqueue (which may have 2907 * unconditionally check_preempt_curr() after an enqueue (which may have
2925 * lead to a throttle). This both saves work and prevents false 2908 * lead to a throttle). This both saves work and prevents false
2926 * next-buddy nomination below. 2909 * next-buddy nomination below.
@@ -3084,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3084 * Fair scheduling class load-balancing methods: 3067 * Fair scheduling class load-balancing methods:
3085 */ 3068 */
3086 3069
3070static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3071
3072#define LBF_ALL_PINNED 0x01
3073#define LBF_NEED_BREAK 0x02
3074
3075struct lb_env {
3076 struct sched_domain *sd;
3077
3078 int src_cpu;
3079 struct rq *src_rq;
3080
3081 int dst_cpu;
3082 struct rq *dst_rq;
3083
3084 enum cpu_idle_type idle;
3085 long load_move;
3086 unsigned int flags;
3087
3088 unsigned int loop;
3089 unsigned int loop_break;
3090 unsigned int loop_max;
3091};
3092
3087/* 3093/*
3088 * pull_task - move a task from a remote runqueue to the local runqueue. 3094 * move_task - move a task from one runqueue to another runqueue.
3089 * Both runqueues must be locked. 3095 * Both runqueues must be locked.
3090 */ 3096 */
3091static void pull_task(struct rq *src_rq, struct task_struct *p, 3097static void move_task(struct task_struct *p, struct lb_env *env)
3092 struct rq *this_rq, int this_cpu)
3093{ 3098{
3094 deactivate_task(src_rq, p, 0); 3099 deactivate_task(env->src_rq, p, 0);
3095 set_task_cpu(p, this_cpu); 3100 set_task_cpu(p, env->dst_cpu);
3096 activate_task(this_rq, p, 0); 3101 activate_task(env->dst_rq, p, 0);
3097 check_preempt_curr(this_rq, p, 0); 3102 check_preempt_curr(env->dst_rq, p, 0);
3098} 3103}
3099 3104
3100/* 3105/*
@@ -3129,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3129 return delta < (s64)sysctl_sched_migration_cost; 3134 return delta < (s64)sysctl_sched_migration_cost;
3130} 3135}
3131 3136
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
3136#define LBF_ABORT 0x10
3137
3138/* 3137/*
3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3138 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3140 */ 3139 */
3141static 3140static
3142int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3141int can_migrate_task(struct task_struct *p, struct lb_env *env)
3143 struct sched_domain *sd, enum cpu_idle_type idle,
3144 int *lb_flags)
3145{ 3142{
3146 int tsk_cache_hot = 0; 3143 int tsk_cache_hot = 0;
3147 /* 3144 /*
@@ -3150,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3150 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3147 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3151 * 3) are cache-hot on their current CPU. 3148 * 3) are cache-hot on their current CPU.
3152 */ 3149 */
3153 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { 3150 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3154 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3151 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3155 return 0; 3152 return 0;
3156 } 3153 }
3157 *lb_flags &= ~LBF_ALL_PINNED; 3154 env->flags &= ~LBF_ALL_PINNED;
3158 3155
3159 if (task_running(rq, p)) { 3156 if (task_running(env->src_rq, p)) {
3160 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3157 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3161 return 0; 3158 return 0;
3162 } 3159 }
@@ -3167,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3167 * 2) too many balance attempts have failed. 3164 * 2) too many balance attempts have failed.
3168 */ 3165 */
3169 3166
3170 tsk_cache_hot = task_hot(p, rq->clock_task, sd); 3167 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3171 if (!tsk_cache_hot || 3168 if (!tsk_cache_hot ||
3172 sd->nr_balance_failed > sd->cache_nice_tries) { 3169 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3173#ifdef CONFIG_SCHEDSTATS 3170#ifdef CONFIG_SCHEDSTATS
3174 if (tsk_cache_hot) { 3171 if (tsk_cache_hot) {
3175 schedstat_inc(sd, lb_hot_gained[idle]); 3172 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3176 schedstat_inc(p, se.statistics.nr_forced_migrations); 3173 schedstat_inc(p, se.statistics.nr_forced_migrations);
3177 } 3174 }
3178#endif 3175#endif
@@ -3193,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3193 * 3190 *
3194 * Called with both runqueues locked. 3191 * Called with both runqueues locked.
3195 */ 3192 */
3196static int 3193static int move_one_task(struct lb_env *env)
3197move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3198 struct sched_domain *sd, enum cpu_idle_type idle)
3199{ 3194{
3200 struct task_struct *p, *n; 3195 struct task_struct *p, *n;
3201 struct cfs_rq *cfs_rq;
3202 int pinned = 0;
3203 3196
3204 for_each_leaf_cfs_rq(busiest, cfs_rq) { 3197 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3205 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 3198 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3206 if (throttled_lb_pair(task_group(p), 3199 continue;
3207 busiest->cpu, this_cpu))
3208 break;
3209 3200
3210 if (!can_migrate_task(p, busiest, this_cpu, 3201 if (!can_migrate_task(p, env))
3211 sd, idle, &pinned)) 3202 continue;
3212 continue;
3213 3203
3214 pull_task(busiest, p, this_rq, this_cpu); 3204 move_task(p, env);
3215 /* 3205 /*
3216 * Right now, this is only the second place pull_task() 3206 * Right now, this is only the second place move_task()
3217 * is called, so we can safely collect pull_task() 3207 * is called, so we can safely collect move_task()
3218 * stats here rather than inside pull_task(). 3208 * stats here rather than inside move_task().
3219 */ 3209 */
3220 schedstat_inc(sd, lb_gained[idle]); 3210 schedstat_inc(env->sd, lb_gained[env->idle]);
3221 return 1; 3211 return 1;
3222 }
3223 } 3212 }
3224
3225 return 0; 3213 return 0;
3226} 3214}
3227 3215
3228static unsigned long 3216static unsigned long task_h_load(struct task_struct *p);
3229balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3217
3230 unsigned long max_load_move, struct sched_domain *sd, 3218/*
3231 enum cpu_idle_type idle, int *lb_flags, 3219 * move_tasks tries to move up to load_move weighted load from busiest to
3232 struct cfs_rq *busiest_cfs_rq) 3220 * this_rq, as part of a balancing operation within domain "sd".
3221 * Returns 1 if successful and 0 otherwise.
3222 *
3223 * Called with both runqueues locked.
3224 */
3225static int move_tasks(struct lb_env *env)
3233{ 3226{
3234 int loops = 0, pulled = 0; 3227 struct list_head *tasks = &env->src_rq->cfs_tasks;
3235 long rem_load_move = max_load_move; 3228 struct task_struct *p;
3236 struct task_struct *p, *n; 3229 unsigned long load;
3230 int pulled = 0;
3231
3232 if (env->load_move <= 0)
3233 return 0;
3237 3234
3238 if (max_load_move == 0) 3235 while (!list_empty(tasks)) {
3239 goto out; 3236 p = list_first_entry(tasks, struct task_struct, se.group_node);
3240 3237
3241 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3238 env->loop++;
3242 if (loops++ > sysctl_sched_nr_migrate) { 3239 /* We've more or less seen every task there is, call it quits */
3243 *lb_flags |= LBF_NEED_BREAK; 3240 if (env->loop > env->loop_max)
3241 break;
3242
3243 /* take a breather every nr_migrate tasks */
3244 if (env->loop > env->loop_break) {
3245 env->loop_break += sysctl_sched_nr_migrate;
3246 env->flags |= LBF_NEED_BREAK;
3244 break; 3247 break;
3245 } 3248 }
3246 3249
3247 if ((p->se.load.weight >> 1) > rem_load_move || 3250 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3248 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3251 goto next;
3249 lb_flags)) 3252
3250 continue; 3253 load = task_h_load(p);
3254
3255 if (load < 16 && !env->sd->nr_balance_failed)
3256 goto next;
3257
3258 if ((load / 2) > env->load_move)
3259 goto next;
3251 3260
3252 pull_task(busiest, p, this_rq, this_cpu); 3261 if (!can_migrate_task(p, env))
3262 goto next;
3263
3264 move_task(p, env);
3253 pulled++; 3265 pulled++;
3254 rem_load_move -= p->se.load.weight; 3266 env->load_move -= load;
3255 3267
3256#ifdef CONFIG_PREEMPT 3268#ifdef CONFIG_PREEMPT
3257 /* 3269 /*
@@ -3259,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3259 * kernels will stop after the first task is pulled to minimize 3271 * kernels will stop after the first task is pulled to minimize
3260 * the critical section. 3272 * the critical section.
3261 */ 3273 */
3262 if (idle == CPU_NEWLY_IDLE) { 3274 if (env->idle == CPU_NEWLY_IDLE)
3263 *lb_flags |= LBF_ABORT;
3264 break; 3275 break;
3265 }
3266#endif 3276#endif
3267 3277
3268 /* 3278 /*
3269 * We only want to steal up to the prescribed amount of 3279 * We only want to steal up to the prescribed amount of
3270 * weighted load. 3280 * weighted load.
3271 */ 3281 */
3272 if (rem_load_move <= 0) 3282 if (env->load_move <= 0)
3273 break; 3283 break;
3284
3285 continue;
3286next:
3287 list_move_tail(&p->se.group_node, tasks);
3274 } 3288 }
3275out: 3289
3276 /* 3290 /*
3277 * Right now, this is one of only two places pull_task() is called, 3291 * Right now, this is one of only two places move_task() is called,
3278 * so we can safely collect pull_task() stats here rather than 3292 * so we can safely collect move_task() stats here rather than
3279 * inside pull_task(). 3293 * inside move_task().
3280 */ 3294 */
3281 schedstat_add(sd, lb_gained[idle], pulled); 3295 schedstat_add(env->sd, lb_gained[env->idle], pulled);
3282 3296
3283 return max_load_move - rem_load_move; 3297 return pulled;
3284} 3298}
3285 3299
3286#ifdef CONFIG_FAIR_GROUP_SCHED 3300#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3360,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data)
3360 3374
3361static void update_h_load(long cpu) 3375static void update_h_load(long cpu)
3362{ 3376{
3377 rcu_read_lock();
3363 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 3378 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3379 rcu_read_unlock();
3364} 3380}
3365 3381
3366static unsigned long 3382static unsigned long task_h_load(struct task_struct *p)
3367load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3368 unsigned long max_load_move,
3369 struct sched_domain *sd, enum cpu_idle_type idle,
3370 int *lb_flags)
3371{ 3383{
3372 long rem_load_move = max_load_move; 3384 struct cfs_rq *cfs_rq = task_cfs_rq(p);
3373 struct cfs_rq *busiest_cfs_rq; 3385 unsigned long load;
3374
3375 rcu_read_lock();
3376 update_h_load(cpu_of(busiest));
3377
3378 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
3379 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
3380 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3381 u64 rem_load, moved_load;
3382
3383 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3384 break;
3385
3386 /*
3387 * empty group or part of a throttled hierarchy
3388 */
3389 if (!busiest_cfs_rq->task_weight ||
3390 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
3391 continue;
3392
3393 rem_load = (u64)rem_load_move * busiest_weight;
3394 rem_load = div_u64(rem_load, busiest_h_load + 1);
3395
3396 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3397 rem_load, sd, idle, lb_flags,
3398 busiest_cfs_rq);
3399
3400 if (!moved_load)
3401 continue;
3402 3386
3403 moved_load *= busiest_h_load; 3387 load = p->se.load.weight;
3404 moved_load = div_u64(moved_load, busiest_weight + 1); 3388 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3405 3389
3406 rem_load_move -= moved_load; 3390 return load;
3407 if (rem_load_move < 0)
3408 break;
3409 }
3410 rcu_read_unlock();
3411
3412 return max_load_move - rem_load_move;
3413} 3391}
3414#else 3392#else
3415static inline void update_shares(int cpu) 3393static inline void update_shares(int cpu)
3416{ 3394{
3417} 3395}
3418 3396
3419static unsigned long 3397static inline void update_h_load(long cpu)
3420load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3421 unsigned long max_load_move,
3422 struct sched_domain *sd, enum cpu_idle_type idle,
3423 int *lb_flags)
3424{ 3398{
3425 return balance_tasks(this_rq, this_cpu, busiest,
3426 max_load_move, sd, idle, lb_flags,
3427 &busiest->cfs);
3428} 3399}
3429#endif
3430 3400
3431/* 3401static unsigned long task_h_load(struct task_struct *p)
3432 * move_tasks tries to move up to max_load_move weighted load from busiest to
3433 * this_rq, as part of a balancing operation within domain "sd".
3434 * Returns 1 if successful and 0 otherwise.
3435 *
3436 * Called with both runqueues locked.
3437 */
3438static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3439 unsigned long max_load_move,
3440 struct sched_domain *sd, enum cpu_idle_type idle,
3441 int *lb_flags)
3442{ 3402{
3443 unsigned long total_load_moved = 0, load_moved; 3403 return p->se.load.weight;
3444
3445 do {
3446 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3447 max_load_move - total_load_moved,
3448 sd, idle, lb_flags);
3449
3450 total_load_moved += load_moved;
3451
3452 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3453 break;
3454
3455#ifdef CONFIG_PREEMPT
3456 /*
3457 * NEWIDLE balancing is a source of latency, so preemptible
3458 * kernels will stop after the first task is pulled to minimize
3459 * the critical section.
3460 */
3461 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3462 *lb_flags |= LBF_ABORT;
3463 break;
3464 }
3465#endif
3466 } while (load_moved && max_load_move > total_load_moved);
3467
3468 return total_load_moved > 0;
3469} 3404}
3405#endif
3470 3406
3471/********** Helpers for find_busiest_group ************************/ 3407/********** Helpers for find_busiest_group ************************/
3472/* 3408/*
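
With CONFIG_FAIR_GROUP_SCHED, the new task_h_load() above scales a task's own weight by its group's hierarchical load: load = se.load.weight * cfs_rq->h_load / (cfs_rq->load.weight + 1). A small worked example with made-up numbers (1024 is the nice-0 weight):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t task_weight   = 1024;  /* the task's own (nice-0) weight    */
        uint64_t cfs_rq_weight = 3072;  /* total weight queued on its cfs_rq */
        uint64_t h_load        = 2048;  /* the group's load seen at the root */

        uint64_t load = task_weight * h_load / (cfs_rq_weight + 1);

        printf("task_h_load = %llu\n", (unsigned long long)load);  /* 682 */
        return 0;
}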
@@ -3776,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
3776 struct sched_domain *child = sd->child; 3712 struct sched_domain *child = sd->child;
3777 struct sched_group *group, *sdg = sd->groups; 3713 struct sched_group *group, *sdg = sd->groups;
3778 unsigned long power; 3714 unsigned long power;
3715 unsigned long interval;
3716
3717 interval = msecs_to_jiffies(sd->balance_interval);
3718 interval = clamp(interval, 1UL, max_load_balance_interval);
3719 sdg->sgp->next_update = jiffies + interval;
3779 3720
3780 if (!child) { 3721 if (!child) {
3781 update_cpu_power(sd, cpu); 3722 update_cpu_power(sd, cpu);
@@ -3883,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3883 * domains. In the newly idle case, we will allow all the cpu's 3824 * domains. In the newly idle case, we will allow all the cpu's
3884 * to do the newly idle load balance. 3825 * to do the newly idle load balance.
3885 */ 3826 */
3886 if (idle != CPU_NEWLY_IDLE && local_group) { 3827 if (local_group) {
3887 if (balance_cpu != this_cpu) { 3828 if (idle != CPU_NEWLY_IDLE) {
3888 *balance = 0; 3829 if (balance_cpu != this_cpu) {
3889 return; 3830 *balance = 0;
3890 } 3831 return;
3891 update_group_power(sd, this_cpu); 3832 }
3833 update_group_power(sd, this_cpu);
3834 } else if (time_after_eq(jiffies, group->sgp->next_update))
3835 update_group_power(sd, this_cpu);
3892 } 3836 }
3893 3837
3894 /* Adjust by relative CPU power of the group */ 3838 /* Adjust by relative CPU power of the group */
@@ -4451,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4451 struct sched_domain *sd, enum cpu_idle_type idle, 4395 struct sched_domain *sd, enum cpu_idle_type idle,
4452 int *balance) 4396 int *balance)
4453{ 4397{
4454 int ld_moved, lb_flags = 0, active_balance = 0; 4398 int ld_moved, active_balance = 0;
4455 struct sched_group *group; 4399 struct sched_group *group;
4456 unsigned long imbalance; 4400 unsigned long imbalance;
4457 struct rq *busiest; 4401 struct rq *busiest;
4458 unsigned long flags; 4402 unsigned long flags;
4459 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4460 4404
4405 struct lb_env env = {
4406 .sd = sd,
4407 .dst_cpu = this_cpu,
4408 .dst_rq = this_rq,
4409 .idle = idle,
4410 .loop_break = sysctl_sched_nr_migrate,
4411 };
4412
4461 cpumask_copy(cpus, cpu_active_mask); 4413 cpumask_copy(cpus, cpu_active_mask);
4462 4414
4463 schedstat_inc(sd, lb_count[idle]); 4415 schedstat_inc(sd, lb_count[idle]);
@@ -4492,32 +4444,34 @@ redo:
4492 * still unbalanced. ld_moved simply stays zero, so it is 4444 * still unbalanced. ld_moved simply stays zero, so it is
4493 * correctly treated as an imbalance. 4445 * correctly treated as an imbalance.
4494 */ 4446 */
4495 lb_flags |= LBF_ALL_PINNED; 4447 env.flags |= LBF_ALL_PINNED;
4448 env.load_move = imbalance;
4449 env.src_cpu = busiest->cpu;
4450 env.src_rq = busiest;
4451 env.loop_max = busiest->nr_running;
4452
4453more_balance:
4496 local_irq_save(flags); 4454 local_irq_save(flags);
4497 double_rq_lock(this_rq, busiest); 4455 double_rq_lock(this_rq, busiest);
4498 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4456 if (!env.loop)
4499 imbalance, sd, idle, &lb_flags); 4457 update_h_load(env.src_cpu);
4458 ld_moved += move_tasks(&env);
4500 double_rq_unlock(this_rq, busiest); 4459 double_rq_unlock(this_rq, busiest);
4501 local_irq_restore(flags); 4460 local_irq_restore(flags);
4502 4461
4462 if (env.flags & LBF_NEED_BREAK) {
4463 env.flags &= ~LBF_NEED_BREAK;
4464 goto more_balance;
4465 }
4466
4503 /* 4467 /*
4504 * some other cpu did the load balance for us. 4468 * some other cpu did the load balance for us.
4505 */ 4469 */
4506 if (ld_moved && this_cpu != smp_processor_id()) 4470 if (ld_moved && this_cpu != smp_processor_id())
4507 resched_cpu(this_cpu); 4471 resched_cpu(this_cpu);
4508 4472
4509 if (lb_flags & LBF_ABORT)
4510 goto out_balanced;
4511
4512 if (lb_flags & LBF_NEED_BREAK) {
4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4516 goto redo;
4517 }
4518
4519 /* All tasks on this runqueue were pinned by CPU affinity */ 4473 /* All tasks on this runqueue were pinned by CPU affinity */
4520 if (unlikely(lb_flags & LBF_ALL_PINNED)) { 4474 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4521 cpumask_clear_cpu(cpu_of(busiest), cpus); 4475 cpumask_clear_cpu(cpu_of(busiest), cpus);
4522 if (!cpumask_empty(cpus)) 4476 if (!cpumask_empty(cpus))
4523 goto redo; 4477 goto redo;
@@ -4547,7 +4501,7 @@ redo:
4547 tsk_cpus_allowed(busiest->curr))) { 4501 tsk_cpus_allowed(busiest->curr))) {
4548 raw_spin_unlock_irqrestore(&busiest->lock, 4502 raw_spin_unlock_irqrestore(&busiest->lock,
4549 flags); 4503 flags);
4550 lb_flags |= LBF_ALL_PINNED; 4504 env.flags |= LBF_ALL_PINNED;
4551 goto out_one_pinned; 4505 goto out_one_pinned;
4552 } 4506 }
4553 4507
@@ -4600,7 +4554,7 @@ out_balanced:
4600 4554
4601out_one_pinned: 4555out_one_pinned:
4602 /* tune up the balancing interval */ 4556 /* tune up the balancing interval */
4603 if (((lb_flags & LBF_ALL_PINNED) && 4557 if (((env.flags & LBF_ALL_PINNED) &&
4604 sd->balance_interval < MAX_PINNED_INTERVAL) || 4558 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4605 (sd->balance_interval < sd->max_interval)) 4559 (sd->balance_interval < sd->max_interval))
4606 sd->balance_interval *= 2; 4560 sd->balance_interval *= 2;
@@ -4710,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data)
4710 } 4664 }
4711 4665
4712 if (likely(sd)) { 4666 if (likely(sd)) {
4667 struct lb_env env = {
4668 .sd = sd,
4669 .dst_cpu = target_cpu,
4670 .dst_rq = target_rq,
4671 .src_cpu = busiest_rq->cpu,
4672 .src_rq = busiest_rq,
4673 .idle = CPU_IDLE,
4674 };
4675
4713 schedstat_inc(sd, alb_count); 4676 schedstat_inc(sd, alb_count);
4714 4677
4715 if (move_one_task(target_rq, target_cpu, busiest_rq, 4678 if (move_one_task(&env))
4716 sd, CPU_IDLE))
4717 schedstat_inc(sd, alb_pushed); 4679 schedstat_inc(sd, alb_pushed);
4718 else 4680 else
4719 schedstat_inc(sd, alb_failed); 4681 schedstat_inc(sd, alb_failed);
@@ -4945,8 +4907,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4945 4907
4946static DEFINE_SPINLOCK(balancing); 4908static DEFINE_SPINLOCK(balancing);
4947 4909
4948static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4949
4950/* 4910/*
4951 * Scale the max load_balance interval with the number of CPUs in the system. 4911 * Scale the max load_balance interval with the number of CPUs in the system.
4952 * This trades load-balance latency on larger machines for less cross talk. 4912 * This trades load-balance latency on larger machines for less cross talk.
@@ -5340,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq)
5340void init_cfs_rq(struct cfs_rq *cfs_rq) 5300void init_cfs_rq(struct cfs_rq *cfs_rq)
5341{ 5301{
5342 cfs_rq->tasks_timeline = RB_ROOT; 5302 cfs_rq->tasks_timeline = RB_ROOT;
5343 INIT_LIST_HEAD(&cfs_rq->tasks);
5344 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 5303 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5345#ifndef CONFIG_64BIT 5304#ifndef CONFIG_64BIT
5346 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5305 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -5612,6 +5571,7 @@ __init void init_sched_fair_class(void)
5612 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 5571 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5613 5572
5614#ifdef CONFIG_NO_HZ 5573#ifdef CONFIG_NO_HZ
5574 nohz.next_balance = jiffies;
5615 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 5575 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5616 cpu_notifier(sched_ilb_notifier, 0); 5576 cpu_notifier(sched_ilb_notifier, 0);
5617#endif 5577#endif
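The fair.c hunks above fold the long move_tasks()/load_balance() argument lists into a single struct lb_env that load_balance() fills in once and hands down the call chain, retrying with the same env when LBF_NEED_BREAK is set. A minimal sketch of that pattern; the struct below is only an illustrative subset of the fields visible in the initializers above, and pull_imbalance() is a hypothetical caller, not a function in the patch:

	/* Illustrative subset of the balancing environment used above. */
	struct lb_env {
		struct sched_domain	*sd;		/* domain being balanced */
		struct rq		*src_rq, *dst_rq;
		int			src_cpu, dst_cpu;
		enum cpu_idle_type	idle;
		unsigned long		load_move;	/* imbalance left to pull */
		unsigned int		loop, loop_break, loop_max;
		unsigned int		flags;		/* LBF_* bits */
	};

	/* One env replaces the seven separate parameters of the old call. */
	static int pull_imbalance(struct rq *dst_rq, int dst_cpu, struct rq *src_rq,
				  struct sched_domain *sd, enum cpu_idle_type idle,
				  unsigned long imbalance)
	{
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= dst_cpu,
			.dst_rq		= dst_rq,
			.src_cpu	= cpu_of(src_rq),
			.src_rq		= src_rq,
			.idle		= idle,
			.load_move	= imbalance,
			.loop_break	= sysctl_sched_nr_migrate,
			.loop_max	= src_rq->nr_running,
		};

		return move_tasks(&env);
	}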
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7fb5ec5..44af55e6d5d0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
778 778
779static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 779static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
780{ 780{
781 int i, idle = 1; 781 int i, idle = 1, throttled = 0;
782 const struct cpumask *span; 782 const struct cpumask *span;
783 783
784 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
785 return 1;
786
787 span = sched_rt_period_mask(); 784 span = sched_rt_period_mask();
788 for_each_cpu(i, span) { 785 for_each_cpu(i, span) {
789 int enqueue = 0; 786 int enqueue = 0;
@@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
818 if (!rt_rq_throttled(rt_rq)) 815 if (!rt_rq_throttled(rt_rq))
819 enqueue = 1; 816 enqueue = 1;
820 } 817 }
818 if (rt_rq->rt_throttled)
819 throttled = 1;
821 820
822 if (enqueue) 821 if (enqueue)
823 sched_rt_rq_enqueue(rt_rq); 822 sched_rt_rq_enqueue(rt_rq);
824 raw_spin_unlock(&rq->lock); 823 raw_spin_unlock(&rq->lock);
825 } 824 }
826 825
826 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
827 return 1;
828
827 return idle; 829 return idle;
828} 830}
829 831
@@ -855,8 +857,30 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
855 return 0; 857 return 0;
856 858
857 if (rt_rq->rt_time > runtime) { 859 if (rt_rq->rt_time > runtime) {
858 rt_rq->rt_throttled = 1; 860 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
859 printk_once(KERN_WARNING "sched: RT throttling activated\n"); 861
862 /*
863 * Don't actually throttle groups that have no runtime assigned
864 * but accrue some time due to boosting.
865 */
866 if (likely(rt_b->rt_runtime)) {
867 static bool once = false;
868
869 rt_rq->rt_throttled = 1;
870
871 if (!once) {
872 once = true;
873 printk_sched("sched: RT throttling activated\n");
874 }
875 } else {
876 /*
877 * In case we did anyway, make it go away,
878 * replenishment is a joke, since it will replenish us
879 * with exactly 0 ns.
880 */
881 rt_rq->rt_time = 0;
882 }
883
860 if (rt_rq_throttled(rt_rq)) { 884 if (rt_rq_throttled(rt_rq)) {
861 sched_rt_rq_dequeue(rt_rq); 885 sched_rt_rq_dequeue(rt_rq);
862 return 1; 886 return 1;
@@ -884,7 +908,8 @@ static void update_curr_rt(struct rq *rq)
884 if (unlikely((s64)delta_exec < 0)) 908 if (unlikely((s64)delta_exec < 0))
885 delta_exec = 0; 909 delta_exec = 0;
886 910
887 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); 911 schedstat_set(curr->se.statistics.exec_max,
912 max(curr->se.statistics.exec_max, delta_exec));
888 913
889 curr->se.sum_exec_runtime += delta_exec; 914 curr->se.sum_exec_runtime += delta_exec;
890 account_group_exec_runtime(curr, delta_exec); 915 account_group_exec_runtime(curr, delta_exec);
@@ -1403,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1403next_idx: 1428next_idx:
1404 if (idx >= MAX_RT_PRIO) 1429 if (idx >= MAX_RT_PRIO)
1405 continue; 1430 continue;
1406 if (next && next->prio < idx) 1431 if (next && next->prio <= idx)
1407 continue; 1432 continue;
1408 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1433 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1409 struct task_struct *p; 1434 struct task_struct *p;
@@ -1972,7 +1997,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1972 if (--p->rt.time_slice) 1997 if (--p->rt.time_slice)
1973 return; 1998 return;
1974 1999
1975 p->rt.time_slice = DEF_TIMESLICE; 2000 p->rt.time_slice = RR_TIMESLICE;
1976 2001
1977 /* 2002 /*
1978 * Requeue to the end of queue if we are not the only element 2003 * Requeue to the end of queue if we are not the only element
@@ -2000,7 +2025,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2000 * Time slice is 0 for SCHED_FIFO tasks 2025 * Time slice is 0 for SCHED_FIFO tasks
2001 */ 2026 */
2002 if (task->policy == SCHED_RR) 2027 if (task->policy == SCHED_RR)
2003 return DEF_TIMESLICE; 2028 return RR_TIMESLICE;
2004 else 2029 else
2005 return 0; 2030 return 0;
2006} 2031}
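The DEF_TIMESLICE to RR_TIMESLICE switch above is visible from user space through sched_rr_get_interval(), which for SCHED_RR tasks reports the value returned by get_rr_interval_rt(). A small self-contained check; run it with enough privilege for sched_setscheduler(), and note that the priority of 1 is arbitrary:

	#include <sched.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 1 };
		struct timespec ts;

		/* Switch to SCHED_RR so the kernel reports the round-robin slice. */
		if (sched_setscheduler(0, SCHED_RR, &sp))
			perror("sched_setscheduler");

		if (sched_rr_get_interval(0, &ts))
			perror("sched_rr_get_interval");
		else
			printf("RR timeslice: %ld.%09ld s\n",
			       (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}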
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98c0c2623db8..fb3acba4d52e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running;
36 36
37/* 37/*
38 * These are the 'tuning knobs' of the scheduler: 38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */ 39 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44 40
45/* 41/*
46 * single value that denotes runtime == period, ie unlimited time. 42 * single value that denotes runtime == period, ie unlimited time.
@@ -216,9 +212,6 @@ struct cfs_rq {
216 struct rb_root tasks_timeline; 212 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost; 213 struct rb_node *rb_leftmost;
218 214
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /* 215 /*
223 * 'curr' points to currently running entity on this cfs_rq. 216 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running). 217 * It is set to NULL otherwise (i.e when none are currently running).
@@ -246,11 +239,6 @@ struct cfs_rq {
246 239
247#ifdef CONFIG_SMP 240#ifdef CONFIG_SMP
248 /* 241 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg) 242 * h_load = weight * f(tg)
255 * 243 *
256 * Where f(tg) is the recursive weight fraction assigned to 244 * Where f(tg) is the recursive weight fraction assigned to
@@ -424,6 +412,8 @@ struct rq {
424 int cpu; 412 int cpu;
425 int online; 413 int online;
426 414
415 struct list_head cfs_tasks;
416
427 u64 rt_avg; 417 u64 rt_avg;
428 u64 age_stamp; 418 u64 age_stamp;
429 u64 idle_stamp; 419 u64 idle_stamp;
@@ -462,7 +452,6 @@ struct rq {
462 unsigned int yld_count; 452 unsigned int yld_count;
463 453
464 /* schedule() stats */ 454 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count; 455 unsigned int sched_count;
467 unsigned int sched_goidle; 456 unsigned int sched_goidle;
468 457
@@ -611,7 +600,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 600 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */ 601 */
613#ifdef CONFIG_SCHED_DEBUG 602#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h> 603# include <linux/static_key.h>
615# define const_debug __read_mostly 604# define const_debug __read_mostly
616#else 605#else
617# define const_debug const 606# define const_debug const
@@ -630,18 +619,18 @@ enum {
630#undef SCHED_FEAT 619#undef SCHED_FEAT
631 620
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 621#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key) 622static __always_inline bool static_branch__true(struct static_key *key)
634{ 623{
635 return likely(static_branch(key)); /* Not out of line branch. */ 624 return static_key_true(key); /* Not out of line branch. */
636} 625}
637 626
638static __always_inline bool static_branch__false(struct jump_label_key *key) 627static __always_inline bool static_branch__false(struct static_key *key)
639{ 628{
640 return unlikely(static_branch(key)); /* Out of line branch. */ 629 return static_key_false(key); /* Out of line branch. */
641} 630}
642 631
643#define SCHED_FEAT(name, enabled) \ 632#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \ 633static __always_inline bool static_branch_##name(struct static_key *key) \
645{ \ 634{ \
646 return static_branch__##enabled(key); \ 635 return static_branch__##enabled(key); \
647} 636}
@@ -650,7 +639,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \
650 639
651#undef SCHED_FEAT 640#undef SCHED_FEAT
652 641
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; 642extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) 643#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ 644#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 645#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
@@ -692,6 +681,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
692#ifndef finish_arch_switch 681#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0) 682# define finish_arch_switch(prev) do { } while (0)
694#endif 683#endif
684#ifndef finish_arch_post_lock_switch
685# define finish_arch_post_lock_switch() do { } while (0)
686#endif
695 687
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW 688#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 689static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
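The sched.h hunks above are part of the jump_label to static_key rename: sched_feat() now tests a struct static_key via static_key_true()/static_key_false(). A module-style sketch of the same primitive, assuming a jump-label-capable build; the key, function, and module names are made up for illustration:

	#include <linux/module.h>
	#include <linux/static_key.h>

	/* Rarely-toggled flag compiled into the hot path as a patched branch. */
	static struct static_key my_feature = STATIC_KEY_INIT_FALSE;

	static void hot_path(void)
	{
		/* Out-of-line branch while the key is disabled, like the
		 * static_branch__false() wrapper above. */
		if (static_key_false(&my_feature))
			pr_info("my_feature enabled\n");
	}

	static int __init my_init(void)
	{
		hot_path();				/* branch not taken */
		static_key_slow_inc(&my_feature);	/* patch the branch in */
		hot_path();				/* branch taken */
		static_key_slow_dec(&my_feature);
		return 0;
	}

	static void __exit my_exit(void) { }

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");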
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 2a581ba8e190..903ffa9e8872 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
32 32
33 /* runqueue-specific stats */ 33 /* runqueue-specific stats */
34 seq_printf(seq, 34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu", 35 "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count, 36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle, 37 rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local, 38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time, 39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
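With rq->sched_switch gone, the per-cpu line in /proc/schedstat keeps its field count by printing a literal 0 in the second position, so existing parsers stay aligned. A small user-space reader that follows the format string shown above; the variable names only describe the field order:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/schedstat", "r");
		char line[512];
		unsigned int cpu, yld, zero, sched, goidle, ttwu, ttwu_local;
		unsigned long long cpu_time, run_delay;
		unsigned long pcount;

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			/* matches "cpu%d %u 0 %u %u %u %u %llu %llu %lu" */
			if (sscanf(line, "cpu%u %u %u %u %u %u %u %llu %llu %lu",
				   &cpu, &yld, &zero, &sched, &goidle, &ttwu,
				   &ttwu_local, &cpu_time, &run_delay, &pcount) == 10)
				printf("cpu%u: %u schedules, %u wakeups\n",
				       cpu, sched, ttwu);
		}
		fclose(f);
		return 0;
	}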
diff --git a/kernel/signal.c b/kernel/signal.c
index c73c4284160e..17afcaf582d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -36,6 +36,7 @@
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include <asm/unistd.h> 37#include <asm/unistd.h>
38#include <asm/siginfo.h> 38#include <asm/siginfo.h>
39#include <asm/cacheflush.h>
39#include "audit.h" /* audit_signal_info() */ 40#include "audit.h" /* audit_signal_info() */
40 41
41/* 42/*
@@ -58,21 +59,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
58 (handler == SIG_DFL && sig_kernel_ignore(sig)); 59 (handler == SIG_DFL && sig_kernel_ignore(sig));
59} 60}
60 61
61static int sig_task_ignored(struct task_struct *t, int sig, 62static int sig_task_ignored(struct task_struct *t, int sig, bool force)
62 int from_ancestor_ns)
63{ 63{
64 void __user *handler; 64 void __user *handler;
65 65
66 handler = sig_handler(t, sig); 66 handler = sig_handler(t, sig);
67 67
68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
69 handler == SIG_DFL && !from_ancestor_ns) 69 handler == SIG_DFL && !force)
70 return 1; 70 return 1;
71 71
72 return sig_handler_ignored(handler, sig); 72 return sig_handler_ignored(handler, sig);
73} 73}
74 74
75static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) 75static int sig_ignored(struct task_struct *t, int sig, bool force)
76{ 76{
77 /* 77 /*
78 * Blocked signals are never ignored, since the 78 * Blocked signals are never ignored, since the
@@ -82,7 +82,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
83 return 0; 83 return 0;
84 84
85 if (!sig_task_ignored(t, sig, from_ancestor_ns)) 85 if (!sig_task_ignored(t, sig, force))
86 return 0; 86 return 0;
87 87
88 /* 88 /*
@@ -855,7 +855,7 @@ static void ptrace_trap_notify(struct task_struct *t)
855 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
856 * it should be dropped. 856 * it should be dropped.
857 */ 857 */
858static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) 858static int prepare_signal(int sig, struct task_struct *p, bool force)
859{ 859{
860 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
861 struct task_struct *t; 861 struct task_struct *t;
@@ -915,7 +915,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
915 } 915 }
916 } 916 }
917 917
918 return !sig_ignored(p, sig, from_ancestor_ns); 918 return !sig_ignored(p, sig, force);
919} 919}
920 920
921/* 921/*
@@ -1054,13 +1054,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1054 struct sigpending *pending; 1054 struct sigpending *pending;
1055 struct sigqueue *q; 1055 struct sigqueue *q;
1056 int override_rlimit; 1056 int override_rlimit;
1057 1057 int ret = 0, result;
1058 trace_signal_generate(sig, info, t);
1059 1058
1060 assert_spin_locked(&t->sighand->siglock); 1059 assert_spin_locked(&t->sighand->siglock);
1061 1060
1062 if (!prepare_signal(sig, t, from_ancestor_ns)) 1061 result = TRACE_SIGNAL_IGNORED;
1063 return 0; 1062 if (!prepare_signal(sig, t,
1063 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1064 goto ret;
1064 1065
1065 pending = group ? &t->signal->shared_pending : &t->pending; 1066 pending = group ? &t->signal->shared_pending : &t->pending;
1066 /* 1067 /*
@@ -1068,8 +1069,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1068 * exactly one non-rt signal, so that we can get more 1069 * exactly one non-rt signal, so that we can get more
1069 * detailed information about the cause of the signal. 1070 * detailed information about the cause of the signal.
1070 */ 1071 */
1072 result = TRACE_SIGNAL_ALREADY_PENDING;
1071 if (legacy_queue(pending, sig)) 1073 if (legacy_queue(pending, sig))
1072 return 0; 1074 goto ret;
1075
1076 result = TRACE_SIGNAL_DELIVERED;
1073 /* 1077 /*
1074 * fast-pathed signals for kernel-internal things like SIGSTOP 1078 * fast-pathed signals for kernel-internal things like SIGSTOP
1075 * or SIGKILL. 1079 * or SIGKILL.
@@ -1127,14 +1131,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1127 * signal was rt and sent by user using something 1131 * signal was rt and sent by user using something
1128 * other than kill(). 1132 * other than kill().
1129 */ 1133 */
1130 trace_signal_overflow_fail(sig, group, info); 1134 result = TRACE_SIGNAL_OVERFLOW_FAIL;
1131 return -EAGAIN; 1135 ret = -EAGAIN;
1136 goto ret;
1132 } else { 1137 } else {
1133 /* 1138 /*
1134 * This is a silent loss of information. We still 1139 * This is a silent loss of information. We still
1135 * send the signal, but the *info bits are lost. 1140 * send the signal, but the *info bits are lost.
1136 */ 1141 */
1137 trace_signal_lose_info(sig, group, info); 1142 result = TRACE_SIGNAL_LOSE_INFO;
1138 } 1143 }
1139 } 1144 }
1140 1145
@@ -1142,7 +1147,9 @@ out_set:
1142 signalfd_notify(t, sig); 1147 signalfd_notify(t, sig);
1143 sigaddset(&pending->signal, sig); 1148 sigaddset(&pending->signal, sig);
1144 complete_signal(sig, t, group); 1149 complete_signal(sig, t, group);
1145 return 0; 1150ret:
1151 trace_signal_generate(sig, info, t, group, result);
1152 return ret;
1146} 1153}
1147 1154
1148static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 1155static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
@@ -1585,7 +1592,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1585 int sig = q->info.si_signo; 1592 int sig = q->info.si_signo;
1586 struct sigpending *pending; 1593 struct sigpending *pending;
1587 unsigned long flags; 1594 unsigned long flags;
1588 int ret; 1595 int ret, result;
1589 1596
1590 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1597 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1591 1598
@@ -1594,7 +1601,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1594 goto ret; 1601 goto ret;
1595 1602
1596 ret = 1; /* the signal is ignored */ 1603 ret = 1; /* the signal is ignored */
1597 if (!prepare_signal(sig, t, 0)) 1604 result = TRACE_SIGNAL_IGNORED;
1605 if (!prepare_signal(sig, t, false))
1598 goto out; 1606 goto out;
1599 1607
1600 ret = 0; 1608 ret = 0;
@@ -1605,6 +1613,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1605 */ 1613 */
1606 BUG_ON(q->info.si_code != SI_TIMER); 1614 BUG_ON(q->info.si_code != SI_TIMER);
1607 q->info.si_overrun++; 1615 q->info.si_overrun++;
1616 result = TRACE_SIGNAL_ALREADY_PENDING;
1608 goto out; 1617 goto out;
1609 } 1618 }
1610 q->info.si_overrun = 0; 1619 q->info.si_overrun = 0;
@@ -1614,7 +1623,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1614 list_add_tail(&q->list, &pending->list); 1623 list_add_tail(&q->list, &pending->list);
1615 sigaddset(&pending->signal, sig); 1624 sigaddset(&pending->signal, sig);
1616 complete_signal(sig, t, group); 1625 complete_signal(sig, t, group);
1626 result = TRACE_SIGNAL_DELIVERED;
1617out: 1627out:
1628 trace_signal_generate(sig, &q->info, t, group, result);
1618 unlock_task_sighand(t, &flags); 1629 unlock_task_sighand(t, &flags);
1619ret: 1630ret:
1620 return ret; 1631 return ret;
@@ -1642,6 +1653,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1642 BUG_ON(!tsk->ptrace && 1653 BUG_ON(!tsk->ptrace &&
1643 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1654 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1644 1655
1656 if (sig != SIGCHLD) {
1657 /*
1658 * This is only possible if parent == real_parent.
1659 * Check if it has changed security domain.
1660 */
1661 if (tsk->parent_exec_id != tsk->parent->self_exec_id)
1662 sig = SIGCHLD;
1663 }
1664
1645 info.si_signo = sig; 1665 info.si_signo = sig;
1646 info.si_errno = 0; 1666 info.si_errno = 0;
1647 /* 1667 /*
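The do_notify_parent() hunk above only matters when a child was created with a non-SIGCHLD exit signal and the parent has since execve()d into a different security domain; in that case the custom signal is downgraded to plain SIGCHLD. For context, this is how such a child is created from user space (illustrative only: the 64 KiB stack, SIGUSR1, and a downward-growing stack are assumptions of the sketch):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/wait.h>

	static int child_fn(void *arg)
	{
		(void)arg;
		return 0;
	}

	int main(void)
	{
		char *stack = malloc(64 * 1024);
		pid_t pid;

		if (!stack)
			return 1;
		/* Ignore the custom exit signal so it doesn't kill the parent. */
		signal(SIGUSR1, SIG_IGN);
		/* The low byte of the clone() flags is the signal sent to the
		 * parent on exit -- SIGUSR1 here instead of SIGCHLD.  The stack
		 * top is passed assuming the stack grows downward. */
		pid = clone(child_fn, stack + 64 * 1024, SIGUSR1, NULL);
		if (pid == -1)
			perror("clone");
		else
			waitpid(pid, NULL, __WALL);  /* __WALL: non-SIGCHLD exit signal */
		free(stack);
		return 0;
	}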
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d60489b..2f8b10ecf759 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
701 return ret; 701 return ret;
702} 702}
703EXPORT_SYMBOL(on_each_cpu); 703EXPORT_SYMBOL(on_each_cpu);
704
705/**
706 * on_each_cpu_mask(): Run a function on processors specified by
707 * cpumask, which may include the local processor.
708 * @mask: The set of cpus to run on (only runs on online subset).
709 * @func: The function to run. This must be fast and non-blocking.
710 * @info: An arbitrary pointer to pass to the function.
711 * @wait: If true, wait (atomically) until function has completed
712 * on other CPUs.
713 *
714 * If @wait is true, then returns once @func has returned.
715 *
716 * You must not call this function with disabled interrupts or
717 * from a hardware interrupt handler or from a bottom half handler.
718 */
719void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
720 void *info, bool wait)
721{
722 int cpu = get_cpu();
723
724 smp_call_function_many(mask, func, info, wait);
725 if (cpumask_test_cpu(cpu, mask)) {
726 local_irq_disable();
727 func(info);
728 local_irq_enable();
729 }
730 put_cpu();
731}
732EXPORT_SYMBOL(on_each_cpu_mask);
733
734/*
735 * on_each_cpu_cond(): Call a function on each processor for which
736 * the supplied function cond_func returns true, optionally waiting
737 * for all the required CPUs to finish. This may include the local
738 * processor.
739 * @cond_func: A callback function that is passed a cpu id and
740 * the info parameter. The function is called
741 * with preemption disabled. The function should
742 * return a boolean value indicating whether to IPI
743 * the specified CPU.
744 * @func: The function to run on all applicable CPUs.
745 * This must be fast and non-blocking.
746 * @info: An arbitrary pointer to pass to both functions.
747 * @wait: If true, wait (atomically) until function has
748 * completed on other CPUs.
749 * @gfp_flags: GFP flags to use when allocating the cpumask
750 * used internally by the function.
751 *
752 * The function might sleep if the GFP flags indicate a non-atomic
753 * allocation is allowed.
754 *
755 * Preemption is disabled to protect against CPUs going offline but not online.
756 * CPUs going online during the call will not be seen or sent an IPI.
757 *
758 * You must not call this function with disabled interrupts or
759 * from a hardware interrupt handler or from a bottom half handler.
760 */
761void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
762 smp_call_func_t func, void *info, bool wait,
763 gfp_t gfp_flags)
764{
765 cpumask_var_t cpus;
766 int cpu, ret;
767
768 might_sleep_if(gfp_flags & __GFP_WAIT);
769
770 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
771 preempt_disable();
772 for_each_online_cpu(cpu)
773 if (cond_func(cpu, info))
774 cpumask_set_cpu(cpu, cpus);
775 on_each_cpu_mask(cpus, func, info, wait);
776 preempt_enable();
777 free_cpumask_var(cpus);
778 } else {
779 /*
780 * No free cpumask, bother. No matter, we'll
781 * just have to IPI them one by one.
782 */
783 preempt_disable();
784 for_each_online_cpu(cpu)
785 if (cond_func(cpu, info)) {
786 ret = smp_call_function_single(cpu, func,
787 info, wait);
788 WARN_ON_ONCE(!ret);
789 }
790 preempt_enable();
791 }
792}
793EXPORT_SYMBOL(on_each_cpu_cond);
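A minimal module-style sketch of the two helpers added above: on_each_cpu_mask() runs a function on an explicit CPU set, and on_each_cpu_cond() builds that set from a per-cpu predicate. The callback bodies and module names are placeholders; the callbacks must stay fast and non-blocking because they run in IPI context with interrupts disabled.

	#include <linux/module.h>
	#include <linux/smp.h>
	#include <linux/cpumask.h>
	#include <linux/slab.h>

	static void flush_local_state(void *info)
	{
		/* Runs on each selected CPU with interrupts disabled. */
		pr_info("flushing on cpu %d\n", smp_processor_id());
	}

	static bool cpu_needs_flush(int cpu, void *info)
	{
		/* Called with preemption disabled; decide whether to IPI @cpu. */
		return cpu_online(cpu);		/* placeholder predicate */
	}

	static int __init my_init(void)
	{
		/* Run on every online CPU and wait for completion. */
		on_each_cpu_mask(cpu_online_mask, flush_local_state, NULL, true);

		/* Run only where the predicate says so; may sleep for the
		 * internal cpumask allocation because of GFP_KERNEL. */
		on_each_cpu_cond(cpu_needs_flush, flush_local_state, NULL,
				 true, GFP_KERNEL);
		return 0;
	}

	static void __exit my_exit(void) { }

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");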
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4eb3a0fa351e..671f9594e368 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -297,7 +297,7 @@ void irq_enter(void)
297 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
298 298
299 rcu_irq_enter(); 299 rcu_irq_enter();
300 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (is_idle_task(current) && !in_interrupt()) {
301 /* 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
@@ -310,31 +310,21 @@ void irq_enter(void)
310 __irq_enter(); 310 __irq_enter();
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314static inline void invoke_softirq(void) 313static inline void invoke_softirq(void)
315{ 314{
316 if (!force_irqthreads) 315 if (!force_irqthreads) {
316#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
317 __do_softirq(); 317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
325#else 318#else
326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq(); 319 do_softirq();
330 else { 320#endif
321 } else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0), 322 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET); 323 SOFTIRQ_OFFSET);
333 wakeup_softirqd(); 324 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET); 325 __local_bh_enable(SOFTIRQ_OFFSET);
335 } 326 }
336} 327}
337#endif
338 328
339/* 329/*
340 * Exit an interrupt context. Process softirqs if needed and possible: 330 * Exit an interrupt context. Process softirqs if needed and possible:
@@ -353,7 +343,7 @@ void irq_exit(void)
353 tick_nohz_irq_exit(); 343 tick_nohz_irq_exit();
354#endif 344#endif
355 rcu_irq_exit(); 345 rcu_irq_exit();
356 preempt_enable_no_resched(); 346 sched_preempt_enable_no_resched();
357} 347}
358 348
359/* 349/*
@@ -385,6 +375,12 @@ void raise_softirq(unsigned int nr)
385 local_irq_restore(flags); 375 local_irq_restore(flags);
386} 376}
387 377
378void __raise_softirq_irqoff(unsigned int nr)
379{
380 trace_softirq_raise(nr);
381 or_softirq_pending(1UL << nr);
382}
383
388void open_softirq(int nr, void (*action)(struct softirq_action *)) 384void open_softirq(int nr, void (*action)(struct softirq_action *))
389{ 385{
390 softirq_vec[nr].action = action; 386 softirq_vec[nr].action = action;
@@ -744,9 +740,7 @@ static int run_ksoftirqd(void * __bind_cpu)
744 while (!kthread_should_stop()) { 740 while (!kthread_should_stop()) {
745 preempt_disable(); 741 preempt_disable();
746 if (!local_softirq_pending()) { 742 if (!local_softirq_pending()) {
747 preempt_enable_no_resched(); 743 schedule_preempt_disabled();
748 schedule();
749 preempt_disable();
750 } 744 }
751 745
752 __set_current_state(TASK_RUNNING); 746 __set_current_state(TASK_RUNNING);
@@ -761,7 +755,7 @@ static int run_ksoftirqd(void * __bind_cpu)
761 if (local_softirq_pending()) 755 if (local_softirq_pending())
762 __do_softirq(); 756 __do_softirq();
763 local_irq_enable(); 757 local_irq_enable();
764 preempt_enable_no_resched(); 758 sched_preempt_enable_no_resched();
765 cond_resched(); 759 cond_resched();
766 preempt_disable(); 760 preempt_disable();
767 rcu_note_context_switch((long)__bind_cpu); 761 rcu_note_context_switch((long)__bind_cpu);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 84c7d96918bf..5cdd8065a3ce 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
163EXPORT_SYMBOL(_raw_spin_lock_bh); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
164#endif 164#endif
165 165
166#ifndef CONFIG_INLINE_SPIN_UNLOCK 166#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
168{ 168{
169 __raw_spin_unlock(lock); 169 __raw_spin_unlock(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 0febf61e1aa3..ba35f3a4a1f4 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
172{ 172{
173 int idx; 173 int idx;
174 174
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
176 !lock_is_held(&rcu_bh_lock_map) &&
177 !lock_is_held(&rcu_lock_map) &&
178 !lock_is_held(&rcu_sched_lock_map),
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
180
175 idx = sp->completed; 181 idx = sp->completed;
176 mutex_lock(&sp->mutex); 182 mutex_lock(&sp->mutex);
177 183
@@ -280,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp)
280EXPORT_SYMBOL_GPL(synchronize_srcu); 286EXPORT_SYMBOL_GPL(synchronize_srcu);
281 287
282/** 288/**
283 * synchronize_srcu_expedited - like synchronize_srcu, but less patient 289 * synchronize_srcu_expedited - Brute-force SRCU grace period
284 * @sp: srcu_struct with which to synchronize. 290 * @sp: srcu_struct with which to synchronize.
285 * 291 *
286 * Flip the completed counter, and wait for the old count to drain to zero. 292 * Wait for an SRCU grace period to elapse, but use a "big hammer"
287 * As with classic RCU, the updater must use some separate means of 293 * approach to force the grace period to end quickly. This consumes
288 * synchronizing concurrent updates. Can block; must be called from 294 * significant time on all CPUs and is unfriendly to real-time workloads,
289 * process context. 295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
290 * 299 *
291 * Note that it is illegal to call synchronize_srcu_expedited() 300 * Note that it is illegal to call this function while holding any lock
292 * from the corresponding SRCU read-side critical section; doing so 301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
293 * will result in deadlock. However, it is perfectly legal to call 302 * to call this function from a CPU-hotplug notifier. Failing to observe
294 * synchronize_srcu_expedited() on one srcu_struct from some other 303 * these restriction will result in deadlock. It is also illegal to call
295 * srcu_struct's read-side critical section. 304 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
307 * from some other srcu_struct's read-side critical section, as long as
308 * the resulting graph of srcu_structs is acyclic.
296 */ 309 */
297void synchronize_srcu_expedited(struct srcu_struct *sp) 310void synchronize_srcu_expedited(struct srcu_struct *sp)
298{ 311{
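The new rcu_lockdep_assert() above makes the long-standing rule explicit: synchronize_srcu() must not be called from within a same-type SRCU (or any RCU) read-side critical section. A hedged sketch of the intended reader/updater pairing, with made-up names and data; init_srcu_struct()/kzalloc() error handling and module teardown are omitted:

	#include <linux/module.h>
	#include <linux/srcu.h>
	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct config { int value; };

	static struct srcu_struct cfg_srcu;
	static struct config __rcu *cur_cfg;

	static int read_value(void)
	{
		struct config *c;
		int idx, val;

		idx = srcu_read_lock(&cfg_srcu);	/* SRCU readers may sleep */
		c = srcu_dereference(cur_cfg, &cfg_srcu);
		val = c ? c->value : -1;
		srcu_read_unlock(&cfg_srcu, idx);
		return val;
	}

	static void replace_config(struct config *new)
	{
		struct config *old;

		old = rcu_dereference_protected(cur_cfg, 1);	/* update side */
		rcu_assign_pointer(cur_cfg, new);
		/* Wait for pre-existing readers; with the assert above, calling
		 * this from inside read_value() now complains under lockdep. */
		synchronize_srcu(&cfg_srcu);
		kfree(old);
	}

	static int __init my_init(void)
	{
		init_srcu_struct(&cfg_srcu);
		replace_config(kzalloc(sizeof(struct config), GFP_KERNEL));
		return 0;
	}
	module_init(my_init);
	MODULE_LICENSE("GPL");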
diff --git a/kernel/sys.c b/kernel/sys.c
index 40701538fbd1..e7006eb6c1e4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
444 magic2 != LINUX_REBOOT_MAGIC2C)) 444 magic2 != LINUX_REBOOT_MAGIC2C))
445 return -EINVAL; 445 return -EINVAL;
446 446
447 /*
448 * If pid namespaces are enabled and the current task is in a child
449 * pid_namespace, the command is handled by reboot_pid_ns() which will
450 * call do_exit().
451 */
452 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
453 if (ret)
454 return ret;
455
447 /* Instead of trying to make the power_off code look like 456 /* Instead of trying to make the power_off code look like
448 * halt when pm_power_off is not set do it the easy way. 457 * halt when pm_power_off is not set do it the easy way.
449 */ 458 */
@@ -1706,7 +1715,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
1706 if (arg4 | arg5) 1715 if (arg4 | arg5)
1707 return -EINVAL; 1716 return -EINVAL;
1708 1717
1709 if (!capable(CAP_SYS_ADMIN)) 1718 if (!capable(CAP_SYS_RESOURCE))
1710 return -EPERM; 1719 return -EPERM;
1711 1720
1712 if (addr >= TASK_SIZE) 1721 if (addr >= TASK_SIZE)
@@ -1962,6 +1971,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1962 case PR_SET_MM: 1971 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5); 1972 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break; 1973 break;
1974 case PR_SET_CHILD_SUBREAPER:
1975 me->signal->is_child_subreaper = !!arg2;
1976 error = 0;
1977 break;
1978 case PR_GET_CHILD_SUBREAPER:
1979 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2);
1981 break;
1965 default: 1982 default:
1966 error = -EINVAL; 1983 error = -EINVAL;
1967 break; 1984 break;
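The two new prctl(2) options above let a service manager mark itself as a "child subreaper", so orphaned descendants are reparented to it instead of to init and it receives their exit notifications. A user-space sketch; the fallback constants mirror the values added to include/linux/prctl.h by this series and are only there in case the build headers predate it:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_CHILD_SUBREAPER
	#define PR_SET_CHILD_SUBREAPER	36
	#define PR_GET_CHILD_SUBREAPER	37
	#endif

	int main(void)
	{
		int flag = -1;

		if (prctl(PR_SET_CHILD_SUBREAPER, 1L, 0L, 0L, 0L))
			perror("PR_SET_CHILD_SUBREAPER");

		/* PR_GET_CHILD_SUBREAPER writes the current setting through arg2. */
		if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&flag, 0L, 0L, 0L))
			perror("PR_GET_CHILD_SUBREAPER");
		else
			printf("child subreaper: %d\n", flag);

		/* From here on, orphaned descendants are reparented to this
		 * process rather than to init. */
		return 0;
	}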
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05e..52b3a06a02f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/bitmap.h>
26#include <linux/signal.h> 27#include <linux/signal.h>
27#include <linux/printk.h> 28#include <linux/printk.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
@@ -58,6 +59,7 @@
58#include <linux/oom.h> 59#include <linux/oom.h>
59#include <linux/kmod.h> 60#include <linux/kmod.h>
60#include <linux/capability.h> 61#include <linux/capability.h>
62#include <linux/binfmts.h>
61 63
62#include <asm/uaccess.h> 64#include <asm/uaccess.h>
63#include <asm/processor.h> 65#include <asm/processor.h>
@@ -67,6 +69,9 @@
67#include <asm/stacktrace.h> 69#include <asm/stacktrace.h>
68#include <asm/io.h> 70#include <asm/io.h>
69#endif 71#endif
72#ifdef CONFIG_SPARC
73#include <asm/setup.h>
74#endif
70#ifdef CONFIG_BSD_PROCESS_ACCT 75#ifdef CONFIG_BSD_PROCESS_ACCT
71#include <linux/acct.h> 76#include <linux/acct.h>
72#endif 77#endif
@@ -141,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP;
141#include <linux/inotify.h> 146#include <linux/inotify.h>
142#endif 147#endif
143#ifdef CONFIG_SPARC 148#ifdef CONFIG_SPARC
144#include <asm/system.h>
145#endif 149#endif
146 150
147#ifdef CONFIG_SPARC64 151#ifdef CONFIG_SPARC64
@@ -192,20 +196,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
192 196
193#endif 197#endif
194 198
195static struct ctl_table root_table[];
196static struct ctl_table_root sysctl_table_root;
197static struct ctl_table_header root_table_header = {
198 {{.count = 1,
199 .ctl_table = root_table,
200 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
201 .root = &sysctl_table_root,
202 .set = &sysctl_table_root.default_set,
203};
204static struct ctl_table_root sysctl_table_root = {
205 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
206 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
207};
208
209static struct ctl_table kern_table[]; 199static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 200static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 201static struct ctl_table fs_table[];
@@ -222,7 +212,7 @@ int sysctl_legacy_va_layout;
222 212
223/* The default sysctl tables: */ 213/* The default sysctl tables: */
224 214
225static struct ctl_table root_table[] = { 215static struct ctl_table sysctl_base_table[] = {
226 { 216 {
227 .procname = "kernel", 217 .procname = "kernel",
228 .mode = 0555, 218 .mode = 0555,
@@ -1559,490 +1549,12 @@ static struct ctl_table dev_table[] = {
1559 { } 1549 { }
1560}; 1550};
1561 1551
1562static DEFINE_SPINLOCK(sysctl_lock); 1552int __init sysctl_init(void)
1563
1564/* called under sysctl_lock */
1565static int use_table(struct ctl_table_header *p)
1566{ 1553{
1567 if (unlikely(p->unregistering)) 1554 register_sysctl_table(sysctl_base_table);
1568 return 0;
1569 p->used++;
1570 return 1;
1571}
1572
1573/* called under sysctl_lock */
1574static void unuse_table(struct ctl_table_header *p)
1575{
1576 if (!--p->used)
1577 if (unlikely(p->unregistering))
1578 complete(p->unregistering);
1579}
1580
1581/* called under sysctl_lock, will reacquire if has to wait */
1582static void start_unregistering(struct ctl_table_header *p)
1583{
1584 /*
1585 * if p->used is 0, nobody will ever touch that entry again;
1586 * we'll eliminate all paths to it before dropping sysctl_lock
1587 */
1588 if (unlikely(p->used)) {
1589 struct completion wait;
1590 init_completion(&wait);
1591 p->unregistering = &wait;
1592 spin_unlock(&sysctl_lock);
1593 wait_for_completion(&wait);
1594 spin_lock(&sysctl_lock);
1595 } else {
1596 /* anything non-NULL; we'll never dereference it */
1597 p->unregistering = ERR_PTR(-EINVAL);
1598 }
1599 /*
1600 * do not remove from the list until nobody holds it; walking the
1601 * list in do_sysctl() relies on that.
1602 */
1603 list_del_init(&p->ctl_entry);
1604}
1605
1606void sysctl_head_get(struct ctl_table_header *head)
1607{
1608 spin_lock(&sysctl_lock);
1609 head->count++;
1610 spin_unlock(&sysctl_lock);
1611}
1612
1613void sysctl_head_put(struct ctl_table_header *head)
1614{
1615 spin_lock(&sysctl_lock);
1616 if (!--head->count)
1617 kfree_rcu(head, rcu);
1618 spin_unlock(&sysctl_lock);
1619}
1620
1621struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1622{
1623 if (!head)
1624 BUG();
1625 spin_lock(&sysctl_lock);
1626 if (!use_table(head))
1627 head = ERR_PTR(-ENOENT);
1628 spin_unlock(&sysctl_lock);
1629 return head;
1630}
1631
1632void sysctl_head_finish(struct ctl_table_header *head)
1633{
1634 if (!head)
1635 return;
1636 spin_lock(&sysctl_lock);
1637 unuse_table(head);
1638 spin_unlock(&sysctl_lock);
1639}
1640
1641static struct ctl_table_set *
1642lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1643{
1644 struct ctl_table_set *set = &root->default_set;
1645 if (root->lookup)
1646 set = root->lookup(root, namespaces);
1647 return set;
1648}
1649
1650static struct list_head *
1651lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1652{
1653 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1654 return &set->list;
1655}
1656
1657struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1658 struct ctl_table_header *prev)
1659{
1660 struct ctl_table_root *root;
1661 struct list_head *header_list;
1662 struct ctl_table_header *head;
1663 struct list_head *tmp;
1664
1665 spin_lock(&sysctl_lock);
1666 if (prev) {
1667 head = prev;
1668 tmp = &prev->ctl_entry;
1669 unuse_table(prev);
1670 goto next;
1671 }
1672 tmp = &root_table_header.ctl_entry;
1673 for (;;) {
1674 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1675
1676 if (!use_table(head))
1677 goto next;
1678 spin_unlock(&sysctl_lock);
1679 return head;
1680 next:
1681 root = head->root;
1682 tmp = tmp->next;
1683 header_list = lookup_header_list(root, namespaces);
1684 if (tmp != header_list)
1685 continue;
1686
1687 do {
1688 root = list_entry(root->root_list.next,
1689 struct ctl_table_root, root_list);
1690 if (root == &sysctl_table_root)
1691 goto out;
1692 header_list = lookup_header_list(root, namespaces);
1693 } while (list_empty(header_list));
1694 tmp = header_list->next;
1695 }
1696out:
1697 spin_unlock(&sysctl_lock);
1698 return NULL;
1699}
1700
1701struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1702{
1703 return __sysctl_head_next(current->nsproxy, prev);
1704}
1705
1706void register_sysctl_root(struct ctl_table_root *root)
1707{
1708 spin_lock(&sysctl_lock);
1709 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1710 spin_unlock(&sysctl_lock);
1711}
1712
1713/*
1714 * sysctl_perm does NOT grant the superuser all rights automatically, because
1715 * some sysctl variables are readonly even to root.
1716 */
1717
1718static int test_perm(int mode, int op)
1719{
1720 if (!current_euid())
1721 mode >>= 6;
1722 else if (in_egroup_p(0))
1723 mode >>= 3;
1724 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1725 return 0;
1726 return -EACCES;
1727}
1728
1729int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1730{
1731 int mode;
1732
1733 if (root->permissions)
1734 mode = root->permissions(root, current->nsproxy, table);
1735 else
1736 mode = table->mode;
1737
1738 return test_perm(mode, op);
1739}
1740
1741static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1742{
1743 for (; table->procname; table++) {
1744 table->parent = parent;
1745 if (table->child)
1746 sysctl_set_parent(table, table->child);
1747 }
1748}
1749
1750static __init int sysctl_init(void)
1751{
1752 sysctl_set_parent(NULL, root_table);
1753#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1754 sysctl_check_table(current->nsproxy, root_table);
1755#endif
1756 return 0; 1555 return 0;
1757} 1556}
1758 1557
1759core_initcall(sysctl_init);
1760
1761static struct ctl_table *is_branch_in(struct ctl_table *branch,
1762 struct ctl_table *table)
1763{
1764 struct ctl_table *p;
1765 const char *s = branch->procname;
1766
1767 /* branch should have named subdirectory as its first element */
1768 if (!s || !branch->child)
1769 return NULL;
1770
1771 /* ... and nothing else */
1772 if (branch[1].procname)
1773 return NULL;
1774
1775 /* table should contain subdirectory with the same name */
1776 for (p = table; p->procname; p++) {
1777 if (!p->child)
1778 continue;
1779 if (p->procname && strcmp(p->procname, s) == 0)
1780 return p;
1781 }
1782 return NULL;
1783}
1784
1785/* see if attaching q to p would be an improvement */
1786static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1787{
1788 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1789 struct ctl_table *next;
1790 int is_better = 0;
1791 int not_in_parent = !p->attached_by;
1792
1793 while ((next = is_branch_in(by, to)) != NULL) {
1794 if (by == q->attached_by)
1795 is_better = 1;
1796 if (to == p->attached_by)
1797 not_in_parent = 1;
1798 by = by->child;
1799 to = next->child;
1800 }
1801
1802 if (is_better && not_in_parent) {
1803 q->attached_by = by;
1804 q->attached_to = to;
1805 q->parent = p;
1806 }
1807}
1808
1809/**
1810 * __register_sysctl_paths - register a sysctl hierarchy
1811 * @root: List of sysctl headers to register on
1812 * @namespaces: Data to compute which lists of sysctl entries are visible
1813 * @path: The path to the directory the sysctl table is in.
1814 * @table: the top-level table structure
1815 *
1816 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1817 * array. A completely 0 filled entry terminates the table.
1818 *
1819 * The members of the &struct ctl_table structure are used as follows:
1820 *
1821 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1822 * enter a sysctl file
1823 *
1824 * data - a pointer to data for use by proc_handler
1825 *
1826 * maxlen - the maximum size in bytes of the data
1827 *
1828 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1829 *
1830 * child - a pointer to the child sysctl table if this entry is a directory, or
1831 * %NULL.
1832 *
1833 * proc_handler - the text handler routine (described below)
1834 *
1835 * de - for internal use by the sysctl routines
1836 *
1837 * extra1, extra2 - extra pointers usable by the proc handler routines
1838 *
1839 * Leaf nodes in the sysctl tree will be represented by a single file
1840 * under /proc; non-leaf nodes will be represented by directories.
1841 *
1842 * sysctl(2) can automatically manage read and write requests through
1843 * the sysctl table. The data and maxlen fields of the ctl_table
1844 * struct enable minimal validation of the values being written to be
1845 * performed, and the mode field allows minimal authentication.
1846 *
1847 * There must be a proc_handler routine for any terminal nodes
1848 * mirrored under /proc/sys (non-terminals are handled by a built-in
1849 * directory handler). Several default handlers are available to
1850 * cover common cases -
1851 *
1852 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1853 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1854 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1855 *
1856 * It is the handler's job to read the input buffer from user memory
1857 * and process it. The handler should return 0 on success.
1858 *
1859 * This routine returns %NULL on a failure to register, and a pointer
1860 * to the table header on success.
1861 */
1862struct ctl_table_header *__register_sysctl_paths(
1863 struct ctl_table_root *root,
1864 struct nsproxy *namespaces,
1865 const struct ctl_path *path, struct ctl_table *table)
1866{
1867 struct ctl_table_header *header;
1868 struct ctl_table *new, **prevp;
1869 unsigned int n, npath;
1870 struct ctl_table_set *set;
1871
1872 /* Count the path components */
1873 for (npath = 0; path[npath].procname; ++npath)
1874 ;
1875
1876 /*
1877 * For each path component, allocate a 2-element ctl_table array.
1878 * The first array element will be filled with the sysctl entry
1879 * for this, the second will be the sentinel (procname == 0).
1880 *
1881 * We allocate everything in one go so that we don't have to
1882 * worry about freeing additional memory in unregister_sysctl_table.
1883 */
1884 header = kzalloc(sizeof(struct ctl_table_header) +
1885 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1886 if (!header)
1887 return NULL;
1888
1889 new = (struct ctl_table *) (header + 1);
1890
1891 /* Now connect the dots */
1892 prevp = &header->ctl_table;
1893 for (n = 0; n < npath; ++n, ++path) {
1894 /* Copy the procname */
1895 new->procname = path->procname;
1896 new->mode = 0555;
1897
1898 *prevp = new;
1899 prevp = &new->child;
1900
1901 new += 2;
1902 }
1903 *prevp = table;
1904 header->ctl_table_arg = table;
1905
1906 INIT_LIST_HEAD(&header->ctl_entry);
1907 header->used = 0;
1908 header->unregistering = NULL;
1909 header->root = root;
1910 sysctl_set_parent(NULL, header->ctl_table);
1911 header->count = 1;
1912#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1913 if (sysctl_check_table(namespaces, header->ctl_table)) {
1914 kfree(header);
1915 return NULL;
1916 }
1917#endif
1918 spin_lock(&sysctl_lock);
1919 header->set = lookup_header_set(root, namespaces);
1920 header->attached_by = header->ctl_table;
1921 header->attached_to = root_table;
1922 header->parent = &root_table_header;
1923 for (set = header->set; set; set = set->parent) {
1924 struct ctl_table_header *p;
1925 list_for_each_entry(p, &set->list, ctl_entry) {
1926 if (p->unregistering)
1927 continue;
1928 try_attach(p, header);
1929 }
1930 }
1931 header->parent->count++;
1932 list_add_tail(&header->ctl_entry, &header->set->list);
1933 spin_unlock(&sysctl_lock);
1934
1935 return header;
1936}
1937
1938/**
1939 * register_sysctl_table_path - register a sysctl table hierarchy
1940 * @path: The path to the directory the sysctl table is in.
1941 * @table: the top-level table structure
1942 *
1943 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1944 * array. A completely 0 filled entry terminates the table.
1945 *
1946 * See __register_sysctl_paths for more details.
1947 */
1948struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1949 struct ctl_table *table)
1950{
1951 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1952 path, table);
1953}
1954
1955/**
1956 * register_sysctl_table - register a sysctl table hierarchy
1957 * @table: the top-level table structure
1958 *
1959 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1960 * array. A completely 0 filled entry terminates the table.
1961 *
1962 * See register_sysctl_paths for more details.
1963 */
1964struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1965{
1966 static const struct ctl_path null_path[] = { {} };
1967
1968 return register_sysctl_paths(null_path, table);
1969}
1970
1971/**
1972 * unregister_sysctl_table - unregister a sysctl table hierarchy
1973 * @header: the header returned from register_sysctl_table
1974 *
1975 * Unregisters the sysctl table and all children. proc entries may not
1976 * actually be removed until they are no longer used by anyone.
1977 */
1978void unregister_sysctl_table(struct ctl_table_header * header)
1979{
1980 might_sleep();
1981
1982 if (header == NULL)
1983 return;
1984
1985 spin_lock(&sysctl_lock);
1986 start_unregistering(header);
1987 if (!--header->parent->count) {
1988 WARN_ON(1);
1989 kfree_rcu(header->parent, rcu);
1990 }
1991 if (!--header->count)
1992 kfree_rcu(header, rcu);
1993 spin_unlock(&sysctl_lock);
1994}
1995
1996int sysctl_is_seen(struct ctl_table_header *p)
1997{
1998 struct ctl_table_set *set = p->set;
1999 int res;
2000 spin_lock(&sysctl_lock);
2001 if (p->unregistering)
2002 res = 0;
2003 else if (!set->is_seen)
2004 res = 1;
2005 else
2006 res = set->is_seen(set);
2007 spin_unlock(&sysctl_lock);
2008 return res;
2009}
2010
2011void setup_sysctl_set(struct ctl_table_set *p,
2012 struct ctl_table_set *parent,
2013 int (*is_seen)(struct ctl_table_set *))
2014{
2015 INIT_LIST_HEAD(&p->list);
2016 p->parent = parent ? parent : &sysctl_table_root.default_set;
2017 p->is_seen = is_seen;
2018}
2019
2020#else /* !CONFIG_SYSCTL */
2021struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2022{
2023 return NULL;
2024}
2025
2026struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2027 struct ctl_table *table)
2028{
2029 return NULL;
2030}
2031
2032void unregister_sysctl_table(struct ctl_table_header * table)
2033{
2034}
2035
2036void setup_sysctl_set(struct ctl_table_set *p,
2037 struct ctl_table_set *parent,
2038 int (*is_seen)(struct ctl_table_set *))
2039{
2040}
2041
2042void sysctl_head_put(struct ctl_table_header *head)
2043{
2044}
2045
2046#endif /* CONFIG_SYSCTL */ 1558#endif /* CONFIG_SYSCTL */
2047 1559
2048/* 1560/*
@@ -2884,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2884 } 2396 }
2885 } 2397 }
2886 2398
2887 while (val_a <= val_b) 2399 bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
2888 set_bit(val_a++, tmp_bitmap);
2889
2890 first = 0; 2400 first = 0;
2891 proc_skip_char(&kbuf, &left, '\n'); 2401 proc_skip_char(&kbuf, &left, '\n');
2892 } 2402 }
@@ -2929,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2929 if (*ppos) 2439 if (*ppos)
2930 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); 2440 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2931 else 2441 else
2932 memcpy(bitmap, tmp_bitmap, 2442 bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
2933 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2934 } 2443 }
2935 kfree(tmp_bitmap); 2444 kfree(tmp_bitmap);
2936 *lenp -= left; 2445 *lenp -= left;
@@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
3008EXPORT_SYMBOL(proc_dostring); 2517EXPORT_SYMBOL(proc_dostring);
3009EXPORT_SYMBOL(proc_doulongvec_minmax); 2518EXPORT_SYMBOL(proc_doulongvec_minmax);
3010EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2519EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3011EXPORT_SYMBOL(register_sysctl_table);
3012EXPORT_SYMBOL(register_sysctl_paths);
3013EXPORT_SYMBOL(unregister_sysctl_table);
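The registration machinery removed above moves out of kernel/sysctl.c (into fs/proc/proc_sysctl.c in this series); kernel/sysctl.c now only registers its base table from sysctl_init(). The calling convention for other users is unchanged: fill a zero-terminated ctl_table, optionally a ctl_path, and register it. A module-style sketch with hypothetical names ("my_example", "my_knob"):

	#include <linux/module.h>
	#include <linux/sysctl.h>

	static int my_knob;

	static struct ctl_table my_table[] = {
		{
			.procname	= "my_knob",	/* /proc/sys/kernel/my_example/my_knob */
			.data		= &my_knob,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }		/* zero-filled sentinel terminates the table */
	};

	static const struct ctl_path my_path[] = {
		{ .procname = "kernel" },
		{ .procname = "my_example" },
		{ }
	};

	static struct ctl_table_header *my_header;

	static int __init my_init(void)
	{
		my_header = register_sysctl_paths(my_path, my_table);
		return my_header ? 0 : -ENOMEM;
	}

	static void __exit my_exit(void)
	{
		unregister_sysctl_table(my_header);
	}

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");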
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813d..000000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7
8
9static int sysctl_depth(struct ctl_table *table)
10{
11 struct ctl_table *tmp;
12 int depth;
13
14 depth = 0;
15 for (tmp = table; tmp->parent; tmp = tmp->parent)
16 depth++;
17
18 return depth;
19}
20
21static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
22{
23 int i;
24
25 for (i = 0; table && i < n; i++)
26 table = table->parent;
27
28 return table;
29}
30
31
32static void sysctl_print_path(struct ctl_table *table)
33{
34 struct ctl_table *tmp;
35 int depth, i;
36 depth = sysctl_depth(table);
37 if (table->procname) {
38 for (i = depth; i >= 0; i--) {
39 tmp = sysctl_parent(table, i);
40 printk("/%s", tmp->procname?tmp->procname:"");
41 }
42 }
43 printk(" ");
44}
45
46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
47 struct ctl_table *table)
48{
49 struct ctl_table_header *head;
50 struct ctl_table *ref, *test;
51 int depth, cur_depth;
52
53 depth = sysctl_depth(table);
54
55 for (head = __sysctl_head_next(namespaces, NULL); head;
56 head = __sysctl_head_next(namespaces, head)) {
57 cur_depth = depth;
58 ref = head->ctl_table;
59repeat:
60 test = sysctl_parent(table, cur_depth);
61 for (; ref->procname; ref++) {
62 int match = 0;
63 if (cur_depth && !ref->child)
64 continue;
65
66 if (test->procname && ref->procname &&
67 (strcmp(test->procname, ref->procname) == 0))
68 match++;
69
70 if (match) {
71 if (cur_depth != 0) {
72 cur_depth--;
73 ref = ref->child;
74 goto repeat;
75 }
76 goto out;
77 }
78 }
79 }
80 ref = NULL;
81out:
82 sysctl_head_finish(head);
83 return ref;
84}
85
86static void set_fail(const char **fail, struct ctl_table *table, const char *str)
87{
88 if (*fail) {
89 printk(KERN_ERR "sysctl table check failed: ");
90 sysctl_print_path(table);
91 printk(" %s\n", *fail);
92 dump_stack();
93 }
94 *fail = str;
95}
96
97static void sysctl_check_leaf(struct nsproxy *namespaces,
98 struct ctl_table *table, const char **fail)
99{
100 struct ctl_table *ref;
101
102 ref = sysctl_check_lookup(namespaces, table);
103 if (ref && (ref != table))
104 set_fail(fail, table, "Sysctl already exists");
105}
106
107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
108{
109 int error = 0;
110 for (; table->procname; table++) {
111 const char *fail = NULL;
112
113 if (table->parent) {
114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname");
116 }
117 if (table->child) {
118 if (table->data)
119 set_fail(&fail, table, "Directory with data?");
120 if (table->maxlen)
121 set_fail(&fail, table, "Directory with maxlen?");
122 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
123 set_fail(&fail, table, "Writable sysctl directory");
124 if (table->proc_handler)
125 set_fail(&fail, table, "Directory with proc_handler");
126 if (table->extra1)
127 set_fail(&fail, table, "Directory with extra1");
128 if (table->extra2)
129 set_fail(&fail, table, "Directory with extra2");
130 } else {
131 if ((table->proc_handler == proc_dostring) ||
132 (table->proc_handler == proc_dointvec) ||
133 (table->proc_handler == proc_dointvec_minmax) ||
134 (table->proc_handler == proc_dointvec_jiffies) ||
135 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
136 (table->proc_handler == proc_dointvec_ms_jiffies) ||
137 (table->proc_handler == proc_doulongvec_minmax) ||
138 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
139 if (!table->data)
140 set_fail(&fail, table, "No data");
141 if (!table->maxlen)
142 set_fail(&fail, table, "No maxlen");
143 }
144#ifdef CONFIG_PROC_SYSCTL
145 if (!table->proc_handler)
146 set_fail(&fail, table, "No proc_handler");
147#endif
148 sysctl_check_leaf(namespaces, table, &fail);
149 }
150 if (table->mode > 0777)
151 set_fail(&fail, table, "bogus .mode");
152 if (fail) {
153 set_fail(&fail, table, NULL);
154 error = -EINVAL;
155 }
156 if (table->child)
157 error |= sysctl_check_table(namespaces, table->child);
158 }
159 return error;
160}
diff --git a/kernel/time.c b/kernel/time.c
index 73e416db0a1e..ba744cf80696 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163 return error; 163 return error;
164 164
165 if (tz) { 165 if (tz) {
166 /* SMP safe, global irq locking makes it work. */
167 sys_tz = *tz; 166 sys_tz = *tz;
168 update_vsyscall_tz(); 167 update_vsyscall_tz();
169 if (firsttime) { 168 if (firsttime) {
@@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
173 } 172 }
174 } 173 }
175 if (tv) 174 if (tv)
176 {
177 /* SMP safe, again the code in arch/foo/time.c should
178 * globally block out interrupts when it runs.
179 */
180 return do_settimeofday(tv); 175 return do_settimeofday(tv);
181 }
182 return 0; 176 return 0;
183} 177}
184 178
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a46f5d64504..8a538c55fc7b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev,
96 return 0; 96 return 0;
97} 97}
98 98
99static inline void alarmtimer_rtc_timer_init(void)
100{
101 rtc_timer_init(&rtctimer, NULL, NULL);
102}
103
99static struct class_interface alarmtimer_rtc_interface = { 104static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device, 105 .add_dev = &alarmtimer_rtc_add_device,
101}; 106};
@@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void)
117#define rtcdev (NULL) 122#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; } 123static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { } 124static inline void alarmtimer_rtc_interface_remove(void) { }
125static inline void alarmtimer_rtc_timer_init(void) { }
120#endif 126#endif
121 127
122/** 128/**
@@ -783,6 +789,8 @@ static int __init alarmtimer_init(void)
783 .nsleep = alarm_timer_nsleep, 789 .nsleep = alarm_timer_nsleep,
784 }; 790 };
785 791
792 alarmtimer_rtc_timer_init();
793
786 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); 794 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
787 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); 795 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
788 796
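
The alarmtimer.c hunk adds alarmtimer_rtc_timer_init() as a real inline in the RTC-enabled branch and an empty inline stub in the #else branch, so alarmtimer_init() can call it unconditionally. Below is a minimal standalone sketch of that config-stub pattern; HAVE_FAKE_RTC and the fake_rtc_* names are invented for illustration and do not correspond to any kernel symbol.

/*
 * Config-stub pattern: a real inline helper when the feature is built in,
 * an empty inline otherwise, so callers need no #ifdef of their own.
 */
#include <stdio.h>

#define HAVE_FAKE_RTC 1

#if HAVE_FAKE_RTC
static inline void fake_rtc_timer_init(void)
{
        /* stands in for rtc_timer_init(&rtctimer, NULL, NULL) */
        puts("rtc timer prepared");
}
#else
static inline void fake_rtc_timer_init(void) { }        /* compiles away */
#endif

static int subsystem_init(void)
{
        /* Unconditional call; the empty stub keeps this path branch-free. */
        fake_rtc_timer_init();
        puts("subsystem initialised");
        return 0;
}

int main(void)
{
        return subsystem_init();
}
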
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a45ca167ab24..c9583382141a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
500{ 500{
501 u64 ret; 501 u64 ret;
502 /* 502 /*
503 * We won't try to correct for more then 11% adjustments (110,000 ppm), 503 * We won't try to correct for more than 11% adjustments (110,000 ppm),
504 */ 504 */
505 ret = (u64)cs->mult * 11; 505 ret = (u64)cs->mult * 11;
506 do_div(ret,100); 506 do_div(ret,100);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f6117a4c7cb8..f03fd83b170b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,17 +22,18 @@
22 * NTP timekeeping variables: 22 * NTP timekeeping variables:
23 */ 23 */
24 24
25DEFINE_SPINLOCK(ntp_lock);
26
27
25/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
26unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
27 30
28/* ACTHZ period (nsecs): */ 31/* ACTHZ period (nsecs): */
29unsigned long tick_nsec; 32unsigned long tick_nsec;
30 33
31u64 tick_length; 34static u64 tick_length;
32static u64 tick_length_base; 35static u64 tick_length_base;
33 36
34static struct hrtimer leap_timer;
35
36#define MAX_TICKADJ 500LL /* usecs */ 37#define MAX_TICKADJ 500LL /* usecs */
37#define MAX_TICKADJ_SCALED \ 38#define MAX_TICKADJ_SCALED \
38 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 39 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -49,7 +50,7 @@ static struct hrtimer leap_timer;
49static int time_state = TIME_OK; 50static int time_state = TIME_OK;
50 51
51/* clock status bits: */ 52/* clock status bits: */
52int time_status = STA_UNSYNC; 53static int time_status = STA_UNSYNC;
53 54
54/* TAI offset (secs): */ 55/* TAI offset (secs): */
55static long time_tai; 56static long time_tai;
@@ -133,7 +134,7 @@ static inline void pps_reset_freq_interval(void)
133/** 134/**
134 * pps_clear - Clears the PPS state variables 135 * pps_clear - Clears the PPS state variables
135 * 136 *
136 * Must be called while holding a write on the xtime_lock 137 * Must be called while holding a write on the ntp_lock
137 */ 138 */
138static inline void pps_clear(void) 139static inline void pps_clear(void)
139{ 140{
@@ -149,7 +150,7 @@ static inline void pps_clear(void)
149 * the last PPS signal. When it reaches 0, indicate that PPS signal is 150 * the last PPS signal. When it reaches 0, indicate that PPS signal is
150 * missing. 151 * missing.
151 * 152 *
152 * Must be called while holding a write on the xtime_lock 153 * Must be called while holding a write on the ntp_lock
153 */ 154 */
154static inline void pps_dec_valid(void) 155static inline void pps_dec_valid(void)
155{ 156{
@@ -233,6 +234,17 @@ static inline void pps_fill_timex(struct timex *txc)
233 234
234#endif /* CONFIG_NTP_PPS */ 235#endif /* CONFIG_NTP_PPS */
235 236
237
238/**
239 * ntp_synced - Returns 1 if the NTP status is not UNSYNC
240 *
241 */
242static inline int ntp_synced(void)
243{
244 return !(time_status & STA_UNSYNC);
245}
246
247
236/* 248/*
237 * NTP methods: 249 * NTP methods:
238 */ 250 */
@@ -275,7 +287,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
275 287
276 time_status |= STA_MODE; 288 time_status |= STA_MODE;
277 289
278 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); 290 return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
279} 291}
280 292
281static void ntp_update_offset(long offset) 293static void ntp_update_offset(long offset)
@@ -330,11 +342,13 @@ static void ntp_update_offset(long offset)
330 342
331/** 343/**
332 * ntp_clear - Clears the NTP state variables 344 * ntp_clear - Clears the NTP state variables
333 *
334 * Must be called while holding a write on the xtime_lock
335 */ 345 */
336void ntp_clear(void) 346void ntp_clear(void)
337{ 347{
348 unsigned long flags;
349
350 spin_lock_irqsave(&ntp_lock, flags);
351
338 time_adjust = 0; /* stop active adjtime() */ 352 time_adjust = 0; /* stop active adjtime() */
339 time_status |= STA_UNSYNC; 353 time_status |= STA_UNSYNC;
340 time_maxerror = NTP_PHASE_LIMIT; 354 time_maxerror = NTP_PHASE_LIMIT;
@@ -347,63 +361,81 @@ void ntp_clear(void)
347 361
348 /* Clear PPS state variables */ 362 /* Clear PPS state variables */
349 pps_clear(); 363 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags);
365
366}
367
368
369u64 ntp_tick_length(void)
370{
371 unsigned long flags;
372 s64 ret;
373
374 spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret;
350} 378}
351 379
380
352/* 381/*
353 * Leap second processing. If in leap-insert state at the end of the 382 * this routine handles the overflow of the microsecond field
354 * day, the system clock is set back one second; if in leap-delete 383 *
355 * state, the system clock is set ahead one second. 384 * The tricky bits of code to handle the accurate clock support
385 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
386 * They were originally developed for SUN and DEC kernels.
387 * All the kudos should go to Dave for this stuff.
388 *
389 * Also handles leap second processing, and returns leap offset
356 */ 390 */
357static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) 391int second_overflow(unsigned long secs)
358{ 392{
359 enum hrtimer_restart res = HRTIMER_NORESTART; 393 s64 delta;
394 int leap = 0;
395 unsigned long flags;
360 396
361 write_seqlock(&xtime_lock); 397 spin_lock_irqsave(&ntp_lock, flags);
362 398
399 /*
400 * Leap second processing. If in leap-insert state at the end of the
401 * day, the system clock is set back one second; if in leap-delete
402 * state, the system clock is set ahead one second.
403 */
363 switch (time_state) { 404 switch (time_state) {
364 case TIME_OK: 405 case TIME_OK:
406 if (time_status & STA_INS)
407 time_state = TIME_INS;
408 else if (time_status & STA_DEL)
409 time_state = TIME_DEL;
365 break; 410 break;
366 case TIME_INS: 411 case TIME_INS:
367 timekeeping_leap_insert(-1); 412 if (secs % 86400 == 0) {
368 time_state = TIME_OOP; 413 leap = -1;
369 printk(KERN_NOTICE 414 time_state = TIME_OOP;
370 "Clock: inserting leap second 23:59:60 UTC\n"); 415 printk(KERN_NOTICE
371 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 416 "Clock: inserting leap second 23:59:60 UTC\n");
372 res = HRTIMER_RESTART; 417 }
373 break; 418 break;
374 case TIME_DEL: 419 case TIME_DEL:
375 timekeeping_leap_insert(1); 420 if ((secs + 1) % 86400 == 0) {
376 time_tai--; 421 leap = 1;
377 time_state = TIME_WAIT; 422 time_tai--;
378 printk(KERN_NOTICE 423 time_state = TIME_WAIT;
379 "Clock: deleting leap second 23:59:59 UTC\n"); 424 printk(KERN_NOTICE
425 "Clock: deleting leap second 23:59:59 UTC\n");
426 }
380 break; 427 break;
381 case TIME_OOP: 428 case TIME_OOP:
382 time_tai++; 429 time_tai++;
383 time_state = TIME_WAIT; 430 time_state = TIME_WAIT;
384 /* fall through */ 431 break;
432
385 case TIME_WAIT: 433 case TIME_WAIT:
386 if (!(time_status & (STA_INS | STA_DEL))) 434 if (!(time_status & (STA_INS | STA_DEL)))
387 time_state = TIME_OK; 435 time_state = TIME_OK;
388 break; 436 break;
389 } 437 }
390 438
391 write_sequnlock(&xtime_lock);
392
393 return res;
394}
395
396/*
397 * this routine handles the overflow of the microsecond field
398 *
399 * The tricky bits of code to handle the accurate clock support
400 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
401 * They were originally developed for SUN and DEC kernels.
402 * All the kudos should go to Dave for this stuff.
403 */
404void second_overflow(void)
405{
406 s64 delta;
407 439
408 /* Bump the maxerror field */ 440 /* Bump the maxerror field */
409 time_maxerror += MAXFREQ / NSEC_PER_USEC; 441 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,30 +455,34 @@ void second_overflow(void)
423 pps_dec_valid(); 455 pps_dec_valid();
424 456
425 if (!time_adjust) 457 if (!time_adjust)
426 return; 458 goto out;
427 459
428 if (time_adjust > MAX_TICKADJ) { 460 if (time_adjust > MAX_TICKADJ) {
429 time_adjust -= MAX_TICKADJ; 461 time_adjust -= MAX_TICKADJ;
430 tick_length += MAX_TICKADJ_SCALED; 462 tick_length += MAX_TICKADJ_SCALED;
431 return; 463 goto out;
432 } 464 }
433 465
434 if (time_adjust < -MAX_TICKADJ) { 466 if (time_adjust < -MAX_TICKADJ) {
435 time_adjust += MAX_TICKADJ; 467 time_adjust += MAX_TICKADJ;
436 tick_length -= MAX_TICKADJ_SCALED; 468 tick_length -= MAX_TICKADJ_SCALED;
437 return; 469 goto out;
438 } 470 }
439 471
440 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) 472 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
441 << NTP_SCALE_SHIFT; 473 << NTP_SCALE_SHIFT;
442 time_adjust = 0; 474 time_adjust = 0;
475
476
477
478out:
479 spin_unlock_irqrestore(&ntp_lock, flags);
480
481 return leap;
443} 482}
444 483
445#ifdef CONFIG_GENERIC_CMOS_UPDATE 484#ifdef CONFIG_GENERIC_CMOS_UPDATE
446 485
447/* Disable the cmos update - used by virtualization and embedded */
448int no_sync_cmos_clock __read_mostly;
449
450static void sync_cmos_clock(struct work_struct *work); 486static void sync_cmos_clock(struct work_struct *work);
451 487
452static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 488static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -493,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work)
493 529
494static void notify_cmos_timer(void) 530static void notify_cmos_timer(void)
495{ 531{
496 if (!no_sync_cmos_clock) 532 schedule_delayed_work(&sync_cmos_work, 0);
497 schedule_delayed_work(&sync_cmos_work, 0);
498} 533}
499 534
500#else 535#else
501static inline void notify_cmos_timer(void) { } 536static inline void notify_cmos_timer(void) { }
502#endif 537#endif
503 538
504/*
505 * Start the leap seconds timer:
506 */
507static inline void ntp_start_leap_timer(struct timespec *ts)
508{
509 long now = ts->tv_sec;
510
511 if (time_status & STA_INS) {
512 time_state = TIME_INS;
513 now += 86400 - now % 86400;
514 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
515
516 return;
517 }
518
519 if (time_status & STA_DEL) {
520 time_state = TIME_DEL;
521 now += 86400 - (now + 1) % 86400;
522 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
523 }
524}
525 539
526/* 540/*
527 * Propagate a new txc->status value into the NTP state: 541 * Propagate a new txc->status value into the NTP state:
@@ -546,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
546 time_status &= STA_RONLY; 560 time_status &= STA_RONLY;
547 time_status |= txc->status & ~STA_RONLY; 561 time_status |= txc->status & ~STA_RONLY;
548 562
549 switch (time_state) {
550 case TIME_OK:
551 ntp_start_leap_timer(ts);
552 break;
553 case TIME_INS:
554 case TIME_DEL:
555 time_state = TIME_OK;
556 ntp_start_leap_timer(ts);
557 case TIME_WAIT:
558 if (!(time_status & (STA_INS | STA_DEL)))
559 time_state = TIME_OK;
560 break;
561 case TIME_OOP:
562 hrtimer_restart(&leap_timer);
563 break;
564 }
565} 563}
566/* 564/*
567 * Called with the xtime lock held, so we can access and modify 565 * Called with the xtime lock held, so we can access and modify
@@ -643,9 +641,6 @@ int do_adjtimex(struct timex *txc)
643 (txc->tick < 900000/USER_HZ || 641 (txc->tick < 900000/USER_HZ ||
644 txc->tick > 1100000/USER_HZ)) 642 txc->tick > 1100000/USER_HZ))
645 return -EINVAL; 643 return -EINVAL;
646
647 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
648 hrtimer_cancel(&leap_timer);
649 } 644 }
650 645
651 if (txc->modes & ADJ_SETOFFSET) { 646 if (txc->modes & ADJ_SETOFFSET) {
@@ -663,7 +658,7 @@ int do_adjtimex(struct timex *txc)
663 658
664 getnstimeofday(&ts); 659 getnstimeofday(&ts);
665 660
666 write_seqlock_irq(&xtime_lock); 661 spin_lock_irq(&ntp_lock);
667 662
668 if (txc->modes & ADJ_ADJTIME) { 663 if (txc->modes & ADJ_ADJTIME) {
669 long save_adjust = time_adjust; 664 long save_adjust = time_adjust;
@@ -705,7 +700,7 @@ int do_adjtimex(struct timex *txc)
705 /* fill PPS status fields */ 700 /* fill PPS status fields */
706 pps_fill_timex(txc); 701 pps_fill_timex(txc);
707 702
708 write_sequnlock_irq(&xtime_lock); 703 spin_unlock_irq(&ntp_lock);
709 704
710 txc->time.tv_sec = ts.tv_sec; 705 txc->time.tv_sec = ts.tv_sec;
711 txc->time.tv_usec = ts.tv_nsec; 706 txc->time.tv_usec = ts.tv_nsec;
@@ -903,7 +898,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
903 898
904 pts_norm = pps_normalize_ts(*phase_ts); 899 pts_norm = pps_normalize_ts(*phase_ts);
905 900
906 write_seqlock_irqsave(&xtime_lock, flags); 901 spin_lock_irqsave(&ntp_lock, flags);
907 902
908 /* clear the error bits, they will be set again if needed */ 903 /* clear the error bits, they will be set again if needed */
909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 904 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -916,7 +911,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
916 * just start the frequency interval */ 911 * just start the frequency interval */
917 if (unlikely(pps_fbase.tv_sec == 0)) { 912 if (unlikely(pps_fbase.tv_sec == 0)) {
918 pps_fbase = *raw_ts; 913 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags); 914 spin_unlock_irqrestore(&ntp_lock, flags);
920 return; 915 return;
921 } 916 }
922 917
@@ -931,7 +926,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
931 time_status |= STA_PPSJITTER; 926 time_status |= STA_PPSJITTER;
932 /* restart the frequency calibration interval */ 927 /* restart the frequency calibration interval */
933 pps_fbase = *raw_ts; 928 pps_fbase = *raw_ts;
934 write_sequnlock_irqrestore(&xtime_lock, flags); 929 spin_unlock_irqrestore(&ntp_lock, flags);
935 pr_err("hardpps: PPSJITTER: bad pulse\n"); 930 pr_err("hardpps: PPSJITTER: bad pulse\n");
936 return; 931 return;
937 } 932 }
@@ -948,7 +943,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
948 943
949 hardpps_update_phase(pts_norm.nsec); 944 hardpps_update_phase(pts_norm.nsec);
950 945
951 write_sequnlock_irqrestore(&xtime_lock, flags); 946 spin_unlock_irqrestore(&ntp_lock, flags);
952} 947}
953EXPORT_SYMBOL(hardpps); 948EXPORT_SYMBOL(hardpps);
954 949
@@ -967,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
967void __init ntp_init(void) 962void __init ntp_init(void)
968{ 963{
969 ntp_clear(); 964 ntp_clear();
970 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
971 leap_timer.function = ntp_leap_second;
972} 965}
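
The ntp.c changes above retire the leap_timer hrtimer and fold leap-second handling into second_overflow(), which now takes the current wall-clock seconds, decides from secs % 86400 whether a leap second is due, and returns the offset (-1, 0 or +1) for the caller to apply under ntp_lock. The sketch below is a simplified standalone model of that state machine only: it mirrors the TIME_* states from the hunk but omits the locking, TAI bookkeeping and printk messages.

/*
 * Simplified model of the leap-second state machine that the new
 * second_overflow(secs) implements.  Illustration, not kernel code.
 */
#include <stdio.h>

enum { TIME_OK, TIME_INS, TIME_DEL, TIME_OOP, TIME_WAIT };

static int time_state = TIME_OK;
static int sta_ins = 1;         /* pretend STA_INS was requested via adjtimex */
static int sta_del;

/* Called once per accumulated second with the new wall-clock seconds. */
static int leap_second_overflow(unsigned long secs)
{
        int leap = 0;

        switch (time_state) {
        case TIME_OK:
                if (sta_ins)
                        time_state = TIME_INS;
                else if (sta_del)
                        time_state = TIME_DEL;
                break;
        case TIME_INS:
                if (secs % 86400 == 0) {        /* midnight UTC */
                        leap = -1;              /* insert 23:59:60 */
                        time_state = TIME_OOP;
                }
                break;
        case TIME_DEL:
                if ((secs + 1) % 86400 == 0) {
                        leap = 1;               /* skip 23:59:59 */
                        time_state = TIME_WAIT;
                }
                break;
        case TIME_OOP:
                time_state = TIME_WAIT;
                break;
        case TIME_WAIT:
                if (!sta_ins && !sta_del)
                        time_state = TIME_OK;
                break;
        }
        return leap;
}

int main(void)
{
        /* Walk across a (pretend) midnight boundary. */
        for (unsigned long secs = 86398; secs <= 86402; secs++) {
                int leap = leap_second_overflow(secs);
                if (leap)
                        printf("at %lu: apply leap offset %d\n", secs, leap);
        }
        return 0;
}

Because the decision is driven by the seconds value passed in from the timekeeping accumulation loop, no separate absolute hrtimer has to be armed at midnight, which is exactly what lets the hunk delete leap_timer and ntp_start_leap_timer().
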
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index fd4a7b1625a2..e883f57a3cd3 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void)
575 unsigned long flags; 575 unsigned long flags;
576 576
577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
578 if (cpumask_empty(tick_get_broadcast_mask()))
579 goto end;
578 580
579 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 581 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
580 bc = tick_broadcast_device.evtdev; 582 bc = tick_broadcast_device.evtdev;
581 if (bc) 583 if (bc)
582 tick_broadcast_setup_oneshot(bc); 584 tick_broadcast_setup_oneshot(bc);
585
586end:
583 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 587 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
584} 588}
585 589
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7656642e4b8e..3526038f2836 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
182 182
183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
184{ 184{
185 ktime_t now; 185 ktime_t now = ktime_get();
186
187 now = ktime_get();
188
189 update_ts_time_stats(cpu, ts, now, NULL);
190 186
191 ts->idle_entrytime = now; 187 ts->idle_entrytime = now;
192 ts->idle_active = 1; 188 ts->idle_active = 1;
@@ -562,20 +558,21 @@ void tick_nohz_idle_exit(void)
562 558
563 local_irq_disable(); 559 local_irq_disable();
564 560
565 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 561 WARN_ON_ONCE(!ts->inidle);
562
563 ts->inidle = 0;
564
565 if (ts->idle_active || ts->tick_stopped)
566 now = ktime_get(); 566 now = ktime_get();
567 567
568 if (ts->idle_active) 568 if (ts->idle_active)
569 tick_nohz_stop_idle(cpu, now); 569 tick_nohz_stop_idle(cpu, now);
570 570
571 if (!ts->inidle || !ts->tick_stopped) { 571 if (!ts->tick_stopped) {
572 ts->inidle = 0;
573 local_irq_enable(); 572 local_irq_enable();
574 return; 573 return;
575 } 574 }
576 575
577 ts->inidle = 0;
578
579 /* Update jiffies first */ 576 /* Update jiffies first */
580 select_nohz_load_balancer(0); 577 select_nohz_load_balancer(0);
581 tick_do_update_jiffies64(now); 578 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0c6358186401..d66b21308f7c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,8 @@
25struct timekeeper { 25struct timekeeper {
26 /* Current clocksource used for timekeeping. */ 26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock; 27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
28 /* The shift value of the current clocksource. */ 30 /* The shift value of the current clocksource. */
29 int shift; 31 int shift;
30 32
@@ -45,12 +47,47 @@ struct timekeeper {
45 /* Shift conversion between clock shifted nano seconds and 47 /* Shift conversion between clock shifted nano seconds and
46 * ntp shifted nano seconds. */ 48 * ntp shifted nano seconds. */
47 int ntp_error_shift; 49 int ntp_error_shift;
48 /* NTP adjusted clock multiplier */ 50
49 u32 mult; 51 /* The current time */
52 struct timespec xtime;
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72
73 /* Seqlock for all timekeeper values */
74 seqlock_t lock;
50}; 75};
51 76
52static struct timekeeper timekeeper; 77static struct timekeeper timekeeper;
53 78
79/*
80 * This read-write spinlock protects us from races in SMP while
81 * playing with xtime.
82 */
83__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
84
85
86/* flag for if timekeeping is suspended */
87int __read_mostly timekeeping_suspended;
88
89
90
54/** 91/**
55 * timekeeper_setup_internals - Set up internals to use clocksource clock. 92 * timekeeper_setup_internals - Set up internals to use clocksource clock.
56 * 93 *
@@ -135,49 +172,18 @@ static inline s64 timekeeping_get_ns_raw(void)
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 173}
137 174
138/* 175/* must hold write on timekeeper.lock */
139 * This read-write spinlock protects us from races in SMP while 176static void timekeeping_update(bool clearntp)
140 * playing with xtime.
141 */
142__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
143
144
145/*
146 * The current time
147 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
148 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
149 * at zero at system boot time, so wall_to_monotonic will be negative,
150 * however, we will ALWAYS keep the tv_nsec part positive so we can use
151 * the usual normalization.
152 *
153 * wall_to_monotonic is moved after resume from suspend for the monotonic
154 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
155 * to get the real boot based time offset.
156 *
157 * - wall_to_monotonic is no longer the boot time, getboottime must be
158 * used instead.
159 */
160static struct timespec xtime __attribute__ ((aligned (16)));
161static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
162static struct timespec total_sleep_time;
163
164/*
165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
166 */
167static struct timespec raw_time;
168
169/* flag for if timekeeping is suspended */
170int __read_mostly timekeeping_suspended;
171
172/* must hold xtime_lock */
173void timekeeping_leap_insert(int leapsecond)
174{ 177{
175 xtime.tv_sec += leapsecond; 178 if (clearntp) {
176 wall_to_monotonic.tv_sec -= leapsecond; 179 timekeeper.ntp_error = 0;
177 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 180 ntp_clear();
178 timekeeper.mult); 181 }
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult);
179} 184}
180 185
186
181/** 187/**
182 * timekeeping_forward_now - update clock to the current time 188 * timekeeping_forward_now - update clock to the current time
183 * 189 *
@@ -202,10 +208,10 @@ static void timekeeping_forward_now(void)
202 /* If arch requires, add in gettimeoffset() */ 208 /* If arch requires, add in gettimeoffset() */
203 nsec += arch_gettimeoffset(); 209 nsec += arch_gettimeoffset();
204 210
205 timespec_add_ns(&xtime, nsec); 211 timespec_add_ns(&timekeeper.xtime, nsec);
206 212
207 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 213 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
208 timespec_add_ns(&raw_time, nsec); 214 timespec_add_ns(&timekeeper.raw_time, nsec);
209} 215}
210 216
211/** 217/**
@@ -222,15 +228,15 @@ void getnstimeofday(struct timespec *ts)
222 WARN_ON(timekeeping_suspended); 228 WARN_ON(timekeeping_suspended);
223 229
224 do { 230 do {
225 seq = read_seqbegin(&xtime_lock); 231 seq = read_seqbegin(&timekeeper.lock);
226 232
227 *ts = xtime; 233 *ts = timekeeper.xtime;
228 nsecs = timekeeping_get_ns(); 234 nsecs = timekeeping_get_ns();
229 235
230 /* If arch requires, add in gettimeoffset() */ 236 /* If arch requires, add in gettimeoffset() */
231 nsecs += arch_gettimeoffset(); 237 nsecs += arch_gettimeoffset();
232 238
233 } while (read_seqretry(&xtime_lock, seq)); 239 } while (read_seqretry(&timekeeper.lock, seq));
234 240
235 timespec_add_ns(ts, nsecs); 241 timespec_add_ns(ts, nsecs);
236} 242}
@@ -245,14 +251,16 @@ ktime_t ktime_get(void)
245 WARN_ON(timekeeping_suspended); 251 WARN_ON(timekeeping_suspended);
246 252
247 do { 253 do {
248 seq = read_seqbegin(&xtime_lock); 254 seq = read_seqbegin(&timekeeper.lock);
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 255 secs = timekeeper.xtime.tv_sec +
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 256 timekeeper.wall_to_monotonic.tv_sec;
257 nsecs = timekeeper.xtime.tv_nsec +
258 timekeeper.wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 259 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */ 260 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset(); 261 nsecs += arch_gettimeoffset();
254 262
255 } while (read_seqretry(&xtime_lock, seq)); 263 } while (read_seqretry(&timekeeper.lock, seq));
256 /* 264 /*
257 * Use ktime_set/ktime_add_ns to create a proper ktime on 265 * Use ktime_set/ktime_add_ns to create a proper ktime on
258 * 32-bit architectures without CONFIG_KTIME_SCALAR. 266 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -278,14 +286,14 @@ void ktime_get_ts(struct timespec *ts)
278 WARN_ON(timekeeping_suspended); 286 WARN_ON(timekeeping_suspended);
279 287
280 do { 288 do {
281 seq = read_seqbegin(&xtime_lock); 289 seq = read_seqbegin(&timekeeper.lock);
282 *ts = xtime; 290 *ts = timekeeper.xtime;
283 tomono = wall_to_monotonic; 291 tomono = timekeeper.wall_to_monotonic;
284 nsecs = timekeeping_get_ns(); 292 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */ 293 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset(); 294 nsecs += arch_gettimeoffset();
287 295
288 } while (read_seqretry(&xtime_lock, seq)); 296 } while (read_seqretry(&timekeeper.lock, seq));
289 297
290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 298 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
291 ts->tv_nsec + tomono.tv_nsec + nsecs); 299 ts->tv_nsec + tomono.tv_nsec + nsecs);
@@ -313,10 +321,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
313 do { 321 do {
314 u32 arch_offset; 322 u32 arch_offset;
315 323
316 seq = read_seqbegin(&xtime_lock); 324 seq = read_seqbegin(&timekeeper.lock);
317 325
318 *ts_raw = raw_time; 326 *ts_raw = timekeeper.raw_time;
319 *ts_real = xtime; 327 *ts_real = timekeeper.xtime;
320 328
321 nsecs_raw = timekeeping_get_ns_raw(); 329 nsecs_raw = timekeeping_get_ns_raw();
322 nsecs_real = timekeeping_get_ns(); 330 nsecs_real = timekeeping_get_ns();
@@ -326,7 +334,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
326 nsecs_raw += arch_offset; 334 nsecs_raw += arch_offset;
327 nsecs_real += arch_offset; 335 nsecs_real += arch_offset;
328 336
329 } while (read_seqretry(&xtime_lock, seq)); 337 } while (read_seqretry(&timekeeper.lock, seq));
330 338
331 timespec_add_ns(ts_raw, nsecs_raw); 339 timespec_add_ns(ts_raw, nsecs_raw);
332 timespec_add_ns(ts_real, nsecs_real); 340 timespec_add_ns(ts_real, nsecs_real);
@@ -365,23 +373,19 @@ int do_settimeofday(const struct timespec *tv)
365 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 373 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
366 return -EINVAL; 374 return -EINVAL;
367 375
368 write_seqlock_irqsave(&xtime_lock, flags); 376 write_seqlock_irqsave(&timekeeper.lock, flags);
369 377
370 timekeeping_forward_now(); 378 timekeeping_forward_now();
371 379
372 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 380 ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
373 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 381 ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
374 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); 382 timekeeper.wall_to_monotonic =
383 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
375 384
376 xtime = *tv; 385 timekeeper.xtime = *tv;
377 386 timekeeping_update(true);
378 timekeeper.ntp_error = 0;
379 ntp_clear();
380 387
381 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 388 write_sequnlock_irqrestore(&timekeeper.lock, flags);
382 timekeeper.mult);
383
384 write_sequnlock_irqrestore(&xtime_lock, flags);
385 389
386 /* signal hrtimers about time change */ 390 /* signal hrtimers about time change */
387 clock_was_set(); 391 clock_was_set();
@@ -405,20 +409,17 @@ int timekeeping_inject_offset(struct timespec *ts)
405 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 409 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
406 return -EINVAL; 410 return -EINVAL;
407 411
408 write_seqlock_irqsave(&xtime_lock, flags); 412 write_seqlock_irqsave(&timekeeper.lock, flags);
409 413
410 timekeeping_forward_now(); 414 timekeeping_forward_now();
411 415
412 xtime = timespec_add(xtime, *ts); 416 timekeeper.xtime = timespec_add(timekeeper.xtime, *ts);
413 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); 417 timekeeper.wall_to_monotonic =
414 418 timespec_sub(timekeeper.wall_to_monotonic, *ts);
415 timekeeper.ntp_error = 0;
416 ntp_clear();
417 419
418 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 420 timekeeping_update(true);
419 timekeeper.mult);
420 421
421 write_sequnlock_irqrestore(&xtime_lock, flags); 422 write_sequnlock_irqrestore(&timekeeper.lock, flags);
422 423
423 /* signal hrtimers about time change */ 424 /* signal hrtimers about time change */
424 clock_was_set(); 425 clock_was_set();
@@ -435,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
435static int change_clocksource(void *data) 436static int change_clocksource(void *data)
436{ 437{
437 struct clocksource *new, *old; 438 struct clocksource *new, *old;
439 unsigned long flags;
438 440
439 new = (struct clocksource *) data; 441 new = (struct clocksource *) data;
440 442
443 write_seqlock_irqsave(&timekeeper.lock, flags);
444
441 timekeeping_forward_now(); 445 timekeeping_forward_now();
442 if (!new->enable || new->enable(new) == 0) { 446 if (!new->enable || new->enable(new) == 0) {
443 old = timekeeper.clock; 447 old = timekeeper.clock;
@@ -445,6 +449,10 @@ static int change_clocksource(void *data)
445 if (old->disable) 449 if (old->disable)
446 old->disable(old); 450 old->disable(old);
447 } 451 }
452 timekeeping_update(true);
453
454 write_sequnlock_irqrestore(&timekeeper.lock, flags);
455
448 return 0; 456 return 0;
449} 457}
450 458
@@ -490,11 +498,11 @@ void getrawmonotonic(struct timespec *ts)
490 s64 nsecs; 498 s64 nsecs;
491 499
492 do { 500 do {
493 seq = read_seqbegin(&xtime_lock); 501 seq = read_seqbegin(&timekeeper.lock);
494 nsecs = timekeeping_get_ns_raw(); 502 nsecs = timekeeping_get_ns_raw();
495 *ts = raw_time; 503 *ts = timekeeper.raw_time;
496 504
497 } while (read_seqretry(&xtime_lock, seq)); 505 } while (read_seqretry(&timekeeper.lock, seq));
498 506
499 timespec_add_ns(ts, nsecs); 507 timespec_add_ns(ts, nsecs);
500} 508}
@@ -510,24 +518,30 @@ int timekeeping_valid_for_hres(void)
510 int ret; 518 int ret;
511 519
512 do { 520 do {
513 seq = read_seqbegin(&xtime_lock); 521 seq = read_seqbegin(&timekeeper.lock);
514 522
515 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 523 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
516 524
517 } while (read_seqretry(&xtime_lock, seq)); 525 } while (read_seqretry(&timekeeper.lock, seq));
518 526
519 return ret; 527 return ret;
520} 528}
521 529
522/** 530/**
523 * timekeeping_max_deferment - Returns max time the clocksource can be deferred 531 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
524 *
525 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
526 * ensure that the clocksource does not change!
527 */ 532 */
528u64 timekeeping_max_deferment(void) 533u64 timekeeping_max_deferment(void)
529{ 534{
530 return timekeeper.clock->max_idle_ns; 535 unsigned long seq;
536 u64 ret;
537 do {
538 seq = read_seqbegin(&timekeeper.lock);
539
540 ret = timekeeper.clock->max_idle_ns;
541
542 } while (read_seqretry(&timekeeper.lock, seq));
543
544 return ret;
531} 545}
532 546
533/** 547/**
@@ -572,28 +586,29 @@ void __init timekeeping_init(void)
572 read_persistent_clock(&now); 586 read_persistent_clock(&now);
573 read_boot_clock(&boot); 587 read_boot_clock(&boot);
574 588
575 write_seqlock_irqsave(&xtime_lock, flags); 589 seqlock_init(&timekeeper.lock);
576 590
577 ntp_init(); 591 ntp_init();
578 592
593 write_seqlock_irqsave(&timekeeper.lock, flags);
579 clock = clocksource_default_clock(); 594 clock = clocksource_default_clock();
580 if (clock->enable) 595 if (clock->enable)
581 clock->enable(clock); 596 clock->enable(clock);
582 timekeeper_setup_internals(clock); 597 timekeeper_setup_internals(clock);
583 598
584 xtime.tv_sec = now.tv_sec; 599 timekeeper.xtime.tv_sec = now.tv_sec;
585 xtime.tv_nsec = now.tv_nsec; 600 timekeeper.xtime.tv_nsec = now.tv_nsec;
586 raw_time.tv_sec = 0; 601 timekeeper.raw_time.tv_sec = 0;
587 raw_time.tv_nsec = 0; 602 timekeeper.raw_time.tv_nsec = 0;
588 if (boot.tv_sec == 0 && boot.tv_nsec == 0) { 603 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
589 boot.tv_sec = xtime.tv_sec; 604 boot.tv_sec = timekeeper.xtime.tv_sec;
590 boot.tv_nsec = xtime.tv_nsec; 605 boot.tv_nsec = timekeeper.xtime.tv_nsec;
591 } 606 }
592 set_normalized_timespec(&wall_to_monotonic, 607 set_normalized_timespec(&timekeeper.wall_to_monotonic,
593 -boot.tv_sec, -boot.tv_nsec); 608 -boot.tv_sec, -boot.tv_nsec);
594 total_sleep_time.tv_sec = 0; 609 timekeeper.total_sleep_time.tv_sec = 0;
595 total_sleep_time.tv_nsec = 0; 610 timekeeper.total_sleep_time.tv_nsec = 0;
596 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&timekeeper.lock, flags);
597} 612}
598 613
599/* time in seconds when suspend began */ 614/* time in seconds when suspend began */
@@ -614,9 +629,11 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
614 return; 629 return;
615 } 630 }
616 631
617 xtime = timespec_add(xtime, *delta); 632 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 633 timekeeper.wall_to_monotonic =
619 total_sleep_time = timespec_add(total_sleep_time, *delta); 634 timespec_sub(timekeeper.wall_to_monotonic, *delta);
635 timekeeper.total_sleep_time = timespec_add(
636 timekeeper.total_sleep_time, *delta);
620} 637}
621 638
622 639
@@ -640,17 +657,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
640 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 657 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
641 return; 658 return;
642 659
643 write_seqlock_irqsave(&xtime_lock, flags); 660 write_seqlock_irqsave(&timekeeper.lock, flags);
661
644 timekeeping_forward_now(); 662 timekeeping_forward_now();
645 663
646 __timekeeping_inject_sleeptime(delta); 664 __timekeeping_inject_sleeptime(delta);
647 665
648 timekeeper.ntp_error = 0; 666 timekeeping_update(true);
649 ntp_clear();
650 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
651 timekeeper.mult);
652 667
653 write_sequnlock_irqrestore(&xtime_lock, flags); 668 write_sequnlock_irqrestore(&timekeeper.lock, flags);
654 669
655 /* signal hrtimers about time change */ 670 /* signal hrtimers about time change */
656 clock_was_set(); 671 clock_was_set();
@@ -673,7 +688,7 @@ static void timekeeping_resume(void)
673 688
674 clocksource_resume(); 689 clocksource_resume();
675 690
676 write_seqlock_irqsave(&xtime_lock, flags); 691 write_seqlock_irqsave(&timekeeper.lock, flags);
677 692
678 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 693 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
679 ts = timespec_sub(ts, timekeeping_suspend_time); 694 ts = timespec_sub(ts, timekeeping_suspend_time);
@@ -683,7 +698,7 @@ static void timekeeping_resume(void)
683 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 698 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
684 timekeeper.ntp_error = 0; 699 timekeeper.ntp_error = 0;
685 timekeeping_suspended = 0; 700 timekeeping_suspended = 0;
686 write_sequnlock_irqrestore(&xtime_lock, flags); 701 write_sequnlock_irqrestore(&timekeeper.lock, flags);
687 702
688 touch_softlockup_watchdog(); 703 touch_softlockup_watchdog();
689 704
@@ -701,7 +716,7 @@ static int timekeeping_suspend(void)
701 716
702 read_persistent_clock(&timekeeping_suspend_time); 717 read_persistent_clock(&timekeeping_suspend_time);
703 718
704 write_seqlock_irqsave(&xtime_lock, flags); 719 write_seqlock_irqsave(&timekeeper.lock, flags);
705 timekeeping_forward_now(); 720 timekeeping_forward_now();
706 timekeeping_suspended = 1; 721 timekeeping_suspended = 1;
707 722
@@ -711,7 +726,7 @@ static int timekeeping_suspend(void)
711 * try to compensate so the difference in system time 726 * try to compensate so the difference in system time
712 * and persistent_clock time stays close to constant. 727 * and persistent_clock time stays close to constant.
713 */ 728 */
714 delta = timespec_sub(xtime, timekeeping_suspend_time); 729 delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);
715 delta_delta = timespec_sub(delta, old_delta); 730 delta_delta = timespec_sub(delta, old_delta);
716 if (abs(delta_delta.tv_sec) >= 2) { 731 if (abs(delta_delta.tv_sec) >= 2) {
717 /* 732 /*
@@ -724,7 +739,7 @@ static int timekeeping_suspend(void)
724 timekeeping_suspend_time = 739 timekeeping_suspend_time =
725 timespec_add(timekeeping_suspend_time, delta_delta); 740 timespec_add(timekeeping_suspend_time, delta_delta);
726 } 741 }
727 write_sequnlock_irqrestore(&xtime_lock, flags); 742 write_sequnlock_irqrestore(&timekeeper.lock, flags);
728 743
729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 744 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
730 clocksource_suspend(); 745 clocksource_suspend();
@@ -775,7 +790,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
775 * Now calculate the error in (1 << look_ahead) ticks, but first 790 * Now calculate the error in (1 << look_ahead) ticks, but first
776 * remove the single look ahead already included in the error. 791 * remove the single look ahead already included in the error.
777 */ 792 */
778 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); 793 tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);
779 tick_error -= timekeeper.xtime_interval >> 1; 794 tick_error -= timekeeper.xtime_interval >> 1;
780 error = ((error - tick_error) >> look_ahead) + tick_error; 795 error = ((error - tick_error) >> look_ahead) + tick_error;
781 796
@@ -807,7 +822,7 @@ static void timekeeping_adjust(s64 offset)
807 int adj; 822 int adj;
808 823
809 /* 824 /*
810 * The point of this is to check if the error is greater then half 825 * The point of this is to check if the error is greater than half
811 * an interval. 826 * an interval.
812 * 827 *
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 828 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
@@ -815,7 +830,7 @@ static void timekeeping_adjust(s64 offset)
815 * Note we subtract one in the shift, so that error is really error*2. 830 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) interval twice, but keeps the 831 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparison as still measuring if error is 832 * (error > interval) comparison as still measuring if error is
818 * larger then half an interval. 833 * larger than half an interval.
819 * 834 *
820 * Note: It does not "save" on aggravation when reading the code. 835 * Note: It does not "save" on aggravation when reading the code.
821 */ 836 */
@@ -823,7 +838,7 @@ static void timekeeping_adjust(s64 offset)
823 if (error > interval) { 838 if (error > interval) {
824 /* 839 /*
825 * We now divide error by 4(via shift), which checks if 840 * We now divide error by 4(via shift), which checks if
826 * the error is greater then twice the interval. 841 * the error is greater than twice the interval.
827 * If it is greater, we need a bigadjust, if its smaller, 842 * If it is greater, we need a bigadjust, if its smaller,
828 * we can adjust by 1. 843 * we can adjust by 1.
829 */ 844 */
@@ -854,13 +869,15 @@ static void timekeeping_adjust(s64 offset)
854 } else /* No adjustment needed */ 869 } else /* No adjustment needed */
855 return; 870 return;
856 871
857 WARN_ONCE(timekeeper.clock->maxadj && 872 if (unlikely(timekeeper.clock->maxadj &&
858 (timekeeper.mult + adj > timekeeper.clock->mult + 873 (timekeeper.mult + adj >
859 timekeeper.clock->maxadj), 874 timekeeper.clock->mult + timekeeper.clock->maxadj))) {
860 "Adjusting %s more then 11%% (%ld vs %ld)\n", 875 printk_once(KERN_WARNING
876 "Adjusting %s more than 11%% (%ld vs %ld)\n",
861 timekeeper.clock->name, (long)timekeeper.mult + adj, 877 timekeeper.clock->name, (long)timekeeper.mult + adj,
862 (long)timekeeper.clock->mult + 878 (long)timekeeper.clock->mult +
863 timekeeper.clock->maxadj); 879 timekeeper.clock->maxadj);
880 }
864 /* 881 /*
865 * So the following can be confusing. 882 * So the following can be confusing.
866 * 883 *
@@ -932,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
932 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 949 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
933 u64 raw_nsecs; 950 u64 raw_nsecs;
934 951
935 /* If the offset is smaller then a shifted interval, do nothing */ 952 /* If the offset is smaller than a shifted interval, do nothing */
936 if (offset < timekeeper.cycle_interval<<shift) 953 if (offset < timekeeper.cycle_interval<<shift)
937 return offset; 954 return offset;
938 955
@@ -942,23 +959,25 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
942 959
943 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 960 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
944 while (timekeeper.xtime_nsec >= nsecps) { 961 while (timekeeper.xtime_nsec >= nsecps) {
962 int leap;
945 timekeeper.xtime_nsec -= nsecps; 963 timekeeper.xtime_nsec -= nsecps;
946 xtime.tv_sec++; 964 timekeeper.xtime.tv_sec++;
947 second_overflow(); 965 leap = second_overflow(timekeeper.xtime.tv_sec);
966 timekeeper.xtime.tv_sec += leap;
948 } 967 }
949 968
950 /* Accumulate raw time */ 969 /* Accumulate raw time */
951 raw_nsecs = timekeeper.raw_interval << shift; 970 raw_nsecs = timekeeper.raw_interval << shift;
952 raw_nsecs += raw_time.tv_nsec; 971 raw_nsecs += timekeeper.raw_time.tv_nsec;
953 if (raw_nsecs >= NSEC_PER_SEC) { 972 if (raw_nsecs >= NSEC_PER_SEC) {
954 u64 raw_secs = raw_nsecs; 973 u64 raw_secs = raw_nsecs;
955 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 974 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
956 raw_time.tv_sec += raw_secs; 975 timekeeper.raw_time.tv_sec += raw_secs;
957 } 976 }
958 raw_time.tv_nsec = raw_nsecs; 977 timekeeper.raw_time.tv_nsec = raw_nsecs;
959 978
960 /* Accumulate error between NTP and clock interval */ 979 /* Accumulate error between NTP and clock interval */
961 timekeeper.ntp_error += tick_length << shift; 980 timekeeper.ntp_error += ntp_tick_length() << shift;
962 timekeeper.ntp_error -= 981 timekeeper.ntp_error -=
963 (timekeeper.xtime_interval + timekeeper.xtime_remainder) << 982 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
964 (timekeeper.ntp_error_shift + shift); 983 (timekeeper.ntp_error_shift + shift);
@@ -970,17 +989,19 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
970/** 989/**
971 * update_wall_time - Uses the current clocksource to increment the wall time 990 * update_wall_time - Uses the current clocksource to increment the wall time
972 * 991 *
973 * Called from the timer interrupt, must hold a write on xtime_lock.
974 */ 992 */
975static void update_wall_time(void) 993static void update_wall_time(void)
976{ 994{
977 struct clocksource *clock; 995 struct clocksource *clock;
978 cycle_t offset; 996 cycle_t offset;
979 int shift = 0, maxshift; 997 int shift = 0, maxshift;
998 unsigned long flags;
999
1000 write_seqlock_irqsave(&timekeeper.lock, flags);
980 1001
981 /* Make sure we're fully resumed: */ 1002 /* Make sure we're fully resumed: */
982 if (unlikely(timekeeping_suspended)) 1003 if (unlikely(timekeeping_suspended))
983 return; 1004 goto out;
984 1005
985 clock = timekeeper.clock; 1006 clock = timekeeper.clock;
986 1007
@@ -989,20 +1010,21 @@ static void update_wall_time(void)
989#else 1010#else
990 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1011 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
991#endif 1012#endif
992 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 1013 timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
1014 timekeeper.shift;
993 1015
994 /* 1016 /*
995 * With NO_HZ we may have to accumulate many cycle_intervals 1017 * With NO_HZ we may have to accumulate many cycle_intervals
996 * (think "ticks") worth of time at once. To do this efficiently, 1018 * (think "ticks") worth of time at once. To do this efficiently,
997 * we calculate the largest doubling multiple of cycle_intervals 1019 * we calculate the largest doubling multiple of cycle_intervals
998 * that is smaller then the offset. We then accumulate that 1020 * that is smaller than the offset. We then accumulate that
999 * chunk in one go, and then try to consume the next smaller 1021 * chunk in one go, and then try to consume the next smaller
1000 * doubled multiple. 1022 * doubled multiple.
1001 */ 1023 */
1002 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1024 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
1003 shift = max(0, shift); 1025 shift = max(0, shift);
1004 /* Bound shift to one less then what overflows tick_length */ 1026 /* Bound shift to one less than what overflows tick_length */
1005 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; 1027 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1006 shift = min(shift, maxshift); 1028 shift = min(shift, maxshift);
1007 while (offset >= timekeeper.cycle_interval) { 1029 while (offset >= timekeeper.cycle_interval) {
1008 offset = logarithmic_accumulation(offset, shift); 1030 offset = logarithmic_accumulation(offset, shift);
@@ -1040,24 +1062,30 @@ static void update_wall_time(void)
1040 * Store full nanoseconds into xtime after rounding it up and 1062 * Store full nanoseconds into xtime after rounding it up and
1041 * add the remainder to the error difference. 1063 * add the remainder to the error difference.
1042 */ 1064 */
1043 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 1065 timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >>
1044 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; 1066 timekeeper.shift) + 1;
1067 timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec <<
1068 timekeeper.shift;
1045 timekeeper.ntp_error += timekeeper.xtime_nsec << 1069 timekeeper.ntp_error += timekeeper.xtime_nsec <<
1046 timekeeper.ntp_error_shift; 1070 timekeeper.ntp_error_shift;
1047 1071
1048 /* 1072 /*
1049 * Finally, make sure that after the rounding 1073 * Finally, make sure that after the rounding
1050 * xtime.tv_nsec isn't larger then NSEC_PER_SEC 1074 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
1051 */ 1075 */
1052 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { 1076 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
1053 xtime.tv_nsec -= NSEC_PER_SEC; 1077 int leap;
1054 xtime.tv_sec++; 1078 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
1055 second_overflow(); 1079 timekeeper.xtime.tv_sec++;
1080 leap = second_overflow(timekeeper.xtime.tv_sec);
1081 timekeeper.xtime.tv_sec += leap;
1056 } 1082 }
1057 1083
1058 /* check to see if there is a new clocksource to use */ 1084 timekeeping_update(false);
1059 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 1085
1060 timekeeper.mult); 1086out:
1087 write_sequnlock_irqrestore(&timekeeper.lock, flags);
1088
1061} 1089}
1062 1090
1063/** 1091/**
@@ -1074,8 +1102,10 @@ static void update_wall_time(void)
1074void getboottime(struct timespec *ts) 1102void getboottime(struct timespec *ts)
1075{ 1103{
1076 struct timespec boottime = { 1104 struct timespec boottime = {
1077 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, 1105 .tv_sec = timekeeper.wall_to_monotonic.tv_sec +
1078 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec 1106 timekeeper.total_sleep_time.tv_sec,
1107 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec +
1108 timekeeper.total_sleep_time.tv_nsec
1079 }; 1109 };
1080 1110
1081 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1111 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
@@ -1101,13 +1131,13 @@ void get_monotonic_boottime(struct timespec *ts)
1101 WARN_ON(timekeeping_suspended); 1131 WARN_ON(timekeeping_suspended);
1102 1132
1103 do { 1133 do {
1104 seq = read_seqbegin(&xtime_lock); 1134 seq = read_seqbegin(&timekeeper.lock);
1105 *ts = xtime; 1135 *ts = timekeeper.xtime;
1106 tomono = wall_to_monotonic; 1136 tomono = timekeeper.wall_to_monotonic;
1107 sleep = total_sleep_time; 1137 sleep = timekeeper.total_sleep_time;
1108 nsecs = timekeeping_get_ns(); 1138 nsecs = timekeeping_get_ns();
1109 1139
1110 } while (read_seqretry(&xtime_lock, seq)); 1140 } while (read_seqretry(&timekeeper.lock, seq));
1111 1141
1112 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1142 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1113 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); 1143 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
@@ -1137,19 +1167,19 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1137 */ 1167 */
1138void monotonic_to_bootbased(struct timespec *ts) 1168void monotonic_to_bootbased(struct timespec *ts)
1139{ 1169{
1140 *ts = timespec_add(*ts, total_sleep_time); 1170 *ts = timespec_add(*ts, timekeeper.total_sleep_time);
1141} 1171}
1142EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1172EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1143 1173
1144unsigned long get_seconds(void) 1174unsigned long get_seconds(void)
1145{ 1175{
1146 return xtime.tv_sec; 1176 return timekeeper.xtime.tv_sec;
1147} 1177}
1148EXPORT_SYMBOL(get_seconds); 1178EXPORT_SYMBOL(get_seconds);
1149 1179
1150struct timespec __current_kernel_time(void) 1180struct timespec __current_kernel_time(void)
1151{ 1181{
1152 return xtime; 1182 return timekeeper.xtime;
1153} 1183}
1154 1184
1155struct timespec current_kernel_time(void) 1185struct timespec current_kernel_time(void)
@@ -1158,10 +1188,10 @@ struct timespec current_kernel_time(void)
1158 unsigned long seq; 1188 unsigned long seq;
1159 1189
1160 do { 1190 do {
1161 seq = read_seqbegin(&xtime_lock); 1191 seq = read_seqbegin(&timekeeper.lock);
1162 1192
1163 now = xtime; 1193 now = timekeeper.xtime;
1164 } while (read_seqretry(&xtime_lock, seq)); 1194 } while (read_seqretry(&timekeeper.lock, seq));
1165 1195
1166 return now; 1196 return now;
1167} 1197}
@@ -1173,11 +1203,11 @@ struct timespec get_monotonic_coarse(void)
1173 unsigned long seq; 1203 unsigned long seq;
1174 1204
1175 do { 1205 do {
1176 seq = read_seqbegin(&xtime_lock); 1206 seq = read_seqbegin(&timekeeper.lock);
1177 1207
1178 now = xtime; 1208 now = timekeeper.xtime;
1179 mono = wall_to_monotonic; 1209 mono = timekeeper.wall_to_monotonic;
1180 } while (read_seqretry(&xtime_lock, seq)); 1210 } while (read_seqretry(&timekeeper.lock, seq));
1181 1211
1182 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1212 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1183 now.tv_nsec + mono.tv_nsec); 1213 now.tv_nsec + mono.tv_nsec);
@@ -1209,11 +1239,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1209 unsigned long seq; 1239 unsigned long seq;
1210 1240
1211 do { 1241 do {
1212 seq = read_seqbegin(&xtime_lock); 1242 seq = read_seqbegin(&timekeeper.lock);
1213 *xtim = xtime; 1243 *xtim = timekeeper.xtime;
1214 *wtom = wall_to_monotonic; 1244 *wtom = timekeeper.wall_to_monotonic;
1215 *sleep = total_sleep_time; 1245 *sleep = timekeeper.total_sleep_time;
1216 } while (read_seqretry(&xtime_lock, seq)); 1246 } while (read_seqretry(&timekeeper.lock, seq));
1217} 1247}
1218 1248
1219/** 1249/**
@@ -1225,11 +1255,14 @@ ktime_t ktime_get_monotonic_offset(void)
1225 struct timespec wtom; 1255 struct timespec wtom;
1226 1256
1227 do { 1257 do {
1228 seq = read_seqbegin(&xtime_lock); 1258 seq = read_seqbegin(&timekeeper.lock);
1229 wtom = wall_to_monotonic; 1259 wtom = timekeeper.wall_to_monotonic;
1230 } while (read_seqretry(&xtime_lock, seq)); 1260 } while (read_seqretry(&timekeeper.lock, seq));
1261
1231 return timespec_to_ktime(wtom); 1262 return timespec_to_ktime(wtom);
1232} 1263}
1264EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1265
1233 1266
1234/** 1267/**
1235 * xtime_update() - advances the timekeeping infrastructure 1268 * xtime_update() - advances the timekeeping infrastructure
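
The timekeeping.c hunks move xtime, wall_to_monotonic, raw_time and total_sleep_time into struct timekeeper and protect them with timekeeper.lock: writers take write_seqlock_irqsave(), readers loop with read_seqbegin()/read_seqretry() until they get a snapshot no writer raced with. The sketch below is a userspace model of that reader retry pattern built from C11 atomics; it is only an illustration of the protocol (a real seqlock_t also embeds a spinlock to serialise writers, and the field names here are invented).

/* Userspace model of the read_seqbegin()/read_seqretry() pattern. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;                 /* even: stable, odd: write in progress */
static atomic_long fake_xtime_sec;      /* "protected" data ...              */
static atomic_long fake_wtom_sec;       /* ... kept equal to -fake_xtime_sec */

static unsigned int read_seqbegin_model(void)
{
        unsigned int s;

        while ((s = atomic_load(&seq)) & 1)     /* writer active: wait */
                ;
        return s;
}

static int read_seqretry_model(unsigned int start)
{
        return atomic_load(&seq) != start;      /* nonzero means retry */
}

static void *writer(void *arg)
{
        (void)arg;
        for (long i = 1; i <= 100000; i++) {
                atomic_fetch_add(&seq, 1);              /* write_seqlock()   */
                atomic_store(&fake_xtime_sec, i);
                atomic_store(&fake_wtom_sec, -i);
                atomic_fetch_add(&seq, 1);              /* write_sequnlock() */
        }
        return NULL;
}

int main(void)
{
        pthread_t tid;
        unsigned int start;
        long xt, wtom;

        pthread_create(&tid, NULL, writer, NULL);

        for (int i = 0; i < 100000; i++) {
                do {
                        start = read_seqbegin_model();
                        xt = atomic_load(&fake_xtime_sec);
                        wtom = atomic_load(&fake_wtom_sec);
                } while (read_seqretry_model(start));

                if (xt + wtom != 0) {           /* would mean a torn snapshot */
                        fprintf(stderr, "inconsistent snapshot!\n");
                        return 1;
                }
        }

        pthread_join(tid, NULL);
        printf("all reader snapshots were consistent\n");
        return 0;
}

Readers never block writers, which is why the hot paths above (getnstimeofday, ktime_get, current_kernel_time) can stay lock-free while the timekeeper gains its own lock independent of xtime_lock.
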
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cd3134510f3d..a1d2849f2473 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE 144 select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
145 select KALLSYMS 145 select KALLSYMS
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 683d559a0eef..0fa92f677c92 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,6 +62,8 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66
65/* ftrace_enabled is a method to turn ftrace on or off */ 67/* ftrace_enabled is a method to turn ftrace on or off */
66int ftrace_enabled __read_mostly; 68int ftrace_enabled __read_mostly;
67static int last_ftrace_enabled; 69static int last_ftrace_enabled;
@@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
89}; 91};
90 92
91static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 93static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
94static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
92static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 95static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
93ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 96ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
94static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; 97static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
95ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 98ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
96ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 99ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
97static struct ftrace_ops global_ops; 100static struct ftrace_ops global_ops;
101static struct ftrace_ops control_ops;
98 102
99static void 103static void
100ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); 104ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
@@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
168} 172}
169#endif 173#endif
170 174
175static void control_ops_disable_all(struct ftrace_ops *ops)
176{
177 int cpu;
178
179 for_each_possible_cpu(cpu)
180 *per_cpu_ptr(ops->disabled, cpu) = 1;
181}
182
183static int control_ops_alloc(struct ftrace_ops *ops)
184{
185 int __percpu *disabled;
186
187 disabled = alloc_percpu(int);
188 if (!disabled)
189 return -ENOMEM;
190
191 ops->disabled = disabled;
192 control_ops_disable_all(ops);
193 return 0;
194}
195
196static void control_ops_free(struct ftrace_ops *ops)
197{
198 free_percpu(ops->disabled);
199}
200
171static void update_global_ops(void) 201static void update_global_ops(void)
172{ 202{
173 ftrace_func_t func; 203 ftrace_func_t func;
@@ -219,7 +249,8 @@ static void update_ftrace_function(void)
219#else 249#else
220 __ftrace_trace_function = func; 250 __ftrace_trace_function = func;
221#endif 251#endif
222 ftrace_trace_function = ftrace_test_stop_func; 252 ftrace_trace_function =
253 (func == ftrace_stub) ? func : ftrace_test_stop_func;
223#endif 254#endif
224} 255}
225 256
@@ -259,6 +290,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
259 return 0; 290 return 0;
260} 291}
261 292
293static void add_ftrace_list_ops(struct ftrace_ops **list,
294 struct ftrace_ops *main_ops,
295 struct ftrace_ops *ops)
296{
297 int first = *list == &ftrace_list_end;
298 add_ftrace_ops(list, ops);
299 if (first)
300 add_ftrace_ops(&ftrace_ops_list, main_ops);
301}
302
303static int remove_ftrace_list_ops(struct ftrace_ops **list,
304 struct ftrace_ops *main_ops,
305 struct ftrace_ops *ops)
306{
307 int ret = remove_ftrace_ops(list, ops);
308 if (!ret && *list == &ftrace_list_end)
309 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
310 return ret;
311}
312
262static int __register_ftrace_function(struct ftrace_ops *ops) 313static int __register_ftrace_function(struct ftrace_ops *ops)
263{ 314{
264 if (ftrace_disabled) 315 if (ftrace_disabled)
@@ -270,15 +321,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
270 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 321 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
271 return -EBUSY; 322 return -EBUSY;
272 323
324 /* We don't support both control and global flags set. */
325 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
326 return -EINVAL;
327
273 if (!core_kernel_data((unsigned long)ops)) 328 if (!core_kernel_data((unsigned long)ops))
274 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 329 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
275 330
276 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 331 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
277 int first = ftrace_global_list == &ftrace_list_end; 332 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
278 add_ftrace_ops(&ftrace_global_list, ops);
279 ops->flags |= FTRACE_OPS_FL_ENABLED; 333 ops->flags |= FTRACE_OPS_FL_ENABLED;
280 if (first) 334 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
281 add_ftrace_ops(&ftrace_ops_list, &global_ops); 335 if (control_ops_alloc(ops))
336 return -ENOMEM;
337 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
282 } else 338 } else
283 add_ftrace_ops(&ftrace_ops_list, ops); 339 add_ftrace_ops(&ftrace_ops_list, ops);
284 340
@@ -302,11 +358,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
302 return -EINVAL; 358 return -EINVAL;
303 359
304 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 360 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
305 ret = remove_ftrace_ops(&ftrace_global_list, ops); 361 ret = remove_ftrace_list_ops(&ftrace_global_list,
306 if (!ret && ftrace_global_list == &ftrace_list_end) 362 &global_ops, ops);
307 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
308 if (!ret) 363 if (!ret)
309 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 364 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
365 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
366 ret = remove_ftrace_list_ops(&ftrace_control_list,
367 &control_ops, ops);
368 if (!ret) {
369 /*
370 * The ftrace_ops is now removed from the list,
371 * so there'll be no new users. We must ensure
372 * all current users are done before we free
373 * the control data.
374 */
375 synchronize_sched();
376 control_ops_free(ops);
377 }
310 } else 378 } else
311 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 379 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
312 380
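
From a client's point of view, a control ops goes through the normal ftrace_ops flow; only the FTRACE_OPS_FL_CONTROL flag and the per-CPU enable/disable calls are new. A minimal sketch, assuming <linux/ftrace.h> and an illustrative callback my_trace_func():

	static void my_trace_func(unsigned long ip, unsigned long parent_ip)
	{
		/* runs only on CPUs where the ops is locally enabled */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_trace_func,
		.flags	= FTRACE_OPS_FL_CONTROL,  /* must not be combined with _GLOBAL */
	};

	static int __init my_ops_init(void)
	{
		int err;

		err = register_ftrace_function(&my_ops);  /* allocates ops->disabled; every CPU starts disabled */
		if (err)
			return err;

		/* later, from a context pinned to a CPU (e.g. a perf sched-in hook): */
		ftrace_function_local_enable(&my_ops);
		ftrace_function_local_disable(&my_ops);

		unregister_ftrace_function(&my_ops);  /* control data freed after synchronize_sched() */
		return 0;
	}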
@@ -1119,6 +1187,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1119 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1187 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1120} 1188}
1121 1189
1190void ftrace_free_filter(struct ftrace_ops *ops)
1191{
1192 free_ftrace_hash(ops->filter_hash);
1193 free_ftrace_hash(ops->notrace_hash);
1194}
1195
1122static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1196static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1123{ 1197{
1124 struct ftrace_hash *hash; 1198 struct ftrace_hash *hash;
@@ -1129,7 +1203,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1129 return NULL; 1203 return NULL;
1130 1204
1131 size = 1 << size_bits; 1205 size = 1 << size_bits;
1132 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); 1206 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
1133 1207
1134 if (!hash->buckets) { 1208 if (!hash->buckets) {
1135 kfree(hash); 1209 kfree(hash);
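
The kzalloc(size * n) to kcalloc(n, size) conversions scattered through this series (here and again in trace_events_filter.c below) are a small hardening and readability change: kcalloc() checks the element-count multiplication for overflow and returns NULL rather than quietly handing back a short buffer, even if the counts used here are too small for that to bite. The general shape is:

	/* before: the multiplication itself can overflow */
	buckets = kzalloc(sizeof(*buckets) * size, GFP_KERNEL);

	/* after: kcalloc() fails cleanly on overflow and still zeroes the memory */
	buckets = kcalloc(size, sizeof(*buckets), GFP_KERNEL);
	if (!buckets)
		return NULL;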
@@ -3146,8 +3220,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3146 mutex_lock(&ftrace_regex_lock); 3220 mutex_lock(&ftrace_regex_lock);
3147 if (reset) 3221 if (reset)
3148 ftrace_filter_reset(hash); 3222 ftrace_filter_reset(hash);
3149 if (buf) 3223 if (buf && !ftrace_match_records(hash, buf, len)) {
3150 ftrace_match_records(hash, buf, len); 3224 ret = -EINVAL;
3225 goto out_regex_unlock;
3226 }
3151 3227
3152 mutex_lock(&ftrace_lock); 3228 mutex_lock(&ftrace_lock);
3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3229 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3157,6 +3233,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3157 3233
3158 mutex_unlock(&ftrace_lock); 3234 mutex_unlock(&ftrace_lock);
3159 3235
3236 out_regex_unlock:
3160 mutex_unlock(&ftrace_regex_lock); 3237 mutex_unlock(&ftrace_regex_lock);
3161 3238
3162 free_ftrace_hash(hash); 3239 free_ftrace_hash(hash);
@@ -3173,10 +3250,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3173 * Filters denote which functions should be enabled when tracing is enabled. 3250 * Filters denote which functions should be enabled when tracing is enabled.
3174 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 3251 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3175 */ 3252 */
3176void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 3253int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3177 int len, int reset) 3254 int len, int reset)
3178{ 3255{
3179 ftrace_set_regex(ops, buf, len, reset, 1); 3256 return ftrace_set_regex(ops, buf, len, reset, 1);
3180} 3257}
3181EXPORT_SYMBOL_GPL(ftrace_set_filter); 3258EXPORT_SYMBOL_GPL(ftrace_set_filter);
3182 3259
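
Since ftrace_set_regex() can now fail (for example when a pattern matches no function), ftrace_set_filter() above and ftrace_set_notrace() just below return that error instead of void, and callers are expected to check it before registering. A hedged sketch with an illustrative ops and pattern:

	int err;

	err = ftrace_set_filter(&my_ops, "do_sys_open", strlen("do_sys_open"), 1);
	if (err)		/* e.g. -EINVAL: nothing matched the pattern */
		return err;

	return register_ftrace_function(&my_ops);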
@@ -3191,10 +3268,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
3191 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 3268 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3192 * for tracing. 3269 * for tracing.
3193 */ 3270 */
3194void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 3271int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3195 int len, int reset) 3272 int len, int reset)
3196{ 3273{
3197 ftrace_set_regex(ops, buf, len, reset, 0); 3274 return ftrace_set_regex(ops, buf, len, reset, 0);
3198} 3275}
3199EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3276EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3200/** 3277/**
@@ -3871,6 +3948,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3871#endif /* CONFIG_DYNAMIC_FTRACE */ 3948#endif /* CONFIG_DYNAMIC_FTRACE */
3872 3949
3873static void 3950static void
3951ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
3952{
3953 struct ftrace_ops *op;
3954
3955 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
3956 return;
3957
3958 /*
3959 * Some of the ops may be dynamically allocated,
3960 * they must be freed after a synchronize_sched().
3961 */
3962 preempt_disable_notrace();
3963 trace_recursion_set(TRACE_CONTROL_BIT);
3964 op = rcu_dereference_raw(ftrace_control_list);
3965 while (op != &ftrace_list_end) {
3966 if (!ftrace_function_local_disabled(op) &&
3967 ftrace_ops_test(op, ip))
3968 op->func(ip, parent_ip);
3969
3970 op = rcu_dereference_raw(op->next);
3971 };
3972 trace_recursion_clear(TRACE_CONTROL_BIT);
3973 preempt_enable_notrace();
3974}
3975
3976static struct ftrace_ops control_ops = {
3977 .func = ftrace_ops_control_func,
3978};
3979
3980static void
3874ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) 3981ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3875{ 3982{
3876 struct ftrace_ops *op; 3983 struct ftrace_ops *op;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5b7b5c1195b..cf8d11e91efd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -154,33 +154,10 @@ enum {
154 154
155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
156 156
157#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) 157/* Used for individual buffers (after the counter) */
158 158#define RB_BUFFER_OFF (1 << 20)
159/**
160 * tracing_on - enable all tracing buffers
161 *
162 * This function enables all tracing buffers that may have been
163 * disabled with tracing_off.
164 */
165void tracing_on(void)
166{
167 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
168}
169EXPORT_SYMBOL_GPL(tracing_on);
170 159
171/** 160#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
172 * tracing_off - turn off all tracing buffers
173 *
174 * This function stops all tracing buffers from recording data.
175 * It does not disable any overhead the tracers themselves may
176 * be causing. This function simply causes all recording to
177 * the ring buffers to fail.
178 */
179void tracing_off(void)
180{
181 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
182}
183EXPORT_SYMBOL_GPL(tracing_off);
184 161
185/** 162/**
186 * tracing_off_permanent - permanently disable ring buffers 163 * tracing_off_permanent - permanently disable ring buffers
@@ -193,15 +170,6 @@ void tracing_off_permanent(void)
193 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 170 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
194} 171}
195 172
196/**
197 * tracing_is_on - show state of ring buffers enabled
198 */
199int tracing_is_on(void)
200{
201 return ring_buffer_flags == RB_BUFFERS_ON;
202}
203EXPORT_SYMBOL_GPL(tracing_is_on);
204
205#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 173#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
206#define RB_ALIGNMENT 4U 174#define RB_ALIGNMENT 4U
207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 175#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -2619,6 +2587,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
2619EXPORT_SYMBOL_GPL(ring_buffer_record_enable); 2587EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
2620 2588
2621/** 2589/**
2590 * ring_buffer_record_off - stop all writes into the buffer
2591 * @buffer: The ring buffer to stop writes to.
2592 *
2593 * This prevents all writes to the buffer. Any attempt to write
2594 * to the buffer after this will fail and return NULL.
2595 *
2596 * This is different than ring_buffer_record_disable() as
2597 * it works like an on/off switch, whereas the disable() version
2598 * must be paired with an enable().
2599 */
2600void ring_buffer_record_off(struct ring_buffer *buffer)
2601{
2602 unsigned int rd;
2603 unsigned int new_rd;
2604
2605 do {
2606 rd = atomic_read(&buffer->record_disabled);
2607 new_rd = rd | RB_BUFFER_OFF;
2608 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2609}
2610EXPORT_SYMBOL_GPL(ring_buffer_record_off);
2611
2612/**
2613 * ring_buffer_record_on - restart writes into the buffer
2614 * @buffer: The ring buffer to start writes to.
2615 *
2616 * This enables all writes to the buffer that was disabled by
2617 * ring_buffer_record_off().
2618 *
2619 * This is different than ring_buffer_record_enable() as
2620 * it works like an on/off switch, whereas the enable() version
2621 * must be paired with a disable().
2622 */
2623void ring_buffer_record_on(struct ring_buffer *buffer)
2624{
2625 unsigned int rd;
2626 unsigned int new_rd;
2627
2628 do {
2629 rd = atomic_read(&buffer->record_disabled);
2630 new_rd = rd & ~RB_BUFFER_OFF;
2631 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2632}
2633EXPORT_SYMBOL_GPL(ring_buffer_record_on);
2634
2635/**
2636 * ring_buffer_record_is_on - return true if the ring buffer can write
2637 * @buffer: The ring buffer to see if write is enabled
2638 *
2639 * Returns true if the ring buffer is in a state that it accepts writes.
2640 */
2641int ring_buffer_record_is_on(struct ring_buffer *buffer)
2642{
2643 return !atomic_read(&buffer->record_disabled);
2644}
2645
2646/**
2622 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 2647 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
2623 * @buffer: The ring buffer to stop writes to. 2648 * @buffer: The ring buffer to stop writes to.
2624 * @cpu: The CPU buffer to stop 2649 * @cpu: The CPU buffer to stop
@@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4039} 4064}
4040EXPORT_SYMBOL_GPL(ring_buffer_read_page); 4065EXPORT_SYMBOL_GPL(ring_buffer_read_page);
4041 4066
4042#ifdef CONFIG_TRACING
4043static ssize_t
4044rb_simple_read(struct file *filp, char __user *ubuf,
4045 size_t cnt, loff_t *ppos)
4046{
4047 unsigned long *p = filp->private_data;
4048 char buf[64];
4049 int r;
4050
4051 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
4052 r = sprintf(buf, "permanently disabled\n");
4053 else
4054 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
4055
4056 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4057}
4058
4059static ssize_t
4060rb_simple_write(struct file *filp, const char __user *ubuf,
4061 size_t cnt, loff_t *ppos)
4062{
4063 unsigned long *p = filp->private_data;
4064 unsigned long val;
4065 int ret;
4066
4067 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4068 if (ret)
4069 return ret;
4070
4071 if (val)
4072 set_bit(RB_BUFFERS_ON_BIT, p);
4073 else
4074 clear_bit(RB_BUFFERS_ON_BIT, p);
4075
4076 (*ppos)++;
4077
4078 return cnt;
4079}
4080
4081static const struct file_operations rb_simple_fops = {
4082 .open = tracing_open_generic,
4083 .read = rb_simple_read,
4084 .write = rb_simple_write,
4085 .llseek = default_llseek,
4086};
4087
4088
4089static __init int rb_init_debugfs(void)
4090{
4091 struct dentry *d_tracer;
4092
4093 d_tracer = tracing_init_dentry();
4094
4095 trace_create_file("tracing_on", 0644, d_tracer,
4096 &ring_buffer_flags, &rb_simple_fops);
4097
4098 return 0;
4099}
4100
4101fs_initcall(rb_init_debugfs);
4102#endif
4103
4104#ifdef CONFIG_HOTPLUG_CPU 4067#ifdef CONFIG_HOTPLUG_CPU
4105static int rb_cpu_notify(struct notifier_block *self, 4068static int rb_cpu_notify(struct notifier_block *self,
4106 unsigned long action, void *hcpu) 4069 unsigned long action, void *hcpu)
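
ring_buffer_record_off()/_on() above deliberately avoid the counting semantics of record_disable()/_enable(): they set or clear one high bit (RB_BUFFER_OFF) in the same record_disabled word, using an atomic_cmpxchg() retry loop so concurrent counter updates are never lost. Outside the kernel the same read-modify-write idiom can be written with C11 atomics; a hedged, stand-alone sketch:

	#include <stdatomic.h>

	#define RB_BUFFER_OFF	(1u << 20)

	static _Atomic unsigned int record_disabled;

	static void record_off(void)
	{
		unsigned int old = atomic_load(&record_disabled);

		/* retry until old | RB_BUFFER_OFF is published; on failure
		 * 'old' is refreshed with the current value automatically */
		while (!atomic_compare_exchange_weak(&record_disabled, &old,
						     old | RB_BUFFER_OFF))
			;
	}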
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3f1bc5d2a00..ed7b5d1e12f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -36,6 +36,7 @@
36#include <linux/ctype.h> 36#include <linux/ctype.h>
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/poll.h> 38#include <linux/poll.h>
39#include <linux/nmi.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -352,6 +353,59 @@ static void wakeup_work_handler(struct work_struct *work)
352static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 353static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
353 354
354/** 355/**
356 * tracing_on - enable tracing buffers
357 *
358 * This function enables tracing buffers that may have been
359 * disabled with tracing_off.
360 */
361void tracing_on(void)
362{
363 if (global_trace.buffer)
364 ring_buffer_record_on(global_trace.buffer);
365 /*
366 * This flag is only looked at when buffers haven't been
367 * allocated yet. We don't really care about the race
368 * between setting this flag and actually turning
369 * on the buffer.
370 */
371 global_trace.buffer_disabled = 0;
372}
373EXPORT_SYMBOL_GPL(tracing_on);
374
375/**
376 * tracing_off - turn off tracing buffers
377 *
378 * This function stops the tracing buffers from recording data.
379 * It does not disable any overhead the tracers themselves may
380 * be causing. This function simply causes all recording to
381 * the ring buffers to fail.
382 */
383void tracing_off(void)
384{
385 if (global_trace.buffer)
386 ring_buffer_record_off(global_trace.buffer);
387 /*
388 * This flag is only looked at when buffers haven't been
389 * allocated yet. We don't really care about the race
390 * between setting this flag and actually turning
391 * on the buffer.
392 */
393 global_trace.buffer_disabled = 1;
394}
395EXPORT_SYMBOL_GPL(tracing_off);
396
397/**
398 * tracing_is_on - show state of ring buffers enabled
399 */
400int tracing_is_on(void)
401{
402 if (global_trace.buffer)
403 return ring_buffer_record_is_on(global_trace.buffer);
404 return !global_trace.buffer_disabled;
405}
406EXPORT_SYMBOL_GPL(tracing_is_on);
407
408/**
355 * trace_wake_up - wake up tasks waiting for trace input 409 * trace_wake_up - wake up tasks waiting for trace input
356 * 410 *
357 * Schedules a delayed work to wake up any task that is blocked on the 411 * Schedules a delayed work to wake up any task that is blocked on the
@@ -1644,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1644 int cpu_file = iter->cpu_file; 1698 int cpu_file = iter->cpu_file;
1645 u64 next_ts = 0, ts; 1699 u64 next_ts = 0, ts;
1646 int next_cpu = -1; 1700 int next_cpu = -1;
1701 int next_size = 0;
1647 int cpu; 1702 int cpu;
1648 1703
1649 /* 1704 /*
@@ -1675,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1675 next_cpu = cpu; 1730 next_cpu = cpu;
1676 next_ts = ts; 1731 next_ts = ts;
1677 next_lost = lost_events; 1732 next_lost = lost_events;
1733 next_size = iter->ent_size;
1678 } 1734 }
1679 } 1735 }
1680 1736
1737 iter->ent_size = next_size;
1738
1681 if (ent_cpu) 1739 if (ent_cpu)
1682 *ent_cpu = next_cpu; 1740 *ent_cpu = next_cpu;
1683 1741
@@ -2764,12 +2822,12 @@ static const char readme_msg[] =
2764 "tracing mini-HOWTO:\n\n" 2822 "tracing mini-HOWTO:\n\n"
2765 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 2823 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2766 "# cat /sys/kernel/debug/tracing/available_tracers\n" 2824 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2767 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2825 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n"
2768 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2826 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2769 "nop\n" 2827 "nop\n"
2770 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" 2828 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n"
2771 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2829 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2772 "sched_switch\n" 2830 "wakeup\n"
2773 "# cat /sys/kernel/debug/tracing/trace_options\n" 2831 "# cat /sys/kernel/debug/tracing/trace_options\n"
2774 "noprint-parent nosym-offset nosym-addr noverbose\n" 2832 "noprint-parent nosym-offset nosym-addr noverbose\n"
2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2833 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
@@ -4567,6 +4625,55 @@ static __init void create_trace_options_dir(void)
4567 create_trace_option_core_file(trace_options[i], i); 4625 create_trace_option_core_file(trace_options[i], i);
4568} 4626}
4569 4627
4628static ssize_t
4629rb_simple_read(struct file *filp, char __user *ubuf,
4630 size_t cnt, loff_t *ppos)
4631{
4632 struct ring_buffer *buffer = filp->private_data;
4633 char buf[64];
4634 int r;
4635
4636 if (buffer)
4637 r = ring_buffer_record_is_on(buffer);
4638 else
4639 r = 0;
4640
4641 r = sprintf(buf, "%d\n", r);
4642
4643 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4644}
4645
4646static ssize_t
4647rb_simple_write(struct file *filp, const char __user *ubuf,
4648 size_t cnt, loff_t *ppos)
4649{
4650 struct ring_buffer *buffer = filp->private_data;
4651 unsigned long val;
4652 int ret;
4653
4654 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4655 if (ret)
4656 return ret;
4657
4658 if (buffer) {
4659 if (val)
4660 ring_buffer_record_on(buffer);
4661 else
4662 ring_buffer_record_off(buffer);
4663 }
4664
4665 (*ppos)++;
4666
4667 return cnt;
4668}
4669
4670static const struct file_operations rb_simple_fops = {
4671 .open = tracing_open_generic,
4672 .read = rb_simple_read,
4673 .write = rb_simple_write,
4674 .llseek = default_llseek,
4675};
4676
4570static __init int tracer_init_debugfs(void) 4677static __init int tracer_init_debugfs(void)
4571{ 4678{
4572 struct dentry *d_tracer; 4679 struct dentry *d_tracer;
@@ -4626,6 +4733,9 @@ static __init int tracer_init_debugfs(void)
4626 trace_create_file("trace_clock", 0644, d_tracer, NULL, 4733 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4627 &trace_clock_fops); 4734 &trace_clock_fops);
4628 4735
4736 trace_create_file("tracing_on", 0644, d_tracer,
4737 global_trace.buffer, &rb_simple_fops);
4738
4629#ifdef CONFIG_DYNAMIC_FTRACE 4739#ifdef CONFIG_DYNAMIC_FTRACE
4630 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4740 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4631 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4741 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4798,6 +4908,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4798 if (ret != TRACE_TYPE_NO_CONSUME) 4908 if (ret != TRACE_TYPE_NO_CONSUME)
4799 trace_consume(&iter); 4909 trace_consume(&iter);
4800 } 4910 }
4911 touch_nmi_watchdog();
4801 4912
4802 trace_printk_seq(&iter.seq); 4913 trace_printk_seq(&iter.seq);
4803 } 4914 }
@@ -4863,6 +4974,8 @@ __init static int tracer_alloc_buffers(void)
4863 goto out_free_cpumask; 4974 goto out_free_cpumask;
4864 } 4975 }
4865 global_trace.entries = ring_buffer_size(global_trace.buffer); 4976 global_trace.entries = ring_buffer_size(global_trace.buffer);
4977 if (global_trace.buffer_disabled)
4978 tracing_off();
4866 4979
4867 4980
4868#ifdef CONFIG_TRACER_MAX_TRACE 4981#ifdef CONFIG_TRACER_MAX_TRACE
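
Net effect of the trace.c side: tracing_on(), tracing_off() and tracing_is_on() now act on the global ring buffer itself, falling back to the new buffer_disabled flag only before the buffer has been allocated, and the debugfs "tracing_on" file is wired straight to ring_buffer_record_on()/_off() rather than the removed RB_BUFFERS_ON bit. In-kernel users keep the familiar pattern, sketched here around an illustrative region of interest:

	tracing_on();
	suspicious_operation();		/* placeholder for the code being debugged */
	tracing_off();			/* freeze the buffer for later inspection */

From userspace the switch is unchanged, echo 0 or 1 into /sys/kernel/debug/tracing/tracing_on; only the backing implementation moved.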
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b93ecbadad6d..95059f091a24 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,17 +56,23 @@ enum trace_type {
56#define F_STRUCT(args...) args 56#define F_STRUCT(args...) args
57 57
58#undef FTRACE_ENTRY 58#undef FTRACE_ENTRY
59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
60 struct struct_name { \ 60 struct struct_name { \
61 struct trace_entry ent; \ 61 struct trace_entry ent; \
62 tstruct \ 62 tstruct \
63 } 63 }
64 64
65#undef TP_ARGS 65#undef TP_ARGS
66#define TP_ARGS(args...) args 66#define TP_ARGS(args...) args
67 67
68#undef FTRACE_ENTRY_DUP 68#undef FTRACE_ENTRY_DUP
69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) 69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
70
71#undef FTRACE_ENTRY_REG
72#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
73 filter, regfn) \
74 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
75 filter)
70 76
71#include "trace_entries.h" 77#include "trace_entries.h"
72 78
@@ -148,6 +154,7 @@ struct trace_array {
148 struct ring_buffer *buffer; 154 struct ring_buffer *buffer;
149 unsigned long entries; 155 unsigned long entries;
150 int cpu; 156 int cpu;
157 int buffer_disabled;
151 cycle_t time_start; 158 cycle_t time_start;
152 struct task_struct *waiter; 159 struct task_struct *waiter;
153 struct trace_array_cpu *data[NR_CPUS]; 160 struct trace_array_cpu *data[NR_CPUS];
@@ -288,6 +295,8 @@ struct tracer {
288/* for function tracing recursion */ 295/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11) 296#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12) 297#define TRACE_GLOBAL_BIT (1<<12)
298#define TRACE_CONTROL_BIT (1<<13)
299
291/* 300/*
292 * Abuse of the trace_recursion. 301 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function 302 * As we need a way to maintain state if we are tracing the function
@@ -589,6 +598,8 @@ static inline int ftrace_trace_task(struct task_struct *task)
589static inline int ftrace_is_dead(void) { return 0; } 598static inline int ftrace_is_dead(void) { return 0; }
590#endif 599#endif
591 600
601int ftrace_event_is_function(struct ftrace_event_call *call);
602
592/* 603/*
593 * struct trace_parser - servers for reading the user input separated by spaces 604 * struct trace_parser - servers for reading the user input separated by spaces
594 * @cont: set if the input is not complete - no final space char was found 605 * @cont: set if the input is not complete - no final space char was found
@@ -766,9 +777,7 @@ struct filter_pred {
766 u64 val; 777 u64 val;
767 struct regex regex; 778 struct regex regex;
768 unsigned short *ops; 779 unsigned short *ops;
769#ifdef CONFIG_FTRACE_STARTUP_TEST
770 struct ftrace_event_field *field; 780 struct ftrace_event_field *field;
771#endif
772 int offset; 781 int offset;
773 int not; 782 int not;
774 int op; 783 int op;
@@ -818,12 +827,20 @@ extern const char *__start___trace_bprintk_fmt[];
818extern const char *__stop___trace_bprintk_fmt[]; 827extern const char *__stop___trace_bprintk_fmt[];
819 828
820#undef FTRACE_ENTRY 829#undef FTRACE_ENTRY
821#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 830#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
822 extern struct ftrace_event_call \ 831 extern struct ftrace_event_call \
823 __attribute__((__aligned__(4))) event_##call; 832 __attribute__((__aligned__(4))) event_##call;
824#undef FTRACE_ENTRY_DUP 833#undef FTRACE_ENTRY_DUP
825#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 834#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
826 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 835 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
836 filter)
827#include "trace_entries.h" 837#include "trace_entries.h"
828 838
839#ifdef CONFIG_FUNCTION_TRACER
840int perf_ftrace_event_register(struct ftrace_event_call *call,
841 enum trace_reg type, void *data);
842#else
843#define perf_ftrace_event_register NULL
844#endif /* CONFIG_FUNCTION_TRACER */
845
829#endif /* _LINUX_KERNEL_TRACE_H */ 846#endif /* _LINUX_KERNEL_TRACE_H */
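
The extra filter and regfn macro parameters only matter to the consumers that want them; everybody else maps FTRACE_ENTRY_REG back onto FTRACE_ENTRY and drops the register function, exactly as the header does above. A stripped-down illustration of that multi-include pattern (toy definitions, not the real ones):

	/* a consumer interested only in struct layout */
	#undef  FTRACE_ENTRY
	#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\
		struct struct_name {						\
			struct trace_entry ent;					\
			tstruct							\
		};

	#undef  FTRACE_ENTRY_REG
	#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print,	\
				 filter, regfn)					\
		FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct),		\
			     PARAMS(print), filter)

	#include "trace_entries.h"	/* every entry, REG or not, emits a struct */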
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 93365907f219..4108e1250ca2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -55,7 +55,7 @@
55/* 55/*
56 * Function trace entry - function address and parent function address: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY_REG(function, ftrace_entry,
59 59
60 TRACE_FN, 60 TRACE_FN,
61 61
@@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry,
64 __field( unsigned long, parent_ip ) 64 __field( unsigned long, parent_ip )
65 ), 65 ),
66 66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) 67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
68
69 FILTER_TRACE_FN,
70
71 perf_ftrace_event_register
68); 72);
69 73
70/* Function call entry */ 74/* Function call entry */
@@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
78 __field_desc( int, graph_ent, depth ) 82 __field_desc( int, graph_ent, depth )
79 ), 83 ),
80 84
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth) 85 F_printk("--> %lx (%d)", __entry->func, __entry->depth),
86
87 FILTER_OTHER
82); 88);
83 89
84/* Function return entry */ 90/* Function return entry */
@@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", 104 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth, 105 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime, 106 __entry->calltime, __entry->rettime,
101 __entry->depth) 107 __entry->depth),
108
109 FILTER_OTHER
102); 110);
103 111
104/* 112/*
@@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", 135 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 136 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state, 137 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu 138 __entry->next_cpu),
131 ) 139
140 FILTER_OTHER
132); 141);
133 142
134/* 143/*
@@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", 155 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 156 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state, 157 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu 158 __entry->next_cpu),
150 ) 159
160 FILTER_OTHER
151); 161);
152 162
153/* 163/*
@@ -156,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
156 166
157#define FTRACE_STACK_ENTRIES 8 167#define FTRACE_STACK_ENTRIES 8
158 168
169#ifndef CONFIG_64BIT
170# define IP_FMT "%08lx"
171#else
172# define IP_FMT "%016lx"
173#endif
174
159FTRACE_ENTRY(kernel_stack, stack_entry, 175FTRACE_ENTRY(kernel_stack, stack_entry,
160 176
161 TRACE_STACK, 177 TRACE_STACK,
@@ -165,11 +181,14 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
165 __dynamic_array(unsigned long, caller ) 181 __dynamic_array(unsigned long, caller )
166 ), 182 ),
167 183
168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 184 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
169 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 185 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
186 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
170 __entry->caller[0], __entry->caller[1], __entry->caller[2], 187 __entry->caller[0], __entry->caller[1], __entry->caller[2],
171 __entry->caller[3], __entry->caller[4], __entry->caller[5], 188 __entry->caller[3], __entry->caller[4], __entry->caller[5],
172 __entry->caller[6], __entry->caller[7]) 189 __entry->caller[6], __entry->caller[7]),
190
191 FILTER_OTHER
173); 192);
174 193
175FTRACE_ENTRY(user_stack, userstack_entry, 194FTRACE_ENTRY(user_stack, userstack_entry,
@@ -181,11 +200,14 @@ FTRACE_ENTRY(user_stack, userstack_entry,
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 200 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ), 201 ),
183 202
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 203 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 204 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
205 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2], 206 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5], 207 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7]) 208 __entry->caller[6], __entry->caller[7]),
209
210 FILTER_OTHER
189); 211);
190 212
191/* 213/*
@@ -202,7 +224,9 @@ FTRACE_ENTRY(bprint, bprint_entry,
202 ), 224 ),
203 225
204 F_printk("%08lx fmt:%p", 226 F_printk("%08lx fmt:%p",
205 __entry->ip, __entry->fmt) 227 __entry->ip, __entry->fmt),
228
229 FILTER_OTHER
206); 230);
207 231
208FTRACE_ENTRY(print, print_entry, 232FTRACE_ENTRY(print, print_entry,
@@ -215,7 +239,9 @@ FTRACE_ENTRY(print, print_entry,
215 ), 239 ),
216 240
217 F_printk("%08lx %s", 241 F_printk("%08lx %s",
218 __entry->ip, __entry->buf) 242 __entry->ip, __entry->buf),
243
244 FILTER_OTHER
219); 245);
220 246
221FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, 247FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -234,7 +260,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
234 260
235 F_printk("%lx %lx %lx %d %x %x", 261 F_printk("%lx %lx %lx %d %x %x",
236 (unsigned long)__entry->phys, __entry->value, __entry->pc, 262 (unsigned long)__entry->phys, __entry->value, __entry->pc,
237 __entry->map_id, __entry->opcode, __entry->width) 263 __entry->map_id, __entry->opcode, __entry->width),
264
265 FILTER_OTHER
238); 266);
239 267
240FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, 268FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -252,7 +280,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
252 280
253 F_printk("%lx %lx %lx %d %x", 281 F_printk("%lx %lx %lx %d %x",
254 (unsigned long)__entry->phys, __entry->virt, __entry->len, 282 (unsigned long)__entry->phys, __entry->virt, __entry->len,
255 __entry->map_id, __entry->opcode) 283 __entry->map_id, __entry->opcode),
284
285 FILTER_OTHER
256); 286);
257 287
258 288
@@ -272,6 +302,8 @@ FTRACE_ENTRY(branch, trace_branch,
272 302
273 F_printk("%u:%s:%s (%u)", 303 F_printk("%u:%s:%s (%u)",
274 __entry->line, 304 __entry->line,
275 __entry->func, __entry->file, __entry->correct) 305 __entry->func, __entry->file, __entry->correct),
306
307 FILTER_OTHER
276); 308);
277 309
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 19a359d5e6d5..fee3752ae8f6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,11 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
30 return -EPERM;
31
27 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 33 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0; 34 return 0;
@@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
44 return 0; 49 return 0;
45} 50}
46 51
47static int perf_trace_event_init(struct ftrace_event_call *tp_event, 52static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
48 struct perf_event *p_event) 53 struct perf_event *p_event)
49{ 54{
50 struct hlist_head __percpu *list; 55 struct hlist_head __percpu *list;
51 int ret; 56 int ret = -ENOMEM;
52 int cpu; 57 int cpu;
53 58
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
58 p_event->tp_event = tp_event; 59 p_event->tp_event = tp_event;
59 if (tp_event->perf_refcount++ > 0) 60 if (tp_event->perf_refcount++ > 0)
60 return 0; 61 return 0;
61 62
62 ret = -ENOMEM;
63
64 list = alloc_percpu(struct hlist_head); 63 list = alloc_percpu(struct hlist_head);
65 if (!list) 64 if (!list)
66 goto fail; 65 goto fail;
@@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
83 } 82 }
84 } 83 }
85 84
86 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); 85 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
87 if (ret) 86 if (ret)
88 goto fail; 87 goto fail;
89 88
@@ -108,6 +107,69 @@ fail:
108 return ret; 107 return ret;
109} 108}
110 109
110static void perf_trace_event_unreg(struct perf_event *p_event)
111{
112 struct ftrace_event_call *tp_event = p_event->tp_event;
113 int i;
114
115 if (--tp_event->perf_refcount > 0)
116 goto out;
117
118 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
119
120 /*
121 * Ensure our callback won't be called anymore. The buffers
122 * will be freed after that.
123 */
124 tracepoint_synchronize_unregister();
125
126 free_percpu(tp_event->perf_events);
127 tp_event->perf_events = NULL;
128
129 if (!--total_ref_count) {
130 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
131 free_percpu(perf_trace_buf[i]);
132 perf_trace_buf[i] = NULL;
133 }
134 }
135out:
136 module_put(tp_event->mod);
137}
138
139static int perf_trace_event_open(struct perf_event *p_event)
140{
141 struct ftrace_event_call *tp_event = p_event->tp_event;
142 return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
143}
144
145static void perf_trace_event_close(struct perf_event *p_event)
146{
147 struct ftrace_event_call *tp_event = p_event->tp_event;
148 tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
149}
150
151static int perf_trace_event_init(struct ftrace_event_call *tp_event,
152 struct perf_event *p_event)
153{
154 int ret;
155
156 ret = perf_trace_event_perm(tp_event, p_event);
157 if (ret)
158 return ret;
159
160 ret = perf_trace_event_reg(tp_event, p_event);
161 if (ret)
162 return ret;
163
164 ret = perf_trace_event_open(p_event);
165 if (ret) {
166 perf_trace_event_unreg(p_event);
167 return ret;
168 }
169
170 return 0;
171}
172
111int perf_trace_init(struct perf_event *p_event) 173int perf_trace_init(struct perf_event *p_event)
112{ 174{
113 struct ftrace_event_call *tp_event; 175 struct ftrace_event_call *tp_event;
@@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event)
130 return ret; 192 return ret;
131} 193}
132 194
195void perf_trace_destroy(struct perf_event *p_event)
196{
197 mutex_lock(&event_mutex);
198 perf_trace_event_close(p_event);
199 perf_trace_event_unreg(p_event);
200 mutex_unlock(&event_mutex);
201}
202
133int perf_trace_add(struct perf_event *p_event, int flags) 203int perf_trace_add(struct perf_event *p_event, int flags)
134{ 204{
135 struct ftrace_event_call *tp_event = p_event->tp_event; 205 struct ftrace_event_call *tp_event = p_event->tp_event;
@@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags)
146 list = this_cpu_ptr(pcpu_list); 216 list = this_cpu_ptr(pcpu_list);
147 hlist_add_head_rcu(&p_event->hlist_entry, list); 217 hlist_add_head_rcu(&p_event->hlist_entry, list);
148 218
149 return 0; 219 return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
150} 220}
151 221
152void perf_trace_del(struct perf_event *p_event, int flags) 222void perf_trace_del(struct perf_event *p_event, int flags)
153{ 223{
154 hlist_del_rcu(&p_event->hlist_entry);
155}
156
157void perf_trace_destroy(struct perf_event *p_event)
158{
159 struct ftrace_event_call *tp_event = p_event->tp_event; 224 struct ftrace_event_call *tp_event = p_event->tp_event;
160 int i; 225 hlist_del_rcu(&p_event->hlist_entry);
161 226 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
162 mutex_lock(&event_mutex);
163 if (--tp_event->perf_refcount > 0)
164 goto out;
165
166 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
167
168 /*
169 * Ensure our callback won't be called anymore. The buffers
170 * will be freed after that.
171 */
172 tracepoint_synchronize_unregister();
173
174 free_percpu(tp_event->perf_events);
175 tp_event->perf_events = NULL;
176
177 if (!--total_ref_count) {
178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
179 free_percpu(perf_trace_buf[i]);
180 perf_trace_buf[i] = NULL;
181 }
182 }
183out:
184 module_put(tp_event->mod);
185 mutex_unlock(&event_mutex);
186} 227}
187 228
188__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 229__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
214 return raw_data; 255 return raw_data;
215} 256}
216EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 257EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
258
259#ifdef CONFIG_FUNCTION_TRACER
260static void
261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
262{
263 struct ftrace_entry *entry;
264 struct hlist_head *head;
265 struct pt_regs regs;
266 int rctx;
267
268#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
269 sizeof(u64)) - sizeof(u32))
270
271 BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
272
273 perf_fetch_caller_regs(&regs);
274
275 entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
276 if (!entry)
277 return;
278
279 entry->ip = ip;
280 entry->parent_ip = parent_ip;
281
282 head = this_cpu_ptr(event_function.perf_events);
283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
284 1, &regs, head);
285
286#undef ENTRY_SIZE
287}
288
289static int perf_ftrace_function_register(struct perf_event *event)
290{
291 struct ftrace_ops *ops = &event->ftrace_ops;
292
293 ops->flags |= FTRACE_OPS_FL_CONTROL;
294 ops->func = perf_ftrace_function_call;
295 return register_ftrace_function(ops);
296}
297
298static int perf_ftrace_function_unregister(struct perf_event *event)
299{
300 struct ftrace_ops *ops = &event->ftrace_ops;
301 int ret = unregister_ftrace_function(ops);
302 ftrace_free_filter(ops);
303 return ret;
304}
305
306static void perf_ftrace_function_enable(struct perf_event *event)
307{
308 ftrace_function_local_enable(&event->ftrace_ops);
309}
310
311static void perf_ftrace_function_disable(struct perf_event *event)
312{
313 ftrace_function_local_disable(&event->ftrace_ops);
314}
315
316int perf_ftrace_event_register(struct ftrace_event_call *call,
317 enum trace_reg type, void *data)
318{
319 switch (type) {
320 case TRACE_REG_REGISTER:
321 case TRACE_REG_UNREGISTER:
322 break;
323 case TRACE_REG_PERF_REGISTER:
324 case TRACE_REG_PERF_UNREGISTER:
325 return 0;
326 case TRACE_REG_PERF_OPEN:
327 return perf_ftrace_function_register(data);
328 case TRACE_REG_PERF_CLOSE:
329 return perf_ftrace_function_unregister(data);
330 case TRACE_REG_PERF_ADD:
331 perf_ftrace_function_enable(data);
332 return 0;
333 case TRACE_REG_PERF_DEL:
334 perf_ftrace_function_disable(data);
335 return 0;
336 }
337
338 return -EINVAL;
339}
340#endif /* CONFIG_FUNCTION_TRACER */
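
perf now drives the whole event lifecycle through ->reg(): perf_trace_init() ends in OPEN, perf_trace_add()/del() forward ADD/DEL on every sched-in and sched-out, and perf_trace_destroy() issues CLOSE before the final UNREGISTER. An event class with nothing special to do simply acknowledges the new types (as ftrace_event_reg() does in trace_events.c below), while the function event dispatches them to real work in perf_ftrace_event_register() above. A sketch of the minimal dispatch shape for a hypothetical class (my_event_reg() is illustrative only):

	static int my_event_reg(struct ftrace_event_call *call,
				enum trace_reg type, void *data)
	{
		switch (type) {
		case TRACE_REG_REGISTER:		/* ftrace attach/detach */
		case TRACE_REG_UNREGISTER:
			return 0;
	#ifdef CONFIG_PERF_EVENTS
		case TRACE_REG_PERF_REGISTER:		/* first perf user */
		case TRACE_REG_PERF_UNREGISTER:		/* last perf user gone */
		case TRACE_REG_PERF_OPEN:		/* data == the struct perf_event */
		case TRACE_REG_PERF_CLOSE:
		case TRACE_REG_PERF_ADD:		/* event scheduled in on this CPU */
		case TRACE_REG_PERF_DEL:
			return 0;
	#endif
		}
		return -EINVAL;
	}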
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c212a7f934ec..079a93ae8a9d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call)
147} 147}
148EXPORT_SYMBOL_GPL(trace_event_raw_init); 148EXPORT_SYMBOL_GPL(trace_event_raw_init);
149 149
150int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) 150int ftrace_event_reg(struct ftrace_event_call *call,
151 enum trace_reg type, void *data)
151{ 152{
152 switch (type) { 153 switch (type) {
153 case TRACE_REG_REGISTER: 154 case TRACE_REG_REGISTER:
@@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
170 call->class->perf_probe, 171 call->class->perf_probe,
171 call); 172 call);
172 return 0; 173 return 0;
174 case TRACE_REG_PERF_OPEN:
175 case TRACE_REG_PERF_CLOSE:
176 case TRACE_REG_PERF_ADD:
177 case TRACE_REG_PERF_DEL:
178 return 0;
173#endif 179#endif
174 } 180 }
175 return 0; 181 return 0;
@@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
209 tracing_stop_cmdline_record(); 215 tracing_stop_cmdline_record();
210 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 216 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
211 } 217 }
212 call->class->reg(call, TRACE_REG_UNREGISTER); 218 call->class->reg(call, TRACE_REG_UNREGISTER, NULL);
213 } 219 }
214 break; 220 break;
215 case 1: 221 case 1:
@@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
218 tracing_start_cmdline_record(); 224 tracing_start_cmdline_record();
219 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 225 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
220 } 226 }
221 ret = call->class->reg(call, TRACE_REG_REGISTER); 227 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL);
222 if (ret) { 228 if (ret) {
223 tracing_stop_cmdline_record(); 229 tracing_stop_cmdline_record();
224 pr_info("event trace: Could not enable event " 230 pr_info("event trace: Could not enable event "
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 24aee7127451..431dba8b7542 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -81,6 +81,7 @@ enum {
81 FILT_ERR_TOO_MANY_PREDS, 81 FILT_ERR_TOO_MANY_PREDS,
82 FILT_ERR_MISSING_FIELD, 82 FILT_ERR_MISSING_FIELD,
83 FILT_ERR_INVALID_FILTER, 83 FILT_ERR_INVALID_FILTER,
84 FILT_ERR_IP_FIELD_ONLY,
84}; 85};
85 86
86static char *err_text[] = { 87static char *err_text[] = {
@@ -96,6 +97,7 @@ static char *err_text[] = {
96 "Too many terms in predicate expression", 97 "Too many terms in predicate expression",
97 "Missing field name and/or value", 98 "Missing field name and/or value",
98 "Meaningless filter expression", 99 "Meaningless filter expression",
100 "Only 'ip' field is supported for function trace",
99}; 101};
100 102
101struct opstack_op { 103struct opstack_op {
@@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
685 687
686static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
687{ 689{
688 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
689 if (!stack->preds) 691 if (!stack->preds)
690 return -ENOMEM; 692 return -ENOMEM;
691 stack->index = n_preds; 693 stack->index = n_preds;
@@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
826 if (filter->preds) 828 if (filter->preds)
827 __free_preds(filter); 829 __free_preds(filter);
828 830
829 filter->preds = 831 filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
830 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
831 832
832 if (!filter->preds) 833 if (!filter->preds)
833 return -ENOMEM; 834 return -ENOMEM;
@@ -900,6 +901,11 @@ int filter_assign_type(const char *type)
900 return FILTER_OTHER; 901 return FILTER_OTHER;
901} 902}
902 903
904static bool is_function_field(struct ftrace_event_field *field)
905{
906 return field->filter_type == FILTER_TRACE_FN;
907}
908
903static bool is_string_field(struct ftrace_event_field *field) 909static bool is_string_field(struct ftrace_event_field *field)
904{ 910{
905 return field->filter_type == FILTER_DYN_STRING || 911 return field->filter_type == FILTER_DYN_STRING ||
@@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps,
987 fn = filter_pred_strloc; 993 fn = filter_pred_strloc;
988 else 994 else
989 fn = filter_pred_pchar; 995 fn = filter_pred_pchar;
996 } else if (is_function_field(field)) {
997 if (strcmp(field->name, "ip")) {
998 parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
999 return -EINVAL;
1000 }
990 } else { 1001 } else {
991 if (field->is_signed) 1002 if (field->is_signed)
992 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = strict_strtoll(pred->regex.pattern, 0, &val);
@@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1334 1345
1335 strcpy(pred.regex.pattern, operand2); 1346 strcpy(pred.regex.pattern, operand2);
1336 pred.regex.len = strlen(pred.regex.pattern); 1347 pred.regex.len = strlen(pred.regex.pattern);
1337
1338#ifdef CONFIG_FTRACE_STARTUP_TEST
1339 pred.field = field; 1348 pred.field = field;
1340#endif
1341 return init_pred(ps, field, &pred) ? NULL : &pred; 1349 return init_pred(ps, field, &pred) ? NULL : &pred;
1342} 1350}
1343 1351
@@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1486 children = count_leafs(preds, &preds[root->left]); 1494 children = count_leafs(preds, &preds[root->left]);
1487 children += count_leafs(preds, &preds[root->right]); 1495 children += count_leafs(preds, &preds[root->right]);
1488 1496
1489 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); 1497 root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
1490 if (!root->ops) 1498 if (!root->ops)
1491 return -ENOMEM; 1499 return -ENOMEM;
1492 1500
@@ -1950,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event)
1950 __free_filter(filter); 1958 __free_filter(filter);
1951} 1959}
1952 1960
1961struct function_filter_data {
1962 struct ftrace_ops *ops;
1963 int first_filter;
1964 int first_notrace;
1965};
1966
1967#ifdef CONFIG_FUNCTION_TRACER
1968static char **
1969ftrace_function_filter_re(char *buf, int len, int *count)
1970{
1971 char *str, *sep, **re;
1972
1973 str = kstrndup(buf, len, GFP_KERNEL);
1974 if (!str)
1975 return NULL;
1976
1977 /*
1978 * The argv_split function takes white space
1979 * as a separator, so convert ',' into spaces.
1980 */
1981 while ((sep = strchr(str, ',')))
1982 *sep = ' ';
1983
1984 re = argv_split(GFP_KERNEL, str, count);
1985 kfree(str);
1986 return re;
1987}
1988
1989static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
1990 int reset, char *re, int len)
1991{
1992 int ret;
1993
1994 if (filter)
1995 ret = ftrace_set_filter(ops, re, len, reset);
1996 else
1997 ret = ftrace_set_notrace(ops, re, len, reset);
1998
1999 return ret;
2000}
2001
2002static int __ftrace_function_set_filter(int filter, char *buf, int len,
2003 struct function_filter_data *data)
2004{
2005 int i, re_cnt, ret;
2006 int *reset;
2007 char **re;
2008
2009 reset = filter ? &data->first_filter : &data->first_notrace;
2010
2011 /*
2012 * The 'ip' field could have multiple filters set, separated
2013 * either by space or comma. We first cut the filter and apply
2014 * all pieces separately.
2015 */
2016 re = ftrace_function_filter_re(buf, len, &re_cnt);
2017 if (!re)
2018 return -EINVAL;
2019
2020 for (i = 0; i < re_cnt; i++) {
2021 ret = ftrace_function_set_regexp(data->ops, filter, *reset,
2022 re[i], strlen(re[i]));
2023 if (ret)
2024 break;
2025
2026 if (*reset)
2027 *reset = 0;
2028 }
2029
2030 argv_free(re);
2031 return ret;
2032}
2033
2034static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
2035{
2036 struct ftrace_event_field *field = pred->field;
2037
2038 if (leaf) {
2039 /*
2040 * Check the leaf predicate for function trace, verify:
2041 * - only '==' and '!=' are used
2042 * - the 'ip' field is used
2043 */
2044 if ((pred->op != OP_EQ) && (pred->op != OP_NE))
2045 return -EINVAL;
2046
2047 if (strcmp(field->name, "ip"))
2048 return -EINVAL;
2049 } else {
2050 /*
2051 * Check the non leaf predicate for function trace, verify:
2052 * - only '||' is used
2053 */
2054 if (pred->op != OP_OR)
2055 return -EINVAL;
2056 }
2057
2058 return 0;
2059}
2060
2061static int ftrace_function_set_filter_cb(enum move_type move,
2062 struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 /* Checking the node is valid for function trace. */
2066 if ((move != MOVE_DOWN) ||
2067 (pred->left != FILTER_PRED_INVALID)) {
2068 *err = ftrace_function_check_pred(pred, 0);
2069 } else {
2070 *err = ftrace_function_check_pred(pred, 1);
2071 if (*err)
2072 return WALK_PRED_ABORT;
2073
2074 *err = __ftrace_function_set_filter(pred->op == OP_EQ,
2075 pred->regex.pattern,
2076 pred->regex.len,
2077 data);
2078 }
2079
2080 return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
2081}
2082
2083static int ftrace_function_set_filter(struct perf_event *event,
2084 struct event_filter *filter)
2085{
2086 struct function_filter_data data = {
2087 .first_filter = 1,
2088 .first_notrace = 1,
2089 .ops = &event->ftrace_ops,
2090 };
2091
2092 return walk_pred_tree(filter->preds, filter->root,
2093 ftrace_function_set_filter_cb, &data);
2094}
2095#else
2096static int ftrace_function_set_filter(struct perf_event *event,
2097 struct event_filter *filter)
2098{
2099 return -ENODEV;
2100}
2101#endif /* CONFIG_FUNCTION_TRACER */
2102
1953int ftrace_profile_set_filter(struct perf_event *event, int event_id, 2103int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1954 char *filter_str) 2104 char *filter_str)
1955{ 2105{
@@ -1970,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1970 goto out_unlock; 2120 goto out_unlock;
1971 2121
1972 err = create_filter(call, filter_str, false, &filter); 2122 err = create_filter(call, filter_str, false, &filter);
1973 if (!err) 2123 if (err)
1974 event->filter = filter; 2124 goto free_filter;
2125
2126 if (ftrace_event_is_function(call))
2127 err = ftrace_function_set_filter(event, filter);
1975 else 2128 else
2129 event->filter = filter;
2130
2131free_filter:
2132 if (err || ftrace_event_is_function(call))
1976 __free_filter(filter); 2133 __free_filter(filter);
1977 2134
1978out_unlock: 2135out_unlock:
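
Taken together, the function-filter code above translates a perf filter expression straight into ftrace hashes: only the ip field is accepted, leaf predicates must use == or !=, inner nodes must be ||, and comma- or space-separated patterns inside one value are split and applied piecewise. For a hypothetical expression such as ip == do_sys_open,vfs_read || ip != *spin*, the tree walk ends up issuing roughly:

	ftrace_set_filter(ops, "do_sys_open", strlen("do_sys_open"), 1);	/* reset once */
	ftrace_set_filter(ops, "vfs_read", strlen("vfs_read"), 0);
	ftrace_set_notrace(ops, "*spin*", strlen("*spin*"), 1);

where ops is the perf event's embedded ftrace_ops registered through the TRACE_REG_PERF_OPEN path in trace_event_perf.c.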
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index bbeec31e0ae3..3dd15e8bc856 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,6 +18,16 @@
18#undef TRACE_SYSTEM 18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace 19#define TRACE_SYSTEM ftrace
20 20
21/*
22 * The FTRACE_ENTRY_REG macro allows an ftrace entry to define a register
23 * function and thus become accessible via perf.
24 */
25#undef FTRACE_ENTRY_REG
26#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
27 filter, regfn) \
28 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
29 filter)
30
21/* not needed for this file */ 31/* not needed for this file */
22#undef __field_struct 32#undef __field_struct
23#define __field_struct(type, item) 33#define __field_struct(type, item)
@@ -44,21 +54,22 @@
44#define F_printk(fmt, args...) fmt, args 54#define F_printk(fmt, args...) fmt, args
45 55
46#undef FTRACE_ENTRY 56#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 57#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
48struct ____ftrace_##name { \ 58struct ____ftrace_##name { \
49 tstruct \ 59 tstruct \
50}; \ 60}; \
51static void __always_unused ____ftrace_check_##name(void) \ 61static void __always_unused ____ftrace_check_##name(void) \
52{ \ 62{ \
53 struct ____ftrace_##name *__entry = NULL; \ 63 struct ____ftrace_##name *__entry = NULL; \
54 \ 64 \
55 /* force compile-time check on F_printk() */ \ 65 /* force compile-time check on F_printk() */ \
56 printk(print); \ 66 printk(print); \
57} 67}
58 68
59#undef FTRACE_ENTRY_DUP 69#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ 70#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) 71 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
72 filter)
62 73
63#include "trace_entries.h" 74#include "trace_entries.h"
64 75
@@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \
67 ret = trace_define_field(event_call, #type, #item, \ 78 ret = trace_define_field(event_call, #type, #item, \
68 offsetof(typeof(field), item), \ 79 offsetof(typeof(field), item), \
69 sizeof(field.item), \ 80 sizeof(field.item), \
70 is_signed_type(type), FILTER_OTHER); \ 81 is_signed_type(type), filter_type); \
71 if (ret) \ 82 if (ret) \
72 return ret; 83 return ret;
73 84
@@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \
77 offsetof(typeof(field), \ 88 offsetof(typeof(field), \
78 container.item), \ 89 container.item), \
79 sizeof(field.container.item), \ 90 sizeof(field.container.item), \
80 is_signed_type(type), FILTER_OTHER); \ 91 is_signed_type(type), filter_type); \
81 if (ret) \ 92 if (ret) \
82 return ret; 93 return ret;
83 94
@@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \
91 ret = trace_define_field(event_call, event_storage, #item, \ 102 ret = trace_define_field(event_call, event_storage, #item, \
92 offsetof(typeof(field), item), \ 103 offsetof(typeof(field), item), \
93 sizeof(field.item), \ 104 sizeof(field.item), \
94 is_signed_type(type), FILTER_OTHER); \ 105 is_signed_type(type), filter_type); \
95 mutex_unlock(&event_storage_mutex); \ 106 mutex_unlock(&event_storage_mutex); \
96 if (ret) \ 107 if (ret) \
97 return ret; \ 108 return ret; \
@@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \
104 offsetof(typeof(field), \ 115 offsetof(typeof(field), \
105 container.item), \ 116 container.item), \
106 sizeof(field.container.item), \ 117 sizeof(field.container.item), \
107 is_signed_type(type), FILTER_OTHER); \ 118 is_signed_type(type), filter_type); \
108 if (ret) \ 119 if (ret) \
109 return ret; 120 return ret;
110 121
@@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \
112#define __dynamic_array(type, item) \ 123#define __dynamic_array(type, item) \
113 ret = trace_define_field(event_call, #type, #item, \ 124 ret = trace_define_field(event_call, #type, #item, \
114 offsetof(typeof(field), item), \ 125 offsetof(typeof(field), item), \
115 0, is_signed_type(type), FILTER_OTHER);\ 126 0, is_signed_type(type), filter_type);\
116 if (ret) \ 127 if (ret) \
117 return ret; 128 return ret;
118 129
119#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
121int \ 132int \
122ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
123{ \ 134{ \
124 struct struct_name field; \ 135 struct struct_name field; \
125 int ret; \ 136 int ret; \
137 int filter_type = filter; \
126 \ 138 \
127 tstruct; \ 139 tstruct; \
128 \ 140 \
@@ -150,15 +162,17 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
150#define __dynamic_array(type, item) 162#define __dynamic_array(type, item)
151 163
152#undef F_printk 164#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 165#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
154 166
155#undef FTRACE_ENTRY 167#undef FTRACE_ENTRY_REG
156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \
157 \ 170 \
158struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
175 .reg = regfn, \
162}; \ 176}; \
163 \ 177 \
164struct ftrace_event_call __used event_##call = { \ 178struct ftrace_event_call __used event_##call = { \
@@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = { \
170struct ftrace_event_call __used \ 184struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
172 186
187#undef FTRACE_ENTRY
188#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
189 FTRACE_ENTRY_REG(call, struct_name, etype, \
190 PARAMS(tstruct), PARAMS(print), filter, NULL)
191
192int ftrace_event_is_function(struct ftrace_event_call *call)
193{
194 return call == &event_function;
195}
196
173#include "trace_entries.h" 197#include "trace_entries.h"
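
trace_export.c now generates its events through two macro layers: FTRACE_ENTRY_REG() carries a filter type plus an optional registration callback, and FTRACE_ENTRY() forwards to it with a NULL callback, so plain entries are unchanged while the function event can supply a perf .reg handler and be recognised by ftrace_event_is_function(). The following stand-alone sketch shows only the macro-forwarding pattern; EVENT/EVENT_REG and struct event_desc are made-up names for illustration.

#include <stdio.h>
#include <stddef.h>

struct event_desc {
	const char *name;
	int filter_type;
	int (*reg)(void);		/* NULL when the event has no callback */
};

/* Full form: filter type plus registration callback. */
#define EVENT_REG(name, filter, regfn)					\
	static struct event_desc event_##name = {			\
		#name, (filter), (regfn)				\
	};

/* The plain form forwards to the full one with a NULL callback, just as
 * FTRACE_ENTRY() now expands to FTRACE_ENTRY_REG(..., NULL). */
#define EVENT(name, filter)	EVENT_REG(name, filter, NULL)

static int function_reg(void)
{
	puts("function event registered with perf");
	return 0;
}

EVENT(wakeup, 0)
EVENT_REG(function, 1, function_reg)	/* 1 stands for a FILTER_TRACE_FN-like type */

int main(void)
{
	struct event_desc *events[] = { &event_wakeup, &event_function };

	for (size_t i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
		printf("%s: filter_type=%d reg=%s\n", events[i]->name,
		       events[i]->filter_type, events[i]->reg ? "yes" : "no");
		if (events[i]->reg)
			events[i]->reg();
	}
	return 0;
}
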
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 00d527c945a4..580a05ec926b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1892#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1893 1893
1894static __kprobes 1894static __kprobes
1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1895int kprobe_register(struct ftrace_event_call *event,
1896 enum trace_reg type, void *data)
1896{ 1897{
1897 struct trace_probe *tp = (struct trace_probe *)event->data; 1898 struct trace_probe *tp = (struct trace_probe *)event->data;
1898 1899
@@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1909 case TRACE_REG_PERF_UNREGISTER: 1910 case TRACE_REG_PERF_UNREGISTER:
1910 disable_trace_probe(tp, TP_FLAG_PROFILE); 1911 disable_trace_probe(tp, TP_FLAG_PROFILE);
1911 return 0; 1912 return 0;
1913 case TRACE_REG_PERF_OPEN:
1914 case TRACE_REG_PERF_CLOSE:
1915 case TRACE_REG_PERF_ADD:
1916 case TRACE_REG_PERF_DEL:
1917 return 0;
1912#endif 1918#endif
1913 } 1919 }
1914 return 0; 1920 return 0;
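
The .reg() callback prototype grows a third void *data argument, and implementations must now accept four extra commands, TRACE_REG_PERF_OPEN/CLOSE/ADD/DEL. kprobe and syscall events keep no per-perf-event state, so they simply return 0 for those. A sketch of that "explicit no-op" handling, assuming a simplified enum and event structure rather than the kernel definitions:

#include <stdio.h>

/* Simplified stand-in for enum trace_reg; the names mirror the kernel's. */
enum trace_reg {
	REG_REGISTER,
	REG_UNREGISTER,
	REG_PERF_REGISTER,
	REG_PERF_UNREGISTER,
	REG_PERF_OPEN,
	REG_PERF_CLOSE,
	REG_PERF_ADD,
	REG_PERF_DEL,
};

struct event { const char *name; int enabled; };

/* A .reg()-style callback: real work for (un)register, explicit no-ops
 * for the per-perf-event lifetime commands it does not track. */
static int probe_register(struct event *ev, enum trace_reg type, void *data)
{
	(void)data;	/* unused here, but part of the new prototype */

	switch (type) {
	case REG_REGISTER:
	case REG_PERF_REGISTER:
		ev->enabled = 1;
		return 0;
	case REG_UNREGISTER:
	case REG_PERF_UNREGISTER:
		ev->enabled = 0;
		return 0;
	case REG_PERF_OPEN:
	case REG_PERF_CLOSE:
	case REG_PERF_ADD:
	case REG_PERF_DEL:
		return 0;	/* nothing to do per perf event */
	}
	return 0;
}

int main(void)
{
	struct event ev = { "kprobe:do_sys_open", 0 };

	probe_register(&ev, REG_PERF_REGISTER, NULL);
	printf("%s enabled=%d\n", ev.name, ev.enabled);
	probe_register(&ev, REG_PERF_OPEN, NULL);	/* no-op */
	probe_register(&ev, REG_PERF_UNREGISTER, NULL);
	printf("%s enabled=%d\n", ev.name, ev.enabled);
	return 0;
}
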
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0d6ff3555942..859fae6b1825 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, struct path *path) 267int trace_seq_path(struct trace_seq *s, const struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
@@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
300 unsigned long mask; 300 unsigned long mask;
301 const char *str; 301 const char *str;
302 const char *ret = p->buffer + p->len; 302 const char *ret = p->buffer + p->len;
303 int i; 303 int i, first = 1;
304 304
305 for (i = 0; flag_array[i].name && flags; i++) { 305 for (i = 0; flag_array[i].name && flags; i++) {
306 306
@@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
310 310
311 str = flag_array[i].name; 311 str = flag_array[i].name;
312 flags &= ~mask; 312 flags &= ~mask;
313 if (p->len && delim) 313 if (!first && delim)
314 trace_seq_puts(p, delim); 314 trace_seq_puts(p, delim);
315 else
316 first = 0;
315 trace_seq_puts(p, str); 317 trace_seq_puts(p, str);
316 } 318 }
317 319
318 /* check for left over flags */ 320 /* check for left over flags */
319 if (flags) { 321 if (flags) {
320 if (p->len && delim) 322 if (!first && delim)
321 trace_seq_puts(p, delim); 323 trace_seq_puts(p, delim);
322 trace_seq_printf(p, "0x%lx", flags); 324 trace_seq_printf(p, "0x%lx", flags);
323 } 325 }
@@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
344 break; 346 break;
345 } 347 }
346 348
347 if (!p->len) 349 if (ret == (const char *)(p->buffer + p->len))
348 trace_seq_printf(p, "0x%lx", val); 350 trace_seq_printf(p, "0x%lx", val);
349 351
350 trace_seq_putc(p, 0); 352 trace_seq_putc(p, 0);
@@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
370 break; 372 break;
371 } 373 }
372 374
373 if (!p->len) 375 if (ret == (const char *)(p->buffer + p->len))
374 trace_seq_printf(p, "0x%llx", val); 376 trace_seq_printf(p, "0x%llx", val);
375 377
376 trace_seq_putc(p, 0); 378 trace_seq_putc(p, 0);
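
The trace_output.c hunks fix a subtle assumption: the flag and symbol printers used p->len (an empty trace_seq buffer) to decide whether a delimiter or the raw-hex fallback was needed, which misfires once earlier output already sits in the same buffer. The fix tracks the first printed flag with a local variable and compares against the saved start position instead. A stand-alone sketch of the first-flag join pattern, assuming an illustrative flag table and snprintf() in place of trace_seq:

#include <stdio.h>
#include <string.h>

struct flag_name { unsigned long mask; const char *name; };

/* Join the names of all set flags with a delimiter, falling back to hex for
 * leftover bits -- the same "first" bookkeeping the fix introduces. */
static void print_flags(char *buf, size_t len, unsigned long flags,
			const char *delim, const struct flag_name *tbl)
{
	size_t pos = 0;
	int first = 1;

	for (int i = 0; tbl[i].name && flags; i++) {
		if ((flags & tbl[i].mask) != tbl[i].mask)
			continue;
		flags &= ~tbl[i].mask;
		if (!first)
			pos += snprintf(buf + pos, len - pos, "%s", delim);
		else
			first = 0;
		pos += snprintf(buf + pos, len - pos, "%s", tbl[i].name);
	}
	if (flags) {				/* leftover, unnamed bits */
		if (!first)
			pos += snprintf(buf + pos, len - pos, "%s", delim);
		snprintf(buf + pos, len - pos, "0x%lx", flags);
	}
}

int main(void)
{
	static const struct flag_name tbl[] = {
		{ 0x1, "FLAG_A" }, { 0x2, "FLAG_B" }, { 0, NULL },
	};
	char buf[128] = "earlier output: ";	/* the buffer is NOT empty */
	size_t off = strlen(buf);

	print_flags(buf + off, sizeof(buf) - off, 0x1 | 0x2 | 0x8000, "|", tbl);
	puts(buf);	/* earlier output: FLAG_A|FLAG_B|0x8000 */
	return 0;
}
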
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cb654542c1a1..96fc73369099 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
18 18
19static int syscall_enter_register(struct ftrace_event_call *event, 19static int syscall_enter_register(struct ftrace_event_call *event,
20 enum trace_reg type); 20 enum trace_reg type, void *data);
21static int syscall_exit_register(struct ftrace_event_call *event, 21static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type); 22 enum trace_reg type, void *data);
23 23
24static int syscall_enter_define_fields(struct ftrace_event_call *call); 24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call); 25static int syscall_exit_define_fields(struct ftrace_event_call *call);
@@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void)
468 unsigned long addr; 468 unsigned long addr;
469 int i; 469 int i;
470 470
471 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 471 syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
472 NR_syscalls, GFP_KERNEL); 472 GFP_KERNEL);
473 if (!syscalls_metadata) { 473 if (!syscalls_metadata) {
474 WARN_ON(1); 474 WARN_ON(1);
475 return -ENOMEM; 475 return -ENOMEM;
@@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
649#endif /* CONFIG_PERF_EVENTS */ 649#endif /* CONFIG_PERF_EVENTS */
650 650
651static int syscall_enter_register(struct ftrace_event_call *event, 651static int syscall_enter_register(struct ftrace_event_call *event,
652 enum trace_reg type) 652 enum trace_reg type, void *data)
653{ 653{
654 switch (type) { 654 switch (type) {
655 case TRACE_REG_REGISTER: 655 case TRACE_REG_REGISTER:
@@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event,
664 case TRACE_REG_PERF_UNREGISTER: 664 case TRACE_REG_PERF_UNREGISTER:
665 perf_sysenter_disable(event); 665 perf_sysenter_disable(event);
666 return 0; 666 return 0;
667 case TRACE_REG_PERF_OPEN:
668 case TRACE_REG_PERF_CLOSE:
669 case TRACE_REG_PERF_ADD:
670 case TRACE_REG_PERF_DEL:
671 return 0;
667#endif 672#endif
668 } 673 }
669 return 0; 674 return 0;
670} 675}
671 676
672static int syscall_exit_register(struct ftrace_event_call *event, 677static int syscall_exit_register(struct ftrace_event_call *event,
673 enum trace_reg type) 678 enum trace_reg type, void *data)
674{ 679{
675 switch (type) { 680 switch (type) {
676 case TRACE_REG_REGISTER: 681 case TRACE_REG_REGISTER:
@@ -685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event,
685 case TRACE_REG_PERF_UNREGISTER: 690 case TRACE_REG_PERF_UNREGISTER:
686 perf_sysexit_disable(event); 691 perf_sysexit_disable(event);
687 return 0; 692 return 0;
693 case TRACE_REG_PERF_OPEN:
694 case TRACE_REG_PERF_CLOSE:
695 case TRACE_REG_PERF_ADD:
696 case TRACE_REG_PERF_DEL:
697 return 0;
688#endif 698#endif
689 } 699 }
690 return 0; 700 return 0;
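
Apart from the .reg() prototype update, init_ftrace_syscalls() switches from kzalloc(sizeof(*x) * NR_syscalls, ...) to kcalloc(NR_syscalls, sizeof(*x), ...), which performs the same zeroed allocation but refuses a count * size product that would overflow. A small userspace analogue of that overflow-checked pattern (alloc_array() and the struct below are illustrative; libc calloc() already performs the check itself):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Overflow-checked "allocate n zeroed elements" -- the property kcalloc()
 * adds over a hand-written kzalloc(n * size) in the kernel. */
static void *alloc_array(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)	/* n * size would overflow */
		return NULL;
	return calloc(n, size);			/* calloc() also zeroes the memory */
}

struct syscall_meta { const char *name; int nb_args; };

int main(void)
{
	size_t nr_syscalls = 440;		/* illustrative count */
	struct syscall_meta *tbl = alloc_array(nr_syscalls, sizeof(*tbl));

	if (!tbl) {
		fprintf(stderr, "allocation failed\n");
		return 1;
	}
	printf("table of %zu entries, entry 0 zeroed: %s\n", nr_syscalls,
	       tbl[0].name == NULL && tbl[0].nb_args == 0 ? "yes" : "no");
	free(tbl);
	return 0;
}
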
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f1539decd99d..d96ba22dabfa 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,7 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/static_key.h>
29 29
30extern struct tracepoint * const __start___tracepoints_ptrs[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint * const __stop___tracepoints_ptrs[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
256{ 256{
257 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 257 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
258 258
259 if (elem->regfunc && !jump_label_enabled(&elem->key) && active) 259 if (elem->regfunc && !static_key_enabled(&elem->key) && active)
260 elem->regfunc(); 260 elem->regfunc();
261 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) 261 else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)
262 elem->unregfunc(); 262 elem->unregfunc();
263 263
264 /* 264 /*
@@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
269 * is used. 269 * is used.
270 */ 270 */
271 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 271 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
272 if (active && !jump_label_enabled(&elem->key)) 272 if (active && !static_key_enabled(&elem->key))
273 jump_label_inc(&elem->key); 273 static_key_slow_inc(&elem->key);
274 else if (!active && jump_label_enabled(&elem->key)) 274 else if (!active && static_key_enabled(&elem->key))
275 jump_label_dec(&elem->key); 275 static_key_slow_dec(&elem->key);
276} 276}
277 277
278/* 278/*
@@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
283 */ 283 */
284static void disable_tracepoint(struct tracepoint *elem) 284static void disable_tracepoint(struct tracepoint *elem)
285{ 285{
286 if (elem->unregfunc && jump_label_enabled(&elem->key)) 286 if (elem->unregfunc && static_key_enabled(&elem->key))
287 elem->unregfunc(); 287 elem->unregfunc();
288 288
289 if (jump_label_enabled(&elem->key)) 289 if (static_key_enabled(&elem->key))
290 jump_label_dec(&elem->key); 290 static_key_slow_dec(&elem->key);
291 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
292} 292}
293 293
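
tracepoint.c follows the jump_label -> static_key rename: the per-tracepoint key is bumped with static_key_slow_inc() when a tracepoint becomes active and dropped with static_key_slow_dec() when it goes idle, with static_key_enabled() replacing jump_label_enabled() in the checks. Conceptually the key acts as a reference-counted boolean; the sketch below models only that bookkeeping, not the kernel's code-patching implementation.

#include <stdio.h>

/* Toy model of a static key: a refcount whose zero/non-zero state stands in
 * for the patched branch.  The real static_key rewrites the call site; this
 * only models the enable/disable accounting. */
struct key { int refcount; };

static int key_enabled(const struct key *k)	{ return k->refcount > 0; }
static void key_slow_inc(struct key *k)		{ k->refcount++; }
static void key_slow_dec(struct key *k)		{ if (k->refcount) k->refcount--; }

struct tracepoint { const char *name; struct key key; };

static void tracepoint_update(struct tracepoint *tp, int active)
{
	/* Mirrors set_tracepoint(): bump the key when the tracepoint goes
	 * from inactive to active, drop it on the way back. */
	if (active && !key_enabled(&tp->key))
		key_slow_inc(&tp->key);
	else if (!active && key_enabled(&tp->key))
		key_slow_dec(&tp->key);
	printf("%s: %s\n", tp->name, key_enabled(&tp->key) ? "on" : "off");
}

int main(void)
{
	struct tracepoint tp = { "sched_switch", { 0 } };

	tracepoint_update(&tp, 1);	/* first probe attached */
	tracepoint_update(&tp, 1);	/* still active, no double inc */
	tracepoint_update(&tp, 0);	/* last probe removed */
	return 0;
}
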
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d117262deba3..df30ee08bdd4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -3,15 +3,14 @@
3 * 3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. 4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 * 5 *
6 * this code detects hard lockups: incidents in where on a CPU 6 * Note: Most of this code is borrowed heavily from the original softlockup
7 * the kernel does not respond to anything except NMI. 7 * detector, so thanks to Ingo for the initial implementation.
8 * 8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well. 9 * to those contributors as well.
13 */ 10 */
14 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
15#include <linux/mm.h> 14#include <linux/mm.h>
16#include <linux/cpu.h> 15#include <linux/cpu.h>
17#include <linux/nmi.h> 16#include <linux/nmi.h>
@@ -117,9 +116,10 @@ static unsigned long get_sample_period(void)
117{ 116{
118 /* 117 /*
119 * convert watchdog_thresh from seconds to ns 118 * convert watchdog_thresh from seconds to ns
120 * the divide by 5 is to give hrtimer 5 chances to 119 * the divide by 5 is to give hrtimer several chances (two
121 * increment before the hardlockup detector generates 120 * or three with the current relation between the soft
122 * a warning 121 * and hard thresholds) to increment before the
122 * hardlockup detector generates a warning
123 */ 123 */
124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
125} 125}
@@ -321,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = 0 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param);
328
329 /* initialize timestamp */ 327 /* initialize timestamp */
330 __touch_watchdog(); 328 __touch_watchdog();
331 329
@@ -336,9 +334,11 @@ static int watchdog(void *unused)
336 334
337 set_current_state(TASK_INTERRUPTIBLE); 335 set_current_state(TASK_INTERRUPTIBLE);
338 /* 336 /*
339 * Run briefly once per second to reset the softlockup timestamp. 337 * Run briefly (kicked by the hrtimer callback function) once every
340 * If this gets delayed for more than 60 seconds then the 338 * get_sample_period() seconds (4 seconds by default) to reset the
341 * debug-printout triggers in watchdog_timer_fn(). 339 * softlockup timestamp. If this gets delayed for more than
340 * 2*watchdog_thresh seconds then the debug-printout triggers in
341 * watchdog_timer_fn().
342 */ 342 */
343 while (!kthread_should_stop()) { 343 while (!kthread_should_stop()) {
344 __touch_watchdog(); 344 __touch_watchdog();
@@ -349,8 +349,11 @@ static int watchdog(void *unused)
349 349
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 /*
353 * Drop the policy/priority elevation during thread exit to avoid a
354 * scheduling latency spike.
355 */
352 __set_current_state(TASK_RUNNING); 356 __set_current_state(TASK_RUNNING);
353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param); 357 sched_setscheduler(current, SCHED_NORMAL, &param);
355 return 0; 358 return 0;
356} 359}
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)
376 /* Try to register using hardware perf events */ 379 /* Try to register using hardware perf events */
377 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 380 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
378 if (!IS_ERR(event)) { 381 if (!IS_ERR(event)) {
379 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 382 pr_info("enabled, takes one hw-pmu counter.\n");
380 goto out_save; 383 goto out_save;
381 } 384 }
382 385
383 386
384 /* vary the KERN level based on the returned errno */ 387 /* vary the KERN level based on the returned errno */
385 if (PTR_ERR(event) == -EOPNOTSUPP) 388 if (PTR_ERR(event) == -EOPNOTSUPP)
386 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 389 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
387 else if (PTR_ERR(event) == -ENOENT) 390 else if (PTR_ERR(event) == -ENOENT)
388 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); 391 pr_warning("disabled (cpu%i): hardware events not enabled\n",
392 cpu);
389 else 393 else
390 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); 394 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
395 cpu, PTR_ERR(event));
391 return PTR_ERR(event); 396 return PTR_ERR(event);
392 397
393 /* success path */ 398 /* success path */
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)
439 444
440 /* create the watchdog thread */ 445 /* create the watchdog thread */
441 if (!p) { 446 if (!p) {
447 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); 448 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
443 if (IS_ERR(p)) { 449 if (IS_ERR(p)) {
444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 450 pr_err("softlockup watchdog for %i failed\n", cpu);
445 if (!err) { 451 if (!err) {
446 /* if hardlockup hasn't already set this */ 452 /* if hardlockup hasn't already set this */
447 err = PTR_ERR(p); 453 err = PTR_ERR(p);
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)
450 } 456 }
451 goto out; 457 goto out;
452 } 458 }
459 sched_setscheduler(p, SCHED_FIFO, &param);
453 kthread_bind(p, cpu); 460 kthread_bind(p, cpu);
454 per_cpu(watchdog_touch_ts, cpu) = 0; 461 per_cpu(watchdog_touch_ts, cpu) = 0;
455 per_cpu(softlockup_watchdog, cpu) = p; 462 per_cpu(softlockup_watchdog, cpu) = p;
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
496 watchdog_enabled = 1; 503 watchdog_enabled = 1;
497 504
498 if (!watchdog_enabled) 505 if (!watchdog_enabled)
499 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 506 pr_err("failed to be enabled on some cpus\n");
500 507
501} 508}
502 509
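
Two independent cleanups land in watchdog.c: all messages move to the pr_*() helpers under a pr_fmt() prefix, so every line automatically reads "NMI watchdog: ...", and the SCHED_FIFO elevation moves from the watchdog thread into watchdog_enable(), which sets the policy on the freshly created kthread before binding it. The sketch below only mocks the pr_fmt() prefixing in userspace; printk()/pr_info() are kernel-only, and the printf-based macros are stand-ins.

#include <stdio.h>

/* In the kernel, pr_info(fmt, ...) expands to printk(KERN_INFO pr_fmt(fmt), ...)
 * and pr_fmt() defaults to the bare format unless a file defines it first.
 * The mock below reproduces the compile-time string pasting with printf(). */
#define pr_fmt(fmt) "NMI watchdog: " fmt

#define pr_info(fmt, ...)	printf("[info] "  pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err(fmt, ...)	printf("[error] " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	int cpu = 3;

	/* Same call sites as the patched watchdog code; the prefix comes for free. */
	pr_info("enabled, takes one hw-pmu counter.\n");
	pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
	       cpu, -2L);
	return 0;
}
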
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f2c5638bb5ab..5abf42f63c08 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -476,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
476 struct workqueue_struct *wq) 476 struct workqueue_struct *wq)
477{ 477{
478 if (!(wq->flags & WQ_UNBOUND)) { 478 if (!(wq->flags & WQ_UNBOUND)) {
479 if (likely(cpu < nr_cpu_ids)) { 479 if (likely(cpu < nr_cpu_ids))
480#ifdef CONFIG_SMP
481 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 480 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
482#else
483 return wq->cpu_wq.single;
484#endif
485 }
486 } else if (likely(cpu == WORK_CPU_UNBOUND)) 481 } else if (likely(cpu == WORK_CPU_UNBOUND))
487 return wq->cpu_wq.single; 482 return wq->cpu_wq.single;
488 return NULL; 483 return NULL;
@@ -2899,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2899 const size_t size = sizeof(struct cpu_workqueue_struct); 2894 const size_t size = sizeof(struct cpu_workqueue_struct);
2900 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 2895 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2901 __alignof__(unsigned long long)); 2896 __alignof__(unsigned long long));
2902#ifdef CONFIG_SMP
2903 bool percpu = !(wq->flags & WQ_UNBOUND);
2904#else
2905 bool percpu = false;
2906#endif
2907 2897
2908 if (percpu) 2898 if (!(wq->flags & WQ_UNBOUND))
2909 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 2899 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2910 else { 2900 else {
2911 void *ptr; 2901 void *ptr;
@@ -2929,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2929 2919
2930static void free_cwqs(struct workqueue_struct *wq) 2920static void free_cwqs(struct workqueue_struct *wq)
2931{ 2921{
2932#ifdef CONFIG_SMP 2922 if (!(wq->flags & WQ_UNBOUND))
2933 bool percpu = !(wq->flags & WQ_UNBOUND);
2934#else
2935 bool percpu = false;
2936#endif
2937
2938 if (percpu)
2939 free_percpu(wq->cpu_wq.pcpu); 2923 free_percpu(wq->cpu_wq.pcpu);
2940 else if (wq->cpu_wq.single) { 2924 else if (wq->cpu_wq.single) {
2941 /* the pointer to free is stored right after the cwq */ 2925 /* the pointer to free is stored right after the cwq */