path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/auditsc.c | 8
-rw-r--r--  kernel/capability.c | 21
-rw-r--r--  kernel/cgroup.c | 607
-rw-r--r--  kernel/cgroup_freezer.c | 11
-rw-r--r--  kernel/compat.c | 73
-rw-r--r--  kernel/cpu.c | 57
-rw-r--r--  kernel/cpu_pm.c | 16
-rw-r--r--  kernel/cpuset.c | 31
-rw-r--r--  kernel/cred.c | 53
-rw-r--r--  kernel/events/Makefile | 3
-rw-r--r--  kernel/events/core.c | 11
-rw-r--r--  kernel/events/uprobes.c | 1667
-rw-r--r--  kernel/exit.c | 17
-rw-r--r--  kernel/extable.c | 8
-rw-r--r--  kernel/fork.c | 114
-rw-r--r--  kernel/groups.c | 50
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/chip.c | 13
-rw-r--r--  kernel/irq/internals.h | 3
-rw-r--r--  kernel/irq/irqdesc.c | 1
-rw-r--r--  kernel/irq/irqdomain.c | 106
-rw-r--r--  kernel/irq/manage.c | 157
-rw-r--r--  kernel/irq/migration.c | 13
-rw-r--r--  kernel/irq/pm.c | 7
-rw-r--r--  kernel/irq/resend.c | 7
-rw-r--r--  kernel/kallsyms.c | 32
-rw-r--r--  kernel/kcmp.c | 196
-rw-r--r--  kernel/kfifo.c | 1
-rw-r--r--  kernel/kmod.c | 30
-rw-r--r--  kernel/lglock.c | 89
-rw-r--r--  kernel/module.c | 5
-rw-r--r--  kernel/params.c | 62
-rw-r--r--  kernel/pid.c | 3
-rw-r--r--  kernel/pid_namespace.c | 13
-rw-r--r--  kernel/power/Kconfig | 27
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/autosleep.c | 127
-rw-r--r--  kernel/power/hibernate.c | 13
-rw-r--r--  kernel/power/main.c | 160
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/swap.c | 62
-rw-r--r--  kernel/power/wakelock.c | 259
-rw-r--r--  kernel/printk.c | 1390
-rw-r--r--  kernel/ptrace.c | 15
-rw-r--r--  kernel/rcupdate.c | 28
-rw-r--r--  kernel/rcutiny_plugin.h | 16
-rw-r--r--  kernel/rcutorture.c | 257
-rw-r--r--  kernel/rcutree.c | 332
-rw-r--r--  kernel/rcutree.h | 23
-rw-r--r--  kernel/rcutree_plugin.h | 154
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/res_counter.c | 81
-rw-r--r--  kernel/resource.c | 4
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/core.c | 494
-rw-r--r--  kernel/sched/debug.c | 12
-rw-r--r--  kernel/sched/fair.c | 504
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 105
-rw-r--r--  kernel/sched/sched.h | 8
-rw-r--r--  kernel/seccomp.c | 458
-rw-r--r--  kernel/semaphore.c | 2
-rw-r--r--  kernel/signal.c | 140
-rw-r--r--  kernel/smp.c | 27
-rw-r--r--  kernel/smpboot.c | 67
-rw-r--r--  kernel/smpboot.h | 18
-rw-r--r--  kernel/srcu.c | 548
-rw-r--r--  kernel/sys.c | 491
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/task_work.c | 84
-rw-r--r--  kernel/time/Kconfig | 58
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 3
-rw-r--r--  kernel/time/ntp.c | 8
-rw-r--r--  kernel/time/tick-sched.c | 19
-rw-r--r--  kernel/time/timekeeping.c | 4
-rw-r--r--  kernel/timer.c | 20
-rw-r--r--  kernel/trace/Kconfig | 23
-rw-r--r--  kernel/trace/Makefile | 3
-rw-r--r--  kernel/trace/ftrace.c | 242
-rw-r--r--  kernel/trace/ring_buffer.c | 590
-rw-r--r--  kernel/trace/trace.c | 503
-rw-r--r--  kernel/trace/trace.h | 9
-rw-r--r--  kernel/trace/trace_events.c | 5
-rw-r--r--  kernel/trace/trace_export.c | 1
-rw-r--r--  kernel/trace/trace_kprobe.c | 899
-rw-r--r--  kernel/trace/trace_printk.c | 4
-rw-r--r--  kernel/trace/trace_probe.c | 839
-rw-r--r--  kernel/trace/trace_probe.h | 161
-rw-r--r--  kernel/trace/trace_uprobe.c | 788
-rw-r--r--  kernel/trace/trace_workqueue.c | 300
-rw-r--r--  kernel/uid16.c | 48
-rw-r--r--  kernel/user.c | 51
-rw-r--r--  kernel/user_namespace.c | 595
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 12
-rw-r--r--  kernel/workqueue.c | 21
98 files changed, 10664 insertions, 3991 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..c0cc67ad764c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,12 @@
 obj-y     = fork.o exec_domain.o panic.o printk.o \
             cpu.o exit.o itimer.o time.o softirq.o resource.o \
             sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
-            signal.o sys.o kmod.o workqueue.o pid.o \
+            signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
             rcupdate.o extable.o params.o posix-timers.o \
             kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
             hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
             notifier.o ksysfs.o cred.o \
-            async.o range.o groups.o
+            async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -25,6 +25,9 @@ endif
 obj-y += sched/
 obj-y += power/
 
+ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
+obj-$(CONFIG_X86) += kcmp.o
+endif
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -43,6 +46,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
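The hunk above wires the new kernel/kcmp.c into the x86 build when CONFIG_CHECKPOINT_RESTORE is enabled; kcmp() lets checkpoint/restore tooling ask whether two processes share a given kernel resource. A hedged userspace sketch of how the syscall is typically invoked is below; it assumes installed headers that already export __NR_kcmp and <linux/kcmp.h>, and kcmp() has no libc wrapper, so it goes through syscall(2).

    /* Illustrative only: ask whether two processes share an mm.
     * 0 = same object, 1/2/3 = distinct (ordered), negative = error. */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/kcmp.h>     /* KCMP_VM and friends (assumed exported) */

    int main(void)
    {
            pid_t self = getpid(), parent = getppid();
            long ret;

            ret = syscall(__NR_kcmp, self, parent, KCMP_VM, 0, 0);
            printf("kcmp(%d, %d, KCMP_VM) = %ld\n", self, parent, ret);
            return 0;
    }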
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
 #include <linux/syscalls.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
+#include <linux/compat.h>
 
 #include "audit.h"
 
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
 	audit_log_end(ab);
 }
 
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
 {
 	struct audit_buffer *ab;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-	audit_log_abend(ab, "seccomp", SIGKILL);
+	audit_log_abend(ab, "seccomp", signr);
 	audit_log_format(ab, " syscall=%ld", syscall);
+	audit_log_format(ab, " compat=%d", is_compat_task());
+	audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+	audit_log_format(ab, " code=0x%x", code);
 	audit_log_end(ab);
 }
 
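With the new prototype the seccomp code can report which signal is about to be delivered and which filter return code triggered the record, while compat mode and the faulting instruction pointer are gathered inside __audit_seccomp() itself. A hypothetical call site might look like the sketch below; only the __audit_seccomp() prototype comes from this hunk, the wrapper function and its place in the seccomp path are assumptions.

    #include <linux/audit.h>
    #include <linux/seccomp.h>
    #include <linux/signal.h>

    /* Hypothetical caller, not from this patch. */
    static void demo_report_seccomp_kill(int this_syscall)
    {
            /* syscall number, signal being delivered, filter return code */
            __audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
    }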
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c6470..493d97259484 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -419,3 +419,24 @@ bool nsown_capable(int cap)
 {
 	return ns_capable(current_user_ns(), cap);
 }
+
+/**
+ * inode_capable - Check superior capability over inode
+ * @inode: The inode in question
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given superior capability
+ * targeted at it's own user namespace and that the given inode is owned
+ * by the current user namespace or a child namespace.
+ *
+ * Currently we check to see if an inode is owned by the current
+ * user namespace by seeing if the inode's owner maps into the
+ * current user namespace.
+ *
+ */
+bool inode_capable(const struct inode *inode, int cap)
+{
+	struct user_namespace *ns = current_user_ns();
+
+	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
+}
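inode_capable() is aimed at permission checks that used to be a bare capable(CAP_FOO) but must now also respect user namespaces: the capability only counts if the inode's owner maps into the caller's namespace. A simplified sketch of the kind of check this enables follows; chown_ok() is an illustrative helper under those assumptions, not code from this commit.

    #include <linux/capability.h>
    #include <linux/cred.h>
    #include <linux/fs.h>
    #include <linux/uidgid.h>

    /* Illustrative: may the caller chown @inode to @new_uid?  Either the
     * caller already owns the inode and keeps the same owner, or it holds
     * CAP_CHOWN in a namespace that can "see" the inode's owner. */
    static bool chown_ok(const struct inode *inode, kuid_t new_uid)
    {
            if (uid_eq(current_fsuid(), inode->i_uid) &&
                uid_eq(new_uid, inode->i_uid))
                    return true;
            return inode_capable(inode, CAP_CHOWN);
    }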
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..72fcd3069a90 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
60#include <linux/eventfd.h> 60#include <linux/eventfd.h>
61#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
63#include <linux/kthread.h>
63 64
64#include <linux/atomic.h> 65#include <linux/atomic.h>
65 66
67/* css deactivation bias, makes css->refcnt negative to deny new trygets */
68#define CSS_DEACT_BIAS INT_MIN
69
66/* 70/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its 71 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it. 72 * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
127 /* A list running through the active hierarchies */ 131 /* A list running through the active hierarchies */
128 struct list_head root_list; 132 struct list_head root_list;
129 133
134 /* All cgroups on this root, cgroup_mutex protected */
135 struct list_head allcg_list;
136
130 /* Hierarchy-specific flags */ 137 /* Hierarchy-specific flags */
131 unsigned long flags; 138 unsigned long flags;
132 139
@@ -145,6 +152,15 @@ struct cgroupfs_root {
145static struct cgroupfs_root rootnode; 152static struct cgroupfs_root rootnode;
146 153
147/* 154/*
155 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
156 */
157struct cfent {
158 struct list_head node;
159 struct dentry *dentry;
160 struct cftype *type;
161};
162
163/*
148 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 164 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
149 * cgroup_subsys->use_id != 0. 165 * cgroup_subsys->use_id != 0.
150 */ 166 */
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
239 255
240EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
241 257
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css)
260{
261 int v = atomic_read(&css->refcnt);
262
263 return v >= 0 ? v : v - CSS_DEACT_BIAS;
264}
265
242/* convenient tests for these bits */ 266/* convenient tests for these bits */
243inline int cgroup_is_removed(const struct cgroup *cgrp) 267inline int cgroup_is_removed(const struct cgroup *cgrp)
244{ 268{
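The deactivation trick above works because CSS_DEACT_BIAS is INT_MIN: adding it drives any small non-negative refcount negative, which is how new trygets are denied, while css_refcnt() recovers the true count by subtracting the bias back out. A standalone sketch of just that arithmetic, using a plain int instead of atomic_t and made-up values:

    #include <limits.h>
    #include <stdio.h>

    #define CSS_DEACT_BIAS INT_MIN

    /* same arithmetic as css_refcnt(), minus the atomics */
    static int refcnt_value(int raw)
    {
            return raw >= 0 ? raw : raw - CSS_DEACT_BIAS;
    }

    int main(void)
    {
            int raw = 3;                    /* three outstanding references */

            raw += CSS_DEACT_BIAS;          /* deactivate: raw goes negative */
            printf("raw=%d value=%d\n", raw, refcnt_value(raw));   /* value=3 */

            raw -= CSS_DEACT_BIAS;          /* abort removal: back to 3 */
            printf("raw=%d value=%d\n", raw, refcnt_value(raw));
            return 0;
    }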
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
279#define for_each_active_root(_root) \ 303#define for_each_active_root(_root) \
280list_for_each_entry(_root, &roots, root_list) 304list_for_each_entry(_root, &roots, root_list)
281 305
306static inline struct cgroup *__d_cgrp(struct dentry *dentry)
307{
308 return dentry->d_fsdata;
309}
310
311static inline struct cfent *__d_cfe(struct dentry *dentry)
312{
313 return dentry->d_fsdata;
314}
315
316static inline struct cftype *__d_cft(struct dentry *dentry)
317{
318 return __d_cfe(dentry)->type;
319}
320
282/* the list of cgroups eligible for automatic release. Protected by 321/* the list of cgroups eligible for automatic release. Protected by
283 * release_list_lock */ 322 * release_list_lock */
284static LIST_HEAD(release_list); 323static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
816 struct cgroup_subsys *ss; 855 struct cgroup_subsys *ss;
817 int ret = 0; 856 int ret = 0;
818 857
819 for_each_subsys(cgrp->root, ss) 858 for_each_subsys(cgrp->root, ss) {
820 if (ss->pre_destroy) { 859 if (!ss->pre_destroy)
821 ret = ss->pre_destroy(cgrp); 860 continue;
822 if (ret) 861
823 break; 862 ret = ss->pre_destroy(cgrp);
863 if (ret) {
864 /* ->pre_destroy() failure is being deprecated */
865 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
866 break;
824 } 867 }
868 }
825 869
826 return ret; 870 return ret;
827} 871}
@@ -852,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
852 mutex_unlock(&cgroup_mutex); 896 mutex_unlock(&cgroup_mutex);
853 897
854 /* 898 /*
855 * Drop the active superblock reference that we took when we 899 * We want to drop the active superblock reference from the
856 * created the cgroup 900 * cgroup creation after all the dentry refs are gone -
901 * kill_sb gets mighty unhappy otherwise. Mark
902 * dentry->d_fsdata with cgroup_diput() to tell
903 * cgroup_d_release() to call deactivate_super().
857 */ 904 */
858 deactivate_super(cgrp->root->sb); 905 dentry->d_fsdata = cgroup_diput;
859 906
860 /* 907 /*
861 * if we're getting rid of the cgroup, refcount should ensure 908 * if we're getting rid of the cgroup, refcount should ensure
@@ -864,6 +911,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
864 BUG_ON(!list_empty(&cgrp->pidlists)); 911 BUG_ON(!list_empty(&cgrp->pidlists));
865 912
866 kfree_rcu(cgrp, rcu_head); 913 kfree_rcu(cgrp, rcu_head);
914 } else {
915 struct cfent *cfe = __d_cfe(dentry);
916 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
917
918 WARN_ONCE(!list_empty(&cfe->node) &&
919 cgrp != &cgrp->root->top_cgroup,
920 "cfe still linked for %s\n", cfe->type->name);
921 kfree(cfe);
867 } 922 }
868 iput(inode); 923 iput(inode);
869} 924}
@@ -873,6 +928,13 @@ static int cgroup_delete(const struct dentry *d)
873 return 1; 928 return 1;
874} 929}
875 930
931static void cgroup_d_release(struct dentry *dentry)
932{
933 /* did cgroup_diput() tell me to deactivate super? */
934 if (dentry->d_fsdata == cgroup_diput)
935 deactivate_super(dentry->d_sb);
936}
937
876static void remove_dir(struct dentry *d) 938static void remove_dir(struct dentry *d)
877{ 939{
878 struct dentry *parent = dget(d->d_parent); 940 struct dentry *parent = dget(d->d_parent);
@@ -882,34 +944,36 @@ static void remove_dir(struct dentry *d)
882 dput(parent); 944 dput(parent);
883} 945}
884 946
885static void cgroup_clear_directory(struct dentry *dentry) 947static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
886{ 948{
887 struct list_head *node; 949 struct cfent *cfe;
888 950
889 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 951 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
890 spin_lock(&dentry->d_lock); 952 lockdep_assert_held(&cgroup_mutex);
891 node = dentry->d_subdirs.next; 953
892 while (node != &dentry->d_subdirs) { 954 list_for_each_entry(cfe, &cgrp->files, node) {
893 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 955 struct dentry *d = cfe->dentry;
894 956
895 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 957 if (cft && cfe->type != cft)
896 list_del_init(node); 958 continue;
897 if (d->d_inode) { 959
898 /* This should never be called on a cgroup 960 dget(d);
899 * directory with child cgroups */ 961 d_delete(d);
900 BUG_ON(d->d_inode->i_mode & S_IFDIR); 962 simple_unlink(d->d_inode, d);
901 dget_dlock(d); 963 list_del_init(&cfe->node);
902 spin_unlock(&d->d_lock); 964 dput(d);
903 spin_unlock(&dentry->d_lock); 965
904 d_delete(d); 966 return 0;
905 simple_unlink(dentry->d_inode, d);
906 dput(d);
907 spin_lock(&dentry->d_lock);
908 } else
909 spin_unlock(&d->d_lock);
910 node = dentry->d_subdirs.next;
911 } 967 }
912 spin_unlock(&dentry->d_lock); 968 return -ENOENT;
969}
970
971static void cgroup_clear_directory(struct dentry *dir)
972{
973 struct cgroup *cgrp = __d_cgrp(dir);
974
975 while (!list_empty(&cgrp->files))
976 cgroup_rm_file(cgrp, NULL);
913} 977}
914 978
915/* 979/*
@@ -1294,6 +1358,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1294 if (ret) 1358 if (ret)
1295 goto out_unlock; 1359 goto out_unlock;
1296 1360
1361 /* See feature-removal-schedule.txt */
1362 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
1363 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1364 task_tgid_nr(current), current->comm);
1365
1297 /* Don't allow flags or name to change at remount */ 1366 /* Don't allow flags or name to change at remount */
1298 if (opts.flags != root->flags || 1367 if (opts.flags != root->flags ||
1299 (opts.name && strcmp(opts.name, root->name))) { 1368 (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1377,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1308 goto out_unlock; 1377 goto out_unlock;
1309 } 1378 }
1310 1379
1311 /* (re)populate subsystem files */ 1380 /* clear out any existing files and repopulate subsystem files */
1381 cgroup_clear_directory(cgrp->dentry);
1312 cgroup_populate_dir(cgrp); 1382 cgroup_populate_dir(cgrp);
1313 1383
1314 if (opts.release_agent) 1384 if (opts.release_agent)
@@ -1333,6 +1403,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1333{ 1403{
1334 INIT_LIST_HEAD(&cgrp->sibling); 1404 INIT_LIST_HEAD(&cgrp->sibling);
1335 INIT_LIST_HEAD(&cgrp->children); 1405 INIT_LIST_HEAD(&cgrp->children);
1406 INIT_LIST_HEAD(&cgrp->files);
1336 INIT_LIST_HEAD(&cgrp->css_sets); 1407 INIT_LIST_HEAD(&cgrp->css_sets);
1337 INIT_LIST_HEAD(&cgrp->release_list); 1408 INIT_LIST_HEAD(&cgrp->release_list);
1338 INIT_LIST_HEAD(&cgrp->pidlists); 1409 INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1415,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1344static void init_cgroup_root(struct cgroupfs_root *root) 1415static void init_cgroup_root(struct cgroupfs_root *root)
1345{ 1416{
1346 struct cgroup *cgrp = &root->top_cgroup; 1417 struct cgroup *cgrp = &root->top_cgroup;
1418
1347 INIT_LIST_HEAD(&root->subsys_list); 1419 INIT_LIST_HEAD(&root->subsys_list);
1348 INIT_LIST_HEAD(&root->root_list); 1420 INIT_LIST_HEAD(&root->root_list);
1421 INIT_LIST_HEAD(&root->allcg_list);
1349 root->number_of_cgroups = 1; 1422 root->number_of_cgroups = 1;
1350 cgrp->root = root; 1423 cgrp->root = root;
1351 cgrp->top_cgroup = cgrp; 1424 cgrp->top_cgroup = cgrp;
1425 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1352 init_cgroup_housekeeping(cgrp); 1426 init_cgroup_housekeeping(cgrp);
1353} 1427}
1354 1428
@@ -1468,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1468 static const struct dentry_operations cgroup_dops = { 1542 static const struct dentry_operations cgroup_dops = {
1469 .d_iput = cgroup_diput, 1543 .d_iput = cgroup_diput,
1470 .d_delete = cgroup_delete, 1544 .d_delete = cgroup_delete,
1545 .d_release = cgroup_d_release,
1471 }; 1546 };
1472 1547
1473 struct inode *inode = 1548 struct inode *inode =
@@ -1692,16 +1767,6 @@ static struct file_system_type cgroup_fs_type = {
1692 1767
1693static struct kobject *cgroup_kobj; 1768static struct kobject *cgroup_kobj;
1694 1769
1695static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1696{
1697 return dentry->d_fsdata;
1698}
1699
1700static inline struct cftype *__d_cft(struct dentry *dentry)
1701{
1702 return dentry->d_fsdata;
1703}
1704
1705/** 1770/**
1706 * cgroup_path - generate the path of a cgroup 1771 * cgroup_path - generate the path of a cgroup
1707 * @cgrp: the cgroup in question 1772 * @cgrp: the cgroup in question
@@ -2160,9 +2225,9 @@ retry_find_task:
2160 * only need to check permissions on one of them. 2225 * only need to check permissions on one of them.
2161 */ 2226 */
2162 tcred = __task_cred(tsk); 2227 tcred = __task_cred(tsk);
2163 if (cred->euid && 2228 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2164 cred->euid != tcred->uid && 2229 !uid_eq(cred->euid, tcred->uid) &&
2165 cred->euid != tcred->suid) { 2230 !uid_eq(cred->euid, tcred->suid)) {
2166 rcu_read_unlock(); 2231 rcu_read_unlock();
2167 ret = -EACCES; 2232 ret = -EACCES;
2168 goto out_unlock_cgroup; 2233 goto out_unlock_cgroup;
@@ -2172,6 +2237,18 @@ retry_find_task:
2172 2237
2173 if (threadgroup) 2238 if (threadgroup)
2174 tsk = tsk->group_leader; 2239 tsk = tsk->group_leader;
2240
2241 /*
2242 * Workqueue threads may acquire PF_THREAD_BOUND and become
2243 * trapped in a cpuset, or RT worker may be born in a cgroup
2244 * with no rt_runtime allocated. Just say no.
2245 */
2246 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
2247 ret = -EINVAL;
2248 rcu_read_unlock();
2249 goto out_unlock_cgroup;
2250 }
2251
2175 get_task_struct(tsk); 2252 get_task_struct(tsk);
2176 rcu_read_unlock(); 2253 rcu_read_unlock();
2177 2254
@@ -2603,50 +2680,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2603 return mode; 2680 return mode;
2604} 2681}
2605 2682
2606int cgroup_add_file(struct cgroup *cgrp, 2683static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2607 struct cgroup_subsys *subsys, 2684 const struct cftype *cft)
2608 const struct cftype *cft)
2609{ 2685{
2610 struct dentry *dir = cgrp->dentry; 2686 struct dentry *dir = cgrp->dentry;
2687 struct cgroup *parent = __d_cgrp(dir);
2611 struct dentry *dentry; 2688 struct dentry *dentry;
2689 struct cfent *cfe;
2612 int error; 2690 int error;
2613 umode_t mode; 2691 umode_t mode;
2614
2615 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2692 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2693
2694 /* does @cft->flags tell us to skip creation on @cgrp? */
2695 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2696 return 0;
2697 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2698 return 0;
2699
2616 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2700 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2617 strcpy(name, subsys->name); 2701 strcpy(name, subsys->name);
2618 strcat(name, "."); 2702 strcat(name, ".");
2619 } 2703 }
2620 strcat(name, cft->name); 2704 strcat(name, cft->name);
2705
2621 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2706 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2707
2708 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2709 if (!cfe)
2710 return -ENOMEM;
2711
2622 dentry = lookup_one_len(name, dir, strlen(name)); 2712 dentry = lookup_one_len(name, dir, strlen(name));
2623 if (!IS_ERR(dentry)) { 2713 if (IS_ERR(dentry)) {
2624 mode = cgroup_file_mode(cft);
2625 error = cgroup_create_file(dentry, mode | S_IFREG,
2626 cgrp->root->sb);
2627 if (!error)
2628 dentry->d_fsdata = (void *)cft;
2629 dput(dentry);
2630 } else
2631 error = PTR_ERR(dentry); 2714 error = PTR_ERR(dentry);
2715 goto out;
2716 }
2717
2718 mode = cgroup_file_mode(cft);
2719 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2720 if (!error) {
2721 cfe->type = (void *)cft;
2722 cfe->dentry = dentry;
2723 dentry->d_fsdata = cfe;
2724 list_add_tail(&cfe->node, &parent->files);
2725 cfe = NULL;
2726 }
2727 dput(dentry);
2728out:
2729 kfree(cfe);
2632 return error; 2730 return error;
2633} 2731}
2634EXPORT_SYMBOL_GPL(cgroup_add_file);
2635 2732
2636int cgroup_add_files(struct cgroup *cgrp, 2733static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2637 struct cgroup_subsys *subsys, 2734 const struct cftype cfts[], bool is_add)
2638 const struct cftype cft[],
2639 int count)
2640{ 2735{
2641 int i, err; 2736 const struct cftype *cft;
2642 for (i = 0; i < count; i++) { 2737 int err, ret = 0;
2643 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2738
2644 if (err) 2739 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2645 return err; 2740 if (is_add)
2741 err = cgroup_add_file(cgrp, subsys, cft);
2742 else
2743 err = cgroup_rm_file(cgrp, cft);
2744 if (err) {
2745 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2746 is_add ? "add" : "remove", cft->name, err);
2747 ret = err;
2748 }
2646 } 2749 }
2750 return ret;
2751}
2752
2753static DEFINE_MUTEX(cgroup_cft_mutex);
2754
2755static void cgroup_cfts_prepare(void)
2756 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2757{
2758 /*
2759 * Thanks to the entanglement with vfs inode locking, we can't walk
2760 * the existing cgroups under cgroup_mutex and create files.
2761 * Instead, we increment reference on all cgroups and build list of
2762 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
2763 * exclusive access to the field.
2764 */
2765 mutex_lock(&cgroup_cft_mutex);
2766 mutex_lock(&cgroup_mutex);
2767}
2768
2769static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2770 const struct cftype *cfts, bool is_add)
2771 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2772{
2773 LIST_HEAD(pending);
2774 struct cgroup *cgrp, *n;
2775
2776 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2777 if (cfts && ss->root != &rootnode) {
2778 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2779 dget(cgrp->dentry);
2780 list_add_tail(&cgrp->cft_q_node, &pending);
2781 }
2782 }
2783
2784 mutex_unlock(&cgroup_mutex);
2785
2786 /*
2787 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2788 * files for all cgroups which were created before.
2789 */
2790 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2791 struct inode *inode = cgrp->dentry->d_inode;
2792
2793 mutex_lock(&inode->i_mutex);
2794 mutex_lock(&cgroup_mutex);
2795 if (!cgroup_is_removed(cgrp))
2796 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2797 mutex_unlock(&cgroup_mutex);
2798 mutex_unlock(&inode->i_mutex);
2799
2800 list_del_init(&cgrp->cft_q_node);
2801 dput(cgrp->dentry);
2802 }
2803
2804 mutex_unlock(&cgroup_cft_mutex);
2805}
2806
2807/**
2808 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2809 * @ss: target cgroup subsystem
2810 * @cfts: zero-length name terminated array of cftypes
2811 *
2812 * Register @cfts to @ss. Files described by @cfts are created for all
2813 * existing cgroups to which @ss is attached and all future cgroups will
2814 * have them too. This function can be called anytime whether @ss is
2815 * attached or not.
2816 *
2817 * Returns 0 on successful registration, -errno on failure. Note that this
2818 * function currently returns 0 as long as @cfts registration is successful
2819 * even if some file creation attempts on existing cgroups fail.
2820 */
2821int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2822{
2823 struct cftype_set *set;
2824
2825 set = kzalloc(sizeof(*set), GFP_KERNEL);
2826 if (!set)
2827 return -ENOMEM;
2828
2829 cgroup_cfts_prepare();
2830 set->cfts = cfts;
2831 list_add_tail(&set->node, &ss->cftsets);
2832 cgroup_cfts_commit(ss, cfts, true);
2833
2647 return 0; 2834 return 0;
2648} 2835}
2649EXPORT_SYMBOL_GPL(cgroup_add_files); 2836EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2837
2838/**
2839 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2840 * @ss: target cgroup subsystem
2841 * @cfts: zero-length name terminated array of cftypes
2842 *
2843 * Unregister @cfts from @ss. Files described by @cfts are removed from
2844 * all existing cgroups to which @ss is attached and all future cgroups
2845 * won't have them either. This function can be called anytime whether @ss
2846 * is attached or not.
2847 *
2848 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2849 * registered with @ss.
2850 */
2851int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2852{
2853 struct cftype_set *set;
2854
2855 cgroup_cfts_prepare();
2856
2857 list_for_each_entry(set, &ss->cftsets, node) {
2858 if (set->cfts == cfts) {
2859 list_del_init(&set->node);
2860 cgroup_cfts_commit(ss, cfts, false);
2861 return 0;
2862 }
2863 }
2864
2865 cgroup_cfts_commit(ss, NULL, false);
2866 return -ENOENT;
2867}
2650 2868
2651/** 2869/**
2652 * cgroup_task_count - count the number of tasks in a cgroup. 2870 * cgroup_task_count - count the number of tasks in a cgroup.
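The hunk above replaces per-cgroup ->populate() calls with registered cftype arrays: an array is terminated by an entry with an empty name, base_cftypes covers files a subsystem always wants, and cgroup_add_cftypes() can register more at any time, retrofitting files onto existing cgroups. A hypothetical controller using the interface might look like the sketch below; the file name, handler and "demo_subsys" are made-up stand-ins, only the cftype layout and the registration call reflect this patch.

    #include <linux/cgroup.h>
    #include <linux/init.h>

    /* Hypothetical controller file; "demo_subsys" stands in for a real
     * struct cgroup_subsys declared elsewhere. */
    static u64 demo_weight_read(struct cgroup *cgrp, struct cftype *cft)
    {
            return 0;       /* stub */
    }

    static struct cftype demo_files[] = {
            {
                    .name           = "weight",
                    .flags          = CFTYPE_NOT_ON_ROOT,   /* skip the root cgroup */
                    .read_u64       = demo_weight_read,
            },
            { }     /* empty name terminates the array */
    };

    /* Either point demo_subsys.base_cftypes at demo_files, or register the
     * array at any time; existing cgroups then get the files as well. */
    static int __init demo_files_init(void)
    {
            return cgroup_add_cftypes(&demo_subsys, demo_files);
    }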
@@ -3625,13 +3843,14 @@ static struct cftype files[] = {
3625 .read_u64 = cgroup_clone_children_read, 3843 .read_u64 = cgroup_clone_children_read,
3626 .write_u64 = cgroup_clone_children_write, 3844 .write_u64 = cgroup_clone_children_write,
3627 }, 3845 },
3628}; 3846 {
3629 3847 .name = "release_agent",
3630static struct cftype cft_release_agent = { 3848 .flags = CFTYPE_ONLY_ON_ROOT,
3631 .name = "release_agent", 3849 .read_seq_string = cgroup_release_agent_show,
3632 .read_seq_string = cgroup_release_agent_show, 3850 .write_string = cgroup_release_agent_write,
3633 .write_string = cgroup_release_agent_write, 3851 .max_write_len = PATH_MAX,
3634 .max_write_len = PATH_MAX, 3852 },
3853 { } /* terminate */
3635}; 3854};
3636 3855
3637static int cgroup_populate_dir(struct cgroup *cgrp) 3856static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3858,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3639 int err; 3858 int err;
3640 struct cgroup_subsys *ss; 3859 struct cgroup_subsys *ss;
3641 3860
3642 /* First clear out any existing files */ 3861 err = cgroup_addrm_files(cgrp, NULL, files, true);
3643 cgroup_clear_directory(cgrp->dentry);
3644
3645 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3646 if (err < 0) 3862 if (err < 0)
3647 return err; 3863 return err;
3648 3864
3649 if (cgrp == cgrp->top_cgroup) { 3865 /* process cftsets of each subsystem */
3650 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3651 return err;
3652 }
3653
3654 for_each_subsys(cgrp->root, ss) { 3866 for_each_subsys(cgrp->root, ss) {
3655 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3867 struct cftype_set *set;
3656 return err; 3868
3869 list_for_each_entry(set, &ss->cftsets, node)
3870 cgroup_addrm_files(cgrp, ss, set->cfts, true);
3657 } 3871 }
3872
3658 /* This cgroup is ready now */ 3873 /* This cgroup is ready now */
3659 for_each_subsys(cgrp->root, ss) { 3874 for_each_subsys(cgrp->root, ss) {
3660 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3875 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3885,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3670 return 0; 3885 return 0;
3671} 3886}
3672 3887
3888static void css_dput_fn(struct work_struct *work)
3889{
3890 struct cgroup_subsys_state *css =
3891 container_of(work, struct cgroup_subsys_state, dput_work);
3892
3893 dput(css->cgroup->dentry);
3894}
3895
3673static void init_cgroup_css(struct cgroup_subsys_state *css, 3896static void init_cgroup_css(struct cgroup_subsys_state *css,
3674 struct cgroup_subsys *ss, 3897 struct cgroup_subsys *ss,
3675 struct cgroup *cgrp) 3898 struct cgroup *cgrp)
@@ -3682,6 +3905,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3682 set_bit(CSS_ROOT, &css->flags); 3905 set_bit(CSS_ROOT, &css->flags);
3683 BUG_ON(cgrp->subsys[ss->subsys_id]); 3906 BUG_ON(cgrp->subsys[ss->subsys_id]);
3684 cgrp->subsys[ss->subsys_id] = css; 3907 cgrp->subsys[ss->subsys_id] = css;
3908
3909 /*
3910 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
3911 * which is put on the last css_put(). dput() requires process
3912 * context, which css_put() may be called without. @css->dput_work
3913 * will be used to invoke dput() asynchronously from css_put().
3914 */
3915 INIT_WORK(&css->dput_work, css_dput_fn);
3916 if (ss->__DEPRECATED_clear_css_refs)
3917 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3685} 3918}
3686 3919
3687static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3920static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4017,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3784 if (err < 0) 4017 if (err < 0)
3785 goto err_remove; 4018 goto err_remove;
3786 4019
4020 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4021 for_each_subsys(root, ss)
4022 if (!ss->__DEPRECATED_clear_css_refs)
4023 dget(dentry);
4024
3787 /* The cgroup directory was pre-locked for us */ 4025 /* The cgroup directory was pre-locked for us */
3788 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4026 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3789 4027
4028 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4029
3790 err = cgroup_populate_dir(cgrp); 4030 err = cgroup_populate_dir(cgrp);
3791 /* If err < 0, we have a half-filled directory - oh well ;) */ 4031 /* If err < 0, we have a half-filled directory - oh well ;) */
3792 4032
@@ -3826,18 +4066,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3826 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4066 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3827} 4067}
3828 4068
4069/*
4070 * Check the reference count on each subsystem. Since we already
4071 * established that there are no tasks in the cgroup, if the css refcount
4072 * is also 1, then there should be no outstanding references, so the
4073 * subsystem is safe to destroy. We scan across all subsystems rather than
4074 * using the per-hierarchy linked list of mounted subsystems since we can
4075 * be called via check_for_release() with no synchronization other than
4076 * RCU, and the subsystem linked list isn't RCU-safe.
4077 */
3829static int cgroup_has_css_refs(struct cgroup *cgrp) 4078static int cgroup_has_css_refs(struct cgroup *cgrp)
3830{ 4079{
3831 /* Check the reference count on each subsystem. Since we
3832 * already established that there are no tasks in the
3833 * cgroup, if the css refcount is also 1, then there should
3834 * be no outstanding references, so the subsystem is safe to
3835 * destroy. We scan across all subsystems rather than using
3836 * the per-hierarchy linked list of mounted subsystems since
3837 * we can be called via check_for_release() with no
3838 * synchronization other than RCU, and the subsystem linked
3839 * list isn't RCU-safe */
3840 int i; 4080 int i;
4081
3841 /* 4082 /*
3842 * We won't need to lock the subsys array, because the subsystems 4083 * We won't need to lock the subsys array, because the subsystems
3843 * we're concerned about aren't going anywhere since our cgroup root 4084 * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4087,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4087 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3847 struct cgroup_subsys *ss = subsys[i]; 4088 struct cgroup_subsys *ss = subsys[i];
3848 struct cgroup_subsys_state *css; 4089 struct cgroup_subsys_state *css;
4090
3849 /* Skip subsystems not present or not in this hierarchy */ 4091 /* Skip subsystems not present or not in this hierarchy */
3850 if (ss == NULL || ss->root != cgrp->root) 4092 if (ss == NULL || ss->root != cgrp->root)
3851 continue; 4093 continue;
4094
3852 css = cgrp->subsys[ss->subsys_id]; 4095 css = cgrp->subsys[ss->subsys_id];
3853 /* When called from check_for_release() it's possible 4096 /*
4097 * When called from check_for_release() it's possible
3854 * that by this point the cgroup has been removed 4098 * that by this point the cgroup has been removed
3855 * and the css deleted. But a false-positive doesn't 4099 * and the css deleted. But a false-positive doesn't
3856 * matter, since it can only happen if the cgroup 4100 * matter, since it can only happen if the cgroup
3857 * has been deleted and hence no longer needs the 4101 * has been deleted and hence no longer needs the
3858 * release agent to be called anyway. */ 4102 * release agent to be called anyway.
3859 if (css && (atomic_read(&css->refcnt) > 1)) 4103 */
4104 if (css && css_refcnt(css) > 1)
3860 return 1; 4105 return 1;
3861 } 4106 }
3862 return 0; 4107 return 0;
@@ -3866,51 +4111,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3866 * Atomically mark all (or else none) of the cgroup's CSS objects as 4111 * Atomically mark all (or else none) of the cgroup's CSS objects as
3867 * CSS_REMOVED. Return true on success, or false if the cgroup has 4112 * CSS_REMOVED. Return true on success, or false if the cgroup has
3868 * busy subsystems. Call with cgroup_mutex held 4113 * busy subsystems. Call with cgroup_mutex held
4114 *
4115 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4116 * not, cgroup removal behaves differently.
4117 *
4118 * If clear is set, css refcnt for the subsystem should be zero before
4119 * cgroup removal can be committed. This is implemented by
4120 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4121 * called multiple times until all css refcnts reach zero and is allowed to
4122 * veto removal on any invocation. This behavior is deprecated and will be
4123 * removed as soon as the existing user (memcg) is updated.
4124 *
4125 * If clear is not set, each css holds an extra reference to the cgroup's
4126 * dentry and cgroup removal proceeds regardless of css refs.
4127 * ->pre_destroy() will be called at least once and is not allowed to fail.
4128 * On the last put of each css, whenever that may be, the extra dentry ref
4129 * is put so that dentry destruction happens only after all css's are
4130 * released.
3869 */ 4131 */
3870
3871static int cgroup_clear_css_refs(struct cgroup *cgrp) 4132static int cgroup_clear_css_refs(struct cgroup *cgrp)
3872{ 4133{
3873 struct cgroup_subsys *ss; 4134 struct cgroup_subsys *ss;
3874 unsigned long flags; 4135 unsigned long flags;
3875 bool failed = false; 4136 bool failed = false;
4137
3876 local_irq_save(flags); 4138 local_irq_save(flags);
4139
4140 /*
4141 * Block new css_tryget() by deactivating refcnt. If all refcnts
4142 * for subsystems w/ clear_css_refs set were 1 at the moment of
4143 * deactivation, we succeeded.
4144 */
3877 for_each_subsys(cgrp->root, ss) { 4145 for_each_subsys(cgrp->root, ss) {
3878 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4146 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3879 int refcnt; 4147
3880 while (1) { 4148 WARN_ON(atomic_read(&css->refcnt) < 0);
3881 /* We can only remove a CSS with a refcnt==1 */ 4149 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
3882 refcnt = atomic_read(&css->refcnt); 4150
3883 if (refcnt > 1) { 4151 if (ss->__DEPRECATED_clear_css_refs)
3884 failed = true; 4152 failed |= css_refcnt(css) != 1;
3885 goto done;
3886 }
3887 BUG_ON(!refcnt);
3888 /*
3889 * Drop the refcnt to 0 while we check other
3890 * subsystems. This will cause any racing
3891 * css_tryget() to spin until we set the
3892 * CSS_REMOVED bits or abort
3893 */
3894 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3895 break;
3896 cpu_relax();
3897 }
3898 } 4153 }
3899 done: 4154
4155 /*
4156 * If succeeded, set REMOVED and put all the base refs; otherwise,
4157 * restore refcnts to positive values. Either way, all in-progress
4158 * css_tryget() will be released.
4159 */
3900 for_each_subsys(cgrp->root, ss) { 4160 for_each_subsys(cgrp->root, ss) {
3901 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4161 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3902 if (failed) { 4162
3903 /* 4163 if (!failed) {
3904 * Restore old refcnt if we previously managed
3905 * to clear it from 1 to 0
3906 */
3907 if (!atomic_read(&css->refcnt))
3908 atomic_set(&css->refcnt, 1);
3909 } else {
3910 /* Commit the fact that the CSS is removed */
3911 set_bit(CSS_REMOVED, &css->flags); 4164 set_bit(CSS_REMOVED, &css->flags);
4165 css_put(css);
4166 } else {
4167 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
3912 } 4168 }
3913 } 4169 }
4170
3914 local_irq_restore(flags); 4171 local_irq_restore(flags);
3915 return !failed; 4172 return !failed;
3916} 4173}
@@ -3995,6 +4252,8 @@ again:
3995 list_del_init(&cgrp->sibling); 4252 list_del_init(&cgrp->sibling);
3996 cgroup_unlock_hierarchy(cgrp->root); 4253 cgroup_unlock_hierarchy(cgrp->root);
3997 4254
4255 list_del_init(&cgrp->allcg_node);
4256
3998 d = dget(cgrp->dentry); 4257 d = dget(cgrp->dentry);
3999 4258
4000 cgroup_d_remove_dir(d); 4259 cgroup_d_remove_dir(d);
@@ -4021,12 +4280,29 @@ again:
4021 return 0; 4280 return 0;
4022} 4281}
4023 4282
4283static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4284{
4285 INIT_LIST_HEAD(&ss->cftsets);
4286
4287 /*
4288 * base_cftset is embedded in subsys itself, no need to worry about
4289 * deregistration.
4290 */
4291 if (ss->base_cftypes) {
4292 ss->base_cftset.cfts = ss->base_cftypes;
4293 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4294 }
4295}
4296
4024static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4297static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4025{ 4298{
4026 struct cgroup_subsys_state *css; 4299 struct cgroup_subsys_state *css;
4027 4300
4028 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4301 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4029 4302
4303 /* init base cftset */
4304 cgroup_init_cftsets(ss);
4305
4030 /* Create the top cgroup state for this subsystem */ 4306 /* Create the top cgroup state for this subsystem */
4031 list_add(&ss->sibling, &rootnode.subsys_list); 4307 list_add(&ss->sibling, &rootnode.subsys_list);
4032 ss->root = &rootnode; 4308 ss->root = &rootnode;
@@ -4096,6 +4372,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4096 return 0; 4372 return 0;
4097 } 4373 }
4098 4374
4375 /* init base cftset */
4376 cgroup_init_cftsets(ss);
4377
4099 /* 4378 /*
4100 * need to register a subsys id before anything else - for example, 4379 * need to register a subsys id before anything else - for example,
4101 * init_cgroup_css needs it. 4380 * init_cgroup_css needs it.
@@ -4685,21 +4964,41 @@ static void check_for_release(struct cgroup *cgrp)
4685} 4964}
4686 4965
4687/* Caller must verify that the css is not for root cgroup */ 4966/* Caller must verify that the css is not for root cgroup */
4688void __css_put(struct cgroup_subsys_state *css, int count) 4967bool __css_tryget(struct cgroup_subsys_state *css)
4968{
4969 do {
4970 int v = css_refcnt(css);
4971
4972 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
4973 return true;
4974 cpu_relax();
4975 } while (!test_bit(CSS_REMOVED, &css->flags));
4976
4977 return false;
4978}
4979EXPORT_SYMBOL_GPL(__css_tryget);
4980
4981/* Caller must verify that the css is not for root cgroup */
4982void __css_put(struct cgroup_subsys_state *css)
4689{ 4983{
4690 struct cgroup *cgrp = css->cgroup; 4984 struct cgroup *cgrp = css->cgroup;
4691 int val; 4985
4692 rcu_read_lock(); 4986 rcu_read_lock();
4693 val = atomic_sub_return(count, &css->refcnt); 4987 atomic_dec(&css->refcnt);
4694 if (val == 1) { 4988 switch (css_refcnt(css)) {
4989 case 1:
4695 if (notify_on_release(cgrp)) { 4990 if (notify_on_release(cgrp)) {
4696 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4991 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4697 check_for_release(cgrp); 4992 check_for_release(cgrp);
4698 } 4993 }
4699 cgroup_wakeup_rmdir_waiter(cgrp); 4994 cgroup_wakeup_rmdir_waiter(cgrp);
4995 break;
4996 case 0:
4997 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
4998 schedule_work(&css->dput_work);
4999 break;
4700 } 5000 }
4701 rcu_read_unlock(); 5001 rcu_read_unlock();
4702 WARN_ON_ONCE(val < 1);
4703} 5002}
4704EXPORT_SYMBOL_GPL(__css_put); 5003EXPORT_SYMBOL_GPL(__css_put);
4705 5004
@@ -4818,7 +5117,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4818 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5117 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4819 * it's unchanged until freed. 5118 * it's unchanged until freed.
4820 */ 5119 */
4821 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5120 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4822 5121
4823 if (cssid) 5122 if (cssid)
4824 return cssid->id; 5123 return cssid->id;
@@ -4830,7 +5129,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4830{ 5129{
4831 struct css_id *cssid; 5130 struct css_id *cssid;
4832 5131
4833 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5132 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4834 5133
4835 if (cssid) 5134 if (cssid)
4836 return cssid->depth; 5135 return cssid->depth;
@@ -4844,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth);
4844 * @root: the css supporsed to be an ancestor of the child. 5143 * @root: the css supporsed to be an ancestor of the child.
4845 * 5144 *
4846 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 5145 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4847 * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). 5146 * this function reads css->id, the caller must hold rcu_read_lock().
4848 * But, considering usual usage, the csses should be valid objects after test. 5147 * But, considering usual usage, the csses should be valid objects after test.
4849 * Assuming that the caller will do some action to the child if this returns 5148 * Assuming that the caller will do some action to the child if this returns
4850 * returns true, the caller must take "child";s reference count. 5149 * returns true, the caller must take "child";s reference count.
@@ -4856,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4856{ 5155{
4857 struct css_id *child_id; 5156 struct css_id *child_id;
4858 struct css_id *root_id; 5157 struct css_id *root_id;
4859 bool ret = true;
4860 5158
4861 rcu_read_lock();
4862 child_id = rcu_dereference(child->id); 5159 child_id = rcu_dereference(child->id);
5160 if (!child_id)
5161 return false;
4863 root_id = rcu_dereference(root->id); 5162 root_id = rcu_dereference(root->id);
4864 if (!child_id 5163 if (!root_id)
4865 || !root_id 5164 return false;
4866 || (child_id->depth < root_id->depth) 5165 if (child_id->depth < root_id->depth)
4867 || (child_id->stack[root_id->depth] != root_id->id)) 5166 return false;
4868 ret = false; 5167 if (child_id->stack[root_id->depth] != root_id->id)
4869 rcu_read_unlock(); 5168 return false;
4870 return ret; 5169 return true;
4871} 5170}
4872 5171
4873void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5172void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
@@ -5211,19 +5510,15 @@ static struct cftype debug_files[] = {
5211 .name = "releasable", 5510 .name = "releasable",
5212 .read_u64 = releasable_read, 5511 .read_u64 = releasable_read,
5213 }, 5512 },
5214};
5215 5513
5216static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5514 { } /* terminate */
5217{ 5515};
5218 return cgroup_add_files(cont, ss, debug_files,
5219 ARRAY_SIZE(debug_files));
5220}
5221 5516
5222struct cgroup_subsys debug_subsys = { 5517struct cgroup_subsys debug_subsys = {
5223 .name = "debug", 5518 .name = "debug",
5224 .create = debug_create, 5519 .create = debug_create,
5225 .destroy = debug_destroy, 5520 .destroy = debug_destroy,
5226 .populate = debug_populate,
5227 .subsys_id = debug_subsys_id, 5521 .subsys_id = debug_subsys_id,
5522 .base_cftypes = debug_files,
5228}; 5523};
5229#endif /* CONFIG_CGROUP_DEBUG */ 5524#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
 static struct cftype files[] = {
 	{
 		.name = "state",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = freezer_read,
 		.write_string = freezer_write,
 	},
+	{ }	/* terminate */
 };
 
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
-	if (!cgroup->parent)
-		return 0;
-	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
 struct cgroup_subsys freezer_subsys = {
 	.name		= "freezer",
 	.create		= freezer_create,
 	.destroy	= freezer_destroy,
-	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
 	.fork		= freezer_fork,
+	.base_cftypes	= files,
 };
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..c28a306ae05c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
372 372
373#ifdef __ARCH_WANT_SYS_SIGPROCMASK 373#ifdef __ARCH_WANT_SYS_SIGPROCMASK
374 374
375asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 375/*
376 compat_old_sigset_t __user *oset) 376 * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
377 * blocked set of signals to the supplied signal set
378 */
379static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
377{ 380{
378 old_sigset_t s; 381 memcpy(blocked->sig, &set, sizeof(set));
379 long ret; 382}
380 mm_segment_t old_fs;
381 383
382 if (set && get_user(s, set)) 384asmlinkage long compat_sys_sigprocmask(int how,
383 return -EFAULT; 385 compat_old_sigset_t __user *nset,
384 old_fs = get_fs(); 386 compat_old_sigset_t __user *oset)
385 set_fs(KERNEL_DS); 387{
386 ret = sys_sigprocmask(how, 388 old_sigset_t old_set, new_set;
387 set ? (old_sigset_t __user *) &s : NULL, 389 sigset_t new_blocked;
388 oset ? (old_sigset_t __user *) &s : NULL); 390
389 set_fs(old_fs); 391 old_set = current->blocked.sig[0];
390 if (ret == 0) 392
391 if (oset) 393 if (nset) {
392 ret = put_user(s, oset); 394 if (get_user(new_set, nset))
393 return ret; 395 return -EFAULT;
396 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
397
398 new_blocked = current->blocked;
399
400 switch (how) {
401 case SIG_BLOCK:
402 sigaddsetmask(&new_blocked, new_set);
403 break;
404 case SIG_UNBLOCK:
405 sigdelsetmask(&new_blocked, new_set);
406 break;
407 case SIG_SETMASK:
408 compat_sig_setmask(&new_blocked, new_set);
409 break;
410 default:
411 return -EINVAL;
412 }
413
414 set_current_blocked(&new_blocked);
415 }
416
417 if (oset) {
418 if (put_user(old_set, oset))
419 return -EFAULT;
420 }
421
422 return 0;
394} 423}
395 424
396#endif 425#endif
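From userspace the rewritten compat path is invisible, but the masking rule it keeps (the new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)) line) is easy to demonstrate: attempts to block SIGKILL or SIGSTOP are silently dropped before the mask is installed. A small illustrative test using the libc sigprocmask() wrapper:

    #include <signal.h>
    #include <stdio.h>

    int main(void)
    {
            sigset_t set, old;

            sigemptyset(&set);
            sigaddset(&set, SIGTERM);
            sigaddset(&set, SIGKILL);       /* silently ignored by the kernel */

            sigprocmask(SIG_SETMASK, &set, NULL);
            sigprocmask(SIG_SETMASK, NULL, &old);   /* read back the mask */

            printf("SIGTERM blocked: %d\n", sigismember(&old, SIGTERM)); /* 1 */
            printf("SIGKILL blocked: %d\n", sigismember(&old, SIGKILL)); /* 0 */
            return 0;
    }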
@@ -1044,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
1044 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) 1073 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1045 return -EFAULT; 1074 return -EFAULT;
1046 sigset_from_compat(&newset, &newset32); 1075 sigset_from_compat(&newset, &newset32);
1047 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1076 return sigsuspend(&newset);
1048
1049 current->saved_sigmask = current->blocked;
1050 set_current_blocked(&newset);
1051
1052 current->state = TASK_INTERRUPTIBLE;
1053 schedule();
1054 set_restore_sigmask();
1055 return -ERESTARTNOHAND;
1056} 1077}
1057#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 1078#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1058 1079
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..a4eb5227a19e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,13 +10,18 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/unistd.h> 11#include <linux/unistd.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/oom.h>
14#include <linux/rcupdate.h>
13#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/bug.h>
14#include <linux/kthread.h> 17#include <linux/kthread.h>
15#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
16#include <linux/mutex.h> 19#include <linux/mutex.h>
17#include <linux/gfp.h> 20#include <linux/gfp.h>
18#include <linux/suspend.h> 21#include <linux/suspend.h>
19 22
23#include "smpboot.h"
24
20#ifdef CONFIG_SMP 25#ifdef CONFIG_SMP
21/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 26/* Serializes the updates to cpu_online_mask, cpu_present_mask */
22static DEFINE_MUTEX(cpu_add_remove_lock); 27static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -171,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
171} 176}
172EXPORT_SYMBOL(unregister_cpu_notifier); 177EXPORT_SYMBOL(unregister_cpu_notifier);
173 178
179/**
180 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
181 * @cpu: a CPU id
182 *
183 * This function walks all processes, finds a valid mm struct for each one and
184 * then clears a corresponding bit in mm's cpumask. While this all sounds
185 * trivial, there are various non-obvious corner cases, which this function
186 * tries to solve in a safe manner.
187 *
188 * Also note that the function uses a somewhat relaxed locking scheme, so it may
189 * be called only for an already offlined CPU.
190 */
191void clear_tasks_mm_cpumask(int cpu)
192{
193 struct task_struct *p;
194
195 /*
196 * This function is called after the cpu is taken down and marked
197 * offline, so its not like new tasks will ever get this cpu set in
198 * their mm mask. -- Peter Zijlstra
199 * Thus, we may use rcu_read_lock() here, instead of grabbing
200 * full-fledged tasklist_lock.
201 */
202 WARN_ON(cpu_online(cpu));
203 rcu_read_lock();
204 for_each_process(p) {
205 struct task_struct *t;
206
207 /*
208 * Main thread might exit, but other threads may still have
209 * a valid mm. Find one.
210 */
211 t = find_lock_task_mm(p);
212 if (!t)
213 continue;
214 cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
215 task_unlock(t);
216 }
217 rcu_read_unlock();
218}
219
174static inline void check_for_tasks(int cpu) 220static inline void check_for_tasks(int cpu)
175{ 221{
176 struct task_struct *p; 222 struct task_struct *p;
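clear_tasks_mm_cpumask() is intended to replace the open-coded "walk every task and clear the dying CPU from mm_cpumask" loops that several architectures carry in their hotplug code. A hedged sketch of how an architecture's teardown path might use it; the surrounding function is illustrative, only the helper and its already-offline precondition come from this hunk.

    #include <linux/cpu.h>
    #include <linux/cpumask.h>

    /* Illustrative arch-side step, assuming the CPU has already been marked
     * offline (the helper WARNs otherwise). */
    static void example_arch_cpu_die_prepare(unsigned int cpu)
    {
            /* Stop considering this CPU for mm flushes / TLB shootdowns. */
            clear_tasks_mm_cpumask(cpu);

            /* ... arch-specific cache flushing, interrupt migration, etc ... */
    }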
@@ -295,11 +341,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
295 int ret, nr_calls = 0; 341 int ret, nr_calls = 0;
296 void *hcpu = (void *)(long)cpu; 342 void *hcpu = (void *)(long)cpu;
297 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 343 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
344 struct task_struct *idle;
298 345
299 if (cpu_online(cpu) || !cpu_present(cpu)) 346 if (cpu_online(cpu) || !cpu_present(cpu))
300 return -EINVAL; 347 return -EINVAL;
301 348
302 cpu_hotplug_begin(); 349 cpu_hotplug_begin();
350
351 idle = idle_thread_get(cpu);
352 if (IS_ERR(idle)) {
353 ret = PTR_ERR(idle);
354 goto out;
355 }
356
303 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 357 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
304 if (ret) { 358 if (ret) {
305 nr_calls--; 359 nr_calls--;
@@ -309,7 +363,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
309 } 363 }
310 364
311 /* Arch-specific enabling code. */ 365 /* Arch-specific enabling code. */
312 ret = __cpu_up(cpu); 366 ret = __cpu_up(cpu, idle);
313 if (ret != 0) 367 if (ret != 0)
314 goto out_notify; 368 goto out_notify;
315 BUG_ON(!cpu_online(cpu)); 369 BUG_ON(!cpu_online(cpu));
@@ -320,6 +374,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
320out_notify: 374out_notify:
321 if (ret != 0) 375 if (ret != 0)
322 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 376 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
377out:
323 cpu_hotplug_done(); 378 cpu_hotplug_done();
324 379
325 return ret; 380 return ret;
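With this change the generic hotplug core obtains the idle thread through the new smpboot helpers and hands it to the architecture, so arch code no longer forks its own. A hypothetical arch-side __cpu_up() under the new prototype; everything below except the two-argument signature is an assumption about what a port might do, and the per-cpu slot and boot_secondary() hook are illustrative names.

    #include <linux/percpu.h>
    #include <linux/sched.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(struct task_struct *, cpu_idle_task); /* illustrative */

    /* Hypothetical port: the core now passes in the idle task. */
    int __cpu_up(unsigned int cpu, struct task_struct *idle)
    {
            /* Record the idle task the secondary CPU should run, then kick
             * the CPU and wait for it to come online. */
            per_cpu(cpu_idle_task, cpu) = idle;

            return boot_secondary(cpu, idle);   /* illustrative arch hook */
    }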
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 249152e15308..9656a3c36503 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb)
81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); 81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
82 82
83/** 83/**
84 * cpm_pm_enter - CPU low power entry notifier 84 * cpu_pm_enter - CPU low power entry notifier
85 * 85 *
86 * Notifies listeners that a single CPU is entering a low power state that may 86 * Notifies listeners that a single CPU is entering a low power state that may
87 * cause some blocks in the same power domain as the cpu to reset. 87 * cause some blocks in the same power domain as the cpu to reset.
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
89 * Must be called on the affected CPU with interrupts disabled. Platform is 89 * Must be called on the affected CPU with interrupts disabled. Platform is
90 * responsible for ensuring that cpu_pm_enter is not called twice on the same 90 * responsible for ensuring that cpu_pm_enter is not called twice on the same
91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP 91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP
92 * co-processor, interrupt controller and it's PM extensions, local CPU 92 * co-processor, interrupt controller and its PM extensions, local CPU
93 * timers context save/restore which shouldn't be interrupted. Hence it 93 * timers context save/restore which shouldn't be interrupted. Hence it
94 * must be called with interrupts disabled. 94 * must be called with interrupts disabled.
95 * 95 *
@@ -115,13 +115,13 @@ int cpu_pm_enter(void)
115EXPORT_SYMBOL_GPL(cpu_pm_enter); 115EXPORT_SYMBOL_GPL(cpu_pm_enter);
116 116
117/** 117/**
118 * cpm_pm_exit - CPU low power exit notifier 118 * cpu_pm_exit - CPU low power exit notifier
119 * 119 *
120 * Notifies listeners that a single CPU is exiting a low power state that may 120 * Notifies listeners that a single CPU is exiting a low power state that may
121 * have caused some blocks in the same power domain as the cpu to reset. 121 * have caused some blocks in the same power domain as the cpu to reset.
122 * 122 *
123 * Notified drivers can include VFP co-processor, interrupt controller 123 * Notified drivers can include VFP co-processor, interrupt controller
124 * and it's PM extensions, local CPU timers context save/restore which 124 * and its PM extensions, local CPU timers context save/restore which
125 * shouldn't be interrupted. Hence it must be called with interrupts disabled. 125 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
126 * 126 *
127 * Return conditions are same as __raw_notifier_call_chain. 127 * Return conditions are same as __raw_notifier_call_chain.
@@ -139,7 +139,7 @@ int cpu_pm_exit(void)
139EXPORT_SYMBOL_GPL(cpu_pm_exit); 139EXPORT_SYMBOL_GPL(cpu_pm_exit);
140 140
141/** 141/**
142 * cpm_cluster_pm_enter - CPU cluster low power entry notifier 142 * cpu_cluster_pm_enter - CPU cluster low power entry notifier
143 * 143 *
144 * Notifies listeners that all cpus in a power domain are entering a low power 144 * Notifies listeners that all cpus in a power domain are entering a low power
145 * state that may cause some blocks in the same power domain to reset. 145 * state that may cause some blocks in the same power domain to reset.
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
147 * Must be called after cpu_pm_enter has been called on all cpus in the power 147 * Must be called after cpu_pm_enter has been called on all cpus in the power
148 * domain, and before cpu_pm_exit has been called on any cpu in the power 148 * domain, and before cpu_pm_exit has been called on any cpu in the power
149 * domain. Notified drivers can include VFP co-processor, interrupt controller 149 * domain. Notified drivers can include VFP co-processor, interrupt controller
150 * and it's PM extensions, local CPU timers context save/restore which 150 * and its PM extensions, local CPU timers context save/restore which
151 * shouldn't be interrupted. Hence it must be called with interrupts disabled. 151 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
152 * 152 *
153 * Must be called with interrupts disabled. 153 * Must be called with interrupts disabled.
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void)
174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); 174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
175 175
176/** 176/**
177 * cpm_cluster_pm_exit - CPU cluster low power exit notifier 177 * cpu_cluster_pm_exit - CPU cluster low power exit notifier
178 * 178 *
179 * Notifies listeners that all cpus in a power domain are exiting from a 179 * Notifies listeners that all cpus in a power domain are exiting from a
180 * low power state that may have caused some blocks in the same power domain 180 * low power state that may have caused some blocks in the same power domain
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
183 * Must be called after cpu_pm_exit has been called on all cpus in the power 183 * Must be called after cpu_pm_exit has been called on all cpus in the power
184 * domain, and before cpu_pm_exit has been called on any cpu in the power 184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller 185 * domain. Notified drivers can include VFP co-processor, interrupt controller
186 * and it's PM extensions, local CPU timers context save/restore which 186 * and its PM extensions, local CPU timers context save/restore which
187 * shouldn't be interrupted. Hence it must be called with interrupts disabled. 187 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
188 * 188 *
189 * Return conditions are same as __raw_notifier_call_chain. 189 * Return conditions are same as __raw_notifier_call_chain.
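
For orientation, the entry points whose kerneldoc is corrected above are consumed through the notifier API. A hedged sketch of a platform driver hooking cpu_pm events (names prefixed example_ are illustrative, not from this patch):

#include <linux/cpu_pm.h>
#include <linux/notifier.h>
#include <linux/init.h>

/* Illustrative callback: save/restore per-CPU state around low-power entry. */
static int example_cpu_pm_notify(struct notifier_block *nb,
                                 unsigned long action, void *data)
{
        switch (action) {
        case CPU_PM_ENTER:
                /* save context the power domain may lose (VFP, GIC, timers) */
                break;
        case CPU_PM_ENTER_FAILED:
        case CPU_PM_EXIT:
                /* restore context after the (attempted) low-power state */
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
        .notifier_call = example_cpu_pm_notify,
};

static int __init example_cpu_pm_init(void)
{
        return cpu_pm_register_notifier(&example_cpu_pm_nb);
}
arch_initcall(example_cpu_pm_init);
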
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070b4ba2..8c8bd652dd12 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
1765 .write_u64 = cpuset_write_u64, 1765 .write_u64 = cpuset_write_u64,
1766 .private = FILE_SPREAD_SLAB, 1766 .private = FILE_SPREAD_SLAB,
1767 }, 1767 },
1768};
1769
1770static struct cftype cft_memory_pressure_enabled = {
1771 .name = "memory_pressure_enabled",
1772 .read_u64 = cpuset_read_u64,
1773 .write_u64 = cpuset_write_u64,
1774 .private = FILE_MEMORY_PRESSURE_ENABLED,
1775};
1776 1768
1777static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) 1769 {
1778{ 1770 .name = "memory_pressure_enabled",
1779 int err; 1771 .flags = CFTYPE_ONLY_ON_ROOT,
1772 .read_u64 = cpuset_read_u64,
1773 .write_u64 = cpuset_write_u64,
1774 .private = FILE_MEMORY_PRESSURE_ENABLED,
1775 },
1780 1776
1781 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 1777 { } /* terminate */
1782 if (err) 1778};
1783 return err;
1784 /* memory_pressure_enabled is in root cpuset only */
1785 if (!cont->parent)
1786 err = cgroup_add_file(cont, ss,
1787 &cft_memory_pressure_enabled);
1788 return err;
1789}
1790 1779
1791/* 1780/*
1792 * post_clone() is called during cgroup_create() when the 1781 * post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
1887 .destroy = cpuset_destroy, 1876 .destroy = cpuset_destroy,
1888 .can_attach = cpuset_can_attach, 1877 .can_attach = cpuset_can_attach,
1889 .attach = cpuset_attach, 1878 .attach = cpuset_attach,
1890 .populate = cpuset_populate,
1891 .post_clone = cpuset_post_clone, 1879 .post_clone = cpuset_post_clone,
1892 .subsys_id = cpuset_subsys_id, 1880 .subsys_id = cpuset_subsys_id,
1881 .base_cftypes = files,
1893 .early_init = 1, 1882 .early_init = 1,
1894}; 1883};
1895 1884
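
The cpuset conversion above drops the ->populate() callback: the control files, including the root-only memory_pressure_enabled entry, now live in one NULL-terminated cftype array referenced from ->base_cftypes. A condensed sketch of that shape for a hypothetical controller (it omits the subsys_id and the attach/create/destroy callbacks a real subsystem needs):

#include <linux/cgroup.h>

static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
        return 0;       /* stub */
}

static int example_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
        return 0;       /* stub */
}

static struct cftype example_files[] = {
        {
                .name = "knob",
                .read_u64 = example_read_u64,
                .write_u64 = example_write_u64,
        },
        {
                .name = "root_only_knob",
                .flags = CFTYPE_ONLY_ON_ROOT,   /* created only in the root cgroup */
                .read_u64 = example_read_u64,
                .write_u64 = example_write_u64,
        },
        { }     /* terminate */
};

struct cgroup_subsys example_subsys = {
        .name = "example",
        .base_cftypes = example_files,  /* replaces the old ->populate() hook */
};
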
diff --git a/kernel/cred.c b/kernel/cred.c
index e70683d9ec32..de728ac50d82 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -49,6 +49,14 @@ struct cred init_cred = {
49 .subscribers = ATOMIC_INIT(2), 49 .subscribers = ATOMIC_INIT(2),
50 .magic = CRED_MAGIC, 50 .magic = CRED_MAGIC,
51#endif 51#endif
52 .uid = GLOBAL_ROOT_UID,
53 .gid = GLOBAL_ROOT_GID,
54 .suid = GLOBAL_ROOT_UID,
55 .sgid = GLOBAL_ROOT_GID,
56 .euid = GLOBAL_ROOT_UID,
57 .egid = GLOBAL_ROOT_GID,
58 .fsuid = GLOBAL_ROOT_UID,
59 .fsgid = GLOBAL_ROOT_GID,
52 .securebits = SECUREBITS_DEFAULT, 60 .securebits = SECUREBITS_DEFAULT,
53 .cap_inheritable = CAP_EMPTY_SET, 61 .cap_inheritable = CAP_EMPTY_SET,
54 .cap_permitted = CAP_FULL_SET, 62 .cap_permitted = CAP_FULL_SET,
@@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
148 if (cred->group_info) 156 if (cred->group_info)
149 put_group_info(cred->group_info); 157 put_group_info(cred->group_info);
150 free_uid(cred->user); 158 free_uid(cred->user);
159 put_user_ns(cred->user_ns);
151 kmem_cache_free(cred_jar, cred); 160 kmem_cache_free(cred_jar, cred);
152} 161}
153 162
@@ -198,13 +207,6 @@ void exit_creds(struct task_struct *tsk)
198 validate_creds(cred); 207 validate_creds(cred);
199 alter_cred_subscribers(cred, -1); 208 alter_cred_subscribers(cred, -1);
200 put_cred(cred); 209 put_cred(cred);
201
202 cred = (struct cred *) tsk->replacement_session_keyring;
203 if (cred) {
204 tsk->replacement_session_keyring = NULL;
205 validate_creds(cred);
206 put_cred(cred);
207 }
208} 210}
209 211
210/** 212/**
@@ -303,6 +305,7 @@ struct cred *prepare_creds(void)
303 set_cred_subscribers(new, 0); 305 set_cred_subscribers(new, 0);
304 get_group_info(new->group_info); 306 get_group_info(new->group_info);
305 get_uid(new->user); 307 get_uid(new->user);
308 get_user_ns(new->user_ns);
306 309
307#ifdef CONFIG_KEYS 310#ifdef CONFIG_KEYS
308 key_get(new->thread_keyring); 311 key_get(new->thread_keyring);
@@ -386,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
386 struct cred *new; 389 struct cred *new;
387 int ret; 390 int ret;
388 391
389 p->replacement_session_keyring = NULL;
390
391 if ( 392 if (
392#ifdef CONFIG_KEYS 393#ifdef CONFIG_KEYS
393 !p->cred->thread_keyring && 394 !p->cred->thread_keyring &&
@@ -414,11 +415,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
414 goto error_put; 415 goto error_put;
415 } 416 }
416 417
417 /* cache user_ns in cred. Doesn't need a refcount because it will
418 * stay pinned by cred->user
419 */
420 new->user_ns = new->user->user_ns;
421
422#ifdef CONFIG_KEYS 418#ifdef CONFIG_KEYS
423 /* new threads get their own thread keyrings if their parent already 419 /* new threads get their own thread keyrings if their parent already
424 * had one */ 420 * had one */
@@ -493,10 +489,10 @@ int commit_creds(struct cred *new)
493 get_cred(new); /* we will require a ref for the subj creds too */ 489 get_cred(new); /* we will require a ref for the subj creds too */
494 490
495 /* dumpability changes */ 491 /* dumpability changes */
496 if (old->euid != new->euid || 492 if (!uid_eq(old->euid, new->euid) ||
497 old->egid != new->egid || 493 !gid_eq(old->egid, new->egid) ||
498 old->fsuid != new->fsuid || 494 !uid_eq(old->fsuid, new->fsuid) ||
499 old->fsgid != new->fsgid || 495 !gid_eq(old->fsgid, new->fsgid) ||
500 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 496 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
501 if (task->mm) 497 if (task->mm)
502 set_dumpable(task->mm, suid_dumpable); 498 set_dumpable(task->mm, suid_dumpable);
@@ -505,9 +501,9 @@ int commit_creds(struct cred *new)
505 } 501 }
506 502
507 /* alter the thread keyring */ 503 /* alter the thread keyring */
508 if (new->fsuid != old->fsuid) 504 if (!uid_eq(new->fsuid, old->fsuid))
509 key_fsuid_changed(task); 505 key_fsuid_changed(task);
510 if (new->fsgid != old->fsgid) 506 if (!gid_eq(new->fsgid, old->fsgid))
511 key_fsgid_changed(task); 507 key_fsgid_changed(task);
512 508
513 /* do it 509 /* do it
@@ -524,16 +520,16 @@ int commit_creds(struct cred *new)
524 alter_cred_subscribers(old, -2); 520 alter_cred_subscribers(old, -2);
525 521
526 /* send notifications */ 522 /* send notifications */
527 if (new->uid != old->uid || 523 if (!uid_eq(new->uid, old->uid) ||
528 new->euid != old->euid || 524 !uid_eq(new->euid, old->euid) ||
529 new->suid != old->suid || 525 !uid_eq(new->suid, old->suid) ||
530 new->fsuid != old->fsuid) 526 !uid_eq(new->fsuid, old->fsuid))
531 proc_id_connector(task, PROC_EVENT_UID); 527 proc_id_connector(task, PROC_EVENT_UID);
532 528
533 if (new->gid != old->gid || 529 if (!gid_eq(new->gid, old->gid) ||
534 new->egid != old->egid || 530 !gid_eq(new->egid, old->egid) ||
535 new->sgid != old->sgid || 531 !gid_eq(new->sgid, old->sgid) ||
536 new->fsgid != old->fsgid) 532 !gid_eq(new->fsgid, old->fsgid))
537 proc_id_connector(task, PROC_EVENT_GID); 533 proc_id_connector(task, PROC_EVENT_GID);
538 534
539 /* release the old obj and subj refs both */ 535 /* release the old obj and subj refs both */
@@ -678,6 +674,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
678 atomic_set(&new->usage, 1); 674 atomic_set(&new->usage, 1);
679 set_cred_subscribers(new, 0); 675 set_cred_subscribers(new, 0);
680 get_uid(new->user); 676 get_uid(new->user);
677 get_user_ns(new->user_ns);
681 get_group_info(new->group_info); 678 get_group_info(new->group_info);
682 679
683#ifdef CONFIG_KEYS 680#ifdef CONFIG_KEYS
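
The cred.c changes above are part of the user-namespace conversion: uid/gid fields become kuid_t/kgid_t, are initialised from GLOBAL_ROOT_UID/GID, and are compared with uid_eq()/gid_eq() instead of ==. A small sketch of the comparison and mapping pattern (illustrative helpers, not code from this patch):

#include <linux/uidgid.h>
#include <linux/cred.h>
#include <linux/user_namespace.h>

/* After the conversion, credentials hold kuid_t and must use the helpers. */
static bool example_same_fsuid(const struct cred *a, const struct cred *b)
{
        return uid_eq(a->fsuid, b->fsuid);
}

/* A userspace uid_t is only meaningful relative to a user namespace. */
static bool example_is_ns_root(struct user_namespace *ns, uid_t uid)
{
        kuid_t kuid = make_kuid(ns, uid);

        return uid_eq(kuid, GLOBAL_ROOT_UID);
}
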
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 22d901f9caf4..103f5d147b2f 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o callchain.o 5obj-y := core.o ring_buffer.o callchain.o
6
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 7obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
8obj-$(CONFIG_UPROBES) += uprobes.o
9
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fd126f82b57c..5b06cbbf6931 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
4957 if (rctx < 0) 4957 if (rctx < 0)
4958 return; 4958 return;
4959 4959
4960 perf_sample_data_init(&data, addr); 4960 perf_sample_data_init(&data, addr, 0);
4961 4961
4962 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 4962 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
4963 4963
@@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5215 .data = record, 5215 .data = record,
5216 }; 5216 };
5217 5217
5218 perf_sample_data_init(&data, addr); 5218 perf_sample_data_init(&data, addr, 0);
5219 data.raw = &raw; 5219 data.raw = &raw;
5220 5220
5221 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5221 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
@@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5318 struct perf_sample_data sample; 5318 struct perf_sample_data sample;
5319 struct pt_regs *regs = data; 5319 struct pt_regs *regs = data;
5320 5320
5321 perf_sample_data_init(&sample, bp->attr.bp_addr); 5321 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
5322 5322
5323 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5323 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5324 perf_swevent_event(bp, 1, &sample, regs); 5324 perf_swevent_event(bp, 1, &sample, regs);
@@ -5344,13 +5344,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5344 5344
5345 event->pmu->read(event); 5345 event->pmu->read(event);
5346 5346
5347 perf_sample_data_init(&data, 0); 5347 perf_sample_data_init(&data, 0, event->hw.last_period);
5348 data.period = event->hw.last_period;
5349 regs = get_irq_regs(); 5348 regs = get_irq_regs();
5350 5349
5351 if (regs && !perf_exclude_event(event, regs)) { 5350 if (regs && !perf_exclude_event(event, regs)) {
5352 if (!(event->attr.exclude_idle && is_idle_task(current))) 5351 if (!(event->attr.exclude_idle && is_idle_task(current)))
5353 if (perf_event_overflow(event, &data, regs)) 5352 if (__perf_event_overflow(event, 1, &data, regs))
5354 ret = HRTIMER_NORESTART; 5353 ret = HRTIMER_NORESTART;
5355 } 5354 }
5356 5355
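
The perf hunks above track an API change: perf_sample_data_init() now takes the sample period as a third argument, so callers no longer assign data.period by hand. Roughly, a call site in the style of core.c converts like this (sketch only; the surrounding helpers are internal to core.c):

#include <linux/perf_event.h>

static void example_emit_sample(struct perf_event *event, u64 addr,
                                struct pt_regs *regs)
{
        struct perf_sample_data data;

        /*
         * Old form:  perf_sample_data_init(&data, addr);
         *            data.period = event->hw.last_period;
         * New form:  the period is folded into the init call.
         */
        perf_sample_data_init(&data, addr, event->hw.last_period);

        /* ... pass &data and regs to the overflow/output path as core.c does ... */
}
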
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 000000000000..985be4d80fe8
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1667 @@
1/*
2 * User-space Probes (UProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2012
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
23 */
24
25#include <linux/kernel.h>
26#include <linux/highmem.h>
27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h>
29#include <linux/sched.h>
30#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */
35
36#include <linux/uprobes.h>
37
38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
40
41static struct srcu_struct uprobes_srcu;
42static struct rb_root uprobes_tree = RB_ROOT;
43
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45
46#define UPROBES_HASH_SZ 13
47
48/* serialize (un)register */
49static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
50
51#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
52
53/* serialize uprobe->pending_list */
54static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
55#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
56
57/*
58 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
 59 * events active at this time. Probably a fine-grained per-inode count would
 60 * be better?
61 */
62static atomic_t uprobe_events = ATOMIC_INIT(0);
63
64/*
65 * Maintain a temporary per vma info that can be used to search if a vma
66 * has already been handled. This structure is introduced since extending
 67 * vm_area_struct wasn't recommended.
68 */
69struct vma_info {
70 struct list_head probe_list;
71 struct mm_struct *mm;
72 loff_t vaddr;
73};
74
75struct uprobe {
76 struct rb_node rb_node; /* node in the rb tree */
77 atomic_t ref;
78 struct rw_semaphore consumer_rwsem;
79 struct list_head pending_list;
80 struct uprobe_consumer *consumers;
81 struct inode *inode; /* Also hold a ref to inode */
82 loff_t offset;
83 int flags;
84 struct arch_uprobe arch;
85};
86
87/*
88 * valid_vma: Verify if the specified vma is an executable vma
89 * Relax restrictions while unregistering: vm_flags might have
90 * changed after breakpoint was inserted.
91 * - is_register: indicates if we are in register context.
92 * - Return 1 if the specified virtual address is in an
93 * executable vma.
94 */
95static bool valid_vma(struct vm_area_struct *vma, bool is_register)
96{
97 if (!vma->vm_file)
98 return false;
99
100 if (!is_register)
101 return true;
102
103 if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC))
104 return true;
105
106 return false;
107}
108
109static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
110{
111 loff_t vaddr;
112
113 vaddr = vma->vm_start + offset;
114 vaddr -= vma->vm_pgoff << PAGE_SHIFT;
115
116 return vaddr;
117}
118
119/**
120 * __replace_page - replace page in vma by new page.
121 * based on replace_page in mm/ksm.c
122 *
123 * @vma: vma that holds the pte pointing to page
124 * @page: the cowed page we are replacing by kpage
125 * @kpage: the modified page we replace page by
126 *
127 * Returns 0 on success, -EFAULT on failure.
128 */
129static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
130{
131 struct mm_struct *mm = vma->vm_mm;
132 pgd_t *pgd;
133 pud_t *pud;
134 pmd_t *pmd;
135 pte_t *ptep;
136 spinlock_t *ptl;
137 unsigned long addr;
138 int err = -EFAULT;
139
140 addr = page_address_in_vma(page, vma);
141 if (addr == -EFAULT)
142 goto out;
143
144 pgd = pgd_offset(mm, addr);
145 if (!pgd_present(*pgd))
146 goto out;
147
148 pud = pud_offset(pgd, addr);
149 if (!pud_present(*pud))
150 goto out;
151
152 pmd = pmd_offset(pud, addr);
153 if (!pmd_present(*pmd))
154 goto out;
155
156 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
157 if (!ptep)
158 goto out;
159
160 get_page(kpage);
161 page_add_new_anon_rmap(kpage, vma, addr);
162
163 if (!PageAnon(page)) {
164 dec_mm_counter(mm, MM_FILEPAGES);
165 inc_mm_counter(mm, MM_ANONPAGES);
166 }
167
168 flush_cache_page(vma, addr, pte_pfn(*ptep));
169 ptep_clear_flush(vma, addr, ptep);
170 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
171
172 page_remove_rmap(page);
173 if (!page_mapped(page))
174 try_to_free_swap(page);
175 put_page(page);
176 pte_unmap_unlock(ptep, ptl);
177 err = 0;
178
179out:
180 return err;
181}
182
183/**
184 * is_swbp_insn - check if instruction is breakpoint instruction.
185 * @insn: instruction to be checked.
186 * Default implementation of is_swbp_insn
187 * Returns true if @insn is a breakpoint instruction.
188 */
189bool __weak is_swbp_insn(uprobe_opcode_t *insn)
190{
191 return *insn == UPROBE_SWBP_INSN;
192}
193
194/*
195 * NOTE:
 196 * Expect the breakpoint instruction to be the smallest-size instruction for
 197 * the architecture. If an arch has variable-length instructions and the
 198 * breakpoint instruction is not the smallest-length instruction
 199 * supported by that architecture, then we need to modify read_opcode /
 200 * write_opcode accordingly. This would never be a problem for archs that
 201 * have fixed-length instructions.
202 */
203
204/*
205 * write_opcode - write the opcode at a given virtual address.
206 * @auprobe: arch breakpointing information.
207 * @mm: the probed process address space.
208 * @vaddr: the virtual address to store the opcode.
209 * @opcode: opcode to be written at @vaddr.
210 *
211 * Called with mm->mmap_sem held (for read and with a reference to
212 * mm).
213 *
214 * For mm @mm, write the opcode at @vaddr.
215 * Return 0 (success) or a negative errno.
216 */
217static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
218 unsigned long vaddr, uprobe_opcode_t opcode)
219{
220 struct page *old_page, *new_page;
221 struct address_space *mapping;
222 void *vaddr_old, *vaddr_new;
223 struct vm_area_struct *vma;
224 struct uprobe *uprobe;
225 loff_t addr;
226 int ret;
227
228 /* Read the page with vaddr into memory */
229 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
230 if (ret <= 0)
231 return ret;
232
233 ret = -EINVAL;
234
235 /*
236 * We are interested in text pages only. Our pages of interest
237 * should be mapped for read and execute only. We desist from
238 * adding probes in write mapped pages since the breakpoints
239 * might end up in the file copy.
240 */
241 if (!valid_vma(vma, is_swbp_insn(&opcode)))
242 goto put_out;
243
244 uprobe = container_of(auprobe, struct uprobe, arch);
245 mapping = uprobe->inode->i_mapping;
246 if (mapping != vma->vm_file->f_mapping)
247 goto put_out;
248
249 addr = vma_address(vma, uprobe->offset);
250 if (vaddr != (unsigned long)addr)
251 goto put_out;
252
253 ret = -ENOMEM;
254 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
255 if (!new_page)
256 goto put_out;
257
258 __SetPageUptodate(new_page);
259
260 /*
261 * lock page will serialize against do_wp_page()'s
262 * PageAnon() handling
263 */
264 lock_page(old_page);
265 /* copy the page now that we've got it stable */
266 vaddr_old = kmap_atomic(old_page);
267 vaddr_new = kmap_atomic(new_page);
268
269 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
270
271 /* poke the new insn in, ASSUMES we don't cross page boundary */
272 vaddr &= ~PAGE_MASK;
273 BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
274 memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
275
276 kunmap_atomic(vaddr_new);
277 kunmap_atomic(vaddr_old);
278
279 ret = anon_vma_prepare(vma);
280 if (ret)
281 goto unlock_out;
282
283 lock_page(new_page);
284 ret = __replace_page(vma, old_page, new_page);
285 unlock_page(new_page);
286
287unlock_out:
288 unlock_page(old_page);
289 page_cache_release(new_page);
290
291put_out:
292 put_page(old_page);
293
294 return ret;
295}
296
297/**
298 * read_opcode - read the opcode at a given virtual address.
299 * @mm: the probed process address space.
300 * @vaddr: the virtual address to read the opcode.
301 * @opcode: location to store the read opcode.
302 *
303 * Called with mm->mmap_sem held (for read and with a reference to
 304 * mm).
305 *
306 * For mm @mm, read the opcode at @vaddr and store it in @opcode.
307 * Return 0 (success) or a negative errno.
308 */
309static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode)
310{
311 struct page *page;
312 void *vaddr_new;
313 int ret;
314
315 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
316 if (ret <= 0)
317 return ret;
318
319 lock_page(page);
320 vaddr_new = kmap_atomic(page);
321 vaddr &= ~PAGE_MASK;
322 memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
323 kunmap_atomic(vaddr_new);
324 unlock_page(page);
325
326 put_page(page);
327
328 return 0;
329}
330
331static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
332{
333 uprobe_opcode_t opcode;
334 int result;
335
336 result = read_opcode(mm, vaddr, &opcode);
337 if (result)
338 return result;
339
340 if (is_swbp_insn(&opcode))
341 return 1;
342
343 return 0;
344}
345
346/**
347 * set_swbp - store breakpoint at a given address.
348 * @auprobe: arch specific probepoint information.
349 * @mm: the probed process address space.
350 * @vaddr: the virtual address to insert the opcode.
351 *
352 * For mm @mm, store the breakpoint instruction at @vaddr.
353 * Return 0 (success) or a negative errno.
354 */
355int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
356{
357 int result;
358
359 result = is_swbp_at_addr(mm, vaddr);
360 if (result == 1)
361 return -EEXIST;
362
363 if (result)
364 return result;
365
366 return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
367}
368
369/**
370 * set_orig_insn - Restore the original instruction.
371 * @mm: the probed process address space.
372 * @auprobe: arch specific probepoint information.
373 * @vaddr: the virtual address to insert the opcode.
 374 * @verify: if true, verify the existence of a breakpoint instruction.
375 *
376 * For mm @mm, restore the original opcode (opcode) at @vaddr.
377 * Return 0 (success) or a negative errno.
378 */
379int __weak
380set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
381{
382 if (verify) {
383 int result;
384
385 result = is_swbp_at_addr(mm, vaddr);
386 if (!result)
387 return -EINVAL;
388
389 if (result != 1)
390 return result;
391 }
392 return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
393}
394
395static int match_uprobe(struct uprobe *l, struct uprobe *r)
396{
397 if (l->inode < r->inode)
398 return -1;
399
400 if (l->inode > r->inode)
401 return 1;
402
403 if (l->offset < r->offset)
404 return -1;
405
406 if (l->offset > r->offset)
407 return 1;
408
409 return 0;
410}
411
412static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
413{
414 struct uprobe u = { .inode = inode, .offset = offset };
415 struct rb_node *n = uprobes_tree.rb_node;
416 struct uprobe *uprobe;
417 int match;
418
419 while (n) {
420 uprobe = rb_entry(n, struct uprobe, rb_node);
421 match = match_uprobe(&u, uprobe);
422 if (!match) {
423 atomic_inc(&uprobe->ref);
424 return uprobe;
425 }
426
427 if (match < 0)
428 n = n->rb_left;
429 else
430 n = n->rb_right;
431 }
432 return NULL;
433}
434
435/*
436 * Find a uprobe corresponding to a given inode:offset
437 * Acquires uprobes_treelock
438 */
439static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
440{
441 struct uprobe *uprobe;
442 unsigned long flags;
443
444 spin_lock_irqsave(&uprobes_treelock, flags);
445 uprobe = __find_uprobe(inode, offset);
446 spin_unlock_irqrestore(&uprobes_treelock, flags);
447
448 return uprobe;
449}
450
451static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
452{
453 struct rb_node **p = &uprobes_tree.rb_node;
454 struct rb_node *parent = NULL;
455 struct uprobe *u;
456 int match;
457
458 while (*p) {
459 parent = *p;
460 u = rb_entry(parent, struct uprobe, rb_node);
461 match = match_uprobe(uprobe, u);
462 if (!match) {
463 atomic_inc(&u->ref);
464 return u;
465 }
466
467 if (match < 0)
468 p = &parent->rb_left;
469 else
470 p = &parent->rb_right;
471
472 }
473
474 u = NULL;
475 rb_link_node(&uprobe->rb_node, parent, p);
476 rb_insert_color(&uprobe->rb_node, &uprobes_tree);
477 /* get access + creation ref */
478 atomic_set(&uprobe->ref, 2);
479
480 return u;
481}
482
483/*
484 * Acquire uprobes_treelock.
485 * Matching uprobe already exists in rbtree;
486 * increment (access refcount) and return the matching uprobe.
487 *
488 * No matching uprobe; insert the uprobe in rb_tree;
489 * get a double refcount (access + creation) and return NULL.
490 */
491static struct uprobe *insert_uprobe(struct uprobe *uprobe)
492{
493 unsigned long flags;
494 struct uprobe *u;
495
496 spin_lock_irqsave(&uprobes_treelock, flags);
497 u = __insert_uprobe(uprobe);
498 spin_unlock_irqrestore(&uprobes_treelock, flags);
499
500 /* For now assume that the instruction need not be single-stepped */
501 uprobe->flags |= UPROBE_SKIP_SSTEP;
502
503 return u;
504}
505
506static void put_uprobe(struct uprobe *uprobe)
507{
508 if (atomic_dec_and_test(&uprobe->ref))
509 kfree(uprobe);
510}
511
512static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
513{
514 struct uprobe *uprobe, *cur_uprobe;
515
516 uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
517 if (!uprobe)
518 return NULL;
519
520 uprobe->inode = igrab(inode);
521 uprobe->offset = offset;
522 init_rwsem(&uprobe->consumer_rwsem);
523 INIT_LIST_HEAD(&uprobe->pending_list);
524
525 /* add to uprobes_tree, sorted on inode:offset */
526 cur_uprobe = insert_uprobe(uprobe);
527
528 /* a uprobe exists for this inode:offset combination */
529 if (cur_uprobe) {
530 kfree(uprobe);
531 uprobe = cur_uprobe;
532 iput(inode);
533 } else {
534 atomic_inc(&uprobe_events);
535 }
536
537 return uprobe;
538}
539
540static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
541{
542 struct uprobe_consumer *uc;
543
544 if (!(uprobe->flags & UPROBE_RUN_HANDLER))
545 return;
546
547 down_read(&uprobe->consumer_rwsem);
548 for (uc = uprobe->consumers; uc; uc = uc->next) {
549 if (!uc->filter || uc->filter(uc, current))
550 uc->handler(uc, regs);
551 }
552 up_read(&uprobe->consumer_rwsem);
553}
554
555/* Returns the previous consumer */
556static struct uprobe_consumer *
557consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
558{
559 down_write(&uprobe->consumer_rwsem);
560 uc->next = uprobe->consumers;
561 uprobe->consumers = uc;
562 up_write(&uprobe->consumer_rwsem);
563
564 return uc->next;
565}
566
567/*
568 * For uprobe @uprobe, delete the consumer @uc.
569 * Return true if the @uc is deleted successfully
570 * or return false.
571 */
572static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
573{
574 struct uprobe_consumer **con;
575 bool ret = false;
576
577 down_write(&uprobe->consumer_rwsem);
578 for (con = &uprobe->consumers; *con; con = &(*con)->next) {
579 if (*con == uc) {
580 *con = uc->next;
581 ret = true;
582 break;
583 }
584 }
585 up_write(&uprobe->consumer_rwsem);
586
587 return ret;
588}
589
590static int
591__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn,
592 unsigned long nbytes, unsigned long offset)
593{
594 struct file *filp = vma->vm_file;
595 struct page *page;
596 void *vaddr;
597 unsigned long off1;
598 unsigned long idx;
599
600 if (!filp)
601 return -EINVAL;
602
603 idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
604 off1 = offset &= ~PAGE_MASK;
605
606 /*
607 * Ensure that the page that has the original instruction is
608 * populated and in page-cache.
609 */
610 page = read_mapping_page(mapping, idx, filp);
611 if (IS_ERR(page))
612 return PTR_ERR(page);
613
614 vaddr = kmap_atomic(page);
615 memcpy(insn, vaddr + off1, nbytes);
616 kunmap_atomic(vaddr);
617 page_cache_release(page);
618
619 return 0;
620}
621
622static int
623copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
624{
625 struct address_space *mapping;
626 unsigned long nbytes;
627 int bytes;
628
629 addr &= ~PAGE_MASK;
630 nbytes = PAGE_SIZE - addr;
631 mapping = uprobe->inode->i_mapping;
632
633 /* Instruction at end of binary; copy only available bytes */
634 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
635 bytes = uprobe->inode->i_size - uprobe->offset;
636 else
637 bytes = MAX_UINSN_BYTES;
638
639 /* Instruction at the page-boundary; copy bytes in second page */
640 if (nbytes < bytes) {
641 if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes,
642 bytes - nbytes, uprobe->offset + nbytes))
643 return -ENOMEM;
644
645 bytes = nbytes;
646 }
647 return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
648}
649
650/*
651 * How mm->uprobes_state.count gets updated
652 * uprobe_mmap() increments the count if
653 * - it successfully adds a breakpoint.
 654 * - it cannot add a breakpoint, but sees that there is an underlying
655 * breakpoint (via a is_swbp_at_addr()).
656 *
657 * uprobe_munmap() decrements the count if
 658 * - it sees an underlying breakpoint (via is_swbp_at_addr).
 659 * (Subsequent uprobe_unregister wouldn't find the breakpoint
660 * unless a uprobe_mmap kicks in, since the old vma would be
661 * dropped just after uprobe_munmap.)
662 *
663 * uprobe_register increments the count if:
664 * - it successfully adds a breakpoint.
665 *
666 * uprobe_unregister decrements the count if:
 667 * - it sees an underlying breakpoint and removes it successfully
 668 * (via is_swbp_at_addr).
 669 * (Subsequent uprobe_munmap wouldn't find the breakpoint
670 * since there is no underlying breakpoint after the
671 * breakpoint removal.)
672 */
673static int
674install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
675 struct vm_area_struct *vma, loff_t vaddr)
676{
677 unsigned long addr;
678 int ret;
679
680 /*
 681 * If the probe is being deleted, the unregistering thread could already
 682 * be done with its vma-rmap walk. Adding a probe now can be fatal since
 683 * nobody will be able to clean it up. Also we could be on the fork or
 684 * mremap path, where the probe might have already been inserted.
 685 * Hence behave as if the probe already existed.
686 */
687 if (!uprobe->consumers)
688 return -EEXIST;
689
690 addr = (unsigned long)vaddr;
691
692 if (!(uprobe->flags & UPROBE_COPY_INSN)) {
693 ret = copy_insn(uprobe, vma, addr);
694 if (ret)
695 return ret;
696
697 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
698 return -EEXIST;
699
700 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm);
701 if (ret)
702 return ret;
703
704 uprobe->flags |= UPROBE_COPY_INSN;
705 }
706
707 /*
 708 * Ideally, we should be updating the probe count after the breakpoint
 709 * has been successfully inserted. However, a thread could hit the
710 * breakpoint we just inserted even before the probe count is
711 * incremented. If this is the first breakpoint placed, breakpoint
712 * notifier might ignore uprobes and pass the trap to the thread.
713 * Hence increment before and decrement on failure.
714 */
715 atomic_inc(&mm->uprobes_state.count);
716 ret = set_swbp(&uprobe->arch, mm, addr);
717 if (ret)
718 atomic_dec(&mm->uprobes_state.count);
719
720 return ret;
721}
722
723static void
724remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
725{
726 if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true))
727 atomic_dec(&mm->uprobes_state.count);
728}
729
730/*
731 * There could be threads that have hit the breakpoint and are entering the
732 * notifier code and trying to acquire the uprobes_treelock. The thread
733 * calling delete_uprobe() that is removing the uprobe from the rb_tree can
 734 * race with these threads and might acquire the uprobes_treelock before
 735 * some of the breakpoint-hit threads. In such a case, the breakpoint-hit
 736 * threads will not find the uprobe. The current unregistering thread
 737 * waits until all other threads have hit a breakpoint, to acquire the
738 * uprobes_treelock before the uprobe is removed from the rbtree.
739 */
740static void delete_uprobe(struct uprobe *uprobe)
741{
742 unsigned long flags;
743
744 synchronize_srcu(&uprobes_srcu);
745 spin_lock_irqsave(&uprobes_treelock, flags);
746 rb_erase(&uprobe->rb_node, &uprobes_tree);
747 spin_unlock_irqrestore(&uprobes_treelock, flags);
748 iput(uprobe->inode);
749 put_uprobe(uprobe);
750 atomic_dec(&uprobe_events);
751}
752
753static struct vma_info *
754__find_next_vma_info(struct address_space *mapping, struct list_head *head,
755 struct vma_info *vi, loff_t offset, bool is_register)
756{
757 struct prio_tree_iter iter;
758 struct vm_area_struct *vma;
759 struct vma_info *tmpvi;
760 unsigned long pgoff;
761 int existing_vma;
762 loff_t vaddr;
763
764 pgoff = offset >> PAGE_SHIFT;
765
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (!valid_vma(vma, is_register))
768 continue;
769
770 existing_vma = 0;
771 vaddr = vma_address(vma, offset);
772
773 list_for_each_entry(tmpvi, head, probe_list) {
774 if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
775 existing_vma = 1;
776 break;
777 }
778 }
779
780 /*
 781 * Another vma needs a probe to be installed. However, skip
782 * installing the probe if the vma is about to be unlinked.
783 */
784 if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
785 vi->mm = vma->vm_mm;
786 vi->vaddr = vaddr;
787 list_add(&vi->probe_list, head);
788
789 return vi;
790 }
791 }
792
793 return NULL;
794}
795
796/*
797 * Iterate in the rmap prio tree and find a vma where a probe has not
798 * yet been inserted.
799 */
800static struct vma_info *
801find_next_vma_info(struct address_space *mapping, struct list_head *head,
802 loff_t offset, bool is_register)
803{
804 struct vma_info *vi, *retvi;
805
806 vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
807 if (!vi)
808 return ERR_PTR(-ENOMEM);
809
810 mutex_lock(&mapping->i_mmap_mutex);
811 retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
812 mutex_unlock(&mapping->i_mmap_mutex);
813
814 if (!retvi)
815 kfree(vi);
816
817 return retvi;
818}
819
820static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
821{
822 struct list_head try_list;
823 struct vm_area_struct *vma;
824 struct address_space *mapping;
825 struct vma_info *vi, *tmpvi;
826 struct mm_struct *mm;
827 loff_t vaddr;
828 int ret;
829
830 mapping = uprobe->inode->i_mapping;
831 INIT_LIST_HEAD(&try_list);
832
833 ret = 0;
834
835 for (;;) {
836 vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
837 if (!vi)
838 break;
839
840 if (IS_ERR(vi)) {
841 ret = PTR_ERR(vi);
842 break;
843 }
844
845 mm = vi->mm;
846 down_read(&mm->mmap_sem);
847 vma = find_vma(mm, (unsigned long)vi->vaddr);
848 if (!vma || !valid_vma(vma, is_register)) {
849 list_del(&vi->probe_list);
850 kfree(vi);
851 up_read(&mm->mmap_sem);
852 mmput(mm);
853 continue;
854 }
855 vaddr = vma_address(vma, uprobe->offset);
856 if (vma->vm_file->f_mapping->host != uprobe->inode ||
857 vaddr != vi->vaddr) {
858 list_del(&vi->probe_list);
859 kfree(vi);
860 up_read(&mm->mmap_sem);
861 mmput(mm);
862 continue;
863 }
864
865 if (is_register)
866 ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
867 else
868 remove_breakpoint(uprobe, mm, vi->vaddr);
869
870 up_read(&mm->mmap_sem);
871 mmput(mm);
872 if (is_register) {
873 if (ret && ret == -EEXIST)
874 ret = 0;
875 if (ret)
876 break;
877 }
878 }
879
880 list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
881 list_del(&vi->probe_list);
882 kfree(vi);
883 }
884
885 return ret;
886}
887
888static int __uprobe_register(struct uprobe *uprobe)
889{
890 return register_for_each_vma(uprobe, true);
891}
892
893static void __uprobe_unregister(struct uprobe *uprobe)
894{
895 if (!register_for_each_vma(uprobe, false))
896 delete_uprobe(uprobe);
897
 898 /* TODO: can't unregister? schedule a worker thread */
899}
900
901/*
902 * uprobe_register - register a probe
903 * @inode: the file in which the probe has to be placed.
904 * @offset: offset from the start of the file.
 905 * @uc: information on how to handle the probe.
906 *
907 * Apart from the access refcount, uprobe_register() takes a creation
 908 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
 909 * inserted into the rbtree (i.e. first consumer for an @inode:@offset
910 * tuple). Creation refcount stops uprobe_unregister from freeing the
911 * @uprobe even before the register operation is complete. Creation
912 * refcount is released when the last @uc for the @uprobe
913 * unregisters.
914 *
 915 * Return errno if it cannot successfully install probes,
 916 * else return 0 (success).
917 */
918int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
919{
920 struct uprobe *uprobe;
921 int ret;
922
923 if (!inode || !uc || uc->next)
924 return -EINVAL;
925
926 if (offset > i_size_read(inode))
927 return -EINVAL;
928
929 ret = 0;
930 mutex_lock(uprobes_hash(inode));
931 uprobe = alloc_uprobe(inode, offset);
932
933 if (uprobe && !consumer_add(uprobe, uc)) {
934 ret = __uprobe_register(uprobe);
935 if (ret) {
936 uprobe->consumers = NULL;
937 __uprobe_unregister(uprobe);
938 } else {
939 uprobe->flags |= UPROBE_RUN_HANDLER;
940 }
941 }
942
943 mutex_unlock(uprobes_hash(inode));
944 put_uprobe(uprobe);
945
946 return ret;
947}
948
949/*
 950 * uprobe_unregister - unregister an already registered probe.
951 * @inode: the file in which the probe has to be removed.
952 * @offset: offset from the start of the file.
953 * @uc: identify which probe if multiple probes are colocated.
954 */
955void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
956{
957 struct uprobe *uprobe;
958
959 if (!inode || !uc)
960 return;
961
962 uprobe = find_uprobe(inode, offset);
963 if (!uprobe)
964 return;
965
966 mutex_lock(uprobes_hash(inode));
967
968 if (consumer_del(uprobe, uc)) {
969 if (!uprobe->consumers) {
970 __uprobe_unregister(uprobe);
971 uprobe->flags &= ~UPROBE_RUN_HANDLER;
972 }
973 }
974
975 mutex_unlock(uprobes_hash(inode));
976 if (uprobe)
977 put_uprobe(uprobe);
978}
979
980/*
981 * Of all the nodes that correspond to the given inode, return the node
982 * with the least offset.
983 */
984static struct rb_node *find_least_offset_node(struct inode *inode)
985{
986 struct uprobe u = { .inode = inode, .offset = 0};
987 struct rb_node *n = uprobes_tree.rb_node;
988 struct rb_node *close_node = NULL;
989 struct uprobe *uprobe;
990 int match;
991
992 while (n) {
993 uprobe = rb_entry(n, struct uprobe, rb_node);
994 match = match_uprobe(&u, uprobe);
995
996 if (uprobe->inode == inode)
997 close_node = n;
998
999 if (!match)
1000 return close_node;
1001
1002 if (match < 0)
1003 n = n->rb_left;
1004 else
1005 n = n->rb_right;
1006 }
1007
1008 return close_node;
1009}
1010
1011/*
1012 * For a given inode, build a list of probes that need to be inserted.
1013 */
1014static void build_probe_list(struct inode *inode, struct list_head *head)
1015{
1016 struct uprobe *uprobe;
1017 unsigned long flags;
1018 struct rb_node *n;
1019
1020 spin_lock_irqsave(&uprobes_treelock, flags);
1021
1022 n = find_least_offset_node(inode);
1023
1024 for (; n; n = rb_next(n)) {
1025 uprobe = rb_entry(n, struct uprobe, rb_node);
1026 if (uprobe->inode != inode)
1027 break;
1028
1029 list_add(&uprobe->pending_list, head);
1030 atomic_inc(&uprobe->ref);
1031 }
1032
1033 spin_unlock_irqrestore(&uprobes_treelock, flags);
1034}
1035
1036/*
1037 * Called from mmap_region.
1038 * called with mm->mmap_sem acquired.
1039 *
 1040 * Return a negative errno if we fail to insert probes and we cannot
 1041 * bail out.
 1042 * Return 0 otherwise, i.e.:
 1043 *
 1044 * - successful insertion of probes
 1045 * - (or) no possible probes to be inserted.
 1046 * - (or) insertion of probes failed but we can bail out.
1047 */
1048int uprobe_mmap(struct vm_area_struct *vma)
1049{
1050 struct list_head tmp_list;
1051 struct uprobe *uprobe, *u;
1052 struct inode *inode;
1053 int ret, count;
1054
1055 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
1056 return 0;
1057
1058 inode = vma->vm_file->f_mapping->host;
1059 if (!inode)
1060 return 0;
1061
1062 INIT_LIST_HEAD(&tmp_list);
1063 mutex_lock(uprobes_mmap_hash(inode));
1064 build_probe_list(inode, &tmp_list);
1065
1066 ret = 0;
1067 count = 0;
1068
1069 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1070 loff_t vaddr;
1071
1072 list_del(&uprobe->pending_list);
1073 if (!ret) {
1074 vaddr = vma_address(vma, uprobe->offset);
1075
1076 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1077 put_uprobe(uprobe);
1078 continue;
1079 }
1080
1081 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1082
1083 /* Ignore double add: */
1084 if (ret == -EEXIST) {
1085 ret = 0;
1086
1087 if (!is_swbp_at_addr(vma->vm_mm, vaddr))
1088 continue;
1089
1090 /*
1091 * Unable to insert a breakpoint, but
1092 * breakpoint lies underneath. Increment the
1093 * probe count.
1094 */
1095 atomic_inc(&vma->vm_mm->uprobes_state.count);
1096 }
1097
1098 if (!ret)
1099 count++;
1100 }
1101 put_uprobe(uprobe);
1102 }
1103
1104 mutex_unlock(uprobes_mmap_hash(inode));
1105
1106 if (ret)
1107 atomic_sub(count, &vma->vm_mm->uprobes_state.count);
1108
1109 return ret;
1110}
1111
1112/*
1113 * Called in context of a munmap of a vma.
1114 */
1115void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1116{
1117 struct list_head tmp_list;
1118 struct uprobe *uprobe, *u;
1119 struct inode *inode;
1120
1121 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1122 return;
1123
1124 if (!atomic_read(&vma->vm_mm->uprobes_state.count))
1125 return;
1126
1127 inode = vma->vm_file->f_mapping->host;
1128 if (!inode)
1129 return;
1130
1131 INIT_LIST_HEAD(&tmp_list);
1132 mutex_lock(uprobes_mmap_hash(inode));
1133 build_probe_list(inode, &tmp_list);
1134
1135 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1136 loff_t vaddr;
1137
1138 list_del(&uprobe->pending_list);
1139 vaddr = vma_address(vma, uprobe->offset);
1140
1141 if (vaddr >= start && vaddr < end) {
1142 /*
1143 * An unregister could have removed the probe before
1144 * unmap. So check before we decrement the count.
1145 */
1146 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1147 atomic_dec(&vma->vm_mm->uprobes_state.count);
1148 }
1149 put_uprobe(uprobe);
1150 }
1151 mutex_unlock(uprobes_mmap_hash(inode));
1152}
1153
1154/* Slot allocation for XOL */
1155static int xol_add_vma(struct xol_area *area)
1156{
1157 struct mm_struct *mm;
1158 int ret;
1159
1160 area->page = alloc_page(GFP_HIGHUSER);
1161 if (!area->page)
1162 return -ENOMEM;
1163
1164 ret = -EALREADY;
1165 mm = current->mm;
1166
1167 down_write(&mm->mmap_sem);
1168 if (mm->uprobes_state.xol_area)
1169 goto fail;
1170
1171 ret = -ENOMEM;
1172
1173 /* Try to map as high as possible, this is only a hint. */
1174 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1175 if (area->vaddr & ~PAGE_MASK) {
1176 ret = area->vaddr;
1177 goto fail;
1178 }
1179
1180 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1181 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
1182 if (ret)
1183 goto fail;
1184
1185 smp_wmb(); /* pairs with get_xol_area() */
1186 mm->uprobes_state.xol_area = area;
1187 ret = 0;
1188
1189fail:
1190 up_write(&mm->mmap_sem);
1191 if (ret)
1192 __free_page(area->page);
1193
1194 return ret;
1195}
1196
1197static struct xol_area *get_xol_area(struct mm_struct *mm)
1198{
1199 struct xol_area *area;
1200
1201 area = mm->uprobes_state.xol_area;
1202 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1203
1204 return area;
1205}
1206
1207/*
1208 * xol_alloc_area - Allocate process's xol_area.
1209 * This area will be used for storing instructions for execution out of
1210 * line.
1211 *
1212 * Returns the allocated area or NULL.
1213 */
1214static struct xol_area *xol_alloc_area(void)
1215{
1216 struct xol_area *area;
1217
1218 area = kzalloc(sizeof(*area), GFP_KERNEL);
1219 if (unlikely(!area))
1220 return NULL;
1221
1222 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1223
1224 if (!area->bitmap)
1225 goto fail;
1226
1227 init_waitqueue_head(&area->wq);
1228 if (!xol_add_vma(area))
1229 return area;
1230
1231fail:
1232 kfree(area->bitmap);
1233 kfree(area);
1234
1235 return get_xol_area(current->mm);
1236}
1237
1238/*
1239 * uprobe_clear_state - Free the area allocated for slots.
1240 */
1241void uprobe_clear_state(struct mm_struct *mm)
1242{
1243 struct xol_area *area = mm->uprobes_state.xol_area;
1244
1245 if (!area)
1246 return;
1247
1248 put_page(area->page);
1249 kfree(area->bitmap);
1250 kfree(area);
1251}
1252
1253/*
1254 * uprobe_reset_state - Free the area allocated for slots.
1255 */
1256void uprobe_reset_state(struct mm_struct *mm)
1257{
1258 mm->uprobes_state.xol_area = NULL;
1259 atomic_set(&mm->uprobes_state.count, 0);
1260}
1261
1262/*
1263 * - search for a free slot.
1264 */
1265static unsigned long xol_take_insn_slot(struct xol_area *area)
1266{
1267 unsigned long slot_addr;
1268 int slot_nr;
1269
1270 do {
1271 slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1272 if (slot_nr < UINSNS_PER_PAGE) {
1273 if (!test_and_set_bit(slot_nr, area->bitmap))
1274 break;
1275
1276 slot_nr = UINSNS_PER_PAGE;
1277 continue;
1278 }
1279 wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1280 } while (slot_nr >= UINSNS_PER_PAGE);
1281
1282 slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1283 atomic_inc(&area->slot_count);
1284
1285 return slot_addr;
1286}
1287
1288/*
 1289 * xol_get_insn_slot - If a slot was not already allocated,
 1290 * allocate one.
1291 * Returns the allocated slot address or 0.
1292 */
1293static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
1294{
1295 struct xol_area *area;
1296 unsigned long offset;
1297 void *vaddr;
1298
1299 area = get_xol_area(current->mm);
1300 if (!area) {
1301 area = xol_alloc_area();
1302 if (!area)
1303 return 0;
1304 }
1305 current->utask->xol_vaddr = xol_take_insn_slot(area);
1306
1307 /*
1308 * Initialize the slot if xol_vaddr points to valid
1309 * instruction slot.
1310 */
1311 if (unlikely(!current->utask->xol_vaddr))
1312 return 0;
1313
1314 current->utask->vaddr = slot_addr;
1315 offset = current->utask->xol_vaddr & ~PAGE_MASK;
1316 vaddr = kmap_atomic(area->page);
1317 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1318 kunmap_atomic(vaddr);
1319
1320 return current->utask->xol_vaddr;
1321}
1322
1323/*
 1324 * xol_free_insn_slot - If a slot was earlier allocated by
1325 * @xol_get_insn_slot(), make the slot available for
1326 * subsequent requests.
1327 */
1328static void xol_free_insn_slot(struct task_struct *tsk)
1329{
1330 struct xol_area *area;
1331 unsigned long vma_end;
1332 unsigned long slot_addr;
1333
1334 if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1335 return;
1336
1337 slot_addr = tsk->utask->xol_vaddr;
1338
1339 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1340 return;
1341
1342 area = tsk->mm->uprobes_state.xol_area;
1343 vma_end = area->vaddr + PAGE_SIZE;
1344 if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1345 unsigned long offset;
1346 int slot_nr;
1347
1348 offset = slot_addr - area->vaddr;
1349 slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1350 if (slot_nr >= UINSNS_PER_PAGE)
1351 return;
1352
1353 clear_bit(slot_nr, area->bitmap);
1354 atomic_dec(&area->slot_count);
1355 if (waitqueue_active(&area->wq))
1356 wake_up(&area->wq);
1357
1358 tsk->utask->xol_vaddr = 0;
1359 }
1360}
1361
1362/**
1363 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1364 * @regs: Reflects the saved state of the task after it has hit a breakpoint
1365 * instruction.
1366 * Return the address of the breakpoint instruction.
1367 */
1368unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1369{
1370 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1371}
1372
1373/*
1374 * Called with no locks held.
 1375 * Called in context of an exiting or an exec-ing thread.
1376 */
1377void uprobe_free_utask(struct task_struct *t)
1378{
1379 struct uprobe_task *utask = t->utask;
1380
1381 if (t->uprobe_srcu_id != -1)
1382 srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
1383
1384 if (!utask)
1385 return;
1386
1387 if (utask->active_uprobe)
1388 put_uprobe(utask->active_uprobe);
1389
1390 xol_free_insn_slot(t);
1391 kfree(utask);
1392 t->utask = NULL;
1393}
1394
1395/*
1396 * Called in context of a new clone/fork from copy_process.
1397 */
1398void uprobe_copy_process(struct task_struct *t)
1399{
1400 t->utask = NULL;
1401 t->uprobe_srcu_id = -1;
1402}
1403
1404/*
1405 * Allocate a uprobe_task object for the task.
1406 * Called when the thread hits a breakpoint for the first time.
1407 *
1408 * Returns:
1409 * - pointer to new uprobe_task on success
1410 * - NULL otherwise
1411 */
1412static struct uprobe_task *add_utask(void)
1413{
1414 struct uprobe_task *utask;
1415
1416 utask = kzalloc(sizeof *utask, GFP_KERNEL);
1417 if (unlikely(!utask))
1418 return NULL;
1419
1420 utask->active_uprobe = NULL;
1421 current->utask = utask;
1422 return utask;
1423}
1424
1425/* Prepare to single-step probed instruction out of line. */
1426static int
1427pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
1428{
1429 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
1430 return 0;
1431
1432 return -EFAULT;
1433}
1434
1435/*
1436 * If we are singlestepping, then ensure this thread is not connected to
1437 * non-fatal signals until completion of singlestep. When xol insn itself
1438 * triggers the signal, restart the original insn even if the task is
1439 * already SIGKILL'ed (since coredump should report the correct ip). This
1440 * is even more important if the task has a handler for SIGSEGV/etc, The
1441 * _same_ instruction should be repeated again after return from the signal
1442 * handler, and SSTEP can never finish in this case.
1443 */
1444bool uprobe_deny_signal(void)
1445{
1446 struct task_struct *t = current;
1447 struct uprobe_task *utask = t->utask;
1448
1449 if (likely(!utask || !utask->active_uprobe))
1450 return false;
1451
1452 WARN_ON_ONCE(utask->state != UTASK_SSTEP);
1453
1454 if (signal_pending(t)) {
1455 spin_lock_irq(&t->sighand->siglock);
1456 clear_tsk_thread_flag(t, TIF_SIGPENDING);
1457 spin_unlock_irq(&t->sighand->siglock);
1458
1459 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1460 utask->state = UTASK_SSTEP_TRAPPED;
1461 set_tsk_thread_flag(t, TIF_UPROBE);
1462 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1463 }
1464 }
1465
1466 return true;
1467}
1468
1469/*
1470 * Avoid singlestepping the original instruction if the original instruction
1471 * is a NOP or can be emulated.
1472 */
1473static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1474{
1475 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1476 return true;
1477
1478 uprobe->flags &= ~UPROBE_SKIP_SSTEP;
1479 return false;
1480}
1481
1482/*
1483 * Run handler and ask thread to singlestep.
1484 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1485 */
1486static void handle_swbp(struct pt_regs *regs)
1487{
1488 struct vm_area_struct *vma;
1489 struct uprobe_task *utask;
1490 struct uprobe *uprobe;
1491 struct mm_struct *mm;
1492 unsigned long bp_vaddr;
1493
1494 uprobe = NULL;
1495 bp_vaddr = uprobe_get_swbp_addr(regs);
1496 mm = current->mm;
1497 down_read(&mm->mmap_sem);
1498 vma = find_vma(mm, bp_vaddr);
1499
1500 if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
1501 struct inode *inode;
1502 loff_t offset;
1503
1504 inode = vma->vm_file->f_mapping->host;
1505 offset = bp_vaddr - vma->vm_start;
1506 offset += (vma->vm_pgoff << PAGE_SHIFT);
1507 uprobe = find_uprobe(inode, offset);
1508 }
1509
1510 srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
1511 current->uprobe_srcu_id = -1;
1512 up_read(&mm->mmap_sem);
1513
1514 if (!uprobe) {
1515 /* No matching uprobe; signal SIGTRAP. */
1516 send_sig(SIGTRAP, current, 0);
1517 return;
1518 }
1519
1520 utask = current->utask;
1521 if (!utask) {
1522 utask = add_utask();
1523 /* Cannot allocate; re-execute the instruction. */
1524 if (!utask)
1525 goto cleanup_ret;
1526 }
1527 utask->active_uprobe = uprobe;
1528 handler_chain(uprobe, regs);
1529 if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs))
1530 goto cleanup_ret;
1531
1532 utask->state = UTASK_SSTEP;
1533 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1534 user_enable_single_step(current);
1535 return;
1536 }
1537
1538cleanup_ret:
1539 if (utask) {
1540 utask->active_uprobe = NULL;
1541 utask->state = UTASK_RUNNING;
1542 }
1543 if (uprobe) {
1544 if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
1545
1546 /*
1547 * cannot singlestep; cannot skip instruction;
1548 * re-execute the instruction.
1549 */
1550 instruction_pointer_set(regs, bp_vaddr);
1551
1552 put_uprobe(uprobe);
1553 }
1554}
1555
1556/*
1557 * Perform required fix-ups and disable singlestep.
1558 * Allow pending signals to take effect.
1559 */
1560static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1561{
1562 struct uprobe *uprobe;
1563
1564 uprobe = utask->active_uprobe;
1565 if (utask->state == UTASK_SSTEP_ACK)
1566 arch_uprobe_post_xol(&uprobe->arch, regs);
1567 else if (utask->state == UTASK_SSTEP_TRAPPED)
1568 arch_uprobe_abort_xol(&uprobe->arch, regs);
1569 else
1570 WARN_ON_ONCE(1);
1571
1572 put_uprobe(uprobe);
1573 utask->active_uprobe = NULL;
1574 utask->state = UTASK_RUNNING;
1575 user_disable_single_step(current);
1576 xol_free_insn_slot(current);
1577
1578 spin_lock_irq(&current->sighand->siglock);
1579 recalc_sigpending(); /* see uprobe_deny_signal() */
1580 spin_unlock_irq(&current->sighand->siglock);
1581}
1582
1583/*
1584 * On breakpoint hit, the breakpoint notifier sets the TIF_UPROBE flag (and, on
1585 * subsequent probe hits on the thread, sets the state to UTASK_BP_HIT) and
1586 * allows the thread to return from interrupt.
1587 *
1588 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and
1589 * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from
1590 * interrupt.
1591 *
1592 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1593 * uprobe_notify_resume().
1594 */
1595void uprobe_notify_resume(struct pt_regs *regs)
1596{
1597 struct uprobe_task *utask;
1598
1599 utask = current->utask;
1600 if (!utask || utask->state == UTASK_BP_HIT)
1601 handle_swbp(regs);
1602 else
1603 handle_singlestep(utask, regs);
1604}
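The comment above describes the hand-off from the notifiers into uprobe_notify_resume(). As a rough sketch of the arch-side glue it assumes (the surrounding return-to-user code varies per architecture and is not part of this file), the exit-to-userspace path would do something like:

	/* Hypothetical arch return-to-user hook, illustrating the flow only. */
	if (test_thread_flag(TIF_UPROBE)) {
		clear_thread_flag(TIF_UPROBE);
		uprobe_notify_resume(regs);	/* dispatches to handle_swbp() or handle_singlestep() */
	}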
1605
1606/*
1607 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
1608 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
1609 */
1610int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1611{
1612 struct uprobe_task *utask;
1613
1614 if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
1615 /* task is currently not uprobed */
1616 return 0;
1617
1618 utask = current->utask;
1619 if (utask)
1620 utask->state = UTASK_BP_HIT;
1621
1622 set_thread_flag(TIF_UPROBE);
1623 current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
1624
1625 return 1;
1626}
1627
1628/*
1629 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
1630 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
1631 */
1632int uprobe_post_sstep_notifier(struct pt_regs *regs)
1633{
1634 struct uprobe_task *utask = current->utask;
1635
1636 if (!current->mm || !utask || !utask->active_uprobe)
1637 /* task is currently not uprobed */
1638 return 0;
1639
1640 utask->state = UTASK_SSTEP_ACK;
1641 set_thread_flag(TIF_UPROBE);
1642 return 1;
1643}
1644
1645static struct notifier_block uprobe_exception_nb = {
1646 .notifier_call = arch_uprobe_exception_notify,
1647 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
1648};
1649
1650static int __init init_uprobes(void)
1651{
1652 int i;
1653
1654 for (i = 0; i < UPROBES_HASH_SZ; i++) {
1655 mutex_init(&uprobes_mutex[i]);
1656 mutex_init(&uprobes_mmap_mutex[i]);
1657 }
1658 init_srcu_struct(&uprobes_srcu);
1659
1660 return register_die_notifier(&uprobe_exception_nb);
1661}
1662module_init(init_uprobes);
1663
1664static void __exit exit_uprobes(void)
1665{
1666}
1667module_exit(exit_uprobes);
diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b425fa7..34867cc5b42a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -884,9 +884,9 @@ static void check_stack_usage(void)
884 884
885 spin_lock(&low_water_lock); 885 spin_lock(&low_water_lock);
886 if (free < lowest_to_date) { 886 if (free < lowest_to_date) {
887 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " 887 printk(KERN_WARNING "%s (%d) used greatest stack depth: "
888 "left\n", 888 "%lu bytes left\n",
889 current->comm, free); 889 current->comm, task_pid_nr(current), free);
890 lowest_to_date = free; 890 lowest_to_date = free;
891 } 891 }
892 spin_unlock(&low_water_lock); 892 spin_unlock(&low_water_lock);
@@ -946,12 +946,13 @@ void do_exit(long code)
946 exit_signals(tsk); /* sets PF_EXITING */ 946 exit_signals(tsk); /* sets PF_EXITING */
947 /* 947 /*
948 * tsk->flags are checked in the futex code to protect against 948 * tsk->flags are checked in the futex code to protect against
949 * an exiting task cleaning up the robust pi futexes. 949 * an exiting task cleaning up the robust pi futexes, and in
950 * task_work_add() to avoid the race with exit_task_work().
950 */ 951 */
951 smp_mb(); 952 smp_mb();
952 raw_spin_unlock_wait(&tsk->pi_lock); 953 raw_spin_unlock_wait(&tsk->pi_lock);
953 954
954 exit_irq_thread(); 955 exit_task_work(tsk);
955 956
956 if (unlikely(in_atomic())) 957 if (unlikely(in_atomic()))
957 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 958 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -1214,7 +1215,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1214 unsigned long state; 1215 unsigned long state;
1215 int retval, status, traced; 1216 int retval, status, traced;
1216 pid_t pid = task_pid_vnr(p); 1217 pid_t pid = task_pid_vnr(p);
1217 uid_t uid = __task_cred(p)->uid; 1218 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1218 struct siginfo __user *infop; 1219 struct siginfo __user *infop;
1219 1220
1220 if (!likely(wo->wo_flags & WEXITED)) 1221 if (!likely(wo->wo_flags & WEXITED))
@@ -1427,7 +1428,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1427 if (!unlikely(wo->wo_flags & WNOWAIT)) 1428 if (!unlikely(wo->wo_flags & WNOWAIT))
1428 *p_code = 0; 1429 *p_code = 0;
1429 1430
1430 uid = task_uid(p); 1431 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1431unlock_sig: 1432unlock_sig:
1432 spin_unlock_irq(&p->sighand->siglock); 1433 spin_unlock_irq(&p->sighand->siglock);
1433 if (!exit_code) 1434 if (!exit_code)
@@ -1500,7 +1501,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1500 } 1501 }
1501 if (!unlikely(wo->wo_flags & WNOWAIT)) 1502 if (!unlikely(wo->wo_flags & WNOWAIT))
1502 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1503 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1503 uid = task_uid(p); 1504 uid = from_kuid_munged(current_user_ns(), task_uid(p));
1504 spin_unlock_irq(&p->sighand->siglock); 1505 spin_unlock_irq(&p->sighand->siglock);
1505 1506
1506 pid = task_pid_vnr(p); 1507 pid = task_pid_vnr(p);
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b8241..fe35a634bf76 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex);
35extern struct exception_table_entry __start___ex_table[]; 35extern struct exception_table_entry __start___ex_table[];
36extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
37 37
38/* Cleared by build time tools if the table is already sorted. */
39u32 __initdata main_extable_sort_needed = 1;
40
38/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
39void __init sort_main_extable(void) 42void __init sort_main_extable(void)
40{ 43{
41 sort_extable(__start___ex_table, __stop___ex_table); 44 if (main_extable_sort_needed)
45 sort_extable(__start___ex_table, __stop___ex_table);
46 else
47 pr_notice("__ex_table already sorted, skipping sort\n");
42} 48}
43 49
44/* Given an address, look for it in the exception tables. */ 50/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..ab5211b9e622 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
34#include <linux/cgroup.h> 34#include <linux/cgroup.h>
35#include <linux/security.h> 35#include <linux/security.h>
36#include <linux/hugetlb.h> 36#include <linux/hugetlb.h>
37#include <linux/seccomp.h>
37#include <linux/swap.h> 38#include <linux/swap.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
39#include <linux/jiffies.h> 40#include <linux/jiffies.h>
@@ -47,6 +48,7 @@
47#include <linux/audit.h> 48#include <linux/audit.h>
48#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
49#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/proc_fs.h>
50#include <linux/profile.h> 52#include <linux/profile.h>
51#include <linux/rmap.h> 53#include <linux/rmap.h>
52#include <linux/ksm.h> 54#include <linux/ksm.h>
@@ -67,6 +69,7 @@
67#include <linux/oom.h> 69#include <linux/oom.h>
68#include <linux/khugepaged.h> 70#include <linux/khugepaged.h>
69#include <linux/signalfd.h> 71#include <linux/signalfd.h>
72#include <linux/uprobes.h>
70 73
71#include <asm/pgtable.h> 74#include <asm/pgtable.h>
72#include <asm/pgalloc.h> 75#include <asm/pgalloc.h>
@@ -111,32 +114,67 @@ int nr_processes(void)
111 return total; 114 return total;
112} 115}
113 116
114#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
115# define alloc_task_struct_node(node) \
116 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
117# define free_task_struct(tsk) \
118 kmem_cache_free(task_struct_cachep, (tsk))
119static struct kmem_cache *task_struct_cachep; 118static struct kmem_cache *task_struct_cachep;
119
120static inline struct task_struct *alloc_task_struct_node(int node)
121{
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123}
124
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk)
128{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk);
131}
120#endif 132#endif
121 133
122#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR 134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136
137/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
139 * kmemcache based allocator.
140 */
141# if THREAD_SIZE >= PAGE_SIZE
123static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 142static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
124 int node) 143 int node)
125{ 144{
126#ifdef CONFIG_DEBUG_STACK_USAGE 145 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
127 gfp_t mask = GFP_KERNEL | __GFP_ZERO; 146 THREAD_SIZE_ORDER);
128#else
129 gfp_t mask = GFP_KERNEL;
130#endif
131 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
132 147
133 return page ? page_address(page) : NULL; 148 return page ? page_address(page) : NULL;
134} 149}
135 150
136static inline void free_thread_info(struct thread_info *ti) 151static inline void free_thread_info(struct thread_info *ti)
137{ 152{
153 arch_release_thread_info(ti);
138 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
139} 155}
156# else
157static struct kmem_cache *thread_info_cache;
158
159static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
160 int node)
161{
162 return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
163}
164
165static void free_thread_info(struct thread_info *ti)
166{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti);
169}
170
171void thread_info_cache_init(void)
172{
173 thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
174 THREAD_SIZE, 0, NULL);
175 BUG_ON(thread_info_cache == NULL);
176}
177# endif
140#endif 178#endif
141 179
142/* SLAB cache for signal_struct structures (tsk->signal) */ 180/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -170,6 +208,7 @@ void free_task(struct task_struct *tsk)
170 free_thread_info(tsk->stack); 208 free_thread_info(tsk->stack);
171 rt_mutex_debug_task_free(tsk); 209 rt_mutex_debug_task_free(tsk);
172 ftrace_graph_exit_task(tsk); 210 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk);
173 free_task_struct(tsk); 212 free_task_struct(tsk);
174} 213}
175EXPORT_SYMBOL(free_task); 214EXPORT_SYMBOL(free_task);
@@ -203,17 +242,11 @@ void __put_task_struct(struct task_struct *tsk)
203} 242}
204EXPORT_SYMBOL_GPL(__put_task_struct); 243EXPORT_SYMBOL_GPL(__put_task_struct);
205 244
206/* 245void __init __weak arch_task_cache_init(void) { }
207 * macro override instead of weak attribute alias, to workaround
208 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
209 */
210#ifndef arch_task_cache_init
211#define arch_task_cache_init()
212#endif
213 246
214void __init fork_init(unsigned long mempages) 247void __init fork_init(unsigned long mempages)
215{ 248{
216#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 249#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
217#ifndef ARCH_MIN_TASKALIGN 250#ifndef ARCH_MIN_TASKALIGN
218#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES 251#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
219#endif 252#endif
@@ -260,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
260 int node = tsk_fork_get_node(orig); 293 int node = tsk_fork_get_node(orig);
261 int err; 294 int err;
262 295
263 prepare_to_copy(orig);
264
265 tsk = alloc_task_struct_node(node); 296 tsk = alloc_task_struct_node(node);
266 if (!tsk) 297 if (!tsk)
267 return NULL; 298 return NULL;
@@ -355,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
355 } 386 }
356 charge = 0; 387 charge = 0;
357 if (mpnt->vm_flags & VM_ACCOUNT) { 388 if (mpnt->vm_flags & VM_ACCOUNT) {
358 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 389 unsigned long len;
390 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
359 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 391 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
360 goto fail_nomem; 392 goto fail_nomem;
361 charge = len; 393 charge = len;
@@ -421,6 +453,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
421 453
422 if (retval) 454 if (retval)
423 goto out; 455 goto out;
456
457 if (file && uprobe_mmap(tmp))
458 goto out;
424 } 459 }
425 /* a new mm has just been created */ 460 /* a new mm has just been created */
426 arch_dup_mmap(oldmm, mm); 461 arch_dup_mmap(oldmm, mm);
@@ -569,6 +604,7 @@ void mmput(struct mm_struct *mm)
569 might_sleep(); 604 might_sleep();
570 605
571 if (atomic_dec_and_test(&mm->mm_users)) { 606 if (atomic_dec_and_test(&mm->mm_users)) {
607 uprobe_clear_state(mm);
572 exit_aio(mm); 608 exit_aio(mm);
573 ksm_exit(mm); 609 ksm_exit(mm);
574 khugepaged_exit(mm); /* must run before exit_mmap */ 610 khugepaged_exit(mm); /* must run before exit_mmap */
@@ -579,7 +615,6 @@ void mmput(struct mm_struct *mm)
579 list_del(&mm->mmlist); 615 list_del(&mm->mmlist);
580 spin_unlock(&mmlist_lock); 616 spin_unlock(&mmlist_lock);
581 } 617 }
582 put_swap_token(mm);
583 if (mm->binfmt) 618 if (mm->binfmt)
584 module_put(mm->binfmt->module); 619 module_put(mm->binfmt->module);
585 mmdrop(mm); 620 mmdrop(mm);
@@ -747,12 +782,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
747 exit_pi_state_list(tsk); 782 exit_pi_state_list(tsk);
748#endif 783#endif
749 784
785 uprobe_free_utask(tsk);
786
750 /* Get rid of any cached register state */ 787 /* Get rid of any cached register state */
751 deactivate_mm(tsk, mm); 788 deactivate_mm(tsk, mm);
752 789
753 if (tsk->vfork_done)
754 complete_vfork_done(tsk);
755
756 /* 790 /*
757 * If we're exiting normally, clear a user-space tid field if 791 * If we're exiting normally, clear a user-space tid field if
758 * requested. We leave this alone when dying by signal, to leave 792 * requested. We leave this alone when dying by signal, to leave
@@ -773,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
773 } 807 }
774 tsk->clear_child_tid = NULL; 808 tsk->clear_child_tid = NULL;
775 } 809 }
810
811 /*
 812 * All done, finally we can wake up the parent and return this mm to it.
813 * Also kthread_stop() uses this completion for synchronization.
814 */
815 if (tsk->vfork_done)
816 complete_vfork_done(tsk);
776} 817}
777 818
778/* 819/*
@@ -794,13 +835,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
794 memcpy(mm, oldmm, sizeof(*mm)); 835 memcpy(mm, oldmm, sizeof(*mm));
795 mm_init_cpumask(mm); 836 mm_init_cpumask(mm);
796 837
797 /* Initializing for Swap token stuff */
798 mm->token_priority = 0;
799 mm->last_interval = 0;
800
801#ifdef CONFIG_TRANSPARENT_HUGEPAGE 838#ifdef CONFIG_TRANSPARENT_HUGEPAGE
802 mm->pmd_huge_pte = NULL; 839 mm->pmd_huge_pte = NULL;
803#endif 840#endif
841 uprobe_reset_state(mm);
804 842
805 if (!mm_init(mm, tsk)) 843 if (!mm_init(mm, tsk))
806 goto fail_nomem; 844 goto fail_nomem;
@@ -875,10 +913,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
875 goto fail_nomem; 913 goto fail_nomem;
876 914
877good_mm: 915good_mm:
878 /* Initializing for Swap token stuff */
879 mm->token_priority = 0;
880 mm->last_interval = 0;
881
882 tsk->mm = mm; 916 tsk->mm = mm;
883 tsk->active_mm = mm; 917 tsk->active_mm = mm;
884 return 0; 918 return 0;
@@ -946,9 +980,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
946 * Share io context with parent, if CLONE_IO is set 980 * Share io context with parent, if CLONE_IO is set
947 */ 981 */
948 if (clone_flags & CLONE_IO) { 982 if (clone_flags & CLONE_IO) {
949 tsk->io_context = ioc_task_link(ioc); 983 ioc_task_link(ioc);
950 if (unlikely(!tsk->io_context)) 984 tsk->io_context = ioc;
951 return -ENOMEM;
952 } else if (ioprio_valid(ioc->ioprio)) { 985 } else if (ioprio_valid(ioc->ioprio)) {
953 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); 986 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
954 if (unlikely(!new_ioc)) 987 if (unlikely(!new_ioc))
@@ -1162,6 +1195,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1162 goto fork_out; 1195 goto fork_out;
1163 1196
1164 ftrace_graph_init_task(p); 1197 ftrace_graph_init_task(p);
1198 get_seccomp_filter(p);
1165 1199
1166 rt_mutex_init_task(p); 1200 rt_mutex_init_task(p);
1167 1201
@@ -1342,6 +1376,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1342 INIT_LIST_HEAD(&p->pi_state_list); 1376 INIT_LIST_HEAD(&p->pi_state_list);
1343 p->pi_state_cache = NULL; 1377 p->pi_state_cache = NULL;
1344#endif 1378#endif
1379 uprobe_copy_process(p);
1345 /* 1380 /*
1346 * sigaltstack should be cleared when sharing the same VM 1381 * sigaltstack should be cleared when sharing the same VM
1347 */ 1382 */
@@ -1380,6 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1380 */ 1415 */
1381 p->group_leader = p; 1416 p->group_leader = p;
1382 INIT_LIST_HEAD(&p->thread_group); 1417 INIT_LIST_HEAD(&p->thread_group);
1418 INIT_HLIST_HEAD(&p->task_works);
1383 1419
1384 /* Now that the task is set up, run cgroup callbacks if 1420 /* Now that the task is set up, run cgroup callbacks if
1385 * necessary. We need to run them before the task is visible 1421 * necessary. We need to run them before the task is visible
@@ -1464,6 +1500,8 @@ bad_fork_cleanup_io:
1464 if (p->io_context) 1500 if (p->io_context)
1465 exit_io_context(p); 1501 exit_io_context(p);
1466bad_fork_cleanup_namespaces: 1502bad_fork_cleanup_namespaces:
1503 if (unlikely(clone_flags & CLONE_NEWPID))
1504 pid_ns_release_proc(p->nsproxy->pid_ns);
1467 exit_task_namespaces(p); 1505 exit_task_namespaces(p);
1468bad_fork_cleanup_mm: 1506bad_fork_cleanup_mm:
1469 if (p->mm) 1507 if (p->mm)
diff --git a/kernel/groups.c b/kernel/groups.c
index 99b53d1eb7ea..6b2588dd04ff 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize)
31 group_info->blocks[0] = group_info->small_block; 31 group_info->blocks[0] = group_info->small_block;
32 else { 32 else {
33 for (i = 0; i < nblocks; i++) { 33 for (i = 0; i < nblocks; i++) {
34 gid_t *b; 34 kgid_t *b;
35 b = (void *)__get_free_page(GFP_USER); 35 b = (void *)__get_free_page(GFP_USER);
36 if (!b) 36 if (!b)
37 goto out_undo_partial_alloc; 37 goto out_undo_partial_alloc;
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free);
66static int groups_to_user(gid_t __user *grouplist, 66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info) 67 const struct group_info *group_info)
68{ 68{
69 struct user_namespace *user_ns = current_user_ns();
69 int i; 70 int i;
70 unsigned int count = group_info->ngroups; 71 unsigned int count = group_info->ngroups;
71 72
72 for (i = 0; i < group_info->nblocks; i++) { 73 for (i = 0; i < count; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); 74 gid_t gid;
74 unsigned int len = cp_count * sizeof(*grouplist); 75 gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
75 76 if (put_user(gid, grouplist+i))
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT; 77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 } 78 }
82 return 0; 79 return 0;
83} 80}
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist,
86static int groups_from_user(struct group_info *group_info, 83static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist) 84 gid_t __user *grouplist)
88{ 85{
86 struct user_namespace *user_ns = current_user_ns();
89 int i; 87 int i;
90 unsigned int count = group_info->ngroups; 88 unsigned int count = group_info->ngroups;
91 89
92 for (i = 0; i < group_info->nblocks; i++) { 90 for (i = 0; i < count; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); 91 gid_t gid;
94 unsigned int len = cp_count * sizeof(*grouplist); 92 kgid_t kgid;
95 93 if (get_user(gid, grouplist+i))
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT; 94 return -EFAULT;
98 95
99 grouplist += NGROUPS_PER_BLOCK; 96 kgid = make_kgid(user_ns, gid);
100 count -= cp_count; 97 if (!gid_valid(kgid))
98 return -EINVAL;
99
100 GROUP_AT(group_info, i) = kgid;
101 } 101 }
102 return 0; 102 return 0;
103} 103}
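Both hunks above follow the same conversion pattern: userspace still passes and receives plain gid_t values, while group_info now stores namespace-aware kgid_t, so every value crosses make_kgid()/from_kgid_munged() at the boundary. A minimal sketch of that round trip, using a hypothetical helper name (convert_one_gid is not in the patch):

	/* Illustrative only: round-trip one gid through the kgid API. */
	static int convert_one_gid(struct user_namespace *ns, gid_t gid, gid_t *echoed)
	{
		kgid_t kgid = make_kgid(ns, gid);	/* map into the kernel-internal type */

		if (!gid_valid(kgid))
			return -EINVAL;			/* no mapping in this user namespace */

		*echoed = from_kgid_munged(ns, kgid);	/* what a later getgroups() would report */
		return 0;
	}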
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info)
117 for (base = 0; base < max; base++) { 117 for (base = 0; base < max; base++) {
118 int left = base; 118 int left = base;
119 int right = left + stride; 119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right); 120 kgid_t tmp = GROUP_AT(group_info, right);
121 121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) { 122 while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
123 GROUP_AT(group_info, right) = 123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left); 124 GROUP_AT(group_info, left);
125 right = left; 125 right = left;
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info)
132} 132}
133 133
134/* a simple bsearch */ 134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp) 135int groups_search(const struct group_info *group_info, kgid_t grp)
136{ 136{
137 unsigned int left, right; 137 unsigned int left, right;
138 138
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 if (grp > GROUP_AT(group_info, mid)) 146 if (gid_gt(grp, GROUP_AT(group_info, mid)))
147 left = mid + 1; 147 left = mid + 1;
148 else if (grp < GROUP_AT(group_info, mid)) 148 else if (gid_lt(grp, GROUP_AT(group_info, mid)))
149 right = mid; 149 right = mid;
150 else 150 else
151 return 1; 151 return 1;
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
256/* 256/*
257 * Check whether we're fsgid/egid or in the supplemental group.. 257 * Check whether we're fsgid/egid or in the supplemental group..
258 */ 258 */
259int in_group_p(gid_t grp) 259int in_group_p(kgid_t grp)
260{ 260{
261 const struct cred *cred = current_cred(); 261 const struct cred *cred = current_cred();
262 int retval = 1; 262 int retval = 1;
263 263
264 if (grp != cred->fsgid) 264 if (!gid_eq(grp, cred->fsgid))
265 retval = groups_search(cred->group_info, grp); 265 retval = groups_search(cred->group_info, grp);
266 return retval; 266 return retval;
267} 267}
268 268
269EXPORT_SYMBOL(in_group_p); 269EXPORT_SYMBOL(in_group_p);
270 270
271int in_egroup_p(gid_t grp) 271int in_egroup_p(kgid_t grp)
272{ 272{
273 const struct cred *cred = current_cred(); 273 const struct cred *cred = current_cred();
274 int retval = 1; 274 int retval = 1;
275 275
276 if (grp != cred->egid) 276 if (!gid_eq(grp, cred->egid))
277 retval = groups_search(cred->group_info, grp); 277 retval = groups_search(cred->group_info, grp);
278 return retval; 278 return retval;
279} 279}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c21449f85a2a..6df614912b9d 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
108 108
109 touch_nmi_watchdog(); 109 touch_nmi_watchdog();
110 110
111 if (sysctl_hung_task_panic) 111 if (sysctl_hung_task_panic) {
112 trigger_all_cpu_backtrace();
112 panic("hung_task: blocked tasks"); 113 panic("hung_task: blocked tasks");
114 }
113} 115}
114 116
115/* 117/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6080f6bc8c33..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
275 kstat_incr_irqs_this_cpu(irq, desc); 275 kstat_incr_irqs_this_cpu(irq, desc);
276 276
277 action = desc->action; 277 action = desc->action;
278 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) 278 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
279 desc->istate |= IRQS_PENDING;
279 goto out_unlock; 280 goto out_unlock;
281 }
280 282
281 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 283 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
282 raw_spin_unlock_irq(&desc->lock); 284 raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
324 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 326 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
325 kstat_incr_irqs_this_cpu(irq, desc); 327 kstat_incr_irqs_this_cpu(irq, desc);
326 328
327 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) 329 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
330 desc->istate |= IRQS_PENDING;
328 goto out_unlock; 331 goto out_unlock;
332 }
329 333
330 handle_irq_event(desc); 334 handle_irq_event(desc);
331 335
@@ -379,8 +383,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
379 * If its disabled or no action available 383 * If its disabled or no action available
380 * keep it masked and get out of here 384 * keep it masked and get out of here
381 */ 385 */
382 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) 386 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
387 desc->istate |= IRQS_PENDING;
383 goto out_unlock; 388 goto out_unlock;
389 }
384 390
385 handle_irq_event(desc); 391 handle_irq_event(desc);
386 392
@@ -518,6 +524,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
518out_unlock: 524out_unlock:
519 raw_spin_unlock(&desc->lock); 525 raw_spin_unlock(&desc->lock);
520} 526}
527EXPORT_SYMBOL(handle_edge_irq);
521 528
522#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER 529#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
523/** 530/**
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
101 101
102extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
103 103
104extern int irq_do_set_affinity(struct irq_data *data,
105 const struct cpumask *dest, bool force);
106
104/* Inline functions for support of irq chips on slow busses */ 107/* Inline functions for support of irq chips on slow busses */
105static inline void chip_bus_lock(struct irq_desc *desc) 108static inline void chip_bus_lock(struct irq_desc *desc)
106{ 109{
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index d86e254b95eb..192a302d6cfd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
112{ 112{
113 return radix_tree_lookup(&irq_desc_tree, irq); 113 return radix_tree_lookup(&irq_desc_tree, irq);
114} 114}
115EXPORT_SYMBOL(irq_to_desc);
115 116
116static void delete_irq_desc(unsigned int irq) 117static void delete_irq_desc(unsigned int irq)
117{ 118{
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 0e0ba5f840b2..41c1564103f1 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) "irq: " fmt
2
1#include <linux/debugfs.h> 3#include <linux/debugfs.h>
2#include <linux/hardirq.h> 4#include <linux/hardirq.h>
3#include <linux/interrupt.h> 5#include <linux/interrupt.h>
@@ -56,14 +58,73 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
56 return domain; 58 return domain;
57} 59}
58 60
61static void irq_domain_free(struct irq_domain *domain)
62{
63 of_node_put(domain->of_node);
64 kfree(domain);
65}
66
59static void irq_domain_add(struct irq_domain *domain) 67static void irq_domain_add(struct irq_domain *domain)
60{ 68{
61 mutex_lock(&irq_domain_mutex); 69 mutex_lock(&irq_domain_mutex);
62 list_add(&domain->link, &irq_domain_list); 70 list_add(&domain->link, &irq_domain_list);
63 mutex_unlock(&irq_domain_mutex); 71 mutex_unlock(&irq_domain_mutex);
64 pr_debug("irq: Allocated domain of type %d @0x%p\n", 72 pr_debug("Allocated domain of type %d @0x%p\n",
73 domain->revmap_type, domain);
74}
75
76/**
77 * irq_domain_remove() - Remove an irq domain.
78 * @domain: domain to remove
79 *
80 * This routine is used to remove an irq domain. The caller must ensure
81 * that all mappings within the domain have been disposed of prior to
82 * use, depending on the revmap type.
83 */
84void irq_domain_remove(struct irq_domain *domain)
85{
86 mutex_lock(&irq_domain_mutex);
87
88 switch (domain->revmap_type) {
89 case IRQ_DOMAIN_MAP_LEGACY:
90 /*
91 * Legacy domains don't manage their own irq_desc
92 * allocations, we expect the caller to handle irq_desc
93 * freeing on their own.
94 */
95 break;
96 case IRQ_DOMAIN_MAP_TREE:
97 /*
98 * radix_tree_delete() takes care of destroying the root
99 * node when all entries are removed. Shout if there are
100 * any mappings left.
101 */
102 WARN_ON(domain->revmap_data.tree.height);
103 break;
104 case IRQ_DOMAIN_MAP_LINEAR:
105 kfree(domain->revmap_data.linear.revmap);
106 domain->revmap_data.linear.size = 0;
107 break;
108 case IRQ_DOMAIN_MAP_NOMAP:
109 break;
110 }
111
112 list_del(&domain->link);
113
114 /*
115 * If the going away domain is the default one, reset it.
116 */
117 if (unlikely(irq_default_domain == domain))
118 irq_set_default_host(NULL);
119
120 mutex_unlock(&irq_domain_mutex);
121
122 pr_debug("Removed domain of type %d @0x%p\n",
65 domain->revmap_type, domain); 123 domain->revmap_type, domain);
124
125 irq_domain_free(domain);
66} 126}
127EXPORT_SYMBOL_GPL(irq_domain_remove);
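With irq_domain_remove() now available and the constructors exported below, a modular interrupt-controller driver can pair domain creation with teardown. A hedged sketch of that lifecycle; the ops structure, domain size, node pointer and function names are placeholders, not taken from this patch:

	/* Sketch only: pairing a linear domain with irq_domain_remove(). */
	static struct irq_domain *my_domain;	/* hypothetical driver state */

	static int my_probe(struct device_node *node, void *priv)
	{
		my_domain = irq_domain_add_linear(node, 32, &my_domain_ops, priv);
		return my_domain ? 0 : -ENOMEM;
	}

	static void my_teardown(void)
	{
		/* all mappings must already be disposed of, as the kerneldoc above notes */
		irq_domain_remove(my_domain);
	}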
67 128
68static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, 129static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
69 irq_hw_number_t hwirq) 130 irq_hw_number_t hwirq)
@@ -117,8 +178,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
117 178
118 if (WARN_ON(!irq_data || irq_data->domain)) { 179 if (WARN_ON(!irq_data || irq_data->domain)) {
119 mutex_unlock(&irq_domain_mutex); 180 mutex_unlock(&irq_domain_mutex);
120 of_node_put(domain->of_node); 181 irq_domain_free(domain);
121 kfree(domain);
122 return NULL; 182 return NULL;
123 } 183 }
124 } 184 }
@@ -152,10 +212,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
152 irq_domain_add(domain); 212 irq_domain_add(domain);
153 return domain; 213 return domain;
154} 214}
215EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
155 216
156/** 217/**
157 * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. 218 * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain.
158 * @of_node: pointer to interrupt controller's device tree node. 219 * @of_node: pointer to interrupt controller's device tree node.
220 * @size: Number of interrupts in the domain.
159 * @ops: map/unmap domain callbacks 221 * @ops: map/unmap domain callbacks
160 * @host_data: Controller private data pointer 222 * @host_data: Controller private data pointer
161 */ 223 */
@@ -181,6 +243,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
181 irq_domain_add(domain); 243 irq_domain_add(domain);
182 return domain; 244 return domain;
183} 245}
246EXPORT_SYMBOL_GPL(irq_domain_add_linear);
184 247
185struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, 248struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
186 unsigned int max_irq, 249 unsigned int max_irq,
@@ -195,6 +258,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
195 } 258 }
196 return domain; 259 return domain;
197} 260}
261EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
198 262
199/** 263/**
200 * irq_domain_add_tree() 264 * irq_domain_add_tree()
@@ -216,6 +280,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
216 } 280 }
217 return domain; 281 return domain;
218} 282}
283EXPORT_SYMBOL_GPL(irq_domain_add_tree);
219 284
220/** 285/**
221 * irq_find_host() - Locates a domain for a given device node 286 * irq_find_host() - Locates a domain for a given device node
@@ -259,10 +324,11 @@ EXPORT_SYMBOL_GPL(irq_find_host);
259 */ 324 */
260void irq_set_default_host(struct irq_domain *domain) 325void irq_set_default_host(struct irq_domain *domain)
261{ 326{
262 pr_debug("irq: Default domain set to @0x%p\n", domain); 327 pr_debug("Default domain set to @0x%p\n", domain);
263 328
264 irq_default_domain = domain; 329 irq_default_domain = domain;
265} 330}
331EXPORT_SYMBOL_GPL(irq_set_default_host);
266 332
267static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, 333static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
268 irq_hw_number_t hwirq) 334 irq_hw_number_t hwirq)
@@ -272,7 +338,7 @@ static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
272 irq_data->hwirq = hwirq; 338 irq_data->hwirq = hwirq;
273 irq_data->domain = domain; 339 irq_data->domain = domain;
274 if (domain->ops->map(domain, virq, hwirq)) { 340 if (domain->ops->map(domain, virq, hwirq)) {
275 pr_debug("irq: -> mapping failed, freeing\n"); 341 pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq);
276 irq_data->domain = NULL; 342 irq_data->domain = NULL;
277 irq_data->hwirq = 0; 343 irq_data->hwirq = 0;
278 return -1; 344 return -1;
@@ -303,7 +369,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
303 369
304 virq = irq_alloc_desc_from(1, 0); 370 virq = irq_alloc_desc_from(1, 0);
305 if (!virq) { 371 if (!virq) {
306 pr_debug("irq: create_direct virq allocation failed\n"); 372 pr_debug("create_direct virq allocation failed\n");
307 return 0; 373 return 0;
308 } 374 }
309 if (virq >= domain->revmap_data.nomap.max_irq) { 375 if (virq >= domain->revmap_data.nomap.max_irq) {
@@ -312,7 +378,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
312 irq_free_desc(virq); 378 irq_free_desc(virq);
313 return 0; 379 return 0;
314 } 380 }
315 pr_debug("irq: create_direct obtained virq %d\n", virq); 381 pr_debug("create_direct obtained virq %d\n", virq);
316 382
317 if (irq_setup_virq(domain, virq, virq)) { 383 if (irq_setup_virq(domain, virq, virq)) {
318 irq_free_desc(virq); 384 irq_free_desc(virq);
@@ -321,6 +387,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
321 387
322 return virq; 388 return virq;
323} 389}
390EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
324 391
325/** 392/**
326 * irq_create_mapping() - Map a hardware interrupt into linux irq space 393 * irq_create_mapping() - Map a hardware interrupt into linux irq space
@@ -338,23 +405,23 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
338 unsigned int hint; 405 unsigned int hint;
339 int virq; 406 int virq;
340 407
341 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 408 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
342 409
 343 /* Look for default domain if necessary */ 410 /* Look for default domain if necessary */
344 if (domain == NULL) 411 if (domain == NULL)
345 domain = irq_default_domain; 412 domain = irq_default_domain;
346 if (domain == NULL) { 413 if (domain == NULL) {
347 printk(KERN_WARNING "irq_create_mapping called for" 414 pr_warning("irq_create_mapping called for"
348 " NULL domain, hwirq=%lx\n", hwirq); 415 " NULL domain, hwirq=%lx\n", hwirq);
349 WARN_ON(1); 416 WARN_ON(1);
350 return 0; 417 return 0;
351 } 418 }
352 pr_debug("irq: -> using domain @%p\n", domain); 419 pr_debug("-> using domain @%p\n", domain);
353 420
354 /* Check if mapping already exists */ 421 /* Check if mapping already exists */
355 virq = irq_find_mapping(domain, hwirq); 422 virq = irq_find_mapping(domain, hwirq);
356 if (virq) { 423 if (virq) {
357 pr_debug("irq: -> existing mapping on virq %d\n", virq); 424 pr_debug("-> existing mapping on virq %d\n", virq);
358 return virq; 425 return virq;
359 } 426 }
360 427
@@ -370,7 +437,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
370 if (virq <= 0) 437 if (virq <= 0)
371 virq = irq_alloc_desc_from(1, 0); 438 virq = irq_alloc_desc_from(1, 0);
372 if (virq <= 0) { 439 if (virq <= 0) {
373 pr_debug("irq: -> virq allocation failed\n"); 440 pr_debug("-> virq allocation failed\n");
374 return 0; 441 return 0;
375 } 442 }
376 443
@@ -380,7 +447,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
380 return 0; 447 return 0;
381 } 448 }
382 449
383 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", 450 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
384 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); 451 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
385 452
386 return virq; 453 return virq;
@@ -409,8 +476,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
409 if (intsize > 0) 476 if (intsize > 0)
410 return intspec[0]; 477 return intspec[0];
411#endif 478#endif
412 printk(KERN_WARNING "irq: no irq domain found for %s !\n", 479 pr_warning("no irq domain found for %s !\n",
413 controller->full_name); 480 controller->full_name);
414 return 0; 481 return 0;
415 } 482 }
416 483
@@ -560,6 +627,7 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
560 */ 627 */
561 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); 628 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
562} 629}
630EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup);
563 631
564/** 632/**
565 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. 633 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
@@ -584,6 +652,7 @@ void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
584 mutex_unlock(&revmap_trees_mutex); 652 mutex_unlock(&revmap_trees_mutex);
585 } 653 }
586} 654}
655EXPORT_SYMBOL_GPL(irq_radix_revmap_insert);
587 656
588/** 657/**
589 * irq_linear_revmap() - Find a linux irq from a hw irq number. 658 * irq_linear_revmap() - Find a linux irq from a hw irq number.
@@ -617,6 +686,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain,
617 686
618 return revmap[hwirq]; 687 return revmap[hwirq];
619} 688}
689EXPORT_SYMBOL_GPL(irq_linear_revmap);
620 690
621#ifdef CONFIG_IRQ_DOMAIN_DEBUG 691#ifdef CONFIG_IRQ_DOMAIN_DEBUG
622static int virq_debug_show(struct seq_file *m, void *private) 692static int virq_debug_show(struct seq_file *m, void *private)
@@ -691,8 +761,8 @@ static int __init irq_debugfs_init(void)
691__initcall(irq_debugfs_init); 761__initcall(irq_debugfs_init);
692#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ 762#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
693 763
694int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, 764static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
695 irq_hw_number_t hwirq) 765 irq_hw_number_t hwirq)
696{ 766{
697 return 0; 767 return 0;
698} 768}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 89a3ea82569b..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -7,6 +7,8 @@
7 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
8 */ 8 */
9 9
10#define pr_fmt(fmt) "genirq: " fmt
11
10#include <linux/irq.h> 12#include <linux/irq.h>
11#include <linux/kthread.h> 13#include <linux/kthread.h>
12#include <linux/module.h> 14#include <linux/module.h>
@@ -14,6 +16,7 @@
14#include <linux/interrupt.h> 16#include <linux/interrupt.h>
15#include <linux/slab.h> 17#include <linux/slab.h>
16#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/task_work.h>
17 20
18#include "internals.h" 21#include "internals.h"
19 22
@@ -139,6 +142,25 @@ static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } 142irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif 143#endif
141 144
145int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
146 bool force)
147{
148 struct irq_desc *desc = irq_data_to_desc(data);
149 struct irq_chip *chip = irq_data_get_irq_chip(data);
150 int ret;
151
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160
161 return ret;
162}
163
142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) 164int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
143{ 165{
144 struct irq_chip *chip = irq_data_get_irq_chip(data); 166 struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -149,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
149 return -EINVAL; 171 return -EINVAL;
150 172
151 if (irq_can_move_pcntxt(data)) { 173 if (irq_can_move_pcntxt(data)) {
152 ret = chip->irq_set_affinity(data, mask, false); 174 ret = irq_do_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160 } else { 175 } else {
161 irqd_set_move_pending(data); 176 irqd_set_move_pending(data);
162 irq_copy_pending(desc, mask); 177 irq_copy_pending(desc, mask);
@@ -280,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
280static int 295static int
281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) 296setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
282{ 297{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity; 298 struct cpumask *set = irq_default_affinity;
285 int ret, node = desc->irq_data.node; 299 int node = desc->irq_data.node;
286 300
287 /* Excludes PER_CPU and NO_BALANCE interrupts */ 301 /* Excludes PER_CPU and NO_BALANCE interrupts */
288 if (!irq_can_set_affinity(irq)) 302 if (!irq_can_set_affinity(irq))
@@ -308,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
308 if (cpumask_intersects(mask, nodemask)) 322 if (cpumask_intersects(mask, nodemask))
309 cpumask_and(mask, mask, nodemask); 323 cpumask_and(mask, mask, nodemask);
310 } 324 }
311 ret = chip->irq_set_affinity(&desc->irq_data, mask, false); 325 irq_do_set_affinity(&desc->irq_data, mask, false);
312 switch (ret) {
313 case IRQ_SET_MASK_OK:
314 cpumask_copy(desc->irq_data.affinity, mask);
315 case IRQ_SET_MASK_OK_NOCOPY:
316 irq_set_thread_affinity(desc);
317 }
318 return 0; 326 return 0;
319} 327}
320#else 328#else
@@ -566,7 +574,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
566 * flow-types? 574 * flow-types?
567 */ 575 */
568 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 576 pr_debug("No set_type function for IRQ %d (%s)\n", irq,
569 chip ? (chip->name ? : "unknown") : "unknown"); 577 chip ? (chip->name ? : "unknown") : "unknown");
570 return 0; 578 return 0;
571 } 579 }
572 580
@@ -600,7 +608,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
600 ret = 0; 608 ret = 0;
601 break; 609 break;
602 default: 610 default:
603 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 611 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
604 flags, irq, chip->irq_set_type); 612 flags, irq, chip->irq_set_type);
605 } 613 }
606 if (unmask) 614 if (unmask)
@@ -773,11 +781,39 @@ static void wake_threads_waitq(struct irq_desc *desc)
773 wake_up(&desc->wait_for_threads); 781 wake_up(&desc->wait_for_threads);
774} 782}
775 783
784static void irq_thread_dtor(struct task_work *unused)
785{
786 struct task_struct *tsk = current;
787 struct irq_desc *desc;
788 struct irqaction *action;
789
790 if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
791 return;
792
793 action = kthread_data(tsk);
794
795 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
796 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
797
798
799 desc = irq_to_desc(action->irq);
800 /*
801 * If IRQTF_RUNTHREAD is set, we need to decrement
802 * desc->threads_active and wake possible waiters.
803 */
804 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
805 wake_threads_waitq(desc);
806
807 /* Prevent a stale desc->threads_oneshot */
808 irq_finalize_oneshot(desc, action);
809}
810
776/* 811/*
777 * Interrupt handler thread 812 * Interrupt handler thread
778 */ 813 */
779static int irq_thread(void *data) 814static int irq_thread(void *data)
780{ 815{
816 struct task_work on_exit_work;
781 static const struct sched_param param = { 817 static const struct sched_param param = {
782 .sched_priority = MAX_USER_RT_PRIO/2, 818 .sched_priority = MAX_USER_RT_PRIO/2,
783 }; 819 };
@@ -793,7 +829,9 @@ static int irq_thread(void *data)
793 handler_fn = irq_thread_fn; 829 handler_fn = irq_thread_fn;
794 830
795 sched_setscheduler(current, SCHED_FIFO, &param); 831 sched_setscheduler(current, SCHED_FIFO, &param);
796 current->irq_thread = 1; 832
833 init_task_work(&on_exit_work, irq_thread_dtor, NULL);
834 task_work_add(current, &on_exit_work, false);
797 835
798 while (!irq_wait_for_interrupt(action)) { 836 while (!irq_wait_for_interrupt(action)) {
799 irqreturn_t action_ret; 837 irqreturn_t action_ret;
@@ -815,45 +853,11 @@ static int irq_thread(void *data)
815 * cannot touch the oneshot mask at this point anymore as 853 * cannot touch the oneshot mask at this point anymore as
 816 * __setup_irq() might have given out current's thread_mask 854 * __setup_irq() might have given out current's thread_mask
817 * again. 855 * again.
818 *
819 * Clear irq_thread. Otherwise exit_irq_thread() would make
820 * fuzz about an active irq thread going into nirvana.
821 */ 856 */
822 current->irq_thread = 0; 857 task_work_cancel(current, irq_thread_dtor);
823 return 0; 858 return 0;
824} 859}
825 860
826/*
827 * Called from do_exit()
828 */
829void exit_irq_thread(void)
830{
831 struct task_struct *tsk = current;
832 struct irq_desc *desc;
833 struct irqaction *action;
834
835 if (!tsk->irq_thread)
836 return;
837
838 action = kthread_data(tsk);
839
840 printk(KERN_ERR
841 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
842 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
843
844 desc = irq_to_desc(action->irq);
845
846 /*
847 * If IRQTF_RUNTHREAD is set, we need to decrement
848 * desc->threads_active and wake possible waiters.
849 */
850 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
851 wake_threads_waitq(desc);
852
853 /* Prevent a stale desc->threads_oneshot */
854 irq_finalize_oneshot(desc, action);
855}
856
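The removal above is the point of the conversion: instead of a dedicated exit_irq_thread() hook wired into do_exit(), the irq thread now registers a per-task callback through the new task_work API. A condensed sketch of that pattern, using only the calls visible in this hunk (my_dtor/my_thread are illustrative names, and the three-argument init_task_work() matches the 3.5-era API shown here):

	static void my_dtor(struct task_work *twork)
	{
		/* runs from exit_task_work() if the registering task exits early */
	}

	static int my_thread(void *data)
	{
		struct task_work on_exit;

		init_task_work(&on_exit, my_dtor, NULL);
		task_work_add(current, &on_exit, false);

		/* ... normal thread loop ... */

		task_work_cancel(current, my_dtor);	/* exiting normally: drop the callback */
		return 0;
	}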
857static void irq_setup_forced_threading(struct irqaction *new) 861static void irq_setup_forced_threading(struct irqaction *new)
858{ 862{
859 if (!force_irqthreads) 863 if (!force_irqthreads)
@@ -878,7 +882,6 @@ static int
878__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 882__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
879{ 883{
880 struct irqaction *old, **old_ptr; 884 struct irqaction *old, **old_ptr;
881 const char *old_name = NULL;
882 unsigned long flags, thread_mask = 0; 885 unsigned long flags, thread_mask = 0;
883 int ret, nested, shared = 0; 886 int ret, nested, shared = 0;
884 cpumask_var_t mask; 887 cpumask_var_t mask;
@@ -972,10 +975,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
972 */ 975 */
973 if (!((old->flags & new->flags) & IRQF_SHARED) || 976 if (!((old->flags & new->flags) & IRQF_SHARED) ||
974 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 977 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
975 ((old->flags ^ new->flags) & IRQF_ONESHOT)) { 978 ((old->flags ^ new->flags) & IRQF_ONESHOT))
976 old_name = old->name;
977 goto mismatch; 979 goto mismatch;
978 }
979 980
980 /* All handlers must agree on per-cpuness */ 981 /* All handlers must agree on per-cpuness */
981 if ((old->flags & IRQF_PERCPU) != 982 if ((old->flags & IRQF_PERCPU) !=
@@ -1031,6 +1032,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1031 * all existing action->thread_mask bits. 1032 * all existing action->thread_mask bits.
1032 */ 1033 */
1033 new->thread_mask = 1 << ffz(thread_mask); 1034 new->thread_mask = 1 << ffz(thread_mask);
1035
1036 } else if (new->handler == irq_default_primary_handler) {
1037 /*
1038 * The interrupt was requested with handler = NULL, so
1039 * we use the default primary handler for it. But it
1040 * does not have the oneshot flag set. In combination
1041 * with level interrupts this is deadly, because the
1042 * default primary handler just wakes the thread, then
1043 * the irq lines is reenabled, but the device still
1044 * has the level irq asserted. Rinse and repeat....
1045 *
1046 * While this works for edge type interrupts, we play
1047 * it safe and reject unconditionally because we can't
1048 * say for sure which type this interrupt really
1049 * has. The type flags are unreliable as the
1050 * underlying chip implementation can override them.
1051 */
1052 pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1053 irq);
1054 ret = -EINVAL;
1055 goto out_mask;
1034 } 1056 }
1035 1057
1036 if (!shared) { 1058 if (!shared) {
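The new check above rejects the combination the comment describes: a threaded interrupt that relies on the default primary handler must now request IRQF_ONESHOT explicitly, so the line stays masked until the thread has run. A hedged example of the call this expects (irq number, thread function, device name and cookie are made up):

	/* Illustrative request: a NULL primary handler now requires IRQF_ONESHOT. */
	int ret = request_threaded_irq(irq, NULL, my_thread_fn,
				       IRQF_ONESHOT, "mydev", dev);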
@@ -1078,7 +1100,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1078 1100
1079 if (nmsk != omsk) 1101 if (nmsk != omsk)
1080 /* hope the handler works with current trigger mode */ 1102 /* hope the handler works with current trigger mode */
1081 pr_warning("IRQ %d uses trigger mode %u; requested %u\n", 1103 pr_warning("irq %d uses trigger mode %u; requested %u\n",
1082 irq, nmsk, omsk); 1104 irq, nmsk, omsk);
1083 } 1105 }
1084 1106
@@ -1115,14 +1137,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1115 return 0; 1137 return 0;
1116 1138
1117mismatch: 1139mismatch:
1118#ifdef CONFIG_DEBUG_SHIRQ
1119 if (!(new->flags & IRQF_PROBE_SHARED)) { 1140 if (!(new->flags & IRQF_PROBE_SHARED)) {
1120 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 1141 pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
1121 if (old_name) 1142 irq, new->flags, new->name, old->flags, old->name);
1122 printk(KERN_ERR "current handler: %s\n", old_name); 1143#ifdef CONFIG_DEBUG_SHIRQ
1123 dump_stack(); 1144 dump_stack();
1124 }
1125#endif 1145#endif
1146 }
1126 ret = -EBUSY; 1147 ret = -EBUSY;
1127 1148
1128out_mask: 1149out_mask:
@@ -1204,12 +1225,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1204 /* Found it - now remove it from the list of entries: */ 1225 /* Found it - now remove it from the list of entries: */
1205 *action_ptr = action->next; 1226 *action_ptr = action->next;
1206 1227
1207 /* Currently used only by UML, might disappear one day: */
1208#ifdef CONFIG_IRQ_RELEASE_METHOD
1209 if (desc->irq_data.chip->release)
1210 desc->irq_data.chip->release(irq, dev_id);
1211#endif
1212
1213 /* If this was the last handler, shut down the IRQ line: */ 1228 /* If this was the last handler, shut down the IRQ line: */
1214 if (!desc->action) 1229 if (!desc->action)
1215 irq_shutdown(desc); 1230 irq_shutdown(desc);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
42 * For correct operation this depends on the caller 42 * For correct operation this depends on the caller
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
46 < nr_cpu_ids)) { 46 irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
47 int ret = chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false);
49 switch (ret) {
50 case IRQ_SET_MASK_OK:
51 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
52 case IRQ_SET_MASK_OK_NOCOPY:
53 irq_set_thread_affinity(desc);
54 }
55 }
56 47
57 cpumask_clear(desc->pending_mask); 48 cpumask_clear(desc->pending_mask);
58} 49}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a6..cb228bf21760 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void)
103 int irq; 103 int irq;
104 104
105 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
106 if (irqd_is_wakeup_set(&desc->irq_data)) { 111 if (irqd_is_wakeup_set(&desc->irq_data)) {
107 if (desc->istate & IRQS_PENDING) 112 if (desc->depth == 1 && desc->istate & IRQS_PENDING)
108 return -EBUSY; 113 return -EBUSY;
109 continue; 114 continue;
110 } 115 }
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c9..6454db7b6a4d 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
58 /* 58 /*
59 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
61 * active. 61 * active. Clear the pending bit so suspend/resume does not
62 * get confused.
62 */ 63 */
63 if (irq_settings_is_level(desc)) 64 if (irq_settings_is_level(desc)) {
65 desc->istate &= ~IRQS_PENDING;
64 return; 66 return;
67 }
65 if (desc->istate & IRQS_REPLAY) 68 if (desc->istate & IRQS_REPLAY)
66 return; 69 return;
67 if (desc->istate & IRQS_PENDING) { 70 if (desc->istate & IRQS_PENDING) {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 079f1d39a8b8..2169feeba529 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345static int __sprint_symbol(char *buffer, unsigned long address, 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset) 346 int symbol_offset, int add_offset)
347{ 347{
348 char *modname; 348 char *modname;
349 const char *name; 349 const char *name;
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address,
358 if (name != buffer) 358 if (name != buffer)
359 strcpy(buffer, name); 359 strcpy(buffer, name);
360 len = strlen(buffer); 360 len = strlen(buffer);
361 buffer += len;
362 offset -= symbol_offset; 361 offset -= symbol_offset;
363 362
363 if (add_offset)
364 len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
365
364 if (modname) 366 if (modname)
365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); 367 len += sprintf(buffer + len, " [%s]", modname);
366 else
367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
368 368
369 return len; 369 return len;
370} 370}
@@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
382 */ 382 */
383int sprint_symbol(char *buffer, unsigned long address) 383int sprint_symbol(char *buffer, unsigned long address)
384{ 384{
385 return __sprint_symbol(buffer, address, 0); 385 return __sprint_symbol(buffer, address, 0, 1);
386} 386}
387
388EXPORT_SYMBOL_GPL(sprint_symbol); 387EXPORT_SYMBOL_GPL(sprint_symbol);
389 388
390/** 389/**
390 * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
391 * @buffer: buffer to be stored
392 * @address: address to lookup
393 *
394 * This function looks up a kernel symbol with @address and stores its name
395 * and module name to @buffer if possible. If no symbol was found, just saves
396 * its @address as is.
397 *
398 * This function returns the number of bytes stored in @buffer.
399 */
400int sprint_symbol_no_offset(char *buffer, unsigned long address)
401{
402 return __sprint_symbol(buffer, address, 0, 0);
403}
404EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
405
406/**
391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer 407 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
392 * @buffer: buffer to be stored 408 * @buffer: buffer to be stored
393 * @address: address to lookup 409 * @address: address to lookup
@@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
403 */ 419 */
404int sprint_backtrace(char *buffer, unsigned long address) 420int sprint_backtrace(char *buffer, unsigned long address)
405{ 421{
406 return __sprint_symbol(buffer, address, -1); 422 return __sprint_symbol(buffer, address, -1, 1);
407} 423}
408 424
409/* Look up a kernel symbol and print it to the kernel messages. */ 425/* Look up a kernel symbol and print it to the kernel messages. */
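
To illustrate what the new add_offset flag changes, here is a small standalone sketch (not part of the diff) that mirrors the formatting logic of __sprint_symbol(): sprint_symbol() keeps the "+offset/size" suffix, while the new sprint_symbol_no_offset() drops it. The symbol name, offset, size and module name below are made up for the example.

#include <stdio.h>

/* Mirrors the tail of __sprint_symbol(): name, optional +offset/size, module. */
static int format_symbol(char *buffer, const char *name, unsigned long offset,
			 unsigned long size, const char *modname, int add_offset)
{
	int len = sprintf(buffer, "%s", name);

	if (add_offset)
		len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
	if (modname)
		len += sprintf(buffer + len, " [%s]", modname);
	return len;
}

int main(void)
{
	char buf[128];

	format_symbol(buf, "example_func", 0x10, 0x5c0, "example_mod", 1);
	printf("with offset:    %s\n", buf);  /* example_func+0x10/0x5c0 [example_mod] */

	format_symbol(buf, "example_func", 0x10, 0x5c0, "example_mod", 0);
	printf("without offset: %s\n", buf);  /* example_func [example_mod] */
	return 0;
}
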
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 000000000000..30b7b225306c
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,196 @@
1#include <linux/kernel.h>
2#include <linux/syscalls.h>
3#include <linux/fdtable.h>
4#include <linux/string.h>
5#include <linux/random.h>
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/cache.h>
10#include <linux/bug.h>
11#include <linux/err.h>
12#include <linux/kcmp.h>
13
14#include <asm/unistd.h>
15
16/*
17 * We don't expose the real in-memory order of objects for security reasons.
 18 * Still, the comparison results should be suitable for sorting. So we
 19 * obfuscate kernel pointer values and compare the products instead.
20 *
21 * The obfuscation is done in two steps. First we xor the kernel pointer with
22 * a random value, which puts pointer into a new position in a reordered space.
 23 * Secondly we multiply the xor product by a large odd random number to
 24 * permute its bits even more (the odd multiplier guarantees that the product
 25 * is unique even after the high bits are truncated, since any odd number is
 26 * relatively prime to 2^n).
27 *
28 * Note also that the obfuscation itself is invisible to userspace and if needed
29 * it can be changed to an alternate scheme.
30 */
31static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
32
33static long kptr_obfuscate(long v, int type)
34{
35 return (v ^ cookies[type][0]) * cookies[type][1];
36}
37
38/*
39 * 0 - equal, i.e. v1 = v2
40 * 1 - less than, i.e. v1 < v2
41 * 2 - greater than, i.e. v1 > v2
42 * 3 - not equal but ordering unavailable (reserved for future)
43 */
44static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
45{
46 long ret;
47
48 ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
49
50 return (ret < 0) | ((ret > 0) << 1);
51}
52
53/* The caller must have pinned the task */
54static struct file *
55get_file_raw_ptr(struct task_struct *task, unsigned int idx)
56{
57 struct file *file = NULL;
58
59 task_lock(task);
60 rcu_read_lock();
61
62 if (task->files)
63 file = fcheck_files(task->files, idx);
64
65 rcu_read_unlock();
66 task_unlock(task);
67
68 return file;
69}
70
71static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
72{
73 if (likely(m2 != m1))
74 mutex_unlock(m2);
75 mutex_unlock(m1);
76}
77
78static int kcmp_lock(struct mutex *m1, struct mutex *m2)
79{
80 int err;
81
82 if (m2 > m1)
83 swap(m1, m2);
84
85 err = mutex_lock_killable(m1);
86 if (!err && likely(m1 != m2)) {
87 err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
88 if (err)
89 mutex_unlock(m1);
90 }
91
92 return err;
93}
94
95SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
96 unsigned long, idx1, unsigned long, idx2)
97{
98 struct task_struct *task1, *task2;
99 int ret;
100
101 rcu_read_lock();
102
103 /*
104 * Tasks are looked up in caller's PID namespace only.
105 */
106 task1 = find_task_by_vpid(pid1);
107 task2 = find_task_by_vpid(pid2);
108 if (!task1 || !task2)
109 goto err_no_task;
110
111 get_task_struct(task1);
112 get_task_struct(task2);
113
114 rcu_read_unlock();
115
116 /*
117 * One should have enough rights to inspect task details.
118 */
119 ret = kcmp_lock(&task1->signal->cred_guard_mutex,
120 &task2->signal->cred_guard_mutex);
121 if (ret)
122 goto err;
123 if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
124 !ptrace_may_access(task2, PTRACE_MODE_READ)) {
125 ret = -EPERM;
126 goto err_unlock;
127 }
128
129 switch (type) {
130 case KCMP_FILE: {
131 struct file *filp1, *filp2;
132
133 filp1 = get_file_raw_ptr(task1, idx1);
134 filp2 = get_file_raw_ptr(task2, idx2);
135
136 if (filp1 && filp2)
137 ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
138 else
139 ret = -EBADF;
140 break;
141 }
142 case KCMP_VM:
143 ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
144 break;
145 case KCMP_FILES:
146 ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
147 break;
148 case KCMP_FS:
149 ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
150 break;
151 case KCMP_SIGHAND:
152 ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
153 break;
154 case KCMP_IO:
155 ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
156 break;
157 case KCMP_SYSVSEM:
158#ifdef CONFIG_SYSVIPC
159 ret = kcmp_ptr(task1->sysvsem.undo_list,
160 task2->sysvsem.undo_list,
161 KCMP_SYSVSEM);
162#else
163 ret = -EOPNOTSUPP;
164#endif
165 break;
166 default:
167 ret = -EINVAL;
168 break;
169 }
170
171err_unlock:
172 kcmp_unlock(&task1->signal->cred_guard_mutex,
173 &task2->signal->cred_guard_mutex);
174err:
175 put_task_struct(task1);
176 put_task_struct(task2);
177
178 return ret;
179
180err_no_task:
181 rcu_read_unlock();
182 return -ESRCH;
183}
184
185static __init int kcmp_cookies_init(void)
186{
187 int i;
188
189 get_random_bytes(cookies, sizeof(cookies));
190
191 for (i = 0; i < KCMP_TYPES; i++)
192 cookies[i][1] |= (~(~0UL >> 1) | 1);
193
194 return 0;
195}
196arch_initcall(kcmp_cookies_init);
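
For readers unfamiliar with the obfuscation scheme described in the comment above, here is a small userspace sketch (not part of the diff) applying the same XOR-then-multiply-by-odd transformation before comparing. The cookie values are arbitrary stand-ins for the randomly generated ones; the point is that equal pointers still compare equal and the ordering is stable, while the real address order stays hidden.

#include <stdio.h>

/* Stand-ins for the randomly generated cookies; the multiplier must be odd. */
static unsigned long cookie[2] = { 0xdeadbeefUL, 0x9e3779b1UL };

static long obfuscate(long v)
{
	return (v ^ cookie[0]) * cookie[1];
}

/* Same result convention as kcmp_ptr(): 0 equal, 1 less than, 2 greater than. */
static int cmp(void *v1, void *v2)
{
	long ret = obfuscate((long)v1) - obfuscate((long)v2);

	return (ret < 0) | ((ret > 0) << 1);
}

int main(void)
{
	int a, b;

	printf("cmp(&a, &a) = %d\n", cmp(&a, &a));  /* 0: same object */
	printf("cmp(&a, &b) = %d\n", cmp(&a, &b));  /* 1 or 2: stable, but hides real order */
	return 0;
}
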
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index c744b88c44e2..59dcf5b81d24 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
402 return max; 402 return max;
403 return len; 403 return len;
404} 404}
405EXPORT_SYMBOL(__kfifo_max_r);
405 406
406#define __KFIFO_PEEK(data, out, mask) \ 407#define __KFIFO_PEEK(data, out, mask) \
407 ((data)[(out) & (mask)]) 408 ((data)[(out) & (mask)])
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 05698a7415fe..ff2c7cb86d77 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -221,13 +221,12 @@ fail:
221 return 0; 221 return 0;
222} 222}
223 223
224void call_usermodehelper_freeinfo(struct subprocess_info *info) 224static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 225{
226 if (info->cleanup) 226 if (info->cleanup)
227 (*info->cleanup)(info); 227 (*info->cleanup)(info);
228 kfree(info); 228 kfree(info);
229} 229}
230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
231 230
232static void umh_complete(struct subprocess_info *sub_info) 231static void umh_complete(struct subprocess_info *sub_info)
233{ 232{
@@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
410 409
411/** 410/**
412 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. 411 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
413 * depth: New value to assign to usermodehelper_disabled. 412 * @depth: New value to assign to usermodehelper_disabled.
414 * 413 *
415 * Change the value of usermodehelper_disabled (under umhelper_sem locked for 414 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
416 * writing) and wakeup tasks waiting for it to change. 415 * writing) and wakeup tasks waiting for it to change.
@@ -479,6 +478,7 @@ static void helper_unlock(void)
479 * structure. This should be passed to call_usermodehelper_exec to 478 * structure. This should be passed to call_usermodehelper_exec to
480 * exec the process and free the structure. 479 * exec the process and free the structure.
481 */ 480 */
481static
482struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 482struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
483 char **envp, gfp_t gfp_mask) 483 char **envp, gfp_t gfp_mask)
484{ 484{
@@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
494 out: 494 out:
495 return sub_info; 495 return sub_info;
496} 496}
497EXPORT_SYMBOL(call_usermodehelper_setup);
498 497
499/** 498/**
500 * call_usermodehelper_setfns - set a cleanup/init function 499 * call_usermodehelper_setfns - set a cleanup/init function
@@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
512 * Function must be runnable in either a process context or the 511 * Function must be runnable in either a process context or the
513 * context in which call_usermodehelper_exec is called. 512 * context in which call_usermodehelper_exec is called.
514 */ 513 */
514static
515void call_usermodehelper_setfns(struct subprocess_info *info, 515void call_usermodehelper_setfns(struct subprocess_info *info,
516 int (*init)(struct subprocess_info *info, struct cred *new), 516 int (*init)(struct subprocess_info *info, struct cred *new),
517 void (*cleanup)(struct subprocess_info *info), 517 void (*cleanup)(struct subprocess_info *info),
@@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
521 info->init = init; 521 info->init = init;
522 info->data = data; 522 info->data = data;
523} 523}
524EXPORT_SYMBOL(call_usermodehelper_setfns);
525 524
526/** 525/**
527 * call_usermodehelper_exec - start a usermode application 526 * call_usermodehelper_exec - start a usermode application
@@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
535 * asynchronously if wait is not set, and runs as a child of keventd. 534 * asynchronously if wait is not set, and runs as a child of keventd.
536 * (ie. it runs with full root capabilities). 535 * (ie. it runs with full root capabilities).
537 */ 536 */
537static
538int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 538int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
539{ 539{
540 DECLARE_COMPLETION_ONSTACK(done); 540 DECLARE_COMPLETION_ONSTACK(done);
@@ -576,7 +576,25 @@ unlock:
576 helper_unlock(); 576 helper_unlock();
577 return retval; 577 return retval;
578} 578}
579EXPORT_SYMBOL(call_usermodehelper_exec); 579
580int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new),
583 void (*cleanup)(struct subprocess_info *), void *data)
584{
585 struct subprocess_info *info;
586 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
587
588 info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
589
590 if (info == NULL)
591 return -ENOMEM;
592
593 call_usermodehelper_setfns(info, init, cleanup, data);
594
595 return call_usermodehelper_exec(info, wait);
596}
597EXPORT_SYMBOL(call_usermodehelper_fns);
580 598
581static int proc_cap_handler(struct ctl_table *table, int write, 599static int proc_cap_handler(struct ctl_table *table, int write,
582 void __user *buffer, size_t *lenp, loff_t *ppos) 600 void __user *buffer, size_t *lenp, loff_t *ppos)
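
With the setup/setfns/exec helpers now static, callers are expected to go through the single exported call_usermodehelper_fns() entry point. A minimal kernel-side sketch of such a caller, with a made-up helper path and no init/cleanup callbacks, might look like this (illustrative only, not taken from the patch):

#include <linux/kmod.h>

/* Made-up helper path and arguments; no init/cleanup callbacks are needed. */
static int run_example_helper(void)
{
	char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/* Sets up, configures and execs the helper in one exported call. */
	return call_usermodehelper_fns("/sbin/example-helper", argv, envp,
				       UMH_WAIT_PROC, NULL, NULL, NULL);
}
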
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644
index 000000000000..6535a667a5a7
--- /dev/null
+++ b/kernel/lglock.c
@@ -0,0 +1,89 @@
1/* See include/linux/lglock.h for description */
2#include <linux/module.h>
3#include <linux/lglock.h>
4#include <linux/cpu.h>
5#include <linux/string.h>
6
7/*
8 * Note there is no uninit, so lglocks cannot be defined in
9 * modules (but it's fine to use them from there)
10 * Could be added though, just undo lg_lock_init
11 */
12
13void lg_lock_init(struct lglock *lg, char *name)
14{
15 LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
16}
17EXPORT_SYMBOL(lg_lock_init);
18
19void lg_local_lock(struct lglock *lg)
20{
21 arch_spinlock_t *lock;
22
23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock);
27}
28EXPORT_SYMBOL(lg_local_lock);
29
30void lg_local_unlock(struct lglock *lg)
31{
32 arch_spinlock_t *lock;
33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock);
37 preempt_enable();
38}
39EXPORT_SYMBOL(lg_local_unlock);
40
41void lg_local_lock_cpu(struct lglock *lg, int cpu)
42{
43 arch_spinlock_t *lock;
44
45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock);
49}
50EXPORT_SYMBOL(lg_local_lock_cpu);
51
52void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{
54 arch_spinlock_t *lock;
55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock);
59 preempt_enable();
60}
61EXPORT_SYMBOL(lg_local_unlock_cpu);
62
63void lg_global_lock(struct lglock *lg)
64{
65 int i;
66
67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i);
72 arch_spin_lock(lock);
73 }
74}
75EXPORT_SYMBOL(lg_global_lock);
76
77void lg_global_unlock(struct lglock *lg)
78{
79 int i;
80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i);
85 arch_spin_unlock(lock);
86 }
87 preempt_enable();
88}
89EXPORT_SYMBOL(lg_global_unlock);
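
The intended usage pattern behind these primitives: per-CPU fast paths take only their own CPU's spinlock with lg_local_lock(), while an occasional global operation takes all of them with lg_global_lock(). A rough kernel-side sketch, assuming the DEFINE_LGLOCK() helper from include/linux/lglock.h and a made-up per-CPU counter (lg_lock_init() would still be called once at init time for lockdep):

#include <linux/cpu.h>
#include <linux/lglock.h>
#include <linux/percpu.h>

/* One spinlock per CPU behind a single lglock object (made-up names). */
static DEFINE_PER_CPU(long, example_counter);
DEFINE_LGLOCK(example_lglock);

static void example_fast_path(void)
{
	/* Common case: touch only this CPU's data, take only this CPU's lock. */
	lg_local_lock(&example_lglock);
	__this_cpu_inc(example_counter);
	lg_local_unlock(&example_lglock);
}

static long example_slow_path(void)
{
	long sum = 0;
	int cpu;

	/* Rare case: take every CPU's lock to get a consistent snapshot. */
	lg_global_lock(&example_lglock);
	for_each_possible_cpu(cpu)
		sum += per_cpu(example_counter, cpu);
	lg_global_unlock(&example_lglock);

	return sum;
}
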
diff --git a/kernel/module.c b/kernel/module.c
index 78ac6ec1e425..4edbd9c11aca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info,
2429 goto free_hdr; 2429 goto free_hdr;
2430 } 2430 }
2431 2431
2432 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { 2432 if (hdr->e_shoff >= len ||
2433 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) {
2433 err = -ENOEXEC; 2434 err = -ENOEXEC;
2434 goto free_hdr; 2435 goto free_hdr;
2435 } 2436 }
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod,
2953 2954
2954 /* Module is ready to execute: parsing args may do that. */ 2955 /* Module is ready to execute: parsing args may do that. */
2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 2956 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL); 2957 -32768, 32767, &ddebug_dyndbg_module_param_cb);
2957 if (err < 0) 2958 if (err < 0)
2958 goto unlink; 2959 goto unlink;
2959 2960
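
The rewritten section-header check avoids an integer overflow: with an attacker-controlled e_shoff, the old sum e_shoff + e_shnum * sizeof(Elf_Shdr) can wrap around and slip past the length test. A small standalone demonstration of the difference (illustrative values only, not from the patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t len = 4096;			/* size of the module image */
	uint64_t e_shoff = UINT64_MAX - 8;	/* malicious section header offset */
	uint64_t e_shnum = 1, shdr_size = 64;

	/* Old check: the sum wraps around to a small value and the test passes. */
	int old_ok = !(len < e_shoff + e_shnum * shdr_size);

	/* New check: reject e_shoff >= len first, then compare against the remainder. */
	int new_ok = !(e_shoff >= len || e_shnum * shdr_size > len - e_shoff);

	printf("old check accepts bogus header: %s\n", old_ok ? "yes" : "no");
	printf("new check accepts bogus header: %s\n", new_ok ? "yes" : "no");
	return 0;
}
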
diff --git a/kernel/params.c b/kernel/params.c
index f37d82631347..ed35345be536 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b)
85 85
86static int parse_one(char *param, 86static int parse_one(char *param,
87 char *val, 87 char *val,
88 const char *doing,
88 const struct kernel_param *params, 89 const struct kernel_param *params,
89 unsigned num_params, 90 unsigned num_params,
90 s16 min_level, 91 s16 min_level,
91 s16 max_level, 92 s16 max_level,
92 int (*handle_unknown)(char *param, char *val)) 93 int (*handle_unknown)(char *param, char *val,
94 const char *doing))
93{ 95{
94 unsigned int i; 96 unsigned int i;
95 int err; 97 int err;
@@ -104,8 +106,8 @@ static int parse_one(char *param,
104 if (!val && params[i].ops->set != param_set_bool 106 if (!val && params[i].ops->set != param_set_bool
105 && params[i].ops->set != param_set_bint) 107 && params[i].ops->set != param_set_bint)
106 return -EINVAL; 108 return -EINVAL;
107 pr_debug("They are equal! Calling %p\n", 109 pr_debug("handling %s with %p\n", param,
108 params[i].ops->set); 110 params[i].ops->set);
109 mutex_lock(&param_lock); 111 mutex_lock(&param_lock);
110 err = params[i].ops->set(val, &params[i]); 112 err = params[i].ops->set(val, &params[i]);
111 mutex_unlock(&param_lock); 113 mutex_unlock(&param_lock);
@@ -114,11 +116,11 @@ static int parse_one(char *param,
114 } 116 }
115 117
116 if (handle_unknown) { 118 if (handle_unknown) {
117 pr_debug("Unknown argument: calling %p\n", handle_unknown); 119 pr_debug("doing %s: %s='%s'\n", doing, param, val);
118 return handle_unknown(param, val); 120 return handle_unknown(param, val, doing);
119 } 121 }
120 122
121 pr_debug("Unknown argument `%s'\n", param); 123 pr_debug("Unknown argument '%s'\n", param);
122 return -ENOENT; 124 return -ENOENT;
123} 125}
124 126
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val)
175} 177}
176 178
177/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
178int parse_args(const char *name, 180int parse_args(const char *doing,
179 char *args, 181 char *args,
180 const struct kernel_param *params, 182 const struct kernel_param *params,
181 unsigned num, 183 unsigned num,
182 s16 min_level, 184 s16 min_level,
183 s16 max_level, 185 s16 max_level,
184 int (*unknown)(char *param, char *val)) 186 int (*unknown)(char *param, char *val, const char *doing))
185{ 187{
186 char *param, *val; 188 char *param, *val;
187 189
188 pr_debug("Parsing ARGS: %s\n", args);
189
190 /* Chew leading spaces */ 190 /* Chew leading spaces */
191 args = skip_spaces(args); 191 args = skip_spaces(args);
192 192
193 if (*args)
194 pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
195
193 while (*args) { 196 while (*args) {
194 int ret; 197 int ret;
195 int irq_was_disabled; 198 int irq_was_disabled;
196 199
197 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
198 irq_was_disabled = irqs_disabled(); 201 irq_was_disabled = irqs_disabled();
199 ret = parse_one(param, val, params, num, 202 ret = parse_one(param, val, doing, params, num,
200 min_level, max_level, unknown); 203 min_level, max_level, unknown);
201 if (irq_was_disabled && !irqs_disabled()) { 204 if (irq_was_disabled && !irqs_disabled())
202 printk(KERN_WARNING "parse_args(): option '%s' enabled " 205 pr_warn("%s: option '%s' enabled irq's!\n",
203 "irq's!\n", param); 206 doing, param);
204 } 207
205 switch (ret) { 208 switch (ret) {
206 case -ENOENT: 209 case -ENOENT:
207 printk(KERN_ERR "%s: Unknown parameter `%s'\n", 210 pr_err("%s: Unknown parameter `%s'\n", doing, param);
208 name, param);
209 return ret; 211 return ret;
210 case -ENOSPC: 212 case -ENOSPC:
211 printk(KERN_ERR 213 pr_err("%s: `%s' too large for parameter `%s'\n",
212 "%s: `%s' too large for parameter `%s'\n", 214 doing, val ?: "", param);
213 name, val ?: "", param);
214 return ret; 215 return ret;
215 case 0: 216 case 0:
216 break; 217 break;
217 default: 218 default:
218 printk(KERN_ERR 219 pr_err("%s: `%s' invalid for parameter `%s'\n",
219 "%s: `%s' invalid for parameter `%s'\n", 220 doing, val ?: "", param);
220 name, val ?: "", param);
221 return ret; 221 return ret;
222 } 222 }
223 } 223 }
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
263int param_set_charp(const char *val, const struct kernel_param *kp) 263int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 264{
265 if (strlen(val) > 1024) { 265 if (strlen(val) > 1024) {
266 printk(KERN_ERR "%s: string parameter too long\n", 266 pr_err("%s: string parameter too long\n", kp->name);
267 kp->name);
268 return -ENOSPC; 267 return -ENOSPC;
269 } 268 }
270 269
@@ -400,8 +399,7 @@ static int param_array(const char *name,
400 int len; 399 int len;
401 400
402 if (*num == max) { 401 if (*num == max) {
403 printk(KERN_ERR "%s: can only take %i arguments\n", 402 pr_err("%s: can only take %i arguments\n", name, max);
404 name, max);
405 return -EINVAL; 403 return -EINVAL;
406 } 404 }
407 len = strcspn(val, ","); 405 len = strcspn(val, ",");
@@ -420,8 +418,7 @@ static int param_array(const char *name,
420 } while (save == ','); 418 } while (save == ',');
421 419
422 if (*num < min) { 420 if (*num < min) {
423 printk(KERN_ERR "%s: needs at least %i arguments\n", 421 pr_err("%s: needs at least %i arguments\n", name, min);
424 name, min);
425 return -EINVAL; 422 return -EINVAL;
426 } 423 }
427 return 0; 424 return 0;
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
480 const struct kparam_string *kps = kp->str; 477 const struct kparam_string *kps = kp->str;
481 478
482 if (strlen(val)+1 > kps->maxlen) { 479 if (strlen(val)+1 > kps->maxlen) {
483 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", 480 pr_err("%s: string doesn't fit in %u chars.\n",
484 kp->name, kps->maxlen-1); 481 kp->name, kps->maxlen-1);
485 return -ENOSPC; 482 return -ENOSPC;
486 } 483 }
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
750#endif 747#endif
751 if (err) { 748 if (err) {
752 kobject_put(&mk->kobj); 749 kobject_put(&mk->kobj);
753 printk(KERN_ERR 750 pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
754 "Module '%s' failed add to sysfs, error number %d\n",
755 name, err); 751 name, err);
756 printk(KERN_ERR
757 "The system will be unstable now.\n");
758 return NULL; 752 return NULL;
759 } 753 }
760 754
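
Because the unknown-parameter callback now receives the new "doing" string, callers of parse_args() need the extra argument as well. A minimal sketch of a handler with the updated signature (the handler body and names are made up):

#include <linux/kernel.h>
#include <linux/moduleparam.h>

/* Accept and ignore anything the parameter table does not know about. */
static int example_unknown(char *param, char *val, const char *doing)
{
	pr_debug("%s: ignoring unknown option '%s'='%s'\n",
		 doing, param, val ? val : "");
	return 0;
}

static int example_parse(const char *doing, char *args,
			 const struct kernel_param *params, unsigned int num)
{
	/* Same level range the module loader passes. */
	return parse_args(doing, args, params, num, -32768, 32767,
			  example_unknown);
}
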
diff --git a/kernel/pid.c b/kernel/pid.c
index 9f08dfabaf13..e86b291ad834 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -547,7 +547,8 @@ void __init pidhash_init(void)
547 547
548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
549 HASH_EARLY | HASH_SMALL, 549 HASH_EARLY | HASH_SMALL,
550 &pidhash_shift, NULL, 4096); 550 &pidhash_shift, NULL,
551 0, 4096);
551 pidhash_size = 1U << pidhash_shift; 552 pidhash_size = 1U << pidhash_shift;
552 553
553 for (i = 0; i < pidhash_size; i++) 554 for (i = 0; i < pidhash_size; i++)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 57bc1fd35b3c..16b20e38c4a1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
149{ 149{
150 int nr; 150 int nr;
151 int rc; 151 int rc;
152 struct task_struct *task; 152 struct task_struct *task, *me = current;
153
154 /* Ignore SIGCHLD causing any terminated children to autoreap */
155 spin_lock_irq(&me->sighand->siglock);
156 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
157 spin_unlock_irq(&me->sighand->siglock);
153 158
154 /* 159 /*
155 * The last thread in the cgroup-init thread group is terminating. 160 * The last thread in the cgroup-init thread group is terminating.
@@ -191,6 +196,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 196 return;
192} 197}
193 198
199#ifdef CONFIG_CHECKPOINT_RESTORE
194static int pid_ns_ctl_handler(struct ctl_table *table, int write, 200static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos) 201 void __user *buffer, size_t *lenp, loff_t *ppos)
196{ 202{
@@ -218,8 +224,8 @@ static struct ctl_table pid_ns_ctl_table[] = {
218 }, 224 },
219 { } 225 { }
220}; 226};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 227static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
228#endif /* CONFIG_CHECKPOINT_RESTORE */
223 229
224int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) 230int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
225{ 231{
@@ -253,7 +259,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
253static __init int pid_namespaces_init(void) 259static __init int pid_namespaces_init(void)
254{ 260{
255 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 261 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
262
263#ifdef CONFIG_CHECKPOINT_RESTORE
256 register_sysctl_paths(kern_path, pid_ns_ctl_table); 264 register_sysctl_paths(kern_path, pid_ns_ctl_table);
265#endif
257 return 0; 266 return 0;
258} 267}
259 268
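
The SIGCHLD trick added above relies on standard autoreap semantics: once the handler is SIG_IGN, terminated children are reaped automatically and never linger as zombies. The same behaviour can be observed from userspace with a tiny demo (illustrative only):

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	signal(SIGCHLD, SIG_IGN);	/* terminated children are reaped automatically */

	if (fork() == 0)
		_exit(0);		/* child exits immediately */

	sleep(1);			/* give the child time to be autoreaped */

	/* No zombie is left behind, so wait() fails with ECHILD. */
	if (wait(NULL) < 0)
		perror("wait");
	return 0;
}
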
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index deb5461e3216..8f9b4eb974e0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP
103 select HOTPLUG 103 select HOTPLUG
104 select HOTPLUG_CPU 104 select HOTPLUG_CPU
105 105
106config PM_AUTOSLEEP
107 bool "Opportunistic sleep"
108 depends on PM_SLEEP
109 default n
110 ---help---
111 Allow the kernel to trigger a system transition into a global sleep
112 state automatically whenever there are no active wakeup sources.
113
114config PM_WAKELOCKS
115 bool "User space wakeup sources interface"
116 depends on PM_SLEEP
117 default n
118 ---help---
119 Allow user space to create, activate and deactivate wakeup source
120 objects with the help of a sysfs-based interface.
121
122config PM_WAKELOCKS_LIMIT
123 int "Maximum number of user space wakeup sources (0 = no limit)"
124 range 0 100000
125 default 100
126 depends on PM_WAKELOCKS
127
128config PM_WAKELOCKS_GC
129 bool "Garbage collector for user space wakeup sources"
130 depends on PM_WAKELOCKS
131 default y
132
106config PM_RUNTIME 133config PM_RUNTIME
107 bool "Run-time PM core functionality" 134 bool "Run-time PM core functionality"
108 depends on !IA64_HP_SIM 135 depends on !IA64_HP_SIM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 66d808ec5252..29472bff11ef 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o
9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
11 block_io.o 11 block_io.o
12obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
13obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
12 14
13obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
new file mode 100644
index 000000000000..ca304046d9e2
--- /dev/null
+++ b/kernel/power/autosleep.c
@@ -0,0 +1,127 @@
1/*
2 * kernel/power/autosleep.c
3 *
4 * Opportunistic sleep support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/device.h>
10#include <linux/mutex.h>
11#include <linux/pm_wakeup.h>
12
13#include "power.h"
14
15static suspend_state_t autosleep_state;
16static struct workqueue_struct *autosleep_wq;
17/*
18 * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
19 * is active, otherwise a deadlock with try_to_suspend() is possible.
20 * Alternatively mutex_lock_interruptible() can be used. This will then fail
21 * if an auto_sleep cycle tries to freeze processes.
22 */
23static DEFINE_MUTEX(autosleep_lock);
24static struct wakeup_source *autosleep_ws;
25
26static void try_to_suspend(struct work_struct *work)
27{
28 unsigned int initial_count, final_count;
29
30 if (!pm_get_wakeup_count(&initial_count, true))
31 goto out;
32
33 mutex_lock(&autosleep_lock);
34
35 if (!pm_save_wakeup_count(initial_count)) {
36 mutex_unlock(&autosleep_lock);
37 goto out;
38 }
39
40 if (autosleep_state == PM_SUSPEND_ON) {
41 mutex_unlock(&autosleep_lock);
42 return;
43 }
44 if (autosleep_state >= PM_SUSPEND_MAX)
45 hibernate();
46 else
47 pm_suspend(autosleep_state);
48
49 mutex_unlock(&autosleep_lock);
50
51 if (!pm_get_wakeup_count(&final_count, false))
52 goto out;
53
54 /*
 55 * If the wakeup occurred for an unknown reason, wait to prevent the
56 * system from trying to suspend and waking up in a tight loop.
57 */
58 if (final_count == initial_count)
59 schedule_timeout_uninterruptible(HZ / 2);
60
61 out:
62 queue_up_suspend_work();
63}
64
65static DECLARE_WORK(suspend_work, try_to_suspend);
66
67void queue_up_suspend_work(void)
68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work);
71}
72
73suspend_state_t pm_autosleep_state(void)
74{
75 return autosleep_state;
76}
77
78int pm_autosleep_lock(void)
79{
80 return mutex_lock_interruptible(&autosleep_lock);
81}
82
83void pm_autosleep_unlock(void)
84{
85 mutex_unlock(&autosleep_lock);
86}
87
88int pm_autosleep_set_state(suspend_state_t state)
89{
90
91#ifndef CONFIG_HIBERNATION
92 if (state >= PM_SUSPEND_MAX)
93 return -EINVAL;
94#endif
95
96 __pm_stay_awake(autosleep_ws);
97
98 mutex_lock(&autosleep_lock);
99
100 autosleep_state = state;
101
102 __pm_relax(autosleep_ws);
103
104 if (state > PM_SUSPEND_ON) {
105 pm_wakep_autosleep_enabled(true);
106 queue_up_suspend_work();
107 } else {
108 pm_wakep_autosleep_enabled(false);
109 }
110
111 mutex_unlock(&autosleep_lock);
112 return 0;
113}
114
115int __init pm_autosleep_init(void)
116{
117 autosleep_ws = wakeup_source_register("autosleep");
118 if (!autosleep_ws)
119 return -ENOMEM;
120
121 autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
122 if (autosleep_wq)
123 return 0;
124
125 wakeup_source_unregister(autosleep_ws);
126 return -ENOMEM;
127}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e09dfbfeecee..8b53db38a279 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,6 +25,8 @@
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/ctype.h>
29#include <linux/genhd.h>
28#include <scsi/scsi_scan.h> 30#include <scsi/scsi_scan.h>
29 31
30#include "power.h" 32#include "power.h"
@@ -722,6 +724,17 @@ static int software_resume(void)
722 724
723 /* Check if the device is there */ 725 /* Check if the device is there */
724 swsusp_resume_device = name_to_dev_t(resume_file); 726 swsusp_resume_device = name_to_dev_t(resume_file);
727
728 /*
 729 * name_to_dev_t() cannot verify the partition when resume_file is in
 730 * integer format (e.g. major:minor).
731 */
732 if (isdigit(resume_file[0]) && resume_wait) {
733 int partno;
734 while (!get_gendisk(swsusp_resume_device, &partno))
735 msleep(10);
736 }
737
725 if (!swsusp_resume_device) { 738 if (!swsusp_resume_device) {
726 /* 739 /*
727 * Some device discovery might still be in progress; we need 740 * Some device discovery might still be in progress; we need
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c12581f1c62..428f8a034e96 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
269 return (s - buf); 269 return (s - buf);
270} 270}
271 271
272static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, 272static suspend_state_t decode_state(const char *buf, size_t n)
273 const char *buf, size_t n)
274{ 273{
275#ifdef CONFIG_SUSPEND 274#ifdef CONFIG_SUSPEND
276 suspend_state_t state = PM_SUSPEND_STANDBY; 275 suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
278#endif 277#endif
279 char *p; 278 char *p;
280 int len; 279 int len;
281 int error = -EINVAL;
282 280
283 p = memchr(buf, '\n', n); 281 p = memchr(buf, '\n', n);
284 len = p ? p - buf : n; 282 len = p ? p - buf : n;
285 283
286 /* First, check if we are requested to hibernate */ 284 /* Check hibernation first. */
287 if (len == 4 && !strncmp(buf, "disk", len)) { 285 if (len == 4 && !strncmp(buf, "disk", len))
288 error = hibernate(); 286 return PM_SUSPEND_MAX;
289 goto Exit;
290 }
291 287
292#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { 290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
295 error = pm_suspend(state); 291 return state;
296 break;
297 }
298 }
299#endif 292#endif
300 293
301 Exit: 294 return PM_SUSPEND_ON;
295}
296
297static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
298 const char *buf, size_t n)
299{
300 suspend_state_t state;
301 int error;
302
303 error = pm_autosleep_lock();
304 if (error)
305 return error;
306
307 if (pm_autosleep_state() > PM_SUSPEND_ON) {
308 error = -EBUSY;
309 goto out;
310 }
311
312 state = decode_state(buf, n);
313 if (state < PM_SUSPEND_MAX)
314 error = pm_suspend(state);
315 else if (state == PM_SUSPEND_MAX)
316 error = hibernate();
317 else
318 error = -EINVAL;
319
320 out:
321 pm_autosleep_unlock();
302 return error ? error : n; 322 return error ? error : n;
303} 323}
304 324
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
339{ 359{
340 unsigned int val; 360 unsigned int val;
341 361
342 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; 362 return pm_get_wakeup_count(&val, true) ?
363 sprintf(buf, "%u\n", val) : -EINTR;
343} 364}
344 365
345static ssize_t wakeup_count_store(struct kobject *kobj, 366static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
347 const char *buf, size_t n) 368 const char *buf, size_t n)
348{ 369{
349 unsigned int val; 370 unsigned int val;
371 int error;
372
373 error = pm_autosleep_lock();
374 if (error)
375 return error;
376
377 if (pm_autosleep_state() > PM_SUSPEND_ON) {
378 error = -EBUSY;
379 goto out;
380 }
350 381
382 error = -EINVAL;
351 if (sscanf(buf, "%u", &val) == 1) { 383 if (sscanf(buf, "%u", &val) == 1) {
352 if (pm_save_wakeup_count(val)) 384 if (pm_save_wakeup_count(val))
353 return n; 385 error = n;
354 } 386 }
355 return -EINVAL; 387
388 out:
389 pm_autosleep_unlock();
390 return error;
356} 391}
357 392
358power_attr(wakeup_count); 393power_attr(wakeup_count);
394
395#ifdef CONFIG_PM_AUTOSLEEP
396static ssize_t autosleep_show(struct kobject *kobj,
397 struct kobj_attribute *attr,
398 char *buf)
399{
400 suspend_state_t state = pm_autosleep_state();
401
402 if (state == PM_SUSPEND_ON)
403 return sprintf(buf, "off\n");
404
405#ifdef CONFIG_SUSPEND
406 if (state < PM_SUSPEND_MAX)
407 return sprintf(buf, "%s\n", valid_state(state) ?
408 pm_states[state] : "error");
409#endif
410#ifdef CONFIG_HIBERNATION
411 return sprintf(buf, "disk\n");
412#else
413 return sprintf(buf, "error");
414#endif
415}
416
417static ssize_t autosleep_store(struct kobject *kobj,
418 struct kobj_attribute *attr,
419 const char *buf, size_t n)
420{
421 suspend_state_t state = decode_state(buf, n);
422 int error;
423
424 if (state == PM_SUSPEND_ON
425 && strcmp(buf, "off") && strcmp(buf, "off\n"))
426 return -EINVAL;
427
428 error = pm_autosleep_set_state(state);
429 return error ? error : n;
430}
431
432power_attr(autosleep);
433#endif /* CONFIG_PM_AUTOSLEEP */
434
435#ifdef CONFIG_PM_WAKELOCKS
436static ssize_t wake_lock_show(struct kobject *kobj,
437 struct kobj_attribute *attr,
438 char *buf)
439{
440 return pm_show_wakelocks(buf, true);
441}
442
443static ssize_t wake_lock_store(struct kobject *kobj,
444 struct kobj_attribute *attr,
445 const char *buf, size_t n)
446{
447 int error = pm_wake_lock(buf);
448 return error ? error : n;
449}
450
451power_attr(wake_lock);
452
453static ssize_t wake_unlock_show(struct kobject *kobj,
454 struct kobj_attribute *attr,
455 char *buf)
456{
457 return pm_show_wakelocks(buf, false);
458}
459
460static ssize_t wake_unlock_store(struct kobject *kobj,
461 struct kobj_attribute *attr,
462 const char *buf, size_t n)
463{
464 int error = pm_wake_unlock(buf);
465 return error ? error : n;
466}
467
468power_attr(wake_unlock);
469
470#endif /* CONFIG_PM_WAKELOCKS */
359#endif /* CONFIG_PM_SLEEP */ 471#endif /* CONFIG_PM_SLEEP */
360 472
361#ifdef CONFIG_PM_TRACE 473#ifdef CONFIG_PM_TRACE
@@ -409,6 +521,13 @@ static struct attribute * g[] = {
409#ifdef CONFIG_PM_SLEEP 521#ifdef CONFIG_PM_SLEEP
410 &pm_async_attr.attr, 522 &pm_async_attr.attr,
411 &wakeup_count_attr.attr, 523 &wakeup_count_attr.attr,
524#ifdef CONFIG_PM_AUTOSLEEP
525 &autosleep_attr.attr,
526#endif
527#ifdef CONFIG_PM_WAKELOCKS
528 &wake_lock_attr.attr,
529 &wake_unlock_attr.attr,
530#endif
412#ifdef CONFIG_PM_DEBUG 531#ifdef CONFIG_PM_DEBUG
413 &pm_test_attr.attr, 532 &pm_test_attr.attr,
414#endif 533#endif
@@ -444,7 +563,10 @@ static int __init pm_init(void)
444 power_kobj = kobject_create_and_add("power", NULL); 563 power_kobj = kobject_create_and_add("power", NULL);
445 if (!power_kobj) 564 if (!power_kobj)
446 return -ENOMEM; 565 return -ENOMEM;
447 return sysfs_create_group(power_kobj, &attr_group); 566 error = sysfs_create_group(power_kobj, &attr_group);
567 if (error)
568 return error;
569 return pm_autosleep_init();
448} 570}
449 571
450core_initcall(pm_init); 572core_initcall(pm_init);
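
Taken together, wakeup_count and state implement a race-free suspend protocol for userspace: read wakeup_count, write the same value back (which fails if wakeup events arrived in between, or with -EBUSY while autosleep owns the transition), then write the target state. A minimal userspace sketch of that sequence (error handling kept short):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd;

	fd = open("/sys/power/wakeup_count", O_RDWR);
	if (fd < 0)
		return 1;

	n = read(fd, buf, sizeof(buf));
	if (n <= 0)
		return 1;

	/* Fails if wakeup events were recorded since the read, or autosleep is active. */
	if (write(fd, buf, n) < 0) {
		perror("wakeup_count");
		return 1;
	}
	close(fd);

	fd = open("/sys/power/state", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "mem", 3) < 0)
		perror("state");
	close(fd);
	return 0;
}
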
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98f3622d7407..b0bd4beaebfe 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void)
264{ 264{
265} 265}
266#endif 266#endif
267
268#ifdef CONFIG_PM_AUTOSLEEP
269
270/* kernel/power/autosleep.c */
271extern int pm_autosleep_init(void);
272extern int pm_autosleep_lock(void);
273extern void pm_autosleep_unlock(void);
274extern suspend_state_t pm_autosleep_state(void);
275extern int pm_autosleep_set_state(suspend_state_t state);
276
277#else /* !CONFIG_PM_AUTOSLEEP */
278
279static inline int pm_autosleep_init(void) { return 0; }
280static inline int pm_autosleep_lock(void) { return 0; }
281static inline void pm_autosleep_unlock(void) {}
282static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
283
284#endif /* !CONFIG_PM_AUTOSLEEP */
285
286#ifdef CONFIG_PM_WAKELOCKS
287
288/* kernel/power/wakelock.c */
289extern ssize_t pm_show_wakelocks(char *buf, bool show_active);
290extern int pm_wake_lock(const char *buf);
291extern int pm_wake_unlock(const char *buf);
292
293#endif /* !CONFIG_PM_WAKELOCKS */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index eef311a58a64..11e22c068e8b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> 9 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
10 * 10 *
11 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
12 * 12 *
@@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
282 return -ENOSPC; 282 return -ENOSPC;
283 283
284 if (bio_chain) { 284 if (bio_chain) {
285 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 285 src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
286 __GFP_NORETRY);
286 if (src) { 287 if (src) {
287 copy_page(src, buf); 288 copy_page(src, buf);
288 } else { 289 } else {
289 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ 290 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
290 if (ret) 291 if (ret)
291 return ret; 292 return ret;
292 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 293 src = (void *)__get_free_page(__GFP_WAIT |
294 __GFP_NOWARN |
295 __GFP_NORETRY);
293 if (src) { 296 if (src) {
294 copy_page(src, buf); 297 copy_page(src, buf);
295 } else { 298 } else {
@@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
367 clear_page(handle->cur); 370 clear_page(handle->cur);
368 handle->cur_swap = offset; 371 handle->cur_swap = offset;
369 handle->k = 0; 372 handle->k = 0;
370 } 373
371 if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { 374 if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
372 error = hib_wait_on_bio_chain(bio_chain); 375 error = hib_wait_on_bio_chain(bio_chain);
373 if (error) 376 if (error)
374 goto out; 377 goto out;
375 handle->reqd_free_pages = reqd_free_pages(); 378 /*
379 * Recalculate the number of required free pages, to
380 * make sure we never take more than half.
381 */
382 handle->reqd_free_pages = reqd_free_pages();
383 }
376 } 384 }
377 out: 385 out:
378 return error; 386 return error;
@@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
419/* Maximum number of threads for compression/decompression. */ 427/* Maximum number of threads for compression/decompression. */
420#define LZO_THREADS 3 428#define LZO_THREADS 3
421 429
422/* Maximum number of pages for read buffering. */ 430/* Minimum/maximum number of pages for read buffering. */
423#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) 431#define LZO_MIN_RD_PAGES 1024
432#define LZO_MAX_RD_PAGES 8192
424 433
425 434
426/** 435/**
@@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
631 } 640 }
632 641
633 /* 642 /*
634 * Adjust number of free pages after all allocations have been done.
635 * We don't want to run out of pages when writing.
636 */
637 handle->reqd_free_pages = reqd_free_pages();
638
639 /*
640 * Start the CRC32 thread. 643 * Start the CRC32 thread.
641 */ 644 */
642 init_waitqueue_head(&crc->go); 645 init_waitqueue_head(&crc->go);
@@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
657 goto out_clean; 660 goto out_clean;
658 } 661 }
659 662
663 /*
664 * Adjust the number of required free pages after all allocations have
665 * been done. We don't want to run out of pages when writing.
666 */
667 handle->reqd_free_pages = reqd_free_pages();
668
660 printk(KERN_INFO 669 printk(KERN_INFO
661 "PM: Using %u thread(s) for compression.\n" 670 "PM: Using %u thread(s) for compression.\n"
662 "PM: Compressing and saving image data (%u pages) ... ", 671 "PM: Compressing and saving image data (%u pages) ... ",
@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1067 unsigned i, thr, run_threads, nr_threads; 1076 unsigned i, thr, run_threads, nr_threads;
1068 unsigned ring = 0, pg = 0, ring_size = 0, 1077 unsigned ring = 0, pg = 0, ring_size = 0,
1069 have = 0, want, need, asked = 0; 1078 have = 0, want, need, asked = 0;
1070 unsigned long read_pages; 1079 unsigned long read_pages = 0;
1071 unsigned char **page = NULL; 1080 unsigned char **page = NULL;
1072 struct dec_data *data = NULL; 1081 struct dec_data *data = NULL;
1073 struct crc_data *crc = NULL; 1082 struct crc_data *crc = NULL;
@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
1079 nr_threads = num_online_cpus() - 1; 1088 nr_threads = num_online_cpus() - 1;
1080 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); 1089 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1081 1090
1082 page = vmalloc(sizeof(*page) * LZO_READ_PAGES); 1091 page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
1083 if (!page) { 1092 if (!page) {
1084 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1093 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1085 ret = -ENOMEM; 1094 ret = -ENOMEM;
@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle,
1144 } 1153 }
1145 1154
1146 /* 1155 /*
1147 * Adjust number of pages for read buffering, in case we are short. 1156 * Set the number of pages for read buffering.
1157 * This is complete guesswork, because we'll only know the real
1158 * picture once prepare_image() is called, which is much later on
1159 * during the image load phase. We'll assume the worst case and
1160 * say that none of the image pages are from high memory.
1148 */ 1161 */
1149 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; 1162 if (low_free_pages() > snapshot_get_image_size())
1150 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); 1163 read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
1164 read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
1151 1165
1152 for (i = 0; i < read_pages; i++) { 1166 for (i = 0; i < read_pages; i++) {
1153 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? 1167 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1154 __GFP_WAIT | __GFP_HIGH : 1168 __GFP_WAIT | __GFP_HIGH :
1155 __GFP_WAIT); 1169 __GFP_WAIT | __GFP_NOWARN |
1170 __GFP_NORETRY);
1171
1156 if (!page[i]) { 1172 if (!page[i]) {
1157 if (i < LZO_CMP_PAGES) { 1173 if (i < LZO_CMP_PAGES) {
1158 ring_size = i; 1174 ring_size = i;
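
The GFP changes above switch the hibernation I/O buffers to opportunistic allocations: __GFP_NOWARN | __GFP_NORETRY lets the allocation fail quietly under memory pressure instead of retrying hard or spamming the log, and the callers already cope with a smaller buffer. The pattern, reduced to a sketch (helper name made up):

#include <linux/gfp.h>

/* May return NULL under memory pressure; callers fall back to a smaller buffer. */
static void *try_get_bounce_page(void)
{
	return (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
				       __GFP_NORETRY);
}
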
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 000000000000..c8fba3380076
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,259 @@
1/*
2 * kernel/power/wakelock.c
3 *
4 * User space wakeup sources support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This code is based on the analogous interface allowing user space to
9 * manipulate wakelocks on Android.
10 */
11
12#include <linux/ctype.h>
13#include <linux/device.h>
14#include <linux/err.h>
15#include <linux/hrtimer.h>
16#include <linux/list.h>
17#include <linux/rbtree.h>
18#include <linux/slab.h>
19
20static DEFINE_MUTEX(wakelocks_lock);
21
22struct wakelock {
23 char *name;
24 struct rb_node node;
25 struct wakeup_source ws;
26#ifdef CONFIG_PM_WAKELOCKS_GC
27 struct list_head lru;
28#endif
29};
30
31static struct rb_root wakelocks_tree = RB_ROOT;
32
33ssize_t pm_show_wakelocks(char *buf, bool show_active)
34{
35 struct rb_node *node;
36 struct wakelock *wl;
37 char *str = buf;
38 char *end = buf + PAGE_SIZE;
39
40 mutex_lock(&wakelocks_lock);
41
42 for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
43 wl = rb_entry(node, struct wakelock, node);
44 if (wl->ws.active == show_active)
45 str += scnprintf(str, end - str, "%s ", wl->name);
46 }
47 if (str > buf)
48 str--;
49
50 str += scnprintf(str, end - str, "\n");
51
52 mutex_unlock(&wakelocks_lock);
53 return (str - buf);
54}
55
56#if CONFIG_PM_WAKELOCKS_LIMIT > 0
57static unsigned int number_of_wakelocks;
58
59static inline bool wakelocks_limit_exceeded(void)
60{
61 return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT;
62}
63
64static inline void increment_wakelocks_number(void)
65{
66 number_of_wakelocks++;
67}
68
69static inline void decrement_wakelocks_number(void)
70{
71 number_of_wakelocks--;
72}
73#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
74static inline bool wakelocks_limit_exceeded(void) { return false; }
75static inline void increment_wakelocks_number(void) {}
76static inline void decrement_wakelocks_number(void) {}
77#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
78
79#ifdef CONFIG_PM_WAKELOCKS_GC
80#define WL_GC_COUNT_MAX 100
81#define WL_GC_TIME_SEC 300
82
83static LIST_HEAD(wakelocks_lru_list);
84static unsigned int wakelocks_gc_count;
85
86static inline void wakelocks_lru_add(struct wakelock *wl)
87{
88 list_add(&wl->lru, &wakelocks_lru_list);
89}
90
91static inline void wakelocks_lru_most_recent(struct wakelock *wl)
92{
93 list_move(&wl->lru, &wakelocks_lru_list);
94}
95
96static void wakelocks_gc(void)
97{
98 struct wakelock *wl, *aux;
99 ktime_t now;
100
101 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
102 return;
103
104 now = ktime_get();
105 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
106 u64 idle_time_ns;
107 bool active;
108
109 spin_lock_irq(&wl->ws.lock);
110 idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
111 active = wl->ws.active;
112 spin_unlock_irq(&wl->ws.lock);
113
114 if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
115 break;
116
117 if (!active) {
118 wakeup_source_remove(&wl->ws);
119 rb_erase(&wl->node, &wakelocks_tree);
120 list_del(&wl->lru);
121 kfree(wl->name);
122 kfree(wl);
123 decrement_wakelocks_number();
124 }
125 }
126 wakelocks_gc_count = 0;
127}
128#else /* !CONFIG_PM_WAKELOCKS_GC */
129static inline void wakelocks_lru_add(struct wakelock *wl) {}
130static inline void wakelocks_lru_most_recent(struct wakelock *wl) {}
131static inline void wakelocks_gc(void) {}
132#endif /* !CONFIG_PM_WAKELOCKS_GC */
133
134static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
135 bool add_if_not_found)
136{
137 struct rb_node **node = &wakelocks_tree.rb_node;
138 struct rb_node *parent = *node;
139 struct wakelock *wl;
140
141 while (*node) {
142 int diff;
143
144 parent = *node;
145 wl = rb_entry(*node, struct wakelock, node);
146 diff = strncmp(name, wl->name, len);
147 if (diff == 0) {
148 if (wl->name[len])
149 diff = -1;
150 else
151 return wl;
152 }
153 if (diff < 0)
154 node = &(*node)->rb_left;
155 else
156 node = &(*node)->rb_right;
157 }
158 if (!add_if_not_found)
159 return ERR_PTR(-EINVAL);
160
161 if (wakelocks_limit_exceeded())
162 return ERR_PTR(-ENOSPC);
163
164 /* Not found, we have to add a new one. */
165 wl = kzalloc(sizeof(*wl), GFP_KERNEL);
166 if (!wl)
167 return ERR_PTR(-ENOMEM);
168
169 wl->name = kstrndup(name, len, GFP_KERNEL);
170 if (!wl->name) {
171 kfree(wl);
172 return ERR_PTR(-ENOMEM);
173 }
174 wl->ws.name = wl->name;
175 wakeup_source_add(&wl->ws);
176 rb_link_node(&wl->node, parent, node);
177 rb_insert_color(&wl->node, &wakelocks_tree);
178 wakelocks_lru_add(wl);
179 increment_wakelocks_number();
180 return wl;
181}
182
183int pm_wake_lock(const char *buf)
184{
185 const char *str = buf;
186 struct wakelock *wl;
187 u64 timeout_ns = 0;
188 size_t len;
189 int ret = 0;
190
191 while (*str && !isspace(*str))
192 str++;
193
194 len = str - buf;
195 if (!len)
196 return -EINVAL;
197
198 if (*str && *str != '\n') {
199 /* Find out if there's a valid timeout string appended. */
200 ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
201 if (ret)
202 return -EINVAL;
203 }
204
205 mutex_lock(&wakelocks_lock);
206
207 wl = wakelock_lookup_add(buf, len, true);
208 if (IS_ERR(wl)) {
209 ret = PTR_ERR(wl);
210 goto out;
211 }
212 if (timeout_ns) {
213 u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
214
215 do_div(timeout_ms, NSEC_PER_MSEC);
216 __pm_wakeup_event(&wl->ws, timeout_ms);
217 } else {
218 __pm_stay_awake(&wl->ws);
219 }
220
221 wakelocks_lru_most_recent(wl);
222
223 out:
224 mutex_unlock(&wakelocks_lock);
225 return ret;
226}
227
228int pm_wake_unlock(const char *buf)
229{
230 struct wakelock *wl;
231 size_t len;
232 int ret = 0;
233
234 len = strlen(buf);
235 if (!len)
236 return -EINVAL;
237
238 if (buf[len-1] == '\n')
239 len--;
240
241 if (!len)
242 return -EINVAL;
243
244 mutex_lock(&wakelocks_lock);
245
246 wl = wakelock_lookup_add(buf, len, false);
247 if (IS_ERR(wl)) {
248 ret = PTR_ERR(wl);
249 goto out;
250 }
251 __pm_relax(&wl->ws);
252
253 wakelocks_lru_most_recent(wl);
254 wakelocks_gc();
255
256 out:
257 mutex_unlock(&wakelocks_lock);
258 return ret;
259}
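
The strings parsed by pm_wake_lock() and pm_wake_unlock() arrive through the /sys/power/wake_lock and /sys/power/wake_unlock files added in kernel/power/main.c above: "name [timeout-in-ns]" acquires a wakeup source, the bare name releases it. A small userspace sketch (lock name and timeout are made up):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, s, strlen(s)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Hold "example_lock" for at most 5 seconds (timeout in nanoseconds). */
	if (write_str("/sys/power/wake_lock", "example_lock 5000000000"))
		perror("wake_lock");

	/* Release it early by writing the bare name to wake_unlock. */
	if (write_str("/sys/power/wake_unlock", "example_lock"))
		perror("wake_unlock");
	return 0;
}
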
diff --git a/kernel/printk.c b/kernel/printk.c
index b663c2c95d39..32462d2b364a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,6 +41,7 @@
41#include <linux/cpu.h> 41#include <linux/cpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46 47
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
54{ 55{
55} 56}
56 57
57#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
58
59/* printk's without a loglevel use this.. */ 58/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 59#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 60
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers);
99static int console_locked, console_suspended; 98static int console_locked, console_suspended;
100 99
101/* 100/*
102 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
103 * It is also used in interesting ways to provide interlocking in
104 * console_unlock();.
105 */
106static DEFINE_RAW_SPINLOCK(logbuf_lock);
107
108#define LOG_BUF_MASK (log_buf_len-1)
109#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
110
111/*
112 * The indices into log_buf are not constrained to log_buf_len - they
113 * must be masked before subscripting
114 */
115static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
116static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
117static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
118
119/*
120 * If exclusive_console is non-NULL then only this console is to be printed to. 101 * If exclusive_console is non-NULL then only this console is to be printed to.
121 */ 102 */
122static struct console *exclusive_console; 103static struct console *exclusive_console;
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline);
145/* Flag: console code may call schedule() */ 126/* Flag: console code may call schedule() */
146static int console_may_schedule; 127static int console_may_schedule;
147 128
129/*
130 * The printk log buffer consists of a chain of concatenated variable
131 * length records. Every record starts with a record header, containing
132 * the overall length of the record.
133 *
 134 * The heads of the first and last entries in the buffer, as well as the
 135 * sequence numbers of both entries, are maintained when messages
 136 * are stored.
137 *
138 * If the heads indicate available messages, the length in the header
 139 * tells where the next message starts. A length == 0 for the next message
140 * indicates a wrap-around to the beginning of the buffer.
141 *
 142 * Every record carries the monotonic timestamp in nanoseconds, as well as
143 * the standard userspace syslog level and syslog facility. The usual
144 * kernel messages use LOG_KERN; userspace-injected messages always carry
145 * a matching syslog facility, by default LOG_USER. The origin of every
146 * message can be reliably determined that way.
147 *
148 * The human readable log message directly follows the message header. The
149 * length of the message text is stored in the header, the stored message
150 * is not terminated.
151 *
152 * Optionally, a message can carry a dictionary of properties (key/value pairs),
153 * to provide userspace with a machine-readable message context.
154 *
 155 * Examples of well-defined, commonly used property names are:
156 * DEVICE=b12:8 device identifier
157 * b12:8 block dev_t
158 * c127:3 char dev_t
159 * n8 netdev ifindex
160 * +sound:card0 subsystem:devname
161 * SUBSYSTEM=pci driver-core subsystem name
162 *
163 * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
164 * follows directly after a '=' character. Every property is terminated by
165 * a '\0' character. The last property is not terminated.
166 *
167 * Example of a message structure:
168 * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
169 * 0008 34 00 record is 52 bytes long
170 * 000a 0b 00 text is 11 bytes long
171 * 000c 1f 00 dictionary is 23 bytes long
172 * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
173 * 0010 69 74 27 73 20 61 20 6c "it's a l"
174 * 69 6e 65 "ine"
175 * 001b 44 45 56 49 43 "DEVIC"
176 * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
177 * 52 49 56 45 52 3d 62 75 "RIVER=bu"
178 * 67 "g"
179 * 0032 00 00 00 padding to next message header
180 *
181 * The 'struct log' buffer header must never be directly exported to
182 * userspace, it is a kernel-private implementation detail that might
183 * need to be changed in the future, when the requirements change.
184 *
185 * /dev/kmsg exports the structured data in the following line format:
 186 * "level,seqnum,timestamp;<message text>\n"
187 *
188 * The optional key/value pairs are attached as continuation lines starting
189 * with a space character and terminated by a newline. All possible
 190 * non-printable characters are escaped in the "\xff" notation.
191 *
192 * Users of the export format should ignore possible additional values
193 * separated by ',', and find the message after the ';' character.
194 */
195
196struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */
202};
203
204/*
205 * The logbuf_lock protects kmsg buffer, indices, counters. It is also
206 * used in interesting ways to provide interlocking in console_unlock();
207 */
208static DEFINE_RAW_SPINLOCK(logbuf_lock);
209
210/* the next printk record to read by syslog(READ) or /proc/kmsg */
211static u64 syslog_seq;
212static u32 syslog_idx;
213
214/* index and sequence number of the first record stored in the buffer */
215static u64 log_first_seq;
216static u32 log_first_idx;
217
218/* index and sequence number of the next record to store in the buffer */
219static u64 log_next_seq;
148#ifdef CONFIG_PRINTK 220#ifdef CONFIG_PRINTK
221static u32 log_next_idx;
222
223/* the next printk record to read after the last 'clear' command */
224static u64 clear_seq;
225static u32 clear_idx;
226
227#define LOG_LINE_MAX 1024
149 228
150static char __log_buf[__LOG_BUF_LEN]; 229/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4
232#else
233#define LOG_ALIGN 8
234#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
151static char *log_buf = __log_buf; 237static char *log_buf = __log_buf;
152static int log_buf_len = __LOG_BUF_LEN; 238static u32 log_buf_len = __LOG_BUF_LEN;
153static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 239
154static int saved_console_loglevel = -1; 240/* cpu currently holding logbuf_lock */
241static volatile unsigned int logbuf_cpu = UINT_MAX;
242
243/* human readable text of the record */
244static char *log_text(const struct log *msg)
245{
246 return (char *)msg + sizeof(struct log);
247}
248
249/* optional key/value pair dictionary attached to the record */
250static char *log_dict(const struct log *msg)
251{
252 return (char *)msg + sizeof(struct log) + msg->text_len;
253}
254
255/* get record by index; idx must point to valid msg */
256static struct log *log_from_idx(u32 idx)
257{
258 struct log *msg = (struct log *)(log_buf + idx);
259
260 /*
261 * A length == 0 record is the end of buffer marker. Wrap around and
262 * read the message at the start of the buffer.
263 */
264 if (!msg->len)
265 return (struct log *)log_buf;
266 return msg;
267}
268
269/* get next record; idx must point to valid msg */
270static u32 log_next(u32 idx)
271{
272 struct log *msg = (struct log *)(log_buf + idx);
273
275 /*
276 * A length == 0 record is the end of buffer marker. Wrap around and
277 * read the message at the start of the buffer as *this* one, and
278 * return the one after that.
279 */
280 if (!msg->len) {
281 msg = (struct log *)log_buf;
282 return msg->len;
283 }
284 return idx + msg->len;
285}
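
As a concrete (editorial) example of the wrap handling above: with an illustrative 4096-byte buffer, if only 40 bytes remain before the end when an 80-byte record (plus room for one more header) must be stored, log_store() below writes a zeroed header at the current index and restarts at offset 0; log_from_idx() and log_next() then interpret that len == 0 header as the wrap marker and continue reading from the start of the buffer.
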
286
287/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level,
289 const char *dict, u16 dict_len,
290 const char *text, u16 text_len)
291{
292 struct log *msg;
293 u32 size, pad_len;
294
295 /* number of '\0' padding bytes to next message */
296 size = sizeof(struct log) + text_len + dict_len;
297 pad_len = (-size) & (LOG_ALIGN - 1);
298 size += pad_len;
299
300 while (log_first_seq < log_next_seq) {
301 u32 free;
302
303 if (log_next_idx > log_first_idx)
304 free = max(log_buf_len - log_next_idx, log_first_idx);
305 else
306 free = log_first_idx - log_next_idx;
307
308 if (free > size + sizeof(struct log))
309 break;
310
311 /* drop old messages until we have enough contiguous space */
312 log_first_idx = log_next(log_first_idx);
313 log_first_seq++;
314 }
315
316 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
317 /*
318 * This message + an additional empty header does not fit
319 * at the end of the buffer. Add an empty header with len == 0
320 * to signify a wrap around.
321 */
322 memset(log_buf + log_next_idx, 0, sizeof(struct log));
323 log_next_idx = 0;
324 }
325
326 /* fill message */
327 msg = (struct log *)(log_buf + log_next_idx);
328 memcpy(log_text(msg), text, text_len);
329 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7);
333 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336
337 /* insert message */
338 log_next_idx += msg->len;
339 log_next_seq++;
340}
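
The size and padding arithmetic in log_store() can be checked in isolation; a minimal sketch (editorial addition: record_len() is a made-up name, and LOG_ALIGN is assumed to be 4 as on most configurations per the #if above):

#include <stdint.h>
#include <stdio.h>

#define SKETCH_LOG_ALIGN	4	/* assumed LOG_ALIGN */
#define SKETCH_HEADER_SIZE	16	/* sizeof(struct log): one u64 + four u16 */

/* total space a record occupies in the ring buffer, padding included */
static uint32_t record_len(uint16_t text_len, uint16_t dict_len)
{
	uint32_t size = SKETCH_HEADER_SIZE + text_len + dict_len;
	uint32_t pad_len = (-size) & (SKETCH_LOG_ALIGN - 1);	/* 0..3 */

	return size + pad_len;
}

int main(void)
{
	/* the example record documented at the top of this file:
	 * 11 text bytes + 22 dictionary bytes -> 49, padded to 52 (0x34)
	 */
	printf("%u\n", record_len(11, 22));
	return 0;
}
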
341
342/* /dev/kmsg - userspace message inject/listen interface */
343struct devkmsg_user {
344 u64 seq;
345 u32 idx;
346 struct mutex lock;
347 char buf[8192];
348};
349
350static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
351 unsigned long count, loff_t pos)
352{
353 char *buf, *line;
354 int i;
355 int level = default_message_loglevel;
356 int facility = 1; /* LOG_USER */
357 size_t len = iov_length(iv, count);
358 ssize_t ret = len;
359
360 if (len > LOG_LINE_MAX)
361 return -EINVAL;
362 buf = kmalloc(len+1, GFP_KERNEL);
363 if (buf == NULL)
364 return -ENOMEM;
365
366 line = buf;
367 for (i = 0; i < count; i++) {
368 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
369 goto out;
370 line += iv[i].iov_len;
371 }
372
373 /*
374 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace,
375 * the decimal value is a 32-bit number; the lower 3 bits are the log
376 * level, the rest are the log facility.
377 *
378 * If no prefix or no userspace facility is specified, we
379 * enforce LOG_USER, to be able to reliably distinguish
380 * kernel-generated messages from userspace-injected ones.
381 */
382 line = buf;
383 if (line[0] == '<') {
384 char *endp = NULL;
385
386 i = simple_strtoul(line+1, &endp, 10);
387 if (endp && endp[0] == '>') {
388 level = i & 7;
389 if (i >> 3)
390 facility = i >> 3;
391 endp++;
392 len -= endp - line;
393 line = endp;
394 }
395 }
396 line[len] = '\0';
397
398 printk_emit(facility, level, NULL, 0, "%s", line);
399out:
400 kfree(buf);
401 return ret;
402}
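
To exercise devkmsg_writev() from userspace, a hedged editorial sketch (not part of the patch); the "<14>" prefix is ordinary syslog encoding, 14 >> 3 = 1 (LOG_USER) and 14 & 7 = 6 (LOG_INFO):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "<14>hello from userspace\n";
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	if (write(fd, msg, strlen(msg)) < 0)
		perror("write");
	close(fd);
	return 0;
}

If the prefix is omitted, the handler above falls back to facility LOG_USER, so injected messages stay distinguishable from kernel-generated ones.
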
403
404static ssize_t devkmsg_read(struct file *file, char __user *buf,
405 size_t count, loff_t *ppos)
406{
407 struct devkmsg_user *user = file->private_data;
408 struct log *msg;
409 u64 ts_usec;
410 size_t i;
411 size_t len;
412 ssize_t ret;
413
414 if (!user)
415 return -EBADF;
416
417 mutex_lock(&user->lock);
418 raw_spin_lock(&logbuf_lock);
419 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) {
421 ret = -EAGAIN;
422 raw_spin_unlock(&logbuf_lock);
423 goto out;
424 }
425
426 raw_spin_unlock(&logbuf_lock);
427 ret = wait_event_interruptible(log_wait,
428 user->seq != log_next_seq);
429 if (ret)
430 goto out;
431 raw_spin_lock(&logbuf_lock);
432 }
433
434 if (user->seq < log_first_seq) {
435 /* our last seen message is gone, return error and reset */
436 user->idx = log_first_idx;
437 user->seq = log_first_seq;
438 ret = -EPIPE;
439 raw_spin_unlock(&logbuf_lock);
440 goto out;
441 }
442
443 msg = log_from_idx(user->idx);
444 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec);
448
449 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) {
451 unsigned char c = log_text(msg)[i];
452
453 if (c < ' ' || c >= 128)
454 len += sprintf(user->buf + len, "\\x%02x", c);
455 else
456 user->buf[len++] = c;
457 }
458 user->buf[len++] = '\n';
459
460 if (msg->dict_len) {
461 bool line = true;
462
463 for (i = 0; i < msg->dict_len; i++) {
464 unsigned char c = log_dict(msg)[i];
465
466 if (line) {
467 user->buf[len++] = ' ';
468 line = false;
469 }
470
471 if (c == '\0') {
472 user->buf[len++] = '\n';
473 line = true;
474 continue;
475 }
476
477 if (c < ' ' || c >= 128) {
478 len += sprintf(user->buf + len, "\\x%02x", c);
479 continue;
480 }
481
482 user->buf[len++] = c;
483 }
484 user->buf[len++] = '\n';
485 }
486
487 user->idx = log_next(user->idx);
488 user->seq++;
489 raw_spin_unlock(&logbuf_lock);
490
491 if (len > count) {
492 ret = -EINVAL;
493 goto out;
494 }
495
496 if (copy_to_user(buf, user->buf, len)) {
497 ret = -EFAULT;
498 goto out;
499 }
500 ret = len;
501out:
502 mutex_unlock(&user->lock);
503 return ret;
504}
505
506static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
507{
508 struct devkmsg_user *user = file->private_data;
509 loff_t ret = 0;
510
511 if (!user)
512 return -EBADF;
513 if (offset)
514 return -ESPIPE;
515
516 raw_spin_lock(&logbuf_lock);
517 switch (whence) {
518 case SEEK_SET:
519 /* the first record */
520 user->idx = log_first_idx;
521 user->seq = log_first_seq;
522 break;
523 case SEEK_DATA:
524 /*
525 * The first record after the last SYSLOG_ACTION_CLEAR,
526 * as issued by 'dmesg -c'. Reading /dev/kmsg itself
527 * changes no global state, and does not clear anything.
528 */
529 user->idx = clear_idx;
530 user->seq = clear_seq;
531 break;
532 case SEEK_END:
533 /* after the last record */
534 user->idx = log_next_idx;
535 user->seq = log_next_seq;
536 break;
537 default:
538 ret = -EINVAL;
539 }
540 raw_spin_unlock(&logbuf_lock);
541 return ret;
542}
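
The three whence values accepted above map to useful starting points for readers; a small editorial sketch (SEEK_DATA needs _GNU_SOURCE on glibc):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	lseek(fd, 0, SEEK_SET);		/* oldest record still in the buffer */
	lseek(fd, 0, SEEK_DATA);	/* first record after the last clear */
	lseek(fd, 0, SEEK_END);		/* after the newest record */
	close(fd);
	return 0;
}

Any non-zero offset returns -ESPIPE, since records have no stable byte positions.
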
543
544static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
545{
546 struct devkmsg_user *user = file->private_data;
547 int ret = 0;
548
549 if (!user)
550 return POLLERR|POLLNVAL;
551
552 poll_wait(file, &log_wait, wait);
553
554 raw_spin_lock(&logbuf_lock);
555 if (user->seq < log_next_seq) {
556 /* return error when data has vanished underneath us */
557 if (user->seq < log_first_seq)
558 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
 else
559 ret = POLLIN|POLLRDNORM;
560 }
561 raw_spin_unlock(&logbuf_lock);
562
563 return ret;
564}
565
566static int devkmsg_open(struct inode *inode, struct file *file)
567{
568 struct devkmsg_user *user;
569 int err;
570
571 /* write-only does not need any file context */
572 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
573 return 0;
574
575 err = security_syslog(SYSLOG_ACTION_READ_ALL);
576 if (err)
577 return err;
578
579 user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
580 if (!user)
581 return -ENOMEM;
582
583 mutex_init(&user->lock);
584
585 raw_spin_lock(&logbuf_lock);
586 user->idx = log_first_idx;
587 user->seq = log_first_seq;
588 raw_spin_unlock(&logbuf_lock);
589
590 file->private_data = user;
591 return 0;
592}
593
594static int devkmsg_release(struct inode *inode, struct file *file)
595{
596 struct devkmsg_user *user = file->private_data;
597
598 if (!user)
599 return 0;
600
601 mutex_destroy(&user->lock);
602 kfree(user);
603 return 0;
604}
605
606const struct file_operations kmsg_fops = {
607 .open = devkmsg_open,
608 .read = devkmsg_read,
609 .aio_write = devkmsg_writev,
610 .llseek = devkmsg_llseek,
611 .poll = devkmsg_poll,
612 .release = devkmsg_release,
613};
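
Putting the open, poll and read operations of kmsg_fops together, a minimal editorial follower in the style of a logging daemon, with error handling reduced to the essentials:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[8192];			/* one record per read(), like user->buf */
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);
	if (pfd.fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	pfd.events = POLLIN;

	lseek(pfd.fd, 0, SEEK_END);	/* only follow new records */
	while (poll(&pfd, 1, -1) > 0) {
		while ((n = read(pfd.fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		/* read() < 0 with EPIPE means we were overrun; just continue,
		 * devkmsg_read() has already reset us to the first record */
	}
	close(pfd.fd);
	return 0;
}
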
155 614
156#ifdef CONFIG_KEXEC 615#ifdef CONFIG_KEXEC
157/* 616/*
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1;
165void log_buf_kexec_setup(void) 624void log_buf_kexec_setup(void)
166{ 625{
167 VMCOREINFO_SYMBOL(log_buf); 626 VMCOREINFO_SYMBOL(log_buf);
168 VMCOREINFO_SYMBOL(log_end);
169 VMCOREINFO_SYMBOL(log_buf_len); 627 VMCOREINFO_SYMBOL(log_buf_len);
170 VMCOREINFO_SYMBOL(logged_chars); 628 VMCOREINFO_SYMBOL(log_first_idx);
629 VMCOREINFO_SYMBOL(log_next_idx);
171} 630}
172#endif 631#endif
173 632
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup);
191void __init setup_log_buf(int early) 650void __init setup_log_buf(int early)
192{ 651{
193 unsigned long flags; 652 unsigned long flags;
194 unsigned start, dest_idx, offset;
195 char *new_log_buf; 653 char *new_log_buf;
196 int free; 654 int free;
197 655
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early)
219 log_buf_len = new_log_buf_len; 677 log_buf_len = new_log_buf_len;
220 log_buf = new_log_buf; 678 log_buf = new_log_buf;
221 new_log_buf_len = 0; 679 new_log_buf_len = 0;
222 free = __LOG_BUF_LEN - log_end; 680 free = __LOG_BUF_LEN - log_next_idx;
223 681 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
224 offset = start = min(con_start, log_start);
225 dest_idx = 0;
226 while (start != log_end) {
227 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
228
229 log_buf[dest_idx] = __log_buf[log_idx_mask];
230 start++;
231 dest_idx++;
232 }
233 log_start -= offset;
234 con_start -= offset;
235 log_end -= offset;
236 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 682 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
237 683
238 pr_info("log_buf_len: %d\n", log_buf_len); 684 pr_info("log_buf_len: %d\n", log_buf_len);
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file)
332 return 0; 778 return 0;
333} 779}
334 780
781#if defined(CONFIG_PRINTK_TIME)
782static bool printk_time = 1;
783#else
784static bool printk_time;
785#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{
790 size_t len = 0;
791
792 if (syslog) {
793 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level);
795 } else {
796 len += 3;
797 if (msg->level > 9)
798 len++;
799 if (msg->level > 99)
800 len++;
801 }
802 }
803
804 if (printk_time) {
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len;
817}
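
A worked example of the size-only path (buf == NULL): for a record with msg->level == 14, the syslog prefix "<14>" needs four bytes, which the estimate reaches as 3 plus 1 for the extra digit; with printk_time enabled, the fixed 15 is the length of a "[sssss.uuuuuu] " stamp such as "[   12.345678] ".
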
818
819static size_t msg_print_text(const struct log *msg, bool syslog,
820 char *buf, size_t size)
821{
822 const char *text = log_text(msg);
823 size_t text_size = msg->text_len;
824 size_t len = 0;
825
826 do {
827 const char *next = memchr(text, '\n', text_size);
828 size_t text_len;
829
830 if (next) {
831 text_len = next - text;
832 next++;
833 text_size -= next - text;
834 } else {
835 text_len = text_size;
836 }
837
838 if (buf) {
839 if (print_prefix(msg, syslog, NULL) +
840 text_len + 1 >= size - len)
841 break;
842
843 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len);
845 len += text_len;
846 buf[len++] = '\n';
847 } else {
848 /* size-only calculation for the SYSLOG_ACTION_* buffer */
849 len += print_prefix(msg, syslog, NULL);
850 len += text_len + 1;
851 }
852
853 text = next;
854 } while (text);
855
856 return len;
857}
858
859static int syslog_print(char __user *buf, int size)
860{
861 char *text;
862 struct log *msg;
863 int len;
864
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text)
867 return -ENOMEM;
868
869 raw_spin_lock_irq(&logbuf_lock);
870 if (syslog_seq < log_first_seq) {
871 /* messages are gone, move to first one */
872 syslog_seq = log_first_seq;
873 syslog_idx = log_first_idx;
874 }
875 msg = log_from_idx(syslog_idx);
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX);
877 syslog_idx = log_next(syslog_idx);
878 syslog_seq++;
879 raw_spin_unlock_irq(&logbuf_lock);
880
881 if (len > 0 && copy_to_user(buf, text, len))
882 len = -EFAULT;
883
884 kfree(text);
885 return len;
886}
887
888static int syslog_print_all(char __user *buf, int size, bool clear)
889{
890 char *text;
891 int len = 0;
892
893 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
894 if (!text)
895 return -ENOMEM;
896
897 raw_spin_lock_irq(&logbuf_lock);
898 if (buf) {
899 u64 next_seq;
900 u64 seq;
901 u32 idx;
902
903 if (clear_seq < log_first_seq) {
904 /* messages are gone, move to first available one */
905 clear_seq = log_first_seq;
906 clear_idx = log_first_idx;
907 }
908
909 /*
910 * Find the first record that, together with all following records,
911 * fits into the user-provided buffer for this dump.
912 */
913 seq = clear_seq;
914 idx = clear_idx;
915 while (seq < log_next_seq) {
916 struct log *msg = log_from_idx(idx);
917
918 len += msg_print_text(msg, true, NULL, 0);
919 idx = log_next(idx);
920 seq++;
921 }
922 seq = clear_seq;
923 idx = clear_idx;
924 while (len > size && seq < log_next_seq) {
925 struct log *msg = log_from_idx(idx);
926
927 len -= msg_print_text(msg, true, NULL, 0);
928 idx = log_next(idx);
929 seq++;
930 }
931
932 /* last message in this dump */
933 next_seq = log_next_seq;
934
935 len = 0;
936 while (len >= 0 && seq < next_seq) {
937 struct log *msg = log_from_idx(idx);
938 int textlen;
939
940 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
941 if (textlen < 0) {
942 len = textlen;
943 break;
944 }
945 idx = log_next(idx);
946 seq++;
947
948 raw_spin_unlock_irq(&logbuf_lock);
949 if (copy_to_user(buf + len, text, textlen))
950 len = -EFAULT;
951 else
952 len += textlen;
953 raw_spin_lock_irq(&logbuf_lock);
954
955 if (seq < log_first_seq) {
956 /* messages are gone, move to next one */
957 seq = log_first_seq;
958 idx = log_first_idx;
959 }
960 }
961 }
962
963 if (clear) {
964 clear_seq = log_next_seq;
965 clear_idx = log_next_idx;
966 }
967 raw_spin_unlock_irq(&logbuf_lock);
968
969 kfree(text);
970 return len;
971}
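
The three loops implement a shrink-to-fit dump: the first measures everything recorded since the last clear, the second drops records from the head until the remainder fits the caller's buffer, and the third does the actual copy. As an illustrative example, with records of 300, 200 and 100 bytes and a 350-byte user buffer, the first pass computes 600, the second drops the 300-byte record to get down to 300, and the third copies the two newest records.
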
972
335int do_syslog(int type, char __user *buf, int len, bool from_file) 973int do_syslog(int type, char __user *buf, int len, bool from_file)
336{ 974{
337 unsigned i, j, limit, count; 975 bool clear = false;
338 int do_clear = 0; 976 static int saved_console_loglevel = -1;
339 char c;
340 int error; 977 int error;
341 978
342 error = check_syslog_permissions(type, from_file); 979 error = check_syslog_permissions(type, from_file);
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
364 goto out; 1001 goto out;
365 } 1002 }
366 error = wait_event_interruptible(log_wait, 1003 error = wait_event_interruptible(log_wait,
367 (log_start - log_end)); 1004 syslog_seq != log_next_seq);
368 if (error) 1005 if (error)
369 goto out; 1006 goto out;
370 i = 0; 1007 error = syslog_print(buf, len);
371 raw_spin_lock_irq(&logbuf_lock);
372 while (!error && (log_start != log_end) && i < len) {
373 c = LOG_BUF(log_start);
374 log_start++;
375 raw_spin_unlock_irq(&logbuf_lock);
376 error = __put_user(c,buf);
377 buf++;
378 i++;
379 cond_resched();
380 raw_spin_lock_irq(&logbuf_lock);
381 }
382 raw_spin_unlock_irq(&logbuf_lock);
383 if (!error)
384 error = i;
385 break; 1008 break;
386 /* Read/clear last kernel messages */ 1009 /* Read/clear last kernel messages */
387 case SYSLOG_ACTION_READ_CLEAR: 1010 case SYSLOG_ACTION_READ_CLEAR:
388 do_clear = 1; 1011 clear = true;
389 /* FALL THRU */ 1012 /* FALL THRU */
390 /* Read last kernel messages */ 1013 /* Read last kernel messages */
391 case SYSLOG_ACTION_READ_ALL: 1014 case SYSLOG_ACTION_READ_ALL:
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
399 error = -EFAULT; 1022 error = -EFAULT;
400 goto out; 1023 goto out;
401 } 1024 }
402 count = len; 1025 error = syslog_print_all(buf, len, clear);
403 if (count > log_buf_len)
404 count = log_buf_len;
405 raw_spin_lock_irq(&logbuf_lock);
406 if (count > logged_chars)
407 count = logged_chars;
408 if (do_clear)
409 logged_chars = 0;
410 limit = log_end;
411 /*
412 * __put_user() could sleep, and while we sleep
413 * printk() could overwrite the messages
414 * we try to copy to user space. Therefore
415 * the messages are copied in reverse. <manfreds>
416 */
417 for (i = 0; i < count && !error; i++) {
418 j = limit-1-i;
419 if (j + log_buf_len < log_end)
420 break;
421 c = LOG_BUF(j);
422 raw_spin_unlock_irq(&logbuf_lock);
423 error = __put_user(c,&buf[count-1-i]);
424 cond_resched();
425 raw_spin_lock_irq(&logbuf_lock);
426 }
427 raw_spin_unlock_irq(&logbuf_lock);
428 if (error)
429 break;
430 error = i;
431 if (i != count) {
432 int offset = count-error;
433 /* buffer overflow during copy, correct user buffer. */
434 for (i = 0; i < error; i++) {
435 if (__get_user(c,&buf[i+offset]) ||
436 __put_user(c,&buf[i])) {
437 error = -EFAULT;
438 break;
439 }
440 cond_resched();
441 }
442 }
443 break; 1026 break;
444 /* Clear ring buffer */ 1027 /* Clear ring buffer */
445 case SYSLOG_ACTION_CLEAR: 1028 case SYSLOG_ACTION_CLEAR:
446 logged_chars = 0; 1029 syslog_print_all(NULL, 0, true);
447 break;
448 /* Disable logging to console */ 1030 /* Disable logging to console */
449 case SYSLOG_ACTION_CONSOLE_OFF: 1031 case SYSLOG_ACTION_CONSOLE_OFF:
450 if (saved_console_loglevel == -1) 1032 if (saved_console_loglevel == -1)
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
472 break; 1054 break;
473 /* Number of chars in the log buffer */ 1055 /* Number of chars in the log buffer */
474 case SYSLOG_ACTION_SIZE_UNREAD: 1056 case SYSLOG_ACTION_SIZE_UNREAD:
475 error = log_end - log_start; 1057 raw_spin_lock_irq(&logbuf_lock);
1058 if (syslog_seq < log_first_seq) {
1059 /* messages are gone, move to first one */
1060 syslog_seq = log_first_seq;
1061 syslog_idx = log_first_idx;
1062 }
1063 if (from_file) {
1064 /*
1065 * Short-cut for poll() on /proc/kmsg, which simply checks
1066 * for pending data, not the size; return the count of
1067 * records, not the length.
1068 */
1069 error = log_next_idx - syslog_idx;
1070 } else {
1071 u64 seq;
1072 u32 idx;
1073
1074 error = 0;
1075 seq = syslog_seq;
1076 idx = syslog_idx;
1077 while (seq < log_next_seq) {
1078 struct log *msg = log_from_idx(idx);
1079
1080 error += msg_print_text(msg, true, NULL, 0);
1081 idx = log_next(idx);
1082 seq++;
1083 }
1084 }
1085 raw_spin_unlock_irq(&logbuf_lock);
476 break; 1086 break;
477 /* Size of the log buffer */ 1087 /* Size of the log buffer */
478 case SYSLOG_ACTION_SIZE_BUFFER: 1088 case SYSLOG_ACTION_SIZE_BUFFER:
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4])
501{ 1111{
502 syslog_data[0] = log_buf; 1112 syslog_data[0] = log_buf;
503 syslog_data[1] = log_buf + log_buf_len; 1113 syslog_data[1] = log_buf + log_buf_len;
504 syslog_data[2] = log_buf + log_end - 1114 syslog_data[2] = log_buf + log_first_idx;
505 (logged_chars < log_buf_len ? logged_chars : log_buf_len); 1115 syslog_data[3] = log_buf + log_next_idx;
506 syslog_data[3] = log_buf + log_end;
507} 1116}
508#endif /* CONFIG_KGDB_KDB */ 1117#endif /* CONFIG_KGDB_KDB */
509 1118
510/*
511 * Call the console drivers on a range of log_buf
512 */
513static void __call_console_drivers(unsigned start, unsigned end)
514{
515 struct console *con;
516
517 for_each_console(con) {
518 if (exclusive_console && con != exclusive_console)
519 continue;
520 if ((con->flags & CON_ENABLED) && con->write &&
521 (cpu_online(smp_processor_id()) ||
522 (con->flags & CON_ANYTIME)))
523 con->write(con, &LOG_BUF(start), end - start);
524 }
525}
526
527static bool __read_mostly ignore_loglevel; 1119static bool __read_mostly ignore_loglevel;
528 1120
529static int __init ignore_loglevel_setup(char *str) 1121static int __init ignore_loglevel_setup(char *str)
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
540 "print all kernel messages to the console."); 1132 "print all kernel messages to the console.");
541 1133
542/* 1134/*
543 * Write out chars from start to end - 1 inclusive
544 */
545static void _call_console_drivers(unsigned start,
546 unsigned end, int msg_log_level)
547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
551 console_drivers && start != end) {
552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
553 /* wrapped write */
554 __call_console_drivers(start & LOG_BUF_MASK,
555 log_buf_len);
556 __call_console_drivers(0, end & LOG_BUF_MASK);
557 } else {
558 __call_console_drivers(start, end);
559 }
560 }
561}
562
563/*
564 * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
565 * lower 3 bit are the log level, the rest are the log facility. In case
566 * userspace passes usual userspace syslog messages to /dev/kmsg or
567 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
568 * to extract the correct log level for in-kernel processing, and not mangle
569 * the original value.
570 *
571 * If a prefix is found, the length of the prefix is returned. If 'level' is
572 * passed, it will be filled in with the log level without a possible facility
573 * value. If 'special' is passed, the special printk prefix chars are accepted
574 * and returned. If no valid header is found, 0 is returned and the passed
575 * variables are not touched.
576 */
577static size_t log_prefix(const char *p, unsigned int *level, char *special)
578{
579 unsigned int lev = 0;
580 char sp = '\0';
581 size_t len;
582
583 if (p[0] != '<' || !p[1])
584 return 0;
585 if (p[2] == '>') {
586 /* usual single digit level number or special char */
587 switch (p[1]) {
588 case '0' ... '7':
589 lev = p[1] - '0';
590 break;
591 case 'c': /* KERN_CONT */
592 case 'd': /* KERN_DEFAULT */
593 sp = p[1];
594 break;
595 default:
596 return 0;
597 }
598 len = 3;
599 } else {
600 /* multi digit including the level and facility number */
601 char *endp = NULL;
602
603 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
604 if (endp == NULL || endp[0] != '>')
605 return 0;
606 len = (endp + 1) - p;
607 }
608
609 /* do not accept special char if not asked for */
610 if (sp && !special)
611 return 0;
612
613 if (special) {
614 *special = sp;
615 /* return special char, do not touch level */
616 if (sp)
617 return len;
618 }
619
620 if (level)
621 *level = lev;
622 return len;
623}
624
625/*
626 * Call the console drivers, asking them to write out 1135 * Call the console drivers, asking them to write out
627 * log_buf[start] to log_buf[end - 1]. 1136 * log_buf[start] to log_buf[end - 1].
628 * The console_lock must be held. 1137 * The console_lock must be held.
629 */ 1138 */
630static void call_console_drivers(unsigned start, unsigned end) 1139static void call_console_drivers(int level, const char *text, size_t len)
631{ 1140{
632 unsigned cur_index, start_print; 1141 struct console *con;
633 static int msg_level = -1;
634 1142
635 BUG_ON(((int)(start - end)) > 0); 1143 trace_console(text, 0, len, len);
636 1144
637 cur_index = start; 1145 if (level >= console_loglevel && !ignore_loglevel)
638 start_print = start; 1146 return;
639 while (cur_index != end) { 1147 if (!console_drivers)
640 if (msg_level < 0 && ((end - cur_index) > 2)) { 1148 return;
641 /* strip log prefix */
642 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
643 start_print = cur_index;
644 }
645 while (cur_index != end) {
646 char c = LOG_BUF(cur_index);
647
648 cur_index++;
649 if (c == '\n') {
650 if (msg_level < 0) {
651 /*
652 * printk() has already given us loglevel tags in
653 * the buffer. This code is here in case the
654 * log buffer has wrapped right round and scribbled
655 * on those tags
656 */
657 msg_level = default_message_loglevel;
658 }
659 _call_console_drivers(start_print, cur_index, msg_level);
660 msg_level = -1;
661 start_print = cur_index;
662 break;
663 }
664 }
665 }
666 _call_console_drivers(start_print, end, msg_level);
667}
668 1149
669static void emit_log_char(char c) 1150 for_each_console(con) {
670{ 1151 if (exclusive_console && con != exclusive_console)
671 LOG_BUF(log_end) = c; 1152 continue;
672 log_end++; 1153 if (!(con->flags & CON_ENABLED))
673 if (log_end - log_start > log_buf_len) 1154 continue;
674 log_start = log_end - log_buf_len; 1155 if (!con->write)
675 if (log_end - con_start > log_buf_len) 1156 continue;
676 con_start = log_end - log_buf_len; 1157 if (!cpu_online(smp_processor_id()) &&
677 if (logged_chars < log_buf_len) 1158 !(con->flags & CON_ANYTIME))
678 logged_chars++; 1159 continue;
1160 con->write(con, text, len);
1161 }
679} 1162}
680 1163
681/* 1164/*
@@ -700,16 +1183,6 @@ static void zap_locks(void)
700 sema_init(&console_sem, 1); 1183 sema_init(&console_sem, 1);
701} 1184}
702 1185
703#if defined(CONFIG_PRINTK_TIME)
704static bool printk_time = 1;
705#else
706static bool printk_time = 0;
707#endif
708module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
709
710static bool always_kmsg_dump;
711module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
712
713/* Check if we have any console registered that can be called early in boot. */ 1186/* Check if we have any console registered that can be called early in boot. */
714static int have_callable_console(void) 1187static int have_callable_console(void)
715{ 1188{
@@ -722,51 +1195,6 @@ static int have_callable_console(void)
722 return 0; 1195 return 0;
723} 1196}
724 1197
725/**
726 * printk - print a kernel message
727 * @fmt: format string
728 *
729 * This is printk(). It can be called from any context. We want it to work.
730 *
731 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
732 * call the console drivers. If we fail to get the semaphore we place the output
733 * into the log buffer and return. The current holder of the console_sem will
734 * notice the new output in console_unlock(); and will send it to the
735 * consoles before releasing the lock.
736 *
737 * One effect of this deferred printing is that code which calls printk() and
738 * then changes console_loglevel may break. This is because console_loglevel
739 * is inspected when the actual printing occurs.
740 *
741 * See also:
742 * printf(3)
743 *
744 * See the vsnprintf() documentation for format string extensions over C99.
745 */
746
747asmlinkage int printk(const char *fmt, ...)
748{
749 va_list args;
750 int r;
751
752#ifdef CONFIG_KGDB_KDB
753 if (unlikely(kdb_trap_printk)) {
754 va_start(args, fmt);
755 r = vkdb_printf(fmt, args);
756 va_end(args);
757 return r;
758 }
759#endif
760 va_start(args, fmt);
761 r = vprintk(fmt, args);
762 va_end(args);
763
764 return r;
765}
766
767/* cpu currently holding logbuf_lock */
768static volatile unsigned int printk_cpu = UINT_MAX;
769
770/* 1198/*
771 * Can we actually use the console at this time on this cpu? 1199 * Can we actually use the console at this time on this cpu?
772 * 1200 *
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu)
810 retval = 0; 1238 retval = 0;
811 } 1239 }
812 } 1240 }
813 printk_cpu = UINT_MAX; 1241 logbuf_cpu = UINT_MAX;
814 if (wake) 1242 if (wake)
815 up(&console_sem); 1243 up(&console_sem);
816 raw_spin_unlock(&logbuf_lock); 1244 raw_spin_unlock(&logbuf_lock);
817 return retval; 1245 return retval;
818} 1246}
819static const char recursion_bug_msg [] =
820 KERN_CRIT "BUG: recent printk recursion!\n";
821static int recursion_bug;
822static int new_text_line = 1;
823static char printk_buf[1024];
824 1247
825int printk_delay_msec __read_mostly; 1248int printk_delay_msec __read_mostly;
826 1249
@@ -836,15 +1259,23 @@ static inline void printk_delay(void)
836 } 1259 }
837} 1260}
838 1261
839asmlinkage int vprintk(const char *fmt, va_list args) 1262asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args)
840{ 1265{
841 int printed_len = 0; 1266 static int recursion_bug;
842 int current_log_level = default_message_loglevel; 1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf;
1273 size_t text_len;
843 unsigned long flags; 1274 unsigned long flags;
844 int this_cpu; 1275 int this_cpu;
845 char *p; 1276 bool newline = false;
846 size_t plen; 1277 bool prefix = false;
847 char special; 1278 int printed_len = 0;
848 1279
849 boot_delay_msec(); 1280 boot_delay_msec();
850 printk_delay(); 1281 printk_delay();
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 /* 1287 /*
857 * Ouch, printk recursed into itself! 1288 * Ouch, printk recursed into itself!
858 */ 1289 */
859 if (unlikely(printk_cpu == this_cpu)) { 1290 if (unlikely(logbuf_cpu == this_cpu)) {
860 /* 1291 /*
861 * If a crash is occurring during printk() on this CPU, 1292 * If a crash is occurring during printk() on this CPU,
862 * then try to get the crash message out but make sure 1293 * then try to get the crash message out but make sure
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args)
873 1304
874 lockdep_off(); 1305 lockdep_off();
875 raw_spin_lock(&logbuf_lock); 1306 raw_spin_lock(&logbuf_lock);
876 printk_cpu = this_cpu; 1307 logbuf_cpu = this_cpu;
877 1308
878 if (recursion_bug) { 1309 if (recursion_bug) {
1310 static const char recursion_msg[] =
1311 "BUG: recent printk recursion!";
1312
879 recursion_bug = 0; 1313 recursion_bug = 0;
880 strcpy(printk_buf, recursion_bug_msg); 1314 printed_len += strlen(recursion_msg);
881 printed_len = strlen(recursion_bug_msg); 1315 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len);
882 } 1317 }
883 /* Emit the output into the temporary buffer */
884 printed_len += vscnprintf(printk_buf + printed_len,
885 sizeof(printk_buf) - printed_len, fmt, args);
886 1318
887 p = printk_buf; 1319 /*
1320 * The printf needs to come first; we need the syslog
1321 * prefix which might be passed-in as a parameter.
1322 */
1323 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
888 1324
889 /* Read log level and handle special printk prefix */ 1325 /* mark and strip a trailing newline */
890 plen = log_prefix(p, &current_log_level, &special); 1326 if (text_len && text[text_len-1] == '\n') {
891 if (plen) { 1327 text_len--;
892 p += plen; 1328 newline = true;
1329 }
893 1330
894 switch (special) { 1331 /* strip syslog prefix and extract log level or control flags */
895 case 'c': /* Strip <c> KERN_CONT, continue line */ 1332 if (text[0] == '<' && text[1] && text[2] == '>') {
896 plen = 0; 1333 switch (text[1]) {
897 break; 1334 case '0' ... '7':
898 case 'd': /* Strip <d> KERN_DEFAULT, start new line */ 1335 if (level == -1)
899 plen = 0; 1336 level = text[1] - '0';
900 default: 1337 case 'd': /* KERN_DEFAULT */
901 if (!new_text_line) { 1338 prefix = true;
902 emit_log_char('\n'); 1339 case 'c': /* KERN_CONT */
903 new_text_line = 1; 1340 text += 3;
904 } 1341 text_len -= 3;
905 } 1342 }
906 } 1343 }
907 1344
908 /* 1345 if (level == -1)
909 * Copy the output into log_buf. If the caller didn't provide 1346 level = default_message_loglevel;
910 * the appropriate log prefix, we insert them here
911 */
912 for (; *p; p++) {
913 if (new_text_line) {
914 new_text_line = 0;
915
916 if (plen) {
917 /* Copy original log prefix */
918 int i;
919
920 for (i = 0; i < plen; i++)
921 emit_log_char(printk_buf[i]);
922 printed_len += plen;
923 } else {
924 /* Add log prefix */
925 emit_log_char('<');
926 emit_log_char(current_log_level + '0');
927 emit_log_char('>');
928 printed_len += 3;
929 }
930 1347
931 if (printk_time) { 1348 if (dict) {
932 /* Add the current time stamp */ 1349 prefix = true;
933 char tbuf[50], *tp; 1350 newline = true;
934 unsigned tlen; 1351 }
935 unsigned long long t;
936 unsigned long nanosec_rem;
937
938 t = cpu_clock(printk_cpu);
939 nanosec_rem = do_div(t, 1000000000);
940 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
941 (unsigned long) t,
942 nanosec_rem / 1000);
943
944 for (tp = tbuf; tp < tbuf + tlen; tp++)
945 emit_log_char(*tp);
946 printed_len += tlen;
947 }
948 1352
949 if (!*p) 1353 if (!newline) {
950 break; 1354 if (cont_len && (prefix || cont_task != current)) {
1355 /*
1356 * Flush earlier buffer, which is either from a
1357 * different thread, or when we got a new prefix.
1358 */
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
1360 cont_len = 0;
951 } 1361 }
952 1362
953 emit_log_char(*p); 1363 if (!cont_len) {
954 if (*p == '\n') 1364 cont_level = level;
955 new_text_line = 1; 1365 cont_task = current;
1366 }
1367
1368 /* buffer or append to earlier buffer from the same thread */
1369 if (cont_len + text_len > sizeof(cont_buf))
1370 text_len = sizeof(cont_buf) - cont_len;
1371 memcpy(cont_buf + cont_len, text, text_len);
1372 cont_len += text_len;
1373 } else {
1374 if (cont_len && cont_task == current) {
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385
1386 /* append to the earlier buffer and flush */
1387 if (cont_len + text_len > sizeof(cont_buf))
1388 text_len = sizeof(cont_buf) - cont_len;
1389 memcpy(cont_buf + cont_len, text, text_len);
1390 cont_len += text_len;
1391 log_store(facility, cont_level,
1392 NULL, 0, cont_buf, cont_len);
1393 cont_len = 0;
1394 cont_task = NULL;
1395 printed_len = cont_len;
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 }
956 } 1402 }
957 1403
958 /* 1404 /*
959 * Try to acquire and then immediately release the 1405 * Try to acquire and then immediately release the console semaphore.
960 * console semaphore. The release will do all the 1406 * The release will print out buffers and wake up /dev/kmsg and syslog()
961 * actual magic (print out buffers, wake up klogd, 1407 * users.
962 * etc).
963 * 1408 *
964 * The console_trylock_for_printk() function 1409 * The console_trylock_for_printk() function will release 'logbuf_lock'
965 * will release 'logbuf_lock' regardless of whether it 1410 * regardless of whether it actually gets the console semaphore or not.
966 * actually gets the semaphore or not.
967 */ 1411 */
968 if (console_trylock_for_printk(this_cpu)) 1412 if (console_trylock_for_printk(this_cpu))
969 console_unlock(); 1413 console_unlock();
@@ -974,16 +1418,81 @@ out_restore_irqs:
974 1418
975 return printed_len; 1419 return printed_len;
976} 1420}
977EXPORT_SYMBOL(printk); 1421EXPORT_SYMBOL(vprintk_emit);
978EXPORT_SYMBOL(vprintk);
979 1422
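
The continuation handling in vprintk_emit() above coalesces partial lines from one task into a single record; a hedged kernel-style illustration (example_show_values() is a hypothetical caller, not part of this patch):

#include <linux/kernel.h>

/* hypothetical caller: three printk calls, one stored record */
static void example_show_values(int a, int b)
{
	printk(KERN_INFO "values:");	/* no '\n': buffered in cont_buf */
	printk(KERN_CONT " a=%d", a);	/* appended to the same buffer */
	printk(KERN_CONT " b=%d\n", b);	/* newline: flushed as one record */
}

If the same task starts a new prefixed message, or another task begins its own continuation, the buffered part is flushed on its own, so the merge is best-effort rather than guaranteed.
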
980#else 1423asmlinkage int vprintk(const char *fmt, va_list args)
1424{
1425 return vprintk_emit(0, -1, NULL, 0, fmt, args);
1426}
1427EXPORT_SYMBOL(vprintk);
981 1428
982static void call_console_drivers(unsigned start, unsigned end) 1429asmlinkage int printk_emit(int facility, int level,
1430 const char *dict, size_t dictlen,
1431 const char *fmt, ...)
983{ 1432{
1433 va_list args;
1434 int r;
1435
1436 va_start(args, fmt);
1437 r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
1438 va_end(args);
1439
1440 return r;
984} 1441}
1442EXPORT_SYMBOL(printk_emit);
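
printk_emit() is what lets callers attach the dictionary described at the top of this file; a hedged sketch with a hypothetical caller (the SUBSYSTEM=/DEVICE= keys follow the documented examples, and real code would normally use higher-level wrappers rather than calling printk_emit() directly):

#include <linux/kernel.h>
#include <linux/printk.h>

static void example_report_io_error(void)	/* hypothetical */
{
	/* '\0'-separated KEY=value pairs; sizeof() - 1 drops only the
	 * string literal's trailing NUL so the last property stays
	 * unterminated, as the format requires */
	static const char dict[] = "SUBSYSTEM=block\0DEVICE=b8:2";

	printk_emit(0, 3 /* LOG_ERR */, dict, sizeof(dict) - 1,
		    "I/O error on device 8:2\n");
}
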
985 1443
1444/**
1445 * printk - print a kernel message
1446 * @fmt: format string
1447 *
1448 * This is printk(). It can be called from any context. We want it to work.
1449 *
1450 * We try to grab the console_lock. If we succeed, it's easy - we log the
1451 * output and call the console drivers. If we fail to get the semaphore, we
1452 * place the output into the log buffer and return. The current holder of
1453 * the console_sem will notice the new output in console_unlock(); and will
1454 * send it to the consoles before releasing the lock.
1455 *
1456 * One effect of this deferred printing is that code which calls printk() and
1457 * then changes console_loglevel may break. This is because console_loglevel
1458 * is inspected when the actual printing occurs.
1459 *
1460 * See also:
1461 * printf(3)
1462 *
1463 * See the vsnprintf() documentation for format string extensions over C99.
1464 */
1465asmlinkage int printk(const char *fmt, ...)
1466{
1467 va_list args;
1468 int r;
1469
1470#ifdef CONFIG_KGDB_KDB
1471 if (unlikely(kdb_trap_printk)) {
1472 va_start(args, fmt);
1473 r = vkdb_printf(fmt, args);
1474 va_end(args);
1475 return r;
1476 }
986#endif 1477#endif
1478 va_start(args, fmt);
1479 r = vprintk_emit(0, -1, NULL, 0, fmt, args);
1480 va_end(args);
1481
1482 return r;
1483}
1484EXPORT_SYMBOL(printk);
1485
1486#else
1487
1488#define LOG_LINE_MAX 0
1489static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog,
1493 char *buf, size_t size) { return 0; }
1494
1495#endif /* CONFIG_PRINTK */
987 1496
988static int __add_preferred_console(char *name, int idx, char *options, 1497static int __add_preferred_console(char *name, int idx, char *options,
989 char *brl_options) 1498 char *brl_options)
@@ -1217,7 +1726,7 @@ int is_console_locked(void)
1217} 1726}
1218 1727
1219/* 1728/*
1220 * Delayed printk facility, for scheduler-internal messages: 1729 * Delayed printk version, for scheduler-internal messages:
1221 */ 1730 */
1222#define PRINTK_BUF_SIZE 512 1731#define PRINTK_BUF_SIZE 512
1223 1732
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void)
1253 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1762 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1254} 1763}
1255 1764
1765/* the next printk record to write to the console */
1766static u64 console_seq;
1767static u32 console_idx;
1768
1256/** 1769/**
1257 * console_unlock - unlock the console system 1770 * console_unlock - unlock the console system
1258 * 1771 *
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void)
1263 * by printk(). If this is the case, console_unlock(); emits 1776 * by printk(). If this is the case, console_unlock(); emits
1264 * the output prior to releasing the lock. 1777 * the output prior to releasing the lock.
1265 * 1778 *
1266 * If there is output waiting for klogd, we wake it up. 1779 * If there is output waiting, we wake /dev/kmsg and syslog() users.
1267 * 1780 *
1268 * console_unlock(); may be called from any context. 1781 * console_unlock(); may be called from any context.
1269 */ 1782 */
1270void console_unlock(void) 1783void console_unlock(void)
1271{ 1784{
1785 static u64 seen_seq;
1272 unsigned long flags; 1786 unsigned long flags;
1273 unsigned _con_start, _log_end; 1787 bool wake_klogd = false;
1274 unsigned wake_klogd = 0, retry = 0; 1788 bool retry;
1275 1789
1276 if (console_suspended) { 1790 if (console_suspended) {
1277 up(&console_sem); 1791 up(&console_sem);
@@ -1281,17 +1795,38 @@ void console_unlock(void)
1281 console_may_schedule = 0; 1795 console_may_schedule = 0;
1282 1796
1283again: 1797again:
1284 for ( ; ; ) { 1798 for (;;) {
1799 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len;
1802 int level;
1803
1285 raw_spin_lock_irqsave(&logbuf_lock, flags); 1804 raw_spin_lock_irqsave(&logbuf_lock, flags);
1286 wake_klogd |= log_start - log_end; 1805 if (seen_seq != log_next_seq) {
1287 if (con_start == log_end) 1806 wake_klogd = true;
1288 break; /* Nothing to print */ 1807 seen_seq = log_next_seq;
1289 _con_start = con_start; 1808 }
1290 _log_end = log_end; 1809
1291 con_start = log_end; /* Flush */ 1810 if (console_seq < log_first_seq) {
1811 /* messages are gone, move to first one */
1812 console_seq = log_first_seq;
1813 console_idx = log_first_idx;
1814 }
1815
1816 if (console_seq == log_next_seq)
1817 break;
1818
1819 msg = log_from_idx(console_idx);
1820 level = msg->level & 7;
1821
1822 len = msg_print_text(msg, false, text, sizeof(text));
1823
1824 console_idx = log_next(console_idx);
1825 console_seq++;
1292 raw_spin_unlock(&logbuf_lock); 1826 raw_spin_unlock(&logbuf_lock);
1827
1293 stop_critical_timings(); /* don't trace print latency */ 1828 stop_critical_timings(); /* don't trace print latency */
1294 call_console_drivers(_con_start, _log_end); 1829 call_console_drivers(level, text, len);
1295 start_critical_timings(); 1830 start_critical_timings();
1296 local_irq_restore(flags); 1831 local_irq_restore(flags);
1297 } 1832 }
@@ -1312,8 +1847,7 @@ again:
1312 * flush, no worries. 1847 * flush, no worries.
1313 */ 1848 */
1314 raw_spin_lock(&logbuf_lock); 1849 raw_spin_lock(&logbuf_lock);
1315 if (con_start != log_end) 1850 retry = console_seq != log_next_seq;
1316 retry = 1;
1317 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1851 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1318 1852
1319 if (retry && console_trylock()) 1853 if (retry && console_trylock())
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon)
1549 * for us. 2083 * for us.
1550 */ 2084 */
1551 raw_spin_lock_irqsave(&logbuf_lock, flags); 2085 raw_spin_lock_irqsave(&logbuf_lock, flags);
1552 con_start = log_start; 2086 console_seq = syslog_seq;
2087 console_idx = syslog_idx;
1553 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1554 /* 2089 /*
1555 * We're about to replay the log buffer. Only do this to the 2090 * We're about to replay the log buffer. Only do this to the
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1758} 2293}
1759EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 2294EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1760 2295
2296static bool always_kmsg_dump;
2297module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2298
1761/** 2299/**
1762 * kmsg_dump - dump kernel log to kernel message dumpers. 2300 * kmsg_dump - dump kernel log to kernel message dumpers.
1763 * @reason: the reason (oops, panic etc) for dumping 2301 * @reason: the reason (oops, panic etc) for dumping
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1767 */ 2305 */
1768void kmsg_dump(enum kmsg_dump_reason reason) 2306void kmsg_dump(enum kmsg_dump_reason reason)
1769{ 2307{
1770 unsigned long end; 2308 u64 idx;
1771 unsigned chars;
1772 struct kmsg_dumper *dumper; 2309 struct kmsg_dumper *dumper;
1773 const char *s1, *s2; 2310 const char *s1, *s2;
1774 unsigned long l1, l2; 2311 unsigned long l1, l2;
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1780 /* Theoretically, the log could move on after we do this, but 2317 /* Theoretically, the log could move on after we do this, but
1781 there's not a lot we can do about that. The new messages 2318 there's not a lot we can do about that. The new messages
1782 will overwrite the start of what we dump. */ 2319 will overwrite the start of what we dump. */
2320
1783 raw_spin_lock_irqsave(&logbuf_lock, flags); 2321 raw_spin_lock_irqsave(&logbuf_lock, flags);
1784 end = log_end & LOG_BUF_MASK; 2322 if (syslog_seq < log_first_seq)
1785 chars = logged_chars; 2323 idx = syslog_idx;
1786 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2324 else
2325 idx = log_first_idx;
1787 2326
1788 if (chars > end) { 2327 if (idx > log_next_idx) {
1789 s1 = log_buf + log_buf_len - chars + end; 2328 s1 = log_buf;
1790 l1 = chars - end; 2329 l1 = log_next_idx;
1791 2330
1792 s2 = log_buf; 2331 s2 = log_buf + idx;
1793 l2 = end; 2332 l2 = log_buf_len - idx;
1794 } else { 2333 } else {
1795 s1 = ""; 2334 s1 = "";
1796 l1 = 0; 2335 l1 = 0;
1797 2336
1798 s2 = log_buf + end - chars; 2337 s2 = log_buf + idx;
1799 l2 = chars; 2338 l2 = log_next_idx - idx;
1800 } 2339 }
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1801 2341
1802 rcu_read_lock(); 2342 rcu_read_lock();
1803 list_for_each_entry_rcu(dumper, &dump_list, list) 2343 list_for_each_entry_rcu(dumper, &dump_list, list)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ee8d49b9c309..a232bb59d93f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
198 return 0; 198 return 0;
199 rcu_read_lock(); 199 rcu_read_lock();
200 tcred = __task_cred(task); 200 tcred = __task_cred(task);
201 if (cred->user->user_ns == tcred->user->user_ns && 201 if (uid_eq(cred->uid, tcred->euid) &&
202 (cred->uid == tcred->euid && 202 uid_eq(cred->uid, tcred->suid) &&
203 cred->uid == tcred->suid && 203 uid_eq(cred->uid, tcred->uid) &&
204 cred->uid == tcred->uid && 204 gid_eq(cred->gid, tcred->egid) &&
205 cred->gid == tcred->egid && 205 gid_eq(cred->gid, tcred->sgid) &&
206 cred->gid == tcred->sgid && 206 gid_eq(cred->gid, tcred->gid))
207 cred->gid == tcred->gid))
208 goto ok; 207 goto ok;
209 if (ptrace_has_cap(tcred->user->user_ns, mode)) 208 if (ptrace_has_cap(tcred->user_ns, mode))
210 goto ok; 209 goto ok;
211 rcu_read_unlock(); 210 rcu_read_unlock();
212 return -EPERM; 211 return -EPERM;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a86f1741cc27..95cba41ce1e9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -51,6 +51,34 @@
51 51
52#include "rcu.h" 52#include "rcu.h"
53 53
54#ifdef CONFIG_PREEMPT_RCU
55
56/*
57 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep
60 * is enabled.
61 */
62void exit_rcu(void)
63{
64 struct task_struct *t = current;
65
66 if (likely(list_empty(&current->rcu_node_entry)))
67 return;
68 t->rcu_read_lock_nesting = 1;
69 barrier();
70 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
71 __rcu_read_unlock();
72}
73
74#else /* #ifdef CONFIG_PREEMPT_RCU */
75
76void exit_rcu(void)
77{
78}
79
80#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
81
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 82#ifdef CONFIG_DEBUG_LOCK_ALLOC
55static struct lock_class_key rcu_lock_key; 83static struct lock_class_key rcu_lock_key;
56struct lockdep_map rcu_lock_map = 84struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 22ecea0dfb62..fc31a2d65100 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void)
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 852}
853 853
854/*
855 * Check for a task exiting while in a preemptible -RCU read-side
856 * critical section, clean up if so. No need to issue warnings,
857 * as debug_check_no_locks_held() already does this if lockdep
858 * is enabled.
859 */
860void exit_rcu(void)
861{
862 struct task_struct *t = current;
863
864 if (t->rcu_read_lock_nesting == 0)
865 return;
866 t->rcu_read_lock_nesting = 1;
867 __rcu_read_unlock();
868}
869
870#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 854#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
871 855
872#ifdef CONFIG_RCU_TRACE 856#ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a89b381a8c6e..e66b34ab7555 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ 69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444);
96MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
97module_param(fqs_stutter, int, 0444); 98module_param(fqs_stutter, int, 0444);
98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
99module_param(onoff_interval, int, 0444); 102module_param(onoff_interval, int, 0444);
100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444); 104module_param(onoff_holdoff, int, 0444);
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task;
139static struct task_struct *onoff_task; 142static struct task_struct *onoff_task;
140#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task; 144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
142 147
143#define RCU_TORTURE_PIPE_LEN 10 148#define RCU_TORTURE_PIPE_LEN 10
144 149
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail;
164static atomic_t n_rcu_torture_free; 169static atomic_t n_rcu_torture_free;
165static atomic_t n_rcu_torture_mberror; 170static atomic_t n_rcu_torture_mberror;
166static atomic_t n_rcu_torture_error; 171static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
167static long n_rcu_torture_boost_ktrerror; 173static long n_rcu_torture_boost_ktrerror;
168static long n_rcu_torture_boost_rterror; 174static long n_rcu_torture_boost_rterror;
169static long n_rcu_torture_boost_failure; 175static long n_rcu_torture_boost_failure;
@@ -173,6 +179,8 @@ static long n_offline_attempts;
173static long n_offline_successes; 179static long n_offline_successes;
174static long n_online_attempts; 180static long n_online_attempts;
175static long n_online_successes; 181static long n_online_successes;
182static long n_barrier_attempts;
183static long n_barrier_successes;
176static struct list_head rcu_torture_removed; 184static struct list_head rcu_torture_removed;
177static cpumask_var_t shuffle_tmp_mask; 185static cpumask_var_t shuffle_tmp_mask;
178 186
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */
197static unsigned long boost_starttime; /* jiffies of next boost test start. */ 205static unsigned long boost_starttime; /* jiffies of next boost test start. */
198DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
199 /* and boost task create/destroy. */ 207 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
200 212
201/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 213/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
202 214
@@ -327,6 +339,7 @@ struct rcu_torture_ops {
327 int (*completed)(void); 339 int (*completed)(void);
328 void (*deferred_free)(struct rcu_torture *p); 340 void (*deferred_free)(struct rcu_torture *p);
329 void (*sync)(void); 341 void (*sync)(void);
342 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
330 void (*cb_barrier)(void); 343 void (*cb_barrier)(void);
331 void (*fqs)(void); 344 void (*fqs)(void);
332 int (*stats)(char *page); 345 int (*stats)(char *page);
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = {
417 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
418 .deferred_free = rcu_torture_deferred_free, 431 .deferred_free = rcu_torture_deferred_free,
419 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
433 .call = call_rcu,
420 .cb_barrier = rcu_barrier, 434 .cb_barrier = rcu_barrier,
421 .fqs = rcu_force_quiescent_state, 435 .fqs = rcu_force_quiescent_state,
422 .stats = NULL, 436 .stats = NULL,
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
460 .completed = rcu_torture_completed, 474 .completed = rcu_torture_completed,
461 .deferred_free = rcu_sync_torture_deferred_free, 475 .deferred_free = rcu_sync_torture_deferred_free,
462 .sync = synchronize_rcu, 476 .sync = synchronize_rcu,
477 .call = NULL,
463 .cb_barrier = NULL, 478 .cb_barrier = NULL,
464 .fqs = rcu_force_quiescent_state, 479 .fqs = rcu_force_quiescent_state,
465 .stats = NULL, 480 .stats = NULL,
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
477 .completed = rcu_no_completed, 492 .completed = rcu_no_completed,
478 .deferred_free = rcu_sync_torture_deferred_free, 493 .deferred_free = rcu_sync_torture_deferred_free,
479 .sync = synchronize_rcu_expedited, 494 .sync = synchronize_rcu_expedited,
495 .call = NULL,
480 .cb_barrier = NULL, 496 .cb_barrier = NULL,
481 .fqs = rcu_force_quiescent_state, 497 .fqs = rcu_force_quiescent_state,
482 .stats = NULL, 498 .stats = NULL,
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
519 .completed = rcu_bh_torture_completed, 535 .completed = rcu_bh_torture_completed,
520 .deferred_free = rcu_bh_torture_deferred_free, 536 .deferred_free = rcu_bh_torture_deferred_free,
521 .sync = synchronize_rcu_bh, 537 .sync = synchronize_rcu_bh,
538 .call = call_rcu_bh,
522 .cb_barrier = rcu_barrier_bh, 539 .cb_barrier = rcu_barrier_bh,
523 .fqs = rcu_bh_force_quiescent_state, 540 .fqs = rcu_bh_force_quiescent_state,
524 .stats = NULL, 541 .stats = NULL,
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
535 .completed = rcu_bh_torture_completed, 552 .completed = rcu_bh_torture_completed,
536 .deferred_free = rcu_sync_torture_deferred_free, 553 .deferred_free = rcu_sync_torture_deferred_free,
537 .sync = synchronize_rcu_bh, 554 .sync = synchronize_rcu_bh,
555 .call = NULL,
538 .cb_barrier = NULL, 556 .cb_barrier = NULL,
539 .fqs = rcu_bh_force_quiescent_state, 557 .fqs = rcu_bh_force_quiescent_state,
540 .stats = NULL, 558 .stats = NULL,
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
551 .completed = rcu_bh_torture_completed, 569 .completed = rcu_bh_torture_completed,
552 .deferred_free = rcu_sync_torture_deferred_free, 570 .deferred_free = rcu_sync_torture_deferred_free,
553 .sync = synchronize_rcu_bh_expedited, 571 .sync = synchronize_rcu_bh_expedited,
572 .call = NULL,
554 .cb_barrier = NULL, 573 .cb_barrier = NULL,
555 .fqs = rcu_bh_force_quiescent_state, 574 .fqs = rcu_bh_force_quiescent_state,
556 .stats = NULL, 575 .stats = NULL,
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void)
606 return srcu_batches_completed(&srcu_ctl); 625 return srcu_batches_completed(&srcu_ctl);
607} 626}
608 627
628static void srcu_torture_deferred_free(struct rcu_torture *rp)
629{
630 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
631}
632
609static void srcu_torture_synchronize(void) 633static void srcu_torture_synchronize(void)
610{ 634{
611 synchronize_srcu(&srcu_ctl); 635 synchronize_srcu(&srcu_ctl);
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page)
620 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
621 torture_type, TORTURE_FLAG, idx); 645 torture_type, TORTURE_FLAG, idx);
622 for_each_possible_cpu(cpu) { 646 for_each_possible_cpu(cpu) {
623 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, 647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
624 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
625 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
626 } 650 }
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = {
635 .read_delay = srcu_read_delay, 659 .read_delay = srcu_read_delay,
636 .readunlock = srcu_torture_read_unlock, 660 .readunlock = srcu_torture_read_unlock,
637 .completed = srcu_torture_completed, 661 .completed = srcu_torture_completed,
638 .deferred_free = rcu_sync_torture_deferred_free, 662 .deferred_free = srcu_torture_deferred_free,
639 .sync = srcu_torture_synchronize, 663 .sync = srcu_torture_synchronize,
664 .call = NULL,
640 .cb_barrier = NULL, 665 .cb_barrier = NULL,
641 .stats = srcu_torture_stats, 666 .stats = srcu_torture_stats,
642 .name = "srcu" 667 .name = "srcu"
643}; 668};
644 669
670static struct rcu_torture_ops srcu_sync_ops = {
671 .init = srcu_torture_init,
672 .cleanup = srcu_torture_cleanup,
673 .readlock = srcu_torture_read_lock,
674 .read_delay = srcu_read_delay,
675 .readunlock = srcu_torture_read_unlock,
676 .completed = srcu_torture_completed,
677 .deferred_free = rcu_sync_torture_deferred_free,
678 .sync = srcu_torture_synchronize,
679 .call = NULL,
680 .cb_barrier = NULL,
681 .stats = srcu_torture_stats,
682 .name = "srcu_sync"
683};
684
645static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) 685static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
646{ 686{
647 return srcu_read_lock_raw(&srcu_ctl); 687 return srcu_read_lock_raw(&srcu_ctl);
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = {
659 .read_delay = srcu_read_delay, 699 .read_delay = srcu_read_delay,
660 .readunlock = srcu_torture_read_unlock_raw, 700 .readunlock = srcu_torture_read_unlock_raw,
661 .completed = srcu_torture_completed, 701 .completed = srcu_torture_completed,
662 .deferred_free = rcu_sync_torture_deferred_free, 702 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 703 .sync = srcu_torture_synchronize,
704 .call = NULL,
664 .cb_barrier = NULL, 705 .cb_barrier = NULL,
665 .stats = srcu_torture_stats, 706 .stats = srcu_torture_stats,
666 .name = "srcu_raw" 707 .name = "srcu_raw"
667}; 708};
668 709
710static struct rcu_torture_ops srcu_raw_sync_ops = {
711 .init = srcu_torture_init,
712 .cleanup = srcu_torture_cleanup,
713 .readlock = srcu_torture_read_lock_raw,
714 .read_delay = srcu_read_delay,
715 .readunlock = srcu_torture_read_unlock_raw,
716 .completed = srcu_torture_completed,
717 .deferred_free = rcu_sync_torture_deferred_free,
718 .sync = srcu_torture_synchronize,
719 .call = NULL,
720 .cb_barrier = NULL,
721 .stats = srcu_torture_stats,
722 .name = "srcu_raw_sync"
723};
724
669static void srcu_torture_synchronize_expedited(void) 725static void srcu_torture_synchronize_expedited(void)
670{ 726{
671 synchronize_srcu_expedited(&srcu_ctl); 727 synchronize_srcu_expedited(&srcu_ctl);
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = {
680 .completed = srcu_torture_completed, 736 .completed = srcu_torture_completed,
681 .deferred_free = rcu_sync_torture_deferred_free, 737 .deferred_free = rcu_sync_torture_deferred_free,
682 .sync = srcu_torture_synchronize_expedited, 738 .sync = srcu_torture_synchronize_expedited,
739 .call = NULL,
683 .cb_barrier = NULL, 740 .cb_barrier = NULL,
684 .stats = srcu_torture_stats, 741 .stats = srcu_torture_stats,
685 .name = "srcu_expedited" 742 .name = "srcu_expedited"
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page)
1129 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1130 "rtmbe: %d rtbke: %ld rtbre: %ld " 1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1131 "rtbf: %ld rtb: %ld nt: %ld " 1188 "rtbf: %ld rtb: %ld nt: %ld "
1132 "onoff: %ld/%ld:%ld/%ld", 1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1133 rcu_torture_current, 1191 rcu_torture_current,
1134 rcu_torture_current_version, 1192 rcu_torture_current_version,
1135 list_empty(&rcu_torture_freelist), 1193 list_empty(&rcu_torture_freelist),
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page)
1145 n_online_successes, 1203 n_online_successes,
1146 n_online_attempts, 1204 n_online_attempts,
1147 n_offline_successes, 1205 n_offline_successes,
1148 n_offline_attempts); 1206 n_offline_attempts,
1207 n_barrier_successes,
1208 n_barrier_attempts,
1209 n_rcu_torture_barrier_error);
1210 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1149 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1211 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1212 n_rcu_torture_barrier_error != 0 ||
1150 n_rcu_torture_boost_ktrerror != 0 || 1213 n_rcu_torture_boost_ktrerror != 0 ||
1151 n_rcu_torture_boost_rterror != 0 || 1214 n_rcu_torture_boost_rterror != 0 ||
1152 n_rcu_torture_boost_failure != 0) 1215 n_rcu_torture_boost_failure != 0 ||
1153 cnt += sprintf(&page[cnt], " !!!"); 1216 i > 1) {
1154 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1155 if (i > 1) {
1156 cnt += sprintf(&page[cnt], "!!! "); 1217 cnt += sprintf(&page[cnt], "!!! ");
1157 atomic_inc(&n_rcu_torture_error); 1218 atomic_inc(&n_rcu_torture_error);
1158 WARN_ON_ONCE(1); 1219 WARN_ON_ONCE(1);
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu)
1337 1398
1338 /* This must be outside of the mutex, otherwise deadlock! */ 1399 /* This must be outside of the mutex, otherwise deadlock! */
1339 kthread_stop(t); 1400 kthread_stop(t);
1401 boost_tasks[cpu] = NULL;
1340} 1402}
1341 1403
1342static int rcutorture_booster_init(int cpu) 1404static int rcutorture_booster_init(int cpu)
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void)
1484 return; 1546 return;
1485 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); 1547 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1486 kthread_stop(onoff_task); 1548 kthread_stop(onoff_task);
1549 onoff_task = NULL;
1487} 1550}
1488 1551
1489#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1552#else /* #ifdef CONFIG_HOTPLUG_CPU */
1490 1553
1491static void 1554static int
1492rcu_torture_onoff_init(void) 1555rcu_torture_onoff_init(void)
1493{ 1556{
1557 return 0;
1494} 1558}
1495 1559
1496static void rcu_torture_onoff_cleanup(void) 1560static void rcu_torture_onoff_cleanup(void)
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void)
1554 return; 1618 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); 1619 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task); 1620 kthread_stop(stall_task);
1621 stall_task = NULL;
1622}
1623
1624/* Callback function for RCU barrier testing. */
1625void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1626{
1627 atomic_inc(&barrier_cbs_invoked);
1628}
1629
1630/* kthread function to register callbacks used to test RCU barriers. */
1631static int rcu_torture_barrier_cbs(void *arg)
1632{
1633 long myid = (long)arg;
1634 struct rcu_head rcu;
1635
1636 init_rcu_head_on_stack(&rcu);
1637 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1638 set_user_nice(current, 19);
1639 do {
1640 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
1642 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP);
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1647 if (atomic_dec_and_test(&barrier_cbs_count))
1648 wake_up(&barrier_wq);
1649 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1650 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1651 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1652 while (!kthread_should_stop())
1653 schedule_timeout_interruptible(1);
1654 cur_ops->cb_barrier();
1655 destroy_rcu_head_on_stack(&rcu);
1656 return 0;
1657}
1658
1659/* kthread function to drive and coordinate RCU barrier testing. */
1660static int rcu_torture_barrier(void *arg)
1661{
1662 int i;
1663
1664 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1665 do {
1666 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */
1669 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq,
1672 atomic_read(&barrier_cbs_count) == 0 ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1676 break;
1677 n_barrier_attempts++;
1678 cur_ops->cb_barrier();
1679 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1680 n_rcu_torture_barrier_error++;
1681 WARN_ON_ONCE(1);
1682 }
1683 n_barrier_successes++;
1684 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1688 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1);
1690 return 0;
1691}
1692
1693/* Initialize RCU barrier testing. */
1694static int rcu_torture_barrier_init(void)
1695{
1696 int i;
1697 int ret;
1698
1699 if (n_barrier_cbs == 0)
1700 return 0;
1701 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1702 printk(KERN_ALERT "%s" TORTURE_FLAG
1703 " Call or barrier ops missing for %s,\n",
1704 torture_type, cur_ops->name);
1705 printk(KERN_ALERT "%s" TORTURE_FLAG
1706 " RCU barrier testing omitted from run.\n",
1707 torture_type);
1708 return 0;
1709 }
1710 atomic_set(&barrier_cbs_count, 0);
1711 atomic_set(&barrier_cbs_invoked, 0);
1712 barrier_cbs_tasks =
1713 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1714 GFP_KERNEL);
1715 barrier_cbs_wq =
1716 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1717 GFP_KERNEL);
1718 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
1719 return -ENOMEM;
1720 for (i = 0; i < n_barrier_cbs; i++) {
1721 init_waitqueue_head(&barrier_cbs_wq[i]);
1722 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1723 (void *)(long)i,
1724 "rcu_torture_barrier_cbs");
1725 if (IS_ERR(barrier_cbs_tasks[i])) {
1726 ret = PTR_ERR(barrier_cbs_tasks[i]);
1727 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1728 barrier_cbs_tasks[i] = NULL;
1729 return ret;
1730 }
1731 }
1732 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1733 "rcu_torture_barrier");
1734 if (IS_ERR(barrier_task)) {
1735 ret = PTR_ERR(barrier_task);
1736 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1737 barrier_task = NULL;
1738 }
1739 return 0;
1740}
1741
1742/* Clean up after RCU barrier testing. */
1743static void rcu_torture_barrier_cleanup(void)
1744{
1745 int i;
1746
1747 if (barrier_task != NULL) {
1748 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1749 kthread_stop(barrier_task);
1750 barrier_task = NULL;
1751 }
1752 if (barrier_cbs_tasks != NULL) {
1753 for (i = 0; i < n_barrier_cbs; i++) {
1754 if (barrier_cbs_tasks[i] != NULL) {
1755 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1756 kthread_stop(barrier_cbs_tasks[i]);
1757 barrier_cbs_tasks[i] = NULL;
1758 }
1759 }
1760 kfree(barrier_cbs_tasks);
1761 barrier_cbs_tasks = NULL;
1762 }
1763 if (barrier_cbs_wq != NULL) {
1764 kfree(barrier_cbs_wq);
1765 barrier_cbs_wq = NULL;
1766 }
1557} 1767}
1558 1768
1559static int rcutorture_cpu_notify(struct notifier_block *self, 1769static int rcutorture_cpu_notify(struct notifier_block *self,
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void)
1598 fullstop = FULLSTOP_RMMOD; 1808 fullstop = FULLSTOP_RMMOD;
1599 mutex_unlock(&fullstop_mutex); 1809 mutex_unlock(&fullstop_mutex);
1600 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1810 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1811 rcu_torture_barrier_cleanup();
1601 rcu_torture_stall_cleanup(); 1812 rcu_torture_stall_cleanup();
1602 if (stutter_task) { 1813 if (stutter_task) {
1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1814 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void)
1665 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); 1876 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1666 kthread_stop(shutdown_task); 1877 kthread_stop(shutdown_task);
1667 } 1878 }
1879 shutdown_task = NULL;
1668 rcu_torture_onoff_cleanup(); 1880 rcu_torture_onoff_cleanup();
1669 1881
1670 /* Wait for all RCU callbacks to fire. */ 1882 /* Wait for all RCU callbacks to fire. */
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void)
1676 1888
1677 if (cur_ops->cleanup) 1889 if (cur_ops->cleanup)
1678 cur_ops->cleanup(); 1890 cur_ops->cleanup();
1679 if (atomic_read(&n_rcu_torture_error)) 1891 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1892 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts || 1893 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts) 1894 n_offline_successes != n_offline_attempts)
@@ -1692,10 +1904,12 @@ rcu_torture_init(void)
1692 int i; 1904 int i;
1693 int cpu; 1905 int cpu;
1694 int firsterr = 0; 1906 int firsterr = 0;
1907 int retval;
1695 static struct rcu_torture_ops *torture_ops[] = 1908 static struct rcu_torture_ops *torture_ops[] =
1696 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1697 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1698 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, 1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops,
1699 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1700 1914
1701 mutex_lock(&fullstop_mutex); 1915 mutex_lock(&fullstop_mutex);
@@ -1749,6 +1963,7 @@ rcu_torture_init(void)
1749 atomic_set(&n_rcu_torture_free, 0); 1963 atomic_set(&n_rcu_torture_free, 0);
1750 atomic_set(&n_rcu_torture_mberror, 0); 1964 atomic_set(&n_rcu_torture_mberror, 0);
1751 atomic_set(&n_rcu_torture_error, 0); 1965 atomic_set(&n_rcu_torture_error, 0);
1966 n_rcu_torture_barrier_error = 0;
1752 n_rcu_torture_boost_ktrerror = 0; 1967 n_rcu_torture_boost_ktrerror = 0;
1753 n_rcu_torture_boost_rterror = 0; 1968 n_rcu_torture_boost_rterror = 0;
1754 n_rcu_torture_boost_failure = 0; 1969 n_rcu_torture_boost_failure = 0;
@@ -1872,7 +2087,6 @@ rcu_torture_init(void)
1872 test_boost_duration = 2; 2087 test_boost_duration = 2;
1873 if ((test_boost == 1 && cur_ops->can_boost) || 2088 if ((test_boost == 1 && cur_ops->can_boost) ||
1874 test_boost == 2) { 2089 test_boost == 2) {
1875 int retval;
1876 2090
1877 boost_starttime = jiffies + test_boost_interval * HZ; 2091 boost_starttime = jiffies + test_boost_interval * HZ;
1878 register_cpu_notifier(&rcutorture_cpu_nb); 2092 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -1897,9 +2111,22 @@ rcu_torture_init(void)
1897 goto unwind; 2111 goto unwind;
1898 } 2112 }
1899 } 2113 }
1900 rcu_torture_onoff_init(); 2114 i = rcu_torture_onoff_init();
2115 if (i != 0) {
2116 firsterr = i;
2117 goto unwind;
2118 }
1901 register_reboot_notifier(&rcutorture_shutdown_nb); 2119 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init(); 2120 i = rcu_torture_stall_init();
2121 if (i != 0) {
2122 firsterr = i;
2123 goto unwind;
2124 }
2125 retval = rcu_torture_barrier_init();
2126 if (retval != 0) {
2127 firsterr = retval;
2128 goto unwind;
2129 }
1903 rcutorture_record_test_transition(); 2130 rcutorture_record_test_transition();
1904 mutex_unlock(&fullstop_mutex); 2131 mutex_unlock(&fullstop_mutex);
1905 return 0; 2132 return 0;
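
The barrier-testing additions above coordinate through two atomic counters: each rcu_torture_barrier_cbs() kthread posts one callback via cur_ops->call() and decrements barrier_cbs_count, and the rcu_torture_barrier() coordinator then runs cur_ops->cb_barrier() and checks that every posted callback was invoked. A minimal user-space sketch of that counting protocol follows; it is illustrative only, and fake_call(), fake_barrier() and fake_cb() are stand-ins for cur_ops->call(), cur_ops->cb_barrier() and rcu_torture_barrier_cbf(), with the callbacks run synchronously rather than after a grace period.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define N_CBS 4

static atomic_int cbs_invoked;		/* plays the role of barrier_cbs_invoked */
static atomic_int cbs_outstanding;	/* plays the role of barrier_cbs_count */

static void fake_cb(void)		/* stand-in for rcu_torture_barrier_cbf() */
{
	atomic_fetch_add(&cbs_invoked, 1);
}

static void fake_call(void (*func)(void))	/* stand-in for cur_ops->call() */
{
	func();				/* run immediately; no grace period here */
}

static void fake_barrier(void)		/* stand-in for cur_ops->cb_barrier() */
{
	/* Nothing to wait for: fake_call() already ran the callback. */
}

static void *poster(void *arg)		/* one rcu_torture_barrier_cbs() kthread */
{
	(void)arg;
	fake_call(fake_cb);
	atomic_fetch_sub(&cbs_outstanding, 1);
	return NULL;
}

int main(void)				/* plays the role of rcu_torture_barrier() */
{
	pthread_t tid[N_CBS];
	int i;

	atomic_store(&cbs_invoked, 0);
	atomic_store(&cbs_outstanding, N_CBS);
	for (i = 0; i < N_CBS; i++)
		pthread_create(&tid[i], NULL, poster, NULL);
	for (i = 0; i < N_CBS; i++)
		pthread_join(tid[i], NULL);
	fake_barrier();
	/* The torture test counts a barrier error when this check fails. */
	printf("invoked %d of %d callbacks: %s\n",
	       atomic_load(&cbs_invoked), N_CBS,
	       atomic_load(&cbs_invoked) == N_CBS ? "ok" : "barrier error");
	return 0;
}
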
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0c5baf1ab18..0da7b88d92d0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
75 .gpnum = -300, \ 75 .gpnum = -300, \
76 .completed = -300, \ 76 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \
78 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
79 .n_force_qs = 0, \ 81 .n_force_qs = 0, \
80 .n_force_qs_ngp = 0, \ 82 .n_force_qs_ngp = 0, \
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
145unsigned long rcutorture_testseq; 147unsigned long rcutorture_testseq;
146unsigned long rcutorture_vernum; 148unsigned long rcutorture_vernum;
147 149
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
148/* 157/*
149 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
150 * permit this function to be invoked without holding the root rcu_node 159 * permit this function to be invoked without holding the root rcu_node
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu)
192{ 201{
193 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
194 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
195 rcu_preempt_note_context_switch(cpu);
196 trace_rcu_utilization("End context switch"); 204 trace_rcu_utilization("End context switch");
197} 205}
198EXPORT_SYMBOL_GPL(rcu_note_context_switch); 206EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1311#ifdef CONFIG_HOTPLUG_CPU 1319#ifdef CONFIG_HOTPLUG_CPU
1312 1320
1313/* 1321/*
1314 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1322 * Send the specified CPU's RCU callbacks to the orphanage. The
1315 * Also record a quiescent state for this CPU for the current grace period. 1323 * specified CPU must be offline, and the caller must hold the
1316 * Synchronization and interrupt disabling are not required because 1324 * ->onofflock.
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1324 */ 1325 */
1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1326static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp)
1326{ 1329{
1327 int i; 1330 int i;
1328 unsigned long mask;
1329 int receive_cpu = cpumask_any(cpu_online_mask);
1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333 1331
1334 /* First, adjust the counts. */ 1332 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of
1335 * the callbacks, thus no memory barrier is required.
1336 */
1335 if (rdp->nxtlist != NULL) { 1337 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy; 1338 rsp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen; 1339 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen;
1338 rdp->qlen_lazy = 0; 1341 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0; 1342 rdp->qlen = 0;
1340 } 1343 }
1341 1344
1342 /* 1345 /*
1343 * Next, move ready-to-invoke callbacks to be invoked on some 1346 * Next, move those callbacks still needing a grace period to
1344 * other CPU. These will not be required to pass through another 1347 * the orphanage, where some other CPU will pick them up.
1345 * grace period: They are done, regardless of CPU. 1348 * Some of the callbacks might have gone partway through a grace
1349 * period, but that is too bad. They get to start over because we
1350 * cannot assume that grace periods are synchronized across CPUs.
1351 * We don't bother updating the ->nxttail[] array yet, instead
1352 * we just reset the whole thing later on.
1346 */ 1353 */
1347 if (rdp->nxtlist != NULL && 1354 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { 1355 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1349 struct rcu_head *oldhead; 1356 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1350 struct rcu_head **oldtail; 1357 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 } 1358 }
1366 1359
1367 /* 1360 /*
1368 * Finally, put the rest of the callbacks at the end of the list. 1361 * Then move the ready-to-invoke callbacks to the orphanage,
1369 * The ones that made it partway through get to start over: We 1362 * where some other CPU will pick them up. These will not be
1370 * cannot assume that grace periods are synchronized across CPUs. 1363 * required to pass through another grace period: They are done.
1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */ 1364 */
1374 if (rdp->nxtlist != NULL) { 1365 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1366 *rsp->orphan_donetail = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] = 1367 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 } 1368 }
1385 1369
1370 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL;
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374}
1375
1376/*
1377 * Adopt the RCU callbacks from the specified rcu_state structure's
1378 * orphanage. The caller must hold the ->onofflock.
1379 */
1380static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1381{
1382 int i;
1383 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1384
1386 /* 1385 /*
1387 * Record a quiescent state for the dying CPU. This is safe 1386 * If there is an rcu_barrier() operation in progress, then
1388 * only because we have already cleared out the callbacks. 1387 * only the task doing that operation is permitted to adopt
1389 * (Otherwise, the RCU core might try to schedule the invocation 1388 * callbacks. To do otherwise breaks rcu_barrier() and friends
1390 * of callbacks on this now-offline CPU, which would be bad.) 1389 * by causing them to fail to wait for the callbacks in the
1390 * orphanage.
1391 */ 1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1392 if (rsp->rcu_barrier_in_progress &&
1393 rsp->rcu_barrier_in_progress != current)
1394 return;
1395
1396 /* Do the accounting first. */
1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen;
1400 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0;
1402
1403 /*
1404 * We do not need a memory barrier here because the only way we
1405 * can get here if there is an rcu_barrier() in flight is if
1406 * we are the task doing the rcu_barrier().
1407 */
1408
1409 /* First adopt the ready-to-invoke callbacks. */
1410 if (rsp->orphan_donelist != NULL) {
1411 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1412 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1413 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1414 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1415 rdp->nxttail[i] = rsp->orphan_donetail;
1416 rsp->orphan_donelist = NULL;
1417 rsp->orphan_donetail = &rsp->orphan_donelist;
1418 }
1419
1420 /* And then adopt the callbacks that still need a grace period. */
1421 if (rsp->orphan_nxtlist != NULL) {
1422 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1423 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1424 rsp->orphan_nxtlist = NULL;
1425 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1426 }
1427}
1428
1429/*
1430 * Trace the fact that this CPU is going offline.
1431 */
1432static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1433{
1434 RCU_TRACE(unsigned long mask);
1435 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1436 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1437
1438 RCU_TRACE(mask = rdp->grpmask);
1393 trace_rcu_grace_period(rsp->name, 1439 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1440 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl"); 1441 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1398} 1442}
1399 1443
1400/* 1444/*
1401 * The CPU has been completely removed, and some other CPU is reporting 1445 * The CPU has been completely removed, and some other CPU is reporting
1402 * this fact from process context. Do the remainder of the cleanup. 1446 * this fact from process context. Do the remainder of the cleanup,
1447 * including orphaning the outgoing CPU's RCU callbacks, and also
1448 * adopting them, if there is no _rcu_barrier() instance running.
1403 * There can only be one CPU hotplug operation at a time, so no other 1449 * There can only be one CPU hotplug operation at a time, so no other
1404 * CPU can be attempting to update rcu_cpu_kthread_task. 1450 * CPU can be attempting to update rcu_cpu_kthread_task.
1405 */ 1451 */
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1409 unsigned long mask; 1455 unsigned long mask;
1410 int need_report = 0; 1456 int need_report = 0;
1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1457 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ 1458 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
1413 1459
1414 /* Adjust any no-longer-needed kthreads. */ 1460 /* Adjust any no-longer-needed kthreads. */
1415 rcu_stop_cpu_kthread(cpu); 1461 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1); 1462 rcu_node_kthread_setaffinity(rnp, -1);
1417 1463
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ 1464 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1419 1465
1420 /* Exclude any attempts to start a new grace period. */ 1466 /* Exclude any attempts to start a new grace period. */
1421 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1467 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1422 1468
1469 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1470 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1471 rcu_adopt_orphan_cbs(rsp);
1472
1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1473 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1424 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1474 mask = rdp->grpmask; /* rnp->grplo is constant. */
1425 do { 1475 do {
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1456 1506
1457#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1507#else /* #ifdef CONFIG_HOTPLUG_CPU */
1458 1508
1509static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1510{
1511}
1512
1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1513static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1460{ 1514{
1461} 1515}
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1524 rcu_is_callbacks_kthread()); 1578 rcu_is_callbacks_kthread());
1525 1579
1526 /* Update count, and requeue any remaining callbacks. */ 1580 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1528 rdp->qlen -= count;
1529 rdp->n_cbs_invoked += count;
1530 if (list != NULL) { 1581 if (list != NULL) {
1531 *tail = rdp->nxtlist; 1582 *tail = rdp->nxtlist;
1532 rdp->nxtlist = list; 1583 rdp->nxtlist = list;
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1536 else 1587 else
1537 break; 1588 break;
1538 } 1589 }
1590 smp_mb(); /* List handling before counting for rcu_barrier(). */
1591 rdp->qlen_lazy -= count_lazy;
1592 rdp->qlen -= count;
1593 rdp->n_cbs_invoked += count;
1539 1594
1540 /* Reinstate batch limit if we have worked down the excess. */ 1595 /* Reinstate batch limit if we have worked down the excess. */
1541 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1596 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1823 rdp = this_cpu_ptr(rsp->rda); 1878 rdp = this_cpu_ptr(rsp->rda);
1824 1879
1825 /* Add the callback to our list. */ 1880 /* Add the callback to our list. */
1826 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1827 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1828 rdp->qlen++; 1881 rdp->qlen++;
1829 if (lazy) 1882 if (lazy)
1830 rdp->qlen_lazy++; 1883 rdp->qlen_lazy++;
1884 else
1885 rcu_idle_count_callbacks_posted();
1886 smp_mb(); /* Count before adding callback for rcu_barrier(). */
1887 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1888 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1831 1889
1832 if (__is_kfree_rcu_offset((unsigned long)func)) 1890 if (__is_kfree_rcu_offset((unsigned long)func))
1833 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1891 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
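
The smp_mb() added in __call_rcu() above pairs with the one added earlier in rcu_do_batch(): a poster bumps ->qlen before linking in the callback, and the invoker unlinks the callbacks before dropping ->qlen, so a task that samples ->qlen as zero cannot have missed a still-queued callback. The user-space sketch below expresses that same ordering with C11 fences; it models a single poster and a single drainer only (it is not a concurrency-safe queue), and post_cb()/drain_cbs() are names invented for the example.

#include <stdatomic.h>
#include <stddef.h>

struct cb {
	struct cb *next;
};

static _Atomic(struct cb *) head;	/* pending callbacks (LIFO for brevity) */
static atomic_long qlen;		/* plays the role of rdp->qlen */

static void post_cb(struct cb *c)	/* mirrors the __call_rcu() ordering */
{
	atomic_fetch_add_explicit(&qlen, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* count before adding callback */
	c->next = atomic_load_explicit(&head, memory_order_relaxed);
	atomic_store_explicit(&head, c, memory_order_release);
}

static struct cb *drain_cbs(void)	/* mirrors the rcu_do_batch() ordering */
{
	struct cb *list, *c;
	long n = 0;

	list = atomic_exchange_explicit(&head, NULL, memory_order_acquire);
	atomic_thread_fence(memory_order_seq_cst);	/* list handling before counting */
	for (c = list; c != NULL; c = c->next)
		n++;
	atomic_fetch_sub_explicit(&qlen, n, memory_order_relaxed);
	return list;
}

int main(void)
{
	struct cb a, b;

	post_cb(&a);
	post_cb(&b);
	drain_cbs();	/* qlen returns to zero only after the list has been taken */
	return 0;
}
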
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1893} 1951}
1894EXPORT_SYMBOL_GPL(call_rcu_bh); 1952EXPORT_SYMBOL_GPL(call_rcu_bh);
1895 1953
1954/*
1955 * Because a context switch is a grace period for RCU-sched and RCU-bh,
1956 * any blocking grace-period wait automatically implies a grace period
1957 * if there is only one CPU online at any point time during execution
1958 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
1959 * occasionally incorrectly indicate that there are multiple CPUs online
1960 * when there was in fact only one the whole time, as this just adds
1961 * some overhead: RCU still operates correctly.
1962 *
1963 * Of course, sampling num_online_cpus() with preemption enabled can
1964 * give erroneous results if there are concurrent CPU-hotplug operations.
1965 * For example, given a demonic sequence of preemptions in num_online_cpus()
1966 * and CPU-hotplug operations, there could be two or more CPUs online at
1967 * all times, but num_online_cpus() might well return one (or even zero).
1968 *
1969 * However, all such demonic sequences require at least one CPU-offline
1970 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1971 * is only a problem if there is an RCU read-side critical section executing
1972 * throughout. But RCU-sched and RCU-bh read-side critical sections
1973 * disable either preemption or bh, which prevents a CPU from going offline.
1974 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1975 * that there is only one CPU when in fact there was more than one throughout
1976 * is when there were no RCU readers in the system. If there are no
1977 * RCU readers, the grace period by definition can be of zero length,
1978 * regardless of the number of online CPUs.
1979 */
1980static inline int rcu_blocking_is_gp(void)
1981{
1982 might_sleep(); /* Check for RCU read-side critical section. */
1983 return num_online_cpus() <= 1;
1984}
1985
1896/** 1986/**
1897 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1987 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
1898 * 1988 *
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu)
2166 rcu_preempt_cpu_has_callbacks(cpu); 2256 rcu_preempt_cpu_has_callbacks(cpu);
2167} 2257}
2168 2258
2169static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2259/*
2170static atomic_t rcu_barrier_cpu_count; 2260 * RCU callback function for _rcu_barrier(). If we are last, wake
2171static DEFINE_MUTEX(rcu_barrier_mutex); 2261 * up the task executing _rcu_barrier().
2172static struct completion rcu_barrier_completion; 2262 */
2173
2174static void rcu_barrier_callback(struct rcu_head *notused) 2263static void rcu_barrier_callback(struct rcu_head *notused)
2175{ 2264{
2176 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2265 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp,
2200 void (*call_rcu_func)(struct rcu_head *head, 2289 void (*call_rcu_func)(struct rcu_head *head,
2201 void (*func)(struct rcu_head *head))) 2290 void (*func)(struct rcu_head *head)))
2202{ 2291{
2203 BUG_ON(in_interrupt()); 2292 int cpu;
2293 unsigned long flags;
2294 struct rcu_data *rdp;
2295 struct rcu_head rh;
2296
2297 init_rcu_head_on_stack(&rh);
2298
2204 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2299 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2205 mutex_lock(&rcu_barrier_mutex); 2300 mutex_lock(&rcu_barrier_mutex);
2206 init_completion(&rcu_barrier_completion); 2301
2302 smp_mb(); /* Prevent any prior operations from leaking in. */
2303
2207 /* 2304 /*
2208 * Initialize rcu_barrier_cpu_count to 1, then invoke 2305 * Initialize the count to one rather than to zero in order to
2209 * rcu_barrier_func() on each CPU, so that each CPU also has 2306 * avoid a too-soon return to zero in case of a short grace period
2210 * incremented rcu_barrier_cpu_count. Only then is it safe to 2307 * (or preemption of this task). Also flag this task as doing
2211 * decrement rcu_barrier_cpu_count -- otherwise the first CPU 2308 * an rcu_barrier(). This will prevent anyone else from adopting
2212 * might complete its grace period before all of the other CPUs 2309 * orphaned callbacks, which could cause otherwise failure if a
2213 * did their increment, causing this function to return too 2310 * CPU went offline and quickly came back online. To see this,
2214 * early. Note that on_each_cpu() disables irqs, which prevents 2311 * consider the following sequence of events:
2215 * any CPUs from coming online or going offline until each online 2312 *
2216 * CPU has queued its RCU-barrier callback. 2313 * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
2314 * 2. CPU 1 goes offline, orphaning its callbacks.
2315 * 3. CPU 0 adopts CPU 1's orphaned callbacks.
2316 * 4. CPU 1 comes back online.
2317 * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
2318 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2319 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2217 */ 2320 */
2321 init_completion(&rcu_barrier_completion);
2218 atomic_set(&rcu_barrier_cpu_count, 1); 2322 atomic_set(&rcu_barrier_cpu_count, 1);
2219 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); 2323 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2324 rsp->rcu_barrier_in_progress = current;
2325 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2326
2327 /*
2328 * Force every CPU with callbacks to register a new callback
2329 * that will tell us when all the preceding callbacks have
2330 * been invoked. If an offline CPU has callbacks, wait for
2331 * it to either come back online or to finish orphaning those
2332 * callbacks.
2333 */
2334 for_each_possible_cpu(cpu) {
2335 preempt_disable();
2336 rdp = per_cpu_ptr(rsp->rda, cpu);
2337 if (cpu_is_offline(cpu)) {
2338 preempt_enable();
2339 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2340 schedule_timeout_interruptible(1);
2341 } else if (ACCESS_ONCE(rdp->qlen)) {
2342 smp_call_function_single(cpu, rcu_barrier_func,
2343 (void *)call_rcu_func, 1);
2344 preempt_enable();
2345 } else {
2346 preempt_enable();
2347 }
2348 }
2349
2350 /*
2351 * Now that all online CPUs have rcu_barrier_callback() callbacks
2352 * posted, we can adopt all of the orphaned callbacks and place
2353 * an rcu_barrier_callback() callback after them. When that is done,
2354 * we are guaranteed to have an rcu_barrier_callback() callback
2355 * following every callback that could possibly have been
2356 * registered before _rcu_barrier() was called.
2357 */
2358 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2359 rcu_adopt_orphan_cbs(rsp);
2360 rsp->rcu_barrier_in_progress = NULL;
2361 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2362 atomic_inc(&rcu_barrier_cpu_count);
2363 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2364 call_rcu_func(&rh, rcu_barrier_callback);
2365
2366 /*
2367 * Now that we have an rcu_barrier_callback() callback on each
2368 * CPU, and thus each counted, remove the initial count.
2369 */
2220 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2370 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2221 complete(&rcu_barrier_completion); 2371 complete(&rcu_barrier_completion);
2372
2373 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2222 wait_for_completion(&rcu_barrier_completion); 2374 wait_for_completion(&rcu_barrier_completion);
2375
2376 /* Other rcu_barrier() invocations can now safely proceed. */
2223 mutex_unlock(&rcu_barrier_mutex); 2377 mutex_unlock(&rcu_barrier_mutex);
2378
2379 destroy_rcu_head_on_stack(&rh);
2224} 2380}
2225 2381
2226/** 2382/**
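
As the comment above notes, _rcu_barrier() initializes rcu_barrier_cpu_count to one so that the completion cannot fire until the initiating task has finished registering a callback on every CPU and has dropped its own reference. A small user-space sketch of that pattern follows, with a mutex/condition-variable pair standing in for the completion and plain threads standing in for CPUs; none of these names exist in the kernel.

#include <pthread.h>
#include <stdio.h>

#define NCPUS 3

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static int barrier_count = 1;		/* the initiator's own reference */

static void barrier_callback(void)	/* plays the role of rcu_barrier_callback() */
{
	pthread_mutex_lock(&lock);
	if (--barrier_count == 0)
		pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
}

static void *cpu_thread(void *arg)	/* one "CPU" whose callback eventually runs */
{
	(void)arg;
	barrier_callback();
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	int i;

	/* Register one callback per "CPU", counting each before it can run. */
	pthread_mutex_lock(&lock);
	barrier_count += NCPUS;
	pthread_mutex_unlock(&lock);
	for (i = 0; i < NCPUS; i++)
		pthread_create(&tid[i], NULL, cpu_thread, NULL);

	/* Only now drop the initial reference and wait for the rest. */
	barrier_callback();
	pthread_mutex_lock(&lock);
	while (barrier_count != 0)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
	printf("every registered callback has been invoked\n");

	for (i = 0; i < NCPUS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}
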
@@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2417 2573
2418 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2574 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2419 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2575 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2420 rsp->levelspread[0] = RCU_FANOUT_LEAF; 2576 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
2421} 2577}
2422#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2578#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2423static void __init rcu_init_levelspread(struct rcu_state *rsp) 2579static void __init rcu_init_levelspread(struct rcu_state *rsp)
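
The orphanage code above never walks the callback lists; it splices them in constant time by keeping a tail pointer that points at the final ->next (or list head) pointer. The user-space sketch below shows that tail-pointer splice in isolation, first "orphaning" a CPU's list and then "adopting" it back; struct cblist and the helper names are invented for the example, and the kernel version additionally juggles the per-stage nxttail[] pointers, which the sketch omits.

#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at the terminating NULL pointer */
};

static void cblist_init(struct cblist *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
	c->next = NULL;
	*l->tail = c;
	l->tail = &c->next;
}

/* Splice all of @from onto the end of @to in O(1), then empty @from. */
static void cblist_splice(struct cblist *to, struct cblist *from)
{
	if (from->head == NULL)
		return;
	*to->tail = from->head;
	to->tail = from->tail;
	cblist_init(from);
}

int main(void)
{
	struct cblist cpu_list, orphanage;
	struct cb cbs[4];
	struct cb *c;
	int i;

	cblist_init(&cpu_list);
	cblist_init(&orphanage);
	for (i = 0; i < 4; i++) {
		cbs[i].id = i;
		cblist_enqueue(&cpu_list, &cbs[i]);
	}
	cblist_splice(&orphanage, &cpu_list);	/* "CPU goes offline": orphan its callbacks */
	cblist_splice(&cpu_list, &orphanage);	/* some online CPU adopts them */
	for (c = cpu_list.head; c != NULL; c = c->next)
		printf("callback %d survived the hand-off\n", c->id);
	return 0;
}
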
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index cdd1be0a4072..7f5d138dedf5 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,18 +29,14 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. 32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
33 * CONFIG_RCU_FANOUT_LEAF.
33 * In theory, it should be possible to add more levels straightforwardly. 34 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this did work well going from three levels to four. 35 * In practice, this did work well going from three levels to four.
35 * Of course, your mileage may vary. 36 * Of course, your mileage may vary.
36 */ 37 */
37#define MAX_RCU_LVLS 4 38#define MAX_RCU_LVLS 4
38#if CONFIG_RCU_FANOUT > 16 39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
@@ -371,6 +367,17 @@ struct rcu_state {
371 367
372 raw_spinlock_t onofflock; /* exclude on/offline and */ 368 raw_spinlock_t onofflock; /* exclude on/offline and */
373 /* starting new GP. */ 369 /* starting new GP. */
370 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
371 /* need a grace period. */
372 struct rcu_head **orphan_nxttail; /* Tail of above. */
373 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
374 /* are ready to invoke. */
375 struct rcu_head **orphan_donetail; /* Tail of above. */
376 long qlen_lazy; /* Number of lazy callbacks. */
377 long qlen; /* Total number of callbacks. */
378 struct task_struct *rcu_barrier_in_progress;
379 /* Task doing rcu_barrier(), */
380 /* or NULL if no barrier. */
374 raw_spinlock_t fqslock; /* Only one task forcing */ 381 raw_spinlock_t fqslock; /* Only one task forcing */
375 /* quiescent states. */ 382 /* quiescent states. */
376 unsigned long jiffies_force_qs; /* Time at which to invoke */ 383 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
423/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
424static void rcu_bootup_announce(void); 431static void rcu_bootup_announce(void);
425long rcu_batches_completed(void); 432long rcu_batches_completed(void);
426static void rcu_preempt_note_context_switch(int cpu);
427static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
428#ifdef CONFIG_HOTPLUG_CPU 434#ifdef CONFIG_HOTPLUG_CPU
429static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 477static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 478static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 479static void rcu_prepare_for_idle(int cpu);
480static void rcu_idle_count_callbacks_posted(void);
474static void print_cpu_stall_info_begin(void); 481static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 482static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void); 483static void print_cpu_stall_info_end(void);
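
With RCU_FANOUT_LEAF replaced by CONFIG_RCU_FANOUT_LEAF above, the leaf level of the rcu_node tree covers CONFIG_RCU_FANOUT_LEAF CPUs and each level above it multiplies the reach by CONFIG_RCU_FANOUT, up to MAX_RCU_LVLS levels. The short program below just evaluates that geometry for one configuration; the FANOUT, LEAF, and NR_CPUS values are illustrative examples, not defaults.

#include <stdio.h>

#define RCU_FANOUT	64	/* stand-in for CONFIG_RCU_FANOUT */
#define RCU_FANOUT_LEAF	16	/* stand-in for CONFIG_RCU_FANOUT_LEAF */

int main(void)
{
	long reach = RCU_FANOUT_LEAF;	/* RCU_FANOUT_1 */
	long nr_cpus = 4096;		/* example NR_CPUS */
	int level = 1;

	while (reach < nr_cpus && level < 4) {	/* MAX_RCU_LVLS == 4 */
		reach *= RCU_FANOUT;		/* RCU_FANOUT_2, _3, _4 */
		level++;
	}
	printf("%ld CPUs need %d rcu_node level(s) (reach %ld)\n",
	       nr_cpus, level, reach);
	return 0;
}
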
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c023464816be..2411000d9869 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156static void rcu_preempt_note_context_switch(int cpu) 156void rcu_preempt_note_context_switch(void)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 167 rdp = __this_cpu_ptr(rcu_preempt_state.rda);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(cpu); 231 rcu_preempt_qs(smp_processor_id());
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void)
969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 969 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
970} 970}
971 971
972/*
973 * Check for a task exiting while in a preemptible-RCU read-side
974 * critical section, clean up if so. No need to issue warnings,
975 * as debug_check_no_locks_held() already does this if lockdep
976 * is enabled.
977 */
978void exit_rcu(void)
979{
980 struct task_struct *t = current;
981
982 if (t->rcu_read_lock_nesting == 0)
983 return;
984 t->rcu_read_lock_nesting = 1;
985 __rcu_read_unlock();
986}
987
988#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 972#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
989 973
990static struct rcu_state *rcu_state = &rcu_sched_state; 974static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void)
1018EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1019 1003
1020/* 1004/*
1021 * Because preemptible RCU does not exist, we never have to check for
1022 * CPUs being in quiescent states.
1023 */
1024static void rcu_preempt_note_context_switch(int cpu)
1025{
1026}
1027
1028/*
1029 * Because preemptible RCU does not exist, there are never any preempted 1005 * Because preemptible RCU does not exist, there are never any preempted
1030 * RCU readers. 1006 * RCU readers.
1031 */ 1007 */
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu)
1938{ 1914{
1939} 1915}
1940 1916
1917/*
1918 * Don't bother keeping a running count of the number of RCU callbacks
1919 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1920 */
1921static void rcu_idle_count_callbacks_posted(void)
1922{
1923}
1924
1941#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1925#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1942 1926
1943/* 1927/*
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu)
1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1980 1964
1965/* Loop counter for rcu_prepare_for_idle(). */
1981static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ 1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ 1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1986 1979
1987/* 1980/*
1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
1995 */ 1988 */
1996int rcu_needs_cpu(int cpu) 1989int rcu_needs_cpu(int cpu)
1997{ 1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1998 /* If no callbacks, RCU doesn't need the CPU. */ 1993 /* If no callbacks, RCU doesn't need the CPU. */
1999 if (!rcu_cpu_has_callbacks(cpu)) 1994 if (!rcu_cpu_has_callbacks(cpu))
2000 return 0; 1995 return 0;
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2045} 2040}
2046 2041
2047/* 2042/*
2043 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing.
2045 */
2046void rcu_idle_demigrate(void *unused)
2047{
2048 trace_rcu_prep_idle("Demigrate");
2049}
2050
2051/*
2048 * Timer handler used to force CPU to start pushing its remaining RCU 2052 * Timer handler used to force CPU to start pushing its remaining RCU
2049 * callbacks in the case where it entered dyntick-idle mode with callbacks 2053 * callbacks in the case where it entered dyntick-idle mode with callbacks
2050 * pending. The hander doesn't really need to do anything because the 2054 * pending. The hander doesn't really need to do anything because the
2051 * real work is done upon re-entry to idle, or by the next scheduling-clock 2055 * real work is done upon re-entry to idle, or by the next scheduling-clock
2052 * interrupt should idle not be re-entered. 2056 * interrupt should idle not be re-entered.
2057 *
2058 * One special case: the timer gets migrated without awakening the CPU
2059 * on which the timer was scheduled. In this case, we must wake up
2060 * that CPU. We do so with smp_call_function_single().
2053 */ 2061 */
2054static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) 2062static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2055{ 2063{
2064 int cpu = (int)cpu_in;
2065
2056 trace_rcu_prep_idle("Timer"); 2066 trace_rcu_prep_idle("Timer");
2057 return HRTIMER_NORESTART; 2067 if (cpu != smp_processor_id())
2068 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
2069 else
2070 WARN_ON_ONCE(1); /* Getting here can hang the system... */
2058} 2071}
2059 2072
2060/* 2073/*
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2062 */ 2075 */
2063static void rcu_prepare_for_idle_init(int cpu) 2076static void rcu_prepare_for_idle_init(int cpu)
2064{ 2077{
2065 static int firsttime = 1; 2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2066 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
2067 2080 rcu_idle_gp_timer_func, cpu);
2068 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
2069 hrtp->function = rcu_idle_gp_timer_func; 2082 per_cpu(rcu_idle_first_pass, cpu) = 1;
2070 if (firsttime) {
2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2072
2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2076 firsttime = 0;
2077 }
2078} 2083}
2079 2084
2080/* 2085/*
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2084 */ 2089 */
2085static void rcu_cleanup_after_idle(int cpu) 2090static void rcu_cleanup_after_idle(int cpu)
2086{ 2091{
2087 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); 2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
2093 trace_rcu_prep_idle("Cleanup after idle");
2088} 2094}
2089 2095
2090/* 2096/*
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu)
2108 */ 2114 */
2109static void rcu_prepare_for_idle(int cpu) 2115static void rcu_prepare_for_idle(int cpu)
2110{ 2116{
2117 struct timer_list *tp;
2118
2119 /*
2120 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
2125 * pending.
2126 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) ==
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2133 }
2134 return;
2135 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139
2111 /* 2140 /*
2112 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2141 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2113 * Also reset state to avoid prejudicing later attempts. 2142 * Also reset state to avoid prejudicing later attempts.
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu)
2140 per_cpu(rcu_dyntick_drain, cpu) = 0; 2169 per_cpu(rcu_dyntick_drain, cpu) = 0;
2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2142 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2171 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2172 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2173 jiffies + RCU_IDLE_GP_DELAY;
2145 else 2174 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2175 per_cpu(rcu_idle_gp_timer_expires, cpu) =
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); 2176 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu);
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) =
2180 per_cpu(rcu_nonlazy_posted, cpu);
2148 return; /* Nothing more to do immediately. */ 2181 return; /* Nothing more to do immediately. */
2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2150 /* We have hit the limit, so time to give up. */ 2183 /* We have hit the limit, so time to give up. */
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu)
2184 trace_rcu_prep_idle("Callbacks drained"); 2217 trace_rcu_prep_idle("Callbacks drained");
2185} 2218}
2186 2219
2220/*
2221 * Keep a running count of the number of non-lazy callbacks posted
2222 * on this CPU. This running counter (which is never decremented) allows
2223 * rcu_prepare_for_idle() to detect when something out of the idle loop
2224 * posts a callback, even if an equal number of callbacks are invoked.
2225 * Of course, callbacks should only be posted from within a trace event
2226 * designed to be called from idle or from within RCU_NONIDLE().
2227 */
2228static void rcu_idle_count_callbacks_posted(void)
2229{
2230 __this_cpu_add(rcu_nonlazy_posted, 1);
2231}
2232
2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188 2234
2189#ifdef CONFIG_RCU_CPU_STALL_INFO 2235#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu)
2192 2238
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{ 2240{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); 2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
2196 2242
2197 sprintf(cp, "drain=%d %c timer=%lld", 2243 sprintf(cp, "drain=%d %c timer=%lu",
2198 per_cpu(rcu_dyntick_drain, cpu), 2244 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp) 2246 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203} 2247}
2204 2248
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 2249#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index ed459edeff43..d4bc16ddd1d4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
271 271
272 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
275 rsp->completed, gpnum, rsp->fqs_state, 275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 279 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh); 280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
282 if (rnp->level != level) { 282 if (rnp->level != level) {
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b3..ad581aa2369a 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
25int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
26 bool force)
26{ 27{
28 int ret = 0;
29
27 if (counter->usage + val > counter->limit) { 30 if (counter->usage + val > counter->limit) {
28 counter->failcnt++; 31 counter->failcnt++;
29 return -ENOMEM; 32 ret = -ENOMEM;
33 if (!force)
34 return ret;
30 } 35 }
31 36
32 counter->usage += val; 37 counter->usage += val;
33 if (counter->usage > counter->max_usage) 38 if (counter->usage > counter->max_usage)
34 counter->max_usage = counter->usage; 39 counter->max_usage = counter->usage;
35 return 0; 40 return ret;
36} 41}
37 42
38int res_counter_charge(struct res_counter *counter, unsigned long val, 43static int __res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at) 44 struct res_counter **limit_fail_at, bool force)
40{ 45{
41 int ret; 46 int ret, r;
42 unsigned long flags; 47 unsigned long flags;
43 struct res_counter *c, *u; 48 struct res_counter *c, *u;
44 49
50 r = ret = 0;
45 *limit_fail_at = NULL; 51 *limit_fail_at = NULL;
46 local_irq_save(flags); 52 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) { 53 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock); 54 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val); 55 r = res_counter_charge_locked(c, val, force);
50 spin_unlock(&c->lock); 56 spin_unlock(&c->lock);
51 if (ret < 0) { 57 if (r < 0 && !ret) {
58 ret = r;
52 *limit_fail_at = c; 59 *limit_fail_at = c;
53 goto undo; 60 if (!force)
61 break;
54 } 62 }
55 } 63 }
56 ret = 0; 64
57 goto done; 65 if (ret < 0 && !force) {
58undo: 66 for (u = counter; u != c; u = u->parent) {
59 for (u = counter; u != c; u = u->parent) { 67 spin_lock(&u->lock);
60 spin_lock(&u->lock); 68 res_counter_uncharge_locked(u, val);
61 res_counter_uncharge_locked(u, val); 69 spin_unlock(&u->lock);
62 spin_unlock(&u->lock); 70 }
63 } 71 }
64done:
65 local_irq_restore(flags); 72 local_irq_restore(flags);
73
66 return ret; 74 return ret;
67} 75}
68 76
77int res_counter_charge(struct res_counter *counter, unsigned long val,
78 struct res_counter **limit_fail_at)
79{
80 return __res_counter_charge(counter, val, limit_fail_at, false);
81}
82
69int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, 83int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
70 struct res_counter **limit_fail_at) 84 struct res_counter **limit_fail_at)
71{ 85{
72 int ret, r; 86 return __res_counter_charge(counter, val, limit_fail_at, true);
73 unsigned long flags;
74 struct res_counter *c;
75
76 r = ret = 0;
77 *limit_fail_at = NULL;
78 local_irq_save(flags);
79 for (c = counter; c != NULL; c = c->parent) {
80 spin_lock(&c->lock);
81 r = res_counter_charge_locked(c, val);
82 if (r)
83 c->usage += val;
84 spin_unlock(&c->lock);
85 if (r < 0 && ret == 0) {
86 *limit_fail_at = c;
87 ret = r;
88 }
89 }
90 local_irq_restore(flags);
91
92 return ret;
93} 87}
88
94void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
95{ 90{
96 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
@@ -99,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
99 counter->usage -= val; 94 counter->usage -= val;
100} 95}
101 96
102void res_counter_uncharge(struct res_counter *counter, unsigned long val) 97void res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top,
99 unsigned long val)
103{ 100{
104 unsigned long flags; 101 unsigned long flags;
105 struct res_counter *c; 102 struct res_counter *c;
106 103
107 local_irq_save(flags); 104 local_irq_save(flags);
108 for (c = counter; c != NULL; c = c->parent) { 105 for (c = counter; c != top; c = c->parent) {
109 spin_lock(&c->lock); 106 spin_lock(&c->lock);
110 res_counter_uncharge_locked(c, val); 107 res_counter_uncharge_locked(c, val);
111 spin_unlock(&c->lock); 108 spin_unlock(&c->lock);
@@ -113,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
113 local_irq_restore(flags); 110 local_irq_restore(flags);
114} 111}
115 112
113void res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{
115 res_counter_uncharge_until(counter, NULL, val);
116}
116 117
117static inline unsigned long long * 118static inline unsigned long long *
118res_counter_member(struct res_counter *counter, int member) 119res_counter_member(struct res_counter *counter, int member)
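
Editor's note: the res_counter rework above folds the charge and charge_nofail paths into one walk over the parent chain, distinguished by a force flag, and adds uncharge_until() to stop the uncharge walk at an ancestor. The following user-space sketch models that logic only (no spinlocks or IRQ handling); struct model_counter and the model_* helpers are invented, and the limits in main() are arbitrary.

    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <errno.h>

    struct model_counter {
        unsigned long usage, limit, failcnt;
        struct model_counter *parent;
    };

    /* Charge one level; with force, record the failure but charge anyway. */
    static int model_charge_one(struct model_counter *c, unsigned long val, bool force)
    {
        int ret = 0;

        if (c->usage + val > c->limit) {
            c->failcnt++;
            ret = -ENOMEM;
            if (!force)
                return ret;
        }
        c->usage += val;
        return ret;
    }

    /* Walk up the parent chain, rolling back on failure unless forced. */
    static int model_charge(struct model_counter *counter, unsigned long val, bool force)
    {
        struct model_counter *c, *u;
        int ret = 0, r;

        for (c = counter; c != NULL; c = c->parent) {
            r = model_charge_one(c, val, force);
            if (r < 0 && !ret) {
                ret = r;
                if (!force)
                    break;
            }
        }
        if (ret < 0 && !force) {
            for (u = counter; u != c; u = u->parent)
                u->usage -= val;    /* undo the partial charges */
        }
        return ret;
    }

    /* Uncharge from counter up to (but excluding) top; NULL means the root. */
    static void model_uncharge_until(struct model_counter *counter,
                                     struct model_counter *top, unsigned long val)
    {
        struct model_counter *c;

        for (c = counter; c != top; c = c->parent)
            c->usage -= val;
    }

    int main(void)
    {
        struct model_counter root = { .limit = 100 };
        struct model_counter child = { .limit = 10, .parent = &root };

        printf("charge 5: %d\n", model_charge(&child, 5, false));   /* 0 */
        printf("charge 20: %d\n", model_charge(&child, 20, false)); /* -ENOMEM, rolled back */
        printf("forced 20: %d\n", model_charge(&child, 20, true));  /* -ENOMEM, but charged */
        model_uncharge_until(&child, &root, 20);  /* drop the child's share only */
        printf("child=%lu root=%lu\n", child.usage, root.usage);
        return 0;
    }

In the patched kernel the forced walk corresponds to res_counter_charge_nofail(), and passing NULL as top makes the uncharge walk cover the whole chain, which is how res_counter_uncharge() is now implemented.
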
diff --git a/kernel/resource.c b/kernel/resource.c
index 7e8ea66a8c01..e1d2b8ee76d5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -515,8 +515,8 @@ out:
515 * @root: root resource descriptor 515 * @root: root resource descriptor
516 * @new: resource descriptor desired by caller 516 * @new: resource descriptor desired by caller
517 * @size: requested resource region size 517 * @size: requested resource region size
518 * @min: minimum size to allocate 518 * @min: minimum boundary to allocate
519 * @max: maximum size to allocate 519 * @max: maximum boundary to allocate
520 * @align: alignment requested, in bytes 520 * @align: alignment requested, in bytes
521 * @alignf: alignment function, optional, called if not NULL 521 * @alignf: alignment function, optional, called if not NULL
522 * @alignf_data: arbitrary data to pass to the @alignf function 522 * @alignf_data: arbitrary data to pass to the @alignf function
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a3..173ea52f3af0 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533a688ce22..c46958e26121 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
141#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
142 #name , 143 #name ,
143 144
144static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
145#include "features.h" 146#include "features.h"
146 NULL
147}; 147};
148 148
149#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 692}
693#endif 693#endif
694 694
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
698{ 696{
699 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2083,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2083#endif 2081#endif
2084 2082
2085 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2085 switch_to(prev, next, prev);
2087 2086
2088 barrier(); 2087 barrier();
@@ -2486,22 +2485,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2485 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2486 * every tick. We fix it up based on jiffies.
2488 */ 2487 */
2489void update_cpu_load(struct rq *this_rq) 2488static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2489 unsigned long pending_updates)
2490{ 2490{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2491 int i, scale;
2495 2492
2496 this_rq->nr_load_updates++; 2493 this_rq->nr_load_updates++;
2497 2494
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2495 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2496 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2497 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2516,78 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2516 sched_avg_update(this_rq);
2527} 2517}
2528 2518
2519#ifdef CONFIG_NO_HZ
2520/*
2521 * There is no sane way to deal with nohz on smp when using jiffies because the
2522 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2523 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2524 *
2525 * Therefore we cannot use the delta approach from the regular tick since that
2526 * would seriously skew the load calculation. However we'll make do for those
2527 * updates happening while idle (nohz_idle_balance) or coming out of idle
2528 * (tick_nohz_idle_exit).
2529 *
2530 * This means we might still be one tick off for nohz periods.
2531 */
2532
2533/*
2534 * Called from nohz_idle_balance() to update the load ratings before doing the
2535 * idle balance.
2536 */
2537void update_idle_cpu_load(struct rq *this_rq)
2538{
2539 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2540 unsigned long load = this_rq->load.weight;
2541 unsigned long pending_updates;
2542
2543 /*
2544 * bail if there's load or we're actually up-to-date.
2545 */
2546 if (load || curr_jiffies == this_rq->last_load_update_tick)
2547 return;
2548
2549 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2550 this_rq->last_load_update_tick = curr_jiffies;
2551
2552 __update_cpu_load(this_rq, load, pending_updates);
2553}
2554
2555/*
2556 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2557 */
2558void update_cpu_load_nohz(void)
2559{
2560 struct rq *this_rq = this_rq();
2561 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2562 unsigned long pending_updates;
2563
2564 if (curr_jiffies == this_rq->last_load_update_tick)
2565 return;
2566
2567 raw_spin_lock(&this_rq->lock);
2568 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2569 if (pending_updates) {
2570 this_rq->last_load_update_tick = curr_jiffies;
2571 /*
2572 * We were idle, this means load 0, the current load might be
2573 * !0 due to remote wakeups and the sort.
2574 */
2575 __update_cpu_load(this_rq, 0, pending_updates);
2576 }
2577 raw_spin_unlock(&this_rq->lock);
2578}
2579#endif /* CONFIG_NO_HZ */
2580
2581/*
2582 * Called from scheduler_tick()
2583 */
2529static void update_cpu_load_active(struct rq *this_rq) 2584static void update_cpu_load_active(struct rq *this_rq)
2530{ 2585{
2531 update_cpu_load(this_rq); 2586 /*
2587 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2588 */
2589 this_rq->last_load_update_tick = jiffies;
2590 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2591
2533 calc_load_account_active(this_rq); 2592 calc_load_account_active(this_rq);
2534} 2593}
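
Editor's note: the point of pending_updates in the new nohz paths is that cpu_load[] holds exponential averages updated per tick, so after skipping N ticks while idle the slow indexes must be decayed as if N ticks of zero load had been fed in. Here is a rough user-space model of that decay, assuming the usual (2^i - 1)/2^i per-tick coefficient; the floating-point decay_missed() merely stands in for the kernel's table-driven decay_load_missed(), and the model_* names and numbers are invented.

    #include <stdio.h>
    #include <math.h>

    #define LOAD_IDX_MAX 5

    static unsigned long cpu_load[LOAD_IDX_MAX];

    /* Apply the per-tick decay of index idx 'missed' times in one step. */
    static unsigned long decay_missed(unsigned long load, unsigned long missed, int idx)
    {
        double factor = (double)((1 << idx) - 1) / (1 << idx);

        return (unsigned long)(load * pow(factor, missed));
    }

    /* Model of __update_cpu_load(): fold in this_load after pending_updates ticks. */
    static void model_update_cpu_load(unsigned long this_load, unsigned long pending_updates)
    {
        int i, scale;

        cpu_load[0] = this_load;    /* index 0 tracks the instantaneous load */
        for (i = 1, scale = 2; i < LOAD_IDX_MAX; i++, scale += scale) {
            unsigned long old_load = decay_missed(cpu_load[i], pending_updates - 1, i);
            unsigned long new_load = this_load;

            cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
        }
    }

    int main(void)
    {
        int t;

        for (t = 0; t < 10; t++)        /* busy ticks at load 1024 */
            model_update_cpu_load(1024, 1);
        model_update_cpu_load(0, 50);   /* waking up after 50 idle ticks */
        for (t = 0; t < LOAD_IDX_MAX; t++)
            printf("cpu_load[%d] = %lu\n", t, cpu_load[t]);
        return 0;
    }

With pending_updates = 1 the extra decay is a no-op, so the regular tick path behaves as before; the nohz callers pass the number of jiffies actually skipped, which is the skew the new accounting is there to avoid.
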
@@ -3113,6 +3172,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3113 if (irqs_disabled()) 3172 if (irqs_disabled())
3114 print_irqtrace_events(prev); 3173 print_irqtrace_events(prev);
3115 dump_stack(); 3174 dump_stack();
3175 add_taint(TAINT_WARN);
3116} 3176}
3117 3177
3118/* 3178/*
@@ -4042,11 +4102,8 @@ static bool check_same_owner(struct task_struct *p)
4042 4102
4043 rcu_read_lock(); 4103 rcu_read_lock();
4044 pcred = __task_cred(p); 4104 pcred = __task_cred(p);
4045 if (cred->user->user_ns == pcred->user->user_ns) 4105 match = (uid_eq(cred->euid, pcred->euid) ||
4046 match = (cred->euid == pcred->euid || 4106 uid_eq(cred->euid, pcred->uid));
4047 cred->euid == pcred->uid);
4048 else
4049 match = false;
4050 rcu_read_unlock(); 4107 rcu_read_unlock();
4051 return match; 4108 return match;
4052} 4109}
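
Editor's note: the check_same_owner() hunk drops the explicit user_ns comparison because the credentials now carry kuid_t values already mapped into a single global space, so uid_eq() alone is a meaningful comparison. As a rough illustration of why a typed wrapper plus an explicit comparator helps, here is a tiny user-space model; model_kuid_t and model_uid_eq() are invented stand-ins, not the kernel types.

    #include <stdio.h>
    #include <stdbool.h>

    /* A struct wrapper prevents accidental raw integer comparisons. */
    typedef struct { unsigned int val; } model_kuid_t;

    static bool model_uid_eq(model_kuid_t a, model_kuid_t b)
    {
        return a.val == b.val;
    }

    int main(void)
    {
        model_kuid_t root = { 0 }, user = { 1000 };

        printf("%d %d\n", model_uid_eq(root, root), model_uid_eq(root, user));
        return 0;
    }
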
@@ -4957,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4957 p->sched_class->set_cpus_allowed(p, new_mask); 5014 p->sched_class->set_cpus_allowed(p, new_mask);
4958 5015
4959 cpumask_copy(&p->cpus_allowed, new_mask); 5016 cpumask_copy(&p->cpus_allowed, new_mask);
4960 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5017 p->nr_cpus_allowed = cpumask_weight(new_mask);
4961} 5018}
4962 5019
4963/* 5020/*
@@ -5560,7 +5617,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5560 break; 5617 break;
5561 } 5618 }
5562 5619
5563 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5620 if (!(sd->flags & SD_OVERLAP) &&
5621 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5564 printk(KERN_CONT "\n"); 5622 printk(KERN_CONT "\n");
5565 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5623 printk(KERN_ERR "ERROR: repeated CPUs\n");
5566 break; 5624 break;
@@ -5898,99 +5956,11 @@ static int __init isolated_cpu_setup(char *str)
5898 5956
5899__setup("isolcpus=", isolated_cpu_setup); 5957__setup("isolcpus=", isolated_cpu_setup);
5900 5958
5901#ifdef CONFIG_NUMA
5902
5903/**
5904 * find_next_best_node - find the next node to include in a sched_domain
5905 * @node: node whose sched_domain we're building
5906 * @used_nodes: nodes already in the sched_domain
5907 *
5908 * Find the next node to include in a given scheduling domain. Simply
5909 * finds the closest node not already in the @used_nodes map.
5910 *
5911 * Should use nodemask_t.
5912 */
5913static int find_next_best_node(int node, nodemask_t *used_nodes)
5914{
5915 int i, n, val, min_val, best_node = -1;
5916
5917 min_val = INT_MAX;
5918
5919 for (i = 0; i < nr_node_ids; i++) {
5920 /* Start at @node */
5921 n = (node + i) % nr_node_ids;
5922
5923 if (!nr_cpus_node(n))
5924 continue;
5925
5926 /* Skip already used nodes */
5927 if (node_isset(n, *used_nodes))
5928 continue;
5929
5930 /* Simple min distance search */
5931 val = node_distance(node, n);
5932
5933 if (val < min_val) {
5934 min_val = val;
5935 best_node = n;
5936 }
5937 }
5938
5939 if (best_node != -1)
5940 node_set(best_node, *used_nodes);
5941 return best_node;
5942}
5943
5944/**
5945 * sched_domain_node_span - get a cpumask for a node's sched_domain
5946 * @node: node whose cpumask we're constructing
5947 * @span: resulting cpumask
5948 *
5949 * Given a node, construct a good cpumask for its sched_domain to span. It
5950 * should be one that prevents unnecessary balancing, but also spreads tasks
5951 * out optimally.
5952 */
5953static void sched_domain_node_span(int node, struct cpumask *span)
5954{
5955 nodemask_t used_nodes;
5956 int i;
5957
5958 cpumask_clear(span);
5959 nodes_clear(used_nodes);
5960
5961 cpumask_or(span, span, cpumask_of_node(node));
5962 node_set(node, used_nodes);
5963
5964 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5965 int next_node = find_next_best_node(node, &used_nodes);
5966 if (next_node < 0)
5967 break;
5968 cpumask_or(span, span, cpumask_of_node(next_node));
5969 }
5970}
5971
5972static const struct cpumask *cpu_node_mask(int cpu)
5973{
5974 lockdep_assert_held(&sched_domains_mutex);
5975
5976 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5977
5978 return sched_domains_tmpmask;
5979}
5980
5981static const struct cpumask *cpu_allnodes_mask(int cpu)
5982{
5983 return cpu_possible_mask;
5984}
5985#endif /* CONFIG_NUMA */
5986
5987static const struct cpumask *cpu_cpu_mask(int cpu) 5959static const struct cpumask *cpu_cpu_mask(int cpu)
5988{ 5960{
5989 return cpumask_of_node(cpu_to_node(cpu)); 5961 return cpumask_of_node(cpu_to_node(cpu));
5990} 5962}
5991 5963
5992int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5993
5994struct sd_data { 5964struct sd_data {
5995 struct sched_domain **__percpu sd; 5965 struct sched_domain **__percpu sd;
5996 struct sched_group **__percpu sg; 5966 struct sched_group **__percpu sg;
@@ -6020,6 +5990,7 @@ struct sched_domain_topology_level {
6020 sched_domain_init_f init; 5990 sched_domain_init_f init;
6021 sched_domain_mask_f mask; 5991 sched_domain_mask_f mask;
6022 int flags; 5992 int flags;
5993 int numa_level;
6023 struct sd_data data; 5994 struct sd_data data;
6024}; 5995};
6025 5996
@@ -6058,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6058 6029
6059 cpumask_or(covered, covered, sg_span); 6030 cpumask_or(covered, covered, sg_span);
6060 6031
6061 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6032 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6062 atomic_inc(&sg->sgp->ref); 6033 atomic_inc(&sg->sgp->ref);
6063 6034
6064 if (cpumask_test_cpu(cpu, sg_span)) 6035 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6036 cpumask_first(sg_span) == cpu) {
6037 WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
6065 groups = sg; 6038 groups = sg;
6039 }
6066 6040
6067 if (!first) 6041 if (!first)
6068 first = sg; 6042 first = sg;
@@ -6211,10 +6185,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6211} 6185}
6212 6186
6213SD_INIT_FUNC(CPU) 6187SD_INIT_FUNC(CPU)
6214#ifdef CONFIG_NUMA
6215 SD_INIT_FUNC(ALLNODES)
6216 SD_INIT_FUNC(NODE)
6217#endif
6218#ifdef CONFIG_SCHED_SMT 6188#ifdef CONFIG_SCHED_SMT
6219 SD_INIT_FUNC(SIBLING) 6189 SD_INIT_FUNC(SIBLING)
6220#endif 6190#endif
@@ -6336,15 +6306,184 @@ static struct sched_domain_topology_level default_topology[] = {
6336 { sd_init_BOOK, cpu_book_mask, }, 6306 { sd_init_BOOK, cpu_book_mask, },
6337#endif 6307#endif
6338 { sd_init_CPU, cpu_cpu_mask, }, 6308 { sd_init_CPU, cpu_cpu_mask, },
6339#ifdef CONFIG_NUMA
6340 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6341 { sd_init_ALLNODES, cpu_allnodes_mask, },
6342#endif
6343 { NULL, }, 6309 { NULL, },
6344}; 6310};
6345 6311
6346static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6312static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6347 6313
6314#ifdef CONFIG_NUMA
6315
6316static int sched_domains_numa_levels;
6317static int sched_domains_numa_scale;
6318static int *sched_domains_numa_distance;
6319static struct cpumask ***sched_domains_numa_masks;
6320static int sched_domains_curr_level;
6321
6322static inline int sd_local_flags(int level)
6323{
6324 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
6325 return 0;
6326
6327 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6328}
6329
6330static struct sched_domain *
6331sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6332{
6333 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6334 int level = tl->numa_level;
6335 int sd_weight = cpumask_weight(
6336 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6337
6338 *sd = (struct sched_domain){
6339 .min_interval = sd_weight,
6340 .max_interval = 2*sd_weight,
6341 .busy_factor = 32,
6342 .imbalance_pct = 125,
6343 .cache_nice_tries = 2,
6344 .busy_idx = 3,
6345 .idle_idx = 2,
6346 .newidle_idx = 0,
6347 .wake_idx = 0,
6348 .forkexec_idx = 0,
6349
6350 .flags = 1*SD_LOAD_BALANCE
6351 | 1*SD_BALANCE_NEWIDLE
6352 | 0*SD_BALANCE_EXEC
6353 | 0*SD_BALANCE_FORK
6354 | 0*SD_BALANCE_WAKE
6355 | 0*SD_WAKE_AFFINE
6356 | 0*SD_PREFER_LOCAL
6357 | 0*SD_SHARE_CPUPOWER
6358 | 0*SD_SHARE_PKG_RESOURCES
6359 | 1*SD_SERIALIZE
6360 | 0*SD_PREFER_SIBLING
6361 | sd_local_flags(level)
6362 ,
6363 .last_balance = jiffies,
6364 .balance_interval = sd_weight,
6365 };
6366 SD_INIT_NAME(sd, NUMA);
6367 sd->private = &tl->data;
6368
6369 /*
6370 * Ugly hack to pass state to sd_numa_mask()...
6371 */
6372 sched_domains_curr_level = tl->numa_level;
6373
6374 return sd;
6375}
6376
6377static const struct cpumask *sd_numa_mask(int cpu)
6378{
6379 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6380}
6381
6382static void sched_init_numa(void)
6383{
6384 int next_distance, curr_distance = node_distance(0, 0);
6385 struct sched_domain_topology_level *tl;
6386 int level = 0;
6387 int i, j, k;
6388
6389 sched_domains_numa_scale = curr_distance;
6390 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6391 if (!sched_domains_numa_distance)
6392 return;
6393
6394 /*
6395 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6396 * unique distances in the node_distance() table.
6397 *
6398 * Assumes node_distance(0,j) includes all distances in
6399 * node_distance(i,j) in order to avoid cubic time.
6400 *
6401 * XXX: could be optimized to O(n log n) by using sort()
6402 */
6403 next_distance = curr_distance;
6404 for (i = 0; i < nr_node_ids; i++) {
6405 for (j = 0; j < nr_node_ids; j++) {
6406 int distance = node_distance(0, j);
6407 if (distance > curr_distance &&
6408 (distance < next_distance ||
6409 next_distance == curr_distance))
6410 next_distance = distance;
6411 }
6412 if (next_distance != curr_distance) {
6413 sched_domains_numa_distance[level++] = next_distance;
6414 sched_domains_numa_levels = level;
6415 curr_distance = next_distance;
6416 } else break;
6417 }
6418 /*
6419 * 'level' contains the number of unique distances, excluding the
6420 * identity distance node_distance(i,i).
6421 *
 6422 * The sched_domains_numa_distance[] array includes the actual distance
6423 * numbers.
6424 */
6425
6426 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6427 if (!sched_domains_numa_masks)
6428 return;
6429
6430 /*
6431 * Now for each level, construct a mask per node which contains all
6432 * cpus of nodes that are that many hops away from us.
6433 */
6434 for (i = 0; i < level; i++) {
6435 sched_domains_numa_masks[i] =
6436 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6437 if (!sched_domains_numa_masks[i])
6438 return;
6439
6440 for (j = 0; j < nr_node_ids; j++) {
6441 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6442 if (!mask)
6443 return;
6444
6445 sched_domains_numa_masks[i][j] = mask;
6446
6447 for (k = 0; k < nr_node_ids; k++) {
6448 if (node_distance(j, k) > sched_domains_numa_distance[i])
6449 continue;
6450
6451 cpumask_or(mask, mask, cpumask_of_node(k));
6452 }
6453 }
6454 }
6455
6456 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6457 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6458 if (!tl)
6459 return;
6460
6461 /*
6462 * Copy the default topology bits..
6463 */
6464 for (i = 0; default_topology[i].init; i++)
6465 tl[i] = default_topology[i];
6466
6467 /*
6468 * .. and append 'j' levels of NUMA goodness.
6469 */
6470 for (j = 0; j < level; i++, j++) {
6471 tl[i] = (struct sched_domain_topology_level){
6472 .init = sd_numa_init,
6473 .mask = sd_numa_mask,
6474 .flags = SDTL_OVERLAP,
6475 .numa_level = j,
6476 };
6477 }
6478
6479 sched_domain_topology = tl;
6480}
6481#else
6482static inline void sched_init_numa(void)
6483{
6484}
6485#endif /* CONFIG_NUMA */
6486
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6487static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6488{
6350 struct sched_domain_topology_level *tl; 6489 struct sched_domain_topology_level *tl;
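
Editor's note: sched_init_numa() above does two things: it extracts the sorted set of unique node distances (the O(nr_nodes^2) deduplicating pass described in the comment) and then, per distance level, builds for every node a mask of all nodes within that distance. The user-space sketch below walks the same two steps over an invented 4-node distance table; it prints node sets rather than cpumasks, and all names and values are illustrative only.

    #include <stdio.h>

    #define NR_NODES 4

    /* Invented node_distance() table (symmetric, 10 = local distance). */
    static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
    };

    int main(void)
    {
        int levels[NR_NODES], nr_levels = 0;
        int curr = dist[0][0], next;
        int i, j, k;

        /* Step 1: deduplicating selection of distances above the local one. */
        for (;;) {
            next = curr;
            for (j = 0; j < NR_NODES; j++) {
                int d = dist[0][j];
                if (d > curr && (d < next || next == curr))
                    next = d;
            }
            if (next == curr)
                break;
            levels[nr_levels++] = next;
            curr = next;
        }

        /* Step 2: per level, each node's mask = nodes within that distance. */
        for (i = 0; i < nr_levels; i++) {
            printf("level %d (distance <= %d):\n", i, levels[i]);
            for (j = 0; j < NR_NODES; j++) {
                printf("  node %d: {", j);
                for (k = 0; k < NR_NODES; k++)
                    if (dist[j][k] <= levels[i])
                        printf(" %d", k);
                printf(" }\n");
            }
        }
        return 0;
    }

In the real function each mask is a cpumask assembled with cpumask_or(mask, mask, cpumask_of_node(k)), and one extra topology level using sd_numa_init/sd_numa_mask is appended per distance found.
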
@@ -6382,6 +6521,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6382 if (!sg) 6521 if (!sg)
6383 return -ENOMEM; 6522 return -ENOMEM;
6384 6523
6524 sg->next = sg;
6525
6385 *per_cpu_ptr(sdd->sg, j) = sg; 6526 *per_cpu_ptr(sdd->sg, j) = sg;
6386 6527
6387 sgp = kzalloc_node(sizeof(struct sched_group_power), 6528 sgp = kzalloc_node(sizeof(struct sched_group_power),
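
Editor's note: the one-line `sg->next = sg;` added to __sdt_alloc() makes every freshly allocated sched_group a valid single-element circular list, so do/while walks over the group ring are safe even before the group is spliced into a larger ring. A minimal model of that invariant, with invented model_* names:

    #include <stdio.h>

    struct model_group {
        int id;
        struct model_group *next;
    };

    /* A fresh group points at itself, as in the __sdt_alloc() change. */
    static void model_group_init(struct model_group *g, int id)
    {
        g->id = id;
        g->next = g;
    }

    int main(void)
    {
        struct model_group g;
        struct model_group *it;

        model_group_init(&g, 0);
        it = &g;
        do {
            printf("group %d\n", it->id);
            it = it->next;
        } while (it != &g);    /* visits the single group exactly once */
        return 0;
    }
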
@@ -6585,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6585 if (!doms_cur) 6726 if (!doms_cur)
6586 doms_cur = &fallback_doms; 6727 doms_cur = &fallback_doms;
6587 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6728 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6588 dattr_cur = NULL;
6589 err = build_sched_domains(doms_cur[0], NULL); 6729 err = build_sched_domains(doms_cur[0], NULL);
6590 register_sched_domain_sysctl(); 6730 register_sched_domain_sysctl();
6591 6731
@@ -6710,97 +6850,6 @@ match2:
6710 mutex_unlock(&sched_domains_mutex); 6850 mutex_unlock(&sched_domains_mutex);
6711} 6851}
6712 6852
6713#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6714static void reinit_sched_domains(void)
6715{
6716 get_online_cpus();
6717
6718 /* Destroy domains first to force the rebuild */
6719 partition_sched_domains(0, NULL, NULL);
6720
6721 rebuild_sched_domains();
6722 put_online_cpus();
6723}
6724
6725static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6726{
6727 unsigned int level = 0;
6728
6729 if (sscanf(buf, "%u", &level) != 1)
6730 return -EINVAL;
6731
6732 /*
6733 * level is always be positive so don't check for
6734 * level < POWERSAVINGS_BALANCE_NONE which is 0
6735 * What happens on 0 or 1 byte write,
6736 * need to check for count as well?
6737 */
6738
6739 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6740 return -EINVAL;
6741
6742 if (smt)
6743 sched_smt_power_savings = level;
6744 else
6745 sched_mc_power_savings = level;
6746
6747 reinit_sched_domains();
6748
6749 return count;
6750}
6751
6752#ifdef CONFIG_SCHED_MC
6753static ssize_t sched_mc_power_savings_show(struct device *dev,
6754 struct device_attribute *attr,
6755 char *buf)
6756{
6757 return sprintf(buf, "%u\n", sched_mc_power_savings);
6758}
6759static ssize_t sched_mc_power_savings_store(struct device *dev,
6760 struct device_attribute *attr,
6761 const char *buf, size_t count)
6762{
6763 return sched_power_savings_store(buf, count, 0);
6764}
6765static DEVICE_ATTR(sched_mc_power_savings, 0644,
6766 sched_mc_power_savings_show,
6767 sched_mc_power_savings_store);
6768#endif
6769
6770#ifdef CONFIG_SCHED_SMT
6771static ssize_t sched_smt_power_savings_show(struct device *dev,
6772 struct device_attribute *attr,
6773 char *buf)
6774{
6775 return sprintf(buf, "%u\n", sched_smt_power_savings);
6776}
6777static ssize_t sched_smt_power_savings_store(struct device *dev,
6778 struct device_attribute *attr,
6779 const char *buf, size_t count)
6780{
6781 return sched_power_savings_store(buf, count, 1);
6782}
6783static DEVICE_ATTR(sched_smt_power_savings, 0644,
6784 sched_smt_power_savings_show,
6785 sched_smt_power_savings_store);
6786#endif
6787
6788int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6789{
6790 int err = 0;
6791
6792#ifdef CONFIG_SCHED_SMT
6793 if (smt_capable())
6794 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6795#endif
6796#ifdef CONFIG_SCHED_MC
6797 if (!err && mc_capable())
6798 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6799#endif
6800 return err;
6801}
6802#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6803
6804/* 6853/*
6805 * Update cpusets according to cpu_active mask. If cpusets are 6854 * Update cpusets according to cpu_active mask. If cpusets are
6806 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6855 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6838,6 +6887,8 @@ void __init sched_init_smp(void)
6838 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6887 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6839 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6888 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6840 6889
6890 sched_init_numa();
6891
6841 get_online_cpus(); 6892 get_online_cpus();
6842 mutex_lock(&sched_domains_mutex); 6893 mutex_lock(&sched_domains_mutex);
6843 init_sched_domains(cpu_active_mask); 6894 init_sched_domains(cpu_active_mask);
@@ -7059,6 +7110,7 @@ void __init sched_init(void)
7059 /* May be allocated at isolcpus cmdline parse time */ 7110 /* May be allocated at isolcpus cmdline parse time */
7060 if (cpu_isolated_map == NULL) 7111 if (cpu_isolated_map == NULL)
7061 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7112 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7113 idle_thread_set_boot_cpu();
7062#endif 7114#endif
7063 init_sched_fair_class(); 7115 init_sched_fair_class();
7064 7116
@@ -7980,13 +8032,9 @@ static struct cftype cpu_files[] = {
7980 .write_u64 = cpu_rt_period_write_uint, 8032 .write_u64 = cpu_rt_period_write_uint,
7981 }, 8033 },
7982#endif 8034#endif
8035 { } /* terminate */
7983}; 8036};
7984 8037
7985static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7986{
7987 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7988}
7989
7990struct cgroup_subsys cpu_cgroup_subsys = { 8038struct cgroup_subsys cpu_cgroup_subsys = {
7991 .name = "cpu", 8039 .name = "cpu",
7992 .create = cpu_cgroup_create, 8040 .create = cpu_cgroup_create,
@@ -7994,8 +8042,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7994 .can_attach = cpu_cgroup_can_attach, 8042 .can_attach = cpu_cgroup_can_attach,
7995 .attach = cpu_cgroup_attach, 8043 .attach = cpu_cgroup_attach,
7996 .exit = cpu_cgroup_exit, 8044 .exit = cpu_cgroup_exit,
7997 .populate = cpu_cgroup_populate,
7998 .subsys_id = cpu_cgroup_subsys_id, 8045 .subsys_id = cpu_cgroup_subsys_id,
8046 .base_cftypes = cpu_files,
7999 .early_init = 1, 8047 .early_init = 1,
8000}; 8048};
8001 8049
@@ -8180,13 +8228,9 @@ static struct cftype files[] = {
8180 .name = "stat", 8228 .name = "stat",
8181 .read_map = cpuacct_stats_show, 8229 .read_map = cpuacct_stats_show,
8182 }, 8230 },
8231 { } /* terminate */
8183}; 8232};
8184 8233
8185static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8186{
8187 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8188}
8189
8190/* 8234/*
8191 * charge this task's execution time to its accounting group. 8235 * charge this task's execution time to its accounting group.
8192 * 8236 *
@@ -8218,7 +8262,7 @@ struct cgroup_subsys cpuacct_subsys = {
8218 .name = "cpuacct", 8262 .name = "cpuacct",
8219 .create = cpuacct_create, 8263 .create = cpuacct_create,
8220 .destroy = cpuacct_destroy, 8264 .destroy = cpuacct_destroy,
8221 .populate = cpuacct_populate,
8222 .subsys_id = cpuacct_subsys_id, 8265 .subsys_id = cpuacct_subsys_id,
8266 .base_cftypes = files,
8223}; 8267};
8224#endif /* CONFIG_CGROUP_CPUACCT */ 8268#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
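
Editor's note: the reworked P(x) macro picks the printf format from sizeof(rq->x) instead of always casting to long long. A standalone sketch of the same compile-time size dispatch, using an invented struct model_rq (the field choice is illustrative, and the sketch uses standard %lld rather than the kernel's %Ld):

    #include <stdio.h>

    /* Choose the format from the field's size, as the new P(x) macro does. */
    #define PRINT_FIELD(ptr, field)                                          \
    do {                                                                     \
        if (sizeof((ptr)->field) == 4)                                       \
            printf("  .%-30s: %ld\n", #field, (long)((ptr)->field));         \
        else                                                                 \
            printf("  .%-30s: %lld\n", #field, (long long)((ptr)->field));   \
    } while (0)

    struct model_rq {
        unsigned int nr_running;        /* 4 bytes */
        unsigned long long clock;       /* 8 bytes */
    };

    int main(void)
    {
        struct model_rq rq = { .nr_running = 3, .clock = 123456789ULL };

        PRINT_FIELD(&rq, nr_running);
        PRINT_FIELD(&rq, clock);
        return 0;
    }
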
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..b2a2d236f27b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2721 * If power savings logic is enabled for a domain, see if we 2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider. 2722 * are not overloaded, if so, don't balance wider.
2723 */ 2723 */
2724 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { 2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0; 2725 unsigned long power = 0;
2726 unsigned long nr_running = 0; 2726 unsigned long nr_running = 0;
2727 unsigned long capacity; 2727 unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2734 2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736 2736
2737 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2738 nr_running /= 2;
2739
2740 if (nr_running < capacity) 2737 if (nr_running < capacity)
2741 want_sd = 0; 2738 want_sd = 0;
2742 } 2739 }
@@ -3082,7 +3079,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3079 struct rq *dst_rq;
3083 3080
3084 enum cpu_idle_type idle; 3081 enum cpu_idle_type idle;
3085 long load_move; 3082 long imbalance;
3086 unsigned int flags; 3083 unsigned int flags;
3087 3084
3088 unsigned int loop; 3085 unsigned int loop;
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p);
3218static const unsigned int sched_nr_migrate_break = 32; 3215static const unsigned int sched_nr_migrate_break = 32;
3219 3216
3220/* 3217/*
3221 * move_tasks tries to move up to load_move weighted load from busiest to 3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3222 * this_rq, as part of a balancing operation within domain "sd". 3219 * this_rq, as part of a balancing operation within domain "sd".
3223 * Returns 1 if successful and 0 otherwise. 3220 * Returns 1 if successful and 0 otherwise.
3224 * 3221 *
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
3231 unsigned long load; 3228 unsigned long load;
3232 int pulled = 0; 3229 int pulled = 0;
3233 3230
3234 if (env->load_move <= 0) 3231 if (env->imbalance <= 0)
3235 return 0; 3232 return 0;
3236 3233
3237 while (!list_empty(tasks)) { 3234 while (!list_empty(tasks)) {
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env)
3257 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) 3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3258 goto next; 3255 goto next;
3259 3256
3260 if ((load / 2) > env->load_move) 3257 if ((load / 2) > env->imbalance)
3261 goto next; 3258 goto next;
3262 3259
3263 if (!can_migrate_task(p, env)) 3260 if (!can_migrate_task(p, env))
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
3265 3262
3266 move_task(p, env); 3263 move_task(p, env);
3267 pulled++; 3264 pulled++;
3268 env->load_move -= load; 3265 env->imbalance -= load;
3269 3266
3270#ifdef CONFIG_PREEMPT 3267#ifdef CONFIG_PREEMPT
3271 /* 3268 /*
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
3281 * We only want to steal up to the prescribed amount of 3278 * We only want to steal up to the prescribed amount of
3282 * weighted load. 3279 * weighted load.
3283 */ 3280 */
3284 if (env->load_move <= 0) 3281 if (env->imbalance <= 0)
3285 break; 3282 break;
3286 3283
3287 continue; 3284 continue;
@@ -3435,14 +3432,6 @@ struct sd_lb_stats {
3435 unsigned int busiest_group_weight; 3432 unsigned int busiest_group_weight;
3436 3433
3437 int group_imb; /* Is there imbalance in this sd */ 3434 int group_imb; /* Is there imbalance in this sd */
3438#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3439 int power_savings_balance; /* Is powersave balance needed for this sd */
3440 struct sched_group *group_min; /* Least loaded group in sd */
3441 struct sched_group *group_leader; /* Group which relieves group_min */
3442 unsigned long min_load_per_task; /* load_per_task in group_min */
3443 unsigned long leader_nr_running; /* Nr running of group_leader */
3444 unsigned long min_nr_running; /* Nr running of group_min */
3445#endif
3446}; 3435};
3447 3436
3448/* 3437/*
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
3486 return load_idx; 3475 return load_idx;
3487} 3476}
3488 3477
3489
3490#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3491/**
3492 * init_sd_power_savings_stats - Initialize power savings statistics for
3493 * the given sched_domain, during load balancing.
3494 *
3495 * @sd: Sched domain whose power-savings statistics are to be initialized.
3496 * @sds: Variable containing the statistics for sd.
3497 * @idle: Idle status of the CPU at which we're performing load-balancing.
3498 */
3499static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3500 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3501{
3502 /*
3503 * Busy processors will not participate in power savings
3504 * balance.
3505 */
3506 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3507 sds->power_savings_balance = 0;
3508 else {
3509 sds->power_savings_balance = 1;
3510 sds->min_nr_running = ULONG_MAX;
3511 sds->leader_nr_running = 0;
3512 }
3513}
3514
3515/**
3516 * update_sd_power_savings_stats - Update the power saving stats for a
3517 * sched_domain while performing load balancing.
3518 *
3519 * @group: sched_group belonging to the sched_domain under consideration.
3520 * @sds: Variable containing the statistics of the sched_domain
3521 * @local_group: Does group contain the CPU for which we're performing
3522 * load balancing ?
3523 * @sgs: Variable containing the statistics of the group.
3524 */
3525static inline void update_sd_power_savings_stats(struct sched_group *group,
3526 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3527{
3528
3529 if (!sds->power_savings_balance)
3530 return;
3531
3532 /*
3533 * If the local group is idle or completely loaded
3534 * no need to do power savings balance at this domain
3535 */
3536 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3537 !sds->this_nr_running))
3538 sds->power_savings_balance = 0;
3539
3540 /*
3541 * If a group is already running at full capacity or idle,
3542 * don't include that group in power savings calculations
3543 */
3544 if (!sds->power_savings_balance ||
3545 sgs->sum_nr_running >= sgs->group_capacity ||
3546 !sgs->sum_nr_running)
3547 return;
3548
3549 /*
3550 * Calculate the group which has the least non-idle load.
3551 * This is the group from where we need to pick up the load
3552 * for saving power
3553 */
3554 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3555 (sgs->sum_nr_running == sds->min_nr_running &&
3556 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3557 sds->group_min = group;
3558 sds->min_nr_running = sgs->sum_nr_running;
3559 sds->min_load_per_task = sgs->sum_weighted_load /
3560 sgs->sum_nr_running;
3561 }
3562
3563 /*
3564 * Calculate the group which is almost near its
3565 * capacity but still has some space to pick up some load
3566 * from other group and save more power
3567 */
3568 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3569 return;
3570
3571 if (sgs->sum_nr_running > sds->leader_nr_running ||
3572 (sgs->sum_nr_running == sds->leader_nr_running &&
3573 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3574 sds->group_leader = group;
3575 sds->leader_nr_running = sgs->sum_nr_running;
3576 }
3577}
3578
3579/**
3580 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3581 * @sds: Variable containing the statistics of the sched_domain
3582 * under consideration.
3583 * @this_cpu: Cpu at which we're currently performing load-balancing.
3584 * @imbalance: Variable to store the imbalance.
3585 *
3586 * Description:
3587 * Check if we have potential to perform some power-savings balance.
3588 * If yes, set the busiest group to be the least loaded group in the
3589 * sched_domain, so that it's CPUs can be put to idle.
3590 *
3591 * Returns 1 if there is potential to perform power-savings balance.
3592 * Else returns 0.
3593 */
3594static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3595 int this_cpu, unsigned long *imbalance)
3596{
3597 if (!sds->power_savings_balance)
3598 return 0;
3599
3600 if (sds->this != sds->group_leader ||
3601 sds->group_leader == sds->group_min)
3602 return 0;
3603
3604 *imbalance = sds->min_load_per_task;
3605 sds->busiest = sds->group_min;
3606
3607 return 1;
3608
3609}
3610#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3611static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3612 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3613{
3614 return;
3615}
3616
3617static inline void update_sd_power_savings_stats(struct sched_group *group,
3618 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3619{
3620 return;
3621}
3622
3623static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 int this_cpu, unsigned long *imbalance)
3625{
3626 return 0;
3627}
3628#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3629
3630
3631unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3632{ 3479{
3633 return SCHED_POWER_SCALE; 3480 return SCHED_POWER_SCALE;
@@ -3656,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3656unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3657{ 3504{
3658 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3659 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507
3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3660 3514
3661 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3515 total = sched_avg_period() + (rq->clock - age_stamp);
3662 3516
3663 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3664 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3665 available = 0; 3519 available = 0;
3666 } else { 3520 } else {
3667 available = total - rq->rt_avg; 3521 available = total - avg;
3668 } 3522 }
3669 3523
3670 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
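
Editor's note: the scale_rt_power() change snapshots rq->age_stamp and rq->rt_avg exactly once before using them, so the underflow check and the subtraction cannot observe two different values of rt_avg if it is updated concurrently. Below is a user-space sketch of that read-once pattern; it uses C11 relaxed atomic loads as a portable stand-in for ACCESS_ONCE() (an assumption -- the kernel macro is a volatile cast), and the model_scale_rt_power() name and numbers are invented.

    #include <stdio.h>
    #include <stdatomic.h>

    /* Fields that another thread may update concurrently. */
    static _Atomic unsigned long long rt_avg;
    static _Atomic unsigned long long age_stamp;

    static unsigned long long model_scale_rt_power(unsigned long long clock,
                                                   unsigned long long period)
    {
        /*
         * Read each shared variable exactly once, then do all arithmetic
         * and comparisons on the local copies.
         */
        unsigned long long stamp = atomic_load_explicit(&age_stamp, memory_order_relaxed);
        unsigned long long avg = atomic_load_explicit(&rt_avg, memory_order_relaxed);
        unsigned long long total = period + (clock - stamp);
        unsigned long long available = (total < avg) ? 0 : total - avg;

        return available;
    }

    int main(void)
    {
        atomic_store(&age_stamp, 1000);
        atomic_store(&rt_avg, 200);
        printf("available = %llu\n", model_scale_rt_power(1500, 1000)); /* 1300 */
        return 0;
    }
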
@@ -3727,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu)
3727 3581
3728 power = 0; 3582 power = 0;
3729 3583
3730 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3731 do { 3585 /*
3732 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3733 group = group->next; 3587 * span the current group.
3734 } while (group != child->groups); 3588 */
3589
3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3735 3604
3736 sdg->sgp->power = power; 3605 sdg->sgp->power = power;
3737} 3606}
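
Editor's note: the update_group_power() hunk distinguishes two ways of summing group power. For SD_OVERLAP domains the child groups need not span exactly the current group, so the sum must walk the group's own cpumask; otherwise the precomputed child-group totals can be reused. A small user-space sketch of the two summations over invented power values:

    #include <stdio.h>

    #define NR_CPUS 4

    /* Invented per-CPU "power" values for illustration. */
    static const unsigned long cpu_power[NR_CPUS] = { 1024, 1024, 512, 512 };

    /* Sum power over an explicit cpumask (the SD_OVERLAP path). */
    static unsigned long sum_by_cpumask(const int *mask)
    {
        unsigned long power = 0;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
            if (mask[cpu])
                power += cpu_power[cpu];
        return power;
    }

    /* Sum precomputed child-group totals; valid only when the children
     * partition the parent group, i.e. the !SD_OVERLAP case. */
    static unsigned long sum_by_child_groups(const unsigned long *child_power, int nr_children)
    {
        unsigned long power = 0;
        int i;

        for (i = 0; i < nr_children; i++)
            power += child_power[i];
        return power;
    }

    int main(void)
    {
        int parent_mask[NR_CPUS] = { 1, 1, 1, 1 };
        unsigned long children[2] = { 2048, 1024 };   /* {0,1} and {2,3} */

        printf("by cpumask:      %lu\n", sum_by_cpumask(parent_mask));
        printf("by child groups: %lu\n", sum_by_child_groups(children, 2));
        return 0;
    }
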
@@ -3765,24 +3634,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3765 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3766 * @sd: The sched_domain whose statistics are to be updated. 3635 * @sd: The sched_domain whose statistics are to be updated.
3767 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3768 * @this_cpu: Cpu for which load balance is currently performed.
3769 * @idle: Idle status of this_cpu
3770 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3771 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
3772 * @cpus: Set of cpus considered for load balancing. 3639 * @cpus: Set of cpus considered for load balancing.
3773 * @balance: Should we balance. 3640 * @balance: Should we balance.
3774 * @sgs: variable to hold the statistics for this group. 3641 * @sgs: variable to hold the statistics for this group.
3775 */ 3642 */
3776static inline void update_sg_lb_stats(struct sched_domain *sd, 3643static inline void update_sg_lb_stats(struct lb_env *env,
3777 struct sched_group *group, int this_cpu, 3644 struct sched_group *group, int load_idx,
3778 enum cpu_idle_type idle, int load_idx,
3779 int local_group, const struct cpumask *cpus, 3645 int local_group, const struct cpumask *cpus,
3780 int *balance, struct sg_lb_stats *sgs) 3646 int *balance, struct sg_lb_stats *sgs)
3781{ 3647{
3782 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3648 unsigned long nr_running, max_nr_running, min_nr_running;
3783 int i; 3649 unsigned long load, max_cpu_load, min_cpu_load;
3784 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3650 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3785 unsigned long avg_load_per_task = 0; 3651 unsigned long avg_load_per_task = 0;
3652 int i;
3786 3653
3787 if (local_group) 3654 if (local_group)
3788 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_first_cpu(group);
@@ -3791,10 +3658,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3791 max_cpu_load = 0; 3658 max_cpu_load = 0;
3792 min_cpu_load = ~0UL; 3659 min_cpu_load = ~0UL;
3793 max_nr_running = 0; 3660 max_nr_running = 0;
3661 min_nr_running = ~0UL;
3794 3662
3795 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3663 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3796 struct rq *rq = cpu_rq(i); 3664 struct rq *rq = cpu_rq(i);
3797 3665
3666 nr_running = rq->nr_running;
3667
3798 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3799 if (local_group) { 3669 if (local_group) {
3800 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu) {
@@ -3805,16 +3675,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3805 load = target_load(i, load_idx); 3675 load = target_load(i, load_idx);
3806 } else { 3676 } else {
3807 load = source_load(i, load_idx); 3677 load = source_load(i, load_idx);
3808 if (load > max_cpu_load) { 3678 if (load > max_cpu_load)
3809 max_cpu_load = load; 3679 max_cpu_load = load;
3810 max_nr_running = rq->nr_running;
3811 }
3812 if (min_cpu_load > load) 3680 if (min_cpu_load > load)
3813 min_cpu_load = load; 3681 min_cpu_load = load;
3682
3683 if (nr_running > max_nr_running)
3684 max_nr_running = nr_running;
3685 if (min_nr_running > nr_running)
3686 min_nr_running = nr_running;
3814 } 3687 }
3815 3688
3816 sgs->group_load += load; 3689 sgs->group_load += load;
3817 sgs->sum_nr_running += rq->nr_running; 3690 sgs->sum_nr_running += nr_running;
3818 sgs->sum_weighted_load += weighted_cpuload(i); 3691 sgs->sum_weighted_load += weighted_cpuload(i);
3819 if (idle_cpu(i)) 3692 if (idle_cpu(i))
3820 sgs->idle_cpus++; 3693 sgs->idle_cpus++;
@@ -3827,14 +3700,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3827 * to do the newly idle load balance. 3700 * to do the newly idle load balance.
3828 */ 3701 */
3829 if (local_group) { 3702 if (local_group) {
3830 if (idle != CPU_NEWLY_IDLE) { 3703 if (env->idle != CPU_NEWLY_IDLE) {
3831 if (balance_cpu != this_cpu) { 3704 if (balance_cpu != env->dst_cpu) {
3832 *balance = 0; 3705 *balance = 0;
3833 return; 3706 return;
3834 } 3707 }
3835 update_group_power(sd, this_cpu); 3708 update_group_power(env->sd, env->dst_cpu);
3836 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3709 } else if (time_after_eq(jiffies, group->sgp->next_update))
3837 update_group_power(sd, this_cpu); 3710 update_group_power(env->sd, env->dst_cpu);
3838 } 3711 }
3839 3712
3840 /* Adjust by relative CPU power of the group */ 3713 /* Adjust by relative CPU power of the group */
@@ -3852,13 +3725,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3852 if (sgs->sum_nr_running) 3725 if (sgs->sum_nr_running)
3853 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3726 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3854 3727
3855 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3728 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3729 (max_nr_running - min_nr_running) > 1)
3856 sgs->group_imb = 1; 3730 sgs->group_imb = 1;
3857 3731
3858 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3732 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3859 SCHED_POWER_SCALE); 3733 SCHED_POWER_SCALE);
3860 if (!sgs->group_capacity) 3734 if (!sgs->group_capacity)
3861 sgs->group_capacity = fix_small_capacity(sd, group); 3735 sgs->group_capacity = fix_small_capacity(env->sd, group);
3862 sgs->group_weight = group->group_weight; 3736 sgs->group_weight = group->group_weight;
3863 3737
3864 if (sgs->group_capacity > sgs->sum_nr_running) 3738 if (sgs->group_capacity > sgs->sum_nr_running)
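
Editor's note: the tightened group-imbalance test above requires both a per-CPU load spread of at least one average task and a run-queue length difference greater than one, which keeps a group whose spread comes from a single task on one CPU from being flagged as imbalanced. A tiny worked example with invented numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long max_cpu_load = 2048, min_cpu_load = 0;
        unsigned long avg_load_per_task = 1024;
        unsigned long max_nr_running = 3, min_nr_running = 0;
        int group_imb;

        /* Both conditions hold: the spread is real, not one lone task. */
        group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
                    (max_nr_running - min_nr_running) > 1;
        printf("group_imb = %d\n", group_imb);   /* 1 */
        return 0;
    }
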
@@ -3876,11 +3750,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3876 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3877 * busiest group. 3751 * busiest group.
3878 */ 3752 */
3879static bool update_sd_pick_busiest(struct sched_domain *sd, 3753static bool update_sd_pick_busiest(struct lb_env *env,
3880 struct sd_lb_stats *sds, 3754 struct sd_lb_stats *sds,
3881 struct sched_group *sg, 3755 struct sched_group *sg,
3882 struct sg_lb_stats *sgs, 3756 struct sg_lb_stats *sgs)
3883 int this_cpu)
3884{ 3757{
3885 if (sgs->avg_load <= sds->max_load) 3758 if (sgs->avg_load <= sds->max_load)
3886 return false; 3759 return false;
@@ -3896,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3896 * numbered CPUs in the group, therefore mark all groups 3769 * numbered CPUs in the group, therefore mark all groups
3897 * higher than ourself as busy. 3770 * higher than ourself as busy.
3898 */ 3771 */
3899 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3772 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3900 this_cpu < group_first_cpu(sg)) { 3773 env->dst_cpu < group_first_cpu(sg)) {
3901 if (!sds->busiest) 3774 if (!sds->busiest)
3902 return true; 3775 return true;
3903 3776
@@ -3917,28 +3790,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3917 * @balance: Should we balance. 3790 * @balance: Should we balance.
3918 * @sds: variable to hold the statistics for this sched_domain. 3791 * @sds: variable to hold the statistics for this sched_domain.
3919 */ 3792 */
3920static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3793static inline void update_sd_lb_stats(struct lb_env *env,
3921 enum cpu_idle_type idle, const struct cpumask *cpus, 3794 const struct cpumask *cpus,
3922 int *balance, struct sd_lb_stats *sds) 3795 int *balance, struct sd_lb_stats *sds)
3923{ 3796{
3924 struct sched_domain *child = sd->child; 3797 struct sched_domain *child = env->sd->child;
3925 struct sched_group *sg = sd->groups; 3798 struct sched_group *sg = env->sd->groups;
3926 struct sg_lb_stats sgs; 3799 struct sg_lb_stats sgs;
3927 int load_idx, prefer_sibling = 0; 3800 int load_idx, prefer_sibling = 0;
3928 3801
3929 if (child && child->flags & SD_PREFER_SIBLING) 3802 if (child && child->flags & SD_PREFER_SIBLING)
3930 prefer_sibling = 1; 3803 prefer_sibling = 1;
3931 3804
3932 init_sd_power_savings_stats(sd, sds, idle); 3805 load_idx = get_sd_load_idx(env->sd, env->idle);
3933 load_idx = get_sd_load_idx(sd, idle);
3934 3806
3935 do { 3807 do {
3936 int local_group; 3808 int local_group;
3937 3809
3938 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3810 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3939 memset(&sgs, 0, sizeof(sgs)); 3811 memset(&sgs, 0, sizeof(sgs));
3940 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3812 update_sg_lb_stats(env, sg, load_idx, local_group,
3941 local_group, cpus, balance, &sgs); 3813 cpus, balance, &sgs);
3942 3814
3943 if (local_group && !(*balance)) 3815 if (local_group && !(*balance))
3944 return; 3816 return;
@@ -3966,7 +3838,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3966 sds->this_load_per_task = sgs.sum_weighted_load; 3838 sds->this_load_per_task = sgs.sum_weighted_load;
3967 sds->this_has_capacity = sgs.group_has_capacity; 3839 sds->this_has_capacity = sgs.group_has_capacity;
3968 sds->this_idle_cpus = sgs.idle_cpus; 3840 sds->this_idle_cpus = sgs.idle_cpus;
3969 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3841 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3970 sds->max_load = sgs.avg_load; 3842 sds->max_load = sgs.avg_load;
3971 sds->busiest = sg; 3843 sds->busiest = sg;
3972 sds->busiest_nr_running = sgs.sum_nr_running; 3844 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3978,9 +3850,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3978 sds->group_imb = sgs.group_imb; 3850 sds->group_imb = sgs.group_imb;
3979 } 3851 }
3980 3852
3981 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3982 sg = sg->next; 3853 sg = sg->next;
3983 } while (sg != sd->groups); 3854 } while (sg != env->sd->groups);
3984} 3855}
3985 3856
3986/** 3857/**
@@ -4008,24 +3879,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4008 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3879 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4009 * @imbalance: returns amount of imbalanced due to packing. 3880 * @imbalance: returns amount of imbalanced due to packing.
4010 */ 3881 */
4011static int check_asym_packing(struct sched_domain *sd, 3882static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4012 struct sd_lb_stats *sds,
4013 int this_cpu, unsigned long *imbalance)
4014{ 3883{
4015 int busiest_cpu; 3884 int busiest_cpu;
4016 3885
4017 if (!(sd->flags & SD_ASYM_PACKING)) 3886 if (!(env->sd->flags & SD_ASYM_PACKING))
4018 return 0; 3887 return 0;
4019 3888
4020 if (!sds->busiest) 3889 if (!sds->busiest)
4021 return 0; 3890 return 0;
4022 3891
4023 busiest_cpu = group_first_cpu(sds->busiest); 3892 busiest_cpu = group_first_cpu(sds->busiest);
4024 if (this_cpu > busiest_cpu) 3893 if (env->dst_cpu > busiest_cpu)
4025 return 0; 3894 return 0;
4026 3895
4027 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 3896 env->imbalance = DIV_ROUND_CLOSEST(
4028 SCHED_POWER_SCALE); 3897 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3898
4029 return 1; 3899 return 1;
4030} 3900}
4031 3901
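[Editor's note] The new form writes the packing imbalance straight into env->imbalance. As a quick sanity check of the arithmetic, here is the same computation done stand-alone; SCHED_POWER_SCALE is 1024 in this series, DIV_ROUND_CLOSEST is shown in its simple unsigned form, and the load/power numbers are invented for the example:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL
    #define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

    int main(void)
    {
        unsigned long max_load = 1536;   /* busiest group's avg load (example) */
        unsigned long power    = 2048;   /* busiest group's cpu power (example) */

        unsigned long imbalance =
            DIV_ROUND_CLOSEST(max_load * power, SCHED_POWER_SCALE);

        printf("imbalance = %lu\n", imbalance);   /* prints 3072 */
        return 0;
    }
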
@@ -4037,8 +3907,8 @@ static int check_asym_packing(struct sched_domain *sd,
4037 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3907 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4038 * @imbalance: Variable to store the imbalance. 3908 * @imbalance: Variable to store the imbalance.
4039 */ 3909 */
4040static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3910static inline
4041 int this_cpu, unsigned long *imbalance) 3911void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4042{ 3912{
4043 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3913 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4044 unsigned int imbn = 2; 3914 unsigned int imbn = 2;
@@ -4049,9 +3919,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4049 if (sds->busiest_load_per_task > 3919 if (sds->busiest_load_per_task >
4050 sds->this_load_per_task) 3920 sds->this_load_per_task)
4051 imbn = 1; 3921 imbn = 1;
4052 } else 3922 } else {
4053 sds->this_load_per_task = 3923 sds->this_load_per_task =
4054 cpu_avg_load_per_task(this_cpu); 3924 cpu_avg_load_per_task(env->dst_cpu);
3925 }
4055 3926
4056 scaled_busy_load_per_task = sds->busiest_load_per_task 3927 scaled_busy_load_per_task = sds->busiest_load_per_task
4057 * SCHED_POWER_SCALE; 3928 * SCHED_POWER_SCALE;
@@ -4059,7 +3930,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4059 3930
4060 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3931 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4061 (scaled_busy_load_per_task * imbn)) { 3932 (scaled_busy_load_per_task * imbn)) {
4062 *imbalance = sds->busiest_load_per_task; 3933 env->imbalance = sds->busiest_load_per_task;
4063 return; 3934 return;
4064 } 3935 }
4065 3936
@@ -4096,18 +3967,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4096 3967
4097 /* Move if we gain throughput */ 3968 /* Move if we gain throughput */
4098 if (pwr_move > pwr_now) 3969 if (pwr_move > pwr_now)
4099 *imbalance = sds->busiest_load_per_task; 3970 env->imbalance = sds->busiest_load_per_task;
4100} 3971}
4101 3972
4102/** 3973/**
4103 * calculate_imbalance - Calculate the amount of imbalance present within the 3974 * calculate_imbalance - Calculate the amount of imbalance present within the
4104 * groups of a given sched_domain during load balance. 3975 * groups of a given sched_domain during load balance.
3976 * @env: load balance environment
4105 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3977 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4106 * @this_cpu: Cpu for which currently load balance is being performed.
4107 * @imbalance: The variable to store the imbalance.
4108 */ 3978 */
4109static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3979static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4110 unsigned long *imbalance)
4111{ 3980{
4112 unsigned long max_pull, load_above_capacity = ~0UL; 3981 unsigned long max_pull, load_above_capacity = ~0UL;
4113 3982
@@ -4123,8 +3992,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4123 * its cpu_power, while calculating max_load..) 3992 * its cpu_power, while calculating max_load..)
4124 */ 3993 */
4125 if (sds->max_load < sds->avg_load) { 3994 if (sds->max_load < sds->avg_load) {
4126 *imbalance = 0; 3995 env->imbalance = 0;
4127 return fix_small_imbalance(sds, this_cpu, imbalance); 3996 return fix_small_imbalance(env, sds);
4128 } 3997 }
4129 3998
4130 if (!sds->group_imb) { 3999 if (!sds->group_imb) {
@@ -4152,7 +4021,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4152 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4021 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4153 4022
4154 /* How much load to actually move to equalise the imbalance */ 4023 /* How much load to actually move to equalise the imbalance */
4155 *imbalance = min(max_pull * sds->busiest->sgp->power, 4024 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4156 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4025 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4157 / SCHED_POWER_SCALE; 4026 / SCHED_POWER_SCALE;
4158 4027
@@ -4162,8 +4031,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4162 * a think about bumping its value to force at least one task to be 4031 * a think about bumping its value to force at least one task to be
4163 * moved 4032 * moved
4164 */ 4033 */
4165 if (*imbalance < sds->busiest_load_per_task) 4034 if (env->imbalance < sds->busiest_load_per_task)
4166 return fix_small_imbalance(sds, this_cpu, imbalance); 4035 return fix_small_imbalance(env, sds);
4167 4036
4168} 4037}
4169 4038
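[Editor's note] For reference, the final formula above pulls at most the busiest group's excess over the domain average, capped by how far this group sits below that average, both scaled by group power. A stand-alone worked example with made-up loads:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned long max_load = 1800, avg_load = 1200, this_load = 800;
        unsigned long busiest_power = 1024, this_power = 1024;
        unsigned long load_above_capacity = ~0UL;   /* the !group_imb case */

        unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                  / SCHED_POWER_SCALE;

        printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);  /* 600, 400 */
        return 0;
    }
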
@@ -4194,9 +4063,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4194 * put to idle by rebalancing its tasks onto our group. 4063 * put to idle by rebalancing its tasks onto our group.
4195 */ 4064 */
4196static struct sched_group * 4065static struct sched_group *
4197find_busiest_group(struct sched_domain *sd, int this_cpu, 4066find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4198 unsigned long *imbalance, enum cpu_idle_type idle,
4199 const struct cpumask *cpus, int *balance)
4200{ 4067{
4201 struct sd_lb_stats sds; 4068 struct sd_lb_stats sds;
4202 4069
@@ -4206,7 +4073,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4206 * Compute the various statistics relevant for load balancing at 4073 * Compute the various statistics relevant for load balancing at
4207 * this level. 4074 * this level.
4208 */ 4075 */
4209 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4076 update_sd_lb_stats(env, cpus, balance, &sds);
4210 4077
4211 /* 4078 /*
4212 * this_cpu is not the appropriate cpu to perform load balancing at 4079 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4215,8 +4082,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4215 if (!(*balance)) 4082 if (!(*balance))
4216 goto ret; 4083 goto ret;
4217 4084
4218 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4085 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4219 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4086 check_asym_packing(env, &sds))
4220 return sds.busiest; 4087 return sds.busiest;
4221 4088
4222 /* There is no busy sibling group to pull tasks from */ 4089 /* There is no busy sibling group to pull tasks from */
@@ -4234,7 +4101,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4234 goto force_balance; 4101 goto force_balance;
4235 4102
4236 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4103 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4237 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4104 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4238 !sds.busiest_has_capacity) 4105 !sds.busiest_has_capacity)
4239 goto force_balance; 4106 goto force_balance;
4240 4107
@@ -4252,7 +4119,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4252 if (sds.this_load >= sds.avg_load) 4119 if (sds.this_load >= sds.avg_load)
4253 goto out_balanced; 4120 goto out_balanced;
4254 4121
4255 if (idle == CPU_IDLE) { 4122 if (env->idle == CPU_IDLE) {
4256 /* 4123 /*
4257 * This cpu is idle. If the busiest group load doesn't 4124 * This cpu is idle. If the busiest group load doesn't
4258 * have more tasks than the number of available cpu's and 4125 * have more tasks than the number of available cpu's and
@@ -4267,34 +4134,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4267 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4134 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4268 * imbalance_pct to be conservative. 4135 * imbalance_pct to be conservative.
4269 */ 4136 */
4270 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4137 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4271 goto out_balanced; 4138 goto out_balanced;
4272 } 4139 }
4273 4140
4274force_balance: 4141force_balance:
4275 /* Looks like there is an imbalance. Compute it */ 4142 /* Looks like there is an imbalance. Compute it */
4276 calculate_imbalance(&sds, this_cpu, imbalance); 4143 calculate_imbalance(env, &sds);
4277 return sds.busiest; 4144 return sds.busiest;
4278 4145
4279out_balanced: 4146out_balanced:
4280 /*
4281 * There is no obvious imbalance. But check if we can do some balancing
4282 * to save power.
4283 */
4284 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4285 return sds.busiest;
4286ret: 4147ret:
4287 *imbalance = 0; 4148 env->imbalance = 0;
4288 return NULL; 4149 return NULL;
4289} 4150}
4290 4151
4291/* 4152/*
4292 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4153 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4293 */ 4154 */
4294static struct rq * 4155static struct rq *find_busiest_queue(struct lb_env *env,
4295find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4156 struct sched_group *group,
4296 enum cpu_idle_type idle, unsigned long imbalance, 4157 const struct cpumask *cpus)
4297 const struct cpumask *cpus)
4298{ 4158{
4299 struct rq *busiest = NULL, *rq; 4159 struct rq *busiest = NULL, *rq;
4300 unsigned long max_load = 0; 4160 unsigned long max_load = 0;
@@ -4307,7 +4167,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4307 unsigned long wl; 4167 unsigned long wl;
4308 4168
4309 if (!capacity) 4169 if (!capacity)
4310 capacity = fix_small_capacity(sd, group); 4170 capacity = fix_small_capacity(env->sd, group);
4311 4171
4312 if (!cpumask_test_cpu(i, cpus)) 4172 if (!cpumask_test_cpu(i, cpus))
4313 continue; 4173 continue;
@@ -4319,7 +4179,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4319 * When comparing with imbalance, use weighted_cpuload() 4179 * When comparing with imbalance, use weighted_cpuload()
4320 * which is not scaled with the cpu power. 4180 * which is not scaled with the cpu power.
4321 */ 4181 */
4322 if (capacity && rq->nr_running == 1 && wl > imbalance) 4182 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4323 continue; 4183 continue;
4324 4184
4325 /* 4185 /*
@@ -4348,40 +4208,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4348/* Working cpumask for load_balance and load_balance_newidle. */ 4208/* Working cpumask for load_balance and load_balance_newidle. */
4349DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4209DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4350 4210
4351static int need_active_balance(struct sched_domain *sd, int idle, 4211static int need_active_balance(struct lb_env *env)
4352 int busiest_cpu, int this_cpu)
4353{ 4212{
4354 if (idle == CPU_NEWLY_IDLE) { 4213 struct sched_domain *sd = env->sd;
4214
4215 if (env->idle == CPU_NEWLY_IDLE) {
4355 4216
4356 /* 4217 /*
4357 * ASYM_PACKING needs to force migrate tasks from busy but 4218 * ASYM_PACKING needs to force migrate tasks from busy but
4358 * higher numbered CPUs in order to pack all tasks in the 4219 * higher numbered CPUs in order to pack all tasks in the
4359 * lowest numbered CPUs. 4220 * lowest numbered CPUs.
4360 */ 4221 */
4361 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4222 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4362 return 1; 4223 return 1;
4363
4364 /*
4365 * The only task running in a non-idle cpu can be moved to this
4366 * cpu in an attempt to completely freeup the other CPU
4367 * package.
4368 *
4369 * The package power saving logic comes from
4370 * find_busiest_group(). If there are no imbalance, then
4371 * f_b_g() will return NULL. However when sched_mc={1,2} then
4372 * f_b_g() will select a group from which a running task may be
4373 * pulled to this cpu in order to make the other package idle.
4374 * If there is no opportunity to make a package idle and if
4375 * there are no imbalance, then f_b_g() will return NULL and no
4376 * action will be taken in load_balance_newidle().
4377 *
4378 * Under normal task pull operation due to imbalance, there
4379 * will be more than one task in the source run queue and
4380 * move_tasks() will succeed. ld_moved will be true and this
4381 * active balance code will not be triggered.
4382 */
4383 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4384 return 0;
4385 } 4224 }
4386 4225
4387 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 4226 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4399,7 +4238,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4399{ 4238{
4400 int ld_moved, active_balance = 0; 4239 int ld_moved, active_balance = 0;
4401 struct sched_group *group; 4240 struct sched_group *group;
4402 unsigned long imbalance;
4403 struct rq *busiest; 4241 struct rq *busiest;
4404 unsigned long flags; 4242 unsigned long flags;
4405 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4243 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4417,8 +4255,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4417 schedstat_inc(sd, lb_count[idle]); 4255 schedstat_inc(sd, lb_count[idle]);
4418 4256
4419redo: 4257redo:
4420 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4258 group = find_busiest_group(&env, cpus, balance);
4421 cpus, balance);
4422 4259
4423 if (*balance == 0) 4260 if (*balance == 0)
4424 goto out_balanced; 4261 goto out_balanced;
@@ -4428,7 +4265,7 @@ redo:
4428 goto out_balanced; 4265 goto out_balanced;
4429 } 4266 }
4430 4267
4431 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4268 busiest = find_busiest_queue(&env, group, cpus);
4432 if (!busiest) { 4269 if (!busiest) {
4433 schedstat_inc(sd, lb_nobusyq[idle]); 4270 schedstat_inc(sd, lb_nobusyq[idle]);
4434 goto out_balanced; 4271 goto out_balanced;
@@ -4436,7 +4273,7 @@ redo:
4436 4273
4437 BUG_ON(busiest == this_rq); 4274 BUG_ON(busiest == this_rq);
4438 4275
4439 schedstat_add(sd, lb_imbalance[idle], imbalance); 4276 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4440 4277
4441 ld_moved = 0; 4278 ld_moved = 0;
4442 if (busiest->nr_running > 1) { 4279 if (busiest->nr_running > 1) {
@@ -4447,10 +4284,9 @@ redo:
4447 * correctly treated as an imbalance. 4284 * correctly treated as an imbalance.
4448 */ 4285 */
4449 env.flags |= LBF_ALL_PINNED; 4286 env.flags |= LBF_ALL_PINNED;
4450 env.load_move = imbalance; 4287 env.src_cpu = busiest->cpu;
4451 env.src_cpu = busiest->cpu; 4288 env.src_rq = busiest;
4452 env.src_rq = busiest; 4289 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4453 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
4454 4290
4455more_balance: 4291more_balance:
4456 local_irq_save(flags); 4292 local_irq_save(flags);
@@ -4492,7 +4328,7 @@ more_balance:
4492 if (idle != CPU_NEWLY_IDLE) 4328 if (idle != CPU_NEWLY_IDLE)
4493 sd->nr_balance_failed++; 4329 sd->nr_balance_failed++;
4494 4330
4495 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4331 if (need_active_balance(&env)) {
4496 raw_spin_lock_irqsave(&busiest->lock, flags); 4332 raw_spin_lock_irqsave(&busiest->lock, flags);
4497 4333
4498 /* don't kick the active_load_balance_cpu_stop, 4334 /* don't kick the active_load_balance_cpu_stop,
@@ -4519,10 +4355,11 @@ more_balance:
4519 } 4355 }
4520 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4356 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4521 4357
4522 if (active_balance) 4358 if (active_balance) {
4523 stop_one_cpu_nowait(cpu_of(busiest), 4359 stop_one_cpu_nowait(cpu_of(busiest),
4524 active_load_balance_cpu_stop, busiest, 4360 active_load_balance_cpu_stop, busiest,
4525 &busiest->active_balance_work); 4361 &busiest->active_balance_work);
4362 }
4526 4363
4527 /* 4364 /*
4528 * We've kicked active balancing, reset the failure 4365 * We've kicked active balancing, reset the failure
@@ -4703,104 +4540,15 @@ static struct {
4703 unsigned long next_balance; /* in jiffy units */ 4540 unsigned long next_balance; /* in jiffy units */
4704} nohz ____cacheline_aligned; 4541} nohz ____cacheline_aligned;
4705 4542
4706#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4543static inline int find_new_ilb(int call_cpu)
4707/**
4708 * lowest_flag_domain - Return lowest sched_domain containing flag.
4709 * @cpu: The cpu whose lowest level of sched domain is to
4710 * be returned.
4711 * @flag: The flag to check for the lowest sched_domain
4712 * for the given cpu.
4713 *
4714 * Returns the lowest sched_domain of a cpu which contains the given flag.
4715 */
4716static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4717{
4718 struct sched_domain *sd;
4719
4720 for_each_domain(cpu, sd)
4721 if (sd->flags & flag)
4722 break;
4723
4724 return sd;
4725}
4726
4727/**
4728 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4729 * @cpu: The cpu whose domains we're iterating over.
4730 * @sd: variable holding the value of the power_savings_sd
4731 * for cpu.
4732 * @flag: The flag to filter the sched_domains to be iterated.
4733 *
4734 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4735 * set, starting from the lowest sched_domain to the highest.
4736 */
4737#define for_each_flag_domain(cpu, sd, flag) \
4738 for (sd = lowest_flag_domain(cpu, flag); \
4739 (sd && (sd->flags & flag)); sd = sd->parent)
4740
4741/**
4742 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4743 * @cpu: The cpu which is nominating a new idle_load_balancer.
4744 *
4745 * Returns: Returns the id of the idle load balancer if it exists,
4746 * Else, returns >= nr_cpu_ids.
4747 *
4748 * This algorithm picks the idle load balancer such that it belongs to a
4749 * semi-idle powersavings sched_domain. The idea is to try and avoid
4750 * completely idle packages/cores just for the purpose of idle load balancing
4751 * when there are other idle cpu's which are better suited for that job.
4752 */
4753static int find_new_ilb(int cpu)
4754{ 4544{
4755 int ilb = cpumask_first(nohz.idle_cpus_mask); 4545 int ilb = cpumask_first(nohz.idle_cpus_mask);
4756 struct sched_group *ilbg;
4757 struct sched_domain *sd;
4758 4546
4759 /*
4760 * Have idle load balancer selection from semi-idle packages only
4761 * when power-aware load balancing is enabled
4762 */
4763 if (!(sched_smt_power_savings || sched_mc_power_savings))
4764 goto out_done;
4765
4766 /*
4767 * Optimize for the case when we have no idle CPUs or only one
4768 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4769 */
4770 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4771 goto out_done;
4772
4773 rcu_read_lock();
4774 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4775 ilbg = sd->groups;
4776
4777 do {
4778 if (ilbg->group_weight !=
4779 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4780 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4781 sched_group_cpus(ilbg));
4782 goto unlock;
4783 }
4784
4785 ilbg = ilbg->next;
4786
4787 } while (ilbg != sd->groups);
4788 }
4789unlock:
4790 rcu_read_unlock();
4791
4792out_done:
4793 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 4547 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4794 return ilb; 4548 return ilb;
4795 4549
4796 return nr_cpu_ids; 4550 return nr_cpu_ids;
4797} 4551}
4798#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4799static inline int find_new_ilb(int call_cpu)
4800{
4801 return nr_cpu_ids;
4802}
4803#endif
4804 4552
4805/* 4553/*
4806 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 4554 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5023,7 +4771,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5023 4771
5024 raw_spin_lock_irq(&this_rq->lock); 4772 raw_spin_lock_irq(&this_rq->lock);
5025 update_rq_clock(this_rq); 4773 update_rq_clock(this_rq);
5026 update_cpu_load(this_rq); 4774 update_idle_cpu_load(this_rq);
5027 raw_spin_unlock_irq(&this_rq->lock); 4775 raw_spin_unlock_irq(&this_rq->lock);
5028 4776
5029 rebalance_domains(balance_cpu, CPU_IDLE); 4777 rebalance_domains(balance_cpu, CPU_IDLE);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..2a4e8dffbd6b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1803,44 +1809,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1809static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1810 const struct cpumask *new_mask)
1805{ 1811{
1806 int weight = cpumask_weight(new_mask); 1812 struct rq *rq;
1813 int weight;
1807 1814
1808 BUG_ON(!rt_task(p)); 1815 BUG_ON(!rt_task(p));
1809 1816
1810 /* 1817 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1818 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816 1819
1817 if (!task_current(rq, p)) { 1820 weight = cpumask_weight(new_mask);
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1821
1827 /* 1822 /*
1828 * Requeue if our weight is changing and still > 1 1823 * Only update if the process changes its state from whether it
1829 */ 1824 * can migrate or not.
1830 if (weight > 1) 1825 */
1831 enqueue_pushable_task(rq, p); 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1832 1827 return;
1833 }
1834 1828
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1829 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1830
1842 update_rt_migration(&rq->rt); 1831 /*
1832 * The process used to be able to migrate OR it can now migrate
1833 */
1834 if (weight <= 1) {
1835 if (!task_current(rq, p))
1836 dequeue_pushable_task(rq, p);
1837 BUG_ON(!rq->rt.rt_nr_migratory);
1838 rq->rt.rt_nr_migratory--;
1839 } else {
1840 if (!task_current(rq, p))
1841 enqueue_pushable_task(rq, p);
1842 rq->rt.rt_nr_migratory++;
1843 } 1843 }
1844
1845 update_rt_migration(&rq->rt);
1844} 1846}
1845 1847
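[Editor's note] The rewritten set_cpus_allowed_rt() only touches rt_nr_migratory when the task actually flips between pinned (one allowed CPU) and migratable (more than one). A tiny stand-alone check of that predicate, with made-up CPU counts:

    #include <stdbool.h>
    #include <stdio.h>

    /* Same early-return test as above, just isolated for illustration. */
    static bool migratory_count_changes(int old_allowed, int new_weight)
    {
        return (old_allowed > 1) != (new_weight > 1);
    }

    int main(void)
    {
        printf("%d\n", migratory_count_changes(1, 4)); /* 1: becomes migratable */
        printf("%d\n", migratory_count_changes(4, 2)); /* 0: stays migratable  */
        printf("%d\n", migratory_count_changes(4, 1)); /* 1: becomes pinned    */
        return 0;
    }
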
1846/* Assumes rq->lock is held */ 1848/* Assumes rq->lock is held */
@@ -1983,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1983 1985
1984static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1985{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1986 update_curr_rt(rq); 1990 update_curr_rt(rq);
1987 1991
1988 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -2000,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2000 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
2001 2005
2002 /* 2006 /*
2003 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are not
2004 * on the queue: 2008 * the only element on the queue
2005 */ 2009 */
2006 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2007 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2008 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2009 } 2016 }
2010} 2017}
2011 2018
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..ba9dccfd24ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 876extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 878
879extern void update_cpu_load(struct rq *this_rq); 879extern void update_idle_cpu_load(struct rq *this_rq);
880 880
881#ifdef CONFIG_CGROUP_CPUACCT 881#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 882#include <linux/cgroup.h>
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e8d76c5895ea..ee376beedaf9 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,16 +3,357 @@
3 * 3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> 4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 * 5 *
6 * This defines a simple but solid secure-computing mode. 6 * Copyright (C) 2012 Google, Inc.
7 * Will Drewry <wad@chromium.org>
8 *
9 * This defines a simple but solid secure-computing facility.
10 *
11 * Mode 1 uses a fixed list of allowed system calls.
12 * Mode 2 allows user-defined system call filters in the form
13 * of Berkeley Packet Filters/Linux Socket Filters.
7 */ 14 */
8 15
16#include <linux/atomic.h>
9#include <linux/audit.h> 17#include <linux/audit.h>
10#include <linux/seccomp.h>
11#include <linux/sched.h>
12#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/sched.h>
20#include <linux/seccomp.h>
13 21
14/* #define SECCOMP_DEBUG 1 */ 22/* #define SECCOMP_DEBUG 1 */
15#define NR_SECCOMP_MODES 1 23
24#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h>
26#include <linux/filter.h>
27#include <linux/ptrace.h>
28#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h>
31#include <linux/uaccess.h>
32
33/**
34 * struct seccomp_filter - container for seccomp BPF programs
35 *
36 * @usage: reference count to manage the object lifetime.
37 * get/put helpers should be used when accessing an instance
38 * outside of a lifetime-guarded section. In general, this
39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate
43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting
46 * with current->seccomp.filter, the most recently attached or inherited filter.
47 * However, multiple filters may share a @prev node, by way of fork(), which
48 * results in a unidirectional tree existing in memory. This is similar to
49 * how namespaces work.
50 *
51 * seccomp_filter objects should never be modified after being attached
52 * to a task_struct (other than @usage).
53 */
54struct seccomp_filter {
55 atomic_t usage;
56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[];
59};
60
61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63
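[Editor's note] A back-of-the-envelope check of that limit: on common ABIs struct sock_filter is 8 bytes, so 256KB along any path works out to 32768 instructions. A one-liner to confirm on a given toolchain:

    #include <stdio.h>
    #include <linux/filter.h>

    int main(void)
    {
        printf("sizeof(struct sock_filter)=%zu -> limit=%zu instructions\n",
               sizeof(struct sock_filter),
               (size_t)(1 << 18) / sizeof(struct sock_filter));
        return 0;
    }
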
64/**
65 * get_u32 - returns a u32 offset into data
67 * @data: an unsigned 64-bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture.
76 */
77static inline u32 get_u32(u64 data, int index)
78{
79 return ((u32 *)&data)[index];
80}
81
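[Editor's note] As the comment says, which 32-bit half comes back for index 0 is an endianness question left to the filter author. A stand-alone userspace equivalent (memcpy instead of the kernel's pointer cast, purely to keep the demo strict-aliasing clean; the example value is arbitrary):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    static uint32_t get_u32_demo(uint64_t data, int index)
    {
        uint32_t halves[2];

        memcpy(halves, &data, sizeof(halves));
        return halves[index];
    }

    int main(void)
    {
        uint64_t arg = 0x1122334455667788ULL;

        printf("index0=0x%08x index1=0x%08x\n",
               get_u32_demo(arg, 0), get_u32_demo(arg, 1));
        /* little-endian hosts print index0=0x55667788 index1=0x11223344 */
        return 0;
    }
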
82/* Helper for bpf_load below. */
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
84/**
85 * bpf_load: checks and returns a pointer to the requested offset
86 * @off: offset into struct seccomp_data to load from
87 *
88 * Returns the requested 32-bits of data.
89 * seccomp_check_filter() should assure that @off is 32-bit aligned
90 * and not out of bounds. Failure to do so is a BUG.
91 */
92u32 seccomp_bpf_load(int off)
93{
94 struct pt_regs *regs = task_pt_regs(current);
95 if (off == BPF_DATA(nr))
96 return syscall_get_nr(current, regs);
97 if (off == BPF_DATA(arch))
98 return syscall_get_arch(current, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
100 unsigned long value;
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
102 int index = !!(off % sizeof(u64));
103 syscall_get_arguments(current, regs, arg, 1, &value);
104 return get_u32(value, index);
105 }
106 if (off == BPF_DATA(instruction_pointer))
107 return get_u32(KSTK_EIP(current), 0);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112}
113
114/**
115 * seccomp_check_filter - verify seccomp filter code
116 * @filter: filter to verify
117 * @flen: length of filter
118 *
119 * Takes a previously checked filter (by sk_chk_filter) and
120 * redirects all filter code that loads struct sk_buff data
121 * and related data through seccomp_bpf_load. It also
122 * enforces length and alignment checking of those loads.
123 *
124 * Returns 0 if the rule set is legal or -EINVAL if not.
125 */
126static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
127{
128 int pc;
129 for (pc = 0; pc < flen; pc++) {
130 struct sock_filter *ftest = &filter[pc];
131 u16 code = ftest->code;
132 u32 k = ftest->k;
133
134 switch (code) {
135 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W;
137 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL;
140 continue;
141 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM;
143 ftest->k = sizeof(struct seccomp_data);
144 continue;
145 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM;
147 ftest->k = sizeof(struct seccomp_data);
148 continue;
149 /* Explicitly include allowed calls. */
150 case BPF_S_RET_K:
151 case BPF_S_RET_A:
152 case BPF_S_ALU_ADD_K:
153 case BPF_S_ALU_ADD_X:
154 case BPF_S_ALU_SUB_K:
155 case BPF_S_ALU_SUB_X:
156 case BPF_S_ALU_MUL_K:
157 case BPF_S_ALU_MUL_X:
158 case BPF_S_ALU_DIV_X:
159 case BPF_S_ALU_AND_K:
160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K:
166 case BPF_S_ALU_RSH_X:
167 case BPF_S_ALU_NEG:
168 case BPF_S_LD_IMM:
169 case BPF_S_LDX_IMM:
170 case BPF_S_MISC_TAX:
171 case BPF_S_MISC_TXA:
172 case BPF_S_ALU_DIV_K:
173 case BPF_S_LD_MEM:
174 case BPF_S_LDX_MEM:
175 case BPF_S_ST:
176 case BPF_S_STX:
177 case BPF_S_JMP_JA:
178 case BPF_S_JMP_JEQ_K:
179 case BPF_S_JMP_JEQ_X:
180 case BPF_S_JMP_JGE_K:
181 case BPF_S_JMP_JGE_X:
182 case BPF_S_JMP_JGT_K:
183 case BPF_S_JMP_JGT_X:
184 case BPF_S_JMP_JSET_K:
185 case BPF_S_JMP_JSET_X:
186 continue;
187 default:
188 return -EINVAL;
189 }
190 }
191 return 0;
192}
193
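[Editor's note] Seen from userspace, a filter that survives these checks uses only absolute 32-bit loads at aligned offsets inside struct seccomp_data plus the whitelisted ALU/jump/return opcodes. A minimal sketch follows; AUDIT_ARCH_X86_64 is a userspace constant from linux/audit.h (the example assumes an x86-64 build), and the actual install step is shown further down next to prctl_set_seccomp:

    #include <stddef.h>
    #include <stdio.h>
    #include <linux/audit.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    static struct sock_filter filter[] = {
        /* offset 4: seccomp_data.arch, 32-bit aligned, in bounds */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch)),
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
        /* offset 0: seccomp_data.nr */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
        /* a real policy would branch on the syscall number here */
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    };

    static struct sock_fprog prog = {
        .len = sizeof(filter) / sizeof(filter[0]),
        .filter = filter,
    };

    int main(void)
    {
        printf("%u instructions ready to attach\n", prog.len);
        return 0;
    }
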
194/**
195 * seccomp_run_filters - evaluates all seccomp filters against @syscall
196 * @syscall: number of the current system call
197 *
198 * Returns valid seccomp BPF response codes.
199 */
200static u32 seccomp_run_filters(int syscall)
201{
202 struct seccomp_filter *f;
203 u32 ret = SECCOMP_RET_ALLOW;
204
205 /* Ensure unexpected behavior doesn't result in failing open. */
206 if (WARN_ON(current->seccomp.filter == NULL))
207 return SECCOMP_RET_KILL;
208
209 /*
210 * All filters in the list are evaluated and the lowest BPF return
211 * value always takes priority (ignoring the DATA).
212 */
213 for (f = current->seccomp.filter; f; f = f->prev) {
214 u32 cur_ret = sk_run_filter(NULL, f->insns);
215 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
216 ret = cur_ret;
217 }
218 return ret;
219}
220
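[Editor's note] The "lowest action value wins" rule means a newly added, stricter filter can never be overridden by an older, looser one: the SECCOMP_RET_* actions are encoded so that KILL < TRAP < ERRNO < TRACE < ALLOW numerically. A stand-alone sketch of the reduction, assuming a header set that exports the SECCOMP_RET_* values:

    #include <stdio.h>
    #include <linux/seccomp.h>

    /* Keep whichever verdict has the lower (stricter) action, as the loop above does. */
    static unsigned int stricter(unsigned int so_far, unsigned int cur)
    {
        return ((cur & SECCOMP_RET_ACTION) < (so_far & SECCOMP_RET_ACTION))
            ? cur : so_far;
    }

    int main(void)
    {
        unsigned int ret = SECCOMP_RET_ALLOW;            /* starting value */

        ret = stricter(ret, SECCOMP_RET_ERRNO | 13);     /* one filter: EACCES */
        ret = stricter(ret, SECCOMP_RET_ALLOW);          /* another filter: allow */

        printf("final action=%#x data=%u\n",
               ret & SECCOMP_RET_ACTION, ret & SECCOMP_RET_DATA);
        return 0;
    }
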
221/**
222 * seccomp_attach_filter: Attaches a seccomp filter to current.
223 * @fprog: BPF program to install
224 *
225 * Returns 0 on success or an errno on failure.
226 */
227static long seccomp_attach_filter(struct sock_fprog *fprog)
228{
229 struct seccomp_filter *filter;
230 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
231 unsigned long total_insns = fprog->len;
232 long ret;
233
234 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
235 return -EINVAL;
236
237 for (filter = current->seccomp.filter; filter; filter = filter->prev)
238 total_insns += filter->len + 4; /* include a 4 instr penalty */
239 if (total_insns > MAX_INSNS_PER_PATH)
240 return -ENOMEM;
241
242 /*
243 * Installing a seccomp filter requires that the task have
244 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
245 * This avoids scenarios where unprivileged tasks can affect the
246 * behavior of privileged children.
247 */
248 if (!current->no_new_privs &&
249 security_capable_noaudit(current_cred(), current_user_ns(),
250 CAP_SYS_ADMIN) != 0)
251 return -EACCES;
252
253 /* Allocate a new seccomp_filter */
254 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
255 GFP_KERNEL|__GFP_NOWARN);
256 if (!filter)
257 return -ENOMEM;
258 atomic_set(&filter->usage, 1);
259 filter->len = fprog->len;
260
261 /* Copy the instructions from fprog. */
262 ret = -EFAULT;
263 if (copy_from_user(filter->insns, fprog->filter, fp_size))
264 goto fail;
265
266 /* Check and rewrite the fprog via the skb checker */
267 ret = sk_chk_filter(filter->insns, filter->len);
268 if (ret)
269 goto fail;
270
271 /* Check and rewrite the fprog for seccomp use */
272 ret = seccomp_check_filter(filter->insns, filter->len);
273 if (ret)
274 goto fail;
275
276 /*
277 * If there is an existing filter, make it the prev and don't drop its
278 * task reference.
279 */
280 filter->prev = current->seccomp.filter;
281 current->seccomp.filter = filter;
282 return 0;
283fail:
284 kfree(filter);
285 return ret;
286}
287
288/**
289 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
290 * @user_filter: pointer to the user data containing a sock_fprog.
291 *
292 * Returns 0 on success and non-zero otherwise.
293 */
294long seccomp_attach_user_filter(char __user *user_filter)
295{
296 struct sock_fprog fprog;
297 long ret = -EFAULT;
298
299#ifdef CONFIG_COMPAT
300 if (is_compat_task()) {
301 struct compat_sock_fprog fprog32;
302 if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
303 goto out;
304 fprog.len = fprog32.len;
305 fprog.filter = compat_ptr(fprog32.filter);
306 } else /* falls through to the if below. */
307#endif
308 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
309 goto out;
310 ret = seccomp_attach_filter(&fprog);
311out:
312 return ret;
313}
314
315/* get_seccomp_filter - increments the reference count of the filter on @tsk */
316void get_seccomp_filter(struct task_struct *tsk)
317{
318 struct seccomp_filter *orig = tsk->seccomp.filter;
319 if (!orig)
320 return;
321 /* Reference count is bounded by the number of total processes. */
322 atomic_inc(&orig->usage);
323}
324
325/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
326void put_seccomp_filter(struct task_struct *tsk)
327{
328 struct seccomp_filter *orig = tsk->seccomp.filter;
329 /* Clean up single-reference branches iteratively. */
330 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig;
332 orig = orig->prev;
333 kfree(freeme);
334 }
335}
336
337/**
338 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
339 * @syscall: syscall number to send to userland
340 * @reason: filter-supplied reason code to send to userland (via si_errno)
341 *
342 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
343 */
344static void seccomp_send_sigsys(int syscall, int reason)
345{
346 struct siginfo info;
347 memset(&info, 0, sizeof(info));
348 info.si_signo = SIGSYS;
349 info.si_code = SYS_SECCOMP;
350 info.si_call_addr = (void __user *)KSTK_EIP(current);
351 info.si_errno = reason;
352 info.si_arch = syscall_get_arch(current, task_pt_regs(current));
353 info.si_syscall = syscall;
354 force_sig_info(SIGSYS, &info, current);
355}
356#endif /* CONFIG_SECCOMP_FILTER */
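
[Editor's note] On the receiving end, a process whose filter returns SECCOMP_RET_TRAP can catch the resulting SIGSYS in userspace; si_code is SYS_SECCOMP and si_errno carries the filter's 16 bits of data. A hedged sketch (whether the libc also exposes the si_syscall/si_arch members depends on its headers, so this handler relies only on the basics):

    #include <signal.h>
    #include <string.h>
    #include <unistd.h>

    #ifndef SYS_SECCOMP
    #define SYS_SECCOMP 1     /* si_code value set by seccomp_send_sigsys() */
    #endif

    static void on_sigsys(int sig, siginfo_t *info, void *ucontext)
    {
        /* info->si_errno holds SECCOMP_RET_DATA; info->si_syscall (where the
         * libc defines it) names the blocked call. Keep the handler
         * async-signal-safe: write a fixed message and exit. */
        static const char msg[] = "seccomp: syscall trapped by filter\n";

        (void)ucontext;
        if (sig == SIGSYS && info->si_code == SYS_SECCOMP)
            write(STDERR_FILENO, msg, sizeof(msg) - 1);
        _exit(1);
    }

    int main(void)
    {
        struct sigaction act;

        memset(&act, 0, sizeof(act));
        act.sa_sigaction = on_sigsys;
        act.sa_flags = SA_SIGINFO;
        sigaction(SIGSYS, &act, NULL);

        /* ... install a filter that returns SECCOMP_RET_TRAP, then make the
         * trapped call; the handler above runs instead of the syscall ... */
        return 0;
    }
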
16 357
17/* 358/*
18 * Secure computing mode 1 allows only read/write/exit/sigreturn. 359 * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = {
31}; 372};
32#endif 373#endif
33 374
34void __secure_computing(int this_syscall) 375int __secure_computing(int this_syscall)
35{ 376{
36 int mode = current->seccomp.mode; 377 int mode = current->seccomp.mode;
37 int * syscall; 378 int exit_sig = 0;
379 int *syscall;
380 u32 ret;
38 381
39 switch (mode) { 382 switch (mode) {
40 case 1: 383 case SECCOMP_MODE_STRICT:
41 syscall = mode1_syscalls; 384 syscall = mode1_syscalls;
42#ifdef CONFIG_COMPAT 385#ifdef CONFIG_COMPAT
43 if (is_compat_task()) 386 if (is_compat_task())
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall)
45#endif 388#endif
46 do { 389 do {
47 if (*syscall == this_syscall) 390 if (*syscall == this_syscall)
48 return; 391 return 0;
49 } while (*++syscall); 392 } while (*++syscall);
393 exit_sig = SIGKILL;
394 ret = SECCOMP_RET_KILL;
395 break;
396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: {
398 int data;
399 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION;
402 switch (ret) {
403 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16-bits as an errno. */
405 syscall_set_return_value(current, task_pt_regs(current),
406 -data, 0);
407 goto skip;
408 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current));
411 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data);
413 goto skip;
414 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
417 goto skip;
418 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /*
421 * The delivery of a fatal signal during event
422 * notification may silently skip tracer notification.
423 * Terminating the task now avoids executing a system
424 * call that may not be intended.
425 */
426 if (fatal_signal_pending(current))
427 break;
428 return 0;
429 case SECCOMP_RET_ALLOW:
430 return 0;
431 case SECCOMP_RET_KILL:
432 default:
433 break;
434 }
435 exit_sig = SIGSYS;
50 break; 436 break;
437 }
438#endif
51 default: 439 default:
52 BUG(); 440 BUG();
53 } 441 }
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall)
55#ifdef SECCOMP_DEBUG 443#ifdef SECCOMP_DEBUG
56 dump_stack(); 444 dump_stack();
57#endif 445#endif
58 audit_seccomp(this_syscall); 446 audit_seccomp(this_syscall, exit_sig, ret);
59 do_exit(SIGKILL); 447 do_exit(exit_sig);
448#ifdef CONFIG_SECCOMP_FILTER
449skip:
450 audit_seccomp(this_syscall, exit_sig, ret);
451#endif
452 return -1;
60} 453}
61 454
62long prctl_get_seccomp(void) 455long prctl_get_seccomp(void)
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void)
64 return current->seccomp.mode; 457 return current->seccomp.mode;
65} 458}
66 459
67long prctl_set_seccomp(unsigned long seccomp_mode) 460/**
461 * prctl_set_seccomp: configures current->seccomp.mode
462 * @seccomp_mode: requested mode to use
463 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
464 *
465 * This function may be called repeatedly with a @seccomp_mode of
466 * SECCOMP_MODE_FILTER to install additional filters. Every filter
467 * successfully installed will be evaluated (in reverse order) for each system
468 * call the task makes.
469 *
470 * Once current->seccomp.mode is non-zero, it may not be changed.
471 *
472 * Returns 0 on success or -EINVAL on failure.
473 */
474long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
68{ 475{
69 long ret; 476 long ret = -EINVAL;
70 477
71 /* can set it only once to be even more secure */ 478 if (current->seccomp.mode &&
72 ret = -EPERM; 479 current->seccomp.mode != seccomp_mode)
73 if (unlikely(current->seccomp.mode))
74 goto out; 480 goto out;
75 481
76 ret = -EINVAL; 482 switch (seccomp_mode) {
77 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { 483 case SECCOMP_MODE_STRICT:
78 current->seccomp.mode = seccomp_mode; 484 ret = 0;
79 set_thread_flag(TIF_SECCOMP);
80#ifdef TIF_NOTSC 485#ifdef TIF_NOTSC
81 disable_TSC(); 486 disable_TSC();
82#endif 487#endif
83 ret = 0; 488 break;
489#ifdef CONFIG_SECCOMP_FILTER
490 case SECCOMP_MODE_FILTER:
491 ret = seccomp_attach_user_filter(filter);
492 if (ret)
493 goto out;
494 break;
495#endif
496 default:
497 goto out;
84 } 498 }
85 499
86 out: 500 current->seccomp.mode = seccomp_mode;
501 set_thread_flag(TIF_SECCOMP);
502out:
87 return ret; 503 return ret;
88} 504}
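
[Editor's note] Putting the pieces together from userspace: because seccomp_attach_filter() only lets an unprivileged task attach a filter when no_new_privs is set (or it holds CAP_SYS_ADMIN), the usual sequence is PR_SET_NO_NEW_PRIVS followed by PR_SET_SECCOMP with SECCOMP_MODE_FILTER. A hedged sketch that returns EPERM for getpid() and allows everything else; it assumes an x86-64 build with headers that ship the seccomp and no_new_privs constants, none of which appear in the hunk above:

    #include <errno.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/audit.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    int main(void)
    {
        struct sock_filter filter[] = {
            BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch)),
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
            BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
            .len = sizeof(filter) / sizeof(filter[0]),
            .filter = filter,
        };

        /* Without this, attaching requires CAP_SYS_ADMIN (see seccomp_attach_filter). */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
            perror("PR_SET_NO_NEW_PRIVS");

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
            perror("PR_SET_SECCOMP");

        errno = 0;
        printf("getpid() -> %ld (errno=%d, expect EPERM=%d)\n",
               (long)syscall(__NR_getpid), errno, EPERM);
        return 0;
    }
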
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 60636a4e25c3..4567fc020fe3 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
118 * down_trylock - try to acquire the semaphore, without waiting 118 * down_trylock - try to acquire the semaphore, without waiting
119 * @sem: the semaphore to be acquired 119 * @sem: the semaphore to be acquired
120 * 120 *
121 * Try to acquire the semaphore atomically. Returns 0 if the mutex has 121 * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
122 * been acquired successfully or 1 if it cannot be acquired. 122 * been acquired successfully or 1 if it cannot be acquired.
123 * 123 *
124 * NOTE: This return value is inverted from both spin_trylock and 124 * NOTE: This return value is inverted from both spin_trylock and
diff --git a/kernel/signal.c b/kernel/signal.c
index 17afcaf582d0..677102789cf2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -29,6 +29,7 @@
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h> 31#include <linux/user_namespace.h>
32#include <linux/uprobes.h>
32#define CREATE_TRACE_POINTS 33#define CREATE_TRACE_POINTS
33#include <trace/events/signal.h> 34#include <trace/events/signal.h>
34 35
@@ -160,7 +161,7 @@ void recalc_sigpending(void)
160 161
161#define SYNCHRONOUS_MASK \ 162#define SYNCHRONOUS_MASK \
162 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ 163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
163 sigmask(SIGTRAP) | sigmask(SIGFPE)) 164 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
164 165
165int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
166{ 167{
@@ -767,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t)
767 const struct cred *cred = current_cred(); 768 const struct cred *cred = current_cred();
768 const struct cred *tcred = __task_cred(t); 769 const struct cred *tcred = __task_cred(t);
769 770
770 if (cred->user->user_ns == tcred->user->user_ns && 771 if (uid_eq(cred->euid, tcred->suid) ||
771 (cred->euid == tcred->suid || 772 uid_eq(cred->euid, tcred->uid) ||
772 cred->euid == tcred->uid || 773 uid_eq(cred->uid, tcred->suid) ||
773 cred->uid == tcred->suid || 774 uid_eq(cred->uid, tcred->uid))
774 cred->uid == tcred->uid))
775 return 1; 775 return 1;
776 776
777 if (ns_capable(tcred->user->user_ns, CAP_KILL)) 777 if (ns_capable(tcred->user_ns, CAP_KILL))
778 return 1; 778 return 1;
779 779
780 return 0; 780 return 0;
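
[Editor's note] The uid_eq() churn in this file follows from switching the cred fields to kernel-internal kuid_t values, which deliberately cannot be compared to namespace-relative uid_t integers by accident. A stand-alone model of that idea (the _demo names are mine, not the kernel's definitions from include/linux/uidgid.h):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int uid_demo_t;                 /* namespace-relative uid */
    typedef struct { uid_demo_t val; } kuid_demo_t;  /* kernel-internal, global uid */

    /* Comparisons must go through a helper; `a == b` on the structs is a
     * compile error, which is the point of the wrapper type. */
    static bool uid_eq_demo(kuid_demo_t a, kuid_demo_t b)
    {
        return a.val == b.val;
    }

    int main(void)
    {
        kuid_demo_t euid = { 1000 }, target = { 1000 };

        printf("%d\n", uid_eq_demo(euid, target));   /* 1 */
        return 0;
    }
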
@@ -1020,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1021} 1021}
1022 1022
1023/*
1024 * map the uid in struct cred into user namespace *ns
1025 */
1026static inline uid_t map_cred_ns(const struct cred *cred,
1027 struct user_namespace *ns)
1028{
1029 return user_ns_map_uid(ns, cred, cred->uid);
1030}
1031
1032#ifdef CONFIG_USER_NS 1023#ifdef CONFIG_USER_NS
1033static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) 1024static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1034{ 1025{
@@ -1038,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str
1038 if (SI_FROMKERNEL(info)) 1029 if (SI_FROMKERNEL(info))
1039 return; 1030 return;
1040 1031
1041 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), 1032 rcu_read_lock();
1042 current_cred(), info->si_uid); 1033 info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
1034 make_kuid(current_user_ns(), info->si_uid));
1035 rcu_read_unlock();
1043} 1036}
1044#else 1037#else
1045static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) 1038static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
@@ -1106,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1106 q->info.si_code = SI_USER; 1099 q->info.si_code = SI_USER;
1107 q->info.si_pid = task_tgid_nr_ns(current, 1100 q->info.si_pid = task_tgid_nr_ns(current,
1108 task_active_pid_ns(t)); 1101 task_active_pid_ns(t));
1109 q->info.si_uid = current_uid(); 1102 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
1110 break; 1103 break;
1111 case (unsigned long) SEND_SIG_PRIV: 1104 case (unsigned long) SEND_SIG_PRIV:
1112 q->info.si_signo = sig; 1105 q->info.si_signo = sig;
@@ -1387,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred,
1387 struct task_struct *target) 1380 struct task_struct *target)
1388{ 1381{
1389 const struct cred *pcred = __task_cred(target); 1382 const struct cred *pcred = __task_cred(target);
1390 if (cred->user_ns != pcred->user_ns) 1383 if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
1391 return 0; 1384 !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
1392 if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
1393 cred->uid != pcred->suid && cred->uid != pcred->uid)
1394 return 0; 1385 return 0;
1395 return 1; 1386 return 1;
1396} 1387}
@@ -1665,21 +1656,20 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1665 info.si_signo = sig; 1656 info.si_signo = sig;
1666 info.si_errno = 0; 1657 info.si_errno = 0;
1667 /* 1658 /*
1668 * we are under tasklist_lock here so our parent is tied to 1659 * We are under tasklist_lock here so our parent is tied to
1669 * us and cannot exit and release its namespace. 1660 * us and cannot change.
1670 * 1661 *
1671 * the only it can is to switch its nsproxy with sys_unshare, 1662 * task_active_pid_ns will always return the same pid namespace
1672 * bu uncharing pid namespaces is not allowed, so we'll always 1663 * until a task passes through release_task.
1673 * see relevant namespace
1674 * 1664 *
1675 * write_lock() currently calls preempt_disable() which is the 1665 * write_lock() currently calls preempt_disable() which is the
1676 * same as rcu_read_lock(), but according to Oleg, this is not 1666 * same as rcu_read_lock(), but according to Oleg, this is not
1677 * correct to rely on this 1667 * correct to rely on this
1678 */ 1668 */
1679 rcu_read_lock(); 1669 rcu_read_lock();
1680 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1670 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
1681 info.si_uid = map_cred_ns(__task_cred(tsk), 1671 info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
1682 task_cred_xxx(tsk->parent, user_ns)); 1672 task_uid(tsk));
1683 rcu_read_unlock(); 1673 rcu_read_unlock();
1684 1674
1685 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1675 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1762,8 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1762 */ 1752 */
1763 rcu_read_lock(); 1753 rcu_read_lock();
1764 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1754 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1765 info.si_uid = map_cred_ns(__task_cred(tsk), 1755 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1766 task_cred_xxx(parent, user_ns));
1767 rcu_read_unlock(); 1756 rcu_read_unlock();
1768 1757
1769 info.si_utime = cputime_to_clock_t(tsk->utime); 1758 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1973,7 +1962,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1973 info.si_signo = signr; 1962 info.si_signo = signr;
1974 info.si_code = exit_code; 1963 info.si_code = exit_code;
1975 info.si_pid = task_pid_vnr(current); 1964 info.si_pid = task_pid_vnr(current);
1976 info.si_uid = current_uid(); 1965 info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
1977 1966
1978 /* Let the debugger run. */ 1967 /* Let the debugger run. */
1979 ptrace_stop(exit_code, why, 1, &info); 1968 ptrace_stop(exit_code, why, 1, &info);
@@ -2181,8 +2170,8 @@ static int ptrace_signal(int signr, siginfo_t *info,
2181 info->si_code = SI_USER; 2170 info->si_code = SI_USER;
2182 rcu_read_lock(); 2171 rcu_read_lock();
2183 info->si_pid = task_pid_vnr(current->parent); 2172 info->si_pid = task_pid_vnr(current->parent);
2184 info->si_uid = map_cred_ns(__task_cred(current->parent), 2173 info->si_uid = from_kuid_munged(current_user_ns(),
2185 current_user_ns()); 2174 task_uid(current->parent));
2186 rcu_read_unlock(); 2175 rcu_read_unlock();
2187 } 2176 }
2188 2177
@@ -2202,6 +2191,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2202 struct signal_struct *signal = current->signal; 2191 struct signal_struct *signal = current->signal;
2203 int signr; 2192 int signr;
2204 2193
2194 if (unlikely(uprobe_deny_signal()))
2195 return 0;
2196
2205relock: 2197relock:
2206 /* 2198 /*
2207 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2199 * We'll jump back here after any time we were stopped in TASK_STOPPED.
@@ -2376,24 +2368,34 @@ relock:
2376} 2368}
2377 2369
2378/** 2370/**
 2379 * block_sigmask - add @ka's signal mask to current->blocked 2371 * signal_delivered - record that a signal was successfully delivered
2380 * @ka: action for @signr 2372 * @sig: number of signal being delivered
2381 * @signr: signal that has been successfully delivered 2373 * @info: siginfo_t of signal being delivered
2374 * @ka: sigaction setting that chose the handler
2375 * @regs: user register state
2376 * @stepping: nonzero if debugger single-step or block-step in use
2382 * 2377 *
 2383 * This function should be called when a signal has successfully been 2378 * This function should be called when a signal has successfully been
2384 * delivered. It adds the mask of signals for @ka to current->blocked 2379 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
2385 * so that they are blocked during the execution of the signal 2380 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2386 * handler. In addition, @signr will be blocked unless %SA_NODEFER is 2381 * is set in @ka->sa.sa_flags. Tracing is notified.
2387 * set in @ka->sa.sa_flags.
2388 */ 2382 */
2389void block_sigmask(struct k_sigaction *ka, int signr) 2383void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2384 struct pt_regs *regs, int stepping)
2390{ 2385{
2391 sigset_t blocked; 2386 sigset_t blocked;
2392 2387
2388 /* A signal was successfully delivered, and the
2389 saved sigmask was stored on the signal frame,
2390 and will be restored by sigreturn. So we can
2391 simply clear the restore sigmask flag. */
2392 clear_restore_sigmask();
2393
2393 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 2394 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2394 if (!(ka->sa.sa_flags & SA_NODEFER)) 2395 if (!(ka->sa.sa_flags & SA_NODEFER))
2395 sigaddset(&blocked, signr); 2396 sigaddset(&blocked, sig);
2396 set_current_blocked(&blocked); 2397 set_current_blocked(&blocked);
2398 tracehook_signal_handler(sig, info, ka, regs, stepping);
2397} 2399}
2398 2400
2399/* 2401/*
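
What signal_delivered() enforces is directly observable from userspace: while the handler runs, @ka->sa.sa_mask plus the delivered signal itself (absent SA_NODEFER) are blocked. A minimal plain-POSIX check, not part of this diff:

#include <signal.h>
#include <stdio.h>
#include <string.h>

static volatile sig_atomic_t usr1_blocked = -1;
static volatile sig_atomic_t usr2_blocked = -1;

static void handler(int sig)
{
        sigset_t cur;

        (void)sig;
        sigprocmask(SIG_BLOCK, NULL, &cur);         /* read the mask in effect here */
        usr1_blocked = sigismember(&cur, SIGUSR1);  /* 1: delivered signal, no SA_NODEFER */
        usr2_blocked = sigismember(&cur, SIGUSR2);  /* 1: listed in sa_mask */
}

int main(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sigaddset(&sa.sa_mask, SIGUSR2);
        sigaction(SIGUSR1, &sa, NULL);

        raise(SIGUSR1);
        printf("in handler: SIGUSR1 blocked=%d SIGUSR2 blocked=%d\n",
               (int)usr1_blocked, (int)usr2_blocked);
        return 0;
}
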
@@ -2526,7 +2528,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2526 * It is wrong to change ->blocked directly, this helper should be used 2528 * It is wrong to change ->blocked directly, this helper should be used
2527 * to ensure the process can't miss a shared signal we are going to block. 2529 * to ensure the process can't miss a shared signal we are going to block.
2528 */ 2530 */
2529void set_current_blocked(const sigset_t *newset) 2531void set_current_blocked(sigset_t *newset)
2532{
2533 struct task_struct *tsk = current;
2534 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
2535 spin_lock_irq(&tsk->sighand->siglock);
2536 __set_task_blocked(tsk, newset);
2537 spin_unlock_irq(&tsk->sighand->siglock);
2538}
2539
2540void __set_current_blocked(const sigset_t *newset)
2530{ 2541{
2531 struct task_struct *tsk = current; 2542 struct task_struct *tsk = current;
2532 2543
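
With the split above, callers of set_current_blocked() get the SIGKILL/SIGSTOP filtering for free, while paths such as sigprocmask() that already sanitize the set switch to __set_current_blocked(). Either way the userspace guarantee is unchanged; a quick plain-POSIX check (not part of this diff):

#include <signal.h>
#include <stdio.h>

int main(void)
{
        sigset_t want, got;

        sigemptyset(&want);
        sigaddset(&want, SIGTERM);
        sigaddset(&want, SIGKILL);
        sigaddset(&want, SIGSTOP);

        sigprocmask(SIG_BLOCK, &want, NULL);    /* SIGKILL/SIGSTOP silently dropped */
        sigprocmask(SIG_BLOCK, NULL, &got);

        printf("blocked: TERM=%d KILL=%d STOP=%d\n",
               sigismember(&got, SIGTERM),      /* 1 */
               sigismember(&got, SIGKILL),      /* 0 */
               sigismember(&got, SIGSTOP));     /* 0 */
        return 0;
}
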
@@ -2566,7 +2577,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2566 return -EINVAL; 2577 return -EINVAL;
2567 } 2578 }
2568 2579
2569 set_current_blocked(&newset); 2580 __set_current_blocked(&newset);
2570 return 0; 2581 return 0;
2571} 2582}
2572 2583
@@ -2706,6 +2717,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2706 err |= __put_user(from->si_uid, &to->si_uid); 2717 err |= __put_user(from->si_uid, &to->si_uid);
2707 err |= __put_user(from->si_ptr, &to->si_ptr); 2718 err |= __put_user(from->si_ptr, &to->si_ptr);
2708 break; 2719 break;
2720#ifdef __ARCH_SIGSYS
2721 case __SI_SYS:
2722 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2723 err |= __put_user(from->si_syscall, &to->si_syscall);
2724 err |= __put_user(from->si_arch, &to->si_arch);
2725 break;
2726#endif
2709 default: /* this is just in case for now ... */ 2727 default: /* this is just in case for now ... */
2710 err |= __put_user(from->si_pid, &to->si_pid); 2728 err |= __put_user(from->si_pid, &to->si_pid);
2711 err |= __put_user(from->si_uid, &to->si_uid); 2729 err |= __put_user(from->si_uid, &to->si_uid);
@@ -2828,7 +2846,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2828 info.si_errno = 0; 2846 info.si_errno = 0;
2829 info.si_code = SI_USER; 2847 info.si_code = SI_USER;
2830 info.si_pid = task_tgid_vnr(current); 2848 info.si_pid = task_tgid_vnr(current);
2831 info.si_uid = current_uid(); 2849 info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
2832 2850
2833 return kill_something_info(sig, &info, pid); 2851 return kill_something_info(sig, &info, pid);
2834} 2852}
@@ -2871,7 +2889,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2871 info.si_errno = 0; 2889 info.si_errno = 0;
2872 info.si_code = SI_TKILL; 2890 info.si_code = SI_TKILL;
2873 info.si_pid = task_tgid_vnr(current); 2891 info.si_pid = task_tgid_vnr(current);
2874 info.si_uid = current_uid(); 2892 info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
2875 2893
2876 return do_send_specific(tgid, pid, sig, &info); 2894 return do_send_specific(tgid, pid, sig, &info);
2877} 2895}
@@ -3133,7 +3151,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3133 return -EINVAL; 3151 return -EINVAL;
3134 } 3152 }
3135 3153
3136 set_current_blocked(&new_blocked); 3154 __set_current_blocked(&new_blocked);
3137 } 3155 }
3138 3156
3139 if (oset) { 3157 if (oset) {
@@ -3197,7 +3215,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3197 int old = current->blocked.sig[0]; 3215 int old = current->blocked.sig[0];
3198 sigset_t newset; 3216 sigset_t newset;
3199 3217
3200 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
3201 set_current_blocked(&newset); 3218 set_current_blocked(&newset);
3202 3219
3203 return old; 3220 return old;
@@ -3236,6 +3253,17 @@ SYSCALL_DEFINE0(pause)
3236 3253
3237#endif 3254#endif
3238 3255
3256int sigsuspend(sigset_t *set)
3257{
3258 current->saved_sigmask = current->blocked;
3259 set_current_blocked(set);
3260
3261 current->state = TASK_INTERRUPTIBLE;
3262 schedule();
3263 set_restore_sigmask();
3264 return -ERESTARTNOHAND;
3265}
3266
3239#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 3267#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3240/** 3268/**
3241 * sys_rt_sigsuspend - replace the signal mask for a value with the 3269 * sys_rt_sigsuspend - replace the signal mask for a value with the
@@ -3253,15 +3281,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3253 3281
3254 if (copy_from_user(&newset, unewset, sizeof(newset))) 3282 if (copy_from_user(&newset, unewset, sizeof(newset)))
3255 return -EFAULT; 3283 return -EFAULT;
3256 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3284 return sigsuspend(&newset);
3257
3258 current->saved_sigmask = current->blocked;
3259 set_current_blocked(&newset);
3260
3261 current->state = TASK_INTERRUPTIBLE;
3262 schedule();
3263 set_restore_sigmask();
3264 return -ERESTARTNOHAND;
3265} 3285}
3266#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3286#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
3267 3287
diff --git a/kernel/smp.c b/kernel/smp.c
index 2f8b10ecf759..d0ae5b24875e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,8 @@
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#include "smpboot.h"
17
16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
17static struct { 19static struct {
18 struct list_head queue; 20 struct list_head queue;
@@ -669,6 +671,8 @@ void __init smp_init(void)
669{ 671{
670 unsigned int cpu; 672 unsigned int cpu;
671 673
674 idle_threads_init();
675
672 /* FIXME: This should be done in userspace --RR */ 676 /* FIXME: This should be done in userspace --RR */
673 for_each_present_cpu(cpu) { 677 for_each_present_cpu(cpu) {
674 if (num_online_cpus() >= setup_max_cpus) 678 if (num_online_cpus() >= setup_max_cpus)
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
791 } 795 }
792} 796}
793EXPORT_SYMBOL(on_each_cpu_cond); 797EXPORT_SYMBOL(on_each_cpu_cond);
798
799static void do_nothing(void *unused)
800{
801}
802
803/**
804 * kick_all_cpus_sync - Force all cpus out of idle
805 *
806 * Used to synchronize the update of pm_idle function pointer. It's
807 * called after the pointer is updated and returns after the dummy
808 * callback function has been executed on all cpus. The execution of
809 * the function can only happen on the remote cpus after they have
810 * left the idle function which had been called via pm_idle function
811 * pointer. So it's guaranteed that nothing uses the previous pointer
812 * anymore.
813 */
814void kick_all_cpus_sync(void)
815{
816 /* Make sure the change is visible before we kick the cpus */
817 smp_mb();
818 smp_call_function(do_nothing, NULL, 1);
819}
820EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
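
A hedged sketch of the intended call pattern (kernel-style pseudocode; the idle-callback names are made up, not from this patch): publish the new pointer first, then call kick_all_cpus_sync(), after which no CPU can still be executing through the old pointer.

/* Illustrative sketch only -- my_idle/my_default_idle are hypothetical names. */
static void (*my_idle)(void) = my_default_idle;

void my_set_idle_handler(void (*fn)(void))
{
        my_idle = fn;              /* publish the new callback */
        kick_all_cpus_sync();      /* smp_mb() plus a dummy IPI to every cpu:    */
                                   /* a remote cpu must leave its old idle loop  */
                                   /* to run the IPI, so once this returns       */
                                   /* nothing still uses the previous pointer.   */
}
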
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 000000000000..98f60c5caa1b
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,67 @@
1/*
2 * Common SMP CPU bringup/teardown functions
3 */
4#include <linux/err.h>
5#include <linux/smp.h>
6#include <linux/init.h>
7#include <linux/sched.h>
8#include <linux/percpu.h>
9
10#include "smpboot.h"
11
12#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
13/*
14 * For the hotplug case we keep the task structs around and reuse
15 * them.
16 */
17static DEFINE_PER_CPU(struct task_struct *, idle_threads);
18
19struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
20{
21 struct task_struct *tsk = per_cpu(idle_threads, cpu);
22
23 if (!tsk)
24 return ERR_PTR(-ENOMEM);
25 init_idle(tsk, cpu);
26 return tsk;
27}
28
29void __init idle_thread_set_boot_cpu(void)
30{
31 per_cpu(idle_threads, smp_processor_id()) = current;
32}
33
34/**
35 * idle_init - Initialize the idle thread for a cpu
36 * @cpu: The cpu for which the idle thread should be initialized
37 *
38 * Creates the thread if it does not exist.
39 */
40static inline void idle_init(unsigned int cpu)
41{
42 struct task_struct *tsk = per_cpu(idle_threads, cpu);
43
44 if (!tsk) {
45 tsk = fork_idle(cpu);
46 if (IS_ERR(tsk))
47 pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
48 else
49 per_cpu(idle_threads, cpu) = tsk;
50 }
51}
52
53/**
54 * idle_threads_init - Initialize idle threads for all cpus
55 */
56void __init idle_threads_init(void)
57{
58 unsigned int cpu, boot_cpu;
59
60 boot_cpu = smp_processor_id();
61
62 for_each_possible_cpu(cpu) {
63 if (cpu != boot_cpu)
64 idle_init(cpu);
65 }
66}
67#endif
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 000000000000..80c0acfb8472
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,18 @@
1#ifndef SMPBOOT_H
2#define SMPBOOT_H
3
4struct task_struct;
5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void);
11void idle_threads_init(void);
12#else
13static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
14static inline void idle_thread_set_boot_cpu(void) { }
15static inline void idle_threads_init(void) { }
16#endif
17
18#endif
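
A simplified sketch of the consumer side, based on how the CPU-hotplug core is reworked elsewhere in this series (the function name below is an illustrative stand-in for _cpu_up(), and __cpu_up() gaining a task argument is part of that rework, not of this hunk): the cached idle task is fetched, or reported missing, before the architecture is asked to start the CPU.

/* Sketch only -- bring_up_cpu() is a made-up name. */
static int bring_up_cpu(unsigned int cpu)
{
        struct task_struct *idle = idle_thread_get(cpu);

        if (IS_ERR(idle))
                return PTR_ERR(idle);   /* idle task was never forked for this cpu */

        return __cpu_up(cpu, idle);     /* arch code starts the cpu on this task */
}
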
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..2095be3318d5 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37/*
38 * Initialize an rcu_batch structure to empty.
39 */
40static inline void rcu_batch_init(struct rcu_batch *b)
41{
42 b->head = NULL;
43 b->tail = &b->head;
44}
45
46/*
47 * Enqueue a callback onto the tail of the specified rcu_batch structure.
48 */
49static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
50{
51 *b->tail = head;
52 b->tail = &head->next;
53}
54
55/*
56 * Is the specified rcu_batch structure empty?
57 */
58static inline bool rcu_batch_empty(struct rcu_batch *b)
59{
60 return b->tail == &b->head;
61}
62
63/*
64 * Remove the callback at the head of the specified rcu_batch structure
65 * and return a pointer to it, or return NULL if the structure is empty.
66 */
67static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
68{
69 struct rcu_head *head;
70
71 if (rcu_batch_empty(b))
72 return NULL;
73
74 head = b->head;
75 b->head = head->next;
76 if (b->tail == &head->next)
77 rcu_batch_init(b);
78
79 return head;
80}
81
82/*
83 * Move all callbacks from the rcu_batch structure specified by "from" to
84 * the structure specified by "to".
85 */
86static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
87{
88 if (!rcu_batch_empty(from)) {
89 *to->tail = from->head;
90 to->tail = from->tail;
91 rcu_batch_init(from);
92 }
93}
94
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
37static int init_srcu_struct_fields(struct srcu_struct *sp) 98static int init_srcu_struct_fields(struct srcu_struct *sp)
38{ 99{
39 sp->completed = 0; 100 sp->completed = 0;
40 mutex_init(&sp->mutex); 101 spin_lock_init(&sp->queue_lock);
102 sp->running = false;
103 rcu_batch_init(&sp->batch_queue);
104 rcu_batch_init(&sp->batch_check0);
105 rcu_batch_init(&sp->batch_check1);
106 rcu_batch_init(&sp->batch_done);
107 INIT_DELAYED_WORK(&sp->work, process_srcu);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 108 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM; 109 return sp->per_cpu_ref ? 0 : -ENOMEM;
43} 110}
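
The rcu_batch helpers above are a plain tail-pointer FIFO, so enqueue, dequeue and whole-list splice are all O(1) with no length field. A standalone userspace analog (illustrative names, plain C, not part of this diff) behaves the same way and can be compiled and run directly:

#include <assert.h>
#include <stddef.h>

struct node  { struct node *next; };
struct batch { struct node *head, **tail; };

static void batch_init(struct batch *b)  { b->head = NULL; b->tail = &b->head; }
static int  batch_empty(struct batch *b) { return b->tail == &b->head; }

static void batch_queue(struct batch *b, struct node *n)
{
        n->next = NULL;
        *b->tail = n;                   /* link after the current last element */
        b->tail = &n->next;             /* remember where the next link goes */
}

static struct node *batch_dequeue(struct batch *b)
{
        struct node *n = b->head;

        if (!n)
                return NULL;
        b->head = n->next;
        if (b->tail == &n->next)        /* removed the last element */
                batch_init(b);
        return n;
}

static void batch_move(struct batch *to, struct batch *from)
{
        if (!batch_empty(from)) {
                *to->tail = from->head; /* splice the whole list in O(1) */
                to->tail = from->tail;
                batch_init(from);
        }
}

int main(void)
{
        struct batch a, b;
        struct node n1, n2;

        batch_init(&a);
        batch_init(&b);
        batch_queue(&a, &n1);
        batch_queue(&a, &n2);
        batch_move(&b, &a);             /* a is now empty, b holds n1, n2 in order */
        assert(batch_empty(&a));
        assert(batch_dequeue(&b) == &n1);
        assert(batch_dequeue(&b) == &n2);
        assert(batch_empty(&b));
        return 0;
}
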
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 140#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 141
75/* 142/*
76 * srcu_readers_active_idx -- returns approximate number of readers 143 * Returns approximate total of the readers' ->seq[] values for the
77 * active on the specified rank of per-CPU counters. 144 * rank of per-CPU counters specified by idx.
78 */ 145 */
146static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
147{
148 int cpu;
149 unsigned long sum = 0;
150 unsigned long t;
79 151
80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) 152 for_each_possible_cpu(cpu) {
153 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
154 sum += t;
155 }
156 return sum;
157}
158
159/*
160 * Returns approximate number of readers active on the specified rank
161 * of the per-CPU ->c[] counters.
162 */
163static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
81{ 164{
82 int cpu; 165 int cpu;
83 int sum; 166 unsigned long sum = 0;
167 unsigned long t;
84 168
85 sum = 0; 169 for_each_possible_cpu(cpu) {
86 for_each_possible_cpu(cpu) 170 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; 171 sum += t;
172 }
88 return sum; 173 return sum;
89} 174}
90 175
176/*
177 * Return true if the number of pre-existing readers is determined to
178 * be stably zero. An example unstable zero can occur if the call
179 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
180 * but due to task migration, sees the corresponding __srcu_read_unlock()
181 * decrement. This can happen because srcu_readers_active_idx() takes
182 * time to sum the array, and might in fact be interrupted or preempted
183 * partway through the summation.
184 */
185static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
186{
187 unsigned long seq;
188
189 seq = srcu_readers_seq_idx(sp, idx);
190
191 /*
192 * The following smp_mb() A pairs with the smp_mb() B located in
193 * __srcu_read_lock(). This pairing ensures that if an
194 * __srcu_read_lock() increments its counter after the summation
195 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
196 * critical section will see any changes made prior to the start
197 * of the current SRCU grace period.
198 *
199 * Also, if the above call to srcu_readers_seq_idx() saw the
200 * increment of ->seq[], then the call to srcu_readers_active_idx()
201 * must see the increment of ->c[].
202 */
203 smp_mb(); /* A */
204
205 /*
206 * Note that srcu_readers_active_idx() can incorrectly return
207 * zero even though there is a pre-existing reader throughout.
208 * To see this, suppose that task A is in a very long SRCU
209 * read-side critical section that started on CPU 0, and that
210 * no other reader exists, so that the sum of the counters
211 * is equal to one. Then suppose that task B starts executing
212 * srcu_readers_active_idx(), summing up to CPU 1, and then that
213 * task C starts reading on CPU 0, so that its increment is not
214 * summed, but finishes reading on CPU 2, so that its decrement
215 * -is- summed. Then when task B completes its sum, it will
216 * incorrectly get zero, despite the fact that task A has been
217 * in its SRCU read-side critical section the whole time.
218 *
219 * We therefore do a validation step should srcu_readers_active_idx()
220 * return zero.
221 */
222 if (srcu_readers_active_idx(sp, idx) != 0)
223 return false;
224
225 /*
226 * The remainder of this function is the validation step.
227 * The following smp_mb() D pairs with the smp_mb() C in
228 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
229 * by srcu_readers_active_idx() above, then any destructive
230 * operation performed after the grace period will happen after
231 * the corresponding SRCU read-side critical section.
232 *
233 * Note that there can be at most NR_CPUS worth of readers using
234 * the old index, which is not enough to overflow even a 32-bit
235 * integer. (Yes, this does mean that systems having more than
236 * a billion or so CPUs need to be 64-bit systems.) Therefore,
237 * the sum of the ->seq[] counters cannot possibly overflow.
238 * Therefore, the only way that the return values of the two
239 * calls to srcu_readers_seq_idx() can be equal is if there were
240 * no increments of the corresponding rank of ->seq[] counts
241 * in the interim. But the missed-increment scenario laid out
242 * above includes an increment of the ->seq[] counter by
243 * the corresponding __srcu_read_lock(). Therefore, if this
244 * scenario occurs, the return values from the two calls to
245 * srcu_readers_seq_idx() will differ, and thus the validation
246 * step below suffices.
247 */
248 smp_mb(); /* D */
249
250 return srcu_readers_seq_idx(sp, idx) == seq;
251}
252
91/** 253/**
92 * srcu_readers_active - returns approximate number of readers. 254 * srcu_readers_active - returns approximate number of readers.
93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 255 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
98 */ 260 */
99static int srcu_readers_active(struct srcu_struct *sp) 261static int srcu_readers_active(struct srcu_struct *sp)
100{ 262{
101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); 263 int cpu;
264 unsigned long sum = 0;
265
266 for_each_possible_cpu(cpu) {
267 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
268 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
269 }
270 return sum;
102} 271}
103 272
104/** 273/**
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
131 int idx; 300 int idx;
132 301
133 preempt_disable(); 302 preempt_disable();
134 idx = sp->completed & 0x1; 303 idx = rcu_dereference_index_check(sp->completed,
135 barrier(); /* ensure compiler looks -once- at sp->completed. */ 304 rcu_read_lock_sched_held()) & 0x1;
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
137 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 306 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
138 preempt_enable(); 308 preempt_enable();
139 return idx; 309 return idx;
140} 310}
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
149void __srcu_read_unlock(struct srcu_struct *sp, int idx) 319void __srcu_read_unlock(struct srcu_struct *sp, int idx)
150{ 320{
151 preempt_disable(); 321 preempt_disable();
152 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 322 smp_mb(); /* C */ /* Avoid leaking the critical section. */
153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
154 preempt_enable(); 324 preempt_enable();
155} 325}
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 326EXPORT_SYMBOL_GPL(__srcu_read_unlock);
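
For reference, the read/update pattern these primitives back (a hedged kernel-style sketch, not code from this patch; struct foo, my_srcu and my_lock are illustrative names): srcu_read_lock()/srcu_read_unlock() are the existing wrappers over the functions above, and call_srcu() is the new asynchronous grace-period interface.

/* Illustrative sketch only. */
struct foo {
        int val;
        struct rcu_head rcu;
};

static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) at setup time */
static struct foo __rcu *gp;            /* assumed non-NULL once published */
static DEFINE_MUTEX(my_lock);           /* serializes updaters */

static int read_val(void)
{
        int idx, val;

        idx = srcu_read_lock(&my_srcu);         /* readers may block in here */
        val = srcu_dereference(gp, &my_srcu)->val;
        srcu_read_unlock(&my_srcu, idx);
        return val;
}

static void free_foo(struct rcu_head *head)
{
        kfree(container_of(head, struct foo, rcu));
}

static void publish(struct foo *new)
{
        struct foo *old;

        mutex_lock(&my_lock);
        old = rcu_dereference_protected(gp, lockdep_is_held(&my_lock));
        rcu_assign_pointer(gp, new);
        mutex_unlock(&my_lock);

        if (old)
                call_srcu(&my_srcu, &old->rcu, free_foo);  /* added by this patch */
}
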
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
163 * we repeatedly block for 1-millisecond time periods. This approach 333 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter. 334 * has done well in testing, so there is no need for a config parameter.
165 */ 335 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10 336#define SRCU_RETRY_CHECK_DELAY 5
337#define SYNCHRONIZE_SRCU_TRYCOUNT 2
338#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
167 339
168/* 340/*
169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 341 * @@@ Wait until all pre-existing readers complete. Such readers
342 * will have used the index specified by "idx".
 343 * The caller should ensure that ->completed is not changed while checking
344 * and idx = (->completed & 1) ^ 1
170 */ 345 */
171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 346static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
172{ 347{
173 int idx; 348 for (;;) {
174 349 if (srcu_readers_active_idx_check(sp, idx))
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 350 return true;
176 !lock_is_held(&rcu_bh_lock_map) && 351 if (--trycount <= 0)
177 !lock_is_held(&rcu_lock_map) && 352 return false;
178 !lock_is_held(&rcu_sched_lock_map), 353 udelay(SRCU_RETRY_CHECK_DELAY);
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 354 }
180 355}
181 idx = sp->completed;
182 mutex_lock(&sp->mutex);
183 356
184 /* 357/*
185 * Check to see if someone else did the work for us while we were 358 * Increment the ->completed counter so that future SRCU readers will
186 * waiting to acquire the lock. We need -two- advances of 359 * use the other rank of the ->c[] and ->seq[] arrays. This allows
187 * the counter, not just one. If there was but one, we might have 360 * us to wait for pre-existing readers in a starvation-free manner.
188 * shown up -after- our helper's first synchronize_sched(), thus 361 */
189 * having failed to prevent CPU-reordering races with concurrent 362static void srcu_flip(struct srcu_struct *sp)
190 * srcu_read_unlock()s on other CPUs (see comment below). So we 363{
191 * either (1) wait for two or (2) supply the second ourselves. 364 sp->completed++;
192 */ 365}
193 366
194 if ((sp->completed - idx) >= 2) { 367/*
195 mutex_unlock(&sp->mutex); 368 * Enqueue an SRCU callback on the specified srcu_struct structure,
196 return; 369 * initiating grace-period processing if it is not already running.
370 */
371void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
372 void (*func)(struct rcu_head *head))
373{
374 unsigned long flags;
375
376 head->next = NULL;
377 head->func = func;
378 spin_lock_irqsave(&sp->queue_lock, flags);
379 rcu_batch_queue(&sp->batch_queue, head);
380 if (!sp->running) {
381 sp->running = true;
382 queue_delayed_work(system_nrt_wq, &sp->work, 0);
197 } 383 }
384 spin_unlock_irqrestore(&sp->queue_lock, flags);
385}
386EXPORT_SYMBOL_GPL(call_srcu);
198 387
199 sync_func(); /* Force memory barrier on all CPUs. */ 388struct rcu_synchronize {
389 struct rcu_head head;
390 struct completion completion;
391};
200 392
201 /* 393/*
202 * The preceding synchronize_sched() ensures that any CPU that 394 * Awaken the corresponding synchronize_srcu() instance now that a
203 * sees the new value of sp->completed will also see any preceding 395 * grace period has elapsed.
204 * changes to data structures made by this CPU. This prevents 396 */
205 * some other CPU from reordering the accesses in its SRCU 397static void wakeme_after_rcu(struct rcu_head *head)
206 * read-side critical section to precede the corresponding 398{
207 * srcu_read_lock() -- ensuring that such references will in 399 struct rcu_synchronize *rcu;
208 * fact be protected.
209 *
210 * So it is now safe to do the flip.
211 */
212 400
213 idx = sp->completed & 0x1; 401 rcu = container_of(head, struct rcu_synchronize, head);
214 sp->completed++; 402 complete(&rcu->completion);
403}
215 404
216 sync_func(); /* Force memory barrier on all CPUs. */ 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
406static void srcu_reschedule(struct srcu_struct *sp);
217 407
218 /* 408/*
219 * At this point, because of the preceding synchronize_sched(), 409 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
220 * all srcu_read_lock() calls using the old counters have completed. 410 */
221 * Their corresponding critical sections might well be still 411static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
222 * executing, but the srcu_read_lock() primitives themselves 412{
223 * will have finished executing. We initially give readers 413 struct rcu_synchronize rcu;
224 * an arbitrarily chosen 10 microseconds to get out of their 414 struct rcu_head *head = &rcu.head;
225 * SRCU read-side critical sections, then loop waiting 1/HZ 415 bool done = false;
226 * seconds per iteration. The 10-microsecond value has done
227 * very well in testing.
228 */
229
230 if (srcu_readers_active_idx(sp, idx))
231 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
232 while (srcu_readers_active_idx(sp, idx))
233 schedule_timeout_interruptible(1);
234 416
235 sync_func(); /* Force memory barrier on all CPUs. */ 417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
418 !lock_is_held(&rcu_bh_lock_map) &&
419 !lock_is_held(&rcu_lock_map) &&
420 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
236 422
237 /* 423 init_completion(&rcu.completion);
238 * The preceding synchronize_sched() forces all srcu_read_unlock() 424
239 * primitives that were executing concurrently with the preceding 425 head->next = NULL;
240 * for_each_possible_cpu() loop to have completed by this point. 426 head->func = wakeme_after_rcu;
241 * More importantly, it also forces the corresponding SRCU read-side 427 spin_lock_irq(&sp->queue_lock);
242 * critical sections to have also completed, and the corresponding 428 if (!sp->running) {
243 * references to SRCU-protected data items to be dropped. 429 /* steal the processing owner */
244 * 430 sp->running = true;
245 * Note: 431 rcu_batch_queue(&sp->batch_check0, head);
246 * 432 spin_unlock_irq(&sp->queue_lock);
247 * Despite what you might think at first glance, the 433
248 * preceding synchronize_sched() -must- be within the 434 srcu_advance_batches(sp, trycount);
249 * critical section ended by the following mutex_unlock(). 435 if (!rcu_batch_empty(&sp->batch_done)) {
250 * Otherwise, a task taking the early exit can race 436 BUG_ON(sp->batch_done.head != head);
251 * with a srcu_read_unlock(), which might have executed 437 rcu_batch_dequeue(&sp->batch_done);
252 * just before the preceding srcu_readers_active() check, 438 done = true;
253 * and whose CPU might have reordered the srcu_read_unlock() 439 }
254 * with the preceding critical section. In this case, there 440 /* give the processing owner to work_struct */
255 * is nothing preventing the synchronize_sched() task that is 441 srcu_reschedule(sp);
256 * taking the early exit from freeing a data structure that 442 } else {
257 * is still being referenced (out of order) by the task 443 rcu_batch_queue(&sp->batch_queue, head);
258 * doing the srcu_read_unlock(). 444 spin_unlock_irq(&sp->queue_lock);
259 * 445 }
260 * Alternatively, the comparison with "2" on the early exit
261 * could be changed to "3", but this increases synchronize_srcu()
262 * latency for bulk loads. So the current code is preferred.
263 */
264 446
265 mutex_unlock(&sp->mutex); 447 if (!done)
448 wait_for_completion(&rcu.completion);
266} 449}
267 450
268/** 451/**
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
281 */ 464 */
282void synchronize_srcu(struct srcu_struct *sp) 465void synchronize_srcu(struct srcu_struct *sp)
283{ 466{
284 __synchronize_srcu(sp, synchronize_sched); 467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT);
285} 468}
286EXPORT_SYMBOL_GPL(synchronize_srcu); 469EXPORT_SYMBOL_GPL(synchronize_srcu);
287 470
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
289 * synchronize_srcu_expedited - Brute-force SRCU grace period 472 * synchronize_srcu_expedited - Brute-force SRCU grace period
290 * @sp: srcu_struct with which to synchronize. 473 * @sp: srcu_struct with which to synchronize.
291 * 474 *
292 * Wait for an SRCU grace period to elapse, but use a "big hammer" 475 * Wait for an SRCU grace period to elapse, but be more aggressive about
293 * approach to force the grace period to end quickly. This consumes 476 * spinning rather than blocking when waiting.
294 * significant time on all CPUs and is unfriendly to real-time workloads,
295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
299 * 477 *
300 * Note that it is illegal to call this function while holding any lock 478 * Note that it is illegal to call this function while holding any lock
301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal 479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call
302 * to call this function from a CPU-hotplug notifier. Failing to observe
303 * these restriction will result in deadlock. It is also illegal to call
304 * synchronize_srcu_expedited() from the corresponding SRCU read-side 480 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is 481 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
309 */ 485 */
310void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
311{ 487{
312 __synchronize_srcu(sp, synchronize_sched_expedited); 488 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
313} 489}
314EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 490EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
315 491
316/** 492/**
493 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
494 */
495void srcu_barrier(struct srcu_struct *sp)
496{
497 synchronize_srcu(sp);
498}
499EXPORT_SYMBOL_GPL(srcu_barrier);
500
501/**
317 * srcu_batches_completed - return batches completed. 502 * srcu_batches_completed - return batches completed.
318 * @sp: srcu_struct on which to report batch completion. 503 * @sp: srcu_struct on which to report batch completion.
319 * 504 *
320 * Report the number of batches, correlated with, but not necessarily 505 * Report the number of batches, correlated with, but not necessarily
321 * precisely the same as, the number of grace periods that have elapsed. 506 * precisely the same as, the number of grace periods that have elapsed.
322 */ 507 */
323
324long srcu_batches_completed(struct srcu_struct *sp) 508long srcu_batches_completed(struct srcu_struct *sp)
325{ 509{
326 return sp->completed; 510 return sp->completed;
327} 511}
328EXPORT_SYMBOL_GPL(srcu_batches_completed); 512EXPORT_SYMBOL_GPL(srcu_batches_completed);
513
514#define SRCU_CALLBACK_BATCH 10
515#define SRCU_INTERVAL 1
516
517/*
518 * Move any new SRCU callbacks to the first stage of the SRCU grace
519 * period pipeline.
520 */
521static void srcu_collect_new(struct srcu_struct *sp)
522{
523 if (!rcu_batch_empty(&sp->batch_queue)) {
524 spin_lock_irq(&sp->queue_lock);
525 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
526 spin_unlock_irq(&sp->queue_lock);
527 }
528}
529
530/*
531 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
532 * ->batch_check1 and then to ->batch_done as readers drain.
533 */
534static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
535{
536 int idx = 1 ^ (sp->completed & 1);
537
538 /*
539 * Because readers might be delayed for an extended period after
540 * fetching ->completed for their index, at any point in time there
541 * might well be readers using both idx=0 and idx=1. We therefore
542 * need to wait for readers to clear from both index values before
543 * invoking a callback.
544 */
545
546 if (rcu_batch_empty(&sp->batch_check0) &&
547 rcu_batch_empty(&sp->batch_check1))
548 return; /* no callbacks need to be advanced */
549
550 if (!try_check_zero(sp, idx, trycount))
551 return; /* failed to advance, will try after SRCU_INTERVAL */
552
553 /*
 554 * The callbacks in ->batch_check1 already did their first zero check
 555 * and flip, back when they were enqueued on
556 * ->batch_check0 in a previous invocation of srcu_advance_batches().
557 * (Presumably try_check_zero() returned false during that
558 * invocation, leaving the callbacks stranded on ->batch_check1.)
559 * They are therefore ready to invoke, so move them to ->batch_done.
560 */
561 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
562
563 if (rcu_batch_empty(&sp->batch_check0))
564 return; /* no callbacks need to be advanced */
565 srcu_flip(sp);
566
567 /*
568 * The callbacks in ->batch_check0 just finished their
569 * first check zero and flip, so move them to ->batch_check1
570 * for future checking on the other idx.
571 */
572 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
573
574 /*
575 * SRCU read-side critical sections are normally short, so check
576 * at least twice in quick succession after a flip.
577 */
578 trycount = trycount < 2 ? 2 : trycount;
579 if (!try_check_zero(sp, idx^1, trycount))
580 return; /* failed to advance, will try after SRCU_INTERVAL */
581
582 /*
583 * The callbacks in ->batch_check1 have now waited for all
584 * pre-existing readers using both idx values. They are therefore
585 * ready to invoke, so move them to ->batch_done.
586 */
587 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
588}
589
590/*
591 * Invoke a limited number of SRCU callbacks that have passed through
592 * their grace period. If there are more to do, SRCU will reschedule
593 * the workqueue.
594 */
595static void srcu_invoke_callbacks(struct srcu_struct *sp)
596{
597 int i;
598 struct rcu_head *head;
599
600 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
601 head = rcu_batch_dequeue(&sp->batch_done);
602 if (!head)
603 break;
604 local_bh_disable();
605 head->func(head);
606 local_bh_enable();
607 }
608}
609
610/*
611 * Finished one round of SRCU grace period. Start another if there are
612 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
613 */
614static void srcu_reschedule(struct srcu_struct *sp)
615{
616 bool pending = true;
617
618 if (rcu_batch_empty(&sp->batch_done) &&
619 rcu_batch_empty(&sp->batch_check1) &&
620 rcu_batch_empty(&sp->batch_check0) &&
621 rcu_batch_empty(&sp->batch_queue)) {
622 spin_lock_irq(&sp->queue_lock);
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 sp->running = false;
628 pending = false;
629 }
630 spin_unlock_irq(&sp->queue_lock);
631 }
632
633 if (pending)
634 queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL);
635}
636
637/*
638 * This is the work-queue function that handles SRCU grace periods.
639 */
640static void process_srcu(struct work_struct *work)
641{
642 struct srcu_struct *sp;
643
644 sp = container_of(work, struct srcu_struct, work.work);
645
646 srcu_collect_new(sp);
647 srcu_advance_batches(sp, 1);
648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp);
650}
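
Taken together: call_srcu() feeds batch_queue, srcu_collect_new() promotes it to batch_check0, srcu_advance_batches() walks entries through batch_check1 into batch_done as readers of both index ranks drain, and srcu_invoke_callbacks() runs them; synchronize_srcu() is now just a callback plus a completion riding the same pipeline. One practical consequence for users of the new call_srcu(), shown as a hedged sketch (my_srcu and my_subsys_exit are illustrative, as in the earlier example):

/* Sketch: shutting down a subsystem that used call_srcu(). */
static void my_subsys_exit(void)
{
        /* ... stop queueing new call_srcu() callbacks first ... */
        srcu_barrier(&my_srcu);         /* wait for callbacks already in the pipeline */
        cleanup_srcu_struct(&my_srcu);  /* only now is the srcu_struct safe to tear down */
}
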
diff --git a/kernel/sys.c b/kernel/sys.c
index e7006eb6c1e4..9ff89cb9657a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -36,6 +36,8 @@
36#include <linux/personality.h> 36#include <linux/personality.h>
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/file.h>
40#include <linux/mount.h>
39#include <linux/gfp.h> 41#include <linux/gfp.h>
40#include <linux/syscore_ops.h> 42#include <linux/syscore_ops.h>
41#include <linux/version.h> 43#include <linux/version.h>
@@ -93,10 +95,8 @@
93int overflowuid = DEFAULT_OVERFLOWUID; 95int overflowuid = DEFAULT_OVERFLOWUID;
94int overflowgid = DEFAULT_OVERFLOWGID; 96int overflowgid = DEFAULT_OVERFLOWGID;
95 97
96#ifdef CONFIG_UID16
97EXPORT_SYMBOL(overflowuid); 98EXPORT_SYMBOL(overflowuid);
98EXPORT_SYMBOL(overflowgid); 99EXPORT_SYMBOL(overflowgid);
99#endif
100 100
101/* 101/*
102 * the same as above, but for filesystems which can only store a 16-bit 102 * the same as above, but for filesystems which can only store a 16-bit
@@ -133,11 +133,10 @@ static bool set_one_prio_perm(struct task_struct *p)
133{ 133{
134 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 134 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
135 135
136 if (pcred->user->user_ns == cred->user->user_ns && 136 if (uid_eq(pcred->uid, cred->euid) ||
137 (pcred->uid == cred->euid || 137 uid_eq(pcred->euid, cred->euid))
138 pcred->euid == cred->euid))
139 return true; 138 return true;
140 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) 139 if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
141 return true; 140 return true;
142 return false; 141 return false;
143} 142}
@@ -177,6 +176,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
177 const struct cred *cred = current_cred(); 176 const struct cred *cred = current_cred();
178 int error = -EINVAL; 177 int error = -EINVAL;
179 struct pid *pgrp; 178 struct pid *pgrp;
179 kuid_t uid;
180 180
181 if (which > PRIO_USER || which < PRIO_PROCESS) 181 if (which > PRIO_USER || which < PRIO_PROCESS)
182 goto out; 182 goto out;
@@ -209,18 +209,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
209 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 209 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
210 break; 210 break;
211 case PRIO_USER: 211 case PRIO_USER:
212 user = (struct user_struct *) cred->user; 212 uid = make_kuid(cred->user_ns, who);
213 user = cred->user;
213 if (!who) 214 if (!who)
214 who = cred->uid; 215 uid = cred->uid;
215 else if ((who != cred->uid) && 216 else if (!uid_eq(uid, cred->uid) &&
216 !(user = find_user(who))) 217 !(user = find_user(uid)))
217 goto out_unlock; /* No processes for this user */ 218 goto out_unlock; /* No processes for this user */
218 219
219 do_each_thread(g, p) { 220 do_each_thread(g, p) {
220 if (__task_cred(p)->uid == who) 221 if (uid_eq(task_uid(p), uid))
221 error = set_one_prio(p, niceval, error); 222 error = set_one_prio(p, niceval, error);
222 } while_each_thread(g, p); 223 } while_each_thread(g, p);
223 if (who != cred->uid) 224 if (!uid_eq(uid, cred->uid))
224 free_uid(user); /* For find_user() */ 225 free_uid(user); /* For find_user() */
225 break; 226 break;
226 } 227 }
@@ -244,6 +245,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
244 const struct cred *cred = current_cred(); 245 const struct cred *cred = current_cred();
245 long niceval, retval = -ESRCH; 246 long niceval, retval = -ESRCH;
246 struct pid *pgrp; 247 struct pid *pgrp;
248 kuid_t uid;
247 249
248 if (which > PRIO_USER || which < PRIO_PROCESS) 250 if (which > PRIO_USER || which < PRIO_PROCESS)
249 return -EINVAL; 251 return -EINVAL;
@@ -274,21 +276,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
274 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 276 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
275 break; 277 break;
276 case PRIO_USER: 278 case PRIO_USER:
277 user = (struct user_struct *) cred->user; 279 uid = make_kuid(cred->user_ns, who);
280 user = cred->user;
278 if (!who) 281 if (!who)
279 who = cred->uid; 282 uid = cred->uid;
280 else if ((who != cred->uid) && 283 else if (!uid_eq(uid, cred->uid) &&
281 !(user = find_user(who))) 284 !(user = find_user(uid)))
282 goto out_unlock; /* No processes for this user */ 285 goto out_unlock; /* No processes for this user */
283 286
284 do_each_thread(g, p) { 287 do_each_thread(g, p) {
285 if (__task_cred(p)->uid == who) { 288 if (uid_eq(task_uid(p), uid)) {
286 niceval = 20 - task_nice(p); 289 niceval = 20 - task_nice(p);
287 if (niceval > retval) 290 if (niceval > retval)
288 retval = niceval; 291 retval = niceval;
289 } 292 }
290 } while_each_thread(g, p); 293 } while_each_thread(g, p);
291 if (who != cred->uid) 294 if (!uid_eq(uid, cred->uid))
292 free_uid(user); /* for find_user() */ 295 free_uid(user); /* for find_user() */
293 break; 296 break;
294 } 297 }
@@ -553,9 +556,19 @@ void ctrl_alt_del(void)
553 */ 556 */
554SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 557SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
555{ 558{
559 struct user_namespace *ns = current_user_ns();
556 const struct cred *old; 560 const struct cred *old;
557 struct cred *new; 561 struct cred *new;
558 int retval; 562 int retval;
563 kgid_t krgid, kegid;
564
565 krgid = make_kgid(ns, rgid);
566 kegid = make_kgid(ns, egid);
567
568 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
569 return -EINVAL;
570 if ((egid != (gid_t) -1) && !gid_valid(kegid))
571 return -EINVAL;
559 572
560 new = prepare_creds(); 573 new = prepare_creds();
561 if (!new) 574 if (!new)
@@ -564,25 +577,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
564 577
565 retval = -EPERM; 578 retval = -EPERM;
566 if (rgid != (gid_t) -1) { 579 if (rgid != (gid_t) -1) {
567 if (old->gid == rgid || 580 if (gid_eq(old->gid, krgid) ||
568 old->egid == rgid || 581 gid_eq(old->egid, krgid) ||
569 nsown_capable(CAP_SETGID)) 582 nsown_capable(CAP_SETGID))
570 new->gid = rgid; 583 new->gid = krgid;
571 else 584 else
572 goto error; 585 goto error;
573 } 586 }
574 if (egid != (gid_t) -1) { 587 if (egid != (gid_t) -1) {
575 if (old->gid == egid || 588 if (gid_eq(old->gid, kegid) ||
576 old->egid == egid || 589 gid_eq(old->egid, kegid) ||
577 old->sgid == egid || 590 gid_eq(old->sgid, kegid) ||
578 nsown_capable(CAP_SETGID)) 591 nsown_capable(CAP_SETGID))
579 new->egid = egid; 592 new->egid = kegid;
580 else 593 else
581 goto error; 594 goto error;
582 } 595 }
583 596
584 if (rgid != (gid_t) -1 || 597 if (rgid != (gid_t) -1 ||
585 (egid != (gid_t) -1 && egid != old->gid)) 598 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
586 new->sgid = new->egid; 599 new->sgid = new->egid;
587 new->fsgid = new->egid; 600 new->fsgid = new->egid;
588 601
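
Every setter converted in this file follows the same shape: translate the userspace id into a kuid_t/kgid_t against the caller's user namespace, reject values with no mapping, and from then on compare only with uid_eq()/gid_eq(). The recurring step could be read as the following helper (a sketch with a made-up name, not something this patch adds):

/* Hypothetical helper illustrating the common validation step. */
static int gid_arg_to_kgid(gid_t gid, kgid_t *out)
{
        kgid_t kgid = make_kgid(current_user_ns(), gid);

        if (gid != (gid_t) -1 && !gid_valid(kgid))
                return -EINVAL;         /* no mapping in the caller's user namespace */

        *out = kgid;                    /* callers compare it only via gid_eq() */
        return 0;
}
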
@@ -600,9 +613,15 @@ error:
600 */ 613 */
601SYSCALL_DEFINE1(setgid, gid_t, gid) 614SYSCALL_DEFINE1(setgid, gid_t, gid)
602{ 615{
616 struct user_namespace *ns = current_user_ns();
603 const struct cred *old; 617 const struct cred *old;
604 struct cred *new; 618 struct cred *new;
605 int retval; 619 int retval;
620 kgid_t kgid;
621
622 kgid = make_kgid(ns, gid);
623 if (!gid_valid(kgid))
624 return -EINVAL;
606 625
607 new = prepare_creds(); 626 new = prepare_creds();
608 if (!new) 627 if (!new)
@@ -611,9 +630,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
611 630
612 retval = -EPERM; 631 retval = -EPERM;
613 if (nsown_capable(CAP_SETGID)) 632 if (nsown_capable(CAP_SETGID))
614 new->gid = new->egid = new->sgid = new->fsgid = gid; 633 new->gid = new->egid = new->sgid = new->fsgid = kgid;
615 else if (gid == old->gid || gid == old->sgid) 634 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
616 new->egid = new->fsgid = gid; 635 new->egid = new->fsgid = kgid;
617 else 636 else
618 goto error; 637 goto error;
619 638
@@ -631,7 +650,7 @@ static int set_user(struct cred *new)
631{ 650{
632 struct user_struct *new_user; 651 struct user_struct *new_user;
633 652
634 new_user = alloc_uid(current_user_ns(), new->uid); 653 new_user = alloc_uid(new->uid);
635 if (!new_user) 654 if (!new_user)
636 return -EAGAIN; 655 return -EAGAIN;
637 656
@@ -670,9 +689,19 @@ static int set_user(struct cred *new)
670 */ 689 */
671SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 690SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
672{ 691{
692 struct user_namespace *ns = current_user_ns();
673 const struct cred *old; 693 const struct cred *old;
674 struct cred *new; 694 struct cred *new;
675 int retval; 695 int retval;
696 kuid_t kruid, keuid;
697
698 kruid = make_kuid(ns, ruid);
699 keuid = make_kuid(ns, euid);
700
701 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
702 return -EINVAL;
703 if ((euid != (uid_t) -1) && !uid_valid(keuid))
704 return -EINVAL;
676 705
677 new = prepare_creds(); 706 new = prepare_creds();
678 if (!new) 707 if (!new)
@@ -681,29 +710,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
681 710
682 retval = -EPERM; 711 retval = -EPERM;
683 if (ruid != (uid_t) -1) { 712 if (ruid != (uid_t) -1) {
684 new->uid = ruid; 713 new->uid = kruid;
685 if (old->uid != ruid && 714 if (!uid_eq(old->uid, kruid) &&
686 old->euid != ruid && 715 !uid_eq(old->euid, kruid) &&
687 !nsown_capable(CAP_SETUID)) 716 !nsown_capable(CAP_SETUID))
688 goto error; 717 goto error;
689 } 718 }
690 719
691 if (euid != (uid_t) -1) { 720 if (euid != (uid_t) -1) {
692 new->euid = euid; 721 new->euid = keuid;
693 if (old->uid != euid && 722 if (!uid_eq(old->uid, keuid) &&
694 old->euid != euid && 723 !uid_eq(old->euid, keuid) &&
695 old->suid != euid && 724 !uid_eq(old->suid, keuid) &&
696 !nsown_capable(CAP_SETUID)) 725 !nsown_capable(CAP_SETUID))
697 goto error; 726 goto error;
698 } 727 }
699 728
700 if (new->uid != old->uid) { 729 if (!uid_eq(new->uid, old->uid)) {
701 retval = set_user(new); 730 retval = set_user(new);
702 if (retval < 0) 731 if (retval < 0)
703 goto error; 732 goto error;
704 } 733 }
705 if (ruid != (uid_t) -1 || 734 if (ruid != (uid_t) -1 ||
706 (euid != (uid_t) -1 && euid != old->uid)) 735 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
707 new->suid = new->euid; 736 new->suid = new->euid;
708 new->fsuid = new->euid; 737 new->fsuid = new->euid;
709 738
@@ -731,9 +760,15 @@ error:
731 */ 760 */
732SYSCALL_DEFINE1(setuid, uid_t, uid) 761SYSCALL_DEFINE1(setuid, uid_t, uid)
733{ 762{
763 struct user_namespace *ns = current_user_ns();
734 const struct cred *old; 764 const struct cred *old;
735 struct cred *new; 765 struct cred *new;
736 int retval; 766 int retval;
767 kuid_t kuid;
768
769 kuid = make_kuid(ns, uid);
770 if (!uid_valid(kuid))
771 return -EINVAL;
737 772
738 new = prepare_creds(); 773 new = prepare_creds();
739 if (!new) 774 if (!new)
@@ -742,17 +777,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
742 777
743 retval = -EPERM; 778 retval = -EPERM;
744 if (nsown_capable(CAP_SETUID)) { 779 if (nsown_capable(CAP_SETUID)) {
745 new->suid = new->uid = uid; 780 new->suid = new->uid = kuid;
746 if (uid != old->uid) { 781 if (!uid_eq(kuid, old->uid)) {
747 retval = set_user(new); 782 retval = set_user(new);
748 if (retval < 0) 783 if (retval < 0)
749 goto error; 784 goto error;
750 } 785 }
751 } else if (uid != old->uid && uid != new->suid) { 786 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
752 goto error; 787 goto error;
753 } 788 }
754 789
755 new->fsuid = new->euid = uid; 790 new->fsuid = new->euid = kuid;
756 791
757 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 792 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
758 if (retval < 0) 793 if (retval < 0)
@@ -772,9 +807,24 @@ error:
772 */ 807 */
773SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 808SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
774{ 809{
810 struct user_namespace *ns = current_user_ns();
775 const struct cred *old; 811 const struct cred *old;
776 struct cred *new; 812 struct cred *new;
777 int retval; 813 int retval;
814 kuid_t kruid, keuid, ksuid;
815
816 kruid = make_kuid(ns, ruid);
817 keuid = make_kuid(ns, euid);
818 ksuid = make_kuid(ns, suid);
819
820 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
821 return -EINVAL;
822
823 if ((euid != (uid_t) -1) && !uid_valid(keuid))
824 return -EINVAL;
825
826 if ((suid != (uid_t) -1) && !uid_valid(ksuid))
827 return -EINVAL;
778 828
779 new = prepare_creds(); 829 new = prepare_creds();
780 if (!new) 830 if (!new)
@@ -784,29 +834,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
784 834
785 retval = -EPERM; 835 retval = -EPERM;
786 if (!nsown_capable(CAP_SETUID)) { 836 if (!nsown_capable(CAP_SETUID)) {
787 if (ruid != (uid_t) -1 && ruid != old->uid && 837 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
788 ruid != old->euid && ruid != old->suid) 838 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
789 goto error; 839 goto error;
790 if (euid != (uid_t) -1 && euid != old->uid && 840 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
791 euid != old->euid && euid != old->suid) 841 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
792 goto error; 842 goto error;
793 if (suid != (uid_t) -1 && suid != old->uid && 843 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
794 suid != old->euid && suid != old->suid) 844 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
795 goto error; 845 goto error;
796 } 846 }
797 847
798 if (ruid != (uid_t) -1) { 848 if (ruid != (uid_t) -1) {
799 new->uid = ruid; 849 new->uid = kruid;
800 if (ruid != old->uid) { 850 if (!uid_eq(kruid, old->uid)) {
801 retval = set_user(new); 851 retval = set_user(new);
802 if (retval < 0) 852 if (retval < 0)
803 goto error; 853 goto error;
804 } 854 }
805 } 855 }
806 if (euid != (uid_t) -1) 856 if (euid != (uid_t) -1)
807 new->euid = euid; 857 new->euid = keuid;
808 if (suid != (uid_t) -1) 858 if (suid != (uid_t) -1)
809 new->suid = suid; 859 new->suid = ksuid;
810 new->fsuid = new->euid; 860 new->fsuid = new->euid;
811 861
812 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 862 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@ -820,14 +870,19 @@ error:
820 return retval; 870 return retval;
821} 871}
822 872
823SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) 873SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
824{ 874{
825 const struct cred *cred = current_cred(); 875 const struct cred *cred = current_cred();
826 int retval; 876 int retval;
877 uid_t ruid, euid, suid;
827 878
828 if (!(retval = put_user(cred->uid, ruid)) && 879 ruid = from_kuid_munged(cred->user_ns, cred->uid);
829 !(retval = put_user(cred->euid, euid))) 880 euid = from_kuid_munged(cred->user_ns, cred->euid);
830 retval = put_user(cred->suid, suid); 881 suid = from_kuid_munged(cred->user_ns, cred->suid);
882
883 if (!(retval = put_user(ruid, ruidp)) &&
884 !(retval = put_user(euid, euidp)))
885 retval = put_user(suid, suidp);
831 886
832 return retval; 887 return retval;
833} 888}
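
From userspace nothing changes here except that the reported ids are now produced by from_kuid_munged() for the caller's namespace. A small demo of the syscalls touched in these two hunks (Linux-specific, needs _GNU_SOURCE; not part of this diff):

#define _GNU_SOURCE             /* getresuid()/setresuid() are Linux extensions */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uid_t ruid, euid, suid;

        if (getresuid(&ruid, &euid, &suid) != 0) {
                perror("getresuid");
                return 1;
        }
        printf("ruid=%d euid=%d suid=%d\n", (int)ruid, (int)euid, (int)suid);

        /* A set-user-id binary would temporarily drop privilege like this,
         * keeping the saved uid so it can switch back later. */
        if (setresuid(-1, getuid(), -1) != 0)
                perror("setresuid");
        return 0;
}
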
@@ -837,9 +892,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u
837 */ 892 */
838SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 893SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
839{ 894{
895 struct user_namespace *ns = current_user_ns();
840 const struct cred *old; 896 const struct cred *old;
841 struct cred *new; 897 struct cred *new;
842 int retval; 898 int retval;
899 kgid_t krgid, kegid, ksgid;
900
901 krgid = make_kgid(ns, rgid);
902 kegid = make_kgid(ns, egid);
903 ksgid = make_kgid(ns, sgid);
904
905 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
906 return -EINVAL;
907 if ((egid != (gid_t) -1) && !gid_valid(kegid))
908 return -EINVAL;
909 if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
910 return -EINVAL;
843 911
844 new = prepare_creds(); 912 new = prepare_creds();
845 if (!new) 913 if (!new)
@@ -848,23 +916,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
848 916
849 retval = -EPERM; 917 retval = -EPERM;
850 if (!nsown_capable(CAP_SETGID)) { 918 if (!nsown_capable(CAP_SETGID)) {
851 if (rgid != (gid_t) -1 && rgid != old->gid && 919 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
852 rgid != old->egid && rgid != old->sgid) 920 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
853 goto error; 921 goto error;
854 if (egid != (gid_t) -1 && egid != old->gid && 922 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
855 egid != old->egid && egid != old->sgid) 923 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
856 goto error; 924 goto error;
857 if (sgid != (gid_t) -1 && sgid != old->gid && 925 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
858 sgid != old->egid && sgid != old->sgid) 926 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
859 goto error; 927 goto error;
860 } 928 }
861 929
862 if (rgid != (gid_t) -1) 930 if (rgid != (gid_t) -1)
863 new->gid = rgid; 931 new->gid = krgid;
864 if (egid != (gid_t) -1) 932 if (egid != (gid_t) -1)
865 new->egid = egid; 933 new->egid = kegid;
866 if (sgid != (gid_t) -1) 934 if (sgid != (gid_t) -1)
867 new->sgid = sgid; 935 new->sgid = ksgid;
868 new->fsgid = new->egid; 936 new->fsgid = new->egid;
869 937
870 return commit_creds(new); 938 return commit_creds(new);
@@ -874,14 +942,19 @@ error:
874 return retval; 942 return retval;
875} 943}
876 944
877SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) 945SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
878{ 946{
879 const struct cred *cred = current_cred(); 947 const struct cred *cred = current_cred();
880 int retval; 948 int retval;
949 gid_t rgid, egid, sgid;
881 950
882 if (!(retval = put_user(cred->gid, rgid)) && 951 rgid = from_kgid_munged(cred->user_ns, cred->gid);
883 !(retval = put_user(cred->egid, egid))) 952 egid = from_kgid_munged(cred->user_ns, cred->egid);
884 retval = put_user(cred->sgid, sgid); 953 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
954
955 if (!(retval = put_user(rgid, rgidp)) &&
956 !(retval = put_user(egid, egidp)))
957 retval = put_user(sgid, sgidp);
885 958
886 return retval; 959 return retval;
887} 960}
@@ -898,18 +971,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
898 const struct cred *old; 971 const struct cred *old;
899 struct cred *new; 972 struct cred *new;
900 uid_t old_fsuid; 973 uid_t old_fsuid;
974 kuid_t kuid;
975
976 old = current_cred();
977 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
978
979 kuid = make_kuid(old->user_ns, uid);
980 if (!uid_valid(kuid))
981 return old_fsuid;
901 982
902 new = prepare_creds(); 983 new = prepare_creds();
903 if (!new) 984 if (!new)
904 return current_fsuid(); 985 return old_fsuid;
905 old = current_cred();
906 old_fsuid = old->fsuid;
907 986
908 if (uid == old->uid || uid == old->euid || 987 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
909 uid == old->suid || uid == old->fsuid || 988 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
910 nsown_capable(CAP_SETUID)) { 989 nsown_capable(CAP_SETUID)) {
911 if (uid != old_fsuid) { 990 if (!uid_eq(kuid, old->fsuid)) {
912 new->fsuid = uid; 991 new->fsuid = kuid;
913 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 992 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
914 goto change_okay; 993 goto change_okay;
915 } 994 }
@@ -931,18 +1010,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
931 const struct cred *old; 1010 const struct cred *old;
932 struct cred *new; 1011 struct cred *new;
933 gid_t old_fsgid; 1012 gid_t old_fsgid;
1013 kgid_t kgid;
1014
1015 old = current_cred();
1016 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
1017
1018 kgid = make_kgid(old->user_ns, gid);
1019 if (!gid_valid(kgid))
1020 return old_fsgid;
934 1021
935 new = prepare_creds(); 1022 new = prepare_creds();
936 if (!new) 1023 if (!new)
937 return current_fsgid(); 1024 return old_fsgid;
938 old = current_cred();
939 old_fsgid = old->fsgid;
940 1025
941 if (gid == old->gid || gid == old->egid || 1026 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
942 gid == old->sgid || gid == old->fsgid || 1027 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
943 nsown_capable(CAP_SETGID)) { 1028 nsown_capable(CAP_SETGID)) {
944 if (gid != old_fsgid) { 1029 if (!gid_eq(kgid, old->fsgid)) {
945 new->fsgid = gid; 1030 new->fsgid = kgid;
946 goto change_okay; 1031 goto change_okay;
947 } 1032 }
948 } 1033 }
@@ -1295,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1295 memcpy(u->nodename, tmp, len); 1380 memcpy(u->nodename, tmp, len);
1296 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1381 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1297 errno = 0; 1382 errno = 0;
1383 uts_proc_notify(UTS_PROC_HOSTNAME);
1298 } 1384 }
1299 uts_proc_notify(UTS_PROC_HOSTNAME);
1300 up_write(&uts_sem); 1385 up_write(&uts_sem);
1301 return errno; 1386 return errno;
1302} 1387}
@@ -1346,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1346 memcpy(u->domainname, tmp, len); 1431 memcpy(u->domainname, tmp, len);
1347 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1432 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1348 errno = 0; 1433 errno = 0;
1434 uts_proc_notify(UTS_PROC_DOMAINNAME);
1349 } 1435 }
1350 uts_proc_notify(UTS_PROC_DOMAINNAME);
1351 up_write(&uts_sem); 1436 up_write(&uts_sem);
1352 return errno; 1437 return errno;
1353} 1438}
@@ -1498,15 +1583,14 @@ static int check_prlimit_permission(struct task_struct *task)
1498 return 0; 1583 return 0;
1499 1584
1500 tcred = __task_cred(task); 1585 tcred = __task_cred(task);
1501 if (cred->user->user_ns == tcred->user->user_ns && 1586 if (uid_eq(cred->uid, tcred->euid) &&
1502 (cred->uid == tcred->euid && 1587 uid_eq(cred->uid, tcred->suid) &&
1503 cred->uid == tcred->suid && 1588 uid_eq(cred->uid, tcred->uid) &&
1504 cred->uid == tcred->uid && 1589 gid_eq(cred->gid, tcred->egid) &&
1505 cred->gid == tcred->egid && 1590 gid_eq(cred->gid, tcred->sgid) &&
1506 cred->gid == tcred->sgid && 1591 gid_eq(cred->gid, tcred->gid))
1507 cred->gid == tcred->gid))
1508 return 0; 1592 return 0;
1509 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) 1593 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
1510 return 0; 1594 return 0;
1511 1595
1512 return -EPERM; 1596 return -EPERM;
@@ -1702,77 +1786,102 @@ SYSCALL_DEFINE1(umask, int, mask)
1702} 1786}
1703 1787
1704#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static bool vma_flags_mismatch(struct vm_area_struct *vma,
1790 unsigned long required,
1791 unsigned long banned)
1792{
1793 return (vma->vm_flags & required) != required ||
1794 (vma->vm_flags & banned);
1795}
1796
1797static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1798{
1799 struct file *exe_file;
1800 struct dentry *dentry;
1801 int err;
1802
1803 /*
 1804 * Setting a new mm::exe_file is only allowed when no VM_EXECUTABLE VMAs
1805 * remain. So perform a quick test first.
1806 */
1807 if (mm->num_exe_file_vmas)
1808 return -EBUSY;
1809
1810 exe_file = fget(fd);
1811 if (!exe_file)
1812 return -EBADF;
1813
1814 dentry = exe_file->f_path.dentry;
1815
1816 /*
 1817 * Because the original mm->exe_file points to an executable file, make
1818 * sure that this one is executable as well, to avoid breaking an
1819 * overall picture.
1820 */
1821 err = -EACCES;
1822 if (!S_ISREG(dentry->d_inode->i_mode) ||
1823 exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1824 goto exit;
1825
1826 err = inode_permission(dentry->d_inode, MAY_EXEC);
1827 if (err)
1828 goto exit;
1829
1830 /*
1831 * The symlink can be changed only once, just to disallow arbitrary
1832 * transitions malicious software might bring in. This means one
 1833 * could take a snapshot of all running processes and monitor
1834 * /proc/pid/exe changes to notice unusual activity if needed.
1835 */
1836 down_write(&mm->mmap_sem);
1837 if (likely(!mm->exe_file))
1838 set_mm_exe_file(mm, exe_file);
1839 else
1840 err = -EBUSY;
1841 up_write(&mm->mmap_sem);
1842
1843exit:
1844 fput(exe_file);
1845 return err;
1846}
1847
1705static int prctl_set_mm(int opt, unsigned long addr, 1848static int prctl_set_mm(int opt, unsigned long addr,
1706 unsigned long arg4, unsigned long arg5) 1849 unsigned long arg4, unsigned long arg5)
1707{ 1850{
1708 unsigned long rlim = rlimit(RLIMIT_DATA); 1851 unsigned long rlim = rlimit(RLIMIT_DATA);
1709 unsigned long vm_req_flags;
1710 unsigned long vm_bad_flags;
1711 struct vm_area_struct *vma;
1712 int error = 0;
1713 struct mm_struct *mm = current->mm; 1852 struct mm_struct *mm = current->mm;
1853 struct vm_area_struct *vma;
1854 int error;
1714 1855
1715 if (arg4 | arg5) 1856 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
1716 return -EINVAL; 1857 return -EINVAL;
1717 1858
1718 if (!capable(CAP_SYS_RESOURCE)) 1859 if (!capable(CAP_SYS_RESOURCE))
1719 return -EPERM; 1860 return -EPERM;
1720 1861
1862 if (opt == PR_SET_MM_EXE_FILE)
1863 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1864
1721 if (addr >= TASK_SIZE) 1865 if (addr >= TASK_SIZE)
1722 return -EINVAL; 1866 return -EINVAL;
1723 1867
1868 error = -EINVAL;
1869
1724 down_read(&mm->mmap_sem); 1870 down_read(&mm->mmap_sem);
1725 vma = find_vma(mm, addr); 1871 vma = find_vma(mm, addr);
1726 1872
1727 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1728 /* It must be existing VMA */
1729 if (!vma || vma->vm_start > addr)
1730 goto out;
1731 }
1732
1733 error = -EINVAL;
1734 switch (opt) { 1873 switch (opt) {
1735 case PR_SET_MM_START_CODE: 1874 case PR_SET_MM_START_CODE:
1875 mm->start_code = addr;
1876 break;
1736 case PR_SET_MM_END_CODE: 1877 case PR_SET_MM_END_CODE:
1737 vm_req_flags = VM_READ | VM_EXEC; 1878 mm->end_code = addr;
1738 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1739
1740 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1741 (vma->vm_flags & vm_bad_flags))
1742 goto out;
1743
1744 if (opt == PR_SET_MM_START_CODE)
1745 mm->start_code = addr;
1746 else
1747 mm->end_code = addr;
1748 break; 1879 break;
1749
1750 case PR_SET_MM_START_DATA: 1880 case PR_SET_MM_START_DATA:
1751 case PR_SET_MM_END_DATA: 1881 mm->start_data = addr;
1752 vm_req_flags = VM_READ | VM_WRITE;
1753 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1754
1755 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1756 (vma->vm_flags & vm_bad_flags))
1757 goto out;
1758
1759 if (opt == PR_SET_MM_START_DATA)
1760 mm->start_data = addr;
1761 else
1762 mm->end_data = addr;
1763 break; 1882 break;
1764 1883 case PR_SET_MM_END_DATA:
1765 case PR_SET_MM_START_STACK: 1884 mm->end_data = addr;
1766
1767#ifdef CONFIG_STACK_GROWSUP
1768 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1769#else
1770 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1771#endif
1772 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1773 goto out;
1774
1775 mm->start_stack = addr;
1776 break; 1885 break;
1777 1886
1778 case PR_SET_MM_START_BRK: 1887 case PR_SET_MM_START_BRK:
@@ -1799,16 +1908,77 @@ static int prctl_set_mm(int opt, unsigned long addr,
1799 mm->brk = addr; 1908 mm->brk = addr;
1800 break; 1909 break;
1801 1910
1911 /*
1912 * If command line arguments and environment
 1913 * are placed somewhere else on the stack, we can
 1914 * set them up here: ARG_START/END for the
 1915 * command line arguments and ENV_START/END for
 1916 * the environment.
1917 */
1918 case PR_SET_MM_START_STACK:
1919 case PR_SET_MM_ARG_START:
1920 case PR_SET_MM_ARG_END:
1921 case PR_SET_MM_ENV_START:
1922 case PR_SET_MM_ENV_END:
1923 if (!vma) {
1924 error = -EFAULT;
1925 goto out;
1926 }
1927#ifdef CONFIG_STACK_GROWSUP
1928 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
1929#else
1930 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
1931#endif
1932 goto out;
1933 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START)
1936 mm->arg_start = addr;
1937 else if (opt == PR_SET_MM_ARG_END)
1938 mm->arg_end = addr;
1939 else if (opt == PR_SET_MM_ENV_START)
1940 mm->env_start = addr;
1941 else if (opt == PR_SET_MM_ENV_END)
1942 mm->env_end = addr;
1943 break;
1944
1945 /*
 1946 * This doesn't move the auxiliary vector itself
 1947 * since it's pinned to mm_struct, but it allows
 1948 * the vector to be filled with new values. It's up
 1949 * to the caller to provide sane values here,
 1950 * otherwise user space tools which use this
1951 * vector might be unhappy.
1952 */
1953 case PR_SET_MM_AUXV: {
1954 unsigned long user_auxv[AT_VECTOR_SIZE];
1955
1956 if (arg4 > sizeof(user_auxv))
1957 goto out;
1958 up_read(&mm->mmap_sem);
1959
1960 if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
1961 return -EFAULT;
1962
1963 /* Make sure the last entry is always AT_NULL */
1964 user_auxv[AT_VECTOR_SIZE - 2] = 0;
1965 user_auxv[AT_VECTOR_SIZE - 1] = 0;
1966
1967 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1968
1969 task_lock(current);
1970 memcpy(mm->saved_auxv, user_auxv, arg4);
1971 task_unlock(current);
1972
1973 return 0;
1974 }
1802 default: 1975 default:
1803 error = -EINVAL;
1804 goto out; 1976 goto out;
1805 } 1977 }
1806 1978
1807 error = 0; 1979 error = 0;
1808
1809out: 1980out:
1810 up_read(&mm->mmap_sem); 1981 up_read(&mm->mmap_sem);
1811
1812 return error; 1982 return error;
1813} 1983}
1814#else /* CONFIG_CHECKPOINT_RESTORE */ 1984#else /* CONFIG_CHECKPOINT_RESTORE */
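
From user space the new prctl_set_mm() paths above are reached through prctl(PR_SET_MM, ...). A hedged example of how a checkpoint/restore tool might update the recorded stack start; the PR_SET_MM* constants are assumed to match <linux/prctl.h> for this kernel and are guarded in case the libc headers predate them.

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_MM
    #define PR_SET_MM               35
    #define PR_SET_MM_START_STACK   5
    #endif

    /* arg4 and arg5 must be zero for everything except PR_SET_MM_AUXV,
     * and the caller needs CAP_SYS_RESOURCE. */
    static int set_stack_start(unsigned long addr)
    {
            if (prctl(PR_SET_MM, PR_SET_MM_START_STACK, addr, 0, 0)) {
                    perror("prctl(PR_SET_MM)");
                    return -1;
            }
            return 0;
    }
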
@@ -1908,7 +2078,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1908 error = prctl_get_seccomp(); 2078 error = prctl_get_seccomp();
1909 break; 2079 break;
1910 case PR_SET_SECCOMP: 2080 case PR_SET_SECCOMP:
1911 error = prctl_set_seccomp(arg2); 2081 error = prctl_set_seccomp(arg2, (char __user *)arg3);
1912 break; 2082 break;
1913 case PR_GET_TSC: 2083 case PR_GET_TSC:
1914 error = GET_TSC_CTL(arg2); 2084 error = GET_TSC_CTL(arg2);
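
The PR_SET_SECCOMP change above passes arg3 through to prctl_set_seccomp(); per the seccomp changes elsewhere in this series, arg3 is the user pointer to a BPF filter program when SECCOMP_MODE_FILTER is selected. A hedged user-space sketch of installing a trivial allow-everything filter; the constants are guarded in case the installed headers predate them.

    #include <stddef.h>
    #include <sys/prctl.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    #ifndef SECCOMP_MODE_FILTER
    #define SECCOMP_MODE_FILTER 2
    #endif
    #ifndef SECCOMP_RET_ALLOW
    #define SECCOMP_RET_ALLOW 0x7fff0000U
    #endif

    static int install_allow_all_filter(void)
    {
            struct sock_filter insns[] = {
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
            };
            struct sock_fprog prog = {
                    .len = sizeof(insns) / sizeof(insns[0]),
                    .filter = insns,
            };

            /* requires no_new_privs (see the hunk below) or CAP_SYS_ADMIN */
            return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
    }
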
@@ -1979,6 +2149,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1979 error = put_user(me->signal->is_child_subreaper, 2149 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2); 2150 (int __user *) arg2);
1981 break; 2151 break;
2152 case PR_SET_NO_NEW_PRIVS:
2153 if (arg2 != 1 || arg3 || arg4 || arg5)
2154 return -EINVAL;
2155
2156 current->no_new_privs = 1;
2157 break;
2158 case PR_GET_NO_NEW_PRIVS:
2159 if (arg2 || arg3 || arg4 || arg5)
2160 return -EINVAL;
2161 return current->no_new_privs ? 1 : 0;
1982 default: 2162 default:
1983 error = -EINVAL; 2163 error = -EINVAL;
1984 break; 2164 break;
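
PR_SET_NO_NEW_PRIVS is a one-way switch: once set it cannot be cleared, and later execve() calls can no longer grant privilege (set-id bits, file capabilities). A minimal user-space sketch; the constants are assumed to be 38/39 and are guarded in case the headers predate them.

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_NO_NEW_PRIVS
    #define PR_SET_NO_NEW_PRIVS 38
    #define PR_GET_NO_NEW_PRIVS 39
    #endif

    int main(void)
    {
            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                    perror("PR_SET_NO_NEW_PRIVS");
            printf("no_new_privs = %d\n",
                   prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
            return 0;
    }
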
@@ -2022,7 +2202,6 @@ int orderly_poweroff(bool force)
2022 NULL 2202 NULL
2023 }; 2203 };
2024 int ret = -ENOMEM; 2204 int ret = -ENOMEM;
2025 struct subprocess_info *info;
2026 2205
2027 if (argv == NULL) { 2206 if (argv == NULL) {
2028 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2207 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
@@ -2030,18 +2209,16 @@ int orderly_poweroff(bool force)
2030 goto out; 2209 goto out;
2031 } 2210 }
2032 2211
2033 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); 2212 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2034 if (info == NULL) { 2213 NULL, argv_cleanup, NULL);
2035 argv_free(argv); 2214out:
2036 goto out; 2215 if (likely(!ret))
2037 } 2216 return 0;
2038
2039 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
2040 2217
2041 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 2218 if (ret == -ENOMEM)
2219 argv_free(argv);
2042 2220
2043 out: 2221 if (force) {
2044 if (ret && force) {
2045 printk(KERN_WARNING "Failed to start orderly shutdown: " 2222 printk(KERN_WARNING "Failed to start orderly shutdown: "
2046 "forcing the issue\n"); 2223 "forcing the issue\n");
2047 2224
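
The orderly_poweroff() rework above collapses the old setup/setfns/exec sequence into a single call_usermodehelper_fns() call. A minimal sketch of the same calling convention with a static argv; the helper path is hypothetical, and the prototype assumed is the one used in this hunk (init fn, cleanup fn and private data all NULL).

    #include <linux/kmod.h>

    static char *helper_argv[] = { "/sbin/my_helper", "--ping", NULL };
    static char *helper_envp[] = {
            "HOME=/",
            "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
            NULL
    };

    /* Fire the helper and return without waiting for it to finish. */
    static int fire_helper(void)
    {
            return call_usermodehelper_fns(helper_argv[0], helper_argv,
                                           helper_envp, UMH_NO_WAIT,
                                           NULL, NULL, NULL);
    }
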
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 47bfa16430d7..dbff751e4086 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark);
203cond_syscall(sys_name_to_handle_at); 203cond_syscall(sys_name_to_handle_at);
204cond_syscall(sys_open_by_handle_at); 204cond_syscall(sys_open_by_handle_at);
205cond_syscall(compat_sys_open_by_handle_at); 205cond_syscall(compat_sys_open_by_handle_at);
206
207/* compare kernel pointers */
208cond_syscall(sys_kcmp);
diff --git a/kernel/task_work.c b/kernel/task_work.c
new file mode 100644
index 000000000000..82d1c794066d
--- /dev/null
+++ b/kernel/task_work.c
@@ -0,0 +1,84 @@
1#include <linux/spinlock.h>
2#include <linux/task_work.h>
3#include <linux/tracehook.h>
4
5int
6task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
7{
8 unsigned long flags;
9 int err = -ESRCH;
10
11#ifndef TIF_NOTIFY_RESUME
12 if (notify)
13 return -ENOTSUPP;
14#endif
15 /*
16 * We must not insert the new work if the task has already passed
17 * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait()
18 * and check PF_EXITING under pi_lock.
19 */
20 raw_spin_lock_irqsave(&task->pi_lock, flags);
21 if (likely(!(task->flags & PF_EXITING))) {
22 hlist_add_head(&twork->hlist, &task->task_works);
23 err = 0;
24 }
25 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
26
27 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
28 if (likely(!err) && notify)
29 set_notify_resume(task);
30 return err;
31}
32
33struct task_work *
34task_work_cancel(struct task_struct *task, task_work_func_t func)
35{
36 unsigned long flags;
37 struct task_work *twork;
38 struct hlist_node *pos;
39
40 raw_spin_lock_irqsave(&task->pi_lock, flags);
41 hlist_for_each_entry(twork, pos, &task->task_works, hlist) {
42 if (twork->func == func) {
43 hlist_del(&twork->hlist);
44 goto found;
45 }
46 }
47 twork = NULL;
48 found:
49 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
50
51 return twork;
52}
53
54void task_work_run(void)
55{
56 struct task_struct *task = current;
57 struct hlist_head task_works;
58 struct hlist_node *pos;
59
60 raw_spin_lock_irq(&task->pi_lock);
61 hlist_move_list(&task->task_works, &task_works);
62 raw_spin_unlock_irq(&task->pi_lock);
63
64 if (unlikely(hlist_empty(&task_works)))
65 return;
66 /*
 67 * We use an hlist to save space in task_struct, but we want FIFO order.
 68 * Find the last entry (the list should be short), then process the entries
69 * in reverse order.
70 */
71 for (pos = task_works.first; pos->next; pos = pos->next)
72 ;
73
74 for (;;) {
75 struct hlist_node **pprev = pos->pprev;
76 struct task_work *twork = container_of(pos, struct task_work,
77 hlist);
78 twork->func(twork);
79
80 if (pprev == &task_works.first)
81 break;
82 pos = container_of(pprev, struct hlist_node, next);
83 }
84}
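
A hedged sketch of how a caller might use the new task_work API: allocate a work item, queue it on a target task, and let it run the next time that task goes through task_work_run() (with notify=true, shortly after returning to user space). init_task_work() is assumed to be the three-argument initializer provided by <linux/task_work.h> in this series; the callback and caller below are made up.

    #include <linux/sched.h>
    #include <linux/slab.h>
    #include <linux/task_work.h>

    static void my_twork_func(struct task_work *twork)
    {
            /* Runs in the context of the target task, from task_work_run(). */
            kfree(twork);
    }

    static int queue_work_on_task(struct task_struct *task)
    {
            struct task_work *twork = kmalloc(sizeof(*twork), GFP_KERNEL);

            if (!twork)
                    return -ENOMEM;
            init_task_work(twork, my_twork_func, NULL);
            /* notify=true sets TIF_NOTIFY_RESUME so the work runs soon. */
            return task_work_add(task, twork, true);
    }
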
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a20dc8a3c949..fd42bd452b75 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -2,6 +2,55 @@
2# Timer subsystem related configuration options 2# Timer subsystem related configuration options
3# 3#
4 4
5# Options selectable by arch Kconfig
6
7# Watchdog function for clocksources to detect instabilities
8config CLOCKSOURCE_WATCHDOG
9 bool
10
11# Architecture has extra clocksource data
12config ARCH_CLOCKSOURCE_DATA
13 bool
14
15# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL
17 bool
18
19# ktime_t scalar 64bit nsec representation
20config KTIME_SCALAR
21 bool
22
23# Old style timekeeping
24config ARCH_USES_GETTIMEOFFSET
25 bool
26
27# The generic clock events infrastructure
28config GENERIC_CLOCKEVENTS
29 bool
30
31# Migration helper. Builds, but does not invoke
32config GENERIC_CLOCKEVENTS_BUILD
33 bool
34 default y
35 depends on GENERIC_CLOCKEVENTS
36
37# Clockevents broadcasting infrastructure
38config GENERIC_CLOCKEVENTS_BROADCAST
39 bool
40 depends on GENERIC_CLOCKEVENTS
41
42# Automatically adjust the min. reprogramming time for
43# clock event device
44config GENERIC_CLOCKEVENTS_MIN_ADJUST
45 bool
46
47# Generic update of CMOS clock
48config GENERIC_CMOS_UPDATE
49 bool
50
51if GENERIC_CLOCKEVENTS
52menu "Timers subsystem"
53
5# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is 54# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
6# only related to the tick functionality. Oneshot clockevent devices 55# only related to the tick functionality. Oneshot clockevent devices
7# are supported independ of this. 56# are supported independ of this.
@@ -26,10 +75,5 @@ config HIGH_RES_TIMERS
26 hardware is not capable then this option only increases 75 hardware is not capable then this option only increases
27 the size of the kernel image. 76 the size of the kernel image.
28 77
29config GENERIC_CLOCKEVENTS_BUILD 78endmenu
30 bool 79endif
31 default y
32 depends on GENERIC_CLOCKEVENTS
33
34config GENERIC_CLOCKEVENTS_MIN_ADJUST
35 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a538c55fc7b..aa27d391bfc8 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
59 * If one has not already been chosen, it checks to see if a 59 * If one has not already been chosen, it checks to see if a
60 * functional rtc device is available. 60 * functional rtc device is available.
61 */ 61 */
62static struct rtc_device *alarmtimer_get_rtcdev(void) 62struct rtc_device *alarmtimer_get_rtcdev(void)
63{ 63{
64 unsigned long flags; 64 unsigned long flags;
65 struct rtc_device *ret; 65 struct rtc_device *ret;
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void)
115 class_interface_unregister(&alarmtimer_rtc_interface); 115 class_interface_unregister(&alarmtimer_rtc_interface);
116} 116}
117#else 117#else
118static inline struct rtc_device *alarmtimer_get_rtcdev(void) 118struct rtc_device *alarmtimer_get_rtcdev(void)
119{ 119{
120 return NULL; 120 return NULL;
121} 121}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c6..7e1ce012a851 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev)
297} 297}
298EXPORT_SYMBOL_GPL(clockevents_register_device); 298EXPORT_SYMBOL_GPL(clockevents_register_device);
299 299
300static void clockevents_config(struct clock_event_device *dev, 300void clockevents_config(struct clock_event_device *dev, u32 freq)
301 u32 freq)
302{ 301{
303 u64 sec; 302 u64 sec;
304 303
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f03fd83b170b..70b33abcc7bb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -412,6 +412,7 @@ int second_overflow(unsigned long secs)
412 if (secs % 86400 == 0) { 412 if (secs % 86400 == 0) {
413 leap = -1; 413 leap = -1;
414 time_state = TIME_OOP; 414 time_state = TIME_OOP;
415 time_tai++;
415 printk(KERN_NOTICE 416 printk(KERN_NOTICE
416 "Clock: inserting leap second 23:59:60 UTC\n"); 417 "Clock: inserting leap second 23:59:60 UTC\n");
417 } 418 }
@@ -426,7 +427,6 @@ int second_overflow(unsigned long secs)
426 } 427 }
427 break; 428 break;
428 case TIME_OOP: 429 case TIME_OOP:
429 time_tai++;
430 time_state = TIME_WAIT; 430 time_state = TIME_WAIT;
431 break; 431 break;
432 432
@@ -473,8 +473,6 @@ int second_overflow(unsigned long secs)
473 << NTP_SCALE_SHIFT; 473 << NTP_SCALE_SHIFT;
474 time_adjust = 0; 474 time_adjust = 0;
475 475
476
477
478out: 476out:
479 spin_unlock_irqrestore(&ntp_lock, flags); 477 spin_unlock_irqrestore(&ntp_lock, flags);
480 478
@@ -559,10 +557,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
559 /* only set allowed bits */ 557 /* only set allowed bits */
560 time_status &= STA_RONLY; 558 time_status &= STA_RONLY;
561 time_status |= txc->status & ~STA_RONLY; 559 time_status |= txc->status & ~STA_RONLY;
562
563} 560}
561
564/* 562/*
565 * Called with the xtime lock held, so we can access and modify 563 * Called with ntp_lock held, so we can access and modify
566 * all the global NTP state: 564 * all the global NTP state:
567 */ 565 */
568static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) 566static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..da70c6db496c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void)
576 /* Update jiffies first */ 576 /* Update jiffies first */
577 select_nohz_load_balancer(0); 577 select_nohz_load_balancer(0);
578 tick_do_update_jiffies64(now); 578 tick_do_update_jiffies64(now);
579 update_cpu_load_nohz();
579 580
580#ifndef CONFIG_VIRT_CPU_ACCOUNTING 581#ifndef CONFIG_VIRT_CPU_ACCOUNTING
581 /* 582 /*
@@ -814,6 +815,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
814 return HRTIMER_RESTART; 815 return HRTIMER_RESTART;
815} 816}
816 817
818static int sched_skew_tick;
819
820static int __init skew_tick(char *str)
821{
822 get_option(&str, &sched_skew_tick);
823
824 return 0;
825}
826early_param("skew_tick", skew_tick);
827
817/** 828/**
818 * tick_setup_sched_timer - setup the tick emulation timer 829 * tick_setup_sched_timer - setup the tick emulation timer
819 */ 830 */
@@ -831,6 +842,14 @@ void tick_setup_sched_timer(void)
831 /* Get the next period (per cpu) */ 842 /* Get the next period (per cpu) */
832 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 843 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
833 844
845 /* Offset the tick to avert xtime_lock contention. */
846 if (sched_skew_tick) {
847 u64 offset = ktime_to_ns(tick_period) >> 1;
848 do_div(offset, num_possible_cpus());
849 offset *= smp_processor_id();
850 hrtimer_add_expires_ns(&ts->sched_timer, offset);
851 }
852
834 for (;;) { 853 for (;;) {
835 hrtimer_forward(&ts->sched_timer, now, tick_period); 854 hrtimer_forward(&ts->sched_timer, now, tick_period);
836 hrtimer_start_expires(&ts->sched_timer, 855 hrtimer_start_expires(&ts->sched_timer,
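
The skew computed above spreads each CPU's tick across half a tick period. A standalone worked example of the arithmetic (ordinary C, not kernel code): with HZ=1000 the period is 1,000,000 ns, so on a 4-CPU machine the per-CPU tick offsets are 0, 125,000, 250,000 and 375,000 ns.

    #include <stdio.h>

    int main(void)
    {
            unsigned long long tick_period_ns = 1000000ULL;  /* HZ = 1000 */
            unsigned int cpus = 4;                           /* num_possible_cpus() */
            unsigned long long step = (tick_period_ns >> 1) / cpus;

            for (unsigned int cpu = 0; cpu < cpus; cpu++)
                    printf("cpu%u tick offset: %llu ns\n", cpu, step * cpu);
            return 0;
    }
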
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d66b21308f7c..6e46cacf5969 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -240,7 +240,6 @@ void getnstimeofday(struct timespec *ts)
240 240
241 timespec_add_ns(ts, nsecs); 241 timespec_add_ns(ts, nsecs);
242} 242}
243
244EXPORT_SYMBOL(getnstimeofday); 243EXPORT_SYMBOL(getnstimeofday);
245 244
246ktime_t ktime_get(void) 245ktime_t ktime_get(void)
@@ -357,8 +356,8 @@ void do_gettimeofday(struct timeval *tv)
357 tv->tv_sec = now.tv_sec; 356 tv->tv_sec = now.tv_sec;
358 tv->tv_usec = now.tv_nsec/1000; 357 tv->tv_usec = now.tv_nsec/1000;
359} 358}
360
361EXPORT_SYMBOL(do_gettimeofday); 359EXPORT_SYMBOL(do_gettimeofday);
360
362/** 361/**
363 * do_settimeofday - Sets the time of day 362 * do_settimeofday - Sets the time of day
364 * @tv: pointer to the timespec variable containing the new time 363 * @tv: pointer to the timespec variable containing the new time
@@ -392,7 +391,6 @@ int do_settimeofday(const struct timespec *tv)
392 391
393 return 0; 392 return 0;
394} 393}
395
396EXPORT_SYMBOL(do_settimeofday); 394EXPORT_SYMBOL(do_settimeofday);
397 395
398 396
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..6ec7e7e0db43 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer);
861 * 861 *
862 * mod_timer_pinned() is a way to update the expire field of an 862 * mod_timer_pinned() is a way to update the expire field of an
863 * active timer (if the timer is inactive it will be activated) 863 * active timer (if the timer is inactive it will be activated)
864 * and not allow the timer to be migrated to a different CPU. 864 * and to ensure that the timer is scheduled on the current CPU.
865 *
866 * Note that this does not prevent the timer from being migrated
867 * when the current CPU goes offline. If this is a problem for
868 * you, use CPU-hotplug notifiers to handle it correctly, for
869 * example, cancelling the timer when the corresponding CPU goes
870 * offline.
865 * 871 *
866 * mod_timer_pinned(timer, expires) is equivalent to: 872 * mod_timer_pinned(timer, expires) is equivalent to:
867 * 873 *
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1102 * warnings as well as problems when looking into 1108 * warnings as well as problems when looking into
1103 * timer->lockdep_map, make a copy and use that here. 1109 * timer->lockdep_map, make a copy and use that here.
1104 */ 1110 */
1105 struct lockdep_map lockdep_map = timer->lockdep_map; 1111 struct lockdep_map lockdep_map;
1112
1113 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1106#endif 1114#endif
1107 /* 1115 /*
1108 * Couple the lock chain with the lock chain at 1116 * Couple the lock chain with the lock chain at
@@ -1427,25 +1435,25 @@ SYSCALL_DEFINE0(getppid)
1427SYSCALL_DEFINE0(getuid) 1435SYSCALL_DEFINE0(getuid)
1428{ 1436{
1429 /* Only we change this so SMP safe */ 1437 /* Only we change this so SMP safe */
1430 return current_uid(); 1438 return from_kuid_munged(current_user_ns(), current_uid());
1431} 1439}
1432 1440
1433SYSCALL_DEFINE0(geteuid) 1441SYSCALL_DEFINE0(geteuid)
1434{ 1442{
1435 /* Only we change this so SMP safe */ 1443 /* Only we change this so SMP safe */
1436 return current_euid(); 1444 return from_kuid_munged(current_user_ns(), current_euid());
1437} 1445}
1438 1446
1439SYSCALL_DEFINE0(getgid) 1447SYSCALL_DEFINE0(getgid)
1440{ 1448{
1441 /* Only we change this so SMP safe */ 1449 /* Only we change this so SMP safe */
1442 return current_gid(); 1450 return from_kgid_munged(current_user_ns(), current_gid());
1443} 1451}
1444 1452
1445SYSCALL_DEFINE0(getegid) 1453SYSCALL_DEFINE0(getegid)
1446{ 1454{
1447 /* Only we change this so SMP safe */ 1455 /* Only we change this so SMP safe */
1448 return current_egid(); 1456 return from_kgid_munged(current_user_ns(), current_egid());
1449} 1457}
1450 1458
1451#endif 1459#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a1d2849f2473..8c4c07071cc5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,6 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
145 select KALLSYMS 144 select KALLSYMS
146 select GENERIC_TRACER 145 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 146 select CONTEXT_SWITCH_TRACER
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES
272 bool "Trace likely/unlikely profiler" 271 bool "Trace likely/unlikely profiler"
273 select TRACE_BRANCH_PROFILING 272 select TRACE_BRANCH_PROFILING
274 help 273 help
275 This tracer profiles all the the likely and unlikely macros 274 This tracer profiles all likely and unlikely macros
276 in the kernel. It will display the results in: 275 in the kernel. It will display the results in:
277 276
278 /sys/kernel/debug/tracing/trace_stat/branch_annotated 277 /sys/kernel/debug/tracing/trace_stat/branch_annotated
@@ -373,6 +372,7 @@ config KPROBE_EVENT
373 depends on HAVE_REGS_AND_STACK_ACCESS_API 372 depends on HAVE_REGS_AND_STACK_ACCESS_API
374 bool "Enable kprobes-based dynamic events" 373 bool "Enable kprobes-based dynamic events"
375 select TRACING 374 select TRACING
375 select PROBE_EVENTS
376 default y 376 default y
377 help 377 help
378 This allows the user to add tracing events (similar to tracepoints) 378 This allows the user to add tracing events (similar to tracepoints)
@@ -385,6 +385,25 @@ config KPROBE_EVENT
385 This option is also required by perf-probe subcommand of perf tools. 385 This option is also required by perf-probe subcommand of perf tools.
386 If you want to use perf tools, this option is strongly recommended. 386 If you want to use perf tools, this option is strongly recommended.
387 387
388config UPROBE_EVENT
389 bool "Enable uprobes-based dynamic events"
390 depends on ARCH_SUPPORTS_UPROBES
391 depends on MMU
392 select UPROBES
393 select PROBE_EVENTS
394 select TRACING
395 default n
396 help
397 This allows the user to add tracing events on top of userspace
398 dynamic events (similar to tracepoints) on the fly via the trace
399 events interface. Those events can be inserted wherever uprobes
400 can probe, and record various registers.
 401 This option is required if you plan to use the perf-probe subcommand
402 of perf tools on user space applications.
403
404config PROBE_EVENTS
405 def_bool n
406
388config DYNAMIC_FTRACE 407config DYNAMIC_FTRACE
389 bool "enable/disable ftrace tracepoints dynamically" 408 bool "enable/disable ftrace tracepoints dynamically"
390 depends on FUNCTION_TRACER 409 depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5ea..b831087c8200 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 41obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
45obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 44obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
46ifeq ($(CONFIG_BLOCK),y) 45ifeq ($(CONFIG_BLOCK),y)
47obj-$(CONFIG_EVENT_TRACING) += blktrace.o 46obj-$(CONFIG_EVENT_TRACING) += blktrace.o
@@ -61,5 +60,7 @@ endif
61ifeq ($(CONFIG_TRACING),y) 60ifeq ($(CONFIG_TRACING),y)
62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 61obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
63endif 62endif
63obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
64obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
64 65
65libftrace-y := ftrace.o 66libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0fa92f677c92..a008663d86c8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1383 1383
1384static int ftrace_cmp_recs(const void *a, const void *b) 1384static int ftrace_cmp_recs(const void *a, const void *b)
1385{ 1385{
1386 const struct dyn_ftrace *reca = a; 1386 const struct dyn_ftrace *key = a;
1387 const struct dyn_ftrace *recb = b; 1387 const struct dyn_ftrace *rec = b;
1388 1388
1389 if (reca->ip > recb->ip) 1389 if (key->flags < rec->ip)
1390 return 1;
1391 if (reca->ip < recb->ip)
1392 return -1; 1390 return -1;
1391 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1392 return 1;
1393 return 0; 1393 return 0;
1394} 1394}
1395 1395
1396/** 1396static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1397 * ftrace_location - return true if the ip giving is a traced location
1398 * @ip: the instruction pointer to check
1399 *
1400 * Returns 1 if @ip given is a pointer to a ftrace location.
1401 * That is, the instruction that is either a NOP or call to
1402 * the function tracer. It checks the ftrace internal tables to
1403 * determine if the address belongs or not.
1404 */
1405int ftrace_location(unsigned long ip)
1406{ 1397{
1407 struct ftrace_page *pg; 1398 struct ftrace_page *pg;
1408 struct dyn_ftrace *rec; 1399 struct dyn_ftrace *rec;
1409 struct dyn_ftrace key; 1400 struct dyn_ftrace key;
1410 1401
1411 key.ip = ip; 1402 key.ip = start;
1403 key.flags = end; /* overload flags, as it is unsigned long */
1412 1404
1413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 1405 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1406 if (end < pg->records[0].ip ||
1407 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1408 continue;
1414 rec = bsearch(&key, pg->records, pg->index, 1409 rec = bsearch(&key, pg->records, pg->index,
1415 sizeof(struct dyn_ftrace), 1410 sizeof(struct dyn_ftrace),
1416 ftrace_cmp_recs); 1411 ftrace_cmp_recs);
1417 if (rec) 1412 if (rec)
1418 return 1; 1413 return rec->ip;
1419 } 1414 }
1420 1415
1421 return 0; 1416 return 0;
1422} 1417}
1423 1418
1419/**
 1420 * ftrace_location - return the address if the given ip is a traced location
1421 * @ip: the instruction pointer to check
1422 *
1423 * Returns rec->ip if @ip given is a pointer to a ftrace location.
1424 * That is, the instruction that is either a NOP or call to
1425 * the function tracer. It checks the ftrace internal tables to
1426 * determine if the address belongs or not.
1427 */
1428unsigned long ftrace_location(unsigned long ip)
1429{
1430 return ftrace_location_range(ip, ip);
1431}
1432
1433/**
1434 * ftrace_text_reserved - return true if range contains an ftrace location
1435 * @start: start of range to search
1436 * @end: end of range to search (inclusive). @end points to the last byte to check.
1437 *
 1438 * Returns 1 if the range from @start to @end contains a ftrace location.
1439 * That is, the instruction that is either a NOP or call to
1440 * the function tracer. It checks the ftrace internal tables to
1441 * determine if the address belongs or not.
1442 */
1443int ftrace_text_reserved(void *start, void *end)
1444{
1445 unsigned long ret;
1446
1447 ret = ftrace_location_range((unsigned long)start,
1448 (unsigned long)end);
1449
1450 return (int)!!ret;
1451}
1452
1424static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1453static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1425 int filter_hash, 1454 int filter_hash,
1426 bool inc) 1455 bool inc)
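
The rewritten ftrace_cmp_recs() above overloads key->flags as the end of a search range, so a single bsearch() can answer "does any mcount record fall inside [start, end]?". A standalone user-space sketch of the same trick; INSN_SIZE stands in for MCOUNT_INSN_SIZE and the record values are invented.

    #include <stdio.h>
    #include <stdlib.h>

    struct rec { unsigned long ip; unsigned long flags; };
    #define INSN_SIZE 4  /* stand-in for MCOUNT_INSN_SIZE */

    /* key->ip is the range start, key->flags the range end (inclusive). */
    static int cmp_range(const void *a, const void *b)
    {
            const struct rec *key = a, *rec = b;

            if (key->flags < rec->ip)
                    return -1;
            if (key->ip >= rec->ip + INSN_SIZE)
                    return 1;
            return 0;
    }

    int main(void)
    {
            struct rec recs[] = { { 100 }, { 200 }, { 300 } };  /* sorted by ip */
            struct rec key = { .ip = 198, .flags = 205 };       /* search [198, 205] */
            struct rec *hit = bsearch(&key, recs, 3, sizeof(recs[0]), cmp_range);

            printf("hit: %lu\n", hit ? hit->ip : 0UL);          /* prints 200 */
            return 0;
    }
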
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1520 __ftrace_hash_rec_update(ops, filter_hash, 1); 1549 __ftrace_hash_rec_update(ops, filter_hash, 1);
1521} 1550}
1522 1551
1523static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1524{
1525 if (ftrace_pages->index == ftrace_pages->size) {
1526 /* We should have allocated enough */
1527 if (WARN_ON(!ftrace_pages->next))
1528 return NULL;
1529 ftrace_pages = ftrace_pages->next;
1530 }
1531
1532 return &ftrace_pages->records[ftrace_pages->index++];
1533}
1534
1535static struct dyn_ftrace *
1536ftrace_record_ip(unsigned long ip)
1537{
1538 struct dyn_ftrace *rec;
1539
1540 if (ftrace_disabled)
1541 return NULL;
1542
1543 rec = ftrace_alloc_dyn_node(ip);
1544 if (!rec)
1545 return NULL;
1546
1547 rec->ip = ip;
1548
1549 return rec;
1550}
1551
1552static void print_ip_ins(const char *fmt, unsigned char *p) 1552static void print_ip_ins(const char *fmt, unsigned char *p)
1553{ 1553{
1554 int i; 1554 int i;
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip)
1598 } 1598 }
1599} 1599}
1600 1600
1601
1602/* Return 1 if the address range is reserved for ftrace */
1603int ftrace_text_reserved(void *start, void *end)
1604{
1605 struct dyn_ftrace *rec;
1606 struct ftrace_page *pg;
1607
1608 do_for_each_ftrace_rec(pg, rec) {
1609 if (rec->ip <= (unsigned long)end &&
1610 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1611 return 1;
1612 } while_for_each_ftrace_rec();
1613 return 0;
1614}
1615
1616static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1601static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1617{ 1602{
1618 unsigned long flag = 0UL; 1603 unsigned long flag = 0UL;
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1698 return -1; /* unknow ftrace bug */ 1683 return -1; /* unknow ftrace bug */
1699} 1684}
1700 1685
1701static void ftrace_replace_code(int update) 1686void __weak ftrace_replace_code(int enable)
1702{ 1687{
1703 struct dyn_ftrace *rec; 1688 struct dyn_ftrace *rec;
1704 struct ftrace_page *pg; 1689 struct ftrace_page *pg;
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update)
1708 return; 1693 return;
1709 1694
1710 do_for_each_ftrace_rec(pg, rec) { 1695 do_for_each_ftrace_rec(pg, rec) {
1711 failed = __ftrace_replace_code(rec, update); 1696 failed = __ftrace_replace_code(rec, enable);
1712 if (failed) { 1697 if (failed) {
1713 ftrace_bug(failed, rec->ip); 1698 ftrace_bug(failed, rec->ip);
1714 /* Stop processing */ 1699 /* Stop processing */
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1826 return 0; 1811 return 0;
1827} 1812}
1828 1813
1829static int __ftrace_modify_code(void *data) 1814void ftrace_modify_all_code(int command)
1830{ 1815{
1831 int *command = data; 1816 if (command & FTRACE_UPDATE_CALLS)
1832
1833 if (*command & FTRACE_UPDATE_CALLS)
1834 ftrace_replace_code(1); 1817 ftrace_replace_code(1);
1835 else if (*command & FTRACE_DISABLE_CALLS) 1818 else if (command & FTRACE_DISABLE_CALLS)
1836 ftrace_replace_code(0); 1819 ftrace_replace_code(0);
1837 1820
1838 if (*command & FTRACE_UPDATE_TRACE_FUNC) 1821 if (command & FTRACE_UPDATE_TRACE_FUNC)
1839 ftrace_update_ftrace_func(ftrace_trace_function); 1822 ftrace_update_ftrace_func(ftrace_trace_function);
1840 1823
1841 if (*command & FTRACE_START_FUNC_RET) 1824 if (command & FTRACE_START_FUNC_RET)
1842 ftrace_enable_ftrace_graph_caller(); 1825 ftrace_enable_ftrace_graph_caller();
1843 else if (*command & FTRACE_STOP_FUNC_RET) 1826 else if (command & FTRACE_STOP_FUNC_RET)
1844 ftrace_disable_ftrace_graph_caller(); 1827 ftrace_disable_ftrace_graph_caller();
1828}
1829
1830static int __ftrace_modify_code(void *data)
1831{
1832 int *command = data;
1833
1834 ftrace_modify_all_code(*command);
1845 1835
1846 return 0; 1836 return 0;
1847} 1837}
@@ -2469,57 +2459,35 @@ static int
2469ftrace_avail_open(struct inode *inode, struct file *file) 2459ftrace_avail_open(struct inode *inode, struct file *file)
2470{ 2460{
2471 struct ftrace_iterator *iter; 2461 struct ftrace_iterator *iter;
2472 int ret;
2473 2462
2474 if (unlikely(ftrace_disabled)) 2463 if (unlikely(ftrace_disabled))
2475 return -ENODEV; 2464 return -ENODEV;
2476 2465
2477 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2466 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2478 if (!iter) 2467 if (iter) {
2479 return -ENOMEM; 2468 iter->pg = ftrace_pages_start;
2480 2469 iter->ops = &global_ops;
2481 iter->pg = ftrace_pages_start;
2482 iter->ops = &global_ops;
2483
2484 ret = seq_open(file, &show_ftrace_seq_ops);
2485 if (!ret) {
2486 struct seq_file *m = file->private_data;
2487
2488 m->private = iter;
2489 } else {
2490 kfree(iter);
2491 } 2470 }
2492 2471
2493 return ret; 2472 return iter ? 0 : -ENOMEM;
2494} 2473}
2495 2474
2496static int 2475static int
2497ftrace_enabled_open(struct inode *inode, struct file *file) 2476ftrace_enabled_open(struct inode *inode, struct file *file)
2498{ 2477{
2499 struct ftrace_iterator *iter; 2478 struct ftrace_iterator *iter;
2500 int ret;
2501 2479
2502 if (unlikely(ftrace_disabled)) 2480 if (unlikely(ftrace_disabled))
2503 return -ENODEV; 2481 return -ENODEV;
2504 2482
2505 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2483 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
2506 if (!iter) 2484 if (iter) {
2507 return -ENOMEM; 2485 iter->pg = ftrace_pages_start;
2508 2486 iter->flags = FTRACE_ITER_ENABLED;
2509 iter->pg = ftrace_pages_start; 2487 iter->ops = &global_ops;
2510 iter->flags = FTRACE_ITER_ENABLED;
2511 iter->ops = &global_ops;
2512
2513 ret = seq_open(file, &show_ftrace_seq_ops);
2514 if (!ret) {
2515 struct seq_file *m = file->private_data;
2516
2517 m->private = iter;
2518 } else {
2519 kfree(iter);
2520 } 2488 }
2521 2489
2522 return ret; 2490 return iter ? 0 : -ENOMEM;
2523} 2491}
2524 2492
2525static void ftrace_filter_reset(struct ftrace_hash *hash) 2493static void ftrace_filter_reset(struct ftrace_hash *hash)
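
The two open() handlers above are reduced to __seq_open_private(), which allocates a zeroed private structure, opens the seq_file and stores the structure in ->private in one call, returning NULL on failure. A hedged sketch of the same pattern with placeholder names (my_iter, my_seq_ops and my_open are not from this patch).

    #include <linux/fs.h>
    #include <linux/seq_file.h>

    struct my_iter {
            int pos;
    };

    static const struct seq_operations my_seq_ops;  /* assumed defined elsewhere */

    static int my_open(struct inode *inode, struct file *file)
    {
            struct my_iter *iter;

            iter = __seq_open_private(file, &my_seq_ops, sizeof(*iter));
            if (!iter)
                    return -ENOMEM;
            iter->pos = 0;  /* memory is already zeroed; shown for clarity */
            return 0;
    }
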
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3688 return 0; 3656 return 0;
3689} 3657}
3690 3658
3691static void ftrace_swap_recs(void *a, void *b, int size) 3659static int ftrace_cmp_ips(const void *a, const void *b)
3660{
3661 const unsigned long *ipa = a;
3662 const unsigned long *ipb = b;
3663
3664 if (*ipa > *ipb)
3665 return 1;
3666 if (*ipa < *ipb)
3667 return -1;
3668 return 0;
3669}
3670
3671static void ftrace_swap_ips(void *a, void *b, int size)
3692{ 3672{
3693 struct dyn_ftrace *reca = a; 3673 unsigned long *ipa = a;
3694 struct dyn_ftrace *recb = b; 3674 unsigned long *ipb = b;
3695 struct dyn_ftrace t; 3675 unsigned long t;
3696 3676
3697 t = *reca; 3677 t = *ipa;
3698 *reca = *recb; 3678 *ipa = *ipb;
3699 *recb = t; 3679 *ipb = t;
3700} 3680}
3701 3681
3702static int ftrace_process_locs(struct module *mod, 3682static int ftrace_process_locs(struct module *mod,
3703 unsigned long *start, 3683 unsigned long *start,
3704 unsigned long *end) 3684 unsigned long *end)
3705{ 3685{
3686 struct ftrace_page *start_pg;
3706 struct ftrace_page *pg; 3687 struct ftrace_page *pg;
3688 struct dyn_ftrace *rec;
3707 unsigned long count; 3689 unsigned long count;
3708 unsigned long *p; 3690 unsigned long *p;
3709 unsigned long addr; 3691 unsigned long addr;
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod,
3715 if (!count) 3697 if (!count)
3716 return 0; 3698 return 0;
3717 3699
3718 pg = ftrace_allocate_pages(count); 3700 sort(start, count, sizeof(*start),
3719 if (!pg) 3701 ftrace_cmp_ips, ftrace_swap_ips);
3702
3703 start_pg = ftrace_allocate_pages(count);
3704 if (!start_pg)
3720 return -ENOMEM; 3705 return -ENOMEM;
3721 3706
3722 mutex_lock(&ftrace_lock); 3707 mutex_lock(&ftrace_lock);
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod,
3729 if (!mod) { 3714 if (!mod) {
3730 WARN_ON(ftrace_pages || ftrace_pages_start); 3715 WARN_ON(ftrace_pages || ftrace_pages_start);
3731 /* First initialization */ 3716 /* First initialization */
3732 ftrace_pages = ftrace_pages_start = pg; 3717 ftrace_pages = ftrace_pages_start = start_pg;
3733 } else { 3718 } else {
3734 if (!ftrace_pages) 3719 if (!ftrace_pages)
3735 goto out; 3720 goto out;
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod,
3740 ftrace_pages = ftrace_pages->next; 3725 ftrace_pages = ftrace_pages->next;
3741 } 3726 }
3742 3727
3743 ftrace_pages->next = pg; 3728 ftrace_pages->next = start_pg;
3744 ftrace_pages = pg;
3745 } 3729 }
3746 3730
3747 p = start; 3731 p = start;
3732 pg = start_pg;
3748 while (p < end) { 3733 while (p < end) {
3749 addr = ftrace_call_adjust(*p++); 3734 addr = ftrace_call_adjust(*p++);
3750 /* 3735 /*
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod,
3755 */ 3740 */
3756 if (!addr) 3741 if (!addr)
3757 continue; 3742 continue;
3758 if (!ftrace_record_ip(addr)) 3743
3759 break; 3744 if (pg->index == pg->size) {
3745 /* We should have allocated enough */
3746 if (WARN_ON(!pg->next))
3747 break;
3748 pg = pg->next;
3749 }
3750
3751 rec = &pg->records[pg->index++];
3752 rec->ip = addr;
3760 } 3753 }
3761 3754
3762 /* These new locations need to be initialized */ 3755 /* We should have used all pages */
3763 ftrace_new_pgs = pg; 3756 WARN_ON(pg->next);
3757
3758 /* Assign the last page to ftrace_pages */
3759 ftrace_pages = pg;
3764 3760
3765 /* Make each individual set of pages sorted by ips */ 3761 /* These new locations need to be initialized */
3766 for (; pg; pg = pg->next) 3762 ftrace_new_pgs = start_pg;
3767 sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
3768 ftrace_cmp_recs, ftrace_swap_recs);
3769 3763
3770 /* 3764 /*
3771 * We only need to disable interrupts on start up 3765 * We only need to disable interrupts on start up
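
ftrace_process_locs() now sorts the raw mcount address array up front using the lib/sort.c interface, passing both a compare and a swap callback as shown above. A minimal sketch of that calling convention; sort_mcount_locs() and the callback names are illustrative, not the patch's own.

    #include <linux/kernel.h>
    #include <linux/sort.h>

    static int cmp_ips(const void *a, const void *b)
    {
            const unsigned long *x = a, *y = b;

            return (*x > *y) - (*x < *y);
    }

    static void swap_ips(void *a, void *b, int size)
    {
            unsigned long t = *(unsigned long *)a;

            *(unsigned long *)a = *(unsigned long *)b;
            *(unsigned long *)b = t;
    }

    static void sort_mcount_locs(unsigned long *start, unsigned long *end)
    {
            sort(start, end - start, sizeof(*start), cmp_ips, swap_ips);
    }
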
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cf8d11e91efd..1d0f6a8a0e5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,6 +23,8 @@
23#include <asm/local.h> 23#include <asm/local.h>
24#include "trace.h" 24#include "trace.h"
25 25
26static void update_pages_handler(struct work_struct *work);
27
26/* 28/*
27 * The ring buffer header is special. We must manually up keep it. 29 * The ring buffer header is special. We must manually up keep it.
28 */ 30 */
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu {
449 raw_spinlock_t reader_lock; /* serialize readers */ 451 raw_spinlock_t reader_lock; /* serialize readers */
450 arch_spinlock_t lock; 452 arch_spinlock_t lock;
451 struct lock_class_key lock_key; 453 struct lock_class_key lock_key;
454 unsigned int nr_pages;
452 struct list_head *pages; 455 struct list_head *pages;
453 struct buffer_page *head_page; /* read from head */ 456 struct buffer_page *head_page; /* read from head */
454 struct buffer_page *tail_page; /* write to tail */ 457 struct buffer_page *tail_page; /* write to tail */
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu {
466 unsigned long read_bytes; 469 unsigned long read_bytes;
467 u64 write_stamp; 470 u64 write_stamp;
468 u64 read_stamp; 471 u64 read_stamp;
472 /* ring buffer pages to update, > 0 to add, < 0 to remove */
473 int nr_pages_to_update;
474 struct list_head new_pages; /* new pages to add */
475 struct work_struct update_pages_work;
476 struct completion update_done;
469}; 477};
470 478
471struct ring_buffer { 479struct ring_buffer {
472 unsigned pages;
473 unsigned flags; 480 unsigned flags;
474 int cpus; 481 int cpus;
475 atomic_t record_disabled; 482 atomic_t record_disabled;
483 atomic_t resize_disabled;
476 cpumask_var_t cpumask; 484 cpumask_var_t cpumask;
477 485
478 struct lock_class_key *reader_lock_key; 486 struct lock_class_key *reader_lock_key;
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
937 struct list_head *head = cpu_buffer->pages; 945 struct list_head *head = cpu_buffer->pages;
938 struct buffer_page *bpage, *tmp; 946 struct buffer_page *bpage, *tmp;
939 947
948 /* Reset the head page if it exists */
949 if (cpu_buffer->head_page)
950 rb_set_head_page(cpu_buffer);
951
940 rb_head_page_deactivate(cpu_buffer); 952 rb_head_page_deactivate(cpu_buffer);
941 953
942 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 954 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
963 return 0; 975 return 0;
964} 976}
965 977
966static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 978static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
967 unsigned nr_pages)
968{ 979{
980 int i;
969 struct buffer_page *bpage, *tmp; 981 struct buffer_page *bpage, *tmp;
970 LIST_HEAD(pages);
971 unsigned i;
972
973 WARN_ON(!nr_pages);
974 982
975 for (i = 0; i < nr_pages; i++) { 983 for (i = 0; i < nr_pages; i++) {
976 struct page *page; 984 struct page *page;
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
981 */ 989 */
982 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 990 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
983 GFP_KERNEL | __GFP_NORETRY, 991 GFP_KERNEL | __GFP_NORETRY,
984 cpu_to_node(cpu_buffer->cpu)); 992 cpu_to_node(cpu));
985 if (!bpage) 993 if (!bpage)
986 goto free_pages; 994 goto free_pages;
987 995
988 rb_check_bpage(cpu_buffer, bpage); 996 list_add(&bpage->list, pages);
989 997
990 list_add(&bpage->list, &pages); 998 page = alloc_pages_node(cpu_to_node(cpu),
991
992 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
993 GFP_KERNEL | __GFP_NORETRY, 0); 999 GFP_KERNEL | __GFP_NORETRY, 0);
994 if (!page) 1000 if (!page)
995 goto free_pages; 1001 goto free_pages;
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 rb_init_page(bpage->page); 1003 rb_init_page(bpage->page);
998 } 1004 }
999 1005
1006 return 0;
1007
1008free_pages:
1009 list_for_each_entry_safe(bpage, tmp, pages, list) {
1010 list_del_init(&bpage->list);
1011 free_buffer_page(bpage);
1012 }
1013
1014 return -ENOMEM;
1015}
1016
1017static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1018 unsigned nr_pages)
1019{
1020 LIST_HEAD(pages);
1021
1022 WARN_ON(!nr_pages);
1023
1024 if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
1025 return -ENOMEM;
1026
1000 /* 1027 /*
1001 * The ring buffer page list is a circular list that does not 1028 * The ring buffer page list is a circular list that does not
1002 * start and end with a list head. All page list items point to 1029 * start and end with a list head. All page list items point to
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1005 cpu_buffer->pages = pages.next; 1032 cpu_buffer->pages = pages.next;
1006 list_del(&pages); 1033 list_del(&pages);
1007 1034
1035 cpu_buffer->nr_pages = nr_pages;
1036
1008 rb_check_pages(cpu_buffer); 1037 rb_check_pages(cpu_buffer);
1009 1038
1010 return 0; 1039 return 0;
1011
1012 free_pages:
1013 list_for_each_entry_safe(bpage, tmp, &pages, list) {
1014 list_del_init(&bpage->list);
1015 free_buffer_page(bpage);
1016 }
1017 return -ENOMEM;
1018} 1040}
1019 1041
1020static struct ring_buffer_per_cpu * 1042static struct ring_buffer_per_cpu *
1021rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 1043rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1022{ 1044{
1023 struct ring_buffer_per_cpu *cpu_buffer; 1045 struct ring_buffer_per_cpu *cpu_buffer;
1024 struct buffer_page *bpage; 1046 struct buffer_page *bpage;
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1035 raw_spin_lock_init(&cpu_buffer->reader_lock); 1057 raw_spin_lock_init(&cpu_buffer->reader_lock);
1036 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1058 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1037 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1059 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1060 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1061 init_completion(&cpu_buffer->update_done);
1038 1062
1039 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1063 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1040 GFP_KERNEL, cpu_to_node(cpu)); 1064 GFP_KERNEL, cpu_to_node(cpu));
@@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1052 1076
1053 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1054 1078
1055 ret = rb_allocate_pages(cpu_buffer, buffer->pages); 1079 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1056 if (ret < 0) 1080 if (ret < 0)
1057 goto fail_free_reader; 1081 goto fail_free_reader;
1058 1082
@@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1113{ 1137{
1114 struct ring_buffer *buffer; 1138 struct ring_buffer *buffer;
1115 int bsize; 1139 int bsize;
1116 int cpu; 1140 int cpu, nr_pages;
1117 1141
1118 /* keep it in its own cache line */ 1142 /* keep it in its own cache line */
1119 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1143 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1124 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) 1148 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
1125 goto fail_free_buffer; 1149 goto fail_free_buffer;
1126 1150
1127 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1151 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1128 buffer->flags = flags; 1152 buffer->flags = flags;
1129 buffer->clock = trace_clock_local; 1153 buffer->clock = trace_clock_local;
1130 buffer->reader_lock_key = key; 1154 buffer->reader_lock_key = key;
1131 1155
1132 /* need at least two pages */ 1156 /* need at least two pages */
1133 if (buffer->pages < 2) 1157 if (nr_pages < 2)
1134 buffer->pages = 2; 1158 nr_pages = 2;
1135 1159
1136 /* 1160 /*
1137 * In case of non-hotplug cpu, if the ring-buffer is allocated 1161 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1154 1178
1155 for_each_buffer_cpu(buffer, cpu) { 1179 for_each_buffer_cpu(buffer, cpu) {
1156 buffer->buffers[cpu] = 1180 buffer->buffers[cpu] =
1157 rb_allocate_cpu_buffer(buffer, cpu); 1181 rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
1158 if (!buffer->buffers[cpu]) 1182 if (!buffer->buffers[cpu])
1159 goto fail_free_buffers; 1183 goto fail_free_buffers;
1160 } 1184 }
@@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
1222 1246
1223static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 1247static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1224 1248
1225static void 1249static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1226rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1227{ 1250{
1228 struct buffer_page *bpage; 1251 return local_read(&bpage->entries) & RB_WRITE_MASK;
1229 struct list_head *p; 1252}
1230 unsigned i; 1253
1254static inline unsigned long rb_page_write(struct buffer_page *bpage)
1255{
1256 return local_read(&bpage->write) & RB_WRITE_MASK;
1257}
1258
1259static int
1260rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1261{
1262 struct list_head *tail_page, *to_remove, *next_page;
1263 struct buffer_page *to_remove_page, *tmp_iter_page;
1264 struct buffer_page *last_page, *first_page;
1265 unsigned int nr_removed;
1266 unsigned long head_bit;
1267 int page_entries;
1268
1269 head_bit = 0;
1231 1270
1232 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1271 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1233 rb_head_page_deactivate(cpu_buffer); 1272 atomic_inc(&cpu_buffer->record_disabled);
1273 /*
1274 * We don't race with the readers since we have acquired the reader
1275 * lock. We also don't race with writers after disabling recording.
1276 * This makes it easy to figure out the first and the last page to be
1277 * removed from the list. We unlink all the pages in between including
1278 * the first and last pages. This is done in a busy loop so that we
1279 * lose the least number of traces.
1280 * The pages are freed after we restart recording and unlock readers.
1281 */
1282 tail_page = &cpu_buffer->tail_page->list;
1234 1283
1235 for (i = 0; i < nr_pages; i++) { 1284 /*
1236 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1285 * tail page might be on reader page, we remove the next page
1237 goto out; 1286 * from the ring buffer
1238 p = cpu_buffer->pages->next; 1287 */
1239 bpage = list_entry(p, struct buffer_page, list); 1288 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
1240 list_del_init(&bpage->list); 1289 tail_page = rb_list_head(tail_page->next);
1241 free_buffer_page(bpage); 1290 to_remove = tail_page;
1291
1292 /* start of pages to remove */
1293 first_page = list_entry(rb_list_head(to_remove->next),
1294 struct buffer_page, list);
1295
1296 for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
1297 to_remove = rb_list_head(to_remove)->next;
1298 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
1242 } 1299 }
1243 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1244 goto out;
1245 1300
1246 rb_reset_cpu(cpu_buffer); 1301 next_page = rb_list_head(to_remove)->next;
1247 rb_check_pages(cpu_buffer);
1248 1302
1249out: 1303 /*
1304 * Now we remove all pages between tail_page and next_page.
1305 * Make sure that we have head_bit value preserved for the
1306 * next page
1307 */
1308 tail_page->next = (struct list_head *)((unsigned long)next_page |
1309 head_bit);
1310 next_page = rb_list_head(next_page);
1311 next_page->prev = tail_page;
1312
1313 /* make sure pages points to a valid page in the ring buffer */
1314 cpu_buffer->pages = next_page;
1315
1316 /* update head page */
1317 if (head_bit)
1318 cpu_buffer->head_page = list_entry(next_page,
1319 struct buffer_page, list);
1320
1321 /*
1322 * change read pointer to make sure any read iterators reset
1323 * themselves
1324 */
1325 cpu_buffer->read = 0;
1326
1327 /* pages are removed, resume tracing and then free the pages */
1328 atomic_dec(&cpu_buffer->record_disabled);
1250 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1329 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1330
1331 RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
1332
1333 /* last buffer page to remove */
1334 last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
1335 list);
1336 tmp_iter_page = first_page;
1337
1338 do {
1339 to_remove_page = tmp_iter_page;
1340 rb_inc_page(cpu_buffer, &tmp_iter_page);
1341
1342 /* update the counters */
1343 page_entries = rb_page_entries(to_remove_page);
1344 if (page_entries) {
1345 /*
1346 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 }
1355
1356 /*
1357 * We have already removed references to this list item, just
1358 * free up the buffer_page and its page
1359 */
1360 free_buffer_page(to_remove_page);
1361 nr_removed--;
1362
1363 } while (to_remove_page != last_page);
1364
1365 RB_WARN_ON(cpu_buffer, nr_removed);
1366
1367 return nr_removed == 0;
1251} 1368}
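
The head of the page list is marked by a flag bit (RB_PAGE_HEAD) stored in the low bits of a list pointer, which is why the removal path above keeps folding the bit into head_bit and ORs it back in when it relinks tail_page->next. A minimal user-space sketch of that low-bit pointer tagging follows; the names and flag values are illustrative, not taken from the kernel source, and it relies only on struct pointers being at least 4-byte aligned.

/* Sketch: keep a flag in the low bits of a list pointer and preserve it
 * across an unlink, as the ring buffer does with RB_PAGE_HEAD. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_HEAD 0x1UL          /* illustrative flag, lives in bit 0 */
#define FLAG_MASK 0x3UL          /* low two bits reserved for flags  */

struct node {
    struct node *next;
    int id;
};

/* strip the flag bits to get a dereferenceable pointer (rb_list_head analogue) */
static struct node *list_head(struct node *p)
{
    return (struct node *)((uintptr_t)p & ~(uintptr_t)FLAG_MASK);
}

int main(void)
{
    struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
    uintptr_t head_bit;
    struct node *next;

    a.next = (struct node *)((uintptr_t)&b | PAGE_HEAD); /* b is the "head" page */
    b.next = &c;
    c.next = &a;

    /* unlink b: remember the flag, re-point a past it, preserving the flag */
    head_bit = (uintptr_t)a.next & PAGE_HEAD;
    next = list_head(a.next)->next;
    a.next = (struct node *)((uintptr_t)next | head_bit);

    assert(list_head(a.next)->id == 3);
    assert(((uintptr_t)a.next & PAGE_HEAD) == PAGE_HEAD);
    printf("head flag survived the unlink\n");
    return 0;
}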
1252 1369
1253static void 1370static int
1254rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 1371rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1255 struct list_head *pages, unsigned nr_pages)
1256{ 1372{
1257 struct buffer_page *bpage; 1373 struct list_head *pages = &cpu_buffer->new_pages;
1258 struct list_head *p; 1374 int retries, success;
1259 unsigned i;
1260 1375
1261 raw_spin_lock_irq(&cpu_buffer->reader_lock); 1376 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1262 rb_head_page_deactivate(cpu_buffer); 1377 /*
1378 * We are holding the reader lock, so the reader page won't be swapped
1379 * in the ring buffer. Now we are racing with the writer trying to
1380 * move head page and the tail page.
1381 * We are going to adapt the reader page update process where:
1382 * 1. We first splice the start and end of list of new pages between
1383 * the head page and its previous page.
1384 * 2. We cmpxchg the prev_page->next to point from head page to the
1385 * start of new pages list.
1386 * 3. Finally, we update the head->prev to the end of new list.
1387 *
1388 * We will try this process 10 times, to make sure that we don't keep
1389 * spinning.
1390 */
1391 retries = 10;
1392 success = 0;
1393 while (retries--) {
1394 struct list_head *head_page, *prev_page, *r;
1395 struct list_head *last_page, *first_page;
1396 struct list_head *head_page_with_bit;
1263 1397
1264 for (i = 0; i < nr_pages; i++) { 1398 head_page = &rb_set_head_page(cpu_buffer)->list;
1265 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1399 prev_page = head_page->prev;
1266 goto out; 1400
1267 p = pages->next; 1401 first_page = pages->next;
1268 bpage = list_entry(p, struct buffer_page, list); 1402 last_page = pages->prev;
1269 list_del_init(&bpage->list); 1403
1270 list_add_tail(&bpage->list, cpu_buffer->pages); 1404 head_page_with_bit = (struct list_head *)
1405 ((unsigned long)head_page | RB_PAGE_HEAD);
1406
1407 last_page->next = head_page_with_bit;
1408 first_page->prev = prev_page;
1409
1410 r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
1411
1412 if (r == head_page_with_bit) {
1413 /*
1414 * yay, we replaced the page pointer to our new list,
1415 * now, we just have to update to head page's prev
1416 * pointer to point to end of list
1417 */
1418 head_page->prev = last_page;
1419 success = 1;
1420 break;
1421 }
1271 } 1422 }
1272 rb_reset_cpu(cpu_buffer);
1273 rb_check_pages(cpu_buffer);
1274 1423
1275out: 1424 if (success)
1425 INIT_LIST_HEAD(pages);
1426 /*
 1427 * If we weren't successful in adding the new pages, warn and stop
1428 * tracing
1429 */
1430 RB_WARN_ON(cpu_buffer, !success);
1276 raw_spin_unlock_irq(&cpu_buffer->reader_lock); 1431 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1432
1433 /* free pages if they weren't inserted */
1434 if (!success) {
1435 struct buffer_page *bpage, *tmp;
1436 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1437 list) {
1438 list_del_init(&bpage->list);
1439 free_buffer_page(bpage);
1440 }
1441 }
1442 return success;
1443}
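
The insertion above wires the new chain up privately and then publishes it with a single cmpxchg() on prev_page->next, retrying a bounded number of times if a writer moved the head underneath it. Below is a compact user-space sketch of that publish step using C11 atomics; the head re-lookup and the RB_PAGE_HEAD tag bit are deliberately omitted, and every identifier is invented for the example.

/* Sketch: publish a privately built chain [first..last] in front of `head`
 * with one compare-and-swap on prev->next.  Illustrative, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

struct node {
    _Atomic(struct node *) next;
    struct node *prev;
    int id;
};

static int splice_before_head(struct node *head, struct node *first,
                              struct node *last)
{
    int retries = 10;

    while (retries--) {
        struct node *prev = head->prev;
        struct node *expected = head;

        /* wire the new chain up before making it visible */
        atomic_store(&last->next, head);
        first->prev = prev;

        if (atomic_compare_exchange_strong(&prev->next, &expected, first)) {
            head->prev = last;   /* only the splicer updates head->prev */
            return 1;
        }
        /* prev->next changed under us; reload and try again */
    }
    return 0;
}

int main(void)
{
    struct node h = { .id = 0 }, p = { .id = 1 }, n = { .id = 2 };

    atomic_store(&p.next, &h);  h.prev = &p;   /* two-node ring: p <-> h */
    atomic_store(&h.next, &p);  p.prev = &h;

    if (splice_before_head(&h, &n, &n))        /* insert the one-node chain n */
        printf("p now points at node %d\n", atomic_load(&p.next)->id);
    return 0;
}

The bounded retry mirrors the patch's choice: rather than spin forever against a fast writer, it gives up after ten attempts, warns, and frees the staged pages.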
1444
1445static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
1446{
1447 int success;
1448
1449 if (cpu_buffer->nr_pages_to_update > 0)
1450 success = rb_insert_pages(cpu_buffer);
1451 else
1452 success = rb_remove_pages(cpu_buffer,
1453 -cpu_buffer->nr_pages_to_update);
1454
1455 if (success)
1456 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
1457}
1458
1459static void update_pages_handler(struct work_struct *work)
1460{
1461 struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
1462 struct ring_buffer_per_cpu, update_pages_work);
1463 rb_update_pages(cpu_buffer);
1464 complete(&cpu_buffer->update_done);
1277} 1465}
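
update_pages_handler() is the bridge between the resize path and the per-CPU buffers: ring_buffer_resize() queues it with schedule_work_on() so the update runs on the CPU that owns the buffer, then blocks on update_done. A minimal module-style sketch of that schedule_work_on()/completion handshake follows; the identifiers are invented for the illustration and this is not code from the patch.

/* Sketch of the schedule_work_on() + completion handshake used by the
 * resize path.  Minimal, illustrative module code. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static struct work_struct demo_work;
static struct completion demo_done;

static void demo_work_fn(struct work_struct *work)
{
        pr_info("resize-style work ran on CPU %d\n", raw_smp_processor_id());
        complete(&demo_done);
}

static int __init demo_init(void)
{
        int cpu = cpumask_first(cpu_online_mask);

        INIT_WORK(&demo_work, demo_work_fn);
        init_completion(&demo_done);

        schedule_work_on(cpu, &demo_work);      /* run on one specific CPU ... */
        wait_for_completion(&demo_done);        /* ... and block until it signals us */
        return 0;
}

static void __exit demo_exit(void)
{
        /* nothing to undo: init waited for the work to complete */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Offline CPUs never get work scheduled on them; as the resize hunk below shows, their buffers are updated directly with rb_update_pages() since nothing can race there.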
1278 1466
1279/** 1467/**
@@ -1283,16 +1471,14 @@ out:
1283 * 1471 *
1284 * Minimum size is 2 * BUF_PAGE_SIZE. 1472 * Minimum size is 2 * BUF_PAGE_SIZE.
1285 * 1473 *
1286 * Returns -1 on failure. 1474 * Returns 0 on success and < 0 on failure.
1287 */ 1475 */
1288int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) 1476int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1477 int cpu_id)
1289{ 1478{
1290 struct ring_buffer_per_cpu *cpu_buffer; 1479 struct ring_buffer_per_cpu *cpu_buffer;
1291 unsigned nr_pages, rm_pages, new_pages; 1480 unsigned nr_pages;
1292 struct buffer_page *bpage, *tmp; 1481 int cpu, err = 0;
1293 unsigned long buffer_size;
1294 LIST_HEAD(pages);
1295 int i, cpu;
1296 1482
1297 /* 1483 /*
1298 * Always succeed at resizing a non-existent buffer: 1484 * Always succeed at resizing a non-existent buffer:
@@ -1300,115 +1486,161 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1300 if (!buffer) 1486 if (!buffer)
1301 return size; 1487 return size;
1302 1488
1489 /* Make sure the requested buffer exists */
1490 if (cpu_id != RING_BUFFER_ALL_CPUS &&
1491 !cpumask_test_cpu(cpu_id, buffer->cpumask))
1492 return size;
1493
1303 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1494 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1304 size *= BUF_PAGE_SIZE; 1495 size *= BUF_PAGE_SIZE;
1305 buffer_size = buffer->pages * BUF_PAGE_SIZE;
1306 1496
1307 /* we need a minimum of two pages */ 1497 /* we need a minimum of two pages */
1308 if (size < BUF_PAGE_SIZE * 2) 1498 if (size < BUF_PAGE_SIZE * 2)
1309 size = BUF_PAGE_SIZE * 2; 1499 size = BUF_PAGE_SIZE * 2;
1310 1500
1311 if (size == buffer_size) 1501 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1312 return size;
1313
1314 atomic_inc(&buffer->record_disabled);
1315 1502
1316 /* Make sure all writers are done with this buffer. */ 1503 /*
1317 synchronize_sched(); 1504 * Don't succeed if resizing is disabled, as a reader might be
1505 * manipulating the ring buffer and is expecting a sane state while
1506 * this is true.
1507 */
1508 if (atomic_read(&buffer->resize_disabled))
1509 return -EBUSY;
1318 1510
1511 /* prevent another thread from changing buffer sizes */
1319 mutex_lock(&buffer->mutex); 1512 mutex_lock(&buffer->mutex);
1320 get_online_cpus();
1321
1322 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1323 1513
1324 if (size < buffer_size) { 1514 if (cpu_id == RING_BUFFER_ALL_CPUS) {
1515 /* calculate the pages to update */
1516 for_each_buffer_cpu(buffer, cpu) {
1517 cpu_buffer = buffer->buffers[cpu];
1325 1518
1326 /* easy case, just free pages */ 1519 cpu_buffer->nr_pages_to_update = nr_pages -
1327 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) 1520 cpu_buffer->nr_pages;
1328 goto out_fail; 1521 /*
 1522 * nothing more to do when removing pages or when there is no update
1523 */
1524 if (cpu_buffer->nr_pages_to_update <= 0)
1525 continue;
1526 /*
1527 * to add pages, make sure all new pages can be
1528 * allocated without receiving ENOMEM
1529 */
1530 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1531 if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1532 &cpu_buffer->new_pages, cpu)) {
1533 /* not enough memory for new pages */
1534 err = -ENOMEM;
1535 goto out_err;
1536 }
1537 }
1329 1538
1330 rm_pages = buffer->pages - nr_pages; 1539 get_online_cpus();
1540 /*
1541 * Fire off all the required work handlers
1542 * We can't schedule on offline CPUs, but it's not necessary
1543 * since we can change their buffer sizes without any race.
1544 */
1545 for_each_buffer_cpu(buffer, cpu) {
1546 cpu_buffer = buffer->buffers[cpu];
1547 if (!cpu_buffer->nr_pages_to_update)
1548 continue;
1549
1550 if (cpu_online(cpu))
1551 schedule_work_on(cpu,
1552 &cpu_buffer->update_pages_work);
1553 else
1554 rb_update_pages(cpu_buffer);
1555 }
1331 1556
1557 /* wait for all the updates to complete */
1332 for_each_buffer_cpu(buffer, cpu) { 1558 for_each_buffer_cpu(buffer, cpu) {
1333 cpu_buffer = buffer->buffers[cpu]; 1559 cpu_buffer = buffer->buffers[cpu];
1334 rb_remove_pages(cpu_buffer, rm_pages); 1560 if (!cpu_buffer->nr_pages_to_update)
1561 continue;
1562
1563 if (cpu_online(cpu))
1564 wait_for_completion(&cpu_buffer->update_done);
1565 cpu_buffer->nr_pages_to_update = 0;
1335 } 1566 }
1336 goto out;
1337 }
1338 1567
1339 /* 1568 put_online_cpus();
1340 * This is a bit more difficult. We only want to add pages 1569 } else {
1341 * when we can allocate enough for all CPUs. We do this 1570 cpu_buffer = buffer->buffers[cpu_id];
1342 * by allocating all the pages and storing them on a local
1343 * link list. If we succeed in our allocation, then we
1344 * add these pages to the cpu_buffers. Otherwise we just free
1345 * them all and return -ENOMEM;
1346 */
1347 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
1348 goto out_fail;
1349 1571
1350 new_pages = nr_pages - buffer->pages; 1572 if (nr_pages == cpu_buffer->nr_pages)
1573 goto out;
1351 1574
1352 for_each_buffer_cpu(buffer, cpu) { 1575 cpu_buffer->nr_pages_to_update = nr_pages -
1353 for (i = 0; i < new_pages; i++) { 1576 cpu_buffer->nr_pages;
1354 struct page *page; 1577
1355 /* 1578 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1356 * __GFP_NORETRY flag makes sure that the allocation 1579 if (cpu_buffer->nr_pages_to_update > 0 &&
1357 * fails gracefully without invoking oom-killer and 1580 __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1358 * the system is not destabilized. 1581 &cpu_buffer->new_pages, cpu_id)) {
1359 */ 1582 err = -ENOMEM;
1360 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1583 goto out_err;
1361 cache_line_size()),
1362 GFP_KERNEL | __GFP_NORETRY,
1363 cpu_to_node(cpu));
1364 if (!bpage)
1365 goto free_pages;
1366 list_add(&bpage->list, &pages);
1367 page = alloc_pages_node(cpu_to_node(cpu),
1368 GFP_KERNEL | __GFP_NORETRY, 0);
1369 if (!page)
1370 goto free_pages;
1371 bpage->page = page_address(page);
1372 rb_init_page(bpage->page);
1373 } 1584 }
1374 }
1375 1585
1376 for_each_buffer_cpu(buffer, cpu) { 1586 get_online_cpus();
1377 cpu_buffer = buffer->buffers[cpu];
1378 rb_insert_pages(cpu_buffer, &pages, new_pages);
1379 }
1380 1587
1381 if (RB_WARN_ON(buffer, !list_empty(&pages))) 1588 if (cpu_online(cpu_id)) {
1382 goto out_fail; 1589 schedule_work_on(cpu_id,
1590 &cpu_buffer->update_pages_work);
1591 wait_for_completion(&cpu_buffer->update_done);
1592 } else
1593 rb_update_pages(cpu_buffer);
1594
1595 cpu_buffer->nr_pages_to_update = 0;
1596 put_online_cpus();
1597 }
1383 1598
1384 out: 1599 out:
1385 buffer->pages = nr_pages; 1600 /*
1386 put_online_cpus(); 1601 * The ring buffer resize can happen with the ring buffer
1602 * enabled, so that the update disturbs the tracing as little
1603 * as possible. But if the buffer is disabled, we do not need
1604 * to worry about that, and we can take the time to verify
1605 * that the buffer is not corrupt.
1606 */
1607 if (atomic_read(&buffer->record_disabled)) {
1608 atomic_inc(&buffer->record_disabled);
1609 /*
1610 * Even though the buffer was disabled, we must make sure
1611 * that it is truly disabled before calling rb_check_pages.
1612 * There could have been a race between checking
 1614 * record_disabled and incrementing it.
1614 */
1615 synchronize_sched();
1616 for_each_buffer_cpu(buffer, cpu) {
1617 cpu_buffer = buffer->buffers[cpu];
1618 rb_check_pages(cpu_buffer);
1619 }
1620 atomic_dec(&buffer->record_disabled);
1621 }
1622
1387 mutex_unlock(&buffer->mutex); 1623 mutex_unlock(&buffer->mutex);
1624 return size;
1388 1625
1389 atomic_dec(&buffer->record_disabled); 1626 out_err:
1627 for_each_buffer_cpu(buffer, cpu) {
1628 struct buffer_page *bpage, *tmp;
1390 1629
1391 return size; 1630 cpu_buffer = buffer->buffers[cpu];
1631 cpu_buffer->nr_pages_to_update = 0;
1392 1632
1393 free_pages: 1633 if (list_empty(&cpu_buffer->new_pages))
1394 list_for_each_entry_safe(bpage, tmp, &pages, list) { 1634 continue;
1395 list_del_init(&bpage->list);
1396 free_buffer_page(bpage);
1397 }
1398 put_online_cpus();
1399 mutex_unlock(&buffer->mutex);
1400 atomic_dec(&buffer->record_disabled);
1401 return -ENOMEM;
1402 1635
1403 /* 1636 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1404 * Something went totally wrong, and we are too paranoid 1637 list) {
1405 * to even clean up the mess. 1638 list_del_init(&bpage->list);
1406 */ 1639 free_buffer_page(bpage);
1407 out_fail: 1640 }
1408 put_online_cpus(); 1641 }
1409 mutex_unlock(&buffer->mutex); 1642 mutex_unlock(&buffer->mutex);
1410 atomic_dec(&buffer->record_disabled); 1643 return err;
1411 return -1;
1412} 1644}
1413EXPORT_SYMBOL_GPL(ring_buffer_resize); 1645EXPORT_SYMBOL_GPL(ring_buffer_resize);
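
Note the ENOMEM strategy in the new resize: for a grow, every new page is staged on cpu_buffer->new_pages before any live buffer is touched, and the out_err path frees the staged pages, so a failed resize leaves every CPU at its old size. A small user-space sketch of this all-or-nothing pattern, with purely illustrative names:

/* Sketch: allocate everything first, commit only if every allocation
 * succeeded, otherwise free the lot and report -ENOMEM. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct page { struct page *next; };

static int grow(struct page **committed, int nr_new)
{
    struct page *staged = NULL, *p;
    int i;

    /* phase 1: stage all new pages on a private list */
    for (i = 0; i < nr_new; i++) {
        p = calloc(1, sizeof(*p));
        if (!p)
            goto free_staged;       /* nothing visible has changed yet */
        p->next = staged;
        staged = p;
    }

    /* phase 2: commit - splice the staged list onto the live one */
    while (staged) {
        p = staged;
        staged = staged->next;
        p->next = *committed;
        *committed = p;
    }
    return 0;

free_staged:
    while (staged) {
        p = staged;
        staged = staged->next;
        free(p);
    }
    return -ENOMEM;
}

int main(void)
{
    struct page *buf = NULL;
    printf("grow: %d\n", grow(&buf, 4));
    return 0;
}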
1414 1646
@@ -1447,21 +1679,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
1447 return __rb_page_index(iter->head_page, iter->head); 1679 return __rb_page_index(iter->head_page, iter->head);
1448} 1680}
1449 1681
1450static inline unsigned long rb_page_write(struct buffer_page *bpage)
1451{
1452 return local_read(&bpage->write) & RB_WRITE_MASK;
1453}
1454
1455static inline unsigned rb_page_commit(struct buffer_page *bpage) 1682static inline unsigned rb_page_commit(struct buffer_page *bpage)
1456{ 1683{
1457 return local_read(&bpage->page->commit); 1684 return local_read(&bpage->page->commit);
1458} 1685}
1459 1686
1460static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1461{
1462 return local_read(&bpage->entries) & RB_WRITE_MASK;
1463}
1464
1465/* Size is determined by what has been committed */ 1687/* Size is determined by what has been committed */
1466static inline unsigned rb_page_size(struct buffer_page *bpage) 1688static inline unsigned rb_page_size(struct buffer_page *bpage)
1467{ 1689{
@@ -1510,7 +1732,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1510 * assign the commit to the tail. 1732 * assign the commit to the tail.
1511 */ 1733 */
1512 again: 1734 again:
1513 max_count = cpu_buffer->buffer->pages * 100; 1735 max_count = cpu_buffer->nr_pages * 100;
1514 1736
1515 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1737 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1516 if (RB_WARN_ON(cpu_buffer, !(--max_count))) 1738 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -3486,6 +3708,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3486 3708
3487 iter->cpu_buffer = cpu_buffer; 3709 iter->cpu_buffer = cpu_buffer;
3488 3710
3711 atomic_inc(&buffer->resize_disabled);
3489 atomic_inc(&cpu_buffer->record_disabled); 3712 atomic_inc(&cpu_buffer->record_disabled);
3490 3713
3491 return iter; 3714 return iter;
@@ -3548,7 +3771,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
3548{ 3771{
3549 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3772 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3550 3773
3774 /*
3775 * Ring buffer is disabled from recording, here's a good place
3776 * to check the integrity of the ring buffer.
3777 */
3778 rb_check_pages(cpu_buffer);
3779
3551 atomic_dec(&cpu_buffer->record_disabled); 3780 atomic_dec(&cpu_buffer->record_disabled);
3781 atomic_dec(&cpu_buffer->buffer->resize_disabled);
3552 kfree(iter); 3782 kfree(iter);
3553} 3783}
3554EXPORT_SYMBOL_GPL(ring_buffer_read_finish); 3784EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3588,9 +3818,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
3588 * ring_buffer_size - return the size of the ring buffer (in bytes) 3818 * ring_buffer_size - return the size of the ring buffer (in bytes)
3589 * @buffer: The ring buffer. 3819 * @buffer: The ring buffer.
3590 */ 3820 */
3591unsigned long ring_buffer_size(struct ring_buffer *buffer) 3821unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
3592{ 3822{
3593 return BUF_PAGE_SIZE * buffer->pages; 3823 /*
3824 * Earlier, this method returned
 3825 * BUF_PAGE_SIZE * buffer->pages
 3826 * Since that buffer-wide field is now removed, we have converted this to
3827 * return the per cpu buffer value.
3828 */
3829 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3830 return 0;
3831
3832 return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
3594} 3833}
3595EXPORT_SYMBOL_GPL(ring_buffer_size); 3834EXPORT_SYMBOL_GPL(ring_buffer_size);
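
With the buffer-wide page count gone, ring_buffer_size() is inherently per CPU; a caller that still wants a whole-buffer figure has to sum the per-CPU values itself. A hedged sketch of such a helper, assuming only the new ring_buffer_size(buffer, cpu) signature introduced by this patch (the helper name is hypothetical):

#include <linux/ring_buffer.h>
#include <linux/cpumask.h>

/* hypothetical helper: total size across online CPUs */
static unsigned long ring_buffer_total_size(struct ring_buffer *buffer)
{
        unsigned long total = 0;
        int cpu;

        for_each_online_cpu(cpu)
                total += ring_buffer_size(buffer, cpu); /* 0 for CPUs outside the buffer's mask */

        return total;
}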
3596 3835
@@ -3611,6 +3850,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3611 cpu_buffer->commit_page = cpu_buffer->head_page; 3850 cpu_buffer->commit_page = cpu_buffer->head_page;
3612 3851
3613 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 3852 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
3853 INIT_LIST_HEAD(&cpu_buffer->new_pages);
3614 local_set(&cpu_buffer->reader_page->write, 0); 3854 local_set(&cpu_buffer->reader_page->write, 0);
3615 local_set(&cpu_buffer->reader_page->entries, 0); 3855 local_set(&cpu_buffer->reader_page->entries, 0);
3616 local_set(&cpu_buffer->reader_page->page->commit, 0); 3856 local_set(&cpu_buffer->reader_page->page->commit, 0);
@@ -3647,8 +3887,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3647 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3887 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3648 return; 3888 return;
3649 3889
3890 atomic_inc(&buffer->resize_disabled);
3650 atomic_inc(&cpu_buffer->record_disabled); 3891 atomic_inc(&cpu_buffer->record_disabled);
3651 3892
3893 /* Make sure all commits have finished */
3894 synchronize_sched();
3895
3652 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3896 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3653 3897
3654 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3898 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
@@ -3664,6 +3908,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3664 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3908 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3665 3909
3666 atomic_dec(&cpu_buffer->record_disabled); 3910 atomic_dec(&cpu_buffer->record_disabled);
3911 atomic_dec(&buffer->resize_disabled);
3667} 3912}
3668EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 3913EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
3669 3914
@@ -3765,8 +4010,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
3765 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 4010 !cpumask_test_cpu(cpu, buffer_b->cpumask))
3766 goto out; 4011 goto out;
3767 4012
4013 cpu_buffer_a = buffer_a->buffers[cpu];
4014 cpu_buffer_b = buffer_b->buffers[cpu];
4015
3768 /* At least make sure the two buffers are somewhat the same */ 4016 /* At least make sure the two buffers are somewhat the same */
3769 if (buffer_a->pages != buffer_b->pages) 4017 if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
3770 goto out; 4018 goto out;
3771 4019
3772 ret = -EAGAIN; 4020 ret = -EAGAIN;
@@ -3780,9 +4028,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
3780 if (atomic_read(&buffer_b->record_disabled)) 4028 if (atomic_read(&buffer_b->record_disabled))
3781 goto out; 4029 goto out;
3782 4030
3783 cpu_buffer_a = buffer_a->buffers[cpu];
3784 cpu_buffer_b = buffer_b->buffers[cpu];
3785
3786 if (atomic_read(&cpu_buffer_a->record_disabled)) 4031 if (atomic_read(&cpu_buffer_a->record_disabled))
3787 goto out; 4032 goto out;
3788 4033
@@ -4071,6 +4316,8 @@ static int rb_cpu_notify(struct notifier_block *self,
4071 struct ring_buffer *buffer = 4316 struct ring_buffer *buffer =
4072 container_of(self, struct ring_buffer, cpu_notify); 4317 container_of(self, struct ring_buffer, cpu_notify);
4073 long cpu = (long)hcpu; 4318 long cpu = (long)hcpu;
4319 int cpu_i, nr_pages_same;
4320 unsigned int nr_pages;
4074 4321
4075 switch (action) { 4322 switch (action) {
4076 case CPU_UP_PREPARE: 4323 case CPU_UP_PREPARE:
@@ -4078,8 +4325,23 @@ static int rb_cpu_notify(struct notifier_block *self,
4078 if (cpumask_test_cpu(cpu, buffer->cpumask)) 4325 if (cpumask_test_cpu(cpu, buffer->cpumask))
4079 return NOTIFY_OK; 4326 return NOTIFY_OK;
4080 4327
4328 nr_pages = 0;
4329 nr_pages_same = 1;
 4330 /* check if all cpu sizes are the same */
4331 for_each_buffer_cpu(buffer, cpu_i) {
4332 /* fill in the size from first enabled cpu */
4333 if (nr_pages == 0)
4334 nr_pages = buffer->buffers[cpu_i]->nr_pages;
4335 if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
4336 nr_pages_same = 0;
4337 break;
4338 }
4339 }
4340 /* allocate minimum pages, user can later expand it */
4341 if (!nr_pages_same)
4342 nr_pages = 2;
4081 buffer->buffers[cpu] = 4343 buffer->buffers[cpu] =
4082 rb_allocate_cpu_buffer(buffer, cpu); 4344 rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
4083 if (!buffer->buffers[cpu]) { 4345 if (!buffer->buffers[cpu]) {
4084 WARN(1, "failed to allocate ring buffer on CPU %ld\n", 4346 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
4085 cpu); 4347 cpu);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2a22255c1010..68032c6177db 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,18 +87,6 @@ static int tracing_disabled = 1;
87 87
88DEFINE_PER_CPU(int, ftrace_cpu_disabled); 88DEFINE_PER_CPU(int, ftrace_cpu_disabled);
89 89
90static inline void ftrace_disable_cpu(void)
91{
92 preempt_disable();
93 __this_cpu_inc(ftrace_cpu_disabled);
94}
95
96static inline void ftrace_enable_cpu(void)
97{
98 __this_cpu_dec(ftrace_cpu_disabled);
99 preempt_enable();
100}
101
102cpumask_var_t __read_mostly tracing_buffer_mask; 90cpumask_var_t __read_mostly tracing_buffer_mask;
103 91
104/* 92/*
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
629static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 617static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
630{ 618{
631 int len; 619 int len;
632 void *ret;
633 620
634 if (s->len <= s->readpos) 621 if (s->len <= s->readpos)
635 return -EBUSY; 622 return -EBUSY;
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
637 len = s->len - s->readpos; 624 len = s->len - s->readpos;
638 if (cnt > len) 625 if (cnt > len)
639 cnt = len; 626 cnt = len;
640 ret = memcpy(buf, s->buffer + s->readpos, cnt); 627 memcpy(buf, s->buffer + s->readpos, cnt);
641 if (!ret)
642 return -EFAULT;
643 628
644 s->readpos += cnt; 629 s->readpos += cnt;
645 return cnt; 630 return cnt;
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
751 736
752 arch_spin_lock(&ftrace_max_lock); 737 arch_spin_lock(&ftrace_max_lock);
753 738
754 ftrace_disable_cpu();
755
756 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 739 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
757 740
758 if (ret == -EBUSY) { 741 if (ret == -EBUSY) {
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
766 "Failed to swap buffers due to commit in progress\n"); 749 "Failed to swap buffers due to commit in progress\n");
767 } 750 }
768 751
769 ftrace_enable_cpu();
770
771 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 752 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
772 753
773 __update_max_tr(tr, tsk, cpu); 754 __update_max_tr(tr, tsk, cpu);
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
782 * Register a new plugin tracer. 763 * Register a new plugin tracer.
783 */ 764 */
784int register_tracer(struct tracer *type) 765int register_tracer(struct tracer *type)
785__releases(kernel_lock)
786__acquires(kernel_lock)
787{ 766{
788 struct tracer *t; 767 struct tracer *t;
789 int ret = 0; 768 int ret = 0;
@@ -841,7 +820,8 @@ __acquires(kernel_lock)
841 820
842 /* If we expanded the buffers, make sure the max is expanded too */ 821 /* If we expanded the buffers, make sure the max is expanded too */
843 if (ring_buffer_expanded && type->use_max_tr) 822 if (ring_buffer_expanded && type->use_max_tr)
844 ring_buffer_resize(max_tr.buffer, trace_buf_size); 823 ring_buffer_resize(max_tr.buffer, trace_buf_size,
824 RING_BUFFER_ALL_CPUS);
845 825
846 /* the test is responsible for initializing and enabling */ 826 /* the test is responsible for initializing and enabling */
847 pr_info("Testing tracer %s: ", type->name); 827 pr_info("Testing tracer %s: ", type->name);
@@ -857,7 +837,8 @@ __acquires(kernel_lock)
857 837
858 /* Shrink the max buffer again */ 838 /* Shrink the max buffer again */
859 if (ring_buffer_expanded && type->use_max_tr) 839 if (ring_buffer_expanded && type->use_max_tr)
860 ring_buffer_resize(max_tr.buffer, 1); 840 ring_buffer_resize(max_tr.buffer, 1,
841 RING_BUFFER_ALL_CPUS);
861 842
862 printk(KERN_CONT "PASSED\n"); 843 printk(KERN_CONT "PASSED\n");
863 } 844 }
@@ -917,13 +898,6 @@ out:
917 mutex_unlock(&trace_types_lock); 898 mutex_unlock(&trace_types_lock);
918} 899}
919 900
920static void __tracing_reset(struct ring_buffer *buffer, int cpu)
921{
922 ftrace_disable_cpu();
923 ring_buffer_reset_cpu(buffer, cpu);
924 ftrace_enable_cpu();
925}
926
927void tracing_reset(struct trace_array *tr, int cpu) 901void tracing_reset(struct trace_array *tr, int cpu)
928{ 902{
929 struct ring_buffer *buffer = tr->buffer; 903 struct ring_buffer *buffer = tr->buffer;
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
932 906
933 /* Make sure all commits have finished */ 907 /* Make sure all commits have finished */
934 synchronize_sched(); 908 synchronize_sched();
935 __tracing_reset(buffer, cpu); 909 ring_buffer_reset_cpu(buffer, cpu);
936 910
937 ring_buffer_record_enable(buffer); 911 ring_buffer_record_enable(buffer);
938} 912}
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
950 tr->time_start = ftrace_now(tr->cpu); 924 tr->time_start = ftrace_now(tr->cpu);
951 925
952 for_each_online_cpu(cpu) 926 for_each_online_cpu(cpu)
953 __tracing_reset(buffer, cpu); 927 ring_buffer_reset_cpu(buffer, cpu);
954 928
955 ring_buffer_record_enable(buffer); 929 ring_buffer_record_enable(buffer);
956} 930}
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1498 1472
1499#endif /* CONFIG_STACKTRACE */ 1473#endif /* CONFIG_STACKTRACE */
1500 1474
1475/* created for use with alloc_percpu */
1476struct trace_buffer_struct {
1477 char buffer[TRACE_BUF_SIZE];
1478};
1479
1480static struct trace_buffer_struct *trace_percpu_buffer;
1481static struct trace_buffer_struct *trace_percpu_sirq_buffer;
1482static struct trace_buffer_struct *trace_percpu_irq_buffer;
1483static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1484
1485/*
1486 * The buffer used is dependent on the context. There is a per cpu
 1487 * buffer for normal context, softirq context, hard irq context and
 1488 * for NMI context. This allows for lockless recording.
 1489 *
 1490 * Note: if the buffers fail to be allocated, this returns NULL
1491 */
1492static char *get_trace_buf(void)
1493{
1494 struct trace_buffer_struct *percpu_buffer;
1495 struct trace_buffer_struct *buffer;
1496
1497 /*
1498 * If we have allocated per cpu buffers, then we do not
1499 * need to do any locking.
1500 */
1501 if (in_nmi())
1502 percpu_buffer = trace_percpu_nmi_buffer;
1503 else if (in_irq())
1504 percpu_buffer = trace_percpu_irq_buffer;
1505 else if (in_softirq())
1506 percpu_buffer = trace_percpu_sirq_buffer;
1507 else
1508 percpu_buffer = trace_percpu_buffer;
1509
1510 if (!percpu_buffer)
1511 return NULL;
1512
1513 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
1514
1515 return buffer->buffer;
1516}
1517
1518static int alloc_percpu_trace_buffer(void)
1519{
1520 struct trace_buffer_struct *buffers;
1521 struct trace_buffer_struct *sirq_buffers;
1522 struct trace_buffer_struct *irq_buffers;
1523 struct trace_buffer_struct *nmi_buffers;
1524
1525 buffers = alloc_percpu(struct trace_buffer_struct);
1526 if (!buffers)
1527 goto err_warn;
1528
1529 sirq_buffers = alloc_percpu(struct trace_buffer_struct);
1530 if (!sirq_buffers)
1531 goto err_sirq;
1532
1533 irq_buffers = alloc_percpu(struct trace_buffer_struct);
1534 if (!irq_buffers)
1535 goto err_irq;
1536
1537 nmi_buffers = alloc_percpu(struct trace_buffer_struct);
1538 if (!nmi_buffers)
1539 goto err_nmi;
1540
1541 trace_percpu_buffer = buffers;
1542 trace_percpu_sirq_buffer = sirq_buffers;
1543 trace_percpu_irq_buffer = irq_buffers;
1544 trace_percpu_nmi_buffer = nmi_buffers;
1545
1546 return 0;
1547
1548 err_nmi:
1549 free_percpu(irq_buffers);
1550 err_irq:
1551 free_percpu(sirq_buffers);
1552 err_sirq:
1553 free_percpu(buffers);
1554 err_warn:
1555 WARN(1, "Could not allocate percpu trace_printk buffer");
1556 return -ENOMEM;
1557}
1558
1559void trace_printk_init_buffers(void)
1560{
1561 static int buffers_allocated;
1562
1563 if (buffers_allocated)
1564 return;
1565
1566 if (alloc_percpu_trace_buffer())
1567 return;
1568
1569 pr_info("ftrace: Allocated trace_printk buffers\n");
1570
1571 buffers_allocated = 1;
1572}
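
get_trace_buf() can be lockless because each context (normal, softirq, irq, NMI) gets its own alloc_percpu() area and the callers run with preemption disabled, so indexing by the current CPU needs no lock. A minimal module-style sketch of the underlying alloc_percpu()/per_cpu_ptr() pattern; the names are invented and this is not part of the patch.

/* Sketch of the alloc_percpu()/per_cpu_ptr() pattern behind get_trace_buf(). */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/smp.h>

struct demo_buf {
        char buffer[128];
};

static struct demo_buf __percpu *demo_bufs;

static int __init demo_init(void)
{
        struct demo_buf *b;
        int cpu;

        demo_bufs = alloc_percpu(struct demo_buf);
        if (!demo_bufs)
                return -ENOMEM;

        cpu = get_cpu();                        /* pin to a CPU ... */
        b = per_cpu_ptr(demo_bufs, cpu);        /* ... and use its private buffer */
        snprintf(b->buffer, sizeof(b->buffer), "hello from cpu %d", cpu);
        put_cpu();

        return 0;
}

static void __exit demo_exit(void)
{
        free_percpu(demo_bufs);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");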
1573
1501/** 1574/**
1502 * trace_vbprintk - write binary msg to tracing buffer 1575 * trace_vbprintk - write binary msg to tracing buffer
1503 * 1576 *
1504 */ 1577 */
1505int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1578int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1506{ 1579{
1507 static arch_spinlock_t trace_buf_lock =
1508 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1509 static u32 trace_buf[TRACE_BUF_SIZE];
1510
1511 struct ftrace_event_call *call = &event_bprint; 1580 struct ftrace_event_call *call = &event_bprint;
1512 struct ring_buffer_event *event; 1581 struct ring_buffer_event *event;
1513 struct ring_buffer *buffer; 1582 struct ring_buffer *buffer;
1514 struct trace_array *tr = &global_trace; 1583 struct trace_array *tr = &global_trace;
1515 struct trace_array_cpu *data;
1516 struct bprint_entry *entry; 1584 struct bprint_entry *entry;
1517 unsigned long flags; 1585 unsigned long flags;
1518 int disable; 1586 char *tbuffer;
1519 int cpu, len = 0, size, pc; 1587 int len = 0, size, pc;
1520 1588
1521 if (unlikely(tracing_selftest_running || tracing_disabled)) 1589 if (unlikely(tracing_selftest_running || tracing_disabled))
1522 return 0; 1590 return 0;
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1526 1594
1527 pc = preempt_count(); 1595 pc = preempt_count();
1528 preempt_disable_notrace(); 1596 preempt_disable_notrace();
1529 cpu = raw_smp_processor_id();
1530 data = tr->data[cpu];
1531 1597
1532 disable = atomic_inc_return(&data->disabled); 1598 tbuffer = get_trace_buf();
1533 if (unlikely(disable != 1)) 1599 if (!tbuffer) {
1600 len = 0;
1534 goto out; 1601 goto out;
1602 }
1535 1603
1536 /* Lockdep uses trace_printk for lock tracing */ 1604 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
1537 local_irq_save(flags);
1538 arch_spin_lock(&trace_buf_lock);
1539 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1540 1605
1541 if (len > TRACE_BUF_SIZE || len < 0) 1606 if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
1542 goto out_unlock; 1607 goto out;
1543 1608
1609 local_save_flags(flags);
1544 size = sizeof(*entry) + sizeof(u32) * len; 1610 size = sizeof(*entry) + sizeof(u32) * len;
1545 buffer = tr->buffer; 1611 buffer = tr->buffer;
1546 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1612 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1547 flags, pc); 1613 flags, pc);
1548 if (!event) 1614 if (!event)
1549 goto out_unlock; 1615 goto out;
1550 entry = ring_buffer_event_data(event); 1616 entry = ring_buffer_event_data(event);
1551 entry->ip = ip; 1617 entry->ip = ip;
1552 entry->fmt = fmt; 1618 entry->fmt = fmt;
1553 1619
1554 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1620 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1555 if (!filter_check_discard(call, entry, buffer, event)) { 1621 if (!filter_check_discard(call, entry, buffer, event)) {
1556 ring_buffer_unlock_commit(buffer, event); 1622 ring_buffer_unlock_commit(buffer, event);
1557 ftrace_trace_stack(buffer, flags, 6, pc); 1623 ftrace_trace_stack(buffer, flags, 6, pc);
1558 } 1624 }
1559 1625
1560out_unlock:
1561 arch_spin_unlock(&trace_buf_lock);
1562 local_irq_restore(flags);
1563
1564out: 1626out:
1565 atomic_dec_return(&data->disabled);
1566 preempt_enable_notrace(); 1627 preempt_enable_notrace();
1567 unpause_graph_tracing(); 1628 unpause_graph_tracing();
1568 1629
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr,
1588int trace_array_vprintk(struct trace_array *tr, 1649int trace_array_vprintk(struct trace_array *tr,
1589 unsigned long ip, const char *fmt, va_list args) 1650 unsigned long ip, const char *fmt, va_list args)
1590{ 1651{
1591 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1592 static char trace_buf[TRACE_BUF_SIZE];
1593
1594 struct ftrace_event_call *call = &event_print; 1652 struct ftrace_event_call *call = &event_print;
1595 struct ring_buffer_event *event; 1653 struct ring_buffer_event *event;
1596 struct ring_buffer *buffer; 1654 struct ring_buffer *buffer;
1597 struct trace_array_cpu *data; 1655 int len = 0, size, pc;
1598 int cpu, len = 0, size, pc;
1599 struct print_entry *entry; 1656 struct print_entry *entry;
1600 unsigned long irq_flags; 1657 unsigned long flags;
1601 int disable; 1658 char *tbuffer;
1602 1659
1603 if (tracing_disabled || tracing_selftest_running) 1660 if (tracing_disabled || tracing_selftest_running)
1604 return 0; 1661 return 0;
1605 1662
1663 /* Don't pollute graph traces with trace_vprintk internals */
1664 pause_graph_tracing();
1665
1606 pc = preempt_count(); 1666 pc = preempt_count();
1607 preempt_disable_notrace(); 1667 preempt_disable_notrace();
1608 cpu = raw_smp_processor_id();
1609 data = tr->data[cpu];
1610 1668
1611 disable = atomic_inc_return(&data->disabled); 1669
1612 if (unlikely(disable != 1)) 1670 tbuffer = get_trace_buf();
1671 if (!tbuffer) {
1672 len = 0;
1613 goto out; 1673 goto out;
1674 }
1614 1675
1615 pause_graph_tracing(); 1676 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
1616 raw_local_irq_save(irq_flags); 1677 if (len > TRACE_BUF_SIZE)
1617 arch_spin_lock(&trace_buf_lock); 1678 goto out;
1618 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1619 1679
1680 local_save_flags(flags);
1620 size = sizeof(*entry) + len + 1; 1681 size = sizeof(*entry) + len + 1;
1621 buffer = tr->buffer; 1682 buffer = tr->buffer;
1622 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1683 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1623 irq_flags, pc); 1684 flags, pc);
1624 if (!event) 1685 if (!event)
1625 goto out_unlock; 1686 goto out;
1626 entry = ring_buffer_event_data(event); 1687 entry = ring_buffer_event_data(event);
1627 entry->ip = ip; 1688 entry->ip = ip;
1628 1689
1629 memcpy(&entry->buf, trace_buf, len); 1690 memcpy(&entry->buf, tbuffer, len);
1630 entry->buf[len] = '\0'; 1691 entry->buf[len] = '\0';
1631 if (!filter_check_discard(call, entry, buffer, event)) { 1692 if (!filter_check_discard(call, entry, buffer, event)) {
1632 ring_buffer_unlock_commit(buffer, event); 1693 ring_buffer_unlock_commit(buffer, event);
1633 ftrace_trace_stack(buffer, irq_flags, 6, pc); 1694 ftrace_trace_stack(buffer, flags, 6, pc);
1634 } 1695 }
1635
1636 out_unlock:
1637 arch_spin_unlock(&trace_buf_lock);
1638 raw_local_irq_restore(irq_flags);
1639 unpause_graph_tracing();
1640 out: 1696 out:
1641 atomic_dec_return(&data->disabled);
1642 preempt_enable_notrace(); 1697 preempt_enable_notrace();
1698 unpause_graph_tracing();
1643 1699
1644 return len; 1700 return len;
1645} 1701}
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1652 1708
1653static void trace_iterator_increment(struct trace_iterator *iter) 1709static void trace_iterator_increment(struct trace_iterator *iter)
1654{ 1710{
1655 /* Don't allow ftrace to trace into the ring buffers */
1656 ftrace_disable_cpu();
1657
1658 iter->idx++; 1711 iter->idx++;
1659 if (iter->buffer_iter[iter->cpu]) 1712 if (iter->buffer_iter[iter->cpu])
1660 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); 1713 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1661
1662 ftrace_enable_cpu();
1663} 1714}
1664 1715
1665static struct trace_entry * 1716static struct trace_entry *
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1669 struct ring_buffer_event *event; 1720 struct ring_buffer_event *event;
1670 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1721 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
1671 1722
1672 /* Don't allow ftrace to trace into the ring buffers */
1673 ftrace_disable_cpu();
1674
1675 if (buf_iter) 1723 if (buf_iter)
1676 event = ring_buffer_iter_peek(buf_iter, ts); 1724 event = ring_buffer_iter_peek(buf_iter, ts);
1677 else 1725 else
1678 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 1726 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1679 lost_events); 1727 lost_events);
1680 1728
1681 ftrace_enable_cpu();
1682
1683 if (event) { 1729 if (event) {
1684 iter->ent_size = ring_buffer_event_length(event); 1730 iter->ent_size = ring_buffer_event_length(event);
1685 return ring_buffer_event_data(event); 1731 return ring_buffer_event_data(event);
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1769 1815
1770static void trace_consume(struct trace_iterator *iter) 1816static void trace_consume(struct trace_iterator *iter)
1771{ 1817{
1772 /* Don't allow ftrace to trace into the ring buffers */
1773 ftrace_disable_cpu();
1774 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 1818 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1775 &iter->lost_events); 1819 &iter->lost_events);
1776 ftrace_enable_cpu();
1777} 1820}
1778 1821
1779static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1822static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1862 iter->cpu = 0; 1905 iter->cpu = 0;
1863 iter->idx = -1; 1906 iter->idx = -1;
1864 1907
1865 ftrace_disable_cpu();
1866
1867 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1908 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1868 for_each_tracing_cpu(cpu) 1909 for_each_tracing_cpu(cpu)
1869 tracing_iter_reset(iter, cpu); 1910 tracing_iter_reset(iter, cpu);
1870 } else 1911 } else
1871 tracing_iter_reset(iter, cpu_file); 1912 tracing_iter_reset(iter, cpu_file);
1872 1913
1873 ftrace_enable_cpu();
1874
1875 iter->leftover = 0; 1914 iter->leftover = 0;
1876 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1915 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1877 ; 1916 ;
@@ -2332,15 +2371,13 @@ static struct trace_iterator *
2332__tracing_open(struct inode *inode, struct file *file) 2371__tracing_open(struct inode *inode, struct file *file)
2333{ 2372{
2334 long cpu_file = (long) inode->i_private; 2373 long cpu_file = (long) inode->i_private;
2335 void *fail_ret = ERR_PTR(-ENOMEM);
2336 struct trace_iterator *iter; 2374 struct trace_iterator *iter;
2337 struct seq_file *m; 2375 int cpu;
2338 int cpu, ret;
2339 2376
2340 if (tracing_disabled) 2377 if (tracing_disabled)
2341 return ERR_PTR(-ENODEV); 2378 return ERR_PTR(-ENODEV);
2342 2379
2343 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2380 iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
2344 if (!iter) 2381 if (!iter)
2345 return ERR_PTR(-ENOMEM); 2382 return ERR_PTR(-ENOMEM);
2346 2383
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file)
2397 tracing_iter_reset(iter, cpu); 2434 tracing_iter_reset(iter, cpu);
2398 } 2435 }
2399 2436
2400 ret = seq_open(file, &tracer_seq_ops);
2401 if (ret < 0) {
2402 fail_ret = ERR_PTR(ret);
2403 goto fail_buffer;
2404 }
2405
2406 m = file->private_data;
2407 m->private = iter;
2408
2409 mutex_unlock(&trace_types_lock); 2437 mutex_unlock(&trace_types_lock);
2410 2438
2411 return iter; 2439 return iter;
2412 2440
2413 fail_buffer:
2414 for_each_tracing_cpu(cpu) {
2415 if (iter->buffer_iter[cpu])
2416 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2417 }
2418 free_cpumask_var(iter->started);
2419 tracing_start();
2420 fail: 2441 fail:
2421 mutex_unlock(&trace_types_lock); 2442 mutex_unlock(&trace_types_lock);
2422 kfree(iter->trace); 2443 kfree(iter->trace);
2423 kfree(iter); 2444 seq_release_private(inode, file);
2424 2445 return ERR_PTR(-ENOMEM);
2425 return fail_ret;
2426} 2446}
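
__tracing_open() now lets __seq_open_private() allocate the iterator together with the seq_file, and tracing_release() pairs it with seq_release_private(), which is why the hand-rolled seq_open()/kzalloc()/kfree() bookkeeping above disappears. A minimal sketch of that pairing, assuming a seq_operations table defined elsewhere; identifiers are illustrative.

/* Sketch of the __seq_open_private()/seq_release_private() pairing. */
#include <linux/fs.h>
#include <linux/seq_file.h>

struct demo_iter {
        int pos;
};

static const struct seq_operations demo_seq_ops;        /* assumed: start/next/stop/show omitted */

static int demo_open(struct inode *inode, struct file *file)
{
        struct demo_iter *iter;

        /* allocates the seq_file plus a zeroed demo_iter in one call;
         * the iterator is also reachable via seq_file->private */
        iter = __seq_open_private(file, &demo_seq_ops, sizeof(*iter));
        if (!iter)
                return -ENOMEM;

        iter->pos = 0;
        return 0;
}

static int demo_release(struct inode *inode, struct file *file)
{
        /* frees both the private iterator and the seq_file state */
        return seq_release_private(inode, file);
}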
2427 2447
2428int tracing_open_generic(struct inode *inode, struct file *filp) 2448int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file)
2458 tracing_start(); 2478 tracing_start();
2459 mutex_unlock(&trace_types_lock); 2479 mutex_unlock(&trace_types_lock);
2460 2480
2461 seq_release(inode, file);
2462 mutex_destroy(&iter->mutex); 2481 mutex_destroy(&iter->mutex);
2463 free_cpumask_var(iter->started); 2482 free_cpumask_var(iter->started);
2464 kfree(iter->trace); 2483 kfree(iter->trace);
2465 kfree(iter); 2484 seq_release_private(inode, file);
2466 return 0; 2485 return 0;
2467} 2486}
2468 2487
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2648 if (cpumask_test_cpu(cpu, tracing_cpumask) && 2667 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2649 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2668 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2650 atomic_inc(&global_trace.data[cpu]->disabled); 2669 atomic_inc(&global_trace.data[cpu]->disabled);
2670 ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
2651 } 2671 }
2652 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 2672 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2653 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2673 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2654 atomic_dec(&global_trace.data[cpu]->disabled); 2674 atomic_dec(&global_trace.data[cpu]->disabled);
2675 ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
2655 } 2676 }
2656 } 2677 }
2657 arch_spin_unlock(&ftrace_max_lock); 2678 arch_spin_unlock(&ftrace_max_lock);
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2974 return t->init(tr); 2995 return t->init(tr);
2975} 2996}
2976 2997
2977static int __tracing_resize_ring_buffer(unsigned long size) 2998static void set_buffer_entries(struct trace_array *tr, unsigned long val)
2999{
3000 int cpu;
3001 for_each_tracing_cpu(cpu)
3002 tr->data[cpu]->entries = val;
3003}
3004
3005static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
2978{ 3006{
2979 int ret; 3007 int ret;
2980 3008
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size)
2985 */ 3013 */
2986 ring_buffer_expanded = 1; 3014 ring_buffer_expanded = 1;
2987 3015
2988 ret = ring_buffer_resize(global_trace.buffer, size); 3016 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
2989 if (ret < 0) 3017 if (ret < 0)
2990 return ret; 3018 return ret;
2991 3019
2992 if (!current_trace->use_max_tr) 3020 if (!current_trace->use_max_tr)
2993 goto out; 3021 goto out;
2994 3022
2995 ret = ring_buffer_resize(max_tr.buffer, size); 3023 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
2996 if (ret < 0) { 3024 if (ret < 0) {
2997 int r; 3025 int r = 0;
3026
3027 if (cpu == RING_BUFFER_ALL_CPUS) {
3028 int i;
3029 for_each_tracing_cpu(i) {
3030 r = ring_buffer_resize(global_trace.buffer,
3031 global_trace.data[i]->entries,
3032 i);
3033 if (r < 0)
3034 break;
3035 }
3036 } else {
3037 r = ring_buffer_resize(global_trace.buffer,
3038 global_trace.data[cpu]->entries,
3039 cpu);
3040 }
2998 3041
2999 r = ring_buffer_resize(global_trace.buffer,
3000 global_trace.entries);
3001 if (r < 0) { 3042 if (r < 0) {
3002 /* 3043 /*
3003 * AARGH! We are left with different 3044 * AARGH! We are left with different
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size)
3019 return ret; 3060 return ret;
3020 } 3061 }
3021 3062
3022 max_tr.entries = size; 3063 if (cpu == RING_BUFFER_ALL_CPUS)
3064 set_buffer_entries(&max_tr, size);
3065 else
3066 max_tr.data[cpu]->entries = size;
3067
3023 out: 3068 out:
3024 global_trace.entries = size; 3069 if (cpu == RING_BUFFER_ALL_CPUS)
3070 set_buffer_entries(&global_trace, size);
3071 else
3072 global_trace.data[cpu]->entries = size;
3025 3073
3026 return ret; 3074 return ret;
3027} 3075}
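
If resizing max_tr fails after global_trace has already been resized, the error path above resizes global_trace back to the per-CPU entries counts it recorded, so the two arrays never end up with different shapes. A small user-space sketch of that resize-with-rollback idea; the capacity field and all names are purely illustrative.

/* Sketch: apply a change to two objects; if the second fails, roll the
 * first back to its recorded value. */
#include <errno.h>
#include <stdio.h>

struct buf {
    long entries;
    long cap;          /* stand-in for "how much this buffer can allocate" */
};

/* stand-in for ring_buffer_resize(): fails when the request exceeds cap */
static int resize(struct buf *b, long entries)
{
    if (entries > b->cap)
        return -ENOMEM;
    b->entries = entries;
    return 0;
}

static int resize_pair(struct buf *trace, struct buf *max, long entries)
{
    long old = trace->entries;
    int ret;

    ret = resize(trace, entries);
    if (ret < 0)
        return ret;

    ret = resize(max, entries);
    if (ret < 0) {
        /* keep the two buffers the same shape: undo the first resize */
        if (resize(trace, old) < 0)
            return -EBUSY;              /* rollback itself failed */
        return ret;
    }
    return 0;
}

int main(void)
{
    struct buf trace = { 4096, 1 << 20 };
    struct buf max   = { 4096, 8192 };

    printf("grow to 8192: %d\n", resize_pair(&trace, &max, 8192));
    printf("grow to 65536: %d (trace stays at %ld)\n",
           resize_pair(&trace, &max, 65536), trace.entries);
    return 0;
}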
3028 3076
3029static ssize_t tracing_resize_ring_buffer(unsigned long size) 3077static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3030{ 3078{
3031 int cpu, ret = size; 3079 int ret = size;
3032 3080
3033 mutex_lock(&trace_types_lock); 3081 mutex_lock(&trace_types_lock);
3034 3082
3035 tracing_stop(); 3083 if (cpu_id != RING_BUFFER_ALL_CPUS) {
3036 3084 /* make sure, this cpu is enabled in the mask */
3037 /* disable all cpu buffers */ 3085 if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
3038 for_each_tracing_cpu(cpu) { 3086 ret = -EINVAL;
3039 if (global_trace.data[cpu]) 3087 goto out;
3040 atomic_inc(&global_trace.data[cpu]->disabled); 3088 }
3041 if (max_tr.data[cpu])
3042 atomic_inc(&max_tr.data[cpu]->disabled);
3043 } 3089 }
3044 3090
3045 if (size != global_trace.entries) 3091 ret = __tracing_resize_ring_buffer(size, cpu_id);
3046 ret = __tracing_resize_ring_buffer(size);
3047
3048 if (ret < 0) 3092 if (ret < 0)
3049 ret = -ENOMEM; 3093 ret = -ENOMEM;
3050 3094
3051 for_each_tracing_cpu(cpu) { 3095out:
3052 if (global_trace.data[cpu])
3053 atomic_dec(&global_trace.data[cpu]->disabled);
3054 if (max_tr.data[cpu])
3055 atomic_dec(&max_tr.data[cpu]->disabled);
3056 }
3057
3058 tracing_start();
3059 mutex_unlock(&trace_types_lock); 3096 mutex_unlock(&trace_types_lock);
3060 3097
3061 return ret; 3098 return ret;
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void)
3078 3115
3079 mutex_lock(&trace_types_lock); 3116 mutex_lock(&trace_types_lock);
3080 if (!ring_buffer_expanded) 3117 if (!ring_buffer_expanded)
3081 ret = __tracing_resize_ring_buffer(trace_buf_size); 3118 ret = __tracing_resize_ring_buffer(trace_buf_size,
3119 RING_BUFFER_ALL_CPUS);
3082 mutex_unlock(&trace_types_lock); 3120 mutex_unlock(&trace_types_lock);
3083 3121
3084 return ret; 3122 return ret;
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf)
3102 mutex_lock(&trace_types_lock); 3140 mutex_lock(&trace_types_lock);
3103 3141
3104 if (!ring_buffer_expanded) { 3142 if (!ring_buffer_expanded) {
3105 ret = __tracing_resize_ring_buffer(trace_buf_size); 3143 ret = __tracing_resize_ring_buffer(trace_buf_size,
3144 RING_BUFFER_ALL_CPUS);
3106 if (ret < 0) 3145 if (ret < 0)
3107 goto out; 3146 goto out;
3108 ret = 0; 3147 ret = 0;
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf)
3128 * The max_tr ring buffer has some state (e.g. ring->clock) and 3167 * The max_tr ring buffer has some state (e.g. ring->clock) and
3129 * we want preserve it. 3168 * we want preserve it.
3130 */ 3169 */
3131 ring_buffer_resize(max_tr.buffer, 1); 3170 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3132 max_tr.entries = 1; 3171 set_buffer_entries(&max_tr, 1);
3133 } 3172 }
3134 destroy_trace_option_files(topts); 3173 destroy_trace_option_files(topts);
3135 3174
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf)
3137 3176
3138 topts = create_trace_option_files(current_trace); 3177 topts = create_trace_option_files(current_trace);
3139 if (current_trace->use_max_tr) { 3178 if (current_trace->use_max_tr) {
3140 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); 3179 int cpu;
3141 if (ret < 0) 3180 /* we need to make per cpu buffer sizes equivalent */
3142 goto out; 3181 for_each_tracing_cpu(cpu) {
3143 max_tr.entries = global_trace.entries; 3182 ret = ring_buffer_resize(max_tr.buffer,
3183 global_trace.data[cpu]->entries,
3184 cpu);
3185 if (ret < 0)
3186 goto out;
3187 max_tr.data[cpu]->entries =
3188 global_trace.data[cpu]->entries;
3189 }
3144 } 3190 }
3145 3191
3146 if (t->init) { 3192 if (t->init) {
@@ -3642,30 +3688,82 @@ out_err:
3642 goto out; 3688 goto out;
3643} 3689}
3644 3690
3691struct ftrace_entries_info {
3692 struct trace_array *tr;
3693 int cpu;
3694};
3695
3696static int tracing_entries_open(struct inode *inode, struct file *filp)
3697{
3698 struct ftrace_entries_info *info;
3699
3700 if (tracing_disabled)
3701 return -ENODEV;
3702
3703 info = kzalloc(sizeof(*info), GFP_KERNEL);
3704 if (!info)
3705 return -ENOMEM;
3706
3707 info->tr = &global_trace;
3708 info->cpu = (unsigned long)inode->i_private;
3709
3710 filp->private_data = info;
3711
3712 return 0;
3713}
3714
3645static ssize_t 3715static ssize_t
3646tracing_entries_read(struct file *filp, char __user *ubuf, 3716tracing_entries_read(struct file *filp, char __user *ubuf,
3647 size_t cnt, loff_t *ppos) 3717 size_t cnt, loff_t *ppos)
3648{ 3718{
3649 struct trace_array *tr = filp->private_data; 3719 struct ftrace_entries_info *info = filp->private_data;
3650 char buf[96]; 3720 struct trace_array *tr = info->tr;
3651 int r; 3721 char buf[64];
3722 int r = 0;
3723 ssize_t ret;
3652 3724
3653 mutex_lock(&trace_types_lock); 3725 mutex_lock(&trace_types_lock);
3654 if (!ring_buffer_expanded) 3726
3655 r = sprintf(buf, "%lu (expanded: %lu)\n", 3727 if (info->cpu == RING_BUFFER_ALL_CPUS) {
3656 tr->entries >> 10, 3728 int cpu, buf_size_same;
3657 trace_buf_size >> 10); 3729 unsigned long size;
3658 else 3730
3659 r = sprintf(buf, "%lu\n", tr->entries >> 10); 3731 size = 0;
3732 buf_size_same = 1;
 3733 /* check if all cpu sizes are the same */
3734 for_each_tracing_cpu(cpu) {
3735 /* fill in the size from first enabled cpu */
3736 if (size == 0)
3737 size = tr->data[cpu]->entries;
3738 if (size != tr->data[cpu]->entries) {
3739 buf_size_same = 0;
3740 break;
3741 }
3742 }
3743
3744 if (buf_size_same) {
3745 if (!ring_buffer_expanded)
3746 r = sprintf(buf, "%lu (expanded: %lu)\n",
3747 size >> 10,
3748 trace_buf_size >> 10);
3749 else
3750 r = sprintf(buf, "%lu\n", size >> 10);
3751 } else
3752 r = sprintf(buf, "X\n");
3753 } else
3754 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
3755
3660 mutex_unlock(&trace_types_lock); 3756 mutex_unlock(&trace_types_lock);
3661 3757
3662 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3758 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3759 return ret;
3663} 3760}
3664 3761
3665static ssize_t 3762static ssize_t
3666tracing_entries_write(struct file *filp, const char __user *ubuf, 3763tracing_entries_write(struct file *filp, const char __user *ubuf,
3667 size_t cnt, loff_t *ppos) 3764 size_t cnt, loff_t *ppos)
3668{ 3765{
3766 struct ftrace_entries_info *info = filp->private_data;
3669 unsigned long val; 3767 unsigned long val;
3670 int ret; 3768 int ret;
3671 3769
@@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3680 /* value is in KB */ 3778 /* value is in KB */
3681 val <<= 10; 3779 val <<= 10;
3682 3780
3683 ret = tracing_resize_ring_buffer(val); 3781 ret = tracing_resize_ring_buffer(val, info->cpu);
3684 if (ret < 0) 3782 if (ret < 0)
3685 return ret; 3783 return ret;
3686 3784
@@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3689 return cnt; 3787 return cnt;
3690} 3788}
3691 3789
3790static int
3791tracing_entries_release(struct inode *inode, struct file *filp)
3792{
3793 struct ftrace_entries_info *info = filp->private_data;
3794
3795 kfree(info);
3796
3797 return 0;
3798}
3799
3692static ssize_t 3800static ssize_t
3693tracing_total_entries_read(struct file *filp, char __user *ubuf, 3801tracing_total_entries_read(struct file *filp, char __user *ubuf,
3694 size_t cnt, loff_t *ppos) 3802 size_t cnt, loff_t *ppos)
@@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3700 3808
3701 mutex_lock(&trace_types_lock); 3809 mutex_lock(&trace_types_lock);
3702 for_each_tracing_cpu(cpu) { 3810 for_each_tracing_cpu(cpu) {
3703 size += tr->entries >> 10; 3811 size += tr->data[cpu]->entries >> 10;
3704 if (!ring_buffer_expanded) 3812 if (!ring_buffer_expanded)
3705 expanded_size += trace_buf_size >> 10; 3813 expanded_size += trace_buf_size >> 10;
3706 } 3814 }
@@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3734 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 3842 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3735 tracing_off(); 3843 tracing_off();
3736 /* resize the ring buffer to 0 */ 3844 /* resize the ring buffer to 0 */
3737 tracing_resize_ring_buffer(0); 3845 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);
3738 3846
3739 return 0; 3847 return 0;
3740} 3848}
@@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3749 struct print_entry *entry; 3857 struct print_entry *entry;
3750 unsigned long irq_flags; 3858 unsigned long irq_flags;
3751 struct page *pages[2]; 3859 struct page *pages[2];
3860 void *map_page[2];
3752 int nr_pages = 1; 3861 int nr_pages = 1;
3753 ssize_t written; 3862 ssize_t written;
3754 void *page1;
3755 void *page2;
3756 int offset; 3863 int offset;
3757 int size; 3864 int size;
3758 int len; 3865 int len;
3759 int ret; 3866 int ret;
3867 int i;
3760 3868
3761 if (tracing_disabled) 3869 if (tracing_disabled)
3762 return -EINVAL; 3870 return -EINVAL;
@@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3795 goto out; 3903 goto out;
3796 } 3904 }
3797 3905
3798 page1 = kmap_atomic(pages[0]); 3906 for (i = 0; i < nr_pages; i++)
3799 if (nr_pages == 2) 3907 map_page[i] = kmap_atomic(pages[i]);
3800 page2 = kmap_atomic(pages[1]);
3801 3908
3802 local_save_flags(irq_flags); 3909 local_save_flags(irq_flags);
3803 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 3910 size = sizeof(*entry) + cnt + 2; /* possible \n added */
@@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3815 3922
3816 if (nr_pages == 2) { 3923 if (nr_pages == 2) {
3817 len = PAGE_SIZE - offset; 3924 len = PAGE_SIZE - offset;
3818 memcpy(&entry->buf, page1 + offset, len); 3925 memcpy(&entry->buf, map_page[0] + offset, len);
3819 memcpy(&entry->buf[len], page2, cnt - len); 3926 memcpy(&entry->buf[len], map_page[1], cnt - len);
3820 } else 3927 } else
3821 memcpy(&entry->buf, page1 + offset, cnt); 3928 memcpy(&entry->buf, map_page[0] + offset, cnt);
3822 3929
3823 if (entry->buf[cnt - 1] != '\n') { 3930 if (entry->buf[cnt - 1] != '\n') {
3824 entry->buf[cnt] = '\n'; 3931 entry->buf[cnt] = '\n';
@@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3833 *fpos += written; 3940 *fpos += written;
3834 3941
3835 out_unlock: 3942 out_unlock:
3836 if (nr_pages == 2) 3943 for (i = 0; i < nr_pages; i++){
3837 kunmap_atomic(page2); 3944 kunmap_atomic(map_page[i]);
3838 kunmap_atomic(page1); 3945 put_page(pages[i]);
3839 while (nr_pages > 0) 3946 }
3840 put_page(pages[--nr_pages]);
3841 out: 3947 out:
3842 return written; 3948 return written;
3843} 3949}
@@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = {
3933}; 4039};
3934 4040
3935static const struct file_operations tracing_entries_fops = { 4041static const struct file_operations tracing_entries_fops = {
3936 .open = tracing_open_generic, 4042 .open = tracing_entries_open,
3937 .read = tracing_entries_read, 4043 .read = tracing_entries_read,
3938 .write = tracing_entries_write, 4044 .write = tracing_entries_write,
4045 .release = tracing_entries_release,
3939 .llseek = generic_file_llseek, 4046 .llseek = generic_file_llseek,
3940}; 4047};
3941 4048
@@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4367 struct dentry *d_cpu; 4474 struct dentry *d_cpu;
4368 char cpu_dir[30]; /* 30 characters should be more than enough */ 4475 char cpu_dir[30]; /* 30 characters should be more than enough */
4369 4476
4477 if (!d_percpu)
4478 return;
4479
4370 snprintf(cpu_dir, 30, "cpu%ld", cpu); 4480 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4371 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4481 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4372 if (!d_cpu) { 4482 if (!d_cpu) {
@@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu)
4387 4497
4388 trace_create_file("stats", 0444, d_cpu, 4498 trace_create_file("stats", 0444, d_cpu,
4389 (void *) cpu, &tracing_stats_fops); 4499 (void *) cpu, &tracing_stats_fops);
4500
4501 trace_create_file("buffer_size_kb", 0444, d_cpu,
4502 (void *) cpu, &tracing_entries_fops);
4390} 4503}
4391 4504
4392#ifdef CONFIG_FTRACE_SELFTEST 4505#ifdef CONFIG_FTRACE_SELFTEST
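
Two things change in tracing_init_debugfs_percpu(): it bails out early if the per_cpu directory could not be created, and every CPU directory now also gets a read-only (0444) buffer_size_kb file. A trivial sketch of the naming loop behind those per-CPU paths (the path layout shown is the usual tracing debugfs layout, for illustration only):

    #include <stdio.h>

    int main(void)
    {
        char cpu_dir[30];   /* same bound the kernel function uses */

        /* One directory per CPU; each now also carries its own buffer_size_kb. */
        for (long cpu = 0; cpu < 4; cpu++) {
            snprintf(cpu_dir, sizeof(cpu_dir), "cpu%ld", cpu);
            printf("per_cpu/%s/buffer_size_kb\n", cpu_dir);
        }
        return 0;
    }
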
@@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void)
4718 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4831 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
4719 4832
4720 trace_create_file("buffer_size_kb", 0644, d_tracer, 4833 trace_create_file("buffer_size_kb", 0644, d_tracer,
4721 &global_trace, &tracing_entries_fops); 4834 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
4722 4835
4723 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 4836 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4724 &global_trace, &tracing_total_entries_fops); 4837 &global_trace, &tracing_total_entries_fops);
@@ -4957,6 +5070,10 @@ __init static int tracer_alloc_buffers(void)
4957 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 5070 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4958 goto out_free_buffer_mask; 5071 goto out_free_buffer_mask;
4959 5072
5073 /* Only allocate trace_printk buffers if a trace_printk exists */
5074 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5075 trace_printk_init_buffers();
5076
4960 /* To save memory, keep the ring buffer size to its minimum */ 5077 /* To save memory, keep the ring buffer size to its minimum */
4961 if (ring_buffer_expanded) 5078 if (ring_buffer_expanded)
4962 ring_buf_size = trace_buf_size; 5079 ring_buf_size = trace_buf_size;
@@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void)
4975 WARN_ON(1); 5092 WARN_ON(1);
4976 goto out_free_cpumask; 5093 goto out_free_cpumask;
4977 } 5094 }
4978 global_trace.entries = ring_buffer_size(global_trace.buffer);
4979 if (global_trace.buffer_disabled) 5095 if (global_trace.buffer_disabled)
4980 tracing_off(); 5096 tracing_off();
4981 5097
@@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void)
4988 ring_buffer_free(global_trace.buffer); 5104 ring_buffer_free(global_trace.buffer);
4989 goto out_free_cpumask; 5105 goto out_free_cpumask;
4990 } 5106 }
4991 max_tr.entries = 1;
4992#endif 5107#endif
4993 5108
4994 /* Allocate the first page for all buffers */ 5109 /* Allocate the first page for all buffers */
@@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void)
4997 max_tr.data[i] = &per_cpu(max_tr_data, i); 5112 max_tr.data[i] = &per_cpu(max_tr_data, i);
4998 } 5113 }
4999 5114
5115 set_buffer_entries(&global_trace,
5116 ring_buffer_size(global_trace.buffer, 0));
5117#ifdef CONFIG_TRACER_MAX_TRACE
5118 set_buffer_entries(&max_tr, 1);
5119#endif
5120
5000 trace_init_cmdlines(); 5121 trace_init_cmdlines();
5001 5122
5002 register_tracer(&nop_trace); 5123 register_tracer(&nop_trace);
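
With global_trace.entries gone, the size is recorded per CPU through the new set_buffer_entries() helper, seeded from ring_buffer_size(). A sketch of what such a helper presumably does, with the structs trimmed down to the relevant field (these are not the kernel definitions):

    #include <stdio.h>

    #define NR_CPUS 4

    struct trace_array_cpu { unsigned long entries; };
    struct trace_array     { struct trace_array_cpu *data[NR_CPUS]; };

    static struct trace_array_cpu cpu_data[NR_CPUS];

    /* Record the same size in every CPU's bookkeeping, as the new helper presumably does. */
    static void set_buffer_entries(struct trace_array *tr, unsigned long val)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            tr->data[cpu]->entries = val;
    }

    int main(void)
    {
        struct trace_array tr;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            tr.data[cpu] = &cpu_data[cpu];

        set_buffer_entries(&tr, 1441792);   /* e.g. the value returned by ring_buffer_size() */
        printf("cpu0 entries: %lu\n", tr.data[0]->entries);
        return 0;
    }
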
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f95d65da6db8..5aec220d2de0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 103 unsigned long ret_ip;
104}; 104};
105 105
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
106/* 111/*
107 * trace_flag_type is an enumeration that holds different 112 * trace_flag_type is an enumeration that holds different
108 * states when a trace occurs. These are: 113 * states when a trace occurs. These are:
@@ -131,6 +136,7 @@ struct trace_array_cpu {
131 atomic_t disabled; 136 atomic_t disabled;
132 void *buffer_page; /* ring buffer spare */ 137 void *buffer_page; /* ring buffer spare */
133 138
139 unsigned long entries;
134 unsigned long saved_latency; 140 unsigned long saved_latency;
135 unsigned long critical_start; 141 unsigned long critical_start;
136 unsigned long critical_end; 142 unsigned long critical_end;
@@ -152,7 +158,6 @@ struct trace_array_cpu {
152 */ 158 */
153struct trace_array { 159struct trace_array {
154 struct ring_buffer *buffer; 160 struct ring_buffer *buffer;
155 unsigned long entries;
156 int cpu; 161 int cpu;
157 int buffer_disabled; 162 int buffer_disabled;
158 cycle_t time_start; 163 cycle_t time_start;
@@ -826,6 +831,8 @@ extern struct list_head ftrace_events;
826extern const char *__start___trace_bprintk_fmt[]; 831extern const char *__start___trace_bprintk_fmt[];
827extern const char *__stop___trace_bprintk_fmt[]; 832extern const char *__stop___trace_bprintk_fmt[];
828 833
834void trace_printk_init_buffers(void);
835
829#undef FTRACE_ENTRY 836#undef FTRACE_ENTRY
830#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 837#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
831 extern struct ftrace_event_call \ 838 extern struct ftrace_event_call \
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 079a93ae8a9d..29111da1d100 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
294 if (!call->name || !call->class || !call->class->reg) 294 if (!call->name || !call->class || !call->class->reg)
295 continue; 295 continue;
296 296
297 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
298 continue;
299
297 if (match && 300 if (match &&
298 strcmp(match, call->name) != 0 && 301 strcmp(match, call->name) != 0 &&
299 strcmp(match, call->class->system) != 0) 302 strcmp(match, call->class->system) != 0)
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1164 return -1; 1167 return -1;
1165 } 1168 }
1166 1169
1167 if (call->class->reg) 1170 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1168 trace_create_file("enable", 0644, call->dir, call, 1171 trace_create_file("enable", 0644, call->dir, call,
1169 enable); 1172 enable);
1170 1173
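
Events flagged TRACE_EVENT_FL_IGNORE_ENABLE (the ftrace-internal events tagged in trace_export.c below) are now skipped both when wildcard enables walk the event list and when their "enable" file would be created. A small sketch of that flag test (the bit value used here is illustrative, not the kernel's):

    #include <stdio.h>

    #define TRACE_EVENT_FL_IGNORE_ENABLE (1 << 2)   /* illustrative bit, not the real value */

    struct event { const char *name; unsigned int flags; };

    int main(void)
    {
        struct event events[] = {
            { "sched_switch", 0 },
            { "function",     TRACE_EVENT_FL_IGNORE_ENABLE },  /* ftrace-internal event */
        };

        for (unsigned int i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
            if (events[i].flags & TRACE_EVENT_FL_IGNORE_ENABLE)
                continue;          /* no "enable" file; skipped by __ftrace_set_clr_event() */
            printf("would create %s/enable\n", events[i].name);
        }
        return 0;
    }
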
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 3dd15e8bc856..e039906b037d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 180 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 181 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 182 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
183}; \ 184}; \
184struct ftrace_event_call __used \ 185struct ftrace_event_call __used \
185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 580a05ec926b..b31d3d5699fe 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,547 +19,15 @@
19 19
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <asm/bitsperlong.h>
35
36#include "trace.h"
37#include "trace_output.h"
38
39#define MAX_TRACE_ARGS 128
40#define MAX_ARGSTR_LEN 63
41#define MAX_EVENT_NAME_LEN 64
42#define MAX_STRING_SIZE PATH_MAX
43#define KPROBE_EVENT_SYSTEM "kprobes"
44
45/* Reserved field names */
46#define FIELD_STRING_IP "__probe_ip"
47#define FIELD_STRING_RETIP "__probe_ret_ip"
48#define FIELD_STRING_FUNC "__probe_func"
49
50const char *reserved_field_names[] = {
51 "common_type",
52 "common_flags",
53 "common_preempt_count",
54 "common_pid",
55 "common_tgid",
56 FIELD_STRING_IP,
57 FIELD_STRING_RETIP,
58 FIELD_STRING_FUNC,
59};
60
61/* Printing function type */
62typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
63 void *);
64#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
65#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
66
67/* Printing in basic type function template */
68#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
69static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
70 const char *name, \
71 void *data, void *ent)\
72{ \
73 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
74} \
75static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
76
77DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
78DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
80DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
82DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
83DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
84DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
85
86/* data_rloc: data relative location, compatible with u32 */
87#define make_data_rloc(len, roffs) \
88 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
89#define get_rloc_len(dl) ((u32)(dl) >> 16)
90#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
91
92static inline void *get_rloc_data(u32 *dl)
93{
94 return (u8 *)dl + get_rloc_offs(*dl);
95}
96
97/* For data_loc conversion */
98static inline void *get_loc_data(u32 *dl, void *ent)
99{
100 return (u8 *)ent + get_rloc_offs(*dl);
101}
102
103/*
104 * Convert data_rloc to data_loc:
105 * data_rloc stores the offset from data_rloc itself, but data_loc
106 * stores the offset from event entry.
107 */
108#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
109
110/* For defining macros, define string/string_size types */
111typedef u32 string;
112typedef u32 string_size;
113
114/* Print type function for string type */
115static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
116 const char *name,
117 void *data, void *ent)
118{
119 int len = *(u32 *)data >> 16;
120
121 if (!len)
122 return trace_seq_printf(s, " %s=(fault)", name);
123 else
124 return trace_seq_printf(s, " %s=\"%s\"", name,
125 (const char *)get_loc_data(data, ent));
126}
127static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
128
129/* Data fetch function type */
130typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
131
132struct fetch_param {
133 fetch_func_t fn;
134 void *data;
135};
136
137static __kprobes void call_fetch(struct fetch_param *fprm,
138 struct pt_regs *regs, void *dest)
139{
140 return fprm->fn(regs, fprm->data, dest);
141}
142
143#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
144/*
145 * Define macro for basic types - we don't need to define s* types, because
146 * we have to care only about bitwidth at recording time.
147 */
148#define DEFINE_BASIC_FETCH_FUNCS(method) \
149DEFINE_FETCH_##method(u8) \
150DEFINE_FETCH_##method(u16) \
151DEFINE_FETCH_##method(u32) \
152DEFINE_FETCH_##method(u64)
153
154#define CHECK_FETCH_FUNCS(method, fn) \
155 (((FETCH_FUNC_NAME(method, u8) == fn) || \
156 (FETCH_FUNC_NAME(method, u16) == fn) || \
157 (FETCH_FUNC_NAME(method, u32) == fn) || \
158 (FETCH_FUNC_NAME(method, u64) == fn) || \
159 (FETCH_FUNC_NAME(method, string) == fn) || \
160 (FETCH_FUNC_NAME(method, string_size) == fn)) \
161 && (fn != NULL))
162
163/* Data fetch function templates */
164#define DEFINE_FETCH_reg(type) \
165static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
166 void *offset, void *dest) \
167{ \
168 *(type *)dest = (type)regs_get_register(regs, \
169 (unsigned int)((unsigned long)offset)); \
170}
171DEFINE_BASIC_FETCH_FUNCS(reg)
172/* No string on the register */
173#define fetch_reg_string NULL
174#define fetch_reg_string_size NULL
175
176#define DEFINE_FETCH_stack(type) \
177static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
178 void *offset, void *dest) \
179{ \
180 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
181 (unsigned int)((unsigned long)offset)); \
182}
183DEFINE_BASIC_FETCH_FUNCS(stack)
184/* No string on the stack entry */
185#define fetch_stack_string NULL
186#define fetch_stack_string_size NULL
187
188#define DEFINE_FETCH_retval(type) \
189static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
190 void *dummy, void *dest) \
191{ \
192 *(type *)dest = (type)regs_return_value(regs); \
193}
194DEFINE_BASIC_FETCH_FUNCS(retval)
195/* No string on the retval */
196#define fetch_retval_string NULL
197#define fetch_retval_string_size NULL
198
199#define DEFINE_FETCH_memory(type) \
200static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
201 void *addr, void *dest) \
202{ \
203 type retval; \
204 if (probe_kernel_address(addr, retval)) \
205 *(type *)dest = 0; \
206 else \
207 *(type *)dest = retval; \
208}
209DEFINE_BASIC_FETCH_FUNCS(memory)
210/*
211 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
212 * length and relative data location.
213 */
214static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
215 void *addr, void *dest)
216{
217 long ret;
218 int maxlen = get_rloc_len(*(u32 *)dest);
219 u8 *dst = get_rloc_data(dest);
220 u8 *src = addr;
221 mm_segment_t old_fs = get_fs();
222 if (!maxlen)
223 return;
224 /*
225 * Try to get string again, since the string can be changed while
226 * probing.
227 */
228 set_fs(KERNEL_DS);
229 pagefault_disable();
230 do
231 ret = __copy_from_user_inatomic(dst++, src++, 1);
232 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
233 dst[-1] = '\0';
234 pagefault_enable();
235 set_fs(old_fs);
236
237 if (ret < 0) { /* Failed to fetch string */
238 ((u8 *)get_rloc_data(dest))[0] = '\0';
239 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
240 } else
241 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
242 get_rloc_offs(*(u32 *)dest));
243}
244/* Return the length of the string -- including the terminating null byte */
245static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
246 void *addr, void *dest)
247{
248 int ret, len = 0;
249 u8 c;
250 mm_segment_t old_fs = get_fs();
251
252 set_fs(KERNEL_DS);
253 pagefault_disable();
254 do {
255 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
256 len++;
257 } while (c && ret == 0 && len < MAX_STRING_SIZE);
258 pagefault_enable();
259 set_fs(old_fs);
260
261 if (ret < 0) /* Failed to check the length */
262 *(u32 *)dest = 0;
263 else
264 *(u32 *)dest = len;
265}
266
267/* Memory fetching by symbol */
268struct symbol_cache {
269 char *symbol;
270 long offset;
271 unsigned long addr;
272};
273
274static unsigned long update_symbol_cache(struct symbol_cache *sc)
275{
276 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
277 if (sc->addr)
278 sc->addr += sc->offset;
279 return sc->addr;
280}
281
282static void free_symbol_cache(struct symbol_cache *sc)
283{
284 kfree(sc->symbol);
285 kfree(sc);
286}
287
288static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
289{
290 struct symbol_cache *sc;
291
292 if (!sym || strlen(sym) == 0)
293 return NULL;
294 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
295 if (!sc)
296 return NULL;
297
298 sc->symbol = kstrdup(sym, GFP_KERNEL);
299 if (!sc->symbol) {
300 kfree(sc);
301 return NULL;
302 }
303 sc->offset = offset;
304 22
305 update_symbol_cache(sc); 23#include "trace_probe.h"
306 return sc;
307}
308
309#define DEFINE_FETCH_symbol(type) \
310static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
311 void *data, void *dest) \
312{ \
313 struct symbol_cache *sc = data; \
314 if (sc->addr) \
315 fetch_memory_##type(regs, (void *)sc->addr, dest); \
316 else \
317 *(type *)dest = 0; \
318}
319DEFINE_BASIC_FETCH_FUNCS(symbol)
320DEFINE_FETCH_symbol(string)
321DEFINE_FETCH_symbol(string_size)
322
323/* Dereference memory access function */
324struct deref_fetch_param {
325 struct fetch_param orig;
326 long offset;
327};
328
329#define DEFINE_FETCH_deref(type) \
330static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
331 void *data, void *dest) \
332{ \
333 struct deref_fetch_param *dprm = data; \
334 unsigned long addr; \
335 call_fetch(&dprm->orig, regs, &addr); \
336 if (addr) { \
337 addr += dprm->offset; \
338 fetch_memory_##type(regs, (void *)addr, dest); \
339 } else \
340 *(type *)dest = 0; \
341}
342DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size)
345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
355{
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 free_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 free_symbol_cache(data->orig.data);
360 kfree(data);
361}
362
363/* Bitfield fetch function */
364struct bitfield_fetch_param {
365 struct fetch_param orig;
366 unsigned char hi_shift;
367 unsigned char low_shift;
368};
369 24
370#define DEFINE_FETCH_bitfield(type) \ 25#define KPROBE_EVENT_SYSTEM "kprobes"
371static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
372 void *data, void *dest) \
373{ \
374 struct bitfield_fetch_param *bprm = data; \
375 type buf = 0; \
376 call_fetch(&bprm->orig, regs, &buf); \
377 if (buf) { \
378 buf <<= bprm->hi_shift; \
379 buf >>= bprm->low_shift; \
380 } \
381 *(type *)dest = buf; \
382}
383DEFINE_BASIC_FETCH_FUNCS(bitfield)
384#define fetch_bitfield_string NULL
385#define fetch_bitfield_string_size NULL
386
387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
402{
403 /*
404 * Don't check the bitfield itself, because this must be the
405 * last fetch function.
406 */
407 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
408 free_deref_fetch_param(data->orig.data);
409 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
410 free_symbol_cache(data->orig.data);
411 kfree(data);
412}
413
414/* Default (unsigned long) fetch type */
415#define __DEFAULT_FETCH_TYPE(t) u##t
416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
417#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
418#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
419
420/* Fetch types */
421enum {
422 FETCH_MTD_reg = 0,
423 FETCH_MTD_stack,
424 FETCH_MTD_retval,
425 FETCH_MTD_memory,
426 FETCH_MTD_symbol,
427 FETCH_MTD_deref,
428 FETCH_MTD_bitfield,
429 FETCH_MTD_END,
430};
431
432#define ASSIGN_FETCH_FUNC(method, type) \
433 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
434
435#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
436 {.name = _name, \
437 .size = _size, \
438 .is_signed = sign, \
439 .print = PRINT_TYPE_FUNC_NAME(ptype), \
440 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
441 .fmttype = _fmttype, \
442 .fetch = { \
443ASSIGN_FETCH_FUNC(reg, ftype), \
444ASSIGN_FETCH_FUNC(stack, ftype), \
445ASSIGN_FETCH_FUNC(retval, ftype), \
446ASSIGN_FETCH_FUNC(memory, ftype), \
447ASSIGN_FETCH_FUNC(symbol, ftype), \
448ASSIGN_FETCH_FUNC(deref, ftype), \
449ASSIGN_FETCH_FUNC(bitfield, ftype), \
450 } \
451 }
452
453#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
454 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
455
456#define FETCH_TYPE_STRING 0
457#define FETCH_TYPE_STRSIZE 1
458
459/* Fetch type information table */
460static const struct fetch_type {
461 const char *name; /* Name of type */
462 size_t size; /* Byte size of type */
463 int is_signed; /* Signed flag */
464 print_type_func_t print; /* Print functions */
465	const char		*fmt;	/* Format string */
466 const char *fmttype; /* Name in format file */
467 /* Fetch functions */
468 fetch_func_t fetch[FETCH_MTD_END];
469} fetch_type_table[] = {
470 /* Special types */
471 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
472 sizeof(u32), 1, "__data_loc char[]"),
473 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
474 string_size, sizeof(u32), 0, "u32"),
475 /* Basic types */
476 ASSIGN_FETCH_TYPE(u8, u8, 0),
477 ASSIGN_FETCH_TYPE(u16, u16, 0),
478 ASSIGN_FETCH_TYPE(u32, u32, 0),
479 ASSIGN_FETCH_TYPE(u64, u64, 0),
480 ASSIGN_FETCH_TYPE(s8, u8, 1),
481 ASSIGN_FETCH_TYPE(s16, u16, 1),
482 ASSIGN_FETCH_TYPE(s32, u32, 1),
483 ASSIGN_FETCH_TYPE(s64, u64, 1),
484};
485
486static const struct fetch_type *find_fetch_type(const char *type)
487{
488 int i;
489
490 if (!type)
491 type = DEFAULT_FETCH_TYPE_STR;
492
493 /* Special case: bitfield */
494 if (*type == 'b') {
495 unsigned long bs;
496 type = strchr(type, '/');
497 if (!type)
498 goto fail;
499 type++;
500 if (strict_strtoul(type, 0, &bs))
501 goto fail;
502 switch (bs) {
503 case 8:
504 return find_fetch_type("u8");
505 case 16:
506 return find_fetch_type("u16");
507 case 32:
508 return find_fetch_type("u32");
509 case 64:
510 return find_fetch_type("u64");
511 default:
512 goto fail;
513 }
514 }
515
516 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
517 if (strcmp(type, fetch_type_table[i].name) == 0)
518 return &fetch_type_table[i];
519fail:
520 return NULL;
521}
522
523/* Special function : only accept unsigned long */
524static __kprobes void fetch_stack_address(struct pt_regs *regs,
525 void *dummy, void *dest)
526{
527 *(unsigned long *)dest = kernel_stack_pointer(regs);
528}
529
530static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
531 fetch_func_t orig_fn)
532{
533 int i;
534
535 if (type != &fetch_type_table[FETCH_TYPE_STRING])
536 return NULL; /* Only string type needs size function */
537 for (i = 0; i < FETCH_MTD_END; i++)
538 if (type->fetch[i] == orig_fn)
539 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
540
541 WARN_ON(1); /* This should not happen */
542 return NULL;
543}
544 26
545/** 27/**
546 * Kprobe event core functions 28 * Kprobe event core functions
547 */ 29 */
548 30
549struct probe_arg {
550 struct fetch_param fetch;
551 struct fetch_param fetch_size;
552 unsigned int offset; /* Offset from argument entry */
553 const char *name; /* Name of this argument */
554 const char *comm; /* Command of this argument */
555 const struct fetch_type *type; /* Type of this argument */
556};
557
558/* Flags for trace_probe */
559#define TP_FLAG_TRACE 1
560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
562
563struct trace_probe { 31struct trace_probe {
564 struct list_head list; 32 struct list_head list;
565 struct kretprobe rp; /* Use rp.kp for kprobe use */ 33 struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
631static int kretprobe_dispatcher(struct kretprobe_instance *ri, 99static int kretprobe_dispatcher(struct kretprobe_instance *ri,
632 struct pt_regs *regs); 100 struct pt_regs *regs);
633 101
634/* Check the name is good for event/group/fields */
635static int is_good_name(const char *name)
636{
637 if (!isalpha(*name) && *name != '_')
638 return 0;
639 while (*++name != '\0') {
640 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
641 return 0;
642 }
643 return 1;
644}
645
646/* 102/*
647 * Allocate new trace_probe and initialize it (including kprobes). 103 * Allocate new trace_probe and initialize it (including kprobes).
648 */ 104 */
@@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
651 void *addr, 107 void *addr,
652 const char *symbol, 108 const char *symbol,
653 unsigned long offs, 109 unsigned long offs,
654 int nargs, int is_return) 110 int nargs, bool is_return)
655{ 111{
656 struct trace_probe *tp; 112 struct trace_probe *tp;
657 int ret = -ENOMEM; 113 int ret = -ENOMEM;
@@ -702,34 +158,12 @@ error:
702 return ERR_PTR(ret); 158 return ERR_PTR(ret);
703} 159}
704 160
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
715static void free_probe_arg(struct probe_arg *arg)
716{
717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
718 free_bitfield_fetch_param(arg->fetch.data);
719 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
720 free_deref_fetch_param(arg->fetch.data);
721 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
722 free_symbol_cache(arg->fetch.data);
723 kfree(arg->name);
724 kfree(arg->comm);
725}
726
727static void free_trace_probe(struct trace_probe *tp) 161static void free_trace_probe(struct trace_probe *tp)
728{ 162{
729 int i; 163 int i;
730 164
731 for (i = 0; i < tp->nr_args; i++) 165 for (i = 0; i < tp->nr_args; i++)
732 free_probe_arg(&tp->args[i]); 166 traceprobe_free_probe_arg(&tp->args[i]);
733 167
734 kfree(tp->call.class->system); 168 kfree(tp->call.class->system);
735 kfree(tp->call.name); 169 kfree(tp->call.name);
@@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp)
787 return -EINVAL; 221 return -EINVAL;
788 222
789 for (i = 0; i < tp->nr_args; i++) 223 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]); 224 traceprobe_update_arg(&tp->args[i]);
791 225
792 /* Set/clear disabled flag according to tp->flag */ 226 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp)) 227 if (trace_probe_is_enabled(tp))
@@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = {
919 .priority = 1 /* Invoked after kprobe module callback */ 353 .priority = 1 /* Invoked after kprobe module callback */
920}; 354};
921 355
922/* Split symbol and offset. */
923static int split_symbol_offset(char *symbol, unsigned long *offset)
924{
925 char *tmp;
926 int ret;
927
928 if (!offset)
929 return -EINVAL;
930
931 tmp = strchr(symbol, '+');
932 if (tmp) {
933 /* skip sign because strict_strtol doesn't accept '+' */
934 ret = strict_strtoul(tmp + 1, 0, offset);
935 if (ret)
936 return ret;
937 *tmp = '\0';
938 } else
939 *offset = 0;
940 return 0;
941}
942
943#define PARAM_MAX_ARGS 16
944#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
945
946static int parse_probe_vars(char *arg, const struct fetch_type *t,
947 struct fetch_param *f, int is_return)
948{
949 int ret = 0;
950 unsigned long param;
951
952 if (strcmp(arg, "retval") == 0) {
953 if (is_return)
954 f->fn = t->fetch[FETCH_MTD_retval];
955 else
956 ret = -EINVAL;
957 } else if (strncmp(arg, "stack", 5) == 0) {
958 if (arg[5] == '\0') {
959 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
960 f->fn = fetch_stack_address;
961 else
962 ret = -EINVAL;
963 } else if (isdigit(arg[5])) {
964 ret = strict_strtoul(arg + 5, 10, &param);
965 if (ret || param > PARAM_MAX_STACK)
966 ret = -EINVAL;
967 else {
968 f->fn = t->fetch[FETCH_MTD_stack];
969 f->data = (void *)param;
970 }
971 } else
972 ret = -EINVAL;
973 } else
974 ret = -EINVAL;
975 return ret;
976}
977
978/* Recursive argument parser */
979static int __parse_probe_arg(char *arg, const struct fetch_type *t,
980 struct fetch_param *f, int is_return)
981{
982 int ret = 0;
983 unsigned long param;
984 long offset;
985 char *tmp;
986
987 switch (arg[0]) {
988 case '$':
989 ret = parse_probe_vars(arg + 1, t, f, is_return);
990 break;
991 case '%': /* named register */
992 ret = regs_query_register_offset(arg + 1);
993 if (ret >= 0) {
994 f->fn = t->fetch[FETCH_MTD_reg];
995 f->data = (void *)(unsigned long)ret;
996 ret = 0;
997 }
998 break;
999 case '@': /* memory or symbol */
1000 if (isdigit(arg[1])) {
1001 ret = strict_strtoul(arg + 1, 0, &param);
1002 if (ret)
1003 break;
1004 f->fn = t->fetch[FETCH_MTD_memory];
1005 f->data = (void *)param;
1006 } else {
1007 ret = split_symbol_offset(arg + 1, &offset);
1008 if (ret)
1009 break;
1010 f->data = alloc_symbol_cache(arg + 1, offset);
1011 if (f->data)
1012 f->fn = t->fetch[FETCH_MTD_symbol];
1013 }
1014 break;
1015 case '+': /* deref memory */
1016 arg++; /* Skip '+', because strict_strtol() rejects it. */
1017 case '-':
1018 tmp = strchr(arg, '(');
1019 if (!tmp)
1020 break;
1021 *tmp = '\0';
1022 ret = strict_strtol(arg, 0, &offset);
1023 if (ret)
1024 break;
1025 arg = tmp + 1;
1026 tmp = strrchr(arg, ')');
1027 if (tmp) {
1028 struct deref_fetch_param *dprm;
1029 const struct fetch_type *t2 = find_fetch_type(NULL);
1030 *tmp = '\0';
1031 dprm = kzalloc(sizeof(struct deref_fetch_param),
1032 GFP_KERNEL);
1033 if (!dprm)
1034 return -ENOMEM;
1035 dprm->offset = offset;
1036 ret = __parse_probe_arg(arg, t2, &dprm->orig,
1037 is_return);
1038 if (ret)
1039 kfree(dprm);
1040 else {
1041 f->fn = t->fetch[FETCH_MTD_deref];
1042 f->data = (void *)dprm;
1043 }
1044 }
1045 break;
1046 }
1047 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
1048 pr_info("%s type has no corresponding fetch method.\n",
1049 t->name);
1050 ret = -EINVAL;
1051 }
1052 return ret;
1053}
1054
1055#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
1056
1057/* Bitfield type needs to be parsed into a fetch function */
1058static int __parse_bitfield_probe_arg(const char *bf,
1059 const struct fetch_type *t,
1060 struct fetch_param *f)
1061{
1062 struct bitfield_fetch_param *bprm;
1063 unsigned long bw, bo;
1064 char *tail;
1065
1066 if (*bf != 'b')
1067 return 0;
1068
1069 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1070 if (!bprm)
1071 return -ENOMEM;
1072 bprm->orig = *f;
1073 f->fn = t->fetch[FETCH_MTD_bitfield];
1074 f->data = (void *)bprm;
1075
1076 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
1077 if (bw == 0 || *tail != '@')
1078 return -EINVAL;
1079
1080 bf = tail + 1;
1081 bo = simple_strtoul(bf, &tail, 0);
1082 if (tail == bf || *tail != '/')
1083 return -EINVAL;
1084
1085 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
1086 bprm->low_shift = bprm->hi_shift + bo;
1087 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
1088}
1089
1090/* String length checking wrapper */
1091static int parse_probe_arg(char *arg, struct trace_probe *tp,
1092 struct probe_arg *parg, int is_return)
1093{
1094 const char *t;
1095 int ret;
1096
1097 if (strlen(arg) > MAX_ARGSTR_LEN) {
1098 pr_info("Argument is too long.: %s\n", arg);
1099 return -ENOSPC;
1100 }
1101 parg->comm = kstrdup(arg, GFP_KERNEL);
1102 if (!parg->comm) {
1103 pr_info("Failed to allocate memory for command '%s'.\n", arg);
1104 return -ENOMEM;
1105 }
1106 t = strchr(parg->comm, ':');
1107 if (t) {
1108 arg[t - parg->comm] = '\0';
1109 t++;
1110 }
1111 parg->type = find_fetch_type(t);
1112 if (!parg->type) {
1113 pr_info("Unsupported type: %s\n", t);
1114 return -EINVAL;
1115 }
1116 parg->offset = tp->size;
1117 tp->size += parg->type->size;
1118 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
1119 if (ret >= 0 && t != NULL)
1120 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
1121 if (ret >= 0) {
1122 parg->fetch_size.fn = get_fetch_size_function(parg->type,
1123 parg->fetch.fn);
1124 parg->fetch_size.data = parg->fetch.data;
1125 }
1126 return ret;
1127}
1128
1129/* Return 1 if name is reserved or already used by another argument */
1130static int conflict_field_name(const char *name,
1131 struct probe_arg *args, int narg)
1132{
1133 int i;
1134 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
1135 if (strcmp(reserved_field_names[i], name) == 0)
1136 return 1;
1137 for (i = 0; i < narg; i++)
1138 if (strcmp(args[i].name, name) == 0)
1139 return 1;
1140 return 0;
1141}
1142
1143static int create_trace_probe(int argc, char **argv) 356static int create_trace_probe(int argc, char **argv)
1144{ 357{
1145 /* 358 /*
@@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv)
1162 */ 375 */
1163 struct trace_probe *tp; 376 struct trace_probe *tp;
1164 int i, ret = 0; 377 int i, ret = 0;
1165 int is_return = 0, is_delete = 0; 378 bool is_return = false, is_delete = false;
1166 char *symbol = NULL, *event = NULL, *group = NULL; 379 char *symbol = NULL, *event = NULL, *group = NULL;
1167 char *arg; 380 char *arg;
1168 unsigned long offset = 0; 381 unsigned long offset = 0;
@@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv)
1171 384
1172 /* argc must be >= 1 */ 385 /* argc must be >= 1 */
1173 if (argv[0][0] == 'p') 386 if (argv[0][0] == 'p')
1174 is_return = 0; 387 is_return = false;
1175 else if (argv[0][0] == 'r') 388 else if (argv[0][0] == 'r')
1176 is_return = 1; 389 is_return = true;
1177 else if (argv[0][0] == '-') 390 else if (argv[0][0] == '-')
1178 is_delete = 1; 391 is_delete = true;
1179 else { 392 else {
1180 pr_info("Probe definition must be started with 'p', 'r' or" 393 pr_info("Probe definition must be started with 'p', 'r' or"
1181 " '-'.\n"); 394 " '-'.\n");
@@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv)
1240 /* a symbol specified */ 453 /* a symbol specified */
1241 symbol = argv[1]; 454 symbol = argv[1];
1242 /* TODO: support .init module functions */ 455 /* TODO: support .init module functions */
1243 ret = split_symbol_offset(symbol, &offset); 456 ret = traceprobe_split_symbol_offset(symbol, &offset);
1244 if (ret) { 457 if (ret) {
1245 pr_info("Failed to parse symbol.\n"); 458 pr_info("Failed to parse symbol.\n");
1246 return ret; 459 return ret;
@@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv)
1302 goto error; 515 goto error;
1303 } 516 }
1304 517
1305 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 518 if (traceprobe_conflict_field_name(tp->args[i].name,
519 tp->args, i)) {
1306 pr_info("Argument[%d] name '%s' conflicts with " 520 pr_info("Argument[%d] name '%s' conflicts with "
1307 "another field.\n", i, argv[i]); 521 "another field.\n", i, argv[i]);
1308 ret = -EINVAL; 522 ret = -EINVAL;
@@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv)
1310 } 524 }
1311 525
1312 /* Parse fetch argument */ 526 /* Parse fetch argument */
1313 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 527 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i],
528 is_return, true);
1314 if (ret) { 529 if (ret) {
1315 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 530 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
1316 goto error; 531 goto error;
@@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file)
1412 return seq_open(file, &probes_seq_op); 627 return seq_open(file, &probes_seq_op);
1413} 628}
1414 629
1415static int command_trace_probe(const char *buf)
1416{
1417 char **argv;
1418 int argc = 0, ret = 0;
1419
1420 argv = argv_split(GFP_KERNEL, buf, &argc);
1421 if (!argv)
1422 return -ENOMEM;
1423
1424 if (argc)
1425 ret = create_trace_probe(argc, argv);
1426
1427 argv_free(argv);
1428 return ret;
1429}
1430
1431#define WRITE_BUFSIZE 4096
1432
1433static ssize_t probes_write(struct file *file, const char __user *buffer, 630static ssize_t probes_write(struct file *file, const char __user *buffer,
1434 size_t count, loff_t *ppos) 631 size_t count, loff_t *ppos)
1435{ 632{
1436 char *kbuf, *tmp; 633 return traceprobe_probes_write(file, buffer, count, ppos,
1437 int ret; 634 create_trace_probe);
1438 size_t done;
1439 size_t size;
1440
1441 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
1442 if (!kbuf)
1443 return -ENOMEM;
1444
1445 ret = done = 0;
1446 while (done < count) {
1447 size = count - done;
1448 if (size >= WRITE_BUFSIZE)
1449 size = WRITE_BUFSIZE - 1;
1450 if (copy_from_user(kbuf, buffer + done, size)) {
1451 ret = -EFAULT;
1452 goto out;
1453 }
1454 kbuf[size] = '\0';
1455 tmp = strchr(kbuf, '\n');
1456 if (tmp) {
1457 *tmp = '\0';
1458 size = tmp - kbuf + 1;
1459 } else if (done + size < count) {
1460 pr_warning("Line length is too long: "
1461 "Should be less than %d.", WRITE_BUFSIZE);
1462 ret = -EINVAL;
1463 goto out;
1464 }
1465 done += size;
1466 /* Remove comments */
1467 tmp = strchr(kbuf, '#');
1468 if (tmp)
1469 *tmp = '\0';
1470
1471 ret = command_trace_probe(kbuf);
1472 if (ret)
1473 goto out;
1474 }
1475 ret = done;
1476out:
1477 kfree(kbuf);
1478 return ret;
1479} 635}
1480 636
1481static const struct file_operations kprobe_events_ops = { 637static const struct file_operations kprobe_events_ops = {
@@ -1711,16 +867,6 @@ partial:
1711 return TRACE_TYPE_PARTIAL_LINE; 867 return TRACE_TYPE_PARTIAL_LINE;
1712} 868}
1713 869
1714#undef DEFINE_FIELD
1715#define DEFINE_FIELD(type, item, name, is_signed) \
1716 do { \
1717 ret = trace_define_field(event_call, #type, name, \
1718 offsetof(typeof(field), item), \
1719 sizeof(field.item), is_signed, \
1720 FILTER_OTHER); \
1721 if (ret) \
1722 return ret; \
1723 } while (0)
1724 870
1725static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 871static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1726{ 872{
@@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void)
2051 1197
2052 pr_info("Testing kprobe tracing: "); 1198 pr_info("Testing kprobe tracing: ");
2053 1199
2054 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1200 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
2055 "$stack $stack0 +0($stack)"); 1201 "$stack $stack0 +0($stack)",
1202 create_trace_probe);
2056 if (WARN_ON_ONCE(ret)) { 1203 if (WARN_ON_ONCE(ret)) {
2057 pr_warning("error on probing function entry.\n"); 1204 pr_warning("error on probing function entry.\n");
2058 warn++; 1205 warn++;
@@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void)
2066 enable_trace_probe(tp, TP_FLAG_TRACE); 1213 enable_trace_probe(tp, TP_FLAG_TRACE);
2067 } 1214 }
2068 1215
2069 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1216 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
2070 "$retval"); 1217 "$retval", create_trace_probe);
2071 if (WARN_ON_ONCE(ret)) { 1218 if (WARN_ON_ONCE(ret)) {
2072 pr_warning("error on probing function return.\n"); 1219 pr_warning("error on probing function return.\n");
2073 warn++; 1220 warn++;
@@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void)
2101 } else 1248 } else
2102 disable_trace_probe(tp, TP_FLAG_TRACE); 1249 disable_trace_probe(tp, TP_FLAG_TRACE);
2103 1250
2104 ret = command_trace_probe("-:testprobe"); 1251 ret = traceprobe_command("-:testprobe", create_trace_probe);
2105 if (WARN_ON_ONCE(ret)) { 1252 if (WARN_ON_ONCE(ret)) {
2106 pr_warning("error on deleting a probe.\n"); 1253 pr_warning("error on deleting a probe.\n");
2107 warn++; 1254 warn++;
2108 } 1255 }
2109 1256
2110 ret = command_trace_probe("-:testprobe2"); 1257 ret = traceprobe_command("-:testprobe2", create_trace_probe);
2111 if (WARN_ON_ONCE(ret)) { 1258 if (WARN_ON_ONCE(ret)) {
2112 pr_warning("error on deleting a probe.\n"); 1259 pr_warning("error on deleting a probe.\n");
2113 warn++; 1260 warn++;
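
The bulk of trace_kprobe.c's parsing code moves into the new trace_probe.c, and the file-write path becomes a thin wrapper: traceprobe_probes_write() and traceprobe_command() do the buffering and tokenizing, then call back into create_trace_probe() for each command line. A userspace sketch of that callback-parameterized command runner (the helper names below are mine, not the kernel's):

    #include <stdio.h>
    #include <string.h>

    /* Shared helper: split one command into argv[] and hand it to createfn,
     * the way traceprobe_command() lets kprobe and uprobe events reuse one parser. */
    typedef int (*createfn_t)(int argc, char **argv);

    static int run_command(char *buf, createfn_t createfn)
    {
        char *argv[16];
        int argc = 0;

        for (char *tok = strtok(buf, " "); tok && argc < 16; tok = strtok(NULL, " "))
            argv[argc++] = tok;

        return argc ? createfn(argc, argv) : 0;
    }

    static int create_probe(int argc, char **argv)
    {
        printf("probe definition '%s' with %d args\n", argv[0], argc - 1);
        return 0;
    }

    int main(void)
    {
        char cmd[] = "p:testprobe kprobe_trace_selftest_target $stack $stack0";

        return run_command(cmd, create_probe);
    }
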
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 6fd4ffd042f9..a9077c1b4ad3 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
51 const char **iter; 51 const char **iter;
52 char *fmt; 52 char *fmt;
53 53
54 /* allocate the trace_printk per cpu buffers */
55 if (start != end)
56 trace_printk_init_buffers();
57
54 mutex_lock(&btrace_mutex); 58 mutex_lock(&btrace_mutex);
55 for (iter = start; iter < end; iter++) { 59 for (iter = start; iter < end; iter++) {
56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); 60 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
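
Both this hook and tracer_alloc_buffers() above call trace_printk_init_buffers() only when the format section is non-empty, so the per-CPU trace_printk buffers are allocated only if some trace_printk() call actually exists. A sketch of that guard plus an idempotent init, assuming (as the two call sites suggest) that the real init is safe to call more than once:

    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-ins for the __start/__stop___trace_bprintk_fmt section bounds. */
    static const char *fmt_section[] = { "fmt: %d\n" };
    #define section_start fmt_section
    #define section_stop  (fmt_section + sizeof(fmt_section) / sizeof(fmt_section[0]))

    static bool buffers_ready;

    static void trace_printk_init_buffers(void)
    {
        if (buffers_ready)
            return;                 /* idempotent: both call paths may reach here */
        buffers_ready = true;
        printf("allocated per-cpu trace_printk buffers\n");
    }

    int main(void)
    {
        /* Only pay for the buffers when at least one trace_printk() format exists. */
        if (section_stop != section_start)
            trace_printk_init_buffers();
        return 0;
    }
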
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
new file mode 100644
index 000000000000..daa9980153af
--- /dev/null
+++ b/kernel/trace/trace_probe.c
@@ -0,0 +1,839 @@
1/*
2 * Common code for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.c written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include "trace_probe.h"
26
27const char *reserved_field_names[] = {
28 "common_type",
29 "common_flags",
30 "common_preempt_count",
31 "common_pid",
32 "common_tgid",
33 FIELD_STRING_IP,
34 FIELD_STRING_RETIP,
35 FIELD_STRING_FUNC,
36};
37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \
46 void *data, void *ent)\
47{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
49} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65
66/* For data_loc conversion */
67static inline void *get_loc_data(u32 *dl, void *ent)
68{
69 return (u8 *)ent + get_rloc_offs(*dl);
70}
71
72/* For defining macros, define string/string_size types */
73typedef u32 string;
74typedef u32 string_size;
75
76/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name,
79 void *data, void *ent)
80{
81 int len = *(u32 *)data >> 16;
82
83 if (!len)
84 return trace_seq_printf(s, " %s=(fault)", name);
85 else
86 return trace_seq_printf(s, " %s=\"%s\"", name,
87 (const char *)get_loc_data(data, ent));
88}
89
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
94 * Define macro for basic types - we don't need to define s* types, because
95 * we have to care only about bitwidth at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102
103#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \
105 (FETCH_FUNC_NAME(method, u16) == fn) || \
106 (FETCH_FUNC_NAME(method, u32) == fn) || \
107 (FETCH_FUNC_NAME(method, u64) == fn) || \
108 (FETCH_FUNC_NAME(method, string) == fn) || \
109 (FETCH_FUNC_NAME(method, string_size) == fn)) \
110 && (fn != NULL))
111
112/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \
116{ \
117 *(type *)dest = (type)regs_get_register(regs, \
118 (unsigned int)((unsigned long)offset)); \
119}
120DEFINE_BASIC_FETCH_FUNCS(reg)
121/* No string on the register */
122#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL
124
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
139 void *dummy, void *dest) \
140{ \
141 *(type *)dest = (type)regs_return_value(regs); \
142}
143DEFINE_BASIC_FETCH_FUNCS(retval)
144/* No string on the retval */
145#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL
147
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
176 * Try to get string again, since the string can be changed while
177 * probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
199/* Return the length of the string -- including the terminating null byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
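
The string_size fetch simply walks the string a byte at a time and reports its length including the terminating NUL, capped at MAX_STRING_SIZE. Stripped of the pagefault and segment handling, the loop reduces to this standalone sketch (the cap value is arbitrary here):

    #include <stdio.h>

    #define MAX_STRING_SIZE 64   /* stand-in for the kernel's PATH_MAX-based cap */

    /* Length including the terminating NUL, capped, like the string_size fetch. */
    static unsigned int string_size(const char *addr)
    {
        unsigned int len = 0;
        char c;

        do {
            c = addr[len];
            len++;
        } while (c && len < MAX_STRING_SIZE);

        return len;
    }

    int main(void)
    {
        printf("%u\n", string_size("hello"));   /* prints 6: five characters plus the NUL */
        return 0;
    }
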
224
225/* Memory fetching by symbol */
226struct symbol_cache {
227 char *symbol;
228 long offset;
229 unsigned long addr;
230};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
242static void free_symbol_cache(struct symbol_cache *sc)
243{
244 kfree(sc->symbol);
245 kfree(sc);
246}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */
285struct deref_fetch_param {
286 struct fetch_param orig;
287 long offset;
288};
289
290#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
292 void *data, void *dest) \
293{ \
294 struct deref_fetch_param *dprm = data; \
295 unsigned long addr; \
296 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \
298 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \
300 } else \
301 *(type *)dest = 0; \
302}
303DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size)
306
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{
309 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
310 update_deref_fetch_param(data->orig.data);
311 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
312 update_symbol_cache(data->orig.data);
313}
314
315static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
316{
317 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
318 free_deref_fetch_param(data->orig.data);
319 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
320 free_symbol_cache(data->orig.data);
321 kfree(data);
322}
323
324/* Bitfield fetch function */
325struct bitfield_fetch_param {
326 struct fetch_param orig;
327 unsigned char hi_shift;
328 unsigned char low_shift;
329};
330
331#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
333 void *data, void *dest) \
334{ \
335 struct bitfield_fetch_param *bprm = data; \
336 type buf = 0; \
337 call_fetch(&bprm->orig, regs, &buf); \
338 if (buf) { \
339 buf <<= bprm->hi_shift; \
340 buf >>= bprm->low_shift; \
341 } \
342 *(type *)dest = buf; \
343}
344
345DEFINE_BASIC_FETCH_FUNCS(bitfield)
346#define fetch_bitfield_string NULL
347#define fetch_bitfield_string_size NULL
348
349static __kprobes void
350update_bitfield_fetch_param(struct bitfield_fetch_param *data)
351{
352 /*
353 * Don't check the bitfield itself, because this must be the
354 * last fetch function.
355 */
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 update_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 update_symbol_cache(data->orig.data);
360}
361
362static __kprobes void
363free_bitfield_fetch_param(struct bitfield_fetch_param *data)
364{
365 /*
366 * Don't check the bitfield itself, because this must be the
367 * last fetch function.
368 */
369 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
370 free_deref_fetch_param(data->orig.data);
371 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
372 free_symbol_cache(data->orig.data);
373
374 kfree(data);
375}
376
377/* Default (unsigned long) fetch type */
378#define __DEFAULT_FETCH_TYPE(t) u##t
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{
430 int i;
431
432 if (!type)
433 type = DEFAULT_FETCH_TYPE_STR;
434
435 /* Special case: bitfield */
436 if (*type == 'b') {
437 unsigned long bs;
438
439 type = strchr(type, '/');
440 if (!type)
441 goto fail;
442
443 type++;
444 if (strict_strtoul(type, 0, &bs))
445 goto fail;
446
447 switch (bs) {
448 case 8:
449 return find_fetch_type("u8");
450 case 16:
451 return find_fetch_type("u16");
452 case 32:
453 return find_fetch_type("u32");
454 case 64:
455 return find_fetch_type("u64");
456 default:
457 goto fail;
458 }
459 }
460
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
462 if (strcmp(type, fetch_type_table[i].name) == 0)
463 return &fetch_type_table[i];
464
465fail:
466 return NULL;
467}
468
469/* Special function : only accept unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest)
472{
473 *(unsigned long *)dest = kernel_stack_pointer(regs);
474}
475
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn)
478{
479 int i;
480
481 if (type != &fetch_type_table[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */
483
484 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
487
488 WARN_ON(1); /* This should not happen */
489
490 return NULL;
491}
492
493/* Split symbol and offset. */
494int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
495{
496 char *tmp;
497 int ret;
498
499 if (!offset)
500 return -EINVAL;
501
502 tmp = strchr(symbol, '+');
503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset);
506 if (ret)
507 return ret;
508
509 *tmp = '\0';
510 } else
511 *offset = 0;
512
513 return 0;
514}
515
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517
518static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return)
520{
521 int ret = 0;
522 unsigned long param;
523
524 if (strcmp(arg, "retval") == 0) {
525 if (is_return)
526 f->fn = t->fetch[FETCH_MTD_retval];
527 else
528 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
532 f->fn = fetch_stack_address;
533 else
534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL;
539 else {
540 f->fn = t->fetch[FETCH_MTD_stack];
541 f->data = (void *)param;
542 }
543 } else
544 ret = -EINVAL;
545 } else
546 ret = -EINVAL;
547
548 return ret;
549}
550
551/* Recursive argument parser */
552static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe)
554{
555 unsigned long param;
556 long offset;
557 char *tmp;
558 int ret;
559
560 ret = 0;
561
562 /* For now, uprobe_events supports only register arguments */
563 if (!is_kprobe && arg[0] != '%')
564 return -EINVAL;
565
566 switch (arg[0]) {
567 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return);
569 break;
570
571 case '%': /* named register */
572 ret = regs_query_register_offset(arg + 1);
573 if (ret >= 0) {
574 f->fn = t->fetch[FETCH_MTD_reg];
575 f->data = (void *)(unsigned long)ret;
576 ret = 0;
577 }
578 break;
579
580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param);
583 if (ret)
584 break;
585
586 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param;
588 } else {
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret)
591 break;
592
593 f->data = alloc_symbol_cache(arg + 1, offset);
594 if (f->data)
595 f->fn = t->fetch[FETCH_MTD_symbol];
596 }
597 break;
598
599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */
601 case '-':
602 tmp = strchr(arg, '(');
603 if (!tmp)
604 break;
605
606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset);
608
609 if (ret)
610 break;
611
612 arg = tmp + 1;
613 tmp = strrchr(arg, ')');
614
615 if (tmp) {
616 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2;
618
619 t2 = find_fetch_type(NULL);
620 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622
623 if (!dprm)
624 return -ENOMEM;
625
626 dprm->offset = offset;
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe);
629 if (ret)
630 kfree(dprm);
631 else {
632 f->fn = t->fetch[FETCH_MTD_deref];
633 f->data = (void *)dprm;
634 }
635 }
636 break;
637 }
638 if (!ret && !f->fn) { /* Parsed, but no fetch method was found */
639 pr_info("%s type has no corresponding fetch method.\n", t->name);
640 ret = -EINVAL;
641 }
642
643 return ret;
644}
645
646#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
647
648/* Bitfield type needs to be parsed into a fetch function */
649static int __parse_bitfield_probe_arg(const char *bf,
650 const struct fetch_type *t,
651 struct fetch_param *f)
652{
653 struct bitfield_fetch_param *bprm;
654 unsigned long bw, bo;
655 char *tail;
656
657 if (*bf != 'b')
658 return 0;
659
660 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
661 if (!bprm)
662 return -ENOMEM;
663
664 bprm->orig = *f;
665 f->fn = t->fetch[FETCH_MTD_bitfield];
666 f->data = (void *)bprm;
667 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
668
669 if (bw == 0 || *tail != '@')
670 return -EINVAL;
671
672 bf = tail + 1;
673 bo = simple_strtoul(bf, &tail, 0);
674
675 if (tail == bf || *tail != '/')
676 return -EINVAL;
677
678 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
679 bprm->low_shift = bprm->hi_shift + bo;
680
681 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
682}
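/*
 * Editorial worked example: for "b4@4/32" on a 4-byte type, bw = 4 and
 * bo = 4, so hi_shift = 32 - (4 + 4) = 24 and low_shift = 24 + 4 = 28.
 * The bitfield fetch function is then expected to compute
 * (val << 24) >> 28, which isolates bits 4..7 of the fetched value.
 */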
683
684/* String length checking wrapper */
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{
688 const char *t;
689 int ret;
690
691 if (strlen(arg) > MAX_ARGSTR_LEN) {
692 pr_info("Argument is too long: %s\n", arg);
693 return -ENOSPC;
694 }
695 parg->comm = kstrdup(arg, GFP_KERNEL);
696 if (!parg->comm) {
697 pr_info("Failed to allocate memory for command '%s'.\n", arg);
698 return -ENOMEM;
699 }
700 t = strchr(parg->comm, ':');
701 if (t) {
702 arg[t - parg->comm] = '\0';
703 t++;
704 }
705 parg->type = find_fetch_type(t);
706 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL;
709 }
710 parg->offset = *size;
711 *size += parg->type->size;
712 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
713
714 if (ret >= 0 && t != NULL)
715 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
716
717 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn);
720 parg->fetch_size.data = parg->fetch.data;
721 }
722
723 return ret;
724}
725
726/* Return 1 if name is reserved or already used by another argument */
727int traceprobe_conflict_field_name(const char *name,
728 struct probe_arg *args, int narg)
729{
730 int i;
731
732 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
733 if (strcmp(reserved_field_names[i], name) == 0)
734 return 1;
735
736 for (i = 0; i < narg; i++)
737 if (strcmp(args[i].name, name) == 0)
738 return 1;
739
740 return 0;
741}
742
743void traceprobe_update_arg(struct probe_arg *arg)
744{
745 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
746 update_bitfield_fetch_param(arg->fetch.data);
747 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
748 update_deref_fetch_param(arg->fetch.data);
749 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
750 update_symbol_cache(arg->fetch.data);
751}
752
753void traceprobe_free_probe_arg(struct probe_arg *arg)
754{
755 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
756 free_bitfield_fetch_param(arg->fetch.data);
757 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
758 free_deref_fetch_param(arg->fetch.data);
759 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
760 free_symbol_cache(arg->fetch.data);
761
762 kfree(arg->name);
763 kfree(arg->comm);
764}
765
766int traceprobe_command(const char *buf, int (*createfn)(int, char **))
767{
768 char **argv;
769 int argc, ret;
770
771 argc = 0;
772 ret = 0;
773 argv = argv_split(GFP_KERNEL, buf, &argc);
774 if (!argv)
775 return -ENOMEM;
776
777 if (argc)
778 ret = createfn(argc, argv);
779
780 argv_free(argv);
781
782 return ret;
783}
784
785#define WRITE_BUFSIZE 4096
786
787ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
788 size_t count, loff_t *ppos,
789 int (*createfn)(int, char **))
790{
791 char *kbuf, *tmp;
792 int ret = 0;
793 size_t done = 0;
794 size_t size;
795
796 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
797 if (!kbuf)
798 return -ENOMEM;
799
800 while (done < count) {
801 size = count - done;
802
803 if (size >= WRITE_BUFSIZE)
804 size = WRITE_BUFSIZE - 1;
805
806 if (copy_from_user(kbuf, buffer + done, size)) {
807 ret = -EFAULT;
808 goto out;
809 }
810 kbuf[size] = '\0';
811 tmp = strchr(kbuf, '\n');
812
813 if (tmp) {
814 *tmp = '\0';
815 size = tmp - kbuf + 1;
816 } else if (done + size < count) {
817 pr_warning("Line is too long: "
818 "should be less than %d.", WRITE_BUFSIZE);
819 ret = -EINVAL;
820 goto out;
821 }
822 done += size;
823 /* Remove comments */
824 tmp = strchr(kbuf, '#');
825
826 if (tmp)
827 *tmp = '\0';
828
829 ret = traceprobe_command(kbuf, createfn);
830 if (ret)
831 goto out;
832 }
833 ret = done;
834
835out:
836 kfree(kbuf);
837
838 return ret;
839}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
new file mode 100644
index 000000000000..933708677814
--- /dev/null
+++ b/kernel/trace/trace_probe.h
@@ -0,0 +1,161 @@
1/*
2 * Common header file for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.h written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include <linux/seq_file.h>
26#include <linux/slab.h>
27#include <linux/smp.h>
28#include <linux/debugfs.h>
29#include <linux/types.h>
30#include <linux/string.h>
31#include <linux/ctype.h>
32#include <linux/ptrace.h>
33#include <linux/perf_event.h>
34#include <linux/kprobes.h>
35#include <linux/stringify.h>
36#include <linux/limits.h>
37#include <linux/uaccess.h>
38#include <asm/bitsperlong.h>
39
40#include "trace.h"
41#include "trace_output.h"
42
43#define MAX_TRACE_ARGS 128
44#define MAX_ARGSTR_LEN 63
45#define MAX_EVENT_NAME_LEN 64
46#define MAX_STRING_SIZE PATH_MAX
47
48/* Reserved field names */
49#define FIELD_STRING_IP "__probe_ip"
50#define FIELD_STRING_RETIP "__probe_ret_ip"
51#define FIELD_STRING_FUNC "__probe_func"
52
53#undef DEFINE_FIELD
54#define DEFINE_FIELD(type, item, name, is_signed) \
55 do { \
56 ret = trace_define_field(event_call, #type, name, \
57 offsetof(typeof(field), item), \
58 sizeof(field.item), is_signed, \
59 FILTER_OTHER); \
60 if (ret) \
61 return ret; \
62 } while (0)
63
64
65/* Flags for trace_probe */
66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70
71
72/* data_rloc: data relative location, compatible with u32 */
73#define make_data_rloc(len, roffs) \
74 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
75#define get_rloc_len(dl) ((u32)(dl) >> 16)
76#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
77
78/*
79 * Convert data_rloc to data_loc:
80 * data_rloc stores the offset from data_rloc itself, but data_loc
81 * stores the offset from event entry.
82 */
83#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
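/*
 * Editorial worked example: make_data_rloc(5, 12) packs a 5-byte string
 * located 12 bytes past the rloc word into 0x0005000c; get_rloc_len()
 * recovers 5 and get_rloc_offs() recovers 12.
 */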
84
85/* Data fetch function type */
86typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
87/* Printing function type */
88typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
89
90/* Fetch types */
91enum {
92 FETCH_MTD_reg = 0,
93 FETCH_MTD_stack,
94 FETCH_MTD_retval,
95 FETCH_MTD_memory,
96 FETCH_MTD_symbol,
97 FETCH_MTD_deref,
98 FETCH_MTD_bitfield,
99 FETCH_MTD_END,
100};
101
102/* Fetch type information table */
103struct fetch_type {
104 const char *name; /* Name of type */
105 size_t size; /* Byte size of type */
106 int is_signed; /* Signed flag */
107 print_type_func_t print; /* Print functions */
108 const char *fmt; /* Format string */
109 const char *fmttype; /* Name in format file */
110 /* Fetch functions */
111 fetch_func_t fetch[FETCH_MTD_END];
112};
113
114struct fetch_param {
115 fetch_func_t fn;
116 void *data;
117};
118
119struct probe_arg {
120 struct fetch_param fetch;
121 struct fetch_param fetch_size;
122 unsigned int offset; /* Offset from argument entry */
123 const char *name; /* Name of this argument */
124 const char *comm; /* Command of this argument */
125 const struct fetch_type *type; /* Type of this argument */
126};
127
128static inline __kprobes void call_fetch(struct fetch_param *fprm,
129 struct pt_regs *regs, void *dest)
130{
131 return fprm->fn(regs, fprm->data, dest);
132}
133
134/* Check whether the name is usable as an event/group/field name */
135static inline int is_good_name(const char *name)
136{
137 if (!isalpha(*name) && *name != '_')
138 return 0;
139 while (*++name != '\0') {
140 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
141 return 0;
142 }
143 return 1;
144}
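/*
 * Editorial example: "retval", "arg_1" and "_ip" pass this check, while
 * "1st", "my-arg" and "a.b" are rejected, because only letters, digits
 * and '_' are allowed and the first character must not be a digit.
 */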
145
146extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
147 struct probe_arg *parg, bool is_return, bool is_kprobe);
148
149extern int traceprobe_conflict_field_name(const char *name,
150 struct probe_arg *args, int narg);
151
152extern void traceprobe_update_arg(struct probe_arg *arg);
153extern void traceprobe_free_probe_arg(struct probe_arg *arg);
154
155extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
156
157extern ssize_t traceprobe_probes_write(struct file *file,
158 const char __user *buffer, size_t count, loff_t *ppos,
159 int (*createfn)(int, char**));
160
161extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
new file mode 100644
index 000000000000..2b36ac68549e
--- /dev/null
+++ b/kernel/trace/trace_uprobe.c
@@ -0,0 +1,788 @@
1/*
2 * uprobes-based tracing events
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * Copyright (C) IBM Corporation, 2010-2012
18 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
19 */
20
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/uprobes.h>
24#include <linux/namei.h>
25
26#include "trace_probe.h"
27
28#define UPROBE_EVENT_SYSTEM "uprobes"
29
30/*
31 * uprobe event core functions
32 */
33struct trace_uprobe;
34struct uprobe_trace_consumer {
35 struct uprobe_consumer cons;
36 struct trace_uprobe *tu;
37};
38
39struct trace_uprobe {
40 struct list_head list;
41 struct ftrace_event_class class;
42 struct ftrace_event_call call;
43 struct uprobe_trace_consumer *consumer;
44 struct inode *inode;
45 char *filename;
46 unsigned long offset;
47 unsigned long nhit;
48 unsigned int flags; /* For TP_FLAG_* */
49 ssize_t size; /* trace entry size */
50 unsigned int nr_args;
51 struct probe_arg args[];
52};
53
54#define SIZEOF_TRACE_UPROBE(n) \
55 (offsetof(struct trace_uprobe, args) + \
56 (sizeof(struct probe_arg) * (n)))
57
58static int register_uprobe_event(struct trace_uprobe *tu);
59static void unregister_uprobe_event(struct trace_uprobe *tu);
60
61static DEFINE_MUTEX(uprobe_lock);
62static LIST_HEAD(uprobe_list);
63
64static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
65
66/*
67 * Allocate new trace_uprobe and initialize it (including uprobes).
68 */
69static struct trace_uprobe *
70alloc_trace_uprobe(const char *group, const char *event, int nargs)
71{
72 struct trace_uprobe *tu;
73
74 if (!event || !is_good_name(event))
75 return ERR_PTR(-EINVAL);
76
77 if (!group || !is_good_name(group))
78 return ERR_PTR(-EINVAL);
79
80 tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
81 if (!tu)
82 return ERR_PTR(-ENOMEM);
83
84 tu->call.class = &tu->class;
85 tu->call.name = kstrdup(event, GFP_KERNEL);
86 if (!tu->call.name)
87 goto error;
88
89 tu->class.system = kstrdup(group, GFP_KERNEL);
90 if (!tu->class.system)
91 goto error;
92
93 INIT_LIST_HEAD(&tu->list);
94 return tu;
95
96error:
97 kfree(tu->call.name);
98 kfree(tu);
99
100 return ERR_PTR(-ENOMEM);
101}
102
103static void free_trace_uprobe(struct trace_uprobe *tu)
104{
105 int i;
106
107 for (i = 0; i < tu->nr_args; i++)
108 traceprobe_free_probe_arg(&tu->args[i]);
109
110 iput(tu->inode);
111 kfree(tu->call.class->system);
112 kfree(tu->call.name);
113 kfree(tu->filename);
114 kfree(tu);
115}
116
117static struct trace_uprobe *find_probe_event(const char *event, const char *group)
118{
119 struct trace_uprobe *tu;
120
121 list_for_each_entry(tu, &uprobe_list, list)
122 if (strcmp(tu->call.name, event) == 0 &&
123 strcmp(tu->call.class->system, group) == 0)
124 return tu;
125
126 return NULL;
127}
128
129/* Unregister a trace_uprobe and probe_event: must be called with uprobe_lock held */
130static void unregister_trace_uprobe(struct trace_uprobe *tu)
131{
132 list_del(&tu->list);
133 unregister_uprobe_event(tu);
134 free_trace_uprobe(tu);
135}
136
137/* Register a trace_uprobe and probe_event */
138static int register_trace_uprobe(struct trace_uprobe *tu)
139{
140 struct trace_uprobe *old_tp;
141 int ret;
142
143 mutex_lock(&uprobe_lock);
144
145 /* register as an event */
146 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
147 if (old_tp)
148 /* delete old event */
149 unregister_trace_uprobe(old_tp);
150
151 ret = register_uprobe_event(tu);
152 if (ret) {
153 pr_warning("Failed to register probe event(%d)\n", ret);
154 goto end;
155 }
156
157 list_add_tail(&tu->list, &uprobe_list);
158
159end:
160 mutex_unlock(&uprobe_lock);
161
162 return ret;
163}
164
165/*
166 * Argument syntax:
167 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
168 *
169 * - Remove uprobe: -:[GRP/]EVENT
170 */
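/*
 * Editorial usage sketch (paths, register names and offsets are purely
 * illustrative, not taken from this patch):
 *
 *   echo 'p:uprobes/my_probe /bin/bash:0x4245c0 %ip %ax' > uprobe_events
 *   echo '-:uprobes/my_probe' > uprobe_events
 *
 * where uprobe_events is the control file created in the tracing
 * debugfs directory by init_uprobe_trace() at the end of this file.
 */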
171static int create_trace_uprobe(int argc, char **argv)
172{
173 struct trace_uprobe *tu;
174 struct inode *inode;
175 char *arg, *event, *group, *filename;
176 char buf[MAX_EVENT_NAME_LEN];
177 struct path path;
178 unsigned long offset;
179 bool is_delete;
180 int i, ret;
181
182 inode = NULL;
183 ret = 0;
184 is_delete = false;
185 event = NULL;
186 group = NULL;
187
188 /* argc must be >= 1 */
189 if (argv[0][0] == '-')
190 is_delete = true;
191 else if (argv[0][0] != 'p') {
192 pr_info("Probe definition must start with 'p' or '-'.\n");
193 return -EINVAL;
194 }
195
196 if (argv[0][1] == ':') {
197 event = &argv[0][2];
198 arg = strchr(event, '/');
199
200 if (arg) {
201 group = event;
202 event = arg + 1;
203 event[-1] = '\0';
204
205 if (strlen(group) == 0) {
206 pr_info("Group name is not specified\n");
207 return -EINVAL;
208 }
209 }
210 if (strlen(event) == 0) {
211 pr_info("Event name is not specified\n");
212 return -EINVAL;
213 }
214 }
215 if (!group)
216 group = UPROBE_EVENT_SYSTEM;
217
218 if (is_delete) {
219 if (!event) {
220 pr_info("Delete command needs an event name.\n");
221 return -EINVAL;
222 }
223 mutex_lock(&uprobe_lock);
224 tu = find_probe_event(event, group);
225
226 if (!tu) {
227 mutex_unlock(&uprobe_lock);
228 pr_info("Event %s/%s doesn't exist.\n", group, event);
229 return -ENOENT;
230 }
231 /* delete an event */
232 unregister_trace_uprobe(tu);
233 mutex_unlock(&uprobe_lock);
234 return 0;
235 }
236
237 if (argc < 2) {
238 pr_info("Probe point is not specified.\n");
239 return -EINVAL;
240 }
241 if (isdigit(argv[1][0])) {
242 pr_info("probe point must have a filename.\n");
243 return -EINVAL;
244 }
245 arg = strchr(argv[1], ':');
246 if (!arg)
247 goto fail_address_parse;
248
249 *arg++ = '\0';
250 filename = argv[1];
251 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
252 if (ret)
253 goto fail_address_parse;
254
255 ret = strict_strtoul(arg, 0, &offset);
256 if (ret)
257 goto fail_address_parse;
258
259 inode = igrab(path.dentry->d_inode);
260
261 argc -= 2;
262 argv += 2;
263
264 /* setup a probe */
265 if (!event) {
266 char *tail = strrchr(filename, '/');
267 char *ptr;
268
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL);
270 if (!ptr) {
271 ret = -ENOMEM;
272 goto fail_address_parse;
273 }
274
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_");
277 if (ptr)
278 *ptr = '\0';
279
280 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
281 event = buf;
282 kfree(tail);
283 }
284
285 tu = alloc_trace_uprobe(group, event, argc);
286 if (IS_ERR(tu)) {
287 pr_info("Failed to allocate trace_uprobe (%d).\n", (int)PTR_ERR(tu));
288 ret = PTR_ERR(tu);
289 goto fail_address_parse;
290 }
291 tu->offset = offset;
292 tu->inode = inode;
293 tu->filename = kstrdup(filename, GFP_KERNEL);
294
295 if (!tu->filename) {
296 pr_info("Failed to allocate filename.\n");
297 ret = -ENOMEM;
298 goto error;
299 }
300
301 /* parse arguments */
302 ret = 0;
303 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
304 /* Increment count for freeing args in error case */
305 tu->nr_args++;
306
307 /* Parse argument name */
308 arg = strchr(argv[i], '=');
309 if (arg) {
310 *arg++ = '\0';
311 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL);
312 } else {
313 arg = argv[i];
314 /* If argument name is omitted, set "argN" */
315 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
316 tu->args[i].name = kstrdup(buf, GFP_KERNEL);
317 }
318
319 if (!tu->args[i].name) {
320 pr_info("Failed to allocate argument[%d] name.\n", i);
321 ret = -ENOMEM;
322 goto error;
323 }
324
325 if (!is_good_name(tu->args[i].name)) {
326 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name);
327 ret = -EINVAL;
328 goto error;
329 }
330
331 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) {
332 pr_info("Argument[%d] name '%s' conflicts with "
333 "another field.\n", i, argv[i]);
334 ret = -EINVAL;
335 goto error;
336 }
337
338 /* Parse fetch argument */
339 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false);
340 if (ret) {
341 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
342 goto error;
343 }
344 }
345
346 ret = register_trace_uprobe(tu);
347 if (ret)
348 goto error;
349 return 0;
350
351error:
352 free_trace_uprobe(tu);
353 return ret;
354
355fail_address_parse:
356 if (inode)
357 iput(inode);
358
359 pr_info("Failed to parse address.\n");
360
361 return ret;
362}
363
364static void cleanup_all_probes(void)
365{
366 struct trace_uprobe *tu;
367
368 mutex_lock(&uprobe_lock);
369 while (!list_empty(&uprobe_list)) {
370 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
371 unregister_trace_uprobe(tu);
372 }
373 mutex_unlock(&uprobe_lock);
374}
375
376/* Probes listing interfaces */
377static void *probes_seq_start(struct seq_file *m, loff_t *pos)
378{
379 mutex_lock(&uprobe_lock);
380 return seq_list_start(&uprobe_list, *pos);
381}
382
383static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
384{
385 return seq_list_next(v, &uprobe_list, pos);
386}
387
388static void probes_seq_stop(struct seq_file *m, void *v)
389{
390 mutex_unlock(&uprobe_lock);
391}
392
393static int probes_seq_show(struct seq_file *m, void *v)
394{
395 struct trace_uprobe *tu = v;
396 int i;
397
398 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
399 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
400
401 for (i = 0; i < tu->nr_args; i++)
402 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm);
403
404 seq_printf(m, "\n");
405 return 0;
406}
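/*
 * Editorial example of one output line (file and offset purely
 * illustrative):
 *
 *   p:uprobes/p_bash_0x4245c0 /bin/bash:0x00000000004245c0 arg1=%ip
 *
 * i.e. group/event, then file:offset, then each argument as name=spec.
 */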
407
408static const struct seq_operations probes_seq_op = {
409 .start = probes_seq_start,
410 .next = probes_seq_next,
411 .stop = probes_seq_stop,
412 .show = probes_seq_show
413};
414
415static int probes_open(struct inode *inode, struct file *file)
416{
417 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
418 cleanup_all_probes();
419
420 return seq_open(file, &probes_seq_op);
421}
422
423static ssize_t probes_write(struct file *file, const char __user *buffer,
424 size_t count, loff_t *ppos)
425{
426 return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
427}
428
429static const struct file_operations uprobe_events_ops = {
430 .owner = THIS_MODULE,
431 .open = probes_open,
432 .read = seq_read,
433 .llseek = seq_lseek,
434 .release = seq_release,
435 .write = probes_write,
436};
437
438/* Probes profiling interfaces */
439static int probes_profile_seq_show(struct seq_file *m, void *v)
440{
441 struct trace_uprobe *tu = v;
442
443 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit);
444 return 0;
445}
446
447static const struct seq_operations profile_seq_op = {
448 .start = probes_seq_start,
449 .next = probes_seq_next,
450 .stop = probes_seq_stop,
451 .show = probes_profile_seq_show
452};
453
454static int profile_open(struct inode *inode, struct file *file)
455{
456 return seq_open(file, &profile_seq_op);
457}
458
459static const struct file_operations uprobe_profile_ops = {
460 .owner = THIS_MODULE,
461 .open = profile_open,
462 .read = seq_read,
463 .llseek = seq_lseek,
464 .release = seq_release,
465};
466
467/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{
470 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event;
472 struct ring_buffer *buffer;
473 u8 *data;
474 int size, i, pc;
475 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call;
477
478 tu->nhit++;
479
480 local_save_flags(irq_flags);
481 pc = preempt_count();
482
483 size = sizeof(*entry) + tu->size;
484
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc);
487 if (!event)
488 return;
489
490 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
492 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495
496 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
498}
499
500/* Event entry printers */
501static enum print_line_t
502print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
503{
504 struct uprobe_trace_entry_head *field;
505 struct trace_seq *s = &iter->seq;
506 struct trace_uprobe *tu;
507 u8 *data;
508 int i;
509
510 field = (struct uprobe_trace_entry_head *)iter->ent;
511 tu = container_of(event, struct trace_uprobe, call.event);
512
513 if (!trace_seq_printf(s, "%s: (", tu->call.name))
514 goto partial;
515
516 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
517 goto partial;
518
519 if (!trace_seq_puts(s, ")"))
520 goto partial;
521
522 data = (u8 *)&field[1];
523 for (i = 0; i < tu->nr_args; i++) {
524 if (!tu->args[i].type->print(s, tu->args[i].name,
525 data + tu->args[i].offset, field))
526 goto partial;
527 }
528
529 if (trace_seq_puts(s, "\n"))
530 return TRACE_TYPE_HANDLED;
531
532partial:
533 return TRACE_TYPE_PARTIAL_LINE;
534}
535
536static int probe_event_enable(struct trace_uprobe *tu, int flag)
537{
538 struct uprobe_trace_consumer *utc;
539 int ret = 0;
540
541 if (!tu->inode || tu->consumer)
542 return -EINTR;
543
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
545 if (!utc)
546 return -EINTR;
547
548 utc->cons.handler = uprobe_dispatcher;
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555
556 tu->flags |= flag;
557 utc->tu = tu;
558 tu->consumer = utc;
559
560 return 0;
561}
562
563static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{
565 if (!tu->inode || !tu->consumer)
566 return;
567
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
569 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572}
573
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
575{
576 int ret, i;
577 struct uprobe_trace_entry_head field;
578 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
579
580 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
581 /* Set argument names as fields */
582 for (i = 0; i < tu->nr_args; i++) {
583 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
584 tu->args[i].name,
585 sizeof(field) + tu->args[i].offset,
586 tu->args[i].type->size,
587 tu->args[i].type->is_signed,
588 FILTER_OTHER);
589
590 if (ret)
591 return ret;
592 }
593 return 0;
594}
595
596#define LEN_OR_ZERO (len ? len - pos : 0)
597static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
598{
599 const char *fmt, *arg;
600 int i;
601 int pos = 0;
602
603 fmt = "(%lx)";
604 arg = "REC->" FIELD_STRING_IP;
605
606 /* When len=0, we just calculate the needed length */
607
608 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
609
610 for (i = 0; i < tu->nr_args; i++) {
611 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
612 tu->args[i].name, tu->args[i].type->fmt);
613 }
614
615 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
616
617 for (i = 0; i < tu->nr_args; i++) {
618 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
619 tu->args[i].name);
620 }
621
622 return pos; /* return the length of print_fmt */
623}
624#undef LEN_OR_ZERO
625
626static int set_print_fmt(struct trace_uprobe *tu)
627{
628 char *print_fmt;
629 int len;
630
631 /* First: called with 0 length to calculate the needed length */
632 len = __set_print_fmt(tu, NULL, 0);
633 print_fmt = kmalloc(len + 1, GFP_KERNEL);
634 if (!print_fmt)
635 return -ENOMEM;
636
637 /* Second: actually write the @print_fmt */
638 __set_print_fmt(tu, print_fmt, len + 1);
639 tu->call.print_fmt = print_fmt;
640
641 return 0;
642}
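/*
 * Editorial example: for an event with a single argument named "arg1",
 * the print_fmt built here ends up roughly as
 *
 *   "(%lx) arg1=<type fmt>", REC->__probe_ip, REC->arg1
 *
 * where <type fmt> stands for the ->fmt string of arg1's fetch_type.
 */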
643
644#ifdef CONFIG_PERF_EVENTS
645/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{
648 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry;
650 struct hlist_head *head;
651 u8 *data;
652 int size, __size, i;
653 int rctx;
654
655 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return;
660
661 preempt_disable();
662
663 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
664 if (!entry)
665 goto out;
666
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
668 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671
672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
674
675 out:
676 preempt_enable();
677}
678#endif /* CONFIG_PERF_EVENTS */
679
680static
681int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
682{
683 struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
684
685 switch (type) {
686 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE);
688
689 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE);
691 return 0;
692
693#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE);
696
697 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0;
700#endif
701 default:
702 return 0;
703 }
704 return 0;
705}
706
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu;
711
712 utc = container_of(con, struct uprobe_trace_consumer, cons);
713 tu = utc->tu;
714 if (!tu || tu->consumer != utc)
715 return 0;
716
717 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs);
719
720#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs);
723#endif
724 return 0;
725}
726
727static struct trace_event_functions uprobe_funcs = {
728 .trace = print_uprobe_event
729};
730
731static int register_uprobe_event(struct trace_uprobe *tu)
732{
733 struct ftrace_event_call *call = &tu->call;
734 int ret;
735
736 /* Initialize ftrace_event_call */
737 INIT_LIST_HEAD(&call->class->fields);
738 call->event.funcs = &uprobe_funcs;
739 call->class->define_fields = uprobe_event_define_fields;
740
741 if (set_print_fmt(tu) < 0)
742 return -ENOMEM;
743
744 ret = register_ftrace_event(&call->event);
745 if (!ret) {
746 kfree(call->print_fmt);
747 return -ENODEV;
748 }
749 call->flags = 0;
750 call->class->reg = trace_uprobe_register;
751 call->data = tu;
752 ret = trace_add_event_call(call);
753
754 if (ret) {
755 pr_info("Failed to register uprobe event: %s\n", call->name);
756 kfree(call->print_fmt);
757 unregister_ftrace_event(&call->event);
758 }
759
760 return ret;
761}
762
763static void unregister_uprobe_event(struct trace_uprobe *tu)
764{
765 /* tu->event is unregistered in trace_remove_event_call() */
766 trace_remove_event_call(&tu->call);
767 kfree(tu->call.print_fmt);
768 tu->call.print_fmt = NULL;
769}
770
771/* Make a trace interface for controlling probe points */
772static __init int init_uprobe_trace(void)
773{
774 struct dentry *d_tracer;
775
776 d_tracer = tracing_init_dentry();
777 if (!d_tracer)
778 return 0;
779
780 trace_create_file("uprobe_events", 0644, d_tracer,
781 NULL, &uprobe_events_ops);
782 /* Profile interface */
783 trace_create_file("uprobe_profile", 0444, d_tracer,
784 NULL, &uprobe_profile_ops);
785 return 0;
786}
787
788fs_initcall(init_uprobe_trace);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a4721..000000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
1/*
2 * Workqueue statistical tracer.
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8
9#include <trace/events/workqueue.h>
10#include <linux/list.h>
11#include <linux/percpu.h>
12#include <linux/slab.h>
13#include <linux/kref.h>
14#include "trace_stat.h"
15#include "trace.h"
16
17
18/* A cpu workqueue thread */
19struct cpu_workqueue_stats {
20 struct list_head list;
21 struct kref kref;
22 int cpu;
23 pid_t pid;
24/* Can be inserted from interrupt or user context, need to be atomic */
25 atomic_t inserted;
26/*
27 * Don't need to be atomic, works are serialized in a single workqueue thread
28 * on a single CPU.
29 */
30 unsigned int executed;
31};
32
33/* List of workqueue threads on one cpu */
34struct workqueue_global_stats {
35 struct list_head list;
36 spinlock_t lock;
37};
38
39/* Don't need a global lock because allocated before the workqueues, and
40 * never freed.
41 */
42static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
43#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
44
45static void cpu_workqueue_stat_free(struct kref *kref)
46{
47 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
48}
49
50/* Insertion of a work */
51static void
52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
54 struct work_struct *work)
55{
56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
57 struct cpu_workqueue_stats *node;
58 unsigned long flags;
59
60 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
61 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
62 if (node->pid == wq_thread->pid) {
63 atomic_inc(&node->inserted);
64 goto found;
65 }
66 }
67 pr_debug("trace_workqueue: entry not found\n");
68found:
69 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
70}
71
72/* Execution of a work */
73static void
74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
76 struct work_struct *work)
77{
78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
79 struct cpu_workqueue_stats *node;
80 unsigned long flags;
81
82 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
83 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
84 if (node->pid == wq_thread->pid) {
85 node->executed++;
86 goto found;
87 }
88 }
89 pr_debug("trace_workqueue: entry not found\n");
90found:
91 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
92}
93
94/* Creation of a cpu workqueue thread */
95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
97{
98 struct cpu_workqueue_stats *cws;
99 unsigned long flags;
100
101 WARN_ON(cpu < 0);
102
103 /* Workqueues are sometimes created in atomic context */
104 cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
105 if (!cws) {
106 pr_warning("trace_workqueue: not enough memory\n");
107 return;
108 }
109 INIT_LIST_HEAD(&cws->list);
110 kref_init(&cws->kref);
111 cws->cpu = cpu;
112 cws->pid = wq_thread->pid;
113
114 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
115 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
116 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
117}
118
119/* Destruction of a cpu workqueue thread */
120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
122{
123 /* Workqueue only execute on one cpu */
124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
125 struct cpu_workqueue_stats *node, *next;
126 unsigned long flags;
127
128 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
129 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
130 list) {
131 if (node->pid == wq_thread->pid) {
132 list_del(&node->list);
133 kref_put(&node->kref, cpu_workqueue_stat_free);
134 goto found;
135 }
136 }
137
138 pr_debug("trace_workqueue: don't find workqueue to destroy\n");
139found:
140 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
141
142}
143
144static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
145{
146 unsigned long flags;
147 struct cpu_workqueue_stats *ret = NULL;
148
149
150 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
151
152 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
153 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
154 struct cpu_workqueue_stats, list);
155 kref_get(&ret->kref);
156 }
157
158 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
159
160 return ret;
161}
162
163static void *workqueue_stat_start(struct tracer_stat *trace)
164{
165 int cpu;
166 void *ret = NULL;
167
168 for_each_possible_cpu(cpu) {
169 ret = workqueue_stat_start_cpu(cpu);
170 if (ret)
171 return ret;
172 }
173 return NULL;
174}
175
176static void *workqueue_stat_next(void *prev, int idx)
177{
178 struct cpu_workqueue_stats *prev_cws = prev;
179 struct cpu_workqueue_stats *ret;
180 int cpu = prev_cws->cpu;
181 unsigned long flags;
182
183 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
184 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
185 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
186 do {
187 cpu = cpumask_next(cpu, cpu_possible_mask);
188 if (cpu >= nr_cpu_ids)
189 return NULL;
190 } while (!(ret = workqueue_stat_start_cpu(cpu)));
191 return ret;
192 } else {
193 ret = list_entry(prev_cws->list.next,
194 struct cpu_workqueue_stats, list);
195 kref_get(&ret->kref);
196 }
197 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
198
199 return ret;
200}
201
202static int workqueue_stat_show(struct seq_file *s, void *p)
203{
204 struct cpu_workqueue_stats *cws = p;
205 struct pid *pid;
206 struct task_struct *tsk;
207
208 pid = find_get_pid(cws->pid);
209 if (pid) {
210 tsk = get_pid_task(pid, PIDTYPE_PID);
211 if (tsk) {
212 seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
213 atomic_read(&cws->inserted), cws->executed,
214 tsk->comm);
215 put_task_struct(tsk);
216 }
217 put_pid(pid);
218 }
219
220 return 0;
221}
222
223static void workqueue_stat_release(void *stat)
224{
225 struct cpu_workqueue_stats *node = stat;
226
227 kref_put(&node->kref, cpu_workqueue_stat_free);
228}
229
230static int workqueue_stat_headers(struct seq_file *s)
231{
232 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
233 seq_printf(s, "# | | | |\n");
234 return 0;
235}
236
237struct tracer_stat workqueue_stats __read_mostly = {
238 .name = "workqueues",
239 .stat_start = workqueue_stat_start,
240 .stat_next = workqueue_stat_next,
241 .stat_show = workqueue_stat_show,
242 .stat_release = workqueue_stat_release,
243 .stat_headers = workqueue_stat_headers
244};
245
246
247int __init stat_workqueue_init(void)
248{
249 if (register_stat_tracer(&workqueue_stats)) {
250 pr_warning("Unable to register workqueue stat tracer\n");
251 return 1;
252 }
253
254 return 0;
255}
256fs_initcall(stat_workqueue_init);
257
258/*
259 * Workqueues are created very early, just after pre-smp initcalls.
260 * So we must register our tracepoints at this stage.
261 */
262int __init trace_workqueue_early_init(void)
263{
264 int ret, cpu;
265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
272 if (ret)
273 goto out;
274
275 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
276 if (ret)
277 goto no_insertion;
278
279 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
280 if (ret)
281 goto no_execution;
282
283 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
284 if (ret)
285 goto no_creation;
286
287 return 0;
288
289no_creation:
290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
291no_execution:
292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
293no_insertion:
294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
295out:
296 pr_warning("trace_workqueue: unable to trace workqueues\n");
297
298 return 1;
299}
300early_initcall(trace_workqueue_early_init);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 51c6e89e8619..d7948eb10225 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
81 return ret; 81 return ret;
82} 82}
83 83
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) 84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
85{ 85{
86 const struct cred *cred = current_cred(); 86 const struct cred *cred = current_cred();
87 int retval; 87 int retval;
88 old_uid_t ruid, euid, suid;
88 89
89 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && 90 ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid));
90 !(retval = put_user(high2lowuid(cred->euid), euid))) 91 euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid));
91 retval = put_user(high2lowuid(cred->suid), suid); 92 suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid));
93
94 if (!(retval = put_user(ruid, ruidp)) &&
95 !(retval = put_user(euid, euidp)))
96 retval = put_user(suid, suidp);
92 97
93 return retval; 98 return retval;
94} 99}
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
103} 108}
104 109
105 110
106SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) 111SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
107{ 112{
108 const struct cred *cred = current_cred(); 113 const struct cred *cred = current_cred();
109 int retval; 114 int retval;
115 old_gid_t rgid, egid, sgid;
116
117 rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
118 egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
119 sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
110 120
111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && 121 if (!(retval = put_user(rgid, rgidp)) &&
112 !(retval = put_user(high2lowgid(cred->egid), egid))) 122 !(retval = put_user(egid, egidp)))
113 retval = put_user(high2lowgid(cred->sgid), sgid); 123 retval = put_user(sgid, sgidp);
114 124
115 return retval; 125 return retval;
116} 126}
@@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
134static int groups16_to_user(old_gid_t __user *grouplist, 144static int groups16_to_user(old_gid_t __user *grouplist,
135 struct group_info *group_info) 145 struct group_info *group_info)
136{ 146{
147 struct user_namespace *user_ns = current_user_ns();
137 int i; 148 int i;
138 old_gid_t group; 149 old_gid_t group;
150 kgid_t kgid;
139 151
140 for (i = 0; i < group_info->ngroups; i++) { 152 for (i = 0; i < group_info->ngroups; i++) {
141 group = high2lowgid(GROUP_AT(group_info, i)); 153 kgid = GROUP_AT(group_info, i);
154 group = high2lowgid(from_kgid_munged(user_ns, kgid));
142 if (put_user(group, grouplist+i)) 155 if (put_user(group, grouplist+i))
143 return -EFAULT; 156 return -EFAULT;
144 } 157 }
@@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist,
149static int groups16_from_user(struct group_info *group_info, 162static int groups16_from_user(struct group_info *group_info,
150 old_gid_t __user *grouplist) 163 old_gid_t __user *grouplist)
151{ 164{
165 struct user_namespace *user_ns = current_user_ns();
152 int i; 166 int i;
153 old_gid_t group; 167 old_gid_t group;
168 kgid_t kgid;
154 169
155 for (i = 0; i < group_info->ngroups; i++) { 170 for (i = 0; i < group_info->ngroups; i++) {
156 if (get_user(group, grouplist+i)) 171 if (get_user(group, grouplist+i))
157 return -EFAULT; 172 return -EFAULT;
158 GROUP_AT(group_info, i) = low2highgid(group); 173
174 kgid = make_kgid(user_ns, low2highgid(group));
175 if (!gid_valid(kgid))
176 return -EINVAL;
177
178 GROUP_AT(group_info, i) = kgid;
159 } 179 }
160 180
161 return 0; 181 return 0;
@@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
211 231
212SYSCALL_DEFINE0(getuid16) 232SYSCALL_DEFINE0(getuid16)
213{ 233{
214 return high2lowuid(current_uid()); 234 return high2lowuid(from_kuid_munged(current_user_ns(), current_uid()));
215} 235}
216 236
217SYSCALL_DEFINE0(geteuid16) 237SYSCALL_DEFINE0(geteuid16)
218{ 238{
219 return high2lowuid(current_euid()); 239 return high2lowuid(from_kuid_munged(current_user_ns(), current_euid()));
220} 240}
221 241
222SYSCALL_DEFINE0(getgid16) 242SYSCALL_DEFINE0(getgid16)
223{ 243{
224 return high2lowgid(current_gid()); 244 return high2lowgid(from_kgid_munged(current_user_ns(), current_gid()));
225} 245}
226 246
227SYSCALL_DEFINE0(getegid16) 247SYSCALL_DEFINE0(getegid16)
228{ 248{
229 return high2lowgid(current_egid()); 249 return high2lowgid(from_kgid_munged(current_user_ns(), current_egid()));
230} 250}
diff --git a/kernel/user.c b/kernel/user.c
index 71dd2363ab0f..b815fefbe76f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -22,10 +22,27 @@
22 * and 1 for... ? 22 * and 1 for... ?
23 */ 23 */
24struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
25 .uid_map = {
26 .nr_extents = 1,
27 .extent[0] = {
28 .first = 0,
29 .lower_first = 0,
30 .count = 4294967295U,
31 },
32 },
33 .gid_map = {
34 .nr_extents = 1,
35 .extent[0] = {
36 .first = 0,
37 .lower_first = 0,
38 .count = 4294967295U,
39 },
40 },
25 .kref = { 41 .kref = {
26 .refcount = ATOMIC_INIT(3), 42 .refcount = ATOMIC_INIT(3),
27 }, 43 },
28 .creator = &root_user, 44 .owner = GLOBAL_ROOT_UID,
45 .group = GLOBAL_ROOT_GID,
29}; 46};
30EXPORT_SYMBOL_GPL(init_user_ns); 47EXPORT_SYMBOL_GPL(init_user_ns);
31 48
@@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns);
34 * when changing user ID's (ie setuid() and friends). 51 * when changing user ID's (ie setuid() and friends).
35 */ 52 */
36 53
54#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
55#define UIDHASH_SZ (1 << UIDHASH_BITS)
37#define UIDHASH_MASK (UIDHASH_SZ - 1) 56#define UIDHASH_MASK (UIDHASH_SZ - 1)
38#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 57#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
39#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) 58#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
40 59
41static struct kmem_cache *uid_cachep; 60static struct kmem_cache *uid_cachep;
61struct hlist_head uidhash_table[UIDHASH_SZ];
42 62
43/* 63/*
44 * The uidhash_lock is mostly taken from process context, but it is 64 * The uidhash_lock is mostly taken from process context, but it is
@@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep;
51 */ 71 */
52static DEFINE_SPINLOCK(uidhash_lock); 72static DEFINE_SPINLOCK(uidhash_lock);
53 73
54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ 74/* root_user.__count is 1, for init task cred */
55struct user_struct root_user = { 75struct user_struct root_user = {
56 .__count = ATOMIC_INIT(2), 76 .__count = ATOMIC_INIT(1),
57 .processes = ATOMIC_INIT(1), 77 .processes = ATOMIC_INIT(1),
58 .files = ATOMIC_INIT(0), 78 .files = ATOMIC_INIT(0),
59 .sigpending = ATOMIC_INIT(0), 79 .sigpending = ATOMIC_INIT(0),
60 .locked_shm = 0, 80 .locked_shm = 0,
61 .user_ns = &init_user_ns, 81 .uid = GLOBAL_ROOT_UID,
62}; 82};
63 83
64/* 84/*
@@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
72static void uid_hash_remove(struct user_struct *up) 92static void uid_hash_remove(struct user_struct *up)
73{ 93{
74 hlist_del_init(&up->uidhash_node); 94 hlist_del_init(&up->uidhash_node);
75 put_user_ns(up->user_ns);
76} 95}
77 96
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 97static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
79{ 98{
80 struct user_struct *user; 99 struct user_struct *user;
81 struct hlist_node *h; 100 struct hlist_node *h;
82 101
83 hlist_for_each_entry(user, h, hashent, uidhash_node) { 102 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) { 103 if (uid_eq(user->uid, uid)) {
85 atomic_inc(&user->__count); 104 atomic_inc(&user->__count);
86 return user; 105 return user;
87 } 106 }
@@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags)
110 * 129 *
111 * If the user_struct could not be found, return NULL. 130 * If the user_struct could not be found, return NULL.
112 */ 131 */
113struct user_struct *find_user(uid_t uid) 132struct user_struct *find_user(kuid_t uid)
114{ 133{
115 struct user_struct *ret; 134 struct user_struct *ret;
116 unsigned long flags; 135 unsigned long flags;
117 struct user_namespace *ns = current_user_ns();
118 136
119 spin_lock_irqsave(&uidhash_lock, flags); 137 spin_lock_irqsave(&uidhash_lock, flags);
120 ret = uid_hash_find(uid, uidhashentry(ns, uid)); 138 ret = uid_hash_find(uid, uidhashentry(uid));
121 spin_unlock_irqrestore(&uidhash_lock, flags); 139 spin_unlock_irqrestore(&uidhash_lock, flags);
122 return ret; 140 return ret;
123} 141}
@@ -136,9 +154,9 @@ void free_uid(struct user_struct *up)
136 local_irq_restore(flags); 154 local_irq_restore(flags);
137} 155}
138 156
139struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) 157struct user_struct *alloc_uid(kuid_t uid)
140{ 158{
141 struct hlist_head *hashent = uidhashentry(ns, uid); 159 struct hlist_head *hashent = uidhashentry(uid);
142 struct user_struct *up, *new; 160 struct user_struct *up, *new;
143 161
144 spin_lock_irq(&uidhash_lock); 162 spin_lock_irq(&uidhash_lock);
@@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
153 new->uid = uid; 171 new->uid = uid;
154 atomic_set(&new->__count, 1); 172 atomic_set(&new->__count, 1);
155 173
156 new->user_ns = get_user_ns(ns);
157
158 /* 174 /*
159 * Before adding this, check whether we raced 175 * Before adding this, check whether we raced
160 * on adding the same user already.. 176 * on adding the same user already..
@@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
162 spin_lock_irq(&uidhash_lock); 178 spin_lock_irq(&uidhash_lock);
163 up = uid_hash_find(uid, hashent); 179 up = uid_hash_find(uid, hashent);
164 if (up) { 180 if (up) {
165 put_user_ns(ns);
166 key_put(new->uid_keyring); 181 key_put(new->uid_keyring);
167 key_put(new->session_keyring); 182 key_put(new->session_keyring);
168 kmem_cache_free(uid_cachep, new); 183 kmem_cache_free(uid_cachep, new);
@@ -187,11 +202,11 @@ static int __init uid_cache_init(void)
187 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 202 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
188 203
189 for(n = 0; n < UIDHASH_SZ; ++n) 204 for(n = 0; n < UIDHASH_SZ; ++n)
190 INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); 205 INIT_HLIST_HEAD(uidhash_table + n);
191 206
192 /* Insert the root user immediately (init already runs as root) */ 207 /* Insert the root user immediately (init already runs as root) */
193 spin_lock_irq(&uidhash_lock); 208 spin_lock_irq(&uidhash_lock);
194 uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); 209 uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
195 spin_unlock_irq(&uidhash_lock); 210 spin_unlock_irq(&uidhash_lock);
196 211
197 return 0; 212 return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 3b906e98b1db..86602316422d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -11,9 +11,20 @@
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14#include <linux/securebits.h>
15#include <linux/keyctl.h>
16#include <linux/key-type.h>
17#include <keys/user-type.h>
18#include <linux/seq_file.h>
19#include <linux/fs.h>
20#include <linux/uaccess.h>
21#include <linux/ctype.h>
14 22
15static struct kmem_cache *user_ns_cachep __read_mostly; 23static struct kmem_cache *user_ns_cachep __read_mostly;
16 24
25static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
26 struct uid_gid_map *map);
27
17/* 28/*
18 * Create a new user namespace, deriving the creator from the user in the 29 * Create a new user namespace, deriving the creator from the user in the
19 * passed credentials, and replacing that user with the new root user for the 30 * passed credentials, and replacing that user with the new root user for the
@@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
24 */ 35 */
25int create_user_ns(struct cred *new) 36int create_user_ns(struct cred *new)
26{ 37{
27 struct user_namespace *ns; 38 struct user_namespace *ns, *parent_ns = new->user_ns;
28 struct user_struct *root_user; 39 kuid_t owner = new->euid;
29 int n; 40 kgid_t group = new->egid;
41
42 /* The creator needs a mapping in the parent user namespace
43 * or else we won't be able to reasonably tell userspace who
44 * created a user_namespace.
45 */
46 if (!kuid_has_mapping(parent_ns, owner) ||
47 !kgid_has_mapping(parent_ns, group))
48 return -EPERM;
30 49
31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); 50 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
32 if (!ns) 51 if (!ns)
33 return -ENOMEM; 52 return -ENOMEM;
34 53
35 kref_init(&ns->kref); 54 kref_init(&ns->kref);
55 ns->parent = parent_ns;
56 ns->owner = owner;
57 ns->group = group;
36 58
37 for (n = 0; n < UIDHASH_SZ; ++n) 59 /* Start with the same capabilities as init but useless for doing
38 INIT_HLIST_HEAD(ns->uidhash_table + n); 60 * anything as the capabilities are bound to the new user namespace.
39 61 */
40 /* Alloc new root user. */ 62 new->securebits = SECUREBITS_DEFAULT;
41 root_user = alloc_uid(ns, 0); 63 new->cap_inheritable = CAP_EMPTY_SET;
42 if (!root_user) { 64 new->cap_permitted = CAP_FULL_SET;
43 kmem_cache_free(user_ns_cachep, ns); 65 new->cap_effective = CAP_FULL_SET;
44 return -ENOMEM; 66 new->cap_bset = CAP_FULL_SET;
45 }
46
47 /* set the new root user in the credentials under preparation */
48 ns->creator = new->user;
49 new->user = root_user;
50 new->uid = new->euid = new->suid = new->fsuid = 0;
51 new->gid = new->egid = new->sgid = new->fsgid = 0;
52 put_group_info(new->group_info);
53 new->group_info = get_group_info(&init_groups);
54#ifdef CONFIG_KEYS 67#ifdef CONFIG_KEYS
55 key_put(new->request_key_auth); 68 key_put(new->request_key_auth);
56 new->request_key_auth = NULL; 69 new->request_key_auth = NULL;
57#endif 70#endif
58 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 71 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
59 72
60 /* root_user holds a reference to ns, our reference can be dropped */ 73 /* Leave the new->user_ns reference with the new user namespace. */
61 put_user_ns(ns); 74 /* Leave the reference to our user_ns with the new cred. */
75 new->user_ns = ns;
62 76
63 return 0; 77 return 0;
64} 78}
65 79
66/* 80void free_user_ns(struct kref *kref)
67 * Deferred destructor for a user namespace. This is required because
68 * free_user_ns() may be called with uidhash_lock held, but we need to call
69 * back to free_uid() which will want to take the lock again.
70 */
71static void free_user_ns_work(struct work_struct *work)
72{ 81{
73 struct user_namespace *ns = 82 struct user_namespace *parent, *ns =
74 container_of(work, struct user_namespace, destroyer); 83 container_of(kref, struct user_namespace, kref);
75 free_uid(ns->creator); 84
85 parent = ns->parent;
76 kmem_cache_free(user_ns_cachep, ns); 86 kmem_cache_free(user_ns_cachep, ns);
87 put_user_ns(parent);
77} 88}
89EXPORT_SYMBOL(free_user_ns);
78 90
79void free_user_ns(struct kref *kref) 91static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
80{ 92{
81 struct user_namespace *ns = 93 unsigned idx, extents;
82 container_of(kref, struct user_namespace, kref); 94 u32 first, last, id2;
95
96 id2 = id + count - 1;
97
98 /* Find the matching extent */
99 extents = map->nr_extents;
100 smp_read_barrier_depends();
101 for (idx = 0; idx < extents; idx++) {
102 first = map->extent[idx].first;
103 last = first + map->extent[idx].count - 1;
104 if (id >= first && id <= last &&
105 (id2 >= first && id2 <= last))
106 break;
107 }
108 /* Map the id or note failure */
109 if (idx < extents)
110 id = (id - first) + map->extent[idx].lower_first;
111 else
112 id = (u32) -1;
83 113
84 INIT_WORK(&ns->destroyer, free_user_ns_work); 114 return id;
85 schedule_work(&ns->destroyer);
86} 115}
87EXPORT_SYMBOL(free_user_ns);
88 116
89uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) 117static u32 map_id_down(struct uid_gid_map *map, u32 id)
90{ 118{
91 struct user_namespace *tmp; 119 unsigned idx, extents;
120 u32 first, last;
92 121
93 if (likely(to == cred->user->user_ns)) 122 /* Find the matching extent */
94 return uid; 123 extents = map->nr_extents;
124 smp_read_barrier_depends();
125 for (idx = 0; idx < extents; idx++) {
126 first = map->extent[idx].first;
127 last = first + map->extent[idx].count - 1;
128 if (id >= first && id <= last)
129 break;
130 }
131 /* Map the id or note failure */
132 if (idx < extents)
133 id = (id - first) + map->extent[idx].lower_first;
134 else
135 id = (u32) -1;
95 136
137 return id;
138}
96 139
97 /* Is cred->user the creator of the target user_ns 140static u32 map_id_up(struct uid_gid_map *map, u32 id)
98 * or the creator of one of it's parents? 141{
99 */ 142 unsigned idx, extents;
100 for ( tmp = to; tmp != &init_user_ns; 143 u32 first, last;
101 tmp = tmp->creator->user_ns ) { 144
102 if (cred->user == tmp->creator) { 145 /* Find the matching extent */
103 return (uid_t)0; 146 extents = map->nr_extents;
104 } 147 smp_read_barrier_depends();
148 for (idx = 0; idx < extents; idx++) {
149 first = map->extent[idx].lower_first;
150 last = first + map->extent[idx].count - 1;
151 if (id >= first && id <= last)
152 break;
105 } 153 }
154 /* Map the id or note failure */
155 if (idx < extents)
156 id = (id - first) + map->extent[idx].first;
157 else
158 id = (u32) -1;
159
160 return id;
161}
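map_id_range_down(), map_id_down() and map_id_up() all walk the same small array of extents; only the direction of the offset arithmetic differs. The illustration below (not kernel code) applies the map_id_down() arithmetic to a hypothetical one-extent map { first = 0, lower_first = 100000, count = 65536 }: namespace id 1000 maps to kernel id 101000, and anything outside the extent maps to (u32)-1.

#include <stdint.h>
#include <stdio.h>

struct extent { uint32_t first, lower_first, count; };

/* Same lookup-and-offset logic as map_id_down(), minus the barriers. */
static uint32_t map_down(const struct extent *e, unsigned int n, uint32_t id)
{
        for (unsigned int i = 0; i < n; i++) {
                uint32_t last = e[i].first + e[i].count - 1;

                if (id >= e[i].first && id <= last)
                        return (id - e[i].first) + e[i].lower_first;
        }
        return (uint32_t)-1;            /* no mapping */
}

int main(void)
{
        struct extent map[] = { { 0, 100000, 65536 } };

        printf("%u\n", (unsigned)map_down(map, 1, 1000));   /* 101000 */
        printf("%u\n", (unsigned)map_down(map, 1, 70000));  /* 4294967295 (unmapped) */
        return 0;
}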
162
163/**
164 * make_kuid - Map a user-namespace uid pair into a kuid.
165 * @ns: User namespace that the uid is in
166 * @uid: User identifier
167 *
168 * Maps a user-namespace uid pair into a kernel internal kuid,
169 * and returns that kuid.
170 *
171 * When there is no mapping defined for the user-namespace uid
172 * pair INVALID_UID is returned. Callers are expected to test
 173 * for and handle INVALID_UID being returned. INVALID_UID
174 * may be tested for using uid_valid().
175 */
176kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
177{
178 /* Map the uid to a global kernel uid */
179 return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
180}
181EXPORT_SYMBOL(make_kuid);
182
183/**
184 * from_kuid - Create a uid from a kuid user-namespace pair.
185 * @targ: The user namespace we want a uid in.
186 * @kuid: The kernel internal uid to start with.
187 *
188 * Map @kuid into the user-namespace specified by @targ and
189 * return the resulting uid.
190 *
191 * There is always a mapping into the initial user_namespace.
192 *
193 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
194 */
195uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
196{
197 /* Map the uid from a global kernel uid */
198 return map_id_up(&targ->uid_map, __kuid_val(kuid));
199}
200EXPORT_SYMBOL(from_kuid);
106 201
107 /* No useful relationship so no mapping */ 202/**
108 return overflowuid; 203 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
204 * @targ: The user namespace we want a uid in.
205 * @kuid: The kernel internal uid to start with.
206 *
207 * Map @kuid into the user-namespace specified by @targ and
208 * return the resulting uid.
209 *
210 * There is always a mapping into the initial user_namespace.
211 *
 212 * Unlike from_kuid, from_kuid_munged never fails and always
213 * returns a valid uid. This makes from_kuid_munged appropriate
214 * for use in syscalls like stat and getuid where failing the
 215 * system call and failing to provide a valid uid are not
 216 * options.
217 *
218 * If @kuid has no mapping in @targ overflowuid is returned.
219 */
220uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
221{
222 uid_t uid;
223 uid = from_kuid(targ, kuid);
224
225 if (uid == (uid_t) -1)
226 uid = overflowuid;
227 return uid;
109} 228}
229EXPORT_SYMBOL(from_kuid_munged);
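A kernel-context sketch of the calling convention these helpers establish (the function and structure names here are hypothetical, not part of this patch): a uid_t arriving from userspace is converted to a kuid_t once with make_kuid(), rejected if unmapped, and only the kernel-internal kuid is stored or compared afterwards.

/* Hypothetical example, not from this patch. */
static int example_set_owner(struct example_object *obj, uid_t uid)
{
        kuid_t kuid = make_kuid(current_user_ns(), uid);

        if (!uid_valid(kuid))           /* no mapping in this namespace */
                return -EINVAL;

        obj->owner = kuid;              /* store only kernel-internal ids */
        return 0;
}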
110 230
111gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) 231/**
232 * make_kgid - Map a user-namespace gid pair into a kgid.
233 * @ns: User namespace that the gid is in
 234 * @gid: group identifier
235 *
236 * Maps a user-namespace gid pair into a kernel internal kgid,
237 * and returns that kgid.
238 *
239 * When there is no mapping defined for the user-namespace gid
240 * pair INVALID_GID is returned. Callers are expected to test
241 * for and handle INVALID_GID being returned. INVALID_GID may be
242 * tested for using gid_valid().
243 */
244kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
112{ 245{
113 struct user_namespace *tmp; 246 /* Map the gid to a global kernel gid */
247 return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
248}
249EXPORT_SYMBOL(make_kgid);
114 250
115 if (likely(to == cred->user->user_ns)) 251/**
116 return gid; 252 * from_kgid - Create a gid from a kgid user-namespace pair.
253 * @targ: The user namespace we want a gid in.
254 * @kgid: The kernel internal gid to start with.
255 *
256 * Map @kgid into the user-namespace specified by @targ and
257 * return the resulting gid.
258 *
259 * There is always a mapping into the initial user_namespace.
260 *
261 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
262 */
263gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
264{
265 /* Map the gid from a global kernel gid */
266 return map_id_up(&targ->gid_map, __kgid_val(kgid));
267}
268EXPORT_SYMBOL(from_kgid);
269
270/**
271 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
272 * @targ: The user namespace we want a gid in.
273 * @kgid: The kernel internal gid to start with.
274 *
275 * Map @kgid into the user-namespace specified by @targ and
276 * return the resulting gid.
277 *
278 * There is always a mapping into the initial user_namespace.
279 *
 280 * Unlike from_kgid, from_kgid_munged never fails and always
281 * returns a valid gid. This makes from_kgid_munged appropriate
282 * for use in syscalls like stat and getgid where failing the
283 * system call and failing to provide a valid gid are not options.
284 *
285 * If @kgid has no mapping in @targ overflowgid is returned.
286 */
287gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
288{
289 gid_t gid;
290 gid = from_kgid(targ, kgid);
117 291
118 /* Is cred->user the creator of the target user_ns 292 if (gid == (gid_t) -1)
119 * or the creator of one of it's parents? 293 gid = overflowgid;
294 return gid;
295}
296EXPORT_SYMBOL(from_kgid_munged);
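When an id has to be reported back to userspace and failing the call is not an option (the stat/getuid/getgid case mentioned above), the munged variants are the ones to use; unmapped ids then surface as the overflow uid/gid (typically 65534). A hedged sketch with a hypothetical reply structure:

/* Hypothetical example, not from this patch. */
static void example_fill_ids(struct example_reply *rep, kuid_t uid, kgid_t gid)
{
        rep->uid = from_kuid_munged(current_user_ns(), uid);   /* never (uid_t)-1 */
        rep->gid = from_kgid_munged(current_user_ns(), gid);   /* never (gid_t)-1 */
}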
297
298static int uid_m_show(struct seq_file *seq, void *v)
299{
300 struct user_namespace *ns = seq->private;
301 struct uid_gid_extent *extent = v;
302 struct user_namespace *lower_ns;
303 uid_t lower;
304
305 lower_ns = current_user_ns();
306 if ((lower_ns == ns) && lower_ns->parent)
307 lower_ns = lower_ns->parent;
308
309 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
310
311 seq_printf(seq, "%10u %10u %10u\n",
312 extent->first,
313 lower,
314 extent->count);
315
316 return 0;
317}
318
319static int gid_m_show(struct seq_file *seq, void *v)
320{
321 struct user_namespace *ns = seq->private;
322 struct uid_gid_extent *extent = v;
323 struct user_namespace *lower_ns;
324 gid_t lower;
325
326 lower_ns = current_user_ns();
327 if ((lower_ns == ns) && lower_ns->parent)
328 lower_ns = lower_ns->parent;
329
330 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
331
332 seq_printf(seq, "%10u %10u %10u\n",
333 extent->first,
334 lower,
335 extent->count);
336
337 return 0;
338}
339
340static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
341{
342 struct uid_gid_extent *extent = NULL;
343 loff_t pos = *ppos;
344
345 if (pos < map->nr_extents)
346 extent = &map->extent[pos];
347
348 return extent;
349}
350
351static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
352{
353 struct user_namespace *ns = seq->private;
354
355 return m_start(seq, ppos, &ns->uid_map);
356}
357
358static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
359{
360 struct user_namespace *ns = seq->private;
361
362 return m_start(seq, ppos, &ns->gid_map);
363}
364
365static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
366{
367 (*pos)++;
368 return seq->op->start(seq, pos);
369}
370
371static void m_stop(struct seq_file *seq, void *v)
372{
373 return;
374}
375
376struct seq_operations proc_uid_seq_operations = {
377 .start = uid_m_start,
378 .stop = m_stop,
379 .next = m_next,
380 .show = uid_m_show,
381};
382
383struct seq_operations proc_gid_seq_operations = {
384 .start = gid_m_start,
385 .stop = m_stop,
386 .next = m_next,
387 .show = gid_m_show,
388};
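The seq_file operations above back the per-process uid_map/gid_map files; each extent is printed as three right-aligned unsigned columns: the first id inside the namespace, the corresponding id as seen by the reading namespace, and the extent length. A small userspace reader, for illustration only:

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/self/uid_map", "r");

        if (!f) {
                perror("/proc/self/uid_map");
                return 1;
        }
        /* With a hypothetical map "0 100000 65536" installed, this prints
         * one line: "         0     100000      65536". */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}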
389
390static DEFINE_MUTEX(id_map_mutex);
391
392static ssize_t map_write(struct file *file, const char __user *buf,
393 size_t count, loff_t *ppos,
394 int cap_setid,
395 struct uid_gid_map *map,
396 struct uid_gid_map *parent_map)
397{
398 struct seq_file *seq = file->private_data;
399 struct user_namespace *ns = seq->private;
400 struct uid_gid_map new_map;
401 unsigned idx;
402 struct uid_gid_extent *extent, *last = NULL;
403 unsigned long page = 0;
404 char *kbuf, *pos, *next_line;
405 ssize_t ret = -EINVAL;
406
407 /*
408 * The id_map_mutex serializes all writes to any given map.
409 *
410 * Any map is only ever written once.
411 *
412 * An id map fits within 1 cache line on most architectures.
413 *
414 * On read nothing needs to be done unless you are on an
415 * architecture with a crazy cache coherency model like alpha.
416 *
417 * There is a one time data dependency between reading the
418 * count of the extents and the values of the extents. The
419 * desired behavior is to see the values of the extents that
420 * were written before the count of the extents.
421 *
 422 * To achieve this smp_wmb() is used to guarantee the write
 423 * order and smp_read_barrier_depends() guarantees that we
424 * don't have crazy architectures returning stale data.
425 *
120 */ 426 */
121 for ( tmp = to; tmp != &init_user_ns; 427 mutex_lock(&id_map_mutex);
122 tmp = tmp->creator->user_ns ) { 428
123 if (cred->user == tmp->creator) { 429 ret = -EPERM;
124 return (gid_t)0; 430 /* Only allow one successful write to the map */
431 if (map->nr_extents != 0)
432 goto out;
433
434 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
435 * over the user namespace in order to set the id mapping.
436 */
437 if (!ns_capable(ns, cap_setid))
438 goto out;
439
440 /* Get a buffer */
441 ret = -ENOMEM;
442 page = __get_free_page(GFP_TEMPORARY);
443 kbuf = (char *) page;
444 if (!page)
445 goto out;
446
447 /* Only allow <= page size writes at the beginning of the file */
448 ret = -EINVAL;
449 if ((*ppos != 0) || (count >= PAGE_SIZE))
450 goto out;
451
452 /* Slurp in the user data */
453 ret = -EFAULT;
454 if (copy_from_user(kbuf, buf, count))
455 goto out;
456 kbuf[count] = '\0';
457
458 /* Parse the user data */
459 ret = -EINVAL;
460 pos = kbuf;
461 new_map.nr_extents = 0;
462 for (;pos; pos = next_line) {
463 extent = &new_map.extent[new_map.nr_extents];
464
465 /* Find the end of line and ensure I don't look past it */
466 next_line = strchr(pos, '\n');
467 if (next_line) {
468 *next_line = '\0';
469 next_line++;
470 if (*next_line == '\0')
471 next_line = NULL;
125 } 472 }
473
474 pos = skip_spaces(pos);
475 extent->first = simple_strtoul(pos, &pos, 10);
476 if (!isspace(*pos))
477 goto out;
478
479 pos = skip_spaces(pos);
480 extent->lower_first = simple_strtoul(pos, &pos, 10);
481 if (!isspace(*pos))
482 goto out;
483
484 pos = skip_spaces(pos);
485 extent->count = simple_strtoul(pos, &pos, 10);
486 if (*pos && !isspace(*pos))
487 goto out;
488
 489 /* Verify there is no trailing junk on the line */
490 pos = skip_spaces(pos);
491 if (*pos != '\0')
492 goto out;
493
494 /* Verify we have been given valid starting values */
495 if ((extent->first == (u32) -1) ||
496 (extent->lower_first == (u32) -1 ))
497 goto out;
498
499 /* Verify count is not zero and does not cause the extent to wrap */
500 if ((extent->first + extent->count) <= extent->first)
501 goto out;
502 if ((extent->lower_first + extent->count) <= extent->lower_first)
503 goto out;
504
505 /* For now only accept extents that are strictly in order */
506 if (last &&
507 (((last->first + last->count) > extent->first) ||
508 ((last->lower_first + last->count) > extent->lower_first)))
509 goto out;
510
511 new_map.nr_extents++;
512 last = extent;
513
514 /* Fail if the file contains too many extents */
515 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
516 (next_line != NULL))
517 goto out;
518 }
 519 /* Be very certain the new map actually exists */
520 if (new_map.nr_extents == 0)
521 goto out;
522
523 ret = -EPERM;
 524 /* Validate the user is allowed to use the user ids being mapped to. */
525 if (!new_idmap_permitted(ns, cap_setid, &new_map))
526 goto out;
527
528 /* Map the lower ids from the parent user namespace to the
529 * kernel global id space.
530 */
531 for (idx = 0; idx < new_map.nr_extents; idx++) {
532 u32 lower_first;
533 extent = &new_map.extent[idx];
534
535 lower_first = map_id_range_down(parent_map,
536 extent->lower_first,
537 extent->count);
538
539 /* Fail if we can not map the specified extent to
540 * the kernel global id space.
541 */
542 if (lower_first == (u32) -1)
543 goto out;
544
545 extent->lower_first = lower_first;
126 } 546 }
127 547
128 /* No useful relationship so no mapping */ 548 /* Install the map */
129 return overflowgid; 549 memcpy(map->extent, new_map.extent,
550 new_map.nr_extents*sizeof(new_map.extent[0]));
551 smp_wmb();
552 map->nr_extents = new_map.nr_extents;
553
554 *ppos = count;
555 ret = count;
556out:
557 mutex_unlock(&id_map_mutex);
558 if (page)
559 free_page(page);
560 return ret;
561}
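map_write() accepts at most one successful write per map, consisting of up to UID_GID_MAP_MAX_EXTENTS lines of "first lower_first count", parsed and validated as above. A hypothetical userspace helper that a parent might use to install a one-extent uid map for a child that has just entered a new user namespace (the writer needs CAP_SETUID over that namespace, as map_write() requires):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical helper: write "first lower_first count" to the child's map. */
static int write_uid_map(pid_t child, const char *map)
{
        char path[64];
        ssize_t len = (ssize_t)strlen(map);
        int fd, ret = 0;

        snprintf(path, sizeof(path), "/proc/%d/uid_map", (int)child);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        /* map_write() consumes the whole buffer in one write or fails. */
        if (write(fd, map, len) != len)
                ret = -1;
        close(fd);
        return ret;
}

/* e.g. write_uid_map(child_pid, "0 100000 65536\n"); */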
562
563ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
564{
565 struct seq_file *seq = file->private_data;
566 struct user_namespace *ns = seq->private;
567
568 if (!ns->parent)
569 return -EPERM;
570
571 return map_write(file, buf, size, ppos, CAP_SETUID,
572 &ns->uid_map, &ns->parent->uid_map);
573}
574
575ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
576{
577 struct seq_file *seq = file->private_data;
578 struct user_namespace *ns = seq->private;
579
580 if (!ns->parent)
581 return -EPERM;
582
583 return map_write(file, buf, size, ppos, CAP_SETGID,
584 &ns->gid_map, &ns->parent->gid_map);
585}
586
587static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
588 struct uid_gid_map *new_map)
589{
590 /* Allow the specified ids if we have the appropriate capability
591 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
592 */
593 if (ns_capable(ns->parent, cap_setid))
594 return true;
595
596 return false;
130} 597}
131 598
132static __init int user_namespaces_init(void) 599static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 405caf91aad5..679d97a5d3fd 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
43 43
44 down_read(&uts_sem); 44 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); 46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
47 up_read(&uts_sem); 47 up_read(&uts_sem);
48 return ns; 48 return ns;
49} 49}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index df30ee08bdd4..e5e1d85b8c7c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25 25
26#include <asm/irq_regs.h> 26#include <asm/irq_regs.h>
27#include <linux/kvm_para.h>
27#include <linux/perf_event.h> 28#include <linux/perf_event.h>
28 29
29int watchdog_enabled = 1; 30int watchdog_enabled = 1;
@@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
280 __this_cpu_write(softlockup_touch_sync, false); 281 __this_cpu_write(softlockup_touch_sync, false);
281 sched_clock_tick(); 282 sched_clock_tick();
282 } 283 }
284
285 /* Clear the guest paused flag on watchdog reset */
286 kvm_check_and_clear_guest_paused();
283 __touch_watchdog(); 287 __touch_watchdog();
284 return HRTIMER_RESTART; 288 return HRTIMER_RESTART;
285 } 289 }
@@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
292 */ 296 */
293 duration = is_softlockup(touch_ts); 297 duration = is_softlockup(touch_ts);
294 if (unlikely(duration)) { 298 if (unlikely(duration)) {
299 /*
300 * If a virtual machine is stopped by the host it can look to
301 * the watchdog like a soft lockup, check to see if the host
302 * stopped the vm before we issue the warning
303 */
304 if (kvm_check_and_clear_guest_paused())
305 return HRTIMER_RESTART;
306
295 /* only warn once */ 307 /* only warn once */
296 if (__this_cpu_read(soft_watchdog_warn) == true) 308 if (__this_cpu_read(soft_watchdog_warn) == true)
297 return HRTIMER_RESTART; 309 return HRTIMER_RESTART;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f63c08..9a3128dc67df 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1032 cwq = get_cwq(gcwq->cpu, wq); 1032 cwq = get_cwq(gcwq->cpu, wq);
1033 trace_workqueue_queue_work(cpu, cwq, work); 1033 trace_workqueue_queue_work(cpu, cwq, work);
1034 1034
1035 BUG_ON(!list_empty(&work->entry)); 1035 if (WARN_ON(!list_empty(&work->entry))) {
1036 spin_unlock_irqrestore(&gcwq->lock, flags);
1037 return;
1038 }
1036 1039
1037 cwq->nr_in_flight[cwq->work_color]++; 1040 cwq->nr_in_flight[cwq->work_color]++;
1038 work_flags = work_color_to_flags(cwq->work_color); 1041 work_flags = work_color_to_flags(cwq->work_color);
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)
1210 } else 1213 } else
1211 wake_up_all(&gcwq->trustee_wait); 1214 wake_up_all(&gcwq->trustee_wait);
1212 1215
1213 /* sanity check nr_running */ 1216 /*
1214 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && 1217 * Sanity check nr_running. Because trustee releases gcwq->lock
1218 * between setting %WORKER_ROGUE and zapping nr_running, the
1219 * warning may trigger spuriously. Check iff trustee is idle.
1220 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
1222 gcwq->nr_workers == gcwq->nr_idle &&
1215 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1223 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1216} 1224}
1217 1225
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock)
1810 * lock freed" warnings as well as problems when looking into 1818 * lock freed" warnings as well as problems when looking into
1811 * work->lockdep_map, make a copy and use that here. 1819 * work->lockdep_map, make a copy and use that here.
1812 */ 1820 */
1813 struct lockdep_map lockdep_map = work->lockdep_map; 1821 struct lockdep_map lockdep_map;
1822
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1814#endif 1824#endif
1815 /* 1825 /*
1816 * A single work shouldn't be executed concurrently by 1826 * A single work shouldn't be executed concurrently by
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work)
2506{ 2516{
2507 struct wq_barrier barr; 2517 struct wq_barrier barr;
2508 2518
2519 lock_map_acquire(&work->lockdep_map);
2520 lock_map_release(&work->lockdep_map);
2521
2509 if (start_flush_work(work, &barr, true)) { 2522 if (start_flush_work(work, &barr, true)) {
2510 wait_for_completion(&barr.done); 2523 wait_for_completion(&barr.done);
2511 destroy_work_on_stack(&barr.work); 2524 destroy_work_on_stack(&barr.work);