Diffstat (limited to 'kernel')
98 files changed, 10664 insertions, 3991 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..c0cc67ad764c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,12 @@
 obj-y = fork.o exec_domain.o panic.o printk.o \
 	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
-	    signal.o sys.o kmod.o workqueue.o pid.o \
+	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o cred.o \
-	    async.o range.o groups.o
+	    async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -25,6 +25,9 @@ endif
 obj-y += sched/
 obj-y += power/
 
+ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
+obj-$(CONFIG_X86) += kcmp.o
+endif
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -43,6 +46,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
 #include <linux/syscalls.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
+#include <linux/compat.h>
 
 #include "audit.h"
 
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
 	audit_log_end(ab);
 }
 
-void __audit_seccomp(unsigned long syscall)
+void __audit_seccomp(unsigned long syscall, long signr, int code)
 {
 	struct audit_buffer *ab;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-	audit_log_abend(ab, "seccomp", SIGKILL);
+	audit_log_abend(ab, "seccomp", signr);
 	audit_log_format(ab, " syscall=%ld", syscall);
+	audit_log_format(ab, " compat=%d", is_compat_task());
+	audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
+	audit_log_format(ab, " code=0x%x", code);
 	audit_log_end(ab);
 }
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c6470..493d97259484 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -419,3 +419,24 @@ bool nsown_capable(int cap)
 {
 	return ns_capable(current_user_ns(), cap);
 }
+
+/**
+ * inode_capable - Check superior capability over inode
+ * @inode: The inode in question
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given superior capability
+ * targeted at it's own user namespace and that the given inode is owned
+ * by the current user namespace or a child namespace.
+ *
+ * Currently we check to see if an inode is owned by the current
+ * user namespace by seeing if the inode's owner maps into the
+ * current user namespace.
+ *
+ */
+bool inode_capable(const struct inode *inode, int cap)
+{
+	struct user_namespace *ns = current_user_ns();
+
+	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
+}
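The inode_capable() helper added above pairs a namespace-targeted capability check with a test that the inode's owner is mapped into the caller's user namespace. As a rough illustration of the intended call pattern (hypothetical caller, not part of this commit; CAP_CHOWN is only an example capability):

	/* Hypothetical sketch: allow an ownership change only when the inode
	 * is covered by the caller's user namespace and CAP_CHOWN is held
	 * there. Not taken from this diff. */
	static int example_may_chown(const struct inode *inode)
	{
		if (!inode_capable(inode, CAP_CHOWN))
			return -EPERM;
		return 0;
	}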
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..72fcd3069a90 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
 
 #include <linux/atomic.h>
 
+/* css deactivation bias, makes css->refcnt negative to deny new trygets */
+#define CSS_DEACT_BIAS	INT_MIN
+
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -127,6 +131,9 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
+	/* All cgroups on this root, cgroup_mutex protected */
+	struct list_head allcg_list;
+
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
@@ -145,6 +152,15 @@ struct cgroupfs_root {
 static struct cgroupfs_root rootnode;
 
 /*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
+ */
+struct cfent {
+	struct list_head node;
+	struct dentry *dentry;
+	struct cftype *type;
+};
+
+/*
  * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
  * cgroup_subsys->use_id != 0.
  */
@@ -239,6 +255,14 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+/* the current nr of refs, always >= 0 whether @css is deactivated or not */
+static int css_refcnt(struct cgroup_subsys_state *css)
+{
+	int v = atomic_read(&css->refcnt);
+
+	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+}
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_active_root(_root) \
 	list_for_each_entry(_root, &roots, root_list)
 
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+static inline struct cfent *__d_cfe(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return __d_cfe(dentry)->type;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	int ret = 0;
 
-	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy) {
-			ret = ss->pre_destroy(cgrp);
-			if (ret)
-				break;
+	for_each_subsys(cgrp->root, ss) {
+		if (!ss->pre_destroy)
+			continue;
+
+		ret = ss->pre_destroy(cgrp);
+		if (ret) {
+			/* ->pre_destroy() failure is being deprecated */
+			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+			break;
 		}
+	}
 
 	return ret;
 }
@@ -852,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		mutex_unlock(&cgroup_mutex);
 
 		/*
-		 * Drop the active superblock reference that we took when we
-		 * created the cgroup
+		 * We want to drop the active superblock reference from the
+		 * cgroup creation after all the dentry refs are gone -
+		 * kill_sb gets mighty unhappy otherwise. Mark
+		 * dentry->d_fsdata with cgroup_diput() to tell
+		 * cgroup_d_release() to call deactivate_super().
 		 */
-		deactivate_super(cgrp->root->sb);
+		dentry->d_fsdata = cgroup_diput;
 
 		/*
 		 * if we're getting rid of the cgroup, refcount should ensure
@@ -864,6 +911,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
 		kfree_rcu(cgrp, rcu_head);
+	} else {
+		struct cfent *cfe = __d_cfe(dentry);
+		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+
+		WARN_ONCE(!list_empty(&cfe->node) &&
+			  cgrp != &cgrp->root->top_cgroup,
+			  "cfe still linked for %s\n", cfe->type->name);
+		kfree(cfe);
 	}
 	iput(inode);
 }
@@ -873,6 +928,13 @@ static int cgroup_delete(const struct dentry *d)
 	return 1;
 }
 
+static void cgroup_d_release(struct dentry *dentry)
+{
+	/* did cgroup_diput() tell me to deactivate super? */
+	if (dentry->d_fsdata == cgroup_diput)
+		deactivate_super(dentry->d_sb);
+}
+
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -882,34 +944,36 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
+static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
-	struct list_head *node;
+	struct cfent *cfe;
 
-	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dentry->d_lock);
-	node = dentry->d_subdirs.next;
-	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
+
+	list_for_each_entry(cfe, &cgrp->files, node) {
+		struct dentry *d = cfe->dentry;
 
-		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		list_del_init(node);
-		if (d->d_inode) {
-			/* This should never be called on a cgroup
-			 * directory with child cgroups */
-			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			spin_unlock(&dentry->d_lock);
-			d_delete(d);
-			simple_unlink(dentry->d_inode, d);
-			dput(d);
-			spin_lock(&dentry->d_lock);
-		} else
-			spin_unlock(&d->d_lock);
-		node = dentry->d_subdirs.next;
+		if (cft && cfe->type != cft)
+			continue;
+
+		dget(d);
+		d_delete(d);
+		simple_unlink(d->d_inode, d);
+		list_del_init(&cfe->node);
+		dput(d);
+
+		return 0;
 	}
-	spin_unlock(&dentry->d_lock);
+	return -ENOENT;
+}
+
+static void cgroup_clear_directory(struct dentry *dir)
+{
+	struct cgroup *cgrp = __d_cgrp(dir);
+
+	while (!list_empty(&cgrp->files))
+		cgroup_rm_file(cgrp, NULL);
 }
 
 /*
@@ -1294,6 +1358,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
+	/* See feature-removal-schedule.txt */
+	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+			   task_tgid_nr(current), current->comm);
+
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1377,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/* (re)populate subsystem files */
+	/* clear out any existing files and repopulate subsystem files */
+	cgroup_clear_directory(cgrp->dentry);
 	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
@@ -1333,6 +1403,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1415,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
+
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	init_cgroup_housekeeping(cgrp);
 }
 
@@ -1468,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
 	static const struct dentry_operations cgroup_dops = {
 		.d_iput = cgroup_diput,
 		.d_delete = cgroup_delete,
+		.d_release = cgroup_d_release,
 	};
 
 	struct inode *inode =
@@ -1692,16 +1767,6 @@ static struct file_system_type cgroup_fs_type = {
 
 static struct kobject *cgroup_kobj;
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
 /**
  * cgroup_path - generate the path of a cgroup
  * @cgrp: the cgroup in question
@@ -2160,9 +2225,9 @@ retry_find_task:
 		 * only need to check permissions on one of them.
 		 */
 		tcred = __task_cred(tsk);
-		if (cred->euid &&
-		    cred->euid != tcred->uid &&
-		    cred->euid != tcred->suid) {
+		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+		    !uid_eq(cred->euid, tcred->uid) &&
+		    !uid_eq(cred->euid, tcred->suid)) {
 			rcu_read_unlock();
 			ret = -EACCES;
 			goto out_unlock_cgroup;
@@ -2172,6 +2237,18 @@ retry_find_task:
 
 	if (threadgroup)
 		tsk = tsk->group_leader;
+
+	/*
+	 * Workqueue threads may acquire PF_THREAD_BOUND and become
+	 * trapped in a cpuset, or RT worker may be born in a cgroup
+	 * with no rt_runtime allocated. Just say no.
+	 */
+	if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+		ret = -EINVAL;
+		rcu_read_unlock();
+		goto out_unlock_cgroup;
+	}
+
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
@@ -2603,50 +2680,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			   const struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
+	struct cgroup *parent = __d_cgrp(dir);
 	struct dentry *dentry;
+	struct cfent *cfe;
 	int error;
 	umode_t mode;
-
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+	/* does @cft->flags tell us to skip creation on @cgrp? */
+	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+		return 0;
+	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+		return 0;
+
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
 	}
 	strcat(name, cft->name);
+
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+
+	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+	if (!cfe)
+		return -ENOMEM;
+
 	dentry = lookup_one_len(name, dir, strlen(name));
-	if (!IS_ERR(dentry)) {
-		mode = cgroup_file_mode(cft);
-		error = cgroup_create_file(dentry, mode | S_IFREG,
-						cgrp->root->sb);
-		if (!error)
-			dentry->d_fsdata = (void *)cft;
-		dput(dentry);
-	} else
+	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
+		goto out;
+	}
+
+	mode = cgroup_file_mode(cft);
+	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
+	if (!error) {
+		cfe->type = (void *)cft;
+		cfe->dentry = dentry;
+		dentry->d_fsdata = cfe;
+		list_add_tail(&cfe->node, &parent->files);
+		cfe = NULL;
+	}
+	dput(dentry);
+out:
+	kfree(cfe);
 	return error;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count)
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      const struct cftype cfts[], bool is_add)
 {
-	int i, err;
-	for (i = 0; i < count; i++) {
-		err = cgroup_add_file(cgrp, subsys, &cft[i]);
-		if (err)
-			return err;
+	const struct cftype *cft;
+	int err, ret = 0;
+
+	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		if (is_add)
+			err = cgroup_add_file(cgrp, subsys, cft);
+		else
+			err = cgroup_rm_file(cgrp, cft);
+		if (err) {
+			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
+				   is_add ? "add" : "remove", cft->name, err);
+			ret = err;
+		}
 	}
+	return ret;
+}
+
+static DEFINE_MUTEX(cgroup_cft_mutex);
+
+static void cgroup_cfts_prepare(void)
+	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+{
+	/*
+	 * Thanks to the entanglement with vfs inode locking, we can't walk
+	 * the existing cgroups under cgroup_mutex and create files.
+	 * Instead, we increment reference on all cgroups and build list of
+	 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
+	 * exclusive access to the field.
+	 */
+	mutex_lock(&cgroup_cft_mutex);
+	mutex_lock(&cgroup_mutex);
+}
+
+static void cgroup_cfts_commit(struct cgroup_subsys *ss,
+			       const struct cftype *cfts, bool is_add)
+	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+{
+	LIST_HEAD(pending);
+	struct cgroup *cgrp, *n;
+
+	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
+	if (cfts && ss->root != &rootnode) {
+		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
+			dget(cgrp->dentry);
+			list_add_tail(&cgrp->cft_q_node, &pending);
+		}
+	}
+
+	mutex_unlock(&cgroup_mutex);
+
+	/*
+	 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
+	 * files for all cgroups which were created before.
+	 */
+	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
+		struct inode *inode = cgrp->dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&cgroup_mutex);
+		if (!cgroup_is_removed(cgrp))
+			cgroup_addrm_files(cgrp, ss, cfts, is_add);
+		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
+
+		list_del_init(&cgrp->cft_q_node);
+		dput(cgrp->dentry);
+	}
+
+	mutex_unlock(&cgroup_cft_mutex);
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	set = kzalloc(sizeof(*set), GFP_KERNEL);
+	if (!set)
+		return -ENOMEM;
+
+	cgroup_cfts_prepare();
+	set->cfts = cfts;
+	list_add_tail(&set->node, &ss->cftsets);
+	cgroup_cfts_commit(ss, cfts, true);
+
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_files);
+EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts from @ss. Files described by @cfts are removed from
+ * all existing cgroups to which @ss is attached and all future cgroups
+ * won't have them either. This function can be called anytime whether @ss
+ * is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered with @ss.
+ */
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	cgroup_cfts_prepare();
+
+	list_for_each_entry(set, &ss->cftsets, node) {
+		if (set->cfts == cfts) {
+			list_del_init(&set->node);
+			cgroup_cfts_commit(ss, cfts, false);
+			return 0;
+		}
+	}
+
+	cgroup_cfts_commit(ss, NULL, false);
+	return -ENOENT;
+}
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3625,13 +3843,14 @@ static struct cftype files[] = {
 		.read_u64 = cgroup_clone_children_read,
 		.write_u64 = cgroup_clone_children_write,
 	},
-};
-
-static struct cftype cft_release_agent = {
-	.name = "release_agent",
-	.read_seq_string = cgroup_release_agent_show,
-	.write_string = cgroup_release_agent_write,
-	.max_write_len = PATH_MAX,
+	{
+		.name = "release_agent",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cgroup_release_agent_show,
+		.write_string = cgroup_release_agent_write,
+		.max_write_len = PATH_MAX,
+	},
+	{ } /* terminate */
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3639,22 +3858,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	/* First clear out any existing files */
-	cgroup_clear_directory(cgrp->dentry);
-
-	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+	err = cgroup_addrm_files(cgrp, NULL, files, true);
 	if (err < 0)
 		return err;
 
-	if (cgrp == cgrp->top_cgroup) {
-		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
-			return err;
-	}
-
+	/* process cftsets of each subsystem */
 	for_each_subsys(cgrp->root, ss) {
-		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-			return err;
+		struct cftype_set *set;
+
+		list_for_each_entry(set, &ss->cftsets, node)
+			cgroup_addrm_files(cgrp, ss, set->cfts, true);
 	}
+
 	/* This cgroup is ready now */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3670,6 +3885,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	return 0;
 }
 
+static void css_dput_fn(struct work_struct *work)
+{
+	struct cgroup_subsys_state *css =
+		container_of(work, struct cgroup_subsys_state, dput_work);
+
+	dput(css->cgroup->dentry);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys *ss,
 			    struct cgroup *cgrp)
@@ -3682,6 +3905,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
+
+	/*
+	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+	 * which is put on the last css_put(). dput() requires process
+	 * context, which css_put() may be called without. @css->dput_work
+	 * will be used to invoke dput() asynchronously from css_put().
+	 */
+	INIT_WORK(&css->dput_work, css_dput_fn);
+	if (ss->__DEPRECATED_clear_css_refs)
+		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3784,9 +4017,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
+	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		if (!ss->__DEPRECATED_clear_css_refs)
+			dget(dentry);
+
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+
 	err = cgroup_populate_dir(cgrp);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 
@@ -3826,18 +4066,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+/*
+ * Check the reference count on each subsystem. Since we already
+ * established that there are no tasks in the cgroup, if the css refcount
+ * is also 1, then there should be no outstanding references, so the
+ * subsystem is safe to destroy. We scan across all subsystems rather than
+ * using the per-hierarchy linked list of mounted subsystems since we can
+ * be called via check_for_release() with no synchronization other than
+ * RCU, and the subsystem linked list isn't RCU-safe.
+ */
 static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
-	/* Check the reference count on each subsystem. Since we
-	 * already established that there are no tasks in the
-	 * cgroup, if the css refcount is also 1, then there should
-	 * be no outstanding references, so the subsystem is safe to
-	 * destroy. We scan across all subsystems rather than using
-	 * the per-hierarchy linked list of mounted subsystems since
-	 * we can be called via check_for_release() with no
-	 * synchronization other than RCU, and the subsystem linked
-	 * list isn't RCU-safe */
 	int i;
+
 	/*
 	 * We won't need to lock the subsys array, because the subsystems
 	 * we're concerned about aren't going anywhere since our cgroup root
@@ -3846,17 +4087,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		struct cgroup_subsys_state *css;
+
 		/* Skip subsystems not present or not in this hierarchy */
 		if (ss == NULL || ss->root != cgrp->root)
 			continue;
+
 		css = cgrp->subsys[ss->subsys_id];
-		/* When called from check_for_release() it's possible
+		/*
+		 * When called from check_for_release() it's possible
 		 * that by this point the cgroup has been removed
 		 * and the css deleted. But a false-positive doesn't
 		 * matter, since it can only happen if the cgroup
 		 * has been deleted and hence no longer needs the
-		 * release agent to be called anyway. */
-		if (css && (atomic_read(&css->refcnt) > 1))
+		 * release agent to be called anyway.
+		 */
+		if (css && css_refcnt(css) > 1)
 			return 1;
 	}
 	return 0;
@@ -3866,51 +4111,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  * Atomically mark all (or else none) of the cgroup's CSS objects as
  * CSS_REMOVED. Return true on success, or false if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed. This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation. This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
  */
-
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
 	unsigned long flags;
 	bool failed = false;
+
 	local_irq_save(flags);
+
+	/*
+	 * Block new css_tryget() by deactivating refcnt. If all refcnts
+	 * for subsystems w/ clear_css_refs set were 1 at the moment of
+	 * deactivation, we succeeded.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		int refcnt;
-		while (1) {
-			/* We can only remove a CSS with a refcnt==1 */
-			refcnt = atomic_read(&css->refcnt);
-			if (refcnt > 1) {
-				failed = true;
-				goto done;
-			}
-			BUG_ON(!refcnt);
-			/*
-			 * Drop the refcnt to 0 while we check other
-			 * subsystems. This will cause any racing
-			 * css_tryget() to spin until we set the
-			 * CSS_REMOVED bits or abort
-			 */
-			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
-				break;
-			cpu_relax();
-		}
+
+		WARN_ON(atomic_read(&css->refcnt) < 0);
+		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+
+		if (ss->__DEPRECATED_clear_css_refs)
+			failed |= css_refcnt(css) != 1;
 	}
-done:
+
+	/*
+	 * If succeeded, set REMOVED and put all the base refs; otherwise,
+	 * restore refcnts to positive values. Either way, all in-progress
+	 * css_tryget() will be released.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		if (failed) {
-			/*
-			 * Restore old refcnt if we previously managed
-			 * to clear it from 1 to 0
-			 */
-			if (!atomic_read(&css->refcnt))
-				atomic_set(&css->refcnt, 1);
-		} else {
-			/* Commit the fact that the CSS is removed */
+
+		if (!failed) {
 			set_bit(CSS_REMOVED, &css->flags);
+			css_put(css);
+		} else {
+			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
 		}
 	}
+
 	local_irq_restore(flags);
 	return !failed;
 }
@@ -3995,6 +4252,8 @@ again:
 	list_del_init(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
+	list_del_init(&cgrp->allcg_node);
+
 	d = dget(cgrp->dentry);
 
 	cgroup_d_remove_dir(d);
@@ -4021,12 +4280,29 @@ again:
 	return 0;
 }
 
+static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
+{
+	INIT_LIST_HEAD(&ss->cftsets);
+
+	/*
+	 * base_cftset is embedded in subsys itself, no need to worry about
+	 * deregistration.
+	 */
+	if (ss->base_cftypes) {
+		ss->base_cftset.cfts = ss->base_cftypes;
+		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
+	}
+}
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
@@ -4096,6 +4372,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 		return 0;
 	}
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/*
 	 * need to register a subsys id before anything else - for example,
 	 * init_cgroup_css needs it.
@@ -4685,21 +4964,41 @@ static void check_for_release(struct cgroup *cgrp)
 }
 
 /* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+	do {
+		int v = css_refcnt(css);
+
+		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+			return true;
+		cpu_relax();
+	} while (!test_bit(CSS_REMOVED, &css->flags));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(__css_tryget);
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
-	int val;
+
 	rcu_read_lock();
-	val = atomic_sub_return(count, &css->refcnt);
-	if (val == 1) {
+	atomic_dec(&css->refcnt);
+	switch (css_refcnt(css)) {
+	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
 		cgroup_wakeup_rmdir_waiter(cgrp);
+		break;
+	case 0:
+		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+			schedule_work(&css->dput_work);
+		break;
 	}
 	rcu_read_unlock();
-	WARN_ON_ONCE(val < 1);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
@@ -4818,7 +5117,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->id;
@@ -4830,7 +5129,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 {
 	struct css_id *cssid;
 
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->depth;
@@ -4844,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth);
  * @root: the css supporsed to be an ancestor of the child.
  *
  * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * this function reads css->id, the caller must hold rcu_read_lock().
 * But, considering usual usage, the csses should be valid objects after test.
 * Assuming that the caller will do some action to the child if this returns
 * returns true, the caller must take "child";s reference count.
@@ -4856,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 {
 	struct css_id *child_id;
 	struct css_id *root_id;
-	bool ret = true;
 
-	rcu_read_lock();
 	child_id = rcu_dereference(child->id);
+	if (!child_id)
+		return false;
 	root_id = rcu_dereference(root->id);
-	if (!child_id
-	    || !root_id
-	    || (child_id->depth < root_id->depth)
-	    || (child_id->stack[root_id->depth] != root_id->id))
-		ret = false;
-	rcu_read_unlock();
-	return ret;
+	if (!root_id)
+		return false;
+	if (child_id->depth < root_id->depth)
+		return false;
+	if (child_id->stack[root_id->depth] != root_id->id)
+		return false;
+	return true;
 }
 
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
@@ -5211,19 +5510,15 @@ static struct cftype debug_files[] = {
 		.name = "releasable",
 		.read_u64 = releasable_read,
 	},
-};
 
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	return cgroup_add_files(cont, ss, debug_files,
-				ARRAY_SIZE(debug_files));
-}
+	{ } /* terminate */
+};
 
 struct cgroup_subsys debug_subsys = {
 	.name = "debug",
 	.create = debug_create,
 	.destroy = debug_destroy,
-	.populate = debug_populate,
 	.subsys_id = debug_subsys_id,
+	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
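The cgroup_add_cftypes()/base_cftypes interface introduced above replaces per-subsystem ->populate() callbacks; the debug controller conversion at the end of this file and the freezer conversion below follow the same pattern. A minimal sketch of what a controller built on this interface looks like (the names, the read handler, and the hypothetical example_subsys are made up for illustration, not taken from this commit):

	/* Hypothetical controller file list, terminated by an empty entry. */
	static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft)
	{
		return 0;	/* placeholder value */
	}

	static struct cftype example_files[] = {
		{
			.name = "example.value",
			.read_u64 = example_read_u64,
		},
		{ }	/* terminate */
	};

A built-in controller would point .base_cftypes at example_files in its cgroup_subsys, while a loadable piece could instead call cgroup_add_cftypes(&example_subsys, example_files) at registration time and cgroup_rm_cftypes() on teardown.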
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
 static struct cftype files[] = {
 	{
 		.name = "state",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = freezer_read,
 		.write_string = freezer_write,
 	},
+	{ } /* terminate */
 };
 
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
-	if (!cgroup->parent)
-		return 0;
-	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
 struct cgroup_subsys freezer_subsys = {
 	.name = "freezer",
 	.create = freezer_create,
 	.destroy = freezer_destroy,
-	.populate = freezer_populate,
 	.subsys_id = freezer_subsys_id,
 	.can_attach = freezer_can_attach,
 	.fork = freezer_fork,
+	.base_cftypes = files,
 };
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..c28a306ae05c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
 
 #ifdef __ARCH_WANT_SYS_SIGPROCMASK
 
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
-		compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
 {
-	old_sigset_t s;
-	long ret;
-	mm_segment_t old_fs;
+	memcpy(blocked->sig, &set, sizeof(set));
+}
 
-	if (set && get_user(s, set))
-		return -EFAULT;
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = sys_sigprocmask(how,
-			      set ? (old_sigset_t __user *) &s : NULL,
-			      oset ? (old_sigset_t __user *) &s : NULL);
-	set_fs(old_fs);
-	if (ret == 0)
-		if (oset)
-			ret = put_user(s, oset);
-	return ret;
+asmlinkage long compat_sys_sigprocmask(int how,
+				       compat_old_sigset_t __user *nset,
+				       compat_old_sigset_t __user *oset)
+{
+	old_sigset_t old_set, new_set;
+	sigset_t new_blocked;
+
+	old_set = current->blocked.sig[0];
+
+	if (nset) {
+		if (get_user(new_set, nset))
+			return -EFAULT;
+		new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+		new_blocked = current->blocked;
+
+		switch (how) {
+		case SIG_BLOCK:
+			sigaddsetmask(&new_blocked, new_set);
+			break;
+		case SIG_UNBLOCK:
+			sigdelsetmask(&new_blocked, new_set);
+			break;
+		case SIG_SETMASK:
+			compat_sig_setmask(&new_blocked, new_set);
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		set_current_blocked(&new_blocked);
+	}
+
+	if (oset) {
+		if (put_user(old_set, oset))
+			return -EFAULT;
+	}
+
+	return 0;
 }
 
 #endif
@@ -1044,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
 		return -EFAULT;
 	sigset_from_compat(&newset, &newset32);
-	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
-
-	current->saved_sigmask = current->blocked;
-	set_current_blocked(&newset);
-
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	set_restore_sigmask();
-	return -ERESTARTNOHAND;
+	return sigsuspend(&newset);
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
 
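The rewritten compat_sys_sigprocmask() above drops the set_fs(KERNEL_DS) round-trip through sys_sigprocmask() and applies the usual SIG_BLOCK/SIG_UNBLOCK/SIG_SETMASK semantics directly to current->blocked. For reference, the userspace-visible behaviour being preserved is the standard sigprocmask() contract (illustrative userspace snippet, not kernel code):

	#include <signal.h>

	static void example_block_sigint(void)
	{
		sigset_t new_mask, old_mask;

		sigemptyset(&new_mask);
		sigaddset(&new_mask, SIGINT);
		sigprocmask(SIG_BLOCK, &new_mask, &old_mask);	/* add to blocked set */
		/* ... critical section ... */
		sigprocmask(SIG_SETMASK, &old_mask, NULL);	/* replace blocked set with the saved one */
	}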
diff --git a/kernel/cpu.c b/kernel/cpu.c index 2060c6e57027..a4eb5227a19e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -10,13 +10,18 @@ | |||
10 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
11 | #include <linux/unistd.h> | 11 | #include <linux/unistd.h> |
12 | #include <linux/cpu.h> | 12 | #include <linux/cpu.h> |
13 | #include <linux/oom.h> | ||
14 | #include <linux/rcupdate.h> | ||
13 | #include <linux/export.h> | 15 | #include <linux/export.h> |
16 | #include <linux/bug.h> | ||
14 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 18 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
17 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> |
18 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> |
19 | 22 | ||
23 | #include "smpboot.h" | ||
24 | |||
20 | #ifdef CONFIG_SMP | 25 | #ifdef CONFIG_SMP |
21 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 26 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
22 | static DEFINE_MUTEX(cpu_add_remove_lock); | 27 | static DEFINE_MUTEX(cpu_add_remove_lock); |
@@ -171,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) | |||
171 | } | 176 | } |
172 | EXPORT_SYMBOL(unregister_cpu_notifier); | 177 | EXPORT_SYMBOL(unregister_cpu_notifier); |
173 | 178 | ||
179 | /** | ||
180 | * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU | ||
181 | * @cpu: a CPU id | ||
182 | * | ||
183 | * This function walks all processes, finds a valid mm struct for each one and | ||
184 | * then clears a corresponding bit in mm's cpumask. While this all sounds | ||
185 | * trivial, there are various non-obvious corner cases, which this function | ||
186 | * tries to solve in a safe manner. | ||
187 | * | ||
188 | * Also note that the function uses a somewhat relaxed locking scheme, so it may | ||
189 | * be called only for an already offlined CPU. | ||
190 | */ | ||
191 | void clear_tasks_mm_cpumask(int cpu) | ||
192 | { | ||
193 | struct task_struct *p; | ||
194 | |||
195 | /* | ||
196 | * This function is called after the cpu is taken down and marked | ||
197 | * offline, so it's not like new tasks will ever get this cpu set in | ||
198 | * their mm mask. -- Peter Zijlstra | ||
199 | * Thus, we may use rcu_read_lock() here, instead of grabbing | ||
200 | * full-fledged tasklist_lock. | ||
201 | */ | ||
202 | WARN_ON(cpu_online(cpu)); | ||
203 | rcu_read_lock(); | ||
204 | for_each_process(p) { | ||
205 | struct task_struct *t; | ||
206 | |||
207 | /* | ||
208 | * Main thread might exit, but other threads may still have | ||
209 | * a valid mm. Find one. | ||
210 | */ | ||
211 | t = find_lock_task_mm(p); | ||
212 | if (!t) | ||
213 | continue; | ||
214 | cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); | ||
215 | task_unlock(t); | ||
216 | } | ||
217 | rcu_read_unlock(); | ||
218 | } | ||
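A hedged sketch of how an architecture's hotplug teardown might call the new helper; the hook name and the trailing TLB/cache cleanup are assumptions for illustration, only clear_tasks_mm_cpumask() itself comes from this patch:

/* Hypothetical arch-side teardown, run once 'cpu' is already offline. */
static void example_cpu_post_offline(unsigned int cpu)
{
	WARN_ON(cpu_online(cpu));	/* same precondition the helper checks */
	clear_tasks_mm_cpumask(cpu);	/* drop this CPU from every task's mm_cpumask */
	/* arch-specific TLB/cache cleanup for the dead CPU would follow */
}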
219 | |||
174 | static inline void check_for_tasks(int cpu) | 220 | static inline void check_for_tasks(int cpu) |
175 | { | 221 | { |
176 | struct task_struct *p; | 222 | struct task_struct *p; |
@@ -295,11 +341,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
295 | int ret, nr_calls = 0; | 341 | int ret, nr_calls = 0; |
296 | void *hcpu = (void *)(long)cpu; | 342 | void *hcpu = (void *)(long)cpu; |
297 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 343 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
344 | struct task_struct *idle; | ||
298 | 345 | ||
299 | if (cpu_online(cpu) || !cpu_present(cpu)) | 346 | if (cpu_online(cpu) || !cpu_present(cpu)) |
300 | return -EINVAL; | 347 | return -EINVAL; |
301 | 348 | ||
302 | cpu_hotplug_begin(); | 349 | cpu_hotplug_begin(); |
350 | |||
351 | idle = idle_thread_get(cpu); | ||
352 | if (IS_ERR(idle)) { | ||
353 | ret = PTR_ERR(idle); | ||
354 | goto out; | ||
355 | } | ||
356 | |||
303 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 357 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
304 | if (ret) { | 358 | if (ret) { |
305 | nr_calls--; | 359 | nr_calls--; |
@@ -309,7 +363,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
309 | } | 363 | } |
310 | 364 | ||
311 | /* Arch-specific enabling code. */ | 365 | /* Arch-specific enabling code. */ |
312 | ret = __cpu_up(cpu); | 366 | ret = __cpu_up(cpu, idle); |
313 | if (ret != 0) | 367 | if (ret != 0) |
314 | goto out_notify; | 368 | goto out_notify; |
315 | BUG_ON(!cpu_online(cpu)); | 369 | BUG_ON(!cpu_online(cpu)); |
@@ -320,6 +374,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
320 | out_notify: | 374 | out_notify: |
321 | if (ret != 0) | 375 | if (ret != 0) |
322 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 376 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
377 | out: | ||
323 | cpu_hotplug_done(); | 378 | cpu_hotplug_done(); |
324 | 379 | ||
325 | return ret; | 380 | return ret; |
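The idle-thread change above follows the usual error-pointer convention: idle_thread_get() returns either a valid task_struct or an errno encoded with ERR_PTR(), and the caller unwraps it with IS_ERR()/PTR_ERR(). A condensed sketch of just that flow (error labels simplified relative to the full function):

	struct task_struct *idle;

	idle = idle_thread_get(cpu);	/* task pointer or ERR_PTR(-errno) */
	if (IS_ERR(idle))
		return PTR_ERR(idle);	/* recover the negative errno */

	return __cpu_up(cpu, idle);	/* arch code now receives the idle task */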
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 249152e15308..9656a3c36503 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c | |||
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb) | |||
81 | EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); | 81 | EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); |
82 | 82 | ||
83 | /** | 83 | /** |
84 | * cpm_pm_enter - CPU low power entry notifier | 84 | * cpu_pm_enter - CPU low power entry notifier |
85 | * | 85 | * |
86 | * Notifies listeners that a single CPU is entering a low power state that may | 86 | * Notifies listeners that a single CPU is entering a low power state that may |
87 | * cause some blocks in the same power domain as the cpu to reset. | 87 | * cause some blocks in the same power domain as the cpu to reset. |
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); | |||
89 | * Must be called on the affected CPU with interrupts disabled. Platform is | 89 | * Must be called on the affected CPU with interrupts disabled. Platform is |
90 | * responsible for ensuring that cpu_pm_enter is not called twice on the same | 90 | * responsible for ensuring that cpu_pm_enter is not called twice on the same |
91 | * CPU before cpu_pm_exit is called. Notified drivers can include VFP | 91 | * CPU before cpu_pm_exit is called. Notified drivers can include VFP |
92 | * co-processor, interrupt controller and it's PM extensions, local CPU | 92 | * co-processor, interrupt controller and its PM extensions, local CPU |
93 | * timers context save/restore which shouldn't be interrupted. Hence it | 93 | * timers context save/restore which shouldn't be interrupted. Hence it |
94 | * must be called with interrupts disabled. | 94 | * must be called with interrupts disabled. |
95 | * | 95 | * |
@@ -115,13 +115,13 @@ int cpu_pm_enter(void) | |||
115 | EXPORT_SYMBOL_GPL(cpu_pm_enter); | 115 | EXPORT_SYMBOL_GPL(cpu_pm_enter); |
116 | 116 | ||
117 | /** | 117 | /** |
118 | * cpm_pm_exit - CPU low power exit notifier | 118 | * cpu_pm_exit - CPU low power exit notifier |
119 | * | 119 | * |
120 | * Notifies listeners that a single CPU is exiting a low power state that may | 120 | * Notifies listeners that a single CPU is exiting a low power state that may |
121 | * have caused some blocks in the same power domain as the cpu to reset. | 121 | * have caused some blocks in the same power domain as the cpu to reset. |
122 | * | 122 | * |
123 | * Notified drivers can include VFP co-processor, interrupt controller | 123 | * Notified drivers can include VFP co-processor, interrupt controller |
124 | * and it's PM extensions, local CPU timers context save/restore which | 124 | * and its PM extensions, local CPU timers context save/restore which |
125 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 125 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
126 | * | 126 | * |
127 | * Return conditions are same as __raw_notifier_call_chain. | 127 | * Return conditions are same as __raw_notifier_call_chain. |
@@ -139,7 +139,7 @@ int cpu_pm_exit(void) | |||
139 | EXPORT_SYMBOL_GPL(cpu_pm_exit); | 139 | EXPORT_SYMBOL_GPL(cpu_pm_exit); |
140 | 140 | ||
141 | /** | 141 | /** |
142 | * cpm_cluster_pm_enter - CPU cluster low power entry notifier | 142 | * cpu_cluster_pm_enter - CPU cluster low power entry notifier |
143 | * | 143 | * |
144 | * Notifies listeners that all cpus in a power domain are entering a low power | 144 | * Notifies listeners that all cpus in a power domain are entering a low power |
145 | * state that may cause some blocks in the same power domain to reset. | 145 | * state that may cause some blocks in the same power domain to reset. |
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); | |||
147 | * Must be called after cpu_pm_enter has been called on all cpus in the power | 147 | * Must be called after cpu_pm_enter has been called on all cpus in the power |
148 | * domain, and before cpu_pm_exit has been called on any cpu in the power | 148 | * domain, and before cpu_pm_exit has been called on any cpu in the power |
149 | * domain. Notified drivers can include VFP co-processor, interrupt controller | 149 | * domain. Notified drivers can include VFP co-processor, interrupt controller |
150 | * and it's PM extensions, local CPU timers context save/restore which | 150 | * and its PM extensions, local CPU timers context save/restore which |
151 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 151 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
152 | * | 152 | * |
153 | * Must be called with interrupts disabled. | 153 | * Must be called with interrupts disabled. |
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void) | |||
174 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | 174 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); |
175 | 175 | ||
176 | /** | 176 | /** |
177 | * cpm_cluster_pm_exit - CPU cluster low power exit notifier | 177 | * cpu_cluster_pm_exit - CPU cluster low power exit notifier |
178 | * | 178 | * |
179 | * Notifies listeners that all cpus in a power domain are exiting from a | 179 | * Notifies listeners that all cpus in a power domain are exiting from a |
180 | * low power state that may have caused some blocks in the same power domain | 180 | * low power state that may have caused some blocks in the same power domain |
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | |||
183 | * Must be called after cpu_pm_exit has been called on all cpus in the power | 183 | * Must be called after cpu_pm_exit has been called on all cpus in the power |
184 | * domain, and before cpu_pm_exit has been called on any cpu in the power | 184 | * domain, and before cpu_pm_exit has been called on any cpu in the power |
185 | * domain. Notified drivers can include VFP co-processor, interrupt controller | 185 | * domain. Notified drivers can include VFP co-processor, interrupt controller |
186 | * and it's PM extensions, local CPU timers context save/restore which | 186 | * and its PM extensions, local CPU timers context save/restore which |
187 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 187 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
188 | * | 188 | * |
189 | * Return conditions are same as __raw_notifier_call_chain. | 189 | * Return conditions are same as __raw_notifier_call_chain. |
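The corrected kernel-doc above spells out the contract: cpu_pm_enter()/cpu_pm_exit() are per-CPU, must be paired, and must run with interrupts disabled, with the cluster variants bracketing them for a whole power domain. A hedged sketch of an idle path honouring that contract (the actual low-power call is a placeholder):

static void example_enter_lowpower(void)
{
	local_irq_disable();

	if (!cpu_pm_enter()) {		/* notify VFP, GIC and timer save/restore code */
		/* platform-specific "enter low power state" call goes here */
		cpu_pm_exit();		/* always paired on the same CPU */
	}

	local_irq_enable();
}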
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 14f7070b4ba2..8c8bd652dd12 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1765,28 +1765,17 @@ static struct cftype files[] = { | |||
1765 | .write_u64 = cpuset_write_u64, | 1765 | .write_u64 = cpuset_write_u64, |
1766 | .private = FILE_SPREAD_SLAB, | 1766 | .private = FILE_SPREAD_SLAB, |
1767 | }, | 1767 | }, |
1768 | }; | ||
1769 | |||
1770 | static struct cftype cft_memory_pressure_enabled = { | ||
1771 | .name = "memory_pressure_enabled", | ||
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }; | ||
1776 | 1768 | ||
1777 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 1769 | { |
1778 | { | 1770 | .name = "memory_pressure_enabled", |
1779 | int err; | 1771 | .flags = CFTYPE_ONLY_ON_ROOT, |
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }, | ||
1780 | 1776 | ||
1781 | err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 1777 | { } /* terminate */ |
1782 | if (err) | 1778 | }; |
1783 | return err; | ||
1784 | /* memory_pressure_enabled is in root cpuset only */ | ||
1785 | if (!cont->parent) | ||
1786 | err = cgroup_add_file(cont, ss, | ||
1787 | &cft_memory_pressure_enabled); | ||
1788 | return err; | ||
1789 | } | ||
1790 | 1779 | ||
1791 | /* | 1780 | /* |
1792 | * post_clone() is called during cgroup_create() when the | 1781 | * post_clone() is called during cgroup_create() when the |
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
1887 | .destroy = cpuset_destroy, | 1876 | .destroy = cpuset_destroy, |
1888 | .can_attach = cpuset_can_attach, | 1877 | .can_attach = cpuset_can_attach, |
1889 | .attach = cpuset_attach, | 1878 | .attach = cpuset_attach, |
1890 | .populate = cpuset_populate, | ||
1891 | .post_clone = cpuset_post_clone, | 1879 | .post_clone = cpuset_post_clone, |
1892 | .subsys_id = cpuset_subsys_id, | 1880 | .subsys_id = cpuset_subsys_id, |
1881 | .base_cftypes = files, | ||
1893 | .early_init = 1, | 1882 | .early_init = 1, |
1894 | }; | 1883 | }; |
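The cpuset conversion above shows the new cgroup file-registration style: one NULL-terminated cftype array, root-only files marked with CFTYPE_ONLY_ON_ROOT, and the array wired up through .base_cftypes instead of a .populate callback. A minimal sketch for a hypothetical controller (all names here are invented for illustration):

static struct cftype example_files[] = {
	{
		.name = "example.value",
		.read_u64 = example_read_u64,
		.write_u64 = example_write_u64,
	},
	{
		.name = "example.root_only",
		.flags = CFTYPE_ONLY_ON_ROOT,	/* created in the root cgroup only */
		.read_u64 = example_read_u64,
	},
	{ }	/* terminate */
};

struct cgroup_subsys example_subsys = {
	.name = "example",
	.base_cftypes = example_files,	/* replaces the old .populate hook */
};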
1895 | 1884 | ||
diff --git a/kernel/cred.c b/kernel/cred.c index e70683d9ec32..de728ac50d82 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -49,6 +49,14 @@ struct cred init_cred = { | |||
49 | .subscribers = ATOMIC_INIT(2), | 49 | .subscribers = ATOMIC_INIT(2), |
50 | .magic = CRED_MAGIC, | 50 | .magic = CRED_MAGIC, |
51 | #endif | 51 | #endif |
52 | .uid = GLOBAL_ROOT_UID, | ||
53 | .gid = GLOBAL_ROOT_GID, | ||
54 | .suid = GLOBAL_ROOT_UID, | ||
55 | .sgid = GLOBAL_ROOT_GID, | ||
56 | .euid = GLOBAL_ROOT_UID, | ||
57 | .egid = GLOBAL_ROOT_GID, | ||
58 | .fsuid = GLOBAL_ROOT_UID, | ||
59 | .fsgid = GLOBAL_ROOT_GID, | ||
52 | .securebits = SECUREBITS_DEFAULT, | 60 | .securebits = SECUREBITS_DEFAULT, |
53 | .cap_inheritable = CAP_EMPTY_SET, | 61 | .cap_inheritable = CAP_EMPTY_SET, |
54 | .cap_permitted = CAP_FULL_SET, | 62 | .cap_permitted = CAP_FULL_SET, |
@@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu) | |||
148 | if (cred->group_info) | 156 | if (cred->group_info) |
149 | put_group_info(cred->group_info); | 157 | put_group_info(cred->group_info); |
150 | free_uid(cred->user); | 158 | free_uid(cred->user); |
159 | put_user_ns(cred->user_ns); | ||
151 | kmem_cache_free(cred_jar, cred); | 160 | kmem_cache_free(cred_jar, cred); |
152 | } | 161 | } |
153 | 162 | ||
@@ -198,13 +207,6 @@ void exit_creds(struct task_struct *tsk) | |||
198 | validate_creds(cred); | 207 | validate_creds(cred); |
199 | alter_cred_subscribers(cred, -1); | 208 | alter_cred_subscribers(cred, -1); |
200 | put_cred(cred); | 209 | put_cred(cred); |
201 | |||
202 | cred = (struct cred *) tsk->replacement_session_keyring; | ||
203 | if (cred) { | ||
204 | tsk->replacement_session_keyring = NULL; | ||
205 | validate_creds(cred); | ||
206 | put_cred(cred); | ||
207 | } | ||
208 | } | 210 | } |
209 | 211 | ||
210 | /** | 212 | /** |
@@ -303,6 +305,7 @@ struct cred *prepare_creds(void) | |||
303 | set_cred_subscribers(new, 0); | 305 | set_cred_subscribers(new, 0); |
304 | get_group_info(new->group_info); | 306 | get_group_info(new->group_info); |
305 | get_uid(new->user); | 307 | get_uid(new->user); |
308 | get_user_ns(new->user_ns); | ||
306 | 309 | ||
307 | #ifdef CONFIG_KEYS | 310 | #ifdef CONFIG_KEYS |
308 | key_get(new->thread_keyring); | 311 | key_get(new->thread_keyring); |
@@ -386,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
386 | struct cred *new; | 389 | struct cred *new; |
387 | int ret; | 390 | int ret; |
388 | 391 | ||
389 | p->replacement_session_keyring = NULL; | ||
390 | |||
391 | if ( | 392 | if ( |
392 | #ifdef CONFIG_KEYS | 393 | #ifdef CONFIG_KEYS |
393 | !p->cred->thread_keyring && | 394 | !p->cred->thread_keyring && |
@@ -414,11 +415,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
414 | goto error_put; | 415 | goto error_put; |
415 | } | 416 | } |
416 | 417 | ||
417 | /* cache user_ns in cred. Doesn't need a refcount because it will | ||
418 | * stay pinned by cred->user | ||
419 | */ | ||
420 | new->user_ns = new->user->user_ns; | ||
421 | |||
422 | #ifdef CONFIG_KEYS | 418 | #ifdef CONFIG_KEYS |
423 | /* new threads get their own thread keyrings if their parent already | 419 | /* new threads get their own thread keyrings if their parent already |
424 | * had one */ | 420 | * had one */ |
@@ -493,10 +489,10 @@ int commit_creds(struct cred *new) | |||
493 | get_cred(new); /* we will require a ref for the subj creds too */ | 489 | get_cred(new); /* we will require a ref for the subj creds too */ |
494 | 490 | ||
495 | /* dumpability changes */ | 491 | /* dumpability changes */ |
496 | if (old->euid != new->euid || | 492 | if (!uid_eq(old->euid, new->euid) || |
497 | old->egid != new->egid || | 493 | !gid_eq(old->egid, new->egid) || |
498 | old->fsuid != new->fsuid || | 494 | !uid_eq(old->fsuid, new->fsuid) || |
499 | old->fsgid != new->fsgid || | 495 | !gid_eq(old->fsgid, new->fsgid) || |
500 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { | 496 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { |
501 | if (task->mm) | 497 | if (task->mm) |
502 | set_dumpable(task->mm, suid_dumpable); | 498 | set_dumpable(task->mm, suid_dumpable); |
@@ -505,9 +501,9 @@ int commit_creds(struct cred *new) | |||
505 | } | 501 | } |
506 | 502 | ||
507 | /* alter the thread keyring */ | 503 | /* alter the thread keyring */ |
508 | if (new->fsuid != old->fsuid) | 504 | if (!uid_eq(new->fsuid, old->fsuid)) |
509 | key_fsuid_changed(task); | 505 | key_fsuid_changed(task); |
510 | if (new->fsgid != old->fsgid) | 506 | if (!gid_eq(new->fsgid, old->fsgid)) |
511 | key_fsgid_changed(task); | 507 | key_fsgid_changed(task); |
512 | 508 | ||
513 | /* do it | 509 | /* do it |
@@ -524,16 +520,16 @@ int commit_creds(struct cred *new) | |||
524 | alter_cred_subscribers(old, -2); | 520 | alter_cred_subscribers(old, -2); |
525 | 521 | ||
526 | /* send notifications */ | 522 | /* send notifications */ |
527 | if (new->uid != old->uid || | 523 | if (!uid_eq(new->uid, old->uid) || |
528 | new->euid != old->euid || | 524 | !uid_eq(new->euid, old->euid) || |
529 | new->suid != old->suid || | 525 | !uid_eq(new->suid, old->suid) || |
530 | new->fsuid != old->fsuid) | 526 | !uid_eq(new->fsuid, old->fsuid)) |
531 | proc_id_connector(task, PROC_EVENT_UID); | 527 | proc_id_connector(task, PROC_EVENT_UID); |
532 | 528 | ||
533 | if (new->gid != old->gid || | 529 | if (!gid_eq(new->gid, old->gid) || |
534 | new->egid != old->egid || | 530 | !gid_eq(new->egid, old->egid) || |
535 | new->sgid != old->sgid || | 531 | !gid_eq(new->sgid, old->sgid) || |
536 | new->fsgid != old->fsgid) | 532 | !gid_eq(new->fsgid, old->fsgid)) |
537 | proc_id_connector(task, PROC_EVENT_GID); | 533 | proc_id_connector(task, PROC_EVENT_GID); |
538 | 534 | ||
539 | /* release the old obj and subj refs both */ | 535 | /* release the old obj and subj refs both */ |
@@ -678,6 +674,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
678 | atomic_set(&new->usage, 1); | 674 | atomic_set(&new->usage, 1); |
679 | set_cred_subscribers(new, 0); | 675 | set_cred_subscribers(new, 0); |
680 | get_uid(new->user); | 676 | get_uid(new->user); |
677 | get_user_ns(new->user_ns); | ||
681 | get_group_info(new->group_info); | 678 | get_group_info(new->group_info); |
682 | 679 | ||
683 | #ifdef CONFIG_KEYS | 680 | #ifdef CONFIG_KEYS |
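With uids and gids becoming the typed kuid_t/kgid_t, the credential fields above are compared through uid_eq()/gid_eq() rather than plain '=='. A small hedged sketch of the pattern (the check itself is purely illustrative):

static bool example_creds_are_root(const struct cred *cred)
{
	/* typed comparisons, as in the commit_creds() hunks above */
	return uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
	       gid_eq(cred->egid, GLOBAL_ROOT_GID);
}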
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 22d901f9caf4..103f5d147b2f 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile | |||
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg | |||
3 | endif | 3 | endif |
4 | 4 | ||
5 | obj-y := core.o ring_buffer.o callchain.o | 5 | obj-y := core.o ring_buffer.o callchain.o |
6 | |||
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 7 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
8 | obj-$(CONFIG_UPROBES) += uprobes.o | ||
9 | |||
diff --git a/kernel/events/core.c b/kernel/events/core.c index fd126f82b57c..5b06cbbf6931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | |||
4957 | if (rctx < 0) | 4957 | if (rctx < 0) |
4958 | return; | 4958 | return; |
4959 | 4959 | ||
4960 | perf_sample_data_init(&data, addr); | 4960 | perf_sample_data_init(&data, addr, 0); |
4961 | 4961 | ||
4962 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 4962 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
4963 | 4963 | ||
@@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5215 | .data = record, | 5215 | .data = record, |
5216 | }; | 5216 | }; |
5217 | 5217 | ||
5218 | perf_sample_data_init(&data, addr); | 5218 | perf_sample_data_init(&data, addr, 0); |
5219 | data.raw = &raw; | 5219 | data.raw = &raw; |
5220 | 5220 | ||
5221 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5221 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
@@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
5318 | struct perf_sample_data sample; | 5318 | struct perf_sample_data sample; |
5319 | struct pt_regs *regs = data; | 5319 | struct pt_regs *regs = data; |
5320 | 5320 | ||
5321 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5321 | perf_sample_data_init(&sample, bp->attr.bp_addr, 0); |
5322 | 5322 | ||
5323 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | 5323 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) |
5324 | perf_swevent_event(bp, 1, &sample, regs); | 5324 | perf_swevent_event(bp, 1, &sample, regs); |
@@ -5344,13 +5344,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5344 | 5344 | ||
5345 | event->pmu->read(event); | 5345 | event->pmu->read(event); |
5346 | 5346 | ||
5347 | perf_sample_data_init(&data, 0); | 5347 | perf_sample_data_init(&data, 0, event->hw.last_period); |
5348 | data.period = event->hw.last_period; | ||
5349 | regs = get_irq_regs(); | 5348 | regs = get_irq_regs(); |
5350 | 5349 | ||
5351 | if (regs && !perf_exclude_event(event, regs)) { | 5350 | if (regs && !perf_exclude_event(event, regs)) { |
5352 | if (!(event->attr.exclude_idle && is_idle_task(current))) | 5351 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5353 | if (perf_event_overflow(event, &data, regs)) | 5352 | if (__perf_event_overflow(event, 1, &data, regs)) |
5354 | ret = HRTIMER_NORESTART; | 5353 | ret = HRTIMER_NORESTART; |
5355 | } | 5354 | } |
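The perf change above folds the sample period into perf_sample_data_init(), so callers no longer assign data.period separately. A hedged sketch of the call-site pattern a PMU overflow handler would follow (the surrounding driver code is assumed):

static void example_pmu_overflow(struct perf_event *event, struct pt_regs *regs)
{
	struct perf_sample_data data;

	/* third argument is the period; previously this took two steps:
	 * perf_sample_data_init(&data, 0); data.period = ...; */
	perf_sample_data_init(&data, 0, event->hw.last_period);

	if (perf_event_overflow(event, &data, regs))
		;	/* the driver would stop or throttle the event here */
}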
5356 | 5355 | ||
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c new file mode 100644 index 000000000000..985be4d80fe8 --- /dev/null +++ b/kernel/events/uprobes.c | |||
@@ -0,0 +1,1667 @@ | |||
1 | /* | ||
2 | * User-space Probes (UProbes) | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2008-2012 | ||
19 | * Authors: | ||
20 | * Srikar Dronamraju | ||
21 | * Jim Keniston | ||
22 | * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
23 | */ | ||
24 | |||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/highmem.h> | ||
27 | #include <linux/pagemap.h> /* read_mapping_page */ | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/rmap.h> /* anon_vma_prepare */ | ||
31 | #include <linux/mmu_notifier.h> /* set_pte_at_notify */ | ||
32 | #include <linux/swap.h> /* try_to_free_swap */ | ||
33 | #include <linux/ptrace.h> /* user_enable_single_step */ | ||
34 | #include <linux/kdebug.h> /* notifier mechanism */ | ||
35 | |||
36 | #include <linux/uprobes.h> | ||
37 | |||
38 | #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) | ||
39 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE | ||
40 | |||
41 | static struct srcu_struct uprobes_srcu; | ||
42 | static struct rb_root uprobes_tree = RB_ROOT; | ||
43 | |||
44 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ | ||
45 | |||
46 | #define UPROBES_HASH_SZ 13 | ||
47 | |||
48 | /* serialize (un)register */ | ||
49 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | ||
50 | |||
51 | #define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | ||
52 | |||
53 | /* serialize uprobe->pending_list */ | ||
54 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | ||
55 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | ||
56 | |||
57 | /* | ||
58 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe | ||
59 | * events active at this time. Probably a fine grained per inode count is | ||
60 | * better? | ||
61 | */ | ||
62 | static atomic_t uprobe_events = ATOMIC_INIT(0); | ||
63 | |||
64 | /* | ||
65 | * Maintain a temporary per vma info that can be used to search if a vma | ||
66 | * has already been handled. This structure is introduced since extending | ||
67 | * vm_area_struct wasn't recommended. | ||
68 | */ | ||
69 | struct vma_info { | ||
70 | struct list_head probe_list; | ||
71 | struct mm_struct *mm; | ||
72 | loff_t vaddr; | ||
73 | }; | ||
74 | |||
75 | struct uprobe { | ||
76 | struct rb_node rb_node; /* node in the rb tree */ | ||
77 | atomic_t ref; | ||
78 | struct rw_semaphore consumer_rwsem; | ||
79 | struct list_head pending_list; | ||
80 | struct uprobe_consumer *consumers; | ||
81 | struct inode *inode; /* Also hold a ref to inode */ | ||
82 | loff_t offset; | ||
83 | int flags; | ||
84 | struct arch_uprobe arch; | ||
85 | }; | ||
86 | |||
87 | /* | ||
88 | * valid_vma: Verify if the specified vma is an executable vma | ||
89 | * Relax restrictions while unregistering: vm_flags might have | ||
90 | * changed after breakpoint was inserted. | ||
91 | * - is_register: indicates if we are in register context. | ||
92 | * - Return 1 if the specified virtual address is in an | ||
93 | * executable vma. | ||
94 | */ | ||
95 | static bool valid_vma(struct vm_area_struct *vma, bool is_register) | ||
96 | { | ||
97 | if (!vma->vm_file) | ||
98 | return false; | ||
99 | |||
100 | if (!is_register) | ||
101 | return true; | ||
102 | |||
103 | if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) | ||
104 | return true; | ||
105 | |||
106 | return false; | ||
107 | } | ||
108 | |||
109 | static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) | ||
110 | { | ||
111 | loff_t vaddr; | ||
112 | |||
113 | vaddr = vma->vm_start + offset; | ||
114 | vaddr -= vma->vm_pgoff << PAGE_SHIFT; | ||
115 | |||
116 | return vaddr; | ||
117 | } | ||
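vma_address() above inverts the file mapping: take the probe's file offset, subtract the file offset at which the vma begins (vm_pgoff, in pages), and add vm_start. A worked example with made-up numbers, assuming PAGE_SHIFT == 12:

/*
 *	vma->vm_start = 0x00400000
 *	vma->vm_pgoff = 0x10		(vma begins at file offset 0x10000)
 *	probe offset  = 0x10abc
 *
 *	vaddr = 0x00400000 + 0x10abc - (0x10 << 12)
 *	      = 0x00400000 + 0x10abc - 0x10000
 *	      = 0x00400abc
 */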
118 | |||
119 | /** | ||
120 | * __replace_page - replace page in vma by new page. | ||
121 | * based on replace_page in mm/ksm.c | ||
122 | * | ||
123 | * @vma: vma that holds the pte pointing to page | ||
124 | * @page: the cowed page we are replacing by kpage | ||
125 | * @kpage: the modified page we replace page by | ||
126 | * | ||
127 | * Returns 0 on success, -EFAULT on failure. | ||
128 | */ | ||
129 | static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) | ||
130 | { | ||
131 | struct mm_struct *mm = vma->vm_mm; | ||
132 | pgd_t *pgd; | ||
133 | pud_t *pud; | ||
134 | pmd_t *pmd; | ||
135 | pte_t *ptep; | ||
136 | spinlock_t *ptl; | ||
137 | unsigned long addr; | ||
138 | int err = -EFAULT; | ||
139 | |||
140 | addr = page_address_in_vma(page, vma); | ||
141 | if (addr == -EFAULT) | ||
142 | goto out; | ||
143 | |||
144 | pgd = pgd_offset(mm, addr); | ||
145 | if (!pgd_present(*pgd)) | ||
146 | goto out; | ||
147 | |||
148 | pud = pud_offset(pgd, addr); | ||
149 | if (!pud_present(*pud)) | ||
150 | goto out; | ||
151 | |||
152 | pmd = pmd_offset(pud, addr); | ||
153 | if (!pmd_present(*pmd)) | ||
154 | goto out; | ||
155 | |||
156 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | ||
157 | if (!ptep) | ||
158 | goto out; | ||
159 | |||
160 | get_page(kpage); | ||
161 | page_add_new_anon_rmap(kpage, vma, addr); | ||
162 | |||
163 | if (!PageAnon(page)) { | ||
164 | dec_mm_counter(mm, MM_FILEPAGES); | ||
165 | inc_mm_counter(mm, MM_ANONPAGES); | ||
166 | } | ||
167 | |||
168 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | ||
169 | ptep_clear_flush(vma, addr, ptep); | ||
170 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | ||
171 | |||
172 | page_remove_rmap(page); | ||
173 | if (!page_mapped(page)) | ||
174 | try_to_free_swap(page); | ||
175 | put_page(page); | ||
176 | pte_unmap_unlock(ptep, ptl); | ||
177 | err = 0; | ||
178 | |||
179 | out: | ||
180 | return err; | ||
181 | } | ||
182 | |||
183 | /** | ||
184 | * is_swbp_insn - check if instruction is breakpoint instruction. | ||
185 | * @insn: instruction to be checked. | ||
186 | * Default implementation of is_swbp_insn | ||
187 | * Returns true if @insn is a breakpoint instruction. | ||
188 | */ | ||
189 | bool __weak is_swbp_insn(uprobe_opcode_t *insn) | ||
190 | { | ||
191 | return *insn == UPROBE_SWBP_INSN; | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * NOTE: | ||
196 | * Expect the breakpoint instruction to be the smallest size instruction for | ||
197 | * the architecture. If an arch has variable length instruction and the | ||
198 | * breakpoint instruction is not of the smallest length instruction | ||
199 | * supported by that architecture then we need to modify read_opcode / | ||
200 | * write_opcode accordingly. This would never be a problem for archs that | ||
201 | * have fixed length instructions. | ||
202 | */ | ||
203 | |||
204 | /* | ||
205 | * write_opcode - write the opcode at a given virtual address. | ||
206 | * @auprobe: arch breakpointing information. | ||
207 | * @mm: the probed process address space. | ||
208 | * @vaddr: the virtual address to store the opcode. | ||
209 | * @opcode: opcode to be written at @vaddr. | ||
210 | * | ||
211 | * Called with mm->mmap_sem held (for read and with a reference to | ||
212 | * mm). | ||
213 | * | ||
214 | * For mm @mm, write the opcode at @vaddr. | ||
215 | * Return 0 (success) or a negative errno. | ||
216 | */ | ||
217 | static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | ||
218 | unsigned long vaddr, uprobe_opcode_t opcode) | ||
219 | { | ||
220 | struct page *old_page, *new_page; | ||
221 | struct address_space *mapping; | ||
222 | void *vaddr_old, *vaddr_new; | ||
223 | struct vm_area_struct *vma; | ||
224 | struct uprobe *uprobe; | ||
225 | loff_t addr; | ||
226 | int ret; | ||
227 | |||
228 | /* Read the page with vaddr into memory */ | ||
229 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); | ||
230 | if (ret <= 0) | ||
231 | return ret; | ||
232 | |||
233 | ret = -EINVAL; | ||
234 | |||
235 | /* | ||
236 | * We are interested in text pages only. Our pages of interest | ||
237 | * should be mapped for read and execute only. We desist from | ||
238 | * adding probes in write mapped pages since the breakpoints | ||
239 | * might end up in the file copy. | ||
240 | */ | ||
241 | if (!valid_vma(vma, is_swbp_insn(&opcode))) | ||
242 | goto put_out; | ||
243 | |||
244 | uprobe = container_of(auprobe, struct uprobe, arch); | ||
245 | mapping = uprobe->inode->i_mapping; | ||
246 | if (mapping != vma->vm_file->f_mapping) | ||
247 | goto put_out; | ||
248 | |||
249 | addr = vma_address(vma, uprobe->offset); | ||
250 | if (vaddr != (unsigned long)addr) | ||
251 | goto put_out; | ||
252 | |||
253 | ret = -ENOMEM; | ||
254 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); | ||
255 | if (!new_page) | ||
256 | goto put_out; | ||
257 | |||
258 | __SetPageUptodate(new_page); | ||
259 | |||
260 | /* | ||
261 | * lock page will serialize against do_wp_page()'s | ||
262 | * PageAnon() handling | ||
263 | */ | ||
264 | lock_page(old_page); | ||
265 | /* copy the page now that we've got it stable */ | ||
266 | vaddr_old = kmap_atomic(old_page); | ||
267 | vaddr_new = kmap_atomic(new_page); | ||
268 | |||
269 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); | ||
270 | |||
271 | /* poke the new insn in, ASSUMES we don't cross page boundary */ | ||
272 | vaddr &= ~PAGE_MASK; | ||
273 | BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
274 | memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | ||
275 | |||
276 | kunmap_atomic(vaddr_new); | ||
277 | kunmap_atomic(vaddr_old); | ||
278 | |||
279 | ret = anon_vma_prepare(vma); | ||
280 | if (ret) | ||
281 | goto unlock_out; | ||
282 | |||
283 | lock_page(new_page); | ||
284 | ret = __replace_page(vma, old_page, new_page); | ||
285 | unlock_page(new_page); | ||
286 | |||
287 | unlock_out: | ||
288 | unlock_page(old_page); | ||
289 | page_cache_release(new_page); | ||
290 | |||
291 | put_out: | ||
292 | put_page(old_page); | ||
293 | |||
294 | return ret; | ||
295 | } | ||
296 | |||
297 | /** | ||
298 | * read_opcode - read the opcode at a given virtual address. | ||
299 | * @mm: the probed process address space. | ||
300 | * @vaddr: the virtual address to read the opcode. | ||
301 | * @opcode: location to store the read opcode. | ||
302 | * | ||
303 | * Called with mm->mmap_sem held (for read and with a reference to | ||
304 | * mm). | ||
305 | * | ||
306 | * For mm @mm, read the opcode at @vaddr and store it in @opcode. | ||
307 | * Return 0 (success) or a negative errno. | ||
308 | */ | ||
309 | static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) | ||
310 | { | ||
311 | struct page *page; | ||
312 | void *vaddr_new; | ||
313 | int ret; | ||
314 | |||
315 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); | ||
316 | if (ret <= 0) | ||
317 | return ret; | ||
318 | |||
319 | lock_page(page); | ||
320 | vaddr_new = kmap_atomic(page); | ||
321 | vaddr &= ~PAGE_MASK; | ||
322 | memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); | ||
323 | kunmap_atomic(vaddr_new); | ||
324 | unlock_page(page); | ||
325 | |||
326 | put_page(page); | ||
327 | |||
328 | return 0; | ||
329 | } | ||
330 | |||
331 | static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | ||
332 | { | ||
333 | uprobe_opcode_t opcode; | ||
334 | int result; | ||
335 | |||
336 | result = read_opcode(mm, vaddr, &opcode); | ||
337 | if (result) | ||
338 | return result; | ||
339 | |||
340 | if (is_swbp_insn(&opcode)) | ||
341 | return 1; | ||
342 | |||
343 | return 0; | ||
344 | } | ||
345 | |||
346 | /** | ||
347 | * set_swbp - store breakpoint at a given address. | ||
348 | * @auprobe: arch specific probepoint information. | ||
349 | * @mm: the probed process address space. | ||
350 | * @vaddr: the virtual address to insert the opcode. | ||
351 | * | ||
352 | * For mm @mm, store the breakpoint instruction at @vaddr. | ||
353 | * Return 0 (success) or a negative errno. | ||
354 | */ | ||
355 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | ||
356 | { | ||
357 | int result; | ||
358 | |||
359 | result = is_swbp_at_addr(mm, vaddr); | ||
360 | if (result == 1) | ||
361 | return -EEXIST; | ||
362 | |||
363 | if (result) | ||
364 | return result; | ||
365 | |||
366 | return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); | ||
367 | } | ||
368 | |||
369 | /** | ||
370 | * set_orig_insn - Restore the original instruction. | ||
371 | * @mm: the probed process address space. | ||
372 | * @auprobe: arch specific probepoint information. | ||
373 | * @vaddr: the virtual address to insert the opcode. | ||
374 | * @verify: if true, verify existence of breakpoint instruction. | ||
375 | * | ||
376 | * For mm @mm, restore the original opcode (opcode) at @vaddr. | ||
377 | * Return 0 (success) or a negative errno. | ||
378 | */ | ||
379 | int __weak | ||
380 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) | ||
381 | { | ||
382 | if (verify) { | ||
383 | int result; | ||
384 | |||
385 | result = is_swbp_at_addr(mm, vaddr); | ||
386 | if (!result) | ||
387 | return -EINVAL; | ||
388 | |||
389 | if (result != 1) | ||
390 | return result; | ||
391 | } | ||
392 | return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | ||
393 | } | ||
394 | |||
395 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | ||
396 | { | ||
397 | if (l->inode < r->inode) | ||
398 | return -1; | ||
399 | |||
400 | if (l->inode > r->inode) | ||
401 | return 1; | ||
402 | |||
403 | if (l->offset < r->offset) | ||
404 | return -1; | ||
405 | |||
406 | if (l->offset > r->offset) | ||
407 | return 1; | ||
408 | |||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) | ||
413 | { | ||
414 | struct uprobe u = { .inode = inode, .offset = offset }; | ||
415 | struct rb_node *n = uprobes_tree.rb_node; | ||
416 | struct uprobe *uprobe; | ||
417 | int match; | ||
418 | |||
419 | while (n) { | ||
420 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
421 | match = match_uprobe(&u, uprobe); | ||
422 | if (!match) { | ||
423 | atomic_inc(&uprobe->ref); | ||
424 | return uprobe; | ||
425 | } | ||
426 | |||
427 | if (match < 0) | ||
428 | n = n->rb_left; | ||
429 | else | ||
430 | n = n->rb_right; | ||
431 | } | ||
432 | return NULL; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * Find a uprobe corresponding to a given inode:offset | ||
437 | * Acquires uprobes_treelock | ||
438 | */ | ||
439 | static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) | ||
440 | { | ||
441 | struct uprobe *uprobe; | ||
442 | unsigned long flags; | ||
443 | |||
444 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
445 | uprobe = __find_uprobe(inode, offset); | ||
446 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
447 | |||
448 | return uprobe; | ||
449 | } | ||
450 | |||
451 | static struct uprobe *__insert_uprobe(struct uprobe *uprobe) | ||
452 | { | ||
453 | struct rb_node **p = &uprobes_tree.rb_node; | ||
454 | struct rb_node *parent = NULL; | ||
455 | struct uprobe *u; | ||
456 | int match; | ||
457 | |||
458 | while (*p) { | ||
459 | parent = *p; | ||
460 | u = rb_entry(parent, struct uprobe, rb_node); | ||
461 | match = match_uprobe(uprobe, u); | ||
462 | if (!match) { | ||
463 | atomic_inc(&u->ref); | ||
464 | return u; | ||
465 | } | ||
466 | |||
467 | if (match < 0) | ||
468 | p = &parent->rb_left; | ||
469 | else | ||
470 | p = &parent->rb_right; | ||
471 | |||
472 | } | ||
473 | |||
474 | u = NULL; | ||
475 | rb_link_node(&uprobe->rb_node, parent, p); | ||
476 | rb_insert_color(&uprobe->rb_node, &uprobes_tree); | ||
477 | /* get access + creation ref */ | ||
478 | atomic_set(&uprobe->ref, 2); | ||
479 | |||
480 | return u; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Acquire uprobes_treelock. | ||
485 | * Matching uprobe already exists in rbtree; | ||
486 | * increment (access refcount) and return the matching uprobe. | ||
487 | * | ||
488 | * No matching uprobe; insert the uprobe in rb_tree; | ||
489 | * get a double refcount (access + creation) and return NULL. | ||
490 | */ | ||
491 | static struct uprobe *insert_uprobe(struct uprobe *uprobe) | ||
492 | { | ||
493 | unsigned long flags; | ||
494 | struct uprobe *u; | ||
495 | |||
496 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
497 | u = __insert_uprobe(uprobe); | ||
498 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
499 | |||
500 | /* For now assume that the instruction need not be single-stepped */ | ||
501 | uprobe->flags |= UPROBE_SKIP_SSTEP; | ||
502 | |||
503 | return u; | ||
504 | } | ||
505 | |||
506 | static void put_uprobe(struct uprobe *uprobe) | ||
507 | { | ||
508 | if (atomic_dec_and_test(&uprobe->ref)) | ||
509 | kfree(uprobe); | ||
510 | } | ||
511 | |||
512 | static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | ||
513 | { | ||
514 | struct uprobe *uprobe, *cur_uprobe; | ||
515 | |||
516 | uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); | ||
517 | if (!uprobe) | ||
518 | return NULL; | ||
519 | |||
520 | uprobe->inode = igrab(inode); | ||
521 | uprobe->offset = offset; | ||
522 | init_rwsem(&uprobe->consumer_rwsem); | ||
523 | INIT_LIST_HEAD(&uprobe->pending_list); | ||
524 | |||
525 | /* add to uprobes_tree, sorted on inode:offset */ | ||
526 | cur_uprobe = insert_uprobe(uprobe); | ||
527 | |||
528 | /* a uprobe exists for this inode:offset combination */ | ||
529 | if (cur_uprobe) { | ||
530 | kfree(uprobe); | ||
531 | uprobe = cur_uprobe; | ||
532 | iput(inode); | ||
533 | } else { | ||
534 | atomic_inc(&uprobe_events); | ||
535 | } | ||
536 | |||
537 | return uprobe; | ||
538 | } | ||
539 | |||
540 | static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | ||
541 | { | ||
542 | struct uprobe_consumer *uc; | ||
543 | |||
544 | if (!(uprobe->flags & UPROBE_RUN_HANDLER)) | ||
545 | return; | ||
546 | |||
547 | down_read(&uprobe->consumer_rwsem); | ||
548 | for (uc = uprobe->consumers; uc; uc = uc->next) { | ||
549 | if (!uc->filter || uc->filter(uc, current)) | ||
550 | uc->handler(uc, regs); | ||
551 | } | ||
552 | up_read(&uprobe->consumer_rwsem); | ||
553 | } | ||
554 | |||
555 | /* Returns the previous consumer */ | ||
556 | static struct uprobe_consumer * | ||
557 | consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) | ||
558 | { | ||
559 | down_write(&uprobe->consumer_rwsem); | ||
560 | uc->next = uprobe->consumers; | ||
561 | uprobe->consumers = uc; | ||
562 | up_write(&uprobe->consumer_rwsem); | ||
563 | |||
564 | return uc->next; | ||
565 | } | ||
566 | |||
567 | /* | ||
568 | * For uprobe @uprobe, delete the consumer @uc. | ||
569 | * Return true if the @uc is deleted successfully | ||
570 | * or return false. | ||
571 | */ | ||
572 | static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | ||
573 | { | ||
574 | struct uprobe_consumer **con; | ||
575 | bool ret = false; | ||
576 | |||
577 | down_write(&uprobe->consumer_rwsem); | ||
578 | for (con = &uprobe->consumers; *con; con = &(*con)->next) { | ||
579 | if (*con == uc) { | ||
580 | *con = uc->next; | ||
581 | ret = true; | ||
582 | break; | ||
583 | } | ||
584 | } | ||
585 | up_write(&uprobe->consumer_rwsem); | ||
586 | |||
587 | return ret; | ||
588 | } | ||
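consumer_del() above walks the singly linked consumer list through a pointer-to-pointer, so unlinking the head needs no special case. The same idiom in a standalone, generic form (types invented for illustration):

#include <stdbool.h>

struct node { struct node *next; int key; };

/* Unlink the first node whose key matches; head and interior nodes are
 * handled identically because we always update the incoming link. */
static bool unlink_key(struct node **head, int key)
{
	struct node **pp;

	for (pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->key == key) {
			*pp = (*pp)->next;
			return true;
		}
	}
	return false;
}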
589 | |||
590 | static int | ||
591 | __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, | ||
592 | unsigned long nbytes, unsigned long offset) | ||
593 | { | ||
594 | struct file *filp = vma->vm_file; | ||
595 | struct page *page; | ||
596 | void *vaddr; | ||
597 | unsigned long off1; | ||
598 | unsigned long idx; | ||
599 | |||
600 | if (!filp) | ||
601 | return -EINVAL; | ||
602 | |||
603 | idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); | ||
604 | off1 = offset &= ~PAGE_MASK; | ||
605 | |||
606 | /* | ||
607 | * Ensure that the page that has the original instruction is | ||
608 | * populated and in page-cache. | ||
609 | */ | ||
610 | page = read_mapping_page(mapping, idx, filp); | ||
611 | if (IS_ERR(page)) | ||
612 | return PTR_ERR(page); | ||
613 | |||
614 | vaddr = kmap_atomic(page); | ||
615 | memcpy(insn, vaddr + off1, nbytes); | ||
616 | kunmap_atomic(vaddr); | ||
617 | page_cache_release(page); | ||
618 | |||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | static int | ||
623 | copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | ||
624 | { | ||
625 | struct address_space *mapping; | ||
626 | unsigned long nbytes; | ||
627 | int bytes; | ||
628 | |||
629 | addr &= ~PAGE_MASK; | ||
630 | nbytes = PAGE_SIZE - addr; | ||
631 | mapping = uprobe->inode->i_mapping; | ||
632 | |||
633 | /* Instruction at end of binary; copy only available bytes */ | ||
634 | if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) | ||
635 | bytes = uprobe->inode->i_size - uprobe->offset; | ||
636 | else | ||
637 | bytes = MAX_UINSN_BYTES; | ||
638 | |||
639 | /* Instruction at the page-boundary; copy bytes in second page */ | ||
640 | if (nbytes < bytes) { | ||
641 | if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, | ||
642 | bytes - nbytes, uprobe->offset + nbytes)) | ||
643 | return -ENOMEM; | ||
644 | |||
645 | bytes = nbytes; | ||
646 | } | ||
647 | return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * How mm->uprobes_state.count gets updated | ||
652 | * uprobe_mmap() increments the count if | ||
653 | * - it successfully adds a breakpoint. | ||
654 | * - it cannot add a breakpoint, but sees that there is an underlying | ||
655 | * breakpoint (via a is_swbp_at_addr()). | ||
656 | * | ||
657 | * uprobe_munmap() decrements the count if | ||
658 | * - it sees an underlying breakpoint (via is_swbp_at_addr). | ||
659 | * (Subsequent uprobe_unregister wouldn't find the breakpoint | ||
660 | * unless a uprobe_mmap kicks in, since the old vma would be | ||
661 | * dropped just after uprobe_munmap.) | ||
662 | * | ||
663 | * uprobe_register increments the count if: | ||
664 | * - it successfully adds a breakpoint. | ||
665 | * | ||
666 | * uprobe_unregister decrements the count if: | ||
667 | * - it sees an underlying breakpoint and removes it successfully. | ||
668 | * (via is_swbp_at_addr) | ||
669 | * (Subsequent uprobe_munmap wouldn't find the breakpoint | ||
670 | * since there is no underlying breakpoint after the | ||
671 | * breakpoint removal.) | ||
672 | */ | ||
673 | static int | ||
674 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | ||
675 | struct vm_area_struct *vma, loff_t vaddr) | ||
676 | { | ||
677 | unsigned long addr; | ||
678 | int ret; | ||
679 | |||
680 | /* | ||
681 | * If probe is being deleted, unregister thread could be done with | ||
682 | * the vma-rmap-walk through. Adding a probe now can be fatal since | ||
683 | * nobody will be able to cleanup. Also we could be from fork or | ||
684 | * mremap path, where the probe might have already been inserted. | ||
685 | * Hence behave as if probe already existed. | ||
686 | */ | ||
687 | if (!uprobe->consumers) | ||
688 | return -EEXIST; | ||
689 | |||
690 | addr = (unsigned long)vaddr; | ||
691 | |||
692 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { | ||
693 | ret = copy_insn(uprobe, vma, addr); | ||
694 | if (ret) | ||
695 | return ret; | ||
696 | |||
697 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) | ||
698 | return -EEXIST; | ||
699 | |||
700 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); | ||
701 | if (ret) | ||
702 | return ret; | ||
703 | |||
704 | uprobe->flags |= UPROBE_COPY_INSN; | ||
705 | } | ||
706 | |||
707 | /* | ||
708 | * Ideally, should be updating the probe count after the breakpoint | ||
709 | * has been successfully inserted. However a thread could hit the | ||
710 | * breakpoint we just inserted even before the probe count is | ||
711 | * incremented. If this is the first breakpoint placed, breakpoint | ||
712 | * notifier might ignore uprobes and pass the trap to the thread. | ||
713 | * Hence increment before and decrement on failure. | ||
714 | */ | ||
715 | atomic_inc(&mm->uprobes_state.count); | ||
716 | ret = set_swbp(&uprobe->arch, mm, addr); | ||
717 | if (ret) | ||
718 | atomic_dec(&mm->uprobes_state.count); | ||
719 | |||
720 | return ret; | ||
721 | } | ||
722 | |||
723 | static void | ||
724 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) | ||
725 | { | ||
726 | if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) | ||
727 | atomic_dec(&mm->uprobes_state.count); | ||
728 | } | ||
729 | |||
730 | /* | ||
731 | * There could be threads that have hit the breakpoint and are entering the | ||
732 | * notifier code and trying to acquire the uprobes_treelock. The thread | ||
733 | * calling delete_uprobe() that is removing the uprobe from the rb_tree can | ||
734 | * race with these threads and might acquire the uprobes_treelock before | ||
735 | * some of the breakpoint hit threads do. In such a case, the breakpoint | ||
736 | * hit threads will not find the uprobe. The current unregistering thread | ||
737 | * waits till all other threads have hit a breakpoint, to acquire the | ||
738 | * uprobes_treelock before the uprobe is removed from the rbtree. | ||
739 | */ | ||
740 | static void delete_uprobe(struct uprobe *uprobe) | ||
741 | { | ||
742 | unsigned long flags; | ||
743 | |||
744 | synchronize_srcu(&uprobes_srcu); | ||
745 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
746 | rb_erase(&uprobe->rb_node, &uprobes_tree); | ||
747 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
748 | iput(uprobe->inode); | ||
749 | put_uprobe(uprobe); | ||
750 | atomic_dec(&uprobe_events); | ||
751 | } | ||
752 | |||
753 | static struct vma_info * | ||
754 | __find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
755 | struct vma_info *vi, loff_t offset, bool is_register) | ||
756 | { | ||
757 | struct prio_tree_iter iter; | ||
758 | struct vm_area_struct *vma; | ||
759 | struct vma_info *tmpvi; | ||
760 | unsigned long pgoff; | ||
761 | int existing_vma; | ||
762 | loff_t vaddr; | ||
763 | |||
764 | pgoff = offset >> PAGE_SHIFT; | ||
765 | |||
766 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
767 | if (!valid_vma(vma, is_register)) | ||
768 | continue; | ||
769 | |||
770 | existing_vma = 0; | ||
771 | vaddr = vma_address(vma, offset); | ||
772 | |||
773 | list_for_each_entry(tmpvi, head, probe_list) { | ||
774 | if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { | ||
775 | existing_vma = 1; | ||
776 | break; | ||
777 | } | ||
778 | } | ||
779 | |||
780 | /* | ||
781 | * Another vma needs a probe to be installed. However skip | ||
782 | * installing the probe if the vma is about to be unlinked. | ||
783 | */ | ||
784 | if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { | ||
785 | vi->mm = vma->vm_mm; | ||
786 | vi->vaddr = vaddr; | ||
787 | list_add(&vi->probe_list, head); | ||
788 | |||
789 | return vi; | ||
790 | } | ||
791 | } | ||
792 | |||
793 | return NULL; | ||
794 | } | ||
795 | |||
796 | /* | ||
797 | * Iterate in the rmap prio tree and find a vma where a probe has not | ||
798 | * yet been inserted. | ||
799 | */ | ||
800 | static struct vma_info * | ||
801 | find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
802 | loff_t offset, bool is_register) | ||
803 | { | ||
804 | struct vma_info *vi, *retvi; | ||
805 | |||
806 | vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); | ||
807 | if (!vi) | ||
808 | return ERR_PTR(-ENOMEM); | ||
809 | |||
810 | mutex_lock(&mapping->i_mmap_mutex); | ||
811 | retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); | ||
812 | mutex_unlock(&mapping->i_mmap_mutex); | ||
813 | |||
814 | if (!retvi) | ||
815 | kfree(vi); | ||
816 | |||
817 | return retvi; | ||
818 | } | ||
819 | |||
820 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | ||
821 | { | ||
822 | struct list_head try_list; | ||
823 | struct vm_area_struct *vma; | ||
824 | struct address_space *mapping; | ||
825 | struct vma_info *vi, *tmpvi; | ||
826 | struct mm_struct *mm; | ||
827 | loff_t vaddr; | ||
828 | int ret; | ||
829 | |||
830 | mapping = uprobe->inode->i_mapping; | ||
831 | INIT_LIST_HEAD(&try_list); | ||
832 | |||
833 | ret = 0; | ||
834 | |||
835 | for (;;) { | ||
836 | vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); | ||
837 | if (!vi) | ||
838 | break; | ||
839 | |||
840 | if (IS_ERR(vi)) { | ||
841 | ret = PTR_ERR(vi); | ||
842 | break; | ||
843 | } | ||
844 | |||
845 | mm = vi->mm; | ||
846 | down_read(&mm->mmap_sem); | ||
847 | vma = find_vma(mm, (unsigned long)vi->vaddr); | ||
848 | if (!vma || !valid_vma(vma, is_register)) { | ||
849 | list_del(&vi->probe_list); | ||
850 | kfree(vi); | ||
851 | up_read(&mm->mmap_sem); | ||
852 | mmput(mm); | ||
853 | continue; | ||
854 | } | ||
855 | vaddr = vma_address(vma, uprobe->offset); | ||
856 | if (vma->vm_file->f_mapping->host != uprobe->inode || | ||
857 | vaddr != vi->vaddr) { | ||
858 | list_del(&vi->probe_list); | ||
859 | kfree(vi); | ||
860 | up_read(&mm->mmap_sem); | ||
861 | mmput(mm); | ||
862 | continue; | ||
863 | } | ||
864 | |||
865 | if (is_register) | ||
866 | ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); | ||
867 | else | ||
868 | remove_breakpoint(uprobe, mm, vi->vaddr); | ||
869 | |||
870 | up_read(&mm->mmap_sem); | ||
871 | mmput(mm); | ||
872 | if (is_register) { | ||
873 | if (ret && ret == -EEXIST) | ||
874 | ret = 0; | ||
875 | if (ret) | ||
876 | break; | ||
877 | } | ||
878 | } | ||
879 | |||
880 | list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { | ||
881 | list_del(&vi->probe_list); | ||
882 | kfree(vi); | ||
883 | } | ||
884 | |||
885 | return ret; | ||
886 | } | ||
887 | |||
888 | static int __uprobe_register(struct uprobe *uprobe) | ||
889 | { | ||
890 | return register_for_each_vma(uprobe, true); | ||
891 | } | ||
892 | |||
893 | static void __uprobe_unregister(struct uprobe *uprobe) | ||
894 | { | ||
895 | if (!register_for_each_vma(uprobe, false)) | ||
896 | delete_uprobe(uprobe); | ||
897 | |||
898 | /* TODO: can't unregister? schedule a worker thread */ | ||
899 | } | ||
900 | |||
901 | /* | ||
902 | * uprobe_register - register a probe | ||
903 | * @inode: the file in which the probe has to be placed. | ||
904 | * @offset: offset from the start of the file. | ||
905 | * @uc: information on how to handle the probe. | ||
906 | * | ||
907 | * Apart from the access refcount, uprobe_register() takes a creation | ||
908 | * refcount (through alloc_uprobe) if and only if this @uprobe is getting | ||
909 | * inserted into the rbtree (i.e. first consumer for a @inode:@offset | ||
910 | * tuple). Creation refcount stops uprobe_unregister from freeing the | ||
911 | * @uprobe even before the register operation is complete. Creation | ||
912 | * refcount is released when the last @uc for the @uprobe | ||
913 | * unregisters. | ||
914 | * | ||
915 | * Return errno if it cannot successfully install probes | ||
916 | * else return 0 (success) | ||
917 | */ | ||
918 | int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) | ||
919 | { | ||
920 | struct uprobe *uprobe; | ||
921 | int ret; | ||
922 | |||
923 | if (!inode || !uc || uc->next) | ||
924 | return -EINVAL; | ||
925 | |||
926 | if (offset > i_size_read(inode)) | ||
927 | return -EINVAL; | ||
928 | |||
929 | ret = 0; | ||
930 | mutex_lock(uprobes_hash(inode)); | ||
931 | uprobe = alloc_uprobe(inode, offset); | ||
932 | |||
933 | if (uprobe && !consumer_add(uprobe, uc)) { | ||
934 | ret = __uprobe_register(uprobe); | ||
935 | if (ret) { | ||
936 | uprobe->consumers = NULL; | ||
937 | __uprobe_unregister(uprobe); | ||
938 | } else { | ||
939 | uprobe->flags |= UPROBE_RUN_HANDLER; | ||
940 | } | ||
941 | } | ||
942 | |||
943 | mutex_unlock(uprobes_hash(inode)); | ||
944 | put_uprobe(uprobe); | ||
945 | |||
946 | return ret; | ||
947 | } | ||
948 | |||
949 | /* | ||
950 | * uprobe_unregister - unregister an already registered probe. | ||
951 | * @inode: the file in which the probe has to be removed. | ||
952 | * @offset: offset from the start of the file. | ||
953 | * @uc: identify which probe if multiple probes are colocated. | ||
954 | */ | ||
955 | void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) | ||
956 | { | ||
957 | struct uprobe *uprobe; | ||
958 | |||
959 | if (!inode || !uc) | ||
960 | return; | ||
961 | |||
962 | uprobe = find_uprobe(inode, offset); | ||
963 | if (!uprobe) | ||
964 | return; | ||
965 | |||
966 | mutex_lock(uprobes_hash(inode)); | ||
967 | |||
968 | if (consumer_del(uprobe, uc)) { | ||
969 | if (!uprobe->consumers) { | ||
970 | __uprobe_unregister(uprobe); | ||
971 | uprobe->flags &= ~UPROBE_RUN_HANDLER; | ||
972 | } | ||
973 | } | ||
974 | |||
975 | mutex_unlock(uprobes_hash(inode)); | ||
976 | if (uprobe) | ||
977 | put_uprobe(uprobe); | ||
978 | } | ||
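Taken together, uprobe_register()/uprobe_unregister() above form the whole client API: supply a uprobe_consumer whose handler runs in the context of any task that hits the probed inode:offset. A hedged sketch of a minimal in-kernel user (how the caller pins the inode and picks the offset is assumed, e.g. a tracing backend):

static int example_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	/* called from the breakpoint notifier path of the task that hit it */
	return 0;
}

static struct uprobe_consumer example_consumer = {
	.handler = example_handler,
	/* .filter is optional; if set, the handler only runs for tasks it accepts */
};

static int example_attach(struct inode *inode, loff_t offset)
{
	return uprobe_register(inode, offset, &example_consumer);
}

static void example_detach(struct inode *inode, loff_t offset)
{
	uprobe_unregister(inode, offset, &example_consumer);
}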
979 | |||
980 | /* | ||
981 | * Of all the nodes that correspond to the given inode, return the node | ||
982 | * with the least offset. | ||
983 | */ | ||
984 | static struct rb_node *find_least_offset_node(struct inode *inode) | ||
985 | { | ||
986 | struct uprobe u = { .inode = inode, .offset = 0}; | ||
987 | struct rb_node *n = uprobes_tree.rb_node; | ||
988 | struct rb_node *close_node = NULL; | ||
989 | struct uprobe *uprobe; | ||
990 | int match; | ||
991 | |||
992 | while (n) { | ||
993 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
994 | match = match_uprobe(&u, uprobe); | ||
995 | |||
996 | if (uprobe->inode == inode) | ||
997 | close_node = n; | ||
998 | |||
999 | if (!match) | ||
1000 | return close_node; | ||
1001 | |||
1002 | if (match < 0) | ||
1003 | n = n->rb_left; | ||
1004 | else | ||
1005 | n = n->rb_right; | ||
1006 | } | ||
1007 | |||
1008 | return close_node; | ||
1009 | } | ||
1010 | |||
1011 | /* | ||
1012 | * For a given inode, build a list of probes that need to be inserted. | ||
1013 | */ | ||
1014 | static void build_probe_list(struct inode *inode, struct list_head *head) | ||
1015 | { | ||
1016 | struct uprobe *uprobe; | ||
1017 | unsigned long flags; | ||
1018 | struct rb_node *n; | ||
1019 | |||
1020 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
1021 | |||
1022 | n = find_least_offset_node(inode); | ||
1023 | |||
1024 | for (; n; n = rb_next(n)) { | ||
1025 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
1026 | if (uprobe->inode != inode) | ||
1027 | break; | ||
1028 | |||
1029 | list_add(&uprobe->pending_list, head); | ||
1030 | atomic_inc(&uprobe->ref); | ||
1031 | } | ||
1032 | |||
1033 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
1034 | } | ||
1035 | |||
1036 | /* | ||
1037 | * Called from mmap_region. | ||
1038 | * Called with mm->mmap_sem acquired. | ||
1039 | * | ||
1040 | * Return a negative errno if we fail to insert probes and we cannot | ||
1041 | * bail out. | ||
1042 | * Return 0 otherwise, i.e.: | ||
1043 | * | ||
1044 | * - successful insertion of probes | ||
1045 | * - (or) no possible probes to be inserted. | ||
1046 | * - (or) insertion of probes failed but we can bail out. | ||
1047 | */ | ||
1048 | int uprobe_mmap(struct vm_area_struct *vma) | ||
1049 | { | ||
1050 | struct list_head tmp_list; | ||
1051 | struct uprobe *uprobe, *u; | ||
1052 | struct inode *inode; | ||
1053 | int ret, count; | ||
1054 | |||
1055 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) | ||
1056 | return 0; | ||
1057 | |||
1058 | inode = vma->vm_file->f_mapping->host; | ||
1059 | if (!inode) | ||
1060 | return 0; | ||
1061 | |||
1062 | INIT_LIST_HEAD(&tmp_list); | ||
1063 | mutex_lock(uprobes_mmap_hash(inode)); | ||
1064 | build_probe_list(inode, &tmp_list); | ||
1065 | |||
1066 | ret = 0; | ||
1067 | count = 0; | ||
1068 | |||
1069 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
1070 | loff_t vaddr; | ||
1071 | |||
1072 | list_del(&uprobe->pending_list); | ||
1073 | if (!ret) { | ||
1074 | vaddr = vma_address(vma, uprobe->offset); | ||
1075 | |||
1076 | if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { | ||
1077 | put_uprobe(uprobe); | ||
1078 | continue; | ||
1079 | } | ||
1080 | |||
1081 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | ||
1082 | |||
1083 | /* Ignore double add: */ | ||
1084 | if (ret == -EEXIST) { | ||
1085 | ret = 0; | ||
1086 | |||
1087 | if (!is_swbp_at_addr(vma->vm_mm, vaddr)) | ||
1088 | continue; | ||
1089 | |||
1090 | /* | ||
1091 | * Unable to insert a breakpoint, but a | ||
1092 | * breakpoint already lies underneath. Increment | ||
1093 | * the probe count. | ||
1094 | */ | ||
1095 | atomic_inc(&vma->vm_mm->uprobes_state.count); | ||
1096 | } | ||
1097 | |||
1098 | if (!ret) | ||
1099 | count++; | ||
1100 | } | ||
1101 | put_uprobe(uprobe); | ||
1102 | } | ||
1103 | |||
1104 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
1105 | |||
1106 | if (ret) | ||
1107 | atomic_sub(count, &vma->vm_mm->uprobes_state.count); | ||
1108 | |||
1109 | return ret; | ||
1110 | } | ||
1111 | |||
1112 | /* | ||
1113 | * Called in context of a munmap of a vma. | ||
1114 | */ | ||
1115 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) | ||
1116 | { | ||
1117 | struct list_head tmp_list; | ||
1118 | struct uprobe *uprobe, *u; | ||
1119 | struct inode *inode; | ||
1120 | |||
1121 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | ||
1122 | return; | ||
1123 | |||
1124 | if (!atomic_read(&vma->vm_mm->uprobes_state.count)) | ||
1125 | return; | ||
1126 | |||
1127 | inode = vma->vm_file->f_mapping->host; | ||
1128 | if (!inode) | ||
1129 | return; | ||
1130 | |||
1131 | INIT_LIST_HEAD(&tmp_list); | ||
1132 | mutex_lock(uprobes_mmap_hash(inode)); | ||
1133 | build_probe_list(inode, &tmp_list); | ||
1134 | |||
1135 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
1136 | loff_t vaddr; | ||
1137 | |||
1138 | list_del(&uprobe->pending_list); | ||
1139 | vaddr = vma_address(vma, uprobe->offset); | ||
1140 | |||
1141 | if (vaddr >= start && vaddr < end) { | ||
1142 | /* | ||
1143 | * An unregister could have removed the probe before | ||
1144 | * unmap. So check before we decrement the count. | ||
1145 | */ | ||
1146 | if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) | ||
1147 | atomic_dec(&vma->vm_mm->uprobes_state.count); | ||
1148 | } | ||
1149 | put_uprobe(uprobe); | ||
1150 | } | ||
1151 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
1152 | } | ||
1153 | |||
1154 | /* Slot allocation for XOL */ | ||
1155 | static int xol_add_vma(struct xol_area *area) | ||
1156 | { | ||
1157 | struct mm_struct *mm; | ||
1158 | int ret; | ||
1159 | |||
1160 | area->page = alloc_page(GFP_HIGHUSER); | ||
1161 | if (!area->page) | ||
1162 | return -ENOMEM; | ||
1163 | |||
1164 | ret = -EALREADY; | ||
1165 | mm = current->mm; | ||
1166 | |||
1167 | down_write(&mm->mmap_sem); | ||
1168 | if (mm->uprobes_state.xol_area) | ||
1169 | goto fail; | ||
1170 | |||
1171 | ret = -ENOMEM; | ||
1172 | |||
1173 | /* Try to map as high as possible, this is only a hint. */ | ||
1174 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | ||
1175 | if (area->vaddr & ~PAGE_MASK) { | ||
1176 | ret = area->vaddr; | ||
1177 | goto fail; | ||
1178 | } | ||
1179 | |||
1180 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | ||
1181 | VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); | ||
1182 | if (ret) | ||
1183 | goto fail; | ||
1184 | |||
1185 | smp_wmb(); /* pairs with get_xol_area() */ | ||
1186 | mm->uprobes_state.xol_area = area; | ||
1187 | ret = 0; | ||
1188 | |||
1189 | fail: | ||
1190 | up_write(&mm->mmap_sem); | ||
1191 | if (ret) | ||
1192 | __free_page(area->page); | ||
1193 | |||
1194 | return ret; | ||
1195 | } | ||
1196 | |||
1197 | static struct xol_area *get_xol_area(struct mm_struct *mm) | ||
1198 | { | ||
1199 | struct xol_area *area; | ||
1200 | |||
1201 | area = mm->uprobes_state.xol_area; | ||
1202 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
1203 | |||
1204 | return area; | ||
1205 | } | ||
1206 | |||
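The barrier pairing noted in the comments above is the usual publish/consume idiom: xol_add_vma() fully initialises the area, issues smp_wmb(), and only then stores the pointer, while get_xol_area() loads the pointer and orders the dependent reads behind it. A stripped-down fragment of that idiom, with a made-up structure standing in for struct xol_area (the barrier primitives are assumed to be in scope via the usual kernel headers):

	/* Illustration only; 'struct demo_area' and these helpers are invented. */
	struct demo_area { unsigned long vaddr; };
	static struct demo_area *demo_published;

	static void demo_publish(struct demo_area *a)
	{
		a->vaddr = 0x1000;	/* initialise every field first */
		smp_wmb();		/* pairs with the reader's barrier */
		demo_published = a;	/* then make the pointer visible */
	}

	static struct demo_area *demo_consume(void)
	{
		struct demo_area *a = demo_published;	/* NULL or fully built */

		smp_read_barrier_depends();		/* order dependent loads */
		return a;
	}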
1207 | /* | ||
1208 | * xol_alloc_area - Allocate the process's xol_area. | ||
1209 | * This area will be used for storing instructions for execution out of | ||
1210 | * line. | ||
1211 | * | ||
1212 | * Returns the allocated area or NULL. | ||
1213 | */ | ||
1214 | static struct xol_area *xol_alloc_area(void) | ||
1215 | { | ||
1216 | struct xol_area *area; | ||
1217 | |||
1218 | area = kzalloc(sizeof(*area), GFP_KERNEL); | ||
1219 | if (unlikely(!area)) | ||
1220 | return NULL; | ||
1221 | |||
1222 | area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); | ||
1223 | |||
1224 | if (!area->bitmap) | ||
1225 | goto fail; | ||
1226 | |||
1227 | init_waitqueue_head(&area->wq); | ||
1228 | if (!xol_add_vma(area)) | ||
1229 | return area; | ||
1230 | |||
1231 | fail: | ||
1232 | kfree(area->bitmap); | ||
1233 | kfree(area); | ||
1234 | |||
1235 | return get_xol_area(current->mm); | ||
1236 | } | ||
1237 | |||
1238 | /* | ||
1239 | * uprobe_clear_state - Free the area allocated for slots. | ||
1240 | */ | ||
1241 | void uprobe_clear_state(struct mm_struct *mm) | ||
1242 | { | ||
1243 | struct xol_area *area = mm->uprobes_state.xol_area; | ||
1244 | |||
1245 | if (!area) | ||
1246 | return; | ||
1247 | |||
1248 | put_page(area->page); | ||
1249 | kfree(area->bitmap); | ||
1250 | kfree(area); | ||
1251 | } | ||
1252 | |||
1253 | /* | ||
1254 | * uprobe_reset_state - Reset the per-mm uprobes state for a newly created mm. | ||
1255 | */ | ||
1256 | void uprobe_reset_state(struct mm_struct *mm) | ||
1257 | { | ||
1258 | mm->uprobes_state.xol_area = NULL; | ||
1259 | atomic_set(&mm->uprobes_state.count, 0); | ||
1260 | } | ||
1261 | |||
1262 | /* | ||
1263 | * xol_take_insn_slot - search for a free slot, waiting if none is available. | ||
1264 | */ | ||
1265 | static unsigned long xol_take_insn_slot(struct xol_area *area) | ||
1266 | { | ||
1267 | unsigned long slot_addr; | ||
1268 | int slot_nr; | ||
1269 | |||
1270 | do { | ||
1271 | slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); | ||
1272 | if (slot_nr < UINSNS_PER_PAGE) { | ||
1273 | if (!test_and_set_bit(slot_nr, area->bitmap)) | ||
1274 | break; | ||
1275 | |||
1276 | slot_nr = UINSNS_PER_PAGE; | ||
1277 | continue; | ||
1278 | } | ||
1279 | wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); | ||
1280 | } while (slot_nr >= UINSNS_PER_PAGE); | ||
1281 | |||
1282 | slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); | ||
1283 | atomic_inc(&area->slot_count); | ||
1284 | |||
1285 | return slot_addr; | ||
1286 | } | ||
1287 | |||
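A slot number maps to an address by simple arithmetic within the single XOL page. Purely for illustration, assuming a 4096-byte page and 128-byte slots (i.e. 32 slots per page); the constants are assumptions, not taken from this patch:

	#define DEMO_SLOT_BYTES	128				/* assumed slot size */
	#define DEMO_SLOTS	(4096 / DEMO_SLOT_BYTES)	/* 32 slots per 4K page */

	static unsigned long demo_slot_to_addr(unsigned long area_vaddr, int slot_nr)
	{
		/* e.g. slot 5 in an area at 0x7f0000000000 -> 0x7f0000000280 */
		return area_vaddr + slot_nr * DEMO_SLOT_BYTES;
	}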
1288 | /* | ||
1289 | * xol_get_insn_slot - allocate a slot for the current task if it | ||
1290 | * does not already have one. | ||
1291 | * Returns the allocated slot address or 0. | ||
1292 | */ | ||
1293 | static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) | ||
1294 | { | ||
1295 | struct xol_area *area; | ||
1296 | unsigned long offset; | ||
1297 | void *vaddr; | ||
1298 | |||
1299 | area = get_xol_area(current->mm); | ||
1300 | if (!area) { | ||
1301 | area = xol_alloc_area(); | ||
1302 | if (!area) | ||
1303 | return 0; | ||
1304 | } | ||
1305 | current->utask->xol_vaddr = xol_take_insn_slot(area); | ||
1306 | |||
1307 | /* | ||
1308 | * Initialize the slot if xol_vaddr points to a valid | ||
1309 | * instruction slot. | ||
1310 | */ | ||
1311 | if (unlikely(!current->utask->xol_vaddr)) | ||
1312 | return 0; | ||
1313 | |||
1314 | current->utask->vaddr = slot_addr; | ||
1315 | offset = current->utask->xol_vaddr & ~PAGE_MASK; | ||
1316 | vaddr = kmap_atomic(area->page); | ||
1317 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | ||
1318 | kunmap_atomic(vaddr); | ||
1319 | |||
1320 | return current->utask->xol_vaddr; | ||
1321 | } | ||
1322 | |||
1323 | /* | ||
1324 | * xol_free_insn_slot - If a slot was earlier allocated by | ||
1325 | * @xol_get_insn_slot(), make the slot available for | ||
1326 | * subsequent requests. | ||
1327 | */ | ||
1328 | static void xol_free_insn_slot(struct task_struct *tsk) | ||
1329 | { | ||
1330 | struct xol_area *area; | ||
1331 | unsigned long vma_end; | ||
1332 | unsigned long slot_addr; | ||
1333 | |||
1334 | if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) | ||
1335 | return; | ||
1336 | |||
1337 | slot_addr = tsk->utask->xol_vaddr; | ||
1338 | |||
1339 | if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) | ||
1340 | return; | ||
1341 | |||
1342 | area = tsk->mm->uprobes_state.xol_area; | ||
1343 | vma_end = area->vaddr + PAGE_SIZE; | ||
1344 | if (area->vaddr <= slot_addr && slot_addr < vma_end) { | ||
1345 | unsigned long offset; | ||
1346 | int slot_nr; | ||
1347 | |||
1348 | offset = slot_addr - area->vaddr; | ||
1349 | slot_nr = offset / UPROBE_XOL_SLOT_BYTES; | ||
1350 | if (slot_nr >= UINSNS_PER_PAGE) | ||
1351 | return; | ||
1352 | |||
1353 | clear_bit(slot_nr, area->bitmap); | ||
1354 | atomic_dec(&area->slot_count); | ||
1355 | if (waitqueue_active(&area->wq)) | ||
1356 | wake_up(&area->wq); | ||
1357 | |||
1358 | tsk->utask->xol_vaddr = 0; | ||
1359 | } | ||
1360 | } | ||
1361 | |||
1362 | /** | ||
1363 | * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs | ||
1364 | * @regs: Reflects the saved state of the task after it has hit a breakpoint | ||
1365 | * instruction. | ||
1366 | * Return the address of the breakpoint instruction. | ||
1367 | */ | ||
1368 | unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) | ||
1369 | { | ||
1370 | return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; | ||
1371 | } | ||
1372 | |||
1373 | /* | ||
1374 | * Called with no locks held. | ||
1375 | * Called in the context of an exiting or an exec-ing thread. | ||
1376 | */ | ||
1377 | void uprobe_free_utask(struct task_struct *t) | ||
1378 | { | ||
1379 | struct uprobe_task *utask = t->utask; | ||
1380 | |||
1381 | if (t->uprobe_srcu_id != -1) | ||
1382 | srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); | ||
1383 | |||
1384 | if (!utask) | ||
1385 | return; | ||
1386 | |||
1387 | if (utask->active_uprobe) | ||
1388 | put_uprobe(utask->active_uprobe); | ||
1389 | |||
1390 | xol_free_insn_slot(t); | ||
1391 | kfree(utask); | ||
1392 | t->utask = NULL; | ||
1393 | } | ||
1394 | |||
1395 | /* | ||
1396 | * Called in the context of a new clone/fork from copy_process. | ||
1397 | */ | ||
1398 | void uprobe_copy_process(struct task_struct *t) | ||
1399 | { | ||
1400 | t->utask = NULL; | ||
1401 | t->uprobe_srcu_id = -1; | ||
1402 | } | ||
1403 | |||
1404 | /* | ||
1405 | * Allocate a uprobe_task object for the task. | ||
1406 | * Called when the thread hits a breakpoint for the first time. | ||
1407 | * | ||
1408 | * Returns: | ||
1409 | * - pointer to new uprobe_task on success | ||
1410 | * - NULL otherwise | ||
1411 | */ | ||
1412 | static struct uprobe_task *add_utask(void) | ||
1413 | { | ||
1414 | struct uprobe_task *utask; | ||
1415 | |||
1416 | utask = kzalloc(sizeof *utask, GFP_KERNEL); | ||
1417 | if (unlikely(!utask)) | ||
1418 | return NULL; | ||
1419 | |||
1420 | utask->active_uprobe = NULL; | ||
1421 | current->utask = utask; | ||
1422 | return utask; | ||
1423 | } | ||
1424 | |||
1425 | /* Prepare to single-step probed instruction out of line. */ | ||
1426 | static int | ||
1427 | pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) | ||
1428 | { | ||
1429 | if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) | ||
1430 | return 0; | ||
1431 | |||
1432 | return -EFAULT; | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * If we are singlestepping, then ensure that this thread does not respond | ||
1437 | * to non-fatal signals until the singlestep completes. When the xol insn itself | ||
1438 | * triggers the signal, restart the original insn even if the task is | ||
1439 | * already SIGKILL'ed (since the coredump should report the correct ip). This | ||
1440 | * is even more important if the task has a handler for SIGSEGV/etc: the | ||
1441 | * _same_ instruction must be repeated after return from the signal | ||
1442 | * handler, and SSTEP could never finish in that case. | ||
1443 | */ | ||
1444 | bool uprobe_deny_signal(void) | ||
1445 | { | ||
1446 | struct task_struct *t = current; | ||
1447 | struct uprobe_task *utask = t->utask; | ||
1448 | |||
1449 | if (likely(!utask || !utask->active_uprobe)) | ||
1450 | return false; | ||
1451 | |||
1452 | WARN_ON_ONCE(utask->state != UTASK_SSTEP); | ||
1453 | |||
1454 | if (signal_pending(t)) { | ||
1455 | spin_lock_irq(&t->sighand->siglock); | ||
1456 | clear_tsk_thread_flag(t, TIF_SIGPENDING); | ||
1457 | spin_unlock_irq(&t->sighand->siglock); | ||
1458 | |||
1459 | if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { | ||
1460 | utask->state = UTASK_SSTEP_TRAPPED; | ||
1461 | set_tsk_thread_flag(t, TIF_UPROBE); | ||
1462 | set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); | ||
1463 | } | ||
1464 | } | ||
1465 | |||
1466 | return true; | ||
1467 | } | ||
1468 | |||
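uprobe_deny_signal() is meant to be consulted from the signal-delivery path before a signal is dequeued. The function below is a simplified, assumed call site for illustration only, not the actual signal code:

	/* Sketch: postpone non-fatal signals while an XOL single-step is pending. */
	static int demo_deliver_signals(struct pt_regs *regs)
	{
		if (unlikely(uprobe_deny_signal()))
			return 0;	/* pretend there is nothing to deliver yet */

		/* ... normal signal dequeueing and delivery would follow ... */
		return 1;
	}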
1469 | /* | ||
1470 | * Avoid singlestepping the original instruction if the original instruction | ||
1471 | * is a NOP or can be emulated. | ||
1472 | */ | ||
1473 | static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | ||
1474 | { | ||
1475 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) | ||
1476 | return true; | ||
1477 | |||
1478 | uprobe->flags &= ~UPROBE_SKIP_SSTEP; | ||
1479 | return false; | ||
1480 | } | ||
1481 | |||
1482 | /* | ||
1483 | * Run handler and ask thread to singlestep. | ||
1484 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | ||
1485 | */ | ||
1486 | static void handle_swbp(struct pt_regs *regs) | ||
1487 | { | ||
1488 | struct vm_area_struct *vma; | ||
1489 | struct uprobe_task *utask; | ||
1490 | struct uprobe *uprobe; | ||
1491 | struct mm_struct *mm; | ||
1492 | unsigned long bp_vaddr; | ||
1493 | |||
1494 | uprobe = NULL; | ||
1495 | bp_vaddr = uprobe_get_swbp_addr(regs); | ||
1496 | mm = current->mm; | ||
1497 | down_read(&mm->mmap_sem); | ||
1498 | vma = find_vma(mm, bp_vaddr); | ||
1499 | |||
1500 | if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { | ||
1501 | struct inode *inode; | ||
1502 | loff_t offset; | ||
1503 | |||
1504 | inode = vma->vm_file->f_mapping->host; | ||
1505 | offset = bp_vaddr - vma->vm_start; | ||
1506 | offset += (vma->vm_pgoff << PAGE_SHIFT); | ||
1507 | uprobe = find_uprobe(inode, offset); | ||
1508 | } | ||
1509 | |||
1510 | srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); | ||
1511 | current->uprobe_srcu_id = -1; | ||
1512 | up_read(&mm->mmap_sem); | ||
1513 | |||
1514 | if (!uprobe) { | ||
1515 | /* No matching uprobe; signal SIGTRAP. */ | ||
1516 | send_sig(SIGTRAP, current, 0); | ||
1517 | return; | ||
1518 | } | ||
1519 | |||
1520 | utask = current->utask; | ||
1521 | if (!utask) { | ||
1522 | utask = add_utask(); | ||
1523 | /* Cannot allocate; re-execute the instruction. */ | ||
1524 | if (!utask) | ||
1525 | goto cleanup_ret; | ||
1526 | } | ||
1527 | utask->active_uprobe = uprobe; | ||
1528 | handler_chain(uprobe, regs); | ||
1529 | if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) | ||
1530 | goto cleanup_ret; | ||
1531 | |||
1532 | utask->state = UTASK_SSTEP; | ||
1533 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | ||
1534 | user_enable_single_step(current); | ||
1535 | return; | ||
1536 | } | ||
1537 | |||
1538 | cleanup_ret: | ||
1539 | if (utask) { | ||
1540 | utask->active_uprobe = NULL; | ||
1541 | utask->state = UTASK_RUNNING; | ||
1542 | } | ||
1543 | if (uprobe) { | ||
1544 | if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) | ||
1545 | |||
1546 | /* | ||
1547 | * cannot singlestep; cannot skip instruction; | ||
1548 | * re-execute the instruction. | ||
1549 | */ | ||
1550 | instruction_pointer_set(regs, bp_vaddr); | ||
1551 | |||
1552 | put_uprobe(uprobe); | ||
1553 | } | ||
1554 | } | ||
1555 | |||
1556 | /* | ||
1557 | * Perform required fix-ups and disable singlestep. | ||
1558 | * Allow pending signals to take effect. | ||
1559 | */ | ||
1560 | static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | ||
1561 | { | ||
1562 | struct uprobe *uprobe; | ||
1563 | |||
1564 | uprobe = utask->active_uprobe; | ||
1565 | if (utask->state == UTASK_SSTEP_ACK) | ||
1566 | arch_uprobe_post_xol(&uprobe->arch, regs); | ||
1567 | else if (utask->state == UTASK_SSTEP_TRAPPED) | ||
1568 | arch_uprobe_abort_xol(&uprobe->arch, regs); | ||
1569 | else | ||
1570 | WARN_ON_ONCE(1); | ||
1571 | |||
1572 | put_uprobe(uprobe); | ||
1573 | utask->active_uprobe = NULL; | ||
1574 | utask->state = UTASK_RUNNING; | ||
1575 | user_disable_single_step(current); | ||
1576 | xol_free_insn_slot(current); | ||
1577 | |||
1578 | spin_lock_irq(&current->sighand->siglock); | ||
1579 | recalc_sigpending(); /* see uprobe_deny_signal() */ | ||
1580 | spin_unlock_irq(&current->sighand->siglock); | ||
1581 | } | ||
1582 | |||
1583 | /* | ||
1584 | * On a breakpoint hit, the breakpoint notifier sets the TIF_UPROBE flag (and, on | ||
1585 | * subsequent probe hits on the thread, sets the state to UTASK_BP_HIT) and | ||
1586 | * allows the thread to return from the interrupt. | ||
1587 | * | ||
1588 | * On a singlestep exception, the singlestep notifier sets the TIF_UPROBE flag, | ||
1589 | * sets the state to UTASK_SSTEP_ACK and allows the thread to return from the | ||
1590 | * interrupt. | ||
1591 | * | ||
1592 | * While returning to userspace, the thread notices the TIF_UPROBE flag and calls | ||
1593 | * uprobe_notify_resume(). | ||
1594 | */ | ||
1595 | void uprobe_notify_resume(struct pt_regs *regs) | ||
1596 | { | ||
1597 | struct uprobe_task *utask; | ||
1598 | |||
1599 | utask = current->utask; | ||
1600 | if (!utask || utask->state == UTASK_BP_HIT) | ||
1601 | handle_swbp(regs); | ||
1602 | else | ||
1603 | handle_singlestep(utask, regs); | ||
1604 | } | ||
1605 | |||
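The TIF_UPROBE flag is expected to be handled by the arch's return-to-user work loop, which then calls uprobe_notify_resume(). A rough sketch of that glue; the function name and flag plumbing here are assumptions, not lifted from an actual arch tree:

	/* Sketch of an arch do_notify_resume()-style hook. */
	static void demo_notify_resume(struct pt_regs *regs, unsigned long ti_flags)
	{
		if (ti_flags & _TIF_UPROBE) {
			clear_thread_flag(TIF_UPROBE);
			uprobe_notify_resume(regs);
		}

		/* ... signal delivery and rescheduling work would follow ... */
	}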
1606 | /* | ||
1607 | * uprobe_pre_sstep_notifier gets called from interrupt context as part of the | ||
1608 | * notifier mechanism. Set the TIF_UPROBE flag and indicate a breakpoint hit. | ||
1609 | */ | ||
1610 | int uprobe_pre_sstep_notifier(struct pt_regs *regs) | ||
1611 | { | ||
1612 | struct uprobe_task *utask; | ||
1613 | |||
1614 | if (!current->mm || !atomic_read(&current->mm->uprobes_state.count)) | ||
1615 | /* task is currently not uprobed */ | ||
1616 | return 0; | ||
1617 | |||
1618 | utask = current->utask; | ||
1619 | if (utask) | ||
1620 | utask->state = UTASK_BP_HIT; | ||
1621 | |||
1622 | set_thread_flag(TIF_UPROBE); | ||
1623 | current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); | ||
1624 | |||
1625 | return 1; | ||
1626 | } | ||
1627 | |||
1628 | /* | ||
1629 | * uprobe_post_sstep_notifier gets called in interrupt context as part of the | ||
1630 | * notifier mechanism. Set the TIF_UPROBE flag and indicate completion of the singlestep. | ||
1631 | */ | ||
1632 | int uprobe_post_sstep_notifier(struct pt_regs *regs) | ||
1633 | { | ||
1634 | struct uprobe_task *utask = current->utask; | ||
1635 | |||
1636 | if (!current->mm || !utask || !utask->active_uprobe) | ||
1637 | /* task is currently not uprobed */ | ||
1638 | return 0; | ||
1639 | |||
1640 | utask->state = UTASK_SSTEP_ACK; | ||
1641 | set_thread_flag(TIF_UPROBE); | ||
1642 | return 1; | ||
1643 | } | ||
1644 | |||
1645 | static struct notifier_block uprobe_exception_nb = { | ||
1646 | .notifier_call = arch_uprobe_exception_notify, | ||
1647 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ | ||
1648 | }; | ||
1649 | |||
1650 | static int __init init_uprobes(void) | ||
1651 | { | ||
1652 | int i; | ||
1653 | |||
1654 | for (i = 0; i < UPROBES_HASH_SZ; i++) { | ||
1655 | mutex_init(&uprobes_mutex[i]); | ||
1656 | mutex_init(&uprobes_mmap_mutex[i]); | ||
1657 | } | ||
1658 | init_srcu_struct(&uprobes_srcu); | ||
1659 | |||
1660 | return register_die_notifier(&uprobe_exception_nb); | ||
1661 | } | ||
1662 | module_init(init_uprobes); | ||
1663 | |||
1664 | static void __exit exit_uprobes(void) | ||
1665 | { | ||
1666 | } | ||
1667 | module_exit(exit_uprobes); | ||
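init_uprobes() hooks uprobe_exception_nb into the die-notifier chain; the arch side is then expected to route its breakpoint and single-step traps to the two notifier entry points above. A hedged sketch of such a dispatcher, loosely modelled on x86-style trap values (DIE_INT3/DIE_DEBUG); the exact checks are assumptions:

	static int demo_uprobe_exception_notify(struct notifier_block *self,
						unsigned long val, void *data)
	{
		struct die_args *args = data;
		int ret = NOTIFY_DONE;

		/* uprobes only ever fire on user-space traps */
		if (args->regs && !user_mode(args->regs))
			return ret;

		switch (val) {
		case DIE_INT3:		/* breakpoint instruction hit */
			if (uprobe_pre_sstep_notifier(args->regs))
				ret = NOTIFY_STOP;
			break;
		case DIE_DEBUG:		/* trap after the out-of-line single-step */
			if (uprobe_post_sstep_notifier(args->regs))
				ret = NOTIFY_STOP;
			break;
		default:
			break;
		}

		return ret;
	}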
diff --git a/kernel/exit.c b/kernel/exit.c index d8bd3b425fa7..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -884,9 +884,9 @@ static void check_stack_usage(void) | |||
884 | 884 | ||
885 | spin_lock(&low_water_lock); | 885 | spin_lock(&low_water_lock); |
886 | if (free < lowest_to_date) { | 886 | if (free < lowest_to_date) { |
887 | printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " | 887 | printk(KERN_WARNING "%s (%d) used greatest stack depth: " |
888 | "left\n", | 888 | "%lu bytes left\n", |
889 | current->comm, free); | 889 | current->comm, task_pid_nr(current), free); |
890 | lowest_to_date = free; | 890 | lowest_to_date = free; |
891 | } | 891 | } |
892 | spin_unlock(&low_water_lock); | 892 | spin_unlock(&low_water_lock); |
@@ -946,12 +946,13 @@ void do_exit(long code) | |||
946 | exit_signals(tsk); /* sets PF_EXITING */ | 946 | exit_signals(tsk); /* sets PF_EXITING */ |
947 | /* | 947 | /* |
948 | * tsk->flags are checked in the futex code to protect against | 948 | * tsk->flags are checked in the futex code to protect against |
949 | * an exiting task cleaning up the robust pi futexes. | 949 | * an exiting task cleaning up the robust pi futexes, and in |
950 | * task_work_add() to avoid the race with exit_task_work(). | ||
950 | */ | 951 | */ |
951 | smp_mb(); | 952 | smp_mb(); |
952 | raw_spin_unlock_wait(&tsk->pi_lock); | 953 | raw_spin_unlock_wait(&tsk->pi_lock); |
953 | 954 | ||
954 | exit_irq_thread(); | 955 | exit_task_work(tsk); |
955 | 956 | ||
956 | if (unlikely(in_atomic())) | 957 | if (unlikely(in_atomic())) |
957 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 958 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
@@ -1214,7 +1215,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1214 | unsigned long state; | 1215 | unsigned long state; |
1215 | int retval, status, traced; | 1216 | int retval, status, traced; |
1216 | pid_t pid = task_pid_vnr(p); | 1217 | pid_t pid = task_pid_vnr(p); |
1217 | uid_t uid = __task_cred(p)->uid; | 1218 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1218 | struct siginfo __user *infop; | 1219 | struct siginfo __user *infop; |
1219 | 1220 | ||
1220 | if (!likely(wo->wo_flags & WEXITED)) | 1221 | if (!likely(wo->wo_flags & WEXITED)) |
@@ -1427,7 +1428,7 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1427 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1428 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1428 | *p_code = 0; | 1429 | *p_code = 0; |
1429 | 1430 | ||
1430 | uid = task_uid(p); | 1431 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1431 | unlock_sig: | 1432 | unlock_sig: |
1432 | spin_unlock_irq(&p->sighand->siglock); | 1433 | spin_unlock_irq(&p->sighand->siglock); |
1433 | if (!exit_code) | 1434 | if (!exit_code) |
@@ -1500,7 +1501,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
1500 | } | 1501 | } |
1501 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1502 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1502 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1503 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1503 | uid = task_uid(p); | 1504 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1504 | spin_unlock_irq(&p->sighand->siglock); | 1505 | spin_unlock_irq(&p->sighand->siglock); |
1505 | 1506 | ||
1506 | pid = task_pid_vnr(p); | 1507 | pid = task_pid_vnr(p); |
diff --git a/kernel/extable.c b/kernel/extable.c index 5339705b8241..fe35a634bf76 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex); | |||
35 | extern struct exception_table_entry __start___ex_table[]; | 35 | extern struct exception_table_entry __start___ex_table[]; |
36 | extern struct exception_table_entry __stop___ex_table[]; | 36 | extern struct exception_table_entry __stop___ex_table[]; |
37 | 37 | ||
38 | /* Cleared by build time tools if the table is already sorted. */ | ||
39 | u32 __initdata main_extable_sort_needed = 1; | ||
40 | |||
38 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
39 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
40 | { | 43 | { |
41 | sort_extable(__start___ex_table, __stop___ex_table); | 44 | if (main_extable_sort_needed) |
45 | sort_extable(__start___ex_table, __stop___ex_table); | ||
46 | else | ||
47 | pr_notice("__ex_table already sorted, skipping sort\n"); | ||
42 | } | 48 | } |
43 | 49 | ||
44 | /* Given an address, look for it in the exception tables. */ | 50 | /* Given an address, look for it in the exception tables. */ |
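main_extable_sort_needed lets a build-time tool pre-sort __ex_table so the boot-time sort can be skipped. Conceptually, sort_extable() only has to order entries by their faulting-instruction address so lookups can binary-search. A simplified sketch; the field name and comparator are illustrative (real arches may store relative offsets instead of raw addresses):

	#include <linux/sort.h>

	static int demo_cmp_ex(const void *a, const void *b)
	{
		const struct exception_table_entry *x = a, *y = b;

		if (x->insn < y->insn)
			return -1;
		if (x->insn > y->insn)
			return 1;
		return 0;
	}

	static void demo_sort_extable(struct exception_table_entry *start,
				      struct exception_table_entry *finish)
	{
		sort(start, finish - start, sizeof(*start), demo_cmp_ex, NULL);
	}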
diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/cgroup.h> | 34 | #include <linux/cgroup.h> |
35 | #include <linux/security.h> | 35 | #include <linux/security.h> |
36 | #include <linux/hugetlb.h> | 36 | #include <linux/hugetlb.h> |
37 | #include <linux/seccomp.h> | ||
37 | #include <linux/swap.h> | 38 | #include <linux/swap.h> |
38 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
39 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include <linux/audit.h> | 48 | #include <linux/audit.h> |
48 | #include <linux/memcontrol.h> | 49 | #include <linux/memcontrol.h> |
49 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/proc_fs.h> | ||
50 | #include <linux/profile.h> | 52 | #include <linux/profile.h> |
51 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
52 | #include <linux/ksm.h> | 54 | #include <linux/ksm.h> |
@@ -67,6 +69,7 @@ | |||
67 | #include <linux/oom.h> | 69 | #include <linux/oom.h> |
68 | #include <linux/khugepaged.h> | 70 | #include <linux/khugepaged.h> |
69 | #include <linux/signalfd.h> | 71 | #include <linux/signalfd.h> |
72 | #include <linux/uprobes.h> | ||
70 | 73 | ||
71 | #include <asm/pgtable.h> | 74 | #include <asm/pgtable.h> |
72 | #include <asm/pgalloc.h> | 75 | #include <asm/pgalloc.h> |
@@ -111,32 +114,67 @@ int nr_processes(void) | |||
111 | return total; | 114 | return total; |
112 | } | 115 | } |
113 | 116 | ||
114 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 117 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
115 | # define alloc_task_struct_node(node) \ | ||
116 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) | ||
117 | # define free_task_struct(tsk) \ | ||
118 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
119 | static struct kmem_cache *task_struct_cachep; | 118 | static struct kmem_cache *task_struct_cachep; |
119 | |||
120 | static inline struct task_struct *alloc_task_struct_node(int node) | ||
121 | { | ||
122 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); | ||
123 | } | ||
124 | |||
125 | void __weak arch_release_task_struct(struct task_struct *tsk) { } | ||
126 | |||
127 | static inline void free_task_struct(struct task_struct *tsk) | ||
128 | { | ||
129 | arch_release_task_struct(tsk); | ||
130 | kmem_cache_free(task_struct_cachep, tsk); | ||
131 | } | ||
120 | #endif | 132 | #endif |
121 | 133 | ||
122 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 134 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR |
135 | void __weak arch_release_thread_info(struct thread_info *ti) { } | ||
136 | |||
137 | /* | ||
138 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | ||
139 | * kmemcache based allocator. | ||
140 | */ | ||
141 | # if THREAD_SIZE >= PAGE_SIZE | ||
123 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 142 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
124 | int node) | 143 | int node) |
125 | { | 144 | { |
126 | #ifdef CONFIG_DEBUG_STACK_USAGE | 145 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
127 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 146 | THREAD_SIZE_ORDER); |
128 | #else | ||
129 | gfp_t mask = GFP_KERNEL; | ||
130 | #endif | ||
131 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); | ||
132 | 147 | ||
133 | return page ? page_address(page) : NULL; | 148 | return page ? page_address(page) : NULL; |
134 | } | 149 | } |
135 | 150 | ||
136 | static inline void free_thread_info(struct thread_info *ti) | 151 | static inline void free_thread_info(struct thread_info *ti) |
137 | { | 152 | { |
153 | arch_release_thread_info(ti); | ||
138 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 154 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
139 | } | 155 | } |
156 | # else | ||
157 | static struct kmem_cache *thread_info_cache; | ||
158 | |||
159 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | ||
160 | int node) | ||
161 | { | ||
162 | return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); | ||
163 | } | ||
164 | |||
165 | static void free_thread_info(struct thread_info *ti) | ||
166 | { | ||
167 | arch_release_thread_info(ti); | ||
168 | kmem_cache_free(thread_info_cache, ti); | ||
169 | } | ||
170 | |||
171 | void thread_info_cache_init(void) | ||
172 | { | ||
173 | thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, | ||
174 | THREAD_SIZE, 0, NULL); | ||
175 | BUG_ON(thread_info_cache == NULL); | ||
176 | } | ||
177 | # endif | ||
140 | #endif | 178 | #endif |
141 | 179 | ||
142 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 180 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
@@ -170,6 +208,7 @@ void free_task(struct task_struct *tsk) | |||
170 | free_thread_info(tsk->stack); | 208 | free_thread_info(tsk->stack); |
171 | rt_mutex_debug_task_free(tsk); | 209 | rt_mutex_debug_task_free(tsk); |
172 | ftrace_graph_exit_task(tsk); | 210 | ftrace_graph_exit_task(tsk); |
211 | put_seccomp_filter(tsk); | ||
173 | free_task_struct(tsk); | 212 | free_task_struct(tsk); |
174 | } | 213 | } |
175 | EXPORT_SYMBOL(free_task); | 214 | EXPORT_SYMBOL(free_task); |
@@ -203,17 +242,11 @@ void __put_task_struct(struct task_struct *tsk) | |||
203 | } | 242 | } |
204 | EXPORT_SYMBOL_GPL(__put_task_struct); | 243 | EXPORT_SYMBOL_GPL(__put_task_struct); |
205 | 244 | ||
206 | /* | 245 | void __init __weak arch_task_cache_init(void) { } |
207 | * macro override instead of weak attribute alias, to workaround | ||
208 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
209 | */ | ||
210 | #ifndef arch_task_cache_init | ||
211 | #define arch_task_cache_init() | ||
212 | #endif | ||
213 | 246 | ||
214 | void __init fork_init(unsigned long mempages) | 247 | void __init fork_init(unsigned long mempages) |
215 | { | 248 | { |
216 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 249 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
217 | #ifndef ARCH_MIN_TASKALIGN | 250 | #ifndef ARCH_MIN_TASKALIGN |
218 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 251 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
219 | #endif | 252 | #endif |
@@ -260,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
260 | int node = tsk_fork_get_node(orig); | 293 | int node = tsk_fork_get_node(orig); |
261 | int err; | 294 | int err; |
262 | 295 | ||
263 | prepare_to_copy(orig); | ||
264 | |||
265 | tsk = alloc_task_struct_node(node); | 296 | tsk = alloc_task_struct_node(node); |
266 | if (!tsk) | 297 | if (!tsk) |
267 | return NULL; | 298 | return NULL; |
@@ -355,7 +386,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
355 | } | 386 | } |
356 | charge = 0; | 387 | charge = 0; |
357 | if (mpnt->vm_flags & VM_ACCOUNT) { | 388 | if (mpnt->vm_flags & VM_ACCOUNT) { |
358 | unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | 389 | unsigned long len; |
390 | len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | ||
359 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ | 391 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ |
360 | goto fail_nomem; | 392 | goto fail_nomem; |
361 | charge = len; | 393 | charge = len; |
@@ -421,6 +453,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
421 | 453 | ||
422 | if (retval) | 454 | if (retval) |
423 | goto out; | 455 | goto out; |
456 | |||
457 | if (file && uprobe_mmap(tmp)) | ||
458 | goto out; | ||
424 | } | 459 | } |
425 | /* a new mm has just been created */ | 460 | /* a new mm has just been created */ |
426 | arch_dup_mmap(oldmm, mm); | 461 | arch_dup_mmap(oldmm, mm); |
@@ -569,6 +604,7 @@ void mmput(struct mm_struct *mm) | |||
569 | might_sleep(); | 604 | might_sleep(); |
570 | 605 | ||
571 | if (atomic_dec_and_test(&mm->mm_users)) { | 606 | if (atomic_dec_and_test(&mm->mm_users)) { |
607 | uprobe_clear_state(mm); | ||
572 | exit_aio(mm); | 608 | exit_aio(mm); |
573 | ksm_exit(mm); | 609 | ksm_exit(mm); |
574 | khugepaged_exit(mm); /* must run before exit_mmap */ | 610 | khugepaged_exit(mm); /* must run before exit_mmap */ |
@@ -579,7 +615,6 @@ void mmput(struct mm_struct *mm) | |||
579 | list_del(&mm->mmlist); | 615 | list_del(&mm->mmlist); |
580 | spin_unlock(&mmlist_lock); | 616 | spin_unlock(&mmlist_lock); |
581 | } | 617 | } |
582 | put_swap_token(mm); | ||
583 | if (mm->binfmt) | 618 | if (mm->binfmt) |
584 | module_put(mm->binfmt->module); | 619 | module_put(mm->binfmt->module); |
585 | mmdrop(mm); | 620 | mmdrop(mm); |
@@ -747,12 +782,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
747 | exit_pi_state_list(tsk); | 782 | exit_pi_state_list(tsk); |
748 | #endif | 783 | #endif |
749 | 784 | ||
785 | uprobe_free_utask(tsk); | ||
786 | |||
750 | /* Get rid of any cached register state */ | 787 | /* Get rid of any cached register state */ |
751 | deactivate_mm(tsk, mm); | 788 | deactivate_mm(tsk, mm); |
752 | 789 | ||
753 | if (tsk->vfork_done) | ||
754 | complete_vfork_done(tsk); | ||
755 | |||
756 | /* | 790 | /* |
757 | * If we're exiting normally, clear a user-space tid field if | 791 | * If we're exiting normally, clear a user-space tid field if |
758 | * requested. We leave this alone when dying by signal, to leave | 792 | * requested. We leave this alone when dying by signal, to leave |
@@ -773,6 +807,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
773 | } | 807 | } |
774 | tsk->clear_child_tid = NULL; | 808 | tsk->clear_child_tid = NULL; |
775 | } | 809 | } |
810 | |||
811 | /* | ||
812 | * All done, finally we can wake up parent and return this mm to him. | ||
813 | * Also kthread_stop() uses this completion for synchronization. | ||
814 | */ | ||
815 | if (tsk->vfork_done) | ||
816 | complete_vfork_done(tsk); | ||
776 | } | 817 | } |
777 | 818 | ||
778 | /* | 819 | /* |
@@ -794,13 +835,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
794 | memcpy(mm, oldmm, sizeof(*mm)); | 835 | memcpy(mm, oldmm, sizeof(*mm)); |
795 | mm_init_cpumask(mm); | 836 | mm_init_cpumask(mm); |
796 | 837 | ||
797 | /* Initializing for Swap token stuff */ | ||
798 | mm->token_priority = 0; | ||
799 | mm->last_interval = 0; | ||
800 | |||
801 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 838 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
802 | mm->pmd_huge_pte = NULL; | 839 | mm->pmd_huge_pte = NULL; |
803 | #endif | 840 | #endif |
841 | uprobe_reset_state(mm); | ||
804 | 842 | ||
805 | if (!mm_init(mm, tsk)) | 843 | if (!mm_init(mm, tsk)) |
806 | goto fail_nomem; | 844 | goto fail_nomem; |
@@ -875,10 +913,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | |||
875 | goto fail_nomem; | 913 | goto fail_nomem; |
876 | 914 | ||
877 | good_mm: | 915 | good_mm: |
878 | /* Initializing for Swap token stuff */ | ||
879 | mm->token_priority = 0; | ||
880 | mm->last_interval = 0; | ||
881 | |||
882 | tsk->mm = mm; | 916 | tsk->mm = mm; |
883 | tsk->active_mm = mm; | 917 | tsk->active_mm = mm; |
884 | return 0; | 918 | return 0; |
@@ -946,9 +980,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
946 | * Share io context with parent, if CLONE_IO is set | 980 | * Share io context with parent, if CLONE_IO is set |
947 | */ | 981 | */ |
948 | if (clone_flags & CLONE_IO) { | 982 | if (clone_flags & CLONE_IO) { |
949 | tsk->io_context = ioc_task_link(ioc); | 983 | ioc_task_link(ioc); |
950 | if (unlikely(!tsk->io_context)) | 984 | tsk->io_context = ioc; |
951 | return -ENOMEM; | ||
952 | } else if (ioprio_valid(ioc->ioprio)) { | 985 | } else if (ioprio_valid(ioc->ioprio)) { |
953 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); | 986 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); |
954 | if (unlikely(!new_ioc)) | 987 | if (unlikely(!new_ioc)) |
@@ -1162,6 +1195,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1162 | goto fork_out; | 1195 | goto fork_out; |
1163 | 1196 | ||
1164 | ftrace_graph_init_task(p); | 1197 | ftrace_graph_init_task(p); |
1198 | get_seccomp_filter(p); | ||
1165 | 1199 | ||
1166 | rt_mutex_init_task(p); | 1200 | rt_mutex_init_task(p); |
1167 | 1201 | ||
@@ -1342,6 +1376,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1342 | INIT_LIST_HEAD(&p->pi_state_list); | 1376 | INIT_LIST_HEAD(&p->pi_state_list); |
1343 | p->pi_state_cache = NULL; | 1377 | p->pi_state_cache = NULL; |
1344 | #endif | 1378 | #endif |
1379 | uprobe_copy_process(p); | ||
1345 | /* | 1380 | /* |
1346 | * sigaltstack should be cleared when sharing the same VM | 1381 | * sigaltstack should be cleared when sharing the same VM |
1347 | */ | 1382 | */ |
@@ -1380,6 +1415,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1380 | */ | 1415 | */ |
1381 | p->group_leader = p; | 1416 | p->group_leader = p; |
1382 | INIT_LIST_HEAD(&p->thread_group); | 1417 | INIT_LIST_HEAD(&p->thread_group); |
1418 | INIT_HLIST_HEAD(&p->task_works); | ||
1383 | 1419 | ||
1384 | /* Now that the task is set up, run cgroup callbacks if | 1420 | /* Now that the task is set up, run cgroup callbacks if |
1385 | * necessary. We need to run them before the task is visible | 1421 | * necessary. We need to run them before the task is visible |
@@ -1464,6 +1500,8 @@ bad_fork_cleanup_io: | |||
1464 | if (p->io_context) | 1500 | if (p->io_context) |
1465 | exit_io_context(p); | 1501 | exit_io_context(p); |
1466 | bad_fork_cleanup_namespaces: | 1502 | bad_fork_cleanup_namespaces: |
1503 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1504 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1467 | exit_task_namespaces(p); | 1505 | exit_task_namespaces(p); |
1468 | bad_fork_cleanup_mm: | 1506 | bad_fork_cleanup_mm: |
1469 | if (p->mm) | 1507 | if (p->mm) |
diff --git a/kernel/groups.c b/kernel/groups.c index 99b53d1eb7ea..6b2588dd04ff 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize) | |||
31 | group_info->blocks[0] = group_info->small_block; | 31 | group_info->blocks[0] = group_info->small_block; |
32 | else { | 32 | else { |
33 | for (i = 0; i < nblocks; i++) { | 33 | for (i = 0; i < nblocks; i++) { |
34 | gid_t *b; | 34 | kgid_t *b; |
35 | b = (void *)__get_free_page(GFP_USER); | 35 | b = (void *)__get_free_page(GFP_USER); |
36 | if (!b) | 36 | if (!b) |
37 | goto out_undo_partial_alloc; | 37 | goto out_undo_partial_alloc; |
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free); | |||
66 | static int groups_to_user(gid_t __user *grouplist, | 66 | static int groups_to_user(gid_t __user *grouplist, |
67 | const struct group_info *group_info) | 67 | const struct group_info *group_info) |
68 | { | 68 | { |
69 | struct user_namespace *user_ns = current_user_ns(); | ||
69 | int i; | 70 | int i; |
70 | unsigned int count = group_info->ngroups; | 71 | unsigned int count = group_info->ngroups; |
71 | 72 | ||
72 | for (i = 0; i < group_info->nblocks; i++) { | 73 | for (i = 0; i < count; i++) { |
73 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 74 | gid_t gid; |
74 | unsigned int len = cp_count * sizeof(*grouplist); | 75 | gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); |
75 | 76 | if (put_user(gid, grouplist+i)) | |
76 | if (copy_to_user(grouplist, group_info->blocks[i], len)) | ||
77 | return -EFAULT; | 77 | return -EFAULT; |
78 | |||
79 | grouplist += NGROUPS_PER_BLOCK; | ||
80 | count -= cp_count; | ||
81 | } | 78 | } |
82 | return 0; | 79 | return 0; |
83 | } | 80 | } |
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist, | |||
86 | static int groups_from_user(struct group_info *group_info, | 83 | static int groups_from_user(struct group_info *group_info, |
87 | gid_t __user *grouplist) | 84 | gid_t __user *grouplist) |
88 | { | 85 | { |
86 | struct user_namespace *user_ns = current_user_ns(); | ||
89 | int i; | 87 | int i; |
90 | unsigned int count = group_info->ngroups; | 88 | unsigned int count = group_info->ngroups; |
91 | 89 | ||
92 | for (i = 0; i < group_info->nblocks; i++) { | 90 | for (i = 0; i < count; i++) { |
93 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 91 | gid_t gid; |
94 | unsigned int len = cp_count * sizeof(*grouplist); | 92 | kgid_t kgid; |
95 | 93 | if (get_user(gid, grouplist+i)) | |
96 | if (copy_from_user(group_info->blocks[i], grouplist, len)) | ||
97 | return -EFAULT; | 94 | return -EFAULT; |
98 | 95 | ||
99 | grouplist += NGROUPS_PER_BLOCK; | 96 | kgid = make_kgid(user_ns, gid); |
100 | count -= cp_count; | 97 | if (!gid_valid(kgid)) |
98 | return -EINVAL; | ||
99 | |||
100 | GROUP_AT(group_info, i) = kgid; | ||
101 | } | 101 | } |
102 | return 0; | 102 | return 0; |
103 | } | 103 | } |
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info) | |||
117 | for (base = 0; base < max; base++) { | 117 | for (base = 0; base < max; base++) { |
118 | int left = base; | 118 | int left = base; |
119 | int right = left + stride; | 119 | int right = left + stride; |
120 | gid_t tmp = GROUP_AT(group_info, right); | 120 | kgid_t tmp = GROUP_AT(group_info, right); |
121 | 121 | ||
122 | while (left >= 0 && GROUP_AT(group_info, left) > tmp) { | 122 | while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { |
123 | GROUP_AT(group_info, right) = | 123 | GROUP_AT(group_info, right) = |
124 | GROUP_AT(group_info, left); | 124 | GROUP_AT(group_info, left); |
125 | right = left; | 125 | right = left; |
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info) | |||
132 | } | 132 | } |
133 | 133 | ||
134 | /* a simple bsearch */ | 134 | /* a simple bsearch */ |
135 | int groups_search(const struct group_info *group_info, gid_t grp) | 135 | int groups_search(const struct group_info *group_info, kgid_t grp) |
136 | { | 136 | { |
137 | unsigned int left, right; | 137 | unsigned int left, right; |
138 | 138 | ||
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) | |||
143 | right = group_info->ngroups; | 143 | right = group_info->ngroups; |
144 | while (left < right) { | 144 | while (left < right) { |
145 | unsigned int mid = (left+right)/2; | 145 | unsigned int mid = (left+right)/2; |
146 | if (grp > GROUP_AT(group_info, mid)) | 146 | if (gid_gt(grp, GROUP_AT(group_info, mid))) |
147 | left = mid + 1; | 147 | left = mid + 1; |
148 | else if (grp < GROUP_AT(group_info, mid)) | 148 | else if (gid_lt(grp, GROUP_AT(group_info, mid))) |
149 | right = mid; | 149 | right = mid; |
150 | else | 150 | else |
151 | return 1; | 151 | return 1; |
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
256 | /* | 256 | /* |
257 | * Check whether we're fsgid/egid or in the supplemental group.. | 257 | * Check whether we're fsgid/egid or in the supplemental group.. |
258 | */ | 258 | */ |
259 | int in_group_p(gid_t grp) | 259 | int in_group_p(kgid_t grp) |
260 | { | 260 | { |
261 | const struct cred *cred = current_cred(); | 261 | const struct cred *cred = current_cred(); |
262 | int retval = 1; | 262 | int retval = 1; |
263 | 263 | ||
264 | if (grp != cred->fsgid) | 264 | if (!gid_eq(grp, cred->fsgid)) |
265 | retval = groups_search(cred->group_info, grp); | 265 | retval = groups_search(cred->group_info, grp); |
266 | return retval; | 266 | return retval; |
267 | } | 267 | } |
268 | 268 | ||
269 | EXPORT_SYMBOL(in_group_p); | 269 | EXPORT_SYMBOL(in_group_p); |
270 | 270 | ||
271 | int in_egroup_p(gid_t grp) | 271 | int in_egroup_p(kgid_t grp) |
272 | { | 272 | { |
273 | const struct cred *cred = current_cred(); | 273 | const struct cred *cred = current_cred(); |
274 | int retval = 1; | 274 | int retval = 1; |
275 | 275 | ||
276 | if (grp != cred->egid) | 276 | if (!gid_eq(grp, cred->egid)) |
277 | retval = groups_search(cred->group_info, grp); | 277 | retval = groups_search(cred->group_info, grp); |
278 | return retval; | 278 | return retval; |
279 | } | 279 | } |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2a..6df614912b9d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
108 | 108 | ||
109 | touch_nmi_watchdog(); | 109 | touch_nmi_watchdog(); |
110 | 110 | ||
111 | if (sysctl_hung_task_panic) | 111 | if (sysctl_hung_task_panic) { |
112 | trigger_all_cpu_backtrace(); | ||
112 | panic("hung_task: blocked tasks"); | 113 | panic("hung_task: blocked tasks"); |
114 | } | ||
113 | } | 115 | } |
114 | 116 | ||
115 | /* | 117 | /* |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6080f6bc8c33..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) | |||
275 | kstat_incr_irqs_this_cpu(irq, desc); | 275 | kstat_incr_irqs_this_cpu(irq, desc); |
276 | 276 | ||
277 | action = desc->action; | 277 | action = desc->action; |
278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) | 278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { |
279 | desc->istate |= IRQS_PENDING; | ||
279 | goto out_unlock; | 280 | goto out_unlock; |
281 | } | ||
280 | 282 | ||
281 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 283 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
282 | raw_spin_unlock_irq(&desc->lock); | 284 | raw_spin_unlock_irq(&desc->lock); |
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
324 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 326 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
325 | kstat_incr_irqs_this_cpu(irq, desc); | 327 | kstat_incr_irqs_this_cpu(irq, desc); |
326 | 328 | ||
327 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 329 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
330 | desc->istate |= IRQS_PENDING; | ||
328 | goto out_unlock; | 331 | goto out_unlock; |
332 | } | ||
329 | 333 | ||
330 | handle_irq_event(desc); | 334 | handle_irq_event(desc); |
331 | 335 | ||
@@ -379,8 +383,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
379 | * If its disabled or no action available | 383 | * If its disabled or no action available |
380 | * keep it masked and get out of here | 384 | * keep it masked and get out of here |
381 | */ | 385 | */ |
382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 386 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
387 | desc->istate |= IRQS_PENDING; | ||
383 | goto out_unlock; | 388 | goto out_unlock; |
389 | } | ||
384 | 390 | ||
385 | handle_irq_event(desc); | 391 | handle_irq_event(desc); |
386 | 392 | ||
@@ -518,6 +524,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
518 | out_unlock: | 524 | out_unlock: |
519 | raw_spin_unlock(&desc->lock); | 525 | raw_spin_unlock(&desc->lock); |
520 | } | 526 | } |
527 | EXPORT_SYMBOL(handle_edge_irq); | ||
521 | 528 | ||
522 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | 529 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER |
523 | /** | 530 | /** |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); | |||
101 | 101 | ||
102 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 102 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
103 | 103 | ||
104 | extern int irq_do_set_affinity(struct irq_data *data, | ||
105 | const struct cpumask *dest, bool force); | ||
106 | |||
104 | /* Inline functions for support of irq chips on slow busses */ | 107 | /* Inline functions for support of irq chips on slow busses */ |
105 | static inline void chip_bus_lock(struct irq_desc *desc) | 108 | static inline void chip_bus_lock(struct irq_desc *desc) |
106 | { | 109 | { |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d86e254b95eb..192a302d6cfd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
112 | { | 112 | { |
113 | return radix_tree_lookup(&irq_desc_tree, irq); | 113 | return radix_tree_lookup(&irq_desc_tree, irq); |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(irq_to_desc); | ||
115 | 116 | ||
116 | static void delete_irq_desc(unsigned int irq) | 117 | static void delete_irq_desc(unsigned int irq) |
117 | { | 118 | { |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 0e0ba5f840b2..41c1564103f1 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #define pr_fmt(fmt) "irq: " fmt | ||
2 | |||
1 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
2 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
3 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
@@ -56,14 +58,73 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, | |||
56 | return domain; | 58 | return domain; |
57 | } | 59 | } |
58 | 60 | ||
61 | static void irq_domain_free(struct irq_domain *domain) | ||
62 | { | ||
63 | of_node_put(domain->of_node); | ||
64 | kfree(domain); | ||
65 | } | ||
66 | |||
59 | static void irq_domain_add(struct irq_domain *domain) | 67 | static void irq_domain_add(struct irq_domain *domain) |
60 | { | 68 | { |
61 | mutex_lock(&irq_domain_mutex); | 69 | mutex_lock(&irq_domain_mutex); |
62 | list_add(&domain->link, &irq_domain_list); | 70 | list_add(&domain->link, &irq_domain_list); |
63 | mutex_unlock(&irq_domain_mutex); | 71 | mutex_unlock(&irq_domain_mutex); |
64 | pr_debug("irq: Allocated domain of type %d @0x%p\n", | 72 | pr_debug("Allocated domain of type %d @0x%p\n", |
73 | domain->revmap_type, domain); | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * irq_domain_remove() - Remove an irq domain. | ||
78 | * @domain: domain to remove | ||
79 | * | ||
80 | * This routine is used to remove an irq domain. The caller must ensure | ||
81 | * that all mappings within the domain have been disposed of prior to | ||
82 | * use, depending on the revmap type. | ||
83 | */ | ||
84 | void irq_domain_remove(struct irq_domain *domain) | ||
85 | { | ||
86 | mutex_lock(&irq_domain_mutex); | ||
87 | |||
88 | switch (domain->revmap_type) { | ||
89 | case IRQ_DOMAIN_MAP_LEGACY: | ||
90 | /* | ||
91 | * Legacy domains don't manage their own irq_desc | ||
92 | * allocations, we expect the caller to handle irq_desc | ||
93 | * freeing on their own. | ||
94 | */ | ||
95 | break; | ||
96 | case IRQ_DOMAIN_MAP_TREE: | ||
97 | /* | ||
98 | * radix_tree_delete() takes care of destroying the root | ||
99 | * node when all entries are removed. Shout if there are | ||
100 | * any mappings left. | ||
101 | */ | ||
102 | WARN_ON(domain->revmap_data.tree.height); | ||
103 | break; | ||
104 | case IRQ_DOMAIN_MAP_LINEAR: | ||
105 | kfree(domain->revmap_data.linear.revmap); | ||
106 | domain->revmap_data.linear.size = 0; | ||
107 | break; | ||
108 | case IRQ_DOMAIN_MAP_NOMAP: | ||
109 | break; | ||
110 | } | ||
111 | |||
112 | list_del(&domain->link); | ||
113 | |||
114 | /* | ||
115 | * If the going away domain is the default one, reset it. | ||
116 | */ | ||
117 | if (unlikely(irq_default_domain == domain)) | ||
118 | irq_set_default_host(NULL); | ||
119 | |||
120 | mutex_unlock(&irq_domain_mutex); | ||
121 | |||
122 | pr_debug("Removed domain of type %d @0x%p\n", | ||
65 | domain->revmap_type, domain); | 123 | domain->revmap_type, domain); |
124 | |||
125 | irq_domain_free(domain); | ||
66 | } | 126 | } |
127 | EXPORT_SYMBOL_GPL(irq_domain_remove); | ||
67 | 128 | ||
68 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | 129 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, |
69 | irq_hw_number_t hwirq) | 130 | irq_hw_number_t hwirq) |
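With the add/remove pair now exported, an interrupt-controller driver can build a linear domain, hand out mappings, and tear the domain down on removal. A hedged sketch; the 'demo_' names, the domain size of 32, and the use of dummy_irq_chip are illustrative choices, not part of this patch:

	#include <linux/irq.h>
	#include <linux/irqdomain.h>
	#include <linux/of.h>

	static int demo_irq_map(struct irq_domain *d, unsigned int virq,
				irq_hw_number_t hwirq)
	{
		/* A real driver would install its own chip and flow handler. */
		irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
		return 0;
	}

	static const struct irq_domain_ops demo_irq_ops = {
		.map	= demo_irq_map,
		.xlate	= irq_domain_xlate_onecell,
	};

	static struct irq_domain *demo_domain;

	static int demo_intc_probe(struct device_node *np)
	{
		unsigned int virq;

		demo_domain = irq_domain_add_linear(np, 32, &demo_irq_ops, NULL);
		if (!demo_domain)
			return -ENOMEM;

		/* Hand out a Linux irq number for, say, hardware irq 3. */
		virq = irq_create_mapping(demo_domain, 3);
		if (!virq)
			return -EINVAL;

		return 0;
	}

	static void demo_intc_remove(void)
	{
		/* Mappings must be disposed of before the domain is removed. */
		irq_domain_remove(demo_domain);
	}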
@@ -117,8 +178,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
117 | 178 | ||
118 | if (WARN_ON(!irq_data || irq_data->domain)) { | 179 | if (WARN_ON(!irq_data || irq_data->domain)) { |
119 | mutex_unlock(&irq_domain_mutex); | 180 | mutex_unlock(&irq_domain_mutex); |
120 | of_node_put(domain->of_node); | 181 | irq_domain_free(domain); |
121 | kfree(domain); | ||
122 | return NULL; | 182 | return NULL; |
123 | } | 183 | } |
124 | } | 184 | } |
@@ -152,10 +212,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
152 | irq_domain_add(domain); | 212 | irq_domain_add(domain); |
153 | return domain; | 213 | return domain; |
154 | } | 214 | } |
215 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); | ||
155 | 216 | ||
156 | /** | 217 | /** |
157 | * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. | 218 | * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. |
158 | * @of_node: pointer to interrupt controller's device tree node. | 219 | * @of_node: pointer to interrupt controller's device tree node. |
220 | * @size: Number of interrupts in the domain. | ||
159 | * @ops: map/unmap domain callbacks | 221 | * @ops: map/unmap domain callbacks |
160 | * @host_data: Controller private data pointer | 222 | * @host_data: Controller private data pointer |
161 | */ | 223 | */ |
@@ -181,6 +243,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, | |||
181 | irq_domain_add(domain); | 243 | irq_domain_add(domain); |
182 | return domain; | 244 | return domain; |
183 | } | 245 | } |
246 | EXPORT_SYMBOL_GPL(irq_domain_add_linear); | ||
184 | 247 | ||
185 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | 248 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, |
186 | unsigned int max_irq, | 249 | unsigned int max_irq, |
@@ -195,6 +258,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | |||
195 | } | 258 | } |
196 | return domain; | 259 | return domain; |
197 | } | 260 | } |
261 | EXPORT_SYMBOL_GPL(irq_domain_add_nomap); | ||
198 | 262 | ||
199 | /** | 263 | /** |
200 | * irq_domain_add_tree() | 264 | * irq_domain_add_tree() |
@@ -216,6 +280,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, | |||
216 | } | 280 | } |
217 | return domain; | 281 | return domain; |
218 | } | 282 | } |
283 | EXPORT_SYMBOL_GPL(irq_domain_add_tree); | ||
219 | 284 | ||
220 | /** | 285 | /** |
221 | * irq_find_host() - Locates a domain for a given device node | 286 | * irq_find_host() - Locates a domain for a given device node |
@@ -259,10 +324,11 @@ EXPORT_SYMBOL_GPL(irq_find_host); | |||
259 | */ | 324 | */ |
260 | void irq_set_default_host(struct irq_domain *domain) | 325 | void irq_set_default_host(struct irq_domain *domain) |
261 | { | 326 | { |
262 | pr_debug("irq: Default domain set to @0x%p\n", domain); | 327 | pr_debug("Default domain set to @0x%p\n", domain); |
263 | 328 | ||
264 | irq_default_domain = domain; | 329 | irq_default_domain = domain; |
265 | } | 330 | } |
331 | EXPORT_SYMBOL_GPL(irq_set_default_host); | ||
266 | 332 | ||
267 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | 333 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, |
268 | irq_hw_number_t hwirq) | 334 | irq_hw_number_t hwirq) |
@@ -272,7 +338,7 @@ static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | |||
272 | irq_data->hwirq = hwirq; | 338 | irq_data->hwirq = hwirq; |
273 | irq_data->domain = domain; | 339 | irq_data->domain = domain; |
274 | if (domain->ops->map(domain, virq, hwirq)) { | 340 | if (domain->ops->map(domain, virq, hwirq)) { |
275 | pr_debug("irq: -> mapping failed, freeing\n"); | 341 | pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq); |
276 | irq_data->domain = NULL; | 342 | irq_data->domain = NULL; |
277 | irq_data->hwirq = 0; | 343 | irq_data->hwirq = 0; |
278 | return -1; | 344 | return -1; |
@@ -303,7 +369,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
303 | 369 | ||
304 | virq = irq_alloc_desc_from(1, 0); | 370 | virq = irq_alloc_desc_from(1, 0); |
305 | if (!virq) { | 371 | if (!virq) { |
306 | pr_debug("irq: create_direct virq allocation failed\n"); | 372 | pr_debug("create_direct virq allocation failed\n"); |
307 | return 0; | 373 | return 0; |
308 | } | 374 | } |
309 | if (virq >= domain->revmap_data.nomap.max_irq) { | 375 | if (virq >= domain->revmap_data.nomap.max_irq) { |
@@ -312,7 +378,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
312 | irq_free_desc(virq); | 378 | irq_free_desc(virq); |
313 | return 0; | 379 | return 0; |
314 | } | 380 | } |
315 | pr_debug("irq: create_direct obtained virq %d\n", virq); | 381 | pr_debug("create_direct obtained virq %d\n", virq); |
316 | 382 | ||
317 | if (irq_setup_virq(domain, virq, virq)) { | 383 | if (irq_setup_virq(domain, virq, virq)) { |
318 | irq_free_desc(virq); | 384 | irq_free_desc(virq); |
@@ -321,6 +387,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
321 | 387 | ||
322 | return virq; | 388 | return virq; |
323 | } | 389 | } |
390 | EXPORT_SYMBOL_GPL(irq_create_direct_mapping); | ||
324 | 391 | ||
325 | /** | 392 | /** |
326 | * irq_create_mapping() - Map a hardware interrupt into linux irq space | 393 | * irq_create_mapping() - Map a hardware interrupt into linux irq space |
@@ -338,23 +405,23 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
338 | unsigned int hint; | 405 | unsigned int hint; |
339 | int virq; | 406 | int virq; |
340 | 407 | ||
341 | pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | 408 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); |
342 | 409 | ||
343 | /* Look for default domain if necessary */ | 410 | /* Look for default domain if necessary */ |
344 | if (domain == NULL) | 411 | if (domain == NULL) |
345 | domain = irq_default_domain; | 412 | domain = irq_default_domain; |
346 | if (domain == NULL) { | 413 | if (domain == NULL) { |
347 | printk(KERN_WARNING "irq_create_mapping called for" | 414 | pr_warning("irq_create_mapping called for" |
348 | " NULL domain, hwirq=%lx\n", hwirq); | 415 | " NULL domain, hwirq=%lx\n", hwirq); |
349 | WARN_ON(1); | 416 | WARN_ON(1); |
350 | return 0; | 417 | return 0; |
351 | } | 418 | } |
352 | pr_debug("irq: -> using domain @%p\n", domain); | 419 | pr_debug("-> using domain @%p\n", domain); |
353 | 420 | ||
354 | /* Check if mapping already exists */ | 421 | /* Check if mapping already exists */ |
355 | virq = irq_find_mapping(domain, hwirq); | 422 | virq = irq_find_mapping(domain, hwirq); |
356 | if (virq) { | 423 | if (virq) { |
357 | pr_debug("irq: -> existing mapping on virq %d\n", virq); | 424 | pr_debug("-> existing mapping on virq %d\n", virq); |
358 | return virq; | 425 | return virq; |
359 | } | 426 | } |
360 | 427 | ||
@@ -370,7 +437,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
370 | if (virq <= 0) | 437 | if (virq <= 0) |
371 | virq = irq_alloc_desc_from(1, 0); | 438 | virq = irq_alloc_desc_from(1, 0); |
372 | if (virq <= 0) { | 439 | if (virq <= 0) { |
373 | pr_debug("irq: -> virq allocation failed\n"); | 440 | pr_debug("-> virq allocation failed\n"); |
374 | return 0; | 441 | return 0; |
375 | } | 442 | } |
376 | 443 | ||
@@ -380,7 +447,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
380 | return 0; | 447 | return 0; |
381 | } | 448 | } |
382 | 449 | ||
383 | pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", | 450 | pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", |
384 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); | 451 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); |
385 | 452 | ||
386 | return virq; | 453 | return virq; |
@@ -409,8 +476,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, | |||
409 | if (intsize > 0) | 476 | if (intsize > 0) |
410 | return intspec[0]; | 477 | return intspec[0]; |
411 | #endif | 478 | #endif |
412 | printk(KERN_WARNING "irq: no irq domain found for %s !\n", | 479 | pr_warning("no irq domain found for %s !\n", |
413 | controller->full_name); | 480 | controller->full_name); |
414 | return 0; | 481 | return 0; |
415 | } | 482 | } |
416 | 483 | ||
@@ -560,6 +627,7 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, | |||
560 | */ | 627 | */ |
561 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); | 628 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); |
562 | } | 629 | } |
630 | EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup); | ||
563 | 631 | ||
564 | /** | 632 | /** |
565 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. | 633 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. |
@@ -584,6 +652,7 @@ void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, | |||
584 | mutex_unlock(&revmap_trees_mutex); | 652 | mutex_unlock(&revmap_trees_mutex); |
585 | } | 653 | } |
586 | } | 654 | } |
655 | EXPORT_SYMBOL_GPL(irq_radix_revmap_insert); | ||
587 | 656 | ||
588 | /** | 657 | /** |
589 | * irq_linear_revmap() - Find a linux irq from a hw irq number. | 658 | * irq_linear_revmap() - Find a linux irq from a hw irq number. |
@@ -617,6 +686,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, | |||
617 | 686 | ||
618 | return revmap[hwirq]; | 687 | return revmap[hwirq]; |
619 | } | 688 | } |
689 | EXPORT_SYMBOL_GPL(irq_linear_revmap); | ||
620 | 690 | ||
621 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG | 691 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG |
622 | static int virq_debug_show(struct seq_file *m, void *private) | 692 | static int virq_debug_show(struct seq_file *m, void *private) |
@@ -691,8 +761,8 @@ static int __init irq_debugfs_init(void) | |||
691 | __initcall(irq_debugfs_init); | 761 | __initcall(irq_debugfs_init); |
692 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ | 762 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ |
693 | 763 | ||
694 | int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, | 764 | static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, |
695 | irq_hw_number_t hwirq) | 765 | irq_hw_number_t hwirq) |
696 | { | 766 | { |
697 | return 0; | 767 | return 0; |
698 | } | 768 | } |
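
The EXPORT_SYMBOL_GPL() additions above make the irq_domain allocators and the reverse-map lookups usable from modular interrupt-controller drivers. As a rough, hedged sketch of the linear-revmap flow those exports enable; the foo_* names, the empty irq_chip and the 32-interrupt size are all made up:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_domain *foo_domain;
static struct irq_chip foo_chip = { .name = "foo" };   /* real callbacks omitted */

static int foo_irq_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        /* Wire each newly created virq to the chip and a flow handler. */
        irq_set_chip_and_handler(virq, &foo_chip, handle_simple_irq);
        return 0;
}

static const struct irq_domain_ops foo_domain_ops = {
        .map    = foo_irq_map,
        .xlate  = irq_domain_xlate_onecell,
};

static int foo_probe(struct device_node *np)
{
        unsigned int virq;

        /* 32 hwirqs, reverse map allocated and sized by the core. */
        foo_domain = irq_domain_add_linear(np, 32, &foo_domain_ops, NULL);
        if (!foo_domain)
                return -ENOMEM;

        /* Map hwirq 5 to a Linux irq number ... */
        virq = irq_create_mapping(foo_domain, 5);

        /* ... and look it up again on the fast path, e.g. in a chained handler. */
        return irq_linear_revmap(foo_domain, 5) == virq ? 0 : -EINVAL;
}
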
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569b..8c548232ba39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -7,6 +7,8 @@ | |||
7 | * This file contains driver APIs to the irq subsystem. | 7 | * This file contains driver APIs to the irq subsystem. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #define pr_fmt(fmt) "genirq: " fmt | ||
11 | |||
10 | #include <linux/irq.h> | 12 | #include <linux/irq.h> |
11 | #include <linux/kthread.h> | 13 | #include <linux/kthread.h> |
12 | #include <linux/module.h> | 14 | #include <linux/module.h> |
@@ -14,6 +16,7 @@ | |||
14 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
15 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
16 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/task_work.h> | ||
17 | 20 | ||
18 | #include "internals.h" | 21 | #include "internals.h" |
19 | 22 | ||
@@ -139,6 +142,25 @@ static inline void | |||
139 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | 142 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } |
140 | #endif | 143 | #endif |
141 | 144 | ||
145 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | ||
146 | bool force) | ||
147 | { | ||
148 | struct irq_desc *desc = irq_data_to_desc(data); | ||
149 | struct irq_chip *chip = irq_data_get_irq_chip(data); | ||
150 | int ret; | ||
151 | |||
152 | ret = chip->irq_set_affinity(data, mask, false); | ||
153 | switch (ret) { | ||
154 | case IRQ_SET_MASK_OK: | ||
155 | cpumask_copy(data->affinity, mask); | ||
156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
157 | irq_set_thread_affinity(desc); | ||
158 | ret = 0; | ||
159 | } | ||
160 | |||
161 | return ret; | ||
162 | } | ||
163 | |||
142 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | 164 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) |
143 | { | 165 | { |
144 | struct irq_chip *chip = irq_data_get_irq_chip(data); | 166 | struct irq_chip *chip = irq_data_get_irq_chip(data); |
@@ -149,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | |||
149 | return -EINVAL; | 171 | return -EINVAL; |
150 | 172 | ||
151 | if (irq_can_move_pcntxt(data)) { | 173 | if (irq_can_move_pcntxt(data)) { |
152 | ret = chip->irq_set_affinity(data, mask, false); | 174 | ret = irq_do_set_affinity(data, mask, false); |
153 | switch (ret) { | ||
154 | case IRQ_SET_MASK_OK: | ||
155 | cpumask_copy(data->affinity, mask); | ||
156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
157 | irq_set_thread_affinity(desc); | ||
158 | ret = 0; | ||
159 | } | ||
160 | } else { | 175 | } else { |
161 | irqd_set_move_pending(data); | 176 | irqd_set_move_pending(data); |
162 | irq_copy_pending(desc, mask); | 177 | irq_copy_pending(desc, mask); |
@@ -280,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); | |||
280 | static int | 295 | static int |
281 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | 296 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) |
282 | { | 297 | { |
283 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
284 | struct cpumask *set = irq_default_affinity; | 298 | struct cpumask *set = irq_default_affinity; |
285 | int ret, node = desc->irq_data.node; | 299 | int node = desc->irq_data.node; |
286 | 300 | ||
287 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | 301 | /* Excludes PER_CPU and NO_BALANCE interrupts */ |
288 | if (!irq_can_set_affinity(irq)) | 302 | if (!irq_can_set_affinity(irq)) |
@@ -308,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
308 | if (cpumask_intersects(mask, nodemask)) | 322 | if (cpumask_intersects(mask, nodemask)) |
309 | cpumask_and(mask, mask, nodemask); | 323 | cpumask_and(mask, mask, nodemask); |
310 | } | 324 | } |
311 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); | 325 | irq_do_set_affinity(&desc->irq_data, mask, false); |
312 | switch (ret) { | ||
313 | case IRQ_SET_MASK_OK: | ||
314 | cpumask_copy(desc->irq_data.affinity, mask); | ||
315 | case IRQ_SET_MASK_OK_NOCOPY: | ||
316 | irq_set_thread_affinity(desc); | ||
317 | } | ||
318 | return 0; | 326 | return 0; |
319 | } | 327 | } |
320 | #else | 328 | #else |
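
Both hunks above route through the new irq_do_set_affinity() helper, so the IRQ_SET_MASK_OK / IRQ_SET_MASK_OK_NOCOPY handling and the thread-affinity update now live in one place instead of three. For context, a hedged sketch of what an irq_chip's .irq_set_affinity callback is expected to return to that helper; foo_write_route() stands in for real register access and is not a kernel API:

#include <linux/irq.h>
#include <linux/cpumask.h>

/* Hypothetical routing-register write, illustrative only. */
static void foo_write_route(irq_hw_number_t hwirq, unsigned int cpu)
{
}

static int foo_irq_set_affinity(struct irq_data *d, const struct cpumask *mask,
                                bool force)
{
        unsigned int cpu = cpumask_first_and(mask, cpu_online_mask);

        if (cpu >= nr_cpu_ids)
                return -EINVAL;

        foo_write_route(irqd_to_hwirq(d), cpu);

        /*
         * IRQ_SET_MASK_OK asks irq_do_set_affinity() to copy @mask into
         * irq_data->affinity; a chip that narrows the mask itself would
         * copy its effective mask and return IRQ_SET_MASK_OK_NOCOPY.
         */
        return IRQ_SET_MASK_OK;
}
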
@@ -566,7 +574,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
566 | * flow-types? | 574 | * flow-types? |
567 | */ | 575 | */ |
568 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, | 576 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, |
569 | chip ? (chip->name ? : "unknown") : "unknown"); | 577 | chip ? (chip->name ? : "unknown") : "unknown"); |
570 | return 0; | 578 | return 0; |
571 | } | 579 | } |
572 | 580 | ||
@@ -600,7 +608,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
600 | ret = 0; | 608 | ret = 0; |
601 | break; | 609 | break; |
602 | default: | 610 | default: |
603 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 611 | pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", |
604 | flags, irq, chip->irq_set_type); | 612 | flags, irq, chip->irq_set_type); |
605 | } | 613 | } |
606 | if (unmask) | 614 | if (unmask) |
@@ -773,11 +781,39 @@ static void wake_threads_waitq(struct irq_desc *desc) | |||
773 | wake_up(&desc->wait_for_threads); | 781 | wake_up(&desc->wait_for_threads); |
774 | } | 782 | } |
775 | 783 | ||
784 | static void irq_thread_dtor(struct task_work *unused) | ||
785 | { | ||
786 | struct task_struct *tsk = current; | ||
787 | struct irq_desc *desc; | ||
788 | struct irqaction *action; | ||
789 | |||
790 | if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) | ||
791 | return; | ||
792 | |||
793 | action = kthread_data(tsk); | ||
794 | |||
795 | pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
796 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | ||
797 | |||
798 | |||
799 | desc = irq_to_desc(action->irq); | ||
800 | /* | ||
801 | * If IRQTF_RUNTHREAD is set, we need to decrement | ||
802 | * desc->threads_active and wake possible waiters. | ||
803 | */ | ||
804 | if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
805 | wake_threads_waitq(desc); | ||
806 | |||
807 | /* Prevent a stale desc->threads_oneshot */ | ||
808 | irq_finalize_oneshot(desc, action); | ||
809 | } | ||
810 | |||
776 | /* | 811 | /* |
777 | * Interrupt handler thread | 812 | * Interrupt handler thread |
778 | */ | 813 | */ |
779 | static int irq_thread(void *data) | 814 | static int irq_thread(void *data) |
780 | { | 815 | { |
816 | struct task_work on_exit_work; | ||
781 | static const struct sched_param param = { | 817 | static const struct sched_param param = { |
782 | .sched_priority = MAX_USER_RT_PRIO/2, | 818 | .sched_priority = MAX_USER_RT_PRIO/2, |
783 | }; | 819 | }; |
@@ -793,7 +829,9 @@ static int irq_thread(void *data) | |||
793 | handler_fn = irq_thread_fn; | 829 | handler_fn = irq_thread_fn; |
794 | 830 | ||
795 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 831 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
796 | current->irq_thread = 1; | 832 | |
833 | init_task_work(&on_exit_work, irq_thread_dtor, NULL); | ||
834 | task_work_add(current, &on_exit_work, false); | ||
797 | 835 | ||
798 | while (!irq_wait_for_interrupt(action)) { | 836 | while (!irq_wait_for_interrupt(action)) { |
799 | irqreturn_t action_ret; | 837 | irqreturn_t action_ret; |
@@ -815,45 +853,11 @@ static int irq_thread(void *data) | |||
815 | * cannot touch the oneshot mask at this point anymore as | 853 | * cannot touch the oneshot mask at this point anymore as |
816 | * __setup_irq() might have given out currents thread_mask | 854 | * __setup_irq() might have given out currents thread_mask |
817 | * again. | 855 | * again. |
818 | * | ||
819 | * Clear irq_thread. Otherwise exit_irq_thread() would make | ||
820 | * fuzz about an active irq thread going into nirvana. | ||
821 | */ | 856 | */ |
822 | current->irq_thread = 0; | 857 | task_work_cancel(current, irq_thread_dtor); |
823 | return 0; | 858 | return 0; |
824 | } | 859 | } |
825 | 860 | ||
826 | /* | ||
827 | * Called from do_exit() | ||
828 | */ | ||
829 | void exit_irq_thread(void) | ||
830 | { | ||
831 | struct task_struct *tsk = current; | ||
832 | struct irq_desc *desc; | ||
833 | struct irqaction *action; | ||
834 | |||
835 | if (!tsk->irq_thread) | ||
836 | return; | ||
837 | |||
838 | action = kthread_data(tsk); | ||
839 | |||
840 | printk(KERN_ERR | ||
841 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
842 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | ||
843 | |||
844 | desc = irq_to_desc(action->irq); | ||
845 | |||
846 | /* | ||
847 | * If IRQTF_RUNTHREAD is set, we need to decrement | ||
848 | * desc->threads_active and wake possible waiters. | ||
849 | */ | ||
850 | if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
851 | wake_threads_waitq(desc); | ||
852 | |||
853 | /* Prevent a stale desc->threads_oneshot */ | ||
854 | irq_finalize_oneshot(desc, action); | ||
855 | } | ||
856 | |||
857 | static void irq_setup_forced_threading(struct irqaction *new) | 861 | static void irq_setup_forced_threading(struct irqaction *new) |
858 | { | 862 | { |
859 | if (!force_irqthreads) | 863 | if (!force_irqthreads) |
@@ -878,7 +882,6 @@ static int | |||
878 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 882 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
879 | { | 883 | { |
880 | struct irqaction *old, **old_ptr; | 884 | struct irqaction *old, **old_ptr; |
881 | const char *old_name = NULL; | ||
882 | unsigned long flags, thread_mask = 0; | 885 | unsigned long flags, thread_mask = 0; |
883 | int ret, nested, shared = 0; | 886 | int ret, nested, shared = 0; |
884 | cpumask_var_t mask; | 887 | cpumask_var_t mask; |
@@ -972,10 +975,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
972 | */ | 975 | */ |
973 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 976 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
974 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || | 977 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
975 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | 978 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) |
976 | old_name = old->name; | ||
977 | goto mismatch; | 979 | goto mismatch; |
978 | } | ||
979 | 980 | ||
980 | /* All handlers must agree on per-cpuness */ | 981 | /* All handlers must agree on per-cpuness */ |
981 | if ((old->flags & IRQF_PERCPU) != | 982 | if ((old->flags & IRQF_PERCPU) != |
@@ -1031,6 +1032,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1031 | * all existing action->thread_mask bits. | 1032 | * all existing action->thread_mask bits. |
1032 | */ | 1033 | */ |
1033 | new->thread_mask = 1 << ffz(thread_mask); | 1034 | new->thread_mask = 1 << ffz(thread_mask); |
1035 | |||
1036 | } else if (new->handler == irq_default_primary_handler) { | ||
1037 | /* | ||
1038 | * The interrupt was requested with handler = NULL, so | ||
1039 | * we use the default primary handler for it. But it | ||
1040 | * does not have the oneshot flag set. In combination | ||
1041 | * with level interrupts this is deadly, because the | ||
1042 | * default primary handler just wakes the thread, then | ||
1043 | * the irq lines is reenabled, but the device still | ||
1044 | * has the level irq asserted. Rinse and repeat.... | ||
1045 | * | ||
1046 | * While this works for edge type interrupts, we play | ||
1047 | * it safe and reject unconditionally because we can't | ||
1048 | * say for sure which type this interrupt really | ||
1049 | * has. The type flags are unreliable as the | ||
1050 | * underlying chip implementation can override them. | ||
1051 | */ | ||
1052 | pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", | ||
1053 | irq); | ||
1054 | ret = -EINVAL; | ||
1055 | goto out_mask; | ||
1034 | } | 1056 | } |
1035 | 1057 | ||
1036 | if (!shared) { | 1058 | if (!shared) { |
@@ -1078,7 +1100,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1078 | 1100 | ||
1079 | if (nmsk != omsk) | 1101 | if (nmsk != omsk) |
1080 | /* hope the handler works with current trigger mode */ | 1102 | /* hope the handler works with current trigger mode */ |
1081 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", | 1103 | pr_warning("irq %d uses trigger mode %u; requested %u\n", |
1082 | irq, nmsk, omsk); | 1104 | irq, nmsk, omsk); |
1083 | } | 1105 | } |
1084 | 1106 | ||
@@ -1115,14 +1137,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1115 | return 0; | 1137 | return 0; |
1116 | 1138 | ||
1117 | mismatch: | 1139 | mismatch: |
1118 | #ifdef CONFIG_DEBUG_SHIRQ | ||
1119 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 1140 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
1120 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 1141 | pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", |
1121 | if (old_name) | 1142 | irq, new->flags, new->name, old->flags, old->name); |
1122 | printk(KERN_ERR "current handler: %s\n", old_name); | 1143 | #ifdef CONFIG_DEBUG_SHIRQ |
1123 | dump_stack(); | 1144 | dump_stack(); |
1124 | } | ||
1125 | #endif | 1145 | #endif |
1146 | } | ||
1126 | ret = -EBUSY; | 1147 | ret = -EBUSY; |
1127 | 1148 | ||
1128 | out_mask: | 1149 | out_mask: |
@@ -1204,12 +1225,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1204 | /* Found it - now remove it from the list of entries: */ | 1225 | /* Found it - now remove it from the list of entries: */ |
1205 | *action_ptr = action->next; | 1226 | *action_ptr = action->next; |
1206 | 1227 | ||
1207 | /* Currently used only by UML, might disappear one day: */ | ||
1208 | #ifdef CONFIG_IRQ_RELEASE_METHOD | ||
1209 | if (desc->irq_data.chip->release) | ||
1210 | desc->irq_data.chip->release(irq, dev_id); | ||
1211 | #endif | ||
1212 | |||
1213 | /* If this was the last handler, shut down the IRQ line: */ | 1228 | /* If this was the last handler, shut down the IRQ line: */ |
1214 | if (!desc->action) | 1229 | if (!desc->action) |
1215 | irq_shutdown(desc); | 1230 | irq_shutdown(desc); |
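
One behavioural change above deserves a usage note: __setup_irq() now rejects a threaded request whose primary handler is NULL unless IRQF_ONESHOT is set, since the default primary handler only wakes the thread and would otherwise re-enable a still-asserted level line. A minimal hedged sketch of the accepted pattern; the foo_* names are illustrative:

#include <linux/interrupt.h>

static irqreturn_t foo_irq_thread(int irq, void *dev_id)
{
        /* Slow path: talk to the device over a sleeping bus, etc. */
        return IRQ_HANDLED;
}

static int foo_request(unsigned int irq, void *dev)
{
        /*
         * handler == NULL installs the default primary handler, which only
         * wakes foo_irq_thread(); IRQF_ONESHOT keeps the line masked until
         * the thread has finished. Without IRQF_ONESHOT this call now
         * fails with -EINVAL instead of setting up an interrupt storm.
         */
        return request_threaded_irq(irq, NULL, foo_irq_thread,
                                    IRQF_ONESHOT | IRQF_TRIGGER_LOW,
                                    "foo", dev);
}
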
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) | |||
42 | * For correct operation this depends on the caller | 42 | * For correct operation this depends on the caller |
43 | * masking the irqs. | 43 | * masking the irqs. |
44 | */ | 44 | */ |
45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) |
46 | < nr_cpu_ids)) { | 46 | irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); |
47 | int ret = chip->irq_set_affinity(&desc->irq_data, | ||
48 | desc->pending_mask, false); | ||
49 | switch (ret) { | ||
50 | case IRQ_SET_MASK_OK: | ||
51 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | ||
52 | case IRQ_SET_MASK_OK_NOCOPY: | ||
53 | irq_set_thread_affinity(desc); | ||
54 | } | ||
55 | } | ||
56 | 47 | ||
57 | cpumask_clear(desc->pending_mask); | 48 | cpumask_clear(desc->pending_mask); |
58 | } | 49 | } |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a6..cb228bf21760 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void) | |||
103 | int irq; | 103 | int irq; |
104 | 104 | ||
105 | for_each_irq_desc(irq, desc) { | 105 | for_each_irq_desc(irq, desc) { |
106 | /* | ||
107 | * Only interrupts which are marked as wakeup source | ||
108 | * and have not been disabled before the suspend check | ||
109 | * can abort suspend. | ||
110 | */ | ||
106 | if (irqd_is_wakeup_set(&desc->irq_data)) { | 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
107 | if (desc->istate & IRQS_PENDING) | 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) |
108 | return -EBUSY; | 113 | return -EBUSY; |
109 | continue; | 114 | continue; |
110 | } | 115 | } |
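
With the tightened check above, an interrupt aborts suspend only if it is marked as a wakeup source, is still enabled (depth == 1) and has an event pending. A hedged sketch of the driver side that arms such an interrupt; struct foo_dev and its fields are made up:

#include <linux/interrupt.h>
#include <linux/device.h>
#include <linux/pm_wakeup.h>

struct foo_dev {
        int irq;
        bool wake_armed;
};

static int foo_suspend(struct device *dev)
{
        struct foo_dev *fd = dev_get_drvdata(dev);

        if (device_may_wakeup(dev)) {
                /*
                 * Marks the line as a wakeup source; if it fires and is
                 * still pending when check_wakeup_irqs() runs, the suspend
                 * attempt is aborted with -EBUSY.
                 */
                enable_irq_wake(fd->irq);
                fd->wake_armed = true;
        }
        return 0;
}

static int foo_resume(struct device *dev)
{
        struct foo_dev *fd = dev_get_drvdata(dev);

        if (fd->wake_armed) {
                disable_irq_wake(fd->irq);
                fd->wake_armed = false;
        }
        return 0;
}
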
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c9..6454db7b6a4d 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
58 | /* | 58 | /* |
59 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
60 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
61 | * active. | 61 | * active. Clear the pending bit so suspend/resume does not |
62 | * get confused. | ||
62 | */ | 63 | */ |
63 | if (irq_settings_is_level(desc)) | 64 | if (irq_settings_is_level(desc)) { |
65 | desc->istate &= ~IRQS_PENDING; | ||
64 | return; | 66 | return; |
67 | } | ||
65 | if (desc->istate & IRQS_REPLAY) | 68 | if (desc->istate & IRQS_REPLAY) |
66 | return; | 69 | return; |
67 | if (desc->istate & IRQS_PENDING) { | 70 | if (desc->istate & IRQS_PENDING) { |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 079f1d39a8b8..2169feeba529 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | |||
343 | 343 | ||
344 | /* Look up a kernel symbol and return it in a text buffer. */ | 344 | /* Look up a kernel symbol and return it in a text buffer. */ |
345 | static int __sprint_symbol(char *buffer, unsigned long address, | 345 | static int __sprint_symbol(char *buffer, unsigned long address, |
346 | int symbol_offset) | 346 | int symbol_offset, int add_offset) |
347 | { | 347 | { |
348 | char *modname; | 348 | char *modname; |
349 | const char *name; | 349 | const char *name; |
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
358 | if (name != buffer) | 358 | if (name != buffer) |
359 | strcpy(buffer, name); | 359 | strcpy(buffer, name); |
360 | len = strlen(buffer); | 360 | len = strlen(buffer); |
361 | buffer += len; | ||
362 | offset -= symbol_offset; | 361 | offset -= symbol_offset; |
363 | 362 | ||
363 | if (add_offset) | ||
364 | len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); | ||
365 | |||
364 | if (modname) | 366 | if (modname) |
365 | len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); | 367 | len += sprintf(buffer + len, " [%s]", modname); |
366 | else | ||
367 | len += sprintf(buffer, "+%#lx/%#lx", offset, size); | ||
368 | 368 | ||
369 | return len; | 369 | return len; |
370 | } | 370 | } |
@@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
382 | */ | 382 | */ |
383 | int sprint_symbol(char *buffer, unsigned long address) | 383 | int sprint_symbol(char *buffer, unsigned long address) |
384 | { | 384 | { |
385 | return __sprint_symbol(buffer, address, 0); | 385 | return __sprint_symbol(buffer, address, 0, 1); |
386 | } | 386 | } |
387 | |||
388 | EXPORT_SYMBOL_GPL(sprint_symbol); | 387 | EXPORT_SYMBOL_GPL(sprint_symbol); |
389 | 388 | ||
390 | /** | 389 | /** |
390 | * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer | ||
391 | * @buffer: buffer to be stored | ||
392 | * @address: address to lookup | ||
393 | * | ||
394 | * This function looks up a kernel symbol with @address and stores its name | ||
395 | * and module name to @buffer if possible. If no symbol was found, just saves | ||
396 | * its @address as is. | ||
397 | * | ||
398 | * This function returns the number of bytes stored in @buffer. | ||
399 | */ | ||
400 | int sprint_symbol_no_offset(char *buffer, unsigned long address) | ||
401 | { | ||
402 | return __sprint_symbol(buffer, address, 0, 0); | ||
403 | } | ||
404 | EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); | ||
405 | |||
406 | /** | ||
391 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer | 407 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer |
392 | * @buffer: buffer to be stored | 408 | * @buffer: buffer to be stored |
393 | * @address: address to lookup | 409 | * @address: address to lookup |
@@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol); | |||
403 | */ | 419 | */ |
404 | int sprint_backtrace(char *buffer, unsigned long address) | 420 | int sprint_backtrace(char *buffer, unsigned long address) |
405 | { | 421 | { |
406 | return __sprint_symbol(buffer, address, -1); | 422 | return __sprint_symbol(buffer, address, -1, 1); |
407 | } | 423 | } |
408 | 424 | ||
409 | /* Look up a kernel symbol and print it to the kernel messages. */ | 425 | /* Look up a kernel symbol and print it to the kernel messages. */ |
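
The refactored __sprint_symbol() takes an add_offset flag so the new sprint_symbol_no_offset() can share the lookup while dropping the "+offset/size" suffix, and the module name is now appended uniformly. A small hedged sketch of the difference as a caller sees it; the address and the sample outputs in the comments are only indicative:

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void foo_show_symbol(unsigned long addr)
{
        char buf[KSYM_SYMBOL_LEN];

        sprint_symbol(buf, addr);
        pr_info("with offset:    %s\n", buf);   /* e.g. "schedule+0x0/0x6f0" */

        sprint_symbol_no_offset(buf, addr);
        pr_info("without offset: %s\n", buf);   /* e.g. "schedule" */
}
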
diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c | |||
@@ -0,0 +1,196 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/syscalls.h> | ||
3 | #include <linux/fdtable.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <linux/random.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/cache.h> | ||
10 | #include <linux/bug.h> | ||
11 | #include <linux/err.h> | ||
12 | #include <linux/kcmp.h> | ||
13 | |||
14 | #include <asm/unistd.h> | ||
15 | |||
16 | /* | ||
17 | * We don't expose the real in-memory order of objects for security reasons. | ||
18 | * But still the comparison results should be suitable for sorting. So we | ||
19 | * obfuscate kernel pointer values and compare the products instead. | ||
20 | * | ||
21 | * The obfuscation is done in two steps. First we xor the kernel pointer with | ||
22 | * a random value, which puts the pointer into a new position in a reordered space. | ||
23 | * Secondly we multiply the xor product with a large odd random number to | ||
24 | * permute its bits even more (the odd multiplier guarantees that the product | ||
25 | * is unique even after the high bits are truncated, since any odd number is | ||
26 | * relatively prime to 2^n). | ||
27 | * | ||
28 | * Note also that the obfuscation itself is invisible to userspace and if needed | ||
29 | * it can be changed to an alternate scheme. | ||
30 | */ | ||
31 | static unsigned long cookies[KCMP_TYPES][2] __read_mostly; | ||
32 | |||
33 | static long kptr_obfuscate(long v, int type) | ||
34 | { | ||
35 | return (v ^ cookies[type][0]) * cookies[type][1]; | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * 0 - equal, i.e. v1 = v2 | ||
40 | * 1 - less than, i.e. v1 < v2 | ||
41 | * 2 - greater than, i.e. v1 > v2 | ||
42 | * 3 - not equal but ordering unavailable (reserved for future) | ||
43 | */ | ||
44 | static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) | ||
45 | { | ||
46 | long ret; | ||
47 | |||
48 | ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); | ||
49 | |||
50 | return (ret < 0) | ((ret > 0) << 1); | ||
51 | } | ||
52 | |||
53 | /* The caller must have pinned the task */ | ||
54 | static struct file * | ||
55 | get_file_raw_ptr(struct task_struct *task, unsigned int idx) | ||
56 | { | ||
57 | struct file *file = NULL; | ||
58 | |||
59 | task_lock(task); | ||
60 | rcu_read_lock(); | ||
61 | |||
62 | if (task->files) | ||
63 | file = fcheck_files(task->files, idx); | ||
64 | |||
65 | rcu_read_unlock(); | ||
66 | task_unlock(task); | ||
67 | |||
68 | return file; | ||
69 | } | ||
70 | |||
71 | static void kcmp_unlock(struct mutex *m1, struct mutex *m2) | ||
72 | { | ||
73 | if (likely(m2 != m1)) | ||
74 | mutex_unlock(m2); | ||
75 | mutex_unlock(m1); | ||
76 | } | ||
77 | |||
78 | static int kcmp_lock(struct mutex *m1, struct mutex *m2) | ||
79 | { | ||
80 | int err; | ||
81 | |||
82 | if (m2 > m1) | ||
83 | swap(m1, m2); | ||
84 | |||
85 | err = mutex_lock_killable(m1); | ||
86 | if (!err && likely(m1 != m2)) { | ||
87 | err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); | ||
88 | if (err) | ||
89 | mutex_unlock(m1); | ||
90 | } | ||
91 | |||
92 | return err; | ||
93 | } | ||
94 | |||
95 | SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, | ||
96 | unsigned long, idx1, unsigned long, idx2) | ||
97 | { | ||
98 | struct task_struct *task1, *task2; | ||
99 | int ret; | ||
100 | |||
101 | rcu_read_lock(); | ||
102 | |||
103 | /* | ||
104 | * Tasks are looked up in caller's PID namespace only. | ||
105 | */ | ||
106 | task1 = find_task_by_vpid(pid1); | ||
107 | task2 = find_task_by_vpid(pid2); | ||
108 | if (!task1 || !task2) | ||
109 | goto err_no_task; | ||
110 | |||
111 | get_task_struct(task1); | ||
112 | get_task_struct(task2); | ||
113 | |||
114 | rcu_read_unlock(); | ||
115 | |||
116 | /* | ||
117 | * One should have enough rights to inspect task details. | ||
118 | */ | ||
119 | ret = kcmp_lock(&task1->signal->cred_guard_mutex, | ||
120 | &task2->signal->cred_guard_mutex); | ||
121 | if (ret) | ||
122 | goto err; | ||
123 | if (!ptrace_may_access(task1, PTRACE_MODE_READ) || | ||
124 | !ptrace_may_access(task2, PTRACE_MODE_READ)) { | ||
125 | ret = -EPERM; | ||
126 | goto err_unlock; | ||
127 | } | ||
128 | |||
129 | switch (type) { | ||
130 | case KCMP_FILE: { | ||
131 | struct file *filp1, *filp2; | ||
132 | |||
133 | filp1 = get_file_raw_ptr(task1, idx1); | ||
134 | filp2 = get_file_raw_ptr(task2, idx2); | ||
135 | |||
136 | if (filp1 && filp2) | ||
137 | ret = kcmp_ptr(filp1, filp2, KCMP_FILE); | ||
138 | else | ||
139 | ret = -EBADF; | ||
140 | break; | ||
141 | } | ||
142 | case KCMP_VM: | ||
143 | ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); | ||
144 | break; | ||
145 | case KCMP_FILES: | ||
146 | ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); | ||
147 | break; | ||
148 | case KCMP_FS: | ||
149 | ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); | ||
150 | break; | ||
151 | case KCMP_SIGHAND: | ||
152 | ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); | ||
153 | break; | ||
154 | case KCMP_IO: | ||
155 | ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); | ||
156 | break; | ||
157 | case KCMP_SYSVSEM: | ||
158 | #ifdef CONFIG_SYSVIPC | ||
159 | ret = kcmp_ptr(task1->sysvsem.undo_list, | ||
160 | task2->sysvsem.undo_list, | ||
161 | KCMP_SYSVSEM); | ||
162 | #else | ||
163 | ret = -EOPNOTSUPP; | ||
164 | #endif | ||
165 | break; | ||
166 | default: | ||
167 | ret = -EINVAL; | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | err_unlock: | ||
172 | kcmp_unlock(&task1->signal->cred_guard_mutex, | ||
173 | &task2->signal->cred_guard_mutex); | ||
174 | err: | ||
175 | put_task_struct(task1); | ||
176 | put_task_struct(task2); | ||
177 | |||
178 | return ret; | ||
179 | |||
180 | err_no_task: | ||
181 | rcu_read_unlock(); | ||
182 | return -ESRCH; | ||
183 | } | ||
184 | |||
185 | static __init int kcmp_cookies_init(void) | ||
186 | { | ||
187 | int i; | ||
188 | |||
189 | get_random_bytes(cookies, sizeof(cookies)); | ||
190 | |||
191 | for (i = 0; i < KCMP_TYPES; i++) | ||
192 | cookies[i][1] |= (~(~0UL >> 1) | 1); | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | arch_initcall(kcmp_cookies_init); | ||
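
From userspace, checkpoint/restore tooling uses the new syscall to tell whether two tasks share a given kernel object without ever seeing real pointer values. A hedged userspace sketch, assuming the installed headers provide __NR_kcmp and <linux/kcmp.h> for this kernel:

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/kcmp.h>

static long sys_kcmp(pid_t pid1, pid_t pid2, int type,
                     unsigned long idx1, unsigned long idx2)
{
        return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
}

int main(void)
{
        pid_t me = getpid();
        int dupfd = dup(1);

        /* 0: same object, 1/2: ordered result, 3: reserved, <0: error */
        printf("fd 1 vs dup(1): %ld\n", sys_kcmp(me, me, KCMP_FILE, 1, dupfd));
        printf("fd 0 vs fd 1:   %ld\n", sys_kcmp(me, me, KCMP_FILE, 0, 1));
        printf("own mm vs mm:   %ld\n", sys_kcmp(me, me, KCMP_VM, 0, 0));
        return 0;
}
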
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index c744b88c44e2..59dcf5b81d24 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize) | |||
402 | return max; | 402 | return max; |
403 | return len; | 403 | return len; |
404 | } | 404 | } |
405 | EXPORT_SYMBOL(__kfifo_max_r); | ||
405 | 406 | ||
406 | #define __KFIFO_PEEK(data, out, mask) \ | 407 | #define __KFIFO_PEEK(data, out, mask) \ |
407 | ((data)[(out) & (mask)]) | 408 | ((data)[(out) & (mask)]) |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 05698a7415fe..ff2c7cb86d77 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -221,13 +221,12 @@ fail: | |||
221 | return 0; | 221 | return 0; |
222 | } | 222 | } |
223 | 223 | ||
224 | void call_usermodehelper_freeinfo(struct subprocess_info *info) | 224 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) |
225 | { | 225 | { |
226 | if (info->cleanup) | 226 | if (info->cleanup) |
227 | (*info->cleanup)(info); | 227 | (*info->cleanup)(info); |
228 | kfree(info); | 228 | kfree(info); |
229 | } | 229 | } |
230 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); | ||
231 | 230 | ||
232 | static void umh_complete(struct subprocess_info *sub_info) | 231 | static void umh_complete(struct subprocess_info *sub_info) |
233 | { | 232 | { |
@@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); | |||
410 | 409 | ||
411 | /** | 410 | /** |
412 | * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. | 411 | * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. |
413 | * depth: New value to assign to usermodehelper_disabled. | 412 | * @depth: New value to assign to usermodehelper_disabled. |
414 | * | 413 | * |
415 | * Change the value of usermodehelper_disabled (under umhelper_sem locked for | 414 | * Change the value of usermodehelper_disabled (under umhelper_sem locked for |
416 | * writing) and wakeup tasks waiting for it to change. | 415 | * writing) and wakeup tasks waiting for it to change. |
@@ -479,6 +478,7 @@ static void helper_unlock(void) | |||
479 | * structure. This should be passed to call_usermodehelper_exec to | 478 | * structure. This should be passed to call_usermodehelper_exec to |
480 | * exec the process and free the structure. | 479 | * exec the process and free the structure. |
481 | */ | 480 | */ |
481 | static | ||
482 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | 482 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, |
483 | char **envp, gfp_t gfp_mask) | 483 | char **envp, gfp_t gfp_mask) |
484 | { | 484 | { |
@@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | |||
494 | out: | 494 | out: |
495 | return sub_info; | 495 | return sub_info; |
496 | } | 496 | } |
497 | EXPORT_SYMBOL(call_usermodehelper_setup); | ||
498 | 497 | ||
499 | /** | 498 | /** |
500 | * call_usermodehelper_setfns - set a cleanup/init function | 499 | * call_usermodehelper_setfns - set a cleanup/init function |
@@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); | |||
512 | * Function must be runnable in either a process context or the | 511 | * Function must be runnable in either a process context or the |
513 | * context in which call_usermodehelper_exec is called. | 512 | * context in which call_usermodehelper_exec is called. |
514 | */ | 513 | */ |
514 | static | ||
515 | void call_usermodehelper_setfns(struct subprocess_info *info, | 515 | void call_usermodehelper_setfns(struct subprocess_info *info, |
516 | int (*init)(struct subprocess_info *info, struct cred *new), | 516 | int (*init)(struct subprocess_info *info, struct cred *new), |
517 | void (*cleanup)(struct subprocess_info *info), | 517 | void (*cleanup)(struct subprocess_info *info), |
@@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, | |||
521 | info->init = init; | 521 | info->init = init; |
522 | info->data = data; | 522 | info->data = data; |
523 | } | 523 | } |
524 | EXPORT_SYMBOL(call_usermodehelper_setfns); | ||
525 | 524 | ||
526 | /** | 525 | /** |
527 | * call_usermodehelper_exec - start a usermode application | 526 | * call_usermodehelper_exec - start a usermode application |
@@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); | |||
535 | * asynchronously if wait is not set, and runs as a child of keventd. | 534 | * asynchronously if wait is not set, and runs as a child of keventd. |
536 | * (ie. it runs with full root capabilities). | 535 | * (ie. it runs with full root capabilities). |
537 | */ | 536 | */ |
537 | static | ||
538 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | 538 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) |
539 | { | 539 | { |
540 | DECLARE_COMPLETION_ONSTACK(done); | 540 | DECLARE_COMPLETION_ONSTACK(done); |
@@ -576,7 +576,25 @@ unlock: | |||
576 | helper_unlock(); | 576 | helper_unlock(); |
577 | return retval; | 577 | return retval; |
578 | } | 578 | } |
579 | EXPORT_SYMBOL(call_usermodehelper_exec); | 579 | |
580 | int call_usermodehelper_fns( | ||
581 | char *path, char **argv, char **envp, int wait, | ||
582 | int (*init)(struct subprocess_info *info, struct cred *new), | ||
583 | void (*cleanup)(struct subprocess_info *), void *data) | ||
584 | { | ||
585 | struct subprocess_info *info; | ||
586 | gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; | ||
587 | |||
588 | info = call_usermodehelper_setup(path, argv, envp, gfp_mask); | ||
589 | |||
590 | if (info == NULL) | ||
591 | return -ENOMEM; | ||
592 | |||
593 | call_usermodehelper_setfns(info, init, cleanup, data); | ||
594 | |||
595 | return call_usermodehelper_exec(info, wait); | ||
596 | } | ||
597 | EXPORT_SYMBOL(call_usermodehelper_fns); | ||
580 | 598 | ||
581 | static int proc_cap_handler(struct ctl_table *table, int write, | 599 | static int proc_cap_handler(struct ctl_table *table, int write, |
582 | void __user *buffer, size_t *lenp, loff_t *ppos) | 600 | void __user *buffer, size_t *lenp, loff_t *ppos) |
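
With the setup/setfns/exec trio made static, call_usermodehelper_fns() is now the exported entry point for callers that need an init or cleanup hook; plain callers keep using the call_usermodehelper() wrapper. A hedged in-kernel sketch, with the helper path and all foo_* names invented:

#include <linux/kmod.h>
#include <linux/cred.h>

/* Runs in the helper task before exec; could adjust creds, rlimits, etc. */
static int foo_umh_init(struct subprocess_info *info, struct cred *new)
{
        return 0;
}

static int foo_run_helper(void)
{
        char *argv[] = { "/sbin/foo-agent", "--oneshot", NULL };
        char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

        /* UMH_WAIT_PROC blocks until the helper process exits. */
        return call_usermodehelper_fns("/sbin/foo-agent", argv, envp,
                                       UMH_WAIT_PROC, foo_umh_init,
                                       NULL, NULL);
}
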
diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 000000000000..6535a667a5a7 --- /dev/null +++ b/kernel/lglock.c | |||
@@ -0,0 +1,89 @@ | |||
1 | /* See include/linux/lglock.h for description */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/lglock.h> | ||
4 | #include <linux/cpu.h> | ||
5 | #include <linux/string.h> | ||
6 | |||
7 | /* | ||
8 | * Note there is no uninit, so lglocks cannot be defined in | ||
9 | * modules (but it's fine to use them from there) | ||
10 | * Could be added though, just undo lg_lock_init | ||
11 | */ | ||
12 | |||
13 | void lg_lock_init(struct lglock *lg, char *name) | ||
14 | { | ||
15 | LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); | ||
16 | } | ||
17 | EXPORT_SYMBOL(lg_lock_init); | ||
18 | |||
19 | void lg_local_lock(struct lglock *lg) | ||
20 | { | ||
21 | arch_spinlock_t *lock; | ||
22 | |||
23 | preempt_disable(); | ||
24 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
25 | lock = this_cpu_ptr(lg->lock); | ||
26 | arch_spin_lock(lock); | ||
27 | } | ||
28 | EXPORT_SYMBOL(lg_local_lock); | ||
29 | |||
30 | void lg_local_unlock(struct lglock *lg) | ||
31 | { | ||
32 | arch_spinlock_t *lock; | ||
33 | |||
34 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
35 | lock = this_cpu_ptr(lg->lock); | ||
36 | arch_spin_unlock(lock); | ||
37 | preempt_enable(); | ||
38 | } | ||
39 | EXPORT_SYMBOL(lg_local_unlock); | ||
40 | |||
41 | void lg_local_lock_cpu(struct lglock *lg, int cpu) | ||
42 | { | ||
43 | arch_spinlock_t *lock; | ||
44 | |||
45 | preempt_disable(); | ||
46 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
47 | lock = per_cpu_ptr(lg->lock, cpu); | ||
48 | arch_spin_lock(lock); | ||
49 | } | ||
50 | EXPORT_SYMBOL(lg_local_lock_cpu); | ||
51 | |||
52 | void lg_local_unlock_cpu(struct lglock *lg, int cpu) | ||
53 | { | ||
54 | arch_spinlock_t *lock; | ||
55 | |||
56 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
57 | lock = per_cpu_ptr(lg->lock, cpu); | ||
58 | arch_spin_unlock(lock); | ||
59 | preempt_enable(); | ||
60 | } | ||
61 | EXPORT_SYMBOL(lg_local_unlock_cpu); | ||
62 | |||
63 | void lg_global_lock(struct lglock *lg) | ||
64 | { | ||
65 | int i; | ||
66 | |||
67 | preempt_disable(); | ||
68 | rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
69 | for_each_possible_cpu(i) { | ||
70 | arch_spinlock_t *lock; | ||
71 | lock = per_cpu_ptr(lg->lock, i); | ||
72 | arch_spin_lock(lock); | ||
73 | } | ||
74 | } | ||
75 | EXPORT_SYMBOL(lg_global_lock); | ||
76 | |||
77 | void lg_global_unlock(struct lglock *lg) | ||
78 | { | ||
79 | int i; | ||
80 | |||
81 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
82 | for_each_possible_cpu(i) { | ||
83 | arch_spinlock_t *lock; | ||
84 | lock = per_cpu_ptr(lg->lock, i); | ||
85 | arch_spin_unlock(lock); | ||
86 | } | ||
87 | preempt_enable(); | ||
88 | } | ||
89 | EXPORT_SYMBOL(lg_global_unlock); | ||
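
lglock.c turns the old header-generated brlocks into ordinary exported functions shared by fs/file_table.c and fs/namespace.c. A hedged sketch of the usual pattern on top of DEFINE_LGLOCK(); the foo_* names are illustrative and the per-CPU list heads are assumed to be initialised elsewhere:

#include <linux/lglock.h>
#include <linux/percpu.h>
#include <linux/list.h>

/* One arch_spinlock_t per CPU; lg_lock_init(&foo_lock, "foo_lock") once at
 * boot hooks it up to lockdep. */
DEFINE_LGLOCK(foo_lock);
static DEFINE_PER_CPU(struct list_head, foo_list);

static void foo_add(struct list_head *entry)
{
        /* Fast path: only this CPU's spinlock is taken. */
        lg_local_lock(&foo_lock);
        list_add(entry, this_cpu_ptr(&foo_list));
        lg_local_unlock(&foo_lock);
}

static void foo_walk_all(void (*fn)(struct list_head *))
{
        int cpu;
        struct list_head *pos;

        /* Slow path: excludes every CPU's fast path at once. */
        lg_global_lock(&foo_lock);
        for_each_possible_cpu(cpu) {
                list_for_each(pos, per_cpu_ptr(&foo_list, cpu))
                        fn(pos);
        }
        lg_global_unlock(&foo_lock);
}
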
diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e425..4edbd9c11aca 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info, | |||
2429 | goto free_hdr; | 2429 | goto free_hdr; |
2430 | } | 2430 | } |
2431 | 2431 | ||
2432 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { | 2432 | if (hdr->e_shoff >= len || |
2433 | hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { | ||
2433 | err = -ENOEXEC; | 2434 | err = -ENOEXEC; |
2434 | goto free_hdr; | 2435 | goto free_hdr; |
2435 | } | 2436 | } |
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod, | |||
2953 | 2954 | ||
2954 | /* Module is ready to execute: parsing args may do that. */ | 2955 | /* Module is ready to execute: parsing args may do that. */ |
2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 2956 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
2956 | -32768, 32767, NULL); | 2957 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
2957 | if (err < 0) | 2958 | if (err < 0) |
2958 | goto unlink; | 2959 | goto unlink; |
2959 | 2960 | ||
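
The copy_and_check() change above is an integer-overflow fix: e_shoff and e_shnum come from an untrusted ELF header, so the old "len < e_shoff + e_shnum * sizeof(Elf_Shdr)" test can be defeated by making the sum wrap. A small hedged illustration of the same idea in plain C, with made-up header values:

#include <stdio.h>
#include <stdint.h>

/* 1 if [off, off + nmemb * size) fits in a buffer of length len (size > 0). */
static int range_ok(uint64_t len, uint64_t off, uint64_t nmemb, uint64_t size)
{
        /* Never form off + nmemb * size directly; check the remaining space. */
        if (off >= len)
                return 0;
        if (nmemb > (len - off) / size)
                return 0;
        return 1;
}

int main(void)
{
        uint64_t len = 4096;                    /* bytes actually read in */
        uint64_t off = UINT64_MAX - 63;         /* crafted e_shoff */
        uint64_t num = 2, entsize = 64;         /* e_shnum, sizeof(Elf_Shdr) */

        printf("naive check: %s\n",
               len < off + num * entsize ? "rejected" : "ACCEPTED (overflowed)");
        printf("safe check:  %s\n",
               range_ok(len, off, num, entsize) ? "accepted" : "rejected");
        return 0;
}
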
diff --git a/kernel/params.c b/kernel/params.c index f37d82631347..ed35345be536 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) | |||
85 | 85 | ||
86 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
87 | char *val, | 87 | char *val, |
88 | const char *doing, | ||
88 | const struct kernel_param *params, | 89 | const struct kernel_param *params, |
89 | unsigned num_params, | 90 | unsigned num_params, |
90 | s16 min_level, | 91 | s16 min_level, |
91 | s16 max_level, | 92 | s16 max_level, |
92 | int (*handle_unknown)(char *param, char *val)) | 93 | int (*handle_unknown)(char *param, char *val, |
94 | const char *doing)) | ||
93 | { | 95 | { |
94 | unsigned int i; | 96 | unsigned int i; |
95 | int err; | 97 | int err; |
@@ -104,8 +106,8 @@ static int parse_one(char *param, | |||
104 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && params[i].ops->set != param_set_bool |
105 | && params[i].ops->set != param_set_bint) | 107 | && params[i].ops->set != param_set_bint) |
106 | return -EINVAL; | 108 | return -EINVAL; |
107 | pr_debug("They are equal! Calling %p\n", | 109 | pr_debug("handling %s with %p\n", param, |
108 | params[i].ops->set); | 110 | params[i].ops->set); |
109 | mutex_lock(¶m_lock); | 111 | mutex_lock(¶m_lock); |
110 | err = params[i].ops->set(val, ¶ms[i]); | 112 | err = params[i].ops->set(val, ¶ms[i]); |
111 | mutex_unlock(¶m_lock); | 113 | mutex_unlock(¶m_lock); |
@@ -114,11 +116,11 @@ static int parse_one(char *param, | |||
114 | } | 116 | } |
115 | 117 | ||
116 | if (handle_unknown) { | 118 | if (handle_unknown) { |
117 | pr_debug("Unknown argument: calling %p\n", handle_unknown); | 119 | pr_debug("doing %s: %s='%s'\n", doing, param, val); |
118 | return handle_unknown(param, val); | 120 | return handle_unknown(param, val, doing); |
119 | } | 121 | } |
120 | 122 | ||
121 | pr_debug("Unknown argument `%s'\n", param); | 123 | pr_debug("Unknown argument '%s'\n", param); |
122 | return -ENOENT; | 124 | return -ENOENT; |
123 | } | 125 | } |
124 | 126 | ||
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) | |||
175 | } | 177 | } |
176 | 178 | ||
177 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
178 | int parse_args(const char *name, | 180 | int parse_args(const char *doing, |
179 | char *args, | 181 | char *args, |
180 | const struct kernel_param *params, | 182 | const struct kernel_param *params, |
181 | unsigned num, | 183 | unsigned num, |
182 | s16 min_level, | 184 | s16 min_level, |
183 | s16 max_level, | 185 | s16 max_level, |
184 | int (*unknown)(char *param, char *val)) | 186 | int (*unknown)(char *param, char *val, const char *doing)) |
185 | { | 187 | { |
186 | char *param, *val; | 188 | char *param, *val; |
187 | 189 | ||
188 | pr_debug("Parsing ARGS: %s\n", args); | ||
189 | |||
190 | /* Chew leading spaces */ | 190 | /* Chew leading spaces */ |
191 | args = skip_spaces(args); | 191 | args = skip_spaces(args); |
192 | 192 | ||
193 | if (*args) | ||
194 | pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); | ||
195 | |||
193 | while (*args) { | 196 | while (*args) { |
194 | int ret; | 197 | int ret; |
195 | int irq_was_disabled; | 198 | int irq_was_disabled; |
196 | 199 | ||
197 | args = next_arg(args, ¶m, &val); | 200 | args = next_arg(args, ¶m, &val); |
198 | irq_was_disabled = irqs_disabled(); | 201 | irq_was_disabled = irqs_disabled(); |
199 | ret = parse_one(param, val, params, num, | 202 | ret = parse_one(param, val, doing, params, num, |
200 | min_level, max_level, unknown); | 203 | min_level, max_level, unknown); |
201 | if (irq_was_disabled && !irqs_disabled()) { | 204 | if (irq_was_disabled && !irqs_disabled()) |
202 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | 205 | pr_warn("%s: option '%s' enabled irq's!\n", |
203 | "irq's!\n", param); | 206 | doing, param); |
204 | } | 207 | |
205 | switch (ret) { | 208 | switch (ret) { |
206 | case -ENOENT: | 209 | case -ENOENT: |
207 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | 210 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
208 | name, param); | ||
209 | return ret; | 211 | return ret; |
210 | case -ENOSPC: | 212 | case -ENOSPC: |
211 | printk(KERN_ERR | 213 | pr_err("%s: `%s' too large for parameter `%s'\n", |
212 | "%s: `%s' too large for parameter `%s'\n", | 214 | doing, val ?: "", param); |
213 | name, val ?: "", param); | ||
214 | return ret; | 215 | return ret; |
215 | case 0: | 216 | case 0: |
216 | break; | 217 | break; |
217 | default: | 218 | default: |
218 | printk(KERN_ERR | 219 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
219 | "%s: `%s' invalid for parameter `%s'\n", | 220 | doing, val ?: "", param); |
220 | name, val ?: "", param); | ||
221 | return ret; | 221 | return ret; |
222 | } | 222 | } |
223 | } | 223 | } |
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | |||
263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 263 | int param_set_charp(const char *val, const struct kernel_param *kp) |
264 | { | 264 | { |
265 | if (strlen(val) > 1024) { | 265 | if (strlen(val) > 1024) { |
266 | printk(KERN_ERR "%s: string parameter too long\n", | 266 | pr_err("%s: string parameter too long\n", kp->name); |
267 | kp->name); | ||
268 | return -ENOSPC; | 267 | return -ENOSPC; |
269 | } | 268 | } |
270 | 269 | ||
@@ -400,8 +399,7 @@ static int param_array(const char *name, | |||
400 | int len; | 399 | int len; |
401 | 400 | ||
402 | if (*num == max) { | 401 | if (*num == max) { |
403 | printk(KERN_ERR "%s: can only take %i arguments\n", | 402 | pr_err("%s: can only take %i arguments\n", name, max); |
404 | name, max); | ||
405 | return -EINVAL; | 403 | return -EINVAL; |
406 | } | 404 | } |
407 | len = strcspn(val, ","); | 405 | len = strcspn(val, ","); |
@@ -420,8 +418,7 @@ static int param_array(const char *name, | |||
420 | } while (save == ','); | 418 | } while (save == ','); |
421 | 419 | ||
422 | if (*num < min) { | 420 | if (*num < min) { |
423 | printk(KERN_ERR "%s: needs at least %i arguments\n", | 421 | pr_err("%s: needs at least %i arguments\n", name, min); |
424 | name, min); | ||
425 | return -EINVAL; | 422 | return -EINVAL; |
426 | } | 423 | } |
427 | return 0; | 424 | return 0; |
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) | |||
480 | const struct kparam_string *kps = kp->str; | 477 | const struct kparam_string *kps = kp->str; |
481 | 478 | ||
482 | if (strlen(val)+1 > kps->maxlen) { | 479 | if (strlen(val)+1 > kps->maxlen) { |
483 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 480 | pr_err("%s: string doesn't fit in %u chars.\n", |
484 | kp->name, kps->maxlen-1); | 481 | kp->name, kps->maxlen-1); |
485 | return -ENOSPC; | 482 | return -ENOSPC; |
486 | } | 483 | } |
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
750 | #endif | 747 | #endif |
751 | if (err) { | 748 | if (err) { |
752 | kobject_put(&mk->kobj); | 749 | kobject_put(&mk->kobj); |
753 | printk(KERN_ERR | 750 | pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", |
754 | "Module '%s' failed add to sysfs, error number %d\n", | ||
755 | name, err); | 751 | name, err); |
756 | printk(KERN_ERR | ||
757 | "The system will be unstable now.\n"); | ||
758 | return NULL; | 752 | return NULL; |
759 | } | 753 | } |
760 | 754 | ||
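
parse_args() now carries a "doing" string (typically the module name, or a boot-time tag for the kernel command line) and hands it to the unknown-parameter callback; that is what lets ddebug_dyndbg_module_param_cb() in the module.c hunk above attribute dyndbg options to the right module. A hedged sketch of a callback matching the new signature; the callback itself is invented:

#include <linux/kernel.h>
#include <linux/moduleparam.h>

/*
 * Called by parse_args() for each "param=val" that no kernel_param entry
 * claimed. Returning 0 accepts the option silently; -ENOENT reports it
 * as an unknown parameter.
 */
static int foo_unknown_param_cb(char *param, char *val, const char *doing)
{
        pr_debug("%s: unhandled option %s='%s'\n",
                 doing, param, val ? val : "");
        return 0;
}

/*
 * Passed as the last argument of parse_args(), e.g.:
 *   parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
 *              -32768, 32767, foo_unknown_param_cb);
 */
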
diff --git a/kernel/pid.c b/kernel/pid.c index 9f08dfabaf13..e86b291ad834 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -547,7 +547,8 @@ void __init pidhash_init(void) | |||
547 | 547 | ||
548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
549 | HASH_EARLY | HASH_SMALL, | 549 | HASH_EARLY | HASH_SMALL, |
550 | &pidhash_shift, NULL, 4096); | 550 | &pidhash_shift, NULL, |
551 | 0, 4096); | ||
551 | pidhash_size = 1U << pidhash_shift; | 552 | pidhash_size = 1U << pidhash_shift; |
552 | 553 | ||
553 | for (i = 0; i < pidhash_size; i++) | 554 | for (i = 0; i < pidhash_size; i++) |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 57bc1fd35b3c..16b20e38c4a1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
149 | { | 149 | { |
150 | int nr; | 150 | int nr; |
151 | int rc; | 151 | int rc; |
152 | struct task_struct *task; | 152 | struct task_struct *task, *me = current; |
153 | |||
154 | /* Ignore SIGCHLD causing any terminated children to autoreap */ | ||
155 | spin_lock_irq(&me->sighand->siglock); | ||
156 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; | ||
157 | spin_unlock_irq(&me->sighand->siglock); | ||
153 | 158 | ||
154 | /* | 159 | /* |
155 | * The last thread in the cgroup-init thread group is terminating. | 160 | * The last thread in the cgroup-init thread group is terminating. |
@@ -191,6 +196,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
191 | return; | 196 | return; |
192 | } | 197 | } |
193 | 198 | ||
199 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
194 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | 200 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
195 | void __user *buffer, size_t *lenp, loff_t *ppos) | 201 | void __user *buffer, size_t *lenp, loff_t *ppos) |
196 | { | 202 | { |
@@ -218,8 +224,8 @@ static struct ctl_table pid_ns_ctl_table[] = { | |||
218 | }, | 224 | }, |
219 | { } | 225 | { } |
220 | }; | 226 | }; |
221 | |||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | 227 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; |
228 | #endif /* CONFIG_CHECKPOINT_RESTORE */ | ||
223 | 229 | ||
224 | int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | 230 | int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) |
225 | { | 231 | { |
@@ -253,7 +259,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
253 | static __init int pid_namespaces_init(void) | 259 | static __init int pid_namespaces_init(void) |
254 | { | 260 | { |
255 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 261 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
262 | |||
263 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
256 | register_sysctl_paths(kern_path, pid_ns_ctl_table); | 264 | register_sysctl_paths(kern_path, pid_ns_ctl_table); |
265 | #endif | ||
257 | return 0; | 266 | return 0; |
258 | } | 267 | } |
259 | 268 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e3216..8f9b4eb974e0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP | |||
103 | select HOTPLUG | 103 | select HOTPLUG |
104 | select HOTPLUG_CPU | 104 | select HOTPLUG_CPU |
105 | 105 | ||
106 | config PM_AUTOSLEEP | ||
107 | bool "Opportunistic sleep" | ||
108 | depends on PM_SLEEP | ||
109 | default n | ||
110 | ---help--- | ||
111 | Allow the kernel to trigger a system transition into a global sleep | ||
112 | state automatically whenever there are no active wakeup sources. | ||
113 | |||
114 | config PM_WAKELOCKS | ||
115 | bool "User space wakeup sources interface" | ||
116 | depends on PM_SLEEP | ||
117 | default n | ||
118 | ---help--- | ||
119 | Allow user space to create, activate and deactivate wakeup source | ||
120 | objects with the help of a sysfs-based interface. | ||
121 | |||
122 | config PM_WAKELOCKS_LIMIT | ||
123 | int "Maximum number of user space wakeup sources (0 = no limit)" | ||
124 | range 0 100000 | ||
125 | default 100 | ||
126 | depends on PM_WAKELOCKS | ||
127 | |||
128 | config PM_WAKELOCKS_GC | ||
129 | bool "Garbage collector for user space wakeup sources" | ||
130 | depends on PM_WAKELOCKS | ||
131 | default y | ||
132 | |||
106 | config PM_RUNTIME | 133 | config PM_RUNTIME |
107 | bool "Run-time PM core functionality" | 134 | bool "Run-time PM core functionality" |
108 | depends on !IA64_HP_SIM | 135 | depends on !IA64_HP_SIM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec5252..29472bff11ef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
11 | block_io.o | 11 | block_io.o |
12 | obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o | ||
13 | obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o | ||
12 | 14 | ||
13 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 000000000000..ca304046d9e2 --- /dev/null +++ b/kernel/power/autosleep.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * kernel/power/autosleep.c | ||
3 | * | ||
4 | * Opportunistic sleep support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | */ | ||
8 | |||
9 | #include <linux/device.h> | ||
10 | #include <linux/mutex.h> | ||
11 | #include <linux/pm_wakeup.h> | ||
12 | |||
13 | #include "power.h" | ||
14 | |||
15 | static suspend_state_t autosleep_state; | ||
16 | static struct workqueue_struct *autosleep_wq; | ||
17 | /* | ||
18 | * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source | ||
19 | * is active, otherwise a deadlock with try_to_suspend() is possible. | ||
20 | * Alternatively mutex_lock_interruptible() can be used. This will then fail | ||
21 | * if an auto_sleep cycle tries to freeze processes. | ||
22 | */ | ||
23 | static DEFINE_MUTEX(autosleep_lock); | ||
24 | static struct wakeup_source *autosleep_ws; | ||
25 | |||
26 | static void try_to_suspend(struct work_struct *work) | ||
27 | { | ||
28 | unsigned int initial_count, final_count; | ||
29 | |||
30 | if (!pm_get_wakeup_count(&initial_count, true)) | ||
31 | goto out; | ||
32 | |||
33 | mutex_lock(&autosleep_lock); | ||
34 | |||
35 | if (!pm_save_wakeup_count(initial_count)) { | ||
36 | mutex_unlock(&autosleep_lock); | ||
37 | goto out; | ||
38 | } | ||
39 | |||
40 | if (autosleep_state == PM_SUSPEND_ON) { | ||
41 | mutex_unlock(&autosleep_lock); | ||
42 | return; | ||
43 | } | ||
44 | if (autosleep_state >= PM_SUSPEND_MAX) | ||
45 | hibernate(); | ||
46 | else | ||
47 | pm_suspend(autosleep_state); | ||
48 | |||
49 | mutex_unlock(&autosleep_lock); | ||
50 | |||
51 | if (!pm_get_wakeup_count(&final_count, false)) | ||
52 | goto out; | ||
53 | |||
54 | /* | ||
55 | * If the wakeup occurred for an unknown reason, wait to prevent the | ||
56 | * system from trying to suspend and waking up in a tight loop. | ||
57 | */ | ||
58 | if (final_count == initial_count) | ||
59 | schedule_timeout_uninterruptible(HZ / 2); | ||
60 | |||
61 | out: | ||
62 | queue_up_suspend_work(); | ||
63 | } | ||
64 | |||
65 | static DECLARE_WORK(suspend_work, try_to_suspend); | ||
66 | |||
67 | void queue_up_suspend_work(void) | ||
68 | { | ||
69 | if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) | ||
70 | queue_work(autosleep_wq, &suspend_work); | ||
71 | } | ||
72 | |||
73 | suspend_state_t pm_autosleep_state(void) | ||
74 | { | ||
75 | return autosleep_state; | ||
76 | } | ||
77 | |||
78 | int pm_autosleep_lock(void) | ||
79 | { | ||
80 | return mutex_lock_interruptible(&autosleep_lock); | ||
81 | } | ||
82 | |||
83 | void pm_autosleep_unlock(void) | ||
84 | { | ||
85 | mutex_unlock(&autosleep_lock); | ||
86 | } | ||
87 | |||
88 | int pm_autosleep_set_state(suspend_state_t state) | ||
89 | { | ||
90 | |||
91 | #ifndef CONFIG_HIBERNATION | ||
92 | if (state >= PM_SUSPEND_MAX) | ||
93 | return -EINVAL; | ||
94 | #endif | ||
95 | |||
96 | __pm_stay_awake(autosleep_ws); | ||
97 | |||
98 | mutex_lock(&autosleep_lock); | ||
99 | |||
100 | autosleep_state = state; | ||
101 | |||
102 | __pm_relax(autosleep_ws); | ||
103 | |||
104 | if (state > PM_SUSPEND_ON) { | ||
105 | pm_wakep_autosleep_enabled(true); | ||
106 | queue_up_suspend_work(); | ||
107 | } else { | ||
108 | pm_wakep_autosleep_enabled(false); | ||
109 | } | ||
110 | |||
111 | mutex_unlock(&autosleep_lock); | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | int __init pm_autosleep_init(void) | ||
116 | { | ||
117 | autosleep_ws = wakeup_source_register("autosleep"); | ||
118 | if (!autosleep_ws) | ||
119 | return -ENOMEM; | ||
120 | |||
121 | autosleep_wq = alloc_ordered_workqueue("autosleep", 0); | ||
122 | if (autosleep_wq) | ||
123 | return 0; | ||
124 | |||
125 | wakeup_source_unregister(autosleep_ws); | ||
126 | return -ENOMEM; | ||
127 | } | ||
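
The opportunistic-sleep state configured by pm_autosleep_set_state() above is selected from user space through the /sys/power/autosleep attribute wired up in kernel/power/main.c further below. A minimal user-space sketch of that interaction; the autosleep_write() helper and the choice of "mem" are illustrative assumptions, not part of the patch:

/* Hypothetical user-space control of /sys/power/autosleep. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int autosleep_write(const char *state)
{
	/* "off", "mem", "standby" or "disk", matching autosleep_store()/decode_state() above */
	int fd = open("/sys/power/autosleep", O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror("open /sys/power/autosleep");
		return -1;
	}
	n = write(fd, state, strlen(state));
	if (n < 0)
		perror("write /sys/power/autosleep");
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Enable opportunistic suspend-to-RAM; writing "off" later stops the cycle. */
	return autosleep_write("mem") ? 1 : 0;
}

Once a sleep state is written, try_to_suspend() re-queues itself after every attempt, so the system keeps suspending whenever no wakeup sources are active.
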
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e09dfbfeecee..8b53db38a279 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -25,6 +25,8 @@ | |||
25 | #include <linux/freezer.h> | 25 | #include <linux/freezer.h> |
26 | #include <linux/gfp.h> | 26 | #include <linux/gfp.h> |
27 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
28 | #include <linux/ctype.h> | ||
29 | #include <linux/genhd.h> | ||
28 | #include <scsi/scsi_scan.h> | 30 | #include <scsi/scsi_scan.h> |
29 | 31 | ||
30 | #include "power.h" | 32 | #include "power.h" |
@@ -722,6 +724,17 @@ static int software_resume(void) | |||
722 | 724 | ||
723 | /* Check if the device is there */ | 725 | /* Check if the device is there */ |
724 | swsusp_resume_device = name_to_dev_t(resume_file); | 726 | swsusp_resume_device = name_to_dev_t(resume_file); |
727 | |||
728 | /* | ||
729 | * name_to_dev_t() is ineffective at verifying the partition when | ||
730 | * resume_file is in integer format (e.g. major:minor). | ||
731 | */ | ||
732 | if (isdigit(resume_file[0]) && resume_wait) { | ||
733 | int partno; | ||
734 | while (!get_gendisk(swsusp_resume_device, &partno)) | ||
735 | msleep(10); | ||
736 | } | ||
737 | |||
725 | if (!swsusp_resume_device) { | 738 | if (!swsusp_resume_device) { |
726 | /* | 739 | /* |
727 | * Some device discovery might still be in progress; we need | 740 | * Some device discovery might still be in progress; we need |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c62..428f8a034e96 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
269 | return (s - buf); | 269 | return (s - buf); |
270 | } | 270 | } |
271 | 271 | ||
272 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | 272 | static suspend_state_t decode_state(const char *buf, size_t n) |
273 | const char *buf, size_t n) | ||
274 | { | 273 | { |
275 | #ifdef CONFIG_SUSPEND | 274 | #ifdef CONFIG_SUSPEND |
276 | suspend_state_t state = PM_SUSPEND_STANDBY; | 275 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
278 | #endif | 277 | #endif |
279 | char *p; | 278 | char *p; |
280 | int len; | 279 | int len; |
281 | int error = -EINVAL; | ||
282 | 280 | ||
283 | p = memchr(buf, '\n', n); | 281 | p = memchr(buf, '\n', n); |
284 | len = p ? p - buf : n; | 282 | len = p ? p - buf : n; |
285 | 283 | ||
286 | /* First, check if we are requested to hibernate */ | 284 | /* Check hibernation first. */ |
287 | if (len == 4 && !strncmp(buf, "disk", len)) { | 285 | if (len == 4 && !strncmp(buf, "disk", len)) |
288 | error = hibernate(); | 286 | return PM_SUSPEND_MAX; |
289 | goto Exit; | ||
290 | } | ||
291 | 287 | ||
292 | #ifdef CONFIG_SUSPEND | 288 | #ifdef CONFIG_SUSPEND |
293 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | 289 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) |
294 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { | 290 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) |
295 | error = pm_suspend(state); | 291 | return state; |
296 | break; | ||
297 | } | ||
298 | } | ||
299 | #endif | 292 | #endif |
300 | 293 | ||
301 | Exit: | 294 | return PM_SUSPEND_ON; |
295 | } | ||
296 | |||
297 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
298 | const char *buf, size_t n) | ||
299 | { | ||
300 | suspend_state_t state; | ||
301 | int error; | ||
302 | |||
303 | error = pm_autosleep_lock(); | ||
304 | if (error) | ||
305 | return error; | ||
306 | |||
307 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
308 | error = -EBUSY; | ||
309 | goto out; | ||
310 | } | ||
311 | |||
312 | state = decode_state(buf, n); | ||
313 | if (state < PM_SUSPEND_MAX) | ||
314 | error = pm_suspend(state); | ||
315 | else if (state == PM_SUSPEND_MAX) | ||
316 | error = hibernate(); | ||
317 | else | ||
318 | error = -EINVAL; | ||
319 | |||
320 | out: | ||
321 | pm_autosleep_unlock(); | ||
302 | return error ? error : n; | 322 | return error ? error : n; |
303 | } | 323 | } |
304 | 324 | ||
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, | |||
339 | { | 359 | { |
340 | unsigned int val; | 360 | unsigned int val; |
341 | 361 | ||
342 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; | 362 | return pm_get_wakeup_count(&val, true) ? |
363 | sprintf(buf, "%u\n", val) : -EINTR; | ||
343 | } | 364 | } |
344 | 365 | ||
345 | static ssize_t wakeup_count_store(struct kobject *kobj, | 366 | static ssize_t wakeup_count_store(struct kobject *kobj, |
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, | |||
347 | const char *buf, size_t n) | 368 | const char *buf, size_t n) |
348 | { | 369 | { |
349 | unsigned int val; | 370 | unsigned int val; |
371 | int error; | ||
372 | |||
373 | error = pm_autosleep_lock(); | ||
374 | if (error) | ||
375 | return error; | ||
376 | |||
377 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
378 | error = -EBUSY; | ||
379 | goto out; | ||
380 | } | ||
350 | 381 | ||
382 | error = -EINVAL; | ||
351 | if (sscanf(buf, "%u", &val) == 1) { | 383 | if (sscanf(buf, "%u", &val) == 1) { |
352 | if (pm_save_wakeup_count(val)) | 384 | if (pm_save_wakeup_count(val)) |
353 | return n; | 385 | error = n; |
354 | } | 386 | } |
355 | return -EINVAL; | 387 | |
388 | out: | ||
389 | pm_autosleep_unlock(); | ||
390 | return error; | ||
356 | } | 391 | } |
357 | 392 | ||
358 | power_attr(wakeup_count); | 393 | power_attr(wakeup_count); |
394 | |||
395 | #ifdef CONFIG_PM_AUTOSLEEP | ||
396 | static ssize_t autosleep_show(struct kobject *kobj, | ||
397 | struct kobj_attribute *attr, | ||
398 | char *buf) | ||
399 | { | ||
400 | suspend_state_t state = pm_autosleep_state(); | ||
401 | |||
402 | if (state == PM_SUSPEND_ON) | ||
403 | return sprintf(buf, "off\n"); | ||
404 | |||
405 | #ifdef CONFIG_SUSPEND | ||
406 | if (state < PM_SUSPEND_MAX) | ||
407 | return sprintf(buf, "%s\n", valid_state(state) ? | ||
408 | pm_states[state] : "error"); | ||
409 | #endif | ||
410 | #ifdef CONFIG_HIBERNATION | ||
411 | return sprintf(buf, "disk\n"); | ||
412 | #else | ||
413 | return sprintf(buf, "error"); | ||
414 | #endif | ||
415 | } | ||
416 | |||
417 | static ssize_t autosleep_store(struct kobject *kobj, | ||
418 | struct kobj_attribute *attr, | ||
419 | const char *buf, size_t n) | ||
420 | { | ||
421 | suspend_state_t state = decode_state(buf, n); | ||
422 | int error; | ||
423 | |||
424 | if (state == PM_SUSPEND_ON | ||
425 | && strcmp(buf, "off") && strcmp(buf, "off\n")) | ||
426 | return -EINVAL; | ||
427 | |||
428 | error = pm_autosleep_set_state(state); | ||
429 | return error ? error : n; | ||
430 | } | ||
431 | |||
432 | power_attr(autosleep); | ||
433 | #endif /* CONFIG_PM_AUTOSLEEP */ | ||
434 | |||
435 | #ifdef CONFIG_PM_WAKELOCKS | ||
436 | static ssize_t wake_lock_show(struct kobject *kobj, | ||
437 | struct kobj_attribute *attr, | ||
438 | char *buf) | ||
439 | { | ||
440 | return pm_show_wakelocks(buf, true); | ||
441 | } | ||
442 | |||
443 | static ssize_t wake_lock_store(struct kobject *kobj, | ||
444 | struct kobj_attribute *attr, | ||
445 | const char *buf, size_t n) | ||
446 | { | ||
447 | int error = pm_wake_lock(buf); | ||
448 | return error ? error : n; | ||
449 | } | ||
450 | |||
451 | power_attr(wake_lock); | ||
452 | |||
453 | static ssize_t wake_unlock_show(struct kobject *kobj, | ||
454 | struct kobj_attribute *attr, | ||
455 | char *buf) | ||
456 | { | ||
457 | return pm_show_wakelocks(buf, false); | ||
458 | } | ||
459 | |||
460 | static ssize_t wake_unlock_store(struct kobject *kobj, | ||
461 | struct kobj_attribute *attr, | ||
462 | const char *buf, size_t n) | ||
463 | { | ||
464 | int error = pm_wake_unlock(buf); | ||
465 | return error ? error : n; | ||
466 | } | ||
467 | |||
468 | power_attr(wake_unlock); | ||
469 | |||
470 | #endif /* CONFIG_PM_WAKELOCKS */ | ||
359 | #endif /* CONFIG_PM_SLEEP */ | 471 | #endif /* CONFIG_PM_SLEEP */ |
360 | 472 | ||
361 | #ifdef CONFIG_PM_TRACE | 473 | #ifdef CONFIG_PM_TRACE |
@@ -409,6 +521,13 @@ static struct attribute * g[] = { | |||
409 | #ifdef CONFIG_PM_SLEEP | 521 | #ifdef CONFIG_PM_SLEEP |
410 | &pm_async_attr.attr, | 522 | &pm_async_attr.attr, |
411 | &wakeup_count_attr.attr, | 523 | &wakeup_count_attr.attr, |
524 | #ifdef CONFIG_PM_AUTOSLEEP | ||
525 | &autosleep_attr.attr, | ||
526 | #endif | ||
527 | #ifdef CONFIG_PM_WAKELOCKS | ||
528 | &wake_lock_attr.attr, | ||
529 | &wake_unlock_attr.attr, | ||
530 | #endif | ||
412 | #ifdef CONFIG_PM_DEBUG | 531 | #ifdef CONFIG_PM_DEBUG |
413 | &pm_test_attr.attr, | 532 | &pm_test_attr.attr, |
414 | #endif | 533 | #endif |
@@ -444,7 +563,10 @@ static int __init pm_init(void) | |||
444 | power_kobj = kobject_create_and_add("power", NULL); | 563 | power_kobj = kobject_create_and_add("power", NULL); |
445 | if (!power_kobj) | 564 | if (!power_kobj) |
446 | return -ENOMEM; | 565 | return -ENOMEM; |
447 | return sysfs_create_group(power_kobj, &attr_group); | 566 | error = sysfs_create_group(power_kobj, &attr_group); |
567 | if (error) | ||
568 | return error; | ||
569 | return pm_autosleep_init(); | ||
448 | } | 570 | } |
449 | 571 | ||
450 | core_initcall(pm_init); | 572 | core_initcall(pm_init); |
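
Alongside the new autosleep and wakelock attributes, the reworked state_store() and wakeup_count_store() above keep the established wakeup-count handshake for suspends requested directly from user space, and now refuse it with -EBUSY while autosleep is enabled. A rough sketch of that handshake, assuming only the /sys/power/wakeup_count and /sys/power/state behaviour shown in this patch (a real tool would loop on failure):

/* Illustrative wakeup-count handshake before a user-space initiated suspend. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char count[32];
	ssize_t n;
	int fd;

	/* 1. Fetch the current wakeup count (pm_get_wakeup_count(..., true)). */
	fd = open("/sys/power/wakeup_count", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, count, sizeof(count) - 1);
	close(fd);
	if (n <= 0)
		return 1;
	count[n] = '\0';

	/* 2. Write it back; pm_save_wakeup_count() rejects it if new events arrived. */
	fd = open("/sys/power/wakeup_count", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, count, n) < 0) {
		close(fd);
		return 1;	/* wakeup events happened in between; restart from step 1 */
	}
	close(fd);

	/* 3. Only then request the transition; -EBUSY means autosleep owns suspend. */
	fd = open("/sys/power/state", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "mem", 3) < 0)
		perror("suspend");
	close(fd);
	return 0;
}
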
diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d7407..b0bd4beaebfe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) | |||
264 | { | 264 | { |
265 | } | 265 | } |
266 | #endif | 266 | #endif |
267 | |||
268 | #ifdef CONFIG_PM_AUTOSLEEP | ||
269 | |||
270 | /* kernel/power/autosleep.c */ | ||
271 | extern int pm_autosleep_init(void); | ||
272 | extern int pm_autosleep_lock(void); | ||
273 | extern void pm_autosleep_unlock(void); | ||
274 | extern suspend_state_t pm_autosleep_state(void); | ||
275 | extern int pm_autosleep_set_state(suspend_state_t state); | ||
276 | |||
277 | #else /* !CONFIG_PM_AUTOSLEEP */ | ||
278 | |||
279 | static inline int pm_autosleep_init(void) { return 0; } | ||
280 | static inline int pm_autosleep_lock(void) { return 0; } | ||
281 | static inline void pm_autosleep_unlock(void) {} | ||
282 | static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } | ||
283 | |||
284 | #endif /* !CONFIG_PM_AUTOSLEEP */ | ||
285 | |||
286 | #ifdef CONFIG_PM_WAKELOCKS | ||
287 | |||
288 | /* kernel/power/wakelock.c */ | ||
289 | extern ssize_t pm_show_wakelocks(char *buf, bool show_active); | ||
290 | extern int pm_wake_lock(const char *buf); | ||
291 | extern int pm_wake_unlock(const char *buf); | ||
292 | |||
293 | #endif /* !CONFIG_PM_WAKELOCKS */ | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index eef311a58a64..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | 9 | * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> |
10 | * | 10 | * |
11 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
12 | * | 12 | * |
@@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
282 | return -ENOSPC; | 282 | return -ENOSPC; |
283 | 283 | ||
284 | if (bio_chain) { | 284 | if (bio_chain) { |
285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | |
286 | __GFP_NORETRY); | ||
286 | if (src) { | 287 | if (src) { |
287 | copy_page(src, buf); | 288 | copy_page(src, buf); |
288 | } else { | 289 | } else { |
289 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ | 290 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ |
290 | if (ret) | 291 | if (ret) |
291 | return ret; | 292 | return ret; |
292 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 293 | src = (void *)__get_free_page(__GFP_WAIT | |
294 | __GFP_NOWARN | | ||
295 | __GFP_NORETRY); | ||
293 | if (src) { | 296 | if (src) { |
294 | copy_page(src, buf); | 297 | copy_page(src, buf); |
295 | } else { | 298 | } else { |
@@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
367 | clear_page(handle->cur); | 370 | clear_page(handle->cur); |
368 | handle->cur_swap = offset; | 371 | handle->cur_swap = offset; |
369 | handle->k = 0; | 372 | handle->k = 0; |
370 | } | 373 | |
371 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { | 374 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { |
372 | error = hib_wait_on_bio_chain(bio_chain); | 375 | error = hib_wait_on_bio_chain(bio_chain); |
373 | if (error) | 376 | if (error) |
374 | goto out; | 377 | goto out; |
375 | handle->reqd_free_pages = reqd_free_pages(); | 378 | /* |
379 | * Recalculate the number of required free pages, to | ||
380 | * make sure we never take more than half. | ||
381 | */ | ||
382 | handle->reqd_free_pages = reqd_free_pages(); | ||
383 | } | ||
376 | } | 384 | } |
377 | out: | 385 | out: |
378 | return error; | 386 | return error; |
@@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
419 | /* Maximum number of threads for compression/decompression. */ | 427 | /* Maximum number of threads for compression/decompression. */ |
420 | #define LZO_THREADS 3 | 428 | #define LZO_THREADS 3 |
421 | 429 | ||
422 | /* Maximum number of pages for read buffering. */ | 430 | /* Minimum/maximum number of pages for read buffering. */ |
423 | #define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) | 431 | #define LZO_MIN_RD_PAGES 1024 |
432 | #define LZO_MAX_RD_PAGES 8192 | ||
424 | 433 | ||
425 | 434 | ||
426 | /** | 435 | /** |
@@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
631 | } | 640 | } |
632 | 641 | ||
633 | /* | 642 | /* |
634 | * Adjust number of free pages after all allocations have been done. | ||
635 | * We don't want to run out of pages when writing. | ||
636 | */ | ||
637 | handle->reqd_free_pages = reqd_free_pages(); | ||
638 | |||
639 | /* | ||
640 | * Start the CRC32 thread. | 643 | * Start the CRC32 thread. |
641 | */ | 644 | */ |
642 | init_waitqueue_head(&crc->go); | 645 | init_waitqueue_head(&crc->go); |
@@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
657 | goto out_clean; | 660 | goto out_clean; |
658 | } | 661 | } |
659 | 662 | ||
663 | /* | ||
664 | * Adjust the number of required free pages after all allocations have | ||
665 | * been done. We don't want to run out of pages when writing. | ||
666 | */ | ||
667 | handle->reqd_free_pages = reqd_free_pages(); | ||
668 | |||
660 | printk(KERN_INFO | 669 | printk(KERN_INFO |
661 | "PM: Using %u thread(s) for compression.\n" | 670 | "PM: Using %u thread(s) for compression.\n" |
662 | "PM: Compressing and saving image data (%u pages) ... ", | 671 | "PM: Compressing and saving image data (%u pages) ... ", |
@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1067 | unsigned i, thr, run_threads, nr_threads; | 1076 | unsigned i, thr, run_threads, nr_threads; |
1068 | unsigned ring = 0, pg = 0, ring_size = 0, | 1077 | unsigned ring = 0, pg = 0, ring_size = 0, |
1069 | have = 0, want, need, asked = 0; | 1078 | have = 0, want, need, asked = 0; |
1070 | unsigned long read_pages; | 1079 | unsigned long read_pages = 0; |
1071 | unsigned char **page = NULL; | 1080 | unsigned char **page = NULL; |
1072 | struct dec_data *data = NULL; | 1081 | struct dec_data *data = NULL; |
1073 | struct crc_data *crc = NULL; | 1082 | struct crc_data *crc = NULL; |
@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1079 | nr_threads = num_online_cpus() - 1; | 1088 | nr_threads = num_online_cpus() - 1; |
1080 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | 1089 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); |
1081 | 1090 | ||
1082 | page = vmalloc(sizeof(*page) * LZO_READ_PAGES); | 1091 | page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); |
1083 | if (!page) { | 1092 | if (!page) { |
1084 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 1093 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
1085 | ret = -ENOMEM; | 1094 | ret = -ENOMEM; |
@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1144 | } | 1153 | } |
1145 | 1154 | ||
1146 | /* | 1155 | /* |
1147 | * Adjust number of pages for read buffering, in case we are short. | 1156 | * Set the number of pages for read buffering. |
1157 | * This is complete guesswork, because we'll only know the real | ||
1158 | * picture once prepare_image() is called, which is much later on | ||
1159 | * during the image load phase. We'll assume the worst case and | ||
1160 | * say that none of the image pages are from high memory. | ||
1148 | */ | 1161 | */ |
1149 | read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; | 1162 | if (low_free_pages() > snapshot_get_image_size()) |
1150 | read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); | 1163 | read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; |
1164 | read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); | ||
1151 | 1165 | ||
1152 | for (i = 0; i < read_pages; i++) { | 1166 | for (i = 0; i < read_pages; i++) { |
1153 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | 1167 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? |
1154 | __GFP_WAIT | __GFP_HIGH : | 1168 | __GFP_WAIT | __GFP_HIGH : |
1155 | __GFP_WAIT); | 1169 | __GFP_WAIT | __GFP_NOWARN | |
1170 | __GFP_NORETRY); | ||
1171 | |||
1156 | if (!page[i]) { | 1172 | if (!page[i]) { |
1157 | if (i < LZO_CMP_PAGES) { | 1173 | if (i < LZO_CMP_PAGES) { |
1158 | ring_size = i; | 1174 | ring_size = i; |
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 000000000000..c8fba3380076 --- /dev/null +++ b/kernel/power/wakelock.c | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * kernel/power/wakelock.c | ||
3 | * | ||
4 | * User space wakeup sources support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | * | ||
8 | * This code is based on the analogous interface allowing user space to | ||
9 | * manipulate wakelocks on Android. | ||
10 | */ | ||
11 | |||
12 | #include <linux/ctype.h> | ||
13 | #include <linux/device.h> | ||
14 | #include <linux/err.h> | ||
15 | #include <linux/hrtimer.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/rbtree.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | static DEFINE_MUTEX(wakelocks_lock); | ||
21 | |||
22 | struct wakelock { | ||
23 | char *name; | ||
24 | struct rb_node node; | ||
25 | struct wakeup_source ws; | ||
26 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
27 | struct list_head lru; | ||
28 | #endif | ||
29 | }; | ||
30 | |||
31 | static struct rb_root wakelocks_tree = RB_ROOT; | ||
32 | |||
33 | ssize_t pm_show_wakelocks(char *buf, bool show_active) | ||
34 | { | ||
35 | struct rb_node *node; | ||
36 | struct wakelock *wl; | ||
37 | char *str = buf; | ||
38 | char *end = buf + PAGE_SIZE; | ||
39 | |||
40 | mutex_lock(&wakelocks_lock); | ||
41 | |||
42 | for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { | ||
43 | wl = rb_entry(node, struct wakelock, node); | ||
44 | if (wl->ws.active == show_active) | ||
45 | str += scnprintf(str, end - str, "%s ", wl->name); | ||
46 | } | ||
47 | if (str > buf) | ||
48 | str--; | ||
49 | |||
50 | str += scnprintf(str, end - str, "\n"); | ||
51 | |||
52 | mutex_unlock(&wakelocks_lock); | ||
53 | return (str - buf); | ||
54 | } | ||
55 | |||
56 | #if CONFIG_PM_WAKELOCKS_LIMIT > 0 | ||
57 | static unsigned int number_of_wakelocks; | ||
58 | |||
59 | static inline bool wakelocks_limit_exceeded(void) | ||
60 | { | ||
61 | return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; | ||
62 | } | ||
63 | |||
64 | static inline void increment_wakelocks_number(void) | ||
65 | { | ||
66 | number_of_wakelocks++; | ||
67 | } | ||
68 | |||
69 | static inline void decrement_wakelocks_number(void) | ||
70 | { | ||
71 | number_of_wakelocks--; | ||
72 | } | ||
73 | #else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ | ||
74 | static inline bool wakelocks_limit_exceeded(void) { return false; } | ||
75 | static inline void increment_wakelocks_number(void) {} | ||
76 | static inline void decrement_wakelocks_number(void) {} | ||
77 | #endif /* CONFIG_PM_WAKELOCKS_LIMIT */ | ||
78 | |||
79 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
80 | #define WL_GC_COUNT_MAX 100 | ||
81 | #define WL_GC_TIME_SEC 300 | ||
82 | |||
83 | static LIST_HEAD(wakelocks_lru_list); | ||
84 | static unsigned int wakelocks_gc_count; | ||
85 | |||
86 | static inline void wakelocks_lru_add(struct wakelock *wl) | ||
87 | { | ||
88 | list_add(&wl->lru, &wakelocks_lru_list); | ||
89 | } | ||
90 | |||
91 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) | ||
92 | { | ||
93 | list_move(&wl->lru, &wakelocks_lru_list); | ||
94 | } | ||
95 | |||
96 | static void wakelocks_gc(void) | ||
97 | { | ||
98 | struct wakelock *wl, *aux; | ||
99 | ktime_t now; | ||
100 | |||
101 | if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) | ||
102 | return; | ||
103 | |||
104 | now = ktime_get(); | ||
105 | list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { | ||
106 | u64 idle_time_ns; | ||
107 | bool active; | ||
108 | |||
109 | spin_lock_irq(&wl->ws.lock); | ||
110 | idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); | ||
111 | active = wl->ws.active; | ||
112 | spin_unlock_irq(&wl->ws.lock); | ||
113 | |||
114 | if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) | ||
115 | break; | ||
116 | |||
117 | if (!active) { | ||
118 | wakeup_source_remove(&wl->ws); | ||
119 | rb_erase(&wl->node, &wakelocks_tree); | ||
120 | list_del(&wl->lru); | ||
121 | kfree(wl->name); | ||
122 | kfree(wl); | ||
123 | decrement_wakelocks_number(); | ||
124 | } | ||
125 | } | ||
126 | wakelocks_gc_count = 0; | ||
127 | } | ||
128 | #else /* !CONFIG_PM_WAKELOCKS_GC */ | ||
129 | static inline void wakelocks_lru_add(struct wakelock *wl) {} | ||
130 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} | ||
131 | static inline void wakelocks_gc(void) {} | ||
132 | #endif /* !CONFIG_PM_WAKELOCKS_GC */ | ||
133 | |||
134 | static struct wakelock *wakelock_lookup_add(const char *name, size_t len, | ||
135 | bool add_if_not_found) | ||
136 | { | ||
137 | struct rb_node **node = &wakelocks_tree.rb_node; | ||
138 | struct rb_node *parent = *node; | ||
139 | struct wakelock *wl; | ||
140 | |||
141 | while (*node) { | ||
142 | int diff; | ||
143 | |||
144 | parent = *node; | ||
145 | wl = rb_entry(*node, struct wakelock, node); | ||
146 | diff = strncmp(name, wl->name, len); | ||
147 | if (diff == 0) { | ||
148 | if (wl->name[len]) | ||
149 | diff = -1; | ||
150 | else | ||
151 | return wl; | ||
152 | } | ||
153 | if (diff < 0) | ||
154 | node = &(*node)->rb_left; | ||
155 | else | ||
156 | node = &(*node)->rb_right; | ||
157 | } | ||
158 | if (!add_if_not_found) | ||
159 | return ERR_PTR(-EINVAL); | ||
160 | |||
161 | if (wakelocks_limit_exceeded()) | ||
162 | return ERR_PTR(-ENOSPC); | ||
163 | |||
164 | /* Not found, we have to add a new one. */ | ||
165 | wl = kzalloc(sizeof(*wl), GFP_KERNEL); | ||
166 | if (!wl) | ||
167 | return ERR_PTR(-ENOMEM); | ||
168 | |||
169 | wl->name = kstrndup(name, len, GFP_KERNEL); | ||
170 | if (!wl->name) { | ||
171 | kfree(wl); | ||
172 | return ERR_PTR(-ENOMEM); | ||
173 | } | ||
174 | wl->ws.name = wl->name; | ||
175 | wakeup_source_add(&wl->ws); | ||
176 | rb_link_node(&wl->node, parent, node); | ||
177 | rb_insert_color(&wl->node, &wakelocks_tree); | ||
178 | wakelocks_lru_add(wl); | ||
179 | increment_wakelocks_number(); | ||
180 | return wl; | ||
181 | } | ||
182 | |||
183 | int pm_wake_lock(const char *buf) | ||
184 | { | ||
185 | const char *str = buf; | ||
186 | struct wakelock *wl; | ||
187 | u64 timeout_ns = 0; | ||
188 | size_t len; | ||
189 | int ret = 0; | ||
190 | |||
191 | while (*str && !isspace(*str)) | ||
192 | str++; | ||
193 | |||
194 | len = str - buf; | ||
195 | if (!len) | ||
196 | return -EINVAL; | ||
197 | |||
198 | if (*str && *str != '\n') { | ||
199 | /* Find out if there's a valid timeout string appended. */ | ||
200 | ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); | ||
201 | if (ret) | ||
202 | return -EINVAL; | ||
203 | } | ||
204 | |||
205 | mutex_lock(&wakelocks_lock); | ||
206 | |||
207 | wl = wakelock_lookup_add(buf, len, true); | ||
208 | if (IS_ERR(wl)) { | ||
209 | ret = PTR_ERR(wl); | ||
210 | goto out; | ||
211 | } | ||
212 | if (timeout_ns) { | ||
213 | u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; | ||
214 | |||
215 | do_div(timeout_ms, NSEC_PER_MSEC); | ||
216 | __pm_wakeup_event(&wl->ws, timeout_ms); | ||
217 | } else { | ||
218 | __pm_stay_awake(&wl->ws); | ||
219 | } | ||
220 | |||
221 | wakelocks_lru_most_recent(wl); | ||
222 | |||
223 | out: | ||
224 | mutex_unlock(&wakelocks_lock); | ||
225 | return ret; | ||
226 | } | ||
227 | |||
228 | int pm_wake_unlock(const char *buf) | ||
229 | { | ||
230 | struct wakelock *wl; | ||
231 | size_t len; | ||
232 | int ret = 0; | ||
233 | |||
234 | len = strlen(buf); | ||
235 | if (!len) | ||
236 | return -EINVAL; | ||
237 | |||
238 | if (buf[len-1] == '\n') | ||
239 | len--; | ||
240 | |||
241 | if (!len) | ||
242 | return -EINVAL; | ||
243 | |||
244 | mutex_lock(&wakelocks_lock); | ||
245 | |||
246 | wl = wakelock_lookup_add(buf, len, false); | ||
247 | if (IS_ERR(wl)) { | ||
248 | ret = PTR_ERR(wl); | ||
249 | goto out; | ||
250 | } | ||
251 | __pm_relax(&wl->ws); | ||
252 | |||
253 | wakelocks_lru_most_recent(wl); | ||
254 | wakelocks_gc(); | ||
255 | |||
256 | out: | ||
257 | mutex_unlock(&wakelocks_lock); | ||
258 | return ret; | ||
259 | } | ||
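
pm_wake_lock() parses a wakelock name optionally followed by a timeout in nanoseconds, and pm_wake_unlock() takes just the name; both are reached through the /sys/power/wake_lock and /sys/power/wake_unlock attributes added in kernel/power/main.c earlier in this series. A user-space sketch of the Android-style usage; the lock name, the timeout value and the write_str() helper are made up for illustration:

/* Hypothetical example of holding and releasing a user-space wakeup source. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Acquire "mylock" with a 5 s (5,000,000,000 ns) auto-expiring timeout. */
	if (write_str("/sys/power/wake_lock", "mylock 5000000000"))
		perror("wake_lock");

	/* ... work that must not race with an autosleep transition ... */

	/* Release early; names never locked before get -EINVAL from pm_wake_unlock(). */
	if (write_str("/sys/power/wake_unlock", "mylock"))
		perror("wake_unlock");
	return 0;
}

Timed-out or released wakelocks that stay inactive long enough are eventually reclaimed by wakelocks_gc() when CONFIG_PM_WAKELOCKS_GC is set.
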
diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d39..32462d2b364a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/rculist.h> | 43 | #include <linux/rculist.h> |
44 | #include <linux/poll.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | 47 | ||
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
54 | { | 55 | { |
55 | } | 56 | } |
56 | 57 | ||
57 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
58 | |||
59 | /* printk's without a loglevel use this.. */ | 58 | /* printk's without a loglevel use this.. */ |
60 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 59 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
61 | 60 | ||
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); | |||
99 | static int console_locked, console_suspended; | 98 | static int console_locked, console_suspended; |
100 | 99 | ||
101 | /* | 100 | /* |
102 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | ||
103 | * It is also used in interesting ways to provide interlocking in | ||
104 | * console_unlock();. | ||
105 | */ | ||
106 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
107 | |||
108 | #define LOG_BUF_MASK (log_buf_len-1) | ||
109 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | ||
110 | |||
111 | /* | ||
112 | * The indices into log_buf are not constrained to log_buf_len - they | ||
113 | * must be masked before subscripting | ||
114 | */ | ||
115 | static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ | ||
116 | static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ | ||
117 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | ||
118 | |||
119 | /* | ||
120 | * If exclusive_console is non-NULL then only this console is to be printed to. | 101 | * If exclusive_console is non-NULL then only this console is to be printed to. |
121 | */ | 102 | */ |
122 | static struct console *exclusive_console; | 103 | static struct console *exclusive_console; |
@@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline); | |||
145 | /* Flag: console code may call schedule() */ | 126 | /* Flag: console code may call schedule() */ |
146 | static int console_may_schedule; | 127 | static int console_may_schedule; |
147 | 128 | ||
129 | /* | ||
130 | * The printk log buffer consists of a chain of concatenated variable | ||
131 | * length records. Every record starts with a record header, containing | ||
132 | * the overall length of the record. | ||
133 | * | ||
134 | * The heads to the first and last entry in the buffer, as well as the | ||
135 | * sequence numbers of these both entries are maintained when messages | ||
136 | * are stored.. | ||
137 | * | ||
138 | * If the heads indicate available messages, the length in the header | ||
139 | * tells the start next message. A length == 0 for the next message | ||
140 | * indicates a wrap-around to the beginning of the buffer. | ||
141 | * | ||
142 | * Every record carries the monotonic timestamp in microseconds, as well as | ||
143 | * the standard userspace syslog level and syslog facility. The usual | ||
144 | * kernel messages use LOG_KERN; userspace-injected messages always carry | ||
145 | * a matching syslog facility, by default LOG_USER. The origin of every | ||
146 | * message can be reliably determined that way. | ||
147 | * | ||
148 | * The human-readable log message directly follows the message header. The | ||
149 | * length of the message text is stored in the header; the stored message | ||
150 | * is not terminated. | ||
151 | * | ||
152 | * Optionally, a message can carry a dictionary of properties (key/value pairs), | ||
153 | * to provide userspace with a machine-readable message context. | ||
154 | * | ||
155 | * Examples for well-defined, commonly used property names are: | ||
156 | * DEVICE=b12:8 device identifier | ||
157 | * b12:8 block dev_t | ||
158 | * c127:3 char dev_t | ||
159 | * n8 netdev ifindex | ||
160 | * +sound:card0 subsystem:devname | ||
161 | * SUBSYSTEM=pci driver-core subsystem name | ||
162 | * | ||
163 | * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value | ||
164 | * follows directly after a '=' character. Every property is terminated by | ||
165 | * a '\0' character. The last property is not terminated. | ||
166 | * | ||
167 | * Example of a message structure: | ||
168 | * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec | ||
169 | * 0008 34 00 record is 52 bytes long | ||
170 | * 000a 0b 00 text is 11 bytes long | ||
171 | * 000c 1f 00 dictionary is 23 bytes long | ||
172 | * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) | ||
173 | * 0010 69 74 27 73 20 61 20 6c "it's a l" | ||
174 | * 69 6e 65 "ine" | ||
175 | * 001b 44 45 56 49 43 "DEVIC" | ||
176 | * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" | ||
177 | * 52 49 56 45 52 3d 62 75 "RIVER=bu" | ||
178 | * 67 "g" | ||
179 | * 0032 00 00 00 padding to next message header | ||
180 | * | ||
181 | * The 'struct log' buffer header must never be directly exported to | ||
182 | * userspace, it is a kernel-private implementation detail that might | ||
183 | * need to be changed in the future, when the requirements change. | ||
184 | * | ||
185 | * /dev/kmsg exports the structured data in the following line format: | ||
186 | * "level,sequnum,timestamp;<message text>\n" | ||
187 | * | ||
188 | * The optional key/value pairs are attached as continuation lines starting | ||
189 | * with a space character and terminated by a newline. All possible | ||
190 | * non-printable characters are escaped in the "\xff" notation. | ||
191 | * | ||
192 | * Users of the export format should ignore possible additional values | ||
193 | * separated by ',', and find the message after the ';' character. | ||
194 | */ | ||
195 | |||
196 | struct log { | ||
197 | u64 ts_nsec; /* timestamp in nanoseconds */ | ||
198 | u16 len; /* length of entire record */ | ||
199 | u16 text_len; /* length of text buffer */ | ||
200 | u16 dict_len; /* length of dictionary buffer */ | ||
201 | u16 level; /* syslog level + facility */ | ||
202 | }; | ||
203 | |||
204 | /* | ||
205 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | ||
206 | * used in interesting ways to provide interlocking in console_unlock(); | ||
207 | */ | ||
208 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
209 | |||
210 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | ||
211 | static u64 syslog_seq; | ||
212 | static u32 syslog_idx; | ||
213 | |||
214 | /* index and sequence number of the first record stored in the buffer */ | ||
215 | static u64 log_first_seq; | ||
216 | static u32 log_first_idx; | ||
217 | |||
218 | /* index and sequence number of the next record to store in the buffer */ | ||
219 | static u64 log_next_seq; | ||
148 | #ifdef CONFIG_PRINTK | 220 | #ifdef CONFIG_PRINTK |
221 | static u32 log_next_idx; | ||
222 | |||
223 | /* the next printk record to read after the last 'clear' command */ | ||
224 | static u64 clear_seq; | ||
225 | static u32 clear_idx; | ||
226 | |||
227 | #define LOG_LINE_MAX 1024 | ||
149 | 228 | ||
150 | static char __log_buf[__LOG_BUF_LEN]; | 229 | /* record buffer */ |
230 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
231 | #define LOG_ALIGN 4 | ||
232 | #else | ||
233 | #define LOG_ALIGN 8 | ||
234 | #endif | ||
235 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
236 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | ||
151 | static char *log_buf = __log_buf; | 237 | static char *log_buf = __log_buf; |
152 | static int log_buf_len = __LOG_BUF_LEN; | 238 | static u32 log_buf_len = __LOG_BUF_LEN; |
153 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 239 | |
154 | static int saved_console_loglevel = -1; | 240 | /* cpu currently holding logbuf_lock */ |
241 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
242 | |||
243 | /* human readable text of the record */ | ||
244 | static char *log_text(const struct log *msg) | ||
245 | { | ||
246 | return (char *)msg + sizeof(struct log); | ||
247 | } | ||
248 | |||
249 | /* optional key/value pair dictionary attached to the record */ | ||
250 | static char *log_dict(const struct log *msg) | ||
251 | { | ||
252 | return (char *)msg + sizeof(struct log) + msg->text_len; | ||
253 | } | ||
254 | |||
255 | /* get record by index; idx must point to valid msg */ | ||
256 | static struct log *log_from_idx(u32 idx) | ||
257 | { | ||
258 | struct log *msg = (struct log *)(log_buf + idx); | ||
259 | |||
260 | /* | ||
261 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
262 | * read the message at the start of the buffer. | ||
263 | */ | ||
264 | if (!msg->len) | ||
265 | return (struct log *)log_buf; | ||
266 | return msg; | ||
267 | } | ||
268 | |||
269 | /* get next record; idx must point to valid msg */ | ||
270 | static u32 log_next(u32 idx) | ||
271 | { | ||
272 | struct log *msg = (struct log *)(log_buf + idx); | ||
273 | |||
274 | /* length == 0 indicates the end of the buffer; wrap */ | ||
275 | /* | ||
276 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
277 | * read the message at the start of the buffer as *this* one, and | ||
278 | * return the one after that. | ||
279 | */ | ||
280 | if (!msg->len) { | ||
281 | msg = (struct log *)log_buf; | ||
282 | return msg->len; | ||
283 | } | ||
284 | return idx + msg->len; | ||
285 | } | ||
286 | |||
287 | /* insert record into the buffer, discard old ones, update heads */ | ||
288 | static void log_store(int facility, int level, | ||
289 | const char *dict, u16 dict_len, | ||
290 | const char *text, u16 text_len) | ||
291 | { | ||
292 | struct log *msg; | ||
293 | u32 size, pad_len; | ||
294 | |||
295 | /* number of '\0' padding bytes to next message */ | ||
296 | size = sizeof(struct log) + text_len + dict_len; | ||
297 | pad_len = (-size) & (LOG_ALIGN - 1); | ||
298 | size += pad_len; | ||
299 | |||
300 | while (log_first_seq < log_next_seq) { | ||
301 | u32 free; | ||
302 | |||
303 | if (log_next_idx > log_first_idx) | ||
304 | free = max(log_buf_len - log_next_idx, log_first_idx); | ||
305 | else | ||
306 | free = log_first_idx - log_next_idx; | ||
307 | |||
308 | if (free > size + sizeof(struct log)) | ||
309 | break; | ||
310 | |||
311 | /* drop old messages until we have enough contiguous space */ | ||
312 | log_first_idx = log_next(log_first_idx); | ||
313 | log_first_seq++; | ||
314 | } | ||
315 | |||
316 | if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { | ||
317 | /* | ||
318 | * This message + an additional empty header does not fit | ||
319 | * at the end of the buffer. Add an empty header with len == 0 | ||
320 | * to signify a wrap around. | ||
321 | */ | ||
322 | memset(log_buf + log_next_idx, 0, sizeof(struct log)); | ||
323 | log_next_idx = 0; | ||
324 | } | ||
325 | |||
326 | /* fill message */ | ||
327 | msg = (struct log *)(log_buf + log_next_idx); | ||
328 | memcpy(log_text(msg), text, text_len); | ||
329 | msg->text_len = text_len; | ||
330 | memcpy(log_dict(msg), dict, dict_len); | ||
331 | msg->dict_len = dict_len; | ||
332 | msg->level = (facility << 3) | (level & 7); | ||
333 | msg->ts_nsec = local_clock(); | ||
334 | memset(log_dict(msg) + dict_len, 0, pad_len); | ||
335 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | ||
336 | |||
337 | /* insert message */ | ||
338 | log_next_idx += msg->len; | ||
339 | log_next_seq++; | ||
340 | } | ||
341 | |||
342 | /* /dev/kmsg - userspace message inject/listen interface */ | ||
343 | struct devkmsg_user { | ||
344 | u64 seq; | ||
345 | u32 idx; | ||
346 | struct mutex lock; | ||
347 | char buf[8192]; | ||
348 | }; | ||
349 | |||
350 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | ||
351 | unsigned long count, loff_t pos) | ||
352 | { | ||
353 | char *buf, *line; | ||
354 | int i; | ||
355 | int level = default_message_loglevel; | ||
356 | int facility = 1; /* LOG_USER */ | ||
357 | size_t len = iov_length(iv, count); | ||
358 | ssize_t ret = len; | ||
359 | |||
360 | if (len > LOG_LINE_MAX) | ||
361 | return -EINVAL; | ||
362 | buf = kmalloc(len+1, GFP_KERNEL); | ||
363 | if (buf == NULL) | ||
364 | return -ENOMEM; | ||
365 | |||
366 | line = buf; | ||
367 | for (i = 0; i < count; i++) { | ||
368 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) | ||
369 | goto out; | ||
370 | line += iv[i].iov_len; | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace | ||
375 | * the decimal value represents 32bit, the lower 3 bit are the log | ||
376 | * level, the rest are the log facility. | ||
377 | * | ||
378 | * If no prefix or no userspace facility is specified, we | ||
379 | * enforce LOG_USER, to be able to reliably distinguish | ||
380 | * kernel-generated messages from userspace-injected ones. | ||
381 | */ | ||
382 | line = buf; | ||
383 | if (line[0] == '<') { | ||
384 | char *endp = NULL; | ||
385 | |||
386 | i = simple_strtoul(line+1, &endp, 10); | ||
387 | if (endp && endp[0] == '>') { | ||
388 | level = i & 7; | ||
389 | if (i >> 3) | ||
390 | facility = i >> 3; | ||
391 | endp++; | ||
392 | len -= endp - line; | ||
393 | line = endp; | ||
394 | } | ||
395 | } | ||
396 | line[len] = '\0'; | ||
397 | |||
398 | printk_emit(facility, level, NULL, 0, "%s", line); | ||
399 | out: | ||
400 | kfree(buf); | ||
401 | return ret; | ||
402 | } | ||
403 | |||
404 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | ||
405 | size_t count, loff_t *ppos) | ||
406 | { | ||
407 | struct devkmsg_user *user = file->private_data; | ||
408 | struct log *msg; | ||
409 | u64 ts_usec; | ||
410 | size_t i; | ||
411 | size_t len; | ||
412 | ssize_t ret; | ||
413 | |||
414 | if (!user) | ||
415 | return -EBADF; | ||
416 | |||
417 | mutex_lock(&user->lock); | ||
418 | raw_spin_lock(&logbuf_lock); | ||
419 | while (user->seq == log_next_seq) { | ||
420 | if (file->f_flags & O_NONBLOCK) { | ||
421 | ret = -EAGAIN; | ||
422 | raw_spin_unlock(&logbuf_lock); | ||
423 | goto out; | ||
424 | } | ||
425 | |||
426 | raw_spin_unlock(&logbuf_lock); | ||
427 | ret = wait_event_interruptible(log_wait, | ||
428 | user->seq != log_next_seq); | ||
429 | if (ret) | ||
430 | goto out; | ||
431 | raw_spin_lock(&logbuf_lock); | ||
432 | } | ||
433 | |||
434 | if (user->seq < log_first_seq) { | ||
435 | /* our last seen message is gone, return error and reset */ | ||
436 | user->idx = log_first_idx; | ||
437 | user->seq = log_first_seq; | ||
438 | ret = -EPIPE; | ||
439 | raw_spin_unlock(&logbuf_lock); | ||
440 | goto out; | ||
441 | } | ||
442 | |||
443 | msg = log_from_idx(user->idx); | ||
444 | ts_usec = msg->ts_nsec; | ||
445 | do_div(ts_usec, 1000); | ||
446 | len = sprintf(user->buf, "%u,%llu,%llu;", | ||
447 | msg->level, user->seq, ts_usec); | ||
448 | |||
449 | /* escape non-printable characters */ | ||
450 | for (i = 0; i < msg->text_len; i++) { | ||
451 | unsigned char c = log_text(msg)[i]; | ||
452 | |||
453 | if (c < ' ' || c >= 128) | ||
454 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
455 | else | ||
456 | user->buf[len++] = c; | ||
457 | } | ||
458 | user->buf[len++] = '\n'; | ||
459 | |||
460 | if (msg->dict_len) { | ||
461 | bool line = true; | ||
462 | |||
463 | for (i = 0; i < msg->dict_len; i++) { | ||
464 | unsigned char c = log_dict(msg)[i]; | ||
465 | |||
466 | if (line) { | ||
467 | user->buf[len++] = ' '; | ||
468 | line = false; | ||
469 | } | ||
470 | |||
471 | if (c == '\0') { | ||
472 | user->buf[len++] = '\n'; | ||
473 | line = true; | ||
474 | continue; | ||
475 | } | ||
476 | |||
477 | if (c < ' ' || c >= 128) { | ||
478 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
479 | continue; | ||
480 | } | ||
481 | |||
482 | user->buf[len++] = c; | ||
483 | } | ||
484 | user->buf[len++] = '\n'; | ||
485 | } | ||
486 | |||
487 | user->idx = log_next(user->idx); | ||
488 | user->seq++; | ||
489 | raw_spin_unlock(&logbuf_lock); | ||
490 | |||
491 | if (len > count) { | ||
492 | ret = -EINVAL; | ||
493 | goto out; | ||
494 | } | ||
495 | |||
496 | if (copy_to_user(buf, user->buf, len)) { | ||
497 | ret = -EFAULT; | ||
498 | goto out; | ||
499 | } | ||
500 | ret = len; | ||
501 | out: | ||
502 | mutex_unlock(&user->lock); | ||
503 | return ret; | ||
504 | } | ||
505 | |||
506 | static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | ||
507 | { | ||
508 | struct devkmsg_user *user = file->private_data; | ||
509 | loff_t ret = 0; | ||
510 | |||
511 | if (!user) | ||
512 | return -EBADF; | ||
513 | if (offset) | ||
514 | return -ESPIPE; | ||
515 | |||
516 | raw_spin_lock(&logbuf_lock); | ||
517 | switch (whence) { | ||
518 | case SEEK_SET: | ||
519 | /* the first record */ | ||
520 | user->idx = log_first_idx; | ||
521 | user->seq = log_first_seq; | ||
522 | break; | ||
523 | case SEEK_DATA: | ||
524 | /* | ||
525 | * The first record after the last SYSLOG_ACTION_CLEAR, | ||
526 | * like issued by 'dmesg -c'. Reading /dev/kmsg itself | ||
527 | * changes no global state, and does not clear anything. | ||
528 | */ | ||
529 | user->idx = clear_idx; | ||
530 | user->seq = clear_seq; | ||
531 | break; | ||
532 | case SEEK_END: | ||
533 | /* after the last record */ | ||
534 | user->idx = log_next_idx; | ||
535 | user->seq = log_next_seq; | ||
536 | break; | ||
537 | default: | ||
538 | ret = -EINVAL; | ||
539 | } | ||
540 | raw_spin_unlock(&logbuf_lock); | ||
541 | return ret; | ||
542 | } | ||
543 | |||
544 | static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | ||
545 | { | ||
546 | struct devkmsg_user *user = file->private_data; | ||
547 | int ret = 0; | ||
548 | |||
549 | if (!user) | ||
550 | return POLLERR|POLLNVAL; | ||
551 | |||
552 | poll_wait(file, &log_wait, wait); | ||
553 | |||
554 | raw_spin_lock(&logbuf_lock); | ||
555 | if (user->seq < log_next_seq) { | ||
556 | /* return error when data has vanished underneath us */ | ||
557 | if (user->seq < log_first_seq) | ||
558 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | ||
559 | else ret = POLLIN|POLLRDNORM; | ||
560 | } | ||
561 | raw_spin_unlock(&logbuf_lock); | ||
562 | |||
563 | return ret; | ||
564 | } | ||
565 | |||
566 | static int devkmsg_open(struct inode *inode, struct file *file) | ||
567 | { | ||
568 | struct devkmsg_user *user; | ||
569 | int err; | ||
570 | |||
571 | /* write-only does not need any file context */ | ||
572 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) | ||
573 | return 0; | ||
574 | |||
575 | err = security_syslog(SYSLOG_ACTION_READ_ALL); | ||
576 | if (err) | ||
577 | return err; | ||
578 | |||
579 | user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); | ||
580 | if (!user) | ||
581 | return -ENOMEM; | ||
582 | |||
583 | mutex_init(&user->lock); | ||
584 | |||
585 | raw_spin_lock(&logbuf_lock); | ||
586 | user->idx = log_first_idx; | ||
587 | user->seq = log_first_seq; | ||
588 | raw_spin_unlock(&logbuf_lock); | ||
589 | |||
590 | file->private_data = user; | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | static int devkmsg_release(struct inode *inode, struct file *file) | ||
595 | { | ||
596 | struct devkmsg_user *user = file->private_data; | ||
597 | |||
598 | if (!user) | ||
599 | return 0; | ||
600 | |||
601 | mutex_destroy(&user->lock); | ||
602 | kfree(user); | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | const struct file_operations kmsg_fops = { | ||
607 | .open = devkmsg_open, | ||
608 | .read = devkmsg_read, | ||
609 | .aio_write = devkmsg_writev, | ||
610 | .llseek = devkmsg_llseek, | ||
611 | .poll = devkmsg_poll, | ||
612 | .release = devkmsg_release, | ||
613 | }; | ||
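
Each read() on /dev/kmsg returns exactly one record in the "level,seqnum,timestamp;<message text>\n" format described in the comment block above, with optional key/value continuation lines prefixed by a space. A rough reader sketch, with parsing and error handling kept to a minimum; the EAGAIN/EPIPE handling is inferred from devkmsg_read() above:

/* Minimal /dev/kmsg reader; field layout as documented in the patch above. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char rec[8192];
	int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	for (;;) {
		unsigned int prio;
		unsigned long long seq, ts_usec;
		char *text;
		ssize_t n;

		n = read(fd, rec, sizeof(rec) - 1);
		if (n < 0)
			break;	/* EAGAIN: caught up; a real reader would retry on EPIPE */
		rec[n] = '\0';
		text = strchr(rec, ';');
		/* Ignore any extra comma-separated fields, as the format description advises. */
		if (sscanf(rec, "%u,%llu,%llu", &prio, &seq, &ts_usec) == 3 && text)
			printf("seq=%llu ts_usec=%llu prio=%u %s", seq, ts_usec, prio, text + 1);
	}
	close(fd);
	return 0;
}
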
155 | 614 | ||
156 | #ifdef CONFIG_KEXEC | 615 | #ifdef CONFIG_KEXEC |
157 | /* | 616 | /* |
@@ -165,9 +624,9 @@ static int saved_console_loglevel = -1; | |||
165 | void log_buf_kexec_setup(void) | 624 | void log_buf_kexec_setup(void) |
166 | { | 625 | { |
167 | VMCOREINFO_SYMBOL(log_buf); | 626 | VMCOREINFO_SYMBOL(log_buf); |
168 | VMCOREINFO_SYMBOL(log_end); | ||
169 | VMCOREINFO_SYMBOL(log_buf_len); | 627 | VMCOREINFO_SYMBOL(log_buf_len); |
170 | VMCOREINFO_SYMBOL(logged_chars); | 628 | VMCOREINFO_SYMBOL(log_first_idx); |
629 | VMCOREINFO_SYMBOL(log_next_idx); | ||
171 | } | 630 | } |
172 | #endif | 631 | #endif |
173 | 632 | ||
@@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup); | |||
191 | void __init setup_log_buf(int early) | 650 | void __init setup_log_buf(int early) |
192 | { | 651 | { |
193 | unsigned long flags; | 652 | unsigned long flags; |
194 | unsigned start, dest_idx, offset; | ||
195 | char *new_log_buf; | 653 | char *new_log_buf; |
196 | int free; | 654 | int free; |
197 | 655 | ||
@@ -219,20 +677,8 @@ void __init setup_log_buf(int early) | |||
219 | log_buf_len = new_log_buf_len; | 677 | log_buf_len = new_log_buf_len; |
220 | log_buf = new_log_buf; | 678 | log_buf = new_log_buf; |
221 | new_log_buf_len = 0; | 679 | new_log_buf_len = 0; |
222 | free = __LOG_BUF_LEN - log_end; | 680 | free = __LOG_BUF_LEN - log_next_idx; |
223 | 681 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | |
224 | offset = start = min(con_start, log_start); | ||
225 | dest_idx = 0; | ||
226 | while (start != log_end) { | ||
227 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
228 | |||
229 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
230 | start++; | ||
231 | dest_idx++; | ||
232 | } | ||
233 | log_start -= offset; | ||
234 | con_start -= offset; | ||
235 | log_end -= offset; | ||
236 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 682 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
237 | 683 | ||
238 | pr_info("log_buf_len: %d\n", log_buf_len); | 684 | pr_info("log_buf_len: %d\n", log_buf_len); |
@@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file) | |||
332 | return 0; | 778 | return 0; |
333 | } | 779 | } |
334 | 780 | ||
781 | #if defined(CONFIG_PRINTK_TIME) | ||
782 | static bool printk_time = 1; | ||
783 | #else | ||
784 | static bool printk_time; | ||
785 | #endif | ||
786 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
787 | |||
788 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | ||
789 | { | ||
790 | size_t len = 0; | ||
791 | |||
792 | if (syslog) { | ||
793 | if (buf) { | ||
794 | len += sprintf(buf, "<%u>", msg->level); | ||
795 | } else { | ||
796 | len += 3; | ||
797 | if (msg->level > 9) | ||
798 | len++; | ||
799 | if (msg->level > 99) | ||
800 | len++; | ||
801 | } | ||
802 | } | ||
803 | |||
804 | if (printk_time) { | ||
805 | if (buf) { | ||
806 | unsigned long long ts = msg->ts_nsec; | ||
807 | unsigned long rem_nsec = do_div(ts, 1000000000); | ||
808 | |||
809 | len += sprintf(buf + len, "[%5lu.%06lu] ", | ||
810 | (unsigned long) ts, rem_nsec / 1000); | ||
811 | } else { | ||
812 | len += 15; | ||
813 | } | ||
814 | } | ||
815 | |||
816 | return len; | ||
817 | } | ||
818 | |||
819 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
820 | char *buf, size_t size) | ||
821 | { | ||
822 | const char *text = log_text(msg); | ||
823 | size_t text_size = msg->text_len; | ||
824 | size_t len = 0; | ||
825 | |||
826 | do { | ||
827 | const char *next = memchr(text, '\n', text_size); | ||
828 | size_t text_len; | ||
829 | |||
830 | if (next) { | ||
831 | text_len = next - text; | ||
832 | next++; | ||
833 | text_size -= next - text; | ||
834 | } else { | ||
835 | text_len = text_size; | ||
836 | } | ||
837 | |||
838 | if (buf) { | ||
839 | if (print_prefix(msg, syslog, NULL) + | ||
840 | text_len + 1 >= size - len) | ||
841 | break; | ||
842 | |||
843 | len += print_prefix(msg, syslog, buf + len); | ||
844 | memcpy(buf + len, text, text_len); | ||
845 | len += text_len; | ||
846 | buf[len++] = '\n'; | ||
847 | } else { | ||
848 | /* SYSLOG_ACTION_* buffer size only calculation */ | ||
849 | len += print_prefix(msg, syslog, NULL); | ||
850 | len += text_len + 1; | ||
851 | } | ||
852 | |||
853 | text = next; | ||
854 | } while (text); | ||
855 | |||
856 | return len; | ||
857 | } | ||
858 | |||
859 | static int syslog_print(char __user *buf, int size) | ||
860 | { | ||
861 | char *text; | ||
862 | struct log *msg; | ||
863 | int len; | ||
864 | |||
865 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
866 | if (!text) | ||
867 | return -ENOMEM; | ||
868 | |||
869 | raw_spin_lock_irq(&logbuf_lock); | ||
870 | if (syslog_seq < log_first_seq) { | ||
871 | /* messages are gone, move to first one */ | ||
872 | syslog_seq = log_first_seq; | ||
873 | syslog_idx = log_first_idx; | ||
874 | } | ||
875 | msg = log_from_idx(syslog_idx); | ||
876 | len = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
877 | syslog_idx = log_next(syslog_idx); | ||
878 | syslog_seq++; | ||
879 | raw_spin_unlock_irq(&logbuf_lock); | ||
880 | |||
881 | if (len > 0 && copy_to_user(buf, text, len)) | ||
882 | len = -EFAULT; | ||
883 | |||
884 | kfree(text); | ||
885 | return len; | ||
886 | } | ||
887 | |||
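syslog_print() hands out a record per call and, if the reader's (syslog_seq, syslog_idx) cursor points at records that have already been overwritten, silently resynchronizes to the oldest surviving one; the copy_to_user() happens after the spinlock is dropped. From userspace this path is reached through syslog(2); a minimal consumer using the glibc wrapper (needs CAP_SYSLOG, or dmesg_restrict=0):

#include <stdio.h>
#include <sys/klog.h>   /* glibc wrapper for syslog(2) */

int main(void)
{
    char buf[8192];
    /* 2 == SYSLOG_ACTION_READ: block until records are available,
     * then copy them out. */
    int n = klogctl(2, buf, sizeof(buf));

    if (n > 0)
        fwrite(buf, 1, n, stdout);
    return n < 0;
}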
888 | static int syslog_print_all(char __user *buf, int size, bool clear) | ||
889 | { | ||
890 | char *text; | ||
891 | int len = 0; | ||
892 | |||
893 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
894 | if (!text) | ||
895 | return -ENOMEM; | ||
896 | |||
897 | raw_spin_lock_irq(&logbuf_lock); | ||
898 | if (buf) { | ||
899 | u64 next_seq; | ||
900 | u64 seq; | ||
901 | u32 idx; | ||
902 | |||
903 | if (clear_seq < log_first_seq) { | ||
904 | /* messages are gone, move to first available one */ | ||
905 | clear_seq = log_first_seq; | ||
906 | clear_idx = log_first_idx; | ||
907 | } | ||
908 | |||
909 | /* | ||
910 | * Find first record that fits, including all following records, | ||
911 | * into the user-provided buffer for this dump. | ||
912 | */ | ||
913 | seq = clear_seq; | ||
914 | idx = clear_idx; | ||
915 | while (seq < log_next_seq) { | ||
916 | struct log *msg = log_from_idx(idx); | ||
917 | |||
918 | len += msg_print_text(msg, true, NULL, 0); | ||
919 | idx = log_next(idx); | ||
920 | seq++; | ||
921 | } | ||
922 | seq = clear_seq; | ||
923 | idx = clear_idx; | ||
924 | while (len > size && seq < log_next_seq) { | ||
925 | struct log *msg = log_from_idx(idx); | ||
926 | |||
927 | len -= msg_print_text(msg, true, NULL, 0); | ||
928 | idx = log_next(idx); | ||
929 | seq++; | ||
930 | } | ||
931 | |||
932 | /* last message in this dump */ | ||
933 | next_seq = log_next_seq; | ||
934 | |||
935 | len = 0; | ||
936 | while (len >= 0 && seq < next_seq) { | ||
937 | struct log *msg = log_from_idx(idx); | ||
938 | int textlen; | ||
939 | |||
940 | textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); | ||
941 | if (textlen < 0) { | ||
942 | len = textlen; | ||
943 | break; | ||
944 | } | ||
945 | idx = log_next(idx); | ||
946 | seq++; | ||
947 | |||
948 | raw_spin_unlock_irq(&logbuf_lock); | ||
949 | if (copy_to_user(buf + len, text, textlen)) | ||
950 | len = -EFAULT; | ||
951 | else | ||
952 | len += textlen; | ||
953 | raw_spin_lock_irq(&logbuf_lock); | ||
954 | |||
955 | if (seq < log_first_seq) { | ||
956 | /* messages are gone, move to next one */ | ||
957 | seq = log_first_seq; | ||
958 | idx = log_first_idx; | ||
959 | } | ||
960 | } | ||
961 | } | ||
962 | |||
963 | if (clear) { | ||
964 | clear_seq = log_next_seq; | ||
965 | clear_idx = log_next_idx; | ||
966 | } | ||
967 | raw_spin_unlock_irq(&logbuf_lock); | ||
968 | |||
969 | kfree(text); | ||
970 | return len; | ||
971 | } | ||
972 | |||
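syslog_print_all() sizes the dump with two dry-run passes - first summing the rendered length of every record from clear_seq onward, then advancing the start point until the remainder fits the user buffer - before a third pass formats and copies, dropping the lock around each copy_to_user(). A toy model of the sizing passes over an array of per-record lengths (names and values hypothetical):

#include <stdio.h>
#include <stddef.h>

/* Given per-record rendered lengths, find the first record whose suffix
 * still fits into a buffer of 'size' bytes. */
static size_t first_fitting(const size_t *lens, size_t nrecords, size_t size)
{
    size_t total = 0, first = 0, i;

    for (i = 0; i < nrecords; i++)              /* pass 1: total length */
        total += lens[i];
    while (total > size && first < nrecords)    /* pass 2: drop oldest */
        total -= lens[first++];
    return first;                               /* pass 3 copies from here */
}

int main(void)
{
    size_t lens[] = { 40, 120, 75, 300, 64 };

    printf("start copying at record %zu\n", first_fitting(lens, 5, 512));
    return 0;
}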
335 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 973 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
336 | { | 974 | { |
337 | unsigned i, j, limit, count; | 975 | bool clear = false; |
338 | int do_clear = 0; | 976 | static int saved_console_loglevel = -1; |
339 | char c; | ||
340 | int error; | 977 | int error; |
341 | 978 | ||
342 | error = check_syslog_permissions(type, from_file); | 979 | error = check_syslog_permissions(type, from_file); |
@@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
364 | goto out; | 1001 | goto out; |
365 | } | 1002 | } |
366 | error = wait_event_interruptible(log_wait, | 1003 | error = wait_event_interruptible(log_wait, |
367 | (log_start - log_end)); | 1004 | syslog_seq != log_next_seq); |
368 | if (error) | 1005 | if (error) |
369 | goto out; | 1006 | goto out; |
370 | i = 0; | 1007 | error = syslog_print(buf, len); |
371 | raw_spin_lock_irq(&logbuf_lock); | ||
372 | while (!error && (log_start != log_end) && i < len) { | ||
373 | c = LOG_BUF(log_start); | ||
374 | log_start++; | ||
375 | raw_spin_unlock_irq(&logbuf_lock); | ||
376 | error = __put_user(c,buf); | ||
377 | buf++; | ||
378 | i++; | ||
379 | cond_resched(); | ||
380 | raw_spin_lock_irq(&logbuf_lock); | ||
381 | } | ||
382 | raw_spin_unlock_irq(&logbuf_lock); | ||
383 | if (!error) | ||
384 | error = i; | ||
385 | break; | 1008 | break; |
386 | /* Read/clear last kernel messages */ | 1009 | /* Read/clear last kernel messages */ |
387 | case SYSLOG_ACTION_READ_CLEAR: | 1010 | case SYSLOG_ACTION_READ_CLEAR: |
388 | do_clear = 1; | 1011 | clear = true; |
389 | /* FALL THRU */ | 1012 | /* FALL THRU */ |
390 | /* Read last kernel messages */ | 1013 | /* Read last kernel messages */ |
391 | case SYSLOG_ACTION_READ_ALL: | 1014 | case SYSLOG_ACTION_READ_ALL: |
@@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | error = -EFAULT; | 1022 | error = -EFAULT; |
400 | goto out; | 1023 | goto out; |
401 | } | 1024 | } |
402 | count = len; | 1025 | error = syslog_print_all(buf, len, clear); |
403 | if (count > log_buf_len) | ||
404 | count = log_buf_len; | ||
405 | raw_spin_lock_irq(&logbuf_lock); | ||
406 | if (count > logged_chars) | ||
407 | count = logged_chars; | ||
408 | if (do_clear) | ||
409 | logged_chars = 0; | ||
410 | limit = log_end; | ||
411 | /* | ||
412 | * __put_user() could sleep, and while we sleep | ||
413 | * printk() could overwrite the messages | ||
414 | * we try to copy to user space. Therefore | ||
415 | * the messages are copied in reverse. <manfreds> | ||
416 | */ | ||
417 | for (i = 0; i < count && !error; i++) { | ||
418 | j = limit-1-i; | ||
419 | if (j + log_buf_len < log_end) | ||
420 | break; | ||
421 | c = LOG_BUF(j); | ||
422 | raw_spin_unlock_irq(&logbuf_lock); | ||
423 | error = __put_user(c,&buf[count-1-i]); | ||
424 | cond_resched(); | ||
425 | raw_spin_lock_irq(&logbuf_lock); | ||
426 | } | ||
427 | raw_spin_unlock_irq(&logbuf_lock); | ||
428 | if (error) | ||
429 | break; | ||
430 | error = i; | ||
431 | if (i != count) { | ||
432 | int offset = count-error; | ||
433 | /* buffer overflow during copy, correct user buffer. */ | ||
434 | for (i = 0; i < error; i++) { | ||
435 | if (__get_user(c,&buf[i+offset]) || | ||
436 | __put_user(c,&buf[i])) { | ||
437 | error = -EFAULT; | ||
438 | break; | ||
439 | } | ||
440 | cond_resched(); | ||
441 | } | ||
442 | } | ||
443 | break; | 1026 | break; |
444 | /* Clear ring buffer */ | 1027 | /* Clear ring buffer */ |
445 | case SYSLOG_ACTION_CLEAR: | 1028 | case SYSLOG_ACTION_CLEAR: |
446 | logged_chars = 0; | 1029 | syslog_print_all(NULL, 0, true); |
447 | break; | ||
448 | /* Disable logging to console */ | 1030 | /* Disable logging to console */ |
449 | case SYSLOG_ACTION_CONSOLE_OFF: | 1031 | case SYSLOG_ACTION_CONSOLE_OFF: |
450 | if (saved_console_loglevel == -1) | 1032 | if (saved_console_loglevel == -1) |
@@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
472 | break; | 1054 | break; |
473 | /* Number of chars in the log buffer */ | 1055 | /* Number of chars in the log buffer */ |
474 | case SYSLOG_ACTION_SIZE_UNREAD: | 1056 | case SYSLOG_ACTION_SIZE_UNREAD: |
475 | error = log_end - log_start; | 1057 | raw_spin_lock_irq(&logbuf_lock); |
1058 | if (syslog_seq < log_first_seq) { | ||
1059 | /* messages are gone, move to first one */ | ||
1060 | syslog_seq = log_first_seq; | ||
1061 | syslog_idx = log_first_idx; | ||
1062 | } | ||
1063 | if (from_file) { | ||
1064 | /* | ||
1065 | * Short-cut for poll(/"proc/kmsg") which simply checks | ||
1066 | * for pending data, not the size; return the count of | ||
1067 | * records, not the length. | ||
1068 | */ | ||
1069 | error = log_next_idx - syslog_idx; | ||
1070 | } else { | ||
1071 | u64 seq; | ||
1072 | u32 idx; | ||
1073 | |||
1074 | error = 0; | ||
1075 | seq = syslog_seq; | ||
1076 | idx = syslog_idx; | ||
1077 | while (seq < log_next_seq) { | ||
1078 | struct log *msg = log_from_idx(idx); | ||
1079 | |||
1080 | error += msg_print_text(msg, true, NULL, 0); | ||
1081 | idx = log_next(idx); | ||
1082 | seq++; | ||
1083 | } | ||
1084 | } | ||
1085 | raw_spin_unlock_irq(&logbuf_lock); | ||
476 | break; | 1086 | break; |
477 | /* Size of the log buffer */ | 1087 | /* Size of the log buffer */ |
478 | case SYSLOG_ACTION_SIZE_BUFFER: | 1088 | case SYSLOG_ACTION_SIZE_BUFFER: |
@@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4]) | |||
501 | { | 1111 | { |
502 | syslog_data[0] = log_buf; | 1112 | syslog_data[0] = log_buf; |
503 | syslog_data[1] = log_buf + log_buf_len; | 1113 | syslog_data[1] = log_buf + log_buf_len; |
504 | syslog_data[2] = log_buf + log_end - | 1114 | syslog_data[2] = log_buf + log_first_idx; |
505 | (logged_chars < log_buf_len ? logged_chars : log_buf_len); | 1115 | syslog_data[3] = log_buf + log_next_idx; |
506 | syslog_data[3] = log_buf + log_end; | ||
507 | } | 1116 | } |
508 | #endif /* CONFIG_KGDB_KDB */ | 1117 | #endif /* CONFIG_KGDB_KDB */ |
509 | 1118 | ||
510 | /* | ||
511 | * Call the console drivers on a range of log_buf | ||
512 | */ | ||
513 | static void __call_console_drivers(unsigned start, unsigned end) | ||
514 | { | ||
515 | struct console *con; | ||
516 | |||
517 | for_each_console(con) { | ||
518 | if (exclusive_console && con != exclusive_console) | ||
519 | continue; | ||
520 | if ((con->flags & CON_ENABLED) && con->write && | ||
521 | (cpu_online(smp_processor_id()) || | ||
522 | (con->flags & CON_ANYTIME))) | ||
523 | con->write(con, &LOG_BUF(start), end - start); | ||
524 | } | ||
525 | } | ||
526 | |||
527 | static bool __read_mostly ignore_loglevel; | 1119 | static bool __read_mostly ignore_loglevel; |
528 | 1120 | ||
529 | static int __init ignore_loglevel_setup(char *str) | 1121 | static int __init ignore_loglevel_setup(char *str) |
@@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | |||
540 | "print all kernel messages to the console."); | 1132 | "print all kernel messages to the console."); |
541 | 1133 | ||
542 | /* | 1134 | /* |
543 | * Write out chars from start to end - 1 inclusive | ||
544 | */ | ||
545 | static void _call_console_drivers(unsigned start, | ||
546 | unsigned end, int msg_log_level) | ||
547 | { | ||
548 | trace_console(&LOG_BUF(0), start, end, log_buf_len); | ||
549 | |||
550 | if ((msg_log_level < console_loglevel || ignore_loglevel) && | ||
551 | console_drivers && start != end) { | ||
552 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | ||
553 | /* wrapped write */ | ||
554 | __call_console_drivers(start & LOG_BUF_MASK, | ||
555 | log_buf_len); | ||
556 | __call_console_drivers(0, end & LOG_BUF_MASK); | ||
557 | } else { | ||
558 | __call_console_drivers(start, end); | ||
559 | } | ||
560 | } | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the | ||
565 | * lower 3 bit are the log level, the rest are the log facility. In case | ||
566 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
567 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
568 | * to extract the correct log level for in-kernel processing, and not mangle | ||
569 | * the original value. | ||
570 | * | ||
571 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
572 | * passed, it will be filled in with the log level without a possible facility | ||
573 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
574 | * and returned. If no valid header is found, 0 is returned and the passed | ||
575 | * variables are not touched. | ||
576 | */ | ||
577 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
578 | { | ||
579 | unsigned int lev = 0; | ||
580 | char sp = '\0'; | ||
581 | size_t len; | ||
582 | |||
583 | if (p[0] != '<' || !p[1]) | ||
584 | return 0; | ||
585 | if (p[2] == '>') { | ||
586 | /* usual single digit level number or special char */ | ||
587 | switch (p[1]) { | ||
588 | case '0' ... '7': | ||
589 | lev = p[1] - '0'; | ||
590 | break; | ||
591 | case 'c': /* KERN_CONT */ | ||
592 | case 'd': /* KERN_DEFAULT */ | ||
593 | sp = p[1]; | ||
594 | break; | ||
595 | default: | ||
596 | return 0; | ||
597 | } | ||
598 | len = 3; | ||
599 | } else { | ||
600 | /* multi digit including the level and facility number */ | ||
601 | char *endp = NULL; | ||
602 | |||
603 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
604 | if (endp == NULL || endp[0] != '>') | ||
605 | return 0; | ||
606 | len = (endp + 1) - p; | ||
607 | } | ||
608 | |||
609 | /* do not accept special char if not asked for */ | ||
610 | if (sp && !special) | ||
611 | return 0; | ||
612 | |||
613 | if (special) { | ||
614 | *special = sp; | ||
615 | /* return special char, do not touch level */ | ||
616 | if (sp) | ||
617 | return len; | ||
618 | } | ||
619 | |||
620 | if (level) | ||
621 | *level = lev; | ||
622 | return len; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * Call the console drivers, asking them to write out | 1135 | * Call the console drivers, asking them to write out |
627 | * log_buf[start] to log_buf[end - 1]. | 1136 | * log_buf[start] to log_buf[end - 1]. |
628 | * The console_lock must be held. | 1137 | * The console_lock must be held. |
629 | */ | 1138 | */ |
630 | static void call_console_drivers(unsigned start, unsigned end) | 1139 | static void call_console_drivers(int level, const char *text, size_t len) |
631 | { | 1140 | { |
632 | unsigned cur_index, start_print; | 1141 | struct console *con; |
633 | static int msg_level = -1; | ||
634 | 1142 | ||
635 | BUG_ON(((int)(start - end)) > 0); | 1143 | trace_console(text, 0, len, len); |
636 | 1144 | ||
637 | cur_index = start; | 1145 | if (level >= console_loglevel && !ignore_loglevel) |
638 | start_print = start; | 1146 | return; |
639 | while (cur_index != end) { | 1147 | if (!console_drivers) |
640 | if (msg_level < 0 && ((end - cur_index) > 2)) { | 1148 | return; |
641 | /* strip log prefix */ | ||
642 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); | ||
643 | start_print = cur_index; | ||
644 | } | ||
645 | while (cur_index != end) { | ||
646 | char c = LOG_BUF(cur_index); | ||
647 | |||
648 | cur_index++; | ||
649 | if (c == '\n') { | ||
650 | if (msg_level < 0) { | ||
651 | /* | ||
652 | * printk() has already given us loglevel tags in | ||
653 | * the buffer. This code is here in case the | ||
654 | * log buffer has wrapped right round and scribbled | ||
655 | * on those tags | ||
656 | */ | ||
657 | msg_level = default_message_loglevel; | ||
658 | } | ||
659 | _call_console_drivers(start_print, cur_index, msg_level); | ||
660 | msg_level = -1; | ||
661 | start_print = cur_index; | ||
662 | break; | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | _call_console_drivers(start_print, end, msg_level); | ||
667 | } | ||
668 | 1149 | ||
669 | static void emit_log_char(char c) | 1150 | for_each_console(con) { |
670 | { | 1151 | if (exclusive_console && con != exclusive_console) |
671 | LOG_BUF(log_end) = c; | 1152 | continue; |
672 | log_end++; | 1153 | if (!(con->flags & CON_ENABLED)) |
673 | if (log_end - log_start > log_buf_len) | 1154 | continue; |
674 | log_start = log_end - log_buf_len; | 1155 | if (!con->write) |
675 | if (log_end - con_start > log_buf_len) | 1156 | continue; |
676 | con_start = log_end - log_buf_len; | 1157 | if (!cpu_online(smp_processor_id()) && |
677 | if (logged_chars < log_buf_len) | 1158 | !(con->flags & CON_ANYTIME)) |
678 | logged_chars++; | 1159 | continue; |
1160 | con->write(con, text, len); | ||
1161 | } | ||
679 | } | 1162 | } |
680 | 1163 | ||
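call_console_drivers() now receives one fully formatted record plus its level, so the loglevel filtering and the wrap-around handling of the old character-range interface are gone. A userspace model of the per-record filtering; the flag value is illustrative and the cpu_online()/CON_ANYTIME check is left out:

#include <stdio.h>

#define CON_ENABLED 0x1     /* illustrative value, not the kernel's */

struct console {
    unsigned flags;
    void (*write)(struct console *con, const char *text, unsigned len);
    struct console *next;
};

/* Suppressed levels never reach a driver; disabled or write-less consoles
 * are skipped. */
static void deliver(struct console *list, int level, int console_loglevel,
                    const char *text, unsigned len)
{
    struct console *con;

    if (level >= console_loglevel)
        return;
    for (con = list; con; con = con->next) {
        if (!(con->flags & CON_ENABLED) || !con->write)
            continue;
        con->write(con, text, len);
    }
}

static void stdout_write(struct console *con, const char *text, unsigned len)
{
    (void)con;
    printf("%.*s\n", (int)len, text);
}

int main(void)
{
    struct console con = { CON_ENABLED, stdout_write, NULL };

    deliver(&con, 4, 7, "example record", 14);  /* warning vs. loglevel 7 */
    return 0;
}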
681 | /* | 1164 | /* |
@@ -700,16 +1183,6 @@ static void zap_locks(void) | |||
700 | sema_init(&console_sem, 1); | 1183 | sema_init(&console_sem, 1); |
701 | } | 1184 | } |
702 | 1185 | ||
703 | #if defined(CONFIG_PRINTK_TIME) | ||
704 | static bool printk_time = 1; | ||
705 | #else | ||
706 | static bool printk_time = 0; | ||
707 | #endif | ||
708 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
709 | |||
710 | static bool always_kmsg_dump; | ||
711 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
712 | |||
713 | /* Check if we have any console registered that can be called early in boot. */ | 1186 | /* Check if we have any console registered that can be called early in boot. */ |
714 | static int have_callable_console(void) | 1187 | static int have_callable_console(void) |
715 | { | 1188 | { |
@@ -722,51 +1195,6 @@ static int have_callable_console(void) | |||
722 | return 0; | 1195 | return 0; |
723 | } | 1196 | } |
724 | 1197 | ||
725 | /** | ||
726 | * printk - print a kernel message | ||
727 | * @fmt: format string | ||
728 | * | ||
729 | * This is printk(). It can be called from any context. We want it to work. | ||
730 | * | ||
731 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and | ||
732 | * call the console drivers. If we fail to get the semaphore we place the output | ||
733 | * into the log buffer and return. The current holder of the console_sem will | ||
734 | * notice the new output in console_unlock(); and will send it to the | ||
735 | * consoles before releasing the lock. | ||
736 | * | ||
737 | * One effect of this deferred printing is that code which calls printk() and | ||
738 | * then changes console_loglevel may break. This is because console_loglevel | ||
739 | * is inspected when the actual printing occurs. | ||
740 | * | ||
741 | * See also: | ||
742 | * printf(3) | ||
743 | * | ||
744 | * See the vsnprintf() documentation for format string extensions over C99. | ||
745 | */ | ||
746 | |||
747 | asmlinkage int printk(const char *fmt, ...) | ||
748 | { | ||
749 | va_list args; | ||
750 | int r; | ||
751 | |||
752 | #ifdef CONFIG_KGDB_KDB | ||
753 | if (unlikely(kdb_trap_printk)) { | ||
754 | va_start(args, fmt); | ||
755 | r = vkdb_printf(fmt, args); | ||
756 | va_end(args); | ||
757 | return r; | ||
758 | } | ||
759 | #endif | ||
760 | va_start(args, fmt); | ||
761 | r = vprintk(fmt, args); | ||
762 | va_end(args); | ||
763 | |||
764 | return r; | ||
765 | } | ||
766 | |||
767 | /* cpu currently holding logbuf_lock */ | ||
768 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
769 | |||
770 | /* | 1198 | /* |
771 | * Can we actually use the console at this time on this cpu? | 1199 | * Can we actually use the console at this time on this cpu? |
772 | * | 1200 | * |
@@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
810 | retval = 0; | 1238 | retval = 0; |
811 | } | 1239 | } |
812 | } | 1240 | } |
813 | printk_cpu = UINT_MAX; | 1241 | logbuf_cpu = UINT_MAX; |
814 | if (wake) | 1242 | if (wake) |
815 | up(&console_sem); | 1243 | up(&console_sem); |
816 | raw_spin_unlock(&logbuf_lock); | 1244 | raw_spin_unlock(&logbuf_lock); |
817 | return retval; | 1245 | return retval; |
818 | } | 1246 | } |
819 | static const char recursion_bug_msg [] = | ||
820 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
821 | static int recursion_bug; | ||
822 | static int new_text_line = 1; | ||
823 | static char printk_buf[1024]; | ||
824 | 1247 | ||
825 | int printk_delay_msec __read_mostly; | 1248 | int printk_delay_msec __read_mostly; |
826 | 1249 | ||
@@ -836,15 +1259,23 @@ static inline void printk_delay(void) | |||
836 | } | 1259 | } |
837 | } | 1260 | } |
838 | 1261 | ||
839 | asmlinkage int vprintk(const char *fmt, va_list args) | 1262 | asmlinkage int vprintk_emit(int facility, int level, |
1263 | const char *dict, size_t dictlen, | ||
1264 | const char *fmt, va_list args) | ||
840 | { | 1265 | { |
841 | int printed_len = 0; | 1266 | static int recursion_bug; |
842 | int current_log_level = default_message_loglevel; | 1267 | static char cont_buf[LOG_LINE_MAX]; |
1268 | static size_t cont_len; | ||
1269 | static int cont_level; | ||
1270 | static struct task_struct *cont_task; | ||
1271 | static char textbuf[LOG_LINE_MAX]; | ||
1272 | char *text = textbuf; | ||
1273 | size_t text_len; | ||
843 | unsigned long flags; | 1274 | unsigned long flags; |
844 | int this_cpu; | 1275 | int this_cpu; |
845 | char *p; | 1276 | bool newline = false; |
846 | size_t plen; | 1277 | bool prefix = false; |
847 | char special; | 1278 | int printed_len = 0; |
848 | 1279 | ||
849 | boot_delay_msec(); | 1280 | boot_delay_msec(); |
850 | printk_delay(); | 1281 | printk_delay(); |
@@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | /* | 1287 | /* |
857 | * Ouch, printk recursed into itself! | 1288 | * Ouch, printk recursed into itself! |
858 | */ | 1289 | */ |
859 | if (unlikely(printk_cpu == this_cpu)) { | 1290 | if (unlikely(logbuf_cpu == this_cpu)) { |
860 | /* | 1291 | /* |
861 | * If a crash is occurring during printk() on this CPU, | 1292 | * If a crash is occurring during printk() on this CPU, |
862 | * then try to get the crash message out but make sure | 1293 | * then try to get the crash message out but make sure |
@@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
873 | 1304 | ||
874 | lockdep_off(); | 1305 | lockdep_off(); |
875 | raw_spin_lock(&logbuf_lock); | 1306 | raw_spin_lock(&logbuf_lock); |
876 | printk_cpu = this_cpu; | 1307 | logbuf_cpu = this_cpu; |
877 | 1308 | ||
878 | if (recursion_bug) { | 1309 | if (recursion_bug) { |
1310 | static const char recursion_msg[] = | ||
1311 | "BUG: recent printk recursion!"; | ||
1312 | |||
879 | recursion_bug = 0; | 1313 | recursion_bug = 0; |
880 | strcpy(printk_buf, recursion_bug_msg); | 1314 | printed_len += strlen(recursion_msg); |
881 | printed_len = strlen(recursion_bug_msg); | 1315 | /* emit KERN_CRIT message */ |
1316 | log_store(0, 2, NULL, 0, recursion_msg, printed_len); | ||
882 | } | 1317 | } |
883 | /* Emit the output into the temporary buffer */ | ||
884 | printed_len += vscnprintf(printk_buf + printed_len, | ||
885 | sizeof(printk_buf) - printed_len, fmt, args); | ||
886 | 1318 | ||
887 | p = printk_buf; | 1319 | /* |
1320 | * The printf needs to come first; we need the syslog | ||
1321 | * prefix which might be passed-in as a parameter. | ||
1322 | */ | ||
1323 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | ||
888 | 1324 | ||
889 | /* Read log level and handle special printk prefix */ | 1325 | /* mark and strip a trailing newline */ |
890 | plen = log_prefix(p, ¤t_log_level, &special); | 1326 | if (text_len && text[text_len-1] == '\n') { |
891 | if (plen) { | 1327 | text_len--; |
892 | p += plen; | 1328 | newline = true; |
1329 | } | ||
893 | 1330 | ||
894 | switch (special) { | 1331 | /* strip syslog prefix and extract log level or control flags */ |
895 | case 'c': /* Strip <c> KERN_CONT, continue line */ | 1332 | if (text[0] == '<' && text[1] && text[2] == '>') { |
896 | plen = 0; | 1333 | switch (text[1]) { |
897 | break; | 1334 | case '0' ... '7': |
898 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ | 1335 | if (level == -1) |
899 | plen = 0; | 1336 | level = text[1] - '0'; |
900 | default: | 1337 | case 'd': /* KERN_DEFAULT */ |
901 | if (!new_text_line) { | 1338 | prefix = true; |
902 | emit_log_char('\n'); | 1339 | case 'c': /* KERN_CONT */ |
903 | new_text_line = 1; | 1340 | text += 3; |
904 | } | 1341 | text_len -= 3; |
905 | } | 1342 | } |
906 | } | 1343 | } |
907 | 1344 | ||
908 | /* | 1345 | if (level == -1) |
909 | * Copy the output into log_buf. If the caller didn't provide | 1346 | level = default_message_loglevel; |
910 | * the appropriate log prefix, we insert them here | ||
911 | */ | ||
912 | for (; *p; p++) { | ||
913 | if (new_text_line) { | ||
914 | new_text_line = 0; | ||
915 | |||
916 | if (plen) { | ||
917 | /* Copy original log prefix */ | ||
918 | int i; | ||
919 | |||
920 | for (i = 0; i < plen; i++) | ||
921 | emit_log_char(printk_buf[i]); | ||
922 | printed_len += plen; | ||
923 | } else { | ||
924 | /* Add log prefix */ | ||
925 | emit_log_char('<'); | ||
926 | emit_log_char(current_log_level + '0'); | ||
927 | emit_log_char('>'); | ||
928 | printed_len += 3; | ||
929 | } | ||
930 | 1347 | ||
931 | if (printk_time) { | 1348 | if (dict) { |
932 | /* Add the current time stamp */ | 1349 | prefix = true; |
933 | char tbuf[50], *tp; | 1350 | newline = true; |
934 | unsigned tlen; | 1351 | } |
935 | unsigned long long t; | ||
936 | unsigned long nanosec_rem; | ||
937 | |||
938 | t = cpu_clock(printk_cpu); | ||
939 | nanosec_rem = do_div(t, 1000000000); | ||
940 | tlen = sprintf(tbuf, "[%5lu.%06lu] ", | ||
941 | (unsigned long) t, | ||
942 | nanosec_rem / 1000); | ||
943 | |||
944 | for (tp = tbuf; tp < tbuf + tlen; tp++) | ||
945 | emit_log_char(*tp); | ||
946 | printed_len += tlen; | ||
947 | } | ||
948 | 1352 | ||
949 | if (!*p) | 1353 | if (!newline) { |
950 | break; | 1354 | if (cont_len && (prefix || cont_task != current)) { |
1355 | /* | ||
1356 | * Flush earlier buffer, which is either from a | ||
1357 | * different thread, or when we got a new prefix. | ||
1358 | */ | ||
1359 | log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); | ||
1360 | cont_len = 0; | ||
951 | } | 1361 | } |
952 | 1362 | ||
953 | emit_log_char(*p); | 1363 | if (!cont_len) { |
954 | if (*p == '\n') | 1364 | cont_level = level; |
955 | new_text_line = 1; | 1365 | cont_task = current; |
1366 | } | ||
1367 | |||
1368 | /* buffer or append to earlier buffer from the same thread */ | ||
1369 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1370 | text_len = sizeof(cont_buf) - cont_len; | ||
1371 | memcpy(cont_buf + cont_len, text, text_len); | ||
1372 | cont_len += text_len; | ||
1373 | } else { | ||
1374 | if (cont_len && cont_task == current) { | ||
1375 | if (prefix) { | ||
1376 | /* | ||
1377 | * New prefix from the same thread; flush. We | ||
1378 | * either got no earlier newline, or we race | ||
1379 | * with an interrupt. | ||
1380 | */ | ||
1381 | log_store(facility, cont_level, | ||
1382 | NULL, 0, cont_buf, cont_len); | ||
1383 | cont_len = 0; | ||
1384 | } | ||
1385 | |||
1386 | /* append to the earlier buffer and flush */ | ||
1387 | if (cont_len + text_len > sizeof(cont_buf)) | ||
1388 | text_len = sizeof(cont_buf) - cont_len; | ||
1389 | memcpy(cont_buf + cont_len, text, text_len); | ||
1390 | cont_len += text_len; | ||
1391 | log_store(facility, cont_level, | ||
1392 | NULL, 0, cont_buf, cont_len); | ||
1393 | cont_len = 0; | ||
1394 | cont_task = NULL; | ||
1395 | printed_len = cont_len; | ||
1396 | } else { | ||
1397 | /* ordinary single and terminated line */ | ||
1398 | log_store(facility, level, | ||
1399 | dict, dictlen, text, text_len); | ||
1400 | printed_len = text_len; | ||
1401 | } | ||
956 | } | 1402 | } |
957 | 1403 | ||
958 | /* | 1404 | /* |
959 | * Try to acquire and then immediately release the | 1405 | * Try to acquire and then immediately release the console semaphore. |
960 | * console semaphore. The release will do all the | 1406 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
961 | * actual magic (print out buffers, wake up klogd, | 1407 | * users. |
962 | * etc). | ||
963 | * | 1408 | * |
964 | * The console_trylock_for_printk() function | 1409 | * The console_trylock_for_printk() function will release 'logbuf_lock' |
965 | * will release 'logbuf_lock' regardless of whether it | 1410 | * regardless of whether it actually gets the console semaphore or not. |
966 | * actually gets the semaphore or not. | ||
967 | */ | 1411 | */ |
968 | if (console_trylock_for_printk(this_cpu)) | 1412 | if (console_trylock_for_printk(this_cpu)) |
969 | console_unlock(); | 1413 | console_unlock(); |
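The continuation handling in vprintk_emit() buffers a fragment that lacks a trailing newline in cont_buf, tagged with the emitting task, and flushes it into the log as soon as a newline arrives, a new prefix shows up, or a different task prints. A compilable model of those flush rules; a plain int stands in for the task_struct pointer and printf() for log_store():

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static char cont_buf[256];
static size_t cont_len;
static int cont_task = -1;

static void flush(void)
{
    if (!cont_len)
        return;
    printf("record: \"%.*s\"\n", (int)cont_len, cont_buf);
    cont_len = 0;
    cont_task = -1;
}

static void emit(int task, const char *text, bool has_prefix)
{
    size_t len = strlen(text);
    bool newline = len && text[len - 1] == '\n';

    if (newline)
        len--;                  /* the '\n' only terminates the record */
    if (cont_len && (has_prefix || cont_task != task))
        flush();                /* new prefix or different task: flush */
    if (cont_len + len > sizeof(cont_buf))
        len = sizeof(cont_buf) - cont_len;
    memcpy(cont_buf + cont_len, text, len);
    cont_len += len;
    cont_task = task;
    if (newline)
        flush();                /* a terminated line becomes one record */
}

int main(void)
{
    emit(1, "checking feature X...", true);
    emit(1, " ok\n", false);    /* same task, no prefix: appended */
    emit(2, "another message\n", true);
    return 0;
}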
@@ -974,16 +1418,81 @@ out_restore_irqs: | |||
974 | 1418 | ||
975 | return printed_len; | 1419 | return printed_len; |
976 | } | 1420 | } |
977 | EXPORT_SYMBOL(printk); | 1421 | EXPORT_SYMBOL(vprintk_emit); |
978 | EXPORT_SYMBOL(vprintk); | ||
979 | 1422 | ||
980 | #else | 1423 | asmlinkage int vprintk(const char *fmt, va_list args) |
1424 | { | ||
1425 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1426 | } | ||
1427 | EXPORT_SYMBOL(vprintk); | ||
981 | 1428 | ||
982 | static void call_console_drivers(unsigned start, unsigned end) | 1429 | asmlinkage int printk_emit(int facility, int level, |
1430 | const char *dict, size_t dictlen, | ||
1431 | const char *fmt, ...) | ||
983 | { | 1432 | { |
1433 | va_list args; | ||
1434 | int r; | ||
1435 | |||
1436 | va_start(args, fmt); | ||
1437 | r = vprintk_emit(facility, level, dict, dictlen, fmt, args); | ||
1438 | va_end(args); | ||
1439 | |||
1440 | return r; | ||
984 | } | 1441 | } |
1442 | EXPORT_SYMBOL(printk_emit); | ||
985 | 1443 | ||
1444 | /** | ||
1445 | * printk - print a kernel message | ||
1446 | * @fmt: format string | ||
1447 | * | ||
1448 | * This is printk(). It can be called from any context. We want it to work. | ||
1449 | * | ||
1450 | * We try to grab the console_lock. If we succeed, it's easy - we log the | ||
1451 | * output and call the console drivers. If we fail to get the semaphore, we | ||
1452 | * place the output into the log buffer and return. The current holder of | ||
1453 | * the console_sem will notice the new output in console_unlock(); and will | ||
1454 | * send it to the consoles before releasing the lock. | ||
1455 | * | ||
1456 | * One effect of this deferred printing is that code which calls printk() and | ||
1457 | * then changes console_loglevel may break. This is because console_loglevel | ||
1458 | * is inspected when the actual printing occurs. | ||
1459 | * | ||
1460 | * See also: | ||
1461 | * printf(3) | ||
1462 | * | ||
1463 | * See the vsnprintf() documentation for format string extensions over C99. | ||
1464 | */ | ||
1465 | asmlinkage int printk(const char *fmt, ...) | ||
1466 | { | ||
1467 | va_list args; | ||
1468 | int r; | ||
1469 | |||
1470 | #ifdef CONFIG_KGDB_KDB | ||
1471 | if (unlikely(kdb_trap_printk)) { | ||
1472 | va_start(args, fmt); | ||
1473 | r = vkdb_printf(fmt, args); | ||
1474 | va_end(args); | ||
1475 | return r; | ||
1476 | } | ||
986 | #endif | 1477 | #endif |
1478 | va_start(args, fmt); | ||
1479 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1480 | va_end(args); | ||
1481 | |||
1482 | return r; | ||
1483 | } | ||
1484 | EXPORT_SYMBOL(printk); | ||
1485 | |||
1486 | #else | ||
1487 | |||
1488 | #define LOG_LINE_MAX 0 | ||
1489 | static struct log *log_from_idx(u32 idx) { return NULL; } | ||
1490 | static u32 log_next(u32 idx) { return 0; } | ||
1491 | static void call_console_drivers(int level, const char *text, size_t len) {} | ||
1492 | static size_t msg_print_text(const struct log *msg, bool syslog, | ||
1493 | char *buf, size_t size) { return 0; } | ||
1494 | |||
1495 | #endif /* CONFIG_PRINTK */ | ||
987 | 1496 | ||
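printk_emit() exposes the facility argument and the optional dict of metadata that vprintk_emit() stores with each record; ordinary printk() is now just the facility-0 wrapper around it. As the removed log_prefix() comment noted, userspace writers to /dev/kmsg can carry the facility in the same "<prio>" prefix, with prio = facility * 8 + level; a small illustration of that convention, assuming a writable /dev/kmsg (the write path itself is not part of this hunk):

#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    /* prio 14 = facility 1 (user) * 8 + level 6 (info) */
    const char msg[] = "<14>hello from userspace\n";
    int fd = open("/dev/kmsg", O_WRONLY);

    if (fd < 0)
        return 1;
    if (write(fd, msg, strlen(msg)) < 0) {      /* one write() per record */
        close(fd);
        return 1;
    }
    close(fd);
    return 0;
}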
988 | static int __add_preferred_console(char *name, int idx, char *options, | 1497 | static int __add_preferred_console(char *name, int idx, char *options, |
989 | char *brl_options) | 1498 | char *brl_options) |
@@ -1217,7 +1726,7 @@ int is_console_locked(void) | |||
1217 | } | 1726 | } |
1218 | 1727 | ||
1219 | /* | 1728 | /* |
1220 | * Delayed printk facility, for scheduler-internal messages: | 1729 | * Delayed printk version, for scheduler-internal messages: |
1221 | */ | 1730 | */ |
1222 | #define PRINTK_BUF_SIZE 512 | 1731 | #define PRINTK_BUF_SIZE 512 |
1223 | 1732 | ||
@@ -1253,6 +1762,10 @@ void wake_up_klogd(void) | |||
1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1762 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1254 | } | 1763 | } |
1255 | 1764 | ||
1765 | /* the next printk record to write to the console */ | ||
1766 | static u64 console_seq; | ||
1767 | static u32 console_idx; | ||
1768 | |||
1256 | /** | 1769 | /** |
1257 | * console_unlock - unlock the console system | 1770 | * console_unlock - unlock the console system |
1258 | * | 1771 | * |
@@ -1263,15 +1776,16 @@ void wake_up_klogd(void) | |||
1263 | * by printk(). If this is the case, console_unlock(); emits | 1776 | * by printk(). If this is the case, console_unlock(); emits |
1264 | * the output prior to releasing the lock. | 1777 | * the output prior to releasing the lock. |
1265 | * | 1778 | * |
1266 | * If there is output waiting for klogd, we wake it up. | 1779 | * If there is output waiting, we wake /dev/kmsg and syslog() users. |
1267 | * | 1780 | * |
1268 | * console_unlock(); may be called from any context. | 1781 | * console_unlock(); may be called from any context. |
1269 | */ | 1782 | */ |
1270 | void console_unlock(void) | 1783 | void console_unlock(void) |
1271 | { | 1784 | { |
1785 | static u64 seen_seq; | ||
1272 | unsigned long flags; | 1786 | unsigned long flags; |
1273 | unsigned _con_start, _log_end; | 1787 | bool wake_klogd = false; |
1274 | unsigned wake_klogd = 0, retry = 0; | 1788 | bool retry; |
1275 | 1789 | ||
1276 | if (console_suspended) { | 1790 | if (console_suspended) { |
1277 | up(&console_sem); | 1791 | up(&console_sem); |
@@ -1281,17 +1795,38 @@ void console_unlock(void) | |||
1281 | console_may_schedule = 0; | 1795 | console_may_schedule = 0; |
1282 | 1796 | ||
1283 | again: | 1797 | again: |
1284 | for ( ; ; ) { | 1798 | for (;;) { |
1799 | struct log *msg; | ||
1800 | static char text[LOG_LINE_MAX]; | ||
1801 | size_t len; | ||
1802 | int level; | ||
1803 | |||
1285 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 1804 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1286 | wake_klogd |= log_start - log_end; | 1805 | if (seen_seq != log_next_seq) { |
1287 | if (con_start == log_end) | 1806 | wake_klogd = true; |
1288 | break; /* Nothing to print */ | 1807 | seen_seq = log_next_seq; |
1289 | _con_start = con_start; | 1808 | } |
1290 | _log_end = log_end; | 1809 | |
1291 | con_start = log_end; /* Flush */ | 1810 | if (console_seq < log_first_seq) { |
1811 | /* messages are gone, move to first one */ | ||
1812 | console_seq = log_first_seq; | ||
1813 | console_idx = log_first_idx; | ||
1814 | } | ||
1815 | |||
1816 | if (console_seq == log_next_seq) | ||
1817 | break; | ||
1818 | |||
1819 | msg = log_from_idx(console_idx); | ||
1820 | level = msg->level & 7; | ||
1821 | |||
1822 | len = msg_print_text(msg, false, text, sizeof(text)); | ||
1823 | |||
1824 | console_idx = log_next(console_idx); | ||
1825 | console_seq++; | ||
1292 | raw_spin_unlock(&logbuf_lock); | 1826 | raw_spin_unlock(&logbuf_lock); |
1827 | |||
1293 | stop_critical_timings(); /* don't trace print latency */ | 1828 | stop_critical_timings(); /* don't trace print latency */ |
1294 | call_console_drivers(_con_start, _log_end); | 1829 | call_console_drivers(level, text, len); |
1295 | start_critical_timings(); | 1830 | start_critical_timings(); |
1296 | local_irq_restore(flags); | 1831 | local_irq_restore(flags); |
1297 | } | 1832 | } |
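console_unlock() now walks its own (console_seq, console_idx) cursor through the record ring; when the writer has lapped it (console_seq < log_first_seq) it jumps to the oldest surviving record, so a slow console drops whole records rather than printing a half-overwritten byte range. A tiny model of that resync, with invented numbers:

#include <stdio.h>

int main(void)
{
    unsigned long long log_first_seq = 900;     /* oldest surviving record */
    unsigned long long log_next_seq = 1000;     /* next record to be written */
    unsigned long long console_seq = 750;       /* console printed up to #749 */

    if (console_seq < log_first_seq) {
        printf("dropped %llu records\n", log_first_seq - console_seq);
        console_seq = log_first_seq;
    }
    while (console_seq < log_next_seq)
        console_seq++;          /* format and print each surviving record */
    printf("caught up at seq %llu\n", console_seq);
    return 0;
}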
@@ -1312,8 +1847,7 @@ again: | |||
1312 | * flush, no worries. | 1847 | * flush, no worries. |
1313 | */ | 1848 | */ |
1314 | raw_spin_lock(&logbuf_lock); | 1849 | raw_spin_lock(&logbuf_lock); |
1315 | if (con_start != log_end) | 1850 | retry = console_seq != log_next_seq; |
1316 | retry = 1; | ||
1317 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 1851 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1318 | 1852 | ||
1319 | if (retry && console_trylock()) | 1853 | if (retry && console_trylock()) |
@@ -1549,7 +2083,8 @@ void register_console(struct console *newcon) | |||
1549 | * for us. | 2083 | * for us. |
1550 | */ | 2084 | */ |
1551 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2085 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1552 | con_start = log_start; | 2086 | console_seq = syslog_seq; |
2087 | console_idx = syslog_idx; | ||
1553 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2088 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1554 | /* | 2089 | /* |
1555 | * We're about to replay the log buffer. Only do this to the | 2090 | * We're about to replay the log buffer. Only do this to the |
@@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1758 | } | 2293 | } |
1759 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 2294 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1760 | 2295 | ||
2296 | static bool always_kmsg_dump; | ||
2297 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
2298 | |||
1761 | /** | 2299 | /** |
1762 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2300 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1763 | * @reason: the reason (oops, panic etc) for dumping | 2301 | * @reason: the reason (oops, panic etc) for dumping |
@@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | |||
1767 | */ | 2305 | */ |
1768 | void kmsg_dump(enum kmsg_dump_reason reason) | 2306 | void kmsg_dump(enum kmsg_dump_reason reason) |
1769 | { | 2307 | { |
1770 | unsigned long end; | 2308 | u64 idx; |
1771 | unsigned chars; | ||
1772 | struct kmsg_dumper *dumper; | 2309 | struct kmsg_dumper *dumper; |
1773 | const char *s1, *s2; | 2310 | const char *s1, *s2; |
1774 | unsigned long l1, l2; | 2311 | unsigned long l1, l2; |
@@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1780 | /* Theoretically, the log could move on after we do this, but | 2317 | /* Theoretically, the log could move on after we do this, but |
1781 | there's not a lot we can do about that. The new messages | 2318 | there's not a lot we can do about that. The new messages |
1782 | will overwrite the start of what we dump. */ | 2319 | will overwrite the start of what we dump. */ |
2320 | |||
1783 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2321 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1784 | end = log_end & LOG_BUF_MASK; | 2322 | if (syslog_seq < log_first_seq) |
1785 | chars = logged_chars; | 2323 | idx = syslog_idx; |
1786 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2324 | else |
2325 | idx = log_first_idx; | ||
1787 | 2326 | ||
1788 | if (chars > end) { | 2327 | if (idx > log_next_idx) { |
1789 | s1 = log_buf + log_buf_len - chars + end; | 2328 | s1 = log_buf; |
1790 | l1 = chars - end; | 2329 | l1 = log_next_idx; |
1791 | 2330 | ||
1792 | s2 = log_buf; | 2331 | s2 = log_buf + idx; |
1793 | l2 = end; | 2332 | l2 = log_buf_len - idx; |
1794 | } else { | 2333 | } else { |
1795 | s1 = ""; | 2334 | s1 = ""; |
1796 | l1 = 0; | 2335 | l1 = 0; |
1797 | 2336 | ||
1798 | s2 = log_buf + end - chars; | 2337 | s2 = log_buf + idx; |
1799 | l2 = chars; | 2338 | l2 = log_next_idx - idx; |
1800 | } | 2339 | } |
2340 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1801 | 2341 | ||
1802 | rcu_read_lock(); | 2342 | rcu_read_lock(); |
1803 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2343 | list_for_each_entry_rcu(dumper, &dump_list, list) |
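Because records are now dumped by index rather than by character count, kmsg_dump() hands each dumper at most two contiguous slices of log_buf: the slice from the starting index to the end of the buffer and, when the writer has wrapped, the slice from the start of the buffer up to log_next_idx. A sketch of that split, using offsets in place of the log_buf pointers and invented numbers:

#include <stdio.h>

int main(void)
{
    unsigned log_buf_len = 1024;
    unsigned idx = 900;             /* index of the oldest record to dump */
    unsigned log_next_idx = 200;    /* writer has wrapped past the start */

    if (idx > log_next_idx) {
        printf("slice A: buf[%u..%u)\n", idx, log_buf_len);
        printf("slice B: buf[0..%u)\n", log_next_idx);
    } else {
        printf("single slice: buf[%u..%u)\n", idx, log_next_idx);
    }
    return 0;
}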
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee8d49b9c309..a232bb59d93f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
198 | return 0; | 198 | return 0; |
199 | rcu_read_lock(); | 199 | rcu_read_lock(); |
200 | tcred = __task_cred(task); | 200 | tcred = __task_cred(task); |
201 | if (cred->user->user_ns == tcred->user->user_ns && | 201 | if (uid_eq(cred->uid, tcred->euid) && |
202 | (cred->uid == tcred->euid && | 202 | uid_eq(cred->uid, tcred->suid) && |
203 | cred->uid == tcred->suid && | 203 | uid_eq(cred->uid, tcred->uid) && |
204 | cred->uid == tcred->uid && | 204 | gid_eq(cred->gid, tcred->egid) && |
205 | cred->gid == tcred->egid && | 205 | gid_eq(cred->gid, tcred->sgid) && |
206 | cred->gid == tcred->sgid && | 206 | gid_eq(cred->gid, tcred->gid)) |
207 | cred->gid == tcred->gid)) | ||
208 | goto ok; | 207 | goto ok; |
209 | if (ptrace_has_cap(tcred->user->user_ns, mode)) | 208 | if (ptrace_has_cap(tcred->user_ns, mode)) |
210 | goto ok; | 209 | goto ok; |
211 | rcu_read_unlock(); | 210 | rcu_read_unlock(); |
212 | return -EPERM; | 211 | return -EPERM; |
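The ptrace access check now compares credentials with uid_eq()/gid_eq() on the kuid_t/kgid_t wrappers instead of raw integer ==, and consults tcred->user_ns directly. A simplified illustration of why the typed wrapper exists; this is a stand-in, not the kernel's actual definition:

#include <stdio.h>
#include <stdbool.h>

/* Wrapping the id in a struct makes "cred->uid == tcred->euid" a compile
 * error, forcing the explicit comparison helper. */
typedef struct { unsigned int val; } kuid_t;

static inline bool uid_eq(kuid_t a, kuid_t b)
{
    return a.val == b.val;
}

int main(void)
{
    kuid_t root = { 0 };
    kuid_t user = { 1000 };

    printf("root==root: %d, root==user: %d\n",
           uid_eq(root, root), uid_eq(root, user));
    return 0;
}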
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc27..95cba41ce1e9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -51,6 +51,34 @@ | |||
51 | 51 | ||
52 | #include "rcu.h" | 52 | #include "rcu.h" |
53 | 53 | ||
54 | #ifdef CONFIG_PREEMPT_RCU | ||
55 | |||
56 | /* | ||
57 | * Check for a task exiting while in a preemptible-RCU read-side | ||
58 | * critical section, clean up if so. No need to issue warnings, | ||
59 | * as debug_check_no_locks_held() already does this if lockdep | ||
60 | * is enabled. | ||
61 | */ | ||
62 | void exit_rcu(void) | ||
63 | { | ||
64 | struct task_struct *t = current; | ||
65 | |||
66 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
67 | return; | ||
68 | t->rcu_read_lock_nesting = 1; | ||
69 | barrier(); | ||
70 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
71 | __rcu_read_unlock(); | ||
72 | } | ||
73 | |||
74 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
75 | |||
76 | void exit_rcu(void) | ||
77 | { | ||
78 | } | ||
79 | |||
80 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
81 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 82 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | static struct lock_class_key rcu_lock_key; | 83 | static struct lock_class_key rcu_lock_key; |
56 | struct lockdep_map rcu_lock_map = | 84 | struct lockdep_map rcu_lock_map = |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb62..fc31a2d65100 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) | |||
851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | ||
855 | * Check for a task exiting while in a preemptible -RCU read-side | ||
856 | * critical section, clean up if so. No need to issue warnings, | ||
857 | * as debug_check_no_locks_held() already does this if lockdep | ||
858 | * is enabled. | ||
859 | */ | ||
860 | void exit_rcu(void) | ||
861 | { | ||
862 | struct task_struct *t = current; | ||
863 | |||
864 | if (t->rcu_read_lock_nesting == 0) | ||
865 | return; | ||
866 | t->rcu_read_lock_nesting = 1; | ||
867 | __rcu_read_unlock(); | ||
868 | } | ||
869 | |||
870 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 854 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
871 | 855 | ||
872 | #ifdef CONFIG_RCU_TRACE | 856 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6e..e66b34ab7555 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | 68 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ |
68 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | 69 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ |
69 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | 70 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ |
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); | |||
96 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 97 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
97 | module_param(fqs_stutter, int, 0444); | 98 | module_param(fqs_stutter, int, 0444); |
98 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 99 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
100 | module_param(n_barrier_cbs, int, 0444); | ||
101 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | ||
99 | module_param(onoff_interval, int, 0444); | 102 | module_param(onoff_interval, int, 0444); |
100 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 103 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); |
101 | module_param(onoff_holdoff, int, 0444); | 104 | module_param(onoff_holdoff, int, 0444); |
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; | |||
139 | static struct task_struct *onoff_task; | 142 | static struct task_struct *onoff_task; |
140 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 143 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
141 | static struct task_struct *stall_task; | 144 | static struct task_struct *stall_task; |
145 | static struct task_struct **barrier_cbs_tasks; | ||
146 | static struct task_struct *barrier_task; | ||
142 | 147 | ||
143 | #define RCU_TORTURE_PIPE_LEN 10 | 148 | #define RCU_TORTURE_PIPE_LEN 10 |
144 | 149 | ||
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
164 | static atomic_t n_rcu_torture_free; | 169 | static atomic_t n_rcu_torture_free; |
165 | static atomic_t n_rcu_torture_mberror; | 170 | static atomic_t n_rcu_torture_mberror; |
166 | static atomic_t n_rcu_torture_error; | 171 | static atomic_t n_rcu_torture_error; |
172 | static long n_rcu_torture_barrier_error; | ||
167 | static long n_rcu_torture_boost_ktrerror; | 173 | static long n_rcu_torture_boost_ktrerror; |
168 | static long n_rcu_torture_boost_rterror; | 174 | static long n_rcu_torture_boost_rterror; |
169 | static long n_rcu_torture_boost_failure; | 175 | static long n_rcu_torture_boost_failure; |
@@ -173,6 +179,8 @@ static long n_offline_attempts; | |||
173 | static long n_offline_successes; | 179 | static long n_offline_successes; |
174 | static long n_online_attempts; | 180 | static long n_online_attempts; |
175 | static long n_online_successes; | 181 | static long n_online_successes; |
182 | static long n_barrier_attempts; | ||
183 | static long n_barrier_successes; | ||
176 | static struct list_head rcu_torture_removed; | 184 | static struct list_head rcu_torture_removed; |
177 | static cpumask_var_t shuffle_tmp_mask; | 185 | static cpumask_var_t shuffle_tmp_mask; |
178 | 186 | ||
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ | |||
197 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 205 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
198 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
199 | /* and boost task create/destroy. */ | 207 | /* and boost task create/destroy. */ |
208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | ||
209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | ||
210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | ||
211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | ||
200 | 212 | ||
201 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 213 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
202 | 214 | ||
@@ -327,6 +339,7 @@ struct rcu_torture_ops { | |||
327 | int (*completed)(void); | 339 | int (*completed)(void); |
328 | void (*deferred_free)(struct rcu_torture *p); | 340 | void (*deferred_free)(struct rcu_torture *p); |
329 | void (*sync)(void); | 341 | void (*sync)(void); |
342 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
330 | void (*cb_barrier)(void); | 343 | void (*cb_barrier)(void); |
331 | void (*fqs)(void); | 344 | void (*fqs)(void); |
332 | int (*stats)(char *page); | 345 | int (*stats)(char *page); |
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
417 | .completed = rcu_torture_completed, | 430 | .completed = rcu_torture_completed, |
418 | .deferred_free = rcu_torture_deferred_free, | 431 | .deferred_free = rcu_torture_deferred_free, |
419 | .sync = synchronize_rcu, | 432 | .sync = synchronize_rcu, |
433 | .call = call_rcu, | ||
420 | .cb_barrier = rcu_barrier, | 434 | .cb_barrier = rcu_barrier, |
421 | .fqs = rcu_force_quiescent_state, | 435 | .fqs = rcu_force_quiescent_state, |
422 | .stats = NULL, | 436 | .stats = NULL, |
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
460 | .completed = rcu_torture_completed, | 474 | .completed = rcu_torture_completed, |
461 | .deferred_free = rcu_sync_torture_deferred_free, | 475 | .deferred_free = rcu_sync_torture_deferred_free, |
462 | .sync = synchronize_rcu, | 476 | .sync = synchronize_rcu, |
477 | .call = NULL, | ||
463 | .cb_barrier = NULL, | 478 | .cb_barrier = NULL, |
464 | .fqs = rcu_force_quiescent_state, | 479 | .fqs = rcu_force_quiescent_state, |
465 | .stats = NULL, | 480 | .stats = NULL, |
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
477 | .completed = rcu_no_completed, | 492 | .completed = rcu_no_completed, |
478 | .deferred_free = rcu_sync_torture_deferred_free, | 493 | .deferred_free = rcu_sync_torture_deferred_free, |
479 | .sync = synchronize_rcu_expedited, | 494 | .sync = synchronize_rcu_expedited, |
495 | .call = NULL, | ||
480 | .cb_barrier = NULL, | 496 | .cb_barrier = NULL, |
481 | .fqs = rcu_force_quiescent_state, | 497 | .fqs = rcu_force_quiescent_state, |
482 | .stats = NULL, | 498 | .stats = NULL, |
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
519 | .completed = rcu_bh_torture_completed, | 535 | .completed = rcu_bh_torture_completed, |
520 | .deferred_free = rcu_bh_torture_deferred_free, | 536 | .deferred_free = rcu_bh_torture_deferred_free, |
521 | .sync = synchronize_rcu_bh, | 537 | .sync = synchronize_rcu_bh, |
538 | .call = call_rcu_bh, | ||
522 | .cb_barrier = rcu_barrier_bh, | 539 | .cb_barrier = rcu_barrier_bh, |
523 | .fqs = rcu_bh_force_quiescent_state, | 540 | .fqs = rcu_bh_force_quiescent_state, |
524 | .stats = NULL, | 541 | .stats = NULL, |
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
535 | .completed = rcu_bh_torture_completed, | 552 | .completed = rcu_bh_torture_completed, |
536 | .deferred_free = rcu_sync_torture_deferred_free, | 553 | .deferred_free = rcu_sync_torture_deferred_free, |
537 | .sync = synchronize_rcu_bh, | 554 | .sync = synchronize_rcu_bh, |
555 | .call = NULL, | ||
538 | .cb_barrier = NULL, | 556 | .cb_barrier = NULL, |
539 | .fqs = rcu_bh_force_quiescent_state, | 557 | .fqs = rcu_bh_force_quiescent_state, |
540 | .stats = NULL, | 558 | .stats = NULL, |
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
551 | .completed = rcu_bh_torture_completed, | 569 | .completed = rcu_bh_torture_completed, |
552 | .deferred_free = rcu_sync_torture_deferred_free, | 570 | .deferred_free = rcu_sync_torture_deferred_free, |
553 | .sync = synchronize_rcu_bh_expedited, | 571 | .sync = synchronize_rcu_bh_expedited, |
572 | .call = NULL, | ||
554 | .cb_barrier = NULL, | 573 | .cb_barrier = NULL, |
555 | .fqs = rcu_bh_force_quiescent_state, | 574 | .fqs = rcu_bh_force_quiescent_state, |
556 | .stats = NULL, | 575 | .stats = NULL, |
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void) | |||
606 | return srcu_batches_completed(&srcu_ctl); | 625 | return srcu_batches_completed(&srcu_ctl); |
607 | } | 626 | } |
608 | 627 | ||
628 | static void srcu_torture_deferred_free(struct rcu_torture *rp) | ||
629 | { | ||
630 | call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); | ||
631 | } | ||
632 | |||
609 | static void srcu_torture_synchronize(void) | 633 | static void srcu_torture_synchronize(void) |
610 | { | 634 | { |
611 | synchronize_srcu(&srcu_ctl); | 635 | synchronize_srcu(&srcu_ctl); |
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page) | |||
620 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 644 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", |
621 | torture_type, TORTURE_FLAG, idx); | 645 | torture_type, TORTURE_FLAG, idx); |
622 | for_each_possible_cpu(cpu) { | 646 | for_each_possible_cpu(cpu) { |
623 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, | 647 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, |
624 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 648 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
625 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 649 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
626 | } | 650 | } |
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = { | |||
635 | .read_delay = srcu_read_delay, | 659 | .read_delay = srcu_read_delay, |
636 | .readunlock = srcu_torture_read_unlock, | 660 | .readunlock = srcu_torture_read_unlock, |
637 | .completed = srcu_torture_completed, | 661 | .completed = srcu_torture_completed, |
638 | .deferred_free = rcu_sync_torture_deferred_free, | 662 | .deferred_free = srcu_torture_deferred_free, |
639 | .sync = srcu_torture_synchronize, | 663 | .sync = srcu_torture_synchronize, |
664 | .call = NULL, | ||
640 | .cb_barrier = NULL, | 665 | .cb_barrier = NULL, |
641 | .stats = srcu_torture_stats, | 666 | .stats = srcu_torture_stats, |
642 | .name = "srcu" | 667 | .name = "srcu" |
643 | }; | 668 | }; |
644 | 669 | ||
670 | static struct rcu_torture_ops srcu_sync_ops = { | ||
671 | .init = srcu_torture_init, | ||
672 | .cleanup = srcu_torture_cleanup, | ||
673 | .readlock = srcu_torture_read_lock, | ||
674 | .read_delay = srcu_read_delay, | ||
675 | .readunlock = srcu_torture_read_unlock, | ||
676 | .completed = srcu_torture_completed, | ||
677 | .deferred_free = rcu_sync_torture_deferred_free, | ||
678 | .sync = srcu_torture_synchronize, | ||
679 | .call = NULL, | ||
680 | .cb_barrier = NULL, | ||
681 | .stats = srcu_torture_stats, | ||
682 | .name = "srcu_sync" | ||
683 | }; | ||
684 | |||
645 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | 685 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) |
646 | { | 686 | { |
647 | return srcu_read_lock_raw(&srcu_ctl); | 687 | return srcu_read_lock_raw(&srcu_ctl); |
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
659 | .read_delay = srcu_read_delay, | 699 | .read_delay = srcu_read_delay, |
660 | .readunlock = srcu_torture_read_unlock_raw, | 700 | .readunlock = srcu_torture_read_unlock_raw, |
661 | .completed = srcu_torture_completed, | 701 | .completed = srcu_torture_completed, |
662 | .deferred_free = rcu_sync_torture_deferred_free, | 702 | .deferred_free = srcu_torture_deferred_free, |
663 | .sync = srcu_torture_synchronize, | 703 | .sync = srcu_torture_synchronize, |
704 | .call = NULL, | ||
664 | .cb_barrier = NULL, | 705 | .cb_barrier = NULL, |
665 | .stats = srcu_torture_stats, | 706 | .stats = srcu_torture_stats, |
666 | .name = "srcu_raw" | 707 | .name = "srcu_raw" |
667 | }; | 708 | }; |
668 | 709 | ||
710 | static struct rcu_torture_ops srcu_raw_sync_ops = { | ||
711 | .init = srcu_torture_init, | ||
712 | .cleanup = srcu_torture_cleanup, | ||
713 | .readlock = srcu_torture_read_lock_raw, | ||
714 | .read_delay = srcu_read_delay, | ||
715 | .readunlock = srcu_torture_read_unlock_raw, | ||
716 | .completed = srcu_torture_completed, | ||
717 | .deferred_free = rcu_sync_torture_deferred_free, | ||
718 | .sync = srcu_torture_synchronize, | ||
719 | .call = NULL, | ||
720 | .cb_barrier = NULL, | ||
721 | .stats = srcu_torture_stats, | ||
722 | .name = "srcu_raw_sync" | ||
723 | }; | ||
724 | |||
669 | static void srcu_torture_synchronize_expedited(void) | 725 | static void srcu_torture_synchronize_expedited(void) |
670 | { | 726 | { |
671 | synchronize_srcu_expedited(&srcu_ctl); | 727 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { | |||
680 | .completed = srcu_torture_completed, | 736 | .completed = srcu_torture_completed, |
681 | .deferred_free = rcu_sync_torture_deferred_free, | 737 | .deferred_free = rcu_sync_torture_deferred_free, |
682 | .sync = srcu_torture_synchronize_expedited, | 738 | .sync = srcu_torture_synchronize_expedited, |
739 | .call = NULL, | ||
683 | .cb_barrier = NULL, | 740 | .cb_barrier = NULL, |
684 | .stats = srcu_torture_stats, | 741 | .stats = srcu_torture_stats, |
685 | .name = "srcu_expedited" | 742 | .name = "srcu_expedited" |
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page) | |||
1129 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1130 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1131 | "rtbf: %ld rtb: %ld nt: %ld " | 1188 | "rtbf: %ld rtb: %ld nt: %ld " |
1132 | "onoff: %ld/%ld:%ld/%ld", | 1189 | "onoff: %ld/%ld:%ld/%ld " |
1190 | "barrier: %ld/%ld:%ld", | ||
1133 | rcu_torture_current, | 1191 | rcu_torture_current, |
1134 | rcu_torture_current_version, | 1192 | rcu_torture_current_version, |
1135 | list_empty(&rcu_torture_freelist), | 1193 | list_empty(&rcu_torture_freelist), |
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page) | |||
1145 | n_online_successes, | 1203 | n_online_successes, |
1146 | n_online_attempts, | 1204 | n_online_attempts, |
1147 | n_offline_successes, | 1205 | n_offline_successes, |
1148 | n_offline_attempts); | 1206 | n_offline_attempts, |
1207 | n_barrier_successes, | ||
1208 | n_barrier_attempts, | ||
1209 | n_rcu_torture_barrier_error); | ||
1210 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1149 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1211 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1212 | n_rcu_torture_barrier_error != 0 || | ||
1150 | n_rcu_torture_boost_ktrerror != 0 || | 1213 | n_rcu_torture_boost_ktrerror != 0 || |
1151 | n_rcu_torture_boost_rterror != 0 || | 1214 | n_rcu_torture_boost_rterror != 0 || |
1152 | n_rcu_torture_boost_failure != 0) | 1215 | n_rcu_torture_boost_failure != 0 || |
1153 | cnt += sprintf(&page[cnt], " !!!"); | 1216 | i > 1) { |
1154 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1155 | if (i > 1) { | ||
1156 | cnt += sprintf(&page[cnt], "!!! "); | 1217 | cnt += sprintf(&page[cnt], "!!! "); |
1157 | atomic_inc(&n_rcu_torture_error); | 1218 | atomic_inc(&n_rcu_torture_error); |
1158 | WARN_ON_ONCE(1); | 1219 | WARN_ON_ONCE(1); |
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu) | |||
1337 | 1398 | ||
1338 | /* This must be outside of the mutex, otherwise deadlock! */ | 1399 | /* This must be outside of the mutex, otherwise deadlock! */ |
1339 | kthread_stop(t); | 1400 | kthread_stop(t); |
1401 | boost_tasks[cpu] = NULL; | ||
1340 | } | 1402 | } |
1341 | 1403 | ||
1342 | static int rcutorture_booster_init(int cpu) | 1404 | static int rcutorture_booster_init(int cpu) |
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void) | |||
1484 | return; | 1546 | return; |
1485 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | 1547 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); |
1486 | kthread_stop(onoff_task); | 1548 | kthread_stop(onoff_task); |
1549 | onoff_task = NULL; | ||
1487 | } | 1550 | } |
1488 | 1551 | ||
1489 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1552 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1490 | 1553 | ||
1491 | static void | 1554 | static int |
1492 | rcu_torture_onoff_init(void) | 1555 | rcu_torture_onoff_init(void) |
1493 | { | 1556 | { |
1557 | return 0; | ||
1494 | } | 1558 | } |
1495 | 1559 | ||
1496 | static void rcu_torture_onoff_cleanup(void) | 1560 | static void rcu_torture_onoff_cleanup(void) |
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void) | |||
1554 | return; | 1618 | return; |
1555 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | 1619 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); |
1556 | kthread_stop(stall_task); | 1620 | kthread_stop(stall_task); |
1621 | stall_task = NULL; | ||
1622 | } | ||
1623 | |||
1624 | /* Callback function for RCU barrier testing. */ | ||
1625 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | ||
1626 | { | ||
1627 | atomic_inc(&barrier_cbs_invoked); | ||
1628 | } | ||
1629 | |||
1630 | /* kthread function to register callbacks used to test RCU barriers. */ | ||
1631 | static int rcu_torture_barrier_cbs(void *arg) | ||
1632 | { | ||
1633 | long myid = (long)arg; | ||
1634 | struct rcu_head rcu; | ||
1635 | |||
1636 | init_rcu_head_on_stack(&rcu); | ||
1637 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); | ||
1638 | set_user_nice(current, 19); | ||
1639 | do { | ||
1640 | wait_event(barrier_cbs_wq[myid], | ||
1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | ||
1642 | kthread_should_stop() || | ||
1643 | fullstop != FULLSTOP_DONTSTOP); | ||
1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1645 | break; | ||
1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | ||
1647 | if (atomic_dec_and_test(&barrier_cbs_count)) | ||
1648 | wake_up(&barrier_wq); | ||
1649 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1650 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); | ||
1651 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1652 | while (!kthread_should_stop()) | ||
1653 | schedule_timeout_interruptible(1); | ||
1654 | cur_ops->cb_barrier(); | ||
1655 | destroy_rcu_head_on_stack(&rcu); | ||
1656 | return 0; | ||
1657 | } | ||
1658 | |||
1659 | /* kthread function to drive and coordinate RCU barrier testing. */ | ||
1660 | static int rcu_torture_barrier(void *arg) | ||
1661 | { | ||
1662 | int i; | ||
1663 | |||
1664 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); | ||
1665 | do { | ||
1666 | atomic_set(&barrier_cbs_invoked, 0); | ||
1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | ||
1668 | /* wake_up() path contains the required barriers. */ | ||
1669 | for (i = 0; i < n_barrier_cbs; i++) | ||
1670 | wake_up(&barrier_cbs_wq[i]); | ||
1671 | wait_event(barrier_wq, | ||
1672 | atomic_read(&barrier_cbs_count) == 0 || | ||
1673 | kthread_should_stop() || | ||
1674 | fullstop != FULLSTOP_DONTSTOP); | ||
1675 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1676 | break; | ||
1677 | n_barrier_attempts++; | ||
1678 | cur_ops->cb_barrier(); | ||
1679 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | ||
1680 | n_rcu_torture_barrier_error++; | ||
1681 | WARN_ON_ONCE(1); | ||
1682 | } | ||
1683 | n_barrier_successes++; | ||
1684 | schedule_timeout_interruptible(HZ / 10); | ||
1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | ||
1687 | rcutorture_shutdown_absorb("rcu_torture_barrier"); | ||
1688 | while (!kthread_should_stop()) | ||
1689 | schedule_timeout_interruptible(1); | ||
1690 | return 0; | ||
1691 | } | ||
1692 | |||
1693 | /* Initialize RCU barrier testing. */ | ||
1694 | static int rcu_torture_barrier_init(void) | ||
1695 | { | ||
1696 | int i; | ||
1697 | int ret; | ||
1698 | |||
1699 | if (n_barrier_cbs == 0) | ||
1700 | return 0; | ||
1701 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | ||
1702 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1703 | " Call or barrier ops missing for %s,\n", | ||
1704 | torture_type, cur_ops->name); | ||
1705 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1706 | " RCU barrier testing omitted from run.\n", | ||
1707 | torture_type); | ||
1708 | return 0; | ||
1709 | } | ||
1710 | atomic_set(&barrier_cbs_count, 0); | ||
1711 | atomic_set(&barrier_cbs_invoked, 0); | ||
1712 | barrier_cbs_tasks = | ||
1713 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | ||
1714 | GFP_KERNEL); | ||
1715 | barrier_cbs_wq = | ||
1716 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | ||
1717 | GFP_KERNEL); | ||
1718 | if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) | ||
1719 | return -ENOMEM; | ||
1720 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1721 | init_waitqueue_head(&barrier_cbs_wq[i]); | ||
1722 | barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, | ||
1723 | (void *)(long)i, | ||
1724 | "rcu_torture_barrier_cbs"); | ||
1725 | if (IS_ERR(barrier_cbs_tasks[i])) { | ||
1726 | ret = PTR_ERR(barrier_cbs_tasks[i]); | ||
1727 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); | ||
1728 | barrier_cbs_tasks[i] = NULL; | ||
1729 | return ret; | ||
1730 | } | ||
1731 | } | ||
1732 | barrier_task = kthread_run(rcu_torture_barrier, NULL, | ||
1733 | "rcu_torture_barrier"); | ||
1734 | if (IS_ERR(barrier_task)) { | ||
1735 | ret = PTR_ERR(barrier_task); | ||
1736 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); | ||
1737 | barrier_task = NULL; | ||
1738 | } | ||
1739 | return 0; | ||
1740 | } | ||
1741 | |||
1742 | /* Clean up after RCU barrier testing. */ | ||
1743 | static void rcu_torture_barrier_cleanup(void) | ||
1744 | { | ||
1745 | int i; | ||
1746 | |||
1747 | if (barrier_task != NULL) { | ||
1748 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); | ||
1749 | kthread_stop(barrier_task); | ||
1750 | barrier_task = NULL; | ||
1751 | } | ||
1752 | if (barrier_cbs_tasks != NULL) { | ||
1753 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1754 | if (barrier_cbs_tasks[i] != NULL) { | ||
1755 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); | ||
1756 | kthread_stop(barrier_cbs_tasks[i]); | ||
1757 | barrier_cbs_tasks[i] = NULL; | ||
1758 | } | ||
1759 | } | ||
1760 | kfree(barrier_cbs_tasks); | ||
1761 | barrier_cbs_tasks = NULL; | ||
1762 | } | ||
1763 | if (barrier_cbs_wq != NULL) { | ||
1764 | kfree(barrier_cbs_wq); | ||
1765 | barrier_cbs_wq = NULL; | ||
1766 | } | ||
1557 | } | 1767 | } |
1558 | 1768 | ||
1559 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1769 | static int rcutorture_cpu_notify(struct notifier_block *self, |
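The barrier-test kthreads added above coordinate through a countdown and two wait queues: rcu_torture_barrier() arms barrier_cbs_count, wakes every rcu_torture_barrier_cbs() thread, and sleeps until the last poster wakes it back up. The stand-alone sketch below models one round of that handshake with POSIX threads (an assumption; the kernel code uses kthreads with wait_event()/wake_up()); all names are illustrative.

    #include <pthread.h>
    #include <stdio.h>

    #define NWORKERS 3

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t go = PTHREAD_COND_INITIALIZER;    /* driver -> workers */
    static pthread_cond_t ready = PTHREAD_COND_INITIALIZER; /* workers -> driver */
    static int cbs_count;   /* workers that still have to post this round */
    static int round_no;    /* set by the driver to start the round */

    static void *worker(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&lock);
            while (round_no == 0)           /* wait_event() analogue */
                    pthread_cond_wait(&go, &lock);
            /* ...post one callback here (cur_ops->call() in the real code)... */
            if (--cbs_count == 0)           /* last poster wakes the driver */
                    pthread_cond_signal(&ready);
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    int main(void)
    {
            pthread_t tid[NWORKERS];
            int i;

            for (i = 0; i < NWORKERS; i++)
                    pthread_create(&tid[i], NULL, worker, NULL);

            pthread_mutex_lock(&lock);
            cbs_count = NWORKERS;
            round_no = 1;                   /* arm the round */
            pthread_cond_broadcast(&go);    /* wake_up() analogue */
            while (cbs_count != 0)
                    pthread_cond_wait(&ready, &lock);
            pthread_mutex_unlock(&lock);

            /* ...here the driver would invoke cur_ops->cb_barrier() and check
             * that every posted callback has actually run... */
            puts("all workers have posted their callbacks");
            for (i = 0; i < NWORKERS; i++)
                    pthread_join(tid[i], NULL);
            return 0;
    }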
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void) | |||
1598 | fullstop = FULLSTOP_RMMOD; | 1808 | fullstop = FULLSTOP_RMMOD; |
1599 | mutex_unlock(&fullstop_mutex); | 1809 | mutex_unlock(&fullstop_mutex); |
1600 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1810 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1811 | rcu_torture_barrier_cleanup(); | ||
1601 | rcu_torture_stall_cleanup(); | 1812 | rcu_torture_stall_cleanup(); |
1602 | if (stutter_task) { | 1813 | if (stutter_task) { |
1603 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1814 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void) | |||
1665 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | 1876 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); |
1666 | kthread_stop(shutdown_task); | 1877 | kthread_stop(shutdown_task); |
1667 | } | 1878 | } |
1879 | shutdown_task = NULL; | ||
1668 | rcu_torture_onoff_cleanup(); | 1880 | rcu_torture_onoff_cleanup(); |
1669 | 1881 | ||
1670 | /* Wait for all RCU callbacks to fire. */ | 1882 | /* Wait for all RCU callbacks to fire. */ |
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void) | |||
1676 | 1888 | ||
1677 | if (cur_ops->cleanup) | 1889 | if (cur_ops->cleanup) |
1678 | cur_ops->cleanup(); | 1890 | cur_ops->cleanup(); |
1679 | if (atomic_read(&n_rcu_torture_error)) | 1891 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1680 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1892 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1681 | else if (n_online_successes != n_online_attempts || | 1893 | else if (n_online_successes != n_online_attempts || |
1682 | n_offline_successes != n_offline_attempts) | 1894 | n_offline_successes != n_offline_attempts) |
@@ -1692,10 +1904,12 @@ rcu_torture_init(void) | |||
1692 | int i; | 1904 | int i; |
1693 | int cpu; | 1905 | int cpu; |
1694 | int firsterr = 0; | 1906 | int firsterr = 0; |
1907 | int retval; | ||
1695 | static struct rcu_torture_ops *torture_ops[] = | 1908 | static struct rcu_torture_ops *torture_ops[] = |
1696 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1697 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1698 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, | 1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, |
1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | ||
1699 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1700 | 1914 | ||
1701 | mutex_lock(&fullstop_mutex); | 1915 | mutex_lock(&fullstop_mutex); |
@@ -1749,6 +1963,7 @@ rcu_torture_init(void) | |||
1749 | atomic_set(&n_rcu_torture_free, 0); | 1963 | atomic_set(&n_rcu_torture_free, 0); |
1750 | atomic_set(&n_rcu_torture_mberror, 0); | 1964 | atomic_set(&n_rcu_torture_mberror, 0); |
1751 | atomic_set(&n_rcu_torture_error, 0); | 1965 | atomic_set(&n_rcu_torture_error, 0); |
1966 | n_rcu_torture_barrier_error = 0; | ||
1752 | n_rcu_torture_boost_ktrerror = 0; | 1967 | n_rcu_torture_boost_ktrerror = 0; |
1753 | n_rcu_torture_boost_rterror = 0; | 1968 | n_rcu_torture_boost_rterror = 0; |
1754 | n_rcu_torture_boost_failure = 0; | 1969 | n_rcu_torture_boost_failure = 0; |
@@ -1872,7 +2087,6 @@ rcu_torture_init(void) | |||
1872 | test_boost_duration = 2; | 2087 | test_boost_duration = 2; |
1873 | if ((test_boost == 1 && cur_ops->can_boost) || | 2088 | if ((test_boost == 1 && cur_ops->can_boost) || |
1874 | test_boost == 2) { | 2089 | test_boost == 2) { |
1875 | int retval; | ||
1876 | 2090 | ||
1877 | boost_starttime = jiffies + test_boost_interval * HZ; | 2091 | boost_starttime = jiffies + test_boost_interval * HZ; |
1878 | register_cpu_notifier(&rcutorture_cpu_nb); | 2092 | register_cpu_notifier(&rcutorture_cpu_nb); |
@@ -1897,9 +2111,22 @@ rcu_torture_init(void) | |||
1897 | goto unwind; | 2111 | goto unwind; |
1898 | } | 2112 | } |
1899 | } | 2113 | } |
1900 | rcu_torture_onoff_init(); | 2114 | i = rcu_torture_onoff_init(); |
2115 | if (i != 0) { | ||
2116 | firsterr = i; | ||
2117 | goto unwind; | ||
2118 | } | ||
1901 | register_reboot_notifier(&rcutorture_shutdown_nb); | 2119 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1902 | rcu_torture_stall_init(); | 2120 | i = rcu_torture_stall_init(); |
2121 | if (i != 0) { | ||
2122 | firsterr = i; | ||
2123 | goto unwind; | ||
2124 | } | ||
2125 | retval = rcu_torture_barrier_init(); | ||
2126 | if (retval != 0) { | ||
2127 | firsterr = retval; | ||
2128 | goto unwind; | ||
2129 | } | ||
1903 | rcutorture_record_test_transition(); | 2130 | rcutorture_record_test_transition(); |
1904 | mutex_unlock(&fullstop_mutex); | 2131 | mutex_unlock(&fullstop_mutex); |
1905 | return 0; | 2132 | return 0; |
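One smaller change running through the rcutorture.c hunks above: the *_init() helpers now return 0 or an error value, and rcu_torture_init() records the first failure in firsterr and unwinds instead of pressing on. A trivial user-space sketch of that propagate-first-error-and-unwind shape, with made-up step names:

    #include <errno.h>
    #include <stdio.h>

    static int init_step_a(void) { return 0; }
    static int init_step_b(void) { return -ENOMEM; }  /* pretend this step fails */

    static void cleanup_all(void)
    {
            puts("unwinding partially completed initialization");
    }

    static int module_init_example(void)
    {
            int firsterr = 0;
            int ret;

            ret = init_step_a();
            if (ret != 0) {
                    firsterr = ret;
                    goto unwind;
            }
            ret = init_step_b();
            if (ret != 0) {
                    firsterr = ret;
                    goto unwind;
            }
            return 0;
    unwind:
            cleanup_all();
            return firsterr;
    }

    int main(void)
    {
            printf("init returned %d\n", module_init_example());
            return 0;
    }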
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0c5baf1ab18..0da7b88d92d0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
75 | .gpnum = -300, \ | 75 | .gpnum = -300, \ |
76 | .completed = -300, \ | 76 | .completed = -300, \ |
77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | ||
79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | ||
78 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
79 | .n_force_qs = 0, \ | 81 | .n_force_qs = 0, \ |
80 | .n_force_qs_ngp = 0, \ | 82 | .n_force_qs_ngp = 0, \ |
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
145 | unsigned long rcutorture_testseq; | 147 | unsigned long rcutorture_testseq; |
146 | unsigned long rcutorture_vernum; | 148 | unsigned long rcutorture_vernum; |
147 | 149 | ||
150 | /* State information for rcu_barrier() and friends. */ | ||
151 | |||
152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
153 | static atomic_t rcu_barrier_cpu_count; | ||
154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
155 | static struct completion rcu_barrier_completion; | ||
156 | |||
148 | /* | 157 | /* |
149 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
150 | * permit this function to be invoked without holding the root rcu_node | 159 | * permit this function to be invoked without holding the root rcu_node |
@@ -192,7 +201,6 @@ void rcu_note_context_switch(int cpu) | |||
192 | { | 201 | { |
193 | trace_rcu_utilization("Start context switch"); | 202 | trace_rcu_utilization("Start context switch"); |
194 | rcu_sched_qs(cpu); | 203 | rcu_sched_qs(cpu); |
195 | rcu_preempt_note_context_switch(cpu); | ||
196 | trace_rcu_utilization("End context switch"); | 204 | trace_rcu_utilization("End context switch"); |
197 | } | 205 | } |
198 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
@@ -1311,95 +1319,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1311 | #ifdef CONFIG_HOTPLUG_CPU | 1319 | #ifdef CONFIG_HOTPLUG_CPU |
1312 | 1320 | ||
1313 | /* | 1321 | /* |
1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1322 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1315 | * Also record a quiescent state for this CPU for the current grace period. | 1323 | * specified CPU must be offline, and the caller must hold the |
1316 | * Synchronization and interrupt disabling are not required because | 1324 | * ->onofflock. |
1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
1318 | * operations that might block must be done later from the CPU_DEAD | ||
1319 | * notifier. | ||
1320 | * | ||
1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
1323 | * destination from the bits set in that mask. | ||
1324 | */ | 1325 | */ |
1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1326 | static void |
1327 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
1328 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
1326 | { | 1329 | { |
1327 | int i; | 1330 | int i; |
1328 | unsigned long mask; | ||
1329 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
1333 | 1331 | ||
1334 | /* First, adjust the counts. */ | 1332 | /* |
1333 | * Orphan the callbacks. First adjust the counts. This is safe | ||
1334 | * because ->onofflock excludes _rcu_barrier()'s adoption of | ||
1335 | * the callbacks, thus no memory barrier is required. | ||
1336 | */ | ||
1335 | if (rdp->nxtlist != NULL) { | 1337 | if (rdp->nxtlist != NULL) { |
1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | 1338 | rsp->qlen_lazy += rdp->qlen_lazy; |
1337 | receive_rdp->qlen += rdp->qlen; | 1339 | rsp->qlen += rdp->qlen; |
1340 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1338 | rdp->qlen_lazy = 0; | 1341 | rdp->qlen_lazy = 0; |
1339 | rdp->qlen = 0; | 1342 | rdp->qlen = 0; |
1340 | } | 1343 | } |
1341 | 1344 | ||
1342 | /* | 1345 | /* |
1343 | * Next, move ready-to-invoke callbacks to be invoked on some | 1346 | * Next, move those callbacks still needing a grace period to |
1344 | * other CPU. These will not be required to pass through another | 1347 | * the orphanage, where some other CPU will pick them up. |
1345 | * grace period: They are done, regardless of CPU. | 1348 | * Some of the callbacks might have gone partway through a grace |
1349 | * period, but that is too bad. They get to start over because we | ||
1350 | * cannot assume that grace periods are synchronized across CPUs. | ||
1351 | * We don't bother updating the ->nxttail[] array yet, instead | ||
1352 | * we just reset the whole thing later on. | ||
1346 | */ | 1353 | */ |
1347 | if (rdp->nxtlist != NULL && | 1354 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { |
1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | 1355 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; |
1349 | struct rcu_head *oldhead; | 1356 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; |
1350 | struct rcu_head **oldtail; | 1357 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1351 | struct rcu_head **newtail; | ||
1352 | |||
1353 | oldhead = rdp->nxtlist; | ||
1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
1361 | receive_rdp->nxttail[i] = newtail; | ||
1362 | if (rdp->nxttail[i] == newtail) | ||
1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1364 | } | ||
1365 | } | 1358 | } |
1366 | 1359 | ||
1367 | /* | 1360 | /* |
1368 | * Finally, put the rest of the callbacks at the end of the list. | 1361 | * Then move the ready-to-invoke callbacks to the orphanage, |
1369 | * The ones that made it partway through get to start over: We | 1362 | * where some other CPU will pick them up. These will not be |
1370 | * cannot assume that grace periods are synchronized across CPUs. | 1363 | * required to pass through another grace period: They are done. |
1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but | ||
1372 | * this does not seem compelling. Not yet, anyway.) | ||
1373 | */ | 1364 | */ |
1374 | if (rdp->nxtlist != NULL) { | 1365 | if (rdp->nxtlist != NULL) { |
1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1366 | *rsp->orphan_donetail = rdp->nxtlist; |
1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | 1367 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1380 | |||
1381 | rdp->nxtlist = NULL; | ||
1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1384 | } | 1368 | } |
1385 | 1369 | ||
1370 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
1371 | rdp->nxtlist = NULL; | ||
1372 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1373 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
1378 | * orphanage. The caller must hold the ->onofflock. | ||
1379 | */ | ||
1380 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1381 | { | ||
1382 | int i; | ||
1383 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
1384 | |||
1386 | /* | 1385 | /* |
1387 | * Record a quiescent state for the dying CPU. This is safe | 1386 | * If there is an rcu_barrier() operation in progress, then |
1388 | * only because we have already cleared out the callbacks. | 1387 | * only the task doing that operation is permitted to adopt |
1389 | * (Otherwise, the RCU core might try to schedule the invocation | 1388 | * callbacks. To do otherwise breaks rcu_barrier() and friends |
1390 | * of callbacks on this now-offline CPU, which would be bad.) | 1389 | * by causing them to fail to wait for the callbacks in the |
1390 | * orphanage. | ||
1391 | */ | 1391 | */ |
1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1392 | if (rsp->rcu_barrier_in_progress && |
1393 | rsp->rcu_barrier_in_progress != current) | ||
1394 | return; | ||
1395 | |||
1396 | /* Do the accounting first. */ | ||
1397 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
1398 | rdp->qlen += rsp->qlen; | ||
1399 | rdp->n_cbs_adopted += rsp->qlen; | ||
1400 | rsp->qlen_lazy = 0; | ||
1401 | rsp->qlen = 0; | ||
1402 | |||
1403 | /* | ||
1404 | * We do not need a memory barrier here because the only way we | ||
1405 | * can get here if there is an rcu_barrier() in flight is if | ||
1406 | * we are the task doing the rcu_barrier(). | ||
1407 | */ | ||
1408 | |||
1409 | /* First adopt the ready-to-invoke callbacks. */ | ||
1410 | if (rsp->orphan_donelist != NULL) { | ||
1411 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1412 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
1413 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
1414 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
1415 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
1416 | rsp->orphan_donelist = NULL; | ||
1417 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1418 | } | ||
1419 | |||
1420 | /* And then adopt the callbacks that still need a grace period. */ | ||
1421 | if (rsp->orphan_nxtlist != NULL) { | ||
1422 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
1423 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
1424 | rsp->orphan_nxtlist = NULL; | ||
1425 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1426 | } | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * Trace the fact that this CPU is going offline. | ||
1431 | */ | ||
1432 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
1433 | { | ||
1434 | RCU_TRACE(unsigned long mask); | ||
1435 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
1436 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
1437 | |||
1438 | RCU_TRACE(mask = rdp->grpmask); | ||
1393 | trace_rcu_grace_period(rsp->name, | 1439 | trace_rcu_grace_period(rsp->name, |
1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1440 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1395 | "cpuofl"); | 1441 | "cpuofl"); |
1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
1398 | } | 1442 | } |
1399 | 1443 | ||
1400 | /* | 1444 | /* |
1401 | * The CPU has been completely removed, and some other CPU is reporting | 1445 | * The CPU has been completely removed, and some other CPU is reporting |
1402 | * this fact from process context. Do the remainder of the cleanup. | 1446 | * this fact from process context. Do the remainder of the cleanup, |
1447 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
1448 | * adopting them, if there is no _rcu_barrier() instance running. | ||
1403 | * There can only be one CPU hotplug operation at a time, so no other | 1449 | * There can only be one CPU hotplug operation at a time, so no other |
1404 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1450 | * CPU can be attempting to update rcu_cpu_kthread_task. |
1405 | */ | 1451 | */ |
@@ -1409,17 +1455,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1409 | unsigned long mask; | 1455 | unsigned long mask; |
1410 | int need_report = 0; | 1456 | int need_report = 0; |
1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1457 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ | 1458 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
1413 | 1459 | ||
1414 | /* Adjust any no-longer-needed kthreads. */ | 1460 | /* Adjust any no-longer-needed kthreads. */ |
1415 | rcu_stop_cpu_kthread(cpu); | 1461 | rcu_stop_cpu_kthread(cpu); |
1416 | rcu_node_kthread_setaffinity(rnp, -1); | 1462 | rcu_node_kthread_setaffinity(rnp, -1); |
1417 | 1463 | ||
1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | 1464 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1419 | 1465 | ||
1420 | /* Exclude any attempts to start a new grace period. */ | 1466 | /* Exclude any attempts to start a new grace period. */ |
1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1467 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1422 | 1468 | ||
1469 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
1470 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
1471 | rcu_adopt_orphan_cbs(rsp); | ||
1472 | |||
1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1473 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1474 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1425 | do { | 1475 | do { |
@@ -1456,6 +1506,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1456 | 1506 | ||
1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1507 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1458 | 1508 | ||
1509 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1510 | { | ||
1511 | } | ||
1512 | |||
1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1513 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1460 | { | 1514 | { |
1461 | } | 1515 | } |
@@ -1524,9 +1578,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1524 | rcu_is_callbacks_kthread()); | 1578 | rcu_is_callbacks_kthread()); |
1525 | 1579 | ||
1526 | /* Update count, and requeue any remaining callbacks. */ | 1580 | /* Update count, and requeue any remaining callbacks. */ |
1527 | rdp->qlen_lazy -= count_lazy; | ||
1528 | rdp->qlen -= count; | ||
1529 | rdp->n_cbs_invoked += count; | ||
1530 | if (list != NULL) { | 1581 | if (list != NULL) { |
1531 | *tail = rdp->nxtlist; | 1582 | *tail = rdp->nxtlist; |
1532 | rdp->nxtlist = list; | 1583 | rdp->nxtlist = list; |
@@ -1536,6 +1587,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1536 | else | 1587 | else |
1537 | break; | 1588 | break; |
1538 | } | 1589 | } |
1590 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
1591 | rdp->qlen_lazy -= count_lazy; | ||
1592 | rdp->qlen -= count; | ||
1593 | rdp->n_cbs_invoked += count; | ||
1539 | 1594 | ||
1540 | /* Reinstate batch limit if we have worked down the excess. */ | 1595 | /* Reinstate batch limit if we have worked down the excess. */ |
1541 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 1596 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) |
@@ -1823,11 +1878,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1823 | rdp = this_cpu_ptr(rsp->rda); | 1878 | rdp = this_cpu_ptr(rsp->rda); |
1824 | 1879 | ||
1825 | /* Add the callback to our list. */ | 1880 | /* Add the callback to our list. */ |
1826 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1827 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1828 | rdp->qlen++; | 1881 | rdp->qlen++; |
1829 | if (lazy) | 1882 | if (lazy) |
1830 | rdp->qlen_lazy++; | 1883 | rdp->qlen_lazy++; |
1884 | else | ||
1885 | rcu_idle_count_callbacks_posted(); | ||
1886 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
1887 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1888 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1831 | 1889 | ||
1832 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1890 | if (__is_kfree_rcu_offset((unsigned long)func)) |
1833 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1891 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
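The smp_mb() calls added to rcu_do_batch() and __call_rcu() above keep ->qlen conservative: the count is raised before a callback becomes visible on the list, and lowered only after the invoked callbacks have been unlinked, so the ->qlen sampling that _rcu_barrier() does later in this diff cannot under-count callbacks in flight. A rough user-space analogue of that ordering using C11 atomics and explicit fences (a sketch of the idea, not of the kernel's memory model):

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct cb {
            struct cb *next;
    };

    static _Atomic(struct cb *) nxtlist;   /* simplified one-slot "list" */
    static atomic_long qlen;

    static void post_callback(struct cb *c)
    {
            atomic_fetch_add_explicit(&qlen, 1, memory_order_relaxed);
            /* count update must be visible before the callback is enqueued */
            atomic_thread_fence(memory_order_release);
            atomic_store_explicit(&nxtlist, c, memory_order_relaxed);
    }

    static void invoke_callbacks(void)
    {
            struct cb *c = atomic_exchange_explicit(&nxtlist, NULL,
                                                    memory_order_relaxed);
            if (c == NULL)
                    return;
            /* ...callback bodies would run here... */
            /* list handling must complete before the count drops */
            atomic_thread_fence(memory_order_release);
            atomic_fetch_sub_explicit(&qlen, 1, memory_order_relaxed);
    }

    int main(void)
    {
            struct cb c = { NULL };

            post_callback(&c);
            invoke_callbacks();
            printf("qlen is now %ld\n", atomic_load(&qlen));
            return 0;
    }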
@@ -1893,6 +1951,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
1893 | } | 1951 | } |
1894 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 1952 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1895 | 1953 | ||
1954 | /* | ||
1955 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | ||
1956 | * any blocking grace-period wait automatically implies a grace period | ||
1957 | * if there is only one CPU online at any point in time during execution | ||
1958 | * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to | ||
1959 | * occasionally incorrectly indicate that there are multiple CPUs online | ||
1960 | * when there was in fact only one the whole time, as this just adds | ||
1961 | * some overhead: RCU still operates correctly. | ||
1962 | * | ||
1963 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
1964 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
1965 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
1966 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
1967 | * all times, but num_online_cpus() might well return one (or even zero). | ||
1968 | * | ||
1969 | * However, all such demonic sequences require at least one CPU-offline | ||
1970 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
1971 | * is only a problem if there is an RCU read-side critical section executing | ||
1972 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
1973 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
1974 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
1975 | * that there is only one CPU when in fact there was more than one throughout | ||
1976 | * is when there were no RCU readers in the system. If there are no | ||
1977 | * RCU readers, the grace period by definition can be of zero length, | ||
1978 | * regardless of the number of online CPUs. | ||
1979 | */ | ||
1980 | static inline int rcu_blocking_is_gp(void) | ||
1981 | { | ||
1982 | might_sleep(); /* Check for RCU read-side critical section. */ | ||
1983 | return num_online_cpus() <= 1; | ||
1984 | } | ||
1985 | |||
1896 | /** | 1986 | /** |
1897 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. | 1987 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. |
1898 | * | 1988 | * |
@@ -2166,11 +2256,10 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
2166 | rcu_preempt_cpu_has_callbacks(cpu); | 2256 | rcu_preempt_cpu_has_callbacks(cpu); |
2167 | } | 2257 | } |
2168 | 2258 | ||
2169 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2259 | /* |
2170 | static atomic_t rcu_barrier_cpu_count; | 2260 | * RCU callback function for _rcu_barrier(). If we are last, wake |
2171 | static DEFINE_MUTEX(rcu_barrier_mutex); | 2261 | * up the task executing _rcu_barrier(). |
2172 | static struct completion rcu_barrier_completion; | 2262 | */ |
2173 | |||
2174 | static void rcu_barrier_callback(struct rcu_head *notused) | 2263 | static void rcu_barrier_callback(struct rcu_head *notused) |
2175 | { | 2264 | { |
2176 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2265 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -2200,27 +2289,94 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2200 | void (*call_rcu_func)(struct rcu_head *head, | 2289 | void (*call_rcu_func)(struct rcu_head *head, |
2201 | void (*func)(struct rcu_head *head))) | 2290 | void (*func)(struct rcu_head *head))) |
2202 | { | 2291 | { |
2203 | BUG_ON(in_interrupt()); | 2292 | int cpu; |
2293 | unsigned long flags; | ||
2294 | struct rcu_data *rdp; | ||
2295 | struct rcu_head rh; | ||
2296 | |||
2297 | init_rcu_head_on_stack(&rh); | ||
2298 | |||
2204 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2299 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
2205 | mutex_lock(&rcu_barrier_mutex); | 2300 | mutex_lock(&rcu_barrier_mutex); |
2206 | init_completion(&rcu_barrier_completion); | 2301 | |
2302 | smp_mb(); /* Prevent any prior operations from leaking in. */ | ||
2303 | |||
2207 | /* | 2304 | /* |
2208 | * Initialize rcu_barrier_cpu_count to 1, then invoke | 2305 | * Initialize the count to one rather than to zero in order to |
2209 | * rcu_barrier_func() on each CPU, so that each CPU also has | 2306 | * avoid a too-soon return to zero in case of a short grace period |
2210 | * incremented rcu_barrier_cpu_count. Only then is it safe to | 2307 | * (or preemption of this task). Also flag this task as doing |
2211 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 2308 | * an rcu_barrier(). This will prevent anyone else from adopting |
2212 | * might complete its grace period before all of the other CPUs | 2309 | * orphaned callbacks, which could otherwise cause failure if a |
2213 | * did their increment, causing this function to return too | 2310 | * CPU went offline and quickly came back online. To see this, |
2214 | * early. Note that on_each_cpu() disables irqs, which prevents | 2311 | * consider the following sequence of events: |
2215 | * any CPUs from coming online or going offline until each online | 2312 | * |
2216 | * CPU has queued its RCU-barrier callback. | 2313 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. |
2314 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
2315 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
2316 | * 4. CPU 1 comes back online. | ||
2317 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
2318 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
2319 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
2217 | */ | 2320 | */ |
2321 | init_completion(&rcu_barrier_completion); | ||
2218 | atomic_set(&rcu_barrier_cpu_count, 1); | 2322 | atomic_set(&rcu_barrier_cpu_count, 1); |
2219 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 2323 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
2324 | rsp->rcu_barrier_in_progress = current; | ||
2325 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2326 | |||
2327 | /* | ||
2328 | * Force every CPU with callbacks to register a new callback | ||
2329 | * that will tell us when all the preceding callbacks have | ||
2330 | * been invoked. If an offline CPU has callbacks, wait for | ||
2331 | * it to either come back online or to finish orphaning those | ||
2332 | * callbacks. | ||
2333 | */ | ||
2334 | for_each_possible_cpu(cpu) { | ||
2335 | preempt_disable(); | ||
2336 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2337 | if (cpu_is_offline(cpu)) { | ||
2338 | preempt_enable(); | ||
2339 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
2340 | schedule_timeout_interruptible(1); | ||
2341 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2342 | smp_call_function_single(cpu, rcu_barrier_func, | ||
2343 | (void *)call_rcu_func, 1); | ||
2344 | preempt_enable(); | ||
2345 | } else { | ||
2346 | preempt_enable(); | ||
2347 | } | ||
2348 | } | ||
2349 | |||
2350 | /* | ||
2351 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
2352 | * posted, we can adopt all of the orphaned callbacks and place | ||
2353 | * an rcu_barrier_callback() callback after them. When that is done, | ||
2354 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
2355 | * following every callback that could possibly have been | ||
2356 | * registered before _rcu_barrier() was called. | ||
2357 | */ | ||
2358 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
2359 | rcu_adopt_orphan_cbs(rsp); | ||
2360 | rsp->rcu_barrier_in_progress = NULL; | ||
2361 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2362 | atomic_inc(&rcu_barrier_cpu_count); | ||
2363 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
2364 | call_rcu_func(&rh, rcu_barrier_callback); | ||
2365 | |||
2366 | /* | ||
2367 | * Now that we have an rcu_barrier_callback() callback on each | ||
2368 | * CPU, and thus each counted, remove the initial count. | ||
2369 | */ | ||
2220 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2370 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
2221 | complete(&rcu_barrier_completion); | 2371 | complete(&rcu_barrier_completion); |
2372 | |||
2373 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
2222 | wait_for_completion(&rcu_barrier_completion); | 2374 | wait_for_completion(&rcu_barrier_completion); |
2375 | |||
2376 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
2223 | mutex_unlock(&rcu_barrier_mutex); | 2377 | mutex_unlock(&rcu_barrier_mutex); |
2378 | |||
2379 | destroy_rcu_head_on_stack(&rh); | ||
2224 | } | 2380 | } |
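The rewritten _rcu_barrier() above relies on starting the count at one: every posted rcu_barrier_callback() adds a reference, each callback drops its reference when it runs, and the initial reference is dropped only after the last callback (the one covering the adopted orphans) has been posted, so the completion cannot fire early. The same counting discipline in a self-contained user-space form (POSIX threads assumed; a sketch, not the kernel mechanism):

    #include <pthread.h>
    #include <stdio.h>

    #define NCALLBACKS 4

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
    static int barrier_count;

    /* Analogue of rcu_barrier_callback(): the last one completes the wait. */
    static void barrier_callback(void)
    {
            pthread_mutex_lock(&lock);
            if (--barrier_count == 0)
                    pthread_cond_signal(&done);
            pthread_mutex_unlock(&lock);
    }

    static void *callback_thread(void *arg)
    {
            (void)arg;                      /* stands in for callback invocation */
            barrier_callback();
            return NULL;
    }

    int main(void)
    {
            pthread_t tid[NCALLBACKS];
            int i;

            barrier_count = 1;              /* the initial self-reference */
            for (i = 0; i < NCALLBACKS; i++) {
                    pthread_mutex_lock(&lock);
                    barrier_count++;        /* one reference per posted callback */
                    pthread_mutex_unlock(&lock);
                    pthread_create(&tid[i], NULL, callback_thread, NULL);
            }
            barrier_callback();             /* drop the initial reference */

            pthread_mutex_lock(&lock);      /* wait_for_completion() analogue */
            while (barrier_count != 0)
                    pthread_cond_wait(&done, &lock);
            pthread_mutex_unlock(&lock);
            puts("all callbacks accounted for");

            for (i = 0; i < NCALLBACKS; i++)
                    pthread_join(tid[i], NULL);
            return 0;
    }

Because the initial reference is still held while callbacks are being posted, the count can never reach zero before the loop finishes, which is exactly the early-completion hazard the comment above describes.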
2225 | 2381 | ||
2226 | /** | 2382 | /** |
@@ -2417,7 +2573,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2417 | 2573 | ||
2418 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2574 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
2419 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2575 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
2420 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | 2576 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; |
2421 | } | 2577 | } |
2422 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2578 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
2423 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2579 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a4072..7f5d138dedf5 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -29,18 +29,14 @@ | |||
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
33 | * CONFIG_RCU_FANOUT_LEAF. | ||
33 | * In theory, it should be possible to add more levels straightforwardly. | 34 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this did work well going from three levels to four. | 35 | * In practice, this did work well going from three levels to four. |
35 | * Of course, your mileage may vary. | 36 | * Of course, your mileage may vary. |
36 | */ | 37 | */ |
37 | #define MAX_RCU_LVLS 4 | 38 | #define MAX_RCU_LVLS 4 |
38 | #if CONFIG_RCU_FANOUT > 16 | 39 | #define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) |
39 | #define RCU_FANOUT_LEAF 16 | ||
40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ | ||
41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) | ||
42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | ||
43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | 40 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) |
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | 41 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) |
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
@@ -371,6 +367,17 @@ struct rcu_state { | |||
371 | 367 | ||
372 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 368 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
373 | /* starting new GP. */ | 369 | /* starting new GP. */ |
370 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
371 | /* need a grace period. */ | ||
372 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
373 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
374 | /* are ready to invoke. */ | ||
375 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
376 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
377 | long qlen; /* Total number of callbacks. */ | ||
378 | struct task_struct *rcu_barrier_in_progress; | ||
379 | /* Task doing rcu_barrier(), */ | ||
380 | /* or NULL if no barrier. */ | ||
374 | raw_spinlock_t fqslock; /* Only one task forcing */ | 381 | raw_spinlock_t fqslock; /* Only one task forcing */ |
375 | /* quiescent states. */ | 382 | /* quiescent states. */ |
376 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 383 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -423,7 +430,6 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
423 | /* Forward declarations for rcutree_plugin.h */ | 430 | /* Forward declarations for rcutree_plugin.h */ |
424 | static void rcu_bootup_announce(void); | 431 | static void rcu_bootup_announce(void); |
425 | long rcu_batches_completed(void); | 432 | long rcu_batches_completed(void); |
426 | static void rcu_preempt_note_context_switch(int cpu); | ||
427 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 433 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
428 | #ifdef CONFIG_HOTPLUG_CPU | 434 | #ifdef CONFIG_HOTPLUG_CPU |
429 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 435 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
@@ -471,6 +477,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); | |||
471 | static void rcu_prepare_for_idle_init(int cpu); | 477 | static void rcu_prepare_for_idle_init(int cpu); |
472 | static void rcu_cleanup_after_idle(int cpu); | 478 | static void rcu_cleanup_after_idle(int cpu); |
473 | static void rcu_prepare_for_idle(int cpu); | 479 | static void rcu_prepare_for_idle(int cpu); |
480 | static void rcu_idle_count_callbacks_posted(void); | ||
474 | static void print_cpu_stall_info_begin(void); | 481 | static void print_cpu_stall_info_begin(void); |
475 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 482 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
476 | static void print_cpu_stall_info_end(void); | 483 | static void print_cpu_stall_info_end(void); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816be..2411000d9869 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) | |||
153 | * | 153 | * |
154 | * Caller must disable preemption. | 154 | * Caller must disable preemption. |
155 | */ | 155 | */ |
156 | static void rcu_preempt_note_context_switch(int cpu) | 156 | void rcu_preempt_note_context_switch(void) |
157 | { | 157 | { |
158 | struct task_struct *t = current; | 158 | struct task_struct *t = current; |
159 | unsigned long flags; | 159 | unsigned long flags; |
@@ -164,7 +164,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 164 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
165 | 165 | ||
166 | /* Possibly blocking in an RCU read-side critical section. */ | 166 | /* Possibly blocking in an RCU read-side critical section. */ |
167 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | 167 | rdp = __this_cpu_ptr(rcu_preempt_state.rda); |
168 | rnp = rdp->mynode; | 168 | rnp = rdp->mynode; |
169 | raw_spin_lock_irqsave(&rnp->lock, flags); | 169 | raw_spin_lock_irqsave(&rnp->lock, flags); |
170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 170 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
@@ -228,7 +228,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
228 | * means that we continue to block the current grace period. | 228 | * means that we continue to block the current grace period. |
229 | */ | 229 | */ |
230 | local_irq_save(flags); | 230 | local_irq_save(flags); |
231 | rcu_preempt_qs(cpu); | 231 | rcu_preempt_qs(smp_processor_id()); |
232 | local_irq_restore(flags); | 232 | local_irq_restore(flags); |
233 | } | 233 | } |
234 | 234 | ||
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) | |||
969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | 969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
970 | } | 970 | } |
971 | 971 | ||
972 | /* | ||
973 | * Check for a task exiting while in a preemptible-RCU read-side | ||
974 | * critical section, clean up if so. No need to issue warnings, | ||
975 | * as debug_check_no_locks_held() already does this if lockdep | ||
976 | * is enabled. | ||
977 | */ | ||
978 | void exit_rcu(void) | ||
979 | { | ||
980 | struct task_struct *t = current; | ||
981 | |||
982 | if (t->rcu_read_lock_nesting == 0) | ||
983 | return; | ||
984 | t->rcu_read_lock_nesting = 1; | ||
985 | __rcu_read_unlock(); | ||
986 | } | ||
987 | |||
988 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 972 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
989 | 973 | ||
990 | static struct rcu_state *rcu_state = &rcu_sched_state; | 974 | static struct rcu_state *rcu_state = &rcu_sched_state; |
@@ -1018,14 +1002,6 @@ void rcu_force_quiescent_state(void) | |||
1018 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 1002 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
1019 | 1003 | ||
1020 | /* | 1004 | /* |
1021 | * Because preemptible RCU does not exist, we never have to check for | ||
1022 | * CPUs being in quiescent states. | ||
1023 | */ | ||
1024 | static void rcu_preempt_note_context_switch(int cpu) | ||
1025 | { | ||
1026 | } | ||
1027 | |||
1028 | /* | ||
1029 | * Because preemptible RCU does not exist, there are never any preempted | 1005 | * Because preemptible RCU does not exist, there are never any preempted |
1030 | * RCU readers. | 1006 | * RCU readers. |
1031 | */ | 1007 | */ |
@@ -1938,6 +1914,14 @@ static void rcu_prepare_for_idle(int cpu) | |||
1938 | { | 1914 | { |
1939 | } | 1915 | } |
1940 | 1916 | ||
1917 | /* | ||
1918 | * Don't bother keeping a running count of the number of RCU callbacks | ||
1919 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
1920 | */ | ||
1921 | static void rcu_idle_count_callbacks_posted(void) | ||
1922 | { | ||
1923 | } | ||
1924 | |||
1941 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1925 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1942 | 1926 | ||
1943 | /* | 1927 | /* |
@@ -1978,11 +1962,20 @@ static void rcu_prepare_for_idle(int cpu) | |||
1978 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1962 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1979 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1963 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1980 | 1964 | ||
1965 | /* Loop counter for rcu_prepare_for_idle(). */ | ||
1981 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 1966 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
1967 | /* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ | ||
1982 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 1968 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
1983 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | 1969 | /* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ |
1984 | static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ | 1970 | static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); |
1985 | static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | 1971 | /* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ |
1972 | static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); | ||
1973 | /* Enable special processing on first attempt to enter dyntick-idle mode. */ | ||
1974 | static DEFINE_PER_CPU(bool, rcu_idle_first_pass); | ||
1975 | /* Running count of non-lazy callbacks posted, never decremented. */ | ||
1976 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); | ||
1977 | /* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ | ||
1978 | static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); | ||
1986 | 1979 | ||
1987 | /* | 1980 | /* |
1988 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | 1981 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
@@ -1995,6 +1988,8 @@ static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | |||
1995 | */ | 1988 | */ |
1996 | int rcu_needs_cpu(int cpu) | 1989 | int rcu_needs_cpu(int cpu) |
1997 | { | 1990 | { |
1991 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
1992 | per_cpu(rcu_idle_first_pass, cpu) = 1; | ||
1998 | /* If no callbacks, RCU doesn't need the CPU. */ | 1993 | /* If no callbacks, RCU doesn't need the CPU. */ |
1999 | if (!rcu_cpu_has_callbacks(cpu)) | 1994 | if (!rcu_cpu_has_callbacks(cpu)) |
2000 | return 0; | 1995 | return 0; |
@@ -2045,16 +2040,34 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2045 | } | 2040 | } |
2046 | 2041 | ||
2047 | /* | 2042 | /* |
2043 | * Handler for smp_call_function_single(). The only point of this | ||
2044 | * handler is to wake the CPU up, so the handler does only tracing. | ||
2045 | */ | ||
2046 | void rcu_idle_demigrate(void *unused) | ||
2047 | { | ||
2048 | trace_rcu_prep_idle("Demigrate"); | ||
2049 | } | ||
2050 | |||
2051 | /* | ||
2048 | * Timer handler used to force CPU to start pushing its remaining RCU | 2052 | * Timer handler used to force CPU to start pushing its remaining RCU |
2049 | * callbacks in the case where it entered dyntick-idle mode with callbacks | 2053 | * callbacks in the case where it entered dyntick-idle mode with callbacks |
2050 | * pending. The hander doesn't really need to do anything because the | 2054 | * pending. The hander doesn't really need to do anything because the |
2051 | * real work is done upon re-entry to idle, or by the next scheduling-clock | 2055 | * real work is done upon re-entry to idle, or by the next scheduling-clock |
2052 | * interrupt should idle not be re-entered. | 2056 | * interrupt should idle not be re-entered. |
2057 | * | ||
2058 | * One special case: the timer gets migrated without awakening the CPU | ||
2059 | * on which the timer was scheduled on. In this case, we must wake up | ||
2060 | * that CPU. We do so with smp_call_function_single(). | ||
2053 | */ | 2061 | */ |
2054 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | 2062 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) |
2055 | { | 2063 | { |
2064 | int cpu = (int)cpu_in; | ||
2065 | |||
2056 | trace_rcu_prep_idle("Timer"); | 2066 | trace_rcu_prep_idle("Timer"); |
2057 | return HRTIMER_NORESTART; | 2067 | if (cpu != smp_processor_id()) |
2068 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
2069 | else | ||
2070 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
2058 | } | 2071 | } |
2059 | 2072 | ||
2060 | /* | 2073 | /* |
@@ -2062,19 +2075,11 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | |||
2062 | */ | 2075 | */ |
2063 | static void rcu_prepare_for_idle_init(int cpu) | 2076 | static void rcu_prepare_for_idle_init(int cpu) |
2064 | { | 2077 | { |
2065 | static int firsttime = 1; | 2078 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
2066 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2079 | setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), |
2067 | 2080 | rcu_idle_gp_timer_func, cpu); | |
2068 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 2081 | per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; |
2069 | hrtp->function = rcu_idle_gp_timer_func; | 2082 | per_cpu(rcu_idle_first_pass, cpu) = 1; |
2070 | if (firsttime) { | ||
2071 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | ||
2072 | |||
2073 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2074 | upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); | ||
2075 | rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2076 | firsttime = 0; | ||
2077 | } | ||
2078 | } | 2083 | } |
2079 | 2084 | ||
2080 | /* | 2085 | /* |
@@ -2084,7 +2089,8 @@ static void rcu_prepare_for_idle_init(int cpu) | |||
2084 | */ | 2089 | */ |
2085 | static void rcu_cleanup_after_idle(int cpu) | 2090 | static void rcu_cleanup_after_idle(int cpu) |
2086 | { | 2091 | { |
2087 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | 2092 | del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); |
2093 | trace_rcu_prep_idle("Cleanup after idle"); | ||
2088 | } | 2094 | } |
2089 | 2095 | ||
2090 | /* | 2096 | /* |
@@ -2108,6 +2114,29 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2108 | */ | 2114 | */ |
2109 | static void rcu_prepare_for_idle(int cpu) | 2115 | static void rcu_prepare_for_idle(int cpu) |
2110 | { | 2116 | { |
2117 | struct timer_list *tp; | ||
2118 | |||
2119 | /* | ||
2120 | * If this is an idle re-entry, for example, due to use of | ||
2121 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | ||
2122 | * loop, then don't take any state-machine actions, unless the | ||
2123 | * momentary exit from idle queued additional non-lazy callbacks. | ||
2124 | * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks | ||
2125 | * pending. | ||
2126 | */ | ||
2127 | if (!per_cpu(rcu_idle_first_pass, cpu) && | ||
2128 | (per_cpu(rcu_nonlazy_posted, cpu) == | ||
2129 | per_cpu(rcu_nonlazy_posted_snap, cpu))) { | ||
2130 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2131 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2132 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2133 | } | ||
2134 | return; | ||
2135 | } | ||
2136 | per_cpu(rcu_idle_first_pass, cpu) = 0; | ||
2137 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2138 | per_cpu(rcu_nonlazy_posted, cpu) - 1; | ||
2139 | |||
2111 | /* | 2140 | /* |
2112 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2141 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2113 | * Also reset state to avoid prejudicing later attempts. | 2142 | * Also reset state to avoid prejudicing later attempts. |
@@ -2140,11 +2169,15 @@ static void rcu_prepare_for_idle(int cpu) | |||
2140 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2169 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
2141 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2170 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
2142 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2171 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) |
2143 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2172 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2144 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | 2173 | jiffies + RCU_IDLE_GP_DELAY; |
2145 | else | 2174 | else |
2146 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2175 | per_cpu(rcu_idle_gp_timer_expires, cpu) = |
2147 | rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); | 2176 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2177 | tp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2178 | mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); | ||
2179 | per_cpu(rcu_nonlazy_posted_snap, cpu) = | ||
2180 | per_cpu(rcu_nonlazy_posted, cpu); | ||
2148 | return; /* Nothing more to do immediately. */ | 2181 | return; /* Nothing more to do immediately. */ |
2149 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2182 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
2150 | /* We have hit the limit, so time to give up. */ | 2183 | /* We have hit the limit, so time to give up. */ |
@@ -2184,6 +2217,19 @@ static void rcu_prepare_for_idle(int cpu) | |||
2184 | trace_rcu_prep_idle("Callbacks drained"); | 2217 | trace_rcu_prep_idle("Callbacks drained"); |
2185 | } | 2218 | } |
2186 | 2219 | ||
2220 | /* | ||
2221 | * Keep a running count of the number of non-lazy callbacks posted | ||
2222 | * on this CPU. This running counter (which is never decremented) allows | ||
2223 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
2224 | * posts a callback, even if an equal number of callbacks are invoked. | ||
2225 | * Of course, callbacks should only be posted from within a trace event | ||
2226 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
2227 | */ | ||
2228 | static void rcu_idle_count_callbacks_posted(void) | ||
2229 | { | ||
2230 | __this_cpu_add(rcu_nonlazy_posted, 1); | ||
2231 | } | ||
2232 | |||
2187 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2233 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2188 | 2234 | ||
2189 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 2235 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
@@ -2192,14 +2238,12 @@ static void rcu_prepare_for_idle(int cpu) | |||
2192 | 2238 | ||
2193 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2239 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2194 | { | 2240 | { |
2195 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2241 | struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); |
2196 | 2242 | ||
2197 | sprintf(cp, "drain=%d %c timer=%lld", | 2243 | sprintf(cp, "drain=%d %c timer=%lu", |
2198 | per_cpu(rcu_dyntick_drain, cpu), | 2244 | per_cpu(rcu_dyntick_drain, cpu), |
2199 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2245 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', |
2200 | hrtimer_active(hrtp) | 2246 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2201 | ? ktime_to_us(hrtimer_get_remaining(hrtp)) | ||
2202 | : -1); | ||
2203 | } | 2247 | } |
2204 | 2248 | ||
2205 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2249 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
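The RCU_FAST_NO_HZ rework above replaces the per-CPU hrtimer with a plain jiffies-based timer_list and adds per-CPU bookkeeping (rcu_idle_first_pass plus the rcu_nonlazy_posted counter and its snapshot) so that a momentary exit from idle, e.g. via RCU_NONIDLE(), only reposts the timer instead of rerunning the whole state machine. A minimal userspace sketch of just that decision logic follows; every name in it is illustrative, not kernel API, and repost_timer() merely stands in for mod_timer_pinned().

	#include <stdbool.h>
	#include <stdio.h>

	/* Userspace model of the idle re-entry check in rcu_prepare_for_idle(). */
	struct cpu_state {
		bool first_pass;              /* models rcu_idle_first_pass */
		unsigned long nonlazy_posted; /* running count of non-lazy callbacks */
		unsigned long nonlazy_snap;   /* snapshot taken on the previous pass */
		unsigned long timer_expires;  /* models rcu_idle_gp_timer_expires */
		bool has_callbacks;
	};

	/* Stand-in for mod_timer_pinned(): just report the repost. */
	static void repost_timer(unsigned long expires)
	{
		printf("repost idle GP timer for jiffy %lu\n", expires);
	}

	static void prepare_for_idle(struct cpu_state *cs, unsigned long now)
	{
		/*
		 * Idle re-entry with no new non-lazy callbacks: keep the
		 * existing state and merely repost the timer if callbacks
		 * are still pending.
		 */
		if (!cs->first_pass && cs->nonlazy_posted == cs->nonlazy_snap) {
			if (cs->has_callbacks)
				repost_timer(cs->timer_expires);
			return;
		}
		cs->first_pass = false;
		/* Stale snapshot forces at least one more full pass. */
		cs->nonlazy_snap = cs->nonlazy_posted - 1;
		printf("run full prepare-for-idle state machine at jiffy %lu\n", now);
	}

	int main(void)
	{
		struct cpu_state cs = {
			.first_pass = true, .nonlazy_posted = 3, .nonlazy_snap = 3,
			.timer_expires = 1004, .has_callbacks = true,
		};

		prepare_for_idle(&cs, 1000); /* first pass: full state machine */
		prepare_for_idle(&cs, 1001); /* re-entry: snapshot is stale, full pass again */
		return 0;
	}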
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff43..d4bc16ddd1d4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
271 | 271 | ||
272 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
275 | rsp->completed, gpnum, rsp->fqs_state, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
276 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
277 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs_lh); | 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
282 | if (rnp->level != level) { | 282 | if (rnp->level != level) { |
283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b3..ad581aa2369a 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) | |||
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | 25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val, |
26 | bool force) | ||
26 | { | 27 | { |
28 | int ret = 0; | ||
29 | |||
27 | if (counter->usage + val > counter->limit) { | 30 | if (counter->usage + val > counter->limit) { |
28 | counter->failcnt++; | 31 | counter->failcnt++; |
29 | return -ENOMEM; | 32 | ret = -ENOMEM; |
33 | if (!force) | ||
34 | return ret; | ||
30 | } | 35 | } |
31 | 36 | ||
32 | counter->usage += val; | 37 | counter->usage += val; |
33 | if (counter->usage > counter->max_usage) | 38 | if (counter->usage > counter->max_usage) |
34 | counter->max_usage = counter->usage; | 39 | counter->max_usage = counter->usage; |
35 | return 0; | 40 | return ret; |
36 | } | 41 | } |
37 | 42 | ||
38 | int res_counter_charge(struct res_counter *counter, unsigned long val, | 43 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, |
39 | struct res_counter **limit_fail_at) | 44 | struct res_counter **limit_fail_at, bool force) |
40 | { | 45 | { |
41 | int ret; | 46 | int ret, r; |
42 | unsigned long flags; | 47 | unsigned long flags; |
43 | struct res_counter *c, *u; | 48 | struct res_counter *c, *u; |
44 | 49 | ||
50 | r = ret = 0; | ||
45 | *limit_fail_at = NULL; | 51 | *limit_fail_at = NULL; |
46 | local_irq_save(flags); | 52 | local_irq_save(flags); |
47 | for (c = counter; c != NULL; c = c->parent) { | 53 | for (c = counter; c != NULL; c = c->parent) { |
48 | spin_lock(&c->lock); | 54 | spin_lock(&c->lock); |
49 | ret = res_counter_charge_locked(c, val); | 55 | r = res_counter_charge_locked(c, val, force); |
50 | spin_unlock(&c->lock); | 56 | spin_unlock(&c->lock); |
51 | if (ret < 0) { | 57 | if (r < 0 && !ret) { |
58 | ret = r; | ||
52 | *limit_fail_at = c; | 59 | *limit_fail_at = c; |
53 | goto undo; | 60 | if (!force) |
61 | break; | ||
54 | } | 62 | } |
55 | } | 63 | } |
56 | ret = 0; | 64 | |
57 | goto done; | 65 | if (ret < 0 && !force) { |
58 | undo: | 66 | for (u = counter; u != c; u = u->parent) { |
59 | for (u = counter; u != c; u = u->parent) { | 67 | spin_lock(&u->lock); |
60 | spin_lock(&u->lock); | 68 | res_counter_uncharge_locked(u, val); |
61 | res_counter_uncharge_locked(u, val); | 69 | spin_unlock(&u->lock); |
62 | spin_unlock(&u->lock); | 70 | } |
63 | } | 71 | } |
64 | done: | ||
65 | local_irq_restore(flags); | 72 | local_irq_restore(flags); |
73 | |||
66 | return ret; | 74 | return ret; |
67 | } | 75 | } |
68 | 76 | ||
77 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
78 | struct res_counter **limit_fail_at) | ||
79 | { | ||
80 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
81 | } | ||
82 | |||
69 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | 83 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, |
70 | struct res_counter **limit_fail_at) | 84 | struct res_counter **limit_fail_at) |
71 | { | 85 | { |
72 | int ret, r; | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
73 | unsigned long flags; | ||
74 | struct res_counter *c; | ||
75 | |||
76 | r = ret = 0; | ||
77 | *limit_fail_at = NULL; | ||
78 | local_irq_save(flags); | ||
79 | for (c = counter; c != NULL; c = c->parent) { | ||
80 | spin_lock(&c->lock); | ||
81 | r = res_counter_charge_locked(c, val); | ||
82 | if (r) | ||
83 | c->usage += val; | ||
84 | spin_unlock(&c->lock); | ||
85 | if (r < 0 && ret == 0) { | ||
86 | *limit_fail_at = c; | ||
87 | ret = r; | ||
88 | } | ||
89 | } | ||
90 | local_irq_restore(flags); | ||
91 | |||
92 | return ret; | ||
93 | } | 87 | } |
88 | |||
94 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
95 | { | 90 | { |
96 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
@@ -99,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | |||
99 | counter->usage -= val; | 94 | counter->usage -= val; |
100 | } | 95 | } |
101 | 96 | ||
102 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | 97 | void res_counter_uncharge_until(struct res_counter *counter, |
98 | struct res_counter *top, | ||
99 | unsigned long val) | ||
103 | { | 100 | { |
104 | unsigned long flags; | 101 | unsigned long flags; |
105 | struct res_counter *c; | 102 | struct res_counter *c; |
106 | 103 | ||
107 | local_irq_save(flags); | 104 | local_irq_save(flags); |
108 | for (c = counter; c != NULL; c = c->parent) { | 105 | for (c = counter; c != top; c = c->parent) { |
109 | spin_lock(&c->lock); | 106 | spin_lock(&c->lock); |
110 | res_counter_uncharge_locked(c, val); | 107 | res_counter_uncharge_locked(c, val); |
111 | spin_unlock(&c->lock); | 108 | spin_unlock(&c->lock); |
@@ -113,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) | |||
113 | local_irq_restore(flags); | 110 | local_irq_restore(flags); |
114 | } | 111 | } |
115 | 112 | ||
113 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | ||
114 | { | ||
115 | res_counter_uncharge_until(counter, NULL, val); | ||
116 | } | ||
116 | 117 | ||
117 | static inline unsigned long long * | 118 | static inline unsigned long long * |
118 | res_counter_member(struct res_counter *counter, int member) | 119 | res_counter_member(struct res_counter *counter, int member) |
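The res_counter changes fold the old nofail path into a single __res_counter_charge() helper: with force set, every level of the hierarchy is charged even past its limit (only failcnt and the return value record the overrun), and without force the partial charges are rolled back. A self-contained sketch of that hierarchical charge/rollback pattern, using plain structs instead of the locked kernel counters (the locking and -ENOMEM are deliberately omitted), might look like this:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	/* Simplified, lock-free model of struct res_counter. */
	struct counter {
		unsigned long usage, limit, failcnt;
		struct counter *parent;
	};

	static int charge_one(struct counter *c, unsigned long val, bool force)
	{
		int ret = 0;

		if (c->usage + val > c->limit) {
			c->failcnt++;
			ret = -1;          /* models -ENOMEM */
			if (!force)
				return ret; /* refuse without charging */
		}
		c->usage += val;           /* a forced charge may exceed the limit */
		return ret;
	}

	/* Charge the whole hierarchy; roll back on failure unless forced. */
	static int charge(struct counter *counter, unsigned long val, bool force,
			  struct counter **fail_at)
	{
		struct counter *c, *u;
		int ret = 0, r;

		*fail_at = NULL;
		for (c = counter; c != NULL; c = c->parent) {
			r = charge_one(c, val, force);
			if (r < 0 && !ret) {
				ret = r;
				*fail_at = c;
				if (!force)
					break;
			}
		}
		if (ret < 0 && !force) {
			/* Undo the levels that were successfully charged. */
			for (u = counter; u != c; u = u->parent)
				u->usage -= val;
		}
		return ret;
	}

	int main(void)
	{
		struct counter root = { .limit = 100 };
		struct counter child = { .limit = 1000, .parent = &root };
		struct counter *fail;

		printf("charge 80: %d\n", charge(&child, 80, false, &fail)); /* fits */
		printf("charge 50: %d\n", charge(&child, 50, false, &fail)); /* root over limit, rolled back */
		printf("forced 50: %d, root usage %lu\n",
		       charge(&child, 50, true, &fail), root.usage);         /* charged anyway */
		return 0;
	}

The new res_counter_uncharge_until() uses the same parent walk but stops at a given ancestor instead of NULL, which is why plain res_counter_uncharge() can now be a one-line wrapper around it.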
diff --git a/kernel/resource.c b/kernel/resource.c index 7e8ea66a8c01..e1d2b8ee76d5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -515,8 +515,8 @@ out: | |||
515 | * @root: root resource descriptor | 515 | * @root: root resource descriptor |
516 | * @new: resource descriptor desired by caller | 516 | * @new: resource descriptor desired by caller |
517 | * @size: requested resource region size | 517 | * @size: requested resource region size |
518 | * @min: minimum size to allocate | 518 | * @min: minimum boundary to allocate |
519 | * @max: maximum size to allocate | 519 | * @max: maximum boundary to allocate |
520 | * @align: alignment requested, in bytes | 520 | * @align: alignment requested, in bytes |
521 | * @alignf: alignment function, optional, called if not NULL | 521 | * @alignf: alignment function, optional, called if not NULL |
522 | * @alignf_data: arbitrary data to pass to the @alignf function | 522 | * @alignf_data: arbitrary data to pass to the @alignf function |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a3..173ea52f3af0 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | |||
20 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533a688ce22..c46958e26121 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
141 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
142 | #name , | 143 | #name , |
143 | 144 | ||
144 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
145 | #include "features.h" | 146 | #include "features.h" |
146 | NULL | ||
147 | }; | 147 | }; |
148 | 148 | ||
149 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data) | |||
692 | } | 692 | } |
693 | #endif | 693 | #endif |
694 | 694 | ||
695 | void update_cpu_load(struct rq *this_rq); | ||
696 | |||
697 | static void set_load_weight(struct task_struct *p) | 695 | static void set_load_weight(struct task_struct *p) |
698 | { | 696 | { |
699 | int prio = p->static_prio - MAX_RT_PRIO; | 697 | int prio = p->static_prio - MAX_RT_PRIO; |
@@ -2083,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2083 | #endif | 2081 | #endif |
2084 | 2082 | ||
2085 | /* Here we just switch the register state and the stack. */ | 2083 | /* Here we just switch the register state and the stack. */ |
2084 | rcu_switch_from(prev); | ||
2086 | switch_to(prev, next, prev); | 2085 | switch_to(prev, next, prev); |
2087 | 2086 | ||
2088 | barrier(); | 2087 | barrier(); |
@@ -2486,22 +2485,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2485 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
2487 | * every tick. We fix it up based on jiffies. | 2486 | * every tick. We fix it up based on jiffies. |
2488 | */ | 2487 | */ |
2489 | void update_cpu_load(struct rq *this_rq) | 2488 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
2489 | unsigned long pending_updates) | ||
2490 | { | 2490 | { |
2491 | unsigned long this_load = this_rq->load.weight; | ||
2492 | unsigned long curr_jiffies = jiffies; | ||
2493 | unsigned long pending_updates; | ||
2494 | int i, scale; | 2491 | int i, scale; |
2495 | 2492 | ||
2496 | this_rq->nr_load_updates++; | 2493 | this_rq->nr_load_updates++; |
2497 | 2494 | ||
2498 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
2499 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2500 | return; | ||
2501 | |||
2502 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2503 | this_rq->last_load_update_tick = curr_jiffies; | ||
2504 | |||
2505 | /* Update our load: */ | 2495 | /* Update our load: */ |
2506 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 2496 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
2507 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2497 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2526,9 +2516,78 @@ void update_cpu_load(struct rq *this_rq) | |||
2526 | sched_avg_update(this_rq); | 2516 | sched_avg_update(this_rq); |
2527 | } | 2517 | } |
2528 | 2518 | ||
2519 | #ifdef CONFIG_NO_HZ | ||
2520 | /* | ||
2521 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2522 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2523 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2524 | * | ||
2525 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2526 | * would seriously skew the load calculation. However we'll make do for those | ||
2527 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2528 | * (tick_nohz_idle_exit). | ||
2529 | * | ||
2530 | * This means we might still be one tick off for nohz periods. | ||
2531 | */ | ||
2532 | |||
2533 | /* | ||
2534 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2535 | * idle balance. | ||
2536 | */ | ||
2537 | void update_idle_cpu_load(struct rq *this_rq) | ||
2538 | { | ||
2539 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2540 | unsigned long load = this_rq->load.weight; | ||
2541 | unsigned long pending_updates; | ||
2542 | |||
2543 | /* | ||
2544 | * bail if there's load or we're actually up-to-date. | ||
2545 | */ | ||
2546 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2547 | return; | ||
2548 | |||
2549 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2550 | this_rq->last_load_update_tick = curr_jiffies; | ||
2551 | |||
2552 | __update_cpu_load(this_rq, load, pending_updates); | ||
2553 | } | ||
2554 | |||
2555 | /* | ||
2556 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2557 | */ | ||
2558 | void update_cpu_load_nohz(void) | ||
2559 | { | ||
2560 | struct rq *this_rq = this_rq(); | ||
2561 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2562 | unsigned long pending_updates; | ||
2563 | |||
2564 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2565 | return; | ||
2566 | |||
2567 | raw_spin_lock(&this_rq->lock); | ||
2568 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2569 | if (pending_updates) { | ||
2570 | this_rq->last_load_update_tick = curr_jiffies; | ||
2571 | /* | ||
2572 | * We were idle, this means load 0, the current load might be | ||
2573 | * !0 due to remote wakeups and the like. | ||
2574 | */ | ||
2575 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2576 | } | ||
2577 | raw_spin_unlock(&this_rq->lock); | ||
2578 | } | ||
2579 | #endif /* CONFIG_NO_HZ */ | ||
2580 | |||
2581 | /* | ||
2582 | * Called from scheduler_tick() | ||
2583 | */ | ||
2529 | static void update_cpu_load_active(struct rq *this_rq) | 2584 | static void update_cpu_load_active(struct rq *this_rq) |
2530 | { | 2585 | { |
2531 | update_cpu_load(this_rq); | 2586 | /* |
2587 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2588 | */ | ||
2589 | this_rq->last_load_update_tick = jiffies; | ||
2590 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2532 | 2591 | ||
2533 | calc_load_account_active(this_rq); | 2592 | calc_load_account_active(this_rq); |
2534 | } | 2593 | } |
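__update_cpu_load() now takes the load value and the number of missed ticks explicitly; the two new NO_HZ entry points only compute pending_updates from jiffies and decide which load to feed in (0 for a CPU that was idle the whole time). A rough userspace model of that bookkeeping, with the actual cpu_load[] decay folded into a stub and all names merely mirroring the kernel ones for readability, could be:

	#include <stdio.h>

	/* Minimal model of the per-runqueue load bookkeeping. */
	struct rq {
		unsigned long load_weight;
		unsigned long last_load_update_tick;
		unsigned long cpu_load;    /* stands in for cpu_load[] */
	};

	/* Stub for the decay performed by the real __update_cpu_load(). */
	static void update_load(struct rq *rq, unsigned long this_load,
				unsigned long pending_updates)
	{
		/* The real code ages cpu_load[] once per missed tick. */
		rq->cpu_load = this_load;
		printf("update: load=%lu pending=%lu\n", this_load, pending_updates);
	}

	/* Tick path: exactly one pending update, current weight. */
	static void load_tick(struct rq *rq, unsigned long jiffies)
	{
		rq->last_load_update_tick = jiffies;
		update_load(rq, rq->load_weight, 1);
	}

	/* Idle-exit path: the CPU was idle, so the missed ticks saw load 0. */
	static void load_nohz_exit(struct rq *rq, unsigned long jiffies)
	{
		unsigned long pending = jiffies - rq->last_load_update_tick;

		if (!pending)
			return;
		rq->last_load_update_tick = jiffies;
		update_load(rq, 0, pending);
	}

	int main(void)
	{
		struct rq rq = { .load_weight = 1024, .last_load_update_tick = 100 };

		load_tick(&rq, 101);      /* normal scheduler tick */
		load_nohz_exit(&rq, 150); /* woke up after 49 idle ticks */
		return 0;
	}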
@@ -3113,6 +3172,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3113 | if (irqs_disabled()) | 3172 | if (irqs_disabled()) |
3114 | print_irqtrace_events(prev); | 3173 | print_irqtrace_events(prev); |
3115 | dump_stack(); | 3174 | dump_stack(); |
3175 | add_taint(TAINT_WARN); | ||
3116 | } | 3176 | } |
3117 | 3177 | ||
3118 | /* | 3178 | /* |
@@ -4042,11 +4102,8 @@ static bool check_same_owner(struct task_struct *p) | |||
4042 | 4102 | ||
4043 | rcu_read_lock(); | 4103 | rcu_read_lock(); |
4044 | pcred = __task_cred(p); | 4104 | pcred = __task_cred(p); |
4045 | if (cred->user->user_ns == pcred->user->user_ns) | 4105 | match = (uid_eq(cred->euid, pcred->euid) || |
4046 | match = (cred->euid == pcred->euid || | 4106 | uid_eq(cred->euid, pcred->uid)); |
4047 | cred->euid == pcred->uid); | ||
4048 | else | ||
4049 | match = false; | ||
4050 | rcu_read_unlock(); | 4107 | rcu_read_unlock(); |
4051 | return match; | 4108 | return match; |
4052 | } | 4109 | } |
@@ -4957,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
4957 | p->sched_class->set_cpus_allowed(p, new_mask); | 5014 | p->sched_class->set_cpus_allowed(p, new_mask); |
4958 | 5015 | ||
4959 | cpumask_copy(&p->cpus_allowed, new_mask); | 5016 | cpumask_copy(&p->cpus_allowed, new_mask); |
4960 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5017 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
4961 | } | 5018 | } |
4962 | 5019 | ||
4963 | /* | 5020 | /* |
@@ -5560,7 +5617,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5560 | break; | 5617 | break; |
5561 | } | 5618 | } |
5562 | 5619 | ||
5563 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 5620 | if (!(sd->flags & SD_OVERLAP) && |
5621 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5564 | printk(KERN_CONT "\n"); | 5622 | printk(KERN_CONT "\n"); |
5565 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5623 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5566 | break; | 5624 | break; |
@@ -5898,99 +5956,11 @@ static int __init isolated_cpu_setup(char *str) | |||
5898 | 5956 | ||
5899 | __setup("isolcpus=", isolated_cpu_setup); | 5957 | __setup("isolcpus=", isolated_cpu_setup); |
5900 | 5958 | ||
5901 | #ifdef CONFIG_NUMA | ||
5902 | |||
5903 | /** | ||
5904 | * find_next_best_node - find the next node to include in a sched_domain | ||
5905 | * @node: node whose sched_domain we're building | ||
5906 | * @used_nodes: nodes already in the sched_domain | ||
5907 | * | ||
5908 | * Find the next node to include in a given scheduling domain. Simply | ||
5909 | * finds the closest node not already in the @used_nodes map. | ||
5910 | * | ||
5911 | * Should use nodemask_t. | ||
5912 | */ | ||
5913 | static int find_next_best_node(int node, nodemask_t *used_nodes) | ||
5914 | { | ||
5915 | int i, n, val, min_val, best_node = -1; | ||
5916 | |||
5917 | min_val = INT_MAX; | ||
5918 | |||
5919 | for (i = 0; i < nr_node_ids; i++) { | ||
5920 | /* Start at @node */ | ||
5921 | n = (node + i) % nr_node_ids; | ||
5922 | |||
5923 | if (!nr_cpus_node(n)) | ||
5924 | continue; | ||
5925 | |||
5926 | /* Skip already used nodes */ | ||
5927 | if (node_isset(n, *used_nodes)) | ||
5928 | continue; | ||
5929 | |||
5930 | /* Simple min distance search */ | ||
5931 | val = node_distance(node, n); | ||
5932 | |||
5933 | if (val < min_val) { | ||
5934 | min_val = val; | ||
5935 | best_node = n; | ||
5936 | } | ||
5937 | } | ||
5938 | |||
5939 | if (best_node != -1) | ||
5940 | node_set(best_node, *used_nodes); | ||
5941 | return best_node; | ||
5942 | } | ||
5943 | |||
5944 | /** | ||
5945 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5946 | * @node: node whose cpumask we're constructing | ||
5947 | * @span: resulting cpumask | ||
5948 | * | ||
5949 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5950 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5951 | * out optimally. | ||
5952 | */ | ||
5953 | static void sched_domain_node_span(int node, struct cpumask *span) | ||
5954 | { | ||
5955 | nodemask_t used_nodes; | ||
5956 | int i; | ||
5957 | |||
5958 | cpumask_clear(span); | ||
5959 | nodes_clear(used_nodes); | ||
5960 | |||
5961 | cpumask_or(span, span, cpumask_of_node(node)); | ||
5962 | node_set(node, used_nodes); | ||
5963 | |||
5964 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5965 | int next_node = find_next_best_node(node, &used_nodes); | ||
5966 | if (next_node < 0) | ||
5967 | break; | ||
5968 | cpumask_or(span, span, cpumask_of_node(next_node)); | ||
5969 | } | ||
5970 | } | ||
5971 | |||
5972 | static const struct cpumask *cpu_node_mask(int cpu) | ||
5973 | { | ||
5974 | lockdep_assert_held(&sched_domains_mutex); | ||
5975 | |||
5976 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
5977 | |||
5978 | return sched_domains_tmpmask; | ||
5979 | } | ||
5980 | |||
5981 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
5982 | { | ||
5983 | return cpu_possible_mask; | ||
5984 | } | ||
5985 | #endif /* CONFIG_NUMA */ | ||
5986 | |||
5987 | static const struct cpumask *cpu_cpu_mask(int cpu) | 5959 | static const struct cpumask *cpu_cpu_mask(int cpu) |
5988 | { | 5960 | { |
5989 | return cpumask_of_node(cpu_to_node(cpu)); | 5961 | return cpumask_of_node(cpu_to_node(cpu)); |
5990 | } | 5962 | } |
5991 | 5963 | ||
5992 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5993 | |||
5994 | struct sd_data { | 5964 | struct sd_data { |
5995 | struct sched_domain **__percpu sd; | 5965 | struct sched_domain **__percpu sd; |
5996 | struct sched_group **__percpu sg; | 5966 | struct sched_group **__percpu sg; |
@@ -6020,6 +5990,7 @@ struct sched_domain_topology_level { | |||
6020 | sched_domain_init_f init; | 5990 | sched_domain_init_f init; |
6021 | sched_domain_mask_f mask; | 5991 | sched_domain_mask_f mask; |
6022 | int flags; | 5992 | int flags; |
5993 | int numa_level; | ||
6023 | struct sd_data data; | 5994 | struct sd_data data; |
6024 | }; | 5995 | }; |
6025 | 5996 | ||
@@ -6058,11 +6029,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6058 | 6029 | ||
6059 | cpumask_or(covered, covered, sg_span); | 6030 | cpumask_or(covered, covered, sg_span); |
6060 | 6031 | ||
6061 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6032 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
6062 | atomic_inc(&sg->sgp->ref); | 6033 | atomic_inc(&sg->sgp->ref); |
6063 | 6034 | ||
6064 | if (cpumask_test_cpu(cpu, sg_span)) | 6035 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || |
6036 | cpumask_first(sg_span) == cpu) { | ||
6037 | WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); | ||
6065 | groups = sg; | 6038 | groups = sg; |
6039 | } | ||
6066 | 6040 | ||
6067 | if (!first) | 6041 | if (!first) |
6068 | first = sg; | 6042 | first = sg; |
@@ -6211,10 +6185,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | |||
6211 | } | 6185 | } |
6212 | 6186 | ||
6213 | SD_INIT_FUNC(CPU) | 6187 | SD_INIT_FUNC(CPU) |
6214 | #ifdef CONFIG_NUMA | ||
6215 | SD_INIT_FUNC(ALLNODES) | ||
6216 | SD_INIT_FUNC(NODE) | ||
6217 | #endif | ||
6218 | #ifdef CONFIG_SCHED_SMT | 6188 | #ifdef CONFIG_SCHED_SMT |
6219 | SD_INIT_FUNC(SIBLING) | 6189 | SD_INIT_FUNC(SIBLING) |
6220 | #endif | 6190 | #endif |
@@ -6336,15 +6306,184 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6336 | { sd_init_BOOK, cpu_book_mask, }, | 6306 | { sd_init_BOOK, cpu_book_mask, }, |
6337 | #endif | 6307 | #endif |
6338 | { sd_init_CPU, cpu_cpu_mask, }, | 6308 | { sd_init_CPU, cpu_cpu_mask, }, |
6339 | #ifdef CONFIG_NUMA | ||
6340 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, | ||
6341 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | ||
6342 | #endif | ||
6343 | { NULL, }, | 6309 | { NULL, }, |
6344 | }; | 6310 | }; |
6345 | 6311 | ||
6346 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6312 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6347 | 6313 | ||
6314 | #ifdef CONFIG_NUMA | ||
6315 | |||
6316 | static int sched_domains_numa_levels; | ||
6317 | static int sched_domains_numa_scale; | ||
6318 | static int *sched_domains_numa_distance; | ||
6319 | static struct cpumask ***sched_domains_numa_masks; | ||
6320 | static int sched_domains_curr_level; | ||
6321 | |||
6322 | static inline int sd_local_flags(int level) | ||
6323 | { | ||
6324 | if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) | ||
6325 | return 0; | ||
6326 | |||
6327 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | ||
6328 | } | ||
6329 | |||
6330 | static struct sched_domain * | ||
6331 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | ||
6332 | { | ||
6333 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | ||
6334 | int level = tl->numa_level; | ||
6335 | int sd_weight = cpumask_weight( | ||
6336 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | ||
6337 | |||
6338 | *sd = (struct sched_domain){ | ||
6339 | .min_interval = sd_weight, | ||
6340 | .max_interval = 2*sd_weight, | ||
6341 | .busy_factor = 32, | ||
6342 | .imbalance_pct = 125, | ||
6343 | .cache_nice_tries = 2, | ||
6344 | .busy_idx = 3, | ||
6345 | .idle_idx = 2, | ||
6346 | .newidle_idx = 0, | ||
6347 | .wake_idx = 0, | ||
6348 | .forkexec_idx = 0, | ||
6349 | |||
6350 | .flags = 1*SD_LOAD_BALANCE | ||
6351 | | 1*SD_BALANCE_NEWIDLE | ||
6352 | | 0*SD_BALANCE_EXEC | ||
6353 | | 0*SD_BALANCE_FORK | ||
6354 | | 0*SD_BALANCE_WAKE | ||
6355 | | 0*SD_WAKE_AFFINE | ||
6356 | | 0*SD_PREFER_LOCAL | ||
6357 | | 0*SD_SHARE_CPUPOWER | ||
6358 | | 0*SD_SHARE_PKG_RESOURCES | ||
6359 | | 1*SD_SERIALIZE | ||
6360 | | 0*SD_PREFER_SIBLING | ||
6361 | | sd_local_flags(level) | ||
6362 | , | ||
6363 | .last_balance = jiffies, | ||
6364 | .balance_interval = sd_weight, | ||
6365 | }; | ||
6366 | SD_INIT_NAME(sd, NUMA); | ||
6367 | sd->private = &tl->data; | ||
6368 | |||
6369 | /* | ||
6370 | * Ugly hack to pass state to sd_numa_mask()... | ||
6371 | */ | ||
6372 | sched_domains_curr_level = tl->numa_level; | ||
6373 | |||
6374 | return sd; | ||
6375 | } | ||
6376 | |||
6377 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6378 | { | ||
6379 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6380 | } | ||
6381 | |||
6382 | static void sched_init_numa(void) | ||
6383 | { | ||
6384 | int next_distance, curr_distance = node_distance(0, 0); | ||
6385 | struct sched_domain_topology_level *tl; | ||
6386 | int level = 0; | ||
6387 | int i, j, k; | ||
6388 | |||
6389 | sched_domains_numa_scale = curr_distance; | ||
6390 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6391 | if (!sched_domains_numa_distance) | ||
6392 | return; | ||
6393 | |||
6394 | /* | ||
6395 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6396 | * unique distances in the node_distance() table. | ||
6397 | * | ||
6398 | * Assumes node_distance(0,j) includes all distances in | ||
6399 | * node_distance(i,j) in order to avoid cubic time. | ||
6400 | * | ||
6401 | * XXX: could be optimized to O(n log n) by using sort() | ||
6402 | */ | ||
6403 | next_distance = curr_distance; | ||
6404 | for (i = 0; i < nr_node_ids; i++) { | ||
6405 | for (j = 0; j < nr_node_ids; j++) { | ||
6406 | int distance = node_distance(0, j); | ||
6407 | if (distance > curr_distance && | ||
6408 | (distance < next_distance || | ||
6409 | next_distance == curr_distance)) | ||
6410 | next_distance = distance; | ||
6411 | } | ||
6412 | if (next_distance != curr_distance) { | ||
6413 | sched_domains_numa_distance[level++] = next_distance; | ||
6414 | sched_domains_numa_levels = level; | ||
6415 | curr_distance = next_distance; | ||
6416 | } else break; | ||
6417 | } | ||
6418 | /* | ||
6419 | * 'level' contains the number of unique distances, excluding the | ||
6420 | * identity distance node_distance(i,i). | ||
6421 | * | ||
6422 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6423 | * numbers. | ||
6424 | */ | ||
6425 | |||
6426 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6427 | if (!sched_domains_numa_masks) | ||
6428 | return; | ||
6429 | |||
6430 | /* | ||
6431 | * Now for each level, construct a mask per node which contains all | ||
6432 | * cpus of nodes that are that many hops away from us. | ||
6433 | */ | ||
6434 | for (i = 0; i < level; i++) { | ||
6435 | sched_domains_numa_masks[i] = | ||
6436 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6437 | if (!sched_domains_numa_masks[i]) | ||
6438 | return; | ||
6439 | |||
6440 | for (j = 0; j < nr_node_ids; j++) { | ||
6441 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6442 | if (!mask) | ||
6443 | return; | ||
6444 | |||
6445 | sched_domains_numa_masks[i][j] = mask; | ||
6446 | |||
6447 | for (k = 0; k < nr_node_ids; k++) { | ||
6448 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6449 | continue; | ||
6450 | |||
6451 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6452 | } | ||
6453 | } | ||
6454 | } | ||
6455 | |||
6456 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | ||
6457 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6458 | if (!tl) | ||
6459 | return; | ||
6460 | |||
6461 | /* | ||
6462 | * Copy the default topology bits.. | ||
6463 | */ | ||
6464 | for (i = 0; default_topology[i].init; i++) | ||
6465 | tl[i] = default_topology[i]; | ||
6466 | |||
6467 | /* | ||
6468 | * .. and append 'j' levels of NUMA goodness. | ||
6469 | */ | ||
6470 | for (j = 0; j < level; i++, j++) { | ||
6471 | tl[i] = (struct sched_domain_topology_level){ | ||
6472 | .init = sd_numa_init, | ||
6473 | .mask = sd_numa_mask, | ||
6474 | .flags = SDTL_OVERLAP, | ||
6475 | .numa_level = j, | ||
6476 | }; | ||
6477 | } | ||
6478 | |||
6479 | sched_domain_topology = tl; | ||
6480 | } | ||
6481 | #else | ||
6482 | static inline void sched_init_numa(void) | ||
6483 | { | ||
6484 | } | ||
6485 | #endif /* CONFIG_NUMA */ | ||
6486 | |||
6348 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6487 | static int __sdt_alloc(const struct cpumask *cpu_map) |
6349 | { | 6488 | { |
6350 | struct sched_domain_topology_level *tl; | 6489 | struct sched_domain_topology_level *tl; |
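sched_init_numa() first extracts the set of unique node distances beyond the local one, then builds a per-level mask of all nodes within that distance, and finally appends one topology level per distance. The deduplicating pass is easy to model in isolation; the sketch below uses a small hard-coded distance table in place of node_distance(), and makes the same row-0 assumption the kernel comment describes.

	#include <stdio.h>

	#define NR_NODES 4

	/* Illustrative distance table; node_distance(i, j) in the kernel. */
	static const int dist[NR_NODES][NR_NODES] = {
		{ 10, 20, 20, 30 },
		{ 20, 10, 30, 20 },
		{ 20, 30, 10, 20 },
		{ 30, 20, 20, 10 },
	};

	int main(void)
	{
		int curr = dist[0][0];  /* local distance, e.g. 10 */
		int next, level = 0;
		int levels[NR_NODES];

		/*
		 * Repeatedly pick the smallest distance strictly greater than
		 * the current one, scanning only row 0 (the assumption that
		 * keeps the kernel code O(nr_nodes^2) rather than cubic).
		 */
		for (;;) {
			int j;

			next = curr;
			for (j = 0; j < NR_NODES; j++) {
				int d = dist[0][j];

				if (d > curr && (d < next || next == curr))
					next = d;
			}
			if (next == curr)
				break;  /* no larger distance left */
			levels[level++] = next;
			curr = next;
		}

		for (int i = 0; i < level; i++)
			printf("NUMA level %d: distance %d\n", i, levels[i]);
		return 0;
	}

With the table above this yields two levels (distance 20 and 30), i.e. two extra sched_domain_topology_level entries appended behind the default topology.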
@@ -6382,6 +6521,8 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6382 | if (!sg) | 6521 | if (!sg) |
6383 | return -ENOMEM; | 6522 | return -ENOMEM; |
6384 | 6523 | ||
6524 | sg->next = sg; | ||
6525 | |||
6385 | *per_cpu_ptr(sdd->sg, j) = sg; | 6526 | *per_cpu_ptr(sdd->sg, j) = sg; |
6386 | 6527 | ||
6387 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6528 | sgp = kzalloc_node(sizeof(struct sched_group_power), |
@@ -6585,7 +6726,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
6585 | if (!doms_cur) | 6726 | if (!doms_cur) |
6586 | doms_cur = &fallback_doms; | 6727 | doms_cur = &fallback_doms; |
6587 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6728 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
6588 | dattr_cur = NULL; | ||
6589 | err = build_sched_domains(doms_cur[0], NULL); | 6729 | err = build_sched_domains(doms_cur[0], NULL); |
6590 | register_sched_domain_sysctl(); | 6730 | register_sched_domain_sysctl(); |
6591 | 6731 | ||
@@ -6710,97 +6850,6 @@ match2: | |||
6710 | mutex_unlock(&sched_domains_mutex); | 6850 | mutex_unlock(&sched_domains_mutex); |
6711 | } | 6851 | } |
6712 | 6852 | ||
6713 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6714 | static void reinit_sched_domains(void) | ||
6715 | { | ||
6716 | get_online_cpus(); | ||
6717 | |||
6718 | /* Destroy domains first to force the rebuild */ | ||
6719 | partition_sched_domains(0, NULL, NULL); | ||
6720 | |||
6721 | rebuild_sched_domains(); | ||
6722 | put_online_cpus(); | ||
6723 | } | ||
6724 | |||
6725 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6726 | { | ||
6727 | unsigned int level = 0; | ||
6728 | |||
6729 | if (sscanf(buf, "%u", &level) != 1) | ||
6730 | return -EINVAL; | ||
6731 | |||
6732 | /* | ||
6733 | * level is always be positive so don't check for | ||
6734 | * level < POWERSAVINGS_BALANCE_NONE which is 0 | ||
6735 | * What happens on 0 or 1 byte write, | ||
6736 | * need to check for count as well? | ||
6737 | */ | ||
6738 | |||
6739 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | ||
6740 | return -EINVAL; | ||
6741 | |||
6742 | if (smt) | ||
6743 | sched_smt_power_savings = level; | ||
6744 | else | ||
6745 | sched_mc_power_savings = level; | ||
6746 | |||
6747 | reinit_sched_domains(); | ||
6748 | |||
6749 | return count; | ||
6750 | } | ||
6751 | |||
6752 | #ifdef CONFIG_SCHED_MC | ||
6753 | static ssize_t sched_mc_power_savings_show(struct device *dev, | ||
6754 | struct device_attribute *attr, | ||
6755 | char *buf) | ||
6756 | { | ||
6757 | return sprintf(buf, "%u\n", sched_mc_power_savings); | ||
6758 | } | ||
6759 | static ssize_t sched_mc_power_savings_store(struct device *dev, | ||
6760 | struct device_attribute *attr, | ||
6761 | const char *buf, size_t count) | ||
6762 | { | ||
6763 | return sched_power_savings_store(buf, count, 0); | ||
6764 | } | ||
6765 | static DEVICE_ATTR(sched_mc_power_savings, 0644, | ||
6766 | sched_mc_power_savings_show, | ||
6767 | sched_mc_power_savings_store); | ||
6768 | #endif | ||
6769 | |||
6770 | #ifdef CONFIG_SCHED_SMT | ||
6771 | static ssize_t sched_smt_power_savings_show(struct device *dev, | ||
6772 | struct device_attribute *attr, | ||
6773 | char *buf) | ||
6774 | { | ||
6775 | return sprintf(buf, "%u\n", sched_smt_power_savings); | ||
6776 | } | ||
6777 | static ssize_t sched_smt_power_savings_store(struct device *dev, | ||
6778 | struct device_attribute *attr, | ||
6779 | const char *buf, size_t count) | ||
6780 | { | ||
6781 | return sched_power_savings_store(buf, count, 1); | ||
6782 | } | ||
6783 | static DEVICE_ATTR(sched_smt_power_savings, 0644, | ||
6784 | sched_smt_power_savings_show, | ||
6785 | sched_smt_power_savings_store); | ||
6786 | #endif | ||
6787 | |||
6788 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) | ||
6789 | { | ||
6790 | int err = 0; | ||
6791 | |||
6792 | #ifdef CONFIG_SCHED_SMT | ||
6793 | if (smt_capable()) | ||
6794 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); | ||
6795 | #endif | ||
6796 | #ifdef CONFIG_SCHED_MC | ||
6797 | if (!err && mc_capable()) | ||
6798 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); | ||
6799 | #endif | ||
6800 | return err; | ||
6801 | } | ||
6802 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
6803 | |||
6804 | /* | 6853 | /* |
6805 | * Update cpusets according to cpu_active mask. If cpusets are | 6854 | * Update cpusets according to cpu_active mask. If cpusets are |
6806 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 6855 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
@@ -6838,6 +6887,8 @@ void __init sched_init_smp(void) | |||
6838 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 6887 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6839 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 6888 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6840 | 6889 | ||
6890 | sched_init_numa(); | ||
6891 | |||
6841 | get_online_cpus(); | 6892 | get_online_cpus(); |
6842 | mutex_lock(&sched_domains_mutex); | 6893 | mutex_lock(&sched_domains_mutex); |
6843 | init_sched_domains(cpu_active_mask); | 6894 | init_sched_domains(cpu_active_mask); |
@@ -7059,6 +7110,7 @@ void __init sched_init(void) | |||
7059 | /* May be allocated at isolcpus cmdline parse time */ | 7110 | /* May be allocated at isolcpus cmdline parse time */ |
7060 | if (cpu_isolated_map == NULL) | 7111 | if (cpu_isolated_map == NULL) |
7061 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7112 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7113 | idle_thread_set_boot_cpu(); | ||
7062 | #endif | 7114 | #endif |
7063 | init_sched_fair_class(); | 7115 | init_sched_fair_class(); |
7064 | 7116 | ||
@@ -7980,13 +8032,9 @@ static struct cftype cpu_files[] = { | |||
7980 | .write_u64 = cpu_rt_period_write_uint, | 8032 | .write_u64 = cpu_rt_period_write_uint, |
7981 | }, | 8033 | }, |
7982 | #endif | 8034 | #endif |
8035 | { } /* terminate */ | ||
7983 | }; | 8036 | }; |
7984 | 8037 | ||
7985 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7986 | { | ||
7987 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7988 | } | ||
7989 | |||
7990 | struct cgroup_subsys cpu_cgroup_subsys = { | 8038 | struct cgroup_subsys cpu_cgroup_subsys = { |
7991 | .name = "cpu", | 8039 | .name = "cpu", |
7992 | .create = cpu_cgroup_create, | 8040 | .create = cpu_cgroup_create, |
@@ -7994,8 +8042,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7994 | .can_attach = cpu_cgroup_can_attach, | 8042 | .can_attach = cpu_cgroup_can_attach, |
7995 | .attach = cpu_cgroup_attach, | 8043 | .attach = cpu_cgroup_attach, |
7996 | .exit = cpu_cgroup_exit, | 8044 | .exit = cpu_cgroup_exit, |
7997 | .populate = cpu_cgroup_populate, | ||
7998 | .subsys_id = cpu_cgroup_subsys_id, | 8045 | .subsys_id = cpu_cgroup_subsys_id, |
8046 | .base_cftypes = cpu_files, | ||
7999 | .early_init = 1, | 8047 | .early_init = 1, |
8000 | }; | 8048 | }; |
8001 | 8049 | ||
@@ -8180,13 +8228,9 @@ static struct cftype files[] = { | |||
8180 | .name = "stat", | 8228 | .name = "stat", |
8181 | .read_map = cpuacct_stats_show, | 8229 | .read_map = cpuacct_stats_show, |
8182 | }, | 8230 | }, |
8231 | { } /* terminate */ | ||
8183 | }; | 8232 | }; |
8184 | 8233 | ||
8185 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8186 | { | ||
8187 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8188 | } | ||
8189 | |||
8190 | /* | 8234 | /* |
8191 | * charge this task's execution time to its accounting group. | 8235 | * charge this task's execution time to its accounting group. |
8192 | * | 8236 | * |
@@ -8218,7 +8262,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8218 | .name = "cpuacct", | 8262 | .name = "cpuacct", |
8219 | .create = cpuacct_create, | 8263 | .create = cpuacct_create, |
8220 | .destroy = cpuacct_destroy, | 8264 | .destroy = cpuacct_destroy, |
8221 | .populate = cpuacct_populate, | ||
8222 | .subsys_id = cpuacct_subsys_id, | 8265 | .subsys_id = cpuacct_subsys_id, |
8266 | .base_cftypes = files, | ||
8223 | }; | 8267 | }; |
8224 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8268 | #endif /* CONFIG_CGROUP_CPUACCT */ |
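Both cgroup subsystems above drop their .populate callback (which passed ARRAY_SIZE() to cgroup_add_files()) in favour of a sentinel-terminated .base_cftypes array: the empty { } entry marks the end of the table, so the core can walk it by itself. The termination pattern is the interesting part; the struct and walker below are an illustrative stand-in, not the cgroup API.

	#include <stdio.h>

	/* Illustrative stand-in for struct cftype. */
	struct cftype {
		const char *name;
	};

	/* Walk a sentinel-terminated table: stop at the entry with no name. */
	static void register_files(const struct cftype *cft)
	{
		for (; cft->name; cft++)
			printf("registering control file: %s\n", cft->name);
	}

	static const struct cftype cpu_files[] = {
		{ .name = "shares" },
		{ .name = "rt_runtime_us" },
		{ .name = "rt_period_us" },
		{ }	/* terminate */
	};

	int main(void)
	{
		register_files(cpu_files);
		return 0;
	}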
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161d..6f79596e0ea9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | SPLIT_NS(spread0)); | 202 | SPLIT_NS(spread0)); |
203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
204 | cfs_rq->nr_spread_over); | 204 | cfs_rq->nr_spread_over); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 207 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
260 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 260 | SEQ_printf(m, "\ncpu#%d\n", cpu); |
261 | #endif | 261 | #endif |
262 | 262 | ||
263 | #define P(x) \ | 263 | #define P(x) \ |
264 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 264 | do { \ |
265 | if (sizeof(rq->x) == 4) \ | ||
266 | SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ | ||
267 | else \ | ||
268 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ | ||
269 | } while (0) | ||
270 | |||
265 | #define PN(x) \ | 271 | #define PN(x) \ |
266 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 272 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
267 | 273 | ||
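The reworked P(x) macro in debug.c picks its format at compile time from the field's size, so 32-bit rq members are no longer widened and printed as long long. The same trick works in any C code; a standalone version (plain printf instead of SEQ_printf, struct and field names invented for the example) might look like:

	#include <stdio.h>

	struct rq_like {
		int nr_running;    /* 4-byte field */
		long long clock;   /* 8-byte field */
	};

	/* Pick the format string from the field width, as in print_cpu(). */
	#define P(rq, x)							\
	do {									\
		if (sizeof((rq)->x) == 4)					\
			printf("  .%-30s: %ld\n", #x, (long)(rq)->x);		\
		else								\
			printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);	\
	} while (0)

	int main(void)
	{
		struct rq_like rq = { .nr_running = 3, .clock = 123456789LL };

		P(&rq, nr_running);
		P(&rq, clock);
		return 0;
	}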
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9553640c1c3..b2a2d236f27b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2703 | int want_sd = 1; | 2703 | int want_sd = 1; |
2704 | int sync = wake_flags & WF_SYNC; | 2704 | int sync = wake_flags & WF_SYNC; |
2705 | 2705 | ||
2706 | if (p->rt.nr_cpus_allowed == 1) | 2706 | if (p->nr_cpus_allowed == 1) |
2707 | return prev_cpu; | 2707 | return prev_cpu; |
2708 | 2708 | ||
2709 | if (sd_flag & SD_BALANCE_WAKE) { | 2709 | if (sd_flag & SD_BALANCE_WAKE) { |
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2721 | * If power savings logic is enabled for a domain, see if we | 2721 | * If power savings logic is enabled for a domain, see if we |
2722 | * are not overloaded, if so, don't balance wider. | 2722 | * are not overloaded, if so, don't balance wider. |
2723 | */ | 2723 | */ |
2724 | if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { | 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { |
2725 | unsigned long power = 0; | 2725 | unsigned long power = 0; |
2726 | unsigned long nr_running = 0; | 2726 | unsigned long nr_running = 0; |
2727 | unsigned long capacity; | 2727 | unsigned long capacity; |
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2734 | 2734 | ||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
2736 | 2736 | ||
2737 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2738 | nr_running /= 2; | ||
2739 | |||
2740 | if (nr_running < capacity) | 2737 | if (nr_running < capacity) |
2741 | want_sd = 0; | 2738 | want_sd = 0; |
2742 | } | 2739 | } |
@@ -3082,7 +3079,7 @@ struct lb_env { | |||
3082 | struct rq *dst_rq; | 3079 | struct rq *dst_rq; |
3083 | 3080 | ||
3084 | enum cpu_idle_type idle; | 3081 | enum cpu_idle_type idle; |
3085 | long load_move; | 3082 | long imbalance; |
3086 | unsigned int flags; | 3083 | unsigned int flags; |
3087 | 3084 | ||
3088 | unsigned int loop; | 3085 | unsigned int loop; |
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p); | |||
3218 | static const unsigned int sched_nr_migrate_break = 32; | 3215 | static const unsigned int sched_nr_migrate_break = 32; |
3219 | 3216 | ||
3220 | /* | 3217 | /* |
3221 | * move_tasks tries to move up to load_move weighted load from busiest to | 3218 | * move_tasks tries to move up to imbalance weighted load from busiest to |
3222 | * this_rq, as part of a balancing operation within domain "sd". | 3219 | * this_rq, as part of a balancing operation within domain "sd". |
3223 | * Returns 1 if successful and 0 otherwise. | 3220 | * Returns 1 if successful and 0 otherwise. |
3224 | * | 3221 | * |
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env) | |||
3231 | unsigned long load; | 3228 | unsigned long load; |
3232 | int pulled = 0; | 3229 | int pulled = 0; |
3233 | 3230 | ||
3234 | if (env->load_move <= 0) | 3231 | if (env->imbalance <= 0) |
3235 | return 0; | 3232 | return 0; |
3236 | 3233 | ||
3237 | while (!list_empty(tasks)) { | 3234 | while (!list_empty(tasks)) { |
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env) | |||
3257 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) | 3254 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) |
3258 | goto next; | 3255 | goto next; |
3259 | 3256 | ||
3260 | if ((load / 2) > env->load_move) | 3257 | if ((load / 2) > env->imbalance) |
3261 | goto next; | 3258 | goto next; |
3262 | 3259 | ||
3263 | if (!can_migrate_task(p, env)) | 3260 | if (!can_migrate_task(p, env)) |
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env) | |||
3265 | 3262 | ||
3266 | move_task(p, env); | 3263 | move_task(p, env); |
3267 | pulled++; | 3264 | pulled++; |
3268 | env->load_move -= load; | 3265 | env->imbalance -= load; |
3269 | 3266 | ||
3270 | #ifdef CONFIG_PREEMPT | 3267 | #ifdef CONFIG_PREEMPT |
3271 | /* | 3268 | /* |
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env) | |||
3281 | * We only want to steal up to the prescribed amount of | 3278 | * We only want to steal up to the prescribed amount of |
3282 | * weighted load. | 3279 | * weighted load. |
3283 | */ | 3280 | */ |
3284 | if (env->load_move <= 0) | 3281 | if (env->imbalance <= 0) |
3285 | break; | 3282 | break; |
3286 | 3283 | ||
3287 | continue; | 3284 | continue; |
@@ -3435,14 +3432,6 @@ struct sd_lb_stats { | |||
3435 | unsigned int busiest_group_weight; | 3432 | unsigned int busiest_group_weight; |
3436 | 3433 | ||
3437 | int group_imb; /* Is there imbalance in this sd */ | 3434 | int group_imb; /* Is there imbalance in this sd */ |
3438 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3439 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3440 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3441 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3442 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3443 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3444 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3445 | #endif | ||
3446 | }; | 3435 | }; |
3447 | 3436 | ||
3448 | /* | 3437 | /* |
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
3486 | return load_idx; | 3475 | return load_idx; |
3487 | } | 3476 | } |
3488 | 3477 | ||
3489 | |||
3490 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3491 | /** | ||
3492 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3493 | * the given sched_domain, during load balancing. | ||
3494 | * | ||
3495 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3496 | * @sds: Variable containing the statistics for sd. | ||
3497 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3498 | */ | ||
3499 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3500 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3501 | { | ||
3502 | /* | ||
3503 | * Busy processors will not participate in power savings | ||
3504 | * balance. | ||
3505 | */ | ||
3506 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3507 | sds->power_savings_balance = 0; | ||
3508 | else { | ||
3509 | sds->power_savings_balance = 1; | ||
3510 | sds->min_nr_running = ULONG_MAX; | ||
3511 | sds->leader_nr_running = 0; | ||
3512 | } | ||
3513 | } | ||
3514 | |||
3515 | /** | ||
3516 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3517 | * sched_domain while performing load balancing. | ||
3518 | * | ||
3519 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3520 | * @sds: Variable containing the statistics of the sched_domain | ||
3521 | * @local_group: Does group contain the CPU for which we're performing | ||
3522 | * load balancing ? | ||
3523 | * @sgs: Variable containing the statistics of the group. | ||
3524 | */ | ||
3525 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3526 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3527 | { | ||
3528 | |||
3529 | if (!sds->power_savings_balance) | ||
3530 | return; | ||
3531 | |||
3532 | /* | ||
3533 | * If the local group is idle or completely loaded | ||
3534 | * no need to do power savings balance at this domain | ||
3535 | */ | ||
3536 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3537 | !sds->this_nr_running)) | ||
3538 | sds->power_savings_balance = 0; | ||
3539 | |||
3540 | /* | ||
3541 | * If a group is already running at full capacity or idle, | ||
3542 | * don't include that group in power savings calculations | ||
3543 | */ | ||
3544 | if (!sds->power_savings_balance || | ||
3545 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3546 | !sgs->sum_nr_running) | ||
3547 | return; | ||
3548 | |||
3549 | /* | ||
3550 | * Calculate the group which has the least non-idle load. | ||
3551 | * This is the group from where we need to pick up the load | ||
3552 | * for saving power | ||
3553 | */ | ||
3554 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3555 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3556 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3557 | sds->group_min = group; | ||
3558 | sds->min_nr_running = sgs->sum_nr_running; | ||
3559 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3560 | sgs->sum_nr_running; | ||
3561 | } | ||
3562 | |||
3563 | /* | ||
3564 | * Calculate the group which is almost near its | ||
3565 | * capacity but still has some space to pick up some load | ||
3566 | * from other group and save more power | ||
3567 | */ | ||
3568 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3569 | return; | ||
3570 | |||
3571 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3572 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3573 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3574 | sds->group_leader = group; | ||
3575 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3576 | } | ||
3577 | } | ||
3578 | |||
3579 | /** | ||
3580 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3581 | * @sds: Variable containing the statistics of the sched_domain | ||
3582 | * under consideration. | ||
3583 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3584 | * @imbalance: Variable to store the imbalance. | ||
3585 | * | ||
3586 | * Description: | ||
3587 | * Check if we have potential to perform some power-savings balance. | ||
3588 | * If yes, set the busiest group to be the least loaded group in the | ||
3589 | * sched_domain, so that it's CPUs can be put to idle. | ||
3590 | * | ||
3591 | * Returns 1 if there is potential to perform power-savings balance. | ||
3592 | * Else returns 0. | ||
3593 | */ | ||
3594 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3595 | int this_cpu, unsigned long *imbalance) | ||
3596 | { | ||
3597 | if (!sds->power_savings_balance) | ||
3598 | return 0; | ||
3599 | |||
3600 | if (sds->this != sds->group_leader || | ||
3601 | sds->group_leader == sds->group_min) | ||
3602 | return 0; | ||
3603 | |||
3604 | *imbalance = sds->min_load_per_task; | ||
3605 | sds->busiest = sds->group_min; | ||
3606 | |||
3607 | return 1; | ||
3608 | |||
3609 | } | ||
3610 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3611 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3612 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3613 | { | ||
3614 | return; | ||
3615 | } | ||
3616 | |||
3617 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3618 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3619 | { | ||
3620 | return; | ||
3621 | } | ||
3622 | |||
3623 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3624 | int this_cpu, unsigned long *imbalance) | ||
3625 | { | ||
3626 | return 0; | ||
3627 | } | ||
3628 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3629 | |||
3630 | |||
3631 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 3478 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
3632 | { | 3479 | { |
3633 | return SCHED_POWER_SCALE; | 3480 | return SCHED_POWER_SCALE; |
@@ -3656,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
3656 | unsigned long scale_rt_power(int cpu) | 3503 | unsigned long scale_rt_power(int cpu) |
3657 | { | 3504 | { |
3658 | struct rq *rq = cpu_rq(cpu); | 3505 | struct rq *rq = cpu_rq(cpu); |
3659 | u64 total, available; | 3506 | u64 total, available, age_stamp, avg; |
3507 | |||
3508 | /* | ||
3509 | * Since we're reading these variables without serialization make sure | ||
3510 | * we read them once before doing sanity checks on them. | ||
3511 | */ | ||
3512 | age_stamp = ACCESS_ONCE(rq->age_stamp); | ||
3513 | avg = ACCESS_ONCE(rq->rt_avg); | ||
3660 | 3514 | ||
3661 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 3515 | total = sched_avg_period() + (rq->clock - age_stamp); |
3662 | 3516 | ||
3663 | if (unlikely(total < rq->rt_avg)) { | 3517 | if (unlikely(total < avg)) { |
3664 | /* Ensures that power won't end up being negative */ | 3518 | /* Ensures that power won't end up being negative */ |
3665 | available = 0; | 3519 | available = 0; |
3666 | } else { | 3520 | } else { |
3667 | available = total - rq->rt_avg; | 3521 | available = total - avg; |
3668 | } | 3522 | } |
3669 | 3523 | ||
3670 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 3524 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
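scale_rt_power() now snapshots rq->age_stamp and rq->rt_avg into locals before using them: they are read without serialization, and re-reading a value that moved in the meantime could defeat the total < avg sanity check. A sketch of the pattern, with a plain volatile standing in for ACCESS_ONCE() and made-up numbers and names:

	#include <stdio.h>

	/* Shared, concurrently updated values (volatile models ACCESS_ONCE()). */
	static volatile unsigned long long rt_avg = 400000;
	static volatile unsigned long long age_stamp = 1000000;

	static unsigned long long rt_free_capacity(unsigned long long clock,
						   unsigned long long period)
	{
		/* Read each shared value exactly once into a local. */
		unsigned long long stamp = age_stamp;
		unsigned long long avg = rt_avg;
		unsigned long long total = period + (clock - stamp);
		unsigned long long available;

		/* Both branches below see the same snapshot of avg. */
		if (total < avg)
			available = 0;   /* avoid a huge "negative" result */
		else
			available = total - avg;

		return available;
	}

	int main(void)
	{
		printf("available = %llu\n", rt_free_capacity(1500000, 1000000));
		return 0;
	}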
@@ -3727,11 +3581,26 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
3727 | 3581 | ||
3728 | power = 0; | 3582 | power = 0; |
3729 | 3583 | ||
3730 | group = child->groups; | 3584 | if (child->flags & SD_OVERLAP) { |
3731 | do { | 3585 | /* |
3732 | power += group->sgp->power; | 3586 | * SD_OVERLAP domains cannot assume that child groups |
3733 | group = group->next; | 3587 | * span the current group. |
3734 | } while (group != child->groups); | 3588 | */ |
3589 | |||
3590 | for_each_cpu(cpu, sched_group_cpus(sdg)) | ||
3591 | power += power_of(cpu); | ||
3592 | } else { | ||
3593 | /* | ||
3594 | * !SD_OVERLAP domains can assume that child groups | ||
3595 | * span the current group. | ||
3596 | */ | ||
3597 | |||
3598 | group = child->groups; | ||
3599 | do { | ||
3600 | power += group->sgp->power; | ||
3601 | group = group->next; | ||
3602 | } while (group != child->groups); | ||
3603 | } | ||
3735 | 3604 | ||
3736 | sdg->sgp->power = power; | 3605 | sdg->sgp->power = power; |
3737 | } | 3606 | } |
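update_group_power() now distinguishes SD_OVERLAP domains, whose child groups need not span the parent, from the common case where the parent's power is the sum over the circular list of child groups. A simplified sketch of that list walk, using toy structures rather than the kernel's:

#include <stdio.h>

/* Illustrative only: a parent group's power is summed over the circular
 * list of child groups, the same do/while walk as the !SD_OVERLAP branch
 * of update_group_power(). */
struct group {
    unsigned long power;
    struct group *next;      /* circular list of sibling groups */
};

static unsigned long sum_child_groups(struct group *first)
{
    struct group *g = first;
    unsigned long power = 0;

    do {
        power += g->power;
        g = g->next;
    } while (g != first);

    return power;
}

int main(void)
{
    struct group a = { .power = 1024 }, b = { .power = 980 };
    a.next = &b;
    b.next = &a;
    printf("parent power = %lu\n", sum_child_groups(&a));
    return 0;
}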
@@ -3765,24 +3634,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3765 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3634 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3766 | * @sd: The sched_domain whose statistics are to be updated. | 3635 | * @sd: The sched_domain whose statistics are to be updated. |
3767 | * @group: sched_group whose statistics are to be updated. | 3636 | * @group: sched_group whose statistics are to be updated. |
3768 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3769 | * @idle: Idle status of this_cpu | ||
3770 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3637 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3771 | * @local_group: Does group contain this_cpu. | 3638 | * @local_group: Does group contain this_cpu. |
3772 | * @cpus: Set of cpus considered for load balancing. | 3639 | * @cpus: Set of cpus considered for load balancing. |
3773 | * @balance: Should we balance. | 3640 | * @balance: Should we balance. |
3774 | * @sgs: variable to hold the statistics for this group. | 3641 | * @sgs: variable to hold the statistics for this group. |
3775 | */ | 3642 | */ |
3776 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 3643 | static inline void update_sg_lb_stats(struct lb_env *env, |
3777 | struct sched_group *group, int this_cpu, | 3644 | struct sched_group *group, int load_idx, |
3778 | enum cpu_idle_type idle, int load_idx, | ||
3779 | int local_group, const struct cpumask *cpus, | 3645 | int local_group, const struct cpumask *cpus, |
3780 | int *balance, struct sg_lb_stats *sgs) | 3646 | int *balance, struct sg_lb_stats *sgs) |
3781 | { | 3647 | { |
3782 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; | 3648 | unsigned long nr_running, max_nr_running, min_nr_running; |
3783 | int i; | 3649 | unsigned long load, max_cpu_load, min_cpu_load; |
3784 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3650 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
3785 | unsigned long avg_load_per_task = 0; | 3651 | unsigned long avg_load_per_task = 0; |
3652 | int i; | ||
3786 | 3653 | ||
3787 | if (local_group) | 3654 | if (local_group) |
3788 | balance_cpu = group_first_cpu(group); | 3655 | balance_cpu = group_first_cpu(group); |
@@ -3791,10 +3658,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3791 | max_cpu_load = 0; | 3658 | max_cpu_load = 0; |
3792 | min_cpu_load = ~0UL; | 3659 | min_cpu_load = ~0UL; |
3793 | max_nr_running = 0; | 3660 | max_nr_running = 0; |
3661 | min_nr_running = ~0UL; | ||
3794 | 3662 | ||
3795 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3663 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
3796 | struct rq *rq = cpu_rq(i); | 3664 | struct rq *rq = cpu_rq(i); |
3797 | 3665 | ||
3666 | nr_running = rq->nr_running; | ||
3667 | |||
3798 | /* Bias balancing toward cpus of our domain */ | 3668 | /* Bias balancing toward cpus of our domain */ |
3799 | if (local_group) { | 3669 | if (local_group) { |
3800 | if (idle_cpu(i) && !first_idle_cpu) { | 3670 | if (idle_cpu(i) && !first_idle_cpu) { |
@@ -3805,16 +3675,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3805 | load = target_load(i, load_idx); | 3675 | load = target_load(i, load_idx); |
3806 | } else { | 3676 | } else { |
3807 | load = source_load(i, load_idx); | 3677 | load = source_load(i, load_idx); |
3808 | if (load > max_cpu_load) { | 3678 | if (load > max_cpu_load) |
3809 | max_cpu_load = load; | 3679 | max_cpu_load = load; |
3810 | max_nr_running = rq->nr_running; | ||
3811 | } | ||
3812 | if (min_cpu_load > load) | 3680 | if (min_cpu_load > load) |
3813 | min_cpu_load = load; | 3681 | min_cpu_load = load; |
3682 | |||
3683 | if (nr_running > max_nr_running) | ||
3684 | max_nr_running = nr_running; | ||
3685 | if (min_nr_running > nr_running) | ||
3686 | min_nr_running = nr_running; | ||
3814 | } | 3687 | } |
3815 | 3688 | ||
3816 | sgs->group_load += load; | 3689 | sgs->group_load += load; |
3817 | sgs->sum_nr_running += rq->nr_running; | 3690 | sgs->sum_nr_running += nr_running; |
3818 | sgs->sum_weighted_load += weighted_cpuload(i); | 3691 | sgs->sum_weighted_load += weighted_cpuload(i); |
3819 | if (idle_cpu(i)) | 3692 | if (idle_cpu(i)) |
3820 | sgs->idle_cpus++; | 3693 | sgs->idle_cpus++; |
@@ -3827,14 +3700,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3827 | * to do the newly idle load balance. | 3700 | * to do the newly idle load balance. |
3828 | */ | 3701 | */ |
3829 | if (local_group) { | 3702 | if (local_group) { |
3830 | if (idle != CPU_NEWLY_IDLE) { | 3703 | if (env->idle != CPU_NEWLY_IDLE) { |
3831 | if (balance_cpu != this_cpu) { | 3704 | if (balance_cpu != env->dst_cpu) { |
3832 | *balance = 0; | 3705 | *balance = 0; |
3833 | return; | 3706 | return; |
3834 | } | 3707 | } |
3835 | update_group_power(sd, this_cpu); | 3708 | update_group_power(env->sd, env->dst_cpu); |
3836 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | 3709 | } else if (time_after_eq(jiffies, group->sgp->next_update)) |
3837 | update_group_power(sd, this_cpu); | 3710 | update_group_power(env->sd, env->dst_cpu); |
3838 | } | 3711 | } |
3839 | 3712 | ||
3840 | /* Adjust by relative CPU power of the group */ | 3713 | /* Adjust by relative CPU power of the group */ |
@@ -3852,13 +3725,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3852 | if (sgs->sum_nr_running) | 3725 | if (sgs->sum_nr_running) |
3853 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 3726 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
3854 | 3727 | ||
3855 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 3728 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && |
3729 | (max_nr_running - min_nr_running) > 1) | ||
3856 | sgs->group_imb = 1; | 3730 | sgs->group_imb = 1; |
3857 | 3731 | ||
3858 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | 3732 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
3859 | SCHED_POWER_SCALE); | 3733 | SCHED_POWER_SCALE); |
3860 | if (!sgs->group_capacity) | 3734 | if (!sgs->group_capacity) |
3861 | sgs->group_capacity = fix_small_capacity(sd, group); | 3735 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
3862 | sgs->group_weight = group->group_weight; | 3736 | sgs->group_weight = group->group_weight; |
3863 | 3737 | ||
3864 | if (sgs->group_capacity > sgs->sum_nr_running) | 3738 | if (sgs->group_capacity > sgs->sum_nr_running) |
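The reworked imbalance test above requires both a load gap of at least one average task and a real difference in task counts, so evenly populated groups with uneven weights no longer trip group_imb. A small stand-alone illustration with made-up numbers:

#include <stdio.h>

/* Mirrors the new condition: flag a group as internally imbalanced only if
 * the per-CPU load spread is at least one average task *and*
 * (max_nr_running - min_nr_running) > 1. */
static int group_is_imbalanced(unsigned long max_load, unsigned long min_load,
                               unsigned long avg_load_per_task,
                               unsigned long max_nr, unsigned long min_nr)
{
    return (max_load - min_load) >= avg_load_per_task &&
           (max_nr - min_nr) > 1;
}

int main(void)
{
    /* Similar task counts: not imbalanced even though load differs. */
    printf("%d\n", group_is_imbalanced(2048, 1024, 512, 3, 3));
    /* Large load gap and a 3-vs-1 task split: imbalanced. */
    printf("%d\n", group_is_imbalanced(3072, 512, 512, 3, 1));
    return 0;
}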
@@ -3876,11 +3750,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3876 | * Determine if @sg is a busier group than the previously selected | 3750 | * Determine if @sg is a busier group than the previously selected |
3877 | * busiest group. | 3751 | * busiest group. |
3878 | */ | 3752 | */ |
3879 | static bool update_sd_pick_busiest(struct sched_domain *sd, | 3753 | static bool update_sd_pick_busiest(struct lb_env *env, |
3880 | struct sd_lb_stats *sds, | 3754 | struct sd_lb_stats *sds, |
3881 | struct sched_group *sg, | 3755 | struct sched_group *sg, |
3882 | struct sg_lb_stats *sgs, | 3756 | struct sg_lb_stats *sgs) |
3883 | int this_cpu) | ||
3884 | { | 3757 | { |
3885 | if (sgs->avg_load <= sds->max_load) | 3758 | if (sgs->avg_load <= sds->max_load) |
3886 | return false; | 3759 | return false; |
@@ -3896,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3896 | * numbered CPUs in the group, therefore mark all groups | 3769 | * numbered CPUs in the group, therefore mark all groups |
3897 | * higher than ourself as busy. | 3770 | * higher than ourself as busy. |
3898 | */ | 3771 | */ |
3899 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 3772 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && |
3900 | this_cpu < group_first_cpu(sg)) { | 3773 | env->dst_cpu < group_first_cpu(sg)) { |
3901 | if (!sds->busiest) | 3774 | if (!sds->busiest) |
3902 | return true; | 3775 | return true; |
3903 | 3776 | ||
@@ -3917,28 +3790,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3917 | * @balance: Should we balance. | 3790 | * @balance: Should we balance. |
3918 | * @sds: variable to hold the statistics for this sched_domain. | 3791 | * @sds: variable to hold the statistics for this sched_domain. |
3919 | */ | 3792 | */ |
3920 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 3793 | static inline void update_sd_lb_stats(struct lb_env *env, |
3921 | enum cpu_idle_type idle, const struct cpumask *cpus, | 3794 | const struct cpumask *cpus, |
3922 | int *balance, struct sd_lb_stats *sds) | 3795 | int *balance, struct sd_lb_stats *sds) |
3923 | { | 3796 | { |
3924 | struct sched_domain *child = sd->child; | 3797 | struct sched_domain *child = env->sd->child; |
3925 | struct sched_group *sg = sd->groups; | 3798 | struct sched_group *sg = env->sd->groups; |
3926 | struct sg_lb_stats sgs; | 3799 | struct sg_lb_stats sgs; |
3927 | int load_idx, prefer_sibling = 0; | 3800 | int load_idx, prefer_sibling = 0; |
3928 | 3801 | ||
3929 | if (child && child->flags & SD_PREFER_SIBLING) | 3802 | if (child && child->flags & SD_PREFER_SIBLING) |
3930 | prefer_sibling = 1; | 3803 | prefer_sibling = 1; |
3931 | 3804 | ||
3932 | init_sd_power_savings_stats(sd, sds, idle); | 3805 | load_idx = get_sd_load_idx(env->sd, env->idle); |
3933 | load_idx = get_sd_load_idx(sd, idle); | ||
3934 | 3806 | ||
3935 | do { | 3807 | do { |
3936 | int local_group; | 3808 | int local_group; |
3937 | 3809 | ||
3938 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 3810 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3939 | memset(&sgs, 0, sizeof(sgs)); | 3811 | memset(&sgs, 0, sizeof(sgs)); |
3940 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, | 3812 | update_sg_lb_stats(env, sg, load_idx, local_group, |
3941 | local_group, cpus, balance, &sgs); | 3813 | cpus, balance, &sgs); |
3942 | 3814 | ||
3943 | if (local_group && !(*balance)) | 3815 | if (local_group && !(*balance)) |
3944 | return; | 3816 | return; |
@@ -3966,7 +3838,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3966 | sds->this_load_per_task = sgs.sum_weighted_load; | 3838 | sds->this_load_per_task = sgs.sum_weighted_load; |
3967 | sds->this_has_capacity = sgs.group_has_capacity; | 3839 | sds->this_has_capacity = sgs.group_has_capacity; |
3968 | sds->this_idle_cpus = sgs.idle_cpus; | 3840 | sds->this_idle_cpus = sgs.idle_cpus; |
3969 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 3841 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { |
3970 | sds->max_load = sgs.avg_load; | 3842 | sds->max_load = sgs.avg_load; |
3971 | sds->busiest = sg; | 3843 | sds->busiest = sg; |
3972 | sds->busiest_nr_running = sgs.sum_nr_running; | 3844 | sds->busiest_nr_running = sgs.sum_nr_running; |
@@ -3978,9 +3850,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3978 | sds->group_imb = sgs.group_imb; | 3850 | sds->group_imb = sgs.group_imb; |
3979 | } | 3851 | } |
3980 | 3852 | ||
3981 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); | ||
3982 | sg = sg->next; | 3853 | sg = sg->next; |
3983 | } while (sg != sd->groups); | 3854 | } while (sg != env->sd->groups); |
3984 | } | 3855 | } |
3985 | 3856 | ||
3986 | /** | 3857 | /** |
@@ -4008,24 +3879,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
4008 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | 3879 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. |
4009 | * @imbalance: returns amount of imbalanced due to packing. | 3880 | * @imbalance: returns amount of imbalanced due to packing. |
4010 | */ | 3881 | */ |
4011 | static int check_asym_packing(struct sched_domain *sd, | 3882 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
4012 | struct sd_lb_stats *sds, | ||
4013 | int this_cpu, unsigned long *imbalance) | ||
4014 | { | 3883 | { |
4015 | int busiest_cpu; | 3884 | int busiest_cpu; |
4016 | 3885 | ||
4017 | if (!(sd->flags & SD_ASYM_PACKING)) | 3886 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
4018 | return 0; | 3887 | return 0; |
4019 | 3888 | ||
4020 | if (!sds->busiest) | 3889 | if (!sds->busiest) |
4021 | return 0; | 3890 | return 0; |
4022 | 3891 | ||
4023 | busiest_cpu = group_first_cpu(sds->busiest); | 3892 | busiest_cpu = group_first_cpu(sds->busiest); |
4024 | if (this_cpu > busiest_cpu) | 3893 | if (env->dst_cpu > busiest_cpu) |
4025 | return 0; | 3894 | return 0; |
4026 | 3895 | ||
4027 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, | 3896 | env->imbalance = DIV_ROUND_CLOSEST( |
4028 | SCHED_POWER_SCALE); | 3897 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); |
3898 | |||
4029 | return 1; | 3899 | return 1; |
4030 | } | 3900 | } |
4031 | 3901 | ||
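check_asym_packing() converts the busiest group's average load back into task-load units by scaling it with the group's power and rounding to the nearest value. A toy computation with invented numbers, using the same rounding rule as DIV_ROUND_CLOSEST():

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
/* Same rounding as DIV_ROUND_CLOSEST() for non-negative operands. */
#define DIV_ROUND_CLOSEST_UL(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
    /* Hypothetical numbers: busiest group avg load 1536, group power 2048. */
    unsigned long max_load = 1536, group_power = 2048;
    unsigned long imbalance =
        DIV_ROUND_CLOSEST_UL(max_load * group_power, SCHED_POWER_SCALE);

    printf("imbalance = %lu\n", imbalance);  /* 3072 */
    return 0;
}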
@@ -4037,8 +3907,8 @@ static int check_asym_packing(struct sched_domain *sd, | |||
4037 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | 3907 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. |
4038 | * @imbalance: Variable to store the imbalance. | 3908 | * @imbalance: Variable to store the imbalance. |
4039 | */ | 3909 | */ |
4040 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | 3910 | static inline |
4041 | int this_cpu, unsigned long *imbalance) | 3911 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4042 | { | 3912 | { |
4043 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 3913 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4044 | unsigned int imbn = 2; | 3914 | unsigned int imbn = 2; |
@@ -4049,9 +3919,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4049 | if (sds->busiest_load_per_task > | 3919 | if (sds->busiest_load_per_task > |
4050 | sds->this_load_per_task) | 3920 | sds->this_load_per_task) |
4051 | imbn = 1; | 3921 | imbn = 1; |
4052 | } else | 3922 | } else { |
4053 | sds->this_load_per_task = | 3923 | sds->this_load_per_task = |
4054 | cpu_avg_load_per_task(this_cpu); | 3924 | cpu_avg_load_per_task(env->dst_cpu); |
3925 | } | ||
4055 | 3926 | ||
4056 | scaled_busy_load_per_task = sds->busiest_load_per_task | 3927 | scaled_busy_load_per_task = sds->busiest_load_per_task |
4057 | * SCHED_POWER_SCALE; | 3928 | * SCHED_POWER_SCALE; |
@@ -4059,7 +3930,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4059 | 3930 | ||
4060 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3931 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
4061 | (scaled_busy_load_per_task * imbn)) { | 3932 | (scaled_busy_load_per_task * imbn)) { |
4062 | *imbalance = sds->busiest_load_per_task; | 3933 | env->imbalance = sds->busiest_load_per_task; |
4063 | return; | 3934 | return; |
4064 | } | 3935 | } |
4065 | 3936 | ||
@@ -4096,18 +3967,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4096 | 3967 | ||
4097 | /* Move if we gain throughput */ | 3968 | /* Move if we gain throughput */ |
4098 | if (pwr_move > pwr_now) | 3969 | if (pwr_move > pwr_now) |
4099 | *imbalance = sds->busiest_load_per_task; | 3970 | env->imbalance = sds->busiest_load_per_task; |
4100 | } | 3971 | } |
4101 | 3972 | ||
4102 | /** | 3973 | /** |
4103 | * calculate_imbalance - Calculate the amount of imbalance present within the | 3974 | * calculate_imbalance - Calculate the amount of imbalance present within the |
4104 | * groups of a given sched_domain during load balance. | 3975 | * groups of a given sched_domain during load balance. |
3976 | * @env: load balance environment | ||
4105 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | 3977 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. |
4106 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
4107 | * @imbalance: The variable to store the imbalance. | ||
4108 | */ | 3978 | */ |
4109 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | 3979 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4110 | unsigned long *imbalance) | ||
4111 | { | 3980 | { |
4112 | unsigned long max_pull, load_above_capacity = ~0UL; | 3981 | unsigned long max_pull, load_above_capacity = ~0UL; |
4113 | 3982 | ||
@@ -4123,8 +3992,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4123 | * its cpu_power, while calculating max_load..) | 3992 | * its cpu_power, while calculating max_load..) |
4124 | */ | 3993 | */ |
4125 | if (sds->max_load < sds->avg_load) { | 3994 | if (sds->max_load < sds->avg_load) { |
4126 | *imbalance = 0; | 3995 | env->imbalance = 0; |
4127 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3996 | return fix_small_imbalance(env, sds); |
4128 | } | 3997 | } |
4129 | 3998 | ||
4130 | if (!sds->group_imb) { | 3999 | if (!sds->group_imb) { |
@@ -4152,7 +4021,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4152 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4021 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
4153 | 4022 | ||
4154 | /* How much load to actually move to equalise the imbalance */ | 4023 | /* How much load to actually move to equalise the imbalance */ |
4155 | *imbalance = min(max_pull * sds->busiest->sgp->power, | 4024 | env->imbalance = min(max_pull * sds->busiest->sgp->power, |
4156 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4025 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
4157 | / SCHED_POWER_SCALE; | 4026 | / SCHED_POWER_SCALE; |
4158 | 4027 | ||
@@ -4162,8 +4031,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4162 | * a think about bumping its value to force at least one task to be | 4031 | * a think about bumping its value to force at least one task to be |
4163 | * moved | 4032 | * moved |
4164 | */ | 4033 | */ |
4165 | if (*imbalance < sds->busiest_load_per_task) | 4034 | if (env->imbalance < sds->busiest_load_per_task) |
4166 | return fix_small_imbalance(sds, this_cpu, imbalance); | 4035 | return fix_small_imbalance(env, sds); |
4167 | 4036 | ||
4168 | } | 4037 | } |
4169 | 4038 | ||
@@ -4194,9 +4063,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4194 | * put to idle by rebalancing its tasks onto our group. | 4063 | * put to idle by rebalancing its tasks onto our group. |
4195 | */ | 4064 | */ |
4196 | static struct sched_group * | 4065 | static struct sched_group * |
4197 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 4066 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) |
4198 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4199 | const struct cpumask *cpus, int *balance) | ||
4200 | { | 4067 | { |
4201 | struct sd_lb_stats sds; | 4068 | struct sd_lb_stats sds; |
4202 | 4069 | ||
@@ -4206,7 +4073,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4206 | * Compute the various statistics relevant for load balancing at | 4073 | * Compute the various statistics relevant for load balancing at |
4207 | * this level. | 4074 | * this level. |
4208 | */ | 4075 | */ |
4209 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); | 4076 | update_sd_lb_stats(env, cpus, balance, &sds); |
4210 | 4077 | ||
4211 | /* | 4078 | /* |
4212 | * this_cpu is not the appropriate cpu to perform load balancing at | 4079 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4215,8 +4082,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4215 | if (!(*balance)) | 4082 | if (!(*balance)) |
4216 | goto ret; | 4083 | goto ret; |
4217 | 4084 | ||
4218 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | 4085 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4219 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 4086 | check_asym_packing(env, &sds)) |
4220 | return sds.busiest; | 4087 | return sds.busiest; |
4221 | 4088 | ||
4222 | /* There is no busy sibling group to pull tasks from */ | 4089 | /* There is no busy sibling group to pull tasks from */ |
@@ -4234,7 +4101,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4234 | goto force_balance; | 4101 | goto force_balance; |
4235 | 4102 | ||
4236 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4103 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4237 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4104 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
4238 | !sds.busiest_has_capacity) | 4105 | !sds.busiest_has_capacity) |
4239 | goto force_balance; | 4106 | goto force_balance; |
4240 | 4107 | ||
@@ -4252,7 +4119,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4252 | if (sds.this_load >= sds.avg_load) | 4119 | if (sds.this_load >= sds.avg_load) |
4253 | goto out_balanced; | 4120 | goto out_balanced; |
4254 | 4121 | ||
4255 | if (idle == CPU_IDLE) { | 4122 | if (env->idle == CPU_IDLE) { |
4256 | /* | 4123 | /* |
4257 | * This cpu is idle. If the busiest group load doesn't | 4124 | * This cpu is idle. If the busiest group load doesn't |
4258 | * have more tasks than the number of available cpu's and | 4125 | * have more tasks than the number of available cpu's and |
@@ -4267,34 +4134,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4267 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 4134 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4268 | * imbalance_pct to be conservative. | 4135 | * imbalance_pct to be conservative. |
4269 | */ | 4136 | */ |
4270 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 4137 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) |
4271 | goto out_balanced; | 4138 | goto out_balanced; |
4272 | } | 4139 | } |
4273 | 4140 | ||
4274 | force_balance: | 4141 | force_balance: |
4275 | /* Looks like there is an imbalance. Compute it */ | 4142 | /* Looks like there is an imbalance. Compute it */ |
4276 | calculate_imbalance(&sds, this_cpu, imbalance); | 4143 | calculate_imbalance(env, &sds); |
4277 | return sds.busiest; | 4144 | return sds.busiest; |
4278 | 4145 | ||
4279 | out_balanced: | 4146 | out_balanced: |
4280 | /* | ||
4281 | * There is no obvious imbalance. But check if we can do some balancing | ||
4282 | * to save power. | ||
4283 | */ | ||
4284 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4285 | return sds.busiest; | ||
4286 | ret: | 4147 | ret: |
4287 | *imbalance = 0; | 4148 | env->imbalance = 0; |
4288 | return NULL; | 4149 | return NULL; |
4289 | } | 4150 | } |
4290 | 4151 | ||
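The recurring change in these hunks is mechanical: the per-call parameters (this_cpu, idle, the imbalance out-parameter) move into the lb_env context that load_balance() already builds, so the helpers take one pointer instead of a growing argument list. A user-space sketch of that refactoring pattern, with field names loosely modelled on lb_env:

#include <stdio.h>

enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

/* One environment struct carries both inputs and outputs of a balance pass. */
struct lb_env {
    int dst_cpu;
    int src_cpu;
    enum cpu_idle_type idle;
    unsigned long imbalance;   /* written by the helpers below */
};

static void calculate_imbalance(struct lb_env *env, unsigned long max_load,
                                unsigned long avg_load)
{
    env->imbalance = (max_load > avg_load) ? max_load - avg_load : 0;
}

int main(void)
{
    struct lb_env env = { .dst_cpu = 0, .src_cpu = 2, .idle = CPU_NEWLY_IDLE };

    calculate_imbalance(&env, 2048, 1536);
    printf("move %lu from cpu%d to cpu%d\n",
           env.imbalance, env.src_cpu, env.dst_cpu);
    return 0;
}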
4291 | /* | 4152 | /* |
4292 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4153 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4293 | */ | 4154 | */ |
4294 | static struct rq * | 4155 | static struct rq *find_busiest_queue(struct lb_env *env, |
4295 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | 4156 | struct sched_group *group, |
4296 | enum cpu_idle_type idle, unsigned long imbalance, | 4157 | const struct cpumask *cpus) |
4297 | const struct cpumask *cpus) | ||
4298 | { | 4158 | { |
4299 | struct rq *busiest = NULL, *rq; | 4159 | struct rq *busiest = NULL, *rq; |
4300 | unsigned long max_load = 0; | 4160 | unsigned long max_load = 0; |
@@ -4307,7 +4167,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4307 | unsigned long wl; | 4167 | unsigned long wl; |
4308 | 4168 | ||
4309 | if (!capacity) | 4169 | if (!capacity) |
4310 | capacity = fix_small_capacity(sd, group); | 4170 | capacity = fix_small_capacity(env->sd, group); |
4311 | 4171 | ||
4312 | if (!cpumask_test_cpu(i, cpus)) | 4172 | if (!cpumask_test_cpu(i, cpus)) |
4313 | continue; | 4173 | continue; |
@@ -4319,7 +4179,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4319 | * When comparing with imbalance, use weighted_cpuload() | 4179 | * When comparing with imbalance, use weighted_cpuload() |
4320 | * which is not scaled with the cpu power. | 4180 | * which is not scaled with the cpu power. |
4321 | */ | 4181 | */ |
4322 | if (capacity && rq->nr_running == 1 && wl > imbalance) | 4182 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) |
4323 | continue; | 4183 | continue; |
4324 | 4184 | ||
4325 | /* | 4185 | /* |
@@ -4348,40 +4208,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4348 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4208 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4349 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4209 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
4350 | 4210 | ||
4351 | static int need_active_balance(struct sched_domain *sd, int idle, | 4211 | static int need_active_balance(struct lb_env *env) |
4352 | int busiest_cpu, int this_cpu) | ||
4353 | { | 4212 | { |
4354 | if (idle == CPU_NEWLY_IDLE) { | 4213 | struct sched_domain *sd = env->sd; |
4214 | |||
4215 | if (env->idle == CPU_NEWLY_IDLE) { | ||
4355 | 4216 | ||
4356 | /* | 4217 | /* |
4357 | * ASYM_PACKING needs to force migrate tasks from busy but | 4218 | * ASYM_PACKING needs to force migrate tasks from busy but |
4358 | * higher numbered CPUs in order to pack all tasks in the | 4219 | * higher numbered CPUs in order to pack all tasks in the |
4359 | * lowest numbered CPUs. | 4220 | * lowest numbered CPUs. |
4360 | */ | 4221 | */ |
4361 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | 4222 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) |
4362 | return 1; | 4223 | return 1; |
4363 | |||
4364 | /* | ||
4365 | * The only task running in a non-idle cpu can be moved to this | ||
4366 | * cpu in an attempt to completely freeup the other CPU | ||
4367 | * package. | ||
4368 | * | ||
4369 | * The package power saving logic comes from | ||
4370 | * find_busiest_group(). If there are no imbalance, then | ||
4371 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4372 | * f_b_g() will select a group from which a running task may be | ||
4373 | * pulled to this cpu in order to make the other package idle. | ||
4374 | * If there is no opportunity to make a package idle and if | ||
4375 | * there are no imbalance, then f_b_g() will return NULL and no | ||
4376 | * action will be taken in load_balance_newidle(). | ||
4377 | * | ||
4378 | * Under normal task pull operation due to imbalance, there | ||
4379 | * will be more than one task in the source run queue and | ||
4380 | * move_tasks() will succeed. ld_moved will be true and this | ||
4381 | * active balance code will not be triggered. | ||
4382 | */ | ||
4383 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4384 | return 0; | ||
4385 | } | 4224 | } |
4386 | 4225 | ||
4387 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 4226 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
@@ -4399,7 +4238,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4399 | { | 4238 | { |
4400 | int ld_moved, active_balance = 0; | 4239 | int ld_moved, active_balance = 0; |
4401 | struct sched_group *group; | 4240 | struct sched_group *group; |
4402 | unsigned long imbalance; | ||
4403 | struct rq *busiest; | 4241 | struct rq *busiest; |
4404 | unsigned long flags; | 4242 | unsigned long flags; |
4405 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4243 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
@@ -4417,8 +4255,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4417 | schedstat_inc(sd, lb_count[idle]); | 4255 | schedstat_inc(sd, lb_count[idle]); |
4418 | 4256 | ||
4419 | redo: | 4257 | redo: |
4420 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, | 4258 | group = find_busiest_group(&env, cpus, balance); |
4421 | cpus, balance); | ||
4422 | 4259 | ||
4423 | if (*balance == 0) | 4260 | if (*balance == 0) |
4424 | goto out_balanced; | 4261 | goto out_balanced; |
@@ -4428,7 +4265,7 @@ redo: | |||
4428 | goto out_balanced; | 4265 | goto out_balanced; |
4429 | } | 4266 | } |
4430 | 4267 | ||
4431 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); | 4268 | busiest = find_busiest_queue(&env, group, cpus); |
4432 | if (!busiest) { | 4269 | if (!busiest) { |
4433 | schedstat_inc(sd, lb_nobusyq[idle]); | 4270 | schedstat_inc(sd, lb_nobusyq[idle]); |
4434 | goto out_balanced; | 4271 | goto out_balanced; |
@@ -4436,7 +4273,7 @@ redo: | |||
4436 | 4273 | ||
4437 | BUG_ON(busiest == this_rq); | 4274 | BUG_ON(busiest == this_rq); |
4438 | 4275 | ||
4439 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 4276 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4440 | 4277 | ||
4441 | ld_moved = 0; | 4278 | ld_moved = 0; |
4442 | if (busiest->nr_running > 1) { | 4279 | if (busiest->nr_running > 1) { |
@@ -4447,10 +4284,9 @@ redo: | |||
4447 | * correctly treated as an imbalance. | 4284 | * correctly treated as an imbalance. |
4448 | */ | 4285 | */ |
4449 | env.flags |= LBF_ALL_PINNED; | 4286 | env.flags |= LBF_ALL_PINNED; |
4450 | env.load_move = imbalance; | 4287 | env.src_cpu = busiest->cpu; |
4451 | env.src_cpu = busiest->cpu; | 4288 | env.src_rq = busiest; |
4452 | env.src_rq = busiest; | 4289 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4453 | env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); | ||
4454 | 4290 | ||
4455 | more_balance: | 4291 | more_balance: |
4456 | local_irq_save(flags); | 4292 | local_irq_save(flags); |
@@ -4492,7 +4328,7 @@ more_balance: | |||
4492 | if (idle != CPU_NEWLY_IDLE) | 4328 | if (idle != CPU_NEWLY_IDLE) |
4493 | sd->nr_balance_failed++; | 4329 | sd->nr_balance_failed++; |
4494 | 4330 | ||
4495 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { | 4331 | if (need_active_balance(&env)) { |
4496 | raw_spin_lock_irqsave(&busiest->lock, flags); | 4332 | raw_spin_lock_irqsave(&busiest->lock, flags); |
4497 | 4333 | ||
4498 | /* don't kick the active_load_balance_cpu_stop, | 4334 | /* don't kick the active_load_balance_cpu_stop, |
@@ -4519,10 +4355,11 @@ more_balance: | |||
4519 | } | 4355 | } |
4520 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 4356 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
4521 | 4357 | ||
4522 | if (active_balance) | 4358 | if (active_balance) { |
4523 | stop_one_cpu_nowait(cpu_of(busiest), | 4359 | stop_one_cpu_nowait(cpu_of(busiest), |
4524 | active_load_balance_cpu_stop, busiest, | 4360 | active_load_balance_cpu_stop, busiest, |
4525 | &busiest->active_balance_work); | 4361 | &busiest->active_balance_work); |
4362 | } | ||
4526 | 4363 | ||
4527 | /* | 4364 | /* |
4528 | * We've kicked active balancing, reset the failure | 4365 | * We've kicked active balancing, reset the failure |
@@ -4703,104 +4540,15 @@ static struct { | |||
4703 | unsigned long next_balance; /* in jiffy units */ | 4540 | unsigned long next_balance; /* in jiffy units */ |
4704 | } nohz ____cacheline_aligned; | 4541 | } nohz ____cacheline_aligned; |
4705 | 4542 | ||
4706 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4543 | static inline int find_new_ilb(int call_cpu) |
4707 | /** | ||
4708 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4709 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4710 | * be returned. | ||
4711 | * @flag: The flag to check for the lowest sched_domain | ||
4712 | * for the given cpu. | ||
4713 | * | ||
4714 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4715 | */ | ||
4716 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4717 | { | ||
4718 | struct sched_domain *sd; | ||
4719 | |||
4720 | for_each_domain(cpu, sd) | ||
4721 | if (sd->flags & flag) | ||
4722 | break; | ||
4723 | |||
4724 | return sd; | ||
4725 | } | ||
4726 | |||
4727 | /** | ||
4728 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4729 | * @cpu: The cpu whose domains we're iterating over. | ||
4730 | * @sd: variable holding the value of the power_savings_sd | ||
4731 | * for cpu. | ||
4732 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4733 | * | ||
4734 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4735 | * set, starting from the lowest sched_domain to the highest. | ||
4736 | */ | ||
4737 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4738 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4739 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4740 | |||
4741 | /** | ||
4742 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4743 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4744 | * | ||
4745 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4746 | * Else, returns >= nr_cpu_ids. | ||
4747 | * | ||
4748 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4749 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4750 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4751 | * when there are other idle cpu's which are better suited for that job. | ||
4752 | */ | ||
4753 | static int find_new_ilb(int cpu) | ||
4754 | { | 4544 | { |
4755 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 4545 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
4756 | struct sched_group *ilbg; | ||
4757 | struct sched_domain *sd; | ||
4758 | 4546 | ||
4759 | /* | ||
4760 | * Have idle load balancer selection from semi-idle packages only | ||
4761 | * when power-aware load balancing is enabled | ||
4762 | */ | ||
4763 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4764 | goto out_done; | ||
4765 | |||
4766 | /* | ||
4767 | * Optimize for the case when we have no idle CPUs or only one | ||
4768 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4769 | */ | ||
4770 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | ||
4771 | goto out_done; | ||
4772 | |||
4773 | rcu_read_lock(); | ||
4774 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4775 | ilbg = sd->groups; | ||
4776 | |||
4777 | do { | ||
4778 | if (ilbg->group_weight != | ||
4779 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { | ||
4780 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4781 | sched_group_cpus(ilbg)); | ||
4782 | goto unlock; | ||
4783 | } | ||
4784 | |||
4785 | ilbg = ilbg->next; | ||
4786 | |||
4787 | } while (ilbg != sd->groups); | ||
4788 | } | ||
4789 | unlock: | ||
4790 | rcu_read_unlock(); | ||
4791 | |||
4792 | out_done: | ||
4793 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 4547 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4794 | return ilb; | 4548 | return ilb; |
4795 | 4549 | ||
4796 | return nr_cpu_ids; | 4550 | return nr_cpu_ids; |
4797 | } | 4551 | } |
4798 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4799 | static inline int find_new_ilb(int call_cpu) | ||
4800 | { | ||
4801 | return nr_cpu_ids; | ||
4802 | } | ||
4803 | #endif | ||
4804 | 4552 | ||
4805 | /* | 4553 | /* |
4806 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 4554 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the |
@@ -5023,7 +4771,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
5023 | 4771 | ||
5024 | raw_spin_lock_irq(&this_rq->lock); | 4772 | raw_spin_lock_irq(&this_rq->lock); |
5025 | update_rq_clock(this_rq); | 4773 | update_rq_clock(this_rq); |
5026 | update_cpu_load(this_rq); | 4774 | update_idle_cpu_load(this_rq); |
5027 | raw_spin_unlock_irq(&this_rq->lock); | 4775 | raw_spin_unlock_irq(&this_rq->lock); |
5028 | 4776 | ||
5029 | rebalance_domains(balance_cpu, CPU_IDLE); | 4777 | rebalance_domains(balance_cpu, CPU_IDLE); |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f289..b44d604b35d1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
5 | * | 5 | * |
6 | * (NOTE: these are not related to SCHED_IDLE tasks which are | 6 | * (NOTE: these are not related to SCHED_IDLE tasks which are |
7 | * handled in sched_fair.c) | 7 | * handled in sched/fair.c) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d0..2a4e8dffbd6b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) | |||
274 | 274 | ||
275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
276 | { | 276 | { |
277 | struct task_struct *p; | ||
278 | |||
277 | if (!rt_entity_is_task(rt_se)) | 279 | if (!rt_entity_is_task(rt_se)) |
278 | return; | 280 | return; |
279 | 281 | ||
282 | p = rt_task_of(rt_se); | ||
280 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 283 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
281 | 284 | ||
282 | rt_rq->rt_nr_total++; | 285 | rt_rq->rt_nr_total++; |
283 | if (rt_se->nr_cpus_allowed > 1) | 286 | if (p->nr_cpus_allowed > 1) |
284 | rt_rq->rt_nr_migratory++; | 287 | rt_rq->rt_nr_migratory++; |
285 | 288 | ||
286 | update_rt_migration(rt_rq); | 289 | update_rt_migration(rt_rq); |
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
288 | 291 | ||
289 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 292 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
290 | { | 293 | { |
294 | struct task_struct *p; | ||
295 | |||
291 | if (!rt_entity_is_task(rt_se)) | 296 | if (!rt_entity_is_task(rt_se)) |
292 | return; | 297 | return; |
293 | 298 | ||
299 | p = rt_task_of(rt_se); | ||
294 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 300 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
295 | 301 | ||
296 | rt_rq->rt_nr_total--; | 302 | rt_rq->rt_nr_total--; |
297 | if (rt_se->nr_cpus_allowed > 1) | 303 | if (p->nr_cpus_allowed > 1) |
298 | rt_rq->rt_nr_migratory--; | 304 | rt_rq->rt_nr_migratory--; |
299 | 305 | ||
300 | update_rt_migration(rt_rq); | 306 | update_rt_migration(rt_rq); |
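With nr_cpus_allowed moved from the RT scheduling entity to task_struct, the migratory accounting now reads the task directly. A simplified stand-alone version of that bookkeeping, using toy structs rather than the kernel's:

#include <stdio.h>

/* A task counts as "migratory" when it may run on more than one CPU. */
struct task { int nr_cpus_allowed; };
struct rt_rq { unsigned int rt_nr_total, rt_nr_migratory; };

static void inc_rt_migration(struct rt_rq *rt_rq, struct task *p)
{
    rt_rq->rt_nr_total++;
    if (p->nr_cpus_allowed > 1)
        rt_rq->rt_nr_migratory++;
}

static void dec_rt_migration(struct rt_rq *rt_rq, struct task *p)
{
    rt_rq->rt_nr_total--;
    if (p->nr_cpus_allowed > 1)
        rt_rq->rt_nr_migratory--;
}

int main(void)
{
    struct rt_rq rq = { 0, 0 };
    struct task pinned = { .nr_cpus_allowed = 1 };
    struct task movable = { .nr_cpus_allowed = 4 };

    inc_rt_migration(&rq, &pinned);
    inc_rt_migration(&rq, &movable);
    printf("total=%u migratory=%u\n", rq.rt_nr_total, rq.rt_nr_migratory);
    dec_rt_migration(&rq, &movable);
    printf("total=%u migratory=%u\n", rq.rt_nr_total, rq.rt_nr_migratory);
    return 0;
}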
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1161 | 1167 | ||
1162 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1168 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); |
1163 | 1169 | ||
1164 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1170 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
1165 | enqueue_pushable_task(rq, p); | 1171 | enqueue_pushable_task(rq, p); |
1166 | 1172 | ||
1167 | inc_nr_running(rq); | 1173 | inc_nr_running(rq); |
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1225 | 1231 | ||
1226 | cpu = task_cpu(p); | 1232 | cpu = task_cpu(p); |
1227 | 1233 | ||
1228 | if (p->rt.nr_cpus_allowed == 1) | 1234 | if (p->nr_cpus_allowed == 1) |
1229 | goto out; | 1235 | goto out; |
1230 | 1236 | ||
1231 | /* For anything but wake ups, just return the task_cpu */ | 1237 | /* For anything but wake ups, just return the task_cpu */ |
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1260 | * will have to sort it out. | 1266 | * will have to sort it out. |
1261 | */ | 1267 | */ |
1262 | if (curr && unlikely(rt_task(curr)) && | 1268 | if (curr && unlikely(rt_task(curr)) && |
1263 | (curr->rt.nr_cpus_allowed < 2 || | 1269 | (curr->nr_cpus_allowed < 2 || |
1264 | curr->prio <= p->prio) && | 1270 | curr->prio <= p->prio) && |
1265 | (p->rt.nr_cpus_allowed > 1)) { | 1271 | (p->nr_cpus_allowed > 1)) { |
1266 | int target = find_lowest_rq(p); | 1272 | int target = find_lowest_rq(p); |
1267 | 1273 | ||
1268 | if (target != -1) | 1274 | if (target != -1) |
@@ -1276,10 +1282,10 @@ out: | |||
1276 | 1282 | ||
1277 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1283 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
1278 | { | 1284 | { |
1279 | if (rq->curr->rt.nr_cpus_allowed == 1) | 1285 | if (rq->curr->nr_cpus_allowed == 1) |
1280 | return; | 1286 | return; |
1281 | 1287 | ||
1282 | if (p->rt.nr_cpus_allowed != 1 | 1288 | if (p->nr_cpus_allowed != 1 |
1283 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1289 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
1284 | return; | 1290 | return; |
1285 | 1291 | ||
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1395 | * The previous task needs to be made eligible for pushing | 1401 | * The previous task needs to be made eligible for pushing |
1396 | * if it is still active | 1402 | * if it is still active |
1397 | */ | 1403 | */ |
1398 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) | 1404 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
1399 | enqueue_pushable_task(rq, p); | 1405 | enqueue_pushable_task(rq, p); |
1400 | } | 1406 | } |
1401 | 1407 | ||
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1408 | { | 1414 | { |
1409 | if (!task_running(rq, p) && | 1415 | if (!task_running(rq, p) && |
1410 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1416 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1411 | (p->rt.nr_cpus_allowed > 1)) | 1417 | (p->nr_cpus_allowed > 1)) |
1412 | return 1; | 1418 | return 1; |
1413 | return 0; | 1419 | return 0; |
1414 | } | 1420 | } |
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1464 | if (unlikely(!lowest_mask)) | 1470 | if (unlikely(!lowest_mask)) |
1465 | return -1; | 1471 | return -1; |
1466 | 1472 | ||
1467 | if (task->rt.nr_cpus_allowed == 1) | 1473 | if (task->nr_cpus_allowed == 1) |
1468 | return -1; /* No other targets possible */ | 1474 | return -1; /* No other targets possible */ |
1469 | 1475 | ||
1470 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) | 1476 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1586 | 1592 | ||
1587 | BUG_ON(rq->cpu != task_cpu(p)); | 1593 | BUG_ON(rq->cpu != task_cpu(p)); |
1588 | BUG_ON(task_current(rq, p)); | 1594 | BUG_ON(task_current(rq, p)); |
1589 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1595 | BUG_ON(p->nr_cpus_allowed <= 1); |
1590 | 1596 | ||
1591 | BUG_ON(!p->on_rq); | 1597 | BUG_ON(!p->on_rq); |
1592 | BUG_ON(!rt_task(p)); | 1598 | BUG_ON(!rt_task(p)); |
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1793 | if (!task_running(rq, p) && | 1799 | if (!task_running(rq, p) && |
1794 | !test_tsk_need_resched(rq->curr) && | 1800 | !test_tsk_need_resched(rq->curr) && |
1795 | has_pushable_tasks(rq) && | 1801 | has_pushable_tasks(rq) && |
1796 | p->rt.nr_cpus_allowed > 1 && | 1802 | p->nr_cpus_allowed > 1 && |
1797 | rt_task(rq->curr) && | 1803 | rt_task(rq->curr) && |
1798 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1804 | (rq->curr->nr_cpus_allowed < 2 || |
1799 | rq->curr->prio <= p->prio)) | 1805 | rq->curr->prio <= p->prio)) |
1800 | push_rt_tasks(rq); | 1806 | push_rt_tasks(rq); |
1801 | } | 1807 | } |
@@ -1803,44 +1809,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1803 | static void set_cpus_allowed_rt(struct task_struct *p, | 1809 | static void set_cpus_allowed_rt(struct task_struct *p, |
1804 | const struct cpumask *new_mask) | 1810 | const struct cpumask *new_mask) |
1805 | { | 1811 | { |
1806 | int weight = cpumask_weight(new_mask); | 1812 | struct rq *rq; |
1813 | int weight; | ||
1807 | 1814 | ||
1808 | BUG_ON(!rt_task(p)); | 1815 | BUG_ON(!rt_task(p)); |
1809 | 1816 | ||
1810 | /* | 1817 | if (!p->on_rq) |
1811 | * Update the migration status of the RQ if we have an RT task | 1818 | return; |
1812 | * which is running AND changing its weight value. | ||
1813 | */ | ||
1814 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
1815 | struct rq *rq = task_rq(p); | ||
1816 | 1819 | ||
1817 | if (!task_current(rq, p)) { | 1820 | weight = cpumask_weight(new_mask); |
1818 | /* | ||
1819 | * Make sure we dequeue this task from the pushable list | ||
1820 | * before going further. It will either remain off of | ||
1821 | * the list because we are no longer pushable, or it | ||
1822 | * will be requeued. | ||
1823 | */ | ||
1824 | if (p->rt.nr_cpus_allowed > 1) | ||
1825 | dequeue_pushable_task(rq, p); | ||
1826 | 1821 | ||
1827 | /* | 1822 | /* |
1828 | * Requeue if our weight is changing and still > 1 | 1823 | * Only update if the process changes its state from whether it |
1829 | */ | 1824 | * can migrate or not. |
1830 | if (weight > 1) | 1825 | */ |
1831 | enqueue_pushable_task(rq, p); | 1826 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
1832 | 1827 | return; | |
1833 | } | ||
1834 | 1828 | ||
1835 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | 1829 | rq = task_rq(p); |
1836 | rq->rt.rt_nr_migratory++; | ||
1837 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
1838 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1839 | rq->rt.rt_nr_migratory--; | ||
1840 | } | ||
1841 | 1830 | ||
1842 | update_rt_migration(&rq->rt); | 1831 | /* |
1832 | * The process used to be able to migrate OR it can now migrate | ||
1833 | */ | ||
1834 | if (weight <= 1) { | ||
1835 | if (!task_current(rq, p)) | ||
1836 | dequeue_pushable_task(rq, p); | ||
1837 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1838 | rq->rt.rt_nr_migratory--; | ||
1839 | } else { | ||
1840 | if (!task_current(rq, p)) | ||
1841 | enqueue_pushable_task(rq, p); | ||
1842 | rq->rt.rt_nr_migratory++; | ||
1843 | } | 1843 | } |
1844 | |||
1845 | update_rt_migration(&rq->rt); | ||
1844 | } | 1846 | } |
1845 | 1847 | ||
1846 | /* Assumes rq->lock is held */ | 1848 | /* Assumes rq->lock is held */ |
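The rewritten set_cpus_allowed_rt() bails out unless the task actually crosses the pinned/migratable boundary, i.e. unless the truth value of (nr_cpus_allowed > 1) changes. A tiny illustration of that test:

#include <stdio.h>

/* Both sides reduce to "can this task migrate?"; if the answer is the same
 * before and after, the migratory counters need no update. */
static int migratability_changes(int old_nr_cpus_allowed, int new_weight)
{
    return (old_nr_cpus_allowed > 1) != (new_weight > 1);
}

int main(void)
{
    printf("%d\n", migratability_changes(4, 2));  /* still migratable: 0 */
    printf("%d\n", migratability_changes(4, 1));  /* becomes pinned:   1 */
    printf("%d\n", migratability_changes(1, 8));  /* becomes movable:  1 */
    return 0;
}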
@@ -1983,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1983 | 1985 | ||
1984 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 1986 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
1985 | { | 1987 | { |
1988 | struct sched_rt_entity *rt_se = &p->rt; | ||
1989 | |||
1986 | update_curr_rt(rq); | 1990 | update_curr_rt(rq); |
1987 | 1991 | ||
1988 | watchdog(rq, p); | 1992 | watchdog(rq, p); |
@@ -2000,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
2000 | p->rt.time_slice = RR_TIMESLICE; | 2004 | p->rt.time_slice = RR_TIMESLICE; |
2001 | 2005 | ||
2002 | /* | 2006 | /* |
2003 | * Requeue to the end of queue if we are not the only element | 2007 | * Requeue to the end of queue if we (and all of our ancestors) are the |
2004 | * on the queue: | 2008 | * only element on the queue |
2005 | */ | 2009 | */ |
2006 | if (p->rt.run_list.prev != p->rt.run_list.next) { | 2010 | for_each_sched_rt_entity(rt_se) { |
2007 | requeue_task_rt(rq, p, 0); | 2011 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
2008 | set_tsk_need_resched(p); | 2012 | requeue_task_rt(rq, p, 0); |
2013 | set_tsk_need_resched(p); | ||
2014 | return; | ||
2015 | } | ||
2009 | } | 2016 | } |
2010 | } | 2017 | } |
2011 | 2018 | ||
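task_tick_rt() now walks every level of the RT entity hierarchy and requeues only when some level has a sibling queued, which the kernel detects with the run_list.prev != run_list.next check on its circular lists. A minimal user-space sketch of that "am I alone on my queue?" test:

#include <stdio.h>

/* For a node on a circular doubly linked list, prev == next exactly when the
 * only other node is the list head, i.e. no sibling is queued. */
struct list_node { struct list_node *prev, *next; };

static void list_init(struct list_node *head) { head->prev = head->next = head; }

static void list_add_tail(struct list_node *n, struct list_node *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

static int has_sibling(struct list_node *n) { return n->prev != n->next; }

int main(void)
{
    struct list_node queue, a, b;

    list_init(&queue);
    list_add_tail(&a, &queue);
    printf("alone: has_sibling=%d\n", has_sibling(&a));   /* 0 */
    list_add_tail(&b, &queue);
    printf("with b: has_sibling=%d\n", has_sibling(&a));  /* 1 */
    return 0;
}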
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52e..ba9dccfd24ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -201,7 +201,7 @@ struct cfs_bandwidth { }; | |||
201 | /* CFS-related fields in a runqueue */ | 201 | /* CFS-related fields in a runqueue */ |
202 | struct cfs_rq { | 202 | struct cfs_rq { |
203 | struct load_weight load; | 203 | struct load_weight load; |
204 | unsigned long nr_running, h_nr_running; | 204 | unsigned int nr_running, h_nr_running; |
205 | 205 | ||
206 | u64 exec_clock; | 206 | u64 exec_clock; |
207 | u64 min_vruntime; | 207 | u64 min_vruntime; |
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void) | |||
279 | /* Real-Time classes' related field in a runqueue: */ | 279 | /* Real-Time classes' related field in a runqueue: */ |
280 | struct rt_rq { | 280 | struct rt_rq { |
281 | struct rt_prio_array active; | 281 | struct rt_prio_array active; |
282 | unsigned long rt_nr_running; | 282 | unsigned int rt_nr_running; |
283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
284 | struct { | 284 | struct { |
285 | int curr; /* highest queued rt task prio */ | 285 | int curr; /* highest queued rt task prio */ |
@@ -353,7 +353,7 @@ struct rq { | |||
353 | * nr_running and cpu_load should be in the same cacheline because | 353 | * nr_running and cpu_load should be in the same cacheline because |
354 | * remote CPUs use both these fields when doing load calculation. | 354 | * remote CPUs use both these fields when doing load calculation. |
355 | */ | 355 | */ |
356 | unsigned long nr_running; | 356 | unsigned int nr_running; |
357 | #define CPU_LOAD_IDX_MAX 5 | 357 | #define CPU_LOAD_IDX_MAX 5 |
358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
359 | unsigned long last_load_update_tick; | 359 | unsigned long last_load_update_tick; |
@@ -876,7 +876,7 @@ extern void resched_cpu(int cpu); | |||
876 | extern struct rt_bandwidth def_rt_bandwidth; | 876 | extern struct rt_bandwidth def_rt_bandwidth; |
877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
878 | 878 | ||
879 | extern void update_cpu_load(struct rq *this_rq); | 879 | extern void update_idle_cpu_load(struct rq *this_rq); |
880 | 880 | ||
881 | #ifdef CONFIG_CGROUP_CPUACCT | 881 | #ifdef CONFIG_CGROUP_CPUACCT |
882 | #include <linux/cgroup.h> | 882 | #include <linux/cgroup.h> |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895ea..ee376beedaf9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -3,16 +3,357 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | 4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> |
5 | * | 5 | * |
6 | * This defines a simple but solid secure-computing mode. | 6 | * Copyright (C) 2012 Google, Inc. |
7 | * Will Drewry <wad@chromium.org> | ||
8 | * | ||
9 | * This defines a simple but solid secure-computing facility. | ||
10 | * | ||
11 | * Mode 1 uses a fixed list of allowed system calls. | ||
12 | * Mode 2 allows user-defined system call filters in the form | ||
13 | * of Berkeley Packet Filters/Linux Socket Filters. | ||
7 | */ | 14 | */ |
8 | 15 | ||
16 | #include <linux/atomic.h> | ||
9 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
10 | #include <linux/seccomp.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/sched.h> | ||
20 | #include <linux/seccomp.h> | ||
13 | 21 | ||
14 | /* #define SECCOMP_DEBUG 1 */ | 22 | /* #define SECCOMP_DEBUG 1 */ |
15 | #define NR_SECCOMP_MODES 1 | 23 | |
24 | #ifdef CONFIG_SECCOMP_FILTER | ||
25 | #include <asm/syscall.h> | ||
26 | #include <linux/filter.h> | ||
27 | #include <linux/ptrace.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/tracehook.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | |||
33 | /** | ||
34 | * struct seccomp_filter - container for seccomp BPF programs | ||
35 | * | ||
36 | * @usage: reference count to manage the object lifetime. | ||
37 | * get/put helpers should be used when accessing an instance | ||
38 | * outside of a lifetime-guarded section. In general, this | ||
39 | * is only needed for handling filters shared across tasks. | ||
40 | * @prev: points to a previously installed, or inherited, filter | ||
41 | * @len: the number of instructions in the program | ||
42 | * @insns: the BPF program instructions to evaluate | ||
43 | * | ||
44 | * seccomp_filter objects are organized in a tree linked via the @prev | ||
45 | * pointer. For any task, it appears to be a singly-linked list starting | ||
46 | * with current->seccomp.filter, the most recently attached or inherited filter. | ||
47 | * However, multiple filters may share a @prev node, by way of fork(), which | ||
48 | * results in a unidirectional tree existing in memory. This is similar to | ||
49 | * how namespaces work. | ||
50 | * | ||
51 | * seccomp_filter objects should never be modified after being attached | ||
52 | * to a task_struct (other than @usage). | ||
53 | */ | ||
54 | struct seccomp_filter { | ||
55 | atomic_t usage; | ||
56 | struct seccomp_filter *prev; | ||
57 | unsigned short len; /* Instruction count */ | ||
58 | struct sock_filter insns[]; | ||
59 | }; | ||
60 | |||
61 | /* Limit any path through the tree to 256KB worth of instructions. */ | ||
62 | #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) | ||
63 | |||
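The comment on struct seccomp_filter above describes filters forming a per-task singly linked list whose tail nodes are shared across fork() and kept alive by a reference count. A hedged user-space sketch of that lifetime model; plain ints stand in for atomic_t and the helper names are invented:

#include <stdio.h>
#include <stdlib.h>

struct filter {
    int usage;              /* reference count */
    struct filter *prev;    /* previously installed / inherited filter */
};

static struct filter *attach(struct filter *old)
{
    struct filter *f = calloc(1, sizeof(*f));
    f->usage = 1;
    f->prev = old;          /* new head points at the inherited chain */
    return f;
}

static struct filter *inherit(struct filter *head)
{
    head->usage++;          /* the child shares the whole chain via the head */
    return head;
}

static void release(struct filter *head)
{
    while (head && --head->usage == 0) {
        struct filter *prev = head->prev;
        free(head);
        head = prev;        /* keep walking only while counts drop to zero */
    }
}

int main(void)
{
    struct filter *parent = attach(attach(NULL)); /* two stacked filters */
    struct filter *child = inherit(parent);       /* fork(): share the chain */

    release(child);                               /* child exits */
    release(parent);                              /* parent exits, chain freed */
    printf("done\n");
    return 0;
}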
64 | /** | ||
65 | * get_u32 - returns a u32 offset into data | ||
66 | * @data: a unsigned 64 bit value | ||
67 | * @index: 0 or 1 to return the first or second 32-bits | ||
68 | * | ||
69 | * This inline exists to hide the length of unsigned long. If a 32-bit | ||
70 | * unsigned long is passed in, it will be extended and the top 32-bits will be | ||
71 | * 0. If it is a 64-bit unsigned long, then whatever data is resident will be | ||
72 | * properly returned. | ||
73 | * | ||
74 | * Endianness is explicitly ignored and left for BPF program authors to manage | ||
75 | * as per the specific architecture. | ||
76 | */ | ||
77 | static inline u32 get_u32(u64 data, int index) | ||
78 | { | ||
79 | return ((u32 *)&data)[index]; | ||
80 | } | ||
81 | |||
82 | /* Helper for bpf_load below. */ | ||
83 | #define BPF_DATA(_name) offsetof(struct seccomp_data, _name) | ||
84 | /** | ||
85 | * bpf_load: checks and returns a pointer to the requested offset | ||
86 | * @off: offset into struct seccomp_data to load from | ||
87 | * | ||
88 | * Returns the requested 32-bits of data. | ||
89 | * seccomp_check_filter() should assure that @off is 32-bit aligned | ||
90 | * and not out of bounds. Failure to do so is a BUG. | ||
91 | */ | ||
92 | u32 seccomp_bpf_load(int off) | ||
93 | { | ||
94 | struct pt_regs *regs = task_pt_regs(current); | ||
95 | if (off == BPF_DATA(nr)) | ||
96 | return syscall_get_nr(current, regs); | ||
97 | if (off == BPF_DATA(arch)) | ||
98 | return syscall_get_arch(current, regs); | ||
99 | if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { | ||
100 | unsigned long value; | ||
101 | int arg = (off - BPF_DATA(args[0])) / sizeof(u64); | ||
102 | int index = !!(off % sizeof(u64)); | ||
103 | syscall_get_arguments(current, regs, arg, 1, &value); | ||
104 | return get_u32(value, index); | ||
105 | } | ||
106 | if (off == BPF_DATA(instruction_pointer)) | ||
107 | return get_u32(KSTK_EIP(current), 0); | ||
108 | if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) | ||
109 | return get_u32(KSTK_EIP(current), 1); | ||
110 | /* seccomp_check_filter should make this impossible. */ | ||
111 | BUG(); | ||
112 | } | ||
113 | |||
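seccomp_bpf_load() turns a 32-bit-aligned offset into struct seccomp_data into the syscall number, the arch, half of the instruction pointer, or one half of a 64-bit argument. A user-space sketch of the same offset arithmetic; the struct below only mirrors the layout for illustration:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct seccomp_data_sketch {
    int nr;
    uint32_t arch;
    uint64_t instruction_pointer;
    uint64_t args[6];
};

static uint32_t get_u32(uint64_t data, int index)
{
    return ((uint32_t *)&data)[index];   /* endianness left to the filter author */
}

int main(void)
{
    uint64_t arg1 = 0x1122334455667788ULL;
    /* Offset of the upper half of args[1], as a BPF program would load it. */
    size_t off = offsetof(struct seccomp_data_sketch, args[1]) + sizeof(uint32_t);

    int arg   = (off - offsetof(struct seccomp_data_sketch, args[0])) / sizeof(uint64_t);
    int index = !!(off % sizeof(uint64_t));

    printf("offset %zu -> args[%d], half %d = 0x%x\n",
           off, arg, index, get_u32(arg1, index));
    return 0;
}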
114 | /** | ||
115 | * seccomp_check_filter - verify seccomp filter code | ||
116 | * @filter: filter to verify | ||
117 | * @flen: length of filter | ||
118 | * | ||
119 | * Takes a previously checked filter (by sk_chk_filter) and | ||
120 | * redirects all filter code that loads struct sk_buff data | ||
121 | * and related data through seccomp_bpf_load. It also | ||
122 | * enforces length and alignment checking of those loads. | ||
123 | * | ||
124 | * Returns 0 if the rule set is legal or -EINVAL if not. | ||
125 | */ | ||
126 | static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | ||
127 | { | ||
128 | int pc; | ||
129 | for (pc = 0; pc < flen; pc++) { | ||
130 | struct sock_filter *ftest = &filter[pc]; | ||
131 | u16 code = ftest->code; | ||
132 | u32 k = ftest->k; | ||
133 | |||
134 | switch (code) { | ||
135 | case BPF_S_LD_W_ABS: | ||
136 | ftest->code = BPF_S_ANC_SECCOMP_LD_W; | ||
137 | /* 32-bit aligned and not out of bounds. */ | ||
138 | if (k >= sizeof(struct seccomp_data) || k & 3) | ||
139 | return -EINVAL; | ||
140 | continue; | ||
141 | case BPF_S_LD_W_LEN: | ||
142 | ftest->code = BPF_S_LD_IMM; | ||
143 | ftest->k = sizeof(struct seccomp_data); | ||
144 | continue; | ||
145 | case BPF_S_LDX_W_LEN: | ||
146 | ftest->code = BPF_S_LDX_IMM; | ||
147 | ftest->k = sizeof(struct seccomp_data); | ||
148 | continue; | ||
149 | /* Explicitly include allowed calls. */ | ||
150 | case BPF_S_RET_K: | ||
151 | case BPF_S_RET_A: | ||
152 | case BPF_S_ALU_ADD_K: | ||
153 | case BPF_S_ALU_ADD_X: | ||
154 | case BPF_S_ALU_SUB_K: | ||
155 | case BPF_S_ALU_SUB_X: | ||
156 | case BPF_S_ALU_MUL_K: | ||
157 | case BPF_S_ALU_MUL_X: | ||
158 | case BPF_S_ALU_DIV_X: | ||
159 | case BPF_S_ALU_AND_K: | ||
160 | case BPF_S_ALU_AND_X: | ||
161 | case BPF_S_ALU_OR_K: | ||
162 | case BPF_S_ALU_OR_X: | ||
163 | case BPF_S_ALU_LSH_K: | ||
164 | case BPF_S_ALU_LSH_X: | ||
165 | case BPF_S_ALU_RSH_K: | ||
166 | case BPF_S_ALU_RSH_X: | ||
167 | case BPF_S_ALU_NEG: | ||
168 | case BPF_S_LD_IMM: | ||
169 | case BPF_S_LDX_IMM: | ||
170 | case BPF_S_MISC_TAX: | ||
171 | case BPF_S_MISC_TXA: | ||
172 | case BPF_S_ALU_DIV_K: | ||
173 | case BPF_S_LD_MEM: | ||
174 | case BPF_S_LDX_MEM: | ||
175 | case BPF_S_ST: | ||
176 | case BPF_S_STX: | ||
177 | case BPF_S_JMP_JA: | ||
178 | case BPF_S_JMP_JEQ_K: | ||
179 | case BPF_S_JMP_JEQ_X: | ||
180 | case BPF_S_JMP_JGE_K: | ||
181 | case BPF_S_JMP_JGE_X: | ||
182 | case BPF_S_JMP_JGT_K: | ||
183 | case BPF_S_JMP_JGT_X: | ||
184 | case BPF_S_JMP_JSET_K: | ||
185 | case BPF_S_JMP_JSET_X: | ||
186 | continue; | ||
187 | default: | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * seccomp_run_filters - evaluates all seccomp filters against @syscall | ||
196 | * @syscall: number of the current system call | ||
197 | * | ||
198 | * Returns valid seccomp BPF response codes. | ||
199 | */ | ||
200 | static u32 seccomp_run_filters(int syscall) | ||
201 | { | ||
202 | struct seccomp_filter *f; | ||
203 | u32 ret = SECCOMP_RET_ALLOW; | ||
204 | |||
205 | /* Ensure unexpected behavior doesn't result in failing open. */ | ||
206 | if (WARN_ON(current->seccomp.filter == NULL)) | ||
207 | return SECCOMP_RET_KILL; | ||
208 | |||
209 | /* | ||
210 | * All filters in the list are evaluated and the lowest BPF return | ||
211 | * value always takes priority (ignoring the DATA). | ||
212 | */ | ||
213 | for (f = current->seccomp.filter; f; f = f->prev) { | ||
214 | u32 cur_ret = sk_run_filter(NULL, f->insns); | ||
215 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | ||
216 | ret = cur_ret; | ||
217 | } | ||
218 | return ret; | ||
219 | } | ||
220 | |||
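To make the lowest-action-wins rule concrete, here is a small standalone sketch of how two verdicts compose, using the action encodings from include/linux/seccomp.h in this series (the constants are copied here so the snippet builds on its own; it is illustrative, not part of the patch):

#include <stdio.h>

#define SECCOMP_RET_KILL    0x00000000U
#define SECCOMP_RET_ERRNO   0x00050000U
#define SECCOMP_RET_ALLOW   0x7fff0000U
#define SECCOMP_RET_ACTION  0x7fff0000U
#define SECCOMP_RET_DATA    0x0000ffffU

/* Mirror of the loop body above: keep whichever verdict has the
 * numerically lower action, carrying its DATA bits along with it. */
static unsigned int compose(unsigned int ret, unsigned int cur)
{
	return (cur & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION) ? cur : ret;
}

int main(void)
{
	unsigned int verdict = SECCOMP_RET_ALLOW;

	verdict = compose(verdict, SECCOMP_RET_ERRNO | 1); /* one filter says EPERM */
	verdict = compose(verdict, SECCOMP_RET_ALLOW);     /* a later one would allow */
	/* Prints action=0x50000 data=1: the ERRNO verdict prevails. */
	printf("action=%#x data=%u\n", verdict & SECCOMP_RET_ACTION,
	       verdict & SECCOMP_RET_DATA);
	return 0;
}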
221 | /** | ||
222 | * seccomp_attach_filter: Attaches a seccomp filter to current. | ||
223 | * @fprog: BPF program to install | ||
224 | * | ||
225 | * Returns 0 on success or an errno on failure. | ||
226 | */ | ||
227 | static long seccomp_attach_filter(struct sock_fprog *fprog) | ||
228 | { | ||
229 | struct seccomp_filter *filter; | ||
230 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | ||
231 | unsigned long total_insns = fprog->len; | ||
232 | long ret; | ||
233 | |||
234 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | ||
235 | return -EINVAL; | ||
236 | |||
237 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | ||
238 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | ||
239 | if (total_insns > MAX_INSNS_PER_PATH) | ||
240 | return -ENOMEM; | ||
241 | |||
242 | /* | ||
243 | * Installing a seccomp filter requires that the task have | ||
244 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | ||
245 | * This avoids scenarios where unprivileged tasks can affect the | ||
246 | * behavior of privileged children. | ||
247 | */ | ||
248 | if (!current->no_new_privs && | ||
249 | security_capable_noaudit(current_cred(), current_user_ns(), | ||
250 | CAP_SYS_ADMIN) != 0) | ||
251 | return -EACCES; | ||
252 | |||
253 | /* Allocate a new seccomp_filter */ | ||
254 | filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, | ||
255 | GFP_KERNEL|__GFP_NOWARN); | ||
256 | if (!filter) | ||
257 | return -ENOMEM; | ||
258 | atomic_set(&filter->usage, 1); | ||
259 | filter->len = fprog->len; | ||
260 | |||
261 | /* Copy the instructions from fprog. */ | ||
262 | ret = -EFAULT; | ||
263 | if (copy_from_user(filter->insns, fprog->filter, fp_size)) | ||
264 | goto fail; | ||
265 | |||
266 | /* Check and rewrite the fprog via the skb checker */ | ||
267 | ret = sk_chk_filter(filter->insns, filter->len); | ||
268 | if (ret) | ||
269 | goto fail; | ||
270 | |||
271 | /* Check and rewrite the fprog for seccomp use */ | ||
272 | ret = seccomp_check_filter(filter->insns, filter->len); | ||
273 | if (ret) | ||
274 | goto fail; | ||
275 | |||
276 | /* | ||
277 | * If there is an existing filter, make it the prev and don't drop its | ||
278 | * task reference. | ||
279 | */ | ||
280 | filter->prev = current->seccomp.filter; | ||
281 | current->seccomp.filter = filter; | ||
282 | return 0; | ||
283 | fail: | ||
284 | kfree(filter); | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | ||
290 | * @user_filter: pointer to the user data containing a sock_fprog. | ||
291 | * | ||
292 | * Returns 0 on success and non-zero otherwise. | ||
293 | */ | ||
294 | long seccomp_attach_user_filter(char __user *user_filter) | ||
295 | { | ||
296 | struct sock_fprog fprog; | ||
297 | long ret = -EFAULT; | ||
298 | |||
299 | #ifdef CONFIG_COMPAT | ||
300 | if (is_compat_task()) { | ||
301 | struct compat_sock_fprog fprog32; | ||
302 | if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) | ||
303 | goto out; | ||
304 | fprog.len = fprog32.len; | ||
305 | fprog.filter = compat_ptr(fprog32.filter); | ||
306 | } else /* falls through to the if below. */ | ||
307 | #endif | ||
308 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | ||
309 | goto out; | ||
310 | ret = seccomp_attach_filter(&fprog); | ||
311 | out: | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | ||
316 | void get_seccomp_filter(struct task_struct *tsk) | ||
317 | { | ||
318 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
319 | if (!orig) | ||
320 | return; | ||
321 | /* Reference count is bounded by the number of total processes. */ | ||
322 | atomic_inc(&orig->usage); | ||
323 | } | ||
324 | |||
325 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | ||
326 | void put_seccomp_filter(struct task_struct *tsk) | ||
327 | { | ||
328 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
329 | /* Clean up single-reference branches iteratively. */ | ||
330 | while (orig && atomic_dec_and_test(&orig->usage)) { | ||
331 | struct seccomp_filter *freeme = orig; | ||
332 | orig = orig->prev; | ||
333 | kfree(freeme); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * seccomp_send_sigsys - signals the task to allow in-process syscall emulation | ||
339 | * @syscall: syscall number to send to userland | ||
340 | * @reason: filter-supplied reason code to send to userland (via si_errno) | ||
341 | * | ||
342 | * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. | ||
343 | */ | ||
344 | static void seccomp_send_sigsys(int syscall, int reason) | ||
345 | { | ||
346 | struct siginfo info; | ||
347 | memset(&info, 0, sizeof(info)); | ||
348 | info.si_signo = SIGSYS; | ||
349 | info.si_code = SYS_SECCOMP; | ||
350 | info.si_call_addr = (void __user *)KSTK_EIP(current); | ||
351 | info.si_errno = reason; | ||
352 | info.si_arch = syscall_get_arch(current, task_pt_regs(current)); | ||
353 | info.si_syscall = syscall; | ||
354 | force_sig_info(SIGSYS, &info, current); | ||
355 | } | ||
356 | #endif /* CONFIG_SECCOMP_FILTER */ | ||
16 | 357 | ||
17 | /* | 358 | /* |
18 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 359 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { | |||
31 | }; | 372 | }; |
32 | #endif | 373 | #endif |
33 | 374 | ||
34 | void __secure_computing(int this_syscall) | 375 | int __secure_computing(int this_syscall) |
35 | { | 376 | { |
36 | int mode = current->seccomp.mode; | 377 | int mode = current->seccomp.mode; |
37 | int * syscall; | 378 | int exit_sig = 0; |
379 | int *syscall; | ||
380 | u32 ret; | ||
38 | 381 | ||
39 | switch (mode) { | 382 | switch (mode) { |
40 | case 1: | 383 | case SECCOMP_MODE_STRICT: |
41 | syscall = mode1_syscalls; | 384 | syscall = mode1_syscalls; |
42 | #ifdef CONFIG_COMPAT | 385 | #ifdef CONFIG_COMPAT |
43 | if (is_compat_task()) | 386 | if (is_compat_task()) |
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) | |||
45 | #endif | 388 | #endif |
46 | do { | 389 | do { |
47 | if (*syscall == this_syscall) | 390 | if (*syscall == this_syscall) |
48 | return; | 391 | return 0; |
49 | } while (*++syscall); | 392 | } while (*++syscall); |
393 | exit_sig = SIGKILL; | ||
394 | ret = SECCOMP_RET_KILL; | ||
395 | break; | ||
396 | #ifdef CONFIG_SECCOMP_FILTER | ||
397 | case SECCOMP_MODE_FILTER: { | ||
398 | int data; | ||
399 | ret = seccomp_run_filters(this_syscall); | ||
400 | data = ret & SECCOMP_RET_DATA; | ||
401 | ret &= SECCOMP_RET_ACTION; | ||
402 | switch (ret) { | ||
403 | case SECCOMP_RET_ERRNO: | ||
404 | /* Set the low-order 16 bits as an errno. */ | ||
405 | syscall_set_return_value(current, task_pt_regs(current), | ||
406 | -data, 0); | ||
407 | goto skip; | ||
408 | case SECCOMP_RET_TRAP: | ||
409 | /* Show the handler the original registers. */ | ||
410 | syscall_rollback(current, task_pt_regs(current)); | ||
411 | /* Let the filter pass back 16 bits of data. */ | ||
412 | seccomp_send_sigsys(this_syscall, data); | ||
413 | goto skip; | ||
414 | case SECCOMP_RET_TRACE: | ||
415 | /* Skip these calls if there is no tracer. */ | ||
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | ||
417 | goto skip; | ||
418 | /* Allow the BPF to provide the event message */ | ||
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
420 | /* | ||
421 | * The delivery of a fatal signal during event | ||
422 | * notification may silently skip tracer notification. | ||
423 | * Terminating the task now avoids executing a system | ||
424 | * call that may not be intended. | ||
425 | */ | ||
426 | if (fatal_signal_pending(current)) | ||
427 | break; | ||
428 | return 0; | ||
429 | case SECCOMP_RET_ALLOW: | ||
430 | return 0; | ||
431 | case SECCOMP_RET_KILL: | ||
432 | default: | ||
433 | break; | ||
434 | } | ||
435 | exit_sig = SIGSYS; | ||
50 | break; | 436 | break; |
437 | } | ||
438 | #endif | ||
51 | default: | 439 | default: |
52 | BUG(); | 440 | BUG(); |
53 | } | 441 | } |
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) | |||
55 | #ifdef SECCOMP_DEBUG | 443 | #ifdef SECCOMP_DEBUG |
56 | dump_stack(); | 444 | dump_stack(); |
57 | #endif | 445 | #endif |
58 | audit_seccomp(this_syscall); | 446 | audit_seccomp(this_syscall, exit_sig, ret); |
59 | do_exit(SIGKILL); | 447 | do_exit(exit_sig); |
448 | #ifdef CONFIG_SECCOMP_FILTER | ||
449 | skip: | ||
450 | audit_seccomp(this_syscall, exit_sig, ret); | ||
451 | #endif | ||
452 | return -1; | ||
60 | } | 453 | } |
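For orientation on the SECCOMP_RET_TRACE path above: the stop is delivered to an attached tracer as a PTRACE_EVENT_SECCOMP event, and the filter's DATA bits come back via PTRACE_GETEVENTMSG. A hedged userspace sketch follows (the constants are guarded with #ifndef because installed headers may predate this series; error handling is omitted):

#include <stdio.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_EVENT_SECCOMP
#define PTRACE_EVENT_SECCOMP 7
#endif
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP)
#endif

/* Assumes "child" is already attached and stopped. */
static void trace_seccomp_events(pid_t child)
{
	unsigned long data;
	int status;

	ptrace(PTRACE_SETOPTIONS, child, 0, PTRACE_O_TRACESECCOMP);
	ptrace(PTRACE_CONT, child, 0, 0);
	while (waitpid(child, &status, 0) > 0 && WIFSTOPPED(status)) {
		if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))) {
			/* The low 16 bits the filter returned with RET_TRACE. */
			ptrace(PTRACE_GETEVENTMSG, child, 0, &data);
			printf("seccomp trace event, data=%lu\n", data);
		}
		ptrace(PTRACE_CONT, child, 0, 0);
	}
}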
61 | 454 | ||
62 | long prctl_get_seccomp(void) | 455 | long prctl_get_seccomp(void) |
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void) | |||
64 | return current->seccomp.mode; | 457 | return current->seccomp.mode; |
65 | } | 458 | } |
66 | 459 | ||
67 | long prctl_set_seccomp(unsigned long seccomp_mode) | 460 | /** |
461 | * prctl_set_seccomp: configures current->seccomp.mode | ||
462 | * @seccomp_mode: requested mode to use | ||
463 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
464 | * | ||
465 | * This function may be called repeatedly with a @seccomp_mode of | ||
466 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | ||
467 | * successfully installed will be evaluated (in reverse order) for each system | ||
468 | * call the task makes. | ||
469 | * | ||
470 | * Once current->seccomp.mode is non-zero, it may not be changed. | ||
471 | * | ||
472 | * Returns 0 on success or -EINVAL on failure. | ||
473 | */ | ||
474 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | ||
68 | { | 475 | { |
69 | long ret; | 476 | long ret = -EINVAL; |
70 | 477 | ||
71 | /* can set it only once to be even more secure */ | 478 | if (current->seccomp.mode && |
72 | ret = -EPERM; | 479 | current->seccomp.mode != seccomp_mode) |
73 | if (unlikely(current->seccomp.mode)) | ||
74 | goto out; | 480 | goto out; |
75 | 481 | ||
76 | ret = -EINVAL; | 482 | switch (seccomp_mode) { |
77 | if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { | 483 | case SECCOMP_MODE_STRICT: |
78 | current->seccomp.mode = seccomp_mode; | 484 | ret = 0; |
79 | set_thread_flag(TIF_SECCOMP); | ||
80 | #ifdef TIF_NOTSC | 485 | #ifdef TIF_NOTSC |
81 | disable_TSC(); | 486 | disable_TSC(); |
82 | #endif | 487 | #endif |
83 | ret = 0; | 488 | break; |
489 | #ifdef CONFIG_SECCOMP_FILTER | ||
490 | case SECCOMP_MODE_FILTER: | ||
491 | ret = seccomp_attach_user_filter(filter); | ||
492 | if (ret) | ||
493 | goto out; | ||
494 | break; | ||
495 | #endif | ||
496 | default: | ||
497 | goto out; | ||
84 | } | 498 | } |
85 | 499 | ||
86 | out: | 500 | current->seccomp.mode = seccomp_mode; |
501 | set_thread_flag(TIF_SECCOMP); | ||
502 | out: | ||
87 | return ret; | 503 | return ret; |
88 | } | 504 | } |
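To ground the new prctl interface, here is a hedged userspace sketch of installing a filter under SECCOMP_MODE_FILTER. It leans on the UAPI introduced by this series (linux/seccomp.h, linux/filter.h); PR_SET_NO_NEW_PRIVS is defined by hand in case the installed headers are older, and a production filter would also validate seccomp_data.arch before trusting the syscall number:

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/utsname.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#endif

int main(void)
{
	struct sock_filter insns[] = {
		/* Load the syscall number from struct seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* Fail uname(2) with EPERM, allow everything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_uname, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};
	struct utsname un;

	/* Without CAP_SYS_ADMIN, no_new_privs must be set first. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");
	if (uname(&un))
		perror("uname");	/* expect: Operation not permitted */
	return 0;
}

Each additional successful call stacks another filter onto current->seccomp.filter, exactly as the kernel-doc above describes.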
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c3..4567fc020fe3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable); | |||
118 | * down_trylock - try to acquire the semaphore, without waiting | 118 | * down_trylock - try to acquire the semaphore, without waiting |
119 | * @sem: the semaphore to be acquired | 119 | * @sem: the semaphore to be acquired |
120 | * | 120 | * |
121 | * Try to acquire the semaphore atomically. Returns 0 if the mutex has | 121 | * Try to acquire the semaphore atomically. Returns 0 if the semaphore has |
122 | * been acquired successfully or 1 if it cannot be acquired. | 122 | * been acquired successfully or 1 if it cannot be acquired. |
123 | * | 123 | * |
124 | * NOTE: This return value is inverted from both spin_trylock and | 124 | * NOTE: This return value is inverted from both spin_trylock and |
diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d0..677102789cf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/pid_namespace.h> | 29 | #include <linux/pid_namespace.h> |
30 | #include <linux/nsproxy.h> | 30 | #include <linux/nsproxy.h> |
31 | #include <linux/user_namespace.h> | 31 | #include <linux/user_namespace.h> |
32 | #include <linux/uprobes.h> | ||
32 | #define CREATE_TRACE_POINTS | 33 | #define CREATE_TRACE_POINTS |
33 | #include <trace/events/signal.h> | 34 | #include <trace/events/signal.h> |
34 | 35 | ||
@@ -160,7 +161,7 @@ void recalc_sigpending(void) | |||
160 | 161 | ||
161 | #define SYNCHRONOUS_MASK \ | 162 | #define SYNCHRONOUS_MASK \ |
162 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | 163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ |
163 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | 164 | sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) |
164 | 165 | ||
165 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
166 | { | 167 | { |
@@ -767,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t) | |||
767 | const struct cred *cred = current_cred(); | 768 | const struct cred *cred = current_cred(); |
768 | const struct cred *tcred = __task_cred(t); | 769 | const struct cred *tcred = __task_cred(t); |
769 | 770 | ||
770 | if (cred->user->user_ns == tcred->user->user_ns && | 771 | if (uid_eq(cred->euid, tcred->suid) || |
771 | (cred->euid == tcred->suid || | 772 | uid_eq(cred->euid, tcred->uid) || |
772 | cred->euid == tcred->uid || | 773 | uid_eq(cred->uid, tcred->suid) || |
773 | cred->uid == tcred->suid || | 774 | uid_eq(cred->uid, tcred->uid)) |
774 | cred->uid == tcred->uid)) | ||
775 | return 1; | 775 | return 1; |
776 | 776 | ||
777 | if (ns_capable(tcred->user->user_ns, CAP_KILL)) | 777 | if (ns_capable(tcred->user_ns, CAP_KILL)) |
778 | return 1; | 778 | return 1; |
779 | 779 | ||
780 | return 0; | 780 | return 0; |
@@ -1020,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig) | |||
1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | 1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); |
1021 | } | 1021 | } |
1022 | 1022 | ||
1023 | /* | ||
1024 | * map the uid in struct cred into user namespace *ns | ||
1025 | */ | ||
1026 | static inline uid_t map_cred_ns(const struct cred *cred, | ||
1027 | struct user_namespace *ns) | ||
1028 | { | ||
1029 | return user_ns_map_uid(ns, cred, cred->uid); | ||
1030 | } | ||
1031 | |||
1032 | #ifdef CONFIG_USER_NS | 1023 | #ifdef CONFIG_USER_NS |
1033 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1024 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
1034 | { | 1025 | { |
@@ -1038,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str | |||
1038 | if (SI_FROMKERNEL(info)) | 1029 | if (SI_FROMKERNEL(info)) |
1039 | return; | 1030 | return; |
1040 | 1031 | ||
1041 | info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), | 1032 | rcu_read_lock(); |
1042 | current_cred(), info->si_uid); | 1033 | info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), |
1034 | make_kuid(current_user_ns(), info->si_uid)); | ||
1035 | rcu_read_unlock(); | ||
1043 | } | 1036 | } |
1044 | #else | 1037 | #else |
1045 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1038 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
@@ -1106,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1106 | q->info.si_code = SI_USER; | 1099 | q->info.si_code = SI_USER; |
1107 | q->info.si_pid = task_tgid_nr_ns(current, | 1100 | q->info.si_pid = task_tgid_nr_ns(current, |
1108 | task_active_pid_ns(t)); | 1101 | task_active_pid_ns(t)); |
1109 | q->info.si_uid = current_uid(); | 1102 | q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1110 | break; | 1103 | break; |
1111 | case (unsigned long) SEND_SIG_PRIV: | 1104 | case (unsigned long) SEND_SIG_PRIV: |
1112 | q->info.si_signo = sig; | 1105 | q->info.si_signo = sig; |
@@ -1387,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred, | |||
1387 | struct task_struct *target) | 1380 | struct task_struct *target) |
1388 | { | 1381 | { |
1389 | const struct cred *pcred = __task_cred(target); | 1382 | const struct cred *pcred = __task_cred(target); |
1390 | if (cred->user_ns != pcred->user_ns) | 1383 | if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && |
1391 | return 0; | 1384 | !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) |
1392 | if (cred->euid != pcred->suid && cred->euid != pcred->uid && | ||
1393 | cred->uid != pcred->suid && cred->uid != pcred->uid) | ||
1394 | return 0; | 1385 | return 0; |
1395 | return 1; | 1386 | return 1; |
1396 | } | 1387 | } |
@@ -1665,21 +1656,20 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1665 | info.si_signo = sig; | 1656 | info.si_signo = sig; |
1666 | info.si_errno = 0; | 1657 | info.si_errno = 0; |
1667 | /* | 1658 | /* |
1668 | * we are under tasklist_lock here so our parent is tied to | 1659 | * We are under tasklist_lock here so our parent is tied to |
1669 | * us and cannot exit and release its namespace. | 1660 | * us and cannot change. |
1670 | * | 1661 | * |
1671 | * the only it can is to switch its nsproxy with sys_unshare, | 1662 | * task_active_pid_ns will always return the same pid namespace |
1672 | * bu uncharing pid namespaces is not allowed, so we'll always | 1663 | * until a task passes through release_task. |
1673 | * see relevant namespace | ||
1674 | * | 1664 | * |
1675 | * write_lock() currently calls preempt_disable() which is the | 1665 | * write_lock() currently calls preempt_disable() which is the |
1676 | * same as rcu_read_lock(), but according to Oleg, this is not | 1666 | * same as rcu_read_lock(), but according to Oleg, this is not |
1677 | * correct to rely on this | 1667 | * correct to rely on this |
1678 | */ | 1668 | */ |
1679 | rcu_read_lock(); | 1669 | rcu_read_lock(); |
1680 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); | 1670 | info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); |
1681 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1671 | info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), |
1682 | task_cred_xxx(tsk->parent, user_ns)); | 1672 | task_uid(tsk)); |
1683 | rcu_read_unlock(); | 1673 | rcu_read_unlock(); |
1684 | 1674 | ||
1685 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); | 1675 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); |
@@ -1762,8 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1762 | */ | 1752 | */ |
1763 | rcu_read_lock(); | 1753 | rcu_read_lock(); |
1764 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1754 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
1765 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1755 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); |
1766 | task_cred_xxx(parent, user_ns)); | ||
1767 | rcu_read_unlock(); | 1756 | rcu_read_unlock(); |
1768 | 1757 | ||
1769 | info.si_utime = cputime_to_clock_t(tsk->utime); | 1758 | info.si_utime = cputime_to_clock_t(tsk->utime); |
@@ -1973,7 +1962,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
1973 | info.si_signo = signr; | 1962 | info.si_signo = signr; |
1974 | info.si_code = exit_code; | 1963 | info.si_code = exit_code; |
1975 | info.si_pid = task_pid_vnr(current); | 1964 | info.si_pid = task_pid_vnr(current); |
1976 | info.si_uid = current_uid(); | 1965 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1977 | 1966 | ||
1978 | /* Let the debugger run. */ | 1967 | /* Let the debugger run. */ |
1979 | ptrace_stop(exit_code, why, 1, &info); | 1968 | ptrace_stop(exit_code, why, 1, &info); |
@@ -2181,8 +2170,8 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
2181 | info->si_code = SI_USER; | 2170 | info->si_code = SI_USER; |
2182 | rcu_read_lock(); | 2171 | rcu_read_lock(); |
2183 | info->si_pid = task_pid_vnr(current->parent); | 2172 | info->si_pid = task_pid_vnr(current->parent); |
2184 | info->si_uid = map_cred_ns(__task_cred(current->parent), | 2173 | info->si_uid = from_kuid_munged(current_user_ns(), |
2185 | current_user_ns()); | 2174 | task_uid(current->parent)); |
2186 | rcu_read_unlock(); | 2175 | rcu_read_unlock(); |
2187 | } | 2176 | } |
2188 | 2177 | ||
@@ -2202,6 +2191,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2202 | struct signal_struct *signal = current->signal; | 2191 | struct signal_struct *signal = current->signal; |
2203 | int signr; | 2192 | int signr; |
2204 | 2193 | ||
2194 | if (unlikely(uprobe_deny_signal())) | ||
2195 | return 0; | ||
2196 | |||
2205 | relock: | 2197 | relock: |
2206 | /* | 2198 | /* |
2207 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | 2199 | * We'll jump back here after any time we were stopped in TASK_STOPPED. |
@@ -2376,24 +2368,34 @@ relock: | |||
2376 | } | 2368 | } |
2377 | 2369 | ||
2378 | /** | 2370 | /** |
2379 | * block_sigmask - add @ka's signal mask to current->blocked | 2371 | * signal_delivered - |
2380 | * @ka: action for @signr | 2372 | * @sig: number of signal being delivered |
2381 | * @signr: signal that has been successfully delivered | 2373 | * @info: siginfo_t of signal being delivered |
2374 | * @ka: sigaction setting that chose the handler | ||
2375 | * @regs: user register state | ||
2376 | * @stepping: nonzero if debugger single-step or block-step in use | ||
2382 | * | 2377 | * |
2383 | * This function should be called when a signal has successfully been | 2378 | * This function should be called when a signal has successfully been |
2384 | * delivered. It adds the mask of signals for @ka to current->blocked | 2379 | * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask |
2385 | * so that they are blocked during the execution of the signal | 2380 | * is always blocked, and the signal itself is blocked unless %SA_NODEFER |
2386 | * handler. In addition, @signr will be blocked unless %SA_NODEFER is | 2381 | * is set in @ka->sa.sa_flags. Tracing is notified. |
2387 | * set in @ka->sa.sa_flags. | ||
2388 | */ | 2382 | */ |
2389 | void block_sigmask(struct k_sigaction *ka, int signr) | 2383 | void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, |
2384 | struct pt_regs *regs, int stepping) | ||
2390 | { | 2385 | { |
2391 | sigset_t blocked; | 2386 | sigset_t blocked; |
2392 | 2387 | ||
2388 | /* A signal was successfully delivered, and the | ||
2389 | saved sigmask was stored on the signal frame, | ||
2390 | and will be restored by sigreturn. So we can | ||
2391 | simply clear the restore sigmask flag. */ | ||
2392 | clear_restore_sigmask(); | ||
2393 | |||
2393 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); | 2394 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); |
2394 | if (!(ka->sa.sa_flags & SA_NODEFER)) | 2395 | if (!(ka->sa.sa_flags & SA_NODEFER)) |
2395 | sigaddset(&blocked, signr); | 2396 | sigaddset(&blocked, sig); |
2396 | set_current_blocked(&blocked); | 2397 | set_current_blocked(&blocked); |
2398 | tracehook_signal_handler(sig, info, ka, regs, stepping); | ||
2397 | } | 2399 | } |
2398 | 2400 | ||
2399 | /* | 2401 | /* |
@@ -2526,7 +2528,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) | |||
2526 | * It is wrong to change ->blocked directly, this helper should be used | 2528 | * It is wrong to change ->blocked directly, this helper should be used |
2527 | * to ensure the process can't miss a shared signal we are going to block. | 2529 | * to ensure the process can't miss a shared signal we are going to block. |
2528 | */ | 2530 | */ |
2529 | void set_current_blocked(const sigset_t *newset) | 2531 | void set_current_blocked(sigset_t *newset) |
2532 | { | ||
2533 | struct task_struct *tsk = current; | ||
2534 | sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2535 | spin_lock_irq(&tsk->sighand->siglock); | ||
2536 | __set_task_blocked(tsk, newset); | ||
2537 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2538 | } | ||
2539 | |||
2540 | void __set_current_blocked(const sigset_t *newset) | ||
2530 | { | 2541 | { |
2531 | struct task_struct *tsk = current; | 2542 | struct task_struct *tsk = current; |
2532 | 2543 | ||
@@ -2566,7 +2577,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | |||
2566 | return -EINVAL; | 2577 | return -EINVAL; |
2567 | } | 2578 | } |
2568 | 2579 | ||
2569 | set_current_blocked(&newset); | 2580 | __set_current_blocked(&newset); |
2570 | return 0; | 2581 | return 0; |
2571 | } | 2582 | } |
2572 | 2583 | ||
@@ -2706,6 +2717,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2706 | err |= __put_user(from->si_uid, &to->si_uid); | 2717 | err |= __put_user(from->si_uid, &to->si_uid); |
2707 | err |= __put_user(from->si_ptr, &to->si_ptr); | 2718 | err |= __put_user(from->si_ptr, &to->si_ptr); |
2708 | break; | 2719 | break; |
2720 | #ifdef __ARCH_SIGSYS | ||
2721 | case __SI_SYS: | ||
2722 | err |= __put_user(from->si_call_addr, &to->si_call_addr); | ||
2723 | err |= __put_user(from->si_syscall, &to->si_syscall); | ||
2724 | err |= __put_user(from->si_arch, &to->si_arch); | ||
2725 | break; | ||
2726 | #endif | ||
2709 | default: /* this is just in case for now ... */ | 2727 | default: /* this is just in case for now ... */ |
2710 | err |= __put_user(from->si_pid, &to->si_pid); | 2728 | err |= __put_user(from->si_pid, &to->si_pid); |
2711 | err |= __put_user(from->si_uid, &to->si_uid); | 2729 | err |= __put_user(from->si_uid, &to->si_uid); |
@@ -2828,7 +2846,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | |||
2828 | info.si_errno = 0; | 2846 | info.si_errno = 0; |
2829 | info.si_code = SI_USER; | 2847 | info.si_code = SI_USER; |
2830 | info.si_pid = task_tgid_vnr(current); | 2848 | info.si_pid = task_tgid_vnr(current); |
2831 | info.si_uid = current_uid(); | 2849 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2832 | 2850 | ||
2833 | return kill_something_info(sig, &info, pid); | 2851 | return kill_something_info(sig, &info, pid); |
2834 | } | 2852 | } |
@@ -2871,7 +2889,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) | |||
2871 | info.si_errno = 0; | 2889 | info.si_errno = 0; |
2872 | info.si_code = SI_TKILL; | 2890 | info.si_code = SI_TKILL; |
2873 | info.si_pid = task_tgid_vnr(current); | 2891 | info.si_pid = task_tgid_vnr(current); |
2874 | info.si_uid = current_uid(); | 2892 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2875 | 2893 | ||
2876 | return do_send_specific(tgid, pid, sig, &info); | 2894 | return do_send_specific(tgid, pid, sig, &info); |
2877 | } | 2895 | } |
@@ -3133,7 +3151,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
3133 | return -EINVAL; | 3151 | return -EINVAL; |
3134 | } | 3152 | } |
3135 | 3153 | ||
3136 | set_current_blocked(&new_blocked); | 3154 | __set_current_blocked(&new_blocked); |
3137 | } | 3155 | } |
3138 | 3156 | ||
3139 | if (oset) { | 3157 | if (oset) { |
@@ -3197,7 +3215,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) | |||
3197 | int old = current->blocked.sig[0]; | 3215 | int old = current->blocked.sig[0]; |
3198 | sigset_t newset; | 3216 | sigset_t newset; |
3199 | 3217 | ||
3200 | siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); | ||
3201 | set_current_blocked(&newset); | 3218 | set_current_blocked(&newset); |
3202 | 3219 | ||
3203 | return old; | 3220 | return old; |
@@ -3236,6 +3253,17 @@ SYSCALL_DEFINE0(pause) | |||
3236 | 3253 | ||
3237 | #endif | 3254 | #endif |
3238 | 3255 | ||
3256 | int sigsuspend(sigset_t *set) | ||
3257 | { | ||
3258 | current->saved_sigmask = current->blocked; | ||
3259 | set_current_blocked(set); | ||
3260 | |||
3261 | current->state = TASK_INTERRUPTIBLE; | ||
3262 | schedule(); | ||
3263 | set_restore_sigmask(); | ||
3264 | return -ERESTARTNOHAND; | ||
3265 | } | ||
3266 | |||
3239 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | 3267 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND |
3240 | /** | 3268 | /** |
3241 | * sys_rt_sigsuspend - replace the signal mask for a value with the | 3269 | * sys_rt_sigsuspend - replace the signal mask for a value with the |
@@ -3253,15 +3281,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
3253 | 3281 | ||
3254 | if (copy_from_user(&newset, unewset, sizeof(newset))) | 3282 | if (copy_from_user(&newset, unewset, sizeof(newset))) |
3255 | return -EFAULT; | 3283 | return -EFAULT; |
3256 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 3284 | return sigsuspend(&newset); |
3257 | |||
3258 | current->saved_sigmask = current->blocked; | ||
3259 | set_current_blocked(&newset); | ||
3260 | |||
3261 | current->state = TASK_INTERRUPTIBLE; | ||
3262 | schedule(); | ||
3263 | set_restore_sigmask(); | ||
3264 | return -ERESTARTNOHAND; | ||
3265 | } | 3285 | } |
3266 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 3286 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ |
3267 | 3287 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf759..d0ae5b24875e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,8 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #include "smpboot.h" | ||
17 | |||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
17 | static struct { | 19 | static struct { |
18 | struct list_head queue; | 20 | struct list_head queue; |
@@ -669,6 +671,8 @@ void __init smp_init(void) | |||
669 | { | 671 | { |
670 | unsigned int cpu; | 672 | unsigned int cpu; |
671 | 673 | ||
674 | idle_threads_init(); | ||
675 | |||
672 | /* FIXME: This should be done in userspace --RR */ | 676 | /* FIXME: This should be done in userspace --RR */ |
673 | for_each_present_cpu(cpu) { | 677 | for_each_present_cpu(cpu) { |
674 | if (num_online_cpus() >= setup_max_cpus) | 678 | if (num_online_cpus() >= setup_max_cpus) |
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
791 | } | 795 | } |
792 | } | 796 | } |
793 | EXPORT_SYMBOL(on_each_cpu_cond); | 797 | EXPORT_SYMBOL(on_each_cpu_cond); |
798 | |||
799 | static void do_nothing(void *unused) | ||
800 | { | ||
801 | } | ||
802 | |||
803 | /** | ||
804 | * kick_all_cpus_sync - Force all cpus out of idle | ||
805 | * | ||
806 | * Used to synchronize the update of pm_idle function pointer. It's | ||
807 | * called after the pointer is updated and returns after the dummy | ||
808 | * callback function has been executed on all cpus. The execution of | ||
809 | * the function can only happen on the remote cpus after they have | ||
810 | * left the idle function which had been called via pm_idle function | ||
811 | * pointer. So it's guaranteed that nothing uses the previous pointer | ||
812 | * anymore. | ||
813 | */ | ||
814 | void kick_all_cpus_sync(void) | ||
815 | { | ||
816 | /* Make sure the change is visible before we kick the cpus */ | ||
817 | smp_mb(); | ||
818 | smp_call_function(do_nothing, NULL, 1); | ||
819 | } | ||
820 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | ||
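A minimal sketch of the intended usage pattern, with hypothetical stand-ins (old_idle, my_idle_fn, switch_idle) for a pm_idle-style pointer and its update path:

#include <linux/smp.h>

static void old_idle(void) { }

/* Pointer that idle CPUs call through, pm_idle-style. */
static void (*my_idle_fn)(void) = old_idle;

static void switch_idle(void (*fn)(void))
{
	my_idle_fn = fn;	/* publish the new pointer */
	kick_all_cpus_sync();	/* on return, no CPU can still be running
				 * code it reached through the old value */
}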
diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 000000000000..98f60c5caa1b --- /dev/null +++ b/kernel/smpboot.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Common SMP CPU bringup/teardown functions | ||
3 | */ | ||
4 | #include <linux/err.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/percpu.h> | ||
9 | |||
10 | #include "smpboot.h" | ||
11 | |||
12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
13 | /* | ||
14 | * For the hotplug case we keep the task structs around and reuse | ||
15 | * them. | ||
16 | */ | ||
17 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); | ||
18 | |||
19 | struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) | ||
20 | { | ||
21 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
22 | |||
23 | if (!tsk) | ||
24 | return ERR_PTR(-ENOMEM); | ||
25 | init_idle(tsk, cpu); | ||
26 | return tsk; | ||
27 | } | ||
28 | |||
29 | void __init idle_thread_set_boot_cpu(void) | ||
30 | { | ||
31 | per_cpu(idle_threads, smp_processor_id()) = current; | ||
32 | } | ||
33 | |||
34 | /** | ||
35 | * idle_init - Initialize the idle thread for a cpu | ||
36 | * @cpu: The cpu for which the idle thread should be initialized | ||
37 | * | ||
38 | * Creates the thread if it does not exist. | ||
39 | */ | ||
40 | static inline void idle_init(unsigned int cpu) | ||
41 | { | ||
42 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
43 | |||
44 | if (!tsk) { | ||
45 | tsk = fork_idle(cpu); | ||
46 | if (IS_ERR(tsk)) | ||
47 | pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); | ||
48 | else | ||
49 | per_cpu(idle_threads, cpu) = tsk; | ||
50 | } | ||
51 | } | ||
52 | |||
53 | /** | ||
54 | * idle_threads_init - Initialize idle threads for all cpus | ||
55 | */ | ||
56 | void __init idle_threads_init(void) | ||
57 | { | ||
58 | unsigned int cpu, boot_cpu; | ||
59 | |||
60 | boot_cpu = smp_processor_id(); | ||
61 | |||
62 | for_each_possible_cpu(cpu) { | ||
63 | if (cpu != boot_cpu) | ||
64 | idle_init(cpu); | ||
65 | } | ||
66 | } | ||
67 | #endif | ||
diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 000000000000..80c0acfb8472 --- /dev/null +++ b/kernel/smpboot.h | |||
@@ -0,0 +1,18 @@ | |||
1 | #ifndef SMPBOOT_H | ||
2 | #define SMPBOOT_H | ||
3 | |||
4 | struct task_struct; | ||
5 | |||
6 | int smpboot_prepare(unsigned int cpu); | ||
7 | |||
8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
9 | struct task_struct *idle_thread_get(unsigned int cpu); | ||
10 | void idle_thread_set_boot_cpu(void); | ||
11 | void idle_threads_init(void); | ||
12 | #else | ||
13 | static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } | ||
14 | static inline void idle_thread_set_boot_cpu(void) { } | ||
15 | static inline void idle_threads_init(void) { } | ||
16 | #endif | ||
17 | |||
18 | #endif | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f4..2095be3318d5 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -34,10 +34,77 @@ | |||
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | /* | ||
38 | * Initialize an rcu_batch structure to empty. | ||
39 | */ | ||
40 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
41 | { | ||
42 | b->head = NULL; | ||
43 | b->tail = &b->head; | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
48 | */ | ||
49 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
50 | { | ||
51 | *b->tail = head; | ||
52 | b->tail = &head->next; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Is the specified rcu_batch structure empty? | ||
57 | */ | ||
58 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
59 | { | ||
60 | return b->tail == &b->head; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Remove the callback at the head of the specified rcu_batch structure | ||
65 | * and return a pointer to it, or return NULL if the structure is empty. | ||
66 | */ | ||
67 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
68 | { | ||
69 | struct rcu_head *head; | ||
70 | |||
71 | if (rcu_batch_empty(b)) | ||
72 | return NULL; | ||
73 | |||
74 | head = b->head; | ||
75 | b->head = head->next; | ||
76 | if (b->tail == &head->next) | ||
77 | rcu_batch_init(b); | ||
78 | |||
79 | return head; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
84 | * the structure specified by "to". | ||
85 | */ | ||
86 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
87 | { | ||
88 | if (!rcu_batch_empty(from)) { | ||
89 | *to->tail = from->head; | ||
90 | to->tail = from->tail; | ||
91 | rcu_batch_init(from); | ||
92 | } | ||
93 | } | ||
94 | |||
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 98 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | 99 | { |
39 | sp->completed = 0; | 100 | sp->completed = 0; |
40 | mutex_init(&sp->mutex); | 101 | spin_lock_init(&sp->queue_lock); |
102 | sp->running = false; | ||
103 | rcu_batch_init(&sp->batch_queue); | ||
104 | rcu_batch_init(&sp->batch_check0); | ||
105 | rcu_batch_init(&sp->batch_check1); | ||
106 | rcu_batch_init(&sp->batch_done); | ||
107 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | 108 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | 109 | return sp->per_cpu_ref ? 0 : -ENOMEM; |
43 | } | 110 | } |
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
73 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 140 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
74 | 141 | ||
75 | /* | 142 | /* |
76 | * srcu_readers_active_idx -- returns approximate number of readers | 143 | * Returns approximate total of the readers' ->seq[] values for the |
77 | * active on the specified rank of per-CPU counters. | 144 | * rank of per-CPU counters specified by idx. |
78 | */ | 145 | */ |
146 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
147 | { | ||
148 | int cpu; | ||
149 | unsigned long sum = 0; | ||
150 | unsigned long t; | ||
79 | 151 | ||
80 | static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | 152 | for_each_possible_cpu(cpu) { |
153 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
154 | sum += t; | ||
155 | } | ||
156 | return sum; | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Returns approximate number of readers active on the specified rank | ||
161 | * of the per-CPU ->c[] counters. | ||
162 | */ | ||
163 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
81 | { | 164 | { |
82 | int cpu; | 165 | int cpu; |
83 | int sum; | 166 | unsigned long sum = 0; |
167 | unsigned long t; | ||
84 | 168 | ||
85 | sum = 0; | 169 | for_each_possible_cpu(cpu) { |
86 | for_each_possible_cpu(cpu) | 170 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); |
87 | sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; | 171 | sum += t; |
172 | } | ||
88 | return sum; | 173 | return sum; |
89 | } | 174 | } |
90 | 175 | ||
176 | /* | ||
177 | * Return true if the number of pre-existing readers is determined to | ||
178 | * be stably zero. An example unstable zero can occur if the call | ||
179 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
180 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
181 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
182 | * time to sum the array, and might in fact be interrupted or preempted | ||
183 | * partway through the summation. | ||
184 | */ | ||
185 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
186 | { | ||
187 | unsigned long seq; | ||
188 | |||
189 | seq = srcu_readers_seq_idx(sp, idx); | ||
190 | |||
191 | /* | ||
192 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
193 | * __srcu_read_lock(). This pairing ensures that if an | ||
194 | * __srcu_read_lock() increments its counter after the summation | ||
195 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
196 | * critical section will see any changes made prior to the start | ||
197 | * of the current SRCU grace period. | ||
198 | * | ||
199 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
200 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
201 | * must see the increment of ->c[]. | ||
202 | */ | ||
203 | smp_mb(); /* A */ | ||
204 | |||
205 | /* | ||
206 | * Note that srcu_readers_active_idx() can incorrectly return | ||
207 | * zero even though there is a pre-existing reader throughout. | ||
208 | * To see this, suppose that task A is in a very long SRCU | ||
209 | * read-side critical section that started on CPU 0, and that | ||
210 | * no other reader exists, so that the sum of the counters | ||
211 | * is equal to one. Then suppose that task B starts executing | ||
212 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
213 | * task C starts reading on CPU 0, so that its increment is not | ||
214 | * summed, but finishes reading on CPU 2, so that its decrement | ||
215 | * -is- summed. Then when task B completes its sum, it will | ||
216 | * incorrectly get zero, despite the fact that task A has been | ||
217 | * in its SRCU read-side critical section the whole time. | ||
218 | * | ||
219 | * We therefore do a validation step should srcu_readers_active_idx() | ||
220 | * return zero. | ||
221 | */ | ||
222 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * The remainder of this function is the validation step. | ||
227 | * The following smp_mb() D pairs with the smp_mb() C in | ||
228 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
229 | * by srcu_readers_active_idx() above, then any destructive | ||
230 | * operation performed after the grace period will happen after | ||
231 | * the corresponding SRCU read-side critical section. | ||
232 | * | ||
233 | * Note that there can be at most NR_CPUS worth of readers using | ||
234 | * the old index, which is not enough to overflow even a 32-bit | ||
235 | * integer. (Yes, this does mean that systems having more than | ||
236 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
237 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
238 | * Therefore, the only way that the return values of the two | ||
239 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
240 | * no increments of the corresponding rank of ->seq[] counts | ||
241 | * in the interim. But the missed-increment scenario laid out | ||
242 | * above includes an increment of the ->seq[] counter by | ||
243 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
244 | * scenario occurs, the return values from the two calls to | ||
245 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
246 | * step below suffices. | ||
247 | */ | ||
248 | smp_mb(); /* D */ | ||
249 | |||
250 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
251 | } | ||
252 | |||
91 | /** | 253 | /** |
92 | * srcu_readers_active - returns approximate number of readers. | 254 | * srcu_readers_active - returns approximate number of readers. |
93 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 255 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | |||
98 | */ | 260 | */ |
99 | static int srcu_readers_active(struct srcu_struct *sp) | 261 | static int srcu_readers_active(struct srcu_struct *sp) |
100 | { | 262 | { |
101 | return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); | 263 | int cpu; |
264 | unsigned long sum = 0; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
268 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
269 | } | ||
270 | return sum; | ||
102 | } | 271 | } |
103 | 272 | ||
104 | /** | 273 | /** |
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
131 | int idx; | 300 | int idx; |
132 | 301 | ||
133 | preempt_disable(); | 302 | preempt_disable(); |
134 | idx = sp->completed & 0x1; | 303 | idx = rcu_dereference_index_check(sp->completed, |
135 | barrier(); /* ensure compiler looks -once- at sp->completed. */ | 304 | rcu_read_lock_sched_held()) & 0x1; |
136 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
137 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 306 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
307 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
138 | preempt_enable(); | 308 | preempt_enable(); |
139 | return idx; | 309 | return idx; |
140 | } | 310 | } |
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
149 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 319 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
150 | { | 320 | { |
151 | preempt_disable(); | 321 | preempt_disable(); |
152 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 322 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
153 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 323 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
154 | preempt_enable(); | 324 | preempt_enable(); |
155 | } | 325 | } |
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 326 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
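For context, the reader and updater sides of the reworked SRCU pair up roughly as follows; my_srcu, struct foo, and my_free are illustrative names rather than part of this patch, and call_srcu() is the new interface added further down:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct rcu_head rh;
	int val;
};

static struct srcu_struct my_srcu;	/* init_srcu_struct(&my_srcu) at setup */
static struct foo __rcu *my_data;

static void my_free(struct rcu_head *rh)
{
	kfree(container_of(rh, struct foo, rh));
}

/* Reader: the object read stays valid for the whole read-side section. */
static int read_val(void)
{
	struct foo *p;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);
	p = srcu_dereference(my_data, &my_srcu);
	if (p)
		val = p->val;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

/* Updater: publish a replacement, then defer the free past all
 * pre-existing readers without blocking, via the new call_srcu(). */
static void replace_val(struct foo *new)
{
	struct foo *old;

	old = rcu_dereference_protected(my_data, 1); /* caller serializes updates */
	rcu_assign_pointer(my_data, new);
	if (old)
		call_srcu(&my_srcu, &old->rh, my_free);
}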
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
163 | * we repeatedly block for 1-millisecond time periods. This approach | 333 | * we repeatedly block for 1-millisecond time periods. This approach |
164 | * has done well in testing, so there is no need for a config parameter. | 334 | * has done well in testing, so there is no need for a config parameter. |
165 | */ | 335 | */ |
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | 336 | #define SRCU_RETRY_CHECK_DELAY 5 |
337 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
338 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
167 | 339 | ||
168 | /* | 340 | /* |
169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 341 | * @@@ Wait until all pre-existing readers complete. Such readers |
342 | * will have used the index specified by "idx". | ||
343 | * The caller should ensure that ->completed is not changed while checking, | ||
344 | * and that idx == (->completed & 1) ^ 1. | ||
170 | */ | 345 | */ |
171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 346 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) |
172 | { | 347 | { |
173 | int idx; | 348 | for (;;) { |
174 | 349 | if (srcu_readers_active_idx_check(sp, idx)) | |
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 350 | return true; |
176 | !lock_is_held(&rcu_bh_lock_map) && | 351 | if (--trycount <= 0) |
177 | !lock_is_held(&rcu_lock_map) && | 352 | return false; |
178 | !lock_is_held(&rcu_sched_lock_map), | 353 | udelay(SRCU_RETRY_CHECK_DELAY); |
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 354 | } |
180 | 355 | } | |
181 | idx = sp->completed; | ||
182 | mutex_lock(&sp->mutex); | ||
183 | 356 | ||
184 | /* | 357 | /* |
185 | * Check to see if someone else did the work for us while we were | 358 | * Increment the ->completed counter so that future SRCU readers will |
186 | * waiting to acquire the lock. We need -two- advances of | 359 | * use the other rank of the ->c[] and ->seq[] arrays. This allows |
187 | * the counter, not just one. If there was but one, we might have | 360 | * us to wait for pre-existing readers in a starvation-free manner. |
188 | * shown up -after- our helper's first synchronize_sched(), thus | 361 | */ |
189 | * having failed to prevent CPU-reordering races with concurrent | 362 | static void srcu_flip(struct srcu_struct *sp) |
190 | * srcu_read_unlock()s on other CPUs (see comment below). So we | 363 | { |
191 | * either (1) wait for two or (2) supply the second ourselves. | 364 | sp->completed++; |
192 | */ | 365 | } |
193 | 366 | ||
194 | if ((sp->completed - idx) >= 2) { | 367 | /* |
195 | mutex_unlock(&sp->mutex); | 368 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
196 | return; | 369 | * initiating grace-period processing if it is not already running. |
370 | */ | ||
371 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
372 | void (*func)(struct rcu_head *head)) | ||
373 | { | ||
374 | unsigned long flags; | ||
375 | |||
376 | head->next = NULL; | ||
377 | head->func = func; | ||
378 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
379 | rcu_batch_queue(&sp->batch_queue, head); | ||
380 | if (!sp->running) { | ||
381 | sp->running = true; | ||
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | ||
197 | } | 383 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
385 | } | ||
386 | EXPORT_SYMBOL_GPL(call_srcu); | ||
198 | 387 | ||
199 | sync_func(); /* Force memory barrier on all CPUs. */ | 388 | struct rcu_synchronize { |
389 | struct rcu_head head; | ||
390 | struct completion completion; | ||
391 | }; | ||
200 | 392 | ||
201 | /* | 393 | /* |
202 | * The preceding synchronize_sched() ensures that any CPU that | 394 | * Awaken the corresponding synchronize_srcu() instance now that a |
203 | * sees the new value of sp->completed will also see any preceding | 395 | * grace period has elapsed. |
204 | * changes to data structures made by this CPU. This prevents | 396 | */ |
205 | * some other CPU from reordering the accesses in its SRCU | 397 | static void wakeme_after_rcu(struct rcu_head *head) |
206 | * read-side critical section to precede the corresponding | 398 | { |
207 | * srcu_read_lock() -- ensuring that such references will in | 399 | struct rcu_synchronize *rcu; |
208 | * fact be protected. | ||
209 | * | ||
210 | * So it is now safe to do the flip. | ||
211 | */ | ||
212 | 400 | ||
213 | idx = sp->completed & 0x1; | 401 | rcu = container_of(head, struct rcu_synchronize, head); |
214 | sp->completed++; | 402 | complete(&rcu->completion); |
403 | } | ||
215 | 404 | ||
216 | sync_func(); /* Force memory barrier on all CPUs. */ | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
406 | static void srcu_reschedule(struct srcu_struct *sp); | ||
217 | 407 | ||
218 | /* | 408 | /* |
219 | * At this point, because of the preceding synchronize_sched(), | 409 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
220 | * all srcu_read_lock() calls using the old counters have completed. | 410 | */ |
221 | * Their corresponding critical sections might well be still | 411 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
222 | * executing, but the srcu_read_lock() primitives themselves | 412 | { |
223 | * will have finished executing. We initially give readers | 413 | struct rcu_synchronize rcu; |
224 | * an arbitrarily chosen 10 microseconds to get out of their | 414 | struct rcu_head *head = &rcu.head; |
225 | * SRCU read-side critical sections, then loop waiting 1/HZ | 415 | bool done = false; |
226 | * seconds per iteration. The 10-microsecond value has done | ||
227 | * very well in testing. | ||
228 | */ | ||
229 | |||
230 | if (srcu_readers_active_idx(sp, idx)) | ||
231 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
232 | while (srcu_readers_active_idx(sp, idx)) | ||
233 | schedule_timeout_interruptible(1); | ||
234 | 416 | ||
235 | sync_func(); /* Force memory barrier on all CPUs. */ | 417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
418 | !lock_is_held(&rcu_bh_lock_map) && | ||
419 | !lock_is_held(&rcu_lock_map) && | ||
420 | !lock_is_held(&rcu_sched_lock_map), | ||
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
236 | 422 | ||
237 | /* | 423 | init_completion(&rcu.completion); |
238 | * The preceding synchronize_sched() forces all srcu_read_unlock() | 424 | |
239 | * primitives that were executing concurrently with the preceding | 425 | head->next = NULL; |
240 | * for_each_possible_cpu() loop to have completed by this point. | 426 | head->func = wakeme_after_rcu; |
241 | * More importantly, it also forces the corresponding SRCU read-side | 427 | spin_lock_irq(&sp->queue_lock); |
242 | * critical sections to have also completed, and the corresponding | 428 | if (!sp->running) { |
243 | * references to SRCU-protected data items to be dropped. | 429 | /* steal the processing owner */ |
244 | * | 430 | sp->running = true; |
245 | * Note: | 431 | rcu_batch_queue(&sp->batch_check0, head); |
246 | * | 432 | spin_unlock_irq(&sp->queue_lock); |
247 | * Despite what you might think at first glance, the | 433 | |
248 | * preceding synchronize_sched() -must- be within the | 434 | srcu_advance_batches(sp, trycount); |
249 | * critical section ended by the following mutex_unlock(). | 435 | if (!rcu_batch_empty(&sp->batch_done)) { |
250 | * Otherwise, a task taking the early exit can race | 436 | BUG_ON(sp->batch_done.head != head); |
251 | * with a srcu_read_unlock(), which might have executed | 437 | rcu_batch_dequeue(&sp->batch_done); |
252 | * just before the preceding srcu_readers_active() check, | 438 | done = true; |
253 | * and whose CPU might have reordered the srcu_read_unlock() | 439 | } |
254 | * with the preceding critical section. In this case, there | 440 | /* give the processing owner to work_struct */ |
255 | * is nothing preventing the synchronize_sched() task that is | 441 | srcu_reschedule(sp); |
256 | * taking the early exit from freeing a data structure that | 442 | } else { |
257 | * is still being referenced (out of order) by the task | 443 | rcu_batch_queue(&sp->batch_queue, head); |
258 | * doing the srcu_read_unlock(). | 444 | spin_unlock_irq(&sp->queue_lock); |
259 | * | 445 | } |
260 | * Alternatively, the comparison with "2" on the early exit | ||
261 | * could be changed to "3", but this increases synchronize_srcu() | ||
262 | * latency for bulk loads. So the current code is preferred. | ||
263 | */ | ||
264 | 446 | ||
265 | mutex_unlock(&sp->mutex); | 447 | if (!done) |
448 | wait_for_completion(&rcu.completion); | ||
266 | } | 449 | } |
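The rewritten __synchronize_srcu() above no longer drives the grace period inline; it enqueues an rcu_head whose callback, wakeme_after_rcu(), completes a completion that the caller sleeps on. The sketch below shows that wait-for-callback shape in isolation, assuming the call_srcu() interface this series introduces; the names my_sync, my_wakeme and my_wait_for_gp are illustrative only, not part of the patch.

#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/rcupdate.h>
#include <linux/srcu.h>

struct my_sync {
	struct rcu_head head;		/* queued into the SRCU callback pipeline */
	struct completion done;		/* completed once the grace period ends */
};

/* Invoked by the SRCU state machine after a full grace period. */
static void my_wakeme(struct rcu_head *head)
{
	struct my_sync *s = container_of(head, struct my_sync, head);

	complete(&s->done);
}

/* Block until one SRCU grace period for @sp has elapsed. */
static void my_wait_for_gp(struct srcu_struct *sp)
{
	struct my_sync s;

	init_completion(&s.done);
	call_srcu(sp, &s.head, my_wakeme);
	wait_for_completion(&s.done);
}

This is essentially the slow path of __synchronize_srcu(); the function above additionally steals the processing-owner role when no grace-period work is running, so a short wait can be satisfied without bouncing through the workqueue.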
267 | 450 | ||
268 | /** | 451 | /** |
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
281 | */ | 464 | */ |
282 | void synchronize_srcu(struct srcu_struct *sp) | 465 | void synchronize_srcu(struct srcu_struct *sp) |
283 | { | 466 | { |
284 | __synchronize_srcu(sp, synchronize_sched); | 467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
285 | } | 468 | } |
286 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 469 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
287 | 470 | ||
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
289 | * synchronize_srcu_expedited - Brute-force SRCU grace period | 472 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
290 | * @sp: srcu_struct with which to synchronize. | 473 | * @sp: srcu_struct with which to synchronize. |
291 | * | 474 | * |
292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" | 475 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
293 | * approach to force the grace period to end quickly. This consumes | 476 | * spinning rather than blocking when waiting. |
294 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
295 | * so is thus not recommended for any sort of common-case code. In fact, | ||
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
299 | * | 477 | * |
300 | * Note that it is illegal to call this function while holding any lock | 478 | * Note that it is illegal to call this function while holding any lock |
301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | 479 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
302 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
303 | * these restriction will result in deadlock. It is also illegal to call | ||
304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 480 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | 481 | * critical section; doing so will result in deadlock. However, it is |
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 482 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct |
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
309 | */ | 485 | */ |
310 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
311 | { | 487 | { |
312 | __synchronize_srcu(sp, synchronize_sched_expedited); | 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
313 | } | 489 | } |
314 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); |
315 | 491 | ||
316 | /** | 492 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
494 | */ | ||
495 | void srcu_barrier(struct srcu_struct *sp) | ||
496 | { | ||
497 | synchronize_srcu(sp); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
500 | |||
501 | /** | ||
317 | * srcu_batches_completed - return batches completed. | 502 | * srcu_batches_completed - return batches completed. |
318 | * @sp: srcu_struct on which to report batch completion. | 503 | * @sp: srcu_struct on which to report batch completion. |
319 | * | 504 | * |
320 | * Report the number of batches, correlated with, but not necessarily | 505 | * Report the number of batches, correlated with, but not necessarily |
321 | * precisely the same as, the number of grace periods that have elapsed. | 506 | * precisely the same as, the number of grace periods that have elapsed. |
322 | */ | 507 | */ |
323 | |||
324 | long srcu_batches_completed(struct srcu_struct *sp) | 508 | long srcu_batches_completed(struct srcu_struct *sp) |
325 | { | 509 | { |
326 | return sp->completed; | 510 | return sp->completed; |
327 | } | 511 | } |
328 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |
513 | |||
514 | #define SRCU_CALLBACK_BATCH 10 | ||
515 | #define SRCU_INTERVAL 1 | ||
516 | |||
517 | /* | ||
518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
519 | * period pipeline. | ||
520 | */ | ||
521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
522 | { | ||
523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
524 | spin_lock_irq(&sp->queue_lock); | ||
525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
526 | spin_unlock_irq(&sp->queue_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
533 | */ | ||
534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
535 | { | ||
536 | int idx = 1 ^ (sp->completed & 1); | ||
537 | |||
538 | /* | ||
539 | * Because readers might be delayed for an extended period after | ||
540 | * fetching ->completed for their index, at any point in time there | ||
541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
542 | * need to wait for readers to clear from both index values before | ||
543 | * invoking a callback. | ||
544 | */ | ||
545 | |||
546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
547 | rcu_batch_empty(&sp->batch_check1)) | ||
548 | return; /* no callbacks need to be advanced */ | ||
549 | |||
550 | if (!try_check_zero(sp, idx, trycount)) | ||
551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
552 | |||
553 | /* | ||
554 | * The callbacks in ->batch_check1 already went through their first | ||
555 | * zero check and flip, back when they were enqueued on | ||
556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
557 | * (Presumably try_check_zero() returned false during that | ||
558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
560 | */ | ||
561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
562 | |||
563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
564 | return; /* no callbacks need to be advanced */ | ||
565 | srcu_flip(sp); | ||
566 | |||
567 | /* | ||
568 | * The callbacks in ->batch_check0 just finished their | ||
569 | * first check zero and flip, so move them to ->batch_check1 | ||
570 | * for future checking on the other idx. | ||
571 | */ | ||
572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
573 | |||
574 | /* | ||
575 | * SRCU read-side critical sections are normally short, so check | ||
576 | * at least twice in quick succession after a flip. | ||
577 | */ | ||
578 | trycount = trycount < 2 ? 2 : trycount; | ||
579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
581 | |||
582 | /* | ||
583 | * The callbacks in ->batch_check1 have now waited for all | ||
584 | * pre-existing readers using both idx values. They are therefore | ||
585 | * ready to invoke, so move them to ->batch_done. | ||
586 | */ | ||
587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
588 | } | ||
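The comment at the top of srcu_advance_batches() states the key invariant: a reader samples ->completed at srcu_read_lock() time, so after a flip there can still be readers holding the old index. The reader-side pattern that creates this situation looks like the hedged sketch below (my_srcu and my_reader are assumed names; the srcu_struct would be set up elsewhere with init_srcu_struct()).

#include <linux/srcu.h>

static struct srcu_struct my_srcu;	/* init_srcu_struct(&my_srcu) at init time */

static void my_reader(void)
{
	int idx;

	/*
	 * The index is sampled here; a flip performed after this point
	 * does not affect this reader, which is why both index values
	 * must be scanned to zero before a callback can be invoked.
	 */
	idx = srcu_read_lock(&my_srcu);

	/* ... dereference SRCU-protected data ... */

	srcu_read_unlock(&my_srcu, idx);	/* drops the count for that idx */
}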
589 | |||
590 | /* | ||
591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
592 | * their grace period. If there are more to do, SRCU will reschedule | ||
593 | * the workqueue. | ||
594 | */ | ||
595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
596 | { | ||
597 | int i; | ||
598 | struct rcu_head *head; | ||
599 | |||
600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
602 | if (!head) | ||
603 | break; | ||
604 | local_bh_disable(); | ||
605 | head->func(head); | ||
606 | local_bh_enable(); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Finished one round of SRCU grace period. Start another if there are | ||
612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
613 | */ | ||
614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
615 | { | ||
616 | bool pending = true; | ||
617 | |||
618 | if (rcu_batch_empty(&sp->batch_done) && | ||
619 | rcu_batch_empty(&sp->batch_check1) && | ||
620 | rcu_batch_empty(&sp->batch_check0) && | ||
621 | rcu_batch_empty(&sp->batch_queue)) { | ||
622 | spin_lock_irq(&sp->queue_lock); | ||
623 | if (rcu_batch_empty(&sp->batch_done) && | ||
624 | rcu_batch_empty(&sp->batch_check1) && | ||
625 | rcu_batch_empty(&sp->batch_check0) && | ||
626 | rcu_batch_empty(&sp->batch_queue)) { | ||
627 | sp->running = false; | ||
628 | pending = false; | ||
629 | } | ||
630 | spin_unlock_irq(&sp->queue_lock); | ||
631 | } | ||
632 | |||
633 | if (pending) | ||
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * This is the work-queue function that handles SRCU grace periods. | ||
639 | */ | ||
640 | static void process_srcu(struct work_struct *work) | ||
641 | { | ||
642 | struct srcu_struct *sp; | ||
643 | |||
644 | sp = container_of(work, struct srcu_struct, work.work); | ||
645 | |||
646 | srcu_collect_new(sp); | ||
647 | srcu_advance_batches(sp, 1); | ||
648 | srcu_invoke_callbacks(sp); | ||
649 | srcu_reschedule(sp); | ||
650 | } | ||
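process_srcu() above is a self-rearming piece of delayed work: each pass collects new callbacks, tries to advance the pipeline once, invokes whatever reached ->batch_done, and requeues itself while work remains. A minimal sketch of that pattern with the same workqueue call, under the assumption that my_work and my_step stand in for the real state machine:

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_work, my_work_fn);

/* Stand-in for one pass of the state machine; returns true if more remains. */
static bool my_step(void)
{
	return false;
}

static void my_work_fn(struct work_struct *work)
{
	if (my_step())
		queue_delayed_work(system_nrt_wq, &my_work, 1);	/* 1 jiffy, like SRCU_INTERVAL */
}

static void my_start(void)
{
	queue_delayed_work(system_nrt_wq, &my_work, 1);
}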
diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e4..9ff89cb9657a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -36,6 +36,8 @@ | |||
36 | #include <linux/personality.h> | 36 | #include <linux/personality.h> |
37 | #include <linux/ptrace.h> | 37 | #include <linux/ptrace.h> |
38 | #include <linux/fs_struct.h> | 38 | #include <linux/fs_struct.h> |
39 | #include <linux/file.h> | ||
40 | #include <linux/mount.h> | ||
39 | #include <linux/gfp.h> | 41 | #include <linux/gfp.h> |
40 | #include <linux/syscore_ops.h> | 42 | #include <linux/syscore_ops.h> |
41 | #include <linux/version.h> | 43 | #include <linux/version.h> |
@@ -93,10 +95,8 @@ | |||
93 | int overflowuid = DEFAULT_OVERFLOWUID; | 95 | int overflowuid = DEFAULT_OVERFLOWUID; |
94 | int overflowgid = DEFAULT_OVERFLOWGID; | 96 | int overflowgid = DEFAULT_OVERFLOWGID; |
95 | 97 | ||
96 | #ifdef CONFIG_UID16 | ||
97 | EXPORT_SYMBOL(overflowuid); | 98 | EXPORT_SYMBOL(overflowuid); |
98 | EXPORT_SYMBOL(overflowgid); | 99 | EXPORT_SYMBOL(overflowgid); |
99 | #endif | ||
100 | 100 | ||
101 | /* | 101 | /* |
102 | * the same as above, but for filesystems which can only store a 16-bit | 102 | * the same as above, but for filesystems which can only store a 16-bit |
@@ -133,11 +133,10 @@ static bool set_one_prio_perm(struct task_struct *p) | |||
133 | { | 133 | { |
134 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | 134 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); |
135 | 135 | ||
136 | if (pcred->user->user_ns == cred->user->user_ns && | 136 | if (uid_eq(pcred->uid, cred->euid) || |
137 | (pcred->uid == cred->euid || | 137 | uid_eq(pcred->euid, cred->euid)) |
138 | pcred->euid == cred->euid)) | ||
139 | return true; | 138 | return true; |
140 | if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) | 139 | if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) |
141 | return true; | 140 | return true; |
142 | return false; | 141 | return false; |
143 | } | 142 | } |
@@ -177,6 +176,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
177 | const struct cred *cred = current_cred(); | 176 | const struct cred *cred = current_cred(); |
178 | int error = -EINVAL; | 177 | int error = -EINVAL; |
179 | struct pid *pgrp; | 178 | struct pid *pgrp; |
179 | kuid_t uid; | ||
180 | 180 | ||
181 | if (which > PRIO_USER || which < PRIO_PROCESS) | 181 | if (which > PRIO_USER || which < PRIO_PROCESS) |
182 | goto out; | 182 | goto out; |
@@ -209,18 +209,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
209 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 209 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
210 | break; | 210 | break; |
211 | case PRIO_USER: | 211 | case PRIO_USER: |
212 | user = (struct user_struct *) cred->user; | 212 | uid = make_kuid(cred->user_ns, who); |
213 | user = cred->user; | ||
213 | if (!who) | 214 | if (!who) |
214 | who = cred->uid; | 215 | uid = cred->uid; |
215 | else if ((who != cred->uid) && | 216 | else if (!uid_eq(uid, cred->uid) && |
216 | !(user = find_user(who))) | 217 | !(user = find_user(uid))) |
217 | goto out_unlock; /* No processes for this user */ | 218 | goto out_unlock; /* No processes for this user */ |
218 | 219 | ||
219 | do_each_thread(g, p) { | 220 | do_each_thread(g, p) { |
220 | if (__task_cred(p)->uid == who) | 221 | if (uid_eq(task_uid(p), uid)) |
221 | error = set_one_prio(p, niceval, error); | 222 | error = set_one_prio(p, niceval, error); |
222 | } while_each_thread(g, p); | 223 | } while_each_thread(g, p); |
223 | if (who != cred->uid) | 224 | if (!uid_eq(uid, cred->uid)) |
224 | free_uid(user); /* For find_user() */ | 225 | free_uid(user); /* For find_user() */ |
225 | break; | 226 | break; |
226 | } | 227 | } |
@@ -244,6 +245,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
244 | const struct cred *cred = current_cred(); | 245 | const struct cred *cred = current_cred(); |
245 | long niceval, retval = -ESRCH; | 246 | long niceval, retval = -ESRCH; |
246 | struct pid *pgrp; | 247 | struct pid *pgrp; |
248 | kuid_t uid; | ||
247 | 249 | ||
248 | if (which > PRIO_USER || which < PRIO_PROCESS) | 250 | if (which > PRIO_USER || which < PRIO_PROCESS) |
249 | return -EINVAL; | 251 | return -EINVAL; |
@@ -274,21 +276,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
274 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 276 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
275 | break; | 277 | break; |
276 | case PRIO_USER: | 278 | case PRIO_USER: |
277 | user = (struct user_struct *) cred->user; | 279 | uid = make_kuid(cred->user_ns, who); |
280 | user = cred->user; | ||
278 | if (!who) | 281 | if (!who) |
279 | who = cred->uid; | 282 | uid = cred->uid; |
280 | else if ((who != cred->uid) && | 283 | else if (!uid_eq(uid, cred->uid) && |
281 | !(user = find_user(who))) | 284 | !(user = find_user(uid))) |
282 | goto out_unlock; /* No processes for this user */ | 285 | goto out_unlock; /* No processes for this user */ |
283 | 286 | ||
284 | do_each_thread(g, p) { | 287 | do_each_thread(g, p) { |
285 | if (__task_cred(p)->uid == who) { | 288 | if (uid_eq(task_uid(p), uid)) { |
286 | niceval = 20 - task_nice(p); | 289 | niceval = 20 - task_nice(p); |
287 | if (niceval > retval) | 290 | if (niceval > retval) |
288 | retval = niceval; | 291 | retval = niceval; |
289 | } | 292 | } |
290 | } while_each_thread(g, p); | 293 | } while_each_thread(g, p); |
291 | if (who != cred->uid) | 294 | if (!uid_eq(uid, cred->uid)) |
292 | free_uid(user); /* for find_user() */ | 295 | free_uid(user); /* for find_user() */ |
293 | break; | 296 | break; |
294 | } | 297 | } |
@@ -553,9 +556,19 @@ void ctrl_alt_del(void) | |||
553 | */ | 556 | */ |
554 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 557 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) |
555 | { | 558 | { |
559 | struct user_namespace *ns = current_user_ns(); | ||
556 | const struct cred *old; | 560 | const struct cred *old; |
557 | struct cred *new; | 561 | struct cred *new; |
558 | int retval; | 562 | int retval; |
563 | kgid_t krgid, kegid; | ||
564 | |||
565 | krgid = make_kgid(ns, rgid); | ||
566 | kegid = make_kgid(ns, egid); | ||
567 | |||
568 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
569 | return -EINVAL; | ||
570 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
571 | return -EINVAL; | ||
559 | 572 | ||
560 | new = prepare_creds(); | 573 | new = prepare_creds(); |
561 | if (!new) | 574 | if (!new) |
@@ -564,25 +577,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
564 | 577 | ||
565 | retval = -EPERM; | 578 | retval = -EPERM; |
566 | if (rgid != (gid_t) -1) { | 579 | if (rgid != (gid_t) -1) { |
567 | if (old->gid == rgid || | 580 | if (gid_eq(old->gid, krgid) || |
568 | old->egid == rgid || | 581 | gid_eq(old->egid, krgid) || |
569 | nsown_capable(CAP_SETGID)) | 582 | nsown_capable(CAP_SETGID)) |
570 | new->gid = rgid; | 583 | new->gid = krgid; |
571 | else | 584 | else |
572 | goto error; | 585 | goto error; |
573 | } | 586 | } |
574 | if (egid != (gid_t) -1) { | 587 | if (egid != (gid_t) -1) { |
575 | if (old->gid == egid || | 588 | if (gid_eq(old->gid, kegid) || |
576 | old->egid == egid || | 589 | gid_eq(old->egid, kegid) || |
577 | old->sgid == egid || | 590 | gid_eq(old->sgid, kegid) || |
578 | nsown_capable(CAP_SETGID)) | 591 | nsown_capable(CAP_SETGID)) |
579 | new->egid = egid; | 592 | new->egid = kegid; |
580 | else | 593 | else |
581 | goto error; | 594 | goto error; |
582 | } | 595 | } |
583 | 596 | ||
584 | if (rgid != (gid_t) -1 || | 597 | if (rgid != (gid_t) -1 || |
585 | (egid != (gid_t) -1 && egid != old->gid)) | 598 | (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) |
586 | new->sgid = new->egid; | 599 | new->sgid = new->egid; |
587 | new->fsgid = new->egid; | 600 | new->fsgid = new->egid; |
588 | 601 | ||
@@ -600,9 +613,15 @@ error: | |||
600 | */ | 613 | */ |
601 | SYSCALL_DEFINE1(setgid, gid_t, gid) | 614 | SYSCALL_DEFINE1(setgid, gid_t, gid) |
602 | { | 615 | { |
616 | struct user_namespace *ns = current_user_ns(); | ||
603 | const struct cred *old; | 617 | const struct cred *old; |
604 | struct cred *new; | 618 | struct cred *new; |
605 | int retval; | 619 | int retval; |
620 | kgid_t kgid; | ||
621 | |||
622 | kgid = make_kgid(ns, gid); | ||
623 | if (!gid_valid(kgid)) | ||
624 | return -EINVAL; | ||
606 | 625 | ||
607 | new = prepare_creds(); | 626 | new = prepare_creds(); |
608 | if (!new) | 627 | if (!new) |
@@ -611,9 +630,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
611 | 630 | ||
612 | retval = -EPERM; | 631 | retval = -EPERM; |
613 | if (nsown_capable(CAP_SETGID)) | 632 | if (nsown_capable(CAP_SETGID)) |
614 | new->gid = new->egid = new->sgid = new->fsgid = gid; | 633 | new->gid = new->egid = new->sgid = new->fsgid = kgid; |
615 | else if (gid == old->gid || gid == old->sgid) | 634 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) |
616 | new->egid = new->fsgid = gid; | 635 | new->egid = new->fsgid = kgid; |
617 | else | 636 | else |
618 | goto error; | 637 | goto error; |
619 | 638 | ||
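All of the setgid-family conversions in this file follow one shape: map the userspace gid_t into a kgid_t with make_kgid() against the caller's user namespace, reject unmappable values with -EINVAL, and from then on compare only kgid_t values with gid_eq(). A hedged, stand-alone illustration of that shape (my_check_gid is not in the patch; it only mirrors the checks above):

#include <linux/uidgid.h>
#include <linux/cred.h>

/* Map a userspace gid and compare it against the caller's real gid. */
static int my_check_gid(gid_t gid)
{
	struct user_namespace *ns = current_user_ns();
	kgid_t kgid = make_kgid(ns, gid);

	if (!gid_valid(kgid))
		return -EINVAL;		/* no mapping in this user namespace */

	return gid_eq(kgid, current_cred()->gid) ? 0 : -EPERM;
}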
@@ -631,7 +650,7 @@ static int set_user(struct cred *new) | |||
631 | { | 650 | { |
632 | struct user_struct *new_user; | 651 | struct user_struct *new_user; |
633 | 652 | ||
634 | new_user = alloc_uid(current_user_ns(), new->uid); | 653 | new_user = alloc_uid(new->uid); |
635 | if (!new_user) | 654 | if (!new_user) |
636 | return -EAGAIN; | 655 | return -EAGAIN; |
637 | 656 | ||
@@ -670,9 +689,19 @@ static int set_user(struct cred *new) | |||
670 | */ | 689 | */ |
671 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 690 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) |
672 | { | 691 | { |
692 | struct user_namespace *ns = current_user_ns(); | ||
673 | const struct cred *old; | 693 | const struct cred *old; |
674 | struct cred *new; | 694 | struct cred *new; |
675 | int retval; | 695 | int retval; |
696 | kuid_t kruid, keuid; | ||
697 | |||
698 | kruid = make_kuid(ns, ruid); | ||
699 | keuid = make_kuid(ns, euid); | ||
700 | |||
701 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
702 | return -EINVAL; | ||
703 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
704 | return -EINVAL; | ||
676 | 705 | ||
677 | new = prepare_creds(); | 706 | new = prepare_creds(); |
678 | if (!new) | 707 | if (!new) |
@@ -681,29 +710,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
681 | 710 | ||
682 | retval = -EPERM; | 711 | retval = -EPERM; |
683 | if (ruid != (uid_t) -1) { | 712 | if (ruid != (uid_t) -1) { |
684 | new->uid = ruid; | 713 | new->uid = kruid; |
685 | if (old->uid != ruid && | 714 | if (!uid_eq(old->uid, kruid) && |
686 | old->euid != ruid && | 715 | !uid_eq(old->euid, kruid) && |
687 | !nsown_capable(CAP_SETUID)) | 716 | !nsown_capable(CAP_SETUID)) |
688 | goto error; | 717 | goto error; |
689 | } | 718 | } |
690 | 719 | ||
691 | if (euid != (uid_t) -1) { | 720 | if (euid != (uid_t) -1) { |
692 | new->euid = euid; | 721 | new->euid = keuid; |
693 | if (old->uid != euid && | 722 | if (!uid_eq(old->uid, keuid) && |
694 | old->euid != euid && | 723 | !uid_eq(old->euid, keuid) && |
695 | old->suid != euid && | 724 | !uid_eq(old->suid, keuid) && |
696 | !nsown_capable(CAP_SETUID)) | 725 | !nsown_capable(CAP_SETUID)) |
697 | goto error; | 726 | goto error; |
698 | } | 727 | } |
699 | 728 | ||
700 | if (new->uid != old->uid) { | 729 | if (!uid_eq(new->uid, old->uid)) { |
701 | retval = set_user(new); | 730 | retval = set_user(new); |
702 | if (retval < 0) | 731 | if (retval < 0) |
703 | goto error; | 732 | goto error; |
704 | } | 733 | } |
705 | if (ruid != (uid_t) -1 || | 734 | if (ruid != (uid_t) -1 || |
706 | (euid != (uid_t) -1 && euid != old->uid)) | 735 | (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) |
707 | new->suid = new->euid; | 736 | new->suid = new->euid; |
708 | new->fsuid = new->euid; | 737 | new->fsuid = new->euid; |
709 | 738 | ||
@@ -731,9 +760,15 @@ error: | |||
731 | */ | 760 | */ |
732 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 761 | SYSCALL_DEFINE1(setuid, uid_t, uid) |
733 | { | 762 | { |
763 | struct user_namespace *ns = current_user_ns(); | ||
734 | const struct cred *old; | 764 | const struct cred *old; |
735 | struct cred *new; | 765 | struct cred *new; |
736 | int retval; | 766 | int retval; |
767 | kuid_t kuid; | ||
768 | |||
769 | kuid = make_kuid(ns, uid); | ||
770 | if (!uid_valid(kuid)) | ||
771 | return -EINVAL; | ||
737 | 772 | ||
738 | new = prepare_creds(); | 773 | new = prepare_creds(); |
739 | if (!new) | 774 | if (!new) |
@@ -742,17 +777,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
742 | 777 | ||
743 | retval = -EPERM; | 778 | retval = -EPERM; |
744 | if (nsown_capable(CAP_SETUID)) { | 779 | if (nsown_capable(CAP_SETUID)) { |
745 | new->suid = new->uid = uid; | 780 | new->suid = new->uid = kuid; |
746 | if (uid != old->uid) { | 781 | if (!uid_eq(kuid, old->uid)) { |
747 | retval = set_user(new); | 782 | retval = set_user(new); |
748 | if (retval < 0) | 783 | if (retval < 0) |
749 | goto error; | 784 | goto error; |
750 | } | 785 | } |
751 | } else if (uid != old->uid && uid != new->suid) { | 786 | } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { |
752 | goto error; | 787 | goto error; |
753 | } | 788 | } |
754 | 789 | ||
755 | new->fsuid = new->euid = uid; | 790 | new->fsuid = new->euid = kuid; |
756 | 791 | ||
757 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); | 792 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); |
758 | if (retval < 0) | 793 | if (retval < 0) |
@@ -772,9 +807,24 @@ error: | |||
772 | */ | 807 | */ |
773 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | 808 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) |
774 | { | 809 | { |
810 | struct user_namespace *ns = current_user_ns(); | ||
775 | const struct cred *old; | 811 | const struct cred *old; |
776 | struct cred *new; | 812 | struct cred *new; |
777 | int retval; | 813 | int retval; |
814 | kuid_t kruid, keuid, ksuid; | ||
815 | |||
816 | kruid = make_kuid(ns, ruid); | ||
817 | keuid = make_kuid(ns, euid); | ||
818 | ksuid = make_kuid(ns, suid); | ||
819 | |||
820 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
821 | return -EINVAL; | ||
822 | |||
823 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
824 | return -EINVAL; | ||
825 | |||
826 | if ((suid != (uid_t) -1) && !uid_valid(ksuid)) | ||
827 | return -EINVAL; | ||
778 | 828 | ||
779 | new = prepare_creds(); | 829 | new = prepare_creds(); |
780 | if (!new) | 830 | if (!new) |
@@ -784,29 +834,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
784 | 834 | ||
785 | retval = -EPERM; | 835 | retval = -EPERM; |
786 | if (!nsown_capable(CAP_SETUID)) { | 836 | if (!nsown_capable(CAP_SETUID)) { |
787 | if (ruid != (uid_t) -1 && ruid != old->uid && | 837 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && |
788 | ruid != old->euid && ruid != old->suid) | 838 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) |
789 | goto error; | 839 | goto error; |
790 | if (euid != (uid_t) -1 && euid != old->uid && | 840 | if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && |
791 | euid != old->euid && euid != old->suid) | 841 | !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) |
792 | goto error; | 842 | goto error; |
793 | if (suid != (uid_t) -1 && suid != old->uid && | 843 | if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && |
794 | suid != old->euid && suid != old->suid) | 844 | !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) |
795 | goto error; | 845 | goto error; |
796 | } | 846 | } |
797 | 847 | ||
798 | if (ruid != (uid_t) -1) { | 848 | if (ruid != (uid_t) -1) { |
799 | new->uid = ruid; | 849 | new->uid = kruid; |
800 | if (ruid != old->uid) { | 850 | if (!uid_eq(kruid, old->uid)) { |
801 | retval = set_user(new); | 851 | retval = set_user(new); |
802 | if (retval < 0) | 852 | if (retval < 0) |
803 | goto error; | 853 | goto error; |
804 | } | 854 | } |
805 | } | 855 | } |
806 | if (euid != (uid_t) -1) | 856 | if (euid != (uid_t) -1) |
807 | new->euid = euid; | 857 | new->euid = keuid; |
808 | if (suid != (uid_t) -1) | 858 | if (suid != (uid_t) -1) |
809 | new->suid = suid; | 859 | new->suid = ksuid; |
810 | new->fsuid = new->euid; | 860 | new->fsuid = new->euid; |
811 | 861 | ||
812 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); | 862 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); |
@@ -820,14 +870,19 @@ error: | |||
820 | return retval; | 870 | return retval; |
821 | } | 871 | } |
822 | 872 | ||
823 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) | 873 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) |
824 | { | 874 | { |
825 | const struct cred *cred = current_cred(); | 875 | const struct cred *cred = current_cred(); |
826 | int retval; | 876 | int retval; |
877 | uid_t ruid, euid, suid; | ||
827 | 878 | ||
828 | if (!(retval = put_user(cred->uid, ruid)) && | 879 | ruid = from_kuid_munged(cred->user_ns, cred->uid); |
829 | !(retval = put_user(cred->euid, euid))) | 880 | euid = from_kuid_munged(cred->user_ns, cred->euid); |
830 | retval = put_user(cred->suid, suid); | 881 | suid = from_kuid_munged(cred->user_ns, cred->suid); |
882 | |||
883 | if (!(retval = put_user(ruid, ruidp)) && | ||
884 | !(retval = put_user(euid, euidp))) | ||
885 | retval = put_user(suid, suidp); | ||
831 | 886 | ||
832 | return retval; | 887 | return retval; |
833 | } | 888 | } |
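Going back out to userspace is the inverse conversion: from_kuid_munged() turns a kuid_t into a uid_t relative to the viewer's namespace, substituting the overflow uid when no mapping exists, which is why getresuid() above never fails on an unmappable id. A small hedged sketch of that direction (my_report_uid is illustrative only):

#include <linux/uidgid.h>
#include <linux/cred.h>
#include <linux/uaccess.h>

/* Report the caller's real uid as seen from its own user namespace. */
static int my_report_uid(uid_t __user *uidp)
{
	const struct cred *cred = current_cred();
	uid_t uid = from_kuid_munged(cred->user_ns, cred->uid);

	return put_user(uid, uidp);
}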
@@ -837,9 +892,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u | |||
837 | */ | 892 | */ |
838 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | 893 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) |
839 | { | 894 | { |
895 | struct user_namespace *ns = current_user_ns(); | ||
840 | const struct cred *old; | 896 | const struct cred *old; |
841 | struct cred *new; | 897 | struct cred *new; |
842 | int retval; | 898 | int retval; |
899 | kgid_t krgid, kegid, ksgid; | ||
900 | |||
901 | krgid = make_kgid(ns, rgid); | ||
902 | kegid = make_kgid(ns, egid); | ||
903 | ksgid = make_kgid(ns, sgid); | ||
904 | |||
905 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
906 | return -EINVAL; | ||
907 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
908 | return -EINVAL; | ||
909 | if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) | ||
910 | return -EINVAL; | ||
843 | 911 | ||
844 | new = prepare_creds(); | 912 | new = prepare_creds(); |
845 | if (!new) | 913 | if (!new) |
@@ -848,23 +916,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
848 | 916 | ||
849 | retval = -EPERM; | 917 | retval = -EPERM; |
850 | if (!nsown_capable(CAP_SETGID)) { | 918 | if (!nsown_capable(CAP_SETGID)) { |
851 | if (rgid != (gid_t) -1 && rgid != old->gid && | 919 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && |
852 | rgid != old->egid && rgid != old->sgid) | 920 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) |
853 | goto error; | 921 | goto error; |
854 | if (egid != (gid_t) -1 && egid != old->gid && | 922 | if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && |
855 | egid != old->egid && egid != old->sgid) | 923 | !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) |
856 | goto error; | 924 | goto error; |
857 | if (sgid != (gid_t) -1 && sgid != old->gid && | 925 | if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && |
858 | sgid != old->egid && sgid != old->sgid) | 926 | !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) |
859 | goto error; | 927 | goto error; |
860 | } | 928 | } |
861 | 929 | ||
862 | if (rgid != (gid_t) -1) | 930 | if (rgid != (gid_t) -1) |
863 | new->gid = rgid; | 931 | new->gid = krgid; |
864 | if (egid != (gid_t) -1) | 932 | if (egid != (gid_t) -1) |
865 | new->egid = egid; | 933 | new->egid = kegid; |
866 | if (sgid != (gid_t) -1) | 934 | if (sgid != (gid_t) -1) |
867 | new->sgid = sgid; | 935 | new->sgid = ksgid; |
868 | new->fsgid = new->egid; | 936 | new->fsgid = new->egid; |
869 | 937 | ||
870 | return commit_creds(new); | 938 | return commit_creds(new); |
@@ -874,14 +942,19 @@ error: | |||
874 | return retval; | 942 | return retval; |
875 | } | 943 | } |
876 | 944 | ||
877 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) | 945 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) |
878 | { | 946 | { |
879 | const struct cred *cred = current_cred(); | 947 | const struct cred *cred = current_cred(); |
880 | int retval; | 948 | int retval; |
949 | gid_t rgid, egid, sgid; | ||
881 | 950 | ||
882 | if (!(retval = put_user(cred->gid, rgid)) && | 951 | rgid = from_kgid_munged(cred->user_ns, cred->gid); |
883 | !(retval = put_user(cred->egid, egid))) | 952 | egid = from_kgid_munged(cred->user_ns, cred->egid); |
884 | retval = put_user(cred->sgid, sgid); | 953 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); |
954 | |||
955 | if (!(retval = put_user(rgid, rgidp)) && | ||
956 | !(retval = put_user(egid, egidp))) | ||
957 | retval = put_user(sgid, sgidp); | ||
885 | 958 | ||
886 | return retval; | 959 | return retval; |
887 | } | 960 | } |
@@ -898,18 +971,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
898 | const struct cred *old; | 971 | const struct cred *old; |
899 | struct cred *new; | 972 | struct cred *new; |
900 | uid_t old_fsuid; | 973 | uid_t old_fsuid; |
974 | kuid_t kuid; | ||
975 | |||
976 | old = current_cred(); | ||
977 | old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); | ||
978 | |||
979 | kuid = make_kuid(old->user_ns, uid); | ||
980 | if (!uid_valid(kuid)) | ||
981 | return old_fsuid; | ||
901 | 982 | ||
902 | new = prepare_creds(); | 983 | new = prepare_creds(); |
903 | if (!new) | 984 | if (!new) |
904 | return current_fsuid(); | 985 | return old_fsuid; |
905 | old = current_cred(); | ||
906 | old_fsuid = old->fsuid; | ||
907 | 986 | ||
908 | if (uid == old->uid || uid == old->euid || | 987 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || |
909 | uid == old->suid || uid == old->fsuid || | 988 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || |
910 | nsown_capable(CAP_SETUID)) { | 989 | nsown_capable(CAP_SETUID)) { |
911 | if (uid != old_fsuid) { | 990 | if (!uid_eq(kuid, old->fsuid)) { |
912 | new->fsuid = uid; | 991 | new->fsuid = kuid; |
913 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 992 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
914 | goto change_okay; | 993 | goto change_okay; |
915 | } | 994 | } |
@@ -931,18 +1010,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
931 | const struct cred *old; | 1010 | const struct cred *old; |
932 | struct cred *new; | 1011 | struct cred *new; |
933 | gid_t old_fsgid; | 1012 | gid_t old_fsgid; |
1013 | kgid_t kgid; | ||
1014 | |||
1015 | old = current_cred(); | ||
1016 | old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); | ||
1017 | |||
1018 | kgid = make_kgid(old->user_ns, gid); | ||
1019 | if (!gid_valid(kgid)) | ||
1020 | return old_fsgid; | ||
934 | 1021 | ||
935 | new = prepare_creds(); | 1022 | new = prepare_creds(); |
936 | if (!new) | 1023 | if (!new) |
937 | return current_fsgid(); | 1024 | return old_fsgid; |
938 | old = current_cred(); | ||
939 | old_fsgid = old->fsgid; | ||
940 | 1025 | ||
941 | if (gid == old->gid || gid == old->egid || | 1026 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || |
942 | gid == old->sgid || gid == old->fsgid || | 1027 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || |
943 | nsown_capable(CAP_SETGID)) { | 1028 | nsown_capable(CAP_SETGID)) { |
944 | if (gid != old_fsgid) { | 1029 | if (!gid_eq(kgid, old->fsgid)) { |
945 | new->fsgid = gid; | 1030 | new->fsgid = kgid; |
946 | goto change_okay; | 1031 | goto change_okay; |
947 | } | 1032 | } |
948 | } | 1033 | } |
@@ -1295,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | |||
1295 | memcpy(u->nodename, tmp, len); | 1380 | memcpy(u->nodename, tmp, len); |
1296 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); | 1381 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); |
1297 | errno = 0; | 1382 | errno = 0; |
1383 | uts_proc_notify(UTS_PROC_HOSTNAME); | ||
1298 | } | 1384 | } |
1299 | uts_proc_notify(UTS_PROC_HOSTNAME); | ||
1300 | up_write(&uts_sem); | 1385 | up_write(&uts_sem); |
1301 | return errno; | 1386 | return errno; |
1302 | } | 1387 | } |
@@ -1346,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
1346 | memcpy(u->domainname, tmp, len); | 1431 | memcpy(u->domainname, tmp, len); |
1347 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); | 1432 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); |
1348 | errno = 0; | 1433 | errno = 0; |
1434 | uts_proc_notify(UTS_PROC_DOMAINNAME); | ||
1349 | } | 1435 | } |
1350 | uts_proc_notify(UTS_PROC_DOMAINNAME); | ||
1351 | up_write(&uts_sem); | 1436 | up_write(&uts_sem); |
1352 | return errno; | 1437 | return errno; |
1353 | } | 1438 | } |
@@ -1498,15 +1583,14 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1498 | return 0; | 1583 | return 0; |
1499 | 1584 | ||
1500 | tcred = __task_cred(task); | 1585 | tcred = __task_cred(task); |
1501 | if (cred->user->user_ns == tcred->user->user_ns && | 1586 | if (uid_eq(cred->uid, tcred->euid) && |
1502 | (cred->uid == tcred->euid && | 1587 | uid_eq(cred->uid, tcred->suid) && |
1503 | cred->uid == tcred->suid && | 1588 | uid_eq(cred->uid, tcred->uid) && |
1504 | cred->uid == tcred->uid && | 1589 | gid_eq(cred->gid, tcred->egid) && |
1505 | cred->gid == tcred->egid && | 1590 | gid_eq(cred->gid, tcred->sgid) && |
1506 | cred->gid == tcred->sgid && | 1591 | gid_eq(cred->gid, tcred->gid)) |
1507 | cred->gid == tcred->gid)) | ||
1508 | return 0; | 1592 | return 0; |
1509 | if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) | 1593 | if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) |
1510 | return 0; | 1594 | return 0; |
1511 | 1595 | ||
1512 | return -EPERM; | 1596 | return -EPERM; |
@@ -1702,77 +1786,102 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1702 | } | 1786 | } |
1703 | 1787 | ||
1704 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
1789 | static bool vma_flags_mismatch(struct vm_area_struct *vma, | ||
1790 | unsigned long required, | ||
1791 | unsigned long banned) | ||
1792 | { | ||
1793 | return (vma->vm_flags & required) != required || | ||
1794 | (vma->vm_flags & banned); | ||
1795 | } | ||
1796 | |||
1797 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | ||
1798 | { | ||
1799 | struct file *exe_file; | ||
1800 | struct dentry *dentry; | ||
1801 | int err; | ||
1802 | |||
1803 | /* | ||
1804 | * Setting a new mm::exe_file is only allowed when no VM_EXECUTABLE VMAs | ||
1805 | * remain, so perform a quick test first. | ||
1806 | */ | ||
1807 | if (mm->num_exe_file_vmas) | ||
1808 | return -EBUSY; | ||
1809 | |||
1810 | exe_file = fget(fd); | ||
1811 | if (!exe_file) | ||
1812 | return -EBADF; | ||
1813 | |||
1814 | dentry = exe_file->f_path.dentry; | ||
1815 | |||
1816 | /* | ||
1817 | * Because the original mm->exe_file points to an executable file, make | ||
1818 | * sure that this one is executable as well, so that the overall | ||
1819 | * picture stays consistent. | ||
1820 | */ | ||
1821 | err = -EACCES; | ||
1822 | if (!S_ISREG(dentry->d_inode->i_mode) || | ||
1823 | exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) | ||
1824 | goto exit; | ||
1825 | |||
1826 | err = inode_permission(dentry->d_inode, MAY_EXEC); | ||
1827 | if (err) | ||
1828 | goto exit; | ||
1829 | |||
1830 | /* | ||
1831 | * The symlink can be changed only once, just to disallow arbitrary | ||
1832 | * transitions malicious software might bring in. This means one | ||
1833 | * could take a snapshot of all running processes and monitor | ||
1834 | * /proc/pid/exe changes to notice unusual activity if needed. | ||
1835 | */ | ||
1836 | down_write(&mm->mmap_sem); | ||
1837 | if (likely(!mm->exe_file)) | ||
1838 | set_mm_exe_file(mm, exe_file); | ||
1839 | else | ||
1840 | err = -EBUSY; | ||
1841 | up_write(&mm->mmap_sem); | ||
1842 | |||
1843 | exit: | ||
1844 | fput(exe_file); | ||
1845 | return err; | ||
1846 | } | ||
1847 | |||
1705 | static int prctl_set_mm(int opt, unsigned long addr, | 1848 | static int prctl_set_mm(int opt, unsigned long addr, |
1706 | unsigned long arg4, unsigned long arg5) | 1849 | unsigned long arg4, unsigned long arg5) |
1707 | { | 1850 | { |
1708 | unsigned long rlim = rlimit(RLIMIT_DATA); | 1851 | unsigned long rlim = rlimit(RLIMIT_DATA); |
1709 | unsigned long vm_req_flags; | ||
1710 | unsigned long vm_bad_flags; | ||
1711 | struct vm_area_struct *vma; | ||
1712 | int error = 0; | ||
1713 | struct mm_struct *mm = current->mm; | 1852 | struct mm_struct *mm = current->mm; |
1853 | struct vm_area_struct *vma; | ||
1854 | int error; | ||
1714 | 1855 | ||
1715 | if (arg4 | arg5) | 1856 | if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) |
1716 | return -EINVAL; | 1857 | return -EINVAL; |
1717 | 1858 | ||
1718 | if (!capable(CAP_SYS_RESOURCE)) | 1859 | if (!capable(CAP_SYS_RESOURCE)) |
1719 | return -EPERM; | 1860 | return -EPERM; |
1720 | 1861 | ||
1862 | if (opt == PR_SET_MM_EXE_FILE) | ||
1863 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | ||
1864 | |||
1721 | if (addr >= TASK_SIZE) | 1865 | if (addr >= TASK_SIZE) |
1722 | return -EINVAL; | 1866 | return -EINVAL; |
1723 | 1867 | ||
1868 | error = -EINVAL; | ||
1869 | |||
1724 | down_read(&mm->mmap_sem); | 1870 | down_read(&mm->mmap_sem); |
1725 | vma = find_vma(mm, addr); | 1871 | vma = find_vma(mm, addr); |
1726 | 1872 | ||
1727 | if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { | ||
1728 | /* It must be existing VMA */ | ||
1729 | if (!vma || vma->vm_start > addr) | ||
1730 | goto out; | ||
1731 | } | ||
1732 | |||
1733 | error = -EINVAL; | ||
1734 | switch (opt) { | 1873 | switch (opt) { |
1735 | case PR_SET_MM_START_CODE: | 1874 | case PR_SET_MM_START_CODE: |
1875 | mm->start_code = addr; | ||
1876 | break; | ||
1736 | case PR_SET_MM_END_CODE: | 1877 | case PR_SET_MM_END_CODE: |
1737 | vm_req_flags = VM_READ | VM_EXEC; | 1878 | mm->end_code = addr; |
1738 | vm_bad_flags = VM_WRITE | VM_MAYSHARE; | ||
1739 | |||
1740 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1741 | (vma->vm_flags & vm_bad_flags)) | ||
1742 | goto out; | ||
1743 | |||
1744 | if (opt == PR_SET_MM_START_CODE) | ||
1745 | mm->start_code = addr; | ||
1746 | else | ||
1747 | mm->end_code = addr; | ||
1748 | break; | 1879 | break; |
1749 | |||
1750 | case PR_SET_MM_START_DATA: | 1880 | case PR_SET_MM_START_DATA: |
1751 | case PR_SET_MM_END_DATA: | 1881 | mm->start_data = addr; |
1752 | vm_req_flags = VM_READ | VM_WRITE; | ||
1753 | vm_bad_flags = VM_EXEC | VM_MAYSHARE; | ||
1754 | |||
1755 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1756 | (vma->vm_flags & vm_bad_flags)) | ||
1757 | goto out; | ||
1758 | |||
1759 | if (opt == PR_SET_MM_START_DATA) | ||
1760 | mm->start_data = addr; | ||
1761 | else | ||
1762 | mm->end_data = addr; | ||
1763 | break; | 1882 | break; |
1764 | 1883 | case PR_SET_MM_END_DATA: | |
1765 | case PR_SET_MM_START_STACK: | 1884 | mm->end_data = addr; |
1766 | |||
1767 | #ifdef CONFIG_STACK_GROWSUP | ||
1768 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; | ||
1769 | #else | ||
1770 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; | ||
1771 | #endif | ||
1772 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags) | ||
1773 | goto out; | ||
1774 | |||
1775 | mm->start_stack = addr; | ||
1776 | break; | 1885 | break; |
1777 | 1886 | ||
1778 | case PR_SET_MM_START_BRK: | 1887 | case PR_SET_MM_START_BRK: |
@@ -1799,16 +1908,77 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1799 | mm->brk = addr; | 1908 | mm->brk = addr; |
1800 | break; | 1909 | break; |
1801 | 1910 | ||
1911 | /* | ||
1912 | * If command line arguments and environment | ||
1913 | * are placed somewhere else on the stack, we can | ||
1914 | * set them up here: ARG_START/END for the | ||
1915 | * command line arguments and ENV_START/END | ||
1916 | * for the environment. | ||
1917 | */ | ||
1918 | case PR_SET_MM_START_STACK: | ||
1919 | case PR_SET_MM_ARG_START: | ||
1920 | case PR_SET_MM_ARG_END: | ||
1921 | case PR_SET_MM_ENV_START: | ||
1922 | case PR_SET_MM_ENV_END: | ||
1923 | if (!vma) { | ||
1924 | error = -EFAULT; | ||
1925 | goto out; | ||
1926 | } | ||
1927 | #ifdef CONFIG_STACK_GROWSUP | ||
1928 | if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) | ||
1929 | #else | ||
1930 | if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) | ||
1931 | #endif | ||
1932 | goto out; | ||
1933 | if (opt == PR_SET_MM_START_STACK) | ||
1934 | mm->start_stack = addr; | ||
1935 | else if (opt == PR_SET_MM_ARG_START) | ||
1936 | mm->arg_start = addr; | ||
1937 | else if (opt == PR_SET_MM_ARG_END) | ||
1938 | mm->arg_end = addr; | ||
1939 | else if (opt == PR_SET_MM_ENV_START) | ||
1940 | mm->env_start = addr; | ||
1941 | else if (opt == PR_SET_MM_ENV_END) | ||
1942 | mm->env_end = addr; | ||
1943 | break; | ||
1944 | |||
1945 | /* | ||
1946 | * This doesn't move the auxiliary vector itself, | ||
1947 | * since it's pinned to the mm_struct, but it allows | ||
1948 | * the vector to be filled with new values. It's up | ||
1949 | * to the caller to provide sane values here, | ||
1950 | * otherwise user-space tools which use this | ||
1951 | * vector might be unhappy. | ||
1952 | */ | ||
1953 | case PR_SET_MM_AUXV: { | ||
1954 | unsigned long user_auxv[AT_VECTOR_SIZE]; | ||
1955 | |||
1956 | if (arg4 > sizeof(user_auxv)) | ||
1957 | goto out; | ||
1958 | up_read(&mm->mmap_sem); | ||
1959 | |||
1960 | if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) | ||
1961 | return -EFAULT; | ||
1962 | |||
1963 | /* Make sure the last entry is always AT_NULL */ | ||
1964 | user_auxv[AT_VECTOR_SIZE - 2] = 0; | ||
1965 | user_auxv[AT_VECTOR_SIZE - 1] = 0; | ||
1966 | |||
1967 | BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); | ||
1968 | |||
1969 | task_lock(current); | ||
1970 | memcpy(mm->saved_auxv, user_auxv, arg4); | ||
1971 | task_unlock(current); | ||
1972 | |||
1973 | return 0; | ||
1974 | } | ||
1802 | default: | 1975 | default: |
1803 | error = -EINVAL; | ||
1804 | goto out; | 1976 | goto out; |
1805 | } | 1977 | } |
1806 | 1978 | ||
1807 | error = 0; | 1979 | error = 0; |
1808 | |||
1809 | out: | 1980 | out: |
1810 | up_read(&mm->mmap_sem); | 1981 | up_read(&mm->mmap_sem); |
1811 | |||
1812 | return error; | 1982 | return error; |
1813 | } | 1983 | } |
1814 | #else /* CONFIG_CHECKPOINT_RESTORE */ | 1984 | #else /* CONFIG_CHECKPOINT_RESTORE */ |
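From userspace, the checkpoint/restore side drives all of this through prctl(PR_SET_MM, opt, addr, arg4, 0), one call per field; only PR_SET_MM_AUXV uses arg4 (the vector size in bytes). A hedged userspace sketch follows: the addresses are placeholders, the numeric fallbacks are only for headers that predate these options, and the caller needs CAP_SYS_RESOURCE on a CONFIG_CHECKPOINT_RESTORE kernel.

#include <sys/prctl.h>

#ifndef PR_SET_MM
# define PR_SET_MM		35
# define PR_SET_MM_START_CODE	1
# define PR_SET_MM_END_CODE	2
# define PR_SET_MM_AUXV		12
#endif

/* Restore a few mm fields from values saved at checkpoint time. */
static int restore_mm_fields(unsigned long start_code, unsigned long end_code,
			     unsigned long *auxv, unsigned long auxv_bytes)
{
	if (prctl(PR_SET_MM, PR_SET_MM_START_CODE, start_code, 0, 0))
		return -1;
	if (prctl(PR_SET_MM, PR_SET_MM_END_CODE, end_code, 0, 0))
		return -1;
	/* The AUXV option copies arg4 bytes and NULL-terminates the vector. */
	if (prctl(PR_SET_MM, PR_SET_MM_AUXV, (unsigned long)auxv, auxv_bytes, 0))
		return -1;
	return 0;
}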
@@ -1908,7 +2078,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1908 | error = prctl_get_seccomp(); | 2078 | error = prctl_get_seccomp(); |
1909 | break; | 2079 | break; |
1910 | case PR_SET_SECCOMP: | 2080 | case PR_SET_SECCOMP: |
1911 | error = prctl_set_seccomp(arg2); | 2081 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
1912 | break; | 2082 | break; |
1913 | case PR_GET_TSC: | 2083 | case PR_GET_TSC: |
1914 | error = GET_TSC_CTL(arg2); | 2084 | error = GET_TSC_CTL(arg2); |
@@ -1979,6 +2149,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1979 | error = put_user(me->signal->is_child_subreaper, | 2149 | error = put_user(me->signal->is_child_subreaper, |
1980 | (int __user *) arg2); | 2150 | (int __user *) arg2); |
1981 | break; | 2151 | break; |
2152 | case PR_SET_NO_NEW_PRIVS: | ||
2153 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
2154 | return -EINVAL; | ||
2155 | |||
2156 | current->no_new_privs = 1; | ||
2157 | break; | ||
2158 | case PR_GET_NO_NEW_PRIVS: | ||
2159 | if (arg2 || arg3 || arg4 || arg5) | ||
2160 | return -EINVAL; | ||
2161 | return current->no_new_privs ? 1 : 0; | ||
1982 | default: | 2162 | default: |
1983 | error = -EINVAL; | 2163 | error = -EINVAL; |
1984 | break; | 2164 | break; |
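The no_new_privs bit added here is deliberately one-way: PR_SET_NO_NEW_PRIVS only accepts arg2 == 1 with the remaining arguments zero, and PR_GET_NO_NEW_PRIVS reads it back. Typical userspace usage, for example before installing a seccomp filter, is sketched below (the constants match the values used by this series and may need to be defined by hand with older headers):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS	38
# define PR_GET_NO_NEW_PRIVS	39
#endif

int main(void)
{
	/* All unused arguments must be zero, as enforced above. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		perror("PR_SET_NO_NEW_PRIVS");
		return 1;
	}
	printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
	return 0;
}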
@@ -2022,7 +2202,6 @@ int orderly_poweroff(bool force) | |||
2022 | NULL | 2202 | NULL |
2023 | }; | 2203 | }; |
2024 | int ret = -ENOMEM; | 2204 | int ret = -ENOMEM; |
2025 | struct subprocess_info *info; | ||
2026 | 2205 | ||
2027 | if (argv == NULL) { | 2206 | if (argv == NULL) { |
2028 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | 2207 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", |
@@ -2030,18 +2209,16 @@ int orderly_poweroff(bool force) | |||
2030 | goto out; | 2209 | goto out; |
2031 | } | 2210 | } |
2032 | 2211 | ||
2033 | info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); | 2212 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, |
2034 | if (info == NULL) { | 2213 | NULL, argv_cleanup, NULL); |
2035 | argv_free(argv); | 2214 | out: |
2036 | goto out; | 2215 | if (likely(!ret)) |
2037 | } | 2216 | return 0; |
2038 | |||
2039 | call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); | ||
2040 | 2217 | ||
2041 | ret = call_usermodehelper_exec(info, UMH_NO_WAIT); | 2218 | if (ret == -ENOMEM) |
2219 | argv_free(argv); | ||
2042 | 2220 | ||
2043 | out: | 2221 | if (force) { |
2044 | if (ret && force) { | ||
2045 | printk(KERN_WARNING "Failed to start orderly shutdown: " | 2222 | printk(KERN_WARNING "Failed to start orderly shutdown: " |
2046 | "forcing the issue\n"); | 2223 | "forcing the issue\n"); |
2047 | 2224 | ||
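orderly_poweroff() now uses the single call_usermodehelper_fns() entry point instead of the setup/setfns/exec triple, passing a cleanup callback that frees the argv array; only on -ENOMEM, when the helper never took ownership, does the caller free it. A hedged in-kernel sketch of the same call shape (my_run_helper and my_argv_cleanup are illustrative, not part of the patch):

#include <linux/kmod.h>
#include <linux/string.h>
#include <linux/gfp.h>

static void my_argv_cleanup(struct subprocess_info *info)
{
	argv_free(info->argv);
}

/* Launch a userspace helper without waiting for it to finish. */
static int my_run_helper(const char *cmdline)
{
	static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	char **argv;
	int argc, ret;

	argv = argv_split(GFP_KERNEL, cmdline, &argc);
	if (!argv)
		return -ENOMEM;

	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
				      NULL, my_argv_cleanup, NULL);
	if (ret == -ENOMEM)
		argv_free(argv);	/* cleanup callback was never attached */
	return ret;
}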
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); | |||
203 | cond_syscall(sys_name_to_handle_at); | 203 | cond_syscall(sys_name_to_handle_at); |
204 | cond_syscall(sys_open_by_handle_at); | 204 | cond_syscall(sys_open_by_handle_at); |
205 | cond_syscall(compat_sys_open_by_handle_at); | 205 | cond_syscall(compat_sys_open_by_handle_at); |
206 | |||
207 | /* compare kernel pointers */ | ||
208 | cond_syscall(sys_kcmp); | ||
diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c | |||
@@ -0,0 +1,84 @@ | |||
1 | #include <linux/spinlock.h> | ||
2 | #include <linux/task_work.h> | ||
3 | #include <linux/tracehook.h> | ||
4 | |||
5 | int | ||
6 | task_work_add(struct task_struct *task, struct task_work *twork, bool notify) | ||
7 | { | ||
8 | unsigned long flags; | ||
9 | int err = -ESRCH; | ||
10 | |||
11 | #ifndef TIF_NOTIFY_RESUME | ||
12 | if (notify) | ||
13 | return -ENOTSUPP; | ||
14 | #endif | ||
15 | /* | ||
16 | * We must not insert the new work if the task has already passed | ||
17 | * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() | ||
18 | * and check PF_EXITING under pi_lock. | ||
19 | */ | ||
20 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
21 | if (likely(!(task->flags & PF_EXITING))) { | ||
22 | hlist_add_head(&twork->hlist, &task->task_works); | ||
23 | err = 0; | ||
24 | } | ||
25 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
26 | |||
27 | /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ | ||
28 | if (likely(!err) && notify) | ||
29 | set_notify_resume(task); | ||
30 | return err; | ||
31 | } | ||
32 | |||
33 | struct task_work * | ||
34 | task_work_cancel(struct task_struct *task, task_work_func_t func) | ||
35 | { | ||
36 | unsigned long flags; | ||
37 | struct task_work *twork; | ||
38 | struct hlist_node *pos; | ||
39 | |||
40 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
41 | hlist_for_each_entry(twork, pos, &task->task_works, hlist) { | ||
42 | if (twork->func == func) { | ||
43 | hlist_del(&twork->hlist); | ||
44 | goto found; | ||
45 | } | ||
46 | } | ||
47 | twork = NULL; | ||
48 | found: | ||
49 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
50 | |||
51 | return twork; | ||
52 | } | ||
53 | |||
54 | void task_work_run(void) | ||
55 | { | ||
56 | struct task_struct *task = current; | ||
57 | struct hlist_head task_works; | ||
58 | struct hlist_node *pos; | ||
59 | |||
60 | raw_spin_lock_irq(&task->pi_lock); | ||
61 | hlist_move_list(&task->task_works, &task_works); | ||
62 | raw_spin_unlock_irq(&task->pi_lock); | ||
63 | |||
64 | if (unlikely(hlist_empty(&task_works))) | ||
65 | return; | ||
66 | /* | ||
67 | * We use an hlist to save space in task_struct, but we want FIFO. | ||
68 | * Find the last entry (the list should be short), then process the | ||
69 | * entries in reverse order. | ||
70 | */ | ||
71 | for (pos = task_works.first; pos->next; pos = pos->next) | ||
72 | ; | ||
73 | |||
74 | for (;;) { | ||
75 | struct hlist_node **pprev = pos->pprev; | ||
76 | struct task_work *twork = container_of(pos, struct task_work, | ||
77 | hlist); | ||
78 | twork->func(twork); | ||
79 | |||
80 | if (pprev == &task_works.first) | ||
81 | break; | ||
82 | pos = container_of(pprev, struct hlist_node, next); | ||
83 | } | ||
84 | } | ||
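A user of the new task_work API embeds a struct task_work in its own object, fills in ->func, and calls task_work_add(); the callback then runs from task_work_run() in the context of the target task, in FIFO order as reconstructed above. A hedged sketch (my_ctx, my_func and my_queue_work are illustrative, not part of the patch):

#include <linux/task_work.h>
#include <linux/sched.h>
#include <linux/slab.h>

struct my_ctx {
	struct task_work twork;
	int payload;
};

/* Runs from task_work_run(), i.e. in the context of the target task. */
static void my_func(struct task_work *twork)
{
	struct my_ctx *ctx = container_of(twork, struct my_ctx, twork);

	pr_info("task_work payload %d for %s\n", ctx->payload, current->comm);
	kfree(ctx);
}

static int my_queue_work(struct task_struct *task)
{
	struct my_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	int err;

	if (!ctx)
		return -ENOMEM;
	ctx->payload = 42;
	ctx->twork.func = my_func;	/* the field task_work_run() invokes */

	/* notify=true sets TIF_NOTIFY_RESUME so the work runs soon. */
	err = task_work_add(task, &ctx->twork, true);
	if (err)
		kfree(ctx);		/* task was already exiting */
	return err;
}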
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a20dc8a3c949..fd42bd452b75 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -2,6 +2,55 @@ | |||
2 | # Timer subsystem related configuration options | 2 | # Timer subsystem related configuration options |
3 | # | 3 | # |
4 | 4 | ||
5 | # Options selectable by arch Kconfig | ||
6 | |||
7 | # Watchdog function for clocksources to detect instabilities | ||
8 | config CLOCKSOURCE_WATCHDOG | ||
9 | bool | ||
10 | |||
11 | # Architecture has extra clocksource data | ||
12 | config ARCH_CLOCKSOURCE_DATA | ||
13 | bool | ||
14 | |||
15 | # Timekeeping vsyscall support | ||
16 | config GENERIC_TIME_VSYSCALL | ||
17 | bool | ||
18 | |||
19 | # ktime_t scalar 64bit nsec representation | ||
20 | config KTIME_SCALAR | ||
21 | bool | ||
22 | |||
23 | # Old style timekeeping | ||
24 | config ARCH_USES_GETTIMEOFFSET | ||
25 | bool | ||
26 | |||
27 | # The generic clock events infrastructure | ||
28 | config GENERIC_CLOCKEVENTS | ||
29 | bool | ||
30 | |||
31 | # Migration helper. Builds, but does not invoke | ||
32 | config GENERIC_CLOCKEVENTS_BUILD | ||
33 | bool | ||
34 | default y | ||
35 | depends on GENERIC_CLOCKEVENTS | ||
36 | |||
37 | # Clockevents broadcasting infrastructure | ||
38 | config GENERIC_CLOCKEVENTS_BROADCAST | ||
39 | bool | ||
40 | depends on GENERIC_CLOCKEVENTS | ||
41 | |||
42 | # Automatically adjust the min. reprogramming time for | ||
43 | # clock event device | ||
44 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
45 | bool | ||
46 | |||
47 | # Generic update of CMOS clock | ||
48 | config GENERIC_CMOS_UPDATE | ||
49 | bool | ||
50 | |||
51 | if GENERIC_CLOCKEVENTS | ||
52 | menu "Timers subsystem" | ||
53 | |||
5 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is | 54 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is |
6 | # only related to the tick functionality. Oneshot clockevent devices | 55 | # only related to the tick functionality. Oneshot clockevent devices |
7 | # are supported independent of this. | 56 | # are supported independent of this. |
@@ -26,10 +75,5 @@ config HIGH_RES_TIMERS | |||
26 | hardware is not capable then this option only increases | 75 | hardware is not capable then this option only increases |
27 | the size of the kernel image. | 76 | the size of the kernel image. |
28 | 77 | ||
29 | config GENERIC_CLOCKEVENTS_BUILD | 78 | endmenu |
30 | bool | 79 | endif |
31 | default y | ||
32 | depends on GENERIC_CLOCKEVENTS | ||
33 | |||
34 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
35 | bool | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7b..aa27d391bfc8 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); | |||
59 | * If one has not already been chosen, it checks to see if a | 59 | * If one has not already been chosen, it checks to see if a |
60 | * functional rtc device is available. | 60 | * functional rtc device is available. |
61 | */ | 61 | */ |
62 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | struct rtc_device *alarmtimer_get_rtcdev(void) |
63 | { | 63 | { |
64 | unsigned long flags; | 64 | unsigned long flags; |
65 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) | |||
115 | class_interface_unregister(&alarmtimer_rtc_interface); | 115 | class_interface_unregister(&alarmtimer_rtc_interface); |
116 | } | 116 | } |
117 | #else | 117 | #else |
118 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) | 118 | struct rtc_device *alarmtimer_get_rtcdev(void) |
119 | { | 119 | { |
120 | return NULL; | 120 | return NULL; |
121 | } | 121 | } |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
297 | } | 297 | } |
298 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 298 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
299 | 299 | ||
300 | static void clockevents_config(struct clock_event_device *dev, | 300 | void clockevents_config(struct clock_event_device *dev, u32 freq) |
301 | u32 freq) | ||
302 | { | 301 | { |
303 | u64 sec; | 302 | u64 sec; |
304 | 303 | ||
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f03fd83b170b..70b33abcc7bb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -412,6 +412,7 @@ int second_overflow(unsigned long secs) | |||
412 | if (secs % 86400 == 0) { | 412 | if (secs % 86400 == 0) { |
413 | leap = -1; | 413 | leap = -1; |
414 | time_state = TIME_OOP; | 414 | time_state = TIME_OOP; |
415 | time_tai++; | ||
415 | printk(KERN_NOTICE | 416 | printk(KERN_NOTICE |
416 | "Clock: inserting leap second 23:59:60 UTC\n"); | 417 | "Clock: inserting leap second 23:59:60 UTC\n"); |
417 | } | 418 | } |
@@ -426,7 +427,6 @@ int second_overflow(unsigned long secs) | |||
426 | } | 427 | } |
427 | break; | 428 | break; |
428 | case TIME_OOP: | 429 | case TIME_OOP: |
429 | time_tai++; | ||
430 | time_state = TIME_WAIT; | 430 | time_state = TIME_WAIT; |
431 | break; | 431 | break; |
432 | 432 | ||
@@ -473,8 +473,6 @@ int second_overflow(unsigned long secs) | |||
473 | << NTP_SCALE_SHIFT; | 473 | << NTP_SCALE_SHIFT; |
474 | time_adjust = 0; | 474 | time_adjust = 0; |
475 | 475 | ||
476 | |||
477 | |||
478 | out: | 476 | out: |
479 | spin_unlock_irqrestore(&ntp_lock, flags); | 477 | spin_unlock_irqrestore(&ntp_lock, flags); |
480 | 478 | ||
@@ -559,10 +557,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
559 | /* only set allowed bits */ | 557 | /* only set allowed bits */ |
560 | time_status &= STA_RONLY; | 558 | time_status &= STA_RONLY; |
561 | time_status |= txc->status & ~STA_RONLY; | 559 | time_status |= txc->status & ~STA_RONLY; |
562 | |||
563 | } | 560 | } |
561 | |||
564 | /* | 562 | /* |
565 | * Called with the xtime lock held, so we can access and modify | 563 | * Called with ntp_lock held, so we can access and modify |
566 | * all the global NTP state: | 564 | * all the global NTP state: |
567 | */ | 565 | */ |
568 | static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) | 566 | static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..da70c6db496c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) | |||
576 | /* Update jiffies first */ | 576 | /* Update jiffies first */ |
577 | select_nohz_load_balancer(0); | 577 | select_nohz_load_balancer(0); |
578 | tick_do_update_jiffies64(now); | 578 | tick_do_update_jiffies64(now); |
579 | update_cpu_load_nohz(); | ||
579 | 580 | ||
580 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 581 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
581 | /* | 582 | /* |
@@ -814,6 +815,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
814 | return HRTIMER_RESTART; | 815 | return HRTIMER_RESTART; |
815 | } | 816 | } |
816 | 817 | ||
818 | static int sched_skew_tick; | ||
819 | |||
820 | static int __init skew_tick(char *str) | ||
821 | { | ||
822 | get_option(&str, &sched_skew_tick); | ||
823 | |||
824 | return 0; | ||
825 | } | ||
826 | early_param("skew_tick", skew_tick); | ||
827 | |||
817 | /** | 828 | /** |
818 | * tick_setup_sched_timer - setup the tick emulation timer | 829 | * tick_setup_sched_timer - setup the tick emulation timer |
819 | */ | 830 | */ |
@@ -831,6 +842,14 @@ void tick_setup_sched_timer(void) | |||
831 | /* Get the next period (per cpu) */ | 842 | /* Get the next period (per cpu) */ |
832 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 843 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
833 | 844 | ||
845 | /* Offset the tick to avert xtime_lock contention. */ | ||
846 | if (sched_skew_tick) { | ||
847 | u64 offset = ktime_to_ns(tick_period) >> 1; | ||
848 | do_div(offset, num_possible_cpus()); | ||
849 | offset *= smp_processor_id(); | ||
850 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
851 | } | ||
852 | |||
834 | for (;;) { | 853 | for (;;) { |
835 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 854 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
836 | hrtimer_start_expires(&ts->sched_timer, | 855 | hrtimer_start_expires(&ts->sched_timer, |
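
The skew_tick= boot parameter above staggers each CPU's first tick expiry across half a tick period so that large SMP machines do not all contend on xtime_lock at the same jiffy; booting with skew_tick=1 (any non-zero value parsed by get_option) enables it. A minimal user-space sketch of the same per-CPU offset arithmetic, assuming an illustrative 1000 Hz tick and 8 possible CPUs (none of this is kernel code):

#include <stdio.h>
#include <stdint.h>

/* Illustrative values only: 1000 Hz tick -> 1 ms period, 8 possible CPUs. */
#define TICK_PERIOD_NS	1000000ULL
#define NR_CPUS		8

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		/* Same arithmetic as the patch: spread CPUs over half a period. */
		uint64_t offset = (TICK_PERIOD_NS >> 1) / NR_CPUS * cpu;

		printf("cpu%d: first tick offset %llu ns\n",
		       cpu, (unsigned long long)offset);
	}
	return 0;
}
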
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d66b21308f7c..6e46cacf5969 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -240,7 +240,6 @@ void getnstimeofday(struct timespec *ts) | |||
240 | 240 | ||
241 | timespec_add_ns(ts, nsecs); | 241 | timespec_add_ns(ts, nsecs); |
242 | } | 242 | } |
243 | |||
244 | EXPORT_SYMBOL(getnstimeofday); | 243 | EXPORT_SYMBOL(getnstimeofday); |
245 | 244 | ||
246 | ktime_t ktime_get(void) | 245 | ktime_t ktime_get(void) |
@@ -357,8 +356,8 @@ void do_gettimeofday(struct timeval *tv) | |||
357 | tv->tv_sec = now.tv_sec; | 356 | tv->tv_sec = now.tv_sec; |
358 | tv->tv_usec = now.tv_nsec/1000; | 357 | tv->tv_usec = now.tv_nsec/1000; |
359 | } | 358 | } |
360 | |||
361 | EXPORT_SYMBOL(do_gettimeofday); | 359 | EXPORT_SYMBOL(do_gettimeofday); |
360 | |||
362 | /** | 361 | /** |
363 | * do_settimeofday - Sets the time of day | 362 | * do_settimeofday - Sets the time of day |
364 | * @tv: pointer to the timespec variable containing the new time | 363 | * @tv: pointer to the timespec variable containing the new time |
@@ -392,7 +391,6 @@ int do_settimeofday(const struct timespec *tv) | |||
392 | 391 | ||
393 | return 0; | 392 | return 0; |
394 | } | 393 | } |
395 | |||
396 | EXPORT_SYMBOL(do_settimeofday); | 394 | EXPORT_SYMBOL(do_settimeofday); |
397 | 395 | ||
398 | 396 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888e..6ec7e7e0db43 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); | |||
861 | * | 861 | * |
862 | * mod_timer_pinned() is a way to update the expire field of an | 862 | * mod_timer_pinned() is a way to update the expire field of an |
863 | * active timer (if the timer is inactive it will be activated) | 863 | * active timer (if the timer is inactive it will be activated) |
864 | * and not allow the timer to be migrated to a different CPU. | 864 | * and to ensure that the timer is scheduled on the current CPU. |
865 | * | ||
866 | * Note that this does not prevent the timer from being migrated | ||
867 | * when the current CPU goes offline. If this is a problem for | ||
868 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
869 | * example, cancelling the timer when the corresponding CPU goes | ||
870 | * offline. | ||
865 | * | 871 | * |
866 | * mod_timer_pinned(timer, expires) is equivalent to: | 872 | * mod_timer_pinned(timer, expires) is equivalent to: |
867 | * | 873 | * |
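
The expanded mod_timer_pinned() comment above points out that pinning only controls where the timer is queued; if that CPU later goes offline the timer still migrates, and the suggested remedy is a CPU-hotplug notifier that cancels it first. A hedged sketch of that pattern against the notifier API of this kernel generation; the per-CPU timer, its name and the surrounding driver are hypothetical:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/timer.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct timer_list, my_pinned_timer);	/* hypothetical */

static int my_timer_cpu_notify(struct notifier_block *nb,
			       unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* Cancel the pinned timer before its CPU goes away. */
		del_timer_sync(&per_cpu(my_pinned_timer, cpu));
		break;
	case CPU_DOWN_FAILED:
		/* Offline attempt aborted: re-arm on that CPU if still needed. */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_timer_cpu_nb = {
	.notifier_call = my_timer_cpu_notify,
};

/* somewhere in init code: register_cpu_notifier(&my_timer_cpu_nb); */
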
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1102 | * warnings as well as problems when looking into | 1108 | * warnings as well as problems when looking into |
1103 | * timer->lockdep_map, make a copy and use that here. | 1109 | * timer->lockdep_map, make a copy and use that here. |
1104 | */ | 1110 | */ |
1105 | struct lockdep_map lockdep_map = timer->lockdep_map; | 1111 | struct lockdep_map lockdep_map; |
1112 | |||
1113 | lockdep_copy_map(&lockdep_map, &timer->lockdep_map); | ||
1106 | #endif | 1114 | #endif |
1107 | /* | 1115 | /* |
1108 | * Couple the lock chain with the lock chain at | 1116 | * Couple the lock chain with the lock chain at |
@@ -1427,25 +1435,25 @@ SYSCALL_DEFINE0(getppid) | |||
1427 | SYSCALL_DEFINE0(getuid) | 1435 | SYSCALL_DEFINE0(getuid) |
1428 | { | 1436 | { |
1429 | /* Only we change this so SMP safe */ | 1437 | /* Only we change this so SMP safe */ |
1430 | return current_uid(); | 1438 | return from_kuid_munged(current_user_ns(), current_uid()); |
1431 | } | 1439 | } |
1432 | 1440 | ||
1433 | SYSCALL_DEFINE0(geteuid) | 1441 | SYSCALL_DEFINE0(geteuid) |
1434 | { | 1442 | { |
1435 | /* Only we change this so SMP safe */ | 1443 | /* Only we change this so SMP safe */ |
1436 | return current_euid(); | 1444 | return from_kuid_munged(current_user_ns(), current_euid()); |
1437 | } | 1445 | } |
1438 | 1446 | ||
1439 | SYSCALL_DEFINE0(getgid) | 1447 | SYSCALL_DEFINE0(getgid) |
1440 | { | 1448 | { |
1441 | /* Only we change this so SMP safe */ | 1449 | /* Only we change this so SMP safe */ |
1442 | return current_gid(); | 1450 | return from_kgid_munged(current_user_ns(), current_gid()); |
1443 | } | 1451 | } |
1444 | 1452 | ||
1445 | SYSCALL_DEFINE0(getegid) | 1453 | SYSCALL_DEFINE0(getegid) |
1446 | { | 1454 | { |
1447 | /* Only we change this so SMP safe */ | 1455 | /* Only we change this so SMP safe */ |
1448 | return current_egid(); | 1456 | return from_kgid_munged(current_user_ns(), current_egid()); |
1449 | } | 1457 | } |
1450 | 1458 | ||
1451 | #endif | 1459 | #endif |
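
The getuid()/geteuid()/getgid()/getegid() conversions above stop returning raw credential values and instead translate the kernel-internal kuid_t/kgid_t through the caller's user namespace. A short sketch of the same translation applied to an arbitrary kuid_t; the helper name is made up for illustration:

#include <linux/uidgid.h>
#include <linux/cred.h>

/* Hypothetical helper: report an arbitrary kuid_t to the current caller. */
static uid_t uid_for_userspace(kuid_t kuid)
{
	/* "munged" means unmappable IDs collapse to the overflow uid. */
	return from_kuid_munged(current_user_ns(), kuid);
}
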
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..8c4c07071cc5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,6 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE | ||
145 | select KALLSYMS | 144 | select KALLSYMS |
146 | select GENERIC_TRACER | 145 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 146 | select CONTEXT_SWITCH_TRACER |
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
272 | bool "Trace likely/unlikely profiler" | 271 | bool "Trace likely/unlikely profiler" |
273 | select TRACE_BRANCH_PROFILING | 272 | select TRACE_BRANCH_PROFILING |
274 | help | 273 | help |
275 | This tracer profiles all the the likely and unlikely macros | 274 | This tracer profiles all likely and unlikely macros |
276 | in the kernel. It will display the results in: | 275 | in the kernel. It will display the results in: |
277 | 276 | ||
278 | /sys/kernel/debug/tracing/trace_stat/branch_annotated | 277 | /sys/kernel/debug/tracing/trace_stat/branch_annotated |
@@ -373,6 +372,7 @@ config KPROBE_EVENT | |||
373 | depends on HAVE_REGS_AND_STACK_ACCESS_API | 372 | depends on HAVE_REGS_AND_STACK_ACCESS_API |
374 | bool "Enable kprobes-based dynamic events" | 373 | bool "Enable kprobes-based dynamic events" |
375 | select TRACING | 374 | select TRACING |
375 | select PROBE_EVENTS | ||
376 | default y | 376 | default y |
377 | help | 377 | help |
378 | This allows the user to add tracing events (similar to tracepoints) | 378 | This allows the user to add tracing events (similar to tracepoints) |
@@ -385,6 +385,25 @@ config KPROBE_EVENT | |||
385 | This option is also required by perf-probe subcommand of perf tools. | 385 | This option is also required by perf-probe subcommand of perf tools. |
386 | If you want to use perf tools, this option is strongly recommended. | 386 | If you want to use perf tools, this option is strongly recommended. |
387 | 387 | ||
388 | config UPROBE_EVENT | ||
389 | bool "Enable uprobes-based dynamic events" | ||
390 | depends on ARCH_SUPPORTS_UPROBES | ||
391 | depends on MMU | ||
392 | select UPROBES | ||
393 | select PROBE_EVENTS | ||
394 | select TRACING | ||
395 | default n | ||
396 | help | ||
397 | This allows the user to add tracing events on top of userspace | ||
398 | dynamic events (similar to tracepoints) on the fly via the trace | ||
399 | events interface. Those events can be inserted wherever uprobes | ||
400 | can probe, and record various registers. | ||
401 | This option is required if you plan to use perf-probe subcommand | ||
402 | of perf tools on user space applications. | ||
403 | |||
404 | config PROBE_EVENTS | ||
405 | def_bool n | ||
406 | |||
388 | config DYNAMIC_FTRACE | 407 | config DYNAMIC_FTRACE |
389 | bool "enable/disable ftrace tracepoints dynamically" | 408 | bool "enable/disable ftrace tracepoints dynamically" |
390 | depends on FUNCTION_TRACER | 409 | depends on FUNCTION_TRACER |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5ea..b831087c8200 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o | |||
41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | ||
45 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 44 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
46 | ifeq ($(CONFIG_BLOCK),y) | 45 | ifeq ($(CONFIG_BLOCK),y) |
47 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o | 46 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o |
@@ -61,5 +60,7 @@ endif | |||
61 | ifeq ($(CONFIG_TRACING),y) | 60 | ifeq ($(CONFIG_TRACING),y) |
62 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 61 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
63 | endif | 62 | endif |
63 | obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o | ||
64 | obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o | ||
64 | 65 | ||
65 | libftrace-y := ftrace.o | 66 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c92..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1383 | 1383 | ||
1384 | static int ftrace_cmp_recs(const void *a, const void *b) | 1384 | static int ftrace_cmp_recs(const void *a, const void *b) |
1385 | { | 1385 | { |
1386 | const struct dyn_ftrace *reca = a; | 1386 | const struct dyn_ftrace *key = a; |
1387 | const struct dyn_ftrace *recb = b; | 1387 | const struct dyn_ftrace *rec = b; |
1388 | 1388 | ||
1389 | if (reca->ip > recb->ip) | 1389 | if (key->flags < rec->ip) |
1390 | return 1; | ||
1391 | if (reca->ip < recb->ip) | ||
1392 | return -1; | 1390 | return -1; |
1391 | if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) | ||
1392 | return 1; | ||
1393 | return 0; | 1393 | return 0; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | /** | 1396 | static unsigned long ftrace_location_range(unsigned long start, unsigned long end) |
1397 | * ftrace_location - return true if the ip giving is a traced location | ||
1398 | * @ip: the instruction pointer to check | ||
1399 | * | ||
1400 | * Returns 1 if @ip given is a pointer to a ftrace location. | ||
1401 | * That is, the instruction that is either a NOP or call to | ||
1402 | * the function tracer. It checks the ftrace internal tables to | ||
1403 | * determine if the address belongs or not. | ||
1404 | */ | ||
1405 | int ftrace_location(unsigned long ip) | ||
1406 | { | 1397 | { |
1407 | struct ftrace_page *pg; | 1398 | struct ftrace_page *pg; |
1408 | struct dyn_ftrace *rec; | 1399 | struct dyn_ftrace *rec; |
1409 | struct dyn_ftrace key; | 1400 | struct dyn_ftrace key; |
1410 | 1401 | ||
1411 | key.ip = ip; | 1402 | key.ip = start; |
1403 | key.flags = end; /* overload flags, as it is unsigned long */ | ||
1412 | 1404 | ||
1413 | for (pg = ftrace_pages_start; pg; pg = pg->next) { | 1405 | for (pg = ftrace_pages_start; pg; pg = pg->next) { |
1406 | if (end < pg->records[0].ip || | ||
1407 | start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) | ||
1408 | continue; | ||
1414 | rec = bsearch(&key, pg->records, pg->index, | 1409 | rec = bsearch(&key, pg->records, pg->index, |
1415 | sizeof(struct dyn_ftrace), | 1410 | sizeof(struct dyn_ftrace), |
1416 | ftrace_cmp_recs); | 1411 | ftrace_cmp_recs); |
1417 | if (rec) | 1412 | if (rec) |
1418 | return 1; | 1413 | return rec->ip; |
1419 | } | 1414 | } |
1420 | 1415 | ||
1421 | return 0; | 1416 | return 0; |
1422 | } | 1417 | } |
1423 | 1418 | ||
1419 | /** | ||
1420 | * ftrace_location - return true if the ip given is a traced location | ||
1421 | * @ip: the instruction pointer to check | ||
1422 | * | ||
1423 | * Returns rec->ip if @ip given is a pointer to a ftrace location. | ||
1424 | * That is, the instruction that is either a NOP or call to | ||
1425 | * the function tracer. It checks the ftrace internal tables to | ||
1426 | * determine if the address belongs or not. | ||
1427 | */ | ||
1428 | unsigned long ftrace_location(unsigned long ip) | ||
1429 | { | ||
1430 | return ftrace_location_range(ip, ip); | ||
1431 | } | ||
1432 | |||
1433 | /** | ||
1434 | * ftrace_text_reserved - return true if range contains an ftrace location | ||
1435 | * @start: start of range to search | ||
1436 | * @end: end of range to search (inclusive). @end points to the last byte to check. | ||
1437 | * | ||
1438 | * Returns 1 if @start and @end contain a ftrace location. | ||
1439 | * That is, the instruction that is either a NOP or call to | ||
1440 | * the function tracer. It checks the ftrace internal tables to | ||
1441 | * determine if the address belongs or not. | ||
1442 | */ | ||
1443 | int ftrace_text_reserved(void *start, void *end) | ||
1444 | { | ||
1445 | unsigned long ret; | ||
1446 | |||
1447 | ret = ftrace_location_range((unsigned long)start, | ||
1448 | (unsigned long)end); | ||
1449 | |||
1450 | return (int)!!ret; | ||
1451 | } | ||
1452 | |||
1424 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1453 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1425 | int filter_hash, | 1454 | int filter_hash, |
1426 | bool inc) | 1455 | bool inc) |
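
In the rewritten ftrace_cmp_recs() the search key doubles as a range: key->ip is the start and key->flags is overloaded to carry the end, so a single bsearch() reports whether any mcount site (each covering MCOUNT_INSN_SIZE bytes) overlaps [start, end]. A small user-space sketch of that range-key comparator, with a simplified record layout and a stand-in instruction size:

#include <stdio.h>
#include <stdlib.h>

#define INSN_SIZE 5UL	/* stand-in for MCOUNT_INSN_SIZE */

struct rec { unsigned long ip; unsigned long flags; };

/* a = key (ip..flags is the range), b = table record covering INSN_SIZE bytes */
static int cmp_recs(const void *a, const void *b)
{
	const struct rec *key = a;
	const struct rec *rec = b;

	if (key->flags < rec->ip)		/* range ends before the record */
		return -1;
	if (key->ip >= rec->ip + INSN_SIZE)	/* range starts after the record */
		return 1;
	return 0;				/* overlap: treat as a match */
}

int main(void)
{
	struct rec table[] = { {0x100}, {0x140}, {0x180} };	/* sorted by ip */
	struct rec key = { .ip = 0x142, .flags = 0x150 };	/* search [0x142, 0x150] */
	struct rec *hit = bsearch(&key, table, 3, sizeof(table[0]), cmp_recs);

	printf("hit: %s\n", hit ? "yes" : "no");	/* overlaps 0x140..0x144 */
	return 0;
}
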
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
1520 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1549 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
1521 | } | 1550 | } |
1522 | 1551 | ||
1523 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) | ||
1524 | { | ||
1525 | if (ftrace_pages->index == ftrace_pages->size) { | ||
1526 | /* We should have allocated enough */ | ||
1527 | if (WARN_ON(!ftrace_pages->next)) | ||
1528 | return NULL; | ||
1529 | ftrace_pages = ftrace_pages->next; | ||
1530 | } | ||
1531 | |||
1532 | return &ftrace_pages->records[ftrace_pages->index++]; | ||
1533 | } | ||
1534 | |||
1535 | static struct dyn_ftrace * | ||
1536 | ftrace_record_ip(unsigned long ip) | ||
1537 | { | ||
1538 | struct dyn_ftrace *rec; | ||
1539 | |||
1540 | if (ftrace_disabled) | ||
1541 | return NULL; | ||
1542 | |||
1543 | rec = ftrace_alloc_dyn_node(ip); | ||
1544 | if (!rec) | ||
1545 | return NULL; | ||
1546 | |||
1547 | rec->ip = ip; | ||
1548 | |||
1549 | return rec; | ||
1550 | } | ||
1551 | |||
1552 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1552 | static void print_ip_ins(const char *fmt, unsigned char *p) |
1553 | { | 1553 | { |
1554 | int i; | 1554 | int i; |
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) | |||
1598 | } | 1598 | } |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | |||
1602 | /* Return 1 if the address range is reserved for ftrace */ | ||
1603 | int ftrace_text_reserved(void *start, void *end) | ||
1604 | { | ||
1605 | struct dyn_ftrace *rec; | ||
1606 | struct ftrace_page *pg; | ||
1607 | |||
1608 | do_for_each_ftrace_rec(pg, rec) { | ||
1609 | if (rec->ip <= (unsigned long)end && | ||
1610 | rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) | ||
1611 | return 1; | ||
1612 | } while_for_each_ftrace_rec(); | ||
1613 | return 0; | ||
1614 | } | ||
1615 | |||
1616 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1601 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
1617 | { | 1602 | { |
1618 | unsigned long flag = 0UL; | 1603 | unsigned long flag = 0UL; |
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1698 | return -1; /* unknown ftrace bug */ | 1683 | return -1; /* unknown ftrace bug */ |
1699 | } | 1684 | } |
1700 | 1685 | ||
1701 | static void ftrace_replace_code(int update) | 1686 | void __weak ftrace_replace_code(int enable) |
1702 | { | 1687 | { |
1703 | struct dyn_ftrace *rec; | 1688 | struct dyn_ftrace *rec; |
1704 | struct ftrace_page *pg; | 1689 | struct ftrace_page *pg; |
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) | |||
1708 | return; | 1693 | return; |
1709 | 1694 | ||
1710 | do_for_each_ftrace_rec(pg, rec) { | 1695 | do_for_each_ftrace_rec(pg, rec) { |
1711 | failed = __ftrace_replace_code(rec, update); | 1696 | failed = __ftrace_replace_code(rec, enable); |
1712 | if (failed) { | 1697 | if (failed) { |
1713 | ftrace_bug(failed, rec->ip); | 1698 | ftrace_bug(failed, rec->ip); |
1714 | /* Stop processing */ | 1699 | /* Stop processing */ |
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
1826 | return 0; | 1811 | return 0; |
1827 | } | 1812 | } |
1828 | 1813 | ||
1829 | static int __ftrace_modify_code(void *data) | 1814 | void ftrace_modify_all_code(int command) |
1830 | { | 1815 | { |
1831 | int *command = data; | 1816 | if (command & FTRACE_UPDATE_CALLS) |
1832 | |||
1833 | if (*command & FTRACE_UPDATE_CALLS) | ||
1834 | ftrace_replace_code(1); | 1817 | ftrace_replace_code(1); |
1835 | else if (*command & FTRACE_DISABLE_CALLS) | 1818 | else if (command & FTRACE_DISABLE_CALLS) |
1836 | ftrace_replace_code(0); | 1819 | ftrace_replace_code(0); |
1837 | 1820 | ||
1838 | if (*command & FTRACE_UPDATE_TRACE_FUNC) | 1821 | if (command & FTRACE_UPDATE_TRACE_FUNC) |
1839 | ftrace_update_ftrace_func(ftrace_trace_function); | 1822 | ftrace_update_ftrace_func(ftrace_trace_function); |
1840 | 1823 | ||
1841 | if (*command & FTRACE_START_FUNC_RET) | 1824 | if (command & FTRACE_START_FUNC_RET) |
1842 | ftrace_enable_ftrace_graph_caller(); | 1825 | ftrace_enable_ftrace_graph_caller(); |
1843 | else if (*command & FTRACE_STOP_FUNC_RET) | 1826 | else if (command & FTRACE_STOP_FUNC_RET) |
1844 | ftrace_disable_ftrace_graph_caller(); | 1827 | ftrace_disable_ftrace_graph_caller(); |
1828 | } | ||
1829 | |||
1830 | static int __ftrace_modify_code(void *data) | ||
1831 | { | ||
1832 | int *command = data; | ||
1833 | |||
1834 | ftrace_modify_all_code(*command); | ||
1845 | 1835 | ||
1846 | return 0; | 1836 | return 0; |
1847 | } | 1837 | } |
@@ -2469,57 +2459,35 @@ static int | |||
2469 | ftrace_avail_open(struct inode *inode, struct file *file) | 2459 | ftrace_avail_open(struct inode *inode, struct file *file) |
2470 | { | 2460 | { |
2471 | struct ftrace_iterator *iter; | 2461 | struct ftrace_iterator *iter; |
2472 | int ret; | ||
2473 | 2462 | ||
2474 | if (unlikely(ftrace_disabled)) | 2463 | if (unlikely(ftrace_disabled)) |
2475 | return -ENODEV; | 2464 | return -ENODEV; |
2476 | 2465 | ||
2477 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2466 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2478 | if (!iter) | 2467 | if (iter) { |
2479 | return -ENOMEM; | 2468 | iter->pg = ftrace_pages_start; |
2480 | 2469 | iter->ops = &global_ops; | |
2481 | iter->pg = ftrace_pages_start; | ||
2482 | iter->ops = &global_ops; | ||
2483 | |||
2484 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2485 | if (!ret) { | ||
2486 | struct seq_file *m = file->private_data; | ||
2487 | |||
2488 | m->private = iter; | ||
2489 | } else { | ||
2490 | kfree(iter); | ||
2491 | } | 2470 | } |
2492 | 2471 | ||
2493 | return ret; | 2472 | return iter ? 0 : -ENOMEM; |
2494 | } | 2473 | } |
2495 | 2474 | ||
2496 | static int | 2475 | static int |
2497 | ftrace_enabled_open(struct inode *inode, struct file *file) | 2476 | ftrace_enabled_open(struct inode *inode, struct file *file) |
2498 | { | 2477 | { |
2499 | struct ftrace_iterator *iter; | 2478 | struct ftrace_iterator *iter; |
2500 | int ret; | ||
2501 | 2479 | ||
2502 | if (unlikely(ftrace_disabled)) | 2480 | if (unlikely(ftrace_disabled)) |
2503 | return -ENODEV; | 2481 | return -ENODEV; |
2504 | 2482 | ||
2505 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2483 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2506 | if (!iter) | 2484 | if (iter) { |
2507 | return -ENOMEM; | 2485 | iter->pg = ftrace_pages_start; |
2508 | 2486 | iter->flags = FTRACE_ITER_ENABLED; | |
2509 | iter->pg = ftrace_pages_start; | 2487 | iter->ops = &global_ops; |
2510 | iter->flags = FTRACE_ITER_ENABLED; | ||
2511 | iter->ops = &global_ops; | ||
2512 | |||
2513 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2514 | if (!ret) { | ||
2515 | struct seq_file *m = file->private_data; | ||
2516 | |||
2517 | m->private = iter; | ||
2518 | } else { | ||
2519 | kfree(iter); | ||
2520 | } | 2488 | } |
2521 | 2489 | ||
2522 | return ret; | 2490 | return iter ? 0 : -ENOMEM; |
2523 | } | 2491 | } |
2524 | 2492 | ||
2525 | static void ftrace_filter_reset(struct ftrace_hash *hash) | 2493 | static void ftrace_filter_reset(struct ftrace_hash *hash) |
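
Both open handlers above shrink because __seq_open_private() bundles the iterator allocation, the seq_open() call and the m->private assignment, and cleans up itself on failure. A hedged sketch of the general pattern for a seq_file open method; the iterator struct and the seq_operations it references are placeholders:

#include <linux/seq_file.h>
#include <linux/fs.h>

struct my_iter {				/* placeholder private state */
	int pos;
};

extern const struct seq_operations my_seq_ops;	/* assumed to exist elsewhere */

static int my_open(struct inode *inode, struct file *file)
{
	struct my_iter *iter;

	/* Allocates the iterator, calls seq_open() and sets m->private. */
	iter = __seq_open_private(file, &my_seq_ops, sizeof(*iter));
	if (!iter)
		return -ENOMEM;

	iter->pos = 0;			/* any extra per-open setup goes here */
	return 0;
}

/* The release side pairs with seq_release_private(), which frees m->private. */
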
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3688 | return 0; | 3656 | return 0; |
3689 | } | 3657 | } |
3690 | 3658 | ||
3691 | static void ftrace_swap_recs(void *a, void *b, int size) | 3659 | static int ftrace_cmp_ips(const void *a, const void *b) |
3660 | { | ||
3661 | const unsigned long *ipa = a; | ||
3662 | const unsigned long *ipb = b; | ||
3663 | |||
3664 | if (*ipa > *ipb) | ||
3665 | return 1; | ||
3666 | if (*ipa < *ipb) | ||
3667 | return -1; | ||
3668 | return 0; | ||
3669 | } | ||
3670 | |||
3671 | static void ftrace_swap_ips(void *a, void *b, int size) | ||
3692 | { | 3672 | { |
3693 | struct dyn_ftrace *reca = a; | 3673 | unsigned long *ipa = a; |
3694 | struct dyn_ftrace *recb = b; | 3674 | unsigned long *ipb = b; |
3695 | struct dyn_ftrace t; | 3675 | unsigned long t; |
3696 | 3676 | ||
3697 | t = *reca; | 3677 | t = *ipa; |
3698 | *reca = *recb; | 3678 | *ipa = *ipb; |
3699 | *recb = t; | 3679 | *ipb = t; |
3700 | } | 3680 | } |
3701 | 3681 | ||
3702 | static int ftrace_process_locs(struct module *mod, | 3682 | static int ftrace_process_locs(struct module *mod, |
3703 | unsigned long *start, | 3683 | unsigned long *start, |
3704 | unsigned long *end) | 3684 | unsigned long *end) |
3705 | { | 3685 | { |
3686 | struct ftrace_page *start_pg; | ||
3706 | struct ftrace_page *pg; | 3687 | struct ftrace_page *pg; |
3688 | struct dyn_ftrace *rec; | ||
3707 | unsigned long count; | 3689 | unsigned long count; |
3708 | unsigned long *p; | 3690 | unsigned long *p; |
3709 | unsigned long addr; | 3691 | unsigned long addr; |
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3715 | if (!count) | 3697 | if (!count) |
3716 | return 0; | 3698 | return 0; |
3717 | 3699 | ||
3718 | pg = ftrace_allocate_pages(count); | 3700 | sort(start, count, sizeof(*start), |
3719 | if (!pg) | 3701 | ftrace_cmp_ips, ftrace_swap_ips); |
3702 | |||
3703 | start_pg = ftrace_allocate_pages(count); | ||
3704 | if (!start_pg) | ||
3720 | return -ENOMEM; | 3705 | return -ENOMEM; |
3721 | 3706 | ||
3722 | mutex_lock(&ftrace_lock); | 3707 | mutex_lock(&ftrace_lock); |
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, | |||
3729 | if (!mod) { | 3714 | if (!mod) { |
3730 | WARN_ON(ftrace_pages || ftrace_pages_start); | 3715 | WARN_ON(ftrace_pages || ftrace_pages_start); |
3731 | /* First initialization */ | 3716 | /* First initialization */ |
3732 | ftrace_pages = ftrace_pages_start = pg; | 3717 | ftrace_pages = ftrace_pages_start = start_pg; |
3733 | } else { | 3718 | } else { |
3734 | if (!ftrace_pages) | 3719 | if (!ftrace_pages) |
3735 | goto out; | 3720 | goto out; |
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3740 | ftrace_pages = ftrace_pages->next; | 3725 | ftrace_pages = ftrace_pages->next; |
3741 | } | 3726 | } |
3742 | 3727 | ||
3743 | ftrace_pages->next = pg; | 3728 | ftrace_pages->next = start_pg; |
3744 | ftrace_pages = pg; | ||
3745 | } | 3729 | } |
3746 | 3730 | ||
3747 | p = start; | 3731 | p = start; |
3732 | pg = start_pg; | ||
3748 | while (p < end) { | 3733 | while (p < end) { |
3749 | addr = ftrace_call_adjust(*p++); | 3734 | addr = ftrace_call_adjust(*p++); |
3750 | /* | 3735 | /* |
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, | |||
3755 | */ | 3740 | */ |
3756 | if (!addr) | 3741 | if (!addr) |
3757 | continue; | 3742 | continue; |
3758 | if (!ftrace_record_ip(addr)) | 3743 | |
3759 | break; | 3744 | if (pg->index == pg->size) { |
3745 | /* We should have allocated enough */ | ||
3746 | if (WARN_ON(!pg->next)) | ||
3747 | break; | ||
3748 | pg = pg->next; | ||
3749 | } | ||
3750 | |||
3751 | rec = &pg->records[pg->index++]; | ||
3752 | rec->ip = addr; | ||
3760 | } | 3753 | } |
3761 | 3754 | ||
3762 | /* These new locations need to be initialized */ | 3755 | /* We should have used all pages */ |
3763 | ftrace_new_pgs = pg; | 3756 | WARN_ON(pg->next); |
3757 | |||
3758 | /* Assign the last page to ftrace_pages */ | ||
3759 | ftrace_pages = pg; | ||
3764 | 3760 | ||
3765 | /* Make each individual set of pages sorted by ips */ | 3761 | /* These new locations need to be initialized */ |
3766 | for (; pg; pg = pg->next) | 3762 | ftrace_new_pgs = start_pg; |
3767 | sort(pg->records, pg->index, sizeof(struct dyn_ftrace), | ||
3768 | ftrace_cmp_recs, ftrace_swap_recs); | ||
3769 | 3763 | ||
3770 | /* | 3764 | /* |
3771 | * We only need to disable interrupts on start up | 3765 | * We only need to disable interrupts on start up |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91efd..1d0f6a8a0e5e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -23,6 +23,8 @@ | |||
23 | #include <asm/local.h> | 23 | #include <asm/local.h> |
24 | #include "trace.h" | 24 | #include "trace.h" |
25 | 25 | ||
26 | static void update_pages_handler(struct work_struct *work); | ||
27 | |||
26 | /* | 28 | /* |
27 | * The ring buffer header is special. We must manually up keep it. | 29 | * The ring buffer header is special. We must manually up keep it. |
28 | */ | 30 | */ |
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu { | |||
449 | raw_spinlock_t reader_lock; /* serialize readers */ | 451 | raw_spinlock_t reader_lock; /* serialize readers */ |
450 | arch_spinlock_t lock; | 452 | arch_spinlock_t lock; |
451 | struct lock_class_key lock_key; | 453 | struct lock_class_key lock_key; |
454 | unsigned int nr_pages; | ||
452 | struct list_head *pages; | 455 | struct list_head *pages; |
453 | struct buffer_page *head_page; /* read from head */ | 456 | struct buffer_page *head_page; /* read from head */ |
454 | struct buffer_page *tail_page; /* write to tail */ | 457 | struct buffer_page *tail_page; /* write to tail */ |
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu { | |||
466 | unsigned long read_bytes; | 469 | unsigned long read_bytes; |
467 | u64 write_stamp; | 470 | u64 write_stamp; |
468 | u64 read_stamp; | 471 | u64 read_stamp; |
472 | /* ring buffer pages to update, > 0 to add, < 0 to remove */ | ||
473 | int nr_pages_to_update; | ||
474 | struct list_head new_pages; /* new pages to add */ | ||
475 | struct work_struct update_pages_work; | ||
476 | struct completion update_done; | ||
469 | }; | 477 | }; |
470 | 478 | ||
471 | struct ring_buffer { | 479 | struct ring_buffer { |
472 | unsigned pages; | ||
473 | unsigned flags; | 480 | unsigned flags; |
474 | int cpus; | 481 | int cpus; |
475 | atomic_t record_disabled; | 482 | atomic_t record_disabled; |
483 | atomic_t resize_disabled; | ||
476 | cpumask_var_t cpumask; | 484 | cpumask_var_t cpumask; |
477 | 485 | ||
478 | struct lock_class_key *reader_lock_key; | 486 | struct lock_class_key *reader_lock_key; |
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
937 | struct list_head *head = cpu_buffer->pages; | 945 | struct list_head *head = cpu_buffer->pages; |
938 | struct buffer_page *bpage, *tmp; | 946 | struct buffer_page *bpage, *tmp; |
939 | 947 | ||
948 | /* Reset the head page if it exists */ | ||
949 | if (cpu_buffer->head_page) | ||
950 | rb_set_head_page(cpu_buffer); | ||
951 | |||
940 | rb_head_page_deactivate(cpu_buffer); | 952 | rb_head_page_deactivate(cpu_buffer); |
941 | 953 | ||
942 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) | 954 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) |
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
963 | return 0; | 975 | return 0; |
964 | } | 976 | } |
965 | 977 | ||
966 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | 978 | static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) |
967 | unsigned nr_pages) | ||
968 | { | 979 | { |
980 | int i; | ||
969 | struct buffer_page *bpage, *tmp; | 981 | struct buffer_page *bpage, *tmp; |
970 | LIST_HEAD(pages); | ||
971 | unsigned i; | ||
972 | |||
973 | WARN_ON(!nr_pages); | ||
974 | 982 | ||
975 | for (i = 0; i < nr_pages; i++) { | 983 | for (i = 0; i < nr_pages; i++) { |
976 | struct page *page; | 984 | struct page *page; |
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
981 | */ | 989 | */ |
982 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 990 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
983 | GFP_KERNEL | __GFP_NORETRY, | 991 | GFP_KERNEL | __GFP_NORETRY, |
984 | cpu_to_node(cpu_buffer->cpu)); | 992 | cpu_to_node(cpu)); |
985 | if (!bpage) | 993 | if (!bpage) |
986 | goto free_pages; | 994 | goto free_pages; |
987 | 995 | ||
988 | rb_check_bpage(cpu_buffer, bpage); | 996 | list_add(&bpage->list, pages); |
989 | 997 | ||
990 | list_add(&bpage->list, &pages); | 998 | page = alloc_pages_node(cpu_to_node(cpu), |
991 | |||
992 | page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), | ||
993 | GFP_KERNEL | __GFP_NORETRY, 0); | 999 | GFP_KERNEL | __GFP_NORETRY, 0); |
994 | if (!page) | 1000 | if (!page) |
995 | goto free_pages; | 1001 | goto free_pages; |
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
997 | rb_init_page(bpage->page); | 1003 | rb_init_page(bpage->page); |
998 | } | 1004 | } |
999 | 1005 | ||
1006 | return 0; | ||
1007 | |||
1008 | free_pages: | ||
1009 | list_for_each_entry_safe(bpage, tmp, pages, list) { | ||
1010 | list_del_init(&bpage->list); | ||
1011 | free_buffer_page(bpage); | ||
1012 | } | ||
1013 | |||
1014 | return -ENOMEM; | ||
1015 | } | ||
1016 | |||
1017 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | ||
1018 | unsigned nr_pages) | ||
1019 | { | ||
1020 | LIST_HEAD(pages); | ||
1021 | |||
1022 | WARN_ON(!nr_pages); | ||
1023 | |||
1024 | if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1000 | /* | 1027 | /* |
1001 | * The ring buffer page list is a circular list that does not | 1028 | * The ring buffer page list is a circular list that does not |
1002 | * start and end with a list head. All page list items point to | 1029 | * start and end with a list head. All page list items point to |
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1005 | cpu_buffer->pages = pages.next; | 1032 | cpu_buffer->pages = pages.next; |
1006 | list_del(&pages); | 1033 | list_del(&pages); |
1007 | 1034 | ||
1035 | cpu_buffer->nr_pages = nr_pages; | ||
1036 | |||
1008 | rb_check_pages(cpu_buffer); | 1037 | rb_check_pages(cpu_buffer); |
1009 | 1038 | ||
1010 | return 0; | 1039 | return 0; |
1011 | |||
1012 | free_pages: | ||
1013 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | ||
1014 | list_del_init(&bpage->list); | ||
1015 | free_buffer_page(bpage); | ||
1016 | } | ||
1017 | return -ENOMEM; | ||
1018 | } | 1040 | } |
1019 | 1041 | ||
1020 | static struct ring_buffer_per_cpu * | 1042 | static struct ring_buffer_per_cpu * |
1021 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | 1043 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) |
1022 | { | 1044 | { |
1023 | struct ring_buffer_per_cpu *cpu_buffer; | 1045 | struct ring_buffer_per_cpu *cpu_buffer; |
1024 | struct buffer_page *bpage; | 1046 | struct buffer_page *bpage; |
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1035 | raw_spin_lock_init(&cpu_buffer->reader_lock); | 1057 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1036 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1058 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1037 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1059 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1060 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); | ||
1061 | init_completion(&cpu_buffer->update_done); | ||
1038 | 1062 | ||
1039 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1063 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1040 | GFP_KERNEL, cpu_to_node(cpu)); | 1064 | GFP_KERNEL, cpu_to_node(cpu)); |
@@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1052 | 1076 | ||
1053 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
1054 | 1078 | ||
1055 | ret = rb_allocate_pages(cpu_buffer, buffer->pages); | 1079 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
1056 | if (ret < 0) | 1080 | if (ret < 0) |
1057 | goto fail_free_reader; | 1081 | goto fail_free_reader; |
1058 | 1082 | ||
@@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1113 | { | 1137 | { |
1114 | struct ring_buffer *buffer; | 1138 | struct ring_buffer *buffer; |
1115 | int bsize; | 1139 | int bsize; |
1116 | int cpu; | 1140 | int cpu, nr_pages; |
1117 | 1141 | ||
1118 | /* keep it in its own cache line */ | 1142 | /* keep it in its own cache line */ |
1119 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), | 1143 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), |
@@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1124 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) | 1148 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) |
1125 | goto fail_free_buffer; | 1149 | goto fail_free_buffer; |
1126 | 1150 | ||
1127 | buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1151 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1128 | buffer->flags = flags; | 1152 | buffer->flags = flags; |
1129 | buffer->clock = trace_clock_local; | 1153 | buffer->clock = trace_clock_local; |
1130 | buffer->reader_lock_key = key; | 1154 | buffer->reader_lock_key = key; |
1131 | 1155 | ||
1132 | /* need at least two pages */ | 1156 | /* need at least two pages */ |
1133 | if (buffer->pages < 2) | 1157 | if (nr_pages < 2) |
1134 | buffer->pages = 2; | 1158 | nr_pages = 2; |
1135 | 1159 | ||
1136 | /* | 1160 | /* |
1137 | * In case of non-hotplug cpu, if the ring-buffer is allocated | 1161 | * In case of non-hotplug cpu, if the ring-buffer is allocated |
@@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1154 | 1178 | ||
1155 | for_each_buffer_cpu(buffer, cpu) { | 1179 | for_each_buffer_cpu(buffer, cpu) { |
1156 | buffer->buffers[cpu] = | 1180 | buffer->buffers[cpu] = |
1157 | rb_allocate_cpu_buffer(buffer, cpu); | 1181 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
1158 | if (!buffer->buffers[cpu]) | 1182 | if (!buffer->buffers[cpu]) |
1159 | goto fail_free_buffers; | 1183 | goto fail_free_buffers; |
1160 | } | 1184 | } |
@@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, | |||
1222 | 1246 | ||
1223 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); | 1247 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); |
1224 | 1248 | ||
1225 | static void | 1249 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) |
1226 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | ||
1227 | { | 1250 | { |
1228 | struct buffer_page *bpage; | 1251 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1229 | struct list_head *p; | 1252 | } |
1230 | unsigned i; | 1253 | |
1254 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1255 | { | ||
1256 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1257 | } | ||
1258 | |||
1259 | static int | ||
1260 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | ||
1261 | { | ||
1262 | struct list_head *tail_page, *to_remove, *next_page; | ||
1263 | struct buffer_page *to_remove_page, *tmp_iter_page; | ||
1264 | struct buffer_page *last_page, *first_page; | ||
1265 | unsigned int nr_removed; | ||
1266 | unsigned long head_bit; | ||
1267 | int page_entries; | ||
1268 | |||
1269 | head_bit = 0; | ||
1231 | 1270 | ||
1232 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1271 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1233 | rb_head_page_deactivate(cpu_buffer); | 1272 | atomic_inc(&cpu_buffer->record_disabled); |
1273 | /* | ||
1274 | * We don't race with the readers since we have acquired the reader | ||
1275 | * lock. We also don't race with writers after disabling recording. | ||
1276 | * This makes it easy to figure out the first and the last page to be | ||
1277 | * removed from the list. We unlink all the pages in between including | ||
1278 | * the first and last pages. This is done in a busy loop so that we | ||
1279 | * lose the least number of traces. | ||
1280 | * The pages are freed after we restart recording and unlock readers. | ||
1281 | */ | ||
1282 | tail_page = &cpu_buffer->tail_page->list; | ||
1234 | 1283 | ||
1235 | for (i = 0; i < nr_pages; i++) { | 1284 | /* |
1236 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1285 | * tail page might be on reader page, we remove the next page |
1237 | goto out; | 1286 | * from the ring buffer |
1238 | p = cpu_buffer->pages->next; | 1287 | */ |
1239 | bpage = list_entry(p, struct buffer_page, list); | 1288 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) |
1240 | list_del_init(&bpage->list); | 1289 | tail_page = rb_list_head(tail_page->next); |
1241 | free_buffer_page(bpage); | 1290 | to_remove = tail_page; |
1291 | |||
1292 | /* start of pages to remove */ | ||
1293 | first_page = list_entry(rb_list_head(to_remove->next), | ||
1294 | struct buffer_page, list); | ||
1295 | |||
1296 | for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { | ||
1297 | to_remove = rb_list_head(to_remove)->next; | ||
1298 | head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; | ||
1242 | } | 1299 | } |
1243 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | ||
1244 | goto out; | ||
1245 | 1300 | ||
1246 | rb_reset_cpu(cpu_buffer); | 1301 | next_page = rb_list_head(to_remove)->next; |
1247 | rb_check_pages(cpu_buffer); | ||
1248 | 1302 | ||
1249 | out: | 1303 | /* |
1304 | * Now we remove all pages between tail_page and next_page. | ||
1305 | * Make sure that we have head_bit value preserved for the | ||
1306 | * next page | ||
1307 | */ | ||
1308 | tail_page->next = (struct list_head *)((unsigned long)next_page | | ||
1309 | head_bit); | ||
1310 | next_page = rb_list_head(next_page); | ||
1311 | next_page->prev = tail_page; | ||
1312 | |||
1313 | /* make sure pages points to a valid page in the ring buffer */ | ||
1314 | cpu_buffer->pages = next_page; | ||
1315 | |||
1316 | /* update head page */ | ||
1317 | if (head_bit) | ||
1318 | cpu_buffer->head_page = list_entry(next_page, | ||
1319 | struct buffer_page, list); | ||
1320 | |||
1321 | /* | ||
1322 | * change read pointer to make sure any read iterators reset | ||
1323 | * themselves | ||
1324 | */ | ||
1325 | cpu_buffer->read = 0; | ||
1326 | |||
1327 | /* pages are removed, resume tracing and then free the pages */ | ||
1328 | atomic_dec(&cpu_buffer->record_disabled); | ||
1250 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1329 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1330 | |||
1331 | RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); | ||
1332 | |||
1333 | /* last buffer page to remove */ | ||
1334 | last_page = list_entry(rb_list_head(to_remove), struct buffer_page, | ||
1335 | list); | ||
1336 | tmp_iter_page = first_page; | ||
1337 | |||
1338 | do { | ||
1339 | to_remove_page = tmp_iter_page; | ||
1340 | rb_inc_page(cpu_buffer, &tmp_iter_page); | ||
1341 | |||
1342 | /* update the counters */ | ||
1343 | page_entries = rb_page_entries(to_remove_page); | ||
1344 | if (page_entries) { | ||
1345 | /* | ||
1346 | * If something was added to this page, it was full | ||
1347 | * since it is not the tail page. So we deduct the | ||
1348 | * bytes consumed in ring buffer from here. | ||
1349 | * No need to update overruns, since this page is | ||
1350 | * deleted from ring buffer and its entries are | ||
1351 | * already accounted for. | ||
1352 | */ | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * We have already removed references to this list item, just | ||
1358 | * free up the buffer_page and its page | ||
1359 | */ | ||
1360 | free_buffer_page(to_remove_page); | ||
1361 | nr_removed--; | ||
1362 | |||
1363 | } while (to_remove_page != last_page); | ||
1364 | |||
1365 | RB_WARN_ON(cpu_buffer, nr_removed); | ||
1366 | |||
1367 | return nr_removed == 0; | ||
1251 | } | 1368 | } |
1252 | 1369 | ||
1253 | static void | 1370 | static int |
1254 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | 1371 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) |
1255 | struct list_head *pages, unsigned nr_pages) | ||
1256 | { | 1372 | { |
1257 | struct buffer_page *bpage; | 1373 | struct list_head *pages = &cpu_buffer->new_pages; |
1258 | struct list_head *p; | 1374 | int retries, success; |
1259 | unsigned i; | ||
1260 | 1375 | ||
1261 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1376 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1262 | rb_head_page_deactivate(cpu_buffer); | 1377 | /* |
1378 | * We are holding the reader lock, so the reader page won't be swapped | ||
1379 | * in the ring buffer. Now we are racing with the writer trying to | ||
1380 | * move head page and the tail page. | ||
1381 | * We are going to adapt the reader page update process where: | ||
1382 | * 1. We first splice the start and end of list of new pages between | ||
1383 | * the head page and its previous page. | ||
1384 | * 2. We cmpxchg the prev_page->next to point from head page to the | ||
1385 | * start of new pages list. | ||
1386 | * 3. Finally, we update the head->prev to the end of new list. | ||
1387 | * | ||
1388 | * We will try this process 10 times, to make sure that we don't keep | ||
1389 | * spinning. | ||
1390 | */ | ||
1391 | retries = 10; | ||
1392 | success = 0; | ||
1393 | while (retries--) { | ||
1394 | struct list_head *head_page, *prev_page, *r; | ||
1395 | struct list_head *last_page, *first_page; | ||
1396 | struct list_head *head_page_with_bit; | ||
1263 | 1397 | ||
1264 | for (i = 0; i < nr_pages; i++) { | 1398 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1265 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1399 | prev_page = head_page->prev; |
1266 | goto out; | 1400 | |
1267 | p = pages->next; | 1401 | first_page = pages->next; |
1268 | bpage = list_entry(p, struct buffer_page, list); | 1402 | last_page = pages->prev; |
1269 | list_del_init(&bpage->list); | 1403 | |
1270 | list_add_tail(&bpage->list, cpu_buffer->pages); | 1404 | head_page_with_bit = (struct list_head *) |
1405 | ((unsigned long)head_page | RB_PAGE_HEAD); | ||
1406 | |||
1407 | last_page->next = head_page_with_bit; | ||
1408 | first_page->prev = prev_page; | ||
1409 | |||
1410 | r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); | ||
1411 | |||
1412 | if (r == head_page_with_bit) { | ||
1413 | /* | ||
1414 | * yay, we replaced the page pointer to our new list, | ||
1415 | * now, we just have to update to head page's prev | ||
1416 | * pointer to point to end of list | ||
1417 | */ | ||
1418 | head_page->prev = last_page; | ||
1419 | success = 1; | ||
1420 | break; | ||
1421 | } | ||
1271 | } | 1422 | } |
1272 | rb_reset_cpu(cpu_buffer); | ||
1273 | rb_check_pages(cpu_buffer); | ||
1274 | 1423 | ||
1275 | out: | 1424 | if (success) |
1425 | INIT_LIST_HEAD(pages); | ||
1426 | /* | ||
1427 | * If we weren't successful in adding in new pages, warn and stop | ||
1428 | * tracing | ||
1429 | */ | ||
1430 | RB_WARN_ON(cpu_buffer, !success); | ||
1276 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1431 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1432 | |||
1433 | /* free pages if they weren't inserted */ | ||
1434 | if (!success) { | ||
1435 | struct buffer_page *bpage, *tmp; | ||
1436 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, | ||
1437 | list) { | ||
1438 | list_del_init(&bpage->list); | ||
1439 | free_buffer_page(bpage); | ||
1440 | } | ||
1441 | } | ||
1442 | return success; | ||
1443 | } | ||
1444 | |||
1445 | static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) | ||
1446 | { | ||
1447 | int success; | ||
1448 | |||
1449 | if (cpu_buffer->nr_pages_to_update > 0) | ||
1450 | success = rb_insert_pages(cpu_buffer); | ||
1451 | else | ||
1452 | success = rb_remove_pages(cpu_buffer, | ||
1453 | -cpu_buffer->nr_pages_to_update); | ||
1454 | |||
1455 | if (success) | ||
1456 | cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; | ||
1457 | } | ||
1458 | |||
1459 | static void update_pages_handler(struct work_struct *work) | ||
1460 | { | ||
1461 | struct ring_buffer_per_cpu *cpu_buffer = container_of(work, | ||
1462 | struct ring_buffer_per_cpu, update_pages_work); | ||
1463 | rb_update_pages(cpu_buffer); | ||
1464 | complete(&cpu_buffer->update_done); | ||
1277 | } | 1465 | } |
1278 | 1466 | ||
1279 | /** | 1467 | /** |
@@ -1283,16 +1471,14 @@ out: | |||
1283 | * | 1471 | * |
1284 | * Minimum size is 2 * BUF_PAGE_SIZE. | 1472 | * Minimum size is 2 * BUF_PAGE_SIZE. |
1285 | * | 1473 | * |
1286 | * Returns -1 on failure. | 1474 | * Returns 0 on success and < 0 on failure. |
1287 | */ | 1475 | */ |
1288 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | 1476 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, |
1477 | int cpu_id) | ||
1289 | { | 1478 | { |
1290 | struct ring_buffer_per_cpu *cpu_buffer; | 1479 | struct ring_buffer_per_cpu *cpu_buffer; |
1291 | unsigned nr_pages, rm_pages, new_pages; | 1480 | unsigned nr_pages; |
1292 | struct buffer_page *bpage, *tmp; | 1481 | int cpu, err = 0; |
1293 | unsigned long buffer_size; | ||
1294 | LIST_HEAD(pages); | ||
1295 | int i, cpu; | ||
1296 | 1482 | ||
1297 | /* | 1483 | /* |
1298 | * Always succeed at resizing a non-existent buffer: | 1484 | * Always succeed at resizing a non-existent buffer: |
@@ -1300,115 +1486,161 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1300 | if (!buffer) | 1486 | if (!buffer) |
1301 | return size; | 1487 | return size; |
1302 | 1488 | ||
1489 | /* Make sure the requested buffer exists */ | ||
1490 | if (cpu_id != RING_BUFFER_ALL_CPUS && | ||
1491 | !cpumask_test_cpu(cpu_id, buffer->cpumask)) | ||
1492 | return size; | ||
1493 | |||
1303 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1494 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1304 | size *= BUF_PAGE_SIZE; | 1495 | size *= BUF_PAGE_SIZE; |
1305 | buffer_size = buffer->pages * BUF_PAGE_SIZE; | ||
1306 | 1496 | ||
1307 | /* we need a minimum of two pages */ | 1497 | /* we need a minimum of two pages */ |
1308 | if (size < BUF_PAGE_SIZE * 2) | 1498 | if (size < BUF_PAGE_SIZE * 2) |
1309 | size = BUF_PAGE_SIZE * 2; | 1499 | size = BUF_PAGE_SIZE * 2; |
1310 | 1500 | ||
1311 | if (size == buffer_size) | 1501 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1312 | return size; | ||
1313 | |||
1314 | atomic_inc(&buffer->record_disabled); | ||
1315 | 1502 | ||
1316 | /* Make sure all writers are done with this buffer. */ | 1503 | /* |
1317 | synchronize_sched(); | 1504 | * Don't succeed if resizing is disabled, as a reader might be |
1505 | * manipulating the ring buffer and is expecting a sane state while | ||
1506 | * this is true. | ||
1507 | */ | ||
1508 | if (atomic_read(&buffer->resize_disabled)) | ||
1509 | return -EBUSY; | ||
1318 | 1510 | ||
1511 | /* prevent another thread from changing buffer sizes */ | ||
1319 | mutex_lock(&buffer->mutex); | 1512 | mutex_lock(&buffer->mutex); |
1320 | get_online_cpus(); | ||
1321 | |||
1322 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | ||
1323 | 1513 | ||
1324 | if (size < buffer_size) { | 1514 | if (cpu_id == RING_BUFFER_ALL_CPUS) { |
1515 | /* calculate the pages to update */ | ||
1516 | for_each_buffer_cpu(buffer, cpu) { | ||
1517 | cpu_buffer = buffer->buffers[cpu]; | ||
1325 | 1518 | ||
1326 | /* easy case, just free pages */ | 1519 | cpu_buffer->nr_pages_to_update = nr_pages - |
1327 | if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) | 1520 | cpu_buffer->nr_pages; |
1328 | goto out_fail; | 1521 | /* |
1522 | * nothing more to do for removing pages or no update | ||
1523 | */ | ||
1524 | if (cpu_buffer->nr_pages_to_update <= 0) | ||
1525 | continue; | ||
1526 | /* | ||
1527 | * to add pages, make sure all new pages can be | ||
1528 | * allocated without receiving ENOMEM | ||
1529 | */ | ||
1530 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1531 | if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, | ||
1532 | &cpu_buffer->new_pages, cpu)) { | ||
1533 | /* not enough memory for new pages */ | ||
1534 | err = -ENOMEM; | ||
1535 | goto out_err; | ||
1536 | } | ||
1537 | } | ||
1329 | 1538 | ||
1330 | rm_pages = buffer->pages - nr_pages; | 1539 | get_online_cpus(); |
1540 | /* | ||
1541 | * Fire off all the required work handlers | ||
1542 | * We can't schedule on offline CPUs, but it's not necessary | ||
1543 | * since we can change their buffer sizes without any race. | ||
1544 | */ | ||
1545 | for_each_buffer_cpu(buffer, cpu) { | ||
1546 | cpu_buffer = buffer->buffers[cpu]; | ||
1547 | if (!cpu_buffer->nr_pages_to_update) | ||
1548 | continue; | ||
1549 | |||
1550 | if (cpu_online(cpu)) | ||
1551 | schedule_work_on(cpu, | ||
1552 | &cpu_buffer->update_pages_work); | ||
1553 | else | ||
1554 | rb_update_pages(cpu_buffer); | ||
1555 | } | ||
1331 | 1556 | ||
1557 | /* wait for all the updates to complete */ | ||
1332 | for_each_buffer_cpu(buffer, cpu) { | 1558 | for_each_buffer_cpu(buffer, cpu) { |
1333 | cpu_buffer = buffer->buffers[cpu]; | 1559 | cpu_buffer = buffer->buffers[cpu]; |
1334 | rb_remove_pages(cpu_buffer, rm_pages); | 1560 | if (!cpu_buffer->nr_pages_to_update) |
1561 | continue; | ||
1562 | |||
1563 | if (cpu_online(cpu)) | ||
1564 | wait_for_completion(&cpu_buffer->update_done); | ||
1565 | cpu_buffer->nr_pages_to_update = 0; | ||
1335 | } | 1566 | } |
1336 | goto out; | ||
1337 | } | ||
1338 | 1567 | ||
1339 | /* | 1568 | put_online_cpus(); |
1340 | * This is a bit more difficult. We only want to add pages | 1569 | } else { |
1341 | * when we can allocate enough for all CPUs. We do this | 1570 | cpu_buffer = buffer->buffers[cpu_id]; |
1342 | * by allocating all the pages and storing them on a local | ||
1343 | * link list. If we succeed in our allocation, then we | ||
1344 | * add these pages to the cpu_buffers. Otherwise we just free | ||
1345 | * them all and return -ENOMEM; | ||
1346 | */ | ||
1347 | if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) | ||
1348 | goto out_fail; | ||
1349 | 1571 | ||
1350 | new_pages = nr_pages - buffer->pages; | 1572 | if (nr_pages == cpu_buffer->nr_pages) |
1573 | goto out; | ||
1351 | 1574 | ||
1352 | for_each_buffer_cpu(buffer, cpu) { | 1575 | cpu_buffer->nr_pages_to_update = nr_pages - |
1353 | for (i = 0; i < new_pages; i++) { | 1576 | cpu_buffer->nr_pages; |
1354 | struct page *page; | 1577 | |
1355 | /* | 1578 | INIT_LIST_HEAD(&cpu_buffer->new_pages); |
1356 | * __GFP_NORETRY flag makes sure that the allocation | 1579 | if (cpu_buffer->nr_pages_to_update > 0 && |
1357 | * fails gracefully without invoking oom-killer and | 1580 | __rb_allocate_pages(cpu_buffer->nr_pages_to_update, |
1358 | * the system is not destabilized. | 1581 | &cpu_buffer->new_pages, cpu_id)) { |
1359 | */ | 1582 | err = -ENOMEM; |
1360 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), | 1583 | goto out_err; |
1361 | cache_line_size()), | ||
1362 | GFP_KERNEL | __GFP_NORETRY, | ||
1363 | cpu_to_node(cpu)); | ||
1364 | if (!bpage) | ||
1365 | goto free_pages; | ||
1366 | list_add(&bpage->list, &pages); | ||
1367 | page = alloc_pages_node(cpu_to_node(cpu), | ||
1368 | GFP_KERNEL | __GFP_NORETRY, 0); | ||
1369 | if (!page) | ||
1370 | goto free_pages; | ||
1371 | bpage->page = page_address(page); | ||
1372 | rb_init_page(bpage->page); | ||
1373 | } | 1584 | } |
1374 | } | ||
1375 | 1585 | ||
1376 | for_each_buffer_cpu(buffer, cpu) { | 1586 | get_online_cpus(); |
1377 | cpu_buffer = buffer->buffers[cpu]; | ||
1378 | rb_insert_pages(cpu_buffer, &pages, new_pages); | ||
1379 | } | ||
1380 | 1587 | ||
1381 | if (RB_WARN_ON(buffer, !list_empty(&pages))) | 1588 | if (cpu_online(cpu_id)) { |
1382 | goto out_fail; | 1589 | schedule_work_on(cpu_id, |
1590 | &cpu_buffer->update_pages_work); | ||
1591 | wait_for_completion(&cpu_buffer->update_done); | ||
1592 | } else | ||
1593 | rb_update_pages(cpu_buffer); | ||
1594 | |||
1595 | cpu_buffer->nr_pages_to_update = 0; | ||
1596 | put_online_cpus(); | ||
1597 | } | ||
1383 | 1598 | ||
1384 | out: | 1599 | out: |
1385 | buffer->pages = nr_pages; | 1600 | /* |
1386 | put_online_cpus(); | 1601 | * The ring buffer resize can happen with the ring buffer |
1602 | * enabled, so that the update disturbs the tracing as little | ||
1603 | * as possible. But if the buffer is disabled, we do not need | ||
1604 | * to worry about that, and we can take the time to verify | ||
1605 | * that the buffer is not corrupt. | ||
1606 | */ | ||
1607 | if (atomic_read(&buffer->record_disabled)) { | ||
1608 | atomic_inc(&buffer->record_disabled); | ||
1609 | /* | ||
1610 | * Even though the buffer was disabled, we must make sure | ||
1611 | * that it is truly disabled before calling rb_check_pages. | ||
1612 | * There could have been a race between checking | ||
1613 | * record_disable and incrementing it. | ||
1614 | */ | ||
1615 | synchronize_sched(); | ||
1616 | for_each_buffer_cpu(buffer, cpu) { | ||
1617 | cpu_buffer = buffer->buffers[cpu]; | ||
1618 | rb_check_pages(cpu_buffer); | ||
1619 | } | ||
1620 | atomic_dec(&buffer->record_disabled); | ||
1621 | } | ||
1622 | |||
1387 | mutex_unlock(&buffer->mutex); | 1623 | mutex_unlock(&buffer->mutex); |
1624 | return size; | ||
1388 | 1625 | ||
1389 | atomic_dec(&buffer->record_disabled); | 1626 | out_err: |
1627 | for_each_buffer_cpu(buffer, cpu) { | ||
1628 | struct buffer_page *bpage, *tmp; | ||
1390 | 1629 | ||
1391 | return size; | 1630 | cpu_buffer = buffer->buffers[cpu]; |
1631 | cpu_buffer->nr_pages_to_update = 0; | ||
1392 | 1632 | ||
1393 | free_pages: | 1633 | if (list_empty(&cpu_buffer->new_pages)) |
1394 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | 1634 | continue; |
1395 | list_del_init(&bpage->list); | ||
1396 | free_buffer_page(bpage); | ||
1397 | } | ||
1398 | put_online_cpus(); | ||
1399 | mutex_unlock(&buffer->mutex); | ||
1400 | atomic_dec(&buffer->record_disabled); | ||
1401 | return -ENOMEM; | ||
1402 | 1635 | ||
1403 | /* | 1636 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, |
1404 | * Something went totally wrong, and we are too paranoid | 1637 | list) { |
1405 | * to even clean up the mess. | 1638 | list_del_init(&bpage->list); |
1406 | */ | 1639 | free_buffer_page(bpage); |
1407 | out_fail: | 1640 | } |
1408 | put_online_cpus(); | 1641 | } |
1409 | mutex_unlock(&buffer->mutex); | 1642 | mutex_unlock(&buffer->mutex); |
1410 | atomic_dec(&buffer->record_disabled); | 1643 | return err; |
1411 | return -1; | ||
1412 | } | 1644 | } |
1413 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1645 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1414 | 1646 | ||
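A hypothetical caller of the reworked interface (a sketch only, assuming just what this hunk shows: ring_buffer_resize(buffer, size_in_bytes, cpu) with RING_BUFFER_ALL_CPUS as the "every CPU" selector and a negative errno on failure; needs <linux/ring_buffer.h>) might look like:

	/* Illustrative only: grow every per-CPU buffer, then shrink one CPU. */
	static int example_resize(struct ring_buffer *buffer,
				  unsigned long bytes, int cpu)
	{
		int ret;

		/* Size every per-CPU buffer to the same byte count. */
		ret = ring_buffer_resize(buffer, bytes, RING_BUFFER_ALL_CPUS);
		if (ret < 0)
			return ret;

		/* A single CPU may now diverge from the others. */
		return ring_buffer_resize(buffer, bytes / 2, cpu);
	}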
@@ -1447,21 +1679,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) | |||
1447 | return __rb_page_index(iter->head_page, iter->head); | 1679 | return __rb_page_index(iter->head_page, iter->head); |
1448 | } | 1680 | } |
1449 | 1681 | ||
1450 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1451 | { | ||
1452 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1453 | } | ||
1454 | |||
1455 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | 1682 | static inline unsigned rb_page_commit(struct buffer_page *bpage) |
1456 | { | 1683 | { |
1457 | return local_read(&bpage->page->commit); | 1684 | return local_read(&bpage->page->commit); |
1458 | } | 1685 | } |
1459 | 1686 | ||
1460 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) | ||
1461 | { | ||
1462 | return local_read(&bpage->entries) & RB_WRITE_MASK; | ||
1463 | } | ||
1464 | |||
1465 | /* Size is determined by what has been committed */ | 1687 | /* Size is determined by what has been committed */ |
1466 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1688 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1467 | { | 1689 | { |
@@ -1510,7 +1732,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | |||
1510 | * assign the commit to the tail. | 1732 | * assign the commit to the tail. |
1511 | */ | 1733 | */ |
1512 | again: | 1734 | again: |
1513 | max_count = cpu_buffer->buffer->pages * 100; | 1735 | max_count = cpu_buffer->nr_pages * 100; |
1514 | 1736 | ||
1515 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | 1737 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { |
1516 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) | 1738 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) |
@@ -3486,6 +3708,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) | |||
3486 | 3708 | ||
3487 | iter->cpu_buffer = cpu_buffer; | 3709 | iter->cpu_buffer = cpu_buffer; |
3488 | 3710 | ||
3711 | atomic_inc(&buffer->resize_disabled); | ||
3489 | atomic_inc(&cpu_buffer->record_disabled); | 3712 | atomic_inc(&cpu_buffer->record_disabled); |
3490 | 3713 | ||
3491 | return iter; | 3714 | return iter; |
@@ -3548,7 +3771,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) | |||
3548 | { | 3771 | { |
3549 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3772 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3550 | 3773 | ||
3774 | /* | ||
3775 | * Ring buffer is disabled from recording; here's a good place | ||
3776 | * to check the integrity of the ring buffer. | ||
3777 | */ | ||
3778 | rb_check_pages(cpu_buffer); | ||
3779 | |||
3551 | atomic_dec(&cpu_buffer->record_disabled); | 3780 | atomic_dec(&cpu_buffer->record_disabled); |
3781 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | ||
3552 | kfree(iter); | 3782 | kfree(iter); |
3553 | } | 3783 | } |
3554 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); | 3784 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); |
@@ -3588,9 +3818,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); | |||
3588 | * ring_buffer_size - return the size of the ring buffer (in bytes) | 3818 | * ring_buffer_size - return the size of the ring buffer (in bytes) |
3589 | * @buffer: The ring buffer. | 3819 | * @buffer: The ring buffer. |
3590 | */ | 3820 | */ |
3591 | unsigned long ring_buffer_size(struct ring_buffer *buffer) | 3821 | unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) |
3592 | { | 3822 | { |
3593 | return BUF_PAGE_SIZE * buffer->pages; | 3823 | /* |
3824 | * BUF_PAGE_SIZE * buffer->pages | ||
3825 | * Since the buffer-wide pages field is now removed, we have converted this to | ||
3826 | * Since the nr_pages field is now removed, we have converted this to | ||
3827 | * return the per cpu buffer value. | ||
3828 | */ | ||
3829 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3830 | return 0; | ||
3831 | |||
3832 | return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; | ||
3594 | } | 3833 | } |
3595 | EXPORT_SYMBOL_GPL(ring_buffer_size); | 3834 | EXPORT_SYMBOL_GPL(ring_buffer_size); |
3596 | 3835 | ||
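A caller that still wants the old buffer-wide total now has to sum the per-CPU values itself, much as tracing_total_entries_read() does later in this diff. A sketch only, assuming just the new prototype above plus <linux/cpumask.h>:

	/* Illustrative only: recover an aggregate size across online CPUs. */
	static unsigned long example_total_size(struct ring_buffer *buffer)
	{
		unsigned long total = 0;
		int cpu;

		for_each_online_cpu(cpu)
			total += ring_buffer_size(buffer, cpu); /* 0 for CPUs not in the mask */

		return total;
	}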
@@ -3611,6 +3850,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3611 | cpu_buffer->commit_page = cpu_buffer->head_page; | 3850 | cpu_buffer->commit_page = cpu_buffer->head_page; |
3612 | 3851 | ||
3613 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 3852 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
3853 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
3614 | local_set(&cpu_buffer->reader_page->write, 0); | 3854 | local_set(&cpu_buffer->reader_page->write, 0); |
3615 | local_set(&cpu_buffer->reader_page->entries, 0); | 3855 | local_set(&cpu_buffer->reader_page->entries, 0); |
3616 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3856 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
@@ -3647,8 +3887,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3647 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 3887 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
3648 | return; | 3888 | return; |
3649 | 3889 | ||
3890 | atomic_inc(&buffer->resize_disabled); | ||
3650 | atomic_inc(&cpu_buffer->record_disabled); | 3891 | atomic_inc(&cpu_buffer->record_disabled); |
3651 | 3892 | ||
3893 | /* Make sure all commits have finished */ | ||
3894 | synchronize_sched(); | ||
3895 | |||
3652 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3896 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3653 | 3897 | ||
3654 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3898 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
@@ -3664,6 +3908,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3664 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3908 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3665 | 3909 | ||
3666 | atomic_dec(&cpu_buffer->record_disabled); | 3910 | atomic_dec(&cpu_buffer->record_disabled); |
3911 | atomic_dec(&buffer->resize_disabled); | ||
3667 | } | 3912 | } |
3668 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); | 3913 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); |
3669 | 3914 | ||
@@ -3765,8 +4010,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3765 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) | 4010 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) |
3766 | goto out; | 4011 | goto out; |
3767 | 4012 | ||
4013 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
4014 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
4015 | |||
3768 | /* At least make sure the two buffers are somewhat the same */ | 4016 | /* At least make sure the two buffers are somewhat the same */ |
3769 | if (buffer_a->pages != buffer_b->pages) | 4017 | if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) |
3770 | goto out; | 4018 | goto out; |
3771 | 4019 | ||
3772 | ret = -EAGAIN; | 4020 | ret = -EAGAIN; |
@@ -3780,9 +4028,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3780 | if (atomic_read(&buffer_b->record_disabled)) | 4028 | if (atomic_read(&buffer_b->record_disabled)) |
3781 | goto out; | 4029 | goto out; |
3782 | 4030 | ||
3783 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
3784 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
3785 | |||
3786 | if (atomic_read(&cpu_buffer_a->record_disabled)) | 4031 | if (atomic_read(&cpu_buffer_a->record_disabled)) |
3787 | goto out; | 4032 | goto out; |
3788 | 4033 | ||
@@ -4071,6 +4316,8 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4071 | struct ring_buffer *buffer = | 4316 | struct ring_buffer *buffer = |
4072 | container_of(self, struct ring_buffer, cpu_notify); | 4317 | container_of(self, struct ring_buffer, cpu_notify); |
4073 | long cpu = (long)hcpu; | 4318 | long cpu = (long)hcpu; |
4319 | int cpu_i, nr_pages_same; | ||
4320 | unsigned int nr_pages; | ||
4074 | 4321 | ||
4075 | switch (action) { | 4322 | switch (action) { |
4076 | case CPU_UP_PREPARE: | 4323 | case CPU_UP_PREPARE: |
@@ -4078,8 +4325,23 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4078 | if (cpumask_test_cpu(cpu, buffer->cpumask)) | 4325 | if (cpumask_test_cpu(cpu, buffer->cpumask)) |
4079 | return NOTIFY_OK; | 4326 | return NOTIFY_OK; |
4080 | 4327 | ||
4328 | nr_pages = 0; | ||
4329 | nr_pages_same = 1; | ||
4330 | /* check if all cpu sizes are same */ | ||
4331 | for_each_buffer_cpu(buffer, cpu_i) { | ||
4332 | /* fill in the size from first enabled cpu */ | ||
4333 | if (nr_pages == 0) | ||
4334 | nr_pages = buffer->buffers[cpu_i]->nr_pages; | ||
4335 | if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { | ||
4336 | nr_pages_same = 0; | ||
4337 | break; | ||
4338 | } | ||
4339 | } | ||
4340 | /* allocate minimum pages, user can later expand it */ | ||
4341 | if (!nr_pages_same) | ||
4342 | nr_pages = 2; | ||
4081 | buffer->buffers[cpu] = | 4343 | buffer->buffers[cpu] = |
4082 | rb_allocate_cpu_buffer(buffer, cpu); | 4344 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
4083 | if (!buffer->buffers[cpu]) { | 4345 | if (!buffer->buffers[cpu]) { |
4084 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", | 4346 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", |
4085 | cpu); | 4347 | cpu); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a22255c1010..68032c6177db 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -87,18 +87,6 @@ static int tracing_disabled = 1; | |||
87 | 87 | ||
88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); | 88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); |
89 | 89 | ||
90 | static inline void ftrace_disable_cpu(void) | ||
91 | { | ||
92 | preempt_disable(); | ||
93 | __this_cpu_inc(ftrace_cpu_disabled); | ||
94 | } | ||
95 | |||
96 | static inline void ftrace_enable_cpu(void) | ||
97 | { | ||
98 | __this_cpu_dec(ftrace_cpu_disabled); | ||
99 | preempt_enable(); | ||
100 | } | ||
101 | |||
102 | cpumask_var_t __read_mostly tracing_buffer_mask; | 90 | cpumask_var_t __read_mostly tracing_buffer_mask; |
103 | 91 | ||
104 | /* | 92 | /* |
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) | |||
629 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 617 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
630 | { | 618 | { |
631 | int len; | 619 | int len; |
632 | void *ret; | ||
633 | 620 | ||
634 | if (s->len <= s->readpos) | 621 | if (s->len <= s->readpos) |
635 | return -EBUSY; | 622 | return -EBUSY; |
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
637 | len = s->len - s->readpos; | 624 | len = s->len - s->readpos; |
638 | if (cnt > len) | 625 | if (cnt > len) |
639 | cnt = len; | 626 | cnt = len; |
640 | ret = memcpy(buf, s->buffer + s->readpos, cnt); | 627 | memcpy(buf, s->buffer + s->readpos, cnt); |
641 | if (!ret) | ||
642 | return -EFAULT; | ||
643 | 628 | ||
644 | s->readpos += cnt; | 629 | s->readpos += cnt; |
645 | return cnt; | 630 | return cnt; |
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
751 | 736 | ||
752 | arch_spin_lock(&ftrace_max_lock); | 737 | arch_spin_lock(&ftrace_max_lock); |
753 | 738 | ||
754 | ftrace_disable_cpu(); | ||
755 | |||
756 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); | 739 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); |
757 | 740 | ||
758 | if (ret == -EBUSY) { | 741 | if (ret == -EBUSY) { |
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
766 | "Failed to swap buffers due to commit in progress\n"); | 749 | "Failed to swap buffers due to commit in progress\n"); |
767 | } | 750 | } |
768 | 751 | ||
769 | ftrace_enable_cpu(); | ||
770 | |||
771 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); | 752 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); |
772 | 753 | ||
773 | __update_max_tr(tr, tsk, cpu); | 754 | __update_max_tr(tr, tsk, cpu); |
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
782 | * Register a new plugin tracer. | 763 | * Register a new plugin tracer. |
783 | */ | 764 | */ |
784 | int register_tracer(struct tracer *type) | 765 | int register_tracer(struct tracer *type) |
785 | __releases(kernel_lock) | ||
786 | __acquires(kernel_lock) | ||
787 | { | 766 | { |
788 | struct tracer *t; | 767 | struct tracer *t; |
789 | int ret = 0; | 768 | int ret = 0; |
@@ -841,7 +820,8 @@ __acquires(kernel_lock) | |||
841 | 820 | ||
842 | /* If we expanded the buffers, make sure the max is expanded too */ | 821 | /* If we expanded the buffers, make sure the max is expanded too */ |
843 | if (ring_buffer_expanded && type->use_max_tr) | 822 | if (ring_buffer_expanded && type->use_max_tr) |
844 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | 823 | ring_buffer_resize(max_tr.buffer, trace_buf_size, |
824 | RING_BUFFER_ALL_CPUS); | ||
845 | 825 | ||
846 | /* the test is responsible for initializing and enabling */ | 826 | /* the test is responsible for initializing and enabling */ |
847 | pr_info("Testing tracer %s: ", type->name); | 827 | pr_info("Testing tracer %s: ", type->name); |
@@ -857,7 +837,8 @@ __acquires(kernel_lock) | |||
857 | 837 | ||
858 | /* Shrink the max buffer again */ | 838 | /* Shrink the max buffer again */ |
859 | if (ring_buffer_expanded && type->use_max_tr) | 839 | if (ring_buffer_expanded && type->use_max_tr) |
860 | ring_buffer_resize(max_tr.buffer, 1); | 840 | ring_buffer_resize(max_tr.buffer, 1, |
841 | RING_BUFFER_ALL_CPUS); | ||
861 | 842 | ||
862 | printk(KERN_CONT "PASSED\n"); | 843 | printk(KERN_CONT "PASSED\n"); |
863 | } | 844 | } |
@@ -917,13 +898,6 @@ out: | |||
917 | mutex_unlock(&trace_types_lock); | 898 | mutex_unlock(&trace_types_lock); |
918 | } | 899 | } |
919 | 900 | ||
920 | static void __tracing_reset(struct ring_buffer *buffer, int cpu) | ||
921 | { | ||
922 | ftrace_disable_cpu(); | ||
923 | ring_buffer_reset_cpu(buffer, cpu); | ||
924 | ftrace_enable_cpu(); | ||
925 | } | ||
926 | |||
927 | void tracing_reset(struct trace_array *tr, int cpu) | 901 | void tracing_reset(struct trace_array *tr, int cpu) |
928 | { | 902 | { |
929 | struct ring_buffer *buffer = tr->buffer; | 903 | struct ring_buffer *buffer = tr->buffer; |
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
932 | 906 | ||
933 | /* Make sure all commits have finished */ | 907 | /* Make sure all commits have finished */ |
934 | synchronize_sched(); | 908 | synchronize_sched(); |
935 | __tracing_reset(buffer, cpu); | 909 | ring_buffer_reset_cpu(buffer, cpu); |
936 | 910 | ||
937 | ring_buffer_record_enable(buffer); | 911 | ring_buffer_record_enable(buffer); |
938 | } | 912 | } |
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
950 | tr->time_start = ftrace_now(tr->cpu); | 924 | tr->time_start = ftrace_now(tr->cpu); |
951 | 925 | ||
952 | for_each_online_cpu(cpu) | 926 | for_each_online_cpu(cpu) |
953 | __tracing_reset(buffer, cpu); | 927 | ring_buffer_reset_cpu(buffer, cpu); |
954 | 928 | ||
955 | ring_buffer_record_enable(buffer); | 929 | ring_buffer_record_enable(buffer); |
956 | } | 930 | } |
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
1498 | 1472 | ||
1499 | #endif /* CONFIG_STACKTRACE */ | 1473 | #endif /* CONFIG_STACKTRACE */ |
1500 | 1474 | ||
1475 | /* created for use with alloc_percpu */ | ||
1476 | struct trace_buffer_struct { | ||
1477 | char buffer[TRACE_BUF_SIZE]; | ||
1478 | }; | ||
1479 | |||
1480 | static struct trace_buffer_struct *trace_percpu_buffer; | ||
1481 | static struct trace_buffer_struct *trace_percpu_sirq_buffer; | ||
1482 | static struct trace_buffer_struct *trace_percpu_irq_buffer; | ||
1483 | static struct trace_buffer_struct *trace_percpu_nmi_buffer; | ||
1484 | |||
1485 | /* | ||
1486 | * The buffer used is dependent on the context. There is a per cpu | ||
1487 | * buffer for normal context, softirq context, hard irq context and | ||
1488 | * for NMI context. This allows for lockless recording. | ||
1489 | * | ||
1490 | * Note, if the buffers failed to be allocated, then this returns NULL | ||
1491 | */ | ||
1492 | static char *get_trace_buf(void) | ||
1493 | { | ||
1494 | struct trace_buffer_struct *percpu_buffer; | ||
1495 | struct trace_buffer_struct *buffer; | ||
1496 | |||
1497 | /* | ||
1498 | * If we have allocated per cpu buffers, then we do not | ||
1499 | * need to do any locking. | ||
1500 | */ | ||
1501 | if (in_nmi()) | ||
1502 | percpu_buffer = trace_percpu_nmi_buffer; | ||
1503 | else if (in_irq()) | ||
1504 | percpu_buffer = trace_percpu_irq_buffer; | ||
1505 | else if (in_softirq()) | ||
1506 | percpu_buffer = trace_percpu_sirq_buffer; | ||
1507 | else | ||
1508 | percpu_buffer = trace_percpu_buffer; | ||
1509 | |||
1510 | if (!percpu_buffer) | ||
1511 | return NULL; | ||
1512 | |||
1513 | buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); | ||
1514 | |||
1515 | return buffer->buffer; | ||
1516 | } | ||
1517 | |||
1518 | static int alloc_percpu_trace_buffer(void) | ||
1519 | { | ||
1520 | struct trace_buffer_struct *buffers; | ||
1521 | struct trace_buffer_struct *sirq_buffers; | ||
1522 | struct trace_buffer_struct *irq_buffers; | ||
1523 | struct trace_buffer_struct *nmi_buffers; | ||
1524 | |||
1525 | buffers = alloc_percpu(struct trace_buffer_struct); | ||
1526 | if (!buffers) | ||
1527 | goto err_warn; | ||
1528 | |||
1529 | sirq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1530 | if (!sirq_buffers) | ||
1531 | goto err_sirq; | ||
1532 | |||
1533 | irq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1534 | if (!irq_buffers) | ||
1535 | goto err_irq; | ||
1536 | |||
1537 | nmi_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1538 | if (!nmi_buffers) | ||
1539 | goto err_nmi; | ||
1540 | |||
1541 | trace_percpu_buffer = buffers; | ||
1542 | trace_percpu_sirq_buffer = sirq_buffers; | ||
1543 | trace_percpu_irq_buffer = irq_buffers; | ||
1544 | trace_percpu_nmi_buffer = nmi_buffers; | ||
1545 | |||
1546 | return 0; | ||
1547 | |||
1548 | err_nmi: | ||
1549 | free_percpu(irq_buffers); | ||
1550 | err_irq: | ||
1551 | free_percpu(sirq_buffers); | ||
1552 | err_sirq: | ||
1553 | free_percpu(buffers); | ||
1554 | err_warn: | ||
1555 | WARN(1, "Could not allocate percpu trace_printk buffer"); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | |||
1559 | void trace_printk_init_buffers(void) | ||
1560 | { | ||
1561 | static int buffers_allocated; | ||
1562 | |||
1563 | if (buffers_allocated) | ||
1564 | return; | ||
1565 | |||
1566 | if (alloc_percpu_trace_buffer()) | ||
1567 | return; | ||
1568 | |||
1569 | pr_info("ftrace: Allocated trace_printk buffers\n"); | ||
1570 | |||
1571 | buffers_allocated = 1; | ||
1572 | } | ||
1573 | |||
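The scratch buffer handed out by get_trace_buf() is per CPU and per context, so a caller has to keep preemption disabled for as long as it uses the buffer, exactly as trace_vbprintk() does below. A minimal hypothetical caller under that assumption (example_scratch_write() is illustrative, not part of the commit):

	static int example_scratch_write(const char *msg)
	{
		char *tbuffer;
		int len = 0;

		preempt_disable_notrace();	/* stay on this CPU's buffer */

		tbuffer = get_trace_buf();	/* NULL if allocation failed */
		if (tbuffer)
			len = snprintf(tbuffer, TRACE_BUF_SIZE, "%s", msg);

		preempt_enable_notrace();
		return len;
	}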
1501 | /** | 1574 | /** |
1502 | * trace_vbprintk - write binary msg to tracing buffer | 1575 | * trace_vbprintk - write binary msg to tracing buffer |
1503 | * | 1576 | * |
1504 | */ | 1577 | */ |
1505 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | 1578 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) |
1506 | { | 1579 | { |
1507 | static arch_spinlock_t trace_buf_lock = | ||
1508 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
1509 | static u32 trace_buf[TRACE_BUF_SIZE]; | ||
1510 | |||
1511 | struct ftrace_event_call *call = &event_bprint; | 1580 | struct ftrace_event_call *call = &event_bprint; |
1512 | struct ring_buffer_event *event; | 1581 | struct ring_buffer_event *event; |
1513 | struct ring_buffer *buffer; | 1582 | struct ring_buffer *buffer; |
1514 | struct trace_array *tr = &global_trace; | 1583 | struct trace_array *tr = &global_trace; |
1515 | struct trace_array_cpu *data; | ||
1516 | struct bprint_entry *entry; | 1584 | struct bprint_entry *entry; |
1517 | unsigned long flags; | 1585 | unsigned long flags; |
1518 | int disable; | 1586 | char *tbuffer; |
1519 | int cpu, len = 0, size, pc; | 1587 | int len = 0, size, pc; |
1520 | 1588 | ||
1521 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1589 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
1522 | return 0; | 1590 | return 0; |
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1526 | 1594 | ||
1527 | pc = preempt_count(); | 1595 | pc = preempt_count(); |
1528 | preempt_disable_notrace(); | 1596 | preempt_disable_notrace(); |
1529 | cpu = raw_smp_processor_id(); | ||
1530 | data = tr->data[cpu]; | ||
1531 | 1597 | ||
1532 | disable = atomic_inc_return(&data->disabled); | 1598 | tbuffer = get_trace_buf(); |
1533 | if (unlikely(disable != 1)) | 1599 | if (!tbuffer) { |
1600 | len = 0; | ||
1534 | goto out; | 1601 | goto out; |
1602 | } | ||
1535 | 1603 | ||
1536 | /* Lockdep uses trace_printk for lock tracing */ | 1604 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); |
1537 | local_irq_save(flags); | ||
1538 | arch_spin_lock(&trace_buf_lock); | ||
1539 | len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1540 | 1605 | ||
1541 | if (len > TRACE_BUF_SIZE || len < 0) | 1606 | if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) |
1542 | goto out_unlock; | 1607 | goto out; |
1543 | 1608 | ||
1609 | local_save_flags(flags); | ||
1544 | size = sizeof(*entry) + sizeof(u32) * len; | 1610 | size = sizeof(*entry) + sizeof(u32) * len; |
1545 | buffer = tr->buffer; | 1611 | buffer = tr->buffer; |
1546 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, | 1612 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, |
1547 | flags, pc); | 1613 | flags, pc); |
1548 | if (!event) | 1614 | if (!event) |
1549 | goto out_unlock; | 1615 | goto out; |
1550 | entry = ring_buffer_event_data(event); | 1616 | entry = ring_buffer_event_data(event); |
1551 | entry->ip = ip; | 1617 | entry->ip = ip; |
1552 | entry->fmt = fmt; | 1618 | entry->fmt = fmt; |
1553 | 1619 | ||
1554 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1620 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1555 | if (!filter_check_discard(call, entry, buffer, event)) { | 1621 | if (!filter_check_discard(call, entry, buffer, event)) { |
1556 | ring_buffer_unlock_commit(buffer, event); | 1622 | ring_buffer_unlock_commit(buffer, event); |
1557 | ftrace_trace_stack(buffer, flags, 6, pc); | 1623 | ftrace_trace_stack(buffer, flags, 6, pc); |
1558 | } | 1624 | } |
1559 | 1625 | ||
1560 | out_unlock: | ||
1561 | arch_spin_unlock(&trace_buf_lock); | ||
1562 | local_irq_restore(flags); | ||
1563 | |||
1564 | out: | 1626 | out: |
1565 | atomic_dec_return(&data->disabled); | ||
1566 | preempt_enable_notrace(); | 1627 | preempt_enable_notrace(); |
1567 | unpause_graph_tracing(); | 1628 | unpause_graph_tracing(); |
1568 | 1629 | ||
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr, | |||
1588 | int trace_array_vprintk(struct trace_array *tr, | 1649 | int trace_array_vprintk(struct trace_array *tr, |
1589 | unsigned long ip, const char *fmt, va_list args) | 1650 | unsigned long ip, const char *fmt, va_list args) |
1590 | { | 1651 | { |
1591 | static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; | ||
1592 | static char trace_buf[TRACE_BUF_SIZE]; | ||
1593 | |||
1594 | struct ftrace_event_call *call = &event_print; | 1652 | struct ftrace_event_call *call = &event_print; |
1595 | struct ring_buffer_event *event; | 1653 | struct ring_buffer_event *event; |
1596 | struct ring_buffer *buffer; | 1654 | struct ring_buffer *buffer; |
1597 | struct trace_array_cpu *data; | 1655 | int len = 0, size, pc; |
1598 | int cpu, len = 0, size, pc; | ||
1599 | struct print_entry *entry; | 1656 | struct print_entry *entry; |
1600 | unsigned long irq_flags; | 1657 | unsigned long flags; |
1601 | int disable; | 1658 | char *tbuffer; |
1602 | 1659 | ||
1603 | if (tracing_disabled || tracing_selftest_running) | 1660 | if (tracing_disabled || tracing_selftest_running) |
1604 | return 0; | 1661 | return 0; |
1605 | 1662 | ||
1663 | /* Don't pollute graph traces with trace_vprintk internals */ | ||
1664 | pause_graph_tracing(); | ||
1665 | |||
1606 | pc = preempt_count(); | 1666 | pc = preempt_count(); |
1607 | preempt_disable_notrace(); | 1667 | preempt_disable_notrace(); |
1608 | cpu = raw_smp_processor_id(); | ||
1609 | data = tr->data[cpu]; | ||
1610 | 1668 | ||
1611 | disable = atomic_inc_return(&data->disabled); | 1669 | |
1612 | if (unlikely(disable != 1)) | 1670 | tbuffer = get_trace_buf(); |
1671 | if (!tbuffer) { | ||
1672 | len = 0; | ||
1613 | goto out; | 1673 | goto out; |
1674 | } | ||
1614 | 1675 | ||
1615 | pause_graph_tracing(); | 1676 | len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
1616 | raw_local_irq_save(irq_flags); | 1677 | if (len > TRACE_BUF_SIZE) |
1617 | arch_spin_lock(&trace_buf_lock); | 1678 | goto out; |
1618 | len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1619 | 1679 | ||
1680 | local_save_flags(flags); | ||
1620 | size = sizeof(*entry) + len + 1; | 1681 | size = sizeof(*entry) + len + 1; |
1621 | buffer = tr->buffer; | 1682 | buffer = tr->buffer; |
1622 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 1683 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
1623 | irq_flags, pc); | 1684 | flags, pc); |
1624 | if (!event) | 1685 | if (!event) |
1625 | goto out_unlock; | 1686 | goto out; |
1626 | entry = ring_buffer_event_data(event); | 1687 | entry = ring_buffer_event_data(event); |
1627 | entry->ip = ip; | 1688 | entry->ip = ip; |
1628 | 1689 | ||
1629 | memcpy(&entry->buf, trace_buf, len); | 1690 | memcpy(&entry->buf, tbuffer, len); |
1630 | entry->buf[len] = '\0'; | 1691 | entry->buf[len] = '\0'; |
1631 | if (!filter_check_discard(call, entry, buffer, event)) { | 1692 | if (!filter_check_discard(call, entry, buffer, event)) { |
1632 | ring_buffer_unlock_commit(buffer, event); | 1693 | ring_buffer_unlock_commit(buffer, event); |
1633 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | 1694 | ftrace_trace_stack(buffer, flags, 6, pc); |
1634 | } | 1695 | } |
1635 | |||
1636 | out_unlock: | ||
1637 | arch_spin_unlock(&trace_buf_lock); | ||
1638 | raw_local_irq_restore(irq_flags); | ||
1639 | unpause_graph_tracing(); | ||
1640 | out: | 1696 | out: |
1641 | atomic_dec_return(&data->disabled); | ||
1642 | preempt_enable_notrace(); | 1697 | preempt_enable_notrace(); |
1698 | unpause_graph_tracing(); | ||
1643 | 1699 | ||
1644 | return len; | 1700 | return len; |
1645 | } | 1701 | } |
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk); | |||
1652 | 1708 | ||
1653 | static void trace_iterator_increment(struct trace_iterator *iter) | 1709 | static void trace_iterator_increment(struct trace_iterator *iter) |
1654 | { | 1710 | { |
1655 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1656 | ftrace_disable_cpu(); | ||
1657 | |||
1658 | iter->idx++; | 1711 | iter->idx++; |
1659 | if (iter->buffer_iter[iter->cpu]) | 1712 | if (iter->buffer_iter[iter->cpu]) |
1660 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); | 1713 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); |
1661 | |||
1662 | ftrace_enable_cpu(); | ||
1663 | } | 1714 | } |
1664 | 1715 | ||
1665 | static struct trace_entry * | 1716 | static struct trace_entry * |
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1669 | struct ring_buffer_event *event; | 1720 | struct ring_buffer_event *event; |
1670 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; | 1721 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; |
1671 | 1722 | ||
1672 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1673 | ftrace_disable_cpu(); | ||
1674 | |||
1675 | if (buf_iter) | 1723 | if (buf_iter) |
1676 | event = ring_buffer_iter_peek(buf_iter, ts); | 1724 | event = ring_buffer_iter_peek(buf_iter, ts); |
1677 | else | 1725 | else |
1678 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, | 1726 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, |
1679 | lost_events); | 1727 | lost_events); |
1680 | 1728 | ||
1681 | ftrace_enable_cpu(); | ||
1682 | |||
1683 | if (event) { | 1729 | if (event) { |
1684 | iter->ent_size = ring_buffer_event_length(event); | 1730 | iter->ent_size = ring_buffer_event_length(event); |
1685 | return ring_buffer_event_data(event); | 1731 | return ring_buffer_event_data(event); |
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) | |||
1769 | 1815 | ||
1770 | static void trace_consume(struct trace_iterator *iter) | 1816 | static void trace_consume(struct trace_iterator *iter) |
1771 | { | 1817 | { |
1772 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1773 | ftrace_disable_cpu(); | ||
1774 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, | 1818 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, |
1775 | &iter->lost_events); | 1819 | &iter->lost_events); |
1776 | ftrace_enable_cpu(); | ||
1777 | } | 1820 | } |
1778 | 1821 | ||
1779 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) | 1822 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) |
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1862 | iter->cpu = 0; | 1905 | iter->cpu = 0; |
1863 | iter->idx = -1; | 1906 | iter->idx = -1; |
1864 | 1907 | ||
1865 | ftrace_disable_cpu(); | ||
1866 | |||
1867 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | 1908 | if (cpu_file == TRACE_PIPE_ALL_CPU) { |
1868 | for_each_tracing_cpu(cpu) | 1909 | for_each_tracing_cpu(cpu) |
1869 | tracing_iter_reset(iter, cpu); | 1910 | tracing_iter_reset(iter, cpu); |
1870 | } else | 1911 | } else |
1871 | tracing_iter_reset(iter, cpu_file); | 1912 | tracing_iter_reset(iter, cpu_file); |
1872 | 1913 | ||
1873 | ftrace_enable_cpu(); | ||
1874 | |||
1875 | iter->leftover = 0; | 1914 | iter->leftover = 0; |
1876 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) | 1915 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) |
1877 | ; | 1916 | ; |
@@ -2332,15 +2371,13 @@ static struct trace_iterator * | |||
2332 | __tracing_open(struct inode *inode, struct file *file) | 2371 | __tracing_open(struct inode *inode, struct file *file) |
2333 | { | 2372 | { |
2334 | long cpu_file = (long) inode->i_private; | 2373 | long cpu_file = (long) inode->i_private; |
2335 | void *fail_ret = ERR_PTR(-ENOMEM); | ||
2336 | struct trace_iterator *iter; | 2374 | struct trace_iterator *iter; |
2337 | struct seq_file *m; | 2375 | int cpu; |
2338 | int cpu, ret; | ||
2339 | 2376 | ||
2340 | if (tracing_disabled) | 2377 | if (tracing_disabled) |
2341 | return ERR_PTR(-ENODEV); | 2378 | return ERR_PTR(-ENODEV); |
2342 | 2379 | ||
2343 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2380 | iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); |
2344 | if (!iter) | 2381 | if (!iter) |
2345 | return ERR_PTR(-ENOMEM); | 2382 | return ERR_PTR(-ENOMEM); |
2346 | 2383 | ||
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2397 | tracing_iter_reset(iter, cpu); | 2434 | tracing_iter_reset(iter, cpu); |
2398 | } | 2435 | } |
2399 | 2436 | ||
2400 | ret = seq_open(file, &tracer_seq_ops); | ||
2401 | if (ret < 0) { | ||
2402 | fail_ret = ERR_PTR(ret); | ||
2403 | goto fail_buffer; | ||
2404 | } | ||
2405 | |||
2406 | m = file->private_data; | ||
2407 | m->private = iter; | ||
2408 | |||
2409 | mutex_unlock(&trace_types_lock); | 2437 | mutex_unlock(&trace_types_lock); |
2410 | 2438 | ||
2411 | return iter; | 2439 | return iter; |
2412 | 2440 | ||
2413 | fail_buffer: | ||
2414 | for_each_tracing_cpu(cpu) { | ||
2415 | if (iter->buffer_iter[cpu]) | ||
2416 | ring_buffer_read_finish(iter->buffer_iter[cpu]); | ||
2417 | } | ||
2418 | free_cpumask_var(iter->started); | ||
2419 | tracing_start(); | ||
2420 | fail: | 2441 | fail: |
2421 | mutex_unlock(&trace_types_lock); | 2442 | mutex_unlock(&trace_types_lock); |
2422 | kfree(iter->trace); | 2443 | kfree(iter->trace); |
2423 | kfree(iter); | 2444 | seq_release_private(inode, file); |
2424 | 2445 | return ERR_PTR(-ENOMEM); | |
2425 | return fail_ret; | ||
2426 | } | 2446 | } |
2427 | 2447 | ||
2428 | int tracing_open_generic(struct inode *inode, struct file *filp) | 2448 | int tracing_open_generic(struct inode *inode, struct file *filp) |
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2458 | tracing_start(); | 2478 | tracing_start(); |
2459 | mutex_unlock(&trace_types_lock); | 2479 | mutex_unlock(&trace_types_lock); |
2460 | 2480 | ||
2461 | seq_release(inode, file); | ||
2462 | mutex_destroy(&iter->mutex); | 2481 | mutex_destroy(&iter->mutex); |
2463 | free_cpumask_var(iter->started); | 2482 | free_cpumask_var(iter->started); |
2464 | kfree(iter->trace); | 2483 | kfree(iter->trace); |
2465 | kfree(iter); | 2484 | seq_release_private(inode, file); |
2466 | return 0; | 2485 | return 0; |
2467 | } | 2486 | } |
2468 | 2487 | ||
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
2648 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 2667 | if (cpumask_test_cpu(cpu, tracing_cpumask) && |
2649 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2668 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2650 | atomic_inc(&global_trace.data[cpu]->disabled); | 2669 | atomic_inc(&global_trace.data[cpu]->disabled); |
2670 | ring_buffer_record_disable_cpu(global_trace.buffer, cpu); | ||
2651 | } | 2671 | } |
2652 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 2672 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && |
2653 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2673 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2654 | atomic_dec(&global_trace.data[cpu]->disabled); | 2674 | atomic_dec(&global_trace.data[cpu]->disabled); |
2675 | ring_buffer_record_enable_cpu(global_trace.buffer, cpu); | ||
2655 | } | 2676 | } |
2656 | } | 2677 | } |
2657 | arch_spin_unlock(&ftrace_max_lock); | 2678 | arch_spin_unlock(&ftrace_max_lock); |
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr) | |||
2974 | return t->init(tr); | 2995 | return t->init(tr); |
2975 | } | 2996 | } |
2976 | 2997 | ||
2977 | static int __tracing_resize_ring_buffer(unsigned long size) | 2998 | static void set_buffer_entries(struct trace_array *tr, unsigned long val) |
2999 | { | ||
3000 | int cpu; | ||
3001 | for_each_tracing_cpu(cpu) | ||
3002 | tr->data[cpu]->entries = val; | ||
3003 | } | ||
3004 | |||
3005 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | ||
2978 | { | 3006 | { |
2979 | int ret; | 3007 | int ret; |
2980 | 3008 | ||
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
2985 | */ | 3013 | */ |
2986 | ring_buffer_expanded = 1; | 3014 | ring_buffer_expanded = 1; |
2987 | 3015 | ||
2988 | ret = ring_buffer_resize(global_trace.buffer, size); | 3016 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
2989 | if (ret < 0) | 3017 | if (ret < 0) |
2990 | return ret; | 3018 | return ret; |
2991 | 3019 | ||
2992 | if (!current_trace->use_max_tr) | 3020 | if (!current_trace->use_max_tr) |
2993 | goto out; | 3021 | goto out; |
2994 | 3022 | ||
2995 | ret = ring_buffer_resize(max_tr.buffer, size); | 3023 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); |
2996 | if (ret < 0) { | 3024 | if (ret < 0) { |
2997 | int r; | 3025 | int r = 0; |
3026 | |||
3027 | if (cpu == RING_BUFFER_ALL_CPUS) { | ||
3028 | int i; | ||
3029 | for_each_tracing_cpu(i) { | ||
3030 | r = ring_buffer_resize(global_trace.buffer, | ||
3031 | global_trace.data[i]->entries, | ||
3032 | i); | ||
3033 | if (r < 0) | ||
3034 | break; | ||
3035 | } | ||
3036 | } else { | ||
3037 | r = ring_buffer_resize(global_trace.buffer, | ||
3038 | global_trace.data[cpu]->entries, | ||
3039 | cpu); | ||
3040 | } | ||
2998 | 3041 | ||
2999 | r = ring_buffer_resize(global_trace.buffer, | ||
3000 | global_trace.entries); | ||
3001 | if (r < 0) { | 3042 | if (r < 0) { |
3002 | /* | 3043 | /* |
3003 | * AARGH! We are left with different | 3044 | * AARGH! We are left with different |
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
3019 | return ret; | 3060 | return ret; |
3020 | } | 3061 | } |
3021 | 3062 | ||
3022 | max_tr.entries = size; | 3063 | if (cpu == RING_BUFFER_ALL_CPUS) |
3064 | set_buffer_entries(&max_tr, size); | ||
3065 | else | ||
3066 | max_tr.data[cpu]->entries = size; | ||
3067 | |||
3023 | out: | 3068 | out: |
3024 | global_trace.entries = size; | 3069 | if (cpu == RING_BUFFER_ALL_CPUS) |
3070 | set_buffer_entries(&global_trace, size); | ||
3071 | else | ||
3072 | global_trace.data[cpu]->entries = size; | ||
3025 | 3073 | ||
3026 | return ret; | 3074 | return ret; |
3027 | } | 3075 | } |
3028 | 3076 | ||
3029 | static ssize_t tracing_resize_ring_buffer(unsigned long size) | 3077 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) |
3030 | { | 3078 | { |
3031 | int cpu, ret = size; | 3079 | int ret = size; |
3032 | 3080 | ||
3033 | mutex_lock(&trace_types_lock); | 3081 | mutex_lock(&trace_types_lock); |
3034 | 3082 | ||
3035 | tracing_stop(); | 3083 | if (cpu_id != RING_BUFFER_ALL_CPUS) { |
3036 | 3084 | /* make sure this cpu is enabled in the mask */ | |
3037 | /* disable all cpu buffers */ | 3085 | if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { |
3038 | for_each_tracing_cpu(cpu) { | 3086 | ret = -EINVAL; |
3039 | if (global_trace.data[cpu]) | 3087 | goto out; |
3040 | atomic_inc(&global_trace.data[cpu]->disabled); | 3088 | } |
3041 | if (max_tr.data[cpu]) | ||
3042 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
3043 | } | 3089 | } |
3044 | 3090 | ||
3045 | if (size != global_trace.entries) | 3091 | ret = __tracing_resize_ring_buffer(size, cpu_id); |
3046 | ret = __tracing_resize_ring_buffer(size); | ||
3047 | |||
3048 | if (ret < 0) | 3092 | if (ret < 0) |
3049 | ret = -ENOMEM; | 3093 | ret = -ENOMEM; |
3050 | 3094 | ||
3051 | for_each_tracing_cpu(cpu) { | 3095 | out: |
3052 | if (global_trace.data[cpu]) | ||
3053 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
3054 | if (max_tr.data[cpu]) | ||
3055 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
3056 | } | ||
3057 | |||
3058 | tracing_start(); | ||
3059 | mutex_unlock(&trace_types_lock); | 3096 | mutex_unlock(&trace_types_lock); |
3060 | 3097 | ||
3061 | return ret; | 3098 | return ret; |
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void) | |||
3078 | 3115 | ||
3079 | mutex_lock(&trace_types_lock); | 3116 | mutex_lock(&trace_types_lock); |
3080 | if (!ring_buffer_expanded) | 3117 | if (!ring_buffer_expanded) |
3081 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3118 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3119 | RING_BUFFER_ALL_CPUS); | ||
3082 | mutex_unlock(&trace_types_lock); | 3120 | mutex_unlock(&trace_types_lock); |
3083 | 3121 | ||
3084 | return ret; | 3122 | return ret; |
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf) | |||
3102 | mutex_lock(&trace_types_lock); | 3140 | mutex_lock(&trace_types_lock); |
3103 | 3141 | ||
3104 | if (!ring_buffer_expanded) { | 3142 | if (!ring_buffer_expanded) { |
3105 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3143 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3144 | RING_BUFFER_ALL_CPUS); | ||
3106 | if (ret < 0) | 3145 | if (ret < 0) |
3107 | goto out; | 3146 | goto out; |
3108 | ret = 0; | 3147 | ret = 0; |
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf) | |||
3128 | * The max_tr ring buffer has some state (e.g. ring->clock) and | 3167 | * The max_tr ring buffer has some state (e.g. ring->clock) and |
3129 | * we want to preserve it. | 3168 | * we want to preserve it. |
3130 | */ | 3169 | */ |
3131 | ring_buffer_resize(max_tr.buffer, 1); | 3170 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); |
3132 | max_tr.entries = 1; | 3171 | set_buffer_entries(&max_tr, 1); |
3133 | } | 3172 | } |
3134 | destroy_trace_option_files(topts); | 3173 | destroy_trace_option_files(topts); |
3135 | 3174 | ||
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf) | |||
3137 | 3176 | ||
3138 | topts = create_trace_option_files(current_trace); | 3177 | topts = create_trace_option_files(current_trace); |
3139 | if (current_trace->use_max_tr) { | 3178 | if (current_trace->use_max_tr) { |
3140 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | 3179 | int cpu; |
3141 | if (ret < 0) | 3180 | /* we need to make per cpu buffer sizes equivalent */ |
3142 | goto out; | 3181 | for_each_tracing_cpu(cpu) { |
3143 | max_tr.entries = global_trace.entries; | 3182 | ret = ring_buffer_resize(max_tr.buffer, |
3183 | global_trace.data[cpu]->entries, | ||
3184 | cpu); | ||
3185 | if (ret < 0) | ||
3186 | goto out; | ||
3187 | max_tr.data[cpu]->entries = | ||
3188 | global_trace.data[cpu]->entries; | ||
3189 | } | ||
3144 | } | 3190 | } |
3145 | 3191 | ||
3146 | if (t->init) { | 3192 | if (t->init) { |
@@ -3642,30 +3688,82 @@ out_err: | |||
3642 | goto out; | 3688 | goto out; |
3643 | } | 3689 | } |
3644 | 3690 | ||
3691 | struct ftrace_entries_info { | ||
3692 | struct trace_array *tr; | ||
3693 | int cpu; | ||
3694 | }; | ||
3695 | |||
3696 | static int tracing_entries_open(struct inode *inode, struct file *filp) | ||
3697 | { | ||
3698 | struct ftrace_entries_info *info; | ||
3699 | |||
3700 | if (tracing_disabled) | ||
3701 | return -ENODEV; | ||
3702 | |||
3703 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
3704 | if (!info) | ||
3705 | return -ENOMEM; | ||
3706 | |||
3707 | info->tr = &global_trace; | ||
3708 | info->cpu = (unsigned long)inode->i_private; | ||
3709 | |||
3710 | filp->private_data = info; | ||
3711 | |||
3712 | return 0; | ||
3713 | } | ||
3714 | |||
3645 | static ssize_t | 3715 | static ssize_t |
3646 | tracing_entries_read(struct file *filp, char __user *ubuf, | 3716 | tracing_entries_read(struct file *filp, char __user *ubuf, |
3647 | size_t cnt, loff_t *ppos) | 3717 | size_t cnt, loff_t *ppos) |
3648 | { | 3718 | { |
3649 | struct trace_array *tr = filp->private_data; | 3719 | struct ftrace_entries_info *info = filp->private_data; |
3650 | char buf[96]; | 3720 | struct trace_array *tr = info->tr; |
3651 | int r; | 3721 | char buf[64]; |
3722 | int r = 0; | ||
3723 | ssize_t ret; | ||
3652 | 3724 | ||
3653 | mutex_lock(&trace_types_lock); | 3725 | mutex_lock(&trace_types_lock); |
3654 | if (!ring_buffer_expanded) | 3726 | |
3655 | r = sprintf(buf, "%lu (expanded: %lu)\n", | 3727 | if (info->cpu == RING_BUFFER_ALL_CPUS) { |
3656 | tr->entries >> 10, | 3728 | int cpu, buf_size_same; |
3657 | trace_buf_size >> 10); | 3729 | unsigned long size; |
3658 | else | 3730 | |
3659 | r = sprintf(buf, "%lu\n", tr->entries >> 10); | 3731 | size = 0; |
3732 | buf_size_same = 1; | ||
3733 | /* check if all cpu sizes are same */ | ||
3734 | for_each_tracing_cpu(cpu) { | ||
3735 | /* fill in the size from first enabled cpu */ | ||
3736 | if (size == 0) | ||
3737 | size = tr->data[cpu]->entries; | ||
3738 | if (size != tr->data[cpu]->entries) { | ||
3739 | buf_size_same = 0; | ||
3740 | break; | ||
3741 | } | ||
3742 | } | ||
3743 | |||
3744 | if (buf_size_same) { | ||
3745 | if (!ring_buffer_expanded) | ||
3746 | r = sprintf(buf, "%lu (expanded: %lu)\n", | ||
3747 | size >> 10, | ||
3748 | trace_buf_size >> 10); | ||
3749 | else | ||
3750 | r = sprintf(buf, "%lu\n", size >> 10); | ||
3751 | } else | ||
3752 | r = sprintf(buf, "X\n"); | ||
3753 | } else | ||
3754 | r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); | ||
3755 | |||
3660 | mutex_unlock(&trace_types_lock); | 3756 | mutex_unlock(&trace_types_lock); |
3661 | 3757 | ||
3662 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 3758 | ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
3759 | return ret; | ||
3663 | } | 3760 | } |
3664 | 3761 | ||
3665 | static ssize_t | 3762 | static ssize_t |
3666 | tracing_entries_write(struct file *filp, const char __user *ubuf, | 3763 | tracing_entries_write(struct file *filp, const char __user *ubuf, |
3667 | size_t cnt, loff_t *ppos) | 3764 | size_t cnt, loff_t *ppos) |
3668 | { | 3765 | { |
3766 | struct ftrace_entries_info *info = filp->private_data; | ||
3669 | unsigned long val; | 3767 | unsigned long val; |
3670 | int ret; | 3768 | int ret; |
3671 | 3769 | ||
@@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3680 | /* value is in KB */ | 3778 | /* value is in KB */ |
3681 | val <<= 10; | 3779 | val <<= 10; |
3682 | 3780 | ||
3683 | ret = tracing_resize_ring_buffer(val); | 3781 | ret = tracing_resize_ring_buffer(val, info->cpu); |
3684 | if (ret < 0) | 3782 | if (ret < 0) |
3685 | return ret; | 3783 | return ret; |
3686 | 3784 | ||
@@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3689 | return cnt; | 3787 | return cnt; |
3690 | } | 3788 | } |
3691 | 3789 | ||
3790 | static int | ||
3791 | tracing_entries_release(struct inode *inode, struct file *filp) | ||
3792 | { | ||
3793 | struct ftrace_entries_info *info = filp->private_data; | ||
3794 | |||
3795 | kfree(info); | ||
3796 | |||
3797 | return 0; | ||
3798 | } | ||
3799 | |||
3692 | static ssize_t | 3800 | static ssize_t |
3693 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | 3801 | tracing_total_entries_read(struct file *filp, char __user *ubuf, |
3694 | size_t cnt, loff_t *ppos) | 3802 | size_t cnt, loff_t *ppos) |
@@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, | |||
3700 | 3808 | ||
3701 | mutex_lock(&trace_types_lock); | 3809 | mutex_lock(&trace_types_lock); |
3702 | for_each_tracing_cpu(cpu) { | 3810 | for_each_tracing_cpu(cpu) { |
3703 | size += tr->entries >> 10; | 3811 | size += tr->data[cpu]->entries >> 10; |
3704 | if (!ring_buffer_expanded) | 3812 | if (!ring_buffer_expanded) |
3705 | expanded_size += trace_buf_size >> 10; | 3813 | expanded_size += trace_buf_size >> 10; |
3706 | } | 3814 | } |
@@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
3734 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | 3842 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) |
3735 | tracing_off(); | 3843 | tracing_off(); |
3736 | /* resize the ring buffer to 0 */ | 3844 | /* resize the ring buffer to 0 */ |
3737 | tracing_resize_ring_buffer(0); | 3845 | tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); |
3738 | 3846 | ||
3739 | return 0; | 3847 | return 0; |
3740 | } | 3848 | } |
@@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3749 | struct print_entry *entry; | 3857 | struct print_entry *entry; |
3750 | unsigned long irq_flags; | 3858 | unsigned long irq_flags; |
3751 | struct page *pages[2]; | 3859 | struct page *pages[2]; |
3860 | void *map_page[2]; | ||
3752 | int nr_pages = 1; | 3861 | int nr_pages = 1; |
3753 | ssize_t written; | 3862 | ssize_t written; |
3754 | void *page1; | ||
3755 | void *page2; | ||
3756 | int offset; | 3863 | int offset; |
3757 | int size; | 3864 | int size; |
3758 | int len; | 3865 | int len; |
3759 | int ret; | 3866 | int ret; |
3867 | int i; | ||
3760 | 3868 | ||
3761 | if (tracing_disabled) | 3869 | if (tracing_disabled) |
3762 | return -EINVAL; | 3870 | return -EINVAL; |
@@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3795 | goto out; | 3903 | goto out; |
3796 | } | 3904 | } |
3797 | 3905 | ||
3798 | page1 = kmap_atomic(pages[0]); | 3906 | for (i = 0; i < nr_pages; i++) |
3799 | if (nr_pages == 2) | 3907 | map_page[i] = kmap_atomic(pages[i]); |
3800 | page2 = kmap_atomic(pages[1]); | ||
3801 | 3908 | ||
3802 | local_save_flags(irq_flags); | 3909 | local_save_flags(irq_flags); |
3803 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 3910 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
@@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3815 | 3922 | ||
3816 | if (nr_pages == 2) { | 3923 | if (nr_pages == 2) { |
3817 | len = PAGE_SIZE - offset; | 3924 | len = PAGE_SIZE - offset; |
3818 | memcpy(&entry->buf, page1 + offset, len); | 3925 | memcpy(&entry->buf, map_page[0] + offset, len); |
3819 | memcpy(&entry->buf[len], page2, cnt - len); | 3926 | memcpy(&entry->buf[len], map_page[1], cnt - len); |
3820 | } else | 3927 | } else |
3821 | memcpy(&entry->buf, page1 + offset, cnt); | 3928 | memcpy(&entry->buf, map_page[0] + offset, cnt); |
3822 | 3929 | ||
3823 | if (entry->buf[cnt - 1] != '\n') { | 3930 | if (entry->buf[cnt - 1] != '\n') { |
3824 | entry->buf[cnt] = '\n'; | 3931 | entry->buf[cnt] = '\n'; |
@@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3833 | *fpos += written; | 3940 | *fpos += written; |
3834 | 3941 | ||
3835 | out_unlock: | 3942 | out_unlock: |
3836 | if (nr_pages == 2) | 3943 | for (i = 0; i < nr_pages; i++) { |
3837 | kunmap_atomic(page2); | 3944 | kunmap_atomic(map_page[i]); |
3838 | kunmap_atomic(page1); | 3945 | put_page(pages[i]); |
3839 | while (nr_pages > 0) | 3946 | } |
3840 | put_page(pages[--nr_pages]); | ||
3841 | out: | 3947 | out: |
3842 | return written; | 3948 | return written; |
3843 | } | 3949 | } |
@@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = { | |||
3933 | }; | 4039 | }; |
3934 | 4040 | ||
3935 | static const struct file_operations tracing_entries_fops = { | 4041 | static const struct file_operations tracing_entries_fops = { |
3936 | .open = tracing_open_generic, | 4042 | .open = tracing_entries_open, |
3937 | .read = tracing_entries_read, | 4043 | .read = tracing_entries_read, |
3938 | .write = tracing_entries_write, | 4044 | .write = tracing_entries_write, |
4045 | .release = tracing_entries_release, | ||
3939 | .llseek = generic_file_llseek, | 4046 | .llseek = generic_file_llseek, |
3940 | }; | 4047 | }; |
3941 | 4048 | ||
@@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4367 | struct dentry *d_cpu; | 4474 | struct dentry *d_cpu; |
4368 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 4475 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4369 | 4476 | ||
4477 | if (!d_percpu) | ||
4478 | return; | ||
4479 | |||
4370 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 4480 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4371 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4481 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4372 | if (!d_cpu) { | 4482 | if (!d_cpu) { |
@@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4387 | 4497 | ||
4388 | trace_create_file("stats", 0444, d_cpu, | 4498 | trace_create_file("stats", 0444, d_cpu, |
4389 | (void *) cpu, &tracing_stats_fops); | 4499 | (void *) cpu, &tracing_stats_fops); |
4500 | |||
4501 | trace_create_file("buffer_size_kb", 0444, d_cpu, | ||
4502 | (void *) cpu, &tracing_entries_fops); | ||
4390 | } | 4503 | } |
4391 | 4504 | ||
4392 | #ifdef CONFIG_FTRACE_SELFTEST | 4505 | #ifdef CONFIG_FTRACE_SELFTEST |
@@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void) | |||
4718 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); | 4831 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); |
4719 | 4832 | ||
4720 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4833 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4721 | &global_trace, &tracing_entries_fops); | 4834 | (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); |
4722 | 4835 | ||
4723 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | 4836 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, |
4724 | &global_trace, &tracing_total_entries_fops); | 4837 | &global_trace, &tracing_total_entries_fops); |
@@ -4957,6 +5070,10 @@ __init static int tracer_alloc_buffers(void) | |||
4957 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 5070 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
4958 | goto out_free_buffer_mask; | 5071 | goto out_free_buffer_mask; |
4959 | 5072 | ||
5073 | /* Only allocate trace_printk buffers if a trace_printk exists */ | ||
5074 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | ||
5075 | trace_printk_init_buffers(); | ||
5076 | |||
4960 | /* To save memory, keep the ring buffer size to its minimum */ | 5077 | /* To save memory, keep the ring buffer size to its minimum */ |
4961 | if (ring_buffer_expanded) | 5078 | if (ring_buffer_expanded) |
4962 | ring_buf_size = trace_buf_size; | 5079 | ring_buf_size = trace_buf_size; |
@@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void) | |||
4975 | WARN_ON(1); | 5092 | WARN_ON(1); |
4976 | goto out_free_cpumask; | 5093 | goto out_free_cpumask; |
4977 | } | 5094 | } |
4978 | global_trace.entries = ring_buffer_size(global_trace.buffer); | ||
4979 | if (global_trace.buffer_disabled) | 5095 | if (global_trace.buffer_disabled) |
4980 | tracing_off(); | 5096 | tracing_off(); |
4981 | 5097 | ||
@@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void) | |||
4988 | ring_buffer_free(global_trace.buffer); | 5104 | ring_buffer_free(global_trace.buffer); |
4989 | goto out_free_cpumask; | 5105 | goto out_free_cpumask; |
4990 | } | 5106 | } |
4991 | max_tr.entries = 1; | ||
4992 | #endif | 5107 | #endif |
4993 | 5108 | ||
4994 | /* Allocate the first page for all buffers */ | 5109 | /* Allocate the first page for all buffers */ |
@@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void) | |||
4997 | max_tr.data[i] = &per_cpu(max_tr_data, i); | 5112 | max_tr.data[i] = &per_cpu(max_tr_data, i); |
4998 | } | 5113 | } |
4999 | 5114 | ||
5115 | set_buffer_entries(&global_trace, | ||
5116 | ring_buffer_size(global_trace.buffer, 0)); | ||
5117 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5118 | set_buffer_entries(&max_tr, 1); | ||
5119 | #endif | ||
5120 | |||
5000 | trace_init_cmdlines(); | 5121 | trace_init_cmdlines(); |
5001 | 5122 | ||
5002 | register_tracer(&nop_trace); | 5123 | register_tracer(&nop_trace); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f95d65da6db8..5aec220d2de0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head { | |||
103 | unsigned long ret_ip; | 103 | unsigned long ret_ip; |
104 | }; | 104 | }; |
105 | 105 | ||
106 | struct uprobe_trace_entry_head { | ||
107 | struct trace_entry ent; | ||
108 | unsigned long ip; | ||
109 | }; | ||
110 | |||
106 | /* | 111 | /* |
107 | * trace_flag_type is an enumeration that holds different | 112 | * trace_flag_type is an enumeration that holds different |
108 | * states when a trace occurs. These are: | 113 | * states when a trace occurs. These are: |
@@ -131,6 +136,7 @@ struct trace_array_cpu { | |||
131 | atomic_t disabled; | 136 | atomic_t disabled; |
132 | void *buffer_page; /* ring buffer spare */ | 137 | void *buffer_page; /* ring buffer spare */ |
133 | 138 | ||
139 | unsigned long entries; | ||
134 | unsigned long saved_latency; | 140 | unsigned long saved_latency; |
135 | unsigned long critical_start; | 141 | unsigned long critical_start; |
136 | unsigned long critical_end; | 142 | unsigned long critical_end; |
@@ -152,7 +158,6 @@ struct trace_array_cpu { | |||
152 | */ | 158 | */ |
153 | struct trace_array { | 159 | struct trace_array { |
154 | struct ring_buffer *buffer; | 160 | struct ring_buffer *buffer; |
155 | unsigned long entries; | ||
156 | int cpu; | 161 | int cpu; |
157 | int buffer_disabled; | 162 | int buffer_disabled; |
158 | cycle_t time_start; | 163 | cycle_t time_start; |
@@ -826,6 +831,8 @@ extern struct list_head ftrace_events; | |||
826 | extern const char *__start___trace_bprintk_fmt[]; | 831 | extern const char *__start___trace_bprintk_fmt[]; |
827 | extern const char *__stop___trace_bprintk_fmt[]; | 832 | extern const char *__stop___trace_bprintk_fmt[]; |
828 | 833 | ||
834 | void trace_printk_init_buffers(void); | ||
835 | |||
829 | #undef FTRACE_ENTRY | 836 | #undef FTRACE_ENTRY |
830 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 837 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
831 | extern struct ftrace_event_call \ | 838 | extern struct ftrace_event_call \ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93ae8a9d..29111da1d100 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
294 | if (!call->name || !call->class || !call->class->reg) | 294 | if (!call->name || !call->class || !call->class->reg) |
295 | continue; | 295 | continue; |
296 | 296 | ||
297 | if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) | ||
298 | continue; | ||
299 | |||
297 | if (match && | 300 | if (match && |
298 | strcmp(match, call->name) != 0 && | 301 | strcmp(match, call->name) != 0 && |
299 | strcmp(match, call->class->system) != 0) | 302 | strcmp(match, call->class->system) != 0) |
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1164 | return -1; | 1167 | return -1; |
1165 | } | 1168 | } |
1166 | 1169 | ||
1167 | if (call->class->reg) | 1170 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1168 | trace_create_file("enable", 0644, call->dir, call, | 1171 | trace_create_file("enable", 0644, call->dir, call, |
1169 | enable); | 1172 | enable); |
1170 | 1173 | ||
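The new TRACE_EVENT_FL_IGNORE_ENABLE flag (set on the ftrace-internal events in trace_export.c just below) keeps those events out of both the wildcard set/clear path and the per-event 'enable' file. A toy model of the flag check, using hypothetical structures rather than the real ftrace_event_call:

#include <stdio.h>

#define DEMO_FL_IGNORE_ENABLE 0x01  /* stands in for TRACE_EVENT_FL_IGNORE_ENABLE */

struct demo_event {
    const char *name;
    unsigned int flags;
    int enabled;
};

static struct demo_event demo_events[] = {
    { "sched_switch", 0,                     0 },
    { "function",     DEMO_FL_IGNORE_ENABLE, 0 },
    { "sched_wakeup", 0,                     0 },
};

/* Mirror of __ftrace_set_clr_event(): flagged events are never toggled. */
static void demo_set_all(int set)
{
    size_t i;

    for (i = 0; i < sizeof(demo_events) / sizeof(demo_events[0]); i++) {
        if (demo_events[i].flags & DEMO_FL_IGNORE_ENABLE)
            continue;
        demo_events[i].enabled = set;
    }
}

int main(void)
{
    size_t i;

    demo_set_all(1);
    for (i = 0; i < sizeof(demo_events) / sizeof(demo_events[0]); i++)
        printf("%-14s enabled=%d\n", demo_events[i].name,
               demo_events[i].enabled);
    return 0;
}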
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8bc856..e039906b037d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
180 | .event.type = etype, \ | 180 | .event.type = etype, \ |
181 | .class = &event_class_ftrace_##call, \ | 181 | .class = &event_class_ftrace_##call, \ |
182 | .print_fmt = print, \ | 182 | .print_fmt = print, \ |
183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | ||
183 | }; \ | 184 | }; \ |
184 | struct ftrace_event_call __used \ | 185 | struct ftrace_event_call __used \ |
185 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 580a05ec926b..b31d3d5699fe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -19,547 +19,15 @@ | |||
19 | 19 | ||
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/seq_file.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/smp.h> | ||
26 | #include <linux/debugfs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/string.h> | ||
29 | #include <linux/ctype.h> | ||
30 | #include <linux/ptrace.h> | ||
31 | #include <linux/perf_event.h> | ||
32 | #include <linux/stringify.h> | ||
33 | #include <linux/limits.h> | ||
34 | #include <asm/bitsperlong.h> | ||
35 | |||
36 | #include "trace.h" | ||
37 | #include "trace_output.h" | ||
38 | |||
39 | #define MAX_TRACE_ARGS 128 | ||
40 | #define MAX_ARGSTR_LEN 63 | ||
41 | #define MAX_EVENT_NAME_LEN 64 | ||
42 | #define MAX_STRING_SIZE PATH_MAX | ||
43 | #define KPROBE_EVENT_SYSTEM "kprobes" | ||
44 | |||
45 | /* Reserved field names */ | ||
46 | #define FIELD_STRING_IP "__probe_ip" | ||
47 | #define FIELD_STRING_RETIP "__probe_ret_ip" | ||
48 | #define FIELD_STRING_FUNC "__probe_func" | ||
49 | |||
50 | const char *reserved_field_names[] = { | ||
51 | "common_type", | ||
52 | "common_flags", | ||
53 | "common_preempt_count", | ||
54 | "common_pid", | ||
55 | "common_tgid", | ||
56 | FIELD_STRING_IP, | ||
57 | FIELD_STRING_RETIP, | ||
58 | FIELD_STRING_FUNC, | ||
59 | }; | ||
60 | |||
61 | /* Printing function type */ | ||
62 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, | ||
63 | void *); | ||
64 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
65 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
66 | |||
67 | /* Printing in basic type function template */ | ||
68 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | ||
69 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | ||
70 | const char *name, \ | ||
71 | void *data, void *ent)\ | ||
72 | { \ | ||
73 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | ||
74 | } \ | ||
75 | static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | ||
76 | |||
77 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) | ||
78 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) | ||
79 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) | ||
80 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) | ||
81 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) | ||
82 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | ||
83 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | ||
84 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | ||
85 | |||
86 | /* data_rloc: data relative location, compatible with u32 */ | ||
87 | #define make_data_rloc(len, roffs) \ | ||
88 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
89 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
90 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
91 | |||
92 | static inline void *get_rloc_data(u32 *dl) | ||
93 | { | ||
94 | return (u8 *)dl + get_rloc_offs(*dl); | ||
95 | } | ||
96 | |||
97 | /* For data_loc conversion */ | ||
98 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
99 | { | ||
100 | return (u8 *)ent + get_rloc_offs(*dl); | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Convert data_rloc to data_loc: | ||
105 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
106 | * stores the offset from event entry. | ||
107 | */ | ||
108 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
109 | |||
110 | /* For defining macros, define string/string_size types */ | ||
111 | typedef u32 string; | ||
112 | typedef u32 string_size; | ||
113 | |||
114 | /* Print type function for string type */ | ||
115 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
116 | const char *name, | ||
117 | void *data, void *ent) | ||
118 | { | ||
119 | int len = *(u32 *)data >> 16; | ||
120 | |||
121 | if (!len) | ||
122 | return trace_seq_printf(s, " %s=(fault)", name); | ||
123 | else | ||
124 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
125 | (const char *)get_loc_data(data, ent)); | ||
126 | } | ||
127 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
128 | |||
129 | /* Data fetch function type */ | ||
130 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | ||
131 | |||
132 | struct fetch_param { | ||
133 | fetch_func_t fn; | ||
134 | void *data; | ||
135 | }; | ||
136 | |||
137 | static __kprobes void call_fetch(struct fetch_param *fprm, | ||
138 | struct pt_regs *regs, void *dest) | ||
139 | { | ||
140 | return fprm->fn(regs, fprm->data, dest); | ||
141 | } | ||
142 | |||
143 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
144 | /* | ||
145 | * Define macro for basic types - we don't need to define s* types, because | ||
146 | * we have to care only about bitwidth at recording time. | ||
147 | */ | ||
148 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
149 | DEFINE_FETCH_##method(u8) \ | ||
150 | DEFINE_FETCH_##method(u16) \ | ||
151 | DEFINE_FETCH_##method(u32) \ | ||
152 | DEFINE_FETCH_##method(u64) | ||
153 | |||
154 | #define CHECK_FETCH_FUNCS(method, fn) \ | ||
155 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ | ||
156 | (FETCH_FUNC_NAME(method, u16) == fn) || \ | ||
157 | (FETCH_FUNC_NAME(method, u32) == fn) || \ | ||
158 | (FETCH_FUNC_NAME(method, u64) == fn) || \ | ||
159 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
160 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
161 | && (fn != NULL)) | ||
162 | |||
163 | /* Data fetch function templates */ | ||
164 | #define DEFINE_FETCH_reg(type) \ | ||
165 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | ||
166 | void *offset, void *dest) \ | ||
167 | { \ | ||
168 | *(type *)dest = (type)regs_get_register(regs, \ | ||
169 | (unsigned int)((unsigned long)offset)); \ | ||
170 | } | ||
171 | DEFINE_BASIC_FETCH_FUNCS(reg) | ||
172 | /* No string on the register */ | ||
173 | #define fetch_reg_string NULL | ||
174 | #define fetch_reg_string_size NULL | ||
175 | |||
176 | #define DEFINE_FETCH_stack(type) \ | ||
177 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
178 | void *offset, void *dest) \ | ||
179 | { \ | ||
180 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
181 | (unsigned int)((unsigned long)offset)); \ | ||
182 | } | ||
183 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
184 | /* No string on the stack entry */ | ||
185 | #define fetch_stack_string NULL | ||
186 | #define fetch_stack_string_size NULL | ||
187 | |||
188 | #define DEFINE_FETCH_retval(type) \ | ||
189 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | ||
190 | void *dummy, void *dest) \ | ||
191 | { \ | ||
192 | *(type *)dest = (type)regs_return_value(regs); \ | ||
193 | } | ||
194 | DEFINE_BASIC_FETCH_FUNCS(retval) | ||
195 | /* No string on the retval */ | ||
196 | #define fetch_retval_string NULL | ||
197 | #define fetch_retval_string_size NULL | ||
198 | |||
199 | #define DEFINE_FETCH_memory(type) \ | ||
200 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
201 | void *addr, void *dest) \ | ||
202 | { \ | ||
203 | type retval; \ | ||
204 | if (probe_kernel_address(addr, retval)) \ | ||
205 | *(type *)dest = 0; \ | ||
206 | else \ | ||
207 | *(type *)dest = retval; \ | ||
208 | } | ||
209 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
210 | /* | ||
211 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
212 | * length and relative data location. | ||
213 | */ | ||
214 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
215 | void *addr, void *dest) | ||
216 | { | ||
217 | long ret; | ||
218 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
219 | u8 *dst = get_rloc_data(dest); | ||
220 | u8 *src = addr; | ||
221 | mm_segment_t old_fs = get_fs(); | ||
222 | if (!maxlen) | ||
223 | return; | ||
224 | /* | ||
225 | * Try to get string again, since the string can be changed while | ||
226 | * probing. | ||
227 | */ | ||
228 | set_fs(KERNEL_DS); | ||
229 | pagefault_disable(); | ||
230 | do | ||
231 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
232 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
233 | dst[-1] = '\0'; | ||
234 | pagefault_enable(); | ||
235 | set_fs(old_fs); | ||
236 | |||
237 | if (ret < 0) { /* Failed to fetch string */ | ||
238 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
239 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
240 | } else | ||
241 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
242 | get_rloc_offs(*(u32 *)dest)); | ||
243 | } | ||
244 | /* Return the length of the string -- including the terminating null byte */ ||
245 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
246 | void *addr, void *dest) | ||
247 | { | ||
248 | int ret, len = 0; | ||
249 | u8 c; | ||
250 | mm_segment_t old_fs = get_fs(); | ||
251 | |||
252 | set_fs(KERNEL_DS); | ||
253 | pagefault_disable(); | ||
254 | do { | ||
255 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
256 | len++; | ||
257 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
258 | pagefault_enable(); | ||
259 | set_fs(old_fs); | ||
260 | |||
261 | if (ret < 0) /* Failed to check the length */ | ||
262 | *(u32 *)dest = 0; | ||
263 | else | ||
264 | *(u32 *)dest = len; | ||
265 | } | ||
266 | |||
267 | /* Memory fetching by symbol */ | ||
268 | struct symbol_cache { | ||
269 | char *symbol; | ||
270 | long offset; | ||
271 | unsigned long addr; | ||
272 | }; | ||
273 | |||
274 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
275 | { | ||
276 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
277 | if (sc->addr) | ||
278 | sc->addr += sc->offset; | ||
279 | return sc->addr; | ||
280 | } | ||
281 | |||
282 | static void free_symbol_cache(struct symbol_cache *sc) | ||
283 | { | ||
284 | kfree(sc->symbol); | ||
285 | kfree(sc); | ||
286 | } | ||
287 | |||
288 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
289 | { | ||
290 | struct symbol_cache *sc; | ||
291 | |||
292 | if (!sym || strlen(sym) == 0) | ||
293 | return NULL; | ||
294 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
295 | if (!sc) | ||
296 | return NULL; | ||
297 | |||
298 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
299 | if (!sc->symbol) { | ||
300 | kfree(sc); | ||
301 | return NULL; | ||
302 | } | ||
303 | sc->offset = offset; | ||
304 | 22 | ||
305 | update_symbol_cache(sc); | 23 | #include "trace_probe.h" |
306 | return sc; | ||
307 | } | ||
308 | |||
309 | #define DEFINE_FETCH_symbol(type) \ | ||
310 | static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | ||
311 | void *data, void *dest) \ | ||
312 | { \ | ||
313 | struct symbol_cache *sc = data; \ | ||
314 | if (sc->addr) \ | ||
315 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
316 | else \ | ||
317 | *(type *)dest = 0; \ | ||
318 | } | ||
319 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
320 | DEFINE_FETCH_symbol(string) | ||
321 | DEFINE_FETCH_symbol(string_size) | ||
322 | |||
323 | /* Dereference memory access function */ | ||
324 | struct deref_fetch_param { | ||
325 | struct fetch_param orig; | ||
326 | long offset; | ||
327 | }; | ||
328 | |||
329 | #define DEFINE_FETCH_deref(type) \ | ||
330 | static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | ||
331 | void *data, void *dest) \ | ||
332 | { \ | ||
333 | struct deref_fetch_param *dprm = data; \ | ||
334 | unsigned long addr; \ | ||
335 | call_fetch(&dprm->orig, regs, &addr); \ | ||
336 | if (addr) { \ | ||
337 | addr += dprm->offset; \ | ||
338 | fetch_memory_##type(regs, (void *)addr, dest); \ | ||
339 | } else \ | ||
340 | *(type *)dest = 0; \ | ||
341 | } | ||
342 | DEFINE_BASIC_FETCH_FUNCS(deref) | ||
343 | DEFINE_FETCH_deref(string) | ||
344 | DEFINE_FETCH_deref(string_size) | ||
345 | |||
346 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
347 | { | ||
348 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
349 | update_deref_fetch_param(data->orig.data); | ||
350 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
351 | update_symbol_cache(data->orig.data); | ||
352 | } | ||
353 | |||
354 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | ||
355 | { | ||
356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
357 | free_deref_fetch_param(data->orig.data); | ||
358 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
359 | free_symbol_cache(data->orig.data); | ||
360 | kfree(data); | ||
361 | } | ||
362 | |||
363 | /* Bitfield fetch function */ | ||
364 | struct bitfield_fetch_param { | ||
365 | struct fetch_param orig; | ||
366 | unsigned char hi_shift; | ||
367 | unsigned char low_shift; | ||
368 | }; | ||
369 | 24 | ||
370 | #define DEFINE_FETCH_bitfield(type) \ | 25 | #define KPROBE_EVENT_SYSTEM "kprobes" |
371 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
372 | void *data, void *dest) \ | ||
373 | { \ | ||
374 | struct bitfield_fetch_param *bprm = data; \ | ||
375 | type buf = 0; \ | ||
376 | call_fetch(&bprm->orig, regs, &buf); \ | ||
377 | if (buf) { \ | ||
378 | buf <<= bprm->hi_shift; \ | ||
379 | buf >>= bprm->low_shift; \ | ||
380 | } \ | ||
381 | *(type *)dest = buf; \ | ||
382 | } | ||
383 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
384 | #define fetch_bitfield_string NULL | ||
385 | #define fetch_bitfield_string_size NULL | ||
386 | |||
387 | static __kprobes void | ||
388 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
389 | { | ||
390 | /* | ||
391 | * Don't check the bitfield itself, because this must be the | ||
392 | * last fetch function. | ||
393 | */ | ||
394 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
395 | update_deref_fetch_param(data->orig.data); | ||
396 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
397 | update_symbol_cache(data->orig.data); | ||
398 | } | ||
399 | |||
400 | static __kprobes void | ||
401 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
402 | { | ||
403 | /* | ||
404 | * Don't check the bitfield itself, because this must be the | ||
405 | * last fetch function. | ||
406 | */ | ||
407 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
408 | free_deref_fetch_param(data->orig.data); | ||
409 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
410 | free_symbol_cache(data->orig.data); | ||
411 | kfree(data); | ||
412 | } | ||
413 | |||
414 | /* Default (unsigned long) fetch type */ | ||
415 | #define __DEFAULT_FETCH_TYPE(t) u##t | ||
416 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
417 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
418 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
419 | |||
420 | /* Fetch types */ | ||
421 | enum { | ||
422 | FETCH_MTD_reg = 0, | ||
423 | FETCH_MTD_stack, | ||
424 | FETCH_MTD_retval, | ||
425 | FETCH_MTD_memory, | ||
426 | FETCH_MTD_symbol, | ||
427 | FETCH_MTD_deref, | ||
428 | FETCH_MTD_bitfield, | ||
429 | FETCH_MTD_END, | ||
430 | }; | ||
431 | |||
432 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
433 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
434 | |||
435 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
436 | {.name = _name, \ | ||
437 | .size = _size, \ | ||
438 | .is_signed = sign, \ | ||
439 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
440 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
441 | .fmttype = _fmttype, \ | ||
442 | .fetch = { \ | ||
443 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
444 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
445 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
446 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
447 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
448 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
449 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
450 | } \ | ||
451 | } | ||
452 | |||
453 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
454 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
455 | |||
456 | #define FETCH_TYPE_STRING 0 | ||
457 | #define FETCH_TYPE_STRSIZE 1 | ||
458 | |||
459 | /* Fetch type information table */ | ||
460 | static const struct fetch_type { | ||
461 | const char *name; /* Name of type */ | ||
462 | size_t size; /* Byte size of type */ | ||
463 | int is_signed; /* Signed flag */ | ||
464 | print_type_func_t print; /* Print functions */ | ||
465 | const char *fmt; /* Format string */ ||
466 | const char *fmttype; /* Name in format file */ | ||
467 | /* Fetch functions */ | ||
468 | fetch_func_t fetch[FETCH_MTD_END]; | ||
469 | } fetch_type_table[] = { | ||
470 | /* Special types */ | ||
471 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
472 | sizeof(u32), 1, "__data_loc char[]"), | ||
473 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
474 | string_size, sizeof(u32), 0, "u32"), | ||
475 | /* Basic types */ | ||
476 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
477 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
478 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
479 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
480 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
481 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
482 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
483 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
484 | }; | ||
485 | |||
486 | static const struct fetch_type *find_fetch_type(const char *type) | ||
487 | { | ||
488 | int i; | ||
489 | |||
490 | if (!type) | ||
491 | type = DEFAULT_FETCH_TYPE_STR; | ||
492 | |||
493 | /* Special case: bitfield */ | ||
494 | if (*type == 'b') { | ||
495 | unsigned long bs; | ||
496 | type = strchr(type, '/'); | ||
497 | if (!type) | ||
498 | goto fail; | ||
499 | type++; | ||
500 | if (strict_strtoul(type, 0, &bs)) | ||
501 | goto fail; | ||
502 | switch (bs) { | ||
503 | case 8: | ||
504 | return find_fetch_type("u8"); | ||
505 | case 16: | ||
506 | return find_fetch_type("u16"); | ||
507 | case 32: | ||
508 | return find_fetch_type("u32"); | ||
509 | case 64: | ||
510 | return find_fetch_type("u64"); | ||
511 | default: | ||
512 | goto fail; | ||
513 | } | ||
514 | } | ||
515 | |||
516 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | ||
517 | if (strcmp(type, fetch_type_table[i].name) == 0) | ||
518 | return &fetch_type_table[i]; | ||
519 | fail: | ||
520 | return NULL; | ||
521 | } | ||
522 | |||
523 | /* Special function : only accept unsigned long */ | ||
524 | static __kprobes void fetch_stack_address(struct pt_regs *regs, | ||
525 | void *dummy, void *dest) | ||
526 | { | ||
527 | *(unsigned long *)dest = kernel_stack_pointer(regs); | ||
528 | } | ||
529 | |||
530 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
531 | fetch_func_t orig_fn) | ||
532 | { | ||
533 | int i; | ||
534 | |||
535 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
536 | return NULL; /* Only string type needs size function */ | ||
537 | for (i = 0; i < FETCH_MTD_END; i++) | ||
538 | if (type->fetch[i] == orig_fn) | ||
539 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
540 | |||
541 | WARN_ON(1); /* This should not happen */ | ||
542 | return NULL; | ||
543 | } | ||
544 | 26 | ||
545 | /** | 27 | /** |
546 | * Kprobe event core functions | 28 | * Kprobe event core functions |
547 | */ | 29 | */ |
548 | 30 | ||
549 | struct probe_arg { | ||
550 | struct fetch_param fetch; | ||
551 | struct fetch_param fetch_size; | ||
552 | unsigned int offset; /* Offset from argument entry */ | ||
553 | const char *name; /* Name of this argument */ | ||
554 | const char *comm; /* Command of this argument */ | ||
555 | const struct fetch_type *type; /* Type of this argument */ | ||
556 | }; | ||
557 | |||
558 | /* Flags for trace_probe */ | ||
559 | #define TP_FLAG_TRACE 1 | ||
560 | #define TP_FLAG_PROFILE 2 | ||
561 | #define TP_FLAG_REGISTERED 4 | ||
562 | |||
563 | struct trace_probe { | 31 | struct trace_probe { |
564 | struct list_head list; | 32 | struct list_head list; |
565 | struct kretprobe rp; /* Use rp.kp for kprobe use */ | 33 | struct kretprobe rp; /* Use rp.kp for kprobe use */ |
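Everything removed above reappears in the new shared trace_probe.[ch] so it can be reused by a second probe-based tracer (the uprobe_trace_entry_head added to trace.h earlier points at that intended user). One detail worth keeping in mind when reading it is the data_rloc encoding used for dynamic string arguments: the upper 16 bits hold the string length, the lower 16 bits the offset of the data. A stand-alone illustration using copies of the removed macros, adapted to <stdint.h> types:

#include <stdio.h>
#include <stdint.h>

/* Copied from the removed trace_kprobe.c helpers (now shared code). */
#define make_data_rloc(len, roffs) \
    (((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
#define get_rloc_len(dl)  ((uint32_t)(dl) >> 16)
#define get_rloc_offs(dl) ((uint32_t)(dl) & 0xffff)

int main(void)
{
    /* A 5-byte string stored 24 bytes after the rloc word itself. */
    uint32_t dl = make_data_rloc(5, 24);

    printf("packed: 0x%08lx\n", (unsigned long)dl);
    printf("length: %lu\n", (unsigned long)get_rloc_len(dl));
    printf("offset: %lu\n", (unsigned long)get_rloc_offs(dl));
    return 0;
}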
@@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | |||
631 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | 99 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, |
632 | struct pt_regs *regs); | 100 | struct pt_regs *regs); |
633 | 101 | ||
634 | /* Check the name is good for event/group/fields */ | ||
635 | static int is_good_name(const char *name) | ||
636 | { | ||
637 | if (!isalpha(*name) && *name != '_') | ||
638 | return 0; | ||
639 | while (*++name != '\0') { | ||
640 | if (!isalpha(*name) && !isdigit(*name) && *name != '_') | ||
641 | return 0; | ||
642 | } | ||
643 | return 1; | ||
644 | } | ||
645 | |||
646 | /* | 102 | /* |
647 | * Allocate new trace_probe and initialize it (including kprobes). | 103 | * Allocate new trace_probe and initialize it (including kprobes). |
648 | */ | 104 | */ |
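is_good_name() also moves into the shared code; it accepts only C-identifier-style event, group and field names. A stand-alone copy of the removed logic (with unsigned-char casts added for user space) and a few sample inputs:

#include <stdio.h>
#include <ctype.h>

/* Same logic as the removed is_good_name(). */
static int is_good_name(const char *name)
{
    if (!isalpha((unsigned char)*name) && *name != '_')
        return 0;
    while (*++name != '\0') {
        if (!isalpha((unsigned char)*name) &&
            !isdigit((unsigned char)*name) && *name != '_')
            return 0;
    }
    return 1;
}

int main(void)
{
    const char *samples[] = { "myprobe", "_probe1", "1badname", "bad-name" };
    size_t i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("%-10s -> %s\n", samples[i],
               is_good_name(samples[i]) ? "ok" : "rejected");
    return 0;
}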
@@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
651 | void *addr, | 107 | void *addr, |
652 | const char *symbol, | 108 | const char *symbol, |
653 | unsigned long offs, | 109 | unsigned long offs, |
654 | int nargs, int is_return) | 110 | int nargs, bool is_return) |
655 | { | 111 | { |
656 | struct trace_probe *tp; | 112 | struct trace_probe *tp; |
657 | int ret = -ENOMEM; | 113 | int ret = -ENOMEM; |
@@ -702,34 +158,12 @@ error: | |||
702 | return ERR_PTR(ret); | 158 | return ERR_PTR(ret); |
703 | } | 159 | } |
704 | 160 | ||
705 | static void update_probe_arg(struct probe_arg *arg) | ||
706 | { | ||
707 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
708 | update_bitfield_fetch_param(arg->fetch.data); | ||
709 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
710 | update_deref_fetch_param(arg->fetch.data); | ||
711 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
712 | update_symbol_cache(arg->fetch.data); | ||
713 | } | ||
714 | |||
715 | static void free_probe_arg(struct probe_arg *arg) | ||
716 | { | ||
717 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
718 | free_bitfield_fetch_param(arg->fetch.data); | ||
719 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
720 | free_deref_fetch_param(arg->fetch.data); | ||
721 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
722 | free_symbol_cache(arg->fetch.data); | ||
723 | kfree(arg->name); | ||
724 | kfree(arg->comm); | ||
725 | } | ||
726 | |||
727 | static void free_trace_probe(struct trace_probe *tp) | 161 | static void free_trace_probe(struct trace_probe *tp) |
728 | { | 162 | { |
729 | int i; | 163 | int i; |
730 | 164 | ||
731 | for (i = 0; i < tp->nr_args; i++) | 165 | for (i = 0; i < tp->nr_args; i++) |
732 | free_probe_arg(&tp->args[i]); | 166 | traceprobe_free_probe_arg(&tp->args[i]); |
733 | 167 | ||
734 | kfree(tp->call.class->system); | 168 | kfree(tp->call.class->system); |
735 | kfree(tp->call.name); | 169 | kfree(tp->call.name); |
@@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp) | |||
787 | return -EINVAL; | 221 | return -EINVAL; |
788 | 222 | ||
789 | for (i = 0; i < tp->nr_args; i++) | 223 | for (i = 0; i < tp->nr_args; i++) |
790 | update_probe_arg(&tp->args[i]); | 224 | traceprobe_update_arg(&tp->args[i]); |
791 | 225 | ||
792 | /* Set/clear disabled flag according to tp->flag */ | 226 | /* Set/clear disabled flag according to tp->flag */ |
793 | if (trace_probe_is_enabled(tp)) | 227 | if (trace_probe_is_enabled(tp)) |
@@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = { | |||
919 | .priority = 1 /* Invoked after kprobe module callback */ | 353 | .priority = 1 /* Invoked after kprobe module callback */ |
920 | }; | 354 | }; |
921 | 355 | ||
922 | /* Split symbol and offset. */ | ||
923 | static int split_symbol_offset(char *symbol, unsigned long *offset) | ||
924 | { | ||
925 | char *tmp; | ||
926 | int ret; | ||
927 | |||
928 | if (!offset) | ||
929 | return -EINVAL; | ||
930 | |||
931 | tmp = strchr(symbol, '+'); | ||
932 | if (tmp) { | ||
933 | /* skip sign because strict_strtol doesn't accept '+' */ | ||
934 | ret = strict_strtoul(tmp + 1, 0, offset); | ||
935 | if (ret) | ||
936 | return ret; | ||
937 | *tmp = '\0'; | ||
938 | } else | ||
939 | *offset = 0; | ||
940 | return 0; | ||
941 | } | ||
942 | |||
943 | #define PARAM_MAX_ARGS 16 | ||
944 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | ||
945 | |||
946 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | ||
947 | struct fetch_param *f, int is_return) | ||
948 | { | ||
949 | int ret = 0; | ||
950 | unsigned long param; | ||
951 | |||
952 | if (strcmp(arg, "retval") == 0) { | ||
953 | if (is_return) | ||
954 | f->fn = t->fetch[FETCH_MTD_retval]; | ||
955 | else | ||
956 | ret = -EINVAL; | ||
957 | } else if (strncmp(arg, "stack", 5) == 0) { | ||
958 | if (arg[5] == '\0') { | ||
959 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) | ||
960 | f->fn = fetch_stack_address; | ||
961 | else | ||
962 | ret = -EINVAL; | ||
963 | } else if (isdigit(arg[5])) { | ||
964 | ret = strict_strtoul(arg + 5, 10, ¶m); | ||
965 | if (ret || param > PARAM_MAX_STACK) | ||
966 | ret = -EINVAL; | ||
967 | else { | ||
968 | f->fn = t->fetch[FETCH_MTD_stack]; | ||
969 | f->data = (void *)param; | ||
970 | } | ||
971 | } else | ||
972 | ret = -EINVAL; | ||
973 | } else | ||
974 | ret = -EINVAL; | ||
975 | return ret; | ||
976 | } | ||
977 | |||
978 | /* Recursive argument parser */ | ||
979 | static int __parse_probe_arg(char *arg, const struct fetch_type *t, | ||
980 | struct fetch_param *f, int is_return) | ||
981 | { | ||
982 | int ret = 0; | ||
983 | unsigned long param; | ||
984 | long offset; | ||
985 | char *tmp; | ||
986 | |||
987 | switch (arg[0]) { | ||
988 | case '$': | ||
989 | ret = parse_probe_vars(arg + 1, t, f, is_return); | ||
990 | break; | ||
991 | case '%': /* named register */ | ||
992 | ret = regs_query_register_offset(arg + 1); | ||
993 | if (ret >= 0) { | ||
994 | f->fn = t->fetch[FETCH_MTD_reg]; | ||
995 | f->data = (void *)(unsigned long)ret; | ||
996 | ret = 0; | ||
997 | } | ||
998 | break; | ||
999 | case '@': /* memory or symbol */ | ||
1000 | if (isdigit(arg[1])) { | ||
1001 | ret = strict_strtoul(arg + 1, 0, ¶m); | ||
1002 | if (ret) | ||
1003 | break; | ||
1004 | f->fn = t->fetch[FETCH_MTD_memory]; | ||
1005 | f->data = (void *)param; | ||
1006 | } else { | ||
1007 | ret = split_symbol_offset(arg + 1, &offset); | ||
1008 | if (ret) | ||
1009 | break; | ||
1010 | f->data = alloc_symbol_cache(arg + 1, offset); | ||
1011 | if (f->data) | ||
1012 | f->fn = t->fetch[FETCH_MTD_symbol]; | ||
1013 | } | ||
1014 | break; | ||
1015 | case '+': /* deref memory */ | ||
1016 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
1017 | case '-': | ||
1018 | tmp = strchr(arg, '('); | ||
1019 | if (!tmp) | ||
1020 | break; | ||
1021 | *tmp = '\0'; | ||
1022 | ret = strict_strtol(arg, 0, &offset); | ||
1023 | if (ret) | ||
1024 | break; | ||
1025 | arg = tmp + 1; | ||
1026 | tmp = strrchr(arg, ')'); | ||
1027 | if (tmp) { | ||
1028 | struct deref_fetch_param *dprm; | ||
1029 | const struct fetch_type *t2 = find_fetch_type(NULL); | ||
1030 | *tmp = '\0'; | ||
1031 | dprm = kzalloc(sizeof(struct deref_fetch_param), | ||
1032 | GFP_KERNEL); | ||
1033 | if (!dprm) | ||
1034 | return -ENOMEM; | ||
1035 | dprm->offset = offset; | ||
1036 | ret = __parse_probe_arg(arg, t2, &dprm->orig, | ||
1037 | is_return); | ||
1038 | if (ret) | ||
1039 | kfree(dprm); | ||
1040 | else { | ||
1041 | f->fn = t->fetch[FETCH_MTD_deref]; | ||
1042 | f->data = (void *)dprm; | ||
1043 | } | ||
1044 | } | ||
1045 | break; | ||
1046 | } | ||
1047 | if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ | ||
1048 | pr_info("%s type has no corresponding fetch method.\n", | ||
1049 | t->name); | ||
1050 | ret = -EINVAL; | ||
1051 | } | ||
1052 | return ret; | ||
1053 | } | ||
1054 | |||
1055 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
1056 | |||
1057 | /* Bitfield type needs to be parsed into a fetch function */ | ||
1058 | static int __parse_bitfield_probe_arg(const char *bf, | ||
1059 | const struct fetch_type *t, | ||
1060 | struct fetch_param *f) | ||
1061 | { | ||
1062 | struct bitfield_fetch_param *bprm; | ||
1063 | unsigned long bw, bo; | ||
1064 | char *tail; | ||
1065 | |||
1066 | if (*bf != 'b') | ||
1067 | return 0; | ||
1068 | |||
1069 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
1070 | if (!bprm) | ||
1071 | return -ENOMEM; | ||
1072 | bprm->orig = *f; | ||
1073 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
1074 | f->data = (void *)bprm; | ||
1075 | |||
1076 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
1077 | if (bw == 0 || *tail != '@') | ||
1078 | return -EINVAL; | ||
1079 | |||
1080 | bf = tail + 1; | ||
1081 | bo = simple_strtoul(bf, &tail, 0); | ||
1082 | if (tail == bf || *tail != '/') | ||
1083 | return -EINVAL; | ||
1084 | |||
1085 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
1086 | bprm->low_shift = bprm->hi_shift + bo; | ||
1087 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
1088 | } | ||
1089 | |||
1090 | /* String length checking wrapper */ | ||
1091 | static int parse_probe_arg(char *arg, struct trace_probe *tp, | ||
1092 | struct probe_arg *parg, int is_return) | ||
1093 | { | ||
1094 | const char *t; | ||
1095 | int ret; | ||
1096 | |||
1097 | if (strlen(arg) > MAX_ARGSTR_LEN) { | ||
1098 | pr_info("Argument is too long.: %s\n", arg); | ||
1099 | return -ENOSPC; | ||
1100 | } | ||
1101 | parg->comm = kstrdup(arg, GFP_KERNEL); | ||
1102 | if (!parg->comm) { | ||
1103 | pr_info("Failed to allocate memory for command '%s'.\n", arg); | ||
1104 | return -ENOMEM; | ||
1105 | } | ||
1106 | t = strchr(parg->comm, ':'); | ||
1107 | if (t) { | ||
1108 | arg[t - parg->comm] = '\0'; | ||
1109 | t++; | ||
1110 | } | ||
1111 | parg->type = find_fetch_type(t); | ||
1112 | if (!parg->type) { | ||
1113 | pr_info("Unsupported type: %s\n", t); | ||
1114 | return -EINVAL; | ||
1115 | } | ||
1116 | parg->offset = tp->size; | ||
1117 | tp->size += parg->type->size; | ||
1118 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | ||
1119 | if (ret >= 0 && t != NULL) | ||
1120 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
1121 | if (ret >= 0) { | ||
1122 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
1123 | parg->fetch.fn); | ||
1124 | parg->fetch_size.data = parg->fetch.data; | ||
1125 | } | ||
1126 | return ret; | ||
1127 | } | ||
1128 | |||
1129 | /* Return 1 if name is reserved or already used by another argument */ | ||
1130 | static int conflict_field_name(const char *name, | ||
1131 | struct probe_arg *args, int narg) | ||
1132 | { | ||
1133 | int i; | ||
1134 | for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) | ||
1135 | if (strcmp(reserved_field_names[i], name) == 0) | ||
1136 | return 1; | ||
1137 | for (i = 0; i < narg; i++) | ||
1138 | if (strcmp(args[i].name, name) == 0) | ||
1139 | return 1; | ||
1140 | return 0; | ||
1141 | } | ||
1142 | |||
1143 | static int create_trace_probe(int argc, char **argv) | 356 | static int create_trace_probe(int argc, char **argv) |
1144 | { | 357 | { |
1145 | /* | 358 | /* |
@@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv) | |||
1162 | */ | 375 | */ |
1163 | struct trace_probe *tp; | 376 | struct trace_probe *tp; |
1164 | int i, ret = 0; | 377 | int i, ret = 0; |
1165 | int is_return = 0, is_delete = 0; | 378 | bool is_return = false, is_delete = false; |
1166 | char *symbol = NULL, *event = NULL, *group = NULL; | 379 | char *symbol = NULL, *event = NULL, *group = NULL; |
1167 | char *arg; | 380 | char *arg; |
1168 | unsigned long offset = 0; | 381 | unsigned long offset = 0; |
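The bitfield support removed above (b<bit-width>@<bit-offset>/<container-size> in the argument syntax) is parsed into a pair of shifts: shifting the container left by hi_shift and back right by low_shift isolates the requested bits. A stand-alone demonstration of the same arithmetic for a hypothetical b4@8/32 argument:

#include <stdio.h>
#include <stdint.h>

#define BYTES_TO_BITS(nb) (8 * (nb))

int main(void)
{
    uint32_t raw = 0x00000a00;   /* field value 0xa at bit offset 8 */
    unsigned int bw = 4, bo = 8; /* b4@8/32 in the probe syntax */
    unsigned char hi_shift, low_shift;
    uint32_t buf = raw;

    /* Same arithmetic as the removed __parse_bitfield_probe_arg(). */
    hi_shift  = BYTES_TO_BITS(sizeof(uint32_t)) - (bw + bo);
    low_shift = hi_shift + bo;

    /* Same extraction as the bitfield fetch function. */
    buf <<= hi_shift;
    buf >>= low_shift;

    printf("extracted bitfield: 0x%x\n", (unsigned int)buf); /* prints 0xa */
    return 0;
}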
@@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv) | |||
1171 | 384 | ||
1172 | /* argc must be >= 1 */ | 385 | /* argc must be >= 1 */ |
1173 | if (argv[0][0] == 'p') | 386 | if (argv[0][0] == 'p') |
1174 | is_return = 0; | 387 | is_return = false; |
1175 | else if (argv[0][0] == 'r') | 388 | else if (argv[0][0] == 'r') |
1176 | is_return = 1; | 389 | is_return = true; |
1177 | else if (argv[0][0] == '-') | 390 | else if (argv[0][0] == '-') |
1178 | is_delete = 1; | 391 | is_delete = true; |
1179 | else { | 392 | else { |
1180 | pr_info("Probe definition must be started with 'p', 'r' or" | 393 | pr_info("Probe definition must be started with 'p', 'r' or" |
1181 | " '-'.\n"); | 394 | " '-'.\n"); |
@@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv) | |||
1240 | /* a symbol specified */ | 453 | /* a symbol specified */ |
1241 | symbol = argv[1]; | 454 | symbol = argv[1]; |
1242 | /* TODO: support .init module functions */ | 455 | /* TODO: support .init module functions */ |
1243 | ret = split_symbol_offset(symbol, &offset); | 456 | ret = traceprobe_split_symbol_offset(symbol, &offset); |
1244 | if (ret) { | 457 | if (ret) { |
1245 | pr_info("Failed to parse symbol.\n"); | 458 | pr_info("Failed to parse symbol.\n"); |
1246 | return ret; | 459 | return ret; |
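traceprobe_split_symbol_offset() is the shared replacement for the removed split_symbol_offset(): it cuts a SYMBOL+offset spec at the '+' and parses the numeric part. A user-space equivalent, with the error handling of the original omitted for brevity:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* User-space rework of the removed split_symbol_offset(). */
static void split_symbol_offset(char *symbol, unsigned long *offset)
{
    char *tmp = strchr(symbol, '+');

    if (tmp) {
        /* Skip the '+' itself, then parse the offset (base 0: 0x.. ok). */
        *offset = strtoul(tmp + 1, NULL, 0);
        *tmp = '\0';
    } else {
        *offset = 0;
    }
}

int main(void)
{
    char spec[] = "do_sys_open+0x10";
    unsigned long offs;

    split_symbol_offset(spec, &offs);
    printf("symbol=%s offset=%#lx\n", spec, offs);
    return 0;
}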
@@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv) | |||
1302 | goto error; | 515 | goto error; |
1303 | } | 516 | } |
1304 | 517 | ||
1305 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { | 518 | if (traceprobe_conflict_field_name(tp->args[i].name, |
519 | tp->args, i)) { | ||
1306 | pr_info("Argument[%d] name '%s' conflicts with " | 520 | pr_info("Argument[%d] name '%s' conflicts with " |
1307 | "another field.\n", i, argv[i]); | 521 | "another field.\n", i, argv[i]); |
1308 | ret = -EINVAL; | 522 | ret = -EINVAL; |
@@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv) | |||
1310 | } | 524 | } |
1311 | 525 | ||
1312 | /* Parse fetch argument */ | 526 | /* Parse fetch argument */ |
1313 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); | 527 | ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], |
528 | is_return, true); | ||
1314 | if (ret) { | 529 | if (ret) { |
1315 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 530 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
1316 | goto error; | 531 | goto error; |
@@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file) | |||
1412 | return seq_open(file, &probes_seq_op); | 627 | return seq_open(file, &probes_seq_op); |
1413 | } | 628 | } |
1414 | 629 | ||
1415 | static int command_trace_probe(const char *buf) | ||
1416 | { | ||
1417 | char **argv; | ||
1418 | int argc = 0, ret = 0; | ||
1419 | |||
1420 | argv = argv_split(GFP_KERNEL, buf, &argc); | ||
1421 | if (!argv) | ||
1422 | return -ENOMEM; | ||
1423 | |||
1424 | if (argc) | ||
1425 | ret = create_trace_probe(argc, argv); | ||
1426 | |||
1427 | argv_free(argv); | ||
1428 | return ret; | ||
1429 | } | ||
1430 | |||
1431 | #define WRITE_BUFSIZE 4096 | ||
1432 | |||
1433 | static ssize_t probes_write(struct file *file, const char __user *buffer, | 630 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
1434 | size_t count, loff_t *ppos) | 631 | size_t count, loff_t *ppos) |
1435 | { | 632 | { |
1436 | char *kbuf, *tmp; | 633 | return traceprobe_probes_write(file, buffer, count, ppos, |
1437 | int ret; | 634 | create_trace_probe); |
1438 | size_t done; | ||
1439 | size_t size; | ||
1440 | |||
1441 | kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | ||
1442 | if (!kbuf) | ||
1443 | return -ENOMEM; | ||
1444 | |||
1445 | ret = done = 0; | ||
1446 | while (done < count) { | ||
1447 | size = count - done; | ||
1448 | if (size >= WRITE_BUFSIZE) | ||
1449 | size = WRITE_BUFSIZE - 1; | ||
1450 | if (copy_from_user(kbuf, buffer + done, size)) { | ||
1451 | ret = -EFAULT; | ||
1452 | goto out; | ||
1453 | } | ||
1454 | kbuf[size] = '\0'; | ||
1455 | tmp = strchr(kbuf, '\n'); | ||
1456 | if (tmp) { | ||
1457 | *tmp = '\0'; | ||
1458 | size = tmp - kbuf + 1; | ||
1459 | } else if (done + size < count) { | ||
1460 | pr_warning("Line length is too long: " | ||
1461 | "Should be less than %d.", WRITE_BUFSIZE); | ||
1462 | ret = -EINVAL; | ||
1463 | goto out; | ||
1464 | } | ||
1465 | done += size; | ||
1466 | /* Remove comments */ | ||
1467 | tmp = strchr(kbuf, '#'); | ||
1468 | if (tmp) | ||
1469 | *tmp = '\0'; | ||
1470 | |||
1471 | ret = command_trace_probe(kbuf); | ||
1472 | if (ret) | ||
1473 | goto out; | ||
1474 | } | ||
1475 | ret = done; | ||
1476 | out: | ||
1477 | kfree(kbuf); | ||
1478 | return ret; | ||
1479 | } | 635 | } |
1480 | 636 | ||
1481 | static const struct file_operations kprobe_events_ops = { | 637 | static const struct file_operations kprobe_events_ops = { |
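The write path removed above (chunked copy from user space, one command per line, '#' starting a comment) is now provided by traceprobe_probes_write() so it can be shared. A stand-alone sketch of the same per-line, comment-stripping dispatch over an in-memory buffer, with a stub in place of create_trace_probe():

#include <stdio.h>
#include <string.h>

/* Stand-in for create_trace_probe(); just echoes the command it was given. */
static int demo_command(const char *cmd)
{
    if (*cmd)
        printf("would create probe: \"%s\"\n", cmd);
    return 0;
}

/* Same splitting rules as the removed probes_write(): one command per
 * line, and everything after '#' on a line is dropped as a comment. */
static void demo_probes_write(char *buf)
{
    char *line = buf;

    while (line) {
        char *next = strchr(line, '\n');
        char *comment;

        if (next)
            *next++ = '\0';
        comment = strchr(line, '#');
        if (comment)
            *comment = '\0';
        demo_command(line);
        line = next;
    }
}

int main(void)
{
    char input[] =
        "p:myopen do_sys_open\n"
        "# this whole line is a comment\n"
        "r:myret do_sys_open $retval\n";

    demo_probes_write(input);
    return 0;
}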
@@ -1711,16 +867,6 @@ partial: | |||
1711 | return TRACE_TYPE_PARTIAL_LINE; | 867 | return TRACE_TYPE_PARTIAL_LINE; |
1712 | } | 868 | } |
1713 | 869 | ||
1714 | #undef DEFINE_FIELD | ||
1715 | #define DEFINE_FIELD(type, item, name, is_signed) \ | ||
1716 | do { \ | ||
1717 | ret = trace_define_field(event_call, #type, name, \ | ||
1718 | offsetof(typeof(field), item), \ | ||
1719 | sizeof(field.item), is_signed, \ | ||
1720 | FILTER_OTHER); \ | ||
1721 | if (ret) \ | ||
1722 | return ret; \ | ||
1723 | } while (0) | ||
1724 | 870 | ||
1725 | static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | 871 | static int kprobe_event_define_fields(struct ftrace_event_call *event_call) |
1726 | { | 872 | { |
@@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2051 | 1197 | ||
2052 | pr_info("Testing kprobe tracing: "); | 1198 | pr_info("Testing kprobe tracing: "); |
2053 | 1199 | ||
2054 | ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " | 1200 | ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " |
2055 | "$stack $stack0 +0($stack)"); | 1201 | "$stack $stack0 +0($stack)", |
1202 | create_trace_probe); | ||
2056 | if (WARN_ON_ONCE(ret)) { | 1203 | if (WARN_ON_ONCE(ret)) { |
2057 | pr_warning("error on probing function entry.\n"); | 1204 | pr_warning("error on probing function entry.\n"); |
2058 | warn++; | 1205 | warn++; |
@@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2066 | enable_trace_probe(tp, TP_FLAG_TRACE); | 1213 | enable_trace_probe(tp, TP_FLAG_TRACE); |
2067 | } | 1214 | } |
2068 | 1215 | ||
2069 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " | 1216 | ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " |
2070 | "$retval"); | 1217 | "$retval", create_trace_probe); |
2071 | if (WARN_ON_ONCE(ret)) { | 1218 | if (WARN_ON_ONCE(ret)) { |
2072 | pr_warning("error on probing function return.\n"); | 1219 | pr_warning("error on probing function return.\n"); |
2073 | warn++; | 1220 | warn++; |
@@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2101 | } else | 1248 | } else |
2102 | disable_trace_probe(tp, TP_FLAG_TRACE); | 1249 | disable_trace_probe(tp, TP_FLAG_TRACE); |
2103 | 1250 | ||
2104 | ret = command_trace_probe("-:testprobe"); | 1251 | ret = traceprobe_command("-:testprobe", create_trace_probe); |
2105 | if (WARN_ON_ONCE(ret)) { | 1252 | if (WARN_ON_ONCE(ret)) { |
2106 | pr_warning("error on deleting a probe.\n"); | 1253 | pr_warning("error on deleting a probe.\n"); |
2107 | warn++; | 1254 | warn++; |
2108 | } | 1255 | } |
2109 | 1256 | ||
2110 | ret = command_trace_probe("-:testprobe2"); | 1257 | ret = traceprobe_command("-:testprobe2", create_trace_probe); |
2111 | if (WARN_ON_ONCE(ret)) { | 1258 | if (WARN_ON_ONCE(ret)) { |
2112 | pr_warning("error on deleting a probe.\n"); | 1259 | pr_warning("error on deleting a probe.\n"); |
2113 | warn++; | 1260 | warn++; |
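The selftest drives create_trace_probe() through traceprobe_command(); from user space the same probe definitions go through the kprobe_events file. A hedged sketch that defines and enables an entry probe and a return probe; it assumes debugfs mounted at /sys/kernel/debug, root privileges, and uses do_sys_open purely as an example symbol:

#include <stdio.h>
#include <string.h>
#include <errno.h>

static int write_str(const char *path, const char *mode, const char *str)
{
    FILE *f = fopen(path, mode);

    if (!f) {
        fprintf(stderr, "%s: %s\n", path, strerror(errno));
        return -1;
    }
    fputs(str, f);
    fclose(f);
    return 0;
}

int main(void)
{
    /* Append, so an O_TRUNC open does not wipe any existing probes. */
    write_str("/sys/kernel/debug/tracing/kprobe_events", "a",
              "p:myprobe do_sys_open $stack\n");
    write_str("/sys/kernel/debug/tracing/kprobe_events", "a",
              "r:myretprobe do_sys_open $retval\n");

    /* Enable both; output then appears in .../tracing/trace. */
    write_str("/sys/kernel/debug/tracing/events/kprobes/myprobe/enable",
              "w", "1\n");
    write_str("/sys/kernel/debug/tracing/events/kprobes/myretprobe/enable",
              "w", "1\n");
    return 0;
}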
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f9..a9077c1b4ad3 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | 52 | char *fmt; |
53 | 53 | ||
54 | /* allocate the trace_printk per cpu buffers */ | ||
55 | if (start != end) | ||
56 | trace_printk_init_buffers(); | ||
57 | |||
54 | mutex_lock(&btrace_mutex); | 58 | mutex_lock(&btrace_mutex); |
55 | for (iter = start; iter < end; iter++) { | 59 | for (iter = start; iter < end; iter++) { |
56 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); | 60 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 000000000000..daa9980153af --- /dev/null +++ b/kernel/trace/trace_probe.c | |||
@@ -0,0 +1,839 @@ | |||
1 | /* | ||
2 | * Common code for probe-based Dynamic events. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * This code was copied from kernel/trace/trace_kprobe.c written by | ||
18 | * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> | ||
19 | * | ||
20 | * Updates to make this generic: | ||
21 | * Copyright (C) IBM Corporation, 2010-2011 | ||
22 | * Author: Srikar Dronamraju | ||
23 | */ | ||
24 | |||
25 | #include "trace_probe.h" | ||
26 | |||
27 | const char *reserved_field_names[] = { | ||
28 | "common_type", | ||
29 | "common_flags", | ||
30 | "common_preempt_count", | ||
31 | "common_pid", | ||
32 | "common_tgid", | ||
33 | FIELD_STRING_IP, | ||
34 | FIELD_STRING_RETIP, | ||
35 | FIELD_STRING_FUNC, | ||
36 | }; | ||
37 | |||
38 | /* Printing function type */ | ||
39 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
40 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
41 | |||
42 | /* Printing in basic type function template */ | ||
43 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | ||
44 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | ||
45 | const char *name, \ | ||
46 | void *data, void *ent)\ | ||
47 | { \ | ||
48 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | ||
49 | } \ | ||
50 | static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | ||
51 | |||
52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) | ||
53 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) | ||
54 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) | ||
55 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) | ||
56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) | ||
57 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | ||
58 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | ||
59 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | ||
60 | |||
61 | static inline void *get_rloc_data(u32 *dl) | ||
62 | { | ||
63 | return (u8 *)dl + get_rloc_offs(*dl); | ||
64 | } | ||
65 | |||
66 | /* For data_loc conversion */ | ||
67 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
68 | { | ||
69 | return (u8 *)ent + get_rloc_offs(*dl); | ||
70 | } | ||
71 | |||
72 | /* For defining macros, define string/string_size types */ | ||
73 | typedef u32 string; | ||
74 | typedef u32 string_size; | ||
75 | |||
76 | /* Print type function for string type */ | ||
77 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
78 | const char *name, | ||
79 | void *data, void *ent) | ||
80 | { | ||
81 | int len = *(u32 *)data >> 16; | ||
82 | |||
83 | if (!len) | ||
84 | return trace_seq_printf(s, " %s=(fault)", name); | ||
85 | else | ||
86 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
87 | (const char *)get_loc_data(data, ent)); | ||
88 | } | ||
89 | |||
90 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
91 | |||
92 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
93 | /* | ||
94 | * Define macro for basic types - we don't need to define s* types, because | ||
95 | * we have to care only about bitwidth at recording time. | ||
96 | */ | ||
97 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
98 | DEFINE_FETCH_##method(u8) \ | ||
99 | DEFINE_FETCH_##method(u16) \ | ||
100 | DEFINE_FETCH_##method(u32) \ | ||
101 | DEFINE_FETCH_##method(u64) | ||
102 | |||
103 | #define CHECK_FETCH_FUNCS(method, fn) \ | ||
104 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ | ||
105 | (FETCH_FUNC_NAME(method, u16) == fn) || \ | ||
106 | (FETCH_FUNC_NAME(method, u32) == fn) || \ | ||
107 | (FETCH_FUNC_NAME(method, u64) == fn) || \ | ||
108 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
109 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
110 | && (fn != NULL)) | ||
111 | |||
112 | /* Data fetch function templates */ | ||
113 | #define DEFINE_FETCH_reg(type) \ | ||
114 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | ||
115 | void *offset, void *dest) \ | ||
116 | { \ | ||
117 | *(type *)dest = (type)regs_get_register(regs, \ | ||
118 | (unsigned int)((unsigned long)offset)); \ | ||
119 | } | ||
120 | DEFINE_BASIC_FETCH_FUNCS(reg) | ||
121 | /* No string on the register */ | ||
122 | #define fetch_reg_string NULL | ||
123 | #define fetch_reg_string_size NULL | ||
124 | |||
125 | #define DEFINE_FETCH_stack(type) \ | ||
126 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
127 | void *offset, void *dest) \ | ||
128 | { \ | ||
129 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
130 | (unsigned int)((unsigned long)offset)); \ | ||
131 | } | ||
132 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
133 | /* No string on the stack entry */ | ||
134 | #define fetch_stack_string NULL | ||
135 | #define fetch_stack_string_size NULL | ||
136 | |||
137 | #define DEFINE_FETCH_retval(type) \ | ||
138 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | ||
139 | void *dummy, void *dest) \ | ||
140 | { \ | ||
141 | *(type *)dest = (type)regs_return_value(regs); \ | ||
142 | } | ||
143 | DEFINE_BASIC_FETCH_FUNCS(retval) | ||
144 | /* No string on the retval */ | ||
145 | #define fetch_retval_string NULL | ||
146 | #define fetch_retval_string_size NULL | ||
147 | |||
148 | #define DEFINE_FETCH_memory(type) \ | ||
149 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
150 | void *addr, void *dest) \ | ||
151 | { \ | ||
152 | type retval; \ | ||
153 | if (probe_kernel_address(addr, retval)) \ | ||
154 | *(type *)dest = 0; \ | ||
155 | else \ | ||
156 | *(type *)dest = retval; \ | ||
157 | } | ||
158 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
159 | /* | ||
160 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
161 | * length and relative data location. | ||
162 | */ | ||
163 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
164 | void *addr, void *dest) | ||
165 | { | ||
166 | long ret; | ||
167 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
168 | u8 *dst = get_rloc_data(dest); | ||
169 | u8 *src = addr; | ||
170 | mm_segment_t old_fs = get_fs(); | ||
171 | |||
172 | if (!maxlen) | ||
173 | return; | ||
174 | |||
175 | /* | ||
176 | * Try to get string again, since the string can be changed while | ||
177 | * probing. | ||
178 | */ | ||
179 | set_fs(KERNEL_DS); | ||
180 | pagefault_disable(); | ||
181 | |||
182 | do | ||
183 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
184 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
185 | |||
186 | dst[-1] = '\0'; | ||
187 | pagefault_enable(); | ||
188 | set_fs(old_fs); | ||
189 | |||
190 | if (ret < 0) { /* Failed to fetch string */ | ||
191 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
192 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
193 | } else { | ||
194 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
195 | get_rloc_offs(*(u32 *)dest)); | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* Return the length of the string -- including the terminating null byte */ ||
200 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
201 | void *addr, void *dest) | ||
202 | { | ||
203 | mm_segment_t old_fs; | ||
204 | int ret, len = 0; | ||
205 | u8 c; | ||
206 | |||
207 | old_fs = get_fs(); | ||
208 | set_fs(KERNEL_DS); | ||
209 | pagefault_disable(); | ||
210 | |||
211 | do { | ||
212 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
213 | len++; | ||
214 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
215 | |||
216 | pagefault_enable(); | ||
217 | set_fs(old_fs); | ||
218 | |||
219 | if (ret < 0) /* Failed to check the length */ | ||
220 | *(u32 *)dest = 0; | ||
221 | else | ||
222 | *(u32 *)dest = len; | ||
223 | } | ||
224 | |||
225 | /* Memory fetching by symbol */ | ||
226 | struct symbol_cache { | ||
227 | char *symbol; | ||
228 | long offset; | ||
229 | unsigned long addr; | ||
230 | }; | ||
231 | |||
232 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
233 | { | ||
234 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
235 | |||
236 | if (sc->addr) | ||
237 | sc->addr += sc->offset; | ||
238 | |||
239 | return sc->addr; | ||
240 | } | ||
241 | |||
242 | static void free_symbol_cache(struct symbol_cache *sc) | ||
243 | { | ||
244 | kfree(sc->symbol); | ||
245 | kfree(sc); | ||
246 | } | ||
247 | |||
248 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
249 | { | ||
250 | struct symbol_cache *sc; | ||
251 | |||
252 | if (!sym || strlen(sym) == 0) | ||
253 | return NULL; | ||
254 | |||
255 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
256 | if (!sc) | ||
257 | return NULL; | ||
258 | |||
259 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
260 | if (!sc->symbol) { | ||
261 | kfree(sc); | ||
262 | return NULL; | ||
263 | } | ||
264 | sc->offset = offset; | ||
265 | update_symbol_cache(sc); | ||
266 | |||
267 | return sc; | ||
268 | } | ||
269 | |||
270 | #define DEFINE_FETCH_symbol(type) \ | ||
271 | static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | ||
272 | void *data, void *dest) \ | ||
273 | { \ | ||
274 | struct symbol_cache *sc = data; \ | ||
275 | if (sc->addr) \ | ||
276 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
277 | else \ | ||
278 | *(type *)dest = 0; \ | ||
279 | } | ||
280 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
281 | DEFINE_FETCH_symbol(string) | ||
282 | DEFINE_FETCH_symbol(string_size) | ||
283 | |||
284 | /* Dereference memory access function */ | ||
285 | struct deref_fetch_param { | ||
286 | struct fetch_param orig; | ||
287 | long offset; | ||
288 | }; | ||
289 | |||
290 | #define DEFINE_FETCH_deref(type) \ | ||
291 | static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | ||
292 | void *data, void *dest) \ | ||
293 | { \ | ||
294 | struct deref_fetch_param *dprm = data; \ | ||
295 | unsigned long addr; \ | ||
296 | call_fetch(&dprm->orig, regs, &addr); \ | ||
297 | if (addr) { \ | ||
298 | addr += dprm->offset; \ | ||
299 | fetch_memory_##type(regs, (void *)addr, dest); \ | ||
300 | } else \ | ||
301 | *(type *)dest = 0; \ | ||
302 | } | ||
303 | DEFINE_BASIC_FETCH_FUNCS(deref) | ||
304 | DEFINE_FETCH_deref(string) | ||
305 | DEFINE_FETCH_deref(string_size) | ||
306 | |||
307 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
308 | { | ||
309 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
310 | update_deref_fetch_param(data->orig.data); | ||
311 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
312 | update_symbol_cache(data->orig.data); | ||
313 | } | ||
314 | |||
315 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | ||
316 | { | ||
317 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
318 | free_deref_fetch_param(data->orig.data); | ||
319 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
320 | free_symbol_cache(data->orig.data); | ||
321 | kfree(data); | ||
322 | } | ||
323 | |||
324 | /* Bitfield fetch function */ | ||
325 | struct bitfield_fetch_param { | ||
326 | struct fetch_param orig; | ||
327 | unsigned char hi_shift; | ||
328 | unsigned char low_shift; | ||
329 | }; | ||
330 | |||
331 | #define DEFINE_FETCH_bitfield(type) \ | ||
332 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
333 | void *data, void *dest) \ | ||
334 | { \ | ||
335 | struct bitfield_fetch_param *bprm = data; \ | ||
336 | type buf = 0; \ | ||
337 | call_fetch(&bprm->orig, regs, &buf); \ | ||
338 | if (buf) { \ | ||
339 | buf <<= bprm->hi_shift; \ | ||
340 | buf >>= bprm->low_shift; \ | ||
341 | } \ | ||
342 | *(type *)dest = buf; \ | ||
343 | } | ||
344 | |||
345 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
346 | #define fetch_bitfield_string NULL | ||
347 | #define fetch_bitfield_string_size NULL | ||
348 | |||
349 | static __kprobes void | ||
350 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
351 | { | ||
352 | /* | ||
353 | * Don't check the bitfield itself, because this must be the | ||
354 | * last fetch function. | ||
355 | */ | ||
356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
357 | update_deref_fetch_param(data->orig.data); | ||
358 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
359 | update_symbol_cache(data->orig.data); | ||
360 | } | ||
361 | |||
362 | static __kprobes void | ||
363 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
364 | { | ||
365 | /* | ||
366 | * Don't check the bitfield itself, because this must be the | ||
367 | * last fetch function. | ||
368 | */ | ||
369 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
370 | free_deref_fetch_param(data->orig.data); | ||
371 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
372 | free_symbol_cache(data->orig.data); | ||
373 | |||
374 | kfree(data); | ||
375 | } | ||
376 | |||
377 | /* Default (unsigned long) fetch type */ | ||
378 | #define __DEFAULT_FETCH_TYPE(t) u##t | ||
379 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
380 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
381 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
382 | |||
383 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
384 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
385 | |||
386 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
387 | {.name = _name, \ | ||
388 | .size = _size, \ | ||
389 | .is_signed = sign, \ | ||
390 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
391 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
392 | .fmttype = _fmttype, \ | ||
393 | .fetch = { \ | ||
394 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
395 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
396 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
397 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
398 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
399 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
400 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
401 | } \ | ||
402 | } | ||
403 | |||
404 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
405 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
406 | |||
407 | #define FETCH_TYPE_STRING 0 | ||
408 | #define FETCH_TYPE_STRSIZE 1 | ||
409 | |||
410 | /* Fetch type information table */ | ||
411 | static const struct fetch_type fetch_type_table[] = { | ||
412 | /* Special types */ | ||
413 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
414 | sizeof(u32), 1, "__data_loc char[]"), | ||
415 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
416 | string_size, sizeof(u32), 0, "u32"), | ||
417 | /* Basic types */ | ||
418 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
419 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
420 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
421 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
422 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
423 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
424 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
425 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
426 | }; | ||
427 | |||
428 | static const struct fetch_type *find_fetch_type(const char *type) | ||
429 | { | ||
430 | int i; | ||
431 | |||
432 | if (!type) | ||
433 | type = DEFAULT_FETCH_TYPE_STR; | ||
434 | |||
435 | /* Special case: bitfield */ | ||
436 | if (*type == 'b') { | ||
437 | unsigned long bs; | ||
438 | |||
439 | type = strchr(type, '/'); | ||
440 | if (!type) | ||
441 | goto fail; | ||
442 | |||
443 | type++; | ||
444 | if (strict_strtoul(type, 0, &bs)) | ||
445 | goto fail; | ||
446 | |||
447 | switch (bs) { | ||
448 | case 8: | ||
449 | return find_fetch_type("u8"); | ||
450 | case 16: | ||
451 | return find_fetch_type("u16"); | ||
452 | case 32: | ||
453 | return find_fetch_type("u32"); | ||
454 | case 64: | ||
455 | return find_fetch_type("u64"); | ||
456 | default: | ||
457 | goto fail; | ||
458 | } | ||
459 | } | ||
460 | |||
461 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | ||
462 | if (strcmp(type, fetch_type_table[i].name) == 0) | ||
463 | return &fetch_type_table[i]; | ||
464 | |||
465 | fail: | ||
466 | return NULL; | ||
467 | } | ||
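/*
 * Illustrative lookups (the strings are only examples): find_fetch_type("s16")
 * matches a basic table entry, find_fetch_type(NULL) falls back to
 * DEFAULT_FETCH_TYPE_STR ("u32" or "u64" depending on BITS_PER_LONG), and a
 * bitfield spec such as "b4@8/32" is resolved via its container size to "u32".
 */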
468 | |||
469 | /* Special function: accepts only unsigned long */ | ||
470 | static __kprobes void fetch_stack_address(struct pt_regs *regs, | ||
471 | void *dummy, void *dest) | ||
472 | { | ||
473 | *(unsigned long *)dest = kernel_stack_pointer(regs); | ||
474 | } | ||
475 | |||
476 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
477 | fetch_func_t orig_fn) | ||
478 | { | ||
479 | int i; | ||
480 | |||
481 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
482 | return NULL; /* Only string type needs size function */ | ||
483 | |||
484 | for (i = 0; i < FETCH_MTD_END; i++) | ||
485 | if (type->fetch[i] == orig_fn) | ||
486 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
487 | |||
488 | WARN_ON(1); /* This should not happen */ | ||
489 | |||
490 | return NULL; | ||
491 | } | ||
492 | |||
493 | /* Split symbol and offset. */ | ||
494 | int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) | ||
495 | { | ||
496 | char *tmp; | ||
497 | int ret; | ||
498 | |||
499 | if (!offset) | ||
500 | return -EINVAL; | ||
501 | |||
502 | tmp = strchr(symbol, '+'); | ||
503 | if (tmp) { | ||
504 | /* skip the sign because strict_strtoul() doesn't accept '+' */ | ||
505 | ret = strict_strtoul(tmp + 1, 0, offset); | ||
506 | if (ret) | ||
507 | return ret; | ||
508 | |||
509 | *tmp = '\0'; | ||
510 | } else | ||
511 | *offset = 0; | ||
512 | |||
513 | return 0; | ||
514 | } | ||
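/*
 * Worked example (the symbol name is illustrative): "vfs_read+0x10" is split
 * into the symbol "vfs_read" with *offset = 0x10, while a bare "vfs_read"
 * leaves *offset at 0.
 */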
515 | |||
516 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | ||
517 | |||
518 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | ||
519 | struct fetch_param *f, bool is_return) | ||
520 | { | ||
521 | int ret = 0; | ||
522 | unsigned long param; | ||
523 | |||
524 | if (strcmp(arg, "retval") == 0) { | ||
525 | if (is_return) | ||
526 | f->fn = t->fetch[FETCH_MTD_retval]; | ||
527 | else | ||
528 | ret = -EINVAL; | ||
529 | } else if (strncmp(arg, "stack", 5) == 0) { | ||
530 | if (arg[5] == '\0') { | ||
531 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) | ||
532 | f->fn = fetch_stack_address; | ||
533 | else | ||
534 | ret = -EINVAL; | ||
535 | } else if (isdigit(arg[5])) { | ||
536 | ret = strict_strtoul(arg + 5, 10, ¶m); | ||
537 | if (ret || param > PARAM_MAX_STACK) | ||
538 | ret = -EINVAL; | ||
539 | else { | ||
540 | f->fn = t->fetch[FETCH_MTD_stack]; | ||
541 | f->data = (void *)param; | ||
542 | } | ||
543 | } else | ||
544 | ret = -EINVAL; | ||
545 | } else | ||
546 | ret = -EINVAL; | ||
547 | |||
548 | return ret; | ||
549 | } | ||
550 | |||
551 | /* Recursive argument parser */ | ||
552 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | ||
553 | struct fetch_param *f, bool is_return, bool is_kprobe) | ||
554 | { | ||
555 | unsigned long param; | ||
556 | long offset; | ||
557 | char *tmp; | ||
558 | int ret; | ||
559 | |||
560 | ret = 0; | ||
561 | |||
562 | /* For now, uprobe_events supports only register arguments */ | ||
563 | if (!is_kprobe && arg[0] != '%') | ||
564 | return -EINVAL; | ||
565 | |||
566 | switch (arg[0]) { | ||
567 | case '$': | ||
568 | ret = parse_probe_vars(arg + 1, t, f, is_return); | ||
569 | break; | ||
570 | |||
571 | case '%': /* named register */ | ||
572 | ret = regs_query_register_offset(arg + 1); | ||
573 | if (ret >= 0) { | ||
574 | f->fn = t->fetch[FETCH_MTD_reg]; | ||
575 | f->data = (void *)(unsigned long)ret; | ||
576 | ret = 0; | ||
577 | } | ||
578 | break; | ||
579 | |||
580 | case '@': /* memory or symbol */ | ||
581 | if (isdigit(arg[1])) { | ||
582 | ret = strict_strtoul(arg + 1, 0, ¶m); | ||
583 | if (ret) | ||
584 | break; | ||
585 | |||
586 | f->fn = t->fetch[FETCH_MTD_memory]; | ||
587 | f->data = (void *)param; | ||
588 | } else { | ||
589 | ret = traceprobe_split_symbol_offset(arg + 1, &offset); | ||
590 | if (ret) | ||
591 | break; | ||
592 | |||
593 | f->data = alloc_symbol_cache(arg + 1, offset); | ||
594 | if (f->data) | ||
595 | f->fn = t->fetch[FETCH_MTD_symbol]; | ||
596 | } | ||
597 | break; | ||
598 | |||
599 | case '+': /* deref memory */ | ||
600 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
601 | case '-': | ||
602 | tmp = strchr(arg, '('); | ||
603 | if (!tmp) | ||
604 | break; | ||
605 | |||
606 | *tmp = '\0'; | ||
607 | ret = strict_strtol(arg, 0, &offset); | ||
608 | |||
609 | if (ret) | ||
610 | break; | ||
611 | |||
612 | arg = tmp + 1; | ||
613 | tmp = strrchr(arg, ')'); | ||
614 | |||
615 | if (tmp) { | ||
616 | struct deref_fetch_param *dprm; | ||
617 | const struct fetch_type *t2; | ||
618 | |||
619 | t2 = find_fetch_type(NULL); | ||
620 | *tmp = '\0'; | ||
621 | dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); | ||
622 | |||
623 | if (!dprm) | ||
624 | return -ENOMEM; | ||
625 | |||
626 | dprm->offset = offset; | ||
627 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | ||
628 | is_kprobe); | ||
629 | if (ret) | ||
630 | kfree(dprm); | ||
631 | else { | ||
632 | f->fn = t->fetch[FETCH_MTD_deref]; | ||
633 | f->data = (void *)dprm; | ||
634 | } | ||
635 | } | ||
636 | break; | ||
637 | } | ||
638 | if (!ret && !f->fn) { /* Parsed, but no fetch method found */ | ||
639 | pr_info("%s type has no corresponding fetch method.\n", t->name); | ||
640 | ret = -EINVAL; | ||
641 | } | ||
642 | |||
643 | return ret; | ||
644 | } | ||
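/*
 * Examples of the argument forms handled above (register names are
 * illustrative and architecture dependent; uprobe_events currently accepts
 * only the %register form):
 *
 *	$retval		- return value (return probes only)
 *	$stack, $stack3	- stack address, or the 3rd entry on the stack
 *	%ax		- the named register
 *	@0xdeadbeef	- memory at the given address
 *	@vfs_read+8	- memory at symbol+offset (kprobes only)
 *	+8(%sp)		- memory at offset 8 from the value fetched by %sp
 */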
645 | |||
646 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
647 | |||
648 | /* Bitfield type needs to be parsed into a fetch function */ | ||
649 | static int __parse_bitfield_probe_arg(const char *bf, | ||
650 | const struct fetch_type *t, | ||
651 | struct fetch_param *f) | ||
652 | { | ||
653 | struct bitfield_fetch_param *bprm; | ||
654 | unsigned long bw, bo; | ||
655 | char *tail; | ||
656 | |||
657 | if (*bf != 'b') | ||
658 | return 0; | ||
659 | |||
660 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
661 | if (!bprm) | ||
662 | return -ENOMEM; | ||
663 | |||
664 | bprm->orig = *f; | ||
665 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
666 | f->data = (void *)bprm; | ||
667 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
668 | |||
669 | if (bw == 0 || *tail != '@') | ||
670 | return -EINVAL; | ||
671 | |||
672 | bf = tail + 1; | ||
673 | bo = simple_strtoul(bf, &tail, 0); | ||
674 | |||
675 | if (tail == bf || *tail != '/') | ||
676 | return -EINVAL; | ||
677 | |||
678 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
679 | bprm->low_shift = bprm->hi_shift + bo; | ||
680 | |||
681 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
682 | } | ||
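/*
 * Worked example (numbers are illustrative): for a spec of "b4@8/32" the
 * container is 32 bits (t->size == 4), bw = 4 and bo = 8, giving
 * hi_shift = 32 - (4 + 8) = 20 and low_shift = 20 + 8 = 28. The bitfield
 * fetch then does buf <<= 20; buf >>= 28, extracting bits 8..11 of the
 * fetched word.
 */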
683 | |||
684 | /* String length checking wrapper */ | ||
685 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | ||
686 | struct probe_arg *parg, bool is_return, bool is_kprobe) | ||
687 | { | ||
688 | const char *t; | ||
689 | int ret; | ||
690 | |||
691 | if (strlen(arg) > MAX_ARGSTR_LEN) { | ||
692 | pr_info("Argument is too long: %s\n", arg); | ||
693 | return -ENOSPC; | ||
694 | } | ||
695 | parg->comm = kstrdup(arg, GFP_KERNEL); | ||
696 | if (!parg->comm) { | ||
697 | pr_info("Failed to allocate memory for command '%s'.\n", arg); | ||
698 | return -ENOMEM; | ||
699 | } | ||
700 | t = strchr(parg->comm, ':'); | ||
701 | if (t) { | ||
702 | arg[t - parg->comm] = '\0'; | ||
703 | t++; | ||
704 | } | ||
705 | parg->type = find_fetch_type(t); | ||
706 | if (!parg->type) { | ||
707 | pr_info("Unsupported type: %s\n", t); | ||
708 | return -EINVAL; | ||
709 | } | ||
710 | parg->offset = *size; | ||
711 | *size += parg->type->size; | ||
712 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | ||
713 | |||
714 | if (ret >= 0 && t != NULL) | ||
715 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
716 | |||
717 | if (ret >= 0) { | ||
718 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
719 | parg->fetch.fn); | ||
720 | parg->fetch_size.data = parg->fetch.data; | ||
721 | } | ||
722 | |||
723 | return ret; | ||
724 | } | ||
725 | |||
726 | /* Return 1 if name is reserved or already used by another argument */ | ||
727 | int traceprobe_conflict_field_name(const char *name, | ||
728 | struct probe_arg *args, int narg) | ||
729 | { | ||
730 | int i; | ||
731 | |||
732 | for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) | ||
733 | if (strcmp(reserved_field_names[i], name) == 0) | ||
734 | return 1; | ||
735 | |||
736 | for (i = 0; i < narg; i++) | ||
737 | if (strcmp(args[i].name, name) == 0) | ||
738 | return 1; | ||
739 | |||
740 | return 0; | ||
741 | } | ||
742 | |||
743 | void traceprobe_update_arg(struct probe_arg *arg) | ||
744 | { | ||
745 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
746 | update_bitfield_fetch_param(arg->fetch.data); | ||
747 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
748 | update_deref_fetch_param(arg->fetch.data); | ||
749 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
750 | update_symbol_cache(arg->fetch.data); | ||
751 | } | ||
752 | |||
753 | void traceprobe_free_probe_arg(struct probe_arg *arg) | ||
754 | { | ||
755 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
756 | free_bitfield_fetch_param(arg->fetch.data); | ||
757 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
758 | free_deref_fetch_param(arg->fetch.data); | ||
759 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
760 | free_symbol_cache(arg->fetch.data); | ||
761 | |||
762 | kfree(arg->name); | ||
763 | kfree(arg->comm); | ||
764 | } | ||
765 | |||
766 | int traceprobe_command(const char *buf, int (*createfn)(int, char **)) | ||
767 | { | ||
768 | char **argv; | ||
769 | int argc, ret; | ||
770 | |||
771 | argc = 0; | ||
772 | ret = 0; | ||
773 | argv = argv_split(GFP_KERNEL, buf, &argc); | ||
774 | if (!argv) | ||
775 | return -ENOMEM; | ||
776 | |||
777 | if (argc) | ||
778 | ret = createfn(argc, argv); | ||
779 | |||
780 | argv_free(argv); | ||
781 | |||
782 | return ret; | ||
783 | } | ||
784 | |||
785 | #define WRITE_BUFSIZE 4096 | ||
786 | |||
787 | ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, | ||
788 | size_t count, loff_t *ppos, | ||
789 | int (*createfn)(int, char **)) | ||
790 | { | ||
791 | char *kbuf, *tmp; | ||
792 | int ret = 0; | ||
793 | size_t done = 0; | ||
794 | size_t size; | ||
795 | |||
796 | kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | ||
797 | if (!kbuf) | ||
798 | return -ENOMEM; | ||
799 | |||
800 | while (done < count) { | ||
801 | size = count - done; | ||
802 | |||
803 | if (size >= WRITE_BUFSIZE) | ||
804 | size = WRITE_BUFSIZE - 1; | ||
805 | |||
806 | if (copy_from_user(kbuf, buffer + done, size)) { | ||
807 | ret = -EFAULT; | ||
808 | goto out; | ||
809 | } | ||
810 | kbuf[size] = '\0'; | ||
811 | tmp = strchr(kbuf, '\n'); | ||
812 | |||
813 | if (tmp) { | ||
814 | *tmp = '\0'; | ||
815 | size = tmp - kbuf + 1; | ||
816 | } else if (done + size < count) { | ||
817 | pr_warning("Line is too long: " | ||
818 | "should be less than %d.", WRITE_BUFSIZE); | ||
819 | ret = -EINVAL; | ||
820 | goto out; | ||
821 | } | ||
822 | done += size; | ||
823 | /* Remove comments */ | ||
824 | tmp = strchr(kbuf, '#'); | ||
825 | |||
826 | if (tmp) | ||
827 | *tmp = '\0'; | ||
828 | |||
829 | ret = traceprobe_command(kbuf, createfn); | ||
830 | if (ret) | ||
831 | goto out; | ||
832 | } | ||
833 | ret = done; | ||
834 | |||
835 | out: | ||
836 | kfree(kbuf); | ||
837 | |||
838 | return ret; | ||
839 | } | ||
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 000000000000..933708677814 --- /dev/null +++ b/kernel/trace/trace_probe.h | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * Common header file for probe-based Dynamic events. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * This code was copied from kernel/trace/trace_kprobe.h written by | ||
18 | * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> | ||
19 | * | ||
20 | * Updates to make this generic: | ||
21 | * Copyright (C) IBM Corporation, 2010-2011 | ||
22 | * Author: Srikar Dronamraju | ||
23 | */ | ||
24 | |||
25 | #include <linux/seq_file.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/smp.h> | ||
28 | #include <linux/debugfs.h> | ||
29 | #include <linux/types.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/ctype.h> | ||
32 | #include <linux/ptrace.h> | ||
33 | #include <linux/perf_event.h> | ||
34 | #include <linux/kprobes.h> | ||
35 | #include <linux/stringify.h> | ||
36 | #include <linux/limits.h> | ||
37 | #include <linux/uaccess.h> | ||
38 | #include <asm/bitsperlong.h> | ||
39 | |||
40 | #include "trace.h" | ||
41 | #include "trace_output.h" | ||
42 | |||
43 | #define MAX_TRACE_ARGS 128 | ||
44 | #define MAX_ARGSTR_LEN 63 | ||
45 | #define MAX_EVENT_NAME_LEN 64 | ||
46 | #define MAX_STRING_SIZE PATH_MAX | ||
47 | |||
48 | /* Reserved field names */ | ||
49 | #define FIELD_STRING_IP "__probe_ip" | ||
50 | #define FIELD_STRING_RETIP "__probe_ret_ip" | ||
51 | #define FIELD_STRING_FUNC "__probe_func" | ||
52 | |||
53 | #undef DEFINE_FIELD | ||
54 | #define DEFINE_FIELD(type, item, name, is_signed) \ | ||
55 | do { \ | ||
56 | ret = trace_define_field(event_call, #type, name, \ | ||
57 | offsetof(typeof(field), item), \ | ||
58 | sizeof(field.item), is_signed, \ | ||
59 | FILTER_OTHER); \ | ||
60 | if (ret) \ | ||
61 | return ret; \ | ||
62 | } while (0) | ||
63 | |||
64 | |||
65 | /* Flags for trace_probe */ | ||
66 | #define TP_FLAG_TRACE 1 | ||
67 | #define TP_FLAG_PROFILE 2 | ||
68 | #define TP_FLAG_REGISTERED 4 | ||
69 | #define TP_FLAG_UPROBE 8 | ||
70 | |||
71 | |||
72 | /* data_rloc: data relative location, compatible with u32 */ | ||
73 | #define make_data_rloc(len, roffs) \ | ||
74 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
75 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
76 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
77 | |||
78 | /* | ||
79 | * Convert data_rloc to data_loc: | ||
80 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
81 | * stores the offset from event entry. | ||
82 | */ | ||
83 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
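/*
 * Worked example (values are illustrative): make_data_rloc(5, 16) packs to
 * 0x00050010, so get_rloc_len() returns 5 and get_rloc_offs() returns 16.
 */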
84 | |||
85 | /* Data fetch function type */ | ||
86 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | ||
87 | /* Printing function type */ | ||
88 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); | ||
89 | |||
90 | /* Fetch types */ | ||
91 | enum { | ||
92 | FETCH_MTD_reg = 0, | ||
93 | FETCH_MTD_stack, | ||
94 | FETCH_MTD_retval, | ||
95 | FETCH_MTD_memory, | ||
96 | FETCH_MTD_symbol, | ||
97 | FETCH_MTD_deref, | ||
98 | FETCH_MTD_bitfield, | ||
99 | FETCH_MTD_END, | ||
100 | }; | ||
101 | |||
102 | /* Fetch type information table */ | ||
103 | struct fetch_type { | ||
104 | const char *name; /* Name of type */ | ||
105 | size_t size; /* Byte size of type */ | ||
106 | int is_signed; /* Signed flag */ | ||
107 | print_type_func_t print; /* Print functions */ | ||
108 | const char *fmt; /* Format string */ | ||
109 | const char *fmttype; /* Name in format file */ | ||
110 | /* Fetch functions */ | ||
111 | fetch_func_t fetch[FETCH_MTD_END]; | ||
112 | }; | ||
113 | |||
114 | struct fetch_param { | ||
115 | fetch_func_t fn; | ||
116 | void *data; | ||
117 | }; | ||
118 | |||
119 | struct probe_arg { | ||
120 | struct fetch_param fetch; | ||
121 | struct fetch_param fetch_size; | ||
122 | unsigned int offset; /* Offset from argument entry */ | ||
123 | const char *name; /* Name of this argument */ | ||
124 | const char *comm; /* Command of this argument */ | ||
125 | const struct fetch_type *type; /* Type of this argument */ | ||
126 | }; | ||
127 | |||
128 | static inline __kprobes void call_fetch(struct fetch_param *fprm, | ||
129 | struct pt_regs *regs, void *dest) | ||
130 | { | ||
131 | return fprm->fn(regs, fprm->data, dest); | ||
132 | } | ||
133 | |||
134 | /* Check the name is good for event/group/fields */ | ||
135 | static inline int is_good_name(const char *name) | ||
136 | { | ||
137 | if (!isalpha(*name) && *name != '_') | ||
138 | return 0; | ||
139 | while (*++name != '\0') { | ||
140 | if (!isalpha(*name) && !isdigit(*name) && *name != '_') | ||
141 | return 0; | ||
142 | } | ||
143 | return 1; | ||
144 | } | ||
145 | |||
146 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | ||
147 | struct probe_arg *parg, bool is_return, bool is_kprobe); | ||
148 | |||
149 | extern int traceprobe_conflict_field_name(const char *name, | ||
150 | struct probe_arg *args, int narg); | ||
151 | |||
152 | extern void traceprobe_update_arg(struct probe_arg *arg); | ||
153 | extern void traceprobe_free_probe_arg(struct probe_arg *arg); | ||
154 | |||
155 | extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); | ||
156 | |||
157 | extern ssize_t traceprobe_probes_write(struct file *file, | ||
158 | const char __user *buffer, size_t count, loff_t *ppos, | ||
159 | int (*createfn)(int, char**)); | ||
160 | |||
161 | extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 000000000000..2b36ac68549e --- /dev/null +++ b/kernel/trace/trace_uprobe.c | |||
@@ -0,0 +1,788 @@ | |||
1 | /* | ||
2 | * uprobes-based tracing events | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * Copyright (C) IBM Corporation, 2010-2012 | ||
18 | * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/uaccess.h> | ||
23 | #include <linux/uprobes.h> | ||
24 | #include <linux/namei.h> | ||
25 | |||
26 | #include "trace_probe.h" | ||
27 | |||
28 | #define UPROBE_EVENT_SYSTEM "uprobes" | ||
29 | |||
30 | /* | ||
31 | * uprobe event core functions | ||
32 | */ | ||
33 | struct trace_uprobe; | ||
34 | struct uprobe_trace_consumer { | ||
35 | struct uprobe_consumer cons; | ||
36 | struct trace_uprobe *tu; | ||
37 | }; | ||
38 | |||
39 | struct trace_uprobe { | ||
40 | struct list_head list; | ||
41 | struct ftrace_event_class class; | ||
42 | struct ftrace_event_call call; | ||
43 | struct uprobe_trace_consumer *consumer; | ||
44 | struct inode *inode; | ||
45 | char *filename; | ||
46 | unsigned long offset; | ||
47 | unsigned long nhit; | ||
48 | unsigned int flags; /* For TP_FLAG_* */ | ||
49 | ssize_t size; /* trace entry size */ | ||
50 | unsigned int nr_args; | ||
51 | struct probe_arg args[]; | ||
52 | }; | ||
53 | |||
54 | #define SIZEOF_TRACE_UPROBE(n) \ | ||
55 | (offsetof(struct trace_uprobe, args) + \ | ||
56 | (sizeof(struct probe_arg) * (n))) | ||
57 | |||
58 | static int register_uprobe_event(struct trace_uprobe *tu); | ||
59 | static void unregister_uprobe_event(struct trace_uprobe *tu); | ||
60 | |||
61 | static DEFINE_MUTEX(uprobe_lock); | ||
62 | static LIST_HEAD(uprobe_list); | ||
63 | |||
64 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); | ||
65 | |||
66 | /* | ||
67 | * Allocate new trace_uprobe and initialize it (including uprobes). | ||
68 | */ | ||
69 | static struct trace_uprobe * | ||
70 | alloc_trace_uprobe(const char *group, const char *event, int nargs) | ||
71 | { | ||
72 | struct trace_uprobe *tu; | ||
73 | |||
74 | if (!event || !is_good_name(event)) | ||
75 | return ERR_PTR(-EINVAL); | ||
76 | |||
77 | if (!group || !is_good_name(group)) | ||
78 | return ERR_PTR(-EINVAL); | ||
79 | |||
80 | tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); | ||
81 | if (!tu) | ||
82 | return ERR_PTR(-ENOMEM); | ||
83 | |||
84 | tu->call.class = &tu->class; | ||
85 | tu->call.name = kstrdup(event, GFP_KERNEL); | ||
86 | if (!tu->call.name) | ||
87 | goto error; | ||
88 | |||
89 | tu->class.system = kstrdup(group, GFP_KERNEL); | ||
90 | if (!tu->class.system) | ||
91 | goto error; | ||
92 | |||
93 | INIT_LIST_HEAD(&tu->list); | ||
94 | return tu; | ||
95 | |||
96 | error: | ||
97 | kfree(tu->call.name); | ||
98 | kfree(tu); | ||
99 | |||
100 | return ERR_PTR(-ENOMEM); | ||
101 | } | ||
102 | |||
103 | static void free_trace_uprobe(struct trace_uprobe *tu) | ||
104 | { | ||
105 | int i; | ||
106 | |||
107 | for (i = 0; i < tu->nr_args; i++) | ||
108 | traceprobe_free_probe_arg(&tu->args[i]); | ||
109 | |||
110 | iput(tu->inode); | ||
111 | kfree(tu->call.class->system); | ||
112 | kfree(tu->call.name); | ||
113 | kfree(tu->filename); | ||
114 | kfree(tu); | ||
115 | } | ||
116 | |||
117 | static struct trace_uprobe *find_probe_event(const char *event, const char *group) | ||
118 | { | ||
119 | struct trace_uprobe *tu; | ||
120 | |||
121 | list_for_each_entry(tu, &uprobe_list, list) | ||
122 | if (strcmp(tu->call.name, event) == 0 && | ||
123 | strcmp(tu->call.class->system, group) == 0) | ||
124 | return tu; | ||
125 | |||
126 | return NULL; | ||
127 | } | ||
128 | |||
129 | /* Unregister a trace_uprobe and probe_event: call with uprobe_lock held */ | ||
130 | static void unregister_trace_uprobe(struct trace_uprobe *tu) | ||
131 | { | ||
132 | list_del(&tu->list); | ||
133 | unregister_uprobe_event(tu); | ||
134 | free_trace_uprobe(tu); | ||
135 | } | ||
136 | |||
137 | /* Register a trace_uprobe and probe_event */ | ||
138 | static int register_trace_uprobe(struct trace_uprobe *tu) | ||
139 | { | ||
140 | struct trace_uprobe *old_tp; | ||
141 | int ret; | ||
142 | |||
143 | mutex_lock(&uprobe_lock); | ||
144 | |||
145 | /* register as an event */ | ||
146 | old_tp = find_probe_event(tu->call.name, tu->call.class->system); | ||
147 | if (old_tp) | ||
148 | /* delete old event */ | ||
149 | unregister_trace_uprobe(old_tp); | ||
150 | |||
151 | ret = register_uprobe_event(tu); | ||
152 | if (ret) { | ||
153 | pr_warning("Failed to register probe event(%d)\n", ret); | ||
154 | goto end; | ||
155 | } | ||
156 | |||
157 | list_add_tail(&tu->list, &uprobe_list); | ||
158 | |||
159 | end: | ||
160 | mutex_unlock(&uprobe_lock); | ||
161 | |||
162 | return ret; | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Argument syntax: | ||
167 | * - Add uprobe: p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] | ||
168 | * | ||
169 | * - Remove uprobe: -:[GRP/]EVENT | ||
170 | */ | ||
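/*
 * Usage sketch (the path and offset below are only examples):
 *
 *	echo 'p:bash_probe /bin/bash:0x4245c0' > uprobe_events
 *	echo '-:bash_probe' > uprobe_events
 *
 * where uprobe_events is the control file created by init_uprobe_trace()
 * below, typically /sys/kernel/debug/tracing/uprobe_events.
 */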
171 | static int create_trace_uprobe(int argc, char **argv) | ||
172 | { | ||
173 | struct trace_uprobe *tu; | ||
174 | struct inode *inode; | ||
175 | char *arg, *event, *group, *filename; | ||
176 | char buf[MAX_EVENT_NAME_LEN]; | ||
177 | struct path path; | ||
178 | unsigned long offset; | ||
179 | bool is_delete; | ||
180 | int i, ret; | ||
181 | |||
182 | inode = NULL; | ||
183 | ret = 0; | ||
184 | is_delete = false; | ||
185 | event = NULL; | ||
186 | group = NULL; | ||
187 | |||
188 | /* argc must be >= 1 */ | ||
189 | if (argv[0][0] == '-') | ||
190 | is_delete = true; | ||
191 | else if (argv[0][0] != 'p') { | ||
192 | pr_info("Probe definition must start with 'p' or '-'.\n"); | ||
193 | return -EINVAL; | ||
194 | } | ||
195 | |||
196 | if (argv[0][1] == ':') { | ||
197 | event = &argv[0][2]; | ||
198 | arg = strchr(event, '/'); | ||
199 | |||
200 | if (arg) { | ||
201 | group = event; | ||
202 | event = arg + 1; | ||
203 | event[-1] = '\0'; | ||
204 | |||
205 | if (strlen(group) == 0) { | ||
206 | pr_info("Group name is not specified\n"); | ||
207 | return -EINVAL; | ||
208 | } | ||
209 | } | ||
210 | if (strlen(event) == 0) { | ||
211 | pr_info("Event name is not specified\n"); | ||
212 | return -EINVAL; | ||
213 | } | ||
214 | } | ||
215 | if (!group) | ||
216 | group = UPROBE_EVENT_SYSTEM; | ||
217 | |||
218 | if (is_delete) { | ||
219 | if (!event) { | ||
220 | pr_info("Delete command needs an event name.\n"); | ||
221 | return -EINVAL; | ||
222 | } | ||
223 | mutex_lock(&uprobe_lock); | ||
224 | tu = find_probe_event(event, group); | ||
225 | |||
226 | if (!tu) { | ||
227 | mutex_unlock(&uprobe_lock); | ||
228 | pr_info("Event %s/%s doesn't exist.\n", group, event); | ||
229 | return -ENOENT; | ||
230 | } | ||
231 | /* delete an event */ | ||
232 | unregister_trace_uprobe(tu); | ||
233 | mutex_unlock(&uprobe_lock); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | if (argc < 2) { | ||
238 | pr_info("Probe point is not specified.\n"); | ||
239 | return -EINVAL; | ||
240 | } | ||
241 | if (isdigit(argv[1][0])) { | ||
242 | pr_info("probe point must have a filename.\n"); | ||
243 | return -EINVAL; | ||
244 | } | ||
245 | arg = strchr(argv[1], ':'); | ||
246 | if (!arg) | ||
247 | goto fail_address_parse; | ||
248 | |||
249 | *arg++ = '\0'; | ||
250 | filename = argv[1]; | ||
251 | ret = kern_path(filename, LOOKUP_FOLLOW, &path); | ||
252 | if (ret) | ||
253 | goto fail_address_parse; | ||
254 | |||
255 | ret = strict_strtoul(arg, 0, &offset); | ||
256 | if (ret) | ||
257 | goto fail_address_parse; | ||
258 | |||
259 | inode = igrab(path.dentry->d_inode); | ||
260 | |||
261 | argc -= 2; | ||
262 | argv += 2; | ||
263 | |||
264 | /* setup a probe */ | ||
265 | if (!event) { | ||
266 | char *tail = strrchr(filename, '/'); | ||
267 | char *ptr; | ||
268 | |||
269 | ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); | ||
270 | if (!ptr) { | ||
271 | ret = -ENOMEM; | ||
272 | goto fail_address_parse; | ||
273 | } | ||
274 | |||
275 | tail = ptr; | ||
276 | ptr = strpbrk(tail, ".-_"); | ||
277 | if (ptr) | ||
278 | *ptr = '\0'; | ||
279 | |||
280 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); | ||
281 | event = buf; | ||
282 | kfree(tail); | ||
283 | } | ||
284 | |||
285 | tu = alloc_trace_uprobe(group, event, argc); | ||
286 | if (IS_ERR(tu)) { | ||
287 | pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); | ||
288 | ret = PTR_ERR(tu); | ||
289 | goto fail_address_parse; | ||
290 | } | ||
291 | tu->offset = offset; | ||
292 | tu->inode = inode; | ||
293 | tu->filename = kstrdup(filename, GFP_KERNEL); | ||
294 | |||
295 | if (!tu->filename) { | ||
296 | pr_info("Failed to allocate filename.\n"); | ||
297 | ret = -ENOMEM; | ||
298 | goto error; | ||
299 | } | ||
300 | |||
301 | /* parse arguments */ | ||
302 | ret = 0; | ||
303 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | ||
304 | /* Increment count for freeing args in error case */ | ||
305 | tu->nr_args++; | ||
306 | |||
307 | /* Parse argument name */ | ||
308 | arg = strchr(argv[i], '='); | ||
309 | if (arg) { | ||
310 | *arg++ = '\0'; | ||
311 | tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); | ||
312 | } else { | ||
313 | arg = argv[i]; | ||
314 | /* If argument name is omitted, set "argN" */ | ||
315 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | ||
316 | tu->args[i].name = kstrdup(buf, GFP_KERNEL); | ||
317 | } | ||
318 | |||
319 | if (!tu->args[i].name) { | ||
320 | pr_info("Failed to allocate argument[%d] name.\n", i); | ||
321 | ret = -ENOMEM; | ||
322 | goto error; | ||
323 | } | ||
324 | |||
325 | if (!is_good_name(tu->args[i].name)) { | ||
326 | pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); | ||
327 | ret = -EINVAL; | ||
328 | goto error; | ||
329 | } | ||
330 | |||
331 | if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { | ||
332 | pr_info("Argument[%d] name '%s' conflicts with " | ||
333 | "another field.\n", i, argv[i]); | ||
334 | ret = -EINVAL; | ||
335 | goto error; | ||
336 | } | ||
337 | |||
338 | /* Parse fetch argument */ | ||
339 | ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); | ||
340 | if (ret) { | ||
341 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | ||
342 | goto error; | ||
343 | } | ||
344 | } | ||
345 | |||
346 | ret = register_trace_uprobe(tu); | ||
347 | if (ret) | ||
348 | goto error; | ||
349 | return 0; | ||
350 | |||
351 | error: | ||
352 | free_trace_uprobe(tu); | ||
353 | return ret; | ||
354 | |||
355 | fail_address_parse: | ||
356 | if (inode) | ||
357 | iput(inode); | ||
358 | |||
359 | pr_info("Failed to parse address.\n"); | ||
360 | |||
361 | return ret; | ||
362 | } | ||
363 | |||
364 | static void cleanup_all_probes(void) | ||
365 | { | ||
366 | struct trace_uprobe *tu; | ||
367 | |||
368 | mutex_lock(&uprobe_lock); | ||
369 | while (!list_empty(&uprobe_list)) { | ||
370 | tu = list_entry(uprobe_list.next, struct trace_uprobe, list); | ||
371 | unregister_trace_uprobe(tu); | ||
372 | } | ||
373 | mutex_unlock(&uprobe_lock); | ||
374 | } | ||
375 | |||
376 | /* Probes listing interfaces */ | ||
377 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) | ||
378 | { | ||
379 | mutex_lock(&uprobe_lock); | ||
380 | return seq_list_start(&uprobe_list, *pos); | ||
381 | } | ||
382 | |||
383 | static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
384 | { | ||
385 | return seq_list_next(v, &uprobe_list, pos); | ||
386 | } | ||
387 | |||
388 | static void probes_seq_stop(struct seq_file *m, void *v) | ||
389 | { | ||
390 | mutex_unlock(&uprobe_lock); | ||
391 | } | ||
392 | |||
393 | static int probes_seq_show(struct seq_file *m, void *v) | ||
394 | { | ||
395 | struct trace_uprobe *tu = v; | ||
396 | int i; | ||
397 | |||
398 | seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); | ||
399 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); | ||
400 | |||
401 | for (i = 0; i < tu->nr_args; i++) | ||
402 | seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); | ||
403 | |||
404 | seq_printf(m, "\n"); | ||
405 | return 0; | ||
406 | } | ||
407 | |||
408 | static const struct seq_operations probes_seq_op = { | ||
409 | .start = probes_seq_start, | ||
410 | .next = probes_seq_next, | ||
411 | .stop = probes_seq_stop, | ||
412 | .show = probes_seq_show | ||
413 | }; | ||
414 | |||
415 | static int probes_open(struct inode *inode, struct file *file) | ||
416 | { | ||
417 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) | ||
418 | cleanup_all_probes(); | ||
419 | |||
420 | return seq_open(file, &probes_seq_op); | ||
421 | } | ||
422 | |||
423 | static ssize_t probes_write(struct file *file, const char __user *buffer, | ||
424 | size_t count, loff_t *ppos) | ||
425 | { | ||
426 | return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); | ||
427 | } | ||
428 | |||
429 | static const struct file_operations uprobe_events_ops = { | ||
430 | .owner = THIS_MODULE, | ||
431 | .open = probes_open, | ||
432 | .read = seq_read, | ||
433 | .llseek = seq_lseek, | ||
434 | .release = seq_release, | ||
435 | .write = probes_write, | ||
436 | }; | ||
437 | |||
438 | /* Probes profiling interfaces */ | ||
439 | static int probes_profile_seq_show(struct seq_file *m, void *v) | ||
440 | { | ||
441 | struct trace_uprobe *tu = v; | ||
442 | |||
443 | seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); | ||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | static const struct seq_operations profile_seq_op = { | ||
448 | .start = probes_seq_start, | ||
449 | .next = probes_seq_next, | ||
450 | .stop = probes_seq_stop, | ||
451 | .show = probes_profile_seq_show | ||
452 | }; | ||
453 | |||
454 | static int profile_open(struct inode *inode, struct file *file) | ||
455 | { | ||
456 | return seq_open(file, &profile_seq_op); | ||
457 | } | ||
458 | |||
459 | static const struct file_operations uprobe_profile_ops = { | ||
460 | .owner = THIS_MODULE, | ||
461 | .open = profile_open, | ||
462 | .read = seq_read, | ||
463 | .llseek = seq_lseek, | ||
464 | .release = seq_release, | ||
465 | }; | ||
466 | |||
467 | /* uprobe handler */ | ||
468 | static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
469 | { | ||
470 | struct uprobe_trace_entry_head *entry; | ||
471 | struct ring_buffer_event *event; | ||
472 | struct ring_buffer *buffer; | ||
473 | u8 *data; | ||
474 | int size, i, pc; | ||
475 | unsigned long irq_flags; | ||
476 | struct ftrace_event_call *call = &tu->call; | ||
477 | |||
478 | tu->nhit++; | ||
479 | |||
480 | local_save_flags(irq_flags); | ||
481 | pc = preempt_count(); | ||
482 | |||
483 | size = sizeof(*entry) + tu->size; | ||
484 | |||
485 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | ||
486 | size, irq_flags, pc); | ||
487 | if (!event) | ||
488 | return; | ||
489 | |||
490 | entry = ring_buffer_event_data(event); | ||
491 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | ||
492 | data = (u8 *)&entry[1]; | ||
493 | for (i = 0; i < tu->nr_args; i++) | ||
494 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | ||
495 | |||
496 | if (!filter_current_check_discard(buffer, call, entry, event)) | ||
497 | trace_buffer_unlock_commit(buffer, event, irq_flags, pc); | ||
498 | } | ||
499 | |||
500 | /* Event entry printers */ | ||
501 | static enum print_line_t | ||
502 | print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) | ||
503 | { | ||
504 | struct uprobe_trace_entry_head *field; | ||
505 | struct trace_seq *s = &iter->seq; | ||
506 | struct trace_uprobe *tu; | ||
507 | u8 *data; | ||
508 | int i; | ||
509 | |||
510 | field = (struct uprobe_trace_entry_head *)iter->ent; | ||
511 | tu = container_of(event, struct trace_uprobe, call.event); | ||
512 | |||
513 | if (!trace_seq_printf(s, "%s: (", tu->call.name)) | ||
514 | goto partial; | ||
515 | |||
516 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) | ||
517 | goto partial; | ||
518 | |||
519 | if (!trace_seq_puts(s, ")")) | ||
520 | goto partial; | ||
521 | |||
522 | data = (u8 *)&field[1]; | ||
523 | for (i = 0; i < tu->nr_args; i++) { | ||
524 | if (!tu->args[i].type->print(s, tu->args[i].name, | ||
525 | data + tu->args[i].offset, field)) | ||
526 | goto partial; | ||
527 | } | ||
528 | |||
529 | if (trace_seq_puts(s, "\n")) | ||
530 | return TRACE_TYPE_HANDLED; | ||
531 | |||
532 | partial: | ||
533 | return TRACE_TYPE_PARTIAL_LINE; | ||
534 | } | ||
535 | |||
536 | static int probe_event_enable(struct trace_uprobe *tu, int flag) | ||
537 | { | ||
538 | struct uprobe_trace_consumer *utc; | ||
539 | int ret = 0; | ||
540 | |||
541 | if (!tu->inode || tu->consumer) | ||
542 | return -EINTR; | ||
543 | |||
544 | utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); | ||
545 | if (!utc) | ||
546 | return -EINTR; | ||
547 | |||
548 | utc->cons.handler = uprobe_dispatcher; | ||
549 | utc->cons.filter = NULL; | ||
550 | ret = uprobe_register(tu->inode, tu->offset, &utc->cons); | ||
551 | if (ret) { | ||
552 | kfree(utc); | ||
553 | return ret; | ||
554 | } | ||
555 | |||
556 | tu->flags |= flag; | ||
557 | utc->tu = tu; | ||
558 | tu->consumer = utc; | ||
559 | |||
560 | return 0; | ||
561 | } | ||
562 | |||
563 | static void probe_event_disable(struct trace_uprobe *tu, int flag) | ||
564 | { | ||
565 | if (!tu->inode || !tu->consumer) | ||
566 | return; | ||
567 | |||
568 | uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); | ||
569 | tu->flags &= ~flag; | ||
570 | kfree(tu->consumer); | ||
571 | tu->consumer = NULL; | ||
572 | } | ||
573 | |||
574 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | ||
575 | { | ||
576 | int ret, i; | ||
577 | struct uprobe_trace_entry_head field; | ||
578 | struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; | ||
579 | |||
580 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | ||
581 | /* Set argument names as fields */ | ||
582 | for (i = 0; i < tu->nr_args; i++) { | ||
583 | ret = trace_define_field(event_call, tu->args[i].type->fmttype, | ||
584 | tu->args[i].name, | ||
585 | sizeof(field) + tu->args[i].offset, | ||
586 | tu->args[i].type->size, | ||
587 | tu->args[i].type->is_signed, | ||
588 | FILTER_OTHER); | ||
589 | |||
590 | if (ret) | ||
591 | return ret; | ||
592 | } | ||
593 | return 0; | ||
594 | } | ||
595 | |||
596 | #define LEN_OR_ZERO (len ? len - pos : 0) | ||
597 | static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) | ||
598 | { | ||
599 | const char *fmt, *arg; | ||
600 | int i; | ||
601 | int pos = 0; | ||
602 | |||
603 | fmt = "(%lx)"; | ||
604 | arg = "REC->" FIELD_STRING_IP; | ||
605 | |||
606 | /* When len=0, we just calculate the needed length */ | ||
607 | |||
608 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); | ||
609 | |||
610 | for (i = 0; i < tu->nr_args; i++) { | ||
611 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", | ||
612 | tu->args[i].name, tu->args[i].type->fmt); | ||
613 | } | ||
614 | |||
615 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | ||
616 | |||
617 | for (i = 0; i < tu->nr_args; i++) { | ||
618 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
619 | tu->args[i].name); | ||
620 | } | ||
621 | |||
622 | return pos; /* return the length of print_fmt */ | ||
623 | } | ||
624 | #undef LEN_OR_ZERO | ||
625 | |||
626 | static int set_print_fmt(struct trace_uprobe *tu) | ||
627 | { | ||
628 | char *print_fmt; | ||
629 | int len; | ||
630 | |||
631 | /* First: called with 0 length to calculate the needed length */ | ||
632 | len = __set_print_fmt(tu, NULL, 0); | ||
633 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
634 | if (!print_fmt) | ||
635 | return -ENOMEM; | ||
636 | |||
637 | /* Second: actually write the @print_fmt */ | ||
638 | __set_print_fmt(tu, print_fmt, len + 1); | ||
639 | tu->call.print_fmt = print_fmt; | ||
640 | |||
641 | return 0; | ||
642 | } | ||
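/*
 * Illustrative result (per-argument formats depend on each type's ->fmt):
 * for two arguments named arg1 and arg2 the generated print_fmt has the
 * shape "\"(%lx) arg1=<fmt> arg2=<fmt>\", REC->__probe_ip, REC->arg1,
 * REC->arg2".
 */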
643 | |||
644 | #ifdef CONFIG_PERF_EVENTS | ||
645 | /* uprobe profile handler */ | ||
646 | static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
647 | { | ||
648 | struct ftrace_event_call *call = &tu->call; | ||
649 | struct uprobe_trace_entry_head *entry; | ||
650 | struct hlist_head *head; | ||
651 | u8 *data; | ||
652 | int size, __size, i; | ||
653 | int rctx; | ||
654 | |||
655 | __size = sizeof(*entry) + tu->size; | ||
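	/*
	 * Keep the record u64-aligned when combined with the u32 size header
	 * that the perf raw-sample format prepends (hence the add-then-subtract
	 * of sizeof(u32) below).
	 */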
656 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | ||
657 | size -= sizeof(u32); | ||
658 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) | ||
659 | return; | ||
660 | |||
661 | preempt_disable(); | ||
662 | |||
663 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | ||
664 | if (!entry) | ||
665 | goto out; | ||
666 | |||
667 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | ||
668 | data = (u8 *)&entry[1]; | ||
669 | for (i = 0; i < tu->nr_args; i++) | ||
670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | ||
671 | |||
672 | head = this_cpu_ptr(call->perf_events); | ||
673 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | ||
674 | |||
675 | out: | ||
676 | preempt_enable(); | ||
677 | } | ||
678 | #endif /* CONFIG_PERF_EVENTS */ | ||
679 | |||
680 | static | ||
681 | int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) | ||
682 | { | ||
683 | struct trace_uprobe *tu = (struct trace_uprobe *)event->data; | ||
684 | |||
685 | switch (type) { | ||
686 | case TRACE_REG_REGISTER: | ||
687 | return probe_event_enable(tu, TP_FLAG_TRACE); | ||
688 | |||
689 | case TRACE_REG_UNREGISTER: | ||
690 | probe_event_disable(tu, TP_FLAG_TRACE); | ||
691 | return 0; | ||
692 | |||
693 | #ifdef CONFIG_PERF_EVENTS | ||
694 | case TRACE_REG_PERF_REGISTER: | ||
695 | return probe_event_enable(tu, TP_FLAG_PROFILE); | ||
696 | |||
697 | case TRACE_REG_PERF_UNREGISTER: | ||
698 | probe_event_disable(tu, TP_FLAG_PROFILE); | ||
699 | return 0; | ||
700 | #endif | ||
701 | default: | ||
702 | return 0; | ||
703 | } | ||
704 | return 0; | ||
705 | } | ||
706 | |||
707 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) | ||
708 | { | ||
709 | struct uprobe_trace_consumer *utc; | ||
710 | struct trace_uprobe *tu; | ||
711 | |||
712 | utc = container_of(con, struct uprobe_trace_consumer, cons); | ||
713 | tu = utc->tu; | ||
714 | if (!tu || tu->consumer != utc) | ||
715 | return 0; | ||
716 | |||
717 | if (tu->flags & TP_FLAG_TRACE) | ||
718 | uprobe_trace_func(tu, regs); | ||
719 | |||
720 | #ifdef CONFIG_PERF_EVENTS | ||
721 | if (tu->flags & TP_FLAG_PROFILE) | ||
722 | uprobe_perf_func(tu, regs); | ||
723 | #endif | ||
724 | return 0; | ||
725 | } | ||
726 | |||
727 | static struct trace_event_functions uprobe_funcs = { | ||
728 | .trace = print_uprobe_event | ||
729 | }; | ||
730 | |||
731 | static int register_uprobe_event(struct trace_uprobe *tu) | ||
732 | { | ||
733 | struct ftrace_event_call *call = &tu->call; | ||
734 | int ret; | ||
735 | |||
736 | /* Initialize ftrace_event_call */ | ||
737 | INIT_LIST_HEAD(&call->class->fields); | ||
738 | call->event.funcs = &uprobe_funcs; | ||
739 | call->class->define_fields = uprobe_event_define_fields; | ||
740 | |||
741 | if (set_print_fmt(tu) < 0) | ||
742 | return -ENOMEM; | ||
743 | |||
744 | ret = register_ftrace_event(&call->event); | ||
745 | if (!ret) { | ||
746 | kfree(call->print_fmt); | ||
747 | return -ENODEV; | ||
748 | } | ||
749 | call->flags = 0; | ||
750 | call->class->reg = trace_uprobe_register; | ||
751 | call->data = tu; | ||
752 | ret = trace_add_event_call(call); | ||
753 | |||
754 | if (ret) { | ||
755 | pr_info("Failed to register uprobe event: %s\n", call->name); | ||
756 | kfree(call->print_fmt); | ||
757 | unregister_ftrace_event(&call->event); | ||
758 | } | ||
759 | |||
760 | return ret; | ||
761 | } | ||
762 | |||
763 | static void unregister_uprobe_event(struct trace_uprobe *tu) | ||
764 | { | ||
765 | /* tu->event is unregistered in trace_remove_event_call() */ | ||
766 | trace_remove_event_call(&tu->call); | ||
767 | kfree(tu->call.print_fmt); | ||
768 | tu->call.print_fmt = NULL; | ||
769 | } | ||
770 | |||
771 | /* Make a trace interface for controlling probe points */ | ||
772 | static __init int init_uprobe_trace(void) | ||
773 | { | ||
774 | struct dentry *d_tracer; | ||
775 | |||
776 | d_tracer = tracing_init_dentry(); | ||
777 | if (!d_tracer) | ||
778 | return 0; | ||
779 | |||
780 | trace_create_file("uprobe_events", 0644, d_tracer, | ||
781 | NULL, &uprobe_events_ops); | ||
782 | /* Profile interface */ | ||
783 | trace_create_file("uprobe_profile", 0444, d_tracer, | ||
784 | NULL, &uprobe_profile_ops); | ||
785 | return 0; | ||
786 | } | ||
787 | |||
788 | fs_initcall(init_uprobe_trace); | ||
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a4721..000000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null | |||
@@ -1,300 +0,0 @@ | |||
1 | /* | ||
2 | * Workqueue statistical tracer. | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | |||
9 | #include <trace/events/workqueue.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/percpu.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/kref.h> | ||
14 | #include "trace_stat.h" | ||
15 | #include "trace.h" | ||
16 | |||
17 | |||
18 | /* A cpu workqueue thread */ | ||
19 | struct cpu_workqueue_stats { | ||
20 | struct list_head list; | ||
21 | struct kref kref; | ||
22 | int cpu; | ||
23 | pid_t pid; | ||
24 | /* Can be inserted from interrupt or user context, need to be atomic */ | ||
25 | atomic_t inserted; | ||
26 | /* | ||
27 | * Doesn't need to be atomic; works are serialized in a single workqueue thread | ||
28 | * on a single CPU. | ||
29 | */ | ||
30 | unsigned int executed; | ||
31 | }; | ||
32 | |||
33 | /* List of workqueue threads on one cpu */ | ||
34 | struct workqueue_global_stats { | ||
35 | struct list_head list; | ||
36 | spinlock_t lock; | ||
37 | }; | ||
38 | |||
39 | /* Don't need a global lock because allocated before the workqueues, and | ||
40 | * never freed. | ||
41 | */ | ||
42 | static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); | ||
43 | #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) | ||
44 | |||
45 | static void cpu_workqueue_stat_free(struct kref *kref) | ||
46 | { | ||
47 | kfree(container_of(kref, struct cpu_workqueue_stats, kref)); | ||
48 | } | ||
49 | |||
50 | /* Insertion of a work */ | ||
51 | static void | ||
52 | probe_workqueue_insertion(void *ignore, | ||
53 | struct task_struct *wq_thread, | ||
54 | struct work_struct *work) | ||
55 | { | ||
56 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
57 | struct cpu_workqueue_stats *node; | ||
58 | unsigned long flags; | ||
59 | |||
60 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
61 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
62 | if (node->pid == wq_thread->pid) { | ||
63 | atomic_inc(&node->inserted); | ||
64 | goto found; | ||
65 | } | ||
66 | } | ||
67 | pr_debug("trace_workqueue: entry not found\n"); | ||
68 | found: | ||
69 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
70 | } | ||
71 | |||
72 | /* Execution of a work */ | ||
73 | static void | ||
74 | probe_workqueue_execution(void *ignore, | ||
75 | struct task_struct *wq_thread, | ||
76 | struct work_struct *work) | ||
77 | { | ||
78 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
79 | struct cpu_workqueue_stats *node; | ||
80 | unsigned long flags; | ||
81 | |||
82 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
83 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
84 | if (node->pid == wq_thread->pid) { | ||
85 | node->executed++; | ||
86 | goto found; | ||
87 | } | ||
88 | } | ||
89 | pr_debug("trace_workqueue: entry not found\n"); | ||
90 | found: | ||
91 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
92 | } | ||
93 | |||
94 | /* Creation of a cpu workqueue thread */ | ||
95 | static void probe_workqueue_creation(void *ignore, | ||
96 | struct task_struct *wq_thread, int cpu) | ||
97 | { | ||
98 | struct cpu_workqueue_stats *cws; | ||
99 | unsigned long flags; | ||
100 | |||
101 | WARN_ON(cpu < 0); | ||
102 | |||
103 | /* Workqueues are sometimes created in atomic context */ | ||
104 | cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); | ||
105 | if (!cws) { | ||
106 | pr_warning("trace_workqueue: not enough memory\n"); | ||
107 | return; | ||
108 | } | ||
109 | INIT_LIST_HEAD(&cws->list); | ||
110 | kref_init(&cws->kref); | ||
111 | cws->cpu = cpu; | ||
112 | cws->pid = wq_thread->pid; | ||
113 | |||
114 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
115 | list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); | ||
116 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
117 | } | ||
118 | |||
119 | /* Destruction of a cpu workqueue thread */ | ||
120 | static void | ||
121 | probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) | ||
122 | { | ||
123 | /* Workqueues only execute on one cpu */ | ||
124 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
125 | struct cpu_workqueue_stats *node, *next; | ||
126 | unsigned long flags; | ||
127 | |||
128 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
129 | list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, | ||
130 | list) { | ||
131 | if (node->pid == wq_thread->pid) { | ||
132 | list_del(&node->list); | ||
133 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
134 | goto found; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | pr_debug("trace_workqueue: couldn't find workqueue to destroy\n"); | ||
139 | found: | ||
140 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
141 | |||
142 | } | ||
143 | |||
144 | static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) | ||
145 | { | ||
146 | unsigned long flags; | ||
147 | struct cpu_workqueue_stats *ret = NULL; | ||
148 | |||
149 | |||
150 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
151 | |||
152 | if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { | ||
153 | ret = list_entry(workqueue_cpu_stat(cpu)->list.next, | ||
154 | struct cpu_workqueue_stats, list); | ||
155 | kref_get(&ret->kref); | ||
156 | } | ||
157 | |||
158 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
159 | |||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | static void *workqueue_stat_start(struct tracer_stat *trace) | ||
164 | { | ||
165 | int cpu; | ||
166 | void *ret = NULL; | ||
167 | |||
168 | for_each_possible_cpu(cpu) { | ||
169 | ret = workqueue_stat_start_cpu(cpu); | ||
170 | if (ret) | ||
171 | return ret; | ||
172 | } | ||
173 | return NULL; | ||
174 | } | ||
175 | |||
176 | static void *workqueue_stat_next(void *prev, int idx) | ||
177 | { | ||
178 | struct cpu_workqueue_stats *prev_cws = prev; | ||
179 | struct cpu_workqueue_stats *ret; | ||
180 | int cpu = prev_cws->cpu; | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
184 | if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { | ||
185 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
186 | do { | ||
187 | cpu = cpumask_next(cpu, cpu_possible_mask); | ||
188 | if (cpu >= nr_cpu_ids) | ||
189 | return NULL; | ||
190 | } while (!(ret = workqueue_stat_start_cpu(cpu))); | ||
191 | return ret; | ||
192 | } else { | ||
193 | ret = list_entry(prev_cws->list.next, | ||
194 | struct cpu_workqueue_stats, list); | ||
195 | kref_get(&ret->kref); | ||
196 | } | ||
197 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static int workqueue_stat_show(struct seq_file *s, void *p) | ||
203 | { | ||
204 | struct cpu_workqueue_stats *cws = p; | ||
205 | struct pid *pid; | ||
206 | struct task_struct *tsk; | ||
207 | |||
208 | pid = find_get_pid(cws->pid); | ||
209 | if (pid) { | ||
210 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
211 | if (tsk) { | ||
212 | seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, | ||
213 | atomic_read(&cws->inserted), cws->executed, | ||
214 | tsk->comm); | ||
215 | put_task_struct(tsk); | ||
216 | } | ||
217 | put_pid(pid); | ||
218 | } | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static void workqueue_stat_release(void *stat) | ||
224 | { | ||
225 | struct cpu_workqueue_stats *node = stat; | ||
226 | |||
227 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
228 | } | ||
229 | |||
230 | static int workqueue_stat_headers(struct seq_file *s) | ||
231 | { | ||
232 | seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); | ||
233 | seq_printf(s, "# | | | |\n"); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct tracer_stat workqueue_stats __read_mostly = { | ||
238 | .name = "workqueues", | ||
239 | .stat_start = workqueue_stat_start, | ||
240 | .stat_next = workqueue_stat_next, | ||
241 | .stat_show = workqueue_stat_show, | ||
242 | .stat_release = workqueue_stat_release, | ||
243 | .stat_headers = workqueue_stat_headers | ||
244 | }; | ||
245 | |||
246 | |||
247 | int __init stat_workqueue_init(void) | ||
248 | { | ||
249 | if (register_stat_tracer(&workqueue_stats)) { | ||
250 | pr_warning("Unable to register workqueue stat tracer\n"); | ||
251 | return 1; | ||
252 | } | ||
253 | |||
254 | return 0; | ||
255 | } | ||
256 | fs_initcall(stat_workqueue_init); | ||
257 | |||
258 | /* | ||
259 | * Workqueues are created very early, just after pre-smp initcalls. | ||
260 | * So we must register our tracepoints at this stage. | ||
261 | */ | ||
262 | int __init trace_workqueue_early_init(void) | ||
263 | { | ||
264 | int ret, cpu; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
272 | if (ret) | ||
273 | goto out; | ||
274 | |||
275 | ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
276 | if (ret) | ||
277 | goto no_insertion; | ||
278 | |||
279 | ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
280 | if (ret) | ||
281 | goto no_execution; | ||
282 | |||
283 | ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); | ||
284 | if (ret) | ||
285 | goto no_creation; | ||
286 | |||
287 | return 0; | ||
288 | |||
289 | no_creation: | ||
290 | unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
291 | no_execution: | ||
292 | unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
293 | no_insertion: | ||
294 | unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
295 | out: | ||
296 | pr_warning("trace_workqueue: unable to trace workqueues\n"); | ||
297 | |||
298 | return 1; | ||
299 | } | ||
300 | early_initcall(trace_workqueue_early_init); | ||
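
The early-init path above registers four workqueue tracepoint probes and, if a later registration fails, unwinds the earlier ones in reverse order through cascading goto labels. A minimal, self-contained sketch of that rollback pattern follows; the register_*()/unregister_*() functions are stand-ins, not the kernel tracepoint API.

#include <stdio.h>

/*
 * Stand-ins for the registrations above; each returns 0 on success.
 * register_c() is made to fail so the unwind path runs.
 */
static int register_a(void) { return 0; }
static int register_b(void) { return 0; }
static int register_c(void) { return -1; }
static void unregister_a(void) { puts("unregistered a"); }
static void unregister_b(void) { puts("unregistered b"); }

static int init_all(void)
{
	int ret;

	ret = register_a();
	if (ret)
		goto out;

	ret = register_b();
	if (ret)
		goto err_a;

	ret = register_c();
	if (ret)
		goto err_b;

	return 0;

err_b:
	unregister_b();		/* fall through: undo in reverse order */
err_a:
	unregister_a();
out:
	fprintf(stderr, "init failed, partial registrations unwound\n");
	return ret;
}

int main(void)
{
	return init_all() ? 1 : 0;
}
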
diff --git a/kernel/uid16.c b/kernel/uid16.c index 51c6e89e8619..d7948eb10225 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) | |||
81 | return ret; | 81 | return ret; |
82 | } | 82 | } |
83 | 83 | ||
84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) | 84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) |
85 | { | 85 | { |
86 | const struct cred *cred = current_cred(); | 86 | const struct cred *cred = current_cred(); |
87 | int retval; | 87 | int retval; |
88 | old_uid_t ruid, euid, suid; | ||
88 | 89 | ||
89 | if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && | 90 | ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); |
90 | !(retval = put_user(high2lowuid(cred->euid), euid))) | 91 | euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); |
91 | retval = put_user(high2lowuid(cred->suid), suid); | 92 | suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); |
93 | |||
94 | if (!(retval = put_user(ruid, ruidp)) && | ||
95 | !(retval = put_user(euid, euidp))) | ||
96 | retval = put_user(suid, suidp); | ||
92 | 97 | ||
93 | return retval; | 98 | return retval; |
94 | } | 99 | } |
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) | |||
103 | } | 108 | } |
104 | 109 | ||
105 | 110 | ||
106 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) | 111 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) |
107 | { | 112 | { |
108 | const struct cred *cred = current_cred(); | 113 | const struct cred *cred = current_cred(); |
109 | int retval; | 114 | int retval; |
115 | old_gid_t rgid, egid, sgid; | ||
116 | |||
117 | rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid)); | ||
118 | egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid)); | ||
119 | sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid)); | ||
110 | 120 | ||
111 | if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && | 121 | if (!(retval = put_user(rgid, rgidp)) && |
112 | !(retval = put_user(high2lowgid(cred->egid), egid))) | 122 | !(retval = put_user(egid, egidp))) |
113 | retval = put_user(high2lowgid(cred->sgid), sgid); | 123 | retval = put_user(sgid, sgidp); |
114 | 124 | ||
115 | return retval; | 125 | return retval; |
116 | } | 126 | } |
@@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) | |||
134 | static int groups16_to_user(old_gid_t __user *grouplist, | 144 | static int groups16_to_user(old_gid_t __user *grouplist, |
135 | struct group_info *group_info) | 145 | struct group_info *group_info) |
136 | { | 146 | { |
147 | struct user_namespace *user_ns = current_user_ns(); | ||
137 | int i; | 148 | int i; |
138 | old_gid_t group; | 149 | old_gid_t group; |
150 | kgid_t kgid; | ||
139 | 151 | ||
140 | for (i = 0; i < group_info->ngroups; i++) { | 152 | for (i = 0; i < group_info->ngroups; i++) { |
141 | group = high2lowgid(GROUP_AT(group_info, i)); | 153 | kgid = GROUP_AT(group_info, i); |
154 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); | ||
142 | if (put_user(group, grouplist+i)) | 155 | if (put_user(group, grouplist+i)) |
143 | return -EFAULT; | 156 | return -EFAULT; |
144 | } | 157 | } |
@@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist, | |||
149 | static int groups16_from_user(struct group_info *group_info, | 162 | static int groups16_from_user(struct group_info *group_info, |
150 | old_gid_t __user *grouplist) | 163 | old_gid_t __user *grouplist) |
151 | { | 164 | { |
165 | struct user_namespace *user_ns = current_user_ns(); | ||
152 | int i; | 166 | int i; |
153 | old_gid_t group; | 167 | old_gid_t group; |
168 | kgid_t kgid; | ||
154 | 169 | ||
155 | for (i = 0; i < group_info->ngroups; i++) { | 170 | for (i = 0; i < group_info->ngroups; i++) { |
156 | if (get_user(group, grouplist+i)) | 171 | if (get_user(group, grouplist+i)) |
157 | return -EFAULT; | 172 | return -EFAULT; |
158 | GROUP_AT(group_info, i) = low2highgid(group); | 173 | |
174 | kgid = make_kgid(user_ns, low2highgid(group)); | ||
175 | if (!gid_valid(kgid)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | GROUP_AT(group_info, i) = kgid; | ||
159 | } | 179 | } |
160 | 180 | ||
161 | return 0; | 181 | return 0; |
@@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
211 | 231 | ||
212 | SYSCALL_DEFINE0(getuid16) | 232 | SYSCALL_DEFINE0(getuid16) |
213 | { | 233 | { |
214 | return high2lowuid(current_uid()); | 234 | return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); |
215 | } | 235 | } |
216 | 236 | ||
217 | SYSCALL_DEFINE0(geteuid16) | 237 | SYSCALL_DEFINE0(geteuid16) |
218 | { | 238 | { |
219 | return high2lowuid(current_euid()); | 239 | return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); |
220 | } | 240 | } |
221 | 241 | ||
222 | SYSCALL_DEFINE0(getgid16) | 242 | SYSCALL_DEFINE0(getgid16) |
223 | { | 243 | { |
224 | return high2lowgid(current_gid()); | 244 | return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); |
225 | } | 245 | } |
226 | 246 | ||
227 | SYSCALL_DEFINE0(getegid16) | 247 | SYSCALL_DEFINE0(getegid16) |
228 | { | 248 | { |
229 | return high2lowgid(current_egid()); | 249 | return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); |
230 | } | 250 | } |
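
The conversion pattern repeated throughout this file is: translate the kernel-internal kuid_t/kgid_t into the caller's namespace with from_kuid_munged()/from_kgid_munged(), then squeeze the result into the legacy 16-bit type with high2lowuid()/high2lowgid(). A hedged sketch of the same two-step conversion; report_fsuid16() is an illustrative name, while the conversion helpers are the real ones used above.

#include <linux/cred.h>
#include <linux/highuid.h>
#include <linux/uidgid.h>

/* Illustrative: report the current fsuid through the old 16-bit ABI. */
static old_uid_t report_fsuid16(void)
{
	kuid_t kuid = current_fsuid();		/* kernel-internal id */
	uid_t uid;

	/* Translate into the caller's namespace; unmapped ids become overflowuid. */
	uid = from_kuid_munged(current_user_ns(), kuid);

	/* Squeeze into 16 bits; ids that do not fit become the legacy overflow uid. */
	return high2lowuid(uid);
}
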
diff --git a/kernel/user.c b/kernel/user.c index 71dd2363ab0f..b815fefbe76f 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -22,10 +22,27 @@ | |||
22 | * and 1 for... ? | 22 | * and 1 for... ? |
23 | */ | 23 | */ |
24 | struct user_namespace init_user_ns = { | 24 | struct user_namespace init_user_ns = { |
25 | .uid_map = { | ||
26 | .nr_extents = 1, | ||
27 | .extent[0] = { | ||
28 | .first = 0, | ||
29 | .lower_first = 0, | ||
30 | .count = 4294967295U, | ||
31 | }, | ||
32 | }, | ||
33 | .gid_map = { | ||
34 | .nr_extents = 1, | ||
35 | .extent[0] = { | ||
36 | .first = 0, | ||
37 | .lower_first = 0, | ||
38 | .count = 4294967295U, | ||
39 | }, | ||
40 | }, | ||
25 | .kref = { | 41 | .kref = { |
26 | .refcount = ATOMIC_INIT(3), | 42 | .refcount = ATOMIC_INIT(3), |
27 | }, | 43 | }, |
28 | .creator = &root_user, | 44 | .owner = GLOBAL_ROOT_UID, |
45 | .group = GLOBAL_ROOT_GID, | ||
29 | }; | 46 | }; |
30 | EXPORT_SYMBOL_GPL(init_user_ns); | 47 | EXPORT_SYMBOL_GPL(init_user_ns); |
31 | 48 | ||
@@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns); | |||
34 | * when changing user ID's (ie setuid() and friends). | 51 | * when changing user ID's (ie setuid() and friends). |
35 | */ | 52 | */ |
36 | 53 | ||
54 | #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) | ||
55 | #define UIDHASH_SZ (1 << UIDHASH_BITS) | ||
37 | #define UIDHASH_MASK (UIDHASH_SZ - 1) | 56 | #define UIDHASH_MASK (UIDHASH_SZ - 1) |
38 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | 57 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) |
39 | #define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) | 58 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) |
40 | 59 | ||
41 | static struct kmem_cache *uid_cachep; | 60 | static struct kmem_cache *uid_cachep; |
61 | struct hlist_head uidhash_table[UIDHASH_SZ]; | ||
42 | 62 | ||
43 | /* | 63 | /* |
44 | * The uidhash_lock is mostly taken from process context, but it is | 64 | * The uidhash_lock is mostly taken from process context, but it is |
@@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep; | |||
51 | */ | 71 | */ |
52 | static DEFINE_SPINLOCK(uidhash_lock); | 72 | static DEFINE_SPINLOCK(uidhash_lock); |
53 | 73 | ||
54 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ | 74 | /* root_user.__count is 1, for init task cred */ |
55 | struct user_struct root_user = { | 75 | struct user_struct root_user = { |
56 | .__count = ATOMIC_INIT(2), | 76 | .__count = ATOMIC_INIT(1), |
57 | .processes = ATOMIC_INIT(1), | 77 | .processes = ATOMIC_INIT(1), |
58 | .files = ATOMIC_INIT(0), | 78 | .files = ATOMIC_INIT(0), |
59 | .sigpending = ATOMIC_INIT(0), | 79 | .sigpending = ATOMIC_INIT(0), |
60 | .locked_shm = 0, | 80 | .locked_shm = 0, |
61 | .user_ns = &init_user_ns, | 81 | .uid = GLOBAL_ROOT_UID, |
62 | }; | 82 | }; |
63 | 83 | ||
64 | /* | 84 | /* |
@@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) | |||
72 | static void uid_hash_remove(struct user_struct *up) | 92 | static void uid_hash_remove(struct user_struct *up) |
73 | { | 93 | { |
74 | hlist_del_init(&up->uidhash_node); | 94 | hlist_del_init(&up->uidhash_node); |
75 | put_user_ns(up->user_ns); | ||
76 | } | 95 | } |
77 | 96 | ||
78 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 97 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) |
79 | { | 98 | { |
80 | struct user_struct *user; | 99 | struct user_struct *user; |
81 | struct hlist_node *h; | 100 | struct hlist_node *h; |
82 | 101 | ||
83 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 102 | hlist_for_each_entry(user, h, hashent, uidhash_node) { |
84 | if (user->uid == uid) { | 103 | if (uid_eq(user->uid, uid)) { |
85 | atomic_inc(&user->__count); | 104 | atomic_inc(&user->__count); |
86 | return user; | 105 | return user; |
87 | } | 106 | } |
@@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
110 | * | 129 | * |
111 | * If the user_struct could not be found, return NULL. | 130 | * If the user_struct could not be found, return NULL. |
112 | */ | 131 | */ |
113 | struct user_struct *find_user(uid_t uid) | 132 | struct user_struct *find_user(kuid_t uid) |
114 | { | 133 | { |
115 | struct user_struct *ret; | 134 | struct user_struct *ret; |
116 | unsigned long flags; | 135 | unsigned long flags; |
117 | struct user_namespace *ns = current_user_ns(); | ||
118 | 136 | ||
119 | spin_lock_irqsave(&uidhash_lock, flags); | 137 | spin_lock_irqsave(&uidhash_lock, flags); |
120 | ret = uid_hash_find(uid, uidhashentry(ns, uid)); | 138 | ret = uid_hash_find(uid, uidhashentry(uid)); |
121 | spin_unlock_irqrestore(&uidhash_lock, flags); | 139 | spin_unlock_irqrestore(&uidhash_lock, flags); |
122 | return ret; | 140 | return ret; |
123 | } | 141 | } |
@@ -136,9 +154,9 @@ void free_uid(struct user_struct *up) | |||
136 | local_irq_restore(flags); | 154 | local_irq_restore(flags); |
137 | } | 155 | } |
138 | 156 | ||
139 | struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | 157 | struct user_struct *alloc_uid(kuid_t uid) |
140 | { | 158 | { |
141 | struct hlist_head *hashent = uidhashentry(ns, uid); | 159 | struct hlist_head *hashent = uidhashentry(uid); |
142 | struct user_struct *up, *new; | 160 | struct user_struct *up, *new; |
143 | 161 | ||
144 | spin_lock_irq(&uidhash_lock); | 162 | spin_lock_irq(&uidhash_lock); |
@@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
153 | new->uid = uid; | 171 | new->uid = uid; |
154 | atomic_set(&new->__count, 1); | 172 | atomic_set(&new->__count, 1); |
155 | 173 | ||
156 | new->user_ns = get_user_ns(ns); | ||
157 | |||
158 | /* | 174 | /* |
159 | * Before adding this, check whether we raced | 175 | * Before adding this, check whether we raced |
160 | * on adding the same user already.. | 176 | * on adding the same user already.. |
@@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
162 | spin_lock_irq(&uidhash_lock); | 178 | spin_lock_irq(&uidhash_lock); |
163 | up = uid_hash_find(uid, hashent); | 179 | up = uid_hash_find(uid, hashent); |
164 | if (up) { | 180 | if (up) { |
165 | put_user_ns(ns); | ||
166 | key_put(new->uid_keyring); | 181 | key_put(new->uid_keyring); |
167 | key_put(new->session_keyring); | 182 | key_put(new->session_keyring); |
168 | kmem_cache_free(uid_cachep, new); | 183 | kmem_cache_free(uid_cachep, new); |
@@ -187,11 +202,11 @@ static int __init uid_cache_init(void) | |||
187 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 202 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
188 | 203 | ||
189 | for(n = 0; n < UIDHASH_SZ; ++n) | 204 | for(n = 0; n < UIDHASH_SZ; ++n) |
190 | INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); | 205 | INIT_HLIST_HEAD(uidhash_table + n); |
191 | 206 | ||
192 | /* Insert the root user immediately (init already runs as root) */ | 207 | /* Insert the root user immediately (init already runs as root) */ |
193 | spin_lock_irq(&uidhash_lock); | 208 | spin_lock_irq(&uidhash_lock); |
194 | uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); | 209 | uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); |
195 | spin_unlock_irq(&uidhash_lock); | 210 | spin_unlock_irq(&uidhash_lock); |
196 | 211 | ||
197 | return 0; | 212 | return 0; |
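
With the uid hash now keyed by kuid_t, lookups compare ids with uid_eq() and callers pass kernel-internal uids directly. A sketch of the lookup/refcount discipline a caller would follow, assuming the 3.5-era find_user()/free_uid() interfaces shown above; inspect_current_user() is an illustrative name.

#include <linux/cred.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Look up the per-user accounting structure for the current task. */
static void inspect_current_user(void)
{
	struct user_struct *up;

	up = find_user(current_uid());	/* current_uid() is a kuid_t now */
	if (!up)
		return;			/* nothing hashed for this uid */

	pr_info("uid has %d processes\n", atomic_read(&up->processes));

	free_uid(up);			/* drop the reference find_user() took */
}
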
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3b906e98b1db..86602316422d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -11,9 +11,20 @@ | |||
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
14 | #include <linux/securebits.h> | ||
15 | #include <linux/keyctl.h> | ||
16 | #include <linux/key-type.h> | ||
17 | #include <keys/user-type.h> | ||
18 | #include <linux/seq_file.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/uaccess.h> | ||
21 | #include <linux/ctype.h> | ||
14 | 22 | ||
15 | static struct kmem_cache *user_ns_cachep __read_mostly; | 23 | static struct kmem_cache *user_ns_cachep __read_mostly; |
16 | 24 | ||
25 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
26 | struct uid_gid_map *map); | ||
27 | |||
17 | /* | 28 | /* |
18 | * Create a new user namespace, deriving the creator from the user in the | 29 | * Create a new user namespace, deriving the creator from the user in the |
19 | * passed credentials, and replacing that user with the new root user for the | 30 | * passed credentials, and replacing that user with the new root user for the |
@@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly; | |||
24 | */ | 35 | */ |
25 | int create_user_ns(struct cred *new) | 36 | int create_user_ns(struct cred *new) |
26 | { | 37 | { |
27 | struct user_namespace *ns; | 38 | struct user_namespace *ns, *parent_ns = new->user_ns; |
28 | struct user_struct *root_user; | 39 | kuid_t owner = new->euid; |
29 | int n; | 40 | kgid_t group = new->egid; |
41 | |||
42 | /* The creator needs a mapping in the parent user namespace | ||
43 | * or else we won't be able to reasonably tell userspace who | ||
44 | * created a user_namespace. | ||
45 | */ | ||
46 | if (!kuid_has_mapping(parent_ns, owner) || | ||
47 | !kgid_has_mapping(parent_ns, group)) | ||
48 | return -EPERM; | ||
30 | 49 | ||
31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); | 50 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); |
32 | if (!ns) | 51 | if (!ns) |
33 | return -ENOMEM; | 52 | return -ENOMEM; |
34 | 53 | ||
35 | kref_init(&ns->kref); | 54 | kref_init(&ns->kref); |
55 | ns->parent = parent_ns; | ||
56 | ns->owner = owner; | ||
57 | ns->group = group; | ||
36 | 58 | ||
37 | for (n = 0; n < UIDHASH_SZ; ++n) | 59 | /* Start with the same capabilities as init but useless for doing |
38 | INIT_HLIST_HEAD(ns->uidhash_table + n); | 60 | * anything as the capabilities are bound to the new user namespace. |
39 | 61 | */ | |
40 | /* Alloc new root user. */ | 62 | new->securebits = SECUREBITS_DEFAULT; |
41 | root_user = alloc_uid(ns, 0); | 63 | new->cap_inheritable = CAP_EMPTY_SET; |
42 | if (!root_user) { | 64 | new->cap_permitted = CAP_FULL_SET; |
43 | kmem_cache_free(user_ns_cachep, ns); | 65 | new->cap_effective = CAP_FULL_SET; |
44 | return -ENOMEM; | 66 | new->cap_bset = CAP_FULL_SET; |
45 | } | ||
46 | |||
47 | /* set the new root user in the credentials under preparation */ | ||
48 | ns->creator = new->user; | ||
49 | new->user = root_user; | ||
50 | new->uid = new->euid = new->suid = new->fsuid = 0; | ||
51 | new->gid = new->egid = new->sgid = new->fsgid = 0; | ||
52 | put_group_info(new->group_info); | ||
53 | new->group_info = get_group_info(&init_groups); | ||
54 | #ifdef CONFIG_KEYS | 67 | #ifdef CONFIG_KEYS |
55 | key_put(new->request_key_auth); | 68 | key_put(new->request_key_auth); |
56 | new->request_key_auth = NULL; | 69 | new->request_key_auth = NULL; |
57 | #endif | 70 | #endif |
58 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | 71 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ |
59 | 72 | ||
60 | /* root_user holds a reference to ns, our reference can be dropped */ | 73 | /* Leave the new->user_ns reference with the new user namespace. */ |
61 | put_user_ns(ns); | 74 | /* Leave the reference to our user_ns with the new cred. */ |
75 | new->user_ns = ns; | ||
62 | 76 | ||
63 | return 0; | 77 | return 0; |
64 | } | 78 | } |
65 | 79 | ||
66 | /* | 80 | void free_user_ns(struct kref *kref) |
67 | * Deferred destructor for a user namespace. This is required because | ||
68 | * free_user_ns() may be called with uidhash_lock held, but we need to call | ||
69 | * back to free_uid() which will want to take the lock again. | ||
70 | */ | ||
71 | static void free_user_ns_work(struct work_struct *work) | ||
72 | { | 81 | { |
73 | struct user_namespace *ns = | 82 | struct user_namespace *parent, *ns = |
74 | container_of(work, struct user_namespace, destroyer); | 83 | container_of(kref, struct user_namespace, kref); |
75 | free_uid(ns->creator); | 84 | |
85 | parent = ns->parent; | ||
76 | kmem_cache_free(user_ns_cachep, ns); | 86 | kmem_cache_free(user_ns_cachep, ns); |
87 | put_user_ns(parent); | ||
77 | } | 88 | } |
89 | EXPORT_SYMBOL(free_user_ns); | ||
78 | 90 | ||
79 | void free_user_ns(struct kref *kref) | 91 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) |
80 | { | 92 | { |
81 | struct user_namespace *ns = | 93 | unsigned idx, extents; |
82 | container_of(kref, struct user_namespace, kref); | 94 | u32 first, last, id2; |
95 | |||
96 | id2 = id + count - 1; | ||
97 | |||
98 | /* Find the matching extent */ | ||
99 | extents = map->nr_extents; | ||
100 | smp_read_barrier_depends(); | ||
101 | for (idx = 0; idx < extents; idx++) { | ||
102 | first = map->extent[idx].first; | ||
103 | last = first + map->extent[idx].count - 1; | ||
104 | if (id >= first && id <= last && | ||
105 | (id2 >= first && id2 <= last)) | ||
106 | break; | ||
107 | } | ||
108 | /* Map the id or note failure */ | ||
109 | if (idx < extents) | ||
110 | id = (id - first) + map->extent[idx].lower_first; | ||
111 | else | ||
112 | id = (u32) -1; | ||
83 | 113 | ||
84 | INIT_WORK(&ns->destroyer, free_user_ns_work); | 114 | return id; |
85 | schedule_work(&ns->destroyer); | ||
86 | } | 115 | } |
87 | EXPORT_SYMBOL(free_user_ns); | ||
88 | 116 | ||
89 | uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) | 117 | static u32 map_id_down(struct uid_gid_map *map, u32 id) |
90 | { | 118 | { |
91 | struct user_namespace *tmp; | 119 | unsigned idx, extents; |
120 | u32 first, last; | ||
92 | 121 | ||
93 | if (likely(to == cred->user->user_ns)) | 122 | /* Find the matching extent */ |
94 | return uid; | 123 | extents = map->nr_extents; |
124 | smp_read_barrier_depends(); | ||
125 | for (idx = 0; idx < extents; idx++) { | ||
126 | first = map->extent[idx].first; | ||
127 | last = first + map->extent[idx].count - 1; | ||
128 | if (id >= first && id <= last) | ||
129 | break; | ||
130 | } | ||
131 | /* Map the id or note failure */ | ||
132 | if (idx < extents) | ||
133 | id = (id - first) + map->extent[idx].lower_first; | ||
134 | else | ||
135 | id = (u32) -1; | ||
95 | 136 | ||
137 | return id; | ||
138 | } | ||
96 | 139 | ||
97 | /* Is cred->user the creator of the target user_ns | 140 | static u32 map_id_up(struct uid_gid_map *map, u32 id) |
98 | * or the creator of one of it's parents? | 141 | { |
99 | */ | 142 | unsigned idx, extents; |
100 | for ( tmp = to; tmp != &init_user_ns; | 143 | u32 first, last; |
101 | tmp = tmp->creator->user_ns ) { | 144 | |
102 | if (cred->user == tmp->creator) { | 145 | /* Find the matching extent */ |
103 | return (uid_t)0; | 146 | extents = map->nr_extents; |
104 | } | 147 | smp_read_barrier_depends(); |
148 | for (idx = 0; idx < extents; idx++) { | ||
149 | first = map->extent[idx].lower_first; | ||
150 | last = first + map->extent[idx].count - 1; | ||
151 | if (id >= first && id <= last) | ||
152 | break; | ||
105 | } | 153 | } |
154 | /* Map the id or note failure */ | ||
155 | if (idx < extents) | ||
156 | id = (id - first) + map->extent[idx].first; | ||
157 | else | ||
158 | id = (u32) -1; | ||
159 | |||
160 | return id; | ||
161 | } | ||
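
Each extent maps a contiguous range: an id in [first, first + count) is shifted by lower_first - first. With a single extent {first = 0, lower_first = 100000, count = 65536}, map_id_down() turns id 1000 into 101000 and leaves 70000 unmapped. A standalone userspace sketch of the same arithmetic (it mirrors, but does not reuse, the kernel structures):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the extent walk in map_id_down() above, outside the kernel. */
struct extent {
	uint32_t first;
	uint32_t lower_first;
	uint32_t count;
};

static uint32_t id_down(const struct extent *e, unsigned n, uint32_t id)
{
	unsigned i;

	for (i = 0; i < n; i++) {
		uint32_t last = e[i].first + e[i].count - 1;

		if (id >= e[i].first && id <= last)
			return (id - e[i].first) + e[i].lower_first;
	}
	return (uint32_t) -1;	/* no mapping */
}

int main(void)
{
	struct extent map[] = {
		{ .first = 0, .lower_first = 100000, .count = 65536 },
	};

	printf("%u\n", (unsigned) id_down(map, 1, 1000));	/* 101000 */
	printf("%u\n", (unsigned) id_down(map, 1, 70000));	/* 4294967295: unmapped */
	return 0;
}
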
162 | |||
163 | /** | ||
164 | * make_kuid - Map a user-namespace uid pair into a kuid. | ||
165 | * @ns: User namespace that the uid is in | ||
166 | * @uid: User identifier | ||
167 | * | ||
168 | * Maps a user-namespace uid pair into a kernel internal kuid, | ||
169 | * and returns that kuid. | ||
170 | * | ||
171 | * When there is no mapping defined for the user-namespace uid | ||
172 | * pair INVALID_UID is returned. Callers are expected to test | ||
173 | * for and handle INVALID_UID being returned. INVALID_UID | ||
174 | * may be tested for using uid_valid(). | ||
175 | */ | ||
176 | kuid_t make_kuid(struct user_namespace *ns, uid_t uid) | ||
177 | { | ||
178 | /* Map the uid to a global kernel uid */ | ||
179 | return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); | ||
180 | } | ||
181 | EXPORT_SYMBOL(make_kuid); | ||
182 | |||
183 | /** | ||
184 | * from_kuid - Create a uid from a kuid user-namespace pair. | ||
185 | * @targ: The user namespace we want a uid in. | ||
186 | * @kuid: The kernel internal uid to start with. | ||
187 | * | ||
188 | * Map @kuid into the user-namespace specified by @targ and | ||
189 | * return the resulting uid. | ||
190 | * | ||
191 | * There is always a mapping into the initial user_namespace. | ||
192 | * | ||
193 | * If @kuid has no mapping in @targ (uid_t)-1 is returned. | ||
194 | */ | ||
195 | uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) | ||
196 | { | ||
197 | /* Map the uid from a global kernel uid */ | ||
198 | return map_id_up(&targ->uid_map, __kuid_val(kuid)); | ||
199 | } | ||
200 | EXPORT_SYMBOL(from_kuid); | ||
106 | 201 | ||
107 | /* No useful relationship so no mapping */ | 202 | /** |
108 | return overflowuid; | 203 | * from_kuid_munged - Create a uid from a kuid user-namespace pair. |
204 | * @targ: The user namespace we want a uid in. | ||
205 | * @kuid: The kernel internal uid to start with. | ||
206 | * | ||
207 | * Map @kuid into the user-namespace specified by @targ and | ||
208 | * return the resulting uid. | ||
209 | * | ||
210 | * There is always a mapping into the initial user_namespace. | ||
211 | * | ||
212 | * Unlike from_kuid, from_kuid_munged never fails and always | ||
213 | * returns a valid uid. This makes from_kuid_munged appropriate | ||
214 | * for use in syscalls like stat and getuid where failing the | ||
215 | * system call and failing to provide a valid uid are not | ||
216 | * options. | ||
217 | * | ||
218 | * If @kuid has no mapping in @targ overflowuid is returned. | ||
219 | */ | ||
220 | uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) | ||
221 | { | ||
222 | uid_t uid; | ||
223 | uid = from_kuid(targ, kuid); | ||
224 | |||
225 | if (uid == (uid_t) -1) | ||
226 | uid = overflowuid; | ||
227 | return uid; | ||
109 | } | 228 | } |
229 | EXPORT_SYMBOL(from_kuid_munged); | ||
110 | 230 | ||
111 | gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) | 231 | /** |
232 | * make_kgid - Map a user-namespace gid pair into a kgid. | ||
233 | * @ns: User namespace that the gid is in | ||
234 | * @gid: group identifier | ||
235 | * | ||
236 | * Maps a user-namespace gid pair into a kernel internal kgid, | ||
237 | * and returns that kgid. | ||
238 | * | ||
239 | * When there is no mapping defined for the user-namespace gid | ||
240 | * pair INVALID_GID is returned. Callers are expected to test | ||
241 | * for and handle INVALID_GID being returned. INVALID_GID may be | ||
242 | * tested for using gid_valid(). | ||
243 | */ | ||
244 | kgid_t make_kgid(struct user_namespace *ns, gid_t gid) | ||
112 | { | 245 | { |
113 | struct user_namespace *tmp; | 246 | /* Map the gid to a global kernel gid */ |
247 | return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); | ||
248 | } | ||
249 | EXPORT_SYMBOL(make_kgid); | ||
114 | 250 | ||
115 | if (likely(to == cred->user->user_ns)) | 251 | /** |
116 | return gid; | 252 | * from_kgid - Create a gid from a kgid user-namespace pair. |
253 | * @targ: The user namespace we want a gid in. | ||
254 | * @kgid: The kernel internal gid to start with. | ||
255 | * | ||
256 | * Map @kgid into the user-namespace specified by @targ and | ||
257 | * return the resulting gid. | ||
258 | * | ||
259 | * There is always a mapping into the initial user_namespace. | ||
260 | * | ||
261 | * If @kgid has no mapping in @targ (gid_t)-1 is returned. | ||
262 | */ | ||
263 | gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) | ||
264 | { | ||
265 | /* Map the gid from a global kernel gid */ | ||
266 | return map_id_up(&targ->gid_map, __kgid_val(kgid)); | ||
267 | } | ||
268 | EXPORT_SYMBOL(from_kgid); | ||
269 | |||
270 | /** | ||
271 | * from_kgid_munged - Create a gid from a kgid user-namespace pair. | ||
272 | * @targ: The user namespace we want a gid in. | ||
273 | * @kgid: The kernel internal gid to start with. | ||
274 | * | ||
275 | * Map @kgid into the user-namespace specified by @targ and | ||
276 | * return the resulting gid. | ||
277 | * | ||
278 | * There is always a mapping into the initial user_namespace. | ||
279 | * | ||
280 | * Unlike from_kgid, from_kgid_munged never fails and always | ||
281 | * returns a valid gid. This makes from_kgid_munged appropriate | ||
282 | * for use in syscalls like stat and getgid where failing the | ||
283 | * system call and failing to provide a valid gid are not options. | ||
284 | * | ||
285 | * If @kgid has no mapping in @targ overflowgid is returned. | ||
286 | */ | ||
287 | gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) | ||
288 | { | ||
289 | gid_t gid; | ||
290 | gid = from_kgid(targ, kgid); | ||
117 | 291 | ||
118 | /* Is cred->user the creator of the target user_ns | 292 | if (gid == (gid_t) -1) |
119 | * or the creator of one of it's parents? | 293 | gid = overflowgid; |
294 | return gid; | ||
295 | } | ||
296 | EXPORT_SYMBOL(from_kgid_munged); | ||
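
Taken together, these helpers sit at every user/kernel boundary: make_kuid()/make_kgid() when an id enters the kernel, from_kuid*()/from_kgid*() when one is reported back. A hedged sketch of the checking pattern a syscall-style path would follow; example_set_owner() and example_report_owner() are illustrative names, not kernel functions.

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/uidgid.h>

/* Accept a uid_t from userspace: reject values with no mapping here. */
static int example_set_owner(uid_t uid, kuid_t *owner)
{
	kuid_t kuid = make_kuid(current_user_ns(), uid);

	if (!uid_valid(kuid))
		return -EINVAL;		/* no mapping in the caller's namespace */

	*owner = kuid;
	return 0;
}

/* Report a kuid_t back to userspace: the _munged variant never fails. */
static uid_t example_report_owner(kuid_t owner)
{
	/* Unmapped ids come back as the sysctl-configurable overflowuid. */
	return from_kuid_munged(current_user_ns(), owner);
}
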
297 | |||
298 | static int uid_m_show(struct seq_file *seq, void *v) | ||
299 | { | ||
300 | struct user_namespace *ns = seq->private; | ||
301 | struct uid_gid_extent *extent = v; | ||
302 | struct user_namespace *lower_ns; | ||
303 | uid_t lower; | ||
304 | |||
305 | lower_ns = current_user_ns(); | ||
306 | if ((lower_ns == ns) && lower_ns->parent) | ||
307 | lower_ns = lower_ns->parent; | ||
308 | |||
309 | lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); | ||
310 | |||
311 | seq_printf(seq, "%10u %10u %10u\n", | ||
312 | extent->first, | ||
313 | lower, | ||
314 | extent->count); | ||
315 | |||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static int gid_m_show(struct seq_file *seq, void *v) | ||
320 | { | ||
321 | struct user_namespace *ns = seq->private; | ||
322 | struct uid_gid_extent *extent = v; | ||
323 | struct user_namespace *lower_ns; | ||
324 | gid_t lower; | ||
325 | |||
326 | lower_ns = current_user_ns(); | ||
327 | if ((lower_ns == ns) && lower_ns->parent) | ||
328 | lower_ns = lower_ns->parent; | ||
329 | |||
330 | lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); | ||
331 | |||
332 | seq_printf(seq, "%10u %10u %10u\n", | ||
333 | extent->first, | ||
334 | lower, | ||
335 | extent->count); | ||
336 | |||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | ||
341 | { | ||
342 | struct uid_gid_extent *extent = NULL; | ||
343 | loff_t pos = *ppos; | ||
344 | |||
345 | if (pos < map->nr_extents) | ||
346 | extent = &map->extent[pos]; | ||
347 | |||
348 | return extent; | ||
349 | } | ||
350 | |||
351 | static void *uid_m_start(struct seq_file *seq, loff_t *ppos) | ||
352 | { | ||
353 | struct user_namespace *ns = seq->private; | ||
354 | |||
355 | return m_start(seq, ppos, &ns->uid_map); | ||
356 | } | ||
357 | |||
358 | static void *gid_m_start(struct seq_file *seq, loff_t *ppos) | ||
359 | { | ||
360 | struct user_namespace *ns = seq->private; | ||
361 | |||
362 | return m_start(seq, ppos, &ns->gid_map); | ||
363 | } | ||
364 | |||
365 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) | ||
366 | { | ||
367 | (*pos)++; | ||
368 | return seq->op->start(seq, pos); | ||
369 | } | ||
370 | |||
371 | static void m_stop(struct seq_file *seq, void *v) | ||
372 | { | ||
373 | return; | ||
374 | } | ||
375 | |||
376 | struct seq_operations proc_uid_seq_operations = { | ||
377 | .start = uid_m_start, | ||
378 | .stop = m_stop, | ||
379 | .next = m_next, | ||
380 | .show = uid_m_show, | ||
381 | }; | ||
382 | |||
383 | struct seq_operations proc_gid_seq_operations = { | ||
384 | .start = gid_m_start, | ||
385 | .stop = m_stop, | ||
386 | .next = m_next, | ||
387 | .show = gid_m_show, | ||
388 | }; | ||
389 | |||
390 | static DEFINE_MUTEX(id_map_mutex); | ||
391 | |||
392 | static ssize_t map_write(struct file *file, const char __user *buf, | ||
393 | size_t count, loff_t *ppos, | ||
394 | int cap_setid, | ||
395 | struct uid_gid_map *map, | ||
396 | struct uid_gid_map *parent_map) | ||
397 | { | ||
398 | struct seq_file *seq = file->private_data; | ||
399 | struct user_namespace *ns = seq->private; | ||
400 | struct uid_gid_map new_map; | ||
401 | unsigned idx; | ||
402 | struct uid_gid_extent *extent, *last = NULL; | ||
403 | unsigned long page = 0; | ||
404 | char *kbuf, *pos, *next_line; | ||
405 | ssize_t ret = -EINVAL; | ||
406 | |||
407 | /* | ||
408 | * The id_map_mutex serializes all writes to any given map. | ||
409 | * | ||
410 | * Any map is only ever written once. | ||
411 | * | ||
412 | * An id map fits within 1 cache line on most architectures. | ||
413 | * | ||
414 | * On read nothing needs to be done unless you are on an | ||
415 | * architecture with a crazy cache coherency model like alpha. | ||
416 | * | ||
417 | * There is a one time data dependency between reading the | ||
418 | * count of the extents and the values of the extents. The | ||
419 | * desired behavior is to see the values of the extents that | ||
420 | * were written before the count of the extents. | ||
421 | * | ||
422 | * To achieve this smp_wmb() is used to guarantee the write | ||
423 | * order and smp_read_barrier_depends() guarantees that we | ||
424 | * don't have crazy architectures returning stale data. | ||
425 | * | ||
120 | */ | 426 | */ |
121 | for ( tmp = to; tmp != &init_user_ns; | 427 | mutex_lock(&id_map_mutex); |
122 | tmp = tmp->creator->user_ns ) { | 428 | |
123 | if (cred->user == tmp->creator) { | 429 | ret = -EPERM; |
124 | return (gid_t)0; | 430 | /* Only allow one successful write to the map */ |
431 | if (map->nr_extents != 0) | ||
432 | goto out; | ||
433 | |||
434 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID | ||
435 | * over the user namespace in order to set the id mapping. | ||
436 | */ | ||
437 | if (!ns_capable(ns, cap_setid)) | ||
438 | goto out; | ||
439 | |||
440 | /* Get a buffer */ | ||
441 | ret = -ENOMEM; | ||
442 | page = __get_free_page(GFP_TEMPORARY); | ||
443 | kbuf = (char *) page; | ||
444 | if (!page) | ||
445 | goto out; | ||
446 | |||
447 | /* Only allow <= page size writes at the beginning of the file */ | ||
448 | ret = -EINVAL; | ||
449 | if ((*ppos != 0) || (count >= PAGE_SIZE)) | ||
450 | goto out; | ||
451 | |||
452 | /* Slurp in the user data */ | ||
453 | ret = -EFAULT; | ||
454 | if (copy_from_user(kbuf, buf, count)) | ||
455 | goto out; | ||
456 | kbuf[count] = '\0'; | ||
457 | |||
458 | /* Parse the user data */ | ||
459 | ret = -EINVAL; | ||
460 | pos = kbuf; | ||
461 | new_map.nr_extents = 0; | ||
462 | for (;pos; pos = next_line) { | ||
463 | extent = &new_map.extent[new_map.nr_extents]; | ||
464 | |||
465 | /* Find the end of line and ensure I don't look past it */ | ||
466 | next_line = strchr(pos, '\n'); | ||
467 | if (next_line) { | ||
468 | *next_line = '\0'; | ||
469 | next_line++; | ||
470 | if (*next_line == '\0') | ||
471 | next_line = NULL; | ||
125 | } | 472 | } |
473 | |||
474 | pos = skip_spaces(pos); | ||
475 | extent->first = simple_strtoul(pos, &pos, 10); | ||
476 | if (!isspace(*pos)) | ||
477 | goto out; | ||
478 | |||
479 | pos = skip_spaces(pos); | ||
480 | extent->lower_first = simple_strtoul(pos, &pos, 10); | ||
481 | if (!isspace(*pos)) | ||
482 | goto out; | ||
483 | |||
484 | pos = skip_spaces(pos); | ||
485 | extent->count = simple_strtoul(pos, &pos, 10); | ||
486 | if (*pos && !isspace(*pos)) | ||
487 | goto out; | ||
488 | |||
489 | /* Verify there is no trailing junk on the line */ | ||
490 | pos = skip_spaces(pos); | ||
491 | if (*pos != '\0') | ||
492 | goto out; | ||
493 | |||
494 | /* Verify we have been given valid starting values */ | ||
495 | if ((extent->first == (u32) -1) || | ||
496 | (extent->lower_first == (u32) -1 )) | ||
497 | goto out; | ||
498 | |||
499 | /* Verify count is not zero and does not cause the extent to wrap */ | ||
500 | if ((extent->first + extent->count) <= extent->first) | ||
501 | goto out; | ||
502 | if ((extent->lower_first + extent->count) <= extent->lower_first) | ||
503 | goto out; | ||
504 | |||
505 | /* For now only accept extents that are strictly in order */ | ||
506 | if (last && | ||
507 | (((last->first + last->count) > extent->first) || | ||
508 | ((last->lower_first + last->count) > extent->lower_first))) | ||
509 | goto out; | ||
510 | |||
511 | new_map.nr_extents++; | ||
512 | last = extent; | ||
513 | |||
514 | /* Fail if the file contains too many extents */ | ||
515 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && | ||
516 | (next_line != NULL)) | ||
517 | goto out; | ||
518 | } | ||
519 | /* Be very certain the new map actually exists */ | ||
520 | if (new_map.nr_extents == 0) | ||
521 | goto out; | ||
522 | |||
523 | ret = -EPERM; | ||
524 | /* Validate that the user is allowed to use the user ids being mapped to. */ | ||
525 | if (!new_idmap_permitted(ns, cap_setid, &new_map)) | ||
526 | goto out; | ||
527 | |||
528 | /* Map the lower ids from the parent user namespace to the | ||
529 | * kernel global id space. | ||
530 | */ | ||
531 | for (idx = 0; idx < new_map.nr_extents; idx++) { | ||
532 | u32 lower_first; | ||
533 | extent = &new_map.extent[idx]; | ||
534 | |||
535 | lower_first = map_id_range_down(parent_map, | ||
536 | extent->lower_first, | ||
537 | extent->count); | ||
538 | |||
539 | /* Fail if we can not map the specified extent to | ||
540 | * the kernel global id space. | ||
541 | */ | ||
542 | if (lower_first == (u32) -1) | ||
543 | goto out; | ||
544 | |||
545 | extent->lower_first = lower_first; | ||
126 | } | 546 | } |
127 | 547 | ||
128 | /* No useful relationship so no mapping */ | 548 | /* Install the map */ |
129 | return overflowgid; | 549 | memcpy(map->extent, new_map.extent, |
550 | new_map.nr_extents*sizeof(new_map.extent[0])); | ||
551 | smp_wmb(); | ||
552 | map->nr_extents = new_map.nr_extents; | ||
553 | |||
554 | *ppos = count; | ||
555 | ret = count; | ||
556 | out: | ||
557 | mutex_unlock(&id_map_mutex); | ||
558 | if (page) | ||
559 | free_page(page); | ||
560 | return ret; | ||
561 | } | ||
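
The installation at the end of map_write() is a publish pattern: fill in the extents, issue smp_wmb(), then store nr_extents; readers load nr_extents, issue smp_read_barrier_depends(), and walk at most that many extents. A condensed sketch of the pairing with writer and reader side by side (kernel-context code assuming the uid_gid_map structures above; publish_map() and lookup_lower_first() are illustrative names):

#include <linux/string.h>
#include <linux/user_namespace.h>

/* Writer: make the extents visible before publishing the count. */
static void publish_map(struct uid_gid_map *map, const struct uid_gid_map *new_map)
{
	memcpy(map->extent, new_map->extent,
	       new_map->nr_extents * sizeof(new_map->extent[0]));
	smp_wmb();				/* extents before nr_extents */
	map->nr_extents = new_map->nr_extents;	/* readers see 0 or the full map */
}

/* Reader: never walk more extents than were published. */
static u32 lookup_lower_first(struct uid_gid_map *map, u32 id)
{
	unsigned idx, extents = map->nr_extents;

	smp_read_barrier_depends();		/* pairs with smp_wmb() above */
	for (idx = 0; idx < extents; idx++) {
		u32 first = map->extent[idx].first;
		u32 last = first + map->extent[idx].count - 1;

		if (id >= first && id <= last)
			return (id - first) + map->extent[idx].lower_first;
	}
	return (u32) -1;			/* no mapping */
}
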
562 | |||
563 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
564 | { | ||
565 | struct seq_file *seq = file->private_data; | ||
566 | struct user_namespace *ns = seq->private; | ||
567 | |||
568 | if (!ns->parent) | ||
569 | return -EPERM; | ||
570 | |||
571 | return map_write(file, buf, size, ppos, CAP_SETUID, | ||
572 | &ns->uid_map, &ns->parent->uid_map); | ||
573 | } | ||
574 | |||
575 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
576 | { | ||
577 | struct seq_file *seq = file->private_data; | ||
578 | struct user_namespace *ns = seq->private; | ||
579 | |||
580 | if (!ns->parent) | ||
581 | return -EPERM; | ||
582 | |||
583 | return map_write(file, buf, size, ppos, CAP_SETGID, | ||
584 | &ns->gid_map, &ns->parent->gid_map); | ||
585 | } | ||
586 | |||
587 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
588 | struct uid_gid_map *new_map) | ||
589 | { | ||
590 | /* Allow the specified ids if we have the appropriate capability | ||
591 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. | ||
592 | */ | ||
593 | if (ns_capable(ns->parent, cap_setid)) | ||
594 | return true; | ||
595 | |||
596 | return false; | ||
130 | } | 597 | } |
131 | 598 | ||
132 | static __init int user_namespaces_init(void) | 599 | static __init int user_namespaces_init(void) |
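
The map files accept up to UID_GID_MAP_MAX_EXTENTS whitespace-separated "first lower-first count" triples in a single write, and only the first successful write is accepted; later writes fail with -EPERM. A userspace sketch of a privileged parent populating a child's uid_map (write_uid_map() is illustrative and assumes a kernel exposing these proc files and a child created with CLONE_NEWUSER):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Map uid 0 inside the child's new user namespace to uid 100000 in the
 * parent namespace, for a range of 65536 ids.  The whole map must go
 * in one write(); a second write fails with EPERM.
 */
static int write_uid_map(pid_t child)
{
	char path[64];
	const char *map = "0 100000 65536\n";
	ssize_t len = (ssize_t) strlen(map);
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/uid_map", (int) child);

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	if (write(fd, map, len) != len) {
		close(fd);
		return -1;
	}
	return close(fd);
}
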
diff --git a/kernel/utsname.c b/kernel/utsname.c index 405caf91aad5..679d97a5d3fd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | |||
43 | 43 | ||
44 | down_read(&uts_sem); | 44 | down_read(&uts_sem); |
45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); | 46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); |
47 | up_read(&uts_sem); | 47 | up_read(&uts_sem); |
48 | return ns; | 48 | return ns; |
49 | } | 49 | } |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index df30ee08bdd4..e5e1d85b8c7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/sysctl.h> | 24 | #include <linux/sysctl.h> |
25 | 25 | ||
26 | #include <asm/irq_regs.h> | 26 | #include <asm/irq_regs.h> |
27 | #include <linux/kvm_para.h> | ||
27 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
28 | 29 | ||
29 | int watchdog_enabled = 1; | 30 | int watchdog_enabled = 1; |
@@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
280 | __this_cpu_write(softlockup_touch_sync, false); | 281 | __this_cpu_write(softlockup_touch_sync, false); |
281 | sched_clock_tick(); | 282 | sched_clock_tick(); |
282 | } | 283 | } |
284 | |||
285 | /* Clear the guest paused flag on watchdog reset */ | ||
286 | kvm_check_and_clear_guest_paused(); | ||
283 | __touch_watchdog(); | 287 | __touch_watchdog(); |
284 | return HRTIMER_RESTART; | 288 | return HRTIMER_RESTART; |
285 | } | 289 | } |
@@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
292 | */ | 296 | */ |
293 | duration = is_softlockup(touch_ts); | 297 | duration = is_softlockup(touch_ts); |
294 | if (unlikely(duration)) { | 298 | if (unlikely(duration)) { |
299 | /* | ||
300 | * If a virtual machine is stopped by the host it can look to | ||
301 | * the watchdog like a soft lockup, check to see if the host | ||
302 | * stopped the vm before we issue the warning | ||
303 | */ | ||
304 | if (kvm_check_and_clear_guest_paused()) | ||
305 | return HRTIMER_RESTART; | ||
306 | |||
295 | /* only warn once */ | 307 | /* only warn once */ |
296 | if (__this_cpu_read(soft_watchdog_warn) == true) | 308 | if (__this_cpu_read(soft_watchdog_warn) == true) |
297 | return HRTIMER_RESTART; | 309 | return HRTIMER_RESTART; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c08..9a3128dc67df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1032 | cwq = get_cwq(gcwq->cpu, wq); | 1032 | cwq = get_cwq(gcwq->cpu, wq); |
1033 | trace_workqueue_queue_work(cpu, cwq, work); | 1033 | trace_workqueue_queue_work(cpu, cwq, work); |
1034 | 1034 | ||
1035 | BUG_ON(!list_empty(&work->entry)); | 1035 | if (WARN_ON(!list_empty(&work->entry))) { |
1036 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
1037 | return; | ||
1038 | } | ||
1036 | 1039 | ||
1037 | cwq->nr_in_flight[cwq->work_color]++; | 1040 | cwq->nr_in_flight[cwq->work_color]++; |
1038 | work_flags = work_color_to_flags(cwq->work_color); | 1041 | work_flags = work_color_to_flags(cwq->work_color); |
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) | |||
1210 | } else | 1213 | } else |
1211 | wake_up_all(&gcwq->trustee_wait); | 1214 | wake_up_all(&gcwq->trustee_wait); |
1212 | 1215 | ||
1213 | /* sanity check nr_running */ | 1216 | /* |
1214 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | 1217 | * Sanity check nr_running. Because trustee releases gcwq->lock |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | ||
1219 | * warning may trigger spuriously. Check iff trustee is idle. | ||
1220 | */ | ||
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | ||
1222 | gcwq->nr_workers == gcwq->nr_idle && | ||
1215 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); |
1216 | } | 1224 | } |
1217 | 1225 | ||
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) | |||
1810 | * lock freed" warnings as well as problems when looking into | 1818 | * lock freed" warnings as well as problems when looking into |
1811 | * work->lockdep_map, make a copy and use that here. | 1819 | * work->lockdep_map, make a copy and use that here. |
1812 | */ | 1820 | */ |
1813 | struct lockdep_map lockdep_map = work->lockdep_map; | 1821 | struct lockdep_map lockdep_map; |
1822 | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | ||
1814 | #endif | 1824 | #endif |
1815 | /* | 1825 | /* |
1816 | * A single work shouldn't be executed concurrently by | 1826 | * A single work shouldn't be executed concurrently by |
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) | |||
2506 | { | 2516 | { |
2507 | struct wq_barrier barr; | 2517 | struct wq_barrier barr; |
2508 | 2518 | ||
2519 | lock_map_acquire(&work->lockdep_map); | ||
2520 | lock_map_release(&work->lockdep_map); | ||
2521 | |||
2509 | if (start_flush_work(work, &barr, true)) { | 2522 | if (start_flush_work(work, &barr, true)) { |
2510 | wait_for_completion(&barr.done); | 2523 | wait_for_completion(&barr.done); |
2511 | destroy_work_on_stack(&barr.work); | 2524 | destroy_work_on_stack(&barr.work); |
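
The unconditional lock_map_acquire()/lock_map_release() pair added to flush_work() is a lockdep-only annotation: it records that the flusher "acquires" the work item's pseudo-lock even when the work is idle, so a deadlock-prone ordering is reported every time rather than only when the race window is actually hit. A hedged sketch of the kind of bug this catches (the demo_* names are illustrative; the workqueue and mutex APIs are the 3.5-era ones used above):

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(demo_lock);
static struct work_struct demo_work;

static void demo_work_fn(struct work_struct *work)
{
	mutex_lock(&demo_lock);		/* the work item takes demo_lock */
	/* ... */
	mutex_unlock(&demo_lock);
}

static void demo_init(void)
{
	INIT_WORK(&demo_work, demo_work_fn);
}

static void buggy_teardown(void)
{
	mutex_lock(&demo_lock);
	/*
	 * Deadlock-prone: if demo_work_fn() is running or queued it will
	 * block on demo_lock, while we block here waiting for it.  With
	 * the annotation above, lockdep reports this ordering even when
	 * the work happens to be idle.
	 */
	flush_work(&demo_work);
	mutex_unlock(&demo_lock);
}
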