Diffstat (limited to 'kernel')
104 files changed, 11722 insertions, 4271 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..c0cc67ad764c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,12 @@ | |||
5 | obj-y = fork.o exec_domain.o panic.o printk.o \ | 5 | obj-y = fork.o exec_domain.o panic.o printk.o \ |
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
13 | async.o range.o groups.o | 13 | async.o range.o groups.o lglock.o |
14 | 14 | ||
15 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
16 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
@@ -25,6 +25,9 @@ endif | |||
25 | obj-y += sched/ | 25 | obj-y += sched/ |
26 | obj-y += power/ | 26 | obj-y += power/ |
27 | 27 | ||
28 | ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) | ||
29 | obj-$(CONFIG_X86) += kcmp.o | ||
30 | endif | ||
28 | obj-$(CONFIG_FREEZER) += freezer.o | 31 | obj-$(CONFIG_FREEZER) += freezer.o |
29 | obj-$(CONFIG_PROFILING) += profile.o | 32 | obj-$(CONFIG_PROFILING) += profile.o |
30 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 33 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
@@ -43,6 +46,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | |||
43 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 46 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
44 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 47 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
45 | obj-$(CONFIG_SMP) += smp.o | 48 | obj-$(CONFIG_SMP) += smp.o |
49 | obj-$(CONFIG_SMP) += smpboot.o | ||
46 | ifneq ($(CONFIG_SMP),y) | 50 | ifneq ($(CONFIG_SMP),y) |
47 | obj-y += up.o | 51 | obj-y += up.o |
48 | endif | 52 | endif |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@ | |||
67 | #include <linux/syscalls.h> | 67 | #include <linux/syscalls.h> |
68 | #include <linux/capability.h> | 68 | #include <linux/capability.h> |
69 | #include <linux/fs_struct.h> | 69 | #include <linux/fs_struct.h> |
70 | #include <linux/compat.h> | ||
70 | 71 | ||
71 | #include "audit.h" | 72 | #include "audit.h" |
72 | 73 | ||
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr) | |||
2710 | audit_log_end(ab); | 2711 | audit_log_end(ab); |
2711 | } | 2712 | } |
2712 | 2713 | ||
2713 | void __audit_seccomp(unsigned long syscall) | 2714 | void __audit_seccomp(unsigned long syscall, long signr, int code) |
2714 | { | 2715 | { |
2715 | struct audit_buffer *ab; | 2716 | struct audit_buffer *ab; |
2716 | 2717 | ||
2717 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2718 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2718 | audit_log_abend(ab, "seccomp", SIGKILL); | 2719 | audit_log_abend(ab, "seccomp", signr); |
2719 | audit_log_format(ab, " syscall=%ld", syscall); | 2720 | audit_log_format(ab, " syscall=%ld", syscall); |
2721 | audit_log_format(ab, " compat=%d", is_compat_task()); | ||
2722 | audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); | ||
2723 | audit_log_format(ab, " code=0x%x", code); | ||
2720 | audit_log_end(ab); | 2724 | audit_log_end(ab); |
2721 | } | 2725 | } |
2722 | 2726 | ||
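The hunk above widens __audit_seccomp() so that callers pass the delivered signal and the seccomp return value, and the record gains compat, ip and code fields. A minimal sketch of a call site, assuming a seccomp filter path that kills with SIGSYS; the SECCOMP_RET_KILL constant and the surrounding logic are assumptions, not part of this diff:

#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>

static void report_blocked_syscall(unsigned long this_syscall)
{
	/* signal delivered to the task and the filter's verdict */
	__audit_seccomp(this_syscall, SIGSYS, SECCOMP_RET_KILL);
}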
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c6470..493d97259484 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -419,3 +419,24 @@ bool nsown_capable(int cap) | |||
419 | { | 419 | { |
420 | return ns_capable(current_user_ns(), cap); | 420 | return ns_capable(current_user_ns(), cap); |
421 | } | 421 | } |
422 | |||
423 | /** | ||
424 | * inode_capable - Check superior capability over inode | ||
425 | * @inode: The inode in question | ||
426 | * @cap: The capability in question | ||
427 | * | ||
428 | * Return true if the current task has the given superior capability | ||
429 | * targeted at its own user namespace and that the given inode is owned | ||
430 | * by the current user namespace or a child namespace. | ||
431 | * | ||
432 | * Currently we check to see if an inode is owned by the current | ||
433 | * user namespace by seeing if the inode's owner maps into the | ||
434 | * current user namespace. | ||
435 | * | ||
436 | */ | ||
437 | bool inode_capable(const struct inode *inode, int cap) | ||
438 | { | ||
439 | struct user_namespace *ns = current_user_ns(); | ||
440 | |||
441 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); | ||
442 | } | ||
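inode_capable() pairs a capability check with a check that the inode's owner maps into the caller's user namespace. A sketch of how an ownership-change check might use it; may_change_owner() and the surrounding policy are illustrative, not taken from this diff:

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/uidgid.h>

static bool may_change_owner(const struct inode *inode)
{
	/* the owner may always act on the inode... */
	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;
	/* ...otherwise require CAP_CHOWN over a namespace that covers it */
	return inode_capable(inode, CAP_CHOWN);
}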
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@ | |||
60 | #include <linux/eventfd.h> | 60 | #include <linux/eventfd.h> |
61 | #include <linux/poll.h> | 61 | #include <linux/poll.h> |
62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | 62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ |
63 | #include <linux/kthread.h> | ||
63 | 64 | ||
64 | #include <linux/atomic.h> | 65 | #include <linux/atomic.h> |
65 | 66 | ||
67 | /* css deactivation bias, makes css->refcnt negative to deny new trygets */ | ||
68 | #define CSS_DEACT_BIAS INT_MIN | ||
69 | |||
66 | /* | 70 | /* |
67 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 71 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
68 | * hierarchy must be performed while holding it. | 72 | * hierarchy must be performed while holding it. |
@@ -127,6 +131,9 @@ struct cgroupfs_root { | |||
127 | /* A list running through the active hierarchies */ | 131 | /* A list running through the active hierarchies */ |
128 | struct list_head root_list; | 132 | struct list_head root_list; |
129 | 133 | ||
134 | /* All cgroups on this root, cgroup_mutex protected */ | ||
135 | struct list_head allcg_list; | ||
136 | |||
130 | /* Hierarchy-specific flags */ | 137 | /* Hierarchy-specific flags */ |
131 | unsigned long flags; | 138 | unsigned long flags; |
132 | 139 | ||
@@ -145,6 +152,15 @@ struct cgroupfs_root { | |||
145 | static struct cgroupfs_root rootnode; | 152 | static struct cgroupfs_root rootnode; |
146 | 153 | ||
147 | /* | 154 | /* |
155 | * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. | ||
156 | */ | ||
157 | struct cfent { | ||
158 | struct list_head node; | ||
159 | struct dentry *dentry; | ||
160 | struct cftype *type; | ||
161 | }; | ||
162 | |||
163 | /* | ||
148 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | 164 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when |
149 | * cgroup_subsys->use_id != 0. | 165 | * cgroup_subsys->use_id != 0. |
150 | */ | 166 | */ |
@@ -239,6 +255,19 @@ int cgroup_lock_is_held(void) | |||
239 | 255 | ||
240 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
241 | 257 | ||
258 | static int css_unbias_refcnt(int refcnt) | ||
259 | { | ||
260 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; | ||
261 | } | ||
262 | |||
263 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | ||
264 | static int css_refcnt(struct cgroup_subsys_state *css) | ||
265 | { | ||
266 | int v = atomic_read(&css->refcnt); | ||
267 | |||
268 | return css_unbias_refcnt(v); | ||
269 | } | ||
270 | |||
242 | /* convenient tests for these bits */ | 271 | /* convenient tests for these bits */ |
243 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 272 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
244 | { | 273 | { |
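The deactivation bias relies on simple signed arithmetic: adding INT_MIN pushes the count negative so css_tryget() can tell a dying css from a live one, and css_unbias_refcnt() subtracts the bias back out to recover the true count. A standalone sketch of the same arithmetic (userspace, plain int instead of atomic_t):

#include <limits.h>
#include <stdio.h>

#define CSS_DEACT_BIAS INT_MIN

static int unbias(int refcnt)
{
	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
}

int main(void)
{
	int refcnt = 3;			/* three live references */

	refcnt += CSS_DEACT_BIAS;	/* deactivate: count goes negative */
	printf("biased=%d real=%d\n", refcnt, unbias(refcnt));	/* real=3 */

	refcnt -= CSS_DEACT_BIAS;	/* removal aborted: restore */
	printf("restored=%d\n", refcnt);			/* 3 again */
	return 0;
}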
@@ -279,6 +308,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) | |||
279 | #define for_each_active_root(_root) \ | 308 | #define for_each_active_root(_root) \ |
280 | list_for_each_entry(_root, &roots, root_list) | 309 | list_for_each_entry(_root, &roots, root_list) |
281 | 310 | ||
311 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
312 | { | ||
313 | return dentry->d_fsdata; | ||
314 | } | ||
315 | |||
316 | static inline struct cfent *__d_cfe(struct dentry *dentry) | ||
317 | { | ||
318 | return dentry->d_fsdata; | ||
319 | } | ||
320 | |||
321 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
322 | { | ||
323 | return __d_cfe(dentry)->type; | ||
324 | } | ||
325 | |||
282 | /* the list of cgroups eligible for automatic release. Protected by | 326 | /* the list of cgroups eligible for automatic release. Protected by |
283 | * release_list_lock */ | 327 | * release_list_lock */ |
284 | static LIST_HEAD(release_list); | 328 | static LIST_HEAD(release_list); |
@@ -816,12 +860,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
816 | struct cgroup_subsys *ss; | 860 | struct cgroup_subsys *ss; |
817 | int ret = 0; | 861 | int ret = 0; |
818 | 862 | ||
819 | for_each_subsys(cgrp->root, ss) | 863 | for_each_subsys(cgrp->root, ss) { |
820 | if (ss->pre_destroy) { | 864 | if (!ss->pre_destroy) |
821 | ret = ss->pre_destroy(cgrp); | 865 | continue; |
822 | if (ret) | 866 | |
823 | break; | 867 | ret = ss->pre_destroy(cgrp); |
868 | if (ret) { | ||
869 | /* ->pre_destroy() failure is being deprecated */ | ||
870 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
871 | break; | ||
824 | } | 872 | } |
873 | } | ||
825 | 874 | ||
826 | return ret; | 875 | return ret; |
827 | } | 876 | } |
@@ -864,6 +913,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
864 | BUG_ON(!list_empty(&cgrp->pidlists)); | 913 | BUG_ON(!list_empty(&cgrp->pidlists)); |
865 | 914 | ||
866 | kfree_rcu(cgrp, rcu_head); | 915 | kfree_rcu(cgrp, rcu_head); |
916 | } else { | ||
917 | struct cfent *cfe = __d_cfe(dentry); | ||
918 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | ||
919 | |||
920 | WARN_ONCE(!list_empty(&cfe->node) && | ||
921 | cgrp != &cgrp->root->top_cgroup, | ||
922 | "cfe still linked for %s\n", cfe->type->name); | ||
923 | kfree(cfe); | ||
867 | } | 924 | } |
868 | iput(inode); | 925 | iput(inode); |
869 | } | 926 | } |
@@ -882,34 +939,36 @@ static void remove_dir(struct dentry *d) | |||
882 | dput(parent); | 939 | dput(parent); |
883 | } | 940 | } |
884 | 941 | ||
885 | static void cgroup_clear_directory(struct dentry *dentry) | 942 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
886 | { | 943 | { |
887 | struct list_head *node; | 944 | struct cfent *cfe; |
888 | 945 | ||
889 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 946 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
890 | spin_lock(&dentry->d_lock); | 947 | lockdep_assert_held(&cgroup_mutex); |
891 | node = dentry->d_subdirs.next; | 948 | |
892 | while (node != &dentry->d_subdirs) { | 949 | list_for_each_entry(cfe, &cgrp->files, node) { |
893 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 950 | struct dentry *d = cfe->dentry; |
894 | 951 | ||
895 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | 952 | if (cft && cfe->type != cft) |
896 | list_del_init(node); | 953 | continue; |
897 | if (d->d_inode) { | 954 | |
898 | /* This should never be called on a cgroup | 955 | dget(d); |
899 | * directory with child cgroups */ | 956 | d_delete(d); |
900 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 957 | simple_unlink(d->d_inode, d); |
901 | dget_dlock(d); | 958 | list_del_init(&cfe->node); |
902 | spin_unlock(&d->d_lock); | 959 | dput(d); |
903 | spin_unlock(&dentry->d_lock); | 960 | |
904 | d_delete(d); | 961 | return 0; |
905 | simple_unlink(dentry->d_inode, d); | ||
906 | dput(d); | ||
907 | spin_lock(&dentry->d_lock); | ||
908 | } else | ||
909 | spin_unlock(&d->d_lock); | ||
910 | node = dentry->d_subdirs.next; | ||
911 | } | 962 | } |
912 | spin_unlock(&dentry->d_lock); | 963 | return -ENOENT; |
964 | } | ||
965 | |||
966 | static void cgroup_clear_directory(struct dentry *dir) | ||
967 | { | ||
968 | struct cgroup *cgrp = __d_cgrp(dir); | ||
969 | |||
970 | while (!list_empty(&cgrp->files)) | ||
971 | cgroup_rm_file(cgrp, NULL); | ||
913 | } | 972 | } |
914 | 973 | ||
915 | /* | 974 | /* |
@@ -1294,6 +1353,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1294 | if (ret) | 1353 | if (ret) |
1295 | goto out_unlock; | 1354 | goto out_unlock; |
1296 | 1355 | ||
1356 | /* See feature-removal-schedule.txt */ | ||
1357 | if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) | ||
1358 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | ||
1359 | task_tgid_nr(current), current->comm); | ||
1360 | |||
1297 | /* Don't allow flags or name to change at remount */ | 1361 | /* Don't allow flags or name to change at remount */ |
1298 | if (opts.flags != root->flags || | 1362 | if (opts.flags != root->flags || |
1299 | (opts.name && strcmp(opts.name, root->name))) { | 1363 | (opts.name && strcmp(opts.name, root->name))) { |
@@ -1308,7 +1372,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1308 | goto out_unlock; | 1372 | goto out_unlock; |
1309 | } | 1373 | } |
1310 | 1374 | ||
1311 | /* (re)populate subsystem files */ | 1375 | /* clear out any existing files and repopulate subsystem files */ |
1376 | cgroup_clear_directory(cgrp->dentry); | ||
1312 | cgroup_populate_dir(cgrp); | 1377 | cgroup_populate_dir(cgrp); |
1313 | 1378 | ||
1314 | if (opts.release_agent) | 1379 | if (opts.release_agent) |
@@ -1333,6 +1398,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1333 | { | 1398 | { |
1334 | INIT_LIST_HEAD(&cgrp->sibling); | 1399 | INIT_LIST_HEAD(&cgrp->sibling); |
1335 | INIT_LIST_HEAD(&cgrp->children); | 1400 | INIT_LIST_HEAD(&cgrp->children); |
1401 | INIT_LIST_HEAD(&cgrp->files); | ||
1336 | INIT_LIST_HEAD(&cgrp->css_sets); | 1402 | INIT_LIST_HEAD(&cgrp->css_sets); |
1337 | INIT_LIST_HEAD(&cgrp->release_list); | 1403 | INIT_LIST_HEAD(&cgrp->release_list); |
1338 | INIT_LIST_HEAD(&cgrp->pidlists); | 1404 | INIT_LIST_HEAD(&cgrp->pidlists); |
@@ -1344,11 +1410,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1344 | static void init_cgroup_root(struct cgroupfs_root *root) | 1410 | static void init_cgroup_root(struct cgroupfs_root *root) |
1345 | { | 1411 | { |
1346 | struct cgroup *cgrp = &root->top_cgroup; | 1412 | struct cgroup *cgrp = &root->top_cgroup; |
1413 | |||
1347 | INIT_LIST_HEAD(&root->subsys_list); | 1414 | INIT_LIST_HEAD(&root->subsys_list); |
1348 | INIT_LIST_HEAD(&root->root_list); | 1415 | INIT_LIST_HEAD(&root->root_list); |
1416 | INIT_LIST_HEAD(&root->allcg_list); | ||
1349 | root->number_of_cgroups = 1; | 1417 | root->number_of_cgroups = 1; |
1350 | cgrp->root = root; | 1418 | cgrp->root = root; |
1351 | cgrp->top_cgroup = cgrp; | 1419 | cgrp->top_cgroup = cgrp; |
1420 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1352 | init_cgroup_housekeeping(cgrp); | 1421 | init_cgroup_housekeeping(cgrp); |
1353 | } | 1422 | } |
1354 | 1423 | ||
@@ -1692,16 +1761,6 @@ static struct file_system_type cgroup_fs_type = { | |||
1692 | 1761 | ||
1693 | static struct kobject *cgroup_kobj; | 1762 | static struct kobject *cgroup_kobj; |
1694 | 1763 | ||
1695 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
1696 | { | ||
1697 | return dentry->d_fsdata; | ||
1698 | } | ||
1699 | |||
1700 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
1701 | { | ||
1702 | return dentry->d_fsdata; | ||
1703 | } | ||
1704 | |||
1705 | /** | 1764 | /** |
1706 | * cgroup_path - generate the path of a cgroup | 1765 | * cgroup_path - generate the path of a cgroup |
1707 | * @cgrp: the cgroup in question | 1766 | * @cgrp: the cgroup in question |
@@ -2160,9 +2219,9 @@ retry_find_task: | |||
2160 | * only need to check permissions on one of them. | 2219 | * only need to check permissions on one of them. |
2161 | */ | 2220 | */ |
2162 | tcred = __task_cred(tsk); | 2221 | tcred = __task_cred(tsk); |
2163 | if (cred->euid && | 2222 | if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && |
2164 | cred->euid != tcred->uid && | 2223 | !uid_eq(cred->euid, tcred->uid) && |
2165 | cred->euid != tcred->suid) { | 2224 | !uid_eq(cred->euid, tcred->suid)) { |
2166 | rcu_read_unlock(); | 2225 | rcu_read_unlock(); |
2167 | ret = -EACCES; | 2226 | ret = -EACCES; |
2168 | goto out_unlock_cgroup; | 2227 | goto out_unlock_cgroup; |
@@ -2172,6 +2231,18 @@ retry_find_task: | |||
2172 | 2231 | ||
2173 | if (threadgroup) | 2232 | if (threadgroup) |
2174 | tsk = tsk->group_leader; | 2233 | tsk = tsk->group_leader; |
2234 | |||
2235 | /* | ||
2236 | * Workqueue threads may acquire PF_THREAD_BOUND and become | ||
2237 | * trapped in a cpuset, or RT worker may be born in a cgroup | ||
2238 | * with no rt_runtime allocated. Just say no. | ||
2239 | */ | ||
2240 | if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { | ||
2241 | ret = -EINVAL; | ||
2242 | rcu_read_unlock(); | ||
2243 | goto out_unlock_cgroup; | ||
2244 | } | ||
2245 | |||
2175 | get_task_struct(tsk); | 2246 | get_task_struct(tsk); |
2176 | rcu_read_unlock(); | 2247 | rcu_read_unlock(); |
2177 | 2248 | ||
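The permission check above switches from raw integer comparisons to uid_eq() because credential uids are now kuid_t structures; testing against 0 becomes a comparison with GLOBAL_ROOT_UID. A sketch of the idiom in isolation (may_touch_task() is a hypothetical name):

#include <linux/cred.h>
#include <linux/uidgid.h>

static bool may_touch_task(const struct cred *cred, const struct cred *tcred)
{
	return uid_eq(cred->euid, GLOBAL_ROOT_UID) ||	/* global root */
	       uid_eq(cred->euid, tcred->uid) ||	/* same real uid */
	       uid_eq(cred->euid, tcred->suid);		/* same saved uid */
}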
@@ -2603,50 +2674,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2603 | return mode; | 2674 | return mode; |
2604 | } | 2675 | } |
2605 | 2676 | ||
2606 | int cgroup_add_file(struct cgroup *cgrp, | 2677 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2607 | struct cgroup_subsys *subsys, | 2678 | const struct cftype *cft) |
2608 | const struct cftype *cft) | ||
2609 | { | 2679 | { |
2610 | struct dentry *dir = cgrp->dentry; | 2680 | struct dentry *dir = cgrp->dentry; |
2681 | struct cgroup *parent = __d_cgrp(dir); | ||
2611 | struct dentry *dentry; | 2682 | struct dentry *dentry; |
2683 | struct cfent *cfe; | ||
2612 | int error; | 2684 | int error; |
2613 | umode_t mode; | 2685 | umode_t mode; |
2614 | |||
2615 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2686 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2687 | |||
2688 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2689 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2690 | return 0; | ||
2691 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2692 | return 0; | ||
2693 | |||
2616 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2694 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2617 | strcpy(name, subsys->name); | 2695 | strcpy(name, subsys->name); |
2618 | strcat(name, "."); | 2696 | strcat(name, "."); |
2619 | } | 2697 | } |
2620 | strcat(name, cft->name); | 2698 | strcat(name, cft->name); |
2699 | |||
2621 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); | 2700 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); |
2701 | |||
2702 | cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); | ||
2703 | if (!cfe) | ||
2704 | return -ENOMEM; | ||
2705 | |||
2622 | dentry = lookup_one_len(name, dir, strlen(name)); | 2706 | dentry = lookup_one_len(name, dir, strlen(name)); |
2623 | if (!IS_ERR(dentry)) { | 2707 | if (IS_ERR(dentry)) { |
2624 | mode = cgroup_file_mode(cft); | ||
2625 | error = cgroup_create_file(dentry, mode | S_IFREG, | ||
2626 | cgrp->root->sb); | ||
2627 | if (!error) | ||
2628 | dentry->d_fsdata = (void *)cft; | ||
2629 | dput(dentry); | ||
2630 | } else | ||
2631 | error = PTR_ERR(dentry); | 2708 | error = PTR_ERR(dentry); |
2709 | goto out; | ||
2710 | } | ||
2711 | |||
2712 | mode = cgroup_file_mode(cft); | ||
2713 | error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); | ||
2714 | if (!error) { | ||
2715 | cfe->type = (void *)cft; | ||
2716 | cfe->dentry = dentry; | ||
2717 | dentry->d_fsdata = cfe; | ||
2718 | list_add_tail(&cfe->node, &parent->files); | ||
2719 | cfe = NULL; | ||
2720 | } | ||
2721 | dput(dentry); | ||
2722 | out: | ||
2723 | kfree(cfe); | ||
2632 | return error; | 2724 | return error; |
2633 | } | 2725 | } |
2634 | EXPORT_SYMBOL_GPL(cgroup_add_file); | ||
2635 | 2726 | ||
2636 | int cgroup_add_files(struct cgroup *cgrp, | 2727 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2637 | struct cgroup_subsys *subsys, | 2728 | const struct cftype cfts[], bool is_add) |
2638 | const struct cftype cft[], | ||
2639 | int count) | ||
2640 | { | 2729 | { |
2641 | int i, err; | 2730 | const struct cftype *cft; |
2642 | for (i = 0; i < count; i++) { | 2731 | int err, ret = 0; |
2643 | err = cgroup_add_file(cgrp, subsys, &cft[i]); | 2732 | |
2644 | if (err) | 2733 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2645 | return err; | 2734 | if (is_add) |
2735 | err = cgroup_add_file(cgrp, subsys, cft); | ||
2736 | else | ||
2737 | err = cgroup_rm_file(cgrp, cft); | ||
2738 | if (err) { | ||
2739 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
2740 | is_add ? "add" : "remove", cft->name, err); | ||
2741 | ret = err; | ||
2742 | } | ||
2743 | } | ||
2744 | return ret; | ||
2745 | } | ||
2746 | |||
2747 | static DEFINE_MUTEX(cgroup_cft_mutex); | ||
2748 | |||
2749 | static void cgroup_cfts_prepare(void) | ||
2750 | __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) | ||
2751 | { | ||
2752 | /* | ||
2753 | * Thanks to the entanglement with vfs inode locking, we can't walk | ||
2754 | * the existing cgroups under cgroup_mutex and create files. | ||
2755 | * Instead, we increment reference on all cgroups and build list of | ||
2756 | * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure | ||
2757 | * exclusive access to the field. | ||
2758 | */ | ||
2759 | mutex_lock(&cgroup_cft_mutex); | ||
2760 | mutex_lock(&cgroup_mutex); | ||
2761 | } | ||
2762 | |||
2763 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | ||
2764 | const struct cftype *cfts, bool is_add) | ||
2765 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | ||
2766 | { | ||
2767 | LIST_HEAD(pending); | ||
2768 | struct cgroup *cgrp, *n; | ||
2769 | |||
2770 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | ||
2771 | if (cfts && ss->root != &rootnode) { | ||
2772 | list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { | ||
2773 | dget(cgrp->dentry); | ||
2774 | list_add_tail(&cgrp->cft_q_node, &pending); | ||
2775 | } | ||
2646 | } | 2776 | } |
2777 | |||
2778 | mutex_unlock(&cgroup_mutex); | ||
2779 | |||
2780 | /* | ||
2781 | * All new cgroups will see @cfts update on @ss->cftsets. Add/rm | ||
2782 | * files for all cgroups which were created before. | ||
2783 | */ | ||
2784 | list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { | ||
2785 | struct inode *inode = cgrp->dentry->d_inode; | ||
2786 | |||
2787 | mutex_lock(&inode->i_mutex); | ||
2788 | mutex_lock(&cgroup_mutex); | ||
2789 | if (!cgroup_is_removed(cgrp)) | ||
2790 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | ||
2791 | mutex_unlock(&cgroup_mutex); | ||
2792 | mutex_unlock(&inode->i_mutex); | ||
2793 | |||
2794 | list_del_init(&cgrp->cft_q_node); | ||
2795 | dput(cgrp->dentry); | ||
2796 | } | ||
2797 | |||
2798 | mutex_unlock(&cgroup_cft_mutex); | ||
2799 | } | ||
2800 | |||
2801 | /** | ||
2802 | * cgroup_add_cftypes - add an array of cftypes to a subsystem | ||
2803 | * @ss: target cgroup subsystem | ||
2804 | * @cfts: zero-length name terminated array of cftypes | ||
2805 | * | ||
2806 | * Register @cfts to @ss. Files described by @cfts are created for all | ||
2807 | * existing cgroups to which @ss is attached and all future cgroups will | ||
2808 | * have them too. This function can be called anytime whether @ss is | ||
2809 | * attached or not. | ||
2810 | * | ||
2811 | * Returns 0 on successful registration, -errno on failure. Note that this | ||
2812 | * function currently returns 0 as long as @cfts registration is successful | ||
2813 | * even if some file creation attempts on existing cgroups fail. | ||
2814 | */ | ||
2815 | int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2816 | { | ||
2817 | struct cftype_set *set; | ||
2818 | |||
2819 | set = kzalloc(sizeof(*set), GFP_KERNEL); | ||
2820 | if (!set) | ||
2821 | return -ENOMEM; | ||
2822 | |||
2823 | cgroup_cfts_prepare(); | ||
2824 | set->cfts = cfts; | ||
2825 | list_add_tail(&set->node, &ss->cftsets); | ||
2826 | cgroup_cfts_commit(ss, cfts, true); | ||
2827 | |||
2647 | return 0; | 2828 | return 0; |
2648 | } | 2829 | } |
2649 | EXPORT_SYMBOL_GPL(cgroup_add_files); | 2830 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
2831 | |||
2832 | /** | ||
2833 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | ||
2834 | * @ss: target cgroup subsystem | ||
2835 | * @cfts: zero-length name terminated array of cftypes | ||
2836 | * | ||
2837 | * Unregister @cfts from @ss. Files described by @cfts are removed from | ||
2838 | * all existing cgroups to which @ss is attached and all future cgroups | ||
2839 | * won't have them either. This function can be called anytime whether @ss | ||
2840 | * is attached or not. | ||
2841 | * | ||
2842 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | ||
2843 | * registered with @ss. | ||
2844 | */ | ||
2845 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2846 | { | ||
2847 | struct cftype_set *set; | ||
2848 | |||
2849 | cgroup_cfts_prepare(); | ||
2850 | |||
2851 | list_for_each_entry(set, &ss->cftsets, node) { | ||
2852 | if (set->cfts == cfts) { | ||
2853 | list_del_init(&set->node); | ||
2854 | cgroup_cfts_commit(ss, cfts, false); | ||
2855 | return 0; | ||
2856 | } | ||
2857 | } | ||
2858 | |||
2859 | cgroup_cfts_commit(ss, NULL, false); | ||
2860 | return -ENOENT; | ||
2861 | } | ||
2650 | 2862 | ||
2651 | /** | 2863 | /** |
2652 | * cgroup_task_count - count the number of tasks in a cgroup. | 2864 | * cgroup_task_count - count the number of tasks in a cgroup. |
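With cgroup_add_cftypes()/cgroup_rm_cftypes(), a controller describes its files in a zero-terminated cftype array and either points .base_cftypes at it or registers it at run time; the debug and freezer conversions later in this diff follow the same pattern. A sketch under those assumptions ("example_subsys" and the file it creates are hypothetical):

#include <linux/cgroup.h>
#include <linux/init.h>

extern struct cgroup_subsys example_subsys;	/* assumed controller */

static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* report some per-cgroup value */
}

static struct cftype example_files[] = {
	{
		.name = "value",
		.flags = CFTYPE_NOT_ON_ROOT,	/* skip the root cgroup */
		.read_u64 = example_read_u64,
	},
	{ }	/* terminating entry replaces the old count argument */
};

/* either set example_subsys.base_cftypes = example_files, or add later: */
static int __init example_files_init(void)
{
	return cgroup_add_cftypes(&example_subsys, example_files);
}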
@@ -3625,13 +3837,14 @@ static struct cftype files[] = { | |||
3625 | .read_u64 = cgroup_clone_children_read, | 3837 | .read_u64 = cgroup_clone_children_read, |
3626 | .write_u64 = cgroup_clone_children_write, | 3838 | .write_u64 = cgroup_clone_children_write, |
3627 | }, | 3839 | }, |
3628 | }; | 3840 | { |
3629 | 3841 | .name = "release_agent", | |
3630 | static struct cftype cft_release_agent = { | 3842 | .flags = CFTYPE_ONLY_ON_ROOT, |
3631 | .name = "release_agent", | 3843 | .read_seq_string = cgroup_release_agent_show, |
3632 | .read_seq_string = cgroup_release_agent_show, | 3844 | .write_string = cgroup_release_agent_write, |
3633 | .write_string = cgroup_release_agent_write, | 3845 | .max_write_len = PATH_MAX, |
3634 | .max_write_len = PATH_MAX, | 3846 | }, |
3847 | { } /* terminate */ | ||
3635 | }; | 3848 | }; |
3636 | 3849 | ||
3637 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3850 | static int cgroup_populate_dir(struct cgroup *cgrp) |
@@ -3639,22 +3852,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3639 | int err; | 3852 | int err; |
3640 | struct cgroup_subsys *ss; | 3853 | struct cgroup_subsys *ss; |
3641 | 3854 | ||
3642 | /* First clear out any existing files */ | 3855 | err = cgroup_addrm_files(cgrp, NULL, files, true); |
3643 | cgroup_clear_directory(cgrp->dentry); | ||
3644 | |||
3645 | err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); | ||
3646 | if (err < 0) | 3856 | if (err < 0) |
3647 | return err; | 3857 | return err; |
3648 | 3858 | ||
3649 | if (cgrp == cgrp->top_cgroup) { | 3859 | /* process cftsets of each subsystem */ |
3650 | if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) | ||
3651 | return err; | ||
3652 | } | ||
3653 | |||
3654 | for_each_subsys(cgrp->root, ss) { | 3860 | for_each_subsys(cgrp->root, ss) { |
3655 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) | 3861 | struct cftype_set *set; |
3656 | return err; | 3862 | |
3863 | list_for_each_entry(set, &ss->cftsets, node) | ||
3864 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | ||
3657 | } | 3865 | } |
3866 | |||
3658 | /* This cgroup is ready now */ | 3867 | /* This cgroup is ready now */ |
3659 | for_each_subsys(cgrp->root, ss) { | 3868 | for_each_subsys(cgrp->root, ss) { |
3660 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3869 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
@@ -3670,6 +3879,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3670 | return 0; | 3879 | return 0; |
3671 | } | 3880 | } |
3672 | 3881 | ||
3882 | static void css_dput_fn(struct work_struct *work) | ||
3883 | { | ||
3884 | struct cgroup_subsys_state *css = | ||
3885 | container_of(work, struct cgroup_subsys_state, dput_work); | ||
3886 | struct dentry *dentry = css->cgroup->dentry; | ||
3887 | struct super_block *sb = dentry->d_sb; | ||
3888 | |||
3889 | atomic_inc(&sb->s_active); | ||
3890 | dput(dentry); | ||
3891 | deactivate_super(sb); | ||
3892 | } | ||
3893 | |||
3673 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3894 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
3674 | struct cgroup_subsys *ss, | 3895 | struct cgroup_subsys *ss, |
3675 | struct cgroup *cgrp) | 3896 | struct cgroup *cgrp) |
@@ -3682,6 +3903,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
3682 | set_bit(CSS_ROOT, &css->flags); | 3903 | set_bit(CSS_ROOT, &css->flags); |
3683 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 3904 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
3684 | cgrp->subsys[ss->subsys_id] = css; | 3905 | cgrp->subsys[ss->subsys_id] = css; |
3906 | |||
3907 | /* | ||
3908 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | ||
3909 | * which is put on the last css_put(). dput() requires process | ||
3910 | * context, which css_put() may be called without. @css->dput_work | ||
3911 | * will be used to invoke dput() asynchronously from css_put(). | ||
3912 | */ | ||
3913 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
3914 | if (ss->__DEPRECATED_clear_css_refs) | ||
3915 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | ||
3685 | } | 3916 | } |
3686 | 3917 | ||
3687 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | 3918 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) |
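css_dput_fn() exists because the final css_put() may run in atomic context while dput() can sleep; the work item moves the dput to process context and pins the superblock via s_active across it. A generic sketch of that defer-to-workqueue shape (struct and function names are illustrative):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct thing {
	struct work_struct cleanup_work;
	/* ... resources whose release may sleep ... */
};

static void thing_cleanup_fn(struct work_struct *work)
{
	struct thing *t = container_of(work, struct thing, cleanup_work);

	/* sleeping teardown goes here, e.g. dput()/deactivate_super() */
	kfree(t);
}

static struct thing *thing_create(void)
{
	struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (t)
		INIT_WORK(&t->cleanup_work, thing_cleanup_fn);
	return t;
}

static void thing_put_final(struct thing *t)
{
	/* last reference dropped, possibly in atomic context */
	schedule_work(&t->cleanup_work);
}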
@@ -3784,9 +4015,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3784 | if (err < 0) | 4015 | if (err < 0) |
3785 | goto err_remove; | 4016 | goto err_remove; |
3786 | 4017 | ||
4018 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4019 | for_each_subsys(root, ss) | ||
4020 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4021 | dget(dentry); | ||
4022 | |||
3787 | /* The cgroup directory was pre-locked for us */ | 4023 | /* The cgroup directory was pre-locked for us */ |
3788 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4024 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
3789 | 4025 | ||
4026 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4027 | |||
3790 | err = cgroup_populate_dir(cgrp); | 4028 | err = cgroup_populate_dir(cgrp); |
3791 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4029 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
3792 | 4030 | ||
@@ -3826,18 +4064,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
3826 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4064 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
3827 | } | 4065 | } |
3828 | 4066 | ||
4067 | /* | ||
4068 | * Check the reference count on each subsystem. Since we already | ||
4069 | * established that there are no tasks in the cgroup, if the css refcount | ||
4070 | * is also 1, then there should be no outstanding references, so the | ||
4071 | * subsystem is safe to destroy. We scan across all subsystems rather than | ||
4072 | * using the per-hierarchy linked list of mounted subsystems since we can | ||
4073 | * be called via check_for_release() with no synchronization other than | ||
4074 | * RCU, and the subsystem linked list isn't RCU-safe. | ||
4075 | */ | ||
3829 | static int cgroup_has_css_refs(struct cgroup *cgrp) | 4076 | static int cgroup_has_css_refs(struct cgroup *cgrp) |
3830 | { | 4077 | { |
3831 | /* Check the reference count on each subsystem. Since we | ||
3832 | * already established that there are no tasks in the | ||
3833 | * cgroup, if the css refcount is also 1, then there should | ||
3834 | * be no outstanding references, so the subsystem is safe to | ||
3835 | * destroy. We scan across all subsystems rather than using | ||
3836 | * the per-hierarchy linked list of mounted subsystems since | ||
3837 | * we can be called via check_for_release() with no | ||
3838 | * synchronization other than RCU, and the subsystem linked | ||
3839 | * list isn't RCU-safe */ | ||
3840 | int i; | 4078 | int i; |
4079 | |||
3841 | /* | 4080 | /* |
3842 | * We won't need to lock the subsys array, because the subsystems | 4081 | * We won't need to lock the subsys array, because the subsystems |
3843 | * we're concerned about aren't going anywhere since our cgroup root | 4082 | * we're concerned about aren't going anywhere since our cgroup root |
@@ -3846,17 +4085,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4085 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3847 | struct cgroup_subsys *ss = subsys[i]; | 4086 | struct cgroup_subsys *ss = subsys[i]; |
3848 | struct cgroup_subsys_state *css; | 4087 | struct cgroup_subsys_state *css; |
4088 | |||
3849 | /* Skip subsystems not present or not in this hierarchy */ | 4089 | /* Skip subsystems not present or not in this hierarchy */ |
3850 | if (ss == NULL || ss->root != cgrp->root) | 4090 | if (ss == NULL || ss->root != cgrp->root) |
3851 | continue; | 4091 | continue; |
4092 | |||
3852 | css = cgrp->subsys[ss->subsys_id]; | 4093 | css = cgrp->subsys[ss->subsys_id]; |
3853 | /* When called from check_for_release() it's possible | 4094 | /* |
4095 | * When called from check_for_release() it's possible | ||
3854 | * that by this point the cgroup has been removed | 4096 | * that by this point the cgroup has been removed |
3855 | * and the css deleted. But a false-positive doesn't | 4097 | * and the css deleted. But a false-positive doesn't |
3856 | * matter, since it can only happen if the cgroup | 4098 | * matter, since it can only happen if the cgroup |
3857 | * has been deleted and hence no longer needs the | 4099 | * has been deleted and hence no longer needs the |
3858 | * release agent to be called anyway. */ | 4100 | * release agent to be called anyway. |
3859 | if (css && (atomic_read(&css->refcnt) > 1)) | 4101 | */ |
4102 | if (css && css_refcnt(css) > 1) | ||
3860 | return 1; | 4103 | return 1; |
3861 | } | 4104 | } |
3862 | return 0; | 4105 | return 0; |
@@ -3866,51 +4109,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3866 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4109 | * Atomically mark all (or else none) of the cgroup's CSS objects as |
3867 | * CSS_REMOVED. Return true on success, or false if the cgroup has | 4110 | * CSS_REMOVED. Return true on success, or false if the cgroup has |
3868 | * busy subsystems. Call with cgroup_mutex held | 4111 | * busy subsystems. Call with cgroup_mutex held |
4112 | * | ||
4113 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4114 | * not, cgroup removal behaves differently. | ||
4115 | * | ||
4116 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4117 | * cgroup removal can be committed. This is implemented by | ||
4118 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4119 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4120 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4121 | * removed as soon as the existing user (memcg) is updated. | ||
4122 | * | ||
4123 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4124 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4125 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4126 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4127 | * is put so that dentry destruction happens only after all css's are | ||
4128 | * released. | ||
3869 | */ | 4129 | */ |
3870 | |||
3871 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | 4130 | static int cgroup_clear_css_refs(struct cgroup *cgrp) |
3872 | { | 4131 | { |
3873 | struct cgroup_subsys *ss; | 4132 | struct cgroup_subsys *ss; |
3874 | unsigned long flags; | 4133 | unsigned long flags; |
3875 | bool failed = false; | 4134 | bool failed = false; |
4135 | |||
3876 | local_irq_save(flags); | 4136 | local_irq_save(flags); |
4137 | |||
4138 | /* | ||
4139 | * Block new css_tryget() by deactivating refcnt. If all refcnts | ||
4140 | * for subsystems w/ clear_css_refs set were 1 at the moment of | ||
4141 | * deactivation, we succeeded. | ||
4142 | */ | ||
3877 | for_each_subsys(cgrp->root, ss) { | 4143 | for_each_subsys(cgrp->root, ss) { |
3878 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4144 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3879 | int refcnt; | 4145 | |
3880 | while (1) { | 4146 | WARN_ON(atomic_read(&css->refcnt) < 0); |
3881 | /* We can only remove a CSS with a refcnt==1 */ | 4147 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
3882 | refcnt = atomic_read(&css->refcnt); | 4148 | |
3883 | if (refcnt > 1) { | 4149 | if (ss->__DEPRECATED_clear_css_refs) |
3884 | failed = true; | 4150 | failed |= css_refcnt(css) != 1; |
3885 | goto done; | ||
3886 | } | ||
3887 | BUG_ON(!refcnt); | ||
3888 | /* | ||
3889 | * Drop the refcnt to 0 while we check other | ||
3890 | * subsystems. This will cause any racing | ||
3891 | * css_tryget() to spin until we set the | ||
3892 | * CSS_REMOVED bits or abort | ||
3893 | */ | ||
3894 | if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) | ||
3895 | break; | ||
3896 | cpu_relax(); | ||
3897 | } | ||
3898 | } | 4151 | } |
3899 | done: | 4152 | |
4153 | /* | ||
4154 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4155 | * restore refcnts to positive values. Either way, all in-progress | ||
4156 | * css_tryget() will be released. | ||
4157 | */ | ||
3900 | for_each_subsys(cgrp->root, ss) { | 4158 | for_each_subsys(cgrp->root, ss) { |
3901 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4159 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3902 | if (failed) { | 4160 | |
3903 | /* | 4161 | if (!failed) { |
3904 | * Restore old refcnt if we previously managed | ||
3905 | * to clear it from 1 to 0 | ||
3906 | */ | ||
3907 | if (!atomic_read(&css->refcnt)) | ||
3908 | atomic_set(&css->refcnt, 1); | ||
3909 | } else { | ||
3910 | /* Commit the fact that the CSS is removed */ | ||
3911 | set_bit(CSS_REMOVED, &css->flags); | 4162 | set_bit(CSS_REMOVED, &css->flags); |
4163 | css_put(css); | ||
4164 | } else { | ||
4165 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
3912 | } | 4166 | } |
3913 | } | 4167 | } |
4168 | |||
3914 | local_irq_restore(flags); | 4169 | local_irq_restore(flags); |
3915 | return !failed; | 4170 | return !failed; |
3916 | } | 4171 | } |
@@ -3995,6 +4250,8 @@ again: | |||
3995 | list_del_init(&cgrp->sibling); | 4250 | list_del_init(&cgrp->sibling); |
3996 | cgroup_unlock_hierarchy(cgrp->root); | 4251 | cgroup_unlock_hierarchy(cgrp->root); |
3997 | 4252 | ||
4253 | list_del_init(&cgrp->allcg_node); | ||
4254 | |||
3998 | d = dget(cgrp->dentry); | 4255 | d = dget(cgrp->dentry); |
3999 | 4256 | ||
4000 | cgroup_d_remove_dir(d); | 4257 | cgroup_d_remove_dir(d); |
@@ -4021,12 +4278,29 @@ again: | |||
4021 | return 0; | 4278 | return 0; |
4022 | } | 4279 | } |
4023 | 4280 | ||
4281 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | ||
4282 | { | ||
4283 | INIT_LIST_HEAD(&ss->cftsets); | ||
4284 | |||
4285 | /* | ||
4286 | * base_cftset is embedded in subsys itself, no need to worry about | ||
4287 | * deregistration. | ||
4288 | */ | ||
4289 | if (ss->base_cftypes) { | ||
4290 | ss->base_cftset.cfts = ss->base_cftypes; | ||
4291 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | ||
4292 | } | ||
4293 | } | ||
4294 | |||
4024 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4295 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) |
4025 | { | 4296 | { |
4026 | struct cgroup_subsys_state *css; | 4297 | struct cgroup_subsys_state *css; |
4027 | 4298 | ||
4028 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4299 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4029 | 4300 | ||
4301 | /* init base cftset */ | ||
4302 | cgroup_init_cftsets(ss); | ||
4303 | |||
4030 | /* Create the top cgroup state for this subsystem */ | 4304 | /* Create the top cgroup state for this subsystem */ |
4031 | list_add(&ss->sibling, &rootnode.subsys_list); | 4305 | list_add(&ss->sibling, &rootnode.subsys_list); |
4032 | ss->root = &rootnode; | 4306 | ss->root = &rootnode; |
@@ -4096,6 +4370,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4096 | return 0; | 4370 | return 0; |
4097 | } | 4371 | } |
4098 | 4372 | ||
4373 | /* init base cftset */ | ||
4374 | cgroup_init_cftsets(ss); | ||
4375 | |||
4099 | /* | 4376 | /* |
4100 | * need to register a subsys id before anything else - for example, | 4377 | * need to register a subsys id before anything else - for example, |
4101 | * init_cgroup_css needs it. | 4378 | * init_cgroup_css needs it. |
@@ -4685,21 +4962,43 @@ static void check_for_release(struct cgroup *cgrp) | |||
4685 | } | 4962 | } |
4686 | 4963 | ||
4687 | /* Caller must verify that the css is not for root cgroup */ | 4964 | /* Caller must verify that the css is not for root cgroup */ |
4688 | void __css_put(struct cgroup_subsys_state *css, int count) | 4965 | bool __css_tryget(struct cgroup_subsys_state *css) |
4966 | { | ||
4967 | do { | ||
4968 | int v = css_refcnt(css); | ||
4969 | |||
4970 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | ||
4971 | return true; | ||
4972 | cpu_relax(); | ||
4973 | } while (!test_bit(CSS_REMOVED, &css->flags)); | ||
4974 | |||
4975 | return false; | ||
4976 | } | ||
4977 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
4978 | |||
4979 | /* Caller must verify that the css is not for root cgroup */ | ||
4980 | void __css_put(struct cgroup_subsys_state *css) | ||
4689 | { | 4981 | { |
4690 | struct cgroup *cgrp = css->cgroup; | 4982 | struct cgroup *cgrp = css->cgroup; |
4691 | int val; | 4983 | int v; |
4984 | |||
4692 | rcu_read_lock(); | 4985 | rcu_read_lock(); |
4693 | val = atomic_sub_return(count, &css->refcnt); | 4986 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); |
4694 | if (val == 1) { | 4987 | |
4988 | switch (v) { | ||
4989 | case 1: | ||
4695 | if (notify_on_release(cgrp)) { | 4990 | if (notify_on_release(cgrp)) { |
4696 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4991 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
4697 | check_for_release(cgrp); | 4992 | check_for_release(cgrp); |
4698 | } | 4993 | } |
4699 | cgroup_wakeup_rmdir_waiter(cgrp); | 4994 | cgroup_wakeup_rmdir_waiter(cgrp); |
4995 | break; | ||
4996 | case 0: | ||
4997 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | ||
4998 | schedule_work(&css->dput_work); | ||
4999 | break; | ||
4700 | } | 5000 | } |
4701 | rcu_read_unlock(); | 5001 | rcu_read_unlock(); |
4702 | WARN_ON_ONCE(val < 1); | ||
4703 | } | 5002 | } |
4704 | EXPORT_SYMBOL_GPL(__css_put); | 5003 | EXPORT_SYMBOL_GPL(__css_put); |
4705 | 5004 | ||
@@ -4818,7 +5117,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
4818 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5117 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
4819 | * it's unchanged until freed. | 5118 | * it's unchanged until freed. |
4820 | */ | 5119 | */ |
4821 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5120 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4822 | 5121 | ||
4823 | if (cssid) | 5122 | if (cssid) |
4824 | return cssid->id; | 5123 | return cssid->id; |
@@ -4830,7 +5129,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
4830 | { | 5129 | { |
4831 | struct css_id *cssid; | 5130 | struct css_id *cssid; |
4832 | 5131 | ||
4833 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5132 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4834 | 5133 | ||
4835 | if (cssid) | 5134 | if (cssid) |
4836 | return cssid->depth; | 5135 | return cssid->depth; |
@@ -4844,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth); | |||
4844 | * @root: the css supposed to be an ancestor of the child. | 5143 | * @root: the css supposed to be an ancestor of the child. |
4845 | * | 5144 | * |
4846 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because | 5145 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because |
4847 | * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). | 5146 | * this function reads css->id, the caller must hold rcu_read_lock(). |
4848 | * But, considering usual usage, the csses should be valid objects after test. | 5147 | * But, considering usual usage, the csses should be valid objects after test. |
4849 | * Assuming that the caller will do some action to the child if this returns | 5148 | * Assuming that the caller will do some action to the child if this returns |
4850 | * true, the caller must take "child"'s reference count. | 5149 | * true, the caller must take "child"'s reference count. |
@@ -4856,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, | |||
4856 | { | 5155 | { |
4857 | struct css_id *child_id; | 5156 | struct css_id *child_id; |
4858 | struct css_id *root_id; | 5157 | struct css_id *root_id; |
4859 | bool ret = true; | ||
4860 | 5158 | ||
4861 | rcu_read_lock(); | ||
4862 | child_id = rcu_dereference(child->id); | 5159 | child_id = rcu_dereference(child->id); |
5160 | if (!child_id) | ||
5161 | return false; | ||
4863 | root_id = rcu_dereference(root->id); | 5162 | root_id = rcu_dereference(root->id); |
4864 | if (!child_id | 5163 | if (!root_id) |
4865 | || !root_id | 5164 | return false; |
4866 | || (child_id->depth < root_id->depth) | 5165 | if (child_id->depth < root_id->depth) |
4867 | || (child_id->stack[root_id->depth] != root_id->id)) | 5166 | return false; |
4868 | ret = false; | 5167 | if (child_id->stack[root_id->depth] != root_id->id) |
4869 | rcu_read_unlock(); | 5168 | return false; |
4870 | return ret; | 5169 | return true; |
4871 | } | 5170 | } |
4872 | 5171 | ||
4873 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 5172 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
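Note the behavioural change documented above: css_is_ancestor() no longer takes rcu_read_lock() itself, so callers must hold it around the call. A minimal illustrative call site:

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static bool css_under(struct cgroup_subsys_state *child,
		      struct cgroup_subsys_state *root)
{
	bool ret;

	rcu_read_lock();
	ret = css_is_ancestor(child, root);
	rcu_read_unlock();

	return ret;
}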
@@ -5211,19 +5510,15 @@ static struct cftype debug_files[] = { | |||
5211 | .name = "releasable", | 5510 | .name = "releasable", |
5212 | .read_u64 = releasable_read, | 5511 | .read_u64 = releasable_read, |
5213 | }, | 5512 | }, |
5214 | }; | ||
5215 | 5513 | ||
5216 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 5514 | { } /* terminate */ |
5217 | { | 5515 | }; |
5218 | return cgroup_add_files(cont, ss, debug_files, | ||
5219 | ARRAY_SIZE(debug_files)); | ||
5220 | } | ||
5221 | 5516 | ||
5222 | struct cgroup_subsys debug_subsys = { | 5517 | struct cgroup_subsys debug_subsys = { |
5223 | .name = "debug", | 5518 | .name = "debug", |
5224 | .create = debug_create, | 5519 | .create = debug_create, |
5225 | .destroy = debug_destroy, | 5520 | .destroy = debug_destroy, |
5226 | .populate = debug_populate, | ||
5227 | .subsys_id = debug_subsys_id, | 5521 | .subsys_id = debug_subsys_id, |
5522 | .base_cftypes = debug_files, | ||
5228 | }; | 5523 | }; |
5229 | #endif /* CONFIG_CGROUP_DEBUG */ | 5524 | #endif /* CONFIG_CGROUP_DEBUG */ |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup, | |||
358 | static struct cftype files[] = { | 358 | static struct cftype files[] = { |
359 | { | 359 | { |
360 | .name = "state", | 360 | .name = "state", |
361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
361 | .read_seq_string = freezer_read, | 362 | .read_seq_string = freezer_read, |
362 | .write_string = freezer_write, | 363 | .write_string = freezer_write, |
363 | }, | 364 | }, |
365 | { } /* terminate */ | ||
364 | }; | 366 | }; |
365 | 367 | ||
366 | static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) | ||
367 | { | ||
368 | if (!cgroup->parent) | ||
369 | return 0; | ||
370 | return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); | ||
371 | } | ||
372 | |||
373 | struct cgroup_subsys freezer_subsys = { | 368 | struct cgroup_subsys freezer_subsys = { |
374 | .name = "freezer", | 369 | .name = "freezer", |
375 | .create = freezer_create, | 370 | .create = freezer_create, |
376 | .destroy = freezer_destroy, | 371 | .destroy = freezer_destroy, |
377 | .populate = freezer_populate, | ||
378 | .subsys_id = freezer_subsys_id, | 372 | .subsys_id = freezer_subsys_id, |
379 | .can_attach = freezer_can_attach, | 373 | .can_attach = freezer_can_attach, |
380 | .fork = freezer_fork, | 374 | .fork = freezer_fork, |
375 | .base_cftypes = files, | ||
381 | }; | 376 | }; |
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..c28a306ae05c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) | |||
372 | 372 | ||
373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
374 | 374 | ||
375 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | 375 | /* |
376 | compat_old_sigset_t __user *oset) | 376 | * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the |
377 | * blocked set of signals to the supplied signal set | ||
378 | */ | ||
379 | static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) | ||
377 | { | 380 | { |
378 | old_sigset_t s; | 381 | memcpy(blocked->sig, &set, sizeof(set)); |
379 | long ret; | 382 | } |
380 | mm_segment_t old_fs; | ||
381 | 383 | ||
382 | if (set && get_user(s, set)) | 384 | asmlinkage long compat_sys_sigprocmask(int how, |
383 | return -EFAULT; | 385 | compat_old_sigset_t __user *nset, |
384 | old_fs = get_fs(); | 386 | compat_old_sigset_t __user *oset) |
385 | set_fs(KERNEL_DS); | 387 | { |
386 | ret = sys_sigprocmask(how, | 388 | old_sigset_t old_set, new_set; |
387 | set ? (old_sigset_t __user *) &s : NULL, | 389 | sigset_t new_blocked; |
388 | oset ? (old_sigset_t __user *) &s : NULL); | 390 | |
389 | set_fs(old_fs); | 391 | old_set = current->blocked.sig[0]; |
390 | if (ret == 0) | 392 | |
391 | if (oset) | 393 | if (nset) { |
392 | ret = put_user(s, oset); | 394 | if (get_user(new_set, nset)) |
393 | return ret; | 395 | return -EFAULT; |
396 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
397 | |||
398 | new_blocked = current->blocked; | ||
399 | |||
400 | switch (how) { | ||
401 | case SIG_BLOCK: | ||
402 | sigaddsetmask(&new_blocked, new_set); | ||
403 | break; | ||
404 | case SIG_UNBLOCK: | ||
405 | sigdelsetmask(&new_blocked, new_set); | ||
406 | break; | ||
407 | case SIG_SETMASK: | ||
408 | compat_sig_setmask(&new_blocked, new_set); | ||
409 | break; | ||
410 | default: | ||
411 | return -EINVAL; | ||
412 | } | ||
413 | |||
414 | set_current_blocked(&new_blocked); | ||
415 | } | ||
416 | |||
417 | if (oset) { | ||
418 | if (put_user(old_set, oset)) | ||
419 | return -EFAULT; | ||
420 | } | ||
421 | |||
422 | return 0; | ||
394 | } | 423 | } |
395 | 424 | ||
396 | #endif | 425 | #endif |
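The rewritten compat_sys_sigprocmask() follows the usual sigprocmask semantics: SIG_BLOCK ORs bits into the blocked set, SIG_UNBLOCK clears them, and SIG_SETMASK replaces the first (compat) word outright. A runnable userspace illustration of the same three modes:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t set, old;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);

	sigprocmask(SIG_BLOCK, &set, &old);	/* OR SIGUSR1 into the mask */
	sigprocmask(SIG_UNBLOCK, &set, NULL);	/* clear SIGUSR1 again */
	sigprocmask(SIG_SETMASK, &old, NULL);	/* replace with the saved mask */

	printf("SIGUSR1 was blocked before: %d\n", sigismember(&old, SIGUSR1));
	return 0;
}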
@@ -1044,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
1044 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) | 1073 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) |
1045 | return -EFAULT; | 1074 | return -EFAULT; |
1046 | sigset_from_compat(&newset, &newset32); | 1075 | sigset_from_compat(&newset, &newset32); |
1047 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 1076 | return sigsuspend(&newset); |
1048 | |||
1049 | current->saved_sigmask = current->blocked; | ||
1050 | set_current_blocked(&newset); | ||
1051 | |||
1052 | current->state = TASK_INTERRUPTIBLE; | ||
1053 | schedule(); | ||
1054 | set_restore_sigmask(); | ||
1055 | return -ERESTARTNOHAND; | ||
1056 | } | 1077 | } |
1057 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ | 1078 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ |
1058 | 1079 | ||
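The compat rt_sigsuspend path now calls the shared sigsuspend() helper instead of open-coding the save/replace/sleep/restore sequence visible in the removed lines. For reference, the classic userspace shape of the same operation (illustrative, not kernel code):

#include <signal.h>

static void handler(int sig) { (void)sig; }

static void wait_for_sigusr1(void)
{
	sigset_t block, orig;

	signal(SIGUSR1, handler);		/* a handler so sigsuspend() returns */

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, &orig);	/* block it while deciding to wait */

	/* atomically switch to the pre-block mask and sleep for a signal;
	 * on return the blocked mask is put back */
	sigsuspend(&orig);

	sigprocmask(SIG_SETMASK, &orig, NULL);	/* undo the initial SIG_BLOCK */
}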
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..a4eb5227a19e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,13 +10,18 @@ | |||
10 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
11 | #include <linux/unistd.h> | 11 | #include <linux/unistd.h> |
12 | #include <linux/cpu.h> | 12 | #include <linux/cpu.h> |
13 | #include <linux/oom.h> | ||
14 | #include <linux/rcupdate.h> | ||
13 | #include <linux/export.h> | 15 | #include <linux/export.h> |
16 | #include <linux/bug.h> | ||
14 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 18 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
17 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> |
18 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> |
19 | 22 | ||
23 | #include "smpboot.h" | ||
24 | |||
20 | #ifdef CONFIG_SMP | 25 | #ifdef CONFIG_SMP |
21 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 26 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
22 | static DEFINE_MUTEX(cpu_add_remove_lock); | 27 | static DEFINE_MUTEX(cpu_add_remove_lock); |
@@ -171,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) | |||
171 | } | 176 | } |
172 | EXPORT_SYMBOL(unregister_cpu_notifier); | 177 | EXPORT_SYMBOL(unregister_cpu_notifier); |
173 | 178 | ||
179 | /** | ||
180 | * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU | ||
181 | * @cpu: a CPU id | ||
182 | * | ||
183 | * This function walks all processes, finds a valid mm struct for each one and | ||
184 | * then clears a corresponding bit in mm's cpumask. While this all sounds | ||
185 | * trivial, there are various non-obvious corner cases, which this function | ||
186 | * tries to solve in a safe manner. | ||
187 | * | ||
188 | * Also note that the function uses a somewhat relaxed locking scheme, so it may | ||
189 | * be called only for an already offlined CPU. | ||
190 | */ | ||
191 | void clear_tasks_mm_cpumask(int cpu) | ||
192 | { | ||
193 | struct task_struct *p; | ||
194 | |||
195 | /* | ||
196 | * This function is called after the cpu is taken down and marked | ||
197 | * offline, so it's not like new tasks will ever get this cpu set in | ||
198 | * their mm mask. -- Peter Zijlstra | ||
199 | * Thus, we may use rcu_read_lock() here, instead of grabbing | ||
200 | * full-fledged tasklist_lock. | ||
201 | */ | ||
202 | WARN_ON(cpu_online(cpu)); | ||
203 | rcu_read_lock(); | ||
204 | for_each_process(p) { | ||
205 | struct task_struct *t; | ||
206 | |||
207 | /* | ||
208 | * Main thread might exit, but other threads may still have | ||
209 | * a valid mm. Find one. | ||
210 | */ | ||
211 | t = find_lock_task_mm(p); | ||
212 | if (!t) | ||
213 | continue; | ||
214 | cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); | ||
215 | task_unlock(t); | ||
216 | } | ||
217 | rcu_read_unlock(); | ||
218 | } | ||
219 | |||
174 | static inline void check_for_tasks(int cpu) | 220 | static inline void check_for_tasks(int cpu) |
175 | { | 221 | { |
176 | struct task_struct *p; | 222 | struct task_struct *p; |
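clear_tasks_mm_cpumask() is intended to replace open-coded tasklist walks in architecture CPU-offline paths. A sketch of how such a path might use it once the CPU is already out of cpu_online_mask (the function below and its other steps are illustrative, not taken from any particular arch):

#include <linux/cpu.h>

static void example_cpu_die(unsigned int cpu)
{
	/* the CPU is already marked offline when this runs */
	clear_tasks_mm_cpumask(cpu);

	/* ... arch-specific teardown: flush caches, park the core, ... */
}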
@@ -295,11 +341,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
295 | int ret, nr_calls = 0; | 341 | int ret, nr_calls = 0; |
296 | void *hcpu = (void *)(long)cpu; | 342 | void *hcpu = (void *)(long)cpu; |
297 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 343 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
344 | struct task_struct *idle; | ||
298 | 345 | ||
299 | if (cpu_online(cpu) || !cpu_present(cpu)) | 346 | if (cpu_online(cpu) || !cpu_present(cpu)) |
300 | return -EINVAL; | 347 | return -EINVAL; |
301 | 348 | ||
302 | cpu_hotplug_begin(); | 349 | cpu_hotplug_begin(); |
350 | |||
351 | idle = idle_thread_get(cpu); | ||
352 | if (IS_ERR(idle)) { | ||
353 | ret = PTR_ERR(idle); | ||
354 | goto out; | ||
355 | } | ||
356 | |||
303 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 357 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
304 | if (ret) { | 358 | if (ret) { |
305 | nr_calls--; | 359 | nr_calls--; |
@@ -309,7 +363,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
309 | } | 363 | } |
310 | 364 | ||
311 | /* Arch-specific enabling code. */ | 365 | /* Arch-specific enabling code. */ |
312 | ret = __cpu_up(cpu); | 366 | ret = __cpu_up(cpu, idle); |
313 | if (ret != 0) | 367 | if (ret != 0) |
314 | goto out_notify; | 368 | goto out_notify; |
315 | BUG_ON(!cpu_online(cpu)); | 369 | BUG_ON(!cpu_online(cpu)); |
@@ -320,6 +374,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
320 | out_notify: | 374 | out_notify: |
321 | if (ret != 0) | 375 | if (ret != 0) |
322 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 376 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
377 | out: | ||
323 | cpu_hotplug_done(); | 378 | cpu_hotplug_done(); |
324 | 379 | ||
325 | return ret; | 380 | return ret; |
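
The cpu.c changes above do two things: clear_tasks_mm_cpumask() gives architectures a common helper for dropping a dead CPU from every task's mm_cpumask, and _cpu_up() now fetches a pre-created idle task via idle_thread_get() (backed by the new kernel/smpboot.c) and hands it to the architecture through the widened __cpu_up(cpu, idle) hook. As a rough illustration of where the new helper is meant to slot in, a hypothetical arch teardown path might look like the sketch below; the function name and surrounding steps are assumptions, not part of this patch.

/* Hypothetical arch CPU-teardown path, for illustration only. */
static void example_arch_cpu_teardown(unsigned int cpu)
{
	/* The helper's relaxed locking is only safe once the CPU is offline. */
	BUG_ON(cpu_online(cpu));

	/* Replaces the per-arch loop that used to clear mm_cpumask bits. */
	clear_tasks_mm_cpumask(cpu);

	/* ... remaining arch-specific teardown ... */
}
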
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 249152e15308..9656a3c36503 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c | |||
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb) | |||
81 | EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); | 81 | EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); |
82 | 82 | ||
83 | /** | 83 | /** |
84 | * cpm_pm_enter - CPU low power entry notifier | 84 | * cpu_pm_enter - CPU low power entry notifier |
85 | * | 85 | * |
86 | * Notifies listeners that a single CPU is entering a low power state that may | 86 | * Notifies listeners that a single CPU is entering a low power state that may |
87 | * cause some blocks in the same power domain as the cpu to reset. | 87 | * cause some blocks in the same power domain as the cpu to reset. |
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); | |||
89 | * Must be called on the affected CPU with interrupts disabled. Platform is | 89 | * Must be called on the affected CPU with interrupts disabled. Platform is |
90 | * responsible for ensuring that cpu_pm_enter is not called twice on the same | 90 | * responsible for ensuring that cpu_pm_enter is not called twice on the same |
91 | * CPU before cpu_pm_exit is called. Notified drivers can include VFP | 91 | * CPU before cpu_pm_exit is called. Notified drivers can include VFP |
92 | * co-processor, interrupt controller and it's PM extensions, local CPU | 92 | * co-processor, interrupt controller and its PM extensions, local CPU |
93 | * timers context save/restore which shouldn't be interrupted. Hence it | 93 | * timers context save/restore which shouldn't be interrupted. Hence it |
94 | * must be called with interrupts disabled. | 94 | * must be called with interrupts disabled. |
95 | * | 95 | * |
@@ -115,13 +115,13 @@ int cpu_pm_enter(void) | |||
115 | EXPORT_SYMBOL_GPL(cpu_pm_enter); | 115 | EXPORT_SYMBOL_GPL(cpu_pm_enter); |
116 | 116 | ||
117 | /** | 117 | /** |
118 | * cpm_pm_exit - CPU low power exit notifier | 118 | * cpu_pm_exit - CPU low power exit notifier |
119 | * | 119 | * |
120 | * Notifies listeners that a single CPU is exiting a low power state that may | 120 | * Notifies listeners that a single CPU is exiting a low power state that may |
121 | * have caused some blocks in the same power domain as the cpu to reset. | 121 | * have caused some blocks in the same power domain as the cpu to reset. |
122 | * | 122 | * |
123 | * Notified drivers can include VFP co-processor, interrupt controller | 123 | * Notified drivers can include VFP co-processor, interrupt controller |
124 | * and it's PM extensions, local CPU timers context save/restore which | 124 | * and its PM extensions, local CPU timers context save/restore which |
125 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 125 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
126 | * | 126 | * |
127 | * Return conditions are same as __raw_notifier_call_chain. | 127 | * Return conditions are same as __raw_notifier_call_chain. |
@@ -139,7 +139,7 @@ int cpu_pm_exit(void) | |||
139 | EXPORT_SYMBOL_GPL(cpu_pm_exit); | 139 | EXPORT_SYMBOL_GPL(cpu_pm_exit); |
140 | 140 | ||
141 | /** | 141 | /** |
142 | * cpm_cluster_pm_enter - CPU cluster low power entry notifier | 142 | * cpu_cluster_pm_enter - CPU cluster low power entry notifier |
143 | * | 143 | * |
144 | * Notifies listeners that all cpus in a power domain are entering a low power | 144 | * Notifies listeners that all cpus in a power domain are entering a low power |
145 | * state that may cause some blocks in the same power domain to reset. | 145 | * state that may cause some blocks in the same power domain to reset. |
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); | |||
147 | * Must be called after cpu_pm_enter has been called on all cpus in the power | 147 | * Must be called after cpu_pm_enter has been called on all cpus in the power |
148 | * domain, and before cpu_pm_exit has been called on any cpu in the power | 148 | * domain, and before cpu_pm_exit has been called on any cpu in the power |
149 | * domain. Notified drivers can include VFP co-processor, interrupt controller | 149 | * domain. Notified drivers can include VFP co-processor, interrupt controller |
150 | * and it's PM extensions, local CPU timers context save/restore which | 150 | * and its PM extensions, local CPU timers context save/restore which |
151 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 151 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
152 | * | 152 | * |
153 | * Must be called with interrupts disabled. | 153 | * Must be called with interrupts disabled. |
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void) | |||
174 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | 174 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); |
175 | 175 | ||
176 | /** | 176 | /** |
177 | * cpm_cluster_pm_exit - CPU cluster low power exit notifier | 177 | * cpu_cluster_pm_exit - CPU cluster low power exit notifier |
178 | * | 178 | * |
179 | * Notifies listeners that all cpus in a power domain are exiting from a | 179 | * Notifies listeners that all cpus in a power domain are exiting from a |
180 | * low power state that may have caused some blocks in the same power domain | 180 | * low power state that may have caused some blocks in the same power domain |
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | |||
183 | * Must be called after cpu_pm_exit has been called on all cpus in the power | 183 | * Must be called after cpu_pm_exit has been called on all cpus in the power |
184 | * domain, and before cpu_pm_exit has been called on any cpu in the power | 184 | * domain, and before cpu_pm_exit has been called on any cpu in the power |
185 | * domain. Notified drivers can include VFP co-processor, interrupt controller | 185 | * domain. Notified drivers can include VFP co-processor, interrupt controller |
186 | * and it's PM extensions, local CPU timers context save/restore which | 186 | * and its PM extensions, local CPU timers context save/restore which |
187 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 187 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
188 | * | 188 | * |
189 | * Return conditions are same as __raw_notifier_call_chain. | 189 | * Return conditions are same as __raw_notifier_call_chain. |
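
The cpu_pm.c hunks are comment-only fixes (cpm_* -> cpu_* in the kernel-doc names, "it's" -> "its"). For context, a minimal consumer of these notifications looks roughly like the sketch below; it is illustrative, not taken from this patch.

#include <linux/cpu_pm.h>
#include <linux/notifier.h>

static int example_cpu_pm_notify(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:		/* save per-CPU context (VFP, irqchip, timers...) */
		break;
	case CPU_PM_ENTER_FAILED:	/* low-power entry was aborted */
	case CPU_PM_EXIT:		/* restore per-CPU context */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
	.notifier_call = example_cpu_pm_notify,
};

/* Registered from driver init with cpu_pm_register_notifier(&example_cpu_pm_nb). */
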
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 14f7070b4ba2..8c8bd652dd12 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1765,28 +1765,17 @@ static struct cftype files[] = { | |||
1765 | .write_u64 = cpuset_write_u64, | 1765 | .write_u64 = cpuset_write_u64, |
1766 | .private = FILE_SPREAD_SLAB, | 1766 | .private = FILE_SPREAD_SLAB, |
1767 | }, | 1767 | }, |
1768 | }; | ||
1769 | |||
1770 | static struct cftype cft_memory_pressure_enabled = { | ||
1771 | .name = "memory_pressure_enabled", | ||
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }; | ||
1776 | 1768 | ||
1777 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 1769 | { |
1778 | { | 1770 | .name = "memory_pressure_enabled", |
1779 | int err; | 1771 | .flags = CFTYPE_ONLY_ON_ROOT, |
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }, | ||
1780 | 1776 | ||
1781 | err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 1777 | { } /* terminate */ |
1782 | if (err) | 1778 | }; |
1783 | return err; | ||
1784 | /* memory_pressure_enabled is in root cpuset only */ | ||
1785 | if (!cont->parent) | ||
1786 | err = cgroup_add_file(cont, ss, | ||
1787 | &cft_memory_pressure_enabled); | ||
1788 | return err; | ||
1789 | } | ||
1790 | 1779 | ||
1791 | /* | 1780 | /* |
1792 | * post_clone() is called during cgroup_create() when the | 1781 | * post_clone() is called during cgroup_create() when the |
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
1887 | .destroy = cpuset_destroy, | 1876 | .destroy = cpuset_destroy, |
1888 | .can_attach = cpuset_can_attach, | 1877 | .can_attach = cpuset_can_attach, |
1889 | .attach = cpuset_attach, | 1878 | .attach = cpuset_attach, |
1890 | .populate = cpuset_populate, | ||
1891 | .post_clone = cpuset_post_clone, | 1879 | .post_clone = cpuset_post_clone, |
1892 | .subsys_id = cpuset_subsys_id, | 1880 | .subsys_id = cpuset_subsys_id, |
1881 | .base_cftypes = files, | ||
1893 | .early_init = 1, | 1882 | .early_init = 1, |
1894 | }; | 1883 | }; |
1895 | 1884 | ||
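
The cpuset.c hunk switches the subsystem from a populate() callback plus a special-cased root-only file to a single sentinel-terminated cftype array hung off ->base_cftypes, with the CFTYPE_ONLY_ON_ROOT flag replacing the manual !cont->parent check. The same declarative pattern, sketched for a made-up controller (all names below are illustrative):

static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft);
static int example_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val);

static struct cftype example_files[] = {
	{
		.name = "value",
		.read_u64 = example_read_u64,
		.write_u64 = example_write_u64,
	},
	{
		.name = "root_only_knob",
		.flags = CFTYPE_ONLY_ON_ROOT,	/* created only in the root cgroup */
		.read_u64 = example_read_u64,
	},
	{ }	/* terminate */
};

struct cgroup_subsys example_subsys = {
	.name = "example",
	.base_cftypes = example_files,
	/* .create/.destroy/.attach etc. as needed */
};
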
diff --git a/kernel/cred.c b/kernel/cred.c index e70683d9ec32..de728ac50d82 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -49,6 +49,14 @@ struct cred init_cred = { | |||
49 | .subscribers = ATOMIC_INIT(2), | 49 | .subscribers = ATOMIC_INIT(2), |
50 | .magic = CRED_MAGIC, | 50 | .magic = CRED_MAGIC, |
51 | #endif | 51 | #endif |
52 | .uid = GLOBAL_ROOT_UID, | ||
53 | .gid = GLOBAL_ROOT_GID, | ||
54 | .suid = GLOBAL_ROOT_UID, | ||
55 | .sgid = GLOBAL_ROOT_GID, | ||
56 | .euid = GLOBAL_ROOT_UID, | ||
57 | .egid = GLOBAL_ROOT_GID, | ||
58 | .fsuid = GLOBAL_ROOT_UID, | ||
59 | .fsgid = GLOBAL_ROOT_GID, | ||
52 | .securebits = SECUREBITS_DEFAULT, | 60 | .securebits = SECUREBITS_DEFAULT, |
53 | .cap_inheritable = CAP_EMPTY_SET, | 61 | .cap_inheritable = CAP_EMPTY_SET, |
54 | .cap_permitted = CAP_FULL_SET, | 62 | .cap_permitted = CAP_FULL_SET, |
@@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu) | |||
148 | if (cred->group_info) | 156 | if (cred->group_info) |
149 | put_group_info(cred->group_info); | 157 | put_group_info(cred->group_info); |
150 | free_uid(cred->user); | 158 | free_uid(cred->user); |
159 | put_user_ns(cred->user_ns); | ||
151 | kmem_cache_free(cred_jar, cred); | 160 | kmem_cache_free(cred_jar, cred); |
152 | } | 161 | } |
153 | 162 | ||
@@ -198,13 +207,6 @@ void exit_creds(struct task_struct *tsk) | |||
198 | validate_creds(cred); | 207 | validate_creds(cred); |
199 | alter_cred_subscribers(cred, -1); | 208 | alter_cred_subscribers(cred, -1); |
200 | put_cred(cred); | 209 | put_cred(cred); |
201 | |||
202 | cred = (struct cred *) tsk->replacement_session_keyring; | ||
203 | if (cred) { | ||
204 | tsk->replacement_session_keyring = NULL; | ||
205 | validate_creds(cred); | ||
206 | put_cred(cred); | ||
207 | } | ||
208 | } | 210 | } |
209 | 211 | ||
210 | /** | 212 | /** |
@@ -303,6 +305,7 @@ struct cred *prepare_creds(void) | |||
303 | set_cred_subscribers(new, 0); | 305 | set_cred_subscribers(new, 0); |
304 | get_group_info(new->group_info); | 306 | get_group_info(new->group_info); |
305 | get_uid(new->user); | 307 | get_uid(new->user); |
308 | get_user_ns(new->user_ns); | ||
306 | 309 | ||
307 | #ifdef CONFIG_KEYS | 310 | #ifdef CONFIG_KEYS |
308 | key_get(new->thread_keyring); | 311 | key_get(new->thread_keyring); |
@@ -386,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
386 | struct cred *new; | 389 | struct cred *new; |
387 | int ret; | 390 | int ret; |
388 | 391 | ||
389 | p->replacement_session_keyring = NULL; | ||
390 | |||
391 | if ( | 392 | if ( |
392 | #ifdef CONFIG_KEYS | 393 | #ifdef CONFIG_KEYS |
393 | !p->cred->thread_keyring && | 394 | !p->cred->thread_keyring && |
@@ -414,11 +415,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
414 | goto error_put; | 415 | goto error_put; |
415 | } | 416 | } |
416 | 417 | ||
417 | /* cache user_ns in cred. Doesn't need a refcount because it will | ||
418 | * stay pinned by cred->user | ||
419 | */ | ||
420 | new->user_ns = new->user->user_ns; | ||
421 | |||
422 | #ifdef CONFIG_KEYS | 418 | #ifdef CONFIG_KEYS |
423 | /* new threads get their own thread keyrings if their parent already | 419 | /* new threads get their own thread keyrings if their parent already |
424 | * had one */ | 420 | * had one */ |
@@ -493,10 +489,10 @@ int commit_creds(struct cred *new) | |||
493 | get_cred(new); /* we will require a ref for the subj creds too */ | 489 | get_cred(new); /* we will require a ref for the subj creds too */ |
494 | 490 | ||
495 | /* dumpability changes */ | 491 | /* dumpability changes */ |
496 | if (old->euid != new->euid || | 492 | if (!uid_eq(old->euid, new->euid) || |
497 | old->egid != new->egid || | 493 | !gid_eq(old->egid, new->egid) || |
498 | old->fsuid != new->fsuid || | 494 | !uid_eq(old->fsuid, new->fsuid) || |
499 | old->fsgid != new->fsgid || | 495 | !gid_eq(old->fsgid, new->fsgid) || |
500 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { | 496 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { |
501 | if (task->mm) | 497 | if (task->mm) |
502 | set_dumpable(task->mm, suid_dumpable); | 498 | set_dumpable(task->mm, suid_dumpable); |
@@ -505,9 +501,9 @@ int commit_creds(struct cred *new) | |||
505 | } | 501 | } |
506 | 502 | ||
507 | /* alter the thread keyring */ | 503 | /* alter the thread keyring */ |
508 | if (new->fsuid != old->fsuid) | 504 | if (!uid_eq(new->fsuid, old->fsuid)) |
509 | key_fsuid_changed(task); | 505 | key_fsuid_changed(task); |
510 | if (new->fsgid != old->fsgid) | 506 | if (!gid_eq(new->fsgid, old->fsgid)) |
511 | key_fsgid_changed(task); | 507 | key_fsgid_changed(task); |
512 | 508 | ||
513 | /* do it | 509 | /* do it |
@@ -524,16 +520,16 @@ int commit_creds(struct cred *new) | |||
524 | alter_cred_subscribers(old, -2); | 520 | alter_cred_subscribers(old, -2); |
525 | 521 | ||
526 | /* send notifications */ | 522 | /* send notifications */ |
527 | if (new->uid != old->uid || | 523 | if (!uid_eq(new->uid, old->uid) || |
528 | new->euid != old->euid || | 524 | !uid_eq(new->euid, old->euid) || |
529 | new->suid != old->suid || | 525 | !uid_eq(new->suid, old->suid) || |
530 | new->fsuid != old->fsuid) | 526 | !uid_eq(new->fsuid, old->fsuid)) |
531 | proc_id_connector(task, PROC_EVENT_UID); | 527 | proc_id_connector(task, PROC_EVENT_UID); |
532 | 528 | ||
533 | if (new->gid != old->gid || | 529 | if (!gid_eq(new->gid, old->gid) || |
534 | new->egid != old->egid || | 530 | !gid_eq(new->egid, old->egid) || |
535 | new->sgid != old->sgid || | 531 | !gid_eq(new->sgid, old->sgid) || |
536 | new->fsgid != old->fsgid) | 532 | !gid_eq(new->fsgid, old->fsgid)) |
537 | proc_id_connector(task, PROC_EVENT_GID); | 533 | proc_id_connector(task, PROC_EVENT_GID); |
538 | 534 | ||
539 | /* release the old obj and subj refs both */ | 535 | /* release the old obj and subj refs both */ |
@@ -678,6 +674,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
678 | atomic_set(&new->usage, 1); | 674 | atomic_set(&new->usage, 1); |
679 | set_cred_subscribers(new, 0); | 675 | set_cred_subscribers(new, 0); |
680 | get_uid(new->user); | 676 | get_uid(new->user); |
677 | get_user_ns(new->user_ns); | ||
681 | get_group_info(new->group_info); | 678 | get_group_info(new->group_info); |
682 | 679 | ||
683 | #ifdef CONFIG_KEYS | 680 | #ifdef CONFIG_KEYS |
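
The uid/gid churn in cred.c comes from the user-namespace conversion: kernel-internal uids and gids become distinct kuid_t/kgid_t types, so raw ==/!= comparisons no longer compile and uid_eq()/gid_eq() must be used instead. A stripped-down sketch of the idea (the real definitions live in include/linux/uidgid.h; this is a simplification, not a quote of them):

typedef struct {
	uid_t val;
} kuid_t;

static inline bool uid_eq(kuid_t left, kuid_t right)
{
	return left.val == right.val;
}

This is also why init_cred now carries explicit GLOBAL_ROOT_UID/GLOBAL_ROOT_GID initializers, and why prepare_creds()/prepare_kernel_cred() take their own user_ns reference (dropped in put_cred_rcu()) instead of relying on cred->user to pin it.
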
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 67b847dfa2bb..1f91413edb87 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
15 | #include <linux/string.h> | 15 | #include <linux/string.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/kmsg_dump.h> | ||
17 | #include <linux/reboot.h> | 18 | #include <linux/reboot.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/sysrq.h> | 20 | #include <linux/sysrq.h> |
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv) | |||
2040 | */ | 2041 | */ |
2041 | static int kdb_dmesg(int argc, const char **argv) | 2042 | static int kdb_dmesg(int argc, const char **argv) |
2042 | { | 2043 | { |
2043 | char *syslog_data[4], *start, *end, c = '\0', *p; | 2044 | int diag; |
2044 | int diag, logging, logsize, lines = 0, adjust = 0, n; | 2045 | int logging; |
2046 | int lines = 0; | ||
2047 | int adjust = 0; | ||
2048 | int n = 0; | ||
2049 | int skip = 0; | ||
2050 | struct kmsg_dumper dumper = { .active = 1 }; | ||
2051 | size_t len; | ||
2052 | char buf[201]; | ||
2045 | 2053 | ||
2046 | if (argc > 2) | 2054 | if (argc > 2) |
2047 | return KDB_ARGCOUNT; | 2055 | return KDB_ARGCOUNT; |
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2064 | kdb_set(2, setargs); | 2072 | kdb_set(2, setargs); |
2065 | } | 2073 | } |
2066 | 2074 | ||
2067 | /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] | 2075 | kmsg_dump_rewind_nolock(&dumper); |
2068 | * logical start, end+1. */ | 2076 | while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) |
2069 | kdb_syslog_data(syslog_data); | 2077 | n++; |
2070 | if (syslog_data[2] == syslog_data[3]) | 2078 | |
2071 | return 0; | ||
2072 | logsize = syslog_data[1] - syslog_data[0]; | ||
2073 | start = syslog_data[2]; | ||
2074 | end = syslog_data[3]; | ||
2075 | #define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) | ||
2076 | for (n = 0, p = start; p < end; ++p) { | ||
2077 | c = *KDB_WRAP(p); | ||
2078 | if (c == '\n') | ||
2079 | ++n; | ||
2080 | } | ||
2081 | if (c != '\n') | ||
2082 | ++n; | ||
2083 | if (lines < 0) { | 2079 | if (lines < 0) { |
2084 | if (adjust >= n) | 2080 | if (adjust >= n) |
2085 | kdb_printf("buffer only contains %d lines, nothing " | 2081 | kdb_printf("buffer only contains %d lines, nothing " |
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2087 | else if (adjust - lines >= n) | 2083 | else if (adjust - lines >= n) |
2088 | kdb_printf("buffer only contains %d lines, last %d " | 2084 | kdb_printf("buffer only contains %d lines, last %d " |
2089 | "lines printed\n", n, n - adjust); | 2085 | "lines printed\n", n, n - adjust); |
2090 | if (adjust) { | 2086 | skip = adjust; |
2091 | for (; start < end && adjust; ++start) { | 2087 | lines = abs(lines); |
2092 | if (*KDB_WRAP(start) == '\n') | ||
2093 | --adjust; | ||
2094 | } | ||
2095 | if (start < end) | ||
2096 | ++start; | ||
2097 | } | ||
2098 | for (p = start; p < end && lines; ++p) { | ||
2099 | if (*KDB_WRAP(p) == '\n') | ||
2100 | ++lines; | ||
2101 | } | ||
2102 | end = p; | ||
2103 | } else if (lines > 0) { | 2088 | } else if (lines > 0) { |
2104 | int skip = n - (adjust + lines); | 2089 | skip = n - lines - adjust; |
2090 | lines = abs(lines); | ||
2105 | if (adjust >= n) { | 2091 | if (adjust >= n) { |
2106 | kdb_printf("buffer only contains %d lines, " | 2092 | kdb_printf("buffer only contains %d lines, " |
2107 | "nothing printed\n", n); | 2093 | "nothing printed\n", n); |
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2112 | kdb_printf("buffer only contains %d lines, first " | 2098 | kdb_printf("buffer only contains %d lines, first " |
2113 | "%d lines printed\n", n, lines); | 2099 | "%d lines printed\n", n, lines); |
2114 | } | 2100 | } |
2115 | for (; start < end && skip; ++start) { | 2101 | } else { |
2116 | if (*KDB_WRAP(start) == '\n') | 2102 | lines = n; |
2117 | --skip; | ||
2118 | } | ||
2119 | for (p = start; p < end && lines; ++p) { | ||
2120 | if (*KDB_WRAP(p) == '\n') | ||
2121 | --lines; | ||
2122 | } | ||
2123 | end = p; | ||
2124 | } | 2103 | } |
2125 | /* Do a line at a time (max 200 chars) to reduce protocol overhead */ | 2104 | |
2126 | c = '\n'; | 2105 | if (skip >= n || skip < 0) |
2127 | while (start != end) { | 2106 | return 0; |
2128 | char buf[201]; | 2107 | |
2129 | p = buf; | 2108 | kmsg_dump_rewind_nolock(&dumper); |
2130 | if (KDB_FLAG(CMD_INTERRUPT)) | 2109 | while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { |
2131 | return 0; | 2110 | if (skip) { |
2132 | while (start < end && (c = *KDB_WRAP(start)) && | 2111 | skip--; |
2133 | (p - buf) < sizeof(buf)-1) { | 2112 | continue; |
2134 | ++start; | ||
2135 | *p++ = c; | ||
2136 | if (c == '\n') | ||
2137 | break; | ||
2138 | } | 2113 | } |
2139 | *p = '\0'; | 2114 | if (!lines--) |
2140 | kdb_printf("%s", buf); | 2115 | break; |
2116 | |||
2117 | kdb_printf("%.*s\n", (int)len - 1, buf); | ||
2141 | } | 2118 | } |
2142 | if (c != '\n') | ||
2143 | kdb_printf("\n"); | ||
2144 | 2119 | ||
2145 | return 0; | 2120 | return 0; |
2146 | } | 2121 | } |
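
The kdb_dmesg() rewrite above drops the raw syslog_data[] pointer walking (and the kdb_syslog_data() hook removed from kdb_private.h below) in favour of the kmsg_dumper iteration API. Reduced to its core, the two-pass pattern it uses is sketched here; the helper name is illustrative.

static void example_print_last_lines(int want)
{
	struct kmsg_dumper dumper = { .active = 1 };
	char buf[201];
	size_t len;
	int n = 0;

	/* First pass: count the lines currently in the log buffer. */
	kmsg_dump_rewind_nolock(&dumper);
	while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
		n++;

	/* Second pass: skip down to the last 'want' lines and print them. */
	kmsg_dump_rewind_nolock(&dumper);
	while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
		if (n-- > want)
			continue;
		kdb_printf("%.*s\n", (int)len - 1, buf);
	}
}
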
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 47c4e56e513b..392ec6a25844 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -205,7 +205,6 @@ extern char kdb_grep_string[]; | |||
205 | extern int kdb_grep_leading; | 205 | extern int kdb_grep_leading; |
206 | extern int kdb_grep_trailing; | 206 | extern int kdb_grep_trailing; |
207 | extern char *kdb_cmds[]; | 207 | extern char *kdb_cmds[]; |
208 | extern void kdb_syslog_data(char *syslog_data[]); | ||
209 | extern unsigned long kdb_task_state_string(const char *); | 208 | extern unsigned long kdb_task_state_string(const char *); |
210 | extern char kdb_task_state_char (const struct task_struct *); | 209 | extern char kdb_task_state_char (const struct task_struct *); |
211 | extern unsigned long kdb_task_state(const struct task_struct *p, | 210 | extern unsigned long kdb_task_state(const struct task_struct *p, |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 22d901f9caf4..103f5d147b2f 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile | |||
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg | |||
3 | endif | 3 | endif |
4 | 4 | ||
5 | obj-y := core.o ring_buffer.o callchain.o | 5 | obj-y := core.o ring_buffer.o callchain.o |
6 | |||
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 7 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
8 | obj-$(CONFIG_UPROBES) += uprobes.o | ||
9 | |||
diff --git a/kernel/events/core.c b/kernel/events/core.c index fd126f82b57c..d7d71d6ec972 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) | |||
253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | 253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; |
254 | } | 254 | } |
255 | 255 | ||
256 | static inline void perf_get_cgroup(struct perf_event *event) | 256 | static inline bool perf_tryget_cgroup(struct perf_event *event) |
257 | { | 257 | { |
258 | css_get(&event->cgrp->css); | 258 | return css_tryget(&event->cgrp->css); |
259 | } | 259 | } |
260 | 260 | ||
261 | static inline void perf_put_cgroup(struct perf_event *event) | 261 | static inline void perf_put_cgroup(struct perf_event *event) |
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
484 | event->cgrp = cgrp; | 484 | event->cgrp = cgrp; |
485 | 485 | ||
486 | /* must be done before we fput() the file */ | 486 | /* must be done before we fput() the file */ |
487 | perf_get_cgroup(event); | 487 | if (!perf_tryget_cgroup(event)) { |
488 | event->cgrp = NULL; | ||
489 | ret = -ENOENT; | ||
490 | goto out; | ||
491 | } | ||
488 | 492 | ||
489 | /* | 493 | /* |
490 | * all events in a group must monitor | 494 | * all events in a group must monitor |
@@ -3181,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event, | |||
3181 | event = event->group_leader; | 3185 | event = event->group_leader; |
3182 | 3186 | ||
3183 | perf_event_for_each_child(event, func); | 3187 | perf_event_for_each_child(event, func); |
3184 | func(event); | ||
3185 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 3188 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
3186 | perf_event_for_each_child(sibling, func); | 3189 | perf_event_for_each_child(sibling, func); |
3187 | mutex_unlock(&ctx->mutex); | 3190 | mutex_unlock(&ctx->mutex); |
@@ -4957,7 +4960,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | |||
4957 | if (rctx < 0) | 4960 | if (rctx < 0) |
4958 | return; | 4961 | return; |
4959 | 4962 | ||
4960 | perf_sample_data_init(&data, addr); | 4963 | perf_sample_data_init(&data, addr, 0); |
4961 | 4964 | ||
4962 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 4965 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
4963 | 4966 | ||
@@ -5215,7 +5218,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5215 | .data = record, | 5218 | .data = record, |
5216 | }; | 5219 | }; |
5217 | 5220 | ||
5218 | perf_sample_data_init(&data, addr); | 5221 | perf_sample_data_init(&data, addr, 0); |
5219 | data.raw = &raw; | 5222 | data.raw = &raw; |
5220 | 5223 | ||
5221 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5224 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
@@ -5318,7 +5321,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
5318 | struct perf_sample_data sample; | 5321 | struct perf_sample_data sample; |
5319 | struct pt_regs *regs = data; | 5322 | struct pt_regs *regs = data; |
5320 | 5323 | ||
5321 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5324 | perf_sample_data_init(&sample, bp->attr.bp_addr, 0); |
5322 | 5325 | ||
5323 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | 5326 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) |
5324 | perf_swevent_event(bp, 1, &sample, regs); | 5327 | perf_swevent_event(bp, 1, &sample, regs); |
@@ -5344,13 +5347,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5344 | 5347 | ||
5345 | event->pmu->read(event); | 5348 | event->pmu->read(event); |
5346 | 5349 | ||
5347 | perf_sample_data_init(&data, 0); | 5350 | perf_sample_data_init(&data, 0, event->hw.last_period); |
5348 | data.period = event->hw.last_period; | ||
5349 | regs = get_irq_regs(); | 5351 | regs = get_irq_regs(); |
5350 | 5352 | ||
5351 | if (regs && !perf_exclude_event(event, regs)) { | 5353 | if (regs && !perf_exclude_event(event, regs)) { |
5352 | if (!(event->attr.exclude_idle && is_idle_task(current))) | 5354 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5353 | if (perf_event_overflow(event, &data, regs)) | 5355 | if (__perf_event_overflow(event, 1, &data, regs)) |
5354 | ret = HRTIMER_NORESTART; | 5356 | ret = HRTIMER_NORESTART; |
5355 | } | 5357 | } |
5356 | 5358 | ||
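
The repeated perf_sample_data_init(&data, addr) -> perf_sample_data_init(&data, addr, 0) conversions reflect the helper gaining a period parameter, so callers such as perf_swevent_hrtimer() no longer assign data.period by hand (that path also now calls __perf_event_overflow() directly). What the updated inline in include/linux/perf_event.h plausibly looks like; treat the exact field list as an assumption rather than a quote of this patch:

static inline void perf_sample_data_init(struct perf_sample_data *data,
					 u64 addr, u64 period)
{
	/* Fields not listed here are filled in by the output path. */
	data->addr = addr;
	data->raw = NULL;
	data->period = period;
}
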
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c new file mode 100644 index 000000000000..985be4d80fe8 --- /dev/null +++ b/kernel/events/uprobes.c | |||
@@ -0,0 +1,1667 @@ | |||
1 | /* | ||
2 | * User-space Probes (UProbes) | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2008-2012 | ||
19 | * Authors: | ||
20 | * Srikar Dronamraju | ||
21 | * Jim Keniston | ||
22 | * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
23 | */ | ||
24 | |||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/highmem.h> | ||
27 | #include <linux/pagemap.h> /* read_mapping_page */ | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/rmap.h> /* anon_vma_prepare */ | ||
31 | #include <linux/mmu_notifier.h> /* set_pte_at_notify */ | ||
32 | #include <linux/swap.h> /* try_to_free_swap */ | ||
33 | #include <linux/ptrace.h> /* user_enable_single_step */ | ||
34 | #include <linux/kdebug.h> /* notifier mechanism */ | ||
35 | |||
36 | #include <linux/uprobes.h> | ||
37 | |||
38 | #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) | ||
39 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE | ||
40 | |||
41 | static struct srcu_struct uprobes_srcu; | ||
42 | static struct rb_root uprobes_tree = RB_ROOT; | ||
43 | |||
44 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ | ||
45 | |||
46 | #define UPROBES_HASH_SZ 13 | ||
47 | |||
48 | /* serialize (un)register */ | ||
49 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | ||
50 | |||
51 | #define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | ||
52 | |||
53 | /* serialize uprobe->pending_list */ | ||
54 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | ||
55 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | ||
56 | |||
57 | /* | ||
58 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe | ||
59 | * events active at this time. Probably a fine-grained per-inode count is | ||
60 | * better? | ||
61 | */ | ||
62 | static atomic_t uprobe_events = ATOMIC_INIT(0); | ||
63 | |||
64 | /* | ||
65 | * Maintain a temporary per vma info that can be used to search if a vma | ||
66 | * has already been handled. This structure is introduced since extending | ||
67 | * vm_area_struct wasn't recommended. | ||
68 | */ | ||
69 | struct vma_info { | ||
70 | struct list_head probe_list; | ||
71 | struct mm_struct *mm; | ||
72 | loff_t vaddr; | ||
73 | }; | ||
74 | |||
75 | struct uprobe { | ||
76 | struct rb_node rb_node; /* node in the rb tree */ | ||
77 | atomic_t ref; | ||
78 | struct rw_semaphore consumer_rwsem; | ||
79 | struct list_head pending_list; | ||
80 | struct uprobe_consumer *consumers; | ||
81 | struct inode *inode; /* Also hold a ref to inode */ | ||
82 | loff_t offset; | ||
83 | int flags; | ||
84 | struct arch_uprobe arch; | ||
85 | }; | ||
86 | |||
87 | /* | ||
88 | * valid_vma: Verify if the specified vma is an executable vma | ||
89 | * Relax restrictions while unregistering: vm_flags might have | ||
90 | * changed after breakpoint was inserted. | ||
91 | * - is_register: indicates if we are in register context. | ||
92 | * - Return 1 if the specified virtual address is in an | ||
93 | * executable vma. | ||
94 | */ | ||
95 | static bool valid_vma(struct vm_area_struct *vma, bool is_register) | ||
96 | { | ||
97 | if (!vma->vm_file) | ||
98 | return false; | ||
99 | |||
100 | if (!is_register) | ||
101 | return true; | ||
102 | |||
103 | if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) | ||
104 | return true; | ||
105 | |||
106 | return false; | ||
107 | } | ||
108 | |||
109 | static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) | ||
110 | { | ||
111 | loff_t vaddr; | ||
112 | |||
113 | vaddr = vma->vm_start + offset; | ||
114 | vaddr -= vma->vm_pgoff << PAGE_SHIFT; | ||
115 | |||
116 | return vaddr; | ||
117 | } | ||
118 | |||
119 | /** | ||
120 | * __replace_page - replace page in vma by new page. | ||
121 | * based on replace_page in mm/ksm.c | ||
122 | * | ||
123 | * @vma: vma that holds the pte pointing to page | ||
124 | * @page: the cowed page we are replacing by kpage | ||
125 | * @kpage: the modified page we replace page by | ||
126 | * | ||
127 | * Returns 0 on success, -EFAULT on failure. | ||
128 | */ | ||
129 | static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) | ||
130 | { | ||
131 | struct mm_struct *mm = vma->vm_mm; | ||
132 | pgd_t *pgd; | ||
133 | pud_t *pud; | ||
134 | pmd_t *pmd; | ||
135 | pte_t *ptep; | ||
136 | spinlock_t *ptl; | ||
137 | unsigned long addr; | ||
138 | int err = -EFAULT; | ||
139 | |||
140 | addr = page_address_in_vma(page, vma); | ||
141 | if (addr == -EFAULT) | ||
142 | goto out; | ||
143 | |||
144 | pgd = pgd_offset(mm, addr); | ||
145 | if (!pgd_present(*pgd)) | ||
146 | goto out; | ||
147 | |||
148 | pud = pud_offset(pgd, addr); | ||
149 | if (!pud_present(*pud)) | ||
150 | goto out; | ||
151 | |||
152 | pmd = pmd_offset(pud, addr); | ||
153 | if (!pmd_present(*pmd)) | ||
154 | goto out; | ||
155 | |||
156 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | ||
157 | if (!ptep) | ||
158 | goto out; | ||
159 | |||
160 | get_page(kpage); | ||
161 | page_add_new_anon_rmap(kpage, vma, addr); | ||
162 | |||
163 | if (!PageAnon(page)) { | ||
164 | dec_mm_counter(mm, MM_FILEPAGES); | ||
165 | inc_mm_counter(mm, MM_ANONPAGES); | ||
166 | } | ||
167 | |||
168 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | ||
169 | ptep_clear_flush(vma, addr, ptep); | ||
170 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | ||
171 | |||
172 | page_remove_rmap(page); | ||
173 | if (!page_mapped(page)) | ||
174 | try_to_free_swap(page); | ||
175 | put_page(page); | ||
176 | pte_unmap_unlock(ptep, ptl); | ||
177 | err = 0; | ||
178 | |||
179 | out: | ||
180 | return err; | ||
181 | } | ||
182 | |||
183 | /** | ||
184 | * is_swbp_insn - check if instruction is breakpoint instruction. | ||
185 | * @insn: instruction to be checked. | ||
186 | * Default implementation of is_swbp_insn | ||
187 | * Returns true if @insn is a breakpoint instruction. | ||
188 | */ | ||
189 | bool __weak is_swbp_insn(uprobe_opcode_t *insn) | ||
190 | { | ||
191 | return *insn == UPROBE_SWBP_INSN; | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * NOTE: | ||
196 | * Expect the breakpoint instruction to be the smallest size instruction for | ||
197 | * the architecture. If an arch has variable-length instructions and the | ||
198 | * breakpoint instruction is not of the smallest length supported by | ||
199 | * that architecture, then we need to modify read_opcode / | ||
200 | * write_opcode accordingly. This would never be a problem for archs that | ||
201 | * have fixed length instructions. | ||
202 | */ | ||
203 | |||
204 | /* | ||
205 | * write_opcode - write the opcode at a given virtual address. | ||
206 | * @auprobe: arch breakpointing information. | ||
207 | * @mm: the probed process address space. | ||
208 | * @vaddr: the virtual address to store the opcode. | ||
209 | * @opcode: opcode to be written at @vaddr. | ||
210 | * | ||
211 | * Called with mm->mmap_sem held (for read and with a reference to | ||
212 | * mm). | ||
213 | * | ||
214 | * For mm @mm, write the opcode at @vaddr. | ||
215 | * Return 0 (success) or a negative errno. | ||
216 | */ | ||
217 | static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | ||
218 | unsigned long vaddr, uprobe_opcode_t opcode) | ||
219 | { | ||
220 | struct page *old_page, *new_page; | ||
221 | struct address_space *mapping; | ||
222 | void *vaddr_old, *vaddr_new; | ||
223 | struct vm_area_struct *vma; | ||
224 | struct uprobe *uprobe; | ||
225 | loff_t addr; | ||
226 | int ret; | ||
227 | |||
228 | /* Read the page with vaddr into memory */ | ||
229 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); | ||
230 | if (ret <= 0) | ||
231 | return ret; | ||
232 | |||
233 | ret = -EINVAL; | ||
234 | |||
235 | /* | ||
236 | * We are interested in text pages only. Our pages of interest | ||
237 | * should be mapped for read and execute only. We desist from | ||
238 | * adding probes in write mapped pages since the breakpoints | ||
239 | * might end up in the file copy. | ||
240 | */ | ||
241 | if (!valid_vma(vma, is_swbp_insn(&opcode))) | ||
242 | goto put_out; | ||
243 | |||
244 | uprobe = container_of(auprobe, struct uprobe, arch); | ||
245 | mapping = uprobe->inode->i_mapping; | ||
246 | if (mapping != vma->vm_file->f_mapping) | ||
247 | goto put_out; | ||
248 | |||
249 | addr = vma_address(vma, uprobe->offset); | ||
250 | if (vaddr != (unsigned long)addr) | ||
251 | goto put_out; | ||
252 | |||
253 | ret = -ENOMEM; | ||
254 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); | ||
255 | if (!new_page) | ||
256 | goto put_out; | ||
257 | |||
258 | __SetPageUptodate(new_page); | ||
259 | |||
260 | /* | ||
261 | * lock page will serialize against do_wp_page()'s | ||
262 | * PageAnon() handling | ||
263 | */ | ||
264 | lock_page(old_page); | ||
265 | /* copy the page now that we've got it stable */ | ||
266 | vaddr_old = kmap_atomic(old_page); | ||
267 | vaddr_new = kmap_atomic(new_page); | ||
268 | |||
269 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); | ||
270 | |||
271 | /* poke the new insn in, ASSUMES we don't cross page boundary */ | ||
272 | vaddr &= ~PAGE_MASK; | ||
273 | BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
274 | memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | ||
275 | |||
276 | kunmap_atomic(vaddr_new); | ||
277 | kunmap_atomic(vaddr_old); | ||
278 | |||
279 | ret = anon_vma_prepare(vma); | ||
280 | if (ret) | ||
281 | goto unlock_out; | ||
282 | |||
283 | lock_page(new_page); | ||
284 | ret = __replace_page(vma, old_page, new_page); | ||
285 | unlock_page(new_page); | ||
286 | |||
287 | unlock_out: | ||
288 | unlock_page(old_page); | ||
289 | page_cache_release(new_page); | ||
290 | |||
291 | put_out: | ||
292 | put_page(old_page); | ||
293 | |||
294 | return ret; | ||
295 | } | ||
296 | |||
297 | /** | ||
298 | * read_opcode - read the opcode at a given virtual address. | ||
299 | * @mm: the probed process address space. | ||
300 | * @vaddr: the virtual address to read the opcode. | ||
301 | * @opcode: location to store the read opcode. | ||
302 | * | ||
303 | * Called with mm->mmap_sem held (for read and with a reference to | ||
304 | * mm). | ||
305 | * | ||
306 | * For mm @mm, read the opcode at @vaddr and store it in @opcode. | ||
307 | * Return 0 (success) or a negative errno. | ||
308 | */ | ||
309 | static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) | ||
310 | { | ||
311 | struct page *page; | ||
312 | void *vaddr_new; | ||
313 | int ret; | ||
314 | |||
315 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); | ||
316 | if (ret <= 0) | ||
317 | return ret; | ||
318 | |||
319 | lock_page(page); | ||
320 | vaddr_new = kmap_atomic(page); | ||
321 | vaddr &= ~PAGE_MASK; | ||
322 | memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); | ||
323 | kunmap_atomic(vaddr_new); | ||
324 | unlock_page(page); | ||
325 | |||
326 | put_page(page); | ||
327 | |||
328 | return 0; | ||
329 | } | ||
330 | |||
331 | static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | ||
332 | { | ||
333 | uprobe_opcode_t opcode; | ||
334 | int result; | ||
335 | |||
336 | result = read_opcode(mm, vaddr, &opcode); | ||
337 | if (result) | ||
338 | return result; | ||
339 | |||
340 | if (is_swbp_insn(&opcode)) | ||
341 | return 1; | ||
342 | |||
343 | return 0; | ||
344 | } | ||
345 | |||
346 | /** | ||
347 | * set_swbp - store breakpoint at a given address. | ||
348 | * @auprobe: arch specific probepoint information. | ||
349 | * @mm: the probed process address space. | ||
350 | * @vaddr: the virtual address to insert the opcode. | ||
351 | * | ||
352 | * For mm @mm, store the breakpoint instruction at @vaddr. | ||
353 | * Return 0 (success) or a negative errno. | ||
354 | */ | ||
355 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | ||
356 | { | ||
357 | int result; | ||
358 | |||
359 | result = is_swbp_at_addr(mm, vaddr); | ||
360 | if (result == 1) | ||
361 | return -EEXIST; | ||
362 | |||
363 | if (result) | ||
364 | return result; | ||
365 | |||
366 | return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); | ||
367 | } | ||
368 | |||
369 | /** | ||
370 | * set_orig_insn - Restore the original instruction. | ||
371 | * @mm: the probed process address space. | ||
372 | * @auprobe: arch specific probepoint information. | ||
373 | * @vaddr: the virtual address to insert the opcode. | ||
374 | * @verify: if true, verify existence of breakpoint instruction. | ||
375 | * | ||
376 | * For mm @mm, restore the original opcode (opcode) at @vaddr. | ||
377 | * Return 0 (success) or a negative errno. | ||
378 | */ | ||
379 | int __weak | ||
380 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) | ||
381 | { | ||
382 | if (verify) { | ||
383 | int result; | ||
384 | |||
385 | result = is_swbp_at_addr(mm, vaddr); | ||
386 | if (!result) | ||
387 | return -EINVAL; | ||
388 | |||
389 | if (result != 1) | ||
390 | return result; | ||
391 | } | ||
392 | return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | ||
393 | } | ||
394 | |||
395 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | ||
396 | { | ||
397 | if (l->inode < r->inode) | ||
398 | return -1; | ||
399 | |||
400 | if (l->inode > r->inode) | ||
401 | return 1; | ||
402 | |||
403 | if (l->offset < r->offset) | ||
404 | return -1; | ||
405 | |||
406 | if (l->offset > r->offset) | ||
407 | return 1; | ||
408 | |||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) | ||
413 | { | ||
414 | struct uprobe u = { .inode = inode, .offset = offset }; | ||
415 | struct rb_node *n = uprobes_tree.rb_node; | ||
416 | struct uprobe *uprobe; | ||
417 | int match; | ||
418 | |||
419 | while (n) { | ||
420 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
421 | match = match_uprobe(&u, uprobe); | ||
422 | if (!match) { | ||
423 | atomic_inc(&uprobe->ref); | ||
424 | return uprobe; | ||
425 | } | ||
426 | |||
427 | if (match < 0) | ||
428 | n = n->rb_left; | ||
429 | else | ||
430 | n = n->rb_right; | ||
431 | } | ||
432 | return NULL; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * Find a uprobe corresponding to a given inode:offset | ||
437 | * Acquires uprobes_treelock | ||
438 | */ | ||
439 | static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) | ||
440 | { | ||
441 | struct uprobe *uprobe; | ||
442 | unsigned long flags; | ||
443 | |||
444 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
445 | uprobe = __find_uprobe(inode, offset); | ||
446 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
447 | |||
448 | return uprobe; | ||
449 | } | ||
450 | |||
451 | static struct uprobe *__insert_uprobe(struct uprobe *uprobe) | ||
452 | { | ||
453 | struct rb_node **p = &uprobes_tree.rb_node; | ||
454 | struct rb_node *parent = NULL; | ||
455 | struct uprobe *u; | ||
456 | int match; | ||
457 | |||
458 | while (*p) { | ||
459 | parent = *p; | ||
460 | u = rb_entry(parent, struct uprobe, rb_node); | ||
461 | match = match_uprobe(uprobe, u); | ||
462 | if (!match) { | ||
463 | atomic_inc(&u->ref); | ||
464 | return u; | ||
465 | } | ||
466 | |||
467 | if (match < 0) | ||
468 | p = &parent->rb_left; | ||
469 | else | ||
470 | p = &parent->rb_right; | ||
471 | |||
472 | } | ||
473 | |||
474 | u = NULL; | ||
475 | rb_link_node(&uprobe->rb_node, parent, p); | ||
476 | rb_insert_color(&uprobe->rb_node, &uprobes_tree); | ||
477 | /* get access + creation ref */ | ||
478 | atomic_set(&uprobe->ref, 2); | ||
479 | |||
480 | return u; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Acquire uprobes_treelock. | ||
485 | * Matching uprobe already exists in rbtree; | ||
486 | * increment (access refcount) and return the matching uprobe. | ||
487 | * | ||
488 | * No matching uprobe; insert the uprobe in rb_tree; | ||
489 | * get a double refcount (access + creation) and return NULL. | ||
490 | */ | ||
491 | static struct uprobe *insert_uprobe(struct uprobe *uprobe) | ||
492 | { | ||
493 | unsigned long flags; | ||
494 | struct uprobe *u; | ||
495 | |||
496 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
497 | u = __insert_uprobe(uprobe); | ||
498 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
499 | |||
500 | /* For now assume that the instruction need not be single-stepped */ | ||
501 | uprobe->flags |= UPROBE_SKIP_SSTEP; | ||
502 | |||
503 | return u; | ||
504 | } | ||
505 | |||
506 | static void put_uprobe(struct uprobe *uprobe) | ||
507 | { | ||
508 | if (atomic_dec_and_test(&uprobe->ref)) | ||
509 | kfree(uprobe); | ||
510 | } | ||
511 | |||
512 | static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | ||
513 | { | ||
514 | struct uprobe *uprobe, *cur_uprobe; | ||
515 | |||
516 | uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); | ||
517 | if (!uprobe) | ||
518 | return NULL; | ||
519 | |||
520 | uprobe->inode = igrab(inode); | ||
521 | uprobe->offset = offset; | ||
522 | init_rwsem(&uprobe->consumer_rwsem); | ||
523 | INIT_LIST_HEAD(&uprobe->pending_list); | ||
524 | |||
525 | /* add to uprobes_tree, sorted on inode:offset */ | ||
526 | cur_uprobe = insert_uprobe(uprobe); | ||
527 | |||
528 | /* a uprobe exists for this inode:offset combination */ | ||
529 | if (cur_uprobe) { | ||
530 | kfree(uprobe); | ||
531 | uprobe = cur_uprobe; | ||
532 | iput(inode); | ||
533 | } else { | ||
534 | atomic_inc(&uprobe_events); | ||
535 | } | ||
536 | |||
537 | return uprobe; | ||
538 | } | ||
539 | |||
540 | static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | ||
541 | { | ||
542 | struct uprobe_consumer *uc; | ||
543 | |||
544 | if (!(uprobe->flags & UPROBE_RUN_HANDLER)) | ||
545 | return; | ||
546 | |||
547 | down_read(&uprobe->consumer_rwsem); | ||
548 | for (uc = uprobe->consumers; uc; uc = uc->next) { | ||
549 | if (!uc->filter || uc->filter(uc, current)) | ||
550 | uc->handler(uc, regs); | ||
551 | } | ||
552 | up_read(&uprobe->consumer_rwsem); | ||
553 | } | ||
554 | |||
555 | /* Returns the previous consumer */ | ||
556 | static struct uprobe_consumer * | ||
557 | consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) | ||
558 | { | ||
559 | down_write(&uprobe->consumer_rwsem); | ||
560 | uc->next = uprobe->consumers; | ||
561 | uprobe->consumers = uc; | ||
562 | up_write(&uprobe->consumer_rwsem); | ||
563 | |||
564 | return uc->next; | ||
565 | } | ||
566 | |||
567 | /* | ||
568 | * For uprobe @uprobe, delete the consumer @uc. | ||
569 | * Return true if the @uc is deleted successfully | ||
570 | * or return false. | ||
571 | */ | ||
572 | static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | ||
573 | { | ||
574 | struct uprobe_consumer **con; | ||
575 | bool ret = false; | ||
576 | |||
577 | down_write(&uprobe->consumer_rwsem); | ||
578 | for (con = &uprobe->consumers; *con; con = &(*con)->next) { | ||
579 | if (*con == uc) { | ||
580 | *con = uc->next; | ||
581 | ret = true; | ||
582 | break; | ||
583 | } | ||
584 | } | ||
585 | up_write(&uprobe->consumer_rwsem); | ||
586 | |||
587 | return ret; | ||
588 | } | ||
589 | |||
590 | static int | ||
591 | __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, | ||
592 | unsigned long nbytes, unsigned long offset) | ||
593 | { | ||
594 | struct file *filp = vma->vm_file; | ||
595 | struct page *page; | ||
596 | void *vaddr; | ||
597 | unsigned long off1; | ||
598 | unsigned long idx; | ||
599 | |||
600 | if (!filp) | ||
601 | return -EINVAL; | ||
602 | |||
603 | idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); | ||
604 | off1 = offset &= ~PAGE_MASK; | ||
605 | |||
606 | /* | ||
607 | * Ensure that the page that has the original instruction is | ||
608 | * populated and in page-cache. | ||
609 | */ | ||
610 | page = read_mapping_page(mapping, idx, filp); | ||
611 | if (IS_ERR(page)) | ||
612 | return PTR_ERR(page); | ||
613 | |||
614 | vaddr = kmap_atomic(page); | ||
615 | memcpy(insn, vaddr + off1, nbytes); | ||
616 | kunmap_atomic(vaddr); | ||
617 | page_cache_release(page); | ||
618 | |||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | static int | ||
623 | copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | ||
624 | { | ||
625 | struct address_space *mapping; | ||
626 | unsigned long nbytes; | ||
627 | int bytes; | ||
628 | |||
629 | addr &= ~PAGE_MASK; | ||
630 | nbytes = PAGE_SIZE - addr; | ||
631 | mapping = uprobe->inode->i_mapping; | ||
632 | |||
633 | /* Instruction at end of binary; copy only available bytes */ | ||
634 | if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) | ||
635 | bytes = uprobe->inode->i_size - uprobe->offset; | ||
636 | else | ||
637 | bytes = MAX_UINSN_BYTES; | ||
638 | |||
639 | /* Instruction at the page-boundary; copy bytes in second page */ | ||
640 | if (nbytes < bytes) { | ||
641 | if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, | ||
642 | bytes - nbytes, uprobe->offset + nbytes)) | ||
643 | return -ENOMEM; | ||
644 | |||
645 | bytes = nbytes; | ||
646 | } | ||
647 | return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * How mm->uprobes_state.count gets updated | ||
652 | * uprobe_mmap() increments the count if | ||
653 | * - it successfully adds a breakpoint. | ||
654 | * - it cannot add a breakpoint, but sees that there is an underlying | ||
655 | * breakpoint (via a is_swbp_at_addr()). | ||
656 | * | ||
657 | * uprobe_munmap() decrements the count if | ||
658 | * - it sees an underlying breakpoint (via is_swbp_at_addr) | ||
659 | * (Subsequent uprobe_unregister wouldn't find the breakpoint | ||
660 | * unless a uprobe_mmap kicks in, since the old vma would be | ||
661 | * dropped just after uprobe_munmap.) | ||
662 | * | ||
663 | * uprobe_register increments the count if: | ||
664 | * - it successfully adds a breakpoint. | ||
665 | * | ||
666 | * uprobe_unregister decrements the count if: | ||
667 | * - it sees an underlying breakpoint and removes it successfully. | ||
668 | * (via is_swbp_at_addr) | ||
669 | * (Subsequent uprobe_munmap wouldn't find the breakpoint | ||
670 | * since there is no underlying breakpoint after the | ||
671 | * breakpoint removal.) | ||
672 | */ | ||
673 | static int | ||
674 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | ||
675 | struct vm_area_struct *vma, loff_t vaddr) | ||
676 | { | ||
677 | unsigned long addr; | ||
678 | int ret; | ||
679 | |||
680 | /* | ||
681 | * If the probe is being deleted, the unregistering thread may already | ||
682 | * be done with its vma-rmap walk. Adding a probe now can be fatal since | ||
683 | * nobody will be able to clean it up. Also, we could be called from the | ||
684 | * fork or mremap path, where the probe might have already been inserted. | ||
685 | * Hence behave as if the probe already existed. | ||
686 | */ | ||
687 | if (!uprobe->consumers) | ||
688 | return -EEXIST; | ||
689 | |||
690 | addr = (unsigned long)vaddr; | ||
691 | |||
692 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { | ||
693 | ret = copy_insn(uprobe, vma, addr); | ||
694 | if (ret) | ||
695 | return ret; | ||
696 | |||
697 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) | ||
698 | return -EEXIST; | ||
699 | |||
700 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); | ||
701 | if (ret) | ||
702 | return ret; | ||
703 | |||
704 | uprobe->flags |= UPROBE_COPY_INSN; | ||
705 | } | ||
706 | |||
707 | /* | ||
708 | * Ideally, should be updating the probe count after the breakpoint | ||
709 | * has been successfully inserted. However a thread could hit the | ||
710 | * breakpoint we just inserted even before the probe count is | ||
711 | * incremented. If this is the first breakpoint placed, breakpoint | ||
712 | * notifier might ignore uprobes and pass the trap to the thread. | ||
713 | * Hence increment before and decrement on failure. | ||
714 | */ | ||
715 | atomic_inc(&mm->uprobes_state.count); | ||
716 | ret = set_swbp(&uprobe->arch, mm, addr); | ||
717 | if (ret) | ||
718 | atomic_dec(&mm->uprobes_state.count); | ||
719 | |||
720 | return ret; | ||
721 | } | ||
722 | |||
723 | static void | ||
724 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) | ||
725 | { | ||
726 | if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) | ||
727 | atomic_dec(&mm->uprobes_state.count); | ||
728 | } | ||
729 | |||
730 | /* | ||
731 | * There could be threads that have hit the breakpoint and are entering the | ||
732 | * notifier code and trying to acquire the uprobes_treelock. The thread | ||
733 | * calling delete_uprobe() that is removing the uprobe from the rb_tree can | ||
734 | * race with these threads and might acquire the uprobes_treelock before | ||
735 | * some of the breakpoint-hit threads. In such a case, the breakpoint | ||
736 | * hit threads will not find the uprobe. The current unregistering thread | ||
737 | * waits till all other threads have hit a breakpoint, to acquire the | ||
738 | * uprobes_treelock before the uprobe is removed from the rbtree. | ||
739 | */ | ||
740 | static void delete_uprobe(struct uprobe *uprobe) | ||
741 | { | ||
742 | unsigned long flags; | ||
743 | |||
744 | synchronize_srcu(&uprobes_srcu); | ||
745 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
746 | rb_erase(&uprobe->rb_node, &uprobes_tree); | ||
747 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
748 | iput(uprobe->inode); | ||
749 | put_uprobe(uprobe); | ||
750 | atomic_dec(&uprobe_events); | ||
751 | } | ||
752 | |||
753 | static struct vma_info * | ||
754 | __find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
755 | struct vma_info *vi, loff_t offset, bool is_register) | ||
756 | { | ||
757 | struct prio_tree_iter iter; | ||
758 | struct vm_area_struct *vma; | ||
759 | struct vma_info *tmpvi; | ||
760 | unsigned long pgoff; | ||
761 | int existing_vma; | ||
762 | loff_t vaddr; | ||
763 | |||
764 | pgoff = offset >> PAGE_SHIFT; | ||
765 | |||
766 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
767 | if (!valid_vma(vma, is_register)) | ||
768 | continue; | ||
769 | |||
770 | existing_vma = 0; | ||
771 | vaddr = vma_address(vma, offset); | ||
772 | |||
773 | list_for_each_entry(tmpvi, head, probe_list) { | ||
774 | if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { | ||
775 | existing_vma = 1; | ||
776 | break; | ||
777 | } | ||
778 | } | ||
779 | |||
780 | /* | ||
781 | * Another vma needs a probe to be installed. However skip | ||
782 | * installing the probe if the vma is about to be unlinked. | ||
783 | */ | ||
784 | if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { | ||
785 | vi->mm = vma->vm_mm; | ||
786 | vi->vaddr = vaddr; | ||
787 | list_add(&vi->probe_list, head); | ||
788 | |||
789 | return vi; | ||
790 | } | ||
791 | } | ||
792 | |||
793 | return NULL; | ||
794 | } | ||
795 | |||
796 | /* | ||
797 | * Iterate in the rmap prio tree and find a vma where a probe has not | ||
798 | * yet been inserted. | ||
799 | */ | ||
800 | static struct vma_info * | ||
801 | find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
802 | loff_t offset, bool is_register) | ||
803 | { | ||
804 | struct vma_info *vi, *retvi; | ||
805 | |||
806 | vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); | ||
807 | if (!vi) | ||
808 | return ERR_PTR(-ENOMEM); | ||
809 | |||
810 | mutex_lock(&mapping->i_mmap_mutex); | ||
811 | retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); | ||
812 | mutex_unlock(&mapping->i_mmap_mutex); | ||
813 | |||
814 | if (!retvi) | ||
815 | kfree(vi); | ||
816 | |||
817 | return retvi; | ||
818 | } | ||
819 | |||
820 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | ||
821 | { | ||
822 | struct list_head try_list; | ||
823 | struct vm_area_struct *vma; | ||
824 | struct address_space *mapping; | ||
825 | struct vma_info *vi, *tmpvi; | ||
826 | struct mm_struct *mm; | ||
827 | loff_t vaddr; | ||
828 | int ret; | ||
829 | |||
830 | mapping = uprobe->inode->i_mapping; | ||
831 | INIT_LIST_HEAD(&try_list); | ||
832 | |||
833 | ret = 0; | ||
834 | |||
835 | for (;;) { | ||
836 | vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); | ||
837 | if (!vi) | ||
838 | break; | ||
839 | |||
840 | if (IS_ERR(vi)) { | ||
841 | ret = PTR_ERR(vi); | ||
842 | break; | ||
843 | } | ||
844 | |||
845 | mm = vi->mm; | ||
846 | down_read(&mm->mmap_sem); | ||
847 | vma = find_vma(mm, (unsigned long)vi->vaddr); | ||
848 | if (!vma || !valid_vma(vma, is_register)) { | ||
849 | list_del(&vi->probe_list); | ||
850 | kfree(vi); | ||
851 | up_read(&mm->mmap_sem); | ||
852 | mmput(mm); | ||
853 | continue; | ||
854 | } | ||
855 | vaddr = vma_address(vma, uprobe->offset); | ||
856 | if (vma->vm_file->f_mapping->host != uprobe->inode || | ||
857 | vaddr != vi->vaddr) { | ||
858 | list_del(&vi->probe_list); | ||
859 | kfree(vi); | ||
860 | up_read(&mm->mmap_sem); | ||
861 | mmput(mm); | ||
862 | continue; | ||
863 | } | ||
864 | |||
865 | if (is_register) | ||
866 | ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); | ||
867 | else | ||
868 | remove_breakpoint(uprobe, mm, vi->vaddr); | ||
869 | |||
870 | up_read(&mm->mmap_sem); | ||
871 | mmput(mm); | ||
872 | if (is_register) { | ||
873 | if (ret && ret == -EEXIST) | ||
874 | ret = 0; | ||
875 | if (ret) | ||
876 | break; | ||
877 | } | ||
878 | } | ||
879 | |||
880 | list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { | ||
881 | list_del(&vi->probe_list); | ||
882 | kfree(vi); | ||
883 | } | ||
884 | |||
885 | return ret; | ||
886 | } | ||
887 | |||
888 | static int __uprobe_register(struct uprobe *uprobe) | ||
889 | { | ||
890 | return register_for_each_vma(uprobe, true); | ||
891 | } | ||
892 | |||
893 | static void __uprobe_unregister(struct uprobe *uprobe) | ||
894 | { | ||
895 | if (!register_for_each_vma(uprobe, false)) | ||
896 | delete_uprobe(uprobe); | ||
897 | |||
898 | /* TODO: can't unregister? schedule a worker thread. */ | ||
899 | } | ||
900 | |||
901 | /* | ||
902 | * uprobe_register - register a probe | ||
903 | * @inode: the file in which the probe has to be placed. | ||
904 | * @offset: offset from the start of the file. | ||
905 | * @uc: information on how to handle the probe. | ||
906 | * | ||
907 | * Apart from the access refcount, uprobe_register() takes a creation | ||
908 | * refcount (through alloc_uprobe) if and only if this @uprobe is getting | ||
909 | * inserted into the rbtree (i.e. the first consumer for an @inode:@offset | ||
910 | * tuple). The creation refcount stops uprobe_unregister from freeing the | ||
911 | * @uprobe even before the register operation is complete. The creation | ||
912 | * refcount is released when the last @uc for the @uprobe | ||
913 | * unregisters. | ||
914 | * | ||
915 | * Return an errno if it cannot successfully install probes, | ||
916 | * else return 0 (success). | ||
917 | */ | ||
918 | int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) | ||
919 | { | ||
920 | struct uprobe *uprobe; | ||
921 | int ret; | ||
922 | |||
923 | if (!inode || !uc || uc->next) | ||
924 | return -EINVAL; | ||
925 | |||
926 | if (offset > i_size_read(inode)) | ||
927 | return -EINVAL; | ||
928 | |||
929 | ret = 0; | ||
930 | mutex_lock(uprobes_hash(inode)); | ||
931 | uprobe = alloc_uprobe(inode, offset); | ||
932 | |||
933 | if (uprobe && !consumer_add(uprobe, uc)) { | ||
934 | ret = __uprobe_register(uprobe); | ||
935 | if (ret) { | ||
936 | uprobe->consumers = NULL; | ||
937 | __uprobe_unregister(uprobe); | ||
938 | } else { | ||
939 | uprobe->flags |= UPROBE_RUN_HANDLER; | ||
940 | } | ||
941 | } | ||
942 | |||
943 | mutex_unlock(uprobes_hash(inode)); | ||
944 | put_uprobe(uprobe); | ||
945 | |||
946 | return ret; | ||
947 | } | ||
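For readers new to the API, a minimal consumer might look roughly like the sketch below. It is illustrative only and not part of the patch: the handler prototype follows struct uprobe_consumer from <linux/uprobes.h>, while my_inode and my_offset stand in for values the caller must already hold.

	static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
	{
		pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
		return 0;
	}

	static struct uprobe_consumer my_consumer = {
		.handler = my_handler,
	};

	/* arm: my_inode/my_offset identify the probed instruction in the file */
	err = uprobe_register(my_inode, my_offset, &my_consumer);

	/* ... later, disarm with the same (inode, offset, consumer) triple ... */
	uprobe_unregister(my_inode, my_offset, &my_consumer);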
948 | |||
949 | /* | ||
950 | * uprobe_unregister - unregister an already registered probe. | ||
951 | * @inode: the file from which the probe has to be removed. | ||
952 | * @offset: offset from the start of the file. | ||
953 | * @uc: identifies which probe to remove if multiple probes are colocated. | ||
954 | */ | ||
955 | void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) | ||
956 | { | ||
957 | struct uprobe *uprobe; | ||
958 | |||
959 | if (!inode || !uc) | ||
960 | return; | ||
961 | |||
962 | uprobe = find_uprobe(inode, offset); | ||
963 | if (!uprobe) | ||
964 | return; | ||
965 | |||
966 | mutex_lock(uprobes_hash(inode)); | ||
967 | |||
968 | if (consumer_del(uprobe, uc)) { | ||
969 | if (!uprobe->consumers) { | ||
970 | __uprobe_unregister(uprobe); | ||
971 | uprobe->flags &= ~UPROBE_RUN_HANDLER; | ||
972 | } | ||
973 | } | ||
974 | |||
975 | mutex_unlock(uprobes_hash(inode)); | ||
976 | if (uprobe) | ||
977 | put_uprobe(uprobe); | ||
978 | } | ||
979 | |||
980 | /* | ||
981 | * Of all the nodes that correspond to the given inode, return the node | ||
982 | * with the least offset. | ||
983 | */ | ||
984 | static struct rb_node *find_least_offset_node(struct inode *inode) | ||
985 | { | ||
986 | struct uprobe u = { .inode = inode, .offset = 0}; | ||
987 | struct rb_node *n = uprobes_tree.rb_node; | ||
988 | struct rb_node *close_node = NULL; | ||
989 | struct uprobe *uprobe; | ||
990 | int match; | ||
991 | |||
992 | while (n) { | ||
993 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
994 | match = match_uprobe(&u, uprobe); | ||
995 | |||
996 | if (uprobe->inode == inode) | ||
997 | close_node = n; | ||
998 | |||
999 | if (!match) | ||
1000 | return close_node; | ||
1001 | |||
1002 | if (match < 0) | ||
1003 | n = n->rb_left; | ||
1004 | else | ||
1005 | n = n->rb_right; | ||
1006 | } | ||
1007 | |||
1008 | return close_node; | ||
1009 | } | ||
1010 | |||
1011 | /* | ||
1012 | * For a given inode, build a list of probes that need to be inserted. | ||
1013 | */ | ||
1014 | static void build_probe_list(struct inode *inode, struct list_head *head) | ||
1015 | { | ||
1016 | struct uprobe *uprobe; | ||
1017 | unsigned long flags; | ||
1018 | struct rb_node *n; | ||
1019 | |||
1020 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
1021 | |||
1022 | n = find_least_offset_node(inode); | ||
1023 | |||
1024 | for (; n; n = rb_next(n)) { | ||
1025 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
1026 | if (uprobe->inode != inode) | ||
1027 | break; | ||
1028 | |||
1029 | list_add(&uprobe->pending_list, head); | ||
1030 | atomic_inc(&uprobe->ref); | ||
1031 | } | ||
1032 | |||
1033 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
1034 | } | ||
1035 | |||
1036 | /* | ||
1037 | * Called from mmap_region() and dup_mmap(), | ||
1038 | * with mm->mmap_sem acquired. | ||
1039 | * | ||
1040 | * Return a negative errno if we fail to insert probes and cannot | ||
1041 | * bail out. | ||
1042 | * Return 0 otherwise, i.e. on: | ||
1043 | * | ||
1044 | * - successful insertion of probes, | ||
1045 | * - (or) no probes to insert, | ||
1046 | * - (or) failed insertion of probes from which we can bail out. | ||
1047 | */ | ||
1048 | int uprobe_mmap(struct vm_area_struct *vma) | ||
1049 | { | ||
1050 | struct list_head tmp_list; | ||
1051 | struct uprobe *uprobe, *u; | ||
1052 | struct inode *inode; | ||
1053 | int ret, count; | ||
1054 | |||
1055 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) | ||
1056 | return 0; | ||
1057 | |||
1058 | inode = vma->vm_file->f_mapping->host; | ||
1059 | if (!inode) | ||
1060 | return 0; | ||
1061 | |||
1062 | INIT_LIST_HEAD(&tmp_list); | ||
1063 | mutex_lock(uprobes_mmap_hash(inode)); | ||
1064 | build_probe_list(inode, &tmp_list); | ||
1065 | |||
1066 | ret = 0; | ||
1067 | count = 0; | ||
1068 | |||
1069 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
1070 | loff_t vaddr; | ||
1071 | |||
1072 | list_del(&uprobe->pending_list); | ||
1073 | if (!ret) { | ||
1074 | vaddr = vma_address(vma, uprobe->offset); | ||
1075 | |||
1076 | if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { | ||
1077 | put_uprobe(uprobe); | ||
1078 | continue; | ||
1079 | } | ||
1080 | |||
1081 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | ||
1082 | |||
1083 | /* Ignore double add: */ | ||
1084 | if (ret == -EEXIST) { | ||
1085 | ret = 0; | ||
1086 | |||
1087 | if (!is_swbp_at_addr(vma->vm_mm, vaddr)) | ||
1088 | continue; | ||
1089 | |||
1090 | /* | ||
1091 | * Unable to insert a breakpoint, but | ||
1092 | * breakpoint lies underneath. Increment the | ||
1093 | * probe count. | ||
1094 | */ | ||
1095 | atomic_inc(&vma->vm_mm->uprobes_state.count); | ||
1096 | } | ||
1097 | |||
1098 | if (!ret) | ||
1099 | count++; | ||
1100 | } | ||
1101 | put_uprobe(uprobe); | ||
1102 | } | ||
1103 | |||
1104 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
1105 | |||
1106 | if (ret) | ||
1107 | atomic_sub(count, &vma->vm_mm->uprobes_state.count); | ||
1108 | |||
1109 | return ret; | ||
1110 | } | ||
1111 | |||
1112 | /* | ||
1113 | * Called in context of a munmap of a vma. | ||
1114 | */ | ||
1115 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) | ||
1116 | { | ||
1117 | struct list_head tmp_list; | ||
1118 | struct uprobe *uprobe, *u; | ||
1119 | struct inode *inode; | ||
1120 | |||
1121 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | ||
1122 | return; | ||
1123 | |||
1124 | if (!atomic_read(&vma->vm_mm->uprobes_state.count)) | ||
1125 | return; | ||
1126 | |||
1127 | inode = vma->vm_file->f_mapping->host; | ||
1128 | if (!inode) | ||
1129 | return; | ||
1130 | |||
1131 | INIT_LIST_HEAD(&tmp_list); | ||
1132 | mutex_lock(uprobes_mmap_hash(inode)); | ||
1133 | build_probe_list(inode, &tmp_list); | ||
1134 | |||
1135 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
1136 | loff_t vaddr; | ||
1137 | |||
1138 | list_del(&uprobe->pending_list); | ||
1139 | vaddr = vma_address(vma, uprobe->offset); | ||
1140 | |||
1141 | if (vaddr >= start && vaddr < end) { | ||
1142 | /* | ||
1143 | * An unregister could have removed the probe before | ||
1144 | * unmap. So check before we decrement the count. | ||
1145 | */ | ||
1146 | if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) | ||
1147 | atomic_dec(&vma->vm_mm->uprobes_state.count); | ||
1148 | } | ||
1149 | put_uprobe(uprobe); | ||
1150 | } | ||
1151 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
1152 | } | ||
1153 | |||
1154 | /* Slot allocation for XOL */ | ||
1155 | static int xol_add_vma(struct xol_area *area) | ||
1156 | { | ||
1157 | struct mm_struct *mm; | ||
1158 | int ret; | ||
1159 | |||
1160 | area->page = alloc_page(GFP_HIGHUSER); | ||
1161 | if (!area->page) | ||
1162 | return -ENOMEM; | ||
1163 | |||
1164 | ret = -EALREADY; | ||
1165 | mm = current->mm; | ||
1166 | |||
1167 | down_write(&mm->mmap_sem); | ||
1168 | if (mm->uprobes_state.xol_area) | ||
1169 | goto fail; | ||
1170 | |||
1171 | ret = -ENOMEM; | ||
1172 | |||
1173 | /* Try to map as high as possible; this is only a hint. */ | ||
1174 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | ||
1175 | if (area->vaddr & ~PAGE_MASK) { | ||
1176 | ret = area->vaddr; | ||
1177 | goto fail; | ||
1178 | } | ||
1179 | |||
1180 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | ||
1181 | VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); | ||
1182 | if (ret) | ||
1183 | goto fail; | ||
1184 | |||
1185 | smp_wmb(); /* pairs with get_xol_area() */ | ||
1186 | mm->uprobes_state.xol_area = area; | ||
1187 | ret = 0; | ||
1188 | |||
1189 | fail: | ||
1190 | up_write(&mm->mmap_sem); | ||
1191 | if (ret) | ||
1192 | __free_page(area->page); | ||
1193 | |||
1194 | return ret; | ||
1195 | } | ||
1196 | |||
1197 | static struct xol_area *get_xol_area(struct mm_struct *mm) | ||
1198 | { | ||
1199 | struct xol_area *area; | ||
1200 | |||
1201 | area = mm->uprobes_state.xol_area; | ||
1202 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
1203 | |||
1204 | return area; | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * xol_alloc_area - Allocate process's xol_area. | ||
1209 | * This area will be used for storing instructions for execution out of | ||
1210 | * line. | ||
1211 | * | ||
1212 | * Returns the allocated area or NULL. | ||
1213 | */ | ||
1214 | static struct xol_area *xol_alloc_area(void) | ||
1215 | { | ||
1216 | struct xol_area *area; | ||
1217 | |||
1218 | area = kzalloc(sizeof(*area), GFP_KERNEL); | ||
1219 | if (unlikely(!area)) | ||
1220 | return NULL; | ||
1221 | |||
1222 | area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); | ||
1223 | |||
1224 | if (!area->bitmap) | ||
1225 | goto fail; | ||
1226 | |||
1227 | init_waitqueue_head(&area->wq); | ||
1228 | if (!xol_add_vma(area)) | ||
1229 | return area; | ||
1230 | |||
1231 | fail: | ||
1232 | kfree(area->bitmap); | ||
1233 | kfree(area); | ||
1234 | |||
1235 | return get_xol_area(current->mm); | ||
1236 | } | ||
1237 | |||
1238 | /* | ||
1239 | * uprobe_clear_state - Free the area allocated for slots. | ||
1240 | */ | ||
1241 | void uprobe_clear_state(struct mm_struct *mm) | ||
1242 | { | ||
1243 | struct xol_area *area = mm->uprobes_state.xol_area; | ||
1244 | |||
1245 | if (!area) | ||
1246 | return; | ||
1247 | |||
1248 | put_page(area->page); | ||
1249 | kfree(area->bitmap); | ||
1250 | kfree(area); | ||
1251 | } | ||
1252 | |||
1253 | /* | ||
1254 | * uprobe_reset_state - Free the area allocated for slots. | ||
1255 | */ | ||
1256 | void uprobe_reset_state(struct mm_struct *mm) | ||
1257 | { | ||
1258 | mm->uprobes_state.xol_area = NULL; | ||
1259 | atomic_set(&mm->uprobes_state.count, 0); | ||
1260 | } | ||
1261 | |||
1262 | /* | ||
1263 | * - search for a free slot. | ||
1264 | */ | ||
1265 | static unsigned long xol_take_insn_slot(struct xol_area *area) | ||
1266 | { | ||
1267 | unsigned long slot_addr; | ||
1268 | int slot_nr; | ||
1269 | |||
1270 | do { | ||
1271 | slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); | ||
1272 | if (slot_nr < UINSNS_PER_PAGE) { | ||
1273 | if (!test_and_set_bit(slot_nr, area->bitmap)) | ||
1274 | break; | ||
1275 | |||
1276 | slot_nr = UINSNS_PER_PAGE; | ||
1277 | continue; | ||
1278 | } | ||
1279 | wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); | ||
1280 | } while (slot_nr >= UINSNS_PER_PAGE); | ||
1281 | |||
1282 | slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); | ||
1283 | atomic_inc(&area->slot_count); | ||
1284 | |||
1285 | return slot_addr; | ||
1286 | } | ||
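To make the slot arithmetic concrete, assume a hypothetical area->vaddr of 0x7f0000000000 and a slot size (UPROBE_XOL_SLOT_BYTES) of 128 bytes; the exact constant is architecture/configuration dependent, so treat the numbers as an example only:

	/* slot_nr == 5 in that area: */
	slot_addr = 0x7f0000000000UL + (5 * 128);	/* == 0x7f0000000280 */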
1287 | |||
1288 | /* | ||
1289 | * xol_get_insn_slot - allocate an instruction slot from the per-mm XOL area | ||
1290 | * (creating the area if needed) and copy the probed instruction into it. | ||
1291 | * Returns the allocated slot address or 0. | ||
1292 | */ | ||
1293 | static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) | ||
1294 | { | ||
1295 | struct xol_area *area; | ||
1296 | unsigned long offset; | ||
1297 | void *vaddr; | ||
1298 | |||
1299 | area = get_xol_area(current->mm); | ||
1300 | if (!area) { | ||
1301 | area = xol_alloc_area(); | ||
1302 | if (!area) | ||
1303 | return 0; | ||
1304 | } | ||
1305 | current->utask->xol_vaddr = xol_take_insn_slot(area); | ||
1306 | |||
1307 | /* | ||
1308 | * Initialize the slot if xol_vaddr points to valid | ||
1309 | * instruction slot. | ||
1310 | */ | ||
1311 | if (unlikely(!current->utask->xol_vaddr)) | ||
1312 | return 0; | ||
1313 | |||
1314 | current->utask->vaddr = slot_addr; | ||
1315 | offset = current->utask->xol_vaddr & ~PAGE_MASK; | ||
1316 | vaddr = kmap_atomic(area->page); | ||
1317 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | ||
1318 | kunmap_atomic(vaddr); | ||
1319 | |||
1320 | return current->utask->xol_vaddr; | ||
1321 | } | ||
1322 | |||
1323 | /* | ||
1324 | * xol_free_insn_slot - If a slot was earlier allocated by | ||
1325 | * xol_get_insn_slot(), make the slot available for | ||
1326 | * subsequent requests. | ||
1327 | */ | ||
1328 | static void xol_free_insn_slot(struct task_struct *tsk) | ||
1329 | { | ||
1330 | struct xol_area *area; | ||
1331 | unsigned long vma_end; | ||
1332 | unsigned long slot_addr; | ||
1333 | |||
1334 | if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) | ||
1335 | return; | ||
1336 | |||
1337 | slot_addr = tsk->utask->xol_vaddr; | ||
1338 | |||
1339 | if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) | ||
1340 | return; | ||
1341 | |||
1342 | area = tsk->mm->uprobes_state.xol_area; | ||
1343 | vma_end = area->vaddr + PAGE_SIZE; | ||
1344 | if (area->vaddr <= slot_addr && slot_addr < vma_end) { | ||
1345 | unsigned long offset; | ||
1346 | int slot_nr; | ||
1347 | |||
1348 | offset = slot_addr - area->vaddr; | ||
1349 | slot_nr = offset / UPROBE_XOL_SLOT_BYTES; | ||
1350 | if (slot_nr >= UINSNS_PER_PAGE) | ||
1351 | return; | ||
1352 | |||
1353 | clear_bit(slot_nr, area->bitmap); | ||
1354 | atomic_dec(&area->slot_count); | ||
1355 | if (waitqueue_active(&area->wq)) | ||
1356 | wake_up(&area->wq); | ||
1357 | |||
1358 | tsk->utask->xol_vaddr = 0; | ||
1359 | } | ||
1360 | } | ||
1361 | |||
1362 | /** | ||
1363 | * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs | ||
1364 | * @regs: Reflects the saved state of the task after it has hit a breakpoint | ||
1365 | * instruction. | ||
1366 | * Return the address of the breakpoint instruction. | ||
1367 | */ | ||
1368 | unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) | ||
1369 | { | ||
1370 | return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; | ||
1371 | } | ||
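A quick worked example, assuming x86 where the software breakpoint is the one-byte int3 (UPROBE_SWBP_INSN_SIZE == 1): if the trap leaves regs->ip at 0x400101, the probed address reported here is 0x400100. Architectures with larger breakpoint instructions override this weak default accordingly.

	bp_vaddr = instruction_pointer(regs) - 1;	/* e.g. 0x400101 - 1 == 0x400100 */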
1372 | |||
1373 | /* | ||
1374 | * Called with no locks held. | ||
1375 | * Called in context of an exiting or an exec-ing thread. | ||
1376 | */ | ||
1377 | void uprobe_free_utask(struct task_struct *t) | ||
1378 | { | ||
1379 | struct uprobe_task *utask = t->utask; | ||
1380 | |||
1381 | if (t->uprobe_srcu_id != -1) | ||
1382 | srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); | ||
1383 | |||
1384 | if (!utask) | ||
1385 | return; | ||
1386 | |||
1387 | if (utask->active_uprobe) | ||
1388 | put_uprobe(utask->active_uprobe); | ||
1389 | |||
1390 | xol_free_insn_slot(t); | ||
1391 | kfree(utask); | ||
1392 | t->utask = NULL; | ||
1393 | } | ||
1394 | |||
1395 | /* | ||
1396 | * Called in context of a new clone/fork from copy_process. | ||
1397 | */ | ||
1398 | void uprobe_copy_process(struct task_struct *t) | ||
1399 | { | ||
1400 | t->utask = NULL; | ||
1401 | t->uprobe_srcu_id = -1; | ||
1402 | } | ||
1403 | |||
1404 | /* | ||
1405 | * Allocate a uprobe_task object for the task. | ||
1406 | * Called when the thread hits a breakpoint for the first time. | ||
1407 | * | ||
1408 | * Returns: | ||
1409 | * - pointer to new uprobe_task on success | ||
1410 | * - NULL otherwise | ||
1411 | */ | ||
1412 | static struct uprobe_task *add_utask(void) | ||
1413 | { | ||
1414 | struct uprobe_task *utask; | ||
1415 | |||
1416 | utask = kzalloc(sizeof *utask, GFP_KERNEL); | ||
1417 | if (unlikely(!utask)) | ||
1418 | return NULL; | ||
1419 | |||
1420 | utask->active_uprobe = NULL; | ||
1421 | current->utask = utask; | ||
1422 | return utask; | ||
1423 | } | ||
1424 | |||
1425 | /* Prepare to single-step probed instruction out of line. */ | ||
1426 | static int | ||
1427 | pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) | ||
1428 | { | ||
1429 | if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) | ||
1430 | return 0; | ||
1431 | |||
1432 | return -EFAULT; | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * If we are singlestepping, then ensure this thread is not connected to | ||
1437 | * non-fatal signals until completion of singlestep. When the xol insn itself | ||
1438 | * triggers a signal, restart the original insn even if the task is | ||
1439 | * already SIGKILL'ed (since the coredump should report the correct ip). This | ||
1440 | * is even more important if the task has a handler for SIGSEGV/etc: the | ||
1441 | * _same_ instruction should be repeated again after return from the signal | ||
1442 | * handler, and SSTEP can never finish in this case. | ||
1443 | */ | ||
1444 | bool uprobe_deny_signal(void) | ||
1445 | { | ||
1446 | struct task_struct *t = current; | ||
1447 | struct uprobe_task *utask = t->utask; | ||
1448 | |||
1449 | if (likely(!utask || !utask->active_uprobe)) | ||
1450 | return false; | ||
1451 | |||
1452 | WARN_ON_ONCE(utask->state != UTASK_SSTEP); | ||
1453 | |||
1454 | if (signal_pending(t)) { | ||
1455 | spin_lock_irq(&t->sighand->siglock); | ||
1456 | clear_tsk_thread_flag(t, TIF_SIGPENDING); | ||
1457 | spin_unlock_irq(&t->sighand->siglock); | ||
1458 | |||
1459 | if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { | ||
1460 | utask->state = UTASK_SSTEP_TRAPPED; | ||
1461 | set_tsk_thread_flag(t, TIF_UPROBE); | ||
1462 | set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); | ||
1463 | } | ||
1464 | } | ||
1465 | |||
1466 | return true; | ||
1467 | } | ||
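For context, the signal-delivery path consults this helper early on; the call site added elsewhere in this series looks roughly like the following paraphrased sketch (see kernel/signal.c for the authoritative code):

	/* in get_signal_to_deliver(), paraphrased: */
	if (unlikely(uprobe_deny_signal()))
		return 0;	/* signal stays pending until the singlestep completes */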
1468 | |||
1469 | /* | ||
1470 | * Avoid singlestepping the original instruction if the original instruction | ||
1471 | * is a NOP or can be emulated. | ||
1472 | */ | ||
1473 | static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | ||
1474 | { | ||
1475 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) | ||
1476 | return true; | ||
1477 | |||
1478 | uprobe->flags &= ~UPROBE_SKIP_SSTEP; | ||
1479 | return false; | ||
1480 | } | ||
1481 | |||
1482 | /* | ||
1483 | * Run handler and ask thread to singlestep. | ||
1484 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | ||
1485 | */ | ||
1486 | static void handle_swbp(struct pt_regs *regs) | ||
1487 | { | ||
1488 | struct vm_area_struct *vma; | ||
1489 | struct uprobe_task *utask; | ||
1490 | struct uprobe *uprobe; | ||
1491 | struct mm_struct *mm; | ||
1492 | unsigned long bp_vaddr; | ||
1493 | |||
1494 | uprobe = NULL; | ||
1495 | bp_vaddr = uprobe_get_swbp_addr(regs); | ||
1496 | mm = current->mm; | ||
1497 | down_read(&mm->mmap_sem); | ||
1498 | vma = find_vma(mm, bp_vaddr); | ||
1499 | |||
1500 | if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { | ||
1501 | struct inode *inode; | ||
1502 | loff_t offset; | ||
1503 | |||
1504 | inode = vma->vm_file->f_mapping->host; | ||
1505 | offset = bp_vaddr - vma->vm_start; | ||
1506 | offset += (vma->vm_pgoff << PAGE_SHIFT); | ||
1507 | uprobe = find_uprobe(inode, offset); | ||
1508 | } | ||
1509 | |||
1510 | srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); | ||
1511 | current->uprobe_srcu_id = -1; | ||
1512 | up_read(&mm->mmap_sem); | ||
1513 | |||
1514 | if (!uprobe) { | ||
1515 | /* No matching uprobe; signal SIGTRAP. */ | ||
1516 | send_sig(SIGTRAP, current, 0); | ||
1517 | return; | ||
1518 | } | ||
1519 | |||
1520 | utask = current->utask; | ||
1521 | if (!utask) { | ||
1522 | utask = add_utask(); | ||
1523 | /* Cannot allocate; re-execute the instruction. */ | ||
1524 | if (!utask) | ||
1525 | goto cleanup_ret; | ||
1526 | } | ||
1527 | utask->active_uprobe = uprobe; | ||
1528 | handler_chain(uprobe, regs); | ||
1529 | if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) | ||
1530 | goto cleanup_ret; | ||
1531 | |||
1532 | utask->state = UTASK_SSTEP; | ||
1533 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | ||
1534 | user_enable_single_step(current); | ||
1535 | return; | ||
1536 | } | ||
1537 | |||
1538 | cleanup_ret: | ||
1539 | if (utask) { | ||
1540 | utask->active_uprobe = NULL; | ||
1541 | utask->state = UTASK_RUNNING; | ||
1542 | } | ||
1543 | if (uprobe) { | ||
1544 | if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) | ||
1545 | |||
1546 | /* | ||
1547 | * cannot singlestep; cannot skip instruction; | ||
1548 | * re-execute the instruction. | ||
1549 | */ | ||
1550 | instruction_pointer_set(regs, bp_vaddr); | ||
1551 | |||
1552 | put_uprobe(uprobe); | ||
1553 | } | ||
1554 | } | ||
1555 | |||
1556 | /* | ||
1557 | * Perform required fix-ups and disable singlestep. | ||
1558 | * Allow pending signals to take effect. | ||
1559 | */ | ||
1560 | static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | ||
1561 | { | ||
1562 | struct uprobe *uprobe; | ||
1563 | |||
1564 | uprobe = utask->active_uprobe; | ||
1565 | if (utask->state == UTASK_SSTEP_ACK) | ||
1566 | arch_uprobe_post_xol(&uprobe->arch, regs); | ||
1567 | else if (utask->state == UTASK_SSTEP_TRAPPED) | ||
1568 | arch_uprobe_abort_xol(&uprobe->arch, regs); | ||
1569 | else | ||
1570 | WARN_ON_ONCE(1); | ||
1571 | |||
1572 | put_uprobe(uprobe); | ||
1573 | utask->active_uprobe = NULL; | ||
1574 | utask->state = UTASK_RUNNING; | ||
1575 | user_disable_single_step(current); | ||
1576 | xol_free_insn_slot(current); | ||
1577 | |||
1578 | spin_lock_irq(¤t->sighand->siglock); | ||
1579 | recalc_sigpending(); /* see uprobe_deny_signal() */ | ||
1580 | spin_unlock_irq(¤t->sighand->siglock); | ||
1581 | } | ||
1582 | |||
1583 | /* | ||
1584 | * On a breakpoint hit, the breakpoint notifier sets the TIF_UPROBE flag (and, | ||
1585 | * on subsequent probe hits on the thread, sets the state to UTASK_BP_HIT) and | ||
1586 | * allows the thread to return from the interrupt. | ||
1587 | * | ||
1588 | * On a singlestep exception, the singlestep notifier sets the TIF_UPROBE flag, | ||
1589 | * sets the state to UTASK_SSTEP_ACK and allows the thread to return from the | ||
1590 | * interrupt. | ||
1591 | * | ||
1592 | * While returning to userspace, the thread notices the TIF_UPROBE flag and | ||
1593 | * calls uprobe_notify_resume(). | ||
1594 | */ | ||
1595 | void uprobe_notify_resume(struct pt_regs *regs) | ||
1596 | { | ||
1597 | struct uprobe_task *utask; | ||
1598 | |||
1599 | utask = current->utask; | ||
1600 | if (!utask || utask->state == UTASK_BP_HIT) | ||
1601 | handle_swbp(regs); | ||
1602 | else | ||
1603 | handle_singlestep(utask, regs); | ||
1604 | } | ||
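The architecture code is expected to call this from its return-to-userspace path. On x86, for instance, do_notify_resume() does roughly the following (paraphrased sketch, not part of this file):

	if (thread_info_flags & _TIF_UPROBE) {
		clear_thread_flag(TIF_UPROBE);
		uprobe_notify_resume(regs);
	}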
1605 | |||
1606 | /* | ||
1607 | * uprobe_pre_sstep_notifier gets called from interrupt context as part of the | ||
1608 | * notifier mechanism. It sets the TIF_UPROBE flag and indicates a breakpoint hit. | ||
1609 | */ | ||
1610 | int uprobe_pre_sstep_notifier(struct pt_regs *regs) | ||
1611 | { | ||
1612 | struct uprobe_task *utask; | ||
1613 | |||
1614 | if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) | ||
1615 | /* task is currently not uprobed */ | ||
1616 | return 0; | ||
1617 | |||
1618 | utask = current->utask; | ||
1619 | if (utask) | ||
1620 | utask->state = UTASK_BP_HIT; | ||
1621 | |||
1622 | set_thread_flag(TIF_UPROBE); | ||
1623 | current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); | ||
1624 | |||
1625 | return 1; | ||
1626 | } | ||
1627 | |||
1628 | /* | ||
1629 | * uprobe_post_sstep_notifier gets called in interrupt context as part of the | ||
1630 | * notifier mechanism. It sets the TIF_UPROBE flag and indicates completion of singlestep. | ||
1631 | */ | ||
1632 | int uprobe_post_sstep_notifier(struct pt_regs *regs) | ||
1633 | { | ||
1634 | struct uprobe_task *utask = current->utask; | ||
1635 | |||
1636 | if (!current->mm || !utask || !utask->active_uprobe) | ||
1637 | /* task is currently not uprobed */ | ||
1638 | return 0; | ||
1639 | |||
1640 | utask->state = UTASK_SSTEP_ACK; | ||
1641 | set_thread_flag(TIF_UPROBE); | ||
1642 | return 1; | ||
1643 | } | ||
1644 | |||
1645 | static struct notifier_block uprobe_exception_nb = { | ||
1646 | .notifier_call = arch_uprobe_exception_notify, | ||
1647 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ | ||
1648 | }; | ||
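arch_uprobe_exception_notify() is supplied per architecture; the x86 version, roughly, routes die notifications to the two notifier hooks above (paraphrased sketch):

	switch (val) {
	case DIE_INT3:				/* breakpoint trap */
		if (uprobe_pre_sstep_notifier(regs))
			return NOTIFY_STOP;
		break;
	case DIE_DEBUG:				/* singlestep trap */
		if (uprobe_post_sstep_notifier(regs))
			return NOTIFY_STOP;
		break;
	}
	return NOTIFY_DONE;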
1649 | |||
1650 | static int __init init_uprobes(void) | ||
1651 | { | ||
1652 | int i; | ||
1653 | |||
1654 | for (i = 0; i < UPROBES_HASH_SZ; i++) { | ||
1655 | mutex_init(&uprobes_mutex[i]); | ||
1656 | mutex_init(&uprobes_mmap_mutex[i]); | ||
1657 | } | ||
1658 | init_srcu_struct(&uprobes_srcu); | ||
1659 | |||
1660 | return register_die_notifier(&uprobe_exception_nb); | ||
1661 | } | ||
1662 | module_init(init_uprobes); | ||
1663 | |||
1664 | static void __exit exit_uprobes(void) | ||
1665 | { | ||
1666 | } | ||
1667 | module_exit(exit_uprobes); | ||
diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b425fa7..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | /* | ||
76 | * If we are the last child process in a pid namespace to be | ||
77 | * reaped, notify the reaper sleeping in zap_pid_ns_processes(). | ||
78 | */ | ||
79 | if (IS_ENABLED(CONFIG_PID_NS)) { | ||
80 | struct task_struct *parent = p->real_parent; | ||
81 | |||
82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | ||
83 | list_empty(&parent->children) && | ||
84 | (parent->flags & PF_EXITING)) | ||
85 | wake_up_process(parent); | ||
86 | } | ||
75 | } | 87 | } |
76 | list_del_rcu(&p->thread_group); | 88 | list_del_rcu(&p->thread_group); |
77 | } | 89 | } |
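The wake_up_process() added above pairs with a wait loop on the zap_pid_ns_processes() side. Simplified (and omitting the tasklist_lock protection of the real code), the namespace reaper ends up doing something like this sketch of the counterpart change in kernel/pid_namespace.c, which is not shown in this diff:

	/* after reaping all zombie children via sys_wait4(): */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (list_empty(&current->children))
			break;
		schedule();			/* woken by __unhash_process() */
	}
	__set_current_state(TASK_RUNNING);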
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk) | |||
643 | mm_release(tsk, mm); | 655 | mm_release(tsk, mm); |
644 | if (!mm) | 656 | if (!mm) |
645 | return; | 657 | return; |
658 | sync_mm_rss(mm); | ||
646 | /* | 659 | /* |
647 | * Serialize with any possible pending coredump. | 660 | * Serialize with any possible pending coredump. |
648 | * We must hold mmap_sem around checking core_state | 661 | * We must hold mmap_sem around checking core_state |
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
719 | 732 | ||
720 | zap_pid_ns_processes(pid_ns); | 733 | zap_pid_ns_processes(pid_ns); |
721 | write_lock_irq(&tasklist_lock); | 734 | write_lock_irq(&tasklist_lock); |
722 | /* | ||
723 | * We can not clear ->child_reaper or leave it alone. | ||
724 | * There may by stealth EXIT_DEAD tasks on ->children, | ||
725 | * forget_original_parent() must move them somewhere. | ||
726 | */ | ||
727 | pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
728 | } else if (father->signal->has_child_subreaper) { | 735 | } else if (father->signal->has_child_subreaper) { |
729 | struct task_struct *reaper; | 736 | struct task_struct *reaper; |
730 | 737 | ||
@@ -884,9 +891,9 @@ static void check_stack_usage(void) | |||
884 | 891 | ||
885 | spin_lock(&low_water_lock); | 892 | spin_lock(&low_water_lock); |
886 | if (free < lowest_to_date) { | 893 | if (free < lowest_to_date) { |
887 | printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " | 894 | printk(KERN_WARNING "%s (%d) used greatest stack depth: " |
888 | "left\n", | 895 | "%lu bytes left\n", |
889 | current->comm, free); | 896 | current->comm, task_pid_nr(current), free); |
890 | lowest_to_date = free; | 897 | lowest_to_date = free; |
891 | } | 898 | } |
892 | spin_unlock(&low_water_lock); | 899 | spin_unlock(&low_water_lock); |
@@ -946,12 +953,13 @@ void do_exit(long code) | |||
946 | exit_signals(tsk); /* sets PF_EXITING */ | 953 | exit_signals(tsk); /* sets PF_EXITING */ |
947 | /* | 954 | /* |
948 | * tsk->flags are checked in the futex code to protect against | 955 | * tsk->flags are checked in the futex code to protect against |
949 | * an exiting task cleaning up the robust pi futexes. | 956 | * an exiting task cleaning up the robust pi futexes, and in |
957 | * task_work_add() to avoid the race with exit_task_work(). | ||
950 | */ | 958 | */ |
951 | smp_mb(); | 959 | smp_mb(); |
952 | raw_spin_unlock_wait(&tsk->pi_lock); | 960 | raw_spin_unlock_wait(&tsk->pi_lock); |
953 | 961 | ||
954 | exit_irq_thread(); | 962 | exit_task_work(tsk); |
955 | 963 | ||
956 | if (unlikely(in_atomic())) | 964 | if (unlikely(in_atomic())) |
957 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 965 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
@@ -1214,7 +1222,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1214 | unsigned long state; | 1222 | unsigned long state; |
1215 | int retval, status, traced; | 1223 | int retval, status, traced; |
1216 | pid_t pid = task_pid_vnr(p); | 1224 | pid_t pid = task_pid_vnr(p); |
1217 | uid_t uid = __task_cred(p)->uid; | 1225 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1218 | struct siginfo __user *infop; | 1226 | struct siginfo __user *infop; |
1219 | 1227 | ||
1220 | if (!likely(wo->wo_flags & WEXITED)) | 1228 | if (!likely(wo->wo_flags & WEXITED)) |
@@ -1427,7 +1435,7 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1427 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1435 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1428 | *p_code = 0; | 1436 | *p_code = 0; |
1429 | 1437 | ||
1430 | uid = task_uid(p); | 1438 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1431 | unlock_sig: | 1439 | unlock_sig: |
1432 | spin_unlock_irq(&p->sighand->siglock); | 1440 | spin_unlock_irq(&p->sighand->siglock); |
1433 | if (!exit_code) | 1441 | if (!exit_code) |
@@ -1500,7 +1508,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
1500 | } | 1508 | } |
1501 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1509 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1502 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1510 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1503 | uid = task_uid(p); | 1511 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1504 | spin_unlock_irq(&p->sighand->siglock); | 1512 | spin_unlock_irq(&p->sighand->siglock); |
1505 | 1513 | ||
1506 | pid = task_pid_vnr(p); | 1514 | pid = task_pid_vnr(p); |
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b8241..fe35a634bf76 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex); | |||
35 | extern struct exception_table_entry __start___ex_table[]; | 35 | extern struct exception_table_entry __start___ex_table[]; |
36 | extern struct exception_table_entry __stop___ex_table[]; | 36 | extern struct exception_table_entry __stop___ex_table[]; |
37 | 37 | ||
38 | /* Cleared by build-time tools if the table is already sorted. */ | ||
39 | u32 __initdata main_extable_sort_needed = 1; | ||
40 | |||
38 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
39 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
40 | { | 43 | { |
41 | sort_extable(__start___ex_table, __stop___ex_table); | 44 | if (main_extable_sort_needed) |
45 | sort_extable(__start___ex_table, __stop___ex_table); | ||
46 | else | ||
47 | pr_notice("__ex_table already sorted, skipping sort\n"); | ||
42 | } | 48 | } |
43 | 49 | ||
44 | /* Given an address, look for it in the exception tables. */ | 50 | /* Given an address, look for it in the exception tables. */ |
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/cgroup.h> | 34 | #include <linux/cgroup.h> |
35 | #include <linux/security.h> | 35 | #include <linux/security.h> |
36 | #include <linux/hugetlb.h> | 36 | #include <linux/hugetlb.h> |
37 | #include <linux/seccomp.h> | ||
37 | #include <linux/swap.h> | 38 | #include <linux/swap.h> |
38 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
39 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include <linux/audit.h> | 48 | #include <linux/audit.h> |
48 | #include <linux/memcontrol.h> | 49 | #include <linux/memcontrol.h> |
49 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/proc_fs.h> | ||
50 | #include <linux/profile.h> | 52 | #include <linux/profile.h> |
51 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
52 | #include <linux/ksm.h> | 54 | #include <linux/ksm.h> |
@@ -67,6 +69,7 @@ | |||
67 | #include <linux/oom.h> | 69 | #include <linux/oom.h> |
68 | #include <linux/khugepaged.h> | 70 | #include <linux/khugepaged.h> |
69 | #include <linux/signalfd.h> | 71 | #include <linux/signalfd.h> |
72 | #include <linux/uprobes.h> | ||
70 | 73 | ||
71 | #include <asm/pgtable.h> | 74 | #include <asm/pgtable.h> |
72 | #include <asm/pgalloc.h> | 75 | #include <asm/pgalloc.h> |
@@ -111,32 +114,67 @@ int nr_processes(void) | |||
111 | return total; | 114 | return total; |
112 | } | 115 | } |
113 | 116 | ||
114 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 117 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
115 | # define alloc_task_struct_node(node) \ | ||
116 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) | ||
117 | # define free_task_struct(tsk) \ | ||
118 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
119 | static struct kmem_cache *task_struct_cachep; | 118 | static struct kmem_cache *task_struct_cachep; |
119 | |||
120 | static inline struct task_struct *alloc_task_struct_node(int node) | ||
121 | { | ||
122 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); | ||
123 | } | ||
124 | |||
125 | void __weak arch_release_task_struct(struct task_struct *tsk) { } | ||
126 | |||
127 | static inline void free_task_struct(struct task_struct *tsk) | ||
128 | { | ||
129 | arch_release_task_struct(tsk); | ||
130 | kmem_cache_free(task_struct_cachep, tsk); | ||
131 | } | ||
120 | #endif | 132 | #endif |
121 | 133 | ||
122 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 134 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR |
135 | void __weak arch_release_thread_info(struct thread_info *ti) { } | ||
136 | |||
137 | /* | ||
138 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | ||
139 | * kmemcache based allocator. | ||
140 | */ | ||
141 | # if THREAD_SIZE >= PAGE_SIZE | ||
123 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 142 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
124 | int node) | 143 | int node) |
125 | { | 144 | { |
126 | #ifdef CONFIG_DEBUG_STACK_USAGE | 145 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
127 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 146 | THREAD_SIZE_ORDER); |
128 | #else | ||
129 | gfp_t mask = GFP_KERNEL; | ||
130 | #endif | ||
131 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); | ||
132 | 147 | ||
133 | return page ? page_address(page) : NULL; | 148 | return page ? page_address(page) : NULL; |
134 | } | 149 | } |
135 | 150 | ||
136 | static inline void free_thread_info(struct thread_info *ti) | 151 | static inline void free_thread_info(struct thread_info *ti) |
137 | { | 152 | { |
153 | arch_release_thread_info(ti); | ||
138 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 154 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
139 | } | 155 | } |
156 | # else | ||
157 | static struct kmem_cache *thread_info_cache; | ||
158 | |||
159 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | ||
160 | int node) | ||
161 | { | ||
162 | return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); | ||
163 | } | ||
164 | |||
165 | static void free_thread_info(struct thread_info *ti) | ||
166 | { | ||
167 | arch_release_thread_info(ti); | ||
168 | kmem_cache_free(thread_info_cache, ti); | ||
169 | } | ||
170 | |||
171 | void thread_info_cache_init(void) | ||
172 | { | ||
173 | thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, | ||
174 | THREAD_SIZE, 0, NULL); | ||
175 | BUG_ON(thread_info_cache == NULL); | ||
176 | } | ||
177 | # endif | ||
140 | #endif | 178 | #endif |
141 | 179 | ||
142 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 180 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
@@ -170,6 +208,7 @@ void free_task(struct task_struct *tsk) | |||
170 | free_thread_info(tsk->stack); | 208 | free_thread_info(tsk->stack); |
171 | rt_mutex_debug_task_free(tsk); | 209 | rt_mutex_debug_task_free(tsk); |
172 | ftrace_graph_exit_task(tsk); | 210 | ftrace_graph_exit_task(tsk); |
211 | put_seccomp_filter(tsk); | ||
173 | free_task_struct(tsk); | 212 | free_task_struct(tsk); |
174 | } | 213 | } |
175 | EXPORT_SYMBOL(free_task); | 214 | EXPORT_SYMBOL(free_task); |
@@ -203,17 +242,11 @@ void __put_task_struct(struct task_struct *tsk) | |||
203 | } | 242 | } |
204 | EXPORT_SYMBOL_GPL(__put_task_struct); | 243 | EXPORT_SYMBOL_GPL(__put_task_struct); |
205 | 244 | ||
206 | /* | 245 | void __init __weak arch_task_cache_init(void) { } |
207 | * macro override instead of weak attribute alias, to workaround | ||
208 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
209 | */ | ||
210 | #ifndef arch_task_cache_init | ||
211 | #define arch_task_cache_init() | ||
212 | #endif | ||
213 | 246 | ||
214 | void __init fork_init(unsigned long mempages) | 247 | void __init fork_init(unsigned long mempages) |
215 | { | 248 | { |
216 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 249 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
217 | #ifndef ARCH_MIN_TASKALIGN | 250 | #ifndef ARCH_MIN_TASKALIGN |
218 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 251 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
219 | #endif | 252 | #endif |
@@ -260,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
260 | int node = tsk_fork_get_node(orig); | 293 | int node = tsk_fork_get_node(orig); |
261 | int err; | 294 | int err; |
262 | 295 | ||
263 | prepare_to_copy(orig); | ||
264 | |||
265 | tsk = alloc_task_struct_node(node); | 296 | tsk = alloc_task_struct_node(node); |
266 | if (!tsk) | 297 | if (!tsk) |
267 | return NULL; | 298 | return NULL; |
@@ -273,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
273 | } | 304 | } |
274 | 305 | ||
275 | err = arch_dup_task_struct(tsk, orig); | 306 | err = arch_dup_task_struct(tsk, orig); |
276 | if (err) | ||
277 | goto out; | ||
278 | 307 | ||
308 | /* | ||
309 | * We defer looking at err, because we will need this setup | ||
310 | * for the cleanup path to work correctly. | ||
311 | */ | ||
279 | tsk->stack = ti; | 312 | tsk->stack = ti; |
280 | |||
281 | setup_thread_stack(tsk, orig); | 313 | setup_thread_stack(tsk, orig); |
314 | |||
315 | if (err) | ||
316 | goto out; | ||
317 | |||
282 | clear_user_return_notifier(tsk); | 318 | clear_user_return_notifier(tsk); |
283 | clear_tsk_need_resched(tsk); | 319 | clear_tsk_need_resched(tsk); |
284 | stackend = end_of_stack(tsk); | 320 | stackend = end_of_stack(tsk); |
@@ -355,7 +391,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
355 | } | 391 | } |
356 | charge = 0; | 392 | charge = 0; |
357 | if (mpnt->vm_flags & VM_ACCOUNT) { | 393 | if (mpnt->vm_flags & VM_ACCOUNT) { |
358 | unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | 394 | unsigned long len; |
395 | len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | ||
359 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ | 396 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ |
360 | goto fail_nomem; | 397 | goto fail_nomem; |
361 | charge = len; | 398 | charge = len; |
@@ -421,6 +458,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
421 | 458 | ||
422 | if (retval) | 459 | if (retval) |
423 | goto out; | 460 | goto out; |
461 | |||
462 | if (file && uprobe_mmap(tmp)) | ||
463 | goto out; | ||
424 | } | 464 | } |
425 | /* a new mm has just been created */ | 465 | /* a new mm has just been created */ |
426 | arch_dup_mmap(oldmm, mm); | 466 | arch_dup_mmap(oldmm, mm); |
@@ -569,6 +609,7 @@ void mmput(struct mm_struct *mm) | |||
569 | might_sleep(); | 609 | might_sleep(); |
570 | 610 | ||
571 | if (atomic_dec_and_test(&mm->mm_users)) { | 611 | if (atomic_dec_and_test(&mm->mm_users)) { |
612 | uprobe_clear_state(mm); | ||
572 | exit_aio(mm); | 613 | exit_aio(mm); |
573 | ksm_exit(mm); | 614 | ksm_exit(mm); |
574 | khugepaged_exit(mm); /* must run before exit_mmap */ | 615 | khugepaged_exit(mm); /* must run before exit_mmap */ |
@@ -579,7 +620,6 @@ void mmput(struct mm_struct *mm) | |||
579 | list_del(&mm->mmlist); | 620 | list_del(&mm->mmlist); |
580 | spin_unlock(&mmlist_lock); | 621 | spin_unlock(&mmlist_lock); |
581 | } | 622 | } |
582 | put_swap_token(mm); | ||
583 | if (mm->binfmt) | 623 | if (mm->binfmt) |
584 | module_put(mm->binfmt->module); | 624 | module_put(mm->binfmt->module); |
585 | mmdrop(mm); | 625 | mmdrop(mm); |
@@ -747,12 +787,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
747 | exit_pi_state_list(tsk); | 787 | exit_pi_state_list(tsk); |
748 | #endif | 788 | #endif |
749 | 789 | ||
790 | uprobe_free_utask(tsk); | ||
791 | |||
750 | /* Get rid of any cached register state */ | 792 | /* Get rid of any cached register state */ |
751 | deactivate_mm(tsk, mm); | 793 | deactivate_mm(tsk, mm); |
752 | 794 | ||
753 | if (tsk->vfork_done) | ||
754 | complete_vfork_done(tsk); | ||
755 | |||
756 | /* | 795 | /* |
757 | * If we're exiting normally, clear a user-space tid field if | 796 | * If we're exiting normally, clear a user-space tid field if |
758 | * requested. We leave this alone when dying by signal, to leave | 797 | * requested. We leave this alone when dying by signal, to leave |
@@ -773,6 +812,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
773 | } | 812 | } |
774 | tsk->clear_child_tid = NULL; | 813 | tsk->clear_child_tid = NULL; |
775 | } | 814 | } |
815 | |||
816 | /* | ||
817 | * All done, finally we can wake up the parent and return this mm to it. | ||
818 | * Also kthread_stop() uses this completion for synchronization. | ||
819 | */ | ||
820 | if (tsk->vfork_done) | ||
821 | complete_vfork_done(tsk); | ||
776 | } | 822 | } |
777 | 823 | ||
778 | /* | 824 | /* |
@@ -794,13 +840,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
794 | memcpy(mm, oldmm, sizeof(*mm)); | 840 | memcpy(mm, oldmm, sizeof(*mm)); |
795 | mm_init_cpumask(mm); | 841 | mm_init_cpumask(mm); |
796 | 842 | ||
797 | /* Initializing for Swap token stuff */ | ||
798 | mm->token_priority = 0; | ||
799 | mm->last_interval = 0; | ||
800 | |||
801 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 843 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
802 | mm->pmd_huge_pte = NULL; | 844 | mm->pmd_huge_pte = NULL; |
803 | #endif | 845 | #endif |
846 | uprobe_reset_state(mm); | ||
804 | 847 | ||
805 | if (!mm_init(mm, tsk)) | 848 | if (!mm_init(mm, tsk)) |
806 | goto fail_nomem; | 849 | goto fail_nomem; |
@@ -875,10 +918,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | |||
875 | goto fail_nomem; | 918 | goto fail_nomem; |
876 | 919 | ||
877 | good_mm: | 920 | good_mm: |
878 | /* Initializing for Swap token stuff */ | ||
879 | mm->token_priority = 0; | ||
880 | mm->last_interval = 0; | ||
881 | |||
882 | tsk->mm = mm; | 921 | tsk->mm = mm; |
883 | tsk->active_mm = mm; | 922 | tsk->active_mm = mm; |
884 | return 0; | 923 | return 0; |
@@ -946,9 +985,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
946 | * Share io context with parent, if CLONE_IO is set | 985 | * Share io context with parent, if CLONE_IO is set |
947 | */ | 986 | */ |
948 | if (clone_flags & CLONE_IO) { | 987 | if (clone_flags & CLONE_IO) { |
949 | tsk->io_context = ioc_task_link(ioc); | 988 | ioc_task_link(ioc); |
950 | if (unlikely(!tsk->io_context)) | 989 | tsk->io_context = ioc; |
951 | return -ENOMEM; | ||
952 | } else if (ioprio_valid(ioc->ioprio)) { | 990 | } else if (ioprio_valid(ioc->ioprio)) { |
953 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); | 991 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); |
954 | if (unlikely(!new_ioc)) | 992 | if (unlikely(!new_ioc)) |
@@ -1162,6 +1200,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1162 | goto fork_out; | 1200 | goto fork_out; |
1163 | 1201 | ||
1164 | ftrace_graph_init_task(p); | 1202 | ftrace_graph_init_task(p); |
1203 | get_seccomp_filter(p); | ||
1165 | 1204 | ||
1166 | rt_mutex_init_task(p); | 1205 | rt_mutex_init_task(p); |
1167 | 1206 | ||
@@ -1342,6 +1381,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1342 | INIT_LIST_HEAD(&p->pi_state_list); | 1381 | INIT_LIST_HEAD(&p->pi_state_list); |
1343 | p->pi_state_cache = NULL; | 1382 | p->pi_state_cache = NULL; |
1344 | #endif | 1383 | #endif |
1384 | uprobe_copy_process(p); | ||
1345 | /* | 1385 | /* |
1346 | * sigaltstack should be cleared when sharing the same VM | 1386 | * sigaltstack should be cleared when sharing the same VM |
1347 | */ | 1387 | */ |
@@ -1380,6 +1420,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1380 | */ | 1420 | */ |
1381 | p->group_leader = p; | 1421 | p->group_leader = p; |
1382 | INIT_LIST_HEAD(&p->thread_group); | 1422 | INIT_LIST_HEAD(&p->thread_group); |
1423 | INIT_HLIST_HEAD(&p->task_works); | ||
1383 | 1424 | ||
1384 | /* Now that the task is set up, run cgroup callbacks if | 1425 | /* Now that the task is set up, run cgroup callbacks if |
1385 | * necessary. We need to run them before the task is visible | 1426 | * necessary. We need to run them before the task is visible |
@@ -1464,6 +1505,8 @@ bad_fork_cleanup_io: | |||
1464 | if (p->io_context) | 1505 | if (p->io_context) |
1465 | exit_io_context(p); | 1506 | exit_io_context(p); |
1466 | bad_fork_cleanup_namespaces: | 1507 | bad_fork_cleanup_namespaces: |
1508 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1509 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1467 | exit_task_namespaces(p); | 1510 | exit_task_namespaces(p); |
1468 | bad_fork_cleanup_mm: | 1511 | bad_fork_cleanup_mm: |
1469 | if (p->mm) | 1512 | if (p->mm) |
diff --git a/kernel/groups.c b/kernel/groups.c
index 99b53d1eb7ea..6b2588dd04ff 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize) | |||
31 | group_info->blocks[0] = group_info->small_block; | 31 | group_info->blocks[0] = group_info->small_block; |
32 | else { | 32 | else { |
33 | for (i = 0; i < nblocks; i++) { | 33 | for (i = 0; i < nblocks; i++) { |
34 | gid_t *b; | 34 | kgid_t *b; |
35 | b = (void *)__get_free_page(GFP_USER); | 35 | b = (void *)__get_free_page(GFP_USER); |
36 | if (!b) | 36 | if (!b) |
37 | goto out_undo_partial_alloc; | 37 | goto out_undo_partial_alloc; |
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free); | |||
66 | static int groups_to_user(gid_t __user *grouplist, | 66 | static int groups_to_user(gid_t __user *grouplist, |
67 | const struct group_info *group_info) | 67 | const struct group_info *group_info) |
68 | { | 68 | { |
69 | struct user_namespace *user_ns = current_user_ns(); | ||
69 | int i; | 70 | int i; |
70 | unsigned int count = group_info->ngroups; | 71 | unsigned int count = group_info->ngroups; |
71 | 72 | ||
72 | for (i = 0; i < group_info->nblocks; i++) { | 73 | for (i = 0; i < count; i++) { |
73 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 74 | gid_t gid; |
74 | unsigned int len = cp_count * sizeof(*grouplist); | 75 | gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); |
75 | 76 | if (put_user(gid, grouplist+i)) | |
76 | if (copy_to_user(grouplist, group_info->blocks[i], len)) | ||
77 | return -EFAULT; | 77 | return -EFAULT; |
78 | |||
79 | grouplist += NGROUPS_PER_BLOCK; | ||
80 | count -= cp_count; | ||
81 | } | 78 | } |
82 | return 0; | 79 | return 0; |
83 | } | 80 | } |
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist, | |||
86 | static int groups_from_user(struct group_info *group_info, | 83 | static int groups_from_user(struct group_info *group_info, |
87 | gid_t __user *grouplist) | 84 | gid_t __user *grouplist) |
88 | { | 85 | { |
86 | struct user_namespace *user_ns = current_user_ns(); | ||
89 | int i; | 87 | int i; |
90 | unsigned int count = group_info->ngroups; | 88 | unsigned int count = group_info->ngroups; |
91 | 89 | ||
92 | for (i = 0; i < group_info->nblocks; i++) { | 90 | for (i = 0; i < count; i++) { |
93 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 91 | gid_t gid; |
94 | unsigned int len = cp_count * sizeof(*grouplist); | 92 | kgid_t kgid; |
95 | 93 | if (get_user(gid, grouplist+i)) | |
96 | if (copy_from_user(group_info->blocks[i], grouplist, len)) | ||
97 | return -EFAULT; | 94 | return -EFAULT; |
98 | 95 | ||
99 | grouplist += NGROUPS_PER_BLOCK; | 96 | kgid = make_kgid(user_ns, gid); |
100 | count -= cp_count; | 97 | if (!gid_valid(kgid)) |
98 | return -EINVAL; | ||
99 | |||
100 | GROUP_AT(group_info, i) = kgid; | ||
101 | } | 101 | } |
102 | return 0; | 102 | return 0; |
103 | } | 103 | } |
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info) | |||
117 | for (base = 0; base < max; base++) { | 117 | for (base = 0; base < max; base++) { |
118 | int left = base; | 118 | int left = base; |
119 | int right = left + stride; | 119 | int right = left + stride; |
120 | gid_t tmp = GROUP_AT(group_info, right); | 120 | kgid_t tmp = GROUP_AT(group_info, right); |
121 | 121 | ||
122 | while (left >= 0 && GROUP_AT(group_info, left) > tmp) { | 122 | while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { |
123 | GROUP_AT(group_info, right) = | 123 | GROUP_AT(group_info, right) = |
124 | GROUP_AT(group_info, left); | 124 | GROUP_AT(group_info, left); |
125 | right = left; | 125 | right = left; |
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info) | |||
132 | } | 132 | } |
133 | 133 | ||
134 | /* a simple bsearch */ | 134 | /* a simple bsearch */ |
135 | int groups_search(const struct group_info *group_info, gid_t grp) | 135 | int groups_search(const struct group_info *group_info, kgid_t grp) |
136 | { | 136 | { |
137 | unsigned int left, right; | 137 | unsigned int left, right; |
138 | 138 | ||
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) | |||
143 | right = group_info->ngroups; | 143 | right = group_info->ngroups; |
144 | while (left < right) { | 144 | while (left < right) { |
145 | unsigned int mid = (left+right)/2; | 145 | unsigned int mid = (left+right)/2; |
146 | if (grp > GROUP_AT(group_info, mid)) | 146 | if (gid_gt(grp, GROUP_AT(group_info, mid))) |
147 | left = mid + 1; | 147 | left = mid + 1; |
148 | else if (grp < GROUP_AT(group_info, mid)) | 148 | else if (gid_lt(grp, GROUP_AT(group_info, mid))) |
149 | right = mid; | 149 | right = mid; |
150 | else | 150 | else |
151 | return 1; | 151 | return 1; |
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
256 | /* | 256 | /* |
257 | * Check whether we're fsgid/egid or in the supplemental group.. | 257 | * Check whether we're fsgid/egid or in the supplemental group.. |
258 | */ | 258 | */ |
259 | int in_group_p(gid_t grp) | 259 | int in_group_p(kgid_t grp) |
260 | { | 260 | { |
261 | const struct cred *cred = current_cred(); | 261 | const struct cred *cred = current_cred(); |
262 | int retval = 1; | 262 | int retval = 1; |
263 | 263 | ||
264 | if (grp != cred->fsgid) | 264 | if (!gid_eq(grp, cred->fsgid)) |
265 | retval = groups_search(cred->group_info, grp); | 265 | retval = groups_search(cred->group_info, grp); |
266 | return retval; | 266 | return retval; |
267 | } | 267 | } |
268 | 268 | ||
269 | EXPORT_SYMBOL(in_group_p); | 269 | EXPORT_SYMBOL(in_group_p); |
270 | 270 | ||
271 | int in_egroup_p(gid_t grp) | 271 | int in_egroup_p(kgid_t grp) |
272 | { | 272 | { |
273 | const struct cred *cred = current_cred(); | 273 | const struct cred *cred = current_cred(); |
274 | int retval = 1; | 274 | int retval = 1; |
275 | 275 | ||
276 | if (grp != cred->egid) | 276 | if (!gid_eq(grp, cred->egid)) |
277 | retval = groups_search(cred->group_info, grp); | 277 | retval = groups_search(cred->group_info, grp); |
278 | return retval; | 278 | return retval; |
279 | } | 279 | } |
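The kgid_t values used above come from the uidgid helpers; in the initial user namespace, where the mapping is the identity, they behave as in this small sketch (assuming <linux/uidgid.h> semantics):

	kgid_t k1000 = make_kgid(&init_user_ns, 1000);
	kgid_t k1001 = make_kgid(&init_user_ns, 1001);

	bool valid = gid_valid(k1000);				/* true: 1000 maps in init_user_ns */
	bool equal = gid_eq(k1000, k1001);			/* false */
	bool less  = gid_lt(k1000, k1001);			/* true */
	gid_t gid  = from_kgid_munged(&init_user_ns, k1000);	/* 1000 again */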
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
657 | return 0; | 657 | return 0; |
658 | } | 658 | } |
659 | 659 | ||
660 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | ||
661 | { | ||
662 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | ||
663 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | ||
664 | |||
665 | return ktime_get_update_offsets(offs_real, offs_boot); | ||
666 | } | ||
667 | |||
660 | /* | 668 | /* |
661 | * Retrigger next event is called after clock was set | 669 | * Retrigger next event is called after clock was set |
662 | * | 670 | * |
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
665 | static void retrigger_next_event(void *arg) | 673 | static void retrigger_next_event(void *arg) |
666 | { | 674 | { |
667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 675 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); |
668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
669 | 676 | ||
670 | if (!hrtimer_hres_active()) | 677 | if (!hrtimer_hres_active()) |
671 | return; | 678 | return; |
672 | 679 | ||
673 | /* Optimized out for !HIGH_RES */ | ||
674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
676 | |||
677 | /* Adjust CLOCK_REALTIME offset */ | ||
678 | raw_spin_lock(&base->lock); | 680 | raw_spin_lock(&base->lock); |
679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | 681 | hrtimer_update_base(base); |
680 | timespec_to_ktime(realtime_offset); | ||
681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
682 | timespec_to_ktime(sleep); | ||
683 | |||
684 | hrtimer_force_reprogram(base, 0); | 682 | hrtimer_force_reprogram(base, 0); |
685 | raw_spin_unlock(&base->lock); | 683 | raw_spin_unlock(&base->lock); |
686 | } | 684 | } |
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) | |||
710 | base->clock_base[i].resolution = KTIME_HIGH_RES; | 708 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
711 | 709 | ||
712 | tick_setup_sched_timer(); | 710 | tick_setup_sched_timer(); |
713 | |||
714 | /* "Retrigger" the interrupt to get things going */ | 711 | /* "Retrigger" the interrupt to get things going */ |
715 | retrigger_next_event(NULL); | 712 | retrigger_next_event(NULL); |
716 | local_irq_restore(flags); | 713 | local_irq_restore(flags); |
717 | return 1; | 714 | return 1; |
718 | } | 715 | } |
719 | 716 | ||
717 | /* | ||
718 | * Called from timekeeping code to reprogramm the hrtimer interrupt | ||
719 | * device. If called from the timer interrupt context we defer it to | ||
720 | * softirq context. | ||
721 | */ | ||
722 | void clock_was_set_delayed(void) | ||
723 | { | ||
724 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
725 | |||
726 | cpu_base->clock_was_set = 1; | ||
727 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
728 | } | ||
729 | |||
720 | #else | 730 | #else |
721 | 731 | ||
722 | static inline int hrtimer_hres_active(void) { return 0; } | 732 | static inline int hrtimer_hres_active(void) { return 0; } |
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1250 | cpu_base->nr_events++; | 1260 | cpu_base->nr_events++; |
1251 | dev->next_event.tv64 = KTIME_MAX; | 1261 | dev->next_event.tv64 = KTIME_MAX; |
1252 | 1262 | ||
1253 | entry_time = now = ktime_get(); | 1263 | raw_spin_lock(&cpu_base->lock); |
1264 | entry_time = now = hrtimer_update_base(cpu_base); | ||
1254 | retry: | 1265 | retry: |
1255 | expires_next.tv64 = KTIME_MAX; | 1266 | expires_next.tv64 = KTIME_MAX; |
1256 | |||
1257 | raw_spin_lock(&cpu_base->lock); | ||
1258 | /* | 1267 | /* |
1259 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1268 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
1260 | * held to prevent that a timer is enqueued in our queue via | 1269 | * held to prevent that a timer is enqueued in our queue via |
@@ -1330,8 +1339,12 @@ retry: | |||
1330 | * We need to prevent that we loop forever in the hrtimer | 1339 | * We need to prevent that we loop forever in the hrtimer |
1331 | * interrupt routine. We give it 3 attempts to avoid | 1340 | * interrupt routine. We give it 3 attempts to avoid |
1332 | * overreacting on some spurious event. | 1341 | * overreacting on some spurious event. |
1342 | * | ||
1343 | * Acquire base lock for updating the offsets and retrieving | ||
1344 | * the current time. | ||
1333 | */ | 1345 | */ |
1334 | now = ktime_get(); | 1346 | raw_spin_lock(&cpu_base->lock); |
1347 | now = hrtimer_update_base(cpu_base); | ||
1335 | cpu_base->nr_retries++; | 1348 | cpu_base->nr_retries++; |
1336 | if (++retries < 3) | 1349 | if (++retries < 3) |
1337 | goto retry; | 1350 | goto retry; |
@@ -1343,6 +1356,7 @@ retry: | |||
1343 | */ | 1356 | */ |
1344 | cpu_base->nr_hangs++; | 1357 | cpu_base->nr_hangs++; |
1345 | cpu_base->hang_detected = 1; | 1358 | cpu_base->hang_detected = 1; |
1359 | raw_spin_unlock(&cpu_base->lock); | ||
1346 | delta = ktime_sub(now, entry_time); | 1360 | delta = ktime_sub(now, entry_time); |
1347 | if (delta.tv64 > cpu_base->max_hang_time.tv64) | 1361 | if (delta.tv64 > cpu_base->max_hang_time.tv64) |
1348 | cpu_base->max_hang_time = delta; | 1362 | cpu_base->max_hang_time = delta; |
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) | |||
1395 | 1409 | ||
1396 | static void run_hrtimer_softirq(struct softirq_action *h) | 1410 | static void run_hrtimer_softirq(struct softirq_action *h) |
1397 | { | 1411 | { |
1412 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1413 | |||
1414 | if (cpu_base->clock_was_set) { | ||
1415 | cpu_base->clock_was_set = 0; | ||
1416 | clock_was_set(); | ||
1417 | } | ||
1418 | |||
1398 | hrtimer_peek_ahead_timers(); | 1419 | hrtimer_peek_ahead_timers(); |
1399 | } | 1420 | } |
1400 | 1421 | ||
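Two things change in hrtimer.c: the realtime/boottime offsets are now refreshed under cpu_base->lock via hrtimer_update_base(), and clock_was_set_delayed() lets the timekeeping code request the expensive clock_was_set() work from HRTIMER_SOFTIRQ instead of doing it in timer-interrupt context. The userspace toy below models only that second part, the flag-plus-softirq deferral pattern; all names are illustrative, not kernel code.

#include <stdio.h>
#include <stdbool.h>

/* Flags standing in for cpu_base->clock_was_set and the raised softirq. */
static bool clock_was_set_pending;
static bool hrtimer_softirq_raised;

/* The expensive part: retrigger/reprogram the hrtimer bases. */
static void toy_clock_was_set(void)
{
        printf("reprogramming hrtimer bases\n");
}

/* Interrupt-context side: stays cheap, only records the request. */
static void toy_clock_was_set_delayed(void)
{
        clock_was_set_pending = true;
        hrtimer_softirq_raised = true;  /* __raise_softirq_irqoff(HRTIMER_SOFTIRQ) */
}

/* Softirq side, like run_hrtimer_softirq(): do the deferred work first. */
static void toy_run_hrtimer_softirq(void)
{
        if (clock_was_set_pending) {
                clock_was_set_pending = false;
                toy_clock_was_set();
        }
        /* ...then the usual hrtimer_peek_ahead_timers() processing... */
}

int main(void)
{
        toy_clock_was_set_delayed();    /* e.g. timekeeping update in hardirq */
        if (hrtimer_softirq_raised)
                toy_run_hrtimer_softirq();
        return 0;
}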
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2a..6df614912b9d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
108 | 108 | ||
109 | touch_nmi_watchdog(); | 109 | touch_nmi_watchdog(); |
110 | 110 | ||
111 | if (sysctl_hung_task_panic) | 111 | if (sysctl_hung_task_panic) { |
112 | trigger_all_cpu_backtrace(); | ||
112 | panic("hung_task: blocked tasks"); | 113 | panic("hung_task: blocked tasks"); |
114 | } | ||
113 | } | 115 | } |
114 | 116 | ||
115 | /* | 117 | /* |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6080f6bc8c33..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) | |||
275 | kstat_incr_irqs_this_cpu(irq, desc); | 275 | kstat_incr_irqs_this_cpu(irq, desc); |
276 | 276 | ||
277 | action = desc->action; | 277 | action = desc->action; |
278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) | 278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { |
279 | desc->istate |= IRQS_PENDING; | ||
279 | goto out_unlock; | 280 | goto out_unlock; |
281 | } | ||
280 | 282 | ||
281 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 283 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
282 | raw_spin_unlock_irq(&desc->lock); | 284 | raw_spin_unlock_irq(&desc->lock); |
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
324 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 326 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
325 | kstat_incr_irqs_this_cpu(irq, desc); | 327 | kstat_incr_irqs_this_cpu(irq, desc); |
326 | 328 | ||
327 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 329 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
330 | desc->istate |= IRQS_PENDING; | ||
328 | goto out_unlock; | 331 | goto out_unlock; |
332 | } | ||
329 | 333 | ||
330 | handle_irq_event(desc); | 334 | handle_irq_event(desc); |
331 | 335 | ||
@@ -379,8 +383,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
379 | * If its disabled or no action available | 383 | * If its disabled or no action available |
380 | * keep it masked and get out of here | 384 | * keep it masked and get out of here |
381 | */ | 385 | */ |
382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 386 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
387 | desc->istate |= IRQS_PENDING; | ||
383 | goto out_unlock; | 388 | goto out_unlock; |
389 | } | ||
384 | 390 | ||
385 | handle_irq_event(desc); | 391 | handle_irq_event(desc); |
386 | 392 | ||
@@ -518,6 +524,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
518 | out_unlock: | 524 | out_unlock: |
519 | raw_spin_unlock(&desc->lock); | 525 | raw_spin_unlock(&desc->lock); |
520 | } | 526 | } |
527 | EXPORT_SYMBOL(handle_edge_irq); | ||
521 | 528 | ||
522 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | 529 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER |
523 | /** | 530 | /** |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); | |||
101 | 101 | ||
102 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 102 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
103 | 103 | ||
104 | extern int irq_do_set_affinity(struct irq_data *data, | ||
105 | const struct cpumask *dest, bool force); | ||
106 | |||
104 | /* Inline functions for support of irq chips on slow busses */ | 107 | /* Inline functions for support of irq chips on slow busses */ |
105 | static inline void chip_bus_lock(struct irq_desc *desc) | 108 | static inline void chip_bus_lock(struct irq_desc *desc) |
106 | { | 109 | { |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d86e254b95eb..192a302d6cfd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
112 | { | 112 | { |
113 | return radix_tree_lookup(&irq_desc_tree, irq); | 113 | return radix_tree_lookup(&irq_desc_tree, irq); |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(irq_to_desc); | ||
115 | 116 | ||
116 | static void delete_irq_desc(unsigned int irq) | 117 | static void delete_irq_desc(unsigned int irq) |
117 | { | 118 | { |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 0e0ba5f840b2..41c1564103f1 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #define pr_fmt(fmt) "irq: " fmt | ||
2 | |||
1 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
2 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
3 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
@@ -56,14 +58,73 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, | |||
56 | return domain; | 58 | return domain; |
57 | } | 59 | } |
58 | 60 | ||
61 | static void irq_domain_free(struct irq_domain *domain) | ||
62 | { | ||
63 | of_node_put(domain->of_node); | ||
64 | kfree(domain); | ||
65 | } | ||
66 | |||
59 | static void irq_domain_add(struct irq_domain *domain) | 67 | static void irq_domain_add(struct irq_domain *domain) |
60 | { | 68 | { |
61 | mutex_lock(&irq_domain_mutex); | 69 | mutex_lock(&irq_domain_mutex); |
62 | list_add(&domain->link, &irq_domain_list); | 70 | list_add(&domain->link, &irq_domain_list); |
63 | mutex_unlock(&irq_domain_mutex); | 71 | mutex_unlock(&irq_domain_mutex); |
64 | pr_debug("irq: Allocated domain of type %d @0x%p\n", | 72 | pr_debug("Allocated domain of type %d @0x%p\n", |
73 | domain->revmap_type, domain); | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * irq_domain_remove() - Remove an irq domain. | ||
78 | * @domain: domain to remove | ||
79 | * | ||
80 | * This routine is used to remove an irq domain. The caller must ensure | ||
81 | * that all mappings within the domain have been disposed of prior to | ||
82 | * use, depending on the revmap type. | ||
83 | */ | ||
84 | void irq_domain_remove(struct irq_domain *domain) | ||
85 | { | ||
86 | mutex_lock(&irq_domain_mutex); | ||
87 | |||
88 | switch (domain->revmap_type) { | ||
89 | case IRQ_DOMAIN_MAP_LEGACY: | ||
90 | /* | ||
91 | * Legacy domains don't manage their own irq_desc | ||
92 | * allocations, we expect the caller to handle irq_desc | ||
93 | * freeing on their own. | ||
94 | */ | ||
95 | break; | ||
96 | case IRQ_DOMAIN_MAP_TREE: | ||
97 | /* | ||
98 | * radix_tree_delete() takes care of destroying the root | ||
99 | * node when all entries are removed. Shout if there are | ||
100 | * any mappings left. | ||
101 | */ | ||
102 | WARN_ON(domain->revmap_data.tree.height); | ||
103 | break; | ||
104 | case IRQ_DOMAIN_MAP_LINEAR: | ||
105 | kfree(domain->revmap_data.linear.revmap); | ||
106 | domain->revmap_data.linear.size = 0; | ||
107 | break; | ||
108 | case IRQ_DOMAIN_MAP_NOMAP: | ||
109 | break; | ||
110 | } | ||
111 | |||
112 | list_del(&domain->link); | ||
113 | |||
114 | /* | ||
115 | * If the going away domain is the default one, reset it. | ||
116 | */ | ||
117 | if (unlikely(irq_default_domain == domain)) | ||
118 | irq_set_default_host(NULL); | ||
119 | |||
120 | mutex_unlock(&irq_domain_mutex); | ||
121 | |||
122 | pr_debug("Removed domain of type %d @0x%p\n", | ||
65 | domain->revmap_type, domain); | 123 | domain->revmap_type, domain); |
124 | |||
125 | irq_domain_free(domain); | ||
66 | } | 126 | } |
127 | EXPORT_SYMBOL_GPL(irq_domain_remove); | ||
67 | 128 | ||
68 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | 129 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, |
69 | irq_hw_number_t hwirq) | 130 | irq_hw_number_t hwirq) |
@@ -117,8 +178,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
117 | 178 | ||
118 | if (WARN_ON(!irq_data || irq_data->domain)) { | 179 | if (WARN_ON(!irq_data || irq_data->domain)) { |
119 | mutex_unlock(&irq_domain_mutex); | 180 | mutex_unlock(&irq_domain_mutex); |
120 | of_node_put(domain->of_node); | 181 | irq_domain_free(domain); |
121 | kfree(domain); | ||
122 | return NULL; | 182 | return NULL; |
123 | } | 183 | } |
124 | } | 184 | } |
@@ -152,10 +212,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
152 | irq_domain_add(domain); | 212 | irq_domain_add(domain); |
153 | return domain; | 213 | return domain; |
154 | } | 214 | } |
215 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); | ||
155 | 216 | ||
156 | /** | 217 | /** |
157 | * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. | 218 | * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. |
158 | * @of_node: pointer to interrupt controller's device tree node. | 219 | * @of_node: pointer to interrupt controller's device tree node. |
220 | * @size: Number of interrupts in the domain. | ||
159 | * @ops: map/unmap domain callbacks | 221 | * @ops: map/unmap domain callbacks |
160 | * @host_data: Controller private data pointer | 222 | * @host_data: Controller private data pointer |
161 | */ | 223 | */ |
@@ -181,6 +243,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, | |||
181 | irq_domain_add(domain); | 243 | irq_domain_add(domain); |
182 | return domain; | 244 | return domain; |
183 | } | 245 | } |
246 | EXPORT_SYMBOL_GPL(irq_domain_add_linear); | ||
184 | 247 | ||
185 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | 248 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, |
186 | unsigned int max_irq, | 249 | unsigned int max_irq, |
@@ -195,6 +258,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | |||
195 | } | 258 | } |
196 | return domain; | 259 | return domain; |
197 | } | 260 | } |
261 | EXPORT_SYMBOL_GPL(irq_domain_add_nomap); | ||
198 | 262 | ||
199 | /** | 263 | /** |
200 | * irq_domain_add_tree() | 264 | * irq_domain_add_tree() |
@@ -216,6 +280,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, | |||
216 | } | 280 | } |
217 | return domain; | 281 | return domain; |
218 | } | 282 | } |
283 | EXPORT_SYMBOL_GPL(irq_domain_add_tree); | ||
219 | 284 | ||
220 | /** | 285 | /** |
221 | * irq_find_host() - Locates a domain for a given device node | 286 | * irq_find_host() - Locates a domain for a given device node |
@@ -259,10 +324,11 @@ EXPORT_SYMBOL_GPL(irq_find_host); | |||
259 | */ | 324 | */ |
260 | void irq_set_default_host(struct irq_domain *domain) | 325 | void irq_set_default_host(struct irq_domain *domain) |
261 | { | 326 | { |
262 | pr_debug("irq: Default domain set to @0x%p\n", domain); | 327 | pr_debug("Default domain set to @0x%p\n", domain); |
263 | 328 | ||
264 | irq_default_domain = domain; | 329 | irq_default_domain = domain; |
265 | } | 330 | } |
331 | EXPORT_SYMBOL_GPL(irq_set_default_host); | ||
266 | 332 | ||
267 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | 333 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, |
268 | irq_hw_number_t hwirq) | 334 | irq_hw_number_t hwirq) |
@@ -272,7 +338,7 @@ static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | |||
272 | irq_data->hwirq = hwirq; | 338 | irq_data->hwirq = hwirq; |
273 | irq_data->domain = domain; | 339 | irq_data->domain = domain; |
274 | if (domain->ops->map(domain, virq, hwirq)) { | 340 | if (domain->ops->map(domain, virq, hwirq)) { |
275 | pr_debug("irq: -> mapping failed, freeing\n"); | 341 | pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq); |
276 | irq_data->domain = NULL; | 342 | irq_data->domain = NULL; |
277 | irq_data->hwirq = 0; | 343 | irq_data->hwirq = 0; |
278 | return -1; | 344 | return -1; |
@@ -303,7 +369,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
303 | 369 | ||
304 | virq = irq_alloc_desc_from(1, 0); | 370 | virq = irq_alloc_desc_from(1, 0); |
305 | if (!virq) { | 371 | if (!virq) { |
306 | pr_debug("irq: create_direct virq allocation failed\n"); | 372 | pr_debug("create_direct virq allocation failed\n"); |
307 | return 0; | 373 | return 0; |
308 | } | 374 | } |
309 | if (virq >= domain->revmap_data.nomap.max_irq) { | 375 | if (virq >= domain->revmap_data.nomap.max_irq) { |
@@ -312,7 +378,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
312 | irq_free_desc(virq); | 378 | irq_free_desc(virq); |
313 | return 0; | 379 | return 0; |
314 | } | 380 | } |
315 | pr_debug("irq: create_direct obtained virq %d\n", virq); | 381 | pr_debug("create_direct obtained virq %d\n", virq); |
316 | 382 | ||
317 | if (irq_setup_virq(domain, virq, virq)) { | 383 | if (irq_setup_virq(domain, virq, virq)) { |
318 | irq_free_desc(virq); | 384 | irq_free_desc(virq); |
@@ -321,6 +387,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
321 | 387 | ||
322 | return virq; | 388 | return virq; |
323 | } | 389 | } |
390 | EXPORT_SYMBOL_GPL(irq_create_direct_mapping); | ||
324 | 391 | ||
325 | /** | 392 | /** |
326 | * irq_create_mapping() - Map a hardware interrupt into linux irq space | 393 | * irq_create_mapping() - Map a hardware interrupt into linux irq space |
@@ -338,23 +405,23 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
338 | unsigned int hint; | 405 | unsigned int hint; |
339 | int virq; | 406 | int virq; |
340 | 407 | ||
341 | pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | 408 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); |
342 | 409 | ||
343 | /* Look for default domain if necessary */ | 410 | /* Look for default domain if necessary */ |
344 | if (domain == NULL) | 411 | if (domain == NULL) |
345 | domain = irq_default_domain; | 412 | domain = irq_default_domain; |
346 | if (domain == NULL) { | 413 | if (domain == NULL) { |
347 | printk(KERN_WARNING "irq_create_mapping called for" | 414 | pr_warning("irq_create_mapping called for" |
348 | " NULL domain, hwirq=%lx\n", hwirq); | 415 | " NULL domain, hwirq=%lx\n", hwirq); |
349 | WARN_ON(1); | 416 | WARN_ON(1); |
350 | return 0; | 417 | return 0; |
351 | } | 418 | } |
352 | pr_debug("irq: -> using domain @%p\n", domain); | 419 | pr_debug("-> using domain @%p\n", domain); |
353 | 420 | ||
354 | /* Check if mapping already exists */ | 421 | /* Check if mapping already exists */ |
355 | virq = irq_find_mapping(domain, hwirq); | 422 | virq = irq_find_mapping(domain, hwirq); |
356 | if (virq) { | 423 | if (virq) { |
357 | pr_debug("irq: -> existing mapping on virq %d\n", virq); | 424 | pr_debug("-> existing mapping on virq %d\n", virq); |
358 | return virq; | 425 | return virq; |
359 | } | 426 | } |
360 | 427 | ||
@@ -370,7 +437,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
370 | if (virq <= 0) | 437 | if (virq <= 0) |
371 | virq = irq_alloc_desc_from(1, 0); | 438 | virq = irq_alloc_desc_from(1, 0); |
372 | if (virq <= 0) { | 439 | if (virq <= 0) { |
373 | pr_debug("irq: -> virq allocation failed\n"); | 440 | pr_debug("-> virq allocation failed\n"); |
374 | return 0; | 441 | return 0; |
375 | } | 442 | } |
376 | 443 | ||
@@ -380,7 +447,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
380 | return 0; | 447 | return 0; |
381 | } | 448 | } |
382 | 449 | ||
383 | pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", | 450 | pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", |
384 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); | 451 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); |
385 | 452 | ||
386 | return virq; | 453 | return virq; |
@@ -409,8 +476,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, | |||
409 | if (intsize > 0) | 476 | if (intsize > 0) |
410 | return intspec[0]; | 477 | return intspec[0]; |
411 | #endif | 478 | #endif |
412 | printk(KERN_WARNING "irq: no irq domain found for %s !\n", | 479 | pr_warning("no irq domain found for %s !\n", |
413 | controller->full_name); | 480 | controller->full_name); |
414 | return 0; | 481 | return 0; |
415 | } | 482 | } |
416 | 483 | ||
@@ -560,6 +627,7 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, | |||
560 | */ | 627 | */ |
561 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); | 628 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); |
562 | } | 629 | } |
630 | EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup); | ||
563 | 631 | ||
564 | /** | 632 | /** |
565 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. | 633 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. |
@@ -584,6 +652,7 @@ void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, | |||
584 | mutex_unlock(&revmap_trees_mutex); | 652 | mutex_unlock(&revmap_trees_mutex); |
585 | } | 653 | } |
586 | } | 654 | } |
655 | EXPORT_SYMBOL_GPL(irq_radix_revmap_insert); | ||
587 | 656 | ||
588 | /** | 657 | /** |
589 | * irq_linear_revmap() - Find a linux irq from a hw irq number. | 658 | * irq_linear_revmap() - Find a linux irq from a hw irq number. |
@@ -617,6 +686,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, | |||
617 | 686 | ||
618 | return revmap[hwirq]; | 687 | return revmap[hwirq]; |
619 | } | 688 | } |
689 | EXPORT_SYMBOL_GPL(irq_linear_revmap); | ||
620 | 690 | ||
621 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG | 691 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG |
622 | static int virq_debug_show(struct seq_file *m, void *private) | 692 | static int virq_debug_show(struct seq_file *m, void *private) |
@@ -691,8 +761,8 @@ static int __init irq_debugfs_init(void) | |||
691 | __initcall(irq_debugfs_init); | 761 | __initcall(irq_debugfs_init); |
692 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ | 762 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ |
693 | 763 | ||
694 | int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, | 764 | static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, |
695 | irq_hw_number_t hwirq) | 765 | irq_hw_number_t hwirq) |
696 | { | 766 | { |
697 | return 0; | 767 | return 0; |
698 | } | 768 | } |
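irqdomain.c gains irq_domain_remove() and exports the add/remove and revmap entry points, so a modular interrupt-controller driver can now tear its domain down cleanly. A hedged sketch of that lifecycle for a linear domain follows; my_intc_*, the 32-interrupt size and the empty map callback are illustrative and not taken from this patch.

/* Sketch only: lifecycle of a linear irq_domain in a modular driver. */
#include <linux/errno.h>
#include <linux/irqdomain.h>
#include <linux/module.h>

static int my_intc_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        /* A real driver would set chip/handler data for virq here. */
        return 0;
}

static const struct irq_domain_ops my_intc_ops = {
        .map = my_intc_map,
};

static struct irq_domain *my_domain;

static int __init my_intc_init(void)
{
        /* 32 hwirqs, no device-tree node in this sketch. */
        my_domain = irq_domain_add_linear(NULL, 32, &my_intc_ops, NULL);
        return my_domain ? 0 : -ENOMEM;
}

static void __exit my_intc_exit(void)
{
        /* All mappings must have been disposed of before removal. */
        irq_domain_remove(my_domain);
}

module_init(my_intc_init);
module_exit(my_intc_exit);
MODULE_LICENSE("GPL");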
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569b..8c548232ba39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -7,6 +7,8 @@ | |||
7 | * This file contains driver APIs to the irq subsystem. | 7 | * This file contains driver APIs to the irq subsystem. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #define pr_fmt(fmt) "genirq: " fmt | ||
11 | |||
10 | #include <linux/irq.h> | 12 | #include <linux/irq.h> |
11 | #include <linux/kthread.h> | 13 | #include <linux/kthread.h> |
12 | #include <linux/module.h> | 14 | #include <linux/module.h> |
@@ -14,6 +16,7 @@ | |||
14 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
15 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
16 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/task_work.h> | ||
17 | 20 | ||
18 | #include "internals.h" | 21 | #include "internals.h" |
19 | 22 | ||
@@ -139,6 +142,25 @@ static inline void | |||
139 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | 142 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } |
140 | #endif | 143 | #endif |
141 | 144 | ||
145 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | ||
146 | bool force) | ||
147 | { | ||
148 | struct irq_desc *desc = irq_data_to_desc(data); | ||
149 | struct irq_chip *chip = irq_data_get_irq_chip(data); | ||
150 | int ret; | ||
151 | |||
152 | ret = chip->irq_set_affinity(data, mask, false); | ||
153 | switch (ret) { | ||
154 | case IRQ_SET_MASK_OK: | ||
155 | cpumask_copy(data->affinity, mask); | ||
156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
157 | irq_set_thread_affinity(desc); | ||
158 | ret = 0; | ||
159 | } | ||
160 | |||
161 | return ret; | ||
162 | } | ||
163 | |||
142 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | 164 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) |
143 | { | 165 | { |
144 | struct irq_chip *chip = irq_data_get_irq_chip(data); | 166 | struct irq_chip *chip = irq_data_get_irq_chip(data); |
@@ -149,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | |||
149 | return -EINVAL; | 171 | return -EINVAL; |
150 | 172 | ||
151 | if (irq_can_move_pcntxt(data)) { | 173 | if (irq_can_move_pcntxt(data)) { |
152 | ret = chip->irq_set_affinity(data, mask, false); | 174 | ret = irq_do_set_affinity(data, mask, false); |
153 | switch (ret) { | ||
154 | case IRQ_SET_MASK_OK: | ||
155 | cpumask_copy(data->affinity, mask); | ||
156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
157 | irq_set_thread_affinity(desc); | ||
158 | ret = 0; | ||
159 | } | ||
160 | } else { | 175 | } else { |
161 | irqd_set_move_pending(data); | 176 | irqd_set_move_pending(data); |
162 | irq_copy_pending(desc, mask); | 177 | irq_copy_pending(desc, mask); |
@@ -280,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); | |||
280 | static int | 295 | static int |
281 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | 296 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) |
282 | { | 297 | { |
283 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
284 | struct cpumask *set = irq_default_affinity; | 298 | struct cpumask *set = irq_default_affinity; |
285 | int ret, node = desc->irq_data.node; | 299 | int node = desc->irq_data.node; |
286 | 300 | ||
287 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | 301 | /* Excludes PER_CPU and NO_BALANCE interrupts */ |
288 | if (!irq_can_set_affinity(irq)) | 302 | if (!irq_can_set_affinity(irq)) |
@@ -308,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
308 | if (cpumask_intersects(mask, nodemask)) | 322 | if (cpumask_intersects(mask, nodemask)) |
309 | cpumask_and(mask, mask, nodemask); | 323 | cpumask_and(mask, mask, nodemask); |
310 | } | 324 | } |
311 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); | 325 | irq_do_set_affinity(&desc->irq_data, mask, false); |
312 | switch (ret) { | ||
313 | case IRQ_SET_MASK_OK: | ||
314 | cpumask_copy(desc->irq_data.affinity, mask); | ||
315 | case IRQ_SET_MASK_OK_NOCOPY: | ||
316 | irq_set_thread_affinity(desc); | ||
317 | } | ||
318 | return 0; | 326 | return 0; |
319 | } | 327 | } |
320 | #else | 328 | #else |
@@ -566,7 +574,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
566 | * flow-types? | 574 | * flow-types? |
567 | */ | 575 | */ |
568 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, | 576 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, |
569 | chip ? (chip->name ? : "unknown") : "unknown"); | 577 | chip ? (chip->name ? : "unknown") : "unknown"); |
570 | return 0; | 578 | return 0; |
571 | } | 579 | } |
572 | 580 | ||
@@ -600,7 +608,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
600 | ret = 0; | 608 | ret = 0; |
601 | break; | 609 | break; |
602 | default: | 610 | default: |
603 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 611 | pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", |
604 | flags, irq, chip->irq_set_type); | 612 | flags, irq, chip->irq_set_type); |
605 | } | 613 | } |
606 | if (unmask) | 614 | if (unmask) |
@@ -773,11 +781,39 @@ static void wake_threads_waitq(struct irq_desc *desc) | |||
773 | wake_up(&desc->wait_for_threads); | 781 | wake_up(&desc->wait_for_threads); |
774 | } | 782 | } |
775 | 783 | ||
784 | static void irq_thread_dtor(struct task_work *unused) | ||
785 | { | ||
786 | struct task_struct *tsk = current; | ||
787 | struct irq_desc *desc; | ||
788 | struct irqaction *action; | ||
789 | |||
790 | if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) | ||
791 | return; | ||
792 | |||
793 | action = kthread_data(tsk); | ||
794 | |||
795 | pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
796 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | ||
797 | |||
798 | |||
799 | desc = irq_to_desc(action->irq); | ||
800 | /* | ||
801 | * If IRQTF_RUNTHREAD is set, we need to decrement | ||
802 | * desc->threads_active and wake possible waiters. | ||
803 | */ | ||
804 | if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
805 | wake_threads_waitq(desc); | ||
806 | |||
807 | /* Prevent a stale desc->threads_oneshot */ | ||
808 | irq_finalize_oneshot(desc, action); | ||
809 | } | ||
810 | |||
776 | /* | 811 | /* |
777 | * Interrupt handler thread | 812 | * Interrupt handler thread |
778 | */ | 813 | */ |
779 | static int irq_thread(void *data) | 814 | static int irq_thread(void *data) |
780 | { | 815 | { |
816 | struct task_work on_exit_work; | ||
781 | static const struct sched_param param = { | 817 | static const struct sched_param param = { |
782 | .sched_priority = MAX_USER_RT_PRIO/2, | 818 | .sched_priority = MAX_USER_RT_PRIO/2, |
783 | }; | 819 | }; |
@@ -793,7 +829,9 @@ static int irq_thread(void *data) | |||
793 | handler_fn = irq_thread_fn; | 829 | handler_fn = irq_thread_fn; |
794 | 830 | ||
795 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 831 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
796 | current->irq_thread = 1; | 832 | |
833 | init_task_work(&on_exit_work, irq_thread_dtor, NULL); | ||
834 | task_work_add(current, &on_exit_work, false); | ||
797 | 835 | ||
798 | while (!irq_wait_for_interrupt(action)) { | 836 | while (!irq_wait_for_interrupt(action)) { |
799 | irqreturn_t action_ret; | 837 | irqreturn_t action_ret; |
@@ -815,45 +853,11 @@ static int irq_thread(void *data) | |||
815 | * cannot touch the oneshot mask at this point anymore as | 853 | * cannot touch the oneshot mask at this point anymore as |
816 | * __setup_irq() might have given out currents thread_mask | 854 | * __setup_irq() might have given out currents thread_mask |
817 | * again. | 855 | * again. |
818 | * | ||
819 | * Clear irq_thread. Otherwise exit_irq_thread() would make | ||
820 | * fuzz about an active irq thread going into nirvana. | ||
821 | */ | 856 | */ |
822 | current->irq_thread = 0; | 857 | task_work_cancel(current, irq_thread_dtor); |
823 | return 0; | 858 | return 0; |
824 | } | 859 | } |
825 | 860 | ||
826 | /* | ||
827 | * Called from do_exit() | ||
828 | */ | ||
829 | void exit_irq_thread(void) | ||
830 | { | ||
831 | struct task_struct *tsk = current; | ||
832 | struct irq_desc *desc; | ||
833 | struct irqaction *action; | ||
834 | |||
835 | if (!tsk->irq_thread) | ||
836 | return; | ||
837 | |||
838 | action = kthread_data(tsk); | ||
839 | |||
840 | printk(KERN_ERR | ||
841 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
842 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | ||
843 | |||
844 | desc = irq_to_desc(action->irq); | ||
845 | |||
846 | /* | ||
847 | * If IRQTF_RUNTHREAD is set, we need to decrement | ||
848 | * desc->threads_active and wake possible waiters. | ||
849 | */ | ||
850 | if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
851 | wake_threads_waitq(desc); | ||
852 | |||
853 | /* Prevent a stale desc->threads_oneshot */ | ||
854 | irq_finalize_oneshot(desc, action); | ||
855 | } | ||
856 | |||
857 | static void irq_setup_forced_threading(struct irqaction *new) | 861 | static void irq_setup_forced_threading(struct irqaction *new) |
858 | { | 862 | { |
859 | if (!force_irqthreads) | 863 | if (!force_irqthreads) |
@@ -878,7 +882,6 @@ static int | |||
878 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 882 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
879 | { | 883 | { |
880 | struct irqaction *old, **old_ptr; | 884 | struct irqaction *old, **old_ptr; |
881 | const char *old_name = NULL; | ||
882 | unsigned long flags, thread_mask = 0; | 885 | unsigned long flags, thread_mask = 0; |
883 | int ret, nested, shared = 0; | 886 | int ret, nested, shared = 0; |
884 | cpumask_var_t mask; | 887 | cpumask_var_t mask; |
@@ -972,10 +975,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
972 | */ | 975 | */ |
973 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 976 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
974 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || | 977 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
975 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | 978 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) |
976 | old_name = old->name; | ||
977 | goto mismatch; | 979 | goto mismatch; |
978 | } | ||
979 | 980 | ||
980 | /* All handlers must agree on per-cpuness */ | 981 | /* All handlers must agree on per-cpuness */ |
981 | if ((old->flags & IRQF_PERCPU) != | 982 | if ((old->flags & IRQF_PERCPU) != |
@@ -1031,6 +1032,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1031 | * all existing action->thread_mask bits. | 1032 | * all existing action->thread_mask bits. |
1032 | */ | 1033 | */ |
1033 | new->thread_mask = 1 << ffz(thread_mask); | 1034 | new->thread_mask = 1 << ffz(thread_mask); |
1035 | |||
1036 | } else if (new->handler == irq_default_primary_handler) { | ||
1037 | /* | ||
1038 | * The interrupt was requested with handler = NULL, so | ||
1039 | * we use the default primary handler for it. But it | ||
1040 | * does not have the oneshot flag set. In combination | ||
1041 | * with level interrupts this is deadly, because the | ||
1042 | * default primary handler just wakes the thread, then | ||
1043 | * the irq lines is reenabled, but the device still | ||
1044 | * has the level irq asserted. Rinse and repeat.... | ||
1045 | * | ||
1046 | * While this works for edge type interrupts, we play | ||
1047 | * it safe and reject unconditionally because we can't | ||
1048 | * say for sure which type this interrupt really | ||
1049 | * has. The type flags are unreliable as the | ||
1050 | * underlying chip implementation can override them. | ||
1051 | */ | ||
1052 | pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", | ||
1053 | irq); | ||
1054 | ret = -EINVAL; | ||
1055 | goto out_mask; | ||
1034 | } | 1056 | } |
1035 | 1057 | ||
1036 | if (!shared) { | 1058 | if (!shared) { |
@@ -1078,7 +1100,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1078 | 1100 | ||
1079 | if (nmsk != omsk) | 1101 | if (nmsk != omsk) |
1080 | /* hope the handler works with current trigger mode */ | 1102 | /* hope the handler works with current trigger mode */ |
1081 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", | 1103 | pr_warning("irq %d uses trigger mode %u; requested %u\n", |
1082 | irq, nmsk, omsk); | 1104 | irq, nmsk, omsk); |
1083 | } | 1105 | } |
1084 | 1106 | ||
@@ -1115,14 +1137,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1115 | return 0; | 1137 | return 0; |
1116 | 1138 | ||
1117 | mismatch: | 1139 | mismatch: |
1118 | #ifdef CONFIG_DEBUG_SHIRQ | ||
1119 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 1140 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
1120 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 1141 | pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", |
1121 | if (old_name) | 1142 | irq, new->flags, new->name, old->flags, old->name); |
1122 | printk(KERN_ERR "current handler: %s\n", old_name); | 1143 | #ifdef CONFIG_DEBUG_SHIRQ |
1123 | dump_stack(); | 1144 | dump_stack(); |
1124 | } | ||
1125 | #endif | 1145 | #endif |
1146 | } | ||
1126 | ret = -EBUSY; | 1147 | ret = -EBUSY; |
1127 | 1148 | ||
1128 | out_mask: | 1149 | out_mask: |
@@ -1204,12 +1225,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1204 | /* Found it - now remove it from the list of entries: */ | 1225 | /* Found it - now remove it from the list of entries: */ |
1205 | *action_ptr = action->next; | 1226 | *action_ptr = action->next; |
1206 | 1227 | ||
1207 | /* Currently used only by UML, might disappear one day: */ | ||
1208 | #ifdef CONFIG_IRQ_RELEASE_METHOD | ||
1209 | if (desc->irq_data.chip->release) | ||
1210 | desc->irq_data.chip->release(irq, dev_id); | ||
1211 | #endif | ||
1212 | |||
1213 | /* If this was the last handler, shut down the IRQ line: */ | 1228 | /* If this was the last handler, shut down the IRQ line: */ |
1214 | if (!desc->action) | 1229 | if (!desc->action) |
1215 | irq_shutdown(desc); | 1230 | irq_shutdown(desc); |
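The manage.c changes replace the per-task irq_thread flag and the do_exit() hook exit_irq_thread() with a task_work item (irq_thread_dtor) that runs automatically if the irq thread dies, and is cancelled on the normal return path. The sketch below applies the same pattern to an ordinary kthread, using the task_work calls exactly as this patch uses them; my_thread_fn and my_dtor are illustrative names.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/task_work.h>

/* Exit callback: fires from the task_work machinery if the thread dies
 * without reaching its normal return path. */
static void my_dtor(struct task_work *unused)
{
        pr_err("kthread \"%s\" (%d) exited with cleanup outstanding\n",
               current->comm, current->pid);
}

static int my_thread_fn(void *data)
{
        struct task_work on_exit_work;

        init_task_work(&on_exit_work, my_dtor, NULL);
        task_work_add(current, &on_exit_work, false);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        __set_current_state(TASK_RUNNING);

        /* Normal exit: make sure the destructor never runs. */
        task_work_cancel(current, my_dtor);
        return 0;
}

static struct task_struct *my_task;

static int __init my_thread_init(void)
{
        my_task = kthread_run(my_thread_fn, NULL, "my-worker");
        return IS_ERR(my_task) ? PTR_ERR(my_task) : 0;
}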
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) | |||
42 | * For correct operation this depends on the caller | 42 | * For correct operation this depends on the caller |
43 | * masking the irqs. | 43 | * masking the irqs. |
44 | */ | 44 | */ |
45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) |
46 | < nr_cpu_ids)) { | 46 | irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); |
47 | int ret = chip->irq_set_affinity(&desc->irq_data, | ||
48 | desc->pending_mask, false); | ||
49 | switch (ret) { | ||
50 | case IRQ_SET_MASK_OK: | ||
51 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | ||
52 | case IRQ_SET_MASK_OK_NOCOPY: | ||
53 | irq_set_thread_affinity(desc); | ||
54 | } | ||
55 | } | ||
56 | 47 | ||
57 | cpumask_clear(desc->pending_mask); | 48 | cpumask_clear(desc->pending_mask); |
58 | } | 49 | } |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a6..cb228bf21760 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void) | |||
103 | int irq; | 103 | int irq; |
104 | 104 | ||
105 | for_each_irq_desc(irq, desc) { | 105 | for_each_irq_desc(irq, desc) { |
106 | /* | ||
107 | * Only interrupts which are marked as wakeup source | ||
108 | * and have not been disabled before the suspend check | ||
109 | * can abort suspend. | ||
110 | */ | ||
106 | if (irqd_is_wakeup_set(&desc->irq_data)) { | 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
107 | if (desc->istate & IRQS_PENDING) | 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) |
108 | return -EBUSY; | 113 | return -EBUSY; |
109 | continue; | 114 | continue; |
110 | } | 115 | } |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c9..6454db7b6a4d 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
58 | /* | 58 | /* |
59 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
60 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
61 | * active. | 61 | * active. Clear the pending bit so suspend/resume does not |
62 | * get confused. | ||
62 | */ | 63 | */ |
63 | if (irq_settings_is_level(desc)) | 64 | if (irq_settings_is_level(desc)) { |
65 | desc->istate &= ~IRQS_PENDING; | ||
64 | return; | 66 | return; |
67 | } | ||
65 | if (desc->istate & IRQS_REPLAY) | 68 | if (desc->istate & IRQS_REPLAY) |
66 | return; | 69 | return; |
67 | if (desc->istate & IRQS_PENDING) { | 70 | if (desc->istate & IRQS_PENDING) { |
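Taken together, the chip.c, pm.c and resend.c hunks make a disabled interrupt that fires during suspend visible to the wakeup path: the flow handlers set IRQS_PENDING, check_wakeup_irqs() aborts suspend when a wakeup interrupt at disable depth 1 is pending, and check_irq_resend() clears the bit for level interrupts so it cannot go stale. The standalone toy below models that flow; the struct fields only loosely mirror irq_desc and are not kernel code.

#include <stdio.h>
#include <stdbool.h>

struct toy_desc {
        bool has_action;    /* a handler is installed */
        bool disabled;      /* irqd_irq_disabled() */
        bool wakeup;        /* irqd_is_wakeup_set() */
        bool pending;       /* IRQS_PENDING */
        unsigned int depth; /* disable depth */
};

/* Flow handler: cannot run the handler, so remember the event. */
static void toy_handle_irq(struct toy_desc *d)
{
        if (!d->has_action || d->disabled) {
                d->pending = true;
                return;
        }
        /* ...handle_irq_event()... */
}

/* Suspend check: a pending wakeup interrupt (depth 1) aborts suspend. */
static int toy_check_wakeup_irqs(const struct toy_desc *descs, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (descs[i].wakeup && descs[i].depth == 1 && descs[i].pending)
                        return -1;      /* -EBUSY */
        return 0;
}

int main(void)
{
        struct toy_desc wake_irq = {
                .has_action = true, .disabled = true,
                .wakeup = true, .depth = 1,
        };

        toy_handle_irq(&wake_irq);      /* fires while devices are suspending */
        printf("suspend %s\n",
               toy_check_wakeup_irqs(&wake_irq, 1) ? "aborted" : "continues");
        return 0;
}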
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 079f1d39a8b8..2169feeba529 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | |||
343 | 343 | ||
344 | /* Look up a kernel symbol and return it in a text buffer. */ | 344 | /* Look up a kernel symbol and return it in a text buffer. */ |
345 | static int __sprint_symbol(char *buffer, unsigned long address, | 345 | static int __sprint_symbol(char *buffer, unsigned long address, |
346 | int symbol_offset) | 346 | int symbol_offset, int add_offset) |
347 | { | 347 | { |
348 | char *modname; | 348 | char *modname; |
349 | const char *name; | 349 | const char *name; |
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
358 | if (name != buffer) | 358 | if (name != buffer) |
359 | strcpy(buffer, name); | 359 | strcpy(buffer, name); |
360 | len = strlen(buffer); | 360 | len = strlen(buffer); |
361 | buffer += len; | ||
362 | offset -= symbol_offset; | 361 | offset -= symbol_offset; |
363 | 362 | ||
363 | if (add_offset) | ||
364 | len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); | ||
365 | |||
364 | if (modname) | 366 | if (modname) |
365 | len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); | 367 | len += sprintf(buffer + len, " [%s]", modname); |
366 | else | ||
367 | len += sprintf(buffer, "+%#lx/%#lx", offset, size); | ||
368 | 368 | ||
369 | return len; | 369 | return len; |
370 | } | 370 | } |
@@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
382 | */ | 382 | */ |
383 | int sprint_symbol(char *buffer, unsigned long address) | 383 | int sprint_symbol(char *buffer, unsigned long address) |
384 | { | 384 | { |
385 | return __sprint_symbol(buffer, address, 0); | 385 | return __sprint_symbol(buffer, address, 0, 1); |
386 | } | 386 | } |
387 | |||
388 | EXPORT_SYMBOL_GPL(sprint_symbol); | 387 | EXPORT_SYMBOL_GPL(sprint_symbol); |
389 | 388 | ||
390 | /** | 389 | /** |
390 | * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer | ||
391 | * @buffer: buffer to be stored | ||
392 | * @address: address to lookup | ||
393 | * | ||
394 | * This function looks up a kernel symbol with @address and stores its name | ||
395 | * and module name to @buffer if possible. If no symbol was found, just saves | ||
396 | * its @address as is. | ||
397 | * | ||
398 | * This function returns the number of bytes stored in @buffer. | ||
399 | */ | ||
400 | int sprint_symbol_no_offset(char *buffer, unsigned long address) | ||
401 | { | ||
402 | return __sprint_symbol(buffer, address, 0, 0); | ||
403 | } | ||
404 | EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); | ||
405 | |||
406 | /** | ||
391 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer | 407 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer |
392 | * @buffer: buffer to be stored | 408 | * @buffer: buffer to be stored |
393 | * @address: address to lookup | 409 | * @address: address to lookup |
@@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol); | |||
403 | */ | 419 | */ |
404 | int sprint_backtrace(char *buffer, unsigned long address) | 420 | int sprint_backtrace(char *buffer, unsigned long address) |
405 | { | 421 | { |
406 | return __sprint_symbol(buffer, address, -1); | 422 | return __sprint_symbol(buffer, address, -1, 1); |
407 | } | 423 | } |
408 | 424 | ||
409 | /* Look up a kernel symbol and print it to the kernel messages. */ | 425 | /* Look up a kernel symbol and print it to the kernel messages. */ |
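kallsyms.c refactors __sprint_symbol() to take an add_offset flag: sprint_symbol() keeps the "name+0xoff/0xsize [module]" form, while the new sprint_symbol_no_offset() drops the offset/size part. A small in-kernel sketch comparing the two follows; the KSYM_SYMBOL_LEN buffer size is assumed from <linux/kallsyms.h>, and the address passed in is arbitrary.

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void show_symbol_formats(unsigned long addr)
{
        char with_offset[KSYM_SYMBOL_LEN];
        char name_only[KSYM_SYMBOL_LEN];

        sprint_symbol(with_offset, addr);          /* "func+0x1c/0x2a0 [mod]" */
        sprint_symbol_no_offset(name_only, addr);  /* "func [mod]"            */

        pr_info("%#lx -> '%s' vs '%s'\n", addr, with_offset, name_only);
}

/* Example: show_symbol_formats((unsigned long)schedule); */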
diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c | |||
@@ -0,0 +1,196 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/syscalls.h> | ||
3 | #include <linux/fdtable.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <linux/random.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/cache.h> | ||
10 | #include <linux/bug.h> | ||
11 | #include <linux/err.h> | ||
12 | #include <linux/kcmp.h> | ||
13 | |||
14 | #include <asm/unistd.h> | ||
15 | |||
16 | /* | ||
17 | * We don't expose the real in-memory order of objects for security reasons. | ||
18 | * But still the comparison results should be suitable for sorting. So we | ||
19 | * obfuscate kernel pointers values and compare the production instead. | ||
20 | * | ||
21 | * The obfuscation is done in two steps. First we xor the kernel pointer with | ||
22 | * a random value, which puts pointer into a new position in a reordered space. | ||
23 | * Secondly we multiply the xor production with a large odd random number to | ||
24 | * permute its bits even more (the odd multiplier guarantees that the product | ||
25 | * is unique even after the high bits are truncated, since any odd number is | ||
26 | * relatively prime to 2^n). | ||
27 | * | ||
28 | * Note also that the obfuscation itself is invisible to userspace and if needed | ||
29 | * it can be changed to an alternate scheme. | ||
30 | */ | ||
31 | static unsigned long cookies[KCMP_TYPES][2] __read_mostly; | ||
32 | |||
33 | static long kptr_obfuscate(long v, int type) | ||
34 | { | ||
35 | return (v ^ cookies[type][0]) * cookies[type][1]; | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * 0 - equal, i.e. v1 = v2 | ||
40 | * 1 - less than, i.e. v1 < v2 | ||
41 | * 2 - greater than, i.e. v1 > v2 | ||
42 | * 3 - not equal but ordering unavailable (reserved for future) | ||
43 | */ | ||
44 | static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) | ||
45 | { | ||
46 | long ret; | ||
47 | |||
48 | ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); | ||
49 | |||
50 | return (ret < 0) | ((ret > 0) << 1); | ||
51 | } | ||
52 | |||
53 | /* The caller must have pinned the task */ | ||
54 | static struct file * | ||
55 | get_file_raw_ptr(struct task_struct *task, unsigned int idx) | ||
56 | { | ||
57 | struct file *file = NULL; | ||
58 | |||
59 | task_lock(task); | ||
60 | rcu_read_lock(); | ||
61 | |||
62 | if (task->files) | ||
63 | file = fcheck_files(task->files, idx); | ||
64 | |||
65 | rcu_read_unlock(); | ||
66 | task_unlock(task); | ||
67 | |||
68 | return file; | ||
69 | } | ||
70 | |||
71 | static void kcmp_unlock(struct mutex *m1, struct mutex *m2) | ||
72 | { | ||
73 | if (likely(m2 != m1)) | ||
74 | mutex_unlock(m2); | ||
75 | mutex_unlock(m1); | ||
76 | } | ||
77 | |||
78 | static int kcmp_lock(struct mutex *m1, struct mutex *m2) | ||
79 | { | ||
80 | int err; | ||
81 | |||
82 | if (m2 > m1) | ||
83 | swap(m1, m2); | ||
84 | |||
85 | err = mutex_lock_killable(m1); | ||
86 | if (!err && likely(m1 != m2)) { | ||
87 | err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); | ||
88 | if (err) | ||
89 | mutex_unlock(m1); | ||
90 | } | ||
91 | |||
92 | return err; | ||
93 | } | ||
94 | |||
95 | SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, | ||
96 | unsigned long, idx1, unsigned long, idx2) | ||
97 | { | ||
98 | struct task_struct *task1, *task2; | ||
99 | int ret; | ||
100 | |||
101 | rcu_read_lock(); | ||
102 | |||
103 | /* | ||
104 | * Tasks are looked up in caller's PID namespace only. | ||
105 | */ | ||
106 | task1 = find_task_by_vpid(pid1); | ||
107 | task2 = find_task_by_vpid(pid2); | ||
108 | if (!task1 || !task2) | ||
109 | goto err_no_task; | ||
110 | |||
111 | get_task_struct(task1); | ||
112 | get_task_struct(task2); | ||
113 | |||
114 | rcu_read_unlock(); | ||
115 | |||
116 | /* | ||
117 | * One should have enough rights to inspect task details. | ||
118 | */ | ||
119 | ret = kcmp_lock(&task1->signal->cred_guard_mutex, | ||
120 | &task2->signal->cred_guard_mutex); | ||
121 | if (ret) | ||
122 | goto err; | ||
123 | if (!ptrace_may_access(task1, PTRACE_MODE_READ) || | ||
124 | !ptrace_may_access(task2, PTRACE_MODE_READ)) { | ||
125 | ret = -EPERM; | ||
126 | goto err_unlock; | ||
127 | } | ||
128 | |||
129 | switch (type) { | ||
130 | case KCMP_FILE: { | ||
131 | struct file *filp1, *filp2; | ||
132 | |||
133 | filp1 = get_file_raw_ptr(task1, idx1); | ||
134 | filp2 = get_file_raw_ptr(task2, idx2); | ||
135 | |||
136 | if (filp1 && filp2) | ||
137 | ret = kcmp_ptr(filp1, filp2, KCMP_FILE); | ||
138 | else | ||
139 | ret = -EBADF; | ||
140 | break; | ||
141 | } | ||
142 | case KCMP_VM: | ||
143 | ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); | ||
144 | break; | ||
145 | case KCMP_FILES: | ||
146 | ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); | ||
147 | break; | ||
148 | case KCMP_FS: | ||
149 | ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); | ||
150 | break; | ||
151 | case KCMP_SIGHAND: | ||
152 | ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); | ||
153 | break; | ||
154 | case KCMP_IO: | ||
155 | ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); | ||
156 | break; | ||
157 | case KCMP_SYSVSEM: | ||
158 | #ifdef CONFIG_SYSVIPC | ||
159 | ret = kcmp_ptr(task1->sysvsem.undo_list, | ||
160 | task2->sysvsem.undo_list, | ||
161 | KCMP_SYSVSEM); | ||
162 | #else | ||
163 | ret = -EOPNOTSUPP; | ||
164 | #endif | ||
165 | break; | ||
166 | default: | ||
167 | ret = -EINVAL; | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | err_unlock: | ||
172 | kcmp_unlock(&task1->signal->cred_guard_mutex, | ||
173 | &task2->signal->cred_guard_mutex); | ||
174 | err: | ||
175 | put_task_struct(task1); | ||
176 | put_task_struct(task2); | ||
177 | |||
178 | return ret; | ||
179 | |||
180 | err_no_task: | ||
181 | rcu_read_unlock(); | ||
182 | return -ESRCH; | ||
183 | } | ||
184 | |||
185 | static __init int kcmp_cookies_init(void) | ||
186 | { | ||
187 | int i; | ||
188 | |||
189 | get_random_bytes(cookies, sizeof(cookies)); | ||
190 | |||
191 | for (i = 0; i < KCMP_TYPES; i++) | ||
192 | cookies[i][1] |= (~(~0UL >> 1) | 1); | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | arch_initcall(kcmp_cookies_init); | ||
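The new kcmp(2) syscall lets checkpoint/restore tooling ask whether two tasks share a kernel object (file, mm, fd table, ...) without exposing kernel pointers. A minimal userspace sketch follows; it assumes __NR_kcmp and <linux/kcmp.h> are available from the installed headers (in this series the syscall is wired up for x86 under CONFIG_CHECKPOINT_RESTORE only). After fork(), fd 0 refers to the same struct file (result 0) while the mm is a copy (result 1 or 2).

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/kcmp.h>         /* KCMP_FILE, KCMP_VM, ... */

/* Assumes __NR_kcmp is defined by the installed kernel headers. */
static long sys_kcmp(pid_t pid1, pid_t pid2, int type,
                     unsigned long idx1, unsigned long idx2)
{
        return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
}

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                pause();        /* keep the child around for the comparison */
                _exit(0);
        }

        /* 0: same object, 1/2: ordered "less"/"greater", <0: error */
        printf("fd 0 shared with child: %ld\n",
               sys_kcmp(getpid(), child, KCMP_FILE, 0, 0));
        printf("mm shared with child:   %ld\n",
               sys_kcmp(getpid(), child, KCMP_VM, 0, 0));

        kill(child, SIGKILL);
        waitpid(child, NULL, 0);
        return 0;
}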
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index c744b88c44e2..59dcf5b81d24 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize) | |||
402 | return max; | 402 | return max; |
403 | return len; | 403 | return len; |
404 | } | 404 | } |
405 | EXPORT_SYMBOL(__kfifo_max_r); | ||
405 | 406 | ||
406 | #define __KFIFO_PEEK(data, out, mask) \ | 407 | #define __KFIFO_PEEK(data, out, mask) \ |
407 | ((data)[(out) & (mask)]) | 408 | ((data)[(out) & (mask)]) |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 05698a7415fe..ff2c7cb86d77 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -221,13 +221,12 @@ fail: | |||
221 | return 0; | 221 | return 0; |
222 | } | 222 | } |
223 | 223 | ||
224 | void call_usermodehelper_freeinfo(struct subprocess_info *info) | 224 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) |
225 | { | 225 | { |
226 | if (info->cleanup) | 226 | if (info->cleanup) |
227 | (*info->cleanup)(info); | 227 | (*info->cleanup)(info); |
228 | kfree(info); | 228 | kfree(info); |
229 | } | 229 | } |
230 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); | ||
231 | 230 | ||
232 | static void umh_complete(struct subprocess_info *sub_info) | 231 | static void umh_complete(struct subprocess_info *sub_info) |
233 | { | 232 | { |
@@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); | |||
410 | 409 | ||
411 | /** | 410 | /** |
412 | * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. | 411 | * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. |
413 | * depth: New value to assign to usermodehelper_disabled. | 412 | * @depth: New value to assign to usermodehelper_disabled. |
414 | * | 413 | * |
415 | * Change the value of usermodehelper_disabled (under umhelper_sem locked for | 414 | * Change the value of usermodehelper_disabled (under umhelper_sem locked for |
416 | * writing) and wakeup tasks waiting for it to change. | 415 | * writing) and wakeup tasks waiting for it to change. |
@@ -479,6 +478,7 @@ static void helper_unlock(void) | |||
479 | * structure. This should be passed to call_usermodehelper_exec to | 478 | * structure. This should be passed to call_usermodehelper_exec to |
480 | * exec the process and free the structure. | 479 | * exec the process and free the structure. |
481 | */ | 480 | */ |
481 | static | ||
482 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | 482 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, |
483 | char **envp, gfp_t gfp_mask) | 483 | char **envp, gfp_t gfp_mask) |
484 | { | 484 | { |
@@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | |||
494 | out: | 494 | out: |
495 | return sub_info; | 495 | return sub_info; |
496 | } | 496 | } |
497 | EXPORT_SYMBOL(call_usermodehelper_setup); | ||
498 | 497 | ||
499 | /** | 498 | /** |
500 | * call_usermodehelper_setfns - set a cleanup/init function | 499 | * call_usermodehelper_setfns - set a cleanup/init function |
@@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); | |||
512 | * Function must be runnable in either a process context or the | 511 | * Function must be runnable in either a process context or the |
513 | * context in which call_usermodehelper_exec is called. | 512 | * context in which call_usermodehelper_exec is called. |
514 | */ | 513 | */ |
514 | static | ||
515 | void call_usermodehelper_setfns(struct subprocess_info *info, | 515 | void call_usermodehelper_setfns(struct subprocess_info *info, |
516 | int (*init)(struct subprocess_info *info, struct cred *new), | 516 | int (*init)(struct subprocess_info *info, struct cred *new), |
517 | void (*cleanup)(struct subprocess_info *info), | 517 | void (*cleanup)(struct subprocess_info *info), |
@@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, | |||
521 | info->init = init; | 521 | info->init = init; |
522 | info->data = data; | 522 | info->data = data; |
523 | } | 523 | } |
524 | EXPORT_SYMBOL(call_usermodehelper_setfns); | ||
525 | 524 | ||
526 | /** | 525 | /** |
527 | * call_usermodehelper_exec - start a usermode application | 526 | * call_usermodehelper_exec - start a usermode application |
@@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); | |||
535 | * asynchronously if wait is not set, and runs as a child of keventd. | 534 | * asynchronously if wait is not set, and runs as a child of keventd. |
536 | * (ie. it runs with full root capabilities). | 535 | * (ie. it runs with full root capabilities). |
537 | */ | 536 | */ |
537 | static | ||
538 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | 538 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) |
539 | { | 539 | { |
540 | DECLARE_COMPLETION_ONSTACK(done); | 540 | DECLARE_COMPLETION_ONSTACK(done); |
@@ -576,7 +576,25 @@ unlock: | |||
576 | helper_unlock(); | 576 | helper_unlock(); |
577 | return retval; | 577 | return retval; |
578 | } | 578 | } |
579 | EXPORT_SYMBOL(call_usermodehelper_exec); | 579 | |
580 | int call_usermodehelper_fns( | ||
581 | char *path, char **argv, char **envp, int wait, | ||
582 | int (*init)(struct subprocess_info *info, struct cred *new), | ||
583 | void (*cleanup)(struct subprocess_info *), void *data) | ||
584 | { | ||
585 | struct subprocess_info *info; | ||
586 | gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; | ||
587 | |||
588 | info = call_usermodehelper_setup(path, argv, envp, gfp_mask); | ||
589 | |||
590 | if (info == NULL) | ||
591 | return -ENOMEM; | ||
592 | |||
593 | call_usermodehelper_setfns(info, init, cleanup, data); | ||
594 | |||
595 | return call_usermodehelper_exec(info, wait); | ||
596 | } | ||
597 | EXPORT_SYMBOL(call_usermodehelper_fns); | ||
580 | 598 | ||
581 | static int proc_cap_handler(struct ctl_table *table, int write, | 599 | static int proc_cap_handler(struct ctl_table *table, int write, |
582 | void __user *buffer, size_t *lenp, loff_t *ppos) | 600 | void __user *buffer, size_t *lenp, loff_t *ppos) |
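[Editor's sketch, not part of the series] With call_usermodehelper_setup(), _setfns() and _exec() made static, call_usermodehelper_fns() becomes the single exported entry point. A minimal caller might look like the following; the helper path, arguments and function name are illustrative only:

	#include <linux/kmod.h>

	/* Hypothetical caller: run a user-space helper and wait for it to start. */
	static int run_example_helper(void)
	{
		char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
		char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

		/* No init/cleanup callbacks and no private data are needed here. */
		return call_usermodehelper_fns("/sbin/example-helper", argv, envp,
					       UMH_WAIT_EXEC, NULL, NULL, NULL);
	}

Passing UMH_WAIT_PROC instead would make the call wait for the helper to exit rather than just to exec.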
diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 000000000000..6535a667a5a7 --- /dev/null +++ b/kernel/lglock.c | |||
@@ -0,0 +1,89 @@ | |||
1 | /* See include/linux/lglock.h for description */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/lglock.h> | ||
4 | #include <linux/cpu.h> | ||
5 | #include <linux/string.h> | ||
6 | |||
7 | /* | ||
8 | * Note there is no uninit, so lglocks cannot be defined in | ||
9 | * modules (but it's fine to use them from there). | ||
10 | * It could be added, though; just undo lg_lock_init. | ||
Wait, formatting note removed.
11 | */ | ||
12 | |||
13 | void lg_lock_init(struct lglock *lg, char *name) | ||
14 | { | ||
15 | LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); | ||
16 | } | ||
17 | EXPORT_SYMBOL(lg_lock_init); | ||
18 | |||
19 | void lg_local_lock(struct lglock *lg) | ||
20 | { | ||
21 | arch_spinlock_t *lock; | ||
22 | |||
23 | preempt_disable(); | ||
24 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
25 | lock = this_cpu_ptr(lg->lock); | ||
26 | arch_spin_lock(lock); | ||
27 | } | ||
28 | EXPORT_SYMBOL(lg_local_lock); | ||
29 | |||
30 | void lg_local_unlock(struct lglock *lg) | ||
31 | { | ||
32 | arch_spinlock_t *lock; | ||
33 | |||
34 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
35 | lock = this_cpu_ptr(lg->lock); | ||
36 | arch_spin_unlock(lock); | ||
37 | preempt_enable(); | ||
38 | } | ||
39 | EXPORT_SYMBOL(lg_local_unlock); | ||
40 | |||
41 | void lg_local_lock_cpu(struct lglock *lg, int cpu) | ||
42 | { | ||
43 | arch_spinlock_t *lock; | ||
44 | |||
45 | preempt_disable(); | ||
46 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
47 | lock = per_cpu_ptr(lg->lock, cpu); | ||
48 | arch_spin_lock(lock); | ||
49 | } | ||
50 | EXPORT_SYMBOL(lg_local_lock_cpu); | ||
51 | |||
52 | void lg_local_unlock_cpu(struct lglock *lg, int cpu) | ||
53 | { | ||
54 | arch_spinlock_t *lock; | ||
55 | |||
56 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
57 | lock = per_cpu_ptr(lg->lock, cpu); | ||
58 | arch_spin_unlock(lock); | ||
59 | preempt_enable(); | ||
60 | } | ||
61 | EXPORT_SYMBOL(lg_local_unlock_cpu); | ||
62 | |||
63 | void lg_global_lock(struct lglock *lg) | ||
64 | { | ||
65 | int i; | ||
66 | |||
67 | preempt_disable(); | ||
68 | rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
69 | for_each_possible_cpu(i) { | ||
70 | arch_spinlock_t *lock; | ||
71 | lock = per_cpu_ptr(lg->lock, i); | ||
72 | arch_spin_lock(lock); | ||
73 | } | ||
74 | } | ||
75 | EXPORT_SYMBOL(lg_global_lock); | ||
76 | |||
77 | void lg_global_unlock(struct lglock *lg) | ||
78 | { | ||
79 | int i; | ||
80 | |||
81 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
82 | for_each_possible_cpu(i) { | ||
83 | arch_spinlock_t *lock; | ||
84 | lock = per_cpu_ptr(lg->lock, i); | ||
85 | arch_spin_unlock(lock); | ||
86 | } | ||
87 | preempt_enable(); | ||
88 | } | ||
89 | EXPORT_SYMBOL(lg_global_unlock); | ||
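[Editor's sketch, not part of the series] For context, this is how the new lglock functions are meant to be consumed. DEFINE_LGLOCK() is assumed to be provided by <linux/lglock.h> alongside these functions; the lock name and the code touching it are made up:

	#include <linux/init.h>
	#include <linux/lglock.h>

	DEFINE_LGLOCK(example_lglock);

	static int __init example_init(void)
	{
		/* Hook the lock up to lockdep before first use. */
		lg_lock_init(&example_lglock, "example_lglock");
		return 0;
	}

	static void example_touch_local(void)
	{
		/* Fast path: only this CPU's spinlock is taken, no cache-line bouncing. */
		lg_local_lock(&example_lglock);
		/* ... modify this CPU's share of the protected data ... */
		lg_local_unlock(&example_lglock);
	}

	static void example_walk_all(void)
	{
		/* Slow path: takes every CPU's lock, excluding all local holders. */
		lg_global_lock(&example_lglock);
		/* ... traverse the data of all CPUs ... */
		lg_global_unlock(&example_lglock);
	}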
diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e425..4edbd9c11aca 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info, | |||
2429 | goto free_hdr; | 2429 | goto free_hdr; |
2430 | } | 2430 | } |
2431 | 2431 | ||
2432 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { | 2432 | if (hdr->e_shoff >= len || |
2433 | hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { | ||
2433 | err = -ENOEXEC; | 2434 | err = -ENOEXEC; |
2434 | goto free_hdr; | 2435 | goto free_hdr; |
2435 | } | 2436 | } |
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod, | |||
2953 | 2954 | ||
2954 | /* Module is ready to execute: parsing args may do that. */ | 2955 | /* Module is ready to execute: parsing args may do that. */ |
2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 2956 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
2956 | -32768, 32767, NULL); | 2957 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
2957 | if (err < 0) | 2958 | if (err < 0) |
2958 | goto unlink; | 2959 | goto unlink; |
2959 | 2960 | ||
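[Editor's note] The reworked section-header check avoids computing hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr), whose wrap-around could let a malformed image pass the old comparison. The same pattern in isolation, as a hedged sketch with made-up names (the division below also guards the multiplication, which the kernel code does not need because e_shnum is a 16-bit field):

	#include <linux/types.h>

	/*
	 * Illustrative only: does a table of 'count' entries of 'entry_size'
	 * bytes, starting at 'offset', fit in a buffer of 'len' bytes?  No sum
	 * or product of untrusted values is formed, so nothing can overflow.
	 * entry_size is assumed non-zero (a sizeof() in practice).
	 */
	static bool table_fits(size_t len, size_t offset, size_t count, size_t entry_size)
	{
		if (offset >= len)
			return false;
		return count <= (len - offset) / entry_size;
	}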
diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a175d79..d2a5f4ecc6dd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #define PANIC_TIMER_STEP 100 | 27 | #define PANIC_TIMER_STEP 100 |
28 | #define PANIC_BLINK_SPD 18 | 28 | #define PANIC_BLINK_SPD 18 |
29 | 29 | ||
30 | int panic_on_oops; | 30 | int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; |
31 | static unsigned long tainted_mask; | 31 | static unsigned long tainted_mask; |
32 | static int pause_on_oops; | 32 | static int pause_on_oops; |
33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...) | |||
108 | */ | 108 | */ |
109 | crash_kexec(NULL); | 109 | crash_kexec(NULL); |
110 | 110 | ||
111 | kmsg_dump(KMSG_DUMP_PANIC); | ||
112 | |||
113 | /* | 111 | /* |
114 | * Note smp_send_stop is the usual smp shutdown function, which | 112 | * Note smp_send_stop is the usual smp shutdown function, which |
115 | * unfortunately means it may not be hardened to work in a panic | 113 | * unfortunately means it may not be hardened to work in a panic |
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...) | |||
117 | */ | 115 | */ |
118 | smp_send_stop(); | 116 | smp_send_stop(); |
119 | 117 | ||
118 | kmsg_dump(KMSG_DUMP_PANIC); | ||
119 | |||
120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); | 120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
121 | 121 | ||
122 | bust_spinlocks(0); | 122 | bust_spinlocks(0); |
diff --git a/kernel/params.c b/kernel/params.c index f37d82631347..ed35345be536 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) | |||
85 | 85 | ||
86 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
87 | char *val, | 87 | char *val, |
88 | const char *doing, | ||
88 | const struct kernel_param *params, | 89 | const struct kernel_param *params, |
89 | unsigned num_params, | 90 | unsigned num_params, |
90 | s16 min_level, | 91 | s16 min_level, |
91 | s16 max_level, | 92 | s16 max_level, |
92 | int (*handle_unknown)(char *param, char *val)) | 93 | int (*handle_unknown)(char *param, char *val, |
94 | const char *doing)) | ||
93 | { | 95 | { |
94 | unsigned int i; | 96 | unsigned int i; |
95 | int err; | 97 | int err; |
@@ -104,8 +106,8 @@ static int parse_one(char *param, | |||
104 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && params[i].ops->set != param_set_bool |
105 | && params[i].ops->set != param_set_bint) | 107 | && params[i].ops->set != param_set_bint) |
106 | return -EINVAL; | 108 | return -EINVAL; |
107 | pr_debug("They are equal! Calling %p\n", | 109 | pr_debug("handling %s with %p\n", param, |
108 | params[i].ops->set); | 110 | params[i].ops->set); |
109 | mutex_lock(¶m_lock); | 111 | mutex_lock(¶m_lock); |
110 | err = params[i].ops->set(val, ¶ms[i]); | 112 | err = params[i].ops->set(val, ¶ms[i]); |
111 | mutex_unlock(¶m_lock); | 113 | mutex_unlock(¶m_lock); |
@@ -114,11 +116,11 @@ static int parse_one(char *param, | |||
114 | } | 116 | } |
115 | 117 | ||
116 | if (handle_unknown) { | 118 | if (handle_unknown) { |
117 | pr_debug("Unknown argument: calling %p\n", handle_unknown); | 119 | pr_debug("doing %s: %s='%s'\n", doing, param, val); |
118 | return handle_unknown(param, val); | 120 | return handle_unknown(param, val, doing); |
119 | } | 121 | } |
120 | 122 | ||
121 | pr_debug("Unknown argument `%s'\n", param); | 123 | pr_debug("Unknown argument '%s'\n", param); |
122 | return -ENOENT; | 124 | return -ENOENT; |
123 | } | 125 | } |
124 | 126 | ||
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) | |||
175 | } | 177 | } |
176 | 178 | ||
177 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
178 | int parse_args(const char *name, | 180 | int parse_args(const char *doing, |
179 | char *args, | 181 | char *args, |
180 | const struct kernel_param *params, | 182 | const struct kernel_param *params, |
181 | unsigned num, | 183 | unsigned num, |
182 | s16 min_level, | 184 | s16 min_level, |
183 | s16 max_level, | 185 | s16 max_level, |
184 | int (*unknown)(char *param, char *val)) | 186 | int (*unknown)(char *param, char *val, const char *doing)) |
185 | { | 187 | { |
186 | char *param, *val; | 188 | char *param, *val; |
187 | 189 | ||
188 | pr_debug("Parsing ARGS: %s\n", args); | ||
189 | |||
190 | /* Chew leading spaces */ | 190 | /* Chew leading spaces */ |
191 | args = skip_spaces(args); | 191 | args = skip_spaces(args); |
192 | 192 | ||
193 | if (*args) | ||
194 | pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); | ||
195 | |||
193 | while (*args) { | 196 | while (*args) { |
194 | int ret; | 197 | int ret; |
195 | int irq_was_disabled; | 198 | int irq_was_disabled; |
196 | 199 | ||
197 | args = next_arg(args, ¶m, &val); | 200 | args = next_arg(args, ¶m, &val); |
198 | irq_was_disabled = irqs_disabled(); | 201 | irq_was_disabled = irqs_disabled(); |
199 | ret = parse_one(param, val, params, num, | 202 | ret = parse_one(param, val, doing, params, num, |
200 | min_level, max_level, unknown); | 203 | min_level, max_level, unknown); |
201 | if (irq_was_disabled && !irqs_disabled()) { | 204 | if (irq_was_disabled && !irqs_disabled()) |
202 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | 205 | pr_warn("%s: option '%s' enabled irq's!\n", |
203 | "irq's!\n", param); | 206 | doing, param); |
204 | } | 207 | |
205 | switch (ret) { | 208 | switch (ret) { |
206 | case -ENOENT: | 209 | case -ENOENT: |
207 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | 210 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
208 | name, param); | ||
209 | return ret; | 211 | return ret; |
210 | case -ENOSPC: | 212 | case -ENOSPC: |
211 | printk(KERN_ERR | 213 | pr_err("%s: `%s' too large for parameter `%s'\n", |
212 | "%s: `%s' too large for parameter `%s'\n", | 214 | doing, val ?: "", param); |
213 | name, val ?: "", param); | ||
214 | return ret; | 215 | return ret; |
215 | case 0: | 216 | case 0: |
216 | break; | 217 | break; |
217 | default: | 218 | default: |
218 | printk(KERN_ERR | 219 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
219 | "%s: `%s' invalid for parameter `%s'\n", | 220 | doing, val ?: "", param); |
220 | name, val ?: "", param); | ||
221 | return ret; | 221 | return ret; |
222 | } | 222 | } |
223 | } | 223 | } |
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | |||
263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 263 | int param_set_charp(const char *val, const struct kernel_param *kp) |
264 | { | 264 | { |
265 | if (strlen(val) > 1024) { | 265 | if (strlen(val) > 1024) { |
266 | printk(KERN_ERR "%s: string parameter too long\n", | 266 | pr_err("%s: string parameter too long\n", kp->name); |
267 | kp->name); | ||
268 | return -ENOSPC; | 267 | return -ENOSPC; |
269 | } | 268 | } |
270 | 269 | ||
@@ -400,8 +399,7 @@ static int param_array(const char *name, | |||
400 | int len; | 399 | int len; |
401 | 400 | ||
402 | if (*num == max) { | 401 | if (*num == max) { |
403 | printk(KERN_ERR "%s: can only take %i arguments\n", | 402 | pr_err("%s: can only take %i arguments\n", name, max); |
404 | name, max); | ||
405 | return -EINVAL; | 403 | return -EINVAL; |
406 | } | 404 | } |
407 | len = strcspn(val, ","); | 405 | len = strcspn(val, ","); |
@@ -420,8 +418,7 @@ static int param_array(const char *name, | |||
420 | } while (save == ','); | 418 | } while (save == ','); |
421 | 419 | ||
422 | if (*num < min) { | 420 | if (*num < min) { |
423 | printk(KERN_ERR "%s: needs at least %i arguments\n", | 421 | pr_err("%s: needs at least %i arguments\n", name, min); |
424 | name, min); | ||
425 | return -EINVAL; | 422 | return -EINVAL; |
426 | } | 423 | } |
427 | return 0; | 424 | return 0; |
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) | |||
480 | const struct kparam_string *kps = kp->str; | 477 | const struct kparam_string *kps = kp->str; |
481 | 478 | ||
482 | if (strlen(val)+1 > kps->maxlen) { | 479 | if (strlen(val)+1 > kps->maxlen) { |
483 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 480 | pr_err("%s: string doesn't fit in %u chars.\n", |
484 | kp->name, kps->maxlen-1); | 481 | kp->name, kps->maxlen-1); |
485 | return -ENOSPC; | 482 | return -ENOSPC; |
486 | } | 483 | } |
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
750 | #endif | 747 | #endif |
751 | if (err) { | 748 | if (err) { |
752 | kobject_put(&mk->kobj); | 749 | kobject_put(&mk->kobj); |
753 | printk(KERN_ERR | 750 | pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", |
754 | "Module '%s' failed add to sysfs, error number %d\n", | ||
755 | name, err); | 751 | name, err); |
756 | printk(KERN_ERR | ||
757 | "The system will be unstable now.\n"); | ||
758 | return NULL; | 752 | return NULL; |
759 | } | 753 | } |
760 | 754 | ||
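[Editor's sketch, not part of the series] Callbacks passed as the 'unknown' argument of parse_args() now receive the new 'doing' string as well. A minimal handler against the updated signature; the function name and message text are illustrative:

	#include <linux/kernel.h>
	#include <linux/moduleparam.h>

	/* Called by parse_one() for parameters matching no registered kernel_param. */
	static int example_unknown_param_cb(char *param, char *val, const char *doing)
	{
		/* 'doing' identifies what is being parsed, e.g. a module name. */
		pr_info("%s: ignoring unknown option '%s=%s'\n",
			doing, param, val ? val : "");
		return 0;	/* 0 means handled; an error here aborts parse_args() */
	}

Such a handler would be passed as the last argument of parse_args(), in place of the NULL most callers use.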
diff --git a/kernel/pid.c b/kernel/pid.c index 9f08dfabaf13..e86b291ad834 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -547,7 +547,8 @@ void __init pidhash_init(void) | |||
547 | 547 | ||
548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
549 | HASH_EARLY | HASH_SMALL, | 549 | HASH_EARLY | HASH_SMALL, |
550 | &pidhash_shift, NULL, 4096); | 550 | &pidhash_shift, NULL, |
551 | 0, 4096); | ||
551 | pidhash_size = 1U << pidhash_shift; | 552 | pidhash_size = 1U << pidhash_shift; |
552 | 553 | ||
553 | for (i = 0; i < pidhash_size; i++) | 554 | for (i = 0; i < pidhash_size; i++) |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 57bc1fd35b3c..b3c7fd554250 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
149 | { | 149 | { |
150 | int nr; | 150 | int nr; |
151 | int rc; | 151 | int rc; |
152 | struct task_struct *task; | 152 | struct task_struct *task, *me = current; |
153 | |||
154 | /* Ignore SIGCHLD, causing any terminated children to autoreap */ | ||
155 | spin_lock_irq(&me->sighand->siglock); | ||
156 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; | ||
157 | spin_unlock_irq(&me->sighand->siglock); | ||
153 | 158 | ||
154 | /* | 159 | /* |
155 | * The last thread in the cgroup-init thread group is terminating. | 160 | * The last thread in the cgroup-init thread group is terminating. |
@@ -179,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
179 | } | 184 | } |
180 | read_unlock(&tasklist_lock); | 185 | read_unlock(&tasklist_lock); |
181 | 186 | ||
187 | /* First, reap the EXIT_ZOMBIE children we may have. */ | ||
182 | do { | 188 | do { |
183 | clear_thread_flag(TIF_SIGPENDING); | 189 | clear_thread_flag(TIF_SIGPENDING); |
184 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 190 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
185 | } while (rc != -ECHILD); | 191 | } while (rc != -ECHILD); |
186 | 192 | ||
193 | /* | ||
194 | * sys_wait4() above can't reap the TASK_DEAD children. | ||
195 | * Make sure they all go away, see __unhash_process(). | ||
196 | */ | ||
197 | for (;;) { | ||
198 | bool need_wait = false; | ||
199 | |||
200 | read_lock(&tasklist_lock); | ||
201 | if (!list_empty(¤t->children)) { | ||
202 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
203 | need_wait = true; | ||
204 | } | ||
205 | read_unlock(&tasklist_lock); | ||
206 | |||
207 | if (!need_wait) | ||
208 | break; | ||
209 | schedule(); | ||
210 | } | ||
211 | |||
187 | if (pid_ns->reboot) | 212 | if (pid_ns->reboot) |
188 | current->signal->group_exit_code = pid_ns->reboot; | 213 | current->signal->group_exit_code = pid_ns->reboot; |
189 | 214 | ||
@@ -191,6 +216,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
191 | return; | 216 | return; |
192 | } | 217 | } |
193 | 218 | ||
219 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
194 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | 220 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
195 | void __user *buffer, size_t *lenp, loff_t *ppos) | 221 | void __user *buffer, size_t *lenp, loff_t *ppos) |
196 | { | 222 | { |
@@ -218,8 +244,8 @@ static struct ctl_table pid_ns_ctl_table[] = { | |||
218 | }, | 244 | }, |
219 | { } | 245 | { } |
220 | }; | 246 | }; |
221 | |||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | 247 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; |
248 | #endif /* CONFIG_CHECKPOINT_RESTORE */ | ||
223 | 249 | ||
224 | int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | 250 | int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) |
225 | { | 251 | { |
@@ -253,7 +279,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
253 | static __init int pid_namespaces_init(void) | 279 | static __init int pid_namespaces_init(void) |
254 | { | 280 | { |
255 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 281 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
282 | |||
283 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
256 | register_sysctl_paths(kern_path, pid_ns_ctl_table); | 284 | register_sysctl_paths(kern_path, pid_ns_ctl_table); |
285 | #endif | ||
257 | return 0; | 286 | return 0; |
258 | } | 287 | } |
259 | 288 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e3216..8f9b4eb974e0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP | |||
103 | select HOTPLUG | 103 | select HOTPLUG |
104 | select HOTPLUG_CPU | 104 | select HOTPLUG_CPU |
105 | 105 | ||
106 | config PM_AUTOSLEEP | ||
107 | bool "Opportunistic sleep" | ||
108 | depends on PM_SLEEP | ||
109 | default n | ||
110 | ---help--- | ||
111 | Allow the kernel to trigger a system transition into a global sleep | ||
112 | state automatically whenever there are no active wakeup sources. | ||
113 | |||
114 | config PM_WAKELOCKS | ||
115 | bool "User space wakeup sources interface" | ||
116 | depends on PM_SLEEP | ||
117 | default n | ||
118 | ---help--- | ||
119 | Allow user space to create, activate and deactivate wakeup source | ||
120 | objects with the help of a sysfs-based interface. | ||
121 | |||
122 | config PM_WAKELOCKS_LIMIT | ||
123 | int "Maximum number of user space wakeup sources (0 = no limit)" | ||
124 | range 0 100000 | ||
125 | default 100 | ||
126 | depends on PM_WAKELOCKS | ||
127 | |||
128 | config PM_WAKELOCKS_GC | ||
129 | bool "Garbage collector for user space wakeup sources" | ||
130 | depends on PM_WAKELOCKS | ||
131 | default y | ||
132 | |||
106 | config PM_RUNTIME | 133 | config PM_RUNTIME |
107 | bool "Run-time PM core functionality" | 134 | bool "Run-time PM core functionality" |
108 | depends on !IA64_HP_SIM | 135 | depends on !IA64_HP_SIM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec5252..29472bff11ef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
11 | block_io.o | 11 | block_io.o |
12 | obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o | ||
13 | obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o | ||
12 | 14 | ||
13 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 000000000000..ca304046d9e2 --- /dev/null +++ b/kernel/power/autosleep.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * kernel/power/autosleep.c | ||
3 | * | ||
4 | * Opportunistic sleep support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | */ | ||
8 | |||
9 | #include <linux/device.h> | ||
10 | #include <linux/mutex.h> | ||
11 | #include <linux/pm_wakeup.h> | ||
12 | |||
13 | #include "power.h" | ||
14 | |||
15 | static suspend_state_t autosleep_state; | ||
16 | static struct workqueue_struct *autosleep_wq; | ||
17 | /* | ||
18 | * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source | ||
19 | * is active, otherwise a deadlock with try_to_suspend() is possible. | ||
20 | * Alternatively, mutex_lock_interruptible() can be used; it will then fail | ||
21 | * if an autosleep cycle tries to freeze processes. | ||
22 | */ | ||
23 | static DEFINE_MUTEX(autosleep_lock); | ||
24 | static struct wakeup_source *autosleep_ws; | ||
25 | |||
26 | static void try_to_suspend(struct work_struct *work) | ||
27 | { | ||
28 | unsigned int initial_count, final_count; | ||
29 | |||
30 | if (!pm_get_wakeup_count(&initial_count, true)) | ||
31 | goto out; | ||
32 | |||
33 | mutex_lock(&autosleep_lock); | ||
34 | |||
35 | if (!pm_save_wakeup_count(initial_count)) { | ||
36 | mutex_unlock(&autosleep_lock); | ||
37 | goto out; | ||
38 | } | ||
39 | |||
40 | if (autosleep_state == PM_SUSPEND_ON) { | ||
41 | mutex_unlock(&autosleep_lock); | ||
42 | return; | ||
43 | } | ||
44 | if (autosleep_state >= PM_SUSPEND_MAX) | ||
45 | hibernate(); | ||
46 | else | ||
47 | pm_suspend(autosleep_state); | ||
48 | |||
49 | mutex_unlock(&autosleep_lock); | ||
50 | |||
51 | if (!pm_get_wakeup_count(&final_count, false)) | ||
52 | goto out; | ||
53 | |||
54 | /* | ||
55 | * If the wakeup occurred for an unknown reason, wait to prevent the | ||
56 | * system from trying to suspend and waking up in a tight loop. | ||
57 | */ | ||
58 | if (final_count == initial_count) | ||
59 | schedule_timeout_uninterruptible(HZ / 2); | ||
60 | |||
61 | out: | ||
62 | queue_up_suspend_work(); | ||
63 | } | ||
64 | |||
65 | static DECLARE_WORK(suspend_work, try_to_suspend); | ||
66 | |||
67 | void queue_up_suspend_work(void) | ||
68 | { | ||
69 | if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) | ||
70 | queue_work(autosleep_wq, &suspend_work); | ||
71 | } | ||
72 | |||
73 | suspend_state_t pm_autosleep_state(void) | ||
74 | { | ||
75 | return autosleep_state; | ||
76 | } | ||
77 | |||
78 | int pm_autosleep_lock(void) | ||
79 | { | ||
80 | return mutex_lock_interruptible(&autosleep_lock); | ||
81 | } | ||
82 | |||
83 | void pm_autosleep_unlock(void) | ||
84 | { | ||
85 | mutex_unlock(&autosleep_lock); | ||
86 | } | ||
87 | |||
88 | int pm_autosleep_set_state(suspend_state_t state) | ||
89 | { | ||
90 | |||
91 | #ifndef CONFIG_HIBERNATION | ||
92 | if (state >= PM_SUSPEND_MAX) | ||
93 | return -EINVAL; | ||
94 | #endif | ||
95 | |||
96 | __pm_stay_awake(autosleep_ws); | ||
97 | |||
98 | mutex_lock(&autosleep_lock); | ||
99 | |||
100 | autosleep_state = state; | ||
101 | |||
102 | __pm_relax(autosleep_ws); | ||
103 | |||
104 | if (state > PM_SUSPEND_ON) { | ||
105 | pm_wakep_autosleep_enabled(true); | ||
106 | queue_up_suspend_work(); | ||
107 | } else { | ||
108 | pm_wakep_autosleep_enabled(false); | ||
109 | } | ||
110 | |||
111 | mutex_unlock(&autosleep_lock); | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | int __init pm_autosleep_init(void) | ||
116 | { | ||
117 | autosleep_ws = wakeup_source_register("autosleep"); | ||
118 | if (!autosleep_ws) | ||
119 | return -ENOMEM; | ||
120 | |||
121 | autosleep_wq = alloc_ordered_workqueue("autosleep", 0); | ||
122 | if (autosleep_wq) | ||
123 | return 0; | ||
124 | |||
125 | wakeup_source_unregister(autosleep_ws); | ||
126 | return -ENOMEM; | ||
127 | } | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e09dfbfeecee..238025f5472e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -25,7 +25,8 @@ | |||
25 | #include <linux/freezer.h> | 25 | #include <linux/freezer.h> |
26 | #include <linux/gfp.h> | 26 | #include <linux/gfp.h> |
27 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
28 | #include <scsi/scsi_scan.h> | 28 | #include <linux/ctype.h> |
29 | #include <linux/genhd.h> | ||
29 | 30 | ||
30 | #include "power.h" | 31 | #include "power.h" |
31 | 32 | ||
@@ -722,6 +723,17 @@ static int software_resume(void) | |||
722 | 723 | ||
723 | /* Check if the device is there */ | 724 | /* Check if the device is there */ |
724 | swsusp_resume_device = name_to_dev_t(resume_file); | 725 | swsusp_resume_device = name_to_dev_t(resume_file); |
726 | |||
727 | /* | ||
728 | * name_to_dev_t() cannot verify the partition if resume_file is in | ||
729 | * integer format (e.g. major:minor). | ||
730 | */ | ||
731 | if (isdigit(resume_file[0]) && resume_wait) { | ||
732 | int partno; | ||
733 | while (!get_gendisk(swsusp_resume_device, &partno)) | ||
734 | msleep(10); | ||
735 | } | ||
736 | |||
725 | if (!swsusp_resume_device) { | 737 | if (!swsusp_resume_device) { |
726 | /* | 738 | /* |
727 | * Some device discovery might still be in progress; we need | 739 | * Some device discovery might still be in progress; we need |
@@ -735,13 +747,6 @@ static int software_resume(void) | |||
735 | async_synchronize_full(); | 747 | async_synchronize_full(); |
736 | } | 748 | } |
737 | 749 | ||
738 | /* | ||
739 | * We can't depend on SCSI devices being available after loading | ||
740 | * one of their modules until scsi_complete_async_scans() is | ||
741 | * called and the resume device usually is a SCSI one. | ||
742 | */ | ||
743 | scsi_complete_async_scans(); | ||
744 | |||
745 | swsusp_resume_device = name_to_dev_t(resume_file); | 750 | swsusp_resume_device = name_to_dev_t(resume_file); |
746 | if (!swsusp_resume_device) { | 751 | if (!swsusp_resume_device) { |
747 | error = -ENODEV; | 752 | error = -ENODEV; |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c62..428f8a034e96 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
269 | return (s - buf); | 269 | return (s - buf); |
270 | } | 270 | } |
271 | 271 | ||
272 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | 272 | static suspend_state_t decode_state(const char *buf, size_t n) |
273 | const char *buf, size_t n) | ||
274 | { | 273 | { |
275 | #ifdef CONFIG_SUSPEND | 274 | #ifdef CONFIG_SUSPEND |
276 | suspend_state_t state = PM_SUSPEND_STANDBY; | 275 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
278 | #endif | 277 | #endif |
279 | char *p; | 278 | char *p; |
280 | int len; | 279 | int len; |
281 | int error = -EINVAL; | ||
282 | 280 | ||
283 | p = memchr(buf, '\n', n); | 281 | p = memchr(buf, '\n', n); |
284 | len = p ? p - buf : n; | 282 | len = p ? p - buf : n; |
285 | 283 | ||
286 | /* First, check if we are requested to hibernate */ | 284 | /* Check hibernation first. */ |
287 | if (len == 4 && !strncmp(buf, "disk", len)) { | 285 | if (len == 4 && !strncmp(buf, "disk", len)) |
288 | error = hibernate(); | 286 | return PM_SUSPEND_MAX; |
289 | goto Exit; | ||
290 | } | ||
291 | 287 | ||
292 | #ifdef CONFIG_SUSPEND | 288 | #ifdef CONFIG_SUSPEND |
293 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | 289 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) |
294 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { | 290 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) |
295 | error = pm_suspend(state); | 291 | return state; |
296 | break; | ||
297 | } | ||
298 | } | ||
299 | #endif | 292 | #endif |
300 | 293 | ||
301 | Exit: | 294 | return PM_SUSPEND_ON; |
295 | } | ||
296 | |||
297 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
298 | const char *buf, size_t n) | ||
299 | { | ||
300 | suspend_state_t state; | ||
301 | int error; | ||
302 | |||
303 | error = pm_autosleep_lock(); | ||
304 | if (error) | ||
305 | return error; | ||
306 | |||
307 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
308 | error = -EBUSY; | ||
309 | goto out; | ||
310 | } | ||
311 | |||
312 | state = decode_state(buf, n); | ||
313 | if (state < PM_SUSPEND_MAX) | ||
314 | error = pm_suspend(state); | ||
315 | else if (state == PM_SUSPEND_MAX) | ||
316 | error = hibernate(); | ||
317 | else | ||
318 | error = -EINVAL; | ||
319 | |||
320 | out: | ||
321 | pm_autosleep_unlock(); | ||
302 | return error ? error : n; | 322 | return error ? error : n; |
303 | } | 323 | } |
304 | 324 | ||
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, | |||
339 | { | 359 | { |
340 | unsigned int val; | 360 | unsigned int val; |
341 | 361 | ||
342 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; | 362 | return pm_get_wakeup_count(&val, true) ? |
363 | sprintf(buf, "%u\n", val) : -EINTR; | ||
343 | } | 364 | } |
344 | 365 | ||
345 | static ssize_t wakeup_count_store(struct kobject *kobj, | 366 | static ssize_t wakeup_count_store(struct kobject *kobj, |
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, | |||
347 | const char *buf, size_t n) | 368 | const char *buf, size_t n) |
348 | { | 369 | { |
349 | unsigned int val; | 370 | unsigned int val; |
371 | int error; | ||
372 | |||
373 | error = pm_autosleep_lock(); | ||
374 | if (error) | ||
375 | return error; | ||
376 | |||
377 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
378 | error = -EBUSY; | ||
379 | goto out; | ||
380 | } | ||
350 | 381 | ||
382 | error = -EINVAL; | ||
351 | if (sscanf(buf, "%u", &val) == 1) { | 383 | if (sscanf(buf, "%u", &val) == 1) { |
352 | if (pm_save_wakeup_count(val)) | 384 | if (pm_save_wakeup_count(val)) |
353 | return n; | 385 | error = n; |
354 | } | 386 | } |
355 | return -EINVAL; | 387 | |
388 | out: | ||
389 | pm_autosleep_unlock(); | ||
390 | return error; | ||
356 | } | 391 | } |
357 | 392 | ||
358 | power_attr(wakeup_count); | 393 | power_attr(wakeup_count); |
394 | |||
395 | #ifdef CONFIG_PM_AUTOSLEEP | ||
396 | static ssize_t autosleep_show(struct kobject *kobj, | ||
397 | struct kobj_attribute *attr, | ||
398 | char *buf) | ||
399 | { | ||
400 | suspend_state_t state = pm_autosleep_state(); | ||
401 | |||
402 | if (state == PM_SUSPEND_ON) | ||
403 | return sprintf(buf, "off\n"); | ||
404 | |||
405 | #ifdef CONFIG_SUSPEND | ||
406 | if (state < PM_SUSPEND_MAX) | ||
407 | return sprintf(buf, "%s\n", valid_state(state) ? | ||
408 | pm_states[state] : "error"); | ||
409 | #endif | ||
410 | #ifdef CONFIG_HIBERNATION | ||
411 | return sprintf(buf, "disk\n"); | ||
412 | #else | ||
413 | return sprintf(buf, "error"); | ||
414 | #endif | ||
415 | } | ||
416 | |||
417 | static ssize_t autosleep_store(struct kobject *kobj, | ||
418 | struct kobj_attribute *attr, | ||
419 | const char *buf, size_t n) | ||
420 | { | ||
421 | suspend_state_t state = decode_state(buf, n); | ||
422 | int error; | ||
423 | |||
424 | if (state == PM_SUSPEND_ON | ||
425 | && strcmp(buf, "off") && strcmp(buf, "off\n")) | ||
426 | return -EINVAL; | ||
427 | |||
428 | error = pm_autosleep_set_state(state); | ||
429 | return error ? error : n; | ||
430 | } | ||
431 | |||
432 | power_attr(autosleep); | ||
433 | #endif /* CONFIG_PM_AUTOSLEEP */ | ||
434 | |||
435 | #ifdef CONFIG_PM_WAKELOCKS | ||
436 | static ssize_t wake_lock_show(struct kobject *kobj, | ||
437 | struct kobj_attribute *attr, | ||
438 | char *buf) | ||
439 | { | ||
440 | return pm_show_wakelocks(buf, true); | ||
441 | } | ||
442 | |||
443 | static ssize_t wake_lock_store(struct kobject *kobj, | ||
444 | struct kobj_attribute *attr, | ||
445 | const char *buf, size_t n) | ||
446 | { | ||
447 | int error = pm_wake_lock(buf); | ||
448 | return error ? error : n; | ||
449 | } | ||
450 | |||
451 | power_attr(wake_lock); | ||
452 | |||
453 | static ssize_t wake_unlock_show(struct kobject *kobj, | ||
454 | struct kobj_attribute *attr, | ||
455 | char *buf) | ||
456 | { | ||
457 | return pm_show_wakelocks(buf, false); | ||
458 | } | ||
459 | |||
460 | static ssize_t wake_unlock_store(struct kobject *kobj, | ||
461 | struct kobj_attribute *attr, | ||
462 | const char *buf, size_t n) | ||
463 | { | ||
464 | int error = pm_wake_unlock(buf); | ||
465 | return error ? error : n; | ||
466 | } | ||
467 | |||
468 | power_attr(wake_unlock); | ||
469 | |||
470 | #endif /* CONFIG_PM_WAKELOCKS */ | ||
359 | #endif /* CONFIG_PM_SLEEP */ | 471 | #endif /* CONFIG_PM_SLEEP */ |
360 | 472 | ||
361 | #ifdef CONFIG_PM_TRACE | 473 | #ifdef CONFIG_PM_TRACE |
@@ -409,6 +521,13 @@ static struct attribute * g[] = { | |||
409 | #ifdef CONFIG_PM_SLEEP | 521 | #ifdef CONFIG_PM_SLEEP |
410 | &pm_async_attr.attr, | 522 | &pm_async_attr.attr, |
411 | &wakeup_count_attr.attr, | 523 | &wakeup_count_attr.attr, |
524 | #ifdef CONFIG_PM_AUTOSLEEP | ||
525 | &autosleep_attr.attr, | ||
526 | #endif | ||
527 | #ifdef CONFIG_PM_WAKELOCKS | ||
528 | &wake_lock_attr.attr, | ||
529 | &wake_unlock_attr.attr, | ||
530 | #endif | ||
412 | #ifdef CONFIG_PM_DEBUG | 531 | #ifdef CONFIG_PM_DEBUG |
413 | &pm_test_attr.attr, | 532 | &pm_test_attr.attr, |
414 | #endif | 533 | #endif |
@@ -444,7 +563,10 @@ static int __init pm_init(void) | |||
444 | power_kobj = kobject_create_and_add("power", NULL); | 563 | power_kobj = kobject_create_and_add("power", NULL); |
445 | if (!power_kobj) | 564 | if (!power_kobj) |
446 | return -ENOMEM; | 565 | return -ENOMEM; |
447 | return sysfs_create_group(power_kobj, &attr_group); | 566 | error = sysfs_create_group(power_kobj, &attr_group); |
567 | if (error) | ||
568 | return error; | ||
569 | return pm_autosleep_init(); | ||
448 | } | 570 | } |
449 | 571 | ||
450 | core_initcall(pm_init); | 572 | core_initcall(pm_init); |
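[Editor's sketch, not part of the series] The reworked wakeup_count and state attributes support a read-then-write handshake that a user-space power manager performs before suspending, so that wakeup events arriving in between abort the attempt. A hedged user-space sketch of that flow, using the standard /sys/power paths and minimal error handling:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	/* Returns 0 if suspend was requested, -1 if the attempt was aborted. */
	static int try_suspend_once(void)
	{
		char buf[32];
		ssize_t n;
		int fd, ret = -1;

		/* 1. Read the wakeup count; this blocks while events are in flight. */
		fd = open("/sys/power/wakeup_count", O_RDWR);
		if (fd < 0)
			return -1;
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			/* 2. Write it back; the kernel rejects it if new events arrived. */
			if (write(fd, buf, strlen(buf)) >= 0)
				ret = 0;
		}
		close(fd);
		if (ret)
			return -1;

		/* 3. Request suspend; wakeup events since step 2 make the suspend abort. */
		fd = open("/sys/power/state", O_WRONLY);
		if (fd < 0)
			return -1;
		ret = write(fd, "mem", 3) < 0 ? -1 : 0;
		close(fd);
		return ret;
	}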
diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d7407..b0bd4beaebfe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) | |||
264 | { | 264 | { |
265 | } | 265 | } |
266 | #endif | 266 | #endif |
267 | |||
268 | #ifdef CONFIG_PM_AUTOSLEEP | ||
269 | |||
270 | /* kernel/power/autosleep.c */ | ||
271 | extern int pm_autosleep_init(void); | ||
272 | extern int pm_autosleep_lock(void); | ||
273 | extern void pm_autosleep_unlock(void); | ||
274 | extern suspend_state_t pm_autosleep_state(void); | ||
275 | extern int pm_autosleep_set_state(suspend_state_t state); | ||
276 | |||
277 | #else /* !CONFIG_PM_AUTOSLEEP */ | ||
278 | |||
279 | static inline int pm_autosleep_init(void) { return 0; } | ||
280 | static inline int pm_autosleep_lock(void) { return 0; } | ||
281 | static inline void pm_autosleep_unlock(void) {} | ||
282 | static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } | ||
283 | |||
284 | #endif /* !CONFIG_PM_AUTOSLEEP */ | ||
285 | |||
286 | #ifdef CONFIG_PM_WAKELOCKS | ||
287 | |||
288 | /* kernel/power/wakelock.c */ | ||
289 | extern ssize_t pm_show_wakelocks(char *buf, bool show_active); | ||
290 | extern int pm_wake_lock(const char *buf); | ||
291 | extern int pm_wake_unlock(const char *buf); | ||
292 | |||
293 | #endif /* !CONFIG_PM_WAKELOCKS */ | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index eef311a58a64..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | 9 | * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> |
10 | * | 10 | * |
11 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
12 | * | 12 | * |
@@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
282 | return -ENOSPC; | 282 | return -ENOSPC; |
283 | 283 | ||
284 | if (bio_chain) { | 284 | if (bio_chain) { |
285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | |
286 | __GFP_NORETRY); | ||
286 | if (src) { | 287 | if (src) { |
287 | copy_page(src, buf); | 288 | copy_page(src, buf); |
288 | } else { | 289 | } else { |
289 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ | 290 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ |
290 | if (ret) | 291 | if (ret) |
291 | return ret; | 292 | return ret; |
292 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 293 | src = (void *)__get_free_page(__GFP_WAIT | |
294 | __GFP_NOWARN | | ||
295 | __GFP_NORETRY); | ||
293 | if (src) { | 296 | if (src) { |
294 | copy_page(src, buf); | 297 | copy_page(src, buf); |
295 | } else { | 298 | } else { |
@@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
367 | clear_page(handle->cur); | 370 | clear_page(handle->cur); |
368 | handle->cur_swap = offset; | 371 | handle->cur_swap = offset; |
369 | handle->k = 0; | 372 | handle->k = 0; |
370 | } | 373 | |
371 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { | 374 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { |
372 | error = hib_wait_on_bio_chain(bio_chain); | 375 | error = hib_wait_on_bio_chain(bio_chain); |
373 | if (error) | 376 | if (error) |
374 | goto out; | 377 | goto out; |
375 | handle->reqd_free_pages = reqd_free_pages(); | 378 | /* |
379 | * Recalculate the number of required free pages, to | ||
380 | * make sure we never take more than half. | ||
381 | */ | ||
382 | handle->reqd_free_pages = reqd_free_pages(); | ||
383 | } | ||
376 | } | 384 | } |
377 | out: | 385 | out: |
378 | return error; | 386 | return error; |
@@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
419 | /* Maximum number of threads for compression/decompression. */ | 427 | /* Maximum number of threads for compression/decompression. */ |
420 | #define LZO_THREADS 3 | 428 | #define LZO_THREADS 3 |
421 | 429 | ||
422 | /* Maximum number of pages for read buffering. */ | 430 | /* Minimum/maximum number of pages for read buffering. */ |
423 | #define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) | 431 | #define LZO_MIN_RD_PAGES 1024 |
432 | #define LZO_MAX_RD_PAGES 8192 | ||
424 | 433 | ||
425 | 434 | ||
426 | /** | 435 | /** |
@@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
631 | } | 640 | } |
632 | 641 | ||
633 | /* | 642 | /* |
634 | * Adjust number of free pages after all allocations have been done. | ||
635 | * We don't want to run out of pages when writing. | ||
636 | */ | ||
637 | handle->reqd_free_pages = reqd_free_pages(); | ||
638 | |||
639 | /* | ||
640 | * Start the CRC32 thread. | 643 | * Start the CRC32 thread. |
641 | */ | 644 | */ |
642 | init_waitqueue_head(&crc->go); | 645 | init_waitqueue_head(&crc->go); |
@@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
657 | goto out_clean; | 660 | goto out_clean; |
658 | } | 661 | } |
659 | 662 | ||
663 | /* | ||
664 | * Adjust the number of required free pages after all allocations have | ||
665 | * been done. We don't want to run out of pages when writing. | ||
666 | */ | ||
667 | handle->reqd_free_pages = reqd_free_pages(); | ||
668 | |||
660 | printk(KERN_INFO | 669 | printk(KERN_INFO |
661 | "PM: Using %u thread(s) for compression.\n" | 670 | "PM: Using %u thread(s) for compression.\n" |
662 | "PM: Compressing and saving image data (%u pages) ... ", | 671 | "PM: Compressing and saving image data (%u pages) ... ", |
@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1067 | unsigned i, thr, run_threads, nr_threads; | 1076 | unsigned i, thr, run_threads, nr_threads; |
1068 | unsigned ring = 0, pg = 0, ring_size = 0, | 1077 | unsigned ring = 0, pg = 0, ring_size = 0, |
1069 | have = 0, want, need, asked = 0; | 1078 | have = 0, want, need, asked = 0; |
1070 | unsigned long read_pages; | 1079 | unsigned long read_pages = 0; |
1071 | unsigned char **page = NULL; | 1080 | unsigned char **page = NULL; |
1072 | struct dec_data *data = NULL; | 1081 | struct dec_data *data = NULL; |
1073 | struct crc_data *crc = NULL; | 1082 | struct crc_data *crc = NULL; |
@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1079 | nr_threads = num_online_cpus() - 1; | 1088 | nr_threads = num_online_cpus() - 1; |
1080 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | 1089 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); |
1081 | 1090 | ||
1082 | page = vmalloc(sizeof(*page) * LZO_READ_PAGES); | 1091 | page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); |
1083 | if (!page) { | 1092 | if (!page) { |
1084 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 1093 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
1085 | ret = -ENOMEM; | 1094 | ret = -ENOMEM; |
@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1144 | } | 1153 | } |
1145 | 1154 | ||
1146 | /* | 1155 | /* |
1147 | * Adjust number of pages for read buffering, in case we are short. | 1156 | * Set the number of pages for read buffering. |
1157 | * This is complete guesswork, because we'll only know the real | ||
1158 | * picture once prepare_image() is called, which is much later on | ||
1159 | * during the image load phase. We'll assume the worst case and | ||
1160 | * say that none of the image pages are from high memory. | ||
1148 | */ | 1161 | */ |
1149 | read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; | 1162 | if (low_free_pages() > snapshot_get_image_size()) |
1150 | read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); | 1163 | read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; |
1164 | read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); | ||
1151 | 1165 | ||
1152 | for (i = 0; i < read_pages; i++) { | 1166 | for (i = 0; i < read_pages; i++) { |
1153 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | 1167 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? |
1154 | __GFP_WAIT | __GFP_HIGH : | 1168 | __GFP_WAIT | __GFP_HIGH : |
1155 | __GFP_WAIT); | 1169 | __GFP_WAIT | __GFP_NOWARN | |
1170 | __GFP_NORETRY); | ||
1171 | |||
1156 | if (!page[i]) { | 1172 | if (!page[i]) { |
1157 | if (i < LZO_CMP_PAGES) { | 1173 | if (i < LZO_CMP_PAGES) { |
1158 | ring_size = i; | 1174 | ring_size = i; |
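[Editor's note] As a worked example of the new read-buffer sizing (all numbers hypothetical): with 200000 free lowmem pages and a 150000-page image, (200000 - 150000) / 2 = 25000 is capped at LZO_MAX_RD_PAGES (8192); with lowmem at or below the image size, the value stays 0 and the clamp raises it to LZO_MIN_RD_PAGES (1024). The calculation in isolation, as a sketch:

	#include <linux/kernel.h>	/* clamp_val() */

	/* Illustrative only; LZO_MIN_RD_PAGES/LZO_MAX_RD_PAGES come from this patch. */
	static unsigned long lzo_read_buffer_pages(unsigned long low_free,
						   unsigned long image_pages)
	{
		unsigned long read_pages = 0;

		if (low_free > image_pages)
			read_pages = (low_free - image_pages) / 2;
		/* e.g. (200000, 150000) -> 8192, (120000, 150000) -> 1024 */
		return clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
	}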
diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
25 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
26 | #include <linux/freezer.h> | 26 | #include <linux/freezer.h> |
27 | #include <scsi/scsi_scan.h> | ||
28 | 27 | ||
29 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
30 | 29 | ||
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
84 | * appear. | 83 | * appear. |
85 | */ | 84 | */ |
86 | wait_for_device_probe(); | 85 | wait_for_device_probe(); |
87 | scsi_complete_async_scans(); | ||
88 | 86 | ||
89 | data->swap = -1; | 87 | data->swap = -1; |
90 | data->mode = O_WRONLY; | 88 | data->mode = O_WRONLY; |
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 000000000000..c8fba3380076 --- /dev/null +++ b/kernel/power/wakelock.c | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * kernel/power/wakelock.c | ||
3 | * | ||
4 | * User space wakeup sources support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | * | ||
8 | * This code is based on the analogous interface allowing user space to | ||
9 | * manipulate wakelocks on Android. | ||
10 | */ | ||
11 | |||
12 | #include <linux/ctype.h> | ||
13 | #include <linux/device.h> | ||
14 | #include <linux/err.h> | ||
15 | #include <linux/hrtimer.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/rbtree.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | static DEFINE_MUTEX(wakelocks_lock); | ||
21 | |||
22 | struct wakelock { | ||
23 | char *name; | ||
24 | struct rb_node node; | ||
25 | struct wakeup_source ws; | ||
26 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
27 | struct list_head lru; | ||
28 | #endif | ||
29 | }; | ||
30 | |||
31 | static struct rb_root wakelocks_tree = RB_ROOT; | ||
32 | |||
33 | ssize_t pm_show_wakelocks(char *buf, bool show_active) | ||
34 | { | ||
35 | struct rb_node *node; | ||
36 | struct wakelock *wl; | ||
37 | char *str = buf; | ||
38 | char *end = buf + PAGE_SIZE; | ||
39 | |||
40 | mutex_lock(&wakelocks_lock); | ||
41 | |||
42 | for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { | ||
43 | wl = rb_entry(node, struct wakelock, node); | ||
44 | if (wl->ws.active == show_active) | ||
45 | str += scnprintf(str, end - str, "%s ", wl->name); | ||
46 | } | ||
47 | if (str > buf) | ||
48 | str--; | ||
49 | |||
50 | str += scnprintf(str, end - str, "\n"); | ||
51 | |||
52 | mutex_unlock(&wakelocks_lock); | ||
53 | return (str - buf); | ||
54 | } | ||
55 | |||
56 | #if CONFIG_PM_WAKELOCKS_LIMIT > 0 | ||
57 | static unsigned int number_of_wakelocks; | ||
58 | |||
59 | static inline bool wakelocks_limit_exceeded(void) | ||
60 | { | ||
61 | return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; | ||
62 | } | ||
63 | |||
64 | static inline void increment_wakelocks_number(void) | ||
65 | { | ||
66 | number_of_wakelocks++; | ||
67 | } | ||
68 | |||
69 | static inline void decrement_wakelocks_number(void) | ||
70 | { | ||
71 | number_of_wakelocks--; | ||
72 | } | ||
73 | #else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ | ||
74 | static inline bool wakelocks_limit_exceeded(void) { return false; } | ||
75 | static inline void increment_wakelocks_number(void) {} | ||
76 | static inline void decrement_wakelocks_number(void) {} | ||
77 | #endif /* CONFIG_PM_WAKELOCKS_LIMIT */ | ||
78 | |||
79 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
80 | #define WL_GC_COUNT_MAX 100 | ||
81 | #define WL_GC_TIME_SEC 300 | ||
82 | |||
83 | static LIST_HEAD(wakelocks_lru_list); | ||
84 | static unsigned int wakelocks_gc_count; | ||
85 | |||
86 | static inline void wakelocks_lru_add(struct wakelock *wl) | ||
87 | { | ||
88 | list_add(&wl->lru, &wakelocks_lru_list); | ||
89 | } | ||
90 | |||
91 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) | ||
92 | { | ||
93 | list_move(&wl->lru, &wakelocks_lru_list); | ||
94 | } | ||
95 | |||
96 | static void wakelocks_gc(void) | ||
97 | { | ||
98 | struct wakelock *wl, *aux; | ||
99 | ktime_t now; | ||
100 | |||
101 | if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) | ||
102 | return; | ||
103 | |||
104 | now = ktime_get(); | ||
105 | list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { | ||
106 | u64 idle_time_ns; | ||
107 | bool active; | ||
108 | |||
109 | spin_lock_irq(&wl->ws.lock); | ||
110 | idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); | ||
111 | active = wl->ws.active; | ||
112 | spin_unlock_irq(&wl->ws.lock); | ||
113 | |||
114 | if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) | ||
115 | break; | ||
116 | |||
117 | if (!active) { | ||
118 | wakeup_source_remove(&wl->ws); | ||
119 | rb_erase(&wl->node, &wakelocks_tree); | ||
120 | list_del(&wl->lru); | ||
121 | kfree(wl->name); | ||
122 | kfree(wl); | ||
123 | decrement_wakelocks_number(); | ||
124 | } | ||
125 | } | ||
126 | wakelocks_gc_count = 0; | ||
127 | } | ||
128 | #else /* !CONFIG_PM_WAKELOCKS_GC */ | ||
129 | static inline void wakelocks_lru_add(struct wakelock *wl) {} | ||
130 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} | ||
131 | static inline void wakelocks_gc(void) {} | ||
132 | #endif /* !CONFIG_PM_WAKELOCKS_GC */ | ||
133 | |||
134 | static struct wakelock *wakelock_lookup_add(const char *name, size_t len, | ||
135 | bool add_if_not_found) | ||
136 | { | ||
137 | struct rb_node **node = &wakelocks_tree.rb_node; | ||
138 | struct rb_node *parent = *node; | ||
139 | struct wakelock *wl; | ||
140 | |||
141 | while (*node) { | ||
142 | int diff; | ||
143 | |||
144 | parent = *node; | ||
145 | wl = rb_entry(*node, struct wakelock, node); | ||
146 | diff = strncmp(name, wl->name, len); | ||
147 | if (diff == 0) { | ||
148 | if (wl->name[len]) | ||
149 | diff = -1; | ||
150 | else | ||
151 | return wl; | ||
152 | } | ||
153 | if (diff < 0) | ||
154 | node = &(*node)->rb_left; | ||
155 | else | ||
156 | node = &(*node)->rb_right; | ||
157 | } | ||
158 | if (!add_if_not_found) | ||
159 | return ERR_PTR(-EINVAL); | ||
160 | |||
161 | if (wakelocks_limit_exceeded()) | ||
162 | return ERR_PTR(-ENOSPC); | ||
163 | |||
164 | /* Not found, we have to add a new one. */ | ||
165 | wl = kzalloc(sizeof(*wl), GFP_KERNEL); | ||
166 | if (!wl) | ||
167 | return ERR_PTR(-ENOMEM); | ||
168 | |||
169 | wl->name = kstrndup(name, len, GFP_KERNEL); | ||
170 | if (!wl->name) { | ||
171 | kfree(wl); | ||
172 | return ERR_PTR(-ENOMEM); | ||
173 | } | ||
174 | wl->ws.name = wl->name; | ||
175 | wakeup_source_add(&wl->ws); | ||
176 | rb_link_node(&wl->node, parent, node); | ||
177 | rb_insert_color(&wl->node, &wakelocks_tree); | ||
178 | wakelocks_lru_add(wl); | ||
179 | increment_wakelocks_number(); | ||
180 | return wl; | ||
181 | } | ||
182 | |||
183 | int pm_wake_lock(const char *buf) | ||
184 | { | ||
185 | const char *str = buf; | ||
186 | struct wakelock *wl; | ||
187 | u64 timeout_ns = 0; | ||
188 | size_t len; | ||
189 | int ret = 0; | ||
190 | |||
191 | while (*str && !isspace(*str)) | ||
192 | str++; | ||
193 | |||
194 | len = str - buf; | ||
195 | if (!len) | ||
196 | return -EINVAL; | ||
197 | |||
198 | if (*str && *str != '\n') { | ||
199 | /* Find out if there's a valid timeout string appended. */ | ||
200 | ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); | ||
201 | if (ret) | ||
202 | return -EINVAL; | ||
203 | } | ||
204 | |||
205 | mutex_lock(&wakelocks_lock); | ||
206 | |||
207 | wl = wakelock_lookup_add(buf, len, true); | ||
208 | if (IS_ERR(wl)) { | ||
209 | ret = PTR_ERR(wl); | ||
210 | goto out; | ||
211 | } | ||
212 | if (timeout_ns) { | ||
213 | u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; | ||
214 | |||
215 | do_div(timeout_ms, NSEC_PER_MSEC); | ||
216 | __pm_wakeup_event(&wl->ws, timeout_ms); | ||
217 | } else { | ||
218 | __pm_stay_awake(&wl->ws); | ||
219 | } | ||
220 | |||
221 | wakelocks_lru_most_recent(wl); | ||
222 | |||
223 | out: | ||
224 | mutex_unlock(&wakelocks_lock); | ||
225 | return ret; | ||
226 | } | ||
227 | |||
228 | int pm_wake_unlock(const char *buf) | ||
229 | { | ||
230 | struct wakelock *wl; | ||
231 | size_t len; | ||
232 | int ret = 0; | ||
233 | |||
234 | len = strlen(buf); | ||
235 | if (!len) | ||
236 | return -EINVAL; | ||
237 | |||
238 | if (buf[len-1] == '\n') | ||
239 | len--; | ||
240 | |||
241 | if (!len) | ||
242 | return -EINVAL; | ||
243 | |||
244 | mutex_lock(&wakelocks_lock); | ||
245 | |||
246 | wl = wakelock_lookup_add(buf, len, false); | ||
247 | if (IS_ERR(wl)) { | ||
248 | ret = PTR_ERR(wl); | ||
249 | goto out; | ||
250 | } | ||
251 | __pm_relax(&wl->ws); | ||
252 | |||
253 | wakelocks_lru_most_recent(wl); | ||
254 | wakelocks_gc(); | ||
255 | |||
256 | out: | ||
257 | mutex_unlock(&wakelocks_lock); | ||
258 | return ret; | ||
259 | } | ||
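[Editor's sketch, not part of the series] The string format accepted above is "<name>[ <timeout in ns>]" for wake_lock and "<name>" for wake_unlock. A hedged user-space example; the lock name and timeout are arbitrary:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static int write_str(const char *path, const char *s)
	{
		int fd = open(path, O_WRONLY);
		ssize_t n;

		if (fd < 0)
			return -1;
		n = write(fd, s, strlen(s));
		close(fd);
		return n < 0 ? -1 : 0;
	}

	int main(void)
	{
		/* Hold a wakeup source named "myapp" for at most 5 s (timeout in ns). */
		write_str("/sys/power/wake_lock", "myapp 5000000000");

		/* ... work that must not race with autosleep ... */

		/* Release it early; only the name is written to wake_unlock. */
		return write_str("/sys/power/wake_unlock", "myapp");
	}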
diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d39..ac4bc9e79465 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/rculist.h> | 43 | #include <linux/rculist.h> |
44 | #include <linux/poll.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | 47 | ||
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
54 | { | 55 | { |
55 | } | 56 | } |
56 | 57 | ||
57 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
58 | |||
59 | /* printk's without a loglevel use this.. */ | 58 | /* printk's without a loglevel use this.. */ |
60 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 59 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
61 | 60 | ||
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); | |||
99 | static int console_locked, console_suspended; | 98 | static int console_locked, console_suspended; |
100 | 99 | ||
101 | /* | 100 | /* |
102 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | ||
103 | * It is also used in interesting ways to provide interlocking in | ||
104 | * console_unlock();. | ||
105 | */ | ||
106 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
107 | |||
108 | #define LOG_BUF_MASK (log_buf_len-1) | ||
109 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | ||
110 | |||
111 | /* | ||
112 | * The indices into log_buf are not constrained to log_buf_len - they | ||
113 | * must be masked before subscripting | ||
114 | */ | ||
115 | static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ | ||
116 | static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ | ||
117 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | ||
118 | |||
119 | /* | ||
120 | * If exclusive_console is non-NULL then only this console is to be printed to. | 101 | * If exclusive_console is non-NULL then only this console is to be printed to. |
121 | */ | 102 | */ |
122 | static struct console *exclusive_console; | 103 | static struct console *exclusive_console; |
@@ -145,13 +126,510 @@ EXPORT_SYMBOL(console_set_on_cmdline); | |||
145 | /* Flag: console code may call schedule() */ | 126 | /* Flag: console code may call schedule() */ |
146 | static int console_may_schedule; | 127 | static int console_may_schedule; |
147 | 128 | ||
129 | /* | ||
130 | * The printk log buffer consists of a chain of concatenated variable | ||
131 | * length records. Every record starts with a record header, containing | ||
132 | * the overall length of the record. | ||
133 | * | ||
134 | * The heads of the first and last entry in the buffer, as well as the | ||
135 | * sequence numbers of both entries, are maintained when messages | ||
136 | * are stored. | ||
137 | * | ||
138 | * If the heads indicate available messages, the length in the header | ||
139 | * tells where the next message starts. A length == 0 for the next message | ||
140 | * indicates a wrap-around to the beginning of the buffer. | ||
141 | * | ||
142 | * Every record carries the monotonic timestamp in nanoseconds, as well as | ||
143 | * the standard userspace syslog level and syslog facility. The usual | ||
144 | * kernel messages use LOG_KERN; userspace-injected messages always carry | ||
145 | * a matching syslog facility, by default LOG_USER. The origin of every | ||
146 | * message can be reliably determined that way. | ||
147 | * | ||
148 | * The human-readable log message directly follows the message header. The | ||
149 | * length of the message text is stored in the header; the stored message | ||
150 | * is not terminated. | ||
151 | * | ||
152 | * Optionally, a message can carry a dictionary of properties (key/value pairs), | ||
153 | * to provide userspace with a machine-readable message context. | ||
154 | * | ||
155 | * Examples for well-defined, commonly used property names are: | ||
156 | * DEVICE=b12:8 device identifier | ||
157 | * b12:8 block dev_t | ||
158 | * c127:3 char dev_t | ||
159 | * n8 netdev ifindex | ||
160 | * +sound:card0 subsystem:devname | ||
161 | * SUBSYSTEM=pci driver-core subsystem name | ||
162 | * | ||
163 | * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value | ||
164 | * follows directly after a '=' character. Every property is terminated by | ||
165 | * a '\0' character. The last property is not terminated. | ||
166 | * | ||
167 | * Example of a message structure: | ||
168 | * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec | ||
169 | * 0008 34 00 record is 52 bytes long | ||
170 | * 000a 0b 00 text is 11 bytes long | ||
171 | * 000c 16 00 dictionary is 22 bytes long | ||
172 | * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) | ||
173 | * 0010 69 74 27 73 20 61 20 6c "it's a l" | ||
174 | * 69 6e 65 "ine" | ||
175 | * 001b 44 45 56 49 43 "DEVIC" | ||
176 | * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" | ||
177 | * 52 49 56 45 52 3d 62 75 "RIVER=bu" | ||
178 | * 67 "g" | ||
179 | * 0031 00 00 00 padding to next message header | ||
180 | * | ||
181 | * The 'struct log' buffer header must never be directly exported to | ||
182 | * userspace; it is a kernel-private implementation detail that might | ||
183 | * need to be changed in the future, when the requirements change. | ||
184 | * | ||
185 | * /dev/kmsg exports the structured data in the following line format: | ||
186 | * "level,seqnum,timestamp;<message text>\n" | ||
187 | * | ||
188 | * The optional key/value pairs are attached as continuation lines starting | ||
189 | * with a space character and terminated by a newline. All possible | ||
190 | * non-printable characters are escaped in the "\xff" notation. | ||
191 | * | ||
192 | * Users of the export format should ignore possible additional values | ||
193 | * separated by ',', and find the message after the ';' character. | ||
194 | */ | ||
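
As a concrete illustration of the export format described above, a minimal user-space reader might look like the sketch below. It assumes only what the comment states: each read() returns one record, any extra comma-separated values before the ';' are skipped, and key/value continuation lines start with a space. Buffer sizes and variable names are illustrative, not part of the interface; the loop runs until interrupted.

/*
 * Hedged sketch of a /dev/kmsg reader, based only on the export format
 * documented above. Names and sizes are illustrative assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char rec[8192];
	int fd = open("/dev/kmsg", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return 1;

	while ((n = read(fd, rec, sizeof(rec) - 1)) > 0) {
		unsigned int prefix;
		unsigned long long seq, ts_usec;
		char *msg;

		rec[n] = '\0';
		/* "level,seqnum,timestamp[,...];<message>"; ignore unknown fields */
		if (sscanf(rec, "%u,%llu,%llu", &prefix, &seq, &ts_usec) != 3)
			continue;
		msg = strchr(rec, ';');
		if (!msg)
			continue;
		printf("facility=%u level=%u seq=%llu usec=%llu text=%s",
		       prefix >> 3, prefix & 7, seq, ts_usec, msg + 1);
		/* continuation lines with key=value pairs, if any, begin with ' ' */
	}
	close(fd);
	return 0;
}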
195 | |||
196 | enum log_flags { | ||
197 | LOG_NOCONS = 1, /* already flushed, do not print to console */ | ||
198 | LOG_NEWLINE = 2, /* text ended with a newline */ | ||
199 | LOG_PREFIX = 4, /* text started with a prefix */ | ||
200 | LOG_CONT = 8, /* text is a fragment of a continuation line */ | ||
201 | }; | ||
202 | |||
203 | struct log { | ||
204 | u64 ts_nsec; /* timestamp in nanoseconds */ | ||
205 | u16 len; /* length of entire record */ | ||
206 | u16 text_len; /* length of text buffer */ | ||
207 | u16 dict_len; /* length of dictionary buffer */ | ||
208 | u8 facility; /* syslog facility */ | ||
209 | u8 flags:5; /* internal record flags */ | ||
210 | u8 level:3; /* syslog level */ | ||
211 | }; | ||
212 | |||
213 | /* | ||
214 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | ||
215 | * used in interesting ways to provide interlocking in console_unlock(); | ||
216 | */ | ||
217 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
218 | |||
219 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | ||
220 | static u64 syslog_seq; | ||
221 | static u32 syslog_idx; | ||
222 | static enum log_flags syslog_prev; | ||
223 | static size_t syslog_partial; | ||
224 | |||
225 | /* index and sequence number of the first record stored in the buffer */ | ||
226 | static u64 log_first_seq; | ||
227 | static u32 log_first_idx; | ||
228 | |||
229 | /* index and sequence number of the next record to store in the buffer */ | ||
230 | static u64 log_next_seq; | ||
148 | #ifdef CONFIG_PRINTK | 231 | #ifdef CONFIG_PRINTK |
232 | static u32 log_next_idx; | ||
149 | 233 | ||
150 | static char __log_buf[__LOG_BUF_LEN]; | 234 | /* the next printk record to read after the last 'clear' command */ |
235 | static u64 clear_seq; | ||
236 | static u32 clear_idx; | ||
237 | |||
238 | #define LOG_LINE_MAX 1024 | ||
239 | |||
240 | /* record buffer */ | ||
241 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
242 | #define LOG_ALIGN 4 | ||
243 | #else | ||
244 | #define LOG_ALIGN __alignof__(struct log) | ||
245 | #endif | ||
246 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
247 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | ||
151 | static char *log_buf = __log_buf; | 248 | static char *log_buf = __log_buf; |
152 | static int log_buf_len = __LOG_BUF_LEN; | 249 | static u32 log_buf_len = __LOG_BUF_LEN; |
153 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 250 | |
154 | static int saved_console_loglevel = -1; | 251 | /* cpu currently holding logbuf_lock */ |
252 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
253 | |||
254 | /* human readable text of the record */ | ||
255 | static char *log_text(const struct log *msg) | ||
256 | { | ||
257 | return (char *)msg + sizeof(struct log); | ||
258 | } | ||
259 | |||
260 | /* optional key/value pair dictionary attached to the record */ | ||
261 | static char *log_dict(const struct log *msg) | ||
262 | { | ||
263 | return (char *)msg + sizeof(struct log) + msg->text_len; | ||
264 | } | ||
265 | |||
266 | /* get record by index; idx must point to valid msg */ | ||
267 | static struct log *log_from_idx(u32 idx) | ||
268 | { | ||
269 | struct log *msg = (struct log *)(log_buf + idx); | ||
270 | |||
271 | /* | ||
272 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
273 | * read the message at the start of the buffer. | ||
274 | */ | ||
275 | if (!msg->len) | ||
276 | return (struct log *)log_buf; | ||
277 | return msg; | ||
278 | } | ||
279 | |||
280 | /* get next record; idx must point to valid msg */ | ||
281 | static u32 log_next(u32 idx) | ||
282 | { | ||
283 | struct log *msg = (struct log *)(log_buf + idx); | ||
284 | |||
285 | /* length == 0 indicates the end of the buffer; wrap */ | ||
286 | /* | ||
287 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
288 | * read the message at the start of the buffer as *this* one, and | ||
289 | * return the one after that. | ||
290 | */ | ||
291 | if (!msg->len) { | ||
292 | msg = (struct log *)log_buf; | ||
293 | return msg->len; | ||
294 | } | ||
295 | return idx + msg->len; | ||
296 | } | ||
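
The two helpers above define the traversal pattern used throughout the rest of the file (syslog_print_all(), SYSLOG_ACTION_SIZE_UNREAD, console_unlock()). A hedged stand-alone sketch of that pattern follows; count_records() is a hypothetical helper, and like the real callers it must run with logbuf_lock held.

/* Hedged sketch of walking all stored records; caller holds logbuf_lock. */
static u32 count_records(void)
{
	u64 seq = log_first_seq;
	u32 idx = log_first_idx;
	u32 count = 0;

	while (seq < log_next_seq) {
		struct log *msg = log_from_idx(idx);

		(void)msg;		/* a real caller would format msg here */
		idx = log_next(idx);
		seq++;
		count++;
	}
	return count;
}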
297 | |||
298 | /* insert record into the buffer, discard old ones, update heads */ | ||
299 | static void log_store(int facility, int level, | ||
300 | enum log_flags flags, u64 ts_nsec, | ||
301 | const char *dict, u16 dict_len, | ||
302 | const char *text, u16 text_len) | ||
303 | { | ||
304 | struct log *msg; | ||
305 | u32 size, pad_len; | ||
306 | |||
307 | /* number of '\0' padding bytes to next message */ | ||
308 | size = sizeof(struct log) + text_len + dict_len; | ||
309 | pad_len = (-size) & (LOG_ALIGN - 1); | ||
310 | size += pad_len; | ||
311 | |||
312 | while (log_first_seq < log_next_seq) { | ||
313 | u32 free; | ||
314 | |||
315 | if (log_next_idx > log_first_idx) | ||
316 | free = max(log_buf_len - log_next_idx, log_first_idx); | ||
317 | else | ||
318 | free = log_first_idx - log_next_idx; | ||
319 | |||
320 | if (free > size + sizeof(struct log)) | ||
321 | break; | ||
322 | |||
323 | /* drop old messages until we have enough continuous space */ | ||
324 | log_first_idx = log_next(log_first_idx); | ||
325 | log_first_seq++; | ||
326 | } | ||
327 | |||
328 | if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { | ||
329 | /* | ||
330 | * This message + an additional empty header does not fit | ||
331 | * at the end of the buffer. Add an empty header with len == 0 | ||
332 | * to signify a wrap around. | ||
333 | */ | ||
334 | memset(log_buf + log_next_idx, 0, sizeof(struct log)); | ||
335 | log_next_idx = 0; | ||
336 | } | ||
337 | |||
338 | /* fill message */ | ||
339 | msg = (struct log *)(log_buf + log_next_idx); | ||
340 | memcpy(log_text(msg), text, text_len); | ||
341 | msg->text_len = text_len; | ||
342 | memcpy(log_dict(msg), dict, dict_len); | ||
343 | msg->dict_len = dict_len; | ||
344 | msg->facility = facility; | ||
345 | msg->level = level & 7; | ||
346 | msg->flags = flags & 0x1f; | ||
347 | if (ts_nsec > 0) | ||
348 | msg->ts_nsec = ts_nsec; | ||
349 | else | ||
350 | msg->ts_nsec = local_clock(); | ||
351 | memset(log_dict(msg) + dict_len, 0, pad_len); | ||
352 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | ||
353 | |||
354 | /* insert message */ | ||
355 | log_next_idx += msg->len; | ||
356 | log_next_seq++; | ||
357 | } | ||
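
The padding arithmetic in log_store() can be checked in isolation. The sketch below is a stand-alone illustration that assumes LOG_ALIGN == 4 and mirrors the (-size) & (LOG_ALIGN - 1) expression used above; it is not kernel code.

/* Stand-alone illustration of the record padding, assuming LOG_ALIGN == 4. */
#include <assert.h>

#define LOG_ALIGN 4

static unsigned int pad_to_align(unsigned int size)
{
	/* (-size) & (LOG_ALIGN - 1) == bytes needed to reach the next multiple */
	return (-size) & (LOG_ALIGN - 1);
}

int main(void)
{
	/* e.g. a 49-byte record body is padded by 3 bytes to 52 in total */
	assert(pad_to_align(49) == 3);
	assert(pad_to_align(52) == 0);	/* already aligned, no padding */
	return 0;
}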
358 | |||
359 | /* /dev/kmsg - userspace message inject/listen interface */ | ||
360 | struct devkmsg_user { | ||
361 | u64 seq; | ||
362 | u32 idx; | ||
363 | struct mutex lock; | ||
364 | char buf[8192]; | ||
365 | }; | ||
366 | |||
367 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | ||
368 | unsigned long count, loff_t pos) | ||
369 | { | ||
370 | char *buf, *line; | ||
371 | int i; | ||
372 | int level = default_message_loglevel; | ||
373 | int facility = 1; /* LOG_USER */ | ||
374 | size_t len = iov_length(iv, count); | ||
375 | ssize_t ret = len; | ||
376 | |||
377 | if (len > LOG_LINE_MAX) | ||
378 | return -EINVAL; | ||
379 | buf = kmalloc(len+1, GFP_KERNEL); | ||
380 | if (buf == NULL) | ||
381 | return -ENOMEM; | ||
382 | |||
383 | line = buf; | ||
384 | for (i = 0; i < count; i++) { | ||
385 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) | ||
386 | goto out; | ||
387 | line += iv[i].iov_len; | ||
388 | } | ||
389 | |||
390 | /* | ||
391 | * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace | ||
392 | * the decimal value encodes a 32-bit quantity: the lower 3 bits are the | ||
393 | * log level, the rest is the log facility. | ||
394 | * | ||
395 | * If no prefix or no userspace facility is specified, we | ||
396 | * enforce LOG_USER, to be able to reliably distinguish | ||
397 | * kernel-generated messages from userspace-injected ones. | ||
398 | */ | ||
399 | line = buf; | ||
400 | if (line[0] == '<') { | ||
401 | char *endp = NULL; | ||
402 | |||
403 | i = simple_strtoul(line+1, &endp, 10); | ||
404 | if (endp && endp[0] == '>') { | ||
405 | level = i & 7; | ||
406 | if (i >> 3) | ||
407 | facility = i >> 3; | ||
408 | endp++; | ||
409 | len -= endp - line; | ||
410 | line = endp; | ||
411 | } | ||
412 | } | ||
413 | line[len] = '\0'; | ||
414 | |||
415 | printk_emit(facility, level, NULL, 0, "%s", line); | ||
416 | out: | ||
417 | kfree(buf); | ||
418 | return ret; | ||
419 | } | ||
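
As a usage illustration of the prefix handling in devkmsg_writev() above, a hedged user-space sketch follows. The value 13 decodes to facility 1 (LOG_USER) and level 5 via the i >> 3 / i & 7 split; with no prefix, the message gets the default loglevel and LOG_USER is enforced as the facility. The program itself is illustrative, not a supported tool.

/*
 * Hedged sketch of injecting a message through /dev/kmsg. "<13>" is
 * facility 1 (LOG_USER) << 3 | level 5, matching the parsing above.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "<13>hello from userspace\n";
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, msg, strlen(msg)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}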
420 | |||
421 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | ||
422 | size_t count, loff_t *ppos) | ||
423 | { | ||
424 | struct devkmsg_user *user = file->private_data; | ||
425 | struct log *msg; | ||
426 | u64 ts_usec; | ||
427 | size_t i; | ||
428 | size_t len; | ||
429 | ssize_t ret; | ||
430 | |||
431 | if (!user) | ||
432 | return -EBADF; | ||
433 | |||
434 | ret = mutex_lock_interruptible(&user->lock); | ||
435 | if (ret) | ||
436 | return ret; | ||
437 | raw_spin_lock_irq(&logbuf_lock); | ||
438 | while (user->seq == log_next_seq) { | ||
439 | if (file->f_flags & O_NONBLOCK) { | ||
440 | ret = -EAGAIN; | ||
441 | raw_spin_unlock_irq(&logbuf_lock); | ||
442 | goto out; | ||
443 | } | ||
444 | |||
445 | raw_spin_unlock_irq(&logbuf_lock); | ||
446 | ret = wait_event_interruptible(log_wait, | ||
447 | user->seq != log_next_seq); | ||
448 | if (ret) | ||
449 | goto out; | ||
450 | raw_spin_lock_irq(&logbuf_lock); | ||
451 | } | ||
452 | |||
453 | if (user->seq < log_first_seq) { | ||
454 | /* our last seen message is gone, return error and reset */ | ||
455 | user->idx = log_first_idx; | ||
456 | user->seq = log_first_seq; | ||
457 | ret = -EPIPE; | ||
458 | raw_spin_unlock_irq(&logbuf_lock); | ||
459 | goto out; | ||
460 | } | ||
461 | |||
462 | msg = log_from_idx(user->idx); | ||
463 | ts_usec = msg->ts_nsec; | ||
464 | do_div(ts_usec, 1000); | ||
465 | len = sprintf(user->buf, "%u,%llu,%llu;", | ||
466 | (msg->facility << 3) | msg->level, user->seq, ts_usec); | ||
467 | |||
468 | /* escape non-printable characters */ | ||
469 | for (i = 0; i < msg->text_len; i++) { | ||
470 | unsigned char c = log_text(msg)[i]; | ||
471 | |||
472 | if (c < ' ' || c >= 127 || c == '\\') | ||
473 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
474 | else | ||
475 | user->buf[len++] = c; | ||
476 | } | ||
477 | user->buf[len++] = '\n'; | ||
478 | |||
479 | if (msg->dict_len) { | ||
480 | bool line = true; | ||
481 | |||
482 | for (i = 0; i < msg->dict_len; i++) { | ||
483 | unsigned char c = log_dict(msg)[i]; | ||
484 | |||
485 | if (line) { | ||
486 | user->buf[len++] = ' '; | ||
487 | line = false; | ||
488 | } | ||
489 | |||
490 | if (c == '\0') { | ||
491 | user->buf[len++] = '\n'; | ||
492 | line = true; | ||
493 | continue; | ||
494 | } | ||
495 | |||
496 | if (c < ' ' || c >= 127 || c == '\\') { | ||
497 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
498 | continue; | ||
499 | } | ||
500 | |||
501 | user->buf[len++] = c; | ||
502 | } | ||
503 | user->buf[len++] = '\n'; | ||
504 | } | ||
505 | |||
506 | user->idx = log_next(user->idx); | ||
507 | user->seq++; | ||
508 | raw_spin_unlock_irq(&logbuf_lock); | ||
509 | |||
510 | if (len > count) { | ||
511 | ret = -EINVAL; | ||
512 | goto out; | ||
513 | } | ||
514 | |||
515 | if (copy_to_user(buf, user->buf, len)) { | ||
516 | ret = -EFAULT; | ||
517 | goto out; | ||
518 | } | ||
519 | ret = len; | ||
520 | out: | ||
521 | mutex_unlock(&user->lock); | ||
522 | return ret; | ||
523 | } | ||
524 | |||
525 | static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | ||
526 | { | ||
527 | struct devkmsg_user *user = file->private_data; | ||
528 | loff_t ret = 0; | ||
529 | |||
530 | if (!user) | ||
531 | return -EBADF; | ||
532 | if (offset) | ||
533 | return -ESPIPE; | ||
534 | |||
535 | raw_spin_lock_irq(&logbuf_lock); | ||
536 | switch (whence) { | ||
537 | case SEEK_SET: | ||
538 | /* the first record */ | ||
539 | user->idx = log_first_idx; | ||
540 | user->seq = log_first_seq; | ||
541 | break; | ||
542 | case SEEK_DATA: | ||
543 | /* | ||
544 | * The first record after the last SYSLOG_ACTION_CLEAR, | ||
545 | * like issued by 'dmesg -c'. Reading /dev/kmsg itself | ||
546 | * changes no global state, and does not clear anything. | ||
547 | */ | ||
548 | user->idx = clear_idx; | ||
549 | user->seq = clear_seq; | ||
550 | break; | ||
551 | case SEEK_END: | ||
552 | /* after the last record */ | ||
553 | user->idx = log_next_idx; | ||
554 | user->seq = log_next_seq; | ||
555 | break; | ||
556 | default: | ||
557 | ret = -EINVAL; | ||
558 | } | ||
559 | raw_spin_unlock_irq(&logbuf_lock); | ||
560 | return ret; | ||
561 | } | ||
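
The seek semantics implemented above can be summarized from the caller's side: SEEK_SET rewinds to the oldest record, SEEK_DATA jumps to the first record after the last SYSLOG_ACTION_CLEAR, SEEK_END moves past the newest record, and any non-zero offset returns -ESPIPE. The sketch below is a hedged user-space illustration; replay_since_last_clear() is a hypothetical helper, and SEEK_DATA availability depends on the libc headers (_GNU_SOURCE with glibc).

/* Hedged sketch of using SEEK_DATA on an open /dev/kmsg descriptor. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int replay_since_last_clear(int kmsg_fd)
{
	/* start reading where 'dmesg -c' left off; does not clear anything */
	return lseek(kmsg_fd, 0, SEEK_DATA) < 0 ? -1 : 0;
}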
562 | |||
563 | static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | ||
564 | { | ||
565 | struct devkmsg_user *user = file->private_data; | ||
566 | int ret = 0; | ||
567 | |||
568 | if (!user) | ||
569 | return POLLERR|POLLNVAL; | ||
570 | |||
571 | poll_wait(file, &log_wait, wait); | ||
572 | |||
573 | raw_spin_lock_irq(&logbuf_lock); | ||
574 | if (user->seq < log_next_seq) { | ||
575 | /* return error when data has vanished underneath us */ | ||
576 | if (user->seq < log_first_seq) | ||
577 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | ||
578 | else ret = POLLIN|POLLRDNORM; | ||
579 | } | ||
580 | raw_spin_unlock_irq(&logbuf_lock); | ||
581 | |||
582 | return ret; | ||
583 | } | ||
584 | |||
585 | static int devkmsg_open(struct inode *inode, struct file *file) | ||
586 | { | ||
587 | struct devkmsg_user *user; | ||
588 | int err; | ||
589 | |||
590 | /* write-only does not need any file context */ | ||
591 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) | ||
592 | return 0; | ||
593 | |||
594 | err = security_syslog(SYSLOG_ACTION_READ_ALL); | ||
595 | if (err) | ||
596 | return err; | ||
597 | |||
598 | user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); | ||
599 | if (!user) | ||
600 | return -ENOMEM; | ||
601 | |||
602 | mutex_init(&user->lock); | ||
603 | |||
604 | raw_spin_lock_irq(&logbuf_lock); | ||
605 | user->idx = log_first_idx; | ||
606 | user->seq = log_first_seq; | ||
607 | raw_spin_unlock_irq(&logbuf_lock); | ||
608 | |||
609 | file->private_data = user; | ||
610 | return 0; | ||
611 | } | ||
612 | |||
613 | static int devkmsg_release(struct inode *inode, struct file *file) | ||
614 | { | ||
615 | struct devkmsg_user *user = file->private_data; | ||
616 | |||
617 | if (!user) | ||
618 | return 0; | ||
619 | |||
620 | mutex_destroy(&user->lock); | ||
621 | kfree(user); | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | const struct file_operations kmsg_fops = { | ||
626 | .open = devkmsg_open, | ||
627 | .read = devkmsg_read, | ||
628 | .aio_write = devkmsg_writev, | ||
629 | .llseek = devkmsg_llseek, | ||
630 | .poll = devkmsg_poll, | ||
631 | .release = devkmsg_release, | ||
632 | }; | ||
155 | 633 | ||
156 | #ifdef CONFIG_KEXEC | 634 | #ifdef CONFIG_KEXEC |
157 | /* | 635 | /* |
@@ -165,9 +643,9 @@ static int saved_console_loglevel = -1; | |||
165 | void log_buf_kexec_setup(void) | 643 | void log_buf_kexec_setup(void) |
166 | { | 644 | { |
167 | VMCOREINFO_SYMBOL(log_buf); | 645 | VMCOREINFO_SYMBOL(log_buf); |
168 | VMCOREINFO_SYMBOL(log_end); | ||
169 | VMCOREINFO_SYMBOL(log_buf_len); | 646 | VMCOREINFO_SYMBOL(log_buf_len); |
170 | VMCOREINFO_SYMBOL(logged_chars); | 647 | VMCOREINFO_SYMBOL(log_first_idx); |
648 | VMCOREINFO_SYMBOL(log_next_idx); | ||
171 | } | 649 | } |
172 | #endif | 650 | #endif |
173 | 651 | ||
@@ -191,7 +669,6 @@ early_param("log_buf_len", log_buf_len_setup); | |||
191 | void __init setup_log_buf(int early) | 669 | void __init setup_log_buf(int early) |
192 | { | 670 | { |
193 | unsigned long flags; | 671 | unsigned long flags; |
194 | unsigned start, dest_idx, offset; | ||
195 | char *new_log_buf; | 672 | char *new_log_buf; |
196 | int free; | 673 | int free; |
197 | 674 | ||
@@ -219,20 +696,8 @@ void __init setup_log_buf(int early) | |||
219 | log_buf_len = new_log_buf_len; | 696 | log_buf_len = new_log_buf_len; |
220 | log_buf = new_log_buf; | 697 | log_buf = new_log_buf; |
221 | new_log_buf_len = 0; | 698 | new_log_buf_len = 0; |
222 | free = __LOG_BUF_LEN - log_end; | 699 | free = __LOG_BUF_LEN - log_next_idx; |
223 | 700 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | |
224 | offset = start = min(con_start, log_start); | ||
225 | dest_idx = 0; | ||
226 | while (start != log_end) { | ||
227 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
228 | |||
229 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
230 | start++; | ||
231 | dest_idx++; | ||
232 | } | ||
233 | log_start -= offset; | ||
234 | con_start -= offset; | ||
235 | log_end -= offset; | ||
236 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 701 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
237 | 702 | ||
238 | pr_info("log_buf_len: %d\n", log_buf_len); | 703 | pr_info("log_buf_len: %d\n", log_buf_len); |
@@ -332,11 +797,270 @@ static int check_syslog_permissions(int type, bool from_file) | |||
332 | return 0; | 797 | return 0; |
333 | } | 798 | } |
334 | 799 | ||
800 | #if defined(CONFIG_PRINTK_TIME) | ||
801 | static bool printk_time = 1; | ||
802 | #else | ||
803 | static bool printk_time; | ||
804 | #endif | ||
805 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
806 | |||
807 | static size_t print_time(u64 ts, char *buf) | ||
808 | { | ||
809 | unsigned long rem_nsec; | ||
810 | |||
811 | if (!printk_time) | ||
812 | return 0; | ||
813 | |||
814 | if (!buf) | ||
815 | return 15; | ||
816 | |||
817 | rem_nsec = do_div(ts, 1000000000); | ||
818 | return sprintf(buf, "[%5lu.%06lu] ", | ||
819 | (unsigned long)ts, rem_nsec / 1000); | ||
820 | } | ||
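
A quick stand-alone check of the "[seconds.microseconds] " formatting produced by print_time() above; the numbers are illustrative, and the user-space do_div() stand-in is an assumption.

/* Stand-alone check of the timestamp prefix format used above. */
#include <stdio.h>

int main(void)
{
	unsigned long long ts = 5034567890ULL;		/* nanoseconds */
	unsigned long rem_nsec = ts % 1000000000UL;	/* do_div() remainder */

	ts /= 1000000000UL;
	printf("[%5lu.%06lu] \n", (unsigned long)ts, rem_nsec / 1000);
	/* prints "[    5.034567] " */
	return 0;
}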
821 | |||
822 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | ||
823 | { | ||
824 | size_t len = 0; | ||
825 | unsigned int prefix = (msg->facility << 3) | msg->level; | ||
826 | |||
827 | if (syslog) { | ||
828 | if (buf) { | ||
829 | len += sprintf(buf, "<%u>", prefix); | ||
830 | } else { | ||
831 | len += 3; | ||
832 | if (prefix > 999) | ||
833 | len += 3; | ||
834 | else if (prefix > 99) | ||
835 | len += 2; | ||
836 | else if (prefix > 9) | ||
837 | len++; | ||
838 | } | ||
839 | } | ||
840 | |||
841 | len += print_time(msg->ts_nsec, buf ? buf + len : NULL); | ||
842 | return len; | ||
843 | } | ||
844 | |||
845 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, | ||
846 | bool syslog, char *buf, size_t size) | ||
847 | { | ||
848 | const char *text = log_text(msg); | ||
849 | size_t text_size = msg->text_len; | ||
850 | bool prefix = true; | ||
851 | bool newline = true; | ||
852 | size_t len = 0; | ||
853 | |||
854 | if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) | ||
855 | prefix = false; | ||
856 | |||
857 | if (msg->flags & LOG_CONT) { | ||
858 | if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) | ||
859 | prefix = false; | ||
860 | |||
861 | if (!(msg->flags & LOG_NEWLINE)) | ||
862 | newline = false; | ||
863 | } | ||
864 | |||
865 | do { | ||
866 | const char *next = memchr(text, '\n', text_size); | ||
867 | size_t text_len; | ||
868 | |||
869 | if (next) { | ||
870 | text_len = next - text; | ||
871 | next++; | ||
872 | text_size -= next - text; | ||
873 | } else { | ||
874 | text_len = text_size; | ||
875 | } | ||
876 | |||
877 | if (buf) { | ||
878 | if (print_prefix(msg, syslog, NULL) + | ||
879 | text_len + 1 >= size - len) | ||
880 | break; | ||
881 | |||
882 | if (prefix) | ||
883 | len += print_prefix(msg, syslog, buf + len); | ||
884 | memcpy(buf + len, text, text_len); | ||
885 | len += text_len; | ||
886 | if (next || newline) | ||
887 | buf[len++] = '\n'; | ||
888 | } else { | ||
889 | /* SYSLOG_ACTION_* buffer size only calculation */ | ||
890 | if (prefix) | ||
891 | len += print_prefix(msg, syslog, NULL); | ||
892 | len += text_len; | ||
893 | if (next || newline) | ||
894 | len++; | ||
895 | } | ||
896 | |||
897 | prefix = true; | ||
898 | text = next; | ||
899 | } while (text); | ||
900 | |||
901 | return len; | ||
902 | } | ||
903 | |||
904 | static int syslog_print(char __user *buf, int size) | ||
905 | { | ||
906 | char *text; | ||
907 | struct log *msg; | ||
908 | int len = 0; | ||
909 | |||
910 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
911 | if (!text) | ||
912 | return -ENOMEM; | ||
913 | |||
914 | while (size > 0) { | ||
915 | size_t n; | ||
916 | size_t skip; | ||
917 | |||
918 | raw_spin_lock_irq(&logbuf_lock); | ||
919 | if (syslog_seq < log_first_seq) { | ||
920 | /* messages are gone, move to first one */ | ||
921 | syslog_seq = log_first_seq; | ||
922 | syslog_idx = log_first_idx; | ||
923 | syslog_prev = 0; | ||
924 | syslog_partial = 0; | ||
925 | } | ||
926 | if (syslog_seq == log_next_seq) { | ||
927 | raw_spin_unlock_irq(&logbuf_lock); | ||
928 | break; | ||
929 | } | ||
930 | |||
931 | skip = syslog_partial; | ||
932 | msg = log_from_idx(syslog_idx); | ||
933 | n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); | ||
934 | if (n - syslog_partial <= size) { | ||
935 | /* message fits into buffer, move forward */ | ||
936 | syslog_idx = log_next(syslog_idx); | ||
937 | syslog_seq++; | ||
938 | syslog_prev = msg->flags; | ||
939 | n -= syslog_partial; | ||
940 | syslog_partial = 0; | ||
941 | } else if (!len) { | ||
942 | /* partial read(), remember position */ | ||
943 | n = size; | ||
944 | syslog_partial += n; | ||
945 | } else | ||
946 | n = 0; | ||
947 | raw_spin_unlock_irq(&logbuf_lock); | ||
948 | |||
949 | if (!n) | ||
950 | break; | ||
951 | |||
952 | if (copy_to_user(buf, text + skip, n)) { | ||
953 | if (!len) | ||
954 | len = -EFAULT; | ||
955 | break; | ||
956 | } | ||
957 | |||
958 | len += n; | ||
959 | size -= n; | ||
960 | buf += n; | ||
961 | } | ||
962 | |||
963 | kfree(text); | ||
964 | return len; | ||
965 | } | ||
966 | |||
967 | static int syslog_print_all(char __user *buf, int size, bool clear) | ||
968 | { | ||
969 | char *text; | ||
970 | int len = 0; | ||
971 | |||
972 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
973 | if (!text) | ||
974 | return -ENOMEM; | ||
975 | |||
976 | raw_spin_lock_irq(&logbuf_lock); | ||
977 | if (buf) { | ||
978 | u64 next_seq; | ||
979 | u64 seq; | ||
980 | u32 idx; | ||
981 | enum log_flags prev; | ||
982 | |||
983 | if (clear_seq < log_first_seq) { | ||
984 | /* messages are gone, move to first available one */ | ||
985 | clear_seq = log_first_seq; | ||
986 | clear_idx = log_first_idx; | ||
987 | } | ||
988 | |||
989 | /* | ||
990 | * Find first record that fits, including all following records, | ||
991 | * into the user-provided buffer for this dump. | ||
992 | */ | ||
993 | seq = clear_seq; | ||
994 | idx = clear_idx; | ||
995 | prev = 0; | ||
996 | while (seq < log_next_seq) { | ||
997 | struct log *msg = log_from_idx(idx); | ||
998 | |||
999 | len += msg_print_text(msg, prev, true, NULL, 0); | ||
1000 | idx = log_next(idx); | ||
1001 | seq++; | ||
1002 | } | ||
1003 | |||
1004 | /* move first record forward until length fits into the buffer */ | ||
1005 | seq = clear_seq; | ||
1006 | idx = clear_idx; | ||
1007 | prev = 0; | ||
1008 | while (len > size && seq < log_next_seq) { | ||
1009 | struct log *msg = log_from_idx(idx); | ||
1010 | |||
1011 | len -= msg_print_text(msg, prev, true, NULL, 0); | ||
1012 | idx = log_next(idx); | ||
1013 | seq++; | ||
1014 | } | ||
1015 | |||
1016 | /* last message fitting into this dump */ | ||
1017 | next_seq = log_next_seq; | ||
1018 | |||
1019 | len = 0; | ||
1020 | prev = 0; | ||
1021 | while (len >= 0 && seq < next_seq) { | ||
1022 | struct log *msg = log_from_idx(idx); | ||
1023 | int textlen; | ||
1024 | |||
1025 | textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); | ||
1026 | if (textlen < 0) { | ||
1027 | len = textlen; | ||
1028 | break; | ||
1029 | } | ||
1030 | idx = log_next(idx); | ||
1031 | seq++; | ||
1032 | prev = msg->flags; | ||
1033 | |||
1034 | raw_spin_unlock_irq(&logbuf_lock); | ||
1035 | if (copy_to_user(buf + len, text, textlen)) | ||
1036 | len = -EFAULT; | ||
1037 | else | ||
1038 | len += textlen; | ||
1039 | raw_spin_lock_irq(&logbuf_lock); | ||
1040 | |||
1041 | if (seq < log_first_seq) { | ||
1042 | /* messages are gone, move to next one */ | ||
1043 | seq = log_first_seq; | ||
1044 | idx = log_first_idx; | ||
1045 | prev = 0; | ||
1046 | } | ||
1047 | } | ||
1048 | } | ||
1049 | |||
1050 | if (clear) { | ||
1051 | clear_seq = log_next_seq; | ||
1052 | clear_idx = log_next_idx; | ||
1053 | } | ||
1054 | raw_spin_unlock_irq(&logbuf_lock); | ||
1055 | |||
1056 | kfree(text); | ||
1057 | return len; | ||
1058 | } | ||
1059 | |||
335 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 1060 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
336 | { | 1061 | { |
337 | unsigned i, j, limit, count; | 1062 | bool clear = false; |
338 | int do_clear = 0; | 1063 | static int saved_console_loglevel = -1; |
339 | char c; | ||
340 | int error; | 1064 | int error; |
341 | 1065 | ||
342 | error = check_syslog_permissions(type, from_file); | 1066 | error = check_syslog_permissions(type, from_file); |
@@ -364,28 +1088,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
364 | goto out; | 1088 | goto out; |
365 | } | 1089 | } |
366 | error = wait_event_interruptible(log_wait, | 1090 | error = wait_event_interruptible(log_wait, |
367 | (log_start - log_end)); | 1091 | syslog_seq != log_next_seq); |
368 | if (error) | 1092 | if (error) |
369 | goto out; | 1093 | goto out; |
370 | i = 0; | 1094 | error = syslog_print(buf, len); |
371 | raw_spin_lock_irq(&logbuf_lock); | ||
372 | while (!error && (log_start != log_end) && i < len) { | ||
373 | c = LOG_BUF(log_start); | ||
374 | log_start++; | ||
375 | raw_spin_unlock_irq(&logbuf_lock); | ||
376 | error = __put_user(c,buf); | ||
377 | buf++; | ||
378 | i++; | ||
379 | cond_resched(); | ||
380 | raw_spin_lock_irq(&logbuf_lock); | ||
381 | } | ||
382 | raw_spin_unlock_irq(&logbuf_lock); | ||
383 | if (!error) | ||
384 | error = i; | ||
385 | break; | 1095 | break; |
386 | /* Read/clear last kernel messages */ | 1096 | /* Read/clear last kernel messages */ |
387 | case SYSLOG_ACTION_READ_CLEAR: | 1097 | case SYSLOG_ACTION_READ_CLEAR: |
388 | do_clear = 1; | 1098 | clear = true; |
389 | /* FALL THRU */ | 1099 | /* FALL THRU */ |
390 | /* Read last kernel messages */ | 1100 | /* Read last kernel messages */ |
391 | case SYSLOG_ACTION_READ_ALL: | 1101 | case SYSLOG_ACTION_READ_ALL: |
@@ -399,51 +1109,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | error = -EFAULT; | 1109 | error = -EFAULT; |
400 | goto out; | 1110 | goto out; |
401 | } | 1111 | } |
402 | count = len; | 1112 | error = syslog_print_all(buf, len, clear); |
403 | if (count > log_buf_len) | ||
404 | count = log_buf_len; | ||
405 | raw_spin_lock_irq(&logbuf_lock); | ||
406 | if (count > logged_chars) | ||
407 | count = logged_chars; | ||
408 | if (do_clear) | ||
409 | logged_chars = 0; | ||
410 | limit = log_end; | ||
411 | /* | ||
412 | * __put_user() could sleep, and while we sleep | ||
413 | * printk() could overwrite the messages | ||
414 | * we try to copy to user space. Therefore | ||
415 | * the messages are copied in reverse. <manfreds> | ||
416 | */ | ||
417 | for (i = 0; i < count && !error; i++) { | ||
418 | j = limit-1-i; | ||
419 | if (j + log_buf_len < log_end) | ||
420 | break; | ||
421 | c = LOG_BUF(j); | ||
422 | raw_spin_unlock_irq(&logbuf_lock); | ||
423 | error = __put_user(c,&buf[count-1-i]); | ||
424 | cond_resched(); | ||
425 | raw_spin_lock_irq(&logbuf_lock); | ||
426 | } | ||
427 | raw_spin_unlock_irq(&logbuf_lock); | ||
428 | if (error) | ||
429 | break; | ||
430 | error = i; | ||
431 | if (i != count) { | ||
432 | int offset = count-error; | ||
433 | /* buffer overflow during copy, correct user buffer. */ | ||
434 | for (i = 0; i < error; i++) { | ||
435 | if (__get_user(c,&buf[i+offset]) || | ||
436 | __put_user(c,&buf[i])) { | ||
437 | error = -EFAULT; | ||
438 | break; | ||
439 | } | ||
440 | cond_resched(); | ||
441 | } | ||
442 | } | ||
443 | break; | 1113 | break; |
444 | /* Clear ring buffer */ | 1114 | /* Clear ring buffer */ |
445 | case SYSLOG_ACTION_CLEAR: | 1115 | case SYSLOG_ACTION_CLEAR: |
446 | logged_chars = 0; | 1116 | syslog_print_all(NULL, 0, true); |
447 | break; | 1117 | break; |
448 | /* Disable logging to console */ | 1118 | /* Disable logging to console */ |
449 | case SYSLOG_ACTION_CONSOLE_OFF: | 1119 | case SYSLOG_ACTION_CONSOLE_OFF: |
@@ -472,7 +1142,38 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
472 | break; | 1142 | break; |
473 | /* Number of chars in the log buffer */ | 1143 | /* Number of chars in the log buffer */ |
474 | case SYSLOG_ACTION_SIZE_UNREAD: | 1144 | case SYSLOG_ACTION_SIZE_UNREAD: |
475 | error = log_end - log_start; | 1145 | raw_spin_lock_irq(&logbuf_lock); |
1146 | if (syslog_seq < log_first_seq) { | ||
1147 | /* messages are gone, move to first one */ | ||
1148 | syslog_seq = log_first_seq; | ||
1149 | syslog_idx = log_first_idx; | ||
1150 | syslog_prev = 0; | ||
1151 | syslog_partial = 0; | ||
1152 | } | ||
1153 | if (from_file) { | ||
1154 | /* | ||
1155 | * Short-cut for poll(/proc/kmsg) which simply checks | ||
1156 | * for pending data, not the size; return the count of | ||
1157 | * records, not the length. | ||
1158 | */ | ||
1159 | error = log_next_idx - syslog_idx; | ||
1160 | } else { | ||
1161 | u64 seq = syslog_seq; | ||
1162 | u32 idx = syslog_idx; | ||
1163 | enum log_flags prev = syslog_prev; | ||
1164 | |||
1165 | error = 0; | ||
1166 | while (seq < log_next_seq) { | ||
1167 | struct log *msg = log_from_idx(idx); | ||
1168 | |||
1169 | error += msg_print_text(msg, prev, true, NULL, 0); | ||
1170 | idx = log_next(idx); | ||
1171 | seq++; | ||
1172 | prev = msg->flags; | ||
1173 | } | ||
1174 | error -= syslog_partial; | ||
1175 | } | ||
1176 | raw_spin_unlock_irq(&logbuf_lock); | ||
476 | break; | 1177 | break; |
477 | /* Size of the log buffer */ | 1178 | /* Size of the log buffer */ |
478 | case SYSLOG_ACTION_SIZE_BUFFER: | 1179 | case SYSLOG_ACTION_SIZE_BUFFER: |
@@ -491,39 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
491 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1192 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
492 | } | 1193 | } |
493 | 1194 | ||
494 | #ifdef CONFIG_KGDB_KDB | ||
495 | /* kdb dmesg command needs access to the syslog buffer. do_syslog() | ||
496 | * uses locks so it cannot be used during debugging. Just tell kdb | ||
497 | * where the start and end of the physical and logical logs are. This | ||
498 | * is equivalent to do_syslog(3). | ||
499 | */ | ||
500 | void kdb_syslog_data(char *syslog_data[4]) | ||
501 | { | ||
502 | syslog_data[0] = log_buf; | ||
503 | syslog_data[1] = log_buf + log_buf_len; | ||
504 | syslog_data[2] = log_buf + log_end - | ||
505 | (logged_chars < log_buf_len ? logged_chars : log_buf_len); | ||
506 | syslog_data[3] = log_buf + log_end; | ||
507 | } | ||
508 | #endif /* CONFIG_KGDB_KDB */ | ||
509 | |||
510 | /* | ||
511 | * Call the console drivers on a range of log_buf | ||
512 | */ | ||
513 | static void __call_console_drivers(unsigned start, unsigned end) | ||
514 | { | ||
515 | struct console *con; | ||
516 | |||
517 | for_each_console(con) { | ||
518 | if (exclusive_console && con != exclusive_console) | ||
519 | continue; | ||
520 | if ((con->flags & CON_ENABLED) && con->write && | ||
521 | (cpu_online(smp_processor_id()) || | ||
522 | (con->flags & CON_ANYTIME))) | ||
523 | con->write(con, &LOG_BUF(start), end - start); | ||
524 | } | ||
525 | } | ||
526 | |||
527 | static bool __read_mostly ignore_loglevel; | 1195 | static bool __read_mostly ignore_loglevel; |
528 | 1196 | ||
529 | static int __init ignore_loglevel_setup(char *str) | 1197 | static int __init ignore_loglevel_setup(char *str) |
@@ -540,142 +1208,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | |||
540 | "print all kernel messages to the console."); | 1208 | "print all kernel messages to the console."); |
541 | 1209 | ||
542 | /* | 1210 | /* |
543 | * Write out chars from start to end - 1 inclusive | ||
544 | */ | ||
545 | static void _call_console_drivers(unsigned start, | ||
546 | unsigned end, int msg_log_level) | ||
547 | { | ||
548 | trace_console(&LOG_BUF(0), start, end, log_buf_len); | ||
549 | |||
550 | if ((msg_log_level < console_loglevel || ignore_loglevel) && | ||
551 | console_drivers && start != end) { | ||
552 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | ||
553 | /* wrapped write */ | ||
554 | __call_console_drivers(start & LOG_BUF_MASK, | ||
555 | log_buf_len); | ||
556 | __call_console_drivers(0, end & LOG_BUF_MASK); | ||
557 | } else { | ||
558 | __call_console_drivers(start, end); | ||
559 | } | ||
560 | } | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the | ||
565 | * lower 3 bit are the log level, the rest are the log facility. In case | ||
566 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
567 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
568 | * to extract the correct log level for in-kernel processing, and not mangle | ||
569 | * the original value. | ||
570 | * | ||
571 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
572 | * passed, it will be filled in with the log level without a possible facility | ||
573 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
574 | * and returned. If no valid header is found, 0 is returned and the passed | ||
575 | * variables are not touched. | ||
576 | */ | ||
577 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
578 | { | ||
579 | unsigned int lev = 0; | ||
580 | char sp = '\0'; | ||
581 | size_t len; | ||
582 | |||
583 | if (p[0] != '<' || !p[1]) | ||
584 | return 0; | ||
585 | if (p[2] == '>') { | ||
586 | /* usual single digit level number or special char */ | ||
587 | switch (p[1]) { | ||
588 | case '0' ... '7': | ||
589 | lev = p[1] - '0'; | ||
590 | break; | ||
591 | case 'c': /* KERN_CONT */ | ||
592 | case 'd': /* KERN_DEFAULT */ | ||
593 | sp = p[1]; | ||
594 | break; | ||
595 | default: | ||
596 | return 0; | ||
597 | } | ||
598 | len = 3; | ||
599 | } else { | ||
600 | /* multi digit including the level and facility number */ | ||
601 | char *endp = NULL; | ||
602 | |||
603 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
604 | if (endp == NULL || endp[0] != '>') | ||
605 | return 0; | ||
606 | len = (endp + 1) - p; | ||
607 | } | ||
608 | |||
609 | /* do not accept special char if not asked for */ | ||
610 | if (sp && !special) | ||
611 | return 0; | ||
612 | |||
613 | if (special) { | ||
614 | *special = sp; | ||
615 | /* return special char, do not touch level */ | ||
616 | if (sp) | ||
617 | return len; | ||
618 | } | ||
619 | |||
620 | if (level) | ||
621 | *level = lev; | ||
622 | return len; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * Call the console drivers, asking them to write out | 1211 | * Call the console drivers, asking them to write out |
627 | * log_buf[start] to log_buf[end - 1]. | 1212 | * log_buf[start] to log_buf[end - 1]. |
628 | * The console_lock must be held. | 1213 | * The console_lock must be held. |
629 | */ | 1214 | */ |
630 | static void call_console_drivers(unsigned start, unsigned end) | 1215 | static void call_console_drivers(int level, const char *text, size_t len) |
631 | { | 1216 | { |
632 | unsigned cur_index, start_print; | 1217 | struct console *con; |
633 | static int msg_level = -1; | ||
634 | 1218 | ||
635 | BUG_ON(((int)(start - end)) > 0); | 1219 | trace_console(text, 0, len, len); |
636 | 1220 | ||
637 | cur_index = start; | 1221 | if (level >= console_loglevel && !ignore_loglevel) |
638 | start_print = start; | 1222 | return; |
639 | while (cur_index != end) { | 1223 | if (!console_drivers) |
640 | if (msg_level < 0 && ((end - cur_index) > 2)) { | 1224 | return; |
641 | /* strip log prefix */ | ||
642 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); | ||
643 | start_print = cur_index; | ||
644 | } | ||
645 | while (cur_index != end) { | ||
646 | char c = LOG_BUF(cur_index); | ||
647 | |||
648 | cur_index++; | ||
649 | if (c == '\n') { | ||
650 | if (msg_level < 0) { | ||
651 | /* | ||
652 | * printk() has already given us loglevel tags in | ||
653 | * the buffer. This code is here in case the | ||
654 | * log buffer has wrapped right round and scribbled | ||
655 | * on those tags | ||
656 | */ | ||
657 | msg_level = default_message_loglevel; | ||
658 | } | ||
659 | _call_console_drivers(start_print, cur_index, msg_level); | ||
660 | msg_level = -1; | ||
661 | start_print = cur_index; | ||
662 | break; | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | _call_console_drivers(start_print, end, msg_level); | ||
667 | } | ||
668 | 1225 | ||
669 | static void emit_log_char(char c) | 1226 | for_each_console(con) { |
670 | { | 1227 | if (exclusive_console && con != exclusive_console) |
671 | LOG_BUF(log_end) = c; | 1228 | continue; |
672 | log_end++; | 1229 | if (!(con->flags & CON_ENABLED)) |
673 | if (log_end - log_start > log_buf_len) | 1230 | continue; |
674 | log_start = log_end - log_buf_len; | 1231 | if (!con->write) |
675 | if (log_end - con_start > log_buf_len) | 1232 | continue; |
676 | con_start = log_end - log_buf_len; | 1233 | if (!cpu_online(smp_processor_id()) && |
677 | if (logged_chars < log_buf_len) | 1234 | !(con->flags & CON_ANYTIME)) |
678 | logged_chars++; | 1235 | continue; |
1236 | con->write(con, text, len); | ||
1237 | } | ||
679 | } | 1238 | } |
680 | 1239 | ||
681 | /* | 1240 | /* |
@@ -700,16 +1259,6 @@ static void zap_locks(void) | |||
700 | sema_init(&console_sem, 1); | 1259 | sema_init(&console_sem, 1); |
701 | } | 1260 | } |
702 | 1261 | ||
703 | #if defined(CONFIG_PRINTK_TIME) | ||
704 | static bool printk_time = 1; | ||
705 | #else | ||
706 | static bool printk_time = 0; | ||
707 | #endif | ||
708 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
709 | |||
710 | static bool always_kmsg_dump; | ||
711 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
712 | |||
713 | /* Check if we have any console registered that can be called early in boot. */ | 1262 | /* Check if we have any console registered that can be called early in boot. */ |
714 | static int have_callable_console(void) | 1263 | static int have_callable_console(void) |
715 | { | 1264 | { |
@@ -722,51 +1271,6 @@ static int have_callable_console(void) | |||
722 | return 0; | 1271 | return 0; |
723 | } | 1272 | } |
724 | 1273 | ||
725 | /** | ||
726 | * printk - print a kernel message | ||
727 | * @fmt: format string | ||
728 | * | ||
729 | * This is printk(). It can be called from any context. We want it to work. | ||
730 | * | ||
731 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and | ||
732 | * call the console drivers. If we fail to get the semaphore we place the output | ||
733 | * into the log buffer and return. The current holder of the console_sem will | ||
734 | * notice the new output in console_unlock(); and will send it to the | ||
735 | * consoles before releasing the lock. | ||
736 | * | ||
737 | * One effect of this deferred printing is that code which calls printk() and | ||
738 | * then changes console_loglevel may break. This is because console_loglevel | ||
739 | * is inspected when the actual printing occurs. | ||
740 | * | ||
741 | * See also: | ||
742 | * printf(3) | ||
743 | * | ||
744 | * See the vsnprintf() documentation for format string extensions over C99. | ||
745 | */ | ||
746 | |||
747 | asmlinkage int printk(const char *fmt, ...) | ||
748 | { | ||
749 | va_list args; | ||
750 | int r; | ||
751 | |||
752 | #ifdef CONFIG_KGDB_KDB | ||
753 | if (unlikely(kdb_trap_printk)) { | ||
754 | va_start(args, fmt); | ||
755 | r = vkdb_printf(fmt, args); | ||
756 | va_end(args); | ||
757 | return r; | ||
758 | } | ||
759 | #endif | ||
760 | va_start(args, fmt); | ||
761 | r = vprintk(fmt, args); | ||
762 | va_end(args); | ||
763 | |||
764 | return r; | ||
765 | } | ||
766 | |||
767 | /* cpu currently holding logbuf_lock */ | ||
768 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
769 | |||
770 | /* | 1274 | /* |
771 | * Can we actually use the console at this time on this cpu? | 1275 | * Can we actually use the console at this time on this cpu? |
772 | * | 1276 | * |
@@ -810,17 +1314,12 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
810 | retval = 0; | 1314 | retval = 0; |
811 | } | 1315 | } |
812 | } | 1316 | } |
813 | printk_cpu = UINT_MAX; | 1317 | logbuf_cpu = UINT_MAX; |
814 | if (wake) | 1318 | if (wake) |
815 | up(&console_sem); | 1319 | up(&console_sem); |
816 | raw_spin_unlock(&logbuf_lock); | 1320 | raw_spin_unlock(&logbuf_lock); |
817 | return retval; | 1321 | return retval; |
818 | } | 1322 | } |
819 | static const char recursion_bug_msg [] = | ||
820 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
821 | static int recursion_bug; | ||
822 | static int new_text_line = 1; | ||
823 | static char printk_buf[1024]; | ||
824 | 1323 | ||
825 | int printk_delay_msec __read_mostly; | 1324 | int printk_delay_msec __read_mostly; |
826 | 1325 | ||
@@ -836,15 +1335,99 @@ static inline void printk_delay(void) | |||
836 | } | 1335 | } |
837 | } | 1336 | } |
838 | 1337 | ||
839 | asmlinkage int vprintk(const char *fmt, va_list args) | 1338 | /* |
1339 | * Continuation lines are buffered, and not committed to the record buffer | ||
1340 | * until the line is complete, or a race forces it. The line fragments | ||
1341 | * though, are printed immediately to the consoles to ensure everything has | ||
1342 | * reached the console in case of a kernel crash. | ||
1343 | */ | ||
1344 | static struct cont { | ||
1345 | char buf[LOG_LINE_MAX]; | ||
1346 | size_t len; /* length == 0 means unused buffer */ | ||
1347 | size_t cons; /* bytes written to console */ | ||
1348 | struct task_struct *owner; /* task of first print */ | ||
1349 | u64 ts_nsec; /* time of first print */ | ||
1350 | u8 level; /* log level of first message */ | ||
1351 | u8 facility; /* log facility of first message */ | ||
1352 | bool flushed:1; /* buffer sealed and committed */ | ||
1353 | } cont; | ||
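
The effect of the cont buffer above, seen from the caller's side, is that line fragments printed by the same task without a trailing newline are merged into a single record and committed when the newline arrives (or when a prefix or another task forces a flush). A hedged illustration follows; example_progress() is a hypothetical caller, not part of this patch.

/*
 * Hedged illustration of how the cont buffer merges fragments: three
 * printk() calls from the same task become one record once the
 * trailing newline arrives.
 */
static void example_progress(void)
{
	printk(KERN_INFO "stage one ..");	/* starts the cont buffer */
	printk(KERN_CONT " stage two ..");	/* appended, still buffered */
	printk(KERN_CONT " done\n");		/* newline: flushed as one record */
}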
1354 | |||
1355 | static void cont_flush(void) | ||
840 | { | 1356 | { |
841 | int printed_len = 0; | 1357 | if (cont.flushed) |
842 | int current_log_level = default_message_loglevel; | 1358 | return; |
1359 | if (cont.len == 0) | ||
1360 | return; | ||
1361 | |||
1362 | log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, | ||
1363 | NULL, 0, cont.buf, cont.len); | ||
1364 | |||
1365 | cont.flushed = true; | ||
1366 | } | ||
1367 | |||
1368 | static bool cont_add(int facility, int level, const char *text, size_t len) | ||
1369 | { | ||
1370 | if (cont.len && cont.flushed) | ||
1371 | return false; | ||
1372 | |||
1373 | if (cont.len + len > sizeof(cont.buf)) { | ||
1374 | cont_flush(); | ||
1375 | return false; | ||
1376 | } | ||
1377 | |||
1378 | if (!cont.len) { | ||
1379 | cont.facility = facility; | ||
1380 | cont.level = level; | ||
1381 | cont.owner = current; | ||
1382 | cont.ts_nsec = local_clock(); | ||
1383 | cont.cons = 0; | ||
1384 | cont.flushed = false; | ||
1385 | } | ||
1386 | |||
1387 | memcpy(cont.buf + cont.len, text, len); | ||
1388 | cont.len += len; | ||
1389 | return true; | ||
1390 | } | ||
1391 | |||
1392 | static size_t cont_print_text(char *text, size_t size) | ||
1393 | { | ||
1394 | size_t textlen = 0; | ||
1395 | size_t len; | ||
1396 | |||
1397 | if (cont.cons == 0) { | ||
1398 | textlen += print_time(cont.ts_nsec, text); | ||
1399 | size -= textlen; | ||
1400 | } | ||
1401 | |||
1402 | len = cont.len - cont.cons; | ||
1403 | if (len > 0) { | ||
1404 | if (len+1 > size) | ||
1405 | len = size-1; | ||
1406 | memcpy(text + textlen, cont.buf + cont.cons, len); | ||
1407 | textlen += len; | ||
1408 | cont.cons = cont.len; | ||
1409 | } | ||
1410 | |||
1411 | if (cont.flushed) { | ||
1412 | text[textlen++] = '\n'; | ||
1413 | /* got everything, release buffer */ | ||
1414 | cont.len = 0; | ||
1415 | } | ||
1416 | return textlen; | ||
1417 | } | ||
1418 | |||
1419 | asmlinkage int vprintk_emit(int facility, int level, | ||
1420 | const char *dict, size_t dictlen, | ||
1421 | const char *fmt, va_list args) | ||
1422 | { | ||
1423 | static int recursion_bug; | ||
1424 | static char textbuf[LOG_LINE_MAX]; | ||
1425 | char *text = textbuf; | ||
1426 | size_t text_len; | ||
1427 | enum log_flags lflags = 0; | ||
843 | unsigned long flags; | 1428 | unsigned long flags; |
844 | int this_cpu; | 1429 | int this_cpu; |
845 | char *p; | 1430 | int printed_len = 0; |
846 | size_t plen; | ||
847 | char special; | ||
848 | 1431 | ||
849 | boot_delay_msec(); | 1432 | boot_delay_msec(); |
850 | printk_delay(); | 1433 | printk_delay(); |
@@ -856,7 +1439,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | /* | 1439 | /* |
857 | * Ouch, printk recursed into itself! | 1440 | * Ouch, printk recursed into itself! |
858 | */ | 1441 | */ |
859 | if (unlikely(printk_cpu == this_cpu)) { | 1442 | if (unlikely(logbuf_cpu == this_cpu)) { |
860 | /* | 1443 | /* |
861 | * If a crash is occurring during printk() on this CPU, | 1444 | * If a crash is occurring during printk() on this CPU, |
862 | * then try to get the crash message out but make sure | 1445 | * then try to get the crash message out but make sure |
@@ -873,97 +1456,91 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
873 | 1456 | ||
874 | lockdep_off(); | 1457 | lockdep_off(); |
875 | raw_spin_lock(&logbuf_lock); | 1458 | raw_spin_lock(&logbuf_lock); |
876 | printk_cpu = this_cpu; | 1459 | logbuf_cpu = this_cpu; |
877 | 1460 | ||
878 | if (recursion_bug) { | 1461 | if (recursion_bug) { |
1462 | static const char recursion_msg[] = | ||
1463 | "BUG: recent printk recursion!"; | ||
1464 | |||
879 | recursion_bug = 0; | 1465 | recursion_bug = 0; |
880 | strcpy(printk_buf, recursion_bug_msg); | 1466 | printed_len += strlen(recursion_msg); |
881 | printed_len = strlen(recursion_bug_msg); | 1467 | /* emit KERN_CRIT message */ |
1468 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, | ||
1469 | NULL, 0, recursion_msg, printed_len); | ||
882 | } | 1470 | } |
883 | /* Emit the output into the temporary buffer */ | ||
884 | printed_len += vscnprintf(printk_buf + printed_len, | ||
885 | sizeof(printk_buf) - printed_len, fmt, args); | ||
886 | 1471 | ||
887 | p = printk_buf; | 1472 | /* |
1473 | * The printf needs to come first; we need the syslog | ||
1474 | * prefix which might be passed-in as a parameter. | ||
1475 | */ | ||
1476 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | ||
888 | 1477 | ||
889 | /* Read log level and handle special printk prefix */ | 1478 | /* mark and strip a trailing newline */ |
890 | plen = log_prefix(p, ¤t_log_level, &special); | 1479 | if (text_len && text[text_len-1] == '\n') { |
891 | if (plen) { | 1480 | text_len--; |
892 | p += plen; | 1481 | lflags |= LOG_NEWLINE; |
1482 | } | ||
893 | 1483 | ||
894 | switch (special) { | 1484 | /* strip syslog prefix and extract log level or control flags */ |
895 | case 'c': /* Strip <c> KERN_CONT, continue line */ | 1485 | if (text[0] == '<' && text[1] && text[2] == '>') { |
896 | plen = 0; | 1486 | switch (text[1]) { |
897 | break; | 1487 | case '0' ... '7': |
898 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ | 1488 | if (level == -1) |
899 | plen = 0; | 1489 | level = text[1] - '0'; |
900 | default: | 1490 | case 'd': /* KERN_DEFAULT */ |
901 | if (!new_text_line) { | 1491 | lflags |= LOG_PREFIX; |
902 | emit_log_char('\n'); | 1492 | case 'c': /* KERN_CONT */ |
903 | new_text_line = 1; | 1493 | text += 3; |
904 | } | 1494 | text_len -= 3; |
905 | } | 1495 | } |
906 | } | 1496 | } |
907 | 1497 | ||
908 | /* | 1498 | if (level == -1) |
909 | * Copy the output into log_buf. If the caller didn't provide | 1499 | level = default_message_loglevel; |
910 | * the appropriate log prefix, we insert them here | ||
911 | */ | ||
912 | for (; *p; p++) { | ||
913 | if (new_text_line) { | ||
914 | new_text_line = 0; | ||
915 | |||
916 | if (plen) { | ||
917 | /* Copy original log prefix */ | ||
918 | int i; | ||
919 | |||
920 | for (i = 0; i < plen; i++) | ||
921 | emit_log_char(printk_buf[i]); | ||
922 | printed_len += plen; | ||
923 | } else { | ||
924 | /* Add log prefix */ | ||
925 | emit_log_char('<'); | ||
926 | emit_log_char(current_log_level + '0'); | ||
927 | emit_log_char('>'); | ||
928 | printed_len += 3; | ||
929 | } | ||
930 | 1500 | ||
931 | if (printk_time) { | 1501 | if (dict) |
932 | /* Add the current time stamp */ | 1502 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
933 | char tbuf[50], *tp; | ||
934 | unsigned tlen; | ||
935 | unsigned long long t; | ||
936 | unsigned long nanosec_rem; | ||
937 | |||
938 | t = cpu_clock(printk_cpu); | ||
939 | nanosec_rem = do_div(t, 1000000000); | ||
940 | tlen = sprintf(tbuf, "[%5lu.%06lu] ", | ||
941 | (unsigned long) t, | ||
942 | nanosec_rem / 1000); | ||
943 | |||
944 | for (tp = tbuf; tp < tbuf + tlen; tp++) | ||
945 | emit_log_char(*tp); | ||
946 | printed_len += tlen; | ||
947 | } | ||
948 | 1503 | ||
949 | if (!*p) | 1504 | if (!(lflags & LOG_NEWLINE)) { |
950 | break; | 1505 | /* |
1506 | * Flush the conflicting buffer. An earlier newline was missing, | ||
1507 | * or another task also prints continuation lines. | ||
1508 | */ | ||
1509 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) | ||
1510 | cont_flush(); | ||
1511 | |||
1512 | /* buffer line if possible, otherwise store it right away */ | ||
1513 | if (!cont_add(facility, level, text, text_len)) | ||
1514 | log_store(facility, level, lflags | LOG_CONT, 0, | ||
1515 | dict, dictlen, text, text_len); | ||
1516 | } else { | ||
1517 | bool stored = false; | ||
1518 | |||
1519 | /* | ||
1520 | * If an earlier newline was missing and it was the same task, | ||
1521 | * either merge it with the current buffer and flush, or if | ||
1522 | * there was a race with interrupts (prefix == true) then just | ||
1523 | * flush it out and store this line separately. | ||
1524 | */ | ||
1525 | if (cont.len && cont.owner == current) { | ||
1526 | if (!(lflags & LOG_PREFIX)) | ||
1527 | stored = cont_add(facility, level, text, text_len); | ||
1528 | cont_flush(); | ||
951 | } | 1529 | } |
952 | 1530 | ||
953 | emit_log_char(*p); | 1531 | if (!stored) |
954 | if (*p == '\n') | 1532 | log_store(facility, level, lflags, 0, |
955 | new_text_line = 1; | 1533 | dict, dictlen, text, text_len); |
956 | } | 1534 | } |
1535 | printed_len += text_len; | ||
957 | 1536 | ||
958 | /* | 1537 | /* |
959 | * Try to acquire and then immediately release the | 1538 | * Try to acquire and then immediately release the console semaphore. |
960 | * console semaphore. The release will do all the | 1539 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
961 | * actual magic (print out buffers, wake up klogd, | 1540 | * users. |
962 | * etc). | ||
963 | * | 1541 | * |
964 | * The console_trylock_for_printk() function | 1542 | * The console_trylock_for_printk() function will release 'logbuf_lock' |
965 | * will release 'logbuf_lock' regardless of whether it | 1543 | * regardless of whether it actually gets the console semaphore or not. |
966 | * actually gets the semaphore or not. | ||
967 | */ | 1544 | */ |
968 | if (console_trylock_for_printk(this_cpu)) | 1545 | if (console_trylock_for_printk(this_cpu)) |
969 | console_unlock(); | 1546 | console_unlock(); |
@@ -974,16 +1551,88 @@ out_restore_irqs: | |||
974 | 1551 | ||
975 | return printed_len; | 1552 | return printed_len; |
976 | } | 1553 | } |
977 | EXPORT_SYMBOL(printk); | 1554 | EXPORT_SYMBOL(vprintk_emit); |
978 | EXPORT_SYMBOL(vprintk); | ||
979 | 1555 | ||
980 | #else | 1556 | asmlinkage int vprintk(const char *fmt, va_list args) |
1557 | { | ||
1558 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1559 | } | ||
1560 | EXPORT_SYMBOL(vprintk); | ||
981 | 1561 | ||
982 | static void call_console_drivers(unsigned start, unsigned end) | 1562 | asmlinkage int printk_emit(int facility, int level, |
1563 | const char *dict, size_t dictlen, | ||
1564 | const char *fmt, ...) | ||
983 | { | 1565 | { |
1566 | va_list args; | ||
1567 | int r; | ||
1568 | |||
1569 | va_start(args, fmt); | ||
1570 | r = vprintk_emit(facility, level, dict, dictlen, fmt, args); | ||
1571 | va_end(args); | ||
1572 | |||
1573 | return r; | ||
984 | } | 1574 | } |
1575 | EXPORT_SYMBOL(printk_emit); | ||
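
A hedged sketch of a printk_emit() caller attaching a dictionary follows. The helper and its key names are hypothetical; the dict layout (properties separated by '\0', the last one not terminated) follows the record format comment earlier in this file, and facility 0 keeps the message classified as kernel-generated.

/* Hedged sketch; example_report_device() and its keys are hypothetical. */
static void example_report_device(void)
{
	static const char dict[] = "SUBSYSTEM=block\0DEVICE=b8:2";

	/* level 3 == KERN_ERR; sizeof() - 1 leaves the last property unterminated */
	printk_emit(0, 3, dict, sizeof(dict) - 1,
		    "I/O error on device\n");
}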
1576 | |||
1577 | /** | ||
1578 | * printk - print a kernel message | ||
1579 | * @fmt: format string | ||
1580 | * | ||
1581 | * This is printk(). It can be called from any context. We want it to work. | ||
1582 | * | ||
1583 | * We try to grab the console_lock. If we succeed, it's easy - we log the | ||
1584 | * output and call the console drivers. If we fail to get the semaphore, we | ||
1585 | * place the output into the log buffer and return. The current holder of | ||
1586 | * the console_sem will notice the new output in console_unlock(); and will | ||
1587 | * send it to the consoles before releasing the lock. | ||
1588 | * | ||
1589 | * One effect of this deferred printing is that code which calls printk() and | ||
1590 | * then changes console_loglevel may break. This is because console_loglevel | ||
1591 | * is inspected when the actual printing occurs. | ||
1592 | * | ||
1593 | * See also: | ||
1594 | * printf(3) | ||
1595 | * | ||
1596 | * See the vsnprintf() documentation for format string extensions over C99. | ||
1597 | */ | ||
1598 | asmlinkage int printk(const char *fmt, ...) | ||
1599 | { | ||
1600 | va_list args; | ||
1601 | int r; | ||
985 | 1602 | ||
1603 | #ifdef CONFIG_KGDB_KDB | ||
1604 | if (unlikely(kdb_trap_printk)) { | ||
1605 | va_start(args, fmt); | ||
1606 | r = vkdb_printf(fmt, args); | ||
1607 | va_end(args); | ||
1608 | return r; | ||
1609 | } | ||
986 | #endif | 1610 | #endif |
1611 | va_start(args, fmt); | ||
1612 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1613 | va_end(args); | ||
1614 | |||
1615 | return r; | ||
1616 | } | ||
1617 | EXPORT_SYMBOL(printk); | ||
1618 | |||
1619 | #else | ||
1620 | |||
1621 | #define LOG_LINE_MAX 0 | ||
1622 | static struct cont { | ||
1623 | size_t len; | ||
1624 | size_t cons; | ||
1625 | u8 level; | ||
1626 | bool flushed:1; | ||
1627 | } cont; | ||
1628 | static struct log *log_from_idx(u32 idx) { return NULL; } | ||
1629 | static u32 log_next(u32 idx) { return 0; } | ||
1630 | static void call_console_drivers(int level, const char *text, size_t len) {} | ||
1631 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, | ||
1632 | bool syslog, char *buf, size_t size) { return 0; } | ||
1633 | static size_t cont_print_text(char *text, size_t size) { return 0; } | ||
1634 | |||
1635 | #endif /* CONFIG_PRINTK */ | ||
987 | 1636 | ||
988 | static int __add_preferred_console(char *name, int idx, char *options, | 1637 | static int __add_preferred_console(char *name, int idx, char *options, |
989 | char *brl_options) | 1638 | char *brl_options) |
@@ -1217,7 +1866,7 @@ int is_console_locked(void) | |||
1217 | } | 1866 | } |
1218 | 1867 | ||
1219 | /* | 1868 | /* |
1220 | * Delayed printk facility, for scheduler-internal messages: | 1869 | * Delayed printk version, for scheduler-internal messages: |
1221 | */ | 1870 | */ |
1222 | #define PRINTK_BUF_SIZE 512 | 1871 | #define PRINTK_BUF_SIZE 512 |
1223 | 1872 | ||
@@ -1253,6 +1902,11 @@ void wake_up_klogd(void) | |||
1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1902 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1254 | } | 1903 | } |
1255 | 1904 | ||
1905 | /* the next printk record to write to the console */ | ||
1906 | static u64 console_seq; | ||
1907 | static u32 console_idx; | ||
1908 | static enum log_flags console_prev; | ||
1909 | |||
1256 | /** | 1910 | /** |
1257 | * console_unlock - unlock the console system | 1911 | * console_unlock - unlock the console system |
1258 | * | 1912 | * |
@@ -1263,15 +1917,17 @@ void wake_up_klogd(void) | |||
1263 | * by printk(). If this is the case, console_unlock(); emits | 1917 | * by printk(). If this is the case, console_unlock(); emits |
1264 | * the output prior to releasing the lock. | 1918 | * the output prior to releasing the lock. |
1265 | * | 1919 | * |
1266 | * If there is output waiting for klogd, we wake it up. | 1920 | * If there is output waiting, we wake /dev/kmsg and syslog() users. |
1267 | * | 1921 | * |
1268 | * console_unlock(); may be called from any context. | 1922 | * console_unlock(); may be called from any context. |
1269 | */ | 1923 | */ |
1270 | void console_unlock(void) | 1924 | void console_unlock(void) |
1271 | { | 1925 | { |
1926 | static char text[LOG_LINE_MAX]; | ||
1927 | static u64 seen_seq; | ||
1272 | unsigned long flags; | 1928 | unsigned long flags; |
1273 | unsigned _con_start, _log_end; | 1929 | bool wake_klogd = false; |
1274 | unsigned wake_klogd = 0, retry = 0; | 1930 | bool retry; |
1275 | 1931 | ||
1276 | if (console_suspended) { | 1932 | if (console_suspended) { |
1277 | up(&console_sem); | 1933 | up(&console_sem); |
@@ -1280,18 +1936,69 @@ void console_unlock(void) | |||
1280 | 1936 | ||
1281 | console_may_schedule = 0; | 1937 | console_may_schedule = 0; |
1282 | 1938 | ||
1939 | /* flush buffered message fragment immediately to console */ | ||
1940 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
1941 | if (cont.len && (cont.cons < cont.len || cont.flushed)) { | ||
1942 | size_t len; | ||
1943 | |||
1944 | len = cont_print_text(text, sizeof(text)); | ||
1945 | raw_spin_unlock(&logbuf_lock); | ||
1946 | stop_critical_timings(); | ||
1947 | call_console_drivers(cont.level, text, len); | ||
1948 | start_critical_timings(); | ||
1949 | local_irq_restore(flags); | ||
1950 | } else | ||
1951 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1952 | |||
1283 | again: | 1953 | again: |
1284 | for ( ; ; ) { | 1954 | for (;;) { |
1955 | struct log *msg; | ||
1956 | size_t len; | ||
1957 | int level; | ||
1958 | |||
1285 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 1959 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1286 | wake_klogd |= log_start - log_end; | 1960 | if (seen_seq != log_next_seq) { |
1287 | if (con_start == log_end) | 1961 | wake_klogd = true; |
1288 | break; /* Nothing to print */ | 1962 | seen_seq = log_next_seq; |
1289 | _con_start = con_start; | 1963 | } |
1290 | _log_end = log_end; | 1964 | |
1291 | con_start = log_end; /* Flush */ | 1965 | if (console_seq < log_first_seq) { |
1966 | /* messages are gone, move to first one */ | ||
1967 | console_seq = log_first_seq; | ||
1968 | console_idx = log_first_idx; | ||
1969 | console_prev = 0; | ||
1970 | } | ||
1971 | skip: | ||
1972 | if (console_seq == log_next_seq) | ||
1973 | break; | ||
1974 | |||
1975 | msg = log_from_idx(console_idx); | ||
1976 | if (msg->flags & LOG_NOCONS) { | ||
1977 | /* | ||
1978 | * Skip record we have buffered and already printed | ||
1979 | * directly to the console when we received it. | ||
1980 | */ | ||
1981 | console_idx = log_next(console_idx); | ||
1982 | console_seq++; | ||
1983 | /* | ||
1984 | * We will get here again when we register a new | ||
1985 | * CON_PRINTBUFFER console. Clear the flag so we | ||
1986 | * will properly dump everything later. | ||
1987 | */ | ||
1988 | msg->flags &= ~LOG_NOCONS; | ||
1989 | goto skip; | ||
1990 | } | ||
1991 | |||
1992 | level = msg->level; | ||
1993 | len = msg_print_text(msg, console_prev, false, | ||
1994 | text, sizeof(text)); | ||
1995 | console_idx = log_next(console_idx); | ||
1996 | console_seq++; | ||
1997 | console_prev = msg->flags; | ||
1292 | raw_spin_unlock(&logbuf_lock); | 1998 | raw_spin_unlock(&logbuf_lock); |
1999 | |||
1293 | stop_critical_timings(); /* don't trace print latency */ | 2000 | stop_critical_timings(); /* don't trace print latency */ |
1294 | call_console_drivers(_con_start, _log_end); | 2001 | call_console_drivers(level, text, len); |
1295 | start_critical_timings(); | 2002 | start_critical_timings(); |
1296 | local_irq_restore(flags); | 2003 | local_irq_restore(flags); |
1297 | } | 2004 | } |
@@ -1312,8 +2019,7 @@ again: | |||
1312 | * flush, no worries. | 2019 | * flush, no worries. |
1313 | */ | 2020 | */ |
1314 | raw_spin_lock(&logbuf_lock); | 2021 | raw_spin_lock(&logbuf_lock); |
1315 | if (con_start != log_end) | 2022 | retry = console_seq != log_next_seq; |
1316 | retry = 1; | ||
1317 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2023 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1318 | 2024 | ||
1319 | if (retry && console_trylock()) | 2025 | if (retry && console_trylock()) |
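
As a rough illustration of the locking pattern the rewritten console_unlock() serves (again, not part of the patch; the function name is hypothetical): code that needs the console subsystem quiesced brackets its work with the console lock, and the unlock path replays whatever records accumulated in the meantime.

#include <linux/console.h>

/* Illustrative only: hold off console output while reconfiguring. */
static void example_reconfigure_console(void)
{
	console_lock();
	/* ... modify console state while printing is held off ... */
	console_unlock();	/* flushes pending records to the consoles */
}
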
@@ -1549,7 +2255,9 @@ void register_console(struct console *newcon) | |||
1549 | * for us. | 2255 | * for us. |
1550 | */ | 2256 | */ |
1551 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2257 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1552 | con_start = log_start; | 2258 | console_seq = syslog_seq; |
2259 | console_idx = syslog_idx; | ||
2260 | console_prev = syslog_prev; | ||
1553 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2261 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1554 | /* | 2262 | /* |
1555 | * We're about to replay the log buffer. Only do this to the | 2263 | * We're about to replay the log buffer. Only do this to the |
@@ -1758,50 +2466,263 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1758 | } | 2466 | } |
1759 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 2467 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1760 | 2468 | ||
2469 | static bool always_kmsg_dump; | ||
2470 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
2471 | |||
1761 | /** | 2472 | /** |
1762 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2473 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1763 | * @reason: the reason (oops, panic etc) for dumping | 2474 | * @reason: the reason (oops, panic etc) for dumping |
1764 | * | 2475 | * |
1765 | * Iterate through each of the dump devices and call the oops/panic | 2476 | * Call each of the registered dumper's dump() callback, which can |
1766 | * callbacks with the log buffer. | 2477 | * retrieve the kmsg records with kmsg_dump_get_line() or |
2478 | * kmsg_dump_get_buffer(). | ||
1767 | */ | 2479 | */ |
1768 | void kmsg_dump(enum kmsg_dump_reason reason) | 2480 | void kmsg_dump(enum kmsg_dump_reason reason) |
1769 | { | 2481 | { |
1770 | unsigned long end; | ||
1771 | unsigned chars; | ||
1772 | struct kmsg_dumper *dumper; | 2482 | struct kmsg_dumper *dumper; |
1773 | const char *s1, *s2; | ||
1774 | unsigned long l1, l2; | ||
1775 | unsigned long flags; | 2483 | unsigned long flags; |
1776 | 2484 | ||
1777 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) | 2485 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) |
1778 | return; | 2486 | return; |
1779 | 2487 | ||
1780 | /* Theoretically, the log could move on after we do this, but | 2488 | rcu_read_lock(); |
1781 | there's not a lot we can do about that. The new messages | 2489 | list_for_each_entry_rcu(dumper, &dump_list, list) { |
1782 | will overwrite the start of what we dump. */ | 2490 | if (dumper->max_reason && reason > dumper->max_reason) |
2491 | continue; | ||
2492 | |||
2493 | /* initialize iterator with data about the stored records */ | ||
2494 | dumper->active = true; | ||
2495 | |||
2496 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2497 | dumper->cur_seq = clear_seq; | ||
2498 | dumper->cur_idx = clear_idx; | ||
2499 | dumper->next_seq = log_next_seq; | ||
2500 | dumper->next_idx = log_next_idx; | ||
2501 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2502 | |||
2503 | /* invoke dumper which will iterate over records */ | ||
2504 | dumper->dump(dumper, reason); | ||
2505 | |||
2506 | /* reset iterator */ | ||
2507 | dumper->active = false; | ||
2508 | } | ||
2509 | rcu_read_unlock(); | ||
2510 | } | ||
2511 | |||
2512 | /** | ||
2513 | * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) | ||
2514 | * @dumper: registered kmsg dumper | ||
2515 | * @syslog: include the "<4>" prefixes | ||
2516 | * @line: buffer to copy the line to | ||
2517 | * @size: maximum size of the buffer | ||
2518 | * @len: length of line placed into buffer | ||
2519 | * | ||
2520 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
2521 | * record, and copy one record into the provided buffer. | ||
2522 | * | ||
2523 | * Consecutive calls will return the next available record moving | ||
2524 | * towards the end of the buffer with the youngest messages. | ||
2525 | * | ||
2526 | * A return value of FALSE indicates that there are no more records to | ||
2527 | * read. | ||
2528 | * | ||
2529 | * The function is similar to kmsg_dump_get_line(), but grabs no locks. | ||
2530 | */ | ||
2531 | bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, | ||
2532 | char *line, size_t size, size_t *len) | ||
2533 | { | ||
2534 | struct log *msg; | ||
2535 | size_t l = 0; | ||
2536 | bool ret = false; | ||
2537 | |||
2538 | if (!dumper->active) | ||
2539 | goto out; | ||
2540 | |||
2541 | if (dumper->cur_seq < log_first_seq) { | ||
2542 | /* messages are gone, move to first available one */ | ||
2543 | dumper->cur_seq = log_first_seq; | ||
2544 | dumper->cur_idx = log_first_idx; | ||
2545 | } | ||
2546 | |||
2547 | /* last entry */ | ||
2548 | if (dumper->cur_seq >= log_next_seq) | ||
2549 | goto out; | ||
2550 | |||
2551 | msg = log_from_idx(dumper->cur_idx); | ||
2552 | l = msg_print_text(msg, 0, syslog, line, size); | ||
2553 | |||
2554 | dumper->cur_idx = log_next(dumper->cur_idx); | ||
2555 | dumper->cur_seq++; | ||
2556 | ret = true; | ||
2557 | out: | ||
2558 | if (len) | ||
2559 | *len = l; | ||
2560 | return ret; | ||
2561 | } | ||
2562 | |||
2563 | /** | ||
2564 | * kmsg_dump_get_line - retrieve one kmsg log line | ||
2565 | * @dumper: registered kmsg dumper | ||
2566 | * @syslog: include the "<4>" prefixes | ||
2567 | * @line: buffer to copy the line to | ||
2568 | * @size: maximum size of the buffer | ||
2569 | * @len: length of line placed into buffer | ||
2570 | * | ||
2571 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
2572 | * record, and copy one record into the provided buffer. | ||
2573 | * | ||
2574 | * Consecutive calls will return the next available record moving | ||
2575 | * towards the end of the buffer with the youngest messages. | ||
2576 | * | ||
2577 | * A return value of FALSE indicates that there are no more records to | ||
2578 | * read. | ||
2579 | */ | ||
2580 | bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | ||
2581 | char *line, size_t size, size_t *len) | ||
2582 | { | ||
2583 | unsigned long flags; | ||
2584 | bool ret; | ||
2585 | |||
1783 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2586 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1784 | end = log_end & LOG_BUF_MASK; | 2587 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); |
1785 | chars = logged_chars; | ||
1786 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2588 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1787 | 2589 | ||
1788 | if (chars > end) { | 2590 | return ret; |
1789 | s1 = log_buf + log_buf_len - chars + end; | 2591 | } |
1790 | l1 = chars - end; | 2592 | EXPORT_SYMBOL_GPL(kmsg_dump_get_line); |
1791 | 2593 | ||
1792 | s2 = log_buf; | 2594 | /** |
1793 | l2 = end; | 2595 | * kmsg_dump_get_buffer - copy kmsg log lines |
1794 | } else { | 2596 | * @dumper: registered kmsg dumper |
1795 | s1 = ""; | 2597 | * @syslog: include the "<4>" prefixes |
1796 | l1 = 0; | 2598 | * @buf: buffer to copy the line to |
2599 | * @size: maximum size of the buffer | ||
2600 | * @len: length of line placed into buffer | ||
2601 | * | ||
2602 | * Start at the end of the kmsg buffer and fill the provided buffer | ||
2603 | * with as many of the *youngest* kmsg records as fit into it. | ||
2604 | * If the buffer is large enough, all available kmsg records will be | ||
2605 | * copied with a single call. | ||
2606 | * | ||
2607 | * Consecutive calls will fill the buffer with the next block of | ||
2608 | * available older records, not including the earlier retrieved ones. | ||
2609 | * | ||
2610 | * A return value of FALSE indicates that there are no more records to | ||
2611 | * read. | ||
2612 | */ | ||
2613 | bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | ||
2614 | char *buf, size_t size, size_t *len) | ||
2615 | { | ||
2616 | unsigned long flags; | ||
2617 | u64 seq; | ||
2618 | u32 idx; | ||
2619 | u64 next_seq; | ||
2620 | u32 next_idx; | ||
2621 | enum log_flags prev; | ||
2622 | size_t l = 0; | ||
2623 | bool ret = false; | ||
2624 | |||
2625 | if (!dumper->active) | ||
2626 | goto out; | ||
1797 | 2627 | ||
1798 | s2 = log_buf + end - chars; | 2628 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1799 | l2 = chars; | 2629 | if (dumper->cur_seq < log_first_seq) { |
2630 | /* messages are gone, move to first available one */ | ||
2631 | dumper->cur_seq = log_first_seq; | ||
2632 | dumper->cur_idx = log_first_idx; | ||
1800 | } | 2633 | } |
1801 | 2634 | ||
1802 | rcu_read_lock(); | 2635 | /* last entry */ |
1803 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2636 | if (dumper->cur_seq >= dumper->next_seq) { |
1804 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 2637 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1805 | rcu_read_unlock(); | 2638 | goto out; |
2639 | } | ||
2640 | |||
2641 | /* calculate length of entire buffer */ | ||
2642 | seq = dumper->cur_seq; | ||
2643 | idx = dumper->cur_idx; | ||
2644 | prev = 0; | ||
2645 | while (seq < dumper->next_seq) { | ||
2646 | struct log *msg = log_from_idx(idx); | ||
2647 | |||
2648 | l += msg_print_text(msg, prev, true, NULL, 0); | ||
2649 | idx = log_next(idx); | ||
2650 | seq++; | ||
2651 | prev = msg->flags; | ||
2652 | } | ||
2653 | |||
2654 | /* move first record forward until length fits into the buffer */ | ||
2655 | seq = dumper->cur_seq; | ||
2656 | idx = dumper->cur_idx; | ||
2657 | prev = 0; | ||
2658 | while (l > size && seq < dumper->next_seq) { | ||
2659 | struct log *msg = log_from_idx(idx); | ||
2660 | |||
2661 | l -= msg_print_text(msg, prev, true, NULL, 0); | ||
2662 | idx = log_next(idx); | ||
2663 | seq++; | ||
2664 | prev = msg->flags; | ||
2665 | } | ||
2666 | |||
2667 | /* last message in next iteration */ | ||
2668 | next_seq = seq; | ||
2669 | next_idx = idx; | ||
2670 | |||
2671 | l = 0; | ||
2672 | prev = 0; | ||
2673 | while (seq < dumper->next_seq) { | ||
2674 | struct log *msg = log_from_idx(idx); | ||
2675 | |||
2676 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); | ||
2677 | idx = log_next(idx); | ||
2678 | seq++; | ||
2679 | prev = msg->flags; | ||
2680 | } | ||
2681 | |||
2682 | dumper->next_seq = next_seq; | ||
2683 | dumper->next_idx = next_idx; | ||
2684 | ret = true; | ||
2685 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2686 | out: | ||
2687 | if (len) | ||
2688 | *len = l; | ||
2689 | return ret; | ||
2690 | } | ||
2691 | EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); | ||
2692 | |||
2693 | /** | ||
2694 | * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) | ||
2695 | * @dumper: registered kmsg dumper | ||
2696 | * | ||
2697 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
2698 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
2699 | * times within the same dumper.dump() callback. | ||
2700 | * | ||
2701 | * The function is similar to kmsg_dump_rewind(), but grabs no locks. | ||
2702 | */ | ||
2703 | void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) | ||
2704 | { | ||
2705 | dumper->cur_seq = clear_seq; | ||
2706 | dumper->cur_idx = clear_idx; | ||
2707 | dumper->next_seq = log_next_seq; | ||
2708 | dumper->next_idx = log_next_idx; | ||
2709 | } | ||
2710 | |||
2711 | /** | ||
2712 | * kmsg_dump_rewind - reset the iterator | ||
2713 | * @dumper: registered kmsg dumper | ||
2714 | * | ||
2715 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
2716 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
2717 | * times within the same dumper.dump() callback. | ||
2718 | */ | ||
2719 | void kmsg_dump_rewind(struct kmsg_dumper *dumper) | ||
2720 | { | ||
2721 | unsigned long flags; | ||
2722 | |||
2723 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2724 | kmsg_dump_rewind_nolock(dumper); | ||
2725 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1806 | } | 2726 | } |
2727 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); | ||
1807 | #endif | 2728 | #endif |
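
A sketch of a consumer of the new dumper iterator API, to show how the pieces above fit together. This is not part of the patch: the "example" functions are invented, and the .max_reason field is assumed from the check in kmsg_dump() above.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmsg_dump.h>

/* Assumed destination for dumped lines (e.g. persistent storage). */
static void example_write_line(const char *line, size_t len)
{
}

/* On an oops or panic, walk the stored records one line at a time
 * using the iterator API added above. */
static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason)
{
	static char line[1024];
	size_t len;

	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
		example_write_line(line, len);
}

static struct kmsg_dumper example_dumper = {
	.dump		= example_dump,
	.max_reason	= KMSG_DUMP_OOPS,	/* field checked by kmsg_dump() */
};

static int __init example_dumper_init(void)
{
	return kmsg_dump_register(&example_dumper);
}
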
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee8d49b9c309..a232bb59d93f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
198 | return 0; | 198 | return 0; |
199 | rcu_read_lock(); | 199 | rcu_read_lock(); |
200 | tcred = __task_cred(task); | 200 | tcred = __task_cred(task); |
201 | if (cred->user->user_ns == tcred->user->user_ns && | 201 | if (uid_eq(cred->uid, tcred->euid) && |
202 | (cred->uid == tcred->euid && | 202 | uid_eq(cred->uid, tcred->suid) && |
203 | cred->uid == tcred->suid && | 203 | uid_eq(cred->uid, tcred->uid) && |
204 | cred->uid == tcred->uid && | 204 | gid_eq(cred->gid, tcred->egid) && |
205 | cred->gid == tcred->egid && | 205 | gid_eq(cred->gid, tcred->sgid) && |
206 | cred->gid == tcred->sgid && | 206 | gid_eq(cred->gid, tcred->gid)) |
207 | cred->gid == tcred->gid)) | ||
208 | goto ok; | 207 | goto ok; |
209 | if (ptrace_has_cap(tcred->user->user_ns, mode)) | 208 | if (ptrace_has_cap(tcred->user_ns, mode)) |
210 | goto ok; | 209 | goto ok; |
211 | rcu_read_unlock(); | 210 | rcu_read_unlock(); |
212 | return -EPERM; | 211 | return -EPERM; |
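
The ptrace hunk above switches raw uid/gid comparisons to the namespace-aware kuid_t/kgid_t helpers. A minimal sketch of that idiom, with an invented helper name, assuming only the uid_eq()/gid_eq() accessors used in the hunk:

#include <linux/cred.h>
#include <linux/types.h>
#include <linux/uidgid.h>

/* Sketch only: kuid_t/kgid_t are opaque, so credentials are compared
 * with uid_eq()/gid_eq() rather than with '=='. */
static bool example_same_owner(const struct cred *a, const struct cred *b)
{
	return uid_eq(a->uid, b->uid) && gid_eq(a->gid, b->gid);
}
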
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc27..95cba41ce1e9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -51,6 +51,34 @@ | |||
51 | 51 | ||
52 | #include "rcu.h" | 52 | #include "rcu.h" |
53 | 53 | ||
54 | #ifdef CONFIG_PREEMPT_RCU | ||
55 | |||
56 | /* | ||
57 | * Check for a task exiting while in a preemptible-RCU read-side | ||
58 | * critical section, clean up if so. No need to issue warnings, | ||
59 | * as debug_check_no_locks_held() already does this if lockdep | ||
60 | * is enabled. | ||
61 | */ | ||
62 | void exit_rcu(void) | ||
63 | { | ||
64 | struct task_struct *t = current; | ||
65 | |||
66 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
67 | return; | ||
68 | t->rcu_read_lock_nesting = 1; | ||
69 | barrier(); | ||
70 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
71 | __rcu_read_unlock(); | ||
72 | } | ||
73 | |||
74 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
75 | |||
76 | void exit_rcu(void) | ||
77 | { | ||
78 | } | ||
79 | |||
80 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
81 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 82 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | static struct lock_class_key rcu_lock_key; | 83 | static struct lock_class_key rcu_lock_key; |
56 | struct lockdep_map rcu_lock_map = | 84 | struct lockdep_map rcu_lock_map = |
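
For reference, the ordinary read-side pairing that exit_rcu() above backstops; this sketch is not from the patch, and the "example" structure and pointer are placeholders.

#include <linux/rcupdate.h>

struct example {
	int value;
};

static struct example __rcu *example_ptr;	/* assumed shared pointer */

/* Normal usage: rcu_read_lock()/rcu_read_unlock() must balance on all
 * paths; exit_rcu() only cleans up after a task that exits while still
 * inside such a critical section. */
static int example_read(void)
{
	struct example *p;
	int val = -1;

	rcu_read_lock();
	p = rcu_dereference(example_ptr);
	if (p)
		val = p->value;
	rcu_read_unlock();
	return val;
}
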
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb62..fc31a2d65100 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) | |||
851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | ||
855 | * Check for a task exiting while in a preemptible -RCU read-side | ||
856 | * critical section, clean up if so. No need to issue warnings, | ||
857 | * as debug_check_no_locks_held() already does this if lockdep | ||
858 | * is enabled. | ||
859 | */ | ||
860 | void exit_rcu(void) | ||
861 | { | ||
862 | struct task_struct *t = current; | ||
863 | |||
864 | if (t->rcu_read_lock_nesting == 0) | ||
865 | return; | ||
866 | t->rcu_read_lock_nesting = 1; | ||
867 | __rcu_read_unlock(); | ||
868 | } | ||
869 | |||
870 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 854 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
871 | 855 | ||
872 | #ifdef CONFIG_RCU_TRACE | 856 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6e..e66b34ab7555 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | 68 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ |
68 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | 69 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ |
69 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | 70 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ |
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); | |||
96 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 97 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
97 | module_param(fqs_stutter, int, 0444); | 98 | module_param(fqs_stutter, int, 0444); |
98 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 99 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
100 | module_param(n_barrier_cbs, int, 0444); | ||
101 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | ||
99 | module_param(onoff_interval, int, 0444); | 102 | module_param(onoff_interval, int, 0444); |
100 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 103 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); |
101 | module_param(onoff_holdoff, int, 0444); | 104 | module_param(onoff_holdoff, int, 0444); |
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; | |||
139 | static struct task_struct *onoff_task; | 142 | static struct task_struct *onoff_task; |
140 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 143 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
141 | static struct task_struct *stall_task; | 144 | static struct task_struct *stall_task; |
145 | static struct task_struct **barrier_cbs_tasks; | ||
146 | static struct task_struct *barrier_task; | ||
142 | 147 | ||
143 | #define RCU_TORTURE_PIPE_LEN 10 | 148 | #define RCU_TORTURE_PIPE_LEN 10 |
144 | 149 | ||
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
164 | static atomic_t n_rcu_torture_free; | 169 | static atomic_t n_rcu_torture_free; |
165 | static atomic_t n_rcu_torture_mberror; | 170 | static atomic_t n_rcu_torture_mberror; |
166 | static atomic_t n_rcu_torture_error; | 171 | static atomic_t n_rcu_torture_error; |
172 | static long n_rcu_torture_barrier_error; | ||
167 | static long n_rcu_torture_boost_ktrerror; | 173 | static long n_rcu_torture_boost_ktrerror; |
168 | static long n_rcu_torture_boost_rterror; | 174 | static long n_rcu_torture_boost_rterror; |
169 | static long n_rcu_torture_boost_failure; | 175 | static long n_rcu_torture_boost_failure; |
@@ -173,6 +179,8 @@ static long n_offline_attempts; | |||
173 | static long n_offline_successes; | 179 | static long n_offline_successes; |
174 | static long n_online_attempts; | 180 | static long n_online_attempts; |
175 | static long n_online_successes; | 181 | static long n_online_successes; |
182 | static long n_barrier_attempts; | ||
183 | static long n_barrier_successes; | ||
176 | static struct list_head rcu_torture_removed; | 184 | static struct list_head rcu_torture_removed; |
177 | static cpumask_var_t shuffle_tmp_mask; | 185 | static cpumask_var_t shuffle_tmp_mask; |
178 | 186 | ||
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ | |||
197 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 205 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
198 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
199 | /* and boost task create/destroy. */ | 207 | /* and boost task create/destroy. */ |
208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | ||
209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | ||
210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | ||
211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | ||
200 | 212 | ||
201 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 213 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
202 | 214 | ||
@@ -327,6 +339,7 @@ struct rcu_torture_ops { | |||
327 | int (*completed)(void); | 339 | int (*completed)(void); |
328 | void (*deferred_free)(struct rcu_torture *p); | 340 | void (*deferred_free)(struct rcu_torture *p); |
329 | void (*sync)(void); | 341 | void (*sync)(void); |
342 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
330 | void (*cb_barrier)(void); | 343 | void (*cb_barrier)(void); |
331 | void (*fqs)(void); | 344 | void (*fqs)(void); |
332 | int (*stats)(char *page); | 345 | int (*stats)(char *page); |
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
417 | .completed = rcu_torture_completed, | 430 | .completed = rcu_torture_completed, |
418 | .deferred_free = rcu_torture_deferred_free, | 431 | .deferred_free = rcu_torture_deferred_free, |
419 | .sync = synchronize_rcu, | 432 | .sync = synchronize_rcu, |
433 | .call = call_rcu, | ||
420 | .cb_barrier = rcu_barrier, | 434 | .cb_barrier = rcu_barrier, |
421 | .fqs = rcu_force_quiescent_state, | 435 | .fqs = rcu_force_quiescent_state, |
422 | .stats = NULL, | 436 | .stats = NULL, |
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
460 | .completed = rcu_torture_completed, | 474 | .completed = rcu_torture_completed, |
461 | .deferred_free = rcu_sync_torture_deferred_free, | 475 | .deferred_free = rcu_sync_torture_deferred_free, |
462 | .sync = synchronize_rcu, | 476 | .sync = synchronize_rcu, |
477 | .call = NULL, | ||
463 | .cb_barrier = NULL, | 478 | .cb_barrier = NULL, |
464 | .fqs = rcu_force_quiescent_state, | 479 | .fqs = rcu_force_quiescent_state, |
465 | .stats = NULL, | 480 | .stats = NULL, |
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
477 | .completed = rcu_no_completed, | 492 | .completed = rcu_no_completed, |
478 | .deferred_free = rcu_sync_torture_deferred_free, | 493 | .deferred_free = rcu_sync_torture_deferred_free, |
479 | .sync = synchronize_rcu_expedited, | 494 | .sync = synchronize_rcu_expedited, |
495 | .call = NULL, | ||
480 | .cb_barrier = NULL, | 496 | .cb_barrier = NULL, |
481 | .fqs = rcu_force_quiescent_state, | 497 | .fqs = rcu_force_quiescent_state, |
482 | .stats = NULL, | 498 | .stats = NULL, |
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
519 | .completed = rcu_bh_torture_completed, | 535 | .completed = rcu_bh_torture_completed, |
520 | .deferred_free = rcu_bh_torture_deferred_free, | 536 | .deferred_free = rcu_bh_torture_deferred_free, |
521 | .sync = synchronize_rcu_bh, | 537 | .sync = synchronize_rcu_bh, |
538 | .call = call_rcu_bh, | ||
522 | .cb_barrier = rcu_barrier_bh, | 539 | .cb_barrier = rcu_barrier_bh, |
523 | .fqs = rcu_bh_force_quiescent_state, | 540 | .fqs = rcu_bh_force_quiescent_state, |
524 | .stats = NULL, | 541 | .stats = NULL, |
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
535 | .completed = rcu_bh_torture_completed, | 552 | .completed = rcu_bh_torture_completed, |
536 | .deferred_free = rcu_sync_torture_deferred_free, | 553 | .deferred_free = rcu_sync_torture_deferred_free, |
537 | .sync = synchronize_rcu_bh, | 554 | .sync = synchronize_rcu_bh, |
555 | .call = NULL, | ||
538 | .cb_barrier = NULL, | 556 | .cb_barrier = NULL, |
539 | .fqs = rcu_bh_force_quiescent_state, | 557 | .fqs = rcu_bh_force_quiescent_state, |
540 | .stats = NULL, | 558 | .stats = NULL, |
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
551 | .completed = rcu_bh_torture_completed, | 569 | .completed = rcu_bh_torture_completed, |
552 | .deferred_free = rcu_sync_torture_deferred_free, | 570 | .deferred_free = rcu_sync_torture_deferred_free, |
553 | .sync = synchronize_rcu_bh_expedited, | 571 | .sync = synchronize_rcu_bh_expedited, |
572 | .call = NULL, | ||
554 | .cb_barrier = NULL, | 573 | .cb_barrier = NULL, |
555 | .fqs = rcu_bh_force_quiescent_state, | 574 | .fqs = rcu_bh_force_quiescent_state, |
556 | .stats = NULL, | 575 | .stats = NULL, |
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void) | |||
606 | return srcu_batches_completed(&srcu_ctl); | 625 | return srcu_batches_completed(&srcu_ctl); |
607 | } | 626 | } |
608 | 627 | ||
628 | static void srcu_torture_deferred_free(struct rcu_torture *rp) | ||
629 | { | ||
630 | call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); | ||
631 | } | ||
632 | |||
609 | static void srcu_torture_synchronize(void) | 633 | static void srcu_torture_synchronize(void) |
610 | { | 634 | { |
611 | synchronize_srcu(&srcu_ctl); | 635 | synchronize_srcu(&srcu_ctl); |
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page) | |||
620 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 644 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", |
621 | torture_type, TORTURE_FLAG, idx); | 645 | torture_type, TORTURE_FLAG, idx); |
622 | for_each_possible_cpu(cpu) { | 646 | for_each_possible_cpu(cpu) { |
623 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, | 647 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, |
624 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 648 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
625 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 649 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
626 | } | 650 | } |
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = { | |||
635 | .read_delay = srcu_read_delay, | 659 | .read_delay = srcu_read_delay, |
636 | .readunlock = srcu_torture_read_unlock, | 660 | .readunlock = srcu_torture_read_unlock, |
637 | .completed = srcu_torture_completed, | 661 | .completed = srcu_torture_completed, |
638 | .deferred_free = rcu_sync_torture_deferred_free, | 662 | .deferred_free = srcu_torture_deferred_free, |
639 | .sync = srcu_torture_synchronize, | 663 | .sync = srcu_torture_synchronize, |
664 | .call = NULL, | ||
640 | .cb_barrier = NULL, | 665 | .cb_barrier = NULL, |
641 | .stats = srcu_torture_stats, | 666 | .stats = srcu_torture_stats, |
642 | .name = "srcu" | 667 | .name = "srcu" |
643 | }; | 668 | }; |
644 | 669 | ||
670 | static struct rcu_torture_ops srcu_sync_ops = { | ||
671 | .init = srcu_torture_init, | ||
672 | .cleanup = srcu_torture_cleanup, | ||
673 | .readlock = srcu_torture_read_lock, | ||
674 | .read_delay = srcu_read_delay, | ||
675 | .readunlock = srcu_torture_read_unlock, | ||
676 | .completed = srcu_torture_completed, | ||
677 | .deferred_free = rcu_sync_torture_deferred_free, | ||
678 | .sync = srcu_torture_synchronize, | ||
679 | .call = NULL, | ||
680 | .cb_barrier = NULL, | ||
681 | .stats = srcu_torture_stats, | ||
682 | .name = "srcu_sync" | ||
683 | }; | ||
684 | |||
645 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | 685 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) |
646 | { | 686 | { |
647 | return srcu_read_lock_raw(&srcu_ctl); | 687 | return srcu_read_lock_raw(&srcu_ctl); |
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
659 | .read_delay = srcu_read_delay, | 699 | .read_delay = srcu_read_delay, |
660 | .readunlock = srcu_torture_read_unlock_raw, | 700 | .readunlock = srcu_torture_read_unlock_raw, |
661 | .completed = srcu_torture_completed, | 701 | .completed = srcu_torture_completed, |
662 | .deferred_free = rcu_sync_torture_deferred_free, | 702 | .deferred_free = srcu_torture_deferred_free, |
663 | .sync = srcu_torture_synchronize, | 703 | .sync = srcu_torture_synchronize, |
704 | .call = NULL, | ||
664 | .cb_barrier = NULL, | 705 | .cb_barrier = NULL, |
665 | .stats = srcu_torture_stats, | 706 | .stats = srcu_torture_stats, |
666 | .name = "srcu_raw" | 707 | .name = "srcu_raw" |
667 | }; | 708 | }; |
668 | 709 | ||
710 | static struct rcu_torture_ops srcu_raw_sync_ops = { | ||
711 | .init = srcu_torture_init, | ||
712 | .cleanup = srcu_torture_cleanup, | ||
713 | .readlock = srcu_torture_read_lock_raw, | ||
714 | .read_delay = srcu_read_delay, | ||
715 | .readunlock = srcu_torture_read_unlock_raw, | ||
716 | .completed = srcu_torture_completed, | ||
717 | .deferred_free = rcu_sync_torture_deferred_free, | ||
718 | .sync = srcu_torture_synchronize, | ||
719 | .call = NULL, | ||
720 | .cb_barrier = NULL, | ||
721 | .stats = srcu_torture_stats, | ||
722 | .name = "srcu_raw_sync" | ||
723 | }; | ||
724 | |||
669 | static void srcu_torture_synchronize_expedited(void) | 725 | static void srcu_torture_synchronize_expedited(void) |
670 | { | 726 | { |
671 | synchronize_srcu_expedited(&srcu_ctl); | 727 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { | |||
680 | .completed = srcu_torture_completed, | 736 | .completed = srcu_torture_completed, |
681 | .deferred_free = rcu_sync_torture_deferred_free, | 737 | .deferred_free = rcu_sync_torture_deferred_free, |
682 | .sync = srcu_torture_synchronize_expedited, | 738 | .sync = srcu_torture_synchronize_expedited, |
739 | .call = NULL, | ||
683 | .cb_barrier = NULL, | 740 | .cb_barrier = NULL, |
684 | .stats = srcu_torture_stats, | 741 | .stats = srcu_torture_stats, |
685 | .name = "srcu_expedited" | 742 | .name = "srcu_expedited" |
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page) | |||
1129 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1130 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1131 | "rtbf: %ld rtb: %ld nt: %ld " | 1188 | "rtbf: %ld rtb: %ld nt: %ld " |
1132 | "onoff: %ld/%ld:%ld/%ld", | 1189 | "onoff: %ld/%ld:%ld/%ld " |
1190 | "barrier: %ld/%ld:%ld", | ||
1133 | rcu_torture_current, | 1191 | rcu_torture_current, |
1134 | rcu_torture_current_version, | 1192 | rcu_torture_current_version, |
1135 | list_empty(&rcu_torture_freelist), | 1193 | list_empty(&rcu_torture_freelist), |
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page) | |||
1145 | n_online_successes, | 1203 | n_online_successes, |
1146 | n_online_attempts, | 1204 | n_online_attempts, |
1147 | n_offline_successes, | 1205 | n_offline_successes, |
1148 | n_offline_attempts); | 1206 | n_offline_attempts, |
1207 | n_barrier_successes, | ||
1208 | n_barrier_attempts, | ||
1209 | n_rcu_torture_barrier_error); | ||
1210 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1149 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1211 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1212 | n_rcu_torture_barrier_error != 0 || | ||
1150 | n_rcu_torture_boost_ktrerror != 0 || | 1213 | n_rcu_torture_boost_ktrerror != 0 || |
1151 | n_rcu_torture_boost_rterror != 0 || | 1214 | n_rcu_torture_boost_rterror != 0 || |
1152 | n_rcu_torture_boost_failure != 0) | 1215 | n_rcu_torture_boost_failure != 0 || |
1153 | cnt += sprintf(&page[cnt], " !!!"); | 1216 | i > 1) { |
1154 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1155 | if (i > 1) { | ||
1156 | cnt += sprintf(&page[cnt], "!!! "); | 1217 | cnt += sprintf(&page[cnt], "!!! "); |
1157 | atomic_inc(&n_rcu_torture_error); | 1218 | atomic_inc(&n_rcu_torture_error); |
1158 | WARN_ON_ONCE(1); | 1219 | WARN_ON_ONCE(1); |
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu) | |||
1337 | 1398 | ||
1338 | /* This must be outside of the mutex, otherwise deadlock! */ | 1399 | /* This must be outside of the mutex, otherwise deadlock! */ |
1339 | kthread_stop(t); | 1400 | kthread_stop(t); |
1401 | boost_tasks[cpu] = NULL; | ||
1340 | } | 1402 | } |
1341 | 1403 | ||
1342 | static int rcutorture_booster_init(int cpu) | 1404 | static int rcutorture_booster_init(int cpu) |
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void) | |||
1484 | return; | 1546 | return; |
1485 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | 1547 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); |
1486 | kthread_stop(onoff_task); | 1548 | kthread_stop(onoff_task); |
1549 | onoff_task = NULL; | ||
1487 | } | 1550 | } |
1488 | 1551 | ||
1489 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1552 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1490 | 1553 | ||
1491 | static void | 1554 | static int |
1492 | rcu_torture_onoff_init(void) | 1555 | rcu_torture_onoff_init(void) |
1493 | { | 1556 | { |
1557 | return 0; | ||
1494 | } | 1558 | } |
1495 | 1559 | ||
1496 | static void rcu_torture_onoff_cleanup(void) | 1560 | static void rcu_torture_onoff_cleanup(void) |
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void) | |||
1554 | return; | 1618 | return; |
1555 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | 1619 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); |
1556 | kthread_stop(stall_task); | 1620 | kthread_stop(stall_task); |
1621 | stall_task = NULL; | ||
1622 | } | ||
1623 | |||
1624 | /* Callback function for RCU barrier testing. */ | ||
1625 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | ||
1626 | { | ||
1627 | atomic_inc(&barrier_cbs_invoked); | ||
1628 | } | ||
1629 | |||
1630 | /* kthread function to register callbacks used to test RCU barriers. */ | ||
1631 | static int rcu_torture_barrier_cbs(void *arg) | ||
1632 | { | ||
1633 | long myid = (long)arg; | ||
1634 | struct rcu_head rcu; | ||
1635 | |||
1636 | init_rcu_head_on_stack(&rcu); | ||
1637 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); | ||
1638 | set_user_nice(current, 19); | ||
1639 | do { | ||
1640 | wait_event(barrier_cbs_wq[myid], | ||
1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | ||
1642 | kthread_should_stop() || | ||
1643 | fullstop != FULLSTOP_DONTSTOP); | ||
1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1645 | break; | ||
1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | ||
1647 | if (atomic_dec_and_test(&barrier_cbs_count)) | ||
1648 | wake_up(&barrier_wq); | ||
1649 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1650 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); | ||
1651 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1652 | while (!kthread_should_stop()) | ||
1653 | schedule_timeout_interruptible(1); | ||
1654 | cur_ops->cb_barrier(); | ||
1655 | destroy_rcu_head_on_stack(&rcu); | ||
1656 | return 0; | ||
1657 | } | ||
1658 | |||
1659 | /* kthread function to drive and coordinate RCU barrier testing. */ | ||
1660 | static int rcu_torture_barrier(void *arg) | ||
1661 | { | ||
1662 | int i; | ||
1663 | |||
1664 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); | ||
1665 | do { | ||
1666 | atomic_set(&barrier_cbs_invoked, 0); | ||
1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | ||
1668 | /* wake_up() path contains the required barriers. */ | ||
1669 | for (i = 0; i < n_barrier_cbs; i++) | ||
1670 | wake_up(&barrier_cbs_wq[i]); | ||
1671 | wait_event(barrier_wq, | ||
1672 | atomic_read(&barrier_cbs_count) == 0 || | ||
1673 | kthread_should_stop() || | ||
1674 | fullstop != FULLSTOP_DONTSTOP); | ||
1675 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1676 | break; | ||
1677 | n_barrier_attempts++; | ||
1678 | cur_ops->cb_barrier(); | ||
1679 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | ||
1680 | n_rcu_torture_barrier_error++; | ||
1681 | WARN_ON_ONCE(1); | ||
1682 | } | ||
1683 | n_barrier_successes++; | ||
1684 | schedule_timeout_interruptible(HZ / 10); | ||
1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | ||
1687 | rcutorture_shutdown_absorb("rcu_torture_barrier"); | ||
1688 | while (!kthread_should_stop()) | ||
1689 | schedule_timeout_interruptible(1); | ||
1690 | return 0; | ||
1691 | } | ||
1692 | |||
1693 | /* Initialize RCU barrier testing. */ | ||
1694 | static int rcu_torture_barrier_init(void) | ||
1695 | { | ||
1696 | int i; | ||
1697 | int ret; | ||
1698 | |||
1699 | if (n_barrier_cbs == 0) | ||
1700 | return 0; | ||
1701 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | ||
1702 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1703 | " Call or barrier ops missing for %s,\n", | ||
1704 | torture_type, cur_ops->name); | ||
1705 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1706 | " RCU barrier testing omitted from run.\n", | ||
1707 | torture_type); | ||
1708 | return 0; | ||
1709 | } | ||
1710 | atomic_set(&barrier_cbs_count, 0); | ||
1711 | atomic_set(&barrier_cbs_invoked, 0); | ||
1712 | barrier_cbs_tasks = | ||
1713 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | ||
1714 | GFP_KERNEL); | ||
1715 | barrier_cbs_wq = | ||
1716 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | ||
1717 | GFP_KERNEL); | ||
1718 | if (barrier_cbs_tasks == NULL || barrier_cbs_wq == NULL) | ||
1719 | return -ENOMEM; | ||
1720 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1721 | init_waitqueue_head(&barrier_cbs_wq[i]); | ||
1722 | barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, | ||
1723 | (void *)(long)i, | ||
1724 | "rcu_torture_barrier_cbs"); | ||
1725 | if (IS_ERR(barrier_cbs_tasks[i])) { | ||
1726 | ret = PTR_ERR(barrier_cbs_tasks[i]); | ||
1727 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); | ||
1728 | barrier_cbs_tasks[i] = NULL; | ||
1729 | return ret; | ||
1730 | } | ||
1731 | } | ||
1732 | barrier_task = kthread_run(rcu_torture_barrier, NULL, | ||
1733 | "rcu_torture_barrier"); | ||
1734 | if (IS_ERR(barrier_task)) { | ||
1735 | ret = PTR_ERR(barrier_task); | ||
1736 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); | ||
1737 | barrier_task = NULL; | ||
1738 | } | ||
1739 | return 0; | ||
1740 | } | ||
1741 | |||
1742 | /* Clean up after RCU barrier testing. */ | ||
1743 | static void rcu_torture_barrier_cleanup(void) | ||
1744 | { | ||
1745 | int i; | ||
1746 | |||
1747 | if (barrier_task != NULL) { | ||
1748 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); | ||
1749 | kthread_stop(barrier_task); | ||
1750 | barrier_task = NULL; | ||
1751 | } | ||
1752 | if (barrier_cbs_tasks != NULL) { | ||
1753 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1754 | if (barrier_cbs_tasks[i] != NULL) { | ||
1755 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); | ||
1756 | kthread_stop(barrier_cbs_tasks[i]); | ||
1757 | barrier_cbs_tasks[i] = NULL; | ||
1758 | } | ||
1759 | } | ||
1760 | kfree(barrier_cbs_tasks); | ||
1761 | barrier_cbs_tasks = NULL; | ||
1762 | } | ||
1763 | if (barrier_cbs_wq != NULL) { | ||
1764 | kfree(barrier_cbs_wq); | ||
1765 | barrier_cbs_wq = NULL; | ||
1766 | } | ||
1557 | } | 1767 | } |
1558 | 1768 | ||
1559 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1769 | static int rcutorture_cpu_notify(struct notifier_block *self, |
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void) | |||
1598 | fullstop = FULLSTOP_RMMOD; | 1808 | fullstop = FULLSTOP_RMMOD; |
1599 | mutex_unlock(&fullstop_mutex); | 1809 | mutex_unlock(&fullstop_mutex); |
1600 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1810 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1811 | rcu_torture_barrier_cleanup(); | ||
1601 | rcu_torture_stall_cleanup(); | 1812 | rcu_torture_stall_cleanup(); |
1602 | if (stutter_task) { | 1813 | if (stutter_task) { |
1603 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1814 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void) | |||
1665 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | 1876 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); |
1666 | kthread_stop(shutdown_task); | 1877 | kthread_stop(shutdown_task); |
1667 | } | 1878 | } |
1879 | shutdown_task = NULL; | ||
1668 | rcu_torture_onoff_cleanup(); | 1880 | rcu_torture_onoff_cleanup(); |
1669 | 1881 | ||
1670 | /* Wait for all RCU callbacks to fire. */ | 1882 | /* Wait for all RCU callbacks to fire. */ |
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void) | |||
1676 | 1888 | ||
1677 | if (cur_ops->cleanup) | 1889 | if (cur_ops->cleanup) |
1678 | cur_ops->cleanup(); | 1890 | cur_ops->cleanup(); |
1679 | if (atomic_read(&n_rcu_torture_error)) | 1891 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1680 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1892 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1681 | else if (n_online_successes != n_online_attempts || | 1893 | else if (n_online_successes != n_online_attempts || |
1682 | n_offline_successes != n_offline_attempts) | 1894 | n_offline_successes != n_offline_attempts) |
@@ -1692,10 +1904,12 @@ rcu_torture_init(void) | |||
1692 | int i; | 1904 | int i; |
1693 | int cpu; | 1905 | int cpu; |
1694 | int firsterr = 0; | 1906 | int firsterr = 0; |
1907 | int retval; | ||
1695 | static struct rcu_torture_ops *torture_ops[] = | 1908 | static struct rcu_torture_ops *torture_ops[] = |
1696 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1697 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1698 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, | 1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, |
1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | ||
1699 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1700 | 1914 | ||
1701 | mutex_lock(&fullstop_mutex); | 1915 | mutex_lock(&fullstop_mutex); |
@@ -1749,6 +1963,7 @@ rcu_torture_init(void) | |||
1749 | atomic_set(&n_rcu_torture_free, 0); | 1963 | atomic_set(&n_rcu_torture_free, 0); |
1750 | atomic_set(&n_rcu_torture_mberror, 0); | 1964 | atomic_set(&n_rcu_torture_mberror, 0); |
1751 | atomic_set(&n_rcu_torture_error, 0); | 1965 | atomic_set(&n_rcu_torture_error, 0); |
1966 | n_rcu_torture_barrier_error = 0; | ||
1752 | n_rcu_torture_boost_ktrerror = 0; | 1967 | n_rcu_torture_boost_ktrerror = 0; |
1753 | n_rcu_torture_boost_rterror = 0; | 1968 | n_rcu_torture_boost_rterror = 0; |
1754 | n_rcu_torture_boost_failure = 0; | 1969 | n_rcu_torture_boost_failure = 0; |
@@ -1872,7 +2087,6 @@ rcu_torture_init(void) | |||
1872 | test_boost_duration = 2; | 2087 | test_boost_duration = 2; |
1873 | if ((test_boost == 1 && cur_ops->can_boost) || | 2088 | if ((test_boost == 1 && cur_ops->can_boost) || |
1874 | test_boost == 2) { | 2089 | test_boost == 2) { |
1875 | int retval; | ||
1876 | 2090 | ||
1877 | boost_starttime = jiffies + test_boost_interval * HZ; | 2091 | boost_starttime = jiffies + test_boost_interval * HZ; |
1878 | register_cpu_notifier(&rcutorture_cpu_nb); | 2092 | register_cpu_notifier(&rcutorture_cpu_nb); |
@@ -1897,9 +2111,22 @@ rcu_torture_init(void) | |||
1897 | goto unwind; | 2111 | goto unwind; |
1898 | } | 2112 | } |
1899 | } | 2113 | } |
1900 | rcu_torture_onoff_init(); | 2114 | i = rcu_torture_onoff_init(); |
2115 | if (i != 0) { | ||
2116 | firsterr = i; | ||
2117 | goto unwind; | ||
2118 | } | ||
1901 | register_reboot_notifier(&rcutorture_shutdown_nb); | 2119 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1902 | rcu_torture_stall_init(); | 2120 | i = rcu_torture_stall_init(); |
2121 | if (i != 0) { | ||
2122 | firsterr = i; | ||
2123 | goto unwind; | ||
2124 | } | ||
2125 | retval = rcu_torture_barrier_init(); | ||
2126 | if (retval != 0) { | ||
2127 | firsterr = retval; | ||
2128 | goto unwind; | ||
2129 | } | ||
1903 | rcutorture_record_test_transition(); | 2130 | rcutorture_record_test_transition(); |
1904 | mutex_unlock(&fullstop_mutex); | 2131 | mutex_unlock(&fullstop_mutex); |
1905 | return 0; | 2132 | return 0; |
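
The new barrier kthreads exercise the guarantee that rcu_barrier() waits for all previously posted call_rcu() callbacks. A short sketch of the pattern being verified (not part of the patch; the "example" names are invented):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_item {
	struct rcu_head rcu;
	int data;
};

static void example_free_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct example_item, rcu));
}

/* Retire an item: the callback runs after a grace period elapses. */
static void example_retire(struct example_item *item)
{
	call_rcu(&item->rcu, example_free_cb);
}

/* What the barrier test checks: once rcu_barrier() returns, every
 * callback posted by earlier call_rcu() invocations has run, so e.g.
 * the module providing example_free_cb() may safely be unloaded. */
static void example_teardown(void)
{
	rcu_barrier();
}
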
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0c5baf1ab18..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
75 | .gpnum = -300, \ | 75 | .gpnum = -300, \ |
76 | .completed = -300, \ | 76 | .completed = -300, \ |
77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | ||
79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | ||
78 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
79 | .n_force_qs = 0, \ | 81 | .n_force_qs = 0, \ |
80 | .n_force_qs_ngp = 0, \ | 82 | .n_force_qs_ngp = 0, \ |
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
145 | unsigned long rcutorture_testseq; | 147 | unsigned long rcutorture_testseq; |
146 | unsigned long rcutorture_vernum; | 148 | unsigned long rcutorture_vernum; |
147 | 149 | ||
150 | /* State information for rcu_barrier() and friends. */ | ||
151 | |||
152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
153 | static atomic_t rcu_barrier_cpu_count; | ||
154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
155 | static struct completion rcu_barrier_completion; | ||
156 | |||
148 | /* | 157 | /* |
149 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
150 | * permit this function to be invoked without holding the root rcu_node | 159 | * permit this function to be invoked without holding the root rcu_node |
@@ -1311,95 +1320,135 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1311 | #ifdef CONFIG_HOTPLUG_CPU | 1320 | #ifdef CONFIG_HOTPLUG_CPU |
1312 | 1321 | ||
1313 | /* | 1322 | /* |
1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1323 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1315 | * Also record a quiescent state for this CPU for the current grace period. | 1324 | * specified CPU must be offline, and the caller must hold the |
1316 | * Synchronization and interrupt disabling are not required because | 1325 | * ->onofflock. |
1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
1318 | * operations that might block must be done later from the CPU_DEAD | ||
1319 | * notifier. | ||
1320 | * | ||
1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
1323 | * destination from the bits set in that mask. | ||
1324 | */ | 1326 | */ |
1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1327 | static void |
1328 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
1329 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
1326 | { | 1330 | { |
1327 | int i; | 1331 | int i; |
1328 | unsigned long mask; | ||
1329 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
1333 | 1332 | ||
1334 | /* First, adjust the counts. */ | 1333 | /* |
1334 | * Orphan the callbacks. First adjust the counts. This is safe | ||
1335 | * because ->onofflock excludes _rcu_barrier()'s adoption of | ||
1336 | * the callbacks, thus no memory barrier is required. | ||
1337 | */ | ||
1335 | if (rdp->nxtlist != NULL) { | 1338 | if (rdp->nxtlist != NULL) { |
1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | 1339 | rsp->qlen_lazy += rdp->qlen_lazy; |
1337 | receive_rdp->qlen += rdp->qlen; | 1340 | rsp->qlen += rdp->qlen; |
1341 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1338 | rdp->qlen_lazy = 0; | 1342 | rdp->qlen_lazy = 0; |
1339 | rdp->qlen = 0; | 1343 | rdp->qlen = 0; |
1340 | } | 1344 | } |
1341 | 1345 | ||
1342 | /* | 1346 | /* |
1343 | * Next, move ready-to-invoke callbacks to be invoked on some | 1347 | * Next, move those callbacks still needing a grace period to |
1344 | * other CPU. These will not be required to pass through another | 1348 | * the orphanage, where some other CPU will pick them up. |
1345 | * grace period: They are done, regardless of CPU. | 1349 | * Some of the callbacks might have gone partway through a grace |
1350 | * period, but that is too bad. They get to start over because we | ||
1351 | * cannot assume that grace periods are synchronized across CPUs. | ||
1352 | * We don't bother updating the ->nxttail[] array yet, instead | ||
1353 | * we just reset the whole thing later on. | ||
1346 | */ | 1354 | */ |
1347 | if (rdp->nxtlist != NULL && | 1355 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { |
1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | 1356 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; |
1349 | struct rcu_head *oldhead; | 1357 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; |
1350 | struct rcu_head **oldtail; | 1358 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1351 | struct rcu_head **newtail; | ||
1352 | |||
1353 | oldhead = rdp->nxtlist; | ||
1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
1361 | receive_rdp->nxttail[i] = newtail; | ||
1362 | if (rdp->nxttail[i] == newtail) | ||
1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1364 | } | ||
1365 | } | 1359 | } |
1366 | 1360 | ||
1367 | /* | 1361 | /* |
1368 | * Finally, put the rest of the callbacks at the end of the list. | 1362 | * Then move the ready-to-invoke callbacks to the orphanage, |
1369 | * The ones that made it partway through get to start over: We | 1363 | * where some other CPU will pick them up. These will not be |
1370 | * cannot assume that grace periods are synchronized across CPUs. | 1364 | * required to pass through another grace period: They are done. |
1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but | ||
1372 | * this does not seem compelling. Not yet, anyway.) | ||
1373 | */ | 1365 | */ |
1374 | if (rdp->nxtlist != NULL) { | 1366 | if (rdp->nxtlist != NULL) { |
1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1367 | *rsp->orphan_donetail = rdp->nxtlist; |
1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | 1368 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1380 | |||
1381 | rdp->nxtlist = NULL; | ||
1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1384 | } | 1369 | } |
1385 | 1370 | ||
1371 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
1372 | rdp->nxtlist = NULL; | ||
1373 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1374 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1375 | } | ||
1376 | |||
1377 | /* | ||
1378 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
1379 | * orphanage. The caller must hold the ->onofflock. | ||
1380 | */ | ||
1381 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1382 | { | ||
1383 | int i; | ||
1384 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
1385 | |||
1386 | /* | 1386 | /* |
1387 | * Record a quiescent state for the dying CPU. This is safe | 1387 | * If there is an rcu_barrier() operation in progress, then |
1388 | * only because we have already cleared out the callbacks. | 1388 | * only the task doing that operation is permitted to adopt |
1389 | * (Otherwise, the RCU core might try to schedule the invocation | 1389 | * callbacks. To do otherwise breaks rcu_barrier() and friends |
1390 | * of callbacks on this now-offline CPU, which would be bad.) | 1390 | * by causing them to fail to wait for the callbacks in the |
1391 | * orphanage. | ||
1391 | */ | 1392 | */ |
1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1393 | if (rsp->rcu_barrier_in_progress && |
1394 | rsp->rcu_barrier_in_progress != current) | ||
1395 | return; | ||
1396 | |||
1397 | /* Do the accounting first. */ | ||
1398 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
1399 | rdp->qlen += rsp->qlen; | ||
1400 | rdp->n_cbs_adopted += rsp->qlen; | ||
1401 | if (rsp->qlen_lazy != rsp->qlen) | ||
1402 | rcu_idle_count_callbacks_posted(); | ||
1403 | rsp->qlen_lazy = 0; | ||
1404 | rsp->qlen = 0; | ||
1405 | |||
1406 | /* | ||
1407 | * We do not need a memory barrier here because the only way we | ||
1408 | * can get here if there is an rcu_barrier() in flight is if | ||
1409 | * we are the task doing the rcu_barrier(). | ||
1410 | */ | ||
1411 | |||
1412 | /* First adopt the ready-to-invoke callbacks. */ | ||
1413 | if (rsp->orphan_donelist != NULL) { | ||
1414 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1415 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
1416 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
1417 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
1418 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
1419 | rsp->orphan_donelist = NULL; | ||
1420 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1421 | } | ||
1422 | |||
1423 | /* And then adopt the callbacks that still need a grace period. */ | ||
1424 | if (rsp->orphan_nxtlist != NULL) { | ||
1425 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
1426 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
1427 | rsp->orphan_nxtlist = NULL; | ||
1428 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1429 | } | ||
1430 | } | ||
1431 | |||
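Both rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() above rely on the same head-plus-tail-pointer list representation, where the tail pointer addresses either the head itself (empty list) or the last element's ->next field. A minimal, self-contained sketch of that O(1) splice idiom, with purely illustrative names:

#include <stddef.h>

struct cb {
	struct cb *next;
};

struct cb_list {
	struct cb *head;	/* First element, or NULL if the list is empty. */
	struct cb **tail;	/* &head when empty, else &last_element->next. */
};

static void cb_list_init(struct cb_list *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

/* Append everything on @from to @to in O(1), leaving @from empty. */
static void cb_list_splice_tail(struct cb_list *to, struct cb_list *from)
{
	if (from->head == NULL)
		return;
	*to->tail = from->head;	/* Hook the donor's head onto the receiver's tail. */
	to->tail = from->tail;	/* The receiver's tail is now the donor's tail. */
	cb_list_init(from);	/* The donor is empty again. */
}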
1432 | /* | ||
1433 | * Trace the fact that this CPU is going offline. | ||
1434 | */ | ||
1435 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
1436 | { | ||
1437 | RCU_TRACE(unsigned long mask); | ||
1438 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
1439 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
1440 | |||
1441 | RCU_TRACE(mask = rdp->grpmask); | ||
1393 | trace_rcu_grace_period(rsp->name, | 1442 | trace_rcu_grace_period(rsp->name, |
1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1443 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1395 | "cpuofl"); | 1444 | "cpuofl"); |
1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
1398 | } | 1445 | } |
1399 | 1446 | ||
1400 | /* | 1447 | /* |
1401 | * The CPU has been completely removed, and some other CPU is reporting | 1448 | * The CPU has been completely removed, and some other CPU is reporting |
1402 | * this fact from process context. Do the remainder of the cleanup. | 1449 | * this fact from process context. Do the remainder of the cleanup, |
1450 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
1451 | * adopting them, if there is no _rcu_barrier() instance running. | ||
1403 | * There can only be one CPU hotplug operation at a time, so no other | 1452 | * There can only be one CPU hotplug operation at a time, so no other |
1404 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1453 | * CPU can be attempting to update rcu_cpu_kthread_task. |
1405 | */ | 1454 | */ |
@@ -1409,17 +1458,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1409 | unsigned long mask; | 1458 | unsigned long mask; |
1410 | int need_report = 0; | 1459 | int need_report = 0; |
1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1460 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ | 1461 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
1413 | 1462 | ||
1414 | /* Adjust any no-longer-needed kthreads. */ | 1463 | /* Adjust any no-longer-needed kthreads. */ |
1415 | rcu_stop_cpu_kthread(cpu); | 1464 | rcu_stop_cpu_kthread(cpu); |
1416 | rcu_node_kthread_setaffinity(rnp, -1); | 1465 | rcu_node_kthread_setaffinity(rnp, -1); |
1417 | 1466 | ||
1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | 1467 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1419 | 1468 | ||
1420 | /* Exclude any attempts to start a new grace period. */ | 1469 | /* Exclude any attempts to start a new grace period. */ |
1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1470 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1422 | 1471 | ||
1472 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
1473 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
1474 | rcu_adopt_orphan_cbs(rsp); | ||
1475 | |||
1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1476 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1477 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1425 | do { | 1478 | do { |
@@ -1456,6 +1509,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1456 | 1509 | ||
1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1510 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1458 | 1511 | ||
1512 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1513 | { | ||
1514 | } | ||
1515 | |||
1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1516 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1460 | { | 1517 | { |
1461 | } | 1518 | } |
@@ -1474,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1474 | { | 1531 | { |
1475 | unsigned long flags; | 1532 | unsigned long flags; |
1476 | struct rcu_head *next, *list, **tail; | 1533 | struct rcu_head *next, *list, **tail; |
1477 | int bl, count, count_lazy; | 1534 | int bl, count, count_lazy, i; |
1478 | 1535 | ||
1479 | /* If no callbacks are ready, just return.*/ | 1536 | /* If no callbacks are ready, just return.*/ |
1480 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1537 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
@@ -1497,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1497 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1554 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
1498 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1555 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1499 | tail = rdp->nxttail[RCU_DONE_TAIL]; | 1556 | tail = rdp->nxttail[RCU_DONE_TAIL]; |
1500 | for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) | 1557 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) |
1501 | if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) | 1558 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) |
1502 | rdp->nxttail[count] = &rdp->nxtlist; | 1559 | rdp->nxttail[i] = &rdp->nxtlist; |
1503 | local_irq_restore(flags); | 1560 | local_irq_restore(flags); |
1504 | 1561 | ||
1505 | /* Invoke callbacks. */ | 1562 | /* Invoke callbacks. */ |
@@ -1524,18 +1581,19 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1524 | rcu_is_callbacks_kthread()); | 1581 | rcu_is_callbacks_kthread()); |
1525 | 1582 | ||
1526 | /* Update count, and requeue any remaining callbacks. */ | 1583 | /* Update count, and requeue any remaining callbacks. */ |
1527 | rdp->qlen_lazy -= count_lazy; | ||
1528 | rdp->qlen -= count; | ||
1529 | rdp->n_cbs_invoked += count; | ||
1530 | if (list != NULL) { | 1584 | if (list != NULL) { |
1531 | *tail = rdp->nxtlist; | 1585 | *tail = rdp->nxtlist; |
1532 | rdp->nxtlist = list; | 1586 | rdp->nxtlist = list; |
1533 | for (count = 0; count < RCU_NEXT_SIZE; count++) | 1587 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1534 | if (&rdp->nxtlist == rdp->nxttail[count]) | 1588 | if (&rdp->nxtlist == rdp->nxttail[i]) |
1535 | rdp->nxttail[count] = tail; | 1589 | rdp->nxttail[i] = tail; |
1536 | else | 1590 | else |
1537 | break; | 1591 | break; |
1538 | } | 1592 | } |
1593 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
1594 | rdp->qlen_lazy -= count_lazy; | ||
1595 | rdp->qlen -= count; | ||
1596 | rdp->n_cbs_invoked += count; | ||
1539 | 1597 | ||
1540 | /* Reinstate batch limit if we have worked down the excess. */ | 1598 | /* Reinstate batch limit if we have worked down the excess. */ |
1541 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 1599 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) |
@@ -1823,11 +1881,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1823 | rdp = this_cpu_ptr(rsp->rda); | 1881 | rdp = this_cpu_ptr(rsp->rda); |
1824 | 1882 | ||
1825 | /* Add the callback to our list. */ | 1883 | /* Add the callback to our list. */ |
1826 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1827 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1828 | rdp->qlen++; | 1884 | rdp->qlen++; |
1829 | if (lazy) | 1885 | if (lazy) |
1830 | rdp->qlen_lazy++; | 1886 | rdp->qlen_lazy++; |
1887 | else | ||
1888 | rcu_idle_count_callbacks_posted(); | ||
1889 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
1890 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1891 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1831 | 1892 | ||
1832 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1893 | if (__is_kfree_rcu_offset((unsigned long)func)) |
1833 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1894 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
@@ -1893,6 +1954,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
1893 | } | 1954 | } |
1894 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 1955 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1895 | 1956 | ||
1957 | /* | ||
1958 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | ||
1959 | * any blocking grace-period wait automatically implies a grace period | ||
1960 | * if there is only one CPU online at any point in time during execution | ||
1961 | * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to | ||
1962 | * occasionally incorrectly indicate that there are multiple CPUs online | ||
1963 | * when there was in fact only one the whole time, as this just adds | ||
1964 | * some overhead: RCU still operates correctly. | ||
1965 | * | ||
1966 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
1967 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
1968 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
1969 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
1970 | * all times, but num_online_cpus() might well return one (or even zero). | ||
1971 | * | ||
1972 | * However, all such demonic sequences require at least one CPU-offline | ||
1973 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
1974 | * is only a problem if there is an RCU read-side critical section executing | ||
1975 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
1976 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
1977 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
1978 | * that there is only one CPU when in fact there was more than one throughout | ||
1979 | * is when there were no RCU readers in the system. If there are no | ||
1980 | * RCU readers, the grace period by definition can be of zero length, | ||
1981 | * regardless of the number of online CPUs. | ||
1982 | */ | ||
1983 | static inline int rcu_blocking_is_gp(void) | ||
1984 | { | ||
1985 | might_sleep(); /* Check for RCU read-side critical section. */ | ||
1986 | return num_online_cpus() <= 1; | ||
1987 | } | ||
1988 | |||
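A rough sketch of the kind of short-circuit this check enables in a blocking grace-period primitive; wait_rcu_gp() and call_rcu_sched() are existing kernel interfaces, and the exact body of the real callers is not shown in this hunk, so treat this as illustrative:

void synchronize_sched_sketch(void)
{
	if (rcu_blocking_is_gp())
		return;			/* Only one CPU online: nothing to wait for. */
	wait_rcu_gp(call_rcu_sched);	/* Otherwise wait out a full grace period. */
}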
1896 | /** | 1989 | /** |
1897 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. | 1990 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. |
1898 | * | 1991 | * |
@@ -2166,11 +2259,10 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
2166 | rcu_preempt_cpu_has_callbacks(cpu); | 2259 | rcu_preempt_cpu_has_callbacks(cpu); |
2167 | } | 2260 | } |
2168 | 2261 | ||
2169 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2262 | /* |
2170 | static atomic_t rcu_barrier_cpu_count; | 2263 | * RCU callback function for _rcu_barrier(). If we are last, wake |
2171 | static DEFINE_MUTEX(rcu_barrier_mutex); | 2264 | * up the task executing _rcu_barrier(). |
2172 | static struct completion rcu_barrier_completion; | 2265 | */ |
2173 | |||
2174 | static void rcu_barrier_callback(struct rcu_head *notused) | 2266 | static void rcu_barrier_callback(struct rcu_head *notused) |
2175 | { | 2267 | { |
2176 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2268 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -2200,27 +2292,94 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2200 | void (*call_rcu_func)(struct rcu_head *head, | 2292 | void (*call_rcu_func)(struct rcu_head *head, |
2201 | void (*func)(struct rcu_head *head))) | 2293 | void (*func)(struct rcu_head *head))) |
2202 | { | 2294 | { |
2203 | BUG_ON(in_interrupt()); | 2295 | int cpu; |
2296 | unsigned long flags; | ||
2297 | struct rcu_data *rdp; | ||
2298 | struct rcu_head rh; | ||
2299 | |||
2300 | init_rcu_head_on_stack(&rh); | ||
2301 | |||
2204 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2302 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
2205 | mutex_lock(&rcu_barrier_mutex); | 2303 | mutex_lock(&rcu_barrier_mutex); |
2206 | init_completion(&rcu_barrier_completion); | 2304 | |
2305 | smp_mb(); /* Prevent any prior operations from leaking in. */ | ||
2306 | |||
2207 | /* | 2307 | /* |
2208 | * Initialize rcu_barrier_cpu_count to 1, then invoke | 2308 | * Initialize the count to one rather than to zero in order to |
2209 | * rcu_barrier_func() on each CPU, so that each CPU also has | 2309 | * avoid a too-soon return to zero in case of a short grace period |
2210 | * incremented rcu_barrier_cpu_count. Only then is it safe to | 2310 | * (or preemption of this task). Also flag this task as doing |
2211 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 2311 | * an rcu_barrier(). This will prevent anyone else from adopting |
2212 | * might complete its grace period before all of the other CPUs | 2312 | * orphaned callbacks, which could otherwise cause failure if a |
2213 | * did their increment, causing this function to return too | 2313 | * CPU went offline and quickly came back online. To see this, |
2214 | * early. Note that on_each_cpu() disables irqs, which prevents | 2314 | * consider the following sequence of events: |
2215 | * any CPUs from coming online or going offline until each online | 2315 | * |
2216 | * CPU has queued its RCU-barrier callback. | 2316 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. |
2317 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
2318 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
2319 | * 4. CPU 1 comes back online. | ||
2320 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
2321 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
2322 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
2217 | */ | 2323 | */ |
2324 | init_completion(&rcu_barrier_completion); | ||
2218 | atomic_set(&rcu_barrier_cpu_count, 1); | 2325 | atomic_set(&rcu_barrier_cpu_count, 1); |
2219 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 2326 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
2327 | rsp->rcu_barrier_in_progress = current; | ||
2328 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2329 | |||
2330 | /* | ||
2331 | * Force every CPU with callbacks to register a new callback | ||
2332 | * that will tell us when all the preceding callbacks have | ||
2333 | * been invoked. If an offline CPU has callbacks, wait for | ||
2334 | * it to either come back online or to finish orphaning those | ||
2335 | * callbacks. | ||
2336 | */ | ||
2337 | for_each_possible_cpu(cpu) { | ||
2338 | preempt_disable(); | ||
2339 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2340 | if (cpu_is_offline(cpu)) { | ||
2341 | preempt_enable(); | ||
2342 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
2343 | schedule_timeout_interruptible(1); | ||
2344 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2345 | smp_call_function_single(cpu, rcu_barrier_func, | ||
2346 | (void *)call_rcu_func, 1); | ||
2347 | preempt_enable(); | ||
2348 | } else { | ||
2349 | preempt_enable(); | ||
2350 | } | ||
2351 | } | ||
2352 | |||
2353 | /* | ||
2354 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
2355 | * posted, we can adopt all of the orphaned callbacks and place | ||
2356 | * an rcu_barrier_callback() callback after them. When that is done, | ||
2357 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
2358 | * following every callback that could possibly have been | ||
2359 | * registered before _rcu_barrier() was called. | ||
2360 | */ | ||
2361 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
2362 | rcu_adopt_orphan_cbs(rsp); | ||
2363 | rsp->rcu_barrier_in_progress = NULL; | ||
2364 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2365 | atomic_inc(&rcu_barrier_cpu_count); | ||
2366 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
2367 | call_rcu_func(&rh, rcu_barrier_callback); | ||
2368 | |||
2369 | /* | ||
2370 | * Now that we have an rcu_barrier_callback() callback on each | ||
2371 | * CPU, and thus each counted, remove the initial count. | ||
2372 | */ | ||
2220 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2373 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
2221 | complete(&rcu_barrier_completion); | 2374 | complete(&rcu_barrier_completion); |
2375 | |||
2376 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
2222 | wait_for_completion(&rcu_barrier_completion); | 2377 | wait_for_completion(&rcu_barrier_completion); |
2378 | |||
2379 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
2223 | mutex_unlock(&rcu_barrier_mutex); | 2380 | mutex_unlock(&rcu_barrier_mutex); |
2381 | |||
2382 | destroy_rcu_head_on_stack(&rh); | ||
2224 | } | 2383 | } |
2225 | 2384 | ||
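The count-starts-at-one idiom above generalizes; a stripped-down sketch shows why the extra initial reference prevents a too-early completion. Here post_barrier_cb_on() is a made-up placeholder for the smp_call_function_single()/call_rcu_func() plumbing in the hunk above.

static atomic_t barrier_count;
static DECLARE_COMPLETION(barrier_done);

static void barrier_cb(struct rcu_head *unused)
{
	if (atomic_dec_and_test(&barrier_count))
		complete(&barrier_done);
}

static void barrier_wait_sketch(void)
{
	int cpu;

	init_completion(&barrier_done);
	atomic_set(&barrier_count, 1);		/* Hold our own reference while posting. */
	for_each_online_cpu(cpu) {
		atomic_inc(&barrier_count);	/* One reference per posted callback. */
		post_barrier_cb_on(cpu, barrier_cb);	/* Hypothetical helper. */
	}
	if (atomic_dec_and_test(&barrier_count))	/* Drop our own reference... */
		complete(&barrier_done);		/* ...last one out completes. */
	wait_for_completion(&barrier_done);
}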
2226 | /** | 2385 | /** |
@@ -2417,7 +2576,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2417 | 2576 | ||
2418 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2577 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
2419 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2578 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
2420 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | 2579 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; |
2421 | } | 2580 | } |
2422 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2581 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
2423 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2582 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a4072..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -29,18 +29,14 @@ | |||
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
33 | * CONFIG_RCU_FANOUT_LEAF. | ||
33 | * In theory, it should be possible to add more levels straightforwardly. | 34 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this did work well going from three levels to four. | 35 | * In practice, this did work well going from three levels to four. |
35 | * Of course, your mileage may vary. | 36 | * Of course, your mileage may vary. |
36 | */ | 37 | */ |
37 | #define MAX_RCU_LVLS 4 | 38 | #define MAX_RCU_LVLS 4 |
38 | #if CONFIG_RCU_FANOUT > 16 | 39 | #define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) |
39 | #define RCU_FANOUT_LEAF 16 | ||
40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ | ||
41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) | ||
42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | ||
43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | 40 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) |
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | 41 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) |
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
@@ -88,6 +84,20 @@ struct rcu_dynticks { | |||
88 | /* Process level is worth LLONG_MAX/2. */ | 84 | /* Process level is worth LLONG_MAX/2. */ |
89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 85 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | 86 | atomic_t dynticks; /* Even value for idle, else odd. */ |
87 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
88 | int dyntick_drain; /* Prepare-for-idle state variable. */ | ||
89 | unsigned long dyntick_holdoff; | ||
90 | /* No retries for the jiffy of failure. */ | ||
91 | struct timer_list idle_gp_timer; | ||
92 | /* Wake up CPU sleeping with callbacks. */ | ||
93 | unsigned long idle_gp_timer_expires; | ||
94 | /* When to wake up CPU (for repost). */ | ||
95 | bool idle_first_pass; /* First pass of attempt to go idle? */ | ||
96 | unsigned long nonlazy_posted; | ||
97 | /* # times non-lazy CBs posted to CPU. */ | ||
98 | unsigned long nonlazy_posted_snap; | ||
99 | /* idle-period nonlazy_posted snapshot. */ | ||
100 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
91 | }; | 101 | }; |
92 | 102 | ||
93 | /* RCU's kthread states for tracing. */ | 103 | /* RCU's kthread states for tracing. */ |
@@ -371,6 +381,17 @@ struct rcu_state { | |||
371 | 381 | ||
372 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 382 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
373 | /* starting new GP. */ | 383 | /* starting new GP. */ |
384 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
385 | /* need a grace period. */ | ||
386 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
387 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
388 | /* are ready to invoke. */ | ||
389 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
390 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
391 | long qlen; /* Total number of callbacks. */ | ||
392 | struct task_struct *rcu_barrier_in_progress; | ||
393 | /* Task doing rcu_barrier(), */ | ||
394 | /* or NULL if no barrier. */ | ||
374 | raw_spinlock_t fqslock; /* Only one task forcing */ | 395 | raw_spinlock_t fqslock; /* Only one task forcing */ |
375 | /* quiescent states. */ | 396 | /* quiescent states. */ |
376 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 397 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -471,6 +492,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); | |||
471 | static void rcu_prepare_for_idle_init(int cpu); | 492 | static void rcu_prepare_for_idle_init(int cpu); |
472 | static void rcu_cleanup_after_idle(int cpu); | 493 | static void rcu_cleanup_after_idle(int cpu); |
473 | static void rcu_prepare_for_idle(int cpu); | 494 | static void rcu_prepare_for_idle(int cpu); |
495 | static void rcu_idle_count_callbacks_posted(void); | ||
474 | static void print_cpu_stall_info_begin(void); | 496 | static void print_cpu_stall_info_begin(void); |
475 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 497 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
476 | static void print_cpu_stall_info_end(void); | 498 | static void print_cpu_stall_info_end(void); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816be..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) | |||
969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | 969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
970 | } | 970 | } |
971 | 971 | ||
972 | /* | ||
973 | * Check for a task exiting while in a preemptible-RCU read-side | ||
974 | * critical section, clean up if so. No need to issue warnings, | ||
975 | * as debug_check_no_locks_held() already does this if lockdep | ||
976 | * is enabled. | ||
977 | */ | ||
978 | void exit_rcu(void) | ||
979 | { | ||
980 | struct task_struct *t = current; | ||
981 | |||
982 | if (t->rcu_read_lock_nesting == 0) | ||
983 | return; | ||
984 | t->rcu_read_lock_nesting = 1; | ||
985 | __rcu_read_unlock(); | ||
986 | } | ||
987 | |||
988 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 972 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
989 | 973 | ||
990 | static struct rcu_state *rcu_state = &rcu_sched_state; | 974 | static struct rcu_state *rcu_state = &rcu_sched_state; |
@@ -1910,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1910 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs | 1894 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs |
1911 | * any flavor of RCU. | 1895 | * any flavor of RCU. |
1912 | */ | 1896 | */ |
1913 | int rcu_needs_cpu(int cpu) | 1897 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) |
1914 | { | 1898 | { |
1899 | *delta_jiffies = ULONG_MAX; | ||
1915 | return rcu_cpu_has_callbacks(cpu); | 1900 | return rcu_cpu_has_callbacks(cpu); |
1916 | } | 1901 | } |
1917 | 1902 | ||
@@ -1938,6 +1923,14 @@ static void rcu_prepare_for_idle(int cpu) | |||
1938 | { | 1923 | { |
1939 | } | 1924 | } |
1940 | 1925 | ||
1926 | /* | ||
1927 | * Don't bother keeping a running count of the number of RCU callbacks | ||
1928 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
1929 | */ | ||
1930 | static void rcu_idle_count_callbacks_posted(void) | ||
1931 | { | ||
1932 | } | ||
1933 | |||
1941 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1934 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1942 | 1935 | ||
1943 | /* | 1936 | /* |
@@ -1978,30 +1971,6 @@ static void rcu_prepare_for_idle(int cpu) | |||
1978 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1971 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1979 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1972 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1980 | 1973 | ||
1981 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
1982 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
1983 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | ||
1984 | static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ | ||
1985 | static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | ||
1986 | |||
1987 | /* | ||
1988 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
1989 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
1990 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
1991 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
1992 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
1993 | * it is better to incur scheduling-clock interrupts than to spin | ||
1994 | * continuously for the same time duration! | ||
1995 | */ | ||
1996 | int rcu_needs_cpu(int cpu) | ||
1997 | { | ||
1998 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
1999 | if (!rcu_cpu_has_callbacks(cpu)) | ||
2000 | return 0; | ||
2001 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
2002 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
2003 | } | ||
2004 | |||
2005 | /* | 1974 | /* |
2006 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1975 | * Does the specified flavor of RCU have non-lazy callbacks pending on |
2007 | * the specified CPU? Both RCU flavor and CPU are specified by the | 1976 | * the specified CPU? Both RCU flavor and CPU are specified by the |
@@ -2045,16 +2014,75 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2045 | } | 2014 | } |
2046 | 2015 | ||
2047 | /* | 2016 | /* |
2017 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
2018 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
2019 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
2020 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
2021 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
2022 | * it is better to incur scheduling-clock interrupts than to spin | ||
2023 | * continuously for the same time duration! | ||
2024 | * | ||
2025 | * The delta_jiffies argument is used to store the time when RCU is | ||
2026 | * going to need the CPU again if it still has callbacks. The reason | ||
2027 | * for this is that rcu_prepare_for_idle() might need to post a timer, | ||
2028 | * but if so, it will do so after tick_nohz_stop_sched_tick() has set | ||
2029 | * the wakeup time for this CPU. This means that RCU's timer can be | ||
2030 | * delayed until the wakeup time, which defeats the purpose of posting | ||
2031 | * a timer. | ||
2032 | */ | ||
2033 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | ||
2034 | { | ||
2035 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
2036 | |||
2037 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
2038 | rdtp->idle_first_pass = 1; | ||
2039 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
2040 | if (!rcu_cpu_has_callbacks(cpu)) { | ||
2041 | *delta_jiffies = ULONG_MAX; | ||
2042 | return 0; | ||
2043 | } | ||
2044 | if (rdtp->dyntick_holdoff == jiffies) { | ||
2045 | /* RCU recently tried and failed, so don't try again. */ | ||
2046 | *delta_jiffies = 1; | ||
2047 | return 1; | ||
2048 | } | ||
2049 | /* Set up for the possibility that RCU will post a timer. */ | ||
2050 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | ||
2051 | *delta_jiffies = RCU_IDLE_GP_DELAY; | ||
2052 | else | ||
2053 | *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; | ||
2054 | return 0; | ||
2055 | } | ||
2056 | |||
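For context, a hedged sketch of how an idle-tick caller might consume the new delta_jiffies out-parameter; the function and variable names below are illustrative and do not come from this patch.

static unsigned long next_idle_wakeup_delta(int cpu, unsigned long timer_delta)
{
	unsigned long rcu_delta;

	if (rcu_needs_cpu(cpu, &rcu_delta))
		return 1;			/* RCU needs the very next tick. */
	/*
	 * RCU can tolerate idling, but must be woken no later than
	 * rcu_delta jiffies from now so that any timer it reposts in
	 * rcu_prepare_for_idle() is not pushed past its expiry.
	 */
	return min(timer_delta, rcu_delta);
}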
2057 | /* | ||
2058 | * Handler for smp_call_function_single(). The only point of this | ||
2059 | * handler is to wake the CPU up, so the handler does only tracing. | ||
2060 | */ | ||
2061 | void rcu_idle_demigrate(void *unused) | ||
2062 | { | ||
2063 | trace_rcu_prep_idle("Demigrate"); | ||
2064 | } | ||
2065 | |||
2066 | /* | ||
2048 | * Timer handler used to force CPU to start pushing its remaining RCU | 2067 | * Timer handler used to force CPU to start pushing its remaining RCU |
2049 | * callbacks in the case where it entered dyntick-idle mode with callbacks | 2068 | * callbacks in the case where it entered dyntick-idle mode with callbacks |
2050 | * pending. The handler doesn't really need to do anything because the | 2069 | * pending. The handler doesn't really need to do anything because the |
2051 | * real work is done upon re-entry to idle, or by the next scheduling-clock | 2070 | * real work is done upon re-entry to idle, or by the next scheduling-clock |
2052 | * interrupt should idle not be re-entered. | 2071 | * interrupt should idle not be re-entered. |
2072 | * | ||
2073 | * One special case: the timer gets migrated without awakening the CPU | ||
2074 | * on which the timer was scheduled on. In this case, we must wake up | ||
2075 | * that CPU. We do so with smp_call_function_single(). | ||
2053 | */ | 2076 | */ |
2054 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | 2077 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) |
2055 | { | 2078 | { |
2079 | int cpu = (int)cpu_in; | ||
2080 | |||
2056 | trace_rcu_prep_idle("Timer"); | 2081 | trace_rcu_prep_idle("Timer"); |
2057 | return HRTIMER_NORESTART; | 2082 | if (cpu != smp_processor_id()) |
2083 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
2084 | else | ||
2085 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
2058 | } | 2086 | } |
2059 | 2087 | ||
2060 | /* | 2088 | /* |
@@ -2062,29 +2090,25 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | |||
2062 | */ | 2090 | */ |
2063 | static void rcu_prepare_for_idle_init(int cpu) | 2091 | static void rcu_prepare_for_idle_init(int cpu) |
2064 | { | 2092 | { |
2065 | static int firsttime = 1; | 2093 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2066 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2067 | 2094 | ||
2068 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 2095 | rdtp->dyntick_holdoff = jiffies - 1; |
2069 | hrtp->function = rcu_idle_gp_timer_func; | 2096 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); |
2070 | if (firsttime) { | 2097 | rdtp->idle_gp_timer_expires = jiffies - 1; |
2071 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | 2098 | rdtp->idle_first_pass = 1; |
2072 | |||
2073 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2074 | upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); | ||
2075 | rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2076 | firsttime = 0; | ||
2077 | } | ||
2078 | } | 2099 | } |
2079 | 2100 | ||
2080 | /* | 2101 | /* |
2081 | * Clean up for exit from idle. Because we are exiting from idle, there | 2102 | * Clean up for exit from idle. Because we are exiting from idle, there |
2082 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | 2103 | * is no longer any point to ->idle_gp_timer, so cancel it. This will |
2083 | * do nothing if this timer is not active, so just cancel it unconditionally. | 2104 | * do nothing if this timer is not active, so just cancel it unconditionally. |
2084 | */ | 2105 | */ |
2085 | static void rcu_cleanup_after_idle(int cpu) | 2106 | static void rcu_cleanup_after_idle(int cpu) |
2086 | { | 2107 | { |
2087 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | 2108 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2109 | |||
2110 | del_timer(&rdtp->idle_gp_timer); | ||
2111 | trace_rcu_prep_idle("Cleanup after idle"); | ||
2088 | } | 2112 | } |
2089 | 2113 | ||
2090 | /* | 2114 | /* |
@@ -2102,19 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2102 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2126 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
2103 | * disabled, we do one pass of force_quiescent_state(), then do an | 2127 | * disabled, we do one pass of force_quiescent_state(), then do an |
2104 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2128 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
2105 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2129 | * later. The ->dyntick_drain field controls the sequencing. |
2106 | * | 2130 | * |
2107 | * The caller must have disabled interrupts. | 2131 | * The caller must have disabled interrupts. |
2108 | */ | 2132 | */ |
2109 | static void rcu_prepare_for_idle(int cpu) | 2133 | static void rcu_prepare_for_idle(int cpu) |
2110 | { | 2134 | { |
2135 | struct timer_list *tp; | ||
2136 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
2137 | |||
2138 | /* | ||
2139 | * If this is an idle re-entry, for example, due to use of | ||
2140 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | ||
2141 | * loop, then don't take any state-machine actions, unless the | ||
2142 | * momentary exit from idle queued additional non-lazy callbacks. | ||
2143 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks | ||
2144 | * pending. | ||
2145 | */ | ||
2146 | if (!rdtp->idle_first_pass && | ||
2147 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { | ||
2148 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2149 | tp = &rdtp->idle_gp_timer; | ||
2150 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
2151 | } | ||
2152 | return; | ||
2153 | } | ||
2154 | rdtp->idle_first_pass = 0; | ||
2155 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; | ||
2156 | |||
2111 | /* | 2157 | /* |
2112 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2158 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2113 | * Also reset state to avoid prejudicing later attempts. | 2159 | * Also reset state to avoid prejudicing later attempts. |
2114 | */ | 2160 | */ |
2115 | if (!rcu_cpu_has_callbacks(cpu)) { | 2161 | if (!rcu_cpu_has_callbacks(cpu)) { |
2116 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2162 | rdtp->dyntick_holdoff = jiffies - 1; |
2117 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2163 | rdtp->dyntick_drain = 0; |
2118 | trace_rcu_prep_idle("No callbacks"); | 2164 | trace_rcu_prep_idle("No callbacks"); |
2119 | return; | 2165 | return; |
2120 | } | 2166 | } |
@@ -2123,32 +2169,37 @@ static void rcu_prepare_for_idle(int cpu) | |||
2123 | * If in holdoff mode, just return. We will presumably have | 2169 | * If in holdoff mode, just return. We will presumably have |
2124 | * refrained from disabling the scheduling-clock tick. | 2170 | * refrained from disabling the scheduling-clock tick. |
2125 | */ | 2171 | */ |
2126 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | 2172 | if (rdtp->dyntick_holdoff == jiffies) { |
2127 | trace_rcu_prep_idle("In holdoff"); | 2173 | trace_rcu_prep_idle("In holdoff"); |
2128 | return; | 2174 | return; |
2129 | } | 2175 | } |
2130 | 2176 | ||
2131 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2177 | /* Check and update the ->dyntick_drain sequencing. */ |
2132 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2178 | if (rdtp->dyntick_drain <= 0) { |
2133 | /* First time through, initialize the counter. */ | 2179 | /* First time through, initialize the counter. */ |
2134 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; | 2180 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; |
2135 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | 2181 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && |
2136 | !rcu_pending(cpu) && | 2182 | !rcu_pending(cpu) && |
2137 | !local_softirq_pending()) { | 2183 | !local_softirq_pending()) { |
2138 | /* Can we go dyntick-idle despite still having callbacks? */ | 2184 | /* Can we go dyntick-idle despite still having callbacks? */ |
2139 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2185 | rdtp->dyntick_drain = 0; |
2140 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2186 | rdtp->dyntick_holdoff = jiffies; |
2141 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2187 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
2142 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2188 | trace_rcu_prep_idle("Dyntick with callbacks"); |
2143 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2189 | rdtp->idle_gp_timer_expires = |
2144 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | 2190 | jiffies + RCU_IDLE_GP_DELAY; |
2145 | else | 2191 | } else { |
2146 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2192 | rdtp->idle_gp_timer_expires = |
2147 | rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); | 2193 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2194 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); | ||
2195 | } | ||
2196 | tp = &rdtp->idle_gp_timer; | ||
2197 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
2198 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
2148 | return; /* Nothing more to do immediately. */ | 2199 | return; /* Nothing more to do immediately. */ |
2149 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2200 | } else if (--(rdtp->dyntick_drain) <= 0) { |
2150 | /* We have hit the limit, so time to give up. */ | 2201 | /* We have hit the limit, so time to give up. */ |
2151 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2202 | rdtp->dyntick_holdoff = jiffies; |
2152 | trace_rcu_prep_idle("Begin holdoff"); | 2203 | trace_rcu_prep_idle("Begin holdoff"); |
2153 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | 2204 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ |
2154 | return; | 2205 | return; |
@@ -2184,6 +2235,19 @@ static void rcu_prepare_for_idle(int cpu) | |||
2184 | trace_rcu_prep_idle("Callbacks drained"); | 2235 | trace_rcu_prep_idle("Callbacks drained"); |
2185 | } | 2236 | } |
2186 | 2237 | ||
2238 | /* | ||
2239 | * Keep a running count of the number of non-lazy callbacks posted | ||
2240 | * on this CPU. This running counter (which is never decremented) allows | ||
2241 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
2242 | * posts a callback, even if an equal number of callbacks are invoked. | ||
2243 | * Of course, callbacks should only be posted from within a trace event | ||
2244 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
2245 | */ | ||
2246 | static void rcu_idle_count_callbacks_posted(void) | ||
2247 | { | ||
2248 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); | ||
2249 | } | ||
2250 | |||
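The ->nonlazy_posted / ->nonlazy_posted_snap pair above implements a never-decremented counter plus snapshot; in generic, self-contained form (illustrative names only) the idiom looks like this:

struct post_detector {
	unsigned long posted;		/* Bumped on every post; never decremented. */
	unsigned long posted_snap;	/* Snapshot taken when entering idle. */
};

static void note_post(struct post_detector *pd)
{
	pd->posted++;
}

static void snapshot_on_idle_entry(struct post_detector *pd)
{
	pd->posted_snap = pd->posted;
}

/* True even if every item posted since the snapshot was also consumed. */
static int posted_since_idle_entry(const struct post_detector *pd)
{
	return pd->posted != pd->posted_snap;
}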
2187 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2251 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2188 | 2252 | ||
2189 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 2253 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
@@ -2192,14 +2256,13 @@ static void rcu_prepare_for_idle(int cpu) | |||
2192 | 2256 | ||
2193 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2257 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2194 | { | 2258 | { |
2195 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2259 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2260 | struct timer_list *tltp = &rdtp->idle_gp_timer; | ||
2196 | 2261 | ||
2197 | sprintf(cp, "drain=%d %c timer=%lld", | 2262 | sprintf(cp, "drain=%d %c timer=%lu", |
2198 | per_cpu(rcu_dyntick_drain, cpu), | 2263 | rdtp->dyntick_drain, |
2199 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2264 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', |
2200 | hrtimer_active(hrtp) | 2265 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2201 | ? ktime_to_us(hrtimer_get_remaining(hrtp)) | ||
2202 | : -1); | ||
2203 | } | 2266 | } |
2204 | 2267 | ||
2205 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2268 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff43..d4bc16ddd1d4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
271 | 271 | ||
272 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
275 | rsp->completed, gpnum, rsp->fqs_state, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
276 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
277 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs_lh); | 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
282 | if (rnp->level != level) { | 282 | if (rnp->level != level) { |
283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4d..e8cd2027abbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1235 | struct splice_pipe_desc spd = { | 1235 | struct splice_pipe_desc spd = { |
1236 | .pages = pages, | 1236 | .pages = pages, |
1237 | .nr_pages = 0, | 1237 | .nr_pages = 0, |
1238 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
1238 | .partial = partial, | 1239 | .partial = partial, |
1239 | .flags = flags, | 1240 | .flags = flags, |
1240 | .ops = &relay_pipe_buf_ops, | 1241 | .ops = &relay_pipe_buf_ops, |
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1302 | ret += padding; | 1303 | ret += padding; |
1303 | 1304 | ||
1304 | out: | 1305 | out: |
1305 | splice_shrink_spd(pipe, &spd); | 1306 | splice_shrink_spd(&spd); |
1306 | return ret; | 1307 | return ret; |
1307 | } | 1308 | } |
1308 | 1309 | ||
1309 | static ssize_t relay_file_splice_read(struct file *in, | 1310 | static ssize_t relay_file_splice_read(struct file *in, |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b3..ad581aa2369a 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) | |||
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | 25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val, |
26 | bool force) | ||
26 | { | 27 | { |
28 | int ret = 0; | ||
29 | |||
27 | if (counter->usage + val > counter->limit) { | 30 | if (counter->usage + val > counter->limit) { |
28 | counter->failcnt++; | 31 | counter->failcnt++; |
29 | return -ENOMEM; | 32 | ret = -ENOMEM; |
33 | if (!force) | ||
34 | return ret; | ||
30 | } | 35 | } |
31 | 36 | ||
32 | counter->usage += val; | 37 | counter->usage += val; |
33 | if (counter->usage > counter->max_usage) | 38 | if (counter->usage > counter->max_usage) |
34 | counter->max_usage = counter->usage; | 39 | counter->max_usage = counter->usage; |
35 | return 0; | 40 | return ret; |
36 | } | 41 | } |
37 | 42 | ||
38 | int res_counter_charge(struct res_counter *counter, unsigned long val, | 43 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, |
39 | struct res_counter **limit_fail_at) | 44 | struct res_counter **limit_fail_at, bool force) |
40 | { | 45 | { |
41 | int ret; | 46 | int ret, r; |
42 | unsigned long flags; | 47 | unsigned long flags; |
43 | struct res_counter *c, *u; | 48 | struct res_counter *c, *u; |
44 | 49 | ||
50 | r = ret = 0; | ||
45 | *limit_fail_at = NULL; | 51 | *limit_fail_at = NULL; |
46 | local_irq_save(flags); | 52 | local_irq_save(flags); |
47 | for (c = counter; c != NULL; c = c->parent) { | 53 | for (c = counter; c != NULL; c = c->parent) { |
48 | spin_lock(&c->lock); | 54 | spin_lock(&c->lock); |
49 | ret = res_counter_charge_locked(c, val); | 55 | r = res_counter_charge_locked(c, val, force); |
50 | spin_unlock(&c->lock); | 56 | spin_unlock(&c->lock); |
51 | if (ret < 0) { | 57 | if (r < 0 && !ret) { |
58 | ret = r; | ||
52 | *limit_fail_at = c; | 59 | *limit_fail_at = c; |
53 | goto undo; | 60 | if (!force) |
61 | break; | ||
54 | } | 62 | } |
55 | } | 63 | } |
56 | ret = 0; | 64 | |
57 | goto done; | 65 | if (ret < 0 && !force) { |
58 | undo: | 66 | for (u = counter; u != c; u = u->parent) { |
59 | for (u = counter; u != c; u = u->parent) { | 67 | spin_lock(&u->lock); |
60 | spin_lock(&u->lock); | 68 | res_counter_uncharge_locked(u, val); |
61 | res_counter_uncharge_locked(u, val); | 69 | spin_unlock(&u->lock); |
62 | spin_unlock(&u->lock); | 70 | } |
63 | } | 71 | } |
64 | done: | ||
65 | local_irq_restore(flags); | 72 | local_irq_restore(flags); |
73 | |||
66 | return ret; | 74 | return ret; |
67 | } | 75 | } |
68 | 76 | ||
77 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
78 | struct res_counter **limit_fail_at) | ||
79 | { | ||
80 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
81 | } | ||
82 | |||
69 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | 83 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, |
70 | struct res_counter **limit_fail_at) | 84 | struct res_counter **limit_fail_at) |
71 | { | 85 | { |
72 | int ret, r; | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
73 | unsigned long flags; | ||
74 | struct res_counter *c; | ||
75 | |||
76 | r = ret = 0; | ||
77 | *limit_fail_at = NULL; | ||
78 | local_irq_save(flags); | ||
79 | for (c = counter; c != NULL; c = c->parent) { | ||
80 | spin_lock(&c->lock); | ||
81 | r = res_counter_charge_locked(c, val); | ||
82 | if (r) | ||
83 | c->usage += val; | ||
84 | spin_unlock(&c->lock); | ||
85 | if (r < 0 && ret == 0) { | ||
86 | *limit_fail_at = c; | ||
87 | ret = r; | ||
88 | } | ||
89 | } | ||
90 | local_irq_restore(flags); | ||
91 | |||
92 | return ret; | ||
93 | } | 87 | } |
88 | |||
94 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
95 | { | 90 | { |
96 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
@@ -99,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | |||
99 | counter->usage -= val; | 94 | counter->usage -= val; |
100 | } | 95 | } |
101 | 96 | ||
102 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | 97 | void res_counter_uncharge_until(struct res_counter *counter, |
98 | struct res_counter *top, | ||
99 | unsigned long val) | ||
103 | { | 100 | { |
104 | unsigned long flags; | 101 | unsigned long flags; |
105 | struct res_counter *c; | 102 | struct res_counter *c; |
106 | 103 | ||
107 | local_irq_save(flags); | 104 | local_irq_save(flags); |
108 | for (c = counter; c != NULL; c = c->parent) { | 105 | for (c = counter; c != top; c = c->parent) { |
109 | spin_lock(&c->lock); | 106 | spin_lock(&c->lock); |
110 | res_counter_uncharge_locked(c, val); | 107 | res_counter_uncharge_locked(c, val); |
111 | spin_unlock(&c->lock); | 108 | spin_unlock(&c->lock); |
@@ -113,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) | |||
113 | local_irq_restore(flags); | 110 | local_irq_restore(flags); |
114 | } | 111 | } |
115 | 112 | ||
113 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | ||
114 | { | ||
115 | res_counter_uncharge_until(counter, NULL, val); | ||
116 | } | ||
116 | 117 | ||
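A hedged usage sketch of the reworked charge paths (the caller below is illustrative, not part of this patch): a plain charge either succeeds against every ancestor or is fully rolled back, whereas the _nofail variant always applies the charge and merely reports the first counter that exceeded its limit.

static int account_or_fail(struct res_counter *cnt, unsigned long bytes)
{
	struct res_counter *fail_at;

	if (res_counter_charge(cnt, bytes, &fail_at)) {
		/* -ENOMEM: nothing was left charged anywhere in the hierarchy. */
		pr_debug("res_counter %p over limit\n", fail_at);
		return -ENOMEM;
	}
	/* ... consume the resource; when done, undo the whole chain: */
	res_counter_uncharge(cnt, bytes);
	return 0;
}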
117 | static inline unsigned long long * | 118 | static inline unsigned long long * |
118 | res_counter_member(struct res_counter *counter, int member) | 119 | res_counter_member(struct res_counter *counter, int member) |
diff --git a/kernel/resource.c b/kernel/resource.c index 7e8ea66a8c01..e1d2b8ee76d5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -515,8 +515,8 @@ out: | |||
515 | * @root: root resource descriptor | 515 | * @root: root resource descriptor |
516 | * @new: resource descriptor desired by caller | 516 | * @new: resource descriptor desired by caller |
517 | * @size: requested resource region size | 517 | * @size: requested resource region size |
518 | * @min: minimum size to allocate | 518 | * @min: minimum boundary to allocate |
519 | * @max: maximum size to allocate | 519 | * @max: maximum boundary to allocate |
520 | * @align: alignment requested, in bytes | 520 | * @align: alignment requested, in bytes |
521 | * @alignf: alignment function, optional, called if not NULL | 521 | * @alignf: alignment function, optional, called if not NULL |
522 | * @alignf_data: arbitrary data to pass to the @alignf function | 522 | * @alignf_data: arbitrary data to pass to the @alignf function |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a3..173ea52f3af0 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | |||
20 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533a688ce22..468bdd44c1ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
141 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
142 | #name , | 143 | #name , |
143 | 144 | ||
144 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
145 | #include "features.h" | 146 | #include "features.h" |
146 | NULL | ||
147 | }; | 147 | }; |
148 | 148 | ||
149 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data) | |||
692 | } | 692 | } |
693 | #endif | 693 | #endif |
694 | 694 | ||
695 | void update_cpu_load(struct rq *this_rq); | ||
696 | |||
697 | static void set_load_weight(struct task_struct *p) | 695 | static void set_load_weight(struct task_struct *p) |
698 | { | 696 | { |
699 | int prio = p->static_prio - MAX_RT_PRIO; | 697 | int prio = p->static_prio - MAX_RT_PRIO; |
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
2162 | } | 2160 | } |
2163 | 2161 | ||
2164 | 2162 | ||
2163 | /* | ||
2164 | * Global load-average calculations | ||
2165 | * | ||
2166 | * We take a distributed and async approach to calculating the global load-avg | ||
2167 | * in order to minimize overhead. | ||
2168 | * | ||
2169 | * The global load average is an exponentially decaying average of nr_running + | ||
2170 | * nr_uninterruptible. | ||
2171 | * | ||
2172 | * Once every LOAD_FREQ: | ||
2173 | * | ||
2174 | * nr_active = 0; | ||
2175 | * for_each_possible_cpu(cpu) | ||
2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2177 | * | ||
2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2179 | * | ||
2180 | * Due to a number of reasons the above turns in the mess below: | ||
2181 | * | ||
2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2183 | * a serious number of cpus; therefore we need to take a distributed approach | ||
2184 | * to calculating nr_active. | ||
2185 | * | ||
2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2188 | * | ||
2189 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
2190 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
2191 | * to obtain the same result. See calc_load_fold_active(). | ||
2192 | * | ||
2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2195 | * cpu to have completed this task. | ||
2196 | * | ||
2197 | * This places an upper bound on the IRQ-off latency of the machine. Then | ||
2198 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
2199 | * | ||
2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2205 | * all cpus yields the correct result. | ||
2206 | * | ||
2207 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
2208 | */ | ||
2209 | |||
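
The \Sum identity in the comment above is the whole trick: a cpu only has to remember the contribution it reported last time and push the difference into one global counter, so nothing ever walks all possible cpus. A toy, single-threaded model of calc_load_fold_active() (an array instead of per-cpu data, a plain long instead of atomic_long_t; all names are local to the sketch):

	#include <stdio.h>

	#define NR_CPUS 4

	struct toy_rq {
		long nr_running;
		long nr_uninterruptible;
		long calc_load_active;	/* contribution reported last time */
	};

	static struct toy_rq cpu_rq[NR_CPUS];
	static long calc_load_tasks;	/* global accumulator (atomic in the kernel) */

	/* report only the change since the last fold; repeated folds are harmless */
	static long fold_active(struct toy_rq *rq)
	{
		long nr_active = rq->nr_running + rq->nr_uninterruptible;
		long delta = nr_active - rq->calc_load_active;

		rq->calc_load_active = nr_active;
		return delta;
	}

	int main(void)
	{
		cpu_rq[0].nr_running = 2;
		cpu_rq[1].nr_uninterruptible = 1;

		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			calc_load_tasks += fold_active(&cpu_rq[cpu]);
		printf("after first fold:  %ld\n", calc_load_tasks);	/* 3 */

		cpu_rq[0].nr_running = 1;	/* one task finished on cpu 0 */
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			calc_load_tasks += fold_active(&cpu_rq[cpu]);
		printf("after second fold: %ld\n", calc_load_tasks);	/* 2 */
		return 0;
	}
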
2165 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
2166 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
2167 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
2168 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
2169 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
2215 | |||
2216 | /** | ||
2217 | * get_avenrun - get the load average array | ||
2218 | * @loads: pointer to dest load array | ||
2219 | * @offset: offset to add | ||
2220 | * @shift: shift count to shift the result left | ||
2221 | * | ||
2222 | * These values are estimates at best, so no need for locking. | ||
2223 | */ | ||
2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2225 | { | ||
2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
2229 | } | ||
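
get_avenrun() deliberately hands out raw fixed-point values; it is the reader's job to add the rounding offset and split them into integer and fractional parts. A hedged sketch of roughly how a /proc/loadavg-style consumer formats them (the FSHIFT/FIXED_1 values and the LOAD_INT/LOAD_FRAC helpers mirror include/linux/sched.h of this era; treat the exact numbers as assumptions):

	#include <stdio.h>

	#define FSHIFT   11			/* bits of fixed-point precision */
	#define FIXED_1  (1 << FSHIFT)		/* 1.0 in fixed point */

	#define LOAD_INT(x)  ((x) >> FSHIFT)
	#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	int main(void)
	{
		/* pretend get_avenrun(avnrun, FIXED_1/200, 0) filled this in */
		unsigned long avnrun[3] = {
			860 + FIXED_1 / 200,	/* ~0.42 */
			2048 + FIXED_1 / 200,	/* ~1.00 */
			5120 + FIXED_1 / 200,	/* ~2.50 */
		};

		printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
		       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
		       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
		       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
		return 0;
	}
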
2170 | 2230 | ||
2171 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
2172 | { | 2232 | { |
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
2183 | return delta; | 2243 | return delta; |
2184 | } | 2244 | } |
2185 | 2245 | ||
2246 | /* | ||
2247 | * a1 = a0 * e + a * (1 - e) | ||
2248 | */ | ||
2186 | static unsigned long | 2249 | static unsigned long |
2187 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
2188 | { | 2251 | { |
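
calc_load() evaluates a1 = a0*e + a*(1 - e) entirely in 11-bit fixed point, once per LOAD_FREQ interval and once per avenrun[] slot. A self-contained sketch of that step; the EXP_1/EXP_5/EXP_15 constants are the usual exp(-5s/1min), exp(-5s/5min), exp(-5s/15min) factors scaled by 2048 (values assumed from include/linux/sched.h, they are not visible in this hunk):

	#include <stdio.h>

	#define FSHIFT   11
	#define FIXED_1  (1UL << FSHIFT)
	#define EXP_1    1884	/* 1/exp(5s/1min)  in fixed point */
	#define EXP_5    2014	/* 1/exp(5s/5min)  */
	#define EXP_15   2037	/* 1/exp(5s/15min) */

	/* a1 = a0 * e + a * (1 - e), all in fixed point */
	static unsigned long
	calc_load(unsigned long load, unsigned long exp, unsigned long active)
	{
		load *= exp;
		load += active * (FIXED_1 - exp);
		return load >> FSHIFT;
	}

	int main(void)
	{
		unsigned long avenrun[3] = { 0, 0, 0 };
		unsigned long active = 2 * FIXED_1;	/* 2 runnable tasks, fixed point */

		/* feed ten 5-second samples; the averages creep towards 2.0 */
		for (int i = 0; i < 10; i++) {
			avenrun[0] = calc_load(avenrun[0], EXP_1, active);
			avenrun[1] = calc_load(avenrun[1], EXP_5, active);
			avenrun[2] = calc_load(avenrun[2], EXP_15, active);
		}
		printf("%.2f %.2f %.2f\n",
		       avenrun[0] / (double)FIXED_1,
		       avenrun[1] / (double)FIXED_1,
		       avenrun[2] / (double)FIXED_1);
		return 0;
	}
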
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
2194 | 2257 | ||
2195 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
2196 | /* | 2259 | /* |
2197 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
2261 | * | ||
2262 | * Since the above described distributed algorithm to compute the global | ||
2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2264 | * NO_HZ. | ||
2265 | * | ||
2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2268 | * when we read the global state. | ||
2269 | * | ||
2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2271 | * | ||
2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2273 | * contribution, causing under-accounting. | ||
2274 | * | ||
2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
2277 | * | ||
2278 | * The only trick is the slight shift in index flip for read vs write. | ||
2279 | * | ||
2280 | * 0s 5s 10s 15s | ||
2281 | * +10 +10 +10 +10 | ||
2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
2283 | * r:0 0 1 1 0 0 1 1 0 | ||
2284 | * w:0 1 1 0 0 1 1 0 0 | ||
2285 | * | ||
2286 | * This ensures we'll fold the old idle contribution in this window while | ||
2287 | * accumulating the new one. | ||
2288 | * | ||
2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2290 | * contribution, since we effectively move our sample point to a known | ||
2291 | * busy state. | ||
2292 | * | ||
2293 | * This is solved by pushing the window forward, and thus skipping the | ||
2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2295 | * was in effect at the time the window opened). This also solves the issue | ||
2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2297 | * LOAD_FREQ intervals. | ||
2198 | * | 2298 | * |
2199 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
2200 | */ | 2300 | */ |
2201 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
2302 | static int calc_load_idx; | ||
2202 | 2303 | ||
2203 | void calc_load_account_idle(struct rq *this_rq) | 2304 | static inline int calc_load_write_idx(void) |
2204 | { | 2305 | { |
2306 | int idx = calc_load_idx; | ||
2307 | |||
2308 | /* | ||
2309 | * See calc_global_nohz(), if we observe the new index, we also | ||
2310 | * need to observe the new update time. | ||
2311 | */ | ||
2312 | smp_rmb(); | ||
2313 | |||
2314 | /* | ||
2315 | * If the folding window started, make sure we start writing in the | ||
2316 | * next idle-delta. | ||
2317 | */ | ||
2318 | if (!time_before(jiffies, calc_load_update)) | ||
2319 | idx++; | ||
2320 | |||
2321 | return idx & 1; | ||
2322 | } | ||
2323 | |||
2324 | static inline int calc_load_read_idx(void) | ||
2325 | { | ||
2326 | return calc_load_idx & 1; | ||
2327 | } | ||
2328 | |||
2329 | void calc_load_enter_idle(void) | ||
2330 | { | ||
2331 | struct rq *this_rq = this_rq(); | ||
2205 | long delta; | 2332 | long delta; |
2206 | 2333 | ||
2334 | /* | ||
2335 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
2336 | * into the pending idle delta. | ||
2337 | */ | ||
2207 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
2208 | if (delta) | 2339 | if (delta) { |
2209 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2342 | } | ||
2210 | } | 2343 | } |
2211 | 2344 | ||
2212 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
2213 | { | 2346 | { |
2214 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
2215 | 2348 | ||
2216 | /* | 2349 | /* |
2217 | * Its got a race, we don't care... | 2350 | * If we're still before the sample window, we're done. |
2218 | */ | 2351 | */ |
2219 | if (atomic_long_read(&calc_load_tasks_idle)) | 2352 | if (time_before(jiffies, this_rq->calc_load_update)) |
2220 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2353 | return; |
2354 | |||
2355 | /* | ||
2356 | * We woke inside or after the sample window, this means we're already | ||
2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
2358 | * sync up for the next window. | ||
2359 | */ | ||
2360 | this_rq->calc_load_update = calc_load_update; | ||
2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
2363 | } | ||
2364 | |||
2365 | static long calc_load_fold_idle(void) | ||
2366 | { | ||
2367 | int idx = calc_load_read_idx(); | ||
2368 | long delta = 0; | ||
2369 | |||
2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2221 | 2372 | ||
2222 | return delta; | 2373 | return delta; |
2223 | } | 2374 | } |
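
The calc_load_idle[0..1] pair plus the read/write index shift is what keeps a cpu that goes idle *inside* the 10-tick sample window from contaminating the sample currently being taken: writers that observe an open window spill into the next bucket, while the reader always drains the previous one. A toy single-threaded model of that flip (jiffies and LOAD_FREQ shrunk to small integers; memory barriers omitted; all names are sketch-local):

	#include <stdio.h>

	#define LOAD_FREQ 50			/* toy window length, in "ticks" */

	static long idle_delta[2];		/* models calc_load_idle[]  */
	static int  load_idx;			/* models calc_load_idx     */
	static unsigned long load_update = 50;	/* models calc_load_update  */
	static long tasks;			/* models calc_load_tasks   */

	static int write_idx(unsigned long now)
	{
		int idx = load_idx;

		/* once the fold window opened, new idle deltas go to the next bucket */
		if (now >= load_update)
			idx++;
		return idx & 1;
	}

	static void enter_idle(unsigned long now, long delta)
	{
		idle_delta[write_idx(now)] += delta;
	}

	static void global_sample(unsigned long now)
	{
		int idx = load_idx & 1;		/* the read side uses the old bucket */

		tasks += idle_delta[idx];
		idle_delta[idx] = 0;
		load_update += LOAD_FREQ;
		load_idx++;			/* flip for the next window */
		printf("t=%lu sample sees %ld tasks\n", now, tasks);
	}

	int main(void)
	{
		enter_idle(20, 3);	/* idle well before the window: old bucket */
		enter_idle(51, 2);	/* idle inside the window: next bucket      */
		global_sample(60);	/* folds only the '3', the '2' waits        */
		global_sample(110);	/* now the '2' is included                  */
		return 0;
	}
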
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void) | |||
2303 | { | 2454 | { |
2304 | long delta, active, n; | 2455 | long delta, active, n; |
2305 | 2456 | ||
2306 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
2307 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
2308 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
2309 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
2310 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
2311 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
2312 | delta = calc_load_fold_idle(); | ||
2313 | if (delta) | ||
2314 | atomic_long_add(delta, &calc_load_tasks); | ||
2315 | 2463 | ||
2316 | /* | 2464 | active = atomic_long_read(&calc_load_tasks); |
2317 | * It could be the one fold was all it took, we done! | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
2318 | */ | ||
2319 | if (time_before(jiffies, calc_load_update + 10)) | ||
2320 | return; | ||
2321 | 2466 | ||
2322 | /* | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
2323 | * Catch-up, fold however many we are behind still | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
2324 | */ | 2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
2325 | delta = jiffies - calc_load_update - 10; | ||
2326 | n = 1 + (delta / LOAD_FREQ); | ||
2327 | 2470 | ||
2328 | active = atomic_long_read(&calc_load_tasks); | 2471 | calc_load_update += n * LOAD_FREQ; |
2329 | active = active > 0 ? active * FIXED_1 : 0; | 2472 | } |
2330 | |||
2331 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
2332 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
2333 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2334 | |||
2335 | calc_load_update += n * LOAD_FREQ; | ||
2336 | } | ||
2337 | #else | ||
2338 | void calc_load_account_idle(struct rq *this_rq) | ||
2339 | { | ||
2340 | } | ||
2341 | 2473 | ||
2342 | static inline long calc_load_fold_idle(void) | 2474 | /* |
2343 | { | 2475 | * Flip the idle index... |
2344 | return 0; | 2476 | * |
2477 | * Make sure we first write the new time then flip the index, so that | ||
2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
2479 | * index, this avoids a double flip messing things up. | ||
2480 | */ | ||
2481 | smp_wmb(); | ||
2482 | calc_load_idx++; | ||
2345 | } | 2483 | } |
2484 | #else /* !CONFIG_NO_HZ */ | ||
2346 | 2485 | ||
2347 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
2348 | { | 2487 | static inline void calc_global_nohz(void) { } |
2349 | } | ||
2350 | #endif | ||
2351 | 2488 | ||
2352 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
2353 | * get_avenrun - get the load average array | ||
2354 | * @loads: pointer to dest load array | ||
2355 | * @offset: offset to add | ||
2356 | * @shift: shift count to shift the result left | ||
2357 | * | ||
2358 | * These values are estimates at best, so no need for locking. | ||
2359 | */ | ||
2360 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2361 | { | ||
2362 | loads[0] = (avenrun[0] + offset) << shift; | ||
2363 | loads[1] = (avenrun[1] + offset) << shift; | ||
2364 | loads[2] = (avenrun[2] + offset) << shift; | ||
2365 | } | ||
2366 | 2490 | ||
2367 | /* | 2491 | /* |
2368 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
2370 | */ | 2494 | */ |
2371 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
2372 | { | 2496 | { |
2373 | long active; | 2497 | long active, delta; |
2374 | 2498 | ||
2375 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
2376 | return; | 2500 | return; |
2377 | 2501 | ||
2502 | /* | ||
2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2504 | */ | ||
2505 | delta = calc_load_fold_idle(); | ||
2506 | if (delta) | ||
2507 | atomic_long_add(delta, &calc_load_tasks); | ||
2508 | |||
2378 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
2379 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
2380 | 2511 | ||
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
2385 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
2386 | 2517 | ||
2387 | /* | 2518 | /* |
2388 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
2389 | * folding in the nohz state and ageing the entire idle period. | ||
2390 | * | ||
2391 | * This avoids loosing a sample when we go idle between | ||
2392 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
2393 | * under-accounting. | ||
2394 | */ | 2520 | */ |
2395 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
2396 | } | 2522 | } |
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2407 | return; | 2533 | return; |
2408 | 2534 | ||
2409 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
2410 | delta += calc_load_fold_idle(); | ||
2411 | if (delta) | 2536 | if (delta) |
2412 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
2413 | 2538 | ||
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2415 | } | 2540 | } |
2416 | 2541 | ||
2417 | /* | 2542 | /* |
2543 | * End of global load-average stuff | ||
2544 | */ | ||
2545 | |||
2546 | /* | ||
2418 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
2419 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
2420 | * | 2549 | * |
@@ -2486,22 +2615,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2615 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
2487 | * every tick. We fix it up based on jiffies. | 2616 | * every tick. We fix it up based on jiffies. |
2488 | */ | 2617 | */ |
2489 | void update_cpu_load(struct rq *this_rq) | 2618 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
2619 | unsigned long pending_updates) | ||
2490 | { | 2620 | { |
2491 | unsigned long this_load = this_rq->load.weight; | ||
2492 | unsigned long curr_jiffies = jiffies; | ||
2493 | unsigned long pending_updates; | ||
2494 | int i, scale; | 2621 | int i, scale; |
2495 | 2622 | ||
2496 | this_rq->nr_load_updates++; | 2623 | this_rq->nr_load_updates++; |
2497 | 2624 | ||
2498 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
2499 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2500 | return; | ||
2501 | |||
2502 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2503 | this_rq->last_load_update_tick = curr_jiffies; | ||
2504 | |||
2505 | /* Update our load: */ | 2625 | /* Update our load: */ |
2506 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 2626 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
2507 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2627 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2526,9 +2646,78 @@ void update_cpu_load(struct rq *this_rq) | |||
2526 | sched_avg_update(this_rq); | 2646 | sched_avg_update(this_rq); |
2527 | } | 2647 | } |
2528 | 2648 | ||
2649 | #ifdef CONFIG_NO_HZ | ||
2650 | /* | ||
2651 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2652 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2653 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2654 | * | ||
2655 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2656 | * would seriously skew the load calculation. However we'll make do for those | ||
2657 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2658 | * (tick_nohz_idle_exit). | ||
2659 | * | ||
2660 | * This means we might still be one tick off for nohz periods. | ||
2661 | */ | ||
2662 | |||
2663 | /* | ||
2664 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2665 | * idle balance. | ||
2666 | */ | ||
2667 | void update_idle_cpu_load(struct rq *this_rq) | ||
2668 | { | ||
2669 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2670 | unsigned long load = this_rq->load.weight; | ||
2671 | unsigned long pending_updates; | ||
2672 | |||
2673 | /* | ||
2674 | * bail if there's load or we're actually up-to-date. | ||
2675 | */ | ||
2676 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2677 | return; | ||
2678 | |||
2679 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2680 | this_rq->last_load_update_tick = curr_jiffies; | ||
2681 | |||
2682 | __update_cpu_load(this_rq, load, pending_updates); | ||
2683 | } | ||
2684 | |||
2685 | /* | ||
2686 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2687 | */ | ||
2688 | void update_cpu_load_nohz(void) | ||
2689 | { | ||
2690 | struct rq *this_rq = this_rq(); | ||
2691 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2692 | unsigned long pending_updates; | ||
2693 | |||
2694 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2695 | return; | ||
2696 | |||
2697 | raw_spin_lock(&this_rq->lock); | ||
2698 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2699 | if (pending_updates) { | ||
2700 | this_rq->last_load_update_tick = curr_jiffies; | ||
2701 | /* | ||
2702 | * We were idle, this means load 0, the current load might be | ||
2703 | * !0 due to remote wakeups and the like. | ||
2704 | */ | ||
2705 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2706 | } | ||
2707 | raw_spin_unlock(&this_rq->lock); | ||
2708 | } | ||
2709 | #endif /* CONFIG_NO_HZ */ | ||
2710 | |||
2711 | /* | ||
2712 | * Called from scheduler_tick() | ||
2713 | */ | ||
2529 | static void update_cpu_load_active(struct rq *this_rq) | 2714 | static void update_cpu_load_active(struct rq *this_rq) |
2530 | { | 2715 | { |
2531 | update_cpu_load(this_rq); | 2716 | /* |
2717 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2718 | */ | ||
2719 | this_rq->last_load_update_tick = jiffies; | ||
2720 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2532 | 2721 | ||
2533 | calc_load_account_active(this_rq); | 2722 | calc_load_account_active(this_rq); |
2534 | } | 2723 | } |
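
__update_cpu_load() now receives a pending_updates count so the two NO_HZ paths above can decay cpu_load[] for every tick that passed while the cpu slept, instead of pretending only one tick went by. Following the "load = (2^idx - 1)/2^idx * load + 1/2^idx * cur_load" rule quoted earlier in this file, n missed ticks at zero load simply multiply the old value by ((2^idx - 1)/2^idx)^n; the kernel's decay_load_missed() uses precomputed tables, the sketch below does it the slow, obvious way (a model under those assumptions, not the kernel code):

	#include <stdio.h>

	#define CPU_LOAD_IDX_MAX 5

	/* decay 'load' as if it had seen 'missed' ticks of zero load at index 'idx' */
	static unsigned long
	decay_load_missed_slow(unsigned long load, unsigned long missed, int idx)
	{
		while (missed--)
			load -= load >> idx;	/* ~ load *= (2^idx - 1) / 2^idx */
		return load;
	}

	static void
	update_cpu_load_model(unsigned long cpu_load[], unsigned long this_load,
			      unsigned long pending_updates)
	{
		cpu_load[0] = this_load;	/* idx 0 tracks the current load directly */
		for (int i = 1; i < CPU_LOAD_IDX_MAX; i++) {
			unsigned long old = decay_load_missed_slow(cpu_load[i],
								   pending_updates - 1, i);
			/* one regular step with the current load */
			cpu_load[i] = (old * ((1UL << i) - 1) + this_load) >> i;
		}
	}

	int main(void)
	{
		unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0, 1024, 1024, 1024, 1024 };

		/* cpu slept for 8 ticks, wakes up with a current load of 512 */
		update_cpu_load_model(cpu_load, 512, 8);

		for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)
			printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
		return 0;
	}
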
@@ -3113,6 +3302,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3113 | if (irqs_disabled()) | 3302 | if (irqs_disabled()) |
3114 | print_irqtrace_events(prev); | 3303 | print_irqtrace_events(prev); |
3115 | dump_stack(); | 3304 | dump_stack(); |
3305 | add_taint(TAINT_WARN); | ||
3116 | } | 3306 | } |
3117 | 3307 | ||
3118 | /* | 3308 | /* |
@@ -4042,11 +4232,8 @@ static bool check_same_owner(struct task_struct *p) | |||
4042 | 4232 | ||
4043 | rcu_read_lock(); | 4233 | rcu_read_lock(); |
4044 | pcred = __task_cred(p); | 4234 | pcred = __task_cred(p); |
4045 | if (cred->user->user_ns == pcred->user->user_ns) | 4235 | match = (uid_eq(cred->euid, pcred->euid) || |
4046 | match = (cred->euid == pcred->euid || | 4236 | uid_eq(cred->euid, pcred->uid)); |
4047 | cred->euid == pcred->uid); | ||
4048 | else | ||
4049 | match = false; | ||
4050 | rcu_read_unlock(); | 4237 | rcu_read_unlock(); |
4051 | return match; | 4238 | return match; |
4052 | } | 4239 | } |
@@ -4957,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
4957 | p->sched_class->set_cpus_allowed(p, new_mask); | 5144 | p->sched_class->set_cpus_allowed(p, new_mask); |
4958 | 5145 | ||
4959 | cpumask_copy(&p->cpus_allowed, new_mask); | 5146 | cpumask_copy(&p->cpus_allowed, new_mask); |
4960 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5147 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
4961 | } | 5148 | } |
4962 | 5149 | ||
4963 | /* | 5150 | /* |
@@ -5499,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | |||
5499 | 5686 | ||
5500 | #ifdef CONFIG_SCHED_DEBUG | 5687 | #ifdef CONFIG_SCHED_DEBUG |
5501 | 5688 | ||
5502 | static __read_mostly int sched_domain_debug_enabled; | 5689 | static __read_mostly int sched_debug_enabled; |
5503 | 5690 | ||
5504 | static int __init sched_domain_debug_setup(char *str) | 5691 | static int __init sched_debug_setup(char *str) |
5505 | { | 5692 | { |
5506 | sched_domain_debug_enabled = 1; | 5693 | sched_debug_enabled = 1; |
5507 | 5694 | ||
5508 | return 0; | 5695 | return 0; |
5509 | } | 5696 | } |
5510 | early_param("sched_debug", sched_domain_debug_setup); | 5697 | early_param("sched_debug", sched_debug_setup); |
5698 | |||
5699 | static inline bool sched_debug(void) | ||
5700 | { | ||
5701 | return sched_debug_enabled; | ||
5702 | } | ||
5511 | 5703 | ||
5512 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 5704 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
5513 | struct cpumask *groupmask) | 5705 | struct cpumask *groupmask) |
@@ -5547,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5547 | break; | 5739 | break; |
5548 | } | 5740 | } |
5549 | 5741 | ||
5550 | if (!group->sgp->power) { | 5742 | /* |
5743 | * Even though we initialize ->power to something semi-sane, | ||
5744 | * we leave power_orig unset. This allows us to detect if | ||
5745 | * domain iteration is still funny without causing /0 traps. | ||
5746 | */ | ||
5747 | if (!group->sgp->power_orig) { | ||
5551 | printk(KERN_CONT "\n"); | 5748 | printk(KERN_CONT "\n"); |
5552 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5749 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5553 | "set\n"); | 5750 | "set\n"); |
@@ -5560,7 +5757,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5560 | break; | 5757 | break; |
5561 | } | 5758 | } |
5562 | 5759 | ||
5563 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 5760 | if (!(sd->flags & SD_OVERLAP) && |
5761 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5564 | printk(KERN_CONT "\n"); | 5762 | printk(KERN_CONT "\n"); |
5565 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5763 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5566 | break; | 5764 | break; |
@@ -5594,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5594 | { | 5792 | { |
5595 | int level = 0; | 5793 | int level = 0; |
5596 | 5794 | ||
5597 | if (!sched_domain_debug_enabled) | 5795 | if (!sched_debug_enabled) |
5598 | return; | 5796 | return; |
5599 | 5797 | ||
5600 | if (!sd) { | 5798 | if (!sd) { |
@@ -5615,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5615 | } | 5813 | } |
5616 | #else /* !CONFIG_SCHED_DEBUG */ | 5814 | #else /* !CONFIG_SCHED_DEBUG */ |
5617 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5815 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5816 | static inline bool sched_debug(void) | ||
5817 | { | ||
5818 | return false; | ||
5819 | } | ||
5618 | #endif /* CONFIG_SCHED_DEBUG */ | 5820 | #endif /* CONFIG_SCHED_DEBUG */ |
5619 | 5821 | ||
5620 | static int sd_degenerate(struct sched_domain *sd) | 5822 | static int sd_degenerate(struct sched_domain *sd) |
@@ -5898,99 +6100,11 @@ static int __init isolated_cpu_setup(char *str) | |||
5898 | 6100 | ||
5899 | __setup("isolcpus=", isolated_cpu_setup); | 6101 | __setup("isolcpus=", isolated_cpu_setup); |
5900 | 6102 | ||
5901 | #ifdef CONFIG_NUMA | ||
5902 | |||
5903 | /** | ||
5904 | * find_next_best_node - find the next node to include in a sched_domain | ||
5905 | * @node: node whose sched_domain we're building | ||
5906 | * @used_nodes: nodes already in the sched_domain | ||
5907 | * | ||
5908 | * Find the next node to include in a given scheduling domain. Simply | ||
5909 | * finds the closest node not already in the @used_nodes map. | ||
5910 | * | ||
5911 | * Should use nodemask_t. | ||
5912 | */ | ||
5913 | static int find_next_best_node(int node, nodemask_t *used_nodes) | ||
5914 | { | ||
5915 | int i, n, val, min_val, best_node = -1; | ||
5916 | |||
5917 | min_val = INT_MAX; | ||
5918 | |||
5919 | for (i = 0; i < nr_node_ids; i++) { | ||
5920 | /* Start at @node */ | ||
5921 | n = (node + i) % nr_node_ids; | ||
5922 | |||
5923 | if (!nr_cpus_node(n)) | ||
5924 | continue; | ||
5925 | |||
5926 | /* Skip already used nodes */ | ||
5927 | if (node_isset(n, *used_nodes)) | ||
5928 | continue; | ||
5929 | |||
5930 | /* Simple min distance search */ | ||
5931 | val = node_distance(node, n); | ||
5932 | |||
5933 | if (val < min_val) { | ||
5934 | min_val = val; | ||
5935 | best_node = n; | ||
5936 | } | ||
5937 | } | ||
5938 | |||
5939 | if (best_node != -1) | ||
5940 | node_set(best_node, *used_nodes); | ||
5941 | return best_node; | ||
5942 | } | ||
5943 | |||
5944 | /** | ||
5945 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5946 | * @node: node whose cpumask we're constructing | ||
5947 | * @span: resulting cpumask | ||
5948 | * | ||
5949 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5950 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5951 | * out optimally. | ||
5952 | */ | ||
5953 | static void sched_domain_node_span(int node, struct cpumask *span) | ||
5954 | { | ||
5955 | nodemask_t used_nodes; | ||
5956 | int i; | ||
5957 | |||
5958 | cpumask_clear(span); | ||
5959 | nodes_clear(used_nodes); | ||
5960 | |||
5961 | cpumask_or(span, span, cpumask_of_node(node)); | ||
5962 | node_set(node, used_nodes); | ||
5963 | |||
5964 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5965 | int next_node = find_next_best_node(node, &used_nodes); | ||
5966 | if (next_node < 0) | ||
5967 | break; | ||
5968 | cpumask_or(span, span, cpumask_of_node(next_node)); | ||
5969 | } | ||
5970 | } | ||
5971 | |||
5972 | static const struct cpumask *cpu_node_mask(int cpu) | ||
5973 | { | ||
5974 | lockdep_assert_held(&sched_domains_mutex); | ||
5975 | |||
5976 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
5977 | |||
5978 | return sched_domains_tmpmask; | ||
5979 | } | ||
5980 | |||
5981 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
5982 | { | ||
5983 | return cpu_possible_mask; | ||
5984 | } | ||
5985 | #endif /* CONFIG_NUMA */ | ||
5986 | |||
5987 | static const struct cpumask *cpu_cpu_mask(int cpu) | 6103 | static const struct cpumask *cpu_cpu_mask(int cpu) |
5988 | { | 6104 | { |
5989 | return cpumask_of_node(cpu_to_node(cpu)); | 6105 | return cpumask_of_node(cpu_to_node(cpu)); |
5990 | } | 6106 | } |
5991 | 6107 | ||
5992 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5993 | |||
5994 | struct sd_data { | 6108 | struct sd_data { |
5995 | struct sched_domain **__percpu sd; | 6109 | struct sched_domain **__percpu sd; |
5996 | struct sched_group **__percpu sg; | 6110 | struct sched_group **__percpu sg; |
@@ -6020,9 +6134,48 @@ struct sched_domain_topology_level { | |||
6020 | sched_domain_init_f init; | 6134 | sched_domain_init_f init; |
6021 | sched_domain_mask_f mask; | 6135 | sched_domain_mask_f mask; |
6022 | int flags; | 6136 | int flags; |
6137 | int numa_level; | ||
6023 | struct sd_data data; | 6138 | struct sd_data data; |
6024 | }; | 6139 | }; |
6025 | 6140 | ||
6141 | /* | ||
6142 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6143 | * domain traversal. | ||
6144 | * | ||
6145 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6146 | * unequal depth, make sure to skip domains that already cover the entire | ||
6147 | * range. | ||
6148 | * | ||
6149 | * In that case build_sched_domains() will have terminated the iteration early | ||
6150 | * and our sibling sd spans will be empty. Domains should always include the | ||
6151 | * cpu they're built on, so check that. | ||
6152 | * | ||
6153 | */ | ||
6154 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6155 | { | ||
6156 | const struct cpumask *span = sched_domain_span(sd); | ||
6157 | struct sd_data *sdd = sd->private; | ||
6158 | struct sched_domain *sibling; | ||
6159 | int i; | ||
6160 | |||
6161 | for_each_cpu(i, span) { | ||
6162 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6163 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6164 | continue; | ||
6165 | |||
6166 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6167 | } | ||
6168 | } | ||
6169 | |||
6170 | /* | ||
6171 | * Return the canonical balance cpu for this group, this is the first cpu | ||
6172 | * of this group that's also in the iteration mask. | ||
6173 | */ | ||
6174 | int group_balance_cpu(struct sched_group *sg) | ||
6175 | { | ||
6176 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6177 | } | ||
6178 | |||
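
group_balance_cpu() picks the first cpu that sits in both the group's span and the newly introduced iteration mask, so exactly one cpu per (possibly overlapping) group ends up doing the balancing work. A toy model with cpumasks shrunk to a single 64-bit word (all names invented for the sketch; it leans on the GCC/clang __builtin_ctzl builtin):

	#include <stdio.h>

	/* toy "cpumask": one bit per cpu, cpu 0 is bit 0 */
	struct toy_group {
		unsigned long span;	/* models sched_group_cpus() */
		unsigned long mask;	/* models sched_group_mask() */
	};

	/* first cpu present in both the span and the iteration mask */
	static int toy_group_balance_cpu(const struct toy_group *sg)
	{
		unsigned long both = sg->span & sg->mask;

		return both ? __builtin_ctzl(both) : -1;
	}

	int main(void)
	{
		/* group spans cpus 4-7, but only 6 and 7 survive the iteration mask */
		struct toy_group sg = { .span = 0xf0, .mask = 0xc0 };

		printf("balance cpu = %d\n", toy_group_balance_cpu(&sg));	/* -> 6 */
		return 0;
	}
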
6026 | static int | 6179 | static int |
6027 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 6180 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6028 | { | 6181 | { |
@@ -6041,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6041 | if (cpumask_test_cpu(i, covered)) | 6194 | if (cpumask_test_cpu(i, covered)) |
6042 | continue; | 6195 | continue; |
6043 | 6196 | ||
6197 | child = *per_cpu_ptr(sdd->sd, i); | ||
6198 | |||
6199 | /* See the comment near build_group_mask(). */ | ||
6200 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | ||
6201 | continue; | ||
6202 | |||
6044 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6203 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6045 | GFP_KERNEL, cpu_to_node(cpu)); | 6204 | GFP_KERNEL, cpu_to_node(cpu)); |
6046 | 6205 | ||
@@ -6048,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6048 | goto fail; | 6207 | goto fail; |
6049 | 6208 | ||
6050 | sg_span = sched_group_cpus(sg); | 6209 | sg_span = sched_group_cpus(sg); |
6051 | |||
6052 | child = *per_cpu_ptr(sdd->sd, i); | ||
6053 | if (child->child) { | 6210 | if (child->child) { |
6054 | child = child->child; | 6211 | child = child->child; |
6055 | cpumask_copy(sg_span, sched_domain_span(child)); | 6212 | cpumask_copy(sg_span, sched_domain_span(child)); |
@@ -6058,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6058 | 6215 | ||
6059 | cpumask_or(covered, covered, sg_span); | 6216 | cpumask_or(covered, covered, sg_span); |
6060 | 6217 | ||
6061 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6218 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
6062 | atomic_inc(&sg->sgp->ref); | 6219 | if (atomic_inc_return(&sg->sgp->ref) == 1) |
6220 | build_group_mask(sd, sg); | ||
6063 | 6221 | ||
6064 | if (cpumask_test_cpu(cpu, sg_span)) | 6222 | /* |
6223 | * Initialize sgp->power such that even if we mess up the | ||
6224 | * domains and no possible iteration will get us here, we won't | ||
6225 | * die on a /0 trap. | ||
6226 | */ | ||
6227 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | ||
6228 | |||
6229 | /* | ||
6230 | * Make sure the first group of this domain contains the | ||
6231 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
6232 | * breaks. See update_sg_lb_stats(). | ||
6233 | */ | ||
6234 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6235 | group_balance_cpu(sg) == cpu) | ||
6065 | groups = sg; | 6236 | groups = sg; |
6066 | 6237 | ||
6067 | if (!first) | 6238 | if (!first) |
@@ -6135,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
6135 | 6306 | ||
6136 | cpumask_clear(sched_group_cpus(sg)); | 6307 | cpumask_clear(sched_group_cpus(sg)); |
6137 | sg->sgp->power = 0; | 6308 | sg->sgp->power = 0; |
6309 | cpumask_setall(sched_group_mask(sg)); | ||
6138 | 6310 | ||
6139 | for_each_cpu(j, span) { | 6311 | for_each_cpu(j, span) { |
6140 | if (get_group(j, sdd, NULL) != group) | 6312 | if (get_group(j, sdd, NULL) != group) |
@@ -6176,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6176 | sg = sg->next; | 6348 | sg = sg->next; |
6177 | } while (sg != sd->groups); | 6349 | } while (sg != sd->groups); |
6178 | 6350 | ||
6179 | if (cpu != group_first_cpu(sg)) | 6351 | if (cpu != group_balance_cpu(sg)) |
6180 | return; | 6352 | return; |
6181 | 6353 | ||
6182 | update_group_power(sd, cpu); | 6354 | update_group_power(sd, cpu); |
@@ -6211,10 +6383,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | |||
6211 | } | 6383 | } |
6212 | 6384 | ||
6213 | SD_INIT_FUNC(CPU) | 6385 | SD_INIT_FUNC(CPU) |
6214 | #ifdef CONFIG_NUMA | ||
6215 | SD_INIT_FUNC(ALLNODES) | ||
6216 | SD_INIT_FUNC(NODE) | ||
6217 | #endif | ||
6218 | #ifdef CONFIG_SCHED_SMT | 6386 | #ifdef CONFIG_SCHED_SMT |
6219 | SD_INIT_FUNC(SIBLING) | 6387 | SD_INIT_FUNC(SIBLING) |
6220 | #endif | 6388 | #endif |
@@ -6230,11 +6398,8 @@ int sched_domain_level_max; | |||
6230 | 6398 | ||
6231 | static int __init setup_relax_domain_level(char *str) | 6399 | static int __init setup_relax_domain_level(char *str) |
6232 | { | 6400 | { |
6233 | unsigned long val; | 6401 | if (kstrtoint(str, 0, &default_relax_domain_level)) |
6234 | 6402 | pr_warn("Unable to set relax_domain_level\n"); | |
6235 | val = simple_strtoul(str, NULL, 0); | ||
6236 | if (val < sched_domain_level_max) | ||
6237 | default_relax_domain_level = val; | ||
6238 | 6403 | ||
6239 | return 1; | 6404 | return 1; |
6240 | } | 6405 | } |
@@ -6336,15 +6501,236 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6336 | { sd_init_BOOK, cpu_book_mask, }, | 6501 | { sd_init_BOOK, cpu_book_mask, }, |
6337 | #endif | 6502 | #endif |
6338 | { sd_init_CPU, cpu_cpu_mask, }, | 6503 | { sd_init_CPU, cpu_cpu_mask, }, |
6339 | #ifdef CONFIG_NUMA | ||
6340 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, | ||
6341 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | ||
6342 | #endif | ||
6343 | { NULL, }, | 6504 | { NULL, }, |
6344 | }; | 6505 | }; |
6345 | 6506 | ||
6346 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6507 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6347 | 6508 | ||
6509 | #ifdef CONFIG_NUMA | ||
6510 | |||
6511 | static int sched_domains_numa_levels; | ||
6512 | static int *sched_domains_numa_distance; | ||
6513 | static struct cpumask ***sched_domains_numa_masks; | ||
6514 | static int sched_domains_curr_level; | ||
6515 | |||
6516 | static inline int sd_local_flags(int level) | ||
6517 | { | ||
6518 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) | ||
6519 | return 0; | ||
6520 | |||
6521 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | ||
6522 | } | ||
6523 | |||
6524 | static struct sched_domain * | ||
6525 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | ||
6526 | { | ||
6527 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | ||
6528 | int level = tl->numa_level; | ||
6529 | int sd_weight = cpumask_weight( | ||
6530 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | ||
6531 | |||
6532 | *sd = (struct sched_domain){ | ||
6533 | .min_interval = sd_weight, | ||
6534 | .max_interval = 2*sd_weight, | ||
6535 | .busy_factor = 32, | ||
6536 | .imbalance_pct = 125, | ||
6537 | .cache_nice_tries = 2, | ||
6538 | .busy_idx = 3, | ||
6539 | .idle_idx = 2, | ||
6540 | .newidle_idx = 0, | ||
6541 | .wake_idx = 0, | ||
6542 | .forkexec_idx = 0, | ||
6543 | |||
6544 | .flags = 1*SD_LOAD_BALANCE | ||
6545 | | 1*SD_BALANCE_NEWIDLE | ||
6546 | | 0*SD_BALANCE_EXEC | ||
6547 | | 0*SD_BALANCE_FORK | ||
6548 | | 0*SD_BALANCE_WAKE | ||
6549 | | 0*SD_WAKE_AFFINE | ||
6550 | | 0*SD_PREFER_LOCAL | ||
6551 | | 0*SD_SHARE_CPUPOWER | ||
6552 | | 0*SD_SHARE_PKG_RESOURCES | ||
6553 | | 1*SD_SERIALIZE | ||
6554 | | 0*SD_PREFER_SIBLING | ||
6555 | | sd_local_flags(level) | ||
6556 | , | ||
6557 | .last_balance = jiffies, | ||
6558 | .balance_interval = sd_weight, | ||
6559 | }; | ||
6560 | SD_INIT_NAME(sd, NUMA); | ||
6561 | sd->private = &tl->data; | ||
6562 | |||
6563 | /* | ||
6564 | * Ugly hack to pass state to sd_numa_mask()... | ||
6565 | */ | ||
6566 | sched_domains_curr_level = tl->numa_level; | ||
6567 | |||
6568 | return sd; | ||
6569 | } | ||
6570 | |||
6571 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6572 | { | ||
6573 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6574 | } | ||
6575 | |||
6576 | static void sched_numa_warn(const char *str) | ||
6577 | { | ||
6578 | static int done = false; | ||
6579 | int i,j; | ||
6580 | |||
6581 | if (done) | ||
6582 | return; | ||
6583 | |||
6584 | done = true; | ||
6585 | |||
6586 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6587 | |||
6588 | for (i = 0; i < nr_node_ids; i++) { | ||
6589 | printk(KERN_WARNING " "); | ||
6590 | for (j = 0; j < nr_node_ids; j++) | ||
6591 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6592 | printk(KERN_CONT "\n"); | ||
6593 | } | ||
6594 | printk(KERN_WARNING "\n"); | ||
6595 | } | ||
6596 | |||
6597 | static bool find_numa_distance(int distance) | ||
6598 | { | ||
6599 | int i; | ||
6600 | |||
6601 | if (distance == node_distance(0, 0)) | ||
6602 | return true; | ||
6603 | |||
6604 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6605 | if (sched_domains_numa_distance[i] == distance) | ||
6606 | return true; | ||
6607 | } | ||
6608 | |||
6609 | return false; | ||
6610 | } | ||
6611 | |||
6612 | static void sched_init_numa(void) | ||
6613 | { | ||
6614 | int next_distance, curr_distance = node_distance(0, 0); | ||
6615 | struct sched_domain_topology_level *tl; | ||
6616 | int level = 0; | ||
6617 | int i, j, k; | ||
6618 | |||
6619 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6620 | if (!sched_domains_numa_distance) | ||
6621 | return; | ||
6622 | |||
6623 | /* | ||
6624 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6625 | * unique distances in the node_distance() table. | ||
6626 | * | ||
6627 | * Assumes node_distance(0,j) includes all distances in | ||
6628 | * node_distance(i,j) in order to avoid cubic time. | ||
6629 | */ | ||
6630 | next_distance = curr_distance; | ||
6631 | for (i = 0; i < nr_node_ids; i++) { | ||
6632 | for (j = 0; j < nr_node_ids; j++) { | ||
6633 | for (k = 0; k < nr_node_ids; k++) { | ||
6634 | int distance = node_distance(i, k); | ||
6635 | |||
6636 | if (distance > curr_distance && | ||
6637 | (distance < next_distance || | ||
6638 | next_distance == curr_distance)) | ||
6639 | next_distance = distance; | ||
6640 | |||
6641 | /* | ||
6642 | * While not a strong assumption it would be nice to know | ||
6643 | * about cases where node A is connected to B but B is not | ||
6644 | * equally connected to A. | ||
6645 | */ | ||
6646 | if (sched_debug() && node_distance(k, i) != distance) | ||
6647 | sched_numa_warn("Node-distance not symmetric"); | ||
6648 | |||
6649 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6650 | sched_numa_warn("Node-0 not representative"); | ||
6651 | } | ||
6652 | if (next_distance != curr_distance) { | ||
6653 | sched_domains_numa_distance[level++] = next_distance; | ||
6654 | sched_domains_numa_levels = level; | ||
6655 | curr_distance = next_distance; | ||
6656 | } else break; | ||
6657 | } | ||
6658 | |||
6659 | /* | ||
6660 | * In case of sched_debug() we verify the above assumption. | ||
6661 | */ | ||
6662 | if (!sched_debug()) | ||
6663 | break; | ||
6664 | } | ||
6665 | /* | ||
6666 | * 'level' contains the number of unique distances, excluding the | ||
6667 | * identity distance node_distance(i,i). | ||
6668 | * | ||
6669 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6670 | * numbers. | ||
6671 | */ | ||
6672 | |||
6673 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6674 | if (!sched_domains_numa_masks) | ||
6675 | return; | ||
6676 | |||
6677 | /* | ||
6678 | * Now for each level, construct a mask per node which contains all | ||
6679 | * cpus of nodes that are that many hops away from us. | ||
6680 | */ | ||
6681 | for (i = 0; i < level; i++) { | ||
6682 | sched_domains_numa_masks[i] = | ||
6683 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6684 | if (!sched_domains_numa_masks[i]) | ||
6685 | return; | ||
6686 | |||
6687 | for (j = 0; j < nr_node_ids; j++) { | ||
6688 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6689 | if (!mask) | ||
6690 | return; | ||
6691 | |||
6692 | sched_domains_numa_masks[i][j] = mask; | ||
6693 | |||
6694 | for (k = 0; k < nr_node_ids; k++) { | ||
6695 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6696 | continue; | ||
6697 | |||
6698 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6699 | } | ||
6700 | } | ||
6701 | } | ||
6702 | |||
6703 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | ||
6704 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6705 | if (!tl) | ||
6706 | return; | ||
6707 | |||
6708 | /* | ||
6709 | * Copy the default topology bits.. | ||
6710 | */ | ||
6711 | for (i = 0; default_topology[i].init; i++) | ||
6712 | tl[i] = default_topology[i]; | ||
6713 | |||
6714 | /* | ||
6715 | * .. and append 'j' levels of NUMA goodness. | ||
6716 | */ | ||
6717 | for (j = 0; j < level; i++, j++) { | ||
6718 | tl[i] = (struct sched_domain_topology_level){ | ||
6719 | .init = sd_numa_init, | ||
6720 | .mask = sd_numa_mask, | ||
6721 | .flags = SDTL_OVERLAP, | ||
6722 | .numa_level = j, | ||
6723 | }; | ||
6724 | } | ||
6725 | |||
6726 | sched_domain_topology = tl; | ||
6727 | } | ||
6728 | #else | ||
6729 | static inline void sched_init_numa(void) | ||
6730 | { | ||
6731 | } | ||
6732 | #endif /* CONFIG_NUMA */ | ||
6733 | |||
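
sched_init_numa() works in two passes: first it pulls the unique distances out of the node_distance() table (these become the NUMA topology levels), then for every level it gives each node a mask of all nodes no further away than that level's distance. The sketch below replays both passes on a hand-written 4-node distance table; the table and all helpers are invented for illustration, the kernel gets this data from node_distance() and cpumask_of_node():

	#include <stdio.h>

	#define NR_NODES 4

	/* made-up SLIT-style table: 10 = local, 20 = one hop, 30 = two hops */
	static const int dist[NR_NODES][NR_NODES] = {
		{ 10, 20, 20, 30 },
		{ 20, 10, 30, 20 },
		{ 20, 30, 10, 20 },
		{ 30, 20, 20, 10 },
	};

	int main(void)
	{
		int levels[NR_NODES * NR_NODES];
		int nr_levels = 0;
		int curr = dist[0][0], next;

		/* pass 1: pick out the unique remote distances, smallest first */
		for (;;) {
			next = curr;
			for (int i = 0; i < NR_NODES; i++)
				for (int k = 0; k < NR_NODES; k++)
					if (dist[i][k] > curr &&
					    (dist[i][k] < next || next == curr))
						next = dist[i][k];
			if (next == curr)
				break;
			levels[nr_levels++] = next;
			curr = next;
		}

		/* pass 2: per level, each node gets the set of nodes within reach */
		for (int l = 0; l < nr_levels; l++) {
			printf("level %d (distance <= %d):\n", l, levels[l]);
			for (int j = 0; j < NR_NODES; j++) {
				printf("  node %d: {", j);
				for (int k = 0; k < NR_NODES; k++)
					if (dist[j][k] <= levels[l])
						printf(" %d", k);
				printf(" }\n");
			}
		}
		return 0;
	}
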
6348 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6734 | static int __sdt_alloc(const struct cpumask *cpu_map) |
6349 | { | 6735 | { |
6350 | struct sched_domain_topology_level *tl; | 6736 | struct sched_domain_topology_level *tl; |
@@ -6382,9 +6768,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6382 | if (!sg) | 6768 | if (!sg) |
6383 | return -ENOMEM; | 6769 | return -ENOMEM; |
6384 | 6770 | ||
6771 | sg->next = sg; | ||
6772 | |||
6385 | *per_cpu_ptr(sdd->sg, j) = sg; | 6773 | *per_cpu_ptr(sdd->sg, j) = sg; |
6386 | 6774 | ||
6387 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6775 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), |
6388 | GFP_KERNEL, cpu_to_node(j)); | 6776 | GFP_KERNEL, cpu_to_node(j)); |
6389 | if (!sgp) | 6777 | if (!sgp) |
6390 | return -ENOMEM; | 6778 | return -ENOMEM; |
@@ -6437,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6437 | if (!sd) | 6825 | if (!sd) |
6438 | return child; | 6826 | return child; |
6439 | 6827 | ||
6440 | set_domain_attribute(sd, attr); | ||
6441 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 6828 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
6442 | if (child) { | 6829 | if (child) { |
6443 | sd->level = child->level + 1; | 6830 | sd->level = child->level + 1; |
@@ -6445,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6445 | child->parent = sd; | 6832 | child->parent = sd; |
6446 | } | 6833 | } |
6447 | sd->child = child; | 6834 | sd->child = child; |
6835 | set_domain_attribute(sd, attr); | ||
6448 | 6836 | ||
6449 | return sd; | 6837 | return sd; |
6450 | } | 6838 | } |
@@ -6585,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
6585 | if (!doms_cur) | 6973 | if (!doms_cur) |
6586 | doms_cur = &fallback_doms; | 6974 | doms_cur = &fallback_doms; |
6587 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6975 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
6588 | dattr_cur = NULL; | ||
6589 | err = build_sched_domains(doms_cur[0], NULL); | 6976 | err = build_sched_domains(doms_cur[0], NULL); |
6590 | register_sched_domain_sysctl(); | 6977 | register_sched_domain_sysctl(); |
6591 | 6978 | ||
@@ -6710,97 +7097,6 @@ match2: | |||
6710 | mutex_unlock(&sched_domains_mutex); | 7097 | mutex_unlock(&sched_domains_mutex); |
6711 | } | 7098 | } |
6712 | 7099 | ||
6713 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6714 | static void reinit_sched_domains(void) | ||
6715 | { | ||
6716 | get_online_cpus(); | ||
6717 | |||
6718 | /* Destroy domains first to force the rebuild */ | ||
6719 | partition_sched_domains(0, NULL, NULL); | ||
6720 | |||
6721 | rebuild_sched_domains(); | ||
6722 | put_online_cpus(); | ||
6723 | } | ||
6724 | |||
6725 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6726 | { | ||
6727 | unsigned int level = 0; | ||
6728 | |||
6729 | if (sscanf(buf, "%u", &level) != 1) | ||
6730 | return -EINVAL; | ||
6731 | |||
6732 | /* | ||
6733 | * level is always be positive so don't check for | ||
6734 | * level < POWERSAVINGS_BALANCE_NONE which is 0 | ||
6735 | * What happens on 0 or 1 byte write, | ||
6736 | * need to check for count as well? | ||
6737 | */ | ||
6738 | |||
6739 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | ||
6740 | return -EINVAL; | ||
6741 | |||
6742 | if (smt) | ||
6743 | sched_smt_power_savings = level; | ||
6744 | else | ||
6745 | sched_mc_power_savings = level; | ||
6746 | |||
6747 | reinit_sched_domains(); | ||
6748 | |||
6749 | return count; | ||
6750 | } | ||
6751 | |||
6752 | #ifdef CONFIG_SCHED_MC | ||
6753 | static ssize_t sched_mc_power_savings_show(struct device *dev, | ||
6754 | struct device_attribute *attr, | ||
6755 | char *buf) | ||
6756 | { | ||
6757 | return sprintf(buf, "%u\n", sched_mc_power_savings); | ||
6758 | } | ||
6759 | static ssize_t sched_mc_power_savings_store(struct device *dev, | ||
6760 | struct device_attribute *attr, | ||
6761 | const char *buf, size_t count) | ||
6762 | { | ||
6763 | return sched_power_savings_store(buf, count, 0); | ||
6764 | } | ||
6765 | static DEVICE_ATTR(sched_mc_power_savings, 0644, | ||
6766 | sched_mc_power_savings_show, | ||
6767 | sched_mc_power_savings_store); | ||
6768 | #endif | ||
6769 | |||
6770 | #ifdef CONFIG_SCHED_SMT | ||
6771 | static ssize_t sched_smt_power_savings_show(struct device *dev, | ||
6772 | struct device_attribute *attr, | ||
6773 | char *buf) | ||
6774 | { | ||
6775 | return sprintf(buf, "%u\n", sched_smt_power_savings); | ||
6776 | } | ||
6777 | static ssize_t sched_smt_power_savings_store(struct device *dev, | ||
6778 | struct device_attribute *attr, | ||
6779 | const char *buf, size_t count) | ||
6780 | { | ||
6781 | return sched_power_savings_store(buf, count, 1); | ||
6782 | } | ||
6783 | static DEVICE_ATTR(sched_smt_power_savings, 0644, | ||
6784 | sched_smt_power_savings_show, | ||
6785 | sched_smt_power_savings_store); | ||
6786 | #endif | ||
6787 | |||
6788 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) | ||
6789 | { | ||
6790 | int err = 0; | ||
6791 | |||
6792 | #ifdef CONFIG_SCHED_SMT | ||
6793 | if (smt_capable()) | ||
6794 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); | ||
6795 | #endif | ||
6796 | #ifdef CONFIG_SCHED_MC | ||
6797 | if (!err && mc_capable()) | ||
6798 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); | ||
6799 | #endif | ||
6800 | return err; | ||
6801 | } | ||
6802 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
6803 | |||
6804 | /* | 7100 | /* |
6805 | * Update cpusets according to cpu_active mask. If cpusets are | 7101 | * Update cpusets according to cpu_active mask. If cpusets are |
6806 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 7102 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
@@ -6838,6 +7134,8 @@ void __init sched_init_smp(void) | |||
6838 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7134 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6839 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7135 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6840 | 7136 | ||
7137 | sched_init_numa(); | ||
7138 | |||
6841 | get_online_cpus(); | 7139 | get_online_cpus(); |
6842 | mutex_lock(&sched_domains_mutex); | 7140 | mutex_lock(&sched_domains_mutex); |
6843 | init_sched_domains(cpu_active_mask); | 7141 | init_sched_domains(cpu_active_mask); |
@@ -7059,6 +7357,7 @@ void __init sched_init(void) | |||
7059 | /* May be allocated at isolcpus cmdline parse time */ | 7357 | /* May be allocated at isolcpus cmdline parse time */ |
7060 | if (cpu_isolated_map == NULL) | 7358 | if (cpu_isolated_map == NULL) |
7061 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7359 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7360 | idle_thread_set_boot_cpu(); | ||
7062 | #endif | 7361 | #endif |
7063 | init_sched_fair_class(); | 7362 | init_sched_fair_class(); |
7064 | 7363 | ||
@@ -7980,13 +8279,9 @@ static struct cftype cpu_files[] = { | |||
7980 | .write_u64 = cpu_rt_period_write_uint, | 8279 | .write_u64 = cpu_rt_period_write_uint, |
7981 | }, | 8280 | }, |
7982 | #endif | 8281 | #endif |
8282 | { } /* terminate */ | ||
7983 | }; | 8283 | }; |
7984 | 8284 | ||
7985 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7986 | { | ||
7987 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7988 | } | ||
7989 | |||
7990 | struct cgroup_subsys cpu_cgroup_subsys = { | 8285 | struct cgroup_subsys cpu_cgroup_subsys = { |
7991 | .name = "cpu", | 8286 | .name = "cpu", |
7992 | .create = cpu_cgroup_create, | 8287 | .create = cpu_cgroup_create, |
@@ -7994,8 +8289,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7994 | .can_attach = cpu_cgroup_can_attach, | 8289 | .can_attach = cpu_cgroup_can_attach, |
7995 | .attach = cpu_cgroup_attach, | 8290 | .attach = cpu_cgroup_attach, |
7996 | .exit = cpu_cgroup_exit, | 8291 | .exit = cpu_cgroup_exit, |
7997 | .populate = cpu_cgroup_populate, | ||
7998 | .subsys_id = cpu_cgroup_subsys_id, | 8292 | .subsys_id = cpu_cgroup_subsys_id, |
8293 | .base_cftypes = cpu_files, | ||
7999 | .early_init = 1, | 8294 | .early_init = 1, |
8000 | }; | 8295 | }; |
8001 | 8296 | ||
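
The switch from a .populate callback to .base_cftypes works because the cftype array is now terminated by an empty entry rather than sized with ARRAY_SIZE() at a call site the cgroup core never sees. A tiny sketch of that sentinel-terminated-array idiom (the struct and names here are stand-ins, not the real cgroup API):

	#include <stdio.h>

	/* stand-in for struct cftype: a NULL .name marks the end of the array */
	struct toy_cftype {
		const char *name;
	};

	static const struct toy_cftype cpu_files[] = {
		{ .name = "shares" },
		{ .name = "cfs_quota_us" },
		{ .name = "rt_runtime_us" },
		{ NULL }	/* terminate; the kernel spells this "{ }" */
	};

	/* walk until the sentinel, no ARRAY_SIZE() needed at the call site */
	static void register_files(const struct toy_cftype *cft)
	{
		for (; cft->name; cft++)
			printf("registering %s\n", cft->name);
	}

	int main(void)
	{
		register_files(cpu_files);
		return 0;
	}
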
@@ -8180,13 +8475,9 @@ static struct cftype files[] = { | |||
8180 | .name = "stat", | 8475 | .name = "stat", |
8181 | .read_map = cpuacct_stats_show, | 8476 | .read_map = cpuacct_stats_show, |
8182 | }, | 8477 | }, |
8478 | { } /* terminate */ | ||
8183 | }; | 8479 | }; |
8184 | 8480 | ||
8185 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8186 | { | ||
8187 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8188 | } | ||
8189 | |||
8190 | /* | 8481 | /* |
8191 | * charge this task's execution time to its accounting group. | 8482 | * charge this task's execution time to its accounting group. |
8192 | * | 8483 | * |
@@ -8218,7 +8509,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8218 | .name = "cpuacct", | 8509 | .name = "cpuacct", |
8219 | .create = cpuacct_create, | 8510 | .create = cpuacct_create, |
8220 | .destroy = cpuacct_destroy, | 8511 | .destroy = cpuacct_destroy, |
8221 | .populate = cpuacct_populate, | ||
8222 | .subsys_id = cpuacct_subsys_id, | 8512 | .subsys_id = cpuacct_subsys_id, |
8513 | .base_cftypes = files, | ||
8223 | }; | 8514 | }; |
8224 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8515 | #endif /* CONFIG_CGROUP_CPUACCT */ |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161d..6f79596e0ea9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | SPLIT_NS(spread0)); | 202 | SPLIT_NS(spread0)); |
203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
204 | cfs_rq->nr_spread_over); | 204 | cfs_rq->nr_spread_over); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 207 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
260 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 260 | SEQ_printf(m, "\ncpu#%d\n", cpu); |
261 | #endif | 261 | #endif |
262 | 262 | ||
263 | #define P(x) \ | 263 | #define P(x) \ |
264 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 264 | do { \ |
265 | if (sizeof(rq->x) == 4) \ | ||
266 | SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ | ||
267 | else \ | ||
268 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ | ||
269 | } while (0) | ||
270 | |||
265 | #define PN(x) \ | 271 | #define PN(x) \ |
266 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 272 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
267 | 273 | ||
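
The reworked P() macro in debug.c picks its printf conversion from sizeof the field, so 32-bit members such as the now-unsigned-int nr_running are no longer pushed through a 64-bit format. A standalone sketch of the same sizeof dispatch (struct and field names made up):

	#include <stdio.h>

	struct toy_rq {
		unsigned int nr_running;	/* 4 bytes */
		long long    clock;		/* 8 bytes */
	};

	#define P(rq, x)							\
	do {									\
		if (sizeof((rq)->x) == 4)					\
			printf("  .%-30s: %ld\n", #x, (long)(rq)->x);		\
		else								\
			printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);	\
	} while (0)

	int main(void)
	{
		struct toy_rq rq = { .nr_running = 3, .clock = 123456789LL };

		P(&rq, nr_running);
		P(&rq, clock);
		return 0;
	}
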
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9553640c1c3..c099cc6eebe3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2703 | int want_sd = 1; | 2703 | int want_sd = 1; |
2704 | int sync = wake_flags & WF_SYNC; | 2704 | int sync = wake_flags & WF_SYNC; |
2705 | 2705 | ||
2706 | if (p->rt.nr_cpus_allowed == 1) | 2706 | if (p->nr_cpus_allowed == 1) |
2707 | return prev_cpu; | 2707 | return prev_cpu; |
2708 | 2708 | ||
2709 | if (sd_flag & SD_BALANCE_WAKE) { | 2709 | if (sd_flag & SD_BALANCE_WAKE) { |
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2721 | * If power savings logic is enabled for a domain, see if we | 2721 | * If power savings logic is enabled for a domain, see if we |
2722 | * are not overloaded, if so, don't balance wider. | 2722 | * are not overloaded, if so, don't balance wider. |
2723 | */ | 2723 | */ |
2724 | if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { | 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { |
2725 | unsigned long power = 0; | 2725 | unsigned long power = 0; |
2726 | unsigned long nr_running = 0; | 2726 | unsigned long nr_running = 0; |
2727 | unsigned long capacity; | 2727 | unsigned long capacity; |
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2734 | 2734 | ||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
2736 | 2736 | ||
2737 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2738 | nr_running /= 2; | ||
2739 | |||
2740 | if (nr_running < capacity) | 2737 | if (nr_running < capacity) |
2741 | want_sd = 0; | 2738 | want_sd = 0; |
2742 | } | 2739 | } |
@@ -3082,7 +3079,7 @@ struct lb_env { | |||
3082 | struct rq *dst_rq; | 3079 | struct rq *dst_rq; |
3083 | 3080 | ||
3084 | enum cpu_idle_type idle; | 3081 | enum cpu_idle_type idle; |
3085 | long load_move; | 3082 | long imbalance; |
3086 | unsigned int flags; | 3083 | unsigned int flags; |
3087 | 3084 | ||
3088 | unsigned int loop; | 3085 | unsigned int loop; |
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p); | |||
3218 | static const unsigned int sched_nr_migrate_break = 32; | 3215 | static const unsigned int sched_nr_migrate_break = 32; |
3219 | 3216 | ||
3220 | /* | 3217 | /* |
3221 | * move_tasks tries to move up to load_move weighted load from busiest to | 3218 | * move_tasks tries to move up to imbalance weighted load from busiest to |
3222 | * this_rq, as part of a balancing operation within domain "sd". | 3219 | * this_rq, as part of a balancing operation within domain "sd". |
3223 | * Returns 1 if successful and 0 otherwise. | 3220 | * Returns 1 if successful and 0 otherwise. |
3224 | * | 3221 | * |
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env) | |||
3231 | unsigned long load; | 3228 | unsigned long load; |
3232 | int pulled = 0; | 3229 | int pulled = 0; |
3233 | 3230 | ||
3234 | if (env->load_move <= 0) | 3231 | if (env->imbalance <= 0) |
3235 | return 0; | 3232 | return 0; |
3236 | 3233 | ||
3237 | while (!list_empty(tasks)) { | 3234 | while (!list_empty(tasks)) { |
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env) | |||
3257 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) | 3254 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) |
3258 | goto next; | 3255 | goto next; |
3259 | 3256 | ||
3260 | if ((load / 2) > env->load_move) | 3257 | if ((load / 2) > env->imbalance) |
3261 | goto next; | 3258 | goto next; |
3262 | 3259 | ||
3263 | if (!can_migrate_task(p, env)) | 3260 | if (!can_migrate_task(p, env)) |
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env) | |||
3265 | 3262 | ||
3266 | move_task(p, env); | 3263 | move_task(p, env); |
3267 | pulled++; | 3264 | pulled++; |
3268 | env->load_move -= load; | 3265 | env->imbalance -= load; |
3269 | 3266 | ||
3270 | #ifdef CONFIG_PREEMPT | 3267 | #ifdef CONFIG_PREEMPT |
3271 | /* | 3268 | /* |
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env) | |||
3281 | * We only want to steal up to the prescribed amount of | 3278 | * We only want to steal up to the prescribed amount of |
3282 | * weighted load. | 3279 | * weighted load. |
3283 | */ | 3280 | */ |
3284 | if (env->load_move <= 0) | 3281 | if (env->imbalance <= 0) |
3285 | break; | 3282 | break; |
3286 | 3283 | ||
3287 | continue; | 3284 | continue; |
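The fair.c hunks above rename struct lb_env's load_move field to imbalance, which move_tasks() treats as a budget of weighted load: each pulled task subtracts its load, oversized tasks are skipped, and the loop stops once the budget is spent. A hedged userspace toy of that control flow, with invented task weights and starting budget standing in for the real env->tasks list, can_migrate_task() and move_task():

#include <stdio.h>

/* Toy model of the imbalance budget: each "task" carries a weighted
 * load; tasks are pulled until the requested imbalance is covered,
 * skipping any task whose half-load already exceeds what is left
 * (the load / 2 > env->imbalance test above). */
struct toy_env {
	long imbalance;			/* weighted load still to move */
};

int main(void)
{
	long loads[] = { 300, 900, 200, 150 };	/* invented task weights */
	struct toy_env env = { .imbalance = 600 };
	int pulled = 0;

	for (unsigned int i = 0; i < sizeof(loads) / sizeof(loads[0]); i++) {
		if (env.imbalance <= 0)		/* budget exhausted, stop */
			break;
		if (loads[i] / 2 > env.imbalance)
			continue;		/* too big a bite for what is left */
		pulled++;
		env.imbalance -= loads[i];	/* spend part of the budget */
	}
	printf("pulled %d tasks, residual imbalance %ld\n",
	       pulled, env.imbalance);		/* pulled 3, residual -50 */
	return 0;
}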
@@ -3435,14 +3432,6 @@ struct sd_lb_stats { | |||
3435 | unsigned int busiest_group_weight; | 3432 | unsigned int busiest_group_weight; |
3436 | 3433 | ||
3437 | int group_imb; /* Is there imbalance in this sd */ | 3434 | int group_imb; /* Is there imbalance in this sd */ |
3438 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3439 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3440 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3441 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3442 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3443 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3444 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3445 | #endif | ||
3446 | }; | 3435 | }; |
3447 | 3436 | ||
3448 | /* | 3437 | /* |
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
3486 | return load_idx; | 3475 | return load_idx; |
3487 | } | 3476 | } |
3488 | 3477 | ||
3489 | |||
3490 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3491 | /** | ||
3492 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3493 | * the given sched_domain, during load balancing. | ||
3494 | * | ||
3495 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3496 | * @sds: Variable containing the statistics for sd. | ||
3497 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3498 | */ | ||
3499 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3500 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3501 | { | ||
3502 | /* | ||
3503 | * Busy processors will not participate in power savings | ||
3504 | * balance. | ||
3505 | */ | ||
3506 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3507 | sds->power_savings_balance = 0; | ||
3508 | else { | ||
3509 | sds->power_savings_balance = 1; | ||
3510 | sds->min_nr_running = ULONG_MAX; | ||
3511 | sds->leader_nr_running = 0; | ||
3512 | } | ||
3513 | } | ||
3514 | |||
3515 | /** | ||
3516 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3517 | * sched_domain while performing load balancing. | ||
3518 | * | ||
3519 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3520 | * @sds: Variable containing the statistics of the sched_domain | ||
3521 | * @local_group: Does group contain the CPU for which we're performing | ||
3522 | * load balancing ? | ||
3523 | * @sgs: Variable containing the statistics of the group. | ||
3524 | */ | ||
3525 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3526 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3527 | { | ||
3528 | |||
3529 | if (!sds->power_savings_balance) | ||
3530 | return; | ||
3531 | |||
3532 | /* | ||
3533 | * If the local group is idle or completely loaded | ||
3534 | * no need to do power savings balance at this domain | ||
3535 | */ | ||
3536 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3537 | !sds->this_nr_running)) | ||
3538 | sds->power_savings_balance = 0; | ||
3539 | |||
3540 | /* | ||
3541 | * If a group is already running at full capacity or idle, | ||
3542 | * don't include that group in power savings calculations | ||
3543 | */ | ||
3544 | if (!sds->power_savings_balance || | ||
3545 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3546 | !sgs->sum_nr_running) | ||
3547 | return; | ||
3548 | |||
3549 | /* | ||
3550 | * Calculate the group which has the least non-idle load. | ||
3551 | * This is the group from where we need to pick up the load | ||
3552 | * for saving power | ||
3553 | */ | ||
3554 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3555 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3556 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3557 | sds->group_min = group; | ||
3558 | sds->min_nr_running = sgs->sum_nr_running; | ||
3559 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3560 | sgs->sum_nr_running; | ||
3561 | } | ||
3562 | |||
3563 | /* | ||
3564 | * Calculate the group which is almost near its | ||
3565 | * capacity but still has some space to pick up some load | ||
3566 | * from other group and save more power | ||
3567 | */ | ||
3568 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3569 | return; | ||
3570 | |||
3571 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3572 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3573 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3574 | sds->group_leader = group; | ||
3575 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3576 | } | ||
3577 | } | ||
3578 | |||
3579 | /** | ||
3580 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3581 | * @sds: Variable containing the statistics of the sched_domain | ||
3582 | * under consideration. | ||
3583 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3584 | * @imbalance: Variable to store the imbalance. | ||
3585 | * | ||
3586 | * Description: | ||
3587 | * Check if we have potential to perform some power-savings balance. | ||
3588 | * If yes, set the busiest group to be the least loaded group in the | ||
3589 | * sched_domain, so that it's CPUs can be put to idle. | ||
3590 | * | ||
3591 | * Returns 1 if there is potential to perform power-savings balance. | ||
3592 | * Else returns 0. | ||
3593 | */ | ||
3594 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3595 | int this_cpu, unsigned long *imbalance) | ||
3596 | { | ||
3597 | if (!sds->power_savings_balance) | ||
3598 | return 0; | ||
3599 | |||
3600 | if (sds->this != sds->group_leader || | ||
3601 | sds->group_leader == sds->group_min) | ||
3602 | return 0; | ||
3603 | |||
3604 | *imbalance = sds->min_load_per_task; | ||
3605 | sds->busiest = sds->group_min; | ||
3606 | |||
3607 | return 1; | ||
3608 | |||
3609 | } | ||
3610 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3611 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3612 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3613 | { | ||
3614 | return; | ||
3615 | } | ||
3616 | |||
3617 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3618 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3619 | { | ||
3620 | return; | ||
3621 | } | ||
3622 | |||
3623 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3624 | int this_cpu, unsigned long *imbalance) | ||
3625 | { | ||
3626 | return 0; | ||
3627 | } | ||
3628 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3629 | |||
3630 | |||
3631 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 3478 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
3632 | { | 3479 | { |
3633 | return SCHED_POWER_SCALE; | 3480 | return SCHED_POWER_SCALE; |
@@ -3656,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
3656 | unsigned long scale_rt_power(int cpu) | 3503 | unsigned long scale_rt_power(int cpu) |
3657 | { | 3504 | { |
3658 | struct rq *rq = cpu_rq(cpu); | 3505 | struct rq *rq = cpu_rq(cpu); |
3659 | u64 total, available; | 3506 | u64 total, available, age_stamp, avg; |
3507 | |||
3508 | /* | ||
3509 | * Since we're reading these variables without serialization make sure | ||
3510 | * we read them once before doing sanity checks on them. | ||
3511 | */ | ||
3512 | age_stamp = ACCESS_ONCE(rq->age_stamp); | ||
3513 | avg = ACCESS_ONCE(rq->rt_avg); | ||
3660 | 3514 | ||
3661 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 3515 | total = sched_avg_period() + (rq->clock - age_stamp); |
3662 | 3516 | ||
3663 | if (unlikely(total < rq->rt_avg)) { | 3517 | if (unlikely(total < avg)) { |
3664 | /* Ensures that power won't end up being negative */ | 3518 | /* Ensures that power won't end up being negative */ |
3665 | available = 0; | 3519 | available = 0; |
3666 | } else { | 3520 | } else { |
3667 | available = total - rq->rt_avg; | 3521 | available = total - avg; |
3668 | } | 3522 | } |
3669 | 3523 | ||
3670 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 3524 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
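The new comment in scale_rt_power() above explains the ACCESS_ONCE() copies: rq->age_stamp and rq->rt_avg can be updated by other CPUs without serialization, so each must be read exactly once before the "total < avg" sanity check and the subtraction, otherwise the two uses could observe different values. A hedged sketch of the pattern, defining ACCESS_ONCE locally the way <linux/compiler.h> of this era does (later kernels spell it READ_ONCE()); shared_total and shared_avg are stand-ins for the rq fields:

#include <stdio.h>

/* Hedged sketch of the read-once pattern; shared_total and shared_avg
 * stand in for the age_stamp-derived total and rq->rt_avg, which other
 * CPUs may update concurrently in the real code. */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static unsigned long long shared_total;
static unsigned long long shared_avg;

static unsigned long long available_sketch(void)
{
	/* Read each shared variable exactly once ... */
	unsigned long long total = ACCESS_ONCE(shared_total);
	unsigned long long avg   = ACCESS_ONCE(shared_avg);

	/* ... so the check and the subtraction see the same snapshot. */
	if (total < avg)
		return 0;
	return total - avg;
}

int main(void)
{
	shared_total = 1000;
	shared_avg = 250;
	printf("available = %llu\n", available_sketch());	/* 750 */
	return 0;
}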
@@ -3727,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
3727 | 3581 | ||
3728 | power = 0; | 3582 | power = 0; |
3729 | 3583 | ||
3730 | group = child->groups; | 3584 | if (child->flags & SD_OVERLAP) { |
3731 | do { | 3585 | /* |
3732 | power += group->sgp->power; | 3586 | * SD_OVERLAP domains cannot assume that child groups |
3733 | group = group->next; | 3587 | * span the current group. |
3734 | } while (group != child->groups); | 3588 | */ |
3735 | 3589 | ||
3736 | sdg->sgp->power = power; | 3590 | for_each_cpu(cpu, sched_group_cpus(sdg)) |
3591 | power += power_of(cpu); | ||
3592 | } else { | ||
3593 | /* | ||
3594 | * !SD_OVERLAP domains can assume that child groups | ||
3595 | * span the current group. | ||
3596 | */ | ||
3597 | |||
3598 | group = child->groups; | ||
3599 | do { | ||
3600 | power += group->sgp->power; | ||
3601 | group = group->next; | ||
3602 | } while (group != child->groups); | ||
3603 | } | ||
3604 | |||
3605 | sdg->sgp->power_orig = sdg->sgp->power = power; | ||
3737 | } | 3606 | } |
3738 | 3607 | ||
3739 | /* | 3608 | /* |
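The update_group_power() hunk above introduces two summation paths: SD_OVERLAP domains add up power_of(cpu) over the group's own CPUs, because overlapping child groups may not tile the parent, while ordinary domains keep walking the child domain's circular list of groups. A hedged userspace toy of that ring walk, with invented group power values:

#include <stdio.h>

/* Toy circular list of child groups, mirroring the !SD_OVERLAP branch
 * above: groups are linked in a ring and the walk stops when it comes
 * back to the first one.  Power values are invented. */
struct toy_group {
	unsigned long power;
	struct toy_group *next;
};

int main(void)
{
	struct toy_group c = { .power = 512 };
	struct toy_group b = { .power = 1024, .next = &c };
	struct toy_group a = { .power = 1024, .next = &b };
	struct toy_group *group = &a;
	unsigned long power = 0;

	c.next = &a;			/* close the ring */

	do {				/* same do/while shape as the kernel walk */
		power += group->power;
		group = group->next;
	} while (group != &a);

	printf("parent group power = %lu\n", power);	/* 2560 */
	return 0;
}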
@@ -3763,41 +3632,43 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3763 | 3632 | ||
3764 | /** | 3633 | /** |
3765 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3634 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3766 | * @sd: The sched_domain whose statistics are to be updated. | 3635 | * @env: The load balancing environment. |
3767 | * @group: sched_group whose statistics are to be updated. | 3636 | * @group: sched_group whose statistics are to be updated. |
3768 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3769 | * @idle: Idle status of this_cpu | ||
3770 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3637 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3771 | * @local_group: Does group contain this_cpu. | 3638 | * @local_group: Does group contain this_cpu. |
3772 | * @cpus: Set of cpus considered for load balancing. | 3639 | * @cpus: Set of cpus considered for load balancing. |
3773 | * @balance: Should we balance. | 3640 | * @balance: Should we balance. |
3774 | * @sgs: variable to hold the statistics for this group. | 3641 | * @sgs: variable to hold the statistics for this group. |
3775 | */ | 3642 | */ |
3776 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 3643 | static inline void update_sg_lb_stats(struct lb_env *env, |
3777 | struct sched_group *group, int this_cpu, | 3644 | struct sched_group *group, int load_idx, |
3778 | enum cpu_idle_type idle, int load_idx, | ||
3779 | int local_group, const struct cpumask *cpus, | 3645 | int local_group, const struct cpumask *cpus, |
3780 | int *balance, struct sg_lb_stats *sgs) | 3646 | int *balance, struct sg_lb_stats *sgs) |
3781 | { | 3647 | { |
3782 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; | 3648 | unsigned long nr_running, max_nr_running, min_nr_running; |
3783 | int i; | 3649 | unsigned long load, max_cpu_load, min_cpu_load; |
3784 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3650 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
3785 | unsigned long avg_load_per_task = 0; | 3651 | unsigned long avg_load_per_task = 0; |
3652 | int i; | ||
3786 | 3653 | ||
3787 | if (local_group) | 3654 | if (local_group) |
3788 | balance_cpu = group_first_cpu(group); | 3655 | balance_cpu = group_balance_cpu(group); |
3789 | 3656 | ||
3790 | /* Tally up the load of all CPUs in the group */ | 3657 | /* Tally up the load of all CPUs in the group */ |
3791 | max_cpu_load = 0; | 3658 | max_cpu_load = 0; |
3792 | min_cpu_load = ~0UL; | 3659 | min_cpu_load = ~0UL; |
3793 | max_nr_running = 0; | 3660 | max_nr_running = 0; |
3661 | min_nr_running = ~0UL; | ||
3794 | 3662 | ||
3795 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3663 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
3796 | struct rq *rq = cpu_rq(i); | 3664 | struct rq *rq = cpu_rq(i); |
3797 | 3665 | ||
3666 | nr_running = rq->nr_running; | ||
3667 | |||
3798 | /* Bias balancing toward cpus of our domain */ | 3668 | /* Bias balancing toward cpus of our domain */ |
3799 | if (local_group) { | 3669 | if (local_group) { |
3800 | if (idle_cpu(i) && !first_idle_cpu) { | 3670 | if (idle_cpu(i) && !first_idle_cpu && |
3671 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
3801 | first_idle_cpu = 1; | 3672 | first_idle_cpu = 1; |
3802 | balance_cpu = i; | 3673 | balance_cpu = i; |
3803 | } | 3674 | } |
@@ -3805,16 +3676,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3805 | load = target_load(i, load_idx); | 3676 | load = target_load(i, load_idx); |
3806 | } else { | 3677 | } else { |
3807 | load = source_load(i, load_idx); | 3678 | load = source_load(i, load_idx); |
3808 | if (load > max_cpu_load) { | 3679 | if (load > max_cpu_load) |
3809 | max_cpu_load = load; | 3680 | max_cpu_load = load; |
3810 | max_nr_running = rq->nr_running; | ||
3811 | } | ||
3812 | if (min_cpu_load > load) | 3681 | if (min_cpu_load > load) |
3813 | min_cpu_load = load; | 3682 | min_cpu_load = load; |
3683 | |||
3684 | if (nr_running > max_nr_running) | ||
3685 | max_nr_running = nr_running; | ||
3686 | if (min_nr_running > nr_running) | ||
3687 | min_nr_running = nr_running; | ||
3814 | } | 3688 | } |
3815 | 3689 | ||
3816 | sgs->group_load += load; | 3690 | sgs->group_load += load; |
3817 | sgs->sum_nr_running += rq->nr_running; | 3691 | sgs->sum_nr_running += nr_running; |
3818 | sgs->sum_weighted_load += weighted_cpuload(i); | 3692 | sgs->sum_weighted_load += weighted_cpuload(i); |
3819 | if (idle_cpu(i)) | 3693 | if (idle_cpu(i)) |
3820 | sgs->idle_cpus++; | 3694 | sgs->idle_cpus++; |
@@ -3827,14 +3701,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3827 | * to do the newly idle load balance. | 3701 | * to do the newly idle load balance. |
3828 | */ | 3702 | */ |
3829 | if (local_group) { | 3703 | if (local_group) { |
3830 | if (idle != CPU_NEWLY_IDLE) { | 3704 | if (env->idle != CPU_NEWLY_IDLE) { |
3831 | if (balance_cpu != this_cpu) { | 3705 | if (balance_cpu != env->dst_cpu) { |
3832 | *balance = 0; | 3706 | *balance = 0; |
3833 | return; | 3707 | return; |
3834 | } | 3708 | } |
3835 | update_group_power(sd, this_cpu); | 3709 | update_group_power(env->sd, env->dst_cpu); |
3836 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | 3710 | } else if (time_after_eq(jiffies, group->sgp->next_update)) |
3837 | update_group_power(sd, this_cpu); | 3711 | update_group_power(env->sd, env->dst_cpu); |
3838 | } | 3712 | } |
3839 | 3713 | ||
3840 | /* Adjust by relative CPU power of the group */ | 3714 | /* Adjust by relative CPU power of the group */ |
@@ -3852,13 +3726,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3852 | if (sgs->sum_nr_running) | 3726 | if (sgs->sum_nr_running) |
3853 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 3727 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
3854 | 3728 | ||
3855 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 3729 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && |
3730 | (max_nr_running - min_nr_running) > 1) | ||
3856 | sgs->group_imb = 1; | 3731 | sgs->group_imb = 1; |
3857 | 3732 | ||
3858 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | 3733 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
3859 | SCHED_POWER_SCALE); | 3734 | SCHED_POWER_SCALE); |
3860 | if (!sgs->group_capacity) | 3735 | if (!sgs->group_capacity) |
3861 | sgs->group_capacity = fix_small_capacity(sd, group); | 3736 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
3862 | sgs->group_weight = group->group_weight; | 3737 | sgs->group_weight = group->group_weight; |
3863 | 3738 | ||
3864 | if (sgs->group_capacity > sgs->sum_nr_running) | 3739 | if (sgs->group_capacity > sgs->sum_nr_running) |
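The group_imb test above now requires both a load spread of at least one average task and a spread of more than one in per-CPU task counts, where the old code only looked at max_nr_running > 1 on the most loaded CPU. A worked example with invented numbers (two CPUs in the group, one running three tasks of weight 1024 and the other a single such task):

#include <stdio.h>

/* Worked example of the group_imb condition above, with made-up
 * per-CPU loads and task counts. */
int main(void)
{
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long max_nr_running = 3, min_nr_running = 1;
	unsigned long sum_weighted_load = 4096, sum_nr_running = 4;
	unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;
	int group_imb;

	/* Spread in load is at least one average task AND the spread in
	 * task counts is more than one -> flag the group as imbalanced. */
	group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
		    (max_nr_running - min_nr_running) > 1;

	printf("avg_load_per_task=%lu group_imb=%d\n",
	       avg_load_per_task, group_imb);	/* 1024, 1 */
	return 0;
}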
@@ -3867,20 +3742,18 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3867 | 3742 | ||
3868 | /** | 3743 | /** |
3869 | * update_sd_pick_busiest - return 1 on busiest group | 3744 | * update_sd_pick_busiest - return 1 on busiest group |
3870 | * @sd: sched_domain whose statistics are to be checked | 3745 | * @env: The load balancing environment. |
3871 | * @sds: sched_domain statistics | 3746 | * @sds: sched_domain statistics |
3872 | * @sg: sched_group candidate to be checked for being the busiest | 3747 | * @sg: sched_group candidate to be checked for being the busiest |
3873 | * @sgs: sched_group statistics | 3748 | * @sgs: sched_group statistics |
3874 | * @this_cpu: the current cpu | ||
3875 | * | 3749 | * |
3876 | * Determine if @sg is a busier group than the previously selected | 3750 | * Determine if @sg is a busier group than the previously selected |
3877 | * busiest group. | 3751 | * busiest group. |
3878 | */ | 3752 | */ |
3879 | static bool update_sd_pick_busiest(struct sched_domain *sd, | 3753 | static bool update_sd_pick_busiest(struct lb_env *env, |
3880 | struct sd_lb_stats *sds, | 3754 | struct sd_lb_stats *sds, |
3881 | struct sched_group *sg, | 3755 | struct sched_group *sg, |
3882 | struct sg_lb_stats *sgs, | 3756 | struct sg_lb_stats *sgs) |
3883 | int this_cpu) | ||
3884 | { | 3757 | { |
3885 | if (sgs->avg_load <= sds->max_load) | 3758 | if (sgs->avg_load <= sds->max_load) |
3886 | return false; | 3759 | return false; |
@@ -3896,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3896 | * numbered CPUs in the group, therefore mark all groups | 3769 | * numbered CPUs in the group, therefore mark all groups |
3897 | * higher than ourself as busy. | 3770 | * higher than ourself as busy. |
3898 | */ | 3771 | */ |
3899 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 3772 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && |
3900 | this_cpu < group_first_cpu(sg)) { | 3773 | env->dst_cpu < group_first_cpu(sg)) { |
3901 | if (!sds->busiest) | 3774 | if (!sds->busiest) |
3902 | return true; | 3775 | return true; |
3903 | 3776 | ||
@@ -3910,35 +3783,32 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3910 | 3783 | ||
3911 | /** | 3784 | /** |
3912 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3785 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
3913 | * @sd: sched_domain whose statistics are to be updated. | 3786 | * @env: The load balancing environment. |
3914 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3915 | * @idle: Idle status of this_cpu | ||
3916 | * @cpus: Set of cpus considered for load balancing. | 3787 | * @cpus: Set of cpus considered for load balancing. |
3917 | * @balance: Should we balance. | 3788 | * @balance: Should we balance. |
3918 | * @sds: variable to hold the statistics for this sched_domain. | 3789 | * @sds: variable to hold the statistics for this sched_domain. |
3919 | */ | 3790 | */ |
3920 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 3791 | static inline void update_sd_lb_stats(struct lb_env *env, |
3921 | enum cpu_idle_type idle, const struct cpumask *cpus, | 3792 | const struct cpumask *cpus, |
3922 | int *balance, struct sd_lb_stats *sds) | 3793 | int *balance, struct sd_lb_stats *sds) |
3923 | { | 3794 | { |
3924 | struct sched_domain *child = sd->child; | 3795 | struct sched_domain *child = env->sd->child; |
3925 | struct sched_group *sg = sd->groups; | 3796 | struct sched_group *sg = env->sd->groups; |
3926 | struct sg_lb_stats sgs; | 3797 | struct sg_lb_stats sgs; |
3927 | int load_idx, prefer_sibling = 0; | 3798 | int load_idx, prefer_sibling = 0; |
3928 | 3799 | ||
3929 | if (child && child->flags & SD_PREFER_SIBLING) | 3800 | if (child && child->flags & SD_PREFER_SIBLING) |
3930 | prefer_sibling = 1; | 3801 | prefer_sibling = 1; |
3931 | 3802 | ||
3932 | init_sd_power_savings_stats(sd, sds, idle); | 3803 | load_idx = get_sd_load_idx(env->sd, env->idle); |
3933 | load_idx = get_sd_load_idx(sd, idle); | ||
3934 | 3804 | ||
3935 | do { | 3805 | do { |
3936 | int local_group; | 3806 | int local_group; |
3937 | 3807 | ||
3938 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 3808 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3939 | memset(&sgs, 0, sizeof(sgs)); | 3809 | memset(&sgs, 0, sizeof(sgs)); |
3940 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, | 3810 | update_sg_lb_stats(env, sg, load_idx, local_group, |
3941 | local_group, cpus, balance, &sgs); | 3811 | cpus, balance, &sgs); |
3942 | 3812 | ||
3943 | if (local_group && !(*balance)) | 3813 | if (local_group && !(*balance)) |
3944 | return; | 3814 | return; |
@@ -3966,7 +3836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3966 | sds->this_load_per_task = sgs.sum_weighted_load; | 3836 | sds->this_load_per_task = sgs.sum_weighted_load; |
3967 | sds->this_has_capacity = sgs.group_has_capacity; | 3837 | sds->this_has_capacity = sgs.group_has_capacity; |
3968 | sds->this_idle_cpus = sgs.idle_cpus; | 3838 | sds->this_idle_cpus = sgs.idle_cpus; |
3969 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 3839 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { |
3970 | sds->max_load = sgs.avg_load; | 3840 | sds->max_load = sgs.avg_load; |
3971 | sds->busiest = sg; | 3841 | sds->busiest = sg; |
3972 | sds->busiest_nr_running = sgs.sum_nr_running; | 3842 | sds->busiest_nr_running = sgs.sum_nr_running; |
@@ -3978,9 +3848,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3978 | sds->group_imb = sgs.group_imb; | 3848 | sds->group_imb = sgs.group_imb; |
3979 | } | 3849 | } |
3980 | 3850 | ||
3981 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); | ||
3982 | sg = sg->next; | 3851 | sg = sg->next; |
3983 | } while (sg != sd->groups); | 3852 | } while (sg != env->sd->groups); |
3984 | } | 3853 | } |
3985 | 3854 | ||
3986 | /** | 3855 | /** |
@@ -4003,29 +3872,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
4003 | * Returns 1 when packing is required and a task should be moved to | 3872 | * Returns 1 when packing is required and a task should be moved to |
4004 | * this CPU. The amount of the imbalance is returned in *imbalance. | 3873 | * this CPU. The amount of the imbalance is returned in *imbalance. |
4005 | * | 3874 | * |
4006 | * @sd: The sched_domain whose packing is to be checked. | 3875 | * @env: The load balancing environment. |
4007 | * @sds: Statistics of the sched_domain which is to be packed | 3876 | * @sds: Statistics of the sched_domain which is to be packed |
4008 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
4009 | * @imbalance: returns amount of imbalanced due to packing. | ||
4010 | */ | 3877 | */ |
4011 | static int check_asym_packing(struct sched_domain *sd, | 3878 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
4012 | struct sd_lb_stats *sds, | ||
4013 | int this_cpu, unsigned long *imbalance) | ||
4014 | { | 3879 | { |
4015 | int busiest_cpu; | 3880 | int busiest_cpu; |
4016 | 3881 | ||
4017 | if (!(sd->flags & SD_ASYM_PACKING)) | 3882 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
4018 | return 0; | 3883 | return 0; |
4019 | 3884 | ||
4020 | if (!sds->busiest) | 3885 | if (!sds->busiest) |
4021 | return 0; | 3886 | return 0; |
4022 | 3887 | ||
4023 | busiest_cpu = group_first_cpu(sds->busiest); | 3888 | busiest_cpu = group_first_cpu(sds->busiest); |
4024 | if (this_cpu > busiest_cpu) | 3889 | if (env->dst_cpu > busiest_cpu) |
4025 | return 0; | 3890 | return 0; |
4026 | 3891 | ||
4027 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, | 3892 | env->imbalance = DIV_ROUND_CLOSEST( |
4028 | SCHED_POWER_SCALE); | 3893 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); |
3894 | |||
4029 | return 1; | 3895 | return 1; |
4030 | } | 3896 | } |
4031 | 3897 | ||
@@ -4033,12 +3899,11 @@ static int check_asym_packing(struct sched_domain *sd, | |||
4033 | * fix_small_imbalance - Calculate the minor imbalance that exists | 3899 | * fix_small_imbalance - Calculate the minor imbalance that exists |
4034 | * amongst the groups of a sched_domain, during | 3900 | * amongst the groups of a sched_domain, during |
4035 | * load balancing. | 3901 | * load balancing. |
3902 | * @env: The load balancing environment. | ||
4036 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | 3903 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. |
4037 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
4038 | * @imbalance: Variable to store the imbalance. | ||
4039 | */ | 3904 | */ |
4040 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | 3905 | static inline |
4041 | int this_cpu, unsigned long *imbalance) | 3906 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4042 | { | 3907 | { |
4043 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 3908 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4044 | unsigned int imbn = 2; | 3909 | unsigned int imbn = 2; |
@@ -4049,9 +3914,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4049 | if (sds->busiest_load_per_task > | 3914 | if (sds->busiest_load_per_task > |
4050 | sds->this_load_per_task) | 3915 | sds->this_load_per_task) |
4051 | imbn = 1; | 3916 | imbn = 1; |
4052 | } else | 3917 | } else { |
4053 | sds->this_load_per_task = | 3918 | sds->this_load_per_task = |
4054 | cpu_avg_load_per_task(this_cpu); | 3919 | cpu_avg_load_per_task(env->dst_cpu); |
3920 | } | ||
4055 | 3921 | ||
4056 | scaled_busy_load_per_task = sds->busiest_load_per_task | 3922 | scaled_busy_load_per_task = sds->busiest_load_per_task |
4057 | * SCHED_POWER_SCALE; | 3923 | * SCHED_POWER_SCALE; |
@@ -4059,7 +3925,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4059 | 3925 | ||
4060 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3926 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
4061 | (scaled_busy_load_per_task * imbn)) { | 3927 | (scaled_busy_load_per_task * imbn)) { |
4062 | *imbalance = sds->busiest_load_per_task; | 3928 | env->imbalance = sds->busiest_load_per_task; |
4063 | return; | 3929 | return; |
4064 | } | 3930 | } |
4065 | 3931 | ||
@@ -4096,18 +3962,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4096 | 3962 | ||
4097 | /* Move if we gain throughput */ | 3963 | /* Move if we gain throughput */ |
4098 | if (pwr_move > pwr_now) | 3964 | if (pwr_move > pwr_now) |
4099 | *imbalance = sds->busiest_load_per_task; | 3965 | env->imbalance = sds->busiest_load_per_task; |
4100 | } | 3966 | } |
4101 | 3967 | ||
4102 | /** | 3968 | /** |
4103 | * calculate_imbalance - Calculate the amount of imbalance present within the | 3969 | * calculate_imbalance - Calculate the amount of imbalance present within the |
4104 | * groups of a given sched_domain during load balance. | 3970 | * groups of a given sched_domain during load balance. |
3971 | * @env: load balance environment | ||
4105 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | 3972 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. |
4106 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
4107 | * @imbalance: The variable to store the imbalance. | ||
4108 | */ | 3973 | */ |
4109 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | 3974 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4110 | unsigned long *imbalance) | ||
4111 | { | 3975 | { |
4112 | unsigned long max_pull, load_above_capacity = ~0UL; | 3976 | unsigned long max_pull, load_above_capacity = ~0UL; |
4113 | 3977 | ||
@@ -4123,8 +3987,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4123 | * its cpu_power, while calculating max_load..) | 3987 | * its cpu_power, while calculating max_load..) |
4124 | */ | 3988 | */ |
4125 | if (sds->max_load < sds->avg_load) { | 3989 | if (sds->max_load < sds->avg_load) { |
4126 | *imbalance = 0; | 3990 | env->imbalance = 0; |
4127 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3991 | return fix_small_imbalance(env, sds); |
4128 | } | 3992 | } |
4129 | 3993 | ||
4130 | if (!sds->group_imb) { | 3994 | if (!sds->group_imb) { |
@@ -4152,7 +4016,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4152 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4016 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
4153 | 4017 | ||
4154 | /* How much load to actually move to equalise the imbalance */ | 4018 | /* How much load to actually move to equalise the imbalance */ |
4155 | *imbalance = min(max_pull * sds->busiest->sgp->power, | 4019 | env->imbalance = min(max_pull * sds->busiest->sgp->power, |
4156 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4020 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
4157 | / SCHED_POWER_SCALE; | 4021 | / SCHED_POWER_SCALE; |
4158 | 4022 | ||
@@ -4162,8 +4026,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4162 | * a think about bumping its value to force at least one task to be | 4026 | * a think about bumping its value to force at least one task to be |
4163 | * moved | 4027 | * moved |
4164 | */ | 4028 | */ |
4165 | if (*imbalance < sds->busiest_load_per_task) | 4029 | if (env->imbalance < sds->busiest_load_per_task) |
4166 | return fix_small_imbalance(sds, this_cpu, imbalance); | 4030 | return fix_small_imbalance(env, sds); |
4167 | 4031 | ||
4168 | } | 4032 | } |
4169 | 4033 | ||
@@ -4179,11 +4043,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4179 | * Also calculates the amount of weighted load which should be moved | 4043 | * Also calculates the amount of weighted load which should be moved |
4180 | * to restore balance. | 4044 | * to restore balance. |
4181 | * | 4045 | * |
4182 | * @sd: The sched_domain whose busiest group is to be returned. | 4046 | * @env: The load balancing environment. |
4183 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
4184 | * @imbalance: Variable which stores amount of weighted load which should | ||
4185 | * be moved to restore balance/put a group to idle. | ||
4186 | * @idle: The idle status of this_cpu. | ||
4187 | * @cpus: The set of CPUs under consideration for load-balancing. | 4047 | * @cpus: The set of CPUs under consideration for load-balancing. |
4188 | * @balance: Pointer to a variable indicating if this_cpu | 4048 | * @balance: Pointer to a variable indicating if this_cpu |
4189 | * is the appropriate cpu to perform load balancing at this_level. | 4049 | * is the appropriate cpu to perform load balancing at this_level. |
@@ -4194,9 +4054,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4194 | * put to idle by rebalancing its tasks onto our group. | 4054 | * put to idle by rebalancing its tasks onto our group. |
4195 | */ | 4055 | */ |
4196 | static struct sched_group * | 4056 | static struct sched_group * |
4197 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 4057 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) |
4198 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4199 | const struct cpumask *cpus, int *balance) | ||
4200 | { | 4058 | { |
4201 | struct sd_lb_stats sds; | 4059 | struct sd_lb_stats sds; |
4202 | 4060 | ||
@@ -4206,7 +4064,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4206 | * Compute the various statistics relevant for load balancing at | 4064 | * Compute the various statistics relevant for load balancing at |

4207 | * this level. | 4065 | * this level. |
4208 | */ | 4066 | */ |
4209 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); | 4067 | update_sd_lb_stats(env, cpus, balance, &sds); |
4210 | 4068 | ||
4211 | /* | 4069 | /* |
4212 | * this_cpu is not the appropriate cpu to perform load balancing at | 4070 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4215,8 +4073,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4215 | if (!(*balance)) | 4073 | if (!(*balance)) |
4216 | goto ret; | 4074 | goto ret; |
4217 | 4075 | ||
4218 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | 4076 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4219 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 4077 | check_asym_packing(env, &sds)) |
4220 | return sds.busiest; | 4078 | return sds.busiest; |
4221 | 4079 | ||
4222 | /* There is no busy sibling group to pull tasks from */ | 4080 | /* There is no busy sibling group to pull tasks from */ |
@@ -4234,7 +4092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4234 | goto force_balance; | 4092 | goto force_balance; |
4235 | 4093 | ||
4236 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4094 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4237 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4095 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
4238 | !sds.busiest_has_capacity) | 4096 | !sds.busiest_has_capacity) |
4239 | goto force_balance; | 4097 | goto force_balance; |
4240 | 4098 | ||
@@ -4252,7 +4110,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4252 | if (sds.this_load >= sds.avg_load) | 4110 | if (sds.this_load >= sds.avg_load) |
4253 | goto out_balanced; | 4111 | goto out_balanced; |
4254 | 4112 | ||
4255 | if (idle == CPU_IDLE) { | 4113 | if (env->idle == CPU_IDLE) { |
4256 | /* | 4114 | /* |
4257 | * This cpu is idle. If the busiest group load doesn't | 4115 | * This cpu is idle. If the busiest group load doesn't |
4258 | * have more tasks than the number of available cpu's and | 4116 | * have more tasks than the number of available cpu's and |
@@ -4267,34 +4125,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4267 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 4125 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4268 | * imbalance_pct to be conservative. | 4126 | * imbalance_pct to be conservative. |
4269 | */ | 4127 | */ |
4270 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 4128 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) |
4271 | goto out_balanced; | 4129 | goto out_balanced; |
4272 | } | 4130 | } |
4273 | 4131 | ||
4274 | force_balance: | 4132 | force_balance: |
4275 | /* Looks like there is an imbalance. Compute it */ | 4133 | /* Looks like there is an imbalance. Compute it */ |
4276 | calculate_imbalance(&sds, this_cpu, imbalance); | 4134 | calculate_imbalance(env, &sds); |
4277 | return sds.busiest; | 4135 | return sds.busiest; |
4278 | 4136 | ||
4279 | out_balanced: | 4137 | out_balanced: |
4280 | /* | ||
4281 | * There is no obvious imbalance. But check if we can do some balancing | ||
4282 | * to save power. | ||
4283 | */ | ||
4284 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4285 | return sds.busiest; | ||
4286 | ret: | 4138 | ret: |
4287 | *imbalance = 0; | 4139 | env->imbalance = 0; |
4288 | return NULL; | 4140 | return NULL; |
4289 | } | 4141 | } |
4290 | 4142 | ||
4291 | /* | 4143 | /* |
4292 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4144 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4293 | */ | 4145 | */ |
4294 | static struct rq * | 4146 | static struct rq *find_busiest_queue(struct lb_env *env, |
4295 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | 4147 | struct sched_group *group, |
4296 | enum cpu_idle_type idle, unsigned long imbalance, | 4148 | const struct cpumask *cpus) |
4297 | const struct cpumask *cpus) | ||
4298 | { | 4149 | { |
4299 | struct rq *busiest = NULL, *rq; | 4150 | struct rq *busiest = NULL, *rq; |
4300 | unsigned long max_load = 0; | 4151 | unsigned long max_load = 0; |
@@ -4307,7 +4158,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4307 | unsigned long wl; | 4158 | unsigned long wl; |
4308 | 4159 | ||
4309 | if (!capacity) | 4160 | if (!capacity) |
4310 | capacity = fix_small_capacity(sd, group); | 4161 | capacity = fix_small_capacity(env->sd, group); |
4311 | 4162 | ||
4312 | if (!cpumask_test_cpu(i, cpus)) | 4163 | if (!cpumask_test_cpu(i, cpus)) |
4313 | continue; | 4164 | continue; |
@@ -4319,7 +4170,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4319 | * When comparing with imbalance, use weighted_cpuload() | 4170 | * When comparing with imbalance, use weighted_cpuload() |
4320 | * which is not scaled with the cpu power. | 4171 | * which is not scaled with the cpu power. |
4321 | */ | 4172 | */ |
4322 | if (capacity && rq->nr_running == 1 && wl > imbalance) | 4173 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) |
4323 | continue; | 4174 | continue; |
4324 | 4175 | ||
4325 | /* | 4176 | /* |
@@ -4348,40 +4199,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4348 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4199 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4349 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4200 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
4350 | 4201 | ||
4351 | static int need_active_balance(struct sched_domain *sd, int idle, | 4202 | static int need_active_balance(struct lb_env *env) |
4352 | int busiest_cpu, int this_cpu) | ||
4353 | { | 4203 | { |
4354 | if (idle == CPU_NEWLY_IDLE) { | 4204 | struct sched_domain *sd = env->sd; |
4205 | |||
4206 | if (env->idle == CPU_NEWLY_IDLE) { | ||
4355 | 4207 | ||
4356 | /* | 4208 | /* |
4357 | * ASYM_PACKING needs to force migrate tasks from busy but | 4209 | * ASYM_PACKING needs to force migrate tasks from busy but |
4358 | * higher numbered CPUs in order to pack all tasks in the | 4210 | * higher numbered CPUs in order to pack all tasks in the |
4359 | * lowest numbered CPUs. | 4211 | * lowest numbered CPUs. |
4360 | */ | 4212 | */ |
4361 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | 4213 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) |
4362 | return 1; | 4214 | return 1; |
4363 | |||
4364 | /* | ||
4365 | * The only task running in a non-idle cpu can be moved to this | ||
4366 | * cpu in an attempt to completely freeup the other CPU | ||
4367 | * package. | ||
4368 | * | ||
4369 | * The package power saving logic comes from | ||
4370 | * find_busiest_group(). If there are no imbalance, then | ||
4371 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4372 | * f_b_g() will select a group from which a running task may be | ||
4373 | * pulled to this cpu in order to make the other package idle. | ||
4374 | * If there is no opportunity to make a package idle and if | ||
4375 | * there are no imbalance, then f_b_g() will return NULL and no | ||
4376 | * action will be taken in load_balance_newidle(). | ||
4377 | * | ||
4378 | * Under normal task pull operation due to imbalance, there | ||
4379 | * will be more than one task in the source run queue and | ||
4380 | * move_tasks() will succeed. ld_moved will be true and this | ||
4381 | * active balance code will not be triggered. | ||
4382 | */ | ||
4383 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4384 | return 0; | ||
4385 | } | 4215 | } |
4386 | 4216 | ||
4387 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 4217 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
@@ -4399,7 +4229,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4399 | { | 4229 | { |
4400 | int ld_moved, active_balance = 0; | 4230 | int ld_moved, active_balance = 0; |
4401 | struct sched_group *group; | 4231 | struct sched_group *group; |
4402 | unsigned long imbalance; | ||
4403 | struct rq *busiest; | 4232 | struct rq *busiest; |
4404 | unsigned long flags; | 4233 | unsigned long flags; |
4405 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4234 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
@@ -4417,8 +4246,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4417 | schedstat_inc(sd, lb_count[idle]); | 4246 | schedstat_inc(sd, lb_count[idle]); |
4418 | 4247 | ||
4419 | redo: | 4248 | redo: |
4420 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, | 4249 | group = find_busiest_group(&env, cpus, balance); |
4421 | cpus, balance); | ||
4422 | 4250 | ||
4423 | if (*balance == 0) | 4251 | if (*balance == 0) |
4424 | goto out_balanced; | 4252 | goto out_balanced; |
@@ -4428,7 +4256,7 @@ redo: | |||
4428 | goto out_balanced; | 4256 | goto out_balanced; |
4429 | } | 4257 | } |
4430 | 4258 | ||
4431 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); | 4259 | busiest = find_busiest_queue(&env, group, cpus); |
4432 | if (!busiest) { | 4260 | if (!busiest) { |
4433 | schedstat_inc(sd, lb_nobusyq[idle]); | 4261 | schedstat_inc(sd, lb_nobusyq[idle]); |
4434 | goto out_balanced; | 4262 | goto out_balanced; |
@@ -4436,7 +4264,7 @@ redo: | |||
4436 | 4264 | ||
4437 | BUG_ON(busiest == this_rq); | 4265 | BUG_ON(busiest == this_rq); |
4438 | 4266 | ||
4439 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 4267 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4440 | 4268 | ||
4441 | ld_moved = 0; | 4269 | ld_moved = 0; |
4442 | if (busiest->nr_running > 1) { | 4270 | if (busiest->nr_running > 1) { |
@@ -4447,10 +4275,9 @@ redo: | |||
4447 | * correctly treated as an imbalance. | 4275 | * correctly treated as an imbalance. |
4448 | */ | 4276 | */ |
4449 | env.flags |= LBF_ALL_PINNED; | 4277 | env.flags |= LBF_ALL_PINNED; |
4450 | env.load_move = imbalance; | 4278 | env.src_cpu = busiest->cpu; |
4451 | env.src_cpu = busiest->cpu; | 4279 | env.src_rq = busiest; |
4452 | env.src_rq = busiest; | 4280 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4453 | env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); | ||
4454 | 4281 | ||
4455 | more_balance: | 4282 | more_balance: |
4456 | local_irq_save(flags); | 4283 | local_irq_save(flags); |
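The load_balance() hunks above show the shape of the whole refactor in this file: instead of threading sd, this_cpu, idle and a separate imbalance variable through every helper, the caller fills one struct lb_env and the helpers read and update it, so adding a new parameter no longer touches every prototype. A hedged, self-contained miniature of that pattern with invented field values and helper bodies:

#include <stdio.h>

/* Invented miniature of the lb_env pattern: the caller fills one
 * context struct and the helpers read and update it, instead of each
 * taking sd/this_cpu/idle/imbalance as separate arguments. */
struct toy_env {
	int dst_cpu;
	int src_cpu;
	long imbalance;
};

static void toy_find_imbalance(struct toy_env *env)
{
	env->imbalance = 400;		/* pretend calculate_imbalance() result */
}

static long toy_move_tasks(struct toy_env *env)
{
	long moved = 250;		/* pretend this much weighted load was pulled */

	env->imbalance -= moved;
	return moved;
}

int main(void)
{
	struct toy_env env = { .dst_cpu = 0, .src_cpu = 3 };
	long moved;

	toy_find_imbalance(&env);
	moved = toy_move_tasks(&env);
	printf("moved %ld from cpu%d to cpu%d, imbalance left %ld\n",
	       moved, env.src_cpu, env.dst_cpu, env.imbalance);
	return 0;
}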
@@ -4492,7 +4319,7 @@ more_balance: | |||
4492 | if (idle != CPU_NEWLY_IDLE) | 4319 | if (idle != CPU_NEWLY_IDLE) |
4493 | sd->nr_balance_failed++; | 4320 | sd->nr_balance_failed++; |
4494 | 4321 | ||
4495 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { | 4322 | if (need_active_balance(&env)) { |
4496 | raw_spin_lock_irqsave(&busiest->lock, flags); | 4323 | raw_spin_lock_irqsave(&busiest->lock, flags); |
4497 | 4324 | ||
4498 | /* don't kick the active_load_balance_cpu_stop, | 4325 | /* don't kick the active_load_balance_cpu_stop, |
@@ -4519,10 +4346,11 @@ more_balance: | |||
4519 | } | 4346 | } |
4520 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 4347 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
4521 | 4348 | ||
4522 | if (active_balance) | 4349 | if (active_balance) { |
4523 | stop_one_cpu_nowait(cpu_of(busiest), | 4350 | stop_one_cpu_nowait(cpu_of(busiest), |
4524 | active_load_balance_cpu_stop, busiest, | 4351 | active_load_balance_cpu_stop, busiest, |
4525 | &busiest->active_balance_work); | 4352 | &busiest->active_balance_work); |
4353 | } | ||
4526 | 4354 | ||
4527 | /* | 4355 | /* |
4528 | * We've kicked active balancing, reset the failure | 4356 | * We've kicked active balancing, reset the failure |
@@ -4703,104 +4531,15 @@ static struct { | |||
4703 | unsigned long next_balance; /* in jiffy units */ | 4531 | unsigned long next_balance; /* in jiffy units */ |
4704 | } nohz ____cacheline_aligned; | 4532 | } nohz ____cacheline_aligned; |
4705 | 4533 | ||
4706 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4534 | static inline int find_new_ilb(int call_cpu) |
4707 | /** | ||
4708 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4709 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4710 | * be returned. | ||
4711 | * @flag: The flag to check for the lowest sched_domain | ||
4712 | * for the given cpu. | ||
4713 | * | ||
4714 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4715 | */ | ||
4716 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4717 | { | ||
4718 | struct sched_domain *sd; | ||
4719 | |||
4720 | for_each_domain(cpu, sd) | ||
4721 | if (sd->flags & flag) | ||
4722 | break; | ||
4723 | |||
4724 | return sd; | ||
4725 | } | ||
4726 | |||
4727 | /** | ||
4728 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4729 | * @cpu: The cpu whose domains we're iterating over. | ||
4730 | * @sd: variable holding the value of the power_savings_sd | ||
4731 | * for cpu. | ||
4732 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4733 | * | ||
4734 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4735 | * set, starting from the lowest sched_domain to the highest. | ||
4736 | */ | ||
4737 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4738 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4739 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4740 | |||
4741 | /** | ||
4742 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4743 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4744 | * | ||
4745 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4746 | * Else, returns >= nr_cpu_ids. | ||
4747 | * | ||
4748 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4749 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4750 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4751 | * when there are other idle cpu's which are better suited for that job. | ||
4752 | */ | ||
4753 | static int find_new_ilb(int cpu) | ||
4754 | { | 4535 | { |
4755 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 4536 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
4756 | struct sched_group *ilbg; | ||
4757 | struct sched_domain *sd; | ||
4758 | |||
4759 | /* | ||
4760 | * Have idle load balancer selection from semi-idle packages only | ||
4761 | * when power-aware load balancing is enabled | ||
4762 | */ | ||
4763 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4764 | goto out_done; | ||
4765 | |||
4766 | /* | ||
4767 | * Optimize for the case when we have no idle CPUs or only one | ||
4768 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4769 | */ | ||
4770 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | ||
4771 | goto out_done; | ||
4772 | 4537 | ||
4773 | rcu_read_lock(); | ||
4774 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4775 | ilbg = sd->groups; | ||
4776 | |||
4777 | do { | ||
4778 | if (ilbg->group_weight != | ||
4779 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { | ||
4780 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4781 | sched_group_cpus(ilbg)); | ||
4782 | goto unlock; | ||
4783 | } | ||
4784 | |||
4785 | ilbg = ilbg->next; | ||
4786 | |||
4787 | } while (ilbg != sd->groups); | ||
4788 | } | ||
4789 | unlock: | ||
4790 | rcu_read_unlock(); | ||
4791 | |||
4792 | out_done: | ||
4793 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 4538 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4794 | return ilb; | 4539 | return ilb; |
4795 | 4540 | ||
4796 | return nr_cpu_ids; | 4541 | return nr_cpu_ids; |
4797 | } | 4542 | } |
4798 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4799 | static inline int find_new_ilb(int call_cpu) | ||
4800 | { | ||
4801 | return nr_cpu_ids; | ||
4802 | } | ||
4803 | #endif | ||
4804 | 4543 | ||
4805 | /* | 4544 | /* |
4806 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 4545 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the |
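With the power-savings domain walk removed, find_new_ilb() above reduces to taking the first CPU in nohz.idle_cpus_mask and confirming it is still idle. A hedged userspace analogue of that cpumask_first() step, on a small invented bitmask of idle CPUs:

#include <stdio.h>

#define TOY_NR_CPUS 8

/* Bit i set means cpu i is in the toy "nohz idle" mask (invented). */
static unsigned int idle_mask = 0x34;	/* cpus 2, 4 and 5 idle */

static int toy_find_new_ilb(void)
{
	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
		if (idle_mask & (1u << cpu))
			return cpu;	/* analogue of cpumask_first() */

	return TOY_NR_CPUS;		/* analogue of returning nr_cpu_ids */
}

int main(void)
{
	printf("idle load balancer = cpu%d\n", toy_find_new_ilb());	/* cpu2 */
	return 0;
}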
@@ -5023,7 +4762,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
5023 | 4762 | ||
5024 | raw_spin_lock_irq(&this_rq->lock); | 4763 | raw_spin_lock_irq(&this_rq->lock); |
5025 | update_rq_clock(this_rq); | 4764 | update_rq_clock(this_rq); |
5026 | update_cpu_load(this_rq); | 4765 | update_idle_cpu_load(this_rq); |
5027 | raw_spin_unlock_irq(&this_rq->lock); | 4766 | raw_spin_unlock_irq(&this_rq->lock); |
5028 | 4767 | ||
5029 | rebalance_domains(balance_cpu, CPU_IDLE); | 4768 | rebalance_domains(balance_cpu, CPU_IDLE); |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f289..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
5 | * | 5 | * |
6 | * (NOTE: these are not related to SCHED_IDLE tasks which are | 6 | * (NOTE: these are not related to SCHED_IDLE tasks which are |
7 | * handled in sched_fair.c) | 7 | * handled in sched/fair.c) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 26 | { |
27 | schedstat_inc(rq, sched_goidle); | 27 | schedstat_inc(rq, sched_goidle); |
28 | calc_load_account_idle(rq); | ||
29 | return rq->idle; | 28 | return rq->idle; |
30 | } | 29 | } |
31 | 30 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d0..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) | |||
274 | 274 | ||
275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
276 | { | 276 | { |
277 | struct task_struct *p; | ||
278 | |||
277 | if (!rt_entity_is_task(rt_se)) | 279 | if (!rt_entity_is_task(rt_se)) |
278 | return; | 280 | return; |
279 | 281 | ||
282 | p = rt_task_of(rt_se); | ||
280 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 283 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
281 | 284 | ||
282 | rt_rq->rt_nr_total++; | 285 | rt_rq->rt_nr_total++; |
283 | if (rt_se->nr_cpus_allowed > 1) | 286 | if (p->nr_cpus_allowed > 1) |
284 | rt_rq->rt_nr_migratory++; | 287 | rt_rq->rt_nr_migratory++; |
285 | 288 | ||
286 | update_rt_migration(rt_rq); | 289 | update_rt_migration(rt_rq); |
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
288 | 291 | ||
289 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 292 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
290 | { | 293 | { |
294 | struct task_struct *p; | ||
295 | |||
291 | if (!rt_entity_is_task(rt_se)) | 296 | if (!rt_entity_is_task(rt_se)) |
292 | return; | 297 | return; |
293 | 298 | ||
299 | p = rt_task_of(rt_se); | ||
294 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 300 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
295 | 301 | ||
296 | rt_rq->rt_nr_total--; | 302 | rt_rq->rt_nr_total--; |
297 | if (rt_se->nr_cpus_allowed > 1) | 303 | if (p->nr_cpus_allowed > 1) |
298 | rt_rq->rt_nr_migratory--; | 304 | rt_rq->rt_nr_migratory--; |
299 | 305 | ||
300 | update_rt_migration(rt_rq); | 306 | update_rt_migration(rt_rq); |
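The rt.c hunks above follow from nr_cpus_allowed moving out of struct sched_rt_entity into struct task_struct, so the migration accounting must first recover the owning task from the embedded entity via rt_task_of(), which is the usual container_of() arithmetic. A hedged, self-contained sketch of that idiom with toy structures standing in for the kernel ones:

#include <stdio.h>
#include <stddef.h>

/* Toy versions of the embedded-entity layout; the field names mimic
 * the kernel ones but the structs are invented for illustration. */
struct toy_rt_entity {
	int on_list;
};

struct toy_task {
	int nr_cpus_allowed;		/* now lives on the task, not the entity */
	struct toy_rt_entity rt;	/* embedded scheduling entity */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct toy_task *toy_rt_task_of(struct toy_rt_entity *rt_se)
{
	return container_of(rt_se, struct toy_task, rt);
}

int main(void)
{
	struct toy_task t = { .nr_cpus_allowed = 4 };
	struct toy_rt_entity *rt_se = &t.rt;

	/* Recover the task from the entity, then read the migrated field. */
	printf("nr_cpus_allowed = %d\n",
	       toy_rt_task_of(rt_se)->nr_cpus_allowed);
	return 0;
}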
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1161 | 1167 | ||
1162 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1168 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); |
1163 | 1169 | ||
1164 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1170 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
1165 | enqueue_pushable_task(rq, p); | 1171 | enqueue_pushable_task(rq, p); |
1166 | 1172 | ||
1167 | inc_nr_running(rq); | 1173 | inc_nr_running(rq); |
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1225 | 1231 | ||
1226 | cpu = task_cpu(p); | 1232 | cpu = task_cpu(p); |
1227 | 1233 | ||
1228 | if (p->rt.nr_cpus_allowed == 1) | 1234 | if (p->nr_cpus_allowed == 1) |
1229 | goto out; | 1235 | goto out; |
1230 | 1236 | ||
1231 | /* For anything but wake ups, just return the task_cpu */ | 1237 | /* For anything but wake ups, just return the task_cpu */ |
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1260 | * will have to sort it out. | 1266 | * will have to sort it out. |
1261 | */ | 1267 | */ |
1262 | if (curr && unlikely(rt_task(curr)) && | 1268 | if (curr && unlikely(rt_task(curr)) && |
1263 | (curr->rt.nr_cpus_allowed < 2 || | 1269 | (curr->nr_cpus_allowed < 2 || |
1264 | curr->prio <= p->prio) && | 1270 | curr->prio <= p->prio) && |
1265 | (p->rt.nr_cpus_allowed > 1)) { | 1271 | (p->nr_cpus_allowed > 1)) { |
1266 | int target = find_lowest_rq(p); | 1272 | int target = find_lowest_rq(p); |
1267 | 1273 | ||
1268 | if (target != -1) | 1274 | if (target != -1) |
@@ -1276,10 +1282,10 @@ out: | |||
1276 | 1282 | ||
1277 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1283 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
1278 | { | 1284 | { |
1279 | if (rq->curr->rt.nr_cpus_allowed == 1) | 1285 | if (rq->curr->nr_cpus_allowed == 1) |
1280 | return; | 1286 | return; |
1281 | 1287 | ||
1282 | if (p->rt.nr_cpus_allowed != 1 | 1288 | if (p->nr_cpus_allowed != 1 |
1283 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1289 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
1284 | return; | 1290 | return; |
1285 | 1291 | ||
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1395 | * The previous task needs to be made eligible for pushing | 1401 | * The previous task needs to be made eligible for pushing |
1396 | * if it is still active | 1402 | * if it is still active |
1397 | */ | 1403 | */ |
1398 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) | 1404 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
1399 | enqueue_pushable_task(rq, p); | 1405 | enqueue_pushable_task(rq, p); |
1400 | } | 1406 | } |
1401 | 1407 | ||
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1408 | { | 1414 | { |
1409 | if (!task_running(rq, p) && | 1415 | if (!task_running(rq, p) && |
1410 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1416 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1411 | (p->rt.nr_cpus_allowed > 1)) | 1417 | (p->nr_cpus_allowed > 1)) |
1412 | return 1; | 1418 | return 1; |
1413 | return 0; | 1419 | return 0; |
1414 | } | 1420 | } |
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1464 | if (unlikely(!lowest_mask)) | 1470 | if (unlikely(!lowest_mask)) |
1465 | return -1; | 1471 | return -1; |
1466 | 1472 | ||
1467 | if (task->rt.nr_cpus_allowed == 1) | 1473 | if (task->nr_cpus_allowed == 1) |
1468 | return -1; /* No other targets possible */ | 1474 | return -1; /* No other targets possible */ |
1469 | 1475 | ||
1470 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) | 1476 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1556 | task_running(rq, task) || | 1562 | task_running(rq, task) || |
1557 | !task->on_rq)) { | 1563 | !task->on_rq)) { |
1558 | 1564 | ||
1559 | raw_spin_unlock(&lowest_rq->lock); | 1565 | double_unlock_balance(rq, lowest_rq); |
1560 | lowest_rq = NULL; | 1566 | lowest_rq = NULL; |
1561 | break; | 1567 | break; |
1562 | } | 1568 | } |
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1586 | 1592 | ||
1587 | BUG_ON(rq->cpu != task_cpu(p)); | 1593 | BUG_ON(rq->cpu != task_cpu(p)); |
1588 | BUG_ON(task_current(rq, p)); | 1594 | BUG_ON(task_current(rq, p)); |
1589 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1595 | BUG_ON(p->nr_cpus_allowed <= 1); |
1590 | 1596 | ||
1591 | BUG_ON(!p->on_rq); | 1597 | BUG_ON(!p->on_rq); |
1592 | BUG_ON(!rt_task(p)); | 1598 | BUG_ON(!rt_task(p)); |
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1793 | if (!task_running(rq, p) && | 1799 | if (!task_running(rq, p) && |
1794 | !test_tsk_need_resched(rq->curr) && | 1800 | !test_tsk_need_resched(rq->curr) && |
1795 | has_pushable_tasks(rq) && | 1801 | has_pushable_tasks(rq) && |
1796 | p->rt.nr_cpus_allowed > 1 && | 1802 | p->nr_cpus_allowed > 1 && |
1797 | rt_task(rq->curr) && | 1803 | rt_task(rq->curr) && |
1798 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1804 | (rq->curr->nr_cpus_allowed < 2 || |
1799 | rq->curr->prio <= p->prio)) | 1805 | rq->curr->prio <= p->prio)) |
1800 | push_rt_tasks(rq); | 1806 | push_rt_tasks(rq); |
1801 | } | 1807 | } |
@@ -1803,44 +1809,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1803 | static void set_cpus_allowed_rt(struct task_struct *p, | 1809 | static void set_cpus_allowed_rt(struct task_struct *p, |
1804 | const struct cpumask *new_mask) | 1810 | const struct cpumask *new_mask) |
1805 | { | 1811 | { |
1806 | int weight = cpumask_weight(new_mask); | 1812 | struct rq *rq; |
1813 | int weight; | ||
1807 | 1814 | ||
1808 | BUG_ON(!rt_task(p)); | 1815 | BUG_ON(!rt_task(p)); |
1809 | 1816 | ||
1810 | /* | 1817 | if (!p->on_rq) |
1811 | * Update the migration status of the RQ if we have an RT task | 1818 | return; |
1812 | * which is running AND changing its weight value. | ||
1813 | */ | ||
1814 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
1815 | struct rq *rq = task_rq(p); | ||
1816 | 1819 | ||
1817 | if (!task_current(rq, p)) { | 1820 | weight = cpumask_weight(new_mask); |
1818 | /* | ||
1819 | * Make sure we dequeue this task from the pushable list | ||
1820 | * before going further. It will either remain off of | ||
1821 | * the list because we are no longer pushable, or it | ||
1822 | * will be requeued. | ||
1823 | */ | ||
1824 | if (p->rt.nr_cpus_allowed > 1) | ||
1825 | dequeue_pushable_task(rq, p); | ||
1826 | 1821 | ||
1827 | /* | 1822 | /* |
1828 | * Requeue if our weight is changing and still > 1 | 1823 | * Only update if the process changes its state from whether it |
1829 | */ | 1824 | * can migrate or not. |
1830 | if (weight > 1) | 1825 | */ |
1831 | enqueue_pushable_task(rq, p); | 1826 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
1832 | 1827 | return; | |
1833 | } | ||
1834 | 1828 | ||
1835 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | 1829 | rq = task_rq(p); |
1836 | rq->rt.rt_nr_migratory++; | ||
1837 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
1838 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1839 | rq->rt.rt_nr_migratory--; | ||
1840 | } | ||
1841 | 1830 | ||
1842 | update_rt_migration(&rq->rt); | 1831 | /* |
1832 | * The process used to be able to migrate OR it can now migrate | ||
1833 | */ | ||
1834 | if (weight <= 1) { | ||
1835 | if (!task_current(rq, p)) | ||
1836 | dequeue_pushable_task(rq, p); | ||
1837 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1838 | rq->rt.rt_nr_migratory--; | ||
1839 | } else { | ||
1840 | if (!task_current(rq, p)) | ||
1841 | enqueue_pushable_task(rq, p); | ||
1842 | rq->rt.rt_nr_migratory++; | ||
1843 | } | 1843 | } |
1844 | |||
1845 | update_rt_migration(&rq->rt); | ||
1844 | } | 1846 | } |
1845 | 1847 | ||
1846 | /* Assumes rq->lock is held */ | 1848 | /* Assumes rq->lock is held */ |
@@ -1983,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1983 | 1985 | ||
1984 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 1986 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
1985 | { | 1987 | { |
1988 | struct sched_rt_entity *rt_se = &p->rt; | ||
1989 | |||
1986 | update_curr_rt(rq); | 1990 | update_curr_rt(rq); |
1987 | 1991 | ||
1988 | watchdog(rq, p); | 1992 | watchdog(rq, p); |
@@ -2000,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
2000 | p->rt.time_slice = RR_TIMESLICE; | 2004 | p->rt.time_slice = RR_TIMESLICE; |
2001 | 2005 | ||
2002 | /* | 2006 | /* |
2003 | * Requeue to the end of queue if we are not the only element | 2007 | * Requeue to the end of queue if we (and all of our ancestors) are the |
2004 | * on the queue: | 2008 | * only element on the queue |
2005 | */ | 2009 | */ |
2006 | if (p->rt.run_list.prev != p->rt.run_list.next) { | 2010 | for_each_sched_rt_entity(rt_se) { |
2007 | requeue_task_rt(rq, p, 0); | 2011 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
2008 | set_tsk_need_resched(p); | 2012 | requeue_task_rt(rq, p, 0); |
2013 | set_tsk_need_resched(p); | ||
2014 | return; | ||
2015 | } | ||
2009 | } | 2016 | } |
2010 | } | 2017 | } |
2011 | 2018 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52e..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -201,7 +201,7 @@ struct cfs_bandwidth { }; | |||
201 | /* CFS-related fields in a runqueue */ | 201 | /* CFS-related fields in a runqueue */ |
202 | struct cfs_rq { | 202 | struct cfs_rq { |
203 | struct load_weight load; | 203 | struct load_weight load; |
204 | unsigned long nr_running, h_nr_running; | 204 | unsigned int nr_running, h_nr_running; |
205 | 205 | ||
206 | u64 exec_clock; | 206 | u64 exec_clock; |
207 | u64 min_vruntime; | 207 | u64 min_vruntime; |
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void) | |||
279 | /* Real-Time classes' related field in a runqueue: */ | 279 | /* Real-Time classes' related field in a runqueue: */ |
280 | struct rt_rq { | 280 | struct rt_rq { |
281 | struct rt_prio_array active; | 281 | struct rt_prio_array active; |
282 | unsigned long rt_nr_running; | 282 | unsigned int rt_nr_running; |
283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
284 | struct { | 284 | struct { |
285 | int curr; /* highest queued rt task prio */ | 285 | int curr; /* highest queued rt task prio */ |
@@ -353,7 +353,7 @@ struct rq { | |||
353 | * nr_running and cpu_load should be in the same cacheline because | 353 | * nr_running and cpu_load should be in the same cacheline because |
354 | * remote CPUs use both these fields when doing load calculation. | 354 | * remote CPUs use both these fields when doing load calculation. |
355 | */ | 355 | */ |
356 | unsigned long nr_running; | 356 | unsigned int nr_running; |
357 | #define CPU_LOAD_IDX_MAX 5 | 357 | #define CPU_LOAD_IDX_MAX 5 |
358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
359 | unsigned long last_load_update_tick; | 359 | unsigned long last_load_update_tick; |
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
527 | DECLARE_PER_CPU(int, sd_llc_id); | 527 | DECLARE_PER_CPU(int, sd_llc_id); |
528 | 528 | ||
529 | extern int group_balance_cpu(struct sched_group *sg); | ||
530 | |||
529 | #endif /* CONFIG_SMP */ | 531 | #endif /* CONFIG_SMP */ |
530 | 532 | ||
531 | #include "stats.h" | 533 | #include "stats.h" |
@@ -876,7 +878,7 @@ extern void resched_cpu(int cpu); | |||
876 | extern struct rt_bandwidth def_rt_bandwidth; | 878 | extern struct rt_bandwidth def_rt_bandwidth; |
877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 879 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
878 | 880 | ||
879 | extern void update_cpu_load(struct rq *this_rq); | 881 | extern void update_idle_cpu_load(struct rq *this_rq); |
880 | 882 | ||
881 | #ifdef CONFIG_CGROUP_CPUACCT | 883 | #ifdef CONFIG_CGROUP_CPUACCT |
882 | #include <linux/cgroup.h> | 884 | #include <linux/cgroup.h> |
@@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void) | |||
940 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
941 | } | 943 | } |
942 | 944 | ||
943 | void calc_load_account_idle(struct rq *this_rq); | ||
944 | |||
945 | #ifdef CONFIG_SCHED_HRTICK | 945 | #ifdef CONFIG_SCHED_HRTICK |
946 | 946 | ||
947 | /* | 947 | /* |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895ea..ee376beedaf9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -3,16 +3,357 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | 4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> |
5 | * | 5 | * |
6 | * This defines a simple but solid secure-computing mode. | 6 | * Copyright (C) 2012 Google, Inc. |
7 | * Will Drewry <wad@chromium.org> | ||
8 | * | ||
9 | * This defines a simple but solid secure-computing facility. | ||
10 | * | ||
11 | * Mode 1 uses a fixed list of allowed system calls. | ||
12 | * Mode 2 allows user-defined system call filters in the form | ||
13 | * of Berkeley Packet Filters/Linux Socket Filters. | ||
7 | */ | 14 | */ |
8 | 15 | ||
16 | #include <linux/atomic.h> | ||
9 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
10 | #include <linux/seccomp.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/sched.h> | ||
20 | #include <linux/seccomp.h> | ||
13 | 21 | ||
14 | /* #define SECCOMP_DEBUG 1 */ | 22 | /* #define SECCOMP_DEBUG 1 */ |
15 | #define NR_SECCOMP_MODES 1 | 23 | |
24 | #ifdef CONFIG_SECCOMP_FILTER | ||
25 | #include <asm/syscall.h> | ||
26 | #include <linux/filter.h> | ||
27 | #include <linux/ptrace.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/tracehook.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | |||
33 | /** | ||
34 | * struct seccomp_filter - container for seccomp BPF programs | ||
35 | * | ||
36 | * @usage: reference count to manage the object lifetime. | ||
37 | * get/put helpers should be used when accessing an instance | ||
38 | * outside of a lifetime-guarded section. In general, this | ||
39 | * is only needed for handling filters shared across tasks. | ||
40 | * @prev: points to a previously installed, or inherited, filter | ||
41 | * @len: the number of instructions in the program | ||
42 | * @insns: the BPF program instructions to evaluate | ||
43 | * | ||
44 | * seccomp_filter objects are organized in a tree linked via the @prev | ||
45 | * pointer. For any task, it appears to be a singly-linked list starting | ||
46 | * with current->seccomp.filter, the most recently attached or inherited filter. | ||
47 | * However, multiple filters may share a @prev node, by way of fork(), which | ||
48 | * results in a unidirectional tree existing in memory. This is similar to | ||
49 | * how namespaces work. | ||
50 | * | ||
51 | * seccomp_filter objects should never be modified after being attached | ||
52 | * to a task_struct (other than @usage). | ||
53 | */ | ||
54 | struct seccomp_filter { | ||
55 | atomic_t usage; | ||
56 | struct seccomp_filter *prev; | ||
57 | unsigned short len; /* Instruction count */ | ||
58 | struct sock_filter insns[]; | ||
59 | }; | ||
60 | |||
61 | /* Limit any path through the tree to 256KB worth of instructions. */ | ||
62 | #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) | ||
63 | |||
64 | /** | ||
65 | * get_u32 - returns a u32 offset into data | ||
66 | * @data: an unsigned 64-bit value | ||
67 | * @index: 0 or 1 to return the first or second 32-bits | ||
68 | * | ||
69 | * This inline exists to hide the length of unsigned long. If a 32-bit | ||
70 | * unsigned long is passed in, it will be extended and the top 32-bits will be | ||
71 | * 0. If it is a 64-bit unsigned long, then whatever data is resident will be | ||
72 | * properly returned. | ||
73 | * | ||
74 | * Endianness is explicitly ignored and left for BPF program authors to manage | ||
75 | * as per the specific architecture. | ||
76 | */ | ||
77 | static inline u32 get_u32(u64 data, int index) | ||
78 | { | ||
79 | return ((u32 *)&data)[index]; | ||
80 | } | ||
81 | |||
82 | /* Helper for bpf_load below. */ | ||
83 | #define BPF_DATA(_name) offsetof(struct seccomp_data, _name) | ||
84 | /** | ||
85 | * bpf_load: checks and returns a pointer to the requested offset | ||
86 | * @off: offset into struct seccomp_data to load from | ||
87 | * | ||
88 | * Returns the requested 32-bits of data. | ||
89 | * seccomp_check_filter() should assure that @off is 32-bit aligned | ||
90 | * and not out of bounds. Failure to do so is a BUG. | ||
91 | */ | ||
92 | u32 seccomp_bpf_load(int off) | ||
93 | { | ||
94 | struct pt_regs *regs = task_pt_regs(current); | ||
95 | if (off == BPF_DATA(nr)) | ||
96 | return syscall_get_nr(current, regs); | ||
97 | if (off == BPF_DATA(arch)) | ||
98 | return syscall_get_arch(current, regs); | ||
99 | if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { | ||
100 | unsigned long value; | ||
101 | int arg = (off - BPF_DATA(args[0])) / sizeof(u64); | ||
102 | int index = !!(off % sizeof(u64)); | ||
103 | syscall_get_arguments(current, regs, arg, 1, &value); | ||
104 | return get_u32(value, index); | ||
105 | } | ||
106 | if (off == BPF_DATA(instruction_pointer)) | ||
107 | return get_u32(KSTK_EIP(current), 0); | ||
108 | if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) | ||
109 | return get_u32(KSTK_EIP(current), 1); | ||
110 | /* seccomp_check_filter should make this impossible. */ | ||
111 | BUG(); | ||
112 | } | ||
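
For reference, the offsets seccomp_bpf_load() accepts are exactly the 32-bit slots of struct seccomp_data, so a 64-bit syscall argument is read as two separate BPF loads. A hedged illustration (on a little-endian architecture the first load is the low half; as the get_u32() comment notes, endianness is left to the filter author):

/* two filter instructions fetching both halves of syscall argument 1 */
static const struct sock_filter arg1_loads[] = {
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])),
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1]) + 4),
};
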
113 | |||
114 | /** | ||
115 | * seccomp_check_filter - verify seccomp filter code | ||
116 | * @filter: filter to verify | ||
117 | * @flen: length of filter | ||
118 | * | ||
119 | * Takes a previously checked filter (by sk_chk_filter) and | ||
120 | * redirects all filter code that loads struct sk_buff data | ||
121 | * and related data through seccomp_bpf_load. It also | ||
122 | * enforces length and alignment checking of those loads. | ||
123 | * | ||
124 | * Returns 0 if the rule set is legal or -EINVAL if not. | ||
125 | */ | ||
126 | static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | ||
127 | { | ||
128 | int pc; | ||
129 | for (pc = 0; pc < flen; pc++) { | ||
130 | struct sock_filter *ftest = &filter[pc]; | ||
131 | u16 code = ftest->code; | ||
132 | u32 k = ftest->k; | ||
133 | |||
134 | switch (code) { | ||
135 | case BPF_S_LD_W_ABS: | ||
136 | ftest->code = BPF_S_ANC_SECCOMP_LD_W; | ||
137 | /* 32-bit aligned and not out of bounds. */ | ||
138 | if (k >= sizeof(struct seccomp_data) || k & 3) | ||
139 | return -EINVAL; | ||
140 | continue; | ||
141 | case BPF_S_LD_W_LEN: | ||
142 | ftest->code = BPF_S_LD_IMM; | ||
143 | ftest->k = sizeof(struct seccomp_data); | ||
144 | continue; | ||
145 | case BPF_S_LDX_W_LEN: | ||
146 | ftest->code = BPF_S_LDX_IMM; | ||
147 | ftest->k = sizeof(struct seccomp_data); | ||
148 | continue; | ||
149 | /* Explicitly include allowed calls. */ | ||
150 | case BPF_S_RET_K: | ||
151 | case BPF_S_RET_A: | ||
152 | case BPF_S_ALU_ADD_K: | ||
153 | case BPF_S_ALU_ADD_X: | ||
154 | case BPF_S_ALU_SUB_K: | ||
155 | case BPF_S_ALU_SUB_X: | ||
156 | case BPF_S_ALU_MUL_K: | ||
157 | case BPF_S_ALU_MUL_X: | ||
158 | case BPF_S_ALU_DIV_X: | ||
159 | case BPF_S_ALU_AND_K: | ||
160 | case BPF_S_ALU_AND_X: | ||
161 | case BPF_S_ALU_OR_K: | ||
162 | case BPF_S_ALU_OR_X: | ||
163 | case BPF_S_ALU_LSH_K: | ||
164 | case BPF_S_ALU_LSH_X: | ||
165 | case BPF_S_ALU_RSH_K: | ||
166 | case BPF_S_ALU_RSH_X: | ||
167 | case BPF_S_ALU_NEG: | ||
168 | case BPF_S_LD_IMM: | ||
169 | case BPF_S_LDX_IMM: | ||
170 | case BPF_S_MISC_TAX: | ||
171 | case BPF_S_MISC_TXA: | ||
172 | case BPF_S_ALU_DIV_K: | ||
173 | case BPF_S_LD_MEM: | ||
174 | case BPF_S_LDX_MEM: | ||
175 | case BPF_S_ST: | ||
176 | case BPF_S_STX: | ||
177 | case BPF_S_JMP_JA: | ||
178 | case BPF_S_JMP_JEQ_K: | ||
179 | case BPF_S_JMP_JEQ_X: | ||
180 | case BPF_S_JMP_JGE_K: | ||
181 | case BPF_S_JMP_JGE_X: | ||
182 | case BPF_S_JMP_JGT_K: | ||
183 | case BPF_S_JMP_JGT_X: | ||
184 | case BPF_S_JMP_JSET_K: | ||
185 | case BPF_S_JMP_JSET_X: | ||
186 | continue; | ||
187 | default: | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * seccomp_run_filters - evaluates all seccomp filters against @syscall | ||
196 | * @syscall: number of the current system call | ||
197 | * | ||
198 | * Returns valid seccomp BPF response codes. | ||
199 | */ | ||
200 | static u32 seccomp_run_filters(int syscall) | ||
201 | { | ||
202 | struct seccomp_filter *f; | ||
203 | u32 ret = SECCOMP_RET_ALLOW; | ||
204 | |||
205 | /* Ensure unexpected behavior doesn't result in failing open. */ | ||
206 | if (WARN_ON(current->seccomp.filter == NULL)) | ||
207 | return SECCOMP_RET_KILL; | ||
208 | |||
209 | /* | ||
210 | * All filters in the list are evaluated and the lowest BPF return | ||
211 | * value always takes priority (ignoring the DATA). | ||
212 | */ | ||
213 | for (f = current->seccomp.filter; f; f = f->prev) { | ||
214 | u32 cur_ret = sk_run_filter(NULL, f->insns); | ||
215 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | ||
216 | ret = cur_ret; | ||
217 | } | ||
218 | return ret; | ||
219 | } | ||
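
The "lowest return value takes priority" rule relies on the numeric layout of the return codes in the new uapi header: SECCOMP_RET_KILL (0x00000000) < SECCOMP_RET_TRAP (0x00030000) < SECCOMP_RET_ERRNO (0x00050000) < SECCOMP_RET_TRACE (0x7ff00000) < SECCOMP_RET_ALLOW (0x7fff0000), with the low 16 bits reserved for SECCOMP_RET_DATA. A sketch of how two verdicts compose, assuming those values:

/*
 * Illustration only: composing two filter results the way
 * seccomp_run_filters() does. ALLOW composed with (ERRNO | EPERM)
 * yields the ERRNO verdict, and the winner's DATA bits travel with it.
 */
static u32 seccomp_compose_two(u32 a, u32 b)
{
	return ((a & SECCOMP_RET_ACTION) < (b & SECCOMP_RET_ACTION)) ? a : b;
}
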
220 | |||
221 | /** | ||
222 | * seccomp_attach_filter: Attaches a seccomp filter to current. | ||
223 | * @fprog: BPF program to install | ||
224 | * | ||
225 | * Returns 0 on success or an errno on failure. | ||
226 | */ | ||
227 | static long seccomp_attach_filter(struct sock_fprog *fprog) | ||
228 | { | ||
229 | struct seccomp_filter *filter; | ||
230 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | ||
231 | unsigned long total_insns = fprog->len; | ||
232 | long ret; | ||
233 | |||
234 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | ||
235 | return -EINVAL; | ||
236 | |||
237 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | ||
238 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | ||
239 | if (total_insns > MAX_INSNS_PER_PATH) | ||
240 | return -ENOMEM; | ||
241 | |||
242 | /* | ||
243 | * Installing a seccomp filter requires that the task have | ||
244 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | ||
245 | * This avoids scenarios where unprivileged tasks can affect the | ||
246 | * behavior of privileged children. | ||
247 | */ | ||
248 | if (!current->no_new_privs && | ||
249 | security_capable_noaudit(current_cred(), current_user_ns(), | ||
250 | CAP_SYS_ADMIN) != 0) | ||
251 | return -EACCES; | ||
252 | |||
253 | /* Allocate a new seccomp_filter */ | ||
254 | filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, | ||
255 | GFP_KERNEL|__GFP_NOWARN); | ||
256 | if (!filter) | ||
257 | return -ENOMEM; | ||
258 | atomic_set(&filter->usage, 1); | ||
259 | filter->len = fprog->len; | ||
260 | |||
261 | /* Copy the instructions from fprog. */ | ||
262 | ret = -EFAULT; | ||
263 | if (copy_from_user(filter->insns, fprog->filter, fp_size)) | ||
264 | goto fail; | ||
265 | |||
266 | /* Check and rewrite the fprog via the skb checker */ | ||
267 | ret = sk_chk_filter(filter->insns, filter->len); | ||
268 | if (ret) | ||
269 | goto fail; | ||
270 | |||
271 | /* Check and rewrite the fprog for seccomp use */ | ||
272 | ret = seccomp_check_filter(filter->insns, filter->len); | ||
273 | if (ret) | ||
274 | goto fail; | ||
275 | |||
276 | /* | ||
277 | * If there is an existing filter, make it the prev and don't drop its | ||
278 | * task reference. | ||
279 | */ | ||
280 | filter->prev = current->seccomp.filter; | ||
281 | current->seccomp.filter = filter; | ||
282 | return 0; | ||
283 | fail: | ||
284 | kfree(filter); | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | ||
290 | * @user_filter: pointer to the user data containing a sock_fprog. | ||
291 | * | ||
292 | * Returns 0 on success and non-zero otherwise. | ||
293 | */ | ||
294 | long seccomp_attach_user_filter(char __user *user_filter) | ||
295 | { | ||
296 | struct sock_fprog fprog; | ||
297 | long ret = -EFAULT; | ||
298 | |||
299 | #ifdef CONFIG_COMPAT | ||
300 | if (is_compat_task()) { | ||
301 | struct compat_sock_fprog fprog32; | ||
302 | if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) | ||
303 | goto out; | ||
304 | fprog.len = fprog32.len; | ||
305 | fprog.filter = compat_ptr(fprog32.filter); | ||
306 | } else /* falls through to the if below. */ | ||
307 | #endif | ||
308 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | ||
309 | goto out; | ||
310 | ret = seccomp_attach_filter(&fprog); | ||
311 | out: | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | ||
316 | void get_seccomp_filter(struct task_struct *tsk) | ||
317 | { | ||
318 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
319 | if (!orig) | ||
320 | return; | ||
321 | /* Reference count is bounded by the number of total processes. */ | ||
322 | atomic_inc(&orig->usage); | ||
323 | } | ||
324 | |||
325 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | ||
326 | void put_seccomp_filter(struct task_struct *tsk) | ||
327 | { | ||
328 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
329 | /* Clean up single-reference branches iteratively. */ | ||
330 | while (orig && atomic_dec_and_test(&orig->usage)) { | ||
331 | struct seccomp_filter *freeme = orig; | ||
332 | orig = orig->prev; | ||
333 | kfree(freeme); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * seccomp_send_sigsys - signals the task to allow in-process syscall emulation | ||
339 | * @syscall: syscall number to send to userland | ||
340 | * @reason: filter-supplied reason code to send to userland (via si_errno) | ||
341 | * | ||
342 | * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. | ||
343 | */ | ||
344 | static void seccomp_send_sigsys(int syscall, int reason) | ||
345 | { | ||
346 | struct siginfo info; | ||
347 | memset(&info, 0, sizeof(info)); | ||
348 | info.si_signo = SIGSYS; | ||
349 | info.si_code = SYS_SECCOMP; | ||
350 | info.si_call_addr = (void __user *)KSTK_EIP(current); | ||
351 | info.si_errno = reason; | ||
352 | info.si_arch = syscall_get_arch(current, task_pt_regs(current)); | ||
353 | info.si_syscall = syscall; | ||
354 | force_sig_info(SIGSYS, &info, current); | ||
355 | } | ||
356 | #endif /* CONFIG_SECCOMP_FILTER */ | ||
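
On the receiving side, a SECCOMP_RET_TRAP verdict arrives as a SIGSYS whose siginfo carries the fields filled in by seccomp_send_sigsys() above (and copied out by the __SI_SYS case added to copy_siginfo_to_user() later in this series). A userspace sketch, assuming a libc/uapi that already exposes SYS_SECCOMP and the new si_syscall field:

#include <signal.h>

static volatile sig_atomic_t last_denied_syscall = -1;

static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
{
	if (info->si_code == SYS_SECCOMP)		/* set by seccomp_send_sigsys() */
		last_denied_syscall = info->si_syscall;	/* si_errno holds SECCOMP_RET_DATA */
}

static int install_sigsys_handler(void)
{
	struct sigaction act = {
		.sa_sigaction	= sigsys_handler,
		.sa_flags	= SA_SIGINFO,
	};
	return sigaction(SIGSYS, &act, NULL);
}
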
16 | 357 | ||
17 | /* | 358 | /* |
18 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 359 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { | |||
31 | }; | 372 | }; |
32 | #endif | 373 | #endif |
33 | 374 | ||
34 | void __secure_computing(int this_syscall) | 375 | int __secure_computing(int this_syscall) |
35 | { | 376 | { |
36 | int mode = current->seccomp.mode; | 377 | int mode = current->seccomp.mode; |
37 | int * syscall; | 378 | int exit_sig = 0; |
379 | int *syscall; | ||
380 | u32 ret; | ||
38 | 381 | ||
39 | switch (mode) { | 382 | switch (mode) { |
40 | case 1: | 383 | case SECCOMP_MODE_STRICT: |
41 | syscall = mode1_syscalls; | 384 | syscall = mode1_syscalls; |
42 | #ifdef CONFIG_COMPAT | 385 | #ifdef CONFIG_COMPAT |
43 | if (is_compat_task()) | 386 | if (is_compat_task()) |
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) | |||
45 | #endif | 388 | #endif |
46 | do { | 389 | do { |
47 | if (*syscall == this_syscall) | 390 | if (*syscall == this_syscall) |
48 | return; | 391 | return 0; |
49 | } while (*++syscall); | 392 | } while (*++syscall); |
393 | exit_sig = SIGKILL; | ||
394 | ret = SECCOMP_RET_KILL; | ||
395 | break; | ||
396 | #ifdef CONFIG_SECCOMP_FILTER | ||
397 | case SECCOMP_MODE_FILTER: { | ||
398 | int data; | ||
399 | ret = seccomp_run_filters(this_syscall); | ||
400 | data = ret & SECCOMP_RET_DATA; | ||
401 | ret &= SECCOMP_RET_ACTION; | ||
402 | switch (ret) { | ||
403 | case SECCOMP_RET_ERRNO: | ||
404 | /* Set the low-order 16 bits as an errno. */ | ||
405 | syscall_set_return_value(current, task_pt_regs(current), | ||
406 | -data, 0); | ||
407 | goto skip; | ||
408 | case SECCOMP_RET_TRAP: | ||
409 | /* Show the handler the original registers. */ | ||
410 | syscall_rollback(current, task_pt_regs(current)); | ||
411 | /* Let the filter pass back 16 bits of data. */ | ||
412 | seccomp_send_sigsys(this_syscall, data); | ||
413 | goto skip; | ||
414 | case SECCOMP_RET_TRACE: | ||
415 | /* Skip these calls if there is no tracer. */ | ||
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | ||
417 | goto skip; | ||
418 | /* Allow the BPF to provide the event message */ | ||
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
420 | /* | ||
421 | * The delivery of a fatal signal during event | ||
422 | * notification may silently skip tracer notification. | ||
423 | * Terminating the task now avoids executing a system | ||
424 | * call that may not be intended. | ||
425 | */ | ||
426 | if (fatal_signal_pending(current)) | ||
427 | break; | ||
428 | return 0; | ||
429 | case SECCOMP_RET_ALLOW: | ||
430 | return 0; | ||
431 | case SECCOMP_RET_KILL: | ||
432 | default: | ||
433 | break; | ||
434 | } | ||
435 | exit_sig = SIGSYS; | ||
50 | break; | 436 | break; |
437 | } | ||
438 | #endif | ||
51 | default: | 439 | default: |
52 | BUG(); | 440 | BUG(); |
53 | } | 441 | } |
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) | |||
55 | #ifdef SECCOMP_DEBUG | 443 | #ifdef SECCOMP_DEBUG |
56 | dump_stack(); | 444 | dump_stack(); |
57 | #endif | 445 | #endif |
58 | audit_seccomp(this_syscall); | 446 | audit_seccomp(this_syscall, exit_sig, ret); |
59 | do_exit(SIGKILL); | 447 | do_exit(exit_sig); |
448 | #ifdef CONFIG_SECCOMP_FILTER | ||
449 | skip: | ||
450 | audit_seccomp(this_syscall, exit_sig, ret); | ||
451 | #endif | ||
452 | return -1; | ||
60 | } | 453 | } |
61 | 454 | ||
62 | long prctl_get_seccomp(void) | 455 | long prctl_get_seccomp(void) |
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void) | |||
64 | return current->seccomp.mode; | 457 | return current->seccomp.mode; |
65 | } | 458 | } |
66 | 459 | ||
67 | long prctl_set_seccomp(unsigned long seccomp_mode) | 460 | /** |
461 | * prctl_set_seccomp: configures current->seccomp.mode | ||
462 | * @seccomp_mode: requested mode to use | ||
463 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
464 | * | ||
465 | * This function may be called repeatedly with a @seccomp_mode of | ||
466 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | ||
467 | * successfully installed will be evaluated (in reverse order) for each system | ||
468 | * call the task makes. | ||
469 | * | ||
470 | * Once current->seccomp.mode is non-zero, it may not be changed. | ||
471 | * | ||
472 | * Returns 0 on success or -EINVAL on failure. | ||
473 | */ | ||
474 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | ||
68 | { | 475 | { |
69 | long ret; | 476 | long ret = -EINVAL; |
70 | 477 | ||
71 | /* can set it only once to be even more secure */ | 478 | if (current->seccomp.mode && |
72 | ret = -EPERM; | 479 | current->seccomp.mode != seccomp_mode) |
73 | if (unlikely(current->seccomp.mode)) | ||
74 | goto out; | 480 | goto out; |
75 | 481 | ||
76 | ret = -EINVAL; | 482 | switch (seccomp_mode) { |
77 | if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { | 483 | case SECCOMP_MODE_STRICT: |
78 | current->seccomp.mode = seccomp_mode; | 484 | ret = 0; |
79 | set_thread_flag(TIF_SECCOMP); | ||
80 | #ifdef TIF_NOTSC | 485 | #ifdef TIF_NOTSC |
81 | disable_TSC(); | 486 | disable_TSC(); |
82 | #endif | 487 | #endif |
83 | ret = 0; | 488 | break; |
489 | #ifdef CONFIG_SECCOMP_FILTER | ||
490 | case SECCOMP_MODE_FILTER: | ||
491 | ret = seccomp_attach_user_filter(filter); | ||
492 | if (ret) | ||
493 | goto out; | ||
494 | break; | ||
495 | #endif | ||
496 | default: | ||
497 | goto out; | ||
84 | } | 498 | } |
85 | 499 | ||
86 | out: | 500 | current->seccomp.mode = seccomp_mode; |
501 | set_thread_flag(TIF_SECCOMP); | ||
502 | out: | ||
87 | return ret; | 503 | return ret; |
88 | } | 504 | } |
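
For completeness, a userspace sketch of driving the extended prctl_set_seccomp() above: install a one-rule filter that fails a chosen syscall with EPERM and allows everything else. Constants are assumed to come from the new <linux/seccomp.h> plus the existing <linux/filter.h>; PR_SET_NO_NEW_PRIVS is the companion prctl that satisfies the no_new_privs check in seccomp_attach_filter() for unprivileged callers. A production filter would also validate seccomp_data.arch before trusting the syscall number.

#include <stddef.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int deny_one_syscall(int nr)
{
	struct sock_filter insns[] = {
		/* A = seccomp_data.nr */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
		/* if (A == nr) return ERRNO|EPERM; else return ALLOW */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | 1 /* EPERM */),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};

	/* lets an unprivileged task pass the CAP_SYS_ADMIN/no_new_privs check */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}
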
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c3..4567fc020fe3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable); | |||
118 | * down_trylock - try to acquire the semaphore, without waiting | 118 | * down_trylock - try to acquire the semaphore, without waiting |
119 | * @sem: the semaphore to be acquired | 119 | * @sem: the semaphore to be acquired |
120 | * | 120 | * |
121 | * Try to acquire the semaphore atomically. Returns 0 if the mutex has | 121 | * Try to acquire the semaphore atomically. Returns 0 if the semaphore has |
122 | * been acquired successfully or 1 if it cannot be acquired. | 122 | * been acquired successfully or 1 if it cannot be acquired. |
123 | * | 123 | * |
124 | * NOTE: This return value is inverted from both spin_trylock and | 124 | * NOTE: This return value is inverted from both spin_trylock and |
diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d0..677102789cf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/pid_namespace.h> | 29 | #include <linux/pid_namespace.h> |
30 | #include <linux/nsproxy.h> | 30 | #include <linux/nsproxy.h> |
31 | #include <linux/user_namespace.h> | 31 | #include <linux/user_namespace.h> |
32 | #include <linux/uprobes.h> | ||
32 | #define CREATE_TRACE_POINTS | 33 | #define CREATE_TRACE_POINTS |
33 | #include <trace/events/signal.h> | 34 | #include <trace/events/signal.h> |
34 | 35 | ||
@@ -160,7 +161,7 @@ void recalc_sigpending(void) | |||
160 | 161 | ||
161 | #define SYNCHRONOUS_MASK \ | 162 | #define SYNCHRONOUS_MASK \ |
162 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | 163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ |
163 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | 164 | sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) |
164 | 165 | ||
165 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
166 | { | 167 | { |
@@ -767,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t) | |||
767 | const struct cred *cred = current_cred(); | 768 | const struct cred *cred = current_cred(); |
768 | const struct cred *tcred = __task_cred(t); | 769 | const struct cred *tcred = __task_cred(t); |
769 | 770 | ||
770 | if (cred->user->user_ns == tcred->user->user_ns && | 771 | if (uid_eq(cred->euid, tcred->suid) || |
771 | (cred->euid == tcred->suid || | 772 | uid_eq(cred->euid, tcred->uid) || |
772 | cred->euid == tcred->uid || | 773 | uid_eq(cred->uid, tcred->suid) || |
773 | cred->uid == tcred->suid || | 774 | uid_eq(cred->uid, tcred->uid)) |
774 | cred->uid == tcred->uid)) | ||
775 | return 1; | 775 | return 1; |
776 | 776 | ||
777 | if (ns_capable(tcred->user->user_ns, CAP_KILL)) | 777 | if (ns_capable(tcred->user_ns, CAP_KILL)) |
778 | return 1; | 778 | return 1; |
779 | 779 | ||
780 | return 0; | 780 | return 0; |
@@ -1020,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig) | |||
1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | 1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); |
1021 | } | 1021 | } |
1022 | 1022 | ||
1023 | /* | ||
1024 | * map the uid in struct cred into user namespace *ns | ||
1025 | */ | ||
1026 | static inline uid_t map_cred_ns(const struct cred *cred, | ||
1027 | struct user_namespace *ns) | ||
1028 | { | ||
1029 | return user_ns_map_uid(ns, cred, cred->uid); | ||
1030 | } | ||
1031 | |||
1032 | #ifdef CONFIG_USER_NS | 1023 | #ifdef CONFIG_USER_NS |
1033 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1024 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
1034 | { | 1025 | { |
@@ -1038,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str | |||
1038 | if (SI_FROMKERNEL(info)) | 1029 | if (SI_FROMKERNEL(info)) |
1039 | return; | 1030 | return; |
1040 | 1031 | ||
1041 | info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), | 1032 | rcu_read_lock(); |
1042 | current_cred(), info->si_uid); | 1033 | info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), |
1034 | make_kuid(current_user_ns(), info->si_uid)); | ||
1035 | rcu_read_unlock(); | ||
1043 | } | 1036 | } |
1044 | #else | 1037 | #else |
1045 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1038 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
@@ -1106,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1106 | q->info.si_code = SI_USER; | 1099 | q->info.si_code = SI_USER; |
1107 | q->info.si_pid = task_tgid_nr_ns(current, | 1100 | q->info.si_pid = task_tgid_nr_ns(current, |
1108 | task_active_pid_ns(t)); | 1101 | task_active_pid_ns(t)); |
1109 | q->info.si_uid = current_uid(); | 1102 | q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1110 | break; | 1103 | break; |
1111 | case (unsigned long) SEND_SIG_PRIV: | 1104 | case (unsigned long) SEND_SIG_PRIV: |
1112 | q->info.si_signo = sig; | 1105 | q->info.si_signo = sig; |
@@ -1387,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred, | |||
1387 | struct task_struct *target) | 1380 | struct task_struct *target) |
1388 | { | 1381 | { |
1389 | const struct cred *pcred = __task_cred(target); | 1382 | const struct cred *pcred = __task_cred(target); |
1390 | if (cred->user_ns != pcred->user_ns) | 1383 | if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && |
1391 | return 0; | 1384 | !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) |
1392 | if (cred->euid != pcred->suid && cred->euid != pcred->uid && | ||
1393 | cred->uid != pcred->suid && cred->uid != pcred->uid) | ||
1394 | return 0; | 1385 | return 0; |
1395 | return 1; | 1386 | return 1; |
1396 | } | 1387 | } |
@@ -1665,21 +1656,20 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1665 | info.si_signo = sig; | 1656 | info.si_signo = sig; |
1666 | info.si_errno = 0; | 1657 | info.si_errno = 0; |
1667 | /* | 1658 | /* |
1668 | * we are under tasklist_lock here so our parent is tied to | 1659 | * We are under tasklist_lock here so our parent is tied to |
1669 | * us and cannot exit and release its namespace. | 1660 | * us and cannot change. |
1670 | * | 1661 | * |
1671 | * the only it can is to switch its nsproxy with sys_unshare, | 1662 | * task_active_pid_ns will always return the same pid namespace |
1672 | * bu uncharing pid namespaces is not allowed, so we'll always | 1663 | * until a task passes through release_task. |
1673 | * see relevant namespace | ||
1674 | * | 1664 | * |
1675 | * write_lock() currently calls preempt_disable() which is the | 1665 | * write_lock() currently calls preempt_disable() which is the |
1676 | * same as rcu_read_lock(), but according to Oleg, this is not | 1666 | * same as rcu_read_lock(), but according to Oleg, this is not |
1677 | * correct to rely on this | 1667 | * correct to rely on this |
1678 | */ | 1668 | */ |
1679 | rcu_read_lock(); | 1669 | rcu_read_lock(); |
1680 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); | 1670 | info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); |
1681 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1671 | info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), |
1682 | task_cred_xxx(tsk->parent, user_ns)); | 1672 | task_uid(tsk)); |
1683 | rcu_read_unlock(); | 1673 | rcu_read_unlock(); |
1684 | 1674 | ||
1685 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); | 1675 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); |
@@ -1762,8 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1762 | */ | 1752 | */ |
1763 | rcu_read_lock(); | 1753 | rcu_read_lock(); |
1764 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1754 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
1765 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1755 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); |
1766 | task_cred_xxx(parent, user_ns)); | ||
1767 | rcu_read_unlock(); | 1756 | rcu_read_unlock(); |
1768 | 1757 | ||
1769 | info.si_utime = cputime_to_clock_t(tsk->utime); | 1758 | info.si_utime = cputime_to_clock_t(tsk->utime); |
@@ -1973,7 +1962,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
1973 | info.si_signo = signr; | 1962 | info.si_signo = signr; |
1974 | info.si_code = exit_code; | 1963 | info.si_code = exit_code; |
1975 | info.si_pid = task_pid_vnr(current); | 1964 | info.si_pid = task_pid_vnr(current); |
1976 | info.si_uid = current_uid(); | 1965 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1977 | 1966 | ||
1978 | /* Let the debugger run. */ | 1967 | /* Let the debugger run. */ |
1979 | ptrace_stop(exit_code, why, 1, &info); | 1968 | ptrace_stop(exit_code, why, 1, &info); |
@@ -2181,8 +2170,8 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
2181 | info->si_code = SI_USER; | 2170 | info->si_code = SI_USER; |
2182 | rcu_read_lock(); | 2171 | rcu_read_lock(); |
2183 | info->si_pid = task_pid_vnr(current->parent); | 2172 | info->si_pid = task_pid_vnr(current->parent); |
2184 | info->si_uid = map_cred_ns(__task_cred(current->parent), | 2173 | info->si_uid = from_kuid_munged(current_user_ns(), |
2185 | current_user_ns()); | 2174 | task_uid(current->parent)); |
2186 | rcu_read_unlock(); | 2175 | rcu_read_unlock(); |
2187 | } | 2176 | } |
2188 | 2177 | ||
@@ -2202,6 +2191,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2202 | struct signal_struct *signal = current->signal; | 2191 | struct signal_struct *signal = current->signal; |
2203 | int signr; | 2192 | int signr; |
2204 | 2193 | ||
2194 | if (unlikely(uprobe_deny_signal())) | ||
2195 | return 0; | ||
2196 | |||
2205 | relock: | 2197 | relock: |
2206 | /* | 2198 | /* |
2207 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | 2199 | * We'll jump back here after any time we were stopped in TASK_STOPPED. |
@@ -2376,24 +2368,34 @@ relock: | |||
2376 | } | 2368 | } |
2377 | 2369 | ||
2378 | /** | 2370 | /** |
2379 | * block_sigmask - add @ka's signal mask to current->blocked | 2371 | * signal_delivered - |
2380 | * @ka: action for @signr | 2372 | * @sig: number of signal being delivered |
2381 | * @signr: signal that has been successfully delivered | 2373 | * @info: siginfo_t of signal being delivered |
2374 | * @ka: sigaction setting that chose the handler | ||
2375 | * @regs: user register state | ||
2376 | * @stepping: nonzero if debugger single-step or block-step in use | ||
2382 | * | 2377 | * |
2383 | * This function should be called when a signal has succesfully been | 2378 | * This function should be called when a signal has succesfully been |
2384 | * delivered. It adds the mask of signals for @ka to current->blocked | 2379 | * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask |
2385 | * so that they are blocked during the execution of the signal | 2380 | * is always blocked, and the signal itself is blocked unless %SA_NODEFER |
2386 | * handler. In addition, @signr will be blocked unless %SA_NODEFER is | 2381 | * is set in @ka->sa.sa_flags. Tracing is notified. |
2387 | * set in @ka->sa.sa_flags. | ||
2388 | */ | 2382 | */ |
2389 | void block_sigmask(struct k_sigaction *ka, int signr) | 2383 | void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, |
2384 | struct pt_regs *regs, int stepping) | ||
2390 | { | 2385 | { |
2391 | sigset_t blocked; | 2386 | sigset_t blocked; |
2392 | 2387 | ||
2388 | /* A signal was successfully delivered, and the | ||
2389 | saved sigmask was stored on the signal frame, | ||
2390 | and will be restored by sigreturn. So we can | ||
2391 | simply clear the restore sigmask flag. */ | ||
2392 | clear_restore_sigmask(); | ||
2393 | |||
2393 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); | 2394 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); |
2394 | if (!(ka->sa.sa_flags & SA_NODEFER)) | 2395 | if (!(ka->sa.sa_flags & SA_NODEFER)) |
2395 | sigaddset(&blocked, signr); | 2396 | sigaddset(&blocked, sig); |
2396 | set_current_blocked(&blocked); | 2397 | set_current_blocked(&blocked); |
2398 | tracehook_signal_handler(sig, info, ka, regs, stepping); | ||
2397 | } | 2399 | } |
2398 | 2400 | ||
2399 | /* | 2401 | /* |
@@ -2526,7 +2528,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) | |||
2526 | * It is wrong to change ->blocked directly, this helper should be used | 2528 | * It is wrong to change ->blocked directly, this helper should be used |
2527 | * to ensure the process can't miss a shared signal we are going to block. | 2529 | * to ensure the process can't miss a shared signal we are going to block. |
2528 | */ | 2530 | */ |
2529 | void set_current_blocked(const sigset_t *newset) | 2531 | void set_current_blocked(sigset_t *newset) |
2532 | { | ||
2533 | struct task_struct *tsk = current; | ||
2534 | sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2535 | spin_lock_irq(&tsk->sighand->siglock); | ||
2536 | __set_task_blocked(tsk, newset); | ||
2537 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2538 | } | ||
2539 | |||
2540 | void __set_current_blocked(const sigset_t *newset) | ||
2530 | { | 2541 | { |
2531 | struct task_struct *tsk = current; | 2542 | struct task_struct *tsk = current; |
2532 | 2543 | ||
@@ -2566,7 +2577,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | |||
2566 | return -EINVAL; | 2577 | return -EINVAL; |
2567 | } | 2578 | } |
2568 | 2579 | ||
2569 | set_current_blocked(&newset); | 2580 | __set_current_blocked(&newset); |
2570 | return 0; | 2581 | return 0; |
2571 | } | 2582 | } |
2572 | 2583 | ||
@@ -2706,6 +2717,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2706 | err |= __put_user(from->si_uid, &to->si_uid); | 2717 | err |= __put_user(from->si_uid, &to->si_uid); |
2707 | err |= __put_user(from->si_ptr, &to->si_ptr); | 2718 | err |= __put_user(from->si_ptr, &to->si_ptr); |
2708 | break; | 2719 | break; |
2720 | #ifdef __ARCH_SIGSYS | ||
2721 | case __SI_SYS: | ||
2722 | err |= __put_user(from->si_call_addr, &to->si_call_addr); | ||
2723 | err |= __put_user(from->si_syscall, &to->si_syscall); | ||
2724 | err |= __put_user(from->si_arch, &to->si_arch); | ||
2725 | break; | ||
2726 | #endif | ||
2709 | default: /* this is just in case for now ... */ | 2727 | default: /* this is just in case for now ... */ |
2710 | err |= __put_user(from->si_pid, &to->si_pid); | 2728 | err |= __put_user(from->si_pid, &to->si_pid); |
2711 | err |= __put_user(from->si_uid, &to->si_uid); | 2729 | err |= __put_user(from->si_uid, &to->si_uid); |
@@ -2828,7 +2846,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | |||
2828 | info.si_errno = 0; | 2846 | info.si_errno = 0; |
2829 | info.si_code = SI_USER; | 2847 | info.si_code = SI_USER; |
2830 | info.si_pid = task_tgid_vnr(current); | 2848 | info.si_pid = task_tgid_vnr(current); |
2831 | info.si_uid = current_uid(); | 2849 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2832 | 2850 | ||
2833 | return kill_something_info(sig, &info, pid); | 2851 | return kill_something_info(sig, &info, pid); |
2834 | } | 2852 | } |
@@ -2871,7 +2889,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) | |||
2871 | info.si_errno = 0; | 2889 | info.si_errno = 0; |
2872 | info.si_code = SI_TKILL; | 2890 | info.si_code = SI_TKILL; |
2873 | info.si_pid = task_tgid_vnr(current); | 2891 | info.si_pid = task_tgid_vnr(current); |
2874 | info.si_uid = current_uid(); | 2892 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2875 | 2893 | ||
2876 | return do_send_specific(tgid, pid, sig, &info); | 2894 | return do_send_specific(tgid, pid, sig, &info); |
2877 | } | 2895 | } |
@@ -3133,7 +3151,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
3133 | return -EINVAL; | 3151 | return -EINVAL; |
3134 | } | 3152 | } |
3135 | 3153 | ||
3136 | set_current_blocked(&new_blocked); | 3154 | __set_current_blocked(&new_blocked); |
3137 | } | 3155 | } |
3138 | 3156 | ||
3139 | if (oset) { | 3157 | if (oset) { |
@@ -3197,7 +3215,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) | |||
3197 | int old = current->blocked.sig[0]; | 3215 | int old = current->blocked.sig[0]; |
3198 | sigset_t newset; | 3216 | sigset_t newset; |
3199 | 3217 | ||
3200 | siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); | ||
3201 | set_current_blocked(&newset); | 3218 | set_current_blocked(&newset); |
3202 | 3219 | ||
3203 | return old; | 3220 | return old; |
@@ -3236,6 +3253,17 @@ SYSCALL_DEFINE0(pause) | |||
3236 | 3253 | ||
3237 | #endif | 3254 | #endif |
3238 | 3255 | ||
3256 | int sigsuspend(sigset_t *set) | ||
3257 | { | ||
3258 | current->saved_sigmask = current->blocked; | ||
3259 | set_current_blocked(set); | ||
3260 | |||
3261 | current->state = TASK_INTERRUPTIBLE; | ||
3262 | schedule(); | ||
3263 | set_restore_sigmask(); | ||
3264 | return -ERESTARTNOHAND; | ||
3265 | } | ||
3266 | |||
3239 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | 3267 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND |
3240 | /** | 3268 | /** |
3241 | * sys_rt_sigsuspend - replace the signal mask for a value with the | 3269 | * sys_rt_sigsuspend - replace the signal mask for a value with the |
@@ -3253,15 +3281,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
3253 | 3281 | ||
3254 | if (copy_from_user(&newset, unewset, sizeof(newset))) | 3282 | if (copy_from_user(&newset, unewset, sizeof(newset))) |
3255 | return -EFAULT; | 3283 | return -EFAULT; |
3256 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 3284 | return sigsuspend(&newset); |
3257 | |||
3258 | current->saved_sigmask = current->blocked; | ||
3259 | set_current_blocked(&newset); | ||
3260 | |||
3261 | current->state = TASK_INTERRUPTIBLE; | ||
3262 | schedule(); | ||
3263 | set_restore_sigmask(); | ||
3264 | return -ERESTARTNOHAND; | ||
3265 | } | 3285 | } |
3266 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 3286 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ |
3267 | 3287 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf759..d0ae5b24875e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,8 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #include "smpboot.h" | ||
17 | |||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
17 | static struct { | 19 | static struct { |
18 | struct list_head queue; | 20 | struct list_head queue; |
@@ -669,6 +671,8 @@ void __init smp_init(void) | |||
669 | { | 671 | { |
670 | unsigned int cpu; | 672 | unsigned int cpu; |
671 | 673 | ||
674 | idle_threads_init(); | ||
675 | |||
672 | /* FIXME: This should be done in userspace --RR */ | 676 | /* FIXME: This should be done in userspace --RR */ |
673 | for_each_present_cpu(cpu) { | 677 | for_each_present_cpu(cpu) { |
674 | if (num_online_cpus() >= setup_max_cpus) | 678 | if (num_online_cpus() >= setup_max_cpus) |
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
791 | } | 795 | } |
792 | } | 796 | } |
793 | EXPORT_SYMBOL(on_each_cpu_cond); | 797 | EXPORT_SYMBOL(on_each_cpu_cond); |
798 | |||
799 | static void do_nothing(void *unused) | ||
800 | { | ||
801 | } | ||
802 | |||
803 | /** | ||
804 | * kick_all_cpus_sync - Force all cpus out of idle | ||
805 | * | ||
806 | * Used to synchronize the update of pm_idle function pointer. It's | ||
807 | * called after the pointer is updated and returns after the dummy | ||
808 | * callback function has been executed on all cpus. The execution of | ||
809 | * the function can only happen on the remote cpus after they have | ||
810 | * left the idle function which had been called via pm_idle function | ||
811 | * pointer. So it's guaranteed that nothing uses the previous pointer | ||
812 | * anymore. | ||
813 | */ | ||
814 | void kick_all_cpus_sync(void) | ||
815 | { | ||
816 | /* Make sure the change is visible before we kick the cpus */ | ||
817 | smp_mb(); | ||
818 | smp_call_function(do_nothing, NULL, 1); | ||
819 | } | ||
820 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | ||
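
As the kerneldoc above describes, kick_all_cpus_sync() is a publish-then-drain pattern: update the pointer first, then make sure no CPU can still be executing through the old value. A minimal sketch, using pm_idle as the motivating example named in the comment (the declaration and the exact caller live in arch code):

extern void (*pm_idle)(void);	/* arch-provided idle hook, per the kerneldoc above */

void install_new_idle(void (*new_idle)(void))
{
	pm_idle = new_idle;	/* publish the new callback */
	kick_all_cpus_sync();	/* returns only after every CPU has left the old idle routine */
}
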
diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 000000000000..98f60c5caa1b --- /dev/null +++ b/kernel/smpboot.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Common SMP CPU bringup/teardown functions | ||
3 | */ | ||
4 | #include <linux/err.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/percpu.h> | ||
9 | |||
10 | #include "smpboot.h" | ||
11 | |||
12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
13 | /* | ||
14 | * For the hotplug case we keep the task structs around and reuse | ||
15 | * them. | ||
16 | */ | ||
17 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); | ||
18 | |||
19 | struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) | ||
20 | { | ||
21 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
22 | |||
23 | if (!tsk) | ||
24 | return ERR_PTR(-ENOMEM); | ||
25 | init_idle(tsk, cpu); | ||
26 | return tsk; | ||
27 | } | ||
28 | |||
29 | void __init idle_thread_set_boot_cpu(void) | ||
30 | { | ||
31 | per_cpu(idle_threads, smp_processor_id()) = current; | ||
32 | } | ||
33 | |||
34 | /** | ||
35 | * idle_init - Initialize the idle thread for a cpu | ||
36 | * @cpu: The cpu for which the idle thread should be initialized | ||
37 | * | ||
38 | * Creates the thread if it does not exist. | ||
39 | */ | ||
40 | static inline void idle_init(unsigned int cpu) | ||
41 | { | ||
42 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
43 | |||
44 | if (!tsk) { | ||
45 | tsk = fork_idle(cpu); | ||
46 | if (IS_ERR(tsk)) | ||
47 | pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); | ||
48 | else | ||
49 | per_cpu(idle_threads, cpu) = tsk; | ||
50 | } | ||
51 | } | ||
52 | |||
53 | /** | ||
54 | * idle_threads_init - Initialize idle threads for all cpus | ||
55 | */ | ||
56 | void __init idle_threads_init(void) | ||
57 | { | ||
58 | unsigned int cpu, boot_cpu; | ||
59 | |||
60 | boot_cpu = smp_processor_id(); | ||
61 | |||
62 | for_each_possible_cpu(cpu) { | ||
63 | if (cpu != boot_cpu) | ||
64 | idle_init(cpu); | ||
65 | } | ||
66 | } | ||
67 | #endif | ||
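
The intended consumer of idle_thread_get() is the generic hotplug path rather than this file: with CONFIG_GENERIC_SMP_IDLE_THREAD the idle task is forked once and handed back to the architecture on every subsequent online. A sketch, assuming the same series' change that adds the idle-task argument to __cpu_up():

static int bringup_cpu_sketch(unsigned int cpu)
{
	struct task_struct *idle = idle_thread_get(cpu);	/* cached across hotplug */

	if (IS_ERR(idle))
		return PTR_ERR(idle);
	return __cpu_up(cpu, idle);	/* arch starts the CPU on this idle task */
}
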
diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 000000000000..80c0acfb8472 --- /dev/null +++ b/kernel/smpboot.h | |||
@@ -0,0 +1,18 @@ | |||
1 | #ifndef SMPBOOT_H | ||
2 | #define SMPBOOT_H | ||
3 | |||
4 | struct task_struct; | ||
5 | |||
6 | int smpboot_prepare(unsigned int cpu); | ||
7 | |||
8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
9 | struct task_struct *idle_thread_get(unsigned int cpu); | ||
10 | void idle_thread_set_boot_cpu(void); | ||
11 | void idle_threads_init(void); | ||
12 | #else | ||
13 | static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } | ||
14 | static inline void idle_thread_set_boot_cpu(void) { } | ||
15 | static inline void idle_threads_init(void) { } | ||
16 | #endif | ||
17 | |||
18 | #endif | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f4..2095be3318d5 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -34,10 +34,77 @@ | |||
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | /* | ||
38 | * Initialize an rcu_batch structure to empty. | ||
39 | */ | ||
40 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
41 | { | ||
42 | b->head = NULL; | ||
43 | b->tail = &b->head; | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
48 | */ | ||
49 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
50 | { | ||
51 | *b->tail = head; | ||
52 | b->tail = &head->next; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Is the specified rcu_batch structure empty? | ||
57 | */ | ||
58 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
59 | { | ||
60 | return b->tail == &b->head; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Remove the callback at the head of the specified rcu_batch structure | ||
65 | * and return a pointer to it, or return NULL if the structure is empty. | ||
66 | */ | ||
67 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
68 | { | ||
69 | struct rcu_head *head; | ||
70 | |||
71 | if (rcu_batch_empty(b)) | ||
72 | return NULL; | ||
73 | |||
74 | head = b->head; | ||
75 | b->head = head->next; | ||
76 | if (b->tail == &head->next) | ||
77 | rcu_batch_init(b); | ||
78 | |||
79 | return head; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
84 | * the structure specified by "to". | ||
85 | */ | ||
86 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
87 | { | ||
88 | if (!rcu_batch_empty(from)) { | ||
89 | *to->tail = from->head; | ||
90 | to->tail = from->tail; | ||
91 | rcu_batch_init(from); | ||
92 | } | ||
93 | } | ||
94 | |||
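
The four helpers above implement a singly linked, tail-pointer queue; the per-srcu_struct batches initialized in init_srcu_struct_fields() earlier form a pipeline that callbacks move through as grace-period checks complete. A compressed illustration of that lifetime (the real driver is process_srcu(), declared just below; this collapses its stages into one function):

static void rcu_batch_pipeline_sketch(struct srcu_struct *sp, struct rcu_head *head)
{
	rcu_batch_queue(&sp->batch_queue, head);		/* newly queued callback */
	rcu_batch_move(&sp->batch_check0, &sp->batch_queue);	/* first readers check */
	rcu_batch_move(&sp->batch_check1, &sp->batch_check0);	/* second readers check */
	rcu_batch_move(&sp->batch_done, &sp->batch_check1);	/* grace period complete */
	while ((head = rcu_batch_dequeue(&sp->batch_done)) != NULL)
		head->func(head);				/* invoke the callback */
}
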
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 98 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | 99 | { |
39 | sp->completed = 0; | 100 | sp->completed = 0; |
40 | mutex_init(&sp->mutex); | 101 | spin_lock_init(&sp->queue_lock); |
102 | sp->running = false; | ||
103 | rcu_batch_init(&sp->batch_queue); | ||
104 | rcu_batch_init(&sp->batch_check0); | ||
105 | rcu_batch_init(&sp->batch_check1); | ||
106 | rcu_batch_init(&sp->batch_done); | ||
107 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | 108 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | 109 | return sp->per_cpu_ref ? 0 : -ENOMEM; |
43 | } | 110 | } |
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
73 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 140 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
74 | 141 | ||
75 | /* | 142 | /* |
76 | * srcu_readers_active_idx -- returns approximate number of readers | 143 | * Returns approximate total of the readers' ->seq[] values for the |
77 | * active on the specified rank of per-CPU counters. | 144 | * rank of per-CPU counters specified by idx. |
78 | */ | 145 | */ |
146 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
147 | { | ||
148 | int cpu; | ||
149 | unsigned long sum = 0; | ||
150 | unsigned long t; | ||
79 | 151 | ||
80 | static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | 152 | for_each_possible_cpu(cpu) { |
153 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
154 | sum += t; | ||
155 | } | ||
156 | return sum; | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Returns approximate number of readers active on the specified rank | ||
161 | * of the per-CPU ->c[] counters. | ||
162 | */ | ||
163 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
81 | { | 164 | { |
82 | int cpu; | 165 | int cpu; |
83 | int sum; | 166 | unsigned long sum = 0; |
167 | unsigned long t; | ||
84 | 168 | ||
85 | sum = 0; | 169 | for_each_possible_cpu(cpu) { |
86 | for_each_possible_cpu(cpu) | 170 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); |
87 | sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; | 171 | sum += t; |
172 | } | ||
88 | return sum; | 173 | return sum; |
89 | } | 174 | } |
90 | 175 | ||
176 | /* | ||
177 | * Return true if the number of pre-existing readers is determined to | ||
178 | * be stably zero. An example unstable zero can occur if the call | ||
179 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
180 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
181 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
182 | * time to sum the array, and might in fact be interrupted or preempted | ||
183 | * partway through the summation. | ||
184 | */ | ||
185 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
186 | { | ||
187 | unsigned long seq; | ||
188 | |||
189 | seq = srcu_readers_seq_idx(sp, idx); | ||
190 | |||
191 | /* | ||
192 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
193 | * __srcu_read_lock(). This pairing ensures that if an | ||
194 | * __srcu_read_lock() increments its counter after the summation | ||
195 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
196 | * critical section will see any changes made prior to the start | ||
197 | * of the current SRCU grace period. | ||
198 | * | ||
199 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
200 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
201 | * must see the increment of ->c[]. | ||
202 | */ | ||
203 | smp_mb(); /* A */ | ||
204 | |||
205 | /* | ||
206 | * Note that srcu_readers_active_idx() can incorrectly return | ||
207 | * zero even though there is a pre-existing reader throughout. | ||
208 | * To see this, suppose that task A is in a very long SRCU | ||
209 | * read-side critical section that started on CPU 0, and that | ||
210 | * no other reader exists, so that the sum of the counters | ||
211 | * is equal to one. Then suppose that task B starts executing | ||
212 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
213 | * task C starts reading on CPU 0, so that its increment is not | ||
214 | * summed, but finishes reading on CPU 2, so that its decrement | ||
215 | * -is- summed. Then when task B completes its sum, it will | ||
216 | * incorrectly get zero, despite the fact that task A has been | ||
217 | * in its SRCU read-side critical section the whole time. | ||
218 | * | ||
219 | * We therefore do a validation step should srcu_readers_active_idx() | ||
220 | * return zero. | ||
221 | */ | ||
222 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * The remainder of this function is the validation step. | ||
227 | * The following smp_mb() D pairs with the smp_mb() C in | ||
228 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
229 | * by srcu_readers_active_idx() above, then any destructive | ||
230 | * operation performed after the grace period will happen after | ||
231 | * the corresponding SRCU read-side critical section. | ||
232 | * | ||
233 | * Note that there can be at most NR_CPUS worth of readers using | ||
234 | * the old index, which is not enough to overflow even a 32-bit | ||
235 | * integer. (Yes, this does mean that systems having more than | ||
236 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
237 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
238 | * Therefore, the only way that the return values of the two | ||
239 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
240 | * no increments of the corresponding rank of ->seq[] counts | ||
241 | * in the interim. But the missed-increment scenario laid out | ||
242 | * above includes an increment of the ->seq[] counter by | ||
243 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
244 | * scenario occurs, the return values from the two calls to | ||
245 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
246 | * step below suffices. | ||
247 | */ | ||
248 | smp_mb(); /* D */ | ||
249 | |||
250 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
251 | } | ||
252 | |||
91 | /** | 253 | /** |
92 | * srcu_readers_active - returns approximate number of readers. | 254 | * srcu_readers_active - returns approximate number of readers. |
93 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 255 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
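The hunk above replaces a single racy sum with a bracketed check: snapshot the monotonically increasing per-CPU entry counters (->seq[]), verify that the active counters (->c[]) sum to zero, then confirm the entry counters did not move. Stripped of the long commentary, the shape of the check is roughly:

    /* Condensed paraphrase of srcu_readers_active_idx_check(), sketch only. */
    static bool readers_stably_zero(struct srcu_struct *sp, int idx)
    {
        unsigned long seq_before = srcu_readers_seq_idx(sp, idx);

        smp_mb();                               /* pairs with smp_mb() B in __srcu_read_lock() */
        if (srcu_readers_active_idx(sp, idx))   /* someone is (or appears to be) inside */
            return false;
        smp_mb();                               /* pairs with smp_mb() C in __srcu_read_unlock() */

        /* No new entries on this idx while we looked, so the zero was stable. */
        return srcu_readers_seq_idx(sp, idx) == seq_before;
    }
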
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | |||
98 | */ | 260 | */ |
99 | static int srcu_readers_active(struct srcu_struct *sp) | 261 | static int srcu_readers_active(struct srcu_struct *sp) |
100 | { | 262 | { |
101 | return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); | 263 | int cpu; |
264 | unsigned long sum = 0; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
268 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
269 | } | ||
270 | return sum; | ||
102 | } | 271 | } |
103 | 272 | ||
104 | /** | 273 | /** |
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
131 | int idx; | 300 | int idx; |
132 | 301 | ||
133 | preempt_disable(); | 302 | preempt_disable(); |
134 | idx = sp->completed & 0x1; | 303 | idx = rcu_dereference_index_check(sp->completed, |
135 | barrier(); /* ensure compiler looks -once- at sp->completed. */ | 304 | rcu_read_lock_sched_held()) & 0x1; |
136 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
137 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 306 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
307 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
138 | preempt_enable(); | 308 | preempt_enable(); |
139 | return idx; | 309 | return idx; |
140 | } | 310 | } |
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
149 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 319 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
150 | { | 320 | { |
151 | preempt_disable(); | 321 | preempt_disable(); |
152 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 322 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
153 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 323 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
154 | preempt_enable(); | 324 | preempt_enable(); |
155 | } | 325 | } |
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 326 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
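For context, smp_mb() B and C order the counter updates against the reader's critical section; the reader pins one side of the counter flip for its whole duration, and unlike rcu_read_lock() it may sleep inside. A typical reader, assuming a struct foo protected by an srcu_struct named my_srcu (illustrative names):

    #include <linux/srcu.h>

    struct foo {
        int value;
    };

    static struct foo __rcu *shared_foo;
    static struct srcu_struct my_srcu;  /* assume init_srcu_struct(&my_srcu) was called */

    static int read_foo_value(void)
    {
        struct foo *p;
        int idx, val;

        idx = srcu_read_lock(&my_srcu);         /* readers may block, unlike rcu_read_lock() */
        p = srcu_dereference(shared_foo, &my_srcu);
        val = p ? p->value : -1;
        srcu_read_unlock(&my_srcu, idx);        /* must pass back the idx obtained above */
        return val;
    }
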
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
163 | * we repeatedly block for 1-millisecond time periods. This approach | 333 | * we repeatedly block for 1-millisecond time periods. This approach |
164 | * has done well in testing, so there is no need for a config parameter. | 334 | * has done well in testing, so there is no need for a config parameter. |
165 | */ | 335 | */ |
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | 336 | #define SRCU_RETRY_CHECK_DELAY 5 |
337 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
338 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
167 | 339 | ||
168 | /* | 340 | /* |
169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 341 | * @@@ Wait until all pre-existing readers complete. Such readers |
342 | * will have used the index specified by "idx". | ||
343 | * The caller must ensure that ->completed is not changed while checking, | ||
344 | * and that idx == (->completed & 1) ^ 1. | ||
170 | */ | 345 | */ |
171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 346 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) |
172 | { | 347 | { |
173 | int idx; | 348 | for (;;) { |
174 | 349 | if (srcu_readers_active_idx_check(sp, idx)) | |
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 350 | return true; |
176 | !lock_is_held(&rcu_bh_lock_map) && | 351 | if (--trycount <= 0) |
177 | !lock_is_held(&rcu_lock_map) && | 352 | return false; |
178 | !lock_is_held(&rcu_sched_lock_map), | 353 | udelay(SRCU_RETRY_CHECK_DELAY); |
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 354 | } |
180 | 355 | } | |
181 | idx = sp->completed; | ||
182 | mutex_lock(&sp->mutex); | ||
183 | 356 | ||
184 | /* | 357 | /* |
185 | * Check to see if someone else did the work for us while we were | 358 | * Increment the ->completed counter so that future SRCU readers will |
186 | * waiting to acquire the lock. We need -two- advances of | 359 | * use the other rank of the ->c[] and ->seq[] arrays. This allows |
187 | * the counter, not just one. If there was but one, we might have | 360 | * us to wait for pre-existing readers in a starvation-free manner. |
188 | * shown up -after- our helper's first synchronize_sched(), thus | 361 | */ |
189 | * having failed to prevent CPU-reordering races with concurrent | 362 | static void srcu_flip(struct srcu_struct *sp) |
190 | * srcu_read_unlock()s on other CPUs (see comment below). So we | 363 | { |
191 | * either (1) wait for two or (2) supply the second ourselves. | 364 | sp->completed++; |
192 | */ | 365 | } |
193 | 366 | ||
194 | if ((sp->completed - idx) >= 2) { | 367 | /* |
195 | mutex_unlock(&sp->mutex); | 368 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
196 | return; | 369 | * initiating grace-period processing if it is not already running. |
370 | */ | ||
371 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
372 | void (*func)(struct rcu_head *head)) | ||
373 | { | ||
374 | unsigned long flags; | ||
375 | |||
376 | head->next = NULL; | ||
377 | head->func = func; | ||
378 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
379 | rcu_batch_queue(&sp->batch_queue, head); | ||
380 | if (!sp->running) { | ||
381 | sp->running = true; | ||
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | ||
197 | } | 383 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
385 | } | ||
386 | EXPORT_SYMBOL_GPL(call_srcu); | ||
198 | 387 | ||
199 | sync_func(); /* Force memory barrier on all CPUs. */ | 388 | struct rcu_synchronize { |
389 | struct rcu_head head; | ||
390 | struct completion completion; | ||
391 | }; | ||
200 | 392 | ||
201 | /* | 393 | /* |
202 | * The preceding synchronize_sched() ensures that any CPU that | 394 | * Awaken the corresponding synchronize_srcu() instance now that a |
203 | * sees the new value of sp->completed will also see any preceding | 395 | * grace period has elapsed. |
204 | * changes to data structures made by this CPU. This prevents | 396 | */ |
205 | * some other CPU from reordering the accesses in its SRCU | 397 | static void wakeme_after_rcu(struct rcu_head *head) |
206 | * read-side critical section to precede the corresponding | 398 | { |
207 | * srcu_read_lock() -- ensuring that such references will in | 399 | struct rcu_synchronize *rcu; |
208 | * fact be protected. | ||
209 | * | ||
210 | * So it is now safe to do the flip. | ||
211 | */ | ||
212 | 400 | ||
213 | idx = sp->completed & 0x1; | 401 | rcu = container_of(head, struct rcu_synchronize, head); |
214 | sp->completed++; | 402 | complete(&rcu->completion); |
403 | } | ||
215 | 404 | ||
216 | sync_func(); /* Force memory barrier on all CPUs. */ | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
406 | static void srcu_reschedule(struct srcu_struct *sp); | ||
217 | 407 | ||
218 | /* | 408 | /* |
219 | * At this point, because of the preceding synchronize_sched(), | 409 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
220 | * all srcu_read_lock() calls using the old counters have completed. | 410 | */ |
221 | * Their corresponding critical sections might well be still | 411 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
222 | * executing, but the srcu_read_lock() primitives themselves | 412 | { |
223 | * will have finished executing. We initially give readers | 413 | struct rcu_synchronize rcu; |
224 | * an arbitrarily chosen 10 microseconds to get out of their | 414 | struct rcu_head *head = &rcu.head; |
225 | * SRCU read-side critical sections, then loop waiting 1/HZ | 415 | bool done = false; |
226 | * seconds per iteration. The 10-microsecond value has done | ||
227 | * very well in testing. | ||
228 | */ | ||
229 | |||
230 | if (srcu_readers_active_idx(sp, idx)) | ||
231 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
232 | while (srcu_readers_active_idx(sp, idx)) | ||
233 | schedule_timeout_interruptible(1); | ||
234 | 416 | ||
235 | sync_func(); /* Force memory barrier on all CPUs. */ | 417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
418 | !lock_is_held(&rcu_bh_lock_map) && | ||
419 | !lock_is_held(&rcu_lock_map) && | ||
420 | !lock_is_held(&rcu_sched_lock_map), | ||
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
236 | 422 | ||
237 | /* | 423 | init_completion(&rcu.completion); |
238 | * The preceding synchronize_sched() forces all srcu_read_unlock() | 424 | |
239 | * primitives that were executing concurrently with the preceding | 425 | head->next = NULL; |
240 | * for_each_possible_cpu() loop to have completed by this point. | 426 | head->func = wakeme_after_rcu; |
241 | * More importantly, it also forces the corresponding SRCU read-side | 427 | spin_lock_irq(&sp->queue_lock); |
242 | * critical sections to have also completed, and the corresponding | 428 | if (!sp->running) { |
243 | * references to SRCU-protected data items to be dropped. | 429 | /* steal the processing owner */ |
244 | * | 430 | sp->running = true; |
245 | * Note: | 431 | rcu_batch_queue(&sp->batch_check0, head); |
246 | * | 432 | spin_unlock_irq(&sp->queue_lock); |
247 | * Despite what you might think at first glance, the | 433 | |
248 | * preceding synchronize_sched() -must- be within the | 434 | srcu_advance_batches(sp, trycount); |
249 | * critical section ended by the following mutex_unlock(). | 435 | if (!rcu_batch_empty(&sp->batch_done)) { |
250 | * Otherwise, a task taking the early exit can race | 436 | BUG_ON(sp->batch_done.head != head); |
251 | * with a srcu_read_unlock(), which might have executed | 437 | rcu_batch_dequeue(&sp->batch_done); |
252 | * just before the preceding srcu_readers_active() check, | 438 | done = true; |
253 | * and whose CPU might have reordered the srcu_read_unlock() | 439 | } |
254 | * with the preceding critical section. In this case, there | 440 | /* give the processing owner to work_struct */ |
255 | * is nothing preventing the synchronize_sched() task that is | 441 | srcu_reschedule(sp); |
256 | * taking the early exit from freeing a data structure that | 442 | } else { |
257 | * is still being referenced (out of order) by the task | 443 | rcu_batch_queue(&sp->batch_queue, head); |
258 | * doing the srcu_read_unlock(). | 444 | spin_unlock_irq(&sp->queue_lock); |
259 | * | 445 | } |
260 | * Alternatively, the comparison with "2" on the early exit | ||
261 | * could be changed to "3", but this increases synchronize_srcu() | ||
262 | * latency for bulk loads. So the current code is preferred. | ||
263 | */ | ||
264 | 446 | ||
265 | mutex_unlock(&sp->mutex); | 447 | if (!done) |
448 | wait_for_completion(&rcu.completion); | ||
266 | } | 449 | } |
267 | 450 | ||
268 | /** | 451 | /** |
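call_srcu() is the new asynchronous interface introduced by this hunk; synchronize_srcu() above is now just a queued wakeme_after_rcu callback plus a completion. An updater that previously had to block can instead free deferred state like this (sketch only; shared_foo and my_srcu as in the reader sketch earlier):

    #include <linux/slab.h>
    #include <linux/srcu.h>

    struct foo {
        int value;
        struct rcu_head rcu;
    };

    static void free_foo_cb(struct rcu_head *head)
    {
        /* Invoked from the SRCU workqueue once a grace period has elapsed. */
        kfree(container_of(head, struct foo, rcu));
    }

    static void replace_foo(struct foo *new_foo)
    {
        struct foo *old = rcu_dereference_protected(shared_foo, 1); /* update side, sketch */

        rcu_assign_pointer(shared_foo, new_foo);
        if (old)
            call_srcu(&my_srcu, &old->rcu, free_foo_cb);    /* does not block */
    }
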
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
281 | */ | 464 | */ |
282 | void synchronize_srcu(struct srcu_struct *sp) | 465 | void synchronize_srcu(struct srcu_struct *sp) |
283 | { | 466 | { |
284 | __synchronize_srcu(sp, synchronize_sched); | 467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
285 | } | 468 | } |
286 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 469 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
287 | 470 | ||
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
289 | * synchronize_srcu_expedited - Brute-force SRCU grace period | 472 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
290 | * @sp: srcu_struct with which to synchronize. | 473 | * @sp: srcu_struct with which to synchronize. |
291 | * | 474 | * |
292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" | 475 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
293 | * approach to force the grace period to end quickly. This consumes | 476 | * spinning rather than blocking when waiting. |
294 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
295 | * so is thus not recommended for any sort of common-case code. In fact, | ||
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
299 | * | 477 | * |
300 | * Note that it is illegal to call this function while holding any lock | 478 | * Note that it is illegal to call this function while holding any lock |
301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | 479 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
302 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
303 | * these restriction will result in deadlock. It is also illegal to call | ||
304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 480 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | 481 | * critical section; doing so will result in deadlock. However, it is |
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 482 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct |
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
309 | */ | 485 | */ |
310 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
311 | { | 487 | { |
312 | __synchronize_srcu(sp, synchronize_sched_expedited); | 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
313 | } | 489 | } |
314 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); |
315 | 491 | ||
316 | /** | 492 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
494 | */ | ||
495 | void srcu_barrier(struct srcu_struct *sp) | ||
496 | { | ||
497 | synchronize_srcu(sp); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
500 | |||
501 | /** | ||
317 | * srcu_batches_completed - return batches completed. | 502 | * srcu_batches_completed - return batches completed. |
318 | * @sp: srcu_struct on which to report batch completion. | 503 | * @sp: srcu_struct on which to report batch completion. |
319 | * | 504 | * |
320 | * Report the number of batches, correlated with, but not necessarily | 505 | * Report the number of batches, correlated with, but not necessarily |
321 | * precisely the same as, the number of grace periods that have elapsed. | 506 | * precisely the same as, the number of grace periods that have elapsed. |
322 | */ | 507 | */ |
323 | |||
324 | long srcu_batches_completed(struct srcu_struct *sp) | 508 | long srcu_batches_completed(struct srcu_struct *sp) |
325 | { | 509 | { |
326 | return sp->completed; | 510 | return sp->completed; |
327 | } | 511 | } |
328 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |
513 | |||
514 | #define SRCU_CALLBACK_BATCH 10 | ||
515 | #define SRCU_INTERVAL 1 | ||
516 | |||
517 | /* | ||
518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
519 | * period pipeline. | ||
520 | */ | ||
521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
522 | { | ||
523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
524 | spin_lock_irq(&sp->queue_lock); | ||
525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
526 | spin_unlock_irq(&sp->queue_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
533 | */ | ||
534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
535 | { | ||
536 | int idx = 1 ^ (sp->completed & 1); | ||
537 | |||
538 | /* | ||
539 | * Because readers might be delayed for an extended period after | ||
540 | * fetching ->completed for their index, at any point in time there | ||
541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
542 | * need to wait for readers to clear from both index values before | ||
543 | * invoking a callback. | ||
544 | */ | ||
545 | |||
546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
547 | rcu_batch_empty(&sp->batch_check1)) | ||
548 | return; /* no callbacks need to be advanced */ | ||
549 | |||
550 | if (!try_check_zero(sp, idx, trycount)) | ||
551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
552 | |||
553 | /* | ||
554 | * The callbacks in ->batch_check1 have already done their | ||
555 | * first zero check and flip, back when they were enqueued on | ||
556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
557 | * (Presumably try_check_zero() returned false during that | ||
558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
560 | */ | ||
561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
562 | |||
563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
564 | return; /* no callbacks need to be advanced */ | ||
565 | srcu_flip(sp); | ||
566 | |||
567 | /* | ||
568 | * The callbacks in ->batch_check0 just finished their | ||
569 | * first zero check and flip, so move them to ->batch_check1 | ||
570 | * for future checking on the other idx. | ||
571 | */ | ||
572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
573 | |||
574 | /* | ||
575 | * SRCU read-side critical sections are normally short, so check | ||
576 | * at least twice in quick succession after a flip. | ||
577 | */ | ||
578 | trycount = trycount < 2 ? 2 : trycount; | ||
579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
581 | |||
582 | /* | ||
583 | * The callbacks in ->batch_check1 have now waited for all | ||
584 | * pre-existing readers using both idx values. They are therefore | ||
585 | * ready to invoke, so move them to ->batch_done. | ||
586 | */ | ||
587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
592 | * their grace period. If there are more to do, SRCU will reschedule | ||
593 | * the workqueue. | ||
594 | */ | ||
595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
596 | { | ||
597 | int i; | ||
598 | struct rcu_head *head; | ||
599 | |||
600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
602 | if (!head) | ||
603 | break; | ||
604 | local_bh_disable(); | ||
605 | head->func(head); | ||
606 | local_bh_enable(); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Finished one round of SRCU grace period. Start another if there are | ||
612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
613 | */ | ||
614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
615 | { | ||
616 | bool pending = true; | ||
617 | |||
618 | if (rcu_batch_empty(&sp->batch_done) && | ||
619 | rcu_batch_empty(&sp->batch_check1) && | ||
620 | rcu_batch_empty(&sp->batch_check0) && | ||
621 | rcu_batch_empty(&sp->batch_queue)) { | ||
622 | spin_lock_irq(&sp->queue_lock); | ||
623 | if (rcu_batch_empty(&sp->batch_done) && | ||
624 | rcu_batch_empty(&sp->batch_check1) && | ||
625 | rcu_batch_empty(&sp->batch_check0) && | ||
626 | rcu_batch_empty(&sp->batch_queue)) { | ||
627 | sp->running = false; | ||
628 | pending = false; | ||
629 | } | ||
630 | spin_unlock_irq(&sp->queue_lock); | ||
631 | } | ||
632 | |||
633 | if (pending) | ||
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * This is the work-queue function that handles SRCU grace periods. | ||
639 | */ | ||
640 | static void process_srcu(struct work_struct *work) | ||
641 | { | ||
642 | struct srcu_struct *sp; | ||
643 | |||
644 | sp = container_of(work, struct srcu_struct, work.work); | ||
645 | |||
646 | srcu_collect_new(sp); | ||
647 | srcu_advance_batches(sp, 1); | ||
648 | srcu_invoke_callbacks(sp); | ||
649 | srcu_reschedule(sp); | ||
650 | } | ||
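Taken together, the update side is now driven entirely by this workqueue state machine: collect new callbacks, check-zero on one index, flip, check-zero on the other, then invoke. One practical consequence is that an srcu_struct which has seen call_srcu() must not be torn down until its callbacks have run; a plausible setup/teardown pairing, assuming my_srcu from the sketches above:

    static int setup_example(void)
    {
        return init_srcu_struct(&my_srcu);      /* allocates the per-CPU counters */
    }

    static void teardown_example(void)
    {
        /* Wait for callbacks already passed to call_srcu() to be invoked... */
        srcu_barrier(&my_srcu);
        /* ...then release the per-CPU state. */
        cleanup_srcu_struct(&my_srcu);
    }
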
diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e4..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -36,6 +36,8 @@ | |||
36 | #include <linux/personality.h> | 36 | #include <linux/personality.h> |
37 | #include <linux/ptrace.h> | 37 | #include <linux/ptrace.h> |
38 | #include <linux/fs_struct.h> | 38 | #include <linux/fs_struct.h> |
39 | #include <linux/file.h> | ||
40 | #include <linux/mount.h> | ||
39 | #include <linux/gfp.h> | 41 | #include <linux/gfp.h> |
40 | #include <linux/syscore_ops.h> | 42 | #include <linux/syscore_ops.h> |
41 | #include <linux/version.h> | 43 | #include <linux/version.h> |
@@ -93,10 +95,8 @@ | |||
93 | int overflowuid = DEFAULT_OVERFLOWUID; | 95 | int overflowuid = DEFAULT_OVERFLOWUID; |
94 | int overflowgid = DEFAULT_OVERFLOWGID; | 96 | int overflowgid = DEFAULT_OVERFLOWGID; |
95 | 97 | ||
96 | #ifdef CONFIG_UID16 | ||
97 | EXPORT_SYMBOL(overflowuid); | 98 | EXPORT_SYMBOL(overflowuid); |
98 | EXPORT_SYMBOL(overflowgid); | 99 | EXPORT_SYMBOL(overflowgid); |
99 | #endif | ||
100 | 100 | ||
101 | /* | 101 | /* |
102 | * the same as above, but for filesystems which can only store a 16-bit | 102 | * the same as above, but for filesystems which can only store a 16-bit |
@@ -133,11 +133,10 @@ static bool set_one_prio_perm(struct task_struct *p) | |||
133 | { | 133 | { |
134 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | 134 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); |
135 | 135 | ||
136 | if (pcred->user->user_ns == cred->user->user_ns && | 136 | if (uid_eq(pcred->uid, cred->euid) || |
137 | (pcred->uid == cred->euid || | 137 | uid_eq(pcred->euid, cred->euid)) |
138 | pcred->euid == cred->euid)) | ||
139 | return true; | 138 | return true; |
140 | if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) | 139 | if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) |
141 | return true; | 140 | return true; |
142 | return false; | 141 | return false; |
143 | } | 142 | } |
@@ -177,6 +176,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
177 | const struct cred *cred = current_cred(); | 176 | const struct cred *cred = current_cred(); |
178 | int error = -EINVAL; | 177 | int error = -EINVAL; |
179 | struct pid *pgrp; | 178 | struct pid *pgrp; |
179 | kuid_t uid; | ||
180 | 180 | ||
181 | if (which > PRIO_USER || which < PRIO_PROCESS) | 181 | if (which > PRIO_USER || which < PRIO_PROCESS) |
182 | goto out; | 182 | goto out; |
@@ -209,18 +209,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
209 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 209 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
210 | break; | 210 | break; |
211 | case PRIO_USER: | 211 | case PRIO_USER: |
212 | user = (struct user_struct *) cred->user; | 212 | uid = make_kuid(cred->user_ns, who); |
213 | user = cred->user; | ||
213 | if (!who) | 214 | if (!who) |
214 | who = cred->uid; | 215 | uid = cred->uid; |
215 | else if ((who != cred->uid) && | 216 | else if (!uid_eq(uid, cred->uid) && |
216 | !(user = find_user(who))) | 217 | !(user = find_user(uid))) |
217 | goto out_unlock; /* No processes for this user */ | 218 | goto out_unlock; /* No processes for this user */ |
218 | 219 | ||
219 | do_each_thread(g, p) { | 220 | do_each_thread(g, p) { |
220 | if (__task_cred(p)->uid == who) | 221 | if (uid_eq(task_uid(p), uid)) |
221 | error = set_one_prio(p, niceval, error); | 222 | error = set_one_prio(p, niceval, error); |
222 | } while_each_thread(g, p); | 223 | } while_each_thread(g, p); |
223 | if (who != cred->uid) | 224 | if (!uid_eq(uid, cred->uid)) |
224 | free_uid(user); /* For find_user() */ | 225 | free_uid(user); /* For find_user() */ |
225 | break; | 226 | break; |
226 | } | 227 | } |
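The pattern repeated throughout the kernel/sys.c changes is: convert the userspace-supplied uid_t/gid_t into a namespace-qualified kuid_t/kgid_t with make_kuid()/make_kgid(), reject unmappable values, and compare only with uid_eq()/gid_eq(); raw integer comparison no longer makes sense once user namespaces can remap ids. A condensed sketch of that idiom (illustrative helper, not from the patch):

    static int check_uid_matches_caller(uid_t uid)
    {
        const struct cred *cred = current_cred();
        kuid_t kuid = make_kuid(cred->user_ns, uid);    /* map into the caller's namespace */

        if (!uid_valid(kuid))           /* uid has no mapping in this namespace */
            return -EINVAL;
        if (!uid_eq(kuid, cred->uid) && !uid_eq(kuid, cred->euid))
            return -EPERM;              /* compare kuid_t values, never raw integers */
        return 0;
    }
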
@@ -244,6 +245,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
244 | const struct cred *cred = current_cred(); | 245 | const struct cred *cred = current_cred(); |
245 | long niceval, retval = -ESRCH; | 246 | long niceval, retval = -ESRCH; |
246 | struct pid *pgrp; | 247 | struct pid *pgrp; |
248 | kuid_t uid; | ||
247 | 249 | ||
248 | if (which > PRIO_USER || which < PRIO_PROCESS) | 250 | if (which > PRIO_USER || which < PRIO_PROCESS) |
249 | return -EINVAL; | 251 | return -EINVAL; |
@@ -274,21 +276,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
274 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 276 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
275 | break; | 277 | break; |
276 | case PRIO_USER: | 278 | case PRIO_USER: |
277 | user = (struct user_struct *) cred->user; | 279 | uid = make_kuid(cred->user_ns, who); |
280 | user = cred->user; | ||
278 | if (!who) | 281 | if (!who) |
279 | who = cred->uid; | 282 | uid = cred->uid; |
280 | else if ((who != cred->uid) && | 283 | else if (!uid_eq(uid, cred->uid) && |
281 | !(user = find_user(who))) | 284 | !(user = find_user(uid))) |
282 | goto out_unlock; /* No processes for this user */ | 285 | goto out_unlock; /* No processes for this user */ |
283 | 286 | ||
284 | do_each_thread(g, p) { | 287 | do_each_thread(g, p) { |
285 | if (__task_cred(p)->uid == who) { | 288 | if (uid_eq(task_uid(p), uid)) { |
286 | niceval = 20 - task_nice(p); | 289 | niceval = 20 - task_nice(p); |
287 | if (niceval > retval) | 290 | if (niceval > retval) |
288 | retval = niceval; | 291 | retval = niceval; |
289 | } | 292 | } |
290 | } while_each_thread(g, p); | 293 | } while_each_thread(g, p); |
291 | if (who != cred->uid) | 294 | if (!uid_eq(uid, cred->uid)) |
292 | free_uid(user); /* for find_user() */ | 295 | free_uid(user); /* for find_user() */ |
293 | break; | 296 | break; |
294 | } | 297 | } |
@@ -553,9 +556,19 @@ void ctrl_alt_del(void) | |||
553 | */ | 556 | */ |
554 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 557 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) |
555 | { | 558 | { |
559 | struct user_namespace *ns = current_user_ns(); | ||
556 | const struct cred *old; | 560 | const struct cred *old; |
557 | struct cred *new; | 561 | struct cred *new; |
558 | int retval; | 562 | int retval; |
563 | kgid_t krgid, kegid; | ||
564 | |||
565 | krgid = make_kgid(ns, rgid); | ||
566 | kegid = make_kgid(ns, egid); | ||
567 | |||
568 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
569 | return -EINVAL; | ||
570 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
571 | return -EINVAL; | ||
559 | 572 | ||
560 | new = prepare_creds(); | 573 | new = prepare_creds(); |
561 | if (!new) | 574 | if (!new) |
@@ -564,25 +577,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
564 | 577 | ||
565 | retval = -EPERM; | 578 | retval = -EPERM; |
566 | if (rgid != (gid_t) -1) { | 579 | if (rgid != (gid_t) -1) { |
567 | if (old->gid == rgid || | 580 | if (gid_eq(old->gid, krgid) || |
568 | old->egid == rgid || | 581 | gid_eq(old->egid, krgid) || |
569 | nsown_capable(CAP_SETGID)) | 582 | nsown_capable(CAP_SETGID)) |
570 | new->gid = rgid; | 583 | new->gid = krgid; |
571 | else | 584 | else |
572 | goto error; | 585 | goto error; |
573 | } | 586 | } |
574 | if (egid != (gid_t) -1) { | 587 | if (egid != (gid_t) -1) { |
575 | if (old->gid == egid || | 588 | if (gid_eq(old->gid, kegid) || |
576 | old->egid == egid || | 589 | gid_eq(old->egid, kegid) || |
577 | old->sgid == egid || | 590 | gid_eq(old->sgid, kegid) || |
578 | nsown_capable(CAP_SETGID)) | 591 | nsown_capable(CAP_SETGID)) |
579 | new->egid = egid; | 592 | new->egid = kegid; |
580 | else | 593 | else |
581 | goto error; | 594 | goto error; |
582 | } | 595 | } |
583 | 596 | ||
584 | if (rgid != (gid_t) -1 || | 597 | if (rgid != (gid_t) -1 || |
585 | (egid != (gid_t) -1 && egid != old->gid)) | 598 | (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) |
586 | new->sgid = new->egid; | 599 | new->sgid = new->egid; |
587 | new->fsgid = new->egid; | 600 | new->fsgid = new->egid; |
588 | 601 | ||
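setregid()/setresgid() keep the traditional "(gid_t)-1 means leave unchanged" convention, so the new prologue only enforces validity for arguments the caller actually supplied. The same check in isolation (hypothetical helper):

    static int validate_optional_gid(struct user_namespace *ns, gid_t gid, kgid_t *out)
    {
        *out = make_kgid(ns, gid);
        if (gid != (gid_t) -1 && !gid_valid(*out))
            return -EINVAL;     /* supplied but unmappable in this namespace */
        return 0;               /* either valid, or the "don't change" sentinel */
    }
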
@@ -600,9 +613,15 @@ error: | |||
600 | */ | 613 | */ |
601 | SYSCALL_DEFINE1(setgid, gid_t, gid) | 614 | SYSCALL_DEFINE1(setgid, gid_t, gid) |
602 | { | 615 | { |
616 | struct user_namespace *ns = current_user_ns(); | ||
603 | const struct cred *old; | 617 | const struct cred *old; |
604 | struct cred *new; | 618 | struct cred *new; |
605 | int retval; | 619 | int retval; |
620 | kgid_t kgid; | ||
621 | |||
622 | kgid = make_kgid(ns, gid); | ||
623 | if (!gid_valid(kgid)) | ||
624 | return -EINVAL; | ||
606 | 625 | ||
607 | new = prepare_creds(); | 626 | new = prepare_creds(); |
608 | if (!new) | 627 | if (!new) |
@@ -611,9 +630,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
611 | 630 | ||
612 | retval = -EPERM; | 631 | retval = -EPERM; |
613 | if (nsown_capable(CAP_SETGID)) | 632 | if (nsown_capable(CAP_SETGID)) |
614 | new->gid = new->egid = new->sgid = new->fsgid = gid; | 633 | new->gid = new->egid = new->sgid = new->fsgid = kgid; |
615 | else if (gid == old->gid || gid == old->sgid) | 634 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) |
616 | new->egid = new->fsgid = gid; | 635 | new->egid = new->fsgid = kgid; |
617 | else | 636 | else |
618 | goto error; | 637 | goto error; |
619 | 638 | ||
@@ -631,7 +650,7 @@ static int set_user(struct cred *new) | |||
631 | { | 650 | { |
632 | struct user_struct *new_user; | 651 | struct user_struct *new_user; |
633 | 652 | ||
634 | new_user = alloc_uid(current_user_ns(), new->uid); | 653 | new_user = alloc_uid(new->uid); |
635 | if (!new_user) | 654 | if (!new_user) |
636 | return -EAGAIN; | 655 | return -EAGAIN; |
637 | 656 | ||
@@ -670,9 +689,19 @@ static int set_user(struct cred *new) | |||
670 | */ | 689 | */ |
671 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 690 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) |
672 | { | 691 | { |
692 | struct user_namespace *ns = current_user_ns(); | ||
673 | const struct cred *old; | 693 | const struct cred *old; |
674 | struct cred *new; | 694 | struct cred *new; |
675 | int retval; | 695 | int retval; |
696 | kuid_t kruid, keuid; | ||
697 | |||
698 | kruid = make_kuid(ns, ruid); | ||
699 | keuid = make_kuid(ns, euid); | ||
700 | |||
701 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
702 | return -EINVAL; | ||
703 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
704 | return -EINVAL; | ||
676 | 705 | ||
677 | new = prepare_creds(); | 706 | new = prepare_creds(); |
678 | if (!new) | 707 | if (!new) |
@@ -681,29 +710,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
681 | 710 | ||
682 | retval = -EPERM; | 711 | retval = -EPERM; |
683 | if (ruid != (uid_t) -1) { | 712 | if (ruid != (uid_t) -1) { |
684 | new->uid = ruid; | 713 | new->uid = kruid; |
685 | if (old->uid != ruid && | 714 | if (!uid_eq(old->uid, kruid) && |
686 | old->euid != ruid && | 715 | !uid_eq(old->euid, kruid) && |
687 | !nsown_capable(CAP_SETUID)) | 716 | !nsown_capable(CAP_SETUID)) |
688 | goto error; | 717 | goto error; |
689 | } | 718 | } |
690 | 719 | ||
691 | if (euid != (uid_t) -1) { | 720 | if (euid != (uid_t) -1) { |
692 | new->euid = euid; | 721 | new->euid = keuid; |
693 | if (old->uid != euid && | 722 | if (!uid_eq(old->uid, keuid) && |
694 | old->euid != euid && | 723 | !uid_eq(old->euid, keuid) && |
695 | old->suid != euid && | 724 | !uid_eq(old->suid, keuid) && |
696 | !nsown_capable(CAP_SETUID)) | 725 | !nsown_capable(CAP_SETUID)) |
697 | goto error; | 726 | goto error; |
698 | } | 727 | } |
699 | 728 | ||
700 | if (new->uid != old->uid) { | 729 | if (!uid_eq(new->uid, old->uid)) { |
701 | retval = set_user(new); | 730 | retval = set_user(new); |
702 | if (retval < 0) | 731 | if (retval < 0) |
703 | goto error; | 732 | goto error; |
704 | } | 733 | } |
705 | if (ruid != (uid_t) -1 || | 734 | if (ruid != (uid_t) -1 || |
706 | (euid != (uid_t) -1 && euid != old->uid)) | 735 | (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) |
707 | new->suid = new->euid; | 736 | new->suid = new->euid; |
708 | new->fsuid = new->euid; | 737 | new->fsuid = new->euid; |
709 | 738 | ||
@@ -731,9 +760,15 @@ error: | |||
731 | */ | 760 | */ |
732 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 761 | SYSCALL_DEFINE1(setuid, uid_t, uid) |
733 | { | 762 | { |
763 | struct user_namespace *ns = current_user_ns(); | ||
734 | const struct cred *old; | 764 | const struct cred *old; |
735 | struct cred *new; | 765 | struct cred *new; |
736 | int retval; | 766 | int retval; |
767 | kuid_t kuid; | ||
768 | |||
769 | kuid = make_kuid(ns, uid); | ||
770 | if (!uid_valid(kuid)) | ||
771 | return -EINVAL; | ||
737 | 772 | ||
738 | new = prepare_creds(); | 773 | new = prepare_creds(); |
739 | if (!new) | 774 | if (!new) |
@@ -742,17 +777,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
742 | 777 | ||
743 | retval = -EPERM; | 778 | retval = -EPERM; |
744 | if (nsown_capable(CAP_SETUID)) { | 779 | if (nsown_capable(CAP_SETUID)) { |
745 | new->suid = new->uid = uid; | 780 | new->suid = new->uid = kuid; |
746 | if (uid != old->uid) { | 781 | if (!uid_eq(kuid, old->uid)) { |
747 | retval = set_user(new); | 782 | retval = set_user(new); |
748 | if (retval < 0) | 783 | if (retval < 0) |
749 | goto error; | 784 | goto error; |
750 | } | 785 | } |
751 | } else if (uid != old->uid && uid != new->suid) { | 786 | } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { |
752 | goto error; | 787 | goto error; |
753 | } | 788 | } |
754 | 789 | ||
755 | new->fsuid = new->euid = uid; | 790 | new->fsuid = new->euid = kuid; |
756 | 791 | ||
757 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); | 792 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); |
758 | if (retval < 0) | 793 | if (retval < 0) |
@@ -772,9 +807,24 @@ error: | |||
772 | */ | 807 | */ |
773 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | 808 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) |
774 | { | 809 | { |
810 | struct user_namespace *ns = current_user_ns(); | ||
775 | const struct cred *old; | 811 | const struct cred *old; |
776 | struct cred *new; | 812 | struct cred *new; |
777 | int retval; | 813 | int retval; |
814 | kuid_t kruid, keuid, ksuid; | ||
815 | |||
816 | kruid = make_kuid(ns, ruid); | ||
817 | keuid = make_kuid(ns, euid); | ||
818 | ksuid = make_kuid(ns, suid); | ||
819 | |||
820 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
821 | return -EINVAL; | ||
822 | |||
823 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
824 | return -EINVAL; | ||
825 | |||
826 | if ((suid != (uid_t) -1) && !uid_valid(ksuid)) | ||
827 | return -EINVAL; | ||
778 | 828 | ||
779 | new = prepare_creds(); | 829 | new = prepare_creds(); |
780 | if (!new) | 830 | if (!new) |
@@ -784,29 +834,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
784 | 834 | ||
785 | retval = -EPERM; | 835 | retval = -EPERM; |
786 | if (!nsown_capable(CAP_SETUID)) { | 836 | if (!nsown_capable(CAP_SETUID)) { |
787 | if (ruid != (uid_t) -1 && ruid != old->uid && | 837 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && |
788 | ruid != old->euid && ruid != old->suid) | 838 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) |
789 | goto error; | 839 | goto error; |
790 | if (euid != (uid_t) -1 && euid != old->uid && | 840 | if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && |
791 | euid != old->euid && euid != old->suid) | 841 | !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) |
792 | goto error; | 842 | goto error; |
793 | if (suid != (uid_t) -1 && suid != old->uid && | 843 | if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && |
794 | suid != old->euid && suid != old->suid) | 844 | !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) |
795 | goto error; | 845 | goto error; |
796 | } | 846 | } |
797 | 847 | ||
798 | if (ruid != (uid_t) -1) { | 848 | if (ruid != (uid_t) -1) { |
799 | new->uid = ruid; | 849 | new->uid = kruid; |
800 | if (ruid != old->uid) { | 850 | if (!uid_eq(kruid, old->uid)) { |
801 | retval = set_user(new); | 851 | retval = set_user(new); |
802 | if (retval < 0) | 852 | if (retval < 0) |
803 | goto error; | 853 | goto error; |
804 | } | 854 | } |
805 | } | 855 | } |
806 | if (euid != (uid_t) -1) | 856 | if (euid != (uid_t) -1) |
807 | new->euid = euid; | 857 | new->euid = keuid; |
808 | if (suid != (uid_t) -1) | 858 | if (suid != (uid_t) -1) |
809 | new->suid = suid; | 859 | new->suid = ksuid; |
810 | new->fsuid = new->euid; | 860 | new->fsuid = new->euid; |
811 | 861 | ||
812 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); | 862 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); |
@@ -820,14 +870,19 @@ error: | |||
820 | return retval; | 870 | return retval; |
821 | } | 871 | } |
822 | 872 | ||
823 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) | 873 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) |
824 | { | 874 | { |
825 | const struct cred *cred = current_cred(); | 875 | const struct cred *cred = current_cred(); |
826 | int retval; | 876 | int retval; |
877 | uid_t ruid, euid, suid; | ||
878 | |||
879 | ruid = from_kuid_munged(cred->user_ns, cred->uid); | ||
880 | euid = from_kuid_munged(cred->user_ns, cred->euid); | ||
881 | suid = from_kuid_munged(cred->user_ns, cred->suid); | ||
827 | 882 | ||
828 | if (!(retval = put_user(cred->uid, ruid)) && | 883 | if (!(retval = put_user(ruid, ruidp)) && |
829 | !(retval = put_user(cred->euid, euid))) | 884 | !(retval = put_user(euid, euidp))) |
830 | retval = put_user(cred->suid, suid); | 885 | retval = put_user(suid, suidp); |
831 | 886 | ||
832 | return retval; | 887 | return retval; |
833 | } | 888 | } |
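Going the other way, values handed back to userspace (as in getresuid()/getresgid() here) are converted with from_kuid_munged()/from_kgid_munged(), which substitute the overflow id when the kuid has no mapping in the caller's namespace rather than failing. For example (sketch):

    static uid_t current_euid_as_seen_by_caller(void)
    {
        const struct cred *cred = current_cred();

        /* Unmappable ids come back as the overflow uid, typically 65534. */
        return from_kuid_munged(cred->user_ns, cred->euid);
    }
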
@@ -837,9 +892,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u | |||
837 | */ | 892 | */ |
838 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | 893 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) |
839 | { | 894 | { |
895 | struct user_namespace *ns = current_user_ns(); | ||
840 | const struct cred *old; | 896 | const struct cred *old; |
841 | struct cred *new; | 897 | struct cred *new; |
842 | int retval; | 898 | int retval; |
899 | kgid_t krgid, kegid, ksgid; | ||
900 | |||
901 | krgid = make_kgid(ns, rgid); | ||
902 | kegid = make_kgid(ns, egid); | ||
903 | ksgid = make_kgid(ns, sgid); | ||
904 | |||
905 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
906 | return -EINVAL; | ||
907 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
908 | return -EINVAL; | ||
909 | if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) | ||
910 | return -EINVAL; | ||
843 | 911 | ||
844 | new = prepare_creds(); | 912 | new = prepare_creds(); |
845 | if (!new) | 913 | if (!new) |
@@ -848,23 +916,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
848 | 916 | ||
849 | retval = -EPERM; | 917 | retval = -EPERM; |
850 | if (!nsown_capable(CAP_SETGID)) { | 918 | if (!nsown_capable(CAP_SETGID)) { |
851 | if (rgid != (gid_t) -1 && rgid != old->gid && | 919 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && |
852 | rgid != old->egid && rgid != old->sgid) | 920 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) |
853 | goto error; | 921 | goto error; |
854 | if (egid != (gid_t) -1 && egid != old->gid && | 922 | if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && |
855 | egid != old->egid && egid != old->sgid) | 923 | !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) |
856 | goto error; | 924 | goto error; |
857 | if (sgid != (gid_t) -1 && sgid != old->gid && | 925 | if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && |
858 | sgid != old->egid && sgid != old->sgid) | 926 | !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) |
859 | goto error; | 927 | goto error; |
860 | } | 928 | } |
861 | 929 | ||
862 | if (rgid != (gid_t) -1) | 930 | if (rgid != (gid_t) -1) |
863 | new->gid = rgid; | 931 | new->gid = krgid; |
864 | if (egid != (gid_t) -1) | 932 | if (egid != (gid_t) -1) |
865 | new->egid = egid; | 933 | new->egid = kegid; |
866 | if (sgid != (gid_t) -1) | 934 | if (sgid != (gid_t) -1) |
867 | new->sgid = sgid; | 935 | new->sgid = ksgid; |
868 | new->fsgid = new->egid; | 936 | new->fsgid = new->egid; |
869 | 937 | ||
870 | return commit_creds(new); | 938 | return commit_creds(new); |
@@ -874,14 +942,19 @@ error: | |||
874 | return retval; | 942 | return retval; |
875 | } | 943 | } |
876 | 944 | ||
877 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) | 945 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) |
878 | { | 946 | { |
879 | const struct cred *cred = current_cred(); | 947 | const struct cred *cred = current_cred(); |
880 | int retval; | 948 | int retval; |
949 | gid_t rgid, egid, sgid; | ||
950 | |||
951 | rgid = from_kgid_munged(cred->user_ns, cred->gid); | ||
952 | egid = from_kgid_munged(cred->user_ns, cred->egid); | ||
953 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); | ||
881 | 954 | ||
882 | if (!(retval = put_user(cred->gid, rgid)) && | 955 | if (!(retval = put_user(rgid, rgidp)) && |
883 | !(retval = put_user(cred->egid, egid))) | 956 | !(retval = put_user(egid, egidp))) |
884 | retval = put_user(cred->sgid, sgid); | 957 | retval = put_user(sgid, sgidp); |
885 | 958 | ||
886 | return retval; | 959 | return retval; |
887 | } | 960 | } |
@@ -898,18 +971,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
898 | const struct cred *old; | 971 | const struct cred *old; |
899 | struct cred *new; | 972 | struct cred *new; |
900 | uid_t old_fsuid; | 973 | uid_t old_fsuid; |
974 | kuid_t kuid; | ||
975 | |||
976 | old = current_cred(); | ||
977 | old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); | ||
978 | |||
979 | kuid = make_kuid(old->user_ns, uid); | ||
980 | if (!uid_valid(kuid)) | ||
981 | return old_fsuid; | ||
901 | 982 | ||
902 | new = prepare_creds(); | 983 | new = prepare_creds(); |
903 | if (!new) | 984 | if (!new) |
904 | return current_fsuid(); | 985 | return old_fsuid; |
905 | old = current_cred(); | ||
906 | old_fsuid = old->fsuid; | ||
907 | 986 | ||
908 | if (uid == old->uid || uid == old->euid || | 987 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || |
909 | uid == old->suid || uid == old->fsuid || | 988 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || |
910 | nsown_capable(CAP_SETUID)) { | 989 | nsown_capable(CAP_SETUID)) { |
911 | if (uid != old_fsuid) { | 990 | if (!uid_eq(kuid, old->fsuid)) { |
912 | new->fsuid = uid; | 991 | new->fsuid = kuid; |
913 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 992 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
914 | goto change_okay; | 993 | goto change_okay; |
915 | } | 994 | } |
@@ -931,18 +1010,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
931 | const struct cred *old; | 1010 | const struct cred *old; |
932 | struct cred *new; | 1011 | struct cred *new; |
933 | gid_t old_fsgid; | 1012 | gid_t old_fsgid; |
1013 | kgid_t kgid; | ||
1014 | |||
1015 | old = current_cred(); | ||
1016 | old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); | ||
1017 | |||
1018 | kgid = make_kgid(old->user_ns, gid); | ||
1019 | if (!gid_valid(kgid)) | ||
1020 | return old_fsgid; | ||
934 | 1021 | ||
935 | new = prepare_creds(); | 1022 | new = prepare_creds(); |
936 | if (!new) | 1023 | if (!new) |
937 | return current_fsgid(); | 1024 | return old_fsgid; |
938 | old = current_cred(); | ||
939 | old_fsgid = old->fsgid; | ||
940 | 1025 | ||
941 | if (gid == old->gid || gid == old->egid || | 1026 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || |
942 | gid == old->sgid || gid == old->fsgid || | 1027 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || |
943 | nsown_capable(CAP_SETGID)) { | 1028 | nsown_capable(CAP_SETGID)) { |
944 | if (gid != old_fsgid) { | 1029 | if (!gid_eq(kgid, old->fsgid)) { |
945 | new->fsgid = gid; | 1030 | new->fsgid = kgid; |
946 | goto change_okay; | 1031 | goto change_okay; |
947 | } | 1032 | } |
948 | } | 1033 | } |
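setfsuid()/setfsgid() return the previous fsuid/fsgid rather than an error code, even when the request is rejected (now including the new "unmappable id" case), so userspace can only detect failure by probing again. A common userspace idiom (illustrative):

    #include <sys/types.h>
    #include <sys/fsuid.h>

    /* Returns 0 if the fsuid change took effect, -1 if the kernel refused it. */
    static int set_fsuid_checked(uid_t uid)
    {
        setfsuid(uid);
        if ((uid_t) setfsuid(-1) != uid)    /* -1 is invalid, so this only reads back fsuid */
            return -1;
        return 0;
    }
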
@@ -1295,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | |||
1295 | memcpy(u->nodename, tmp, len); | 1380 | memcpy(u->nodename, tmp, len); |
1296 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); | 1381 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); |
1297 | errno = 0; | 1382 | errno = 0; |
1383 | uts_proc_notify(UTS_PROC_HOSTNAME); | ||
1298 | } | 1384 | } |
1299 | uts_proc_notify(UTS_PROC_HOSTNAME); | ||
1300 | up_write(&uts_sem); | 1385 | up_write(&uts_sem); |
1301 | return errno; | 1386 | return errno; |
1302 | } | 1387 | } |
@@ -1346,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
1346 | memcpy(u->domainname, tmp, len); | 1431 | memcpy(u->domainname, tmp, len); |
1347 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); | 1432 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); |
1348 | errno = 0; | 1433 | errno = 0; |
1434 | uts_proc_notify(UTS_PROC_DOMAINNAME); | ||
1349 | } | 1435 | } |
1350 | uts_proc_notify(UTS_PROC_DOMAINNAME); | ||
1351 | up_write(&uts_sem); | 1436 | up_write(&uts_sem); |
1352 | return errno; | 1437 | return errno; |
1353 | } | 1438 | } |
@@ -1498,15 +1583,14 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1498 | return 0; | 1583 | return 0; |
1499 | 1584 | ||
1500 | tcred = __task_cred(task); | 1585 | tcred = __task_cred(task); |
1501 | if (cred->user->user_ns == tcred->user->user_ns && | 1586 | if (uid_eq(cred->uid, tcred->euid) && |
1502 | (cred->uid == tcred->euid && | 1587 | uid_eq(cred->uid, tcred->suid) && |
1503 | cred->uid == tcred->suid && | 1588 | uid_eq(cred->uid, tcred->uid) && |
1504 | cred->uid == tcred->uid && | 1589 | gid_eq(cred->gid, tcred->egid) && |
1505 | cred->gid == tcred->egid && | 1590 | gid_eq(cred->gid, tcred->sgid) && |
1506 | cred->gid == tcred->sgid && | 1591 | gid_eq(cred->gid, tcred->gid)) |
1507 | cred->gid == tcred->gid)) | ||
1508 | return 0; | 1592 | return 0; |
1509 | if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) | 1593 | if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) |
1510 | return 0; | 1594 | return 0; |
1511 | 1595 | ||
1512 | return -EPERM; | 1596 | return -EPERM; |
@@ -1702,77 +1786,105 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1702 | } | 1786 | } |
1703 | 1787 | ||
1704 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | ||
1790 | { | ||
1791 | struct file *exe_file; | ||
1792 | struct dentry *dentry; | ||
1793 | int err; | ||
1794 | |||
1795 | exe_file = fget(fd); | ||
1796 | if (!exe_file) | ||
1797 | return -EBADF; | ||
1798 | |||
1799 | dentry = exe_file->f_path.dentry; | ||
1800 | |||
1801 | /* | ||
1802 | * Because the original mm->exe_file points to executable file, make | ||
1803 | * sure that this one is executable as well, to avoid breaking an | ||
1804 | * overall picture. | ||
1805 | */ | ||
1806 | err = -EACCES; | ||
1807 | if (!S_ISREG(dentry->d_inode->i_mode) || | ||
1808 | exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) | ||
1809 | goto exit; | ||
1810 | |||
1811 | err = inode_permission(dentry->d_inode, MAY_EXEC); | ||
1812 | if (err) | ||
1813 | goto exit; | ||
1814 | |||
1815 | down_write(&mm->mmap_sem); | ||
1816 | |||
1817 | /* | ||
1818 | * Forbid mm->exe_file change if old file still mapped. | ||
1819 | */ | ||
1820 | err = -EBUSY; | ||
1821 | if (mm->exe_file) { | ||
1822 | struct vm_area_struct *vma; | ||
1823 | |||
1824 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
1825 | if (vma->vm_file && | ||
1826 | path_equal(&vma->vm_file->f_path, | ||
1827 | &mm->exe_file->f_path)) | ||
1828 | goto exit_unlock; | ||
1829 | } | ||
1830 | |||
1831 | /* | ||
1832 | * The symlink can be changed only once, just to disallow arbitrary | ||
1833 | * transitions malicious software might bring in. This means one | ||
1834 | * could take a snapshot of all running processes and monitor | ||
1835 | * /proc/pid/exe changes to notice unusual activity if needed. | ||
1836 | */ | ||
1837 | err = -EPERM; | ||
1838 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | ||
1839 | goto exit_unlock; | ||
1840 | |||
1841 | err = 0; | ||
1842 | set_mm_exe_file(mm, exe_file); | ||
1843 | exit_unlock: | ||
1844 | up_write(&mm->mmap_sem); | ||
1845 | |||
1846 | exit: | ||
1847 | fput(exe_file); | ||
1848 | return err; | ||
1849 | } | ||
1850 | |||
1705 | static int prctl_set_mm(int opt, unsigned long addr, | 1851 | static int prctl_set_mm(int opt, unsigned long addr, |
1706 | unsigned long arg4, unsigned long arg5) | 1852 | unsigned long arg4, unsigned long arg5) |
1707 | { | 1853 | { |
1708 | unsigned long rlim = rlimit(RLIMIT_DATA); | 1854 | unsigned long rlim = rlimit(RLIMIT_DATA); |
1709 | unsigned long vm_req_flags; | ||
1710 | unsigned long vm_bad_flags; | ||
1711 | struct vm_area_struct *vma; | ||
1712 | int error = 0; | ||
1713 | struct mm_struct *mm = current->mm; | 1855 | struct mm_struct *mm = current->mm; |
1856 | struct vm_area_struct *vma; | ||
1857 | int error; | ||
1714 | 1858 | ||
1715 | if (arg4 | arg5) | 1859 | if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) |
1716 | return -EINVAL; | 1860 | return -EINVAL; |
1717 | 1861 | ||
1718 | if (!capable(CAP_SYS_RESOURCE)) | 1862 | if (!capable(CAP_SYS_RESOURCE)) |
1719 | return -EPERM; | 1863 | return -EPERM; |
1720 | 1864 | ||
1721 | if (addr >= TASK_SIZE) | 1865 | if (opt == PR_SET_MM_EXE_FILE) |
1866 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | ||
1867 | |||
1868 | if (addr >= TASK_SIZE || addr < mmap_min_addr) | ||
1722 | return -EINVAL; | 1869 | return -EINVAL; |
1723 | 1870 | ||
1871 | error = -EINVAL; | ||
1872 | |||
1724 | down_read(&mm->mmap_sem); | 1873 | down_read(&mm->mmap_sem); |
1725 | vma = find_vma(mm, addr); | 1874 | vma = find_vma(mm, addr); |
1726 | 1875 | ||
1727 | if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { | ||
1728 | /* It must be existing VMA */ | ||
1729 | if (!vma || vma->vm_start > addr) | ||
1730 | goto out; | ||
1731 | } | ||
1732 | |||
1733 | error = -EINVAL; | ||
1734 | switch (opt) { | 1876 | switch (opt) { |
1735 | case PR_SET_MM_START_CODE: | 1877 | case PR_SET_MM_START_CODE: |
1878 | mm->start_code = addr; | ||
1879 | break; | ||
1736 | case PR_SET_MM_END_CODE: | 1880 | case PR_SET_MM_END_CODE: |
1737 | vm_req_flags = VM_READ | VM_EXEC; | 1881 | mm->end_code = addr; |
1738 | vm_bad_flags = VM_WRITE | VM_MAYSHARE; | ||
1739 | |||
1740 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1741 | (vma->vm_flags & vm_bad_flags)) | ||
1742 | goto out; | ||
1743 | |||
1744 | if (opt == PR_SET_MM_START_CODE) | ||
1745 | mm->start_code = addr; | ||
1746 | else | ||
1747 | mm->end_code = addr; | ||
1748 | break; | 1882 | break; |
1749 | |||
1750 | case PR_SET_MM_START_DATA: | 1883 | case PR_SET_MM_START_DATA: |
1751 | case PR_SET_MM_END_DATA: | 1884 | mm->start_data = addr; |
1752 | vm_req_flags = VM_READ | VM_WRITE; | ||
1753 | vm_bad_flags = VM_EXEC | VM_MAYSHARE; | ||
1754 | |||
1755 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1756 | (vma->vm_flags & vm_bad_flags)) | ||
1757 | goto out; | ||
1758 | |||
1759 | if (opt == PR_SET_MM_START_DATA) | ||
1760 | mm->start_data = addr; | ||
1761 | else | ||
1762 | mm->end_data = addr; | ||
1763 | break; | 1885 | break; |
1764 | 1886 | case PR_SET_MM_END_DATA: | |
1765 | case PR_SET_MM_START_STACK: | 1887 | mm->end_data = addr; |
1766 | |||
1767 | #ifdef CONFIG_STACK_GROWSUP | ||
1768 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; | ||
1769 | #else | ||
1770 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; | ||
1771 | #endif | ||
1772 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags) | ||
1773 | goto out; | ||
1774 | |||
1775 | mm->start_stack = addr; | ||
1776 | break; | 1888 | break; |
1777 | 1889 | ||
1778 | case PR_SET_MM_START_BRK: | 1890 | case PR_SET_MM_START_BRK: |
@@ -1799,24 +1911,89 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1799 | mm->brk = addr; | 1911 | mm->brk = addr; |
1800 | break; | 1912 | break; |
1801 | 1913 | ||
1914 | /* | ||
1915 | * If command line arguments and environment | ||
1916 | * are placed somewhere else on the stack, we can | ||
1917 | * set them up here: ARG_START/END to set up | ||
1918 | * command line arguments and ENV_START/END | ||
1919 | * for environment. | ||
1920 | */ | ||
1921 | case PR_SET_MM_START_STACK: | ||
1922 | case PR_SET_MM_ARG_START: | ||
1923 | case PR_SET_MM_ARG_END: | ||
1924 | case PR_SET_MM_ENV_START: | ||
1925 | case PR_SET_MM_ENV_END: | ||
1926 | if (!vma) { | ||
1927 | error = -EFAULT; | ||
1928 | goto out; | ||
1929 | } | ||
1930 | if (opt == PR_SET_MM_START_STACK) | ||
1931 | mm->start_stack = addr; | ||
1932 | else if (opt == PR_SET_MM_ARG_START) | ||
1933 | mm->arg_start = addr; | ||
1934 | else if (opt == PR_SET_MM_ARG_END) | ||
1935 | mm->arg_end = addr; | ||
1936 | else if (opt == PR_SET_MM_ENV_START) | ||
1937 | mm->env_start = addr; | ||
1938 | else if (opt == PR_SET_MM_ENV_END) | ||
1939 | mm->env_end = addr; | ||
1940 | break; | ||
1941 | |||
1942 | /* | ||
1943 | * This doesn't move the auxiliary vector itself | ||
1944 | * since it's pinned to mm_struct, but it allows | ||
1945 | * the vector to be filled with new values. It's | ||
1946 | * up to the caller to provide sane values here; | ||
1947 | * otherwise user space tools which use this | ||
1948 | * vector might be unhappy. | ||
1949 | */ | ||
1950 | case PR_SET_MM_AUXV: { | ||
1951 | unsigned long user_auxv[AT_VECTOR_SIZE]; | ||
1952 | |||
1953 | if (arg4 > sizeof(user_auxv)) | ||
1954 | goto out; | ||
1955 | up_read(&mm->mmap_sem); | ||
1956 | |||
1957 | if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) | ||
1958 | return -EFAULT; | ||
1959 | |||
1960 | /* Make sure the last entry is always AT_NULL */ | ||
1961 | user_auxv[AT_VECTOR_SIZE - 2] = 0; | ||
1962 | user_auxv[AT_VECTOR_SIZE - 1] = 0; | ||
1963 | |||
1964 | BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); | ||
1965 | |||
1966 | task_lock(current); | ||
1967 | memcpy(mm->saved_auxv, user_auxv, arg4); | ||
1968 | task_unlock(current); | ||
1969 | |||
1970 | return 0; | ||
1971 | } | ||
1802 | default: | 1972 | default: |
1803 | error = -EINVAL; | ||
1804 | goto out; | 1973 | goto out; |
1805 | } | 1974 | } |
1806 | 1975 | ||
1807 | error = 0; | 1976 | error = 0; |
1808 | |||
1809 | out: | 1977 | out: |
1810 | up_read(&mm->mmap_sem); | 1978 | up_read(&mm->mmap_sem); |
1811 | |||
1812 | return error; | 1979 | return error; |
1813 | } | 1980 | } |
1981 | |||
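The remaining PR_SET_MM options above simply overwrite fields in mm_struct once the address passes the mmap_min_addr/TASK_SIZE window check and, for the stack/arg/env cases, still has a mapping above it. A hedged user-space sketch of how a restore tool might use them, assuming the PR_SET_MM_ARG_START...PR_SET_MM_AUXV constants from this series are exported in <linux/prctl.h>:

#include <sys/prctl.h>
#include <linux/prctl.h>

/* Illustrative sketch: record a relocated argv/envp area and refresh
 * the saved auxiliary vector.  Requires CAP_SYS_RESOURCE. */
static int fixup_mm_layout(unsigned long arg_start, unsigned long arg_end,
                           unsigned long env_start, unsigned long env_end,
                           unsigned long *auxv, unsigned long auxv_bytes)
{
        if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, arg_start, 0, 0) ||
            prctl(PR_SET_MM, PR_SET_MM_ARG_END, arg_end, 0, 0) ||
            prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_start, 0, 0) ||
            prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_end, 0, 0))
                return -1;

        /* PR_SET_MM_AUXV is the only option that uses arg4 (a byte count,
         * at most sizeof(mm->saved_auxv)); the kernel copies the vector
         * and forces the trailing AT_NULL itself. */
        return prctl(PR_SET_MM, PR_SET_MM_AUXV, (unsigned long)auxv,
                     auxv_bytes, 0);
}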
1982 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
1983 | { | ||
1984 | return put_user(me->clear_child_tid, tid_addr); | ||
1985 | } | ||
1986 | |||
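prctl_get_tid_address() above reports the clear_child_tid pointer that was registered via set_tid_address(2) or CLONE_CHILD_CLEARTID, so a restore tool can save and re-create it. A hedged fragment, assuming PR_GET_TID_ADDRESS is exported by <linux/prctl.h> in this series and the kernel was built with CONFIG_CHECKPOINT_RESTORE:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
        int *tid_addr = NULL;

        if (prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0))
                perror("PR_GET_TID_ADDRESS");
        else
                printf("clear_child_tid pointer: %p\n", (void *)tid_addr);
        return 0;
}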
1814 | #else /* CONFIG_CHECKPOINT_RESTORE */ | 1987 | #else /* CONFIG_CHECKPOINT_RESTORE */ |
1815 | static int prctl_set_mm(int opt, unsigned long addr, | 1988 | static int prctl_set_mm(int opt, unsigned long addr, |
1816 | unsigned long arg4, unsigned long arg5) | 1989 | unsigned long arg4, unsigned long arg5) |
1817 | { | 1990 | { |
1818 | return -EINVAL; | 1991 | return -EINVAL; |
1819 | } | 1992 | } |
1993 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
1994 | { | ||
1995 | return -EINVAL; | ||
1996 | } | ||
1820 | #endif | 1997 | #endif |
1821 | 1998 | ||
1822 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | 1999 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, |
@@ -1908,7 +2085,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1908 | error = prctl_get_seccomp(); | 2085 | error = prctl_get_seccomp(); |
1909 | break; | 2086 | break; |
1910 | case PR_SET_SECCOMP: | 2087 | case PR_SET_SECCOMP: |
1911 | error = prctl_set_seccomp(arg2); | 2088 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
1912 | break; | 2089 | break; |
1913 | case PR_GET_TSC: | 2090 | case PR_GET_TSC: |
1914 | error = GET_TSC_CTL(arg2); | 2091 | error = GET_TSC_CTL(arg2); |
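The widened prctl_set_seccomp() call above is what routes a BPF program pointer through arg3 for SECCOMP_MODE_FILTER. A hedged sketch of installing a trivial allow-everything filter, assuming <linux/seccomp.h> from this series exports struct seccomp_data, SECCOMP_MODE_FILTER and SECCOMP_RET_ALLOW:

#include <stddef.h>             /* offsetof */
#include <sys/prctl.h>
#include <linux/filter.h>       /* struct sock_filter, struct sock_fprog, BPF_* */
#include <linux/seccomp.h>

static int install_allow_all_filter(void)
{
        struct sock_filter insns[] = {
                /* load the syscall number (unused here, shown for shape) */
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len    = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        /* Without CAP_SYS_ADMIN this needs no_new_privs set first,
         * see the PR_SET_NO_NEW_PRIVS example further down. */
        return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}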
@@ -1971,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1971 | case PR_SET_MM: | 2148 | case PR_SET_MM: |
1972 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | 2149 | error = prctl_set_mm(arg2, arg3, arg4, arg5); |
1973 | break; | 2150 | break; |
2151 | case PR_GET_TID_ADDRESS: | ||
2152 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2153 | break; | ||
1974 | case PR_SET_CHILD_SUBREAPER: | 2154 | case PR_SET_CHILD_SUBREAPER: |
1975 | me->signal->is_child_subreaper = !!arg2; | 2155 | me->signal->is_child_subreaper = !!arg2; |
1976 | error = 0; | 2156 | error = 0; |
@@ -1979,6 +2159,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1979 | error = put_user(me->signal->is_child_subreaper, | 2159 | error = put_user(me->signal->is_child_subreaper, |
1980 | (int __user *) arg2); | 2160 | (int __user *) arg2); |
1981 | break; | 2161 | break; |
2162 | case PR_SET_NO_NEW_PRIVS: | ||
2163 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
2164 | return -EINVAL; | ||
2165 | |||
2166 | current->no_new_privs = 1; | ||
2167 | break; | ||
2168 | case PR_GET_NO_NEW_PRIVS: | ||
2169 | if (arg2 || arg3 || arg4 || arg5) | ||
2170 | return -EINVAL; | ||
2171 | return current->no_new_privs ? 1 : 0; | ||
1982 | default: | 2172 | default: |
1983 | error = -EINVAL; | 2173 | error = -EINVAL; |
1984 | break; | 2174 | break; |
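The PR_SET_NO_NEW_PRIVS / PR_GET_NO_NEW_PRIVS cases above are intentionally one-way: once set, the bit is inherited across fork and execve and can never be cleared, which is what makes the unprivileged seccomp-filter install in the earlier example safe. A hedged user-space sketch; the constants are assumed to be 38/39 as in this series and are defined locally in case the installed headers predate them:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
#endif

int main(void)
{
        /* arg2 must be 1 and the remaining arguments 0, as checked above */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                perror("PR_SET_NO_NEW_PRIVS");

        /* the getter returns 0 or 1 directly instead of filling a buffer */
        printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
        return 0;
}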
@@ -2022,7 +2212,6 @@ int orderly_poweroff(bool force) | |||
2022 | NULL | 2212 | NULL |
2023 | }; | 2213 | }; |
2024 | int ret = -ENOMEM; | 2214 | int ret = -ENOMEM; |
2025 | struct subprocess_info *info; | ||
2026 | 2215 | ||
2027 | if (argv == NULL) { | 2216 | if (argv == NULL) { |
2028 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | 2217 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", |
@@ -2030,18 +2219,16 @@ int orderly_poweroff(bool force) | |||
2030 | goto out; | 2219 | goto out; |
2031 | } | 2220 | } |
2032 | 2221 | ||
2033 | info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); | 2222 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, |
2034 | if (info == NULL) { | 2223 | NULL, argv_cleanup, NULL); |
2035 | argv_free(argv); | 2224 | out: |
2036 | goto out; | 2225 | if (likely(!ret)) |
2037 | } | 2226 | return 0; |
2038 | |||
2039 | call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); | ||
2040 | 2227 | ||
2041 | ret = call_usermodehelper_exec(info, UMH_NO_WAIT); | 2228 | if (ret == -ENOMEM) |
2229 | argv_free(argv); | ||
2042 | 2230 | ||
2043 | out: | 2231 | if (force) { |
2044 | if (ret && force) { | ||
2045 | printk(KERN_WARNING "Failed to start orderly shutdown: " | 2232 | printk(KERN_WARNING "Failed to start orderly shutdown: " |
2046 | "forcing the issue\n"); | 2233 | "forcing the issue\n"); |
2047 | 2234 | ||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); | |||
203 | cond_syscall(sys_name_to_handle_at); | 203 | cond_syscall(sys_name_to_handle_at); |
204 | cond_syscall(sys_open_by_handle_at); | 204 | cond_syscall(sys_open_by_handle_at); |
205 | cond_syscall(compat_sys_open_by_handle_at); | 205 | cond_syscall(compat_sys_open_by_handle_at); |
206 | |||
207 | /* compare kernel pointers */ | ||
208 | cond_syscall(sys_kcmp); | ||
diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c | |||
@@ -0,0 +1,84 @@ | |||
1 | #include <linux/spinlock.h> | ||
2 | #include <linux/task_work.h> | ||
3 | #include <linux/tracehook.h> | ||
4 | |||
5 | int | ||
6 | task_work_add(struct task_struct *task, struct task_work *twork, bool notify) | ||
7 | { | ||
8 | unsigned long flags; | ||
9 | int err = -ESRCH; | ||
10 | |||
11 | #ifndef TIF_NOTIFY_RESUME | ||
12 | if (notify) | ||
13 | return -ENOTSUPP; | ||
14 | #endif | ||
15 | /* | ||
16 | * We must not insert the new work if the task has already passed | ||
17 | * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() | ||
18 | * and check PF_EXITING under pi_lock. | ||
19 | */ | ||
20 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
21 | if (likely(!(task->flags & PF_EXITING))) { | ||
22 | hlist_add_head(&twork->hlist, &task->task_works); | ||
23 | err = 0; | ||
24 | } | ||
25 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
26 | |||
27 | /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ | ||
28 | if (likely(!err) && notify) | ||
29 | set_notify_resume(task); | ||
30 | return err; | ||
31 | } | ||
32 | |||
33 | struct task_work * | ||
34 | task_work_cancel(struct task_struct *task, task_work_func_t func) | ||
35 | { | ||
36 | unsigned long flags; | ||
37 | struct task_work *twork; | ||
38 | struct hlist_node *pos; | ||
39 | |||
40 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
41 | hlist_for_each_entry(twork, pos, &task->task_works, hlist) { | ||
42 | if (twork->func == func) { | ||
43 | hlist_del(&twork->hlist); | ||
44 | goto found; | ||
45 | } | ||
46 | } | ||
47 | twork = NULL; | ||
48 | found: | ||
49 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
50 | |||
51 | return twork; | ||
52 | } | ||
53 | |||
54 | void task_work_run(void) | ||
55 | { | ||
56 | struct task_struct *task = current; | ||
57 | struct hlist_head task_works; | ||
58 | struct hlist_node *pos; | ||
59 | |||
60 | raw_spin_lock_irq(&task->pi_lock); | ||
61 | hlist_move_list(&task->task_works, &task_works); | ||
62 | raw_spin_unlock_irq(&task->pi_lock); | ||
63 | |||
64 | if (unlikely(hlist_empty(&task_works))) | ||
65 | return; | ||
66 | /* | ||
67 | * We use an hlist to save space in task_struct, but we want FIFO. | ||
68 | * Find the last entry (the list should be short), then process them | ||
69 | * in reverse order. | ||
70 | */ | ||
71 | for (pos = task_works.first; pos->next; pos = pos->next) | ||
72 | ; | ||
73 | |||
74 | for (;;) { | ||
75 | struct hlist_node **pprev = pos->pprev; | ||
76 | struct task_work *twork = container_of(pos, struct task_work, | ||
77 | hlist); | ||
78 | twork->func(twork); | ||
79 | |||
80 | if (pprev == &task_works.first) | ||
81 | break; | ||
82 | pos = container_of(pprev, struct hlist_node, next); | ||
83 | } | ||
84 | } | ||
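The new kernel/task_work.c above lets any subsystem queue a callback that runs in the context of a chosen task, either when that task next returns to user space (notify == true sets TIF_NOTIFY_RESUME) or at the latest from exit_task_work(). A hedged in-kernel sketch of a caller, assuming the accompanying <linux/task_work.h> declares struct task_work with the hlist/func members used by the code above:

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/task_work.h>

static struct task_work my_work;        /* hypothetical, one pending use at a time */

/* Runs in the target task's own context, never from interrupt context. */
static void my_work_func(struct task_work *twork)
{
        pr_info("deferred work ran in %s (pid %d)\n",
                current->comm, task_pid_nr(current));
}

static int queue_work_on_task(struct task_struct *task)
{
        my_work.func = my_work_func;

        /* Returns -ESRCH if @task has already passed exit_task_work();
         * task_work_cancel(task, my_work_func) undoes a pending add. */
        return task_work_add(task, &my_work, true);
}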
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a20dc8a3c949..fd42bd452b75 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -2,6 +2,55 @@ | |||
2 | # Timer subsystem related configuration options | 2 | # Timer subsystem related configuration options |
3 | # | 3 | # |
4 | 4 | ||
5 | # Options selectable by arch Kconfig | ||
6 | |||
7 | # Watchdog function for clocksources to detect instabilities | ||
8 | config CLOCKSOURCE_WATCHDOG | ||
9 | bool | ||
10 | |||
11 | # Architecture has extra clocksource data | ||
12 | config ARCH_CLOCKSOURCE_DATA | ||
13 | bool | ||
14 | |||
15 | # Timekeeping vsyscall support | ||
16 | config GENERIC_TIME_VSYSCALL | ||
17 | bool | ||
18 | |||
19 | # ktime_t scalar 64bit nsec representation | ||
20 | config KTIME_SCALAR | ||
21 | bool | ||
22 | |||
23 | # Old style timekeeping | ||
24 | config ARCH_USES_GETTIMEOFFSET | ||
25 | bool | ||
26 | |||
27 | # The generic clock events infrastructure | ||
28 | config GENERIC_CLOCKEVENTS | ||
29 | bool | ||
30 | |||
31 | # Migration helper. Builds, but does not invoke | ||
32 | config GENERIC_CLOCKEVENTS_BUILD | ||
33 | bool | ||
34 | default y | ||
35 | depends on GENERIC_CLOCKEVENTS | ||
36 | |||
37 | # Clockevents broadcasting infrastructure | ||
38 | config GENERIC_CLOCKEVENTS_BROADCAST | ||
39 | bool | ||
40 | depends on GENERIC_CLOCKEVENTS | ||
41 | |||
42 | # Automatically adjust the min. reprogramming time for | ||
43 | # clock event device | ||
44 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
45 | bool | ||
46 | |||
47 | # Generic update of CMOS clock | ||
48 | config GENERIC_CMOS_UPDATE | ||
49 | bool | ||
50 | |||
51 | if GENERIC_CLOCKEVENTS | ||
52 | menu "Timers subsystem" | ||
53 | |||
5 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is | 54 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is |
6 | # only related to the tick functionality. Oneshot clockevent devices | 55 | # only related to the tick functionality. Oneshot clockevent devices |
7 | # are supported independent of this. | 56 | # are supported independent of this. |
@@ -26,10 +75,5 @@ config HIGH_RES_TIMERS | |||
26 | hardware is not capable then this option only increases | 75 | hardware is not capable then this option only increases |
27 | the size of the kernel image. | 76 | the size of the kernel image. |
28 | 77 | ||
29 | config GENERIC_CLOCKEVENTS_BUILD | 78 | endmenu |
30 | bool | 79 | endif |
31 | default y | ||
32 | depends on GENERIC_CLOCKEVENTS | ||
33 | |||
34 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
35 | bool | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7b..aa27d391bfc8 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); | |||
59 | * If one has not already been chosen, it checks to see if a | 59 | * If one has not already been chosen, it checks to see if a |
60 | * functional rtc device is available. | 60 | * functional rtc device is available. |
61 | */ | 61 | */ |
62 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | struct rtc_device *alarmtimer_get_rtcdev(void) |
63 | { | 63 | { |
64 | unsigned long flags; | 64 | unsigned long flags; |
65 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) | |||
115 | class_interface_unregister(&alarmtimer_rtc_interface); | 115 | class_interface_unregister(&alarmtimer_rtc_interface); |
116 | } | 116 | } |
117 | #else | 117 | #else |
118 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) | 118 | struct rtc_device *alarmtimer_get_rtcdev(void) |
119 | { | 119 | { |
120 | return NULL; | 120 | return NULL; |
121 | } | 121 | } |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
297 | } | 297 | } |
298 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 298 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
299 | 299 | ||
300 | static void clockevents_config(struct clock_event_device *dev, | 300 | void clockevents_config(struct clock_event_device *dev, u32 freq) |
301 | u32 freq) | ||
302 | { | 301 | { |
303 | u64 sec; | 302 | u64 sec; |
304 | 303 | ||
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f03fd83b170b..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -409,15 +409,20 @@ int second_overflow(unsigned long secs) | |||
409 | time_state = TIME_DEL; | 409 | time_state = TIME_DEL; |
410 | break; | 410 | break; |
411 | case TIME_INS: | 411 | case TIME_INS: |
412 | if (secs % 86400 == 0) { | 412 | if (!(time_status & STA_INS)) |
413 | time_state = TIME_OK; | ||
414 | else if (secs % 86400 == 0) { | ||
413 | leap = -1; | 415 | leap = -1; |
414 | time_state = TIME_OOP; | 416 | time_state = TIME_OOP; |
417 | time_tai++; | ||
415 | printk(KERN_NOTICE | 418 | printk(KERN_NOTICE |
416 | "Clock: inserting leap second 23:59:60 UTC\n"); | 419 | "Clock: inserting leap second 23:59:60 UTC\n"); |
417 | } | 420 | } |
418 | break; | 421 | break; |
419 | case TIME_DEL: | 422 | case TIME_DEL: |
420 | if ((secs + 1) % 86400 == 0) { | 423 | if (!(time_status & STA_DEL)) |
424 | time_state = TIME_OK; | ||
425 | else if ((secs + 1) % 86400 == 0) { | ||
421 | leap = 1; | 426 | leap = 1; |
422 | time_tai--; | 427 | time_tai--; |
423 | time_state = TIME_WAIT; | 428 | time_state = TIME_WAIT; |
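With the STA_INS/STA_DEL checks added above, a leap second stays armed only while user space keeps the corresponding status bit set; dropping the bit before the day boundary returns the state machine to TIME_OK instead of leaving a stale insertion pending. A hedged sketch of how an NTP daemon arms an insertion through adjtimex(2) (requires CAP_SYS_TIME):

#include <stdio.h>
#include <sys/timex.h>

static int arm_leap_second_insert(void)
{
        struct timex tx = {
                .modes  = ADJ_STATUS,   /* only the status bits are updated */
                .status = STA_INS,      /* insert 23:59:60 at the next UTC midnight */
        };
        int state = adjtimex(&tx);

        if (state < 0) {
                perror("adjtimex");
                return -1;
        }
        /* the return value reports the NTP clock state (TIME_OK, TIME_INS, ...) */
        printf("clock state: %d\n", state);
        return 0;
}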
@@ -426,7 +431,6 @@ int second_overflow(unsigned long secs) | |||
426 | } | 431 | } |
427 | break; | 432 | break; |
428 | case TIME_OOP: | 433 | case TIME_OOP: |
429 | time_tai++; | ||
430 | time_state = TIME_WAIT; | 434 | time_state = TIME_WAIT; |
431 | break; | 435 | break; |
432 | 436 | ||
@@ -473,8 +477,6 @@ int second_overflow(unsigned long secs) | |||
473 | << NTP_SCALE_SHIFT; | 477 | << NTP_SCALE_SHIFT; |
474 | time_adjust = 0; | 478 | time_adjust = 0; |
475 | 479 | ||
476 | |||
477 | |||
478 | out: | 480 | out: |
479 | spin_unlock_irqrestore(&ntp_lock, flags); | 481 | spin_unlock_irqrestore(&ntp_lock, flags); |
480 | 482 | ||
@@ -559,10 +561,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
559 | /* only set allowed bits */ | 561 | /* only set allowed bits */ |
560 | time_status &= STA_RONLY; | 562 | time_status &= STA_RONLY; |
561 | time_status |= txc->status & ~STA_RONLY; | 563 | time_status |= txc->status & ~STA_RONLY; |
562 | |||
563 | } | 564 | } |
565 | |||
564 | /* | 566 | /* |
565 | * Called with the xtime lock held, so we can access and modify | 567 | * Called with ntp_lock held, so we can access and modify |
566 | * all the global NTP state: | 568 | * all the global NTP state: |
567 | */ | 569 | */ |
568 | static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) | 570 | static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | |||
274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | 274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
275 | { | 275 | { |
276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
277 | unsigned long rcu_delta_jiffies; | ||
277 | ktime_t last_update, expires, now; | 278 | ktime_t last_update, expires, now; |
278 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 279 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
279 | u64 time_delta; | 280 | u64 time_delta; |
@@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
322 | time_delta = timekeeping_max_deferment(); | 323 | time_delta = timekeeping_max_deferment(); |
323 | } while (read_seqretry(&xtime_lock, seq)); | 324 | } while (read_seqretry(&xtime_lock, seq)); |
324 | 325 | ||
325 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 326 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
326 | arch_needs_cpu(cpu)) { | 327 | arch_needs_cpu(cpu)) { |
327 | next_jiffies = last_jiffies + 1; | 328 | next_jiffies = last_jiffies + 1; |
328 | delta_jiffies = 1; | 329 | delta_jiffies = 1; |
@@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
330 | /* Get the next timer wheel timer */ | 331 | /* Get the next timer wheel timer */ |
331 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 332 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
332 | delta_jiffies = next_jiffies - last_jiffies; | 333 | delta_jiffies = next_jiffies - last_jiffies; |
334 | if (rcu_delta_jiffies < delta_jiffies) { | ||
335 | next_jiffies = last_jiffies + rcu_delta_jiffies; | ||
336 | delta_jiffies = rcu_delta_jiffies; | ||
337 | } | ||
333 | } | 338 | } |
334 | /* | 339 | /* |
335 | * Do not stop the tick, if we are only one off | 340 | * Do not stop the tick, if we are only one off |
@@ -401,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
401 | */ | 406 | */ |
402 | if (!ts->tick_stopped) { | 407 | if (!ts->tick_stopped) { |
403 | select_nohz_load_balancer(1); | 408 | select_nohz_load_balancer(1); |
409 | calc_load_enter_idle(); | ||
404 | 410 | ||
405 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 411 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
406 | ts->tick_stopped = 1; | 412 | ts->tick_stopped = 1; |
@@ -576,6 +582,7 @@ void tick_nohz_idle_exit(void) | |||
576 | /* Update jiffies first */ | 582 | /* Update jiffies first */ |
577 | select_nohz_load_balancer(0); | 583 | select_nohz_load_balancer(0); |
578 | tick_do_update_jiffies64(now); | 584 | tick_do_update_jiffies64(now); |
585 | update_cpu_load_nohz(); | ||
579 | 586 | ||
580 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 587 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
581 | /* | 588 | /* |
@@ -591,6 +598,7 @@ void tick_nohz_idle_exit(void) | |||
591 | account_idle_ticks(ticks); | 598 | account_idle_ticks(ticks); |
592 | #endif | 599 | #endif |
593 | 600 | ||
601 | calc_load_exit_idle(); | ||
594 | touch_softlockup_watchdog(); | 602 | touch_softlockup_watchdog(); |
595 | /* | 603 | /* |
596 | * Cancel the scheduled timer and restore the tick | 604 | * Cancel the scheduled timer and restore the tick |
@@ -814,6 +822,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
814 | return HRTIMER_RESTART; | 822 | return HRTIMER_RESTART; |
815 | } | 823 | } |
816 | 824 | ||
825 | static int sched_skew_tick; | ||
826 | |||
827 | static int __init skew_tick(char *str) | ||
828 | { | ||
829 | get_option(&str, &sched_skew_tick); | ||
830 | |||
831 | return 0; | ||
832 | } | ||
833 | early_param("skew_tick", skew_tick); | ||
834 | |||
817 | /** | 835 | /** |
818 | * tick_setup_sched_timer - setup the tick emulation timer | 836 | * tick_setup_sched_timer - setup the tick emulation timer |
819 | */ | 837 | */ |
@@ -831,6 +849,14 @@ void tick_setup_sched_timer(void) | |||
831 | /* Get the next period (per cpu) */ | 849 | /* Get the next period (per cpu) */ |
832 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 850 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
833 | 851 | ||
852 | /* Offset the tick to avert xtime_lock contention. */ | ||
853 | if (sched_skew_tick) { | ||
854 | u64 offset = ktime_to_ns(tick_period) >> 1; | ||
855 | do_div(offset, num_possible_cpus()); | ||
856 | offset *= smp_processor_id(); | ||
857 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
858 | } | ||
859 | |||
834 | for (;;) { | 860 | for (;;) { |
835 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 861 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
836 | hrtimer_start_expires(&ts->sched_timer, | 862 | hrtimer_start_expires(&ts->sched_timer, |
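For a sense of scale of the skew_tick= offset computed in tick_setup_sched_timer() above: each CPU's tick is shifted by (tick_period / 2 / num_possible_cpus()) * cpu. Worked through for HZ=1000 (tick_period = 1,000,000 ns) on a hypothetical 4-CPU box:

        offset(cpu) = (1000000 ns / 2) / 4 * cpu = 125000 ns * cpu
        cpu 0 -> +0 us,  cpu 1 -> +125 us,  cpu 2 -> +250 us,  cpu 3 -> +375 us

so the per-CPU ticks are spread across the first half of each 1 ms period instead of every CPU hitting xtime_lock at the same instant. The skew is off by default and enabled by booting with skew_tick=1.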
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d66b21308f7c..3447cfaf11e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -70,6 +70,12 @@ struct timekeeper { | |||
70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | 70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ |
71 | struct timespec raw_time; | 71 | struct timespec raw_time; |
72 | 72 | ||
73 | /* Offset clock monotonic -> clock realtime */ | ||
74 | ktime_t offs_real; | ||
75 | |||
76 | /* Offset clock monotonic -> clock boottime */ | ||
77 | ktime_t offs_boot; | ||
78 | |||
73 | /* Seqlock for all timekeeper values */ | 79 | /* Seqlock for all timekeeper values */ |
74 | seqlock_t lock; | 80 | seqlock_t lock; |
75 | }; | 81 | }; |
@@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
172 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 178 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
173 | } | 179 | } |
174 | 180 | ||
181 | static void update_rt_offset(void) | ||
182 | { | ||
183 | struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; | ||
184 | |||
185 | set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); | ||
186 | timekeeper.offs_real = timespec_to_ktime(tmp); | ||
187 | } | ||
188 | |||
175 | /* must hold write on timekeeper.lock */ | 189 | /* must hold write on timekeeper.lock */ |
176 | static void timekeeping_update(bool clearntp) | 190 | static void timekeeping_update(bool clearntp) |
177 | { | 191 | { |
@@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) | |||
179 | timekeeper.ntp_error = 0; | 193 | timekeeper.ntp_error = 0; |
180 | ntp_clear(); | 194 | ntp_clear(); |
181 | } | 195 | } |
196 | update_rt_offset(); | ||
182 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, | 197 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, |
183 | timekeeper.clock, timekeeper.mult); | 198 | timekeeper.clock, timekeeper.mult); |
184 | } | 199 | } |
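The two offsets cached in struct timekeeper above encode the usual clock relationships, refreshed by update_rt_offset() here and by update_sleep_time() further down so both can be read in a single seqlock pass:

        offs_real = -wall_to_monotonic   =>  CLOCK_REALTIME = CLOCK_MONOTONIC + offs_real
        offs_boot = total_sleep_time     =>  CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot

ktime_get_update_offsets(), added near the end of this file, hands the current monotonic time plus both offsets to the hrtimer code, which is what lets hrtimer_interrupt() and retrigger_next_event() update their realtime/boottime clock bases after a leap second or settimeofday() without another trip through the timekeeper lock.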
@@ -240,7 +255,6 @@ void getnstimeofday(struct timespec *ts) | |||
240 | 255 | ||
241 | timespec_add_ns(ts, nsecs); | 256 | timespec_add_ns(ts, nsecs); |
242 | } | 257 | } |
243 | |||
244 | EXPORT_SYMBOL(getnstimeofday); | 258 | EXPORT_SYMBOL(getnstimeofday); |
245 | 259 | ||
246 | ktime_t ktime_get(void) | 260 | ktime_t ktime_get(void) |
@@ -357,8 +371,8 @@ void do_gettimeofday(struct timeval *tv) | |||
357 | tv->tv_sec = now.tv_sec; | 371 | tv->tv_sec = now.tv_sec; |
358 | tv->tv_usec = now.tv_nsec/1000; | 372 | tv->tv_usec = now.tv_nsec/1000; |
359 | } | 373 | } |
360 | |||
361 | EXPORT_SYMBOL(do_gettimeofday); | 374 | EXPORT_SYMBOL(do_gettimeofday); |
375 | |||
362 | /** | 376 | /** |
363 | * do_settimeofday - Sets the time of day | 377 | * do_settimeofday - Sets the time of day |
364 | * @tv: pointer to the timespec variable containing the new time | 378 | * @tv: pointer to the timespec variable containing the new time |
@@ -392,7 +406,6 @@ int do_settimeofday(const struct timespec *tv) | |||
392 | 406 | ||
393 | return 0; | 407 | return 0; |
394 | } | 408 | } |
395 | |||
396 | EXPORT_SYMBOL(do_settimeofday); | 409 | EXPORT_SYMBOL(do_settimeofday); |
397 | 410 | ||
398 | 411 | ||
@@ -606,6 +619,7 @@ void __init timekeeping_init(void) | |||
606 | } | 619 | } |
607 | set_normalized_timespec(&timekeeper.wall_to_monotonic, | 620 | set_normalized_timespec(&timekeeper.wall_to_monotonic, |
608 | -boot.tv_sec, -boot.tv_nsec); | 621 | -boot.tv_sec, -boot.tv_nsec); |
622 | update_rt_offset(); | ||
609 | timekeeper.total_sleep_time.tv_sec = 0; | 623 | timekeeper.total_sleep_time.tv_sec = 0; |
610 | timekeeper.total_sleep_time.tv_nsec = 0; | 624 | timekeeper.total_sleep_time.tv_nsec = 0; |
611 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 625 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
@@ -614,6 +628,12 @@ void __init timekeeping_init(void) | |||
614 | /* time in seconds when suspend began */ | 628 | /* time in seconds when suspend began */ |
615 | static struct timespec timekeeping_suspend_time; | 629 | static struct timespec timekeeping_suspend_time; |
616 | 630 | ||
631 | static void update_sleep_time(struct timespec t) | ||
632 | { | ||
633 | timekeeper.total_sleep_time = t; | ||
634 | timekeeper.offs_boot = timespec_to_ktime(t); | ||
635 | } | ||
636 | |||
617 | /** | 637 | /** |
618 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | 638 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval |
619 | * @delta: pointer to a timespec delta value | 639 | * @delta: pointer to a timespec delta value |
@@ -632,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) | |||
632 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); | 652 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); |
633 | timekeeper.wall_to_monotonic = | 653 | timekeeper.wall_to_monotonic = |
634 | timespec_sub(timekeeper.wall_to_monotonic, *delta); | 654 | timespec_sub(timekeeper.wall_to_monotonic, *delta); |
635 | timekeeper.total_sleep_time = timespec_add( | 655 | update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); |
636 | timekeeper.total_sleep_time, *delta); | ||
637 | } | 656 | } |
638 | 657 | ||
639 | 658 | ||
@@ -698,6 +717,7 @@ static void timekeeping_resume(void) | |||
698 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 717 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
699 | timekeeper.ntp_error = 0; | 718 | timekeeper.ntp_error = 0; |
700 | timekeeping_suspended = 0; | 719 | timekeeping_suspended = 0; |
720 | timekeeping_update(false); | ||
701 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 721 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
702 | 722 | ||
703 | touch_softlockup_watchdog(); | 723 | touch_softlockup_watchdog(); |
@@ -964,6 +984,9 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
964 | timekeeper.xtime.tv_sec++; | 984 | timekeeper.xtime.tv_sec++; |
965 | leap = second_overflow(timekeeper.xtime.tv_sec); | 985 | leap = second_overflow(timekeeper.xtime.tv_sec); |
966 | timekeeper.xtime.tv_sec += leap; | 986 | timekeeper.xtime.tv_sec += leap; |
987 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
988 | if (leap) | ||
989 | clock_was_set_delayed(); | ||
967 | } | 990 | } |
968 | 991 | ||
969 | /* Accumulate raw time */ | 992 | /* Accumulate raw time */ |
@@ -1079,6 +1102,9 @@ static void update_wall_time(void) | |||
1079 | timekeeper.xtime.tv_sec++; | 1102 | timekeeper.xtime.tv_sec++; |
1080 | leap = second_overflow(timekeeper.xtime.tv_sec); | 1103 | leap = second_overflow(timekeeper.xtime.tv_sec); |
1081 | timekeeper.xtime.tv_sec += leap; | 1104 | timekeeper.xtime.tv_sec += leap; |
1105 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
1106 | if (leap) | ||
1107 | clock_was_set_delayed(); | ||
1082 | } | 1108 | } |
1083 | 1109 | ||
1084 | timekeeping_update(false); | 1110 | timekeeping_update(false); |
@@ -1246,6 +1272,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1246 | } while (read_seqretry(&timekeeper.lock, seq)); | 1272 | } while (read_seqretry(&timekeeper.lock, seq)); |
1247 | } | 1273 | } |
1248 | 1274 | ||
1275 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1276 | /** | ||
1277 | * ktime_get_update_offsets - hrtimer helper | ||
1278 | * @offs_real: pointer to storage for monotonic -> realtime offset | ||
1279 | * @offs_boot: pointer to storage for monotonic -> boottime offset | ||
1280 | * | ||
1281 | * Returns current monotonic time and updates the offsets | ||
1282 | * Called from hrtimer_interrupt() or retrigger_next_event() | ||
1283 | */ | ||
1284 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | ||
1285 | { | ||
1286 | ktime_t now; | ||
1287 | unsigned int seq; | ||
1288 | u64 secs, nsecs; | ||
1289 | |||
1290 | do { | ||
1291 | seq = read_seqbegin(&timekeeper.lock); | ||
1292 | |||
1293 | secs = timekeeper.xtime.tv_sec; | ||
1294 | nsecs = timekeeper.xtime.tv_nsec; | ||
1295 | nsecs += timekeeping_get_ns(); | ||
1296 | /* If arch requires, add in gettimeoffset() */ | ||
1297 | nsecs += arch_gettimeoffset(); | ||
1298 | |||
1299 | *offs_real = timekeeper.offs_real; | ||
1300 | *offs_boot = timekeeper.offs_boot; | ||
1301 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
1302 | |||
1303 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | ||
1304 | now = ktime_sub(now, *offs_real); | ||
1305 | return now; | ||
1306 | } | ||
1307 | #endif | ||
1308 | |||
1249 | /** | 1309 | /** |
1250 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | 1310 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format |
1251 | */ | 1311 | */ |
diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888e..6ec7e7e0db43 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); | |||
861 | * | 861 | * |
862 | * mod_timer_pinned() is a way to update the expire field of an | 862 | * mod_timer_pinned() is a way to update the expire field of an |
863 | * active timer (if the timer is inactive it will be activated) | 863 | * active timer (if the timer is inactive it will be activated) |
864 | * and not allow the timer to be migrated to a different CPU. | 864 | * and to ensure that the timer is scheduled on the current CPU. |
865 | * | ||
866 | * Note that this does not prevent the timer from being migrated | ||
867 | * when the current CPU goes offline. If this is a problem for | ||
868 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
869 | * example, cancelling the timer when the corresponding CPU goes | ||
870 | * offline. | ||
865 | * | 871 | * |
866 | * mod_timer_pinned(timer, expires) is equivalent to: | 872 | * mod_timer_pinned(timer, expires) is equivalent to: |
867 | * | 873 | * |
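A hedged sketch of the hotplug handling that the new comment above recommends, using the CPU notifier API of this kernel generation; my_timer and my_timer_cpu are hypothetical placeholders for whatever the driver actually pins with mod_timer_pinned():

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/timer.h>

static struct timer_list my_timer;      /* hypothetical pinned timer */
static int my_timer_cpu;                /* CPU it was last pinned on */

static int my_timer_cpu_notify(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        /* The CPU is about to go down: stop the pinned timer explicitly
         * rather than letting the core migrate it elsewhere. */
        if ((action & ~CPU_TASKS_FROZEN) == CPU_DOWN_PREPARE &&
            cpu == my_timer_cpu)
                del_timer_sync(&my_timer);

        return NOTIFY_OK;
}

static struct notifier_block my_timer_cpu_nb = {
        .notifier_call = my_timer_cpu_notify,
};

/* registered once during init: register_cpu_notifier(&my_timer_cpu_nb); */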
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1102 | * warnings as well as problems when looking into | 1108 | * warnings as well as problems when looking into |
1103 | * timer->lockdep_map, make a copy and use that here. | 1109 | * timer->lockdep_map, make a copy and use that here. |
1104 | */ | 1110 | */ |
1105 | struct lockdep_map lockdep_map = timer->lockdep_map; | 1111 | struct lockdep_map lockdep_map; |
1112 | |||
1113 | lockdep_copy_map(&lockdep_map, &timer->lockdep_map); | ||
1106 | #endif | 1114 | #endif |
1107 | /* | 1115 | /* |
1108 | * Couple the lock chain with the lock chain at | 1116 | * Couple the lock chain with the lock chain at |
@@ -1427,25 +1435,25 @@ SYSCALL_DEFINE0(getppid) | |||
1427 | SYSCALL_DEFINE0(getuid) | 1435 | SYSCALL_DEFINE0(getuid) |
1428 | { | 1436 | { |
1429 | /* Only we change this so SMP safe */ | 1437 | /* Only we change this so SMP safe */ |
1430 | return current_uid(); | 1438 | return from_kuid_munged(current_user_ns(), current_uid()); |
1431 | } | 1439 | } |
1432 | 1440 | ||
1433 | SYSCALL_DEFINE0(geteuid) | 1441 | SYSCALL_DEFINE0(geteuid) |
1434 | { | 1442 | { |
1435 | /* Only we change this so SMP safe */ | 1443 | /* Only we change this so SMP safe */ |
1436 | return current_euid(); | 1444 | return from_kuid_munged(current_user_ns(), current_euid()); |
1437 | } | 1445 | } |
1438 | 1446 | ||
1439 | SYSCALL_DEFINE0(getgid) | 1447 | SYSCALL_DEFINE0(getgid) |
1440 | { | 1448 | { |
1441 | /* Only we change this so SMP safe */ | 1449 | /* Only we change this so SMP safe */ |
1442 | return current_gid(); | 1450 | return from_kgid_munged(current_user_ns(), current_gid()); |
1443 | } | 1451 | } |
1444 | 1452 | ||
1445 | SYSCALL_DEFINE0(getegid) | 1453 | SYSCALL_DEFINE0(getegid) |
1446 | { | 1454 | { |
1447 | /* Only we change this so SMP safe */ | 1455 | /* Only we change this so SMP safe */ |
1448 | return current_egid(); | 1456 | return from_kgid_munged(current_user_ns(), current_egid()); |
1449 | } | 1457 | } |
1450 | 1458 | ||
1451 | #endif | 1459 | #endif |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..8c4c07071cc5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,6 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE | ||
145 | select KALLSYMS | 144 | select KALLSYMS |
146 | select GENERIC_TRACER | 145 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 146 | select CONTEXT_SWITCH_TRACER |
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
272 | bool "Trace likely/unlikely profiler" | 271 | bool "Trace likely/unlikely profiler" |
273 | select TRACE_BRANCH_PROFILING | 272 | select TRACE_BRANCH_PROFILING |
274 | help | 273 | help |
275 | This tracer profiles all the the likely and unlikely macros | 274 | This tracer profiles all likely and unlikely macros |
276 | in the kernel. It will display the results in: | 275 | in the kernel. It will display the results in: |
277 | 276 | ||
278 | /sys/kernel/debug/tracing/trace_stat/branch_annotated | 277 | /sys/kernel/debug/tracing/trace_stat/branch_annotated |
@@ -373,6 +372,7 @@ config KPROBE_EVENT | |||
373 | depends on HAVE_REGS_AND_STACK_ACCESS_API | 372 | depends on HAVE_REGS_AND_STACK_ACCESS_API |
374 | bool "Enable kprobes-based dynamic events" | 373 | bool "Enable kprobes-based dynamic events" |
375 | select TRACING | 374 | select TRACING |
375 | select PROBE_EVENTS | ||
376 | default y | 376 | default y |
377 | help | 377 | help |
378 | This allows the user to add tracing events (similar to tracepoints) | 378 | This allows the user to add tracing events (similar to tracepoints) |
@@ -385,6 +385,25 @@ config KPROBE_EVENT | |||
385 | This option is also required by perf-probe subcommand of perf tools. | 385 | This option is also required by perf-probe subcommand of perf tools. |
386 | If you want to use perf tools, this option is strongly recommended. | 386 | If you want to use perf tools, this option is strongly recommended. |
387 | 387 | ||
388 | config UPROBE_EVENT | ||
389 | bool "Enable uprobes-based dynamic events" | ||
390 | depends on ARCH_SUPPORTS_UPROBES | ||
391 | depends on MMU | ||
392 | select UPROBES | ||
393 | select PROBE_EVENTS | ||
394 | select TRACING | ||
395 | default n | ||
396 | help | ||
397 | This allows the user to add tracing events on top of userspace | ||
398 | dynamic events (similar to tracepoints) on the fly via the trace | ||
399 | events interface. Those events can be inserted wherever uprobes | ||
400 | can probe, and record various registers. | ||
401 | This option is required if you plan to use perf-probe subcommand | ||
402 | of perf tools on user space applications. | ||
403 | |||
404 | config PROBE_EVENTS | ||
405 | def_bool n | ||
406 | |||
388 | config DYNAMIC_FTRACE | 407 | config DYNAMIC_FTRACE |
389 | bool "enable/disable ftrace tracepoints dynamically" | 408 | bool "enable/disable ftrace tracepoints dynamically" |
390 | depends on FUNCTION_TRACER | 409 | depends on FUNCTION_TRACER |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5ea..b831087c8200 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o | |||
41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | ||
45 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 44 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
46 | ifeq ($(CONFIG_BLOCK),y) | 45 | ifeq ($(CONFIG_BLOCK),y) |
47 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o | 46 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o |
@@ -61,5 +60,7 @@ endif | |||
61 | ifeq ($(CONFIG_TRACING),y) | 60 | ifeq ($(CONFIG_TRACING),y) |
62 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 61 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
63 | endif | 62 | endif |
63 | obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o | ||
64 | obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o | ||
64 | 65 | ||
65 | libftrace-y := ftrace.o | 66 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c92..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1383 | 1383 | ||
1384 | static int ftrace_cmp_recs(const void *a, const void *b) | 1384 | static int ftrace_cmp_recs(const void *a, const void *b) |
1385 | { | 1385 | { |
1386 | const struct dyn_ftrace *reca = a; | 1386 | const struct dyn_ftrace *key = a; |
1387 | const struct dyn_ftrace *recb = b; | 1387 | const struct dyn_ftrace *rec = b; |
1388 | 1388 | ||
1389 | if (reca->ip > recb->ip) | 1389 | if (key->flags < rec->ip) |
1390 | return 1; | ||
1391 | if (reca->ip < recb->ip) | ||
1392 | return -1; | 1390 | return -1; |
1391 | if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) | ||
1392 | return 1; | ||
1393 | return 0; | 1393 | return 0; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | /** | 1396 | static unsigned long ftrace_location_range(unsigned long start, unsigned long end) |
1397 | * ftrace_location - return true if the ip giving is a traced location | ||
1398 | * @ip: the instruction pointer to check | ||
1399 | * | ||
1400 | * Returns 1 if @ip given is a pointer to a ftrace location. | ||
1401 | * That is, the instruction that is either a NOP or call to | ||
1402 | * the function tracer. It checks the ftrace internal tables to | ||
1403 | * determine if the address belongs or not. | ||
1404 | */ | ||
1405 | int ftrace_location(unsigned long ip) | ||
1406 | { | 1397 | { |
1407 | struct ftrace_page *pg; | 1398 | struct ftrace_page *pg; |
1408 | struct dyn_ftrace *rec; | 1399 | struct dyn_ftrace *rec; |
1409 | struct dyn_ftrace key; | 1400 | struct dyn_ftrace key; |
1410 | 1401 | ||
1411 | key.ip = ip; | 1402 | key.ip = start; |
1403 | key.flags = end; /* overload flags, as it is unsigned long */ | ||
1412 | 1404 | ||
1413 | for (pg = ftrace_pages_start; pg; pg = pg->next) { | 1405 | for (pg = ftrace_pages_start; pg; pg = pg->next) { |
1406 | if (end < pg->records[0].ip || | ||
1407 | start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) | ||
1408 | continue; | ||
1414 | rec = bsearch(&key, pg->records, pg->index, | 1409 | rec = bsearch(&key, pg->records, pg->index, |
1415 | sizeof(struct dyn_ftrace), | 1410 | sizeof(struct dyn_ftrace), |
1416 | ftrace_cmp_recs); | 1411 | ftrace_cmp_recs); |
1417 | if (rec) | 1412 | if (rec) |
1418 | return 1; | 1413 | return rec->ip; |
1419 | } | 1414 | } |
1420 | 1415 | ||
1421 | return 0; | 1416 | return 0; |
1422 | } | 1417 | } |
1423 | 1418 | ||
1419 | /** | ||
1420 | * ftrace_location - return true if the ip given is a traced location | ||
1421 | * @ip: the instruction pointer to check | ||
1422 | * | ||
1423 | * Returns rec->ip if @ip given is a pointer to a ftrace location. | ||
1424 | * That is, the instruction that is either a NOP or call to | ||
1425 | * the function tracer. It checks the ftrace internal tables to | ||
1426 | * determine if the address belongs or not. | ||
1427 | */ | ||
1428 | unsigned long ftrace_location(unsigned long ip) | ||
1429 | { | ||
1430 | return ftrace_location_range(ip, ip); | ||
1431 | } | ||
1432 | |||
1433 | /** | ||
1434 | * ftrace_text_reserved - return true if range contains an ftrace location | ||
1435 | * @start: start of range to search | ||
1436 | * @end: end of range to search (inclusive). @end points to the last byte to check. | ||
1437 | * | ||
1438 | * Returns 1 if the range from @start to @end contains an ftrace location. | ||
1439 | * That is, the instruction that is either a NOP or call to | ||
1440 | * the function tracer. It checks the ftrace internal tables to | ||
1441 | * determine if the address belongs or not. | ||
1442 | */ | ||
1443 | int ftrace_text_reserved(void *start, void *end) | ||
1444 | { | ||
1445 | unsigned long ret; | ||
1446 | |||
1447 | ret = ftrace_location_range((unsigned long)start, | ||
1448 | (unsigned long)end); | ||
1449 | |||
1450 | return (int)!!ret; | ||
1451 | } | ||
1452 | |||
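The rewritten ftrace_cmp_recs() above makes bsearch() treat the key as a byte range (the end address is smuggled through key->flags) and each record as covering [rec->ip, rec->ip + MCOUNT_INSN_SIZE). Worked through for a record at rec->ip == 0xc0001000 with MCOUNT_INSN_SIZE == 5 (the x86 call size):

        key 0xc0000fff..0xc0000fff:  key->flags < rec->ip                  -> -1, search lower
        key 0xc0001003..0xc0001003:  falls inside [0xc0001000, 0xc0001005) ->  0, match
        key 0xc0001005..0xc0001005:  key->ip >= rec->ip + 5                -> +1, search higher

which is exactly what the new ftrace_text_reserved() needs: any overlap between the queried range and a patched mcount call site counts as reserved.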
1424 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1453 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1425 | int filter_hash, | 1454 | int filter_hash, |
1426 | bool inc) | 1455 | bool inc) |
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
1520 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1549 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
1521 | } | 1550 | } |
1522 | 1551 | ||
1523 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) | ||
1524 | { | ||
1525 | if (ftrace_pages->index == ftrace_pages->size) { | ||
1526 | /* We should have allocated enough */ | ||
1527 | if (WARN_ON(!ftrace_pages->next)) | ||
1528 | return NULL; | ||
1529 | ftrace_pages = ftrace_pages->next; | ||
1530 | } | ||
1531 | |||
1532 | return &ftrace_pages->records[ftrace_pages->index++]; | ||
1533 | } | ||
1534 | |||
1535 | static struct dyn_ftrace * | ||
1536 | ftrace_record_ip(unsigned long ip) | ||
1537 | { | ||
1538 | struct dyn_ftrace *rec; | ||
1539 | |||
1540 | if (ftrace_disabled) | ||
1541 | return NULL; | ||
1542 | |||
1543 | rec = ftrace_alloc_dyn_node(ip); | ||
1544 | if (!rec) | ||
1545 | return NULL; | ||
1546 | |||
1547 | rec->ip = ip; | ||
1548 | |||
1549 | return rec; | ||
1550 | } | ||
1551 | |||
1552 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1552 | static void print_ip_ins(const char *fmt, unsigned char *p) |
1553 | { | 1553 | { |
1554 | int i; | 1554 | int i; |
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) | |||
1598 | } | 1598 | } |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | |||
1602 | /* Return 1 if the address range is reserved for ftrace */ | ||
1603 | int ftrace_text_reserved(void *start, void *end) | ||
1604 | { | ||
1605 | struct dyn_ftrace *rec; | ||
1606 | struct ftrace_page *pg; | ||
1607 | |||
1608 | do_for_each_ftrace_rec(pg, rec) { | ||
1609 | if (rec->ip <= (unsigned long)end && | ||
1610 | rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) | ||
1611 | return 1; | ||
1612 | } while_for_each_ftrace_rec(); | ||
1613 | return 0; | ||
1614 | } | ||
1615 | |||
1616 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1601 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
1617 | { | 1602 | { |
1618 | unsigned long flag = 0UL; | 1603 | unsigned long flag = 0UL; |
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1698 | return -1; /* unknown ftrace bug */ | 1683 | return -1; /* unknown ftrace bug */ |
1699 | } | 1684 | } |
1700 | 1685 | ||
1701 | static void ftrace_replace_code(int update) | 1686 | void __weak ftrace_replace_code(int enable) |
1702 | { | 1687 | { |
1703 | struct dyn_ftrace *rec; | 1688 | struct dyn_ftrace *rec; |
1704 | struct ftrace_page *pg; | 1689 | struct ftrace_page *pg; |
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) | |||
1708 | return; | 1693 | return; |
1709 | 1694 | ||
1710 | do_for_each_ftrace_rec(pg, rec) { | 1695 | do_for_each_ftrace_rec(pg, rec) { |
1711 | failed = __ftrace_replace_code(rec, update); | 1696 | failed = __ftrace_replace_code(rec, enable); |
1712 | if (failed) { | 1697 | if (failed) { |
1713 | ftrace_bug(failed, rec->ip); | 1698 | ftrace_bug(failed, rec->ip); |
1714 | /* Stop processing */ | 1699 | /* Stop processing */ |
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
1826 | return 0; | 1811 | return 0; |
1827 | } | 1812 | } |
1828 | 1813 | ||
1829 | static int __ftrace_modify_code(void *data) | 1814 | void ftrace_modify_all_code(int command) |
1830 | { | 1815 | { |
1831 | int *command = data; | 1816 | if (command & FTRACE_UPDATE_CALLS) |
1832 | |||
1833 | if (*command & FTRACE_UPDATE_CALLS) | ||
1834 | ftrace_replace_code(1); | 1817 | ftrace_replace_code(1); |
1835 | else if (*command & FTRACE_DISABLE_CALLS) | 1818 | else if (command & FTRACE_DISABLE_CALLS) |
1836 | ftrace_replace_code(0); | 1819 | ftrace_replace_code(0); |
1837 | 1820 | ||
1838 | if (*command & FTRACE_UPDATE_TRACE_FUNC) | 1821 | if (command & FTRACE_UPDATE_TRACE_FUNC) |
1839 | ftrace_update_ftrace_func(ftrace_trace_function); | 1822 | ftrace_update_ftrace_func(ftrace_trace_function); |
1840 | 1823 | ||
1841 | if (*command & FTRACE_START_FUNC_RET) | 1824 | if (command & FTRACE_START_FUNC_RET) |
1842 | ftrace_enable_ftrace_graph_caller(); | 1825 | ftrace_enable_ftrace_graph_caller(); |
1843 | else if (*command & FTRACE_STOP_FUNC_RET) | 1826 | else if (command & FTRACE_STOP_FUNC_RET) |
1844 | ftrace_disable_ftrace_graph_caller(); | 1827 | ftrace_disable_ftrace_graph_caller(); |
1828 | } | ||
1829 | |||
1830 | static int __ftrace_modify_code(void *data) | ||
1831 | { | ||
1832 | int *command = data; | ||
1833 | |||
1834 | ftrace_modify_all_code(*command); | ||
1845 | 1835 | ||
1846 | return 0; | 1836 | return 0; |
1847 | } | 1837 | } |
@@ -2469,57 +2459,35 @@ static int | |||
2469 | ftrace_avail_open(struct inode *inode, struct file *file) | 2459 | ftrace_avail_open(struct inode *inode, struct file *file) |
2470 | { | 2460 | { |
2471 | struct ftrace_iterator *iter; | 2461 | struct ftrace_iterator *iter; |
2472 | int ret; | ||
2473 | 2462 | ||
2474 | if (unlikely(ftrace_disabled)) | 2463 | if (unlikely(ftrace_disabled)) |
2475 | return -ENODEV; | 2464 | return -ENODEV; |
2476 | 2465 | ||
2477 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2466 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2478 | if (!iter) | 2467 | if (iter) { |
2479 | return -ENOMEM; | 2468 | iter->pg = ftrace_pages_start; |
2480 | 2469 | iter->ops = &global_ops; | |
2481 | iter->pg = ftrace_pages_start; | ||
2482 | iter->ops = &global_ops; | ||
2483 | |||
2484 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2485 | if (!ret) { | ||
2486 | struct seq_file *m = file->private_data; | ||
2487 | |||
2488 | m->private = iter; | ||
2489 | } else { | ||
2490 | kfree(iter); | ||
2491 | } | 2470 | } |
2492 | 2471 | ||
2493 | return ret; | 2472 | return iter ? 0 : -ENOMEM; |
2494 | } | 2473 | } |
2495 | 2474 | ||
2496 | static int | 2475 | static int |
2497 | ftrace_enabled_open(struct inode *inode, struct file *file) | 2476 | ftrace_enabled_open(struct inode *inode, struct file *file) |
2498 | { | 2477 | { |
2499 | struct ftrace_iterator *iter; | 2478 | struct ftrace_iterator *iter; |
2500 | int ret; | ||
2501 | 2479 | ||
2502 | if (unlikely(ftrace_disabled)) | 2480 | if (unlikely(ftrace_disabled)) |
2503 | return -ENODEV; | 2481 | return -ENODEV; |
2504 | 2482 | ||
2505 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2483 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2506 | if (!iter) | 2484 | if (iter) { |
2507 | return -ENOMEM; | 2485 | iter->pg = ftrace_pages_start; |
2508 | 2486 | iter->flags = FTRACE_ITER_ENABLED; | |
2509 | iter->pg = ftrace_pages_start; | 2487 | iter->ops = &global_ops; |
2510 | iter->flags = FTRACE_ITER_ENABLED; | ||
2511 | iter->ops = &global_ops; | ||
2512 | |||
2513 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2514 | if (!ret) { | ||
2515 | struct seq_file *m = file->private_data; | ||
2516 | |||
2517 | m->private = iter; | ||
2518 | } else { | ||
2519 | kfree(iter); | ||
2520 | } | 2488 | } |
2521 | 2489 | ||
2522 | return ret; | 2490 | return iter ? 0 : -ENOMEM; |
2523 | } | 2491 | } |
2524 | 2492 | ||
2525 | static void ftrace_filter_reset(struct ftrace_hash *hash) | 2493 | static void ftrace_filter_reset(struct ftrace_hash *hash) |
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3688 | return 0; | 3656 | return 0; |
3689 | } | 3657 | } |
3690 | 3658 | ||
3691 | static void ftrace_swap_recs(void *a, void *b, int size) | 3659 | static int ftrace_cmp_ips(const void *a, const void *b) |
3660 | { | ||
3661 | const unsigned long *ipa = a; | ||
3662 | const unsigned long *ipb = b; | ||
3663 | |||
3664 | if (*ipa > *ipb) | ||
3665 | return 1; | ||
3666 | if (*ipa < *ipb) | ||
3667 | return -1; | ||
3668 | return 0; | ||
3669 | } | ||
3670 | |||
3671 | static void ftrace_swap_ips(void *a, void *b, int size) | ||
3692 | { | 3672 | { |
3693 | struct dyn_ftrace *reca = a; | 3673 | unsigned long *ipa = a; |
3694 | struct dyn_ftrace *recb = b; | 3674 | unsigned long *ipb = b; |
3695 | struct dyn_ftrace t; | 3675 | unsigned long t; |
3696 | 3676 | ||
3697 | t = *reca; | 3677 | t = *ipa; |
3698 | *reca = *recb; | 3678 | *ipa = *ipb; |
3699 | *recb = t; | 3679 | *ipb = t; |
3700 | } | 3680 | } |
3701 | 3681 | ||
3702 | static int ftrace_process_locs(struct module *mod, | 3682 | static int ftrace_process_locs(struct module *mod, |
3703 | unsigned long *start, | 3683 | unsigned long *start, |
3704 | unsigned long *end) | 3684 | unsigned long *end) |
3705 | { | 3685 | { |
3686 | struct ftrace_page *start_pg; | ||
3706 | struct ftrace_page *pg; | 3687 | struct ftrace_page *pg; |
3688 | struct dyn_ftrace *rec; | ||
3707 | unsigned long count; | 3689 | unsigned long count; |
3708 | unsigned long *p; | 3690 | unsigned long *p; |
3709 | unsigned long addr; | 3691 | unsigned long addr; |
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3715 | if (!count) | 3697 | if (!count) |
3716 | return 0; | 3698 | return 0; |
3717 | 3699 | ||
3718 | pg = ftrace_allocate_pages(count); | 3700 | sort(start, count, sizeof(*start), |
3719 | if (!pg) | 3701 | ftrace_cmp_ips, ftrace_swap_ips); |
3702 | |||
3703 | start_pg = ftrace_allocate_pages(count); | ||
3704 | if (!start_pg) | ||
3720 | return -ENOMEM; | 3705 | return -ENOMEM; |
3721 | 3706 | ||
3722 | mutex_lock(&ftrace_lock); | 3707 | mutex_lock(&ftrace_lock); |
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, | |||
3729 | if (!mod) { | 3714 | if (!mod) { |
3730 | WARN_ON(ftrace_pages || ftrace_pages_start); | 3715 | WARN_ON(ftrace_pages || ftrace_pages_start); |
3731 | /* First initialization */ | 3716 | /* First initialization */ |
3732 | ftrace_pages = ftrace_pages_start = pg; | 3717 | ftrace_pages = ftrace_pages_start = start_pg; |
3733 | } else { | 3718 | } else { |
3734 | if (!ftrace_pages) | 3719 | if (!ftrace_pages) |
3735 | goto out; | 3720 | goto out; |
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3740 | ftrace_pages = ftrace_pages->next; | 3725 | ftrace_pages = ftrace_pages->next; |
3741 | } | 3726 | } |
3742 | 3727 | ||
3743 | ftrace_pages->next = pg; | 3728 | ftrace_pages->next = start_pg; |
3744 | ftrace_pages = pg; | ||
3745 | } | 3729 | } |
3746 | 3730 | ||
3747 | p = start; | 3731 | p = start; |
3732 | pg = start_pg; | ||
3748 | while (p < end) { | 3733 | while (p < end) { |
3749 | addr = ftrace_call_adjust(*p++); | 3734 | addr = ftrace_call_adjust(*p++); |
3750 | /* | 3735 | /* |
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, | |||
3755 | */ | 3740 | */ |
3756 | if (!addr) | 3741 | if (!addr) |
3757 | continue; | 3742 | continue; |
3758 | if (!ftrace_record_ip(addr)) | 3743 | |
3759 | break; | 3744 | if (pg->index == pg->size) { |
3745 | /* We should have allocated enough */ | ||
3746 | if (WARN_ON(!pg->next)) | ||
3747 | break; | ||
3748 | pg = pg->next; | ||
3749 | } | ||
3750 | |||
3751 | rec = &pg->records[pg->index++]; | ||
3752 | rec->ip = addr; | ||
3760 | } | 3753 | } |
3761 | 3754 | ||
3762 | /* These new locations need to be initialized */ | 3755 | /* We should have used all pages */ |
3763 | ftrace_new_pgs = pg; | 3756 | WARN_ON(pg->next); |
3757 | |||
3758 | /* Assign the last page to ftrace_pages */ | ||
3759 | ftrace_pages = pg; | ||
3764 | 3760 | ||
3765 | /* Make each individual set of pages sorted by ips */ | 3761 | /* These new locations need to be initialized */ |
3766 | for (; pg; pg = pg->next) | 3762 | ftrace_new_pgs = start_pg; |
3767 | sort(pg->records, pg->index, sizeof(struct dyn_ftrace), | ||
3768 | ftrace_cmp_recs, ftrace_swap_recs); | ||
3769 | 3763 | ||
3770 | /* | 3764 | /* |
3771 | * We only need to disable interrupts on start up | 3765 | * We only need to disable interrupts on start up |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91efd..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -23,6 +23,8 @@ | |||
23 | #include <asm/local.h> | 23 | #include <asm/local.h> |
24 | #include "trace.h" | 24 | #include "trace.h" |
25 | 25 | ||
26 | static void update_pages_handler(struct work_struct *work); | ||
27 | |||
26 | /* | 28 | /* |
27 | * The ring buffer header is special. We must manually keep it up to date. | 29 | * The ring buffer header is special. We must manually keep it up to date. |
28 | */ | 30 | */ |
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu { | |||
449 | raw_spinlock_t reader_lock; /* serialize readers */ | 451 | raw_spinlock_t reader_lock; /* serialize readers */ |
450 | arch_spinlock_t lock; | 452 | arch_spinlock_t lock; |
451 | struct lock_class_key lock_key; | 453 | struct lock_class_key lock_key; |
454 | unsigned int nr_pages; | ||
452 | struct list_head *pages; | 455 | struct list_head *pages; |
453 | struct buffer_page *head_page; /* read from head */ | 456 | struct buffer_page *head_page; /* read from head */ |
454 | struct buffer_page *tail_page; /* write to tail */ | 457 | struct buffer_page *tail_page; /* write to tail */ |
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu { | |||
466 | unsigned long read_bytes; | 469 | unsigned long read_bytes; |
467 | u64 write_stamp; | 470 | u64 write_stamp; |
468 | u64 read_stamp; | 471 | u64 read_stamp; |
472 | /* ring buffer pages to update, > 0 to add, < 0 to remove */ | ||
473 | int nr_pages_to_update; | ||
474 | struct list_head new_pages; /* new pages to add */ | ||
475 | struct work_struct update_pages_work; | ||
476 | struct completion update_done; | ||
469 | }; | 477 | }; |
470 | 478 | ||
471 | struct ring_buffer { | 479 | struct ring_buffer { |
472 | unsigned pages; | ||
473 | unsigned flags; | 480 | unsigned flags; |
474 | int cpus; | 481 | int cpus; |
475 | atomic_t record_disabled; | 482 | atomic_t record_disabled; |
483 | atomic_t resize_disabled; | ||
476 | cpumask_var_t cpumask; | 484 | cpumask_var_t cpumask; |
477 | 485 | ||
478 | struct lock_class_key *reader_lock_key; | 486 | struct lock_class_key *reader_lock_key; |
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
937 | struct list_head *head = cpu_buffer->pages; | 945 | struct list_head *head = cpu_buffer->pages; |
938 | struct buffer_page *bpage, *tmp; | 946 | struct buffer_page *bpage, *tmp; |
939 | 947 | ||
948 | /* Reset the head page if it exists */ | ||
949 | if (cpu_buffer->head_page) | ||
950 | rb_set_head_page(cpu_buffer); | ||
951 | |||
940 | rb_head_page_deactivate(cpu_buffer); | 952 | rb_head_page_deactivate(cpu_buffer); |
941 | 953 | ||
942 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) | 954 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) |
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
963 | return 0; | 975 | return 0; |
964 | } | 976 | } |
965 | 977 | ||
966 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | 978 | static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) |
967 | unsigned nr_pages) | ||
968 | { | 979 | { |
980 | int i; | ||
969 | struct buffer_page *bpage, *tmp; | 981 | struct buffer_page *bpage, *tmp; |
970 | LIST_HEAD(pages); | ||
971 | unsigned i; | ||
972 | |||
973 | WARN_ON(!nr_pages); | ||
974 | 982 | ||
975 | for (i = 0; i < nr_pages; i++) { | 983 | for (i = 0; i < nr_pages; i++) { |
976 | struct page *page; | 984 | struct page *page; |
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
981 | */ | 989 | */ |
982 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 990 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
983 | GFP_KERNEL | __GFP_NORETRY, | 991 | GFP_KERNEL | __GFP_NORETRY, |
984 | cpu_to_node(cpu_buffer->cpu)); | 992 | cpu_to_node(cpu)); |
985 | if (!bpage) | 993 | if (!bpage) |
986 | goto free_pages; | 994 | goto free_pages; |
987 | 995 | ||
988 | rb_check_bpage(cpu_buffer, bpage); | 996 | list_add(&bpage->list, pages); |
989 | 997 | ||
990 | list_add(&bpage->list, &pages); | 998 | page = alloc_pages_node(cpu_to_node(cpu), |
991 | |||
992 | page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), | ||
993 | GFP_KERNEL | __GFP_NORETRY, 0); | 999 | GFP_KERNEL | __GFP_NORETRY, 0); |
994 | if (!page) | 1000 | if (!page) |
995 | goto free_pages; | 1001 | goto free_pages; |
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
997 | rb_init_page(bpage->page); | 1003 | rb_init_page(bpage->page); |
998 | } | 1004 | } |
999 | 1005 | ||
1006 | return 0; | ||
1007 | |||
1008 | free_pages: | ||
1009 | list_for_each_entry_safe(bpage, tmp, pages, list) { | ||
1010 | list_del_init(&bpage->list); | ||
1011 | free_buffer_page(bpage); | ||
1012 | } | ||
1013 | |||
1014 | return -ENOMEM; | ||
1015 | } | ||
1016 | |||
1017 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | ||
1018 | unsigned nr_pages) | ||
1019 | { | ||
1020 | LIST_HEAD(pages); | ||
1021 | |||
1022 | WARN_ON(!nr_pages); | ||
1023 | |||
1024 | if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1000 | /* | 1027 | /* |
1001 | * The ring buffer page list is a circular list that does not | 1028 | * The ring buffer page list is a circular list that does not |
1002 | * start and end with a list head. All page list items point to | 1029 | * start and end with a list head. All page list items point to |
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1005 | cpu_buffer->pages = pages.next; | 1032 | cpu_buffer->pages = pages.next; |
1006 | list_del(&pages); | 1033 | list_del(&pages); |
1007 | 1034 | ||
1035 | cpu_buffer->nr_pages = nr_pages; | ||
1036 | |||
1008 | rb_check_pages(cpu_buffer); | 1037 | rb_check_pages(cpu_buffer); |
1009 | 1038 | ||
1010 | return 0; | 1039 | return 0; |
1011 | |||
1012 | free_pages: | ||
1013 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | ||
1014 | list_del_init(&bpage->list); | ||
1015 | free_buffer_page(bpage); | ||
1016 | } | ||
1017 | return -ENOMEM; | ||
1018 | } | 1040 | } |
1019 | 1041 | ||
1020 | static struct ring_buffer_per_cpu * | 1042 | static struct ring_buffer_per_cpu * |
1021 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | 1043 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) |
1022 | { | 1044 | { |
1023 | struct ring_buffer_per_cpu *cpu_buffer; | 1045 | struct ring_buffer_per_cpu *cpu_buffer; |
1024 | struct buffer_page *bpage; | 1046 | struct buffer_page *bpage; |
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1035 | raw_spin_lock_init(&cpu_buffer->reader_lock); | 1057 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1036 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1058 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1037 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1059 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1060 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); | ||
1061 | init_completion(&cpu_buffer->update_done); | ||
1038 | 1062 | ||
1039 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1063 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1040 | GFP_KERNEL, cpu_to_node(cpu)); | 1064 | GFP_KERNEL, cpu_to_node(cpu)); |
@@ -1051,8 +1075,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1051 | rb_init_page(bpage->page); | 1075 | rb_init_page(bpage->page); |
1052 | 1076 | ||
1053 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
1078 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1054 | 1079 | ||
1055 | ret = rb_allocate_pages(cpu_buffer, buffer->pages); | 1080 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
1056 | if (ret < 0) | 1081 | if (ret < 0) |
1057 | goto fail_free_reader; | 1082 | goto fail_free_reader; |
1058 | 1083 | ||
@@ -1113,7 +1138,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1113 | { | 1138 | { |
1114 | struct ring_buffer *buffer; | 1139 | struct ring_buffer *buffer; |
1115 | int bsize; | 1140 | int bsize; |
1116 | int cpu; | 1141 | int cpu, nr_pages; |
1117 | 1142 | ||
1118 | /* keep it in its own cache line */ | 1143 | /* keep it in its own cache line */ |
1119 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), | 1144 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), |
@@ -1124,14 +1149,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1124 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) | 1149 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) |
1125 | goto fail_free_buffer; | 1150 | goto fail_free_buffer; |
1126 | 1151 | ||
1127 | buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1152 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1128 | buffer->flags = flags; | 1153 | buffer->flags = flags; |
1129 | buffer->clock = trace_clock_local; | 1154 | buffer->clock = trace_clock_local; |
1130 | buffer->reader_lock_key = key; | 1155 | buffer->reader_lock_key = key; |
1131 | 1156 | ||
1132 | /* need at least two pages */ | 1157 | /* need at least two pages */ |
1133 | if (buffer->pages < 2) | 1158 | if (nr_pages < 2) |
1134 | buffer->pages = 2; | 1159 | nr_pages = 2; |
1135 | 1160 | ||
1136 | /* | 1161 | /* |
1137 | * In case of non-hotplug cpu, if the ring-buffer is allocated | 1162 | * In case of non-hotplug cpu, if the ring-buffer is allocated |
@@ -1154,7 +1179,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1154 | 1179 | ||
1155 | for_each_buffer_cpu(buffer, cpu) { | 1180 | for_each_buffer_cpu(buffer, cpu) { |
1156 | buffer->buffers[cpu] = | 1181 | buffer->buffers[cpu] = |
1157 | rb_allocate_cpu_buffer(buffer, cpu); | 1182 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
1158 | if (!buffer->buffers[cpu]) | 1183 | if (!buffer->buffers[cpu]) |
1159 | goto fail_free_buffers; | 1184 | goto fail_free_buffers; |
1160 | } | 1185 | } |
@@ -1222,58 +1247,221 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, | |||
1222 | 1247 | ||
1223 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); | 1248 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); |
1224 | 1249 | ||
1225 | static void | 1250 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) |
1226 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | ||
1227 | { | 1251 | { |
1228 | struct buffer_page *bpage; | 1252 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1229 | struct list_head *p; | 1253 | } |
1230 | unsigned i; | 1254 | |
1255 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1256 | { | ||
1257 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1258 | } | ||
1259 | |||
1260 | static int | ||
1261 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | ||
1262 | { | ||
1263 | struct list_head *tail_page, *to_remove, *next_page; | ||
1264 | struct buffer_page *to_remove_page, *tmp_iter_page; | ||
1265 | struct buffer_page *last_page, *first_page; | ||
1266 | unsigned int nr_removed; | ||
1267 | unsigned long head_bit; | ||
1268 | int page_entries; | ||
1269 | |||
1270 | head_bit = 0; | ||
1231 | 1271 | ||
1232 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1272 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1233 | rb_head_page_deactivate(cpu_buffer); | 1273 | atomic_inc(&cpu_buffer->record_disabled); |
1274 | /* | ||
1275 | * We don't race with the readers since we have acquired the reader | ||
1276 | * lock. We also don't race with writers after disabling recording. | ||
1277 | * This makes it easy to figure out the first and the last page to be | ||
1278 | * removed from the list. We unlink all the pages in between including | ||
1279 | * the first and last pages. This is done in a busy loop so that we | ||
1280 | * lose the least number of traces. | ||
1281 | * The pages are freed after we restart recording and unlock readers. | ||
1282 | */ | ||
1283 | tail_page = &cpu_buffer->tail_page->list; | ||
1234 | 1284 | ||
1235 | for (i = 0; i < nr_pages; i++) { | 1285 | /* |
1236 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1286 | * tail page might be on reader page, we remove the next page |
1237 | goto out; | 1287 | * from the ring buffer |
1238 | p = cpu_buffer->pages->next; | 1288 | */ |
1239 | bpage = list_entry(p, struct buffer_page, list); | 1289 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) |
1240 | list_del_init(&bpage->list); | 1290 | tail_page = rb_list_head(tail_page->next); |
1241 | free_buffer_page(bpage); | 1291 | to_remove = tail_page; |
1292 | |||
1293 | /* start of pages to remove */ | ||
1294 | first_page = list_entry(rb_list_head(to_remove->next), | ||
1295 | struct buffer_page, list); | ||
1296 | |||
1297 | for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { | ||
1298 | to_remove = rb_list_head(to_remove)->next; | ||
1299 | head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; | ||
1242 | } | 1300 | } |
1243 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | ||
1244 | goto out; | ||
1245 | 1301 | ||
1246 | rb_reset_cpu(cpu_buffer); | 1302 | next_page = rb_list_head(to_remove)->next; |
1247 | rb_check_pages(cpu_buffer); | ||
1248 | 1303 | ||
1249 | out: | 1304 | /* |
1305 | * Now we remove all pages between tail_page and next_page. | ||
1306 | * Make sure that we have head_bit value preserved for the | ||
1307 | * next page | ||
1308 | */ | ||
1309 | tail_page->next = (struct list_head *)((unsigned long)next_page | | ||
1310 | head_bit); | ||
1311 | next_page = rb_list_head(next_page); | ||
1312 | next_page->prev = tail_page; | ||
1313 | |||
1314 | /* make sure pages points to a valid page in the ring buffer */ | ||
1315 | cpu_buffer->pages = next_page; | ||
1316 | |||
1317 | /* update head page */ | ||
1318 | if (head_bit) | ||
1319 | cpu_buffer->head_page = list_entry(next_page, | ||
1320 | struct buffer_page, list); | ||
1321 | |||
1322 | /* | ||
1323 | * change read pointer to make sure any read iterators reset | ||
1324 | * themselves | ||
1325 | */ | ||
1326 | cpu_buffer->read = 0; | ||
1327 | |||
1328 | /* pages are removed, resume tracing and then free the pages */ | ||
1329 | atomic_dec(&cpu_buffer->record_disabled); | ||
1250 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1330 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1331 | |||
1332 | RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); | ||
1333 | |||
1334 | /* last buffer page to remove */ | ||
1335 | last_page = list_entry(rb_list_head(to_remove), struct buffer_page, | ||
1336 | list); | ||
1337 | tmp_iter_page = first_page; | ||
1338 | |||
1339 | do { | ||
1340 | to_remove_page = tmp_iter_page; | ||
1341 | rb_inc_page(cpu_buffer, &tmp_iter_page); | ||
1342 | |||
1343 | /* update the counters */ | ||
1344 | page_entries = rb_page_entries(to_remove_page); | ||
1345 | if (page_entries) { | ||
1346 | /* | ||
1347 | * If something was added to this page, it was full | ||
1348 | * since it is not the tail page. So we deduct the | ||
1349 | * bytes consumed in ring buffer from here. | ||
1350 | * Increment overrun to account for the lost events. | ||
1351 | */ | ||
1352 | local_add(page_entries, &cpu_buffer->overrun); | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * We have already removed references to this list item, just | ||
1358 | * free up the buffer_page and its page | ||
1359 | */ | ||
1360 | free_buffer_page(to_remove_page); | ||
1361 | nr_removed--; | ||
1362 | |||
1363 | } while (to_remove_page != last_page); | ||
1364 | |||
1365 | RB_WARN_ON(cpu_buffer, nr_removed); | ||
1366 | |||
1367 | return nr_removed == 0; | ||
1251 | } | 1368 | } |
1252 | 1369 | ||
1253 | static void | 1370 | static int |
1254 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | 1371 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) |
1255 | struct list_head *pages, unsigned nr_pages) | ||
1256 | { | 1372 | { |
1257 | struct buffer_page *bpage; | 1373 | struct list_head *pages = &cpu_buffer->new_pages; |
1258 | struct list_head *p; | 1374 | int retries, success; |
1259 | unsigned i; | ||
1260 | 1375 | ||
1261 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1376 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1262 | rb_head_page_deactivate(cpu_buffer); | 1377 | /* |
1378 | * We are holding the reader lock, so the reader page won't be swapped | ||
1379 | * in the ring buffer. Now we are racing with the writer trying to | ||
1380 | * move head page and the tail page. | ||
1381 | * We are going to adapt the reader page update process where: | ||
1382 | * 1. We first splice the start and end of list of new pages between | ||
1383 | * the head page and its previous page. | ||
1384 | * 2. We cmpxchg the prev_page->next to point from head page to the | ||
1385 | * start of new pages list. | ||
1386 | * 3. Finally, we update the head->prev to the end of new list. | ||
1387 | * | ||
1388 | * We will try this process 10 times, to make sure that we don't keep | ||
1389 | * spinning. | ||
1390 | */ | ||
1391 | retries = 10; | ||
1392 | success = 0; | ||
1393 | while (retries--) { | ||
1394 | struct list_head *head_page, *prev_page, *r; | ||
1395 | struct list_head *last_page, *first_page; | ||
1396 | struct list_head *head_page_with_bit; | ||
1263 | 1397 | ||
1264 | for (i = 0; i < nr_pages; i++) { | 1398 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1265 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1399 | prev_page = head_page->prev; |
1266 | goto out; | 1400 | |
1267 | p = pages->next; | 1401 | first_page = pages->next; |
1268 | bpage = list_entry(p, struct buffer_page, list); | 1402 | last_page = pages->prev; |
1269 | list_del_init(&bpage->list); | 1403 | |
1270 | list_add_tail(&bpage->list, cpu_buffer->pages); | 1404 | head_page_with_bit = (struct list_head *) |
1405 | ((unsigned long)head_page | RB_PAGE_HEAD); | ||
1406 | |||
1407 | last_page->next = head_page_with_bit; | ||
1408 | first_page->prev = prev_page; | ||
1409 | |||
1410 | r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); | ||
1411 | |||
1412 | if (r == head_page_with_bit) { | ||
1413 | /* | ||
1414 | * yay, we replaced the page pointer to our new list, | ||
1415 | * now, we just have to update to head page's prev | ||
1416 | * pointer to point to end of list | ||
1417 | */ | ||
1418 | head_page->prev = last_page; | ||
1419 | success = 1; | ||
1420 | break; | ||
1421 | } | ||
1271 | } | 1422 | } |
1272 | rb_reset_cpu(cpu_buffer); | ||
1273 | rb_check_pages(cpu_buffer); | ||
1274 | 1423 | ||
1275 | out: | 1424 | if (success) |
1425 | INIT_LIST_HEAD(pages); | ||
1426 | /* | ||
1427 | * If we weren't successful in adding in new pages, warn and stop | ||
1428 | * tracing | ||
1429 | */ | ||
1430 | RB_WARN_ON(cpu_buffer, !success); | ||
1276 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1431 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1432 | |||
1433 | /* free pages if they weren't inserted */ | ||
1434 | if (!success) { | ||
1435 | struct buffer_page *bpage, *tmp; | ||
1436 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, | ||
1437 | list) { | ||
1438 | list_del_init(&bpage->list); | ||
1439 | free_buffer_page(bpage); | ||
1440 | } | ||
1441 | } | ||
1442 | return success; | ||
1443 | } | ||
1444 | |||
1445 | static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) | ||
1446 | { | ||
1447 | int success; | ||
1448 | |||
1449 | if (cpu_buffer->nr_pages_to_update > 0) | ||
1450 | success = rb_insert_pages(cpu_buffer); | ||
1451 | else | ||
1452 | success = rb_remove_pages(cpu_buffer, | ||
1453 | -cpu_buffer->nr_pages_to_update); | ||
1454 | |||
1455 | if (success) | ||
1456 | cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; | ||
1457 | } | ||
1458 | |||
1459 | static void update_pages_handler(struct work_struct *work) | ||
1460 | { | ||
1461 | struct ring_buffer_per_cpu *cpu_buffer = container_of(work, | ||
1462 | struct ring_buffer_per_cpu, update_pages_work); | ||
1463 | rb_update_pages(cpu_buffer); | ||
1464 | complete(&cpu_buffer->update_done); | ||
1277 | } | 1465 | } |
1278 | 1466 | ||
1279 | /** | 1467 | /** |
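The comment block in rb_insert_pages() above spells out the lock-free splice: link the new sublist to the flag-tagged head pointer, cmpxchg the previous page's next pointer over to the new first page, and only then fix head->prev, retrying a bounded number of times if a writer moved the head in the meantime. A standalone C model of that tagged-pointer cmpxchg splice (simplified stand-ins for struct buffer_page and RB_PAGE_HEAD, not the kernel code):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_HEAD 0x1UL                 /* stand-in for RB_PAGE_HEAD */

struct bpage {
        _Atomic uintptr_t next;         /* next page, low bit marks the head */
        struct bpage *prev;
        int id;
};

static struct bpage *ptr_of(uintptr_t v) { return (struct bpage *)(v & ~PAGE_HEAD); }
static uintptr_t tag(struct bpage *p)    { return (uintptr_t)p | PAGE_HEAD; }

int main(void)
{
        struct bpage ring[3], new_pages[2];
        struct bpage *head, *prev, *first, *last, *p;
        uintptr_t expected;
        int i;

        /* A three-page ring 0 -> 1 -> 2 -> 0, with page 1 as the head. */
        for (i = 0; i < 3; i++) {
                ring[i].id = i;
                ring[i].prev = &ring[(i + 2) % 3];
                atomic_init(&ring[i].next, (uintptr_t)&ring[(i + 1) % 3]);
        }
        atomic_store(&ring[0].next, tag(&ring[1]));

        /* Two new pages, already linked together: 3 -> 4. */
        for (i = 0; i < 2; i++) {
                new_pages[i].id = 3 + i;
                atomic_init(&new_pages[i].next, 0);
        }
        atomic_store(&new_pages[0].next, (uintptr_t)&new_pages[1]);
        new_pages[1].prev = &new_pages[0];

        head  = &ring[1];
        prev  = head->prev;
        first = &new_pages[0];
        last  = &new_pages[1];

        /* 1. Point the new sublist at the (still tagged) head page. */
        atomic_store(&last->next, tag(head));
        first->prev = prev;

        /* 2. cmpxchg prev->next from the tagged head to the new sublist.
         *    A writer moving the head concurrently makes this fail, and the
         *    kernel code simply retries a bounded number of times. */
        expected = tag(head);
        if (!atomic_compare_exchange_strong(&prev->next, &expected,
                                            (uintptr_t)first)) {
                fprintf(stderr, "head moved, would retry\n");
                return 1;
        }

        /* 3. Only after the cmpxchg succeeded, fix the backward link. */
        head->prev = last;

        /* Walk once around: prints 0 3 4 1 2. */
        p = &ring[0];
        for (i = 0; i < 5; i++) {
                printf("%d ", p->id);
                p = ptr_of(atomic_load(&p->next));
        }
        printf("\n");
        return 0;
}

If the cmpxchg fails, the new sublist is still private to the resizer, so retrying is safe; rb_insert_pages() gives up after ten attempts and then frees the pages it could not splice in.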
@@ -1283,16 +1471,14 @@ out: | |||
1283 | * | 1471 | * |
1284 | * Minimum size is 2 * BUF_PAGE_SIZE. | 1472 | * Minimum size is 2 * BUF_PAGE_SIZE. |
1285 | * | 1473 | * |
1286 | * Returns -1 on failure. | 1474 | * Returns 0 on success and < 0 on failure. |
1287 | */ | 1475 | */ |
1288 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | 1476 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, |
1477 | int cpu_id) | ||
1289 | { | 1478 | { |
1290 | struct ring_buffer_per_cpu *cpu_buffer; | 1479 | struct ring_buffer_per_cpu *cpu_buffer; |
1291 | unsigned nr_pages, rm_pages, new_pages; | 1480 | unsigned nr_pages; |
1292 | struct buffer_page *bpage, *tmp; | 1481 | int cpu, err = 0; |
1293 | unsigned long buffer_size; | ||
1294 | LIST_HEAD(pages); | ||
1295 | int i, cpu; | ||
1296 | 1482 | ||
1297 | /* | 1483 | /* |
1298 | * Always succeed at resizing a non-existent buffer: | 1484 | * Always succeed at resizing a non-existent buffer: |
@@ -1300,115 +1486,161 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1300 | if (!buffer) | 1486 | if (!buffer) |
1301 | return size; | 1487 | return size; |
1302 | 1488 | ||
1489 | /* Make sure the requested buffer exists */ | ||
1490 | if (cpu_id != RING_BUFFER_ALL_CPUS && | ||
1491 | !cpumask_test_cpu(cpu_id, buffer->cpumask)) | ||
1492 | return size; | ||
1493 | |||
1303 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1494 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1304 | size *= BUF_PAGE_SIZE; | 1495 | size *= BUF_PAGE_SIZE; |
1305 | buffer_size = buffer->pages * BUF_PAGE_SIZE; | ||
1306 | 1496 | ||
1307 | /* we need a minimum of two pages */ | 1497 | /* we need a minimum of two pages */ |
1308 | if (size < BUF_PAGE_SIZE * 2) | 1498 | if (size < BUF_PAGE_SIZE * 2) |
1309 | size = BUF_PAGE_SIZE * 2; | 1499 | size = BUF_PAGE_SIZE * 2; |
1310 | 1500 | ||
1311 | if (size == buffer_size) | 1501 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1312 | return size; | ||
1313 | |||
1314 | atomic_inc(&buffer->record_disabled); | ||
1315 | 1502 | ||
1316 | /* Make sure all writers are done with this buffer. */ | 1503 | /* |
1317 | synchronize_sched(); | 1504 | * Don't succeed if resizing is disabled, as a reader might be |
1505 | * manipulating the ring buffer and is expecting a sane state while | ||
1506 | * this is true. | ||
1507 | */ | ||
1508 | if (atomic_read(&buffer->resize_disabled)) | ||
1509 | return -EBUSY; | ||
1318 | 1510 | ||
1511 | /* prevent another thread from changing buffer sizes */ | ||
1319 | mutex_lock(&buffer->mutex); | 1512 | mutex_lock(&buffer->mutex); |
1320 | get_online_cpus(); | ||
1321 | |||
1322 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | ||
1323 | 1513 | ||
1324 | if (size < buffer_size) { | 1514 | if (cpu_id == RING_BUFFER_ALL_CPUS) { |
1515 | /* calculate the pages to update */ | ||
1516 | for_each_buffer_cpu(buffer, cpu) { | ||
1517 | cpu_buffer = buffer->buffers[cpu]; | ||
1325 | 1518 | ||
1326 | /* easy case, just free pages */ | 1519 | cpu_buffer->nr_pages_to_update = nr_pages - |
1327 | if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) | 1520 | cpu_buffer->nr_pages; |
1328 | goto out_fail; | 1521 | /* |
1522 | * nothing more to do for removing pages or no update | ||
1523 | */ | ||
1524 | if (cpu_buffer->nr_pages_to_update <= 0) | ||
1525 | continue; | ||
1526 | /* | ||
1527 | * to add pages, make sure all new pages can be | ||
1528 | * allocated without receiving ENOMEM | ||
1529 | */ | ||
1530 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1531 | if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, | ||
1532 | &cpu_buffer->new_pages, cpu)) { | ||
1533 | /* not enough memory for new pages */ | ||
1534 | err = -ENOMEM; | ||
1535 | goto out_err; | ||
1536 | } | ||
1537 | } | ||
1329 | 1538 | ||
1330 | rm_pages = buffer->pages - nr_pages; | 1539 | get_online_cpus(); |
1540 | /* | ||
1541 | * Fire off all the required work handlers | ||
1542 | * We can't schedule on offline CPUs, but it's not necessary | ||
1543 | * since we can change their buffer sizes without any race. | ||
1544 | */ | ||
1545 | for_each_buffer_cpu(buffer, cpu) { | ||
1546 | cpu_buffer = buffer->buffers[cpu]; | ||
1547 | if (!cpu_buffer->nr_pages_to_update) | ||
1548 | continue; | ||
1549 | |||
1550 | if (cpu_online(cpu)) | ||
1551 | schedule_work_on(cpu, | ||
1552 | &cpu_buffer->update_pages_work); | ||
1553 | else | ||
1554 | rb_update_pages(cpu_buffer); | ||
1555 | } | ||
1331 | 1556 | ||
1557 | /* wait for all the updates to complete */ | ||
1332 | for_each_buffer_cpu(buffer, cpu) { | 1558 | for_each_buffer_cpu(buffer, cpu) { |
1333 | cpu_buffer = buffer->buffers[cpu]; | 1559 | cpu_buffer = buffer->buffers[cpu]; |
1334 | rb_remove_pages(cpu_buffer, rm_pages); | 1560 | if (!cpu_buffer->nr_pages_to_update) |
1561 | continue; | ||
1562 | |||
1563 | if (cpu_online(cpu)) | ||
1564 | wait_for_completion(&cpu_buffer->update_done); | ||
1565 | cpu_buffer->nr_pages_to_update = 0; | ||
1335 | } | 1566 | } |
1336 | goto out; | ||
1337 | } | ||
1338 | 1567 | ||
1339 | /* | 1568 | put_online_cpus(); |
1340 | * This is a bit more difficult. We only want to add pages | 1569 | } else { |
1341 | * when we can allocate enough for all CPUs. We do this | 1570 | cpu_buffer = buffer->buffers[cpu_id]; |
1342 | * by allocating all the pages and storing them on a local | ||
1343 | * link list. If we succeed in our allocation, then we | ||
1344 | * add these pages to the cpu_buffers. Otherwise we just free | ||
1345 | * them all and return -ENOMEM; | ||
1346 | */ | ||
1347 | if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) | ||
1348 | goto out_fail; | ||
1349 | 1571 | ||
1350 | new_pages = nr_pages - buffer->pages; | 1572 | if (nr_pages == cpu_buffer->nr_pages) |
1573 | goto out; | ||
1351 | 1574 | ||
1352 | for_each_buffer_cpu(buffer, cpu) { | 1575 | cpu_buffer->nr_pages_to_update = nr_pages - |
1353 | for (i = 0; i < new_pages; i++) { | 1576 | cpu_buffer->nr_pages; |
1354 | struct page *page; | 1577 | |
1355 | /* | 1578 | INIT_LIST_HEAD(&cpu_buffer->new_pages); |
1356 | * __GFP_NORETRY flag makes sure that the allocation | 1579 | if (cpu_buffer->nr_pages_to_update > 0 && |
1357 | * fails gracefully without invoking oom-killer and | 1580 | __rb_allocate_pages(cpu_buffer->nr_pages_to_update, |
1358 | * the system is not destabilized. | 1581 | &cpu_buffer->new_pages, cpu_id)) { |
1359 | */ | 1582 | err = -ENOMEM; |
1360 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), | 1583 | goto out_err; |
1361 | cache_line_size()), | ||
1362 | GFP_KERNEL | __GFP_NORETRY, | ||
1363 | cpu_to_node(cpu)); | ||
1364 | if (!bpage) | ||
1365 | goto free_pages; | ||
1366 | list_add(&bpage->list, &pages); | ||
1367 | page = alloc_pages_node(cpu_to_node(cpu), | ||
1368 | GFP_KERNEL | __GFP_NORETRY, 0); | ||
1369 | if (!page) | ||
1370 | goto free_pages; | ||
1371 | bpage->page = page_address(page); | ||
1372 | rb_init_page(bpage->page); | ||
1373 | } | 1584 | } |
1374 | } | ||
1375 | 1585 | ||
1376 | for_each_buffer_cpu(buffer, cpu) { | 1586 | get_online_cpus(); |
1377 | cpu_buffer = buffer->buffers[cpu]; | ||
1378 | rb_insert_pages(cpu_buffer, &pages, new_pages); | ||
1379 | } | ||
1380 | 1587 | ||
1381 | if (RB_WARN_ON(buffer, !list_empty(&pages))) | 1588 | if (cpu_online(cpu_id)) { |
1382 | goto out_fail; | 1589 | schedule_work_on(cpu_id, |
1590 | &cpu_buffer->update_pages_work); | ||
1591 | wait_for_completion(&cpu_buffer->update_done); | ||
1592 | } else | ||
1593 | rb_update_pages(cpu_buffer); | ||
1594 | |||
1595 | cpu_buffer->nr_pages_to_update = 0; | ||
1596 | put_online_cpus(); | ||
1597 | } | ||
1383 | 1598 | ||
1384 | out: | 1599 | out: |
1385 | buffer->pages = nr_pages; | 1600 | /* |
1386 | put_online_cpus(); | 1601 | * The ring buffer resize can happen with the ring buffer |
1602 | * enabled, so that the update disturbs the tracing as little | ||
1603 | * as possible. But if the buffer is disabled, we do not need | ||
1604 | * to worry about that, and we can take the time to verify | ||
1605 | * that the buffer is not corrupt. | ||
1606 | */ | ||
1607 | if (atomic_read(&buffer->record_disabled)) { | ||
1608 | atomic_inc(&buffer->record_disabled); | ||
1609 | /* | ||
1610 | * Even though the buffer was disabled, we must make sure | ||
1611 | * that it is truly disabled before calling rb_check_pages. | ||
1612 | * There could have been a race between checking | ||
1613 | * record_disable and incrementing it. | ||
1614 | */ | ||
1615 | synchronize_sched(); | ||
1616 | for_each_buffer_cpu(buffer, cpu) { | ||
1617 | cpu_buffer = buffer->buffers[cpu]; | ||
1618 | rb_check_pages(cpu_buffer); | ||
1619 | } | ||
1620 | atomic_dec(&buffer->record_disabled); | ||
1621 | } | ||
1622 | |||
1387 | mutex_unlock(&buffer->mutex); | 1623 | mutex_unlock(&buffer->mutex); |
1624 | return size; | ||
1388 | 1625 | ||
1389 | atomic_dec(&buffer->record_disabled); | 1626 | out_err: |
1627 | for_each_buffer_cpu(buffer, cpu) { | ||
1628 | struct buffer_page *bpage, *tmp; | ||
1390 | 1629 | ||
1391 | return size; | 1630 | cpu_buffer = buffer->buffers[cpu]; |
1631 | cpu_buffer->nr_pages_to_update = 0; | ||
1392 | 1632 | ||
1393 | free_pages: | 1633 | if (list_empty(&cpu_buffer->new_pages)) |
1394 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | 1634 | continue; |
1395 | list_del_init(&bpage->list); | ||
1396 | free_buffer_page(bpage); | ||
1397 | } | ||
1398 | put_online_cpus(); | ||
1399 | mutex_unlock(&buffer->mutex); | ||
1400 | atomic_dec(&buffer->record_disabled); | ||
1401 | return -ENOMEM; | ||
1402 | 1635 | ||
1403 | /* | 1636 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, |
1404 | * Something went totally wrong, and we are too paranoid | 1637 | list) { |
1405 | * to even clean up the mess. | 1638 | list_del_init(&bpage->list); |
1406 | */ | 1639 | free_buffer_page(bpage); |
1407 | out_fail: | 1640 | } |
1408 | put_online_cpus(); | 1641 | } |
1409 | mutex_unlock(&buffer->mutex); | 1642 | mutex_unlock(&buffer->mutex); |
1410 | atomic_dec(&buffer->record_disabled); | 1643 | return err; |
1411 | return -1; | ||
1412 | } | 1644 | } |
1413 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1645 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1414 | 1646 | ||
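The rewritten ring_buffer_resize() above works in two phases: compute each CPU's nr_pages_to_update and pre-allocate everything that can fail, then kick a per-CPU work item (or update directly for offline CPUs) and wait for every completion before releasing the mutex. A standalone model of that two-phase, per-CPU flow, using one thread per buffer purely as a stand-in for schedule_work_on()/wait_for_completion(); all names here are made up (build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct cpu_buf {
        int nr_pages;
        int nr_pages_to_update;         /* > 0 add, < 0 remove, 0 nothing */
        void **new_pages;               /* pre-allocated pages for the add case */
        pthread_t worker;
};

static void *update_pages_worker(void *arg)
{
        struct cpu_buf *b = arg;

        /* The real handler splices b->new_pages into the ring (or unlinks
         * pages when the delta is negative); the model only keeps a count. */
        free(b->new_pages);
        b->new_pages = NULL;
        b->nr_pages += b->nr_pages_to_update;
        return NULL;
}

static int resize_all(struct cpu_buf *bufs, int new_nr_pages)
{
        int cpu;

        /* Phase 1: every allocation that can fail happens up front, so a
         * failure here leaves all buffers exactly as they were. */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                struct cpu_buf *b = &bufs[cpu];

                b->nr_pages_to_update = new_nr_pages - b->nr_pages;
                if (b->nr_pages_to_update <= 0)
                        continue;
                b->new_pages = calloc(b->nr_pages_to_update, sizeof(void *));
                if (!b->new_pages)
                        return -1;      /* caller would free what was allocated */
        }

        /* Phase 2: fire off one update per buffer, then wait for them all. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (bufs[cpu].nr_pages_to_update)
                        pthread_create(&bufs[cpu].worker, NULL,
                                       update_pages_worker, &bufs[cpu]);

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!bufs[cpu].nr_pages_to_update)
                        continue;
                pthread_join(bufs[cpu].worker, NULL);
                bufs[cpu].nr_pages_to_update = 0;
        }
        return 0;
}

int main(void)
{
        struct cpu_buf bufs[NR_CPUS] = {
                { .nr_pages = 2 }, { .nr_pages = 2 },
                { .nr_pages = 8 }, { .nr_pages = 2 },
        };
        int cpu;

        if (resize_all(bufs, 4))
                return 1;
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d: %d pages\n", cpu, bufs[cpu].nr_pages);
        return 0;
}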
@@ -1447,21 +1679,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) | |||
1447 | return __rb_page_index(iter->head_page, iter->head); | 1679 | return __rb_page_index(iter->head_page, iter->head); |
1448 | } | 1680 | } |
1449 | 1681 | ||
1450 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1451 | { | ||
1452 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1453 | } | ||
1454 | |||
1455 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | 1682 | static inline unsigned rb_page_commit(struct buffer_page *bpage) |
1456 | { | 1683 | { |
1457 | return local_read(&bpage->page->commit); | 1684 | return local_read(&bpage->page->commit); |
1458 | } | 1685 | } |
1459 | 1686 | ||
1460 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) | ||
1461 | { | ||
1462 | return local_read(&bpage->entries) & RB_WRITE_MASK; | ||
1463 | } | ||
1464 | |||
1465 | /* Size is determined by what has been committed */ | 1687 | /* Size is determined by what has been committed */ |
1466 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1688 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1467 | { | 1689 | { |
@@ -1510,7 +1732,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | |||
1510 | * assign the commit to the tail. | 1732 | * assign the commit to the tail. |
1511 | */ | 1733 | */ |
1512 | again: | 1734 | again: |
1513 | max_count = cpu_buffer->buffer->pages * 100; | 1735 | max_count = cpu_buffer->nr_pages * 100; |
1514 | 1736 | ||
1515 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | 1737 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { |
1516 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) | 1738 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) |
@@ -3486,6 +3708,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) | |||
3486 | 3708 | ||
3487 | iter->cpu_buffer = cpu_buffer; | 3709 | iter->cpu_buffer = cpu_buffer; |
3488 | 3710 | ||
3711 | atomic_inc(&buffer->resize_disabled); | ||
3489 | atomic_inc(&cpu_buffer->record_disabled); | 3712 | atomic_inc(&cpu_buffer->record_disabled); |
3490 | 3713 | ||
3491 | return iter; | 3714 | return iter; |
@@ -3548,7 +3771,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) | |||
3548 | { | 3771 | { |
3549 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3772 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3550 | 3773 | ||
3774 | /* | ||
3775 | * Ring buffer is disabled from recording, here's a good place | ||
3776 | * to check the integrity of the ring buffer. | ||
3777 | */ | ||
3778 | rb_check_pages(cpu_buffer); | ||
3779 | |||
3551 | atomic_dec(&cpu_buffer->record_disabled); | 3780 | atomic_dec(&cpu_buffer->record_disabled); |
3781 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | ||
3552 | kfree(iter); | 3782 | kfree(iter); |
3553 | } | 3783 | } |
3554 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); | 3784 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); |
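ring_buffer_read_prepare() and ring_buffer_read_finish() now bracket an iterator's lifetime with buffer->resize_disabled, and ring_buffer_resize() bails out with -EBUSY while that count is non-zero. A minimal model of the gate (illustrative names; as in the kernel, the check happens before the resize work starts, so it is a coarse guard rather than a lock):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int resize_disabled;

static void read_prepare(void)
{
        atomic_fetch_add(&resize_disabled, 1);
}

static void read_finish(void)
{
        atomic_fetch_sub(&resize_disabled, 1);
}

static int resize(int nr_pages)
{
        (void)nr_pages;                 /* actual resizing elided */

        if (atomic_load(&resize_disabled))
                return -EBUSY;          /* a reader iterator is live */
        /* ... perform the resize under the buffer mutex ... */
        return 0;
}

int main(void)
{
        read_prepare();
        printf("resize while reading: %d\n", resize(8));        /* -EBUSY */
        read_finish();
        printf("resize after finish:  %d\n", resize(8));        /* 0 */
        return 0;
}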
@@ -3588,9 +3818,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); | |||
3588 | * ring_buffer_size - return the size of the ring buffer (in bytes) | 3818 | * ring_buffer_size - return the size of the ring buffer (in bytes) |
3589 | * @buffer: The ring buffer. | 3819 | * @buffer: The ring buffer. |
3590 | */ | 3820 | */ |
3591 | unsigned long ring_buffer_size(struct ring_buffer *buffer) | 3821 | unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) |
3592 | { | 3822 | { |
3593 | return BUF_PAGE_SIZE * buffer->pages; | 3823 | /* |
3824 | * Earlier, this method returned | ||
3825 | * BUF_PAGE_SIZE * buffer->pages | ||
3826 | * Since that per-buffer field is now removed, we have converted this to | ||
3827 | * return the per cpu buffer value. | ||
3828 | */ | ||
3829 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3830 | return 0; | ||
3831 | |||
3832 | return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; | ||
3594 | } | 3833 | } |
3595 | EXPORT_SYMBOL_GPL(ring_buffer_size); | 3834 | EXPORT_SYMBOL_GPL(ring_buffer_size); |
3596 | 3835 | ||
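Since ring_buffer_size() is now per-CPU, a caller that still wants the old whole-buffer figure has to sum over CPUs. A fragment showing the new calling convention (the helper name is made up and this is not meant to build on its own):

/* Fragment only: shows the per-CPU calling convention of the changed
 * ring_buffer_size(); total_ring_buffer_size() is a made-up helper. */
static unsigned long total_ring_buffer_size(struct ring_buffer *buffer)
{
        unsigned long total = 0;
        int cpu;

        for_each_online_cpu(cpu)
                total += ring_buffer_size(buffer, cpu);

        return total;
}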
@@ -3611,6 +3850,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3611 | cpu_buffer->commit_page = cpu_buffer->head_page; | 3850 | cpu_buffer->commit_page = cpu_buffer->head_page; |
3612 | 3851 | ||
3613 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 3852 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
3853 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
3614 | local_set(&cpu_buffer->reader_page->write, 0); | 3854 | local_set(&cpu_buffer->reader_page->write, 0); |
3615 | local_set(&cpu_buffer->reader_page->entries, 0); | 3855 | local_set(&cpu_buffer->reader_page->entries, 0); |
3616 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3856 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
@@ -3647,8 +3887,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3647 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 3887 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
3648 | return; | 3888 | return; |
3649 | 3889 | ||
3890 | atomic_inc(&buffer->resize_disabled); | ||
3650 | atomic_inc(&cpu_buffer->record_disabled); | 3891 | atomic_inc(&cpu_buffer->record_disabled); |
3651 | 3892 | ||
3893 | /* Make sure all commits have finished */ | ||
3894 | synchronize_sched(); | ||
3895 | |||
3652 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3896 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3653 | 3897 | ||
3654 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3898 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
@@ -3664,6 +3908,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3664 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3908 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3665 | 3909 | ||
3666 | atomic_dec(&cpu_buffer->record_disabled); | 3910 | atomic_dec(&cpu_buffer->record_disabled); |
3911 | atomic_dec(&buffer->resize_disabled); | ||
3667 | } | 3912 | } |
3668 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); | 3913 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); |
3669 | 3914 | ||
@@ -3765,8 +4010,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3765 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) | 4010 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) |
3766 | goto out; | 4011 | goto out; |
3767 | 4012 | ||
4013 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
4014 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
4015 | |||
3768 | /* At least make sure the two buffers are somewhat the same */ | 4016 | /* At least make sure the two buffers are somewhat the same */ |
3769 | if (buffer_a->pages != buffer_b->pages) | 4017 | if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) |
3770 | goto out; | 4018 | goto out; |
3771 | 4019 | ||
3772 | ret = -EAGAIN; | 4020 | ret = -EAGAIN; |
@@ -3780,9 +4028,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3780 | if (atomic_read(&buffer_b->record_disabled)) | 4028 | if (atomic_read(&buffer_b->record_disabled)) |
3781 | goto out; | 4029 | goto out; |
3782 | 4030 | ||
3783 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
3784 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
3785 | |||
3786 | if (atomic_read(&cpu_buffer_a->record_disabled)) | 4031 | if (atomic_read(&cpu_buffer_a->record_disabled)) |
3787 | goto out; | 4032 | goto out; |
3788 | 4033 | ||
@@ -4071,6 +4316,8 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4071 | struct ring_buffer *buffer = | 4316 | struct ring_buffer *buffer = |
4072 | container_of(self, struct ring_buffer, cpu_notify); | 4317 | container_of(self, struct ring_buffer, cpu_notify); |
4073 | long cpu = (long)hcpu; | 4318 | long cpu = (long)hcpu; |
4319 | int cpu_i, nr_pages_same; | ||
4320 | unsigned int nr_pages; | ||
4074 | 4321 | ||
4075 | switch (action) { | 4322 | switch (action) { |
4076 | case CPU_UP_PREPARE: | 4323 | case CPU_UP_PREPARE: |
@@ -4078,8 +4325,23 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4078 | if (cpumask_test_cpu(cpu, buffer->cpumask)) | 4325 | if (cpumask_test_cpu(cpu, buffer->cpumask)) |
4079 | return NOTIFY_OK; | 4326 | return NOTIFY_OK; |
4080 | 4327 | ||
4328 | nr_pages = 0; | ||
4329 | nr_pages_same = 1; | ||
4330 | /* check if all cpu sizes are same */ | ||
4331 | for_each_buffer_cpu(buffer, cpu_i) { | ||
4332 | /* fill in the size from first enabled cpu */ | ||
4333 | if (nr_pages == 0) | ||
4334 | nr_pages = buffer->buffers[cpu_i]->nr_pages; | ||
4335 | if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { | ||
4336 | nr_pages_same = 0; | ||
4337 | break; | ||
4338 | } | ||
4339 | } | ||
4340 | /* allocate minimum pages, user can later expand it */ | ||
4341 | if (!nr_pages_same) | ||
4342 | nr_pages = 2; | ||
4081 | buffer->buffers[cpu] = | 4343 | buffer->buffers[cpu] = |
4082 | rb_allocate_cpu_buffer(buffer, cpu); | 4344 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
4083 | if (!buffer->buffers[cpu]) { | 4345 | if (!buffer->buffers[cpu]) { |
4084 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", | 4346 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", |
4085 | cpu); | 4347 | cpu); |
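The CPU_UP_PREPARE branch above sizes a hotplugged CPU's buffer from its peers: if every existing per-CPU buffer has the same nr_pages the new one matches it, otherwise it falls back to the two-page minimum and can be resized later. A standalone model of that decision:

#include <stdio.h>

static int pick_nr_pages(const int *nr_pages_per_cpu, int nr_cpus)
{
        int nr_pages = 0;
        int cpu;

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                /* fill in the size from the first buffer we look at */
                if (nr_pages == 0)
                        nr_pages = nr_pages_per_cpu[cpu];
                /* any disagreement: fall back to the 2-page minimum */
                if (nr_pages != nr_pages_per_cpu[cpu])
                        return 2;
        }
        return nr_pages;
}

int main(void)
{
        int same[]  = { 8, 8, 8 };
        int mixed[] = { 8, 4, 8 };

        printf("%d\n", pick_nr_pages(same, 3));         /* 8 */
        printf("%d\n", pick_nr_pages(mixed, 3));        /* 2 */
        return 0;
}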
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a22255c1010..a7fa0702be1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -87,18 +87,6 @@ static int tracing_disabled = 1; | |||
87 | 87 | ||
88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); | 88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); |
89 | 89 | ||
90 | static inline void ftrace_disable_cpu(void) | ||
91 | { | ||
92 | preempt_disable(); | ||
93 | __this_cpu_inc(ftrace_cpu_disabled); | ||
94 | } | ||
95 | |||
96 | static inline void ftrace_enable_cpu(void) | ||
97 | { | ||
98 | __this_cpu_dec(ftrace_cpu_disabled); | ||
99 | preempt_enable(); | ||
100 | } | ||
101 | |||
102 | cpumask_var_t __read_mostly tracing_buffer_mask; | 90 | cpumask_var_t __read_mostly tracing_buffer_mask; |
103 | 91 | ||
104 | /* | 92 | /* |
@@ -383,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
383 | void tracing_off(void) | 371 | void tracing_off(void) |
384 | { | 372 | { |
385 | if (global_trace.buffer) | 373 | if (global_trace.buffer) |
386 | ring_buffer_record_on(global_trace.buffer); | 374 | ring_buffer_record_off(global_trace.buffer); |
387 | /* | 375 | /* |
388 | * This flag is only looked at when buffers haven't been | 376 | * This flag is only looked at when buffers haven't been |
389 | * allocated yet. We don't really care about the race | 377 | * allocated yet. We don't really care about the race |
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) | |||
629 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 617 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
630 | { | 618 | { |
631 | int len; | 619 | int len; |
632 | void *ret; | ||
633 | 620 | ||
634 | if (s->len <= s->readpos) | 621 | if (s->len <= s->readpos) |
635 | return -EBUSY; | 622 | return -EBUSY; |
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
637 | len = s->len - s->readpos; | 624 | len = s->len - s->readpos; |
638 | if (cnt > len) | 625 | if (cnt > len) |
639 | cnt = len; | 626 | cnt = len; |
640 | ret = memcpy(buf, s->buffer + s->readpos, cnt); | 627 | memcpy(buf, s->buffer + s->readpos, cnt); |
641 | if (!ret) | ||
642 | return -EFAULT; | ||
643 | 628 | ||
644 | s->readpos += cnt; | 629 | s->readpos += cnt; |
645 | return cnt; | 630 | return cnt; |
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
751 | 736 | ||
752 | arch_spin_lock(&ftrace_max_lock); | 737 | arch_spin_lock(&ftrace_max_lock); |
753 | 738 | ||
754 | ftrace_disable_cpu(); | ||
755 | |||
756 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); | 739 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); |
757 | 740 | ||
758 | if (ret == -EBUSY) { | 741 | if (ret == -EBUSY) { |
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
766 | "Failed to swap buffers due to commit in progress\n"); | 749 | "Failed to swap buffers due to commit in progress\n"); |
767 | } | 750 | } |
768 | 751 | ||
769 | ftrace_enable_cpu(); | ||
770 | |||
771 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); | 752 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); |
772 | 753 | ||
773 | __update_max_tr(tr, tsk, cpu); | 754 | __update_max_tr(tr, tsk, cpu); |
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
782 | * Register a new plugin tracer. | 763 | * Register a new plugin tracer. |
783 | */ | 764 | */ |
784 | int register_tracer(struct tracer *type) | 765 | int register_tracer(struct tracer *type) |
785 | __releases(kernel_lock) | ||
786 | __acquires(kernel_lock) | ||
787 | { | 766 | { |
788 | struct tracer *t; | 767 | struct tracer *t; |
789 | int ret = 0; | 768 | int ret = 0; |
@@ -841,7 +820,8 @@ __acquires(kernel_lock) | |||
841 | 820 | ||
842 | /* If we expanded the buffers, make sure the max is expanded too */ | 821 | /* If we expanded the buffers, make sure the max is expanded too */ |
843 | if (ring_buffer_expanded && type->use_max_tr) | 822 | if (ring_buffer_expanded && type->use_max_tr) |
844 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | 823 | ring_buffer_resize(max_tr.buffer, trace_buf_size, |
824 | RING_BUFFER_ALL_CPUS); | ||
845 | 825 | ||
846 | /* the test is responsible for initializing and enabling */ | 826 | /* the test is responsible for initializing and enabling */ |
847 | pr_info("Testing tracer %s: ", type->name); | 827 | pr_info("Testing tracer %s: ", type->name); |
@@ -857,7 +837,8 @@ __acquires(kernel_lock) | |||
857 | 837 | ||
858 | /* Shrink the max buffer again */ | 838 | /* Shrink the max buffer again */ |
859 | if (ring_buffer_expanded && type->use_max_tr) | 839 | if (ring_buffer_expanded && type->use_max_tr) |
860 | ring_buffer_resize(max_tr.buffer, 1); | 840 | ring_buffer_resize(max_tr.buffer, 1, |
841 | RING_BUFFER_ALL_CPUS); | ||
861 | 842 | ||
862 | printk(KERN_CONT "PASSED\n"); | 843 | printk(KERN_CONT "PASSED\n"); |
863 | } | 844 | } |
@@ -917,13 +898,6 @@ out: | |||
917 | mutex_unlock(&trace_types_lock); | 898 | mutex_unlock(&trace_types_lock); |
918 | } | 899 | } |
919 | 900 | ||
920 | static void __tracing_reset(struct ring_buffer *buffer, int cpu) | ||
921 | { | ||
922 | ftrace_disable_cpu(); | ||
923 | ring_buffer_reset_cpu(buffer, cpu); | ||
924 | ftrace_enable_cpu(); | ||
925 | } | ||
926 | |||
927 | void tracing_reset(struct trace_array *tr, int cpu) | 901 | void tracing_reset(struct trace_array *tr, int cpu) |
928 | { | 902 | { |
929 | struct ring_buffer *buffer = tr->buffer; | 903 | struct ring_buffer *buffer = tr->buffer; |
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
932 | 906 | ||
933 | /* Make sure all commits have finished */ | 907 | /* Make sure all commits have finished */ |
934 | synchronize_sched(); | 908 | synchronize_sched(); |
935 | __tracing_reset(buffer, cpu); | 909 | ring_buffer_reset_cpu(buffer, cpu); |
936 | 910 | ||
937 | ring_buffer_record_enable(buffer); | 911 | ring_buffer_record_enable(buffer); |
938 | } | 912 | } |
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
950 | tr->time_start = ftrace_now(tr->cpu); | 924 | tr->time_start = ftrace_now(tr->cpu); |
951 | 925 | ||
952 | for_each_online_cpu(cpu) | 926 | for_each_online_cpu(cpu) |
953 | __tracing_reset(buffer, cpu); | 927 | ring_buffer_reset_cpu(buffer, cpu); |
954 | 928 | ||
955 | ring_buffer_record_enable(buffer); | 929 | ring_buffer_record_enable(buffer); |
956 | } | 930 | } |
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
1498 | 1472 | ||
1499 | #endif /* CONFIG_STACKTRACE */ | 1473 | #endif /* CONFIG_STACKTRACE */ |
1500 | 1474 | ||
1475 | /* created for use with alloc_percpu */ | ||
1476 | struct trace_buffer_struct { | ||
1477 | char buffer[TRACE_BUF_SIZE]; | ||
1478 | }; | ||
1479 | |||
1480 | static struct trace_buffer_struct *trace_percpu_buffer; | ||
1481 | static struct trace_buffer_struct *trace_percpu_sirq_buffer; | ||
1482 | static struct trace_buffer_struct *trace_percpu_irq_buffer; | ||
1483 | static struct trace_buffer_struct *trace_percpu_nmi_buffer; | ||
1484 | |||
1485 | /* | ||
1486 | * The buffer used is dependent on the context. There is a per cpu | ||
1487 | * buffer for normal context, softirq context, hard irq context and | ||
1488 | * for NMI context. This allows for lockless recording. | ||
1489 | * | ||
1490 | * Note, if the buffers failed to be allocated, then this returns NULL | ||
1491 | */ | ||
1492 | static char *get_trace_buf(void) | ||
1493 | { | ||
1494 | struct trace_buffer_struct *percpu_buffer; | ||
1495 | struct trace_buffer_struct *buffer; | ||
1496 | |||
1497 | /* | ||
1498 | * If we have allocated per cpu buffers, then we do not | ||
1499 | * need to do any locking. | ||
1500 | */ | ||
1501 | if (in_nmi()) | ||
1502 | percpu_buffer = trace_percpu_nmi_buffer; | ||
1503 | else if (in_irq()) | ||
1504 | percpu_buffer = trace_percpu_irq_buffer; | ||
1505 | else if (in_softirq()) | ||
1506 | percpu_buffer = trace_percpu_sirq_buffer; | ||
1507 | else | ||
1508 | percpu_buffer = trace_percpu_buffer; | ||
1509 | |||
1510 | if (!percpu_buffer) | ||
1511 | return NULL; | ||
1512 | |||
1513 | buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); | ||
1514 | |||
1515 | return buffer->buffer; | ||
1516 | } | ||
1517 | |||
1518 | static int alloc_percpu_trace_buffer(void) | ||
1519 | { | ||
1520 | struct trace_buffer_struct *buffers; | ||
1521 | struct trace_buffer_struct *sirq_buffers; | ||
1522 | struct trace_buffer_struct *irq_buffers; | ||
1523 | struct trace_buffer_struct *nmi_buffers; | ||
1524 | |||
1525 | buffers = alloc_percpu(struct trace_buffer_struct); | ||
1526 | if (!buffers) | ||
1527 | goto err_warn; | ||
1528 | |||
1529 | sirq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1530 | if (!sirq_buffers) | ||
1531 | goto err_sirq; | ||
1532 | |||
1533 | irq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1534 | if (!irq_buffers) | ||
1535 | goto err_irq; | ||
1536 | |||
1537 | nmi_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1538 | if (!nmi_buffers) | ||
1539 | goto err_nmi; | ||
1540 | |||
1541 | trace_percpu_buffer = buffers; | ||
1542 | trace_percpu_sirq_buffer = sirq_buffers; | ||
1543 | trace_percpu_irq_buffer = irq_buffers; | ||
1544 | trace_percpu_nmi_buffer = nmi_buffers; | ||
1545 | |||
1546 | return 0; | ||
1547 | |||
1548 | err_nmi: | ||
1549 | free_percpu(irq_buffers); | ||
1550 | err_irq: | ||
1551 | free_percpu(sirq_buffers); | ||
1552 | err_sirq: | ||
1553 | free_percpu(buffers); | ||
1554 | err_warn: | ||
1555 | WARN(1, "Could not allocate percpu trace_printk buffer"); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | |||
1559 | void trace_printk_init_buffers(void) | ||
1560 | { | ||
1561 | static int buffers_allocated; | ||
1562 | |||
1563 | if (buffers_allocated) | ||
1564 | return; | ||
1565 | |||
1566 | if (alloc_percpu_trace_buffer()) | ||
1567 | return; | ||
1568 | |||
1569 | pr_info("ftrace: Allocated trace_printk buffers\n"); | ||
1570 | |||
1571 | buffers_allocated = 1; | ||
1572 | } | ||
1573 | |||
1501 | /** | 1574 | /** |
1502 | * trace_vbprintk - write binary msg to tracing buffer | 1575 | * trace_vbprintk - write binary msg to tracing buffer |
1503 | * | 1576 | * |
1504 | */ | 1577 | */ |
1505 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | 1578 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) |
1506 | { | 1579 | { |
1507 | static arch_spinlock_t trace_buf_lock = | ||
1508 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
1509 | static u32 trace_buf[TRACE_BUF_SIZE]; | ||
1510 | |||
1511 | struct ftrace_event_call *call = &event_bprint; | 1580 | struct ftrace_event_call *call = &event_bprint; |
1512 | struct ring_buffer_event *event; | 1581 | struct ring_buffer_event *event; |
1513 | struct ring_buffer *buffer; | 1582 | struct ring_buffer *buffer; |
1514 | struct trace_array *tr = &global_trace; | 1583 | struct trace_array *tr = &global_trace; |
1515 | struct trace_array_cpu *data; | ||
1516 | struct bprint_entry *entry; | 1584 | struct bprint_entry *entry; |
1517 | unsigned long flags; | 1585 | unsigned long flags; |
1518 | int disable; | 1586 | char *tbuffer; |
1519 | int cpu, len = 0, size, pc; | 1587 | int len = 0, size, pc; |
1520 | 1588 | ||
1521 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1589 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
1522 | return 0; | 1590 | return 0; |
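get_trace_buf() above replaces the shared static buffer and trace_buf_lock with one percpu scratch buffer per context level (normal, softirq, irq, NMI), so a context that interrupts another never clobbers the half-written buffer underneath it and no lock or irq-disable is needed. A standalone model of that layout (context is passed in explicitly here; the kernel derives it from in_nmi()/in_irq()/in_softirq()):

#include <stdio.h>
#include <string.h>

#define NR_CPUS  2
#define BUF_SIZE 128

enum trace_ctx { CTX_NORMAL, CTX_SOFTIRQ, CTX_IRQ, CTX_NMI, NR_CTX };

static char trace_buf[NR_CTX][NR_CPUS][BUF_SIZE];

static char *get_trace_buf(enum trace_ctx ctx, int cpu)
{
        /* No locking: each (context, cpu) pair owns its own buffer. */
        return trace_buf[ctx][cpu];
}

int main(void)
{
        int cpu = 0;
        char *buf, *irq_buf;

        /* A normal-context message is being formatted... */
        buf = get_trace_buf(CTX_NORMAL, cpu);
        strcpy(buf, "half-written normal-context message");

        /* ...and an interrupt firing on the same cpu formats into its own
         * buffer, so the one below it is left intact. */
        irq_buf = get_trace_buf(CTX_IRQ, cpu);
        snprintf(irq_buf, BUF_SIZE, "irq-context message");

        printf("%s\n", buf);
        printf("%s\n", irq_buf);
        return 0;
}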
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1526 | 1594 | ||
1527 | pc = preempt_count(); | 1595 | pc = preempt_count(); |
1528 | preempt_disable_notrace(); | 1596 | preempt_disable_notrace(); |
1529 | cpu = raw_smp_processor_id(); | ||
1530 | data = tr->data[cpu]; | ||
1531 | 1597 | ||
1532 | disable = atomic_inc_return(&data->disabled); | 1598 | tbuffer = get_trace_buf(); |
1533 | if (unlikely(disable != 1)) | 1599 | if (!tbuffer) { |
1600 | len = 0; | ||
1534 | goto out; | 1601 | goto out; |
1602 | } | ||
1535 | 1603 | ||
1536 | /* Lockdep uses trace_printk for lock tracing */ | 1604 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); |
1537 | local_irq_save(flags); | ||
1538 | arch_spin_lock(&trace_buf_lock); | ||
1539 | len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1540 | 1605 | ||
1541 | if (len > TRACE_BUF_SIZE || len < 0) | 1606 | if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) |
1542 | goto out_unlock; | 1607 | goto out; |
1543 | 1608 | ||
1609 | local_save_flags(flags); | ||
1544 | size = sizeof(*entry) + sizeof(u32) * len; | 1610 | size = sizeof(*entry) + sizeof(u32) * len; |
1545 | buffer = tr->buffer; | 1611 | buffer = tr->buffer; |
1546 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, | 1612 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, |
1547 | flags, pc); | 1613 | flags, pc); |
1548 | if (!event) | 1614 | if (!event) |
1549 | goto out_unlock; | 1615 | goto out; |
1550 | entry = ring_buffer_event_data(event); | 1616 | entry = ring_buffer_event_data(event); |
1551 | entry->ip = ip; | 1617 | entry->ip = ip; |
1552 | entry->fmt = fmt; | 1618 | entry->fmt = fmt; |
1553 | 1619 | ||
1554 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1620 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1555 | if (!filter_check_discard(call, entry, buffer, event)) { | 1621 | if (!filter_check_discard(call, entry, buffer, event)) { |
1556 | ring_buffer_unlock_commit(buffer, event); | 1622 | ring_buffer_unlock_commit(buffer, event); |
1557 | ftrace_trace_stack(buffer, flags, 6, pc); | 1623 | ftrace_trace_stack(buffer, flags, 6, pc); |
1558 | } | 1624 | } |
1559 | 1625 | ||
1560 | out_unlock: | ||
1561 | arch_spin_unlock(&trace_buf_lock); | ||
1562 | local_irq_restore(flags); | ||
1563 | |||
1564 | out: | 1626 | out: |
1565 | atomic_dec_return(&data->disabled); | ||
1566 | preempt_enable_notrace(); | 1627 | preempt_enable_notrace(); |
1567 | unpause_graph_tracing(); | 1628 | unpause_graph_tracing(); |
1568 | 1629 | ||
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr, | |||
1588 | int trace_array_vprintk(struct trace_array *tr, | 1649 | int trace_array_vprintk(struct trace_array *tr, |
1589 | unsigned long ip, const char *fmt, va_list args) | 1650 | unsigned long ip, const char *fmt, va_list args) |
1590 | { | 1651 | { |
1591 | static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; | ||
1592 | static char trace_buf[TRACE_BUF_SIZE]; | ||
1593 | |||
1594 | struct ftrace_event_call *call = &event_print; | 1652 | struct ftrace_event_call *call = &event_print; |
1595 | struct ring_buffer_event *event; | 1653 | struct ring_buffer_event *event; |
1596 | struct ring_buffer *buffer; | 1654 | struct ring_buffer *buffer; |
1597 | struct trace_array_cpu *data; | 1655 | int len = 0, size, pc; |
1598 | int cpu, len = 0, size, pc; | ||
1599 | struct print_entry *entry; | 1656 | struct print_entry *entry; |
1600 | unsigned long irq_flags; | 1657 | unsigned long flags; |
1601 | int disable; | 1658 | char *tbuffer; |
1602 | 1659 | ||
1603 | if (tracing_disabled || tracing_selftest_running) | 1660 | if (tracing_disabled || tracing_selftest_running) |
1604 | return 0; | 1661 | return 0; |
1605 | 1662 | ||
1663 | /* Don't pollute graph traces with trace_vprintk internals */ | ||
1664 | pause_graph_tracing(); | ||
1665 | |||
1606 | pc = preempt_count(); | 1666 | pc = preempt_count(); |
1607 | preempt_disable_notrace(); | 1667 | preempt_disable_notrace(); |
1608 | cpu = raw_smp_processor_id(); | ||
1609 | data = tr->data[cpu]; | ||
1610 | 1668 | ||
1611 | disable = atomic_inc_return(&data->disabled); | 1669 | |
1612 | if (unlikely(disable != 1)) | 1670 | tbuffer = get_trace_buf(); |
1671 | if (!tbuffer) { | ||
1672 | len = 0; | ||
1613 | goto out; | 1673 | goto out; |
1674 | } | ||
1614 | 1675 | ||
1615 | pause_graph_tracing(); | 1676 | len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
1616 | raw_local_irq_save(irq_flags); | 1677 | if (len > TRACE_BUF_SIZE) |
1617 | arch_spin_lock(&trace_buf_lock); | 1678 | goto out; |
1618 | len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1619 | 1679 | ||
1680 | local_save_flags(flags); | ||
1620 | size = sizeof(*entry) + len + 1; | 1681 | size = sizeof(*entry) + len + 1; |
1621 | buffer = tr->buffer; | 1682 | buffer = tr->buffer; |
1622 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 1683 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
1623 | irq_flags, pc); | 1684 | flags, pc); |
1624 | if (!event) | 1685 | if (!event) |
1625 | goto out_unlock; | 1686 | goto out; |
1626 | entry = ring_buffer_event_data(event); | 1687 | entry = ring_buffer_event_data(event); |
1627 | entry->ip = ip; | 1688 | entry->ip = ip; |
1628 | 1689 | ||
1629 | memcpy(&entry->buf, trace_buf, len); | 1690 | memcpy(&entry->buf, tbuffer, len); |
1630 | entry->buf[len] = '\0'; | 1691 | entry->buf[len] = '\0'; |
1631 | if (!filter_check_discard(call, entry, buffer, event)) { | 1692 | if (!filter_check_discard(call, entry, buffer, event)) { |
1632 | ring_buffer_unlock_commit(buffer, event); | 1693 | ring_buffer_unlock_commit(buffer, event); |
1633 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | 1694 | ftrace_trace_stack(buffer, flags, 6, pc); |
1634 | } | 1695 | } |
1635 | |||
1636 | out_unlock: | ||
1637 | arch_spin_unlock(&trace_buf_lock); | ||
1638 | raw_local_irq_restore(irq_flags); | ||
1639 | unpause_graph_tracing(); | ||
1640 | out: | 1696 | out: |
1641 | atomic_dec_return(&data->disabled); | ||
1642 | preempt_enable_notrace(); | 1697 | preempt_enable_notrace(); |
1698 | unpause_graph_tracing(); | ||
1643 | 1699 | ||
1644 | return len; | 1700 | return len; |
1645 | } | 1701 | } |
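The rewritten trace_array_vprintk() above (like the __trace_printk path in the previous hunk) no longer formats into one static buffer under trace_buf_lock with interrupts off; it formats into whatever get_trace_buf() returns and only needs preemption disabled, so the per-cpu data->disabled bookkeeping disappears as well. The helper's body is not part of this section; the sketch below only assumes the usual per-CPU, context-separated layout, and the real buffers are presumably allocated lazily by trace_printk_init_buffers(), which shows up near the end of the trace.c hunks (the callers above also handle a NULL return, which this sketch does not model).

	/*
	 * Sketch only -- one buffer per CPU and per context level, so a
	 * trace_printk() that interrupts another one on the same CPU never
	 * shares a buffer and no lock or IRQ-off section is required.
	 * Caller must have preemption disabled, as the hunks above do.
	 */
	#include <linux/percpu.h>
	#include <linux/hardirq.h>

	#define TRACE_BUF_SIZE	1024	/* sketch value; trace.c has its own */

	struct trace_pk_bufs {
		char data[4][TRACE_BUF_SIZE];	/* task/softirq/hardirq/NMI */
	};
	static DEFINE_PER_CPU(struct trace_pk_bufs, trace_pk_bufs);

	static char *get_trace_buf(void)
	{
		int ctx = 0;			/* task context */

		if (in_nmi())
			ctx = 3;
		else if (in_irq())
			ctx = 2;
		else if (in_serving_softirq())
			ctx = 1;

		return this_cpu_ptr(&trace_pk_bufs)->data[ctx];
	}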
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk); | |||
1652 | 1708 | ||
1653 | static void trace_iterator_increment(struct trace_iterator *iter) | 1709 | static void trace_iterator_increment(struct trace_iterator *iter) |
1654 | { | 1710 | { |
1655 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1656 | ftrace_disable_cpu(); | ||
1657 | |||
1658 | iter->idx++; | 1711 | iter->idx++; |
1659 | if (iter->buffer_iter[iter->cpu]) | 1712 | if (iter->buffer_iter[iter->cpu]) |
1660 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); | 1713 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); |
1661 | |||
1662 | ftrace_enable_cpu(); | ||
1663 | } | 1714 | } |
1664 | 1715 | ||
1665 | static struct trace_entry * | 1716 | static struct trace_entry * |
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1669 | struct ring_buffer_event *event; | 1720 | struct ring_buffer_event *event; |
1670 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; | 1721 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; |
1671 | 1722 | ||
1672 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1673 | ftrace_disable_cpu(); | ||
1674 | |||
1675 | if (buf_iter) | 1723 | if (buf_iter) |
1676 | event = ring_buffer_iter_peek(buf_iter, ts); | 1724 | event = ring_buffer_iter_peek(buf_iter, ts); |
1677 | else | 1725 | else |
1678 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, | 1726 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, |
1679 | lost_events); | 1727 | lost_events); |
1680 | 1728 | ||
1681 | ftrace_enable_cpu(); | ||
1682 | |||
1683 | if (event) { | 1729 | if (event) { |
1684 | iter->ent_size = ring_buffer_event_length(event); | 1730 | iter->ent_size = ring_buffer_event_length(event); |
1685 | return ring_buffer_event_data(event); | 1731 | return ring_buffer_event_data(event); |
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) | |||
1769 | 1815 | ||
1770 | static void trace_consume(struct trace_iterator *iter) | 1816 | static void trace_consume(struct trace_iterator *iter) |
1771 | { | 1817 | { |
1772 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1773 | ftrace_disable_cpu(); | ||
1774 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, | 1818 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, |
1775 | &iter->lost_events); | 1819 | &iter->lost_events); |
1776 | ftrace_enable_cpu(); | ||
1777 | } | 1820 | } |
1778 | 1821 | ||
1779 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) | 1822 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) |
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1862 | iter->cpu = 0; | 1905 | iter->cpu = 0; |
1863 | iter->idx = -1; | 1906 | iter->idx = -1; |
1864 | 1907 | ||
1865 | ftrace_disable_cpu(); | ||
1866 | |||
1867 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | 1908 | if (cpu_file == TRACE_PIPE_ALL_CPU) { |
1868 | for_each_tracing_cpu(cpu) | 1909 | for_each_tracing_cpu(cpu) |
1869 | tracing_iter_reset(iter, cpu); | 1910 | tracing_iter_reset(iter, cpu); |
1870 | } else | 1911 | } else |
1871 | tracing_iter_reset(iter, cpu_file); | 1912 | tracing_iter_reset(iter, cpu_file); |
1872 | 1913 | ||
1873 | ftrace_enable_cpu(); | ||
1874 | |||
1875 | iter->leftover = 0; | 1914 | iter->leftover = 0; |
1876 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) | 1915 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) |
1877 | ; | 1916 | ; |
@@ -2332,15 +2371,13 @@ static struct trace_iterator * | |||
2332 | __tracing_open(struct inode *inode, struct file *file) | 2371 | __tracing_open(struct inode *inode, struct file *file) |
2333 | { | 2372 | { |
2334 | long cpu_file = (long) inode->i_private; | 2373 | long cpu_file = (long) inode->i_private; |
2335 | void *fail_ret = ERR_PTR(-ENOMEM); | ||
2336 | struct trace_iterator *iter; | 2374 | struct trace_iterator *iter; |
2337 | struct seq_file *m; | 2375 | int cpu; |
2338 | int cpu, ret; | ||
2339 | 2376 | ||
2340 | if (tracing_disabled) | 2377 | if (tracing_disabled) |
2341 | return ERR_PTR(-ENODEV); | 2378 | return ERR_PTR(-ENODEV); |
2342 | 2379 | ||
2343 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2380 | iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); |
2344 | if (!iter) | 2381 | if (!iter) |
2345 | return ERR_PTR(-ENOMEM); | 2382 | return ERR_PTR(-ENOMEM); |
2346 | 2383 | ||
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2397 | tracing_iter_reset(iter, cpu); | 2434 | tracing_iter_reset(iter, cpu); |
2398 | } | 2435 | } |
2399 | 2436 | ||
2400 | ret = seq_open(file, &tracer_seq_ops); | ||
2401 | if (ret < 0) { | ||
2402 | fail_ret = ERR_PTR(ret); | ||
2403 | goto fail_buffer; | ||
2404 | } | ||
2405 | |||
2406 | m = file->private_data; | ||
2407 | m->private = iter; | ||
2408 | |||
2409 | mutex_unlock(&trace_types_lock); | 2437 | mutex_unlock(&trace_types_lock); |
2410 | 2438 | ||
2411 | return iter; | 2439 | return iter; |
2412 | 2440 | ||
2413 | fail_buffer: | ||
2414 | for_each_tracing_cpu(cpu) { | ||
2415 | if (iter->buffer_iter[cpu]) | ||
2416 | ring_buffer_read_finish(iter->buffer_iter[cpu]); | ||
2417 | } | ||
2418 | free_cpumask_var(iter->started); | ||
2419 | tracing_start(); | ||
2420 | fail: | 2441 | fail: |
2421 | mutex_unlock(&trace_types_lock); | 2442 | mutex_unlock(&trace_types_lock); |
2422 | kfree(iter->trace); | 2443 | kfree(iter->trace); |
2423 | kfree(iter); | 2444 | seq_release_private(inode, file); |
2424 | 2445 | return ERR_PTR(-ENOMEM); | |
2425 | return fail_ret; | ||
2426 | } | 2446 | } |
2427 | 2447 | ||
2428 | int tracing_open_generic(struct inode *inode, struct file *filp) | 2448 | int tracing_open_generic(struct inode *inode, struct file *filp) |
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2458 | tracing_start(); | 2478 | tracing_start(); |
2459 | mutex_unlock(&trace_types_lock); | 2479 | mutex_unlock(&trace_types_lock); |
2460 | 2480 | ||
2461 | seq_release(inode, file); | ||
2462 | mutex_destroy(&iter->mutex); | 2481 | mutex_destroy(&iter->mutex); |
2463 | free_cpumask_var(iter->started); | 2482 | free_cpumask_var(iter->started); |
2464 | kfree(iter->trace); | 2483 | kfree(iter->trace); |
2465 | kfree(iter); | 2484 | seq_release_private(inode, file); |
2466 | return 0; | 2485 | return 0; |
2467 | } | 2486 | } |
2468 | 2487 | ||
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
2648 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 2667 | if (cpumask_test_cpu(cpu, tracing_cpumask) && |
2649 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2668 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2650 | atomic_inc(&global_trace.data[cpu]->disabled); | 2669 | atomic_inc(&global_trace.data[cpu]->disabled); |
2670 | ring_buffer_record_disable_cpu(global_trace.buffer, cpu); | ||
2651 | } | 2671 | } |
2652 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 2672 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && |
2653 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2673 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2654 | atomic_dec(&global_trace.data[cpu]->disabled); | 2674 | atomic_dec(&global_trace.data[cpu]->disabled); |
2675 | ring_buffer_record_enable_cpu(global_trace.buffer, cpu); | ||
2655 | } | 2676 | } |
2656 | } | 2677 | } |
2657 | arch_spin_unlock(&ftrace_max_lock); | 2678 | arch_spin_unlock(&ftrace_max_lock); |
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr) | |||
2974 | return t->init(tr); | 2995 | return t->init(tr); |
2975 | } | 2996 | } |
2976 | 2997 | ||
2977 | static int __tracing_resize_ring_buffer(unsigned long size) | 2998 | static void set_buffer_entries(struct trace_array *tr, unsigned long val) |
2999 | { | ||
3000 | int cpu; | ||
3001 | for_each_tracing_cpu(cpu) | ||
3002 | tr->data[cpu]->entries = val; | ||
3003 | } | ||
3004 | |||
3005 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | ||
2978 | { | 3006 | { |
2979 | int ret; | 3007 | int ret; |
2980 | 3008 | ||
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
2985 | */ | 3013 | */ |
2986 | ring_buffer_expanded = 1; | 3014 | ring_buffer_expanded = 1; |
2987 | 3015 | ||
2988 | ret = ring_buffer_resize(global_trace.buffer, size); | 3016 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
2989 | if (ret < 0) | 3017 | if (ret < 0) |
2990 | return ret; | 3018 | return ret; |
2991 | 3019 | ||
2992 | if (!current_trace->use_max_tr) | 3020 | if (!current_trace->use_max_tr) |
2993 | goto out; | 3021 | goto out; |
2994 | 3022 | ||
2995 | ret = ring_buffer_resize(max_tr.buffer, size); | 3023 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); |
2996 | if (ret < 0) { | 3024 | if (ret < 0) { |
2997 | int r; | 3025 | int r = 0; |
3026 | |||
3027 | if (cpu == RING_BUFFER_ALL_CPUS) { | ||
3028 | int i; | ||
3029 | for_each_tracing_cpu(i) { | ||
3030 | r = ring_buffer_resize(global_trace.buffer, | ||
3031 | global_trace.data[i]->entries, | ||
3032 | i); | ||
3033 | if (r < 0) | ||
3034 | break; | ||
3035 | } | ||
3036 | } else { | ||
3037 | r = ring_buffer_resize(global_trace.buffer, | ||
3038 | global_trace.data[cpu]->entries, | ||
3039 | cpu); | ||
3040 | } | ||
2998 | 3041 | ||
2999 | r = ring_buffer_resize(global_trace.buffer, | ||
3000 | global_trace.entries); | ||
3001 | if (r < 0) { | 3042 | if (r < 0) { |
3002 | /* | 3043 | /* |
3003 | * AARGH! We are left with different | 3044 | * AARGH! We are left with different |
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
3019 | return ret; | 3060 | return ret; |
3020 | } | 3061 | } |
3021 | 3062 | ||
3022 | max_tr.entries = size; | 3063 | if (cpu == RING_BUFFER_ALL_CPUS) |
3064 | set_buffer_entries(&max_tr, size); | ||
3065 | else | ||
3066 | max_tr.data[cpu]->entries = size; | ||
3067 | |||
3023 | out: | 3068 | out: |
3024 | global_trace.entries = size; | 3069 | if (cpu == RING_BUFFER_ALL_CPUS) |
3070 | set_buffer_entries(&global_trace, size); | ||
3071 | else | ||
3072 | global_trace.data[cpu]->entries = size; | ||
3025 | 3073 | ||
3026 | return ret; | 3074 | return ret; |
3027 | } | 3075 | } |
3028 | 3076 | ||
3029 | static ssize_t tracing_resize_ring_buffer(unsigned long size) | 3077 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) |
3030 | { | 3078 | { |
3031 | int cpu, ret = size; | 3079 | int ret = size; |
3032 | 3080 | ||
3033 | mutex_lock(&trace_types_lock); | 3081 | mutex_lock(&trace_types_lock); |
3034 | 3082 | ||
3035 | tracing_stop(); | 3083 | if (cpu_id != RING_BUFFER_ALL_CPUS) { |
3036 | 3084 | /* make sure, this cpu is enabled in the mask */ | |
3037 | /* disable all cpu buffers */ | 3085 | if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { |
3038 | for_each_tracing_cpu(cpu) { | 3086 | ret = -EINVAL; |
3039 | if (global_trace.data[cpu]) | 3087 | goto out; |
3040 | atomic_inc(&global_trace.data[cpu]->disabled); | 3088 | } |
3041 | if (max_tr.data[cpu]) | ||
3042 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
3043 | } | 3089 | } |
3044 | 3090 | ||
3045 | if (size != global_trace.entries) | 3091 | ret = __tracing_resize_ring_buffer(size, cpu_id); |
3046 | ret = __tracing_resize_ring_buffer(size); | ||
3047 | |||
3048 | if (ret < 0) | 3092 | if (ret < 0) |
3049 | ret = -ENOMEM; | 3093 | ret = -ENOMEM; |
3050 | 3094 | ||
3051 | for_each_tracing_cpu(cpu) { | 3095 | out: |
3052 | if (global_trace.data[cpu]) | ||
3053 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
3054 | if (max_tr.data[cpu]) | ||
3055 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
3056 | } | ||
3057 | |||
3058 | tracing_start(); | ||
3059 | mutex_unlock(&trace_types_lock); | 3096 | mutex_unlock(&trace_types_lock); |
3060 | 3097 | ||
3061 | return ret; | 3098 | return ret; |
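With the two hunks above, ring-buffer resizing becomes per-CPU: ring_buffer_resize() takes a third cpu argument, the size bookkeeping moves from a single trace_array.entries to trace_array_cpu->entries (see the trace.h hunk below), and the old stop-the-world tracing_stop()/disabled-counter dance is replaced by resizing either one CPU or RING_BUFFER_ALL_CPUS under trace_types_lock, with a failed max_tr resize rolled back per CPU from the recorded entries values. A minimal sketch of how code inside trace.c would drive the new interface -- the wrapper names are purely illustrative, and the KiB shift mirrors the buffer_size_kb write path further down:

	/* hypothetical wrappers, for illustration only */
	static int resize_one_cpu_kb(unsigned long kb, int cpu)
	{
		return tracing_resize_ring_buffer(kb << 10, cpu);
	}

	static int resize_all_cpus_kb(unsigned long kb)
	{
		return tracing_resize_ring_buffer(kb << 10, RING_BUFFER_ALL_CPUS);
	}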
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void) | |||
3078 | 3115 | ||
3079 | mutex_lock(&trace_types_lock); | 3116 | mutex_lock(&trace_types_lock); |
3080 | if (!ring_buffer_expanded) | 3117 | if (!ring_buffer_expanded) |
3081 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3118 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3119 | RING_BUFFER_ALL_CPUS); | ||
3082 | mutex_unlock(&trace_types_lock); | 3120 | mutex_unlock(&trace_types_lock); |
3083 | 3121 | ||
3084 | return ret; | 3122 | return ret; |
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf) | |||
3102 | mutex_lock(&trace_types_lock); | 3140 | mutex_lock(&trace_types_lock); |
3103 | 3141 | ||
3104 | if (!ring_buffer_expanded) { | 3142 | if (!ring_buffer_expanded) { |
3105 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3143 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3144 | RING_BUFFER_ALL_CPUS); | ||
3106 | if (ret < 0) | 3145 | if (ret < 0) |
3107 | goto out; | 3146 | goto out; |
3108 | ret = 0; | 3147 | ret = 0; |
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf) | |||
3128 | * The max_tr ring buffer has some state (e.g. ring->clock) and | 3167 | * The max_tr ring buffer has some state (e.g. ring->clock) and |
3129 | * we want preserve it. | 3168 | * we want preserve it. |
3130 | */ | 3169 | */ |
3131 | ring_buffer_resize(max_tr.buffer, 1); | 3170 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); |
3132 | max_tr.entries = 1; | 3171 | set_buffer_entries(&max_tr, 1); |
3133 | } | 3172 | } |
3134 | destroy_trace_option_files(topts); | 3173 | destroy_trace_option_files(topts); |
3135 | 3174 | ||
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf) | |||
3137 | 3176 | ||
3138 | topts = create_trace_option_files(current_trace); | 3177 | topts = create_trace_option_files(current_trace); |
3139 | if (current_trace->use_max_tr) { | 3178 | if (current_trace->use_max_tr) { |
3140 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | 3179 | int cpu; |
3141 | if (ret < 0) | 3180 | /* we need to make per cpu buffer sizes equivalent */ |
3142 | goto out; | 3181 | for_each_tracing_cpu(cpu) { |
3143 | max_tr.entries = global_trace.entries; | 3182 | ret = ring_buffer_resize(max_tr.buffer, |
3183 | global_trace.data[cpu]->entries, | ||
3184 | cpu); | ||
3185 | if (ret < 0) | ||
3186 | goto out; | ||
3187 | max_tr.data[cpu]->entries = | ||
3188 | global_trace.data[cpu]->entries; | ||
3189 | } | ||
3144 | } | 3190 | } |
3145 | 3191 | ||
3146 | if (t->init) { | 3192 | if (t->init) { |
@@ -3563,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3563 | .pages = pages_def, | 3609 | .pages = pages_def, |
3564 | .partial = partial_def, | 3610 | .partial = partial_def, |
3565 | .nr_pages = 0, /* This gets updated below. */ | 3611 | .nr_pages = 0, /* This gets updated below. */ |
3612 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
3566 | .flags = flags, | 3613 | .flags = flags, |
3567 | .ops = &tracing_pipe_buf_ops, | 3614 | .ops = &tracing_pipe_buf_ops, |
3568 | .spd_release = tracing_spd_release_pipe, | 3615 | .spd_release = tracing_spd_release_pipe, |
@@ -3634,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3634 | 3681 | ||
3635 | ret = splice_to_pipe(pipe, &spd); | 3682 | ret = splice_to_pipe(pipe, &spd); |
3636 | out: | 3683 | out: |
3637 | splice_shrink_spd(pipe, &spd); | 3684 | splice_shrink_spd(&spd); |
3638 | return ret; | 3685 | return ret; |
3639 | 3686 | ||
3640 | out_err: | 3687 | out_err: |
@@ -3642,30 +3689,82 @@ out_err: | |||
3642 | goto out; | 3689 | goto out; |
3643 | } | 3690 | } |
3644 | 3691 | ||
3692 | struct ftrace_entries_info { | ||
3693 | struct trace_array *tr; | ||
3694 | int cpu; | ||
3695 | }; | ||
3696 | |||
3697 | static int tracing_entries_open(struct inode *inode, struct file *filp) | ||
3698 | { | ||
3699 | struct ftrace_entries_info *info; | ||
3700 | |||
3701 | if (tracing_disabled) | ||
3702 | return -ENODEV; | ||
3703 | |||
3704 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
3705 | if (!info) | ||
3706 | return -ENOMEM; | ||
3707 | |||
3708 | info->tr = &global_trace; | ||
3709 | info->cpu = (unsigned long)inode->i_private; | ||
3710 | |||
3711 | filp->private_data = info; | ||
3712 | |||
3713 | return 0; | ||
3714 | } | ||
3715 | |||
3645 | static ssize_t | 3716 | static ssize_t |
3646 | tracing_entries_read(struct file *filp, char __user *ubuf, | 3717 | tracing_entries_read(struct file *filp, char __user *ubuf, |
3647 | size_t cnt, loff_t *ppos) | 3718 | size_t cnt, loff_t *ppos) |
3648 | { | 3719 | { |
3649 | struct trace_array *tr = filp->private_data; | 3720 | struct ftrace_entries_info *info = filp->private_data; |
3650 | char buf[96]; | 3721 | struct trace_array *tr = info->tr; |
3651 | int r; | 3722 | char buf[64]; |
3723 | int r = 0; | ||
3724 | ssize_t ret; | ||
3652 | 3725 | ||
3653 | mutex_lock(&trace_types_lock); | 3726 | mutex_lock(&trace_types_lock); |
3654 | if (!ring_buffer_expanded) | 3727 | |
3655 | r = sprintf(buf, "%lu (expanded: %lu)\n", | 3728 | if (info->cpu == RING_BUFFER_ALL_CPUS) { |
3656 | tr->entries >> 10, | 3729 | int cpu, buf_size_same; |
3657 | trace_buf_size >> 10); | 3730 | unsigned long size; |
3658 | else | 3731 | |
3659 | r = sprintf(buf, "%lu\n", tr->entries >> 10); | 3732 | size = 0; |
3733 | buf_size_same = 1; | ||
3734 | /* check if all cpu sizes are same */ | ||
3735 | for_each_tracing_cpu(cpu) { | ||
3736 | /* fill in the size from first enabled cpu */ | ||
3737 | if (size == 0) | ||
3738 | size = tr->data[cpu]->entries; | ||
3739 | if (size != tr->data[cpu]->entries) { | ||
3740 | buf_size_same = 0; | ||
3741 | break; | ||
3742 | } | ||
3743 | } | ||
3744 | |||
3745 | if (buf_size_same) { | ||
3746 | if (!ring_buffer_expanded) | ||
3747 | r = sprintf(buf, "%lu (expanded: %lu)\n", | ||
3748 | size >> 10, | ||
3749 | trace_buf_size >> 10); | ||
3750 | else | ||
3751 | r = sprintf(buf, "%lu\n", size >> 10); | ||
3752 | } else | ||
3753 | r = sprintf(buf, "X\n"); | ||
3754 | } else | ||
3755 | r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); | ||
3756 | |||
3660 | mutex_unlock(&trace_types_lock); | 3757 | mutex_unlock(&trace_types_lock); |
3661 | 3758 | ||
3662 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 3759 | ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
3760 | return ret; | ||
3663 | } | 3761 | } |
3664 | 3762 | ||
3665 | static ssize_t | 3763 | static ssize_t |
3666 | tracing_entries_write(struct file *filp, const char __user *ubuf, | 3764 | tracing_entries_write(struct file *filp, const char __user *ubuf, |
3667 | size_t cnt, loff_t *ppos) | 3765 | size_t cnt, loff_t *ppos) |
3668 | { | 3766 | { |
3767 | struct ftrace_entries_info *info = filp->private_data; | ||
3669 | unsigned long val; | 3768 | unsigned long val; |
3670 | int ret; | 3769 | int ret; |
3671 | 3770 | ||
@@ -3680,7 +3779,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3680 | /* value is in KB */ | 3779 | /* value is in KB */ |
3681 | val <<= 10; | 3780 | val <<= 10; |
3682 | 3781 | ||
3683 | ret = tracing_resize_ring_buffer(val); | 3782 | ret = tracing_resize_ring_buffer(val, info->cpu); |
3684 | if (ret < 0) | 3783 | if (ret < 0) |
3685 | return ret; | 3784 | return ret; |
3686 | 3785 | ||
@@ -3689,6 +3788,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3689 | return cnt; | 3788 | return cnt; |
3690 | } | 3789 | } |
3691 | 3790 | ||
3791 | static int | ||
3792 | tracing_entries_release(struct inode *inode, struct file *filp) | ||
3793 | { | ||
3794 | struct ftrace_entries_info *info = filp->private_data; | ||
3795 | |||
3796 | kfree(info); | ||
3797 | |||
3798 | return 0; | ||
3799 | } | ||
3800 | |||
3692 | static ssize_t | 3801 | static ssize_t |
3693 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | 3802 | tracing_total_entries_read(struct file *filp, char __user *ubuf, |
3694 | size_t cnt, loff_t *ppos) | 3803 | size_t cnt, loff_t *ppos) |
@@ -3700,7 +3809,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, | |||
3700 | 3809 | ||
3701 | mutex_lock(&trace_types_lock); | 3810 | mutex_lock(&trace_types_lock); |
3702 | for_each_tracing_cpu(cpu) { | 3811 | for_each_tracing_cpu(cpu) { |
3703 | size += tr->entries >> 10; | 3812 | size += tr->data[cpu]->entries >> 10; |
3704 | if (!ring_buffer_expanded) | 3813 | if (!ring_buffer_expanded) |
3705 | expanded_size += trace_buf_size >> 10; | 3814 | expanded_size += trace_buf_size >> 10; |
3706 | } | 3815 | } |
@@ -3734,7 +3843,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
3734 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | 3843 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) |
3735 | tracing_off(); | 3844 | tracing_off(); |
3736 | /* resize the ring buffer to 0 */ | 3845 | /* resize the ring buffer to 0 */ |
3737 | tracing_resize_ring_buffer(0); | 3846 | tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); |
3738 | 3847 | ||
3739 | return 0; | 3848 | return 0; |
3740 | } | 3849 | } |
@@ -3749,14 +3858,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3749 | struct print_entry *entry; | 3858 | struct print_entry *entry; |
3750 | unsigned long irq_flags; | 3859 | unsigned long irq_flags; |
3751 | struct page *pages[2]; | 3860 | struct page *pages[2]; |
3861 | void *map_page[2]; | ||
3752 | int nr_pages = 1; | 3862 | int nr_pages = 1; |
3753 | ssize_t written; | 3863 | ssize_t written; |
3754 | void *page1; | ||
3755 | void *page2; | ||
3756 | int offset; | 3864 | int offset; |
3757 | int size; | 3865 | int size; |
3758 | int len; | 3866 | int len; |
3759 | int ret; | 3867 | int ret; |
3868 | int i; | ||
3760 | 3869 | ||
3761 | if (tracing_disabled) | 3870 | if (tracing_disabled) |
3762 | return -EINVAL; | 3871 | return -EINVAL; |
@@ -3795,9 +3904,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3795 | goto out; | 3904 | goto out; |
3796 | } | 3905 | } |
3797 | 3906 | ||
3798 | page1 = kmap_atomic(pages[0]); | 3907 | for (i = 0; i < nr_pages; i++) |
3799 | if (nr_pages == 2) | 3908 | map_page[i] = kmap_atomic(pages[i]); |
3800 | page2 = kmap_atomic(pages[1]); | ||
3801 | 3909 | ||
3802 | local_save_flags(irq_flags); | 3910 | local_save_flags(irq_flags); |
3803 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 3911 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
@@ -3815,10 +3923,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3815 | 3923 | ||
3816 | if (nr_pages == 2) { | 3924 | if (nr_pages == 2) { |
3817 | len = PAGE_SIZE - offset; | 3925 | len = PAGE_SIZE - offset; |
3818 | memcpy(&entry->buf, page1 + offset, len); | 3926 | memcpy(&entry->buf, map_page[0] + offset, len); |
3819 | memcpy(&entry->buf[len], page2, cnt - len); | 3927 | memcpy(&entry->buf[len], map_page[1], cnt - len); |
3820 | } else | 3928 | } else |
3821 | memcpy(&entry->buf, page1 + offset, cnt); | 3929 | memcpy(&entry->buf, map_page[0] + offset, cnt); |
3822 | 3930 | ||
3823 | if (entry->buf[cnt - 1] != '\n') { | 3931 | if (entry->buf[cnt - 1] != '\n') { |
3824 | entry->buf[cnt] = '\n'; | 3932 | entry->buf[cnt] = '\n'; |
@@ -3833,11 +3941,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3833 | *fpos += written; | 3941 | *fpos += written; |
3834 | 3942 | ||
3835 | out_unlock: | 3943 | out_unlock: |
3836 | if (nr_pages == 2) | 3944 | for (i = 0; i < nr_pages; i++){ |
3837 | kunmap_atomic(page2); | 3945 | kunmap_atomic(map_page[i]); |
3838 | kunmap_atomic(page1); | 3946 | put_page(pages[i]); |
3839 | while (nr_pages > 0) | 3947 | } |
3840 | put_page(pages[--nr_pages]); | ||
3841 | out: | 3948 | out: |
3842 | return written; | 3949 | return written; |
3843 | } | 3950 | } |
@@ -3933,9 +4040,10 @@ static const struct file_operations tracing_pipe_fops = { | |||
3933 | }; | 4040 | }; |
3934 | 4041 | ||
3935 | static const struct file_operations tracing_entries_fops = { | 4042 | static const struct file_operations tracing_entries_fops = { |
3936 | .open = tracing_open_generic, | 4043 | .open = tracing_entries_open, |
3937 | .read = tracing_entries_read, | 4044 | .read = tracing_entries_read, |
3938 | .write = tracing_entries_write, | 4045 | .write = tracing_entries_write, |
4046 | .release = tracing_entries_release, | ||
3939 | .llseek = generic_file_llseek, | 4047 | .llseek = generic_file_llseek, |
3940 | }; | 4048 | }; |
3941 | 4049 | ||
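tracing_entries_fops now routes through tracing_entries_open()/tracing_entries_release(), which carry a small {trace_array, cpu} pair in file->private_data: the global buffer_size_kb file is registered with RING_BUFFER_ALL_CPUS as its inode data and prints "X" when the per-CPU sizes diverge, while each per-CPU debugfs directory gains a read-only buffer_size_kb (both registrations appear in later hunks). A small userspace sketch of what the new layout exposes; the paths assume the usual debugfs mount point and per_cpu/cpuN directory naming:

	#include <stdio.h>

	int main(void)
	{
		char line[64];
		FILE *f;

		/* one value if every CPU agrees, "X" otherwise */
		f = fopen("/sys/kernel/debug/tracing/buffer_size_kb", "r");
		if (f && fgets(line, sizeof(line), f))
			printf("global : %s", line);
		if (f)
			fclose(f);

		/* per-CPU view of the same setting (read-only, mode 0444) */
		f = fopen("/sys/kernel/debug/tracing/per_cpu/cpu0/buffer_size_kb", "r");
		if (f && fgets(line, sizeof(line), f))
			printf("cpu0   : %s", line);
		if (f)
			fclose(f);
		return 0;
	}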
@@ -4124,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4124 | struct splice_pipe_desc spd = { | 4232 | struct splice_pipe_desc spd = { |
4125 | .pages = pages_def, | 4233 | .pages = pages_def, |
4126 | .partial = partial_def, | 4234 | .partial = partial_def, |
4235 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
4127 | .flags = flags, | 4236 | .flags = flags, |
4128 | .ops = &buffer_pipe_buf_ops, | 4237 | .ops = &buffer_pipe_buf_ops, |
4129 | .spd_release = buffer_spd_release, | 4238 | .spd_release = buffer_spd_release, |
@@ -4211,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4211 | } | 4320 | } |
4212 | 4321 | ||
4213 | ret = splice_to_pipe(pipe, &spd); | 4322 | ret = splice_to_pipe(pipe, &spd); |
4214 | splice_shrink_spd(pipe, &spd); | 4323 | splice_shrink_spd(&spd); |
4215 | out: | 4324 | out: |
4216 | return ret; | 4325 | return ret; |
4217 | } | 4326 | } |
@@ -4367,6 +4476,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4367 | struct dentry *d_cpu; | 4476 | struct dentry *d_cpu; |
4368 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 4477 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4369 | 4478 | ||
4479 | if (!d_percpu) | ||
4480 | return; | ||
4481 | |||
4370 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 4482 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4371 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4483 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4372 | if (!d_cpu) { | 4484 | if (!d_cpu) { |
@@ -4387,6 +4499,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4387 | 4499 | ||
4388 | trace_create_file("stats", 0444, d_cpu, | 4500 | trace_create_file("stats", 0444, d_cpu, |
4389 | (void *) cpu, &tracing_stats_fops); | 4501 | (void *) cpu, &tracing_stats_fops); |
4502 | |||
4503 | trace_create_file("buffer_size_kb", 0444, d_cpu, | ||
4504 | (void *) cpu, &tracing_entries_fops); | ||
4390 | } | 4505 | } |
4391 | 4506 | ||
4392 | #ifdef CONFIG_FTRACE_SELFTEST | 4507 | #ifdef CONFIG_FTRACE_SELFTEST |
@@ -4718,7 +4833,7 @@ static __init int tracer_init_debugfs(void) | |||
4718 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); | 4833 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); |
4719 | 4834 | ||
4720 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4835 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4721 | &global_trace, &tracing_entries_fops); | 4836 | (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); |
4722 | 4837 | ||
4723 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | 4838 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, |
4724 | &global_trace, &tracing_total_entries_fops); | 4839 | &global_trace, &tracing_total_entries_fops); |
@@ -4957,6 +5072,10 @@ __init static int tracer_alloc_buffers(void) | |||
4957 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 5072 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
4958 | goto out_free_buffer_mask; | 5073 | goto out_free_buffer_mask; |
4959 | 5074 | ||
5075 | /* Only allocate trace_printk buffers if a trace_printk exists */ | ||
5076 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | ||
5077 | trace_printk_init_buffers(); | ||
5078 | |||
4960 | /* To save memory, keep the ring buffer size to its minimum */ | 5079 | /* To save memory, keep the ring buffer size to its minimum */ |
4961 | if (ring_buffer_expanded) | 5080 | if (ring_buffer_expanded) |
4962 | ring_buf_size = trace_buf_size; | 5081 | ring_buf_size = trace_buf_size; |
@@ -4975,7 +5094,6 @@ __init static int tracer_alloc_buffers(void) | |||
4975 | WARN_ON(1); | 5094 | WARN_ON(1); |
4976 | goto out_free_cpumask; | 5095 | goto out_free_cpumask; |
4977 | } | 5096 | } |
4978 | global_trace.entries = ring_buffer_size(global_trace.buffer); | ||
4979 | if (global_trace.buffer_disabled) | 5097 | if (global_trace.buffer_disabled) |
4980 | tracing_off(); | 5098 | tracing_off(); |
4981 | 5099 | ||
@@ -4988,7 +5106,6 @@ __init static int tracer_alloc_buffers(void) | |||
4988 | ring_buffer_free(global_trace.buffer); | 5106 | ring_buffer_free(global_trace.buffer); |
4989 | goto out_free_cpumask; | 5107 | goto out_free_cpumask; |
4990 | } | 5108 | } |
4991 | max_tr.entries = 1; | ||
4992 | #endif | 5109 | #endif |
4993 | 5110 | ||
4994 | /* Allocate the first page for all buffers */ | 5111 | /* Allocate the first page for all buffers */ |
@@ -4997,6 +5114,12 @@ __init static int tracer_alloc_buffers(void) | |||
4997 | max_tr.data[i] = &per_cpu(max_tr_data, i); | 5114 | max_tr.data[i] = &per_cpu(max_tr_data, i); |
4998 | } | 5115 | } |
4999 | 5116 | ||
5117 | set_buffer_entries(&global_trace, | ||
5118 | ring_buffer_size(global_trace.buffer, 0)); | ||
5119 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5120 | set_buffer_entries(&max_tr, 1); | ||
5121 | #endif | ||
5122 | |||
5000 | trace_init_cmdlines(); | 5123 | trace_init_cmdlines(); |
5001 | 5124 | ||
5002 | register_tracer(&nop_trace); | 5125 | register_tracer(&nop_trace); |
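tracer_alloc_buffers() now calls trace_printk_init_buffers() only when the kernel image actually contains a trace_printk() call site, detected by comparing the linker-provided bounds of the format-string section (the matching extern arrays and the prototype are added to trace.h below). A minimal sketch of the check, using the symbols exactly as they appear in the hunk above; how each call site populates the section is assumed rather than shown here:

	#include <linux/types.h>

	/* Each trace_printk() call site drops a pointer to its format string
	 * into a dedicated linker section; the linker emits start/stop
	 * symbols around it, so an empty section means no call sites were
	 * compiled in. */
	extern const char *__start___trace_bprintk_fmt[];
	extern const char *__stop___trace_bprintk_fmt[];

	static bool kernel_has_trace_printk(void)	/* illustrative name */
	{
		return __stop___trace_bprintk_fmt != __start___trace_bprintk_fmt;
	}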
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f95d65da6db8..5aec220d2de0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head { | |||
103 | unsigned long ret_ip; | 103 | unsigned long ret_ip; |
104 | }; | 104 | }; |
105 | 105 | ||
106 | struct uprobe_trace_entry_head { | ||
107 | struct trace_entry ent; | ||
108 | unsigned long ip; | ||
109 | }; | ||
110 | |||
106 | /* | 111 | /* |
107 | * trace_flag_type is an enumeration that holds different | 112 | * trace_flag_type is an enumeration that holds different |
108 | * states when a trace occurs. These are: | 113 | * states when a trace occurs. These are: |
@@ -131,6 +136,7 @@ struct trace_array_cpu { | |||
131 | atomic_t disabled; | 136 | atomic_t disabled; |
132 | void *buffer_page; /* ring buffer spare */ | 137 | void *buffer_page; /* ring buffer spare */ |
133 | 138 | ||
139 | unsigned long entries; | ||
134 | unsigned long saved_latency; | 140 | unsigned long saved_latency; |
135 | unsigned long critical_start; | 141 | unsigned long critical_start; |
136 | unsigned long critical_end; | 142 | unsigned long critical_end; |
@@ -152,7 +158,6 @@ struct trace_array_cpu { | |||
152 | */ | 158 | */ |
153 | struct trace_array { | 159 | struct trace_array { |
154 | struct ring_buffer *buffer; | 160 | struct ring_buffer *buffer; |
155 | unsigned long entries; | ||
156 | int cpu; | 161 | int cpu; |
157 | int buffer_disabled; | 162 | int buffer_disabled; |
158 | cycle_t time_start; | 163 | cycle_t time_start; |
@@ -826,6 +831,8 @@ extern struct list_head ftrace_events; | |||
826 | extern const char *__start___trace_bprintk_fmt[]; | 831 | extern const char *__start___trace_bprintk_fmt[]; |
827 | extern const char *__stop___trace_bprintk_fmt[]; | 832 | extern const char *__stop___trace_bprintk_fmt[]; |
828 | 833 | ||
834 | void trace_printk_init_buffers(void); | ||
835 | |||
829 | #undef FTRACE_ENTRY | 836 | #undef FTRACE_ENTRY |
830 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 837 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
831 | extern struct ftrace_event_call \ | 838 | extern struct ftrace_event_call \ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93ae8a9d..29111da1d100 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
294 | if (!call->name || !call->class || !call->class->reg) | 294 | if (!call->name || !call->class || !call->class->reg) |
295 | continue; | 295 | continue; |
296 | 296 | ||
297 | if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) | ||
298 | continue; | ||
299 | |||
297 | if (match && | 300 | if (match && |
298 | strcmp(match, call->name) != 0 && | 301 | strcmp(match, call->name) != 0 && |
299 | strcmp(match, call->class->system) != 0) | 302 | strcmp(match, call->class->system) != 0) |
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1164 | return -1; | 1167 | return -1; |
1165 | } | 1168 | } |
1166 | 1169 | ||
1167 | if (call->class->reg) | 1170 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1168 | trace_create_file("enable", 0644, call->dir, call, | 1171 | trace_create_file("enable", 0644, call->dir, call, |
1169 | enable); | 1172 | enable); |
1170 | 1173 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8bc856..e039906b037d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
180 | .event.type = etype, \ | 180 | .event.type = etype, \ |
181 | .class = &event_class_ftrace_##call, \ | 181 | .class = &event_class_ftrace_##call, \ |
182 | .print_fmt = print, \ | 182 | .print_fmt = print, \ |
183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | ||
183 | }; \ | 184 | }; \ |
184 | struct ftrace_event_call __used \ | 185 | struct ftrace_event_call __used \ |
185 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
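The trace_events.c and trace_export.c hunks above tag the ftrace-internal events (those defined via FTRACE_ENTRY in trace_export.c) with TRACE_EVENT_FL_IGNORE_ENABLE: __ftrace_set_clr_event() now skips them and event_create_dir() stops creating an "enable" file for them, so they remain visible but cannot be toggled from debugfs. Any other code that enumerates events and flips their state is presumably expected to honour the same flag; a hypothetical helper, only to spell out the condition the two hunks share:

	#include <linux/ftrace_event.h>

	/* illustrative only -- not a helper added by this series */
	static bool event_user_togglable(struct ftrace_event_call *call)
	{
		return call->name && call->class && call->class->reg &&
		       !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE);
	}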
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 580a05ec926b..b31d3d5699fe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -19,547 +19,15 @@ | |||
19 | 19 | ||
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/seq_file.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/smp.h> | ||
26 | #include <linux/debugfs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/string.h> | ||
29 | #include <linux/ctype.h> | ||
30 | #include <linux/ptrace.h> | ||
31 | #include <linux/perf_event.h> | ||
32 | #include <linux/stringify.h> | ||
33 | #include <linux/limits.h> | ||
34 | #include <asm/bitsperlong.h> | ||
35 | |||
36 | #include "trace.h" | ||
37 | #include "trace_output.h" | ||
38 | |||
39 | #define MAX_TRACE_ARGS 128 | ||
40 | #define MAX_ARGSTR_LEN 63 | ||
41 | #define MAX_EVENT_NAME_LEN 64 | ||
42 | #define MAX_STRING_SIZE PATH_MAX | ||
43 | #define KPROBE_EVENT_SYSTEM "kprobes" | ||
44 | |||
45 | /* Reserved field names */ | ||
46 | #define FIELD_STRING_IP "__probe_ip" | ||
47 | #define FIELD_STRING_RETIP "__probe_ret_ip" | ||
48 | #define FIELD_STRING_FUNC "__probe_func" | ||
49 | |||
50 | const char *reserved_field_names[] = { | ||
51 | "common_type", | ||
52 | "common_flags", | ||
53 | "common_preempt_count", | ||
54 | "common_pid", | ||
55 | "common_tgid", | ||
56 | FIELD_STRING_IP, | ||
57 | FIELD_STRING_RETIP, | ||
58 | FIELD_STRING_FUNC, | ||
59 | }; | ||
60 | |||
61 | /* Printing function type */ | ||
62 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, | ||
63 | void *); | ||
64 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
65 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
66 | |||
67 | /* Printing in basic type function template */ | ||
68 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | ||
69 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | ||
70 | const char *name, \ | ||
71 | void *data, void *ent)\ | ||
72 | { \ | ||
73 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | ||
74 | } \ | ||
75 | static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | ||
76 | |||
77 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) | ||
78 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) | ||
79 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) | ||
80 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) | ||
81 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) | ||
82 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | ||
83 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | ||
84 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | ||
85 | |||
86 | /* data_rloc: data relative location, compatible with u32 */ | ||
87 | #define make_data_rloc(len, roffs) \ | ||
88 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
89 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
90 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
91 | |||
92 | static inline void *get_rloc_data(u32 *dl) | ||
93 | { | ||
94 | return (u8 *)dl + get_rloc_offs(*dl); | ||
95 | } | ||
96 | |||
97 | /* For data_loc conversion */ | ||
98 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
99 | { | ||
100 | return (u8 *)ent + get_rloc_offs(*dl); | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Convert data_rloc to data_loc: | ||
105 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
106 | * stores the offset from event entry. | ||
107 | */ | ||
108 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
109 | |||
110 | /* For defining macros, define string/string_size types */ | ||
111 | typedef u32 string; | ||
112 | typedef u32 string_size; | ||
113 | |||
114 | /* Print type function for string type */ | ||
115 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
116 | const char *name, | ||
117 | void *data, void *ent) | ||
118 | { | ||
119 | int len = *(u32 *)data >> 16; | ||
120 | |||
121 | if (!len) | ||
122 | return trace_seq_printf(s, " %s=(fault)", name); | ||
123 | else | ||
124 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
125 | (const char *)get_loc_data(data, ent)); | ||
126 | } | ||
127 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
128 | |||
129 | /* Data fetch function type */ | ||
130 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | ||
131 | |||
132 | struct fetch_param { | ||
133 | fetch_func_t fn; | ||
134 | void *data; | ||
135 | }; | ||
136 | |||
137 | static __kprobes void call_fetch(struct fetch_param *fprm, | ||
138 | struct pt_regs *regs, void *dest) | ||
139 | { | ||
140 | return fprm->fn(regs, fprm->data, dest); | ||
141 | } | ||
142 | |||
143 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
144 | /* | ||
145 | * Define macro for basic types - we don't need to define s* types, because | ||
146 | * we have to care only about bitwidth at recording time. | ||
147 | */ | ||
148 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
149 | DEFINE_FETCH_##method(u8) \ | ||
150 | DEFINE_FETCH_##method(u16) \ | ||
151 | DEFINE_FETCH_##method(u32) \ | ||
152 | DEFINE_FETCH_##method(u64) | ||
153 | |||
154 | #define CHECK_FETCH_FUNCS(method, fn) \ | ||
155 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ | ||
156 | (FETCH_FUNC_NAME(method, u16) == fn) || \ | ||
157 | (FETCH_FUNC_NAME(method, u32) == fn) || \ | ||
158 | (FETCH_FUNC_NAME(method, u64) == fn) || \ | ||
159 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
160 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
161 | && (fn != NULL)) | ||
162 | |||
163 | /* Data fetch function templates */ | ||
164 | #define DEFINE_FETCH_reg(type) \ | ||
165 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | ||
166 | void *offset, void *dest) \ | ||
167 | { \ | ||
168 | *(type *)dest = (type)regs_get_register(regs, \ | ||
169 | (unsigned int)((unsigned long)offset)); \ | ||
170 | } | ||
171 | DEFINE_BASIC_FETCH_FUNCS(reg) | ||
172 | /* No string on the register */ | ||
173 | #define fetch_reg_string NULL | ||
174 | #define fetch_reg_string_size NULL | ||
175 | |||
176 | #define DEFINE_FETCH_stack(type) \ | ||
177 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
178 | void *offset, void *dest) \ | ||
179 | { \ | ||
180 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
181 | (unsigned int)((unsigned long)offset)); \ | ||
182 | } | ||
183 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
184 | /* No string on the stack entry */ | ||
185 | #define fetch_stack_string NULL | ||
186 | #define fetch_stack_string_size NULL | ||
187 | |||
188 | #define DEFINE_FETCH_retval(type) \ | ||
189 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | ||
190 | void *dummy, void *dest) \ | ||
191 | { \ | ||
192 | *(type *)dest = (type)regs_return_value(regs); \ | ||
193 | } | ||
194 | DEFINE_BASIC_FETCH_FUNCS(retval) | ||
195 | /* No string on the retval */ | ||
196 | #define fetch_retval_string NULL | ||
197 | #define fetch_retval_string_size NULL | ||
198 | |||
199 | #define DEFINE_FETCH_memory(type) \ | ||
200 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
201 | void *addr, void *dest) \ | ||
202 | { \ | ||
203 | type retval; \ | ||
204 | if (probe_kernel_address(addr, retval)) \ | ||
205 | *(type *)dest = 0; \ | ||
206 | else \ | ||
207 | *(type *)dest = retval; \ | ||
208 | } | ||
209 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
210 | /* | ||
211 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
212 | * length and relative data location. | ||
213 | */ | ||
214 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
215 | void *addr, void *dest) | ||
216 | { | ||
217 | long ret; | ||
218 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
219 | u8 *dst = get_rloc_data(dest); | ||
220 | u8 *src = addr; | ||
221 | mm_segment_t old_fs = get_fs(); | ||
222 | if (!maxlen) | ||
223 | return; | ||
224 | /* | ||
225 | * Try to get string again, since the string can be changed while | ||
226 | * probing. | ||
227 | */ | ||
228 | set_fs(KERNEL_DS); | ||
229 | pagefault_disable(); | ||
230 | do | ||
231 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
232 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
233 | dst[-1] = '\0'; | ||
234 | pagefault_enable(); | ||
235 | set_fs(old_fs); | ||
236 | |||
237 | if (ret < 0) { /* Failed to fetch string */ | ||
238 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
239 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
240 | } else | ||
241 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
242 | get_rloc_offs(*(u32 *)dest)); | ||
243 | } | ||
244 | /* Return the length of string -- including null terminal byte */ | ||
245 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
246 | void *addr, void *dest) | ||
247 | { | ||
248 | int ret, len = 0; | ||
249 | u8 c; | ||
250 | mm_segment_t old_fs = get_fs(); | ||
251 | |||
252 | set_fs(KERNEL_DS); | ||
253 | pagefault_disable(); | ||
254 | do { | ||
255 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
256 | len++; | ||
257 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
258 | pagefault_enable(); | ||
259 | set_fs(old_fs); | ||
260 | |||
261 | if (ret < 0) /* Failed to check the length */ | ||
262 | *(u32 *)dest = 0; | ||
263 | else | ||
264 | *(u32 *)dest = len; | ||
265 | } | ||
266 | |||
267 | /* Memory fetching by symbol */ | ||
268 | struct symbol_cache { | ||
269 | char *symbol; | ||
270 | long offset; | ||
271 | unsigned long addr; | ||
272 | }; | ||
273 | |||
274 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
275 | { | ||
276 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
277 | if (sc->addr) | ||
278 | sc->addr += sc->offset; | ||
279 | return sc->addr; | ||
280 | } | ||
281 | |||
282 | static void free_symbol_cache(struct symbol_cache *sc) | ||
283 | { | ||
284 | kfree(sc->symbol); | ||
285 | kfree(sc); | ||
286 | } | ||
287 | |||
288 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
289 | { | ||
290 | struct symbol_cache *sc; | ||
291 | |||
292 | if (!sym || strlen(sym) == 0) | ||
293 | return NULL; | ||
294 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
295 | if (!sc) | ||
296 | return NULL; | ||
297 | |||
298 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
299 | if (!sc->symbol) { | ||
300 | kfree(sc); | ||
301 | return NULL; | ||
302 | } | ||
303 | sc->offset = offset; | ||
304 | 22 | ||
305 | update_symbol_cache(sc); | 23 | #include "trace_probe.h" |
306 | return sc; | ||
307 | } | ||
308 | |||
309 | #define DEFINE_FETCH_symbol(type) \ | ||
310 | static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | ||
311 | void *data, void *dest) \ | ||
312 | { \ | ||
313 | struct symbol_cache *sc = data; \ | ||
314 | if (sc->addr) \ | ||
315 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
316 | else \ | ||
317 | *(type *)dest = 0; \ | ||
318 | } | ||
319 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
320 | DEFINE_FETCH_symbol(string) | ||
321 | DEFINE_FETCH_symbol(string_size) | ||
322 | |||
323 | /* Dereference memory access function */ | ||
324 | struct deref_fetch_param { | ||
325 | struct fetch_param orig; | ||
326 | long offset; | ||
327 | }; | ||
328 | |||
329 | #define DEFINE_FETCH_deref(type) \ | ||
330 | static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | ||
331 | void *data, void *dest) \ | ||
332 | { \ | ||
333 | struct deref_fetch_param *dprm = data; \ | ||
334 | unsigned long addr; \ | ||
335 | call_fetch(&dprm->orig, regs, &addr); \ | ||
336 | if (addr) { \ | ||
337 | addr += dprm->offset; \ | ||
338 | fetch_memory_##type(regs, (void *)addr, dest); \ | ||
339 | } else \ | ||
340 | *(type *)dest = 0; \ | ||
341 | } | ||
342 | DEFINE_BASIC_FETCH_FUNCS(deref) | ||
343 | DEFINE_FETCH_deref(string) | ||
344 | DEFINE_FETCH_deref(string_size) | ||
345 | |||
346 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
347 | { | ||
348 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
349 | update_deref_fetch_param(data->orig.data); | ||
350 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
351 | update_symbol_cache(data->orig.data); | ||
352 | } | ||
353 | |||
354 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | ||
355 | { | ||
356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
357 | free_deref_fetch_param(data->orig.data); | ||
358 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
359 | free_symbol_cache(data->orig.data); | ||
360 | kfree(data); | ||
361 | } | ||
362 | |||
363 | /* Bitfield fetch function */ | ||
364 | struct bitfield_fetch_param { | ||
365 | struct fetch_param orig; | ||
366 | unsigned char hi_shift; | ||
367 | unsigned char low_shift; | ||
368 | }; | ||
369 | 24 | ||
370 | #define DEFINE_FETCH_bitfield(type) \ | 25 | #define KPROBE_EVENT_SYSTEM "kprobes" |
371 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
372 | void *data, void *dest) \ | ||
373 | { \ | ||
374 | struct bitfield_fetch_param *bprm = data; \ | ||
375 | type buf = 0; \ | ||
376 | call_fetch(&bprm->orig, regs, &buf); \ | ||
377 | if (buf) { \ | ||
378 | buf <<= bprm->hi_shift; \ | ||
379 | buf >>= bprm->low_shift; \ | ||
380 | } \ | ||
381 | *(type *)dest = buf; \ | ||
382 | } | ||
383 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
384 | #define fetch_bitfield_string NULL | ||
385 | #define fetch_bitfield_string_size NULL | ||
386 | |||
387 | static __kprobes void | ||
388 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
389 | { | ||
390 | /* | ||
391 | * Don't check the bitfield itself, because this must be the | ||
392 | * last fetch function. | ||
393 | */ | ||
394 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
395 | update_deref_fetch_param(data->orig.data); | ||
396 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
397 | update_symbol_cache(data->orig.data); | ||
398 | } | ||
399 | |||
400 | static __kprobes void | ||
401 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
402 | { | ||
403 | /* | ||
404 | * Don't check the bitfield itself, because this must be the | ||
405 | * last fetch function. | ||
406 | */ | ||
407 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
408 | free_deref_fetch_param(data->orig.data); | ||
409 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
410 | free_symbol_cache(data->orig.data); | ||
411 | kfree(data); | ||
412 | } | ||
413 | |||
414 | /* Default (unsigned long) fetch type */ | ||
415 | #define __DEFAULT_FETCH_TYPE(t) u##t | ||
416 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
417 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
418 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
419 | |||
420 | /* Fetch types */ | ||
421 | enum { | ||
422 | FETCH_MTD_reg = 0, | ||
423 | FETCH_MTD_stack, | ||
424 | FETCH_MTD_retval, | ||
425 | FETCH_MTD_memory, | ||
426 | FETCH_MTD_symbol, | ||
427 | FETCH_MTD_deref, | ||
428 | FETCH_MTD_bitfield, | ||
429 | FETCH_MTD_END, | ||
430 | }; | ||
431 | |||
432 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
433 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
434 | |||
435 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
436 | {.name = _name, \ | ||
437 | .size = _size, \ | ||
438 | .is_signed = sign, \ | ||
439 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
440 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
441 | .fmttype = _fmttype, \ | ||
442 | .fetch = { \ | ||
443 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
444 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
445 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
446 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
447 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
448 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
449 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
450 | } \ | ||
451 | } | ||
452 | |||
453 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
454 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
455 | |||
456 | #define FETCH_TYPE_STRING 0 | ||
457 | #define FETCH_TYPE_STRSIZE 1 | ||
458 | |||
459 | /* Fetch type information table */ | ||
460 | static const struct fetch_type { | ||
461 | const char *name; /* Name of type */ | ||
462 | size_t size; /* Byte size of type */ | ||
463 | int is_signed; /* Signed flag */ | ||
464 | print_type_func_t print; /* Print functions */ | ||
465 | const char *fmt; /* Fromat string */ | ||
466 | const char *fmttype; /* Name in format file */ | ||
467 | /* Fetch functions */ | ||
468 | fetch_func_t fetch[FETCH_MTD_END]; | ||
469 | } fetch_type_table[] = { | ||
470 | /* Special types */ | ||
471 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
472 | sizeof(u32), 1, "__data_loc char[]"), | ||
473 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
474 | string_size, sizeof(u32), 0, "u32"), | ||
475 | /* Basic types */ | ||
476 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
477 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
478 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
479 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
480 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
481 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
482 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
483 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
484 | }; | ||
485 | |||
486 | static const struct fetch_type *find_fetch_type(const char *type) | ||
487 | { | ||
488 | int i; | ||
489 | |||
490 | if (!type) | ||
491 | type = DEFAULT_FETCH_TYPE_STR; | ||
492 | |||
493 | /* Special case: bitfield */ | ||
494 | if (*type == 'b') { | ||
495 | unsigned long bs; | ||
496 | type = strchr(type, '/'); | ||
497 | if (!type) | ||
498 | goto fail; | ||
499 | type++; | ||
500 | if (strict_strtoul(type, 0, &bs)) | ||
501 | goto fail; | ||
502 | switch (bs) { | ||
503 | case 8: | ||
504 | return find_fetch_type("u8"); | ||
505 | case 16: | ||
506 | return find_fetch_type("u16"); | ||
507 | case 32: | ||
508 | return find_fetch_type("u32"); | ||
509 | case 64: | ||
510 | return find_fetch_type("u64"); | ||
511 | default: | ||
512 | goto fail; | ||
513 | } | ||
514 | } | ||
515 | |||
516 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | ||
517 | if (strcmp(type, fetch_type_table[i].name) == 0) | ||
518 | return &fetch_type_table[i]; | ||
519 | fail: | ||
520 | return NULL; | ||
521 | } | ||
522 | |||
523 | /* Special function : only accept unsigned long */ | ||
524 | static __kprobes void fetch_stack_address(struct pt_regs *regs, | ||
525 | void *dummy, void *dest) | ||
526 | { | ||
527 | *(unsigned long *)dest = kernel_stack_pointer(regs); | ||
528 | } | ||
529 | |||
530 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
531 | fetch_func_t orig_fn) | ||
532 | { | ||
533 | int i; | ||
534 | |||
535 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
536 | return NULL; /* Only string type needs size function */ | ||
537 | for (i = 0; i < FETCH_MTD_END; i++) | ||
538 | if (type->fetch[i] == orig_fn) | ||
539 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
540 | |||
541 | WARN_ON(1); /* This should not happen */ | ||
542 | return NULL; | ||
543 | } | ||
544 | 26 | ||
545 | /** | 27 | /** |
546 | * Kprobe event core functions | 28 | * Kprobe event core functions |
547 | */ | 29 | */ |
548 | 30 | ||
549 | struct probe_arg { | ||
550 | struct fetch_param fetch; | ||
551 | struct fetch_param fetch_size; | ||
552 | unsigned int offset; /* Offset from argument entry */ | ||
553 | const char *name; /* Name of this argument */ | ||
554 | const char *comm; /* Command of this argument */ | ||
555 | const struct fetch_type *type; /* Type of this argument */ | ||
556 | }; | ||
557 | |||
558 | /* Flags for trace_probe */ | ||
559 | #define TP_FLAG_TRACE 1 | ||
560 | #define TP_FLAG_PROFILE 2 | ||
561 | #define TP_FLAG_REGISTERED 4 | ||
562 | |||
563 | struct trace_probe { | 31 | struct trace_probe { |
564 | struct list_head list; | 32 | struct list_head list; |
565 | struct kretprobe rp; /* Use rp.kp for kprobe use */ | 33 | struct kretprobe rp; /* Use rp.kp for kprobe use */ |
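The roughly 500 deleted lines above (and the further helper removals in the hunks below, such as is_good_name() and the symbol+offset parsing) do not drop the kprobe argument-fetching machinery; it moves out of trace_kprobe.c behind the new #include "trace_probe.h", and the surviving code calls it under traceprobe_* names (traceprobe_update_arg(), traceprobe_free_probe_arg()). Together with the uprobe_trace_entry_head added to trace.h earlier in this section, that points at the code being shared between kprobe- and uprobe-based events. The shared header itself is not part of this section; the sketch below is only the interface it is assumed to export, limited to the symbols the remaining trace_kprobe.c hunks reference, with real prototypes possibly differing:

	/* assumed shape of kernel/trace/trace_probe.h -- sketch only */
	#ifndef _TRACE_PROBE_H
	#define _TRACE_PROBE_H

	struct pt_regs;
	struct fetch_type;

	struct fetch_param {
		void	(*fn)(struct pt_regs *regs, void *data, void *dest);
		void	*data;
	};

	struct probe_arg {
		struct fetch_param	fetch;
		struct fetch_param	fetch_size;
		unsigned int		offset;		/* offset from argument entry */
		const char		*name;		/* name of this argument */
		const char		*comm;		/* command of this argument */
		const struct fetch_type	*type;		/* type of this argument */
	};

	void traceprobe_update_arg(struct probe_arg *arg);
	void traceprobe_free_probe_arg(struct probe_arg *arg);

	#endif /* _TRACE_PROBE_H */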
@@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | |||
631 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | 99 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, |
632 | struct pt_regs *regs); | 100 | struct pt_regs *regs); |
633 | 101 | ||
634 | /* Check the name is good for event/group/fields */ | ||
635 | static int is_good_name(const char *name) | ||
636 | { | ||
637 | if (!isalpha(*name) && *name != '_') | ||
638 | return 0; | ||
639 | while (*++name != '\0') { | ||
640 | if (!isalpha(*name) && !isdigit(*name) && *name != '_') | ||
641 | return 0; | ||
642 | } | ||
643 | return 1; | ||
644 | } | ||
645 | |||
646 | /* | 102 | /* |
647 | * Allocate new trace_probe and initialize it (including kprobes). | 103 | * Allocate new trace_probe and initialize it (including kprobes). |
648 | */ | 104 | */ |
@@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
651 | void *addr, | 107 | void *addr, |
652 | const char *symbol, | 108 | const char *symbol, |
653 | unsigned long offs, | 109 | unsigned long offs, |
654 | int nargs, int is_return) | 110 | int nargs, bool is_return) |
655 | { | 111 | { |
656 | struct trace_probe *tp; | 112 | struct trace_probe *tp; |
657 | int ret = -ENOMEM; | 113 | int ret = -ENOMEM; |
@@ -702,34 +158,12 @@ error: | |||
702 | return ERR_PTR(ret); | 158 | return ERR_PTR(ret); |
703 | } | 159 | } |
704 | 160 | ||
705 | static void update_probe_arg(struct probe_arg *arg) | ||
706 | { | ||
707 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
708 | update_bitfield_fetch_param(arg->fetch.data); | ||
709 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
710 | update_deref_fetch_param(arg->fetch.data); | ||
711 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
712 | update_symbol_cache(arg->fetch.data); | ||
713 | } | ||
714 | |||
715 | static void free_probe_arg(struct probe_arg *arg) | ||
716 | { | ||
717 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
718 | free_bitfield_fetch_param(arg->fetch.data); | ||
719 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
720 | free_deref_fetch_param(arg->fetch.data); | ||
721 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
722 | free_symbol_cache(arg->fetch.data); | ||
723 | kfree(arg->name); | ||
724 | kfree(arg->comm); | ||
725 | } | ||
726 | |||
727 | static void free_trace_probe(struct trace_probe *tp) | 161 | static void free_trace_probe(struct trace_probe *tp) |
728 | { | 162 | { |
729 | int i; | 163 | int i; |
730 | 164 | ||
731 | for (i = 0; i < tp->nr_args; i++) | 165 | for (i = 0; i < tp->nr_args; i++) |
732 | free_probe_arg(&tp->args[i]); | 166 | traceprobe_free_probe_arg(&tp->args[i]); |
733 | 167 | ||
734 | kfree(tp->call.class->system); | 168 | kfree(tp->call.class->system); |
735 | kfree(tp->call.name); | 169 | kfree(tp->call.name); |
@@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp) | |||
787 | return -EINVAL; | 221 | return -EINVAL; |
788 | 222 | ||
789 | for (i = 0; i < tp->nr_args; i++) | 223 | for (i = 0; i < tp->nr_args; i++) |
790 | update_probe_arg(&tp->args[i]); | 224 | traceprobe_update_arg(&tp->args[i]); |
791 | 225 | ||
792 | /* Set/clear disabled flag according to tp->flag */ | 226 | /* Set/clear disabled flag according to tp->flag */ |
793 | if (trace_probe_is_enabled(tp)) | 227 | if (trace_probe_is_enabled(tp)) |
@@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = { | |||
919 | .priority = 1 /* Invoked after kprobe module callback */ | 353 | .priority = 1 /* Invoked after kprobe module callback */ |
920 | }; | 354 | }; |
921 | 355 | ||
922 | /* Split symbol and offset. */ | ||
923 | static int split_symbol_offset(char *symbol, unsigned long *offset) | ||
924 | { | ||
925 | char *tmp; | ||
926 | int ret; | ||
927 | |||
928 | if (!offset) | ||
929 | return -EINVAL; | ||
930 | |||
931 | tmp = strchr(symbol, '+'); | ||
932 | if (tmp) { | ||
933 | /* skip sign because strict_strtol doesn't accept '+' */ | ||
934 | ret = strict_strtoul(tmp + 1, 0, offset); | ||
935 | if (ret) | ||
936 | return ret; | ||
937 | *tmp = '\0'; | ||
938 | } else | ||
939 | *offset = 0; | ||
940 | return 0; | ||
941 | } | ||
942 | |||
943 | #define PARAM_MAX_ARGS 16 | ||
944 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | ||
945 | |||
946 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | ||
947 | struct fetch_param *f, int is_return) | ||
948 | { | ||
949 | int ret = 0; | ||
950 | unsigned long param; | ||
951 | |||
952 | if (strcmp(arg, "retval") == 0) { | ||
953 | if (is_return) | ||
954 | f->fn = t->fetch[FETCH_MTD_retval]; | ||
955 | else | ||
956 | ret = -EINVAL; | ||
957 | } else if (strncmp(arg, "stack", 5) == 0) { | ||
958 | if (arg[5] == '\0') { | ||
959 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) | ||
960 | f->fn = fetch_stack_address; | ||
961 | else | ||
962 | ret = -EINVAL; | ||
963 | } else if (isdigit(arg[5])) { | ||
964 | ret = strict_strtoul(arg + 5, 10, ¶m); | ||
965 | if (ret || param > PARAM_MAX_STACK) | ||
966 | ret = -EINVAL; | ||
967 | else { | ||
968 | f->fn = t->fetch[FETCH_MTD_stack]; | ||
969 | f->data = (void *)param; | ||
970 | } | ||
971 | } else | ||
972 | ret = -EINVAL; | ||
973 | } else | ||
974 | ret = -EINVAL; | ||
975 | return ret; | ||
976 | } | ||
977 | |||
978 | /* Recursive argument parser */ | ||
979 | static int __parse_probe_arg(char *arg, const struct fetch_type *t, | ||
980 | struct fetch_param *f, int is_return) | ||
981 | { | ||
982 | int ret = 0; | ||
983 | unsigned long param; | ||
984 | long offset; | ||
985 | char *tmp; | ||
986 | |||
987 | switch (arg[0]) { | ||
988 | case '$': | ||
989 | ret = parse_probe_vars(arg + 1, t, f, is_return); | ||
990 | break; | ||
991 | case '%': /* named register */ | ||
992 | ret = regs_query_register_offset(arg + 1); | ||
993 | if (ret >= 0) { | ||
994 | f->fn = t->fetch[FETCH_MTD_reg]; | ||
995 | f->data = (void *)(unsigned long)ret; | ||
996 | ret = 0; | ||
997 | } | ||
998 | break; | ||
999 | case '@': /* memory or symbol */ | ||
1000 | if (isdigit(arg[1])) { | ||
1001 | ret = strict_strtoul(arg + 1, 0, ¶m); | ||
1002 | if (ret) | ||
1003 | break; | ||
1004 | f->fn = t->fetch[FETCH_MTD_memory]; | ||
1005 | f->data = (void *)param; | ||
1006 | } else { | ||
1007 | ret = split_symbol_offset(arg + 1, &offset); | ||
1008 | if (ret) | ||
1009 | break; | ||
1010 | f->data = alloc_symbol_cache(arg + 1, offset); | ||
1011 | if (f->data) | ||
1012 | f->fn = t->fetch[FETCH_MTD_symbol]; | ||
1013 | } | ||
1014 | break; | ||
1015 | case '+': /* deref memory */ | ||
1016 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
1017 | case '-': | ||
1018 | tmp = strchr(arg, '('); | ||
1019 | if (!tmp) | ||
1020 | break; | ||
1021 | *tmp = '\0'; | ||
1022 | ret = strict_strtol(arg, 0, &offset); | ||
1023 | if (ret) | ||
1024 | break; | ||
1025 | arg = tmp + 1; | ||
1026 | tmp = strrchr(arg, ')'); | ||
1027 | if (tmp) { | ||
1028 | struct deref_fetch_param *dprm; | ||
1029 | const struct fetch_type *t2 = find_fetch_type(NULL); | ||
1030 | *tmp = '\0'; | ||
1031 | dprm = kzalloc(sizeof(struct deref_fetch_param), | ||
1032 | GFP_KERNEL); | ||
1033 | if (!dprm) | ||
1034 | return -ENOMEM; | ||
1035 | dprm->offset = offset; | ||
1036 | ret = __parse_probe_arg(arg, t2, &dprm->orig, | ||
1037 | is_return); | ||
1038 | if (ret) | ||
1039 | kfree(dprm); | ||
1040 | else { | ||
1041 | f->fn = t->fetch[FETCH_MTD_deref]; | ||
1042 | f->data = (void *)dprm; | ||
1043 | } | ||
1044 | } | ||
1045 | break; | ||
1046 | } | ||
1047 | if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ | ||
1048 | pr_info("%s type has no corresponding fetch method.\n", | ||
1049 | t->name); | ||
1050 | ret = -EINVAL; | ||
1051 | } | ||
1052 | return ret; | ||
1053 | } | ||
1054 | |||
1055 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
1056 | |||
1057 | /* Bitfield type needs to be parsed into a fetch function */ | ||
1058 | static int __parse_bitfield_probe_arg(const char *bf, | ||
1059 | const struct fetch_type *t, | ||
1060 | struct fetch_param *f) | ||
1061 | { | ||
1062 | struct bitfield_fetch_param *bprm; | ||
1063 | unsigned long bw, bo; | ||
1064 | char *tail; | ||
1065 | |||
1066 | if (*bf != 'b') | ||
1067 | return 0; | ||
1068 | |||
1069 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
1070 | if (!bprm) | ||
1071 | return -ENOMEM; | ||
1072 | bprm->orig = *f; | ||
1073 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
1074 | f->data = (void *)bprm; | ||
1075 | |||
1076 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
1077 | if (bw == 0 || *tail != '@') | ||
1078 | return -EINVAL; | ||
1079 | |||
1080 | bf = tail + 1; | ||
1081 | bo = simple_strtoul(bf, &tail, 0); | ||
1082 | if (tail == bf || *tail != '/') | ||
1083 | return -EINVAL; | ||
1084 | |||
1085 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
1086 | bprm->low_shift = bprm->hi_shift + bo; | ||
1087 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
1088 | } | ||
1089 | |||
1090 | /* String length checking wrapper */ | ||
1091 | static int parse_probe_arg(char *arg, struct trace_probe *tp, | ||
1092 | struct probe_arg *parg, int is_return) | ||
1093 | { | ||
1094 | const char *t; | ||
1095 | int ret; | ||
1096 | |||
1097 | if (strlen(arg) > MAX_ARGSTR_LEN) { | ||
1098 | pr_info("Argument is too long.: %s\n", arg); | ||
1099 | return -ENOSPC; | ||
1100 | } | ||
1101 | parg->comm = kstrdup(arg, GFP_KERNEL); | ||
1102 | if (!parg->comm) { | ||
1103 | pr_info("Failed to allocate memory for command '%s'.\n", arg); | ||
1104 | return -ENOMEM; | ||
1105 | } | ||
1106 | t = strchr(parg->comm, ':'); | ||
1107 | if (t) { | ||
1108 | arg[t - parg->comm] = '\0'; | ||
1109 | t++; | ||
1110 | } | ||
1111 | parg->type = find_fetch_type(t); | ||
1112 | if (!parg->type) { | ||
1113 | pr_info("Unsupported type: %s\n", t); | ||
1114 | return -EINVAL; | ||
1115 | } | ||
1116 | parg->offset = tp->size; | ||
1117 | tp->size += parg->type->size; | ||
1118 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | ||
1119 | if (ret >= 0 && t != NULL) | ||
1120 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
1121 | if (ret >= 0) { | ||
1122 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
1123 | parg->fetch.fn); | ||
1124 | parg->fetch_size.data = parg->fetch.data; | ||
1125 | } | ||
1126 | return ret; | ||
1127 | } | ||
1128 | |||
1129 | /* Return 1 if name is reserved or already used by another argument */ | ||
1130 | static int conflict_field_name(const char *name, | ||
1131 | struct probe_arg *args, int narg) | ||
1132 | { | ||
1133 | int i; | ||
1134 | for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) | ||
1135 | if (strcmp(reserved_field_names[i], name) == 0) | ||
1136 | return 1; | ||
1137 | for (i = 0; i < narg; i++) | ||
1138 | if (strcmp(args[i].name, name) == 0) | ||
1139 | return 1; | ||
1140 | return 0; | ||
1141 | } | ||
1142 | |||
1143 | static int create_trace_probe(int argc, char **argv) | 356 | static int create_trace_probe(int argc, char **argv) |
1144 | { | 357 | { |
1145 | /* | 358 | /* |
@@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv) | |||
1162 | */ | 375 | */ |
1163 | struct trace_probe *tp; | 376 | struct trace_probe *tp; |
1164 | int i, ret = 0; | 377 | int i, ret = 0; |
1165 | int is_return = 0, is_delete = 0; | 378 | bool is_return = false, is_delete = false; |
1166 | char *symbol = NULL, *event = NULL, *group = NULL; | 379 | char *symbol = NULL, *event = NULL, *group = NULL; |
1167 | char *arg; | 380 | char *arg; |
1168 | unsigned long offset = 0; | 381 | unsigned long offset = 0; |
@@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv) | |||
1171 | 384 | ||
1172 | /* argc must be >= 1 */ | 385 | /* argc must be >= 1 */ |
1173 | if (argv[0][0] == 'p') | 386 | if (argv[0][0] == 'p') |
1174 | is_return = 0; | 387 | is_return = false; |
1175 | else if (argv[0][0] == 'r') | 388 | else if (argv[0][0] == 'r') |
1176 | is_return = 1; | 389 | is_return = true; |
1177 | else if (argv[0][0] == '-') | 390 | else if (argv[0][0] == '-') |
1178 | is_delete = 1; | 391 | is_delete = true; |
1179 | else { | 392 | else { |
1180 | pr_info("Probe definition must be started with 'p', 'r' or" | 393 | pr_info("Probe definition must be started with 'p', 'r' or" |
1181 | " '-'.\n"); | 394 | " '-'.\n"); |
@@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv) | |||
1240 | /* a symbol specified */ | 453 | /* a symbol specified */ |
1241 | symbol = argv[1]; | 454 | symbol = argv[1]; |
1242 | /* TODO: support .init module functions */ | 455 | /* TODO: support .init module functions */ |
1243 | ret = split_symbol_offset(symbol, &offset); | 456 | ret = traceprobe_split_symbol_offset(symbol, &offset); |
1244 | if (ret) { | 457 | if (ret) { |
1245 | pr_info("Failed to parse symbol.\n"); | 458 | pr_info("Failed to parse symbol.\n"); |
1246 | return ret; | 459 | return ret; |
@@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv) | |||
1302 | goto error; | 515 | goto error; |
1303 | } | 516 | } |
1304 | 517 | ||
1305 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { | 518 | if (traceprobe_conflict_field_name(tp->args[i].name, |
519 | tp->args, i)) { | ||
1306 | pr_info("Argument[%d] name '%s' conflicts with " | 520 | pr_info("Argument[%d] name '%s' conflicts with " |
1307 | "another field.\n", i, argv[i]); | 521 | "another field.\n", i, argv[i]); |
1308 | ret = -EINVAL; | 522 | ret = -EINVAL; |
@@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv) | |||
1310 | } | 524 | } |
1311 | 525 | ||
1312 | /* Parse fetch argument */ | 526 | /* Parse fetch argument */ |
1313 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); | 527 | ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], |
528 | is_return, true); | ||
1314 | if (ret) { | 529 | if (ret) { |
1315 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 530 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
1316 | goto error; | 531 | goto error; |
@@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file) | |||
1412 | return seq_open(file, &probes_seq_op); | 627 | return seq_open(file, &probes_seq_op); |
1413 | } | 628 | } |
1414 | 629 | ||
1415 | static int command_trace_probe(const char *buf) | ||
1416 | { | ||
1417 | char **argv; | ||
1418 | int argc = 0, ret = 0; | ||
1419 | |||
1420 | argv = argv_split(GFP_KERNEL, buf, &argc); | ||
1421 | if (!argv) | ||
1422 | return -ENOMEM; | ||
1423 | |||
1424 | if (argc) | ||
1425 | ret = create_trace_probe(argc, argv); | ||
1426 | |||
1427 | argv_free(argv); | ||
1428 | return ret; | ||
1429 | } | ||
1430 | |||
1431 | #define WRITE_BUFSIZE 4096 | ||
1432 | |||
1433 | static ssize_t probes_write(struct file *file, const char __user *buffer, | 630 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
1434 | size_t count, loff_t *ppos) | 631 | size_t count, loff_t *ppos) |
1435 | { | 632 | { |
1436 | char *kbuf, *tmp; | 633 | return traceprobe_probes_write(file, buffer, count, ppos, |
1437 | int ret; | 634 | create_trace_probe); |
1438 | size_t done; | ||
1439 | size_t size; | ||
1440 | |||
1441 | kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | ||
1442 | if (!kbuf) | ||
1443 | return -ENOMEM; | ||
1444 | |||
1445 | ret = done = 0; | ||
1446 | while (done < count) { | ||
1447 | size = count - done; | ||
1448 | if (size >= WRITE_BUFSIZE) | ||
1449 | size = WRITE_BUFSIZE - 1; | ||
1450 | if (copy_from_user(kbuf, buffer + done, size)) { | ||
1451 | ret = -EFAULT; | ||
1452 | goto out; | ||
1453 | } | ||
1454 | kbuf[size] = '\0'; | ||
1455 | tmp = strchr(kbuf, '\n'); | ||
1456 | if (tmp) { | ||
1457 | *tmp = '\0'; | ||
1458 | size = tmp - kbuf + 1; | ||
1459 | } else if (done + size < count) { | ||
1460 | pr_warning("Line length is too long: " | ||
1461 | "Should be less than %d.", WRITE_BUFSIZE); | ||
1462 | ret = -EINVAL; | ||
1463 | goto out; | ||
1464 | } | ||
1465 | done += size; | ||
1466 | /* Remove comments */ | ||
1467 | tmp = strchr(kbuf, '#'); | ||
1468 | if (tmp) | ||
1469 | *tmp = '\0'; | ||
1470 | |||
1471 | ret = command_trace_probe(kbuf); | ||
1472 | if (ret) | ||
1473 | goto out; | ||
1474 | } | ||
1475 | ret = done; | ||
1476 | out: | ||
1477 | kfree(kbuf); | ||
1478 | return ret; | ||
1479 | } | 635 | } |
1480 | 636 | ||
1481 | static const struct file_operations kprobe_events_ops = { | 637 | static const struct file_operations kprobe_events_ops = { |
@@ -1711,16 +867,6 @@ partial: | |||
1711 | return TRACE_TYPE_PARTIAL_LINE; | 867 | return TRACE_TYPE_PARTIAL_LINE; |
1712 | } | 868 | } |
1713 | 869 | ||
1714 | #undef DEFINE_FIELD | ||
1715 | #define DEFINE_FIELD(type, item, name, is_signed) \ | ||
1716 | do { \ | ||
1717 | ret = trace_define_field(event_call, #type, name, \ | ||
1718 | offsetof(typeof(field), item), \ | ||
1719 | sizeof(field.item), is_signed, \ | ||
1720 | FILTER_OTHER); \ | ||
1721 | if (ret) \ | ||
1722 | return ret; \ | ||
1723 | } while (0) | ||
1724 | 870 | ||
1725 | static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | 871 | static int kprobe_event_define_fields(struct ftrace_event_call *event_call) |
1726 | { | 872 | { |
@@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2051 | 1197 | ||
2052 | pr_info("Testing kprobe tracing: "); | 1198 | pr_info("Testing kprobe tracing: "); |
2053 | 1199 | ||
2054 | ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " | 1200 | ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " |
2055 | "$stack $stack0 +0($stack)"); | 1201 | "$stack $stack0 +0($stack)", |
1202 | create_trace_probe); | ||
2056 | if (WARN_ON_ONCE(ret)) { | 1203 | if (WARN_ON_ONCE(ret)) { |
2057 | pr_warning("error on probing function entry.\n"); | 1204 | pr_warning("error on probing function entry.\n"); |
2058 | warn++; | 1205 | warn++; |
@@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2066 | enable_trace_probe(tp, TP_FLAG_TRACE); | 1213 | enable_trace_probe(tp, TP_FLAG_TRACE); |
2067 | } | 1214 | } |
2068 | 1215 | ||
2069 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " | 1216 | ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " |
2070 | "$retval"); | 1217 | "$retval", create_trace_probe); |
2071 | if (WARN_ON_ONCE(ret)) { | 1218 | if (WARN_ON_ONCE(ret)) { |
2072 | pr_warning("error on probing function return.\n"); | 1219 | pr_warning("error on probing function return.\n"); |
2073 | warn++; | 1220 | warn++; |
@@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2101 | } else | 1248 | } else |
2102 | disable_trace_probe(tp, TP_FLAG_TRACE); | 1249 | disable_trace_probe(tp, TP_FLAG_TRACE); |
2103 | 1250 | ||
2104 | ret = command_trace_probe("-:testprobe"); | 1251 | ret = traceprobe_command("-:testprobe", create_trace_probe); |
2105 | if (WARN_ON_ONCE(ret)) { | 1252 | if (WARN_ON_ONCE(ret)) { |
2106 | pr_warning("error on deleting a probe.\n"); | 1253 | pr_warning("error on deleting a probe.\n"); |
2107 | warn++; | 1254 | warn++; |
2108 | } | 1255 | } |
2109 | 1256 | ||
2110 | ret = command_trace_probe("-:testprobe2"); | 1257 | ret = traceprobe_command("-:testprobe2", create_trace_probe); |
2111 | if (WARN_ON_ONCE(ret)) { | 1258 | if (WARN_ON_ONCE(ret)) { |
2112 | pr_warning("error on deleting a probe.\n"); | 1259 | pr_warning("error on deleting a probe.\n"); |
2113 | warn++; | 1260 | warn++; |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f9..a9077c1b4ad3 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | 52 | char *fmt; |
53 | 53 | ||
54 | /* allocate the trace_printk per cpu buffers */ | ||
55 | if (start != end) | ||
56 | trace_printk_init_buffers(); | ||
57 | |||
54 | mutex_lock(&btrace_mutex); | 58 | mutex_lock(&btrace_mutex); |
55 | for (iter = start; iter < end; iter++) { | 59 | for (iter = start; iter < end; iter++) { |
56 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); | 60 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 000000000000..daa9980153af --- /dev/null +++ b/kernel/trace/trace_probe.c | |||
@@ -0,0 +1,839 @@ | |||
1 | /* | ||
2 | * Common code for probe-based Dynamic events. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * This code was copied from kernel/trace/trace_kprobe.c written by | ||
18 | * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> | ||
19 | * | ||
20 | * Updates to make this generic: | ||
21 | * Copyright (C) IBM Corporation, 2010-2011 | ||
22 | * Author: Srikar Dronamraju | ||
23 | */ | ||
24 | |||
25 | #include "trace_probe.h" | ||
26 | |||
27 | const char *reserved_field_names[] = { | ||
28 | "common_type", | ||
29 | "common_flags", | ||
30 | "common_preempt_count", | ||
31 | "common_pid", | ||
32 | "common_tgid", | ||
33 | FIELD_STRING_IP, | ||
34 | FIELD_STRING_RETIP, | ||
35 | FIELD_STRING_FUNC, | ||
36 | }; | ||
37 | |||
38 | /* Printing function type */ | ||
39 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
40 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
41 | |||
42 | /* Printing in basic type function template */ | ||
43 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | ||
44 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | ||
45 | const char *name, \ | ||
46 | void *data, void *ent)\ | ||
47 | { \ | ||
48 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | ||
49 | } \ | ||
50 | static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | ||
51 | |||
52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) | ||
53 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) | ||
54 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) | ||
55 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) | ||
56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) | ||
57 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | ||
58 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | ||
59 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | ||
60 | |||
61 | static inline void *get_rloc_data(u32 *dl) | ||
62 | { | ||
63 | return (u8 *)dl + get_rloc_offs(*dl); | ||
64 | } | ||
65 | |||
66 | /* For data_loc conversion */ | ||
67 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
68 | { | ||
69 | return (u8 *)ent + get_rloc_offs(*dl); | ||
70 | } | ||
71 | |||
72 | /* For defining macros, define string/string_size types */ | ||
73 | typedef u32 string; | ||
74 | typedef u32 string_size; | ||
75 | |||
76 | /* Print type function for string type */ | ||
77 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
78 | const char *name, | ||
79 | void *data, void *ent) | ||
80 | { | ||
81 | int len = *(u32 *)data >> 16; | ||
82 | |||
83 | if (!len) | ||
84 | return trace_seq_printf(s, " %s=(fault)", name); | ||
85 | else | ||
86 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
87 | (const char *)get_loc_data(data, ent)); | ||
88 | } | ||
89 | |||
90 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
91 | |||
92 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
93 | /* | ||
94 | * Define macro for basic types - we don't need to define s* types, because | ||
95 | * only the bit width matters at recording time. | ||
96 | */ | ||
97 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
98 | DEFINE_FETCH_##method(u8) \ | ||
99 | DEFINE_FETCH_##method(u16) \ | ||
100 | DEFINE_FETCH_##method(u32) \ | ||
101 | DEFINE_FETCH_##method(u64) | ||
102 | |||
103 | #define CHECK_FETCH_FUNCS(method, fn) \ | ||
104 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ | ||
105 | (FETCH_FUNC_NAME(method, u16) == fn) || \ | ||
106 | (FETCH_FUNC_NAME(method, u32) == fn) || \ | ||
107 | (FETCH_FUNC_NAME(method, u64) == fn) || \ | ||
108 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
109 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
110 | && (fn != NULL)) | ||
111 | |||
112 | /* Data fetch function templates */ | ||
113 | #define DEFINE_FETCH_reg(type) \ | ||
114 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | ||
115 | void *offset, void *dest) \ | ||
116 | { \ | ||
117 | *(type *)dest = (type)regs_get_register(regs, \ | ||
118 | (unsigned int)((unsigned long)offset)); \ | ||
119 | } | ||
120 | DEFINE_BASIC_FETCH_FUNCS(reg) | ||
121 | /* No string on the register */ | ||
122 | #define fetch_reg_string NULL | ||
123 | #define fetch_reg_string_size NULL | ||
124 | |||
125 | #define DEFINE_FETCH_stack(type) \ | ||
126 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
127 | void *offset, void *dest) \ | ||
128 | { \ | ||
129 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
130 | (unsigned int)((unsigned long)offset)); \ | ||
131 | } | ||
132 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
133 | /* No string on the stack entry */ | ||
134 | #define fetch_stack_string NULL | ||
135 | #define fetch_stack_string_size NULL | ||
136 | |||
137 | #define DEFINE_FETCH_retval(type) \ | ||
138 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | ||
139 | void *dummy, void *dest) \ | ||
140 | { \ | ||
141 | *(type *)dest = (type)regs_return_value(regs); \ | ||
142 | } | ||
143 | DEFINE_BASIC_FETCH_FUNCS(retval) | ||
144 | /* No string on the retval */ | ||
145 | #define fetch_retval_string NULL | ||
146 | #define fetch_retval_string_size NULL | ||
147 | |||
148 | #define DEFINE_FETCH_memory(type) \ | ||
149 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
150 | void *addr, void *dest) \ | ||
151 | { \ | ||
152 | type retval; \ | ||
153 | if (probe_kernel_address(addr, retval)) \ | ||
154 | *(type *)dest = 0; \ | ||
155 | else \ | ||
156 | *(type *)dest = retval; \ | ||
157 | } | ||
158 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
159 | /* | ||
160 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
161 | * length and relative data location. | ||
162 | */ | ||
163 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
164 | void *addr, void *dest) | ||
165 | { | ||
166 | long ret; | ||
167 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
168 | u8 *dst = get_rloc_data(dest); | ||
169 | u8 *src = addr; | ||
170 | mm_segment_t old_fs = get_fs(); | ||
171 | |||
172 | if (!maxlen) | ||
173 | return; | ||
174 | |||
175 | /* | ||
176 | * Try to get string again, since the string can be changed while | ||
177 | * probing. | ||
178 | */ | ||
179 | set_fs(KERNEL_DS); | ||
180 | pagefault_disable(); | ||
181 | |||
182 | do | ||
183 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
184 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
185 | |||
186 | dst[-1] = '\0'; | ||
187 | pagefault_enable(); | ||
188 | set_fs(old_fs); | ||
189 | |||
190 | if (ret < 0) { /* Failed to fetch string */ | ||
191 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
192 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
193 | } else { | ||
194 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
195 | get_rloc_offs(*(u32 *)dest)); | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* Return the length of the string -- including the terminating null byte */ | ||
200 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
201 | void *addr, void *dest) | ||
202 | { | ||
203 | mm_segment_t old_fs; | ||
204 | int ret, len = 0; | ||
205 | u8 c; | ||
206 | |||
207 | old_fs = get_fs(); | ||
208 | set_fs(KERNEL_DS); | ||
209 | pagefault_disable(); | ||
210 | |||
211 | do { | ||
212 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
213 | len++; | ||
214 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
215 | |||
216 | pagefault_enable(); | ||
217 | set_fs(old_fs); | ||
218 | |||
219 | if (ret < 0) /* Failed to check the length */ | ||
220 | *(u32 *)dest = 0; | ||
221 | else | ||
222 | *(u32 *)dest = len; | ||
223 | } | ||
224 | |||
225 | /* Memory fetching by symbol */ | ||
226 | struct symbol_cache { | ||
227 | char *symbol; | ||
228 | long offset; | ||
229 | unsigned long addr; | ||
230 | }; | ||
231 | |||
232 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
233 | { | ||
234 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
235 | |||
236 | if (sc->addr) | ||
237 | sc->addr += sc->offset; | ||
238 | |||
239 | return sc->addr; | ||
240 | } | ||
241 | |||
242 | static void free_symbol_cache(struct symbol_cache *sc) | ||
243 | { | ||
244 | kfree(sc->symbol); | ||
245 | kfree(sc); | ||
246 | } | ||
247 | |||
248 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
249 | { | ||
250 | struct symbol_cache *sc; | ||
251 | |||
252 | if (!sym || strlen(sym) == 0) | ||
253 | return NULL; | ||
254 | |||
255 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
256 | if (!sc) | ||
257 | return NULL; | ||
258 | |||
259 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
260 | if (!sc->symbol) { | ||
261 | kfree(sc); | ||
262 | return NULL; | ||
263 | } | ||
264 | sc->offset = offset; | ||
265 | update_symbol_cache(sc); | ||
266 | |||
267 | return sc; | ||
268 | } | ||
269 | |||
270 | #define DEFINE_FETCH_symbol(type) \ | ||
271 | static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | ||
272 | void *data, void *dest) \ | ||
273 | { \ | ||
274 | struct symbol_cache *sc = data; \ | ||
275 | if (sc->addr) \ | ||
276 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
277 | else \ | ||
278 | *(type *)dest = 0; \ | ||
279 | } | ||
280 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
281 | DEFINE_FETCH_symbol(string) | ||
282 | DEFINE_FETCH_symbol(string_size) | ||
283 | |||
284 | /* Dereference memory access function */ | ||
285 | struct deref_fetch_param { | ||
286 | struct fetch_param orig; | ||
287 | long offset; | ||
288 | }; | ||
289 | |||
290 | #define DEFINE_FETCH_deref(type) \ | ||
291 | static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | ||
292 | void *data, void *dest) \ | ||
293 | { \ | ||
294 | struct deref_fetch_param *dprm = data; \ | ||
295 | unsigned long addr; \ | ||
296 | call_fetch(&dprm->orig, regs, &addr); \ | ||
297 | if (addr) { \ | ||
298 | addr += dprm->offset; \ | ||
299 | fetch_memory_##type(regs, (void *)addr, dest); \ | ||
300 | } else \ | ||
301 | *(type *)dest = 0; \ | ||
302 | } | ||
303 | DEFINE_BASIC_FETCH_FUNCS(deref) | ||
304 | DEFINE_FETCH_deref(string) | ||
305 | DEFINE_FETCH_deref(string_size) | ||
306 | |||
307 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
308 | { | ||
309 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
310 | update_deref_fetch_param(data->orig.data); | ||
311 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
312 | update_symbol_cache(data->orig.data); | ||
313 | } | ||
314 | |||
315 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | ||
316 | { | ||
317 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
318 | free_deref_fetch_param(data->orig.data); | ||
319 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
320 | free_symbol_cache(data->orig.data); | ||
321 | kfree(data); | ||
322 | } | ||
323 | |||
324 | /* Bitfield fetch function */ | ||
325 | struct bitfield_fetch_param { | ||
326 | struct fetch_param orig; | ||
327 | unsigned char hi_shift; | ||
328 | unsigned char low_shift; | ||
329 | }; | ||
330 | |||
331 | #define DEFINE_FETCH_bitfield(type) \ | ||
332 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
333 | void *data, void *dest) \ | ||
334 | { \ | ||
335 | struct bitfield_fetch_param *bprm = data; \ | ||
336 | type buf = 0; \ | ||
337 | call_fetch(&bprm->orig, regs, &buf); \ | ||
338 | if (buf) { \ | ||
339 | buf <<= bprm->hi_shift; \ | ||
340 | buf >>= bprm->low_shift; \ | ||
341 | } \ | ||
342 | *(type *)dest = buf; \ | ||
343 | } | ||
344 | |||
345 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
346 | #define fetch_bitfield_string NULL | ||
347 | #define fetch_bitfield_string_size NULL | ||
348 | |||
349 | static __kprobes void | ||
350 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
351 | { | ||
352 | /* | ||
353 | * Don't check the bitfield itself, because this must be the | ||
354 | * last fetch function. | ||
355 | */ | ||
356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
357 | update_deref_fetch_param(data->orig.data); | ||
358 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
359 | update_symbol_cache(data->orig.data); | ||
360 | } | ||
361 | |||
362 | static __kprobes void | ||
363 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
364 | { | ||
365 | /* | ||
366 | * Don't check the bitfield itself, because this must be the | ||
367 | * last fetch function. | ||
368 | */ | ||
369 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
370 | free_deref_fetch_param(data->orig.data); | ||
371 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
372 | free_symbol_cache(data->orig.data); | ||
373 | |||
374 | kfree(data); | ||
375 | } | ||
376 | |||
377 | /* Default (unsigned long) fetch type */ | ||
378 | #define __DEFAULT_FETCH_TYPE(t) u##t | ||
379 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
380 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
381 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
382 | |||
383 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
384 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
385 | |||
386 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
387 | {.name = _name, \ | ||
388 | .size = _size, \ | ||
389 | .is_signed = sign, \ | ||
390 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
391 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
392 | .fmttype = _fmttype, \ | ||
393 | .fetch = { \ | ||
394 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
395 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
396 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
397 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
398 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
399 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
400 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
401 | } \ | ||
402 | } | ||
403 | |||
404 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
405 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
406 | |||
407 | #define FETCH_TYPE_STRING 0 | ||
408 | #define FETCH_TYPE_STRSIZE 1 | ||
409 | |||
410 | /* Fetch type information table */ | ||
411 | static const struct fetch_type fetch_type_table[] = { | ||
412 | /* Special types */ | ||
413 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
414 | sizeof(u32), 1, "__data_loc char[]"), | ||
415 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
416 | string_size, sizeof(u32), 0, "u32"), | ||
417 | /* Basic types */ | ||
418 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
419 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
420 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
421 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
422 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
423 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
424 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
425 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
426 | }; | ||
427 | |||
428 | static const struct fetch_type *find_fetch_type(const char *type) | ||
429 | { | ||
430 | int i; | ||
431 | |||
432 | if (!type) | ||
433 | type = DEFAULT_FETCH_TYPE_STR; | ||
434 | |||
435 | /* Special case: bitfield */ | ||
436 | if (*type == 'b') { | ||
437 | unsigned long bs; | ||
438 | |||
439 | type = strchr(type, '/'); | ||
440 | if (!type) | ||
441 | goto fail; | ||
442 | |||
443 | type++; | ||
444 | if (strict_strtoul(type, 0, &bs)) | ||
445 | goto fail; | ||
446 | |||
447 | switch (bs) { | ||
448 | case 8: | ||
449 | return find_fetch_type("u8"); | ||
450 | case 16: | ||
451 | return find_fetch_type("u16"); | ||
452 | case 32: | ||
453 | return find_fetch_type("u32"); | ||
454 | case 64: | ||
455 | return find_fetch_type("u64"); | ||
456 | default: | ||
457 | goto fail; | ||
458 | } | ||
459 | } | ||
460 | |||
461 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | ||
462 | if (strcmp(type, fetch_type_table[i].name) == 0) | ||
463 | return &fetch_type_table[i]; | ||
464 | |||
465 | fail: | ||
466 | return NULL; | ||
467 | } | ||
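
A concrete example of the bitfield special case above: a type string such as "b4@3/32" is resolved here purely by its container size -- the "/32" suffix maps it to the u32 entry of fetch_type_table -- while the width and bit offset parts are handled later by __parse_bitfield_probe_arg(). The spec "b4@3/32" is only an illustration, not taken from the patch.
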
468 | |||
469 | /* Special function : only accept unsigned long */ | ||
470 | static __kprobes void fetch_stack_address(struct pt_regs *regs, | ||
471 | void *dummy, void *dest) | ||
472 | { | ||
473 | *(unsigned long *)dest = kernel_stack_pointer(regs); | ||
474 | } | ||
475 | |||
476 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
477 | fetch_func_t orig_fn) | ||
478 | { | ||
479 | int i; | ||
480 | |||
481 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
482 | return NULL; /* Only string type needs size function */ | ||
483 | |||
484 | for (i = 0; i < FETCH_MTD_END; i++) | ||
485 | if (type->fetch[i] == orig_fn) | ||
486 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
487 | |||
488 | WARN_ON(1); /* This should not happen */ | ||
489 | |||
490 | return NULL; | ||
491 | } | ||
492 | |||
493 | /* Split symbol and offset. */ | ||
494 | int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) | ||
495 | { | ||
496 | char *tmp; | ||
497 | int ret; | ||
498 | |||
499 | if (!offset) | ||
500 | return -EINVAL; | ||
501 | |||
502 | tmp = strchr(symbol, '+'); | ||
503 | if (tmp) { | ||
504 | /* skip sign because strict_strtol doesn't accept '+' */ | ||
505 | ret = strict_strtoul(tmp + 1, 0, offset); | ||
506 | if (ret) | ||
507 | return ret; | ||
508 | |||
509 | *tmp = '\0'; | ||
510 | } else | ||
511 | *offset = 0; | ||
512 | |||
513 | return 0; | ||
514 | } | ||
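
To illustrate what traceprobe_split_symbol_offset() does to its buffer, here is a minimal user-space sketch of the same in-place split. It uses plain strtoul() instead of the kernel's strict_strtoul(), drops the error handling, and the sample symbol "vfs_read+0x10" is made up for the example.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Illustrative only: split "SYMBOL+offs" in place, like the helper above. */
    static int split_symbol_offset(char *symbol, unsigned long *offset)
    {
        char *plus = strchr(symbol, '+');

        if (!offset)
            return -1;
        if (plus) {
            *offset = strtoul(plus + 1, NULL, 0);
            *plus = '\0';              /* terminate the symbol part */
        } else {
            *offset = 0;
        }
        return 0;
    }

    int main(void)
    {
        char sym[] = "vfs_read+0x10";  /* hypothetical probe target */
        unsigned long offs;

        split_symbol_offset(sym, &offs);
        printf("symbol=%s offset=%#lx\n", sym, offs);  /* vfs_read, 0x10 */
        return 0;
    }
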
515 | |||
516 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | ||
517 | |||
518 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | ||
519 | struct fetch_param *f, bool is_return) | ||
520 | { | ||
521 | int ret = 0; | ||
522 | unsigned long param; | ||
523 | |||
524 | if (strcmp(arg, "retval") == 0) { | ||
525 | if (is_return) | ||
526 | f->fn = t->fetch[FETCH_MTD_retval]; | ||
527 | else | ||
528 | ret = -EINVAL; | ||
529 | } else if (strncmp(arg, "stack", 5) == 0) { | ||
530 | if (arg[5] == '\0') { | ||
531 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) | ||
532 | f->fn = fetch_stack_address; | ||
533 | else | ||
534 | ret = -EINVAL; | ||
535 | } else if (isdigit(arg[5])) { | ||
536 | ret = strict_strtoul(arg + 5, 10, ¶m); | ||
537 | if (ret || param > PARAM_MAX_STACK) | ||
538 | ret = -EINVAL; | ||
539 | else { | ||
540 | f->fn = t->fetch[FETCH_MTD_stack]; | ||
541 | f->data = (void *)param; | ||
542 | } | ||
543 | } else | ||
544 | ret = -EINVAL; | ||
545 | } else | ||
546 | ret = -EINVAL; | ||
547 | |||
548 | return ret; | ||
549 | } | ||
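
As an example of the '$' variables handled above: "$retval" is accepted only for return probes, a bare "$stack" fetches the stack pointer itself (and only with the default unsigned-long type), and "$stackN" -- say "$stack2" -- fetches the Nth word on the kernel stack, with N bounded by PARAM_MAX_STACK. The "$stack2" spelling here is just an illustration.
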
550 | |||
551 | /* Recursive argument parser */ | ||
552 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | ||
553 | struct fetch_param *f, bool is_return, bool is_kprobe) | ||
554 | { | ||
555 | unsigned long param; | ||
556 | long offset; | ||
557 | char *tmp; | ||
558 | int ret; | ||
559 | |||
560 | ret = 0; | ||
561 | |||
562 | /* For now, uprobe_events supports only register arguments */ | ||
563 | if (!is_kprobe && arg[0] != '%') | ||
564 | return -EINVAL; | ||
565 | |||
566 | switch (arg[0]) { | ||
567 | case '$': | ||
568 | ret = parse_probe_vars(arg + 1, t, f, is_return); | ||
569 | break; | ||
570 | |||
571 | case '%': /* named register */ | ||
572 | ret = regs_query_register_offset(arg + 1); | ||
573 | if (ret >= 0) { | ||
574 | f->fn = t->fetch[FETCH_MTD_reg]; | ||
575 | f->data = (void *)(unsigned long)ret; | ||
576 | ret = 0; | ||
577 | } | ||
578 | break; | ||
579 | |||
580 | case '@': /* memory or symbol */ | ||
581 | if (isdigit(arg[1])) { | ||
582 | ret = strict_strtoul(arg + 1, 0, ¶m); | ||
583 | if (ret) | ||
584 | break; | ||
585 | |||
586 | f->fn = t->fetch[FETCH_MTD_memory]; | ||
587 | f->data = (void *)param; | ||
588 | } else { | ||
589 | ret = traceprobe_split_symbol_offset(arg + 1, &offset); | ||
590 | if (ret) | ||
591 | break; | ||
592 | |||
593 | f->data = alloc_symbol_cache(arg + 1, offset); | ||
594 | if (f->data) | ||
595 | f->fn = t->fetch[FETCH_MTD_symbol]; | ||
596 | } | ||
597 | break; | ||
598 | |||
599 | case '+': /* deref memory */ | ||
600 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
601 | case '-': | ||
602 | tmp = strchr(arg, '('); | ||
603 | if (!tmp) | ||
604 | break; | ||
605 | |||
606 | *tmp = '\0'; | ||
607 | ret = strict_strtol(arg, 0, &offset); | ||
608 | |||
609 | if (ret) | ||
610 | break; | ||
611 | |||
612 | arg = tmp + 1; | ||
613 | tmp = strrchr(arg, ')'); | ||
614 | |||
615 | if (tmp) { | ||
616 | struct deref_fetch_param *dprm; | ||
617 | const struct fetch_type *t2; | ||
618 | |||
619 | t2 = find_fetch_type(NULL); | ||
620 | *tmp = '\0'; | ||
621 | dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); | ||
622 | |||
623 | if (!dprm) | ||
624 | return -ENOMEM; | ||
625 | |||
626 | dprm->offset = offset; | ||
627 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | ||
628 | is_kprobe); | ||
629 | if (ret) | ||
630 | kfree(dprm); | ||
631 | else { | ||
632 | f->fn = t->fetch[FETCH_MTD_deref]; | ||
633 | f->data = (void *)dprm; | ||
634 | } | ||
635 | } | ||
636 | break; | ||
637 | } | ||
638 | if (!ret && !f->fn) { /* Parsed, but no fetch method found */ | ||
639 | pr_info("%s type has no corresponding fetch method.\n", t->name); | ||
640 | ret = -EINVAL; | ||
641 | } | ||
642 | |||
643 | return ret; | ||
644 | } | ||
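
Putting the cases above together, a fetch argument can look like "%ax" (a named register), "@vmalloc_start+8" or "@0xdeadbeef" (a symbol or absolute address to read), "$retval", or a dereference such as the "+0($stack)" used by the kprobe selftest earlier in this patch, where the inner argument is parsed recursively and the offset is added to its value. Apart from "+0($stack)", these particular spellings are illustrative rather than taken from the patch.
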
645 | |||
646 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
647 | |||
648 | /* Bitfield type needs to be parsed into a fetch function */ | ||
649 | static int __parse_bitfield_probe_arg(const char *bf, | ||
650 | const struct fetch_type *t, | ||
651 | struct fetch_param *f) | ||
652 | { | ||
653 | struct bitfield_fetch_param *bprm; | ||
654 | unsigned long bw, bo; | ||
655 | char *tail; | ||
656 | |||
657 | if (*bf != 'b') | ||
658 | return 0; | ||
659 | |||
660 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
661 | if (!bprm) | ||
662 | return -ENOMEM; | ||
663 | |||
664 | bprm->orig = *f; | ||
665 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
666 | f->data = (void *)bprm; | ||
667 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
668 | |||
669 | if (bw == 0 || *tail != '@') | ||
670 | return -EINVAL; | ||
671 | |||
672 | bf = tail + 1; | ||
673 | bo = simple_strtoul(bf, &tail, 0); | ||
674 | |||
675 | if (tail == bf || *tail != '/') | ||
676 | return -EINVAL; | ||
677 | |||
678 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
679 | bprm->low_shift = bprm->hi_shift + bo; | ||
680 | |||
681 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
682 | } | ||
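
To make the shift arithmetic concrete, here is a small stand-alone sketch of the extraction that the bitfield fetch functions perform, assuming the b<bit-width>@<bit-offset>/<container-size> convention parsed above. The field spec "b3@5/32" and the raw value are invented for the example.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Hypothetical field spec "b3@5/32": width 3, bit offset 5, 32-bit container. */
        uint32_t raw = 0x000000e0;                   /* bits 5..7 set */
        unsigned int bw = 3, bo = 5, bits = 32;
        unsigned int hi_shift = bits - (bw + bo);    /* 24: drops bits above the field */
        unsigned int low_shift = hi_shift + bo;      /* 29: drops bits below the field */
        uint32_t val = (raw << hi_shift) >> low_shift;

        printf("extracted %u (hi_shift=%u, low_shift=%u)\n", val, hi_shift, low_shift);
        /* prints: extracted 7 (hi_shift=24, low_shift=29) */
        return 0;
    }
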
683 | |||
684 | /* String length checking wrapper */ | ||
685 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | ||
686 | struct probe_arg *parg, bool is_return, bool is_kprobe) | ||
687 | { | ||
688 | const char *t; | ||
689 | int ret; | ||
690 | |||
691 | if (strlen(arg) > MAX_ARGSTR_LEN) { | ||
692 | pr_info("Argument is too long.: %s\n", arg); | ||
693 | return -ENOSPC; | ||
694 | } | ||
695 | parg->comm = kstrdup(arg, GFP_KERNEL); | ||
696 | if (!parg->comm) { | ||
697 | pr_info("Failed to allocate memory for command '%s'.\n", arg); | ||
698 | return -ENOMEM; | ||
699 | } | ||
700 | t = strchr(parg->comm, ':'); | ||
701 | if (t) { | ||
702 | arg[t - parg->comm] = '\0'; | ||
703 | t++; | ||
704 | } | ||
705 | parg->type = find_fetch_type(t); | ||
706 | if (!parg->type) { | ||
707 | pr_info("Unsupported type: %s\n", t); | ||
708 | return -EINVAL; | ||
709 | } | ||
710 | parg->offset = *size; | ||
711 | *size += parg->type->size; | ||
712 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | ||
713 | |||
714 | if (ret >= 0 && t != NULL) | ||
715 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
716 | |||
717 | if (ret >= 0) { | ||
718 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
719 | parg->fetch.fn); | ||
720 | parg->fetch_size.data = parg->fetch.data; | ||
721 | } | ||
722 | |||
723 | return ret; | ||
724 | } | ||
725 | |||
726 | /* Return 1 if name is reserved or already used by another argument */ | ||
727 | int traceprobe_conflict_field_name(const char *name, | ||
728 | struct probe_arg *args, int narg) | ||
729 | { | ||
730 | int i; | ||
731 | |||
732 | for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) | ||
733 | if (strcmp(reserved_field_names[i], name) == 0) | ||
734 | return 1; | ||
735 | |||
736 | for (i = 0; i < narg; i++) | ||
737 | if (strcmp(args[i].name, name) == 0) | ||
738 | return 1; | ||
739 | |||
740 | return 0; | ||
741 | } | ||
742 | |||
743 | void traceprobe_update_arg(struct probe_arg *arg) | ||
744 | { | ||
745 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
746 | update_bitfield_fetch_param(arg->fetch.data); | ||
747 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
748 | update_deref_fetch_param(arg->fetch.data); | ||
749 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
750 | update_symbol_cache(arg->fetch.data); | ||
751 | } | ||
752 | |||
753 | void traceprobe_free_probe_arg(struct probe_arg *arg) | ||
754 | { | ||
755 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
756 | free_bitfield_fetch_param(arg->fetch.data); | ||
757 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
758 | free_deref_fetch_param(arg->fetch.data); | ||
759 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
760 | free_symbol_cache(arg->fetch.data); | ||
761 | |||
762 | kfree(arg->name); | ||
763 | kfree(arg->comm); | ||
764 | } | ||
765 | |||
766 | int traceprobe_command(const char *buf, int (*createfn)(int, char **)) | ||
767 | { | ||
768 | char **argv; | ||
769 | int argc, ret; | ||
770 | |||
771 | argc = 0; | ||
772 | ret = 0; | ||
773 | argv = argv_split(GFP_KERNEL, buf, &argc); | ||
774 | if (!argv) | ||
775 | return -ENOMEM; | ||
776 | |||
777 | if (argc) | ||
778 | ret = createfn(argc, argv); | ||
779 | |||
780 | argv_free(argv); | ||
781 | |||
782 | return ret; | ||
783 | } | ||
784 | |||
785 | #define WRITE_BUFSIZE 4096 | ||
786 | |||
787 | ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, | ||
788 | size_t count, loff_t *ppos, | ||
789 | int (*createfn)(int, char **)) | ||
790 | { | ||
791 | char *kbuf, *tmp; | ||
792 | int ret = 0; | ||
793 | size_t done = 0; | ||
794 | size_t size; | ||
795 | |||
796 | kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | ||
797 | if (!kbuf) | ||
798 | return -ENOMEM; | ||
799 | |||
800 | while (done < count) { | ||
801 | size = count - done; | ||
802 | |||
803 | if (size >= WRITE_BUFSIZE) | ||
804 | size = WRITE_BUFSIZE - 1; | ||
805 | |||
806 | if (copy_from_user(kbuf, buffer + done, size)) { | ||
807 | ret = -EFAULT; | ||
808 | goto out; | ||
809 | } | ||
810 | kbuf[size] = '\0'; | ||
811 | tmp = strchr(kbuf, '\n'); | ||
812 | |||
813 | if (tmp) { | ||
814 | *tmp = '\0'; | ||
815 | size = tmp - kbuf + 1; | ||
816 | } else if (done + size < count) { | ||
817 | pr_warning("Line length is too long: " | ||
818 | "Should be less than %d.", WRITE_BUFSIZE); | ||
819 | ret = -EINVAL; | ||
820 | goto out; | ||
821 | } | ||
822 | done += size; | ||
823 | /* Remove comments */ | ||
824 | tmp = strchr(kbuf, '#'); | ||
825 | |||
826 | if (tmp) | ||
827 | *tmp = '\0'; | ||
828 | |||
829 | ret = traceprobe_command(kbuf, createfn); | ||
830 | if (ret) | ||
831 | goto out; | ||
832 | } | ||
833 | ret = done; | ||
834 | |||
835 | out: | ||
836 | kfree(kbuf); | ||
837 | |||
838 | return ret; | ||
839 | } | ||
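
The loop above chops the written buffer into newline-terminated commands and strips '#' comments before handing each line to createfn. A minimal user-space sketch of just that line handling (no copy_from_user, and with a trivial stand-in for create_trace_probe) might look like this; the sample commands are illustrative only.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-in for create_trace_probe(): just echo the command. */
    static int createfn(char *cmd)
    {
        if (*cmd)
            printf("command: %s\n", cmd);
        return 0;
    }

    int main(void)
    {
        char buf[] = "p:myprobe do_sys_open $stack\n# a comment\n-:myprobe\n";
        char *line = buf, *next;

        while (line && *line) {
            next = strchr(line, '\n');
            if (next)
                *next++ = '\0';        /* one command per line */
            char *hash = strchr(line, '#');
            if (hash)
                *hash = '\0';          /* strip trailing comments */
            createfn(line);
            line = next;
        }
        return 0;
    }
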
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 000000000000..933708677814 --- /dev/null +++ b/kernel/trace/trace_probe.h | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * Common header file for probe-based Dynamic events. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * This code was copied from kernel/trace/trace_kprobe.h written by | ||
18 | * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> | ||
19 | * | ||
20 | * Updates to make this generic: | ||
21 | * Copyright (C) IBM Corporation, 2010-2011 | ||
22 | * Author: Srikar Dronamraju | ||
23 | */ | ||
24 | |||
25 | #include <linux/seq_file.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/smp.h> | ||
28 | #include <linux/debugfs.h> | ||
29 | #include <linux/types.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/ctype.h> | ||
32 | #include <linux/ptrace.h> | ||
33 | #include <linux/perf_event.h> | ||
34 | #include <linux/kprobes.h> | ||
35 | #include <linux/stringify.h> | ||
36 | #include <linux/limits.h> | ||
37 | #include <linux/uaccess.h> | ||
38 | #include <asm/bitsperlong.h> | ||
39 | |||
40 | #include "trace.h" | ||
41 | #include "trace_output.h" | ||
42 | |||
43 | #define MAX_TRACE_ARGS 128 | ||
44 | #define MAX_ARGSTR_LEN 63 | ||
45 | #define MAX_EVENT_NAME_LEN 64 | ||
46 | #define MAX_STRING_SIZE PATH_MAX | ||
47 | |||
48 | /* Reserved field names */ | ||
49 | #define FIELD_STRING_IP "__probe_ip" | ||
50 | #define FIELD_STRING_RETIP "__probe_ret_ip" | ||
51 | #define FIELD_STRING_FUNC "__probe_func" | ||
52 | |||
53 | #undef DEFINE_FIELD | ||
54 | #define DEFINE_FIELD(type, item, name, is_signed) \ | ||
55 | do { \ | ||
56 | ret = trace_define_field(event_call, #type, name, \ | ||
57 | offsetof(typeof(field), item), \ | ||
58 | sizeof(field.item), is_signed, \ | ||
59 | FILTER_OTHER); \ | ||
60 | if (ret) \ | ||
61 | return ret; \ | ||
62 | } while (0) | ||
63 | |||
64 | |||
65 | /* Flags for trace_probe */ | ||
66 | #define TP_FLAG_TRACE 1 | ||
67 | #define TP_FLAG_PROFILE 2 | ||
68 | #define TP_FLAG_REGISTERED 4 | ||
69 | #define TP_FLAG_UPROBE 8 | ||
70 | |||
71 | |||
72 | /* data_rloc: data relative location, compatible with u32 */ | ||
73 | #define make_data_rloc(len, roffs) \ | ||
74 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
75 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
76 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
77 | |||
78 | /* | ||
79 | * Convert data_rloc to data_loc: | ||
80 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
81 | * stores the offset from event entry. | ||
82 | */ | ||
83 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
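
A quick sketch of how the data_rloc packing behaves, using uint32_t in place of the kernel's u32; the length and offset values are made up for the example.

    #include <stdio.h>
    #include <stdint.h>

    #define make_data_rloc(len, roffs) \
        (((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
    #define get_rloc_len(dl)   ((uint32_t)(dl) >> 16)
    #define get_rloc_offs(dl)  ((uint32_t)(dl) & 0xffff)

    int main(void)
    {
        /* Hypothetical string of 11 bytes stored 24 bytes after the rloc word. */
        uint32_t dl = make_data_rloc(11, 24);

        printf("len=%u offs=%u\n", get_rloc_len(dl), get_rloc_offs(dl));  /* 11 24 */
        return 0;
    }
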
84 | |||
85 | /* Data fetch function type */ | ||
86 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | ||
87 | /* Printing function type */ | ||
88 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); | ||
89 | |||
90 | /* Fetch types */ | ||
91 | enum { | ||
92 | FETCH_MTD_reg = 0, | ||
93 | FETCH_MTD_stack, | ||
94 | FETCH_MTD_retval, | ||
95 | FETCH_MTD_memory, | ||
96 | FETCH_MTD_symbol, | ||
97 | FETCH_MTD_deref, | ||
98 | FETCH_MTD_bitfield, | ||
99 | FETCH_MTD_END, | ||
100 | }; | ||
101 | |||
102 | /* Fetch type information table */ | ||
103 | struct fetch_type { | ||
104 | const char *name; /* Name of type */ | ||
105 | size_t size; /* Byte size of type */ | ||
106 | int is_signed; /* Signed flag */ | ||
107 | print_type_func_t print; /* Print functions */ | ||
108 | const char *fmt; /* Format string */ | ||
109 | const char *fmttype; /* Name in format file */ | ||
110 | /* Fetch functions */ | ||
111 | fetch_func_t fetch[FETCH_MTD_END]; | ||
112 | }; | ||
113 | |||
114 | struct fetch_param { | ||
115 | fetch_func_t fn; | ||
116 | void *data; | ||
117 | }; | ||
118 | |||
119 | struct probe_arg { | ||
120 | struct fetch_param fetch; | ||
121 | struct fetch_param fetch_size; | ||
122 | unsigned int offset; /* Offset from argument entry */ | ||
123 | const char *name; /* Name of this argument */ | ||
124 | const char *comm; /* Command of this argument */ | ||
125 | const struct fetch_type *type; /* Type of this argument */ | ||
126 | }; | ||
127 | |||
128 | static inline __kprobes void call_fetch(struct fetch_param *fprm, | ||
129 | struct pt_regs *regs, void *dest) | ||
130 | { | ||
131 | return fprm->fn(regs, fprm->data, dest); | ||
132 | } | ||
133 | |||
134 | /* Check the name is good for event/group/fields */ | ||
135 | static inline int is_good_name(const char *name) | ||
136 | { | ||
137 | if (!isalpha(*name) && *name != '_') | ||
138 | return 0; | ||
139 | while (*++name != '\0') { | ||
140 | if (!isalpha(*name) && !isdigit(*name) && *name != '_') | ||
141 | return 0; | ||
142 | } | ||
143 | return 1; | ||
144 | } | ||
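
For reference, the same check behaves as follows in a stand-alone build; the sample names are invented.

    #include <stdio.h>
    #include <ctype.h>

    /* Same check as is_good_name() above, in a stand-alone form. */
    static int is_good_name(const char *name)
    {
        if (!isalpha((unsigned char)*name) && *name != '_')
            return 0;
        while (*++name != '\0') {
            if (!isalpha((unsigned char)*name) &&
                !isdigit((unsigned char)*name) && *name != '_')
                return 0;
        }
        return 1;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               is_good_name("myprobe_1"),   /* 1 */
               is_good_name("1badname"),    /* 0: must not start with a digit */
               is_good_name("has-dash"));   /* 0: '-' is not allowed */
        return 0;
    }
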
145 | |||
146 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | ||
147 | struct probe_arg *parg, bool is_return, bool is_kprobe); | ||
148 | |||
149 | extern int traceprobe_conflict_field_name(const char *name, | ||
150 | struct probe_arg *args, int narg); | ||
151 | |||
152 | extern void traceprobe_update_arg(struct probe_arg *arg); | ||
153 | extern void traceprobe_free_probe_arg(struct probe_arg *arg); | ||
154 | |||
155 | extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); | ||
156 | |||
157 | extern ssize_t traceprobe_probes_write(struct file *file, | ||
158 | const char __user *buffer, size_t count, loff_t *ppos, | ||
159 | int (*createfn)(int, char**)); | ||
160 | |||
161 | extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 000000000000..2b36ac68549e --- /dev/null +++ b/kernel/trace/trace_uprobe.c | |||
@@ -0,0 +1,788 @@ | |||
1 | /* | ||
2 | * uprobes-based tracing events | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * Copyright (C) IBM Corporation, 2010-2012 | ||
18 | * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/uaccess.h> | ||
23 | #include <linux/uprobes.h> | ||
24 | #include <linux/namei.h> | ||
25 | |||
26 | #include "trace_probe.h" | ||
27 | |||
28 | #define UPROBE_EVENT_SYSTEM "uprobes" | ||
29 | |||
30 | /* | ||
31 | * uprobe event core functions | ||
32 | */ | ||
33 | struct trace_uprobe; | ||
34 | struct uprobe_trace_consumer { | ||
35 | struct uprobe_consumer cons; | ||
36 | struct trace_uprobe *tu; | ||
37 | }; | ||
38 | |||
39 | struct trace_uprobe { | ||
40 | struct list_head list; | ||
41 | struct ftrace_event_class class; | ||
42 | struct ftrace_event_call call; | ||
43 | struct uprobe_trace_consumer *consumer; | ||
44 | struct inode *inode; | ||
45 | char *filename; | ||
46 | unsigned long offset; | ||
47 | unsigned long nhit; | ||
48 | unsigned int flags; /* For TP_FLAG_* */ | ||
49 | ssize_t size; /* trace entry size */ | ||
50 | unsigned int nr_args; | ||
51 | struct probe_arg args[]; | ||
52 | }; | ||
53 | |||
54 | #define SIZEOF_TRACE_UPROBE(n) \ | ||
55 | (offsetof(struct trace_uprobe, args) + \ | ||
56 | (sizeof(struct probe_arg) * (n))) | ||
57 | |||
58 | static int register_uprobe_event(struct trace_uprobe *tu); | ||
59 | static void unregister_uprobe_event(struct trace_uprobe *tu); | ||
60 | |||
61 | static DEFINE_MUTEX(uprobe_lock); | ||
62 | static LIST_HEAD(uprobe_list); | ||
63 | |||
64 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); | ||
65 | |||
66 | /* | ||
67 | * Allocate new trace_uprobe and initialize it (including uprobes). | ||
68 | */ | ||
69 | static struct trace_uprobe * | ||
70 | alloc_trace_uprobe(const char *group, const char *event, int nargs) | ||
71 | { | ||
72 | struct trace_uprobe *tu; | ||
73 | |||
74 | if (!event || !is_good_name(event)) | ||
75 | return ERR_PTR(-EINVAL); | ||
76 | |||
77 | if (!group || !is_good_name(group)) | ||
78 | return ERR_PTR(-EINVAL); | ||
79 | |||
80 | tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); | ||
81 | if (!tu) | ||
82 | return ERR_PTR(-ENOMEM); | ||
83 | |||
84 | tu->call.class = &tu->class; | ||
85 | tu->call.name = kstrdup(event, GFP_KERNEL); | ||
86 | if (!tu->call.name) | ||
87 | goto error; | ||
88 | |||
89 | tu->class.system = kstrdup(group, GFP_KERNEL); | ||
90 | if (!tu->class.system) | ||
91 | goto error; | ||
92 | |||
93 | INIT_LIST_HEAD(&tu->list); | ||
94 | return tu; | ||
95 | |||
96 | error: | ||
97 | kfree(tu->call.name); | ||
98 | kfree(tu); | ||
99 | |||
100 | return ERR_PTR(-ENOMEM); | ||
101 | } | ||
102 | |||
103 | static void free_trace_uprobe(struct trace_uprobe *tu) | ||
104 | { | ||
105 | int i; | ||
106 | |||
107 | for (i = 0; i < tu->nr_args; i++) | ||
108 | traceprobe_free_probe_arg(&tu->args[i]); | ||
109 | |||
110 | iput(tu->inode); | ||
111 | kfree(tu->call.class->system); | ||
112 | kfree(tu->call.name); | ||
113 | kfree(tu->filename); | ||
114 | kfree(tu); | ||
115 | } | ||
116 | |||
117 | static struct trace_uprobe *find_probe_event(const char *event, const char *group) | ||
118 | { | ||
119 | struct trace_uprobe *tu; | ||
120 | |||
121 | list_for_each_entry(tu, &uprobe_list, list) | ||
122 | if (strcmp(tu->call.name, event) == 0 && | ||
123 | strcmp(tu->call.class->system, group) == 0) | ||
124 | return tu; | ||
125 | |||
126 | return NULL; | ||
127 | } | ||
128 | |||
129 | /* Unregister a trace_uprobe and probe_event: must be called with uprobe_lock held */ | ||
130 | static void unregister_trace_uprobe(struct trace_uprobe *tu) | ||
131 | { | ||
132 | list_del(&tu->list); | ||
133 | unregister_uprobe_event(tu); | ||
134 | free_trace_uprobe(tu); | ||
135 | } | ||
136 | |||
137 | /* Register a trace_uprobe and probe_event */ | ||
138 | static int register_trace_uprobe(struct trace_uprobe *tu) | ||
139 | { | ||
140 | struct trace_uprobe *old_tp; | ||
141 | int ret; | ||
142 | |||
143 | mutex_lock(&uprobe_lock); | ||
144 | |||
145 | /* register as an event */ | ||
146 | old_tp = find_probe_event(tu->call.name, tu->call.class->system); | ||
147 | if (old_tp) | ||
148 | /* delete old event */ | ||
149 | unregister_trace_uprobe(old_tp); | ||
150 | |||
151 | ret = register_uprobe_event(tu); | ||
152 | if (ret) { | ||
153 | pr_warning("Failed to register probe event(%d)\n", ret); | ||
154 | goto end; | ||
155 | } | ||
156 | |||
157 | list_add_tail(&tu->list, &uprobe_list); | ||
158 | |||
159 | end: | ||
160 | mutex_unlock(&uprobe_lock); | ||
161 | |||
162 | return ret; | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Argument syntax: | ||
167 | * - Add uprobe: p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] | ||
168 | * | ||
169 | * - Remove uprobe: -:[GRP/]EVENT | ||
170 | */ | ||
171 | static int create_trace_uprobe(int argc, char **argv) | ||
172 | { | ||
173 | struct trace_uprobe *tu; | ||
174 | struct inode *inode; | ||
175 | char *arg, *event, *group, *filename; | ||
176 | char buf[MAX_EVENT_NAME_LEN]; | ||
177 | struct path path; | ||
178 | unsigned long offset; | ||
179 | bool is_delete; | ||
180 | int i, ret; | ||
181 | |||
182 | inode = NULL; | ||
183 | ret = 0; | ||
184 | is_delete = false; | ||
185 | event = NULL; | ||
186 | group = NULL; | ||
187 | |||
188 | /* argc must be >= 1 */ | ||
189 | if (argv[0][0] == '-') | ||
190 | is_delete = true; | ||
191 | else if (argv[0][0] != 'p') { | ||
192 | pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); | ||
193 | return -EINVAL; | ||
194 | } | ||
195 | |||
196 | if (argv[0][1] == ':') { | ||
197 | event = &argv[0][2]; | ||
198 | arg = strchr(event, '/'); | ||
199 | |||
200 | if (arg) { | ||
201 | group = event; | ||
202 | event = arg + 1; | ||
203 | event[-1] = '\0'; | ||
204 | |||
205 | if (strlen(group) == 0) { | ||
206 | pr_info("Group name is not specified\n"); | ||
207 | return -EINVAL; | ||
208 | } | ||
209 | } | ||
210 | if (strlen(event) == 0) { | ||
211 | pr_info("Event name is not specified\n"); | ||
212 | return -EINVAL; | ||
213 | } | ||
214 | } | ||
215 | if (!group) | ||
216 | group = UPROBE_EVENT_SYSTEM; | ||
217 | |||
218 | if (is_delete) { | ||
219 | if (!event) { | ||
220 | pr_info("Delete command needs an event name.\n"); | ||
221 | return -EINVAL; | ||
222 | } | ||
223 | mutex_lock(&uprobe_lock); | ||
224 | tu = find_probe_event(event, group); | ||
225 | |||
226 | if (!tu) { | ||
227 | mutex_unlock(&uprobe_lock); | ||
228 | pr_info("Event %s/%s doesn't exist.\n", group, event); | ||
229 | return -ENOENT; | ||
230 | } | ||
231 | /* delete an event */ | ||
232 | unregister_trace_uprobe(tu); | ||
233 | mutex_unlock(&uprobe_lock); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | if (argc < 2) { | ||
238 | pr_info("Probe point is not specified.\n"); | ||
239 | return -EINVAL; | ||
240 | } | ||
241 | if (isdigit(argv[1][0])) { | ||
242 | pr_info("probe point must be have a filename.\n"); | ||
243 | return -EINVAL; | ||
244 | } | ||
245 | arg = strchr(argv[1], ':'); | ||
246 | if (!arg) | ||
247 | goto fail_address_parse; | ||
248 | |||
249 | *arg++ = '\0'; | ||
250 | filename = argv[1]; | ||
251 | ret = kern_path(filename, LOOKUP_FOLLOW, &path); | ||
252 | if (ret) | ||
253 | goto fail_address_parse; | ||
254 | |||
255 | ret = strict_strtoul(arg, 0, &offset); | ||
256 | if (ret) | ||
257 | goto fail_address_parse; | ||
258 | |||
259 | inode = igrab(path.dentry->d_inode); | ||
260 | |||
261 | argc -= 2; | ||
262 | argv += 2; | ||
263 | |||
264 | /* setup a probe */ | ||
265 | if (!event) { | ||
266 | char *tail = strrchr(filename, '/'); | ||
267 | char *ptr; | ||
268 | |||
269 | ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); | ||
270 | if (!ptr) { | ||
271 | ret = -ENOMEM; | ||
272 | goto fail_address_parse; | ||
273 | } | ||
274 | |||
275 | tail = ptr; | ||
276 | ptr = strpbrk(tail, ".-_"); | ||
277 | if (ptr) | ||
278 | *ptr = '\0'; | ||
279 | |||
280 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); | ||
281 | event = buf; | ||
282 | kfree(tail); | ||
283 | } | ||
284 | |||
285 | tu = alloc_trace_uprobe(group, event, argc); | ||
286 | if (IS_ERR(tu)) { | ||
287 | pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); | ||
288 | ret = PTR_ERR(tu); | ||
289 | goto fail_address_parse; | ||
290 | } | ||
291 | tu->offset = offset; | ||
292 | tu->inode = inode; | ||
293 | tu->filename = kstrdup(filename, GFP_KERNEL); | ||
294 | |||
295 | if (!tu->filename) { | ||
296 | pr_info("Failed to allocate filename.\n"); | ||
297 | ret = -ENOMEM; | ||
298 | goto error; | ||
299 | } | ||
300 | |||
301 | /* parse arguments */ | ||
302 | ret = 0; | ||
303 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | ||
304 | /* Increment count for freeing args in error case */ | ||
305 | tu->nr_args++; | ||
306 | |||
307 | /* Parse argument name */ | ||
308 | arg = strchr(argv[i], '='); | ||
309 | if (arg) { | ||
310 | *arg++ = '\0'; | ||
311 | tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); | ||
312 | } else { | ||
313 | arg = argv[i]; | ||
314 | /* If argument name is omitted, set "argN" */ | ||
315 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | ||
316 | tu->args[i].name = kstrdup(buf, GFP_KERNEL); | ||
317 | } | ||
318 | |||
319 | if (!tu->args[i].name) { | ||
320 | pr_info("Failed to allocate argument[%d] name.\n", i); | ||
321 | ret = -ENOMEM; | ||
322 | goto error; | ||
323 | } | ||
324 | |||
325 | if (!is_good_name(tu->args[i].name)) { | ||
326 | pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); | ||
327 | ret = -EINVAL; | ||
328 | goto error; | ||
329 | } | ||
330 | |||
331 | if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { | ||
332 | pr_info("Argument[%d] name '%s' conflicts with " | ||
333 | "another field.\n", i, argv[i]); | ||
334 | ret = -EINVAL; | ||
335 | goto error; | ||
336 | } | ||
337 | |||
338 | /* Parse fetch argument */ | ||
339 | ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); | ||
340 | if (ret) { | ||
341 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | ||
342 | goto error; | ||
343 | } | ||
344 | } | ||
345 | |||
346 | ret = register_trace_uprobe(tu); | ||
347 | if (ret) | ||
348 | goto error; | ||
349 | return 0; | ||
350 | |||
351 | error: | ||
352 | free_trace_uprobe(tu); | ||
353 | return ret; | ||
354 | |||
355 | fail_address_parse: | ||
356 | if (inode) | ||
357 | iput(inode); | ||
358 | |||
359 | pr_info("Failed to parse address.\n"); | ||
360 | |||
361 | return ret; | ||
362 | } | ||
363 | |||
364 | static void cleanup_all_probes(void) | ||
365 | { | ||
366 | struct trace_uprobe *tu; | ||
367 | |||
368 | mutex_lock(&uprobe_lock); | ||
369 | while (!list_empty(&uprobe_list)) { | ||
370 | tu = list_entry(uprobe_list.next, struct trace_uprobe, list); | ||
371 | unregister_trace_uprobe(tu); | ||
372 | } | ||
373 | mutex_unlock(&uprobe_lock); | ||
374 | } | ||
375 | |||
376 | /* Probes listing interfaces */ | ||
377 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) | ||
378 | { | ||
379 | mutex_lock(&uprobe_lock); | ||
380 | return seq_list_start(&uprobe_list, *pos); | ||
381 | } | ||
382 | |||
383 | static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
384 | { | ||
385 | return seq_list_next(v, &uprobe_list, pos); | ||
386 | } | ||
387 | |||
388 | static void probes_seq_stop(struct seq_file *m, void *v) | ||
389 | { | ||
390 | mutex_unlock(&uprobe_lock); | ||
391 | } | ||
392 | |||
393 | static int probes_seq_show(struct seq_file *m, void *v) | ||
394 | { | ||
395 | struct trace_uprobe *tu = v; | ||
396 | int i; | ||
397 | |||
398 | seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); | ||
399 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); | ||
400 | |||
401 | for (i = 0; i < tu->nr_args; i++) | ||
402 | seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); | ||
403 | |||
404 | seq_printf(m, "\n"); | ||
405 | return 0; | ||
406 | } | ||
407 | |||
408 | static const struct seq_operations probes_seq_op = { | ||
409 | .start = probes_seq_start, | ||
410 | .next = probes_seq_next, | ||
411 | .stop = probes_seq_stop, | ||
412 | .show = probes_seq_show | ||
413 | }; | ||
414 | |||
415 | static int probes_open(struct inode *inode, struct file *file) | ||
416 | { | ||
417 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) | ||
418 | cleanup_all_probes(); | ||
419 | |||
420 | return seq_open(file, &probes_seq_op); | ||
421 | } | ||
422 | |||
423 | static ssize_t probes_write(struct file *file, const char __user *buffer, | ||
424 | size_t count, loff_t *ppos) | ||
425 | { | ||
426 | return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); | ||
427 | } | ||
428 | |||
429 | static const struct file_operations uprobe_events_ops = { | ||
430 | .owner = THIS_MODULE, | ||
431 | .open = probes_open, | ||
432 | .read = seq_read, | ||
433 | .llseek = seq_lseek, | ||
434 | .release = seq_release, | ||
435 | .write = probes_write, | ||
436 | }; | ||
437 | |||
438 | /* Probes profiling interfaces */ | ||
439 | static int probes_profile_seq_show(struct seq_file *m, void *v) | ||
440 | { | ||
441 | struct trace_uprobe *tu = v; | ||
442 | |||
443 | seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); | ||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | static const struct seq_operations profile_seq_op = { | ||
448 | .start = probes_seq_start, | ||
449 | .next = probes_seq_next, | ||
450 | .stop = probes_seq_stop, | ||
451 | .show = probes_profile_seq_show | ||
452 | }; | ||
453 | |||
454 | static int profile_open(struct inode *inode, struct file *file) | ||
455 | { | ||
456 | return seq_open(file, &profile_seq_op); | ||
457 | } | ||
458 | |||
459 | static const struct file_operations uprobe_profile_ops = { | ||
460 | .owner = THIS_MODULE, | ||
461 | .open = profile_open, | ||
462 | .read = seq_read, | ||
463 | .llseek = seq_lseek, | ||
464 | .release = seq_release, | ||
465 | }; | ||
466 | |||
467 | /* uprobe handler */ | ||
468 | static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
469 | { | ||
470 | struct uprobe_trace_entry_head *entry; | ||
471 | struct ring_buffer_event *event; | ||
472 | struct ring_buffer *buffer; | ||
473 | u8 *data; | ||
474 | int size, i, pc; | ||
475 | unsigned long irq_flags; | ||
476 | struct ftrace_event_call *call = &tu->call; | ||
477 | |||
478 | tu->nhit++; | ||
479 | |||
480 | local_save_flags(irq_flags); | ||
481 | pc = preempt_count(); | ||
482 | |||
483 | size = sizeof(*entry) + tu->size; | ||
484 | |||
485 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | ||
486 | size, irq_flags, pc); | ||
487 | if (!event) | ||
488 | return; | ||
489 | |||
490 | entry = ring_buffer_event_data(event); | ||
491 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | ||
492 | data = (u8 *)&entry[1]; | ||
493 | for (i = 0; i < tu->nr_args; i++) | ||
494 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | ||
495 | |||
496 | if (!filter_current_check_discard(buffer, call, entry, event)) | ||
497 | trace_buffer_unlock_commit(buffer, event, irq_flags, pc); | ||
498 | } | ||
499 | |||
500 | /* Event entry printers */ | ||
501 | static enum print_line_t | ||
502 | print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) | ||
503 | { | ||
504 | struct uprobe_trace_entry_head *field; | ||
505 | struct trace_seq *s = &iter->seq; | ||
506 | struct trace_uprobe *tu; | ||
507 | u8 *data; | ||
508 | int i; | ||
509 | |||
510 | field = (struct uprobe_trace_entry_head *)iter->ent; | ||
511 | tu = container_of(event, struct trace_uprobe, call.event); | ||
512 | |||
513 | if (!trace_seq_printf(s, "%s: (", tu->call.name)) | ||
514 | goto partial; | ||
515 | |||
516 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) | ||
517 | goto partial; | ||
518 | |||
519 | if (!trace_seq_puts(s, ")")) | ||
520 | goto partial; | ||
521 | |||
522 | data = (u8 *)&field[1]; | ||
523 | for (i = 0; i < tu->nr_args; i++) { | ||
524 | if (!tu->args[i].type->print(s, tu->args[i].name, | ||
525 | data + tu->args[i].offset, field)) | ||
526 | goto partial; | ||
527 | } | ||
528 | |||
529 | if (trace_seq_puts(s, "\n")) | ||
530 | return TRACE_TYPE_HANDLED; | ||
531 | |||
532 | partial: | ||
533 | return TRACE_TYPE_PARTIAL_LINE; | ||
534 | } | ||
535 | |||
536 | static int probe_event_enable(struct trace_uprobe *tu, int flag) | ||
537 | { | ||
538 | struct uprobe_trace_consumer *utc; | ||
539 | int ret = 0; | ||
540 | |||
541 | if (!tu->inode || tu->consumer) | ||
542 | return -EINTR; | ||
543 | |||
544 | utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); | ||
545 | if (!utc) | ||
546 | return -EINTR; | ||
547 | |||
548 | utc->cons.handler = uprobe_dispatcher; | ||
549 | utc->cons.filter = NULL; | ||
550 | ret = uprobe_register(tu->inode, tu->offset, &utc->cons); | ||
551 | if (ret) { | ||
552 | kfree(utc); | ||
553 | return ret; | ||
554 | } | ||
555 | |||
556 | tu->flags |= flag; | ||
557 | utc->tu = tu; | ||
558 | tu->consumer = utc; | ||
559 | |||
560 | return 0; | ||
561 | } | ||
562 | |||
563 | static void probe_event_disable(struct trace_uprobe *tu, int flag) | ||
564 | { | ||
565 | if (!tu->inode || !tu->consumer) | ||
566 | return; | ||
567 | |||
568 | uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); | ||
569 | tu->flags &= ~flag; | ||
570 | kfree(tu->consumer); | ||
571 | tu->consumer = NULL; | ||
572 | } | ||
573 | |||
574 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | ||
575 | { | ||
576 | int ret, i; | ||
577 | struct uprobe_trace_entry_head field; | ||
578 | struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; | ||
579 | |||
580 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | ||
581 | /* Set argument names as fields */ | ||
582 | for (i = 0; i < tu->nr_args; i++) { | ||
583 | ret = trace_define_field(event_call, tu->args[i].type->fmttype, | ||
584 | tu->args[i].name, | ||
585 | sizeof(field) + tu->args[i].offset, | ||
586 | tu->args[i].type->size, | ||
587 | tu->args[i].type->is_signed, | ||
588 | FILTER_OTHER); | ||
589 | |||
590 | if (ret) | ||
591 | return ret; | ||
592 | } | ||
593 | return 0; | ||
594 | } | ||
595 | |||
596 | #define LEN_OR_ZERO (len ? len - pos : 0) | ||
597 | static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) | ||
598 | { | ||
599 | const char *fmt, *arg; | ||
600 | int i; | ||
601 | int pos = 0; | ||
602 | |||
603 | fmt = "(%lx)"; | ||
604 | arg = "REC->" FIELD_STRING_IP; | ||
605 | |||
606 | /* When len=0, we just calculate the needed length */ | ||
607 | |||
608 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); | ||
609 | |||
610 | for (i = 0; i < tu->nr_args; i++) { | ||
611 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", | ||
612 | tu->args[i].name, tu->args[i].type->fmt); | ||
613 | } | ||
614 | |||
615 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | ||
616 | |||
617 | for (i = 0; i < tu->nr_args; i++) { | ||
618 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
619 | tu->args[i].name); | ||
620 | } | ||
621 | |||
622 | return pos; /* return the length of print_fmt */ | ||
623 | } | ||
624 | #undef LEN_OR_ZERO | ||
625 | |||
626 | static int set_print_fmt(struct trace_uprobe *tu) | ||
627 | { | ||
628 | char *print_fmt; | ||
629 | int len; | ||
630 | |||
631 | /* First: called with 0 length to calculate the needed length */ | ||
632 | len = __set_print_fmt(tu, NULL, 0); | ||
633 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
634 | if (!print_fmt) | ||
635 | return -ENOMEM; | ||
636 | |||
637 | /* Second: actually write the @print_fmt */ | ||
638 | __set_print_fmt(tu, print_fmt, len + 1); | ||
639 | tu->call.print_fmt = print_fmt; | ||
640 | |||
641 | return 0; | ||
642 | } | ||
643 | |||
644 | #ifdef CONFIG_PERF_EVENTS | ||
645 | /* uprobe profile handler */ | ||
646 | static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
647 | { | ||
648 | struct ftrace_event_call *call = &tu->call; | ||
649 | struct uprobe_trace_entry_head *entry; | ||
650 | struct hlist_head *head; | ||
651 | u8 *data; | ||
652 | int size, __size, i; | ||
653 | int rctx; | ||
654 | |||
655 | __size = sizeof(*entry) + tu->size; | ||
656 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | ||
657 | size -= sizeof(u32); | ||
658 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) | ||
659 | return; | ||
660 | |||
661 | preempt_disable(); | ||
662 | |||
663 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | ||
664 | if (!entry) | ||
665 | goto out; | ||
666 | |||
667 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | ||
668 | data = (u8 *)&entry[1]; | ||
669 | for (i = 0; i < tu->nr_args; i++) | ||
670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | ||
671 | |||
672 | head = this_cpu_ptr(call->perf_events); | ||
673 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | ||
674 | |||
675 | out: | ||
676 | preempt_enable(); | ||
677 | } | ||
678 | #endif /* CONFIG_PERF_EVENTS */ | ||
679 | |||
680 | static | ||
681 | int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) | ||
682 | { | ||
683 | struct trace_uprobe *tu = (struct trace_uprobe *)event->data; | ||
684 | |||
685 | switch (type) { | ||
686 | case TRACE_REG_REGISTER: | ||
687 | return probe_event_enable(tu, TP_FLAG_TRACE); | ||
688 | |||
689 | case TRACE_REG_UNREGISTER: | ||
690 | probe_event_disable(tu, TP_FLAG_TRACE); | ||
691 | return 0; | ||
692 | |||
693 | #ifdef CONFIG_PERF_EVENTS | ||
694 | case TRACE_REG_PERF_REGISTER: | ||
695 | return probe_event_enable(tu, TP_FLAG_PROFILE); | ||
696 | |||
697 | case TRACE_REG_PERF_UNREGISTER: | ||
698 | probe_event_disable(tu, TP_FLAG_PROFILE); | ||
699 | return 0; | ||
700 | #endif | ||
701 | default: | ||
702 | return 0; | ||
703 | } | ||
704 | return 0; | ||
705 | } | ||
706 | |||
707 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) | ||
708 | { | ||
709 | struct uprobe_trace_consumer *utc; | ||
710 | struct trace_uprobe *tu; | ||
711 | |||
712 | utc = container_of(con, struct uprobe_trace_consumer, cons); | ||
713 | tu = utc->tu; | ||
714 | if (!tu || tu->consumer != utc) | ||
715 | return 0; | ||
716 | |||
717 | if (tu->flags & TP_FLAG_TRACE) | ||
718 | uprobe_trace_func(tu, regs); | ||
719 | |||
720 | #ifdef CONFIG_PERF_EVENTS | ||
721 | if (tu->flags & TP_FLAG_PROFILE) | ||
722 | uprobe_perf_func(tu, regs); | ||
723 | #endif | ||
724 | return 0; | ||
725 | } | ||
726 | |||
727 | static struct trace_event_functions uprobe_funcs = { | ||
728 | .trace = print_uprobe_event | ||
729 | }; | ||
730 | |||
731 | static int register_uprobe_event(struct trace_uprobe *tu) | ||
732 | { | ||
733 | struct ftrace_event_call *call = &tu->call; | ||
734 | int ret; | ||
735 | |||
736 | /* Initialize ftrace_event_call */ | ||
737 | INIT_LIST_HEAD(&call->class->fields); | ||
738 | call->event.funcs = &uprobe_funcs; | ||
739 | call->class->define_fields = uprobe_event_define_fields; | ||
740 | |||
741 | if (set_print_fmt(tu) < 0) | ||
742 | return -ENOMEM; | ||
743 | |||
744 | ret = register_ftrace_event(&call->event); | ||
745 | if (!ret) { | ||
746 | kfree(call->print_fmt); | ||
747 | return -ENODEV; | ||
748 | } | ||
749 | call->flags = 0; | ||
750 | call->class->reg = trace_uprobe_register; | ||
751 | call->data = tu; | ||
752 | ret = trace_add_event_call(call); | ||
753 | |||
754 | if (ret) { | ||
755 | pr_info("Failed to register uprobe event: %s\n", call->name); | ||
756 | kfree(call->print_fmt); | ||
757 | unregister_ftrace_event(&call->event); | ||
758 | } | ||
759 | |||
760 | return ret; | ||
761 | } | ||
762 | |||
763 | static void unregister_uprobe_event(struct trace_uprobe *tu) | ||
764 | { | ||
765 | /* tu->event is unregistered in trace_remove_event_call() */ | ||
766 | trace_remove_event_call(&tu->call); | ||
767 | kfree(tu->call.print_fmt); | ||
768 | tu->call.print_fmt = NULL; | ||
769 | } | ||
770 | |||
771 | /* Make a trace interface for controlling probe points */ | ||
772 | static __init int init_uprobe_trace(void) | ||
773 | { | ||
774 | struct dentry *d_tracer; | ||
775 | |||
776 | d_tracer = tracing_init_dentry(); | ||
777 | if (!d_tracer) | ||
778 | return 0; | ||
779 | |||
780 | trace_create_file("uprobe_events", 0644, d_tracer, | ||
781 | NULL, &uprobe_events_ops); | ||
782 | /* Profile interface */ | ||
783 | trace_create_file("uprobe_profile", 0444, d_tracer, | ||
784 | NULL, &uprobe_profile_ops); | ||
785 | return 0; | ||
786 | } | ||
787 | |||
788 | fs_initcall(init_uprobe_trace); | ||
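The probe_event_enable()/probe_event_disable() paths above reduce to the inode-based uprobes API already visible in this file (kern_path(), igrab(), uprobe_register(), uprobe_unregister()). Below is a minimal sketch of a standalone consumer built from only those calls; the module, target path, offset and all demo_* names are illustrative assumptions, not part of this patch.

/*
 * Minimal sketch of a standalone uprobe consumer, mirroring the calls made
 * by probe_event_enable()/probe_event_disable() above. The target binary,
 * offset and all demo_* names are hypothetical, for illustration only.
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>

static struct inode *demo_inode;
static const loff_t demo_offset = 0x4710;	/* hypothetical offset into the binary */

static int demo_handler(struct uprobe_consumer *con, struct pt_regs *regs)
{
	/* Runs in the context of the task that hit the breakpoint. */
	pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
	return 0;
}

static struct uprobe_consumer demo_consumer = {
	.handler = demo_handler,
};

static int __init demo_init(void)
{
	struct path path;
	int ret;

	ret = kern_path("/bin/true", LOOKUP_FOLLOW, &path);	/* hypothetical target */
	if (ret)
		return ret;

	demo_inode = igrab(path.dentry->d_inode);
	path_put(&path);
	if (!demo_inode)
		return -EINVAL;

	ret = uprobe_register(demo_inode, demo_offset, &demo_consumer);
	if (ret)
		iput(demo_inode);
	return ret;
}

static void __exit demo_exit(void)
{
	uprobe_unregister(demo_inode, demo_offset, &demo_consumer);
	iput(demo_inode);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With this patch applied, the same effect is available without a module by writing a definition such as "p:demo /bin/true:0x4710" (path and offset again hypothetical) into the new uprobe_events file and enabling the resulting event under events/uprobes/.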
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a4721..000000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null | |||
@@ -1,300 +0,0 @@ | |||
1 | /* | ||
2 | * Workqueue statistical tracer. | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | |||
9 | #include <trace/events/workqueue.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/percpu.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/kref.h> | ||
14 | #include "trace_stat.h" | ||
15 | #include "trace.h" | ||
16 | |||
17 | |||
18 | /* A cpu workqueue thread */ | ||
19 | struct cpu_workqueue_stats { | ||
20 | struct list_head list; | ||
21 | struct kref kref; | ||
22 | int cpu; | ||
23 | pid_t pid; | ||
24 | /* Can be inserted from interrupt or user context, need to be atomic */ | ||
25 | atomic_t inserted; | ||
26 | /* | ||
27 | * Don't need to be atomic, works are serialized in a single workqueue thread | ||
28 | * on a single CPU. | ||
29 | */ | ||
30 | unsigned int executed; | ||
31 | }; | ||
32 | |||
33 | /* List of workqueue threads on one cpu */ | ||
34 | struct workqueue_global_stats { | ||
35 | struct list_head list; | ||
36 | spinlock_t lock; | ||
37 | }; | ||
38 | |||
39 | /* Don't need a global lock because allocated before the workqueues, and | ||
40 | * never freed. | ||
41 | */ | ||
42 | static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); | ||
43 | #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) | ||
44 | |||
45 | static void cpu_workqueue_stat_free(struct kref *kref) | ||
46 | { | ||
47 | kfree(container_of(kref, struct cpu_workqueue_stats, kref)); | ||
48 | } | ||
49 | |||
50 | /* Insertion of a work */ | ||
51 | static void | ||
52 | probe_workqueue_insertion(void *ignore, | ||
53 | struct task_struct *wq_thread, | ||
54 | struct work_struct *work) | ||
55 | { | ||
56 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
57 | struct cpu_workqueue_stats *node; | ||
58 | unsigned long flags; | ||
59 | |||
60 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
61 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
62 | if (node->pid == wq_thread->pid) { | ||
63 | atomic_inc(&node->inserted); | ||
64 | goto found; | ||
65 | } | ||
66 | } | ||
67 | pr_debug("trace_workqueue: entry not found\n"); | ||
68 | found: | ||
69 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
70 | } | ||
71 | |||
72 | /* Execution of a work */ | ||
73 | static void | ||
74 | probe_workqueue_execution(void *ignore, | ||
75 | struct task_struct *wq_thread, | ||
76 | struct work_struct *work) | ||
77 | { | ||
78 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
79 | struct cpu_workqueue_stats *node; | ||
80 | unsigned long flags; | ||
81 | |||
82 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
83 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
84 | if (node->pid == wq_thread->pid) { | ||
85 | node->executed++; | ||
86 | goto found; | ||
87 | } | ||
88 | } | ||
89 | pr_debug("trace_workqueue: entry not found\n"); | ||
90 | found: | ||
91 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
92 | } | ||
93 | |||
94 | /* Creation of a cpu workqueue thread */ | ||
95 | static void probe_workqueue_creation(void *ignore, | ||
96 | struct task_struct *wq_thread, int cpu) | ||
97 | { | ||
98 | struct cpu_workqueue_stats *cws; | ||
99 | unsigned long flags; | ||
100 | |||
101 | WARN_ON(cpu < 0); | ||
102 | |||
103 | /* Workqueues are sometimes created in atomic context */ | ||
104 | cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); | ||
105 | if (!cws) { | ||
106 | pr_warning("trace_workqueue: not enough memory\n"); | ||
107 | return; | ||
108 | } | ||
109 | INIT_LIST_HEAD(&cws->list); | ||
110 | kref_init(&cws->kref); | ||
111 | cws->cpu = cpu; | ||
112 | cws->pid = wq_thread->pid; | ||
113 | |||
114 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
115 | list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); | ||
116 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
117 | } | ||
118 | |||
119 | /* Destruction of a cpu workqueue thread */ | ||
120 | static void | ||
121 | probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) | ||
122 | { | ||
123 | /* Workqueue only execute on one cpu */ | ||
124 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
125 | struct cpu_workqueue_stats *node, *next; | ||
126 | unsigned long flags; | ||
127 | |||
128 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
129 | list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, | ||
130 | list) { | ||
131 | if (node->pid == wq_thread->pid) { | ||
132 | list_del(&node->list); | ||
133 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
134 | goto found; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | pr_debug("trace_workqueue: don't find workqueue to destroy\n"); | ||
139 | found: | ||
140 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
141 | |||
142 | } | ||
143 | |||
144 | static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) | ||
145 | { | ||
146 | unsigned long flags; | ||
147 | struct cpu_workqueue_stats *ret = NULL; | ||
148 | |||
149 | |||
150 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
151 | |||
152 | if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { | ||
153 | ret = list_entry(workqueue_cpu_stat(cpu)->list.next, | ||
154 | struct cpu_workqueue_stats, list); | ||
155 | kref_get(&ret->kref); | ||
156 | } | ||
157 | |||
158 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
159 | |||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | static void *workqueue_stat_start(struct tracer_stat *trace) | ||
164 | { | ||
165 | int cpu; | ||
166 | void *ret = NULL; | ||
167 | |||
168 | for_each_possible_cpu(cpu) { | ||
169 | ret = workqueue_stat_start_cpu(cpu); | ||
170 | if (ret) | ||
171 | return ret; | ||
172 | } | ||
173 | return NULL; | ||
174 | } | ||
175 | |||
176 | static void *workqueue_stat_next(void *prev, int idx) | ||
177 | { | ||
178 | struct cpu_workqueue_stats *prev_cws = prev; | ||
179 | struct cpu_workqueue_stats *ret; | ||
180 | int cpu = prev_cws->cpu; | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
184 | if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { | ||
185 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
186 | do { | ||
187 | cpu = cpumask_next(cpu, cpu_possible_mask); | ||
188 | if (cpu >= nr_cpu_ids) | ||
189 | return NULL; | ||
190 | } while (!(ret = workqueue_stat_start_cpu(cpu))); | ||
191 | return ret; | ||
192 | } else { | ||
193 | ret = list_entry(prev_cws->list.next, | ||
194 | struct cpu_workqueue_stats, list); | ||
195 | kref_get(&ret->kref); | ||
196 | } | ||
197 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static int workqueue_stat_show(struct seq_file *s, void *p) | ||
203 | { | ||
204 | struct cpu_workqueue_stats *cws = p; | ||
205 | struct pid *pid; | ||
206 | struct task_struct *tsk; | ||
207 | |||
208 | pid = find_get_pid(cws->pid); | ||
209 | if (pid) { | ||
210 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
211 | if (tsk) { | ||
212 | seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, | ||
213 | atomic_read(&cws->inserted), cws->executed, | ||
214 | tsk->comm); | ||
215 | put_task_struct(tsk); | ||
216 | } | ||
217 | put_pid(pid); | ||
218 | } | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static void workqueue_stat_release(void *stat) | ||
224 | { | ||
225 | struct cpu_workqueue_stats *node = stat; | ||
226 | |||
227 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
228 | } | ||
229 | |||
230 | static int workqueue_stat_headers(struct seq_file *s) | ||
231 | { | ||
232 | seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); | ||
233 | seq_printf(s, "# | | | |\n"); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct tracer_stat workqueue_stats __read_mostly = { | ||
238 | .name = "workqueues", | ||
239 | .stat_start = workqueue_stat_start, | ||
240 | .stat_next = workqueue_stat_next, | ||
241 | .stat_show = workqueue_stat_show, | ||
242 | .stat_release = workqueue_stat_release, | ||
243 | .stat_headers = workqueue_stat_headers | ||
244 | }; | ||
245 | |||
246 | |||
247 | int __init stat_workqueue_init(void) | ||
248 | { | ||
249 | if (register_stat_tracer(&workqueue_stats)) { | ||
250 | pr_warning("Unable to register workqueue stat tracer\n"); | ||
251 | return 1; | ||
252 | } | ||
253 | |||
254 | return 0; | ||
255 | } | ||
256 | fs_initcall(stat_workqueue_init); | ||
257 | |||
258 | /* | ||
259 | * Workqueues are created very early, just after pre-smp initcalls. | ||
260 | * So we must register our tracepoints at this stage. | ||
261 | */ | ||
262 | int __init trace_workqueue_early_init(void) | ||
263 | { | ||
264 | int ret, cpu; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
272 | if (ret) | ||
273 | goto out; | ||
274 | |||
275 | ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
276 | if (ret) | ||
277 | goto no_insertion; | ||
278 | |||
279 | ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
280 | if (ret) | ||
281 | goto no_execution; | ||
282 | |||
283 | ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); | ||
284 | if (ret) | ||
285 | goto no_creation; | ||
286 | |||
287 | return 0; | ||
288 | |||
289 | no_creation: | ||
290 | unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
291 | no_execution: | ||
292 | unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
293 | no_insertion: | ||
294 | unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
295 | out: | ||
296 | pr_warning("trace_workqueue: unable to trace workqueues\n"); | ||
297 | |||
298 | return 1; | ||
299 | } | ||
300 | early_initcall(trace_workqueue_early_init); | ||
diff --git a/kernel/uid16.c b/kernel/uid16.c index 51c6e89e8619..d7948eb10225 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) | |||
81 | return ret; | 81 | return ret; |
82 | } | 82 | } |
83 | 83 | ||
84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) | 84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) |
85 | { | 85 | { |
86 | const struct cred *cred = current_cred(); | 86 | const struct cred *cred = current_cred(); |
87 | int retval; | 87 | int retval; |
88 | old_uid_t ruid, euid, suid; | ||
88 | 89 | ||
89 | if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && | 90 | ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); |
90 | !(retval = put_user(high2lowuid(cred->euid), euid))) | 91 | euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); |
91 | retval = put_user(high2lowuid(cred->suid), suid); | 92 | suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); |
93 | |||
94 | if (!(retval = put_user(ruid, ruidp)) && | ||
95 | !(retval = put_user(euid, euidp))) | ||
96 | retval = put_user(suid, suidp); | ||
92 | 97 | ||
93 | return retval; | 98 | return retval; |
94 | } | 99 | } |
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) | |||
103 | } | 108 | } |
104 | 109 | ||
105 | 110 | ||
106 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) | 111 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) |
107 | { | 112 | { |
108 | const struct cred *cred = current_cred(); | 113 | const struct cred *cred = current_cred(); |
109 | int retval; | 114 | int retval; |
115 | old_gid_t rgid, egid, sgid; | ||
116 | |||
117 | rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid)); | ||
118 | egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid)); | ||
119 | sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid)); | ||
110 | 120 | ||
111 | if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && | 121 | if (!(retval = put_user(rgid, rgidp)) && |
112 | !(retval = put_user(high2lowgid(cred->egid), egid))) | 122 | !(retval = put_user(egid, egidp))) |
113 | retval = put_user(high2lowgid(cred->sgid), sgid); | 123 | retval = put_user(sgid, sgidp); |
114 | 124 | ||
115 | return retval; | 125 | return retval; |
116 | } | 126 | } |
@@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) | |||
134 | static int groups16_to_user(old_gid_t __user *grouplist, | 144 | static int groups16_to_user(old_gid_t __user *grouplist, |
135 | struct group_info *group_info) | 145 | struct group_info *group_info) |
136 | { | 146 | { |
147 | struct user_namespace *user_ns = current_user_ns(); | ||
137 | int i; | 148 | int i; |
138 | old_gid_t group; | 149 | old_gid_t group; |
150 | kgid_t kgid; | ||
139 | 151 | ||
140 | for (i = 0; i < group_info->ngroups; i++) { | 152 | for (i = 0; i < group_info->ngroups; i++) { |
141 | group = high2lowgid(GROUP_AT(group_info, i)); | 153 | kgid = GROUP_AT(group_info, i); |
154 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); | ||
142 | if (put_user(group, grouplist+i)) | 155 | if (put_user(group, grouplist+i)) |
143 | return -EFAULT; | 156 | return -EFAULT; |
144 | } | 157 | } |
@@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist, | |||
149 | static int groups16_from_user(struct group_info *group_info, | 162 | static int groups16_from_user(struct group_info *group_info, |
150 | old_gid_t __user *grouplist) | 163 | old_gid_t __user *grouplist) |
151 | { | 164 | { |
165 | struct user_namespace *user_ns = current_user_ns(); | ||
152 | int i; | 166 | int i; |
153 | old_gid_t group; | 167 | old_gid_t group; |
168 | kgid_t kgid; | ||
154 | 169 | ||
155 | for (i = 0; i < group_info->ngroups; i++) { | 170 | for (i = 0; i < group_info->ngroups; i++) { |
156 | if (get_user(group, grouplist+i)) | 171 | if (get_user(group, grouplist+i)) |
157 | return -EFAULT; | 172 | return -EFAULT; |
158 | GROUP_AT(group_info, i) = low2highgid(group); | 173 | |
174 | kgid = make_kgid(user_ns, low2highgid(group)); | ||
175 | if (!gid_valid(kgid)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | GROUP_AT(group_info, i) = kgid; | ||
159 | } | 179 | } |
160 | 180 | ||
161 | return 0; | 181 | return 0; |
@@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
211 | 231 | ||
212 | SYSCALL_DEFINE0(getuid16) | 232 | SYSCALL_DEFINE0(getuid16) |
213 | { | 233 | { |
214 | return high2lowuid(current_uid()); | 234 | return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); |
215 | } | 235 | } |
216 | 236 | ||
217 | SYSCALL_DEFINE0(geteuid16) | 237 | SYSCALL_DEFINE0(geteuid16) |
218 | { | 238 | { |
219 | return high2lowuid(current_euid()); | 239 | return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); |
220 | } | 240 | } |
221 | 241 | ||
222 | SYSCALL_DEFINE0(getgid16) | 242 | SYSCALL_DEFINE0(getgid16) |
223 | { | 243 | { |
224 | return high2lowgid(current_gid()); | 244 | return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); |
225 | } | 245 | } |
226 | 246 | ||
227 | SYSCALL_DEFINE0(getegid16) | 247 | SYSCALL_DEFINE0(getegid16) |
228 | { | 248 | { |
229 | return high2lowgid(current_egid()); | 249 | return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); |
230 | } | 250 | } |
diff --git a/kernel/user.c b/kernel/user.c index 71dd2363ab0f..b815fefbe76f 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -22,10 +22,27 @@ | |||
22 | * and 1 for... ? | 22 | * and 1 for... ? |
23 | */ | 23 | */ |
24 | struct user_namespace init_user_ns = { | 24 | struct user_namespace init_user_ns = { |
25 | .uid_map = { | ||
26 | .nr_extents = 1, | ||
27 | .extent[0] = { | ||
28 | .first = 0, | ||
29 | .lower_first = 0, | ||
30 | .count = 4294967295U, | ||
31 | }, | ||
32 | }, | ||
33 | .gid_map = { | ||
34 | .nr_extents = 1, | ||
35 | .extent[0] = { | ||
36 | .first = 0, | ||
37 | .lower_first = 0, | ||
38 | .count = 4294967295U, | ||
39 | }, | ||
40 | }, | ||
25 | .kref = { | 41 | .kref = { |
26 | .refcount = ATOMIC_INIT(3), | 42 | .refcount = ATOMIC_INIT(3), |
27 | }, | 43 | }, |
28 | .creator = &root_user, | 44 | .owner = GLOBAL_ROOT_UID, |
45 | .group = GLOBAL_ROOT_GID, | ||
29 | }; | 46 | }; |
30 | EXPORT_SYMBOL_GPL(init_user_ns); | 47 | EXPORT_SYMBOL_GPL(init_user_ns); |
31 | 48 | ||
@@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns); | |||
34 | * when changing user ID's (ie setuid() and friends). | 51 | * when changing user ID's (ie setuid() and friends). |
35 | */ | 52 | */ |
36 | 53 | ||
54 | #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) | ||
55 | #define UIDHASH_SZ (1 << UIDHASH_BITS) | ||
37 | #define UIDHASH_MASK (UIDHASH_SZ - 1) | 56 | #define UIDHASH_MASK (UIDHASH_SZ - 1) |
38 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | 57 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) |
39 | #define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) | 58 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) |
40 | 59 | ||
41 | static struct kmem_cache *uid_cachep; | 60 | static struct kmem_cache *uid_cachep; |
61 | struct hlist_head uidhash_table[UIDHASH_SZ]; | ||
42 | 62 | ||
43 | /* | 63 | /* |
44 | * The uidhash_lock is mostly taken from process context, but it is | 64 | * The uidhash_lock is mostly taken from process context, but it is |
@@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep; | |||
51 | */ | 71 | */ |
52 | static DEFINE_SPINLOCK(uidhash_lock); | 72 | static DEFINE_SPINLOCK(uidhash_lock); |
53 | 73 | ||
54 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ | 74 | /* root_user.__count is 1, for init task cred */ |
55 | struct user_struct root_user = { | 75 | struct user_struct root_user = { |
56 | .__count = ATOMIC_INIT(2), | 76 | .__count = ATOMIC_INIT(1), |
57 | .processes = ATOMIC_INIT(1), | 77 | .processes = ATOMIC_INIT(1), |
58 | .files = ATOMIC_INIT(0), | 78 | .files = ATOMIC_INIT(0), |
59 | .sigpending = ATOMIC_INIT(0), | 79 | .sigpending = ATOMIC_INIT(0), |
60 | .locked_shm = 0, | 80 | .locked_shm = 0, |
61 | .user_ns = &init_user_ns, | 81 | .uid = GLOBAL_ROOT_UID, |
62 | }; | 82 | }; |
63 | 83 | ||
64 | /* | 84 | /* |
@@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) | |||
72 | static void uid_hash_remove(struct user_struct *up) | 92 | static void uid_hash_remove(struct user_struct *up) |
73 | { | 93 | { |
74 | hlist_del_init(&up->uidhash_node); | 94 | hlist_del_init(&up->uidhash_node); |
75 | put_user_ns(up->user_ns); | ||
76 | } | 95 | } |
77 | 96 | ||
78 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 97 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) |
79 | { | 98 | { |
80 | struct user_struct *user; | 99 | struct user_struct *user; |
81 | struct hlist_node *h; | 100 | struct hlist_node *h; |
82 | 101 | ||
83 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 102 | hlist_for_each_entry(user, h, hashent, uidhash_node) { |
84 | if (user->uid == uid) { | 103 | if (uid_eq(user->uid, uid)) { |
85 | atomic_inc(&user->__count); | 104 | atomic_inc(&user->__count); |
86 | return user; | 105 | return user; |
87 | } | 106 | } |
@@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
110 | * | 129 | * |
111 | * If the user_struct could not be found, return NULL. | 130 | * If the user_struct could not be found, return NULL. |
112 | */ | 131 | */ |
113 | struct user_struct *find_user(uid_t uid) | 132 | struct user_struct *find_user(kuid_t uid) |
114 | { | 133 | { |
115 | struct user_struct *ret; | 134 | struct user_struct *ret; |
116 | unsigned long flags; | 135 | unsigned long flags; |
117 | struct user_namespace *ns = current_user_ns(); | ||
118 | 136 | ||
119 | spin_lock_irqsave(&uidhash_lock, flags); | 137 | spin_lock_irqsave(&uidhash_lock, flags); |
120 | ret = uid_hash_find(uid, uidhashentry(ns, uid)); | 138 | ret = uid_hash_find(uid, uidhashentry(uid)); |
121 | spin_unlock_irqrestore(&uidhash_lock, flags); | 139 | spin_unlock_irqrestore(&uidhash_lock, flags); |
122 | return ret; | 140 | return ret; |
123 | } | 141 | } |
@@ -136,9 +154,9 @@ void free_uid(struct user_struct *up) | |||
136 | local_irq_restore(flags); | 154 | local_irq_restore(flags); |
137 | } | 155 | } |
138 | 156 | ||
139 | struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | 157 | struct user_struct *alloc_uid(kuid_t uid) |
140 | { | 158 | { |
141 | struct hlist_head *hashent = uidhashentry(ns, uid); | 159 | struct hlist_head *hashent = uidhashentry(uid); |
142 | struct user_struct *up, *new; | 160 | struct user_struct *up, *new; |
143 | 161 | ||
144 | spin_lock_irq(&uidhash_lock); | 162 | spin_lock_irq(&uidhash_lock); |
@@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
153 | new->uid = uid; | 171 | new->uid = uid; |
154 | atomic_set(&new->__count, 1); | 172 | atomic_set(&new->__count, 1); |
155 | 173 | ||
156 | new->user_ns = get_user_ns(ns); | ||
157 | |||
158 | /* | 174 | /* |
159 | * Before adding this, check whether we raced | 175 | * Before adding this, check whether we raced |
160 | * on adding the same user already.. | 176 | * on adding the same user already.. |
@@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
162 | spin_lock_irq(&uidhash_lock); | 178 | spin_lock_irq(&uidhash_lock); |
163 | up = uid_hash_find(uid, hashent); | 179 | up = uid_hash_find(uid, hashent); |
164 | if (up) { | 180 | if (up) { |
165 | put_user_ns(ns); | ||
166 | key_put(new->uid_keyring); | 181 | key_put(new->uid_keyring); |
167 | key_put(new->session_keyring); | 182 | key_put(new->session_keyring); |
168 | kmem_cache_free(uid_cachep, new); | 183 | kmem_cache_free(uid_cachep, new); |
@@ -187,11 +202,11 @@ static int __init uid_cache_init(void) | |||
187 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 202 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
188 | 203 | ||
189 | for(n = 0; n < UIDHASH_SZ; ++n) | 204 | for(n = 0; n < UIDHASH_SZ; ++n) |
190 | INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); | 205 | INIT_HLIST_HEAD(uidhash_table + n); |
191 | 206 | ||
192 | /* Insert the root user immediately (init already runs as root) */ | 207 | /* Insert the root user immediately (init already runs as root) */ |
193 | spin_lock_irq(&uidhash_lock); | 208 | spin_lock_irq(&uidhash_lock); |
194 | uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); | 209 | uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); |
195 | spin_unlock_irq(&uidhash_lock); | 210 | spin_unlock_irq(&uidhash_lock); |
196 | 211 | ||
197 | return 0; | 212 | return 0; |
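With user.c now keying the global uidhash_table on kuid_t, the raw value is only extracted for hashing (__kuid_val()) and every comparison goes through uid_eq(). A minimal sketch of that hash-and-compare pattern is shown below; all demo_* names are hypothetical.

/*
 * Sketch of the kuid_t hash-and-compare pattern used by uidhashentry()
 * and uid_hash_find() above. All demo_* names are illustrative only.
 */
#include <linux/list.h>
#include <linux/uidgid.h>

#define DEMO_HASH_BITS	7
#define DEMO_HASH_SZ	(1 << DEMO_HASH_BITS)
#define DEMO_HASH_MASK	(DEMO_HASH_SZ - 1)

struct demo_user {
	struct hlist_node node;
	kuid_t uid;
};

static struct hlist_head demo_table[DEMO_HASH_SZ];

static struct hlist_head *demo_bucket(kuid_t uid)
{
	uid_t val = __kuid_val(uid);	/* raw value is used for hashing only */

	return &demo_table[((val >> DEMO_HASH_BITS) + val) & DEMO_HASH_MASK];
}

static struct demo_user *demo_find(kuid_t uid)
{
	struct demo_user *user;
	struct hlist_node *h;

	/* Compare through uid_eq() so the code stays correct whether kuid_t
	 * is a bare integer or a type-safe struct wrapper. */
	hlist_for_each_entry(user, h, demo_bucket(uid), node)
		if (uid_eq(user->uid, uid))
			return user;

	return NULL;
}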
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3b906e98b1db..86602316422d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -11,9 +11,20 @@ | |||
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
14 | #include <linux/securebits.h> | ||
15 | #include <linux/keyctl.h> | ||
16 | #include <linux/key-type.h> | ||
17 | #include <keys/user-type.h> | ||
18 | #include <linux/seq_file.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/uaccess.h> | ||
21 | #include <linux/ctype.h> | ||
14 | 22 | ||
15 | static struct kmem_cache *user_ns_cachep __read_mostly; | 23 | static struct kmem_cache *user_ns_cachep __read_mostly; |
16 | 24 | ||
25 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
26 | struct uid_gid_map *map); | ||
27 | |||
17 | /* | 28 | /* |
18 | * Create a new user namespace, deriving the creator from the user in the | 29 | * Create a new user namespace, deriving the creator from the user in the |
19 | * passed credentials, and replacing that user with the new root user for the | 30 | * passed credentials, and replacing that user with the new root user for the |
@@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly; | |||
24 | */ | 35 | */ |
25 | int create_user_ns(struct cred *new) | 36 | int create_user_ns(struct cred *new) |
26 | { | 37 | { |
27 | struct user_namespace *ns; | 38 | struct user_namespace *ns, *parent_ns = new->user_ns; |
28 | struct user_struct *root_user; | 39 | kuid_t owner = new->euid; |
29 | int n; | 40 | kgid_t group = new->egid; |
41 | |||
42 | /* The creator needs a mapping in the parent user namespace | ||
43 | * or else we won't be able to reasonably tell userspace who | ||
44 | * created a user_namespace. | ||
45 | */ | ||
46 | if (!kuid_has_mapping(parent_ns, owner) || | ||
47 | !kgid_has_mapping(parent_ns, group)) | ||
48 | return -EPERM; | ||
30 | 49 | ||
31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); | 50 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); |
32 | if (!ns) | 51 | if (!ns) |
33 | return -ENOMEM; | 52 | return -ENOMEM; |
34 | 53 | ||
35 | kref_init(&ns->kref); | 54 | kref_init(&ns->kref); |
55 | ns->parent = parent_ns; | ||
56 | ns->owner = owner; | ||
57 | ns->group = group; | ||
36 | 58 | ||
37 | for (n = 0; n < UIDHASH_SZ; ++n) | 59 | /* Start with the same capabilities as init but useless for doing |
38 | INIT_HLIST_HEAD(ns->uidhash_table + n); | 60 | * anything as the capabilities are bound to the new user namespace. |
39 | 61 | */ | |
40 | /* Alloc new root user. */ | 62 | new->securebits = SECUREBITS_DEFAULT; |
41 | root_user = alloc_uid(ns, 0); | 63 | new->cap_inheritable = CAP_EMPTY_SET; |
42 | if (!root_user) { | 64 | new->cap_permitted = CAP_FULL_SET; |
43 | kmem_cache_free(user_ns_cachep, ns); | 65 | new->cap_effective = CAP_FULL_SET; |
44 | return -ENOMEM; | 66 | new->cap_bset = CAP_FULL_SET; |
45 | } | ||
46 | |||
47 | /* set the new root user in the credentials under preparation */ | ||
48 | ns->creator = new->user; | ||
49 | new->user = root_user; | ||
50 | new->uid = new->euid = new->suid = new->fsuid = 0; | ||
51 | new->gid = new->egid = new->sgid = new->fsgid = 0; | ||
52 | put_group_info(new->group_info); | ||
53 | new->group_info = get_group_info(&init_groups); | ||
54 | #ifdef CONFIG_KEYS | 67 | #ifdef CONFIG_KEYS |
55 | key_put(new->request_key_auth); | 68 | key_put(new->request_key_auth); |
56 | new->request_key_auth = NULL; | 69 | new->request_key_auth = NULL; |
57 | #endif | 70 | #endif |
58 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | 71 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ |
59 | 72 | ||
60 | /* root_user holds a reference to ns, our reference can be dropped */ | 73 | /* Leave the new->user_ns reference with the new user namespace. */ |
61 | put_user_ns(ns); | 74 | /* Leave the reference to our user_ns with the new cred. */ |
75 | new->user_ns = ns; | ||
62 | 76 | ||
63 | return 0; | 77 | return 0; |
64 | } | 78 | } |
65 | 79 | ||
66 | /* | 80 | void free_user_ns(struct kref *kref) |
67 | * Deferred destructor for a user namespace. This is required because | ||
68 | * free_user_ns() may be called with uidhash_lock held, but we need to call | ||
69 | * back to free_uid() which will want to take the lock again. | ||
70 | */ | ||
71 | static void free_user_ns_work(struct work_struct *work) | ||
72 | { | 81 | { |
73 | struct user_namespace *ns = | 82 | struct user_namespace *parent, *ns = |
74 | container_of(work, struct user_namespace, destroyer); | 83 | container_of(kref, struct user_namespace, kref); |
75 | free_uid(ns->creator); | 84 | |
85 | parent = ns->parent; | ||
76 | kmem_cache_free(user_ns_cachep, ns); | 86 | kmem_cache_free(user_ns_cachep, ns); |
87 | put_user_ns(parent); | ||
77 | } | 88 | } |
89 | EXPORT_SYMBOL(free_user_ns); | ||
78 | 90 | ||
79 | void free_user_ns(struct kref *kref) | 91 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) |
80 | { | 92 | { |
81 | struct user_namespace *ns = | 93 | unsigned idx, extents; |
82 | container_of(kref, struct user_namespace, kref); | 94 | u32 first, last, id2; |
95 | |||
96 | id2 = id + count - 1; | ||
97 | |||
98 | /* Find the matching extent */ | ||
99 | extents = map->nr_extents; | ||
100 | smp_read_barrier_depends(); | ||
101 | for (idx = 0; idx < extents; idx++) { | ||
102 | first = map->extent[idx].first; | ||
103 | last = first + map->extent[idx].count - 1; | ||
104 | if (id >= first && id <= last && | ||
105 | (id2 >= first && id2 <= last)) | ||
106 | break; | ||
107 | } | ||
108 | /* Map the id or note failure */ | ||
109 | if (idx < extents) | ||
110 | id = (id - first) + map->extent[idx].lower_first; | ||
111 | else | ||
112 | id = (u32) -1; | ||
83 | 113 | ||
84 | INIT_WORK(&ns->destroyer, free_user_ns_work); | 114 | return id; |
85 | schedule_work(&ns->destroyer); | ||
86 | } | 115 | } |
87 | EXPORT_SYMBOL(free_user_ns); | ||
88 | 116 | ||
89 | uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) | 117 | static u32 map_id_down(struct uid_gid_map *map, u32 id) |
90 | { | 118 | { |
91 | struct user_namespace *tmp; | 119 | unsigned idx, extents; |
120 | u32 first, last; | ||
92 | 121 | ||
93 | if (likely(to == cred->user->user_ns)) | 122 | /* Find the matching extent */ |
94 | return uid; | 123 | extents = map->nr_extents; |
124 | smp_read_barrier_depends(); | ||
125 | for (idx = 0; idx < extents; idx++) { | ||
126 | first = map->extent[idx].first; | ||
127 | last = first + map->extent[idx].count - 1; | ||
128 | if (id >= first && id <= last) | ||
129 | break; | ||
130 | } | ||
131 | /* Map the id or note failure */ | ||
132 | if (idx < extents) | ||
133 | id = (id - first) + map->extent[idx].lower_first; | ||
134 | else | ||
135 | id = (u32) -1; | ||
95 | 136 | ||
137 | return id; | ||
138 | } | ||
96 | 139 | ||
97 | /* Is cred->user the creator of the target user_ns | 140 | static u32 map_id_up(struct uid_gid_map *map, u32 id) |
98 | * or the creator of one of it's parents? | 141 | { |
99 | */ | 142 | unsigned idx, extents; |
100 | for ( tmp = to; tmp != &init_user_ns; | 143 | u32 first, last; |
101 | tmp = tmp->creator->user_ns ) { | 144 | |
102 | if (cred->user == tmp->creator) { | 145 | /* Find the matching extent */ |
103 | return (uid_t)0; | 146 | extents = map->nr_extents; |
104 | } | 147 | smp_read_barrier_depends(); |
148 | for (idx = 0; idx < extents; idx++) { | ||
149 | first = map->extent[idx].lower_first; | ||
150 | last = first + map->extent[idx].count - 1; | ||
151 | if (id >= first && id <= last) | ||
152 | break; | ||
105 | } | 153 | } |
154 | /* Map the id or note failure */ | ||
155 | if (idx < extents) | ||
156 | id = (id - first) + map->extent[idx].first; | ||
157 | else | ||
158 | id = (u32) -1; | ||
159 | |||
160 | return id; | ||
161 | } | ||
162 | |||
163 | /** | ||
164 | * make_kuid - Map a user-namespace uid pair into a kuid. | ||
165 | * @ns: User namespace that the uid is in | ||
166 | * @uid: User identifier | ||
167 | * | ||
168 | * Maps a user-namespace uid pair into a kernel internal kuid, | ||
169 | * and returns that kuid. | ||
170 | * | ||
171 | * When there is no mapping defined for the user-namespace uid | ||
172 | * pair INVALID_UID is returned. Callers are expected to test | ||
173 | * for and handle INVALID_UID being returned. INVALID_UID | ||
174 | * may be tested for using uid_valid(). | ||
175 | */ | ||
176 | kuid_t make_kuid(struct user_namespace *ns, uid_t uid) | ||
177 | { | ||
178 | /* Map the uid to a global kernel uid */ | ||
179 | return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); | ||
180 | } | ||
181 | EXPORT_SYMBOL(make_kuid); | ||
182 | |||
183 | /** | ||
184 | * from_kuid - Create a uid from a kuid user-namespace pair. | ||
185 | * @targ: The user namespace we want a uid in. | ||
186 | * @kuid: The kernel internal uid to start with. | ||
187 | * | ||
188 | * Map @kuid into the user-namespace specified by @targ and | ||
189 | * return the resulting uid. | ||
190 | * | ||
191 | * There is always a mapping into the initial user_namespace. | ||
192 | * | ||
193 | * If @kuid has no mapping in @targ (uid_t)-1 is returned. | ||
194 | */ | ||
195 | uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) | ||
196 | { | ||
197 | /* Map the uid from a global kernel uid */ | ||
198 | return map_id_up(&targ->uid_map, __kuid_val(kuid)); | ||
199 | } | ||
200 | EXPORT_SYMBOL(from_kuid); | ||
106 | 201 | ||
107 | /* No useful relationship so no mapping */ | 202 | /** |
108 | return overflowuid; | 203 | * from_kuid_munged - Create a uid from a kuid user-namespace pair. |
204 | * @targ: The user namespace we want a uid in. | ||
205 | * @kuid: The kernel internal uid to start with. | ||
206 | * | ||
207 | * Map @kuid into the user-namespace specified by @targ and | ||
208 | * return the resulting uid. | ||
209 | * | ||
210 | * There is always a mapping into the initial user_namespace. | ||
211 | * | ||
212 | * Unlike from_kuid, from_kuid_munged never fails and always | ||
213 | * returns a valid uid. This makes from_kuid_munged appropriate | ||
214 | * for use in syscalls like stat and getuid where failing the | ||
215 | * system call and failing to provide a valid uid are not | ||
216 | * options. | ||
217 | * | ||
218 | * If @kuid has no mapping in @targ overflowuid is returned. | ||
219 | */ | ||
220 | uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) | ||
221 | { | ||
222 | uid_t uid; | ||
223 | uid = from_kuid(targ, kuid); | ||
224 | |||
225 | if (uid == (uid_t) -1) | ||
226 | uid = overflowuid; | ||
227 | return uid; | ||
109 | } | 228 | } |
229 | EXPORT_SYMBOL(from_kuid_munged); | ||
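A hedged sketch of the calling pattern the kuid helpers above are meant to support: caller-facing uids go through make_kuid() and are validity-checked, while reporting paths use from_kuid_munged(). The demo_* functions and the chosen headers are assumptions made only for this illustration.

	/*
	 * Hypothetical caller, shown only to illustrate the intended use of
	 * the helpers exported above; not part of the patch.
	 */
	#include <linux/uidgid.h>
	#include <linux/user_namespace.h>
	#include <linux/cred.h>
	#include <linux/errno.h>

	static int demo_set_owner(uid_t uid, kuid_t *owner)
	{
		/* Convert the caller-relative uid into a kernel-internal kuid. */
		kuid_t kuid = make_kuid(current_user_ns(), uid);

		/* make_kuid() returns INVALID_UID when no mapping exists. */
		if (!uid_valid(kuid))
			return -EINVAL;

		*owner = kuid;
		return 0;
	}

	static uid_t demo_report_owner(kuid_t owner)
	{
		/*
		 * Reporting paths (stat-like interfaces) use the munged variant:
		 * it never fails, an unmapped kuid degrades to overflowuid.
		 */
		return from_kuid_munged(current_user_ns(), owner);
	}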
110 | 230 | ||
111 | gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) | 231 | /** |
232 | * make_kgid - Map a user-namespace gid pair into a kgid. | ||
233 | * @ns: User namespace that the gid is in | ||
234 | * @gid: group identifier | ||
235 | * | ||
236 | * Maps a user-namespace gid pair into a kernel internal kgid, | ||
237 | * and returns that kgid. | ||
238 | * | ||
239 | * When there is no mapping defined for the user-namespace gid | ||
240 | * pair INVALID_GID is returned. Callers are expected to test | ||
241 | * for and handle INVALID_GID being returned. INVALID_GID may be | ||
242 | * tested for using gid_valid(). | ||
243 | */ | ||
244 | kgid_t make_kgid(struct user_namespace *ns, gid_t gid) | ||
112 | { | 245 | { |
113 | struct user_namespace *tmp; | 246 | /* Map the gid to a global kernel gid */ |
247 | return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); | ||
248 | } | ||
249 | EXPORT_SYMBOL(make_kgid); | ||
114 | 250 | ||
115 | if (likely(to == cred->user->user_ns)) | 251 | /** |
116 | return gid; | 252 | * from_kgid - Create a gid from a kgid user-namespace pair. |
253 | * @targ: The user namespace we want a gid in. | ||
254 | * @kgid: The kernel internal gid to start with. | ||
255 | * | ||
256 | * Map @kgid into the user-namespace specified by @targ and | ||
257 | * return the resulting gid. | ||
258 | * | ||
259 | * There is always a mapping into the initial user_namespace. | ||
260 | * | ||
261 | * If @kgid has no mapping in @targ (gid_t)-1 is returned. | ||
262 | */ | ||
263 | gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) | ||
264 | { | ||
265 | /* Map the gid from a global kernel gid */ | ||
266 | return map_id_up(&targ->gid_map, __kgid_val(kgid)); | ||
267 | } | ||
268 | EXPORT_SYMBOL(from_kgid); | ||
269 | |||
270 | /** | ||
271 | * from_kgid_munged - Create a gid from a kgid user-namespace pair. | ||
272 | * @targ: The user namespace we want a gid in. | ||
273 | * @kgid: The kernel internal gid to start with. | ||
274 | * | ||
275 | * Map @kgid into the user-namespace specified by @targ and | ||
276 | * return the resulting gid. | ||
277 | * | ||
278 | * There is always a mapping into the initial user_namespace. | ||
279 | * | ||
280 | * Unlike from_kgid, from_kgid_munged never fails and always | ||
281 | * returns a valid gid. This makes from_kgid_munged appropriate | ||
282 | * for use in syscalls like stat and getgid where failing the | ||
283 | * system call and failing to provide a valid gid are not options. | ||
284 | * | ||
285 | * If @kgid has no mapping in @targ overflowgid is returned. | ||
286 | */ | ||
287 | gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) | ||
288 | { | ||
289 | gid_t gid; | ||
290 | gid = from_kgid(targ, kgid); | ||
117 | 291 | ||
118 | /* Is cred->user the creator of the target user_ns | 292 | if (gid == (gid_t) -1) |
119 | * or the creator of one of it's parents? | 293 | gid = overflowgid; |
294 | return gid; | ||
295 | } | ||
296 | EXPORT_SYMBOL(from_kgid_munged); | ||
297 | |||
298 | static int uid_m_show(struct seq_file *seq, void *v) | ||
299 | { | ||
300 | struct user_namespace *ns = seq->private; | ||
301 | struct uid_gid_extent *extent = v; | ||
302 | struct user_namespace *lower_ns; | ||
303 | uid_t lower; | ||
304 | |||
305 | lower_ns = current_user_ns(); | ||
306 | if ((lower_ns == ns) && lower_ns->parent) | ||
307 | lower_ns = lower_ns->parent; | ||
308 | |||
309 | lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); | ||
310 | |||
311 | seq_printf(seq, "%10u %10u %10u\n", | ||
312 | extent->first, | ||
313 | lower, | ||
314 | extent->count); | ||
315 | |||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static int gid_m_show(struct seq_file *seq, void *v) | ||
320 | { | ||
321 | struct user_namespace *ns = seq->private; | ||
322 | struct uid_gid_extent *extent = v; | ||
323 | struct user_namespace *lower_ns; | ||
324 | gid_t lower; | ||
325 | |||
326 | lower_ns = current_user_ns(); | ||
327 | if ((lower_ns == ns) && lower_ns->parent) | ||
328 | lower_ns = lower_ns->parent; | ||
329 | |||
330 | lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); | ||
331 | |||
332 | seq_printf(seq, "%10u %10u %10u\n", | ||
333 | extent->first, | ||
334 | lower, | ||
335 | extent->count); | ||
336 | |||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | ||
341 | { | ||
342 | struct uid_gid_extent *extent = NULL; | ||
343 | loff_t pos = *ppos; | ||
344 | |||
345 | if (pos < map->nr_extents) | ||
346 | extent = &map->extent[pos]; | ||
347 | |||
348 | return extent; | ||
349 | } | ||
350 | |||
351 | static void *uid_m_start(struct seq_file *seq, loff_t *ppos) | ||
352 | { | ||
353 | struct user_namespace *ns = seq->private; | ||
354 | |||
355 | return m_start(seq, ppos, &ns->uid_map); | ||
356 | } | ||
357 | |||
358 | static void *gid_m_start(struct seq_file *seq, loff_t *ppos) | ||
359 | { | ||
360 | struct user_namespace *ns = seq->private; | ||
361 | |||
362 | return m_start(seq, ppos, &ns->gid_map); | ||
363 | } | ||
364 | |||
365 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) | ||
366 | { | ||
367 | (*pos)++; | ||
368 | return seq->op->start(seq, pos); | ||
369 | } | ||
370 | |||
371 | static void m_stop(struct seq_file *seq, void *v) | ||
372 | { | ||
373 | return; | ||
374 | } | ||
375 | |||
376 | struct seq_operations proc_uid_seq_operations = { | ||
377 | .start = uid_m_start, | ||
378 | .stop = m_stop, | ||
379 | .next = m_next, | ||
380 | .show = uid_m_show, | ||
381 | }; | ||
382 | |||
383 | struct seq_operations proc_gid_seq_operations = { | ||
384 | .start = gid_m_start, | ||
385 | .stop = m_stop, | ||
386 | .next = m_next, | ||
387 | .show = gid_m_show, | ||
388 | }; | ||
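The proc hookup for these seq_operations lives outside this hunk; below is a hedged sketch of how an open routine could attach the namespace that uid_m_start()/uid_m_show() expect to find in seq->private. The function name and the way the namespace is chosen here are assumptions for illustration only.

	/*
	 * Hedged sketch of a proc open routine feeding these seq_operations;
	 * the real hookup is in fs/proc and may differ.
	 */
	#include <linux/seq_file.h>
	#include <linux/user_namespace.h>
	#include <linux/cred.h>
	#include <linux/fs.h>

	static int demo_uid_map_open(struct inode *inode, struct file *file)
	{
		struct user_namespace *ns = current_user_ns();	/* assumed target ns */
		int ret;

		ret = seq_open(file, &proc_uid_seq_operations);
		if (!ret) {
			struct seq_file *seq = file->private_data;

			/* uid_m_start()/uid_m_show() expect the namespace here. */
			seq->private = ns;
		}
		return ret;
	}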
389 | |||
390 | static DEFINE_MUTEX(id_map_mutex); | ||
391 | |||
392 | static ssize_t map_write(struct file *file, const char __user *buf, | ||
393 | size_t count, loff_t *ppos, | ||
394 | int cap_setid, | ||
395 | struct uid_gid_map *map, | ||
396 | struct uid_gid_map *parent_map) | ||
397 | { | ||
398 | struct seq_file *seq = file->private_data; | ||
399 | struct user_namespace *ns = seq->private; | ||
400 | struct uid_gid_map new_map; | ||
401 | unsigned idx; | ||
402 | struct uid_gid_extent *extent, *last = NULL; | ||
403 | unsigned long page = 0; | ||
404 | char *kbuf, *pos, *next_line; | ||
405 | ssize_t ret = -EINVAL; | ||
406 | |||
407 | /* | ||
408 | * The id_map_mutex serializes all writes to any given map. | ||
409 | * | ||
410 | * Any map is only ever written once. | ||
411 | * | ||
412 | * An id map fits within 1 cache line on most architectures. | ||
413 | * | ||
414 | * On read nothing needs to be done unless you are on an | ||
415 | * architecture with a crazy cache coherency model like alpha. | ||
416 | * | ||
417 | * There is a one time data dependency between reading the | ||
418 | * count of the extents and the values of the extents. The | ||
419 | * desired behavior is to see the values of the extents that | ||
420 | * were written before the count of the extents. | ||
421 | * | ||
422 | * To achieve this, smp_wmb() is used to guarantee the write | ||
423 | * order and smp_read_barrier_depends() ensures that we | ||
424 | * don't have crazy architectures returning stale data. | ||
425 | * | ||
120 | */ | 426 | */ |
121 | for ( tmp = to; tmp != &init_user_ns; | 427 | mutex_lock(&id_map_mutex); |
122 | tmp = tmp->creator->user_ns ) { | 428 | |
123 | if (cred->user == tmp->creator) { | 429 | ret = -EPERM; |
124 | return (gid_t)0; | 430 | /* Only allow one successful write to the map */ |
431 | if (map->nr_extents != 0) | ||
432 | goto out; | ||
433 | |||
434 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID | ||
435 | * over the user namespace in order to set the id mapping. | ||
436 | */ | ||
437 | if (!ns_capable(ns, cap_setid)) | ||
438 | goto out; | ||
439 | |||
440 | /* Get a buffer */ | ||
441 | ret = -ENOMEM; | ||
442 | page = __get_free_page(GFP_TEMPORARY); | ||
443 | kbuf = (char *) page; | ||
444 | if (!page) | ||
445 | goto out; | ||
446 | |||
447 | /* Only allow <= page size writes at the beginning of the file */ | ||
448 | ret = -EINVAL; | ||
449 | if ((*ppos != 0) || (count >= PAGE_SIZE)) | ||
450 | goto out; | ||
451 | |||
452 | /* Slurp in the user data */ | ||
453 | ret = -EFAULT; | ||
454 | if (copy_from_user(kbuf, buf, count)) | ||
455 | goto out; | ||
456 | kbuf[count] = '\0'; | ||
457 | |||
458 | /* Parse the user data */ | ||
459 | ret = -EINVAL; | ||
460 | pos = kbuf; | ||
461 | new_map.nr_extents = 0; | ||
462 | for (;pos; pos = next_line) { | ||
463 | extent = &new_map.extent[new_map.nr_extents]; | ||
464 | |||
465 | /* Find the end of line and ensure I don't look past it */ | ||
466 | next_line = strchr(pos, '\n'); | ||
467 | if (next_line) { | ||
468 | *next_line = '\0'; | ||
469 | next_line++; | ||
470 | if (*next_line == '\0') | ||
471 | next_line = NULL; | ||
125 | } | 472 | } |
473 | |||
474 | pos = skip_spaces(pos); | ||
475 | extent->first = simple_strtoul(pos, &pos, 10); | ||
476 | if (!isspace(*pos)) | ||
477 | goto out; | ||
478 | |||
479 | pos = skip_spaces(pos); | ||
480 | extent->lower_first = simple_strtoul(pos, &pos, 10); | ||
481 | if (!isspace(*pos)) | ||
482 | goto out; | ||
483 | |||
484 | pos = skip_spaces(pos); | ||
485 | extent->count = simple_strtoul(pos, &pos, 10); | ||
486 | if (*pos && !isspace(*pos)) | ||
487 | goto out; | ||
488 | |||
490 | /* Verify there is no trailing junk on the line */ | ||
490 | pos = skip_spaces(pos); | ||
491 | if (*pos != '\0') | ||
492 | goto out; | ||
493 | |||
494 | /* Verify we have been given valid starting values */ | ||
495 | if ((extent->first == (u32) -1) || | ||
496 | (extent->lower_first == (u32) -1 )) | ||
497 | goto out; | ||
498 | |||
499 | /* Verify count is not zero and does not cause the extent to wrap */ | ||
500 | if ((extent->first + extent->count) <= extent->first) | ||
501 | goto out; | ||
502 | if ((extent->lower_first + extent->count) <= extent->lower_first) | ||
503 | goto out; | ||
504 | |||
505 | /* For now only accept extents that are strictly in order */ | ||
506 | if (last && | ||
507 | (((last->first + last->count) > extent->first) || | ||
508 | ((last->lower_first + last->count) > extent->lower_first))) | ||
509 | goto out; | ||
510 | |||
511 | new_map.nr_extents++; | ||
512 | last = extent; | ||
513 | |||
514 | /* Fail if the file contains too many extents */ | ||
515 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && | ||
516 | (next_line != NULL)) | ||
517 | goto out; | ||
518 | } | ||
519 | /* Be very certain the new map actually exists */ | ||
520 | if (new_map.nr_extents == 0) | ||
521 | goto out; | ||
522 | |||
523 | ret = -EPERM; | ||
524 | /* Validate the user is allowed to use the user ids being mapped to. */ | ||
525 | if (!new_idmap_permitted(ns, cap_setid, &new_map)) | ||
526 | goto out; | ||
527 | |||
528 | /* Map the lower ids from the parent user namespace to the | ||
529 | * kernel global id space. | ||
530 | */ | ||
531 | for (idx = 0; idx < new_map.nr_extents; idx++) { | ||
532 | u32 lower_first; | ||
533 | extent = &new_map.extent[idx]; | ||
534 | |||
535 | lower_first = map_id_range_down(parent_map, | ||
536 | extent->lower_first, | ||
537 | extent->count); | ||
538 | |||
539 | /* Fail if we can not map the specified extent to | ||
540 | * the kernel global id space. | ||
541 | */ | ||
542 | if (lower_first == (u32) -1) | ||
543 | goto out; | ||
544 | |||
545 | extent->lower_first = lower_first; | ||
126 | } | 546 | } |
127 | 547 | ||
128 | /* No useful relationship so no mapping */ | 548 | /* Install the map */ |
129 | return overflowgid; | 549 | memcpy(map->extent, new_map.extent, |
550 | new_map.nr_extents*sizeof(new_map.extent[0])); | ||
551 | smp_wmb(); | ||
552 | map->nr_extents = new_map.nr_extents; | ||
553 | |||
554 | *ppos = count; | ||
555 | ret = count; | ||
556 | out: | ||
557 | mutex_unlock(&id_map_mutex); | ||
558 | if (page) | ||
559 | free_page(page); | ||
560 | return ret; | ||
561 | } | ||
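For reference, map_write() above parses one or more "first lower_first count" lines in a single write. The userspace sketch below feeds it one extent; the /proc/self/uid_map path is an assumption based on the proc_uid_map_write() handler name, and the write is rejected once a map has been set or without the required capability.

	/*
	 * Minimal userspace sketch of writing one extent in the
	 * "first lower_first count" format parsed by map_write() above.
	 */
	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* Namespace uid 0 maps to parent uid 100000, for 65536 consecutive ids. */
		const char line[] = "0 100000 65536\n";
		int fd = open("/proc/self/uid_map", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, line, strlen(line)) != (ssize_t) strlen(line))
			perror("write");	/* e.g. EPERM if the map was already set */
		close(fd);
		return 0;
	}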
562 | |||
563 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
564 | { | ||
565 | struct seq_file *seq = file->private_data; | ||
566 | struct user_namespace *ns = seq->private; | ||
567 | |||
568 | if (!ns->parent) | ||
569 | return -EPERM; | ||
570 | |||
571 | return map_write(file, buf, size, ppos, CAP_SETUID, | ||
572 | &ns->uid_map, &ns->parent->uid_map); | ||
573 | } | ||
574 | |||
575 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
576 | { | ||
577 | struct seq_file *seq = file->private_data; | ||
578 | struct user_namespace *ns = seq->private; | ||
579 | |||
580 | if (!ns->parent) | ||
581 | return -EPERM; | ||
582 | |||
583 | return map_write(file, buf, size, ppos, CAP_SETGID, | ||
584 | &ns->gid_map, &ns->parent->gid_map); | ||
585 | } | ||
586 | |||
587 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
588 | struct uid_gid_map *new_map) | ||
589 | { | ||
590 | /* Allow the specified ids if we have the appropriate capability | ||
591 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. | ||
592 | */ | ||
593 | if (ns_capable(ns->parent, cap_setid)) | ||
594 | return true; | ||
595 | |||
596 | return false; | ||
130 | } | 597 | } |
131 | 598 | ||
132 | static __init int user_namespaces_init(void) | 599 | static __init int user_namespaces_init(void) |
diff --git a/kernel/utsname.c b/kernel/utsname.c index 405caf91aad5..679d97a5d3fd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | |||
43 | 43 | ||
44 | down_read(&uts_sem); | 44 | down_read(&uts_sem); |
45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); | 46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); |
47 | up_read(&uts_sem); | 47 | up_read(&uts_sem); |
48 | return ns; | 48 | return ns; |
49 | } | 49 | } |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index df30ee08bdd4..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/sysctl.h> | 24 | #include <linux/sysctl.h> |
25 | 25 | ||
26 | #include <asm/irq_regs.h> | 26 | #include <asm/irq_regs.h> |
27 | #include <linux/kvm_para.h> | ||
27 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
28 | 29 | ||
29 | int watchdog_enabled = 1; | 30 | int watchdog_enabled = 1; |
@@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
280 | __this_cpu_write(softlockup_touch_sync, false); | 281 | __this_cpu_write(softlockup_touch_sync, false); |
281 | sched_clock_tick(); | 282 | sched_clock_tick(); |
282 | } | 283 | } |
284 | |||
285 | /* Clear the guest paused flag on watchdog reset */ | ||
286 | kvm_check_and_clear_guest_paused(); | ||
283 | __touch_watchdog(); | 287 | __touch_watchdog(); |
284 | return HRTIMER_RESTART; | 288 | return HRTIMER_RESTART; |
285 | } | 289 | } |
@@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
292 | */ | 296 | */ |
293 | duration = is_softlockup(touch_ts); | 297 | duration = is_softlockup(touch_ts); |
294 | if (unlikely(duration)) { | 298 | if (unlikely(duration)) { |
299 | /* | ||
300 | * If a virtual machine is stopped by the host it can look to | ||
301 | * the watchdog like a soft lockup, check to see if the host | ||
302 | * stopped the vm before we issue the warning | ||
303 | */ | ||
304 | if (kvm_check_and_clear_guest_paused()) | ||
305 | return HRTIMER_RESTART; | ||
306 | |||
295 | /* only warn once */ | 307 | /* only warn once */ |
296 | if (__this_cpu_read(soft_watchdog_warn) == true) | 308 | if (__this_cpu_read(soft_watchdog_warn) == true) |
297 | return HRTIMER_RESTART; | 309 | return HRTIMER_RESTART; |
@@ -360,6 +372,13 @@ static int watchdog(void *unused) | |||
360 | 372 | ||
361 | 373 | ||
362 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
375 | /* | ||
376 | * People like the simple clean cpu node info on boot. | ||
377 | * Reduce the watchdog noise by only printing messages | ||
378 | * that are different from what cpu0 displayed. | ||
379 | */ | ||
380 | static unsigned long cpu0_err; | ||
381 | |||
363 | static int watchdog_nmi_enable(int cpu) | 382 | static int watchdog_nmi_enable(int cpu) |
364 | { | 383 | { |
365 | struct perf_event_attr *wd_attr; | 384 | struct perf_event_attr *wd_attr; |
@@ -378,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) | |||
378 | 397 | ||
379 | /* Try to register using hardware perf events */ | 398 | /* Try to register using hardware perf events */ |
380 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 399 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
400 | |||
401 | /* save cpu0 error for future comparison */ | ||
402 | if (cpu == 0 && IS_ERR(event)) | ||
403 | cpu0_err = PTR_ERR(event); | ||
404 | |||
381 | if (!IS_ERR(event)) { | 405 | if (!IS_ERR(event)) { |
382 | pr_info("enabled, takes one hw-pmu counter.\n"); | 406 | /* only print for cpu0 or different than cpu0 */ |
407 | if (cpu == 0 || cpu0_err) | ||
408 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
383 | goto out_save; | 409 | goto out_save; |
384 | } | 410 | } |
385 | 411 | ||
412 | /* skip displaying the same error again */ | ||
413 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
414 | return PTR_ERR(event); | ||
386 | 415 | ||
387 | /* vary the KERN level based on the returned errno */ | 416 | /* vary the KERN level based on the returned errno */ |
388 | if (PTR_ERR(event) == -EOPNOTSUPP) | 417 | if (PTR_ERR(event) == -EOPNOTSUPP) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c08..9a3128dc67df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1032 | cwq = get_cwq(gcwq->cpu, wq); | 1032 | cwq = get_cwq(gcwq->cpu, wq); |
1033 | trace_workqueue_queue_work(cpu, cwq, work); | 1033 | trace_workqueue_queue_work(cpu, cwq, work); |
1034 | 1034 | ||
1035 | BUG_ON(!list_empty(&work->entry)); | 1035 | if (WARN_ON(!list_empty(&work->entry))) { |
1036 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
1037 | return; | ||
1038 | } | ||
1036 | 1039 | ||
1037 | cwq->nr_in_flight[cwq->work_color]++; | 1040 | cwq->nr_in_flight[cwq->work_color]++; |
1038 | work_flags = work_color_to_flags(cwq->work_color); | 1041 | work_flags = work_color_to_flags(cwq->work_color); |
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) | |||
1210 | } else | 1213 | } else |
1211 | wake_up_all(&gcwq->trustee_wait); | 1214 | wake_up_all(&gcwq->trustee_wait); |
1212 | 1215 | ||
1213 | /* sanity check nr_running */ | 1216 | /* |
1214 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | 1217 | * Sanity check nr_running. Because trustee releases gcwq->lock |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | ||
1219 | * warning may trigger spuriously. Check iff trustee is idle. | ||
1220 | */ | ||
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | ||
1222 | gcwq->nr_workers == gcwq->nr_idle && | ||
1215 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); |
1216 | } | 1224 | } |
1217 | 1225 | ||
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) | |||
1810 | * lock freed" warnings as well as problems when looking into | 1818 | * lock freed" warnings as well as problems when looking into |
1811 | * work->lockdep_map, make a copy and use that here. | 1819 | * work->lockdep_map, make a copy and use that here. |
1812 | */ | 1820 | */ |
1813 | struct lockdep_map lockdep_map = work->lockdep_map; | 1821 | struct lockdep_map lockdep_map; |
1822 | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | ||
1814 | #endif | 1824 | #endif |
1815 | /* | 1825 | /* |
1816 | * A single work shouldn't be executed concurrently by | 1826 | * A single work shouldn't be executed concurrently by |
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) | |||
2506 | { | 2516 | { |
2507 | struct wq_barrier barr; | 2517 | struct wq_barrier barr; |
2508 | 2518 | ||
2519 | lock_map_acquire(&work->lockdep_map); | ||
2520 | lock_map_release(&work->lockdep_map); | ||
2521 | |||
2509 | if (start_flush_work(work, &barr, true)) { | 2522 | if (start_flush_work(work, &barr, true)) { |
2510 | wait_for_completion(&barr.done); | 2523 | wait_for_completion(&barr.done); |
2511 | destroy_work_on_stack(&barr.work); | 2524 | destroy_work_on_stack(&barr.work); |